ref: 4b2c2b9aa4a273a23d90ddb3bbf6dfb3482e0b8f
parent: 6c280c2299f078a475dc87e7615fdf1a4998cd31
author: Ronald S. Bultje <rbultje@google.com>
date: Thu Nov 1 07:09:58 EDT 2012
Rename vp8/ codec directory to vp9/.

Change-Id: Ic084c475844b24092a433ab88138cf58af3abbe4
--- a/build/x86-msvs/obj_int_extract.bat
+++ b/build/x86-msvs/obj_int_extract.bat
@@ -7,9 +7,9 @@
REM be found in the AUTHORS file in the root of the source tree.
echo on
-cl /I "./" /I "%1" /nologo /c "%1/vp8/common/asm_com_offsets.c"
-cl /I "./" /I "%1" /nologo /c "%1/vp8/decoder/asm_dec_offsets.c"
-cl /I "./" /I "%1" /nologo /c "%1/vp8/encoder/asm_enc_offsets.c"
+cl /I "./" /I "%1" /nologo /c "%1/vp9/common/asm_com_offsets.c"
+cl /I "./" /I "%1" /nologo /c "%1/vp9/decoder/asm_dec_offsets.c"
+cl /I "./" /I "%1" /nologo /c "%1/vp9/encoder/asm_enc_offsets.c"
obj_int_extract.exe rvds "asm_com_offsets.obj" > "asm_com_offsets.asm"
obj_int_extract.exe rvds "asm_dec_offsets.obj" > "asm_dec_offsets.asm"
obj_int_extract.exe rvds "asm_enc_offsets.obj" > "asm_enc_offsets.asm"
--- a/configure
+++ b/configure
@@ -31,7 +31,7 @@
${toggle_debug_libs} in/exclude debug version of libraries
${toggle_md5} support for output of checksum data
${toggle_static_msvcrt} use static MSVCRT (VS builds only)
- ${toggle_vp8} VP8 codec support
+ ${toggle_vp9} VP9 codec support
${toggle_internal_stats} output of encoder internal stats for debug, if supported (encoders)
${toggle_mem_tracker} track memory usage
${toggle_postproc} postprocessing
@@ -161,17 +161,17 @@
enable os_support
[ -d ${source_path}/../include ] && enable alt_tree_layout
-for d in vp8; do
+for d in vp9; do
[ -d ${source_path}/${d} ] && disable alt_tree_layout;
done
if ! enabled alt_tree_layout; then
# development environment
-[ -d ${source_path}/vp8 ] && CODECS="${CODECS} vp8_encoder vp8_decoder"
+[ -d ${source_path}/vp9 ] && CODECS="${CODECS} vp9_encoder vp9_decoder"
else
# customer environment
-[ -f ${source_path}/../include/vpx/vp8cx.h ] && CODECS="${CODECS} vp8_encoder"
-[ -f ${source_path}/../include/vpx/vp8dx.h ] && CODECS="${CODECS} vp8_decoder"
+[ -f ${source_path}/../include/vpx/vp8cx.h ] && CODECS="${CODECS} vp9_encoder"
+[ -f ${source_path}/../include/vpx/vp8dx.h ] && CODECS="${CODECS} vp9_decoder"
[ -f ${source_path}/../lib/*/*mt.lib ] && soft_enable static_msvcrt
fi
--- a/docs.mk
+++ b/docs.mk
@@ -21,7 +21,7 @@
usage_dx.dox \
# Other doxy files sourced in Markdown
-TXT_DOX-$(CONFIG_VP8) += vp8_api1_migration.dox
+TXT_DOX-$(CONFIG_VP9) += vp8_api1_migration.dox
vp8_api1_migration.dox.DESC = VP8 API 1.x Migration
TXT_DOX = $(call enabled,TXT_DOX)
--- a/example_xma.c
+++ b/example_xma.c
@@ -18,7 +18,7 @@
#include "vpx_config.h"
#include "vpx/vpx_decoder.h"
#include "vpx/vpx_integer.h"
-#if CONFIG_VP8_DECODER
+#if CONFIG_VP9_DECODER
#include "vpx/vp8dx.h"
#endif
@@ -29,8 +29,8 @@
const char *name;
const vpx_codec_iface_t *iface;
} ifaces[] = {
-#if CONFIG_VP8_DECODER
- {"vp8", &vpx_codec_vp8_dx_algo},
+#if CONFIG_VP9_DECODER
+ {"vp9", &vpx_codec_vp8_dx_algo},
#endif
};
--- a/examples.mk
+++ b/examples.mk
@@ -81,13 +81,13 @@
error_resilient.GUID = DF5837B9-4145-4F92-A031-44E4F832E00C
error_resilient.DESCRIPTION = Error Resiliency Feature
-GEN_EXAMPLES-$(CONFIG_VP8_ENCODER) += vp8_scalable_patterns.c
+GEN_EXAMPLES-$(CONFIG_VP9_ENCODER) += vp8_scalable_patterns.c
vp8_scalable_patterns.GUID = 0D6A210B-F482-4D6F-8570-4A9C01ACC88C
vp8_scalable_patterns.DESCRIPTION = VP8 Scalable Bitstream Patterns
-GEN_EXAMPLES-$(CONFIG_VP8_ENCODER) += vp8_set_maps.c
+GEN_EXAMPLES-$(CONFIG_VP9_ENCODER) += vp8_set_maps.c
vp8_set_maps.GUID = ECB2D24D-98B8-4015-A465-A4AF3DCC145F
vp8_set_maps.DESCRIPTION = VP8 set active and ROI maps
-GEN_EXAMPLES-$(CONFIG_VP8_ENCODER) += vp8cx_set_ref.c
+GEN_EXAMPLES-$(CONFIG_VP9_ENCODER) += vp8cx_set_ref.c
vp8cx_set_ref.GUID = C5E31F7F-96F6-48BD-BD3E-10EBF6E8057A
vp8cx_set_ref.DESCRIPTION = VP8 set encoder reference frame
@@ -97,10 +97,10 @@
# We should not link to math library (libm) on RVCT
# when building for bare-metal targets
ifeq ($(CONFIG_OS_SUPPORT), yes)
-CODEC_EXTRA_LIBS-$(CONFIG_VP8) += m
+CODEC_EXTRA_LIBS-$(CONFIG_VP9) += m
else
ifeq ($(CONFIG_GCC), yes)
- CODEC_EXTRA_LIBS-$(CONFIG_VP8) += m
+ CODEC_EXTRA_LIBS-$(CONFIG_VP9) += m
endif
endif
#
@@ -117,8 +117,8 @@
INC_PATH := $(SRC_PATH_BARE)/../include
else
LIB_PATH-yes += $(if $(BUILD_PFX),$(BUILD_PFX),.)
- INC_PATH-$(CONFIG_VP8_DECODER) += $(SRC_PATH_BARE)/vp8
- INC_PATH-$(CONFIG_VP8_ENCODER) += $(SRC_PATH_BARE)/vp8
+ INC_PATH-$(CONFIG_VP9_DECODER) += $(SRC_PATH_BARE)/vp9
+ INC_PATH-$(CONFIG_VP9_ENCODER) += $(SRC_PATH_BARE)/vp9
LIB_PATH := $(call enabled,LIB_PATH)
INC_PATH := $(call enabled,INC_PATH)
endif
--- a/examples/decoder_tmpl.txt
+++ b/examples/decoder_tmpl.txt
@@ -1,7 +1,7 @@
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ DEC_INCLUDES
#define VPX_CODEC_DISABLE_COMPAT 1
#include "vpx/vpx_decoder.h"
-#include "vpx/vp8dx.h"
+#include "vpx/vp9dx.h"
#define interface (vpx_codec_vp8_dx())
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ DEC_INCLUDES
--- a/examples/encoder_tmpl.txt
+++ b/examples/encoder_tmpl.txt
@@ -1,7 +1,7 @@
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ENC_INCLUDES
#define VPX_CODEC_DISABLE_COMPAT 1
#include "vpx/vpx_encoder.h"
-#include "vpx/vp8cx.h"
+#include "vpx/vp9cx.h"
#define interface (vpx_codec_vp8_cx())
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ENC_INCLUDES
--- a/examples/postproc.txt
+++ b/examples/postproc.txt
@@ -51,7 +51,7 @@
postprocessors. VP8 is one example. The following sample code toggles
postprocessing on and off every 15 frames.
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ PRE_DECODE
-#if CONFIG_VP8_DECODER
+#if CONFIG_VP9_DECODER
if(frame_cnt%30 == 1) {
vp8_postproc_cfg_t pp = {0, 0, 0};
--- a/libs.mk
+++ b/libs.mk
@@ -30,29 +30,29 @@
CODEC_SRCS-yes += $(addprefix vpx_scale/,$(call enabled,SCALE_SRCS))
-ifeq ($(CONFIG_VP8_ENCODER),yes)
- VP8_PREFIX=vp8/
- include $(SRC_PATH_BARE)/$(VP8_PREFIX)vp8cx.mk
- CODEC_SRCS-yes += $(addprefix $(VP8_PREFIX),$(call enabled,VP8_CX_SRCS))
- CODEC_EXPORTS-yes += $(addprefix $(VP8_PREFIX),$(VP8_CX_EXPORTS))
- CODEC_SRCS-yes += $(VP8_PREFIX)vp8cx.mk vpx/vp8.h vpx/vp8cx.h vpx/vp8e.h
- CODEC_SRCS-$(ARCH_ARM) += $(VP8_PREFIX)vp8cx_arm.mk
+ifeq ($(CONFIG_VP9_ENCODER),yes)
+ VP9_PREFIX=vp9/
+ include $(SRC_PATH_BARE)/$(VP9_PREFIX)vp9cx.mk
+ CODEC_SRCS-yes += $(addprefix $(VP9_PREFIX),$(call enabled,VP9_CX_SRCS))
+ CODEC_EXPORTS-yes += $(addprefix $(VP9_PREFIX),$(VP9_CX_EXPORTS))
+ CODEC_SRCS-yes += $(VP9_PREFIX)vp9cx.mk vpx/vp8.h vpx/vp8cx.h vpx/vp8e.h
+ CODEC_SRCS-$(ARCH_ARM) += $(VP9_PREFIX)vp9cx_arm.mk
INSTALL-LIBS-yes += include/vpx/vp8.h include/vpx/vp8e.h include/vpx/vp8cx.h
- INSTALL_MAPS += include/vpx/% $(SRC_PATH_BARE)/$(VP8_PREFIX)/%
+ INSTALL_MAPS += include/vpx/% $(SRC_PATH_BARE)/$(VP9_PREFIX)/%
CODEC_DOC_SRCS += vpx/vp8.h vpx/vp8cx.h
- CODEC_DOC_SECTIONS += vp8 vp8_encoder
+ CODEC_DOC_SECTIONS += vp9 vp9_encoder
endif
-ifeq ($(CONFIG_VP8_DECODER),yes)
- VP8_PREFIX=vp8/
- include $(SRC_PATH_BARE)/$(VP8_PREFIX)vp8dx.mk
- CODEC_SRCS-yes += $(addprefix $(VP8_PREFIX),$(call enabled,VP8_DX_SRCS))
- CODEC_EXPORTS-yes += $(addprefix $(VP8_PREFIX),$(VP8_DX_EXPORTS))
- CODEC_SRCS-yes += $(VP8_PREFIX)vp8dx.mk vpx/vp8.h vpx/vp8dx.h
+ifeq ($(CONFIG_VP9_DECODER),yes)
+ VP9_PREFIX=vp9/
+ include $(SRC_PATH_BARE)/$(VP9_PREFIX)vp9dx.mk
+ CODEC_SRCS-yes += $(addprefix $(VP9_PREFIX),$(call enabled,VP9_DX_SRCS))
+ CODEC_EXPORTS-yes += $(addprefix $(VP9_PREFIX),$(VP9_DX_EXPORTS))
+ CODEC_SRCS-yes += $(VP9_PREFIX)vp9dx.mk vpx/vp8.h vpx/vp8dx.h
INSTALL-LIBS-yes += include/vpx/vp8.h include/vpx/vp8dx.h
- INSTALL_MAPS += include/vpx/% $(SRC_PATH_BARE)/$(VP8_PREFIX)/%
+ INSTALL_MAPS += include/vpx/% $(SRC_PATH_BARE)/$(VP9_PREFIX)/%
CODEC_DOC_SRCS += vpx/vp8.h vpx/vp8dx.h
- CODEC_DOC_SECTIONS += vp8 vp8_decoder
+ CODEC_DOC_SECTIONS += vp9 vp9_decoder
endif
@@ -305,46 +305,46 @@
OFFSET_PATTERN:='^[a-zA-Z0-9_]* EQU'
ifeq ($(filter icc gcc,$(TGT_CC)), $(TGT_CC))
- $(BUILD_PFX)asm_com_offsets.asm: $(BUILD_PFX)$(VP8_PREFIX)common/asm_com_offsets.c.S
+ $(BUILD_PFX)asm_com_offsets.asm: $(BUILD_PFX)$(VP9_PREFIX)common/asm_com_offsets.c.S
@echo " [CREATE] $@"
$(qexec)LC_ALL=C grep $(OFFSET_PATTERN) $< | tr -d '$$\#' $(ADS2GAS) > $@
- $(BUILD_PFX)$(VP8_PREFIX)common/asm_com_offsets.c.S: $(VP8_PREFIX)common/asm_com_offsets.c
- CLEAN-OBJS += $(BUILD_PFX)asm_com_offsets.asm $(BUILD_PFX)$(VP8_PREFIX)common/asm_com_offsets.c.S
+ $(BUILD_PFX)$(VP9_PREFIX)common/asm_com_offsets.c.S: $(VP9_PREFIX)common/asm_com_offsets.c
+ CLEAN-OBJS += $(BUILD_PFX)asm_com_offsets.asm $(BUILD_PFX)$(VP9_PREFIX)common/asm_com_offsets.c.S
- $(BUILD_PFX)asm_enc_offsets.asm: $(BUILD_PFX)$(VP8_PREFIX)encoder/asm_enc_offsets.c.S
+ $(BUILD_PFX)asm_enc_offsets.asm: $(BUILD_PFX)$(VP9_PREFIX)encoder/asm_enc_offsets.c.S
@echo " [CREATE] $@"
$(qexec)LC_ALL=C grep $(OFFSET_PATTERN) $< | tr -d '$$\#' $(ADS2GAS) > $@
- $(BUILD_PFX)$(VP8_PREFIX)encoder/asm_enc_offsets.c.S: $(VP8_PREFIX)encoder/asm_enc_offsets.c
- CLEAN-OBJS += $(BUILD_PFX)asm_enc_offsets.asm $(BUILD_PFX)$(VP8_PREFIX)encoder/asm_enc_offsets.c.S
+ $(BUILD_PFX)$(VP9_PREFIX)encoder/asm_enc_offsets.c.S: $(VP9_PREFIX)encoder/asm_enc_offsets.c
+ CLEAN-OBJS += $(BUILD_PFX)asm_enc_offsets.asm $(BUILD_PFX)$(VP9_PREFIX)encoder/asm_enc_offsets.c.S
- $(BUILD_PFX)asm_dec_offsets.asm: $(BUILD_PFX)$(VP8_PREFIX)decoder/asm_dec_offsets.c.S
+ $(BUILD_PFX)asm_dec_offsets.asm: $(BUILD_PFX)$(VP9_PREFIX)decoder/asm_dec_offsets.c.S
@echo " [CREATE] $@"
$(qexec)LC_ALL=C grep $(OFFSET_PATTERN) $< | tr -d '$$\#' $(ADS2GAS) > $@
- $(BUILD_PFX)$(VP8_PREFIX)decoder/asm_dec_offsets.c.S: $(VP8_PREFIX)decoder/asm_dec_offsets.c
- CLEAN-OBJS += $(BUILD_PFX)asm_dec_offsets.asm $(BUILD_PFX)$(VP8_PREFIX)decoder/asm_dec_offsets.c.S
+ $(BUILD_PFX)$(VP9_PREFIX)decoder/asm_dec_offsets.c.S: $(VP9_PREFIX)decoder/asm_dec_offsets.c
+ CLEAN-OBJS += $(BUILD_PFX)asm_dec_offsets.asm $(BUILD_PFX)$(VP9_PREFIX)decoder/asm_dec_offsets.c.S
else
ifeq ($(filter rvct,$(TGT_CC)), $(TGT_CC))
asm_com_offsets.asm: obj_int_extract
- asm_com_offsets.asm: $(VP8_PREFIX)common/asm_com_offsets.c.o
+ asm_com_offsets.asm: $(VP9_PREFIX)common/asm_com_offsets.c.o
@echo " [CREATE] $@"
$(qexec)./obj_int_extract rvds $< $(ADS2GAS) > $@
- OBJS-yes += $(VP8_PREFIX)common/asm_com_offsets.c.o
+ OBJS-yes += $(VP9_PREFIX)common/asm_com_offsets.c.o
CLEAN-OBJS += asm_com_offsets.asm
$(filter %$(ASM).o,$(OBJS-yes)): $(BUILD_PFX)asm_com_offsets.asm
asm_enc_offsets.asm: obj_int_extract
- asm_enc_offsets.asm: $(VP8_PREFIX)encoder/asm_enc_offsets.c.o
+ asm_enc_offsets.asm: $(VP9_PREFIX)encoder/asm_enc_offsets.c.o
@echo " [CREATE] $@"
$(qexec)./obj_int_extract rvds $< $(ADS2GAS) > $@
- OBJS-yes += $(VP8_PREFIX)encoder/asm_enc_offsets.c.o
+ OBJS-yes += $(VP9_PREFIX)encoder/asm_enc_offsets.c.o
CLEAN-OBJS += asm_enc_offsets.asm
$(filter %$(ASM).o,$(OBJS-yes)): $(BUILD_PFX)asm_enc_offsets.asm
asm_dec_offsets.asm: obj_int_extract
- asm_dec_offsets.asm: $(VP8_PREFIX)decoder/asm_dec_offsets.c.o
+ asm_dec_offsets.asm: $(VP9_PREFIX)decoder/asm_dec_offsets.c.o
@echo " [CREATE] $@"
$(qexec)./obj_int_extract rvds $< $(ADS2GAS) > $@
- OBJS-yes += $(VP8_PREFIX)decoder/asm_dec_offsets.c.o
+ OBJS-yes += $(VP9_PREFIX)decoder/asm_dec_offsets.c.o
CLEAN-OBJS += asm_dec_offsets.asm
$(filter %$(ASM).o,$(OBJS-yes)): $(BUILD_PFX)asm_dec_offsets.asm
endif
--- a/test/boolcoder_test.cc
+++ b/test/boolcoder_test.cc
@@ -15,8 +15,8 @@
#include "third_party/googletest/src/include/gtest/gtest.h"
extern "C" {
-#include "vp8/encoder/boolhuff.h"
-#include "vp8/decoder/dboolhuff.h"
+#include "vp9/encoder/boolhuff.h"
+#include "vp9/decoder/dboolhuff.h"
}
#include "acm_random.h"
--- a/test/dct16x16_test.cc
+++ b/test/dct16x16_test.cc
@@ -15,9 +15,9 @@
#include "third_party/googletest/src/include/gtest/gtest.h"
extern "C" {
-#include "vp8/common/entropy.h"
-#include "vp8/common/idct.h"
-#include "vp8/encoder/dct.h"
+#include "vp9/common/entropy.h"
+#include "vp9/common/idct.h"
+#include "vp9/encoder/dct.h"
}
#include "acm_random.h"
--- a/test/fdct4x4_test.cc
+++ b/test/fdct4x4_test.cc
@@ -15,8 +15,8 @@
#include "third_party/googletest/src/include/gtest/gtest.h"
extern "C" {
-#include "vp8/common/idct.h"
-#include "vp8/encoder/dct.h"
+#include "vp9/common/idct.h"
+#include "vp9/encoder/dct.h"
}
#include "acm_random.h"
--- a/test/fdct8x8_test.cc
+++ b/test/fdct8x8_test.cc
@@ -15,8 +15,8 @@
#include "third_party/googletest/src/include/gtest/gtest.h"
extern "C" {
-#include "vp8/encoder/dct.h"
-#include "vp8/common/idct.h"
+#include "vp9/encoder/dct.h"
+#include "vp9/common/idct.h"
}
#include "acm_random.h"
--- a/test/idct8x8_test.cc
+++ b/test/idct8x8_test.cc
@@ -15,8 +15,8 @@
#include "third_party/googletest/src/include/gtest/gtest.h"
extern "C" {
-#include "vp8/encoder/dct.h"
-#include "vp8/common/idct.h"
+#include "vp9/encoder/dct.h"
+#include "vp9/common/idct.h"
}
#include "acm_random.h"
--- a/vp8/common/alloccommon.c
+++ /dev/null
@@ -1,220 +1,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include "vpx_ports/config.h"
-#include "blockd.h"
-#include "vpx_mem/vpx_mem.h"
-#include "onyxc_int.h"
-#include "findnearmv.h"
-#include "entropymode.h"
-#include "entropymv.h"
-#include "systemdependent.h"
-
-
-void vp9_update_mode_info_border(VP9_COMMON *cpi, MODE_INFO *mi_base) {
- int stride = cpi->mode_info_stride;
- int i;
-
- // Clear down top border row
- vpx_memset(mi_base, 0, sizeof(MODE_INFO) * cpi->mode_info_stride);
-
- // Clear left border column
- for (i = 1; i < cpi->mb_rows + 1; i++) {
- vpx_memset(&mi_base[i * stride], 0, sizeof(MODE_INFO));
- }
-}
-
-void vp9_update_mode_info_in_image(VP9_COMMON *cpi, MODE_INFO *mi) {
- int i, j;
-
- // For each in image mode_info element set the in image flag to 1
- for (i = 0; i < cpi->mb_rows; i++) {
- for (j = 0; j < cpi->mb_cols; j++) {
- mi->mbmi.mb_in_image = 1;
- mi++; // Next element in the row
- }
-
- mi++; // Step over border element at start of next row
- }
-}
-
-void vp9_de_alloc_frame_buffers(VP9_COMMON *oci) {
- int i;
-
- for (i = 0; i < NUM_YV12_BUFFERS; i++)
- vp8_yv12_de_alloc_frame_buffer(&oci->yv12_fb[i]);
-
- vp8_yv12_de_alloc_frame_buffer(&oci->temp_scale_frame);
- vp8_yv12_de_alloc_frame_buffer(&oci->post_proc_buffer);
-
- vpx_free(oci->above_context);
- vpx_free(oci->mip);
- vpx_free(oci->prev_mip);
-
- oci->above_context = 0;
- oci->mip = 0;
- oci->prev_mip = 0;
-
-}
-
-int vp9_alloc_frame_buffers(VP9_COMMON *oci, int width, int height) {
- int i;
-
- vp9_de_alloc_frame_buffers(oci);
-
- /* our internal buffers are always multiples of 16 */
- if ((width & 0xf) != 0)
- width += 16 - (width & 0xf);
-
- if ((height & 0xf) != 0)
- height += 16 - (height & 0xf);
-
-
- for (i = 0; i < NUM_YV12_BUFFERS; i++) {
- oci->fb_idx_ref_cnt[i] = 0;
- oci->yv12_fb[i].flags = 0;
- if (vp8_yv12_alloc_frame_buffer(&oci->yv12_fb[i], width, height, VP8BORDERINPIXELS) < 0) {
- vp9_de_alloc_frame_buffers(oci);
- return 1;
- }
- }
-
- oci->new_fb_idx = 0;
- oci->lst_fb_idx = 1;
- oci->gld_fb_idx = 2;
- oci->alt_fb_idx = 3;
-
- oci->fb_idx_ref_cnt[0] = 1;
- oci->fb_idx_ref_cnt[1] = 1;
- oci->fb_idx_ref_cnt[2] = 1;
- oci->fb_idx_ref_cnt[3] = 1;
-
- if (vp8_yv12_alloc_frame_buffer(&oci->temp_scale_frame, width, 16, VP8BORDERINPIXELS) < 0) {
- vp9_de_alloc_frame_buffers(oci);
- return 1;
- }
-
- if (vp8_yv12_alloc_frame_buffer(&oci->post_proc_buffer, width, height, VP8BORDERINPIXELS) < 0) {
- vp9_de_alloc_frame_buffers(oci);
- return 1;
- }
-
- oci->mb_rows = height >> 4;
- oci->mb_cols = width >> 4;
- oci->MBs = oci->mb_rows * oci->mb_cols;
- oci->mode_info_stride = oci->mb_cols + 1;
- oci->mip = vpx_calloc((oci->mb_cols + 1) * (oci->mb_rows + 1), sizeof(MODE_INFO));
-
- if (!oci->mip) {
- vp9_de_alloc_frame_buffers(oci);
- return 1;
- }
-
- oci->mi = oci->mip + oci->mode_info_stride + 1;
-
- /* allocate memory for last frame MODE_INFO array */
-
- oci->prev_mip = vpx_calloc((oci->mb_cols + 1) * (oci->mb_rows + 1), sizeof(MODE_INFO));
-
- if (!oci->prev_mip) {
- vp9_de_alloc_frame_buffers(oci);
- return 1;
- }
-
- oci->prev_mi = oci->prev_mip + oci->mode_info_stride + 1;
-
- oci->above_context = vpx_calloc(sizeof(ENTROPY_CONTEXT_PLANES) * oci->mb_cols, 1);
-
- if (!oci->above_context) {
- vp9_de_alloc_frame_buffers(oci);
- return 1;
- }
-
- vp9_update_mode_info_border(oci, oci->mip);
- vp9_update_mode_info_in_image(oci, oci->mi);
-
- return 0;
-}
-void vp9_setup_version(VP9_COMMON *cm) {
- if (cm->version & 0x4) {
- if (!CONFIG_EXPERIMENTAL)
- vpx_internal_error(&cm->error, VPX_CODEC_UNSUP_BITSTREAM,
- "Bitstream was created by an experimental "
- "encoder");
- cm->experimental = 1;
- }
-
- switch (cm->version & 0x3) {
- case 0:
- cm->no_lpf = 0;
- cm->filter_type = NORMAL_LOOPFILTER;
- cm->use_bilinear_mc_filter = 0;
- cm->full_pixel = 0;
- break;
- case 1:
- cm->no_lpf = 0;
- cm->filter_type = SIMPLE_LOOPFILTER;
- cm->use_bilinear_mc_filter = 1;
- cm->full_pixel = 0;
- break;
- case 2:
- case 3:
- cm->no_lpf = 1;
- cm->filter_type = NORMAL_LOOPFILTER;
- cm->use_bilinear_mc_filter = 1;
- cm->full_pixel = 0;
- break;
- // Full pel only code deprecated in experimental code base
- // case 3:
- // cm->no_lpf = 1;
- // cm->filter_type = SIMPLE_LOOPFILTER;
- // cm->use_bilinear_mc_filter = 1;
- // cm->full_pixel = 1;
- // break;
- }
-}
-void vp9_create_common(VP9_COMMON *oci) {
- vp9_machine_specific_config(oci);
-
- vp9_init_mbmode_probs(oci);
-
- vp9_default_bmode_probs(oci->fc.bmode_prob);
-
- oci->txfm_mode = ONLY_4X4;
- oci->mb_no_coeff_skip = 1;
- oci->comp_pred_mode = HYBRID_PREDICTION;
- oci->no_lpf = 0;
- oci->filter_type = NORMAL_LOOPFILTER;
- oci->use_bilinear_mc_filter = 0;
- oci->full_pixel = 0;
- oci->clr_type = REG_YUV;
- oci->clamp_type = RECON_CLAMP_REQUIRED;
-
- /* Initialise reference frame sign bias structure to defaults */
- vpx_memset(oci->ref_frame_sign_bias, 0, sizeof(oci->ref_frame_sign_bias));
-
- /* Default disable buffer to buffer copying */
- oci->copy_buffer_to_gf = 0;
- oci->copy_buffer_to_arf = 0;
- oci->kf_ymode_probs_update = 0;
-}
-
-void vp9_remove_common(VP9_COMMON *oci) {
- vp9_de_alloc_frame_buffers(oci);
-}
-
-void vp9_initialize_common() {
- vp9_coef_tree_initialize();
-
- vp9_entropy_mode_init();
-
- vp9_entropy_mv_init();
-}
--- a/vp8/common/alloccommon.h
+++ /dev/null
@@ -1,26 +1,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#ifndef __INC_ALLOCCOMMON_H
-#define __INC_ALLOCCOMMON_H
-
-#include "onyxc_int.h"
-
-void vp9_create_common(VP9_COMMON *oci);
-void vp9_remove_common(VP9_COMMON *oci);
-void vp9_de_alloc_frame_buffers(VP9_COMMON *oci);
-int vp9_alloc_frame_buffers(VP9_COMMON *oci, int width, int height);
-void vp9_setup_version(VP9_COMMON *oci);
-
-void vp9_update_mode_info_border(VP9_COMMON *cpi, MODE_INFO *mi_base);
-void vp9_update_mode_info_in_image(VP9_COMMON *cpi, MODE_INFO *mi);
-
-#endif
--- a/vp8/common/arm/arm_systemdependent.c
+++ /dev/null
@@ -1,92 +1,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include "vpx_ports/config.h"
-#include "vpx_ports/arm.h"
-#include "vp8/common/pragmas.h"
-#include "vp8/common/subpixel.h"
-#include "vp8/common/loopfilter.h"
-#include "vp8/common/recon.h"
-#include "vp8/common/idct.h"
-#include "vp8/common/onyxc_int.h"
-
-void vp9_arch_arm_common_init(VP9_COMMON *ctx) {
-#if CONFIG_RUNTIME_CPU_DETECT
- VP9_COMMON_RTCD *rtcd = &ctx->rtcd;
- int flags = arm_cpu_caps();
- rtcd->flags = flags;
-
- /* Override default functions with fastest ones for this CPU. */
-#if HAVE_ARMV5TE
- if (flags & HAS_EDSP) {
- }
-#endif
-
-// The commented functions need to be re-written for vpx.
-#if HAVE_ARMV6
- if (flags & HAS_MEDIA) {
- rtcd->subpix.sixtap16x16 = vp9_sixtap_predict16x16_armv6;
- rtcd->subpix.sixtap8x8 = vp9_sixtap_predict8x8_armv6;
- rtcd->subpix.sixtap8x4 = vp9_sixtap_predict8x4_armv6;
- rtcd->subpix.sixtap4x4 = vp9_sixtap_predict_armv6;
-
- rtcd->subpix.bilinear16x16 = vp9_bilinear_predict16x16_armv6;
- rtcd->subpix.bilinear8x8 = vp9_bilinear_predict8x8_armv6;
- rtcd->subpix.bilinear8x4 = vp9_bilinear_predict8x4_armv6;
- rtcd->subpix.bilinear4x4 = vp9_bilinear_predict4x4_armv6;
-
- // rtcd->idct.idct1 = vp9_short_idct4x4llm_1_v6;
- // rtcd->idct.idct16 = vp9_short_idct4x4llm_v6_dual;
- // rtcd->idct.iwalsh1 = vp9_short_inv_walsh4x4_1_v6;
- // rtcd->idct.iwalsh16 = vp9_short_inv_walsh4x4_v6;
-
- rtcd->recon.copy16x16 = vp9_copy_mem16x16_v6;
- rtcd->recon.copy8x8 = vp9_copy_mem8x8_v6;
- rtcd->recon.copy8x4 = vp9_copy_mem8x4_v6;
- rtcd->recon.recon = vp9_recon_b_armv6;
- rtcd->recon.recon2 = vp9_recon2b_armv6;
- rtcd->recon.recon4 = vp9_recon4b_armv6;
- }
-#endif
-
-#if HAVE_ARMV7
- if (flags & HAS_NEON) {
- rtcd->subpix.sixtap16x16 = vp9_sixtap_predict16x16_neon;
- rtcd->subpix.sixtap8x8 = vp9_sixtap_predict8x8_neon;
- rtcd->subpix.sixtap8x4 = vp9_sixtap_predict8x4_neon;
- rtcd->subpix.sixtap4x4 = vp9_sixtap_predict_neon;
-
- rtcd->subpix.bilinear16x16 = vp9_bilinear_predict16x16_neon;
- rtcd->subpix.bilinear8x8 = vp9_bilinear_predict8x8_neon;
- rtcd->subpix.bilinear8x4 = vp9_bilinear_predict8x4_neon;
- rtcd->subpix.bilinear4x4 = vp9_bilinear_predict4x4_neon;
-
- // rtcd->idct.idct1 = vp9_short_idct4x4llm_1_neon;
- // rtcd->idct.idct16 = vp9_short_idct4x4llm_neon;
- // rtcd->idct.iwalsh1 = vp9_short_inv_walsh4x4_1_neon;
- // rtcd->idct.iwalsh16 = vp9_short_inv_walsh4x4_neon;
-
- rtcd->recon.copy16x16 = vp9_copy_mem16x16_neon;
- rtcd->recon.copy8x8 = vp9_copy_mem8x8_neon;
- rtcd->recon.copy8x4 = vp9_copy_mem8x4_neon;
- rtcd->recon.recon = vp9_recon_b_neon;
- rtcd->recon.recon2 = vp9_recon2b_neon;
- rtcd->recon.recon4 = vp9_recon4b_neon;
- rtcd->recon.recon_mb = vp9_recon_mb_neon;
- rtcd->recon.build_intra_predictors_mby =
- vp9_build_intra_predictors_mby_neon;
- rtcd->recon.build_intra_predictors_mby_s =
- vp9_build_intra_predictors_mby_s_neon;
- }
-#endif
-
-#endif
-}
--- a/vp8/common/arm/armv6/bilinearfilter_v6.asm
+++ /dev/null
@@ -1,237 +1,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
- EXPORT |vp9_filter_block2d_bil_first_pass_armv6|
- EXPORT |vp9_filter_block2d_bil_second_pass_armv6|
-
- AREA |.text|, CODE, READONLY ; name this block of code
-
-;-------------------------------------
-; r0 unsigned char *src_ptr,
-; r1 unsigned short *dst_ptr,
-; r2 unsigned int src_pitch,
-; r3 unsigned int height,
-; stack unsigned int width,
-; stack const short *vp9_filter
-;-------------------------------------
-; The output is transposed stroed in output array to make it easy for second pass filtering.
-|vp9_filter_block2d_bil_first_pass_armv6| PROC
- stmdb sp!, {r4 - r11, lr}
-
- ldr r11, [sp, #40] ; vp9_filter address
- ldr r4, [sp, #36] ; width
-
- mov r12, r3 ; outer-loop counter
-
- add r7, r2, r4 ; preload next row
- pld [r0, r7]
-
- sub r2, r2, r4 ; src increment for height loop
-
- ldr r5, [r11] ; load up filter coefficients
-
- mov r3, r3, lsl #1 ; height*2
- add r3, r3, #2 ; plus 2 to make output buffer 4-bit aligned since height is actually (height+1)
-
- mov r11, r1 ; save dst_ptr for each row
-
- cmp r5, #128 ; if filter coef = 128, then skip the filter
- beq bil_null_1st_filter
-
-|bil_height_loop_1st_v6|
- ldrb r6, [r0] ; load source data
- ldrb r7, [r0, #1]
- ldrb r8, [r0, #2]
- mov lr, r4, lsr #2 ; 4-in-parellel loop counter
-
-|bil_width_loop_1st_v6|
- ldrb r9, [r0, #3]
- ldrb r10, [r0, #4]
-
- pkhbt r6, r6, r7, lsl #16 ; src[1] | src[0]
- pkhbt r7, r7, r8, lsl #16 ; src[2] | src[1]
-
- smuad r6, r6, r5 ; apply the filter
- pkhbt r8, r8, r9, lsl #16 ; src[3] | src[2]
- smuad r7, r7, r5
- pkhbt r9, r9, r10, lsl #16 ; src[4] | src[3]
-
- smuad r8, r8, r5
- smuad r9, r9, r5
-
- add r0, r0, #4
- subs lr, lr, #1
-
- add r6, r6, #0x40 ; round_shift_and_clamp
- add r7, r7, #0x40
- usat r6, #16, r6, asr #7
- usat r7, #16, r7, asr #7
-
- strh r6, [r1], r3 ; result is transposed and stored
-
- add r8, r8, #0x40 ; round_shift_and_clamp
- strh r7, [r1], r3
- add r9, r9, #0x40
- usat r8, #16, r8, asr #7
- usat r9, #16, r9, asr #7
-
- strh r8, [r1], r3 ; result is transposed and stored
-
- ldrneb r6, [r0] ; load source data
- strh r9, [r1], r3
-
- ldrneb r7, [r0, #1]
- ldrneb r8, [r0, #2]
-
- bne bil_width_loop_1st_v6
-
- add r0, r0, r2 ; move to next input row
- subs r12, r12, #1
-
- add r9, r2, r4, lsl #1 ; adding back block width
- pld [r0, r9] ; preload next row
-
- add r11, r11, #2 ; move over to next column
- mov r1, r11
-
- bne bil_height_loop_1st_v6
-
- ldmia sp!, {r4 - r11, pc}
-
-|bil_null_1st_filter|
-|bil_height_loop_null_1st|
- mov lr, r4, lsr #2 ; loop counter
-
-|bil_width_loop_null_1st|
- ldrb r6, [r0] ; load data
- ldrb r7, [r0, #1]
- ldrb r8, [r0, #2]
- ldrb r9, [r0, #3]
-
- strh r6, [r1], r3 ; store it to immediate buffer
- add r0, r0, #4
- strh r7, [r1], r3
- subs lr, lr, #1
- strh r8, [r1], r3
- strh r9, [r1], r3
-
- bne bil_width_loop_null_1st
-
- subs r12, r12, #1
- add r0, r0, r2 ; move to next input line
- add r11, r11, #2 ; move over to next column
- mov r1, r11
-
- bne bil_height_loop_null_1st
-
- ldmia sp!, {r4 - r11, pc}
-
- ENDP ; |vp9_filter_block2d_bil_first_pass_armv6|
-
-
-;---------------------------------
-; r0 unsigned short *src_ptr,
-; r1 unsigned char *dst_ptr,
-; r2 int dst_pitch,
-; r3 unsigned int height,
-; stack unsigned int width,
-; stack const short *vp9_filter
-;---------------------------------
-|vp9_filter_block2d_bil_second_pass_armv6| PROC
- stmdb sp!, {r4 - r11, lr}
-
- ldr r11, [sp, #40] ; vp9_filter address
- ldr r4, [sp, #36] ; width
-
- ldr r5, [r11] ; load up filter coefficients
- mov r12, r4 ; outer-loop counter = width, since we work on transposed data matrix
- mov r11, r1
-
- cmp r5, #128 ; if filter coef = 128, then skip the filter
- beq bil_null_2nd_filter
-
-|bil_height_loop_2nd|
- ldr r6, [r0] ; load the data
- ldr r8, [r0, #4]
- ldrh r10, [r0, #8]
- mov lr, r3, lsr #2 ; loop counter
-
-|bil_width_loop_2nd|
- pkhtb r7, r6, r8 ; src[1] | src[2]
- pkhtb r9, r8, r10 ; src[3] | src[4]
-
- smuad r6, r6, r5 ; apply filter
- smuad r8, r8, r5 ; apply filter
-
- subs lr, lr, #1
-
- smuadx r7, r7, r5 ; apply filter
- smuadx r9, r9, r5 ; apply filter
-
- add r0, r0, #8
-
- add r6, r6, #0x40 ; round_shift_and_clamp
- add r7, r7, #0x40
- usat r6, #8, r6, asr #7
- usat r7, #8, r7, asr #7
- strb r6, [r1], r2 ; the result is transposed back and stored
-
- add r8, r8, #0x40 ; round_shift_and_clamp
- strb r7, [r1], r2
- add r9, r9, #0x40
- usat r8, #8, r8, asr #7
- usat r9, #8, r9, asr #7
- strb r8, [r1], r2 ; the result is transposed back and stored
-
- ldrne r6, [r0] ; load data
- strb r9, [r1], r2
- ldrne r8, [r0, #4]
- ldrneh r10, [r0, #8]
-
- bne bil_width_loop_2nd
-
- subs r12, r12, #1
- add r0, r0, #4 ; update src for next row
- add r11, r11, #1
- mov r1, r11
-
- bne bil_height_loop_2nd
- ldmia sp!, {r4 - r11, pc}
-
-|bil_null_2nd_filter|
-|bil_height_loop_null_2nd|
- mov lr, r3, lsr #2
-
-|bil_width_loop_null_2nd|
- ldr r6, [r0], #4 ; load data
- subs lr, lr, #1
- ldr r8, [r0], #4
-
- strb r6, [r1], r2 ; store data
- mov r7, r6, lsr #16
- strb r7, [r1], r2
- mov r9, r8, lsr #16
- strb r8, [r1], r2
- strb r9, [r1], r2
-
- bne bil_width_loop_null_2nd
-
- subs r12, r12, #1
- add r0, r0, #4
- add r11, r11, #1
- mov r1, r11
-
- bne bil_height_loop_null_2nd
-
- ldmia sp!, {r4 - r11, pc}
- ENDP ; |vp9_filter_block2d_second_pass_armv6|
-
- END
--- a/vp8/common/arm/armv6/copymem16x16_v6.asm
+++ /dev/null
@@ -1,186 +1,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
- EXPORT |vp9_copy_mem16x16_v6|
- ; ARM
- ; REQUIRE8
- ; PRESERVE8
-
- AREA Block, CODE, READONLY ; name this block of code
-;void copy_mem16x16_v6( unsigned char *src, int src_stride, unsigned char *dst, int dst_stride)
-;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
-|vp9_copy_mem16x16_v6| PROC
- stmdb sp!, {r4 - r7}
- ;push {r4-r7}
-
- ;preload
- pld [r0, #31] ; preload for next 16x16 block
-
- ands r4, r0, #15
- beq copy_mem16x16_fast
-
- ands r4, r0, #7
- beq copy_mem16x16_8
-
- ands r4, r0, #3
- beq copy_mem16x16_4
-
- ;copy one byte each time
- ldrb r4, [r0]
- ldrb r5, [r0, #1]
- ldrb r6, [r0, #2]
- ldrb r7, [r0, #3]
-
- mov r12, #16
-
-copy_mem16x16_1_loop
- strb r4, [r2]
- strb r5, [r2, #1]
- strb r6, [r2, #2]
- strb r7, [r2, #3]
-
- ldrb r4, [r0, #4]
- ldrb r5, [r0, #5]
- ldrb r6, [r0, #6]
- ldrb r7, [r0, #7]
-
- subs r12, r12, #1
-
- strb r4, [r2, #4]
- strb r5, [r2, #5]
- strb r6, [r2, #6]
- strb r7, [r2, #7]
-
- ldrb r4, [r0, #8]
- ldrb r5, [r0, #9]
- ldrb r6, [r0, #10]
- ldrb r7, [r0, #11]
-
- strb r4, [r2, #8]
- strb r5, [r2, #9]
- strb r6, [r2, #10]
- strb r7, [r2, #11]
-
- ldrb r4, [r0, #12]
- ldrb r5, [r0, #13]
- ldrb r6, [r0, #14]
- ldrb r7, [r0, #15]
-
- add r0, r0, r1
-
- strb r4, [r2, #12]
- strb r5, [r2, #13]
- strb r6, [r2, #14]
- strb r7, [r2, #15]
-
- add r2, r2, r3
-
- ldrneb r4, [r0]
- ldrneb r5, [r0, #1]
- ldrneb r6, [r0, #2]
- ldrneb r7, [r0, #3]
-
- pld [r0, #31] ; preload for next 16x16 block
-
- bne copy_mem16x16_1_loop
-
- ldmia sp!, {r4 - r7}
- ;pop {r4-r7}
- mov pc, lr
-
-;copy 4 bytes each time
-copy_mem16x16_4
- ldr r4, [r0]
- ldr r5, [r0, #4]
- ldr r6, [r0, #8]
- ldr r7, [r0, #12]
-
- mov r12, #16
-
-copy_mem16x16_4_loop
- subs r12, r12, #1
- add r0, r0, r1
-
- str r4, [r2]
- str r5, [r2, #4]
- str r6, [r2, #8]
- str r7, [r2, #12]
-
- add r2, r2, r3
-
- ldrne r4, [r0]
- ldrne r5, [r0, #4]
- ldrne r6, [r0, #8]
- ldrne r7, [r0, #12]
-
- pld [r0, #31] ; preload for next 16x16 block
-
- bne copy_mem16x16_4_loop
-
- ldmia sp!, {r4 - r7}
- ;pop {r4-r7}
- mov pc, lr
-
-;copy 8 bytes each time
-copy_mem16x16_8
- sub r1, r1, #16
- sub r3, r3, #16
-
- mov r12, #16
-
-copy_mem16x16_8_loop
- ldmia r0!, {r4-r5}
- ;ldm r0, {r4-r5}
- ldmia r0!, {r6-r7}
-
- add r0, r0, r1
-
- stmia r2!, {r4-r5}
- subs r12, r12, #1
- ;stm r2, {r4-r5}
- stmia r2!, {r6-r7}
-
- add r2, r2, r3
-
- pld [r0, #31] ; preload for next 16x16 block
- bne copy_mem16x16_8_loop
-
- ldmia sp!, {r4 - r7}
- ;pop {r4-r7}
- mov pc, lr
-
-;copy 16 bytes each time
-copy_mem16x16_fast
- ;sub r1, r1, #16
- ;sub r3, r3, #16
-
- mov r12, #16
-
-copy_mem16x16_fast_loop
- ldmia r0, {r4-r7}
- ;ldm r0, {r4-r7}
- add r0, r0, r1
-
- subs r12, r12, #1
- stmia r2, {r4-r7}
- ;stm r2, {r4-r7}
- add r2, r2, r3
-
- pld [r0, #31] ; preload for next 16x16 block
- bne copy_mem16x16_fast_loop
-
- ldmia sp!, {r4 - r7}
- ;pop {r4-r7}
- mov pc, lr
-
- ENDP ; |vp9_copy_mem16x16_v6|
-
- END
--- a/vp8/common/arm/armv6/copymem8x4_v6.asm
+++ /dev/null
@@ -1,128 +1,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
- EXPORT |vp9_copy_mem8x4_v6|
- ; ARM
- ; REQUIRE8
- ; PRESERVE8
-
- AREA Block, CODE, READONLY ; name this block of code
-;void vp9_copy_mem8x4_v6( unsigned char *src, int src_stride, unsigned char *dst, int dst_stride)
-;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
-|vp9_copy_mem8x4_v6| PROC
- ;push {r4-r5}
- stmdb sp!, {r4-r5}
-
- ;preload
- pld [r0]
- pld [r0, r1]
- pld [r0, r1, lsl #1]
-
- ands r4, r0, #7
- beq copy_mem8x4_fast
-
- ands r4, r0, #3
- beq copy_mem8x4_4
-
- ;copy 1 byte each time
- ldrb r4, [r0]
- ldrb r5, [r0, #1]
-
- mov r12, #4
-
-copy_mem8x4_1_loop
- strb r4, [r2]
- strb r5, [r2, #1]
-
- ldrb r4, [r0, #2]
- ldrb r5, [r0, #3]
-
- subs r12, r12, #1
-
- strb r4, [r2, #2]
- strb r5, [r2, #3]
-
- ldrb r4, [r0, #4]
- ldrb r5, [r0, #5]
-
- strb r4, [r2, #4]
- strb r5, [r2, #5]
-
- ldrb r4, [r0, #6]
- ldrb r5, [r0, #7]
-
- add r0, r0, r1
-
- strb r4, [r2, #6]
- strb r5, [r2, #7]
-
- add r2, r2, r3
-
- ldrneb r4, [r0]
- ldrneb r5, [r0, #1]
-
- bne copy_mem8x4_1_loop
-
- ldmia sp!, {r4 - r5}
- ;pop {r4-r5}
- mov pc, lr
-
-;copy 4 bytes each time
-copy_mem8x4_4
- ldr r4, [r0]
- ldr r5, [r0, #4]
-
- mov r12, #4
-
-copy_mem8x4_4_loop
- subs r12, r12, #1
- add r0, r0, r1
-
- str r4, [r2]
- str r5, [r2, #4]
-
- add r2, r2, r3
-
- ldrne r4, [r0]
- ldrne r5, [r0, #4]
-
- bne copy_mem8x4_4_loop
-
- ldmia sp!, {r4-r5}
- ;pop {r4-r5}
- mov pc, lr
-
-;copy 8 bytes each time
-copy_mem8x4_fast
- ;sub r1, r1, #8
- ;sub r3, r3, #8
-
- mov r12, #4
-
-copy_mem8x4_fast_loop
- ldmia r0, {r4-r5}
- ;ldm r0, {r4-r5}
- add r0, r0, r1
-
- subs r12, r12, #1
- stmia r2, {r4-r5}
- ;stm r2, {r4-r5}
- add r2, r2, r3
-
- bne copy_mem8x4_fast_loop
-
- ldmia sp!, {r4-r5}
- ;pop {r4-r5}
- mov pc, lr
-
- ENDP ; |vp9_copy_mem8x4_v6|
-
- END
--- a/vp8/common/arm/armv6/copymem8x8_v6.asm
+++ /dev/null
@@ -1,128 +1,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
- EXPORT |vp9_copy_mem8x8_v6|
- ; ARM
- ; REQUIRE8
- ; PRESERVE8
-
- AREA Block, CODE, READONLY ; name this block of code
-;void copy_mem8x8_v6( unsigned char *src, int src_stride, unsigned char *dst, int dst_stride)
-;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
-|vp9_copy_mem8x8_v6| PROC
- ;push {r4-r5}
- stmdb sp!, {r4-r5}
-
- ;preload
- pld [r0]
- pld [r0, r1]
- pld [r0, r1, lsl #1]
-
- ands r4, r0, #7
- beq copy_mem8x8_fast
-
- ands r4, r0, #3
- beq copy_mem8x8_4
-
- ;copy 1 byte each time
- ldrb r4, [r0]
- ldrb r5, [r0, #1]
-
- mov r12, #8
-
-copy_mem8x8_1_loop
- strb r4, [r2]
- strb r5, [r2, #1]
-
- ldrb r4, [r0, #2]
- ldrb r5, [r0, #3]
-
- subs r12, r12, #1
-
- strb r4, [r2, #2]
- strb r5, [r2, #3]
-
- ldrb r4, [r0, #4]
- ldrb r5, [r0, #5]
-
- strb r4, [r2, #4]
- strb r5, [r2, #5]
-
- ldrb r4, [r0, #6]
- ldrb r5, [r0, #7]
-
- add r0, r0, r1
-
- strb r4, [r2, #6]
- strb r5, [r2, #7]
-
- add r2, r2, r3
-
- ldrneb r4, [r0]
- ldrneb r5, [r0, #1]
-
- bne copy_mem8x8_1_loop
-
- ldmia sp!, {r4 - r5}
- ;pop {r4-r5}
- mov pc, lr
-
-;copy 4 bytes each time
-copy_mem8x8_4
- ldr r4, [r0]
- ldr r5, [r0, #4]
-
- mov r12, #8
-
-copy_mem8x8_4_loop
- subs r12, r12, #1
- add r0, r0, r1
-
- str r4, [r2]
- str r5, [r2, #4]
-
- add r2, r2, r3
-
- ldrne r4, [r0]
- ldrne r5, [r0, #4]
-
- bne copy_mem8x8_4_loop
-
- ldmia sp!, {r4 - r5}
- ;pop {r4-r5}
- mov pc, lr
-
-;copy 8 bytes each time
-copy_mem8x8_fast
- ;sub r1, r1, #8
- ;sub r3, r3, #8
-
- mov r12, #8
-
-copy_mem8x8_fast_loop
- ldmia r0, {r4-r5}
- ;ldm r0, {r4-r5}
- add r0, r0, r1
-
- subs r12, r12, #1
- stmia r2, {r4-r5}
- ;stm r2, {r4-r5}
- add r2, r2, r3
-
- bne copy_mem8x8_fast_loop
-
- ldmia sp!, {r4-r5}
- ;pop {r4-r5}
- mov pc, lr
-
- ENDP ; |vp9_copy_mem8x8_v6|
-
- END
--- a/vp8/common/arm/armv6/dc_only_idct_add_v6.asm
+++ /dev/null
@@ -1,67 +1,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license and patent
-; grant that can be found in the LICENSE file in the root of the source
-; tree. All contributing project authors may be found in the AUTHORS
-; file in the root of the source tree.
-;
-
- EXPORT |vp8_dc_only_idct_add_v6|
-
- AREA |.text|, CODE, READONLY
-
-;void vp8_dc_only_idct_add_v6(short input_dc, unsigned char *pred_ptr,
-; unsigned char *dst_ptr, int pitch, int stride)
-; r0 input_dc
-; r1 pred_ptr
-; r2 dest_ptr
-; r3 pitch
-; sp stride
-
-|vp8_dc_only_idct_add_v6| PROC
- stmdb sp!, {r4 - r7, lr}
-
- add r0, r0, #4 ; input_dc += 4
- ldr r12, c0x0000FFFF
- ldr r4, [r1], r3
- ldr r6, [r1], r3
- and r0, r12, r0, asr #3 ; input_dc >> 3 + mask
- ldr lr, [sp, #20]
- orr r0, r0, r0, lsl #16 ; a1 | a1
-
- uxtab16 r5, r0, r4 ; a1+2 | a1+0
- uxtab16 r4, r0, r4, ror #8 ; a1+3 | a1+1
- uxtab16 r7, r0, r6
- uxtab16 r6, r0, r6, ror #8
- usat16 r5, #8, r5
- usat16 r4, #8, r4
- usat16 r7, #8, r7
- usat16 r6, #8, r6
- orr r5, r5, r4, lsl #8
- orr r7, r7, r6, lsl #8
- ldr r4, [r1], r3
- ldr r6, [r1]
- str r5, [r2], lr
- str r7, [r2], lr
-
- uxtab16 r5, r0, r4
- uxtab16 r4, r0, r4, ror #8
- uxtab16 r7, r0, r6
- uxtab16 r6, r0, r6, ror #8
- usat16 r5, #8, r5
- usat16 r4, #8, r4
- usat16 r7, #8, r7
- usat16 r6, #8, r6
- orr r5, r5, r4, lsl #8
- orr r7, r7, r6, lsl #8
- str r5, [r2], lr
- str r7, [r2]
-
- ldmia sp!, {r4 - r7, pc}
-
- ENDP ; |vp8_dc_only_idct_add_v6|
-
-; Constant Pool
-c0x0000FFFF DCD 0x0000FFFF
- END
--- a/vp8/common/arm/armv6/filter_v6.asm
+++ /dev/null
@@ -1,624 +1,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
- EXPORT |vp9_filter_block2d_first_pass_armv6|
- EXPORT |vp9_filter_block2d_first_pass_16x16_armv6|
- EXPORT |vp9_filter_block2d_first_pass_8x8_armv6|
- EXPORT |vp9_filter_block2d_second_pass_armv6|
- EXPORT |vp9_filter4_block2d_second_pass_armv6|
- EXPORT |vp9_filter_block2d_first_pass_only_armv6|
- EXPORT |vp9_filter_block2d_second_pass_only_armv6|
-
- AREA |.text|, CODE, READONLY ; name this block of code
-;-------------------------------------
-; r0 unsigned char *src_ptr
-; r1 short *output_ptr
-; r2 unsigned int src_pixels_per_line
-; r3 unsigned int output_width
-; stack unsigned int output_height
-; stack const short *vp9_filter
-;-------------------------------------
-; vp9_filter the input and put in the output array. Apply the 6 tap FIR filter with
-; the output being a 2 byte value and the intput being a 1 byte value.
-|vp9_filter_block2d_first_pass_armv6| PROC
- stmdb sp!, {r4 - r11, lr}
-
- ldr r11, [sp, #40] ; vp9_filter address
- ldr r7, [sp, #36] ; output height
-
- sub r2, r2, r3 ; inside loop increments input array,
- ; so the height loop only needs to add
- ; r2 - width to the input pointer
-
- mov r3, r3, lsl #1 ; multiply width by 2 because using shorts
- add r12, r3, #16 ; square off the output
- sub sp, sp, #4
-
- ldr r4, [r11] ; load up packed filter coefficients
- ldr r5, [r11, #4]
- ldr r6, [r11, #8]
-
- str r1, [sp] ; push destination to stack
- mov r7, r7, lsl #16 ; height is top part of counter
-
-; six tap filter
-|height_loop_1st_6|
- ldrb r8, [r0, #-2] ; load source data
- ldrb r9, [r0, #-1]
- ldrb r10, [r0], #2
- orr r7, r7, r3, lsr #2 ; construct loop counter
-
-|width_loop_1st_6|
- ldrb r11, [r0, #-1]
-
- pkhbt lr, r8, r9, lsl #16 ; r9 | r8
- pkhbt r8, r9, r10, lsl #16 ; r10 | r9
-
- ldrb r9, [r0]
-
- smuad lr, lr, r4 ; apply the filter
- pkhbt r10, r10, r11, lsl #16 ; r11 | r10
- smuad r8, r8, r4
- pkhbt r11, r11, r9, lsl #16 ; r9 | r11
-
- smlad lr, r10, r5, lr
- ldrb r10, [r0, #1]
- smlad r8, r11, r5, r8
- ldrb r11, [r0, #2]
-
- sub r7, r7, #1
-
- pkhbt r9, r9, r10, lsl #16 ; r10 | r9
- pkhbt r10, r10, r11, lsl #16 ; r11 | r10
-
- smlad lr, r9, r6, lr
- smlad r11, r10, r6, r8
-
- ands r10, r7, #0xff ; test loop counter
-
- add lr, lr, #0x40 ; round_shift_and_clamp
- ldrneb r8, [r0, #-2] ; load data for next loop
- usat lr, #8, lr, asr #7
- add r11, r11, #0x40
- ldrneb r9, [r0, #-1]
- usat r11, #8, r11, asr #7
-
- strh lr, [r1], r12 ; result is transposed and stored, which
- ; will make second pass filtering easier.
- ldrneb r10, [r0], #2
- strh r11, [r1], r12
-
- bne width_loop_1st_6
-
- ldr r1, [sp] ; load and update dst address
- subs r7, r7, #0x10000
- add r0, r0, r2 ; move to next input line
-
- add r1, r1, #2 ; move over to next column
- str r1, [sp]
-
- bne height_loop_1st_6
-
- add sp, sp, #4
- ldmia sp!, {r4 - r11, pc}
-
- ENDP
-
-; --------------------------
-; 16x16 version
-; -----------------------------
-|vp9_filter_block2d_first_pass_16x16_armv6| PROC
- stmdb sp!, {r4 - r11, lr}
-
- ldr r11, [sp, #40] ; vp9_filter address
- ldr r7, [sp, #36] ; output height
-
- add r4, r2, #18 ; preload next low
- pld [r0, r4]
-
- sub r2, r2, r3 ; inside loop increments input array,
- ; so the height loop only needs to add
- ; r2 - width to the input pointer
-
- mov r3, r3, lsl #1 ; multiply width by 2 because using shorts
- add r12, r3, #16 ; square off the output
- sub sp, sp, #4
-
- ldr r4, [r11] ; load up packed filter coefficients
- ldr r5, [r11, #4]
- ldr r6, [r11, #8]
-
- str r1, [sp] ; push destination to stack
- mov r7, r7, lsl #16 ; height is top part of counter
-
-; six tap filter
-|height_loop_1st_16_6|
- ldrb r8, [r0, #-2] ; load source data
- ldrb r9, [r0, #-1]
- ldrb r10, [r0], #2
- orr r7, r7, r3, lsr #2 ; construct loop counter
-
-|width_loop_1st_16_6|
- ldrb r11, [r0, #-1]
-
- pkhbt lr, r8, r9, lsl #16 ; r9 | r8
- pkhbt r8, r9, r10, lsl #16 ; r10 | r9
-
- ldrb r9, [r0]
-
- smuad lr, lr, r4 ; apply the filter
- pkhbt r10, r10, r11, lsl #16 ; r11 | r10
- smuad r8, r8, r4
- pkhbt r11, r11, r9, lsl #16 ; r9 | r11
-
- smlad lr, r10, r5, lr
- ldrb r10, [r0, #1]
- smlad r8, r11, r5, r8
- ldrb r11, [r0, #2]
-
- sub r7, r7, #1
-
- pkhbt r9, r9, r10, lsl #16 ; r10 | r9
- pkhbt r10, r10, r11, lsl #16 ; r11 | r10
-
- smlad lr, r9, r6, lr
- smlad r11, r10, r6, r8
-
- ands r10, r7, #0xff ; test loop counter
-
- add lr, lr, #0x40 ; round_shift_and_clamp
- ldrneb r8, [r0, #-2] ; load data for next loop
- usat lr, #8, lr, asr #7
- add r11, r11, #0x40
- ldrneb r9, [r0, #-1]
- usat r11, #8, r11, asr #7
-
- strh lr, [r1], r12 ; result is transposed and stored, which
- ; will make second pass filtering easier.
- ldrneb r10, [r0], #2
- strh r11, [r1], r12
-
- bne width_loop_1st_16_6
-
- ldr r1, [sp] ; load and update dst address
- subs r7, r7, #0x10000
- add r0, r0, r2 ; move to next input line
-
- add r11, r2, #34 ; adding back block width(=16)
- pld [r0, r11] ; preload next low
-
- add r1, r1, #2 ; move over to next column
- str r1, [sp]
-
- bne height_loop_1st_16_6
-
- add sp, sp, #4
- ldmia sp!, {r4 - r11, pc}
-
- ENDP
-
-; --------------------------
-; 8x8 version
-; -----------------------------
-|vp9_filter_block2d_first_pass_8x8_armv6| PROC
- stmdb sp!, {r4 - r11, lr}
-
- ldr r11, [sp, #40] ; vp9_filter address
- ldr r7, [sp, #36] ; output height
-
- add r4, r2, #10 ; preload next low
- pld [r0, r4]
-
- sub r2, r2, r3 ; inside loop increments input array,
- ; so the height loop only needs to add
- ; r2 - width to the input pointer
-
- mov r3, r3, lsl #1 ; multiply width by 2 because using shorts
- add r12, r3, #16 ; square off the output
- sub sp, sp, #4
-
- ldr r4, [r11] ; load up packed filter coefficients
- ldr r5, [r11, #4]
- ldr r6, [r11, #8]
-
- str r1, [sp] ; push destination to stack
- mov r7, r7, lsl #16 ; height is top part of counter
-
-; six tap filter
-|height_loop_1st_8_6|
- ldrb r8, [r0, #-2] ; load source data
- ldrb r9, [r0, #-1]
- ldrb r10, [r0], #2
- orr r7, r7, r3, lsr #2 ; construct loop counter
-
-|width_loop_1st_8_6|
- ldrb r11, [r0, #-1]
-
- pkhbt lr, r8, r9, lsl #16 ; r9 | r8
- pkhbt r8, r9, r10, lsl #16 ; r10 | r9
-
- ldrb r9, [r0]
-
- smuad lr, lr, r4 ; apply the filter
- pkhbt r10, r10, r11, lsl #16 ; r11 | r10
- smuad r8, r8, r4
- pkhbt r11, r11, r9, lsl #16 ; r9 | r11
-
- smlad lr, r10, r5, lr
- ldrb r10, [r0, #1]
- smlad r8, r11, r5, r8
- ldrb r11, [r0, #2]
-
- sub r7, r7, #1
-
- pkhbt r9, r9, r10, lsl #16 ; r10 | r9
- pkhbt r10, r10, r11, lsl #16 ; r11 | r10
-
- smlad lr, r9, r6, lr
- smlad r11, r10, r6, r8
-
- ands r10, r7, #0xff ; test loop counter
-
- add lr, lr, #0x40 ; round_shift_and_clamp
- ldrneb r8, [r0, #-2] ; load data for next loop
- usat lr, #8, lr, asr #7
- add r11, r11, #0x40
- ldrneb r9, [r0, #-1]
- usat r11, #8, r11, asr #7
-
- strh lr, [r1], r12 ; result is transposed and stored, which
- ; will make second pass filtering easier.
- ldrneb r10, [r0], #2
- strh r11, [r1], r12
-
- bne width_loop_1st_8_6
-
- ldr r1, [sp] ; load and update dst address
- subs r7, r7, #0x10000
- add r0, r0, r2 ; move to next input line
-
- add r11, r2, #18 ; adding back block width(=8)
- pld [r0, r11] ; preload next low
-
- add r1, r1, #2 ; move over to next column
- str r1, [sp]
-
- bne height_loop_1st_8_6
-
- add sp, sp, #4
- ldmia sp!, {r4 - r11, pc}
-
- ENDP
-
-;---------------------------------
-; r0 short *src_ptr,
-; r1 unsigned char *output_ptr,
-; r2 unsigned int output_pitch,
-; r3 unsigned int cnt,
-; stack const short *vp9_filter
-;---------------------------------
-|vp9_filter_block2d_second_pass_armv6| PROC
- stmdb sp!, {r4 - r11, lr}
-
- ldr r11, [sp, #36] ; vp9_filter address
- sub sp, sp, #4
- mov r7, r3, lsl #16 ; height is top part of counter
- str r1, [sp] ; push destination to stack
-
- ldr r4, [r11] ; load up packed filter coefficients
- ldr r5, [r11, #4]
- ldr r6, [r11, #8]
-
- pkhbt r12, r5, r4 ; pack the filter differently
- pkhbt r11, r6, r5
-
- sub r0, r0, #4 ; offset input buffer
-
-|height_loop_2nd|
- ldr r8, [r0] ; load the data
- ldr r9, [r0, #4]
- orr r7, r7, r3, lsr #1 ; loop counter
-
-|width_loop_2nd|
- smuad lr, r4, r8 ; apply filter
- sub r7, r7, #1
- smulbt r8, r4, r8
-
- ldr r10, [r0, #8]
-
- smlad lr, r5, r9, lr
- smladx r8, r12, r9, r8
-
- ldrh r9, [r0, #12]
-
- smlad lr, r6, r10, lr
- smladx r8, r11, r10, r8
-
- add r0, r0, #4
- smlatb r10, r6, r9, r8
-
- add lr, lr, #0x40 ; round_shift_and_clamp
- ands r8, r7, #0xff
- usat lr, #8, lr, asr #7
- add r10, r10, #0x40
- strb lr, [r1], r2 ; the result is transposed back and stored
- usat r10, #8, r10, asr #7
-
- ldrne r8, [r0] ; load data for next loop
- ldrne r9, [r0, #4]
- strb r10, [r1], r2
-
- bne width_loop_2nd
-
- ldr r1, [sp] ; update dst for next loop
- subs r7, r7, #0x10000
- add r0, r0, #16 ; updata src for next loop
- add r1, r1, #1
- str r1, [sp]
-
- bne height_loop_2nd
-
- add sp, sp, #4
- ldmia sp!, {r4 - r11, pc}
-
- ENDP
-
-;---------------------------------
-; r0 short *src_ptr,
-; r1 unsigned char *output_ptr,
-; r2 unsigned int output_pitch,
-; r3 unsigned int cnt,
-; stack const short *vp9_filter
-;---------------------------------
-|vp9_filter4_block2d_second_pass_armv6| PROC
- stmdb sp!, {r4 - r11, lr}
-
- ldr r11, [sp, #36] ; vp9_filter address
- mov r7, r3, lsl #16 ; height is top part of counter
-
- ldr r4, [r11] ; load up packed filter coefficients
- add lr, r1, r3 ; save final destination pointer
- ldr r5, [r11, #4]
- ldr r6, [r11, #8]
-
- pkhbt r12, r5, r4 ; pack the filter differently
- pkhbt r11, r6, r5
- mov r4, #0x40 ; rounding factor (for smlad{x})
-
-|height_loop_2nd_4|
- ldrd r8, [r0, #-4] ; load the data
- orr r7, r7, r3, lsr #1 ; loop counter
-
-|width_loop_2nd_4|
- ldr r10, [r0, #4]!
- smladx r6, r9, r12, r4 ; apply filter
- pkhbt r8, r9, r8
- smlad r5, r8, r12, r4
- pkhbt r8, r10, r9
- smladx r6, r10, r11, r6
- sub r7, r7, #1
- smlad r5, r8, r11, r5
-
- mov r8, r9 ; shift the data for the next loop
- mov r9, r10
-
- usat r6, #8, r6, asr #7 ; shift and clamp
- usat r5, #8, r5, asr #7
-
- strb r5, [r1], r2 ; the result is transposed back and stored
- tst r7, #0xff
- strb r6, [r1], r2
-
- bne width_loop_2nd_4
-
- subs r7, r7, #0x10000
- add r0, r0, #16 ; update src for next loop
- sub r1, lr, r7, lsr #16 ; update dst for next loop
-
- bne height_loop_2nd_4
-
- ldmia sp!, {r4 - r11, pc}
-
- ENDP
-
-;------------------------------------
-; r0 unsigned char *src_ptr
-; r1 unsigned char *output_ptr,
-; r2 unsigned int src_pixels_per_line
-; r3 unsigned int cnt,
-; stack unsigned int output_pitch,
-; stack const short *vp9_filter
-;------------------------------------
-|vp9_filter_block2d_first_pass_only_armv6| PROC
- stmdb sp!, {r4 - r11, lr}
-
- add r7, r2, r3 ; preload next low
- add r7, r7, #2
- pld [r0, r7]
-
- ldr r4, [sp, #36] ; output pitch
- ldr r11, [sp, #40] ; HFilter address
- sub sp, sp, #8
-
- mov r7, r3
- sub r2, r2, r3 ; inside loop increments input array,
- ; so the height loop only needs to add
- ; r2 - width to the input pointer
-
- sub r4, r4, r3
- str r4, [sp] ; save modified output pitch
- str r2, [sp, #4]
-
- mov r2, #0x40
-
- ldr r4, [r11] ; load up packed filter coefficients
- ldr r5, [r11, #4]
- ldr r6, [r11, #8]
-
-; six tap filter
-|height_loop_1st_only_6|
- ldrb r8, [r0, #-2] ; load data
- ldrb r9, [r0, #-1]
- ldrb r10, [r0], #2
-
- mov r12, r3, lsr #1 ; loop counter
-
-|width_loop_1st_only_6|
- ldrb r11, [r0, #-1]
-
- pkhbt lr, r8, r9, lsl #16 ; r9 | r8
- pkhbt r8, r9, r10, lsl #16 ; r10 | r9
-
- ldrb r9, [r0]
-
-;; smuad lr, lr, r4
- smlad lr, lr, r4, r2
- pkhbt r10, r10, r11, lsl #16 ; r11 | r10
-;; smuad r8, r8, r4
- smlad r8, r8, r4, r2
- pkhbt r11, r11, r9, lsl #16 ; r9 | r11
-
- smlad lr, r10, r5, lr
- ldrb r10, [r0, #1]
- smlad r8, r11, r5, r8
- ldrb r11, [r0, #2]
-
- subs r12, r12, #1
-
- pkhbt r9, r9, r10, lsl #16 ; r10 | r9
- pkhbt r10, r10, r11, lsl #16 ; r11 | r10
-
- smlad lr, r9, r6, lr
- smlad r10, r10, r6, r8
-
-;; add lr, lr, #0x40 ; round_shift_and_clamp
- ldrneb r8, [r0, #-2] ; load data for next loop
- usat lr, #8, lr, asr #7
-;; add r10, r10, #0x40
- strb lr, [r1], #1 ; store the result
- usat r10, #8, r10, asr #7
-
- ldrneb r9, [r0, #-1]
- strb r10, [r1], #1
- ldrneb r10, [r0], #2
-
- bne width_loop_1st_only_6
-
- ldr lr, [sp] ; load back output pitch
- ldr r12, [sp, #4] ; load back output pitch
- subs r7, r7, #1
- add r0, r0, r12 ; updata src for next loop
-
- add r11, r12, r3 ; preload next low
- add r11, r11, #2
- pld [r0, r11]
-
- add r1, r1, lr ; update dst for next loop
-
- bne height_loop_1st_only_6
-
- add sp, sp, #8
- ldmia sp!, {r4 - r11, pc}
- ENDP ; |vp9_filter_block2d_first_pass_only_armv6|
-
-
-;------------------------------------
-; r0 unsigned char *src_ptr,
-; r1 unsigned char *output_ptr,
-; r2 unsigned int src_pixels_per_line
-; r3 unsigned int cnt,
-; stack unsigned int output_pitch,
-; stack const short *vp9_filter
-;------------------------------------
-|vp9_filter_block2d_second_pass_only_armv6| PROC
- stmdb sp!, {r4 - r11, lr}
-
- ldr r11, [sp, #40] ; VFilter address
- ldr r12, [sp, #36] ; output pitch
-
- mov r7, r3, lsl #16 ; height is top part of counter
- sub r0, r0, r2, lsl #1 ; need 6 elements for filtering, 2 before, 3 after
-
- sub sp, sp, #8
-
- ldr r4, [r11] ; load up packed filter coefficients
- ldr r5, [r11, #4]
- ldr r6, [r11, #8]
-
- str r0, [sp] ; save r0 to stack
- str r1, [sp, #4] ; save dst to stack
-
-; six tap filter
-|width_loop_2nd_only_6|
- ldrb r8, [r0], r2 ; load data
- orr r7, r7, r3 ; loop counter
- ldrb r9, [r0], r2
- ldrb r10, [r0], r2
-
-|height_loop_2nd_only_6|
- ; filter first column in this inner loop, than, move to next colum.
- ldrb r11, [r0], r2
-
- pkhbt lr, r8, r9, lsl #16 ; r9 | r8
- pkhbt r8, r9, r10, lsl #16 ; r10 | r9
-
- ldrb r9, [r0], r2
-
- smuad lr, lr, r4
- pkhbt r10, r10, r11, lsl #16 ; r11 | r10
- smuad r8, r8, r4
- pkhbt r11, r11, r9, lsl #16 ; r9 | r11
-
- smlad lr, r10, r5, lr
- ldrb r10, [r0], r2
- smlad r8, r11, r5, r8
- ldrb r11, [r0]
-
- sub r7, r7, #2
- sub r0, r0, r2, lsl #2
-
- pkhbt r9, r9, r10, lsl #16 ; r10 | r9
- pkhbt r10, r10, r11, lsl #16 ; r11 | r10
-
- smlad lr, r9, r6, lr
- smlad r10, r10, r6, r8
-
- ands r9, r7, #0xff
-
- add lr, lr, #0x40 ; round_shift_and_clamp
- ldrneb r8, [r0], r2 ; load data for next loop
- usat lr, #8, lr, asr #7
- add r10, r10, #0x40
- strb lr, [r1], r12 ; store the result for the column
- usat r10, #8, r10, asr #7
-
- ldrneb r9, [r0], r2
- strb r10, [r1], r12
- ldrneb r10, [r0], r2
-
- bne height_loop_2nd_only_6
-
- ldr r0, [sp]
- ldr r1, [sp, #4]
- subs r7, r7, #0x10000
- add r0, r0, #1 ; move to filter next column
- str r0, [sp]
- add r1, r1, #1
- str r1, [sp, #4]
-
- bne width_loop_2nd_only_6
-
- add sp, sp, #8
-
- ldmia sp!, {r4 - r11, pc}
- ENDP ; |vp9_filter_block2d_second_pass_only_armv6|
-
- END
--- a/vp8/common/arm/armv6/idct_v6.asm
+++ /dev/null
@@ -1,345 +1,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
-; r0 r1 r2 r3 r4 r5 r6 r7 r8 r9 r10 r11 r12 r14
- EXPORT |vp8_short_idct4x4llm_1_v6|
- EXPORT |vp8_short_idct4x4llm_v6|
- EXPORT |vp8_short_idct4x4llm_v6_scott|
- EXPORT |vp8_short_idct4x4llm_v6_dual|
-
- AREA |.text|, CODE, READONLY
-
-;********************************************************************************
-;* void short_idct4x4llm_1_v6(INT16 * input, INT16 * output, INT32 pitch)
-;* r0 INT16 * input
-;* r1 INT16 * output
-;* r2 INT32 pitch
-;* bench: 3/5
-;********************************************************************************
-
-|vp8_short_idct4x4llm_1_v6| PROC ; cycles in out pit
- ;
- ldrsh r0, [r0] ; load input[0] 1, r0 un 2
- add r0, r0, #4 ; 1 +4
- stmdb sp!, {r4, r5, lr} ; make room for wide writes 1 backup
- mov r0, r0, asr #3 ; (input[0] + 4) >> 3 1, r0 req`d ^1 >> 3
- pkhbt r4, r0, r0, lsl #16 ; pack r0 into r4 1, r0 req`d ^1 pack
- mov r5, r4 ; expand expand
-
- strd r4, [r1], r2 ; *output = r0, post inc 1
- strd r4, [r1], r2 ; 1
- strd r4, [r1], r2 ; 1
- strd r4, [r1] ; 1
- ;
- ldmia sp!, {r4, r5, pc} ; replace vars, return restore
- ENDP ; |vp8_short_idct4x4llm_1_v6|
-;********************************************************************************
-;********************************************************************************
-;********************************************************************************
-
-;********************************************************************************
-;* void short_idct4x4llm_v6(INT16 * input, INT16 * output, INT32 pitch)
-;* r0 INT16 * input
-;* r1 INT16 * output
-;* r2 INT32 pitch
-;* bench:
-;********************************************************************************
-
-|vp8_short_idct4x4llm_v6| PROC ; cycles in out pit
- ;
- stmdb sp!, {r4-r11, lr} ; backup registers 1 backup
- ;
- mov r4, #0x00004E00 ; 1 cst
- orr r4, r4, #0x0000007B ; cospi8sqrt2minus1
- mov r5, #0x00008A00 ; 1 cst
- orr r5, r5, #0x0000008C ; sinpi8sqrt2
- ;
- mov r6, #4 ; i=4 1 i
-loop1 ;
- ldrsh r12, [r0, #8] ; input[4] 1, r12 unavail 2 [4]
- ldrsh r3, [r0, #24] ; input[12] 1, r3 unavail 2 [12]
- ldrsh r8, [r0, #16] ; input[8] 1, r8 unavail 2 [8]
- ldrsh r7, [r0], #0x2 ; input[0] 1, r7 unavail 2 ++ [0]
- smulwb r10, r5, r12 ; ([4] * sinpi8sqrt2) >> 16 1, r10 un 2, r12/r5 ^1 t1
- smulwb r11, r4, r3 ; ([12] * cospi8sqrt2minus1) >> 16 1, r11 un 2, r3/r4 ^1 t2
- add r9, r7, r8 ; a1 = [0] + [8] 1 a1
- sub r7, r7, r8 ; b1 = [0] - [8] 1 b1
- add r11, r3, r11 ; temp2 1
- rsb r11, r11, r10 ; c1 = temp1 - temp2 1 c1
- smulwb r3, r5, r3 ; ([12] * sinpi8sqrt2) >> 16 1, r3 un 2, r3/r5 ^ 1 t2
- smulwb r10, r4, r12 ; ([4] * cospi8sqrt2minus1) >> 16 1, r10 un 2, r12/r4 ^1 t1
- add r8, r7, r11 ; b1 + c1 1 b+c
- strh r8, [r1, r2] ; out[pitch] = b1+c1 1
- sub r7, r7, r11 ; b1 - c1 1 b-c
- add r10, r12, r10 ; temp1 1
- add r3, r10, r3 ; d1 = temp1 + temp2 1 d1
- add r10, r9, r3 ; a1 + d1 1 a+d
- sub r3, r9, r3 ; a1 - d1 1 a-d
- add r8, r2, r2 ; pitch * 2 1 p*2
- strh r7, [r1, r8] ; out[pitch*2] = b1-c1 1
- add r7, r2, r2, lsl #1 ; pitch * 3 1 p*3
- strh r3, [r1, r7] ; out[pitch*3] = a1-d1 1
- subs r6, r6, #1 ; i-- 1 --
- strh r10, [r1], #0x2 ; out[0] = a1+d1 1 ++
- bne loop1 ; if i>0, continue
- ;
- sub r1, r1, #8 ; set up out for next loop 1 -4
- ; for this iteration, input=prev output
- mov r6, #4 ; i=4 1 i
-; b returnfull
-loop2 ;
- ldrsh r11, [r1, #2] ; input[1] 1, r11 un 2 [1]
- ldrsh r8, [r1, #6] ; input[3] 1, r8 un 2 [3]
- ldrsh r3, [r1, #4] ; input[2] 1, r3 un 2 [2]
- ldrsh r0, [r1] ; input[0] 1, r0 un 2 [0]
- smulwb r9, r5, r11 ; ([1] * sinpi8sqrt2) >> 16 1, r9 un 2, r5/r11 ^1 t1
- smulwb r10, r4, r8 ; ([3] * cospi8sqrt2minus1) >> 16 1, r10 un 2, r4/r8 ^1 t2
- add r7, r0, r3 ; a1 = [0] + [2] 1 a1
- sub r0, r0, r3 ; b1 = [0] - [2] 1 b1
- add r10, r8, r10 ; temp2 1
- rsb r9, r10, r9 ; c1 = temp1 - temp2 1 c1
- smulwb r8, r5, r8 ; ([3] * sinpi8sqrt2) >> 16 1, r8 un 2, r5/r8 ^1 t2
- smulwb r10, r4, r11 ; ([1] * cospi8sqrt2minus1) >> 16 1, r10 un 2, r4/r11 ^1 t1
- add r3, r0, r9 ; b1+c1 1 b+c
- add r3, r3, #4 ; b1+c1+4 1 +4
- add r10, r11, r10 ; temp1 1
- mov r3, r3, asr #3 ; b1+c1+4 >> 3 1, r3 ^1 >>3
- strh r3, [r1, #2] ; out[1] = b1+c1 1
- add r10, r10, r8 ; d1 = temp1 + temp2 1 d1
- add r3, r7, r10 ; a1+d1 1 a+d
- add r3, r3, #4 ; a1+d1+4 1 +4
- sub r7, r7, r10 ; a1-d1 1 a-d
- add r7, r7, #4 ; a1-d1+4 1 +4
- mov r3, r3, asr #3 ; a1+d1+4 >> 3 1, r3 ^1 >>3
- mov r7, r7, asr #3 ; a1-d1+4 >> 3 1, r7 ^1 >>3
- strh r7, [r1, #6] ; out[3] = a1-d1 1
- sub r0, r0, r9 ; b1-c1 1 b-c
- add r0, r0, #4 ; b1-c1+4 1 +4
- subs r6, r6, #1 ; i-- 1 --
- mov r0, r0, asr #3 ; b1-c1+4 >> 3 1, r0 ^1 >>3
- strh r0, [r1, #4] ; out[2] = b1-c1 1
- strh r3, [r1], r2 ; out[0] = a1+d1 1
-; add r1, r1, r2 ; out += pitch 1 ++
- bne loop2 ; if i>0, continue
-returnfull ;
- ldmia sp!, {r4 - r11, pc} ; replace vars, return restore
- ENDP
-
-;********************************************************************************
-;********************************************************************************
-;********************************************************************************
-
-;********************************************************************************
-;* void short_idct4x4llm_v6_scott(INT16 * input, INT16 * output, INT32 pitch)
-;* r0 INT16 * input
-;* r1 INT16 * output
-;* r2 INT32 pitch
-;* bench:
-;********************************************************************************
-
-|vp8_short_idct4x4llm_v6_scott| PROC ; cycles in out pit
-; mov r0, #0 ;
-; ldr r0, [r0] ;
- stmdb sp!, {r4 - r11, lr} ; backup registers 1 backup
- ;
- mov r3, #0x00004E00 ; cos
- orr r3, r3, #0x0000007B ; cospi8sqrt2minus1
- mov r4, #0x00008A00 ; sin
- orr r4, r4, #0x0000008C ; sinpi8sqrt2
- ;
- mov r5, #0x2 ; i i
- ;
-short_idct4x4llm_v6_scott_loop1 ;
- ldr r10, [r0, #(4*2)] ; i5 | i4 5,4
- ldr r11, [r0, #(12*2)] ; i13 | i12 13,12
- ;
- smulwb r6, r4, r10 ; ((ip[4] * sinpi8sqrt2) >> 16) lt1
- smulwb r7, r3, r11 ; ((ip[12] * cospi8sqrt2minus1) >> 16) lt2
- ;
- smulwb r12, r3, r10 ; ((ip[4] * cospi8sqrt2misu1) >> 16) l2t2
- smulwb r14, r4, r11 ; ((ip[12] * sinpi8sqrt2) >> 16) l2t1
- ;
- add r6, r6, r7 ; partial c1 lt1-lt2
- add r12, r12, r14 ; partial d1 l2t2+l2t1
- ;
- smulwt r14, r4, r10 ; ((ip[5] * sinpi8sqrt2) >> 16) ht1
- smulwt r7, r3, r11 ; ((ip[13] * cospi8sqrt2minus1) >> 16) ht2
- ;
- smulwt r8, r3, r10 ; ((ip[5] * cospi8sqrt2minus1) >> 16) h2t1
- smulwt r9, r4, r11 ; ((ip[13] * sinpi8sqrt2) >> 16) h2t2
- ;
- add r7, r14, r7 ; partial c1_2 ht1+ht2
- sub r8, r8, r9 ; partial d1_2 h2t1-h2t2
- ;
- pkhbt r6, r6, r7, lsl #16 ; partial c1_2 | partial c1_1 pack
- pkhbt r12, r12, r8, lsl #16 ; partial d1_2 | partial d1_1 pack
- ;
- usub16 r6, r6, r10 ; c1_2 | c1_1 c
- uadd16 r12, r12, r11 ; d1_2 | d1_1 d
- ;
- ldr r10, [r0, #0] ; i1 | i0 1,0
- ldr r11, [r0, #(8*2)] ; i9 | i10 9,10
- ;
-;;;;;; add r0, r0, #0x4 ; +4
-;;;;;; add r1, r1, #0x4 ; +4
- ;
- uadd16 r8, r10, r11 ; i1 + i9 | i0 + i8 aka a1 a
- usub16 r9, r10, r11 ; i1 - i9 | i0 - i8 aka b1 b
- ;
- uadd16 r7, r8, r12 ; a1 + d1 pair a+d
- usub16 r14, r8, r12 ; a1 - d1 pair a-d
- ;
- str r7, [r1] ; op[0] = a1 + d1
- str r14, [r1, r2] ; op[pitch*3] = a1 - d1
- ;
- add r0, r0, #0x4 ; op[pitch] = b1 + c1 ++
- add r1, r1, #0x4 ; op[pitch*2] = b1 - c1 ++
- ;
- subs r5, r5, #0x1 ; --
- bne short_idct4x4llm_v6_scott_loop1 ;
- ;
- sub r1, r1, #16 ; reset output ptr
- mov r5, #0x4 ;
- mov r0, r1 ; input = output
- ;
-short_idct4x4llm_v6_scott_loop2 ;
- ;
- subs r5, r5, #0x1 ;
- bne short_idct4x4llm_v6_scott_loop2 ;
- ;
- ldmia sp!, {r4 - r11, pc} ;
- ENDP ;
- ;
-;********************************************************************************
-;********************************************************************************
-;********************************************************************************
-
-;********************************************************************************
-;* void short_idct4x4llm_v6_dual(INT16 * input, INT16 * output, INT32 pitch)
-;* r0 INT16 * input
-;* r1 INT16 * output
-;* r2 INT32 pitch
-;* bench:
-;********************************************************************************
-
-|vp8_short_idct4x4llm_v6_dual| PROC ; cycles in out pit
- ;
- stmdb sp!, {r4-r11, lr} ; backup registers 1 backup
- mov r3, #0x00004E00 ; cos
- orr r3, r3, #0x0000007B ; cospi8sqrt2minus1
- mov r4, #0x00008A00 ; sin
- orr r4, r4, #0x0000008C ; sinpi8sqrt2
- mov r5, #0x2 ; i=2 i
-loop1_dual
- ldr r6, [r0, #(4*2)] ; i5 | i4 5|4
- ldr r12, [r0, #(12*2)] ; i13 | i12 13|12
- ldr r14, [r0, #(8*2)] ; i9 | i8 9|8
-
- smulwt r9, r3, r6 ; (ip[5] * cospi8sqrt2minus1) >> 16 5c
- smulwb r7, r3, r6 ; (ip[4] * cospi8sqrt2minus1) >> 16 4c
- smulwt r10, r4, r6 ; (ip[5] * sinpi8sqrt2) >> 16 5s
- smulwb r8, r4, r6 ; (ip[4] * sinpi8sqrt2) >> 16 4s
- pkhbt r7, r7, r9, lsl #16 ; 5c | 4c
- smulwt r11, r3, r12 ; (ip[13] * cospi8sqrt2minus1) >> 16 13c
- pkhbt r8, r8, r10, lsl #16 ; 5s | 4s
- uadd16 r6, r6, r7 ; 5c+5 | 4c+4
- smulwt r7, r4, r12 ; (ip[13] * sinpi8sqrt2) >> 16 13s
- smulwb r9, r3, r12 ; (ip[12] * cospi8sqrt2minus1) >> 16 12c
- smulwb r10, r4, r12 ; (ip[12] * sinpi8sqrt2) >> 16 12s
- subs r5, r5, #0x1 ; i-- --
- pkhbt r9, r9, r11, lsl #16 ; 13c | 12c
- ldr r11, [r0], #0x4 ; i1 | i0 ++ 1|0
- pkhbt r10, r10, r7, lsl #16 ; 13s | 12s
- uadd16 r7, r12, r9 ; 13c+13 | 12c+12
- usub16 r7, r8, r7 ; c c
- uadd16 r6, r6, r10 ; d d
- uadd16 r10, r11, r14 ; a a
- usub16 r8, r11, r14 ; b b
- uadd16 r9, r10, r6 ; a+d a+d
- usub16 r10, r10, r6 ; a-d a-d
- uadd16 r6, r8, r7 ; b+c b+c
- usub16 r7, r8, r7 ; b-c b-c
- str r6, [r1, r2] ; o5 | o4
- add r6, r2, r2 ; pitch * 2 p2
- str r7, [r1, r6] ; o9 | o8
- add r6, r6, r2 ; pitch * 3 p3
- str r10, [r1, r6] ; o13 | o12
- str r9, [r1], #0x4 ; o1 | o0 ++
- bne loop1_dual ;
- mov r5, #0x2 ; i=2 i
- sub r0, r1, #8 ; reset input/output i/o
-loop2_dual
- ldr r6, [r0, r2] ; i5 | i4 5|4
- ldr r1, [r0] ; i1 | i0 1|0
- ldr r12, [r0, #0x4] ; i3 | i2 3|2
- add r14, r2, #0x4 ; pitch + 2 p+2
- ldr r14, [r0, r14] ; i7 | i6 7|6
- smulwt r9, r3, r6 ; (ip[5] * cospi8sqrt2minus1) >> 16 5c
- smulwt r7, r3, r1 ; (ip[1] * cospi8sqrt2minus1) >> 16 1c
- smulwt r10, r4, r6 ; (ip[5] * sinpi8sqrt2) >> 16 5s
- smulwt r8, r4, r1 ; (ip[1] * sinpi8sqrt2) >> 16 1s
- pkhbt r11, r6, r1, lsl #16 ; i0 | i4 0|4
- pkhbt r7, r9, r7, lsl #16 ; 1c | 5c
-    pkhbt   r8, r10, r8, lsl #16           ; 1s | 5s = temp1 (c)      tc1
- pkhtb r1, r1, r6, asr #16 ; i1 | i5 1|5
- uadd16 r1, r7, r1 ; 1c+1 | 5c+5 = temp2 (d) td2
- pkhbt r9, r14, r12, lsl #16 ; i2 | i6 2|6
- uadd16 r10, r11, r9 ; a a
- usub16 r9, r11, r9 ; b b
- pkhtb r6, r12, r14, asr #16 ; i3 | i7 3|7
- subs r5, r5, #0x1 ; i-- --
- smulwt r7, r3, r6 ; (ip[3] * cospi8sqrt2minus1) >> 16 3c
- smulwt r11, r4, r6 ; (ip[3] * sinpi8sqrt2) >> 16 3s
- smulwb r12, r3, r6 ; (ip[7] * cospi8sqrt2minus1) >> 16 7c
- smulwb r14, r4, r6 ; (ip[7] * sinpi8sqrt2) >> 16 7s
-
- pkhbt r7, r12, r7, lsl #16 ; 3c | 7c
- pkhbt r11, r14, r11, lsl #16 ; 3s | 7s = temp1 (d) td1
- uadd16 r6, r7, r6 ; 3c+3 | 7c+7 = temp2 (c) tc2
- usub16 r12, r8, r6 ; c (o1 | o5) c
- uadd16 r6, r11, r1 ; d (o3 | o7) d
- uadd16 r7, r10, r6 ; a+d a+d
- mov r8, #0x4 ; set up 4's 4
- orr r8, r8, #0x40000 ; 4|4
- usub16 r6, r10, r6 ; a-d a-d
- uadd16 r6, r6, r8 ; a-d+4 3|7
- uadd16 r7, r7, r8 ; a+d+4 0|4
- uadd16 r10, r9, r12 ; b+c b+c
- usub16 r1, r9, r12 ; b-c b-c
- uadd16 r10, r10, r8 ; b+c+4 1|5
- uadd16 r1, r1, r8 ; b-c+4 2|6
- mov r8, r10, asr #19 ; o1 >> 3
- strh r8, [r0, #2] ; o1
- mov r8, r1, asr #19 ; o2 >> 3
- strh r8, [r0, #4] ; o2
- mov r8, r6, asr #19 ; o3 >> 3
- strh r8, [r0, #6] ; o3
- mov r8, r7, asr #19 ; o0 >> 3
- strh r8, [r0], r2 ; o0 +p
- sxth r10, r10 ;
- mov r8, r10, asr #3 ; o5 >> 3
- strh r8, [r0, #2] ; o5
- sxth r1, r1 ;
- mov r8, r1, asr #3 ; o6 >> 3
- strh r8, [r0, #4] ; o6
- sxth r6, r6 ;
- mov r8, r6, asr #3 ; o7 >> 3
- strh r8, [r0, #6] ; o7
- sxth r7, r7 ;
- mov r8, r7, asr #3 ; o4 >> 3
- strh r8, [r0], r2 ; o4 +p
-;;;;; subs r5, r5, #0x1 ; i-- --
- bne loop2_dual ;
- ;
- ldmia sp!, {r4 - r11, pc} ; replace vars, return restore
- ENDP
-
- END
--- a/vp8/common/arm/armv6/iwalsh_v6.asm
+++ /dev/null
@@ -1,152 +1,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
- EXPORT |vp8_short_inv_walsh4x4_v6|
- EXPORT |vp8_short_inv_walsh4x4_1_v6|
-
- ARM
- REQUIRE8
- PRESERVE8
-
- AREA |.text|, CODE, READONLY ; name this block of code
-
-;short vp8_short_inv_walsh4x4_v6(short *input, short *output)
-|vp8_short_inv_walsh4x4_v6| PROC
-
- stmdb sp!, {r4 - r11, lr}
-
- ldr r2, [r0], #4 ; [1 | 0]
- ldr r3, [r0], #4 ; [3 | 2]
- ldr r4, [r0], #4 ; [5 | 4]
- ldr r5, [r0], #4 ; [7 | 6]
- ldr r6, [r0], #4 ; [9 | 8]
- ldr r7, [r0], #4 ; [11 | 10]
- ldr r8, [r0], #4 ; [13 | 12]
- ldr r9, [r0] ; [15 | 14]
-
- qadd16 r10, r2, r8 ; a1 [1+13 | 0+12]
- qadd16 r11, r4, r6 ; b1 [5+9 | 4+8]
- qsub16 r12, r4, r6 ; c1 [5-9 | 4-8]
- qsub16 lr, r2, r8 ; d1 [1-13 | 0-12]
-
- qadd16 r2, r10, r11 ; a1 + b1 [1 | 0]
- qadd16 r4, r12, lr ; c1 + d1 [5 | 4]
- qsub16 r6, r10, r11 ; a1 - b1 [9 | 8]
- qsub16 r8, lr, r12 ; d1 - c1 [13 | 12]
-
- qadd16 r10, r3, r9 ; a1 [3+15 | 2+14]
- qadd16 r11, r5, r7 ; b1 [7+11 | 6+10]
- qsub16 r12, r5, r7 ; c1 [7-11 | 6-10]
- qsub16 lr, r3, r9 ; d1 [3-15 | 2-14]
-
- qadd16 r3, r10, r11 ; a1 + b1 [3 | 2]
- qadd16 r5, r12, lr ; c1 + d1 [7 | 6]
- qsub16 r7, r10, r11 ; a1 - b1 [11 | 10]
- qsub16 r9, lr, r12 ; d1 - c1 [15 | 14]
-
- ; first transform complete
-
- qsubaddx r10, r2, r3 ; [c1|a1] [1-2 | 0+3]
- qaddsubx r11, r2, r3 ; [b1|d1] [1+2 | 0-3]
- qsubaddx r12, r4, r5 ; [c1|a1] [5-6 | 4+7]
- qaddsubx lr, r4, r5 ; [b1|d1] [5+6 | 4-7]
-
- qaddsubx r2, r10, r11 ; [b2|c2] [c1+d1 | a1-b1]
- qaddsubx r3, r11, r10 ; [a2|d2] [b1+a1 | d1-c1]
- ldr r10, c0x00030003
- qaddsubx r4, r12, lr ; [b2|c2] [c1+d1 | a1-b1]
- qaddsubx r5, lr, r12 ; [a2|d2] [b1+a1 | d1-c1]
-
- qadd16 r2, r2, r10 ; [b2+3|c2+3]
- qadd16 r3, r3, r10 ; [a2+3|d2+3]
- qadd16 r4, r4, r10 ; [b2+3|c2+3]
- qadd16 r5, r5, r10 ; [a2+3|d2+3]
-
- asr r12, r2, #3 ; [1 | x]
- pkhtb r12, r12, r3, asr #19; [1 | 0]
- lsl lr, r3, #16 ; [~3 | x]
- lsl r2, r2, #16 ; [~2 | x]
- asr lr, lr, #3 ; [3 | x]
- pkhtb lr, lr, r2, asr #19 ; [3 | 2]
-
- asr r2, r4, #3 ; [5 | x]
- pkhtb r2, r2, r5, asr #19 ; [5 | 4]
- lsl r3, r5, #16 ; [~7 | x]
- lsl r4, r4, #16 ; [~6 | x]
- asr r3, r3, #3 ; [7 | x]
- pkhtb r3, r3, r4, asr #19 ; [7 | 6]
-
- str r12, [r1], #4
- str lr, [r1], #4
- str r2, [r1], #4
- str r3, [r1], #4
-
- qsubaddx r2, r6, r7 ; [c1|a1] [9-10 | 8+11]
- qaddsubx r3, r6, r7 ; [b1|d1] [9+10 | 8-11]
- qsubaddx r4, r8, r9 ; [c1|a1] [13-14 | 12+15]
- qaddsubx r5, r8, r9 ; [b1|d1] [13+14 | 12-15]
-
- qaddsubx r6, r2, r3 ; [b2|c2] [c1+d1 | a1-b1]
- qaddsubx r7, r3, r2 ; [a2|d2] [b1+a1 | d1-c1]
- qaddsubx r8, r4, r5 ; [b2|c2] [c1+d1 | a1-b1]
- qaddsubx r9, r5, r4 ; [a2|d2] [b1+a1 | d1-c1]
-
- qadd16 r6, r6, r10 ; [b2+3|c2+3]
- qadd16 r7, r7, r10 ; [a2+3|d2+3]
- qadd16 r8, r8, r10 ; [b2+3|c2+3]
- qadd16 r9, r9, r10 ; [a2+3|d2+3]
-
- asr r2, r6, #3 ; [9 | x]
- pkhtb r2, r2, r7, asr #19 ; [9 | 8]
- lsl r3, r7, #16 ; [~11| x]
- lsl r4, r6, #16 ; [~10| x]
- asr r3, r3, #3 ; [11 | x]
- pkhtb r3, r3, r4, asr #19 ; [11 | 10]
-
- asr r4, r8, #3 ; [13 | x]
- pkhtb r4, r4, r9, asr #19 ; [13 | 12]
- lsl r5, r9, #16 ; [~15| x]
- lsl r6, r8, #16 ; [~14| x]
- asr r5, r5, #3 ; [15 | x]
- pkhtb r5, r5, r6, asr #19 ; [15 | 14]
-
- str r2, [r1], #4
- str r3, [r1], #4
- str r4, [r1], #4
- str r5, [r1]
-
- ldmia sp!, {r4 - r11, pc}
- ENDP ; |vp8_short_inv_walsh4x4_v6|
-
-
-;short vp8_short_inv_walsh4x4_1_v6(short *input, short *output)
-|vp8_short_inv_walsh4x4_1_v6| PROC
-
- ldrsh r2, [r0] ; [0]
- add r2, r2, #3 ; [0] + 3
- asr r2, r2, #3 ; a1 ([0]+3) >> 3
- lsl r2, r2, #16 ; [a1 | x]
- orr r2, r2, r2, lsr #16 ; [a1 | a1]
-
- str r2, [r1], #4
- str r2, [r1], #4
- str r2, [r1], #4
- str r2, [r1], #4
- str r2, [r1], #4
- str r2, [r1], #4
- str r2, [r1], #4
- str r2, [r1]
-
- bx lr
- ENDP ; |vp8_short_inv_walsh4x4_1_v6|
-
-; Constant Pool
-c0x00030003 DCD 0x00030003
- END
--- a/vp8/common/arm/armv6/loopfilter_v6.asm
+++ /dev/null
@@ -1,1282 +1,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
- EXPORT |vp9_loop_filter_horizontal_edge_armv6|
- EXPORT |vp9_mbloop_filter_horizontal_edge_armv6|
- EXPORT |vp9_loop_filter_vertical_edge_armv6|
- EXPORT |vp9_mbloop_filter_vertical_edge_armv6|
-
- AREA |.text|, CODE, READONLY ; name this block of code
-
- MACRO
- TRANSPOSE_MATRIX $a0, $a1, $a2, $a3, $b0, $b1, $b2, $b3
- ; input: $a0, $a1, $a2, $a3; output: $b0, $b1, $b2, $b3
- ; a0: 03 02 01 00
- ; a1: 13 12 11 10
- ; a2: 23 22 21 20
- ; a3: 33 32 31 30
- ; b3 b2 b1 b0
-
- uxtb16 $b1, $a1 ; xx 12 xx 10
- uxtb16 $b0, $a0 ; xx 02 xx 00
- uxtb16 $b3, $a3 ; xx 32 xx 30
- uxtb16 $b2, $a2 ; xx 22 xx 20
- orr $b1, $b0, $b1, lsl #8 ; 12 02 10 00
- orr $b3, $b2, $b3, lsl #8 ; 32 22 30 20
-
- uxtb16 $a1, $a1, ror #8 ; xx 13 xx 11
- uxtb16 $a3, $a3, ror #8 ; xx 33 xx 31
- uxtb16 $a0, $a0, ror #8 ; xx 03 xx 01
- uxtb16 $a2, $a2, ror #8 ; xx 23 xx 21
- orr $a0, $a0, $a1, lsl #8 ; 13 03 11 01
- orr $a2, $a2, $a3, lsl #8 ; 33 23 31 21
-
- pkhtb $b2, $b3, $b1, asr #16 ; 32 22 12 02 -- p1
- pkhbt $b0, $b1, $b3, lsl #16 ; 30 20 10 00 -- p3
-
- pkhtb $b3, $a2, $a0, asr #16 ; 33 23 13 03 -- p0
- pkhbt $b1, $a0, $a2, lsl #16 ; 31 21 11 01 -- p2
- MEND
-
-
-src RN r0
-pstep RN r1
-count RN r5
-
-;r0 unsigned char *src_ptr,
-;r1 int src_pixel_step,
-;r2 const char *blimit,
-;r3 const char *limit,
-;stack const char *thresh,
-;stack int count
-
-;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
-|vp9_loop_filter_horizontal_edge_armv6| PROC
-;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
- stmdb sp!, {r4 - r11, lr}
-
- sub src, src, pstep, lsl #2 ; move src pointer down by 4 lines
- ldr count, [sp, #40] ; count for 8-in-parallel
- ldr r6, [sp, #36] ; load thresh address
- sub sp, sp, #16 ; create temp buffer
-
- ldr r9, [src], pstep ; p3
- ldrb r4, [r2] ; blimit
- ldr r10, [src], pstep ; p2
- ldrb r2, [r3] ; limit
- ldr r11, [src], pstep ; p1
- orr r4, r4, r4, lsl #8
- ldrb r3, [r6] ; thresh
- orr r2, r2, r2, lsl #8
- mov count, count, lsl #1 ; 4-in-parallel
- orr r4, r4, r4, lsl #16
- orr r3, r3, r3, lsl #8
- orr r2, r2, r2, lsl #16
- orr r3, r3, r3, lsl #16
-
-|Hnext8|
- ; vp9_filter_mask() function
- ; calculate breakout conditions
- ldr r12, [src], pstep ; p0
-
- uqsub8 r6, r9, r10 ; p3 - p2
- uqsub8 r7, r10, r9 ; p2 - p3
- uqsub8 r8, r10, r11 ; p2 - p1
- uqsub8 r10, r11, r10 ; p1 - p2
-
- orr r6, r6, r7 ; abs (p3-p2)
- orr r8, r8, r10 ; abs (p2-p1)
- uqsub8 lr, r6, r2 ; compare to limit. lr: vp9_filter_mask
- uqsub8 r8, r8, r2 ; compare to limit
- uqsub8 r6, r11, r12 ; p1 - p0
- orr lr, lr, r8
- uqsub8 r7, r12, r11 ; p0 - p1
- ldr r9, [src], pstep ; q0
- ldr r10, [src], pstep ; q1
- orr r6, r6, r7 ; abs (p1-p0)
- uqsub8 r7, r6, r2 ; compare to limit
- uqsub8 r8, r6, r3 ; compare to thresh -- save r8 for later
- orr lr, lr, r7
-
- uqsub8 r6, r11, r10 ; p1 - q1
- uqsub8 r7, r10, r11 ; q1 - p1
- uqsub8 r11, r12, r9 ; p0 - q0
- uqsub8 r12, r9, r12 ; q0 - p0
- orr r6, r6, r7 ; abs (p1-q1)
- ldr r7, c0x7F7F7F7F
- orr r12, r11, r12 ; abs (p0-q0)
- ldr r11, [src], pstep ; q2
- uqadd8 r12, r12, r12 ; abs (p0-q0) * 2
- and r6, r7, r6, lsr #1 ; abs (p1-q1) / 2
- uqsub8 r7, r9, r10 ; q0 - q1
- uqadd8 r12, r12, r6 ; abs (p0-q0)*2 + abs (p1-q1)/2
- uqsub8 r6, r10, r9 ; q1 - q0
- uqsub8 r12, r12, r4 ; compare to flimit
- uqsub8 r9, r11, r10 ; q2 - q1
-
- orr lr, lr, r12
-
- ldr r12, [src], pstep ; q3
- uqsub8 r10, r10, r11 ; q1 - q2
- orr r6, r7, r6 ; abs (q1-q0)
- orr r10, r9, r10 ; abs (q2-q1)
- uqsub8 r7, r6, r2 ; compare to limit
- uqsub8 r10, r10, r2 ; compare to limit
- uqsub8 r6, r6, r3 ; compare to thresh -- save r6 for later
- orr lr, lr, r7
- orr lr, lr, r10
-
- uqsub8 r10, r12, r11 ; q3 - q2
- uqsub8 r9, r11, r12 ; q2 - q3
-
- mvn r11, #0 ; r11 == -1
-
- orr r10, r10, r9 ; abs (q3-q2)
- uqsub8 r10, r10, r2 ; compare to limit
-
- mov r12, #0
- orr lr, lr, r10
- sub src, src, pstep, lsl #2
-
- usub8 lr, r12, lr ; use usub8 instead of ssub8
- sel lr, r11, r12 ; filter mask: lr
-
- cmp lr, #0
- beq hskip_filter ; skip filtering
-
- sub src, src, pstep, lsl #1 ; move src pointer down by 6 lines
-
- ;vp8_hevmask() function
- ;calculate high edge variance
- orr r10, r6, r8 ; calculate vp8_hevmask
-
- ldr r7, [src], pstep ; p1
-
- usub8 r10, r12, r10 ; use usub8 instead of ssub8
- sel r6, r12, r11 ; obtain vp8_hevmask: r6
-
- ;vp9_filter() function
- ldr r8, [src], pstep ; p0
- ldr r12, c0x80808080
- ldr r9, [src], pstep ; q0
- ldr r10, [src], pstep ; q1
-
- eor r7, r7, r12 ; p1 offset to convert to a signed value
- eor r8, r8, r12 ; p0 offset to convert to a signed value
- eor r9, r9, r12 ; q0 offset to convert to a signed value
- eor r10, r10, r12 ; q1 offset to convert to a signed value
-
- str r9, [sp] ; store qs0 temporarily
- str r8, [sp, #4] ; store ps0 temporarily
- str r10, [sp, #8] ; store qs1 temporarily
- str r7, [sp, #12] ; store ps1 temporarily
-
- qsub8 r7, r7, r10 ; vp9_signed_char_clamp(ps1-qs1)
- qsub8 r8, r9, r8 ; vp9_signed_char_clamp(vp9_filter + 3 * ( qs0 - ps0))
-
- and r7, r7, r6 ; vp9_filter (r7) &= hev
-
- qadd8 r7, r7, r8
- ldr r9, c0x03030303 ; r9 = 3 --modified for vp8
-
- qadd8 r7, r7, r8
- ldr r10, c0x04040404
-
- qadd8 r7, r7, r8
- and r7, r7, lr ; vp9_filter &= mask;
-
- ;modify code for vp8 -- Filter1 = vp9_filter (r7)
- qadd8 r8 , r7 , r9 ; Filter2 (r8) = vp9_signed_char_clamp(vp9_filter+3)
- qadd8 r7 , r7 , r10 ; vp9_filter = vp9_signed_char_clamp(vp9_filter+4)
-
- mov r9, #0
- shadd8 r8 , r8 , r9 ; Filter2 >>= 3
- shadd8 r7 , r7 , r9 ; vp9_filter >>= 3
- shadd8 r8 , r8 , r9
- shadd8 r7 , r7 , r9
- shadd8 lr , r8 , r9 ; lr: Filter2
- shadd8 r7 , r7 , r9 ; r7: filter
-
- ;usub8 lr, r8, r10 ; s = (s==4)*-1
- ;sel lr, r11, r9
- ;usub8 r8, r10, r8
- ;sel r8, r11, r9
- ;and r8, r8, lr ; -1 for each element that equals 4
-
- ;calculate output
- ;qadd8 lr, r8, r7 ; u = vp9_signed_char_clamp(s + vp9_filter)
-
- ldr r8, [sp] ; load qs0
- ldr r9, [sp, #4] ; load ps0
-
- ldr r10, c0x01010101
-
- qsub8 r8 ,r8, r7 ; u = vp9_signed_char_clamp(qs0 - vp9_filter)
- qadd8 r9, r9, lr ; u = vp9_signed_char_clamp(ps0 + Filter2)
-
- ;end of modification for vp8
-
- mov lr, #0
- sadd8 r7, r7 , r10 ; vp9_filter += 1
- shadd8 r7, r7, lr ; vp9_filter >>= 1
-
- ldr r11, [sp, #12] ; load ps1
- ldr r10, [sp, #8] ; load qs1
-
- bic r7, r7, r6 ; vp9_filter &= ~hev
- sub src, src, pstep, lsl #2
-
- qadd8 r11, r11, r7 ; u = vp9_signed_char_clamp(ps1 + vp9_filter)
- qsub8 r10, r10,r7 ; u = vp9_signed_char_clamp(qs1 - vp9_filter)
-
- eor r11, r11, r12 ; *op1 = u^0x80
- str r11, [src], pstep ; store op1
- eor r9, r9, r12 ; *op0 = u^0x80
- str r9, [src], pstep ; store op0 result
- eor r8, r8, r12 ; *oq0 = u^0x80
- str r8, [src], pstep ; store oq0 result
- eor r10, r10, r12 ; *oq1 = u^0x80
- str r10, [src], pstep ; store oq1
-
- sub src, src, pstep, lsl #1
-
-|hskip_filter|
- add src, src, #4
- sub src, src, pstep, lsl #2
-
- subs count, count, #1
-
- ldrne r9, [src], pstep ; p3
- ldrne r10, [src], pstep ; p2
- ldrne r11, [src], pstep ; p1
-
- bne Hnext8
-
- add sp, sp, #16
- ldmia sp!, {r4 - r11, pc}
- ENDP ; |vp9_loop_filter_horizontal_edge_armv6|
-
-
-;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
-|vp8_mbloop_filter_horizontal_edge_armv6| PROC
-;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
- stmdb sp!, {r4 - r11, lr}
-
- sub src, src, pstep, lsl #2 ; move src pointer down by 4 lines
- ldr count, [sp, #40] ; count for 8-in-parallel
- ldr r6, [sp, #36] ; load thresh address
- sub sp, sp, #16 ; create temp buffer
-
- ldr r9, [src], pstep ; p3
- ldrb r4, [r2] ; blimit
- ldr r10, [src], pstep ; p2
- ldrb r2, [r3] ; limit
- ldr r11, [src], pstep ; p1
- orr r4, r4, r4, lsl #8
- ldrb r3, [r6] ; thresh
- orr r2, r2, r2, lsl #8
- mov count, count, lsl #1 ; 4-in-parallel
- orr r4, r4, r4, lsl #16
- orr r3, r3, r3, lsl #8
- orr r2, r2, r2, lsl #16
- orr r3, r3, r3, lsl #16
-
-|MBHnext8|
-
- ; vp9_filter_mask() function
- ; calculate breakout conditions
- ldr r12, [src], pstep ; p0
-
- uqsub8 r6, r9, r10 ; p3 - p2
- uqsub8 r7, r10, r9 ; p2 - p3
- uqsub8 r8, r10, r11 ; p2 - p1
- uqsub8 r10, r11, r10 ; p1 - p2
-
- orr r6, r6, r7 ; abs (p3-p2)
- orr r8, r8, r10 ; abs (p2-p1)
- uqsub8 lr, r6, r2 ; compare to limit. lr: vp9_filter_mask
- uqsub8 r8, r8, r2 ; compare to limit
-
- uqsub8 r6, r11, r12 ; p1 - p0
- orr lr, lr, r8
- uqsub8 r7, r12, r11 ; p0 - p1
- ldr r9, [src], pstep ; q0
- ldr r10, [src], pstep ; q1
- orr r6, r6, r7 ; abs (p1-p0)
- uqsub8 r7, r6, r2 ; compare to limit
- uqsub8 r8, r6, r3 ; compare to thresh -- save r8 for later
- orr lr, lr, r7
-
- uqsub8 r6, r11, r10 ; p1 - q1
- uqsub8 r7, r10, r11 ; q1 - p1
- uqsub8 r11, r12, r9 ; p0 - q0
- uqsub8 r12, r9, r12 ; q0 - p0
- orr r6, r6, r7 ; abs (p1-q1)
- ldr r7, c0x7F7F7F7F
- orr r12, r11, r12 ; abs (p0-q0)
- ldr r11, [src], pstep ; q2
- uqadd8 r12, r12, r12 ; abs (p0-q0) * 2
- and r6, r7, r6, lsr #1 ; abs (p1-q1) / 2
- uqsub8 r7, r9, r10 ; q0 - q1
- uqadd8 r12, r12, r6 ; abs (p0-q0)*2 + abs (p1-q1)/2
- uqsub8 r6, r10, r9 ; q1 - q0
- uqsub8 r12, r12, r4 ; compare to flimit
- uqsub8 r9, r11, r10 ; q2 - q1
-
- orr lr, lr, r12
-
- ldr r12, [src], pstep ; q3
-
- uqsub8 r10, r10, r11 ; q1 - q2
- orr r6, r7, r6 ; abs (q1-q0)
- orr r10, r9, r10 ; abs (q2-q1)
- uqsub8 r7, r6, r2 ; compare to limit
- uqsub8 r10, r10, r2 ; compare to limit
- uqsub8 r6, r6, r3 ; compare to thresh -- save r6 for later
- orr lr, lr, r7
- orr lr, lr, r10
-
- uqsub8 r10, r12, r11 ; q3 - q2
- uqsub8 r9, r11, r12 ; q2 - q3
-
- mvn r11, #0 ; r11 == -1
-
- orr r10, r10, r9 ; abs (q3-q2)
- uqsub8 r10, r10, r2 ; compare to limit
-
- mov r12, #0
-
- orr lr, lr, r10
-
- usub8 lr, r12, lr ; use usub8 instead of ssub8
- sel lr, r11, r12 ; filter mask: lr
-
- cmp lr, #0
- beq mbhskip_filter ; skip filtering
-
- ;vp8_hevmask() function
- ;calculate high edge variance
- sub src, src, pstep, lsl #2 ; move src pointer down by 6 lines
- sub src, src, pstep, lsl #1
-
- orr r10, r6, r8
- ldr r7, [src], pstep ; p1
-
- usub8 r10, r12, r10
- sel r6, r12, r11 ; hev mask: r6
-
- ;vp8_mbfilter() function
- ;p2, q2 are only needed at the end. Don't need to load them in now.
- ldr r8, [src], pstep ; p0
- ldr r12, c0x80808080
- ldr r9, [src], pstep ; q0
- ldr r10, [src] ; q1
-
- eor r7, r7, r12 ; ps1
- eor r8, r8, r12 ; ps0
- eor r9, r9, r12 ; qs0
- eor r10, r10, r12 ; qs1
-
- qsub8 r12, r9, r8 ; vp9_signed_char_clamp(vp9_filter + 3 * ( qs0 - ps0))
- str r7, [sp, #12] ; store ps1 temporarily
- qsub8 r7, r7, r10 ; vp9_signed_char_clamp(ps1-qs1)
- str r10, [sp, #8] ; store qs1 temporarily
- qadd8 r7, r7, r12
- str r9, [sp] ; store qs0 temporarily
- qadd8 r7, r7, r12
- str r8, [sp, #4] ; store ps0 temporarily
- qadd8 r7, r7, r12 ; vp9_filter: r7
-
- ldr r10, c0x03030303 ; r10 = 3 --modified for vp8
- ldr r9, c0x04040404
-
- and r7, r7, lr ; vp9_filter &= mask (lr is free)
-
- mov r12, r7 ; Filter2: r12
- and r12, r12, r6 ; Filter2 &= hev
-
- ;modify code for vp8
- ;save bottom 3 bits so that we round one side +4 and the other +3
- qadd8 r8 , r12 , r9 ; Filter1 (r8) = vp9_signed_char_clamp(Filter2+4)
- qadd8 r12 , r12 , r10 ; Filter2 (r12) = vp9_signed_char_clamp(Filter2+3)
-
- mov r10, #0
- shadd8 r8 , r8 , r10 ; Filter1 >>= 3
- shadd8 r12 , r12 , r10 ; Filter2 >>= 3
- shadd8 r8 , r8 , r10
- shadd8 r12 , r12 , r10
- shadd8 r8 , r8 , r10 ; r8: Filter1
- shadd8 r12 , r12 , r10 ; r12: Filter2
-
- ldr r9, [sp] ; load qs0
- ldr r11, [sp, #4] ; load ps0
-
- qsub8 r9 , r9, r8 ; qs0 = vp9_signed_char_clamp(qs0 - Filter1)
- qadd8 r11, r11, r12 ; ps0 = vp9_signed_char_clamp(ps0 + Filter2)
-
- ;save bottom 3 bits so that we round one side +4 and the other +3
- ;and r8, r12, r10 ; s = Filter2 & 7 (s: r8)
- ;qadd8 r12 , r12 , r9 ; Filter2 = vp9_signed_char_clamp(Filter2+4)
- ;mov r10, #0
- ;shadd8 r12 , r12 , r10 ; Filter2 >>= 3
- ;usub8 lr, r8, r9 ; s = (s==4)*-1
- ;sel lr, r11, r10
- ;shadd8 r12 , r12 , r10
- ;usub8 r8, r9, r8
- ;sel r8, r11, r10
- ;ldr r9, [sp] ; load qs0
- ;ldr r11, [sp, #4] ; load ps0
- ;shadd8 r12 , r12 , r10
- ;and r8, r8, lr ; -1 for each element that equals 4
- ;qadd8 r10, r8, r12 ; u = vp9_signed_char_clamp(s + Filter2)
- ;qsub8 r9 , r9, r12 ; qs0 = vp9_signed_char_clamp(qs0 - Filter2)
- ;qadd8 r11, r11, r10 ; ps0 = vp9_signed_char_clamp(ps0 + u)
-
- ;end of modification for vp8
-
- bic r12, r7, r6 ; vp9_filter &= ~hev ( r6 is free)
- ;mov r12, r7
-
- ;roughly 3/7th difference across boundary
- mov lr, #0x1b ; 27
- mov r7, #0x3f ; 63
-
- sxtb16 r6, r12
- sxtb16 r10, r12, ror #8
- smlabb r8, r6, lr, r7
- smlatb r6, r6, lr, r7
- smlabb r7, r10, lr, r7
- smultb r10, r10, lr
- ssat r8, #8, r8, asr #7
- ssat r6, #8, r6, asr #7
- add r10, r10, #63
- ssat r7, #8, r7, asr #7
- ssat r10, #8, r10, asr #7
-
- ldr lr, c0x80808080
-
- pkhbt r6, r8, r6, lsl #16
- pkhbt r10, r7, r10, lsl #16
- uxtb16 r6, r6
- uxtb16 r10, r10
-
- sub src, src, pstep
-
- orr r10, r6, r10, lsl #8 ; u = vp9_signed_char_clamp((63 + Filter2 * 27)>>7)
-
- qsub8 r8, r9, r10 ; s = vp9_signed_char_clamp(qs0 - u)
- qadd8 r10, r11, r10 ; s = vp9_signed_char_clamp(ps0 + u)
- eor r8, r8, lr ; *oq0 = s^0x80
- str r8, [src] ; store *oq0
- sub src, src, pstep
- eor r10, r10, lr ; *op0 = s^0x80
- str r10, [src] ; store *op0
-
- ;roughly 2/7th difference across boundary
- mov lr, #0x12 ; 18
- mov r7, #0x3f ; 63
-
- sxtb16 r6, r12
- sxtb16 r10, r12, ror #8
- smlabb r8, r6, lr, r7
- smlatb r6, r6, lr, r7
- smlabb r9, r10, lr, r7
- smlatb r10, r10, lr, r7
- ssat r8, #8, r8, asr #7
- ssat r6, #8, r6, asr #7
- ssat r9, #8, r9, asr #7
- ssat r10, #8, r10, asr #7
-
- ldr lr, c0x80808080
-
- pkhbt r6, r8, r6, lsl #16
- pkhbt r10, r9, r10, lsl #16
-
- ldr r9, [sp, #8] ; load qs1
- ldr r11, [sp, #12] ; load ps1
-
- uxtb16 r6, r6
- uxtb16 r10, r10
-
- sub src, src, pstep
-
- orr r10, r6, r10, lsl #8 ; u = vp9_signed_char_clamp((63 + Filter2 * 18)>>7)
-
- qadd8 r11, r11, r10 ; s = vp9_signed_char_clamp(ps1 + u)
- qsub8 r8, r9, r10 ; s = vp9_signed_char_clamp(qs1 - u)
- eor r11, r11, lr ; *op1 = s^0x80
- str r11, [src], pstep ; store *op1
- eor r8, r8, lr ; *oq1 = s^0x80
- add src, src, pstep, lsl #1
-
- mov r7, #0x3f ; 63
-
- str r8, [src], pstep ; store *oq1
-
- ;roughly 1/7th difference across boundary
- mov lr, #0x9 ; 9
- ldr r9, [src] ; load q2
-
- sxtb16 r6, r12
- sxtb16 r10, r12, ror #8
- smlabb r8, r6, lr, r7
- smlatb r6, r6, lr, r7
- smlabb r12, r10, lr, r7
- smlatb r10, r10, lr, r7
- ssat r8, #8, r8, asr #7
- ssat r6, #8, r6, asr #7
- ssat r12, #8, r12, asr #7
- ssat r10, #8, r10, asr #7
-
- sub src, src, pstep, lsl #2
-
- pkhbt r6, r8, r6, lsl #16
- pkhbt r10, r12, r10, lsl #16
-
- sub src, src, pstep
- ldr lr, c0x80808080
-
- ldr r11, [src] ; load p2
-
- uxtb16 r6, r6
- uxtb16 r10, r10
-
- eor r9, r9, lr
- eor r11, r11, lr
-
- orr r10, r6, r10, lsl #8 ; u = vp9_signed_char_clamp((63 + Filter2 * 9)>>7)
-
- qadd8 r8, r11, r10 ; s = vp9_signed_char_clamp(ps2 + u)
- qsub8 r10, r9, r10 ; s = vp9_signed_char_clamp(qs2 - u)
- eor r8, r8, lr ; *op2 = s^0x80
- str r8, [src], pstep, lsl #2 ; store *op2
- add src, src, pstep
- eor r10, r10, lr ; *oq2 = s^0x80
- str r10, [src], pstep, lsl #1 ; store *oq2
-
-|mbhskip_filter|
- add src, src, #4
- sub src, src, pstep, lsl #3
- subs count, count, #1
-
- ldrne r9, [src], pstep ; p3
- ldrne r10, [src], pstep ; p2
- ldrne r11, [src], pstep ; p1
-
- bne MBHnext8
-
- add sp, sp, #16
- ldmia sp!, {r4 - r11, pc}
- ENDP ; |vp8_mbloop_filter_horizontal_edge_armv6|
-
-
-;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
-|vp9_loop_filter_vertical_edge_armv6| PROC
-;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
- stmdb sp!, {r4 - r11, lr}
-
- sub src, src, #4 ; move src pointer down by 4
- ldr count, [sp, #40] ; count for 8-in-parallel
- ldr r12, [sp, #36] ; load thresh address
- sub sp, sp, #16 ; create temp buffer
-
- ldr r6, [src], pstep ; load source data
- ldrb r4, [r2] ; blimit
- ldr r7, [src], pstep
- ldrb r2, [r3] ; limit
- ldr r8, [src], pstep
- orr r4, r4, r4, lsl #8
- ldrb r3, [r12] ; thresh
- orr r2, r2, r2, lsl #8
- ldr lr, [src], pstep
- mov count, count, lsl #1 ; 4-in-parallel
- orr r4, r4, r4, lsl #16
- orr r3, r3, r3, lsl #8
- orr r2, r2, r2, lsl #16
- orr r3, r3, r3, lsl #16
-
-|Vnext8|
-
- ; vp9_filter_mask() function
- ; calculate breakout conditions
- ; transpose the source data for 4-in-parallel operation
- TRANSPOSE_MATRIX r6, r7, r8, lr, r9, r10, r11, r12
-
- uqsub8 r7, r9, r10 ; p3 - p2
- uqsub8 r8, r10, r9 ; p2 - p3
- uqsub8 r9, r10, r11 ; p2 - p1
- uqsub8 r10, r11, r10 ; p1 - p2
- orr r7, r7, r8 ; abs (p3-p2)
- orr r10, r9, r10 ; abs (p2-p1)
- uqsub8 lr, r7, r2 ; compare to limit. lr: vp9_filter_mask
- uqsub8 r10, r10, r2 ; compare to limit
-
- sub src, src, pstep, lsl #2 ; move src pointer down by 4 lines
-
- orr lr, lr, r10
-
- uqsub8 r6, r11, r12 ; p1 - p0
- uqsub8 r7, r12, r11 ; p0 - p1
- add src, src, #4 ; move src pointer up by 4
- orr r6, r6, r7 ; abs (p1-p0)
- str r11, [sp, #12] ; save p1
- uqsub8 r10, r6, r2 ; compare to limit
- uqsub8 r11, r6, r3 ; compare to thresh
- orr lr, lr, r10
-
- ; transpose uses 8 regs(r6 - r12 and lr). Need to save reg value now
- ; transpose the source data for 4-in-parallel operation
- ldr r6, [src], pstep ; load source data
- str r11, [sp] ; push r11 to stack
- ldr r7, [src], pstep
- str r12, [sp, #4] ; save current reg before load q0 - q3 data
- ldr r8, [src], pstep
- str lr, [sp, #8]
- ldr lr, [src], pstep
-
- TRANSPOSE_MATRIX r6, r7, r8, lr, r9, r10, r11, r12
-
- ldr lr, [sp, #8] ; load back (f)limit accumulator
-
- uqsub8 r6, r12, r11 ; q3 - q2
- uqsub8 r7, r11, r12 ; q2 - q3
- uqsub8 r12, r11, r10 ; q2 - q1
- uqsub8 r11, r10, r11 ; q1 - q2
- orr r6, r6, r7 ; abs (q3-q2)
- orr r7, r12, r11 ; abs (q2-q1)
- uqsub8 r6, r6, r2 ; compare to limit
- uqsub8 r7, r7, r2 ; compare to limit
- ldr r11, [sp, #4] ; load back p0
- ldr r12, [sp, #12] ; load back p1
- orr lr, lr, r6
- orr lr, lr, r7
-
- uqsub8 r6, r11, r9 ; p0 - q0
- uqsub8 r7, r9, r11 ; q0 - p0
- uqsub8 r8, r12, r10 ; p1 - q1
- uqsub8 r11, r10, r12 ; q1 - p1
- orr r6, r6, r7 ; abs (p0-q0)
- ldr r7, c0x7F7F7F7F
- orr r8, r8, r11 ; abs (p1-q1)
- uqadd8 r6, r6, r6 ; abs (p0-q0) * 2
- and r8, r7, r8, lsr #1 ; abs (p1-q1) / 2
- uqsub8 r11, r10, r9 ; q1 - q0
- uqadd8 r6, r8, r6 ; abs (p0-q0)*2 + abs (p1-q1)/2
- uqsub8 r12, r9, r10 ; q0 - q1
- uqsub8 r6, r6, r4 ; compare to flimit
-
- orr r9, r11, r12 ; abs (q1-q0)
- uqsub8 r8, r9, r2 ; compare to limit
- uqsub8 r10, r9, r3 ; compare to thresh
- orr lr, lr, r6
- orr lr, lr, r8
-
- mvn r11, #0 ; r11 == -1
- mov r12, #0
-
- usub8 lr, r12, lr
- ldr r9, [sp] ; load the compared result
- sel lr, r11, r12 ; filter mask: lr
-
- cmp lr, #0
- beq vskip_filter ; skip filtering
-
- ;vp8_hevmask() function
- ;calculate high edge variance
-
- sub src, src, pstep, lsl #2 ; move src pointer down by 4 lines
-
- orr r9, r9, r10
-
- ldrh r7, [src, #-2]
- ldrh r8, [src], pstep
-
- usub8 r9, r12, r9
- sel r6, r12, r11 ; hev mask: r6
-
- ;vp9_filter() function
- ; load soure data to r6, r11, r12, lr
- ldrh r9, [src, #-2]
- ldrh r10, [src], pstep
-
- pkhbt r12, r7, r8, lsl #16
-
- ldrh r7, [src, #-2]
- ldrh r8, [src], pstep
-
- pkhbt r11, r9, r10, lsl #16
-
- ldrh r9, [src, #-2]
- ldrh r10, [src], pstep
-
- ; Transpose needs 8 regs(r6 - r12, and lr). Save r6 and lr first
- str r6, [sp]
- str lr, [sp, #4]
-
- pkhbt r6, r7, r8, lsl #16
- pkhbt lr, r9, r10, lsl #16
-
- ;transpose r12, r11, r6, lr to r7, r8, r9, r10
- TRANSPOSE_MATRIX r12, r11, r6, lr, r7, r8, r9, r10
-
- ;load back hev_mask r6 and filter_mask lr
- ldr r12, c0x80808080
- ldr r6, [sp]
- ldr lr, [sp, #4]
-
- eor r7, r7, r12 ; p1 offset to convert to a signed value
- eor r8, r8, r12 ; p0 offset to convert to a signed value
- eor r9, r9, r12 ; q0 offset to convert to a signed value
- eor r10, r10, r12 ; q1 offset to convert to a signed value
-
- str r9, [sp] ; store qs0 temporarily
- str r8, [sp, #4] ; store ps0 temporarily
- str r10, [sp, #8] ; store qs1 temporarily
- str r7, [sp, #12] ; store ps1 temporarily
-
- qsub8 r7, r7, r10 ; vp9_signed_char_clamp(ps1-qs1)
- qsub8 r8, r9, r8 ; vp9_signed_char_clamp(vp9_filter + 3 * ( qs0 - ps0))
-
- and r7, r7, r6 ; vp9_filter (r7) &= hev (r7 : filter)
-
- qadd8 r7, r7, r8
- ldr r9, c0x03030303 ; r9 = 3 --modified for vp8
-
- qadd8 r7, r7, r8
- ldr r10, c0x04040404
-
- qadd8 r7, r7, r8
- ;mvn r11, #0 ; r11 == -1
-
- and r7, r7, lr ; vp9_filter &= mask
-
- ;modify code for vp8 -- Filter1 = vp9_filter (r7)
- qadd8 r8 , r7 , r9 ; Filter2 (r8) = vp9_signed_char_clamp(vp9_filter+3)
- qadd8 r7 , r7 , r10 ; vp9_filter = vp9_signed_char_clamp(vp9_filter+4)
-
- mov r9, #0
- shadd8 r8 , r8 , r9 ; Filter2 >>= 3
- shadd8 r7 , r7 , r9 ; vp9_filter >>= 3
- shadd8 r8 , r8 , r9
- shadd8 r7 , r7 , r9
- shadd8 lr , r8 , r9 ; lr: filter2
- shadd8 r7 , r7 , r9 ; r7: filter
-
- ;usub8 lr, r8, r10 ; s = (s==4)*-1
- ;sel lr, r11, r9
- ;usub8 r8, r10, r8
- ;sel r8, r11, r9
- ;and r8, r8, lr ; -1 for each element that equals 4 -- r8: s
-
- ;calculate output
- ;qadd8 lr, r8, r7 ; u = vp9_signed_char_clamp(s + vp9_filter)
-
- ldr r8, [sp] ; load qs0
- ldr r9, [sp, #4] ; load ps0
-
- ldr r10, c0x01010101
-
- qsub8 r8, r8, r7 ; u = vp9_signed_char_clamp(qs0 - vp9_filter)
- qadd8 r9, r9, lr ; u = vp9_signed_char_clamp(ps0 + Filter2)
- ;end of modification for vp8
-
- eor r8, r8, r12
- eor r9, r9, r12
-
- mov lr, #0
-
- sadd8 r7, r7, r10
- shadd8 r7, r7, lr
-
- ldr r10, [sp, #8] ; load qs1
- ldr r11, [sp, #12] ; load ps1
-
- bic r7, r7, r6 ; r7: vp9_filter
-
- qsub8 r10 , r10, r7 ; u = vp9_signed_char_clamp(qs1 - vp9_filter)
- qadd8 r11, r11, r7 ; u = vp9_signed_char_clamp(ps1 + vp9_filter)
- eor r10, r10, r12
- eor r11, r11, r12
-
- sub src, src, pstep, lsl #2
-
- ;we can use TRANSPOSE_MATRIX macro to transpose output - input: q1, q0, p0, p1
- ;output is b0, b1, b2, b3
- ;b0: 03 02 01 00
- ;b1: 13 12 11 10
- ;b2: 23 22 21 20
- ;b3: 33 32 31 30
- ; p1 p0 q0 q1
- ; (a3 a2 a1 a0)
- TRANSPOSE_MATRIX r11, r9, r8, r10, r6, r7, r12, lr
-
- strh r6, [src, #-2] ; store the result
- mov r6, r6, lsr #16
- strh r6, [src], pstep
-
- strh r7, [src, #-2]
- mov r7, r7, lsr #16
- strh r7, [src], pstep
-
- strh r12, [src, #-2]
- mov r12, r12, lsr #16
- strh r12, [src], pstep
-
- strh lr, [src, #-2]
- mov lr, lr, lsr #16
- strh lr, [src], pstep
-
-|vskip_filter|
- sub src, src, #4
- subs count, count, #1
-
- ldrne r6, [src], pstep ; load source data
- ldrne r7, [src], pstep
- ldrne r8, [src], pstep
- ldrne lr, [src], pstep
-
- bne Vnext8
-
- add sp, sp, #16
-
- ldmia sp!, {r4 - r11, pc}
- ENDP ; |vp9_loop_filter_vertical_edge_armv6|
-
-
-
-;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
-|vp8_mbloop_filter_vertical_edge_armv6| PROC
-;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
- stmdb sp!, {r4 - r11, lr}
-
- sub src, src, #4 ; move src pointer down by 4
- ldr count, [sp, #40] ; count for 8-in-parallel
- ldr r12, [sp, #36] ; load thresh address
- pld [src, #23] ; preload for next block
- sub sp, sp, #16 ; create temp buffer
-
- ldr r6, [src], pstep ; load source data
- ldrb r4, [r2] ; blimit
- pld [src, #23]
- ldr r7, [src], pstep
- ldrb r2, [r3] ; limit
- pld [src, #23]
- ldr r8, [src], pstep
- orr r4, r4, r4, lsl #8
- ldrb r3, [r12] ; thresh
- orr r2, r2, r2, lsl #8
- pld [src, #23]
- ldr lr, [src], pstep
- mov count, count, lsl #1 ; 4-in-parallel
- orr r4, r4, r4, lsl #16
- orr r3, r3, r3, lsl #8
- orr r2, r2, r2, lsl #16
- orr r3, r3, r3, lsl #16
-
-|MBVnext8|
- ; vp9_filter_mask() function
- ; calculate breakout conditions
- ; transpose the source data for 4-in-parallel operation
- TRANSPOSE_MATRIX r6, r7, r8, lr, r9, r10, r11, r12
-
- uqsub8 r7, r9, r10 ; p3 - p2
- uqsub8 r8, r10, r9 ; p2 - p3
- uqsub8 r9, r10, r11 ; p2 - p1
- uqsub8 r10, r11, r10 ; p1 - p2
- orr r7, r7, r8 ; abs (p3-p2)
- orr r10, r9, r10 ; abs (p2-p1)
- uqsub8 lr, r7, r2 ; compare to limit. lr: vp9_filter_mask
- uqsub8 r10, r10, r2 ; compare to limit
-
- sub src, src, pstep, lsl #2 ; move src pointer down by 4 lines
-
- orr lr, lr, r10
-
- uqsub8 r6, r11, r12 ; p1 - p0
- uqsub8 r7, r12, r11 ; p0 - p1
- add src, src, #4 ; move src pointer up by 4
- orr r6, r6, r7 ; abs (p1-p0)
- str r11, [sp, #12] ; save p1
- uqsub8 r10, r6, r2 ; compare to limit
- uqsub8 r11, r6, r3 ; compare to thresh
- orr lr, lr, r10
-
- ; transpose uses 8 regs(r6 - r12 and lr). Need to save reg value now
- ; transpose the source data for 4-in-parallel operation
- ldr r6, [src], pstep ; load source data
- str r11, [sp] ; push r11 to stack
- ldr r7, [src], pstep
- str r12, [sp, #4] ; save current reg before load q0 - q3 data
- ldr r8, [src], pstep
- str lr, [sp, #8]
- ldr lr, [src], pstep
-
-
- TRANSPOSE_MATRIX r6, r7, r8, lr, r9, r10, r11, r12
-
- ldr lr, [sp, #8] ; load back (f)limit accumulator
-
- uqsub8 r6, r12, r11 ; q3 - q2
- uqsub8 r7, r11, r12 ; q2 - q3
- uqsub8 r12, r11, r10 ; q2 - q1
- uqsub8 r11, r10, r11 ; q1 - q2
- orr r6, r6, r7 ; abs (q3-q2)
- orr r7, r12, r11 ; abs (q2-q1)
- uqsub8 r6, r6, r2 ; compare to limit
- uqsub8 r7, r7, r2 ; compare to limit
- ldr r11, [sp, #4] ; load back p0
- ldr r12, [sp, #12] ; load back p1
- orr lr, lr, r6
- orr lr, lr, r7
-
- uqsub8 r6, r11, r9 ; p0 - q0
- uqsub8 r7, r9, r11 ; q0 - p0
- uqsub8 r8, r12, r10 ; p1 - q1
- uqsub8 r11, r10, r12 ; q1 - p1
- orr r6, r6, r7 ; abs (p0-q0)
- ldr r7, c0x7F7F7F7F
- orr r8, r8, r11 ; abs (p1-q1)
- uqadd8 r6, r6, r6 ; abs (p0-q0) * 2
- and r8, r7, r8, lsr #1 ; abs (p1-q1) / 2
- uqsub8 r11, r10, r9 ; q1 - q0
- uqadd8 r6, r8, r6 ; abs (p0-q0)*2 + abs (p1-q1)/2
- uqsub8 r12, r9, r10 ; q0 - q1
- uqsub8 r6, r6, r4 ; compare to flimit
-
- orr r9, r11, r12 ; abs (q1-q0)
- uqsub8 r8, r9, r2 ; compare to limit
- uqsub8 r10, r9, r3 ; compare to thresh
- orr lr, lr, r6
- orr lr, lr, r8
-
- mvn r11, #0 ; r11 == -1
- mov r12, #0
-
- usub8 lr, r12, lr
- ldr r9, [sp] ; load the compared result
- sel lr, r11, r12 ; filter mask: lr
-
- cmp lr, #0
- beq mbvskip_filter ; skip filtering
-
-
-
- ;vp8_hevmask() function
- ;calculate high edge variance
-
- sub src, src, pstep, lsl #2 ; move src pointer down by 4 lines
-
- orr r9, r9, r10
-
- ldrh r7, [src, #-2]
- ldrh r8, [src], pstep
-
- usub8 r9, r12, r9
- sel r6, r12, r11 ; hev mask: r6
-
-
- ; vp8_mbfilter() function
- ; p2, q2 are only needed at the end. Don't need to load them in now.
- ; Transpose needs 8 regs(r6 - r12, and lr). Save r6 and lr first
- ; load soure data to r6, r11, r12, lr
- ldrh r9, [src, #-2]
- ldrh r10, [src], pstep
-
- pkhbt r12, r7, r8, lsl #16
-
- ldrh r7, [src, #-2]
- ldrh r8, [src], pstep
-
- pkhbt r11, r9, r10, lsl #16
-
- ldrh r9, [src, #-2]
- ldrh r10, [src], pstep
-
- str r6, [sp] ; save r6
- str lr, [sp, #4] ; save lr
-
- pkhbt r6, r7, r8, lsl #16
- pkhbt lr, r9, r10, lsl #16
-
- ;transpose r12, r11, r6, lr to p1, p0, q0, q1
- TRANSPOSE_MATRIX r12, r11, r6, lr, r7, r8, r9, r10
-
- ;load back hev_mask r6 and filter_mask lr
- ldr r12, c0x80808080
- ldr r6, [sp]
- ldr lr, [sp, #4]
-
- eor r7, r7, r12 ; ps1
- eor r8, r8, r12 ; ps0
- eor r9, r9, r12 ; qs0
- eor r10, r10, r12 ; qs1
-
- qsub8 r12, r9, r8 ; vp9_signed_char_clamp(vp9_filter + 3 * ( qs0 - ps0))
- str r7, [sp, #12] ; store ps1 temporarily
- qsub8 r7, r7, r10 ; vp9_signed_char_clamp(ps1-qs1)
- str r10, [sp, #8] ; store qs1 temporarily
- qadd8 r7, r7, r12
- str r9, [sp] ; store qs0 temporarily
- qadd8 r7, r7, r12
- str r8, [sp, #4] ; store ps0 temporarily
- qadd8 r7, r7, r12 ; vp9_filter: r7
-
- ldr r10, c0x03030303 ; r10 = 3 --modified for vp8
- ldr r9, c0x04040404
- ;mvn r11, #0 ; r11 == -1
-
- and r7, r7, lr ; vp9_filter &= mask (lr is free)
-
- mov r12, r7 ; Filter2: r12
- and r12, r12, r6 ; Filter2 &= hev
-
- ;modify code for vp8
- ;save bottom 3 bits so that we round one side +4 and the other +3
- qadd8 r8 , r12 , r9 ; Filter1 (r8) = vp9_signed_char_clamp(Filter2+4)
- qadd8 r12 , r12 , r10 ; Filter2 (r12) = vp9_signed_char_clamp(Filter2+3)
-
- mov r10, #0
- shadd8 r8 , r8 , r10 ; Filter1 >>= 3
- shadd8 r12 , r12 , r10 ; Filter2 >>= 3
- shadd8 r8 , r8 , r10
- shadd8 r12 , r12 , r10
- shadd8 r8 , r8 , r10 ; r8: Filter1
- shadd8 r12 , r12 , r10 ; r12: Filter2
-
- ldr r9, [sp] ; load qs0
- ldr r11, [sp, #4] ; load ps0
-
- qsub8 r9 , r9, r8 ; qs0 = vp9_signed_char_clamp(qs0 - Filter1)
- qadd8 r11, r11, r12 ; ps0 = vp9_signed_char_clamp(ps0 + Filter2)
-
- ;save bottom 3 bits so that we round one side +4 and the other +3
- ;and r8, r12, r10 ; s = Filter2 & 7 (s: r8)
- ;qadd8 r12 , r12 , r9 ; Filter2 = vp9_signed_char_clamp(Filter2+4)
- ;mov r10, #0
- ;shadd8 r12 , r12 , r10 ; Filter2 >>= 3
- ;usub8 lr, r8, r9 ; s = (s==4)*-1
- ;sel lr, r11, r10
- ;shadd8 r12 , r12 , r10
- ;usub8 r8, r9, r8
- ;sel r8, r11, r10
- ;ldr r9, [sp] ; load qs0
- ;ldr r11, [sp, #4] ; load ps0
- ;shadd8 r12 , r12 , r10
- ;and r8, r8, lr ; -1 for each element that equals 4
- ;qadd8 r10, r8, r12 ; u = vp9_signed_char_clamp(s + Filter2)
- ;qsub8 r9 , r9, r12 ; qs0 = vp9_signed_char_clamp(qs0 - Filter2)
- ;qadd8 r11, r11, r10 ; ps0 = vp9_signed_char_clamp(ps0 + u)
-
- ;end of modification for vp8
-
- bic r12, r7, r6 ;vp9_filter &= ~hev ( r6 is free)
- ;mov r12, r7
-
- ;roughly 3/7th difference across boundary
- mov lr, #0x1b ; 27
- mov r7, #0x3f ; 63
-
- sxtb16 r6, r12
- sxtb16 r10, r12, ror #8
- smlabb r8, r6, lr, r7
- smlatb r6, r6, lr, r7
- smlabb r7, r10, lr, r7
- smultb r10, r10, lr
- ssat r8, #8, r8, asr #7
- ssat r6, #8, r6, asr #7
- add r10, r10, #63
- ssat r7, #8, r7, asr #7
- ssat r10, #8, r10, asr #7
-
- ldr lr, c0x80808080
-
- pkhbt r6, r8, r6, lsl #16
- pkhbt r10, r7, r10, lsl #16
- uxtb16 r6, r6
- uxtb16 r10, r10
-
- sub src, src, pstep, lsl #2 ; move src pointer down by 4 lines
-
- orr r10, r6, r10, lsl #8 ; u = vp9_signed_char_clamp((63 + Filter2 * 27)>>7)
-
- qsub8 r8, r9, r10 ; s = vp9_signed_char_clamp(qs0 - u)
- qadd8 r10, r11, r10 ; s = vp9_signed_char_clamp(ps0 + u)
- eor r8, r8, lr ; *oq0 = s^0x80
- eor r10, r10, lr ; *op0 = s^0x80
-
- strb r10, [src, #-1] ; store op0 result
- strb r8, [src], pstep ; store oq0 result
- mov r10, r10, lsr #8
- mov r8, r8, lsr #8
- strb r10, [src, #-1]
- strb r8, [src], pstep
- mov r10, r10, lsr #8
- mov r8, r8, lsr #8
- strb r10, [src, #-1]
- strb r8, [src], pstep
- mov r10, r10, lsr #8
- mov r8, r8, lsr #8
- strb r10, [src, #-1]
- strb r8, [src], pstep
-
- ;roughly 2/7th difference across boundary
- mov lr, #0x12 ; 18
- mov r7, #0x3f ; 63
-
- sxtb16 r6, r12
- sxtb16 r10, r12, ror #8
- smlabb r8, r6, lr, r7
- smlatb r6, r6, lr, r7
- smlabb r9, r10, lr, r7
-
- smlatb r10, r10, lr, r7
- ssat r8, #8, r8, asr #7
- ssat r6, #8, r6, asr #7
- ssat r9, #8, r9, asr #7
- ssat r10, #8, r10, asr #7
-
- sub src, src, pstep, lsl #2 ; move src pointer down by 4 lines
-
- pkhbt r6, r8, r6, lsl #16
- pkhbt r10, r9, r10, lsl #16
-
- ldr r9, [sp, #8] ; load qs1
- ldr r11, [sp, #12] ; load ps1
- ldr lr, c0x80808080
-
- uxtb16 r6, r6
- uxtb16 r10, r10
-
- add src, src, #2
-
- orr r10, r6, r10, lsl #8 ; u = vp9_signed_char_clamp((63 + Filter2 * 18)>>7)
-
- qsub8 r8, r9, r10 ; s = vp9_signed_char_clamp(qs1 - u)
- qadd8 r10, r11, r10 ; s = vp9_signed_char_clamp(ps1 + u)
- eor r8, r8, lr ; *oq1 = s^0x80
- eor r10, r10, lr ; *op1 = s^0x80
-
- ldrb r11, [src, #-5] ; load p2 for 1/7th difference across boundary
- strb r10, [src, #-4] ; store op1
- strb r8, [src, #-1] ; store oq1
- ldrb r9, [src], pstep ; load q2 for 1/7th difference across boundary
-
- mov r10, r10, lsr #8
- mov r8, r8, lsr #8
-
- ldrb r6, [src, #-5]
- strb r10, [src, #-4]
- strb r8, [src, #-1]
- ldrb r7, [src], pstep
-
- mov r10, r10, lsr #8
- mov r8, r8, lsr #8
- orr r11, r11, r6, lsl #8
- orr r9, r9, r7, lsl #8
-
- ldrb r6, [src, #-5]
- strb r10, [src, #-4]
- strb r8, [src, #-1]
- ldrb r7, [src], pstep
-
- mov r10, r10, lsr #8
- mov r8, r8, lsr #8
- orr r11, r11, r6, lsl #16
- orr r9, r9, r7, lsl #16
-
- ldrb r6, [src, #-5]
- strb r10, [src, #-4]
- strb r8, [src, #-1]
- ldrb r7, [src], pstep
- orr r11, r11, r6, lsl #24
- orr r9, r9, r7, lsl #24
-
- ;roughly 1/7th difference across boundary
- eor r9, r9, lr
- eor r11, r11, lr
-
- mov lr, #0x9 ; 9
- mov r7, #0x3f ; 63
-
- sxtb16 r6, r12
- sxtb16 r10, r12, ror #8
- smlabb r8, r6, lr, r7
- smlatb r6, r6, lr, r7
- smlabb r12, r10, lr, r7
- smlatb r10, r10, lr, r7
- ssat r8, #8, r8, asr #7
- ssat r6, #8, r6, asr #7
- ssat r12, #8, r12, asr #7
- ssat r10, #8, r10, asr #7
-
- sub src, src, pstep, lsl #2
-
- pkhbt r6, r8, r6, lsl #16
- pkhbt r10, r12, r10, lsl #16
-
- uxtb16 r6, r6
- uxtb16 r10, r10
-
- ldr lr, c0x80808080
-
- orr r10, r6, r10, lsl #8 ; u = vp9_signed_char_clamp((63 + Filter2 * 9)>>7)
-
- qadd8 r8, r11, r10 ; s = vp9_signed_char_clamp(ps2 + u)
- qsub8 r10, r9, r10 ; s = vp9_signed_char_clamp(qs2 - u)
- eor r8, r8, lr ; *op2 = s^0x80
- eor r10, r10, lr ; *oq2 = s^0x80
-
- strb r8, [src, #-5] ; store *op2
- strb r10, [src], pstep ; store *oq2
- mov r8, r8, lsr #8
- mov r10, r10, lsr #8
- strb r8, [src, #-5]
- strb r10, [src], pstep
- mov r8, r8, lsr #8
- mov r10, r10, lsr #8
- strb r8, [src, #-5]
- strb r10, [src], pstep
- mov r8, r8, lsr #8
- mov r10, r10, lsr #8
- strb r8, [src, #-5]
- strb r10, [src], pstep
-
- ;adjust src pointer for next loop
- sub src, src, #2
-
-|mbvskip_filter|
- sub src, src, #4
- subs count, count, #1
-
- pld [src, #23] ; preload for next block
- ldrne r6, [src], pstep ; load source data
- pld [src, #23]
- ldrne r7, [src], pstep
- pld [src, #23]
- ldrne r8, [src], pstep
- pld [src, #23]
- ldrne lr, [src], pstep
-
- bne MBVnext8
-
- add sp, sp, #16
-
- ldmia sp!, {r4 - r11, pc}
- ENDP ; |vp8_mbloop_filter_vertical_edge_armv6|
-
-; Constant Pool
-c0x80808080 DCD 0x80808080
-c0x03030303 DCD 0x03030303
-c0x04040404 DCD 0x04040404
-c0x01010101 DCD 0x01010101
-c0x7F7F7F7F DCD 0x7F7F7F7F
-
- END
--- a/vp8/common/arm/armv6/recon_v6.asm
+++ /dev/null
@@ -1,281 +1,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
- EXPORT |vp8_recon_b_armv6|
- EXPORT |vp8_recon2b_armv6|
- EXPORT |vp8_recon4b_armv6|
-
- AREA |.text|, CODE, READONLY ; name this block of code
-prd RN r0
-dif RN r1
-dst RN r2
-stride RN r3
-
-;void recon_b(unsigned char *pred_ptr, short *diff_ptr, unsigned char *dst_ptr, int stride)
-; R0 char* pred_ptr
-; R1 short * dif_ptr
-; R2 char * dst_ptr
-; R3 int stride
-
-; Description:
-; Loop through the block adding the Pred and Diff together. Clamp and then
-; store back into the Dst.
-
-; Restrictions :
-; all buffers are expected to be 4 byte aligned coming in and
-; going out.
-;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
-;
-;
-;
-;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
-|vp8_recon_b_armv6| PROC
- stmdb sp!, {r4 - r9, lr}
-
- ;0, 1, 2, 3
- ldr r4, [prd], #16 ; 3 | 2 | 1 | 0
- ldr r6, [dif, #0] ; 1 | 0
- ldr r7, [dif, #4] ; 3 | 2
-
- pkhbt r8, r6, r7, lsl #16 ; 2 | 0
- pkhtb r9, r7, r6, asr #16 ; 3 | 1
-
- uxtab16 r8, r8, r4 ; 2 | 0 + 3 | 2 | 2 | 0
- uxtab16 r9, r9, r4, ror #8 ; 3 | 1 + 0 | 3 | 2 | 1
-
- usat16 r8, #8, r8
- usat16 r9, #8, r9
- add dif, dif, #32
- orr r8, r8, r9, lsl #8
-
- str r8, [dst], stride
-
- ;0, 1, 2, 3
- ldr r4, [prd], #16 ; 3 | 2 | 1 | 0
-;; ldr r6, [dif, #8] ; 1 | 0
-;; ldr r7, [dif, #12] ; 3 | 2
- ldr r6, [dif, #0] ; 1 | 0
- ldr r7, [dif, #4] ; 3 | 2
-
- pkhbt r8, r6, r7, lsl #16 ; 2 | 0
- pkhtb r9, r7, r6, asr #16 ; 3 | 1
-
- uxtab16 r8, r8, r4 ; 2 | 0 + 3 | 2 | 2 | 0
- uxtab16 r9, r9, r4, ror #8 ; 3 | 1 + 0 | 3 | 2 | 1
-
- usat16 r8, #8, r8
- usat16 r9, #8, r9
- add dif, dif, #32
- orr r8, r8, r9, lsl #8
-
- str r8, [dst], stride
-
- ;0, 1, 2, 3
- ldr r4, [prd], #16 ; 3 | 2 | 1 | 0
-;; ldr r6, [dif, #16] ; 1 | 0
-;; ldr r7, [dif, #20] ; 3 | 2
- ldr r6, [dif, #0] ; 1 | 0
- ldr r7, [dif, #4] ; 3 | 2
-
- pkhbt r8, r6, r7, lsl #16 ; 2 | 0
- pkhtb r9, r7, r6, asr #16 ; 3 | 1
-
- uxtab16 r8, r8, r4 ; 2 | 0 + 3 | 2 | 2 | 0
- uxtab16 r9, r9, r4, ror #8 ; 3 | 1 + 0 | 3 | 2 | 1
-
- usat16 r8, #8, r8
- usat16 r9, #8, r9
- add dif, dif, #32
- orr r8, r8, r9, lsl #8
-
- str r8, [dst], stride
-
- ;0, 1, 2, 3
- ldr r4, [prd], #16 ; 3 | 2 | 1 | 0
-;; ldr r6, [dif, #24] ; 1 | 0
-;; ldr r7, [dif, #28] ; 3 | 2
- ldr r6, [dif, #0] ; 1 | 0
- ldr r7, [dif, #4] ; 3 | 2
-
- pkhbt r8, r6, r7, lsl #16 ; 2 | 0
- pkhtb r9, r7, r6, asr #16 ; 3 | 1
-
- uxtab16 r8, r8, r4 ; 2 | 0 + 3 | 2 | 2 | 0
- uxtab16 r9, r9, r4, ror #8 ; 3 | 1 + 0 | 3 | 2 | 1
-
- usat16 r8, #8, r8
- usat16 r9, #8, r9
- orr r8, r8, r9, lsl #8
-
- str r8, [dst], stride
-
- ldmia sp!, {r4 - r9, pc}
-
- ENDP ; |recon_b|
-
-;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
-;
-;
-;
-;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
-; R0 char *pred_ptr
-; R1 short *dif_ptr
-; R2 char *dst_ptr
-; R3 int stride
-|vp8_recon4b_armv6| PROC
- stmdb sp!, {r4 - r9, lr}
-
- mov lr, #4
-
-recon4b_loop
- ;0, 1, 2, 3
- ldr r4, [prd], #4 ; 3 | 2 | 1 | 0
- ldr r6, [dif, #0] ; 1 | 0
- ldr r7, [dif, #4] ; 3 | 2
-
- pkhbt r8, r6, r7, lsl #16 ; 2 | 0
- pkhtb r9, r7, r6, asr #16 ; 3 | 1
-
- uxtab16 r8, r8, r4 ; 2 | 0 + 3 | 2 | 2 | 0
- uxtab16 r9, r9, r4, ror #8 ; 3 | 1 + 0 | 3 | 2 | 1
-
- usat16 r8, #8, r8
- usat16 r9, #8, r9
- orr r8, r8, r9, lsl #8
-
- str r8, [dst]
-
- ;4, 5, 6, 7
- ldr r4, [prd], #4
-;; ldr r6, [dif, #32]
-;; ldr r7, [dif, #36]
- ldr r6, [dif, #8]
- ldr r7, [dif, #12]
-
- pkhbt r8, r6, r7, lsl #16
- pkhtb r9, r7, r6, asr #16
-
- uxtab16 r8, r8, r4
- uxtab16 r9, r9, r4, ror #8
- usat16 r8, #8, r8
- usat16 r9, #8, r9
- orr r8, r8, r9, lsl #8
-
- str r8, [dst, #4]
-
- ;8, 9, 10, 11
- ldr r4, [prd], #4
-;; ldr r6, [dif, #64]
-;; ldr r7, [dif, #68]
- ldr r6, [dif, #16]
- ldr r7, [dif, #20]
-
- pkhbt r8, r6, r7, lsl #16
- pkhtb r9, r7, r6, asr #16
-
- uxtab16 r8, r8, r4
- uxtab16 r9, r9, r4, ror #8
- usat16 r8, #8, r8
- usat16 r9, #8, r9
- orr r8, r8, r9, lsl #8
-
- str r8, [dst, #8]
-
- ;12, 13, 14, 15
- ldr r4, [prd], #4
-;; ldr r6, [dif, #96]
-;; ldr r7, [dif, #100]
- ldr r6, [dif, #24]
- ldr r7, [dif, #28]
-
- pkhbt r8, r6, r7, lsl #16
- pkhtb r9, r7, r6, asr #16
-
- uxtab16 r8, r8, r4
- uxtab16 r9, r9, r4, ror #8
- usat16 r8, #8, r8
- usat16 r9, #8, r9
- orr r8, r8, r9, lsl #8
-
- str r8, [dst, #12]
-
- add dst, dst, stride
-;; add dif, dif, #8
- add dif, dif, #32
-
- subs lr, lr, #1
- bne recon4b_loop
-
- ldmia sp!, {r4 - r9, pc}
-
- ENDP ; |Recon4B|
-
-;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
-;
-;
-;
-;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
-; R0 char *pred_ptr
-; R1 short *dif_ptr
-; R2 char *dst_ptr
-; R3 int stride
-|vp8_recon2b_armv6| PROC
- stmdb sp!, {r4 - r9, lr}
-
- mov lr, #4
-
-recon2b_loop
- ;0, 1, 2, 3
- ldr r4, [prd], #4
- ldr r6, [dif, #0]
- ldr r7, [dif, #4]
-
- pkhbt r8, r6, r7, lsl #16
- pkhtb r9, r7, r6, asr #16
-
- uxtab16 r8, r8, r4
- uxtab16 r9, r9, r4, ror #8
- usat16 r8, #8, r8
- usat16 r9, #8, r9
- orr r8, r8, r9, lsl #8
-
- str r8, [dst]
-
- ;4, 5, 6, 7
- ldr r4, [prd], #4
-;; ldr r6, [dif, #32]
-;; ldr r7, [dif, #36]
- ldr r6, [dif, #8]
- ldr r7, [dif, #12]
-
- pkhbt r8, r6, r7, lsl #16
- pkhtb r9, r7, r6, asr #16
-
- uxtab16 r8, r8, r4
- uxtab16 r9, r9, r4, ror #8
- usat16 r8, #8, r8
- usat16 r9, #8, r9
- orr r8, r8, r9, lsl #8
-
- str r8, [dst, #4]
-
- add dst, dst, stride
-;; add dif, dif, #8
- add dif, dif, #16
-
- subs lr, lr, #1
- bne recon2b_loop
-
- ldmia sp!, {r4 - r9, pc}
-
- ENDP ; |Recon2B|
-
- END
--- a/vp8/common/arm/armv6/simpleloopfilter_v6.asm
+++ /dev/null
@@ -1,286 +1,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
- EXPORT |vp9_loop_filter_simple_horizontal_edge_armv6|
- EXPORT |vp9_loop_filter_simple_vertical_edge_armv6|
-
- AREA |.text|, CODE, READONLY ; name this block of code
-
- MACRO
- TRANSPOSE_MATRIX $a0, $a1, $a2, $a3, $b0, $b1, $b2, $b3
- ; input: $a0, $a1, $a2, $a3; output: $b0, $b1, $b2, $b3
- ; a0: 03 02 01 00
- ; a1: 13 12 11 10
- ; a2: 23 22 21 20
- ; a3: 33 32 31 30
- ; b3 b2 b1 b0
-
- uxtb16 $b1, $a1 ; xx 12 xx 10
- uxtb16 $b0, $a0 ; xx 02 xx 00
- uxtb16 $b3, $a3 ; xx 32 xx 30
- uxtb16 $b2, $a2 ; xx 22 xx 20
- orr $b1, $b0, $b1, lsl #8 ; 12 02 10 00
- orr $b3, $b2, $b3, lsl #8 ; 32 22 30 20
-
- uxtb16 $a1, $a1, ror #8 ; xx 13 xx 11
- uxtb16 $a3, $a3, ror #8 ; xx 33 xx 31
- uxtb16 $a0, $a0, ror #8 ; xx 03 xx 01
- uxtb16 $a2, $a2, ror #8 ; xx 23 xx 21
- orr $a0, $a0, $a1, lsl #8 ; 13 03 11 01
- orr $a2, $a2, $a3, lsl #8 ; 33 23 31 21
-
- pkhtb $b2, $b3, $b1, asr #16 ; 32 22 12 02 -- p1
- pkhbt $b0, $b1, $b3, lsl #16 ; 30 20 10 00 -- p3
-
- pkhtb $b3, $a2, $a0, asr #16 ; 33 23 13 03 -- p0
- pkhbt $b1, $a0, $a2, lsl #16 ; 31 21 11 01 -- p2
- MEND
-
-
-
-src RN r0
-pstep RN r1
-
-;r0 unsigned char *src_ptr,
-;r1 int src_pixel_step,
-;r2 const char *blimit
-
-;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
-|vp9_loop_filter_simple_horizontal_edge_armv6| PROC
-;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
- stmdb sp!, {r4 - r11, lr}
-
- ldrb r12, [r2] ; blimit
- ldr r3, [src, -pstep, lsl #1] ; p1
- ldr r4, [src, -pstep] ; p0
- ldr r5, [src] ; q0
- ldr r6, [src, pstep] ; q1
- orr r12, r12, r12, lsl #8 ; blimit
- ldr r2, c0x80808080
- orr r12, r12, r12, lsl #16 ; blimit
- mov r9, #4 ; double the count. we're doing 4 at a time
- mov lr, #0 ; need 0 in a couple places
-
-|simple_hnext8|
- ; vp8_simple_filter_mask()
-
- uqsub8 r7, r3, r6 ; p1 - q1
- uqsub8 r8, r6, r3 ; q1 - p1
- uqsub8 r10, r4, r5 ; p0 - q0
- uqsub8 r11, r5, r4 ; q0 - p0
- orr r8, r8, r7 ; abs(p1 - q1)
- orr r10, r10, r11 ; abs(p0 - q0)
- uqadd8 r10, r10, r10 ; abs(p0 - q0) * 2
- uhadd8 r8, r8, lr ; abs(p1 - q2) >> 1
- uqadd8 r10, r10, r8 ; abs(p0 - q0)*2 + abs(p1 - q1)/2
- mvn r8, #0
- usub8 r10, r12, r10 ; compare to flimit. usub8 sets GE flags
- sel r10, r8, lr ; filter mask: F or 0
- cmp r10, #0
- beq simple_hskip_filter ; skip filtering if all masks are 0x00
-
- ;vp8_simple_filter()
-
- eor r3, r3, r2 ; p1 offset to convert to a signed value
- eor r6, r6, r2 ; q1 offset to convert to a signed value
- eor r4, r4, r2 ; p0 offset to convert to a signed value
- eor r5, r5, r2 ; q0 offset to convert to a signed value
-
- qsub8 r3, r3, r6 ; vp9_filter = p1 - q1
- qsub8 r6, r5, r4 ; q0 - p0
- qadd8 r3, r3, r6 ; += q0 - p0
- ldr r7, c0x04040404
- qadd8 r3, r3, r6 ; += q0 - p0
- ldr r8, c0x03030303
- qadd8 r3, r3, r6 ; vp9_filter = p1-q1 + 3*(q0-p0))
- ;STALL
- and r3, r3, r10 ; vp9_filter &= mask
-
- qadd8 r7 , r3 , r7 ; Filter1 = vp9_filter + 4
- qadd8 r8 , r3 , r8 ; Filter2 = vp9_filter + 3
-
- shadd8 r7 , r7 , lr
- shadd8 r8 , r8 , lr
- shadd8 r7 , r7 , lr
- shadd8 r8 , r8 , lr
- shadd8 r7 , r7 , lr ; Filter1 >>= 3
- shadd8 r8 , r8 , lr ; Filter2 >>= 3
-
- qsub8 r5 ,r5, r7 ; u = q0 - Filter1
- qadd8 r4, r4, r8 ; u = p0 + Filter2
- eor r5, r5, r2 ; *oq0 = u^0x80
- str r5, [src] ; store oq0 result
- eor r4, r4, r2 ; *op0 = u^0x80
- str r4, [src, -pstep] ; store op0 result
-
-|simple_hskip_filter|
- subs r9, r9, #1
- addne src, src, #4 ; next row
-
- ldrne r3, [src, -pstep, lsl #1] ; p1
- ldrne r4, [src, -pstep] ; p0
- ldrne r5, [src] ; q0
- ldrne r6, [src, pstep] ; q1
-
- bne simple_hnext8
-
- ldmia sp!, {r4 - r11, pc}
- ENDP ; |vp9_loop_filter_simple_horizontal_edge_armv6|
-
-
-;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
-|vp9_loop_filter_simple_vertical_edge_armv6| PROC
-;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
- stmdb sp!, {r4 - r11, lr}
-
- ldrb r12, [r2] ; r12: blimit
- ldr r2, c0x80808080
- orr r12, r12, r12, lsl #8
-
- ; load source data to r7, r8, r9, r10
- ldrh r3, [src, #-2]
- pld [src, #23] ; preload for next block
- ldrh r4, [src], pstep
- orr r12, r12, r12, lsl #16
-
- ldrh r5, [src, #-2]
- pld [src, #23]
- ldrh r6, [src], pstep
-
- pkhbt r7, r3, r4, lsl #16
-
- ldrh r3, [src, #-2]
- pld [src, #23]
- ldrh r4, [src], pstep
-
- pkhbt r8, r5, r6, lsl #16
-
- ldrh r5, [src, #-2]
- pld [src, #23]
- ldrh r6, [src], pstep
- mov r11, #4 ; double the count. we're doing 4 at a time
-
-|simple_vnext8|
- ; vp8_simple_filter_mask() function
- pkhbt r9, r3, r4, lsl #16
- pkhbt r10, r5, r6, lsl #16
-
- ;transpose r7, r8, r9, r10 to r3, r4, r5, r6
- TRANSPOSE_MATRIX r7, r8, r9, r10, r3, r4, r5, r6
-
- uqsub8 r7, r3, r6 ; p1 - q1
- uqsub8 r8, r6, r3 ; q1 - p1
- uqsub8 r9, r4, r5 ; p0 - q0
- uqsub8 r10, r5, r4 ; q0 - p0
- orr r7, r7, r8 ; abs(p1 - q1)
- orr r9, r9, r10 ; abs(p0 - q0)
- mov r8, #0
- uqadd8 r9, r9, r9 ; abs(p0 - q0) * 2
- uhadd8 r7, r7, r8 ; abs(p1 - q1) / 2
- uqadd8 r7, r7, r9 ; abs(p0 - q0)*2 + abs(p1 - q1)/2
- mvn r10, #0 ; r10 == -1
-
- usub8 r7, r12, r7 ; compare to flimit
- sel lr, r10, r8 ; filter mask
-
- cmp lr, #0
- beq simple_vskip_filter ; skip filtering
-
- ;vp8_simple_filter() function
- eor r3, r3, r2 ; p1 offset to convert to a signed value
- eor r6, r6, r2 ; q1 offset to convert to a signed value
- eor r4, r4, r2 ; p0 offset to convert to a signed value
- eor r5, r5, r2 ; q0 offset to convert to a signed value
-
- qsub8 r3, r3, r6 ; vp9_filter = p1 - q1
- qsub8 r6, r5, r4 ; q0 - p0
-
- qadd8 r3, r3, r6 ; vp9_filter += q0 - p0
- ldr r9, c0x03030303 ; r9 = 3
-
- qadd8 r3, r3, r6 ; vp9_filter += q0 - p0
- ldr r7, c0x04040404
-
- qadd8 r3, r3, r6 ; vp9_filter = p1-q1 + 3*(q0-p0))
- ;STALL
- and r3, r3, lr ; vp9_filter &= mask
-
- qadd8 r9 , r3 , r9 ; Filter2 = vp9_filter + 3
- qadd8 r3 , r3 , r7 ; Filter1 = vp9_filter + 4
-
- shadd8 r9 , r9 , r8
- shadd8 r3 , r3 , r8
- shadd8 r9 , r9 , r8
- shadd8 r3 , r3 , r8
- shadd8 r9 , r9 , r8 ; Filter2 >>= 3
- shadd8 r3 , r3 , r8 ; Filter1 >>= 3
-
- ;calculate output
- sub src, src, pstep, lsl #2
-
- qadd8 r4, r4, r9 ; u = p0 + Filter2
- qsub8 r5, r5, r3 ; u = q0 - Filter1
- eor r4, r4, r2 ; *op0 = u^0x80
- eor r5, r5, r2 ; *oq0 = u^0x80
-
- strb r4, [src, #-1] ; store the result
- mov r4, r4, lsr #8
- strb r5, [src], pstep
- mov r5, r5, lsr #8
-
- strb r4, [src, #-1]
- mov r4, r4, lsr #8
- strb r5, [src], pstep
- mov r5, r5, lsr #8
-
- strb r4, [src, #-1]
- mov r4, r4, lsr #8
- strb r5, [src], pstep
- mov r5, r5, lsr #8
-
- strb r4, [src, #-1]
- strb r5, [src], pstep
-
-|simple_vskip_filter|
- subs r11, r11, #1
-
- ; load source data to r7, r8, r9, r10
- ldrneh r3, [src, #-2]
- pld [src, #23] ; preload for next block
- ldrneh r4, [src], pstep
-
- ldrneh r5, [src, #-2]
- pld [src, #23]
- ldrneh r6, [src], pstep
-
- pkhbt r7, r3, r4, lsl #16
-
- ldrneh r3, [src, #-2]
- pld [src, #23]
- ldrneh r4, [src], pstep
-
- pkhbt r8, r5, r6, lsl #16
-
- ldrneh r5, [src, #-2]
- pld [src, #23]
- ldrneh r6, [src], pstep
-
- bne simple_vnext8
-
- ldmia sp!, {r4 - r11, pc}
- ENDP ; |vp9_loop_filter_simple_vertical_edge_armv6|
-
-; Constant Pool
-c0x80808080 DCD 0x80808080
-c0x03030303 DCD 0x03030303
-c0x04040404 DCD 0x04040404
-
- END
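
For the record, the simple loop filter removed above is the standard VP8-style simple filter: the uqsub8/uhadd8 block builds the mask abs(p0 - q0) * 2 + abs(p1 - q1) / 2 <= blimit, and the qadd8/qsub8 block applies p1 - q1 + 3 * (q0 - p0) in offset signed 8-bit arithmetic. A rough scalar C sketch of the per-pixel math (hypothetical helper names; note the real code saturates each accumulation step separately, which the single sat8() here approximates):

    #include <stdlib.h>

    /* Saturate to signed 8-bit range, mirroring qadd8/qsub8 behaviour. */
    static signed char sat8(int v) {
      return (signed char)(v < -128 ? -128 : (v > 127 ? 127 : v));
    }

    /* One-pixel equivalent of the SIMD sequences above (assumed helper). */
    static void simple_filter_pixel(unsigned char *op0, unsigned char *oq0,
                                    unsigned char p1, unsigned char q1,
                                    unsigned char blimit) {
      unsigned char p0 = *op0, q0 = *oq0;
      /* filter mask: abs(p0 - q0) * 2 + abs(p1 - q1) / 2 <= blimit */
      int mask = (abs(p0 - q0) * 2 + abs(p1 - q1) / 2) <= blimit ? -1 : 0;
      /* offset by 0x80 so the arithmetic runs on signed values */
      signed char ps1 = (signed char)(p1 ^ 0x80), qs1 = (signed char)(q1 ^ 0x80);
      signed char ps0 = (signed char)(p0 ^ 0x80), qs0 = (signed char)(q0 ^ 0x80);
      signed char f  = (signed char)(sat8(sat8(ps1 - qs1) + 3 * (qs0 - ps0)) & mask);
      signed char f1 = (signed char)(sat8(f + 4) >> 3);   /* Filter1 */
      signed char f2 = (signed char)(sat8(f + 3) >> 3);   /* Filter2 */
      *oq0 = (unsigned char)(sat8(qs0 - f1) ^ 0x80);      /* q0 - Filter1 */
      *op0 = (unsigned char)(sat8(ps0 + f2) ^ 0x80);      /* p0 + Filter2 */
    }
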
--- a/vp8/common/arm/armv6/sixtappredict8x4_v6.asm
+++ /dev/null
@@ -1,273 +1,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
- EXPORT |vp8_sixtap_predict8x4_armv6|
-
- AREA |.text|, CODE, READONLY ; name this block of code
-;-------------------------------------
-; r0 unsigned char *src_ptr,
-; r1 int src_pixels_per_line,
-; r2 int xoffset,
-; r3 int yoffset,
-; stack unsigned char *dst_ptr,
-; stack int dst_pitch
-;-------------------------------------
-;note: The first pass stores its result transposed (8 lines x 9 columns) on the stack; temporary stack size is 184 bytes.
-;Each line is 20 bytes wide: 9 shorts plus 2 bytes of padding for 4-byte alignment. The second pass loads the data
-;from the stack and stores its result transposed back.
-|vp8_sixtap_predict8x4_armv6| PROC
- stmdb sp!, {r4 - r11, lr}
- str r3, [sp, #-184]! ;reserve space on stack for temporary storage, store yoffset
-
- cmp r2, #0 ;skip first_pass filter if xoffset=0
- add lr, sp, #4 ;point to temporary buffer
- beq skip_firstpass_filter
-
-;first-pass filter
- adr r12, filter8_coeff
- sub r0, r0, r1, lsl #1
-
- add r3, r1, #10 ; preload next row
- pld [r0, r3]
-
- add r2, r12, r2, lsl #4 ;calculate filter location
- add r0, r0, #3 ;adjust src only for loading convenience
-
- ldr r3, [r2] ; load up packed filter coefficients
- ldr r4, [r2, #4]
- ldr r5, [r2, #8]
-
- mov r2, #0x90000 ; height=9 is top part of counter
-
- sub r1, r1, #8
-
-|first_pass_hloop_v6|
- ldrb r6, [r0, #-5] ; load source data
- ldrb r7, [r0, #-4]
- ldrb r8, [r0, #-3]
- ldrb r9, [r0, #-2]
- ldrb r10, [r0, #-1]
-
- orr r2, r2, #0x4 ; construct loop counter. width=8=4x2
-
- pkhbt r6, r6, r7, lsl #16 ; r7 | r6
- pkhbt r7, r7, r8, lsl #16 ; r8 | r7
-
- pkhbt r8, r8, r9, lsl #16 ; r9 | r8
- pkhbt r9, r9, r10, lsl #16 ; r10 | r9
-
-|first_pass_wloop_v6|
- smuad r11, r6, r3 ; vp9_filter[0], vp9_filter[1]
- smuad r12, r7, r3
-
- ldrb r6, [r0], #1
-
- smlad r11, r8, r4, r11 ; vp9_filter[2], vp9_filter[3]
- ldrb r7, [r0], #1
- smlad r12, r9, r4, r12
-
- pkhbt r10, r10, r6, lsl #16 ; r10 | r9
- pkhbt r6, r6, r7, lsl #16 ; r11 | r10
- smlad r11, r10, r5, r11 ; vp9_filter[4], vp9_filter[5]
- smlad r12, r6, r5, r12
-
- sub r2, r2, #1
-
- add r11, r11, #0x40 ; round_shift_and_clamp
- tst r2, #0xff ; test loop counter
- usat r11, #8, r11, asr #7
- add r12, r12, #0x40
- strh r11, [lr], #20 ; result is transposed and stored
- usat r12, #8, r12, asr #7
-
- strh r12, [lr], #20
-
- movne r11, r6
- movne r12, r7
-
- movne r6, r8
- movne r7, r9
- movne r8, r10
- movne r9, r11
- movne r10, r12
-
- bne first_pass_wloop_v6
-
- ;;add r9, ppl, #30 ; attempt to load 2 adjacent cache lines
- ;;IF ARCHITECTURE=6
- ;pld [src, ppl]
- ;;pld [src, r9]
- ;;ENDIF
-
- subs r2, r2, #0x10000
-
- sub lr, lr, #158
-
- add r0, r0, r1 ; move to next input line
-
- add r11, r1, #18 ; preload next row; add back the block width (=8), which was subtracted earlier
- pld [r0, r11]
-
- bne first_pass_hloop_v6
-
-;second pass filter
-secondpass_filter
- ldr r3, [sp], #4 ; load back yoffset
- ldr r0, [sp, #216] ; load dst address from stack 180+36
- ldr r1, [sp, #220] ; load dst stride from stack 180+40
-
- cmp r3, #0
- beq skip_secondpass_filter
-
- adr r12, filter8_coeff
- add lr, r12, r3, lsl #4 ;calculate filter location
-
- mov r2, #0x00080000
-
- ldr r3, [lr] ; load up packed filter coefficients
- ldr r4, [lr, #4]
- ldr r5, [lr, #8]
-
- pkhbt r12, r4, r3 ; pack the filter differently
- pkhbt r11, r5, r4
-
-second_pass_hloop_v6
- ldr r6, [sp] ; load the data
- ldr r7, [sp, #4]
-
- orr r2, r2, #2 ; loop counter
-
-second_pass_wloop_v6
- smuad lr, r3, r6 ; apply filter
- smulbt r10, r3, r6
-
- ldr r8, [sp, #8]
-
- smlad lr, r4, r7, lr
- smladx r10, r12, r7, r10
-
- ldrh r9, [sp, #12]
-
- smlad lr, r5, r8, lr
- smladx r10, r11, r8, r10
-
- add sp, sp, #4
- smlatb r10, r5, r9, r10
-
- sub r2, r2, #1
-
- add lr, lr, #0x40 ; round_shift_and_clamp
- tst r2, #0xff
- usat lr, #8, lr, asr #7
- add r10, r10, #0x40
- strb lr, [r0], r1 ; the result is transposed back and stored
- usat r10, #8, r10, asr #7
-
- strb r10, [r0],r1
-
- movne r6, r7
- movne r7, r8
-
- bne second_pass_wloop_v6
-
- subs r2, r2, #0x10000
- add sp, sp, #12 ; update src for next loop (20-8)
- sub r0, r0, r1, lsl #2
- add r0, r0, #1
-
- bne second_pass_hloop_v6
-
- add sp, sp, #20
- ldmia sp!, {r4 - r11, pc}
-
-;--------------------
-skip_firstpass_filter
- sub r0, r0, r1, lsl #1
- sub r1, r1, #8
- mov r2, #9
-
-skip_firstpass_hloop
- ldrb r4, [r0], #1 ; load data
- subs r2, r2, #1
- ldrb r5, [r0], #1
- strh r4, [lr], #20 ; store it to the intermediate buffer
- ldrb r6, [r0], #1 ; load data
- strh r5, [lr], #20
- ldrb r7, [r0], #1
- strh r6, [lr], #20
- ldrb r8, [r0], #1
- strh r7, [lr], #20
- ldrb r9, [r0], #1
- strh r8, [lr], #20
- ldrb r10, [r0], #1
- strh r9, [lr], #20
- ldrb r11, [r0], #1
- strh r10, [lr], #20
- add r0, r0, r1 ; move to next input line
- strh r11, [lr], #20
-
- sub lr, lr, #158 ; move over to next column
- bne skip_firstpass_hloop
-
- b secondpass_filter
-
-;--------------------
-skip_secondpass_filter
- mov r2, #8
- add sp, sp, #4 ;start from src[0] instead of src[-2]
-
-skip_secondpass_hloop
- ldr r6, [sp], #4
- subs r2, r2, #1
- ldr r8, [sp], #4
-
- mov r7, r6, lsr #16 ; unpack
- strb r6, [r0], r1
- mov r9, r8, lsr #16
- strb r7, [r0], r1
- add sp, sp, #12 ; 20-8
- strb r8, [r0], r1
- strb r9, [r0], r1
-
- sub r0, r0, r1, lsl #2
- add r0, r0, #1
-
- bne skip_secondpass_hloop
-
- add sp, sp, #16 ; 180 - (160 +4)
-
- ldmia sp!, {r4 - r11, pc}
-
- ENDP
-
-;-----------------
-;One word each is reserved. Label filter8_coeff can be used to access the data.
-;Data address: filter8_coeff, filter8_coeff+4, filter8_coeff+8 ...
-filter8_coeff
- DCD 0x00000000, 0x00000080, 0x00000000, 0x00000000
- DCD 0xfffa0000, 0x000c007b, 0x0000ffff, 0x00000000
- DCD 0xfff50002, 0x0024006c, 0x0001fff8, 0x00000000
- DCD 0xfff70000, 0x0032005d, 0x0000fffa, 0x00000000
- DCD 0xfff00003, 0x004d004d, 0x0003fff0, 0x00000000
- DCD 0xfffa0000, 0x005d0032, 0x0000fff7, 0x00000000
- DCD 0xfff80001, 0x006c0024, 0x0002fff5, 0x00000000
- DCD 0xffff0000, 0x007b000c, 0x0000fffa, 0x00000000
-
- ;DCD 0, 0, 128, 0, 0, 0
- ;DCD 0, -6, 123, 12, -1, 0
- ;DCD 2, -11, 108, 36, -8, 1
- ;DCD 0, -9, 93, 50, -6, 0
- ;DCD 3, -16, 77, 77, -16, 3
- ;DCD 0, -6, 50, 93, -9, 0
- ;DCD 1, -8, 36, 108, -11, 2
- ;DCD 0, -1, 12, 123, -6, 0
-
- END
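
The 6-tap predictor above accumulates products of 128-scaled coefficients and then rounds back to 8 bits with an add of 0x40 followed by usat #8, asr #7. A minimal C sketch of that round_shift_and_clamp step (assumed helper name, for illustration only):

    /* (sum + 64) >> 7, saturated to [0, 255] -- the "add #0x40" plus
     * "usat rX, #8, rX, asr #7" sequence above. sum is the raw 6-tap
     * accumulator value. */
    static unsigned char round_shift_and_clamp(int sum) {
      sum = (sum + 0x40) >> 7;
      if (sum < 0) sum = 0;
      if (sum > 255) sum = 255;
      return (unsigned char)sum;
    }
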
--- a/vp8/common/arm/bilinearfilter_arm.c
+++ /dev/null
@@ -1,108 +1,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include <math.h>
-#include "vp8/common/filter.h"
-#include "vp8/common/subpixel.h"
-#include "bilinearfilter_arm.h"
-
-void vp9_filter_block2d_bil_armv6
-(
- unsigned char *src_ptr,
- unsigned char *dst_ptr,
- unsigned int src_pitch,
- unsigned int dst_pitch,
- const short *HFilter,
- const short *VFilter,
- int Width,
- int Height
-) {
- unsigned short FData[36 * 16]; /* Temp data buffer used in filtering */
-
- /* First filter 1-D horizontally... */
- vp9_filter_block2d_bil_first_pass_armv6(src_ptr, FData, src_pitch, Height + 1, Width, HFilter);
-
- /* then 1-D vertically... */
- vp9_filter_block2d_bil_second_pass_armv6(FData, dst_ptr, dst_pitch, Height, Width, VFilter);
-}
-
-
-void vp9_bilinear_predict4x4_armv6
-(
- unsigned char *src_ptr,
- int src_pixels_per_line,
- int xoffset,
- int yoffset,
- unsigned char *dst_ptr,
- int dst_pitch
-) {
- const short *HFilter;
- const short *VFilter;
-
- HFilter = vp8_bilinear_filters[xoffset];
- VFilter = vp8_bilinear_filters[yoffset];
-
- vp9_filter_block2d_bil_armv6(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, VFilter, 4, 4);
-}
-
-void vp9_bilinear_predict8x8_armv6
-(
- unsigned char *src_ptr,
- int src_pixels_per_line,
- int xoffset,
- int yoffset,
- unsigned char *dst_ptr,
- int dst_pitch
-) {
- const short *HFilter;
- const short *VFilter;
-
- HFilter = vp8_bilinear_filters[xoffset];
- VFilter = vp8_bilinear_filters[yoffset];
-
- vp9_filter_block2d_bil_armv6(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, VFilter, 8, 8);
-}
-
-void vp9_bilinear_predict8x4_armv6
-(
- unsigned char *src_ptr,
- int src_pixels_per_line,
- int xoffset,
- int yoffset,
- unsigned char *dst_ptr,
- int dst_pitch
-) {
- const short *HFilter;
- const short *VFilter;
-
- HFilter = vp8_bilinear_filters[xoffset];
- VFilter = vp8_bilinear_filters[yoffset];
-
- vp9_filter_block2d_bil_armv6(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, VFilter, 8, 4);
-}
-
-void vp9_bilinear_predict16x16_armv6
-(
- unsigned char *src_ptr,
- int src_pixels_per_line,
- int xoffset,
- int yoffset,
- unsigned char *dst_ptr,
- int dst_pitch
-) {
- const short *HFilter;
- const short *VFilter;
-
- HFilter = vp8_bilinear_filters[xoffset];
- VFilter = vp8_bilinear_filters[yoffset];
-
- vp9_filter_block2d_bil_armv6(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, VFilter, 16, 16);
-}
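
The two-pass wrapper above filters Height + 1 rows horizontally because the vertical pass needs one extra row of intermediate data, and each pass is a two-tap blend whose coefficients sum to 128 (see the bifilter*_coeff tables in the NEON predictors removed further below). A scalar sketch of a single tap (hypothetical helper, illustration only):

    /* One bilinear tap: filter[0] + filter[1] == 128, so the 8-bit result
     * is recovered with (x + 64) >> 7 (the vqrshrn.u16 #7 in the NEON code). */
    static unsigned char bilinear_tap(unsigned char a, unsigned char b,
                                      const short *filter) {
      return (unsigned char)((a * filter[0] + b * filter[1] + 64) >> 7);
    }
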
--- a/vp8/common/arm/bilinearfilter_arm.h
+++ /dev/null
@@ -1,35 +1,0 @@
-/*
- * Copyright (c) 2011 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#ifndef BILINEARFILTER_ARM_H
-#define BILINEARFILTER_ARM_H
-
-extern void vp9_filter_block2d_bil_first_pass_armv6
-(
- const unsigned char *src_ptr,
- unsigned short *dst_ptr,
- unsigned int src_pitch,
- unsigned int height,
- unsigned int width,
- const short *vp9_filter
-);
-
-extern void vp9_filter_block2d_bil_second_pass_armv6
-(
- const unsigned short *src_ptr,
- unsigned char *dst_ptr,
- int dst_pitch,
- unsigned int height,
- unsigned int width,
- const short *vp9_filter
-);
-
-#endif /* BILINEARFILTER_ARM_H */
--- a/vp8/common/arm/filter_arm.c
+++ /dev/null
@@ -1,198 +1,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include "vpx_ports/config.h"
-#include <math.h>
-#include "vp8/common/filter.h"
-#include "vp8/common/subpixel.h"
-#include "vpx_ports/mem.h"
-
-extern void vp9_filter_block2d_first_pass_armv6
-(
- unsigned char *src_ptr,
- short *output_ptr,
- unsigned int src_pixels_per_line,
- unsigned int output_width,
- unsigned int output_height,
- const short *vp9_filter
-);
-
-// 8x8
-extern void vp9_filter_block2d_first_pass_8x8_armv6
-(
- unsigned char *src_ptr,
- short *output_ptr,
- unsigned int src_pixels_per_line,
- unsigned int output_width,
- unsigned int output_height,
- const short *vp9_filter
-);
-
-// 16x16
-extern void vp9_filter_block2d_first_pass_16x16_armv6
-(
- unsigned char *src_ptr,
- short *output_ptr,
- unsigned int src_pixels_per_line,
- unsigned int output_width,
- unsigned int output_height,
- const short *vp9_filter
-);
-
-extern void vp9_filter_block2d_second_pass_armv6
-(
- short *src_ptr,
- unsigned char *output_ptr,
- unsigned int output_pitch,
- unsigned int cnt,
- const short *vp9_filter
-);
-
-extern void vp9_filter4_block2d_second_pass_armv6
-(
- short *src_ptr,
- unsigned char *output_ptr,
- unsigned int output_pitch,
- unsigned int cnt,
- const short *vp9_filter
-);
-
-extern void vp9_filter_block2d_first_pass_only_armv6
-(
- unsigned char *src_ptr,
- unsigned char *output_ptr,
- unsigned int src_pixels_per_line,
- unsigned int cnt,
- unsigned int output_pitch,
- const short *vp9_filter
-);
-
-
-extern void vp9_filter_block2d_second_pass_only_armv6
-(
- unsigned char *src_ptr,
- unsigned char *output_ptr,
- unsigned int src_pixels_per_line,
- unsigned int cnt,
- unsigned int output_pitch,
- const short *vp9_filter
-);
-
-#if HAVE_ARMV6
-void vp9_sixtap_predict_armv6
-(
- unsigned char *src_ptr,
- int src_pixels_per_line,
- int xoffset,
- int yoffset,
- unsigned char *dst_ptr,
- int dst_pitch
-) {
- const short *HFilter;
- const short *VFilter;
- DECLARE_ALIGNED_ARRAY(4, short, FData, 12 * 4); /* Temp data buffer used in filtering */
-
-
- HFilter = vp8_sub_pel_filters[xoffset]; /* 6 tap */
- VFilter = vp8_sub_pel_filters[yoffset]; /* 6 tap */
-
- /* Vfilter is null. First pass only */
- if (xoffset && !yoffset) {
- /*vp9_filter_block2d_first_pass_armv6 ( src_ptr, FData+2, src_pixels_per_line, 4, 4, HFilter );
- vp9_filter_block2d_second_pass_armv6 ( FData+2, dst_ptr, dst_pitch, 4, VFilter );*/
-
- vp9_filter_block2d_first_pass_only_armv6(src_ptr, dst_ptr, src_pixels_per_line, 4, dst_pitch, HFilter);
- }
- /* Hfilter is null. Second pass only */
- else if (!xoffset && yoffset) {
- vp9_filter_block2d_second_pass_only_armv6(src_ptr, dst_ptr, src_pixels_per_line, 4, dst_pitch, VFilter);
- } else {
- /* Vfilter is a 4 tap filter */
- if (yoffset & 0x1) {
- vp9_filter_block2d_first_pass_armv6(src_ptr - src_pixels_per_line, FData + 1, src_pixels_per_line, 4, 7, HFilter);
- vp9_filter4_block2d_second_pass_armv6(FData + 2, dst_ptr, dst_pitch, 4, VFilter);
- }
- /* Vfilter is 6 tap filter */
- else {
- vp9_filter_block2d_first_pass_armv6(src_ptr - (2 * src_pixels_per_line), FData, src_pixels_per_line, 4, 9, HFilter);
- vp9_filter_block2d_second_pass_armv6(FData + 2, dst_ptr, dst_pitch, 4, VFilter);
- }
- }
-}
-
-void vp9_sixtap_predict8x8_armv6
-(
- unsigned char *src_ptr,
- int src_pixels_per_line,
- int xoffset,
- int yoffset,
- unsigned char *dst_ptr,
- int dst_pitch
-) {
- const short *HFilter;
- const short *VFilter;
- DECLARE_ALIGNED_ARRAY(4, short, FData, 16 * 8); /* Temp data buffer used in filtering */
-
- HFilter = vp8_sub_pel_filters[xoffset]; /* 6 tap */
- VFilter = vp8_sub_pel_filters[yoffset]; /* 6 tap */
-
- if (xoffset && !yoffset) {
- vp9_filter_block2d_first_pass_only_armv6(src_ptr, dst_ptr, src_pixels_per_line, 8, dst_pitch, HFilter);
- }
- /* Hfilter is null. Second pass only */
- else if (!xoffset && yoffset) {
- vp9_filter_block2d_second_pass_only_armv6(src_ptr, dst_ptr, src_pixels_per_line, 8, dst_pitch, VFilter);
- } else {
- if (yoffset & 0x1) {
- vp9_filter_block2d_first_pass_8x8_armv6(src_ptr - src_pixels_per_line, FData + 1, src_pixels_per_line, 8, 11, HFilter);
- vp9_filter4_block2d_second_pass_armv6(FData + 2, dst_ptr, dst_pitch, 8, VFilter);
- } else {
- vp9_filter_block2d_first_pass_8x8_armv6(src_ptr - (2 * src_pixels_per_line), FData, src_pixels_per_line, 8, 13, HFilter);
- vp9_filter_block2d_second_pass_armv6(FData + 2, dst_ptr, dst_pitch, 8, VFilter);
- }
- }
-}
-
-
-void vp9_sixtap_predict16x16_armv6
-(
- unsigned char *src_ptr,
- int src_pixels_per_line,
- int xoffset,
- int yoffset,
- unsigned char *dst_ptr,
- int dst_pitch
-) {
- const short *HFilter;
- const short *VFilter;
- DECLARE_ALIGNED_ARRAY(4, short, FData, 24 * 16); /* Temp data buffer used in filtering */
-
- HFilter = vp8_sub_pel_filters[xoffset]; /* 6 tap */
- VFilter = vp8_sub_pel_filters[yoffset]; /* 6 tap */
-
- if (xoffset && !yoffset) {
- vp9_filter_block2d_first_pass_only_armv6(src_ptr, dst_ptr, src_pixels_per_line, 16, dst_pitch, HFilter);
- }
- /* Hfilter is null. Second pass only */
- else if (!xoffset && yoffset) {
- vp9_filter_block2d_second_pass_only_armv6(src_ptr, dst_ptr, src_pixels_per_line, 16, dst_pitch, VFilter);
- } else {
- if (yoffset & 0x1) {
- vp9_filter_block2d_first_pass_16x16_armv6(src_ptr - src_pixels_per_line, FData + 1, src_pixels_per_line, 16, 19, HFilter);
- vp9_filter4_block2d_second_pass_armv6(FData + 2, dst_ptr, dst_pitch, 16, VFilter);
- } else {
- vp9_filter_block2d_first_pass_16x16_armv6(src_ptr - (2 * src_pixels_per_line), FData, src_pixels_per_line, 16, 21, HFilter);
- vp9_filter_block2d_second_pass_armv6(FData + 2, dst_ptr, dst_pitch, 16, VFilter);
- }
- }
-
-}
-#endif
--- a/vp8/common/arm/idct_arm.h
+++ /dev/null
@@ -1,65 +1,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#ifndef IDCT_ARM_H
-#define IDCT_ARM_H
-
-#if HAVE_ARMV6
-extern prototype_idct(vp9_short_idct4x4llm_1_v6);
-extern prototype_idct(vp9_short_idct4x4llm_v6_dual);
-extern prototype_idct_scalar_add(vp9_dc_only_idct_add_v6);
-extern prototype_second_order(vp9_short_inv_walsh4x4_1_v6);
-extern prototype_second_order(vp9_short_inv_walsh4x4_v6);
-
-#if !CONFIG_RUNTIME_CPU_DETECT
-#undef vp9_idct_idct1
-#define vp9_idct_idct1 vp9_short_idct4x4llm_1_v6
-
-#undef vp9_idct_idct16
-#define vp9_idct_idct16 vp9_short_idct4x4llm_v6_dual
-
-#undef vp9_idct_idct1_scalar_add
-#define vp9_idct_idct1_scalar_add vp9_dc_only_idct_add_v6
-
-#undef vp8_idct_iwalsh1
-#define vp8_idct_iwalsh1 vp9_short_inv_walsh4x4_1_v6
-
-#undef vp8_idct_iwalsh16
-#define vp8_idct_iwalsh16 vp9_short_inv_walsh4x4_v6
-#endif
-#endif
-
-#if HAVE_ARMV7
-extern prototype_idct(vp9_short_idct4x4llm_1_neon);
-extern prototype_idct(vp9_short_idct4x4llm_neon);
-extern prototype_idct_scalar_add(vp9_dc_only_idct_add_neon);
-extern prototype_second_order(vp9_short_inv_walsh4x4_1_neon);
-extern prototype_second_order(vp9_short_inv_walsh4x4_neon);
-
-#if !CONFIG_RUNTIME_CPU_DETECT
-#undef vp9_idct_idct1
-#define vp9_idct_idct1 vp9_short_idct4x4llm_1_neon
-
-#undef vp9_idct_idct16
-#define vp9_idct_idct16 vp9_short_idct4x4llm_neon
-
-#undef vp9_idct_idct1_scalar_add
-#define vp9_idct_idct1_scalar_add vp9_dc_only_idct_add_neon
-
-#undef vp8_idct_iwalsh1
-#define vp8_idct_iwalsh1 vp9_short_inv_walsh4x4_1_neon
-
-#undef vp8_idct_iwalsh16
-#define vp8_idct_iwalsh16 vp9_short_inv_walsh4x4_neon
-#endif
-#endif
-
-#endif
--- a/vp8/common/arm/loopfilter_arm.c
+++ /dev/null
@@ -1,166 +1,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include "vpx_config.h"
-#include "vp8/common/loopfilter.h"
-#include "vp8/common/onyxc_int.h"
-
-#if HAVE_ARMV6
-extern prototype_loopfilter(vp9_loop_filter_horizontal_edge_armv6);
-extern prototype_loopfilter(vp9_loop_filter_vertical_edge_armv6);
-extern prototype_loopfilter(vp9_mbloop_filter_horizontal_edge_armv6);
-extern prototype_loopfilter(vp9_mbloop_filter_vertical_edge_armv6);
-#endif
-
-#if HAVE_ARMV7
-typedef void loopfilter_y_neon(unsigned char *src, int pitch,
- unsigned char blimit, unsigned char limit, unsigned char thresh);
-typedef void loopfilter_uv_neon(unsigned char *u, int pitch,
- unsigned char blimit, unsigned char limit, unsigned char thresh,
- unsigned char *v);
-
-extern loopfilter_y_neon vp9_loop_filter_horizontal_edge_y_neon;
-extern loopfilter_y_neon vp9_loop_filter_vertical_edge_y_neon;
-extern loopfilter_y_neon vp9_mbloop_filter_horizontal_edge_y_neon;
-extern loopfilter_y_neon vp9_mbloop_filter_vertical_edge_y_neon;
-
-extern loopfilter_uv_neon vp9_loop_filter_horizontal_edge_uv_neon;
-extern loopfilter_uv_neon vp9_loop_filter_vertical_edge_uv_neon;
-extern loopfilter_uv_neon vp9_mbloop_filter_horizontal_edge_uv_neon;
-extern loopfilter_uv_neon vp9_mbloop_filter_vertical_edge_uv_neon;
-#endif
-
-#if HAVE_ARMV6
-/*ARMV6 loopfilter functions*/
-/* Horizontal MB filtering */
-void vp9_loop_filter_mbh_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
- int y_stride, int uv_stride, loop_filter_info *lfi) {
- vp9_mbloop_filter_horizontal_edge_armv6(y_ptr, y_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 2);
-
- if (u_ptr)
- vp9_mbloop_filter_horizontal_edge_armv6(u_ptr, uv_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 1);
-
- if (v_ptr)
- vp9_mbloop_filter_horizontal_edge_armv6(v_ptr, uv_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 1);
-}
-
-/* Vertical MB Filtering */
-void vp9_loop_filter_mbv_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
- int y_stride, int uv_stride, loop_filter_info *lfi) {
- vp9_mbloop_filter_vertical_edge_armv6(y_ptr, y_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 2);
-
- if (u_ptr)
- vp9_mbloop_filter_vertical_edge_armv6(u_ptr, uv_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 1);
-
- if (v_ptr)
- vp9_mbloop_filter_vertical_edge_armv6(v_ptr, uv_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 1);
-}
-
-/* Horizontal B Filtering */
-void vp9_loop_filter_bh_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
- int y_stride, int uv_stride, loop_filter_info *lfi) {
- vp9_loop_filter_horizontal_edge_armv6(y_ptr + 4 * y_stride, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
- vp9_loop_filter_horizontal_edge_armv6(y_ptr + 8 * y_stride, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
- vp9_loop_filter_horizontal_edge_armv6(y_ptr + 12 * y_stride, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
-
- if (u_ptr)
- vp9_loop_filter_horizontal_edge_armv6(u_ptr + 4 * uv_stride, uv_stride, lfi->blim, lfi->lim, lfi->hev_thr, 1);
-
- if (v_ptr)
- vp9_loop_filter_horizontal_edge_armv6(v_ptr + 4 * uv_stride, uv_stride, lfi->blim, lfi->lim, lfi->hev_thr, 1);
-}
-
-void vp9_loop_filter_bhs_armv6(unsigned char *y_ptr, int y_stride,
- const unsigned char *blimit) {
- vp9_loop_filter_simple_horizontal_edge_armv6(y_ptr + 4 * y_stride, y_stride, blimit);
- vp9_loop_filter_simple_horizontal_edge_armv6(y_ptr + 8 * y_stride, y_stride, blimit);
- vp9_loop_filter_simple_horizontal_edge_armv6(y_ptr + 12 * y_stride, y_stride, blimit);
-}
-
-/* Vertical B Filtering */
-void vp9_loop_filter_bv_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
- int y_stride, int uv_stride, loop_filter_info *lfi) {
- vp9_loop_filter_vertical_edge_armv6(y_ptr + 4, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
- vp9_loop_filter_vertical_edge_armv6(y_ptr + 8, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
- vp9_loop_filter_vertical_edge_armv6(y_ptr + 12, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
-
- if (u_ptr)
- vp9_loop_filter_vertical_edge_armv6(u_ptr + 4, uv_stride, lfi->blim, lfi->lim, lfi->hev_thr, 1);
-
- if (v_ptr)
- vp9_loop_filter_vertical_edge_armv6(v_ptr + 4, uv_stride, lfi->blim, lfi->lim, lfi->hev_thr, 1);
-}
-
-void vp9_loop_filter_bvs_armv6(unsigned char *y_ptr, int y_stride,
- const unsigned char *blimit) {
- vp9_loop_filter_simple_vertical_edge_armv6(y_ptr + 4, y_stride, blimit);
- vp9_loop_filter_simple_vertical_edge_armv6(y_ptr + 8, y_stride, blimit);
- vp9_loop_filter_simple_vertical_edge_armv6(y_ptr + 12, y_stride, blimit);
-}
-#endif
-
-#if HAVE_ARMV7
-/* NEON loopfilter functions */
-/* Horizontal MB filtering */
-void vp9_loop_filter_mbh_neon(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
- int y_stride, int uv_stride, loop_filter_info *lfi) {
- unsigned char mblim = *lfi->mblim;
- unsigned char lim = *lfi->lim;
- unsigned char hev_thr = *lfi->hev_thr;
- vp9_mbloop_filter_horizontal_edge_y_neon(y_ptr, y_stride, mblim, lim, hev_thr);
-
- if (u_ptr)
- vp9_mbloop_filter_horizontal_edge_uv_neon(u_ptr, uv_stride, mblim, lim, hev_thr, v_ptr);
-}
-
-/* Vertical MB Filtering */
-void vp9_loop_filter_mbv_neon(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
- int y_stride, int uv_stride, loop_filter_info *lfi) {
- unsigned char mblim = *lfi->mblim;
- unsigned char lim = *lfi->lim;
- unsigned char hev_thr = *lfi->hev_thr;
-
- vp9_mbloop_filter_vertical_edge_y_neon(y_ptr, y_stride, mblim, lim, hev_thr);
-
- if (u_ptr)
- vp9_mbloop_filter_vertical_edge_uv_neon(u_ptr, uv_stride, mblim, lim, hev_thr, v_ptr);
-}
-
-/* Horizontal B Filtering */
-void vp9_loop_filter_bh_neon(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
- int y_stride, int uv_stride, loop_filter_info *lfi) {
- unsigned char blim = *lfi->blim;
- unsigned char lim = *lfi->lim;
- unsigned char hev_thr = *lfi->hev_thr;
-
- vp9_loop_filter_horizontal_edge_y_neon(y_ptr + 4 * y_stride, y_stride, blim, lim, hev_thr);
- vp9_loop_filter_horizontal_edge_y_neon(y_ptr + 8 * y_stride, y_stride, blim, lim, hev_thr);
- vp9_loop_filter_horizontal_edge_y_neon(y_ptr + 12 * y_stride, y_stride, blim, lim, hev_thr);
-
- if (u_ptr)
- vp9_loop_filter_horizontal_edge_uv_neon(u_ptr + 4 * uv_stride, uv_stride, blim, lim, hev_thr, v_ptr + 4 * uv_stride);
-}
-
-/* Vertical B Filtering */
-void vp9_loop_filter_bv_neon(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
- int y_stride, int uv_stride, loop_filter_info *lfi) {
- unsigned char blim = *lfi->blim;
- unsigned char lim = *lfi->lim;
- unsigned char hev_thr = *lfi->hev_thr;
-
- vp9_loop_filter_vertical_edge_y_neon(y_ptr + 4, y_stride, blim, lim, hev_thr);
- vp9_loop_filter_vertical_edge_y_neon(y_ptr + 8, y_stride, blim, lim, hev_thr);
- vp9_loop_filter_vertical_edge_y_neon(y_ptr + 12, y_stride, blim, lim, hev_thr);
-
- if (u_ptr)
- vp9_loop_filter_vertical_edge_uv_neon(u_ptr + 4, uv_stride, blim, lim, hev_thr, v_ptr + 4);
-}
-#endif
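
The wrappers above all follow the same pattern: the macroblock-edge variants filter the outer edge once, while the block-edge variants filter the three inner luma edges at offsets 4, 8 and 12 and the single inner chroma edge at offset 4. A hedged C sketch of that dispatch pattern (edge_filter is a stand-in for the armv6/NEON edge functions, not a real libvpx callback):

    typedef void (*edge_filter_fn)(unsigned char *ptr, int stride);

    /* Inner-edge pattern shared by the bh/bv wrappers above: luma edges at
     * rows/columns 4, 8, 12; chroma edge at 4. 'horizontal' selects row
     * offsets (times stride) versus column offsets. */
    static void filter_inner_edges(unsigned char *y, int y_stride,
                                   unsigned char *u, unsigned char *v,
                                   int uv_stride, int horizontal,
                                   edge_filter_fn edge_filter) {
      int i;
      for (i = 4; i <= 12; i += 4)
        edge_filter(horizontal ? y + i * y_stride : y + i, y_stride);
      if (u) edge_filter(horizontal ? u + 4 * uv_stride : u + 4, uv_stride);
      if (v) edge_filter(horizontal ? v + 4 * uv_stride : v + 4, uv_stride);
    }
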
--- a/vp8/common/arm/loopfilter_arm.h
+++ /dev/null
@@ -1,41 +1,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#ifndef LOOPFILTER_ARM_H
-#define LOOPFILTER_ARM_H
-
-#include "vpx_config.h"
-
-#if HAVE_ARMV6
-extern prototype_loopfilter_block(vp9_loop_filter_mbv_armv6);
-extern prototype_loopfilter_block(vp9_loop_filter_bv_armv6);
-extern prototype_loopfilter_block(vp9_loop_filter_mbh_armv6);
-extern prototype_loopfilter_block(vp9_loop_filter_bh_armv6);
-extern prototype_simple_loopfilter(vp9_loop_filter_bvs_armv6);
-extern prototype_simple_loopfilter(vp9_loop_filter_bhs_armv6);
-extern prototype_simple_loopfilter(vp9_loop_filter_simple_horizontal_edge_armv6);
-extern prototype_simple_loopfilter(vp9_loop_filter_simple_vertical_edge_armv6);
-
-#endif /* HAVE_ARMV6 */
-
-#if HAVE_ARMV7
-extern prototype_loopfilter_block(vp9_loop_filter_mbv_neon);
-extern prototype_loopfilter_block(vp9_loop_filter_bv_neon);
-extern prototype_loopfilter_block(vp9_loop_filter_mbh_neon);
-extern prototype_loopfilter_block(vp9_loop_filter_bh_neon);
-extern prototype_simple_loopfilter(vp9_loop_filter_mbvs_neon);
-extern prototype_simple_loopfilter(vp9_loop_filter_bvs_neon);
-extern prototype_simple_loopfilter(vp9_loop_filter_mbhs_neon);
-extern prototype_simple_loopfilter(vp9_loop_filter_bhs_neon);
-
-#endif /* HAVE_ARMV7 */
-
-#endif /* LOOPFILTER_ARM_H */
--- a/vp8/common/arm/neon/bilinearpredict16x16_neon.asm
+++ /dev/null
@@ -1,357 +1,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
- EXPORT |vp8_bilinear_predict16x16_neon|
- ARM
- REQUIRE8
- PRESERVE8
-
- AREA ||.text||, CODE, READONLY, ALIGN=2
-; r0 unsigned char *src_ptr,
-; r1 int src_pixels_per_line,
-; r2 int xoffset,
-; r3 int yoffset,
-; r4 unsigned char *dst_ptr,
-; stack(r5) int dst_pitch
-
-|vp8_bilinear_predict16x16_neon| PROC
- push {r4-r5, lr}
-
- adr r12, bifilter16_coeff
- ldr r4, [sp, #12] ;load parameters from stack
- ldr r5, [sp, #16] ;load parameters from stack
-
- cmp r2, #0 ;skip first_pass filter if xoffset=0
- beq secondpass_bfilter16x16_only
-
- add r2, r12, r2, lsl #3 ;calculate filter location
-
- cmp r3, #0 ;skip second_pass filter if yoffset=0
-
- vld1.s32 {d31}, [r2] ;load first_pass filter
-
- beq firstpass_bfilter16x16_only
-
- sub sp, sp, #272 ;reserve space on stack for temporary storage
- vld1.u8 {d2, d3, d4}, [r0], r1 ;load src data
- mov lr, sp
- vld1.u8 {d5, d6, d7}, [r0], r1
-
- mov r2, #3 ;loop counter
- vld1.u8 {d8, d9, d10}, [r0], r1
-
- vdup.8 d0, d31[0] ;first_pass filter (d0 d1)
- vld1.u8 {d11, d12, d13}, [r0], r1
-
- vdup.8 d1, d31[4]
-
-;First Pass: output_height lines x output_width columns (17x16)
-filt_blk2d_fp16x16_loop_neon
- pld [r0]
- pld [r0, r1]
- pld [r0, r1, lsl #1]
-
- vmull.u8 q7, d2, d0 ;(src_ptr[0] * vp9_filter[0])
- vmull.u8 q8, d3, d0
- vmull.u8 q9, d5, d0
- vmull.u8 q10, d6, d0
- vmull.u8 q11, d8, d0
- vmull.u8 q12, d9, d0
- vmull.u8 q13, d11, d0
- vmull.u8 q14, d12, d0
-
- vext.8 d2, d2, d3, #1 ;construct src_ptr[1]
- vext.8 d5, d5, d6, #1
- vext.8 d8, d8, d9, #1
- vext.8 d11, d11, d12, #1
-
- vmlal.u8 q7, d2, d1 ;(src_ptr[0] * vp9_filter[1])
- vmlal.u8 q9, d5, d1
- vmlal.u8 q11, d8, d1
- vmlal.u8 q13, d11, d1
-
- vext.8 d3, d3, d4, #1
- vext.8 d6, d6, d7, #1
- vext.8 d9, d9, d10, #1
- vext.8 d12, d12, d13, #1
-
- vmlal.u8 q8, d3, d1 ;(src_ptr[0] * vp9_filter[1])
- vmlal.u8 q10, d6, d1
- vmlal.u8 q12, d9, d1
- vmlal.u8 q14, d12, d1
-
- subs r2, r2, #1
-
- vqrshrn.u16 d14, q7, #7 ;shift/round/saturate to u8
- vqrshrn.u16 d15, q8, #7
- vqrshrn.u16 d16, q9, #7
- vqrshrn.u16 d17, q10, #7
- vqrshrn.u16 d18, q11, #7
- vqrshrn.u16 d19, q12, #7
- vqrshrn.u16 d20, q13, #7
-
- vld1.u8 {d2, d3, d4}, [r0], r1 ;load src data
- vqrshrn.u16 d21, q14, #7
- vld1.u8 {d5, d6, d7}, [r0], r1
-
- vst1.u8 {d14, d15, d16, d17}, [lr]! ;store result
- vld1.u8 {d8, d9, d10}, [r0], r1
- vst1.u8 {d18, d19, d20, d21}, [lr]!
- vld1.u8 {d11, d12, d13}, [r0], r1
-
- bne filt_blk2d_fp16x16_loop_neon
-
-;First-pass filtering for the remaining 5 lines
- vld1.u8 {d14, d15, d16}, [r0], r1
-
- vmull.u8 q9, d2, d0 ;(src_ptr[0] * vp9_filter[0])
- vmull.u8 q10, d3, d0
- vmull.u8 q11, d5, d0
- vmull.u8 q12, d6, d0
- vmull.u8 q13, d8, d0
- vmull.u8 q14, d9, d0
-
- vext.8 d2, d2, d3, #1 ;construct src_ptr[1]
- vext.8 d5, d5, d6, #1
- vext.8 d8, d8, d9, #1
-
- vmlal.u8 q9, d2, d1 ;(src_ptr[0] * vp9_filter[1])
- vmlal.u8 q11, d5, d1
- vmlal.u8 q13, d8, d1
-
- vext.8 d3, d3, d4, #1
- vext.8 d6, d6, d7, #1
- vext.8 d9, d9, d10, #1
-
- vmlal.u8 q10, d3, d1 ;(src_ptr[0] * vp9_filter[1])
- vmlal.u8 q12, d6, d1
- vmlal.u8 q14, d9, d1
-
- vmull.u8 q1, d11, d0
- vmull.u8 q2, d12, d0
- vmull.u8 q3, d14, d0
- vmull.u8 q4, d15, d0
-
- vext.8 d11, d11, d12, #1 ;construct src_ptr[1]
- vext.8 d14, d14, d15, #1
-
- vmlal.u8 q1, d11, d1 ;(src_ptr[0] * vp9_filter[1])
- vmlal.u8 q3, d14, d1
-
- vext.8 d12, d12, d13, #1
- vext.8 d15, d15, d16, #1
-
- vmlal.u8 q2, d12, d1 ;(src_ptr[0] * vp9_filter[1])
- vmlal.u8 q4, d15, d1
-
- vqrshrn.u16 d10, q9, #7 ;shift/round/saturate to u8
- vqrshrn.u16 d11, q10, #7
- vqrshrn.u16 d12, q11, #7
- vqrshrn.u16 d13, q12, #7
- vqrshrn.u16 d14, q13, #7
- vqrshrn.u16 d15, q14, #7
- vqrshrn.u16 d16, q1, #7
- vqrshrn.u16 d17, q2, #7
- vqrshrn.u16 d18, q3, #7
- vqrshrn.u16 d19, q4, #7
-
- vst1.u8 {d10, d11, d12, d13}, [lr]! ;store result
- vst1.u8 {d14, d15, d16, d17}, [lr]!
- vst1.u8 {d18, d19}, [lr]!
-
-;Second pass: 16x16
-;secondpass_filter
- add r3, r12, r3, lsl #3
- sub lr, lr, #272
-
- vld1.u32 {d31}, [r3] ;load second_pass filter
-
- vld1.u8 {d22, d23}, [lr]! ;load src data
-
- vdup.8 d0, d31[0] ;second_pass filter parameters (d0 d1)
- vdup.8 d1, d31[4]
- mov r12, #4 ;loop counter
-
-filt_blk2d_sp16x16_loop_neon
- vld1.u8 {d24, d25}, [lr]!
- vmull.u8 q1, d22, d0 ;(src_ptr[0] * vp9_filter[0])
- vld1.u8 {d26, d27}, [lr]!
- vmull.u8 q2, d23, d0
- vld1.u8 {d28, d29}, [lr]!
- vmull.u8 q3, d24, d0
- vld1.u8 {d30, d31}, [lr]!
-
- vmull.u8 q4, d25, d0
- vmull.u8 q5, d26, d0
- vmull.u8 q6, d27, d0
- vmull.u8 q7, d28, d0
- vmull.u8 q8, d29, d0
-
- vmlal.u8 q1, d24, d1 ;(src_ptr[pixel_step] * vp9_filter[1])
- vmlal.u8 q2, d25, d1
- vmlal.u8 q3, d26, d1
- vmlal.u8 q4, d27, d1
- vmlal.u8 q5, d28, d1
- vmlal.u8 q6, d29, d1
- vmlal.u8 q7, d30, d1
- vmlal.u8 q8, d31, d1
-
- subs r12, r12, #1
-
- vqrshrn.u16 d2, q1, #7 ;shift/round/saturate to u8
- vqrshrn.u16 d3, q2, #7
- vqrshrn.u16 d4, q3, #7
- vqrshrn.u16 d5, q4, #7
- vqrshrn.u16 d6, q5, #7
- vqrshrn.u16 d7, q6, #7
- vqrshrn.u16 d8, q7, #7
- vqrshrn.u16 d9, q8, #7
-
- vst1.u8 {d2, d3}, [r4], r5 ;store result
- vst1.u8 {d4, d5}, [r4], r5
- vst1.u8 {d6, d7}, [r4], r5
- vmov q11, q15
- vst1.u8 {d8, d9}, [r4], r5
-
- bne filt_blk2d_sp16x16_loop_neon
-
- add sp, sp, #272
-
- pop {r4-r5,pc}
-
-;--------------------
-firstpass_bfilter16x16_only
- mov r2, #4 ;loop counter
- vdup.8 d0, d31[0] ;first_pass filter (d0 d1)
- vdup.8 d1, d31[4]
-
-;First Pass: output_height lines x output_width columns (16x16)
-filt_blk2d_fpo16x16_loop_neon
- vld1.u8 {d2, d3, d4}, [r0], r1 ;load src data
- vld1.u8 {d5, d6, d7}, [r0], r1
- vld1.u8 {d8, d9, d10}, [r0], r1
- vld1.u8 {d11, d12, d13}, [r0], r1
-
- pld [r0]
- pld [r0, r1]
- pld [r0, r1, lsl #1]
-
- vmull.u8 q7, d2, d0 ;(src_ptr[0] * vp9_filter[0])
- vmull.u8 q8, d3, d0
- vmull.u8 q9, d5, d0
- vmull.u8 q10, d6, d0
- vmull.u8 q11, d8, d0
- vmull.u8 q12, d9, d0
- vmull.u8 q13, d11, d0
- vmull.u8 q14, d12, d0
-
- vext.8 d2, d2, d3, #1 ;construct src_ptr[1]
- vext.8 d5, d5, d6, #1
- vext.8 d8, d8, d9, #1
- vext.8 d11, d11, d12, #1
-
- vmlal.u8 q7, d2, d1 ;(src_ptr[0] * vp9_filter[1])
- vmlal.u8 q9, d5, d1
- vmlal.u8 q11, d8, d1
- vmlal.u8 q13, d11, d1
-
- vext.8 d3, d3, d4, #1
- vext.8 d6, d6, d7, #1
- vext.8 d9, d9, d10, #1
- vext.8 d12, d12, d13, #1
-
- vmlal.u8 q8, d3, d1 ;(src_ptr[0] * vp9_filter[1])
- vmlal.u8 q10, d6, d1
- vmlal.u8 q12, d9, d1
- vmlal.u8 q14, d12, d1
-
- subs r2, r2, #1
-
- vqrshrn.u16 d14, q7, #7 ;shift/round/saturate to u8
- vqrshrn.u16 d15, q8, #7
- vqrshrn.u16 d16, q9, #7
- vqrshrn.u16 d17, q10, #7
- vqrshrn.u16 d18, q11, #7
- vqrshrn.u16 d19, q12, #7
- vqrshrn.u16 d20, q13, #7
- vst1.u8 {d14, d15}, [r4], r5 ;store result
- vqrshrn.u16 d21, q14, #7
-
- vst1.u8 {d16, d17}, [r4], r5
- vst1.u8 {d18, d19}, [r4], r5
- vst1.u8 {d20, d21}, [r4], r5
-
- bne filt_blk2d_fpo16x16_loop_neon
- pop {r4-r5,pc}
-
-;---------------------
-secondpass_bfilter16x16_only
-;Second pass: 16x16
-;secondpass_filter
- add r3, r12, r3, lsl #3
- mov r12, #4 ;loop counter
- vld1.u32 {d31}, [r3] ;load second_pass filter
- vld1.u8 {d22, d23}, [r0], r1 ;load src data
-
- vdup.8 d0, d31[0] ;second_pass filter parameters (d0 d1)
- vdup.8 d1, d31[4]
-
-filt_blk2d_spo16x16_loop_neon
- vld1.u8 {d24, d25}, [r0], r1
- vmull.u8 q1, d22, d0 ;(src_ptr[0] * vp9_filter[0])
- vld1.u8 {d26, d27}, [r0], r1
- vmull.u8 q2, d23, d0
- vld1.u8 {d28, d29}, [r0], r1
- vmull.u8 q3, d24, d0
- vld1.u8 {d30, d31}, [r0], r1
-
- vmull.u8 q4, d25, d0
- vmull.u8 q5, d26, d0
- vmull.u8 q6, d27, d0
- vmull.u8 q7, d28, d0
- vmull.u8 q8, d29, d0
-
- vmlal.u8 q1, d24, d1 ;(src_ptr[pixel_step] * vp9_filter[1])
- vmlal.u8 q2, d25, d1
- vmlal.u8 q3, d26, d1
- vmlal.u8 q4, d27, d1
- vmlal.u8 q5, d28, d1
- vmlal.u8 q6, d29, d1
- vmlal.u8 q7, d30, d1
- vmlal.u8 q8, d31, d1
-
- vqrshrn.u16 d2, q1, #7 ;shift/round/saturate to u8
- vqrshrn.u16 d3, q2, #7
- vqrshrn.u16 d4, q3, #7
- vqrshrn.u16 d5, q4, #7
- vqrshrn.u16 d6, q5, #7
- vqrshrn.u16 d7, q6, #7
- vqrshrn.u16 d8, q7, #7
- vqrshrn.u16 d9, q8, #7
-
- vst1.u8 {d2, d3}, [r4], r5 ;store result
- subs r12, r12, #1
- vst1.u8 {d4, d5}, [r4], r5
- vmov q11, q15
- vst1.u8 {d6, d7}, [r4], r5
- vst1.u8 {d8, d9}, [r4], r5
-
- bne filt_blk2d_spo16x16_loop_neon
- pop {r4-r5,pc}
-
- ENDP
-
-;-----------------
-
-bifilter16_coeff
- DCD 128, 0, 112, 16, 96, 32, 80, 48, 64, 64, 48, 80, 32, 96, 16, 112
-
- END
--- a/vp8/common/arm/neon/bilinearpredict4x4_neon.asm
+++ /dev/null
@@ -1,130 +1,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
- EXPORT |vp8_bilinear_predict4x4_neon|
- ARM
- REQUIRE8
- PRESERVE8
-
- AREA ||.text||, CODE, READONLY, ALIGN=2
-; r0 unsigned char *src_ptr,
-; r1 int src_pixels_per_line,
-; r2 int xoffset,
-; r3 int yoffset,
-; r4 unsigned char *dst_ptr,
-; stack(lr) int dst_pitch
-
-|vp8_bilinear_predict4x4_neon| PROC
- push {r4, lr}
-
- adr r12, bifilter4_coeff
- ldr r4, [sp, #8] ;load parameters from stack
- ldr lr, [sp, #12] ;load parameters from stack
-
- cmp r2, #0 ;skip first_pass filter if xoffset=0
- beq skip_firstpass_filter
-
-;First pass: output_height lines x output_width columns (5x4)
- vld1.u8 {d2}, [r0], r1 ;load src data
- add r2, r12, r2, lsl #3 ;calculate Hfilter location (2coeffsx4bytes=8bytes)
-
- vld1.u8 {d3}, [r0], r1
- vld1.u32 {d31}, [r2] ;first_pass filter
-
- vld1.u8 {d4}, [r0], r1
- vdup.8 d0, d31[0] ;first_pass filter (d0-d1)
- vld1.u8 {d5}, [r0], r1
- vdup.8 d1, d31[4]
- vld1.u8 {d6}, [r0], r1
-
- vshr.u64 q4, q1, #8 ;construct src_ptr[1]
- vshr.u64 q5, q2, #8
- vshr.u64 d12, d6, #8
-
- vzip.32 d2, d3 ;put 2-line data in 1 register (src_ptr[0])
- vzip.32 d4, d5
- vzip.32 d8, d9 ;put 2-line data in 1 register (src_ptr[1])
- vzip.32 d10, d11
-
- vmull.u8 q7, d2, d0 ;(src_ptr[0] * vp9_filter[0])
- vmull.u8 q8, d4, d0
- vmull.u8 q9, d6, d0
-
- vmlal.u8 q7, d8, d1 ;(src_ptr[1] * vp9_filter[1])
- vmlal.u8 q8, d10, d1
- vmlal.u8 q9, d12, d1
-
- vqrshrn.u16 d28, q7, #7 ;shift/round/saturate to u8
- vqrshrn.u16 d29, q8, #7
- vqrshrn.u16 d30, q9, #7
-
-;Second pass: 4x4
-secondpass_filter
- cmp r3, #0 ;skip second_pass filter if yoffset=0
- beq skip_secondpass_filter
-
- add r3, r12, r3, lsl #3 ;calculate Vfilter location
- vld1.u32 {d31}, [r3] ;load second_pass filter
-
- vdup.8 d0, d31[0] ;second_pass filter parameters (d0-d5)
- vdup.8 d1, d31[4]
-
- vmull.u8 q1, d28, d0
- vmull.u8 q2, d29, d0
-
- vext.8 d26, d28, d29, #4 ;construct src_ptr[pixel_step]
- vext.8 d27, d29, d30, #4
-
- vmlal.u8 q1, d26, d1
- vmlal.u8 q2, d27, d1
-
- add r0, r4, lr
- add r1, r0, lr
- add r2, r1, lr
-
- vqrshrn.u16 d2, q1, #7 ;shift/round/saturate to u8
- vqrshrn.u16 d3, q2, #7
-
- vst1.32 {d2[0]}, [r4] ;store result
- vst1.32 {d2[1]}, [r0]
- vst1.32 {d3[0]}, [r1]
- vst1.32 {d3[1]}, [r2]
-
- pop {r4, pc}
-
-;--------------------
-skip_firstpass_filter
-
- vld1.32 {d28[0]}, [r0], r1 ;load src data
- vld1.32 {d28[1]}, [r0], r1
- vld1.32 {d29[0]}, [r0], r1
- vld1.32 {d29[1]}, [r0], r1
- vld1.32 {d30[0]}, [r0], r1
-
- b secondpass_filter
-
-;---------------------
-skip_secondpass_filter
- vst1.32 {d28[0]}, [r4], lr ;store result
- vst1.32 {d28[1]}, [r4], lr
- vst1.32 {d29[0]}, [r4], lr
- vst1.32 {d29[1]}, [r4], lr
-
- pop {r4, pc}
-
- ENDP
-
-;-----------------
-
-bifilter4_coeff
- DCD 128, 0, 112, 16, 96, 32, 80, 48, 64, 64, 48, 80, 32, 96, 16, 112
-
- END
--- a/vp8/common/arm/neon/bilinearpredict8x4_neon.asm
+++ /dev/null
@@ -1,135 +1,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
- EXPORT |vp8_bilinear_predict8x4_neon|
- ARM
- REQUIRE8
- PRESERVE8
-
- AREA ||.text||, CODE, READONLY, ALIGN=2
-; r0 unsigned char *src_ptr,
-; r1 int src_pixels_per_line,
-; r2 int xoffset,
-; r3 int yoffset,
-; r4 unsigned char *dst_ptr,
-; stack(lr) int dst_pitch
-
-|vp8_bilinear_predict8x4_neon| PROC
- push {r4, lr}
-
- adr r12, bifilter8x4_coeff
- ldr r4, [sp, #8] ;load parameters from stack
- ldr lr, [sp, #12] ;load parameters from stack
-
- cmp r2, #0 ;skip first_pass filter if xoffset=0
- beq skip_firstpass_filter
-
-;First pass: output_height lines x output_width columns (5x8)
- add r2, r12, r2, lsl #3 ;calculate filter location
-
- vld1.u8 {q1}, [r0], r1 ;load src data
- vld1.u32 {d31}, [r2] ;load first_pass filter
- vld1.u8 {q2}, [r0], r1
- vdup.8 d0, d31[0] ;first_pass filter (d0 d1)
- vld1.u8 {q3}, [r0], r1
- vdup.8 d1, d31[4]
- vld1.u8 {q4}, [r0], r1
-
- vmull.u8 q6, d2, d0 ;(src_ptr[0] * vp9_filter[0])
- vld1.u8 {q5}, [r0], r1
- vmull.u8 q7, d4, d0
- vmull.u8 q8, d6, d0
- vmull.u8 q9, d8, d0
- vmull.u8 q10, d10, d0
-
- vext.8 d3, d2, d3, #1 ;construct src_ptr[1]
- vext.8 d5, d4, d5, #1
- vext.8 d7, d6, d7, #1
- vext.8 d9, d8, d9, #1
- vext.8 d11, d10, d11, #1
-
- vmlal.u8 q6, d3, d1 ;(src_ptr[1] * vp9_filter[1])
- vmlal.u8 q7, d5, d1
- vmlal.u8 q8, d7, d1
- vmlal.u8 q9, d9, d1
- vmlal.u8 q10, d11, d1
-
- vqrshrn.u16 d22, q6, #7 ;shift/round/saturate to u8
- vqrshrn.u16 d23, q7, #7
- vqrshrn.u16 d24, q8, #7
- vqrshrn.u16 d25, q9, #7
- vqrshrn.u16 d26, q10, #7
-
-;Second pass: 4x8
-secondpass_filter
- cmp r3, #0 ;skip second_pass filter if yoffset=0
- beq skip_secondpass_filter
-
- add r3, r12, r3, lsl #3
- add r0, r4, lr
-
- vld1.u32 {d31}, [r3] ;load second_pass filter
- add r1, r0, lr
-
- vdup.8 d0, d31[0] ;second_pass filter parameters (d0 d1)
- vdup.8 d1, d31[4]
-
- vmull.u8 q1, d22, d0 ;(src_ptr[0] * vp9_filter[0])
- vmull.u8 q2, d23, d0
- vmull.u8 q3, d24, d0
- vmull.u8 q4, d25, d0
-
- vmlal.u8 q1, d23, d1 ;(src_ptr[pixel_step] * vp9_filter[1])
- vmlal.u8 q2, d24, d1
- vmlal.u8 q3, d25, d1
- vmlal.u8 q4, d26, d1
-
- add r2, r1, lr
-
- vqrshrn.u16 d2, q1, #7 ;shift/round/saturate to u8
- vqrshrn.u16 d3, q2, #7
- vqrshrn.u16 d4, q3, #7
- vqrshrn.u16 d5, q4, #7
-
- vst1.u8 {d2}, [r4] ;store result
- vst1.u8 {d3}, [r0]
- vst1.u8 {d4}, [r1]
- vst1.u8 {d5}, [r2]
-
- pop {r4, pc}
-
-;--------------------
-skip_firstpass_filter
- vld1.u8 {d22}, [r0], r1 ;load src data
- vld1.u8 {d23}, [r0], r1
- vld1.u8 {d24}, [r0], r1
- vld1.u8 {d25}, [r0], r1
- vld1.u8 {d26}, [r0], r1
-
- b secondpass_filter
-
-;---------------------
-skip_secondpass_filter
- vst1.u8 {d22}, [r4], lr ;store result
- vst1.u8 {d23}, [r4], lr
- vst1.u8 {d24}, [r4], lr
- vst1.u8 {d25}, [r4], lr
-
- pop {r4, pc}
-
- ENDP
-
-;-----------------
-
-bifilter8x4_coeff
- DCD 128, 0, 112, 16, 96, 32, 80, 48, 64, 64, 48, 80, 32, 96, 16, 112
-
- END
--- a/vp8/common/arm/neon/bilinearpredict8x8_neon.asm
+++ /dev/null
@@ -1,183 +1,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
- EXPORT |vp8_bilinear_predict8x8_neon|
- ARM
- REQUIRE8
- PRESERVE8
-
- AREA ||.text||, CODE, READONLY, ALIGN=2
-; r0 unsigned char *src_ptr,
-; r1 int src_pixels_per_line,
-; r2 int xoffset,
-; r3 int yoffset,
-; r4 unsigned char *dst_ptr,
-; stack(lr) int dst_pitch
-
-|vp8_bilinear_predict8x8_neon| PROC
- push {r4, lr}
-
- adr r12, bifilter8_coeff
- ldr r4, [sp, #8] ;load parameters from stack
- ldr lr, [sp, #12] ;load parameters from stack
-
- cmp r2, #0 ;skip first_pass filter if xoffset=0
- beq skip_firstpass_filter
-
-;First pass: output_height lines x output_width columns (9x8)
- add r2, r12, r2, lsl #3 ;calculate filter location
-
- vld1.u8 {q1}, [r0], r1 ;load src data
- vld1.u32 {d31}, [r2] ;load first_pass filter
- vld1.u8 {q2}, [r0], r1
- vdup.8 d0, d31[0] ;first_pass filter (d0 d1)
- vld1.u8 {q3}, [r0], r1
- vdup.8 d1, d31[4]
- vld1.u8 {q4}, [r0], r1
-
- vmull.u8 q6, d2, d0 ;(src_ptr[0] * vp9_filter[0])
- vmull.u8 q7, d4, d0
- vmull.u8 q8, d6, d0
- vmull.u8 q9, d8, d0
-
- vext.8 d3, d2, d3, #1 ;construct src_ptr[1]
- vext.8 d5, d4, d5, #1
- vext.8 d7, d6, d7, #1
- vext.8 d9, d8, d9, #1
-
- vmlal.u8 q6, d3, d1 ;(src_ptr[1] * vp9_filter[1])
- vmlal.u8 q7, d5, d1
- vmlal.u8 q8, d7, d1
- vmlal.u8 q9, d9, d1
-
- vld1.u8 {q1}, [r0], r1 ;load src data
- vqrshrn.u16 d22, q6, #7 ;shift/round/saturate to u8
- vld1.u8 {q2}, [r0], r1
- vqrshrn.u16 d23, q7, #7
- vld1.u8 {q3}, [r0], r1
- vqrshrn.u16 d24, q8, #7
- vld1.u8 {q4}, [r0], r1
- vqrshrn.u16 d25, q9, #7
-
- ;first_pass filtering on the remaining 5 lines of data
- vld1.u8 {q5}, [r0], r1
-
- vmull.u8 q6, d2, d0 ;(src_ptr[0] * vp9_filter[0])
- vmull.u8 q7, d4, d0
- vmull.u8 q8, d6, d0
- vmull.u8 q9, d8, d0
- vmull.u8 q10, d10, d0
-
- vext.8 d3, d2, d3, #1 ;construct src_ptr[1]
- vext.8 d5, d4, d5, #1
- vext.8 d7, d6, d7, #1
- vext.8 d9, d8, d9, #1
- vext.8 d11, d10, d11, #1
-
- vmlal.u8 q6, d3, d1 ;(src_ptr[1] * vp9_filter[1])
- vmlal.u8 q7, d5, d1
- vmlal.u8 q8, d7, d1
- vmlal.u8 q9, d9, d1
- vmlal.u8 q10, d11, d1
-
- vqrshrn.u16 d26, q6, #7 ;shift/round/saturate to u8
- vqrshrn.u16 d27, q7, #7
- vqrshrn.u16 d28, q8, #7
- vqrshrn.u16 d29, q9, #7
- vqrshrn.u16 d30, q10, #7
-
-;Second pass: 8x8
-secondpass_filter
- cmp r3, #0 ;skip second_pass filter if yoffset=0
- beq skip_secondpass_filter
-
- add r3, r12, r3, lsl #3
- add r0, r4, lr
-
- vld1.u32 {d31}, [r3] ;load second_pass filter
- add r1, r0, lr
-
- vdup.8 d0, d31[0] ;second_pass filter parameters (d0 d1)
- vdup.8 d1, d31[4]
-
- vmull.u8 q1, d22, d0 ;(src_ptr[0] * vp9_filter[0])
- vmull.u8 q2, d23, d0
- vmull.u8 q3, d24, d0
- vmull.u8 q4, d25, d0
- vmull.u8 q5, d26, d0
- vmull.u8 q6, d27, d0
- vmull.u8 q7, d28, d0
- vmull.u8 q8, d29, d0
-
- vmlal.u8 q1, d23, d1 ;(src_ptr[pixel_step] * vp9_filter[1])
- vmlal.u8 q2, d24, d1
- vmlal.u8 q3, d25, d1
- vmlal.u8 q4, d26, d1
- vmlal.u8 q5, d27, d1
- vmlal.u8 q6, d28, d1
- vmlal.u8 q7, d29, d1
- vmlal.u8 q8, d30, d1
-
- vqrshrn.u16 d2, q1, #7 ;shift/round/saturate to u8
- vqrshrn.u16 d3, q2, #7
- vqrshrn.u16 d4, q3, #7
- vqrshrn.u16 d5, q4, #7
- vqrshrn.u16 d6, q5, #7
- vqrshrn.u16 d7, q6, #7
- vqrshrn.u16 d8, q7, #7
- vqrshrn.u16 d9, q8, #7
-
- vst1.u8 {d2}, [r4] ;store result
- vst1.u8 {d3}, [r0]
- vst1.u8 {d4}, [r1], lr
- vst1.u8 {d5}, [r1], lr
- vst1.u8 {d6}, [r1], lr
- vst1.u8 {d7}, [r1], lr
- vst1.u8 {d8}, [r1], lr
- vst1.u8 {d9}, [r1], lr
-
- pop {r4, pc}
-
-;--------------------
-skip_firstpass_filter
- vld1.u8 {d22}, [r0], r1 ;load src data
- vld1.u8 {d23}, [r0], r1
- vld1.u8 {d24}, [r0], r1
- vld1.u8 {d25}, [r0], r1
- vld1.u8 {d26}, [r0], r1
- vld1.u8 {d27}, [r0], r1
- vld1.u8 {d28}, [r0], r1
- vld1.u8 {d29}, [r0], r1
- vld1.u8 {d30}, [r0], r1
-
- b secondpass_filter
-
-;---------------------
-skip_secondpass_filter
- vst1.u8 {d22}, [r4], lr ;store result
- vst1.u8 {d23}, [r4], lr
- vst1.u8 {d24}, [r4], lr
- vst1.u8 {d25}, [r4], lr
- vst1.u8 {d26}, [r4], lr
- vst1.u8 {d27}, [r4], lr
- vst1.u8 {d28}, [r4], lr
- vst1.u8 {d29}, [r4], lr
-
- pop {r4, pc}
-
- ENDP
-
-;-----------------
-
-bifilter8_coeff
- DCD 128, 0, 112, 16, 96, 32, 80, 48, 64, 64, 48, 80, 32, 96, 16, 112
-
- END
--- a/vp8/common/arm/neon/buildintrapredictorsmby_neon.asm
+++ /dev/null
@@ -1,584 +1,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
- EXPORT |vp8_build_intra_predictors_mby_neon_func|
- EXPORT |vp8_build_intra_predictors_mby_s_neon_func|
-
- ARM
- REQUIRE8
- PRESERVE8
-
- AREA ||.text||, CODE, READONLY, ALIGN=2
-; r0 unsigned char *y_buffer
-; r1 unsigned char *ypred_ptr
-; r2 int y_stride
-; r3 int mode
-; stack int Up
-; stack int Left
-
-|vp8_build_intra_predictors_mby_neon_func| PROC
- push {r4-r8, lr}
-
- cmp r3, #0
- beq case_dc_pred
- cmp r3, #1
- beq case_v_pred
- cmp r3, #2
- beq case_h_pred
- cmp r3, #3
- beq case_tm_pred
-
-case_dc_pred
- ldr r4, [sp, #24] ; Up
- ldr r5, [sp, #28] ; Left
-
- ; Default the DC average to 128
- mov r12, #128
- vdup.u8 q0, r12
-
- ; Zero out running sum
- mov r12, #0
-
- ; compute shift and jump
- adds r7, r4, r5
- beq skip_dc_pred_up_left
-
- ; Load above row, if it exists
- cmp r4, #0
- beq skip_dc_pred_up
-
- sub r6, r0, r2
- vld1.8 {q1}, [r6]
- vpaddl.u8 q2, q1
- vpaddl.u16 q3, q2
- vpaddl.u32 q4, q3
-
- vmov.32 r4, d8[0]
- vmov.32 r6, d9[0]
-
- add r12, r4, r6
-
- ; Move back to integer registers
-
-skip_dc_pred_up
-
- cmp r5, #0
- beq skip_dc_pred_left
-
- sub r0, r0, #1
-
- ; Load left column, if it exists
- ldrb r3, [r0], r2
- ldrb r4, [r0], r2
- ldrb r5, [r0], r2
- ldrb r6, [r0], r2
-
- add r12, r12, r3
- add r12, r12, r4
- add r12, r12, r5
- add r12, r12, r6
-
- ldrb r3, [r0], r2
- ldrb r4, [r0], r2
- ldrb r5, [r0], r2
- ldrb r6, [r0], r2
-
- add r12, r12, r3
- add r12, r12, r4
- add r12, r12, r5
- add r12, r12, r6
-
- ldrb r3, [r0], r2
- ldrb r4, [r0], r2
- ldrb r5, [r0], r2
- ldrb r6, [r0], r2
-
- add r12, r12, r3
- add r12, r12, r4
- add r12, r12, r5
- add r12, r12, r6
-
- ldrb r3, [r0], r2
- ldrb r4, [r0], r2
- ldrb r5, [r0], r2
- ldrb r6, [r0]
-
- add r12, r12, r3
- add r12, r12, r4
- add r12, r12, r5
- add r12, r12, r6
-
-skip_dc_pred_left
- add r7, r7, #3 ; Shift
- sub r4, r7, #1
- mov r5, #1
- add r12, r12, r5, lsl r4
- mov r5, r12, lsr r7 ; expected_dc
-
- vdup.u8 q0, r5
-
-skip_dc_pred_up_left
- vst1.u8 {q0}, [r1]!
- vst1.u8 {q0}, [r1]!
- vst1.u8 {q0}, [r1]!
- vst1.u8 {q0}, [r1]!
- vst1.u8 {q0}, [r1]!
- vst1.u8 {q0}, [r1]!
- vst1.u8 {q0}, [r1]!
- vst1.u8 {q0}, [r1]!
- vst1.u8 {q0}, [r1]!
- vst1.u8 {q0}, [r1]!
- vst1.u8 {q0}, [r1]!
- vst1.u8 {q0}, [r1]!
- vst1.u8 {q0}, [r1]!
- vst1.u8 {q0}, [r1]!
- vst1.u8 {q0}, [r1]!
- vst1.u8 {q0}, [r1]!
-
- pop {r4-r8,pc}
-case_v_pred
- ; Copy down above row
- sub r6, r0, r2
- vld1.8 {q0}, [r6]
-
- vst1.u8 {q0}, [r1]!
- vst1.u8 {q0}, [r1]!
- vst1.u8 {q0}, [r1]!
- vst1.u8 {q0}, [r1]!
- vst1.u8 {q0}, [r1]!
- vst1.u8 {q0}, [r1]!
- vst1.u8 {q0}, [r1]!
- vst1.u8 {q0}, [r1]!
- vst1.u8 {q0}, [r1]!
- vst1.u8 {q0}, [r1]!
- vst1.u8 {q0}, [r1]!
- vst1.u8 {q0}, [r1]!
- vst1.u8 {q0}, [r1]!
- vst1.u8 {q0}, [r1]!
- vst1.u8 {q0}, [r1]!
- vst1.u8 {q0}, [r1]!
- pop {r4-r8,pc}
-
-case_h_pred
- ; Load 4x yleft_col
- sub r0, r0, #1
-
- ldrb r3, [r0], r2
- ldrb r4, [r0], r2
- ldrb r5, [r0], r2
- ldrb r6, [r0], r2
- vdup.u8 q0, r3
- vdup.u8 q1, r4
- vdup.u8 q2, r5
- vdup.u8 q3, r6
- vst1.u8 {q0}, [r1]!
- vst1.u8 {q1}, [r1]!
- vst1.u8 {q2}, [r1]!
- vst1.u8 {q3}, [r1]!
-
- ldrb r3, [r0], r2
- ldrb r4, [r0], r2
- ldrb r5, [r0], r2
- ldrb r6, [r0], r2
- vdup.u8 q0, r3
- vdup.u8 q1, r4
- vdup.u8 q2, r5
- vdup.u8 q3, r6
- vst1.u8 {q0}, [r1]!
- vst1.u8 {q1}, [r1]!
- vst1.u8 {q2}, [r1]!
- vst1.u8 {q3}, [r1]!
-
-
- ldrb r3, [r0], r2
- ldrb r4, [r0], r2
- ldrb r5, [r0], r2
- ldrb r6, [r0], r2
- vdup.u8 q0, r3
- vdup.u8 q1, r4
- vdup.u8 q2, r5
- vdup.u8 q3, r6
- vst1.u8 {q0}, [r1]!
- vst1.u8 {q1}, [r1]!
- vst1.u8 {q2}, [r1]!
- vst1.u8 {q3}, [r1]!
-
- ldrb r3, [r0], r2
- ldrb r4, [r0], r2
- ldrb r5, [r0], r2
- ldrb r6, [r0], r2
- vdup.u8 q0, r3
- vdup.u8 q1, r4
- vdup.u8 q2, r5
- vdup.u8 q3, r6
- vst1.u8 {q0}, [r1]!
- vst1.u8 {q1}, [r1]!
- vst1.u8 {q2}, [r1]!
- vst1.u8 {q3}, [r1]!
-
- pop {r4-r8,pc}
-
-case_tm_pred
- ; Load yabove_row
- sub r3, r0, r2
- vld1.8 {q8}, [r3]
-
- ; Load ytop_left
- sub r3, r3, #1
- ldrb r7, [r3]
-
- vdup.u16 q7, r7
-
- ; Compute yabove_row - ytop_left
- mov r3, #1
- vdup.u8 q0, r3
-
- vmull.u8 q4, d16, d0
- vmull.u8 q5, d17, d0
-
- vsub.s16 q4, q4, q7
- vsub.s16 q5, q5, q7
-
- ; Load 4x yleft_col
- sub r0, r0, #1
- mov r12, #4
-
-case_tm_pred_loop
- ldrb r3, [r0], r2
- ldrb r4, [r0], r2
- ldrb r5, [r0], r2
- ldrb r6, [r0], r2
- vdup.u16 q0, r3
- vdup.u16 q1, r4
- vdup.u16 q2, r5
- vdup.u16 q3, r6
-
- vqadd.s16 q8, q0, q4
- vqadd.s16 q9, q0, q5
-
- vqadd.s16 q10, q1, q4
- vqadd.s16 q11, q1, q5
-
- vqadd.s16 q12, q2, q4
- vqadd.s16 q13, q2, q5
-
- vqadd.s16 q14, q3, q4
- vqadd.s16 q15, q3, q5
-
- vqshrun.s16 d0, q8, #0
- vqshrun.s16 d1, q9, #0
-
- vqshrun.s16 d2, q10, #0
- vqshrun.s16 d3, q11, #0
-
- vqshrun.s16 d4, q12, #0
- vqshrun.s16 d5, q13, #0
-
- vqshrun.s16 d6, q14, #0
- vqshrun.s16 d7, q15, #0
-
- vst1.u8 {q0}, [r1]!
- vst1.u8 {q1}, [r1]!
- vst1.u8 {q2}, [r1]!
- vst1.u8 {q3}, [r1]!
-
- subs r12, r12, #1
- bne case_tm_pred_loop
-
- pop {r4-r8,pc}
-
- ENDP
-
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-; r0 unsigned char *y_buffer
-; r1 unsigned char *ypred_ptr
-; r2 int y_stride
-; r3 int mode
-; stack int Up
-; stack int Left
-
-|vp8_build_intra_predictors_mby_s_neon_func| PROC
- push {r4-r8, lr}
-
- mov r1, r0 ; unsigned char *ypred_ptr = x->dst.y_buffer; //x->Predictor;
-
- cmp r3, #0
- beq case_dc_pred_s
- cmp r3, #1
- beq case_v_pred_s
- cmp r3, #2
- beq case_h_pred_s
- cmp r3, #3
- beq case_tm_pred_s
-
-case_dc_pred_s
- ldr r4, [sp, #24] ; Up
- ldr r5, [sp, #28] ; Left
-
- ; Default the DC average to 128
- mov r12, #128
- vdup.u8 q0, r12
-
- ; Zero out running sum
- mov r12, #0
-
- ; compute shift and jump
- adds r7, r4, r5
- beq skip_dc_pred_up_left_s
-
- ; Load above row, if it exists
- cmp r4, #0
- beq skip_dc_pred_up_s
-
- sub r6, r0, r2
- vld1.8 {q1}, [r6]
- vpaddl.u8 q2, q1
- vpaddl.u16 q3, q2
- vpaddl.u32 q4, q3
-
- vmov.32 r4, d8[0]
- vmov.32 r6, d9[0]
-
- add r12, r4, r6
-
-    ; Move back to integer registers
-
-skip_dc_pred_up_s
-
- cmp r5, #0
- beq skip_dc_pred_left_s
-
- sub r0, r0, #1
-
- ; Load left row, if it exists
- ldrb r3, [r0], r2
- ldrb r4, [r0], r2
- ldrb r5, [r0], r2
- ldrb r6, [r0], r2
-
- add r12, r12, r3
- add r12, r12, r4
- add r12, r12, r5
- add r12, r12, r6
-
- ldrb r3, [r0], r2
- ldrb r4, [r0], r2
- ldrb r5, [r0], r2
- ldrb r6, [r0], r2
-
- add r12, r12, r3
- add r12, r12, r4
- add r12, r12, r5
- add r12, r12, r6
-
- ldrb r3, [r0], r2
- ldrb r4, [r0], r2
- ldrb r5, [r0], r2
- ldrb r6, [r0], r2
-
- add r12, r12, r3
- add r12, r12, r4
- add r12, r12, r5
- add r12, r12, r6
-
- ldrb r3, [r0], r2
- ldrb r4, [r0], r2
- ldrb r5, [r0], r2
- ldrb r6, [r0]
-
- add r12, r12, r3
- add r12, r12, r4
- add r12, r12, r5
- add r12, r12, r6
-
-skip_dc_pred_left_s
- add r7, r7, #3 ; Shift
- sub r4, r7, #1
- mov r5, #1
- add r12, r12, r5, lsl r4
- mov r5, r12, lsr r7 ; expected_dc
-
- vdup.u8 q0, r5
-
-skip_dc_pred_up_left_s
- vst1.u8 {q0}, [r1], r2
- vst1.u8 {q0}, [r1], r2
- vst1.u8 {q0}, [r1], r2
- vst1.u8 {q0}, [r1], r2
- vst1.u8 {q0}, [r1], r2
- vst1.u8 {q0}, [r1], r2
- vst1.u8 {q0}, [r1], r2
- vst1.u8 {q0}, [r1], r2
- vst1.u8 {q0}, [r1], r2
- vst1.u8 {q0}, [r1], r2
- vst1.u8 {q0}, [r1], r2
- vst1.u8 {q0}, [r1], r2
- vst1.u8 {q0}, [r1], r2
- vst1.u8 {q0}, [r1], r2
- vst1.u8 {q0}, [r1], r2
- vst1.u8 {q0}, [r1], r2
-
- pop {r4-r8,pc}
-case_v_pred_s
- ; Copy down above row
- sub r6, r0, r2
- vld1.8 {q0}, [r6]
-
- vst1.u8 {q0}, [r1], r2
- vst1.u8 {q0}, [r1], r2
- vst1.u8 {q0}, [r1], r2
- vst1.u8 {q0}, [r1], r2
- vst1.u8 {q0}, [r1], r2
- vst1.u8 {q0}, [r1], r2
- vst1.u8 {q0}, [r1], r2
- vst1.u8 {q0}, [r1], r2
- vst1.u8 {q0}, [r1], r2
- vst1.u8 {q0}, [r1], r2
- vst1.u8 {q0}, [r1], r2
- vst1.u8 {q0}, [r1], r2
- vst1.u8 {q0}, [r1], r2
- vst1.u8 {q0}, [r1], r2
- vst1.u8 {q0}, [r1], r2
- vst1.u8 {q0}, [r1], r2
- pop {r4-r8,pc}
-
-case_h_pred_s
- ; Load 4x yleft_col
- sub r0, r0, #1
-
- ldrb r3, [r0], r2
- ldrb r4, [r0], r2
- ldrb r5, [r0], r2
- ldrb r6, [r0], r2
- vdup.u8 q0, r3
- vdup.u8 q1, r4
- vdup.u8 q2, r5
- vdup.u8 q3, r6
- vst1.u8 {q0}, [r1], r2
- vst1.u8 {q1}, [r1], r2
- vst1.u8 {q2}, [r1], r2
- vst1.u8 {q3}, [r1], r2
-
- ldrb r3, [r0], r2
- ldrb r4, [r0], r2
- ldrb r5, [r0], r2
- ldrb r6, [r0], r2
- vdup.u8 q0, r3
- vdup.u8 q1, r4
- vdup.u8 q2, r5
- vdup.u8 q3, r6
- vst1.u8 {q0}, [r1], r2
- vst1.u8 {q1}, [r1], r2
- vst1.u8 {q2}, [r1], r2
- vst1.u8 {q3}, [r1], r2
-
-
- ldrb r3, [r0], r2
- ldrb r4, [r0], r2
- ldrb r5, [r0], r2
- ldrb r6, [r0], r2
- vdup.u8 q0, r3
- vdup.u8 q1, r4
- vdup.u8 q2, r5
- vdup.u8 q3, r6
- vst1.u8 {q0}, [r1], r2
- vst1.u8 {q1}, [r1], r2
- vst1.u8 {q2}, [r1], r2
- vst1.u8 {q3}, [r1], r2
-
- ldrb r3, [r0], r2
- ldrb r4, [r0], r2
- ldrb r5, [r0], r2
- ldrb r6, [r0], r2
- vdup.u8 q0, r3
- vdup.u8 q1, r4
- vdup.u8 q2, r5
- vdup.u8 q3, r6
- vst1.u8 {q0}, [r1], r2
- vst1.u8 {q1}, [r1], r2
- vst1.u8 {q2}, [r1], r2
- vst1.u8 {q3}, [r1], r2
-
- pop {r4-r8,pc}
-
-case_tm_pred_s
- ; Load yabove_row
- sub r3, r0, r2
- vld1.8 {q8}, [r3]
-
- ; Load ytop_left
- sub r3, r3, #1
- ldrb r7, [r3]
-
- vdup.u16 q7, r7
-
- ; Compute yabove_row - ytop_left
- mov r3, #1
- vdup.u8 q0, r3
-
- vmull.u8 q4, d16, d0
- vmull.u8 q5, d17, d0
-
- vsub.s16 q4, q4, q7
- vsub.s16 q5, q5, q7
-
- ; Load 4x yleft_col
- sub r0, r0, #1
- mov r12, #4
-
-case_tm_pred_loop_s
- ldrb r3, [r0], r2
- ldrb r4, [r0], r2
- ldrb r5, [r0], r2
- ldrb r6, [r0], r2
- vdup.u16 q0, r3
- vdup.u16 q1, r4
- vdup.u16 q2, r5
- vdup.u16 q3, r6
-
- vqadd.s16 q8, q0, q4
- vqadd.s16 q9, q0, q5
-
- vqadd.s16 q10, q1, q4
- vqadd.s16 q11, q1, q5
-
- vqadd.s16 q12, q2, q4
- vqadd.s16 q13, q2, q5
-
- vqadd.s16 q14, q3, q4
- vqadd.s16 q15, q3, q5
-
- vqshrun.s16 d0, q8, #0
- vqshrun.s16 d1, q9, #0
-
- vqshrun.s16 d2, q10, #0
- vqshrun.s16 d3, q11, #0
-
- vqshrun.s16 d4, q12, #0
- vqshrun.s16 d5, q13, #0
-
- vqshrun.s16 d6, q14, #0
- vqshrun.s16 d7, q15, #0
-
- vst1.u8 {q0}, [r1], r2
- vst1.u8 {q1}, [r1], r2
- vst1.u8 {q2}, [r1], r2
- vst1.u8 {q3}, [r1], r2
-
- subs r12, r12, #1
- bne case_tm_pred_loop_s
-
- pop {r4-r8,pc}
-
- ENDP
-
-
- END
--- a/vp8/common/arm/neon/copymem16x16_neon.asm
+++ /dev/null
@@ -1,59 +1,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
- EXPORT |vp9_copy_mem16x16_neon|
- ; ARM
- ; REQUIRE8
- ; PRESERVE8
-
- AREA Block, CODE, READONLY ; name this block of code
-;void copy_mem16x16_neon( unsigned char *src, int src_stride, unsigned char *dst, int dst_stride)
-;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
-|vp9_copy_mem16x16_neon| PROC
-
- vld1.u8 {q0}, [r0], r1
- vld1.u8 {q1}, [r0], r1
- vld1.u8 {q2}, [r0], r1
- vst1.u8 {q0}, [r2], r3
- vld1.u8 {q3}, [r0], r1
- vst1.u8 {q1}, [r2], r3
- vld1.u8 {q4}, [r0], r1
- vst1.u8 {q2}, [r2], r3
- vld1.u8 {q5}, [r0], r1
- vst1.u8 {q3}, [r2], r3
- vld1.u8 {q6}, [r0], r1
- vst1.u8 {q4}, [r2], r3
- vld1.u8 {q7}, [r0], r1
- vst1.u8 {q5}, [r2], r3
- vld1.u8 {q8}, [r0], r1
- vst1.u8 {q6}, [r2], r3
- vld1.u8 {q9}, [r0], r1
- vst1.u8 {q7}, [r2], r3
- vld1.u8 {q10}, [r0], r1
- vst1.u8 {q8}, [r2], r3
- vld1.u8 {q11}, [r0], r1
- vst1.u8 {q9}, [r2], r3
- vld1.u8 {q12}, [r0], r1
- vst1.u8 {q10}, [r2], r3
- vld1.u8 {q13}, [r0], r1
- vst1.u8 {q11}, [r2], r3
- vld1.u8 {q14}, [r0], r1
- vst1.u8 {q12}, [r2], r3
- vld1.u8 {q15}, [r0], r1
- vst1.u8 {q13}, [r2], r3
- vst1.u8 {q14}, [r2], r3
- vst1.u8 {q15}, [r2], r3
-
- mov pc, lr
-
- ENDP ; |vp9_copy_mem16x16_neon|
-
- END
--- a/vp8/common/arm/neon/copymem8x4_neon.asm
+++ /dev/null
@@ -1,34 +1,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
- EXPORT |vp9_copy_mem8x4_neon|
- ; ARM
- ; REQUIRE8
- ; PRESERVE8
-
- AREA Block, CODE, READONLY ; name this block of code
-;void copy_mem8x4_neon( unsigned char *src, int src_stride, unsigned char *dst, int dst_stride)
-;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
-|vp9_copy_mem8x4_neon| PROC
- vld1.u8 {d0}, [r0], r1
- vld1.u8 {d1}, [r0], r1
- vst1.u8 {d0}, [r2], r3
- vld1.u8 {d2}, [r0], r1
- vst1.u8 {d1}, [r2], r3
- vld1.u8 {d3}, [r0], r1
- vst1.u8 {d2}, [r2], r3
- vst1.u8 {d3}, [r2], r3
-
- mov pc, lr
-
- ENDP ; |vp9_copy_mem8x4_neon|
-
- END
--- a/vp8/common/arm/neon/copymem8x8_neon.asm
+++ /dev/null
@@ -1,43 +1,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
- EXPORT |vp9_copy_mem8x8_neon|
- ; ARM
- ; REQUIRE8
- ; PRESERVE8
-
- AREA Block, CODE, READONLY ; name this block of code
-;void copy_mem8x8_neon( unsigned char *src, int src_stride, unsigned char *dst, int dst_stride)
-;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
-|vp9_copy_mem8x8_neon| PROC
-
- vld1.u8 {d0}, [r0], r1
- vld1.u8 {d1}, [r0], r1
- vst1.u8 {d0}, [r2], r3
- vld1.u8 {d2}, [r0], r1
- vst1.u8 {d1}, [r2], r3
- vld1.u8 {d3}, [r0], r1
- vst1.u8 {d2}, [r2], r3
- vld1.u8 {d4}, [r0], r1
- vst1.u8 {d3}, [r2], r3
- vld1.u8 {d5}, [r0], r1
- vst1.u8 {d4}, [r2], r3
- vld1.u8 {d6}, [r0], r1
- vst1.u8 {d5}, [r2], r3
- vld1.u8 {d7}, [r0], r1
- vst1.u8 {d6}, [r2], r3
- vst1.u8 {d7}, [r2], r3
-
- mov pc, lr
-
- ENDP ; |vp9_copy_mem8x8_neon|
-
- END
--- a/vp8/common/arm/neon/dc_only_idct_add_neon.asm
+++ /dev/null
@@ -1,49 +1,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license and patent
-; grant that can be found in the LICENSE file in the root of the source
-; tree. All contributing project authors may be found in the AUTHORS
-; file in the root of the source tree.
-;
-
-
- EXPORT |vp8_dc_only_idct_add_neon|
- ARM
- REQUIRE8
- PRESERVE8
-
- AREA ||.text||, CODE, READONLY, ALIGN=2
-;void vp8_dc_only_idct_add_neon(short input_dc, unsigned char *pred_ptr,
-; unsigned char *dst_ptr, int pitch, int stride)
-; r0 input_dc
-; r1 pred_ptr
-; r2 dst_ptr
-; r3 pitch
-; sp stride
-|vp8_dc_only_idct_add_neon| PROC
- add r0, r0, #4
- asr r0, r0, #3
- ldr r12, [sp]
- vdup.16 q0, r0
-
- vld1.32 {d2[0]}, [r1], r3
- vld1.32 {d2[1]}, [r1], r3
- vld1.32 {d4[0]}, [r1], r3
- vld1.32 {d4[1]}, [r1]
-
- vaddw.u8 q1, q0, d2
- vaddw.u8 q2, q0, d4
-
- vqmovun.s16 d2, q1
- vqmovun.s16 d4, q2
-
- vst1.32 {d2[0]}, [r2], r12
- vst1.32 {d2[1]}, [r2], r12
- vst1.32 {d4[0]}, [r2], r12
- vst1.32 {d4[1]}, [r2]
-
- bx lr
-
- ENDP
- END
--- a/vp8/common/arm/neon/iwalsh_neon.asm
+++ /dev/null
@@ -1,80 +1,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
- EXPORT |vp8_short_inv_walsh4x4_neon|
- EXPORT |vp8_short_inv_walsh4x4_1_neon|
-
- ARM
- REQUIRE8
- PRESERVE8
-
- AREA |.text|, CODE, READONLY ; name this block of code
-
-;short vp8_short_inv_walsh4x4_neon(short *input, short *output)
-|vp8_short_inv_walsh4x4_neon| PROC
-
- ; read in all four lines of values: d0->d3
- vld1.i16 {q0-q1}, [r0@128]
-
- ; first for loop
- vadd.s16 d4, d0, d3 ;a = [0] + [12]
- vadd.s16 d6, d1, d2 ;b = [4] + [8]
- vsub.s16 d5, d0, d3 ;d = [0] - [12]
- vsub.s16 d7, d1, d2 ;c = [4] - [8]
-
- vadd.s16 q0, q2, q3 ; a+b d+c
- vsub.s16 q1, q2, q3 ; a-b d-c
-
- vtrn.32 d0, d2 ;d0: 0 1 8 9
- ;d2: 2 3 10 11
- vtrn.32 d1, d3 ;d1: 4 5 12 13
- ;d3: 6 7 14 15
-
- vtrn.16 d0, d1 ;d0: 0 4 8 12
- ;d1: 1 5 9 13
- vtrn.16 d2, d3 ;d2: 2 6 10 14
- ;d3: 3 7 11 15
-
- ; second for loop
-
- vadd.s16 d4, d0, d3 ;a = [0] + [3]
- vadd.s16 d6, d1, d2 ;b = [1] + [2]
- vsub.s16 d5, d0, d3 ;d = [0] - [3]
- vsub.s16 d7, d1, d2 ;c = [1] - [2]
-
- vmov.i16 q8, #3
-
- vadd.s16 q0, q2, q3 ; a+b d+c
- vsub.s16 q1, q2, q3 ; a-b d-c
-
- vadd.i16 q0, q0, q8 ;e/f += 3
- vadd.i16 q1, q1, q8 ;g/h += 3
-
- vshr.s16 q0, q0, #3 ;e/f >> 3
- vshr.s16 q1, q1, #3 ;g/h >> 3
-
- vst4.i16 {d0,d1,d2,d3}, [r1@128]
-
- bx lr
- ENDP ; |vp8_short_inv_walsh4x4_neon|
-
-
-;short vp8_short_inv_walsh4x4_1_neon(short *input, short *output)
-|vp8_short_inv_walsh4x4_1_neon| PROC
- ldrsh r2, [r0] ; load input[0]
- add r3, r2, #3 ; add 3
- add r2, r1, #16 ; base for last 8 output
- asr r0, r3, #3 ; right shift 3
- vdup.16 q0, r0 ; load and duplicate
- vst1.16 {q0}, [r1@128] ; write back 8
- vst1.16 {q0}, [r2@128] ; write back last 8
- bx lr
- ENDP ; |vp8_short_inv_walsh4x4_1_neon|
-
- END
--- a/vp8/common/arm/neon/loopfilter_neon.asm
+++ /dev/null
@@ -1,397 +1,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
- EXPORT |vp9_loop_filter_horizontal_edge_y_neon|
- EXPORT |vp9_loop_filter_horizontal_edge_uv_neon|
- EXPORT |vp9_loop_filter_vertical_edge_y_neon|
- EXPORT |vp9_loop_filter_vertical_edge_uv_neon|
- ARM
-
- AREA ||.text||, CODE, READONLY, ALIGN=2
-
-; r0 unsigned char *src
-; r1 int pitch
-; r2 unsigned char blimit
-; r3 unsigned char limit
-; sp unsigned char thresh,
-|vp9_loop_filter_horizontal_edge_y_neon| PROC
- push {lr}
- vdup.u8 q0, r2 ; duplicate blimit
- vdup.u8 q1, r3 ; duplicate limit
- sub r2, r0, r1, lsl #2 ; move src pointer down by 4 lines
- ldr r3, [sp, #4] ; load thresh
- add r12, r2, r1
- add r1, r1, r1
-
- vdup.u8 q2, r3 ; duplicate thresh
-
- vld1.u8 {q3}, [r2@128], r1 ; p3
- vld1.u8 {q4}, [r12@128], r1 ; p2
- vld1.u8 {q5}, [r2@128], r1 ; p1
- vld1.u8 {q6}, [r12@128], r1 ; p0
- vld1.u8 {q7}, [r2@128], r1 ; q0
- vld1.u8 {q8}, [r12@128], r1 ; q1
- vld1.u8 {q9}, [r2@128] ; q2
- vld1.u8 {q10}, [r12@128] ; q3
-
- sub r2, r2, r1, lsl #1
- sub r12, r12, r1, lsl #1
-
- bl vp9_loop_filter_neon
-
- vst1.u8 {q5}, [r2@128], r1 ; store op1
- vst1.u8 {q6}, [r12@128], r1 ; store op0
- vst1.u8 {q7}, [r2@128], r1 ; store oq0
- vst1.u8 {q8}, [r12@128], r1 ; store oq1
-
- pop {pc}
- ENDP ; |vp9_loop_filter_horizontal_edge_y_neon|
-
-
-; r0 unsigned char *u,
-; r1 int pitch,
-; r2 unsigned char blimit
-; r3 unsigned char limit
-; sp unsigned char thresh,
-; sp+4 unsigned char *v
-|vp9_loop_filter_horizontal_edge_uv_neon| PROC
- push {lr}
- vdup.u8 q0, r2 ; duplicate blimit
- vdup.u8 q1, r3 ; duplicate limit
- ldr r12, [sp, #4] ; load thresh
- ldr r2, [sp, #8] ; load v ptr
- vdup.u8 q2, r12 ; duplicate thresh
-
- sub r3, r0, r1, lsl #2 ; move u pointer down by 4 lines
- sub r12, r2, r1, lsl #2 ; move v pointer down by 4 lines
-
- vld1.u8 {d6}, [r3@64], r1 ; p3
- vld1.u8 {d7}, [r12@64], r1 ; p3
- vld1.u8 {d8}, [r3@64], r1 ; p2
- vld1.u8 {d9}, [r12@64], r1 ; p2
- vld1.u8 {d10}, [r3@64], r1 ; p1
- vld1.u8 {d11}, [r12@64], r1 ; p1
- vld1.u8 {d12}, [r3@64], r1 ; p0
- vld1.u8 {d13}, [r12@64], r1 ; p0
- vld1.u8 {d14}, [r3@64], r1 ; q0
- vld1.u8 {d15}, [r12@64], r1 ; q0
- vld1.u8 {d16}, [r3@64], r1 ; q1
- vld1.u8 {d17}, [r12@64], r1 ; q1
- vld1.u8 {d18}, [r3@64], r1 ; q2
- vld1.u8 {d19}, [r12@64], r1 ; q2
- vld1.u8 {d20}, [r3@64] ; q3
- vld1.u8 {d21}, [r12@64] ; q3
-
- bl vp9_loop_filter_neon
-
- sub r0, r0, r1, lsl #1
- sub r2, r2, r1, lsl #1
-
- vst1.u8 {d10}, [r0@64], r1 ; store u op1
- vst1.u8 {d11}, [r2@64], r1 ; store v op1
- vst1.u8 {d12}, [r0@64], r1 ; store u op0
- vst1.u8 {d13}, [r2@64], r1 ; store v op0
- vst1.u8 {d14}, [r0@64], r1 ; store u oq0
- vst1.u8 {d15}, [r2@64], r1 ; store v oq0
- vst1.u8 {d16}, [r0@64] ; store u oq1
- vst1.u8 {d17}, [r2@64] ; store v oq1
-
- pop {pc}
- ENDP ; |vp9_loop_filter_horizontal_edge_uv_neon|
-
-; void vp9_loop_filter_vertical_edge_y_neon(unsigned char *src, int pitch,
-; const signed char *flimit,
-; const signed char *limit,
-; const signed char *thresh,
-; int count)
-; r0 unsigned char *src
-; r1 int pitch
-; r2 unsigned char blimit
-; r3 unsigned char limit
-; sp unsigned char thresh,
-
-|vp9_loop_filter_vertical_edge_y_neon| PROC
- push {lr}
- vdup.u8 q0, r2 ; duplicate blimit
- vdup.u8 q1, r3 ; duplicate limit
- sub r2, r0, #4 ; src ptr down by 4 columns
- add r1, r1, r1
- ldr r3, [sp, #4] ; load thresh
- add r12, r2, r1, asr #1
-
- vld1.u8 {d6}, [r2], r1
- vld1.u8 {d8}, [r12], r1
- vld1.u8 {d10}, [r2], r1
- vld1.u8 {d12}, [r12], r1
- vld1.u8 {d14}, [r2], r1
- vld1.u8 {d16}, [r12], r1
- vld1.u8 {d18}, [r2], r1
- vld1.u8 {d20}, [r12], r1
-
- vld1.u8 {d7}, [r2], r1 ; load second 8-line src data
- vld1.u8 {d9}, [r12], r1
- vld1.u8 {d11}, [r2], r1
- vld1.u8 {d13}, [r12], r1
- vld1.u8 {d15}, [r2], r1
- vld1.u8 {d17}, [r12], r1
- vld1.u8 {d19}, [r2]
- vld1.u8 {d21}, [r12]
-
- ;transpose to 8x16 matrix
- vtrn.32 q3, q7
- vtrn.32 q4, q8
- vtrn.32 q5, q9
- vtrn.32 q6, q10
-
- vdup.u8 q2, r3 ; duplicate thresh
-
- vtrn.16 q3, q5
- vtrn.16 q4, q6
- vtrn.16 q7, q9
- vtrn.16 q8, q10
-
- vtrn.8 q3, q4
- vtrn.8 q5, q6
- vtrn.8 q7, q8
- vtrn.8 q9, q10
-
- bl vp9_loop_filter_neon
-
- vswp d12, d11
- vswp d16, d13
-
- sub r0, r0, #2 ; dst ptr
-
- vswp d14, d12
- vswp d16, d15
-
- add r12, r0, r1, asr #1
-
- ;store op1, op0, oq0, oq1
- vst4.8 {d10[0], d11[0], d12[0], d13[0]}, [r0], r1
- vst4.8 {d10[1], d11[1], d12[1], d13[1]}, [r12], r1
- vst4.8 {d10[2], d11[2], d12[2], d13[2]}, [r0], r1
- vst4.8 {d10[3], d11[3], d12[3], d13[3]}, [r12], r1
- vst4.8 {d10[4], d11[4], d12[4], d13[4]}, [r0], r1
- vst4.8 {d10[5], d11[5], d12[5], d13[5]}, [r12], r1
- vst4.8 {d10[6], d11[6], d12[6], d13[6]}, [r0], r1
- vst4.8 {d10[7], d11[7], d12[7], d13[7]}, [r12], r1
-
- vst4.8 {d14[0], d15[0], d16[0], d17[0]}, [r0], r1
- vst4.8 {d14[1], d15[1], d16[1], d17[1]}, [r12], r1
- vst4.8 {d14[2], d15[2], d16[2], d17[2]}, [r0], r1
- vst4.8 {d14[3], d15[3], d16[3], d17[3]}, [r12], r1
- vst4.8 {d14[4], d15[4], d16[4], d17[4]}, [r0], r1
- vst4.8 {d14[5], d15[5], d16[5], d17[5]}, [r12], r1
- vst4.8 {d14[6], d15[6], d16[6], d17[6]}, [r0]
- vst4.8 {d14[7], d15[7], d16[7], d17[7]}, [r12]
-
- pop {pc}
- ENDP ; |vp9_loop_filter_vertical_edge_y_neon|
-
-; void vp9_loop_filter_vertical_edge_uv_neon(unsigned char *u, int pitch,
-; const signed char *flimit,
-; const signed char *limit,
-; const signed char *thresh,
-; unsigned char *v)
-; r0 unsigned char *u,
-; r1 int pitch,
-; r2 unsigned char blimit
-; r3 unsigned char limit
-; sp unsigned char thresh,
-; sp+4 unsigned char *v
-|vp9_loop_filter_vertical_edge_uv_neon| PROC
- push {lr}
- vdup.u8 q0, r2 ; duplicate blimit
- sub r12, r0, #4 ; move u pointer down by 4 columns
- ldr r2, [sp, #8] ; load v ptr
- vdup.u8 q1, r3 ; duplicate limit
- sub r3, r2, #4 ; move v pointer down by 4 columns
-
- vld1.u8 {d6}, [r12], r1 ;load u data
- vld1.u8 {d7}, [r3], r1 ;load v data
- vld1.u8 {d8}, [r12], r1
- vld1.u8 {d9}, [r3], r1
- vld1.u8 {d10}, [r12], r1
- vld1.u8 {d11}, [r3], r1
- vld1.u8 {d12}, [r12], r1
- vld1.u8 {d13}, [r3], r1
- vld1.u8 {d14}, [r12], r1
- vld1.u8 {d15}, [r3], r1
- vld1.u8 {d16}, [r12], r1
- vld1.u8 {d17}, [r3], r1
- vld1.u8 {d18}, [r12], r1
- vld1.u8 {d19}, [r3], r1
- vld1.u8 {d20}, [r12]
- vld1.u8 {d21}, [r3]
-
- ldr r12, [sp, #4] ; load thresh
-
- ;transpose to 8x16 matrix
- vtrn.32 q3, q7
- vtrn.32 q4, q8
- vtrn.32 q5, q9
- vtrn.32 q6, q10
-
- vdup.u8 q2, r12 ; duplicate thresh
-
- vtrn.16 q3, q5
- vtrn.16 q4, q6
- vtrn.16 q7, q9
- vtrn.16 q8, q10
-
- vtrn.8 q3, q4
- vtrn.8 q5, q6
- vtrn.8 q7, q8
- vtrn.8 q9, q10
-
- bl vp9_loop_filter_neon
-
- vswp d12, d11
- vswp d16, d13
- vswp d14, d12
- vswp d16, d15
-
- sub r0, r0, #2
- sub r2, r2, #2
-
- ;store op1, op0, oq0, oq1
- vst4.8 {d10[0], d11[0], d12[0], d13[0]}, [r0], r1
- vst4.8 {d14[0], d15[0], d16[0], d17[0]}, [r2], r1
- vst4.8 {d10[1], d11[1], d12[1], d13[1]}, [r0], r1
- vst4.8 {d14[1], d15[1], d16[1], d17[1]}, [r2], r1
- vst4.8 {d10[2], d11[2], d12[2], d13[2]}, [r0], r1
- vst4.8 {d14[2], d15[2], d16[2], d17[2]}, [r2], r1
- vst4.8 {d10[3], d11[3], d12[3], d13[3]}, [r0], r1
- vst4.8 {d14[3], d15[3], d16[3], d17[3]}, [r2], r1
- vst4.8 {d10[4], d11[4], d12[4], d13[4]}, [r0], r1
- vst4.8 {d14[4], d15[4], d16[4], d17[4]}, [r2], r1
- vst4.8 {d10[5], d11[5], d12[5], d13[5]}, [r0], r1
- vst4.8 {d14[5], d15[5], d16[5], d17[5]}, [r2], r1
- vst4.8 {d10[6], d11[6], d12[6], d13[6]}, [r0], r1
- vst4.8 {d14[6], d15[6], d16[6], d17[6]}, [r2], r1
- vst4.8 {d10[7], d11[7], d12[7], d13[7]}, [r0]
- vst4.8 {d14[7], d15[7], d16[7], d17[7]}, [r2]
-
- pop {pc}
- ENDP ; |vp9_loop_filter_vertical_edge_uv_neon|
-
-; void vp9_loop_filter_neon();
-; This is a helper function for the loopfilters. The individual functions do the
-; necessary load, transpose (if necessary) and store.
-
-; r0-r3 PRESERVE
-; q0 flimit
-; q1 limit
-; q2 thresh
-; q3 p3
-; q4 p2
-; q5 p1
-; q6 p0
-; q7 q0
-; q8 q1
-; q9 q2
-; q10 q3
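For reference, the mask logic this helper builds before applying the filter can be written as a small scalar C model; this is only a sketch, and the function and variable names below are illustrative rather than taken from the library:

    #include <stdlib.h>
    #include <stdint.h>

    /* -1 (all bits set) mirrors the all-ones vector lanes produced below */
    static int filter_mask(uint8_t limit, uint8_t blimit,
                           uint8_t p3, uint8_t p2, uint8_t p1, uint8_t p0,
                           uint8_t q0, uint8_t q1, uint8_t q2, uint8_t q3) {
        int ok = abs(p3 - p2) <= limit && abs(p2 - p1) <= limit &&
                 abs(p1 - p0) <= limit && abs(q1 - q0) <= limit &&
                 abs(q2 - q1) <= limit && abs(q3 - q2) <= limit &&
                 abs(p0 - q0) * 2 + abs(p1 - q1) / 2 <= blimit;
        return ok ? -1 : 0;
    }

    /* high edge variance mask, used to gate the outer-tap adjustment below */
    static int hev_mask(uint8_t thresh, uint8_t p1, uint8_t p0,
                        uint8_t q0, uint8_t q1) {
        return (abs(p1 - p0) > thresh || abs(q1 - q0) > thresh) ? -1 : 0;
    }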
-|vp9_loop_filter_neon| PROC
-
- ; vp9_filter_mask
- vabd.u8 q11, q3, q4 ; abs(p3 - p2)
- vabd.u8 q12, q4, q5 ; abs(p2 - p1)
- vabd.u8 q13, q5, q6 ; abs(p1 - p0)
- vabd.u8 q14, q8, q7 ; abs(q1 - q0)
- vabd.u8 q3, q9, q8 ; abs(q2 - q1)
- vabd.u8 q4, q10, q9 ; abs(q3 - q2)
-
- vmax.u8 q11, q11, q12
- vmax.u8 q12, q13, q14
- vmax.u8 q3, q3, q4
- vmax.u8 q15, q11, q12
-
- vabd.u8 q9, q6, q7 ; abs(p0 - q0)
-
- ; vp8_hevmask
- vcgt.u8 q13, q13, q2 ; (abs(p1 - p0) > thresh)*-1
- vcgt.u8 q14, q14, q2 ; (abs(q1 - q0) > thresh)*-1
- vmax.u8 q15, q15, q3
-
- vmov.u8 q10, #0x80 ; 0x80
-
- vabd.u8 q2, q5, q8 ; a = abs(p1 - q1)
- vqadd.u8 q9, q9, q9 ; b = abs(p0 - q0) * 2
-
- vcge.u8 q15, q1, q15
-
- ; vp9_filter() function
- ; convert to signed
- veor q7, q7, q10 ; qs0
- vshr.u8 q2, q2, #1 ; a = a / 2
- veor q6, q6, q10 ; ps0
-
- veor q5, q5, q10 ; ps1
- vqadd.u8 q9, q9, q2 ; a = b + a
-
- veor q8, q8, q10 ; qs1
-
- vmov.u8 q10, #3 ; #3
-
- vsubl.s8 q2, d14, d12 ; ( qs0 - ps0)
- vsubl.s8 q11, d15, d13
-
- vcge.u8 q9, q0, q9 ; (a > flimit * 2 + limit) * -1
-
- vmovl.u8 q4, d20
-
- vqsub.s8 q1, q5, q8 ; vp9_filter = clamp(ps1-qs1)
- vorr q14, q13, q14 ; vp8_hevmask
-
- vmul.i16 q2, q2, q4 ; 3 * ( qs0 - ps0)
- vmul.i16 q11, q11, q4
-
- vand q1, q1, q14 ; vp9_filter &= hev
- vand q15, q15, q9 ; vp9_filter_mask
-
- vaddw.s8 q2, q2, d2
- vaddw.s8 q11, q11, d3
-
- vmov.u8 q9, #4 ; #4
-
- ; vp9_filter = clamp(vp9_filter + 3 * ( qs0 - ps0))
- vqmovn.s16 d2, q2
- vqmovn.s16 d3, q11
- vand q1, q1, q15 ; vp9_filter &= mask
-
- vqadd.s8 q2, q1, q10 ; Filter2 = clamp(vp9_filter+3)
- vqadd.s8 q1, q1, q9 ; Filter1 = clamp(vp9_filter+4)
- vshr.s8 q2, q2, #3 ; Filter2 >>= 3
- vshr.s8 q1, q1, #3 ; Filter1 >>= 3
-
-
- vqadd.s8 q11, q6, q2 ; u = clamp(ps0 + Filter2)
- vqsub.s8 q10, q7, q1 ; u = clamp(qs0 - Filter1)
-
- ; outer tap adjustments: ++vp9_filter >> 1
- vrshr.s8 q1, q1, #1
- vbic q1, q1, q14 ; vp9_filter &= ~hev
- vmov.u8 q0, #0x80 ; 0x80
- vqadd.s8 q13, q5, q1 ; u = clamp(ps1 + vp9_filter)
- vqsub.s8 q12, q8, q1 ; u = clamp(qs1 - vp9_filter)
-
- veor q6, q11, q0 ; *op0 = u^0x80
- veor q7, q10, q0 ; *oq0 = u^0x80
- veor q5, q13, q0 ; *op1 = u^0x80
- veor q8, q12, q0 ; *oq1 = u^0x80
-
- bx lr
-    ENDP        ; |vp9_loop_filter_neon|
-
-;-----------------
-
- END
--- a/vp8/common/arm/neon/loopfiltersimplehorizontaledge_neon.asm
+++ /dev/null
@@ -1,117 +1,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
- ;EXPORT |vp9_loop_filter_simple_horizontal_edge_neon|
- EXPORT |vp9_loop_filter_bhs_neon|
- EXPORT |vp9_loop_filter_mbhs_neon|
- ARM
- PRESERVE8
-
- AREA ||.text||, CODE, READONLY, ALIGN=2
-
-; r0 unsigned char *s, PRESERVE
-; r1 int p, PRESERVE
-; q1 limit, PRESERVE
-
-|vp9_loop_filter_simple_horizontal_edge_neon| PROC
-
- sub r3, r0, r1, lsl #1 ; move src pointer down by 2 lines
-
- vld1.u8 {q7}, [r0@128], r1 ; q0
- vld1.u8 {q5}, [r3@128], r1 ; p0
- vld1.u8 {q8}, [r0@128] ; q1
- vld1.u8 {q6}, [r3@128] ; p1
-
- vabd.u8 q15, q6, q7 ; abs(p0 - q0)
- vabd.u8 q14, q5, q8 ; abs(p1 - q1)
-
- vqadd.u8 q15, q15, q15 ; abs(p0 - q0) * 2
- vshr.u8 q14, q14, #1 ; abs(p1 - q1) / 2
- vmov.u8 q0, #0x80 ; 0x80
- vmov.s16 q13, #3
- vqadd.u8 q15, q15, q14 ; abs(p0 - q0) * 2 + abs(p1 - q1) / 2
-
- veor q7, q7, q0 ; qs0: q0 offset to convert to a signed value
- veor q6, q6, q0 ; ps0: p0 offset to convert to a signed value
- veor q5, q5, q0 ; ps1: p1 offset to convert to a signed value
- veor q8, q8, q0 ; qs1: q1 offset to convert to a signed value
-
- vcge.u8 q15, q1, q15 ; (abs(p0 - q0)*2 + abs(p1-q1)/2 > limit)*-1
-
- vsubl.s8 q2, d14, d12 ; ( qs0 - ps0)
- vsubl.s8 q3, d15, d13
-
- vqsub.s8 q4, q5, q8 ; q4: vp9_filter = vp9_signed_char_clamp(ps1-qs1)
-
- vmul.s16 q2, q2, q13 ; 3 * ( qs0 - ps0)
- vmul.s16 q3, q3, q13
-
- vmov.u8 q10, #0x03 ; 0x03
- vmov.u8 q9, #0x04 ; 0x04
-
- vaddw.s8 q2, q2, d8 ; vp9_filter + 3 * ( qs0 - ps0)
- vaddw.s8 q3, q3, d9
-
- vqmovn.s16 d8, q2 ; vp9_filter = vp9_signed_char_clamp(vp9_filter + 3 * ( qs0 - ps0))
- vqmovn.s16 d9, q3
-
- vand q14, q4, q15 ; vp9_filter &= mask
-
- vqadd.s8 q2, q14, q10 ; Filter2 = vp9_signed_char_clamp(vp9_filter+3)
- vqadd.s8 q3, q14, q9 ; Filter1 = vp9_signed_char_clamp(vp9_filter+4)
- vshr.s8 q2, q2, #3 ; Filter2 >>= 3
- vshr.s8 q4, q3, #3 ; Filter1 >>= 3
-
- sub r0, r0, r1
-
- ;calculate output
- vqadd.s8 q11, q6, q2 ; u = vp9_signed_char_clamp(ps0 + Filter2)
- vqsub.s8 q10, q7, q4 ; u = vp9_signed_char_clamp(qs0 - Filter1)
-
- veor q6, q11, q0 ; *op0 = u^0x80
- veor q7, q10, q0 ; *oq0 = u^0x80
-
- vst1.u8 {q6}, [r3@128] ; store op0
- vst1.u8 {q7}, [r0@128] ; store oq0
-
- bx lr
- ENDP ; |vp9_loop_filter_simple_horizontal_edge_neon|
-
-; r0 unsigned char *y
-; r1 int ystride
-; r2 const unsigned char *blimit
-
-|vp9_loop_filter_bhs_neon| PROC
- push {r4, lr}
- ldrb r3, [r2] ; load blim from mem
- vdup.s8 q1, r3 ; duplicate blim
-
- add r0, r0, r1, lsl #2 ; src = y_ptr + 4 * y_stride
- bl vp9_loop_filter_simple_horizontal_edge_neon
- ; vp9_loop_filter_simple_horizontal_edge_neon preserves r0, r1 and q1
- add r0, r0, r1, lsl #2 ; src = y_ptr + 8* y_stride
- bl vp9_loop_filter_simple_horizontal_edge_neon
- add r0, r0, r1, lsl #2 ; src = y_ptr + 12 * y_stride
- pop {r4, lr}
- b vp9_loop_filter_simple_horizontal_edge_neon
- ENDP ;|vp9_loop_filter_bhs_neon|
-
-; r0 unsigned char *y
-; r1 int ystride
-; r2 const unsigned char *blimit
-
-|vp9_loop_filter_mbhs_neon| PROC
- ldrb r3, [r2] ; load blim from mem
- vdup.s8 q1, r3 ; duplicate mblim
- b vp9_loop_filter_simple_horizontal_edge_neon
-    ENDP        ;|vp9_loop_filter_mbhs_neon|
-
- END
--- a/vp8/common/arm/neon/loopfiltersimpleverticaledge_neon.asm
+++ /dev/null
@@ -1,154 +1,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
- ;EXPORT |vp9_loop_filter_simple_vertical_edge_neon|
- EXPORT |vp9_loop_filter_bvs_neon|
- EXPORT |vp9_loop_filter_mbvs_neon|
- ARM
- PRESERVE8
-
- AREA ||.text||, CODE, READONLY, ALIGN=2
-
-; r0 unsigned char *s, PRESERVE
-; r1 int p, PRESERVE
-; q1 limit, PRESERVE
-
-|vp9_loop_filter_simple_vertical_edge_neon| PROC
- sub r0, r0, #2 ; move src pointer down by 2 columns
- add r12, r1, r1
- add r3, r0, r1
-
- vld4.8 {d6[0], d7[0], d8[0], d9[0]}, [r0], r12
- vld4.8 {d6[1], d7[1], d8[1], d9[1]}, [r3], r12
- vld4.8 {d6[2], d7[2], d8[2], d9[2]}, [r0], r12
- vld4.8 {d6[3], d7[3], d8[3], d9[3]}, [r3], r12
- vld4.8 {d6[4], d7[4], d8[4], d9[4]}, [r0], r12
- vld4.8 {d6[5], d7[5], d8[5], d9[5]}, [r3], r12
- vld4.8 {d6[6], d7[6], d8[6], d9[6]}, [r0], r12
- vld4.8 {d6[7], d7[7], d8[7], d9[7]}, [r3], r12
-
- vld4.8 {d10[0], d11[0], d12[0], d13[0]}, [r0], r12
- vld4.8 {d10[1], d11[1], d12[1], d13[1]}, [r3], r12
- vld4.8 {d10[2], d11[2], d12[2], d13[2]}, [r0], r12
- vld4.8 {d10[3], d11[3], d12[3], d13[3]}, [r3], r12
- vld4.8 {d10[4], d11[4], d12[4], d13[4]}, [r0], r12
- vld4.8 {d10[5], d11[5], d12[5], d13[5]}, [r3], r12
- vld4.8 {d10[6], d11[6], d12[6], d13[6]}, [r0], r12
- vld4.8 {d10[7], d11[7], d12[7], d13[7]}, [r3]
-
- vswp d7, d10
- vswp d12, d9
-
- ;vp9_filter_mask() function
- ;vp8_hevmask() function
- sub r0, r0, r1, lsl #4
- vabd.u8 q15, q5, q4 ; abs(p0 - q0)
- vabd.u8 q14, q3, q6 ; abs(p1 - q1)
-
- vqadd.u8 q15, q15, q15 ; abs(p0 - q0) * 2
- vshr.u8 q14, q14, #1 ; abs(p1 - q1) / 2
- vmov.u8 q0, #0x80 ; 0x80
- vmov.s16 q11, #3
- vqadd.u8 q15, q15, q14 ; abs(p0 - q0) * 2 + abs(p1 - q1) / 2
-
- veor q4, q4, q0 ; qs0: q0 offset to convert to a signed value
- veor q5, q5, q0 ; ps0: p0 offset to convert to a signed value
- veor q3, q3, q0 ; ps1: p1 offset to convert to a signed value
- veor q6, q6, q0 ; qs1: q1 offset to convert to a signed value
-
- vcge.u8 q15, q1, q15 ; abs(p0 - q0)*2 + abs(p1-q1)/2 > flimit*2 + limit)*-1
-
- vsubl.s8 q2, d8, d10 ; ( qs0 - ps0)
- vsubl.s8 q13, d9, d11
-
- vqsub.s8 q14, q3, q6 ; vp9_filter = vp9_signed_char_clamp(ps1-qs1)
-
- vmul.s16 q2, q2, q11 ; 3 * ( qs0 - ps0)
- vmul.s16 q13, q13, q11
-
- vmov.u8 q11, #0x03 ; 0x03
- vmov.u8 q12, #0x04 ; 0x04
-
- vaddw.s8 q2, q2, d28 ; vp9_filter + 3 * ( qs0 - ps0)
- vaddw.s8 q13, q13, d29
-
- vqmovn.s16 d28, q2 ; vp9_filter = vp9_signed_char_clamp(vp9_filter + 3 * ( qs0 - ps0))
- vqmovn.s16 d29, q13
-
- add r0, r0, #1
- add r3, r0, r1
-
- vand q14, q14, q15 ; vp9_filter &= mask
-
- vqadd.s8 q2, q14, q11 ; Filter2 = vp9_signed_char_clamp(vp9_filter+3)
- vqadd.s8 q3, q14, q12 ; Filter1 = vp9_signed_char_clamp(vp9_filter+4)
- vshr.s8 q2, q2, #3 ; Filter2 >>= 3
- vshr.s8 q14, q3, #3 ; Filter1 >>= 3
-
- ;calculate output
- vqadd.s8 q11, q5, q2 ; u = vp9_signed_char_clamp(ps0 + Filter2)
- vqsub.s8 q10, q4, q14 ; u = vp9_signed_char_clamp(qs0 - Filter1)
-
- veor q6, q11, q0 ; *op0 = u^0x80
- veor q7, q10, q0 ; *oq0 = u^0x80
- add r12, r1, r1
- vswp d13, d14
-
- ;store op1, op0, oq0, oq1
- vst2.8 {d12[0], d13[0]}, [r0], r12
- vst2.8 {d12[1], d13[1]}, [r3], r12
- vst2.8 {d12[2], d13[2]}, [r0], r12
- vst2.8 {d12[3], d13[3]}, [r3], r12
- vst2.8 {d12[4], d13[4]}, [r0], r12
- vst2.8 {d12[5], d13[5]}, [r3], r12
- vst2.8 {d12[6], d13[6]}, [r0], r12
- vst2.8 {d12[7], d13[7]}, [r3], r12
- vst2.8 {d14[0], d15[0]}, [r0], r12
- vst2.8 {d14[1], d15[1]}, [r3], r12
- vst2.8 {d14[2], d15[2]}, [r0], r12
- vst2.8 {d14[3], d15[3]}, [r3], r12
- vst2.8 {d14[4], d15[4]}, [r0], r12
- vst2.8 {d14[5], d15[5]}, [r3], r12
- vst2.8 {d14[6], d15[6]}, [r0], r12
- vst2.8 {d14[7], d15[7]}, [r3]
-
- bx lr
- ENDP ; |vp9_loop_filter_simple_vertical_edge_neon|
-
-; r0 unsigned char *y
-; r1 int ystride
-; r2 const unsigned char *blimit
-
-|vp9_loop_filter_bvs_neon| PROC
- push {r4, lr}
- ldrb r3, [r2] ; load blim from mem
- mov r4, r0
- add r0, r0, #4
- vdup.s8 q1, r3 ; duplicate blim
- bl vp9_loop_filter_simple_vertical_edge_neon
- ; vp9_loop_filter_simple_vertical_edge_neon preserves r1 and q1
- add r0, r4, #8
- bl vp9_loop_filter_simple_vertical_edge_neon
- add r0, r4, #12
- pop {r4, lr}
- b vp9_loop_filter_simple_vertical_edge_neon
- ENDP ;|vp9_loop_filter_bvs_neon|
-
-; r0 unsigned char *y
-; r1 int ystride
-; r2 const unsigned char *blimit
-
-|vp9_loop_filter_mbvs_neon| PROC
- ldrb r3, [r2] ; load mblim from mem
- vdup.s8 q1, r3 ; duplicate mblim
- b vp9_loop_filter_simple_vertical_edge_neon
-    ENDP        ;|vp9_loop_filter_mbvs_neon|
- END
--- a/vp8/common/arm/neon/mbloopfilter_neon.asm
+++ /dev/null
@@ -1,469 +1,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
- EXPORT |vp8_mbloop_filter_horizontal_edge_y_neon|
- EXPORT |vp8_mbloop_filter_horizontal_edge_uv_neon|
- EXPORT |vp8_mbloop_filter_vertical_edge_y_neon|
- EXPORT |vp8_mbloop_filter_vertical_edge_uv_neon|
- ARM
-
- AREA ||.text||, CODE, READONLY, ALIGN=2
-
-; void vp8_mbloop_filter_horizontal_edge_y_neon(unsigned char *src, int pitch,
-; const unsigned char *blimit,
-; const unsigned char *limit,
-; const unsigned char *thresh)
-; r0 unsigned char *src,
-; r1 int pitch,
-; r2 unsigned char blimit
-; r3 unsigned char limit
-; sp unsigned char thresh,
-|vp8_mbloop_filter_horizontal_edge_y_neon| PROC
- push {lr}
- add r1, r1, r1 ; double stride
- ldr r12, [sp, #4] ; load thresh
- sub r0, r0, r1, lsl #1 ; move src pointer down by 4 lines
- vdup.u8 q2, r12 ; thresh
- add r12, r0, r1, lsr #1 ; move src pointer up by 1 line
-
- vld1.u8 {q3}, [r0@128], r1 ; p3
- vld1.u8 {q4}, [r12@128], r1 ; p2
- vld1.u8 {q5}, [r0@128], r1 ; p1
- vld1.u8 {q6}, [r12@128], r1 ; p0
- vld1.u8 {q7}, [r0@128], r1 ; q0
- vld1.u8 {q8}, [r12@128], r1 ; q1
- vld1.u8 {q9}, [r0@128], r1 ; q2
- vld1.u8 {q10}, [r12@128], r1 ; q3
-
- bl vp8_mbloop_filter_neon
-
- sub r12, r12, r1, lsl #2
- add r0, r12, r1, lsr #1
-
- vst1.u8 {q4}, [r12@128],r1 ; store op2
- vst1.u8 {q5}, [r0@128],r1 ; store op1
- vst1.u8 {q6}, [r12@128], r1 ; store op0
- vst1.u8 {q7}, [r0@128],r1 ; store oq0
- vst1.u8 {q8}, [r12@128] ; store oq1
- vst1.u8 {q9}, [r0@128] ; store oq2
-
- pop {pc}
- ENDP ; |vp8_mbloop_filter_horizontal_edge_y_neon|
-
-; void vp8_mbloop_filter_horizontal_edge_uv_neon(unsigned char *u, int pitch,
-; const unsigned char *blimit,
-; const unsigned char *limit,
-; const unsigned char *thresh,
-; unsigned char *v)
-; r0 unsigned char *u,
-; r1 int pitch,
-; r2 unsigned char blimit
-; r3 unsigned char limit
-; sp unsigned char thresh,
-; sp+4 unsigned char *v
-
-|vp8_mbloop_filter_horizontal_edge_uv_neon| PROC
- push {lr}
- ldr r12, [sp, #4] ; load thresh
- sub r0, r0, r1, lsl #2 ; move u pointer down by 4 lines
- vdup.u8 q2, r12 ; thresh
- ldr r12, [sp, #8] ; load v ptr
- sub r12, r12, r1, lsl #2 ; move v pointer down by 4 lines
-
- vld1.u8 {d6}, [r0@64], r1 ; p3
- vld1.u8 {d7}, [r12@64], r1 ; p3
- vld1.u8 {d8}, [r0@64], r1 ; p2
- vld1.u8 {d9}, [r12@64], r1 ; p2
- vld1.u8 {d10}, [r0@64], r1 ; p1
- vld1.u8 {d11}, [r12@64], r1 ; p1
- vld1.u8 {d12}, [r0@64], r1 ; p0
- vld1.u8 {d13}, [r12@64], r1 ; p0
- vld1.u8 {d14}, [r0@64], r1 ; q0
- vld1.u8 {d15}, [r12@64], r1 ; q0
- vld1.u8 {d16}, [r0@64], r1 ; q1
- vld1.u8 {d17}, [r12@64], r1 ; q1
- vld1.u8 {d18}, [r0@64], r1 ; q2
- vld1.u8 {d19}, [r12@64], r1 ; q2
- vld1.u8 {d20}, [r0@64], r1 ; q3
- vld1.u8 {d21}, [r12@64], r1 ; q3
-
- bl vp8_mbloop_filter_neon
-
- sub r0, r0, r1, lsl #3
- sub r12, r12, r1, lsl #3
-
- add r0, r0, r1
- add r12, r12, r1
-
- vst1.u8 {d8}, [r0@64], r1 ; store u op2
- vst1.u8 {d9}, [r12@64], r1 ; store v op2
- vst1.u8 {d10}, [r0@64], r1 ; store u op1
- vst1.u8 {d11}, [r12@64], r1 ; store v op1
- vst1.u8 {d12}, [r0@64], r1 ; store u op0
- vst1.u8 {d13}, [r12@64], r1 ; store v op0
- vst1.u8 {d14}, [r0@64], r1 ; store u oq0
- vst1.u8 {d15}, [r12@64], r1 ; store v oq0
- vst1.u8 {d16}, [r0@64], r1 ; store u oq1
- vst1.u8 {d17}, [r12@64], r1 ; store v oq1
- vst1.u8 {d18}, [r0@64], r1 ; store u oq2
- vst1.u8 {d19}, [r12@64], r1 ; store v oq2
-
- pop {pc}
- ENDP ; |vp8_mbloop_filter_horizontal_edge_uv_neon|
-
-; void vp8_mbloop_filter_vertical_edge_y_neon(unsigned char *src, int pitch,
-; const unsigned char *blimit,
-; const unsigned char *limit,
-; const unsigned char *thresh)
-; r0 unsigned char *src,
-; r1 int pitch,
-; r2 unsigned char blimit
-; r3 unsigned char limit
-; sp unsigned char thresh,
-|vp8_mbloop_filter_vertical_edge_y_neon| PROC
- push {lr}
- ldr r12, [sp, #4] ; load thresh
- sub r0, r0, #4 ; move src pointer down by 4 columns
- vdup.s8 q2, r12 ; thresh
- add r12, r0, r1, lsl #3 ; move src pointer down by 8 lines
-
- vld1.u8 {d6}, [r0], r1 ; load first 8-line src data
- vld1.u8 {d7}, [r12], r1 ; load second 8-line src data
- vld1.u8 {d8}, [r0], r1
- vld1.u8 {d9}, [r12], r1
- vld1.u8 {d10}, [r0], r1
- vld1.u8 {d11}, [r12], r1
- vld1.u8 {d12}, [r0], r1
- vld1.u8 {d13}, [r12], r1
- vld1.u8 {d14}, [r0], r1
- vld1.u8 {d15}, [r12], r1
- vld1.u8 {d16}, [r0], r1
- vld1.u8 {d17}, [r12], r1
- vld1.u8 {d18}, [r0], r1
- vld1.u8 {d19}, [r12], r1
- vld1.u8 {d20}, [r0], r1
- vld1.u8 {d21}, [r12], r1
-
- ;transpose to 8x16 matrix
- vtrn.32 q3, q7
- vtrn.32 q4, q8
- vtrn.32 q5, q9
- vtrn.32 q6, q10
-
- vtrn.16 q3, q5
- vtrn.16 q4, q6
- vtrn.16 q7, q9
- vtrn.16 q8, q10
-
- vtrn.8 q3, q4
- vtrn.8 q5, q6
- vtrn.8 q7, q8
- vtrn.8 q9, q10
-
- sub r0, r0, r1, lsl #3
-
- bl vp8_mbloop_filter_neon
-
- sub r12, r12, r1, lsl #3
-
- ;transpose to 16x8 matrix
- vtrn.32 q3, q7
- vtrn.32 q4, q8
- vtrn.32 q5, q9
- vtrn.32 q6, q10
-
- vtrn.16 q3, q5
- vtrn.16 q4, q6
- vtrn.16 q7, q9
- vtrn.16 q8, q10
-
- vtrn.8 q3, q4
- vtrn.8 q5, q6
- vtrn.8 q7, q8
- vtrn.8 q9, q10
-
- ;store op2, op1, op0, oq0, oq1, oq2
- vst1.8 {d6}, [r0], r1
- vst1.8 {d7}, [r12], r1
- vst1.8 {d8}, [r0], r1
- vst1.8 {d9}, [r12], r1
- vst1.8 {d10}, [r0], r1
- vst1.8 {d11}, [r12], r1
- vst1.8 {d12}, [r0], r1
- vst1.8 {d13}, [r12], r1
- vst1.8 {d14}, [r0], r1
- vst1.8 {d15}, [r12], r1
- vst1.8 {d16}, [r0], r1
- vst1.8 {d17}, [r12], r1
- vst1.8 {d18}, [r0], r1
- vst1.8 {d19}, [r12], r1
- vst1.8 {d20}, [r0]
- vst1.8 {d21}, [r12]
-
- pop {pc}
- ENDP ; |vp8_mbloop_filter_vertical_edge_y_neon|
-
-; void vp8_mbloop_filter_vertical_edge_uv_neon(unsigned char *u, int pitch,
-; const unsigned char *blimit,
-; const unsigned char *limit,
-; const unsigned char *thresh,
-; unsigned char *v)
-; r0 unsigned char *u,
-; r1 int pitch,
-; r2 const signed char *flimit,
-; r3 const signed char *limit,
-; sp const signed char *thresh,
-; sp+4 unsigned char *v
-|vp8_mbloop_filter_vertical_edge_uv_neon| PROC
- push {lr}
- ldr r12, [sp, #4] ; load thresh
- sub r0, r0, #4 ; move u pointer down by 4 columns
- vdup.u8 q2, r12 ; thresh
- ldr r12, [sp, #8] ; load v ptr
- sub r12, r12, #4 ; move v pointer down by 4 columns
-
- vld1.u8 {d6}, [r0], r1 ;load u data
- vld1.u8 {d7}, [r12], r1 ;load v data
- vld1.u8 {d8}, [r0], r1
- vld1.u8 {d9}, [r12], r1
- vld1.u8 {d10}, [r0], r1
- vld1.u8 {d11}, [r12], r1
- vld1.u8 {d12}, [r0], r1
- vld1.u8 {d13}, [r12], r1
- vld1.u8 {d14}, [r0], r1
- vld1.u8 {d15}, [r12], r1
- vld1.u8 {d16}, [r0], r1
- vld1.u8 {d17}, [r12], r1
- vld1.u8 {d18}, [r0], r1
- vld1.u8 {d19}, [r12], r1
- vld1.u8 {d20}, [r0], r1
- vld1.u8 {d21}, [r12], r1
-
- ;transpose to 8x16 matrix
- vtrn.32 q3, q7
- vtrn.32 q4, q8
- vtrn.32 q5, q9
- vtrn.32 q6, q10
-
- vtrn.16 q3, q5
- vtrn.16 q4, q6
- vtrn.16 q7, q9
- vtrn.16 q8, q10
-
- vtrn.8 q3, q4
- vtrn.8 q5, q6
- vtrn.8 q7, q8
- vtrn.8 q9, q10
-
- sub r0, r0, r1, lsl #3
-
- bl vp8_mbloop_filter_neon
-
- sub r12, r12, r1, lsl #3
-
- ;transpose to 16x8 matrix
- vtrn.32 q3, q7
- vtrn.32 q4, q8
- vtrn.32 q5, q9
- vtrn.32 q6, q10
-
- vtrn.16 q3, q5
- vtrn.16 q4, q6
- vtrn.16 q7, q9
- vtrn.16 q8, q10
-
- vtrn.8 q3, q4
- vtrn.8 q5, q6
- vtrn.8 q7, q8
- vtrn.8 q9, q10
-
- ;store op2, op1, op0, oq0, oq1, oq2
- vst1.8 {d6}, [r0], r1
- vst1.8 {d7}, [r12], r1
- vst1.8 {d8}, [r0], r1
- vst1.8 {d9}, [r12], r1
- vst1.8 {d10}, [r0], r1
- vst1.8 {d11}, [r12], r1
- vst1.8 {d12}, [r0], r1
- vst1.8 {d13}, [r12], r1
- vst1.8 {d14}, [r0], r1
- vst1.8 {d15}, [r12], r1
- vst1.8 {d16}, [r0], r1
- vst1.8 {d17}, [r12], r1
- vst1.8 {d18}, [r0], r1
- vst1.8 {d19}, [r12], r1
- vst1.8 {d20}, [r0]
- vst1.8 {d21}, [r12]
-
- pop {pc}
- ENDP ; |vp8_mbloop_filter_vertical_edge_uv_neon|
-
-; void vp8_mbloop_filter_neon()
-; This is a helper function for the macroblock loopfilters. The individual
-; functions do the necessary load, transpose (if necessary), preserve (if
-; necessary) and store.
-
-; r0,r1 PRESERVE
-; r2 mblimit
-; r3 limit
-
-; q2 thresh
-; q3 p3 PRESERVE
-; q4 p2
-; q5 p1
-; q6 p0
-; q7 q0
-; q8 q1
-; q9 q2
-; q10 q3 PRESERVE
-
-|vp8_mbloop_filter_neon| PROC
-
- ; vp9_filter_mask
- vabd.u8 q11, q3, q4 ; abs(p3 - p2)
- vabd.u8 q12, q4, q5 ; abs(p2 - p1)
- vabd.u8 q13, q5, q6 ; abs(p1 - p0)
- vabd.u8 q14, q8, q7 ; abs(q1 - q0)
- vabd.u8 q1, q9, q8 ; abs(q2 - q1)
- vabd.u8 q0, q10, q9 ; abs(q3 - q2)
-
- vmax.u8 q11, q11, q12
- vmax.u8 q12, q13, q14
- vmax.u8 q1, q1, q0
- vmax.u8 q15, q11, q12
-
- vabd.u8 q12, q6, q7 ; abs(p0 - q0)
-
- ; vp8_hevmask
- vcgt.u8 q13, q13, q2 ; (abs(p1 - p0) > thresh) * -1
- vcgt.u8 q14, q14, q2 ; (abs(q1 - q0) > thresh) * -1
- vmax.u8 q15, q15, q1
-
- vdup.u8 q1, r3 ; limit
- vdup.u8 q2, r2 ; mblimit
-
- vmov.u8 q0, #0x80 ; 0x80
-
- vcge.u8 q15, q1, q15
-
- vabd.u8 q1, q5, q8 ; a = abs(p1 - q1)
- vqadd.u8 q12, q12, q12 ; b = abs(p0 - q0) * 2
- vmov.u16 q11, #3 ; #3
-
- ; vp9_filter
- ; convert to signed
- veor q7, q7, q0 ; qs0
- vshr.u8 q1, q1, #1 ; a = a / 2
- veor q6, q6, q0 ; ps0
- veor q5, q5, q0 ; ps1
-
- vqadd.u8 q12, q12, q1 ; a = b + a
-
- veor q8, q8, q0 ; qs1
- veor q4, q4, q0 ; ps2
- veor q9, q9, q0 ; qs2
-
- vorr q14, q13, q14 ; vp8_hevmask
-
- vcge.u8 q12, q2, q12 ; (a > flimit * 2 + limit) * -1
-
- vsubl.s8 q2, d14, d12 ; qs0 - ps0
- vsubl.s8 q13, d15, d13
-
- vqsub.s8 q1, q5, q8 ; vp9_filter = clamp(ps1-qs1)
-
- vmul.i16 q2, q2, q11 ; 3 * ( qs0 - ps0)
-
- vand q15, q15, q12 ; vp9_filter_mask
-
- vmul.i16 q13, q13, q11
-
- vmov.u8 q12, #3 ; #3
-
- vaddw.s8 q2, q2, d2 ; vp9_filter + 3 * ( qs0 - ps0)
- vaddw.s8 q13, q13, d3
-
- vmov.u8 q11, #4 ; #4
-
- ; vp9_filter = clamp(vp9_filter + 3 * ( qs0 - ps0))
- vqmovn.s16 d2, q2
- vqmovn.s16 d3, q13
-
- vand q1, q1, q15 ; vp9_filter &= mask
-
- vmov.u16 q15, #63 ; #63
-
- vand q13, q1, q14 ; Filter2 &= hev
-
- vqadd.s8 q2, q13, q11 ; Filter1 = clamp(Filter2+4)
- vqadd.s8 q13, q13, q12 ; Filter2 = clamp(Filter2+3)
-
- vmov q0, q15
-
- vshr.s8 q2, q2, #3 ; Filter1 >>= 3
- vshr.s8 q13, q13, #3 ; Filter2 >>= 3
-
- vmov q11, q15
- vmov q12, q15
-
- vqsub.s8 q7, q7, q2 ; qs0 = clamp(qs0 - Filter1)
-
- vqadd.s8 q6, q6, q13 ; ps0 = clamp(ps0 + Filter2)
-
- vbic q1, q1, q14 ; vp9_filter &= ~hev
-
- ; roughly 1/7th difference across boundary
- ; roughly 2/7th difference across boundary
- ; roughly 3/7th difference across boundary
-
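One way to read these fractions: each u term is added on one side of the edge and subtracted on the other, so the difference across that pixel pair changes by about 2u, i.e. 2*9/128, 2*18/128 and 2*27/128 of the filter value, roughly 1/7, 2/7 and 3/7.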
- vmov.u8 d5, #9 ; #9
- vmov.u8 d4, #18 ; #18
-
- vmov q13, q15
- vmov q14, q15
-
- vmlal.s8 q0, d2, d5 ; 63 + Filter2 * 9
- vmlal.s8 q11, d3, d5
- vmov.u8 d5, #27 ; #27
- vmlal.s8 q12, d2, d4 ; 63 + Filter2 * 18
- vmlal.s8 q13, d3, d4
- vmlal.s8 q14, d2, d5 ; 63 + Filter2 * 27
- vmlal.s8 q15, d3, d5
-
- vqshrn.s16 d0, q0, #7 ; u = clamp((63 + Filter2 * 9)>>7)
- vqshrn.s16 d1, q11, #7
- vqshrn.s16 d24, q12, #7 ; u = clamp((63 + Filter2 * 18)>>7)
- vqshrn.s16 d25, q13, #7
- vqshrn.s16 d28, q14, #7 ; u = clamp((63 + Filter2 * 27)>>7)
- vqshrn.s16 d29, q15, #7
-
- vmov.u8 q1, #0x80 ; 0x80
-
- vqsub.s8 q11, q9, q0 ; s = clamp(qs2 - u)
- vqadd.s8 q0, q4, q0 ; s = clamp(ps2 + u)
- vqsub.s8 q13, q8, q12 ; s = clamp(qs1 - u)
- vqadd.s8 q12, q5, q12 ; s = clamp(ps1 + u)
- vqsub.s8 q15, q7, q14 ; s = clamp(qs0 - u)
- vqadd.s8 q14, q6, q14 ; s = clamp(ps0 + u)
-
- veor q9, q11, q1 ; *oq2 = s^0x80
- veor q4, q0, q1 ; *op2 = s^0x80
- veor q8, q13, q1 ; *oq1 = s^0x80
-    veor        q5, q12, q1                ; *op1 = s^0x80
- veor q7, q15, q1 ; *oq0 = s^0x80
- veor q6, q14, q1 ; *op0 = s^0x80
-
- bx lr
- ENDP ; |vp8_mbloop_filter_neon|
-
-;-----------------
-
- END
--- a/vp8/common/arm/neon/recon16x16mb_neon.asm
+++ /dev/null
@@ -1,131 +1,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
- EXPORT |vp8_recon16x16mb_neon|
- ARM
- REQUIRE8
- PRESERVE8
-
- AREA ||.text||, CODE, READONLY, ALIGN=2
-
-; r0 unsigned char *pred_ptr,
-; r1 short *diff_ptr,
-; r2 unsigned char *dst_ptr,
-; r3 int ystride,
-; stack unsigned char *udst_ptr,
-; stack unsigned char *vdst_ptr
-
-|vp8_recon16x16mb_neon| PROC
- mov r12, #4 ;loop counter for Y loop
-
-recon16x16mb_loop_y
- vld1.u8 {q12, q13}, [r0]! ;load data from pred_ptr
- vld1.16 {q8, q9}, [r1]! ;load data from diff_ptr
- vld1.u8 {q14, q15}, [r0]!
- vld1.16 {q10, q11}, [r1]!
-
- vmovl.u8 q0, d24 ;modify Pred data from 8 bits to 16 bits
- vmovl.u8 q1, d25
- vmovl.u8 q2, d26
- vmovl.u8 q3, d27
- vmovl.u8 q4, d28
- vmovl.u8 q5, d29
- vmovl.u8 q6, d30
- vld1.16 {q12, q13}, [r1]!
- vmovl.u8 q7, d31
- vld1.16 {q14, q15}, [r1]!
-
- pld [r0]
- pld [r1]
- pld [r1, #64]
-
- vadd.s16 q0, q0, q8 ;add Diff data and Pred data together
- vadd.s16 q1, q1, q9
- vadd.s16 q2, q2, q10
- vadd.s16 q3, q3, q11
- vadd.s16 q4, q4, q12
- vadd.s16 q5, q5, q13
- vadd.s16 q6, q6, q14
- vadd.s16 q7, q7, q15
-
- vqmovun.s16 d0, q0 ;CLAMP() saturation
- vqmovun.s16 d1, q1
- vqmovun.s16 d2, q2
- vqmovun.s16 d3, q3
- vqmovun.s16 d4, q4
- vqmovun.s16 d5, q5
- vst1.u8 {q0}, [r2], r3 ;store result
- vqmovun.s16 d6, q6
- vst1.u8 {q1}, [r2], r3
- vqmovun.s16 d7, q7
- vst1.u8 {q2}, [r2], r3
- subs r12, r12, #1
-
- moveq r12, #2 ;loop counter for UV loop
-
- vst1.u8 {q3}, [r2], r3
- bne recon16x16mb_loop_y
-
- mov r3, r3, lsr #1 ;uv_stride = ystride>>1
- ldr r2, [sp] ;load upred_ptr
-
-recon16x16mb_loop_uv
- vld1.u8 {q12, q13}, [r0]! ;load data from pred_ptr
- vld1.16 {q8, q9}, [r1]! ;load data from diff_ptr
- vld1.u8 {q14, q15}, [r0]!
- vld1.16 {q10, q11}, [r1]!
-
- vmovl.u8 q0, d24 ;modify Pred data from 8 bits to 16 bits
- vmovl.u8 q1, d25
- vmovl.u8 q2, d26
- vmovl.u8 q3, d27
- vmovl.u8 q4, d28
- vmovl.u8 q5, d29
- vmovl.u8 q6, d30
- vld1.16 {q12, q13}, [r1]!
- vmovl.u8 q7, d31
- vld1.16 {q14, q15}, [r1]!
-
- vadd.s16 q0, q0, q8 ;add Diff data and Pred data together
- vadd.s16 q1, q1, q9
- vadd.s16 q2, q2, q10
- vadd.s16 q3, q3, q11
- vadd.s16 q4, q4, q12
- vadd.s16 q5, q5, q13
- vadd.s16 q6, q6, q14
-
- vqmovun.s16 d0, q0 ;CLAMP() saturation
- vadd.s16 q7, q7, q15
- vqmovun.s16 d1, q1
- vqmovun.s16 d2, q2
- vqmovun.s16 d3, q3
- vst1.u8 {d0}, [r2], r3 ;store result
- vqmovun.s16 d4, q4
- vst1.u8 {d1}, [r2], r3
- vqmovun.s16 d5, q5
- vst1.u8 {d2}, [r2], r3
- vqmovun.s16 d6, q6
- vst1.u8 {d3}, [r2], r3
- vqmovun.s16 d7, q7
- vst1.u8 {d4}, [r2], r3
- subs r12, r12, #1
-
- vst1.u8 {d5}, [r2], r3
- vst1.u8 {d6}, [r2], r3
- vst1.u8 {d7}, [r2], r3
-
- ldrne r2, [sp, #4] ;load vpred_ptr
- bne recon16x16mb_loop_uv
-
- bx lr
-
- ENDP
- END
--- a/vp8/common/arm/neon/recon2b_neon.asm
+++ /dev/null
@@ -1,54 +1,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
- EXPORT |vp8_recon2b_neon|
- ARM
- REQUIRE8
- PRESERVE8
-
- AREA ||.text||, CODE, READONLY, ALIGN=2
-
-; r0 unsigned char *pred_ptr,
-; r1 short *diff_ptr,
-; r2 unsigned char *dst_ptr,
-; r3 int stride
-
-|vp8_recon2b_neon| PROC
- vld1.u8 {q8, q9}, [r0] ;load data from pred_ptr
- vld1.16 {q4, q5}, [r1]! ;load data from diff_ptr
-
- vmovl.u8 q0, d16 ;modify Pred data from 8 bits to 16 bits
- vld1.16 {q6, q7}, [r1]!
- vmovl.u8 q1, d17
- vmovl.u8 q2, d18
- vmovl.u8 q3, d19
-
- vadd.s16 q0, q0, q4 ;add Diff data and Pred data together
- vadd.s16 q1, q1, q5
- vadd.s16 q2, q2, q6
- vadd.s16 q3, q3, q7
-
- vqmovun.s16 d0, q0 ;CLAMP() saturation
- vqmovun.s16 d1, q1
- vqmovun.s16 d2, q2
- vqmovun.s16 d3, q3
- add r0, r2, r3
-
- vst1.u8 {d0}, [r2] ;store result
- vst1.u8 {d1}, [r0], r3
- add r2, r0, r3
- vst1.u8 {d2}, [r0]
- vst1.u8 {d3}, [r2], r3
-
- bx lr
-
- ENDP
- END
--- a/vp8/common/arm/neon/recon4b_neon.asm
+++ /dev/null
@@ -1,69 +1,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
- EXPORT |vp8_recon4b_neon|
- ARM
- REQUIRE8
- PRESERVE8
-
- AREA ||.text||, CODE, READONLY, ALIGN=2
-
-; r0 unsigned char *pred_ptr,
-; r1 short *diff_ptr,
-; r2 unsigned char *dst_ptr,
-; r3 int stride
-
-|vp8_recon4b_neon| PROC
- vld1.u8 {q12, q13}, [r0]! ;load data from pred_ptr
- vld1.16 {q8, q9}, [r1]! ;load data from diff_ptr
- vld1.u8 {q14, q15}, [r0]
- vld1.16 {q10, q11}, [r1]!
-
- vmovl.u8 q0, d24 ;modify Pred data from 8 bits to 16 bits
- vmovl.u8 q1, d25
- vmovl.u8 q2, d26
- vmovl.u8 q3, d27
- vmovl.u8 q4, d28
- vmovl.u8 q5, d29
- vmovl.u8 q6, d30
- vld1.16 {q12, q13}, [r1]!
- vmovl.u8 q7, d31
- vld1.16 {q14, q15}, [r1]
-
- vadd.s16 q0, q0, q8 ;add Diff data and Pred data together
- vadd.s16 q1, q1, q9
- vadd.s16 q2, q2, q10
- vadd.s16 q3, q3, q11
- vadd.s16 q4, q4, q12
- vadd.s16 q5, q5, q13
- vadd.s16 q6, q6, q14
- vadd.s16 q7, q7, q15
-
- vqmovun.s16 d0, q0 ;CLAMP() saturation
- vqmovun.s16 d1, q1
- vqmovun.s16 d2, q2
- vqmovun.s16 d3, q3
- vqmovun.s16 d4, q4
- vqmovun.s16 d5, q5
- vqmovun.s16 d6, q6
- vqmovun.s16 d7, q7
- add r0, r2, r3
-
- vst1.u8 {q0}, [r2] ;store result
- vst1.u8 {q1}, [r0], r3
- add r2, r0, r3
- vst1.u8 {q2}, [r0]
- vst1.u8 {q3}, [r2], r3
-
- bx lr
-
- ENDP
- END
--- a/vp8/common/arm/neon/recon_neon.c
+++ /dev/null
@@ -1,29 +1,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include "vpx_ports/config.h"
-#include "vp8/common/recon.h"
-#include "vp8/common/blockd.h"
-
-extern void vp8_recon16x16mb_neon(unsigned char *pred_ptr, short *diff_ptr, unsigned char *dst_ptr, int ystride, unsigned char *udst_ptr, unsigned char *vdst_ptr);
-
-void vp8_recon_mb_neon(const vp8_recon_rtcd_vtable_t *rtcd, MACROBLOCKD *xd) {
- unsigned char *pred_ptr = &xd->predictor[0];
- short *diff_ptr = &xd->diff[0];
- unsigned char *dst_ptr = xd->dst.y_buffer;
- unsigned char *udst_ptr = xd->dst.u_buffer;
- unsigned char *vdst_ptr = xd->dst.v_buffer;
- int ystride = xd->dst.y_stride;
- /*int uv_stride = xd->dst.uv_stride;*/
-
- vp8_recon16x16mb_neon(pred_ptr, diff_ptr, dst_ptr, ystride,
- udst_ptr, vdst_ptr);
-}
--- a/vp8/common/arm/neon/reconb_neon.asm
+++ /dev/null
@@ -1,61 +1,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
- EXPORT |vp8_recon_b_neon|
- ARM
- REQUIRE8
- PRESERVE8
-
- AREA ||.text||, CODE, READONLY, ALIGN=2
-
-; r0 unsigned char *pred_ptr,
-; r1 short *diff_ptr,
-; r2 unsigned char *dst_ptr,
-; r3 int stride
-
-|vp8_recon_b_neon| PROC
- mov r12, #16
-
- vld1.u8 {d28}, [r0], r12 ;load 4 data/line from pred_ptr
- vld1.16 {q10, q11}, [r1]! ;load data from diff_ptr
- vld1.u8 {d29}, [r0], r12
- vld1.16 {q11, q12}, [r1]!
- vld1.u8 {d30}, [r0], r12
- vld1.16 {q12, q13}, [r1]!
- vld1.u8 {d31}, [r0], r12
- vld1.16 {q13}, [r1]
-
- vmovl.u8 q0, d28 ;modify Pred data from 8 bits to 16 bits
- vmovl.u8 q1, d29 ;Pred data in d0, d2, d4, d6
- vmovl.u8 q2, d30
- vmovl.u8 q3, d31
-
- vadd.s16 d0, d0, d20 ;add Diff data and Pred data together
- vadd.s16 d2, d2, d22
- vadd.s16 d4, d4, d24
- vadd.s16 d6, d6, d26
-
- vqmovun.s16 d0, q0 ;CLAMP() saturation
- vqmovun.s16 d1, q1
- vqmovun.s16 d2, q2
- vqmovun.s16 d3, q3
- add r1, r2, r3
-
- vst1.32 {d0[0]}, [r2] ;store result
- vst1.32 {d1[0]}, [r1], r3
- add r2, r1, r3
- vst1.32 {d2[0]}, [r1]
- vst1.32 {d3[0]}, [r2], r3
-
- bx lr
-
- ENDP
- END
--- a/vp8/common/arm/neon/save_neon_reg.asm
+++ /dev/null
@@ -1,36 +1,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
- EXPORT |vp9_push_neon|
- EXPORT |vp9_pop_neon|
-
- ARM
- REQUIRE8
- PRESERVE8
-
- AREA ||.text||, CODE, READONLY, ALIGN=2
-
-|vp9_push_neon| PROC
- vst1.i64 {d8, d9, d10, d11}, [r0]!
- vst1.i64 {d12, d13, d14, d15}, [r0]!
- bx lr
-
- ENDP
-
-|vp9_pop_neon| PROC
- vld1.i64 {d8, d9, d10, d11}, [r0]!
- vld1.i64 {d12, d13, d14, d15}, [r0]!
- bx lr
-
- ENDP
-
- END
-
--- a/vp8/common/arm/neon/shortidct4x4llm_1_neon.asm
+++ /dev/null
@@ -1,67 +1,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
- EXPORT |vp8_short_idct4x4llm_1_neon|
- EXPORT |vp8_dc_only_idct_neon|
- ARM
- REQUIRE8
- PRESERVE8
-
- AREA ||.text||, CODE, READONLY, ALIGN=2
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;void vp8_short_idct4x4llm_1_c(short *input, short *output, int pitch);
-; r0 short *input;
-; r1 short *output;
-; r2 int pitch;
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-|vp8_short_idct4x4llm_1_neon| PROC
- vld1.16 {d0[]}, [r0] ;load input[0]
-
- add r3, r1, r2
- add r12, r3, r2
-
- vrshr.s16 d0, d0, #3
-
- add r0, r12, r2
-
- vst1.16 {d0}, [r1]
- vst1.16 {d0}, [r3]
- vst1.16 {d0}, [r12]
- vst1.16 {d0}, [r0]
-
- bx lr
- ENDP
-
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;void vp8_dc_only_idct_c(short input_dc, short *output, int pitch);
-; r0 short input_dc;
-; r1 short *output;
-; r2 int pitch;
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-|vp8_dc_only_idct_neon| PROC
- vdup.16 d0, r0
-
- add r3, r1, r2
- add r12, r3, r2
-
- vrshr.s16 d0, d0, #3
-
- add r0, r12, r2
-
- vst1.16 {d0}, [r1]
- vst1.16 {d0}, [r3]
- vst1.16 {d0}, [r12]
- vst1.16 {d0}, [r0]
-
- bx lr
-
- ENDP
- END
--- a/vp8/common/arm/neon/shortidct4x4llm_neon.asm
+++ /dev/null
@@ -1,122 +1,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
- EXPORT |vp8_short_idct4x4llm_neon|
- ARM
- REQUIRE8
- PRESERVE8
-
- AREA ||.text||, CODE, READONLY, ALIGN=2
-
-;*************************************************************
-;void vp8_short_idct4x4llm_c(short *input, short *output, int pitch)
-;r0 short * input
-;r1 short * output
-;r2 int pitch
-;*************************************************************
-;static const int cospi8sqrt2minus1=20091;
-;static const int sinpi8sqrt2 =35468;
-;static const int rounding = 0;
-;Optimization note: The data resulting from dequantization are signed 13-bit values in the
-;range [-4096, 4095]. This allows the "vqdmulh" (NEON) instruction to be used, since the
-;product won't go out of range (13+16+1=30 bits < 32 bits). This instruction returns the
-;high half of the multiplication result, which is what the IDCT needs.
-
-|vp8_short_idct4x4llm_neon| PROC
- adr r12, idct_coeff
- vld1.16 {q1, q2}, [r0]
- vld1.16 {d0}, [r12]
-
- vswp d3, d4 ;q2(vp[4] vp[12])
-
- vqdmulh.s16 q3, q2, d0[2]
- vqdmulh.s16 q4, q2, d0[0]
-
- vqadd.s16 d12, d2, d3 ;a1
- vqsub.s16 d13, d2, d3 ;b1
-
- vshr.s16 q3, q3, #1
- vshr.s16 q4, q4, #1
-
- vqadd.s16 q3, q3, q2 ;modify since sinpi8sqrt2 > 65536/2 (negative number)
- vqadd.s16 q4, q4, q2
-
- ;d6 - c1:temp1
- ;d7 - d1:temp2
- ;d8 - d1:temp1
- ;d9 - c1:temp2
-
- vqsub.s16 d10, d6, d9 ;c1
- vqadd.s16 d11, d7, d8 ;d1
-
- vqadd.s16 d2, d12, d11
- vqadd.s16 d3, d13, d10
- vqsub.s16 d4, d13, d10
- vqsub.s16 d5, d12, d11
-
- vtrn.32 d2, d4
- vtrn.32 d3, d5
- vtrn.16 d2, d3
- vtrn.16 d4, d5
-
- vswp d3, d4
-
- vqdmulh.s16 q3, q2, d0[2]
- vqdmulh.s16 q4, q2, d0[0]
-
- vqadd.s16 d12, d2, d3 ;a1
- vqsub.s16 d13, d2, d3 ;b1
-
- vshr.s16 q3, q3, #1
- vshr.s16 q4, q4, #1
-
- vqadd.s16 q3, q3, q2 ;modify since sinpi8sqrt2 > 65536/2 (negative number)
- vqadd.s16 q4, q4, q2
-
- vqsub.s16 d10, d6, d9 ;c1
- vqadd.s16 d11, d7, d8 ;d1
-
- vqadd.s16 d2, d12, d11
- vqadd.s16 d3, d13, d10
- vqsub.s16 d4, d13, d10
- vqsub.s16 d5, d12, d11
-
- vrshr.s16 d2, d2, #3
- vrshr.s16 d3, d3, #3
- vrshr.s16 d4, d4, #3
- vrshr.s16 d5, d5, #3
-
- add r3, r1, r2
- add r12, r3, r2
- add r0, r12, r2
-
- vtrn.32 d2, d4
- vtrn.32 d3, d5
- vtrn.16 d2, d3
- vtrn.16 d4, d5
-
- vst1.16 {d2}, [r1]
- vst1.16 {d3}, [r3]
- vst1.16 {d4}, [r12]
- vst1.16 {d5}, [r0]
-
- bx lr
-
- ENDP
-
-;-----------------
-
-idct_coeff
- DCD 0x4e7b4e7b, 0x8a8c8a8c
-
-;20091, 20091, 35468, 35468
-
- END
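The idct_coeff words above pack the two Q16 constants 20091 (cospi8sqrt2minus1) and 35468 (sinpi8sqrt2) as four halfwords. The following scalar sketch shows what the vqdmulh/vshr/vqadd sequences in this routine compute; the helper names are illustrative rather than part of the tree, and the sketch assumes 13-bit inputs as stated in the optimization note, plus arithmetic right shifts on negative values.

#include <stdint.h>

#define COSPI8SQRT2MINUS1 20091   /* (cos(pi/8)*sqrt(2) - 1) in Q16 */
#define SINPI8SQRT2       35468   /* sin(pi/8)*sqrt(2)       in Q16 */

/* NEON vqdmulh.s16: high half of the doubled product, (2*x*c) >> 16.
 * With |x| <= 4096 (13 bits) and a 16-bit constant, the doubled product
 * needs at most 13 + 16 + 1 = 30 bits, so it cannot overflow 32 bits. */
static int16_t vqdmulh_s16(int16_t x, int16_t c) {
  return (int16_t)(((int32_t)x * c * 2) >> 16);
}

/* x * cos(pi/8)*sqrt(2): 20091 fits in s16, so vqdmulh followed by the
 * vshr #1 yields (x*20091) >> 16, and the final vqadd with x adds the
 * implicit 1.0 back in. */
static int16_t mul_cospi8sqrt2(int16_t x) {
  return x + (int16_t)(vqdmulh_s16(x, COSPI8SQRT2MINUS1) >> 1);
}

/* x * sin(pi/8)*sqrt(2): 35468 does not fit in s16, so it is stored as
 * 35468 - 65536 = -30068 (0x8a8c).  vqdmulh then yields roughly
 * ((x*35468) >> 15) - 2*x, the vshr #1 leaves ((x*35468) >> 16) - x, and
 * the vqadd with x cancels the -x term -- the "sinpi8sqrt2 > 65536/2"
 * adjustment noted in the code above. */
static int16_t mul_sinpi8sqrt2(int16_t x) {
  return (int16_t)(vqdmulh_s16(x, (int16_t)(SINPI8SQRT2 - 65536)) >> 1) + x;
}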
--- a/vp8/common/arm/neon/sixtappredict16x16_neon.asm
+++ /dev/null
@@ -1,490 +1,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
- EXPORT |vp8_sixtap_predict16x16_neon|
- ARM
- REQUIRE8
- PRESERVE8
-
- AREA ||.text||, CODE, READONLY, ALIGN=2
-
-filter16_coeff
- DCD 0, 0, 128, 0, 0, 0, 0, 0
- DCD 0, -6, 123, 12, -1, 0, 0, 0
- DCD 2, -11, 108, 36, -8, 1, 0, 0
- DCD 0, -9, 93, 50, -6, 0, 0, 0
- DCD 3, -16, 77, 77, -16, 3, 0, 0
- DCD 0, -6, 50, 93, -9, 0, 0, 0
- DCD 1, -8, 36, 108, -11, 2, 0, 0
- DCD 0, -1, 12, 123, -6, 0, 0, 0
-
-; r0 unsigned char *src_ptr,
-; r1 int src_pixels_per_line,
-; r2 int xoffset,
-; r3 int yoffset,
-; r4 unsigned char *dst_ptr,
-; stack(r5) int dst_pitch
-
-;Note: To take advantage of the 8-bit multiply instructions in NEON, first apply abs() to
-; the filter coeffs to make them u8, then use vmlsl for the negative coeffs. After the
-; multiplication the result can be negative, so it is treated as s16. However, the result
-; can also be a large positive number (> 2^15-1), which could be mistaken for a negative
-; value. To avoid that error, the filter coeffs are applied in the order 0, 1, 4, 5, 2,
-; which keeps the running sum within s16 range; finally, the 3rd filter coeff is added
-; with saturation. The same applies to the other filter functions.
-
-|vp8_sixtap_predict16x16_neon| PROC
- push {r4-r5, lr}
-
- adr r12, filter16_coeff
- ldr r4, [sp, #12] ;load parameters from stack
- ldr r5, [sp, #16] ;load parameters from stack
-
- cmp r2, #0 ;skip first_pass filter if xoffset=0
- beq secondpass_filter16x16_only
-
- add r2, r12, r2, lsl #5 ;calculate filter location
-
- cmp r3, #0 ;skip second_pass filter if yoffset=0
-
- vld1.s32 {q14, q15}, [r2] ;load first_pass filter
-
- beq firstpass_filter16x16_only
-
- sub sp, sp, #336 ;reserve space on stack for temporary storage
- mov lr, sp
-
- vabs.s32 q12, q14
- vabs.s32 q13, q15
-
- mov r2, #7 ;loop counter
- sub r0, r0, #2 ;move srcptr back to (line-2) and (column-2)
- sub r0, r0, r1, lsl #1
-
- vdup.8 d0, d24[0] ;first_pass filter (d0-d5)
- vdup.8 d1, d24[4]
- vdup.8 d2, d25[0]
- vdup.8 d3, d25[4]
- vdup.8 d4, d26[0]
- vdup.8 d5, d26[4]
-
-;First Pass: output_height lines x output_width columns (21x16)
-filt_blk2d_fp16x16_loop_neon
- vld1.u8 {d6, d7, d8}, [r0], r1 ;load src data
- vld1.u8 {d9, d10, d11}, [r0], r1
- vld1.u8 {d12, d13, d14}, [r0], r1
-
- pld [r0]
- pld [r0, r1]
- pld [r0, r1, lsl #1]
-
- vmull.u8 q8, d6, d0 ;(src_ptr[-2] * vp9_filter[0])
- vmull.u8 q9, d7, d0
- vmull.u8 q10, d9, d0
- vmull.u8 q11, d10, d0
- vmull.u8 q12, d12, d0
- vmull.u8 q13, d13, d0
-
- vext.8 d28, d6, d7, #1 ;construct src_ptr[-1]
- vext.8 d29, d9, d10, #1
- vext.8 d30, d12, d13, #1
-
- vmlsl.u8 q8, d28, d1 ;-(src_ptr[-1] * vp9_filter[1])
- vmlsl.u8 q10, d29, d1
- vmlsl.u8 q12, d30, d1
-
- vext.8 d28, d7, d8, #1
- vext.8 d29, d10, d11, #1
- vext.8 d30, d13, d14, #1
-
- vmlsl.u8 q9, d28, d1 ;-(src_ptr[-1] * vp9_filter[1])
- vmlsl.u8 q11, d29, d1
- vmlsl.u8 q13, d30, d1
-
- vext.8 d28, d6, d7, #4 ;construct src_ptr[2]
- vext.8 d29, d9, d10, #4
- vext.8 d30, d12, d13, #4
-
- vmlsl.u8 q8, d28, d4 ;-(src_ptr[2] * vp9_filter[4])
- vmlsl.u8 q10, d29, d4
- vmlsl.u8 q12, d30, d4
-
- vext.8 d28, d7, d8, #4
- vext.8 d29, d10, d11, #4
- vext.8 d30, d13, d14, #4
-
- vmlsl.u8 q9, d28, d4 ;-(src_ptr[2] * vp9_filter[4])
- vmlsl.u8 q11, d29, d4
- vmlsl.u8 q13, d30, d4
-
- vext.8 d28, d6, d7, #5 ;construct src_ptr[3]
- vext.8 d29, d9, d10, #5
- vext.8 d30, d12, d13, #5
-
- vmlal.u8 q8, d28, d5 ;(src_ptr[3] * vp9_filter[5])
- vmlal.u8 q10, d29, d5
- vmlal.u8 q12, d30, d5
-
- vext.8 d28, d7, d8, #5
- vext.8 d29, d10, d11, #5
- vext.8 d30, d13, d14, #5
-
- vmlal.u8 q9, d28, d5 ;(src_ptr[3] * vp9_filter[5])
- vmlal.u8 q11, d29, d5
- vmlal.u8 q13, d30, d5
-
- vext.8 d28, d6, d7, #2 ;construct src_ptr[0]
- vext.8 d29, d9, d10, #2
- vext.8 d30, d12, d13, #2
-
- vmlal.u8 q8, d28, d2 ;(src_ptr[0] * vp9_filter[2])
- vmlal.u8 q10, d29, d2
- vmlal.u8 q12, d30, d2
-
- vext.8 d28, d7, d8, #2
- vext.8 d29, d10, d11, #2
- vext.8 d30, d13, d14, #2
-
- vmlal.u8 q9, d28, d2 ;(src_ptr[0] * vp9_filter[2])
- vmlal.u8 q11, d29, d2
- vmlal.u8 q13, d30, d2
-
- vext.8 d28, d6, d7, #3 ;construct src_ptr[1]
- vext.8 d29, d9, d10, #3
- vext.8 d30, d12, d13, #3
-
- vext.8 d15, d7, d8, #3
- vext.8 d31, d10, d11, #3
- vext.8 d6, d13, d14, #3
-
- vmull.u8 q4, d28, d3 ;(src_ptr[1] * vp9_filter[3])
- vmull.u8 q5, d29, d3
- vmull.u8 q6, d30, d3
-
- vqadd.s16 q8, q4 ;sum of all (src_data*filter_parameters)
- vqadd.s16 q10, q5
- vqadd.s16 q12, q6
-
- vmull.u8 q6, d15, d3 ;(src_ptr[1] * vp9_filter[3])
- vmull.u8 q7, d31, d3
- vmull.u8 q3, d6, d3
-
- subs r2, r2, #1
-
- vqadd.s16 q9, q6
- vqadd.s16 q11, q7
- vqadd.s16 q13, q3
-
- vqrshrun.s16 d6, q8, #7 ;shift/round/saturate to u8
- vqrshrun.s16 d7, q9, #7
- vqrshrun.s16 d8, q10, #7
- vqrshrun.s16 d9, q11, #7
- vqrshrun.s16 d10, q12, #7
- vqrshrun.s16 d11, q13, #7
-
- vst1.u8 {d6, d7, d8}, [lr]! ;store result
- vst1.u8 {d9, d10, d11}, [lr]!
-
- bne filt_blk2d_fp16x16_loop_neon
-
-;Second pass: 16x16
-;secondpass_filter - do first 8-columns and then second 8-columns
- add r3, r12, r3, lsl #5
- sub lr, lr, #336
-
- vld1.s32 {q5, q6}, [r3] ;load second_pass filter
- mov r3, #2 ;loop counter
-
- vabs.s32 q7, q5
- vabs.s32 q8, q6
-
- mov r2, #16
-
- vdup.8 d0, d14[0] ;second_pass filter parameters (d0-d5)
- vdup.8 d1, d14[4]
- vdup.8 d2, d15[0]
- vdup.8 d3, d15[4]
- vdup.8 d4, d16[0]
- vdup.8 d5, d16[4]
-
-filt_blk2d_sp16x16_outloop_neon
- vld1.u8 {d18}, [lr], r2 ;load src data
- vld1.u8 {d19}, [lr], r2
- vld1.u8 {d20}, [lr], r2
- vld1.u8 {d21}, [lr], r2
- mov r12, #4 ;loop counter
- vld1.u8 {d22}, [lr], r2
-
-secondpass_inner_loop_neon
- vld1.u8 {d23}, [lr], r2 ;load src data
- vld1.u8 {d24}, [lr], r2
- vld1.u8 {d25}, [lr], r2
- vld1.u8 {d26}, [lr], r2
-
- vmull.u8 q3, d18, d0 ;(src_ptr[-2] * vp9_filter[0])
- vmull.u8 q4, d19, d0
- vmull.u8 q5, d20, d0
- vmull.u8 q6, d21, d0
-
- vmlsl.u8 q3, d19, d1 ;-(src_ptr[-1] * vp9_filter[1])
- vmlsl.u8 q4, d20, d1
- vmlsl.u8 q5, d21, d1
- vmlsl.u8 q6, d22, d1
-
- vmlsl.u8 q3, d22, d4 ;-(src_ptr[2] * vp9_filter[4])
- vmlsl.u8 q4, d23, d4
- vmlsl.u8 q5, d24, d4
- vmlsl.u8 q6, d25, d4
-
- vmlal.u8 q3, d20, d2 ;(src_ptr[0] * vp9_filter[2])
- vmlal.u8 q4, d21, d2
- vmlal.u8 q5, d22, d2
- vmlal.u8 q6, d23, d2
-
- vmlal.u8 q3, d23, d5 ;(src_ptr[3] * vp9_filter[5])
- vmlal.u8 q4, d24, d5
- vmlal.u8 q5, d25, d5
- vmlal.u8 q6, d26, d5
-
- vmull.u8 q7, d21, d3 ;(src_ptr[1] * vp9_filter[3])
- vmull.u8 q8, d22, d3
- vmull.u8 q9, d23, d3
- vmull.u8 q10, d24, d3
-
- subs r12, r12, #1
-
- vqadd.s16 q7, q3 ;sum of all (src_data*filter_parameters)
- vqadd.s16 q8, q4
- vqadd.s16 q9, q5
- vqadd.s16 q10, q6
-
- vqrshrun.s16 d6, q7, #7 ;shift/round/saturate to u8
- vqrshrun.s16 d7, q8, #7
- vqrshrun.s16 d8, q9, #7
- vqrshrun.s16 d9, q10, #7
-
- vst1.u8 {d6}, [r4], r5 ;store result
- vmov q9, q11
- vst1.u8 {d7}, [r4], r5
- vmov q10, q12
- vst1.u8 {d8}, [r4], r5
- vmov d22, d26
- vst1.u8 {d9}, [r4], r5
-
- bne secondpass_inner_loop_neon
-
- subs r3, r3, #1
- sub lr, lr, #336
- add lr, lr, #8
-
- sub r4, r4, r5, lsl #4
- add r4, r4, #8
-
- bne filt_blk2d_sp16x16_outloop_neon
-
- add sp, sp, #336
- pop {r4-r5,pc}
-
-;--------------------
-firstpass_filter16x16_only
- vabs.s32 q12, q14
- vabs.s32 q13, q15
-
- mov r2, #8 ;loop counter
- sub r0, r0, #2 ;move srcptr back to (column-2)
-
- vdup.8 d0, d24[0] ;first_pass filter (d0-d5)
- vdup.8 d1, d24[4]
- vdup.8 d2, d25[0]
- vdup.8 d3, d25[4]
- vdup.8 d4, d26[0]
- vdup.8 d5, d26[4]
-
-;First Pass: output_height lines x output_width columns (16x16)
-filt_blk2d_fpo16x16_loop_neon
- vld1.u8 {d6, d7, d8}, [r0], r1 ;load src data
- vld1.u8 {d9, d10, d11}, [r0], r1
-
- pld [r0]
- pld [r0, r1]
-
- vmull.u8 q6, d6, d0 ;(src_ptr[-2] * vp9_filter[0])
- vmull.u8 q7, d7, d0
- vmull.u8 q8, d9, d0
- vmull.u8 q9, d10, d0
-
- vext.8 d20, d6, d7, #1 ;construct src_ptr[-1]
- vext.8 d21, d9, d10, #1
- vext.8 d22, d7, d8, #1
- vext.8 d23, d10, d11, #1
- vext.8 d24, d6, d7, #4 ;construct src_ptr[2]
- vext.8 d25, d9, d10, #4
- vext.8 d26, d7, d8, #4
- vext.8 d27, d10, d11, #4
- vext.8 d28, d6, d7, #5 ;construct src_ptr[3]
- vext.8 d29, d9, d10, #5
-
- vmlsl.u8 q6, d20, d1 ;-(src_ptr[-1] * vp9_filter[1])
- vmlsl.u8 q8, d21, d1
- vmlsl.u8 q7, d22, d1 ;-(src_ptr[-1] * vp9_filter[1])
- vmlsl.u8 q9, d23, d1
- vmlsl.u8 q6, d24, d4 ;-(src_ptr[2] * vp9_filter[4])
- vmlsl.u8 q8, d25, d4
- vmlsl.u8 q7, d26, d4 ;-(src_ptr[2] * vp9_filter[4])
- vmlsl.u8 q9, d27, d4
- vmlal.u8 q6, d28, d5 ;(src_ptr[3] * vp9_filter[5])
- vmlal.u8 q8, d29, d5
-
- vext.8 d20, d7, d8, #5
- vext.8 d21, d10, d11, #5
- vext.8 d22, d6, d7, #2 ;construct src_ptr[0]
- vext.8 d23, d9, d10, #2
- vext.8 d24, d7, d8, #2
- vext.8 d25, d10, d11, #2
-
- vext.8 d26, d6, d7, #3 ;construct src_ptr[1]
- vext.8 d27, d9, d10, #3
- vext.8 d28, d7, d8, #3
- vext.8 d29, d10, d11, #3
-
- vmlal.u8 q7, d20, d5 ;(src_ptr[3] * vp9_filter[5])
- vmlal.u8 q9, d21, d5
- vmlal.u8 q6, d22, d2 ;(src_ptr[0] * vp9_filter[2])
- vmlal.u8 q8, d23, d2
- vmlal.u8 q7, d24, d2 ;(src_ptr[0] * vp9_filter[2])
- vmlal.u8 q9, d25, d2
-
- vmull.u8 q10, d26, d3 ;(src_ptr[1] * vp9_filter[3])
- vmull.u8 q11, d27, d3
- vmull.u8 q12, d28, d3 ;(src_ptr[1] * vp9_filter[3])
- vmull.u8 q15, d29, d3
-
- vqadd.s16 q6, q10 ;sum of all (src_data*filter_parameters)
- vqadd.s16 q8, q11
- vqadd.s16 q7, q12
- vqadd.s16 q9, q15
-
- subs r2, r2, #1
-
- vqrshrun.s16 d6, q6, #7 ;shift/round/saturate to u8
- vqrshrun.s16 d7, q7, #7
- vqrshrun.s16 d8, q8, #7
- vqrshrun.s16 d9, q9, #7
-
- vst1.u8 {q3}, [r4], r5 ;store result
- vst1.u8 {q4}, [r4], r5
-
- bne filt_blk2d_fpo16x16_loop_neon
-
- pop {r4-r5,pc}
-
-;--------------------
-secondpass_filter16x16_only
-;Second pass: 16x16
- add r3, r12, r3, lsl #5
- sub r0, r0, r1, lsl #1
-
- vld1.s32 {q5, q6}, [r3] ;load second_pass filter
- mov r3, #2 ;loop counter
-
- vabs.s32 q7, q5
- vabs.s32 q8, q6
-
- vdup.8 d0, d14[0] ;second_pass filter parameters (d0-d5)
- vdup.8 d1, d14[4]
- vdup.8 d2, d15[0]
- vdup.8 d3, d15[4]
- vdup.8 d4, d16[0]
- vdup.8 d5, d16[4]
-
-filt_blk2d_spo16x16_outloop_neon
- vld1.u8 {d18}, [r0], r1 ;load src data
- vld1.u8 {d19}, [r0], r1
- vld1.u8 {d20}, [r0], r1
- vld1.u8 {d21}, [r0], r1
- mov r12, #4 ;loop counter
- vld1.u8 {d22}, [r0], r1
-
-secondpass_only_inner_loop_neon
- vld1.u8 {d23}, [r0], r1 ;load src data
- vld1.u8 {d24}, [r0], r1
- vld1.u8 {d25}, [r0], r1
- vld1.u8 {d26}, [r0], r1
-
- vmull.u8 q3, d18, d0 ;(src_ptr[-2] * vp9_filter[0])
- vmull.u8 q4, d19, d0
- vmull.u8 q5, d20, d0
- vmull.u8 q6, d21, d0
-
- vmlsl.u8 q3, d19, d1 ;-(src_ptr[-1] * vp9_filter[1])
- vmlsl.u8 q4, d20, d1
- vmlsl.u8 q5, d21, d1
- vmlsl.u8 q6, d22, d1
-
- vmlsl.u8 q3, d22, d4 ;-(src_ptr[2] * vp9_filter[4])
- vmlsl.u8 q4, d23, d4
- vmlsl.u8 q5, d24, d4
- vmlsl.u8 q6, d25, d4
-
- vmlal.u8 q3, d20, d2 ;(src_ptr[0] * vp9_filter[2])
- vmlal.u8 q4, d21, d2
- vmlal.u8 q5, d22, d2
- vmlal.u8 q6, d23, d2
-
- vmlal.u8 q3, d23, d5 ;(src_ptr[3] * vp9_filter[5])
- vmlal.u8 q4, d24, d5
- vmlal.u8 q5, d25, d5
- vmlal.u8 q6, d26, d5
-
- vmull.u8 q7, d21, d3 ;(src_ptr[1] * vp9_filter[3])
- vmull.u8 q8, d22, d3
- vmull.u8 q9, d23, d3
- vmull.u8 q10, d24, d3
-
- subs r12, r12, #1
-
- vqadd.s16 q7, q3 ;sum of all (src_data*filter_parameters)
- vqadd.s16 q8, q4
- vqadd.s16 q9, q5
- vqadd.s16 q10, q6
-
- vqrshrun.s16 d6, q7, #7 ;shift/round/saturate to u8
- vqrshrun.s16 d7, q8, #7
- vqrshrun.s16 d8, q9, #7
- vqrshrun.s16 d9, q10, #7
-
- vst1.u8 {d6}, [r4], r5 ;store result
- vmov q9, q11
- vst1.u8 {d7}, [r4], r5
- vmov q10, q12
- vst1.u8 {d8}, [r4], r5
- vmov d22, d26
- vst1.u8 {d9}, [r4], r5
-
- bne secondpass_only_inner_loop_neon
-
- subs r3, r3, #1
- sub r0, r0, r1, lsl #4
- sub r0, r0, r1, lsl #2
- sub r0, r0, r1
- add r0, r0, #8
-
- sub r4, r4, r5, lsl #4
- add r4, r4, #8
-
- bne filt_blk2d_spo16x16_outloop_neon
-
- pop {r4-r5,pc}
-
- ENDP
-
-;-----------------
- END
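Each filter16_coeff row above holds the six taps for one subpel position, and every row sums to 128 (1.0 in Q7), which is why the filtered sums are rounded and shifted right by 7 (vqrshrun.s16 #7) before being stored as u8. The scalar sketch below illustrates the per-pixel accumulation order described in the note; the helper names are made up for illustration and this is not the project's C reference code.

#include <stdint.h>

static int16_t sat_s16(int32_t v) {          /* behaves like vqadd's clamp */
  if (v > INT16_MAX) return INT16_MAX;
  if (v < INT16_MIN) return INT16_MIN;
  return (int16_t)v;
}

/* src points at src_ptr[0]; taps[] is one row of the table above,
 * e.g. { 2, -11, 108, 36, -8, 1 } for the third subpel position. */
static uint8_t sixtap_pixel(const uint8_t *src, const int16_t *taps) {
  /* Taps 0, 1, 4, 5, 2 accumulate without saturation (vmull/vmlal/vmlsl on
   * the absolute values); per the note, this order keeps the sum in s16. */
  int16_t acc = (int16_t)(src[-2] * taps[0]);
  acc = (int16_t)(acc + src[-1] * taps[1]);   /* negative tap -> vmlsl */
  acc = (int16_t)(acc + src[ 2] * taps[4]);   /* negative tap -> vmlsl */
  acc = (int16_t)(acc + src[ 3] * taps[5]);
  acc = (int16_t)(acc + src[ 0] * taps[2]);
  /* Tap 3 is large and positive, so it is added last with saturation. */
  acc = sat_s16((int32_t)acc + src[1] * taps[3]);
  /* Round, shift by 7 and clamp to [0, 255] (vqrshrun.s16 #7). */
  int32_t out = (acc + 64) >> 7;
  return (uint8_t)(out < 0 ? 0 : out > 255 ? 255 : out);
}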
--- a/vp8/common/arm/neon/sixtappredict4x4_neon.asm
+++ /dev/null
@@ -1,422 +1,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
- EXPORT |vp8_sixtap_predict_neon|
- ARM
- REQUIRE8
- PRESERVE8
-
- AREA ||.text||, CODE, READONLY, ALIGN=2
-
-filter4_coeff
- DCD 0, 0, 128, 0, 0, 0, 0, 0
- DCD 0, -6, 123, 12, -1, 0, 0, 0
- DCD 2, -11, 108, 36, -8, 1, 0, 0
- DCD 0, -9, 93, 50, -6, 0, 0, 0
- DCD 3, -16, 77, 77, -16, 3, 0, 0
- DCD 0, -6, 50, 93, -9, 0, 0, 0
- DCD 1, -8, 36, 108, -11, 2, 0, 0
- DCD 0, -1, 12, 123, -6, 0, 0, 0
-
-; r0 unsigned char *src_ptr,
-; r1 int src_pixels_per_line,
-; r2 int xoffset,
-; r3 int yoffset,
-; stack(r4) unsigned char *dst_ptr,
-; stack(lr) int dst_pitch
-
-|vp8_sixtap_predict_neon| PROC
- push {r4, lr}
-
- adr r12, filter4_coeff
- ldr r4, [sp, #8] ;load parameters from stack
- ldr lr, [sp, #12] ;load parameters from stack
-
- cmp r2, #0 ;skip first_pass filter if xoffset=0
- beq secondpass_filter4x4_only
-
- add r2, r12, r2, lsl #5 ;calculate filter location
-
- cmp r3, #0 ;skip second_pass filter if yoffset=0
- vld1.s32 {q14, q15}, [r2] ;load first_pass filter
-
- beq firstpass_filter4x4_only
-
- vabs.s32 q12, q14 ;get abs(filter_parameters)
- vabs.s32 q13, q15
-
- sub r0, r0, #2 ;go back 2 columns of src data
- sub r0, r0, r1, lsl #1 ;go back 2 lines of src data
-
-;First pass: output_height lines x output_width columns (9x4)
- vld1.u8 {q3}, [r0], r1 ;load first 4-line src data
- vdup.8 d0, d24[0] ;first_pass filter (d0-d5)
- vld1.u8 {q4}, [r0], r1
- vdup.8 d1, d24[4]
- vld1.u8 {q5}, [r0], r1
- vdup.8 d2, d25[0]
- vld1.u8 {q6}, [r0], r1
- vdup.8 d3, d25[4]
- vdup.8 d4, d26[0]
- vdup.8 d5, d26[4]
-
- pld [r0]
- pld [r0, r1]
- pld [r0, r1, lsl #1]
-
- vext.8 d18, d6, d7, #5 ;construct src_ptr[3]
- vext.8 d19, d8, d9, #5
- vext.8 d20, d10, d11, #5
- vext.8 d21, d12, d13, #5
-
- vswp d7, d8 ;discard 2nd half data after src_ptr[3] is done
- vswp d11, d12
-
- vzip.32 d18, d19 ;put 2-line data in 1 register (src_ptr[3])
- vzip.32 d20, d21
- vmull.u8 q7, d18, d5 ;(src_ptr[3] * vp9_filter[5])
- vmull.u8 q8, d20, d5
-
- vmov q4, q3 ;keep original src data in q4 q6
- vmov q6, q5
-
- vzip.32 d6, d7 ;construct src_ptr[-2], and put 2-line data together
- vzip.32 d10, d11
- vshr.u64 q9, q4, #8 ;construct src_ptr[-1]
- vshr.u64 q10, q6, #8
- vmlal.u8 q7, d6, d0 ;+(src_ptr[-2] * vp9_filter[0])
- vmlal.u8 q8, d10, d0
-
- vzip.32 d18, d19 ;put 2-line data in 1 register (src_ptr[-1])
- vzip.32 d20, d21
- vshr.u64 q3, q4, #32 ;construct src_ptr[2]
- vshr.u64 q5, q6, #32
- vmlsl.u8 q7, d18, d1 ;-(src_ptr[-1] * vp9_filter[1])
- vmlsl.u8 q8, d20, d1
-
- vzip.32 d6, d7 ;put 2-line data in 1 register (src_ptr[2])
- vzip.32 d10, d11
- vshr.u64 q9, q4, #16 ;construct src_ptr[0]
- vshr.u64 q10, q6, #16
- vmlsl.u8 q7, d6, d4 ;-(src_ptr[2] * vp9_filter[4])
- vmlsl.u8 q8, d10, d4
-
- vzip.32 d18, d19 ;put 2-line data in 1 register (src_ptr[0])
- vzip.32 d20, d21
- vshr.u64 q3, q4, #24 ;construct src_ptr[1]
- vshr.u64 q5, q6, #24
- vmlal.u8 q7, d18, d2 ;(src_ptr[0] * vp9_filter[2])
- vmlal.u8 q8, d20, d2
-
- vzip.32 d6, d7 ;put 2-line data in 1 register (src_ptr[1])
- vzip.32 d10, d11
- vmull.u8 q9, d6, d3 ;(src_ptr[1] * vp9_filter[3])
- vmull.u8 q10, d10, d3
-
- vld1.u8 {q3}, [r0], r1 ;load the remaining 5 lines of src data
- vld1.u8 {q4}, [r0], r1
-
- vqadd.s16 q7, q9 ;sum of all (src_data*filter_parameters)
- vqadd.s16 q8, q10
-
- vld1.u8 {q5}, [r0], r1
- vld1.u8 {q6}, [r0], r1
-
- vqrshrun.s16 d27, q7, #7 ;shift/round/saturate to u8
- vqrshrun.s16 d28, q8, #7
-
- ;First Pass on the remaining 5 lines of data
- vld1.u8 {q11}, [r0], r1
-
- vext.8 d18, d6, d7, #5 ;construct src_ptr[3]
- vext.8 d19, d8, d9, #5
- vext.8 d20, d10, d11, #5
- vext.8 d21, d12, d13, #5
-
- vswp d7, d8 ;discard 2nd half data after src_ptr[3] is done
- vswp d11, d12
-
- vzip.32 d18, d19 ;put 2-line data in 1 register (src_ptr[3])
- vzip.32 d20, d21
- vext.8 d31, d22, d23, #5 ;construct src_ptr[3]
- vmull.u8 q7, d18, d5 ;(src_ptr[3] * vp9_filter[5])
- vmull.u8 q8, d20, d5
- vmull.u8 q12, d31, d5 ;(src_ptr[3] * vp9_filter[5])
-
- vmov q4, q3 ;keep original src data in q4 q6
- vmov q6, q5
-
- vzip.32 d6, d7 ;construct src_ptr[-2], and put 2-line data together
- vzip.32 d10, d11
- vshr.u64 q9, q4, #8 ;construct src_ptr[-1]
- vshr.u64 q10, q6, #8
-
- vmlal.u8 q7, d6, d0 ;+(src_ptr[-2] * vp9_filter[0])
- vmlal.u8 q8, d10, d0
- vmlal.u8 q12, d22, d0 ;(src_ptr[-2] * vp9_filter[0])
-
- vzip.32 d18, d19 ;put 2-line data in 1 register (src_ptr[-1])
- vzip.32 d20, d21
- vshr.u64 q3, q4, #32 ;construct src_ptr[2]
- vshr.u64 q5, q6, #32
- vext.8 d31, d22, d23, #1 ;construct src_ptr[-1]
-
- vmlsl.u8 q7, d18, d1 ;-(src_ptr[-1] * vp9_filter[1])
- vmlsl.u8 q8, d20, d1
- vmlsl.u8 q12, d31, d1 ;-(src_ptr[-1] * vp9_filter[1])
-
- vzip.32 d6, d7 ;put 2-line data in 1 register (src_ptr[2])
- vzip.32 d10, d11
- vshr.u64 q9, q4, #16 ;construct src_ptr[0]
- vshr.u64 q10, q6, #16
- vext.8 d31, d22, d23, #4 ;construct src_ptr[2]
-
- vmlsl.u8 q7, d6, d4 ;-(src_ptr[2] * vp9_filter[4])
- vmlsl.u8 q8, d10, d4
- vmlsl.u8 q12, d31, d4 ;-(src_ptr[2] * vp9_filter[4])
-
- vzip.32 d18, d19 ;put 2-line data in 1 register (src_ptr[0])
- vzip.32 d20, d21
- vshr.u64 q3, q4, #24 ;construct src_ptr[1]
- vshr.u64 q5, q6, #24
- vext.8 d31, d22, d23, #2 ;construct src_ptr[0]
-
- vmlal.u8 q7, d18, d2 ;(src_ptr[0] * vp9_filter[2])
- vmlal.u8 q8, d20, d2
- vmlal.u8 q12, d31, d2 ;(src_ptr[0] * vp9_filter[2])
-
- vzip.32 d6, d7 ;put 2-line data in 1 register (src_ptr[1])
- vzip.32 d10, d11
- vext.8 d31, d22, d23, #3 ;construct src_ptr[1]
- vmull.u8 q9, d6, d3 ;(src_ptr[1] * vp9_filter[3])
- vmull.u8 q10, d10, d3
- vmull.u8 q11, d31, d3 ;(src_ptr[1] * vp9_filter[3])
-
- add r3, r12, r3, lsl #5
-
- vqadd.s16 q7, q9 ;sum of all (src_data*filter_parameters)
- vqadd.s16 q8, q10
- vqadd.s16 q12, q11
-
- vext.8 d23, d27, d28, #4
- vld1.s32 {q5, q6}, [r3] ;load second_pass filter
-
- vqrshrun.s16 d29, q7, #7 ;shift/round/saturate to u8
- vqrshrun.s16 d30, q8, #7
- vqrshrun.s16 d31, q12, #7
-
-;Second pass: 4x4
- vabs.s32 q7, q5
- vabs.s32 q8, q6
-
- vext.8 d24, d28, d29, #4
- vext.8 d25, d29, d30, #4
- vext.8 d26, d30, d31, #4
-
- vdup.8 d0, d14[0] ;second_pass filter parameters (d0-d5)
- vdup.8 d1, d14[4]
- vdup.8 d2, d15[0]
- vdup.8 d3, d15[4]
- vdup.8 d4, d16[0]
- vdup.8 d5, d16[4]
-
- vmull.u8 q3, d27, d0 ;(src_ptr[-2] * vp9_filter[0])
- vmull.u8 q4, d28, d0
-
- vmull.u8 q5, d25, d5 ;(src_ptr[3] * vp9_filter[5])
- vmull.u8 q6, d26, d5
-
- vmlsl.u8 q3, d29, d4 ;-(src_ptr[2] * vp9_filter[4])
- vmlsl.u8 q4, d30, d4
-
- vmlsl.u8 q5, d23, d1 ;-(src_ptr[-1] * vp9_filter[1])
- vmlsl.u8 q6, d24, d1
-
- vmlal.u8 q3, d28, d2 ;(src_ptr[0] * vp9_filter[2])
- vmlal.u8 q4, d29, d2
-
- vmlal.u8 q5, d24, d3 ;(src_ptr[1] * vp9_filter[3])
- vmlal.u8 q6, d25, d3
-
- add r0, r4, lr
- add r1, r0, lr
- add r2, r1, lr
-
- vqadd.s16 q5, q3 ;sum of all (src_data*filter_parameters)
- vqadd.s16 q6, q4
-
- vqrshrun.s16 d3, q5, #7 ;shift/round/saturate to u8
- vqrshrun.s16 d4, q6, #7
-
- vst1.32 {d3[0]}, [r4] ;store result
- vst1.32 {d3[1]}, [r0]
- vst1.32 {d4[0]}, [r1]
- vst1.32 {d4[1]}, [r2]
-
- pop {r4, pc}
-
-
-;---------------------
-firstpass_filter4x4_only
- vabs.s32 q12, q14 ;get abs(filter_parameters)
- vabs.s32 q13, q15
-
- sub r0, r0, #2 ;go back 2 columns of src data
-
-;First pass: output_height lines x output_width columns (4x4)
- vld1.u8 {q3}, [r0], r1 ;load first 4-line src data
- vdup.8 d0, d24[0] ;first_pass filter (d0-d5)
- vld1.u8 {q4}, [r0], r1
- vdup.8 d1, d24[4]
- vld1.u8 {q5}, [r0], r1
- vdup.8 d2, d25[0]
- vld1.u8 {q6}, [r0], r1
-
- vdup.8 d3, d25[4]
- vdup.8 d4, d26[0]
- vdup.8 d5, d26[4]
-
- vext.8 d18, d6, d7, #5 ;construct src_ptr[3]
- vext.8 d19, d8, d9, #5
- vext.8 d20, d10, d11, #5
- vext.8 d21, d12, d13, #5
-
- vswp d7, d8 ;discard 2nd half data after src_ptr[3] is done
- vswp d11, d12
-
- vzip.32 d18, d19 ;put 2-line data in 1 register (src_ptr[3])
- vzip.32 d20, d21
- vmull.u8 q7, d18, d5 ;(src_ptr[3] * vp9_filter[5])
- vmull.u8 q8, d20, d5
-
- vmov q4, q3 ;keep original src data in q4 q6
- vmov q6, q5
-
- vzip.32 d6, d7 ;construct src_ptr[-2], and put 2-line data together
- vzip.32 d10, d11
- vshr.u64 q9, q4, #8 ;construct src_ptr[-1]
- vshr.u64 q10, q6, #8
- vmlal.u8 q7, d6, d0 ;+(src_ptr[-2] * vp9_filter[0])
- vmlal.u8 q8, d10, d0
-
- vzip.32 d18, d19 ;put 2-line data in 1 register (src_ptr[-1])
- vzip.32 d20, d21
- vshr.u64 q3, q4, #32 ;construct src_ptr[2]
- vshr.u64 q5, q6, #32
- vmlsl.u8 q7, d18, d1 ;-(src_ptr[-1] * vp9_filter[1])
- vmlsl.u8 q8, d20, d1
-
- vzip.32 d6, d7 ;put 2-line data in 1 register (src_ptr[2])
- vzip.32 d10, d11
- vshr.u64 q9, q4, #16 ;construct src_ptr[0]
- vshr.u64 q10, q6, #16
- vmlsl.u8 q7, d6, d4 ;-(src_ptr[2] * vp9_filter[4])
- vmlsl.u8 q8, d10, d4
-
- vzip.32 d18, d19 ;put 2-line data in 1 register (src_ptr[0])
- vzip.32 d20, d21
- vshr.u64 q3, q4, #24 ;construct src_ptr[1]
- vshr.u64 q5, q6, #24
- vmlal.u8 q7, d18, d2 ;(src_ptr[0] * vp9_filter[2])
- vmlal.u8 q8, d20, d2
-
- vzip.32 d6, d7 ;put 2-line data in 1 register (src_ptr[1])
- vzip.32 d10, d11
- vmull.u8 q9, d6, d3 ;(src_ptr[1] * vp9_filter[3])
- vmull.u8 q10, d10, d3
-
- add r0, r4, lr
- add r1, r0, lr
- add r2, r1, lr
-
- vqadd.s16 q7, q9 ;sum of all (src_data*filter_parameters)
- vqadd.s16 q8, q10
-
- vqrshrun.s16 d27, q7, #7 ;shift/round/saturate to u8
- vqrshrun.s16 d28, q8, #7
-
- vst1.32 {d27[0]}, [r4] ;store result
- vst1.32 {d27[1]}, [r0]
- vst1.32 {d28[0]}, [r1]
- vst1.32 {d28[1]}, [r2]
-
- pop {r4, pc}
-
-
-;---------------------
-secondpass_filter4x4_only
- sub r0, r0, r1, lsl #1
- add r3, r12, r3, lsl #5
-
- vld1.32 {d27[0]}, [r0], r1 ;load src data
- vld1.s32 {q5, q6}, [r3] ;load second_pass filter
- vld1.32 {d27[1]}, [r0], r1
- vabs.s32 q7, q5
- vld1.32 {d28[0]}, [r0], r1
- vabs.s32 q8, q6
- vld1.32 {d28[1]}, [r0], r1
- vdup.8 d0, d14[0] ;second_pass filter parameters (d0-d5)
- vld1.32 {d29[0]}, [r0], r1
- vdup.8 d1, d14[4]
- vld1.32 {d29[1]}, [r0], r1
- vdup.8 d2, d15[0]
- vld1.32 {d30[0]}, [r0], r1
- vdup.8 d3, d15[4]
- vld1.32 {d30[1]}, [r0], r1
- vdup.8 d4, d16[0]
- vld1.32 {d31[0]}, [r0], r1
- vdup.8 d5, d16[4]
-
- vext.8 d23, d27, d28, #4
- vext.8 d24, d28, d29, #4
- vext.8 d25, d29, d30, #4
- vext.8 d26, d30, d31, #4
-
- vmull.u8 q3, d27, d0 ;(src_ptr[-2] * vp9_filter[0])
- vmull.u8 q4, d28, d0
-
- vmull.u8 q5, d25, d5 ;(src_ptr[3] * vp9_filter[5])
- vmull.u8 q6, d26, d5
-
- vmlsl.u8 q3, d29, d4 ;-(src_ptr[2] * vp9_filter[4])
- vmlsl.u8 q4, d30, d4
-
- vmlsl.u8 q5, d23, d1 ;-(src_ptr[-1] * vp9_filter[1])
- vmlsl.u8 q6, d24, d1
-
- vmlal.u8 q3, d28, d2 ;(src_ptr[0] * vp9_filter[2])
- vmlal.u8 q4, d29, d2
-
- vmlal.u8 q5, d24, d3 ;(src_ptr[1] * vp9_filter[3])
- vmlal.u8 q6, d25, d3
-
- add r0, r4, lr
- add r1, r0, lr
- add r2, r1, lr
-
- vqadd.s16 q5, q3 ;sum of all (src_data*filter_parameters)
- vqadd.s16 q6, q4
-
- vqrshrun.s16 d3, q5, #7 ;shift/round/saturate to u8
- vqrshrun.s16 d4, q6, #7
-
- vst1.32 {d3[0]}, [r4] ;store result
- vst1.32 {d3[1]}, [r0]
- vst1.32 {d4[0]}, [r1]
- vst1.32 {d4[1]}, [r2]
-
- pop {r4, pc}
-
- ENDP
-
-;-----------------
-
- END
--- a/vp8/common/arm/neon/sixtappredict8x4_neon.asm
+++ /dev/null
@@ -1,473 +1,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
- EXPORT |vp8_sixtap_predict8x4_neon|
- ARM
- REQUIRE8
- PRESERVE8
-
- AREA ||.text||, CODE, READONLY, ALIGN=2
-
-filter8_coeff
- DCD 0, 0, 128, 0, 0, 0, 0, 0
- DCD 0, -6, 123, 12, -1, 0, 0, 0
- DCD 2, -11, 108, 36, -8, 1, 0, 0
- DCD 0, -9, 93, 50, -6, 0, 0, 0
- DCD 3, -16, 77, 77, -16, 3, 0, 0
- DCD 0, -6, 50, 93, -9, 0, 0, 0
- DCD 1, -8, 36, 108, -11, 2, 0, 0
- DCD 0, -1, 12, 123, -6, 0, 0, 0
-
-; r0 unsigned char *src_ptr,
-; r1 int src_pixels_per_line,
-; r2 int xoffset,
-; r3 int yoffset,
-; r4 unsigned char *dst_ptr,
-; stack(r5) int dst_pitch
-
-|vp8_sixtap_predict8x4_neon| PROC
- push {r4-r5, lr}
-
- adr r12, filter8_coeff
- ldr r4, [sp, #12] ;load parameters from stack
- ldr r5, [sp, #16] ;load parameters from stack
-
- cmp r2, #0 ;skip first_pass filter if xoffset=0
- beq secondpass_filter8x4_only
-
- add r2, r12, r2, lsl #5 ;calculate filter location
-
- cmp r3, #0 ;skip second_pass filter if yoffset=0
-
- vld1.s32 {q14, q15}, [r2] ;load first_pass filter
-
- beq firstpass_filter8x4_only
-
- sub sp, sp, #32 ;reserve space on stack for temporary storage
- vabs.s32 q12, q14
- vabs.s32 q13, q15
-
- sub r0, r0, #2 ;move srcptr back to (line-2) and (column-2)
- mov lr, sp
- sub r0, r0, r1, lsl #1
-
- vdup.8 d0, d24[0] ;first_pass filter (d0-d5)
- vdup.8 d1, d24[4]
- vdup.8 d2, d25[0]
-
-;First pass: output_height lines x output_width columns (9x8)
- vld1.u8 {q3}, [r0], r1 ;load src data
- vdup.8 d3, d25[4]
- vld1.u8 {q4}, [r0], r1
- vdup.8 d4, d26[0]
- vld1.u8 {q5}, [r0], r1
- vdup.8 d5, d26[4]
- vld1.u8 {q6}, [r0], r1
-
- pld [r0]
- pld [r0, r1]
- pld [r0, r1, lsl #1]
-
- vmull.u8 q7, d6, d0 ;(src_ptr[-2] * vp9_filter[0])
- vmull.u8 q8, d8, d0
- vmull.u8 q9, d10, d0
- vmull.u8 q10, d12, d0
-
- vext.8 d28, d6, d7, #1 ;construct src_ptr[-1]
- vext.8 d29, d8, d9, #1
- vext.8 d30, d10, d11, #1
- vext.8 d31, d12, d13, #1
-
- vmlsl.u8 q7, d28, d1 ;-(src_ptr[-1] * vp9_filter[1])
- vmlsl.u8 q8, d29, d1
- vmlsl.u8 q9, d30, d1
- vmlsl.u8 q10, d31, d1
-
- vext.8 d28, d6, d7, #4 ;construct src_ptr[2]
- vext.8 d29, d8, d9, #4
- vext.8 d30, d10, d11, #4
- vext.8 d31, d12, d13, #4
-
- vmlsl.u8 q7, d28, d4 ;-(src_ptr[2] * vp9_filter[4])
- vmlsl.u8 q8, d29, d4
- vmlsl.u8 q9, d30, d4
- vmlsl.u8 q10, d31, d4
-
- vext.8 d28, d6, d7, #2 ;construct src_ptr[0]
- vext.8 d29, d8, d9, #2
- vext.8 d30, d10, d11, #2
- vext.8 d31, d12, d13, #2
-
- vmlal.u8 q7, d28, d2 ;(src_ptr[0] * vp9_filter[2])
- vmlal.u8 q8, d29, d2
- vmlal.u8 q9, d30, d2
- vmlal.u8 q10, d31, d2
-
- vext.8 d28, d6, d7, #5 ;construct src_ptr[3]
- vext.8 d29, d8, d9, #5
- vext.8 d30, d10, d11, #5
- vext.8 d31, d12, d13, #5
-
- vmlal.u8 q7, d28, d5 ;(src_ptr[3] * vp9_filter[5])
- vmlal.u8 q8, d29, d5
- vmlal.u8 q9, d30, d5
- vmlal.u8 q10, d31, d5
-
- vext.8 d28, d6, d7, #3 ;construct src_ptr[1]
- vext.8 d29, d8, d9, #3
- vext.8 d30, d10, d11, #3
- vext.8 d31, d12, d13, #3
-
- vmull.u8 q3, d28, d3 ;(src_ptr[1] * vp9_filter[3])
- vmull.u8 q4, d29, d3
- vmull.u8 q5, d30, d3
- vmull.u8 q6, d31, d3
-
- vqadd.s16 q7, q3 ;sum of all (src_data*filter_parameters)
- vqadd.s16 q8, q4
- vqadd.s16 q9, q5
- vqadd.s16 q10, q6
-
- vld1.u8 {q3}, [r0], r1 ;load src data
-
- vqrshrun.s16 d22, q7, #7 ;shift/round/saturate to u8
- vqrshrun.s16 d23, q8, #7
- vqrshrun.s16 d24, q9, #7
- vqrshrun.s16 d25, q10, #7
-
- vld1.u8 {q4}, [r0], r1
- vst1.u8 {d22}, [lr]! ;store result
- vld1.u8 {q5}, [r0], r1
- vst1.u8 {d23}, [lr]!
- vld1.u8 {q6}, [r0], r1
- vst1.u8 {d24}, [lr]!
- vld1.u8 {q7}, [r0], r1
- vst1.u8 {d25}, [lr]!
-
- ;first_pass filtering on the remaining 5 lines of data
- vmull.u8 q8, d6, d0 ;(src_ptr[-2] * vp9_filter[0])
- vmull.u8 q9, d8, d0
- vmull.u8 q10, d10, d0
- vmull.u8 q11, d12, d0
- vmull.u8 q12, d14, d0
-
- vext.8 d27, d6, d7, #1 ;construct src_ptr[-1]
- vext.8 d28, d8, d9, #1
- vext.8 d29, d10, d11, #1
- vext.8 d30, d12, d13, #1
- vext.8 d31, d14, d15, #1
-
- vmlsl.u8 q8, d27, d1 ;-(src_ptr[-1] * vp9_filter[1])
- vmlsl.u8 q9, d28, d1
- vmlsl.u8 q10, d29, d1
- vmlsl.u8 q11, d30, d1
- vmlsl.u8 q12, d31, d1
-
- vext.8 d27, d6, d7, #4 ;construct src_ptr[2]
- vext.8 d28, d8, d9, #4
- vext.8 d29, d10, d11, #4
- vext.8 d30, d12, d13, #4
- vext.8 d31, d14, d15, #4
-
- vmlsl.u8 q8, d27, d4 ;-(src_ptr[2] * vp9_filter[4])
- vmlsl.u8 q9, d28, d4
- vmlsl.u8 q10, d29, d4
- vmlsl.u8 q11, d30, d4
- vmlsl.u8 q12, d31, d4
-
- vext.8 d27, d6, d7, #2 ;construct src_ptr[0]
- vext.8 d28, d8, d9, #2
- vext.8 d29, d10, d11, #2
- vext.8 d30, d12, d13, #2
- vext.8 d31, d14, d15, #2
-
- vmlal.u8 q8, d27, d2 ;(src_ptr[0] * vp9_filter[2])
- vmlal.u8 q9, d28, d2
- vmlal.u8 q10, d29, d2
- vmlal.u8 q11, d30, d2
- vmlal.u8 q12, d31, d2
-
- vext.8 d27, d6, d7, #5 ;construct src_ptr[3]
- vext.8 d28, d8, d9, #5
- vext.8 d29, d10, d11, #5
- vext.8 d30, d12, d13, #5
- vext.8 d31, d14, d15, #5
-
- vmlal.u8 q8, d27, d5 ;(src_ptr[3] * vp9_filter[5])
- vmlal.u8 q9, d28, d5
- vmlal.u8 q10, d29, d5
- vmlal.u8 q11, d30, d5
- vmlal.u8 q12, d31, d5
-
- vext.8 d27, d6, d7, #3 ;construct src_ptr[1]
- vext.8 d28, d8, d9, #3
- vext.8 d29, d10, d11, #3
- vext.8 d30, d12, d13, #3
- vext.8 d31, d14, d15, #3
-
- vmull.u8 q3, d27, d3 ;(src_ptr[1] * vp9_filter[3])
- vmull.u8 q4, d28, d3
- vmull.u8 q5, d29, d3
- vmull.u8 q6, d30, d3
- vmull.u8 q7, d31, d3
-
- vqadd.s16 q8, q3 ;sum of all (src_data*filter_parameters)
- vqadd.s16 q9, q4
- vqadd.s16 q10, q5
- vqadd.s16 q11, q6
- vqadd.s16 q12, q7
-
- vqrshrun.s16 d26, q8, #7 ;shift/round/saturate to u8
- vqrshrun.s16 d27, q9, #7
- vqrshrun.s16 d28, q10, #7
- vqrshrun.s16 d29, q11, #7 ;load intermediate data from stack
- vqrshrun.s16 d30, q12, #7
-
-;Second pass: 8x4
-;secondpass_filter
- add r3, r12, r3, lsl #5
- sub lr, lr, #32
-
- vld1.s32 {q5, q6}, [r3] ;load second_pass filter
- vld1.u8 {q11}, [lr]!
-
- vabs.s32 q7, q5
- vabs.s32 q8, q6
-
- vld1.u8 {q12}, [lr]!
-
- vdup.8 d0, d14[0] ;second_pass filter parameters (d0-d5)
- vdup.8 d1, d14[4]
- vdup.8 d2, d15[0]
- vdup.8 d3, d15[4]
- vdup.8 d4, d16[0]
- vdup.8 d5, d16[4]
-
- vmull.u8 q3, d22, d0 ;(src_ptr[-2] * vp9_filter[0])
- vmull.u8 q4, d23, d0
- vmull.u8 q5, d24, d0
- vmull.u8 q6, d25, d0
-
- vmlsl.u8 q3, d23, d1 ;-(src_ptr[-1] * vp9_filter[1])
- vmlsl.u8 q4, d24, d1
- vmlsl.u8 q5, d25, d1
- vmlsl.u8 q6, d26, d1
-
- vmlsl.u8 q3, d26, d4 ;-(src_ptr[2] * vp9_filter[4])
- vmlsl.u8 q4, d27, d4
- vmlsl.u8 q5, d28, d4
- vmlsl.u8 q6, d29, d4
-
- vmlal.u8 q3, d24, d2 ;(src_ptr[0] * vp9_filter[2])
- vmlal.u8 q4, d25, d2
- vmlal.u8 q5, d26, d2
- vmlal.u8 q6, d27, d2
-
- vmlal.u8 q3, d27, d5 ;(src_ptr[3] * vp9_filter[5])
- vmlal.u8 q4, d28, d5
- vmlal.u8 q5, d29, d5
- vmlal.u8 q6, d30, d5
-
- vmull.u8 q7, d25, d3 ;(src_ptr[1] * vp9_filter[3])
- vmull.u8 q8, d26, d3
- vmull.u8 q9, d27, d3
- vmull.u8 q10, d28, d3
-
- vqadd.s16 q7, q3 ;sum of all (src_data*filter_parameters)
- vqadd.s16 q8, q4
- vqadd.s16 q9, q5
- vqadd.s16 q10, q6
-
- vqrshrun.s16 d6, q7, #7 ;shift/round/saturate to u8
- vqrshrun.s16 d7, q8, #7
- vqrshrun.s16 d8, q9, #7
- vqrshrun.s16 d9, q10, #7
-
- vst1.u8 {d6}, [r4], r5 ;store result
- vst1.u8 {d7}, [r4], r5
- vst1.u8 {d8}, [r4], r5
- vst1.u8 {d9}, [r4], r5
-
- add sp, sp, #32
- pop {r4-r5,pc}
-
-;--------------------
-firstpass_filter8x4_only
- vabs.s32 q12, q14
- vabs.s32 q13, q15
-
- sub r0, r0, #2 ;move srcptr back to (line-2) and (column-2)
- vld1.u8 {q3}, [r0], r1 ;load src data
-
- vdup.8 d0, d24[0] ;first_pass filter (d0-d5)
- vld1.u8 {q4}, [r0], r1
- vdup.8 d1, d24[4]
- vld1.u8 {q5}, [r0], r1
- vdup.8 d2, d25[0]
- vld1.u8 {q6}, [r0], r1
- vdup.8 d3, d25[4]
- vdup.8 d4, d26[0]
- vdup.8 d5, d26[4]
-
-;First pass: output_height lines x output_width columns (4x8)
- pld [r0]
- pld [r0, r1]
- pld [r0, r1, lsl #1]
-
- vmull.u8 q7, d6, d0 ;(src_ptr[-2] * vp9_filter[0])
- vmull.u8 q8, d8, d0
- vmull.u8 q9, d10, d0
- vmull.u8 q10, d12, d0
-
- vext.8 d28, d6, d7, #1 ;construct src_ptr[-1]
- vext.8 d29, d8, d9, #1
- vext.8 d30, d10, d11, #1
- vext.8 d31, d12, d13, #1
-
- vmlsl.u8 q7, d28, d1 ;-(src_ptr[-1] * vp9_filter[1])
- vmlsl.u8 q8, d29, d1
- vmlsl.u8 q9, d30, d1
- vmlsl.u8 q10, d31, d1
-
- vext.8 d28, d6, d7, #4 ;construct src_ptr[2]
- vext.8 d29, d8, d9, #4
- vext.8 d30, d10, d11, #4
- vext.8 d31, d12, d13, #4
-
- vmlsl.u8 q7, d28, d4 ;-(src_ptr[2] * vp9_filter[4])
- vmlsl.u8 q8, d29, d4
- vmlsl.u8 q9, d30, d4
- vmlsl.u8 q10, d31, d4
-
- vext.8 d28, d6, d7, #2 ;construct src_ptr[0]
- vext.8 d29, d8, d9, #2
- vext.8 d30, d10, d11, #2
- vext.8 d31, d12, d13, #2
-
- vmlal.u8 q7, d28, d2 ;(src_ptr[0] * vp9_filter[2])
- vmlal.u8 q8, d29, d2
- vmlal.u8 q9, d30, d2
- vmlal.u8 q10, d31, d2
-
- vext.8 d28, d6, d7, #5 ;construct src_ptr[3]
- vext.8 d29, d8, d9, #5
- vext.8 d30, d10, d11, #5
- vext.8 d31, d12, d13, #5
-
- vmlal.u8 q7, d28, d5 ;(src_ptr[3] * vp9_filter[5])
- vmlal.u8 q8, d29, d5
- vmlal.u8 q9, d30, d5
- vmlal.u8 q10, d31, d5
-
- vext.8 d28, d6, d7, #3 ;construct src_ptr[1]
- vext.8 d29, d8, d9, #3
- vext.8 d30, d10, d11, #3
- vext.8 d31, d12, d13, #3
-
- vmull.u8 q3, d28, d3 ;(src_ptr[1] * vp9_filter[3])
- vmull.u8 q4, d29, d3
- vmull.u8 q5, d30, d3
- vmull.u8 q6, d31, d3
-
- vqadd.s16 q7, q3 ;sum of all (src_data*filter_parameters)
- vqadd.s16 q8, q4
- vqadd.s16 q9, q5
- vqadd.s16 q10, q6
-
- vqrshrun.s16 d22, q7, #7 ;shift/round/saturate to u8
- vqrshrun.s16 d23, q8, #7
- vqrshrun.s16 d24, q9, #7
- vqrshrun.s16 d25, q10, #7
-
- vst1.u8 {d22}, [r4], r5 ;store result
- vst1.u8 {d23}, [r4], r5
- vst1.u8 {d24}, [r4], r5
- vst1.u8 {d25}, [r4], r5
-
- pop {r4-r5,pc}
-
-;---------------------
-secondpass_filter8x4_only
-;Second pass: 8x4
- add r3, r12, r3, lsl #5
- sub r0, r0, r1, lsl #1
- vld1.s32 {q5, q6}, [r3] ;load second_pass filter
- vabs.s32 q7, q5
- vabs.s32 q8, q6
-
- vld1.u8 {d22}, [r0], r1
- vld1.u8 {d23}, [r0], r1
- vld1.u8 {d24}, [r0], r1
- vdup.8 d0, d14[0] ;second_pass filter parameters (d0-d5)
- vld1.u8 {d25}, [r0], r1
- vdup.8 d1, d14[4]
- vld1.u8 {d26}, [r0], r1
- vdup.8 d2, d15[0]
- vld1.u8 {d27}, [r0], r1
- vdup.8 d3, d15[4]
- vld1.u8 {d28}, [r0], r1
- vdup.8 d4, d16[0]
- vld1.u8 {d29}, [r0], r1
- vdup.8 d5, d16[4]
- vld1.u8 {d30}, [r0], r1
-
- vmull.u8 q3, d22, d0 ;(src_ptr[-2] * vp9_filter[0])
- vmull.u8 q4, d23, d0
- vmull.u8 q5, d24, d0
- vmull.u8 q6, d25, d0
-
- vmlsl.u8 q3, d23, d1 ;-(src_ptr[-1] * vp9_filter[1])
- vmlsl.u8 q4, d24, d1
- vmlsl.u8 q5, d25, d1
- vmlsl.u8 q6, d26, d1
-
- vmlsl.u8 q3, d26, d4 ;-(src_ptr[2] * vp9_filter[4])
- vmlsl.u8 q4, d27, d4
- vmlsl.u8 q5, d28, d4
- vmlsl.u8 q6, d29, d4
-
- vmlal.u8 q3, d24, d2 ;(src_ptr[0] * vp9_filter[2])
- vmlal.u8 q4, d25, d2
- vmlal.u8 q5, d26, d2
- vmlal.u8 q6, d27, d2
-
- vmlal.u8 q3, d27, d5 ;(src_ptr[3] * vp9_filter[5])
- vmlal.u8 q4, d28, d5
- vmlal.u8 q5, d29, d5
- vmlal.u8 q6, d30, d5
-
- vmull.u8 q7, d25, d3 ;(src_ptr[1] * vp9_filter[3])
- vmull.u8 q8, d26, d3
- vmull.u8 q9, d27, d3
- vmull.u8 q10, d28, d3
-
- vqadd.s16 q7, q3 ;sum of all (src_data*filter_parameters)
- vqadd.s16 q8, q4
- vqadd.s16 q9, q5
- vqadd.s16 q10, q6
-
- vqrshrun.s16 d6, q7, #7 ;shift/round/saturate to u8
- vqrshrun.s16 d7, q8, #7
- vqrshrun.s16 d8, q9, #7
- vqrshrun.s16 d9, q10, #7
-
- vst1.u8 {d6}, [r4], r5 ;store result
- vst1.u8 {d7}, [r4], r5
- vst1.u8 {d8}, [r4], r5
- vst1.u8 {d9}, [r4], r5
-
- pop {r4-r5,pc}
-
- ENDP
-
-;-----------------
-
- END
--- a/vp8/common/arm/neon/sixtappredict8x8_neon.asm
+++ /dev/null
@@ -1,524 +1,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
- EXPORT |vp8_sixtap_predict8x8_neon|
- ARM
- REQUIRE8
- PRESERVE8
-
- AREA ||.text||, CODE, READONLY, ALIGN=2
-
-filter8_coeff
- DCD 0, 0, 128, 0, 0, 0, 0, 0
- DCD 0, -6, 123, 12, -1, 0, 0, 0
- DCD 2, -11, 108, 36, -8, 1, 0, 0
- DCD 0, -9, 93, 50, -6, 0, 0, 0
- DCD 3, -16, 77, 77, -16, 3, 0, 0
- DCD 0, -6, 50, 93, -9, 0, 0, 0
- DCD 1, -8, 36, 108, -11, 2, 0, 0
- DCD 0, -1, 12, 123, -6, 0, 0, 0
-
-; r0 unsigned char *src_ptr,
-; r1 int src_pixels_per_line,
-; r2 int xoffset,
-; r3 int yoffset,
-; stack(r4) unsigned char *dst_ptr,
-; stack(r5) int dst_pitch
-
-|vp8_sixtap_predict8x8_neon| PROC
- push {r4-r5, lr}
-
- adr r12, filter8_coeff
-
- ldr r4, [sp, #12] ;load parameters from stack
- ldr r5, [sp, #16] ;load parameters from stack
-
- cmp r2, #0 ;skip first_pass filter if xoffset=0
- beq secondpass_filter8x8_only
-
- add r2, r12, r2, lsl #5 ;calculate filter location
-
- cmp r3, #0 ;skip second_pass filter if yoffset=0
-
- vld1.s32 {q14, q15}, [r2] ;load first_pass filter
-
- beq firstpass_filter8x8_only
-
- sub sp, sp, #64 ;reserve space on stack for temporary storage
- mov lr, sp
-
- vabs.s32 q12, q14
- vabs.s32 q13, q15
-
- mov r2, #2 ;loop counter
- sub r0, r0, #2 ;move srcptr back to (line-2) and (column-2)
- sub r0, r0, r1, lsl #1
-
- vdup.8 d0, d24[0] ;first_pass filter (d0-d5)
- vdup.8 d1, d24[4]
- vdup.8 d2, d25[0]
-
-;First pass: output_height lines x output_width columns (13x8)
- vld1.u8 {q3}, [r0], r1 ;load src data
- vdup.8 d3, d25[4]
- vld1.u8 {q4}, [r0], r1
- vdup.8 d4, d26[0]
- vld1.u8 {q5}, [r0], r1
- vdup.8 d5, d26[4]
- vld1.u8 {q6}, [r0], r1
-
-filt_blk2d_fp8x8_loop_neon
- pld [r0]
- pld [r0, r1]
- pld [r0, r1, lsl #1]
-
- vmull.u8 q7, d6, d0 ;(src_ptr[-2] * vp9_filter[0])
- vmull.u8 q8, d8, d0
- vmull.u8 q9, d10, d0
- vmull.u8 q10, d12, d0
-
- vext.8 d28, d6, d7, #1 ;construct src_ptr[-1]
- vext.8 d29, d8, d9, #1
- vext.8 d30, d10, d11, #1
- vext.8 d31, d12, d13, #1
-
- vmlsl.u8 q7, d28, d1 ;-(src_ptr[-1] * vp9_filter[1])
- vmlsl.u8 q8, d29, d1
- vmlsl.u8 q9, d30, d1
- vmlsl.u8 q10, d31, d1
-
- vext.8 d28, d6, d7, #4 ;construct src_ptr[2]
- vext.8 d29, d8, d9, #4
- vext.8 d30, d10, d11, #4
- vext.8 d31, d12, d13, #4
-
- vmlsl.u8 q7, d28, d4 ;-(src_ptr[2] * vp9_filter[4])
- vmlsl.u8 q8, d29, d4
- vmlsl.u8 q9, d30, d4
- vmlsl.u8 q10, d31, d4
-
- vext.8 d28, d6, d7, #2 ;construct src_ptr[0]
- vext.8 d29, d8, d9, #2
- vext.8 d30, d10, d11, #2
- vext.8 d31, d12, d13, #2
-
- vmlal.u8 q7, d28, d2 ;(src_ptr[0] * vp9_filter[2])
- vmlal.u8 q8, d29, d2
- vmlal.u8 q9, d30, d2
- vmlal.u8 q10, d31, d2
-
- vext.8 d28, d6, d7, #5 ;construct src_ptr[3]
- vext.8 d29, d8, d9, #5
- vext.8 d30, d10, d11, #5
- vext.8 d31, d12, d13, #5
-
- vmlal.u8 q7, d28, d5 ;(src_ptr[3] * vp9_filter[5])
- vmlal.u8 q8, d29, d5
- vmlal.u8 q9, d30, d5
- vmlal.u8 q10, d31, d5
-
- vext.8 d28, d6, d7, #3 ;construct src_ptr[1]
- vext.8 d29, d8, d9, #3
- vext.8 d30, d10, d11, #3
- vext.8 d31, d12, d13, #3
-
- vmull.u8 q3, d28, d3 ;(src_ptr[1] * vp9_filter[3])
- vmull.u8 q4, d29, d3
- vmull.u8 q5, d30, d3
- vmull.u8 q6, d31, d3
-
- subs r2, r2, #1
-
- vqadd.s16 q7, q3 ;sum of all (src_data*filter_parameters)
- vqadd.s16 q8, q4
- vqadd.s16 q9, q5
- vqadd.s16 q10, q6
-
- vld1.u8 {q3}, [r0], r1 ;load src data
-
- vqrshrun.s16 d22, q7, #7 ;shift/round/saturate to u8
- vqrshrun.s16 d23, q8, #7
- vqrshrun.s16 d24, q9, #7
- vqrshrun.s16 d25, q10, #7
-
- vst1.u8 {d22}, [lr]! ;store result
- vld1.u8 {q4}, [r0], r1
- vst1.u8 {d23}, [lr]!
- vld1.u8 {q5}, [r0], r1
- vst1.u8 {d24}, [lr]!
- vld1.u8 {q6}, [r0], r1
- vst1.u8 {d25}, [lr]!
-
- bne filt_blk2d_fp8x8_loop_neon
-
- ;first_pass filtering on the remaining 5 lines of data
- ;vld1.u8 {q3}, [r0], r1 ;load src data
- ;vld1.u8 {q4}, [r0], r1
- ;vld1.u8 {q5}, [r0], r1
- ;vld1.u8 {q6}, [r0], r1
- vld1.u8 {q7}, [r0], r1
-
- vmull.u8 q8, d6, d0 ;(src_ptr[-2] * vp9_filter[0])
- vmull.u8 q9, d8, d0
- vmull.u8 q10, d10, d0
- vmull.u8 q11, d12, d0
- vmull.u8 q12, d14, d0
-
- vext.8 d27, d6, d7, #1 ;construct src_ptr[-1]
- vext.8 d28, d8, d9, #1
- vext.8 d29, d10, d11, #1
- vext.8 d30, d12, d13, #1
- vext.8 d31, d14, d15, #1
-
- vmlsl.u8 q8, d27, d1 ;-(src_ptr[-1] * vp9_filter[1])
- vmlsl.u8 q9, d28, d1
- vmlsl.u8 q10, d29, d1
- vmlsl.u8 q11, d30, d1
- vmlsl.u8 q12, d31, d1
-
- vext.8 d27, d6, d7, #4 ;construct src_ptr[2]
- vext.8 d28, d8, d9, #4
- vext.8 d29, d10, d11, #4
- vext.8 d30, d12, d13, #4
- vext.8 d31, d14, d15, #4
-
- vmlsl.u8 q8, d27, d4 ;-(src_ptr[2] * vp9_filter[4])
- vmlsl.u8 q9, d28, d4
- vmlsl.u8 q10, d29, d4
- vmlsl.u8 q11, d30, d4
- vmlsl.u8 q12, d31, d4
-
- vext.8 d27, d6, d7, #2 ;construct src_ptr[0]
- vext.8 d28, d8, d9, #2
- vext.8 d29, d10, d11, #2
- vext.8 d30, d12, d13, #2
- vext.8 d31, d14, d15, #2
-
- vmlal.u8 q8, d27, d2 ;(src_ptr[0] * vp9_filter[2])
- vmlal.u8 q9, d28, d2
- vmlal.u8 q10, d29, d2
- vmlal.u8 q11, d30, d2
- vmlal.u8 q12, d31, d2
-
- vext.8 d27, d6, d7, #5 ;construct src_ptr[3]
- vext.8 d28, d8, d9, #5
- vext.8 d29, d10, d11, #5
- vext.8 d30, d12, d13, #5
- vext.8 d31, d14, d15, #5
-
- vmlal.u8 q8, d27, d5 ;(src_ptr[3] * vp9_filter[5])
- vmlal.u8 q9, d28, d5
- vmlal.u8 q10, d29, d5
- vmlal.u8 q11, d30, d5
- vmlal.u8 q12, d31, d5
-
- vext.8 d27, d6, d7, #3 ;construct src_ptr[1]
- vext.8 d28, d8, d9, #3
- vext.8 d29, d10, d11, #3
- vext.8 d30, d12, d13, #3
- vext.8 d31, d14, d15, #3
-
- vmull.u8 q3, d27, d3 ;(src_ptr[1] * vp9_filter[3])
- vmull.u8 q4, d28, d3
- vmull.u8 q5, d29, d3
- vmull.u8 q6, d30, d3
- vmull.u8 q7, d31, d3
-
- vqadd.s16 q8, q3 ;sum of all (src_data*filter_parameters)
- vqadd.s16 q9, q4
- vqadd.s16 q10, q5
- vqadd.s16 q11, q6
- vqadd.s16 q12, q7
-
- add r3, r12, r3, lsl #5
-
- vqrshrun.s16 d26, q8, #7 ;shift/round/saturate to u8
- sub lr, lr, #64
- vqrshrun.s16 d27, q9, #7
- vld1.u8 {q9}, [lr]! ;load intermediate data from stack
- vqrshrun.s16 d28, q10, #7
- vld1.u8 {q10}, [lr]!
-
- vld1.s32 {q5, q6}, [r3] ;load second_pass filter
-
- vqrshrun.s16 d29, q11, #7
- vld1.u8 {q11}, [lr]!
-
- vabs.s32 q7, q5
- vabs.s32 q8, q6
-
- vqrshrun.s16 d30, q12, #7
- vld1.u8 {q12}, [lr]!
-
-;Second pass: 8x8
- mov r3, #2 ;loop counter
-
- vdup.8 d0, d14[0] ;second_pass filter parameters (d0-d5)
- vdup.8 d1, d14[4]
- vdup.8 d2, d15[0]
- vdup.8 d3, d15[4]
- vdup.8 d4, d16[0]
- vdup.8 d5, d16[4]
-
-filt_blk2d_sp8x8_loop_neon
- vmull.u8 q3, d18, d0 ;(src_ptr[-2] * vp9_filter[0])
- vmull.u8 q4, d19, d0
- vmull.u8 q5, d20, d0
- vmull.u8 q6, d21, d0
-
- vmlsl.u8 q3, d19, d1 ;-(src_ptr[-1] * vp9_filter[1])
- vmlsl.u8 q4, d20, d1
- vmlsl.u8 q5, d21, d1
- vmlsl.u8 q6, d22, d1
-
- vmlsl.u8 q3, d22, d4 ;-(src_ptr[2] * vp9_filter[4])
- vmlsl.u8 q4, d23, d4
- vmlsl.u8 q5, d24, d4
- vmlsl.u8 q6, d25, d4
-
- vmlal.u8 q3, d20, d2 ;(src_ptr[0] * vp9_filter[2])
- vmlal.u8 q4, d21, d2
- vmlal.u8 q5, d22, d2
- vmlal.u8 q6, d23, d2
-
- vmlal.u8 q3, d23, d5 ;(src_ptr[3] * vp9_filter[5])
- vmlal.u8 q4, d24, d5
- vmlal.u8 q5, d25, d5
- vmlal.u8 q6, d26, d5
-
- vmull.u8 q7, d21, d3 ;(src_ptr[1] * vp9_filter[3])
- vmull.u8 q8, d22, d3
- vmull.u8 q9, d23, d3
- vmull.u8 q10, d24, d3
-
- subs r3, r3, #1
-
- vqadd.s16 q7, q3 ;sum of all (src_data*filter_parameters)
- vqadd.s16 q8, q4
- vqadd.s16 q9, q5
- vqadd.s16 q10, q6
-
- vqrshrun.s16 d6, q7, #7 ;shift/round/saturate to u8
- vqrshrun.s16 d7, q8, #7
- vqrshrun.s16 d8, q9, #7
- vqrshrun.s16 d9, q10, #7
-
- vmov q9, q11
- vst1.u8 {d6}, [r4], r5 ;store result
- vmov q10, q12
- vst1.u8 {d7}, [r4], r5
- vmov q11, q13
- vst1.u8 {d8}, [r4], r5
- vmov q12, q14
- vst1.u8 {d9}, [r4], r5
- vmov d26, d30
-
- bne filt_blk2d_sp8x8_loop_neon
-
- add sp, sp, #64
- pop {r4-r5,pc}
-
-;---------------------
-firstpass_filter8x8_only
- ;add r2, r12, r2, lsl #5 ;calculate filter location
- ;vld1.s32 {q14, q15}, [r2] ;load first_pass filter
- vabs.s32 q12, q14
- vabs.s32 q13, q15
-
- mov r2, #2 ;loop counter
- sub r0, r0, #2 ;move srcptr back to (line-2) and (column-2)
-
- vdup.8 d0, d24[0] ;first_pass filter (d0-d5)
- vdup.8 d1, d24[4]
- vdup.8 d2, d25[0]
- vdup.8 d3, d25[4]
- vdup.8 d4, d26[0]
- vdup.8 d5, d26[4]
-
-;First pass: output_height lines x output_width columns (8x8)
-filt_blk2d_fpo8x8_loop_neon
- vld1.u8 {q3}, [r0], r1 ;load src data
- vld1.u8 {q4}, [r0], r1
- vld1.u8 {q5}, [r0], r1
- vld1.u8 {q6}, [r0], r1
-
- pld [r0]
- pld [r0, r1]
- pld [r0, r1, lsl #1]
-
- vmull.u8 q7, d6, d0 ;(src_ptr[-2] * vp9_filter[0])
- vmull.u8 q8, d8, d0
- vmull.u8 q9, d10, d0
- vmull.u8 q10, d12, d0
-
- vext.8 d28, d6, d7, #1 ;construct src_ptr[-1]
- vext.8 d29, d8, d9, #1
- vext.8 d30, d10, d11, #1
- vext.8 d31, d12, d13, #1
-
- vmlsl.u8 q7, d28, d1 ;-(src_ptr[-1] * vp9_filter[1])
- vmlsl.u8 q8, d29, d1
- vmlsl.u8 q9, d30, d1
- vmlsl.u8 q10, d31, d1
-
- vext.8 d28, d6, d7, #4 ;construct src_ptr[2]
- vext.8 d29, d8, d9, #4
- vext.8 d30, d10, d11, #4
- vext.8 d31, d12, d13, #4
-
- vmlsl.u8 q7, d28, d4 ;-(src_ptr[2] * vp9_filter[4])
- vmlsl.u8 q8, d29, d4
- vmlsl.u8 q9, d30, d4
- vmlsl.u8 q10, d31, d4
-
- vext.8 d28, d6, d7, #2 ;construct src_ptr[0]
- vext.8 d29, d8, d9, #2
- vext.8 d30, d10, d11, #2
- vext.8 d31, d12, d13, #2
-
- vmlal.u8 q7, d28, d2 ;(src_ptr[0] * vp9_filter[2])
- vmlal.u8 q8, d29, d2
- vmlal.u8 q9, d30, d2
- vmlal.u8 q10, d31, d2
-
- vext.8 d28, d6, d7, #5 ;construct src_ptr[3]
- vext.8 d29, d8, d9, #5
- vext.8 d30, d10, d11, #5
- vext.8 d31, d12, d13, #5
-
- vmlal.u8 q7, d28, d5 ;(src_ptr[3] * vp9_filter[5])
- vmlal.u8 q8, d29, d5
- vmlal.u8 q9, d30, d5
- vmlal.u8 q10, d31, d5
-
- vext.8 d28, d6, d7, #3 ;construct src_ptr[1]
- vext.8 d29, d8, d9, #3
- vext.8 d30, d10, d11, #3
- vext.8 d31, d12, d13, #3
-
- vmull.u8 q3, d28, d3 ;(src_ptr[1] * vp9_filter[3])
- vmull.u8 q4, d29, d3
- vmull.u8 q5, d30, d3
- vmull.u8 q6, d31, d3
- ;
- vqadd.s16 q7, q3 ;sum of all (src_data*filter_parameters)
- vqadd.s16 q8, q4
- vqadd.s16 q9, q5
- vqadd.s16 q10, q6
-
- subs r2, r2, #1
-
- vqrshrun.s16 d22, q7, #7 ;shift/round/saturate to u8
- vqrshrun.s16 d23, q8, #7
- vqrshrun.s16 d24, q9, #7
- vqrshrun.s16 d25, q10, #7
-
- vst1.u8 {d22}, [r4], r5 ;store result
- vst1.u8 {d23}, [r4], r5
- vst1.u8 {d24}, [r4], r5
- vst1.u8 {d25}, [r4], r5
-
- bne filt_blk2d_fpo8x8_loop_neon
-
- pop {r4-r5,pc}
-
-;---------------------
-secondpass_filter8x8_only
- sub r0, r0, r1, lsl #1
- add r3, r12, r3, lsl #5
-
- vld1.u8 {d18}, [r0], r1 ;load src data
- vld1.s32 {q5, q6}, [r3] ;load second_pass filter
- vld1.u8 {d19}, [r0], r1
- vabs.s32 q7, q5
- vld1.u8 {d20}, [r0], r1
- vabs.s32 q8, q6
- vld1.u8 {d21}, [r0], r1
- mov r3, #2 ;loop counter
- vld1.u8 {d22}, [r0], r1
- vdup.8 d0, d14[0] ;second_pass filter parameters (d0-d5)
- vld1.u8 {d23}, [r0], r1
- vdup.8 d1, d14[4]
- vld1.u8 {d24}, [r0], r1
- vdup.8 d2, d15[0]
- vld1.u8 {d25}, [r0], r1
- vdup.8 d3, d15[4]
- vld1.u8 {d26}, [r0], r1
- vdup.8 d4, d16[0]
- vld1.u8 {d27}, [r0], r1
- vdup.8 d5, d16[4]
- vld1.u8 {d28}, [r0], r1
- vld1.u8 {d29}, [r0], r1
- vld1.u8 {d30}, [r0], r1
-
-;Second pass: 8x8
-filt_blk2d_spo8x8_loop_neon
- vmull.u8 q3, d18, d0 ;(src_ptr[-2] * vp9_filter[0])
- vmull.u8 q4, d19, d0
- vmull.u8 q5, d20, d0
- vmull.u8 q6, d21, d0
-
- vmlsl.u8 q3, d19, d1 ;-(src_ptr[-1] * vp9_filter[1])
- vmlsl.u8 q4, d20, d1
- vmlsl.u8 q5, d21, d1
- vmlsl.u8 q6, d22, d1
-
- vmlsl.u8 q3, d22, d4 ;-(src_ptr[2] * vp9_filter[4])
- vmlsl.u8 q4, d23, d4
- vmlsl.u8 q5, d24, d4
- vmlsl.u8 q6, d25, d4
-
- vmlal.u8 q3, d20, d2 ;(src_ptr[0] * vp9_filter[2])
- vmlal.u8 q4, d21, d2
- vmlal.u8 q5, d22, d2
- vmlal.u8 q6, d23, d2
-
- vmlal.u8 q3, d23, d5 ;(src_ptr[3] * vp9_filter[5])
- vmlal.u8 q4, d24, d5
- vmlal.u8 q5, d25, d5
- vmlal.u8 q6, d26, d5
-
- vmull.u8 q7, d21, d3 ;(src_ptr[1] * vp9_filter[3])
- vmull.u8 q8, d22, d3
- vmull.u8 q9, d23, d3
- vmull.u8 q10, d24, d3
-
- subs r3, r3, #1
-
- vqadd.s16 q7, q3 ;sum of all (src_data*filter_parameters)
- vqadd.s16 q8, q4
- vqadd.s16 q9, q5
- vqadd.s16 q10, q6
-
- vqrshrun.s16 d6, q7, #7 ;shift/round/saturate to u8
- vqrshrun.s16 d7, q8, #7
- vqrshrun.s16 d8, q9, #7
- vqrshrun.s16 d9, q10, #7
-
- vmov q9, q11
- vst1.u8 {d6}, [r4], r5 ;store result
- vmov q10, q12
- vst1.u8 {d7}, [r4], r5
- vmov q11, q13
- vst1.u8 {d8}, [r4], r5
- vmov q12, q14
- vst1.u8 {d9}, [r4], r5
- vmov d26, d30
-
- bne filt_blk2d_spo8x8_loop_neon
-
- pop {r4-r5,pc}
-
- ENDP
-
-;-----------------
-
- END
--- a/vp8/common/arm/recon_arm.h
+++ /dev/null
@@ -1,90 +1,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#ifndef RECON_ARM_H
-#define RECON_ARM_H
-
-#if HAVE_ARMV6
-extern prototype_recon_block(vp9_recon_b_armv6);
-extern prototype_recon_block(vp9_recon2b_armv6);
-extern prototype_recon_block(vp9_recon4b_armv6);
-
-extern prototype_copy_block(vp9_copy_mem8x8_v6);
-extern prototype_copy_block(vp9_copy_mem8x4_v6);
-extern prototype_copy_block(vp9_copy_mem16x16_v6);
-
-#if !CONFIG_RUNTIME_CPU_DETECT
-#undef vp8_recon_recon
-#define vp8_recon_recon vp9_recon_b_armv6
-
-#undef vp8_recon_recon2
-#define vp8_recon_recon2 vp9_recon2b_armv6
-
-#undef vp8_recon_recon4
-#define vp8_recon_recon4 vp9_recon4b_armv6
-
-#undef vp8_recon_copy8x8
-#define vp8_recon_copy8x8 vp9_copy_mem8x8_v6
-
-#undef vp8_recon_copy8x4
-#define vp8_recon_copy8x4 vp9_copy_mem8x4_v6
-
-#undef vp8_recon_copy16x16
-#define vp8_recon_copy16x16 vp9_copy_mem16x16_v6
-#endif
-#endif
-
-#if HAVE_ARMV7
-extern prototype_recon_block(vp9_recon_b_neon);
-extern prototype_recon_block(vp9_recon2b_neon);
-extern prototype_recon_block(vp9_recon4b_neon);
-
-extern prototype_copy_block(vp9_copy_mem8x8_neon);
-extern prototype_copy_block(vp9_copy_mem8x4_neon);
-extern prototype_copy_block(vp9_copy_mem16x16_neon);
-
-extern prototype_recon_macroblock(vp9_recon_mb_neon);
-
-extern prototype_build_intra_predictors(vp9_build_intra_predictors_mby_neon);
-extern prototype_build_intra_predictors(vp9_build_intra_predictors_mby_s_neon);
-
-#if !CONFIG_RUNTIME_CPU_DETECT
-#undef vp8_recon_recon
-#define vp8_recon_recon vp9_recon_b_neon
-
-#undef vp8_recon_recon2
-#define vp8_recon_recon2 vp9_recon2b_neon
-
-#undef vp8_recon_recon4
-#define vp8_recon_recon4 vp9_recon4b_neon
-
-#undef vp8_recon_copy8x8
-#define vp8_recon_copy8x8 vp9_copy_mem8x8_neon
-
-#undef vp8_recon_copy8x4
-#define vp8_recon_copy8x4 vp9_copy_mem8x4_neon
-
-#undef vp8_recon_copy16x16
-#define vp8_recon_copy16x16 vp9_copy_mem16x16_neon
-
-#undef vp8_recon_recon_mb
-#define vp8_recon_recon_mb vp9_recon_mb_neon
-
-#undef vp9_recon_build_intra_predictors_mby
-#define vp9_recon_build_intra_predictors_mby vp9_build_intra_predictors_mby_neon
-
-#undef vp9_recon_build_intra_predictors_mby_s
-#define vp9_recon_build_intra_predictors_mby_s vp9_build_intra_predictors_mby_s_neon
-
-#endif
-#endif
-
-#endif
--- a/vp8/common/arm/reconintra_arm.c
+++ /dev/null
@@ -1,62 +1,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include "vpx_ports/config.h"
-#include "vp8/common/blockd.h"
-#include "vp8/common/reconintra.h"
-#include "vpx_mem/vpx_mem.h"
-#include "vp8/common/recon.h"
-
-#if HAVE_ARMV7
-extern void vp9_build_intra_predictors_mby_neon_func(
- unsigned char *y_buffer,
- unsigned char *ypred_ptr,
- int y_stride,
- int mode,
- int Up,
- int Left);
-
-void vp9_build_intra_predictors_mby_neon(MACROBLOCKD *xd) {
- unsigned char *y_buffer = xd->dst.y_buffer;
- unsigned char *ypred_ptr = xd->predictor;
- int y_stride = xd->dst.y_stride;
- int mode = xd->mode_info_context->mbmi.mode;
- int Up = xd->up_available;
- int Left = xd->left_available;
-
- vp9_build_intra_predictors_mby_neon_func(y_buffer, ypred_ptr,
- y_stride, mode, Up, Left);
-}
-#endif
-
-
-#if HAVE_ARMV7
-extern void vp9_build_intra_predictors_mby_s_neon_func(
- unsigned char *y_buffer,
- unsigned char *ypred_ptr,
- int y_stride,
- int mode,
- int Up,
- int Left);
-
-void vp9_build_intra_predictors_mby_s_neon(MACROBLOCKD *xd) {
- unsigned char *y_buffer = xd->dst.y_buffer;
- unsigned char *ypred_ptr = xd->predictor;
- int y_stride = xd->dst.y_stride;
- int mode = xd->mode_info_context->mbmi.mode;
- int Up = xd->up_available;
- int Left = xd->left_available;
-
- vp9_build_intra_predictors_mby_s_neon_func(y_buffer, ypred_ptr,
- y_stride, mode, Up, Left);
-}
-
-#endif
--- a/vp8/common/arm/subpixel_arm.h
+++ /dev/null
@@ -1,89 +1,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#ifndef SUBPIXEL_ARM_H
-#define SUBPIXEL_ARM_H
-
-#if HAVE_ARMV6
-extern prototype_subpixel_predict(vp9_sixtap_predict16x16_armv6);
-extern prototype_subpixel_predict(vp9_sixtap_predict8x8_armv6);
-extern prototype_subpixel_predict(vp9_sixtap_predict8x4_armv6);
-extern prototype_subpixel_predict(vp9_sixtap_predict_armv6);
-extern prototype_subpixel_predict(vp9_bilinear_predict16x16_armv6);
-extern prototype_subpixel_predict(vp9_bilinear_predict8x8_armv6);
-extern prototype_subpixel_predict(vp9_bilinear_predict8x4_armv6);
-extern prototype_subpixel_predict(vp9_bilinear_predict4x4_armv6);
-
-#if !CONFIG_RUNTIME_CPU_DETECT
-#undef vp9_subpix_sixtap16x16
-#define vp9_subpix_sixtap16x16 vp9_sixtap_predict16x16_armv6
-
-#undef vp9_subpix_sixtap8x8
-#define vp9_subpix_sixtap8x8 vp9_sixtap_predict8x8_armv6
-
-#undef vp9_subpix_sixtap8x4
-#define vp9_subpix_sixtap8x4 vp9_sixtap_predict8x4_armv6
-
-#undef vp9_subpix_sixtap4x4
-#define vp9_subpix_sixtap4x4 vp9_sixtap_predict_armv6
-
-#undef vp9_subpix_bilinear16x16
-#define vp9_subpix_bilinear16x16 vp9_bilinear_predict16x16_armv6
-
-#undef vp9_subpix_bilinear8x8
-#define vp9_subpix_bilinear8x8 vp9_bilinear_predict8x8_armv6
-
-#undef vp9_subpix_bilinear8x4
-#define vp9_subpix_bilinear8x4 vp9_bilinear_predict8x4_armv6
-
-#undef vp9_subpix_bilinear4x4
-#define vp9_subpix_bilinear4x4 vp9_bilinear_predict4x4_armv6
-#endif
-#endif
-
-#if HAVE_ARMV7
-extern prototype_subpixel_predict(vp9_sixtap_predict16x16_neon);
-extern prototype_subpixel_predict(vp9_sixtap_predict8x8_neon);
-extern prototype_subpixel_predict(vp9_sixtap_predict8x4_neon);
-extern prototype_subpixel_predict(vp9_sixtap_predict_neon);
-extern prototype_subpixel_predict(vp9_bilinear_predict16x16_neon);
-extern prototype_subpixel_predict(vp9_bilinear_predict8x8_neon);
-extern prototype_subpixel_predict(vp9_bilinear_predict8x4_neon);
-extern prototype_subpixel_predict(vp9_bilinear_predict4x4_neon);
-
-#if !CONFIG_RUNTIME_CPU_DETECT
-#undef vp9_subpix_sixtap16x16
-#define vp9_subpix_sixtap16x16 vp9_sixtap_predict16x16_neon
-
-#undef vp9_subpix_sixtap8x8
-#define vp9_subpix_sixtap8x8 vp9_sixtap_predict8x8_neon
-
-#undef vp9_subpix_sixtap8x4
-#define vp9_subpix_sixtap8x4 vp9_sixtap_predict8x4_neon
-
-#undef vp9_subpix_sixtap4x4
-#define vp9_subpix_sixtap4x4 vp9_sixtap_predict_neon
-
-#undef vp9_subpix_bilinear16x16
-#define vp9_subpix_bilinear16x16 vp9_bilinear_predict16x16_neon
-
-#undef vp9_subpix_bilinear8x8
-#define vp9_subpix_bilinear8x8 vp9_bilinear_predict8x8_neon
-
-#undef vp9_subpix_bilinear8x4
-#define vp9_subpix_bilinear8x4 vp9_bilinear_predict8x4_neon
-
-#undef vp9_subpix_bilinear4x4
-#define vp9_subpix_bilinear4x4 vp9_bilinear_predict4x4_neon
-#endif
-#endif
-
-#endif
--- a/vp8/common/asm_com_offsets.c
+++ /dev/null
@@ -1,40 +1,0 @@
-/*
- * Copyright (c) 2011 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include "vpx_config.h"
-#include "vpx/vpx_codec.h"
-#include "vpx_ports/asm_offsets.h"
-#include "vpx_scale/yv12config.h"
-
-BEGIN
-
-/* vpx_scale */
-DEFINE(yv12_buffer_config_y_width, offsetof(YV12_BUFFER_CONFIG, y_width));
-DEFINE(yv12_buffer_config_y_height, offsetof(YV12_BUFFER_CONFIG, y_height));
-DEFINE(yv12_buffer_config_y_stride, offsetof(YV12_BUFFER_CONFIG, y_stride));
-DEFINE(yv12_buffer_config_uv_width, offsetof(YV12_BUFFER_CONFIG, uv_width));
-DEFINE(yv12_buffer_config_uv_height, offsetof(YV12_BUFFER_CONFIG, uv_height));
-DEFINE(yv12_buffer_config_uv_stride, offsetof(YV12_BUFFER_CONFIG, uv_stride));
-DEFINE(yv12_buffer_config_y_buffer, offsetof(YV12_BUFFER_CONFIG, y_buffer));
-DEFINE(yv12_buffer_config_u_buffer, offsetof(YV12_BUFFER_CONFIG, u_buffer));
-DEFINE(yv12_buffer_config_v_buffer, offsetof(YV12_BUFFER_CONFIG, v_buffer));
-DEFINE(yv12_buffer_config_border, offsetof(YV12_BUFFER_CONFIG, border));
-DEFINE(VP8BORDERINPIXELS_VAL, VP8BORDERINPIXELS);
-
-END
-
-/* add asserts for any offset that is not supported by assembly code */
-/* add asserts for any size that is not supported by assembly code */
-
-#if HAVE_ARMV7
-/* vp8_yv12_extend_frame_borders_neon makes several assumptions based on this */
-ct_assert(VP8BORDERINPIXELS_VAL, VP8BORDERINPIXELS == 32)
-#endif
--- a/vp8/common/blockd.c
+++ /dev/null
@@ -1,29 +1,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include "blockd.h"
-#include "vpx_mem/vpx_mem.h"
-
-
-const unsigned char vp9_block2left[25] = {
- 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8
-};
-const unsigned char vp9_block2above[25] = {
- 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8
-};
-
-const unsigned char vp9_block2left_8x8[25] = {
- 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 4, 4, 4, 4, 6, 6, 6, 6, 8
-};
-const unsigned char vp9_block2above_8x8[25] = {
- 0, 0, 0, 0, 2, 2, 2, 2, 0, 0, 0, 0, 2, 2, 2, 2, 4, 4, 4, 4, 6, 6, 6, 6, 8
-};
-
--- a/vp8/common/blockd.h
+++ /dev/null
@@ -1,518 +1,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#ifndef __INC_BLOCKD_H
-#define __INC_BLOCKD_H
-
-void vpx_log(const char *format, ...);
-
-#include "vpx_ports/config.h"
-#include "vpx_scale/yv12config.h"
-#include "mv.h"
-#include "treecoder.h"
-#include "subpixel.h"
-#include "vpx_ports/mem.h"
-#include "common.h"
-
-#define TRUE 1
-#define FALSE 0
-
-// #define MODE_STATS
-
-/*#define DCPRED 1*/
-#define DCPREDSIMTHRESH 0
-#define DCPREDCNTTHRESH 3
-
-#define MB_FEATURE_TREE_PROBS 3
-#define PREDICTION_PROBS 3
-
-#define MBSKIP_CONTEXTS 3
-
-#define MAX_MB_SEGMENTS 4
-
-#define MAX_REF_LF_DELTAS 4
-#define MAX_MODE_LF_DELTAS 4
-
-/* Segment Feature Masks */
-#define SEGMENT_DELTADATA 0
-#define SEGMENT_ABSDATA 1
-#if CONFIG_NEWBESTREFMV || CONFIG_NEW_MVREF
-#define MAX_MV_REFS 19
-#endif
-
-typedef struct {
- int r, c;
-} POS;
-
-typedef enum PlaneType {
- PLANE_TYPE_Y_NO_DC = 0,
- PLANE_TYPE_Y2,
- PLANE_TYPE_UV,
- PLANE_TYPE_Y_WITH_DC,
-} PLANE_TYPE;
-
-typedef char ENTROPY_CONTEXT;
-typedef struct {
- ENTROPY_CONTEXT y1[4];
- ENTROPY_CONTEXT u[2];
- ENTROPY_CONTEXT v[2];
- ENTROPY_CONTEXT y2;
-} ENTROPY_CONTEXT_PLANES;
-
-extern const unsigned char vp9_block2left[25];
-extern const unsigned char vp9_block2above[25];
-extern const unsigned char vp9_block2left_8x8[25];
-extern const unsigned char vp9_block2above_8x8[25];
-
-#define VP9_COMBINEENTROPYCONTEXTS( Dest, A, B) \
- Dest = ((A)!=0) + ((B)!=0);
-
-typedef enum {
- KEY_FRAME = 0,
- INTER_FRAME = 1
-} FRAME_TYPE;
-
-typedef enum
-{
- SIXTAP = 0,
- BILINEAR = 1,
- EIGHTTAP = 2,
- EIGHTTAP_SHARP = 3,
- SWITCHABLE /* should be the last one */
-} INTERPOLATIONFILTERTYPE;
-
-typedef enum
-{
- DC_PRED, /* average of above and left pixels */
- V_PRED, /* vertical prediction */
- H_PRED, /* horizontal prediction */
- D45_PRED, /* Directional 45 deg prediction [anti-clockwise from 0 deg hor] */
- D135_PRED, /* Directional 135 deg prediction [anti-clockwise from 0 deg hor] */
- D117_PRED, /* Directional 112 deg prediction [anti-clockwise from 0 deg hor] */
- D153_PRED, /* Directional 157 deg prediction [anti-clockwise from 0 deg hor] */
- D27_PRED, /* Directional 22 deg prediction [anti-clockwise from 0 deg hor] */
- D63_PRED, /* Directional 67 deg prediction [anti-clockwise from 0 deg hor] */
- TM_PRED, /* Truemotion prediction */
- I8X8_PRED, /* 8x8 based prediction, each 8x8 has its own prediction mode */
- B_PRED, /* block based prediction, each block has its own prediction mode */
-
- NEARESTMV,
- NEARMV,
- ZEROMV,
- NEWMV,
- SPLITMV,
-
- MB_MODE_COUNT
-} MB_PREDICTION_MODE;
-
-// Segment level features.
-typedef enum {
- SEG_LVL_ALT_Q = 0, // Use alternate Quantizer ....
- SEG_LVL_ALT_LF = 1, // Use alternate loop filter value...
- SEG_LVL_REF_FRAME = 2, // Optional Segment reference frame
- SEG_LVL_MODE = 3, // Optional Segment mode
- SEG_LVL_EOB = 4, // EOB end stop marker.
- SEG_LVL_TRANSFORM = 5, // Block transform size.
- SEG_LVL_MAX = 6 // Number of MB level features supported
-
-} SEG_LVL_FEATURES;
-
-// Block transform sizes.
-typedef enum {
- TX_4X4, // 4x4 dct transform
- TX_8X8, // 8x8 dct transform
- TX_16X16, // 16x16 dct transform
- TX_SIZE_MAX // Number of different transforms available
-} TX_SIZE;
-
-typedef enum {
- DCT_DCT = 0, // DCT in both horizontal and vertical
- ADST_DCT = 1, // ADST in vertical, DCT in horizontal
- DCT_ADST = 2, // DCT in vertical, ADST in horizontal
- ADST_ADST = 3 // ADST in both directions
-} TX_TYPE;
-
-#define VP9_YMODES (B_PRED + 1)
-#define VP9_UV_MODES (TM_PRED + 1)
-#define VP9_I8X8_MODES (TM_PRED + 1)
-#define VP9_I32X32_MODES (TM_PRED + 1)
-
-#define VP9_MVREFS (1 + SPLITMV - NEARESTMV)
-
-typedef enum {
- B_DC_PRED, /* average of above and left pixels */
- B_TM_PRED,
-
- B_VE_PRED, /* vertical prediction */
- B_HE_PRED, /* horizontal prediction */
-
- B_LD_PRED,
- B_RD_PRED,
-
- B_VR_PRED,
- B_VL_PRED,
- B_HD_PRED,
- B_HU_PRED,
-
- LEFT4X4,
- ABOVE4X4,
- ZERO4X4,
- NEW4X4,
-
- B_MODE_COUNT
-} B_PREDICTION_MODE;
-
-#define VP9_BINTRAMODES (B_HU_PRED + 1) /* 10 */
-#define VP9_SUBMVREFS (1 + NEW4X4 - LEFT4X4)
-
-typedef enum {
- PARTITIONING_16X8 = 0,
- PARTITIONING_8X16,
- PARTITIONING_8X8,
- PARTITIONING_4X4,
- NB_PARTITIONINGS,
-} SPLITMV_PARTITIONING_TYPE;
-
-/* For keyframes, intra block modes are predicted by the (already decoded)
- modes for the Y blocks to the left and above us; for interframes, there
- is a single probability table. */
-
-union b_mode_info {
- struct {
- B_PREDICTION_MODE first;
- TX_TYPE tx_type;
-
-#if CONFIG_COMP_INTRA_PRED
- B_PREDICTION_MODE second;
-#endif
- } as_mode;
- struct {
- int_mv first;
- int_mv second;
- } as_mv;
-};
-
-typedef enum {
- INTRA_FRAME = 0,
- LAST_FRAME = 1,
- GOLDEN_FRAME = 2,
- ALTREF_FRAME = 3,
- MAX_REF_FRAMES = 4
-} MV_REFERENCE_FRAME;
-
-typedef struct {
- MB_PREDICTION_MODE mode, uv_mode;
-#if CONFIG_COMP_INTRA_PRED
- MB_PREDICTION_MODE second_mode, second_uv_mode;
-#endif
- MV_REFERENCE_FRAME ref_frame, second_ref_frame;
- TX_SIZE txfm_size;
- int_mv mv[2]; // for each reference frame used
-#if CONFIG_NEWBESTREFMV || CONFIG_NEW_MVREF
- int_mv ref_mvs[MAX_REF_FRAMES][MAX_MV_REFS];
-#endif
-
- SPLITMV_PARTITIONING_TYPE partitioning;
- unsigned char mb_skip_coeff; /* does this mb have coefficients at all? 1 = no coefficients, 0 = tokens need to be decoded */
- unsigned char need_to_clamp_mvs;
- unsigned char need_to_clamp_secondmv;
- unsigned char segment_id; /* Which set of segmentation parameters should be used for this MB */
-
- // Flags used for the prediction status of various bitstream signals
- unsigned char seg_id_predicted;
- unsigned char ref_predicted;
-
- // Indicates if the mb is part of the image (1) vs border (0)
- // This can be useful in determining whether the MB provides
- // a valid predictor
- unsigned char mb_in_image;
-
-#if CONFIG_PRED_FILTER
- // Flag to turn the prediction signal filter on (1) / off (0) at the MB level
- unsigned int pred_filter_enabled;
-#endif
- INTERPOLATIONFILTERTYPE interp_filter;
-
-#if CONFIG_SUPERBLOCKS
- // FIXME need a SB array of 4 MB_MODE_INFOs that
- // only needs one encoded_as_sb.
- unsigned char encoded_as_sb;
-#endif
-} MB_MODE_INFO;
-
-typedef struct {
- MB_MODE_INFO mbmi;
- union b_mode_info bmi[16];
-} MODE_INFO;
-
-typedef struct blockd {
- short *qcoeff;
- short *dqcoeff;
- unsigned char *predictor;
- short *diff;
- short *dequant;
-
- /* 16 Y blocks, 4 U blocks, 4 V blocks each with 16 entries */
- unsigned char **base_pre;
- unsigned char **base_second_pre;
- int pre;
- int pre_stride;
-
- unsigned char **base_dst;
- int dst;
- int dst_stride;
-
- int eob;
-
- union b_mode_info bmi;
-} BLOCKD;
-
-typedef struct macroblockd {
- DECLARE_ALIGNED(16, short, diff[400]); /* from idct diff */
- DECLARE_ALIGNED(16, unsigned char, predictor[384]);
- DECLARE_ALIGNED(16, short, qcoeff[400]);
- DECLARE_ALIGNED(16, short, dqcoeff[400]);
- DECLARE_ALIGNED(16, char, eobs[25]);
-
- /* 16 Y blocks, 4 U, 4 V, 1 DC 2nd order block, each with 16 entries. */
- BLOCKD block[25];
- int fullpixel_mask;
-
- YV12_BUFFER_CONFIG pre; /* Filtered copy of previous frame reconstruction */
- struct {
- uint8_t *y_buffer, *u_buffer, *v_buffer;
- } second_pre;
- YV12_BUFFER_CONFIG dst;
-
- MODE_INFO *prev_mode_info_context;
- MODE_INFO *mode_info_context;
- int mode_info_stride;
-
- FRAME_TYPE frame_type;
-
- int up_available;
- int left_available;
-
- /* Y,U,V,Y2 */
- ENTROPY_CONTEXT_PLANES *above_context;
- ENTROPY_CONTEXT_PLANES *left_context;
-
- /* 0 indicates segmentation at MB level is not enabled. Otherwise the individual bits indicate which features are active. */
- unsigned char segmentation_enabled;
-
- /* 0 (do not update) 1 (update) the macroblock segmentation map. */
- unsigned char update_mb_segmentation_map;
-
- /* 0 (do not update) 1 (update) the macroblock segmentation feature data. */
- unsigned char update_mb_segmentation_data;
-
- /* 0 (SEGMENT_DELTADATA): segment feature data is delta coded; 1 (SEGMENT_ABSDATA): absolute values. */
- unsigned char mb_segment_abs_delta;
-
- /* Per-frame flags that define which MB-level features (such as quantizer or loop filter level) */
- /* are enabled and, when enabled, the probabilities used to decode the per-MB flags in MB_MODE_INFO */
-
- // Probability Tree used to code Segment number
- vp9_prob mb_segment_tree_probs[MB_FEATURE_TREE_PROBS];
-
-#if CONFIG_NEW_MVREF
- vp9_prob mb_mv_ref_id_probs[MAX_REF_FRAMES][3];
-#endif
-
- // Segment features
- signed char segment_feature_data[MAX_MB_SEGMENTS][SEG_LVL_MAX];
- unsigned int segment_feature_mask[MAX_MB_SEGMENTS];
-
- /* mode_based Loop filter adjustment */
- unsigned char mode_ref_lf_delta_enabled;
- unsigned char mode_ref_lf_delta_update;
-
- /* Delta values have the range +/- MAX_LOOP_FILTER */
- signed char last_ref_lf_deltas[MAX_REF_LF_DELTAS]; /* 0 = Intra, Last, GF, ARF */
- signed char ref_lf_deltas[MAX_REF_LF_DELTAS]; /* 0 = Intra, Last, GF, ARF */
- signed char last_mode_lf_deltas[MAX_MODE_LF_DELTAS]; /* 0 = BPRED, ZERO_MV, MV, SPLIT */
- signed char mode_lf_deltas[MAX_MODE_LF_DELTAS]; /* 0 = BPRED, ZERO_MV, MV, SPLIT */
-
- /* Distance of MB away from frame edges */
- int mb_to_left_edge;
- int mb_to_right_edge;
- int mb_to_top_edge;
- int mb_to_bottom_edge;
-
- unsigned int frames_since_golden;
- unsigned int frames_till_alt_ref_frame;
- vp9_subpix_fn_t subpixel_predict;
- vp9_subpix_fn_t subpixel_predict8x4;
- vp9_subpix_fn_t subpixel_predict8x8;
- vp9_subpix_fn_t subpixel_predict16x16;
- vp9_subpix_fn_t subpixel_predict_avg;
- vp9_subpix_fn_t subpixel_predict_avg8x4;
- vp9_subpix_fn_t subpixel_predict_avg8x8;
- vp9_subpix_fn_t subpixel_predict_avg16x16;
- int allow_high_precision_mv;
-
- int corrupted;
-
-#if !CONFIG_SUPERBLOCKS && (ARCH_X86 || ARCH_X86_64)
- /* This is an intermediate buffer currently used in sub-pixel motion search
- * to keep a copy of the reference area. This buffer can also be used for
- * other purposes.
- */
- DECLARE_ALIGNED(32, unsigned char, y_buf[22 * 32]);
-#endif
-
-#if CONFIG_RUNTIME_CPU_DETECT
- struct VP9_COMMON_RTCD *rtcd;
-#endif
-
- int mb_index; // Index of the MB in the SB (0..3)
- int q_index;
-
-} MACROBLOCKD;
-
-#define ACTIVE_HT 110 // quantization stepsize threshold
-
-#define ACTIVE_HT8 300
-
-#define ACTIVE_HT16 300
-
-// convert MB_PREDICTION_MODE to B_PREDICTION_MODE
-static B_PREDICTION_MODE pred_mode_conv(MB_PREDICTION_MODE mode) {
- B_PREDICTION_MODE b_mode;
- switch (mode) {
- case DC_PRED:
- b_mode = B_DC_PRED;
- break;
- case V_PRED:
- b_mode = B_VE_PRED;
- break;
- case H_PRED:
- b_mode = B_HE_PRED;
- break;
- case TM_PRED:
- b_mode = B_TM_PRED;
- break;
- case D45_PRED:
- b_mode = B_LD_PRED;
- break;
- case D135_PRED:
- b_mode = B_RD_PRED;
- break;
- case D117_PRED:
- b_mode = B_VR_PRED;
- break;
- case D153_PRED:
- b_mode = B_HD_PRED;
- break;
- case D27_PRED:
- b_mode = B_HU_PRED;
- break;
- case D63_PRED:
- b_mode = B_VL_PRED;
- break;
- default :
- // for debugging purposes; to be removed after full testing
- assert(0);
- break;
- }
- return b_mode;
-}
-
-// transform mapping
-static TX_TYPE txfm_map(B_PREDICTION_MODE bmode) {
- // map transform type
- TX_TYPE tx_type;
- switch (bmode) {
- case B_TM_PRED :
- case B_RD_PRED :
- tx_type = ADST_ADST;
- break;
-
- case B_VE_PRED :
- case B_VR_PRED :
- tx_type = ADST_DCT;
- break;
-
- case B_HE_PRED :
- case B_HD_PRED :
- case B_HU_PRED :
- tx_type = DCT_ADST;
- break;
-
- default :
- tx_type = DCT_DCT;
- break;
- }
- return tx_type;
-}
-
-static TX_TYPE get_tx_type_4x4(const MACROBLOCKD *xd, const BLOCKD *b) {
- TX_TYPE tx_type = DCT_DCT;
- if (xd->mode_info_context->mbmi.mode == B_PRED &&
- xd->q_index < ACTIVE_HT) {
- tx_type = txfm_map(b->bmi.as_mode.first);
- }
- return tx_type;
-}
-
-static TX_TYPE get_tx_type_8x8(const MACROBLOCKD *xd, const BLOCKD *b) {
- TX_TYPE tx_type = DCT_DCT;
- if (xd->mode_info_context->mbmi.mode == I8X8_PRED &&
- xd->q_index < ACTIVE_HT8) {
- tx_type = txfm_map(pred_mode_conv(b->bmi.as_mode.first));
- }
- return tx_type;
-}
-
-static TX_TYPE get_tx_type_16x16(const MACROBLOCKD *xd, const BLOCKD *b) {
- TX_TYPE tx_type = DCT_DCT;
- if (xd->mode_info_context->mbmi.mode < I8X8_PRED &&
- xd->q_index < ACTIVE_HT16) {
- tx_type = txfm_map(pred_mode_conv(xd->mode_info_context->mbmi.mode));
- }
- return tx_type;
-}
-
-static TX_TYPE get_tx_type(const MACROBLOCKD *xd, const BLOCKD *b) {
- TX_TYPE tx_type = DCT_DCT;
- int ib = (b - xd->block);
- if (ib >= 16)
- return tx_type;
- if (xd->mode_info_context->mbmi.txfm_size == TX_16X16) {
- tx_type = get_tx_type_16x16(xd, b);
- }
- if (xd->mode_info_context->mbmi.txfm_size == TX_8X8) {
- ib = (ib & 8) + ((ib & 4) >> 1);
- tx_type = get_tx_type_8x8(xd, &xd->block[ib]);
- }
- if (xd->mode_info_context->mbmi.txfm_size == TX_4X4) {
- tx_type = get_tx_type_4x4(xd, b);
- }
- return tx_type;
-}
-
-extern void vp9_build_block_doffsets(MACROBLOCKD *xd);
-extern void vp9_setup_block_dptrs(MACROBLOCKD *xd);
-
-static void update_blockd_bmi(MACROBLOCKD *xd) {
- int i;
- int is_4x4;
- is_4x4 = (xd->mode_info_context->mbmi.mode == SPLITMV) ||
- (xd->mode_info_context->mbmi.mode == I8X8_PRED) ||
- (xd->mode_info_context->mbmi.mode == B_PRED);
-
- if (is_4x4) {
- for (i = 0; i < 16; i++) {
- xd->block[i].bmi = xd->mode_info_context->bmi[i];
- }
- }
-}
-#endif /* __INC_BLOCKD_H */
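/* Editorial sketch, not part of the diff: the get_tx_type_{4x4,8x8,16x16}
 * helpers removed above share one pattern -- an ADST-based transform chosen by
 * txfm_map() is used only for intra-coded blocks whose quantizer index is
 * below the ACTIVE_HT* threshold for that size; everything else stays DCT_DCT.
 * The standalone model below illustrates that decision; the names and the
 * driver function here are hypothetical stand-ins, not part of the removed
 * header. */
#include <stdio.h>

enum sketch_tx_type { SKETCH_DCT_DCT, SKETCH_ADST_DCT, SKETCH_DCT_ADST, SKETCH_ADST_ADST };

/* Mirrors the 4x4 threshold (ACTIVE_HT == 110) from the removed header. */
#define SKETCH_ACTIVE_HT 110

static enum sketch_tx_type sketch_pick_tx_type(int is_intra_block_mode,
                                               int q_index,
                                               enum sketch_tx_type mapped) {
  /* Use the mode-derived transform only for intra block modes at low q. */
  return (is_intra_block_mode && q_index < SKETCH_ACTIVE_HT) ? mapped
                                                             : SKETCH_DCT_DCT;
}

int main(void) {
  /* In the removed code, txfm_map(B_VE_PRED) == ADST_DCT; it is applied only
   * when q_index is below the threshold. */
  printf("%d\n", sketch_pick_tx_type(1, 60, SKETCH_ADST_DCT));   /* 1: ADST_DCT */
  printf("%d\n", sketch_pick_tx_type(1, 200, SKETCH_ADST_DCT));  /* 0: DCT_DCT  */
  return 0;
}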
--- a/vp8/common/coefupdateprobs.h
+++ /dev/null
@@ -1,16 +1,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-
-/* Update probabilities for the nodes in the token entropy tree.
- Generated file included by entropy.c */
-#define COEF_UPDATE_PROB 252
-#define COEF_UPDATE_PROB_8X8 252
-#define COEF_UPDATE_PROB_16X16 252
--- a/vp8/common/common.h
+++ /dev/null
@@ -1,41 +1,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#ifndef common_h
-#define common_h 1
-
-#include <assert.h>
-#include "vpx_config.h"
-/* Interface header for common constant data structures and lookup tables */
-
-#include "vpx_mem/vpx_mem.h"
-
-#include "common_types.h"
-
-/* Only need this for fixed-size arrays, for structs just assign. */
-
-#define vp9_copy( Dest, Src) { \
- assert( sizeof( Dest) == sizeof( Src)); \
- vpx_memcpy( Dest, Src, sizeof( Src)); \
- }
-
-/* Use this for variably-sized arrays. */
-
-#define vp9_copy_array( Dest, Src, N) { \
- assert( sizeof( *Dest) == sizeof( *Src)); \
- vpx_memcpy( Dest, Src, N * sizeof( *Src)); \
- }
-
-#define vp9_zero( Dest) vpx_memset( &Dest, 0, sizeof( Dest));
-
-#define vp9_zero_array( Dest, N) vpx_memset( Dest, 0, N * sizeof( *Dest));
-
-#endif /* common_h */
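/* Editorial sketch, not part of the diff: the vp9_copy / vp9_zero helpers
 * removed above are thin wrappers over vpx_memcpy / vpx_memset with a size
 * assert so that only whole fixed-size objects are copied. A self-contained
 * illustration of the same idiom using the standard library (the names below
 * are hypothetical, not from the removed header): */
#include <assert.h>
#include <string.h>

#define sketch_copy(dst, src) do {              \
    assert(sizeof(dst) == sizeof(src));         \
    memcpy(&(dst), &(src), sizeof(src));        \
  } while (0)

#define sketch_zero(dst) memset(&(dst), 0, sizeof(dst))

int main(void) {
  int src[4] = {1, 2, 3, 4};
  int dst[4];
  sketch_copy(dst, src);   /* whole-array copy; matching sizes are asserted */
  sketch_zero(dst);        /* zero the whole object in place */
  return dst[0];           /* 0 */
}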
--- a/vp8/common/common_types.h
+++ /dev/null
@@ -1,18 +1,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#ifndef __INC_COMMON_TYPES
-#define __INC_COMMON_TYPES
-
-#define TRUE 1
-#define FALSE 0
-
-#endif
--- a/vp8/common/context.c
+++ /dev/null
@@ -1,397 +1,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include "entropy.h"
-
-/* *** GENERATED FILE: DO NOT EDIT *** */
-
-#if 0
-int Contexts[vp8_coef_counter_dimen];
-
-const int default_contexts[vp8_coef_counter_dimen] = {
- {
- // Block Type ( 0 )
- {
- // Coeff Band ( 0 )
- { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
- { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
- { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
- },
- {
- // Coeff Band ( 1 )
- {30190, 26544, 225, 24, 4, 0, 0, 0, 0, 0, 0, 4171593},
- {26846, 25157, 1241, 130, 26, 6, 1, 0, 0, 0, 0, 149987},
- {10484, 9538, 1006, 160, 36, 18, 0, 0, 0, 0, 0, 15104},
- },
- {
- // Coeff Band ( 2 )
- {25842, 40456, 1126, 83, 11, 2, 0, 0, 0, 0, 0, 0},
- {9338, 8010, 512, 73, 7, 3, 2, 0, 0, 0, 0, 43294},
- {1047, 751, 149, 31, 13, 6, 1, 0, 0, 0, 0, 879},
- },
- {
- // Coeff Band ( 3 )
- {26136, 9826, 252, 13, 0, 0, 0, 0, 0, 0, 0, 0},
- {8134, 5574, 191, 14, 2, 0, 0, 0, 0, 0, 0, 35302},
- { 605, 677, 116, 9, 1, 0, 0, 0, 0, 0, 0, 611},
- },
- {
- // Coeff Band ( 4 )
- {10263, 15463, 283, 17, 0, 0, 0, 0, 0, 0, 0, 0},
- {2773, 2191, 128, 9, 2, 2, 0, 0, 0, 0, 0, 10073},
- { 134, 125, 32, 4, 0, 2, 0, 0, 0, 0, 0, 50},
- },
- {
- // Coeff Band ( 5 )
- {10483, 2663, 23, 1, 0, 0, 0, 0, 0, 0, 0, 0},
- {2137, 1251, 27, 1, 1, 0, 0, 0, 0, 0, 0, 14362},
- { 116, 156, 14, 2, 1, 0, 0, 0, 0, 0, 0, 190},
- },
- {
- // Coeff Band ( 6 )
- {40977, 27614, 412, 28, 0, 0, 0, 0, 0, 0, 0, 0},
- {6113, 5213, 261, 22, 3, 0, 0, 0, 0, 0, 0, 26164},
- { 382, 312, 50, 14, 2, 0, 0, 0, 0, 0, 0, 345},
- },
- {
- // Coeff Band ( 7 )
- { 0, 26, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
- { 0, 13, 0, 0, 0, 0, 0, 0, 0, 0, 0, 319},
- { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 8},
- },
- },
- {
- // Block Type ( 1 )
- {
- // Coeff Band ( 0 )
- {3268, 19382, 1043, 250, 93, 82, 49, 26, 17, 8, 25, 82289},
- {8758, 32110, 5436, 1832, 827, 668, 420, 153, 24, 0, 3, 52914},
- {9337, 23725, 8487, 3954, 2107, 1836, 1069, 399, 59, 0, 0, 18620},
- },
- {
- // Coeff Band ( 1 )
- {12419, 8420, 452, 62, 9, 1, 0, 0, 0, 0, 0, 0},
- {11715, 8705, 693, 92, 15, 7, 2, 0, 0, 0, 0, 53988},
- {7603, 8585, 2306, 778, 270, 145, 39, 5, 0, 0, 0, 9136},
- },
- {
- // Coeff Band ( 2 )
- {15938, 14335, 1207, 184, 55, 13, 4, 1, 0, 0, 0, 0},
- {7415, 6829, 1138, 244, 71, 26, 7, 0, 0, 0, 0, 9980},
- {1580, 1824, 655, 241, 89, 46, 10, 2, 0, 0, 0, 429},
- },
- {
- // Coeff Band ( 3 )
- {19453, 5260, 201, 19, 0, 0, 0, 0, 0, 0, 0, 0},
- {9173, 3758, 213, 22, 1, 1, 0, 0, 0, 0, 0, 9820},
- {1689, 1277, 276, 51, 17, 4, 0, 0, 0, 0, 0, 679},
- },
- {
- // Coeff Band ( 4 )
- {12076, 10667, 620, 85, 19, 9, 5, 0, 0, 0, 0, 0},
- {4665, 3625, 423, 55, 19, 9, 0, 0, 0, 0, 0, 5127},
- { 415, 440, 143, 34, 20, 7, 2, 0, 0, 0, 0, 101},
- },
- {
- // Coeff Band ( 5 )
- {12183, 4846, 115, 11, 1, 0, 0, 0, 0, 0, 0, 0},
- {4226, 3149, 177, 21, 2, 0, 0, 0, 0, 0, 0, 7157},
- { 375, 621, 189, 51, 11, 4, 1, 0, 0, 0, 0, 198},
- },
- {
- // Coeff Band ( 6 )
- {61658, 37743, 1203, 94, 10, 3, 0, 0, 0, 0, 0, 0},
- {15514, 11563, 903, 111, 14, 5, 0, 0, 0, 0, 0, 25195},
- { 929, 1077, 291, 78, 14, 7, 1, 0, 0, 0, 0, 507},
- },
- {
- // Coeff Band ( 7 )
- { 0, 990, 15, 3, 0, 0, 0, 0, 0, 0, 0, 0},
- { 0, 412, 13, 0, 0, 0, 0, 0, 0, 0, 0, 1641},
- { 0, 18, 7, 1, 0, 0, 0, 0, 0, 0, 0, 30},
- },
- },
- {
- // Block Type ( 2 )
- {
- // Coeff Band ( 0 )
- { 953, 24519, 628, 120, 28, 12, 4, 0, 0, 0, 0, 2248798},
- {1525, 25654, 2647, 617, 239, 143, 42, 5, 0, 0, 0, 66837},
- {1180, 11011, 3001, 1237, 532, 448, 239, 54, 5, 0, 0, 7122},
- },
- {
- // Coeff Band ( 1 )
- {1356, 2220, 67, 10, 4, 1, 0, 0, 0, 0, 0, 0},
- {1450, 2544, 102, 18, 4, 3, 0, 0, 0, 0, 0, 57063},
- {1182, 2110, 470, 130, 41, 21, 0, 0, 0, 0, 0, 6047},
- },
- {
- // Coeff Band ( 2 )
- { 370, 3378, 200, 30, 5, 4, 1, 0, 0, 0, 0, 0},
- { 293, 1006, 131, 29, 11, 0, 0, 0, 0, 0, 0, 5404},
- { 114, 387, 98, 23, 4, 8, 1, 0, 0, 0, 0, 236},
- },
- {
- // Coeff Band ( 3 )
- { 579, 194, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0},
- { 395, 213, 5, 1, 0, 0, 0, 0, 0, 0, 0, 4157},
- { 119, 122, 4, 0, 0, 0, 0, 0, 0, 0, 0, 300},
- },
- {
- // Coeff Band ( 4 )
- { 38, 557, 19, 0, 0, 0, 0, 0, 0, 0, 0, 0},
- { 21, 114, 12, 1, 0, 0, 0, 0, 0, 0, 0, 427},
- { 0, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 7},
- },
- {
- // Coeff Band ( 5 )
- { 52, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
- { 18, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 652},
- { 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 30},
- },
- {
- // Coeff Band ( 6 )
- { 640, 569, 10, 0, 0, 0, 0, 0, 0, 0, 0, 0},
- { 25, 77, 2, 0, 0, 0, 0, 0, 0, 0, 0, 517},
- { 4, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3},
- },
- {
- // Coeff Band ( 7 )
- { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
- { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
- { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
- },
- },
- {
- // Block Type ( 3 )
- {
- // Coeff Band ( 0 )
- {2506, 20161, 2707, 767, 261, 178, 107, 30, 14, 3, 0, 100694},
- {8806, 36478, 8817, 3268, 1280, 850, 401, 114, 42, 0, 0, 58572},
- {11003, 27214, 11798, 5716, 2482, 2072, 1048, 175, 32, 0, 0, 19284},
- },
- {
- // Coeff Band ( 1 )
- {9738, 11313, 959, 205, 70, 18, 11, 1, 0, 0, 0, 0},
- {12628, 15085, 1507, 273, 52, 19, 9, 0, 0, 0, 0, 54280},
- {10701, 15846, 5561, 1926, 813, 570, 249, 36, 0, 0, 0, 6460},
- },
- {
- // Coeff Band ( 2 )
- {6781, 22539, 2784, 634, 182, 123, 20, 4, 0, 0, 0, 0},
- {6263, 11544, 2649, 790, 259, 168, 27, 5, 0, 0, 0, 20539},
- {3109, 4075, 2031, 896, 457, 386, 158, 29, 0, 0, 0, 1138},
- },
- {
- // Coeff Band ( 3 )
- {11515, 4079, 465, 73, 5, 14, 2, 0, 0, 0, 0, 0},
- {9361, 5834, 650, 96, 24, 8, 4, 0, 0, 0, 0, 22181},
- {4343, 3974, 1360, 415, 132, 96, 14, 1, 0, 0, 0, 1267},
- },
- {
- // Coeff Band ( 4 )
- {4787, 9297, 823, 168, 44, 12, 4, 0, 0, 0, 0, 0},
- {3619, 4472, 719, 198, 60, 31, 3, 0, 0, 0, 0, 8401},
- {1157, 1175, 483, 182, 88, 31, 8, 0, 0, 0, 0, 268},
- },
- {
- // Coeff Band ( 5 )
- {8299, 1226, 32, 5, 1, 0, 0, 0, 0, 0, 0, 0},
- {3502, 1568, 57, 4, 1, 1, 0, 0, 0, 0, 0, 9811},
- {1055, 1070, 166, 29, 6, 1, 0, 0, 0, 0, 0, 527},
- },
- {
- // Coeff Band ( 6 )
- {27414, 27927, 1989, 347, 69, 26, 0, 0, 0, 0, 0, 0},
- {5876, 10074, 1574, 341, 91, 24, 4, 0, 0, 0, 0, 21954},
- {1571, 2171, 778, 324, 124, 65, 16, 0, 0, 0, 0, 979},
- },
- {
- // Coeff Band ( 7 )
- { 0, 29, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
- { 0, 23, 0, 0, 0, 0, 0, 0, 0, 0, 0, 459},
- { 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 13},
- },
- },
-};
-
-// Update probabilities for the nodes in the token entropy tree.
-const vp9_prob tree_update_probs[vp9_coef_tree_dimen] = {
- {
- {
- {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
- {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
- {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
- },
- {
- {176, 246, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
- {223, 241, 252, 255, 255, 255, 255, 255, 255, 255, 255, },
- {249, 253, 253, 255, 255, 255, 255, 255, 255, 255, 255, },
- },
- {
- {255, 244, 252, 255, 255, 255, 255, 255, 255, 255, 255, },
- {234, 254, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
- {253, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
- },
- {
- {255, 246, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
- {239, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
- {254, 255, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
- },
- {
- {255, 248, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
- {251, 255, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
- {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
- },
- {
- {255, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
- {251, 254, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
- {254, 255, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
- },
- {
- {255, 254, 253, 255, 254, 255, 255, 255, 255, 255, 255, },
- {250, 255, 254, 255, 254, 255, 255, 255, 255, 255, 255, },
- {254, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
- },
- {
- {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
- {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
- {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
- },
- },
- {
- {
- {217, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
- {225, 252, 241, 253, 255, 255, 254, 255, 255, 255, 255, },
- {234, 250, 241, 250, 253, 255, 253, 254, 255, 255, 255, },
- },
- {
- {255, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
- {223, 254, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
- {238, 253, 254, 254, 255, 255, 255, 255, 255, 255, 255, },
- },
- {
- {255, 248, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
- {249, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
- {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
- },
- {
- {255, 253, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
- {247, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
- {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
- },
- {
- {255, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
- {252, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
- {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
- },
- {
- {255, 254, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
- {253, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
- {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
- },
- {
- {255, 254, 253, 255, 255, 255, 255, 255, 255, 255, 255, },
- {250, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
- {254, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
- },
- {
- {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
- {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
- {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
- },
- },
- {
- {
- {186, 251, 250, 255, 255, 255, 255, 255, 255, 255, 255, },
- {234, 251, 244, 254, 255, 255, 255, 255, 255, 255, 255, },
- {251, 251, 243, 253, 254, 255, 254, 255, 255, 255, 255, },
- },
- {
- {255, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
- {236, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
- {251, 253, 253, 254, 254, 255, 255, 255, 255, 255, 255, },
- },
- {
- {255, 254, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
- {254, 254, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
- {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
- },
- {
- {255, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
- {254, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
- {254, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
- },
- {
- {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
- {254, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
- {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
- },
- {
- {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
- {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
- {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
- },
- {
- {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
- {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
- {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
- },
- {
- {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
- {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
- {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
- },
- },
- {
- {
- {248, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
- {250, 254, 252, 254, 255, 255, 255, 255, 255, 255, 255, },
- {248, 254, 249, 253, 255, 255, 255, 255, 255, 255, 255, },
- },
- {
- {255, 253, 253, 255, 255, 255, 255, 255, 255, 255, 255, },
- {246, 253, 253, 255, 255, 255, 255, 255, 255, 255, 255, },
- {252, 254, 251, 254, 254, 255, 255, 255, 255, 255, 255, },
- },
- {
- {255, 254, 252, 255, 255, 255, 255, 255, 255, 255, 255, },
- {248, 254, 253, 255, 255, 255, 255, 255, 255, 255, 255, },
- {253, 255, 254, 254, 255, 255, 255, 255, 255, 255, 255, },
- },
- {
- {255, 251, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
- {245, 251, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
- {253, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
- },
- {
- {255, 251, 253, 255, 255, 255, 255, 255, 255, 255, 255, },
- {252, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
- {255, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
- },
- {
- {255, 252, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
- {249, 255, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
- {255, 255, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
- },
- {
- {255, 255, 253, 255, 255, 255, 255, 255, 255, 255, 255, },
- {250, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
- {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
- },
- {
- {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
- {254, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
- {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
- },
- },
-};
-#endif
--- a/vp8/common/debugmodes.c
+++ /dev/null
@@ -1,146 +1,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include <stdio.h>
-#include "blockd.h"
-
-void vp9_print_modes_and_motion_vectors(MODE_INFO *mi, int rows, int cols,
- int frame) {
- int mb_row;
- int mb_col;
- int mb_index = 0;
- FILE *mvs = fopen("mvs.stt", "a");
-
- /* print out the macroblock Y modes */
- mb_index = 0;
- fprintf(mvs, "Mb Modes for Frame %d\n", frame);
-
- for (mb_row = 0; mb_row < rows; mb_row++) {
- for (mb_col = 0; mb_col < cols; mb_col++) {
-
- fprintf(mvs, "%2d ", mi[mb_index].mbmi.mode);
-
- mb_index++;
- }
-
- fprintf(mvs, "\n");
- mb_index++;
- }
-
- fprintf(mvs, "\n");
-
- mb_index = 0;
- fprintf(mvs, "Mb mv ref for Frame %d\n", frame);
-
- for (mb_row = 0; mb_row < rows; mb_row++) {
- for (mb_col = 0; mb_col < cols; mb_col++) {
-
- fprintf(mvs, "%2d ", mi[mb_index].mbmi.ref_frame);
-
- mb_index++;
- }
-
- fprintf(mvs, "\n");
- mb_index++;
- }
-
- fprintf(mvs, "\n");
-
- /* print out the macroblock UV modes */
- mb_index = 0;
- fprintf(mvs, "UV Modes for Frame %d\n", frame);
-
- for (mb_row = 0; mb_row < rows; mb_row++) {
- for (mb_col = 0; mb_col < cols; mb_col++) {
-
- fprintf(mvs, "%2d ", mi[mb_index].mbmi.uv_mode);
-
- mb_index++;
- }
-
- mb_index++;
- fprintf(mvs, "\n");
- }
-
- fprintf(mvs, "\n");
-
- /* print out the block modes */
- mb_index = 0;
- fprintf(mvs, "Mbs for Frame %d\n", frame);
- {
- int b_row;
-
- for (b_row = 0; b_row < 4 * rows; b_row++) {
- int b_col;
- int bindex;
-
- for (b_col = 0; b_col < 4 * cols; b_col++) {
- mb_index = (b_row >> 2) * (cols + 1) + (b_col >> 2);
- bindex = (b_row & 3) * 4 + (b_col & 3);
-
- if (mi[mb_index].mbmi.mode == B_PRED) {
- fprintf(mvs, "%2d ", mi[mb_index].bmi[bindex].as_mode.first);
-#if CONFIG_COMP_INTRA_PRED
- fprintf(mvs, "%2d ", mi[mb_index].bmi[bindex].as_mode.second);
-#endif
- } else
- fprintf(mvs, "xx ");
-
- }
-
- fprintf(mvs, "\n");
- }
- }
- fprintf(mvs, "\n");
-
- /* print out the macroblock mvs */
- mb_index = 0;
- fprintf(mvs, "MVs for Frame %d\n", frame);
-
- for (mb_row = 0; mb_row < rows; mb_row++) {
- for (mb_col = 0; mb_col < cols; mb_col++) {
- fprintf(mvs, "%5d:%-5d", mi[mb_index].mbmi.mv[0].as_mv.row / 2,
- mi[mb_index].mbmi.mv[0].as_mv.col / 2);
-
- mb_index++;
- }
-
- mb_index++;
- fprintf(mvs, "\n");
- }
-
- fprintf(mvs, "\n");
-
- /* print out the block modes */
- mb_index = 0;
- fprintf(mvs, "MVs for Frame %d\n", frame);
- {
- int b_row;
-
- for (b_row = 0; b_row < 4 * rows; b_row++) {
- int b_col;
- int bindex;
-
- for (b_col = 0; b_col < 4 * cols; b_col++) {
- mb_index = (b_row >> 2) * (cols + 1) + (b_col >> 2);
- bindex = (b_row & 3) * 4 + (b_col & 3);
- fprintf(mvs, "%3d:%-3d ",
- mi[mb_index].bmi[bindex].as_mv.first.as_mv.row,
- mi[mb_index].bmi[bindex].as_mv.first.as_mv.col);
-
- }
-
- fprintf(mvs, "\n");
- }
- }
- fprintf(mvs, "\n");
-
- fclose(mvs);
-}
--- a/vp8/common/default_coef_probs.h
+++ /dev/null
@@ -1,1377 +1,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
-*/
-
-
-/* Generated file, included by entropy.c */
-
-
-static const vp9_prob default_coef_probs [BLOCK_TYPES]
- [COEF_BANDS]
- [PREV_COEF_CONTEXTS]
- [ENTROPY_NODES] = {
- {
- /* Block Type ( 0 ) */
- {
- /* Coeff Band ( 0 )*/
- { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
- { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
- { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
- { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
- },
- {
- /* Coeff Band ( 1 )*/
- { 253, 136, 254, 255, 228, 219, 128, 128, 128, 128, 128 },
- { 189, 129, 242, 255, 227, 213, 255, 219, 128, 128, 128 },
- { 106, 126, 227, 252, 214, 209, 255, 255, 128, 128, 128 },
- { 90, 116, 227, 252, 214, 209, 255, 255, 128, 128, 128 },
- },
- {
- /* Coeff Band ( 2 )*/
- { 1, 98, 248, 255, 236, 226, 255, 255, 128, 128, 128 },
- { 181, 133, 238, 254, 221, 234, 255, 154, 128, 128, 128 },
- { 78, 134, 202, 247, 198, 180, 255, 219, 128, 128, 128 },
- { 64, 128, 202, 247, 198, 180, 255, 219, 128, 128, 128 },
- },
- {
- /* Coeff Band ( 3 )*/
- { 1, 185, 249, 255, 243, 255, 128, 128, 128, 128, 128 },
- { 184, 150, 247, 255, 236, 224, 128, 128, 128, 128, 128 },
- { 77, 110, 216, 255, 236, 230, 128, 128, 128, 128, 128 },
- { 64, 100, 216, 255, 236, 230, 128, 128, 128, 128, 128 },
- },
- {
- /* Coeff Band ( 4 )*/
- { 1, 101, 251, 255, 241, 255, 128, 128, 128, 128, 128 },
- { 170, 139, 241, 252, 236, 209, 255, 255, 128, 128, 128 },
- { 37, 116, 196, 243, 228, 255, 255, 255, 128, 128, 128 },
- { 28, 110, 196, 243, 228, 255, 255, 255, 128, 128, 128 },
- },
- {
- /* Coeff Band ( 5 )*/
- { 1, 204, 254, 255, 245, 255, 128, 128, 128, 128, 128 },
- { 207, 160, 250, 255, 238, 128, 128, 128, 128, 128, 128 },
- { 102, 103, 231, 255, 211, 171, 128, 128, 128, 128, 128 },
- { 90, 90, 231, 255, 211, 171, 128, 128, 128, 128, 128 },
- },
- {
- /* Coeff Band ( 6 )*/
- { 1, 152, 252, 255, 240, 255, 128, 128, 128, 128, 128 },
- { 177, 135, 243, 255, 234, 225, 128, 128, 128, 128, 128 },
- { 80, 129, 211, 255, 194, 224, 128, 128, 128, 128, 128 },
- { 64, 120, 211, 255, 194, 224, 128, 128, 128, 128, 128 },
- },
- {
- /* Coeff Band ( 7 )*/
- { 1, 1, 255, 128, 128, 128, 128, 128, 128, 128, 128 },
- { 246, 1, 255, 128, 128, 128, 128, 128, 128, 128, 128 },
- { 255, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
- { 255, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
- }
- },
- {
- /* Block Type ( 1 ) */
- {
- /* Coeff Band ( 0 )*/
- { 198, 35, 237, 223, 193, 187, 162, 160, 145, 155, 62 },
- { 131, 45, 198, 221, 172, 176, 220, 157, 252, 221, 1 },
- { 68, 47, 146, 208, 149, 167, 221, 162, 255, 223, 128 },
- { 48, 32, 146, 208, 149, 167, 221, 162, 255, 223, 128 },
- },
- {
- /* Coeff Band ( 1 )*/
- { 1, 149, 241, 255, 221, 224, 255, 255, 128, 128, 128 },
- { 184, 141, 234, 253, 222, 220, 255, 199, 128, 128, 128 },
- { 81, 99, 181, 242, 176, 190, 249, 202, 255, 255, 128 },
- { 66, 90, 181, 242, 176, 190, 249, 202, 255, 255, 128 },
- },
- {
- /* Coeff Band ( 2 )*/
- { 1, 129, 232, 253, 214, 197, 242, 196, 255, 255, 128 },
- { 99, 121, 210, 250, 201, 198, 255, 202, 128, 128, 128 },
- { 23, 91, 163, 242, 170, 187, 247, 210, 255, 255, 128 },
- { 18, 80, 163, 242, 170, 187, 247, 210, 255, 255, 128 },
- },
- {
- /* Coeff Band ( 3 )*/
- { 1, 200, 246, 255, 234, 255, 128, 128, 128, 128, 128 },
- { 109, 178, 241, 255, 231, 245, 255, 255, 128, 128, 128 },
- { 44, 130, 201, 253, 205, 192, 255, 255, 128, 128, 128 },
- { 36, 120, 201, 253, 205, 192, 255, 255, 128, 128, 128 },
- },
- {
- /* Coeff Band ( 4 )*/
- { 1, 132, 239, 251, 219, 209, 255, 165, 128, 128, 128 },
- { 94, 136, 225, 251, 218, 190, 255, 255, 128, 128, 128 },
- { 22, 100, 174, 245, 186, 161, 255, 199, 128, 128, 128 },
- { 18, 90, 174, 245, 186, 161, 255, 199, 128, 128, 128 },
- },
- {
- /* Coeff Band ( 5 )*/
- { 1, 182, 249, 255, 232, 235, 128, 128, 128, 128, 128 },
- { 124, 143, 241, 255, 227, 234, 128, 128, 128, 128, 128 },
- { 35, 77, 181, 251, 193, 211, 255, 205, 128, 128, 128 },
- { 28, 70, 181, 251, 193, 211, 255, 205, 128, 128, 128 },
- },
- {
- /* Coeff Band ( 6 )*/
- { 1, 157, 247, 255, 236, 231, 255, 255, 128, 128, 128 },
- { 121, 141, 235, 255, 225, 227, 255, 255, 128, 128, 128 },
- { 45, 99, 188, 251, 195, 217, 255, 224, 128, 128, 128 },
- { 40, 90, 188, 251, 195, 217, 255, 224, 128, 128, 128 },
- },
- {
- /* Coeff Band ( 7 )*/
- { 1, 1, 251, 255, 213, 255, 128, 128, 128, 128, 128 },
- { 203, 1, 248, 255, 255, 128, 128, 128, 128, 128, 128 },
- { 137, 1, 177, 255, 224, 255, 128, 128, 128, 128, 128 },
- { 137, 1, 177, 255, 224, 255, 128, 128, 128, 128, 128 },
- }
- },
- {
- /* Block Type ( 2 ) */
- {
- /* Coeff Band ( 0 )*/
- { 253, 9, 248, 251, 207, 208, 255, 192, 128, 128, 128 },
- { 175, 13, 224, 243, 193, 185, 249, 198, 255, 255, 128 },
- { 73, 17, 171, 221, 161, 179, 236, 167, 255, 234, 128 },
- { 64, 17, 171, 221, 161, 179, 236, 167, 255, 234, 128 },
- },
- {
- /* Coeff Band ( 1 )*/
- { 1, 95, 247, 253, 212, 183, 255, 255, 128, 128, 128 },
- { 239, 90, 244, 250, 211, 209, 255, 255, 128, 128, 128 },
- { 155, 77, 195, 248, 188, 195, 255, 255, 128, 128, 128 },
- { 140, 70, 195, 248, 188, 195, 255, 255, 128, 128, 128 },
- },
- {
- /* Coeff Band ( 2 )*/
- { 1, 24, 239, 251, 218, 219, 255, 205, 128, 128, 128 },
- { 201, 51, 219, 255, 196, 186, 128, 128, 128, 128, 128 },
- { 69, 46, 190, 239, 201, 218, 255, 228, 128, 128, 128 },
- { 60, 40, 190, 239, 201, 218, 255, 228, 128, 128, 128 },
- },
- {
- /* Coeff Band ( 3 )*/
- { 1, 191, 251, 255, 255, 128, 128, 128, 128, 128, 128 },
- { 223, 165, 249, 255, 213, 255, 128, 128, 128, 128, 128 },
- { 141, 124, 248, 255, 255, 128, 128, 128, 128, 128, 128 },
- { 132, 118, 248, 255, 255, 128, 128, 128, 128, 128, 128 },
- },
- {
- /* Coeff Band ( 4 )*/
- { 1, 16, 248, 255, 255, 128, 128, 128, 128, 128, 128 },
- { 190, 36, 230, 255, 236, 255, 128, 128, 128, 128, 128 },
- { 149, 1, 255, 128, 128, 128, 128, 128, 128, 128, 128 },
- { 149, 1, 255, 128, 128, 128, 128, 128, 128, 128, 128 },
- },
- {
- /* Coeff Band ( 5 )*/
- { 1, 226, 255, 128, 128, 128, 128, 128, 128, 128, 128 },
- { 247, 192, 255, 128, 128, 128, 128, 128, 128, 128, 128 },
- { 240, 128, 255, 128, 128, 128, 128, 128, 128, 128, 128 },
- { 240, 128, 255, 128, 128, 128, 128, 128, 128, 128, 128 },
- },
- {
- /* Coeff Band ( 6 )*/
- { 1, 134, 252, 255, 255, 128, 128, 128, 128, 128, 128 },
- { 213, 62, 250, 255, 255, 128, 128, 128, 128, 128, 128 },
- { 55, 93, 255, 128, 128, 128, 128, 128, 128, 128, 128 },
- { 48, 85, 255, 128, 128, 128, 128, 128, 128, 128, 128 },
- },
- {
- /* Coeff Band ( 7 )*/
- { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
- { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
- { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
- { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
- }
- },
- {
- /* Block Type ( 3 ) */
- {
- /* Coeff Band ( 0 )*/
- { 202, 24, 213, 235, 186, 191, 220, 160, 240, 175, 255 },
- { 126, 38, 182, 232, 169, 184, 228, 174, 255, 187, 128 },
- { 63, 48, 138, 219, 151, 178, 240, 170, 255, 216, 128 },
- { 54, 40, 138, 219, 151, 178, 240, 170, 255, 216, 128 },
- },
- {
- /* Coeff Band ( 1 )*/
- { 1, 112, 230, 250, 199, 191, 247, 159, 255, 255, 128 },
- { 166, 109, 228, 252, 211, 215, 255, 174, 128, 128, 128 },
- { 44, 84, 162, 232, 172, 180, 245, 178, 255, 255, 128 },
- { 32, 70, 162, 232, 172, 180, 245, 178, 255, 255, 128 },
- },
- {
- /* Coeff Band ( 2 )*/
- { 1, 52, 220, 246, 198, 199, 249, 220, 255, 255, 128 },
- { 124, 74, 191, 243, 183, 193, 250, 221, 255, 255, 128 },
- { 24, 71, 130, 219, 154, 170, 243, 182, 255, 255, 128 },
- { 24, 71, 130, 219, 154, 170, 243, 182, 255, 255, 128 },
- },
- {
- /* Coeff Band ( 3 )*/
- { 1, 182, 225, 249, 219, 240, 255, 224, 128, 128, 128 },
- { 149, 150, 226, 252, 216, 205, 255, 171, 128, 128, 128 },
- { 28, 108, 170, 242, 183, 194, 254, 223, 255, 255, 128 },
- { 26, 104, 170, 242, 183, 194, 254, 223, 255, 255, 128 },
- },
- {
- /* Coeff Band ( 4 )*/
- { 1, 81, 230, 252, 204, 203, 255, 192, 128, 128, 128 },
- { 123, 102, 209, 247, 188, 196, 255, 233, 128, 128, 128 },
- { 20, 95, 153, 243, 164, 173, 255, 203, 128, 128, 128 },
- { 20, 95, 153, 243, 164, 173, 255, 203, 128, 128, 128 },
- },
- {
- /* Coeff Band ( 5 )*/
- { 1, 222, 248, 255, 216, 213, 128, 128, 128, 128, 128 },
- { 168, 175, 246, 252, 235, 205, 255, 255, 128, 128, 128 },
- { 47, 116, 215, 255, 211, 212, 255, 255, 128, 128, 128 },
- { 47, 116, 215, 255, 211, 212, 255, 255, 128, 128, 128 },
- },
- {
- /* Coeff Band ( 6 )*/
- { 1, 121, 236, 253, 212, 214, 255, 255, 128, 128, 128 },
- { 141, 84, 213, 252, 201, 202, 255, 219, 128, 128, 128 },
- { 42, 80, 160, 240, 162, 185, 255, 205, 128, 128, 128 },
- { 42, 80, 160, 240, 162, 185, 255, 205, 128, 128, 128 },
- },
- {
- /* Coeff Band ( 7 )*/
- { 1, 1, 255, 128, 128, 128, 128, 128, 128, 128, 128 },
- { 244, 1, 255, 128, 128, 128, 128, 128, 128, 128, 128 },
- { 238, 1, 255, 128, 128, 128, 128, 128, 128, 128, 128 },
- { 238, 1, 255, 128, 128, 128, 128, 128, 128, 128, 128 },
- }
- }
-};
-
-static const vp9_prob default_hybrid_coef_probs [BLOCK_TYPES]
- [COEF_BANDS]
- [PREV_COEF_CONTEXTS]
- [ENTROPY_NODES] = {
- {
- /* Block Type ( 0 ) */
- {
- /* Coeff Band ( 0 )*/
- { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
- { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
- { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
- { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
- },
- {
- /* Coeff Band ( 1 )*/
- { 253, 136, 254, 255, 228, 219, 128, 128, 128, 128, 128 },
- { 189, 129, 242, 255, 227, 213, 255, 219, 128, 128, 128 },
- { 106, 126, 227, 252, 214, 209, 255, 255, 128, 128, 128 },
- { 90, 116, 227, 252, 214, 209, 255, 255, 128, 128, 128 },
- },
- {
- /* Coeff Band ( 2 )*/
- { 1, 98, 248, 255, 236, 226, 255, 255, 128, 128, 128 },
- { 181, 133, 238, 254, 221, 234, 255, 154, 128, 128, 128 },
- { 78, 134, 202, 247, 198, 180, 255, 219, 128, 128, 128 },
- { 64, 128, 202, 247, 198, 180, 255, 219, 128, 128, 128 },
- },
- {
- /* Coeff Band ( 3 )*/
- { 1, 185, 249, 255, 243, 255, 128, 128, 128, 128, 128 },
- { 184, 150, 247, 255, 236, 224, 128, 128, 128, 128, 128 },
- { 77, 110, 216, 255, 236, 230, 128, 128, 128, 128, 128 },
- { 64, 100, 216, 255, 236, 230, 128, 128, 128, 128, 128 },
- },
- {
- /* Coeff Band ( 4 )*/
- { 1, 101, 251, 255, 241, 255, 128, 128, 128, 128, 128 },
- { 170, 139, 241, 252, 236, 209, 255, 255, 128, 128, 128 },
- { 37, 116, 196, 243, 228, 255, 255, 255, 128, 128, 128 },
- { 28, 110, 196, 243, 228, 255, 255, 255, 128, 128, 128 },
- },
- {
- /* Coeff Band ( 5 )*/
- { 1, 204, 254, 255, 245, 255, 128, 128, 128, 128, 128 },
- { 207, 160, 250, 255, 238, 128, 128, 128, 128, 128, 128 },
- { 102, 103, 231, 255, 211, 171, 128, 128, 128, 128, 128 },
- { 90, 90, 231, 255, 211, 171, 128, 128, 128, 128, 128 },
- },
- {
- /* Coeff Band ( 6 )*/
- { 1, 152, 252, 255, 240, 255, 128, 128, 128, 128, 128 },
- { 177, 135, 243, 255, 234, 225, 128, 128, 128, 128, 128 },
- { 80, 129, 211, 255, 194, 224, 128, 128, 128, 128, 128 },
- { 64, 120, 211, 255, 194, 224, 128, 128, 128, 128, 128 },
- },
- {
- /* Coeff Band ( 7 )*/
- { 1, 1, 255, 128, 128, 128, 128, 128, 128, 128, 128 },
- { 246, 1, 255, 128, 128, 128, 128, 128, 128, 128, 128 },
- { 255, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
- { 255, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
- }
- },
- {
- /* Block Type ( 1 ) */
- {
- /* Coeff Band ( 0 )*/
- { 198, 35, 237, 223, 193, 187, 162, 160, 145, 155, 62 },
- { 131, 45, 198, 221, 172, 176, 220, 157, 252, 221, 1 },
- { 68, 47, 146, 208, 149, 167, 221, 162, 255, 223, 128 },
- { 48, 32, 146, 208, 149, 167, 221, 162, 255, 223, 128 },
- },
- {
- /* Coeff Band ( 1 )*/
- { 1, 149, 241, 255, 221, 224, 255, 255, 128, 128, 128 },
- { 184, 141, 234, 253, 222, 220, 255, 199, 128, 128, 128 },
- { 81, 99, 181, 242, 176, 190, 249, 202, 255, 255, 128 },
- { 66, 90, 181, 242, 176, 190, 249, 202, 255, 255, 128 },
- },
- {
- /* Coeff Band ( 2 )*/
- { 1, 129, 232, 253, 214, 197, 242, 196, 255, 255, 128 },
- { 99, 121, 210, 250, 201, 198, 255, 202, 128, 128, 128 },
- { 23, 91, 163, 242, 170, 187, 247, 210, 255, 255, 128 },
- { 18, 80, 163, 242, 170, 187, 247, 210, 255, 255, 128 },
- },
- {
- /* Coeff Band ( 3 )*/
- { 1, 200, 246, 255, 234, 255, 128, 128, 128, 128, 128 },
- { 109, 178, 241, 255, 231, 245, 255, 255, 128, 128, 128 },
- { 44, 130, 201, 253, 205, 192, 255, 255, 128, 128, 128 },
- { 36, 120, 201, 253, 205, 192, 255, 255, 128, 128, 128 },
- },
- {
- /* Coeff Band ( 4 )*/
- { 1, 132, 239, 251, 219, 209, 255, 165, 128, 128, 128 },
- { 94, 136, 225, 251, 218, 190, 255, 255, 128, 128, 128 },
- { 22, 100, 174, 245, 186, 161, 255, 199, 128, 128, 128 },
- { 18, 90, 174, 245, 186, 161, 255, 199, 128, 128, 128 },
- },
- {
- /* Coeff Band ( 5 )*/
- { 1, 182, 249, 255, 232, 235, 128, 128, 128, 128, 128 },
- { 124, 143, 241, 255, 227, 234, 128, 128, 128, 128, 128 },
- { 35, 77, 181, 251, 193, 211, 255, 205, 128, 128, 128 },
- { 28, 70, 181, 251, 193, 211, 255, 205, 128, 128, 128 },
- },
- {
- /* Coeff Band ( 6 )*/
- { 1, 157, 247, 255, 236, 231, 255, 255, 128, 128, 128 },
- { 121, 141, 235, 255, 225, 227, 255, 255, 128, 128, 128 },
- { 45, 99, 188, 251, 195, 217, 255, 224, 128, 128, 128 },
- { 40, 90, 188, 251, 195, 217, 255, 224, 128, 128, 128 },
- },
- {
- /* Coeff Band ( 7 )*/
- { 1, 1, 251, 255, 213, 255, 128, 128, 128, 128, 128 },
- { 203, 1, 248, 255, 255, 128, 128, 128, 128, 128, 128 },
- { 137, 1, 177, 255, 224, 255, 128, 128, 128, 128, 128 },
- { 137, 1, 177, 255, 224, 255, 128, 128, 128, 128, 128 },
- }
- },
- {
- /* Block Type ( 2 ) */
- {
- /* Coeff Band ( 0 )*/
- { 253, 9, 248, 251, 207, 208, 255, 192, 128, 128, 128 },
- { 175, 13, 224, 243, 193, 185, 249, 198, 255, 255, 128 },
- { 73, 17, 171, 221, 161, 179, 236, 167, 255, 234, 128 },
- { 64, 17, 171, 221, 161, 179, 236, 167, 255, 234, 128 },
- },
- {
- /* Coeff Band ( 1 )*/
- { 1, 95, 247, 253, 212, 183, 255, 255, 128, 128, 128 },
- { 239, 90, 244, 250, 211, 209, 255, 255, 128, 128, 128 },
- { 155, 77, 195, 248, 188, 195, 255, 255, 128, 128, 128 },
- { 140, 70, 195, 248, 188, 195, 255, 255, 128, 128, 128 },
- },
- {
- /* Coeff Band ( 2 )*/
- { 1, 24, 239, 251, 218, 219, 255, 205, 128, 128, 128 },
- { 201, 51, 219, 255, 196, 186, 128, 128, 128, 128, 128 },
- { 69, 46, 190, 239, 201, 218, 255, 228, 128, 128, 128 },
- { 60, 40, 190, 239, 201, 218, 255, 228, 128, 128, 128 },
- },
- {
- /* Coeff Band ( 3 )*/
- { 1, 191, 251, 255, 255, 128, 128, 128, 128, 128, 128 },
- { 223, 165, 249, 255, 213, 255, 128, 128, 128, 128, 128 },
- { 141, 124, 248, 255, 255, 128, 128, 128, 128, 128, 128 },
- { 132, 118, 248, 255, 255, 128, 128, 128, 128, 128, 128 },
- },
- {
- /* Coeff Band ( 4 )*/
- { 1, 16, 248, 255, 255, 128, 128, 128, 128, 128, 128 },
- { 190, 36, 230, 255, 236, 255, 128, 128, 128, 128, 128 },
- { 149, 1, 255, 128, 128, 128, 128, 128, 128, 128, 128 },
- { 149, 1, 255, 128, 128, 128, 128, 128, 128, 128, 128 },
- },
- {
- /* Coeff Band ( 5 )*/
- { 1, 226, 255, 128, 128, 128, 128, 128, 128, 128, 128 },
- { 247, 192, 255, 128, 128, 128, 128, 128, 128, 128, 128 },
- { 240, 128, 255, 128, 128, 128, 128, 128, 128, 128, 128 },
- { 240, 128, 255, 128, 128, 128, 128, 128, 128, 128, 128 },
- },
- {
- /* Coeff Band ( 6 )*/
- { 1, 134, 252, 255, 255, 128, 128, 128, 128, 128, 128 },
- { 213, 62, 250, 255, 255, 128, 128, 128, 128, 128, 128 },
- { 55, 93, 255, 128, 128, 128, 128, 128, 128, 128, 128 },
- { 48, 85, 255, 128, 128, 128, 128, 128, 128, 128, 128 },
- },
- {
- /* Coeff Band ( 7 )*/
- { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
- { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
- { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
- { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
- }
- },
- {
- /* Block Type ( 3 ) */
- {
- /* Coeff Band ( 0 )*/
- { 202, 24, 213, 235, 186, 191, 220, 160, 240, 175, 255 },
- { 126, 38, 182, 232, 169, 184, 228, 174, 255, 187, 128 },
- { 63, 48, 138, 219, 151, 178, 240, 170, 255, 216, 128 },
- { 54, 40, 138, 219, 151, 178, 240, 170, 255, 216, 128 },
- },
- {
- /* Coeff Band ( 1 )*/
- { 1, 112, 230, 250, 199, 191, 247, 159, 255, 255, 128 },
- { 166, 109, 228, 252, 211, 215, 255, 174, 128, 128, 128 },
- { 44, 84, 162, 232, 172, 180, 245, 178, 255, 255, 128 },
- { 32, 70, 162, 232, 172, 180, 245, 178, 255, 255, 128 },
- },
- {
- /* Coeff Band ( 2 )*/
- { 1, 52, 220, 246, 198, 199, 249, 220, 255, 255, 128 },
- { 124, 74, 191, 243, 183, 193, 250, 221, 255, 255, 128 },
- { 24, 71, 130, 219, 154, 170, 243, 182, 255, 255, 128 },
- { 24, 71, 130, 219, 154, 170, 243, 182, 255, 255, 128 },
- },
- {
- /* Coeff Band ( 3 )*/
- { 1, 182, 225, 249, 219, 240, 255, 224, 128, 128, 128 },
- { 149, 150, 226, 252, 216, 205, 255, 171, 128, 128, 128 },
- { 28, 108, 170, 242, 183, 194, 254, 223, 255, 255, 128 },
- { 26, 104, 170, 242, 183, 194, 254, 223, 255, 255, 128 },
- },
- {
- /* Coeff Band ( 4 )*/
- { 1, 81, 230, 252, 204, 203, 255, 192, 128, 128, 128 },
- { 123, 102, 209, 247, 188, 196, 255, 233, 128, 128, 128 },
- { 20, 95, 153, 243, 164, 173, 255, 203, 128, 128, 128 },
- { 20, 95, 153, 243, 164, 173, 255, 203, 128, 128, 128 },
- },
- {
- /* Coeff Band ( 5 )*/
- { 1, 222, 248, 255, 216, 213, 128, 128, 128, 128, 128 },
- { 168, 175, 246, 252, 235, 205, 255, 255, 128, 128, 128 },
- { 47, 116, 215, 255, 211, 212, 255, 255, 128, 128, 128 },
- { 47, 116, 215, 255, 211, 212, 255, 255, 128, 128, 128 },
- },
- {
- /* Coeff Band ( 6 )*/
- { 1, 121, 236, 253, 212, 214, 255, 255, 128, 128, 128 },
- { 141, 84, 213, 252, 201, 202, 255, 219, 128, 128, 128 },
- { 42, 80, 160, 240, 162, 185, 255, 205, 128, 128, 128 },
- { 42, 80, 160, 240, 162, 185, 255, 205, 128, 128, 128 },
- },
- {
- /* Coeff Band ( 7 )*/
- { 1, 1, 255, 128, 128, 128, 128, 128, 128, 128, 128 },
- { 244, 1, 255, 128, 128, 128, 128, 128, 128, 128, 128 },
- { 238, 1, 255, 128, 128, 128, 128, 128, 128, 128, 128 },
- { 238, 1, 255, 128, 128, 128, 128, 128, 128, 128, 128 },
- }
- }
-};
-
-static const vp9_prob
-default_coef_probs_8x8[BLOCK_TYPES_8X8]
-[COEF_BANDS]
-[PREV_COEF_CONTEXTS]
-[ENTROPY_NODES] = {
- {
- /* block Type 0 */
- {
- /* Coeff Band 0 */
- { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128},
- { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128},
- { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128},
- { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128}
- },
- {
- /* Coeff Band 1 */
- { 60, 140, 195, 255, 212, 214, 128, 128, 128, 128, 128},
- { 75, 221, 231, 255, 203, 255, 128, 128, 128, 128, 128},
- { 9, 212, 196, 251, 197, 207, 255, 185, 128, 128, 128},
- { 9, 212, 196, 251, 197, 207, 255, 185, 128, 128, 128}
- },
- {
- /* Coeff Band 2 */
- { 1, 227, 226, 255, 215, 215, 128, 128, 128, 128, 128},
- { 5, 163, 209, 255, 212, 212, 255, 255, 128, 128, 128},
- { 1, 133, 203, 255, 210, 220, 255, 255, 128, 128, 128},
- { 1, 133, 203, 255, 210, 220, 255, 255, 128, 128, 128}
- },
- {
- /* Coeff Band 3 */
- { 1, 226, 225, 255, 228, 236, 128, 128, 128, 128, 128},
- { 6, 163, 208, 255, 224, 234, 255, 255, 128, 128, 128},
- { 1, 122, 196, 253, 212, 248, 255, 255, 128, 128, 128},
- { 1, 122, 196, 253, 212, 248, 255, 255, 128, 128, 128}
- },
- {
- /* Coeff Band 4 */
- { 1, 222, 197, 254, 193, 216, 255, 236, 128, 128, 128},
- { 7, 140, 163, 251, 195, 211, 255, 238, 128, 128, 128},
- { 1, 91, 152, 249, 181, 197, 255, 239, 128, 128, 128},
- { 1, 91, 152, 249, 181, 197, 255, 239, 128, 128, 128}
- },
- {
- /* Coeff Band 5 */
- { 1, 226, 218, 255, 216, 241, 255, 255, 128, 128, 128},
- { 6, 154, 191, 255, 218, 240, 255, 255, 128, 128, 128},
- { 1, 110, 171, 252, 191, 204, 255, 236, 128, 128, 128},
- { 1, 110, 171, 252, 191, 204, 255, 236, 128, 128, 128}
- },
- {
- /* Coeff Band 6 */
- { 1, 221, 217, 255, 208, 217, 255, 232, 128, 128, 128},
- { 11, 155, 189, 254, 203, 211, 255, 249, 128, 128, 128},
- { 1, 110, 171, 252, 191, 204, 255, 236, 128, 128, 128},
- { 1, 110, 171, 252, 191, 204, 255, 236, 128, 128, 128}
- },
- {
- /* Coeff Band 7 */
- { 1, 207, 235, 255, 232, 240, 128, 128, 128, 128, 128},
- { 58, 161, 216, 255, 229, 235, 255, 255, 128, 128, 128},
- { 8, 133, 204, 255, 219, 231, 255, 255, 128, 128, 128},
- { 8, 133, 204, 255, 219, 231, 255, 255, 128, 128, 128}
- }
- },
- {
- /* block Type 1 */
- {
- /* Coeff Band 0 */
- { 134, 152, 233, 224, 234, 52, 255, 166, 128, 128, 128},
- { 97, 132, 185, 234, 186, 189, 197, 171, 255, 212, 128},
- { 84, 110, 185, 237, 182, 182, 145, 145, 255, 255, 128}
- },
- {
- /* Coeff Band 1 */
- { 1, 124, 213, 247, 192, 212, 255, 255, 128, 128, 128},
- { 88, 111, 178, 254, 189, 211, 255, 255, 128, 128, 128},
- { 12, 59, 129, 236, 150, 179, 239, 195, 255, 255, 128},
- { 12, 59, 129, 236, 150, 179, 239, 195, 255, 255, 128}
- },
- {
- /* Coeff Band 2 */
- { 1, 102, 225, 255, 210, 240, 128, 128, 128, 128, 128},
- { 110, 78, 195, 254, 200, 191, 255, 255, 128, 128, 128},
- { 37, 63, 177, 255, 194, 195, 128, 128, 128, 128, 128},
- { 37, 63, 177, 255, 194, 195, 128, 128, 128, 128, 128}
- },
- {
- /* Coeff Band 3 */
- { 1, 1, 229, 255, 202, 224, 128, 128, 128, 128, 128},
- { 150, 1, 192, 255, 206, 226, 128, 128, 128, 128, 128},
- { 75, 1, 138, 255, 172, 228, 128, 128, 128, 128, 128},
- { 75, 1, 138, 255, 172, 228, 128, 128, 128, 128, 128}
- },
- {
- /* Coeff Band 4 */
- { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128},
- { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128},
- { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128},
- { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128}
- },
- {
- /* Coeff Band 5 */
- { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128},
- { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128},
- { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128},
- { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128}
- },
- {
- /* Coeff Band 6 */
- { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128},
- { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128},
- { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128},
- { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128}
- },
- {
- /* Coeff Band 7 */
- { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128},
- { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128},
- { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128},
- { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128}
- }
- },
- {
- /* block Type 2 */
- {
- /* Coeff Band 0 */
- { 11, 181, 226, 199, 183, 255, 255, 255, 128, 128, 128},
- { 2, 147, 185, 248, 163, 180, 255, 236, 128, 128, 128},
- { 1, 123, 157, 238, 154, 176, 255, 226, 255, 255, 128},
- { 1, 123, 157, 238, 154, 176, 255, 226, 255, 255, 128}
- },
- {
- /* Coeff Band 1 */
- { 1, 150, 191, 246, 174, 188, 255, 235, 128, 128, 128},
- { 1, 125, 166, 245, 165, 185, 255, 234, 128, 128, 128},
- { 1, 79, 125, 240, 148, 179, 255, 234, 255, 255, 128},
- { 1, 79, 125, 240, 148, 179, 255, 234, 255, 255, 128}
- },
- {
- /* Coeff Band 2 */
- { 1, 146, 184, 242, 167, 183, 255, 230, 255, 255, 128},
- { 1, 119, 160, 239, 156, 178, 255, 231, 255, 255, 128},
- { 1, 75, 115, 234, 142, 173, 255, 225, 255, 255, 128},
- { 1, 75, 115, 234, 142, 173, 255, 225, 255, 255, 128}
- },
- {
- /* Coeff Band 3 */
- { 1, 150, 188, 244, 169, 183, 255, 233, 255, 255, 128},
- { 1, 123, 162, 243, 161, 180, 255, 233, 128, 128, 128},
- { 1, 76, 120, 238, 148, 178, 255, 230, 255, 255, 128},
- { 1, 76, 120, 238, 148, 178, 255, 230, 255, 255, 128}
- },
- {
- /* Coeff Band 4 */
- { 1, 163, 202, 252, 188, 204, 255, 248, 128, 128, 128},
- { 1, 136, 180, 251, 181, 201, 255, 246, 128, 128, 128},
- { 1, 92, 146, 249, 170, 197, 255, 245, 128, 128, 128},
- { 1, 92, 146, 249, 170, 197, 255, 245, 128, 128, 128}
- },
- {
- /* Coeff Band 5 */
- { 1, 156, 195, 249, 179, 193, 255, 241, 255, 255, 128},
- { 1, 128, 169, 248, 171, 192, 255, 242, 255, 255, 128},
- { 1, 84, 132, 245, 158, 187, 255, 240, 255, 255, 128},
- { 1, 84, 132, 245, 158, 187, 255, 240, 255, 255, 128}
- },
- {
- /* Coeff Band 6 */
- { 1, 36, 71, 251, 192, 201, 255, 243, 255, 255, 128},
- { 1, 49, 185, 250, 184, 199, 255, 242, 128, 128, 128},
- { 1, 95, 147, 247, 168, 190, 255, 239, 255, 255, 128},
- { 1, 95, 147, 247, 168, 190, 255, 239, 255, 255, 128}
- },
- {
- /* Coeff Band 7 */
- { 1, 19, 98, 255, 218, 222, 255, 255, 128, 128, 128},
- { 36, 50, 210, 255, 212, 221, 255, 255, 128, 128, 128},
- { 6, 117, 180, 254, 199, 216, 255, 251, 128, 128, 128},
- { 6, 117, 180, 254, 199, 216, 255, 251, 128, 128, 128}
- }
- },
- { /* block Type 3 */
- { /* Coeff Band 0 */
- { 192, 18, 155, 172, 145, 164, 192, 135, 246, 223, 255},
- { 94, 29, 97, 131, 131, 153, 171, 121, 250, 190, 255},
- { 25, 29, 63, 128, 119, 147, 168, 124, 251, 183, 255},
- { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128}
- },
- { /* Coeff Band 1 */
- { 1, 108, 192, 220, 186, 173, 255, 194, 255, 255, 128},
- { 123, 104, 188, 221, 165, 171, 247, 180, 255, 255, 128},
- { 23, 76, 152, 216, 154, 166, 226, 182, 255, 209, 128},
- { 1, 26, 52, 162, 109, 152, 208, 144, 255, 231, 128}
- },
- { /* Coeff Band 2 */
- { 1, 57, 179, 220, 156, 175, 210, 158, 255, 223, 128},
- { 48, 57, 134, 212, 151, 170, 219, 185, 255, 248, 128},
- { 4, 35, 63, 189, 120, 156, 221, 159, 255, 241, 128},
- { 1, 17, 23, 110, 97, 143, 187, 120, 255, 234, 128}
- },
- { /* Coeff Band 3 */
- { 1, 115, 205, 243, 182, 187, 254, 218, 255, 255, 128},
- { 80, 101, 186, 241, 183, 186, 249, 182, 255, 255, 128},
- { 10, 81, 144, 229, 164, 175, 241, 185, 255, 255, 128},
- { 1, 44, 81, 192, 130, 148, 240, 180, 255, 255, 128}
- },
- { /* Coeff Band 4 */
- { 1, 161, 207, 249, 187, 176, 255, 180, 128, 128, 128},
- { 79, 148, 196, 240, 186, 182, 253, 171, 255, 255, 128},
- { 14, 111, 171, 233, 170, 178, 235, 204, 255, 255, 128},
- { 1, 63, 103, 202, 143, 162, 240, 178, 255, 255, 128}
- },
- { /* Coeff Band 5 */
- { 1, 101, 202, 239, 185, 184, 252, 186, 255, 255, 128},
- { 43, 67, 166, 237, 178, 190, 246, 194, 255, 255, 128},
- { 4, 49, 85, 220, 140, 168, 253, 182, 255, 255, 128},
- { 1, 24, 35, 144, 93, 135, 239, 159, 255, 253, 128}
- },
- { /* Coeff Band 6 */
- { 1, 212, 243, 255, 240, 234, 255, 255, 128, 128, 128},
- { 98, 168, 234, 255, 229, 234, 255, 255, 128, 128, 128},
- { 19, 127, 199, 255, 212, 198, 255, 255, 128, 128, 128},
- { 1, 103, 162, 253, 186, 151, 255, 255, 128, 128, 128}
- },
- { /* Coeff Band 7 */
- { 1, 188, 253, 255, 255, 128, 128, 128, 128, 128, 128},
- { 191, 68, 242, 255, 255, 128, 128, 128, 128, 128, 128},
- { 8, 132, 255, 128, 128, 128, 128, 128, 128, 128, 128},
- { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128}
- }
- }
-};
-
-static const vp9_prob
-default_hybrid_coef_probs_8x8[BLOCK_TYPES_8X8]
- [COEF_BANDS]
- [PREV_COEF_CONTEXTS]
- [ENTROPY_NODES] = {
- {
- /* block Type 0 */
- {
- /* Coeff Band 0 */
- { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128},
- { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128},
- { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128},
- { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128}
- },
- {
- /* Coeff Band 1 */
- { 60, 140, 195, 255, 212, 214, 128, 128, 128, 128, 128},
- { 75, 221, 231, 255, 203, 255, 128, 128, 128, 128, 128},
- { 9, 212, 196, 251, 197, 207, 255, 185, 128, 128, 128},
- { 9, 212, 196, 251, 197, 207, 255, 185, 128, 128, 128}
- },
- {
- /* Coeff Band 2 */
- { 1, 227, 226, 255, 215, 215, 128, 128, 128, 128, 128},
- { 5, 163, 209, 255, 212, 212, 255, 255, 128, 128, 128},
- { 1, 133, 203, 255, 210, 220, 255, 255, 128, 128, 128},
- { 1, 133, 203, 255, 210, 220, 255, 255, 128, 128, 128}
- },
- {
- /* Coeff Band 3 */
- { 1, 226, 225, 255, 228, 236, 128, 128, 128, 128, 128},
- { 6, 163, 208, 255, 224, 234, 255, 255, 128, 128, 128},
- { 1, 122, 196, 253, 212, 248, 255, 255, 128, 128, 128},
- { 1, 122, 196, 253, 212, 248, 255, 255, 128, 128, 128}
- },
- {
- /* Coeff Band 4 */
- { 1, 222, 197, 254, 193, 216, 255, 236, 128, 128, 128},
- { 7, 140, 163, 251, 195, 211, 255, 238, 128, 128, 128},
- { 1, 91, 152, 249, 181, 197, 255, 239, 128, 128, 128},
- { 1, 91, 152, 249, 181, 197, 255, 239, 128, 128, 128}
- },
- {
- /* Coeff Band 5 */
- { 1, 226, 218, 255, 216, 241, 255, 255, 128, 128, 128},
- { 6, 154, 191, 255, 218, 240, 255, 255, 128, 128, 128},
- { 1, 110, 171, 252, 191, 204, 255, 236, 128, 128, 128},
- { 1, 110, 171, 252, 191, 204, 255, 236, 128, 128, 128}
- },
- {
- /* Coeff Band 6 */
- { 1, 221, 217, 255, 208, 217, 255, 232, 128, 128, 128},
- { 11, 155, 189, 254, 203, 211, 255, 249, 128, 128, 128},
- { 1, 110, 171, 252, 191, 204, 255, 236, 128, 128, 128},
- { 1, 110, 171, 252, 191, 204, 255, 236, 128, 128, 128}
- },
- {
- /* Coeff Band 7 */
- { 1, 207, 235, 255, 232, 240, 128, 128, 128, 128, 128},
- { 58, 161, 216, 255, 229, 235, 255, 255, 128, 128, 128},
- { 8, 133, 204, 255, 219, 231, 255, 255, 128, 128, 128},
- { 8, 133, 204, 255, 219, 231, 255, 255, 128, 128, 128}
- }
- },
- {
- /* block Type 1 */
- {
- /* Coeff Band 0 */
- { 134, 152, 233, 224, 234, 52, 255, 166, 128, 128, 128},
- { 97, 132, 185, 234, 186, 189, 197, 171, 255, 212, 128},
- { 84, 110, 185, 237, 182, 182, 145, 145, 255, 255, 128}
- },
- {
- /* Coeff Band 1 */
- { 1, 124, 213, 247, 192, 212, 255, 255, 128, 128, 128},
- { 88, 111, 178, 254, 189, 211, 255, 255, 128, 128, 128},
- { 12, 59, 129, 236, 150, 179, 239, 195, 255, 255, 128},
- { 12, 59, 129, 236, 150, 179, 239, 195, 255, 255, 128}
- },
- {
- /* Coeff Band 2 */
- { 1, 102, 225, 255, 210, 240, 128, 128, 128, 128, 128},
- { 110, 78, 195, 254, 200, 191, 255, 255, 128, 128, 128},
- { 37, 63, 177, 255, 194, 195, 128, 128, 128, 128, 128},
- { 37, 63, 177, 255, 194, 195, 128, 128, 128, 128, 128}
- },
- {
- /* Coeff Band 3 */
- { 1, 1, 229, 255, 202, 224, 128, 128, 128, 128, 128},
- { 150, 1, 192, 255, 206, 226, 128, 128, 128, 128, 128},
- { 75, 1, 138, 255, 172, 228, 128, 128, 128, 128, 128},
- { 75, 1, 138, 255, 172, 228, 128, 128, 128, 128, 128}
- },
- {
- /* Coeff Band 4 */
- { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128},
- { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128},
- { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128},
- { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128}
- },
- {
- /* Coeff Band 5 */
- { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128},
- { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128},
- { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128},
- { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128}
- },
- {
- /* Coeff Band 6 */
- { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128},
- { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128},
- { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128},
- { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128}
- },
- {
- /* Coeff Band 7 */
- { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128},
- { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128},
- { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128},
- { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128}
- }
- },
- {
- /* block Type 2 */
- {
- /* Coeff Band 0 */
- { 11, 181, 226, 199, 183, 255, 255, 255, 128, 128, 128},
- { 2, 147, 185, 248, 163, 180, 255, 236, 128, 128, 128},
- { 1, 123, 157, 238, 154, 176, 255, 226, 255, 255, 128},
- { 1, 123, 157, 238, 154, 176, 255, 226, 255, 255, 128}
- },
- {
- /* Coeff Band 1 */
- { 1, 150, 191, 246, 174, 188, 255, 235, 128, 128, 128},
- { 1, 125, 166, 245, 165, 185, 255, 234, 128, 128, 128},
- { 1, 79, 125, 240, 148, 179, 255, 234, 255, 255, 128},
- { 1, 79, 125, 240, 148, 179, 255, 234, 255, 255, 128}
- },
- {
- /* Coeff Band 2 */
- { 1, 146, 184, 242, 167, 183, 255, 230, 255, 255, 128},
- { 1, 119, 160, 239, 156, 178, 255, 231, 255, 255, 128},
- { 1, 75, 115, 234, 142, 173, 255, 225, 255, 255, 128},
- { 1, 75, 115, 234, 142, 173, 255, 225, 255, 255, 128}
- },
- {
- /* Coeff Band 3 */
- { 1, 150, 188, 244, 169, 183, 255, 233, 255, 255, 128},
- { 1, 123, 162, 243, 161, 180, 255, 233, 128, 128, 128},
- { 1, 76, 120, 238, 148, 178, 255, 230, 255, 255, 128},
- { 1, 76, 120, 238, 148, 178, 255, 230, 255, 255, 128}
- },
- {
- /* Coeff Band 4 */
- { 1, 163, 202, 252, 188, 204, 255, 248, 128, 128, 128},
- { 1, 136, 180, 251, 181, 201, 255, 246, 128, 128, 128},
- { 1, 92, 146, 249, 170, 197, 255, 245, 128, 128, 128},
- { 1, 92, 146, 249, 170, 197, 255, 245, 128, 128, 128}
- },
- {
- /* Coeff Band 5 */
- { 1, 156, 195, 249, 179, 193, 255, 241, 255, 255, 128},
- { 1, 128, 169, 248, 171, 192, 255, 242, 255, 255, 128},
- { 1, 84, 132, 245, 158, 187, 255, 240, 255, 255, 128},
- { 1, 84, 132, 245, 158, 187, 255, 240, 255, 255, 128}
- },
- {
- /* Coeff Band 6 */
- { 1, 36, 71, 251, 192, 201, 255, 243, 255, 255, 128},
- { 1, 49, 185, 250, 184, 199, 255, 242, 128, 128, 128},
- { 1, 95, 147, 247, 168, 190, 255, 239, 255, 255, 128},
- { 1, 95, 147, 247, 168, 190, 255, 239, 255, 255, 128}
- },
- {
- /* Coeff Band 7 */
- { 1, 19, 98, 255, 218, 222, 255, 255, 128, 128, 128},
- { 36, 50, 210, 255, 212, 221, 255, 255, 128, 128, 128},
- { 6, 117, 180, 254, 199, 216, 255, 251, 128, 128, 128},
- { 6, 117, 180, 254, 199, 216, 255, 251, 128, 128, 128}
- }
- },
- { /* block Type 3 */
- { /* Coeff Band 0 */
- { 192, 18, 155, 172, 145, 164, 192, 135, 246, 223, 255},
- { 94, 29, 97, 131, 131, 153, 171, 121, 250, 190, 255},
- { 25, 29, 63, 128, 119, 147, 168, 124, 251, 183, 255},
- { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128}
- },
- { /* Coeff Band 1 */
- { 1, 108, 192, 220, 186, 173, 255, 194, 255, 255, 128},
- { 123, 104, 188, 221, 165, 171, 247, 180, 255, 255, 128},
- { 23, 76, 152, 216, 154, 166, 226, 182, 255, 209, 128},
- { 1, 26, 52, 162, 109, 152, 208, 144, 255, 231, 128}
- },
- { /* Coeff Band 2 */
- { 1, 57, 179, 220, 156, 175, 210, 158, 255, 223, 128},
- { 48, 57, 134, 212, 151, 170, 219, 185, 255, 248, 128},
- { 4, 35, 63, 189, 120, 156, 221, 159, 255, 241, 128},
- { 1, 17, 23, 110, 97, 143, 187, 120, 255, 234, 128}
- },
- { /* Coeff Band 3 */
- { 1, 115, 205, 243, 182, 187, 254, 218, 255, 255, 128},
- { 80, 101, 186, 241, 183, 186, 249, 182, 255, 255, 128},
- { 10, 81, 144, 229, 164, 175, 241, 185, 255, 255, 128},
- { 1, 44, 81, 192, 130, 148, 240, 180, 255, 255, 128}
- },
- { /* Coeff Band 4 */
- { 1, 161, 207, 249, 187, 176, 255, 180, 128, 128, 128},
- { 79, 148, 196, 240, 186, 182, 253, 171, 255, 255, 128},
- { 14, 111, 171, 233, 170, 178, 235, 204, 255, 255, 128},
- { 1, 63, 103, 202, 143, 162, 240, 178, 255, 255, 128}
- },
- { /* Coeff Band 5 */
- { 1, 101, 202, 239, 185, 184, 252, 186, 255, 255, 128},
- { 43, 67, 166, 237, 178, 190, 246, 194, 255, 255, 128},
- { 4, 49, 85, 220, 140, 168, 253, 182, 255, 255, 128},
- { 1, 24, 35, 144, 93, 135, 239, 159, 255, 253, 128}
- },
- { /* Coeff Band 6 */
- { 1, 212, 243, 255, 240, 234, 255, 255, 128, 128, 128},
- { 98, 168, 234, 255, 229, 234, 255, 255, 128, 128, 128},
- { 19, 127, 199, 255, 212, 198, 255, 255, 128, 128, 128},
- { 1, 103, 162, 253, 186, 151, 255, 255, 128, 128, 128}
- },
- { /* Coeff Band 7 */
- { 1, 188, 253, 255, 255, 128, 128, 128, 128, 128, 128},
- { 191, 68, 242, 255, 255, 128, 128, 128, 128, 128, 128},
- { 8, 132, 255, 128, 128, 128, 128, 128, 128, 128, 128},
- { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128}
- }
- }
-};
-
-static const vp9_prob
- default_coef_probs_16x16[BLOCK_TYPES_16X16]
- [COEF_BANDS]
- [PREV_COEF_CONTEXTS]
- [ENTROPY_NODES] = {
- { /* block Type 0 */
- { /* Coeff Band 0 */
- { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128},
- { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128},
- { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128},
- { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128}
- },
- { /* Coeff Band 1 */
- { 60, 140, 195, 255, 212, 214, 128, 128, 128, 128, 128},
- { 75, 221, 231, 255, 203, 255, 128, 128, 128, 128, 128},
- { 9, 212, 196, 251, 197, 207, 255, 185, 128, 128, 128},
- { 9, 212, 196, 251, 197, 207, 255, 185, 128, 128, 128}
- },
- { /* Coeff Band 2 */
- { 1, 227, 226, 255, 215, 215, 128, 128, 128, 128, 128},
- { 5, 163, 209, 255, 212, 212, 255, 255, 128, 128, 128},
- { 1, 133, 203, 255, 210, 220, 255, 255, 128, 128, 128},
- { 1, 133, 203, 255, 210, 220, 255, 255, 128, 128, 128}
- },
- { /* Coeff Band 3 */
- { 1, 226, 225, 255, 228, 236, 128, 128, 128, 128, 128},
- { 6, 163, 208, 255, 224, 234, 255, 255, 128, 128, 128},
- { 1, 122, 196, 253, 212, 248, 255, 255, 128, 128, 128},
- { 1, 122, 196, 253, 212, 248, 255, 255, 128, 128, 128}
- },
- { /* Coeff Band 4 */
- { 1, 222, 197, 254, 193, 216, 255, 236, 128, 128, 128},
- { 7, 140, 163, 251, 195, 211, 255, 238, 128, 128, 128},
- { 1, 91, 152, 249, 181, 197, 255, 239, 128, 128, 128},
- { 1, 91, 152, 249, 181, 197, 255, 239, 128, 128, 128}
- },
- { /* Coeff Band 5 */
- { 1, 226, 218, 255, 216, 241, 255, 255, 128, 128, 128},
- { 6, 154, 191, 255, 218, 240, 255, 255, 128, 128, 128},
- { 1, 110, 171, 252, 191, 204, 255, 236, 128, 128, 128},
- { 1, 110, 171, 252, 191, 204, 255, 236, 128, 128, 128}
- },
- { /* Coeff Band 6 */
- { 1, 221, 217, 255, 208, 217, 255, 232, 128, 128, 128},
- { 11, 155, 189, 254, 203, 211, 255, 249, 128, 128, 128},
- { 1, 110, 171, 252, 191, 204, 255, 236, 128, 128, 128},
- { 1, 110, 171, 252, 191, 204, 255, 236, 128, 128, 128}
- },
- { /* Coeff Band 7 */
- { 1, 207, 235, 255, 232, 240, 128, 128, 128, 128, 128},
- { 58, 161, 216, 255, 229, 235, 255, 255, 128, 128, 128},
- { 8, 133, 204, 255, 219, 231, 255, 255, 128, 128, 128},
- { 8, 133, 204, 255, 219, 231, 255, 255, 128, 128, 128}
- }
- },
- { /* block Type 1 */
- { /* Coeff Band 0 */
- { 1, 30, 103, 204, 142, 168, 235, 161, 255, 228, 128},
- { 1, 35, 90, 192, 130, 161, 227, 158, 255, 226, 255},
- { 1, 36, 78, 180, 122, 156, 221, 153, 255, 222, 255},
- { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128}
- },
- { /* Coeff Band 1 */
- { 1, 163, 228, 253, 212, 194, 255, 205, 128, 128, 128},
- { 67, 160, 226, 253, 210, 202, 245, 172, 255, 255, 128},
- { 51, 147, 219, 251, 207, 207, 255, 217, 128, 128, 128},
- { 25, 107, 175, 245, 183, 190, 254, 209, 255, 255, 128}
- },
- { /* Coeff Band 2 */
- { 1, 66, 170, 240, 177, 186, 252, 203, 255, 245, 128},
- { 23, 64, 145, 230, 161, 177, 252, 198, 255, 255, 128},
- { 6, 51, 99, 208, 135, 163, 249, 178, 255, 248, 128},
- { 1, 33, 59, 161, 104, 151, 238, 164, 255, 237, 128}
- },
- { /* Coeff Band 3 */
- { 1, 76, 216, 250, 198, 199, 255, 226, 255, 255, 128},
- { 86, 83, 200, 247, 189, 193, 255, 224, 255, 255, 128},
- { 30, 75, 164, 242, 172, 184, 254, 218, 255, 255, 128},
- { 3, 54, 103, 227, 140, 172, 253, 201, 255, 255, 128}
- },
- { /* Coeff Band 4 */
- { 1, 241, 247, 255, 233, 223, 255, 255, 128, 128, 128},
- { 78, 212, 242, 255, 226, 230, 255, 255, 128, 128, 128},
- { 10, 167, 224, 255, 217, 225, 255, 128, 128, 128, 128},
- { 1, 104, 176, 250, 166, 219, 255, 255, 128, 128, 128}
- },
- { /* Coeff Band 5 */
- { 1, 194, 241, 254, 228, 214, 248, 237, 255, 255, 128},
- { 95, 133, 228, 254, 218, 215, 255, 229, 128, 128, 128},
- { 24, 119, 201, 252, 202, 205, 255, 229, 128, 128, 128},
- { 1, 88, 155, 246, 183, 193, 255, 205, 128, 128, 128}
- },
- { /* Coeff Band 6 */
- { 1, 204, 236, 255, 222, 220, 255, 239, 128, 128, 128},
- { 126, 105, 225, 254, 214, 217, 255, 254, 128, 128, 128},
- { 44, 86, 190, 251, 197, 204, 255, 233, 128, 128, 128},
- { 6, 71, 130, 240, 164, 188, 255, 246, 128, 128, 128}
- },
- { /* Coeff Band 7 */
- { 1, 195, 250, 255, 239, 197, 128, 128, 128, 128, 128},
- { 167, 102, 249, 255, 234, 255, 128, 128, 128, 128, 128},
- { 65, 91, 222, 255, 217, 255, 128, 128, 128, 128, 128},
- { 1, 59, 128, 255, 154, 255, 128, 128, 128, 128, 128}
- }
- },
- { /* block Type 2 */
- { /* Coeff Band 0 */
- { 1, 30, 103, 204, 142, 168, 235, 161, 255, 228, 128},
- { 1, 35, 90, 192, 130, 161, 227, 158, 255, 226, 255},
- { 1, 36, 78, 180, 122, 156, 221, 153, 255, 222, 255},
- { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128}
- },
- { /* Coeff Band 1 */
- { 1, 163, 228, 253, 212, 194, 255, 205, 128, 128, 128},
- { 67, 160, 226, 253, 210, 202, 245, 172, 255, 255, 128},
- { 51, 147, 219, 251, 207, 207, 255, 217, 128, 128, 128},
- { 25, 107, 175, 245, 183, 190, 254, 209, 255, 255, 128}
- },
- { /* Coeff Band 2 */
- { 1, 66, 170, 240, 177, 186, 252, 203, 255, 245, 128},
- { 23, 64, 145, 230, 161, 177, 252, 198, 255, 255, 128},
- { 6, 51, 99, 208, 135, 163, 249, 178, 255, 248, 128},
- { 1, 33, 59, 161, 104, 151, 238, 164, 255, 237, 128}
- },
- { /* Coeff Band 3 */
- { 1, 76, 216, 250, 198, 199, 255, 226, 255, 255, 128},
- { 86, 83, 200, 247, 189, 193, 255, 224, 255, 255, 128},
- { 30, 75, 164, 242, 172, 184, 254, 218, 255, 255, 128},
- { 3, 54, 103, 227, 140, 172, 253, 201, 255, 255, 128}
- },
- { /* Coeff Band 4 */
- { 1, 241, 247, 255, 233, 223, 255, 255, 128, 128, 128},
- { 78, 212, 242, 255, 226, 230, 255, 255, 128, 128, 128},
- { 10, 167, 224, 255, 217, 225, 255, 128, 128, 128, 128},
- { 1, 104, 176, 250, 166, 219, 255, 255, 128, 128, 128}
- },
- { /* Coeff Band 5 */
- { 1, 194, 241, 254, 228, 214, 248, 237, 255, 255, 128},
- { 95, 133, 228, 254, 218, 215, 255, 229, 128, 128, 128},
- { 24, 119, 201, 252, 202, 205, 255, 229, 128, 128, 128},
- { 1, 88, 155, 246, 183, 193, 255, 205, 128, 128, 128}
- },
- { /* Coeff Band 6 */
- { 1, 204, 236, 255, 222, 220, 255, 239, 128, 128, 128},
- { 126, 105, 225, 254, 214, 217, 255, 254, 128, 128, 128},
- { 44, 86, 190, 251, 197, 204, 255, 233, 128, 128, 128},
- { 6, 71, 130, 240, 164, 188, 255, 246, 128, 128, 128}
- },
- { /* Coeff Band 7 */
- { 1, 195, 250, 255, 239, 197, 128, 128, 128, 128, 128},
- { 167, 102, 249, 255, 234, 255, 128, 128, 128, 128, 128},
- { 65, 91, 222, 255, 217, 255, 128, 128, 128, 128, 128},
- { 1, 59, 128, 255, 154, 255, 128, 128, 128, 128, 128}
- }
- },
- { /* block Type 3 */
- { /* Coeff Band 0 */
- { 17, 105, 227, 195, 164, 170, 168, 137, 221, 160, 184},
- { 6, 92, 166, 193, 158, 169, 179, 142, 236, 175, 200},
- { 2, 68, 118, 193, 147, 168, 187, 149, 241, 178, 247},
- { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128}
- },
- { /* Coeff Band 1 */
- { 1, 193, 221, 246, 198, 194, 244, 176, 255, 192, 128},
- { 112, 160, 209, 244, 196, 194, 243, 175, 255, 209, 128},
- { 45, 123, 175, 240, 184, 195, 239, 178, 255, 218, 255},
- { 16, 53, 75, 169, 119, 152, 209, 146, 255, 219, 255}
- },
- { /* Coeff Band 2 */
- { 1, 141, 183, 240, 176, 187, 246, 198, 255, 218, 128},
- { 36, 97, 150, 231, 161, 180, 243, 191, 255, 217, 255},
- { 8, 65, 111, 210, 143, 166, 230, 167, 255, 224, 255},
- { 2, 35, 61, 157, 113, 149, 208, 142, 255, 217, 255}
- },
- { /* Coeff Band 3 */
- { 1, 173, 196, 245, 184, 191, 252, 211, 255, 240, 128},
- { 35, 119, 175, 242, 177, 187, 252, 209, 255, 235, 128},
- { 4, 88, 141, 234, 161, 180, 249, 200, 255, 228, 128},
- { 1, 57, 95, 203, 133, 161, 235, 167, 255, 231, 255}
- },
- { /* Coeff Band 4 */
- { 1, 208, 227, 249, 209, 204, 248, 188, 255, 248, 128},
- { 28, 162, 211, 247, 203, 200, 252, 188, 255, 232, 128},
- { 5, 114, 174, 238, 182, 189, 245, 184, 255, 238, 128},
- { 1, 61, 100, 205, 136, 164, 235, 163, 255, 239, 128}
- },
- { /* Coeff Band 5 */
- { 1, 195, 218, 252, 208, 207, 250, 205, 255, 245, 128},
- { 22, 141, 196, 249, 198, 201, 250, 202, 255, 244, 128},
- { 2, 105, 163, 240, 178, 189, 246, 191, 255, 246, 128},
- { 1, 70, 112, 206, 144, 167, 232, 162, 255, 239, 128}
- },
- { /* Coeff Band 6 */
- { 1, 204, 215, 251, 204, 203, 255, 222, 255, 225, 128},
- { 15, 140, 194, 249, 194, 199, 254, 221, 255, 253, 128},
- { 1, 95, 153, 243, 172, 188, 254, 213, 255, 248, 128},
- { 1, 59, 99, 216, 135, 166, 247, 190, 255, 237, 255}
- },
- { /* Coeff Band 7 */
- { 1, 7, 231, 255, 227, 223, 255, 240, 255, 255, 128},
- { 15, 157, 217, 255, 218, 219, 255, 239, 255, 255, 128},
- { 1, 114, 182, 252, 198, 207, 255, 235, 255, 255, 128},
- { 1, 71, 122, 238, 154, 181, 255, 216, 255, 255, 128}
- }
- }
-};
-
-static const vp9_prob
- default_hybrid_coef_probs_16x16[BLOCK_TYPES_16X16]
- [COEF_BANDS]
- [PREV_COEF_CONTEXTS]
- [ENTROPY_NODES] = {
- { /* block Type 0 */
- { /* Coeff Band 0 */
- { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128},
- { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128},
- { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128},
- { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128}
- },
- { /* Coeff Band 1 */
- { 60, 140, 195, 255, 212, 214, 128, 128, 128, 128, 128},
- { 75, 221, 231, 255, 203, 255, 128, 128, 128, 128, 128},
- { 9, 212, 196, 251, 197, 207, 255, 185, 128, 128, 128},
- { 9, 212, 196, 251, 197, 207, 255, 185, 128, 128, 128}
- },
- { /* Coeff Band 2 */
- { 1, 227, 226, 255, 215, 215, 128, 128, 128, 128, 128},
- { 5, 163, 209, 255, 212, 212, 255, 255, 128, 128, 128},
- { 1, 133, 203, 255, 210, 220, 255, 255, 128, 128, 128},
- { 1, 133, 203, 255, 210, 220, 255, 255, 128, 128, 128}
- },
- { /* Coeff Band 3 */
- { 1, 226, 225, 255, 228, 236, 128, 128, 128, 128, 128},
- { 6, 163, 208, 255, 224, 234, 255, 255, 128, 128, 128},
- { 1, 122, 196, 253, 212, 248, 255, 255, 128, 128, 128},
- { 1, 122, 196, 253, 212, 248, 255, 255, 128, 128, 128}
- },
- { /* Coeff Band 4 */
- { 1, 222, 197, 254, 193, 216, 255, 236, 128, 128, 128},
- { 7, 140, 163, 251, 195, 211, 255, 238, 128, 128, 128},
- { 1, 91, 152, 249, 181, 197, 255, 239, 128, 128, 128},
- { 1, 91, 152, 249, 181, 197, 255, 239, 128, 128, 128}
- },
- { /* Coeff Band 5 */
- { 1, 226, 218, 255, 216, 241, 255, 255, 128, 128, 128},
- { 6, 154, 191, 255, 218, 240, 255, 255, 128, 128, 128},
- { 1, 110, 171, 252, 191, 204, 255, 236, 128, 128, 128},
- { 1, 110, 171, 252, 191, 204, 255, 236, 128, 128, 128}
- },
- { /* Coeff Band 6 */
- { 1, 221, 217, 255, 208, 217, 255, 232, 128, 128, 128},
- { 11, 155, 189, 254, 203, 211, 255, 249, 128, 128, 128},
- { 1, 110, 171, 252, 191, 204, 255, 236, 128, 128, 128},
- { 1, 110, 171, 252, 191, 204, 255, 236, 128, 128, 128}
- },
- { /* Coeff Band 7 */
- { 1, 207, 235, 255, 232, 240, 128, 128, 128, 128, 128},
- { 58, 161, 216, 255, 229, 235, 255, 255, 128, 128, 128},
- { 8, 133, 204, 255, 219, 231, 255, 255, 128, 128, 128},
- { 8, 133, 204, 255, 219, 231, 255, 255, 128, 128, 128}
- }
- },
- { /* block Type 1 */
- { /* Coeff Band 0 */
- { 1, 30, 103, 204, 142, 168, 235, 161, 255, 228, 128},
- { 1, 35, 90, 192, 130, 161, 227, 158, 255, 226, 255},
- { 1, 36, 78, 180, 122, 156, 221, 153, 255, 222, 255},
- { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128}
- },
- { /* Coeff Band 1 */
- { 1, 163, 228, 253, 212, 194, 255, 205, 128, 128, 128},
- { 67, 160, 226, 253, 210, 202, 245, 172, 255, 255, 128},
- { 51, 147, 219, 251, 207, 207, 255, 217, 128, 128, 128},
- { 25, 107, 175, 245, 183, 190, 254, 209, 255, 255, 128}
- },
- { /* Coeff Band 2 */
- { 1, 66, 170, 240, 177, 186, 252, 203, 255, 245, 128},
- { 23, 64, 145, 230, 161, 177, 252, 198, 255, 255, 128},
- { 6, 51, 99, 208, 135, 163, 249, 178, 255, 248, 128},
- { 1, 33, 59, 161, 104, 151, 238, 164, 255, 237, 128}
- },
- { /* Coeff Band 3 */
- { 1, 76, 216, 250, 198, 199, 255, 226, 255, 255, 128},
- { 86, 83, 200, 247, 189, 193, 255, 224, 255, 255, 128},
- { 30, 75, 164, 242, 172, 184, 254, 218, 255, 255, 128},
- { 3, 54, 103, 227, 140, 172, 253, 201, 255, 255, 128}
- },
- { /* Coeff Band 4 */
- { 1, 241, 247, 255, 233, 223, 255, 255, 128, 128, 128},
- { 78, 212, 242, 255, 226, 230, 255, 255, 128, 128, 128},
- { 10, 167, 224, 255, 217, 225, 255, 128, 128, 128, 128},
- { 1, 104, 176, 250, 166, 219, 255, 255, 128, 128, 128}
- },
- { /* Coeff Band 5 */
- { 1, 194, 241, 254, 228, 214, 248, 237, 255, 255, 128},
- { 95, 133, 228, 254, 218, 215, 255, 229, 128, 128, 128},
- { 24, 119, 201, 252, 202, 205, 255, 229, 128, 128, 128},
- { 1, 88, 155, 246, 183, 193, 255, 205, 128, 128, 128}
- },
- { /* Coeff Band 6 */
- { 1, 204, 236, 255, 222, 220, 255, 239, 128, 128, 128},
- { 126, 105, 225, 254, 214, 217, 255, 254, 128, 128, 128},
- { 44, 86, 190, 251, 197, 204, 255, 233, 128, 128, 128},
- { 6, 71, 130, 240, 164, 188, 255, 246, 128, 128, 128}
- },
- { /* Coeff Band 7 */
- { 1, 195, 250, 255, 239, 197, 128, 128, 128, 128, 128},
- { 167, 102, 249, 255, 234, 255, 128, 128, 128, 128, 128},
- { 65, 91, 222, 255, 217, 255, 128, 128, 128, 128, 128},
- { 1, 59, 128, 255, 154, 255, 128, 128, 128, 128, 128}
- }
- },
- { /* block Type 2 */
- { /* Coeff Band 0 */
- { 1, 30, 103, 204, 142, 168, 235, 161, 255, 228, 128},
- { 1, 35, 90, 192, 130, 161, 227, 158, 255, 226, 255},
- { 1, 36, 78, 180, 122, 156, 221, 153, 255, 222, 255},
- { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128}
- },
- { /* Coeff Band 1 */
- { 1, 163, 228, 253, 212, 194, 255, 205, 128, 128, 128},
- { 67, 160, 226, 253, 210, 202, 245, 172, 255, 255, 128},
- { 51, 147, 219, 251, 207, 207, 255, 217, 128, 128, 128},
- { 25, 107, 175, 245, 183, 190, 254, 209, 255, 255, 128}
- },
- { /* Coeff Band 2 */
- { 1, 66, 170, 240, 177, 186, 252, 203, 255, 245, 128},
- { 23, 64, 145, 230, 161, 177, 252, 198, 255, 255, 128},
- { 6, 51, 99, 208, 135, 163, 249, 178, 255, 248, 128},
- { 1, 33, 59, 161, 104, 151, 238, 164, 255, 237, 128}
- },
- { /* Coeff Band 3 */
- { 1, 76, 216, 250, 198, 199, 255, 226, 255, 255, 128},
- { 86, 83, 200, 247, 189, 193, 255, 224, 255, 255, 128},
- { 30, 75, 164, 242, 172, 184, 254, 218, 255, 255, 128},
- { 3, 54, 103, 227, 140, 172, 253, 201, 255, 255, 128}
- },
- { /* Coeff Band 4 */
- { 1, 241, 247, 255, 233, 223, 255, 255, 128, 128, 128},
- { 78, 212, 242, 255, 226, 230, 255, 255, 128, 128, 128},
- { 10, 167, 224, 255, 217, 225, 255, 128, 128, 128, 128},
- { 1, 104, 176, 250, 166, 219, 255, 255, 128, 128, 128}
- },
- { /* Coeff Band 5 */
- { 1, 194, 241, 254, 228, 214, 248, 237, 255, 255, 128},
- { 95, 133, 228, 254, 218, 215, 255, 229, 128, 128, 128},
- { 24, 119, 201, 252, 202, 205, 255, 229, 128, 128, 128},
- { 1, 88, 155, 246, 183, 193, 255, 205, 128, 128, 128}
- },
- { /* Coeff Band 6 */
- { 1, 204, 236, 255, 222, 220, 255, 239, 128, 128, 128},
- { 126, 105, 225, 254, 214, 217, 255, 254, 128, 128, 128},
- { 44, 86, 190, 251, 197, 204, 255, 233, 128, 128, 128},
- { 6, 71, 130, 240, 164, 188, 255, 246, 128, 128, 128}
- },
- { /* Coeff Band 7 */
- { 1, 195, 250, 255, 239, 197, 128, 128, 128, 128, 128},
- { 167, 102, 249, 255, 234, 255, 128, 128, 128, 128, 128},
- { 65, 91, 222, 255, 217, 255, 128, 128, 128, 128, 128},
- { 1, 59, 128, 255, 154, 255, 128, 128, 128, 128, 128}
- }
- },
- { /* block Type 3 */
- { /* Coeff Band 0 */
- { 17, 105, 227, 195, 164, 170, 168, 137, 221, 160, 184},
- { 6, 92, 166, 193, 158, 169, 179, 142, 236, 175, 200},
- { 2, 68, 118, 193, 147, 168, 187, 149, 241, 178, 247},
- { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128}
- },
- { /* Coeff Band 1 */
- { 1, 193, 221, 246, 198, 194, 244, 176, 255, 192, 128},
- { 112, 160, 209, 244, 196, 194, 243, 175, 255, 209, 128},
- { 45, 123, 175, 240, 184, 195, 239, 178, 255, 218, 255},
- { 16, 53, 75, 169, 119, 152, 209, 146, 255, 219, 255}
- },
- { /* Coeff Band 2 */
- { 1, 141, 183, 240, 176, 187, 246, 198, 255, 218, 128},
- { 36, 97, 150, 231, 161, 180, 243, 191, 255, 217, 255},
- { 8, 65, 111, 210, 143, 166, 230, 167, 255, 224, 255},
- { 2, 35, 61, 157, 113, 149, 208, 142, 255, 217, 255}
- },
- { /* Coeff Band 3 */
- { 1, 173, 196, 245, 184, 191, 252, 211, 255, 240, 128},
- { 35, 119, 175, 242, 177, 187, 252, 209, 255, 235, 128},
- { 4, 88, 141, 234, 161, 180, 249, 200, 255, 228, 128},
- { 1, 57, 95, 203, 133, 161, 235, 167, 255, 231, 255}
- },
- { /* Coeff Band 4 */
- { 1, 208, 227, 249, 209, 204, 248, 188, 255, 248, 128},
- { 28, 162, 211, 247, 203, 200, 252, 188, 255, 232, 128},
- { 5, 114, 174, 238, 182, 189, 245, 184, 255, 238, 128},
- { 1, 61, 100, 205, 136, 164, 235, 163, 255, 239, 128}
- },
- { /* Coeff Band 5 */
- { 1, 195, 218, 252, 208, 207, 250, 205, 255, 245, 128},
- { 22, 141, 196, 249, 198, 201, 250, 202, 255, 244, 128},
- { 2, 105, 163, 240, 178, 189, 246, 191, 255, 246, 128},
- { 1, 70, 112, 206, 144, 167, 232, 162, 255, 239, 128}
- },
- { /* Coeff Band 6 */
- { 1, 204, 215, 251, 204, 203, 255, 222, 255, 225, 128},
- { 15, 140, 194, 249, 194, 199, 254, 221, 255, 253, 128},
- { 1, 95, 153, 243, 172, 188, 254, 213, 255, 248, 128},
- { 1, 59, 99, 216, 135, 166, 247, 190, 255, 237, 255}
- },
- { /* Coeff Band 7 */
- { 1, 7, 231, 255, 227, 223, 255, 240, 255, 255, 128},
- { 15, 157, 217, 255, 218, 219, 255, 239, 255, 255, 128},
- { 1, 114, 182, 252, 198, 207, 255, 235, 255, 255, 128},
- { 1, 71, 122, 238, 154, 181, 255, 216, 255, 255, 128}
- }
- }
-};
--- a/vp8/common/entropy.c
+++ /dev/null
@@ -1,447 +1,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include <stdio.h>
-
-#include "entropy.h"
-#include "string.h"
-#include "blockd.h"
-#include "onyxc_int.h"
-#include "entropymode.h"
-#include "vpx_mem/vpx_mem.h"
-
-#define uchar unsigned char /* typedefs can clash */
-#define uint unsigned int
-
-typedef const uchar cuchar;
-typedef const uint cuint;
-
-typedef vp9_prob Prob;
-
-#include "coefupdateprobs.h"
-
-const int vp9_i8x8_block[4] = {0, 2, 8, 10};
-
-DECLARE_ALIGNED(16, const unsigned char, vp9_norm[256]) = {
- 0, 7, 6, 6, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4,
- 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
- 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
- 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
- 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
- 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
- 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
- 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
-};
-
-DECLARE_ALIGNED(16, const int, vp9_coef_bands[16]) = {
- 0, 1, 2, 3, 6, 4, 5, 6, 6, 6, 6, 6, 6, 7, 7, 7
-};
-
-DECLARE_ALIGNED(16, cuchar, vp9_prev_token_class[MAX_ENTROPY_TOKENS]) = {
- 0, 1, 2, 2, 3, 3, 3, 3, 3, 3, 3, 0
-};
-
-DECLARE_ALIGNED(16, const int, vp9_default_zig_zag1d[16]) = {
- 0, 1, 4, 8,
- 5, 2, 3, 6,
- 9, 12, 13, 10,
- 7, 11, 14, 15,
-};
-
-DECLARE_ALIGNED(16, const int, vp9_col_scan[16]) = {
- 0, 4, 8, 12,
- 1, 5, 9, 13,
- 2, 6, 10, 14,
- 3, 7, 11, 15
-};
-DECLARE_ALIGNED(16, const int, vp9_row_scan[16]) = {
- 0, 1, 2, 3,
- 4, 5, 6, 7,
- 8, 9, 10, 11,
- 12, 13, 14, 15
-};
-
-
-DECLARE_ALIGNED(64, const int, vp9_coef_bands_8x8[64]) = { 0, 1, 2, 3, 5, 4, 4, 5,
- 5, 3, 6, 3, 5, 4, 6, 6,
- 6, 5, 5, 6, 6, 6, 6, 6,
- 6, 6, 6, 6, 6, 6, 6, 6,
- 6, 6, 6, 6, 7, 7, 7, 7,
- 7, 7, 7, 7, 7, 7, 7, 7,
- 7, 7, 7, 7, 7, 7, 7, 7,
- 7, 7, 7, 7, 7, 7, 7, 7
- };
-DECLARE_ALIGNED(64, const int, vp9_default_zig_zag1d_8x8[64]) = {
- 0, 1, 8, 16, 9, 2, 3, 10, 17, 24, 32, 25, 18, 11, 4, 5,
- 12, 19, 26, 33, 40, 48, 41, 34, 27, 20, 13, 6, 7, 14, 21, 28,
- 35, 42, 49, 56, 57, 50, 43, 36, 29, 22, 15, 23, 30, 37, 44, 51,
- 58, 59, 52, 45, 38, 31, 39, 46, 53, 60, 61, 54, 47, 55, 62, 63,
-};
-
-// Table can be optimized.
-DECLARE_ALIGNED(16, const int, vp9_coef_bands_16x16[256]) = {
- 0, 1, 2, 3, 5, 4, 4, 5, 5, 3, 6, 3, 5, 4, 6, 6,
- 6, 5, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
- 6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
- 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
- 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
- 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
- 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
- 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
- 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
- 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
- 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
- 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
- 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
- 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
- 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
- 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-};
-DECLARE_ALIGNED(16, const int, vp9_default_zig_zag1d_16x16[256]) = {
- 0, 1, 16, 32, 17, 2, 3, 18, 33, 48, 64, 49, 34, 19, 4, 5,
- 20, 35, 50, 65, 80, 96, 81, 66, 51, 36, 21, 6, 7, 22, 37, 52,
- 67, 82, 97, 112, 128, 113, 98, 83, 68, 53, 38, 23, 8, 9, 24, 39,
- 54, 69, 84, 99, 114, 129, 144, 160, 145, 130, 115, 100, 85, 70, 55, 40,
- 25, 10, 11, 26, 41, 56, 71, 86, 101, 116, 131, 146, 161, 176, 192, 177,
- 162, 147, 132, 117, 102, 87, 72, 57, 42, 27, 12, 13, 28, 43, 58, 73,
- 88, 103, 118, 133, 148, 163, 178, 193, 208, 224, 209, 194, 179, 164, 149, 134,
- 119, 104, 89, 74, 59, 44, 29, 14, 15, 30, 45, 60, 75, 90, 105, 120,
- 135, 150, 165, 180, 195, 210, 225, 240, 241, 226, 211, 196, 181, 166, 151, 136,
- 121, 106, 91, 76, 61, 46, 31, 47, 62, 77, 92, 107, 122, 137, 152, 167,
- 182, 197, 212, 227, 242, 243, 228, 213, 198, 183, 168, 153, 138, 123, 108, 93,
- 78, 63, 79, 94, 109, 124, 139, 154, 169, 184, 199, 214, 229, 244, 245, 230,
- 215, 200, 185, 170, 155, 140, 125, 110, 95, 111, 126, 141, 156, 171, 186, 201,
- 216, 231, 246, 247, 232, 217, 202, 187, 172, 157, 142, 127, 143, 158, 173, 188,
- 203, 218, 233, 248, 249, 234, 219, 204, 189, 174, 159, 175, 190, 205, 220, 235,
- 250, 251, 236, 221, 206, 191, 207, 222, 237, 252, 253, 238, 223, 239, 254, 255,
-};
-
-
-/* Array indices are identical to previously-existing CONTEXT_NODE indices */
-
-const vp9_tree_index vp9_coef_tree[ 22] = /* corresponding _CONTEXT_NODEs */
-{
- -DCT_EOB_TOKEN, 2, /* 0 = EOB */
- -ZERO_TOKEN, 4, /* 1 = ZERO */
- -ONE_TOKEN, 6, /* 2 = ONE */
- 8, 12, /* 3 = LOW_VAL */
- -TWO_TOKEN, 10, /* 4 = TWO */
- -THREE_TOKEN, -FOUR_TOKEN, /* 5 = THREE */
- 14, 16, /* 6 = HIGH_LOW */
- -DCT_VAL_CATEGORY1, -DCT_VAL_CATEGORY2, /* 7 = CAT_ONE */
- 18, 20, /* 8 = CAT_THREEFOUR */
- -DCT_VAL_CATEGORY3, -DCT_VAL_CATEGORY4, /* 9 = CAT_THREE */
- -DCT_VAL_CATEGORY5, -DCT_VAL_CATEGORY6 /* 10 = CAT_FIVE */
-};
-
-struct vp9_token_struct vp9_coef_encodings[MAX_ENTROPY_TOKENS];
-
-/* Trees for extra bits. Probabilities are constant and
- do not depend on previously encoded bits */
-
-static const Prob Pcat1[] = { 159};
-static const Prob Pcat2[] = { 165, 145};
-static const Prob Pcat3[] = { 173, 148, 140};
-static const Prob Pcat4[] = { 176, 155, 140, 135};
-static const Prob Pcat5[] = { 180, 157, 141, 134, 130};
-static const Prob Pcat6[] =
-{ 254, 254, 252, 249, 243, 230, 196, 177, 153, 140, 133, 130, 129};
-
-static vp9_tree_index cat1[2], cat2[4], cat3[6], cat4[8], cat5[10], cat6[26];
-
-static void init_bit_tree(vp9_tree_index *p, int n) {
- int i = 0;
-
- while (++i < n) {
- p[0] = p[1] = i << 1;
- p += 2;
- }
-
- p[0] = p[1] = 0;
-}
-
-static void init_bit_trees() {
- init_bit_tree(cat1, 1);
- init_bit_tree(cat2, 2);
- init_bit_tree(cat3, 3);
- init_bit_tree(cat4, 4);
- init_bit_tree(cat5, 5);
- init_bit_tree(cat6, 13);
-}
-
-vp9_extra_bit_struct vp9_extra_bits[12] = {
- { 0, 0, 0, 0},
- { 0, 0, 0, 1},
- { 0, 0, 0, 2},
- { 0, 0, 0, 3},
- { 0, 0, 0, 4},
- { cat1, Pcat1, 1, 5},
- { cat2, Pcat2, 2, 7},
- { cat3, Pcat3, 3, 11},
- { cat4, Pcat4, 4, 19},
- { cat5, Pcat5, 5, 35},
- { cat6, Pcat6, 13, 67},
- { 0, 0, 0, 0}
-};
-
-#include "default_coef_probs.h"
-
-void vp9_default_coef_probs(VP9_COMMON *pc) {
- vpx_memcpy(pc->fc.coef_probs, default_coef_probs,
- sizeof(pc->fc.coef_probs));
- vpx_memcpy(pc->fc.hybrid_coef_probs, default_hybrid_coef_probs,
- sizeof(pc->fc.hybrid_coef_probs));
-
- vpx_memcpy(pc->fc.coef_probs_8x8, default_coef_probs_8x8,
- sizeof(pc->fc.coef_probs_8x8));
- vpx_memcpy(pc->fc.hybrid_coef_probs_8x8, default_hybrid_coef_probs_8x8,
- sizeof(pc->fc.hybrid_coef_probs_8x8));
-
- vpx_memcpy(pc->fc.coef_probs_16x16, default_coef_probs_16x16,
- sizeof(pc->fc.coef_probs_16x16));
- vpx_memcpy(pc->fc.hybrid_coef_probs_16x16,
- default_hybrid_coef_probs_16x16,
- sizeof(pc->fc.hybrid_coef_probs_16x16));
-}
-
-void vp9_coef_tree_initialize() {
- init_bit_trees();
- vp9_tokens_from_tree(vp9_coef_encodings, vp9_coef_tree);
-}
-
-// #define COEF_COUNT_TESTING
-
-#define COEF_COUNT_SAT 24
-#define COEF_MAX_UPDATE_FACTOR 112
-#define COEF_COUNT_SAT_KEY 24
-#define COEF_MAX_UPDATE_FACTOR_KEY 112
-#define COEF_COUNT_SAT_AFTER_KEY 24
-#define COEF_MAX_UPDATE_FACTOR_AFTER_KEY 128
-
-void vp9_adapt_coef_probs(VP9_COMMON *cm) {
- int t, i, j, k, count;
- unsigned int branch_ct[ENTROPY_NODES][2];
- vp9_prob coef_probs[ENTROPY_NODES];
- int update_factor; /* denominator 256 */
- int factor;
- int count_sat;
-
- // printf("Frame type: %d\n", cm->frame_type);
- if (cm->frame_type == KEY_FRAME) {
- update_factor = COEF_MAX_UPDATE_FACTOR_KEY;
- count_sat = COEF_COUNT_SAT_KEY;
- } else if (cm->last_frame_type == KEY_FRAME) {
- update_factor = COEF_MAX_UPDATE_FACTOR_AFTER_KEY; /* adapt quickly */
- count_sat = COEF_COUNT_SAT_AFTER_KEY;
- } else {
- update_factor = COEF_MAX_UPDATE_FACTOR;
- count_sat = COEF_COUNT_SAT;
- }
-
-#ifdef COEF_COUNT_TESTING
- {
- printf("static const unsigned int\ncoef_counts"
- "[BLOCK_TYPES] [COEF_BANDS]"
- "[PREV_COEF_CONTEXTS] [MAX_ENTROPY_TOKENS] = {\n");
- for (i = 0; i < BLOCK_TYPES; ++i) {
- printf(" {\n");
- for (j = 0; j < COEF_BANDS; ++j) {
- printf(" {\n");
- for (k = 0; k < PREV_COEF_CONTEXTS; ++k) {
- printf(" {");
- for (t = 0; t < MAX_ENTROPY_TOKENS; ++t)
- printf("%d, ", cm->fc.coef_counts[i][j][k][t]);
- printf("},\n");
- }
- printf(" },\n");
- }
- printf(" },\n");
- }
- printf("};\n");
- printf("static const unsigned int\ncoef_counts_8x8"
- "[BLOCK_TYPES_8X8] [COEF_BANDS]"
- "[PREV_COEF_CONTEXTS] [MAX_ENTROPY_TOKENS] = {\n");
- for (i = 0; i < BLOCK_TYPES_8X8; ++i) {
- printf(" {\n");
- for (j = 0; j < COEF_BANDS; ++j) {
- printf(" {\n");
- for (k = 0; k < PREV_COEF_CONTEXTS; ++k) {
- printf(" {");
- for (t = 0; t < MAX_ENTROPY_TOKENS; ++t)
- printf("%d, ", cm->fc.coef_counts_8x8[i][j][k][t]);
- printf("},\n");
- }
- printf(" },\n");
- }
- printf(" },\n");
- }
- printf("};\n");
- printf("static const unsigned int\nhybrid_coef_counts"
- "[BLOCK_TYPES] [COEF_BANDS]"
- "[PREV_COEF_CONTEXTS] [MAX_ENTROPY_TOKENS] = {\n");
- for (i = 0; i < BLOCK_TYPES; ++i) {
- printf(" {\n");
- for (j = 0; j < COEF_BANDS; ++j) {
- printf(" {\n");
- for (k = 0; k < PREV_COEF_CONTEXTS; ++k) {
- printf(" {");
- for (t = 0; t < MAX_ENTROPY_TOKENS; ++t)
- printf("%d, ", cm->fc.hybrid_coef_counts[i][j][k][t]);
- printf("},\n");
- }
- printf(" },\n");
- }
- printf(" },\n");
- }
- printf("};\n");
- }
-#endif
-
- for (i = 0; i < BLOCK_TYPES; ++i)
- for (j = 0; j < COEF_BANDS; ++j)
- for (k = 0; k < PREV_COEF_CONTEXTS; ++k) {
- if (k >= 3 && ((i == 0 && j == 1) || (i > 0 && j == 0)))
- continue;
- vp9_tree_probs_from_distribution(
- MAX_ENTROPY_TOKENS, vp9_coef_encodings, vp9_coef_tree,
- coef_probs, branch_ct, cm->fc.coef_counts [i][j][k],
- 256, 1);
- for (t = 0; t < ENTROPY_NODES; ++t) {
- int prob;
- count = branch_ct[t][0] + branch_ct[t][1];
- count = count > count_sat ? count_sat : count;
- factor = (update_factor * count / count_sat);
- prob = ((int)cm->fc.pre_coef_probs[i][j][k][t] * (256 - factor) +
- (int)coef_probs[t] * factor + 128) >> 8;
- if (prob <= 0) cm->fc.coef_probs[i][j][k][t] = 1;
- else if (prob > 255) cm->fc.coef_probs[i][j][k][t] = 255;
- else cm->fc.coef_probs[i][j][k][t] = prob;
- }
- }
-
- for (i = 0; i < BLOCK_TYPES; ++i)
- for (j = 0; j < COEF_BANDS; ++j)
- for (k = 0; k < PREV_COEF_CONTEXTS; ++k) {
- if (k >= 3 && ((i == 0 && j == 1) || (i > 0 && j == 0)))
- continue;
- vp9_tree_probs_from_distribution(
- MAX_ENTROPY_TOKENS, vp9_coef_encodings, vp9_coef_tree,
- coef_probs, branch_ct, cm->fc.hybrid_coef_counts [i][j][k],
- 256, 1);
- for (t = 0; t < ENTROPY_NODES; ++t) {
- int prob;
- count = branch_ct[t][0] + branch_ct[t][1];
- count = count > count_sat ? count_sat : count;
- factor = (update_factor * count / count_sat);
- prob = ((int)cm->fc.pre_hybrid_coef_probs[i][j][k][t] * (256 - factor) +
- (int)coef_probs[t] * factor + 128) >> 8;
- if (prob <= 0) cm->fc.hybrid_coef_probs[i][j][k][t] = 1;
- else if (prob > 255) cm->fc.hybrid_coef_probs[i][j][k][t] = 255;
- else cm->fc.hybrid_coef_probs[i][j][k][t] = prob;
- }
- }
-
- for (i = 0; i < BLOCK_TYPES_8X8; ++i)
- for (j = 0; j < COEF_BANDS; ++j)
- for (k = 0; k < PREV_COEF_CONTEXTS; ++k) {
- if (k >= 3 && ((i == 0 && j == 1) || (i > 0 && j == 0)))
- continue;
- vp9_tree_probs_from_distribution(
- MAX_ENTROPY_TOKENS, vp9_coef_encodings, vp9_coef_tree,
- coef_probs, branch_ct, cm->fc.coef_counts_8x8 [i][j][k],
- 256, 1);
- for (t = 0; t < ENTROPY_NODES; ++t) {
- int prob;
- count = branch_ct[t][0] + branch_ct[t][1];
- count = count > count_sat ? count_sat : count;
- factor = (update_factor * count / count_sat);
- prob = ((int)cm->fc.pre_coef_probs_8x8[i][j][k][t] * (256 - factor) +
- (int)coef_probs[t] * factor + 128) >> 8;
- if (prob <= 0) cm->fc.coef_probs_8x8[i][j][k][t] = 1;
- else if (prob > 255) cm->fc.coef_probs_8x8[i][j][k][t] = 255;
- else cm->fc.coef_probs_8x8[i][j][k][t] = prob;
- }
- }
-
- for (i = 0; i < BLOCK_TYPES_8X8; ++i)
- for (j = 0; j < COEF_BANDS; ++j)
- for (k = 0; k < PREV_COEF_CONTEXTS; ++k) {
- if (k >= 3 && ((i == 0 && j == 1) || (i > 0 && j == 0)))
- continue;
- vp9_tree_probs_from_distribution(
- MAX_ENTROPY_TOKENS, vp9_coef_encodings, vp9_coef_tree,
- coef_probs, branch_ct, cm->fc.hybrid_coef_counts_8x8 [i][j][k],
- 256, 1);
- for (t = 0; t < ENTROPY_NODES; ++t) {
- int prob;
- count = branch_ct[t][0] + branch_ct[t][1];
- count = count > count_sat ? count_sat : count;
- factor = (update_factor * count / count_sat);
- prob = ((int)cm->fc.pre_hybrid_coef_probs_8x8[i][j][k][t] *
- (256 - factor) +
- (int)coef_probs[t] * factor + 128) >> 8;
- if (prob <= 0) cm->fc.hybrid_coef_probs_8x8[i][j][k][t] = 1;
- else if (prob > 255) cm->fc.hybrid_coef_probs_8x8[i][j][k][t] = 255;
- else cm->fc.hybrid_coef_probs_8x8[i][j][k][t] = prob;
- }
- }
-
- for (i = 0; i < BLOCK_TYPES_16X16; ++i)
- for (j = 0; j < COEF_BANDS; ++j)
- for (k = 0; k < PREV_COEF_CONTEXTS; ++k) {
- if (k >= 3 && ((i == 0 && j == 1) || (i > 0 && j == 0)))
- continue;
- vp9_tree_probs_from_distribution(
- MAX_ENTROPY_TOKENS, vp9_coef_encodings, vp9_coef_tree,
- coef_probs, branch_ct, cm->fc.coef_counts_16x16[i][j][k], 256, 1);
- for (t = 0; t < ENTROPY_NODES; ++t) {
- int prob;
- count = branch_ct[t][0] + branch_ct[t][1];
- count = count > count_sat ? count_sat : count;
- factor = (update_factor * count / count_sat);
- prob = ((int)cm->fc.pre_coef_probs_16x16[i][j][k][t] *
- (256 - factor) +
- (int)coef_probs[t] * factor + 128) >> 8;
- if (prob <= 0) cm->fc.coef_probs_16x16[i][j][k][t] = 1;
- else if (prob > 255) cm->fc.coef_probs_16x16[i][j][k][t] = 255;
- else cm->fc.coef_probs_16x16[i][j][k][t] = prob;
- }
- }
-
- for (i = 0; i < BLOCK_TYPES_16X16; ++i)
- for (j = 0; j < COEF_BANDS; ++j)
- for (k = 0; k < PREV_COEF_CONTEXTS; ++k) {
- if (k >= 3 && ((i == 0 && j == 1) || (i > 0 && j == 0)))
- continue;
- vp9_tree_probs_from_distribution(
- MAX_ENTROPY_TOKENS, vp9_coef_encodings, vp9_coef_tree,
- coef_probs, branch_ct, cm->fc.hybrid_coef_counts_16x16[i][j][k], 256, 1);
- for (t = 0; t < ENTROPY_NODES; ++t) {
- int prob;
- count = branch_ct[t][0] + branch_ct[t][1];
- count = count > count_sat ? count_sat : count;
- factor = (update_factor * count / count_sat);
- prob = ((int)cm->fc.pre_hybrid_coef_probs_16x16[i][j][k][t] * (256 - factor) +
- (int)coef_probs[t] * factor + 128) >> 8;
- if (prob <= 0) cm->fc.hybrid_coef_probs_16x16[i][j][k][t] = 1;
- else if (prob > 255) cm->fc.hybrid_coef_probs_16x16[i][j][k][t] = 255;
- else cm->fc.hybrid_coef_probs_16x16[i][j][k][t] = prob;
- }
- }
-}
--- a/vp8/common/entropy.h
+++ /dev/null
@@ -1,112 +1,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#ifndef __INC_ENTROPY_H
-#define __INC_ENTROPY_H
-
-#include "treecoder.h"
-#include "blockd.h"
-#include "common.h"
-#include "coefupdateprobs.h"
-
-extern const int vp9_i8x8_block[4];
-
-/* Coefficient token alphabet */
-
-#define ZERO_TOKEN 0 /* 0 Extra Bits 0+0 */
-#define ONE_TOKEN 1 /* 1 Extra Bits 0+1 */
-#define TWO_TOKEN 2 /* 2 Extra Bits 0+1 */
-#define THREE_TOKEN 3 /* 3 Extra Bits 0+1 */
-#define FOUR_TOKEN 4 /* 4 Extra Bits 0+1 */
-#define DCT_VAL_CATEGORY1 5 /* 5-6 Extra Bits 1+1 */
-#define DCT_VAL_CATEGORY2 6 /* 7-10 Extra Bits 2+1 */
-#define DCT_VAL_CATEGORY3 7 /* 11-18 Extra Bits 3+1 */
-#define DCT_VAL_CATEGORY4 8 /* 19-34 Extra Bits 4+1 */
-#define DCT_VAL_CATEGORY5 9 /* 35-66 Extra Bits 5+1 */
-#define DCT_VAL_CATEGORY6 10 /* 67+ Extra Bits 13+1 */
-#define DCT_EOB_TOKEN 11 /* EOB Extra Bits 0+0 */
-#define MAX_ENTROPY_TOKENS 12
-#define ENTROPY_NODES 11
-#define EOSB_TOKEN 127 /* Not signalled, encoder only */
-
-extern const vp9_tree_index vp9_coef_tree[];
-
-extern struct vp9_token_struct vp9_coef_encodings[MAX_ENTROPY_TOKENS];
-
-typedef struct {
- vp9_tree_p tree;
- const vp9_prob *prob;
- int Len;
- int base_val;
-} vp9_extra_bit_struct;
-
-extern vp9_extra_bit_struct vp9_extra_bits[12]; /* indexed by token value */
-
-#define PROB_UPDATE_BASELINE_COST 7
-
-#define MAX_PROB 255
-#define DCT_MAX_VALUE 8192
-
-/* Coefficients are predicted via a 3-dimensional probability table. */
-
-/* Outside dimension. 0 = Y no DC, 1 = Y2, 2 = UV, 3 = Y with DC */
-#define BLOCK_TYPES 4
-
-#define BLOCK_TYPES_8X8 4
-
-#define BLOCK_TYPES_16X16 4
-
-/* Middle dimension is a coarsening of the coefficient's
- position within the 4x4 DCT. */
-
-#define COEF_BANDS 8
-extern DECLARE_ALIGNED(16, const int, vp9_coef_bands[16]);
-extern DECLARE_ALIGNED(64, const int, vp9_coef_bands_8x8[64]);
-extern DECLARE_ALIGNED(16, const int, vp9_coef_bands_16x16[256]);
-
-/* Inside dimension is 3-valued measure of nearby complexity, that is,
- the extent to which nearby coefficients are nonzero. For the first
- coefficient (DC, unless block type is 0), we look at the (already encoded)
- blocks above and to the left of the current block. The context index is
- then the number (0,1,or 2) of these blocks having nonzero coefficients.
- After decoding a coefficient, the measure is roughly the size of the
- most recently decoded coefficient (0 for 0, 1 for 1, 2 for >1).
- Note that the intuitive meaning of this measure changes as coefficients
- are decoded, e.g., prior to the first token, a zero means that my neighbors
- are empty while, after the first token, because of the use of end-of-block,
- a zero means we just decoded a zero and hence guarantees that a non-zero
- coefficient will appear later in this block. However, this shift
- in meaning is perfectly OK because our context depends also on the
- coefficient band (and since zigzag positions 0, 1, and 2 are in
- distinct bands). */
-
-/*# define DC_TOKEN_CONTEXTS 3*/ /* 00, 0!0, !0!0 */
-#define PREV_COEF_CONTEXTS 4
-
-#define SUBEXP_PARAM 4 /* Subexponential code parameter */
-#define MODULUS_PARAM 13 /* Modulus parameter */
-
-extern DECLARE_ALIGNED(16, const unsigned char, vp9_prev_token_class[MAX_ENTROPY_TOKENS]);
-
-struct VP9Common;
-void vp9_default_coef_probs(struct VP9Common *);
-extern DECLARE_ALIGNED(16, const int, vp9_default_zig_zag1d[16]);
-
-extern DECLARE_ALIGNED(16, const int, vp9_col_scan[16]);
-extern DECLARE_ALIGNED(16, const int, vp9_row_scan[16]);
-
-extern DECLARE_ALIGNED(64, const int, vp9_default_zig_zag1d_8x8[64]);
-void vp9_coef_tree_initialize(void);
-
-extern DECLARE_ALIGNED(16, const int, vp9_default_zig_zag1d_16x16[256]);
-void vp9_adapt_coef_probs(struct VP9Common *);
-
-#endif
--- a/vp8/common/entropymode.c
+++ /dev/null
@@ -1,614 +1,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include "onyxc_int.h"
-#include "modecont.h"
-#include "vpx_mem/vpx_mem.h"
-
-
-static const unsigned int kf_y_mode_cts[8][VP9_YMODES] = {
- /* DC V H D45 135 117 153 D27 D63 TM i8x8 BPRED */
- {12, 6, 5, 5, 5, 5, 5, 5, 5, 2, 22, 200},
- {25, 13, 13, 7, 7, 7, 7, 7, 7, 6, 27, 160},
- {31, 17, 18, 8, 8, 8, 8, 8, 8, 9, 26, 139},
- {40, 22, 23, 8, 8, 8, 8, 8, 8, 12, 27, 116},
- {53, 26, 28, 8, 8, 8, 8, 8, 8, 13, 26, 94},
- {68, 33, 35, 8, 8, 8, 8, 8, 8, 17, 20, 68},
- {78, 38, 38, 8, 8, 8, 8, 8, 8, 19, 16, 52},
- {89, 42, 42, 8, 8, 8, 8, 8, 8, 21, 12, 34},
-};
-
-static const unsigned int y_mode_cts [VP9_YMODES] = {
- /* DC V H D45 135 117 153 D27 D63 TM i8x8 BPRED */
- 98, 19, 15, 14, 14, 14, 14, 12, 12, 13, 16, 70
-};
-
-static const unsigned int uv_mode_cts [VP9_YMODES] [VP9_UV_MODES] = {
- /* DC V H D45 135 117 153 D27 D63 TM */
- { 200, 15, 15, 10, 10, 10, 10, 10, 10, 6}, /* DC */
- { 130, 75, 10, 10, 10, 10, 10, 10, 10, 6}, /* V */
- { 130, 10, 75, 10, 10, 10, 10, 10, 10, 6}, /* H */
- { 130, 15, 10, 75, 10, 10, 10, 10, 10, 6}, /* D45 */
- { 150, 15, 10, 10, 75, 10, 10, 10, 10, 6}, /* D135 */
- { 150, 15, 10, 10, 10, 75, 10, 10, 10, 6}, /* D117 */
- { 150, 15, 10, 10, 10, 10, 75, 10, 10, 6}, /* D153 */
- { 150, 15, 10, 10, 10, 10, 10, 75, 10, 6}, /* D27 */
- { 150, 15, 10, 10, 10, 10, 10, 10, 75, 6}, /* D63 */
- { 160, 30, 30, 10, 10, 10, 10, 10, 10, 16}, /* TM */
- { 132, 46, 40, 10, 10, 10, 10, 10, 10, 18}, /* i8x8 - never used */
- { 150, 35, 41, 10, 10, 10, 10, 10, 10, 10}, /* BPRED */
-};
-
-static const unsigned int i8x8_mode_cts [VP9_I8X8_MODES] = {
- /* DC V H D45 135 117 153 D27 D63 TM */
- 73, 49, 61, 30, 30, 30, 30, 30, 30, 13
-};
-
-static const unsigned int kf_uv_mode_cts [VP9_YMODES] [VP9_UV_MODES] = {
- // DC V H D45 135 117 153 D27 D63 TM
- { 160, 24, 24, 20, 20, 20, 20, 20, 20, 8}, /* DC */
- { 102, 64, 30, 20, 20, 20, 20, 20, 20, 10}, /* V */
- { 102, 30, 64, 20, 20, 20, 20, 20, 20, 10}, /* H */
- { 102, 33, 20, 64, 20, 20, 20, 20, 20, 14}, /* D45 */
- { 102, 33, 20, 20, 64, 20, 20, 20, 20, 14}, /* D135 */
- { 122, 33, 20, 20, 20, 64, 20, 20, 20, 14}, /* D117 */
- { 102, 33, 20, 20, 20, 20, 64, 20, 20, 14}, /* D153 */
- { 102, 33, 20, 20, 20, 20, 20, 64, 20, 14}, /* D27 */
- { 102, 33, 20, 20, 20, 20, 20, 20, 64, 14}, /* D63 */
- { 132, 36, 30, 20, 20, 20, 20, 20, 20, 18}, /* TM */
- { 122, 41, 35, 20, 20, 20, 20, 20, 20, 18}, /* i8x8 - never used */
- { 122, 41, 35, 20, 20, 20, 20, 20, 20, 18}, /* BPRED */
-};
-
-static const unsigned int bmode_cts[VP9_BINTRAMODES] = {
- /* DC TM VE HE LD RD VR VL HD HU */
- 43891, 17694, 10036, 3920, 3363, 2546, 5119, 3221, 2471, 1723
-};
-
-typedef enum {
- SUBMVREF_NORMAL,
- SUBMVREF_LEFT_ZED,
- SUBMVREF_ABOVE_ZED,
- SUBMVREF_LEFT_ABOVE_SAME,
- SUBMVREF_LEFT_ABOVE_ZED
-} sumvfref_t;
-
-int vp9_mv_cont(const int_mv *l, const int_mv *a) {
- int lez = (l->as_int == 0);
- int aez = (a->as_int == 0);
- int lea = (l->as_int == a->as_int);
-
- if (lea && lez)
- return SUBMVREF_LEFT_ABOVE_ZED;
-
- if (lea)
- return SUBMVREF_LEFT_ABOVE_SAME;
-
- if (aez)
- return SUBMVREF_ABOVE_ZED;
-
- if (lez)
- return SUBMVREF_LEFT_ZED;
-
- return SUBMVREF_NORMAL;
-}
-
-const vp9_prob vp9_sub_mv_ref_prob [VP9_SUBMVREFS - 1] = { 180, 162, 25};
-
-const vp9_prob vp9_sub_mv_ref_prob2 [SUBMVREF_COUNT][VP9_SUBMVREFS - 1] = {
- { 147, 136, 18 },
- { 106, 145, 1 },
- { 179, 121, 1 },
- { 223, 1, 34 },
- { 208, 1, 1 }
-};
-
-vp9_mbsplit vp9_mbsplits [VP9_NUMMBSPLITS] = {
- {
- 0, 0, 0, 0,
- 0, 0, 0, 0,
- 1, 1, 1, 1,
- 1, 1, 1, 1,
- }, {
- 0, 0, 1, 1,
- 0, 0, 1, 1,
- 0, 0, 1, 1,
- 0, 0, 1, 1,
- }, {
- 0, 0, 1, 1,
- 0, 0, 1, 1,
- 2, 2, 3, 3,
- 2, 2, 3, 3,
- }, {
- 0, 1, 2, 3,
- 4, 5, 6, 7,
- 8, 9, 10, 11,
- 12, 13, 14, 15,
- },
-};
-
-const int vp9_mbsplit_count [VP9_NUMMBSPLITS] = { 2, 2, 4, 16};
-
-const vp9_prob vp9_mbsplit_probs [VP9_NUMMBSPLITS - 1] = { 110, 111, 150};
-
-/* Array indices are identical to previously-existing INTRAMODECONTEXTNODES. */
-
-const vp9_tree_index vp9_bmode_tree[VP9_BINTRAMODES * 2 - 2] = /* INTRAMODECONTEXTNODE value */
-{
- -B_DC_PRED, 2, /* 0 = DC_NODE */
- -B_TM_PRED, 4, /* 1 = TM_NODE */
- -B_VE_PRED, 6, /* 2 = VE_NODE */
- 8, 12, /* 3 = COM_NODE */
- -B_HE_PRED, 10, /* 4 = HE_NODE */
- -B_RD_PRED, -B_VR_PRED, /* 5 = RD_NODE */
- -B_LD_PRED, 14, /* 6 = LD_NODE */
- -B_VL_PRED, 16, /* 7 = VL_NODE */
- -B_HD_PRED, -B_HU_PRED /* 8 = HD_NODE */
-};
-
-/* Again, these trees use the same probability indices as their
- explicitly-programmed predecessors. */
-const vp9_tree_index vp9_ymode_tree[VP9_YMODES * 2 - 2] = {
- 2, 14,
- -DC_PRED, 4,
- 6, 8,
- -D45_PRED, -D135_PRED,
- 10, 12,
- -D117_PRED, -D153_PRED,
- -D27_PRED, -D63_PRED,
- 16, 18,
- -V_PRED, -H_PRED,
- -TM_PRED, 20,
- -B_PRED, -I8X8_PRED
-};
-
-const vp9_tree_index vp9_kf_ymode_tree[VP9_YMODES * 2 - 2] = {
- 2, 14,
- -DC_PRED, 4,
- 6, 8,
- -D45_PRED, -D135_PRED,
- 10, 12,
- -D117_PRED, -D153_PRED,
- -D27_PRED, -D63_PRED,
- 16, 18,
- -V_PRED, -H_PRED,
- -TM_PRED, 20,
- -B_PRED, -I8X8_PRED
-};
-
-const vp9_tree_index vp9_i8x8_mode_tree[VP9_I8X8_MODES * 2 - 2] = {
- 2, 14,
- -DC_PRED, 4,
- 6, 8,
- -D45_PRED, -D135_PRED,
- 10, 12,
- -D117_PRED, -D153_PRED,
- -D27_PRED, -D63_PRED,
- -V_PRED, 16,
- -H_PRED, -TM_PRED
-};
-
-const vp9_tree_index vp9_uv_mode_tree[VP9_UV_MODES * 2 - 2] = {
- 2, 14,
- -DC_PRED, 4,
- 6, 8,
- -D45_PRED, -D135_PRED,
- 10, 12,
- -D117_PRED, -D153_PRED,
- -D27_PRED, -D63_PRED,
- -V_PRED, 16,
- -H_PRED, -TM_PRED
-};
-
-const vp9_tree_index vp9_mbsplit_tree[6] = {
- -PARTITIONING_4X4, 2,
- -PARTITIONING_8X8, 4,
- -PARTITIONING_16X8, -PARTITIONING_8X16,
-};
-
-const vp9_tree_index vp9_mv_ref_tree[8] = {
- -ZEROMV, 2,
- -NEARESTMV, 4,
- -NEARMV, 6,
- -NEWMV, -SPLITMV
-};
-
-#if CONFIG_SUPERBLOCKS
-const vp9_tree_index vp9_sb_mv_ref_tree[6] = {
- -ZEROMV, 2,
- -NEARESTMV, 4,
- -NEARMV, -NEWMV
-};
-#endif
-
-const vp9_tree_index vp9_sub_mv_ref_tree[6] = {
- -LEFT4X4, 2,
- -ABOVE4X4, 4,
- -ZERO4X4, -NEW4X4
-};
-
-struct vp9_token_struct vp9_bmode_encodings [VP9_BINTRAMODES];
-struct vp9_token_struct vp9_ymode_encodings [VP9_YMODES];
-#if CONFIG_SUPERBLOCKS
-struct vp9_token_struct vp9_sb_kf_ymode_encodings [VP9_I32X32_MODES];
-#endif
-struct vp9_token_struct vp9_kf_ymode_encodings [VP9_YMODES];
-struct vp9_token_struct vp9_uv_mode_encodings [VP9_UV_MODES];
-struct vp9_token_struct vp9_i8x8_mode_encodings [VP9_I8X8_MODES];
-struct vp9_token_struct vp9_mbsplit_encodings [VP9_NUMMBSPLITS];
-
-struct vp9_token_struct vp9_mv_ref_encoding_array [VP9_MVREFS];
-#if CONFIG_SUPERBLOCKS
-struct vp9_token_struct vp9_sb_mv_ref_encoding_array [VP9_MVREFS];
-#endif
-struct vp9_token_struct vp9_sub_mv_ref_encoding_array [VP9_SUBMVREFS];
-
-void vp9_init_mbmode_probs(VP9_COMMON *x) {
- unsigned int bct [VP9_YMODES] [2]; /* num Ymodes > num UV modes */
-
- vp9_tree_probs_from_distribution(VP9_YMODES, vp9_ymode_encodings,
- vp9_ymode_tree, x->fc.ymode_prob,
- bct, y_mode_cts, 256, 1);
- {
- int i;
- for (i = 0; i < 8; i++) {
- vp9_tree_probs_from_distribution(VP9_YMODES, vp9_kf_ymode_encodings,
- vp9_kf_ymode_tree, x->kf_ymode_prob[i],
- bct, kf_y_mode_cts[i], 256, 1);
-#if CONFIG_SUPERBLOCKS
- vp9_tree_probs_from_distribution(VP9_I32X32_MODES,
- vp9_sb_kf_ymode_encodings,
- vp9_sb_ymode_tree,
- x->sb_kf_ymode_prob[i], bct,
- kf_y_mode_cts[i], 256, 1);
-#endif
- }
- }
- {
- int i;
- for (i = 0; i < VP9_YMODES; i++) {
- vp9_tree_probs_from_distribution(VP9_UV_MODES, vp9_uv_mode_encodings,
- vp9_uv_mode_tree, x->kf_uv_mode_prob[i],
- bct, kf_uv_mode_cts[i], 256, 1);
- vp9_tree_probs_from_distribution(VP9_UV_MODES, vp9_uv_mode_encodings,
- vp9_uv_mode_tree, x->fc.uv_mode_prob[i],
- bct, uv_mode_cts[i], 256, 1);
- }
- }
-
- vp9_tree_probs_from_distribution(VP9_I8X8_MODES, vp9_i8x8_mode_encodings,
- vp9_i8x8_mode_tree, x->fc.i8x8_mode_prob,
- bct, i8x8_mode_cts, 256, 1);
-
- vpx_memcpy(x->fc.sub_mv_ref_prob, vp9_sub_mv_ref_prob2,
- sizeof(vp9_sub_mv_ref_prob2));
- vpx_memcpy(x->fc.mbsplit_prob, vp9_mbsplit_probs, sizeof(vp9_mbsplit_probs));
- vpx_memcpy(x->fc.switchable_interp_prob, vp9_switchable_interp_prob,
- sizeof(vp9_switchable_interp_prob));
-}
-
-
-static void intra_bmode_probs_from_distribution(
- vp9_prob p [VP9_BINTRAMODES - 1],
- unsigned int branch_ct [VP9_BINTRAMODES - 1] [2],
- const unsigned int events [VP9_BINTRAMODES]) {
- vp9_tree_probs_from_distribution(VP9_BINTRAMODES, vp9_bmode_encodings,
- vp9_bmode_tree, p, branch_ct,
- events, 256, 1);
-}
-
-void vp9_default_bmode_probs(vp9_prob p [VP9_BINTRAMODES - 1]) {
- unsigned int branch_ct [VP9_BINTRAMODES - 1] [2];
- intra_bmode_probs_from_distribution(p, branch_ct, bmode_cts);
-}
-
-void vp9_kf_default_bmode_probs(vp9_prob p[VP9_BINTRAMODES][VP9_BINTRAMODES]
- [VP9_BINTRAMODES - 1]) {
- unsigned int branch_ct[VP9_BINTRAMODES - 1][2];
- int i, j;
-
- for (i = 0; i < VP9_BINTRAMODES; i++) {
- for (j = 0; j < VP9_BINTRAMODES; j++) {
- intra_bmode_probs_from_distribution(
- p[i][j], branch_ct, vp9_kf_default_bmode_counts[i][j]);
- }
- }
-}
-
-#if VP9_SWITCHABLE_FILTERS == 3
-const vp9_tree_index vp9_switchable_interp_tree[VP9_SWITCHABLE_FILTERS*2-2] = {
- -0, 2,
- -1, -2
-};
-struct vp9_token_struct vp9_switchable_interp_encodings[VP9_SWITCHABLE_FILTERS];
-const INTERPOLATIONFILTERTYPE vp9_switchable_interp[VP9_SWITCHABLE_FILTERS] = {
- EIGHTTAP, SIXTAP, EIGHTTAP_SHARP};
-const int vp9_switchable_interp_map[SWITCHABLE+1] = {1, -1, 0, 2, -1};
-const vp9_prob vp9_switchable_interp_prob [VP9_SWITCHABLE_FILTERS+1]
- [VP9_SWITCHABLE_FILTERS-1] = {
- {248, 192}, { 32, 248}, { 32, 32}, {192, 160}
-};
-#elif VP9_SWITCHABLE_FILTERS == 2
-const vp9_tree_index vp9_switchable_interp_tree[VP9_SWITCHABLE_FILTERS*2-2] = {
- -0, -1,
-};
-struct vp9_token_struct vp9_switchable_interp_encodings[VP9_SWITCHABLE_FILTERS];
-const vp9_prob vp9_switchable_interp_prob [VP9_SWITCHABLE_FILTERS+1]
- [VP9_SWITCHABLE_FILTERS-1] = {
- {248},
- { 64},
- {192},
-};
-const INTERPOLATIONFILTERTYPE vp9_switchable_interp[VP9_SWITCHABLE_FILTERS] = {
- EIGHTTAP, EIGHTTAP_SHARP};
-const int vp9_switchable_interp_map[SWITCHABLE+1] = {-1, -1, 0, 1, -1}; //8, 8s
-#endif
-
-void vp9_entropy_mode_init() {
- vp9_tokens_from_tree(vp9_bmode_encodings, vp9_bmode_tree);
- vp9_tokens_from_tree(vp9_ymode_encodings, vp9_ymode_tree);
- vp9_tokens_from_tree(vp9_kf_ymode_encodings, vp9_kf_ymode_tree);
-#if CONFIG_SUPERBLOCKS
- vp9_tokens_from_tree(vp9_sb_kf_ymode_encodings, vp9_sb_ymode_tree);
-#endif
- vp9_tokens_from_tree(vp9_uv_mode_encodings, vp9_uv_mode_tree);
- vp9_tokens_from_tree(vp9_i8x8_mode_encodings, vp9_i8x8_mode_tree);
- vp9_tokens_from_tree(vp9_mbsplit_encodings, vp9_mbsplit_tree);
- vp9_tokens_from_tree(vp9_switchable_interp_encodings,
- vp9_switchable_interp_tree);
-
- vp9_tokens_from_tree_offset(vp9_mv_ref_encoding_array,
- vp9_mv_ref_tree, NEARESTMV);
-#if CONFIG_SUPERBLOCKS
- vp9_tokens_from_tree_offset(vp9_sb_mv_ref_encoding_array,
- vp9_sb_mv_ref_tree, NEARESTMV);
-#endif
- vp9_tokens_from_tree_offset(vp9_sub_mv_ref_encoding_array,
- vp9_sub_mv_ref_tree, LEFT4X4);
-}
-
-void vp9_init_mode_contexts(VP9_COMMON *pc) {
- vpx_memset(pc->fc.mv_ref_ct, 0, sizeof(pc->fc.mv_ref_ct));
- vpx_memset(pc->fc.mv_ref_ct_a, 0, sizeof(pc->fc.mv_ref_ct_a));
-
- vpx_memcpy(pc->fc.mode_context,
- vp9_default_mode_contexts,
- sizeof(pc->fc.mode_context));
- vpx_memcpy(pc->fc.mode_context_a,
- vp9_default_mode_contexts_a,
- sizeof(pc->fc.mode_context_a));
-
-}
-
-void vp9_accum_mv_refs(VP9_COMMON *pc,
- MB_PREDICTION_MODE m,
- const int ct[4]) {
- int (*mv_ref_ct)[4][2];
-
- if (pc->refresh_alt_ref_frame)
- mv_ref_ct = pc->fc.mv_ref_ct_a;
- else
- mv_ref_ct = pc->fc.mv_ref_ct;
-
- if (m == ZEROMV) {
- ++mv_ref_ct [ct[0]] [0] [0];
- } else {
- ++mv_ref_ct [ct[0]] [0] [1];
- if (m == NEARESTMV) {
- ++mv_ref_ct [ct[1]] [1] [0];
- } else {
- ++mv_ref_ct [ct[1]] [1] [1];
- if (m == NEARMV) {
- ++mv_ref_ct [ct[2]] [2] [0];
- } else {
- ++mv_ref_ct [ct[2]] [2] [1];
- if (m == NEWMV) {
- ++mv_ref_ct [ct[3]] [3] [0];
- } else {
- ++mv_ref_ct [ct[3]] [3] [1];
- }
- }
- }
- }
-}
-
-#define MVREF_COUNT_SAT 20
-#define MVREF_MAX_UPDATE_FACTOR 144
-void vp9_update_mode_context(VP9_COMMON *pc) {
- int i, j;
- int (*mv_ref_ct)[4][2];
- int (*mode_context)[4];
-
- if (pc->refresh_alt_ref_frame) {
- mv_ref_ct = pc->fc.mv_ref_ct_a;
- mode_context = pc->fc.mode_context_a;
- } else {
- mv_ref_ct = pc->fc.mv_ref_ct;
- mode_context = pc->fc.mode_context;
- }
-
- for (j = 0; j < 6; j++) {
- for (i = 0; i < 4; i++) {
- int this_prob;
- int count = mv_ref_ct[j][i][0] + mv_ref_ct[j][i][1];
- int factor;
- {
- this_prob = count > 0 ? 256 * mv_ref_ct[j][i][0] / count : 128;
- count = count > MVREF_COUNT_SAT ? MVREF_COUNT_SAT : count;
- factor = (MVREF_MAX_UPDATE_FACTOR * count / MVREF_COUNT_SAT);
- this_prob = (pc->fc.vp8_mode_contexts[j][i] * (256 - factor) +
- this_prob * factor + 128) >> 8;
- this_prob = this_prob ? (this_prob < 255 ? this_prob : 255) : 1;
- mode_context[j][i] = this_prob;
- }
- }
- }
-}
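
Both vp9_update_mode_context() above and the mode adaptation below use the same count-saturated blend: the per-frame estimate is mixed with the previous value using a weight that grows with the number of observations and tops out at MVREF_MAX_UPDATE_FACTOR / 256. As an illustrative calculation with made-up counts: if the stored context value is 128 and a frame yields mv_ref_ct of {15, 5} for a branch, the per-frame estimate is 256 * 15 / 20 = 192; the count of 20 is already at MVREF_COUNT_SAT, so factor = 144 and the new value becomes (128 * (256 - 144) + 192 * 144 + 128) >> 8 = 164, clamped to the range [1, 255].
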
-
-#ifdef MODE_STATS
-#include "vp8/common/modecont.h"
-void print_mode_contexts(VP9_COMMON *pc) {
- int j, i;
- printf("\n====================\n");
- for (j = 0; j < 6; j++) {
- for (i = 0; i < 4; i++) {
- printf("%4d ", pc->fc.mode_context[j][i]);
- }
- printf("\n");
- }
- printf("====================\n");
- for (j = 0; j < 6; j++) {
- for (i = 0; i < 4; i++) {
- printf("%4d ", pc->fc.mode_context_a[j][i]);
- }
- printf("\n");
- }
-}
-#endif
-
-// #define MODE_COUNT_TESTING
-#define MODE_COUNT_SAT 20
-#define MODE_MAX_UPDATE_FACTOR 144
-void vp9_adapt_mode_probs(VP9_COMMON *cm) {
- int i, t, count, factor;
- unsigned int branch_ct[32][2];
- vp9_prob ymode_probs[VP9_YMODES - 1];
- vp9_prob uvmode_probs[VP9_UV_MODES - 1];
- vp9_prob bmode_probs[VP9_BINTRAMODES - 1];
- vp9_prob i8x8_mode_probs[VP9_I8X8_MODES - 1];
- vp9_prob sub_mv_ref_probs[VP9_SUBMVREFS - 1];
- vp9_prob mbsplit_probs[VP9_NUMMBSPLITS - 1];
-#ifdef MODE_COUNT_TESTING
- printf("static const unsigned int\nymode_counts"
- "[VP9_YMODES] = {\n");
- for (t = 0; t < VP9_YMODES; ++t) printf("%d, ", cm->fc.ymode_counts[t]);
- printf("};\n");
- printf("static const unsigned int\nuv_mode_counts"
- "[VP9_YMODES] [VP9_UV_MODES] = {\n");
- for (i = 0; i < VP9_YMODES; ++i) {
- printf(" {");
- for (t = 0; t < VP9_UV_MODES; ++t) printf("%d, ", cm->fc.uv_mode_counts[i][t]);
- printf("},\n");
- }
- printf("};\n");
- printf("static const unsigned int\nbmode_counts"
- "[VP9_BINTRAMODES] = {\n");
- for (t = 0; t < VP9_BINTRAMODES; ++t) printf("%d, ", cm->fc.bmode_counts[t]);
- printf("};\n");
- printf("static const unsigned int\ni8x8_mode_counts"
- "[VP9_I8X8_MODES] = {\n");
- for (t = 0; t < VP9_I8X8_MODES; ++t) printf("%d, ", cm->fc.i8x8_mode_counts[t]);
- printf("};\n");
- printf("static const unsigned int\nsub_mv_ref_counts"
- "[SUBMVREF_COUNT] [VP9_SUBMVREFS] = {\n");
- for (i = 0; i < SUBMVREF_COUNT; ++i) {
- printf(" {");
- for (t = 0; t < VP9_SUBMVREFS; ++t) printf("%d, ", cm->fc.sub_mv_ref_counts[i][t]);
- printf("},\n");
- }
- printf("};\n");
- printf("static const unsigned int\nmbsplit_counts"
- "[VP9_NUMMBSPLITS] = {\n");
- for (t = 0; t < VP9_NUMMBSPLITS; ++t) printf("%d, ", cm->fc.mbsplit_counts[t]);
- printf("};\n");
-#endif
- vp9_tree_probs_from_distribution(
- VP9_YMODES, vp9_ymode_encodings, vp9_ymode_tree,
- ymode_probs, branch_ct, cm->fc.ymode_counts,
- 256, 1);
- for (t = 0; t < VP9_YMODES - 1; ++t) {
- int prob;
- count = branch_ct[t][0] + branch_ct[t][1];
- count = count > MODE_COUNT_SAT ? MODE_COUNT_SAT : count;
- factor = (MODE_MAX_UPDATE_FACTOR * count / MODE_COUNT_SAT);
- prob = ((int)cm->fc.pre_ymode_prob[t] * (256 - factor) +
- (int)ymode_probs[t] * factor + 128) >> 8;
- if (prob <= 0) cm->fc.ymode_prob[t] = 1;
- else if (prob > 255) cm->fc.ymode_prob[t] = 255;
- else cm->fc.ymode_prob[t] = prob;
- }
- for (i = 0; i < VP9_YMODES; ++i) {
- vp9_tree_probs_from_distribution(VP9_UV_MODES, vp9_uv_mode_encodings,
- vp9_uv_mode_tree, uvmode_probs, branch_ct,
- cm->fc.uv_mode_counts[i], 256, 1);
- for (t = 0; t < VP9_UV_MODES - 1; ++t) {
- int prob;
- count = branch_ct[t][0] + branch_ct[t][1];
- count = count > MODE_COUNT_SAT ? MODE_COUNT_SAT : count;
- factor = (MODE_MAX_UPDATE_FACTOR * count / MODE_COUNT_SAT);
- prob = ((int)cm->fc.pre_uv_mode_prob[i][t] * (256 - factor) +
- (int)uvmode_probs[t] * factor + 128) >> 8;
- if (prob <= 0) cm->fc.uv_mode_prob[i][t] = 1;
- else if (prob > 255) cm->fc.uv_mode_prob[i][t] = 255;
- else cm->fc.uv_mode_prob[i][t] = prob;
- }
- }
- vp9_tree_probs_from_distribution(VP9_BINTRAMODES, vp9_bmode_encodings,
- vp9_bmode_tree, bmode_probs, branch_ct,
- cm->fc.bmode_counts, 256, 1);
- for (t = 0; t < VP9_BINTRAMODES - 1; ++t) {
- int prob;
- count = branch_ct[t][0] + branch_ct[t][1];
- count = count > MODE_COUNT_SAT ? MODE_COUNT_SAT : count;
- factor = (MODE_MAX_UPDATE_FACTOR * count / MODE_COUNT_SAT);
- prob = ((int)cm->fc.pre_bmode_prob[t] * (256 - factor) +
- (int)bmode_probs[t] * factor + 128) >> 8;
- if (prob <= 0) cm->fc.bmode_prob[t] = 1;
- else if (prob > 255) cm->fc.bmode_prob[t] = 255;
- else cm->fc.bmode_prob[t] = prob;
- }
- vp9_tree_probs_from_distribution(VP9_I8X8_MODES, vp9_i8x8_mode_encodings,
- vp9_i8x8_mode_tree, i8x8_mode_probs,
- branch_ct, cm->fc.i8x8_mode_counts, 256, 1);
- for (t = 0; t < VP9_I8X8_MODES - 1; ++t) {
- int prob;
- count = branch_ct[t][0] + branch_ct[t][1];
- count = count > MODE_COUNT_SAT ? MODE_COUNT_SAT : count;
- factor = (MODE_MAX_UPDATE_FACTOR * count / MODE_COUNT_SAT);
- prob = ((int)cm->fc.pre_i8x8_mode_prob[t] * (256 - factor) +
- (int)i8x8_mode_probs[t] * factor + 128) >> 8;
- if (prob <= 0) cm->fc.i8x8_mode_prob[t] = 1;
- else if (prob > 255) cm->fc.i8x8_mode_prob[t] = 255;
- else cm->fc.i8x8_mode_prob[t] = prob;
- }
- for (i = 0; i < SUBMVREF_COUNT; ++i) {
- vp9_tree_probs_from_distribution(VP9_SUBMVREFS,
- vp9_sub_mv_ref_encoding_array,
- vp9_sub_mv_ref_tree, sub_mv_ref_probs,
- branch_ct, cm->fc.sub_mv_ref_counts[i],
- 256, 1);
- for (t = 0; t < VP9_SUBMVREFS - 1; ++t) {
- int prob;
- count = branch_ct[t][0] + branch_ct[t][1];
- count = count > MODE_COUNT_SAT ? MODE_COUNT_SAT : count;
- factor = (MODE_MAX_UPDATE_FACTOR * count / MODE_COUNT_SAT);
- prob = ((int)cm->fc.pre_sub_mv_ref_prob[i][t] * (256 - factor) +
- (int)sub_mv_ref_probs[t] * factor + 128) >> 8;
- if (prob <= 0) cm->fc.sub_mv_ref_prob[i][t] = 1;
- else if (prob > 255) cm->fc.sub_mv_ref_prob[i][t] = 255;
- else cm->fc.sub_mv_ref_prob[i][t] = prob;
- }
- }
- vp9_tree_probs_from_distribution(VP9_NUMMBSPLITS, vp9_mbsplit_encodings,
- vp9_mbsplit_tree, mbsplit_probs, branch_ct,
- cm->fc.mbsplit_counts, 256, 1);
- for (t = 0; t < VP9_NUMMBSPLITS - 1; ++t) {
- int prob;
- count = branch_ct[t][0] + branch_ct[t][1];
- count = count > MODE_COUNT_SAT ? MODE_COUNT_SAT : count;
- factor = (MODE_MAX_UPDATE_FACTOR * count / MODE_COUNT_SAT);
- prob = ((int)cm->fc.pre_mbsplit_prob[t] * (256 - factor) +
- (int)mbsplit_probs[t] * factor + 128) >> 8;
- if (prob <= 0) cm->fc.mbsplit_prob[t] = 1;
- else if (prob > 255) cm->fc.mbsplit_prob[t] = 255;
- else cm->fc.mbsplit_prob[t] = prob;
- }
-}
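
Every per-tree block in vp9_adapt_mode_probs() repeats the same saturating blend between the previous frame's probability and the probability measured from the current frame's counts. A hypothetical helper (not part of the source) that captures that shared step:

    /* Hypothetical helper mirroring the blend repeated in each loop above:
     * weight the measured probability by how many branch observations were
     * seen, saturating at MODE_COUNT_SAT, and clamp the result to [1, 255]. */
    static vp9_prob blend_mode_prob(vp9_prob pre, vp9_prob measured,
                                    const unsigned int ct[2]) {
      int count = ct[0] + ct[1];
      int factor, prob;
      count = count > MODE_COUNT_SAT ? MODE_COUNT_SAT : count;
      factor = MODE_MAX_UPDATE_FACTOR * count / MODE_COUNT_SAT;
      prob = ((int)pre * (256 - factor) + (int)measured * factor + 128) >> 8;
      return prob <= 0 ? 1 : prob > 255 ? 255 : prob;
    }
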
--- a/vp8/common/entropymode.h
+++ /dev/null
@@ -1,102 +1,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#ifndef __INC_ENTROPYMODE_H
-#define __INC_ENTROPYMODE_H
-
-#include "blockd.h"
-#include "treecoder.h"
-
-#define SUBMVREF_COUNT 5
-#define VP9_NUMMBSPLITS 4
-
-typedef const int vp9_mbsplit[16];
-
-extern vp9_mbsplit vp9_mbsplits[VP9_NUMMBSPLITS];
-
-extern const int vp9_mbsplit_count[VP9_NUMMBSPLITS]; /* # of subsets */
-
-extern const vp9_prob vp9_mbsplit_probs[VP9_NUMMBSPLITS - 1];
-
-extern int vp9_mv_cont(const int_mv *l, const int_mv *a);
-
-extern const vp9_prob vp9_sub_mv_ref_prob[VP9_SUBMVREFS - 1];
-
-extern const vp9_prob vp9_sub_mv_ref_prob2[SUBMVREF_COUNT][VP9_SUBMVREFS - 1];
-
-extern const unsigned int vp9_kf_default_bmode_counts[VP9_BINTRAMODES]
- [VP9_BINTRAMODES]
- [VP9_BINTRAMODES];
-
-extern const vp9_tree_index vp9_bmode_tree[];
-
-extern const vp9_tree_index vp9_ymode_tree[];
-extern const vp9_tree_index vp9_kf_ymode_tree[];
-extern const vp9_tree_index vp9_uv_mode_tree[];
-#define vp9_sb_ymode_tree vp9_uv_mode_tree
-extern const vp9_tree_index vp9_i8x8_mode_tree[];
-extern const vp9_tree_index vp9_mbsplit_tree[];
-extern const vp9_tree_index vp9_mv_ref_tree[];
-extern const vp9_tree_index vp9_sb_mv_ref_tree[];
-extern const vp9_tree_index vp9_sub_mv_ref_tree[];
-
-extern struct vp9_token_struct vp9_bmode_encodings[VP9_BINTRAMODES];
-extern struct vp9_token_struct vp9_ymode_encodings[VP9_YMODES];
-extern struct vp9_token_struct vp9_sb_kf_ymode_encodings[VP9_I32X32_MODES];
-extern struct vp9_token_struct vp9_kf_ymode_encodings[VP9_YMODES];
-extern struct vp9_token_struct vp9_i8x8_mode_encodings[VP9_I8X8_MODES];
-extern struct vp9_token_struct vp9_uv_mode_encodings[VP9_UV_MODES];
-extern struct vp9_token_struct vp9_mbsplit_encodings[VP9_NUMMBSPLITS];
-
-/* Inter mode values do not start at zero */
-
-extern struct vp9_token_struct vp9_mv_ref_encoding_array[VP9_MVREFS];
-extern struct vp9_token_struct vp9_sb_mv_ref_encoding_array[VP9_MVREFS];
-extern struct vp9_token_struct vp9_sub_mv_ref_encoding_array[VP9_SUBMVREFS];
-
-void vp9_entropy_mode_init(void);
-
-struct VP9Common;
-
-void vp9_init_mbmode_probs(struct VP9Common *x);
-
-extern void vp9_init_mode_contexts(struct VP9Common *pc);
-
-extern void vp9_update_mode_context(struct VP9Common *pc);
-
-extern void vp9_accum_mv_refs(struct VP9Common *pc,
- MB_PREDICTION_MODE m,
- const int ct[4]);
-
-void vp9_default_bmode_probs(vp9_prob dest[VP9_BINTRAMODES - 1]);
-
-void vp9_kf_default_bmode_probs(vp9_prob dest[VP9_BINTRAMODES][VP9_BINTRAMODES]
- [VP9_BINTRAMODES - 1]);
-
-void vp9_adapt_mode_probs(struct VP9Common *);
-
-#define VP9_SWITCHABLE_FILTERS 2 /* number of switchable filters */
-
-extern const INTERPOLATIONFILTERTYPE vp9_switchable_interp
- [VP9_SWITCHABLE_FILTERS];
-
-extern const int vp9_switchable_interp_map[SWITCHABLE + 1];
-
-extern const vp9_tree_index vp9_switchable_interp_tree
- [2 * (VP9_SWITCHABLE_FILTERS - 1)];
-
-extern struct vp9_token_struct vp9_switchable_interp_encodings
- [VP9_SWITCHABLE_FILTERS];
-
-extern const vp9_prob vp9_switchable_interp_prob[VP9_SWITCHABLE_FILTERS + 1]
- [VP9_SWITCHABLE_FILTERS - 1];
-
-#endif
--- a/vp8/common/entropymv.c
+++ /dev/null
@@ -1,465 +1,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include "onyxc_int.h"
-#include "entropymv.h"
-
-//#define MV_COUNT_TESTING
-
-#define MV_COUNT_SAT 16
-#define MV_MAX_UPDATE_FACTOR 160
-
-#if CONFIG_NEW_MVREF
-/* Integer pel reference mv threshold for use of high-precision 1/8 mv */
-#define COMPANDED_MVREF_THRESH 1000000
-#else
-/* Integer pel reference mv threshold for use of high-precision 1/8 mv */
-#define COMPANDED_MVREF_THRESH 8
-#endif
-
-/* Smooth or bias the mv-counts before prob computation */
-/* #define SMOOTH_MV_COUNTS */
-
-const vp9_tree_index vp9_mv_joint_tree[2 * MV_JOINTS - 2] = {
- -MV_JOINT_ZERO, 2,
- -MV_JOINT_HNZVZ, 4,
- -MV_JOINT_HZVNZ, -MV_JOINT_HNZVNZ
-};
-struct vp9_token_struct vp9_mv_joint_encodings[MV_JOINTS];
-
-const vp9_tree_index vp9_mv_class_tree[2 * MV_CLASSES - 2] = {
- -MV_CLASS_0, 2,
- -MV_CLASS_1, 4,
- 6, 8,
- -MV_CLASS_2, -MV_CLASS_3,
- 10, 12,
- -MV_CLASS_4, -MV_CLASS_5,
- -MV_CLASS_6, -MV_CLASS_7,
-};
-struct vp9_token_struct vp9_mv_class_encodings[MV_CLASSES];
-
-const vp9_tree_index vp9_mv_class0_tree [2 * CLASS0_SIZE - 2] = {
- -0, -1,
-};
-struct vp9_token_struct vp9_mv_class0_encodings[CLASS0_SIZE];
-
-const vp9_tree_index vp9_mv_fp_tree [2 * 4 - 2] = {
- -0, 2,
- -1, 4,
- -2, -3
-};
-struct vp9_token_struct vp9_mv_fp_encodings[4];
-
-const nmv_context vp9_default_nmv_context = {
- {32, 64, 96},
- {
- { /* vert component */
- 128, /* sign */
- {224, 144, 192, 168, 192, 176, 192}, /* class */
- {216}, /* class0 */
- {136, 140, 148, 160, 176, 192, 224}, /* bits */
- {{128, 128, 64}, {96, 112, 64}}, /* class0_fp */
- {64, 96, 64}, /* fp */
- 160, /* class0_hp bit */
- 128, /* hp */
- },
- { /* hor component */
- 128, /* sign */
- {216, 128, 176, 160, 176, 176, 192}, /* class */
- {208}, /* class0 */
- {136, 140, 148, 160, 176, 192, 224}, /* bits */
- {{128, 128, 64}, {96, 112, 64}}, /* class0_fp */
- {64, 96, 64}, /* fp */
- 160, /* class0_hp bit */
- 128, /* hp */
- }
- },
-};
-
-MV_JOINT_TYPE vp9_get_mv_joint(MV mv) {
- if (mv.row == 0 && mv.col == 0) return MV_JOINT_ZERO;
- else if (mv.row == 0 && mv.col != 0) return MV_JOINT_HNZVZ;
- else if (mv.row != 0 && mv.col == 0) return MV_JOINT_HZVNZ;
- else return MV_JOINT_HNZVNZ;
-}
-
-#define mv_class_base(c) ((c) ? (CLASS0_SIZE << (c + 2)) : 0)
-
-MV_CLASS_TYPE vp9_get_mv_class(int z, int *offset) {
- MV_CLASS_TYPE c;
- if (z < CLASS0_SIZE * 8) c = MV_CLASS_0;
- else if (z < CLASS0_SIZE * 16) c = MV_CLASS_1;
- else if (z < CLASS0_SIZE * 32) c = MV_CLASS_2;
- else if (z < CLASS0_SIZE * 64) c = MV_CLASS_3;
- else if (z < CLASS0_SIZE * 128) c = MV_CLASS_4;
- else if (z < CLASS0_SIZE * 256) c = MV_CLASS_5;
- else if (z < CLASS0_SIZE * 512) c = MV_CLASS_6;
- else if (z < CLASS0_SIZE * 1024) c = MV_CLASS_7;
- else assert(0);
- if (offset)
- *offset = z - mv_class_base(c);
- return c;
-}
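
With CLASS0_BITS equal to 1, CLASS0_SIZE is 2 and the class boundaries fall at 16, 32, 64, 128, 256, 512, 1024 and 2048, so each class beyond the first covers an octave of magnitudes. For example, z = 36 satisfies 32 <= 36 < 64 and therefore maps to MV_CLASS_2 with *offset = 36 - mv_class_base(2) = 4; vp9_get_mv_mag(MV_CLASS_2, 4) recovers the original 36.
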
-
-int vp9_use_nmv_hp(const MV *ref) {
- if ((abs(ref->row) >> 3) < COMPANDED_MVREF_THRESH &&
- (abs(ref->col) >> 3) < COMPANDED_MVREF_THRESH)
- return 1;
- else
- return 0;
-}
-
-int vp9_get_mv_mag(MV_CLASS_TYPE c, int offset) {
- return mv_class_base(c) + offset;
-}
-
-static void increment_nmv_component_count(int v,
- nmv_component_counts *mvcomp,
- int incr,
- int usehp) {
- assert (v != 0); /* should not be zero */
- mvcomp->mvcount[MV_MAX + v] += incr;
-}
-
-static void increment_nmv_component(int v,
- nmv_component_counts *mvcomp,
- int incr,
- int usehp) {
- int s, z, c, o, d, e, f;
- assert (v != 0); /* should not be zero */
- s = v < 0;
- mvcomp->sign[s] += incr;
- z = (s ? -v : v) - 1; /* magnitude - 1 */
-
- c = vp9_get_mv_class(z, &o);
- mvcomp->classes[c] += incr;
-
- d = (o >> 3); /* int mv data */
- f = (o >> 1) & 3; /* fractional pel mv data */
- e = (o & 1); /* high precision mv data */
- if (c == MV_CLASS_0) {
- mvcomp->class0[d] += incr;
- } else {
- int i, b;
- b = c + CLASS0_BITS - 1; /* number of bits */
- for (i = 0; i < b; ++i)
- mvcomp->bits[i][((d >> i) & 1)] += incr;
- }
-
- /* Code the fractional pel bits */
- if (c == MV_CLASS_0) {
- mvcomp->class0_fp[d][f] += incr;
- } else {
- mvcomp->fp[f] += incr;
- }
-
- /* Code the high precision bit */
- if (usehp) {
- if (c == MV_CLASS_0) {
- mvcomp->class0_hp[e] += incr;
- } else {
- mvcomp->hp[e] += incr;
- }
- }
-}
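
As a worked example of the decomposition above: a component v = -37 gives sign s = 1 and magnitude-minus-one z = 36, which is MV_CLASS_2 with offset o = 4; from the offset, the integer part d = o >> 3 = 0 is counted over c + CLASS0_BITS - 1 = 2 offset bits, the fractional-pel part f = (o >> 1) & 3 = 2, and the high-precision bit e = o & 1 = 0 (accumulated only when usehp is set).
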
-
-#ifdef SMOOTH_MV_COUNTS
-static void smooth_counts(nmv_component_counts *mvcomp) {
- static const int flen = 3; // (filter_length + 1) / 2
- static const int fval[] = {8, 3, 1};
- static const int fvalbits = 4;
- int i;
- unsigned int smvcount[MV_VALS];
- vpx_memcpy(smvcount, mvcomp->mvcount, sizeof(smvcount));
- smvcount[MV_MAX] = (smvcount[MV_MAX - 1] + smvcount[MV_MAX + 1]) >> 1;
- for (i = flen - 1; i <= MV_VALS - flen; ++i) {
- int j, s = smvcount[i] * fval[0];
- for (j = 1; j < flen; ++j)
- s += (smvcount[i - j] + smvcount[i + j]) * fval[j];
- mvcomp->mvcount[i] = (s + (1 << (fvalbits - 1))) >> fvalbits;
- }
-}
-#endif
-
-static void counts_to_context(nmv_component_counts *mvcomp, int usehp) {
- int v;
- vpx_memset(mvcomp->sign, 0, sizeof(nmv_component_counts) - sizeof(mvcomp->mvcount));
- for (v = 1; v <= MV_MAX; v++) {
- increment_nmv_component(-v, mvcomp, mvcomp->mvcount[MV_MAX - v], usehp);
- increment_nmv_component( v, mvcomp, mvcomp->mvcount[MV_MAX + v], usehp);
- }
-}
-
-void vp9_increment_nmv(const MV *mv, const MV *ref, nmv_context_counts *mvctx,
- int usehp) {
- MV_JOINT_TYPE j = vp9_get_mv_joint(*mv);
- mvctx->joints[j]++;
- usehp = usehp && vp9_use_nmv_hp(ref);
- if (j == MV_JOINT_HZVNZ || j == MV_JOINT_HNZVNZ) {
- increment_nmv_component_count(mv->row, &mvctx->comps[0], 1, usehp);
- }
- if (j == MV_JOINT_HNZVZ || j == MV_JOINT_HNZVNZ) {
- increment_nmv_component_count(mv->col, &mvctx->comps[1], 1, usehp);
- }
-}
-
-static void adapt_prob(vp9_prob *dest, vp9_prob prep, vp9_prob newp,
- unsigned int ct[2]) {
- int factor;
- int prob;
- int count = ct[0] + ct[1];
- if (count) {
- count = count > MV_COUNT_SAT ? MV_COUNT_SAT : count;
- factor = (MV_MAX_UPDATE_FACTOR * count / MV_COUNT_SAT);
- prob = ((int)prep * (256 - factor) + (int)(newp) * factor + 128) >> 8;
- prob += !prob;
- prob = (prob > 255 ? 255 : prob);
- *dest = prob;
- }
-}
-
-void vp9_counts_to_nmv_context(
- nmv_context_counts *NMVcount,
- nmv_context *prob,
- int usehp,
- unsigned int (*branch_ct_joint)[2],
- unsigned int (*branch_ct_sign)[2],
- unsigned int (*branch_ct_classes)[MV_CLASSES - 1][2],
- unsigned int (*branch_ct_class0)[CLASS0_SIZE - 1][2],
- unsigned int (*branch_ct_bits)[MV_OFFSET_BITS][2],
- unsigned int (*branch_ct_class0_fp)[CLASS0_SIZE][4 - 1][2],
- unsigned int (*branch_ct_fp)[4 - 1][2],
- unsigned int (*branch_ct_class0_hp)[2],
- unsigned int (*branch_ct_hp)[2]) {
- int i, j, k;
- counts_to_context(&NMVcount->comps[0], usehp);
- counts_to_context(&NMVcount->comps[1], usehp);
- vp9_tree_probs_from_distribution(MV_JOINTS,
- vp9_mv_joint_encodings,
- vp9_mv_joint_tree,
- prob->joints,
- branch_ct_joint,
- NMVcount->joints,
- 256, 1);
- for (i = 0; i < 2; ++i) {
- prob->comps[i].sign =
- vp9_bin_prob_from_distribution(NMVcount->comps[i].sign);
- branch_ct_sign[i][0] = NMVcount->comps[i].sign[0];
- branch_ct_sign[i][1] = NMVcount->comps[i].sign[1];
- vp9_tree_probs_from_distribution(MV_CLASSES,
- vp9_mv_class_encodings,
- vp9_mv_class_tree,
- prob->comps[i].classes,
- branch_ct_classes[i],
- NMVcount->comps[i].classes,
- 256, 1);
- vp9_tree_probs_from_distribution(CLASS0_SIZE,
- vp9_mv_class0_encodings,
- vp9_mv_class0_tree,
- prob->comps[i].class0,
- branch_ct_class0[i],
- NMVcount->comps[i].class0,
- 256, 1);
- for (j = 0; j < MV_OFFSET_BITS; ++j) {
- prob->comps[i].bits[j] = vp9_bin_prob_from_distribution(
- NMVcount->comps[i].bits[j]);
- branch_ct_bits[i][j][0] = NMVcount->comps[i].bits[j][0];
- branch_ct_bits[i][j][1] = NMVcount->comps[i].bits[j][1];
- }
- }
- for (i = 0; i < 2; ++i) {
- for (k = 0; k < CLASS0_SIZE; ++k) {
- vp9_tree_probs_from_distribution(4,
- vp9_mv_fp_encodings,
- vp9_mv_fp_tree,
- prob->comps[i].class0_fp[k],
- branch_ct_class0_fp[i][k],
- NMVcount->comps[i].class0_fp[k],
- 256, 1);
- }
- vp9_tree_probs_from_distribution(4,
- vp9_mv_fp_encodings,
- vp9_mv_fp_tree,
- prob->comps[i].fp,
- branch_ct_fp[i],
- NMVcount->comps[i].fp,
- 256, 1);
- }
- if (usehp) {
- for (i = 0; i < 2; ++i) {
- prob->comps[i].class0_hp = vp9_bin_prob_from_distribution(
- NMVcount->comps[i].class0_hp);
- branch_ct_class0_hp[i][0] = NMVcount->comps[i].class0_hp[0];
- branch_ct_class0_hp[i][1] = NMVcount->comps[i].class0_hp[1];
-
- prob->comps[i].hp =
- vp9_bin_prob_from_distribution(NMVcount->comps[i].hp);
- branch_ct_hp[i][0] = NMVcount->comps[i].hp[0];
- branch_ct_hp[i][1] = NMVcount->comps[i].hp[1];
- }
- }
-}
-
-void vp9_adapt_nmv_probs(VP9_COMMON *cm, int usehp) {
- int i, j, k;
- nmv_context prob;
- unsigned int branch_ct_joint[MV_JOINTS - 1][2];
- unsigned int branch_ct_sign[2][2];
- unsigned int branch_ct_classes[2][MV_CLASSES - 1][2];
- unsigned int branch_ct_class0[2][CLASS0_SIZE - 1][2];
- unsigned int branch_ct_bits[2][MV_OFFSET_BITS][2];
- unsigned int branch_ct_class0_fp[2][CLASS0_SIZE][4 - 1][2];
- unsigned int branch_ct_fp[2][4 - 1][2];
- unsigned int branch_ct_class0_hp[2][2];
- unsigned int branch_ct_hp[2][2];
-#ifdef MV_COUNT_TESTING
- printf("joints count: ");
- for (j = 0; j < MV_JOINTS; ++j) printf("%d ", cm->fc.NMVcount.joints[j]);
- printf("\n"); fflush(stdout);
- printf("signs count:\n");
- for (i = 0; i < 2; ++i)
- printf("%d/%d ", cm->fc.NMVcount.comps[i].sign[0], cm->fc.NMVcount.comps[i].sign[1]);
- printf("\n"); fflush(stdout);
- printf("classes count:\n");
- for (i = 0; i < 2; ++i) {
- for (j = 0; j < MV_CLASSES; ++j)
- printf("%d ", cm->fc.NMVcount.comps[i].classes[j]);
- printf("\n"); fflush(stdout);
- }
- printf("class0 count:\n");
- for (i = 0; i < 2; ++i) {
- for (j = 0; j < CLASS0_SIZE; ++j)
- printf("%d ", cm->fc.NMVcount.comps[i].class0[j]);
- printf("\n"); fflush(stdout);
- }
- printf("bits count:\n");
- for (i = 0; i < 2; ++i) {
- for (j = 0; j < MV_OFFSET_BITS; ++j)
- printf("%d/%d ", cm->fc.NMVcount.comps[i].bits[j][0],
- cm->fc.NMVcount.comps[i].bits[j][1]);
- printf("\n"); fflush(stdout);
- }
- printf("class0_fp count:\n");
- for (i = 0; i < 2; ++i) {
- for (j = 0; j < CLASS0_SIZE; ++j) {
- printf("{");
- for (k = 0; k < 4; ++k)
- printf("%d ", cm->fc.NMVcount.comps[i].class0_fp[j][k]);
- printf("}, ");
- }
- printf("\n"); fflush(stdout);
- }
- printf("fp count:\n");
- for (i = 0; i < 2; ++i) {
- for (j = 0; j < 4; ++j)
- printf("%d ", cm->fc.NMVcount.comps[i].fp[j]);
- printf("\n"); fflush(stdout);
- }
- if (usehp) {
- printf("class0_hp count:\n");
- for (i = 0; i < 2; ++i)
- printf("%d/%d ", cm->fc.NMVcount.comps[i].class0_hp[0],
- cm->fc.NMVcount.comps[i].class0_hp[1]);
- printf("\n"); fflush(stdout);
- printf("hp count:\n");
- for (i = 0; i < 2; ++i)
- printf("%d/%d ", cm->fc.NMVcount.comps[i].hp[0],
- cm->fc.NMVcount.comps[i].hp[1]);
- printf("\n"); fflush(stdout);
- }
-#endif
-#ifdef SMOOTH_MV_COUNTS
- smooth_counts(&cm->fc.NMVcount.comps[0]);
- smooth_counts(&cm->fc.NMVcount.comps[1]);
-#endif
- vp9_counts_to_nmv_context(&cm->fc.NMVcount,
- &prob,
- usehp,
- branch_ct_joint,
- branch_ct_sign,
- branch_ct_classes,
- branch_ct_class0,
- branch_ct_bits,
- branch_ct_class0_fp,
- branch_ct_fp,
- branch_ct_class0_hp,
- branch_ct_hp);
-
- for (j = 0; j < MV_JOINTS - 1; ++j) {
- adapt_prob(&cm->fc.nmvc.joints[j],
- cm->fc.pre_nmvc.joints[j],
- prob.joints[j],
- branch_ct_joint[j]);
- }
- for (i = 0; i < 2; ++i) {
- adapt_prob(&cm->fc.nmvc.comps[i].sign,
- cm->fc.pre_nmvc.comps[i].sign,
- prob.comps[i].sign,
- branch_ct_sign[i]);
- for (j = 0; j < MV_CLASSES - 1; ++j) {
- adapt_prob(&cm->fc.nmvc.comps[i].classes[j],
- cm->fc.pre_nmvc.comps[i].classes[j],
- prob.comps[i].classes[j],
- branch_ct_classes[i][j]);
- }
- for (j = 0; j < CLASS0_SIZE - 1; ++j) {
- adapt_prob(&cm->fc.nmvc.comps[i].class0[j],
- cm->fc.pre_nmvc.comps[i].class0[j],
- prob.comps[i].class0[j],
- branch_ct_class0[i][j]);
- }
- for (j = 0; j < MV_OFFSET_BITS; ++j) {
- adapt_prob(&cm->fc.nmvc.comps[i].bits[j],
- cm->fc.pre_nmvc.comps[i].bits[j],
- prob.comps[i].bits[j],
- branch_ct_bits[i][j]);
- }
- }
- for (i = 0; i < 2; ++i) {
- for (j = 0; j < CLASS0_SIZE; ++j) {
- for (k = 0; k < 3; ++k) {
- adapt_prob(&cm->fc.nmvc.comps[i].class0_fp[j][k],
- cm->fc.pre_nmvc.comps[i].class0_fp[j][k],
- prob.comps[i].class0_fp[j][k],
- branch_ct_class0_fp[i][j][k]);
- }
- }
- for (j = 0; j < 3; ++j) {
- adapt_prob(&cm->fc.nmvc.comps[i].fp[j],
- cm->fc.pre_nmvc.comps[i].fp[j],
- prob.comps[i].fp[j],
- branch_ct_fp[i][j]);
- }
- }
- if (usehp) {
- for (i = 0; i < 2; ++i) {
- adapt_prob(&cm->fc.nmvc.comps[i].class0_hp,
- cm->fc.pre_nmvc.comps[i].class0_hp,
- prob.comps[i].class0_hp,
- branch_ct_class0_hp[i]);
- adapt_prob(&cm->fc.nmvc.comps[i].hp,
- cm->fc.pre_nmvc.comps[i].hp,
- prob.comps[i].hp,
- branch_ct_hp[i]);
- }
- }
-}
-
-void vp9_entropy_mv_init() {
- vp9_tokens_from_tree(vp9_mv_joint_encodings, vp9_mv_joint_tree);
- vp9_tokens_from_tree(vp9_mv_class_encodings, vp9_mv_class_tree);
- vp9_tokens_from_tree(vp9_mv_class0_encodings, vp9_mv_class0_tree);
- vp9_tokens_from_tree(vp9_mv_fp_encodings, vp9_mv_fp_tree);
-}
-
-void vp9_init_mv_probs(VP9_COMMON *cm) {
- vpx_memcpy(&cm->fc.nmvc, &vp9_default_nmv_context, sizeof(nmv_context));
-}
--- a/vp8/common/entropymv.h
+++ /dev/null
@@ -1,129 +1,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#ifndef __INC_ENTROPYMV_H
-#define __INC_ENTROPYMV_H
-
-#include "treecoder.h"
-#include "vpx_config.h"
-#include "blockd.h"
-
-struct VP9Common;
-
-void vp9_entropy_mv_init();
-void vp9_init_mv_probs(struct VP9Common *cm);
-
-void vp9_adapt_nmv_probs(struct VP9Common *cm, int usehp);
-int vp9_use_nmv_hp(const MV *ref);
-
-#define VP9_NMV_UPDATE_PROB 255
-//#define MV_GROUP_UPDATE
-
-#define LOW_PRECISION_MV_UPDATE /* Use 7 bit forward update */
-
-/* Symbols for coding which components are zero jointly */
-#define MV_JOINTS 4
-typedef enum {
- MV_JOINT_ZERO = 0, /* Zero vector */
- MV_JOINT_HNZVZ = 1, /* Vert zero, hor nonzero */
- MV_JOINT_HZVNZ = 2, /* Hor zero, vert nonzero */
- MV_JOINT_HNZVNZ = 3, /* Both components nonzero */
-} MV_JOINT_TYPE;
-
-extern const vp9_tree_index vp9_mv_joint_tree[2 * MV_JOINTS - 2];
-extern struct vp9_token_struct vp9_mv_joint_encodings [MV_JOINTS];
-
-/* Symbols for coding magnitude class of nonzero components */
-#define MV_CLASSES 8
-typedef enum {
- MV_CLASS_0 = 0, /* (0, 2] integer pel */
- MV_CLASS_1 = 1, /* (2, 4] integer pel */
- MV_CLASS_2 = 2, /* (4, 8] integer pel */
- MV_CLASS_3 = 3, /* (8, 16] integer pel */
- MV_CLASS_4 = 4, /* (16, 32] integer pel */
- MV_CLASS_5 = 5, /* (32, 64] integer pel */
- MV_CLASS_6 = 6, /* (64, 128] integer pel */
- MV_CLASS_7 = 7, /* (128, 256] integer pel */
-} MV_CLASS_TYPE;
-
-extern const vp9_tree_index vp9_mv_class_tree[2 * MV_CLASSES - 2];
-extern struct vp9_token_struct vp9_mv_class_encodings [MV_CLASSES];
-
-#define CLASS0_BITS 1 /* bits at integer precision for class 0 */
-#define CLASS0_SIZE (1 << CLASS0_BITS)
-#define MV_OFFSET_BITS (MV_CLASSES + CLASS0_BITS - 2)
-
-#define MV_MAX_BITS (MV_CLASSES + CLASS0_BITS + 2)
-#define MV_MAX ((1 << MV_MAX_BITS) - 1)
-#define MV_VALS ((MV_MAX << 1) + 1)
-
-extern const vp9_tree_index vp9_mv_class0_tree[2 * CLASS0_SIZE - 2];
-extern struct vp9_token_struct vp9_mv_class0_encodings[CLASS0_SIZE];
-
-extern const vp9_tree_index vp9_mv_fp_tree[2 * 4 - 2];
-extern struct vp9_token_struct vp9_mv_fp_encodings[4];
-
-typedef struct {
- vp9_prob sign;
- vp9_prob classes[MV_CLASSES - 1];
- vp9_prob class0[CLASS0_SIZE - 1];
- vp9_prob bits[MV_OFFSET_BITS];
- vp9_prob class0_fp[CLASS0_SIZE][4 - 1];
- vp9_prob fp[4 - 1];
- vp9_prob class0_hp;
- vp9_prob hp;
-} nmv_component;
-
-typedef struct {
- vp9_prob joints[MV_JOINTS - 1];
- nmv_component comps[2];
-} nmv_context;
-
-MV_JOINT_TYPE vp9_get_mv_joint(MV mv);
-MV_CLASS_TYPE vp9_get_mv_class(int z, int *offset);
-int vp9_get_mv_mag(MV_CLASS_TYPE c, int offset);
-
-
-typedef struct {
- unsigned int mvcount[MV_VALS];
- unsigned int sign[2];
- unsigned int classes[MV_CLASSES];
- unsigned int class0[CLASS0_SIZE];
- unsigned int bits[MV_OFFSET_BITS][2];
- unsigned int class0_fp[CLASS0_SIZE][4];
- unsigned int fp[4];
- unsigned int class0_hp[2];
- unsigned int hp[2];
-} nmv_component_counts;
-
-typedef struct {
- unsigned int joints[MV_JOINTS];
- nmv_component_counts comps[2];
-} nmv_context_counts;
-
-void vp9_increment_nmv(const MV *mv, const MV *ref, nmv_context_counts *mvctx,
- int usehp);
-extern const nmv_context vp9_default_nmv_context;
-void vp9_counts_to_nmv_context(
- nmv_context_counts *NMVcount,
- nmv_context *prob,
- int usehp,
- unsigned int (*branch_ct_joint)[2],
- unsigned int (*branch_ct_sign)[2],
- unsigned int (*branch_ct_classes)[MV_CLASSES - 1][2],
- unsigned int (*branch_ct_class0)[CLASS0_SIZE - 1][2],
- unsigned int (*branch_ct_bits)[MV_OFFSET_BITS][2],
- unsigned int (*branch_ct_class0_fp)[CLASS0_SIZE][4 - 1][2],
- unsigned int (*branch_ct_fp)[4 - 1][2],
- unsigned int (*branch_ct_class0_hp)[2],
- unsigned int (*branch_ct_hp)[2]);
-
-#endif
--- a/vp8/common/extend.c
+++ /dev/null
@@ -1,169 +1,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "extend.h"
-#include "vpx_mem/vpx_mem.h"
-
-static void copy_and_extend_plane(unsigned char *s, /* source */
- int sp, /* source pitch */
- unsigned char *d, /* destination */
- int dp, /* destination pitch */
- int h, /* height */
- int w, /* width */
- int et, /* extend top border */
- int el, /* extend left border */
- int eb, /* extend bottom border */
- int er) { /* extend right border */
- int i;
- unsigned char *src_ptr1, *src_ptr2;
- unsigned char *dest_ptr1, *dest_ptr2;
- int linesize;
-
- /* copy the left and right most columns out */
- src_ptr1 = s;
- src_ptr2 = s + w - 1;
- dest_ptr1 = d - el;
- dest_ptr2 = d + w;
-
- for (i = 0; i < h; i++) {
- vpx_memset(dest_ptr1, src_ptr1[0], el);
- vpx_memcpy(dest_ptr1 + el, src_ptr1, w);
- vpx_memset(dest_ptr2, src_ptr2[0], er);
- src_ptr1 += sp;
- src_ptr2 += sp;
- dest_ptr1 += dp;
- dest_ptr2 += dp;
- }
-
- /* Now copy the top and bottom lines into each line of the respective
- * borders
- */
- src_ptr1 = d - el;
- src_ptr2 = d + dp * (h - 1) - el;
- dest_ptr1 = d + dp * (-et) - el;
- dest_ptr2 = d + dp * (h) - el;
- linesize = el + er + w;
-
- for (i = 0; i < et; i++) {
- vpx_memcpy(dest_ptr1, src_ptr1, linesize);
- dest_ptr1 += dp;
- }
-
- for (i = 0; i < eb; i++) {
- vpx_memcpy(dest_ptr2, src_ptr2, linesize);
- dest_ptr2 += dp;
- }
-}
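
copy_and_extend_plane() works in two passes: each source row is first copied into the destination with its leftmost pixel replicated across the el-pixel left border and its rightmost pixel across the er-pixel right border, and then the completed top and bottom rows, borders included, are replicated upward et times and downward eb times. A w x h source therefore becomes an (el + w + er) x (et + h + eb) region in which every border pixel equals its nearest image pixel, corners included.
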
-
-void vp9_copy_and_extend_frame(YV12_BUFFER_CONFIG *src,
- YV12_BUFFER_CONFIG *dst) {
- int et = dst->border;
- int el = dst->border;
- int eb = dst->border + dst->y_height - src->y_height;
- int er = dst->border + dst->y_width - src->y_width;
-
- copy_and_extend_plane(src->y_buffer, src->y_stride,
- dst->y_buffer, dst->y_stride,
- src->y_height, src->y_width,
- et, el, eb, er);
-
- et = dst->border >> 1;
- el = dst->border >> 1;
- eb = (dst->border >> 1) + dst->uv_height - src->uv_height;
- er = (dst->border >> 1) + dst->uv_width - src->uv_width;
-
- copy_and_extend_plane(src->u_buffer, src->uv_stride,
- dst->u_buffer, dst->uv_stride,
- src->uv_height, src->uv_width,
- et, el, eb, er);
-
- copy_and_extend_plane(src->v_buffer, src->uv_stride,
- dst->v_buffer, dst->uv_stride,
- src->uv_height, src->uv_width,
- et, el, eb, er);
-}
-
-void vp9_copy_and_extend_frame_with_rect(YV12_BUFFER_CONFIG *src,
- YV12_BUFFER_CONFIG *dst,
- int srcy, int srcx,
- int srch, int srcw) {
- int et = dst->border;
- int el = dst->border;
- int eb = dst->border + dst->y_height - src->y_height;
- int er = dst->border + dst->y_width - src->y_width;
- int src_y_offset = srcy * src->y_stride + srcx;
- int dst_y_offset = srcy * dst->y_stride + srcx;
- int src_uv_offset = ((srcy * src->uv_stride) >> 1) + (srcx >> 1);
- int dst_uv_offset = ((srcy * dst->uv_stride) >> 1) + (srcx >> 1);
-
- // If the side is not touching the boundary then don't extend.
- if (srcy)
- et = 0;
- if (srcx)
- el = 0;
- if (srcy + srch != src->y_height)
- eb = 0;
- if (srcx + srcw != src->y_width)
- er = 0;
-
- copy_and_extend_plane(src->y_buffer + src_y_offset,
- src->y_stride,
- dst->y_buffer + dst_y_offset,
- dst->y_stride,
- srch, srcw,
- et, el, eb, er);
-
- et = (et + 1) >> 1;
- el = (el + 1) >> 1;
- eb = (eb + 1) >> 1;
- er = (er + 1) >> 1;
- srch = (srch + 1) >> 1;
- srcw = (srcw + 1) >> 1;
-
- copy_and_extend_plane(src->u_buffer + src_uv_offset,
- src->uv_stride,
- dst->u_buffer + dst_uv_offset,
- dst->uv_stride,
- srch, srcw,
- et, el, eb, er);
-
- copy_and_extend_plane(src->v_buffer + src_uv_offset,
- src->uv_stride,
- dst->v_buffer + dst_uv_offset,
- dst->uv_stride,
- srch, srcw,
- et, el, eb, er);
-}
-
-/* note the extension is only for the last row, for intra prediction purpose */
-void vp9_extend_mb_row(YV12_BUFFER_CONFIG *ybf, unsigned char *YPtr,
- unsigned char *UPtr, unsigned char *VPtr) {
- int i;
-
- YPtr += ybf->y_stride * 14;
- UPtr += ybf->uv_stride * 6;
- VPtr += ybf->uv_stride * 6;
-
- for (i = 0; i < 4; i++) {
- YPtr[i] = YPtr[-1];
- UPtr[i] = UPtr[-1];
- VPtr[i] = VPtr[-1];
- }
-
- YPtr += ybf->y_stride;
- UPtr += ybf->uv_stride;
- VPtr += ybf->uv_stride;
-
- for (i = 0; i < 4; i++) {
- YPtr[i] = YPtr[-1];
- UPtr[i] = UPtr[-1];
- VPtr[i] = VPtr[-1];
- }
-}
--- a/vp8/common/extend.h
+++ /dev/null
@@ -1,27 +1,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#ifndef __INC_EXTEND_H
-#define __INC_EXTEND_H
-
-#include "vpx_scale/yv12config.h"
-
-void vp9_extend_mb_row(YV12_BUFFER_CONFIG *ybf, unsigned char *YPtr,
- unsigned char *UPtr, unsigned char *VPtr);
-
-void vp9_copy_and_extend_frame(YV12_BUFFER_CONFIG *src,
- YV12_BUFFER_CONFIG *dst);
-
-void vp9_copy_and_extend_frame_with_rect(YV12_BUFFER_CONFIG *src,
- YV12_BUFFER_CONFIG *dst,
- int srcy, int srcx,
- int srch, int srcw);
-
-#endif // __INC_EXTEND_H
--- a/vp8/common/filter.c
+++ /dev/null
@@ -1,1159 +1,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include <stdlib.h>
-#include "filter.h"
-#include "vpx_ports/mem.h"
-#include "vpx_rtcd.h"
-
-DECLARE_ALIGNED(16, const short, vp9_bilinear_filters[SUBPEL_SHIFTS][2]) = {
- { 128, 0 },
- { 120, 8 },
- { 112, 16 },
- { 104, 24 },
- { 96, 32 },
- { 88, 40 },
- { 80, 48 },
- { 72, 56 },
- { 64, 64 },
- { 56, 72 },
- { 48, 80 },
- { 40, 88 },
- { 32, 96 },
- { 24, 104 },
- { 16, 112 },
- { 8, 120 }
-};
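
Each row of vp9_bilinear_filters is a two-tap kernel for one of the sixteen 1/16-pel phases, and every row sums to 128, i.e. to VP9_FILTER_WEIGHT (the shift of 7 assumed here comes from the filter.h that accompanies this file). For example, a sample 3/16 of the way between neighbouring pixels a and b would be predicted as (104 * a + 24 * b + 64) >> 7.
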
-
-#define FILTER_ALPHA 0
-#define FILTER_ALPHA_SHARP 1
-DECLARE_ALIGNED(16, const short, vp9_sub_pel_filters_8[SUBPEL_SHIFTS][8]) = {
-#if FILTER_ALPHA == 0
- /* Lagrangian interpolation filter */
- { 0, 0, 0, 128, 0, 0, 0, 0},
- { 0, 1, -5, 126, 8, -3, 1, 0},
- { -1, 3, -10, 122, 18, -6, 2, 0},
- { -1, 4, -13, 118, 27, -9, 3, -1},
- { -1, 4, -16, 112, 37, -11, 4, -1},
- { -1, 5, -18, 105, 48, -14, 4, -1},
- { -1, 5, -19, 97, 58, -16, 5, -1},
- { -1, 6, -19, 88, 68, -18, 5, -1},
- { -1, 6, -19, 78, 78, -19, 6, -1},
- { -1, 5, -18, 68, 88, -19, 6, -1},
- { -1, 5, -16, 58, 97, -19, 5, -1},
- { -1, 4, -14, 48, 105, -18, 5, -1},
- { -1, 4, -11, 37, 112, -16, 4, -1},
- { -1, 3, -9, 27, 118, -13, 4, -1},
- { 0, 2, -6, 18, 122, -10, 3, -1},
- { 0, 1, -3, 8, 126, -5, 1, 0}
-#elif FILTER_ALPHA == 50
- /* Generated using MATLAB:
- * alpha = 0.5;
- * b=intfilt(8,4,alpha);
- * bi=round(128*b);
- * ba=flipud(reshape([bi 0], 8, 8));
- * disp(num2str(ba, '%d,'))
- */
- { 0, 0, 0, 128, 0, 0, 0, 0},
- { 0, 1, -5, 126, 8, -3, 1, 0},
- { 0, 2, -10, 122, 18, -6, 2, 0},
- { -1, 3, -13, 118, 27, -9, 3, 0},
- { -1, 4, -16, 112, 37, -11, 3, 0},
- { -1, 5, -17, 104, 48, -14, 4, -1},
- { -1, 5, -18, 96, 58, -16, 5, -1},
- { -1, 5, -19, 88, 68, -17, 5, -1},
- { -1, 5, -18, 78, 78, -18, 5, -1},
- { -1, 5, -17, 68, 88, -19, 5, -1},
- { -1, 5, -16, 58, 96, -18, 5, -1},
- { -1, 4, -14, 48, 104, -17, 5, -1},
- { 0, 3, -11, 37, 112, -16, 4, -1},
- { 0, 3, -9, 27, 118, -13, 3, -1},
- { 0, 2, -6, 18, 122, -10, 2, 0},
- { 0, 1, -3, 8, 126, -5, 1, 0}
-#endif /* FILTER_ALPHA */
-};
-
-DECLARE_ALIGNED(16, const short, vp9_sub_pel_filters_8s[SUBPEL_SHIFTS][8]) = {
-#if FILTER_ALPHA_SHARP == 1
- /* dct based filter */
- {0, 0, 0, 128, 0, 0, 0, 0},
- {-1, 3, -7, 127, 8, -3, 1, 0},
- {-2, 5, -13, 125, 17, -6, 3, -1},
- {-3, 7, -17, 121, 27, -10, 5, -2},
- {-4, 9, -20, 115, 37, -13, 6, -2},
- {-4, 10, -23, 108, 48, -16, 8, -3},
- {-4, 10, -24, 100, 59, -19, 9, -3},
- {-4, 11, -24, 90, 70, -21, 10, -4},
- {-4, 11, -23, 80, 80, -23, 11, -4},
- {-4, 10, -21, 70, 90, -24, 11, -4},
- {-3, 9, -19, 59, 100, -24, 10, -4},
- {-3, 8, -16, 48, 108, -23, 10, -4},
- {-2, 6, -13, 37, 115, -20, 9, -4},
- {-2, 5, -10, 27, 121, -17, 7, -3},
- {-1, 3, -6, 17, 125, -13, 5, -2},
- {0, 1, -3, 8, 127, -7, 3, -1}
-#elif FILTER_ALPHA_SHARP == 75
- /* alpha = 0.75 */
- {0, 0, 0, 128, 0, 0, 0, 0},
- {-1, 2, -6, 126, 9, -3, 2, -1},
- {-1, 4, -11, 123, 18, -7, 3, -1},
- {-2, 6, -16, 119, 28, -10, 5, -2},
- {-2, 7, -19, 113, 38, -13, 6, -2},
- {-3, 8, -21, 106, 49, -16, 7, -2},
- {-3, 9, -22, 99, 59, -19, 8, -3},
- {-3, 9, -23, 90, 70, -21, 9, -3},
- {-3, 9, -22, 80, 80, -22, 9, -3},
- {-3, 9, -21, 70, 90, -23, 9, -3},
- {-3, 8, -19, 59, 99, -22, 9, -3},
- {-2, 7, -16, 49, 106, -21, 8, -3},
- {-2, 6, -13, 38, 113, -19, 7, -2},
- {-2, 5, -10, 28, 119, -16, 6, -2},
- {-1, 3, -7, 18, 123, -11, 4, -1},
- {-1, 2, -3, 9, 126, -6, 2, -1}
-#endif /* FILTER_ALPHA_SHARP */
-};
-
-DECLARE_ALIGNED(16, const short, vp9_sub_pel_filters_6[SUBPEL_SHIFTS][6]) = {
- {0, 0, 128, 0, 0, 0},
- {1, -5, 125, 8, -2, 1},
- {1, -8, 122, 17, -5, 1},
- {2, -11, 116, 27, -8, 2},
- {3, -14, 110, 37, -10, 2},
- {3, -15, 103, 47, -12, 2},
- {3, -16, 95, 57, -14, 3},
- {3, -16, 86, 67, -15, 3},
- {3, -16, 77, 77, -16, 3},
- {3, -15, 67, 86, -16, 3},
- {3, -14, 57, 95, -16, 3},
- {2, -12, 47, 103, -15, 3},
- {2, -10, 37, 110, -14, 3},
- {2, -8, 27, 116, -11, 2},
- {1, -5, 17, 122, -8, 1},
- {1, -2, 8, 125, -5, 1}
-};
-
-static void filter_block2d_first_pass_6(unsigned char *src_ptr,
- int *output_ptr,
- unsigned int src_pixels_per_line,
- unsigned int pixel_step,
- unsigned int output_height,
- unsigned int output_width,
- const short *vp9_filter) {
- unsigned int i, j;
- int Temp;
-
- for (i = 0; i < output_height; i++) {
- for (j = 0; j < output_width; j++) {
- Temp = ((int)src_ptr[-2 * (int)pixel_step] * vp9_filter[0]) +
- ((int)src_ptr[-1 * (int)pixel_step] * vp9_filter[1]) +
- ((int)src_ptr[0] * vp9_filter[2]) +
- ((int)src_ptr[pixel_step] * vp9_filter[3]) +
- ((int)src_ptr[2 * pixel_step] * vp9_filter[4]) +
- ((int)src_ptr[3 * pixel_step] * vp9_filter[5]) +
- (VP9_FILTER_WEIGHT >> 1); /* Rounding */
-
- /* Normalize back to 0-255 */
- Temp = Temp >> VP9_FILTER_SHIFT;
-
- if (Temp < 0)
- Temp = 0;
- else if (Temp > 255)
- Temp = 255;
-
- output_ptr[j] = Temp;
- src_ptr++;
- }
-
- /* Next row... */
- src_ptr += src_pixels_per_line - output_width;
- output_ptr += output_width;
- }
-}
-
-static void filter_block2d_second_pass_6(int *src_ptr,
- unsigned char *output_ptr,
- int output_pitch,
- unsigned int src_pixels_per_line,
- unsigned int pixel_step,
- unsigned int output_height,
- unsigned int output_width,
- const short *vp9_filter) {
- unsigned int i, j;
- int Temp;
-
- for (i = 0; i < output_height; i++) {
- for (j = 0; j < output_width; j++) {
- /* Apply filter */
- Temp = ((int)src_ptr[-2 * (int)pixel_step] * vp9_filter[0]) +
- ((int)src_ptr[-1 * (int)pixel_step] * vp9_filter[1]) +
- ((int)src_ptr[0] * vp9_filter[2]) +
- ((int)src_ptr[pixel_step] * vp9_filter[3]) +
- ((int)src_ptr[2 * pixel_step] * vp9_filter[4]) +
- ((int)src_ptr[3 * pixel_step] * vp9_filter[5]) +
- (VP9_FILTER_WEIGHT >> 1); /* Rounding */
-
- /* Normalize back to 0-255 */
- Temp = Temp >> VP9_FILTER_SHIFT;
-
- if (Temp < 0)
- Temp = 0;
- else if (Temp > 255)
- Temp = 255;
-
- output_ptr[j] = (unsigned char)Temp;
- src_ptr++;
- }
-
- /* Start next row */
- src_ptr += src_pixels_per_line - output_width;
- output_ptr += output_pitch;
- }
-}
-
-/*
- * The only functional difference between filter_block2d_second_pass()
- * and this function is that filter_block2d_second_pass() does a sixtap
- * filter on the input and stores it in the output. This function
- * (filter_block2d_second_pass_avg()) does a sixtap filter on the input,
- * and then averages that with the content already present in the output
- * ((filter_result + dest + 1) >> 1) and stores that in the output.
- */
-static void filter_block2d_second_pass_avg_6(int *src_ptr,
- unsigned char *output_ptr,
- int output_pitch,
- unsigned int src_pixels_per_line,
- unsigned int pixel_step,
- unsigned int output_height,
- unsigned int output_width,
- const short *vp9_filter) {
- unsigned int i, j;
- int Temp;
-
- for (i = 0; i < output_height; i++) {
- for (j = 0; j < output_width; j++) {
- /* Apply filter */
- Temp = ((int)src_ptr[-2 * (int)pixel_step] * vp9_filter[0]) +
- ((int)src_ptr[-1 * (int)pixel_step] * vp9_filter[1]) +
- ((int)src_ptr[0] * vp9_filter[2]) +
- ((int)src_ptr[pixel_step] * vp9_filter[3]) +
- ((int)src_ptr[2 * pixel_step] * vp9_filter[4]) +
- ((int)src_ptr[3 * pixel_step] * vp9_filter[5]) +
- (VP9_FILTER_WEIGHT >> 1); /* Rounding */
-
- /* Normalize back to 0-255 */
- Temp = Temp >> VP9_FILTER_SHIFT;
-
- if (Temp < 0)
- Temp = 0;
- else if (Temp > 255)
- Temp = 255;
-
- output_ptr[j] = (unsigned char)((output_ptr[j] + Temp + 1) >> 1);
- src_ptr++;
- }
-
- /* Start next row */
- src_ptr += src_pixels_per_line - output_width;
- output_ptr += output_pitch;
- }
-}
-
-#define Interp_Extend 3
-static void filter_block2d_6(unsigned char *src_ptr,
- unsigned char *output_ptr,
- unsigned int src_pixels_per_line,
- int output_pitch,
- const short *HFilter,
- const short *VFilter) {
- int FData[(3 + Interp_Extend * 2) * 4]; /* Temp data buffer used in filtering */
-
- /* First filter 1-D horizontally... */
- filter_block2d_first_pass_6(src_ptr - ((Interp_Extend - 1) * src_pixels_per_line), FData, src_pixels_per_line, 1,
- 3 + Interp_Extend * 2, 4, HFilter);
-
- /* then filter vertically... */
- filter_block2d_second_pass_6(FData + 4 * (Interp_Extend - 1), output_ptr, output_pitch, 4, 4, 4, 4, VFilter);
-}
-
-
-void vp9_sixtap_predict_c(unsigned char *src_ptr,
- int src_pixels_per_line,
- int xoffset,
- int yoffset,
- unsigned char *dst_ptr,
- int dst_pitch) {
- const short *HFilter;
- const short *VFilter;
-
- HFilter = vp9_sub_pel_filters_6[xoffset]; /* 6 tap */
- VFilter = vp9_sub_pel_filters_6[yoffset]; /* 6 tap */
-
- filter_block2d_6(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, VFilter);
-}
-
-/*
- * The difference between filter_block2d_6() and filter_block2d_avg_6 is
- * that filter_block2d_6() does a 6-tap filter and stores it in the output
- * buffer, whereas filter_block2d_avg_6() does the same 6-tap filter, and
- * then averages that with the content already present in the output
- * ((filter_result + dest + 1) >> 1) and stores that in the output.
- */
-static void filter_block2d_avg_6(unsigned char *src_ptr,
- unsigned char *output_ptr,
- unsigned int src_pixels_per_line,
- int output_pitch,
- const short *HFilter,
- const short *VFilter) {
- int FData[(3 + Interp_Extend * 2) * 4]; /* Temp data buffer used in filtering */
-
- /* First filter 1-D horizontally... */
- filter_block2d_first_pass_6(src_ptr - ((Interp_Extend - 1) * src_pixels_per_line),
- FData, src_pixels_per_line, 1,
- 3 + Interp_Extend * 2, 4, HFilter);
-
- /* then filter vertically... */
- filter_block2d_second_pass_avg_6(FData + 4 * (Interp_Extend - 1), output_ptr,
- output_pitch, 4, 4, 4, 4, VFilter);
-}
-
-void vp9_sixtap_predict_avg_c
-(
- unsigned char *src_ptr,
- int src_pixels_per_line,
- int xoffset,
- int yoffset,
- unsigned char *dst_ptr,
- int dst_pitch
-) {
- const short *HFilter;
- const short *VFilter;
-
- HFilter = vp9_sub_pel_filters_6[xoffset]; /* 6 tap */
- VFilter = vp9_sub_pel_filters_6[yoffset]; /* 6 tap */
-
- filter_block2d_avg_6(src_ptr, dst_ptr, src_pixels_per_line,
- dst_pitch, HFilter, VFilter);
-}
-
-void vp9_sixtap_predict8x8_c
-(
- unsigned char *src_ptr,
- int src_pixels_per_line,
- int xoffset,
- int yoffset,
- unsigned char *dst_ptr,
- int dst_pitch
-) {
- const short *HFilter;
- const short *VFilter;
- // int FData[(7+Interp_Extend*2)*16]; /* Temp data buffer used in filtering */
- int FData[(7 + Interp_Extend * 2) * 8]; /* Temp data buffer used in filtering */
-
- HFilter = vp9_sub_pel_filters_6[xoffset]; /* 6 tap */
- VFilter = vp9_sub_pel_filters_6[yoffset]; /* 6 tap */
-
- /* First filter 1-D horizontally... */
- filter_block2d_first_pass_6(src_ptr - ((Interp_Extend - 1) * src_pixels_per_line), FData, src_pixels_per_line, 1,
- 7 + Interp_Extend * 2, 8, HFilter);
-
-
- /* then filter vertically... */
- filter_block2d_second_pass_6(FData + 8 * (Interp_Extend - 1), dst_ptr, dst_pitch, 8, 8, 8, 8, VFilter);
-
-}
-
-void vp9_sixtap_predict_avg8x8_c
-(
- unsigned char *src_ptr,
- int src_pixels_per_line,
- int xoffset,
- int yoffset,
- unsigned char *dst_ptr,
- int dst_pitch
-) {
- const short *HFilter;
- const short *VFilter;
- // int FData[(7+Interp_Extend*2)*16]; /* Temp data buffer used in filtering */
- int FData[(7 + Interp_Extend * 2) * 8]; /* Temp data buffer used in filtering */
-
- HFilter = vp9_sub_pel_filters_6[xoffset]; /* 6 tap */
- VFilter = vp9_sub_pel_filters_6[yoffset]; /* 6 tap */
-
- /* First filter 1-D horizontally... */
- filter_block2d_first_pass_6(src_ptr - ((Interp_Extend - 1) * src_pixels_per_line), FData, src_pixels_per_line, 1,
- 7 + Interp_Extend * 2, 8, HFilter);
-
- /* then filter vertically... */
- filter_block2d_second_pass_avg_6(FData + 8 * (Interp_Extend - 1), dst_ptr, dst_pitch, 8, 8, 8, 8, VFilter);
-}
-
-void vp9_sixtap_predict8x4_c
-(
- unsigned char *src_ptr,
- int src_pixels_per_line,
- int xoffset,
- int yoffset,
- unsigned char *dst_ptr,
- int dst_pitch
-) {
- const short *HFilter;
- const short *VFilter;
- // int FData[(7+Interp_Extend*2)*16]; /* Temp data buffer used in filtering */
- int FData[(3 + Interp_Extend * 2) * 8]; /* Temp data buffer used in filtering */
-
- HFilter = vp9_sub_pel_filters_6[xoffset]; /* 6 tap */
- VFilter = vp9_sub_pel_filters_6[yoffset]; /* 6 tap */
-
- /* First filter 1-D horizontally... */
- filter_block2d_first_pass_6(src_ptr - ((Interp_Extend - 1) * src_pixels_per_line), FData, src_pixels_per_line, 1,
- 3 + Interp_Extend * 2, 8, HFilter);
-
-
- /* then filter vertically... */
- filter_block2d_second_pass_6(FData + 8 * (Interp_Extend - 1), dst_ptr, dst_pitch, 8, 8, 4, 8, VFilter);
-
-}
-
-void vp9_sixtap_predict16x16_c
-(
- unsigned char *src_ptr,
- int src_pixels_per_line,
- int xoffset,
- int yoffset,
- unsigned char *dst_ptr,
- int dst_pitch
-) {
- const short *HFilter;
- const short *VFilter;
- // int FData[(15+Interp_Extend*2)*24]; /* Temp data buffer used in filtering */
- int FData[(15 + Interp_Extend * 2) * 16]; /* Temp data buffer used in filtering */
-
-
- HFilter = vp9_sub_pel_filters_6[xoffset]; /* 6 tap */
- VFilter = vp9_sub_pel_filters_6[yoffset]; /* 6 tap */
-
- /* First filter 1-D horizontally... */
- filter_block2d_first_pass_6(src_ptr - ((Interp_Extend - 1) * src_pixels_per_line), FData, src_pixels_per_line, 1,
- 15 + Interp_Extend * 2, 16, HFilter);
-
- /* then filter vertically... */
- filter_block2d_second_pass_6(FData + 16 * (Interp_Extend - 1), dst_ptr, dst_pitch, 16, 16, 16, 16, VFilter);
-
-}
-
-void vp9_sixtap_predict_avg16x16_c
-(
- unsigned char *src_ptr,
- int src_pixels_per_line,
- int xoffset,
- int yoffset,
- unsigned char *dst_ptr,
- int dst_pitch
-) {
- const short *HFilter;
- const short *VFilter;
- // int FData[(15+Interp_Extend*2)*24]; /* Temp data buffer used in filtering */
- int FData[(15 + Interp_Extend * 2) * 16]; /* Temp data buffer used in filtering */
-
- HFilter = vp9_sub_pel_filters_6[xoffset]; /* 6 tap */
- VFilter = vp9_sub_pel_filters_6[yoffset]; /* 6 tap */
-
- /* First filter 1-D horizontally... */
- filter_block2d_first_pass_6(src_ptr - ((Interp_Extend - 1) * src_pixels_per_line), FData,
- src_pixels_per_line, 1, 15 + Interp_Extend * 2, 16, HFilter);
-
- /* then filter vertically... */
- filter_block2d_second_pass_avg_6(FData + 16 * (Interp_Extend - 1), dst_ptr, dst_pitch,
- 16, 16, 16, 16, VFilter);
-}
-
-typedef enum {
- VPX_FILTER_4x4 = 0,
- VPX_FILTER_8x8 = 1,
- VPX_FILTER_8x4 = 2,
- VPX_FILTER_16x16 = 3,
-} filter_size_t;
-
-static const unsigned int filter_size_to_wh[][2] = {
- {4, 4},
- {8, 8},
- {8, 4},
- {16,16},
-};
-
-static const unsigned int filter_max_height = 16;
-static const unsigned int filter_max_width = 16;
-
-static void filter_block2d_8_c(const unsigned char *src_ptr,
- const unsigned int src_stride,
- const short *HFilter,
- const short *VFilter,
- const filter_size_t filter_size,
- unsigned char *dst_ptr,
- unsigned int dst_stride) {
- const unsigned int output_width = filter_size_to_wh[filter_size][0];
- const unsigned int output_height = filter_size_to_wh[filter_size][1];
-
- // Between passes, we use an intermediate buffer whose height is extended to
- // have enough horizontally filtered values as input for the vertical pass.
- // This buffer is allocated to be big enough for the largest block type we
- // support.
- const int kInterp_Extend = 4;
- const unsigned int intermediate_height =
- (kInterp_Extend - 1) + output_height + kInterp_Extend;
- const unsigned int max_intermediate_height =
- (kInterp_Extend - 1) + filter_max_height + kInterp_Extend;
-#ifdef _MSC_VER
- // MSVC does not support C99 style declaration
- unsigned char intermediate_buffer[23 * 16];
-#else
- unsigned char intermediate_buffer[max_intermediate_height * filter_max_width];
-#endif
- const int intermediate_next_stride = 1 - intermediate_height * output_width;
-
- // Horizontal pass (src -> transposed intermediate).
- {
- unsigned char *output_ptr = intermediate_buffer;
- const int src_next_row_stride = src_stride - output_width;
- unsigned int i, j;
- src_ptr -= (kInterp_Extend - 1) * src_stride + (kInterp_Extend - 1);
- for (i = 0; i < intermediate_height; i++) {
- for (j = 0; j < output_width; j++) {
- // Apply filter...
- int temp = ((int)src_ptr[0] * HFilter[0]) +
- ((int)src_ptr[1] * HFilter[1]) +
- ((int)src_ptr[2] * HFilter[2]) +
- ((int)src_ptr[3] * HFilter[3]) +
- ((int)src_ptr[4] * HFilter[4]) +
- ((int)src_ptr[5] * HFilter[5]) +
- ((int)src_ptr[6] * HFilter[6]) +
- ((int)src_ptr[7] * HFilter[7]) +
- (VP9_FILTER_WEIGHT >> 1); // Rounding
-
- // Normalize back to 0-255...
- temp >>= VP9_FILTER_SHIFT;
- if (temp < 0) {
- temp = 0;
- } else if (temp > 255) {
- temp = 255;
- }
- src_ptr++;
- *output_ptr = temp;
- output_ptr += intermediate_height;
- }
- src_ptr += src_next_row_stride;
- output_ptr += intermediate_next_stride;
- }
- }
-
- // Vertical pass (transposed intermediate -> dst).
- {
- unsigned char *src_ptr = intermediate_buffer;
- const int dst_next_row_stride = dst_stride - output_width;
- unsigned int i, j;
- for (i = 0; i < output_height; i++) {
- for (j = 0; j < output_width; j++) {
- // Apply filter...
- int temp = ((int)src_ptr[0] * VFilter[0]) +
- ((int)src_ptr[1] * VFilter[1]) +
- ((int)src_ptr[2] * VFilter[2]) +
- ((int)src_ptr[3] * VFilter[3]) +
- ((int)src_ptr[4] * VFilter[4]) +
- ((int)src_ptr[5] * VFilter[5]) +
- ((int)src_ptr[6] * VFilter[6]) +
- ((int)src_ptr[7] * VFilter[7]) +
- (VP9_FILTER_WEIGHT >> 1); // Rounding
-
- // Normalize back to 0-255...
- temp >>= VP9_FILTER_SHIFT;
- if (temp < 0) {
- temp = 0;
- } else if (temp > 255) {
- temp = 255;
- }
-
- src_ptr += intermediate_height;
- *dst_ptr++ = (unsigned char)temp;
- }
- src_ptr += intermediate_next_stride;
- dst_ptr += dst_next_row_stride;
- }
- }
-}
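/* Worked example (editor's note) of the intermediate-buffer sizing described
 * in filter_block2d_8_c() above: for the largest supported block, 16x16, with
 * kInterp_Extend == 4 the horizontal pass must produce
 *   (kInterp_Extend - 1) + output_height + kInterp_Extend = 3 + 16 + 4 = 23
 * rows of 16 filtered samples, which is exactly the 23 * 16 fallback buffer
 * used for the MSVC build. */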
-
-void vp9_filter_block2d_4x4_8_c(const unsigned char *src_ptr,
- const unsigned int src_stride,
- const short *HFilter_aligned16,
- const short *VFilter_aligned16,
- unsigned char *dst_ptr,
- unsigned int dst_stride) {
- filter_block2d_8_c(src_ptr, src_stride,
- HFilter_aligned16, VFilter_aligned16,
- VPX_FILTER_4x4, dst_ptr, dst_stride);
-}
-
-void vp9_filter_block2d_8x4_8_c(const unsigned char *src_ptr,
- const unsigned int src_stride,
- const short *HFilter_aligned16,
- const short *VFilter_aligned16,
- unsigned char *dst_ptr,
- unsigned int dst_stride) {
- filter_block2d_8_c(src_ptr, src_stride,
- HFilter_aligned16, VFilter_aligned16,
- VPX_FILTER_8x4, dst_ptr, dst_stride);
-}
-
-void vp9_filter_block2d_8x8_8_c(const unsigned char *src_ptr,
- const unsigned int src_stride,
- const short *HFilter_aligned16,
- const short *VFilter_aligned16,
- unsigned char *dst_ptr,
- unsigned int dst_stride) {
- filter_block2d_8_c(src_ptr, src_stride,
- HFilter_aligned16, VFilter_aligned16,
- VPX_FILTER_8x8, dst_ptr, dst_stride);
-}
-
-void vp9_filter_block2d_16x16_8_c(const unsigned char *src_ptr,
- const unsigned int src_stride,
- const short *HFilter_aligned16,
- const short *VFilter_aligned16,
- unsigned char *dst_ptr,
- unsigned int dst_stride) {
- filter_block2d_8_c(src_ptr, src_stride,
- HFilter_aligned16, VFilter_aligned16,
- VPX_FILTER_16x16, dst_ptr, dst_stride);
-}
-
-static void block2d_average_c(unsigned char *src,
- unsigned int src_stride,
- unsigned char *output_ptr,
- unsigned int output_stride,
- const filter_size_t filter_size) {
- const unsigned int output_width = filter_size_to_wh[filter_size][0];
- const unsigned int output_height = filter_size_to_wh[filter_size][1];
-
- unsigned int i, j;
- for (i = 0; i < output_height; i++) {
- for (j = 0; j < output_width; j++) {
- output_ptr[j] = (output_ptr[j] + src[i * src_stride + j] + 1) >> 1;
- }
- output_ptr += output_stride;
- }
-}
-
-#define block2d_average block2d_average_c
-
-void vp9_eighttap_predict_c(unsigned char *src_ptr,
- int src_pixels_per_line,
- int xoffset,
- int yoffset,
- unsigned char *dst_ptr,
- int dst_pitch) {
- const short *HFilter;
- const short *VFilter;
-
- HFilter = vp9_sub_pel_filters_8[xoffset];
- VFilter = vp9_sub_pel_filters_8[yoffset];
-
- vp9_filter_block2d_4x4_8(src_ptr, src_pixels_per_line,
- HFilter, VFilter,
- dst_ptr, dst_pitch);
-}
-
-void vp9_eighttap_predict_avg4x4_c(unsigned char *src_ptr,
- int src_pixels_per_line,
- int xoffset,
- int yoffset,
- unsigned char *dst_ptr,
- int dst_pitch) {
- const short *HFilter = vp9_sub_pel_filters_8[xoffset];
- const short *VFilter = vp9_sub_pel_filters_8[yoffset];
- unsigned char tmp[4 * 4];
-
- vp9_filter_block2d_4x4_8(src_ptr, src_pixels_per_line,
- HFilter, VFilter,
- tmp, 4);
- block2d_average(tmp, 4, dst_ptr, dst_pitch, VPX_FILTER_4x4);
-}
-
-void vp9_eighttap_predict_sharp_c(unsigned char *src_ptr,
- int src_pixels_per_line,
- int xoffset,
- int yoffset,
- unsigned char *dst_ptr,
- int dst_pitch) {
- const short *HFilter;
- const short *VFilter;
-
- HFilter = vp9_sub_pel_filters_8s[xoffset];
- VFilter = vp9_sub_pel_filters_8s[yoffset];
-
- vp9_filter_block2d_4x4_8(src_ptr, src_pixels_per_line,
- HFilter, VFilter,
- dst_ptr, dst_pitch);
-}
-
-void vp9_eighttap_predict_avg4x4_sharp_c(unsigned char *src_ptr,
- int src_pixels_per_line,
- int xoffset,
- int yoffset,
- unsigned char *dst_ptr,
- int dst_pitch) {
- const short *HFilter = vp9_sub_pel_filters_8s[xoffset];
- const short *VFilter = vp9_sub_pel_filters_8s[yoffset];
- unsigned char tmp[4 * 4];
-
- vp9_filter_block2d_4x4_8(src_ptr, src_pixels_per_line,
- HFilter, VFilter,
- tmp, 4);
- block2d_average(tmp, 4, dst_ptr, dst_pitch, VPX_FILTER_4x4);
-}
-
-void vp9_eighttap_predict8x8_c(unsigned char *src_ptr,
- int src_pixels_per_line,
- int xoffset,
- int yoffset,
- unsigned char *dst_ptr,
- int dst_pitch) {
- const short *HFilter = vp9_sub_pel_filters_8[xoffset];
- const short *VFilter = vp9_sub_pel_filters_8[yoffset];
-
- vp9_filter_block2d_8x8_8(src_ptr, src_pixels_per_line,
- HFilter, VFilter,
- dst_ptr, dst_pitch);
-}
-
-void vp9_eighttap_predict8x8_sharp_c(unsigned char *src_ptr,
- int src_pixels_per_line,
- int xoffset,
- int yoffset,
- unsigned char *dst_ptr,
- int dst_pitch) {
- const short *HFilter = vp9_sub_pel_filters_8s[xoffset];
- const short *VFilter = vp9_sub_pel_filters_8s[yoffset];
-
- vp9_filter_block2d_8x8_8(src_ptr, src_pixels_per_line,
- HFilter, VFilter,
- dst_ptr, dst_pitch);
-}
-
-void vp9_eighttap_predict_avg8x8_c(unsigned char *src_ptr,
- int src_pixels_per_line,
- int xoffset,
- int yoffset,
- unsigned char *dst_ptr,
- int dst_pitch) {
- unsigned char tmp[8 * 8];
- const short *HFilter = vp9_sub_pel_filters_8[xoffset];
- const short *VFilter = vp9_sub_pel_filters_8[yoffset];
-
- vp9_filter_block2d_8x8_8(src_ptr, src_pixels_per_line,
- HFilter, VFilter,
- tmp, 8);
- block2d_average(tmp, 8, dst_ptr, dst_pitch, VPX_FILTER_8x8);
-}
-
-void vp9_eighttap_predict_avg8x8_sharp_c(unsigned char *src_ptr,
- int src_pixels_per_line,
- int xoffset,
- int yoffset,
- unsigned char *dst_ptr,
- int dst_pitch) {
- unsigned char tmp[8 * 8];
- const short *HFilter = vp9_sub_pel_filters_8s[xoffset];
- const short *VFilter = vp9_sub_pel_filters_8s[yoffset];
-
- vp9_filter_block2d_8x8_8(src_ptr, src_pixels_per_line,
- HFilter, VFilter,
- tmp, 8);
- block2d_average(tmp, 8, dst_ptr, dst_pitch, VPX_FILTER_8x8);
-}
-
-void vp9_eighttap_predict8x4_c(unsigned char *src_ptr,
- int src_pixels_per_line,
- int xoffset,
- int yoffset,
- unsigned char *dst_ptr,
- int dst_pitch) {
- const short *HFilter = vp9_sub_pel_filters_8[xoffset];
- const short *VFilter = vp9_sub_pel_filters_8[yoffset];
-
- vp9_filter_block2d_8x4_8(src_ptr, src_pixels_per_line,
- HFilter, VFilter,
- dst_ptr, dst_pitch);
-}
-
-void vp9_eighttap_predict8x4_sharp_c(unsigned char *src_ptr,
- int src_pixels_per_line,
- int xoffset,
- int yoffset,
- unsigned char *dst_ptr,
- int dst_pitch) {
- const short *HFilter = vp9_sub_pel_filters_8s[xoffset];
- const short *VFilter = vp9_sub_pel_filters_8s[yoffset];
-
- vp9_filter_block2d_8x4_8(src_ptr, src_pixels_per_line,
- HFilter, VFilter,
- dst_ptr, dst_pitch);
-}
-
-void vp9_eighttap_predict16x16_c(unsigned char *src_ptr,
- int src_pixels_per_line,
- int xoffset,
- int yoffset,
- unsigned char *dst_ptr,
- int dst_pitch) {
- const short *HFilter = vp9_sub_pel_filters_8[xoffset];
- const short *VFilter = vp9_sub_pel_filters_8[yoffset];
-
- vp9_filter_block2d_16x16_8(src_ptr, src_pixels_per_line,
- HFilter, VFilter,
- dst_ptr, dst_pitch);
-}
-
-void vp9_eighttap_predict16x16_sharp_c(unsigned char *src_ptr,
- int src_pixels_per_line,
- int xoffset,
- int yoffset,
- unsigned char *dst_ptr,
- int dst_pitch) {
- const short *HFilter = vp9_sub_pel_filters_8s[xoffset];
- const short *VFilter = vp9_sub_pel_filters_8s[yoffset];
-
- vp9_filter_block2d_16x16_8(src_ptr, src_pixels_per_line,
- HFilter, VFilter,
- dst_ptr, dst_pitch);
-}
-
-void vp9_eighttap_predict_avg16x16_c(unsigned char *src_ptr,
- int src_pixels_per_line,
- int xoffset,
- int yoffset,
- unsigned char *dst_ptr,
- int dst_pitch) {
- DECLARE_ALIGNED_ARRAY(16, unsigned char, tmp, 16 * 16);
- const short *HFilter = vp9_sub_pel_filters_8[xoffset];
- const short *VFilter = vp9_sub_pel_filters_8[yoffset];
-
- vp9_filter_block2d_16x16_8(src_ptr, src_pixels_per_line,
- HFilter, VFilter,
- tmp, 16);
- block2d_average(tmp, 16, dst_ptr, dst_pitch, VPX_FILTER_16x16);
-}
-
-void vp9_eighttap_predict_avg16x16_sharp_c(unsigned char *src_ptr,
- int src_pixels_per_line,
- int xoffset,
- int yoffset,
- unsigned char *dst_ptr,
- int dst_pitch) {
- DECLARE_ALIGNED_ARRAY(16, unsigned char, tmp, 16 * 16);
- const short *HFilter = vp9_sub_pel_filters_8s[xoffset];
- const short *VFilter = vp9_sub_pel_filters_8s[yoffset];
-
- vp9_filter_block2d_16x16_8(src_ptr, src_pixels_per_line,
- HFilter, VFilter,
- tmp, 16);
- block2d_average(tmp, 16, dst_ptr, dst_pitch, VPX_FILTER_16x16);
-}
-
-/****************************************************************************
- *
- * ROUTINE : filter_block2d_bil_first_pass
- *
- * INPUTS : UINT8 *src_ptr : Pointer to source block.
- * UINT32 src_stride : Stride of source block.
- * UINT32 height : Block height.
- * UINT32 width : Block width.
- * INT16 *vp9_filter : Array of 2 bi-linear filter taps.
- *
- * OUTPUTS : UINT16 *dst_ptr : Pointer to filtered block.
- *
- * RETURNS : void
- *
- * FUNCTION : Applies a 1-D 2-tap bi-linear filter to the source block
- * in the horizontal direction to produce the filtered output
- * block. Used to implement first-pass of 2-D separable filter.
- *
- * SPECIAL NOTES : Produces UINT16 output to retain precision for the next pass.
- * Two filter taps should sum to VP9_FILTER_WEIGHT.
- *
- ****************************************************************************/
-static void filter_block2d_bil_first_pass(unsigned char *src_ptr,
- unsigned short *dst_ptr,
- unsigned int src_stride,
- unsigned int height,
- unsigned int width,
- const short *vp9_filter) {
- unsigned int i, j;
-
- for (i = 0; i < height; i++) {
- for (j = 0; j < width; j++) {
- /* Apply bilinear filter */
- dst_ptr[j] = (((int)src_ptr[0] * vp9_filter[0]) +
- ((int)src_ptr[1] * vp9_filter[1]) +
- (VP9_FILTER_WEIGHT / 2)) >> VP9_FILTER_SHIFT;
- src_ptr++;
- }
-
- /* Next row... */
- src_ptr += src_stride - width;
- dst_ptr += width;
- }
-}
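/* Worked example (editor's sketch) of one output sample from the horizontal
 * bilinear pass above.  The actual taps come from vp9_bilinear_filters[],
 * which is not shown in this hunk; a half-pel pair of {64, 64} is assumed
 * here, and any pair summing to VP9_FILTER_WEIGHT (128) behaves the same:
 *   src_ptr[0] = 100, src_ptr[1] = 121
 *   dst_ptr[j] = (100 * 64 + 121 * 64 + 64) >> 7 = 14208 >> 7 = 111
 * i.e. 110.5 rounded up to the nearest integer. */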
-
-/****************************************************************************
- *
- * ROUTINE : filter_block2d_bil_second_pass
- *
- * INPUTS : UINT16 *src_ptr : Pointer to source block.
- * UINT32 dst_pitch : Destination block pitch.
- * UINT32 height : Block height.
- * UINT32 width : Block width.
- * INT16 *vp9_filter : Array of 2 bi-linear filter taps.
- *
- * OUTPUTS : UINT8 *dst_ptr : Pointer to filtered block.
- *
- * RETURNS : void
- *
- * FUNCTION : Applies a 1-D 2-tap bi-linear filter to the source block
- * in the vertical direction to produce the filtered output
- * block. Used to implement second-pass of 2-D separable filter.
- *
- * SPECIAL NOTES : Requires 16-bit input as produced by filter_block2d_bil_first_pass.
- * Two filter taps should sum to VP9_FILTER_WEIGHT.
- *
- ****************************************************************************/
-static void filter_block2d_bil_second_pass(unsigned short *src_ptr,
- unsigned char *dst_ptr,
- int dst_pitch,
- unsigned int height,
- unsigned int width,
- const short *vp9_filter) {
- unsigned int i, j;
- int Temp;
-
- for (i = 0; i < height; i++) {
- for (j = 0; j < width; j++) {
- /* Apply filter */
- Temp = ((int)src_ptr[0] * vp9_filter[0]) +
- ((int)src_ptr[width] * vp9_filter[1]) +
- (VP9_FILTER_WEIGHT / 2);
- dst_ptr[j] = (unsigned int)(Temp >> VP9_FILTER_SHIFT);
- src_ptr++;
- }
-
- /* Next row... */
- dst_ptr += dst_pitch;
- }
-}
-
-/*
- * As before for filter_block2d_second_pass_avg(), the functional difference
- * between filter_block2d_bil_second_pass() and filter_block2d_bil_second_pass_avg()
- * is that filter_block2d_bil_second_pass() does a bilinear filter on input
- * and stores the result in output; filter_block2d_bil_second_pass_avg(),
- * instead, does a bilinear filter on input, averages the resulting value
- * with the values already present in the output and stores the result of
- * that back into the output ((filter_result + dest + 1) >> 1).
- */
-static void filter_block2d_bil_second_pass_avg(unsigned short *src_ptr,
- unsigned char *dst_ptr,
- int dst_pitch,
- unsigned int height,
- unsigned int width,
- const short *vp9_filter) {
- unsigned int i, j;
- int Temp;
-
- for (i = 0; i < height; i++) {
- for (j = 0; j < width; j++) {
- /* Apply filter */
- Temp = ((int)src_ptr[0] * vp9_filter[0]) +
- ((int)src_ptr[width] * vp9_filter[1]) +
- (VP9_FILTER_WEIGHT / 2);
- dst_ptr[j] = (unsigned int)(((Temp >> VP9_FILTER_SHIFT) + dst_ptr[j] + 1) >> 1);
- src_ptr++;
- }
-
- /* Next row... */
- dst_ptr += dst_pitch;
- }
-}
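/* Worked example (editor's note) of the averaging done by the _avg variant
 * above: if the bilinear result for a pixel is 100 and the destination
 * already holds 105, the value written back is
 *   (100 + 105 + 1) >> 1 = 103,
 * i.e. the two predictions are blended with round-half-up. */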
-
-/****************************************************************************
- *
- * ROUTINE : filter_block2d_bil
- *
- * INPUTS : UINT8 *src_ptr : Pointer to source block.
- * UINT32 src_pitch : Stride of source block.
- * UINT32 dst_pitch : Stride of destination block.
- * INT16 *HFilter : Array of 2 horizontal filter taps.
- * INT16 *VFilter : Array of 2 vertical filter taps.
- * INT32 Width : Block width
- * INT32 Height : Block height
- *
- * OUTPUTS : UINT8 *dst_ptr : Pointer to filtered block.
- *
- * RETURNS : void
- *
- * FUNCTION : 2-D filters an input block by applying a 2-tap
- * bi-linear filter horizontally followed by a 2-tap
- * bi-linear filter vertically on the result.
- *
- * SPECIAL NOTES : The largest block size that can be handled here is 16x16
- *
- ****************************************************************************/
-static void filter_block2d_bil(unsigned char *src_ptr,
- unsigned char *dst_ptr,
- unsigned int src_pitch,
- unsigned int dst_pitch,
- const short *HFilter,
- const short *VFilter,
- int Width,
- int Height) {
-
- unsigned short FData[17 * 16]; /* Temp data buffer used in filtering */
-
- /* First filter 1-D horizontally... */
- filter_block2d_bil_first_pass(src_ptr, FData, src_pitch, Height + 1, Width, HFilter);
-
- /* then 1-D vertically... */
- filter_block2d_bil_second_pass(FData, dst_ptr, dst_pitch, Height, Width, VFilter);
-}
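/* Editor's note on the FData sizing above: the horizontal pass is run for
 * Height + 1 rows because the vertical 2-tap pass reads src_ptr[0] and
 * src_ptr[width], i.e. one row below each output row.  For the largest
 * supported block, 16x16, that is 17 rows of 16 samples, hence FData[17 * 16]. */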
-
-static void filter_block2d_bil_avg(unsigned char *src_ptr,
- unsigned char *dst_ptr,
- unsigned int src_pitch,
- unsigned int dst_pitch,
- const short *HFilter,
- const short *VFilter,
- int Width,
- int Height) {
- unsigned short FData[17 * 16]; /* Temp data buffer used in filtering */
-
- /* First filter 1-D horizontally... */
- filter_block2d_bil_first_pass(src_ptr, FData, src_pitch, Height + 1, Width, HFilter);
-
- /* then 1-D vertically... */
- filter_block2d_bil_second_pass_avg(FData, dst_ptr, dst_pitch, Height, Width, VFilter);
-}
-
-void vp9_bilinear_predict4x4_c(unsigned char *src_ptr,
- int src_pixels_per_line,
- int xoffset,
- int yoffset,
- unsigned char *dst_ptr,
- int dst_pitch) {
- const short *HFilter;
- const short *VFilter;
-
- HFilter = vp9_bilinear_filters[xoffset];
- VFilter = vp9_bilinear_filters[yoffset];
-
- filter_block2d_bil(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, VFilter, 4, 4);
-}
-
-void vp9_bilinear_predict_avg4x4_c(unsigned char *src_ptr,
- int src_pixels_per_line,
- int xoffset,
- int yoffset,
- unsigned char *dst_ptr,
- int dst_pitch) {
- const short *HFilter;
- const short *VFilter;
-
- HFilter = vp9_bilinear_filters[xoffset];
- VFilter = vp9_bilinear_filters[yoffset];
-
- filter_block2d_bil_avg(src_ptr, dst_ptr, src_pixels_per_line,
- dst_pitch, HFilter, VFilter, 4, 4);
-}
-
-void vp9_bilinear_predict8x8_c(unsigned char *src_ptr,
- int src_pixels_per_line,
- int xoffset,
- int yoffset,
- unsigned char *dst_ptr,
- int dst_pitch) {
- const short *HFilter;
- const short *VFilter;
-
- HFilter = vp9_bilinear_filters[xoffset];
- VFilter = vp9_bilinear_filters[yoffset];
-
- filter_block2d_bil(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, VFilter, 8, 8);
-
-}
-
-void vp9_bilinear_predict_avg8x8_c(unsigned char *src_ptr,
- int src_pixels_per_line,
- int xoffset,
- int yoffset,
- unsigned char *dst_ptr,
- int dst_pitch) {
- const short *HFilter;
- const short *VFilter;
-
- HFilter = vp9_bilinear_filters[xoffset];
- VFilter = vp9_bilinear_filters[yoffset];
-
- filter_block2d_bil_avg(src_ptr, dst_ptr, src_pixels_per_line,
- dst_pitch, HFilter, VFilter, 8, 8);
-}
-
-void vp9_bilinear_predict8x4_c(unsigned char *src_ptr,
- int src_pixels_per_line,
- int xoffset,
- int yoffset,
- unsigned char *dst_ptr,
- int dst_pitch) {
- const short *HFilter;
- const short *VFilter;
-
- HFilter = vp9_bilinear_filters[xoffset];
- VFilter = vp9_bilinear_filters[yoffset];
-
- filter_block2d_bil(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, VFilter, 8, 4);
-
-}
-
-void vp9_bilinear_predict16x16_c(unsigned char *src_ptr,
- int src_pixels_per_line,
- int xoffset,
- int yoffset,
- unsigned char *dst_ptr,
- int dst_pitch) {
- const short *HFilter;
- const short *VFilter;
-
- HFilter = vp9_bilinear_filters[xoffset];
- VFilter = vp9_bilinear_filters[yoffset];
-
- filter_block2d_bil(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, VFilter, 16, 16);
-}
-
-void vp9_bilinear_predict_avg16x16_c(unsigned char *src_ptr,
- int src_pixels_per_line,
- int xoffset,
- int yoffset,
- unsigned char *dst_ptr,
- int dst_pitch) {
- const short *HFilter;
- const short *VFilter;
-
- HFilter = vp9_bilinear_filters[xoffset];
- VFilter = vp9_bilinear_filters[yoffset];
-
- filter_block2d_bil_avg(src_ptr, dst_ptr, src_pixels_per_line,
- dst_pitch, HFilter, VFilter, 16, 16);
-}
--- a/vp8/common/filter.h
+++ /dev/null
@@ -1,28 +1,0 @@
-/*
- * Copyright (c) 2011 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#ifndef FILTER_H
-#define FILTER_H
-
-#include "vpx_config.h"
-#include "vpx_scale/yv12config.h"
-
-#define BLOCK_HEIGHT_WIDTH 4
-#define VP9_FILTER_WEIGHT 128
-#define VP9_FILTER_SHIFT 7
-
-#define SUBPEL_SHIFTS 16
-
-extern const short vp9_bilinear_filters[SUBPEL_SHIFTS][2];
-extern const short vp9_sub_pel_filters_6[SUBPEL_SHIFTS][6];
-extern const short vp9_sub_pel_filters_8[SUBPEL_SHIFTS][8];
-extern const short vp9_sub_pel_filters_8s[SUBPEL_SHIFTS][8];
-
-#endif // FILTER_H
--- a/vp8/common/findnearmv.c
+++ /dev/null
@@ -1,327 +1,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include "findnearmv.h"
-#include "vp8/common/sadmxn.h"
-#include <limits.h>
-
-const unsigned char vp9_mbsplit_offset[4][16] = {
- { 0, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
- { 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
- { 0, 2, 8, 10, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
- { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}
-};
-
-static void lower_mv_precision(int_mv *mv, int usehp)
-{
- if (!usehp || !vp9_use_nmv_hp(&mv->as_mv)) {
- if (mv->as_mv.row & 1)
- mv->as_mv.row += (mv->as_mv.row > 0 ? -1 : 1);
- if (mv->as_mv.col & 1)
- mv->as_mv.col += (mv->as_mv.col > 0 ? -1 : 1);
- }
-}
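/* Worked example (editor's note) of the truncation above: MV components are
 * in 1/8-pel units, so when high precision is not in use a row of +5 becomes
 * +4 and a row of -5 becomes -4, i.e. the odd low bit is dropped towards
 * zero (5/8 pel is reduced to 1/2 pel). */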
-
-/* Predict motion vectors using those from already-decoded nearby blocks.
- Note that we only consider one 4x4 subblock from each candidate 16x16
- macroblock. */
-
-void vp9_find_near_mvs
-(
- MACROBLOCKD *xd,
- const MODE_INFO *here,
- const MODE_INFO *lf_here,
- int_mv *nearest,
- int_mv *nearby,
- int_mv *best_mv,
- int cnt[4],
- int refframe,
- int *ref_frame_sign_bias) {
- const MODE_INFO *above = here - xd->mode_info_stride;
- const MODE_INFO *left = here - 1;
- const MODE_INFO *aboveleft = above - 1;
- const MODE_INFO *third = NULL;
- int_mv near_mvs[4];
- int_mv *mv = near_mvs;
- int *cntx = cnt;
- enum {CNT_INTRA, CNT_NEAREST, CNT_NEAR, CNT_SPLITMV};
-
- /* Zero accumulators */
- mv[0].as_int = mv[1].as_int = mv[2].as_int = 0;
- cnt[0] = cnt[1] = cnt[2] = cnt[3] = 0;
-
- /* Process above */
- if (above->mbmi.ref_frame != INTRA_FRAME) {
- if (above->mbmi.mv[0].as_int) {
- ++ mv;
- mv->as_int = above->mbmi.mv[0].as_int;
- mv_bias(ref_frame_sign_bias[above->mbmi.ref_frame],
- refframe, mv, ref_frame_sign_bias);
- ++cntx;
- }
- *cntx += 2;
- }
-
- /* Process left */
- if (left->mbmi.ref_frame != INTRA_FRAME) {
- if (left->mbmi.mv[0].as_int) {
- int_mv this_mv;
- this_mv.as_int = left->mbmi.mv[0].as_int;
- mv_bias(ref_frame_sign_bias[left->mbmi.ref_frame],
- refframe, &this_mv, ref_frame_sign_bias);
-
- if (this_mv.as_int != mv->as_int) {
- ++ mv;
- mv->as_int = this_mv.as_int;
- ++ cntx;
- }
- *cntx += 2;
- } else
- cnt[CNT_INTRA] += 2;
- }
- /* Process above left or the one from last frame */
- if (aboveleft->mbmi.ref_frame != INTRA_FRAME ||
- (lf_here->mbmi.ref_frame == LAST_FRAME && refframe == LAST_FRAME)) {
- if (aboveleft->mbmi.mv[0].as_int) {
- third = aboveleft;
- } else if (lf_here->mbmi.mv[0].as_int) {
- third = lf_here;
- }
- if (third) {
- int_mv this_mv;
- this_mv.as_int = third->mbmi.mv[0].as_int;
- mv_bias(ref_frame_sign_bias[third->mbmi.ref_frame],
- refframe, &this_mv, ref_frame_sign_bias);
-
- if (this_mv.as_int != mv->as_int) {
- ++ mv;
- mv->as_int = this_mv.as_int;
- ++ cntx;
- }
- *cntx += 1;
- } else
- cnt[CNT_INTRA] += 1;
- }
-
- /* If we have three distinct MV's ... */
- if (cnt[CNT_SPLITMV]) {
- /* See if the third MV can be merged with NEAREST */
- if (mv->as_int == near_mvs[CNT_NEAREST].as_int)
- cnt[CNT_NEAREST] += 1;
- }
-
- cnt[CNT_SPLITMV] = ((above->mbmi.mode == SPLITMV)
- + (left->mbmi.mode == SPLITMV)) * 2
- + (
- lf_here->mbmi.mode == SPLITMV ||
- aboveleft->mbmi.mode == SPLITMV);
-
- /* Swap near and nearest if necessary */
- if (cnt[CNT_NEAR] > cnt[CNT_NEAREST]) {
- int tmp;
- tmp = cnt[CNT_NEAREST];
- cnt[CNT_NEAREST] = cnt[CNT_NEAR];
- cnt[CNT_NEAR] = tmp;
- tmp = near_mvs[CNT_NEAREST].as_int;
- near_mvs[CNT_NEAREST].as_int = near_mvs[CNT_NEAR].as_int;
- near_mvs[CNT_NEAR].as_int = tmp;
- }
-
- /* Use near_mvs[0] to store the "best" MV */
- if (cnt[CNT_NEAREST] >= cnt[CNT_INTRA])
- near_mvs[CNT_INTRA] = near_mvs[CNT_NEAREST];
-
- /* Set up return values */
- best_mv->as_int = near_mvs[0].as_int;
- nearest->as_int = near_mvs[CNT_NEAREST].as_int;
- nearby->as_int = near_mvs[CNT_NEAR].as_int;
-
- /* Make sure that the 1/8-pel bits of the MVs are zero if high precision
- * is not being used, by truncating the last bit towards 0
- */
- lower_mv_precision(best_mv, xd->allow_high_precision_mv);
- lower_mv_precision(nearest, xd->allow_high_precision_mv);
- lower_mv_precision(nearby, xd->allow_high_precision_mv);
-
- // TODO: move clamp outside findnearmv
- clamp_mv2(nearest, xd);
- clamp_mv2(nearby, xd);
- clamp_mv2(best_mv, xd);
-}
-
-vp9_prob *vp9_mv_ref_probs(VP9_COMMON *pc,
- vp9_prob p[VP9_MVREFS - 1], const int near_mv_ref_ct[4]
- ) {
- p[0] = pc->fc.vp8_mode_contexts [near_mv_ref_ct[0]] [0];
- p[1] = pc->fc.vp8_mode_contexts [near_mv_ref_ct[1]] [1];
- p[2] = pc->fc.vp8_mode_contexts [near_mv_ref_ct[2]] [2];
- p[3] = pc->fc.vp8_mode_contexts [near_mv_ref_ct[3]] [3];
- return p;
-}
-
-#if CONFIG_NEWBESTREFMV
-#define SP(x) (((x) & 7) << 1)
-unsigned int vp9_sad3x16_c(
- const unsigned char *src_ptr,
- int src_stride,
- const unsigned char *ref_ptr,
- int ref_stride,
- int max_sad) {
- return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, 3, 16);
-}
-unsigned int vp9_sad16x3_c(
- const unsigned char *src_ptr,
- int src_stride,
- const unsigned char *ref_ptr,
- int ref_stride,
- int max_sad) {
- return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, 16, 3);
-}
-
-/* check a list of motion vectors by SAD score, using a number of rows of
- * pixels above and a number of columns of pixels to the left, to select the
- * one with the best score to use as the reference motion vector
- */
-void vp9_find_best_ref_mvs(MACROBLOCKD *xd,
- unsigned char *ref_y_buffer,
- int ref_y_stride,
- int_mv *mvlist,
- int_mv *best_mv,
- int_mv *nearest,
- int_mv *near) {
- int i, j;
- unsigned char *above_src;
- unsigned char *left_src;
- unsigned char *above_ref;
- unsigned char *left_ref;
- int score;
- int sse;
- int ref_scores[MAX_MV_REFS] = {0};
- int_mv sorted_mvs[MAX_MV_REFS];
- int zero_seen = FALSE;
-
- // Default all to 0,0 if nothing else available
- best_mv->as_int = nearest->as_int = near->as_int = 0;
- vpx_memset(sorted_mvs, 0, sizeof(sorted_mvs));
-
-#if CONFIG_SUBPELREFMV
- above_src = xd->dst.y_buffer - xd->dst.y_stride * 2;
- left_src = xd->dst.y_buffer - 2;
- above_ref = ref_y_buffer - ref_y_stride * 2;
- left_ref = ref_y_buffer - 2;
-#else
- above_src = xd->dst.y_buffer - xd->dst.y_stride * 3;
- left_src = xd->dst.y_buffer - 3;
- above_ref = ref_y_buffer - ref_y_stride * 3;
- left_ref = ref_y_buffer - 3;
-#endif
-
- //for(i = 0; i < MAX_MV_REFS; ++i) {
- // Limit search to the predicted best 4
- for(i = 0; i < 4; ++i) {
- int_mv this_mv;
- int offset = 0;
- int row_offset, col_offset;
-
- this_mv.as_int = mvlist[i].as_int;
-
- // If we see a 0,0 vector for a second time we have reached the end of
- // the list of valid candidate vectors.
- if (!this_mv.as_int && zero_seen)
- break;
-
- zero_seen = zero_seen || !this_mv.as_int;
-
- clamp_mv(&this_mv,
- xd->mb_to_left_edge - LEFT_TOP_MARGIN + 24,
- xd->mb_to_right_edge + RIGHT_BOTTOM_MARGIN,
- xd->mb_to_top_edge - LEFT_TOP_MARGIN + 24,
- xd->mb_to_bottom_edge + RIGHT_BOTTOM_MARGIN);
-#if CONFIG_SUBPELREFMV
- row_offset = this_mv.as_mv.row >> 3;
- col_offset = this_mv.as_mv.col >> 3;
- offset = ref_y_stride * row_offset + col_offset;
- score = 0;
- if (xd->up_available) {
- vp9_sub_pixel_variance16x2_c(above_ref + offset, ref_y_stride,
- SP(this_mv.as_mv.col), SP(this_mv.as_mv.row),
- above_src, xd->dst.y_stride, &sse);
- score += sse;
- }
- if (xd->left_available) {
- vp9_sub_pixel_variance2x16_c(left_ref + offset, ref_y_stride,
- SP(this_mv.as_mv.col), SP(this_mv.as_mv.row),
- left_src, xd->dst.y_stride, &sse);
- score += sse;
- }
-#else
- row_offset = (this_mv.as_mv.row > 0) ?
- ((this_mv.as_mv.row + 3) >> 3):((this_mv.as_mv.row + 4) >> 3);
- col_offset = (this_mv.as_mv.col > 0) ?
- ((this_mv.as_mv.col + 3) >> 3):((this_mv.as_mv.col + 4) >> 3);
- offset = ref_y_stride * row_offset + col_offset;
- score = 0;
- if (xd->up_available) {
- score += vp9_sad16x3(above_src, xd->dst.y_stride,
- above_ref + offset, ref_y_stride, INT_MAX);
- }
- if (xd->left_available) {
- score += vp9_sad3x16(left_src, xd->dst.y_stride,
- left_ref + offset, ref_y_stride, INT_MAX);
- }
-#endif
- // Add the entry to our list and then resort the list on score.
- ref_scores[i] = score;
- sorted_mvs[i].as_int = this_mv.as_int;
- j = i;
- while (j > 0) {
- if (ref_scores[j] < ref_scores[j-1]) {
- ref_scores[j] = ref_scores[j-1];
- sorted_mvs[j].as_int = sorted_mvs[j-1].as_int;
- ref_scores[j-1] = score;
- sorted_mvs[j-1].as_int = this_mv.as_int;
- j--;
- } else
- break;
- }
- }
-
- // Make sure all the candidates are properly clamped etc
- for (i = 0; i < 4; ++i) {
- lower_mv_precision(&sorted_mvs[i], xd->allow_high_precision_mv);
- clamp_mv2(&sorted_mvs[i], xd);
- }
-
- // Set the best mv to the first entry in the sorted list
- best_mv->as_int = sorted_mvs[0].as_int;
-
- // Provided that there are non-zero vectors available there will not
- // be more than one 0,0 entry in the sorted list.
- // The best ref mv is always set to the first entry (which gave the best
- // results). The nearest is set to the first non-zero vector if available and
- // near to the second non-zero vector if available.
- // We do not use 0,0 as a nearest or near as 0,0 has its own mode.
- if ( sorted_mvs[0].as_int ) {
- nearest->as_int = sorted_mvs[0].as_int;
- if ( sorted_mvs[1].as_int )
- near->as_int = sorted_mvs[1].as_int;
- else
- near->as_int = sorted_mvs[2].as_int;
- } else {
- nearest->as_int = sorted_mvs[1].as_int;
- near->as_int = sorted_mvs[2].as_int;
- }
-
- // Copy back the re-ordered mv list
- vpx_memcpy(mvlist, sorted_mvs, sizeof(sorted_mvs));
-}
-
-#endif // CONFIG_NEWBESTREFMV
--- a/vp8/common/findnearmv.h
+++ /dev/null
@@ -1,188 +1,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#ifndef __INC_FINDNEARMV_H
-#define __INC_FINDNEARMV_H
-
-#include "mv.h"
-#include "blockd.h"
-#include "modecont.h"
-#include "treecoder.h"
-#include "onyxc_int.h"
-
-#if CONFIG_NEWBESTREFMV
-/* check a list of motion vectors by SAD score, using a number of rows of
- * pixels above and a number of columns of pixels to the left, to select the
- * one with the best score to use as the reference motion vector
- */
-void vp9_find_best_ref_mvs(MACROBLOCKD *xd,
- unsigned char *ref_y_buffer,
- int ref_y_stride,
- int_mv *mvlist,
- int_mv *best_mv,
- int_mv *nearest,
- int_mv *near);
-#endif
-
-static void mv_bias(int refmb_ref_frame_sign_bias, int refframe, int_mv *mvp, const int *ref_frame_sign_bias) {
- MV xmv;
- xmv = mvp->as_mv;
-
- if (refmb_ref_frame_sign_bias != ref_frame_sign_bias[refframe]) {
- xmv.row *= -1;
- xmv.col *= -1;
- }
-
- mvp->as_mv = xmv;
-}
-
-#define LEFT_TOP_MARGIN (16 << 3)
-#define RIGHT_BOTTOM_MARGIN (16 << 3)
-
-static void clamp_mv(int_mv *mv,
- int mb_to_left_edge,
- int mb_to_right_edge,
- int mb_to_top_edge,
- int mb_to_bottom_edge) {
- mv->as_mv.col = (mv->as_mv.col < mb_to_left_edge) ?
- mb_to_left_edge : mv->as_mv.col;
- mv->as_mv.col = (mv->as_mv.col > mb_to_right_edge) ?
- mb_to_right_edge : mv->as_mv.col;
- mv->as_mv.row = (mv->as_mv.row < mb_to_top_edge) ?
- mb_to_top_edge : mv->as_mv.row;
- mv->as_mv.row = (mv->as_mv.row > mb_to_bottom_edge) ?
- mb_to_bottom_edge : mv->as_mv.row;
-}
-
-static void clamp_mv2(int_mv *mv, const MACROBLOCKD *xd) {
- clamp_mv(mv,
- xd->mb_to_left_edge - LEFT_TOP_MARGIN,
- xd->mb_to_right_edge + RIGHT_BOTTOM_MARGIN,
- xd->mb_to_top_edge - LEFT_TOP_MARGIN,
- xd->mb_to_bottom_edge + RIGHT_BOTTOM_MARGIN);
-}
-
-static unsigned int check_mv_bounds(int_mv *mv,
- int mb_to_left_edge,
- int mb_to_right_edge,
- int mb_to_top_edge,
- int mb_to_bottom_edge) {
- return (mv->as_mv.col < mb_to_left_edge) ||
- (mv->as_mv.col > mb_to_right_edge) ||
- (mv->as_mv.row < mb_to_top_edge) ||
- (mv->as_mv.row > mb_to_bottom_edge);
-}
-
-void vp9_find_near_mvs(MACROBLOCKD *xd,
- const MODE_INFO *here,
- const MODE_INFO *lfhere,
- int_mv *nearest, int_mv *nearby, int_mv *best,
- int near_mv_ref_cts[4],
- int refframe,
- int *ref_frame_sign_bias);
-
-vp9_prob *vp9_mv_ref_probs(VP9_COMMON *pc,
- vp9_prob p[VP9_MVREFS - 1],
- const int near_mv_ref_ct[4]);
-
-extern const unsigned char vp9_mbsplit_offset[4][16];
-
-static int left_block_mv(const MODE_INFO *cur_mb, int b) {
- if (!(b & 3)) {
- /* On L edge, get from MB to left of us */
- --cur_mb;
-
- if (cur_mb->mbmi.mode != SPLITMV)
- return cur_mb->mbmi.mv[0].as_int;
- b += 4;
- }
-
- return (cur_mb->bmi + b - 1)->as_mv.first.as_int;
-}
-
-static int left_block_second_mv(const MODE_INFO *cur_mb, int b) {
- if (!(b & 3)) {
- /* On L edge, get from MB to left of us */
- --cur_mb;
-
- if (cur_mb->mbmi.mode != SPLITMV)
- return cur_mb->mbmi.second_ref_frame ? cur_mb->mbmi.mv[1].as_int : cur_mb->mbmi.mv[0].as_int;
- b += 4;
- }
-
- return cur_mb->mbmi.second_ref_frame ? (cur_mb->bmi + b - 1)->as_mv.second.as_int : (cur_mb->bmi + b - 1)->as_mv.first.as_int;
-}
-
-static int above_block_mv(const MODE_INFO *cur_mb, int b, int mi_stride) {
- if (!(b >> 2)) {
- /* On top edge, get from MB above us */
- cur_mb -= mi_stride;
-
- if (cur_mb->mbmi.mode != SPLITMV)
- return cur_mb->mbmi.mv[0].as_int;
- b += 16;
- }
-
- return (cur_mb->bmi + b - 4)->as_mv.first.as_int;
-}
-
-static int above_block_second_mv(const MODE_INFO *cur_mb, int b, int mi_stride) {
- if (!(b >> 2)) {
- /* On top edge, get from MB above us */
- cur_mb -= mi_stride;
-
- if (cur_mb->mbmi.mode != SPLITMV)
- return cur_mb->mbmi.second_ref_frame ? cur_mb->mbmi.mv[1].as_int : cur_mb->mbmi.mv[0].as_int;
- b += 16;
- }
-
- return cur_mb->mbmi.second_ref_frame ? (cur_mb->bmi + b - 4)->as_mv.second.as_int : (cur_mb->bmi + b - 4)->as_mv.first.as_int;
-}
-
-static B_PREDICTION_MODE left_block_mode(const MODE_INFO *cur_mb, int b) {
- if (!(b & 3)) {
- /* On L edge, get from MB to left of us */
- --cur_mb;
-
- if (cur_mb->mbmi.mode < I8X8_PRED) {
- return pred_mode_conv(cur_mb->mbmi.mode);
- } else if (cur_mb->mbmi.mode == I8X8_PRED) {
- return pred_mode_conv((cur_mb->bmi + 3 + b)->as_mode.first);
- } else if (cur_mb->mbmi.mode == B_PRED) {
- return ((cur_mb->bmi + 3 + b)->as_mode.first);
- } else {
- return B_DC_PRED;
- }
- }
- return (cur_mb->bmi + b - 1)->as_mode.first;
-}
-
-static B_PREDICTION_MODE above_block_mode(const MODE_INFO *cur_mb,
- int b, int mi_stride) {
- if (!(b >> 2)) {
- /* On top edge, get from MB above us */
- cur_mb -= mi_stride;
-
- if (cur_mb->mbmi.mode < I8X8_PRED) {
- return pred_mode_conv(cur_mb->mbmi.mode);
- } else if (cur_mb->mbmi.mode == I8X8_PRED) {
- return pred_mode_conv((cur_mb->bmi + 12 + b)->as_mode.first);
- } else if (cur_mb->mbmi.mode == B_PRED) {
- return ((cur_mb->bmi + 12 + b)->as_mode.first);
- } else {
- return B_DC_PRED;
- }
- }
-
- return (cur_mb->bmi + b - 4)->as_mode.first;
-}
-
-#endif
--- a/vp8/common/generic/systemdependent.c
+++ /dev/null
@@ -1,87 +1,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include "vpx_ports/config.h"
-#include "vpx_rtcd.h"
-#include "vp8/common/subpixel.h"
-#include "vp8/common/loopfilter.h"
-#include "vp8/common/idct.h"
-#include "vp8/common/onyxc_int.h"
-
-extern void vp9_arch_x86_common_init(VP9_COMMON *ctx);
-extern void vp9_arch_arm_common_init(VP9_COMMON *ctx);
-
-void vp9_machine_specific_config(VP9_COMMON *ctx) {
-#if CONFIG_RUNTIME_CPU_DETECT
- VP9_COMMON_RTCD *rtcd = &ctx->rtcd;
-
- rtcd->idct.idct1 = vp9_short_idct4x4llm_1_c;
- rtcd->idct.idct16 = vp9_short_idct4x4llm_c;
- rtcd->idct.idct1_scalar_add = vp9_dc_only_idct_add_c;
- rtcd->idct.iwalsh1 = vp9_short_inv_walsh4x4_1_c;
- rtcd->idct.iwalsh16 = vp9_short_inv_walsh4x4_c;
- rtcd->idct.idct8 = vp9_short_idct8x8_c;
- rtcd->idct.idct1_scalar_add_8x8 = vp9_dc_only_idct_add_8x8_c;
- rtcd->idct.ihaar2 = vp9_short_ihaar2x2_c;
- rtcd->idct.idct16x16 = vp9_short_idct16x16_c;
-
- rtcd->subpix.eighttap16x16 = vp9_eighttap_predict16x16_c;
- rtcd->subpix.eighttap8x8 = vp9_eighttap_predict8x8_c;
- rtcd->subpix.eighttap_avg16x16 = vp9_eighttap_predict_avg16x16_c;
- rtcd->subpix.eighttap_avg8x8 = vp9_eighttap_predict_avg8x8_c;
- rtcd->subpix.eighttap_avg4x4 = vp9_eighttap_predict_avg4x4_c;
- rtcd->subpix.eighttap8x4 = vp9_eighttap_predict8x4_c;
- rtcd->subpix.eighttap4x4 = vp9_eighttap_predict_c;
- rtcd->subpix.eighttap16x16_sharp = vp9_eighttap_predict16x16_sharp_c;
- rtcd->subpix.eighttap8x8_sharp = vp9_eighttap_predict8x8_sharp_c;
- rtcd->subpix.eighttap_avg16x16_sharp = vp9_eighttap_predict_avg16x16_sharp_c;
- rtcd->subpix.eighttap_avg8x8_sharp = vp9_eighttap_predict_avg8x8_sharp_c;
- rtcd->subpix.eighttap_avg4x4_sharp = vp9_eighttap_predict_avg4x4_sharp_c;
- rtcd->subpix.eighttap8x4_sharp = vp9_eighttap_predict8x4_sharp_c;
- rtcd->subpix.eighttap4x4_sharp = vp9_eighttap_predict_sharp_c;
-
- rtcd->subpix.sixtap16x16 = vp9_sixtap_predict16x16_c;
- rtcd->subpix.sixtap8x8 = vp9_sixtap_predict8x8_c;
- rtcd->subpix.sixtap_avg16x16 = vp9_sixtap_predict_avg16x16_c;
- rtcd->subpix.sixtap_avg8x8 = vp9_sixtap_predict_avg8x8_c;
- rtcd->subpix.sixtap8x4 = vp9_sixtap_predict8x4_c;
- rtcd->subpix.sixtap4x4 = vp9_sixtap_predict_c;
- rtcd->subpix.sixtap_avg4x4 = vp9_sixtap_predict_avg_c;
- rtcd->subpix.bilinear16x16 = vp9_bilinear_predict16x16_c;
- rtcd->subpix.bilinear8x8 = vp9_bilinear_predict8x8_c;
- rtcd->subpix.bilinear_avg16x16 = vp9_bilinear_predict_avg16x16_c;
- rtcd->subpix.bilinear_avg8x8 = vp9_bilinear_predict_avg8x8_c;
- rtcd->subpix.bilinear8x4 = vp9_bilinear_predict8x4_c;
- rtcd->subpix.bilinear4x4 = vp9_bilinear_predict4x4_c;
- rtcd->subpix.bilinear_avg4x4 = vp9_bilinear_predict_avg4x4_c;
-
-#if CONFIG_POSTPROC || (CONFIG_VP8_ENCODER && CONFIG_INTERNAL_STATS)
- rtcd->postproc.down = vp9_mbpost_proc_down_c;
- rtcd->postproc.across = vp9_mbpost_proc_across_ip_c;
- rtcd->postproc.downacross = vp9_post_proc_down_and_across_c;
- rtcd->postproc.addnoise = vp9_plane_add_noise_c;
- rtcd->postproc.blend_mb_inner = vp9_blend_mb_inner_c;
- rtcd->postproc.blend_mb_outer = vp9_blend_mb_outer_c;
- rtcd->postproc.blend_b = vp9_blend_b_c;
-#endif
-
-#endif
-
-#if ARCH_X86 || ARCH_X86_64
- vp9_arch_x86_common_init(ctx);
-#endif
-
-#if ARCH_ARM
- vp9_arch_arm_common_init(ctx);
-#endif
-
- vpx_rtcd();
-}
--- a/vp8/common/header.h
+++ /dev/null
@@ -1,42 +1,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#ifndef __INC_HEADER_H
-#define __INC_HEADER_H
-
-/* 24 bits total */
-typedef struct {
- unsigned int type: 1;
- unsigned int version: 3;
- unsigned int show_frame: 1;
-
- /* Allow 2^20 bytes = 8 megabits for first partition */
-
- unsigned int first_partition_length_in_bytes: 19;
-
-#ifdef PACKET_TESTING
- unsigned int frame_number;
- unsigned int update_gold: 1;
- unsigned int uses_gold: 1;
- unsigned int update_last: 1;
- unsigned int uses_last: 1;
-#endif
-
-} VP9_HEADER;
-
-#ifdef PACKET_TESTING
-#define VP9_HEADER_SIZE 8
-#else
-#define VP9_HEADER_SIZE 3
-#endif
-
-
-#endif
--- a/vp8/common/idct.h
+++ /dev/null
@@ -1,144 +1,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#ifndef __INC_IDCT_H
-#define __INC_IDCT_H
-
-#include "vp8/common/blockd.h"
-
-#define prototype_second_order(sym) \
- void sym(short *input, short *output)
-
-#define prototype_idct(sym) \
- void sym(short *input, short *output, int pitch)
-
-#define prototype_idct_scalar_add(sym) \
- void sym(short input, \
- unsigned char *pred, unsigned char *output, \
- int pitch, int stride)
-
-#if ARCH_X86 || ARCH_X86_64
-#include "x86/idct_x86.h"
-#endif
-
-#ifdef _MSC_VER
-/* TODO: remove these after integer implementations are done */
-#define M_PI 3.14159265358979323846
-#define round(x) (((x)>0)? floor((x)+0.5): ceil((x)-0.5))
-#endif
-
-
-#if ARCH_ARM
-#include "arm/idct_arm.h"
-#endif
-
-#if CONFIG_LOSSLESS
-#define WHT_UPSCALE_FACTOR 3
-#define Y2_WHT_UPSCALE_FACTOR 2
-#endif
-
-#ifndef vp9_idct_idct16x16
-#define vp9_idct_idct16x16 vp9_short_idct16x16_c
-#endif
-extern prototype_idct(vp9_idct_idct16x16);
-
-#ifndef vp9_idct_idct8
-#define vp9_idct_idct8 vp9_short_idct8x8_c
-#endif
-extern prototype_idct(vp9_idct_idct8);
-
-#ifndef vp9_idct_idct8_1
-#define vp9_idct_idct8_1 vp9_short_idct8x8_1_c
-#endif
-extern prototype_idct(vp9_idct_idct8_1);
-
-#ifndef vp9_idct_ihaar2
-#define vp9_idct_ihaar2 vp9_short_ihaar2x2_c
-#endif
-extern prototype_idct(vp9_idct_ihaar2);
-
-#ifndef vp9_idct_ihaar2_1
-#define vp9_idct_ihaar2_1 vp9_short_ihaar2x2_1_c
-#endif
-extern prototype_idct(vp9_idct_ihaar2_1);
-
-#ifndef vp9_idct_idct1_scalar_add_8x8
-#define vp9_idct_idct1_scalar_add_8x8 vp9_dc_only_idct_add_8x8_c
-#endif
-extern prototype_idct_scalar_add(vp9_idct_idct1_scalar_add_8x8);
-
-
-
-#ifndef vp9_idct_idct1
-#define vp9_idct_idct1 vp9_short_idct4x4llm_1_c
-#endif
-extern prototype_idct(vp9_idct_idct1);
-
-#ifndef vp9_idct_idct16
-#define vp9_idct_idct16 vp9_short_idct4x4llm_c
-#endif
-extern prototype_idct(vp9_idct_idct16);
-
-#ifndef vp9_idct_idct1_scalar_add
-#define vp9_idct_idct1_scalar_add vp9_dc_only_idct_add_c
-#endif
-extern prototype_idct_scalar_add(vp9_idct_idct1_scalar_add);
-
-
-#ifndef vp9_idct_iwalsh1
-#define vp9_idct_iwalsh1 vp9_short_inv_walsh4x4_1_c
-#endif
-extern prototype_second_order(vp9_idct_iwalsh1);
-
-#ifndef vp9_idct_iwalsh16
-#define vp9_idct_iwalsh16 vp9_short_inv_walsh4x4_c
-#endif
-extern prototype_second_order(vp9_idct_iwalsh16);
-
-#if CONFIG_LOSSLESS
-extern prototype_idct(vp9_short_inv_walsh4x4_x8_c);
-extern prototype_idct(vp9_short_inv_walsh4x4_1_x8_c);
-extern prototype_idct_scalar_add(vp9_dc_only_inv_walsh_add_c);
-extern prototype_second_order(vp9_short_inv_walsh4x4_lossless_c);
-extern prototype_second_order(vp9_short_inv_walsh4x4_1_lossless_c);
-#endif
-
-void vp9_ihtllm_c(const int16_t *input, int16_t *output, int pitch,
- TX_TYPE tx_type, int tx_dim);
-
-typedef prototype_idct((*vp9_idct_fn_t));
-typedef prototype_idct_scalar_add((*vp9_idct_scalar_add_fn_t));
-typedef prototype_second_order((*vp9_second_order_fn_t));
-
-typedef struct {
- vp9_idct_fn_t idct1;
- vp9_idct_fn_t idct16;
- vp9_idct_scalar_add_fn_t idct1_scalar_add;
-
- vp9_second_order_fn_t iwalsh1;
- vp9_second_order_fn_t iwalsh16;
-
- vp9_idct_fn_t idct8;
- vp9_idct_fn_t idct8_1;
- vp9_idct_scalar_add_fn_t idct1_scalar_add_8x8;
- vp9_idct_fn_t ihaar2;
- vp9_idct_fn_t ihaar2_1;
-
- vp9_idct_fn_t idct16x16;
-} vp9_idct_rtcd_vtable_t;
-
-#if CONFIG_RUNTIME_CPU_DETECT
-#define IDCT_INVOKE(ctx,fn) (ctx)->fn
-#else
-#define IDCT_INVOKE(ctx,fn) vp9_idct_##fn
-#endif
-
-#endif
--- a/vp8/common/idctllm.c
+++ /dev/null
@@ -1,1275 +1,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-
-/****************************************************************************
- * Notes:
- *
- * This implementation makes use of a 16 bit fixed point version of two multiply
- * constants:
- * 1. sqrt(2) * cos (pi/8)
- * 2. sqrt(2) * sin (pi/8)
- * Because the first constant is bigger than 1, to maintain the same 16 bit
- * fixed point precision as the second one, we use a trick of
- * x * a = x + x*(a-1)
- * so
- * x * sqrt(2) * cos (pi/8) = x + x * (sqrt(2) *cos(pi/8)-1).
- **************************************************************************/
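/* Editor's sketch of how the two fixed-point constants below are derived and
 * applied.  sqrt(2)*cos(pi/8) ~= 1.30656 and sqrt(2)*sin(pi/8) ~= 0.54120,
 * so in Q16 fixed point:
 *   cospi8sqrt2minus1 = round((1.30656 - 1) * 65536) = 20091
 *   sinpi8sqrt2       = round( 0.54120      * 65536) = 35468
 * Applying the x * a = x + x * (a - 1) trick, e.g. for x = 1000:
 *   x + ((x * 20091) >> 16) = 1000 + 306 = 1306   (~ 1000 * 1.30656)
 *   (x * 35468) >> 16       = 541                 (~ 1000 * 0.54120)
 */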
-#include <assert.h>
-#include <math.h>
-#include "vpx_ports/config.h"
-#include "vp8/common/idct.h"
-#include "vp8/common/systemdependent.h"
-
-#include "vp8/common/blockd.h"
-
-static const int cospi8sqrt2minus1 = 20091;
-static const int sinpi8sqrt2 = 35468;
-static const int rounding = 0;
-
-// TODO: these transforms can be further converted into integer forms
-// for complexity optimization
-static const float idct_4[16] = {
- 0.500000000000000, 0.653281482438188, 0.500000000000000, 0.270598050073099,
- 0.500000000000000, 0.270598050073099, -0.500000000000000, -0.653281482438188,
- 0.500000000000000, -0.270598050073099, -0.500000000000000, 0.653281482438188,
- 0.500000000000000, -0.653281482438188, 0.500000000000000, -0.270598050073099
-};
-
-static const float iadst_4[16] = {
- 0.228013428883779, 0.577350269189626, 0.656538502008139, 0.428525073124360,
- 0.428525073124360, 0.577350269189626, -0.228013428883779, -0.656538502008139,
- 0.577350269189626, 0, -0.577350269189626, 0.577350269189626,
- 0.656538502008139, -0.577350269189626, 0.428525073124359, -0.228013428883779
-};
-
-static const float idct_8[64] = {
- 0.353553390593274, 0.490392640201615, 0.461939766255643, 0.415734806151273,
- 0.353553390593274, 0.277785116509801, 0.191341716182545, 0.097545161008064,
- 0.353553390593274, 0.415734806151273, 0.191341716182545, -0.097545161008064,
- -0.353553390593274, -0.490392640201615, -0.461939766255643, -0.277785116509801,
- 0.353553390593274, 0.277785116509801, -0.191341716182545, -0.490392640201615,
- -0.353553390593274, 0.097545161008064, 0.461939766255643, 0.415734806151273,
- 0.353553390593274, 0.097545161008064, -0.461939766255643, -0.277785116509801,
- 0.353553390593274, 0.415734806151273, -0.191341716182545, -0.490392640201615,
- 0.353553390593274, -0.097545161008064, -0.461939766255643, 0.277785116509801,
- 0.353553390593274, -0.415734806151273, -0.191341716182545, 0.490392640201615,
- 0.353553390593274, -0.277785116509801, -0.191341716182545, 0.490392640201615,
- -0.353553390593274, -0.097545161008064, 0.461939766255643, -0.415734806151273,
- 0.353553390593274, -0.415734806151273, 0.191341716182545, 0.097545161008064,
- -0.353553390593274, 0.490392640201615, -0.461939766255643, 0.277785116509801,
- 0.353553390593274, -0.490392640201615, 0.461939766255643, -0.415734806151273,
- 0.353553390593274, -0.277785116509801, 0.191341716182545, -0.097545161008064
-};
-
-static const float iadst_8[64] = {
- 0.089131608307533, 0.255357107325376, 0.387095214016349, 0.466553967085785,
- 0.483002021635509, 0.434217976756762, 0.326790388032145, 0.175227946595735,
- 0.175227946595735, 0.434217976756762, 0.466553967085785, 0.255357107325376,
- -0.089131608307533, -0.387095214016348, -0.483002021635509, -0.326790388032145,
- 0.255357107325376, 0.483002021635509, 0.175227946595735, -0.326790388032145,
- -0.466553967085785, -0.089131608307533, 0.387095214016349, 0.434217976756762,
- 0.326790388032145, 0.387095214016349, -0.255357107325376, -0.434217976756762,
- 0.175227946595735, 0.466553967085786, -0.089131608307534, -0.483002021635509,
- 0.387095214016349, 0.175227946595735, -0.483002021635509, 0.089131608307533,
- 0.434217976756762, -0.326790388032145, -0.255357107325377, 0.466553967085785,
- 0.434217976756762, -0.089131608307533, -0.326790388032145, 0.483002021635509,
- -0.255357107325376, -0.175227946595735, 0.466553967085785, -0.387095214016348,
- 0.466553967085785, -0.326790388032145, 0.089131608307533, 0.175227946595735,
- -0.387095214016348, 0.483002021635509, -0.434217976756762, 0.255357107325376,
- 0.483002021635509, -0.466553967085785, 0.434217976756762, -0.387095214016348,
- 0.326790388032145, -0.255357107325375, 0.175227946595736, -0.089131608307532
-};
-
-static const int16_t idct_i4[16] = {
- 8192, 10703, 8192, 4433,
- 8192, 4433, -8192, -10703,
- 8192, -4433, -8192, 10703,
- 8192, -10703, 8192, -4433
-};
-
-static const int16_t iadst_i4[16] = {
- 3736, 9459, 10757, 7021,
- 7021, 9459, -3736, -10757,
- 9459, 0, -9459, 9459,
- 10757, -9459, 7021, -3736
-};
-
-static const int16_t idct_i8[64] = {
- 5793, 8035, 7568, 6811,
- 5793, 4551, 3135, 1598,
- 5793, 6811, 3135, -1598,
- -5793, -8035, -7568, -4551,
- 5793, 4551, -3135, -8035,
- -5793, 1598, 7568, 6811,
- 5793, 1598, -7568, -4551,
- 5793, 6811, -3135, -8035,
- 5793, -1598, -7568, 4551,
- 5793, -6811, -3135, 8035,
- 5793, -4551, -3135, 8035,
- -5793, -1598, 7568, -6811,
- 5793, -6811, 3135, 1598,
- -5793, 8035, -7568, 4551,
- 5793, -8035, 7568, -6811,
- 5793, -4551, 3135, -1598
-};
-
-static const int16_t iadst_i8[64] = {
- 1460, 4184, 6342, 7644,
- 7914, 7114, 5354, 2871,
- 2871, 7114, 7644, 4184,
- -1460, -6342, -7914, -5354,
- 4184, 7914, 2871, -5354,
- -7644, -1460, 6342, 7114,
- 5354, 6342, -4184, -7114,
- 2871, 7644, -1460, -7914,
- 6342, 2871, -7914, 1460,
- 7114, -5354, -4184, 7644,
- 7114, -1460, -5354, 7914,
- -4184, -2871, 7644, -6342,
- 7644, -5354, 1460, 2871,
- -6342, 7914, -7114, 4184,
- 7914, -7644, 7114, -6342,
- 5354, -4184, 2871, -1460
-};
-
-static float idct_16[256] = {
- 0.250000, 0.351851, 0.346760, 0.338330, 0.326641, 0.311806, 0.293969, 0.273300,
- 0.250000, 0.224292, 0.196424, 0.166664, 0.135299, 0.102631, 0.068975, 0.034654,
- 0.250000, 0.338330, 0.293969, 0.224292, 0.135299, 0.034654, -0.068975, -0.166664,
- -0.250000, -0.311806, -0.346760, -0.351851, -0.326641, -0.273300, -0.196424, -0.102631,
- 0.250000, 0.311806, 0.196424, 0.034654, -0.135299, -0.273300, -0.346760, -0.338330,
- -0.250000, -0.102631, 0.068975, 0.224292, 0.326641, 0.351851, 0.293969, 0.166664,
- 0.250000, 0.273300, 0.068975, -0.166664, -0.326641, -0.338330, -0.196424, 0.034654,
- 0.250000, 0.351851, 0.293969, 0.102631, -0.135299, -0.311806, -0.346760, -0.224292,
- 0.250000, 0.224292, -0.068975, -0.311806, -0.326641, -0.102631, 0.196424, 0.351851,
- 0.250000, -0.034654, -0.293969, -0.338330, -0.135299, 0.166664, 0.346760, 0.273300,
- 0.250000, 0.166664, -0.196424, -0.351851, -0.135299, 0.224292, 0.346760, 0.102631,
- -0.250000, -0.338330, -0.068975, 0.273300, 0.326641, 0.034654, -0.293969, -0.311806,
- 0.250000, 0.102631, -0.293969, -0.273300, 0.135299, 0.351851, 0.068975, -0.311806,
- -0.250000, 0.166664, 0.346760, 0.034654, -0.326641, -0.224292, 0.196424, 0.338330,
- 0.250000, 0.034654, -0.346760, -0.102631, 0.326641, 0.166664, -0.293969, -0.224292,
- 0.250000, 0.273300, -0.196424, -0.311806, 0.135299, 0.338330, -0.068975, -0.351851,
- 0.250000, -0.034654, -0.346760, 0.102631, 0.326641, -0.166664, -0.293969, 0.224292,
- 0.250000, -0.273300, -0.196424, 0.311806, 0.135299, -0.338330, -0.068975, 0.351851,
- 0.250000, -0.102631, -0.293969, 0.273300, 0.135299, -0.351851, 0.068975, 0.311806,
- -0.250000, -0.166664, 0.346760, -0.034654, -0.326641, 0.224292, 0.196424, -0.338330,
- 0.250000, -0.166664, -0.196424, 0.351851, -0.135299, -0.224292, 0.346760, -0.102631,
- -0.250000, 0.338330, -0.068975, -0.273300, 0.326641, -0.034654, -0.293969, 0.311806,
- 0.250000, -0.224292, -0.068975, 0.311806, -0.326641, 0.102631, 0.196424, -0.351851,
- 0.250000, 0.034654, -0.293969, 0.338330, -0.135299, -0.166664, 0.346760, -0.273300,
- 0.250000, -0.273300, 0.068975, 0.166664, -0.326641, 0.338330, -0.196424, -0.034654,
- 0.250000, -0.351851, 0.293969, -0.102631, -0.135299, 0.311806, -0.346760, 0.224292,
- 0.250000, -0.311806, 0.196424, -0.034654, -0.135299, 0.273300, -0.346760, 0.338330,
- -0.250000, 0.102631, 0.068975, -0.224292, 0.326641, -0.351851, 0.293969, -0.166664,
- 0.250000, -0.338330, 0.293969, -0.224292, 0.135299, -0.034654, -0.068975, 0.166664,
- -0.250000, 0.311806, -0.346760, 0.351851, -0.326641, 0.273300, -0.196424, 0.102631,
- 0.250000, -0.351851, 0.346760, -0.338330, 0.326641, -0.311806, 0.293969, -0.273300,
- 0.250000, -0.224292, 0.196424, -0.166664, 0.135299, -0.102631, 0.068975, -0.034654
-};
-
-static float iadst_16[256] = {
- 0.033094, 0.098087, 0.159534, 0.215215, 0.263118, 0.301511, 0.329007, 0.344612,
- 0.347761, 0.338341, 0.316693, 0.283599, 0.240255, 0.188227, 0.129396, 0.065889,
- 0.065889, 0.188227, 0.283599, 0.338341, 0.344612, 0.301511, 0.215215, 0.098087,
- -0.033094, -0.159534, -0.263118, -0.329007, -0.347761, -0.316693, -0.240255, -0.129396,
- 0.098087, 0.263118, 0.344612, 0.316693, 0.188227, 0.000000, -0.188227, -0.316693,
- -0.344612, -0.263118, -0.098087, 0.098087, 0.263118, 0.344612, 0.316693, 0.188227,
- 0.129396, 0.316693, 0.329007, 0.159534, -0.098087, -0.301511, -0.338341, -0.188227,
- 0.065889, 0.283599, 0.344612, 0.215215, -0.033094, -0.263118, -0.347761, -0.240255,
- 0.159534, 0.344612, 0.240255, -0.065889, -0.316693, -0.301511, -0.033094, 0.263118,
- 0.338341, 0.129396, -0.188227, -0.347761, -0.215215, 0.098087, 0.329007, 0.283599,
- 0.188227, 0.344612, 0.098087, -0.263118, -0.316693, -0.000000, 0.316693, 0.263118,
- -0.098087, -0.344612, -0.188227, 0.188227, 0.344612, 0.098087, -0.263118, -0.316693,
- 0.215215, 0.316693, -0.065889, -0.347761, -0.098087, 0.301511, 0.240255, -0.188227,
- -0.329007, 0.033094, 0.344612, 0.129396, -0.283599, -0.263118, 0.159534, 0.338341,
- 0.240255, 0.263118, -0.215215, -0.283599, 0.188227, 0.301511, -0.159534, -0.316693,
- 0.129396, 0.329007, -0.098087, -0.338341, 0.065889, 0.344612, -0.033094, -0.347761,
- 0.263118, 0.188227, -0.316693, -0.098087, 0.344612, 0.000000, -0.344612, 0.098087,
- 0.316693, -0.188227, -0.263118, 0.263118, 0.188227, -0.316693, -0.098087, 0.344612,
- 0.283599, 0.098087, -0.347761, 0.129396, 0.263118, -0.301511, -0.065889, 0.344612,
- -0.159534, -0.240255, 0.316693, 0.033094, -0.338341, 0.188227, 0.215215, -0.329007,
- 0.301511, 0.000000, -0.301511, 0.301511, 0.000000, -0.301511, 0.301511, 0.000000,
- -0.301511, 0.301511, 0.000000, -0.301511, 0.301511, 0.000000, -0.301511, 0.301511,
- 0.316693, -0.098087, -0.188227, 0.344612, -0.263118, -0.000000, 0.263118, -0.344612,
- 0.188227, 0.098087, -0.316693, 0.316693, -0.098087, -0.188227, 0.344612, -0.263118,
- 0.329007, -0.188227, -0.033094, 0.240255, -0.344612, 0.301511, -0.129396, -0.098087,
- 0.283599, -0.347761, 0.263118, -0.065889, -0.159534, 0.316693, -0.338341, 0.215215,
- 0.338341, -0.263118, 0.129396, 0.033094, -0.188227, 0.301511, -0.347761, 0.316693,
- -0.215215, 0.065889, 0.098087, -0.240255, 0.329007, -0.344612, 0.283599, -0.159534,
- 0.344612, -0.316693, 0.263118, -0.188227, 0.098087, 0.000000, -0.098087, 0.188227,
- -0.263118, 0.316693, -0.344612, 0.344612, -0.316693, 0.263118, -0.188227, 0.098087,
- 0.347761, -0.344612, 0.338341, -0.329007, 0.316693, -0.301511, 0.283599, -0.263118,
- 0.240255, -0.215215, 0.188227, -0.159534, 0.129396, -0.098087, 0.065889, -0.033094
-};
-
-static const int16_t idct_i16[256] = {
- 4096, 5765, 5681, 5543, 5352, 5109, 4816, 4478,
- 4096, 3675, 3218, 2731, 2217, 1682, 1130, 568,
- 4096, 5543, 4816, 3675, 2217, 568, -1130, -2731,
- -4096, -5109, -5681, -5765, -5352, -4478, -3218, -1682,
- 4096, 5109, 3218, 568, -2217, -4478, -5681, -5543,
- -4096, -1682, 1130, 3675, 5352, 5765, 4816, 2731,
- 4096, 4478, 1130, -2731, -5352, -5543, -3218, 568,
- 4096, 5765, 4816, 1682, -2217, -5109, -5681, -3675,
- 4096, 3675, -1130, -5109, -5352, -1682, 3218, 5765,
- 4096, -568, -4816, -5543, -2217, 2731, 5681, 4478,
- 4096, 2731, -3218, -5765, -2217, 3675, 5681, 1682,
- -4096, -5543, -1130, 4478, 5352, 568, -4816, -5109,
- 4096, 1682, -4816, -4478, 2217, 5765, 1130, -5109,
- -4096, 2731, 5681, 568, -5352, -3675, 3218, 5543,
- 4096, 568, -5681, -1682, 5352, 2731, -4816, -3675,
- 4096, 4478, -3218, -5109, 2217, 5543, -1130, -5765,
- 4096, -568, -5681, 1682, 5352, -2731, -4816, 3675,
- 4096, -4478, -3218, 5109, 2217, -5543, -1130, 5765,
- 4096, -1682, -4816, 4478, 2217, -5765, 1130, 5109,
- -4096, -2731, 5681, -568, -5352, 3675, 3218, -5543,
- 4096, -2731, -3218, 5765, -2217, -3675, 5681, -1682,
- -4096, 5543, -1130, -4478, 5352, -568, -4816, 5109,
- 4096, -3675, -1130, 5109, -5352, 1682, 3218, -5765,
- 4096, 568, -4816, 5543, -2217, -2731, 5681, -4478,
- 4096, -4478, 1130, 2731, -5352, 5543, -3218, -568,
- 4096, -5765, 4816, -1682, -2217, 5109, -5681, 3675,
- 4096, -5109, 3218, -568, -2217, 4478, -5681, 5543,
- -4096, 1682, 1130, -3675, 5352, -5765, 4816, -2731,
- 4096, -5543, 4816, -3675, 2217, -568, -1130, 2731,
- -4096, 5109, -5681, 5765, -5352, 4478, -3218, 1682,
- 4096, -5765, 5681, -5543, 5352, -5109, 4816, -4478,
- 4096, -3675, 3218, -2731, 2217, -1682, 1130, -568
-};
-
-static const int16_t iadst_i16[256] = {
- 542, 1607, 2614, 3526, 4311, 4940, 5390, 5646,
- 5698, 5543, 5189, 4646, 3936, 3084, 2120, 1080,
- 1080, 3084, 4646, 5543, 5646, 4940, 3526, 1607,
- -542, -2614, -4311, -5390, -5698, -5189, -3936, -2120,
- 1607, 4311, 5646, 5189, 3084, 0, -3084, -5189,
- -5646, -4311, -1607, 1607, 4311, 5646, 5189, 3084,
- 2120, 5189, 5390, 2614, -1607, -4940, -5543, -3084,
- 1080, 4646, 5646, 3526, -542, -4311, -5698, -3936,
- 2614, 5646, 3936, -1080, -5189, -4940, -542, 4311,
- 5543, 2120, -3084, -5698, -3526, 1607, 5390, 4646,
- 3084, 5646, 1607, -4311, -5189, 0, 5189, 4311,
- -1607, -5646, -3084, 3084, 5646, 1607, -4311, -5189,
- 3526, 5189, -1080, -5698, -1607, 4940, 3936, -3084,
- -5390, 542, 5646, 2120, -4646, -4311, 2614, 5543,
- 3936, 4311, -3526, -4646, 3084, 4940, -2614, -5189,
- 2120, 5390, -1607, -5543, 1080, 5646, -542, -5698,
- 4311, 3084, -5189, -1607, 5646, 0, -5646, 1607,
- 5189, -3084, -4311, 4311, 3084, -5189, -1607, 5646,
- 4646, 1607, -5698, 2120, 4311, -4940, -1080, 5646,
- -2614, -3936, 5189, 542, -5543, 3084, 3526, -5390,
- 4940, 0, -4940, 4940, 0, -4940, 4940, 0,
- -4940, 4940, 0, -4940, 4940, 0, -4940, 4940,
- 5189, -1607, -3084, 5646, -4311, 0, 4311, -5646,
- 3084, 1607, -5189, 5189, -1607, -3084, 5646, -4311,
- 5390, -3084, -542, 3936, -5646, 4940, -2120, -1607,
- 4646, -5698, 4311, -1080, -2614, 5189, -5543, 3526,
- 5543, -4311, 2120, 542, -3084, 4940, -5698, 5189,
- -3526, 1080, 1607, -3936, 5390, -5646, 4646, -2614,
- 5646, -5189, 4311, -3084, 1607, 0, -1607, 3084,
- -4311, 5189, -5646, 5646, -5189, 4311, -3084, 1607,
- 5698, -5646, 5543, -5390, 5189, -4940, 4646, -4311,
- 3936, -3526, 3084, -2614, 2120, -1607, 1080, -542
-};
-
-/* For test */
-#define TEST_INT 1
-#if TEST_INT
-#define vp9_ihtllm_int_c vp9_ihtllm_c
-#else
-#define vp9_ihtllm_float_c vp9_ihtllm_c
-#endif
-
-void vp9_ihtllm_float_c(const int16_t *input, int16_t *output, int pitch,
- TX_TYPE tx_type, int tx_dim) {
- vp9_clear_system_state(); // Make it simd safe : __asm emms;
- {
- int i, j, k;
- float bufa[256], bufb[256]; // buffers are for floating-point test purpose
- // the implementation could be simplified in
- // conjunction with integer transform
- const int16_t *ip = input;
- int16_t *op = output;
- int shortpitch = pitch >> 1;
-
- float *pfa = &bufa[0];
- float *pfb = &bufb[0];
-
- // pointers to vertical and horizontal transforms
- const float *ptv, *pth;
-
- assert(tx_type != DCT_DCT);
- // load and convert residual array into floating-point
- for(j = 0; j < tx_dim; j++) {
- for(i = 0; i < tx_dim; i++) {
- pfa[i] = (float)ip[i];
- }
- pfa += tx_dim;
- ip += tx_dim;
- }
-
- // vertical transformation
- pfa = &bufa[0];
- pfb = &bufb[0];
-
- switch(tx_type) {
- case ADST_ADST :
- case ADST_DCT :
- ptv = (tx_dim == 4) ? &iadst_4[0] :
- ((tx_dim == 8) ? &iadst_8[0] : &iadst_16[0]);
- break;
-
- default :
- ptv = (tx_dim == 4) ? &idct_4[0] :
- ((tx_dim == 8) ? &idct_8[0] : &idct_16[0]);
- break;
- }
-
- for(j = 0; j < tx_dim; j++) {
- for(i = 0; i < tx_dim; i++) {
- pfb[i] = 0 ;
- for(k = 0; k < tx_dim; k++) {
- pfb[i] += ptv[k] * pfa[(k * tx_dim)];
- }
- pfa += 1;
- }
-
- pfb += tx_dim;
- ptv += tx_dim;
- pfa = &bufa[0];
- }
-
- // horizontal transformation
- pfa = &bufa[0];
- pfb = &bufb[0];
-
- switch(tx_type) {
- case ADST_ADST :
- case DCT_ADST :
- pth = (tx_dim == 4) ? &iadst_4[0] :
- ((tx_dim == 8) ? &iadst_8[0] : &iadst_16[0]);
- break;
-
- default :
- pth = (tx_dim == 4) ? &idct_4[0] :
- ((tx_dim == 8) ? &idct_8[0] : &idct_16[0]);
- break;
- }
-
- for(j = 0; j < tx_dim; j++) {
- for(i = 0; i < tx_dim; i++) {
- pfa[i] = 0;
- for(k = 0; k < tx_dim; k++) {
- pfa[i] += pfb[k] * pth[k];
- }
- pth += tx_dim;
- }
-
- pfa += tx_dim;
- pfb += tx_dim;
-
- switch(tx_type) {
- case ADST_ADST :
- case DCT_ADST :
- pth = (tx_dim == 4) ? &iadst_4[0] :
- ((tx_dim == 8) ? &iadst_8[0] : &iadst_16[0]);
- break;
-
- default :
- pth = (tx_dim == 4) ? &idct_4[0] :
- ((tx_dim == 8) ? &idct_8[0] : &idct_16[0]);
- break;
- }
- }
-
- // convert to short integer format and load BLOCKD buffer
- op = output;
- pfa = &bufa[0];
-
- for(j = 0; j < tx_dim; j++) {
- for(i = 0; i < tx_dim; i++) {
- op[i] = (pfa[i] > 0 ) ? (int16_t)( pfa[i] / 8 + 0.49) :
- -(int16_t)( - pfa[i] / 8 + 0.49);
- }
-
- op += shortpitch;
- pfa += tx_dim;
- }
- }
- vp9_clear_system_state(); // Make it simd safe : __asm emms;
-}
-
-/* Converted the transforms to integer form. */
-#define VERTICAL_SHIFT 14 // 16
-#define VERTICAL_ROUNDING ((1 << (VERTICAL_SHIFT - 1)) - 1)
-#define HORIZONTAL_SHIFT 17 // 15
-#define HORIZONTAL_ROUNDING ((1 << (HORIZONTAL_SHIFT - 1)) - 1)
-void vp9_ihtllm_int_c(const int16_t *input, int16_t *output, int pitch,
- TX_TYPE tx_type, int tx_dim) {
- int i, j, k;
- int16_t imbuf[256];
-
- const int16_t *ip = input;
- int16_t *op = output;
- int16_t *im = &imbuf[0];
-
- /* pointers to vertical and horizontal transforms. */
- const int16_t *ptv = NULL, *pth = NULL;
- int shortpitch = pitch >> 1;
-
- switch (tx_type) {
- case ADST_ADST :
- ptv = pth = (tx_dim == 4) ? &iadst_i4[0]
- : ((tx_dim == 8) ? &iadst_i8[0]
- : &iadst_i16[0]);
- break;
- case ADST_DCT :
- ptv = (tx_dim == 4) ? &iadst_i4[0]
- : ((tx_dim == 8) ? &iadst_i8[0] : &iadst_i16[0]);
- pth = (tx_dim == 4) ? &idct_i4[0]
- : ((tx_dim == 8) ? &idct_i8[0] : &idct_i16[0]);
- break;
- case DCT_ADST :
- ptv = (tx_dim == 4) ? &idct_i4[0]
- : ((tx_dim == 8) ? &idct_i8[0] : &idct_i16[0]);
- pth = (tx_dim == 4) ? &iadst_i4[0]
- : ((tx_dim == 8) ? &iadst_i8[0] : &iadst_i16[0]);
- break;
- case DCT_DCT :
- ptv = pth = (tx_dim == 4) ? &idct_i4[0]
- : ((tx_dim == 8) ? &idct_i8[0]
- : &idct_i16[0]);
- break;
- default:
- assert(0);
- break;
- }
-
- /* vertical transformation */
- for (j = 0; j < tx_dim; j++) {
- for (i = 0; i < tx_dim; i++) {
- int temp = 0;
-
- for (k = 0; k < tx_dim; k++) {
- temp += ptv[k] * ip[(k * tx_dim)];
- }
-
- im[i] = (int16_t)((temp + VERTICAL_ROUNDING) >> VERTICAL_SHIFT);
- ip++;
- }
- im += tx_dim; // 16
- ptv += tx_dim;
- ip = input;
- }
-
- /* horizontal transformation */
- im = &imbuf[0];
-
- for (j = 0; j < tx_dim; j++) {
- const int16_t *pthc = pth;
-
- for (i = 0; i < tx_dim; i++) {
- int temp = 0;
-
- for (k = 0; k < tx_dim; k++) {
- temp += im[k] * pthc[k];
- }
-
- op[i] = (int16_t)((temp + HORIZONTAL_ROUNDING) >> HORIZONTAL_SHIFT);
- pthc += tx_dim;
- }
-
- im += tx_dim; // 16
- op += shortpitch;
- }
-}
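
The integer path above is a plain separable matrix multiply: a vertical pass against ptv with a 14-bit rounded shift, then a horizontal pass against pth with a 17-bit rounded shift. A minimal sketch of that rounding idiom as a standalone helper (round_shift is a hypothetical name, not part of the deleted file):

#include <stdint.h>

/* Round-then-shift, the same (temp + ((1 << (shift - 1)) - 1)) >> shift
 * idiom used by the vertical (shift = 14) and horizontal (shift = 17)
 * passes above. */
static int16_t round_shift(int value, int shift) {
  return (int16_t)((value + ((1 << (shift - 1)) - 1)) >> shift);
}

With it, the vertical pass would read im[i] = round_shift(temp, VERTICAL_SHIFT).
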
-
-void vp9_short_idct4x4llm_c(short *input, short *output, int pitch) {
- int i;
- int a1, b1, c1, d1;
-
- short *ip = input;
- short *op = output;
- int temp1, temp2;
- int shortpitch = pitch >> 1;
-
- for (i = 0; i < 4; i++) {
- a1 = ip[0] + ip[8];
- b1 = ip[0] - ip[8];
-
- temp1 = (ip[4] * sinpi8sqrt2 + rounding) >> 16;
- temp2 = ip[12] + ((ip[12] * cospi8sqrt2minus1 + rounding) >> 16);
- c1 = temp1 - temp2;
-
- temp1 = ip[4] + ((ip[4] * cospi8sqrt2minus1 + rounding) >> 16);
- temp2 = (ip[12] * sinpi8sqrt2 + rounding) >> 16;
- d1 = temp1 + temp2;
-
- op[shortpitch * 0] = a1 + d1;
- op[shortpitch * 3] = a1 - d1;
-
- op[shortpitch * 1] = b1 + c1;
- op[shortpitch * 2] = b1 - c1;
-
- ip++;
- op++;
- }
-
- ip = output;
- op = output;
-
- for (i = 0; i < 4; i++) {
- a1 = ip[0] + ip[2];
- b1 = ip[0] - ip[2];
-
- temp1 = (ip[1] * sinpi8sqrt2 + rounding) >> 16;
- temp2 = ip[3] + ((ip[3] * cospi8sqrt2minus1 + rounding) >> 16);
- c1 = temp1 - temp2;
-
- temp1 = ip[1] + ((ip[1] * cospi8sqrt2minus1 + rounding) >> 16);
- temp2 = (ip[3] * sinpi8sqrt2 + rounding) >> 16;
- d1 = temp1 + temp2;
-
- op[0] = (a1 + d1 + 16) >> 5;
- op[3] = (a1 - d1 + 16) >> 5;
-
- op[1] = (b1 + c1 + 16) >> 5;
- op[2] = (b1 - c1 + 16) >> 5;
-
- ip += shortpitch;
- op += shortpitch;
- }
-}
-
-void vp9_short_idct4x4llm_1_c(short *input, short *output, int pitch) {
- int i;
- int a1;
- short *op = output;
- int shortpitch = pitch >> 1;
- a1 = ((input[0] + 16) >> 5);
- for (i = 0; i < 4; i++) {
- op[0] = a1;
- op[1] = a1;
- op[2] = a1;
- op[3] = a1;
- op += shortpitch;
- }
-}
-
-void vp9_dc_only_idct_add_c(short input_dc, unsigned char *pred_ptr,
- unsigned char *dst_ptr, int pitch, int stride) {
- int a1 = ((input_dc + 16) >> 5);
- int r, c;
-
- for (r = 0; r < 4; r++) {
- for (c = 0; c < 4; c++) {
- int a = a1 + pred_ptr[c];
-
- if (a < 0)
- a = 0;
-
- if (a > 255)
- a = 255;
-
- dst_ptr[c] = (unsigned char) a;
- }
-
- dst_ptr += stride;
- pred_ptr += pitch;
- }
-}
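
The DC-only path reduces the whole 4x4 inverse transform to one rounded shift, (input_dc + 16) >> 5 (divide by 32 with rounding), followed by clamping prediction plus DC into the 8-bit range. A minimal standalone sketch of that add-and-clamp step, assuming hypothetical helpers clip_pixel() and dc_only_add_4x4() that are not in the original file:

#include <stdint.h>

/* Clamp an integer to the valid 8-bit pixel range. */
static uint8_t clip_pixel(int v) {
  return (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));
}

/* Add one reconstructed DC value to a 4x4 prediction block. */
static void dc_only_add_4x4(int16_t input_dc, const uint8_t *pred, int pitch,
                            uint8_t *dst, int stride) {
  const int dc = (input_dc + 16) >> 5;  /* rounded divide by 32 */
  int r, c;

  for (r = 0; r < 4; r++) {
    for (c = 0; c < 4; c++)
      dst[c] = clip_pixel(pred[c] + dc);
    pred += pitch;
    dst += stride;
  }
}
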
-
-void vp9_short_inv_walsh4x4_c(short *input, short *output) {
- int i;
- int a1, b1, c1, d1;
- short *ip = input;
- short *op = output;
-
- for (i = 0; i < 4; i++) {
- a1 = ((ip[0] + ip[3]));
- b1 = ((ip[1] + ip[2]));
- c1 = ((ip[1] - ip[2]));
- d1 = ((ip[0] - ip[3]));
-
- op[0] = (a1 + b1 + 1) >> 1;
- op[1] = (c1 + d1) >> 1;
- op[2] = (a1 - b1) >> 1;
- op[3] = (d1 - c1) >> 1;
-
- ip += 4;
- op += 4;
- }
-
- ip = output;
- op = output;
- for (i = 0; i < 4; i++) {
- a1 = ip[0] + ip[12];
- b1 = ip[4] + ip[8];
- c1 = ip[4] - ip[8];
- d1 = ip[0] - ip[12];
- op[0] = (a1 + b1 + 1) >> 1;
- op[4] = (c1 + d1) >> 1;
- op[8] = (a1 - b1) >> 1;
- op[12] = (d1 - c1) >> 1;
- ip++;
- op++;
- }
-}
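
Both passes of the 4x4 inverse Walsh-Hadamard above run the same 4-point butterfly, with a >> 1 normalisation per pass and a +1 rounding bias on the first output only. A one-dimensional sketch of that butterfly (iwht4_1d is a hypothetical name, for illustration only):

/* One 4-point inverse Walsh-Hadamard butterfly, normalised by >> 1.
 * All inputs are read before any output is written, so in and out may
 * safely point at the same storage. */
static void iwht4_1d(const short in[4], short out[4]) {
  const int a1 = in[0] + in[3];
  const int b1 = in[1] + in[2];
  const int c1 = in[1] - in[2];
  const int d1 = in[0] - in[3];

  out[0] = (short)((a1 + b1 + 1) >> 1);
  out[1] = (short)((c1 + d1) >> 1);
  out[2] = (short)((a1 - b1) >> 1);
  out[3] = (short)((d1 - c1) >> 1);
}
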
-
-void vp9_short_inv_walsh4x4_1_c(short *in, short *out) {
- int i;
- short tmp[4];
- short *ip = in;
- short *op = tmp;
-
- op[0] = (ip[0] + 1) >> 1;
- op[1] = op[2] = op[3] = (ip[0] >> 1);
-
- ip = tmp;
- op = out;
- for (i = 0; i < 4; i++) {
- op[0] = (ip[0] + 1) >> 1;
- op[4] = op[8] = op[12] = (ip[0] >> 1);
- ip++;
- op++;
- }
-}
-
-#if CONFIG_LOSSLESS
-void vp9_short_inv_walsh4x4_lossless_c(short *input, short *output) {
- int i;
- int a1, b1, c1, d1;
- short *ip = input;
- short *op = output;
-
- for (i = 0; i < 4; i++) {
- a1 = ((ip[0] + ip[3])) >> Y2_WHT_UPSCALE_FACTOR;
- b1 = ((ip[1] + ip[2])) >> Y2_WHT_UPSCALE_FACTOR;
- c1 = ((ip[1] - ip[2])) >> Y2_WHT_UPSCALE_FACTOR;
- d1 = ((ip[0] - ip[3])) >> Y2_WHT_UPSCALE_FACTOR;
-
- op[0] = (a1 + b1 + 1) >> 1;
- op[1] = (c1 + d1) >> 1;
- op[2] = (a1 - b1) >> 1;
- op[3] = (d1 - c1) >> 1;
-
- ip += 4;
- op += 4;
- }
-
- ip = output;
- op = output;
- for (i = 0; i < 4; i++) {
- a1 = ip[0] + ip[12];
- b1 = ip[4] + ip[8];
- c1 = ip[4] - ip[8];
- d1 = ip[0] - ip[12];
-
-
- op[0] = ((a1 + b1 + 1) >> 1) << Y2_WHT_UPSCALE_FACTOR;
- op[4] = ((c1 + d1) >> 1) << Y2_WHT_UPSCALE_FACTOR;
- op[8] = ((a1 - b1) >> 1) << Y2_WHT_UPSCALE_FACTOR;
- op[12] = ((d1 - c1) >> 1) << Y2_WHT_UPSCALE_FACTOR;
-
- ip++;
- op++;
- }
-}
-
-void vp9_short_inv_walsh4x4_1_lossless_c(short *in, short *out) {
- int i;
- short tmp[4];
- short *ip = in;
- short *op = tmp;
-
- op[0] = ((ip[0] >> Y2_WHT_UPSCALE_FACTOR) + 1) >> 1;
- op[1] = op[2] = op[3] = ((ip[0] >> Y2_WHT_UPSCALE_FACTOR) >> 1);
-
- ip = tmp;
- op = out;
- for (i = 0; i < 4; i++) {
- op[0] = ((ip[0] + 1) >> 1) << Y2_WHT_UPSCALE_FACTOR;
- op[4] = op[8] = op[12] = ((ip[0] >> 1)) << Y2_WHT_UPSCALE_FACTOR;
- ip++;
- op++;
- }
-}
-
-void vp9_short_inv_walsh4x4_x8_c(short *input, short *output, int pitch) {
- int i;
- int a1, b1, c1, d1;
- short *ip = input;
- short *op = output;
- int shortpitch = pitch >> 1;
-
- for (i = 0; i < 4; i++) {
- a1 = ((ip[0] + ip[3])) >> WHT_UPSCALE_FACTOR;
- b1 = ((ip[1] + ip[2])) >> WHT_UPSCALE_FACTOR;
- c1 = ((ip[1] - ip[2])) >> WHT_UPSCALE_FACTOR;
- d1 = ((ip[0] - ip[3])) >> WHT_UPSCALE_FACTOR;
-
- op[0] = (a1 + b1 + 1) >> 1;
- op[1] = (c1 + d1) >> 1;
- op[2] = (a1 - b1) >> 1;
- op[3] = (d1 - c1) >> 1;
-
- ip += 4;
- op += shortpitch;
- }
-
- ip = output;
- op = output;
- for (i = 0; i < 4; i++) {
- a1 = ip[shortpitch * 0] + ip[shortpitch * 3];
- b1 = ip[shortpitch * 1] + ip[shortpitch * 2];
- c1 = ip[shortpitch * 1] - ip[shortpitch * 2];
- d1 = ip[shortpitch * 0] - ip[shortpitch * 3];
-
-
- op[shortpitch * 0] = (a1 + b1 + 1) >> 1;
- op[shortpitch * 1] = (c1 + d1) >> 1;
- op[shortpitch * 2] = (a1 - b1) >> 1;
- op[shortpitch * 3] = (d1 - c1) >> 1;
-
- ip++;
- op++;
- }
-}
-
-void vp9_short_inv_walsh4x4_1_x8_c(short *in, short *out, int pitch) {
- int i;
- short tmp[4];
- short *ip = in;
- short *op = tmp;
- int shortpitch = pitch >> 1;
-
- op[0] = ((ip[0] >> WHT_UPSCALE_FACTOR) + 1) >> 1;
- op[1] = op[2] = op[3] = ((ip[0] >> WHT_UPSCALE_FACTOR) >> 1);
-
-
- ip = tmp;
- op = out;
- for (i = 0; i < 4; i++) {
- op[shortpitch * 0] = (ip[0] + 1) >> 1;
- op[shortpitch * 1] = op[shortpitch * 2] = op[shortpitch * 3] = ip[0] >> 1;
- ip++;
- op++;
- }
-}
-
-void vp9_dc_only_inv_walsh_add_c(short input_dc, unsigned char *pred_ptr,
- unsigned char *dst_ptr,
- int pitch, int stride) {
- int r, c;
- short tmp[16];
- vp9_short_inv_walsh4x4_1_x8_c(&input_dc, tmp, 4 << 1);
-
- for (r = 0; r < 4; r++) {
- for (c = 0; c < 4; c++) {
- int a = tmp[r * 4 + c] + pred_ptr[c];
- if (a < 0)
- a = 0;
-
- if (a > 255)
- a = 255;
-
- dst_ptr[c] = (unsigned char) a;
- }
-
- dst_ptr += stride;
- pred_ptr += pitch;
- }
-}
-#endif
-
-void vp9_dc_only_idct_add_8x8_c(short input_dc,
- unsigned char *pred_ptr,
- unsigned char *dst_ptr,
- int pitch, int stride) {
- int a1 = ((input_dc + 16) >> 5);
- int r, c, b;
- unsigned char *orig_pred = pred_ptr;
- unsigned char *orig_dst = dst_ptr;
- for (b = 0; b < 4; b++) {
- for (r = 0; r < 4; r++) {
- for (c = 0; c < 4; c++) {
- int a = a1 + pred_ptr[c];
-
- if (a < 0)
- a = 0;
-
- if (a > 255)
- a = 255;
-
- dst_ptr[c] = (unsigned char) a;
- }
-
- dst_ptr += stride;
- pred_ptr += pitch;
- }
- dst_ptr = orig_dst + (b + 1) % 2 * 4 + (b + 1) / 2 * 4 * stride;
- pred_ptr = orig_pred + (b + 1) % 2 * 4 + (b + 1) / 2 * 4 * pitch;
- }
-}
-
-#define W1 2841 /* 2048*sqrt(2)*cos(1*pi/16) */
-#define W2 2676 /* 2048*sqrt(2)*cos(2*pi/16) */
-#define W3 2408 /* 2048*sqrt(2)*cos(3*pi/16) */
-#define W5 1609 /* 2048*sqrt(2)*cos(5*pi/16) */
-#define W6 1108 /* 2048*sqrt(2)*cos(6*pi/16) */
-#define W7 565 /* 2048*sqrt(2)*cos(7*pi/16) */
-
-/* row (horizontal) IDCT
- *
- *            7
- *   dst[k] = sum c[l] * src[l] * cos( pi/8 * ( k + 1/2 ) * l )
- *            l=0
- *
- * where: c[0] = 128, c[1..7] = 128*sqrt(2) */
-
-static void idctrow(int *blk) {
- int x0, x1, x2, x3, x4, x5, x6, x7, x8;
- /* shortcut */
- if (!((x1 = blk[4] << 11) | (x2 = blk[6]) | (x3 = blk[2]) |
- (x4 = blk[1]) | (x5 = blk[7]) | (x6 = blk[5]) | (x7 = blk[3]))) {
- blk[0] = blk[1] = blk[2] = blk[3] = blk[4]
- = blk[5] = blk[6] = blk[7] = blk[0] << 3;
- return;
- }
-
- x0 = (blk[0] << 11) + 128; /* for proper rounding in the fourth stage */
- /* first stage */
- x8 = W7 * (x4 + x5);
- x4 = x8 + (W1 - W7) * x4;
- x5 = x8 - (W1 + W7) * x5;
- x8 = W3 * (x6 + x7);
- x6 = x8 - (W3 - W5) * x6;
- x7 = x8 - (W3 + W5) * x7;
-
- /* second stage */
- x8 = x0 + x1;
- x0 -= x1;
- x1 = W6 * (x3 + x2);
- x2 = x1 - (W2 + W6) * x2;
- x3 = x1 + (W2 - W6) * x3;
- x1 = x4 + x6;
- x4 -= x6;
- x6 = x5 + x7;
- x5 -= x7;
-
- /* third stage */
- x7 = x8 + x3;
- x8 -= x3;
- x3 = x0 + x2;
- x0 -= x2;
- x2 = (181 * (x4 + x5) + 128) >> 8;
- x4 = (181 * (x4 - x5) + 128) >> 8;
-
- /* fourth stage */
- blk[0] = (x7 + x1) >> 8;
- blk[1] = (x3 + x2) >> 8;
- blk[2] = (x0 + x4) >> 8;
- blk[3] = (x8 + x6) >> 8;
- blk[4] = (x8 - x6) >> 8;
- blk[5] = (x0 - x4) >> 8;
- blk[6] = (x3 - x2) >> 8;
- blk[7] = (x7 - x1) >> 8;
-}
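
The W1..W7 constants used by idctrow()/idctcol() are fixed-point cosine weights, round(2048 * sqrt(2) * cos(k*pi/16)). A small sketch that derives them (assumes a hosted C environment with math.h; not part of the deleted file):

#include <math.h>
#include <stdio.h>

/* Reproduce the fixed-point weights used above:
 * Wk = round(2048 * sqrt(2) * cos(k * pi / 16)), k = 1, 2, 3, 5, 6, 7. */
int main(void) {
  const double pi = 3.14159265358979323846;
  const int k[6] = { 1, 2, 3, 5, 6, 7 };
  double w;
  int i;

  for (i = 0; i < 6; i++) {
    w = 2048.0 * sqrt(2.0) * cos(k[i] * pi / 16.0);
    printf("W%d = %d\n", k[i], (int)floor(w + 0.5));
  }
  return 0;  /* prints 2841, 2676, 2408, 1609, 1108 and 565 */
}
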
-
-/* column (vertical) IDCT
- *
- *              7
- *   dst[8*k] = sum c[l] * src[8*l] * cos( pi/8 * ( k + 1/2 ) * l )
- *              l=0
- *
- * where: c[0] = 1/1024, c[1..7] = (1/1024)*sqrt(2) */
-static void idctcol(int *blk) {
- int x0, x1, x2, x3, x4, x5, x6, x7, x8;
-
- /* shortcut */
- if (!((x1 = (blk[8 * 4] << 8)) | (x2 = blk[8 * 6]) | (x3 = blk[8 * 2]) |
- (x4 = blk[8 * 1]) | (x5 = blk[8 * 7]) | (x6 = blk[8 * 5]) |
- (x7 = blk[8 * 3]))) {
- blk[8 * 0] = blk[8 * 1] = blk[8 * 2] = blk[8 * 3]
- = blk[8 * 4] = blk[8 * 5] = blk[8 * 6]
- = blk[8 * 7] = ((blk[8 * 0] + 32) >> 6);
- return;
- }
-
- x0 = (blk[8 * 0] << 8) + 16384;
-
- /* first stage */
- x8 = W7 * (x4 + x5) + 4;
- x4 = (x8 + (W1 - W7) * x4) >> 3;
- x5 = (x8 - (W1 + W7) * x5) >> 3;
- x8 = W3 * (x6 + x7) + 4;
- x6 = (x8 - (W3 - W5) * x6) >> 3;
- x7 = (x8 - (W3 + W5) * x7) >> 3;
-
- /* second stage */
- x8 = x0 + x1;
- x0 -= x1;
- x1 = W6 * (x3 + x2) + 4;
- x2 = (x1 - (W2 + W6) * x2) >> 3;
- x3 = (x1 + (W2 - W6) * x3) >> 3;
- x1 = x4 + x6;
- x4 -= x6;
- x6 = x5 + x7;
- x5 -= x7;
-
- /* third stage */
- x7 = x8 + x3;
- x8 -= x3;
- x3 = x0 + x2;
- x0 -= x2;
- x2 = (181 * (x4 + x5) + 128) >> 8;
- x4 = (181 * (x4 - x5) + 128) >> 8;
-
- /* fourth stage */
- blk[8 * 0] = (x7 + x1) >> 14;
- blk[8 * 1] = (x3 + x2) >> 14;
- blk[8 * 2] = (x0 + x4) >> 14;
- blk[8 * 3] = (x8 + x6) >> 14;
- blk[8 * 4] = (x8 - x6) >> 14;
- blk[8 * 5] = (x0 - x4) >> 14;
- blk[8 * 6] = (x3 - x2) >> 14;
- blk[8 * 7] = (x7 - x1) >> 14;
-}
-
-#define TX_DIM 8
-void vp9_short_idct8x8_c(short *coefs, short *block, int pitch) {
- int X[TX_DIM * TX_DIM];
- int i, j;
- int shortpitch = pitch >> 1;
-
- for (i = 0; i < TX_DIM; i++) {
- for (j = 0; j < TX_DIM; j++) {
- X[i * TX_DIM + j] = (int)(coefs[i * TX_DIM + j] + 1
- + (coefs[i * TX_DIM + j] < 0)) >> 2;
- }
- }
- for (i = 0; i < 8; i++)
- idctrow(X + 8 * i);
-
- for (i = 0; i < 8; i++)
- idctcol(X + i);
-
- for (i = 0; i < TX_DIM; i++) {
- for (j = 0; j < TX_DIM; j++) {
- block[i * shortpitch + j] = X[i * TX_DIM + j] >> 1;
- }
- }
-}
-
-
-void vp9_short_ihaar2x2_c(short *input, short *output, int pitch) {
- int i;
- short *ip = input; // 0,1, 4, 8
- short *op = output;
- for (i = 0; i < 16; i++) {
- op[i] = 0;
- }
-
- op[0] = (ip[0] + ip[1] + ip[4] + ip[8] + 1) >> 1;
- op[1] = (ip[0] - ip[1] + ip[4] - ip[8]) >> 1;
- op[4] = (ip[0] + ip[1] - ip[4] - ip[8]) >> 1;
- op[8] = (ip[0] - ip[1] - ip[4] + ip[8]) >> 1;
-}
-
-
-#if 0
-// Keep a really bad float version as reference for now.
-void vp9_short_idct16x16_c(short *input, short *output, int pitch) {
-
- vp9_clear_system_state(); // Make it simd safe : __asm emms;
- {
- double x;
- const int short_pitch = pitch >> 1;
- int i, j, k, l;
- for (l = 0; l < 16; ++l) {
- for (k = 0; k < 16; ++k) {
- double s = 0;
- for (i = 0; i < 16; ++i) {
- for (j = 0; j < 16; ++j) {
- x=cos(PI*j*(l+0.5)/16.0)*cos(PI*i*(k+0.5)/16.0)*input[i*16+j]/32;
- if (i != 0)
- x *= sqrt(2.0);
- if (j != 0)
- x *= sqrt(2.0);
- s += x;
- }
- }
- output[k*short_pitch+l] = (short)round(s);
- }
- }
- }
- vp9_clear_system_state(); // Make it simd safe : __asm emms;
-}
-#endif
-
-static const double C1 = 0.995184726672197;
-static const double C2 = 0.98078528040323;
-static const double C3 = 0.956940335732209;
-static const double C4 = 0.923879532511287;
-static const double C5 = 0.881921264348355;
-static const double C6 = 0.831469612302545;
-static const double C7 = 0.773010453362737;
-static const double C8 = 0.707106781186548;
-static const double C9 = 0.634393284163646;
-static const double C10 = 0.555570233019602;
-static const double C11 = 0.471396736825998;
-static const double C12 = 0.38268343236509;
-static const double C13 = 0.290284677254462;
-static const double C14 = 0.195090322016128;
-static const double C15 = 0.098017140329561;
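
The table C1..C15 above is simply cos(k*pi/32), k = 1..15; the butterfly below combines these weights in pairs. A short check that reproduces the table (illustrative only, not part of the deleted file):

#include <math.h>
#include <stdio.h>

/* Print cos(k * pi / 32) for k = 1..15 to compare against C1..C15 above. */
int main(void) {
  const double pi = 3.14159265358979323846;
  int k;

  for (k = 1; k <= 15; k++)
    printf("C%d = %.15f\n", k, cos(k * pi / 32.0));
  return 0;
}
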
-
-
-static void butterfly_16x16_idct_1d(double input[16], double output[16]) {
-
- vp9_clear_system_state(); // Make it simd safe : __asm emms;
- {
- double step[16];
- double intermediate[16];
- double temp1, temp2;
-
-
- // step 1 and 2
- step[ 0] = input[0] + input[8];
- step[ 1] = input[0] - input[8];
-
- temp1 = input[4]*C12;
- temp2 = input[12]*C4;
-
- temp1 -= temp2;
- temp1 *= C8;
-
- step[ 2] = 2*(temp1);
-
- temp1 = input[4]*C4;
- temp2 = input[12]*C12;
- temp1 += temp2;
- temp1 = (temp1);
- temp1 *= C8;
- step[ 3] = 2*(temp1);
-
- temp1 = input[2]*C8;
- temp1 = 2*(temp1);
- temp2 = input[6] + input[10];
-
- step[ 4] = temp1 + temp2;
- step[ 5] = temp1 - temp2;
-
- temp1 = input[14]*C8;
- temp1 = 2*(temp1);
- temp2 = input[6] - input[10];
-
- step[ 6] = temp2 - temp1;
- step[ 7] = temp2 + temp1;
-
- // for odd input
- temp1 = input[3]*C12;
- temp2 = input[13]*C4;
- temp1 += temp2;
- temp1 = (temp1);
- temp1 *= C8;
- intermediate[ 8] = 2*(temp1);
-
- temp1 = input[3]*C4;
- temp2 = input[13]*C12;
- temp2 -= temp1;
- temp2 = (temp2);
- temp2 *= C8;
- intermediate[ 9] = 2*(temp2);
-
- intermediate[10] = 2*(input[9]*C8);
- intermediate[11] = input[15] - input[1];
- intermediate[12] = input[15] + input[1];
- intermediate[13] = 2*((input[7]*C8));
-
- temp1 = input[11]*C12;
- temp2 = input[5]*C4;
- temp2 -= temp1;
- temp2 = (temp2);
- temp2 *= C8;
- intermediate[14] = 2*(temp2);
-
- temp1 = input[11]*C4;
- temp2 = input[5]*C12;
- temp1 += temp2;
- temp1 = (temp1);
- temp1 *= C8;
- intermediate[15] = 2*(temp1);
-
- step[ 8] = intermediate[ 8] + intermediate[14];
- step[ 9] = intermediate[ 9] + intermediate[15];
- step[10] = intermediate[10] + intermediate[11];
- step[11] = intermediate[10] - intermediate[11];
- step[12] = intermediate[12] + intermediate[13];
- step[13] = intermediate[12] - intermediate[13];
- step[14] = intermediate[ 8] - intermediate[14];
- step[15] = intermediate[ 9] - intermediate[15];
-
- // step 3
- output[0] = step[ 0] + step[ 3];
- output[1] = step[ 1] + step[ 2];
- output[2] = step[ 1] - step[ 2];
- output[3] = step[ 0] - step[ 3];
-
- temp1 = step[ 4]*C14;
- temp2 = step[ 7]*C2;
- temp1 -= temp2;
- output[4] = (temp1);
-
- temp1 = step[ 4]*C2;
- temp2 = step[ 7]*C14;
- temp1 += temp2;
- output[7] = (temp1);
-
- temp1 = step[ 5]*C10;
- temp2 = step[ 6]*C6;
- temp1 -= temp2;
- output[5] = (temp1);
-
- temp1 = step[ 5]*C6;
- temp2 = step[ 6]*C10;
- temp1 += temp2;
- output[6] = (temp1);
-
- output[8] = step[ 8] + step[11];
- output[9] = step[ 9] + step[10];
- output[10] = step[ 9] - step[10];
- output[11] = step[ 8] - step[11];
- output[12] = step[12] + step[15];
- output[13] = step[13] + step[14];
- output[14] = step[13] - step[14];
- output[15] = step[12] - step[15];
-
- // output 4
- step[ 0] = output[0] + output[7];
- step[ 1] = output[1] + output[6];
- step[ 2] = output[2] + output[5];
- step[ 3] = output[3] + output[4];
- step[ 4] = output[3] - output[4];
- step[ 5] = output[2] - output[5];
- step[ 6] = output[1] - output[6];
- step[ 7] = output[0] - output[7];
-
- temp1 = output[8]*C7;
- temp2 = output[15]*C9;
- temp1 -= temp2;
- step[ 8] = (temp1);
-
- temp1 = output[9]*C11;
- temp2 = output[14]*C5;
- temp1 += temp2;
- step[ 9] = (temp1);
-
- temp1 = output[10]*C3;
- temp2 = output[13]*C13;
- temp1 -= temp2;
- step[10] = (temp1);
-
- temp1 = output[11]*C15;
- temp2 = output[12]*C1;
- temp1 += temp2;
- step[11] = (temp1);
-
- temp1 = output[11]*C1;
- temp2 = output[12]*C15;
- temp2 -= temp1;
- step[12] = (temp2);
-
- temp1 = output[10]*C13;
- temp2 = output[13]*C3;
- temp1 += temp2;
- step[13] = (temp1);
-
- temp1 = output[9]*C5;
- temp2 = output[14]*C11;
- temp2 -= temp1;
- step[14] = (temp2);
-
- temp1 = output[8]*C9;
- temp2 = output[15]*C7;
- temp1 += temp2;
- step[15] = (temp1);
-
- // step 5
- output[0] = (step[0] + step[15]);
- output[1] = (step[1] + step[14]);
- output[2] = (step[2] + step[13]);
- output[3] = (step[3] + step[12]);
- output[4] = (step[4] + step[11]);
- output[5] = (step[5] + step[10]);
- output[6] = (step[6] + step[ 9]);
- output[7] = (step[7] + step[ 8]);
-
- output[15] = (step[0] - step[15]);
- output[14] = (step[1] - step[14]);
- output[13] = (step[2] - step[13]);
- output[12] = (step[3] - step[12]);
- output[11] = (step[4] - step[11]);
- output[10] = (step[5] - step[10]);
- output[9] = (step[6] - step[ 9]);
- output[8] = (step[7] - step[ 8]);
- }
- vp9_clear_system_state(); // Make it simd safe : __asm emms;
-}
-
-// Remove once an int version of iDCT is written
-#if 0
-void reference_16x16_idct_1d(double input[16], double output[16]) {
-
- vp9_clear_system_state(); // Make it simd safe : __asm emms;
- {
- const double kPi = 3.141592653589793238462643383279502884;
- const double kSqrt2 = 1.414213562373095048801688724209698;
- for (int k = 0; k < 16; k++) {
- output[k] = 0.0;
- for (int n = 0; n < 16; n++) {
- output[k] += input[n]*cos(kPi*(2*k+1)*n/32.0);
- if (n == 0)
- output[k] = output[k]/kSqrt2;
- }
- }
- }
- vp9_clear_system_state(); // Make it simd safe : __asm emms;
-}
-#endif
-
-void vp9_short_idct16x16_c(short *input, short *output, int pitch) {
-
- vp9_clear_system_state(); // Make it simd safe : __asm emms;
- {
- double out[16*16], out2[16*16];
- const int short_pitch = pitch >> 1;
- int i, j;
- // First transform rows
- for (i = 0; i < 16; ++i) {
- double temp_in[16], temp_out[16];
- for (j = 0; j < 16; ++j)
- temp_in[j] = input[j + i*short_pitch];
- butterfly_16x16_idct_1d(temp_in, temp_out);
- for (j = 0; j < 16; ++j)
- out[j + i*16] = temp_out[j];
- }
- // Then transform columns
- for (i = 0; i < 16; ++i) {
- double temp_in[16], temp_out[16];
- for (j = 0; j < 16; ++j)
- temp_in[j] = out[j*16 + i];
- butterfly_16x16_idct_1d(temp_in, temp_out);
- for (j = 0; j < 16; ++j)
- out2[j*16 + i] = temp_out[j];
- }
- for (i = 0; i < 16*16; ++i)
- output[i] = round(out2[i]/128);
- }
- vp9_clear_system_state(); // Make it simd safe : __asm emms;
-}
--- a/vp8/common/implicit_segmentation.c
+++ /dev/null
@@ -1,255 +1,0 @@
-/*
- * Copyright (c) 2012 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "vp8/common/onyxc_int.h"
-
-#define MAX_REGIONS 24000
-#ifndef NULL
-#define NULL 0
-#endif
-
-#define min_mbs_in_region 3
-
-// this linked list structure holds equivalences for connected
-// component labeling
-struct list_el {
- int label;
- int seg_value;
- int count;
- struct list_el *next;
-};
-typedef struct list_el item;
-
-// connected colorsegments
-typedef struct {
- int min_x;
- int min_y;
- int max_x;
- int max_y;
- long long sum_x;
- long long sum_y;
- int pixels;
- int seg_value;
- int label;
-} segment_info;
-
-
-typedef enum {
- SEGMENT_MODE,
- SEGMENT_MV,
- SEGMENT_REFFRAME,
- SEGMENT_SKIPPED
-} SEGMENT_TYPE;
-
-
-// this merges the two equivalence lists and
-// then makes sure that every label points to the same
-// equivalence list
-void merge(item *labels, int u, int v) {
- item *a = labels[u].next;
- item *b = labels[v].next;
- item c;
- item *it = &c;
- int count;
-
- // check if they are already merged
- if (u == v || a == b)
- return;
-
- count = a->count + b->count;
-
- // merge 2 sorted linked lists.
- while (a != NULL && b != NULL) {
- if (a->label < b->label) {
- it->next = a;
- a = a->next;
- } else {
- it->next = b;
- b = b->next;
- }
-
- it = it->next;
- }
-
- if (a == NULL)
- it->next = b;
- else
- it->next = a;
-
- it = c.next;
-
- // make sure every equivalence in the linked list points to this new ll
- while (it != NULL) {
- labels[it->label].next = c.next;
- it = it->next;
- }
- c.next->count = count;
-
-}
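
merge() unions two label-equivalence classes by splicing their sorted linked lists and then repointing every affected label at the merged list. The same bookkeeping is often done with a union-find; a hedged sketch of that alternative (find_label/union_labels are hypothetical and are not the structure this file used):

/* Union-find over label ids; callers set parent[i] = i for each new label. */
#define MAX_LABELS 24000

static int parent[MAX_LABELS];

/* Find the representative label, halving the path as we go. */
static int find_label(int x) {
  while (parent[x] != x) {
    parent[x] = parent[parent[x]];
    x = parent[x];
  }
  return x;
}

/* Merge two equivalence classes, keeping the smaller label as the root. */
static void union_labels(int u, int v) {
  const int ru = find_label(u);
  const int rv = find_label(v);
  if (ru == rv)
    return;
  if (ru < rv)
    parent[rv] = ru;
  else
    parent[ru] = rv;
}
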
-
-void segment_via_mode_info(VP9_COMMON *oci, int how) {
- MODE_INFO *mi = oci->mi;
- int i, j;
- int mb_index = 0;
-
- int label = 1;
- int pitch = oci->mb_cols;
-
- // holds linked list equivalences
- // the max should probably be allocated at a higher level in oci
- item equivalences[MAX_REGIONS];
- int eq_ptr = 0;
- item labels[MAX_REGIONS];
- segment_info segments[MAX_REGIONS];
- int label_count = 1;
- int labeling[400 * 300];
- int *lp = labeling;
-
- label_count = 1;
- memset(labels, 0, sizeof(labels));
- memset(segments, 0, sizeof(segments));
-
- /* Go through each macroblock first pass labelling */
- for (i = 0; i < oci->mb_rows; i++, lp += pitch) {
- for (j = 0; j < oci->mb_cols; j++) {
- // int above seg_value, left seg_value, this seg_value...
- int a = -1, l = -1, n = -1;
-
- // above label, left label
- int al = -1, ll = -1;
- if (i) {
- al = lp[j - pitch];
- a = labels[al].next->seg_value;
- }
- if (j) {
- ll = lp[j - 1];
- l = labels[ll].next->seg_value;
- }
-
- // what setting are we going to do the implicit segmentation on
- switch (how) {
- case SEGMENT_MODE:
- n = mi[mb_index].mbmi.mode;
- break;
- case SEGMENT_MV:
- n = mi[mb_index].mbmi.mv[0].as_int;
- if (mi[mb_index].mbmi.ref_frame == INTRA_FRAME)
- n = -9999999;
- break;
- case SEGMENT_REFFRAME:
- n = mi[mb_index].mbmi.ref_frame;
- break;
- case SEGMENT_SKIPPED:
- n = mi[mb_index].mbmi.mb_skip_coeff;
- break;
- }
-
- // above and left both have the same seg_value
- if (n == a && n == l) {
- // pick the lowest label
- lp[j] = (al < ll ? al : ll);
- labels[lp[j]].next->count++;
-
- // merge the above and left equivalencies
- merge(labels, al, ll);
- }
- // this matches above seg_value
- else if (n == a) {
- // give it the same label as above
- lp[j] = al;
- labels[al].next->count++;
- }
- // this matches left seg_value
- else if (n == l) {
-        // give it the same label as left
- lp[j] = ll;
- labels[ll].next->count++;
- } else {
- // new label doesn't match either
- item *e = &labels[label];
- item *nl = &equivalences[eq_ptr++];
- lp[j] = label;
- nl->label = label;
- nl->next = 0;
- nl->seg_value = n;
- nl->count = 1;
- e->next = nl;
- label++;
- }
- mb_index++;
- }
- mb_index++;
- }
- lp = labeling;
-
- // give new labels to regions
- for (i = 1; i < label; i++)
- if (labels[i].next->count > min_mbs_in_region && labels[labels[i].next->label].label == 0) {
- segment_info *cs = &segments[label_count];
- cs->label = label_count;
- labels[labels[i].next->label].label = label_count++;
- labels[labels[i].next->label].seg_value = labels[i].next->seg_value;
- cs->seg_value = labels[labels[i].next->label].seg_value;
- cs->min_x = oci->mb_cols;
- cs->min_y = oci->mb_rows;
- cs->max_x = 0;
- cs->max_y = 0;
- cs->sum_x = 0;
- cs->sum_y = 0;
- cs->pixels = 0;
-
- }
- lp = labeling;
-
- // this is just to gather stats...
- for (i = 0; i < oci->mb_rows; i++, lp += pitch) {
- for (j = 0; j < oci->mb_cols; j++) {
- segment_info *cs;
- int oldlab = labels[lp[j]].next->label;
- int lab = labels[oldlab].label;
- lp[j] = lab;
-
- cs = &segments[lab];
-
- cs->min_x = (j < cs->min_x ? j : cs->min_x);
- cs->max_x = (j > cs->max_x ? j : cs->max_x);
- cs->min_y = (i < cs->min_y ? i : cs->min_y);
- cs->max_y = (i > cs->max_y ? i : cs->max_y);
- cs->sum_x += j;
- cs->sum_y += i;
- cs->pixels++;
-
- lp[j] = lab;
- mb_index++;
- }
- mb_index++;
- }
-
- {
- lp = labeling;
- printf("labelling \n");
- mb_index = 0;
- for (i = 0; i < oci->mb_rows; i++, lp += pitch) {
- for (j = 0; j < oci->mb_cols; j++) {
- printf("%4d", lp[j]);
- }
- printf(" ");
- for (j = 0; j < oci->mb_cols; j++, mb_index++) {
- // printf("%3d",mi[mb_index].mbmi.mode );
- printf("%4d:%4d", mi[mb_index].mbmi.mv[0].as_mv.row,
- mi[mb_index].mbmi.mv[0].as_mv.col);
- }
- printf("\n");
- ++mb_index;
- }
- printf("\n");
- }
-}
-
--- a/vp8/common/invtrans.c
+++ /dev/null
@@ -1,135 +1,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "invtrans.h"
-
-static void recon_dcblock(MACROBLOCKD *xd) {
- BLOCKD *b = &xd->block[24];
- int i;
-
- for (i = 0; i < 16; i++) {
- xd->block[i].dqcoeff[0] = b->diff[i];
- }
-}
-
-static void recon_dcblock_8x8(MACROBLOCKD *xd) {
- BLOCKD *b = &xd->block[24]; // for coeff 0, 2, 8, 10
-
- xd->block[0].dqcoeff[0] = b->diff[0];
- xd->block[4].dqcoeff[0] = b->diff[1];
- xd->block[8].dqcoeff[0] = b->diff[4];
- xd->block[12].dqcoeff[0] = b->diff[8];
-}
-
-void vp9_inverse_transform_b_4x4(const vp9_idct_rtcd_vtable_t *rtcd,
- BLOCKD *b, int pitch) {
- if (b->eob <= 1)
- IDCT_INVOKE(rtcd, idct1)(b->dqcoeff, b->diff, pitch);
- else
- IDCT_INVOKE(rtcd, idct16)(b->dqcoeff, b->diff, pitch);
-}
-
-void vp9_inverse_transform_mby_4x4(const vp9_idct_rtcd_vtable_t *rtcd,
- MACROBLOCKD *xd) {
- int i;
- BLOCKD *blockd = xd->block;
-
- if (xd->mode_info_context->mbmi.mode != SPLITMV) {
- /* do 2nd order transform on the dc block */
- IDCT_INVOKE(rtcd, iwalsh16)(blockd[24].dqcoeff, blockd[24].diff);
- recon_dcblock(xd);
- }
-
- for (i = 0; i < 16; i++) {
- vp9_inverse_transform_b_4x4(rtcd, &blockd[i], 32);
- }
-}
-
-void vp9_inverse_transform_mbuv_4x4(const vp9_idct_rtcd_vtable_t *rtcd,
- MACROBLOCKD *xd) {
- int i;
- BLOCKD *blockd = xd->block;
-
- for (i = 16; i < 24; i++) {
- vp9_inverse_transform_b_4x4(rtcd, &blockd[i], 16);
- }
-}
-
-void vp9_inverse_transform_mb_4x4(const vp9_idct_rtcd_vtable_t *rtcd,
- MACROBLOCKD *xd) {
- vp9_inverse_transform_mby_4x4(rtcd, xd);
- vp9_inverse_transform_mbuv_4x4(rtcd, xd);
-}
-
-void vp9_inverse_transform_b_8x8(const vp9_idct_rtcd_vtable_t *rtcd,
- short *input_dqcoeff, short *output_coeff,
- int pitch) {
- // int b,i;
- // if (b->eob > 1)
- IDCT_INVOKE(rtcd, idct8)(input_dqcoeff, output_coeff, pitch);
- // else
- // IDCT_INVOKE(rtcd, idct8_1)(b->dqcoeff, b->diff, pitch);//pitch
-}
-
-void vp9_inverse_transform_mby_8x8(const vp9_idct_rtcd_vtable_t *rtcd,
- MACROBLOCKD *xd) {
- int i;
- BLOCKD *blockd = xd->block;
-
- if (xd->mode_info_context->mbmi.mode != SPLITMV) {
- // do 2nd order transform on the dc block
- IDCT_INVOKE(rtcd, ihaar2)(blockd[24].dqcoeff, blockd[24].diff, 8);
- recon_dcblock_8x8(xd); // need to change for 8x8
- }
-
- for (i = 0; i < 9; i += 8) {
- vp9_inverse_transform_b_8x8(rtcd, &blockd[i].dqcoeff[0],
- &blockd[i].diff[0], 32);
- }
- for (i = 2; i < 11; i += 8) {
- vp9_inverse_transform_b_8x8(rtcd, &blockd[i + 2].dqcoeff[0],
- &blockd[i].diff[0], 32);
- }
-}
-
-void vp9_inverse_transform_mbuv_8x8(const vp9_idct_rtcd_vtable_t *rtcd,
- MACROBLOCKD *xd) {
- int i;
- BLOCKD *blockd = xd->block;
-
- for (i = 16; i < 24; i += 4) {
- vp9_inverse_transform_b_8x8(rtcd, &blockd[i].dqcoeff[0],
- &blockd[i].diff[0], 16);
- }
-}
-
-void vp9_inverse_transform_mb_8x8(const vp9_idct_rtcd_vtable_t *rtcd,
- MACROBLOCKD *xd) {
- vp9_inverse_transform_mby_8x8(rtcd, xd);
- vp9_inverse_transform_mbuv_8x8(rtcd, xd);
-}
-
-void vp9_inverse_transform_b_16x16(const vp9_idct_rtcd_vtable_t *rtcd,
- short *input_dqcoeff,
- short *output_coeff, int pitch) {
- IDCT_INVOKE(rtcd, idct16x16)(input_dqcoeff, output_coeff, pitch);
-}
-
-void vp9_inverse_transform_mby_16x16(const vp9_idct_rtcd_vtable_t *rtcd,
- MACROBLOCKD *xd) {
- vp9_inverse_transform_b_16x16(rtcd, &xd->block[0].dqcoeff[0],
- &xd->block[0].diff[0], 32);
-}
-
-void vp9_inverse_transform_mb_16x16(const vp9_idct_rtcd_vtable_t *rtcd,
- MACROBLOCKD *xd) {
- vp9_inverse_transform_mby_16x16(rtcd, xd);
- vp9_inverse_transform_mbuv_8x8(rtcd, xd);
-}
--- a/vp8/common/invtrans.h
+++ /dev/null
@@ -1,53 +1,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#ifndef __INC_INVTRANS_H
-#define __INC_INVTRANS_H
-
-#include "vpx_ports/config.h"
-#include "idct.h"
-#include "blockd.h"
-
-extern void vp9_inverse_transform_b_4x4(const vp9_idct_rtcd_vtable_t *rtcd,
- BLOCKD *b, int pitch);
-
-extern void vp9_inverse_transform_mb_4x4(const vp9_idct_rtcd_vtable_t *rtcd,
- MACROBLOCKD *xd);
-
-extern void vp9_inverse_transform_mby_4x4(const vp9_idct_rtcd_vtable_t *rtcd,
- MACROBLOCKD *xd);
-
-extern void vp9_inverse_transform_mbuv_4x4(const vp9_idct_rtcd_vtable_t *rtcd,
- MACROBLOCKD *xd);
-
-extern void vp9_inverse_transform_b_8x8(const vp9_idct_rtcd_vtable_t *rtcd,
- short *input_dqcoeff,
- short *output_coeff, int pitch);
-
-extern void vp9_inverse_transform_mb_8x8(const vp9_idct_rtcd_vtable_t *rtcd,
- MACROBLOCKD *xd);
-
-extern void vp9_inverse_transform_mby_8x8(const vp9_idct_rtcd_vtable_t *rtcd,
- MACROBLOCKD *xd);
-
-extern void vp9_inverse_transform_mbuv_8x8(const vp9_idct_rtcd_vtable_t *rtcd,
- MACROBLOCKD *xd);
-
-extern void vp9_inverse_transform_b_16x16(const vp9_idct_rtcd_vtable_t *rtcd,
- short *input_dqcoeff,
- short *output_coeff, int pitch);
-
-extern void vp9_inverse_transform_mb_16x16(const vp9_idct_rtcd_vtable_t *rtcd,
- MACROBLOCKD *xd);
-
-extern void vp9_inverse_transform_mby_16x16(const vp9_idct_rtcd_vtable_t *rtcd,
- MACROBLOCKD *xd);
-
-#endif // __INC_INVTRANS_H
--- a/vp8/common/loopfilter.c
+++ /dev/null
@@ -1,524 +1,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "vpx_config.h"
-#include "loopfilter.h"
-#include "onyxc_int.h"
-#include "vpx_mem/vpx_mem.h"
-
-#include "vp8/common/seg_common.h"
-
-static void lf_init_lut(loop_filter_info_n *lfi) {
- int filt_lvl;
-
- for (filt_lvl = 0; filt_lvl <= MAX_LOOP_FILTER; filt_lvl++) {
- if (filt_lvl >= 40) {
- lfi->hev_thr_lut[KEY_FRAME][filt_lvl] = 2;
- lfi->hev_thr_lut[INTER_FRAME][filt_lvl] = 3;
- } else if (filt_lvl >= 20) {
- lfi->hev_thr_lut[KEY_FRAME][filt_lvl] = 1;
- lfi->hev_thr_lut[INTER_FRAME][filt_lvl] = 2;
- } else if (filt_lvl >= 15) {
- lfi->hev_thr_lut[KEY_FRAME][filt_lvl] = 1;
- lfi->hev_thr_lut[INTER_FRAME][filt_lvl] = 1;
- } else {
- lfi->hev_thr_lut[KEY_FRAME][filt_lvl] = 0;
- lfi->hev_thr_lut[INTER_FRAME][filt_lvl] = 0;
- }
- }
-
- lfi->mode_lf_lut[DC_PRED] = 1;
- lfi->mode_lf_lut[D45_PRED] = 1;
- lfi->mode_lf_lut[D135_PRED] = 1;
- lfi->mode_lf_lut[D117_PRED] = 1;
- lfi->mode_lf_lut[D153_PRED] = 1;
- lfi->mode_lf_lut[D27_PRED] = 1;
- lfi->mode_lf_lut[D63_PRED] = 1;
- lfi->mode_lf_lut[V_PRED] = 1;
- lfi->mode_lf_lut[H_PRED] = 1;
- lfi->mode_lf_lut[TM_PRED] = 1;
- lfi->mode_lf_lut[B_PRED] = 0;
- lfi->mode_lf_lut[I8X8_PRED] = 0;
- lfi->mode_lf_lut[ZEROMV] = 1;
- lfi->mode_lf_lut[NEARESTMV] = 2;
- lfi->mode_lf_lut[NEARMV] = 2;
- lfi->mode_lf_lut[NEWMV] = 2;
- lfi->mode_lf_lut[SPLITMV] = 3;
-}
-
-void vp9_loop_filter_update_sharpness(loop_filter_info_n *lfi,
- int sharpness_lvl) {
- int i;
-
- /* For each possible value for the loop filter fill out limits */
- for (i = 0; i <= MAX_LOOP_FILTER; i++) {
- int filt_lvl = i;
- int block_inside_limit = 0;
-
-    /* Set loop filter parameters that control sharpness. */
- block_inside_limit = filt_lvl >> (sharpness_lvl > 0);
- block_inside_limit = block_inside_limit >> (sharpness_lvl > 4);
-
- if (sharpness_lvl > 0) {
- if (block_inside_limit > (9 - sharpness_lvl))
- block_inside_limit = (9 - sharpness_lvl);
- }
-
- if (block_inside_limit < 1)
- block_inside_limit = 1;
-
- vpx_memset(lfi->lim[i], block_inside_limit, SIMD_WIDTH);
- vpx_memset(lfi->blim[i], (2 * filt_lvl + block_inside_limit),
- SIMD_WIDTH);
- vpx_memset(lfi->mblim[i], (2 * (filt_lvl + 2) + block_inside_limit),
- SIMD_WIDTH);
- }
-}
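
For example, with filt_lvl = 32 and sharpness_lvl = 5 the code above gives block_inside_limit = (32 >> 1) >> 1 = 8, clamped to 9 - 5 = 4, so lim = 4, blim = 2*32 + 4 = 68 and mblim = 2*(32 + 2) + 4 = 72. The same arithmetic as a standalone helper (sharpness_limits is a hypothetical name, for illustration only):

/* Stand-alone form of the per-level limit derivation above. */
static void sharpness_limits(int filt_lvl, int sharpness_lvl,
                             int *lim, int *blim, int *mblim) {
  int inside = filt_lvl >> (sharpness_lvl > 0);
  inside >>= (sharpness_lvl > 4);

  if (sharpness_lvl > 0 && inside > 9 - sharpness_lvl)
    inside = 9 - sharpness_lvl;
  if (inside < 1)
    inside = 1;

  *lim = inside;
  *blim = 2 * filt_lvl + inside;
  *mblim = 2 * (filt_lvl + 2) + inside;
}
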
-
-void vp9_loop_filter_init(VP9_COMMON *cm) {
- loop_filter_info_n *lfi = &cm->lf_info;
- int i;
-
- /* init limits for given sharpness*/
- vp9_loop_filter_update_sharpness(lfi, cm->sharpness_level);
- cm->last_sharpness_level = cm->sharpness_level;
-
- /* init LUT for lvl and hev thr picking */
- lf_init_lut(lfi);
-
- /* init hev threshold const vectors */
- for (i = 0; i < 4; i++) {
- vpx_memset(lfi->hev_thr[i], i, SIMD_WIDTH);
- }
-}
-
-void vp9_loop_filter_frame_init(VP9_COMMON *cm,
- MACROBLOCKD *xd,
- int default_filt_lvl) {
- int seg, /* segment number */
- ref, /* index in ref_lf_deltas */
- mode; /* index in mode_lf_deltas */
-
- loop_filter_info_n *lfi = &cm->lf_info;
-
- /* update limits if sharpness has changed */
- if (cm->last_sharpness_level != cm->sharpness_level) {
- vp9_loop_filter_update_sharpness(lfi, cm->sharpness_level);
- cm->last_sharpness_level = cm->sharpness_level;
- }
-
- for (seg = 0; seg < MAX_MB_SEGMENTS; seg++) {
- int lvl_seg = default_filt_lvl;
- int lvl_ref, lvl_mode;
-
-
- // Set the baseline filter values for each segment
- if (vp9_segfeature_active(xd, seg, SEG_LVL_ALT_LF)) {
- /* Abs value */
- if (xd->mb_segment_abs_delta == SEGMENT_ABSDATA) {
- lvl_seg = vp9_get_segdata(xd, seg, SEG_LVL_ALT_LF);
- } else { /* Delta Value */
- lvl_seg += vp9_get_segdata(xd, seg, SEG_LVL_ALT_LF);
- lvl_seg = (lvl_seg > 0) ? ((lvl_seg > 63) ? 63 : lvl_seg) : 0;
- }
- }
-
- if (!xd->mode_ref_lf_delta_enabled) {
- /* we could get rid of this if we assume that deltas are set to
- * zero when not in use; encoder always uses deltas
- */
- vpx_memset(lfi->lvl[seg][0], lvl_seg, 4 * 4);
- continue;
- }
-
- lvl_ref = lvl_seg;
-
- /* INTRA_FRAME */
- ref = INTRA_FRAME;
-
- /* Apply delta for reference frame */
- lvl_ref += xd->ref_lf_deltas[ref];
-
- /* Apply delta for Intra modes */
- mode = 0; /* B_PRED */
- /* Only the split mode BPRED has a further special case */
- lvl_mode = lvl_ref + xd->mode_lf_deltas[mode];
- lvl_mode = (lvl_mode > 0) ? (lvl_mode > 63 ? 63 : lvl_mode) : 0; /* clamp */
-
- lfi->lvl[seg][ref][mode] = lvl_mode;
-
- mode = 1; /* all the rest of Intra modes */
- lvl_mode = (lvl_ref > 0) ? (lvl_ref > 63 ? 63 : lvl_ref) : 0; /* clamp */
- lfi->lvl[seg][ref][mode] = lvl_mode;
-
- /* LAST, GOLDEN, ALT */
- for (ref = 1; ref < MAX_REF_FRAMES; ref++) {
- int lvl_ref = lvl_seg;
-
- /* Apply delta for reference frame */
- lvl_ref += xd->ref_lf_deltas[ref];
-
- /* Apply delta for Inter modes */
- for (mode = 1; mode < 4; mode++) {
- lvl_mode = lvl_ref + xd->mode_lf_deltas[mode];
- lvl_mode = (lvl_mode > 0) ? (lvl_mode > 63 ? 63 : lvl_mode) : 0; /* clamp */
-
- lfi->lvl[seg][ref][mode] = lvl_mode;
- }
- }
- }
-}
-
-void vp9_loop_filter_frame(VP9_COMMON *cm, MACROBLOCKD *xd) {
- YV12_BUFFER_CONFIG *post = cm->frame_to_show;
- loop_filter_info_n *lfi_n = &cm->lf_info;
- struct loop_filter_info lfi;
-
- FRAME_TYPE frame_type = cm->frame_type;
-
- int mb_row;
- int mb_col;
-
- int filter_level;
-
- unsigned char *y_ptr, *u_ptr, *v_ptr;
-
- /* Point at base of Mb MODE_INFO list */
- const MODE_INFO *mode_info_context = cm->mi;
-
- /* Initialize the loop filter for this frame. */
- vp9_loop_filter_frame_init(cm, xd, cm->filter_level);
-
- /* Set up the buffer pointers */
- y_ptr = post->y_buffer;
- u_ptr = post->u_buffer;
- v_ptr = post->v_buffer;
-
- /* vp9_filter each macro block */
- for (mb_row = 0; mb_row < cm->mb_rows; mb_row++) {
- for (mb_col = 0; mb_col < cm->mb_cols; mb_col++) {
- int skip_lf = (mode_info_context->mbmi.mode != B_PRED &&
- mode_info_context->mbmi.mode != I8X8_PRED &&
- mode_info_context->mbmi.mode != SPLITMV &&
- mode_info_context->mbmi.mb_skip_coeff);
-
- const int mode_index = lfi_n->mode_lf_lut[mode_info_context->mbmi.mode];
- const int seg = mode_info_context->mbmi.segment_id;
- const int ref_frame = mode_info_context->mbmi.ref_frame;
- int tx_type = mode_info_context->mbmi.txfm_size;
- filter_level = lfi_n->lvl[seg][ref_frame][mode_index];
-
- if (filter_level) {
- if (cm->filter_type == NORMAL_LOOPFILTER) {
- const int hev_index = lfi_n->hev_thr_lut[frame_type][filter_level];
- lfi.mblim = lfi_n->mblim[filter_level];
- lfi.blim = lfi_n->blim[filter_level];
- lfi.lim = lfi_n->lim[filter_level];
- lfi.hev_thr = lfi_n->hev_thr[hev_index];
-
- if (mb_col > 0
-#if CONFIG_SUPERBLOCKS
- && !((mb_col & 1) && mode_info_context->mbmi.encoded_as_sb &&
- mode_info_context[0].mbmi.mb_skip_coeff &&
- mode_info_context[-1].mbmi.mb_skip_coeff)
-#endif
- )
- vp9_loop_filter_mbv(y_ptr, u_ptr, v_ptr, post->y_stride,
- post->uv_stride, &lfi);
-
- if (!skip_lf && tx_type != TX_16X16) {
- if (tx_type == TX_8X8)
- vp9_loop_filter_bv8x8(y_ptr, u_ptr, v_ptr, post->y_stride,
- post->uv_stride, &lfi);
- else
- vp9_loop_filter_bv(y_ptr, u_ptr, v_ptr, post->y_stride,
- post->uv_stride, &lfi);
-
- }
-
- /* don't apply across umv border */
- if (mb_row > 0
-#if CONFIG_SUPERBLOCKS
- && !((mb_row & 1) && mode_info_context->mbmi.encoded_as_sb &&
- mode_info_context[0].mbmi.mb_skip_coeff &&
- mode_info_context[-cm->mode_info_stride].mbmi.mb_skip_coeff)
-#endif
- )
- vp9_loop_filter_mbh(y_ptr, u_ptr, v_ptr, post->y_stride,
- post->uv_stride, &lfi);
-
- if (!skip_lf && tx_type != TX_16X16) {
- if (tx_type == TX_8X8)
- vp9_loop_filter_bh8x8(y_ptr, u_ptr, v_ptr, post->y_stride,
- post->uv_stride, &lfi);
- else
- vp9_loop_filter_bh(y_ptr, u_ptr, v_ptr, post->y_stride,
- post->uv_stride, &lfi);
- }
- } else {
- // FIXME: Not 8x8 aware
- if (mb_col > 0
-#if CONFIG_SUPERBLOCKS
- && !((mb_col & 1) && mode_info_context->mbmi.encoded_as_sb &&
- mode_info_context[0].mbmi.mb_skip_coeff &&
- mode_info_context[-1].mbmi.mb_skip_coeff)
-#endif
- )
- vp9_loop_filter_simple_mbv(y_ptr, post->y_stride,
- lfi_n->mblim[filter_level]);
-
- if (!skip_lf)
- vp9_loop_filter_simple_bv(y_ptr, post->y_stride,
- lfi_n->blim[filter_level]);
-
- /* don't apply across umv border */
- if (mb_row > 0
-#if CONFIG_SUPERBLOCKS
- && !((mb_row & 1) && mode_info_context->mbmi.encoded_as_sb &&
- mode_info_context[0].mbmi.mb_skip_coeff &&
- mode_info_context[-cm->mode_info_stride].mbmi.mb_skip_coeff)
-#endif
- )
- vp9_loop_filter_simple_mbh(y_ptr, post->y_stride,
- lfi_n->mblim[filter_level]);
-
- if (!skip_lf)
- vp9_loop_filter_simple_bh(y_ptr, post->y_stride,
- lfi_n->blim[filter_level]);
- }
- }
-
- y_ptr += 16;
- u_ptr += 8;
- v_ptr += 8;
-
- mode_info_context++; /* step to next MB */
- }
-
- y_ptr += post->y_stride * 16 - post->y_width;
- u_ptr += post->uv_stride * 8 - post->uv_width;
- v_ptr += post->uv_stride * 8 - post->uv_width;
-
- mode_info_context++; /* Skip border mb */
- }
-}
-
-void vp9_loop_filter_frame_yonly(VP9_COMMON *cm, MACROBLOCKD *xd,
- int default_filt_lvl) {
- YV12_BUFFER_CONFIG *post = cm->frame_to_show;
-
- unsigned char *y_ptr;
- int mb_row;
- int mb_col;
-
- loop_filter_info_n *lfi_n = &cm->lf_info;
- struct loop_filter_info lfi;
-
- int filter_level;
- FRAME_TYPE frame_type = cm->frame_type;
-
- /* Point at base of Mb MODE_INFO list */
- const MODE_INFO *mode_info_context = cm->mi;
-
-#if 0
- if (default_filt_lvl == 0) /* no filter applied */
- return;
-#endif
-
- /* Initialize the loop filter for this frame. */
- vp9_loop_filter_frame_init(cm, xd, default_filt_lvl);
-
- /* Set up the buffer pointers */
- y_ptr = post->y_buffer;
-
- /* vp9_filter each macro block */
- for (mb_row = 0; mb_row < cm->mb_rows; mb_row++) {
- for (mb_col = 0; mb_col < cm->mb_cols; mb_col++) {
- int skip_lf = (mode_info_context->mbmi.mode != B_PRED &&
- mode_info_context->mbmi.mode != I8X8_PRED &&
- mode_info_context->mbmi.mode != SPLITMV &&
- mode_info_context->mbmi.mb_skip_coeff);
-
- const int mode_index = lfi_n->mode_lf_lut[mode_info_context->mbmi.mode];
- const int seg = mode_info_context->mbmi.segment_id;
- const int ref_frame = mode_info_context->mbmi.ref_frame;
- int tx_type = mode_info_context->mbmi.txfm_size;
- filter_level = lfi_n->lvl[seg][ref_frame][mode_index];
-
- if (filter_level) {
- if (cm->filter_type == NORMAL_LOOPFILTER) {
- const int hev_index = lfi_n->hev_thr_lut[frame_type][filter_level];
- lfi.mblim = lfi_n->mblim[filter_level];
- lfi.blim = lfi_n->blim[filter_level];
- lfi.lim = lfi_n->lim[filter_level];
- lfi.hev_thr = lfi_n->hev_thr[hev_index];
-
- if (mb_col > 0)
- vp9_loop_filter_mbv(y_ptr, 0, 0, post->y_stride, 0, &lfi);
-
- if (!skip_lf && tx_type != TX_16X16) {
- if (tx_type == TX_8X8)
- vp9_loop_filter_bv8x8(y_ptr, 0, 0, post->y_stride, 0, &lfi);
- else
- vp9_loop_filter_bv(y_ptr, 0, 0, post->y_stride, 0, &lfi);
- }
-
- /* don't apply across umv border */
- if (mb_row > 0)
- vp9_loop_filter_mbh(y_ptr, 0, 0, post->y_stride, 0, &lfi);
-
- if (!skip_lf && tx_type != TX_16X16) {
- if (tx_type == TX_8X8)
- vp9_loop_filter_bh8x8(y_ptr, 0, 0, post->y_stride, 0, &lfi);
- else
- vp9_loop_filter_bh(y_ptr, 0, 0, post->y_stride, 0, &lfi);
- }
- } else {
- // FIXME: Not 8x8 aware
- if (mb_col > 0)
- vp9_loop_filter_simple_mbv(y_ptr, post->y_stride,
- lfi_n->mblim[filter_level]);
-
- if (!skip_lf)
- vp9_loop_filter_simple_bv(y_ptr, post->y_stride,
- lfi_n->blim[filter_level]);
-
- /* don't apply across umv border */
- if (mb_row > 0)
- vp9_loop_filter_simple_mbh(y_ptr, post->y_stride,
- lfi_n->mblim[filter_level]);
-
- if (!skip_lf)
- vp9_loop_filter_simple_bh(y_ptr, post->y_stride,
- lfi_n->blim[filter_level]);
- }
- }
-
- y_ptr += 16;
- mode_info_context++; /* step to next MB */
- }
-
- y_ptr += post->y_stride * 16 - post->y_width;
- mode_info_context++; /* Skip border mb */
- }
-}
-
-void vp9_loop_filter_partial_frame(VP9_COMMON *cm, MACROBLOCKD *xd,
- int default_filt_lvl) {
- YV12_BUFFER_CONFIG *post = cm->frame_to_show;
-
- unsigned char *y_ptr;
- int mb_row;
- int mb_col;
- int mb_cols = post->y_width >> 4;
-
- int linestocopy, i;
-
- loop_filter_info_n *lfi_n = &cm->lf_info;
- struct loop_filter_info lfi;
-
- int filter_level;
- int alt_flt_enabled = xd->segmentation_enabled;
- FRAME_TYPE frame_type = cm->frame_type;
-
- const MODE_INFO *mode_info_context;
-
- int lvl_seg[MAX_MB_SEGMENTS];
-
- mode_info_context = cm->mi + (post->y_height >> 5) * (mb_cols + 1);
-
- /* 3 is a magic number. 4 is probably magic too */
- linestocopy = (post->y_height >> (4 + 3));
-
- if (linestocopy < 1)
- linestocopy = 1;
-
- linestocopy <<= 4;
-
- /* Note the baseline filter values for each segment */
- /* See vp9_loop_filter_frame_init. Rather than call that for each change
- * to default_filt_lvl, copy the relevant calculation here.
- */
- if (alt_flt_enabled) {
- for (i = 0; i < MAX_MB_SEGMENTS; i++) {
- /* Abs value */
- if (xd->mb_segment_abs_delta == SEGMENT_ABSDATA) {
- lvl_seg[i] = vp9_get_segdata(xd, i, SEG_LVL_ALT_LF);
- }
- /* Delta Value */
- else {
- lvl_seg[i] = default_filt_lvl +
- vp9_get_segdata(xd, i, SEG_LVL_ALT_LF);
- lvl_seg[i] = (lvl_seg[i] > 0) ?
- ((lvl_seg[i] > 63) ? 63 : lvl_seg[i]) : 0;
- }
- }
- }
-
- /* Set up the buffer pointers */
- y_ptr = post->y_buffer + (post->y_height >> 5) * 16 * post->y_stride;
-
- /* vp9_filter each macro block */
- for (mb_row = 0; mb_row < (linestocopy >> 4); mb_row++) {
- for (mb_col = 0; mb_col < mb_cols; mb_col++) {
- int skip_lf = (mode_info_context->mbmi.mode != B_PRED &&
- mode_info_context->mbmi.mode != I8X8_PRED &&
- mode_info_context->mbmi.mode != SPLITMV &&
- mode_info_context->mbmi.mb_skip_coeff);
-
- if (alt_flt_enabled)
- filter_level = lvl_seg[mode_info_context->mbmi.segment_id];
- else
- filter_level = default_filt_lvl;
-
- if (filter_level) {
- if (cm->filter_type == NORMAL_LOOPFILTER) {
- const int hev_index = lfi_n->hev_thr_lut[frame_type][filter_level];
- lfi.mblim = lfi_n->mblim[filter_level];
- lfi.blim = lfi_n->blim[filter_level];
- lfi.lim = lfi_n->lim[filter_level];
- lfi.hev_thr = lfi_n->hev_thr[hev_index];
-
- if (mb_col > 0)
- vp9_loop_filter_mbv(y_ptr, 0, 0, post->y_stride, 0, &lfi);
-
- if (!skip_lf)
- vp9_loop_filter_bv(y_ptr, 0, 0, post->y_stride, 0, &lfi);
-
- vp9_loop_filter_mbh(y_ptr, 0, 0, post->y_stride, 0, &lfi);
-
- if (!skip_lf)
- vp9_loop_filter_bh(y_ptr, 0, 0, post->y_stride, 0, &lfi);
- } else {
- if (mb_col > 0)
- vp9_loop_filter_simple_mbv (y_ptr, post->y_stride,
- lfi_n->mblim[filter_level]);
-
- if (!skip_lf)
- vp9_loop_filter_simple_bv(y_ptr, post->y_stride,
- lfi_n->blim[filter_level]);
-
- vp9_loop_filter_simple_mbh(y_ptr, post->y_stride,
- lfi_n->mblim[filter_level]);
-
- if (!skip_lf)
- vp9_loop_filter_simple_bh(y_ptr, post->y_stride,
- lfi_n->blim[filter_level]);
- }
- }
-
- y_ptr += 16;
- mode_info_context += 1; /* step to next MB */
- }
-
- y_ptr += post->y_stride * 16 - post->y_width;
- mode_info_context += 1; /* Skip border mb */
- }
-}
--- a/vp8/common/loopfilter.h
+++ /dev/null
@@ -1,104 +1,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#ifndef loopfilter_h
-#define loopfilter_h
-
-#include "vpx_ports/mem.h"
-#include "vpx_config.h"
-#include "blockd.h"
-
-#define MAX_LOOP_FILTER 63
-
-typedef enum {
- NORMAL_LOOPFILTER = 0,
- SIMPLE_LOOPFILTER = 1
-} LOOPFILTERTYPE;
-
-#if ARCH_ARM
-#define SIMD_WIDTH 1
-#else
-#define SIMD_WIDTH 16
-#endif
-
-/* Need to align this structure so when it is declared and
- * passed it can be loaded into vector registers.
- */
-typedef struct {
- DECLARE_ALIGNED(SIMD_WIDTH, unsigned char,
- mblim[MAX_LOOP_FILTER + 1][SIMD_WIDTH]);
- DECLARE_ALIGNED(SIMD_WIDTH, unsigned char,
- blim[MAX_LOOP_FILTER + 1][SIMD_WIDTH]);
- DECLARE_ALIGNED(SIMD_WIDTH, unsigned char,
- lim[MAX_LOOP_FILTER + 1][SIMD_WIDTH]);
- DECLARE_ALIGNED(SIMD_WIDTH, unsigned char,
- hev_thr[4][SIMD_WIDTH]);
- unsigned char lvl[4][4][4];
- unsigned char hev_thr_lut[2][MAX_LOOP_FILTER + 1];
- unsigned char mode_lf_lut[MB_MODE_COUNT];
-} loop_filter_info_n;
-
-struct loop_filter_info {
- const unsigned char *mblim;
- const unsigned char *blim;
- const unsigned char *lim;
- const unsigned char *hev_thr;
-};
-
-#define prototype_loopfilter(sym) \
- void sym(unsigned char *src, int pitch, const unsigned char *blimit,\
- const unsigned char *limit, const unsigned char *thresh, int count)
-
-#define prototype_loopfilter_block(sym) \
- void sym(unsigned char *y, unsigned char *u, unsigned char *v, \
- int ystride, int uv_stride, struct loop_filter_info *lfi)
-
-#define prototype_simple_loopfilter(sym) \
- void sym(unsigned char *y, int ystride, const unsigned char *blimit)
-
-#if ARCH_X86 || ARCH_X86_64
-#include "x86/loopfilter_x86.h"
-#endif
-
-#if ARCH_ARM
-#include "arm/loopfilter_arm.h"
-#endif
-
-typedef void loop_filter_uvfunction(unsigned char *u, /* source pointer */
- int p, /* pitch */
- const unsigned char *blimit,
- const unsigned char *limit,
- const unsigned char *thresh,
- unsigned char *v);
-
-/* assorted loopfilter functions which get used elsewhere */
-struct VP9Common;
-struct macroblockd;
-
-void vp9_loop_filter_init(struct VP9Common *cm);
-
-void vp9_loop_filter_frame_init(struct VP9Common *cm,
- struct macroblockd *mbd,
- int default_filt_lvl);
-
-void vp9_loop_filter_frame(struct VP9Common *cm, struct macroblockd *mbd);
-
-void vp9_loop_filter_partial_frame(struct VP9Common *cm,
- struct macroblockd *mbd,
- int default_filt_lvl);
-
-void vp9_loop_filter_frame_yonly(struct VP9Common *cm,
- struct macroblockd *mbd,
- int default_filt_lvl);
-
-void vp9_loop_filter_update_sharpness(loop_filter_info_n *lfi,
- int sharpness_lvl);
-
-#endif // loopfilter_h
--- a/vp8/common/loopfilter_filters.c
+++ /dev/null
@@ -1,480 +1,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include <stdlib.h>
-#include "vpx_config.h"
-#include "loopfilter.h"
-#include "onyxc_int.h"
-
-typedef unsigned char uc;
-
-static __inline signed char signed_char_clamp(int t) {
- t = (t < -128 ? -128 : t);
- t = (t > 127 ? 127 : t);
- return (signed char) t;
-}
-
-
-/* should we apply any filter at all ( 11111111 yes, 00000000 no) */
-static __inline signed char filter_mask(uc limit, uc blimit,
- uc p3, uc p2, uc p1, uc p0,
- uc q0, uc q1, uc q2, uc q3) {
- signed char mask = 0;
- mask |= (abs(p3 - p2) > limit) * -1;
- mask |= (abs(p2 - p1) > limit) * -1;
- mask |= (abs(p1 - p0) > limit) * -1;
- mask |= (abs(q1 - q0) > limit) * -1;
- mask |= (abs(q2 - q1) > limit) * -1;
- mask |= (abs(q3 - q2) > limit) * -1;
- mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1;
- mask = ~mask;
- return mask;
-}
-
-/* is there high variance internal edge ( 11111111 yes, 00000000 no) */
-static __inline signed char hevmask(uc thresh, uc p1, uc p0, uc q0, uc q1) {
- signed char hev = 0;
- hev |= (abs(p1 - p0) > thresh) * -1;
- hev |= (abs(q1 - q0) > thresh) * -1;
- return hev;
-}
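
Both masks rely on (condition) * -1 producing an all-ones byte when the condition holds, so the |= chain accumulates "some step exceeds its limit", and in filter_mask() the final ~ flips that into "OK to filter". A tiny illustration of the idiom (not part of the deleted file):

#include <assert.h>
#include <stdlib.h>

int main(void) {
  signed char mask = 0;

  /* One term of the mask: -1 (all bits set) when the step is too large. */
  mask |= (abs(10 - 3) > 4) * -1;
  assert(mask == -1);

  /* filter_mask() ends with mask = ~mask, so ~(-1) == 0 means "don't
   * filter" while ~0 == -1 means "filter this pixel". */
  assert((signed char)~mask == 0);
  return 0;
}
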
-
-static __inline void filter(signed char mask, uc hev, uc *op1,
- uc *op0, uc *oq0, uc *oq1)
-
-{
- signed char ps0, qs0;
- signed char ps1, qs1;
- signed char filter, Filter1, Filter2;
- signed char u;
-
- ps1 = (signed char) * op1 ^ 0x80;
- ps0 = (signed char) * op0 ^ 0x80;
- qs0 = (signed char) * oq0 ^ 0x80;
- qs1 = (signed char) * oq1 ^ 0x80;
-
- /* add outer taps if we have high edge variance */
- filter = signed_char_clamp(ps1 - qs1);
- filter &= hev;
-
- /* inner taps */
- filter = signed_char_clamp(filter + 3 * (qs0 - ps0));
- filter &= mask;
-
- /* save bottom 3 bits so that we round one side +4 and the other +3
- * if it equals 4 we'll set to adjust by -1 to account for the fact
- * we'd round 3 the other way
- */
- Filter1 = signed_char_clamp(filter + 4);
- Filter2 = signed_char_clamp(filter + 3);
- Filter1 >>= 3;
- Filter2 >>= 3;
- u = signed_char_clamp(qs0 - Filter1);
- *oq0 = u ^ 0x80;
- u = signed_char_clamp(ps0 + Filter2);
- *op0 = u ^ 0x80;
- filter = Filter1;
-
- /* outer tap adjustments */
- filter += 1;
- filter >>= 1;
- filter &= ~hev;
-
- u = signed_char_clamp(qs1 - filter);
- *oq1 = u ^ 0x80;
- u = signed_char_clamp(ps1 + filter);
- *op1 = u ^ 0x80;
-
-}
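The Filter1/Filter2 pair splits the edge correction between the two sides of the boundary: adding 4 before the >>3 rounds one side up, adding 3 rounds the other down, and the halved, hev-gated value is what reaches the outer taps. A small worked sketch with illustrative numbers only (clamping omitted because these values stay in range):

#include <stdio.h>

int main(void) {
  signed char filter = 12;                             /* example combined filter value */
  signed char f1 = (signed char)((filter + 4) >> 3);   /* 2: subtracted from q0         */
  signed char f2 = (signed char)((filter + 3) >> 3);   /* 1: added to p0                */
  signed char outer = (signed char)((f1 + 1) >> 1);    /* 1: p1/q1 update when hev == 0 */
  printf("Filter1=%d Filter2=%d outer=%d\n", f1, f2, outer);
  return 0;
}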
-
-void vp9_loop_filter_horizontal_edge_c
-(
- unsigned char *s,
- int p, /* pitch */
- const unsigned char *blimit,
- const unsigned char *limit,
- const unsigned char *thresh,
- int count
-) {
- int hev = 0; /* high edge variance */
- signed char mask = 0;
- int i = 0;
-
- /* loop filter designed to work using chars so that we can make maximum use
- * of 8 bit simd instructions.
- */
- do {
- mask = filter_mask(limit[0], blimit[0],
- s[-4 * p], s[-3 * p], s[-2 * p], s[-1 * p],
- s[0 * p], s[1 * p], s[2 * p], s[3 * p]);
-
- hev = hevmask(thresh[0], s[-2 * p], s[-1 * p], s[0 * p], s[1 * p]);
-
- filter(mask, hev, s - 2 * p, s - 1 * p, s, s + 1 * p);
-
- ++s;
- } while (++i < count * 8);
-}
-
-void vp9_loop_filter_vertical_edge_c(unsigned char *s,
- int p,
- const unsigned char *blimit,
- const unsigned char *limit,
- const unsigned char *thresh,
- int count) {
- int hev = 0; /* high edge variance */
- signed char mask = 0;
- int i = 0;
-
- /* loop filter designed to work using chars so that we can make maximum use
- * of 8 bit simd instructions.
- */
- do {
- mask = filter_mask(limit[0], blimit[0],
- s[-4], s[-3], s[-2], s[-1],
- s[0], s[1], s[2], s[3]);
-
- hev = hevmask(thresh[0], s[-2], s[-1], s[0], s[1]);
-
- filter(mask, hev, s - 2, s - 1, s, s + 1);
-
- s += p;
- } while (++i < count * 8);
-}
-static __inline signed char flatmask(uc thresh,
- uc p4, uc p3, uc p2, uc p1, uc p0,
- uc q0, uc q1, uc q2, uc q3, uc q4) {
- signed char flat = 0;
- flat |= (abs(p1 - p0) > 1) * -1;
- flat |= (abs(q1 - q0) > 1) * -1;
- flat |= (abs(p0 - p2) > 1) * -1;
- flat |= (abs(q0 - q2) > 1) * -1;
- flat |= (abs(p3 - p0) > 1) * -1;
- flat |= (abs(q3 - q0) > 1) * -1;
- flat |= (abs(p4 - p0) > 1) * -1;
- flat |= (abs(q4 - q0) > 1) * -1;
- flat = ~flat;
- return flat;
-}
-
-static __inline void mbfilter(signed char mask, uc hev, uc flat,
- uc *op4, uc *op3, uc *op2, uc *op1, uc *op0,
- uc *oq0, uc *oq1, uc *oq2, uc *oq3, uc *oq4) {
- /* use a 7 tap filter [1, 1, 1, 2, 1, 1, 1] for flat line */
- if (flat && mask) {
- unsigned char p0, q0;
- unsigned char p1, q1;
- unsigned char p2, q2;
- unsigned char p3, q3;
- unsigned char p4, q4;
-
- p4 = *op4;
- p3 = *op3;
- p2 = *op2;
- p1 = *op1;
- p0 = *op0;
- q0 = *oq0;
- q1 = *oq1;
- q2 = *oq2;
- q3 = *oq3;
- q4 = *oq4;
-
- *op2 = (p4 + p4 + p3 + p2 + p2 + p1 + p0 + q0 + 4) >> 3;
- *op1 = (p4 + p3 + p2 + p1 + p1 + p0 + q0 + q1 + 4) >> 3;
- *op0 = (p3 + p2 + p1 + p0 + p0 + q0 + q1 + q2 + 4) >> 3;
- *oq0 = (p2 + p1 + p0 + q0 + q0 + q1 + q2 + q3 + 4) >> 3;
- *oq1 = (p1 + p0 + q0 + q1 + q1 + q2 + q3 + q4 + 4) >> 3;
- *oq2 = (p0 + q0 + q1 + q2 + q2 + q3 + q4 + q4 + 4) >> 3;
- } else {
- signed char ps0, qs0;
- signed char ps1, qs1;
- signed char filter, Filter1, Filter2;
- signed char u;
-
- ps1 = (signed char) * op1 ^ 0x80;
- ps0 = (signed char) * op0 ^ 0x80;
- qs0 = (signed char) * oq0 ^ 0x80;
- qs1 = (signed char) * oq1 ^ 0x80;
-
- /* add outer taps if we have high edge variance */
- filter = signed_char_clamp(ps1 - qs1);
- filter &= hev;
-
- /* inner taps */
- filter = signed_char_clamp(filter + 3 * (qs0 - ps0));
- filter &= mask;
-
- Filter1 = signed_char_clamp(filter + 4);
- Filter2 = signed_char_clamp(filter + 3);
- Filter1 >>= 3;
- Filter2 >>= 3;
-
- u = signed_char_clamp(qs0 - Filter1);
- *oq0 = u ^ 0x80;
- u = signed_char_clamp(ps0 + Filter2);
- *op0 = u ^ 0x80;
- filter = Filter1;
-
- /* outer tap adjustments */
- filter += 1;
- filter >>= 1;
- filter &= ~hev;
-
- u = signed_char_clamp(qs1 - filter);
- *oq1 = u ^ 0x80;
- u = signed_char_clamp(ps1 + filter);
- *op1 = u ^ 0x80;
- }
-}
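In the flat branch every output sample is an eight-sample average with the centre pixel counted twice (the [1, 1, 1, 2, 1, 1, 1] taps plus a rounding constant of 4, then >>3). A standalone sketch of the *op0 line with made-up pixel values (not codec code):

#include <stdio.h>

/* op0 is replaced by ([1 1 1 2 1 1 1] . [p3 p2 p1 p0 q0 q1 q2] + 4) >> 3 */
int main(void) {
  unsigned char p3 = 40, p2 = 42, p1 = 44, p0 = 46, q0 = 60, q1 = 62, q2 = 64;
  unsigned char new_p0 =
      (unsigned char)((p3 + p2 + p1 + p0 + p0 + q0 + q1 + q2 + 4) >> 3);
  printf("p0: %d -> %d\n", p0, new_p0);  /* 46 -> 51: edge smoothed toward the q side */
  return 0;
}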
-void vp9_mbloop_filter_horizontal_edge_c
-(
- unsigned char *s,
- int p,
- const unsigned char *blimit,
- const unsigned char *limit,
- const unsigned char *thresh,
- int count
-) {
- signed char hev = 0; /* high edge variance */
- signed char mask = 0;
- signed char flat = 0;
- int i = 0;
-
- /* loop filter designed to work using chars so that we can make maximum use
- * of 8 bit simd instructions.
- */
- do {
-
- mask = filter_mask(limit[0], blimit[0],
- s[-4 * p], s[-3 * p], s[-2 * p], s[-1 * p],
- s[ 0 * p], s[ 1 * p], s[ 2 * p], s[ 3 * p]);
-
- hev = hevmask(thresh[0], s[-2 * p], s[-1 * p], s[0 * p], s[1 * p]);
-
- flat = flatmask(thresh[0],
- s[-5 * p], s[-4 * p], s[-3 * p], s[-2 * p], s[-1 * p],
- s[ 0 * p], s[ 1 * p], s[ 2 * p], s[ 3 * p], s[ 4 * p]);
- mbfilter(mask, hev, flat,
- s - 5 * p, s - 4 * p, s - 3 * p, s - 2 * p, s - 1 * p,
- s, s + 1 * p, s + 2 * p, s + 3 * p, s + 4 * p);
-
- ++s;
- } while (++i < count * 8);
-
-}
-void vp9_mbloop_filter_vertical_edge_c
-(
- unsigned char *s,
- int p,
- const unsigned char *blimit,
- const unsigned char *limit,
- const unsigned char *thresh,
- int count
-) {
- signed char hev = 0; /* high edge variance */
- signed char mask = 0;
- signed char flat = 0;
- int i = 0;
-
- do {
-
- mask = filter_mask(limit[0], blimit[0],
- s[-4], s[-3], s[-2], s[-1],
- s[0], s[1], s[2], s[3]);
-
- hev = hevmask(thresh[0], s[-2], s[-1], s[0], s[1]);
- flat = flatmask(thresh[0],
- s[-5], s[-4], s[-3], s[-2], s[-1],
- s[ 0], s[ 1], s[ 2], s[ 3], s[ 4]);
- mbfilter(mask, hev, flat,
- s - 5, s - 4, s - 3, s - 2, s - 1,
- s, s + 1, s + 2, s + 3, s + 4);
- s += p;
- } while (++i < count * 8);
-
-}
-
-/* should we apply any filter at all ( 11111111 yes, 00000000 no) */
-static __inline signed char simple_filter_mask(uc blimit,
- uc p1, uc p0,
- uc q0, uc q1) {
- /* Why does this cause problems for win32?
- * error C2143: syntax error : missing ';' before 'type'
- * (void) limit;
- */
- signed char mask = (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 <= blimit) * -1;
- return mask;
-}
-
-static __inline void simple_filter(signed char mask,
- uc *op1, uc *op0,
- uc *oq0, uc *oq1) {
- signed char filter, Filter1, Filter2;
- signed char p1 = (signed char) * op1 ^ 0x80;
- signed char p0 = (signed char) * op0 ^ 0x80;
- signed char q0 = (signed char) * oq0 ^ 0x80;
- signed char q1 = (signed char) * oq1 ^ 0x80;
- signed char u;
-
- filter = signed_char_clamp(p1 - q1);
- filter = signed_char_clamp(filter + 3 * (q0 - p0));
- filter &= mask;
-
- /* save bottom 3 bits so that we round one side +4 and the other +3 */
- Filter1 = signed_char_clamp(filter + 4);
- Filter1 >>= 3;
- u = signed_char_clamp(q0 - Filter1);
- *oq0 = u ^ 0x80;
-
- Filter2 = signed_char_clamp(filter + 3);
- Filter2 >>= 3;
- u = signed_char_clamp(p0 + Filter2);
- *op0 = u ^ 0x80;
-}
-
-void vp9_loop_filter_simple_horizontal_edge_c
-(
- unsigned char *s,
- int p,
- const unsigned char *blimit
-) {
- signed char mask = 0;
- int i = 0;
-
- do {
- mask = simple_filter_mask(blimit[0],
- s[-2 * p], s[-1 * p],
- s[0 * p], s[1 * p]);
- simple_filter(mask,
- s - 2 * p, s - 1 * p,
- s, s + 1 * p);
- ++s;
- } while (++i < 16);
-}
-
-void vp9_loop_filter_simple_vertical_edge_c
-(
- unsigned char *s,
- int p,
- const unsigned char *blimit
-) {
- signed char mask = 0;
- int i = 0;
-
- do {
- mask = simple_filter_mask(blimit[0], s[-2], s[-1], s[0], s[1]);
- simple_filter(mask, s - 2, s - 1, s, s + 1);
- s += p;
- } while (++i < 16);
-
-}
-
-/* Vertical MB Filtering */
-void vp9_loop_filter_mbv_c(unsigned char *y_ptr, unsigned char *u_ptr,
- unsigned char *v_ptr, int y_stride, int uv_stride,
- struct loop_filter_info *lfi) {
- vp9_mbloop_filter_vertical_edge_c(y_ptr, y_stride,
- lfi->mblim, lfi->lim, lfi->hev_thr, 2);
-
- if (u_ptr)
- vp9_mbloop_filter_vertical_edge_c(u_ptr, uv_stride,
- lfi->mblim, lfi->lim, lfi->hev_thr, 1);
-
- if (v_ptr)
- vp9_mbloop_filter_vertical_edge_c(v_ptr, uv_stride,
- lfi->mblim, lfi->lim, lfi->hev_thr, 1);
-}
-
-/* Vertical B Filtering */
-void vp9_loop_filter_bv_c(unsigned char *y_ptr, unsigned char *u_ptr,
- unsigned char *v_ptr, int y_stride, int uv_stride,
- struct loop_filter_info *lfi) {
- vp9_loop_filter_vertical_edge_c(y_ptr + 4, y_stride,
- lfi->blim, lfi->lim, lfi->hev_thr, 2);
- vp9_loop_filter_vertical_edge_c(y_ptr + 8, y_stride,
- lfi->blim, lfi->lim, lfi->hev_thr, 2);
- vp9_loop_filter_vertical_edge_c(y_ptr + 12, y_stride,
- lfi->blim, lfi->lim, lfi->hev_thr, 2);
-
- if (u_ptr)
- vp9_loop_filter_vertical_edge_c(u_ptr + 4, uv_stride,
- lfi->blim, lfi->lim, lfi->hev_thr, 1);
-
- if (v_ptr)
- vp9_loop_filter_vertical_edge_c(v_ptr + 4, uv_stride,
- lfi->blim, lfi->lim, lfi->hev_thr, 1);
-}
-
-/* Horizontal MB filtering */
-void vp9_loop_filter_mbh_c(unsigned char *y_ptr, unsigned char *u_ptr,
- unsigned char *v_ptr, int y_stride, int uv_stride,
- struct loop_filter_info *lfi) {
- vp9_mbloop_filter_horizontal_edge_c(y_ptr, y_stride,
- lfi->mblim, lfi->lim, lfi->hev_thr, 2);
-
- if (u_ptr)
- vp9_mbloop_filter_horizontal_edge_c(u_ptr, uv_stride,
- lfi->mblim, lfi->lim, lfi->hev_thr, 1);
-
- if (v_ptr)
- vp9_mbloop_filter_horizontal_edge_c(v_ptr, uv_stride,
- lfi->mblim, lfi->lim, lfi->hev_thr, 1);
-}
-
-/* Horizontal B Filtering */
-void vp9_loop_filter_bh_c(unsigned char *y_ptr, unsigned char *u_ptr,
- unsigned char *v_ptr, int y_stride, int uv_stride,
- struct loop_filter_info *lfi) {
- vp9_loop_filter_horizontal_edge_c(y_ptr + 4 * y_stride, y_stride,
- lfi->blim, lfi->lim, lfi->hev_thr, 2);
- vp9_loop_filter_horizontal_edge_c(y_ptr + 8 * y_stride, y_stride,
- lfi->blim, lfi->lim, lfi->hev_thr, 2);
- vp9_loop_filter_horizontal_edge_c(y_ptr + 12 * y_stride, y_stride,
- lfi->blim, lfi->lim, lfi->hev_thr, 2);
-
- if (u_ptr)
- vp9_loop_filter_horizontal_edge_c(u_ptr + 4 * uv_stride, uv_stride,
- lfi->blim, lfi->lim, lfi->hev_thr, 1);
-
- if (v_ptr)
- vp9_loop_filter_horizontal_edge_c(v_ptr + 4 * uv_stride, uv_stride,
- lfi->blim, lfi->lim, lfi->hev_thr, 1);
-}
-
-void vp9_loop_filter_bh8x8_c(unsigned char *y_ptr, unsigned char *u_ptr,
- unsigned char *v_ptr, int y_stride, int uv_stride,
- struct loop_filter_info *lfi) {
- vp9_mbloop_filter_horizontal_edge_c(
- y_ptr + 8 * y_stride, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
-}
-
-void vp9_loop_filter_bhs_c(unsigned char *y_ptr, int y_stride,
- const unsigned char *blimit) {
- vp9_loop_filter_simple_horizontal_edge_c(y_ptr + 4 * y_stride,
- y_stride, blimit);
- vp9_loop_filter_simple_horizontal_edge_c(y_ptr + 8 * y_stride,
- y_stride, blimit);
- vp9_loop_filter_simple_horizontal_edge_c(y_ptr + 12 * y_stride,
- y_stride, blimit);
-}
-
-void vp9_loop_filter_bv8x8_c(unsigned char *y_ptr, unsigned char *u_ptr,
- unsigned char *v_ptr, int y_stride, int uv_stride,
- struct loop_filter_info *lfi) {
- vp9_mbloop_filter_vertical_edge_c(
- y_ptr + 8, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
-}
-
-void vp9_loop_filter_bvs_c(unsigned char *y_ptr, int y_stride,
- const unsigned char *blimit) {
- vp9_loop_filter_simple_vertical_edge_c(y_ptr + 4, y_stride, blimit);
- vp9_loop_filter_simple_vertical_edge_c(y_ptr + 8, y_stride, blimit);
- vp9_loop_filter_simple_vertical_edge_c(y_ptr + 12, y_stride, blimit);
-}
--- a/vp8/common/maskingmv.c
+++ /dev/null
@@ -1,806 +1,0 @@
-/*
- ============================================================================
- Name : maskingmv.c
- Author : jimbankoski
- Version :
- Copyright : Your copyright notice
- Description : Hello World in C, Ansi-style
- ============================================================================
- */
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-extern unsigned int vp9_sad16x16_sse3(
- unsigned char *src_ptr,
- int src_stride,
- unsigned char *ref_ptr,
- int ref_stride,
- int max_err);
-
-extern void vp9_sad16x16x3_sse3(
- unsigned char *src_ptr,
- int src_stride,
- unsigned char *ref_ptr,
- int ref_stride,
- int *results);
-
-extern int vp8_growmaskmb_sse3(
- unsigned char *om,
- unsigned char *nm);
-
-extern void vp8_makemask_sse3(
- unsigned char *y,
- unsigned char *u,
- unsigned char *v,
- unsigned char *ym,
- int yp,
- int uvp,
- int ys,
- int us,
- int vs,
- int yt,
- int ut,
- int vt);
-
-unsigned int vp9_sad16x16_unmasked_wmt(
- unsigned char *src_ptr,
- int src_stride,
- unsigned char *ref_ptr,
- int ref_stride,
- unsigned char *mask);
-
-unsigned int vp9_sad16x16_masked_wmt(
- unsigned char *src_ptr,
- int src_stride,
- unsigned char *ref_ptr,
- int ref_stride,
- unsigned char *mask);
-
-unsigned int vp8_masked_predictor_wmt(
- unsigned char *masked,
- unsigned char *unmasked,
- int src_stride,
- unsigned char *dst_ptr,
- int dst_stride,
- unsigned char *mask);
-unsigned int vp8_masked_predictor_uv_wmt(
- unsigned char *masked,
- unsigned char *unmasked,
- int src_stride,
- unsigned char *dst_ptr,
- int dst_stride,
- unsigned char *mask);
-unsigned int vp8_uv_from_y_mask(
- unsigned char *ymask,
- unsigned char *uvmask);
-int yp = 16;
-unsigned char sxy[] = {
- 40, 40, 40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 80, 120, 120, 90, 90, 90, 90, 90, 80, 120, 120, 90, 90, 90, 90, 90,
- 40, 40, 40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 80, 120, 120, 90, 90, 90, 90, 90, 80, 120, 120, 90, 90, 90, 90, 90,
- 40, 40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 40, 80, 120, 120, 90, 90, 90, 90, 90, 80, 120, 120, 90, 90, 90, 90, 90,
- 40, 40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 40, 80, 120, 120, 90, 90, 90, 90, 90, 80, 120, 120, 90, 90, 90, 90, 90,
- 40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 40, 40, 80, 120, 120, 90, 90, 90, 90, 90, 80, 120, 120, 90, 90, 90, 90, 90,
- 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 40, 40, 40, 80, 120, 120, 90, 90, 90, 90, 90, 80, 120, 120, 90, 90, 90, 90, 90,
- 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 40, 40, 40, 80, 120, 120, 90, 90, 90, 90, 90, 80, 120, 120, 90, 90, 90, 90, 90,
- 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 40, 40, 40, 80, 120, 120, 90, 90, 90, 90, 90, 80, 120, 120, 90, 90, 90, 90, 90,
- 40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 40, 40, 80, 120, 120, 90, 90, 90, 90, 90, 80, 120, 120, 90, 90, 90, 90, 90,
- 40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 40, 40, 80, 120, 120, 90, 90, 90, 90, 90, 80, 120, 120, 90, 90, 90, 90, 90,
- 40, 40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 40, 80, 120, 120, 90, 90, 90, 90, 90, 80, 120, 120, 90, 90, 90, 90, 90,
- 40, 40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 40, 80, 120, 120, 90, 90, 90, 90, 90, 80, 120, 120, 90, 90, 90, 90, 90,
- 40, 40, 40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 80, 120, 120, 90, 90, 90, 90, 90, 80, 120, 120, 90, 90, 90, 90, 90,
- 40, 40, 40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 80, 120, 120, 90, 90, 90, 90, 90, 80, 120, 120, 90, 90, 90, 90, 90,
- 40, 40, 40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 80, 120, 120, 90, 90, 90, 90, 90, 80, 120, 120, 90, 90, 90, 90, 90,
- 40, 40, 40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 80, 120, 120, 90, 90, 90, 90, 90, 80, 120, 120, 90, 90, 90, 90, 90
-};
-
-unsigned char sts[] = {
- 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
- 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
- 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
- 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
- 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
- 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
- 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
- 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
- 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
- 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
- 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
- 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
- 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
- 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
- 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
- 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
-};
-unsigned char str[] = {
- 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
- 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
- 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
- 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
- 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
- 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
- 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
- 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
- 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
- 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
- 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
- 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
- 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
- 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
- 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
- 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
-};
-
-unsigned char y[] = {
- 40, 40, 40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40,
- 40, 40, 40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40,
- 40, 40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 40,
- 40, 40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 40,
- 40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 40, 40,
- 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 40, 40, 40,
- 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 40, 40, 40,
- 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 40, 40, 40,
- 40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 40, 40,
- 40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 40, 40,
- 40, 40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 40,
- 40, 40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 40,
- 40, 40, 40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40,
- 40, 40, 40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40,
- 40, 40, 40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40,
- 40, 40, 40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40
-};
-int uvp = 8;
-unsigned char u[] = {
- 90, 80, 70, 70, 90, 90, 90, 17,
- 90, 80, 70, 70, 90, 90, 90, 17,
- 84, 70, 70, 90, 90, 90, 17, 17,
- 84, 70, 70, 90, 90, 90, 17, 17,
- 80, 70, 70, 90, 90, 90, 17, 17,
- 90, 80, 70, 70, 90, 90, 90, 17,
- 90, 80, 70, 70, 90, 90, 90, 17,
- 90, 80, 70, 70, 90, 90, 90, 17
-};
-
-unsigned char v[] = {
- 80, 80, 80, 80, 80, 80, 80, 80,
- 80, 80, 80, 80, 80, 80, 80, 80,
- 80, 80, 80, 80, 80, 80, 80, 80,
- 80, 80, 80, 80, 80, 80, 80, 80,
- 80, 80, 80, 80, 80, 80, 80, 80,
- 80, 80, 80, 80, 80, 80, 80, 80,
- 80, 80, 80, 80, 80, 80, 80, 80,
- 80, 80, 80, 80, 80, 80, 80, 80
-};
-
-unsigned char ym[256];
-unsigned char uvm[64];
-typedef struct {
- unsigned char y;
- unsigned char yt;
- unsigned char u;
- unsigned char ut;
- unsigned char v;
- unsigned char vt;
- unsigned char use;
-} COLOR_SEG_ELEMENT;
-
-/*
-COLOR_SEG_ELEMENT segmentation[]=
-{
- { 60,4,80,17,80,10, 1},
- { 40,4,15,10,80,10, 1},
-};
-*/
-
-COLOR_SEG_ELEMENT segmentation[] = {
- { 79, 44, 92, 44, 237, 60, 1},
-};
-
-unsigned char pixel_mask(unsigned char y, unsigned char u, unsigned char v,
- COLOR_SEG_ELEMENT sgm[],
- int c) {
- COLOR_SEG_ELEMENT *s = sgm;
- unsigned char m = 0;
- int i;
- for (i = 0; i < c; i++, s++)
- m |= (abs(y - s->y) < s->yt &&
- abs(u - s->u) < s->ut &&
- abs(v - s->v) < s->vt ? 255 : 0);
-
- return m;
-}
-int neighbors[256][8];
-int makeneighbors(void) {
- int i, j;
- for (i = 0; i < 256; i++) {
- int r = (i >> 4), c = (i & 15);
- int ni = 0;
- for (j = 0; j < 8; j++)
- neighbors[i][j] = i;
-    for (j = 0; j < 256; j++) {
-      int nr = (j >> 4), nc = (j & 15);
-      /* skip j == i so at most 8 neighbours are stored in the 8-wide row */
-      if (j != i && abs(nr - r) < 2 && abs(nc - c) < 2)
-        neighbors[i][ni++] = j;
-    }
- }
- return 0;
-}
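The table indexes a 16x16 block as i = 16*row + col and records, for each position, the indices within one row and one column of it; grow_ymask() below ORs those entries together to dilate the mask by one pixel. A standalone sketch of the same indexing (not the codec's table):

#include <stdio.h>

/* Print the 8-connected neighbour indices of (r, c) = (5, 7) in a 16x16 grid,
 * using the same i = 16*r + c numbering as the table above.                   */
int main(void) {
  int r = 5, c = 7, i = r * 16 + c;
  int dr, dc;
  printf("index %d has neighbours:", i);
  for (dr = -1; dr <= 1; dr++)
    for (dc = -1; dc <= 1; dc++)
      if (dr || dc)                       /* skip the centre itself */
        printf(" %d", (r + dr) * 16 + (c + dc));
  printf("\n");
  return 0;
}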
-void grow_ymask(unsigned char *ym) {
- unsigned char nym[256];
- int i, j;
-
- for (i = 0; i < 256; i++) {
- nym[i] = ym[i];
- for (j = 0; j < 8; j++) {
- nym[i] |= ym[neighbors[i][j]];
- }
- }
- for (i = 0; i < 256; i++)
- ym[i] = nym[i];
-}
-void make_mb_mask(unsigned char *y, unsigned char *u, unsigned char *v,
- unsigned char *ym, unsigned char *uvm,
- int yp, int uvp,
- COLOR_SEG_ELEMENT sgm[],
- int count) {
- int r, c;
- unsigned char *oym = ym;
-
- memset(ym, 20, 256);
- for (r = 0; r < 8; r++, uvm += 8, u += uvp, v += uvp, y += (yp + yp), ym += 32)
- for (c = 0; c < 8; c++) {
- int y1 = y[c << 1];
- int u1 = u[c];
- int v1 = v[c];
- int m = pixel_mask(y1, u1, v1, sgm, count);
- uvm[c] = m;
- ym[c << 1] = uvm[c]; // = pixel_mask(y[c<<1],u[c],v[c],sgm,count);
- ym[(c << 1) + 1] = pixel_mask(y[1 + (c << 1)], u[c], v[c], sgm, count);
- ym[(c << 1) + 16] = pixel_mask(y[yp + (c << 1)], u[c], v[c], sgm, count);
- ym[(c << 1) + 17] = pixel_mask(y[1 + yp + (c << 1)], u[c], v[c], sgm, count);
- }
- grow_ymask(oym);
-}
-
-int masked_sad(unsigned char *src, int p, unsigned char *dst, int dp,
- unsigned char *ym) {
- int i, j;
- unsigned sad = 0;
- for (i = 0; i < 16; i++, src += p, dst += dp, ym += 16)
- for (j = 0; j < 16; j++)
- if (ym[j])
- sad += abs(src[j] - dst[j]);
-
- return sad;
-}
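masked_sad() accumulates |src - dst| only where the mask byte is set and unmasked_sad() only where it is clear, so over any one mask the two sums add up to the plain 16x16 SAD; the two-stage search below relies on this when it scores the masked part against one candidate mv and the unmasked part against another. A standalone sketch of that identity with toy data (not codec code):

#include <stdio.h>
#include <stdlib.h>

int main(void) {
  unsigned char src[256], dst[256], mask[256];
  unsigned masked = 0, unmasked = 0, full = 0;
  int i;
  for (i = 0; i < 256; i++) {            /* arbitrary toy data */
    src[i] = (unsigned char)(i & 0xff);
    dst[i] = (unsigned char)((i * 7) & 0xff);
    mask[i] = (i % 3) ? 255 : 0;
  }
  for (i = 0; i < 256; i++) {
    int d = abs(src[i] - dst[i]);
    full += d;
    if (mask[i]) masked += d; else unmasked += d;
  }
  printf("masked=%u unmasked=%u full=%u\n", masked, unmasked, full);
  return 0;
}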
-
-int compare_masks(unsigned char *sym, unsigned char *ym) {
- int i, j;
- unsigned sad = 0;
- for (i = 0; i < 16; i++, sym += 16, ym += 16)
- for (j = 0; j < 16; j++)
- sad += (sym[j] != ym[j] ? 1 : 0);
-
- return sad;
-}
-int unmasked_sad(unsigned char *src, int p, unsigned char *dst, int dp,
- unsigned char *ym) {
- int i, j;
- unsigned sad = 0;
- for (i = 0; i < 16; i++, src += p, dst += dp, ym += 16)
- for (j = 0; j < 16; j++)
- if (!ym[j])
- sad += abs(src[j] - dst[j]);
-
- return sad;
-}
-int masked_motion_search(unsigned char *y, unsigned char *u, unsigned char *v,
- int yp, int uvp,
- unsigned char *dy, unsigned char *du, unsigned char *dv,
- int dyp, int duvp,
- COLOR_SEG_ELEMENT sgm[],
- int count,
- int *mi,
- int *mj,
- int *ui,
- int *uj,
- int *wm) {
- int i, j;
-
- unsigned char ym[256];
- unsigned char uvm[64];
- unsigned char dym[256];
- unsigned char duvm[64];
- unsigned int e = 0;
- int beste = 256;
- int bmi = -32, bmj = -32;
- int bui = -32, buj = -32;
- int beste1 = 256;
- int bmi1 = -32, bmj1 = -32;
- int bui1 = -32, buj1 = -32;
- int obeste;
-
- // first try finding best mask and then unmasked
- beste = 0xffffffff;
-
- // find best unmasked mv
- for (i = -32; i < 32; i++) {
- unsigned char *dyz = i * dyp + dy;
- unsigned char *duz = i / 2 * duvp + du;
- unsigned char *dvz = i / 2 * duvp + dv;
- for (j = -32; j < 32; j++) {
- // 0,0 masked destination
- make_mb_mask(dyz + j, duz + j / 2, dvz + j / 2, dym, duvm, dyp, duvp, sgm, count);
-
- e = unmasked_sad(y, yp, dyz + j, dyp, dym);
-
- if (e < beste) {
- bui = i;
- buj = j;
- beste = e;
- }
- }
- }
- // bui=0;buj=0;
- // best mv masked destination
- make_mb_mask(dy + bui * dyp + buj, du + bui / 2 * duvp + buj / 2, dv + bui / 2 * duvp + buj / 2,
- dym, duvm, dyp, duvp, sgm, count);
-
- obeste = beste;
- beste = 0xffffffff;
-
- // find best masked
- for (i = -32; i < 32; i++) {
- unsigned char *dyz = i * dyp + dy;
- for (j = -32; j < 32; j++) {
- e = masked_sad(y, yp, dyz + j, dyp, dym);
-
- if (e < beste) {
- bmi = i;
- bmj = j;
- beste = e;
- }
- }
- }
- beste1 = beste + obeste;
- bmi1 = bmi;
- bmj1 = bmj;
- bui1 = bui;
- buj1 = buj;
-
- beste = 0xffffffff;
- // source mask
- make_mb_mask(y, u, v, ym, uvm, yp, uvp, sgm, count);
-
- // find best mask
- for (i = -32; i < 32; i++) {
- unsigned char *dyz = i * dyp + dy;
- unsigned char *duz = i / 2 * duvp + du;
- unsigned char *dvz = i / 2 * duvp + dv;
- for (j = -32; j < 32; j++) {
- // 0,0 masked destination
- make_mb_mask(dyz + j, duz + j / 2, dvz + j / 2, dym, duvm, dyp, duvp, sgm, count);
-
- e = compare_masks(ym, dym);
-
- if (e < beste) {
- bmi = i;
- bmj = j;
- beste = e;
- }
- }
- }
-
-
- // best mv masked destination
- make_mb_mask(dy + bmi * dyp + bmj, du + bmi / 2 * duvp + bmj / 2, dv + bmi / 2 * duvp + bmj / 2,
- dym, duvm, dyp, duvp, sgm, count);
-
- obeste = masked_sad(y, yp, dy + bmi * dyp + bmj, dyp, dym);
-
- beste = 0xffffffff;
-
- // find best unmasked mv
- for (i = -32; i < 32; i++) {
- unsigned char *dyz = i * dyp + dy;
- for (j = -32; j < 32; j++) {
- e = unmasked_sad(y, yp, dyz + j, dyp, dym);
-
- if (e < beste) {
- bui = i;
- buj = j;
- beste = e;
- }
- }
- }
- beste += obeste;
-
-
- if (beste < beste1) {
- *mi = bmi;
- *mj = bmj;
- *ui = bui;
- *uj = buj;
- *wm = 1;
- } else {
- *mi = bmi1;
- *mj = bmj1;
- *ui = bui1;
- *uj = buj1;
- *wm = 0;
-
- }
- return 0;
-}
-
-int predict(unsigned char *src, int p, unsigned char *dst, int dp,
- unsigned char *ym, unsigned char *prd) {
- int i, j;
- for (i = 0; i < 16; i++, src += p, dst += dp, ym += 16, prd += 16)
- for (j = 0; j < 16; j++)
- prd[j] = (ym[j] ? src[j] : dst[j]);
- return 0;
-}
-
-int fast_masked_motion_search(unsigned char *y, unsigned char *u, unsigned char *v,
- int yp, int uvp,
- unsigned char *dy, unsigned char *du, unsigned char *dv,
- int dyp, int duvp,
- COLOR_SEG_ELEMENT sgm[],
- int count,
- int *mi,
- int *mj,
- int *ui,
- int *uj,
- int *wm) {
- int i, j;
-
- unsigned char ym[256];
- unsigned char ym2[256];
- unsigned char uvm[64];
- unsigned char dym2[256];
- unsigned char dym[256];
- unsigned char duvm[64];
- unsigned int e = 0;
- int beste = 256;
- int bmi = -32, bmj = -32;
- int bui = -32, buj = -32;
- int beste1 = 256;
- int bmi1 = -32, bmj1 = -32;
- int bui1 = -32, buj1 = -32;
- int obeste;
-
- // first try finding best mask and then unmasked
- beste = 0xffffffff;
-
-#if 0
- for (i = 0; i < 16; i++) {
- unsigned char *dy = i * yp + y;
- for (j = 0; j < 16; j++)
- printf("%2x", dy[j]);
- printf("\n");
- }
- printf("\n");
-
- for (i = -32; i < 48; i++) {
- unsigned char *dyz = i * dyp + dy;
- for (j = -32; j < 48; j++)
- printf("%2x", dyz[j]);
- printf("\n");
- }
-#endif
-
- // find best unmasked mv
- for (i = -32; i < 32; i++) {
- unsigned char *dyz = i * dyp + dy;
- unsigned char *duz = i / 2 * duvp + du;
- unsigned char *dvz = i / 2 * duvp + dv;
- for (j = -32; j < 32; j++) {
- // 0,0 masked destination
- vp8_makemask_sse3(dyz + j, duz + j / 2, dvz + j / 2, dym, dyp, duvp,
- sgm[0].y, sgm[0].u, sgm[0].v,
- sgm[0].yt, sgm[0].ut, sgm[0].vt);
-
- vp8_growmaskmb_sse3(dym, dym2);
-
- e = vp9_sad16x16_unmasked_wmt(y, yp, dyz + j, dyp, dym2);
-
- if (e < beste) {
- bui = i;
- buj = j;
- beste = e;
- }
- }
- }
- // bui=0;buj=0;
- // best mv masked destination
-
- vp8_makemask_sse3(dy + bui * dyp + buj, du + bui / 2 * duvp + buj / 2, dv + bui / 2 * duvp + buj / 2,
- dym, dyp, duvp,
- sgm[0].y, sgm[0].u, sgm[0].v,
- sgm[0].yt, sgm[0].ut, sgm[0].vt);
-
- vp8_growmaskmb_sse3(dym, dym2);
-
- obeste = beste;
- beste = 0xffffffff;
-
- // find best masked
- for (i = -32; i < 32; i++) {
- unsigned char *dyz = i * dyp + dy;
- for (j = -32; j < 32; j++) {
- e = vp9_sad16x16_masked_wmt(y, yp, dyz + j, dyp, dym2);
- if (e < beste) {
- bmi = i;
- bmj = j;
- beste = e;
- }
- }
- }
- beste1 = beste + obeste;
- bmi1 = bmi;
- bmj1 = bmj;
- bui1 = bui;
- buj1 = buj;
-
- // source mask
- vp8_makemask_sse3(y, u, v,
- ym, yp, uvp,
- sgm[0].y, sgm[0].u, sgm[0].v,
- sgm[0].yt, sgm[0].ut, sgm[0].vt);
-
- vp8_growmaskmb_sse3(ym, ym2);
-
- // find best mask
- for (i = -32; i < 32; i++) {
- unsigned char *dyz = i * dyp + dy;
- unsigned char *duz = i / 2 * duvp + du;
- unsigned char *dvz = i / 2 * duvp + dv;
- for (j = -32; j < 32; j++) {
- // 0,0 masked destination
- vp8_makemask_sse3(dyz + j, duz + j / 2, dvz + j / 2, dym, dyp, duvp,
- sgm[0].y, sgm[0].u, sgm[0].v,
- sgm[0].yt, sgm[0].ut, sgm[0].vt);
-
- vp8_growmaskmb_sse3(dym, dym2);
-
- e = compare_masks(ym2, dym2);
-
- if (e < beste) {
- bmi = i;
- bmj = j;
- beste = e;
- }
- }
- }
-
- vp8_makemask_sse3(dy + bmi * dyp + bmj, du + bmi / 2 * duvp + bmj / 2, dv + bmi / 2 * duvp + bmj / 2,
- dym, dyp, duvp,
- sgm[0].y, sgm[0].u, sgm[0].v,
- sgm[0].yt, sgm[0].ut, sgm[0].vt);
-
- vp8_growmaskmb_sse3(dym, dym2);
-
- obeste = vp9_sad16x16_masked_wmt(y, yp, dy + bmi * dyp + bmj, dyp, dym2);
-
- beste = 0xffffffff;
-
- // find best unmasked mv
- for (i = -32; i < 32; i++) {
- unsigned char *dyz = i * dyp + dy;
- for (j = -32; j < 32; j++) {
- e = vp9_sad16x16_unmasked_wmt(y, yp, dyz + j, dyp, dym2);
-
- if (e < beste) {
- bui = i;
- buj = j;
- beste = e;
- }
- }
- }
- beste += obeste;
-
- if (beste < beste1) {
- *mi = bmi;
- *mj = bmj;
- *ui = bui;
- *uj = buj;
- *wm = 1;
- } else {
- *mi = bmi1;
- *mj = bmj1;
- *ui = bui1;
- *uj = buj1;
- *wm = 0;
- beste = beste1;
-
- }
- return beste;
-}
-
-int predict_all(unsigned char *ym, unsigned char *um, unsigned char *vm,
- int ymp, int uvmp,
- unsigned char *yp, unsigned char *up, unsigned char *vp,
- int ypp, int uvpp,
- COLOR_SEG_ELEMENT sgm[],
- int count,
- int mi,
- int mj,
- int ui,
- int uj,
- int wm) {
- int i, j;
- unsigned char dym[256];
- unsigned char dym2[256];
- unsigned char duvm[64];
- unsigned char *yu = ym, *uu = um, *vu = vm;
-
- unsigned char *dym3 = dym2;
-
- ym += mi * ymp + mj;
- um += mi / 2 * uvmp + mj / 2;
- vm += mi / 2 * uvmp + mj / 2;
-
- yu += ui * ymp + uj;
- uu += ui / 2 * uvmp + uj / 2;
- vu += ui / 2 * uvmp + uj / 2;
-
- // best mv masked destination
- if (wm)
- vp8_makemask_sse3(ym, um, vm, dym, ymp, uvmp,
- sgm[0].y, sgm[0].u, sgm[0].v,
- sgm[0].yt, sgm[0].ut, sgm[0].vt);
- else
- vp8_makemask_sse3(yu, uu, vu, dym, ymp, uvmp,
- sgm[0].y, sgm[0].u, sgm[0].v,
- sgm[0].yt, sgm[0].ut, sgm[0].vt);
-
- vp8_growmaskmb_sse3(dym, dym2);
- vp8_masked_predictor_wmt(ym, yu, ymp, yp, ypp, dym3);
- vp8_uv_from_y_mask(dym3, duvm);
- vp8_masked_predictor_uv_wmt(um, uu, uvmp, up, uvpp, duvm);
- vp8_masked_predictor_uv_wmt(vm, vu, uvmp, vp, uvpp, duvm);
-
- return 0;
-}
-
-unsigned char f0p[1280 * 720 * 3 / 2];
-unsigned char f1p[1280 * 720 * 3 / 2];
-unsigned char prd[1280 * 720 * 3 / 2];
-unsigned char msk[1280 * 720 * 3 / 2];
-
-
-int mainz(int argc, char *argv[]) {
-
- FILE *f = fopen(argv[1], "rb");
- FILE *g = fopen(argv[2], "wb");
- int w = atoi(argv[3]), h = atoi(argv[4]);
- int y_stride = w, uv_stride = w / 2;
- int r, c;
- unsigned char *f0 = f0p, *f1 = f1p, *t;
- unsigned char ym[256], uvm[64];
- unsigned char ym2[256], uvm2[64];
- unsigned char ym3[256], uvm3[64];
- int a, b;
-
- COLOR_SEG_ELEMENT last = { 20, 20, 20, 20, 230, 20, 1}, best;
-#if 0
- makeneighbors();
- COLOR_SEG_ELEMENT segmentation[] = {
- { 60, 4, 80, 17, 80, 10, 1},
- { 40, 4, 15, 10, 80, 10, 1},
- };
- make_mb_mask(y, u, v, ym2, uvm2, 16, 8, segmentation, 1);
-
- vp8_makemask_sse3(y, u, v, ym, (int) 16, (int) 8,
- (int) segmentation[0].y, (int) segmentation[0].u, (int) segmentation[0].v,
- segmentation[0].yt, segmentation[0].ut, segmentation[0].vt);
-
- vp8_growmaskmb_sse3(ym, ym3);
-
- a = vp9_sad16x16_masked_wmt(str, 16, sts, 16, ym3);
- b = vp9_sad16x16_unmasked_wmt(str, 16, sts, 16, ym3);
-
- vp8_masked_predictor_wmt(str, sts, 16, ym, 16, ym3);
-
- vp8_uv_from_y_mask(ym3, uvm3);
-
- return 4;
-#endif
- makeneighbors();
-
-
- memset(prd, 128, w * h * 3 / 2);
-
- fread(f0, w * h * 3 / 2, 1, f);
-
- while (!feof(f)) {
- unsigned char *ys = f1, *yd = f0, *yp = prd;
- unsigned char *us = f1 + w * h, *ud = f0 + w * h, *up = prd + w * h;
- unsigned char *vs = f1 + w * h * 5 / 4, *vd = f0 + w * h * 5 / 4, *vp = prd + w * h * 5 / 4;
- fread(f1, w * h * 3 / 2, 1, f);
-
- ys += 32 * y_stride;
- yd += 32 * y_stride;
- yp += 32 * y_stride;
- us += 16 * uv_stride;
- ud += 16 * uv_stride;
- up += 16 * uv_stride;
- vs += 16 * uv_stride;
- vd += 16 * uv_stride;
- vp += 16 * uv_stride;
- for (r = 32; r < h - 32; r += 16,
- ys += 16 * w, yd += 16 * w, yp += 16 * w,
- us += 8 * uv_stride, ud += 8 * uv_stride, up += 8 * uv_stride,
- vs += 8 * uv_stride, vd += 8 * uv_stride, vp += 8 * uv_stride) {
- for (c = 32; c < w - 32; c += 16) {
- int mi, mj, ui, uj, wm;
- int bmi, bmj, bui, buj, bwm;
- unsigned char ym[256];
-
- if (vp9_sad16x16_sse3(ys + c, y_stride, yd + c, y_stride, 0xffff) == 0)
- bmi = bmj = bui = buj = bwm = 0;
- else {
- COLOR_SEG_ELEMENT cs[5];
- int j;
- unsigned int beste = 0xfffffff;
- unsigned int bestj = 0;
-
- // try color from last mb segmentation
- cs[0] = last;
-
- // try color segs from 4 pixels in mb recon as segmentation
- cs[1].y = yd[c + y_stride + 1];
- cs[1].u = ud[c / 2 + uv_stride];
- cs[1].v = vd[c / 2 + uv_stride];
- cs[1].yt = cs[1].ut = cs[1].vt = 20;
- cs[2].y = yd[c + w + 14];
- cs[2].u = ud[c / 2 + uv_stride + 7];
- cs[2].v = vd[c / 2 + uv_stride + 7];
- cs[2].yt = cs[2].ut = cs[2].vt = 20;
- cs[3].y = yd[c + w * 14 + 1];
- cs[3].u = ud[c / 2 + uv_stride * 7];
- cs[3].v = vd[c / 2 + uv_stride * 7];
- cs[3].yt = cs[3].ut = cs[3].vt = 20;
- cs[4].y = yd[c + w * 14 + 14];
- cs[4].u = ud[c / 2 + uv_stride * 7 + 7];
- cs[4].v = vd[c / 2 + uv_stride * 7 + 7];
- cs[4].yt = cs[4].ut = cs[4].vt = 20;
-
- for (j = 0; j < 5; j++) {
- int e;
-
- e = fast_masked_motion_search(
- ys + c, us + c / 2, vs + c / 2, y_stride, uv_stride,
- yd + c, ud + c / 2, vd + c / 2, y_stride, uv_stride,
- &cs[j], 1, &mi, &mj, &ui, &uj, &wm);
-
- if (e < beste) {
- bmi = mi;
- bmj = mj;
- bui = ui;
- buj = uj, bwm = wm;
- bestj = j;
- beste = e;
- }
- }
- best = cs[bestj];
- // best = segmentation[0];
- last = best;
- }
- predict_all(yd + c, ud + c / 2, vd + c / 2, w, uv_stride,
- yp + c, up + c / 2, vp + c / 2, w, uv_stride,
- &best, 1, bmi, bmj, bui, buj, bwm);
-
- }
- }
- fwrite(prd, w * h * 3 / 2, 1, g);
- t = f0;
- f0 = f1;
- f1 = t;
-
- }
- fclose(f);
- fclose(g);
-  return 0;
-}
--- a/vp8/common/mbpitch.c
+++ /dev/null
@@ -1,124 +1,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include "blockd.h"
-
-typedef enum {
- PRED = 0,
- DEST = 1
-} BLOCKSET;
-
-static void setup_block
-(
- BLOCKD *b,
- int mv_stride,
- unsigned char **base,
- unsigned char **base2,
- int Stride,
- int offset,
- BLOCKSET bs
-) {
-
- if (bs == DEST) {
- b->dst_stride = Stride;
- b->dst = offset;
- b->base_dst = base;
- } else {
- b->pre_stride = Stride;
- b->pre = offset;
- b->base_pre = base;
- b->base_second_pre = base2;
- }
-
-}
-
-
-static void setup_macroblock(MACROBLOCKD *xd, BLOCKSET bs) {
- int block;
-
- unsigned char **y, **u, **v;
- unsigned char **y2, **u2, **v2;
- BLOCKD *blockd = xd->block;
- int stride;
-
- if (bs == DEST) {
- y = &xd->dst.y_buffer;
- u = &xd->dst.u_buffer;
- v = &xd->dst.v_buffer;
- } else {
- y = &xd->pre.y_buffer;
- u = &xd->pre.u_buffer;
- v = &xd->pre.v_buffer;
-
- y2 = &xd->second_pre.y_buffer;
- u2 = &xd->second_pre.u_buffer;
- v2 = &xd->second_pre.v_buffer;
- }
-
- stride = xd->dst.y_stride;
- for (block = 0; block < 16; block++) { /* y blocks */
- setup_block(&blockd[block], stride, y, y2, stride,
- (block >> 2) * 4 * stride + (block & 3) * 4, bs);
- }
-
- stride = xd->dst.uv_stride;
- for (block = 16; block < 20; block++) { /* U and V blocks */
- setup_block(&blockd[block], stride, u, u2, stride,
- ((block - 16) >> 1) * 4 * stride + (block & 1) * 4, bs);
-
- setup_block(&blockd[block + 4], stride, v, v2, stride,
- ((block - 16) >> 1) * 4 * stride + (block & 1) * 4, bs);
- }
-}
-
-void vp9_setup_block_dptrs(MACROBLOCKD *xd) {
- int r, c;
- BLOCKD *blockd = xd->block;
-
- for (r = 0; r < 4; r++) {
- for (c = 0; c < 4; c++) {
- blockd[r * 4 + c].diff = &xd->diff[r * 4 * 16 + c * 4];
- blockd[r * 4 + c].predictor = xd->predictor + r * 4 * 16 + c * 4;
- }
- }
-
- for (r = 0; r < 2; r++) {
- for (c = 0; c < 2; c++) {
- blockd[16 + r * 2 + c].diff = &xd->diff[256 + r * 4 * 8 + c * 4];
- blockd[16 + r * 2 + c].predictor =
- xd->predictor + 256 + r * 4 * 8 + c * 4;
-
- }
- }
-
- for (r = 0; r < 2; r++) {
- for (c = 0; c < 2; c++) {
- blockd[20 + r * 2 + c].diff = &xd->diff[320 + r * 4 * 8 + c * 4];
- blockd[20 + r * 2 + c].predictor =
- xd->predictor + 320 + r * 4 * 8 + c * 4;
-
- }
- }
-
- blockd[24].diff = &xd->diff[384];
-
- for (r = 0; r < 25; r++) {
- blockd[r].qcoeff = xd->qcoeff + r * 16;
- blockd[r].dqcoeff = xd->dqcoeff + r * 16;
- }
-}
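The pointer wiring above lays the residual out as one contiguous buffer: sixteen 4x4 Y blocks in a 16-wide plane (offsets 0-255), four U blocks starting at 256, four V blocks at 320, and block 24 (the second-order block) at 384, with each of the 25 blocks owning 16 qcoeff/dqcoeff entries. A small standalone sketch of the same offset arithmetic (hypothetical helper, not part of the codec):

#include <stdio.h>

static int diff_offset(int block) {
  if (block < 16)                               /* Y: 4x4 grid of 4x4 blocks, stride 16 */
    return (block >> 2) * 4 * 16 + (block & 3) * 4;
  if (block < 20)                               /* U: 2x2 grid of 4x4 blocks, stride 8  */
    return 256 + ((block - 16) >> 1) * 4 * 8 + ((block - 16) & 1) * 4;
  if (block < 24)                               /* V: same layout as U                  */
    return 320 + ((block - 20) >> 1) * 4 * 8 + ((block - 20) & 1) * 4;
  return 384;                                   /* second-order block                   */
}

int main(void) {
  printf("block 5 -> %d, block 18 -> %d, block 24 -> %d\n",
         diff_offset(5), diff_offset(18), diff_offset(24));
  return 0;
}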
-
-void vp9_build_block_doffsets(MACROBLOCKD *xd) {
-
- /* handle the destination pitch features */
- setup_macroblock(xd, DEST);
- setup_macroblock(xd, PRED);
-}
--- a/vp8/common/modecont.c
+++ /dev/null
@@ -1,64 +1,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include "entropy.h"
-const int vp9_default_mode_contexts[6][4] = {
- {
- /* 0 */
- 7, 1, 1, 183
- },
- {
- /* 1 */
- 14, 18, 14, 147
- },
- {
- /* 2 */
- 135, 64, 57, 68
- },
- {
- /* 3 */
- 60, 56, 128, 65
- },
- {
- /* 4 */
- 159, 134, 128, 34
- },
- {
- /* 5 */
- 234, 188, 128, 28
- },
-};
-const int vp9_default_mode_contexts_a[6][4] = {
- {
- /* 0 */
- 4, 1, 1, 143
- },
- {
- /* 1 */
- 7, 9, 7, 107
- },
- {
- /* 2 */
- 95, 34, 57, 68
- },
- {
- /* 3 */
- 95, 56, 128, 65
- },
- {
- /* 4 */
- 159, 67, 128, 34
- },
- {
- /* 5 */
- 234, 94, 128, 28
- },
-};
--- a/vp8/common/modecont.h
+++ /dev/null
@@ -1,17 +1,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#ifndef __INC_MODECONT_H
-#define __INC_MODECONT_H
-
-extern const int vp9_default_mode_contexts[6][4];
-extern const int vp9_default_mode_contexts_a[6][4];
-#endif
--- a/vp8/common/modecontext.c
+++ /dev/null
@@ -1,145 +1,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include "entropymode.h"
-
-const unsigned int vp9_kf_default_bmode_counts[VP9_BINTRAMODES][VP9_BINTRAMODES][VP9_BINTRAMODES] = {
- {
- /*Above Mode : 0*/
- { 43438, 2195, 470, 316, 615, 171, 217, 412, 124, 160, }, /* left_mode 0 */
- { 5722, 2751, 296, 291, 81, 68, 80, 101, 100, 170, }, /* left_mode 1 */
- { 1629, 201, 307, 25, 47, 16, 34, 72, 19, 28, }, /* left_mode 2 */
- { 332, 266, 36, 500, 20, 65, 23, 14, 154, 106, }, /* left_mode 3 */
- { 450, 97, 10, 24, 117, 10, 2, 12, 8, 71, }, /* left_mode 4 */
- { 384, 49, 29, 44, 12, 162, 51, 5, 87, 42, }, /* left_mode 5 */
- { 495, 53, 157, 27, 14, 57, 180, 17, 17, 34, }, /* left_mode 6 */
- { 695, 64, 62, 9, 27, 5, 3, 147, 10, 26, }, /* left_mode 7 */
- { 230, 54, 20, 124, 16, 125, 29, 12, 283, 37, }, /* left_mode 8 */
- { 260, 87, 21, 120, 32, 16, 33, 16, 33, 203, }, /* left_mode 9 */
- },
- {
- /*Above Mode : 1*/
- { 3934, 2573, 355, 137, 128, 87, 133, 117, 37, 27, }, /* left_mode 0 */
- { 1036, 1929, 278, 135, 27, 37, 48, 55, 41, 91, }, /* left_mode 1 */
- { 223, 256, 253, 15, 13, 9, 28, 64, 3, 3, }, /* left_mode 2 */
- { 120, 129, 17, 316, 15, 11, 9, 4, 53, 74, }, /* left_mode 3 */
- { 129, 58, 6, 11, 38, 2, 0, 5, 2, 67, }, /* left_mode 4 */
- { 53, 22, 11, 16, 8, 26, 14, 3, 19, 12, }, /* left_mode 5 */
- { 59, 26, 61, 11, 4, 9, 35, 13, 8, 8, }, /* left_mode 6 */
- { 101, 52, 40, 8, 5, 2, 8, 59, 2, 20, }, /* left_mode 7 */
- { 48, 34, 10, 52, 8, 15, 6, 6, 63, 20, }, /* left_mode 8 */
- { 96, 48, 22, 63, 11, 14, 5, 8, 9, 96, }, /* left_mode 9 */
- },
- {
- /*Above Mode : 2*/
- { 709, 461, 506, 36, 27, 33, 151, 98, 24, 6, }, /* left_mode 0 */
- { 201, 375, 442, 27, 13, 8, 46, 58, 6, 19, }, /* left_mode 1 */
- { 122, 140, 417, 4, 13, 3, 33, 59, 4, 2, }, /* left_mode 2 */
- { 36, 17, 22, 16, 6, 8, 12, 17, 9, 21, }, /* left_mode 3 */
- { 51, 15, 7, 1, 14, 0, 4, 5, 3, 22, }, /* left_mode 4 */
- { 18, 11, 30, 9, 7, 20, 11, 5, 2, 6, }, /* left_mode 5 */
- { 38, 21, 103, 9, 4, 12, 79, 13, 2, 5, }, /* left_mode 6 */
- { 64, 17, 66, 2, 12, 4, 2, 65, 4, 5, }, /* left_mode 7 */
- { 14, 7, 7, 16, 3, 11, 4, 13, 15, 16, }, /* left_mode 8 */
- { 36, 8, 32, 9, 9, 4, 14, 7, 6, 24, }, /* left_mode 9 */
- },
- {
- /*Above Mode : 3*/
- { 1340, 173, 36, 119, 30, 10, 13, 10, 20, 26, }, /* left_mode 0 */
- { 156, 293, 26, 108, 5, 16, 2, 4, 23, 30, }, /* left_mode 1 */
- { 60, 34, 13, 7, 3, 3, 0, 8, 4, 5, }, /* left_mode 2 */
- { 72, 64, 1, 235, 3, 9, 2, 7, 28, 38, }, /* left_mode 3 */
- { 29, 14, 1, 3, 5, 0, 2, 2, 5, 13, }, /* left_mode 4 */
- { 22, 7, 4, 11, 2, 5, 1, 2, 6, 4, }, /* left_mode 5 */
- { 18, 14, 5, 6, 4, 3, 14, 0, 9, 2, }, /* left_mode 6 */
- { 41, 10, 7, 1, 2, 0, 0, 10, 2, 1, }, /* left_mode 7 */
- { 23, 19, 2, 33, 1, 5, 2, 0, 51, 8, }, /* left_mode 8 */
- { 33, 26, 7, 53, 3, 9, 3, 3, 9, 19, }, /* left_mode 9 */
- },
- {
- /*Above Mode : 4*/
- { 410, 165, 43, 31, 66, 15, 30, 54, 8, 17, }, /* left_mode 0 */
- { 115, 64, 27, 18, 30, 7, 11, 15, 4, 19, }, /* left_mode 1 */
- { 31, 23, 25, 1, 7, 2, 2, 10, 0, 5, }, /* left_mode 2 */
- { 17, 4, 1, 6, 8, 2, 7, 5, 5, 21, }, /* left_mode 3 */
- { 120, 12, 1, 2, 83, 3, 0, 4, 1, 40, }, /* left_mode 4 */
- { 4, 3, 1, 2, 1, 2, 5, 0, 3, 6, }, /* left_mode 5 */
- { 10, 2, 13, 6, 6, 6, 8, 2, 4, 5, }, /* left_mode 6 */
- { 58, 10, 5, 1, 28, 1, 1, 33, 1, 9, }, /* left_mode 7 */
- { 8, 2, 1, 4, 2, 5, 1, 1, 2, 10, }, /* left_mode 8 */
- { 76, 7, 5, 7, 18, 2, 2, 0, 5, 45, }, /* left_mode 9 */
- },
- {
- /*Above Mode : 5*/
- { 444, 46, 47, 20, 14, 110, 60, 14, 60, 7, }, /* left_mode 0 */
- { 59, 57, 25, 18, 3, 17, 21, 6, 14, 6, }, /* left_mode 1 */
- { 24, 17, 20, 6, 4, 13, 7, 2, 3, 2, }, /* left_mode 2 */
- { 13, 11, 5, 14, 4, 9, 2, 4, 15, 7, }, /* left_mode 3 */
- { 8, 5, 2, 1, 4, 0, 1, 1, 2, 12, }, /* left_mode 4 */
- { 19, 5, 5, 7, 4, 40, 6, 3, 10, 4, }, /* left_mode 5 */
- { 16, 5, 9, 1, 1, 16, 26, 2, 10, 4, }, /* left_mode 6 */
- { 11, 4, 8, 1, 1, 4, 4, 5, 4, 1, }, /* left_mode 7 */
- { 15, 1, 3, 7, 3, 21, 7, 1, 34, 5, }, /* left_mode 8 */
- { 18, 5, 1, 3, 4, 3, 7, 1, 2, 9, }, /* left_mode 9 */
- },
- {
- /*Above Mode : 6*/
- { 476, 149, 94, 13, 14, 77, 291, 27, 23, 3, }, /* left_mode 0 */
- { 79, 83, 42, 14, 2, 12, 63, 2, 4, 14, }, /* left_mode 1 */
- { 43, 36, 55, 1, 3, 8, 42, 11, 5, 1, }, /* left_mode 2 */
- { 9, 9, 6, 16, 1, 5, 6, 3, 11, 10, }, /* left_mode 3 */
- { 10, 3, 1, 3, 10, 1, 0, 1, 1, 4, }, /* left_mode 4 */
- { 14, 6, 15, 5, 1, 20, 25, 2, 5, 0, }, /* left_mode 5 */
- { 28, 7, 51, 1, 0, 8, 127, 6, 2, 5, }, /* left_mode 6 */
- { 13, 3, 3, 2, 3, 1, 2, 8, 1, 2, }, /* left_mode 7 */
- { 10, 3, 3, 3, 3, 8, 2, 2, 9, 3, }, /* left_mode 8 */
- { 13, 7, 11, 4, 0, 4, 6, 2, 5, 8, }, /* left_mode 9 */
- },
- {
- /*Above Mode : 7*/
- { 376, 135, 119, 6, 32, 8, 31, 224, 9, 3, }, /* left_mode 0 */
- { 93, 60, 54, 6, 13, 7, 8, 92, 2, 12, }, /* left_mode 1 */
- { 74, 36, 84, 0, 3, 2, 9, 67, 2, 1, }, /* left_mode 2 */
- { 19, 4, 4, 8, 8, 2, 4, 7, 6, 16, }, /* left_mode 3 */
- { 51, 7, 4, 1, 77, 3, 0, 14, 1, 15, }, /* left_mode 4 */
- { 7, 7, 5, 7, 4, 7, 4, 5, 0, 3, }, /* left_mode 5 */
- { 18, 2, 19, 2, 2, 4, 12, 11, 1, 2, }, /* left_mode 6 */
- { 129, 6, 27, 1, 21, 3, 0, 189, 0, 6, }, /* left_mode 7 */
- { 9, 1, 2, 8, 3, 7, 0, 5, 3, 3, }, /* left_mode 8 */
- { 20, 4, 5, 10, 4, 2, 7, 17, 3, 16, }, /* left_mode 9 */
- },
- {
- /*Above Mode : 8*/
- { 617, 68, 34, 79, 11, 27, 25, 14, 75, 13, }, /* left_mode 0 */
- { 51, 82, 21, 26, 6, 12, 13, 1, 26, 16, }, /* left_mode 1 */
- { 29, 9, 12, 11, 3, 7, 1, 10, 2, 2, }, /* left_mode 2 */
- { 17, 19, 11, 74, 4, 3, 2, 0, 58, 13, }, /* left_mode 3 */
- { 10, 1, 1, 3, 4, 1, 0, 2, 1, 8, }, /* left_mode 4 */
- { 14, 4, 5, 5, 1, 13, 2, 0, 27, 8, }, /* left_mode 5 */
- { 10, 3, 5, 4, 1, 7, 6, 4, 5, 1, }, /* left_mode 6 */
- { 10, 2, 6, 2, 1, 1, 1, 4, 2, 1, }, /* left_mode 7 */
- { 14, 8, 5, 23, 2, 12, 6, 2, 117, 5, }, /* left_mode 8 */
- { 9, 6, 2, 19, 1, 6, 3, 2, 9, 9, }, /* left_mode 9 */
- },
- {
- /*Above Mode : 9*/
- { 680, 73, 22, 38, 42, 5, 11, 9, 6, 28, }, /* left_mode 0 */
- { 113, 112, 21, 22, 10, 2, 8, 4, 6, 42, }, /* left_mode 1 */
- { 44, 20, 24, 6, 5, 4, 3, 3, 1, 2, }, /* left_mode 2 */
- { 40, 23, 7, 71, 5, 2, 4, 1, 7, 22, }, /* left_mode 3 */
- { 85, 9, 4, 4, 17, 2, 0, 3, 2, 23, }, /* left_mode 4 */
- { 13, 4, 2, 6, 1, 7, 0, 1, 7, 6, }, /* left_mode 5 */
- { 26, 6, 8, 3, 2, 3, 8, 1, 5, 4, }, /* left_mode 6 */
- { 54, 8, 9, 6, 7, 0, 1, 11, 1, 3, }, /* left_mode 7 */
- { 9, 10, 4, 13, 2, 5, 4, 2, 14, 8, }, /* left_mode 8 */
- { 92, 9, 5, 19, 15, 3, 3, 1, 6, 58, }, /* left_mode 9 */
- },
-};
--- a/vp8/common/mv.h
+++ /dev/null
@@ -1,26 +1,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#ifndef __INC_MV_H
-#define __INC_MV_H
-#include "vpx/vpx_integer.h"
-
-typedef struct {
- short row;
- short col;
-} MV;
-
-typedef union {
- uint32_t as_int;
- MV as_mv;
-} int_mv; /* facilitates faster equality tests and copies */
-
-#endif
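The union gives two views of the same 32 bits, so a motion-vector copy or equality test is a single integer operation instead of two short compares. A minimal sketch, with the types re-declared locally so it compiles on its own:

#include <stdio.h>
#include <stdint.h>

typedef struct { short row, col; } MV;
typedef union { uint32_t as_int; MV as_mv; } int_mv;

int main(void) {
  int_mv a, b;
  a.as_mv.row = -3;
  a.as_mv.col = 7;
  b.as_int = a.as_int;                       /* one-word copy */
  printf("equal: %d (row=%d col=%d)\n",
         a.as_int == b.as_int, b.as_mv.row, b.as_mv.col);
  return 0;
}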
--- a/vp8/common/mvref_common.c
+++ /dev/null
@@ -1,342 +1,0 @@
-/*
- * Copyright (c) 2012 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "mvref_common.h"
-
-#if CONFIG_NEWBESTREFMV
-
-#define MVREF_NEIGHBOURS 8
-static int mv_ref_search[MVREF_NEIGHBOURS][2] =
- { {0,-1},{-1,0},{-1,-1},{0,-2},{-2,0},{-1,-2},{-2,-1},{-2,-2} };
-static int ref_distance_weight[MVREF_NEIGHBOURS] =
- { 3,3,2,1,1,1,1,1 };
-
-// clamp_mv
-#define MV_BORDER (16 << 3) // Allow 16 pels in 1/8th pel units
-static void clamp_mv(const MACROBLOCKD *xd, int_mv *mv) {
-
- if (mv->as_mv.col < (xd->mb_to_left_edge - MV_BORDER))
- mv->as_mv.col = xd->mb_to_left_edge - MV_BORDER;
- else if (mv->as_mv.col > xd->mb_to_right_edge + MV_BORDER)
- mv->as_mv.col = xd->mb_to_right_edge + MV_BORDER;
-
- if (mv->as_mv.row < (xd->mb_to_top_edge - MV_BORDER))
- mv->as_mv.row = xd->mb_to_top_edge - MV_BORDER;
- else if (mv->as_mv.row > xd->mb_to_bottom_edge + MV_BORDER)
- mv->as_mv.row = xd->mb_to_bottom_edge + MV_BORDER;
-}
-
-
-// Gets the best matching candidate reference motion vector
-// from the given mode info structure (if available)
-static int get_candidate_mvref(
- const MODE_INFO *candidate_mi,
- MV_REFERENCE_FRAME ref_frame,
- MV_REFERENCE_FRAME *c_ref_frame,
- int_mv *c_mv,
- MV_REFERENCE_FRAME *c2_ref_frame,
- int_mv *c2_mv
-) {
-
- int ret_val = FALSE;
- c2_mv->as_int = 0;
- *c2_ref_frame = INTRA_FRAME;
-
- // Target ref frame matches candidate first ref frame
- if (ref_frame == candidate_mi->mbmi.ref_frame) {
- c_mv->as_int = candidate_mi->mbmi.mv[0].as_int;
- *c_ref_frame = ref_frame;
- ret_val = TRUE;
-
- // Is there a second non zero vector we can use.
- if ((candidate_mi->mbmi.second_ref_frame != INTRA_FRAME) &&
- (candidate_mi->mbmi.mv[1].as_int != 0) &&
- (candidate_mi->mbmi.mv[1].as_int != c_mv->as_int)) {
- c2_mv->as_int = candidate_mi->mbmi.mv[1].as_int;
- *c2_ref_frame = candidate_mi->mbmi.second_ref_frame;
- }
-
- // Target ref frame matches candidate second ref frame
- } else if (ref_frame == candidate_mi->mbmi.second_ref_frame) {
- c_mv->as_int = candidate_mi->mbmi.mv[1].as_int;
- *c_ref_frame = ref_frame;
- ret_val = TRUE;
-
- // Is there a second non zero vector we can use.
- if ((candidate_mi->mbmi.ref_frame != INTRA_FRAME) &&
- (candidate_mi->mbmi.mv[0].as_int != 0) &&
- (candidate_mi->mbmi.mv[0].as_int != c_mv->as_int)) {
- c2_mv->as_int = candidate_mi->mbmi.mv[0].as_int;
- *c2_ref_frame = candidate_mi->mbmi.ref_frame;
- }
-
- // No ref frame matches so use first ref mv as first choice
- } else if (candidate_mi->mbmi.ref_frame != INTRA_FRAME) {
- c_mv->as_int = candidate_mi->mbmi.mv[0].as_int;
- *c_ref_frame = candidate_mi->mbmi.ref_frame;
- ret_val = TRUE;
-
- // Is there a second non zero vector we can use.
- if ((candidate_mi->mbmi.second_ref_frame != INTRA_FRAME) &&
- (candidate_mi->mbmi.mv[1].as_int != 0) &&
- (candidate_mi->mbmi.mv[1].as_int != c_mv->as_int)) {
- c2_mv->as_int = candidate_mi->mbmi.mv[1].as_int;
- *c2_ref_frame = candidate_mi->mbmi.second_ref_frame;
- }
-
-  // If only the second ref mv is valid. (This should not trigger in the
-  // current code base given the possible compound prediction options.)
- } else if (candidate_mi->mbmi.second_ref_frame != INTRA_FRAME) {
- c_mv->as_int = candidate_mi->mbmi.mv[1].as_int;
- *c_ref_frame = candidate_mi->mbmi.second_ref_frame;
- ret_val = TRUE;
- }
-
- return ret_val;
-}
-
-// Performs mv adjustment based on reference frame and clamps the MV
-// if it goes off the edge of the buffer.
-static void scale_mv(
- MACROBLOCKD *xd,
- MV_REFERENCE_FRAME this_ref_frame,
- MV_REFERENCE_FRAME candidate_ref_frame,
- int_mv *candidate_mv,
- int *ref_sign_bias
-) {
-
- if (candidate_ref_frame != this_ref_frame) {
-
- //int frame_distances[MAX_REF_FRAMES];
- //int last_distance = 1;
- //int gf_distance = xd->frames_since_golden;
- //int arf_distance = xd->frames_till_alt_ref_frame;
-
- // Sign inversion where appropriate.
- if (ref_sign_bias[candidate_ref_frame] != ref_sign_bias[this_ref_frame]) {
- candidate_mv->as_mv.row = -candidate_mv->as_mv.row;
- candidate_mv->as_mv.col = -candidate_mv->as_mv.col;
- }
-
- // Scale based on frame distance if the reference frames not the same.
- /*frame_distances[INTRA_FRAME] = 1; // should never be used
- frame_distances[LAST_FRAME] = 1;
- frame_distances[GOLDEN_FRAME] =
- (xd->frames_since_golden) ? xd->frames_since_golden : 1;
- frame_distances[ALTREF_FRAME] =
- (xd->frames_till_alt_ref_frame) ? xd->frames_till_alt_ref_frame : 1;
-
- if (frame_distances[this_ref_frame] &&
- frame_distances[candidate_ref_frame]) {
- candidate_mv->as_mv.row =
- (short)(((int)(candidate_mv->as_mv.row) *
- frame_distances[this_ref_frame]) /
- frame_distances[candidate_ref_frame]);
-
- candidate_mv->as_mv.col =
- (short)(((int)(candidate_mv->as_mv.col) *
- frame_distances[this_ref_frame]) /
- frame_distances[candidate_ref_frame]);
- }
- */
- }
-
- // Clamp the MV so it does not point out of the frame buffer
- clamp_mv(xd, candidate_mv);
-}
-
-// Adds a new candidate reference vector to the list if indeed it is new.
-// If it is not new then the score of the existing candidate that it matches
-// is increased and the list is resorted.
-static void addmv_and_shuffle(
- int_mv *mv_list,
- int *mv_scores,
- int *index,
- int_mv candidate_mv,
- int weight
-) {
-
- int i = *index;
- int duplicate_found = FALSE;
-
- // Check for duplicates. If there is one increment its score.
- // Duplicate defined as being the same full pel vector with rounding.
- while (i > 0) {
- i--;
-
- if (candidate_mv.as_int == mv_list[i].as_int) {
- duplicate_found = TRUE;
- mv_scores[i] += weight;
- break;
- }
- }
-
- // If no duplicate was found add the new vector and give it a weight
- if (!duplicate_found) {
- mv_list[*index].as_int = candidate_mv.as_int;
- mv_scores[*index] = weight;
- i = *index;
- (*index)++;
- }
-
- // Reshuffle the list so that highest scoring mvs at the top.
- while (i > 0) {
- if (mv_scores[i] > mv_scores[i-1]) {
- int tmp_score = mv_scores[i-1];
- int_mv tmp_mv = mv_list[i-1];
-
- mv_scores[i-1] = mv_scores[i];
- mv_list[i-1] = mv_list[i];
- mv_scores[i] = tmp_score;
- mv_list[i] = tmp_mv;
- i--;
- } else
- break;
- }
-}
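The function is effectively one insertion-sort step: bump or append, then bubble the touched entry toward the front until scores are non-increasing again, so the candidate list stays sorted after every call. A standalone sketch of just the bubble-up step (illustrative names, not codec code):

#include <stdio.h>

static void bubble_up(int *scores, int *vals, int i) {
  while (i > 0 && scores[i] > scores[i - 1]) {
    int ts = scores[i - 1], tv = vals[i - 1];
    scores[i - 1] = scores[i]; vals[i - 1] = vals[i];
    scores[i] = ts;            vals[i] = tv;
    i--;
  }
}

int main(void) {
  int scores[4] = { 9, 5, 3, 8 };   /* last entry just had its score bumped */
  int vals[4]   = { 0, 1, 2, 3 };
  bubble_up(scores, vals, 3);
  printf("%d %d %d %d\n", vals[0], vals[1], vals[2], vals[3]);  /* 0 3 1 2 */
  return 0;
}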
-
-// This function searches the neighbourhood of a given MB/SB and populates a
-// list of candidate reference vectors.
-//
-void vp9_find_mv_refs(
- MACROBLOCKD *xd,
- MODE_INFO *here,
- MODE_INFO *lf_here,
- MV_REFERENCE_FRAME ref_frame,
- int_mv *mv_ref_list,
- int *ref_sign_bias
-) {
-
- int i;
- MODE_INFO *candidate_mi;
- int_mv candidate_mvs[MAX_MV_REFS];
- int_mv c_refmv;
- MV_REFERENCE_FRAME c_ref_frame;
- int_mv c2_refmv;
- MV_REFERENCE_FRAME c2_ref_frame;
- int candidate_scores[MAX_MV_REFS];
- int index = 0;
- int ref_weight = 0;
- int valid_mv_ref;
-
- // Blank the reference vector lists and other local structures.
- vpx_memset(mv_ref_list, 0, sizeof(int_mv) * MAX_MV_REFS);
- vpx_memset(candidate_mvs, 0, sizeof(int_mv) * MAX_MV_REFS);
- vpx_memset(candidate_scores, 0, sizeof(candidate_scores));
-
- // Populate a list with candidate reference vectors from the
- // spatial neighbours.
- for (i = 0; i < 2; ++i) {
- if (((mv_ref_search[i][0] << 7) >= xd->mb_to_left_edge) &&
- ((mv_ref_search[i][1] << 7) >= xd->mb_to_top_edge)) {
-
- candidate_mi = here + mv_ref_search[i][0] +
- (mv_ref_search[i][1] * xd->mode_info_stride);
-
- valid_mv_ref = get_candidate_mvref(candidate_mi, ref_frame,
- &c_ref_frame, &c_refmv,
- &c2_ref_frame, &c2_refmv);
-
- // If there is a valid MV candidate then add it to the list
- if (valid_mv_ref) {
- scale_mv(xd, ref_frame, c_ref_frame, &c_refmv, ref_sign_bias );
- ref_weight = ref_distance_weight[i] +
- ((c_ref_frame == ref_frame) << 4);
-
- addmv_and_shuffle(candidate_mvs, candidate_scores,
- &index, c_refmv, ref_weight);
-
- // If there is a second valid mv then add it as well.
- if (c2_ref_frame != INTRA_FRAME) {
- scale_mv(xd, ref_frame, c2_ref_frame, &c2_refmv, ref_sign_bias );
- ref_weight = ref_distance_weight[i] +
- ((c2_ref_frame == ref_frame) << 4);
-
- addmv_and_shuffle(candidate_mvs, candidate_scores,
- &index, c2_refmv, ref_weight);
- }
- }
- }
- }
-
- // Look at the corresponding vector in the last frame
- candidate_mi = lf_here;
- valid_mv_ref = get_candidate_mvref(candidate_mi, ref_frame,
- &c_ref_frame, &c_refmv,
- &c2_ref_frame, &c2_refmv);
-
- // If there is a valid MV candidate then add it to the list
- if (valid_mv_ref) {
- scale_mv(xd, ref_frame, c_ref_frame, &c_refmv, ref_sign_bias );
- ref_weight = 2 + ((c_ref_frame == ref_frame) << 4);
- addmv_and_shuffle(candidate_mvs, candidate_scores,
- &index, c_refmv, ref_weight);
-
- // If there is a second valid mv then add it as well.
- if (c2_ref_frame != INTRA_FRAME) {
- scale_mv(xd, ref_frame, c2_ref_frame, &c2_refmv, ref_sign_bias );
-      ref_weight = 2 +
- ((c2_ref_frame == ref_frame) << 4);
-
- addmv_and_shuffle(candidate_mvs, candidate_scores,
- &index, c2_refmv, ref_weight);
- }
- }
-
- // Populate a list with candidate reference vectors from the
- // spatial neighbours.
- for (i = 2; i < MVREF_NEIGHBOURS; ++i) {
- if (((mv_ref_search[i][0] << 7) >= xd->mb_to_left_edge) &&
- ((mv_ref_search[i][1] << 7) >= xd->mb_to_top_edge)) {
-
- candidate_mi = here + mv_ref_search[i][0] +
- (mv_ref_search[i][1] * xd->mode_info_stride);
-
- valid_mv_ref = get_candidate_mvref(candidate_mi, ref_frame,
- &c_ref_frame, &c_refmv,
- &c2_ref_frame, &c2_refmv);
-
- // If there is a valid MV candidate then add it to the list
- if (valid_mv_ref) {
- scale_mv(xd, ref_frame, c_ref_frame, &c_refmv, ref_sign_bias );
- ref_weight = ref_distance_weight[i] +
- ((c_ref_frame == ref_frame) << 4);
-
- addmv_and_shuffle(candidate_mvs, candidate_scores,
- &index, c_refmv, ref_weight);
-
- // If there is a second valid mv then add it as well.
- if (c2_ref_frame != INTRA_FRAME) {
- scale_mv(xd, ref_frame, c2_ref_frame, &c2_refmv, ref_sign_bias );
- ref_weight = ref_distance_weight[i] +
- ((c2_ref_frame == ref_frame) << 4);
-
- addmv_and_shuffle(candidate_mvs, candidate_scores,
- &index, c2_refmv, ref_weight);
- }
- }
- }
- }
-
- // 0,0 is always a valid reference.
- for (i = 0; i < index; ++i)
- if (candidate_mvs[i].as_int == 0)
- break;
- if (i == index) {
- c_refmv.as_int = 0;
- addmv_and_shuffle(candidate_mvs, candidate_scores,
- &index, c_refmv, candidate_scores[3]+1 );
- }
-
- // Copy over the candidate list.
- vpx_memcpy(mv_ref_list, candidate_mvs, sizeof(candidate_mvs));
-}
-
-#endif
--- a/vp8/common/mvref_common.h
+++ /dev/null
@@ -1,31 +1,0 @@
-/*
- * Copyright (c) 2012 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "onyxc_int.h"
-#include "blockd.h"
-
-// MV reference common header file.
-#if CONFIG_NEWBESTREFMV
-
-#ifndef __INC_MVREF_COMMON_H
-#define __INC_MVREF_COMMON_H
-
-void vp9_find_mv_refs(
- MACROBLOCKD *xd,
- MODE_INFO *here,
- MODE_INFO *lf_here,
- MV_REFERENCE_FRAME ref_frame,
- int_mv * mv_ref_list,
- int *ref_sign_bias
-);
-
-#endif
-
-#endif
--- a/vp8/common/onyx.h
+++ /dev/null
@@ -1,225 +1,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#ifndef __INC_ONYX_H
-#define __INC_ONYX_H
-
-#ifdef __cplusplus
-extern "C"
-{
-#endif
-
-#include "vpx/internal/vpx_codec_internal.h"
-#include "vpx/vp8cx.h"
-#include "vpx_scale/yv12config.h"
-#include "type_aliases.h"
-#include "ppflags.h"
- typedef int *VP9_PTR;
-
- /* Create/destroy static data structures. */
-
- typedef enum {
- NORMAL = 0,
- FOURFIVE = 1,
- THREEFIVE = 2,
- ONETWO = 3
-
- } VPX_SCALING;
-
- typedef enum {
- VP9_LAST_FLAG = 1,
- VP9_GOLD_FLAG = 2,
- VP9_ALT_FLAG = 4
- } VP9_REFFRAME;
-
-
- typedef enum {
- USAGE_STREAM_FROM_SERVER = 0x0,
- USAGE_LOCAL_FILE_PLAYBACK = 0x1,
- USAGE_CONSTRAINED_QUALITY = 0x2
- } END_USAGE;
-
-
- typedef enum {
- MODE_GOODQUALITY = 0x1,
- MODE_BESTQUALITY = 0x2,
- MODE_FIRSTPASS = 0x3,
- MODE_SECONDPASS = 0x4,
- MODE_SECONDPASS_BEST = 0x5,
- } MODE;
-
- typedef enum {
- FRAMEFLAGS_KEY = 1,
- FRAMEFLAGS_GOLDEN = 2,
- FRAMEFLAGS_ALTREF = 4,
- } FRAMETYPE_FLAGS;
-
-
-#include <assert.h>
- static __inline void Scale2Ratio(int mode, int *hr, int *hs) {
- switch (mode) {
- case NORMAL:
- *hr = 1;
- *hs = 1;
- break;
- case FOURFIVE:
- *hr = 4;
- *hs = 5;
- break;
- case THREEFIVE:
- *hr = 3;
- *hs = 5;
- break;
- case ONETWO:
- *hr = 1;
- *hs = 2;
- break;
- default:
- *hr = 1;
- *hs = 1;
- assert(0);
- break;
- }
- }
-
- typedef struct {
-    int Version;            // 4 versions of the bitstream are defined: 0 = best quality/slowest decode, 3 = lowest quality/fastest decode
- int Width; // width of data passed to the compressor
- int Height; // height of data passed to the compressor
- double frame_rate; // set to passed in framerate
- int target_bandwidth; // bandwidth to be used in kilobits per second
-
-    int noise_sensitivity;  // parameter used for applying pre-processing blur: recommendation 0
-    int Sharpness;          // parameter used for sharpening output: recommendation 0
- int cpu_used;
- unsigned int rc_max_intra_bitrate_pct;
-
- // mode ->
-    // (0)=Realtime/Live Encoding. This mode is optimized for realtime encoding (for example, capturing
- // a television signal or feed from a live camera). ( speed setting controls how fast )
- // (1)=Good Quality Fast Encoding. The encoder balances quality with the amount of time it takes to
- // encode the output. ( speed setting controls how fast )
- // (2)=One Pass - Best Quality. The encoder places priority on the quality of the output over encoding
- // speed. The output is compressed at the highest possible quality. This option takes the longest
- // amount of time to encode. ( speed setting ignored )
- // (3)=Two Pass - First Pass. The encoder generates a file of statistics for use in the second encoding
- // pass. ( speed setting controls how fast )
- // (4)=Two Pass - Second Pass. The encoder uses the statistics that were generated in the first encoding
- // pass to create the compressed output. ( speed setting controls how fast )
- // (5)=Two Pass - Second Pass Best. The encoder uses the statistics that were generated in the first
- // encoding pass to create the compressed output using the highest possible quality, and taking a
-    //          longer amount of time to encode. ( speed setting ignored )
- int Mode; //
-
- // Key Framing Operations
- int auto_key; // automatically detect cut scenes and set the keyframes
- int key_freq; // maximum distance to key frame.
-
-    int allow_lag;              // allow lagged compression (if 0, lag_in_frames is ignored)
- int lag_in_frames; // how many frames lag before we start encoding
-
- // ----------------------------------------------------------------
- // DATARATE CONTROL OPTIONS
-
- int end_usage; // vbr or cbr
-
- // buffer targeting aggressiveness
- int under_shoot_pct;
- int over_shoot_pct;
-
- // buffering parameters
- int starting_buffer_level; // in seconds
- int optimal_buffer_level;
- int maximum_buffer_size;
-
- // controlling quality
- int fixed_q;
- int worst_allowed_q;
- int best_allowed_q;
- int cq_level;
- int lossless;
-
- // two pass datarate control
- int two_pass_vbrbias; // two pass datarate control tweaks
- int two_pass_vbrmin_section;
- int two_pass_vbrmax_section;
- // END DATARATE CONTROL OPTIONS
- // ----------------------------------------------------------------
-
-
-    // these parameters aren't to be used in the final build; don't use!!!
- int play_alternate;
- int alt_freq;
-
- int encode_breakout; // early breakout encode threshold : for video conf recommend 800
-
- int arnr_max_frames;
- int arnr_strength;
- int arnr_type;
-
- struct vpx_fixed_buf two_pass_stats_in;
- struct vpx_codec_pkt_list *output_pkt_list;
-
- vp8e_tuning tuning;
- } VP9_CONFIG;
-
-
- void vp9_initialize_enc();
-
- VP9_PTR vp9_create_compressor(VP9_CONFIG *oxcf);
- void vp9_remove_compressor(VP9_PTR *comp);
-
- void vp9_change_config(VP9_PTR onyx, VP9_CONFIG *oxcf);
-
-// Receive a frame's worth of data. The caller can assume that a copy of this
-// frame is made and not just a copy of the pointer.
- int vp9_receive_raw_frame(VP9_PTR comp, unsigned int frame_flags,
- YV12_BUFFER_CONFIG *sd, int64_t time_stamp,
- int64_t end_time_stamp);
-
- int vp9_get_compressed_data(VP9_PTR comp, unsigned int *frame_flags,
- unsigned long *size, unsigned char *dest,
- int64_t *time_stamp, int64_t *time_end,
- int flush);
-
- int vp9_get_preview_raw_frame(VP9_PTR comp, YV12_BUFFER_CONFIG *dest,
- vp9_ppflags_t *flags);
-
- int vp9_use_as_reference(VP9_PTR comp, int ref_frame_flags);
-
- int vp9_update_reference(VP9_PTR comp, int ref_frame_flags);
-
- int vp9_get_reference_enc(VP9_PTR comp, VP9_REFFRAME ref_frame_flag,
- YV12_BUFFER_CONFIG *sd);
-
- int vp9_set_reference_enc(VP9_PTR comp, VP9_REFFRAME ref_frame_flag,
- YV12_BUFFER_CONFIG *sd);
-
- int vp9_update_entropy(VP9_PTR comp, int update);
-
- int vp9_set_roimap(VP9_PTR comp, unsigned char *map,
- unsigned int rows, unsigned int cols,
- int delta_q[4], int delta_lf[4],
- unsigned int threshold[4]);
-
- int vp9_set_active_map(VP9_PTR comp, unsigned char *map,
- unsigned int rows, unsigned int cols);
-
- int vp9_set_internal_size(VP9_PTR comp,
- VPX_SCALING horiz_mode, VPX_SCALING vert_mode);
-
- int vp9_get_quantizer(VP9_PTR c);
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif // __INC_ONYX_H
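(For orientation, the declarations above form the complete compressor interface that the vpx codec wrapper drives. A hedged sketch of a minimal caller follows; the configuration values are placeholders, error handling is omitted, and the return-value conventions are assumptions rather than documentation:)

/* Illustrative only: a minimal caller of the encoder interface above. */
VP9_CONFIG oxcf = { 0 };
oxcf.Width = 640;                     /* placeholder values */
oxcf.Height = 480;
oxcf.frame_rate = 30.0;
oxcf.target_bandwidth = 500;          /* kilobits per second */
oxcf.Mode = MODE_GOODQUALITY;

vp9_initialize_enc();
VP9_PTR enc = vp9_create_compressor(&oxcf);

unsigned int flags;
unsigned long size;
int64_t pts, pts_end;
/* 'raw' is a YV12_BUFFER_CONFIG holding one input frame, 'buf' an output buffer,
 * 'ts'/'ts_end' the frame's timestamps. */
vp9_receive_raw_frame(enc, 0, raw, ts, ts_end);
vp9_get_compressed_data(enc, &flags, &size, buf, &pts, &pts_end, 0);

vp9_remove_compressor(&enc);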
--- a/vp8/common/onyxc_int.h
+++ /dev/null
@@ -1,314 +1,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#ifndef __INC_ONYXC_INT_H
-#define __INC_ONYXC_INT_H
-
-#include "vpx_config.h"
-#include "vpx/internal/vpx_codec_internal.h"
-#include "vpx_rtcd.h"
-#include "loopfilter.h"
-#include "entropymv.h"
-#include "entropy.h"
-#include "entropymode.h"
-#include "idct.h"
-#if CONFIG_POSTPROC
-#include "postproc.h"
-#endif
-
-/*#ifdef PACKET_TESTING*/
-#include "header.h"
-/*#endif*/
-
-/* Create/destroy static data structures. */
-
-void vp9_initialize_common(void);
-
-#define MINQ 0
-
-#define MAXQ 255
-#define QINDEX_BITS 8
-
-#define QINDEX_RANGE (MAXQ + 1)
-
-#define NUM_YV12_BUFFERS 4
-
-#define COMP_PRED_CONTEXTS 2
-
-typedef struct frame_contexts {
- vp9_prob bmode_prob [VP9_BINTRAMODES - 1];
- vp9_prob ymode_prob [VP9_YMODES - 1]; /* interframe intra mode probs */
- vp9_prob uv_mode_prob [VP9_YMODES][VP9_UV_MODES - 1];
- vp9_prob i8x8_mode_prob [VP9_I8X8_MODES - 1];
- vp9_prob sub_mv_ref_prob [SUBMVREF_COUNT][VP9_SUBMVREFS - 1];
- vp9_prob mbsplit_prob [VP9_NUMMBSPLITS - 1];
- vp9_prob coef_probs [BLOCK_TYPES] [COEF_BANDS] [PREV_COEF_CONTEXTS] [ENTROPY_NODES];
- vp9_prob hybrid_coef_probs [BLOCK_TYPES] [COEF_BANDS] [PREV_COEF_CONTEXTS] [ENTROPY_NODES];
- vp9_prob coef_probs_8x8 [BLOCK_TYPES_8X8] [COEF_BANDS] [PREV_COEF_CONTEXTS] [ENTROPY_NODES];
- vp9_prob hybrid_coef_probs_8x8 [BLOCK_TYPES_8X8] [COEF_BANDS] [PREV_COEF_CONTEXTS] [ENTROPY_NODES];
- vp9_prob coef_probs_16x16 [BLOCK_TYPES_16X16] [COEF_BANDS] [PREV_COEF_CONTEXTS] [ENTROPY_NODES];
- vp9_prob hybrid_coef_probs_16x16 [BLOCK_TYPES_16X16] [COEF_BANDS] [PREV_COEF_CONTEXTS] [ENTROPY_NODES];
-
- nmv_context nmvc;
- nmv_context pre_nmvc;
- vp9_prob pre_bmode_prob [VP9_BINTRAMODES - 1];
- vp9_prob pre_ymode_prob [VP9_YMODES - 1]; /* interframe intra mode probs */
- vp9_prob pre_uv_mode_prob [VP9_YMODES][VP9_UV_MODES - 1];
- vp9_prob pre_i8x8_mode_prob [VP9_I8X8_MODES - 1];
- vp9_prob pre_sub_mv_ref_prob [SUBMVREF_COUNT][VP9_SUBMVREFS - 1];
- vp9_prob pre_mbsplit_prob [VP9_NUMMBSPLITS - 1];
- unsigned int bmode_counts [VP9_BINTRAMODES];
-  unsigned int ymode_counts [VP9_YMODES];   /* interframe intra mode counts */
- unsigned int uv_mode_counts [VP9_YMODES][VP9_UV_MODES];
-  unsigned int i8x8_mode_counts  [VP9_I8X8_MODES];  /* interframe intra mode counts */
- unsigned int sub_mv_ref_counts [SUBMVREF_COUNT][VP9_SUBMVREFS];
- unsigned int mbsplit_counts [VP9_NUMMBSPLITS];
-
- vp9_prob pre_coef_probs [BLOCK_TYPES] [COEF_BANDS]
- [PREV_COEF_CONTEXTS] [ENTROPY_NODES];
- vp9_prob pre_hybrid_coef_probs [BLOCK_TYPES] [COEF_BANDS]
- [PREV_COEF_CONTEXTS] [ENTROPY_NODES];
-
- vp9_prob pre_coef_probs_8x8 [BLOCK_TYPES_8X8] [COEF_BANDS]
- [PREV_COEF_CONTEXTS] [ENTROPY_NODES];
- vp9_prob pre_hybrid_coef_probs_8x8 [BLOCK_TYPES_8X8] [COEF_BANDS]
- [PREV_COEF_CONTEXTS] [ENTROPY_NODES];
-
- vp9_prob pre_coef_probs_16x16 [BLOCK_TYPES_16X16] [COEF_BANDS]
- [PREV_COEF_CONTEXTS] [ENTROPY_NODES];
- vp9_prob pre_hybrid_coef_probs_16x16 [BLOCK_TYPES_16X16] [COEF_BANDS]
- [PREV_COEF_CONTEXTS] [ENTROPY_NODES];
-
- unsigned int coef_counts [BLOCK_TYPES] [COEF_BANDS]
- [PREV_COEF_CONTEXTS] [MAX_ENTROPY_TOKENS];
- unsigned int hybrid_coef_counts [BLOCK_TYPES] [COEF_BANDS]
- [PREV_COEF_CONTEXTS] [MAX_ENTROPY_TOKENS];
-
- unsigned int coef_counts_8x8 [BLOCK_TYPES_8X8] [COEF_BANDS]
- [PREV_COEF_CONTEXTS] [MAX_ENTROPY_TOKENS];
- unsigned int hybrid_coef_counts_8x8 [BLOCK_TYPES_8X8] [COEF_BANDS]
- [PREV_COEF_CONTEXTS] [MAX_ENTROPY_TOKENS];
-
- unsigned int coef_counts_16x16 [BLOCK_TYPES_16X16] [COEF_BANDS]
- [PREV_COEF_CONTEXTS] [MAX_ENTROPY_TOKENS];
- unsigned int hybrid_coef_counts_16x16 [BLOCK_TYPES_16X16] [COEF_BANDS]
- [PREV_COEF_CONTEXTS] [MAX_ENTROPY_TOKENS];
-
- nmv_context_counts NMVcount;
- vp9_prob switchable_interp_prob[VP9_SWITCHABLE_FILTERS + 1]
- [VP9_SWITCHABLE_FILTERS - 1];
-
- int mode_context[6][4];
- int mode_context_a[6][4];
- int vp8_mode_contexts[6][4];
- int mv_ref_ct[6][4][2];
- int mv_ref_ct_a[6][4][2];
-} FRAME_CONTEXT;
-
-typedef enum {
- RECON_CLAMP_REQUIRED = 0,
- RECON_CLAMP_NOTREQUIRED = 1
-} CLAMP_TYPE;
-
-typedef enum {
- SINGLE_PREDICTION_ONLY = 0,
- COMP_PREDICTION_ONLY = 1,
- HYBRID_PREDICTION = 2,
- NB_PREDICTION_TYPES = 3,
-} COMPPREDMODE_TYPE;
-
-typedef enum {
- ONLY_4X4 = 0,
- ALLOW_8X8 = 1,
- ALLOW_16X16 = 2,
- TX_MODE_SELECT = 3,
- NB_TXFM_MODES = 4,
-} TXFM_MODE;
-
-typedef struct VP9_COMMON_RTCD {
-#if CONFIG_RUNTIME_CPU_DETECT
- vp9_idct_rtcd_vtable_t idct;
- vp9_subpix_rtcd_vtable_t subpix;
-#if CONFIG_POSTPROC
- vp9_postproc_rtcd_vtable_t postproc;
-#endif
- int flags;
-#else
- int unused;
-#endif
-} VP9_COMMON_RTCD;
-
-typedef struct VP9Common {
- struct vpx_internal_error_info error;
-
- DECLARE_ALIGNED(16, short, Y1dequant[QINDEX_RANGE][16]);
- DECLARE_ALIGNED(16, short, Y2dequant[QINDEX_RANGE][16]);
- DECLARE_ALIGNED(16, short, UVdequant[QINDEX_RANGE][16]);
-
- int Width;
- int Height;
- int horiz_scale;
- int vert_scale;
-
- YUV_TYPE clr_type;
- CLAMP_TYPE clamp_type;
-
- YV12_BUFFER_CONFIG *frame_to_show;
-
- YV12_BUFFER_CONFIG yv12_fb[NUM_YV12_BUFFERS];
- int fb_idx_ref_cnt[NUM_YV12_BUFFERS];
- int new_fb_idx, lst_fb_idx, gld_fb_idx, alt_fb_idx;
-
- YV12_BUFFER_CONFIG post_proc_buffer;
- YV12_BUFFER_CONFIG temp_scale_frame;
-
-
- FRAME_TYPE last_frame_type; /* Save last frame's frame type for motion search. */
- FRAME_TYPE frame_type;
-
- int show_frame;
-
- int frame_flags;
- int MBs;
- int mb_rows;
- int mb_cols;
- int mode_info_stride;
-
- /* profile settings */
- int experimental;
- int mb_no_coeff_skip;
- TXFM_MODE txfm_mode;
- COMPPREDMODE_TYPE comp_pred_mode;
- int no_lpf;
- int use_bilinear_mc_filter;
- int full_pixel;
-
- int base_qindex;
- int last_kf_gf_q; /* Q used on the last GF or KF */
-
- int y1dc_delta_q;
- int y2dc_delta_q;
- int y2ac_delta_q;
- int uvdc_delta_q;
- int uvac_delta_q;
-
- unsigned int frames_since_golden;
- unsigned int frames_till_alt_ref_frame;
-
- /* We allocate a MODE_INFO struct for each macroblock, together with
- an extra row on top and column on the left to simplify prediction. */
-
- MODE_INFO *mip; /* Base of allocated array */
- MODE_INFO *mi; /* Corresponds to upper left visible macroblock */
- MODE_INFO *prev_mip; /* MODE_INFO array 'mip' from last decoded frame */
- MODE_INFO *prev_mi; /* 'mi' from last frame (points into prev_mip) */
-
-
- // Persistent mb segment id map used in prediction.
- unsigned char *last_frame_seg_map;
-
- INTERPOLATIONFILTERTYPE mcomp_filter_type;
- LOOPFILTERTYPE filter_type;
-
- loop_filter_info_n lf_info;
-
- int filter_level;
- int last_sharpness_level;
- int sharpness_level;
-
- int refresh_last_frame; /* Two state 0 = NO, 1 = YES */
- int refresh_golden_frame; /* Two state 0 = NO, 1 = YES */
- int refresh_alt_ref_frame; /* Two state 0 = NO, 1 = YES */
-
- int copy_buffer_to_gf; /* 0 none, 1 Last to GF, 2 ARF to GF */
- int copy_buffer_to_arf; /* 0 none, 1 Last to ARF, 2 GF to ARF */
-
- int refresh_entropy_probs; /* Two state 0 = NO, 1 = YES */
-
- int ref_frame_sign_bias[MAX_REF_FRAMES]; /* Two state 0, 1 */
-
- /* Y,U,V,Y2 */
- ENTROPY_CONTEXT_PLANES *above_context; /* row of context for each plane */
- ENTROPY_CONTEXT_PLANES left_context[2]; /* (up to) 4 contexts "" */
-
- /* keyframe block modes are predicted by their above, left neighbors */
-
- vp9_prob kf_bmode_prob [VP9_BINTRAMODES] [VP9_BINTRAMODES] [VP9_BINTRAMODES - 1];
- vp9_prob kf_ymode_prob[8][VP9_YMODES - 1]; /* keyframe "" */
-#if CONFIG_SUPERBLOCKS
- vp9_prob sb_kf_ymode_prob[8][VP9_I32X32_MODES - 1];
-#endif
- int kf_ymode_probs_index;
- int kf_ymode_probs_update;
- vp9_prob kf_uv_mode_prob[VP9_YMODES] [VP9_UV_MODES - 1];
-
- vp9_prob prob_intra_coded;
- vp9_prob prob_last_coded;
- vp9_prob prob_gf_coded;
-#if CONFIG_SUPERBLOCKS
- vp9_prob sb_coded;
-#endif
-
- // Context probabilities when using predictive coding of segment id
- vp9_prob segment_pred_probs[PREDICTION_PROBS];
- unsigned char temporal_update;
-
- // Context probabilities for reference frame prediction
- unsigned char ref_scores[MAX_REF_FRAMES];
- vp9_prob ref_pred_probs[PREDICTION_PROBS];
- vp9_prob mod_refprobs[MAX_REF_FRAMES][PREDICTION_PROBS];
-
- vp9_prob prob_comppred[COMP_PRED_CONTEXTS];
-
- // FIXME contextualize
- vp9_prob prob_tx[TX_SIZE_MAX - 1];
-
- vp9_prob mbskip_pred_probs[MBSKIP_CONTEXTS];
-
- FRAME_CONTEXT lfc_a; /* last alt ref entropy */
- FRAME_CONTEXT lfc; /* last frame entropy */
- FRAME_CONTEXT fc; /* this frame entropy */
-
- // int mv_ref_ct[6][4][2];
- // int mv_ref_ct_a[6][4][2];
- // int mode_context[6][4];
- // int mode_context_a[6][4];
- // int vp8_mode_contexts[6][4];
-
- unsigned int current_video_frame;
- int near_boffset[3];
- int version;
-
-#ifdef PACKET_TESTING
- VP9_HEADER oh;
-#endif
- double bitrate;
- double framerate;
-
-#if CONFIG_RUNTIME_CPU_DETECT
- VP9_COMMON_RTCD rtcd;
-#endif
-
-#if CONFIG_POSTPROC
- struct postproc_state postproc_state;
-#endif
-
-#if CONFIG_PRED_FILTER
- /* Prediction filter variables */
- int pred_filter_mode; // 0=disabled at the frame level (no MB filtered)
- // 1=enabled at the frame level (all MB filtered)
- // 2=specified per MB (1=filtered, 0=non-filtered)
- vp9_prob prob_pred_filter_off;
-#endif
-
-} VP9_COMMON;
-
-#endif  // __INC_ONYXC_INT_H
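(The yv12_fb[] / fb_idx_ref_cnt[] pair above acts as a small reference-counted frame-buffer pool shared between the new/last/golden/alt-ref indices. A hedged sketch of how a free slot would be picked from such a pool; the helper is illustrative, not code from this tree:)

/* Illustrative only: pick an unreferenced buffer from a ref-counted pool. */
static int find_free_fb(const int ref_cnt[NUM_YV12_BUFFERS]) {
  int i;
  for (i = 0; i < NUM_YV12_BUFFERS; ++i)
    if (ref_cnt[i] == 0)
      return i;            /* this slot could become cm->new_fb_idx */
  return -1;               /* no free buffer; caller must handle this */
}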
--- a/vp8/common/onyxd.h
+++ /dev/null
@@ -1,68 +1,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#ifndef __INC_ONYXD_H
-#define __INC_ONYXD_H
-
-
-/* Create/destroy static data structures. */
-#ifdef __cplusplus
-extern "C"
-{
-#endif
-#include "type_aliases.h"
-#include "vpx_scale/yv12config.h"
-#include "ppflags.h"
-#include "vpx_ports/mem.h"
-#include "vpx/vpx_codec.h"
-
- typedef void *VP9D_PTR;
- typedef struct {
- int Width;
- int Height;
- int Version;
- int postprocess;
- int max_threads;
- int input_partition;
- } VP9D_CONFIG;
- typedef enum {
- VP9_LAST_FLAG = 1,
- VP9_GOLD_FLAG = 2,
- VP9_ALT_FLAG = 4
- } VP9_REFFRAME;
-
- void vp9_initialize_dec(void);
-
- int vp9_receive_compressed_data(VP9D_PTR comp, unsigned long size,
- const unsigned char *dest,
- int64_t time_stamp);
-
- int vp9_get_raw_frame(VP9D_PTR comp, YV12_BUFFER_CONFIG *sd,
- int64_t *time_stamp, int64_t *time_end_stamp,
- vp9_ppflags_t *flags);
-
- vpx_codec_err_t vp9_get_reference_dec(VP9D_PTR comp,
- VP9_REFFRAME ref_frame_flag,
- YV12_BUFFER_CONFIG *sd);
-
- vpx_codec_err_t vp9_set_reference_dec(VP9D_PTR comp,
- VP9_REFFRAME ref_frame_flag,
- YV12_BUFFER_CONFIG *sd);
-
- VP9D_PTR vp9_create_decompressor(VP9D_CONFIG *oxcf);
-
- void vp9_remove_decompressor(VP9D_PTR comp);
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif // __INC_ONYXD_H
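(The decoder side mirrors the encoder interface: create a decompressor, feed it compressed frames, and pull raw frames back out. A hedged sketch, with return-value conventions assumed rather than documented and all sizes/pointers illustrative:)

/* Illustrative only: a minimal caller of the decoder interface above. */
VP9D_CONFIG dcfg = { 0 };
dcfg.max_threads = 1;

vp9_initialize_dec();
VP9D_PTR dec = vp9_create_decompressor(&dcfg);

/* 'frame_data'/'frame_size' hold one compressed frame, 'time_stamp' its PTS. */
vp9_receive_compressed_data(dec, frame_size, frame_data, time_stamp);

YV12_BUFFER_CONFIG sd;
int64_t ts, ts_end;
vp9_ppflags_t ppflags = { 0 };
if (vp9_get_raw_frame(dec, &sd, &ts, &ts_end, &ppflags) == 0) {
  /* 'sd' now describes the decoded frame */
}

vp9_remove_decompressor(dec);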
--- a/vp8/common/postproc.c
+++ /dev/null
@@ -1,1035 +1,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include "vpx_ports/config.h"
-#include "vpx_scale/yv12config.h"
-#include "postproc.h"
-#include "vpx_scale/yv12extend.h"
-#include "vpx_scale/vpxscale.h"
-#include "systemdependent.h"
-
-#include <math.h>
-#include <stdlib.h>
-#include <stdio.h>
-
-#define RGB_TO_YUV(t) \
- ( (0.257*(float)(t >> 16)) + (0.504*(float)(t >> 8 & 0xff)) + \
- (0.098*(float)(t & 0xff)) + 16), \
- (-(0.148*(float)(t >> 16)) - (0.291*(float)(t >> 8 & 0xff)) + \
- (0.439*(float)(t & 0xff)) + 128), \
- ( (0.439*(float)(t >> 16)) - (0.368*(float)(t >> 8 & 0xff)) - \
- (0.071*(float)(t & 0xff)) + 128)
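(The macro above is a BT.601-style RGB-to-YUV conversion, written so that a packed 0xRRGGBB literal expands into a { Y, U, V } initializer; pure green 0x00FF00, for instance, comes out near Y=144, U=54, V=34. As a plain function, for reference only:)

/* Plain-function form of the RGB_TO_YUV macro above (BT.601 coefficients). */
static void rgb_to_yuv(unsigned int rgb, float *y, float *u, float *v) {
  const float r = (float)(rgb >> 16);
  const float g = (float)(rgb >> 8 & 0xff);
  const float b = (float)(rgb & 0xff);
  *y =  0.257f * r + 0.504f * g + 0.098f * b + 16.0f;
  *u = -0.148f * r - 0.291f * g + 0.439f * b + 128.0f;
  *v =  0.439f * r - 0.368f * g - 0.071f * b + 128.0f;
}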
-
-/* global constants */
-#if CONFIG_POSTPROC_VISUALIZER
-static const unsigned char MB_PREDICTION_MODE_colors[MB_MODE_COUNT][3] = {
- { RGB_TO_YUV(0x98FB98) }, /* PaleGreen */
- { RGB_TO_YUV(0x00FF00) }, /* Green */
- { RGB_TO_YUV(0xADFF2F) }, /* GreenYellow */
- { RGB_TO_YUV(0x8F0000) }, /* Dark Red */
- { RGB_TO_YUV(0x008F8F) }, /* Dark Cyan */
- { RGB_TO_YUV(0x008F8F) }, /* Dark Cyan */
- { RGB_TO_YUV(0x008F8F) }, /* Dark Cyan */
- { RGB_TO_YUV(0x8F0000) }, /* Dark Red */
- { RGB_TO_YUV(0x8F0000) }, /* Dark Red */
- { RGB_TO_YUV(0x228B22) }, /* ForestGreen */
- { RGB_TO_YUV(0x006400) }, /* DarkGreen */
- { RGB_TO_YUV(0x98F5FF) }, /* Cadet Blue */
- { RGB_TO_YUV(0x6CA6CD) }, /* Sky Blue */
- { RGB_TO_YUV(0x00008B) }, /* Dark blue */
- { RGB_TO_YUV(0x551A8B) }, /* Purple */
-  { RGB_TO_YUV(0xFF0000) },    /* Red */
- { RGB_TO_YUV(0xCC33FF) }, /* Magenta */
-};
-
-static const unsigned char B_PREDICTION_MODE_colors[B_MODE_COUNT][3] = {
- { RGB_TO_YUV(0x6633ff) }, /* Purple */
- { RGB_TO_YUV(0xcc33ff) }, /* Magenta */
- { RGB_TO_YUV(0xff33cc) }, /* Pink */
- { RGB_TO_YUV(0xff3366) }, /* Coral */
- { RGB_TO_YUV(0x3366ff) }, /* Blue */
- { RGB_TO_YUV(0xed00f5) }, /* Dark Blue */
- { RGB_TO_YUV(0x2e00b8) }, /* Dark Purple */
- { RGB_TO_YUV(0xff6633) }, /* Orange */
- { RGB_TO_YUV(0x33ccff) }, /* Light Blue */
- { RGB_TO_YUV(0x8ab800) }, /* Green */
- { RGB_TO_YUV(0xffcc33) }, /* Light Orange */
- { RGB_TO_YUV(0x33ffcc) }, /* Aqua */
- { RGB_TO_YUV(0x66ff33) }, /* Light Green */
- { RGB_TO_YUV(0xccff33) }, /* Yellow */
-};
-
-static const unsigned char MV_REFERENCE_FRAME_colors[MAX_REF_FRAMES][3] = {
-  { RGB_TO_YUV(0x00ff00) }, /* Green */
-  { RGB_TO_YUV(0x0000ff) }, /* Blue */
- { RGB_TO_YUV(0xffff00) }, /* Yellow */
- { RGB_TO_YUV(0xff0000) }, /* Red */
-};
-#endif
-
-static const short kernel5[] = {
- 1, 1, 4, 1, 1
-};
-
-const short vp9_rv[] = {
- 8, 5, 2, 2, 8, 12, 4, 9, 8, 3,
- 0, 3, 9, 0, 0, 0, 8, 3, 14, 4,
- 10, 1, 11, 14, 1, 14, 9, 6, 12, 11,
- 8, 6, 10, 0, 0, 8, 9, 0, 3, 14,
- 8, 11, 13, 4, 2, 9, 0, 3, 9, 6,
- 1, 2, 3, 14, 13, 1, 8, 2, 9, 7,
- 3, 3, 1, 13, 13, 6, 6, 5, 2, 7,
- 11, 9, 11, 8, 7, 3, 2, 0, 13, 13,
- 14, 4, 12, 5, 12, 10, 8, 10, 13, 10,
- 4, 14, 4, 10, 0, 8, 11, 1, 13, 7,
- 7, 14, 6, 14, 13, 2, 13, 5, 4, 4,
- 0, 10, 0, 5, 13, 2, 12, 7, 11, 13,
- 8, 0, 4, 10, 7, 2, 7, 2, 2, 5,
- 3, 4, 7, 3, 3, 14, 14, 5, 9, 13,
- 3, 14, 3, 6, 3, 0, 11, 8, 13, 1,
- 13, 1, 12, 0, 10, 9, 7, 6, 2, 8,
- 5, 2, 13, 7, 1, 13, 14, 7, 6, 7,
- 9, 6, 10, 11, 7, 8, 7, 5, 14, 8,
- 4, 4, 0, 8, 7, 10, 0, 8, 14, 11,
- 3, 12, 5, 7, 14, 3, 14, 5, 2, 6,
- 11, 12, 12, 8, 0, 11, 13, 1, 2, 0,
- 5, 10, 14, 7, 8, 0, 4, 11, 0, 8,
- 0, 3, 10, 5, 8, 0, 11, 6, 7, 8,
- 10, 7, 13, 9, 2, 5, 1, 5, 10, 2,
- 4, 3, 5, 6, 10, 8, 9, 4, 11, 14,
- 0, 10, 0, 5, 13, 2, 12, 7, 11, 13,
- 8, 0, 4, 10, 7, 2, 7, 2, 2, 5,
- 3, 4, 7, 3, 3, 14, 14, 5, 9, 13,
- 3, 14, 3, 6, 3, 0, 11, 8, 13, 1,
- 13, 1, 12, 0, 10, 9, 7, 6, 2, 8,
- 5, 2, 13, 7, 1, 13, 14, 7, 6, 7,
- 9, 6, 10, 11, 7, 8, 7, 5, 14, 8,
- 4, 4, 0, 8, 7, 10, 0, 8, 14, 11,
- 3, 12, 5, 7, 14, 3, 14, 5, 2, 6,
- 11, 12, 12, 8, 0, 11, 13, 1, 2, 0,
- 5, 10, 14, 7, 8, 0, 4, 11, 0, 8,
- 0, 3, 10, 5, 8, 0, 11, 6, 7, 8,
- 10, 7, 13, 9, 2, 5, 1, 5, 10, 2,
- 4, 3, 5, 6, 10, 8, 9, 4, 11, 14,
- 3, 8, 3, 7, 8, 5, 11, 4, 12, 3,
- 11, 9, 14, 8, 14, 13, 4, 3, 1, 2,
- 14, 6, 5, 4, 4, 11, 4, 6, 2, 1,
- 5, 8, 8, 12, 13, 5, 14, 10, 12, 13,
- 0, 9, 5, 5, 11, 10, 13, 9, 10, 13,
-};
-
-
-extern void vp9_blit_text(const char *msg, unsigned char *address,
- const int pitch);
-extern void vp9_blit_line(int x0, int x1, int y0, int y1,
- unsigned char *image, const int pitch);
-/****************************************************************************
- * Apply a 5-tap low-pass filter first down each column and then across each
- * row, skipping any pixel whose neighbours differ from it by more than flimit.
- ****************************************************************************/
-void vp9_post_proc_down_and_across_c(unsigned char *src_ptr,
- unsigned char *dst_ptr,
- int src_pixels_per_line,
- int dst_pixels_per_line,
- int rows,
- int cols,
- int flimit) {
- unsigned char *p_src, *p_dst;
- int row;
- int col;
- int i;
- int v;
- int pitch = src_pixels_per_line;
- unsigned char d[8];
- (void)dst_pixels_per_line;
-
- for (row = 0; row < rows; row++) {
- /* post_proc_down for one row */
- p_src = src_ptr;
- p_dst = dst_ptr;
-
- for (col = 0; col < cols; col++) {
-
- int kernel = 4;
- int v = p_src[col];
-
- for (i = -2; i <= 2; i++) {
- if (abs(v - p_src[col + i * pitch]) > flimit)
- goto down_skip_convolve;
-
- kernel += kernel5[2 + i] * p_src[col + i * pitch];
- }
-
- v = (kernel >> 3);
- down_skip_convolve:
- p_dst[col] = v;
- }
-
- /* now post_proc_across */
- p_src = dst_ptr;
- p_dst = dst_ptr;
-
- for (i = 0; i < 8; i++)
- d[i] = p_src[i];
-
- for (col = 0; col < cols; col++) {
- int kernel = 4;
- v = p_src[col];
-
- d[col & 7] = v;
-
- for (i = -2; i <= 2; i++) {
- if (abs(v - p_src[col + i]) > flimit)
- goto across_skip_convolve;
-
- kernel += kernel5[2 + i] * p_src[col + i];
- }
-
- d[col & 7] = (kernel >> 3);
- across_skip_convolve:
-
- if (col >= 2)
- p_dst[col - 2] = d[(col - 2) & 7];
- }
-
- /* handle the last two pixels */
- p_dst[col - 2] = d[(col - 2) & 7];
- p_dst[col - 1] = d[(col - 1) & 7];
-
-
- /* next row */
- src_ptr += pitch;
- dst_ptr += pitch;
- }
-}
-
-static int q2mbl(int x) {
- if (x < 20) x = 20;
-
- x = 50 + (x - 50) * 10 / 8;
- return x * x / 3;
-}
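(For reference, q2mbl() maps the quantizer-scaled argument onto the variance threshold used by the two mbpost_proc filters: clamp below 20, stretch the range around 50 by a factor of 10/8, then square and divide by 3. With C's truncating integer division, q2mbl(20) evaluates as 50 + (20 - 50) * 10 / 8 = 50 - 37 = 13, then 13 * 13 / 3 = 56, while q2mbl(63) gives 66 * 66 / 3 = 1452.)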
-
-void vp9_mbpost_proc_across_ip_c(unsigned char *src, int pitch,
- int rows, int cols, int flimit) {
- int r, c, i;
-
- unsigned char *s = src;
- unsigned char d[16];
-
-
- for (r = 0; r < rows; r++) {
- int sumsq = 0;
- int sum = 0;
-
- for (i = -8; i <= 6; i++) {
- sumsq += s[i] * s[i];
- sum += s[i];
- d[i + 8] = 0;
- }
-
- for (c = 0; c < cols + 8; c++) {
- int x = s[c + 7] - s[c - 8];
- int y = s[c + 7] + s[c - 8];
-
- sum += x;
- sumsq += x * y;
-
- d[c & 15] = s[c];
-
- if (sumsq * 15 - sum * sum < flimit) {
- d[c & 15] = (8 + sum + s[c]) >> 4;
- }
-
- s[c - 8] = d[(c - 8) & 15];
- }
-
- s += pitch;
- }
-}
-
-void vp9_mbpost_proc_down_c(unsigned char *dst, int pitch,
- int rows, int cols, int flimit) {
- int r, c, i;
- const short *rv3 = &vp9_rv[63 & rand()];
-
- for (c = 0; c < cols; c++) {
- unsigned char *s = &dst[c];
- int sumsq = 0;
- int sum = 0;
- unsigned char d[16];
- const short *rv2 = rv3 + ((c * 17) & 127);
-
- for (i = -8; i <= 6; i++) {
- sumsq += s[i * pitch] * s[i * pitch];
- sum += s[i * pitch];
- }
-
- for (r = 0; r < rows + 8; r++) {
- sumsq += s[7 * pitch] * s[ 7 * pitch] - s[-8 * pitch] * s[-8 * pitch];
- sum += s[7 * pitch] - s[-8 * pitch];
- d[r & 15] = s[0];
-
- if (sumsq * 15 - sum * sum < flimit) {
- d[r & 15] = (rv2[r & 127] + sum + s[0]) >> 4;
- }
-
- s[-8 * pitch] = d[(r - 8) & 15];
- s += pitch;
- }
- }
-}
-
-static void deblock_and_de_macro_block(YV12_BUFFER_CONFIG *source,
- YV12_BUFFER_CONFIG *post,
- int q,
- int low_var_thresh,
- int flag,
- vp9_postproc_rtcd_vtable_t *rtcd) {
- double level = 6.0e-05 * q * q * q - .0067 * q * q + .306 * q + .0065;
- int ppl = (int)(level + .5);
- (void) low_var_thresh;
- (void) flag;
-
- POSTPROC_INVOKE(rtcd, downacross)(source->y_buffer, post->y_buffer,
- source->y_stride, post->y_stride,
- source->y_height, source->y_width, ppl);
- POSTPROC_INVOKE(rtcd, across)(post->y_buffer, post->y_stride,
- post->y_height, post->y_width, q2mbl(q));
- POSTPROC_INVOKE(rtcd, down)(post->y_buffer, post->y_stride,
- post->y_height, post->y_width, q2mbl(q));
-
- POSTPROC_INVOKE(rtcd, downacross)(source->u_buffer, post->u_buffer,
- source->uv_stride, post->uv_stride,
- source->uv_height, source->uv_width, ppl);
- POSTPROC_INVOKE(rtcd, downacross)(source->v_buffer, post->v_buffer,
- source->uv_stride, post->uv_stride,
- source->uv_height, source->uv_width, ppl);
-}
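(The cubic in q above converts the postprocessor quantizer into a per-pixel filter limit. As a worked example, q = 32 gives level = 6.0e-05*32768 - 0.0067*1024 + 0.306*32 + 0.0065 ≈ 4.90, so ppl rounds to 5; the same expression is reused unchanged in vp9_deblock() and vp9_de_noise() below.)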
-
-void vp9_deblock(YV12_BUFFER_CONFIG *source,
- YV12_BUFFER_CONFIG *post,
- int q,
- int low_var_thresh,
- int flag,
- vp9_postproc_rtcd_vtable_t *rtcd) {
- double level = 6.0e-05 * q * q * q - .0067 * q * q + .306 * q + .0065;
- int ppl = (int)(level + .5);
- (void) low_var_thresh;
- (void) flag;
-
- POSTPROC_INVOKE(rtcd, downacross)(source->y_buffer, post->y_buffer,
- source->y_stride, post->y_stride,
- source->y_height, source->y_width, ppl);
- POSTPROC_INVOKE(rtcd, downacross)(source->u_buffer, post->u_buffer,
- source->uv_stride, post->uv_stride,
- source->uv_height, source->uv_width, ppl);
- POSTPROC_INVOKE(rtcd, downacross)(source->v_buffer, post->v_buffer,
- source->uv_stride, post->uv_stride,
- source->uv_height, source->uv_width, ppl);
-}
-
-void vp9_de_noise(YV12_BUFFER_CONFIG *src,
- YV12_BUFFER_CONFIG *post,
- int q,
- int low_var_thresh,
- int flag,
- vp9_postproc_rtcd_vtable_t *rtcd) {
- double level = 6.0e-05 * q * q * q - .0067 * q * q + .306 * q + .0065;
- int ppl = (int)(level + .5);
- (void) post;
- (void) low_var_thresh;
- (void) flag;
-
- POSTPROC_INVOKE(rtcd, downacross)(src->y_buffer + 2 * src->y_stride + 2,
- src->y_buffer + 2 * src->y_stride + 2,
- src->y_stride,
- src->y_stride,
- src->y_height - 4,
- src->y_width - 4,
- ppl);
- POSTPROC_INVOKE(rtcd, downacross)(src->u_buffer + 2 * src->uv_stride + 2,
- src->u_buffer + 2 * src->uv_stride + 2,
- src->uv_stride,
- src->uv_stride,
- src->uv_height - 4,
- src->uv_width - 4, ppl);
- POSTPROC_INVOKE(rtcd, downacross)(src->v_buffer + 2 * src->uv_stride + 2,
- src->v_buffer + 2 * src->uv_stride + 2,
- src->uv_stride,
- src->uv_stride,
- src->uv_height - 4,
- src->uv_width - 4, ppl);
-}
-
-double vp9_gaussian(double sigma, double mu, double x) {
- return 1 / (sigma * sqrt(2.0 * 3.14159265)) *
- (exp(-(x - mu) * (x - mu) / (2 * sigma * sigma)));
-}
-
-static void fillrd(struct postproc_state *state, int q, int a) {
- char char_dist[300];
-
- double sigma;
- int ai = a, qi = q, i;
-
- vp9_clear_system_state();
-
- sigma = ai + .5 + .6 * (63 - qi) / 63.0;
-
- /* set up a lookup table of 256 entries that matches
- * a gaussian distribution with sigma determined by q.
- */
- {
- double i;
- int next, j;
-
- next = 0;
-
- for (i = -32; i < 32; i++) {
- int a = (int)(.5 + 256 * vp9_gaussian(sigma, 0, i));
-
- if (a) {
- for (j = 0; j < a; j++) {
- char_dist[next + j] = (char) i;
- }
-
- next = next + j;
- }
-
- }
-
-    for (; next < 256; next++)
- char_dist[next] = 0;
- }
-
- for (i = 0; i < 3072; i++) {
- state->noise[i] = char_dist[rand() & 0xff];
- }
-
- for (i = 0; i < 16; i++) {
- state->blackclamp[i] = -char_dist[0];
- state->whiteclamp[i] = -char_dist[0];
- state->bothclamp[i] = -2 * char_dist[0];
- }
-
- state->last_q = q;
- state->last_noise = a;
-}
-
-/****************************************************************************
- *
- * ROUTINE : plane_add_noise_c
- *
- * INPUTS : unsigned char *Start starting address of buffer to
- * add gaussian noise to
- * unsigned int Width width of plane
- * unsigned int Height height of plane
- * int Pitch distance between subsequent lines of frame
- * int q quantizer used to determine amount of noise
- * to add
- *
- * OUTPUTS : None.
- *
- * RETURNS : void.
- *
- * FUNCTION : adds gaussian noise to a plane of pixels
- *
- * SPECIAL NOTES : None.
- *
- ****************************************************************************/
-void vp9_plane_add_noise_c(unsigned char *Start, char *noise,
- char blackclamp[16],
- char whiteclamp[16],
- char bothclamp[16],
- unsigned int Width, unsigned int Height, int Pitch) {
- unsigned int i, j;
-
- for (i = 0; i < Height; i++) {
- unsigned char *Pos = Start + i * Pitch;
- char *Ref = (char *)(noise + (rand() & 0xff));
-
- for (j = 0; j < Width; j++) {
- if (Pos[j] < blackclamp[0])
- Pos[j] = blackclamp[0];
-
- if (Pos[j] > 255 + whiteclamp[0])
- Pos[j] = 255 + whiteclamp[0];
-
- Pos[j] += Ref[j];
- }
- }
-}
-
-/* Blend the macro block with a solid colored square. Leave the
- * edges unblended to give distinction to macro blocks in areas
- * filled with the same color block.
- */
-void vp9_blend_mb_inner_c(unsigned char *y, unsigned char *u, unsigned char *v,
- int y1, int u1, int v1, int alpha, int stride) {
- int i, j;
- int y1_const = y1 * ((1 << 16) - alpha);
- int u1_const = u1 * ((1 << 16) - alpha);
- int v1_const = v1 * ((1 << 16) - alpha);
-
- y += 2 * stride + 2;
- for (i = 0; i < 12; i++) {
- for (j = 0; j < 12; j++) {
- y[j] = (y[j] * alpha + y1_const) >> 16;
- }
- y += stride;
- }
-
- stride >>= 1;
-
- u += stride + 1;
- v += stride + 1;
-
- for (i = 0; i < 6; i++) {
- for (j = 0; j < 6; j++) {
- u[j] = (u[j] * alpha + u1_const) >> 16;
- v[j] = (v[j] * alpha + v1_const) >> 16;
- }
- u += stride;
- v += stride;
- }
-}
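(The blend helpers in this file share one fixed-point rule: each sample becomes (pixel*alpha + colour*(65536 - alpha)) >> 16, so the alpha of 0xc000 used by the visualizer code later in this file keeps 3/4 of the original pixel and mixes in 1/4 of the overlay colour. A one-line sketch of the per-sample operation, for illustration only:)

/* Per-sample fixed-point blend used by the vp9_blend_* helpers here. */
static unsigned char blend_sample(unsigned char pixel, unsigned char colour,
                                  int alpha) {
  return (unsigned char)((pixel * alpha + colour * ((1 << 16) - alpha)) >> 16);
}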
-
-/* Blend only the edge of the macro block. Leave center
- * unblended to allow for other visualizations to be layered.
- */
-void vp9_blend_mb_outer_c(unsigned char *y, unsigned char *u, unsigned char *v,
- int y1, int u1, int v1, int alpha, int stride) {
- int i, j;
- int y1_const = y1 * ((1 << 16) - alpha);
- int u1_const = u1 * ((1 << 16) - alpha);
- int v1_const = v1 * ((1 << 16) - alpha);
-
- for (i = 0; i < 2; i++) {
- for (j = 0; j < 16; j++) {
- y[j] = (y[j] * alpha + y1_const) >> 16;
- }
- y += stride;
- }
-
- for (i = 0; i < 12; i++) {
- y[0] = (y[0] * alpha + y1_const) >> 16;
- y[1] = (y[1] * alpha + y1_const) >> 16;
- y[14] = (y[14] * alpha + y1_const) >> 16;
- y[15] = (y[15] * alpha + y1_const) >> 16;
- y += stride;
- }
-
- for (i = 0; i < 2; i++) {
- for (j = 0; j < 16; j++) {
- y[j] = (y[j] * alpha + y1_const) >> 16;
- }
- y += stride;
- }
-
- stride >>= 1;
-
- for (j = 0; j < 8; j++) {
- u[j] = (u[j] * alpha + u1_const) >> 16;
- v[j] = (v[j] * alpha + v1_const) >> 16;
- }
- u += stride;
- v += stride;
-
- for (i = 0; i < 6; i++) {
- u[0] = (u[0] * alpha + u1_const) >> 16;
- v[0] = (v[0] * alpha + v1_const) >> 16;
-
- u[7] = (u[7] * alpha + u1_const) >> 16;
- v[7] = (v[7] * alpha + v1_const) >> 16;
-
- u += stride;
- v += stride;
- }
-
- for (j = 0; j < 8; j++) {
- u[j] = (u[j] * alpha + u1_const) >> 16;
- v[j] = (v[j] * alpha + v1_const) >> 16;
- }
-}
-
-void vp9_blend_b_c(unsigned char *y, unsigned char *u, unsigned char *v,
- int y1, int u1, int v1, int alpha, int stride) {
- int i, j;
- int y1_const = y1 * ((1 << 16) - alpha);
- int u1_const = u1 * ((1 << 16) - alpha);
- int v1_const = v1 * ((1 << 16) - alpha);
-
- for (i = 0; i < 4; i++) {
- for (j = 0; j < 4; j++) {
- y[j] = (y[j] * alpha + y1_const) >> 16;
- }
- y += stride;
- }
-
- stride >>= 1;
-
- for (i = 0; i < 2; i++) {
- for (j = 0; j < 2; j++) {
- u[j] = (u[j] * alpha + u1_const) >> 16;
- v[j] = (v[j] * alpha + v1_const) >> 16;
- }
- u += stride;
- v += stride;
- }
-}
-
-static void constrain_line(int x0, int *x1, int y0, int *y1,
- int width, int height) {
- int dx;
- int dy;
-
- if (*x1 > width) {
- dx = *x1 - x0;
- dy = *y1 - y0;
-
- *x1 = width;
- if (dx)
- *y1 = ((width - x0) * dy) / dx + y0;
- }
- if (*x1 < 0) {
- dx = *x1 - x0;
- dy = *y1 - y0;
-
- *x1 = 0;
- if (dx)
- *y1 = ((0 - x0) * dy) / dx + y0;
- }
- if (*y1 > height) {
- dx = *x1 - x0;
- dy = *y1 - y0;
-
- *y1 = height;
- if (dy)
- *x1 = ((height - y0) * dx) / dy + x0;
- }
- if (*y1 < 0) {
- dx = *x1 - x0;
- dy = *y1 - y0;
-
- *y1 = 0;
- if (dy)
- *x1 = ((0 - y0) * dx) / dy + x0;
- }
-}
-
-
-#if CONFIG_RUNTIME_CPU_DETECT
-#define RTCD_VTABLE(oci) (&(oci)->rtcd.postproc)
-#else
-#define RTCD_VTABLE(oci) NULL
-#endif
-
-int vp9_post_proc_frame(VP9_COMMON *oci, YV12_BUFFER_CONFIG *dest,
- vp9_ppflags_t *ppflags) {
- int q = oci->filter_level * 10 / 6;
- int flags = ppflags->post_proc_flag;
- int deblock_level = ppflags->deblocking_level;
- int noise_level = ppflags->noise_level;
-
- if (!oci->frame_to_show)
- return -1;
-
- if (q > 63)
- q = 63;
-
- if (!flags) {
- *dest = *oci->frame_to_show;
-
- /* handle problem with extending borders */
- dest->y_width = oci->Width;
- dest->y_height = oci->Height;
- dest->uv_height = dest->y_height / 2;
- return 0;
-
- }
-
-#if ARCH_X86 || ARCH_X86_64
- vpx_reset_mmx_state();
-#endif
-
- if (flags & VP9D_DEMACROBLOCK) {
- deblock_and_de_macro_block(oci->frame_to_show, &oci->post_proc_buffer,
- q + (deblock_level - 5) * 10, 1, 0,
- RTCD_VTABLE(oci));
- } else if (flags & VP9D_DEBLOCK) {
- vp9_deblock(oci->frame_to_show, &oci->post_proc_buffer,
- q, 1, 0, RTCD_VTABLE(oci));
- } else {
- vp8_yv12_copy_frame_ptr(oci->frame_to_show, &oci->post_proc_buffer);
- }
-
- if (flags & VP9D_ADDNOISE) {
- if (oci->postproc_state.last_q != q
- || oci->postproc_state.last_noise != noise_level) {
- fillrd(&oci->postproc_state, 63 - q, noise_level);
- }
-
- POSTPROC_INVOKE(RTCD_VTABLE(oci), addnoise)(oci->post_proc_buffer.y_buffer,
- oci->postproc_state.noise,
- oci->postproc_state.blackclamp,
- oci->postproc_state.whiteclamp,
- oci->postproc_state.bothclamp,
- oci->post_proc_buffer.y_width,
- oci->post_proc_buffer.y_height,
- oci->post_proc_buffer.y_stride);
- }
-
-#if CONFIG_POSTPROC_VISUALIZER
- if (flags & VP9D_DEBUG_TXT_FRAME_INFO) {
- char message[512];
- sprintf(message, "F%1dG%1dQ%3dF%3dP%d_s%dx%d",
- (oci->frame_type == KEY_FRAME),
- oci->refresh_golden_frame,
- oci->base_qindex,
- oci->filter_level,
- flags,
- oci->mb_cols, oci->mb_rows);
- vp9_blit_text(message, oci->post_proc_buffer.y_buffer,
- oci->post_proc_buffer.y_stride);
- }
-
- if (flags & VP9D_DEBUG_TXT_MBLK_MODES) {
- int i, j;
- unsigned char *y_ptr;
- YV12_BUFFER_CONFIG *post = &oci->post_proc_buffer;
- int mb_rows = post->y_height >> 4;
- int mb_cols = post->y_width >> 4;
- int mb_index = 0;
- MODE_INFO *mi = oci->mi;
-
- y_ptr = post->y_buffer + 4 * post->y_stride + 4;
-
- /* vp9_filter each macro block */
- for (i = 0; i < mb_rows; i++) {
- for (j = 0; j < mb_cols; j++) {
- char zz[4];
-
- sprintf(zz, "%c", mi[mb_index].mbmi.mode + 'a');
-
- vp9_blit_text(zz, y_ptr, post->y_stride);
- mb_index++;
- y_ptr += 16;
- }
-
- mb_index++; /* border */
- y_ptr += post->y_stride * 16 - post->y_width;
-
- }
- }
-
- if (flags & VP9D_DEBUG_TXT_DC_DIFF) {
- int i, j;
- unsigned char *y_ptr;
- YV12_BUFFER_CONFIG *post = &oci->post_proc_buffer;
- int mb_rows = post->y_height >> 4;
- int mb_cols = post->y_width >> 4;
- int mb_index = 0;
- MODE_INFO *mi = oci->mi;
-
- y_ptr = post->y_buffer + 4 * post->y_stride + 4;
-
- /* vp9_filter each macro block */
- for (i = 0; i < mb_rows; i++) {
- for (j = 0; j < mb_cols; j++) {
- char zz[4];
- int dc_diff = !(mi[mb_index].mbmi.mode != B_PRED &&
- mi[mb_index].mbmi.mode != SPLITMV &&
- mi[mb_index].mbmi.mb_skip_coeff);
-
- if (oci->frame_type == KEY_FRAME)
- sprintf(zz, "a");
- else
- sprintf(zz, "%c", dc_diff + '0');
-
- vp9_blit_text(zz, y_ptr, post->y_stride);
- mb_index++;
- y_ptr += 16;
- }
-
- mb_index++; /* border */
- y_ptr += post->y_stride * 16 - post->y_width;
-
- }
- }
-
- if (flags & VP9D_DEBUG_TXT_RATE_INFO) {
- char message[512];
- snprintf(message, sizeof(message),
- "Bitrate: %10.2f frame_rate: %10.2f ",
- oci->bitrate, oci->framerate);
- vp9_blit_text(message, oci->post_proc_buffer.y_buffer,
- oci->post_proc_buffer.y_stride);
- }
-
- /* Draw motion vectors */
- if ((flags & VP9D_DEBUG_DRAW_MV) && ppflags->display_mv_flag) {
- YV12_BUFFER_CONFIG *post = &oci->post_proc_buffer;
- int width = post->y_width;
- int height = post->y_height;
- unsigned char *y_buffer = oci->post_proc_buffer.y_buffer;
- int y_stride = oci->post_proc_buffer.y_stride;
- MODE_INFO *mi = oci->mi;
- int x0, y0;
-
- for (y0 = 0; y0 < height; y0 += 16) {
- for (x0 = 0; x0 < width; x0 += 16) {
- int x1, y1;
-
- if (!(ppflags->display_mv_flag & (1 << mi->mbmi.mode))) {
- mi++;
- continue;
- }
-
- if (mi->mbmi.mode == SPLITMV) {
- switch (mi->mbmi.partitioning) {
- case PARTITIONING_16X8 : { /* mv_top_bottom */
- union b_mode_info *bmi = &mi->bmi[0];
- MV *mv = &bmi->mv.as_mv;
-
- x1 = x0 + 8 + (mv->col >> 3);
- y1 = y0 + 4 + (mv->row >> 3);
-
- constrain_line(x0 + 8, &x1, y0 + 4, &y1, width, height);
- vp9_blit_line(x0 + 8, x1, y0 + 4, y1, y_buffer, y_stride);
-
- bmi = &mi->bmi[8];
-
- x1 = x0 + 8 + (mv->col >> 3);
- y1 = y0 + 12 + (mv->row >> 3);
-
- constrain_line(x0 + 8, &x1, y0 + 12, &y1, width, height);
- vp9_blit_line(x0 + 8, x1, y0 + 12, y1, y_buffer, y_stride);
-
- break;
- }
- case PARTITIONING_8X16 : { /* mv_left_right */
- union b_mode_info *bmi = &mi->bmi[0];
- MV *mv = &bmi->mv.as_mv;
-
- x1 = x0 + 4 + (mv->col >> 3);
- y1 = y0 + 8 + (mv->row >> 3);
-
- constrain_line(x0 + 4, &x1, y0 + 8, &y1, width, height);
- vp9_blit_line(x0 + 4, x1, y0 + 8, y1, y_buffer, y_stride);
-
- bmi = &mi->bmi[2];
-
- x1 = x0 + 12 + (mv->col >> 3);
- y1 = y0 + 8 + (mv->row >> 3);
-
- constrain_line(x0 + 12, &x1, y0 + 8, &y1, width, height);
- vp9_blit_line(x0 + 12, x1, y0 + 8, y1, y_buffer, y_stride);
-
- break;
- }
- case PARTITIONING_8X8 : { /* mv_quarters */
- union b_mode_info *bmi = &mi->bmi[0];
- MV *mv = &bmi->mv.as_mv;
-
- x1 = x0 + 4 + (mv->col >> 3);
- y1 = y0 + 4 + (mv->row >> 3);
-
- constrain_line(x0 + 4, &x1, y0 + 4, &y1, width, height);
- vp9_blit_line(x0 + 4, x1, y0 + 4, y1, y_buffer, y_stride);
-
- bmi = &mi->bmi[2];
-
- x1 = x0 + 12 + (mv->col >> 3);
- y1 = y0 + 4 + (mv->row >> 3);
-
- constrain_line(x0 + 12, &x1, y0 + 4, &y1, width, height);
- vp9_blit_line(x0 + 12, x1, y0 + 4, y1, y_buffer, y_stride);
-
- bmi = &mi->bmi[8];
-
- x1 = x0 + 4 + (mv->col >> 3);
- y1 = y0 + 12 + (mv->row >> 3);
-
- constrain_line(x0 + 4, &x1, y0 + 12, &y1, width, height);
- vp9_blit_line(x0 + 4, x1, y0 + 12, y1, y_buffer, y_stride);
-
- bmi = &mi->bmi[10];
-
- x1 = x0 + 12 + (mv->col >> 3);
- y1 = y0 + 12 + (mv->row >> 3);
-
- constrain_line(x0 + 12, &x1, y0 + 12, &y1, width, height);
- vp9_blit_line(x0 + 12, x1, y0 + 12, y1, y_buffer, y_stride);
- break;
- }
- case PARTITIONING_4X4:
- default : {
- union b_mode_info *bmi = mi->bmi;
- int bx0, by0;
-
- for (by0 = y0; by0 < (y0 + 16); by0 += 4) {
- for (bx0 = x0; bx0 < (x0 + 16); bx0 += 4) {
- MV *mv = &bmi->mv.as_mv;
-
- x1 = bx0 + 2 + (mv->col >> 3);
- y1 = by0 + 2 + (mv->row >> 3);
-
- constrain_line(bx0 + 2, &x1, by0 + 2, &y1, width, height);
- vp9_blit_line(bx0 + 2, x1, by0 + 2, y1, y_buffer, y_stride);
-
- bmi++;
- }
- }
- }
- }
- } else if (mi->mbmi.mode >= NEARESTMV) {
- MV *mv = &mi->mbmi.mv.as_mv;
- const int lx0 = x0 + 8;
- const int ly0 = y0 + 8;
-
- x1 = lx0 + (mv->col >> 3);
- y1 = ly0 + (mv->row >> 3);
-
- if (x1 != lx0 && y1 != ly0) {
- constrain_line(lx0, &x1, ly0 - 1, &y1, width, height);
- vp9_blit_line(lx0, x1, ly0 - 1, y1, y_buffer, y_stride);
-
- constrain_line(lx0, &x1, ly0 + 1, &y1, width, height);
- vp9_blit_line(lx0, x1, ly0 + 1, y1, y_buffer, y_stride);
- } else
- vp9_blit_line(lx0, x1, ly0, y1, y_buffer, y_stride);
- }
-
- mi++;
- }
- mi++;
- }
- }
-
- /* Color in block modes */
- if ((flags & VP9D_DEBUG_CLR_BLK_MODES)
- && (ppflags->display_mb_modes_flag || ppflags->display_b_modes_flag)) {
- int y, x;
- YV12_BUFFER_CONFIG *post = &oci->post_proc_buffer;
- int width = post->y_width;
- int height = post->y_height;
- unsigned char *y_ptr = oci->post_proc_buffer.y_buffer;
- unsigned char *u_ptr = oci->post_proc_buffer.u_buffer;
- unsigned char *v_ptr = oci->post_proc_buffer.v_buffer;
- int y_stride = oci->post_proc_buffer.y_stride;
- MODE_INFO *mi = oci->mi;
-
- for (y = 0; y < height; y += 16) {
- for (x = 0; x < width; x += 16) {
- int Y = 0, U = 0, V = 0;
-
- if (mi->mbmi.mode == B_PRED &&
- ((ppflags->display_mb_modes_flag & B_PRED) ||
- ppflags->display_b_modes_flag)) {
- int by, bx;
- unsigned char *yl, *ul, *vl;
- union b_mode_info *bmi = mi->bmi;
-
- yl = y_ptr + x;
- ul = u_ptr + (x >> 1);
- vl = v_ptr + (x >> 1);
-
- for (by = 0; by < 16; by += 4) {
- for (bx = 0; bx < 16; bx += 4) {
- if ((ppflags->display_b_modes_flag & (1 << mi->mbmi.mode))
- || (ppflags->display_mb_modes_flag & B_PRED)) {
- Y = B_PREDICTION_MODE_colors[bmi->as_mode.first][0];
- U = B_PREDICTION_MODE_colors[bmi->as_mode.first][1];
- V = B_PREDICTION_MODE_colors[bmi->as_mode.first][2];
-
- POSTPROC_INVOKE(RTCD_VTABLE(oci), blend_b)(yl + bx,
- ul + (bx >> 1),
- vl + (bx >> 1),
- Y, U, V,
- 0xc000, y_stride);
- }
- bmi++;
- }
-
- yl += y_stride * 4;
- ul += y_stride * 1;
- vl += y_stride * 1;
- }
- } else if (ppflags->display_mb_modes_flag & (1 << mi->mbmi.mode)) {
- Y = MB_PREDICTION_MODE_colors[mi->mbmi.mode][0];
- U = MB_PREDICTION_MODE_colors[mi->mbmi.mode][1];
- V = MB_PREDICTION_MODE_colors[mi->mbmi.mode][2];
-
- POSTPROC_INVOKE(RTCD_VTABLE(oci), blend_mb_inner)(y_ptr + x,
- u_ptr + (x >> 1),
- v_ptr + (x >> 1),
- Y, U, V,
- 0xc000, y_stride);
- }
-
- mi++;
- }
- y_ptr += y_stride * 16;
- u_ptr += y_stride * 4;
- v_ptr += y_stride * 4;
-
- mi++;
- }
- }
-
- /* Color in frame reference blocks */
- if ((flags & VP9D_DEBUG_CLR_FRM_REF_BLKS) &&
- ppflags->display_ref_frame_flag) {
- int y, x;
- YV12_BUFFER_CONFIG *post = &oci->post_proc_buffer;
- int width = post->y_width;
- int height = post->y_height;
- unsigned char *y_ptr = oci->post_proc_buffer.y_buffer;
- unsigned char *u_ptr = oci->post_proc_buffer.u_buffer;
- unsigned char *v_ptr = oci->post_proc_buffer.v_buffer;
- int y_stride = oci->post_proc_buffer.y_stride;
- MODE_INFO *mi = oci->mi;
-
- for (y = 0; y < height; y += 16) {
- for (x = 0; x < width; x += 16) {
- int Y = 0, U = 0, V = 0;
-
- if (ppflags->display_ref_frame_flag & (1 << mi->mbmi.ref_frame)) {
- Y = MV_REFERENCE_FRAME_colors[mi->mbmi.ref_frame][0];
- U = MV_REFERENCE_FRAME_colors[mi->mbmi.ref_frame][1];
- V = MV_REFERENCE_FRAME_colors[mi->mbmi.ref_frame][2];
-
- POSTPROC_INVOKE(RTCD_VTABLE(oci), blend_mb_outer)(y_ptr + x,
- u_ptr + (x >> 1),
- v_ptr + (x >> 1),
- Y, U, V,
- 0xc000, y_stride);
- }
-
- mi++;
- }
- y_ptr += y_stride * 16;
- u_ptr += y_stride * 4;
- v_ptr += y_stride * 4;
-
- mi++;
- }
- }
-#endif
-
- *dest = oci->post_proc_buffer;
-
- /* handle problem with extending borders */
- dest->y_width = oci->Width;
- dest->y_height = oci->Height;
- dest->uv_height = dest->y_height / 2;
-
- return 0;
-}
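(A hedged sketch of how a caller would request the deblocking and noise paths above; the flag and field names are taken from the function itself, while the cm pointer and level values are illustrative:)

/* Illustrative only: request deblocking plus additive noise on the frame
 * most recently decoded into cm (a VP9_COMMON *). */
vp9_ppflags_t ppflags = { 0 };
ppflags.post_proc_flag = VP9D_DEBLOCK | VP9D_ADDNOISE;
ppflags.deblocking_level = 5;
ppflags.noise_level = 1;

YV12_BUFFER_CONFIG out;
if (vp9_post_proc_frame(cm, &out, &ppflags) == 0) {
  /* 'out' now aliases cm->post_proc_buffer with display dimensions patched up */
}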
--- a/vp8/common/postproc.h
+++ /dev/null
@@ -1,128 +1,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#ifndef POSTPROC_H
-#define POSTPROC_H
-
-#define prototype_postproc_inplace(sym)\
- void sym(unsigned char *dst, int pitch, int rows, int cols, int flimit)
-
-#define prototype_postproc(sym)\
- void sym(unsigned char *src, unsigned char *dst, int src_pitch, \
- int dst_pitch, int rows, int cols, int flimit)
-
-#define prototype_postproc_addnoise(sym) \
- void sym(unsigned char *s, char *noise, char blackclamp[16], \
- char whiteclamp[16], char bothclamp[16], \
- unsigned int w, unsigned int h, int pitch)
-
-#define prototype_postproc_blend_mb_inner(sym)\
- void sym(unsigned char *y, unsigned char *u, unsigned char *v, \
- int y1, int u1, int v1, int alpha, int stride)
-
-#define prototype_postproc_blend_mb_outer(sym)\
- void sym(unsigned char *y, unsigned char *u, unsigned char *v, \
- int y1, int u1, int v1, int alpha, int stride)
-
-#define prototype_postproc_blend_b(sym)\
- void sym(unsigned char *y, unsigned char *u, unsigned char *v, \
- int y1, int u1, int v1, int alpha, int stride)
-
-#if ARCH_X86 || ARCH_X86_64
-#include "x86/postproc_x86.h"
-#endif
-
-#ifndef vp9_postproc_down
-#define vp9_postproc_down vp9_mbpost_proc_down_c
-#endif
-extern prototype_postproc_inplace(vp9_postproc_down);
-
-#ifndef vp9_postproc_across
-#define vp9_postproc_across vp9_mbpost_proc_across_ip_c
-#endif
-extern prototype_postproc_inplace(vp9_postproc_across);
-
-#ifndef vp9_postproc_downacross
-#define vp9_postproc_downacross vp9_post_proc_down_and_across_c
-#endif
-extern prototype_postproc(vp9_postproc_downacross);
-
-#ifndef vp9_postproc_addnoise
-#define vp9_postproc_addnoise vp9_plane_add_noise_c
-#endif
-extern prototype_postproc_addnoise(vp9_postproc_addnoise);
-
-#ifndef vp9_postproc_blend_mb_inner
-#define vp9_postproc_blend_mb_inner vp9_blend_mb_inner_c
-#endif
-extern prototype_postproc_blend_mb_inner(vp9_postproc_blend_mb_inner);
-
-#ifndef vp9_postproc_blend_mb_outer
-#define vp9_postproc_blend_mb_outer vp9_blend_mb_outer_c
-#endif
-extern prototype_postproc_blend_mb_outer(vp9_postproc_blend_mb_outer);
-
-#ifndef vp9_postproc_blend_b
-#define vp9_postproc_blend_b vp9_blend_b_c
-#endif
-extern prototype_postproc_blend_b(vp9_postproc_blend_b);
-
-typedef prototype_postproc((*vp9_postproc_fn_t));
-typedef prototype_postproc_inplace((*vp9_postproc_inplace_fn_t));
-typedef prototype_postproc_addnoise((*vp9_postproc_addnoise_fn_t));
-typedef prototype_postproc_blend_mb_inner((*vp9_postproc_blend_mb_inner_fn_t));
-typedef prototype_postproc_blend_mb_outer((*vp9_postproc_blend_mb_outer_fn_t));
-typedef prototype_postproc_blend_b((*vp9_postproc_blend_b_fn_t));
-typedef struct {
- vp9_postproc_inplace_fn_t down;
- vp9_postproc_inplace_fn_t across;
- vp9_postproc_fn_t downacross;
- vp9_postproc_addnoise_fn_t addnoise;
- vp9_postproc_blend_mb_inner_fn_t blend_mb_inner;
- vp9_postproc_blend_mb_outer_fn_t blend_mb_outer;
- vp9_postproc_blend_b_fn_t blend_b;
-} vp9_postproc_rtcd_vtable_t;
-
-#if CONFIG_RUNTIME_CPU_DETECT
-#define POSTPROC_INVOKE(ctx,fn) (ctx)->fn
-#else
-#define POSTPROC_INVOKE(ctx,fn) vp9_postproc_##fn
-#endif
-
-#include "vpx_ports/mem.h"
-struct postproc_state {
- int last_q;
- int last_noise;
- char noise[3072];
- DECLARE_ALIGNED(16, char, blackclamp[16]);
- DECLARE_ALIGNED(16, char, whiteclamp[16]);
- DECLARE_ALIGNED(16, char, bothclamp[16]);
-};
-#include "onyxc_int.h"
-#include "ppflags.h"
-int vp9_post_proc_frame(struct VP9Common *oci, YV12_BUFFER_CONFIG *dest,
- vp9_ppflags_t *flags);
-
-
-void vp9_de_noise(YV12_BUFFER_CONFIG *source,
- YV12_BUFFER_CONFIG *post,
- int q,
- int low_var_thresh,
- int flag,
- vp9_postproc_rtcd_vtable_t *rtcd);
-
-void vp9_deblock(YV12_BUFFER_CONFIG *source,
- YV12_BUFFER_CONFIG *post,
- int q,
- int low_var_thresh,
- int flag,
- vp9_postproc_rtcd_vtable_t *rtcd);
-#endif
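(POSTPROC_INVOKE above is the usual run-time-CPU-detect dispatch pattern: with CONFIG_RUNTIME_CPU_DETECT the call goes through the vtable embedded in the common context, otherwise the macro collapses at compile time into a direct call to the default or platform-overridden symbol. For illustration, using the RTCD_VTABLE helper defined in postproc.c:)

/* With runtime CPU detection enabled, this expands to an indirect call: */
POSTPROC_INVOKE(RTCD_VTABLE(oci), downacross)(src, dst, sp, dp, rows, cols, flimit);
/*   -> (&(oci)->rtcd.postproc)->downacross(src, dst, sp, dp, rows, cols, flimit); */

/* Without it, the same line becomes a direct call bound at compile time: */
/*   -> vp9_postproc_downacross(src, dst, sp, dp, rows, cols, flimit);            */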
--- a/vp8/common/ppc/copy_altivec.asm
+++ /dev/null
@@ -1,47 +1,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
- .globl copy_mem16x16_ppc
-
-;# r3 unsigned char *src
-;# r4 int src_stride
-;# r5 unsigned char *dst
-;# r6 int dst_stride
-
-;# Make the assumption that input will not be aligned,
-;# but the output will be. So two reads and a perm
-;# for the input, but only one store for the output.
-copy_mem16x16_ppc:
- mfspr r11, 256 ;# get old VRSAVE
- oris r12, r11, 0xe000
- mtspr 256, r12 ;# set VRSAVE
-
- li r10, 16
- mtctr r10
-
-cp_16x16_loop:
- lvsl v0, 0, r3 ;# permutate value for alignment
-
- lvx v1, 0, r3
- lvx v2, r10, r3
-
- vperm v1, v1, v2, v0
-
- stvx v1, 0, r5
-
- add r3, r3, r4 ;# increment source pointer
- add r5, r5, r6 ;# increment destination pointer
-
- bdnz cp_16x16_loop
-
- mtspr 256, r11 ;# reset old VRSAVE
-
- blr
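(For readers not fluent in AltiVec: the routine above is a straight 16x16 byte copy, and the lvsl/vperm pair exists only to handle unaligned source rows. A plain C equivalent, with a hypothetical name, would be:)

/* Scalar equivalent of copy_mem16x16_ppc: copy a 16x16 block of bytes. */
static void copy_mem16x16_c(const unsigned char *src, int src_stride,
                            unsigned char *dst, int dst_stride) {
  int r, c;
  for (r = 0; r < 16; ++r) {
    for (c = 0; c < 16; ++c)
      dst[c] = src[c];
    src += src_stride;
    dst += dst_stride;
  }
}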
--- a/vp8/common/ppc/filter_altivec.asm
+++ /dev/null
@@ -1,1013 +1,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
- .globl sixtap_predict_ppc
- .globl sixtap_predict8x4_ppc
- .globl sixtap_predict8x8_ppc
- .globl sixtap_predict16x16_ppc
-
-.macro load_c V, LABEL, OFF, R0, R1
- lis \R0, \LABEL@ha
- la \R1, \LABEL@l(\R0)
- lvx \V, \OFF, \R1
-.endm
-
-.macro load_hfilter V0, V1
- load_c \V0, HFilter, r5, r9, r10
-
- addi r5, r5, 16
- lvx \V1, r5, r10
-.endm
-
-;# Vertical filtering
-.macro Vprolog
- load_c v0, VFilter, r6, r3, r10
-
- vspltish v5, 8
- vspltish v6, 3
- vslh v6, v5, v6 ;# 0x0040 0040 0040 0040 0040 0040 0040 0040
-
- vspltb v1, v0, 1
- vspltb v2, v0, 2
- vspltb v3, v0, 3
- vspltb v4, v0, 4
- vspltb v5, v0, 5
- vspltb v0, v0, 0
-.endm
-
-.macro vpre_load
- Vprolog
- li r10, 16
- lvx v10, 0, r9 ;# v10..v14 = first 5 rows
- lvx v11, r10, r9
- addi r9, r9, 32
- lvx v12, 0, r9
- lvx v13, r10, r9
- addi r9, r9, 32
- lvx v14, 0, r9
-.endm
-
-.macro Msum Re, Ro, V, T, TMP
- ;# (Re,Ro) += (V*T)
- vmuleub \TMP, \V, \T ;# trashes v8
- vadduhm \Re, \Re, \TMP ;# Re = evens, saturation unnecessary
- vmuloub \TMP, \V, \T
- vadduhm \Ro, \Ro, \TMP ;# Ro = odds
-.endm
-
-.macro vinterp_no_store P0 P1 P2 P3 P4 P5
- vmuleub v8, \P0, v0 ;# 64 + 4 positive taps
- vadduhm v16, v6, v8
- vmuloub v8, \P0, v0
- vadduhm v17, v6, v8
- Msum v16, v17, \P2, v2, v8
- Msum v16, v17, \P3, v3, v8
- Msum v16, v17, \P5, v5, v8
-
- vmuleub v18, \P1, v1 ;# 2 negative taps
- vmuloub v19, \P1, v1
- Msum v18, v19, \P4, v4, v8
-
- vsubuhs v16, v16, v18 ;# subtract neg from pos
- vsubuhs v17, v17, v19
- vsrh v16, v16, v7 ;# divide by 128
- vsrh v17, v17, v7 ;# v16 v17 = evens, odds
- vmrghh v18, v16, v17 ;# v18 v19 = 16-bit result in order
- vmrglh v19, v16, v17
- vpkuhus \P0, v18, v19 ;# P0 = 8-bit result
-.endm
-
-.macro vinterp_no_store_8x8 P0 P1 P2 P3 P4 P5
- vmuleub v24, \P0, v13 ;# 64 + 4 positive taps
- vadduhm v21, v20, v24
- vmuloub v24, \P0, v13
- vadduhm v22, v20, v24
- Msum v21, v22, \P2, v15, v25
- Msum v21, v22, \P3, v16, v25
- Msum v21, v22, \P5, v18, v25
-
- vmuleub v23, \P1, v14 ;# 2 negative taps
- vmuloub v24, \P1, v14
- Msum v23, v24, \P4, v17, v25
-
- vsubuhs v21, v21, v23 ;# subtract neg from pos
- vsubuhs v22, v22, v24
- vsrh v21, v21, v19 ;# divide by 128
- vsrh v22, v22, v19 ;# v16 v17 = evens, odds
- vmrghh v23, v21, v22 ;# v18 v19 = 16-bit result in order
- vmrglh v24, v21, v22
- vpkuhus \P0, v23, v24 ;# P0 = 8-bit result
-.endm
-
-
-.macro Vinterp P0 P1 P2 P3 P4 P5
- vinterp_no_store \P0, \P1, \P2, \P3, \P4, \P5
- stvx \P0, 0, r7
- add r7, r7, r8 ;# 33 ops per 16 pels
-.endm
-
-
-.macro luma_v P0, P1, P2, P3, P4, P5
- addi r9, r9, 16 ;# P5 = newest input row
- lvx \P5, 0, r9
- Vinterp \P0, \P1, \P2, \P3, \P4, \P5
-.endm
-
-.macro luma_vtwo
- luma_v v10, v11, v12, v13, v14, v15
- luma_v v11, v12, v13, v14, v15, v10
-.endm
-
-.macro luma_vfour
- luma_vtwo
- luma_v v12, v13, v14, v15, v10, v11
- luma_v v13, v14, v15, v10, v11, v12
-.endm
-
-.macro luma_vsix
- luma_vfour
- luma_v v14, v15, v10, v11, v12, v13
- luma_v v15, v10, v11, v12, v13, v14
-.endm
-
-.macro Interp4 R I I4
- vmsummbm \R, v13, \I, v15
- vmsummbm \R, v14, \I4, \R
-.endm
-
-.macro Read8x8 VD, RS, RP, increment_counter
- lvsl v21, 0, \RS ;# permutate value for alignment
-
- ;# input to filter is 21 bytes wide, output is 16 bytes.
-    ;# input can span three vectors if not aligned correctly.
- lvx \VD, 0, \RS
- lvx v20, r10, \RS
-
-.if \increment_counter
- add \RS, \RS, \RP
-.endif
-
- vperm \VD, \VD, v20, v21
-.endm
-
-.macro interp_8x8 R
- vperm v20, \R, \R, v16 ;# v20 = 0123 1234 2345 3456
- vperm v21, \R, \R, v17 ;# v21 = 4567 5678 6789 789A
- Interp4 v20, v20, v21 ;# v20 = result 0 1 2 3
- vperm \R, \R, \R, v18 ;# R = 89AB 9ABC ABCx BCxx
- Interp4 v21, v21, \R ;# v21 = result 4 5 6 7
-
- vpkswus \R, v20, v21 ;# R = 0 1 2 3 4 5 6 7
- vsrh \R, \R, v19
-
- vpkuhus \R, \R, \R ;# saturate and pack
-
-.endm
-
-.macro Read4x4 VD, RS, RP, increment_counter
- lvsl v21, 0, \RS ;# permutate value for alignment
-
- ;# input to filter is 21 bytes wide, output is 16 bytes.
-    ;# input can span three vectors if not aligned correctly.
- lvx v20, 0, \RS
-
-.if \increment_counter
- add \RS, \RS, \RP
-.endif
-
- vperm \VD, v20, v20, v21
-.endm
- .text
-
- .align 2
-;# r3 unsigned char * src
-;# r4 int src_pitch
-;# r5 int x_offset
-;# r6 int y_offset
-;# r7 unsigned char * dst
-;# r8 int dst_pitch
-sixtap_predict_ppc:
- mfspr r11, 256 ;# get old VRSAVE
- oris r12, r11, 0xff87
- ori r12, r12, 0xffc0
- mtspr 256, r12 ;# set VRSAVE
-
- stwu r1,-32(r1) ;# create space on the stack
-
- slwi. r5, r5, 5 ;# index into horizontal filter array
-
- vspltish v19, 7
-
- ;# If there isn't any filtering to be done for the horizontal, then
- ;# just skip to the second pass.
- beq- vertical_only_4x4
-
- ;# load up horizontal filter
- load_hfilter v13, v14
-
- ;# rounding added in on the multiply
- vspltisw v16, 8
- vspltisw v15, 3
- vslw v15, v16, v15 ;# 0x00000040000000400000004000000040
-
- ;# Load up permutation constants
- load_c v16, B_0123, 0, r9, r10
- load_c v17, B_4567, 0, r9, r10
- load_c v18, B_89AB, 0, r9, r10
-
- ;# Back off input buffer by 2 bytes. Need 2 before and 3 after
- addi r3, r3, -2
-
- addi r9, r3, 0
- li r10, 16
- Read8x8 v2, r3, r4, 1
- Read8x8 v3, r3, r4, 1
- Read8x8 v4, r3, r4, 1
- Read8x8 v5, r3, r4, 1
-
- slwi. r6, r6, 4 ;# index into vertical filter array
-
- ;# filter a line
- interp_8x8 v2
- interp_8x8 v3
- interp_8x8 v4
- interp_8x8 v5
-
- ;# Finished filtering main horizontal block. If there is no
- ;# vertical filtering, jump to storing the data. Otherwise
- ;# load up and filter the additional 5 lines that are needed
- ;# for the vertical filter.
- beq- store_4x4
-
- ;# only needed if there is a vertical filter present
- ;# if the second filter is not null then need to back off by 2*pitch
- sub r9, r9, r4
- sub r9, r9, r4
-
- Read8x8 v0, r9, r4, 1
- Read8x8 v1, r9, r4, 0
- Read8x8 v6, r3, r4, 1
- Read8x8 v7, r3, r4, 1
- Read8x8 v8, r3, r4, 0
-
- interp_8x8 v0
- interp_8x8 v1
- interp_8x8 v6
- interp_8x8 v7
- interp_8x8 v8
-
- b second_pass_4x4
-
-vertical_only_4x4:
- ;# only needed if there is a vertical filter present
- ;# if the second filter is not null then need to back off by 2*pitch
- sub r3, r3, r4
- sub r3, r3, r4
- li r10, 16
-
- Read8x8 v0, r3, r4, 1
- Read8x8 v1, r3, r4, 1
- Read8x8 v2, r3, r4, 1
- Read8x8 v3, r3, r4, 1
- Read8x8 v4, r3, r4, 1
- Read8x8 v5, r3, r4, 1
- Read8x8 v6, r3, r4, 1
- Read8x8 v7, r3, r4, 1
- Read8x8 v8, r3, r4, 0
-
- slwi r6, r6, 4 ;# index into vertical filter array
-
-second_pass_4x4:
- load_c v20, b_hilo_4x4, 0, r9, r10
- load_c v21, b_hilo, 0, r9, r10
-
- ;# reposition input so that it can go through the
- ;# filtering phase with one pass.
- vperm v0, v0, v1, v20 ;# 0 1 x x
- vperm v2, v2, v3, v20 ;# 2 3 x x
- vperm v4, v4, v5, v20 ;# 4 5 x x
- vperm v6, v6, v7, v20 ;# 6 7 x x
-
- vperm v0, v0, v2, v21 ;# 0 1 2 3
- vperm v4, v4, v6, v21 ;# 4 5 6 7
-
- vsldoi v1, v0, v4, 4
- vsldoi v2, v0, v4, 8
- vsldoi v3, v0, v4, 12
-
- vsldoi v5, v4, v8, 4
-
- load_c v13, VFilter, r6, r9, r10
-
- vspltish v15, 8
- vspltish v20, 3
- vslh v20, v15, v20 ;# 0x0040 0040 0040 0040 0040 0040 0040 0040
-
- vspltb v14, v13, 1
- vspltb v15, v13, 2
- vspltb v16, v13, 3
- vspltb v17, v13, 4
- vspltb v18, v13, 5
- vspltb v13, v13, 0
-
- vinterp_no_store_8x8 v0, v1, v2, v3, v4, v5
-
- stvx v0, 0, r1
-
- lwz r0, 0(r1)
- stw r0, 0(r7)
- add r7, r7, r8
-
- lwz r0, 4(r1)
- stw r0, 0(r7)
- add r7, r7, r8
-
- lwz r0, 8(r1)
- stw r0, 0(r7)
- add r7, r7, r8
-
- lwz r0, 12(r1)
- stw r0, 0(r7)
-
- b exit_4x4
-
-store_4x4:
-
- stvx v2, 0, r1
- lwz r0, 0(r1)
- stw r0, 0(r7)
- add r7, r7, r8
-
- stvx v3, 0, r1
- lwz r0, 0(r1)
- stw r0, 0(r7)
- add r7, r7, r8
-
- stvx v4, 0, r1
- lwz r0, 0(r1)
- stw r0, 0(r7)
- add r7, r7, r8
-
- stvx v5, 0, r1
- lwz r0, 0(r1)
- stw r0, 0(r7)
-
-exit_4x4:
-
- addi r1, r1, 32 ;# recover stack
-
- mtspr 256, r11 ;# reset old VRSAVE
-
- blr
-
-.macro w_8x8 V, D, R, P
- stvx \V, 0, r1
- lwz \R, 0(r1)
- stw \R, 0(r7)
- lwz \R, 4(r1)
- stw \R, 4(r7)
- add \D, \D, \P
-.endm
-
- .align 2
-;# r3 unsigned char * src
-;# r4 int src_pitch
-;# r5 int x_offset
-;# r6 int y_offset
-;# r7 unsigned char * dst
-;# r8 int dst_pitch
-
-sixtap_predict8x4_ppc:
- mfspr r11, 256 ;# get old VRSAVE
- oris r12, r11, 0xffff
- ori r12, r12, 0xffc0
- mtspr 256, r12 ;# set VRSAVE
-
- stwu r1,-32(r1) ;# create space on the stack
-
- slwi. r5, r5, 5 ;# index into horizontal filter array
-
- vspltish v19, 7
-
- ;# If there isn't any filtering to be done for the horizontal, then
- ;# just skip to the second pass.
- beq- second_pass_pre_copy_8x4
-
- load_hfilter v13, v14
-
- ;# rounding added in on the multiply
- vspltisw v16, 8
- vspltisw v15, 3
- vslw v15, v16, v15 ;# 0x00000040000000400000004000000040
-
- ;# Load up permutation constants
- load_c v16, B_0123, 0, r9, r10
- load_c v17, B_4567, 0, r9, r10
- load_c v18, B_89AB, 0, r9, r10
-
- ;# Back off input buffer by 2 bytes. Need 2 before and 3 after
- addi r3, r3, -2
-
- addi r9, r3, 0
- li r10, 16
- Read8x8 v2, r3, r4, 1
- Read8x8 v3, r3, r4, 1
- Read8x8 v4, r3, r4, 1
- Read8x8 v5, r3, r4, 1
-
- slwi. r6, r6, 4 ;# index into vertical filter array
-
- ;# filter a line
- interp_8x8 v2
- interp_8x8 v3
- interp_8x8 v4
- interp_8x8 v5
-
- ;# Finished filtering main horizontal block. If there is no
- ;# vertical filtering, jump to storing the data. Otherwise
- ;# load up and filter the additional 5 lines that are needed
- ;# for the vertical filter.
- beq- store_8x4
-
- ;# only needed if there is a vertical filter present
- ;# if the second filter is not null then need to back off by 2*pitch
- sub r9, r9, r4
- sub r9, r9, r4
-
- Read8x8 v0, r9, r4, 1
- Read8x8 v1, r9, r4, 0
- Read8x8 v6, r3, r4, 1
- Read8x8 v7, r3, r4, 1
- Read8x8 v8, r3, r4, 0
-
- interp_8x8 v0
- interp_8x8 v1
- interp_8x8 v6
- interp_8x8 v7
- interp_8x8 v8
-
- b second_pass_8x4
-
-second_pass_pre_copy_8x4:
- ;# only needed if there is a vertical filter present
- ;# if the second filter is not null then need to back off by 2*pitch
- sub r3, r3, r4
- sub r3, r3, r4
- li r10, 16
-
- Read8x8 v0, r3, r4, 1
- Read8x8 v1, r3, r4, 1
- Read8x8 v2, r3, r4, 1
- Read8x8 v3, r3, r4, 1
- Read8x8 v4, r3, r4, 1
- Read8x8 v5, r3, r4, 1
- Read8x8 v6, r3, r4, 1
- Read8x8 v7, r3, r4, 1
- Read8x8 v8, r3, r4, 1
-
- slwi r6, r6, 4 ;# index into vertical filter array
-
-second_pass_8x4:
- load_c v13, VFilter, r6, r9, r10
-
- vspltish v15, 8
- vspltish v20, 3
- vslh v20, v15, v20 ;# 0x0040 0040 0040 0040 0040 0040 0040 0040
-
- vspltb v14, v13, 1
- vspltb v15, v13, 2
- vspltb v16, v13, 3
- vspltb v17, v13, 4
- vspltb v18, v13, 5
- vspltb v13, v13, 0
-
- vinterp_no_store_8x8 v0, v1, v2, v3, v4, v5
- vinterp_no_store_8x8 v1, v2, v3, v4, v5, v6
- vinterp_no_store_8x8 v2, v3, v4, v5, v6, v7
- vinterp_no_store_8x8 v3, v4, v5, v6, v7, v8
-
- cmpi cr0, r8, 8
- beq cr0, store_aligned_8x4
-
- w_8x8 v0, r7, r0, r8
- w_8x8 v1, r7, r0, r8
- w_8x8 v2, r7, r0, r8
- w_8x8 v3, r7, r0, r8
-
- b exit_8x4
-
-store_aligned_8x4:
-
- load_c v10, b_hilo, 0, r9, r10
-
- vperm v0, v0, v1, v10
- vperm v2, v2, v3, v10
-
- stvx v0, 0, r7
- addi r7, r7, 16
- stvx v2, 0, r7
-
- b exit_8x4
-
-store_8x4:
- cmpi cr0, r8, 8
- beq cr0, store_aligned2_8x4
-
- w_8x8 v2, r7, r0, r8
- w_8x8 v3, r7, r0, r8
- w_8x8 v4, r7, r0, r8
- w_8x8 v5, r7, r0, r8
-
- b exit_8x4
-
-store_aligned2_8x4:
- load_c v10, b_hilo, 0, r9, r10
-
- vperm v2, v2, v3, v10
- vperm v4, v4, v5, v10
-
- stvx v2, 0, r7
- addi r7, r7, 16
- stvx v4, 0, r7
-
-exit_8x4:
-
- addi r1, r1, 32 ;# recover stack
-
- mtspr 256, r11 ;# reset old VRSAVE
-
-
- blr
-
- .align 2
-;# r3 unsigned char * src
-;# r4 int src_pitch
-;# r5 int x_offset
-;# r6 int y_offset
-;# r7 unsigned char * dst
-;# r8 int dst_pitch
-
-;# Because the width that needs to be filtered will fit in a single altivec
-;# register there is no need to loop. Everything can stay in registers.
-sixtap_predict8x8_ppc:
- mfspr r11, 256 ;# get old VRSAVE
- oris r12, r11, 0xffff
- ori r12, r12, 0xffc0
- mtspr 256, r12 ;# set VRSAVE
-
- stwu r1,-32(r1) ;# create space on the stack
-
- slwi. r5, r5, 5 ;# index into horizontal filter array
-
- vspltish v19, 7
-
- ;# If there isn't any filtering to be done for the horizontal, then
- ;# just skip to the second pass.
- beq- second_pass_pre_copy_8x8
-
- load_hfilter v13, v14
-
- ;# rounding added in on the multiply
- vspltisw v16, 8
- vspltisw v15, 3
- vslw v15, v16, v15 ;# 0x00000040000000400000004000000040
-
- ;# Load up permutation constants
- load_c v16, B_0123, 0, r9, r10
- load_c v17, B_4567, 0, r9, r10
- load_c v18, B_89AB, 0, r9, r10
-
- ;# Back off input buffer by 2 bytes. Need 2 before and 3 after
- addi r3, r3, -2
-
- addi r9, r3, 0
- li r10, 16
- Read8x8 v2, r3, r4, 1
- Read8x8 v3, r3, r4, 1
- Read8x8 v4, r3, r4, 1
- Read8x8 v5, r3, r4, 1
- Read8x8 v6, r3, r4, 1
- Read8x8 v7, r3, r4, 1
- Read8x8 v8, r3, r4, 1
- Read8x8 v9, r3, r4, 1
-
- slwi. r6, r6, 4 ;# index into vertical filter array
-
- ;# filter a line
- interp_8x8 v2
- interp_8x8 v3
- interp_8x8 v4
- interp_8x8 v5
- interp_8x8 v6
- interp_8x8 v7
- interp_8x8 v8
- interp_8x8 v9
-
- ;# Finished filtering main horizontal block. If there is no
- ;# vertical filtering, jump to storing the data. Otherwise
- ;# load up and filter the additional 5 lines that are needed
- ;# for the vertical filter.
- beq- store_8x8
-
- ;# only needed if there is a vertical filter present
- ;# if the second filter is not null then need to back off by 2*pitch
- sub r9, r9, r4
- sub r9, r9, r4
-
- Read8x8 v0, r9, r4, 1
- Read8x8 v1, r9, r4, 0
- Read8x8 v10, r3, r4, 1
- Read8x8 v11, r3, r4, 1
- Read8x8 v12, r3, r4, 0
-
- interp_8x8 v0
- interp_8x8 v1
- interp_8x8 v10
- interp_8x8 v11
- interp_8x8 v12
-
- b second_pass_8x8
-
-second_pass_pre_copy_8x8:
- ;# only needed if there is a vertical filter present
- ;# if the second filter is not null then need to back off by 2*pitch
- sub r3, r3, r4
- sub r3, r3, r4
- li r10, 16
-
- Read8x8 v0, r3, r4, 1
- Read8x8 v1, r3, r4, 1
- Read8x8 v2, r3, r4, 1
- Read8x8 v3, r3, r4, 1
- Read8x8 v4, r3, r4, 1
- Read8x8 v5, r3, r4, 1
- Read8x8 v6, r3, r4, 1
- Read8x8 v7, r3, r4, 1
- Read8x8 v8, r3, r4, 1
- Read8x8 v9, r3, r4, 1
- Read8x8 v10, r3, r4, 1
- Read8x8 v11, r3, r4, 1
- Read8x8 v12, r3, r4, 0
-
- slwi r6, r6, 4 ;# index into vertical filter array
-
-second_pass_8x8:
- load_c v13, VFilter, r6, r9, r10
-
- vspltish v15, 8
- vspltish v20, 3
- vslh v20, v15, v20 ;# 0x0040 0040 0040 0040 0040 0040 0040 0040
-
- vspltb v14, v13, 1
- vspltb v15, v13, 2
- vspltb v16, v13, 3
- vspltb v17, v13, 4
- vspltb v18, v13, 5
- vspltb v13, v13, 0
-
- vinterp_no_store_8x8 v0, v1, v2, v3, v4, v5
- vinterp_no_store_8x8 v1, v2, v3, v4, v5, v6
- vinterp_no_store_8x8 v2, v3, v4, v5, v6, v7
- vinterp_no_store_8x8 v3, v4, v5, v6, v7, v8
- vinterp_no_store_8x8 v4, v5, v6, v7, v8, v9
- vinterp_no_store_8x8 v5, v6, v7, v8, v9, v10
- vinterp_no_store_8x8 v6, v7, v8, v9, v10, v11
- vinterp_no_store_8x8 v7, v8, v9, v10, v11, v12
-
- cmpi cr0, r8, 8
- beq cr0, store_aligned_8x8
-
- w_8x8 v0, r7, r0, r8
- w_8x8 v1, r7, r0, r8
- w_8x8 v2, r7, r0, r8
- w_8x8 v3, r7, r0, r8
- w_8x8 v4, r7, r0, r8
- w_8x8 v5, r7, r0, r8
- w_8x8 v6, r7, r0, r8
- w_8x8 v7, r7, r0, r8
-
- b exit_8x8
-
-store_aligned_8x8:
-
- load_c v10, b_hilo, 0, r9, r10
-
- vperm v0, v0, v1, v10
- vperm v2, v2, v3, v10
- vperm v4, v4, v5, v10
- vperm v6, v6, v7, v10
-
- stvx v0, 0, r7
- addi r7, r7, 16
- stvx v2, 0, r7
- addi r7, r7, 16
- stvx v4, 0, r7
- addi r7, r7, 16
- stvx v6, 0, r7
-
- b exit_8x8
-
-store_8x8:
- cmpi cr0, r8, 8
- beq cr0, store_aligned2_8x8
-
- w_8x8 v2, r7, r0, r8
- w_8x8 v3, r7, r0, r8
- w_8x8 v4, r7, r0, r8
- w_8x8 v5, r7, r0, r8
- w_8x8 v6, r7, r0, r8
- w_8x8 v7, r7, r0, r8
- w_8x8 v8, r7, r0, r8
- w_8x8 v9, r7, r0, r8
-
- b exit_8x8
-
-store_aligned2_8x8:
- load_c v10, b_hilo, 0, r9, r10
-
- vperm v2, v2, v3, v10
- vperm v4, v4, v5, v10
- vperm v6, v6, v7, v10
- vperm v8, v8, v9, v10
-
- stvx v2, 0, r7
- addi r7, r7, 16
- stvx v4, 0, r7
- addi r7, r7, 16
- stvx v6, 0, r7
- addi r7, r7, 16
- stvx v8, 0, r7
-
-exit_8x8:
-
- addi r1, r1, 32 ;# recover stack
-
- mtspr 256, r11 ;# reset old VRSAVE
-
- blr
-
- .align 2
-;# r3 unsigned char * src
-;# r4 int src_pitch
-;# r5 int x_offset
-;# r6 int y_offset
-;# r7 unsigned char * dst
-;# r8 int dst_pitch
-
-;# Two pass filtering. First pass is Horizontal edges, second pass is vertical
-;# edges. One of the filters can be null, but both won't be. Needs to use a
-;# temporary buffer because the source buffer can't be modified and the buffer
-;# for the destination is not large enough to hold the temporary data.
-sixtap_predict16x16_ppc:
- mfspr r11, 256 ;# get old VRSAVE
- oris r12, r11, 0xffff
- ori r12, r12, 0xf000
- mtspr 256, r12 ;# set VRSAVE
-
- stwu r1,-416(r1) ;# create space on the stack
-
-    ;# Three possibilities
- ;# 1. First filter is null. Don't use a temp buffer.
- ;# 2. Second filter is null. Don't use a temp buffer.
- ;# 3. Neither are null, use temp buffer.
-
- ;# First Pass (horizontal edge)
- ;# setup pointers for src
-    ;# if possibility (1) then set up the src pointer to be the original and jump
-    ;# to the second pass. This is based on whether x_offset is 0.
-
- ;# load up horizontal filter
- slwi. r5, r5, 5 ;# index into horizontal filter array
-
- load_hfilter v4, v5
-
- beq- copy_horizontal_16x21
-
- ;# Back off input buffer by 2 bytes. Need 2 before and 3 after
- addi r3, r3, -2
-
- slwi. r6, r6, 4 ;# index into vertical filter array
-
- ;# setup constants
- ;# v14 permutation value for alignment
- load_c v14, b_hperm, 0, r9, r10
-
-    ;# These statements assume that there won't be a second pass,
-    ;# but if there is, they are set again before the bypass label below.
- li r0, 16 ;# prepare for no vertical filter
-
- ;# Change the output pointer and pitch to be the actual
-    ;# destination instead of a temporary buffer.
- addi r9, r7, 0
- addi r5, r8, 0
-
- ;# no vertical filter, so write the output from the first pass
- ;# directly into the output buffer.
- beq- no_vertical_filter_bypass
-
- ;# if the second filter is not null then need to back off by 2*pitch
- sub r3, r3, r4
- sub r3, r3, r4
-
- ;# setup counter for the number of lines that are going to be filtered
- li r0, 21
-
- ;# use the stack as temporary storage
- la r9, 48(r1)
- li r5, 16
-
-no_vertical_filter_bypass:
-
- mtctr r0
-
- ;# rounding added in on the multiply
- vspltisw v10, 8
- vspltisw v12, 3
- vslw v12, v10, v12 ;# 0x00000040000000400000004000000040
-
- ;# downshift by 7 ( divide by 128 ) at the end
- vspltish v13, 7
-
- ;# index to the next set of vectors in the row.
- li r10, 16
- li r12, 32
-
-horizontal_loop_16x16:
-
- lvsl v15, 0, r3 ;# permutate value for alignment
-
- ;# input to filter is 21 bytes wide, output is 16 bytes.
-    ;# input can span three vectors if not aligned correctly.
- lvx v1, 0, r3
- lvx v2, r10, r3
- lvx v3, r12, r3
-
- vperm v8, v1, v2, v15
- vperm v9, v2, v3, v15 ;# v8 v9 = 21 input pixels left-justified
-
- vsldoi v11, v8, v9, 4
-
- ;# set 0
- vmsummbm v6, v4, v8, v12 ;# taps times elements
- vmsummbm v0, v5, v11, v6
-
- ;# set 1
- vsldoi v10, v8, v9, 1
- vsldoi v11, v8, v9, 5
-
- vmsummbm v6, v4, v10, v12
- vmsummbm v1, v5, v11, v6
-
- ;# set 2
- vsldoi v10, v8, v9, 2
- vsldoi v11, v8, v9, 6
-
- vmsummbm v6, v4, v10, v12
- vmsummbm v2, v5, v11, v6
-
- ;# set 3
- vsldoi v10, v8, v9, 3
- vsldoi v11, v8, v9, 7
-
- vmsummbm v6, v4, v10, v12
- vmsummbm v3, v5, v11, v6
-
- vpkswus v0, v0, v1 ;# v0 = 0 4 8 C 1 5 9 D (16-bit)
- vpkswus v1, v2, v3 ;# v1 = 2 6 A E 3 7 B F
-
- vsrh v0, v0, v13 ;# divide v0, v1 by 128
- vsrh v1, v1, v13
-
- vpkuhus v0, v0, v1 ;# v0 = scrambled 8-bit result
- vperm v0, v0, v0, v14 ;# v0 = correctly-ordered result
-
- stvx v0, 0, r9
- add r9, r9, r5
-
- add r3, r3, r4
-
- bdnz horizontal_loop_16x16
-
- ;# check again to see if vertical filter needs to be done.
- cmpi cr0, r6, 0
- beq cr0, end_16x16
-
- ;# yes there is, so go to the second pass
- b second_pass_16x16
-
-copy_horizontal_16x21:
- li r10, 21
- mtctr r10
-
- li r10, 16
-
- sub r3, r3, r4
- sub r3, r3, r4
-
- ;# this is done above if there is a horizontal filter,
- ;# if not it needs to be done down here.
- slwi r6, r6, 4 ;# index into vertical filter array
-
- ;# always write to the stack when doing a horizontal copy
- la r9, 48(r1)
-
-copy_horizontal_loop_16x21:
- lvsl v15, 0, r3 ;# permutate value for alignment
-
- lvx v1, 0, r3
- lvx v2, r10, r3
-
- vperm v8, v1, v2, v15
-
- stvx v8, 0, r9
- addi r9, r9, 16
-
- add r3, r3, r4
-
- bdnz copy_horizontal_loop_16x21
-
-second_pass_16x16:
-
- ;# always read from the stack when doing a vertical filter
- la r9, 48(r1)
-
- ;# downshift by 7 ( divide by 128 ) at the end
- vspltish v7, 7
-
- vpre_load
-
- luma_vsix
- luma_vsix
- luma_vfour
-
-end_16x16:
-
- addi r1, r1, 416 ;# recover stack
-
- mtspr 256, r11 ;# reset old VRSAVE
-
- blr
-
- .data
-
- .align 4
-HFilter:
- .byte 0, 0,128, 0, 0, 0,128, 0, 0, 0,128, 0, 0, 0,128, 0
- .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
- .byte 0, -6,123, 12, 0, -6,123, 12, 0, -6,123, 12, 0, -6,123, 12
- .byte -1, 0, 0, 0, -1, 0, 0, 0, -1, 0, 0, 0, -1, 0, 0, 0
- .byte 2,-11,108, 36, 2,-11,108, 36, 2,-11,108, 36, 2,-11,108, 36
- .byte -8, 1, 0, 0, -8, 1, 0, 0, -8, 1, 0, 0, -8, 1, 0, 0
- .byte 0, -9, 93, 50, 0, -9, 93, 50, 0, -9, 93, 50, 0, -9, 93, 50
- .byte -6, 0, 0, 0, -6, 0, 0, 0, -6, 0, 0, 0, -6, 0, 0, 0
- .byte 3,-16, 77, 77, 3,-16, 77, 77, 3,-16, 77, 77, 3,-16, 77, 77
- .byte -16, 3, 0, 0,-16, 3, 0, 0,-16, 3, 0, 0,-16, 3, 0, 0
- .byte 0, -6, 50, 93, 0, -6, 50, 93, 0, -6, 50, 93, 0, -6, 50, 93
- .byte -9, 0, 0, 0, -9, 0, 0, 0, -9, 0, 0, 0, -9, 0, 0, 0
- .byte 1, -8, 36,108, 1, -8, 36,108, 1, -8, 36,108, 1, -8, 36,108
- .byte -11, 2, 0, 0,-11, 2, 0, 0,-11, 2, 0, 0,-11, 2, 0, 0
- .byte 0, -1, 12,123, 0, -1, 12,123, 0, -1, 12,123, 0, -1, 12,123
- .byte -6, 0, 0, 0, -6, 0, 0, 0, -6, 0, 0, 0, -6, 0, 0, 0
-
- .align 4
-VFilter:
- .byte 0, 0,128, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
- .byte 0, 6,123, 12, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
- .byte 2, 11,108, 36, 8, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
- .byte 0, 9, 93, 50, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
- .byte 3, 16, 77, 77, 16, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
- .byte 0, 6, 50, 93, 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
- .byte 1, 8, 36,108, 11, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
- .byte 0, 1, 12,123, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
-
- .align 4
-b_hperm:
- .byte 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15
-
- .align 4
-B_0123:
- .byte 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6
-
- .align 4
-B_4567:
- .byte 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10
-
- .align 4
-B_89AB:
- .byte 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14
-
- .align 4
-b_hilo:
- .byte 0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23
-
- .align 4
-b_hilo_4x4:
- .byte 0, 1, 2, 3, 16, 17, 18, 19, 0, 0, 0, 0, 0, 0, 0, 0
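For reference, the HFilter/VFilter tables above hold one row pair of six-tap coefficients per eighth-pel offset, and the code applies them with a +64 rounding term and a shift by 7 (the "rounding added in on the multiply" and vspltish v19, 7 lines). A rough scalar sketch of that per-pixel operation follows; the helper names are hypothetical and not part of the tree:

#include <stdint.h>

/* Six-tap subpel filter modelled from the tables above: the window covers
 * two pixels before and three after the centre pixel, and the result is
 * rounded (+64), shifted right by 7 and clamped to 8 bits. */
static const int kSixTap[8][6] = {
  { 0,   0, 128,   0,   0, 0 }, { 0,  -6, 123,  12,  -1, 0 },
  { 2, -11, 108,  36,  -8, 1 }, { 0,  -9,  93,  50,  -6, 0 },
  { 3, -16,  77,  77, -16, 3 }, { 0,  -6,  50,  93,  -9, 0 },
  { 1,  -8,  36, 108, -11, 2 }, { 0,  -1,  12, 123,  -6, 0 },
};

static uint8_t clamp8(int v) { return v < 0 ? 0 : v > 255 ? 255 : (uint8_t)v; }

/* Filter one pixel; src points at the centre pixel of the 6-tap window. */
static uint8_t sixtap_pixel(const uint8_t *src, int offset) {
  int sum = 64;                              /* rounding added on the multiply */
  for (int i = 0; i < 6; i++)
    sum += kSixTap[offset][i] * src[i - 2];  /* pixels [-2 .. +3] */
  return clamp8(sum >> 7);                   /* downshift by 7 (divide by 128) */
}

Both the horizontal and the vertical pass reduce to this form; the vector code above simply evaluates many output pixels per instruction and skips a pass when its offset is zero.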
--- a/vp8/common/ppc/filter_bilinear_altivec.asm
+++ /dev/null
@@ -1,677 +1,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
- .globl bilinear_predict4x4_ppc
- .globl bilinear_predict8x4_ppc
- .globl bilinear_predict8x8_ppc
- .globl bilinear_predict16x16_ppc
-
-.macro load_c V, LABEL, OFF, R0, R1
- lis \R0, \LABEL@ha
- la \R1, \LABEL@l(\R0)
- lvx \V, \OFF, \R1
-.endm
-
-.macro load_vfilter V0, V1
- load_c \V0, vfilter_b, r6, r9, r10
-
- addi r6, r6, 16
- lvx \V1, r6, r10
-.endm
-
-.macro HProlog jump_label
- ;# load up horizontal filter
- slwi. r5, r5, 4 ;# index into horizontal filter array
-
- ;# index to the next set of vectors in the row.
- li r10, 16
- li r12, 32
-
- ;# downshift by 7 ( divide by 128 ) at the end
- vspltish v19, 7
-
- ;# If there isn't any filtering to be done for the horizontal, then
- ;# just skip to the second pass.
- beq \jump_label
-
- load_c v20, hfilter_b, r5, r9, r0
-
- ;# setup constants
- ;# v14 permutation value for alignment
- load_c v28, b_hperm_b, 0, r9, r0
-
- ;# rounding added in on the multiply
- vspltisw v21, 8
- vspltisw v18, 3
- vslw v18, v21, v18 ;# 0x00000040000000400000004000000040
-
- slwi. r6, r6, 5 ;# index into vertical filter array
-.endm
-
-;# Filters a horizontal line
-;# expects:
-;# r3 src_ptr
-;# r4 pitch
-;# r10 16
-;# r12 32
-;# v17 perm input
-;# v18 rounding
-;# v19 shift
-;# v20 filter taps
-;# v21 tmp
-;# v22 tmp
-;# v23 tmp
-;# v24 tmp
-;# v25 tmp
-;# v26 tmp
-;# v27 tmp
-;# v28 perm output
-;#
-.macro HFilter V
- vperm v24, v21, v21, v10 ;# v20 = 0123 1234 2345 3456
- vperm v25, v21, v21, v11 ;# v21 = 4567 5678 6789 789A
-
- vmsummbm v24, v20, v24, v18
- vmsummbm v25, v20, v25, v18
-
- vpkswus v24, v24, v25 ;# v24 = 0 4 8 C 1 5 9 D (16-bit)
-
- vsrh v24, v24, v19 ;# divide v0, v1 by 128
-
- vpkuhus \V, v24, v24 ;# \V = scrambled 8-bit result
-.endm
-
-.macro hfilter_8 V, increment_counter
- lvsl v17, 0, r3 ;# permutate value for alignment
-
- ;# input to filter is 9 bytes wide, output is 8 bytes.
- lvx v21, 0, r3
- lvx v22, r10, r3
-
-.if \increment_counter
- add r3, r3, r4
-.endif
- vperm v21, v21, v22, v17
-
- HFilter \V
-.endm
-
-
-.macro load_and_align_8 V, increment_counter
- lvsl v17, 0, r3 ;# permutate value for alignment
-
- ;# input to filter is 21 bytes wide, output is 16 bytes.
-    ;# input can span three vectors if not aligned correctly.
- lvx v21, 0, r3
- lvx v22, r10, r3
-
-.if \increment_counter
- add r3, r3, r4
-.endif
-
- vperm \V, v21, v22, v17
-.endm
-
-.macro write_aligned_8 V, increment_counter
- stvx \V, 0, r7
-
-.if \increment_counter
- add r7, r7, r8
-.endif
-.endm
-
-.macro vfilter_16 P0 P1
- vmuleub v22, \P0, v20 ;# 64 + 4 positive taps
- vadduhm v22, v18, v22
- vmuloub v23, \P0, v20
- vadduhm v23, v18, v23
-
- vmuleub v24, \P1, v21
- vadduhm v22, v22, v24 ;# Re = evens, saturation unnecessary
- vmuloub v25, \P1, v21
- vadduhm v23, v23, v25 ;# Ro = odds
-
- vsrh v22, v22, v19 ;# divide by 128
- vsrh v23, v23, v19 ;# v16 v17 = evens, odds
- vmrghh \P0, v22, v23 ;# v18 v19 = 16-bit result in order
- vmrglh v23, v22, v23
- vpkuhus \P0, \P0, v23 ;# P0 = 8-bit result
-.endm
-
-
-.macro w_8x8 V, D, R, P
- stvx \V, 0, r1
- lwz \R, 0(r1)
- stw \R, 0(r7)
- lwz \R, 4(r1)
- stw \R, 4(r7)
- add \D, \D, \P
-.endm
-
-
- .align 2
-;# r3 unsigned char * src
-;# r4 int src_pitch
-;# r5 int x_offset
-;# r6 int y_offset
-;# r7 unsigned char * dst
-;# r8 int dst_pitch
-bilinear_predict4x4_ppc:
- mfspr r11, 256 ;# get old VRSAVE
- oris r12, r11, 0xf830
- ori r12, r12, 0xfff8
- mtspr 256, r12 ;# set VRSAVE
-
- stwu r1,-32(r1) ;# create space on the stack
-
- HProlog second_pass_4x4_pre_copy_b
-
- ;# Load up permutation constants
- load_c v10, b_0123_b, 0, r9, r12
- load_c v11, b_4567_b, 0, r9, r12
-
- hfilter_8 v0, 1
- hfilter_8 v1, 1
- hfilter_8 v2, 1
- hfilter_8 v3, 1
-
- ;# Finished filtering main horizontal block. If there is no
- ;# vertical filtering, jump to storing the data. Otherwise
- ;# load up and filter the additional line that is needed
- ;# for the vertical filter.
- beq store_out_4x4_b
-
- hfilter_8 v4, 0
-
- b second_pass_4x4_b
-
-second_pass_4x4_pre_copy_b:
- slwi r6, r6, 5 ;# index into vertical filter array
-
- load_and_align_8 v0, 1
- load_and_align_8 v1, 1
- load_and_align_8 v2, 1
- load_and_align_8 v3, 1
- load_and_align_8 v4, 1
-
-second_pass_4x4_b:
- vspltish v20, 8
- vspltish v18, 3
- vslh v18, v20, v18 ;# 0x0040 0040 0040 0040 0040 0040 0040 0040
-
- load_vfilter v20, v21
-
- vfilter_16 v0, v1
- vfilter_16 v1, v2
- vfilter_16 v2, v3
- vfilter_16 v3, v4
-
-store_out_4x4_b:
-
- stvx v0, 0, r1
- lwz r0, 0(r1)
- stw r0, 0(r7)
- add r7, r7, r8
-
- stvx v1, 0, r1
- lwz r0, 0(r1)
- stw r0, 0(r7)
- add r7, r7, r8
-
- stvx v2, 0, r1
- lwz r0, 0(r1)
- stw r0, 0(r7)
- add r7, r7, r8
-
- stvx v3, 0, r1
- lwz r0, 0(r1)
- stw r0, 0(r7)
-
-exit_4x4:
-
- addi r1, r1, 32 ;# recover stack
- mtspr 256, r11 ;# reset old VRSAVE
-
- blr
-
- .align 2
-;# r3 unsigned char * src
-;# r4 int src_pitch
-;# r5 int x_offset
-;# r6 int y_offset
-;# r7 unsigned char * dst
-;# r8 int dst_pitch
-bilinear_predict8x4_ppc:
- mfspr r11, 256 ;# get old VRSAVE
- oris r12, r11, 0xf830
- ori r12, r12, 0xfff8
- mtspr 256, r12 ;# set VRSAVE
-
- stwu r1,-32(r1) ;# create space on the stack
-
- HProlog second_pass_8x4_pre_copy_b
-
- ;# Load up permutation constants
- load_c v10, b_0123_b, 0, r9, r12
- load_c v11, b_4567_b, 0, r9, r12
-
- hfilter_8 v0, 1
- hfilter_8 v1, 1
- hfilter_8 v2, 1
- hfilter_8 v3, 1
-
- ;# Finished filtering main horizontal block. If there is no
- ;# vertical filtering, jump to storing the data. Otherwise
- ;# load up and filter the additional line that is needed
- ;# for the vertical filter.
- beq store_out_8x4_b
-
- hfilter_8 v4, 0
-
- b second_pass_8x4_b
-
-second_pass_8x4_pre_copy_b:
- slwi r6, r6, 5 ;# index into vertical filter array
-
- load_and_align_8 v0, 1
- load_and_align_8 v1, 1
- load_and_align_8 v2, 1
- load_and_align_8 v3, 1
- load_and_align_8 v4, 1
-
-second_pass_8x4_b:
- vspltish v20, 8
- vspltish v18, 3
- vslh v18, v20, v18 ;# 0x0040 0040 0040 0040 0040 0040 0040 0040
-
- load_vfilter v20, v21
-
- vfilter_16 v0, v1
- vfilter_16 v1, v2
- vfilter_16 v2, v3
- vfilter_16 v3, v4
-
-store_out_8x4_b:
-
- cmpi cr0, r8, 8
- beq cr0, store_aligned_8x4_b
-
- w_8x8 v0, r7, r0, r8
- w_8x8 v1, r7, r0, r8
- w_8x8 v2, r7, r0, r8
- w_8x8 v3, r7, r0, r8
-
- b exit_8x4
-
-store_aligned_8x4_b:
- load_c v10, b_hilo_b, 0, r9, r10
-
- vperm v0, v0, v1, v10
- vperm v2, v2, v3, v10
-
- stvx v0, 0, r7
- addi r7, r7, 16
- stvx v2, 0, r7
-
-exit_8x4:
-
- addi r1, r1, 32 ;# recover stack
- mtspr 256, r11 ;# reset old VRSAVE
-
- blr
-
- .align 2
-;# r3 unsigned char * src
-;# r4 int src_pitch
-;# r5 int x_offset
-;# r6 int y_offset
-;# r7 unsigned char * dst
-;# r8 int dst_pitch
-bilinear_predict8x8_ppc:
- mfspr r11, 256 ;# get old VRSAVE
- oris r12, r11, 0xfff0
- ori r12, r12, 0xffff
- mtspr 256, r12 ;# set VRSAVE
-
- stwu r1,-32(r1) ;# create space on the stack
-
- HProlog second_pass_8x8_pre_copy_b
-
- ;# Load up permutation constants
- load_c v10, b_0123_b, 0, r9, r12
- load_c v11, b_4567_b, 0, r9, r12
-
- hfilter_8 v0, 1
- hfilter_8 v1, 1
- hfilter_8 v2, 1
- hfilter_8 v3, 1
- hfilter_8 v4, 1
- hfilter_8 v5, 1
- hfilter_8 v6, 1
- hfilter_8 v7, 1
-
- ;# Finished filtering main horizontal block. If there is no
- ;# vertical filtering, jump to storing the data. Otherwise
- ;# load up and filter the additional line that is needed
- ;# for the vertical filter.
- beq store_out_8x8_b
-
- hfilter_8 v8, 0
-
- b second_pass_8x8_b
-
-second_pass_8x8_pre_copy_b:
- slwi r6, r6, 5 ;# index into vertical filter array
-
- load_and_align_8 v0, 1
- load_and_align_8 v1, 1
- load_and_align_8 v2, 1
- load_and_align_8 v3, 1
- load_and_align_8 v4, 1
- load_and_align_8 v5, 1
- load_and_align_8 v6, 1
- load_and_align_8 v7, 1
- load_and_align_8 v8, 0
-
-second_pass_8x8_b:
- vspltish v20, 8
- vspltish v18, 3
- vslh v18, v20, v18 ;# 0x0040 0040 0040 0040 0040 0040 0040 0040
-
- load_vfilter v20, v21
-
- vfilter_16 v0, v1
- vfilter_16 v1, v2
- vfilter_16 v2, v3
- vfilter_16 v3, v4
- vfilter_16 v4, v5
- vfilter_16 v5, v6
- vfilter_16 v6, v7
- vfilter_16 v7, v8
-
-store_out_8x8_b:
-
- cmpi cr0, r8, 8
- beq cr0, store_aligned_8x8_b
-
- w_8x8 v0, r7, r0, r8
- w_8x8 v1, r7, r0, r8
- w_8x8 v2, r7, r0, r8
- w_8x8 v3, r7, r0, r8
- w_8x8 v4, r7, r0, r8
- w_8x8 v5, r7, r0, r8
- w_8x8 v6, r7, r0, r8
- w_8x8 v7, r7, r0, r8
-
- b exit_8x8
-
-store_aligned_8x8_b:
- load_c v10, b_hilo_b, 0, r9, r10
-
- vperm v0, v0, v1, v10
- vperm v2, v2, v3, v10
- vperm v4, v4, v5, v10
- vperm v6, v6, v7, v10
-
- stvx v0, 0, r7
- addi r7, r7, 16
- stvx v2, 0, r7
- addi r7, r7, 16
- stvx v4, 0, r7
- addi r7, r7, 16
- stvx v6, 0, r7
-
-exit_8x8:
-
- addi r1, r1, 32 ;# recover stack
- mtspr 256, r11 ;# reset old VRSAVE
-
- blr
-
-;# Filters a horizontal line
-;# expects:
-;# r3 src_ptr
-;# r4 pitch
-;# r10 16
-;# r12 32
-;# v17 perm input
-;# v18 rounding
-;# v19 shift
-;# v20 filter taps
-;# v21 tmp
-;# v22 tmp
-;# v23 tmp
-;# v24 tmp
-;# v25 tmp
-;# v26 tmp
-;# v27 tmp
-;# v28 perm output
-;#
-.macro hfilter_16 V, increment_counter
-
- lvsl v17, 0, r3 ;# permutate value for alignment
-
- ;# input to filter is 21 bytes wide, output is 16 bytes.
-    ;# input can span three vectors if not aligned correctly.
- lvx v21, 0, r3
- lvx v22, r10, r3
- lvx v23, r12, r3
-
-.if \increment_counter
- add r3, r3, r4
-.endif
- vperm v21, v21, v22, v17
- vperm v22, v22, v23, v17 ;# v8 v9 = 21 input pixels left-justified
-
- ;# set 0
- vmsummbm v24, v20, v21, v18 ;# taps times elements
-
- ;# set 1
- vsldoi v23, v21, v22, 1
- vmsummbm v25, v20, v23, v18
-
- ;# set 2
- vsldoi v23, v21, v22, 2
- vmsummbm v26, v20, v23, v18
-
- ;# set 3
- vsldoi v23, v21, v22, 3
- vmsummbm v27, v20, v23, v18
-
- vpkswus v24, v24, v25 ;# v24 = 0 4 8 C 1 5 9 D (16-bit)
- vpkswus v25, v26, v27 ;# v25 = 2 6 A E 3 7 B F
-
- vsrh v24, v24, v19 ;# divide v0, v1 by 128
- vsrh v25, v25, v19
-
- vpkuhus \V, v24, v25 ;# \V = scrambled 8-bit result
- vperm \V, \V, v0, v28 ;# \V = correctly-ordered result
-.endm
-
-.macro load_and_align_16 V, increment_counter
- lvsl v17, 0, r3 ;# permutate value for alignment
-
- ;# input to filter is 21 bytes wide, output is 16 bytes.
-    ;# input can span three vectors if not aligned correctly.
- lvx v21, 0, r3
- lvx v22, r10, r3
-
-.if \increment_counter
- add r3, r3, r4
-.endif
-
- vperm \V, v21, v22, v17
-.endm
-
-.macro write_16 V, increment_counter
- stvx \V, 0, r7
-
-.if \increment_counter
- add r7, r7, r8
-.endif
-.endm
-
- .align 2
-;# r3 unsigned char * src
-;# r4 int src_pitch
-;# r5 int x_offset
-;# r6 int y_offset
-;# r7 unsigned char * dst
-;# r8 int dst_pitch
-bilinear_predict16x16_ppc:
- mfspr r11, 256 ;# get old VRSAVE
- oris r12, r11, 0xffff
- ori r12, r12, 0xfff8
- mtspr 256, r12 ;# set VRSAVE
-
- HProlog second_pass_16x16_pre_copy_b
-
- hfilter_16 v0, 1
- hfilter_16 v1, 1
- hfilter_16 v2, 1
- hfilter_16 v3, 1
- hfilter_16 v4, 1
- hfilter_16 v5, 1
- hfilter_16 v6, 1
- hfilter_16 v7, 1
- hfilter_16 v8, 1
- hfilter_16 v9, 1
- hfilter_16 v10, 1
- hfilter_16 v11, 1
- hfilter_16 v12, 1
- hfilter_16 v13, 1
- hfilter_16 v14, 1
- hfilter_16 v15, 1
-
- ;# Finished filtering main horizontal block. If there is no
- ;# vertical filtering, jump to storing the data. Otherwise
- ;# load up and filter the additional line that is needed
- ;# for the vertical filter.
- beq store_out_16x16_b
-
- hfilter_16 v16, 0
-
- b second_pass_16x16_b
-
-second_pass_16x16_pre_copy_b:
- slwi r6, r6, 5 ;# index into vertical filter array
-
- load_and_align_16 v0, 1
- load_and_align_16 v1, 1
- load_and_align_16 v2, 1
- load_and_align_16 v3, 1
- load_and_align_16 v4, 1
- load_and_align_16 v5, 1
- load_and_align_16 v6, 1
- load_and_align_16 v7, 1
- load_and_align_16 v8, 1
- load_and_align_16 v9, 1
- load_and_align_16 v10, 1
- load_and_align_16 v11, 1
- load_and_align_16 v12, 1
- load_and_align_16 v13, 1
- load_and_align_16 v14, 1
- load_and_align_16 v15, 1
- load_and_align_16 v16, 0
-
-second_pass_16x16_b:
- vspltish v20, 8
- vspltish v18, 3
- vslh v18, v20, v18 ;# 0x0040 0040 0040 0040 0040 0040 0040 0040
-
- load_vfilter v20, v21
-
- vfilter_16 v0, v1
- vfilter_16 v1, v2
- vfilter_16 v2, v3
- vfilter_16 v3, v4
- vfilter_16 v4, v5
- vfilter_16 v5, v6
- vfilter_16 v6, v7
- vfilter_16 v7, v8
- vfilter_16 v8, v9
- vfilter_16 v9, v10
- vfilter_16 v10, v11
- vfilter_16 v11, v12
- vfilter_16 v12, v13
- vfilter_16 v13, v14
- vfilter_16 v14, v15
- vfilter_16 v15, v16
-
-store_out_16x16_b:
-
- write_16 v0, 1
- write_16 v1, 1
- write_16 v2, 1
- write_16 v3, 1
- write_16 v4, 1
- write_16 v5, 1
- write_16 v6, 1
- write_16 v7, 1
- write_16 v8, 1
- write_16 v9, 1
- write_16 v10, 1
- write_16 v11, 1
- write_16 v12, 1
- write_16 v13, 1
- write_16 v14, 1
- write_16 v15, 0
-
- mtspr 256, r11 ;# reset old VRSAVE
-
- blr
-
- .data
-
- .align 4
-hfilter_b:
- .byte 128, 0, 0, 0,128, 0, 0, 0,128, 0, 0, 0,128, 0, 0, 0
- .byte 112, 16, 0, 0,112, 16, 0, 0,112, 16, 0, 0,112, 16, 0, 0
- .byte 96, 32, 0, 0, 96, 32, 0, 0, 96, 32, 0, 0, 96, 32, 0, 0
- .byte 80, 48, 0, 0, 80, 48, 0, 0, 80, 48, 0, 0, 80, 48, 0, 0
- .byte 64, 64, 0, 0, 64, 64, 0, 0, 64, 64, 0, 0, 64, 64, 0, 0
- .byte 48, 80, 0, 0, 48, 80, 0, 0, 48, 80, 0, 0, 48, 80, 0, 0
- .byte 32, 96, 0, 0, 32, 96, 0, 0, 32, 96, 0, 0, 32, 96, 0, 0
- .byte 16,112, 0, 0, 16,112, 0, 0, 16,112, 0, 0, 16,112, 0, 0
-
- .align 4
-vfilter_b:
- .byte 128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128
- .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
- .byte 112,112,112,112,112,112,112,112,112,112,112,112,112,112,112,112
- .byte 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
- .byte 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96
- .byte 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32
- .byte 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80
- .byte 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48
- .byte 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64
- .byte 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64
- .byte 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48
- .byte 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80
- .byte 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32
- .byte 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96
- .byte 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
- .byte 112,112,112,112,112,112,112,112,112,112,112,112,112,112,112,112
-
- .align 4
-b_hperm_b:
- .byte 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15
-
- .align 4
-b_0123_b:
- .byte 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6
-
- .align 4
-b_4567_b:
- .byte 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10
-
-b_hilo_b:
- .byte 0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23
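The bilinear tables above reduce to a two-tap filter: for subpel offset o in 0..7 the taps are (128 - 16*o, 16*o), again with +64 rounding and a shift by 7. A small scalar sketch of the two-pass structure used by the routines above; the helper names are hypothetical:

#include <stdint.h>

/* Two-tap bilinear filter matching the hfilter_b/vfilter_b rows. */
static uint8_t bilinear_pixel(uint8_t a, uint8_t b, int offset) {
  const int t1 = 16 * offset;          /* second tap, as in the tables */
  const int t0 = 128 - t1;             /* first tap                    */
  return (uint8_t)((a * t0 + b * t1 + 64) >> 7);
}

/* First pass filters w x (h+1) pixels horizontally into a temporary, the
 * second pass filters that temporary vertically; offset 0 simply copies,
 * which is why either pass may be skipped in the assembly. */
static void bilinear_predict(const uint8_t *src, int src_pitch,
                             int xoff, int yoff,
                             uint8_t *dst, int dst_pitch, int w, int h) {
  uint8_t tmp[17 * 16];                /* large enough for a 16x16 output */
  for (int r = 0; r < h + 1; r++)
    for (int c = 0; c < w; c++)
      tmp[r * w + c] = bilinear_pixel(src[r * src_pitch + c],
                                      src[r * src_pitch + c + 1], xoff);
  for (int r = 0; r < h; r++)
    for (int c = 0; c < w; c++)
      dst[r * dst_pitch + c] = bilinear_pixel(tmp[r * w + c],
                                              tmp[(r + 1) * w + c], yoff);
}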
--- a/vp8/common/ppc/idctllm_altivec.asm
+++ /dev/null
@@ -1,189 +1,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
- .globl short_idct4x4llm_ppc
-
-.macro load_c V, LABEL, OFF, R0, R1
- lis \R0, \LABEL@ha
- la \R1, \LABEL@l(\R0)
- lvx \V, \OFF, \R1
-.endm
-
-;# r3 short *input
-;# r4 short *output
-;# r5 int pitch
- .align 2
-short_idct4x4llm_ppc:
- mfspr r11, 256 ;# get old VRSAVE
- oris r12, r11, 0xfff8
- mtspr 256, r12 ;# set VRSAVE
-
- load_c v8, sinpi8sqrt2, 0, r9, r10
- load_c v9, cospi8sqrt2minus1, 0, r9, r10
- load_c v10, hi_hi, 0, r9, r10
- load_c v11, lo_lo, 0, r9, r10
- load_c v12, shift_16, 0, r9, r10
-
- li r10, 16
- lvx v0, 0, r3 ;# input ip[0], ip[ 4]
- lvx v1, r10, r3 ;# input ip[8], ip[12]
-
- ;# first pass
- vupkhsh v2, v0
- vupkhsh v3, v1
- vaddsws v6, v2, v3 ;# a1 = ip[0]+ip[8]
- vsubsws v7, v2, v3 ;# b1 = ip[0]-ip[8]
-
- vupklsh v0, v0
- vmulosh v4, v0, v8
- vsraw v4, v4, v12
- vaddsws v4, v4, v0 ;# ip[ 4] * sin(pi/8) * sqrt(2)
-
- vupklsh v1, v1
- vmulosh v5, v1, v9
- vsraw v5, v5, v12 ;# ip[12] * cos(pi/8) * sqrt(2)
- vaddsws v5, v5, v1
-
- vsubsws v4, v4, v5 ;# c1
-
- vmulosh v3, v1, v8
- vsraw v3, v3, v12
- vaddsws v3, v3, v1 ;# ip[12] * sin(pi/8) * sqrt(2)
-
- vmulosh v5, v0, v9
- vsraw v5, v5, v12 ;# ip[ 4] * cos(pi/8) * sqrt(2)
- vaddsws v5, v5, v0
-
- vaddsws v3, v3, v5 ;# d1
-
- vaddsws v0, v6, v3 ;# a1 + d1
- vsubsws v3, v6, v3 ;# a1 - d1
-
- vaddsws v1, v7, v4 ;# b1 + c1
- vsubsws v2, v7, v4 ;# b1 - c1
-
- ;# transpose input
- vmrghw v4, v0, v1 ;# a0 b0 a1 b1
- vmrghw v5, v2, v3 ;# c0 d0 c1 d1
-
- vmrglw v6, v0, v1 ;# a2 b2 a3 b3
- vmrglw v7, v2, v3 ;# c2 d2 c3 d3
-
- vperm v0, v4, v5, v10 ;# a0 b0 c0 d0
- vperm v1, v4, v5, v11 ;# a1 b1 c1 d1
-
- vperm v2, v6, v7, v10 ;# a2 b2 c2 d2
- vperm v3, v6, v7, v11 ;# a3 b3 c3 d3
-
- ;# second pass
- vaddsws v6, v0, v2 ;# a1 = ip[0]+ip[8]
- vsubsws v7, v0, v2 ;# b1 = ip[0]-ip[8]
-
- vmulosh v4, v1, v8
- vsraw v4, v4, v12
- vaddsws v4, v4, v1 ;# ip[ 4] * sin(pi/8) * sqrt(2)
-
- vmulosh v5, v3, v9
- vsraw v5, v5, v12 ;# ip[12] * cos(pi/8) * sqrt(2)
- vaddsws v5, v5, v3
-
- vsubsws v4, v4, v5 ;# c1
-
- vmulosh v2, v3, v8
- vsraw v2, v2, v12
- vaddsws v2, v2, v3 ;# ip[12] * sin(pi/8) * sqrt(2)
-
- vmulosh v5, v1, v9
- vsraw v5, v5, v12 ;# ip[ 4] * cos(pi/8) * sqrt(2)
- vaddsws v5, v5, v1
-
- vaddsws v3, v2, v5 ;# d1
-
- vaddsws v0, v6, v3 ;# a1 + d1
- vsubsws v3, v6, v3 ;# a1 - d1
-
- vaddsws v1, v7, v4 ;# b1 + c1
- vsubsws v2, v7, v4 ;# b1 - c1
-
- vspltish v6, 4
- vspltish v7, 3
-
- vpkswss v0, v0, v1
- vpkswss v1, v2, v3
-
- vaddshs v0, v0, v6
- vaddshs v1, v1, v6
-
- vsrah v0, v0, v7
- vsrah v1, v1, v7
-
- ;# transpose output
- vmrghh v2, v0, v1 ;# a0 c0 a1 c1 a2 c2 a3 c3
- vmrglh v3, v0, v1 ;# b0 d0 b1 d1 b2 d2 b3 d3
-
- vmrghh v0, v2, v3 ;# a0 b0 c0 d0 a1 b1 c1 d1
- vmrglh v1, v2, v3 ;# a2 b2 c2 d2 a3 b3 c3 d3
-
- stwu r1,-416(r1) ;# create space on the stack
-
- stvx v0, 0, r1
- lwz r6, 0(r1)
- stw r6, 0(r4)
- lwz r6, 4(r1)
- stw r6, 4(r4)
-
- add r4, r4, r5
-
- lwz r6, 8(r1)
- stw r6, 0(r4)
- lwz r6, 12(r1)
- stw r6, 4(r4)
-
- add r4, r4, r5
-
- stvx v1, 0, r1
- lwz r6, 0(r1)
- stw r6, 0(r4)
- lwz r6, 4(r1)
- stw r6, 4(r4)
-
- add r4, r4, r5
-
- lwz r6, 8(r1)
- stw r6, 0(r4)
- lwz r6, 12(r1)
- stw r6, 4(r4)
-
- addi r1, r1, 416 ;# recover stack
-
- mtspr 256, r11 ;# reset old VRSAVE
-
- blr
-
- .align 4
-sinpi8sqrt2:
- .short 35468, 35468, 35468, 35468, 35468, 35468, 35468, 35468
-
- .align 4
-cospi8sqrt2minus1:
- .short 20091, 20091, 20091, 20091, 20091, 20091, 20091, 20091
-
- .align 4
-shift_16:
- .long 16, 16, 16, 16
-
- .align 4
-hi_hi:
- .byte 0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23
-
- .align 4
-lo_lo:
- .byte 8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31
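The constants above are Q16 fixed point: 35468/65536 is roughly sin(pi/8)*sqrt(2) and 20091/65536 roughly cos(pi/8)*sqrt(2) - 1. Following the a1/b1/c1/d1 comments in the assembly, a plausible scalar model of the transform (a hypothetical helper, not the file's own reference code):

#include <stdint.h>

/* 4x4 inverse transform sketched from the assembly's comments: two butterfly
 * passes, the second adding 4 and shifting right by 3.  'pitch' is the output
 * row stride in bytes, as in the ppc routine. */
static void idct4x4llm(const int16_t *input, int16_t *output, int pitch) {
  const int sinpi8sqrt2 = 35468, cospi8sqrt2minus1 = 20091;
  int tmp[16];
  const int16_t *ip = input;
  int *op = tmp;

  for (int i = 0; i < 4; i++) {                  /* first pass (columns) */
    int a1 = ip[0] + ip[8];
    int b1 = ip[0] - ip[8];
    int c1 = ((ip[4] * sinpi8sqrt2) >> 16)
             - (ip[12] + ((ip[12] * cospi8sqrt2minus1) >> 16));
    int d1 = (ip[4] + ((ip[4] * cospi8sqrt2minus1) >> 16))
             + ((ip[12] * sinpi8sqrt2) >> 16);
    op[0] = a1 + d1;  op[12] = a1 - d1;
    op[4] = b1 + c1;  op[8]  = b1 - c1;
    ip++; op++;
  }

  const int *tp = tmp;
  int16_t *out = output;
  for (int i = 0; i < 4; i++) {                  /* second pass (rows) */
    int a1 = tp[0] + tp[2];
    int b1 = tp[0] - tp[2];
    int c1 = ((tp[1] * sinpi8sqrt2) >> 16)
             - (tp[3] + ((tp[3] * cospi8sqrt2minus1) >> 16));
    int d1 = (tp[1] + ((tp[1] * cospi8sqrt2minus1) >> 16))
             + ((tp[3] * sinpi8sqrt2) >> 16);
    out[0] = (int16_t)((a1 + d1 + 4) >> 3);
    out[3] = (int16_t)((a1 - d1 + 4) >> 3);
    out[1] = (int16_t)((b1 + c1 + 4) >> 3);
    out[2] = (int16_t)((b1 - c1 + 4) >> 3);
    tp += 4; out += pitch / 2;                   /* pitch is in bytes */
  }
}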
--- a/vp8/common/ppc/loopfilter_altivec.c
+++ /dev/null
@@ -1,127 +1,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include "loopfilter.h"
-#include "onyxc_int.h"
-
-typedef void loop_filter_function_y_ppc
-(
- unsigned char *s, // source pointer
- int p, // pitch
- const signed char *flimit,
- const signed char *limit,
- const signed char *thresh
-);
-
-typedef void loop_filter_function_uv_ppc
-(
- unsigned char *u, // source pointer
- unsigned char *v, // source pointer
- int p, // pitch
- const signed char *flimit,
- const signed char *limit,
- const signed char *thresh
-);
-
-typedef void loop_filter_function_s_ppc
-(
- unsigned char *s, // source pointer
- int p, // pitch
- const signed char *flimit
-);
-
-loop_filter_function_y_ppc mbloop_filter_horizontal_edge_y_ppc;
-loop_filter_function_y_ppc mbloop_filter_vertical_edge_y_ppc;
-loop_filter_function_y_ppc loop_filter_horizontal_edge_y_ppc;
-loop_filter_function_y_ppc loop_filter_vertical_edge_y_ppc;
-
-loop_filter_function_uv_ppc mbloop_filter_horizontal_edge_uv_ppc;
-loop_filter_function_uv_ppc mbloop_filter_vertical_edge_uv_ppc;
-loop_filter_function_uv_ppc loop_filter_horizontal_edge_uv_ppc;
-loop_filter_function_uv_ppc loop_filter_vertical_edge_uv_ppc;
-
-loop_filter_function_s_ppc loop_filter_simple_horizontal_edge_ppc;
-loop_filter_function_s_ppc loop_filter_simple_vertical_edge_ppc;
-
-// Horizontal MB filtering
-void loop_filter_mbh_ppc(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
- int y_stride, int uv_stride, loop_filter_info *lfi) {
- mbloop_filter_horizontal_edge_y_ppc(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->thr);
-
- if (u_ptr)
- mbloop_filter_horizontal_edge_uv_ppc(u_ptr, v_ptr, uv_stride, lfi->mbflim, lfi->lim, lfi->thr);
-}
-
-void loop_filter_mbhs_ppc(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
- int y_stride, int uv_stride, loop_filter_info *lfi) {
- (void)u_ptr;
- (void)v_ptr;
- (void)uv_stride;
- loop_filter_simple_horizontal_edge_ppc(y_ptr, y_stride, lfi->mbflim);
-}
-
-// Vertical MB Filtering
-void loop_filter_mbv_ppc(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
- int y_stride, int uv_stride, loop_filter_info *lfi) {
- mbloop_filter_vertical_edge_y_ppc(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->thr);
-
- if (u_ptr)
- mbloop_filter_vertical_edge_uv_ppc(u_ptr, v_ptr, uv_stride, lfi->mbflim, lfi->lim, lfi->thr);
-}
-
-void loop_filter_mbvs_ppc(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
- int y_stride, int uv_stride, loop_filter_info *lfi) {
- (void)u_ptr;
- (void)v_ptr;
- (void)uv_stride;
- loop_filter_simple_vertical_edge_ppc(y_ptr, y_stride, lfi->mbflim);
-}
-
-// Horizontal B Filtering
-void loop_filter_bh_ppc(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
- int y_stride, int uv_stride, loop_filter_info *lfi) {
- // These should all be done at once with one call, instead of 3
- loop_filter_horizontal_edge_y_ppc(y_ptr + 4 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr);
- loop_filter_horizontal_edge_y_ppc(y_ptr + 8 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr);
- loop_filter_horizontal_edge_y_ppc(y_ptr + 12 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr);
-
- if (u_ptr)
- loop_filter_horizontal_edge_uv_ppc(u_ptr + 4 * uv_stride, v_ptr + 4 * uv_stride, uv_stride, lfi->flim, lfi->lim, lfi->thr);
-}
-
-void loop_filter_bhs_ppc(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
- int y_stride, int uv_stride, loop_filter_info *lfi) {
- (void)u_ptr;
- (void)v_ptr;
- (void)uv_stride;
- loop_filter_simple_horizontal_edge_ppc(y_ptr + 4 * y_stride, y_stride, lfi->flim);
- loop_filter_simple_horizontal_edge_ppc(y_ptr + 8 * y_stride, y_stride, lfi->flim);
- loop_filter_simple_horizontal_edge_ppc(y_ptr + 12 * y_stride, y_stride, lfi->flim);
-}
-
-// Vertical B Filtering
-void loop_filter_bv_ppc(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
- int y_stride, int uv_stride, loop_filter_info *lfi) {
- loop_filter_vertical_edge_y_ppc(y_ptr, y_stride, lfi->flim, lfi->lim, lfi->thr);
-
- if (u_ptr)
- loop_filter_vertical_edge_uv_ppc(u_ptr + 4, v_ptr + 4, uv_stride, lfi->flim, lfi->lim, lfi->thr);
-}
-
-void loop_filter_bvs_ppc(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
- int y_stride, int uv_stride, loop_filter_info *lfi) {
- (void)u_ptr;
- (void)v_ptr;
- (void)uv_stride;
- loop_filter_simple_vertical_edge_ppc(y_ptr + 4, y_stride, lfi->flim);
- loop_filter_simple_vertical_edge_ppc(y_ptr + 8, y_stride, lfi->flim);
- loop_filter_simple_vertical_edge_ppc(y_ptr + 12, y_stride, lfi->flim);
-}
--- a/vp8/common/ppc/loopfilter_filters_altivec.asm
+++ /dev/null
@@ -1,1253 +1,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
- .globl mbloop_filter_horizontal_edge_y_ppc
- .globl loop_filter_horizontal_edge_y_ppc
- .globl mbloop_filter_vertical_edge_y_ppc
- .globl loop_filter_vertical_edge_y_ppc
-
- .globl mbloop_filter_horizontal_edge_uv_ppc
- .globl loop_filter_horizontal_edge_uv_ppc
- .globl mbloop_filter_vertical_edge_uv_ppc
- .globl loop_filter_vertical_edge_uv_ppc
-
- .globl loop_filter_simple_horizontal_edge_ppc
- .globl loop_filter_simple_vertical_edge_ppc
-
- .text
-;# We often need to perform transposes (and other transpose-like operations)
-;# on matrices of data. This is simplified by the fact that we usually
-;# operate on hunks of data whose dimensions are powers of 2, or at least
-;# divisible by highish powers of 2.
-;#
-;# These operations can be very confusing. They become more straightforward
-;# when we think of them as permutations of address bits: Concatenate a
-;# group of vector registers and think of it as occupying a block of
-;# memory beginning at address zero. The low four bits 0...3 of the
-;# address then correspond to position within a register, the higher-order
-;# address bits select the register.
-;#
-;# Although register selection, at the code level, is arbitrary, things
-;# are simpler if we use contiguous ranges of register numbers, simpler
-;# still if the low-order bits of the register number correspond to
-;# conceptual address bits. We do this whenever reasonable.
-;#
-;# A 16x16 transpose can then be thought of as an operation on
-;# a 256-element block of memory. It takes 8 bits 0...7 to address this
-;# memory and the effect of a transpose is to interchange address bit
-;# 0 with 4, 1 with 5, 2 with 6, and 3 with 7. Bits 0...3 index the
-;# column, which is interchanged with the row addressed by bits 4..7.
-;#
-;# The altivec merge instructions provide a rapid means of effecting
-;# many of these transforms. They operate at three widths (8,16,32).
-;# Writing V(x) for vector register #x, paired merges permute address
-;# indices as follows.
-;#
-;# 0->1 1->2 2->3 3->(4+d) (4+s)->0:
-;#
-;# vmrghb V( x), V( y), V( y + (1<<s))
-;# vmrglb V( x + (1<<d)), V( y), V( y + (1<<s))
-;#
-;#
-;# =0= 1->2 2->3 3->(4+d) (4+s)->1:
-;#
-;# vmrghh V( x), V( y), V( y + (1<<s))
-;# vmrglh V( x + (1<<d)), V( y), V( y + (1<<s))
-;#
-;#
-;# =0= =1= 2->3 3->(4+d) (4+s)->2:
-;#
-;# vmrghw V( x), V( y), V( y + (1<<s))
-;# vmrglw V( x + (1<<d)), V( y), V( y + (1<<s))
-;#
-;#
-;# Unfortunately, there is no doubleword merge instruction.
-;# The following sequence uses "vperm" as a substitute.
-;# Assuming that the selection masks b_hihi and b_lolo (defined in LFppc.c)
-;# are in registers Vhihi and Vlolo, we can also effect the permutation
-;#
-;# =0= =1= =2= 3->(4+d) (4+s)->3 by the sequence:
-;#
-;# vperm V( x), V( y), V( y + (1<<s)), Vhihi
-;# vperm V( x + (1<<d)), V( y), V( y + (1<<s)), Vlolo
-;#
-;#
-;# Except for bits s and d, the other relationships between register
-;# number (= high-order part of address) bits are at the disposal of
-;# the programmer.
-;#
-
-;# To avoid excess transposes, we filter all 3 vertical luma subblock
-;# edges together. This requires a single 16x16 transpose, which, in
-;# the above language, amounts to the following permutation of address
-;# indices: 0<->4 1<->5 2<->6 3<->7, which we accomplish by
-;# 4 iterations of the cyclic transform 0->1->2->3->4->5->6->7->0.
-;#
-;# Except for the fact that the destination registers get written
-;# before we are done referencing the old contents, the cyclic transform
-;# is effected by
-;#
-;# x = 0; do {
-;# vmrghb V(2x), V(x), V(x+8);
-;# vmrglb V(2x+1), V(x), V(x+8);
-;# } while( ++x < 8);
-;#
-;# For clarity, and because we can afford it, we do this transpose
-;# using all 32 registers, alternating the banks 0..15 and 16 .. 31,
-;# leaving the final result in 16 .. 31, as the lower registers are
-;# used in the filtering itself.
-;#
-.macro Tpair A, B, X, Y
- vmrghb \A, \X, \Y
- vmrglb \B, \X, \Y
-.endm
-
-;# Each step takes 8*2 = 16 instructions
-
-.macro t16_even
- Tpair v16,v17, v0,v8
- Tpair v18,v19, v1,v9
- Tpair v20,v21, v2,v10
- Tpair v22,v23, v3,v11
- Tpair v24,v25, v4,v12
- Tpair v26,v27, v5,v13
- Tpair v28,v29, v6,v14
- Tpair v30,v31, v7,v15
-.endm
-
-.macro t16_odd
- Tpair v0,v1, v16,v24
- Tpair v2,v3, v17,v25
- Tpair v4,v5, v18,v26
- Tpair v6,v7, v19,v27
- Tpair v8,v9, v20,v28
- Tpair v10,v11, v21,v29
- Tpair v12,v13, v22,v30
- Tpair v14,v15, v23,v31
-.endm
-
-;# Whole transpose takes 4*16 = 64 instructions
-
-.macro t16_full
- t16_odd
- t16_even
- t16_odd
- t16_even
-.endm
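To make the address-bit argument above concrete: each vmrghb/vmrglb pair cycles all eight address bits by one position, so four such rounds swap bits 0..3 with bits 4..7, which is exactly the 16x16 byte transpose. A small scalar model of one round and the full transpose; this is a hypothetical illustration, not code from the tree:

#include <stdint.h>

/* One round of the cyclic transform: rows x and x+8 are byte-interleaved
 * into rows 2x and 2x+1, mirroring a vmrghb/vmrglb pair. */
static void merge_round(uint8_t dst[16][16], uint8_t src[16][16]) {
  for (int x = 0; x < 8; x++)
    for (int i = 0; i < 8; i++) {
      dst[2 * x][2 * i]         = src[x][i];          /* vmrghb */
      dst[2 * x][2 * i + 1]     = src[x + 8][i];
      dst[2 * x + 1][2 * i]     = src[x][i + 8];      /* vmrglb */
      dst[2 * x + 1][2 * i + 1] = src[x + 8][i + 8];
    }
}

/* Four rounds rotate every address bit by four places, i.e. a transpose,
 * matching t16_full (the odd/even rounds ping-pong the register banks). */
static void transpose16x16(uint8_t m[16][16]) {
  uint8_t t[16][16];
  merge_round(t, m);
  merge_round(m, t);
  merge_round(t, m);
  merge_round(m, t);
}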
-
-;# Vertical edge filtering requires transposes. For the simple filter,
-;# we need to convert 16 rows of 4 pels each into 4 registers of 16 pels
-;# each. Writing 0 ... 63 for the pixel indices, the desired result is:
-;#
-;# v0 = 0 1 ... 14 15
-;# v1 = 16 17 ... 30 31
-;# v2 = 32 33 ... 46 47
-;# v3 = 48 49 ... 62 63
-;#
-;# In frame-buffer memory, the layout is:
-;#
-;# 0 16 32 48
-;# 1 17 33 49
-;# ...
-;# 15 31 47 63.
-;#
-;# We begin by reading the data 32 bits at a time (using scalar operations)
-;# into a temporary array, reading the rows of the array into vector registers,
-;# with the following layout:
-;#
-;# v0 = 0 16 32 48 4 20 36 52 8 24 40 56 12 28 44 60
-;# v1 = 1 17 33 49 5 21 ... 45 61
-;# v2 = 2 18 ... 46 62
-;# v3 = 3 19 ... 47 63
-;#
-;# From the "address-bit" perspective discussed above, we simply need to
-;# interchange bits 0 <-> 4 and 1 <-> 5, leaving bits 2 and 3 alone.
-;# In other words, we transpose each of the four 4x4 submatrices.
-;#
-;# This transformation is its own inverse, and we need to perform it
-;# again before writing the pixels back into the frame buffer.
-;#
-;# It acts in place on registers v0...v3, uses v4...v7 as temporaries,
-;# and assumes that v14/v15 contain the b_hihi/b_lolo selectors
-;# defined above. We think of both groups of 4 registers as having
-;# "addresses" {0,1,2,3} * 16.
-;#
-.macro Transpose4times4x4 Vlo, Vhi
-
- ;# d=s=0 0->1 1->2 2->3 3->4 4->0 =5=
-
- vmrghb v4, v0, v1
- vmrglb v5, v0, v1
- vmrghb v6, v2, v3
- vmrglb v7, v2, v3
-
- ;# d=0 s=1 =0= 1->2 2->3 3->4 4->5 5->1
-
- vmrghh v0, v4, v6
- vmrglh v1, v4, v6
- vmrghh v2, v5, v7
- vmrglh v3, v5, v7
-
- ;# d=s=0 =0= =1= 2->3 3->4 4->2 =5=
-
- vmrghw v4, v0, v1
- vmrglw v5, v0, v1
- vmrghw v6, v2, v3
- vmrglw v7, v2, v3
-
- ;# d=0 s=1 =0= =1= =2= 3->4 4->5 5->3
-
- vperm v0, v4, v6, \Vlo
- vperm v1, v4, v6, \Vhi
- vperm v2, v5, v7, \Vlo
- vperm v3, v5, v7, \Vhi
-.endm
-;# end Transpose4times4x4
-
-
-;# Normal mb vertical edge filter transpose.
-;#
-;# We read 8 columns of data, initially in the following pattern:
-;#
-;# (0,0) (1,0) ... (7,0) (0,1) (1,1) ... (7,1)
-;# (0,2) (1,2) ... (7,2) (0,3) (1,3) ... (7,3)
-;# ...
-;# (0,14) (1,14) .. (7,14) (0,15) (1,15) .. (7,15)
-;#
-;# and wish to convert to:
-;#
-;# (0,0) ... (0,15)
-;# (1,0) ... (1,15)
-;# ...
-;# (7,0) ... (7,15).
-;#
-;# In "address bit" language, we wish to map
-;#
-;# 0->4 1->5 2->6 3->0 4->1 5->2 6->3, i.e., I -> (I+4) mod 7.
-;#
-;# This can be accomplished by 4 iterations of the cyclic transform
-;#
-;# I -> (I+1) mod 7;
-;#
-;# each iteration can be realized by (d=0, s=2):
-;#
-;# x = 0; do Tpair( V(2x),V(2x+1), V(x),V(x+4)) while( ++x < 4);
-;#
-;# The input/output is in registers v0...v7. We use v10...v17 as mirrors;
-;# preserving v8 = sign converter.
-;#
-;# Inverse transpose is similar, except here I -> (I+3) mod 7 and the
-;# result lands in the "mirror" registers v10...v17
-;#
-.macro t8x16_odd
- Tpair v10, v11, v0, v4
- Tpair v12, v13, v1, v5
- Tpair v14, v15, v2, v6
- Tpair v16, v17, v3, v7
-.endm
-
-.macro t8x16_even
- Tpair v0, v1, v10, v14
- Tpair v2, v3, v11, v15
- Tpair v4, v5, v12, v16
- Tpair v6, v7, v13, v17
-.endm
-
-.macro transpose8x16_fwd
- t8x16_odd
- t8x16_even
- t8x16_odd
- t8x16_even
-.endm
-
-.macro transpose8x16_inv
- t8x16_odd
- t8x16_even
- t8x16_odd
-.endm
-
-.macro Transpose16x16
- vmrghb v0, v16, v24
- vmrglb v1, v16, v24
- vmrghb v2, v17, v25
- vmrglb v3, v17, v25
- vmrghb v4, v18, v26
- vmrglb v5, v18, v26
- vmrghb v6, v19, v27
- vmrglb v7, v19, v27
- vmrghb v8, v20, v28
- vmrglb v9, v20, v28
- vmrghb v10, v21, v29
- vmrglb v11, v21, v29
- vmrghb v12, v22, v30
- vmrglb v13, v22, v30
- vmrghb v14, v23, v31
- vmrglb v15, v23, v31
- vmrghb v16, v0, v8
- vmrglb v17, v0, v8
- vmrghb v18, v1, v9
- vmrglb v19, v1, v9
- vmrghb v20, v2, v10
- vmrglb v21, v2, v10
- vmrghb v22, v3, v11
- vmrglb v23, v3, v11
- vmrghb v24, v4, v12
- vmrglb v25, v4, v12
- vmrghb v26, v5, v13
- vmrglb v27, v5, v13
- vmrghb v28, v6, v14
- vmrglb v29, v6, v14
- vmrghb v30, v7, v15
- vmrglb v31, v7, v15
- vmrghb v0, v16, v24
- vmrglb v1, v16, v24
- vmrghb v2, v17, v25
- vmrglb v3, v17, v25
- vmrghb v4, v18, v26
- vmrglb v5, v18, v26
- vmrghb v6, v19, v27
- vmrglb v7, v19, v27
- vmrghb v8, v20, v28
- vmrglb v9, v20, v28
- vmrghb v10, v21, v29
- vmrglb v11, v21, v29
- vmrghb v12, v22, v30
- vmrglb v13, v22, v30
- vmrghb v14, v23, v31
- vmrglb v15, v23, v31
- vmrghb v16, v0, v8
- vmrglb v17, v0, v8
- vmrghb v18, v1, v9
- vmrglb v19, v1, v9
- vmrghb v20, v2, v10
- vmrglb v21, v2, v10
- vmrghb v22, v3, v11
- vmrglb v23, v3, v11
- vmrghb v24, v4, v12
- vmrglb v25, v4, v12
- vmrghb v26, v5, v13
- vmrglb v27, v5, v13
- vmrghb v28, v6, v14
- vmrglb v29, v6, v14
- vmrghb v30, v7, v15
- vmrglb v31, v7, v15
-.endm
-
-;# load_g loads a global vector (whose address is in the local variable Gptr)
-;# into vector register Vreg. Trashes r0
-.macro load_g Vreg, Gptr
- lwz r0, \Gptr
- lvx \Vreg, 0, r0
-.endm
-
-;# exploit the saturation here. if the answer is negative
-;# it will be clamped to 0. ORing 0 with a positive
-;# number will be the positive number (abs)
-;# RES = abs( A-B), trashes TMP
-.macro Abs RES, TMP, A, B
- vsububs \RES, \A, \B
- vsububs \TMP, \B, \A
- vor \RES, \RES, \TMP
-.endm
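The Abs macro leans on unsigned saturating subtraction: whichever of A-B and B-A would go negative saturates to 0, so ORing the two results gives |A-B| with no compare or branch. The same idea in scalar C, with hypothetical helper names:

#include <stdint.h>

/* Saturating unsigned byte subtract, as vsububs does per lane. */
static uint8_t subs_u8(uint8_t a, uint8_t b) { return a > b ? (uint8_t)(a - b) : 0; }

/* |a - b| via two saturating subtracts and an OR, mirroring Abs. */
static uint8_t abs_diff_u8(uint8_t a, uint8_t b) {
  return (uint8_t)(subs_u8(a, b) | subs_u8(b, a));
}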
-
-;# RES = Max( RES, abs( A-B)), trashes TMP
-.macro max_abs RES, TMP, A, B
- vsububs \TMP, \A, \B
- vmaxub \RES, \RES, \TMP
- vsububs \TMP, \B, \A
- vmaxub \RES, \RES, \TMP
-.endm
-
-.macro Masks
- ;# build masks
- ;# input is all 8 bit unsigned (0-255). need to
- ;# do abs(vala-valb) > limit. but no need to compare each
- ;# value to the limit. find the max of the absolute differences
- ;# and compare that to the limit.
- ;# First hev
- Abs v14, v13, v2, v3 ;# |P1 - P0|
- max_abs v14, v13, v5, v4 ;# |Q1 - Q0|
-
- vcmpgtub v10, v14, v10 ;# HEV = true if thresh exceeded
-
- ;# Next limit
- max_abs v14, v13, v0, v1 ;# |P3 - P2|
- max_abs v14, v13, v1, v2 ;# |P2 - P1|
- max_abs v14, v13, v6, v5 ;# |Q2 - Q1|
- max_abs v14, v13, v7, v6 ;# |Q3 - Q2|
-
- vcmpgtub v9, v14, v9 ;# R = true if limit exceeded
-
- ;# flimit
- Abs v14, v13, v3, v4 ;# |P0 - Q0|
-
- vcmpgtub v8, v14, v8 ;# X = true if flimit exceeded
-
- vor v8, v8, v9 ;# R = true if flimit or limit exceeded
- ;# done building masks
-.endm
-
-.macro build_constants RFL, RLI, RTH, FL, LI, TH
- ;# build constants
- lvx \FL, 0, \RFL ;# flimit
- lvx \LI, 0, \RLI ;# limit
- lvx \TH, 0, \RTH ;# thresh
-
- vspltisb v11, 8
- vspltisb v12, 4
- vslb v11, v11, v12 ;# 0x80808080808080808080808080808080
-.endm
-
-.macro load_data_y
- ;# setup strides/pointers to be able to access
- ;# all of the data
- add r5, r4, r4 ;# r5 = 2 * stride
- sub r6, r3, r5 ;# r6 -> 2 rows back
- neg r7, r4 ;# r7 = -stride
-
- ;# load 16 pixels worth of data to work on
- sub r0, r6, r5 ;# r0 -> 4 rows back (temp)
- lvx v0, 0, r0 ;# P3 (read only)
- lvx v1, r7, r6 ;# P2
- lvx v2, 0, r6 ;# P1
- lvx v3, r7, r3 ;# P0
- lvx v4, 0, r3 ;# Q0
- lvx v5, r4, r3 ;# Q1
- lvx v6, r5, r3 ;# Q2
- add r0, r3, r5 ;# r0 -> 2 rows fwd (temp)
- lvx v7, r4, r0 ;# Q3 (read only)
-.endm
-
-;# Expects
-;# v10 == HEV
-;# v13 == tmp
-;# v14 == tmp
-.macro common_adjust P0, Q0, P1, Q1, HEV_PRESENT
- vxor \P1, \P1, v11 ;# SP1
- vxor \P0, \P0, v11 ;# SP0
- vxor \Q0, \Q0, v11 ;# SQ0
- vxor \Q1, \Q1, v11 ;# SQ1
-
- vsubsbs v13, \P1, \Q1 ;# f = c (P1 - Q1)
-.if \HEV_PRESENT
- vand v13, v13, v10 ;# f &= hev
-.endif
- vsubsbs v14, \Q0, \P0 ;# -126 <= X = Q0-P0 <= +126
- vaddsbs v13, v13, v14
- vaddsbs v13, v13, v14
- vaddsbs v13, v13, v14 ;# A = c( c(P1-Q1) + 3*(Q0-P0))
-
- vandc v13, v13, v8 ;# f &= mask
-
- vspltisb v8, 3
- vspltisb v9, 4
-
- vaddsbs v14, v13, v9 ;# f1 = c (f+4)
- vaddsbs v15, v13, v8 ;# f2 = c (f+3)
-
- vsrab v13, v14, v8 ;# f1 >>= 3
- vsrab v15, v15, v8 ;# f2 >>= 3
-
- vsubsbs \Q0, \Q0, v13 ;# u1 = c (SQ0 - f1)
- vaddsbs \P0, \P0, v15 ;# u2 = c (SP0 + f2)
-.endm
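In scalar terms, common_adjust is the core edge adjustment on sign-converted (value - 128) pixels: build a clamped filter value from P1-Q1 plus three times Q0-P0, mask it, then split it into two rounded halves that move Q0 down and P0 up. A rough model with hypothetical names, following the inline comments above:

#include <stdint.h>

static int8_t sat8(int v) { return v < -128 ? -128 : v > 127 ? 127 : (int8_t)v; }

/* 'mask' is nonzero where the limit tests passed; 'hev' marks high edge
 * variance.  Pixels are already sign-converted, as after the vxor v11 ops. */
static void common_adjust(int8_t *p0, int8_t *q0,
                          int8_t *p1, int8_t *q1,
                          int mask, int hev, int use_hev) {
  int f = sat8(*p1 - *q1);                 /* f = c (P1 - Q1)            */
  if (use_hev && !hev)
    f = 0;                                 /* f &= hev                   */
  int d = sat8(*q0 - *p0);
  f = sat8(f + d);
  f = sat8(f + d);
  f = sat8(f + d);                         /* f = c( c(P1-Q1)+3*(Q0-P0)) */
  if (!mask)
    f = 0;                                 /* f &= mask                  */
  int f1 = sat8(f + 4) >> 3;               /* f1 = c (f+4) >> 3          */
  int f2 = sat8(f + 3) >> 3;               /* f2 = c (f+3) >> 3          */
  *q0 = sat8(*q0 - f1);                    /* u1 = c (SQ0 - f1)          */
  *p0 = sat8(*p0 + f2);                    /* u2 = c (SP0 + f2)          */
}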
-
-.macro vp8_mbfilter
- Masks
-
-    ;# start the filtering here
- vxor v1, v1, v11 ;# SP2
- vxor v2, v2, v11 ;# SP1
- vxor v3, v3, v11 ;# SP0
- vxor v4, v4, v11 ;# SQ0
- vxor v5, v5, v11 ;# SQ1
- vxor v6, v6, v11 ;# SQ2
-
- ;# add outer taps if we have high edge variance
- vsubsbs v13, v2, v5 ;# f = c (SP1-SQ1)
-
- vsubsbs v14, v4, v3 ;# SQ0-SP0
- vaddsbs v13, v13, v14
- vaddsbs v13, v13, v14
- vaddsbs v13, v13, v14 ;# f = c( c(SP1-SQ1) + 3*(SQ0-SP0))
-
- vandc v13, v13, v8 ;# f &= mask
- vand v15, v13, v10 ;# f2 = f & hev
-
- ;# save bottom 3 bits so that we round one side +4 and the other +3
- vspltisb v8, 3
- vspltisb v9, 4
-
- vaddsbs v14, v15, v9 ;# f1 = c (f+4)
- vaddsbs v15, v15, v8 ;# f2 = c (f+3)
-
- vsrab v14, v14, v8 ;# f1 >>= 3
- vsrab v15, v15, v8 ;# f2 >>= 3
-
- vsubsbs v4, v4, v14 ;# u1 = c (SQ0 - f1)
- vaddsbs v3, v3, v15 ;# u2 = c (SP0 + f2)
-
- ;# only apply wider filter if not high edge variance
- vandc v13, v13, v10 ;# f &= ~hev
-
- vspltisb v9, 2
- vnor v8, v8, v8
- vsrb v9, v8, v9 ;# 0x3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f
- vupkhsb v9, v9 ;# 0x003f003f003f003f003f003f003f003f
- vspltisb v8, 9
-
- ;# roughly 1/7th difference across boundary
- vspltish v10, 7
- vmulosb v14, v8, v13 ;# A = c( c(P1-Q1) + 3*(Q0-P0))
- vmulesb v15, v8, v13
- vaddshs v14, v14, v9 ;# += 63
- vaddshs v15, v15, v9
- vsrah v14, v14, v10 ;# >>= 7
- vsrah v15, v15, v10
- vmrglh v10, v15, v14
- vmrghh v15, v15, v14
-
- vpkshss v10, v15, v10 ;# X = saturated down to bytes
-
- vsubsbs v6, v6, v10 ;# subtract from Q and add to P
- vaddsbs v1, v1, v10
-
- vxor v6, v6, v11
- vxor v1, v1, v11
-
- ;# roughly 2/7th difference across boundary
- vspltish v10, 7
- vaddubm v12, v8, v8
- vmulosb v14, v12, v13 ;# A = c( c(P1-Q1) + 3*(Q0-P0))
- vmulesb v15, v12, v13
- vaddshs v14, v14, v9
- vaddshs v15, v15, v9
- vsrah v14, v14, v10 ;# >>= 7
- vsrah v15, v15, v10
- vmrglh v10, v15, v14
- vmrghh v15, v15, v14
-
- vpkshss v10, v15, v10 ;# X = saturated down to bytes
-
- vsubsbs v5, v5, v10 ;# subtract from Q and add to P
- vaddsbs v2, v2, v10
-
- vxor v5, v5, v11
- vxor v2, v2, v11
-
- ;# roughly 3/7th difference across boundary
- vspltish v10, 7
- vaddubm v12, v12, v8
- vmulosb v14, v12, v13 ;# A = c( c(P1-Q1) + 3*(Q0-P0))
- vmulesb v15, v12, v13
- vaddshs v14, v14, v9
- vaddshs v15, v15, v9
- vsrah v14, v14, v10 ;# >>= 7
- vsrah v15, v15, v10
- vmrglh v10, v15, v14
- vmrghh v15, v15, v14
-
- vpkshss v10, v15, v10 ;# X = saturated down to bytes
-
- vsubsbs v4, v4, v10 ;# subtract from Q and add to P
- vaddsbs v3, v3, v10
-
- vxor v4, v4, v11
- vxor v3, v3, v11
-.endm
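The three "roughly k/7th" blocks above multiply the masked, non-hev filter value by 9, 18 and 27 (built up from vspltisb v8, 9), add 63 and shift right by 7, then move the (Q2,P2), (Q1,P1) and (Q0,P0) pairs by that amount. A scalar sketch of one such step, using a hypothetical helper:

#include <stdint.h>

static int8_t clamp_s8(int v) { return v < -128 ? -128 : v > 127 ? 127 : (int8_t)v; }

/* One wide-filter step: k = 1 for the P2/Q2 pair, 2 for P1/Q1, 3 for P0/Q0,
 * so the adjustment grows with k as the 1/7th, 2/7th, 3/7th comments describe. */
static void wide_adjust(int8_t *p, int8_t *q, int f, int k) {
  int u = clamp_s8((9 * k * f + 63) >> 7);
  *q = clamp_s8(*q - u);
  *p = clamp_s8(*p + u);
}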
-
-.macro SBFilter
- Masks
-
- common_adjust v3, v4, v2, v5, 1
-
- ;# outer tap adjustments
- vspltisb v8, 1
-
- vaddubm v13, v13, v8 ;# f += 1
- vsrab v13, v13, v8 ;# f >>= 1
-
- vandc v13, v13, v10 ;# f &= ~hev
-
- vsubsbs v5, v5, v13 ;# u1 = c (SQ1 - f)
- vaddsbs v2, v2, v13 ;# u2 = c (SP1 + f)
-
- vxor v2, v2, v11
- vxor v3, v3, v11
- vxor v4, v4, v11
- vxor v5, v5, v11
-.endm
-
- .align 2
-mbloop_filter_horizontal_edge_y_ppc:
- mfspr r11, 256 ;# get old VRSAVE
- oris r12, r11, 0xffff
- mtspr 256, r12 ;# set VRSAVE
-
- build_constants r5, r6, r7, v8, v9, v10
-
- load_data_y
-
- vp8_mbfilter
-
- stvx v1, r7, r6 ;# P2
- stvx v2, 0, r6 ;# P1
- stvx v3, r7, r3 ;# P0
- stvx v4, 0, r3 ;# Q0
- stvx v5, r4, r3 ;# Q1
- stvx v6, r5, r3 ;# Q2
-
- mtspr 256, r11 ;# reset old VRSAVE
-
- blr
-
- .align 2
-;# r3 unsigned char *s
-;# r4 int p
-;# r5 const signed char *flimit
-;# r6 const signed char *limit
-;# r7 const signed char *thresh
-loop_filter_horizontal_edge_y_ppc:
- mfspr r11, 256 ;# get old VRSAVE
- oris r12, r11, 0xffff
- mtspr 256, r12 ;# set VRSAVE
-
- build_constants r5, r6, r7, v8, v9, v10
-
- load_data_y
-
- SBFilter
-
- stvx v2, 0, r6 ;# P1
- stvx v3, r7, r3 ;# P0
- stvx v4, 0, r3 ;# Q0
- stvx v5, r4, r3 ;# Q1
-
- mtspr 256, r11 ;# reset old VRSAVE
-
- blr
-
-;# Filtering a vertical mb. Each mb is aligned on a 16 byte boundary.
-;# So we can read in an entire mb aligned. However if we want to filter the mb
-;# edge we run into problems. For the loopfilter we require 4 bytes before the mb
- ;# and 4 after for a total of 8 bytes. Reading 16 bytes in order to get 4 is a bit
-;# of a waste. So this is an even uglier way to get around that.
-;# Using the regular register file words are read in and then saved back out to
-;# memory to align and order them up. Then they are read in using the
-;# vector register file.
-.macro RLVmb V, R
- lwzux r0, r3, r4
- stw r0, 4(\R)
- lwz r0,-4(r3)
- stw r0, 0(\R)
- lwzux r0, r3, r4
- stw r0,12(\R)
- lwz r0,-4(r3)
- stw r0, 8(\R)
- lvx \V, 0, \R
-.endm
-
-.macro WLVmb V, R
- stvx \V, 0, \R
- lwz r0,12(\R)
- stwux r0, r3, r4
- lwz r0, 8(\R)
- stw r0,-4(r3)
- lwz r0, 4(\R)
- stwux r0, r3, r4
- lwz r0, 0(\R)
- stw r0,-4(r3)
-.endm
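Each RLVmb invocation above gathers two rows: for each row it reads the 4 pels to the left of the vertical edge and the 4 pels to its right with scalar word loads, stores them into a 16-byte stack buffer, and then loads the buffer with one aligned lvx. A scalar sketch of that staging (illustrative only, not part of the deleted file; the function name and layout comments are assumptions read off the macro):

    #include <string.h>

    /* Hypothetical scalar equivalent of one RLVmb call: 'row0'/'row1' point at
     * the first pel to the right of the edge on two consecutive rows, 'buf' is
     * the 16-byte aligned stack area addressed by r9. */
    static void stage_two_rows(const unsigned char *row0,
                               const unsigned char *row1,
                               unsigned char buf[16]) {
      memcpy(buf + 0,  row0 - 4, 4);   /* 4 pels left of the edge, row n  */
      memcpy(buf + 4,  row0,     4);   /* 4 pels right of the edge, row n */
      memcpy(buf + 8,  row1 - 4, 4);   /* same for row n + 1              */
      memcpy(buf + 12, row1,     4);
    }

WLVmb performs the mirror-image scatter after filtering: the vector is stored back to the stack buffer and the words are then written out to the frame buffer.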
-
- .align 2
-;# r3 unsigned char *s
-;# r4 int p
-;# r5 const signed char *flimit
-;# r6 const signed char *limit
-;# r7 const signed char *thresh
-mbloop_filter_vertical_edge_y_ppc:
- mfspr r11, 256 ;# get old VRSAVE
- oris r12, r11, 0xffff
- ori r12, r12, 0xc000
- mtspr 256, r12 ;# set VRSAVE
-
- la r9, -48(r1) ;# temporary space for reading in vectors
- sub r3, r3, r4
-
- RLVmb v0, r9
- RLVmb v1, r9
- RLVmb v2, r9
- RLVmb v3, r9
- RLVmb v4, r9
- RLVmb v5, r9
- RLVmb v6, r9
- RLVmb v7, r9
-
- transpose8x16_fwd
-
- build_constants r5, r6, r7, v8, v9, v10
-
- vp8_mbfilter
-
- transpose8x16_inv
-
- add r3, r3, r4
- neg r4, r4
-
- WLVmb v17, r9
- WLVmb v16, r9
- WLVmb v15, r9
- WLVmb v14, r9
- WLVmb v13, r9
- WLVmb v12, r9
- WLVmb v11, r9
- WLVmb v10, r9
-
- mtspr 256, r11 ;# reset old VRSAVE
-
- blr
-
-.macro RL V, R, P
- lvx \V, 0, \R
- add \R, \R, \P
-.endm
-
-.macro WL V, R, P
- stvx \V, 0, \R
- add \R, \R, \P
-.endm
-
-.macro Fil P3, P2, P1, P0, Q0, Q1, Q2, Q3
- ;# K = |P0-P1| already
- Abs v14, v13, \Q0, \Q1 ;# M = |Q0-Q1|
- vmaxub v14, v14, v4 ;# M = max( |P0-P1|, |Q0-Q1|)
- vcmpgtub v10, v14, v0
-
- Abs v4, v5, \Q2, \Q3 ;# K = |Q2-Q3| = next |P0-P1|
-
- max_abs v14, v13, \Q1, \Q2 ;# M = max( M, |Q1-Q2|)
- max_abs v14, v13, \P1, \P2 ;# M = max( M, |P1-P2|)
- max_abs v14, v13, \P2, \P3 ;# M = max( M, |P2-P3|)
-
- vmaxub v14, v14, v4 ;# M = max interior abs diff
- vcmpgtub v9, v14, v2 ;# M = true if int_l exceeded
-
- Abs v14, v13, \P0, \Q0 ;# X = Abs( P0-Q0)
- vcmpgtub v8, v14, v3 ;# X = true if edge_l exceeded
- vor v8, v8, v9 ;# M = true if edge_l or int_l exceeded
-
- ;# replace P1,Q1 w/signed versions
- common_adjust \P0, \Q0, \P1, \Q1, 1
-
- vaddubm v13, v13, v1 ;# -16 <= M <= 15, saturation irrelevant
- vsrab v13, v13, v1
- vandc v13, v13, v10 ;# adjust P1,Q1 by (M+1)>>1 if ! hev
- vsubsbs \Q1, \Q1, v13
- vaddsbs \P1, \P1, v13
-
- vxor \P1, \P1, v11 ;# P1
- vxor \P0, \P0, v11 ;# P0
- vxor \Q0, \Q0, v11 ;# Q0
- vxor \Q1, \Q1, v11 ;# Q1
-.endm
-
-
- .align 2
-;# r3 unsigned char *s
-;# r4 int p
-;# r5 const signed char *flimit
-;# r6 const signed char *limit
-;# r7 const signed char *thresh
-loop_filter_vertical_edge_y_ppc:
- mfspr r11, 256 ;# get old VRSAVE
- oris r12, r11, 0xffff
- ori r12, r12, 0xffff
- mtspr 256, r12 ;# set VRSAVE
-
- addi r9, r3, 0
- RL v16, r9, r4
- RL v17, r9, r4
- RL v18, r9, r4
- RL v19, r9, r4
- RL v20, r9, r4
- RL v21, r9, r4
- RL v22, r9, r4
- RL v23, r9, r4
- RL v24, r9, r4
- RL v25, r9, r4
- RL v26, r9, r4
- RL v27, r9, r4
- RL v28, r9, r4
- RL v29, r9, r4
- RL v30, r9, r4
- lvx v31, 0, r9
-
- Transpose16x16
-
- vspltisb v1, 1
-
- build_constants r5, r6, r7, v3, v2, v0
-
- Abs v4, v5, v19, v18 ;# K(v4) = first |P0-P1|
-
- Fil v16, v17, v18, v19, v20, v21, v22, v23
- Fil v20, v21, v22, v23, v24, v25, v26, v27
- Fil v24, v25, v26, v27, v28, v29, v30, v31
-
- Transpose16x16
-
- addi r9, r3, 0
- WL v16, r9, r4
- WL v17, r9, r4
- WL v18, r9, r4
- WL v19, r9, r4
- WL v20, r9, r4
- WL v21, r9, r4
- WL v22, r9, r4
- WL v23, r9, r4
- WL v24, r9, r4
- WL v25, r9, r4
- WL v26, r9, r4
- WL v27, r9, r4
- WL v28, r9, r4
- WL v29, r9, r4
- WL v30, r9, r4
- stvx v31, 0, r9
-
- mtspr 256, r11 ;# reset old VRSAVE
-
- blr
-
-;# -=-=-=-=-=-=-=-=-=-=-=-=-=-=-=- UV FILTERING -=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
-.macro active_chroma_sel V
- andi. r7, r3, 8 ;# row origin modulo 16
- add r7, r7, r7 ;# selects selectors
- lis r12, _chromaSelectors@ha
- la r0, _chromaSelectors@l(r12)
- lwzux r0, r7, r0 ;# leave selector addr in r7
-
- lvx \V, 0, r0 ;# mask to concatenate active U,V pels
-.endm
-
-.macro hread_uv Dest, U, V, Offs, VMask
- lvx \U, \Offs, r3
- lvx \V, \Offs, r4
- vperm \Dest, \U, \V, \VMask ;# Dest = active part of U then V
-.endm
-
-.macro hwrite_uv New, U, V, Offs, Umask, Vmask
- vperm \U, \New, \U, \Umask ;# Combine new pels with siblings
- vperm \V, \New, \V, \Vmask
- stvx \U, \Offs, r3 ;# Write to frame buffer
- stvx \V, \Offs, r4
-.endm
-
-;# Process U,V in parallel.
-.macro load_chroma_h
- neg r9, r5 ;# r9 = -1 * stride
- add r8, r9, r9 ;# r8 = -2 * stride
- add r10, r5, r5 ;# r10 = 2 * stride
-
- active_chroma_sel v12
-
- ;# P3, Q3 are read-only; need not save addresses or sibling pels
- add r6, r8, r8 ;# r6 = -4 * stride
- hread_uv v0, v14, v15, r6, v12
- add r6, r10, r5 ;# r6 = 3 * stride
- hread_uv v7, v14, v15, r6, v12
-
- ;# Others are read/write; save addresses and sibling pels
-
- add r6, r8, r9 ;# r6 = -3 * stride
- hread_uv v1, v16, v17, r6, v12
- hread_uv v2, v18, v19, r8, v12
- hread_uv v3, v20, v21, r9, v12
- hread_uv v4, v22, v23, 0, v12
- hread_uv v5, v24, v25, r5, v12
- hread_uv v6, v26, v27, r10, v12
-.endm
-
-.macro uresult_sel V
- load_g \V, 4(r7)
-.endm
-
-.macro vresult_sel V
- load_g \V, 8(r7)
-.endm
-
-;# always write P1,P0,Q0,Q1
-.macro store_chroma_h
- uresult_sel v11
- vresult_sel v12
- hwrite_uv v2, v18, v19, r8, v11, v12
- hwrite_uv v3, v20, v21, r9, v11, v12
- hwrite_uv v4, v22, v23, 0, v11, v12
- hwrite_uv v5, v24, v25, r5, v11, v12
-.endm
-
- .align 2
-;# r3 unsigned char *u
-;# r4 unsigned char *v
-;# r5 int p
-;# r6 const signed char *flimit
-;# r7 const signed char *limit
-;# r8 const signed char *thresh
-mbloop_filter_horizontal_edge_uv_ppc:
- mfspr r11, 256 ;# get old VRSAVE
- oris r12, r11, 0xffff
- ori r12, r12, 0xffff
- mtspr 256, r12 ;# set VRSAVE
-
- build_constants r6, r7, r8, v8, v9, v10
-
- load_chroma_h
-
- vp8_mbfilter
-
- store_chroma_h
-
- hwrite_uv v1, v16, v17, r6, v11, v12 ;# v1 == P2
- hwrite_uv v6, v26, v27, r10, v11, v12 ;# v6 == Q2
-
- mtspr 256, r11 ;# reset old VRSAVE
-
- blr
-
- .align 2
-;# r3 unsigned char *u
-;# r4 unsigned char *v
-;# r5 int p
-;# r6 const signed char *flimit
-;# r7 const signed char *limit
-;# r8 const signed char *thresh
-loop_filter_horizontal_edge_uv_ppc:
- mfspr r11, 256 ;# get old VRSAVE
- oris r12, r11, 0xffff
- ori r12, r12, 0xffff
- mtspr 256, r12 ;# set VRSAVE
-
- build_constants r6, r7, r8, v8, v9, v10
-
- load_chroma_h
-
- SBFilter
-
- store_chroma_h
-
- mtspr 256, r11 ;# reset old VRSAVE
-
- blr
-
-.macro R V, R
- lwzux r0, r3, r5
- stw r0, 4(\R)
- lwz r0,-4(r3)
- stw r0, 0(\R)
- lwzux r0, r4, r5
- stw r0,12(\R)
- lwz r0,-4(r4)
- stw r0, 8(\R)
- lvx \V, 0, \R
-.endm
-
-
-.macro W V, R
- stvx \V, 0, \R
- lwz r0,12(\R)
- stwux r0, r4, r5
- lwz r0, 8(\R)
- stw r0,-4(r4)
- lwz r0, 4(\R)
- stwux r0, r3, r5
- lwz r0, 0(\R)
- stw r0,-4(r3)
-.endm
-
-.macro chroma_vread R
- sub r3, r3, r5 ;# back up one line for simplicity
- sub r4, r4, r5
-
- R v0, \R
- R v1, \R
- R v2, \R
- R v3, \R
- R v4, \R
- R v5, \R
- R v6, \R
- R v7, \R
-
- transpose8x16_fwd
-.endm
-
-.macro chroma_vwrite R
-
- transpose8x16_inv
-
- add r3, r3, r5
- add r4, r4, r5
- neg r5, r5 ;# Write rows back in reverse order
-
- W v17, \R
- W v16, \R
- W v15, \R
- W v14, \R
- W v13, \R
- W v12, \R
- W v11, \R
- W v10, \R
-.endm
-
- .align 2
-;# r3 unsigned char *u
-;# r4 unsigned char *v
-;# r5 int p
-;# r6 const signed char *flimit
-;# r7 const signed char *limit
-;# r8 const signed char *thresh
-mbloop_filter_vertical_edge_uv_ppc:
- mfspr r11, 256 ;# get old VRSAVE
- oris r12, r11, 0xffff
- ori r12, r12, 0xc000
- mtspr 256, r12 ;# set VRSAVE
-
- la r9, -48(r1) ;# temporary space for reading in vectors
-
- chroma_vread r9
-
- build_constants r6, r7, r8, v8, v9, v10
-
- vp8_mbfilter
-
- chroma_vwrite r9
-
- mtspr 256, r11 ;# reset old VRSAVE
-
- blr
-
- .align 2
-;# r3 unsigned char *u
-;# r4 unsigned char *v
-;# r5 int p
-;# r6 const signed char *flimit
-;# r7 const signed char *limit
-;# r8 const signed char *thresh
-loop_filter_vertical_edge_uv_ppc:
- mfspr r11, 256 ;# get old VRSAVE
- oris r12, r11, 0xffff
- ori r12, r12, 0xc000
- mtspr 256, r12 ;# set VRSAVE
-
- la r9, -48(r1) ;# temporary space for reading in vectors
-
- chroma_vread r9
-
- build_constants r6, r7, r8, v8, v9, v10
-
- SBFilter
-
- chroma_vwrite r9
-
- mtspr 256, r11 ;# reset old VRSAVE
-
- blr
-
-;# -=-=-=-=-=-=-=-=-=-=-=-=-=-= SIMPLE LOOP FILTER =-=-=-=-=-=-=-=-=-=-=-=-=-=-
-
-.macro vp8_simple_filter
- Abs v14, v13, v1, v2 ;# M = abs( P0 - Q0)
- vcmpgtub v8, v14, v8 ;# v8 = true if _over_ limit
-
- ;# preserve unsigned v0 and v3
- common_adjust v1, v2, v0, v3, 0
-
- vxor v1, v1, v11
- vxor v2, v2, v11 ;# cvt Q0, P0 back to pels
-.endm
-
-.macro simple_vertical
- addi r8, 0, 16
- addi r7, r5, 32
-
- lvx v0, 0, r5
- lvx v1, r8, r5
- lvx v2, 0, r7
- lvx v3, r8, r7
-
- lis r12, _B_hihi@ha
- la r0, _B_hihi@l(r12)
- lvx v16, 0, r0
-
- lis r12, _B_lolo@ha
- la r0, _B_lolo@l(r12)
- lvx v17, 0, r0
-
- Transpose4times4x4 v16, v17
- vp8_simple_filter
-
- vxor v0, v0, v11
- vxor v3, v3, v11 ;# cvt Q0, P0 back to pels
-
- Transpose4times4x4 v16, v17
-
- stvx v0, 0, r5
- stvx v1, r8, r5
- stvx v2, 0, r7
- stvx v3, r8, r7
-.endm
-
- .align 2
-;# r3 unsigned char *s
-;# r4 int p
-;# r5 const signed char *flimit
-loop_filter_simple_horizontal_edge_ppc:
- mfspr r11, 256 ;# get old VRSAVE
- oris r12, r11, 0xffff
- mtspr 256, r12 ;# set VRSAVE
-
- ;# build constants
- lvx v8, 0, r5 ;# flimit
-
- vspltisb v11, 8
- vspltisb v12, 4
- vslb v11, v11, v12 ;# 0x80808080808080808080808080808080
-
- neg r5, r4 ;# r5 = -1 * stride
- add r6, r5, r5 ;# r6 = -2 * stride
-
- lvx v0, r6, r3 ;# v0 = P1 = 16 pels two rows above edge
- lvx v1, r5, r3 ;# v1 = P0 = 16 pels one row above edge
- lvx v2, 0, r3 ;# v2 = Q0 = 16 pels one row below edge
- lvx v3, r4, r3 ;# v3 = Q1 = 16 pels two rows below edge
-
- vp8_simple_filter
-
- stvx v1, r5, r3 ;# store P0
- stvx v2, 0, r3 ;# store Q0
-
- mtspr 256, r11 ;# reset old VRSAVE
-
- blr
-
-.macro RLV Offs
- stw r0, (\Offs*4)(r5)
- lwzux r0, r7, r4
-.endm
-
-.macro WLV Offs
- lwz r0, (\Offs*4)(r5)
- stwux r0, r7, r4
-.endm
-
- .align 2
-;# r3 unsigned char *s
-;# r4 int p
-;# r5 const signed char *flimit
-loop_filter_simple_vertical_edge_ppc:
- mfspr r11, 256 ;# get old VRSAVE
- oris r12, r11, 0xffff
- ori r12, r12, 0xc000
- mtspr 256, r12 ;# set VRSAVE
-
- ;# build constants
- lvx v8, 0, r5 ;# flimit
-
- vspltisb v11, 8
- vspltisb v12, 4
- vslb v11, v11, v12 ;# 0x80808080808080808080808080808080
-
- la r5, -96(r1) ;# temporary space for reading in vectors
-
- ;# Store 4 pels at word "Offs" in temp array, then advance r7
- ;# to next row and read another 4 pels from the frame buffer.
-
- subi r7, r3, 2 ;# r7 -> 2 pels before start
- lwzx r0, 0, r7 ;# read first 4 pels
-
- ;# 16 unaligned word accesses
- RLV 0
- RLV 4
- RLV 8
- RLV 12
- RLV 1
- RLV 5
- RLV 9
- RLV 13
- RLV 2
- RLV 6
- RLV 10
- RLV 14
- RLV 3
- RLV 7
- RLV 11
-
- stw r0, (15*4)(r5) ;# write last 4 pels
-
- simple_vertical
-
- ;# Read temp array, write frame buffer.
- subi r7, r3, 2 ;# r7 -> 2 pels before start
- lwzx r0, 0, r5 ;# read/write first 4 pels
- stwx r0, 0, r7
-
- WLV 4
- WLV 8
- WLV 12
- WLV 1
- WLV 5
- WLV 9
- WLV 13
- WLV 2
- WLV 6
- WLV 10
- WLV 14
- WLV 3
- WLV 7
- WLV 11
- WLV 15
-
- mtspr 256, r11 ;# reset old VRSAVE
-
- blr
-
- .data
-
-_chromaSelectors:
- .long _B_hihi
- .long _B_Ures0
- .long _B_Vres0
- .long 0
- .long _B_lolo
- .long _B_Ures8
- .long _B_Vres8
- .long 0
-
- .align 4
-_B_Vres8:
- .byte 16, 17, 18, 19, 20, 21, 22, 23, 8, 9, 10, 11, 12, 13, 14, 15
-
- .align 4
-_B_Ures8:
- .byte 16, 17, 18, 19, 20, 21, 22, 23, 0, 1, 2, 3, 4, 5, 6, 7
-
- .align 4
-_B_lolo:
- .byte 8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31
-
- .align 4
-_B_Vres0:
- .byte 8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31
- .align 4
-_B_Ures0:
- .byte 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
-
- .align 4
-_B_hihi:
- .byte 0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23
--- a/vp8/common/ppc/platform_altivec.asm
+++ /dev/null
@@ -1,59 +1,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
- .globl save_platform_context
- .globl restore_platform_context
-
-.macro W V P
- stvx \V, 0, \P
- addi \P, \P, 16
-.endm
-
-.macro R V P
- lvx \V, 0, \P
- addi \P, \P, 16
-.endm
-
-;# r3 context_ptr
- .align 2
-save_platform_context:
- W v20, r3
- W v21, r3
- W v22, r3
- W v23, r3
- W v24, r3
- W v25, r3
- W v26, r3
- W v27, r3
- W v28, r3
- W v29, r3
- W v30, r3
- W v31, r3
-
- blr
-
-;# r3 context_ptr
- .align 2
-restore_platform_context:
- R v20, r3
- R v21, r3
- R v22, r3
- R v23, r3
- R v24, r3
- R v25, r3
- R v26, r3
- R v27, r3
- R v28, r3
- R v29, r3
- R v30, r3
- R v31, r3
-
- blr
--- a/vp8/common/ppc/recon_altivec.asm
+++ /dev/null
@@ -1,175 +1,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
- .globl recon4b_ppc
- .globl recon2b_ppc
- .globl recon_b_ppc
-
-.macro row_of16 Diff Pred Dst Stride
- lvx v1, 0, \Pred ;# v1 = pred = p0..p15
- addi \Pred, \Pred, 16 ;# next pred
- vmrghb v2, v0, v1 ;# v2 = 16-bit p0..p7
- lvx v3, 0, \Diff ;# v3 = d0..d7
- vaddshs v2, v2, v3 ;# v2 = r0..r7
- vmrglb v1, v0, v1 ;# v1 = 16-bit p8..p15
- lvx v3, r8, \Diff ;# v3 = d8..d15
- addi \Diff, \Diff, 32 ;# next diff
- vaddshs v3, v3, v1 ;# v3 = r8..r15
- vpkshus v2, v2, v3 ;# v2 = 8-bit r0..r15
- stvx v2, 0, \Dst ;# to dst
- add \Dst, \Dst, \Stride ;# next dst
-.endm
-
- .text
- .align 2
-;# r3 = short *diff_ptr,
-;# r4 = unsigned char *pred_ptr,
-;# r5 = unsigned char *dst_ptr,
-;# r6 = int stride
-recon4b_ppc:
- mfspr r0, 256 ;# get old VRSAVE
- stw r0, -8(r1) ;# save old VRSAVE to stack
- oris r0, r0, 0xf000
- mtspr 256,r0 ;# set VRSAVE
-
- vxor v0, v0, v0
- li r8, 16
-
- row_of16 r3, r4, r5, r6
- row_of16 r3, r4, r5, r6
- row_of16 r3, r4, r5, r6
- row_of16 r3, r4, r5, r6
-
- lwz r12, -8(r1) ;# restore old VRSAVE from stack
- mtspr 256, r12 ;# reset old VRSAVE
-
- blr
-
-.macro two_rows_of8 Diff Pred Dst Stride write_first_four_pels
- lvx v1, 0, \Pred ;# v1 = pred = p0..p15
- vmrghb v2, v0, v1 ;# v2 = 16-bit p0..p7
- lvx v3, 0, \Diff ;# v3 = d0..d7
- vaddshs v2, v2, v3 ;# v2 = r0..r7
- vmrglb v1, v0, v1 ;# v1 = 16-bit p8..p15
- lvx v3, r8, \Diff ;# v3 = d8..d15
- vaddshs v3, v3, v1 ;# v3 = r8..r15
- vpkshus v2, v2, v3 ;# v2 = 8-bit r0..r15
- stvx v2, 0, r10 ;# 2 rows to dst from buf
- lwz r0, 0(r10)
-.if \write_first_four_pels
- stw r0, 0(\Dst)
- .else
- stwux r0, \Dst, \Stride
-.endif
- lwz r0, 4(r10)
- stw r0, 4(\Dst)
- lwz r0, 8(r10)
- stwux r0, \Dst, \Stride ;# advance dst to next row
- lwz r0, 12(r10)
- stw r0, 4(\Dst)
-.endm
-
- .align 2
-;# r3 = short *diff_ptr,
-;# r4 = unsigned char *pred_ptr,
-;# r5 = unsigned char *dst_ptr,
-;# r6 = int stride
-
-recon2b_ppc:
- mfspr r0, 256 ;# get old VRSAVE
- stw r0, -8(r1) ;# save old VRSAVE to stack
- oris r0, r0, 0xf000
- mtspr 256,r0 ;# set VRSAVE
-
- vxor v0, v0, v0
- li r8, 16
-
- la r10, -48(r1) ;# buf
-
- two_rows_of8 r3, r4, r5, r6, 1
-
- addi r4, r4, 16; ;# next pred
- addi r3, r3, 32; ;# next diff
-
- two_rows_of8 r3, r4, r5, r6, 0
-
- lwz r12, -8(r1) ;# restore old VRSAVE from stack
- mtspr 256, r12 ;# reset old VRSAVE
-
- blr
-
-.macro get_two_diff_rows
- stw r0, 0(r10)
- lwz r0, 4(r3)
- stw r0, 4(r10)
- lwzu r0, 32(r3)
- stw r0, 8(r10)
- lwz r0, 4(r3)
- stw r0, 12(r10)
- lvx v3, 0, r10
-.endm
-
- .align 2
-;# r3 = short *diff_ptr,
-;# r4 = unsigned char *pred_ptr,
-;# r5 = unsigned char *dst_ptr,
-;# r6 = int stride
-recon_b_ppc:
- mfspr r0, 256 ;# get old VRSAVE
- stw r0, -8(r1) ;# save old VRSAVE to stack
- oris r0, r0, 0xf000
- mtspr 256,r0 ;# set VRSAVE
-
- vxor v0, v0, v0
-
- la r10, -48(r1) ;# buf
-
- lwz r0, 0(r4)
- stw r0, 0(r10)
- lwz r0, 16(r4)
- stw r0, 4(r10)
- lwz r0, 32(r4)
- stw r0, 8(r10)
- lwz r0, 48(r4)
- stw r0, 12(r10)
-
- lvx v1, 0, r10; ;# v1 = pred = p0..p15
-
- lwz r0, 0(r3) ;# v3 = d0..d7
-
- get_two_diff_rows
-
- vmrghb v2, v0, v1; ;# v2 = 16-bit p0..p7
- vaddshs v2, v2, v3; ;# v2 = r0..r7
-
- lwzu r0, 32(r3) ;# v3 = d8..d15
-
- get_two_diff_rows
-
- vmrglb v1, v0, v1; ;# v1 = 16-bit p8..p15
- vaddshs v3, v3, v1; ;# v3 = r8..r15
-
- vpkshus v2, v2, v3; ;# v2 = 8-bit r0..r15
- stvx v2, 0, r10; ;# 16 pels to dst from buf
-
- lwz r0, 0(r10)
- stw r0, 0(r5)
- lwz r0, 4(r10)
- stwux r0, r5, r6
- lwz r0, 8(r10)
- stwux r0, r5, r6
- lwz r0, 12(r10)
- stwx r0, r5, r6
-
- lwz r12, -8(r1) ;# restore old VRSAVE from stack
- mtspr 256, r12 ;# reset old VRSAVE
-
- blr
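The recon macros above (row_of16, two_rows_of8 and the recon_b_ppc body) all implement the same per-pixel operation: widen the 8-bit prediction to 16 bits (vmrghb/vmrglb against a zero vector), add the 16-bit residual with saturation (vaddshs), then pack back to bytes with unsigned saturation (vpkshus). A scalar sketch of that operation (illustrative only; the helper name is hypothetical):

    /* One reconstructed pixel: prediction plus residual, clamped to 0..255,
     * which is the clamp vpkshus applies when packing the 16-bit sums. */
    static unsigned char recon_pixel(unsigned char pred, short diff) {
      int v = (int)pred + diff;
      if (v < 0) v = 0;
      if (v > 255) v = 255;
      return (unsigned char)v;
    }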
--- a/vp8/common/ppc/systemdependent.c
+++ /dev/null
@@ -1,167 +1,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "subpixel.h"
-#include "loopfilter.h"
-#include "recon.h"
-#include "idct.h"
-#include "onyxc_int.h"
-
-void (*vp8_short_idct4x4)(short *input, short *output, int pitch);
-void (*vp8_short_idct4x4_1)(short *input, short *output, int pitch);
-void (*vp8_dc_only_idct)(short input_dc, short *output, int pitch);
-
-extern void (*vp9_post_proc_down_and_across)(
- unsigned char *src_ptr,
- unsigned char *dst_ptr,
- int src_pixels_per_line,
- int dst_pixels_per_line,
- int rows,
- int cols,
- int flimit
-);
-
-extern void (*vp9_mbpost_proc_down)(unsigned char *dst, int pitch, int rows, int cols, int flimit);
-extern void vp9_mbpost_proc_down_c(unsigned char *dst, int pitch, int rows, int cols, int flimit);
-extern void (*vp9_mbpost_proc_across_ip)(unsigned char *src, int pitch, int rows, int cols, int flimit);
-extern void vp9_mbpost_proc_across_ip_c(unsigned char *src, int pitch, int rows, int cols, int flimit);
-
-extern void vp9_post_proc_down_and_across_c
-(
- unsigned char *src_ptr,
- unsigned char *dst_ptr,
- int src_pixels_per_line,
- int dst_pixels_per_line,
- int rows,
- int cols,
- int flimit
-);
-void vp9_plane_add_noise_c(unsigned char *Start, unsigned int Width, unsigned int Height, int Pitch, int q, int a);
-
-extern copy_mem_block_function *vp9_copy_mem16x16;
-extern copy_mem_block_function *vp9_copy_mem8x8;
-extern copy_mem_block_function *vp9_copy_mem8x4;
-
-// PPC
-extern subpixel_predict_function sixtap_predict_ppc;
-extern subpixel_predict_function sixtap_predict8x4_ppc;
-extern subpixel_predict_function sixtap_predict8x8_ppc;
-extern subpixel_predict_function sixtap_predict16x16_ppc;
-extern subpixel_predict_function bilinear_predict4x4_ppc;
-extern subpixel_predict_function bilinear_predict8x4_ppc;
-extern subpixel_predict_function bilinear_predict8x8_ppc;
-extern subpixel_predict_function bilinear_predict16x16_ppc;
-
-extern copy_mem_block_function copy_mem16x16_ppc;
-
-void recon_b_ppc(short *diff_ptr, unsigned char *pred_ptr, unsigned char *dst_ptr, int stride);
-void recon2b_ppc(short *diff_ptr, unsigned char *pred_ptr, unsigned char *dst_ptr, int stride);
-void recon4b_ppc(short *diff_ptr, unsigned char *pred_ptr, unsigned char *dst_ptr, int stride);
-
-extern void short_idct4x4llm_ppc(short *input, short *output, int pitch);
-
-// Generic C
-extern subpixel_predict_function vp9_sixtap_predict_c;
-extern subpixel_predict_function vp9_sixtap_predict8x4_c;
-extern subpixel_predict_function vp9_sixtap_predict8x8_c;
-extern subpixel_predict_function vp9_sixtap_predict16x16_c;
-extern subpixel_predict_function vp9_bilinear_predict4x4_c;
-extern subpixel_predict_function vp9_bilinear_predict8x4_c;
-extern subpixel_predict_function vp9_bilinear_predict8x8_c;
-extern subpixel_predict_function vp9_bilinear_predict16x16_c;
-
-extern copy_mem_block_function vp9_copy_mem16x16_c;
-extern copy_mem_block_function vp9_copy_mem8x8_c;
-extern copy_mem_block_function vp9_copy_mem8x4_c;
-
-void vp9_recon_b_c(short *diff_ptr, unsigned char *pred_ptr, unsigned char *dst_ptr, int stride);
-void vp9_recon2b_c(short *diff_ptr, unsigned char *pred_ptr, unsigned char *dst_ptr, int stride);
-void vp9_recon4b_c(short *diff_ptr, unsigned char *pred_ptr, unsigned char *dst_ptr, int stride);
-
-extern void vp9_short_idct4x4llm_1_c(short *input, short *output, int pitch);
-extern void vp9_short_idct4x4llm_c(short *input, short *output, int pitch);
-extern void vp8_dc_only_idct_c(short input_dc, short *output, int pitch);
-
-// PPC
-extern loop_filter_block_function loop_filter_mbv_ppc;
-extern loop_filter_block_function loop_filter_bv_ppc;
-extern loop_filter_block_function loop_filter_mbh_ppc;
-extern loop_filter_block_function loop_filter_bh_ppc;
-
-extern loop_filter_block_function loop_filter_mbvs_ppc;
-extern loop_filter_block_function loop_filter_bvs_ppc;
-extern loop_filter_block_function loop_filter_mbhs_ppc;
-extern loop_filter_block_function loop_filter_bhs_ppc;
-
-// Generic C
-extern loop_filter_block_function vp9_loop_filter_mbv_c;
-extern loop_filter_block_function vp9_loop_filter_bv_c;
-extern loop_filter_block_function vp9_loop_filter_mbh_c;
-extern loop_filter_block_function vp9_loop_filter_bh_c;
-
-extern loop_filter_block_function vp9_loop_filter_mbvs_c;
-extern loop_filter_block_function vp9_loop_filter_bvs_c;
-extern loop_filter_block_function vp9_loop_filter_mbhs_c;
-extern loop_filter_block_function vp9_loop_filter_bhs_c;
-
-extern loop_filter_block_function *vp8_lf_mbvfull;
-extern loop_filter_block_function *vp8_lf_mbhfull;
-extern loop_filter_block_function *vp8_lf_bvfull;
-extern loop_filter_block_function *vp8_lf_bhfull;
-
-extern loop_filter_block_function *vp8_lf_mbvsimple;
-extern loop_filter_block_function *vp8_lf_mbhsimple;
-extern loop_filter_block_function *vp8_lf_bvsimple;
-extern loop_filter_block_function *vp8_lf_bhsimple;
-
-void vp9_clear_c(void) {
-}
-
-void vp9_machine_specific_config(void) {
- // Pure C:
- vp9_clear_system_state = vp9_clear_c;
- vp9_recon_b = vp9_recon_b_c;
- vp9_recon4b = vp9_recon4b_c;
- vp9_recon2b = vp9_recon2b_c;
-
- vp9_bilinear_predict16x16 = bilinear_predict16x16_ppc;
- vp9_bilinear_predict8x8 = bilinear_predict8x8_ppc;
- vp9_bilinear_predict8x4 = bilinear_predict8x4_ppc;
- vp8_bilinear_predict = bilinear_predict4x4_ppc;
-
- vp9_sixtap_predict16x16 = sixtap_predict16x16_ppc;
- vp9_sixtap_predict8x8 = sixtap_predict8x8_ppc;
- vp9_sixtap_predict8x4 = sixtap_predict8x4_ppc;
- vp9_sixtap_predict = sixtap_predict_ppc;
-
- vp8_short_idct4x4_1 = vp9_short_idct4x4llm_1_c;
- vp8_short_idct4x4 = short_idct4x4llm_ppc;
- vp8_dc_only_idct = vp8_dc_only_idct_c;
-
- vp8_lf_mbvfull = loop_filter_mbv_ppc;
- vp8_lf_bvfull = loop_filter_bv_ppc;
- vp8_lf_mbhfull = loop_filter_mbh_ppc;
- vp8_lf_bhfull = loop_filter_bh_ppc;
-
- vp8_lf_mbvsimple = loop_filter_mbvs_ppc;
- vp8_lf_bvsimple = loop_filter_bvs_ppc;
- vp8_lf_mbhsimple = loop_filter_mbhs_ppc;
- vp8_lf_bhsimple = loop_filter_bhs_ppc;
-
- vp9_post_proc_down_and_across = vp9_post_proc_down_and_across_c;
- vp9_mbpost_proc_down = vp9_mbpost_proc_down_c;
- vp9_mbpost_proc_across_ip = vp9_mbpost_proc_across_ip_c;
- vp9_plane_add_noise = vp9_plane_add_noise_c;
-
- vp9_copy_mem16x16 = copy_mem16x16_ppc;
- vp9_copy_mem8x8 = vp9_copy_mem8x8_c;
- vp9_copy_mem8x4 = vp9_copy_mem8x4_c;
-
-}
--- a/vp8/common/ppflags.h
+++ /dev/null
@@ -1,38 +1,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#ifndef __INC_PPFLAGS_H
-#define __INC_PPFLAGS_H
-enum {
- VP9D_NOFILTERING = 0,
- VP9D_DEBLOCK = 1 << 0,
- VP9D_DEMACROBLOCK = 1 << 1,
- VP9D_ADDNOISE = 1 << 2,
- VP9D_DEBUG_TXT_FRAME_INFO = 1 << 3,
- VP9D_DEBUG_TXT_MBLK_MODES = 1 << 4,
- VP9D_DEBUG_TXT_DC_DIFF = 1 << 5,
- VP9D_DEBUG_TXT_RATE_INFO = 1 << 6,
- VP9D_DEBUG_DRAW_MV = 1 << 7,
- VP9D_DEBUG_CLR_BLK_MODES = 1 << 8,
- VP9D_DEBUG_CLR_FRM_REF_BLKS = 1 << 9
-};
-
-typedef struct {
- int post_proc_flag;
- int deblocking_level;
- int noise_level;
- int display_ref_frame_flag;
- int display_mb_modes_flag;
- int display_b_modes_flag;
- int display_mv_flag;
-} vp9_ppflags_t;
-
-#endif
--- a/vp8/common/pragmas.h
+++ /dev/null
@@ -1,19 +1,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-
-
-
-#ifdef __INTEL_COMPILER
-#pragma warning(disable:997 1011 170)
-#endif
-#ifdef _MSC_VER
-#pragma warning(disable:4799)
-#endif
--- a/vp8/common/pred_common.c
+++ /dev/null
@@ -1,463 +1,0 @@
-
-/*
- * Copyright (c) 2012 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "vp8/common/pred_common.h"
-#include "vp8/common/seg_common.h"
-
-// TBD prediction functions for various bitstream signals
-
-// Returns a context number for the given MB prediction signal
-unsigned char vp9_get_pred_context(const VP9_COMMON *const cm,
- const MACROBLOCKD *const xd,
- PRED_ID pred_id) {
- int pred_context;
- MODE_INFO *m = xd->mode_info_context;
-
- // Note:
- // The mode info data structure has a one element border above and to the
- // left of the entries corresponding to real macroblocks.
- // The prediction flags in these dummy entries are initialised to 0.
- switch (pred_id) {
- case PRED_SEG_ID:
- pred_context = (m - 1)->mbmi.seg_id_predicted +
- (m - cm->mode_info_stride)->mbmi.seg_id_predicted;
- break;
-
-
- case PRED_REF:
- pred_context = (m - 1)->mbmi.ref_predicted +
- (m - cm->mode_info_stride)->mbmi.ref_predicted;
- break;
-
- case PRED_COMP:
- // Context based on use of comp pred flag by neighbours
- // pred_context =
- // ((m - 1)->mbmi.second_ref_frame != INTRA_FRAME) +
- // ((m - cm->mode_info_stride)->mbmi.second_ref_frame != INTRA_FRAME);
-
- // Context based on mode and reference frame
- // if ( m->mbmi.ref_frame == LAST_FRAME )
- // pred_context = 0 + (m->mbmi.mode != ZEROMV);
- // else if ( m->mbmi.ref_frame == GOLDEN_FRAME )
- // pred_context = 2 + (m->mbmi.mode != ZEROMV);
- // else
- // pred_context = 4 + (m->mbmi.mode != ZEROMV);
-
- if (m->mbmi.ref_frame == LAST_FRAME)
- pred_context = 0;
- else
- pred_context = 1;
-
- break;
-
- case PRED_MBSKIP:
- pred_context = (m - 1)->mbmi.mb_skip_coeff +
- (m - cm->mode_info_stride)->mbmi.mb_skip_coeff;
- break;
-
- case PRED_SWITCHABLE_INTERP:
- {
- int left_in_image = (m - 1)->mbmi.mb_in_image;
- int above_in_image = (m - cm->mode_info_stride)->mbmi.mb_in_image;
- int left_mode = (m - 1)->mbmi.mode;
- int above_mode = (m - cm->mode_info_stride)->mbmi.mode;
- int left_interp, above_interp;
- if (left_in_image && left_mode >= NEARESTMV && left_mode <= SPLITMV)
- left_interp = vp9_switchable_interp_map[(m - 1)->mbmi.interp_filter];
- else
- left_interp = VP9_SWITCHABLE_FILTERS;
- if (above_in_image && above_mode >= NEARESTMV && above_mode <= SPLITMV)
- above_interp = vp9_switchable_interp_map[
- (m - cm->mode_info_stride)->mbmi.interp_filter];
- else
- above_interp = VP9_SWITCHABLE_FILTERS;
-
- if (left_interp == above_interp)
- pred_context = left_interp;
- else if (left_interp == VP9_SWITCHABLE_FILTERS &&
- above_interp != VP9_SWITCHABLE_FILTERS)
- pred_context = above_interp;
- else if (left_interp != VP9_SWITCHABLE_FILTERS &&
- above_interp == VP9_SWITCHABLE_FILTERS)
- pred_context = left_interp;
- else
- pred_context = VP9_SWITCHABLE_FILTERS;
- }
- break;
-
- default:
- // TODO *** add error trap code.
- pred_context = 0;
- break;
- }
-
- return pred_context;
-}
-
-// This function returns a context probability for coding a given
-// prediction signal
-vp9_prob vp9_get_pred_prob(const VP9_COMMON *const cm,
- const MACROBLOCKD *const xd,
- PRED_ID pred_id) {
- vp9_prob pred_probability;
- int pred_context;
-
- // Get the appropriate prediction context
- pred_context = vp9_get_pred_context(cm, xd, pred_id);
-
- switch (pred_id) {
- case PRED_SEG_ID:
- pred_probability = cm->segment_pred_probs[pred_context];
- break;
-
- case PRED_REF:
- pred_probability = cm->ref_pred_probs[pred_context];
- break;
-
- case PRED_COMP:
- // In keeping with convention elsewhere the probability returned is
- // the probability of a "0" outcome which in this case means the
- // probability of comp pred off.
- pred_probability = cm->prob_comppred[pred_context];
- break;
-
- case PRED_MBSKIP:
- pred_probability = cm->mbskip_pred_probs[pred_context];
- break;
-
- default:
- // TODO *** add error trap code.
- pred_probability = 128;
- break;
- }
-
- return pred_probability;
-}
-
-// This function returns a context probability ptr for coding a given
-// prediction signal
-const vp9_prob *vp9_get_pred_probs(const VP9_COMMON *const cm,
- const MACROBLOCKD *const xd,
- PRED_ID pred_id) {
- const vp9_prob *pred_probability;
- int pred_context;
-
- // Get the appropriate prediction context
- pred_context = vp9_get_pred_context(cm, xd, pred_id);
-
- switch (pred_id) {
- case PRED_SEG_ID:
- pred_probability = &cm->segment_pred_probs[pred_context];
- break;
-
- case PRED_REF:
- pred_probability = &cm->ref_pred_probs[pred_context];
- break;
-
- case PRED_COMP:
- // In keeping with convention elsewhere the probability returned is
- // the probability of a "0" outcome which in this case means the
- // probability of comp pred off.
- pred_probability = &cm->prob_comppred[pred_context];
- break;
-
- case PRED_MBSKIP:
- pred_probability = &cm->mbskip_pred_probs[pred_context];
- break;
-
- case PRED_SWITCHABLE_INTERP:
- pred_probability = &cm->fc.switchable_interp_prob[pred_context][0];
- break;
-
- default:
- // TODO *** add error trap code.
- pred_probability = NULL;
- break;
- }
-
- return pred_probability;
-}
-
-// This function returns the status of the given prediction signal.
-// I.e. is the predicted value for the given signal correct.
-unsigned char vp9_get_pred_flag(const MACROBLOCKD *const xd,
- PRED_ID pred_id) {
- unsigned char pred_flag = 0;
-
- switch (pred_id) {
- case PRED_SEG_ID:
- pred_flag = xd->mode_info_context->mbmi.seg_id_predicted;
- break;
-
- case PRED_REF:
- pred_flag = xd->mode_info_context->mbmi.ref_predicted;
- break;
-
- case PRED_MBSKIP:
- pred_flag = xd->mode_info_context->mbmi.mb_skip_coeff;
- break;
-
- default:
- // TODO *** add error trap code.
- pred_flag = 0;
- break;
- }
-
- return pred_flag;
-}
-
-// This function sets the status of the given prediction signal.
-// I.e. is the predicted value for the given signal correct.
-void vp9_set_pred_flag(MACROBLOCKD *const xd,
- PRED_ID pred_id,
- unsigned char pred_flag) {
-#if CONFIG_SUPERBLOCKS
- const int mis = xd->mode_info_stride;
-#endif
-
- switch (pred_id) {
- case PRED_SEG_ID:
- xd->mode_info_context->mbmi.seg_id_predicted = pred_flag;
-#if CONFIG_SUPERBLOCKS
- if (xd->mode_info_context->mbmi.encoded_as_sb) {
- if (xd->mb_to_right_edge > 0)
- xd->mode_info_context[1].mbmi.seg_id_predicted = pred_flag;
- if (xd->mb_to_bottom_edge > 0) {
- xd->mode_info_context[mis].mbmi.seg_id_predicted = pred_flag;
- if (xd->mb_to_right_edge > 0)
- xd->mode_info_context[mis + 1].mbmi.seg_id_predicted = pred_flag;
- }
- }
-#endif
- break;
-
- case PRED_REF:
- xd->mode_info_context->mbmi.ref_predicted = pred_flag;
-#if CONFIG_SUPERBLOCKS
- if (xd->mode_info_context->mbmi.encoded_as_sb) {
- if (xd->mb_to_right_edge > 0)
- xd->mode_info_context[1].mbmi.ref_predicted = pred_flag;
- if (xd->mb_to_bottom_edge > 0) {
- xd->mode_info_context[mis].mbmi.ref_predicted = pred_flag;
- if (xd->mb_to_right_edge > 0)
- xd->mode_info_context[mis + 1].mbmi.ref_predicted = pred_flag;
- }
- }
-#endif
- break;
-
- case PRED_MBSKIP:
- xd->mode_info_context->mbmi.mb_skip_coeff = pred_flag;
-#if CONFIG_SUPERBLOCKS
- if (xd->mode_info_context->mbmi.encoded_as_sb) {
- if (xd->mb_to_right_edge > 0)
- xd->mode_info_context[1].mbmi.mb_skip_coeff = pred_flag;
- if (xd->mb_to_bottom_edge > 0) {
- xd->mode_info_context[mis].mbmi.mb_skip_coeff = pred_flag;
- if (xd->mb_to_right_edge > 0)
- xd->mode_info_context[mis + 1].mbmi.mb_skip_coeff = pred_flag;
- }
- }
-#endif
- break;
-
- default:
- // TODO *** add error trap code.
- break;
- }
-}
-
-
- // The following functions contain the guts of the prediction code used to
- // predict various bitstream signals.
-
-// Macroblock segment id prediction function
-unsigned char vp9_get_pred_mb_segid(const VP9_COMMON *const cm,
- const MACROBLOCKD *const xd, int MbIndex) {
- // Currently the prediction for the macroblock segment ID is
- // the value stored for this macroblock in the previous frame.
-#if CONFIG_SUPERBLOCKS
- if (!xd->mode_info_context->mbmi.encoded_as_sb) {
-#endif
- return cm->last_frame_seg_map[MbIndex];
-#if CONFIG_SUPERBLOCKS
- } else {
- int seg_id = cm->last_frame_seg_map[MbIndex];
- int mb_col = MbIndex % cm->mb_cols;
- int mb_row = MbIndex / cm->mb_cols;
- if (mb_col + 1 < cm->mb_cols)
- seg_id = seg_id && cm->last_frame_seg_map[MbIndex + 1];
- if (mb_row + 1 < cm->mb_rows) {
- seg_id = seg_id && cm->last_frame_seg_map[MbIndex + cm->mb_cols];
- if (mb_col + 1 < cm->mb_cols)
- seg_id = seg_id && cm->last_frame_seg_map[MbIndex + cm->mb_cols + 1];
- }
- return seg_id;
- }
-#endif
-}
-
-MV_REFERENCE_FRAME vp9_get_pred_ref(const VP9_COMMON *const cm,
- const MACROBLOCKD *const xd) {
- MODE_INFO *m = xd->mode_info_context;
-
- MV_REFERENCE_FRAME left;
- MV_REFERENCE_FRAME above;
- MV_REFERENCE_FRAME above_left;
- MV_REFERENCE_FRAME pred_ref = LAST_FRAME;
-
- int segment_id = xd->mode_info_context->mbmi.segment_id;
- int seg_ref_active;
- int i;
-
- unsigned char frame_allowed[MAX_REF_FRAMES] = {1, 1, 1, 1};
- unsigned char ref_score[MAX_REF_FRAMES];
- unsigned char best_score = 0;
- unsigned char left_in_image;
- unsigned char above_in_image;
- unsigned char above_left_in_image;
-
- // Is segment coding enabled?
- seg_ref_active = vp9_segfeature_active(xd, segment_id, SEG_LVL_REF_FRAME);
-
- // Special case treatment if segment coding is enabled.
- // Don't allow prediction of a reference frame that the segment
- // does not allow
- if (seg_ref_active) {
- for (i = 0; i < MAX_REF_FRAMES; i++) {
- frame_allowed[i] =
- vp9_check_segref(xd, segment_id, i);
-
- // Score set to 0 if ref frame not allowed
- ref_score[i] = cm->ref_scores[i] * frame_allowed[i];
- }
- } else
- vpx_memcpy(ref_score, cm->ref_scores, sizeof(ref_score));
-
- // Reference frames used by neighbours
- left = (m - 1)->mbmi.ref_frame;
- above = (m - cm->mode_info_stride)->mbmi.ref_frame;
- above_left = (m - 1 - cm->mode_info_stride)->mbmi.ref_frame;
-
- // Are neighbours in image
- left_in_image = (m - 1)->mbmi.mb_in_image;
- above_in_image = (m - cm->mode_info_stride)->mbmi.mb_in_image;
- above_left_in_image = (m - 1 - cm->mode_info_stride)->mbmi.mb_in_image;
-
- // Adjust scores for candidate reference frames based on neighbours
- if (frame_allowed[left] && left_in_image) {
- ref_score[left] += 16;
- if (above_left_in_image && (left == above_left))
- ref_score[left] += 4;
- }
- if (frame_allowed[above] && above_in_image) {
- ref_score[above] += 16;
- if (above_left_in_image && (above == above_left))
- ref_score[above] += 4;
- }
-
- // Now choose the candidate with the highest score
- for (i = 0; i < MAX_REF_FRAMES; i++) {
- if (ref_score[i] > best_score) {
- pred_ref = i;
- best_score = ref_score[i];
- }
- }
-
- return pred_ref;
-}
-
- // Functions to compute a set of modified reference frame probabilities
-// to use when the prediction of the reference frame value fails
-void vp9_calc_ref_probs(int *count, vp9_prob *probs) {
- int tot_count;
-
- tot_count = count[0] + count[1] + count[2] + count[3];
- if (tot_count) {
- probs[0] = (vp9_prob)((count[0] * 255 + (tot_count >> 1)) / tot_count);
- probs[0] += !probs[0];
- } else
- probs[0] = 128;
-
- tot_count -= count[0];
- if (tot_count) {
- probs[1] = (vp9_prob)((count[1] * 255 + (tot_count >> 1)) / tot_count);
- probs[1] += !probs[1];
- } else
- probs[1] = 128;
-
- tot_count -= count[1];
- if (tot_count) {
- probs[2] = (vp9_prob)((count[2] * 255 + (tot_count >> 1)) / tot_count);
- probs[2] += !probs[2];
- } else
- probs[2] = 128;
-
-}
-
-// Computes a set of modified conditional probabilities for the reference frame
- // Values will be set to 0 for reference frame options that are not possible
- // because either they were predicted and prediction has failed or because
-// they are not allowed for a given segment.
-void vp9_compute_mod_refprobs(VP9_COMMON *const cm) {
- int norm_cnt[MAX_REF_FRAMES];
- int intra_count;
- int inter_count;
- int last_count;
- int gfarf_count;
- int gf_count;
- int arf_count;
-
- intra_count = cm->prob_intra_coded;
- inter_count = (255 - intra_count);
- last_count = (inter_count * cm->prob_last_coded) / 255;
- gfarf_count = inter_count - last_count;
- gf_count = (gfarf_count * cm->prob_gf_coded) / 255;
- arf_count = gfarf_count - gf_count;
-
- // Work out modified reference frame probabilities to use where prediction
- // of the reference frame fails
- norm_cnt[0] = 0;
- norm_cnt[1] = last_count;
- norm_cnt[2] = gf_count;
- norm_cnt[3] = arf_count;
- vp9_calc_ref_probs(norm_cnt, cm->mod_refprobs[INTRA_FRAME]);
- cm->mod_refprobs[INTRA_FRAME][0] = 0; // This branch implicit
-
- norm_cnt[0] = intra_count;
- norm_cnt[1] = 0;
- norm_cnt[2] = gf_count;
- norm_cnt[3] = arf_count;
- vp9_calc_ref_probs(norm_cnt, cm->mod_refprobs[LAST_FRAME]);
- cm->mod_refprobs[LAST_FRAME][1] = 0; // This branch implicit
-
- norm_cnt[0] = intra_count;
- norm_cnt[1] = last_count;
- norm_cnt[2] = 0;
- norm_cnt[3] = arf_count;
- vp9_calc_ref_probs(norm_cnt, cm->mod_refprobs[GOLDEN_FRAME]);
- cm->mod_refprobs[GOLDEN_FRAME][2] = 0; // This branch implicit
-
- norm_cnt[0] = intra_count;
- norm_cnt[1] = last_count;
- norm_cnt[2] = gf_count;
- norm_cnt[3] = 0;
- vp9_calc_ref_probs(norm_cnt, cm->mod_refprobs[ALTREF_FRAME]);
- cm->mod_refprobs[ALTREF_FRAME][2] = 0; // This branch implicit
-
- // Score the reference frames based on overall frequency.
- // These scores contribute to the prediction choices.
- // Max score 17 min 1
- cm->ref_scores[INTRA_FRAME] = 1 + (intra_count * 16 / 255);
- cm->ref_scores[LAST_FRAME] = 1 + (last_count * 16 / 255);
- cm->ref_scores[GOLDEN_FRAME] = 1 + (gf_count * 16 / 255);
- cm->ref_scores[ALTREF_FRAME] = 1 + (arf_count * 16 / 255);
-}
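A worked illustration of vp9_calc_ref_probs() above (the counts are made up, not taken from the codebase): with counts {intra, last, golden, altref} = {64, 96, 64, 31}, the running total starts at 255 and the three node probabilities come out as

    probs[0] = (64 * 255 + 127) / 255 = 64    (total 255; 255 - 64 = 191 remains)
    probs[1] = (96 * 255 +  95) / 191 = 128   (191 - 96 = 95 remains)
    probs[2] = (64 * 255 +  47) /  95 = 172

each nudged up to 1 if it would otherwise round to 0. vp9_compute_mod_refprobs() calls this helper four times, once per reference frame, with that frame's own count zeroed, and then forces the corresponding branch probability to 0 since that branch is implicit.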
--- a/vp8/common/pred_common.h
+++ /dev/null
@@ -1,56 +1,0 @@
-/*
- * Copyright (c) 2012 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "type_aliases.h"
-#include "onyxc_int.h"
-#include "vp8/common/blockd.h"
-
-#ifndef __INC_PRED_COMMON_H__
-#define __INC_PRED_COMMON_H__ 1
-
-
-// Predicted items
-typedef enum {
- PRED_SEG_ID = 0, // Segment identifier
- PRED_REF = 1,
- PRED_COMP = 2,
- PRED_MBSKIP = 3,
- PRED_SWITCHABLE_INTERP = 4
-} PRED_ID;
-
-extern unsigned char vp9_get_pred_context(const VP9_COMMON *const cm,
- const MACROBLOCKD *const xd,
- PRED_ID pred_id);
-
-extern vp9_prob vp9_get_pred_prob(const VP9_COMMON *const cm,
- const MACROBLOCKD *const xd,
- PRED_ID pred_id);
-
-extern const vp9_prob *vp9_get_pred_probs(const VP9_COMMON *const cm,
- const MACROBLOCKD *const xd,
- PRED_ID pred_id);
-
-extern unsigned char vp9_get_pred_flag(const MACROBLOCKD *const xd,
- PRED_ID pred_id);
-
-extern void vp9_set_pred_flag(MACROBLOCKD *const xd,
- PRED_ID pred_id,
- unsigned char pred_flag);
-
-
-extern unsigned char vp9_get_pred_mb_segid(const VP9_COMMON *const cm,
- const MACROBLOCKD *const xd,
- int MbIndex);
-
-extern MV_REFERENCE_FRAME vp9_get_pred_ref(const VP9_COMMON *const cm,
- const MACROBLOCKD *const xd);
-extern void vp9_compute_mod_refprobs(VP9_COMMON *const cm);
-
-#endif /* __INC_PRED_COMMON_H__ */
--- a/vp8/common/quant_common.c
+++ /dev/null
@@ -1,125 +1,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include "quant_common.h"
-
-static int dc_qlookup[QINDEX_RANGE];
-static int ac_qlookup[QINDEX_RANGE];
-
-#define ACDC_MIN 4
-
-void vp9_init_quant_tables() {
- int i;
- int current_val = 4;
- int last_val = 4;
- int ac_val;
-
- for (i = 0; i < QINDEX_RANGE; i++) {
- ac_qlookup[i] = current_val;
- current_val = (int)((double)current_val * 1.02);
- if (current_val == last_val)
- current_val++;
- last_val = current_val;
-
- ac_val = ac_qlookup[i];
- dc_qlookup[i] = (0.000000305 * ac_val * ac_val * ac_val) +
- (-0.00065 * ac_val * ac_val) +
- (0.9 * ac_val) + 0.5;
- if (dc_qlookup[i] < ACDC_MIN)
- dc_qlookup[i] = ACDC_MIN;
- }
-}
-
-int vp9_dc_quant(int QIndex, int Delta) {
- int retval;
-
- QIndex = QIndex + Delta;
-
- if (QIndex > MAXQ)
- QIndex = MAXQ;
- else if (QIndex < 0)
- QIndex = 0;
-
- retval = dc_qlookup[ QIndex ];
- return retval;
-}
-
-int vp9_dc2quant(int QIndex, int Delta) {
- int retval;
-
- QIndex = QIndex + Delta;
-
- if (QIndex > MAXQ)
- QIndex = MAXQ;
- else if (QIndex < 0)
- QIndex = 0;
-
- retval = dc_qlookup[ QIndex ];
-
- return retval;
-
-}
-int vp9_dc_uv_quant(int QIndex, int Delta) {
- int retval;
-
- QIndex = QIndex + Delta;
-
- if (QIndex > MAXQ)
- QIndex = MAXQ;
- else if (QIndex < 0)
- QIndex = 0;
-
- retval = dc_qlookup[ QIndex ];
-
- return retval;
-}
-
-int vp9_ac_yquant(int QIndex) {
- int retval;
-
- if (QIndex > MAXQ)
- QIndex = MAXQ;
- else if (QIndex < 0)
- QIndex = 0;
-
- retval = ac_qlookup[ QIndex ];
- return retval;
-}
-
-int vp9_ac2quant(int QIndex, int Delta) {
- int retval;
-
- QIndex = QIndex + Delta;
-
- if (QIndex > MAXQ)
- QIndex = MAXQ;
- else if (QIndex < 0)
- QIndex = 0;
-
- retval = (ac_qlookup[ QIndex ] * 775) / 1000;
- if (retval < 4)
- retval = 4;
-
- return retval;
-}
-int vp9_ac_uv_quant(int QIndex, int Delta) {
- int retval;
-
- QIndex = QIndex + Delta;
-
- if (QIndex > MAXQ)
- QIndex = MAXQ;
- else if (QIndex < 0)
- QIndex = 0;
-
- retval = ac_qlookup[ QIndex ];
- return retval;
-}
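For orientation (illustrative only): vp9_init_quant_tables() above multiplies the running AC value by 1.02 each step, starting from 4, and bumps it by one whenever integer truncation would otherwise repeat the previous value, so the table begins 4, 5, 6, 7, ... and the 2% growth only dominates once the AC value reaches roughly 50. Each DC entry is a cubic fit of the corresponding AC value, floored at ACDC_MIN:

    dc_qlookup[i] = (int)(0.000000305 * ac*ac*ac - 0.00065 * ac*ac + 0.9 * ac + 0.5)

so, for example, ac = 6 gives dc = (int)5.88 = 5. The vp9_*quant() accessors above simply clamp QIndex + Delta to [0, MAXQ] and index these tables; vp9_ac2quant() additionally scales the AC value by 775/1000 with a floor of 4.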
--- a/vp8/common/quant_common.h
+++ /dev/null
@@ -1,22 +1,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include "string.h"
-#include "blockd.h"
-#include "onyxc_int.h"
-
-extern void vp9_init_quant_tables();
-extern int vp9_ac_yquant(int QIndex);
-extern int vp9_dc_quant(int QIndex, int Delta);
-extern int vp9_dc2quant(int QIndex, int Delta);
-extern int vp9_ac2quant(int QIndex, int Delta);
-extern int vp9_dc_uv_quant(int QIndex, int Delta);
-extern int vp9_ac_uv_quant(int QIndex, int Delta);
--- a/vp8/common/recon.c
+++ /dev/null
@@ -1,197 +1,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include "vpx_ports/config.h"
-#include "vpx_rtcd.h"
-#include "blockd.h"
-
-void vp9_recon_b_c
-(
- unsigned char *pred_ptr,
- short *diff_ptr,
- unsigned char *dst_ptr,
- int stride
-) {
- int r, c;
-
- for (r = 0; r < 4; r++) {
- for (c = 0; c < 4; c++) {
- int a = diff_ptr[c] + pred_ptr[c];
-
- if (a < 0)
- a = 0;
-
- if (a > 255)
- a = 255;
-
- dst_ptr[c] = (unsigned char) a;
- }
-
- dst_ptr += stride;
- diff_ptr += 16;
- pred_ptr += 16;
- }
-}
-
-void vp9_recon_uv_b_c
-(
- unsigned char *pred_ptr,
- short *diff_ptr,
- unsigned char *dst_ptr,
- int stride
-) {
- int r, c;
-
- for (r = 0; r < 4; r++) {
- for (c = 0; c < 4; c++) {
- int a = diff_ptr[c] + pred_ptr[c];
-
- if (a < 0)
- a = 0;
-
- if (a > 255)
- a = 255;
-
- dst_ptr[c] = (unsigned char) a;
- }
-
- dst_ptr += stride;
- diff_ptr += 8;
- pred_ptr += 8;
- }
-}
-void vp9_recon4b_c
-(
- unsigned char *pred_ptr,
- short *diff_ptr,
- unsigned char *dst_ptr,
- int stride
-) {
- int r, c;
-
- for (r = 0; r < 4; r++) {
- for (c = 0; c < 16; c++) {
- int a = diff_ptr[c] + pred_ptr[c];
-
- if (a < 0)
- a = 0;
-
- if (a > 255)
- a = 255;
-
- dst_ptr[c] = (unsigned char) a;
- }
-
- dst_ptr += stride;
- diff_ptr += 16;
- pred_ptr += 16;
- }
-}
-
-void vp9_recon2b_c
-(
- unsigned char *pred_ptr,
- short *diff_ptr,
- unsigned char *dst_ptr,
- int stride
-) {
- int r, c;
-
- for (r = 0; r < 4; r++) {
- for (c = 0; c < 8; c++) {
- int a = diff_ptr[c] + pred_ptr[c];
-
- if (a < 0)
- a = 0;
-
- if (a > 255)
- a = 255;
-
- dst_ptr[c] = (unsigned char) a;
- }
-
- dst_ptr += stride;
- diff_ptr += 8;
- pred_ptr += 8;
- }
-}
-
-#if CONFIG_SUPERBLOCKS
-void vp9_recon_mby_s_c(MACROBLOCKD *xd, uint8_t *dst) {
- int x, y;
- BLOCKD *b = &xd->block[0];
- int stride = b->dst_stride;
- short *diff = b->diff;
-
- for (y = 0; y < 16; y++) {
- for (x = 0; x < 16; x++) {
- int a = dst[x] + diff[x];
- if (a < 0)
- a = 0;
- else if (a > 255)
- a = 255;
- dst[x] = a;
- }
- dst += stride;
- diff += 16;
- }
-}
-
-void vp9_recon_mbuv_s_c(MACROBLOCKD *xd, uint8_t *udst, uint8_t *vdst) {
- int x, y, i;
- uint8_t *dst = udst;
-
- for (i = 0; i < 2; i++, dst = vdst) {
- BLOCKD *b = &xd->block[16 + 4 * i];
- int stride = b->dst_stride;
- short *diff = b->diff;
-
- for (y = 0; y < 8; y++) {
- for (x = 0; x < 8; x++) {
- int a = dst[x] + diff[x];
- if (a < 0)
- a = 0;
- else if (a > 255)
- a = 255;
- dst[x] = a;
- }
- dst += stride;
- diff += 8;
- }
- }
-}
-#endif
-
-void vp9_recon_mby_c(MACROBLOCKD *xd) {
- int i;
-
- for (i = 0; i < 16; i += 4) {
- BLOCKD *b = &xd->block[i];
-
- vp9_recon4b(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride);
- }
-}
-
-void vp9_recon_mb_c(MACROBLOCKD *xd) {
- int i;
-
- for (i = 0; i < 16; i += 4) {
- BLOCKD *b = &xd->block[i];
-
- vp9_recon4b(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride);
- }
-
- for (i = 16; i < 24; i += 2) {
- BLOCKD *b = &xd->block[i];
-
- vp9_recon2b(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride);
- }
-}
--- a/vp8/common/reconinter.c
+++ /dev/null
@@ -1,1145 +1,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include "vpx_ports/config.h"
-#include "vpx/vpx_integer.h"
-#include "subpixel.h"
-#include "blockd.h"
-#include "reconinter.h"
-#if CONFIG_RUNTIME_CPU_DETECT
-#include "onyxc_int.h"
-#endif
-
-void vp9_setup_interp_filters(MACROBLOCKD *xd,
- INTERPOLATIONFILTERTYPE mcomp_filter_type,
- VP9_COMMON *cm) {
- if (mcomp_filter_type == SIXTAP) {
- xd->subpixel_predict = SUBPIX_INVOKE(
- &cm->rtcd.subpix, sixtap4x4);
- xd->subpixel_predict8x4 = SUBPIX_INVOKE(
- &cm->rtcd.subpix, sixtap8x4);
- xd->subpixel_predict8x8 = SUBPIX_INVOKE(
- &cm->rtcd.subpix, sixtap8x8);
- xd->subpixel_predict16x16 = SUBPIX_INVOKE(
- &cm->rtcd.subpix, sixtap16x16);
- xd->subpixel_predict_avg = SUBPIX_INVOKE(
- &cm->rtcd.subpix, sixtap_avg4x4);
- xd->subpixel_predict_avg8x8 = SUBPIX_INVOKE(
- &cm->rtcd.subpix, sixtap_avg8x8);
- xd->subpixel_predict_avg16x16 = SUBPIX_INVOKE(
- &cm->rtcd.subpix, sixtap_avg16x16);
- } else if (mcomp_filter_type == EIGHTTAP || mcomp_filter_type == SWITCHABLE) {
- xd->subpixel_predict = SUBPIX_INVOKE(
- &cm->rtcd.subpix, eighttap4x4);
- xd->subpixel_predict8x4 = SUBPIX_INVOKE(
- &cm->rtcd.subpix, eighttap8x4);
- xd->subpixel_predict8x8 = SUBPIX_INVOKE(
- &cm->rtcd.subpix, eighttap8x8);
- xd->subpixel_predict16x16 = SUBPIX_INVOKE(
- &cm->rtcd.subpix, eighttap16x16);
- xd->subpixel_predict_avg = SUBPIX_INVOKE(
- &cm->rtcd.subpix, eighttap_avg4x4);
- xd->subpixel_predict_avg8x8 = SUBPIX_INVOKE(
- &cm->rtcd.subpix, eighttap_avg8x8);
- xd->subpixel_predict_avg16x16 = SUBPIX_INVOKE(
- &cm->rtcd.subpix, eighttap_avg16x16);
- } else if (mcomp_filter_type == EIGHTTAP_SHARP) {
- xd->subpixel_predict = SUBPIX_INVOKE(
- &cm->rtcd.subpix, eighttap4x4_sharp);
- xd->subpixel_predict8x4 = SUBPIX_INVOKE(
- &cm->rtcd.subpix, eighttap8x4_sharp);
- xd->subpixel_predict8x8 = SUBPIX_INVOKE(
- &cm->rtcd.subpix, eighttap8x8_sharp);
- xd->subpixel_predict16x16 = SUBPIX_INVOKE(
- &cm->rtcd.subpix, eighttap16x16_sharp);
- xd->subpixel_predict_avg = SUBPIX_INVOKE(
- &cm->rtcd.subpix, eighttap_avg4x4_sharp);
- xd->subpixel_predict_avg8x8 = SUBPIX_INVOKE(
- &cm->rtcd.subpix, eighttap_avg8x8_sharp);
- xd->subpixel_predict_avg16x16 = SUBPIX_INVOKE(
- &cm->rtcd.subpix, eighttap_avg16x16_sharp);
- }
- else {
- xd->subpixel_predict = SUBPIX_INVOKE(
- &cm->rtcd.subpix, bilinear4x4);
- xd->subpixel_predict8x4 = SUBPIX_INVOKE(
- &cm->rtcd.subpix, bilinear8x4);
- xd->subpixel_predict8x8 = SUBPIX_INVOKE(
- &cm->rtcd.subpix, bilinear8x8);
- xd->subpixel_predict16x16 = SUBPIX_INVOKE(
- &cm->rtcd.subpix, bilinear16x16);
- xd->subpixel_predict_avg = SUBPIX_INVOKE(
- &cm->rtcd.subpix, bilinear_avg4x4);
- xd->subpixel_predict_avg8x8 = SUBPIX_INVOKE(
- &cm->rtcd.subpix, bilinear_avg8x8);
- xd->subpixel_predict_avg16x16 = SUBPIX_INVOKE(
- &cm->rtcd.subpix, bilinear_avg16x16);
- }
-}
-
-void vp9_copy_mem16x16_c(unsigned char *src,
- int src_stride,
- unsigned char *dst,
- int dst_stride) {
- int r;
-
- for (r = 0; r < 16; r++) {
-#if !(CONFIG_FAST_UNALIGNED)
- dst[0] = src[0];
- dst[1] = src[1];
- dst[2] = src[2];
- dst[3] = src[3];
- dst[4] = src[4];
- dst[5] = src[5];
- dst[6] = src[6];
- dst[7] = src[7];
- dst[8] = src[8];
- dst[9] = src[9];
- dst[10] = src[10];
- dst[11] = src[11];
- dst[12] = src[12];
- dst[13] = src[13];
- dst[14] = src[14];
- dst[15] = src[15];
-
-#else
- ((uint32_t *)dst)[0] = ((uint32_t *)src)[0];
- ((uint32_t *)dst)[1] = ((uint32_t *)src)[1];
- ((uint32_t *)dst)[2] = ((uint32_t *)src)[2];
- ((uint32_t *)dst)[3] = ((uint32_t *)src)[3];
-
-#endif
- src += src_stride;
- dst += dst_stride;
- }
-}
-
-void vp9_avg_mem16x16_c(unsigned char *src,
- int src_stride,
- unsigned char *dst,
- int dst_stride) {
- int r;
-
- for (r = 0; r < 16; r++) {
- int n;
-
- for (n = 0; n < 16; n++) {
- dst[n] = (dst[n] + src[n] + 1) >> 1;
- }
-
- src += src_stride;
- dst += dst_stride;
- }
-}
-
-void vp9_copy_mem8x8_c(unsigned char *src,
- int src_stride,
- unsigned char *dst,
- int dst_stride) {
- int r;
-
- for (r = 0; r < 8; r++) {
-#if !(CONFIG_FAST_UNALIGNED)
- dst[0] = src[0];
- dst[1] = src[1];
- dst[2] = src[2];
- dst[3] = src[3];
- dst[4] = src[4];
- dst[5] = src[5];
- dst[6] = src[6];
- dst[7] = src[7];
-#else
- ((uint32_t *)dst)[0] = ((uint32_t *)src)[0];
- ((uint32_t *)dst)[1] = ((uint32_t *)src)[1];
-#endif
- src += src_stride;
- dst += dst_stride;
- }
-}
-
-void vp9_avg_mem8x8_c(unsigned char *src,
- int src_stride,
- unsigned char *dst,
- int dst_stride) {
- int r;
-
- for (r = 0; r < 8; r++) {
- int n;
-
- for (n = 0; n < 8; n++) {
- dst[n] = (dst[n] + src[n] + 1) >> 1;
- }
-
- src += src_stride;
- dst += dst_stride;
- }
-}
-
-void vp9_copy_mem8x4_c(unsigned char *src,
- int src_stride,
- unsigned char *dst,
- int dst_stride) {
- int r;
-
- for (r = 0; r < 4; r++) {
-#if !(CONFIG_FAST_UNALIGNED)
- dst[0] = src[0];
- dst[1] = src[1];
- dst[2] = src[2];
- dst[3] = src[3];
- dst[4] = src[4];
- dst[5] = src[5];
- dst[6] = src[6];
- dst[7] = src[7];
-#else
- ((uint32_t *)dst)[0] = ((uint32_t *)src)[0];
- ((uint32_t *)dst)[1] = ((uint32_t *)src)[1];
-#endif
- src += src_stride;
- dst += dst_stride;
- }
-}
-
-void vp9_build_inter_predictors_b(BLOCKD *d, int pitch, vp9_subpix_fn_t sppf) {
- int r;
- unsigned char *ptr_base;
- unsigned char *ptr;
- unsigned char *pred_ptr = d->predictor;
- int_mv mv;
-
- ptr_base = *(d->base_pre);
- mv.as_int = d->bmi.as_mv.first.as_int;
-
- if (mv.as_mv.row & 7 || mv.as_mv.col & 7) {
- ptr = ptr_base + d->pre + (mv.as_mv.row >> 3) * d->pre_stride +
- (mv.as_mv.col >> 3);
- sppf(ptr, d->pre_stride, (mv.as_mv.col & 7) << 1, (mv.as_mv.row & 7) << 1,
- pred_ptr, pitch);
- } else {
- ptr_base += d->pre + (mv.as_mv.row >> 3) * d->pre_stride +
- (mv.as_mv.col >> 3);
- ptr = ptr_base;
-
- for (r = 0; r < 4; r++) {
-#if !(CONFIG_FAST_UNALIGNED)
- pred_ptr[0] = ptr[0];
- pred_ptr[1] = ptr[1];
- pred_ptr[2] = ptr[2];
- pred_ptr[3] = ptr[3];
-#else
- *(uint32_t *)pred_ptr = *(uint32_t *)ptr;
-#endif
- pred_ptr += pitch;
- ptr += d->pre_stride;
- }
- }
-}
-
-/*
- * Similar to vp9_build_inter_predictors_b(), but instead of storing the
- * results in d->predictor, we average the contents of d->predictor (which
- * come from an earlier call to vp9_build_inter_predictors_b()) with the
- * predictor of the second reference frame / motion vector.
- */
-void vp9_build_2nd_inter_predictors_b(BLOCKD *d, int pitch,
- vp9_subpix_fn_t sppf) {
- int r;
- unsigned char *ptr_base;
- unsigned char *ptr;
- unsigned char *pred_ptr = d->predictor;
- int_mv mv;
-
- ptr_base = *(d->base_second_pre);
- mv.as_int = d->bmi.as_mv.second.as_int;
-
- if (mv.as_mv.row & 7 || mv.as_mv.col & 7) {
- ptr = ptr_base + d->pre + (mv.as_mv.row >> 3) * d->pre_stride +
- (mv.as_mv.col >> 3);
- sppf(ptr, d->pre_stride, (mv.as_mv.col & 7) << 1, (mv.as_mv.row & 7) << 1,
- pred_ptr, pitch);
- } else {
- ptr_base += d->pre + (mv.as_mv.row >> 3) * d->pre_stride +
- (mv.as_mv.col >> 3);
- ptr = ptr_base;
-
- for (r = 0; r < 4; r++) {
- pred_ptr[0] = (pred_ptr[0] + ptr[0] + 1) >> 1;
- pred_ptr[1] = (pred_ptr[1] + ptr[1] + 1) >> 1;
- pred_ptr[2] = (pred_ptr[2] + ptr[2] + 1) >> 1;
- pred_ptr[3] = (pred_ptr[3] + ptr[3] + 1) >> 1;
- pred_ptr += pitch;
- ptr += d->pre_stride;
- }
- }
-}
-
-void vp9_build_inter_predictors4b(MACROBLOCKD *xd, BLOCKD *d, int pitch) {
- unsigned char *ptr_base;
- unsigned char *ptr;
- unsigned char *pred_ptr = d->predictor;
- int_mv mv;
-
- ptr_base = *(d->base_pre);
- mv.as_int = d->bmi.as_mv.first.as_int;
- ptr = ptr_base + d->pre + (mv.as_mv.row >> 3) * d->pre_stride +
- (mv.as_mv.col >> 3);
-
- if (mv.as_mv.row & 7 || mv.as_mv.col & 7) {
- xd->subpixel_predict8x8(ptr, d->pre_stride, (mv.as_mv.col & 7) << 1,
- (mv.as_mv.row & 7) << 1, pred_ptr, pitch);
- } else {
- vp9_copy_mem8x8(ptr, d->pre_stride, pred_ptr, pitch);
- }
-}
-
-/*
- * Similar to vp9_build_inter_predictors4b(), but instead of storing the
- * results in d->predictor, we average the contents of d->predictor (which
- * come from an earlier call to vp9_build_inter_predictors4b()) with the
- * predictor of the second reference frame / motion vector.
- */
-void vp9_build_2nd_inter_predictors4b(MACROBLOCKD *xd,
- BLOCKD *d, int pitch) {
- unsigned char *ptr_base;
- unsigned char *ptr;
- unsigned char *pred_ptr = d->predictor;
- int_mv mv;
-
- ptr_base = *(d->base_second_pre);
- mv.as_int = d->bmi.as_mv.second.as_int;
- ptr = ptr_base + d->pre + (mv.as_mv.row >> 3) * d->pre_stride +
- (mv.as_mv.col >> 3);
-
- if (mv.as_mv.row & 7 || mv.as_mv.col & 7) {
- xd->subpixel_predict_avg8x8(ptr, d->pre_stride, (mv.as_mv.col & 7) << 1,
- (mv.as_mv.row & 7) << 1, pred_ptr, pitch);
- } else {
- vp9_avg_mem8x8(ptr, d->pre_stride, pred_ptr, pitch);
- }
-}
-
-static void build_inter_predictors2b(MACROBLOCKD *xd, BLOCKD *d, int pitch) {
- unsigned char *ptr_base;
- unsigned char *ptr;
- unsigned char *pred_ptr = d->predictor;
- int_mv mv;
-
- ptr_base = *(d->base_pre);
- mv.as_int = d->bmi.as_mv.first.as_int;
- ptr = ptr_base + d->pre + (mv.as_mv.row >> 3) * d->pre_stride +
- (mv.as_mv.col >> 3);
-
- if (mv.as_mv.row & 7 || mv.as_mv.col & 7) {
- xd->subpixel_predict8x4(ptr, d->pre_stride, (mv.as_mv.col & 7) << 1,
- (mv.as_mv.row & 7) << 1, pred_ptr, pitch);
- } else {
- vp9_copy_mem8x4(ptr, d->pre_stride, pred_ptr, pitch);
- }
-}
-
-
-/*encoder only*/
-#if CONFIG_PRED_FILTER
-
-// Select the thresholded or non-thresholded filter
-#define USE_THRESH_FILTER 0
-
-#define PRED_FILT_LEN 5
-
-static const int filt_shift = 4;
-static const int pred_filter[PRED_FILT_LEN] = {1, 2, 10, 2, 1};
-// Alternative filter {1, 1, 4, 1, 1}
-
-#if !USE_THRESH_FILTER
-void filter_mb(unsigned char *src, int src_stride,
- unsigned char *dst, int dst_stride,
- int width, int height) {
- int i, j, k;
- unsigned int Temp[32 * 32];
- unsigned int *pTmp = Temp;
- unsigned char *pSrc = src - (1 + src_stride) * (PRED_FILT_LEN / 2);
-
- // Horizontal
- for (i = 0; i < height + PRED_FILT_LEN - 1; i++) {
- for (j = 0; j < width; j++) {
- int sum = 0;
- for (k = 0; k < PRED_FILT_LEN; k++)
- sum += pSrc[j + k] * pred_filter[k];
- pTmp[j] = sum;
- }
-
- pSrc += src_stride;
- pTmp += width;
- }
-
- // Vertical
- pTmp = Temp;
- for (i = 0; i < width; i++) {
- unsigned char *pDst = dst + i;
- for (j = 0; j < height; j++) {
- int sum = 0;
- for (k = 0; k < PRED_FILT_LEN; k++)
- sum += pTmp[(j + k) * width] * pred_filter[k];
- // Round
- sum = (sum + ((1 << (filt_shift << 1)) >> 1)) >> (filt_shift << 1);
- pDst[j * dst_stride] = (sum < 0 ? 0 : sum > 255 ? 255 : sum);
- }
- ++pTmp;
- }
-}
-#else
-// Based on vp9_post_proc_down_and_across_c (postproc.c)
-void filter_mb(unsigned char *src, int src_stride,
- unsigned char *dst, int dst_stride,
- int width, int height) {
- unsigned char *pSrc, *pDst;
- int row;
- int col;
- int i;
- int v;
- unsigned char d[8];
-
- /* TODO flimit should be linked to the quantizer value */
- int flimit = 7;
-
- for (row = 0; row < height; row++) {
- /* post_proc_down for one row */
- pSrc = src;
- pDst = dst;
-
- for (col = 0; col < width; col++) {
- int kernel = (1 << (filt_shift - 1));
- int v = pSrc[col];
-
- for (i = -2; i <= 2; i++) {
- if (abs(v - pSrc[col + i * src_stride]) > flimit)
- goto down_skip_convolve;
-
- kernel += pred_filter[2 + i] * pSrc[col + i * src_stride];
- }
-
- v = (kernel >> filt_shift);
- down_skip_convolve:
- pDst[col] = v;
- }
-
- /* now post_proc_across */
- pSrc = dst;
- pDst = dst;
-
- for (i = 0; i < 8; i++)
- d[i] = pSrc[i];
-
- for (col = 0; col < width; col++) {
- int kernel = (1 << (filt_shift - 1));
- v = pSrc[col];
-
- d[col & 7] = v;
-
- for (i = -2; i <= 2; i++) {
- if (abs(v - pSrc[col + i]) > flimit)
- goto across_skip_convolve;
-
- kernel += pred_filter[2 + i] * pSrc[col + i];
- }
-
- d[col & 7] = (kernel >> filt_shift);
- across_skip_convolve:
-
- if (col >= 2)
- pDst[col - 2] = d[(col - 2) & 7];
- }
-
- /* handle the last two pixels */
- pDst[col - 2] = d[(col - 2) & 7];
- pDst[col - 1] = d[(col - 1) & 7];
-
- /* next row */
- src += src_stride;
- dst += dst_stride;
- }
-}
-#endif // !USE_THRESH_FILTER
-
-#endif // CONFIG_PRED_FILTER
-
-/*encoder only*/
-void vp9_build_inter4x4_predictors_mbuv(MACROBLOCKD *xd) {
- int i, j;
- BLOCKD *blockd = xd->block;
-
- /* build uv mvs */
- for (i = 0; i < 2; i++) {
- for (j = 0; j < 2; j++) {
- int yoffset = i * 8 + j * 2;
- int uoffset = 16 + i * 2 + j;
- int voffset = 20 + i * 2 + j;
- int temp;
-
- temp = blockd[yoffset ].bmi.as_mv.first.as_mv.row
- + blockd[yoffset + 1].bmi.as_mv.first.as_mv.row
- + blockd[yoffset + 4].bmi.as_mv.first.as_mv.row
- + blockd[yoffset + 5].bmi.as_mv.first.as_mv.row;
-
- if (temp < 0) temp -= 4;
- else temp += 4;
-
- xd->block[uoffset].bmi.as_mv.first.as_mv.row = (temp / 8) &
- xd->fullpixel_mask;
-
- temp = blockd[yoffset ].bmi.as_mv.first.as_mv.col
- + blockd[yoffset + 1].bmi.as_mv.first.as_mv.col
- + blockd[yoffset + 4].bmi.as_mv.first.as_mv.col
- + blockd[yoffset + 5].bmi.as_mv.first.as_mv.col;
-
- if (temp < 0) temp -= 4;
- else temp += 4;
-
- blockd[uoffset].bmi.as_mv.first.as_mv.col = (temp / 8) &
- xd->fullpixel_mask;
-
- blockd[voffset].bmi.as_mv.first.as_mv.row =
- blockd[uoffset].bmi.as_mv.first.as_mv.row;
- blockd[voffset].bmi.as_mv.first.as_mv.col =
- blockd[uoffset].bmi.as_mv.first.as_mv.col;
-
- if (xd->mode_info_context->mbmi.second_ref_frame) {
- temp = blockd[yoffset ].bmi.as_mv.second.as_mv.row
- + blockd[yoffset + 1].bmi.as_mv.second.as_mv.row
- + blockd[yoffset + 4].bmi.as_mv.second.as_mv.row
- + blockd[yoffset + 5].bmi.as_mv.second.as_mv.row;
-
- if (temp < 0) {
- temp -= 4;
- } else {
- temp += 4;
- }
-
- blockd[uoffset].bmi.as_mv.second.as_mv.row = (temp / 8) &
- xd->fullpixel_mask;
-
- temp = blockd[yoffset ].bmi.as_mv.second.as_mv.col
- + blockd[yoffset + 1].bmi.as_mv.second.as_mv.col
- + blockd[yoffset + 4].bmi.as_mv.second.as_mv.col
- + blockd[yoffset + 5].bmi.as_mv.second.as_mv.col;
-
- if (temp < 0) {
- temp -= 4;
- } else {
- temp += 4;
- }
-
- blockd[uoffset].bmi.as_mv.second.as_mv.col = (temp / 8) &
- xd->fullpixel_mask;
-
- blockd[voffset].bmi.as_mv.second.as_mv.row =
- blockd[uoffset].bmi.as_mv.second.as_mv.row;
- blockd[voffset].bmi.as_mv.second.as_mv.col =
- blockd[uoffset].bmi.as_mv.second.as_mv.col;
- }
- }
- }
-
- for (i = 16; i < 24; i += 2) {
- BLOCKD *d0 = &blockd[i];
- BLOCKD *d1 = &blockd[i + 1];
-
- if (d0->bmi.as_mv.first.as_int == d1->bmi.as_mv.first.as_int)
- build_inter_predictors2b(xd, d0, 8);
- else {
- vp9_build_inter_predictors_b(d0, 8, xd->subpixel_predict);
- vp9_build_inter_predictors_b(d1, 8, xd->subpixel_predict);
- }
-
- if (xd->mode_info_context->mbmi.second_ref_frame) {
- vp9_build_2nd_inter_predictors_b(d0, 8, xd->subpixel_predict_avg);
- vp9_build_2nd_inter_predictors_b(d1, 8, xd->subpixel_predict_avg);
- }
- }
-}
-
-static void clamp_mv_to_umv_border(MV *mv, const MACROBLOCKD *xd) {
- /* If the MV points so far into the UMV border that no visible pixels
- * are used for reconstruction, the subpel part of the MV can be
- * discarded and the MV limited to 16 pixels with equivalent results.
- *
- * This limit kicks in at 19 pixels for the top and left edges, for
- * the 16 pixels plus 3 taps right of the central pixel when subpel
- * filtering. The bottom and right edges use 16 pixels plus 2 pixels
- * left of the central pixel when filtering.
- */
- if (mv->col < (xd->mb_to_left_edge - ((16 + INTERP_EXTEND) << 3)))
- mv->col = xd->mb_to_left_edge - (16 << 3);
- else if (mv->col > xd->mb_to_right_edge + ((15 + INTERP_EXTEND) << 3))
- mv->col = xd->mb_to_right_edge + (16 << 3);
-
- if (mv->row < (xd->mb_to_top_edge - ((16 + INTERP_EXTEND) << 3)))
- mv->row = xd->mb_to_top_edge - (16 << 3);
- else if (mv->row > xd->mb_to_bottom_edge + ((15 + INTERP_EXTEND) << 3))
- mv->row = xd->mb_to_bottom_edge + (16 << 3);
-}
-
-/* A version of the above function for chroma block MVs.*/
-static void clamp_uvmv_to_umv_border(MV *mv, const MACROBLOCKD *xd) {
- mv->col = (2 * mv->col < (xd->mb_to_left_edge - ((16 + INTERP_EXTEND) << 3))) ?
- (xd->mb_to_left_edge - (16 << 3)) >> 1 : mv->col;
- mv->col = (2 * mv->col > xd->mb_to_right_edge + ((15 + INTERP_EXTEND) << 3)) ?
- (xd->mb_to_right_edge + (16 << 3)) >> 1 : mv->col;
-
- mv->row = (2 * mv->row < (xd->mb_to_top_edge - ((16 + INTERP_EXTEND) << 3))) ?
- (xd->mb_to_top_edge - (16 << 3)) >> 1 : mv->row;
- mv->row = (2 * mv->row > xd->mb_to_bottom_edge + ((15 + INTERP_EXTEND) << 3)) ?
- (xd->mb_to_bottom_edge + (16 << 3)) >> 1 : mv->row;
-}
-
-/*encoder only*/
-void vp9_build_1st_inter16x16_predictors_mby(MACROBLOCKD *xd,
- unsigned char *dst_y,
- int dst_ystride,
- int clamp_mvs) {
- unsigned char *ptr_base = xd->pre.y_buffer;
- unsigned char *ptr;
- int pre_stride = xd->block[0].pre_stride;
- int_mv ymv;
-
- ymv.as_int = xd->mode_info_context->mbmi.mv[0].as_int;
-
- if (clamp_mvs)
- clamp_mv_to_umv_border(&ymv.as_mv, xd);
-
- ptr = ptr_base + (ymv.as_mv.row >> 3) * pre_stride + (ymv.as_mv.col >> 3);
-
-#if CONFIG_PRED_FILTER
- if (xd->mode_info_context->mbmi.pred_filter_enabled) {
- if ((ymv.as_mv.row | ymv.as_mv.col) & 7) {
- // Sub-pel filter needs extended input
- int len = 15 + (INTERP_EXTEND << 1);
- unsigned char Temp[32 * 32]; // Data required by sub-pel filter
- unsigned char *pTemp = Temp + (INTERP_EXTEND - 1) * (len + 1);
-
- // Copy extended MB into Temp array, applying the spatial filter
- filter_mb(ptr - (INTERP_EXTEND - 1) * (pre_stride + 1), pre_stride,
- Temp, len, len, len);
-
- // Sub-pel interpolation
- xd->subpixel_predict16x16(pTemp, len,
- (ymv.as_mv.col & 7) << 1,
- (ymv.as_mv.row & 7) << 1,
- dst_y, dst_ystride);
- } else {
- // Apply spatial filter to create the prediction directly
- filter_mb(ptr, pre_stride, dst_y, dst_ystride, 16, 16);
- }
- } else
-#endif
- if ((ymv.as_mv.row | ymv.as_mv.col) & 7) {
- xd->subpixel_predict16x16(ptr, pre_stride,
- (ymv.as_mv.col & 7) << 1,
- (ymv.as_mv.row & 7) << 1,
- dst_y, dst_ystride);
- } else {
- vp9_copy_mem16x16(ptr, pre_stride, dst_y, dst_ystride);
- }
-}
-
-void vp9_build_1st_inter16x16_predictors_mbuv(MACROBLOCKD *xd,
- unsigned char *dst_u,
- unsigned char *dst_v,
- int dst_uvstride) {
- int offset;
- unsigned char *uptr, *vptr;
- int pre_stride = xd->block[0].pre_stride;
- int_mv _o16x16mv;
- int_mv _16x16mv;
-
- _16x16mv.as_int = xd->mode_info_context->mbmi.mv[0].as_int;
-
- if (xd->mode_info_context->mbmi.need_to_clamp_mvs)
- clamp_mv_to_umv_border(&_16x16mv.as_mv, xd);
-
- _o16x16mv = _16x16mv;
- /* calc uv motion vectors */
- if (_16x16mv.as_mv.row < 0)
- _16x16mv.as_mv.row -= 1;
- else
- _16x16mv.as_mv.row += 1;
-
- if (_16x16mv.as_mv.col < 0)
- _16x16mv.as_mv.col -= 1;
- else
- _16x16mv.as_mv.col += 1;
-
- _16x16mv.as_mv.row /= 2;
- _16x16mv.as_mv.col /= 2;
-
- _16x16mv.as_mv.row &= xd->fullpixel_mask;
- _16x16mv.as_mv.col &= xd->fullpixel_mask;
-
- pre_stride >>= 1;
- offset = (_16x16mv.as_mv.row >> 3) * pre_stride + (_16x16mv.as_mv.col >> 3);
- uptr = xd->pre.u_buffer + offset;
- vptr = xd->pre.v_buffer + offset;
-
-#if CONFIG_PRED_FILTER
- if (xd->mode_info_context->mbmi.pred_filter_enabled) {
- int i;
- unsigned char *pSrc = uptr;
- unsigned char *pDst = dst_u;
- int len = 7 + (INTERP_EXTEND << 1);
- unsigned char Temp[32 * 32]; // Data required by the sub-pel filter
- unsigned char *pTemp = Temp + (INTERP_EXTEND - 1) * (len + 1);
-
- // U & V
- for (i = 0; i < 2; i++) {
- if (_o16x16mv.as_int & 0x000f000f) {
- // Copy extended MB into Temp array, applying the spatial filter
- filter_mb(pSrc - (INTERP_EXTEND - 1) * (pre_stride + 1), pre_stride,
- Temp, len, len, len);
-
- // Sub-pel filter
- xd->subpixel_predict8x8(pTemp, len,
- _o16x16mv.as_mv.col & 15,
- _o16x16mv.as_mv.row & 15,
- pDst, dst_uvstride);
- } else {
- filter_mb(pSrc, pre_stride, pDst, dst_uvstride, 8, 8);
- }
-
- // V
- pSrc = vptr;
- pDst = dst_v;
- }
- } else
-#endif
- if (_o16x16mv.as_int & 0x000f000f) {
- xd->subpixel_predict8x8(uptr, pre_stride, _o16x16mv.as_mv.col & 15,
- _o16x16mv.as_mv.row & 15, dst_u, dst_uvstride);
- xd->subpixel_predict8x8(vptr, pre_stride, _o16x16mv.as_mv.col & 15,
- _o16x16mv.as_mv.row & 15, dst_v, dst_uvstride);
- } else {
- vp9_copy_mem8x8(uptr, pre_stride, dst_u, dst_uvstride);
- vp9_copy_mem8x8(vptr, pre_stride, dst_v, dst_uvstride);
- }
-}
-
-
-void vp9_build_1st_inter16x16_predictors_mb(MACROBLOCKD *xd,
- unsigned char *dst_y,
- unsigned char *dst_u,
- unsigned char *dst_v,
- int dst_ystride, int dst_uvstride) {
- vp9_build_1st_inter16x16_predictors_mby(xd, dst_y, dst_ystride,
- xd->mode_info_context->mbmi.need_to_clamp_mvs);
- vp9_build_1st_inter16x16_predictors_mbuv(xd, dst_u, dst_v, dst_uvstride);
-}
-
-#if CONFIG_SUPERBLOCKS
-void vp9_build_inter32x32_predictors_sb(MACROBLOCKD *x,
- unsigned char *dst_y,
- unsigned char *dst_u,
- unsigned char *dst_v,
- int dst_ystride,
- int dst_uvstride) {
- uint8_t *y1 = x->pre.y_buffer, *u1 = x->pre.u_buffer, *v1 = x->pre.v_buffer;
- uint8_t *y2 = x->second_pre.y_buffer, *u2 = x->second_pre.u_buffer,
- *v2 = x->second_pre.v_buffer;
- int n;
-
- for (n = 0; n < 4; n++)
- {
- const int x_idx = n & 1, y_idx = n >> 1;
-
- x->pre.y_buffer = y1 + y_idx * 16 * x->pre.y_stride + x_idx * 16;
- x->pre.u_buffer = u1 + y_idx * 8 * x->pre.uv_stride + x_idx * 8;
- x->pre.v_buffer = v1 + y_idx * 8 * x->pre.uv_stride + x_idx * 8;
-
- vp9_build_1st_inter16x16_predictors_mb(x,
- dst_y + y_idx * 16 * dst_ystride + x_idx * 16,
- dst_u + y_idx * 8 * dst_uvstride + x_idx * 8,
- dst_v + y_idx * 8 * dst_uvstride + x_idx * 8,
- dst_ystride, dst_uvstride);
- if (x->mode_info_context->mbmi.second_ref_frame) {
- x->second_pre.y_buffer = y2 + y_idx * 16 * x->pre.y_stride + x_idx * 16;
- x->second_pre.u_buffer = u2 + y_idx * 8 * x->pre.uv_stride + x_idx * 8;
- x->second_pre.v_buffer = v2 + y_idx * 8 * x->pre.uv_stride + x_idx * 8;
-
- vp9_build_2nd_inter16x16_predictors_mb(x,
- dst_y + y_idx * 16 * dst_ystride + x_idx * 16,
- dst_u + y_idx * 8 * dst_uvstride + x_idx * 8,
- dst_v + y_idx * 8 * dst_uvstride + x_idx * 8,
- dst_ystride, dst_uvstride);
- }
- }
-
- x->pre.y_buffer = y1;
- x->pre.u_buffer = u1;
- x->pre.v_buffer = v1;
-
- if (x->mode_info_context->mbmi.second_ref_frame) {
- x->second_pre.y_buffer = y2;
- x->second_pre.u_buffer = u2;
- x->second_pre.v_buffer = v2;
- }
-}
-#endif
-
-/*
- * The following functions should be called after an initial
- * call to vp9_build_1st_inter16x16_predictors_mb() or _mby()/_mbuv().
- * They run a second sixtap filter on a (different) ref
- * frame and average the result with the output of the
- * first sixtap filter. The second reference frame is stored
- * in x->second_pre (the reference frame index is in
- * x->mode_info_context->mbmi.second_ref_frame). The second
- * motion vector is x->mode_info_context->mbmi.second_mv.
- *
- * This allows blending prediction from two reference frames
- * which sometimes leads to better prediction than from a
- * single reference frame.
- */
-void vp9_build_2nd_inter16x16_predictors_mby(MACROBLOCKD *xd,
- unsigned char *dst_y,
- int dst_ystride) {
- unsigned char *ptr;
-
- int_mv _16x16mv;
- int mv_row;
- int mv_col;
-
- unsigned char *ptr_base = xd->second_pre.y_buffer;
- int pre_stride = xd->block[0].pre_stride;
-
- _16x16mv.as_int = xd->mode_info_context->mbmi.mv[1].as_int;
-
- if (xd->mode_info_context->mbmi.need_to_clamp_secondmv)
- clamp_mv_to_umv_border(&_16x16mv.as_mv, xd);
-
- mv_row = _16x16mv.as_mv.row;
- mv_col = _16x16mv.as_mv.col;
-
- ptr = ptr_base + (mv_row >> 3) * pre_stride + (mv_col >> 3);
-
-#if CONFIG_PRED_FILTER
- if (xd->mode_info_context->mbmi.pred_filter_enabled) {
- if ((mv_row | mv_col) & 7) {
- // Sub-pel filter needs extended input
- int len = 15 + (INTERP_EXTEND << 1);
- unsigned char Temp[32 * 32]; // Data required by sub-pel filter
- unsigned char *pTemp = Temp + (INTERP_EXTEND - 1) * (len + 1);
-
- // Copy extended MB into Temp array, applying the spatial filter
- filter_mb(ptr - (INTERP_EXTEND - 1) * (pre_stride + 1), pre_stride,
- Temp, len, len, len);
-
- // Sub-pel filter
- xd->subpixel_predict_avg16x16(pTemp, len, (mv_col & 7) << 1,
- (mv_row & 7) << 1, dst_y, dst_ystride);
- } else {
- // TODO Needs to AVERAGE with the dst_y
- // For now, do not apply the prediction filter in these cases!
- vp9_avg_mem16x16(ptr, pre_stride, dst_y, dst_ystride);
- }
- } else
-#endif // CONFIG_PRED_FILTER
- {
- if ((mv_row | mv_col) & 7) {
- xd->subpixel_predict_avg16x16(ptr, pre_stride, (mv_col & 7) << 1,
- (mv_row & 7) << 1, dst_y, dst_ystride);
- } else {
- vp9_avg_mem16x16(ptr, pre_stride, dst_y, dst_ystride);
- }
- }
-}
-
-void vp9_build_2nd_inter16x16_predictors_mbuv(MACROBLOCKD *xd,
- unsigned char *dst_u,
- unsigned char *dst_v,
- int dst_uvstride) {
- int offset;
- unsigned char *uptr, *vptr;
-
- int_mv _16x16mv;
- int mv_row;
- int mv_col;
- int omv_row, omv_col;
-
- int pre_stride = xd->block[0].pre_stride;
-
- _16x16mv.as_int = xd->mode_info_context->mbmi.mv[1].as_int;
-
- if (xd->mode_info_context->mbmi.need_to_clamp_secondmv)
- clamp_mv_to_umv_border(&_16x16mv.as_mv, xd);
-
- mv_row = _16x16mv.as_mv.row;
- mv_col = _16x16mv.as_mv.col;
-
- /* calc uv motion vectors */
- omv_row = mv_row;
- omv_col = mv_col;
- mv_row = (mv_row + (mv_row > 0)) >> 1;
- mv_col = (mv_col + (mv_col > 0)) >> 1;
-
- mv_row &= xd->fullpixel_mask;
- mv_col &= xd->fullpixel_mask;
-
- pre_stride >>= 1;
- offset = (mv_row >> 3) * pre_stride + (mv_col >> 3);
- uptr = xd->second_pre.u_buffer + offset;
- vptr = xd->second_pre.v_buffer + offset;
-
-#if CONFIG_PRED_FILTER
- if (xd->mode_info_context->mbmi.pred_filter_enabled) {
- int i;
- int len = 7 + (INTERP_EXTEND << 1);
- unsigned char Temp[32 * 32]; // Data required by sub-pel filter
- unsigned char *pTemp = Temp + (INTERP_EXTEND - 1) * (len + 1);
- unsigned char *pSrc = uptr;
- unsigned char *pDst = dst_u;
-
- // U & V
- for (i = 0; i < 2; i++) {
- if ((omv_row | omv_col) & 15) {
- // Copy extended MB into Temp array, applying the spatial filter
- filter_mb(pSrc - (INTERP_EXTEND - 1) * (pre_stride + 1), pre_stride,
- Temp, len, len, len);
-
- // Sub-pel filter
- xd->subpixel_predict_avg8x8(pTemp, len, omv_col & 15,
- omv_row & 15, pDst, dst_uvstride);
- } else {
- // TODO Needs to AVERAGE with the dst_[u|v]
- // For now, do not apply the prediction filter here!
- vp9_avg_mem8x8(pSrc, pre_stride, pDst, dst_uvstride);
- }
-
- // V
- pSrc = vptr;
- pDst = dst_v;
- }
- } else
-#endif // CONFIG_PRED_FILTER
- if ((omv_row | omv_col) & 15) {
- xd->subpixel_predict_avg8x8(uptr, pre_stride, omv_col & 15,
- omv_row & 15, dst_u, dst_uvstride);
- xd->subpixel_predict_avg8x8(vptr, pre_stride, omv_col & 15,
- omv_row & 15, dst_v, dst_uvstride);
- } else {
- vp9_avg_mem8x8(uptr, pre_stride, dst_u, dst_uvstride);
- vp9_avg_mem8x8(vptr, pre_stride, dst_v, dst_uvstride);
- }
-}
-
-void vp9_build_2nd_inter16x16_predictors_mb(MACROBLOCKD *xd,
- unsigned char *dst_y,
- unsigned char *dst_u,
- unsigned char *dst_v,
- int dst_ystride,
- int dst_uvstride) {
- vp9_build_2nd_inter16x16_predictors_mby(xd, dst_y, dst_ystride);
- vp9_build_2nd_inter16x16_predictors_mbuv(xd, dst_u, dst_v, dst_uvstride);
-}
-
-static void build_inter4x4_predictors_mb(MACROBLOCKD *xd) {
- int i;
- MB_MODE_INFO * mbmi = &xd->mode_info_context->mbmi;
- BLOCKD *blockd = xd->block;
-
- if (xd->mode_info_context->mbmi.partitioning != PARTITIONING_4X4) {
- blockd[ 0].bmi = xd->mode_info_context->bmi[ 0];
- blockd[ 2].bmi = xd->mode_info_context->bmi[ 2];
- blockd[ 8].bmi = xd->mode_info_context->bmi[ 8];
- blockd[10].bmi = xd->mode_info_context->bmi[10];
-
- if (mbmi->need_to_clamp_mvs) {
- clamp_mv_to_umv_border(&blockd[ 0].bmi.as_mv.first.as_mv, xd);
- clamp_mv_to_umv_border(&blockd[ 2].bmi.as_mv.first.as_mv, xd);
- clamp_mv_to_umv_border(&blockd[ 8].bmi.as_mv.first.as_mv, xd);
- clamp_mv_to_umv_border(&blockd[10].bmi.as_mv.first.as_mv, xd);
- if (mbmi->second_ref_frame) {
- clamp_mv_to_umv_border(&blockd[ 0].bmi.as_mv.second.as_mv, xd);
- clamp_mv_to_umv_border(&blockd[ 2].bmi.as_mv.second.as_mv, xd);
- clamp_mv_to_umv_border(&blockd[ 8].bmi.as_mv.second.as_mv, xd);
- clamp_mv_to_umv_border(&blockd[10].bmi.as_mv.second.as_mv, xd);
- }
- }
-
-
- vp9_build_inter_predictors4b(xd, &blockd[ 0], 16);
- vp9_build_inter_predictors4b(xd, &blockd[ 2], 16);
- vp9_build_inter_predictors4b(xd, &blockd[ 8], 16);
- vp9_build_inter_predictors4b(xd, &blockd[10], 16);
-
- if (mbmi->second_ref_frame) {
- vp9_build_2nd_inter_predictors4b(xd, &blockd[ 0], 16);
- vp9_build_2nd_inter_predictors4b(xd, &blockd[ 2], 16);
- vp9_build_2nd_inter_predictors4b(xd, &blockd[ 8], 16);
- vp9_build_2nd_inter_predictors4b(xd, &blockd[10], 16);
- }
- } else {
- for (i = 0; i < 16; i += 2) {
- BLOCKD *d0 = &blockd[i];
- BLOCKD *d1 = &blockd[i + 1];
-
- blockd[i + 0].bmi = xd->mode_info_context->bmi[i + 0];
- blockd[i + 1].bmi = xd->mode_info_context->bmi[i + 1];
-
- if (mbmi->need_to_clamp_mvs) {
- clamp_mv_to_umv_border(&blockd[i + 0].bmi.as_mv.first.as_mv, xd);
- clamp_mv_to_umv_border(&blockd[i + 1].bmi.as_mv.first.as_mv, xd);
- if (mbmi->second_ref_frame) {
- clamp_mv_to_umv_border(&blockd[i + 0].bmi.as_mv.second.as_mv, xd);
- clamp_mv_to_umv_border(&blockd[i + 1].bmi.as_mv.second.as_mv, xd);
- }
- }
-
- if (d0->bmi.as_mv.first.as_int == d1->bmi.as_mv.first.as_int)
- build_inter_predictors2b(xd, d0, 16);
- else {
- vp9_build_inter_predictors_b(d0, 16, xd->subpixel_predict);
- vp9_build_inter_predictors_b(d1, 16, xd->subpixel_predict);
- }
-
- if (mbmi->second_ref_frame) {
- vp9_build_2nd_inter_predictors_b(d0, 16, xd->subpixel_predict_avg);
- vp9_build_2nd_inter_predictors_b(d1, 16, xd->subpixel_predict_avg);
- }
- }
- }
-
- for (i = 16; i < 24; i += 2) {
- BLOCKD *d0 = &blockd[i];
- BLOCKD *d1 = &blockd[i + 1];
-
- if (d0->bmi.as_mv.first.as_int == d1->bmi.as_mv.first.as_int)
- build_inter_predictors2b(xd, d0, 8);
- else {
- vp9_build_inter_predictors_b(d0, 8, xd->subpixel_predict);
- vp9_build_inter_predictors_b(d1, 8, xd->subpixel_predict);
- }
-
- if (mbmi->second_ref_frame) {
- vp9_build_2nd_inter_predictors_b(d0, 8, xd->subpixel_predict_avg);
- vp9_build_2nd_inter_predictors_b(d1, 8, xd->subpixel_predict_avg);
- }
- }
-}
-
-static
-void build_4x4uvmvs(MACROBLOCKD *xd) {
- int i, j;
- BLOCKD *blockd = xd->block;
-
- for (i = 0; i < 2; i++) {
- for (j = 0; j < 2; j++) {
- int yoffset = i * 8 + j * 2;
- int uoffset = 16 + i * 2 + j;
- int voffset = 20 + i * 2 + j;
-
- int temp;
-
- temp = xd->mode_info_context->bmi[yoffset + 0].as_mv.first.as_mv.row
- + xd->mode_info_context->bmi[yoffset + 1].as_mv.first.as_mv.row
- + xd->mode_info_context->bmi[yoffset + 4].as_mv.first.as_mv.row
- + xd->mode_info_context->bmi[yoffset + 5].as_mv.first.as_mv.row;
-
- if (temp < 0) temp -= 4;
- else temp += 4;
-
- blockd[uoffset].bmi.as_mv.first.as_mv.row = (temp / 8) &
- xd->fullpixel_mask;
-
- temp = xd->mode_info_context->bmi[yoffset + 0].as_mv.first.as_mv.col
- + xd->mode_info_context->bmi[yoffset + 1].as_mv.first.as_mv.col
- + xd->mode_info_context->bmi[yoffset + 4].as_mv.first.as_mv.col
- + xd->mode_info_context->bmi[yoffset + 5].as_mv.first.as_mv.col;
-
- if (temp < 0) temp -= 4;
- else temp += 4;
-
- blockd[uoffset].bmi.as_mv.first.as_mv.col = (temp / 8) &
- xd->fullpixel_mask;
-
- // if (x->mode_info_context->mbmi.need_to_clamp_mvs)
- clamp_uvmv_to_umv_border(&blockd[uoffset].bmi.as_mv.first.as_mv, xd);
-
- // if (x->mode_info_context->mbmi.need_to_clamp_mvs)
- clamp_uvmv_to_umv_border(&blockd[uoffset].bmi.as_mv.first.as_mv, xd);
-
- blockd[voffset].bmi.as_mv.first.as_mv.row =
- blockd[uoffset].bmi.as_mv.first.as_mv.row;
- blockd[voffset].bmi.as_mv.first.as_mv.col =
- blockd[uoffset].bmi.as_mv.first.as_mv.col;
-
- if (xd->mode_info_context->mbmi.second_ref_frame) {
- temp = xd->mode_info_context->bmi[yoffset + 0].as_mv.second.as_mv.row
- + xd->mode_info_context->bmi[yoffset + 1].as_mv.second.as_mv.row
- + xd->mode_info_context->bmi[yoffset + 4].as_mv.second.as_mv.row
- + xd->mode_info_context->bmi[yoffset + 5].as_mv.second.as_mv.row;
-
- if (temp < 0) {
- temp -= 4;
- } else {
- temp += 4;
- }
-
- blockd[uoffset].bmi.as_mv.second.as_mv.row = (temp / 8) &
- xd->fullpixel_mask;
-
- temp = xd->mode_info_context->bmi[yoffset + 0].as_mv.second.as_mv.col
- + xd->mode_info_context->bmi[yoffset + 1].as_mv.second.as_mv.col
- + xd->mode_info_context->bmi[yoffset + 4].as_mv.second.as_mv.col
- + xd->mode_info_context->bmi[yoffset + 5].as_mv.second.as_mv.col;
-
- if (temp < 0) {
- temp -= 4;
- } else {
- temp += 4;
- }
-
- blockd[uoffset].bmi.as_mv.second.as_mv.col = (temp / 8) &
- xd->fullpixel_mask;
-
- // if (mbmi->need_to_clamp_mvs)
- clamp_uvmv_to_umv_border(
- &blockd[uoffset].bmi.as_mv.second.as_mv, xd);
-
- // if (mbmi->need_to_clamp_mvs)
- clamp_uvmv_to_umv_border(
- &blockd[uoffset].bmi.as_mv.second.as_mv, xd);
-
- blockd[voffset].bmi.as_mv.second.as_mv.row =
- blockd[uoffset].bmi.as_mv.second.as_mv.row;
- blockd[voffset].bmi.as_mv.second.as_mv.col =
- blockd[uoffset].bmi.as_mv.second.as_mv.col;
- }
- }
- }
-}
-
-void vp9_build_inter_predictors_mb(MACROBLOCKD *xd) {
- if (xd->mode_info_context->mbmi.mode != SPLITMV) {
- vp9_build_1st_inter16x16_predictors_mb(xd, xd->predictor,
- &xd->predictor[256],
- &xd->predictor[320], 16, 8);
-
- if (xd->mode_info_context->mbmi.second_ref_frame) {
- /* 256 = offset of U plane in Y+U+V buffer;
- * 320 = offset of V plane in Y+U+V buffer.
- * (256=16x16, 320=16x16+8x8). */
- vp9_build_2nd_inter16x16_predictors_mb(xd, xd->predictor,
- &xd->predictor[256],
- &xd->predictor[320], 16, 8);
- }
- } else {
- build_4x4uvmvs(xd);
- build_inter4x4_predictors_mb(xd);
- }
-}
--- a/vp8/common/reconinter.h
+++ /dev/null
@@ -1,78 +1,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#ifndef __INC_RECONINTER_H
-#define __INC_RECONINTER_H
-
-#include "onyxc_int.h"
-
-extern void vp9_build_1st_inter16x16_predictors_mby(MACROBLOCKD *xd,
- unsigned char *dst_y,
- int dst_ystride,
- int clamp_mvs);
-
-extern void vp9_build_1st_inter16x16_predictors_mbuv(MACROBLOCKD *xd,
- unsigned char *dst_u,
- unsigned char *dst_v,
- int dst_uvstride);
-
-extern void vp9_build_1st_inter16x16_predictors_mb(MACROBLOCKD *xd,
- unsigned char *dst_y,
- unsigned char *dst_u,
- unsigned char *dst_v,
- int dst_ystride,
- int dst_uvstride);
-
-extern void vp9_build_2nd_inter16x16_predictors_mby(MACROBLOCKD *xd,
- unsigned char *dst_y,
- int dst_ystride);
-
-extern void vp9_build_2nd_inter16x16_predictors_mbuv(MACROBLOCKD *xd,
- unsigned char *dst_u,
- unsigned char *dst_v,
- int dst_uvstride);
-
-extern void vp9_build_2nd_inter16x16_predictors_mb(MACROBLOCKD *xd,
- unsigned char *dst_y,
- unsigned char *dst_u,
- unsigned char *dst_v,
- int dst_ystride,
- int dst_uvstride);
-
-#if CONFIG_SUPERBLOCKS
-extern void vp9_build_inter32x32_predictors_sb(MACROBLOCKD *x,
- unsigned char *dst_y,
- unsigned char *dst_u,
- unsigned char *dst_v,
- int dst_ystride,
- int dst_uvstride);
-#endif
-
-extern void vp9_build_inter_predictors_mb(MACROBLOCKD *xd);
-
-extern void vp9_build_inter_predictors_b(BLOCKD *d, int pitch,
- vp9_subpix_fn_t sppf);
-
-extern void vp9_build_2nd_inter_predictors_b(BLOCKD *d, int pitch,
- vp9_subpix_fn_t sppf);
-
-extern void vp9_build_inter_predictors4b(MACROBLOCKD *xd, BLOCKD *d,
- int pitch);
-
-extern void vp9_build_2nd_inter_predictors4b(MACROBLOCKD *xd,
- BLOCKD *d, int pitch);
-
-extern void vp9_build_inter4x4_predictors_mbuv(MACROBLOCKD *xd);
-
-extern void vp9_setup_interp_filters(MACROBLOCKD *xd,
- INTERPOLATIONFILTERTYPE filter,
- VP9_COMMON *cm);
-
-#endif // __INC_RECONINTER_H
--- a/vp8/common/reconintra.c
+++ /dev/null
@@ -1,490 +1,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include <stdio.h>
-#include "vpx_ports/config.h"
-#include "vpx_rtcd.h"
-#include "reconintra.h"
-#include "vpx_mem/vpx_mem.h"
-
-/* For skip_recon_mb(), add vp9_build_intra_predictors_mby_s(MACROBLOCKD *xd)
- * and vp9_build_intra_predictors_mbuv_s(MACROBLOCKD *xd).
- */
-
-static void d27_predictor(uint8_t *ypred_ptr, int y_stride, int n,
- uint8_t *yabove_row, uint8_t *yleft_col) {
- int r, c, h, w, v;
- int a, b;
- r = 0;
- for (c = 0; c < n - 2; c++) {
- if (c & 1)
- a = yleft_col[r + 1];
- else
- a = (yleft_col[r] + yleft_col[r + 1] + 1) >> 1;
- b = yabove_row[c + 2];
- ypred_ptr[c] = (2 * a + (c + 1) * b + (c + 3) / 2) / (c + 3);
- }
- for (r = 1; r < n / 2 - 1; r++) {
- for (c = 0; c < n - 2 - 2 * r; c++) {
- if (c & 1)
- a = yleft_col[r + 1];
- else
- a = (yleft_col[r] + yleft_col[r + 1] + 1) >> 1;
- b = ypred_ptr[(r - 1) * y_stride + c + 2];
- ypred_ptr[r * y_stride + c] = (2 * a + (c + 1) * b + (c + 3) / 2) / (c + 3);
- }
- }
- for (; r < n - 1; ++r) {
- for (c = 0; c < n; c++) {
- v = (c & 1 ? yleft_col[r + 1] : (yleft_col[r] + yleft_col[r + 1] + 1) >> 1);
- h = r - c / 2;
- ypred_ptr[h * y_stride + c] = v;
- }
- }
- c = 0;
- r = n - 1;
- ypred_ptr[r * y_stride] = (ypred_ptr[(r - 1) * y_stride] +
- yleft_col[r] + 1) >> 1;
- for (r = n - 2; r >= n / 2; --r) {
- w = c + (n - 1 - r) * 2;
- ypred_ptr[r * y_stride + w] = (ypred_ptr[(r - 1) * y_stride + w] +
- ypred_ptr[r * y_stride + w - 1] + 1) >> 1;
- }
- for (c = 1; c < n; c++) {
- for (r = n - 1; r >= n / 2 + c / 2; --r) {
- w = c + (n - 1 - r) * 2;
- ypred_ptr[r * y_stride + w] = (ypred_ptr[(r - 1) * y_stride + w] +
- ypred_ptr[r * y_stride + w - 1] + 1) >> 1;
- }
- }
-}
-
-static void d63_predictor(uint8_t *ypred_ptr, int y_stride, int n,
- uint8_t *yabove_row, uint8_t *yleft_col) {
- int r, c, h, w, v;
- int a, b;
- c = 0;
- for (r = 0; r < n - 2; r++) {
- if (r & 1)
- a = yabove_row[c + 1];
- else
- a = (yabove_row[c] + yabove_row[c + 1] + 1) >> 1;
- b = yleft_col[r + 2];
- ypred_ptr[r * y_stride] = (2 * a + (r + 1) * b + (r + 3) / 2) / (r + 3);
- }
- for (c = 1; c < n / 2 - 1; c++) {
- for (r = 0; r < n - 2 - 2 * c; r++) {
- if (r & 1)
- a = yabove_row[c + 1];
- else
- a = (yabove_row[c] + yabove_row[c + 1] + 1) >> 1;
- b = ypred_ptr[(r + 2) * y_stride + c - 1];
- ypred_ptr[r * y_stride + c] = (2 * a + (c + 1) * b + (c + 3) / 2) / (c + 3);
- }
- }
- for (; c < n - 1; ++c) {
- for (r = 0; r < n; r++) {
- v = (r & 1 ? yabove_row[c + 1] : (yabove_row[c] + yabove_row[c + 1] + 1) >> 1);
- w = c - r / 2;
- ypred_ptr[r * y_stride + w] = v;
- }
- }
- r = 0;
- c = n - 1;
- ypred_ptr[c] = (ypred_ptr[(c - 1)] + yabove_row[c] + 1) >> 1;
- for (c = n - 2; c >= n / 2; --c) {
- h = r + (n - 1 - c) * 2;
- ypred_ptr[h * y_stride + c] = (ypred_ptr[h * y_stride + c - 1] +
- ypred_ptr[(h - 1) * y_stride + c] + 1) >> 1;
- }
- for (r = 1; r < n; r++) {
- for (c = n - 1; c >= n / 2 + r / 2; --c) {
- h = r + (n - 1 - c) * 2;
- ypred_ptr[h * y_stride + c] = (ypred_ptr[h * y_stride + c - 1] +
- ypred_ptr[(h - 1) * y_stride + c] + 1) >> 1;
- }
- }
-}
-
-static void d45_predictor(uint8_t *ypred_ptr, int y_stride, int n,
- uint8_t *yabove_row, uint8_t *yleft_col) {
- int r, c;
- for (r = 0; r < n - 1; ++r) {
- for (c = 0; c <= r; ++c) {
- ypred_ptr[(r - c) * y_stride + c] =
- (yabove_row[r + 1] * (c + 1) +
- yleft_col[r + 1] * (r - c + 1) + r / 2 + 1) / (r + 2);
- }
- }
- for (c = 0; c <= r; ++c) {
- int yabove_ext = yabove_row[r]; // 2*yabove_row[r] - yabove_row[r-1];
- int yleft_ext = yleft_col[r]; // 2*yleft_col[r] - yleft_col[r-1];
- yabove_ext = (yabove_ext > 255 ? 255 : (yabove_ext < 0 ? 0 : yabove_ext));
- yleft_ext = (yleft_ext > 255 ? 255 : (yleft_ext < 0 ? 0 : yleft_ext));
- ypred_ptr[(r - c) * y_stride + c] =
- (yabove_ext * (c + 1) +
- yleft_ext * (r - c + 1) + r / 2 + 1) / (r + 2);
- }
- for (r = 1; r < n; ++r) {
- for (c = n - r; c < n; ++c)
- ypred_ptr[r * y_stride + c] = (ypred_ptr[(r - 1) * y_stride + c] +
- ypred_ptr[r * y_stride + c - 1] + 1) >> 1;
- }
-}
-
-static void d117_predictor(uint8_t *ypred_ptr, int y_stride, int n,
- uint8_t *yabove_row, uint8_t *yleft_col) {
- int r, c;
- for (c = 0; c < n; c++)
- ypred_ptr[c] = (yabove_row[c - 1] + yabove_row[c] + 1) >> 1;
- ypred_ptr += y_stride;
- for (c = 0; c < n; c++)
- ypred_ptr[c] = yabove_row[c - 1];
- ypred_ptr += y_stride;
- for (r = 2; r < n; ++r) {
- ypred_ptr[0] = yleft_col[r - 2];
- for (c = 1; c < n; c++)
- ypred_ptr[c] = ypred_ptr[-2 * y_stride + c - 1];
- ypred_ptr += y_stride;
- }
-}
-
-static void d135_predictor(uint8_t *ypred_ptr, int y_stride, int n,
- uint8_t *yabove_row, uint8_t *yleft_col) {
- int r, c;
- ypred_ptr[0] = yabove_row[-1];
- for (c = 1; c < n; c++)
- ypred_ptr[c] = yabove_row[c - 1];
- for (r = 1; r < n; ++r)
- ypred_ptr[r * y_stride] = yleft_col[r - 1];
-
- ypred_ptr += y_stride;
- for (r = 1; r < n; ++r) {
- for (c = 1; c < n; c++) {
- ypred_ptr[c] = ypred_ptr[-y_stride + c - 1];
- }
- ypred_ptr += y_stride;
- }
-}
-
-static void d153_predictor(uint8_t *ypred_ptr, int y_stride, int n,
- uint8_t *yabove_row, uint8_t *yleft_col) {
- int r, c;
- ypred_ptr[0] = (yabove_row[-1] + yleft_col[0] + 1) >> 1;
- for (r = 1; r < n; r++)
- ypred_ptr[r * y_stride] = (yleft_col[r - 1] + yleft_col[r] + 1) >> 1;
- ypred_ptr++;
- ypred_ptr[0] = yabove_row[-1];
- for (r = 1; r < n; r++)
- ypred_ptr[r * y_stride] = yleft_col[r - 1];
- ypred_ptr++;
-
- for (c = 0; c < n - 2; c++)
- ypred_ptr[c] = yabove_row[c];
- ypred_ptr += y_stride;
- for (r = 1; r < n; ++r) {
- for (c = 0; c < n - 2; c++)
- ypred_ptr[c] = ypred_ptr[-y_stride + c - 2];
- ypred_ptr += y_stride;
- }
-}
-
-void vp9_recon_intra_mbuv(MACROBLOCKD *xd) {
- int i;
-
- for (i = 16; i < 24; i += 2) {
- BLOCKD *b = &xd->block[i];
- vp9_recon2b(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride);
- }
-}
-
-void vp9_build_intra_predictors_internal(unsigned char *src, int src_stride,
- unsigned char *ypred_ptr,
- int y_stride, int mode, int bsize,
- int up_available, int left_available) {
-
- unsigned char *yabove_row = src - src_stride;
- unsigned char yleft_col[32];
- unsigned char ytop_left = yabove_row[-1];
- int r, c, i;
-
- for (i = 0; i < bsize; i++) {
- yleft_col[i] = src[i * src_stride - 1];
- }
-
- /* for Y */
- switch (mode) {
- case DC_PRED: {
- int expected_dc;
- int i;
- int shift;
- int average = 0;
- int log2_bsize_minus_1;
-
- assert(bsize == 4 || bsize == 8 || bsize == 16 || bsize == 32);
- if (bsize == 4) {
- log2_bsize_minus_1 = 1;
- } else if (bsize == 8) {
- log2_bsize_minus_1 = 2;
- } else if (bsize == 16) {
- log2_bsize_minus_1 = 3;
- } else /* bsize == 32 */ {
- log2_bsize_minus_1 = 4;
- }
-
- if (up_available || left_available) {
- if (up_available) {
- for (i = 0; i < bsize; i++) {
- average += yabove_row[i];
- }
- }
-
- if (left_available) {
- for (i = 0; i < bsize; i++) {
- average += yleft_col[i];
- }
- }
- shift = log2_bsize_minus_1 + up_available + left_available;
- expected_dc = (average + (1 << (shift - 1))) >> shift;
- } else {
- expected_dc = 128;
- }
-
- for (r = 0; r < bsize; r++) {
- vpx_memset(ypred_ptr, expected_dc, bsize);
- ypred_ptr += y_stride;
- }
- }
- break;
- case V_PRED: {
- for (r = 0; r < bsize; r++) {
- memcpy(ypred_ptr, yabove_row, bsize);
- ypred_ptr += y_stride;
- }
- }
- break;
- case H_PRED: {
- for (r = 0; r < bsize; r++) {
- vpx_memset(ypred_ptr, yleft_col[r], bsize);
- ypred_ptr += y_stride;
- }
- }
- break;
- case TM_PRED: {
- for (r = 0; r < bsize; r++) {
- for (c = 0; c < bsize; c++) {
- int pred = yleft_col[r] + yabove_row[ c] - ytop_left;
-
- if (pred < 0)
- pred = 0;
-
- if (pred > 255)
- pred = 255;
-
- ypred_ptr[c] = pred;
- }
-
- ypred_ptr += y_stride;
- }
- }
- break;
- case D45_PRED: {
- d45_predictor(ypred_ptr, y_stride, bsize, yabove_row, yleft_col);
- }
- break;
- case D135_PRED: {
- d135_predictor(ypred_ptr, y_stride, bsize, yabove_row, yleft_col);
- }
- break;
- case D117_PRED: {
- d117_predictor(ypred_ptr, y_stride, bsize, yabove_row, yleft_col);
- }
- break;
- case D153_PRED: {
- d153_predictor(ypred_ptr, y_stride, bsize, yabove_row, yleft_col);
- }
- break;
- case D27_PRED: {
- d27_predictor(ypred_ptr, y_stride, bsize, yabove_row, yleft_col);
- }
- break;
- case D63_PRED: {
- d63_predictor(ypred_ptr, y_stride, bsize, yabove_row, yleft_col);
- }
- break;
- case I8X8_PRED:
- case B_PRED:
- case NEARESTMV:
- case NEARMV:
- case ZEROMV:
- case NEWMV:
- case SPLITMV:
- case MB_MODE_COUNT:
- break;
- }
-}
-
-void vp9_build_intra_predictors_mby(MACROBLOCKD *xd) {
- vp9_build_intra_predictors_internal(xd->dst.y_buffer, xd->dst.y_stride,
- xd->predictor, 16,
- xd->mode_info_context->mbmi.mode, 16,
- xd->up_available, xd->left_available);
-}
-
-void vp9_build_intra_predictors_mby_s(MACROBLOCKD *xd) {
- vp9_build_intra_predictors_internal(xd->dst.y_buffer, xd->dst.y_stride,
- xd->dst.y_buffer, xd->dst.y_stride,
- xd->mode_info_context->mbmi.mode, 16,
- xd->up_available, xd->left_available);
-}
-
-#if CONFIG_SUPERBLOCKS
-void vp9_build_intra_predictors_sby_s(MACROBLOCKD *xd) {
- vp9_build_intra_predictors_internal(xd->dst.y_buffer, xd->dst.y_stride,
- xd->dst.y_buffer, xd->dst.y_stride,
- xd->mode_info_context->mbmi.mode, 32,
- xd->up_available, xd->left_available);
-}
-#endif
-
-#if CONFIG_COMP_INTRA_PRED
-void vp9_build_comp_intra_predictors_mby(MACROBLOCKD *xd) {
- unsigned char predictor[2][256];
- int i;
-
- vp9_build_intra_predictors_internal(xd->dst.y_buffer, xd->dst.y_stride,
- predictor[0], 16,
- xd->mode_info_context->mbmi.mode,
- 16, xd->up_available,
- xd->left_available);
- vp9_build_intra_predictors_internal(xd->dst.y_buffer, xd->dst.y_stride,
- predictor[1], 16,
- xd->mode_info_context->mbmi.second_mode,
- 16, xd->up_available,
- xd->left_available);
-
- for (i = 0; i < 256; i++) {
- xd->predictor[i] = (predictor[0][i] + predictor[1][i] + 1) >> 1;
- }
-}
-#endif
-
-void vp9_build_intra_predictors_mbuv_internal(MACROBLOCKD *xd,
- unsigned char *upred_ptr,
- unsigned char *vpred_ptr,
- int uv_stride,
- int mode, int bsize) {
- vp9_build_intra_predictors_internal(xd->dst.u_buffer, xd->dst.uv_stride,
- upred_ptr, uv_stride, mode, bsize,
- xd->up_available, xd->left_available);
- vp9_build_intra_predictors_internal(xd->dst.v_buffer, xd->dst.uv_stride,
- vpred_ptr, uv_stride, mode, bsize,
- xd->up_available, xd->left_available);
-}
-
-void vp9_build_intra_predictors_mbuv(MACROBLOCKD *xd) {
- vp9_build_intra_predictors_mbuv_internal(xd, &xd->predictor[256],
- &xd->predictor[320], 8,
- xd->mode_info_context->mbmi.uv_mode,
- 8);
-}
-
-void vp9_build_intra_predictors_mbuv_s(MACROBLOCKD *xd) {
- vp9_build_intra_predictors_mbuv_internal(xd, xd->dst.u_buffer,
- xd->dst.v_buffer,
- xd->dst.uv_stride,
- xd->mode_info_context->mbmi.uv_mode,
- 8);
-}
-
-#if CONFIG_SUPERBLOCKS
-void vp9_build_intra_predictors_sbuv_s(MACROBLOCKD *xd) {
- vp9_build_intra_predictors_mbuv_internal(xd, xd->dst.u_buffer,
- xd->dst.v_buffer, xd->dst.uv_stride,
- xd->mode_info_context->mbmi.uv_mode,
- 16);
-}
-#endif
-
-#if CONFIG_COMP_INTRA_PRED
-void vp9_build_comp_intra_predictors_mbuv(MACROBLOCKD *xd) {
- unsigned char predictor[2][2][64];
- int i;
-
- vp9_build_intra_predictors_mbuv_internal(
- xd, predictor[0][0], predictor[1][0], 8,
- xd->mode_info_context->mbmi.uv_mode, 8);
- vp9_build_intra_predictors_mbuv_internal(
- xd, predictor[0][1], predictor[1][1], 8,
- xd->mode_info_context->mbmi.second_uv_mode, 8);
- for (i = 0; i < 64; i++) {
- xd->predictor[256 + i] = (predictor[0][0][i] + predictor[0][1][i] + 1) >> 1;
- xd->predictor[256 + 64 + i] = (predictor[1][0][i] +
- predictor[1][1][i] + 1) >> 1;
- }
-}
-#endif
-
-void vp9_intra8x8_predict(BLOCKD *xd,
- int mode,
- unsigned char *predictor) {
- vp9_build_intra_predictors_internal(*(xd->base_dst) + xd->dst,
- xd->dst_stride, predictor, 16,
- mode, 8, 1, 1);
-}
-
-#if CONFIG_COMP_INTRA_PRED
-void vp9_comp_intra8x8_predict(BLOCKD *xd,
- int mode, int second_mode,
- unsigned char *out_predictor) {
- unsigned char predictor[2][8 * 16];
- int i, j;
-
- vp9_intra8x8_predict(xd, mode, predictor[0]);
- vp9_intra8x8_predict(xd, second_mode, predictor[1]);
-
- for (i = 0; i < 8 * 16; i += 16) {
- for (j = i; j < i + 8; j++) {
- out_predictor[j] = (predictor[0][j] + predictor[1][j] + 1) >> 1;
- }
- }
-}
-#endif
-
-void vp9_intra_uv4x4_predict(BLOCKD *xd,
- int mode,
- unsigned char *predictor) {
- vp9_build_intra_predictors_internal(*(xd->base_dst) + xd->dst,
- xd->dst_stride, predictor, 8,
- mode, 4, 1, 1);
-}
-
-#if CONFIG_COMP_INTRA_PRED
-void vp9_comp_intra_uv4x4_predict(BLOCKD *xd,
- int mode, int mode2,
- unsigned char *out_predictor) {
- unsigned char predictor[2][8 * 4];
- int i, j;
-
- vp9_intra_uv4x4_predict(xd, mode, predictor[0]);
- vp9_intra_uv4x4_predict(xd, mode2, predictor[1]);
-
- for (i = 0; i < 4 * 8; i += 8) {
- for (j = i; j < i + 4; j++) {
- out_predictor[j] = (predictor[0][j] + predictor[1][j] + 1) >> 1;
- }
- }
-}
-#endif
-
-/* TODO: try different ways of using the Y-UV mode correlation.
-   The current code assumes that a UV 4x4 block uses the same mode
-   as the corresponding Y 8x8 area.
- */
--- a/vp8/common/reconintra.h
+++ /dev/null
@@ -1,18 +1,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#ifndef __INC_RECONINTRA_H
-#define __INC_RECONINTRA_H
-
-#include "blockd.h"
-
-extern void init_intra_left_above_pixels(MACROBLOCKD *xd);
-
-#endif // __INC_RECONINTRA_H
--- a/vp8/common/reconintra4x4.c
+++ /dev/null
@@ -1,321 +1,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include "vpx_ports/config.h"
-#include "vpx_mem/vpx_mem.h"
-#include "reconintra.h"
-#include "vpx_rtcd.h"
-
-void vp9_intra4x4_predict_c(BLOCKD *x, int b_mode,
- unsigned char *predictor) {
- int i, r, c;
-
- unsigned char *Above = *(x->base_dst) + x->dst - x->dst_stride;
- unsigned char Left[4];
- unsigned char top_left = Above[-1];
-
- Left[0] = (*(x->base_dst))[x->dst - 1];
- Left[1] = (*(x->base_dst))[x->dst - 1 + x->dst_stride];
- Left[2] = (*(x->base_dst))[x->dst - 1 + 2 * x->dst_stride];
- Left[3] = (*(x->base_dst))[x->dst - 1 + 3 * x->dst_stride];
-
- switch (b_mode) {
- case B_DC_PRED: {
- int expected_dc = 0;
-
- for (i = 0; i < 4; i++) {
- expected_dc += Above[i];
- expected_dc += Left[i];
- }
-
- expected_dc = (expected_dc + 4) >> 3;
-
- for (r = 0; r < 4; r++) {
- for (c = 0; c < 4; c++) {
- predictor[c] = expected_dc;
- }
-
- predictor += 16;
- }
- }
- break;
- case B_TM_PRED: {
- /* prediction similar to true_motion prediction */
- for (r = 0; r < 4; r++) {
- for (c = 0; c < 4; c++) {
- int pred = Above[c] - top_left + Left[r];
-
- if (pred < 0)
- pred = 0;
-
- if (pred > 255)
- pred = 255;
-
- predictor[c] = pred;
- }
-
- predictor += 16;
- }
- }
- break;
-
- case B_VE_PRED: {
-
- unsigned int ap[4];
- ap[0] = Above[0];
- ap[1] = Above[1];
- ap[2] = Above[2];
- ap[3] = Above[3];
-
- for (r = 0; r < 4; r++) {
- for (c = 0; c < 4; c++) {
-
- predictor[c] = ap[c];
- }
-
- predictor += 16;
- }
-
- }
- break;
-
-
- case B_HE_PRED: {
-
- unsigned int lp[4];
- lp[0] = Left[0];
- lp[1] = Left[1];
- lp[2] = Left[2];
- lp[3] = Left[3];
-
- for (r = 0; r < 4; r++) {
- for (c = 0; c < 4; c++) {
- predictor[c] = lp[r];
- }
-
- predictor += 16;
- }
- }
- break;
- case B_LD_PRED: {
- unsigned char *ptr = Above;
- predictor[0 * 16 + 0] = (ptr[0] + ptr[1] * 2 + ptr[2] + 2) >> 2;
- predictor[0 * 16 + 1] =
- predictor[1 * 16 + 0] = (ptr[1] + ptr[2] * 2 + ptr[3] + 2) >> 2;
- predictor[0 * 16 + 2] =
- predictor[1 * 16 + 1] =
- predictor[2 * 16 + 0] = (ptr[2] + ptr[3] * 2 + ptr[4] + 2) >> 2;
- predictor[0 * 16 + 3] =
- predictor[1 * 16 + 2] =
- predictor[2 * 16 + 1] =
- predictor[3 * 16 + 0] = (ptr[3] + ptr[4] * 2 + ptr[5] + 2) >> 2;
- predictor[1 * 16 + 3] =
- predictor[2 * 16 + 2] =
- predictor[3 * 16 + 1] = (ptr[4] + ptr[5] * 2 + ptr[6] + 2) >> 2;
- predictor[2 * 16 + 3] =
- predictor[3 * 16 + 2] = (ptr[5] + ptr[6] * 2 + ptr[7] + 2) >> 2;
- predictor[3 * 16 + 3] = (ptr[6] + ptr[7] * 2 + ptr[7] + 2) >> 2;
-
- }
- break;
- case B_RD_PRED: {
-
- unsigned char pp[9];
-
- pp[0] = Left[3];
- pp[1] = Left[2];
- pp[2] = Left[1];
- pp[3] = Left[0];
- pp[4] = top_left;
- pp[5] = Above[0];
- pp[6] = Above[1];
- pp[7] = Above[2];
- pp[8] = Above[3];
-
- predictor[3 * 16 + 0] = (pp[0] + pp[1] * 2 + pp[2] + 2) >> 2;
- predictor[3 * 16 + 1] =
- predictor[2 * 16 + 0] = (pp[1] + pp[2] * 2 + pp[3] + 2) >> 2;
- predictor[3 * 16 + 2] =
- predictor[2 * 16 + 1] =
- predictor[1 * 16 + 0] = (pp[2] + pp[3] * 2 + pp[4] + 2) >> 2;
- predictor[3 * 16 + 3] =
- predictor[2 * 16 + 2] =
- predictor[1 * 16 + 1] =
- predictor[0 * 16 + 0] = (pp[3] + pp[4] * 2 + pp[5] + 2) >> 2;
- predictor[2 * 16 + 3] =
- predictor[1 * 16 + 2] =
- predictor[0 * 16 + 1] = (pp[4] + pp[5] * 2 + pp[6] + 2) >> 2;
- predictor[1 * 16 + 3] =
- predictor[0 * 16 + 2] = (pp[5] + pp[6] * 2 + pp[7] + 2) >> 2;
- predictor[0 * 16 + 3] = (pp[6] + pp[7] * 2 + pp[8] + 2) >> 2;
-
- }
- break;
- case B_VR_PRED: {
-
- unsigned char pp[9];
-
- pp[0] = Left[3];
- pp[1] = Left[2];
- pp[2] = Left[1];
- pp[3] = Left[0];
- pp[4] = top_left;
- pp[5] = Above[0];
- pp[6] = Above[1];
- pp[7] = Above[2];
- pp[8] = Above[3];
-
-
- predictor[3 * 16 + 0] = (pp[1] + pp[2] * 2 + pp[3] + 2) >> 2;
- predictor[2 * 16 + 0] = (pp[2] + pp[3] * 2 + pp[4] + 2) >> 2;
- predictor[3 * 16 + 1] =
- predictor[1 * 16 + 0] = (pp[3] + pp[4] * 2 + pp[5] + 2) >> 2;
- predictor[2 * 16 + 1] =
- predictor[0 * 16 + 0] = (pp[4] + pp[5] + 1) >> 1;
- predictor[3 * 16 + 2] =
- predictor[1 * 16 + 1] = (pp[4] + pp[5] * 2 + pp[6] + 2) >> 2;
- predictor[2 * 16 + 2] =
- predictor[0 * 16 + 1] = (pp[5] + pp[6] + 1) >> 1;
- predictor[3 * 16 + 3] =
- predictor[1 * 16 + 2] = (pp[5] + pp[6] * 2 + pp[7] + 2) >> 2;
- predictor[2 * 16 + 3] =
- predictor[0 * 16 + 2] = (pp[6] + pp[7] + 1) >> 1;
- predictor[1 * 16 + 3] = (pp[6] + pp[7] * 2 + pp[8] + 2) >> 2;
- predictor[0 * 16 + 3] = (pp[7] + pp[8] + 1) >> 1;
-
- }
- break;
- case B_VL_PRED: {
-
- unsigned char *pp = Above;
-
- predictor[0 * 16 + 0] = (pp[0] + pp[1] + 1) >> 1;
- predictor[1 * 16 + 0] = (pp[0] + pp[1] * 2 + pp[2] + 2) >> 2;
- predictor[2 * 16 + 0] =
- predictor[0 * 16 + 1] = (pp[1] + pp[2] + 1) >> 1;
- predictor[1 * 16 + 1] =
- predictor[3 * 16 + 0] = (pp[1] + pp[2] * 2 + pp[3] + 2) >> 2;
- predictor[2 * 16 + 1] =
- predictor[0 * 16 + 2] = (pp[2] + pp[3] + 1) >> 1;
- predictor[3 * 16 + 1] =
- predictor[1 * 16 + 2] = (pp[2] + pp[3] * 2 + pp[4] + 2) >> 2;
- predictor[0 * 16 + 3] =
- predictor[2 * 16 + 2] = (pp[3] + pp[4] + 1) >> 1;
- predictor[1 * 16 + 3] =
- predictor[3 * 16 + 2] = (pp[3] + pp[4] * 2 + pp[5] + 2) >> 2;
- predictor[2 * 16 + 3] = (pp[4] + pp[5] * 2 + pp[6] + 2) >> 2;
- predictor[3 * 16 + 3] = (pp[5] + pp[6] * 2 + pp[7] + 2) >> 2;
- }
- break;
-
- case B_HD_PRED: {
- unsigned char pp[9];
- pp[0] = Left[3];
- pp[1] = Left[2];
- pp[2] = Left[1];
- pp[3] = Left[0];
- pp[4] = top_left;
- pp[5] = Above[0];
- pp[6] = Above[1];
- pp[7] = Above[2];
- pp[8] = Above[3];
-
-
- predictor[3 * 16 + 0] = (pp[0] + pp[1] + 1) >> 1;
- predictor[3 * 16 + 1] = (pp[0] + pp[1] * 2 + pp[2] + 2) >> 2;
- predictor[2 * 16 + 0] =
- predictor[3 * 16 + 2] = (pp[1] + pp[2] + 1) >> 1;
- predictor[2 * 16 + 1] =
- predictor[3 * 16 + 3] = (pp[1] + pp[2] * 2 + pp[3] + 2) >> 2;
- predictor[2 * 16 + 2] =
- predictor[1 * 16 + 0] = (pp[2] + pp[3] + 1) >> 1;
- predictor[2 * 16 + 3] =
- predictor[1 * 16 + 1] = (pp[2] + pp[3] * 2 + pp[4] + 2) >> 2;
- predictor[1 * 16 + 2] =
- predictor[0 * 16 + 0] = (pp[3] + pp[4] + 1) >> 1;
- predictor[1 * 16 + 3] =
- predictor[0 * 16 + 1] = (pp[3] + pp[4] * 2 + pp[5] + 2) >> 2;
- predictor[0 * 16 + 2] = (pp[4] + pp[5] * 2 + pp[6] + 2) >> 2;
- predictor[0 * 16 + 3] = (pp[5] + pp[6] * 2 + pp[7] + 2) >> 2;
- }
- break;
-
-
- case B_HU_PRED: {
- unsigned char *pp = Left;
- predictor[0 * 16 + 0] = (pp[0] + pp[1] + 1) >> 1;
- predictor[0 * 16 + 1] = (pp[0] + pp[1] * 2 + pp[2] + 2) >> 2;
- predictor[0 * 16 + 2] =
- predictor[1 * 16 + 0] = (pp[1] + pp[2] + 1) >> 1;
- predictor[0 * 16 + 3] =
- predictor[1 * 16 + 1] = (pp[1] + pp[2] * 2 + pp[3] + 2) >> 2;
- predictor[1 * 16 + 2] =
- predictor[2 * 16 + 0] = (pp[2] + pp[3] + 1) >> 1;
- predictor[1 * 16 + 3] =
- predictor[2 * 16 + 1] = (pp[2] + pp[3] * 2 + pp[3] + 2) >> 2;
- predictor[2 * 16 + 2] =
- predictor[2 * 16 + 3] =
- predictor[3 * 16 + 0] =
- predictor[3 * 16 + 1] =
- predictor[3 * 16 + 2] =
- predictor[3 * 16 + 3] = pp[3];
- }
- break;
-
-
- }
-}
-
-#if CONFIG_COMP_INTRA_PRED
-void vp9_comp_intra4x4_predict_c(BLOCKD *x,
- int b_mode, int b_mode2,
- unsigned char *out_predictor) {
- unsigned char predictor[2][4 * 16];
- int i, j;
-
- vp9_intra4x4_predict(x, b_mode, predictor[0]);
- vp9_intra4x4_predict(x, b_mode2, predictor[1]);
-
- for (i = 0; i < 16 * 4; i += 16) {
- for (j = i; j < i + 4; j++) {
- out_predictor[j] = (predictor[0][j] + predictor[1][j] + 1) >> 1;
- }
- }
-}
-#endif
-
-/* Copy 4 bytes from above-right downward so that the 4x4 prediction modes
- * that use pixels above and to the right have filled-in pixels to use.
- */
-void vp9_intra_prediction_down_copy(MACROBLOCKD *xd) {
- int extend_edge = (xd->mb_to_right_edge == 0 && xd->mb_index < 2);
- unsigned char *above_right = *(xd->block[0].base_dst) + xd->block[0].dst -
- xd->block[0].dst_stride + 16;
- unsigned int *src_ptr = (unsigned int *)
- (above_right - (xd->mb_index == 3 ? 16 * xd->block[0].dst_stride : 0));
-
- unsigned int *dst_ptr0 = (unsigned int *)above_right;
- unsigned int *dst_ptr1 =
- (unsigned int *)(above_right + 4 * xd->block[0].dst_stride);
- unsigned int *dst_ptr2 =
- (unsigned int *)(above_right + 8 * xd->block[0].dst_stride);
- unsigned int *dst_ptr3 =
- (unsigned int *)(above_right + 12 * xd->block[0].dst_stride);
-
- if (extend_edge) {
- *src_ptr = ((uint8_t *) src_ptr)[-1] * 0x01010101U;
- }
-
- *dst_ptr0 = *src_ptr;
- *dst_ptr1 = *src_ptr;
- *dst_ptr2 = *src_ptr;
- *dst_ptr3 = *src_ptr;
-}
--- a/vp8/common/reconintra4x4.h
+++ /dev/null
@@ -1,17 +1,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#ifndef __INC_RECONINTRA4x4_H
-#define __INC_RECONINTRA4x4_H
-
-extern void vp9_intra_prediction_down_copy(MACROBLOCKD *xd);
-
-#endif
--- a/vp8/common/rtcd.c
+++ /dev/null
@@ -1,105 +1,0 @@
-/*
- * Copyright (c) 2011 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-#include "vpx_config.h"
-#define RTCD_C
-#include "vpx_rtcd.h"
-
-#if CONFIG_MULTITHREAD && defined(_WIN32)
-#include <windows.h>
-#include <stdlib.h>
-static void once(void (*func)(void))
-{
- static CRITICAL_SECTION *lock;
- static LONG waiters;
- static int done;
- void *lock_ptr = &lock;
-
- /* If the initialization is complete, return early. This isn't just an
- * optimization; it prevents races on the destruction of the global
- * lock.
- */
- if(done)
- return;
-
- InterlockedIncrement(&waiters);
-
- /* Get a lock. We create one and try to make it the one-true-lock,
- * throwing it away if we lost the race.
- */
-
- {
- /* Scope to protect access to new_lock */
- CRITICAL_SECTION *new_lock = malloc(sizeof(CRITICAL_SECTION));
- InitializeCriticalSection(new_lock);
- if (InterlockedCompareExchangePointer(lock_ptr, new_lock, NULL) != NULL)
- {
- DeleteCriticalSection(new_lock);
- free(new_lock);
- }
- }
-
- /* At this point, we have a lock that can be synchronized on. We don't
- * care which thread actually performed the allocation.
- */
-
- EnterCriticalSection(lock);
-
- if (!done)
- {
- func();
- done = 1;
- }
-
- LeaveCriticalSection(lock);
-
- /* Last one out should free resources. The destroyed objects are
- * protected by checking if(done) above.
- */
- if(!InterlockedDecrement(&waiters))
- {
- DeleteCriticalSection(lock);
- free(lock);
- lock = NULL;
- }
-}
-
-
-#elif CONFIG_MULTITHREAD && HAVE_PTHREAD_H
-#include <pthread.h>
-static void once(void (*func)(void))
-{
- static pthread_once_t lock = PTHREAD_ONCE_INIT;
- pthread_once(&lock, func);
-}
-
-
-#else
-/* No-op version that performs no synchronization. vpx_rtcd() is idempotent,
- * so as long as your platform provides atomic loads/stores of pointers
- * no synchronization is strictly necessary.
- */
-
-static void once(void (*func)(void))
-{
- static int done;
-
- if(!done)
- {
- func();
- done = 1;
- }
-}
-#endif
-
-
-void vpx_rtcd()
-{
- once(setup_rtcd_internal);
-}
--- a/vp8/common/rtcd_defs.sh
+++ /dev/null
@@ -1,482 +1,0 @@
-common_forward_decls() {
-cat <<EOF
-
-struct loop_filter_info;
-struct blockd;
-struct macroblockd;
-struct loop_filter_info;
-
-/* Encoder forward decls */
-struct block;
-struct macroblock;
-struct variance_vtable;
-
-/* Encoder forward decls */
-struct variance_vtable;
-union int_mv;
-struct yv12_buffer_config;
-EOF
-}
-forward_decls common_forward_decls
-
-prototype void vp9_filter_block2d_4x4_8 "const unsigned char *src_ptr, const unsigned int src_stride, const short *HFilter_aligned16, const short *VFilter_aligned16, unsigned char *dst_ptr, unsigned int dst_stride"
-prototype void vp9_filter_block2d_8x4_8 "const unsigned char *src_ptr, const unsigned int src_stride, const short *HFilter_aligned16, const short *VFilter_aligned16, unsigned char *dst_ptr, unsigned int dst_stride"
-prototype void vp9_filter_block2d_8x8_8 "const unsigned char *src_ptr, const unsigned int src_stride, const short *HFilter_aligned16, const short *VFilter_aligned16, unsigned char *dst_ptr, unsigned int dst_stride"
-prototype void vp9_filter_block2d_16x16_8 "const unsigned char *src_ptr, const unsigned int src_stride, const short *HFilter_aligned16, const short *VFilter_aligned16, unsigned char *dst_ptr, unsigned int dst_stride"
-
-# At the very least, MSVC 2008 has a compiler bug exhibited by this code; the
-# code compiles warning-free, but a disassembly of the generated code shows
-# bugs. To be on the safe side, this is only enabled when compiling with 'gcc'.
-if [ "$CONFIG_GCC" = "yes" ]; then
- specialize vp9_filter_block2d_4x4_8 sse4_1 sse2
-fi
- specialize vp9_filter_block2d_8x4_8 ssse3 #sse4_1 sse2
- specialize vp9_filter_block2d_8x8_8 ssse3 #sse4_1 sse2
- specialize vp9_filter_block2d_16x16_8 ssse3 #sse4_1 sse2
-
-#
-# Dequant
-#
-prototype void vp9_dequantize_b "struct blockd *x"
-specialize vp9_dequantize_b mmx
-
-prototype void vp9_dequantize_b_2x2 "struct blockd *x"
-specialize vp9_dequantize_b_2x2
-
-prototype void vp9_dequant_dc_idct_add_y_block_8x8 "short *q, short *dq, unsigned char *pre, unsigned char *dst, int stride, char *eobs, short *dc, struct macroblockd *xd"
-specialize vp9_dequant_dc_idct_add_y_block_8x8
-
-prototype void vp9_dequant_idct_add_y_block_8x8 "short *q, short *dq, unsigned char *pre, unsigned char *dst, int stride, char *eobs, struct macroblockd *xd"
-specialize vp9_dequant_idct_add_y_block_8x8
-
-prototype void vp9_dequant_idct_add_uv_block_8x8 "short *q, short *dq, unsigned char *pre, unsigned char *dstu, unsigned char *dstv, int stride, char *eobs, struct macroblockd *xd"
-specialize vp9_dequant_idct_add_uv_block_8x8
-
-prototype void vp9_dequant_idct_add_16x16 "short *input, short *dq, unsigned char *pred, unsigned char *dest, int pitch, int stride"
-specialize vp9_dequant_idct_add_16x16
-
-prototype void vp9_dequant_idct_add "short *input, short *dq, unsigned char *pred, unsigned char *dest, int pitch, int stride"
-specialize vp9_dequant_idct_add
-
-prototype void vp9_dequant_dc_idct_add "short *input, short *dq, unsigned char *pred, unsigned char *dest, int pitch, int stride, int Dc"
-specialize vp9_dequant_dc_idct_add
-
-prototype void vp9_dequant_dc_idct_add_y_block "short *q, short *dq, unsigned char *pre, unsigned char *dst, int stride, char *eobs, short *dc"
-specialize vp9_dequant_dc_idct_add_y_block mmx
-
-prototype void vp9_dequant_idct_add_y_block "short *q, short *dq, unsigned char *pre, unsigned char *dst, int stride, char *eobs"
-specialize vp9_dequant_idct_add_y_block mmx
-
-prototype void vp9_dequant_idct_add_uv_block "short *q, short *dq, unsigned char *pre, unsigned char *dstu, unsigned char *dstv, int stride, char *eobs"
-specialize vp9_dequant_idct_add_uv_block mmx
-
-#
-# RECON
-#
-prototype void vp9_copy_mem16x16 "unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch"
-specialize vp9_copy_mem16x16 mmx sse2 media neon dspr2
-vp9_copy_mem16x16_media=vp9_copy_mem16x16_v6
-vp9_copy_mem16x16_dspr2=vp9_copy_mem16x16_dspr2
-
-prototype void vp9_copy_mem8x8 "unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch"
-specialize vp9_copy_mem8x8 mmx media neon dspr2
-vp9_copy_mem8x8_media=vp9_copy_mem8x8_v6
-vp9_copy_mem8x8_dspr2=vp9_copy_mem8x8_dspr2
-
-prototype void vp9_copy_mem8x4 "unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch"
-specialize vp9_copy_mem8x4 mmx
-
-prototype void vp9_intra4x4_predict "unsigned char *Above, unsigned char *yleft, int left_stride, B_PREDICTION_MODE b_mode, unsigned char *dst, int dst_stride, unsigned char top_left"
-specialize vp9_intra4x4_predict
-
-prototype void vp9_avg_mem16x16 "unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch"
-specialize vp9_avg_mem16x16
-
-prototype void vp9_avg_mem8x8 "unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch"
-specialize vp9_avg_mem8x8
-
-prototype void vp9_copy_mem8x4 "unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch"
-specialize vp9_copy_mem8x4 mmx media neon dspr2
-vp9_copy_mem8x4_media=vp9_copy_mem8x4_v6
-vp9_copy_mem8x4_dspr2=vp9_copy_mem8x4_dspr2
-
-prototype void vp9_recon_b "unsigned char *pred_ptr, short *diff_ptr, unsigned char *dst_ptr, int stride"
-specialize vp9_recon_b
-
-prototype void vp9_recon_uv_b "unsigned char *pred_ptr, short *diff_ptr, unsigned char *dst_ptr, int stride"
-specialize vp9_recon_uv_b
-
-prototype void vp9_recon2b "unsigned char *pred_ptr, short *diff_ptr, unsigned char *dst_ptr, int stride"
-specialize vp9_recon2b sse2
-
-prototype void vp9_recon4b "unsigned char *pred_ptr, short *diff_ptr, unsigned char *dst_ptr, int stride"
-specialize vp9_recon4b sse2
-
-prototype void vp9_recon_mb "struct macroblockd *x"
-specialize vp9_recon_mb
-
-prototype void vp9_recon_mby "struct macroblockd *x"
-specialize vp9_recon_mby
-
-prototype void vp9_build_intra_predictors_mby_s "struct macroblockd *x"
-specialize vp9_build_intra_predictors_mby_s
-
-prototype void vp9_build_intra_predictors_sby_s "struct macroblockd *x"
-specialize vp9_build_intra_predictors_sby_s;
-
-prototype void vp9_build_intra_predictors_sbuv_s "struct macroblockd *x"
-specialize vp9_build_intra_predictors_sbuv_s;
-
-prototype void vp9_build_intra_predictors_mby "struct macroblockd *x"
-specialize vp9_build_intra_predictors_mby;
-
-prototype void vp9_build_comp_intra_predictors_mby "struct macroblockd *x"
-specialize vp9_build_comp_intra_predictors_mby;
-
-prototype void vp9_build_intra_predictors_mby_s "struct macroblockd *x"
-specialize vp9_build_intra_predictors_mby_s;
-
-prototype void vp9_build_intra_predictors_mbuv "struct macroblockd *x"
-specialize vp9_build_intra_predictors_mbuv;
-
-prototype void vp9_build_intra_predictors_mbuv_s "struct macroblockd *x"
-specialize vp9_build_intra_predictors_mbuv_s;
-
-prototype void vp9_build_comp_intra_predictors_mbuv "struct macroblockd *x"
-specialize vp9_build_comp_intra_predictors_mbuv;
-
-prototype void vp9_intra4x4_predict "struct blockd *x, int b_mode, unsigned char *predictor"
-specialize vp9_intra4x4_predict;
-
-prototype void vp9_comp_intra4x4_predict "struct blockd *x, int b_mode, int second_mode, unsigned char *predictor"
-specialize vp9_comp_intra4x4_predict;
-
-prototype void vp9_intra8x8_predict "struct blockd *x, int b_mode, unsigned char *predictor"
-specialize vp9_intra8x8_predict;
-
-prototype void vp9_comp_intra8x8_predict "struct blockd *x, int b_mode, int second_mode, unsigned char *predictor"
-specialize vp9_comp_intra8x8_predict;
-
-prototype void vp9_intra_uv4x4_predict "struct blockd *x, int b_mode, unsigned char *predictor"
-specialize vp9_intra_uv4x4_predict;
-
-prototype void vp9_comp_intra_uv4x4_predict "struct blockd *x, int b_mode, int second_mode, unsigned char *predictor"
-specialize vp9_comp_intra_uv4x4_predict;
-
-#
-# Loopfilter
-#
-prototype void vp9_loop_filter_mbv "unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi"
-specialize vp9_loop_filter_mbv sse2
-
-prototype void vp9_loop_filter_bv "unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi"
-specialize vp9_loop_filter_bv sse2
-
-prototype void vp9_loop_filter_bv8x8 "unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi"
-specialize vp9_loop_filter_bv8x8 sse2
-
-prototype void vp9_loop_filter_mbh "unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi"
-specialize vp9_loop_filter_mbh sse2
-
-prototype void vp9_loop_filter_bh "unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi"
-specialize vp9_loop_filter_bh sse2
-
-prototype void vp9_loop_filter_bh8x8 "unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi"
-specialize vp9_loop_filter_bh8x8 sse2
-
-prototype void vp9_loop_filter_simple_mbv "unsigned char *y, int ystride, const unsigned char *blimit"
-specialize vp9_loop_filter_simple_mbv mmx sse2 media neon
-vp9_loop_filter_simple_mbv_c=vp9_loop_filter_simple_vertical_edge_c
-vp9_loop_filter_simple_mbv_mmx=vp9_loop_filter_simple_vertical_edge_mmx
-vp9_loop_filter_simple_mbv_sse2=vp9_loop_filter_simple_vertical_edge_sse2
-vp9_loop_filter_simple_mbv_media=vp9_loop_filter_simple_vertical_edge_armv6
-vp9_loop_filter_simple_mbv_neon=vp9_loop_filter_mbvs_neon
-
-prototype void vp9_loop_filter_simple_mbh "unsigned char *y, int ystride, const unsigned char *blimit"
-specialize vp9_loop_filter_simple_mbh mmx sse2 media neon
-vp9_loop_filter_simple_mbh_c=vp9_loop_filter_simple_horizontal_edge_c
-vp9_loop_filter_simple_mbh_mmx=vp9_loop_filter_simple_horizontal_edge_mmx
-vp9_loop_filter_simple_mbh_sse2=vp9_loop_filter_simple_horizontal_edge_sse2
-vp9_loop_filter_simple_mbh_media=vp9_loop_filter_simple_horizontal_edge_armv6
-vp9_loop_filter_simple_mbh_neon=vp9_loop_filter_mbhs_neon
-
-prototype void vp9_loop_filter_simple_bv "unsigned char *y, int ystride, const unsigned char *blimit"
-specialize vp9_loop_filter_simple_bv mmx sse2 media neon
-vp9_loop_filter_simple_bv_c=vp9_loop_filter_bvs_c
-vp9_loop_filter_simple_bv_mmx=vp9_loop_filter_bvs_mmx
-vp9_loop_filter_simple_bv_sse2=vp9_loop_filter_bvs_sse2
-vp9_loop_filter_simple_bv_media=vp9_loop_filter_bvs_armv6
-vp9_loop_filter_simple_bv_neon=vp9_loop_filter_bvs_neon
-
-prototype void vp9_loop_filter_simple_bh "unsigned char *y, int ystride, const unsigned char *blimit"
-specialize vp9_loop_filter_simple_bh mmx sse2 media neon
-vp9_loop_filter_simple_bh_c=vp9_loop_filter_bhs_c
-vp9_loop_filter_simple_bh_mmx=vp9_loop_filter_bhs_mmx
-vp9_loop_filter_simple_bh_sse2=vp9_loop_filter_bhs_sse2
-vp9_loop_filter_simple_bh_media=vp9_loop_filter_bhs_armv6
-vp9_loop_filter_simple_bh_neon=vp9_loop_filter_bhs_neon
-
-#
-# sad 16x3, 3x16
-#
-if [ "$CONFIG_NEWBESTREFMV" = "yes" ]; then
-prototype unsigned int vp9_sad16x3 "const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, int max_sad"
-specialize vp9_sad16x3 sse2
-
-prototype unsigned int vp9_sad3x16 "const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, int max_sad"
-specialize vp9_sad3x16 sse2
-fi
-
-#
-# Encoder functions below this point.
-#
-if [ "$CONFIG_VP8_ENCODER" = "yes" ]; then
-
-
-# variance
-[ $arch = "x86_64" ] && mmx_x86_64=mmx && sse2_x86_64=sse2
-
-prototype unsigned int vp9_variance32x32 "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse"
-specialize vp9_variance32x32
-
-prototype unsigned int vp9_variance16x16 "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse"
-specialize vp9_variance16x16 mmx sse2
-vp9_variance16x16_sse2=vp9_variance16x16_wmt
-vp9_variance16x16_mmx=vp9_variance16x16_mmx
-
-prototype unsigned int vp9_variance16x8 "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse"
-specialize vp9_variance16x8 mmx sse2
-vp9_variance16x8_sse2=vp9_variance16x8_wmt
-vp9_variance16x8_mmx=vp9_variance16x8_mmx
-
-prototype unsigned int vp9_variance8x16 "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse"
-specialize vp9_variance8x16 mmx sse2
-vp9_variance8x16_sse2=vp9_variance8x16_wmt
-vp9_variance8x16_mmx=vp9_variance8x16_mmx
-
-prototype unsigned int vp9_variance8x8 "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse"
-specialize vp9_variance8x8 mmx sse2
-vp9_variance8x8_sse2=vp9_variance8x8_wmt
-vp9_variance8x8_mmx=vp9_variance8x8_mmx
-
-prototype unsigned int vp9_variance4x4 "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse"
-specialize vp9_variance4x4 mmx sse2
-vp9_variance4x4_sse2=vp9_variance4x4_wmt
-vp9_variance4x4_mmx=vp9_variance4x4_mmx
-
-prototype unsigned int vp9_sub_pixel_variance32x32 "const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse"
-specialize vp9_sub_pixel_variance32x32
-
-prototype unsigned int vp9_sub_pixel_variance16x16 "const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse"
-specialize vp9_sub_pixel_variance16x16 sse2 mmx ssse3
-vp9_sub_pixel_variance16x16_sse2=vp9_sub_pixel_variance16x16_wmt
-
-prototype unsigned int vp9_sub_pixel_variance8x16 "const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse"
-specialize vp9_sub_pixel_variance8x16 sse2 mmx
-vp9_sub_pixel_variance8x16_sse2=vp9_sub_pixel_variance8x16_wmt
-
-prototype unsigned int vp9_sub_pixel_variance16x8 "const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse"
-specialize vp9_sub_pixel_variance16x8 sse2 mmx ssse3
-vp9_sub_pixel_variance16x8_sse2=vp9_sub_pixel_variance16x8_ssse3;
-vp9_sub_pixel_variance16x8_sse2=vp9_sub_pixel_variance16x8_wmt
-
-prototype unsigned int vp9_sub_pixel_variance8x8 "const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse"
-specialize vp9_sub_pixel_variance8x8 sse2 mmx
-vp9_sub_pixel_variance8x8_sse2=vp9_sub_pixel_variance8x8_wmt
-
-prototype unsigned int vp9_sub_pixel_variance4x4 "const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse"
-specialize vp9_sub_pixel_variance4x4 sse2 mmx
-vp9_sub_pixel_variance4x4_sse2=vp9_sub_pixel_variance4x4_wmt
-
-prototype unsigned int vp9_sad32x32 "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad"
-specialize vp9_sad32x32
-
-prototype unsigned int vp9_sad16x16 "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad"
-specialize vp9_sad16x16 mmx sse2 sse3
-vp9_sad16x16_sse2=vp9_sad16x16_wmt
-
-prototype unsigned int vp9_sad16x8 "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad"
-specialize vp9_sad16x8 mmx sse2
-vp9_sad16x8_sse2=vp9_sad16x8_wmt
-
-prototype unsigned int vp9_sad8x16 "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad"
-specialize vp9_sad8x16 mmx sse2
-vp9_sad8x16_sse2=vp9_sad8x16_wmt
-
-prototype unsigned int vp9_sad8x8 "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad"
-specialize vp9_sad8x8 mmx sse2
-vp9_sad8x8_sse2=vp9_sad8x8_wmt
-
-prototype unsigned int vp9_sad4x4 "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad"
-specialize vp9_sad4x4 mmx sse2
-vp9_sad4x4_sse2=vp9_sad4x4_wmt
-
-prototype unsigned int vp9_variance_halfpixvar16x16_h "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse"
-specialize vp9_variance_halfpixvar16x16_h mmx sse2
-vp9_variance_halfpixvar16x16_h_sse2=vp9_variance_halfpixvar16x16_h_wmt
-
-prototype unsigned int vp9_variance_halfpixvar16x16_v "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse"
-specialize vp9_variance_halfpixvar16x16_v mmx sse2
-vp9_variance_halfpixvar16x16_v_sse2=vp9_variance_halfpixvar16x16_v_wmt
-
-prototype unsigned int vp9_variance_halfpixvar16x16_hv "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse"
-specialize vp9_variance_halfpixvar16x16_hv mmx sse2
-vp9_variance_halfpixvar16x16_hv_sse2=vp9_variance_halfpixvar16x16_hv_wmt
-
-prototype unsigned int vp9_variance_halfpixvar32x32_h "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse"
-specialize vp9_variance_halfpixvar32x32_h
-
-prototype unsigned int vp9_variance_halfpixvar32x32_v "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse"
-specialize vp9_variance_halfpixvar32x32_v
-
-prototype unsigned int vp9_variance_halfpixvar32x32_hv "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse"
-specialize vp9_variance_halfpixvar32x32_hv
-
-prototype void vp9_sad32x32x3 "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sad_array"
-specialize vp9_sad32x32x3
-
-prototype void vp9_sad16x16x3 "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sad_array"
-specialize vp9_sad16x16x3 sse3 ssse3
-
-prototype void vp9_sad16x8x3 "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sad_array"
-specialize vp9_sad16x8x3 sse3 ssse3
-
-prototype void vp9_sad8x16x3 "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sad_array"
-specialize vp9_sad8x16x3 sse3
-
-prototype void vp9_sad8x8x3 "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sad_array"
-specialize vp9_sad8x8x3 sse3
-
-prototype void vp9_sad4x4x3 "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sad_array"
-specialize vp9_sad4x4x3 sse3
-
-prototype void vp9_sad32x32x8 "const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned short *sad_array"
-specialize vp9_sad32x32x8
-
-prototype void vp9_sad16x16x8 "const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned short *sad_array"
-specialize vp9_sad16x16x8 sse4
-
-prototype void vp9_sad16x8x8 "const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned short *sad_array"
-specialize vp9_sad16x8x8 sse4
-
-prototype void vp9_sad8x16x8 "const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned short *sad_array"
-specialize vp9_sad8x16x8 sse4
-
-prototype void vp9_sad8x8x8 "const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned short *sad_array"
-specialize vp9_sad8x8x8 sse4
-
-prototype void vp9_sad4x4x8 "const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned short *sad_array"
-specialize vp9_sad4x4x8 sse4
-
-prototype void vp9_sad32x32x4d "const unsigned char *src_ptr, int src_stride, unsigned char *ref_ptr[], int ref_stride, unsigned int *sad_array"
-specialize vp9_sad32x32x4d
-
-prototype void vp9_sad16x16x4d "const unsigned char *src_ptr, int src_stride, unsigned char *ref_ptr[], int ref_stride, unsigned int *sad_array"
-specialize vp9_sad16x16x4d sse3
-
-prototype void vp9_sad16x8x4d "const unsigned char *src_ptr, int src_stride, unsigned char *ref_ptr[], int ref_stride, unsigned int *sad_array"
-specialize vp9_sad16x8x4d sse3
-
-prototype void vp9_sad8x16x4d "const unsigned char *src_ptr, int src_stride, unsigned char *ref_ptr[], int ref_stride, unsigned int *sad_array"
-specialize vp9_sad8x16x4d sse3
-
-prototype void vp9_sad8x8x4d "const unsigned char *src_ptr, int src_stride, unsigned char *ref_ptr[], int ref_stride, unsigned int *sad_array"
-specialize vp9_sad8x8x4d sse3
-
-prototype void vp9_sad4x4x4d "const unsigned char *src_ptr, int src_stride, unsigned char *ref_ptr[], int ref_stride, unsigned int *sad_array"
-specialize vp9_sad4x4x4d sse3
-
-#
-# Block copy
-#
-case $arch in
- x86*)
- prototype void vp9_copy32xn "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, int n"
- specialize vp9_copy32xn sse2 sse3
- ;;
-esac
-
-prototype unsigned int vp9_sub_pixel_mse16x16 "const unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, const unsigned char *dst_ptr, int dst_pixels_per_line, unsigned int *sse"
-specialize vp9_sub_pixel_mse16x16 sse2 mmx
-vp9_sub_pixel_mse16x16_sse2=vp9_sub_pixel_mse16x16_wmt
-
-prototype unsigned int vp9_mse16x16 "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int recon_stride, unsigned int *sse"
-specialize vp9_mse16x16 mmx sse2
-vp9_mse16x16_sse2=vp9_mse16x16_wmt
-
-prototype unsigned int vp9_sub_pixel_mse32x32 "const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse"
-specialize vp9_sub_pixel_mse32x32
-
-prototype unsigned int vp9_get_mb_ss "const short *"
-specialize vp9_get_mb_ss mmx sse2
-# ENCODEMB INVOKE
-prototype int vp9_mbblock_error "struct macroblock *mb, int dc"
-specialize vp9_mbblock_error mmx sse2
-vp9_mbblock_error_sse2=vp9_mbblock_error_xmm
-
-prototype int vp9_block_error "short *coeff, short *dqcoeff, int block_size"
-specialize vp9_block_error mmx sse2
-vp9_block_error_sse2=vp9_block_error_xmm
-
-prototype void vp9_subtract_b "struct block *be, struct blockd *bd, int pitch"
-specialize vp9_subtract_b mmx sse2
-
-prototype int vp9_mbuverror "struct macroblock *mb"
-specialize vp9_mbuverror mmx sse2
-vp9_mbuverror_sse2=vp9_mbuverror_xmm
-
-prototype void vp9_subtract_b "struct block *be, struct blockd *bd, int pitch"
-specialize vp9_subtract_b mmx sse2
-
-prototype void vp9_subtract_mby "short *diff, unsigned char *src, unsigned char *pred, int stride"
-specialize vp9_subtract_mby mmx sse2
-
-prototype void vp9_subtract_mbuv "short *diff, unsigned char *usrc, unsigned char *vsrc, unsigned char *pred, int stride"
-specialize vp9_subtract_mbuv mmx sse2
-
-#
-# Structured Similarity (SSIM)
-#
-if [ "$CONFIG_INTERNAL_STATS" = "yes" ]; then
- [ $arch = "x86_64" ] && sse2_on_x86_64=sse2
-
- prototype void vp9_ssim_parms_8x8 "unsigned char *s, int sp, unsigned char *r, int rp, unsigned long *sum_s, unsigned long *sum_r, unsigned long *sum_sq_s, unsigned long *sum_sq_r, unsigned long *sum_sxr"
- specialize vp9_ssim_parms_8x8 $sse2_on_x86_64
-
- prototype void vp9_ssim_parms_16x16 "unsigned char *s, int sp, unsigned char *r, int rp, unsigned long *sum_s, unsigned long *sum_r, unsigned long *sum_sq_s, unsigned long *sum_sq_r, unsigned long *sum_sxr"
- specialize vp9_ssim_parms_16x16 $sse2_on_x86_64
-fi
-
-# fdct functions
-prototype void vp9_fht "const short *input, int pitch, short *output, int tx_type, int tx_dim"
-specialize vp9_fht
-
-prototype void vp9_short_fdct8x8 "short *InputData, short *OutputData, int pitch"
-specialize vp9_short_fdct8x8
-
-prototype void vp9_short_fhaar2x2 "short *InputData, short *OutputData, int pitch"
-specialize vp9_short_fhaar2x2
-
-prototype void vp9_short_fdct4x4 "short *InputData, short *OutputData, int pitch"
-specialize vp9_short_fdct4x4
-
-prototype void vp9_short_fdct8x4 "short *InputData, short *OutputData, int pitch"
-specialize vp9_short_fdct8x4
-
-prototype void vp9_short_walsh4x4 "short *InputData, short *OutputData, int pitch"
-specialize vp9_short_walsh4x4
-
-prototype void vp9_short_fdct16x16 "short *InputData, short *OutputData, int pitch"
-specialize vp9_short_fdct16x16
-
-prototype void vp9_short_walsh4x4_lossless "short *InputData, short *OutputData, int pitch"
-specialize vp9_short_walsh4x4_lossless
-
-prototype void vp9_short_walsh4x4_x8 "short *InputData, short *OutputData, int pitch"
-specialize vp9_short_walsh4x4_x8
-
-prototype void vp9_short_walsh8x4_x8 "short *InputData, short *OutputData, int pitch"
-specialize vp9_short_walsh8x4_x8
-
-fi
-# end encoder functions
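
The prototype/specialize entries above feed libvpx's RTCD code generator, which emits a header containing one declaration per specialized variant plus either a static #define or a run-time function pointer. As a hedged sketch (the generated names and the RTCD_EXTERN layout follow the usual libvpx pattern and are assumed here, not taken from this change), "specialize vp9_dequantize_b mmx" would expand to roughly:

struct blockd;

void vp9_dequantize_b_c(struct blockd *x);
void vp9_dequantize_b_mmx(struct blockd *x);

#if CONFIG_RUNTIME_CPU_DETECT
/* setup_rtcd_internal() points this at the best variant for the host CPU. */
RTCD_EXTERN void (*vp9_dequantize_b)(struct blockd *x);
#else
#define vp9_dequantize_b vp9_dequantize_b_c
#endif

Lines of the form vp9_sad16x16_sse2=vp9_sad16x16_wmt simply map a specialization onto an existing symbol instead of the default <name>_<isa> naming convention.
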
--- a/vp8/common/sadmxn.h
+++ /dev/null
@@ -1,37 +1,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#ifndef __INC_SAD_H
-#define __INC_SAD_H
-
-static __inline
-unsigned int sad_mx_n_c(
- const unsigned char *src_ptr,
- int src_stride,
- const unsigned char *ref_ptr,
- int ref_stride,
- int m,
- int n) {
- int r, c;
- unsigned int sad = 0;
-
- for (r = 0; r < n; r++) {
- for (c = 0; c < m; c++) {
- sad += abs(src_ptr[c] - ref_ptr[c]);
- }
-
- src_ptr += src_stride;
- ref_ptr += ref_stride;
- }
-
- return sad;
-}
-
-#endif
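
The generic helper above is the reference implementation for every block size; callers supply m (width) and n (height). A minimal, hedged usage sketch for a 16x16 SAD (the include path assumes the post-rename vp9/ location):

#include <stdlib.h>                   /* abs(), used by sad_mx_n_c() */
#include "vp9/common/sadmxn.h"

/* Sketch only: a plain-C 16x16 SAD built on the generic helper. */
static unsigned int example_sad16x16(const unsigned char *src, int src_stride,
                                     const unsigned char *ref, int ref_stride) {
  return sad_mx_n_c(src, src_stride, ref, ref_stride, 16, 16);
}
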
--- a/vp8/common/seg_common.c
+++ /dev/null
@@ -1,103 +1,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "vp8/common/seg_common.h"
-
-static const int segfeaturedata_signed[SEG_LVL_MAX] = { 1, 1, 0, 0, 0, 0 };
-static const int seg_feature_data_bits[SEG_LVL_MAX] = { QINDEX_BITS, 6, 4, 4, 6, 2 };
-
-// These functions provide access to new segment level features.
-// Eventually these functions may be "optimized out" but for the moment,
-// the coding mechanism is still subject to change, so these provide a
-// convenient single point of change.
-
-int vp9_segfeature_active(const MACROBLOCKD *xd,
- int segment_id,
- SEG_LVL_FEATURES feature_id) {
- // Return true if mask bit set and segmentation enabled.
- return (xd->segmentation_enabled &&
- (xd->segment_feature_mask[segment_id] &
- (0x01 << feature_id)));
-}
-
-void vp9_clearall_segfeatures(MACROBLOCKD *xd) {
- vpx_memset(xd->segment_feature_data, 0, sizeof(xd->segment_feature_data));
- vpx_memset(xd->segment_feature_mask, 0, sizeof(xd->segment_feature_mask));
-}
-
-void vp9_enable_segfeature(MACROBLOCKD *xd,
- int segment_id,
- SEG_LVL_FEATURES feature_id) {
- xd->segment_feature_mask[segment_id] |= (0x01 << feature_id);
-}
-
-void vp9_disable_segfeature(MACROBLOCKD *xd,
- int segment_id,
- SEG_LVL_FEATURES feature_id) {
- xd->segment_feature_mask[segment_id] &= ~(1 << feature_id);
-}
-
-int vp9_seg_feature_data_bits(SEG_LVL_FEATURES feature_id) {
- return seg_feature_data_bits[feature_id];
-}
-
-int vp9_is_segfeature_signed(SEG_LVL_FEATURES feature_id) {
- return (segfeaturedata_signed[feature_id]);
-}
-
-void vp9_clear_segdata(MACROBLOCKD *xd,
- int segment_id,
- SEG_LVL_FEATURES feature_id) {
- xd->segment_feature_data[segment_id][feature_id] = 0;
-}
-
-void vp9_set_segdata(MACROBLOCKD *xd,
- int segment_id,
- SEG_LVL_FEATURES feature_id,
- int seg_data) {
- xd->segment_feature_data[segment_id][feature_id] = seg_data;
-}
-
-int vp9_get_segdata(const MACROBLOCKD *xd,
- int segment_id,
- SEG_LVL_FEATURES feature_id) {
- return xd->segment_feature_data[segment_id][feature_id];
-}
-
-void vp9_clear_segref(MACROBLOCKD *xd, int segment_id) {
- xd->segment_feature_data[segment_id][SEG_LVL_REF_FRAME] = 0;
-}
-
-void vp9_set_segref(MACROBLOCKD *xd,
- int segment_id,
- MV_REFERENCE_FRAME ref_frame) {
- xd->segment_feature_data[segment_id][SEG_LVL_REF_FRAME] |=
- (1 << ref_frame);
-}
-
-int vp9_check_segref(const MACROBLOCKD *xd,
- int segment_id,
- MV_REFERENCE_FRAME ref_frame) {
- return (xd->segment_feature_data[segment_id][SEG_LVL_REF_FRAME] &
- (1 << ref_frame)) ? 1 : 0;
-}
-
-int vp9_check_segref_inter(MACROBLOCKD *xd, int segment_id) {
- return (xd->segment_feature_data[segment_id][SEG_LVL_REF_FRAME] &
- ~(1 << INTRA_FRAME)) ? 1 : 0;
-}
-
-int vp9_get_seg_tx_type(MACROBLOCKD *xd, int segment_id) {
- if (vp9_segfeature_active(xd, segment_id, SEG_LVL_TRANSFORM))
- return vp9_get_segdata(xd, segment_id, SEG_LVL_TRANSFORM);
- else
- return TX_4X4;
-}
-// TBD? Functions to read and write segment data with range / validity checking
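
The accessors above gate everything on the per-segment feature mask together with xd->segmentation_enabled. A hedged sketch of the intended call sequence (SEG_LVL_ALT_Q is assumed to be one of the SEG_LVL_FEATURES values defined in the existing headers; the include path assumes the post-rename vp9/ location):

#include "vp9/common/seg_common.h"

/* Sketch only: enable a per-segment quantizer delta for segment 1. */
static void example_segment_setup(MACROBLOCKD *xd) {
  xd->segmentation_enabled = 1;
  vp9_clearall_segfeatures(xd);

  vp9_enable_segfeature(xd, 1, SEG_LVL_ALT_Q);
  vp9_set_segdata(xd, 1, SEG_LVL_ALT_Q, -16);

  if (vp9_segfeature_active(xd, 1, SEG_LVL_ALT_Q)) {
    const int q_delta = vp9_get_segdata(xd, 1, SEG_LVL_ALT_Q);  /* -16 */
    (void)q_delta;
  }
}
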
--- a/vp8/common/seg_common.h
+++ /dev/null
@@ -1,64 +1,0 @@
-/*
- * Copyright (c) 2012 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "type_aliases.h"
-#include "onyxc_int.h"
-#include "vp8/common/blockd.h"
-
-#ifndef __INC_SEG_COMMON_H__
-#define __INC_SEG_COMMON_H__ 1
-
-int vp9_segfeature_active(const MACROBLOCKD *xd,
- int segment_id,
- SEG_LVL_FEATURES feature_id);
-
-void vp9_clearall_segfeatures(MACROBLOCKD *xd);
-
-void vp9_enable_segfeature(MACROBLOCKD *xd,
- int segment_id,
- SEG_LVL_FEATURES feature_id);
-
-void vp9_disable_segfeature(MACROBLOCKD *xd,
- int segment_id,
- SEG_LVL_FEATURES feature_id);
-
-int vp9_seg_feature_data_bits(SEG_LVL_FEATURES feature_id);
-
-int vp9_is_segfeature_signed(SEG_LVL_FEATURES feature_id);
-
-void vp9_clear_segdata(MACROBLOCKD *xd,
- int segment_id,
- SEG_LVL_FEATURES feature_id);
-
-void vp9_set_segdata(MACROBLOCKD *xd,
- int segment_id,
- SEG_LVL_FEATURES feature_id,
- int seg_data);
-
-int vp9_get_segdata(const MACROBLOCKD *xd,
- int segment_id,
- SEG_LVL_FEATURES feature_id);
-
-void vp9_clear_segref(MACROBLOCKD *xd, int segment_id);
-
-void vp9_set_segref(MACROBLOCKD *xd,
- int segment_id,
- MV_REFERENCE_FRAME ref_frame);
-
-int vp9_check_segref(const MACROBLOCKD *xd,
- int segment_id,
- MV_REFERENCE_FRAME ref_frame);
-
-int vp9_check_segref_inter(MACROBLOCKD *xd, int segment_id);
-
-int vp9_get_seg_tx_type(MACROBLOCKD *xd, int segment_id);
-
-#endif /* __INC_SEG_COMMON_H__ */
-
--- a/vp8/common/setupintrarecon.c
+++ /dev/null
@@ -1,31 +1,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include "setupintrarecon.h"
-#include "vpx_mem/vpx_mem.h"
-
-void vp9_setup_intra_recon(YV12_BUFFER_CONFIG *ybf) {
- int i;
-
-  /* set up the new frame for intra-coded blocks */
- vpx_memset(ybf->y_buffer - 1 - ybf->y_stride, 127, ybf->y_width + 5);
- for (i = 0; i < ybf->y_height; i++)
- ybf->y_buffer[ybf->y_stride * i - 1] = (unsigned char) 129;
-
- vpx_memset(ybf->u_buffer - 1 - ybf->uv_stride, 127, ybf->uv_width + 5);
- for (i = 0; i < ybf->uv_height; i++)
- ybf->u_buffer[ybf->uv_stride * i - 1] = (unsigned char) 129;
-
- vpx_memset(ybf->v_buffer - 1 - ybf->uv_stride, 127, ybf->uv_width + 5);
- for (i = 0; i < ybf->uv_height; i++)
- ybf->v_buffer[ybf->uv_stride * i - 1] = (unsigned char) 129;
-
-}
--- a/vp8/common/setupintrarecon.h
+++ /dev/null
@@ -1,13 +1,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include "vpx_scale/yv12config.h"
-extern void vp9_setup_intra_recon(YV12_BUFFER_CONFIG *ybf);
--- a/vp8/common/subpixel.h
+++ /dev/null
@@ -1,204 +1,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#ifndef SUBPIXEL_H
-#define SUBPIXEL_H
-
-#define prototype_subpixel_predict(sym) \
- void sym(unsigned char *src, int src_pitch, int xofst, int yofst, \
- unsigned char *dst, int dst_pitch)
-
-#if ARCH_X86 || ARCH_X86_64
-#include "x86/subpixel_x86.h"
-#endif
-
-#if ARCH_ARM
-#include "arm/subpixel_arm.h"
-#endif
-
-#ifndef vp9_subpix_sixtap16x16
-#define vp9_subpix_sixtap16x16 vp9_sixtap_predict16x16_c
-#endif
-extern prototype_subpixel_predict(vp9_subpix_sixtap16x16);
-
-#ifndef vp9_subpix_sixtap8x8
-#define vp9_subpix_sixtap8x8 vp9_sixtap_predict8x8_c
-#endif
-extern prototype_subpixel_predict(vp9_subpix_sixtap8x8);
-
-#ifndef vp9_subpix_sixtap_avg16x16
-#define vp9_subpix_sixtap_avg16x16 vp9_sixtap_predict_avg16x16_c
-#endif
-extern prototype_subpixel_predict(vp9_subpix_sixtap_avg16x16);
-
-#ifndef vp9_subpix_sixtap_avg8x8
-#define vp9_subpix_sixtap_avg8x8 vp9_sixtap_predict_avg8x8_c
-#endif
-extern prototype_subpixel_predict(vp9_subpix_sixtap_avg8x8);
-#ifndef vp9_subpix_sixtap8x4
-#define vp9_subpix_sixtap8x4 vp9_sixtap_predict8x4_c
-#endif
-extern prototype_subpixel_predict(vp9_subpix_sixtap8x4);
-
-#ifndef vp9_subpix_sixtap4x4
-#define vp9_subpix_sixtap4x4 vp9_sixtap_predict_c
-#endif
-extern prototype_subpixel_predict(vp9_subpix_sixtap4x4);
-
-#ifndef vp9_subpix_sixtap_avg4x4
-#define vp9_subpix_sixtap_avg4x4 vp9_sixtap_predict_avg_c
-#endif
-extern prototype_subpixel_predict(vp9_subpix_sixtap_avg4x4);
-
-#ifndef vp9_subpix_eighttap16x16
-#define vp9_subpix_eighttap16x16 vp9_eighttap_predict16x16_c
-#endif
-extern prototype_subpixel_predict(vp9_subpix_eighttap16x16);
-
-#ifndef vp9_subpix_eighttap8x8
-#define vp9_subpix_eighttap8x8 vp9_eighttap_predict8x8_c
-#endif
-extern prototype_subpixel_predict(vp9_subpix_eighttap8x8);
-
-#ifndef vp9_subpix_eighttap_avg16x16
-#define vp9_subpix_eighttap_avg16x16 vp9_eighttap_predict_avg16x16_c
-#endif
-extern prototype_subpixel_predict(vp9_subpix_eighttap_avg16x16);
-
-#ifndef vp9_subpix_eighttap_avg8x8
-#define vp9_subpix_eighttap_avg8x8 vp9_eighttap_predict_avg8x8_c
-#endif
-extern prototype_subpixel_predict(vp9_subpix_eighttap_avg8x8);
-
-#ifndef vp9_subpix_eighttap8x4
-#define vp9_subpix_eighttap8x4 vp9_eighttap_predict8x4_c
-#endif
-extern prototype_subpixel_predict(vp9_subpix_eighttap8x4);
-
-#ifndef vp9_subpix_eighttap4x4
-#define vp9_subpix_eighttap4x4 vp9_eighttap_predict_c
-#endif
-extern prototype_subpixel_predict(vp9_subpix_eighttap4x4);
-
-#ifndef vp9_subpix_eighttap_avg4x4
-#define vp9_subpix_eighttap_avg4x4 vp9_eighttap_predict_avg4x4_c
-#endif
-extern prototype_subpixel_predict(vp9_subpix_eighttap_avg4x4);
-
-#ifndef vp9_subpix_eighttap16x16_sharp
-#define vp9_subpix_eighttap16x16_sharp vp9_eighttap_predict16x16_sharp_c
-#endif
-extern prototype_subpixel_predict(vp9_subpix_eighttap16x16_sharp);
-
-#ifndef vp9_subpix_eighttap8x8_sharp
-#define vp9_subpix_eighttap8x8_sharp vp9_eighttap_predict8x8_sharp_c
-#endif
-extern prototype_subpixel_predict(vp9_subpix_eighttap8x8_sharp);
-
-#ifndef vp9_subpix_eighttap_avg16x16_sharp
-#define vp9_subpix_eighttap_avg16x16_sharp vp9_eighttap_predict_avg16x16_sharp_c
-#endif
-extern prototype_subpixel_predict(vp9_subpix_eighttap_avg16x16_sharp);
-
-#ifndef vp9_subpix_eighttap_avg8x8_sharp
-#define vp9_subpix_eighttap_avg8x8_sharp vp9_eighttap_predict_avg8x8_sharp_c
-#endif
-extern prototype_subpixel_predict(vp9_subpix_eighttap_avg8x8_sharp);
-
-#ifndef vp9_subpix_eighttap8x4_sharp
-#define vp9_subpix_eighttap8x4_sharp vp9_eighttap_predict8x4_sharp_c
-#endif
-extern prototype_subpixel_predict(vp9_subpix_eighttap8x4_sharp);
-
-#ifndef vp9_subpix_eighttap4x4_sharp
-#define vp9_subpix_eighttap4x4_sharp vp9_eighttap_predict_sharp_c
-#endif
-extern prototype_subpixel_predict(vp9_subpix_eighttap4x4_sharp);
-
-#ifndef vp9_subpix_eighttap_avg4x4_sharp
-#define vp9_subpix_eighttap_avg4x4_sharp vp9_eighttap_predict_avg4x4_sharp_c
-#endif
-extern prototype_subpixel_predict(vp9_subpix_eighttap_avg4x4_sharp);
-
-#ifndef vp9_subpix_bilinear16x16
-#define vp9_subpix_bilinear16x16 vp9_bilinear_predict16x16_c
-#endif
-extern prototype_subpixel_predict(vp9_subpix_bilinear16x16);
-
-#ifndef vp9_subpix_bilinear8x8
-#define vp9_subpix_bilinear8x8 vp9_bilinear_predict8x8_c
-#endif
-extern prototype_subpixel_predict(vp9_subpix_bilinear8x8);
-
-#ifndef vp9_subpix_bilinear_avg16x16
-#define vp9_subpix_bilinear_avg16x16 vp9_bilinear_predict_avg16x16_c
-#endif
-extern prototype_subpixel_predict(vp9_subpix_bilinear_avg16x16);
-
-#ifndef vp9_subpix_bilinear_avg8x8
-#define vp9_subpix_bilinear_avg8x8 vp9_bilinear_predict_avg8x8_c
-#endif
-extern prototype_subpixel_predict(vp9_subpix_bilinear_avg8x8);
-
-#ifndef vp9_subpix_bilinear8x4
-#define vp9_subpix_bilinear8x4 vp9_bilinear_predict8x4_c
-#endif
-extern prototype_subpixel_predict(vp9_subpix_bilinear8x4);
-
-#ifndef vp9_subpix_bilinear4x4
-#define vp9_subpix_bilinear4x4 vp9_bilinear_predict4x4_c
-#endif
-extern prototype_subpixel_predict(vp9_subpix_bilinear4x4);
-
-#ifndef vp9_subpix_bilinear_avg4x4
-#define vp9_subpix_bilinear_avg4x4 vp9_bilinear_predict_avg4x4_c
-#endif
-extern prototype_subpixel_predict(vp9_subpix_bilinear_avg4x4);
-
-typedef prototype_subpixel_predict((*vp9_subpix_fn_t));
-typedef struct {
- vp9_subpix_fn_t eighttap16x16;
- vp9_subpix_fn_t eighttap8x8;
- vp9_subpix_fn_t eighttap_avg16x16;
- vp9_subpix_fn_t eighttap_avg8x8;
- vp9_subpix_fn_t eighttap_avg4x4;
- vp9_subpix_fn_t eighttap8x4;
- vp9_subpix_fn_t eighttap4x4;
- vp9_subpix_fn_t eighttap16x16_sharp;
- vp9_subpix_fn_t eighttap8x8_sharp;
- vp9_subpix_fn_t eighttap_avg16x16_sharp;
- vp9_subpix_fn_t eighttap_avg8x8_sharp;
- vp9_subpix_fn_t eighttap_avg4x4_sharp;
- vp9_subpix_fn_t eighttap8x4_sharp;
- vp9_subpix_fn_t eighttap4x4_sharp;
- vp9_subpix_fn_t sixtap16x16;
- vp9_subpix_fn_t sixtap8x8;
- vp9_subpix_fn_t sixtap_avg16x16;
- vp9_subpix_fn_t sixtap_avg8x8;
- vp9_subpix_fn_t sixtap8x4;
- vp9_subpix_fn_t sixtap4x4;
- vp9_subpix_fn_t sixtap_avg4x4;
- vp9_subpix_fn_t bilinear16x16;
- vp9_subpix_fn_t bilinear8x8;
- vp9_subpix_fn_t bilinear_avg16x16;
- vp9_subpix_fn_t bilinear_avg8x8;
- vp9_subpix_fn_t bilinear8x4;
- vp9_subpix_fn_t bilinear4x4;
- vp9_subpix_fn_t bilinear_avg4x4;
-} vp9_subpix_rtcd_vtable_t;
-
-#if CONFIG_RUNTIME_CPU_DETECT
-#define SUBPIX_INVOKE(ctx,fn) (ctx)->fn
-#else
-#define SUBPIX_INVOKE(ctx,fn) vp9_subpix_##fn
-#endif
-
-#endif
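
The vtable and SUBPIX_INVOKE macro above decouple call sites from the selected filter implementation. A hedged sketch of a call site (the vtable pointer would normally come from the codec's RTCD context; here it is passed in directly):

/* Sketch only: dispatch a 16x16 six-tap prediction through the vtable. */
static void example_sixtap16x16(const vp9_subpix_rtcd_vtable_t *subpix,
                                unsigned char *src, int src_pitch,
                                int xofst, int yofst,
                                unsigned char *dst, int dst_pitch) {
  /* With CONFIG_RUNTIME_CPU_DETECT this resolves to subpix->sixtap16x16;
   * otherwise it resolves to the vp9_subpix_sixtap16x16 default above. */
  SUBPIX_INVOKE(subpix, sixtap16x16)(src, src_pitch, xofst, yofst,
                                     dst, dst_pitch);
}
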
--- a/vp8/common/swapyv12buffer.c
+++ /dev/null
@@ -1,32 +1,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "swapyv12buffer.h"
-
-void vp9_swap_yv12_buffer(YV12_BUFFER_CONFIG *new_frame,
- YV12_BUFFER_CONFIG *last_frame) {
- unsigned char *temp;
-
- temp = last_frame->buffer_alloc;
- last_frame->buffer_alloc = new_frame->buffer_alloc;
- new_frame->buffer_alloc = temp;
-
- temp = last_frame->y_buffer;
- last_frame->y_buffer = new_frame->y_buffer;
- new_frame->y_buffer = temp;
-
- temp = last_frame->u_buffer;
- last_frame->u_buffer = new_frame->u_buffer;
- new_frame->u_buffer = temp;
-
- temp = last_frame->v_buffer;
- last_frame->v_buffer = new_frame->v_buffer;
- new_frame->v_buffer = temp;
-}
--- a/vp8/common/swapyv12buffer.h
+++ /dev/null
@@ -1,19 +1,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#ifndef __SWAPYV12_BUFFER_H
-#define __SWAPYV12_BUFFER_H
-
-#include "vpx_scale/yv12config.h"
-
-void vp9_swap_yv12_buffer(YV12_BUFFER_CONFIG *new_frame,
- YV12_BUFFER_CONFIG *last_frame);
-
-#endif // __SWAPYV12_BUFFER_H
--- a/vp8/common/systemdependent.h
+++ /dev/null
@@ -1,21 +1,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include "vpx_ports/config.h"
-#if ARCH_X86 || ARCH_X86_64
-void vpx_reset_mmx_state(void);
-#define vp9_clear_system_state() vpx_reset_mmx_state()
-#else
-#define vp9_clear_system_state()
-#endif
-
-struct VP9Common;
-void vp9_machine_specific_config(struct VP9Common *);
--- a/vp8/common/tapify.py
+++ /dev/null
@@ -1,106 +1,0 @@
-"""
- * Copyright (c) 2012 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
-"""
-#!/usr/bin/env python
-import sys,string,os,re,math,numpy
-scale = 2**16
-def dist(p1,p2):
- x1,y1 = p1
- x2,y2 = p2
- if x1==x2 and y1==y2 :
- return 1.0
- return 1/ math.sqrt((x1-x2)*(x1-x2)+(y1-y2)*(y1-y2))
-
-def gettaps(p):
- def l(b):
- return int(math.floor(b))
- def h(b):
- return int(math.ceil(b))
- def t(b,p,s):
- return int((scale*dist(b,p)+s/2)/s)
- r,c = p
- ul=[l(r),l(c)]
- ur=[l(r),h(c)]
- ll=[h(r),l(c)]
- lr=[h(r),h(c)]
- sum = dist(ul,p)+dist(ur,p)+dist(ll,p)+dist(lr,p)
- t4 = scale - t(ul,p,sum) - t(ur,p,sum) - t(ll,p,sum);
- return [[ul,t(ul,p,sum)],[ur,t(ur,p,sum)],
- [ll,t(ll,p,sum)],[lr,t4]]
-
-def print_mb_taps(angle,blocksize):
- theta = angle / 57.2957795;
- affine = [[math.cos(theta),-math.sin(theta)],
- [math.sin(theta),math.cos(theta)]]
- radius = (float(blocksize)-1)/2
- print " // angle of",angle,"degrees"
- for y in range(blocksize) :
- for x in range(blocksize) :
- r,c = numpy.dot(affine,[y-radius, x-radius])
- tps = gettaps([r+radius,c+radius])
- for t in tps :
- p,t = t
- tr,tc = p
- print " %2d, %2d, %5d, " % (tr,tc,t,),
- print " // %2d,%2d " % (y,x)
-
-i=float(sys.argv[1])
-while i <= float(sys.argv[2]) :
- print_mb_taps(i,float(sys.argv[4]))
- i=i+float(sys.argv[3])
-"""
-
-taps = []
-pt=dict()
-ptr=dict()
-for y in range(16) :
- for x in range(16) :
- r,c = numpy.dot(affine,[y-7.5, x-7.5])
- tps = gettaps([r+7.5,c+7.5])
- j=0
- for tp in tps :
- p,i = tp
- r,c = p
- pt[y,x,j]= [p,i]
- try:
- ptr[r,j,c].append([y,x])
- except:
- ptr[r,j,c]=[[y,x]]
- j = j+1
-
-for key in sorted(pt.keys()) :
- print key,pt[key]
-
-lr = -99
-lj = -99
-lc = 0
-
-shuf=""
-mask=""
-for r,j,c in sorted(ptr.keys()) :
- for y,x in ptr[r,j,c] :
- if lr != r or lj != j :
- print "shuf_"+str(lr)+"_"+str(lj)+"_"+shuf.ljust(16,"0"), lc
- shuf=""
- lc = 0
- for i in range(lc,c-1) :
- shuf = shuf +"0"
- shuf = shuf + hex(x)[2]
- lc =c
- break
- lr = r
- lj = j
-# print r,j,c,ptr[r,j,c]
-
-for r,j,c in sorted(ptr.keys()) :
- for y,x in ptr[r,j,c] :
- print r,j,c,y,x
- break
-"""
--- a/vp8/common/textblit.c
+++ /dev/null
@@ -1,116 +1,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include <stdlib.h>
-
-
-void vp9_blit_text(const char *msg, unsigned char *address, const int pitch) {
- int letter_bitmap;
- unsigned char *output_pos = address;
- int colpos;
- const int font[] = {
- 0x0, 0x5C00, 0x8020, 0xAFABEA, 0xD7EC0, 0x1111111, 0x1855740, 0x18000,
- 0x45C0, 0x74400, 0x51140, 0x23880, 0xC4000, 0x21080, 0x80000, 0x111110,
- 0xE9D72E, 0x87E40, 0x12AD732, 0xAAD62A, 0x4F94C4, 0x4D6B7, 0x456AA,
- 0x3E8423, 0xAAD6AA, 0xAAD6A2, 0x2800, 0x2A00, 0x8A880, 0x52940, 0x22A20,
- 0x15422, 0x6AD62E, 0x1E4A53E, 0xAAD6BF, 0x8C62E, 0xE8C63F, 0x118D6BF,
- 0x1094BF, 0xCAC62E, 0x1F2109F, 0x118FE31, 0xF8C628, 0x8A89F, 0x108421F,
- 0x1F1105F, 0x1F4105F, 0xE8C62E, 0x2294BF, 0x164C62E, 0x12694BF, 0x8AD6A2,
- 0x10FC21, 0x1F8421F, 0x744107, 0xF8220F, 0x1151151, 0x117041, 0x119D731,
- 0x47E0, 0x1041041, 0xFC400, 0x10440, 0x1084210, 0x820
- };
- colpos = 0;
-
- while (msg[colpos] != 0) {
- char letter = msg[colpos];
- int fontcol, fontrow;
-
- if (letter <= 'Z' && letter >= ' ')
- letter_bitmap = font[letter - ' '];
- else if (letter <= 'z' && letter >= 'a')
- letter_bitmap = font[letter - 'a' + 'A' - ' '];
- else
- letter_bitmap = font[0];
-
- for (fontcol = 6; fontcol >= 0; fontcol--)
- for (fontrow = 0; fontrow < 5; fontrow++)
- output_pos[fontrow * pitch + fontcol] =
- ((letter_bitmap >> (fontcol * 5)) & (1 << fontrow) ? 255 : 0);
-
- output_pos += 7;
- colpos++;
- }
-}
-
-static void plot(const int x, const int y, unsigned char *image, const int pitch) {
- image [x + y * pitch] ^= 255;
-}
-
-/* Bresenham line algorithm */
-void vp9_blit_line(int x0, int x1, int y0, int y1, unsigned char *image, const int pitch) {
- int steep = abs(y1 - y0) > abs(x1 - x0);
- int deltax, deltay;
- int error, ystep, y, x;
-
- if (steep) {
- int t;
- t = x0;
- x0 = y0;
- y0 = t;
-
- t = x1;
- x1 = y1;
- y1 = t;
- }
-
- if (x0 > x1) {
- int t;
- t = x0;
- x0 = x1;
- x1 = t;
-
- t = y0;
- y0 = y1;
- y1 = t;
- }
-
- deltax = x1 - x0;
- deltay = abs(y1 - y0);
- error = deltax / 2;
-
- y = y0;
-
- if (y0 < y1)
- ystep = 1;
- else
- ystep = -1;
-
- if (steep) {
- for (x = x0; x <= x1; x++) {
- plot(y, x, image, pitch);
-
- error = error - deltay;
- if (error < 0) {
- y = y + ystep;
- error = error + deltax;
- }
- }
- } else {
- for (x = x0; x <= x1; x++) {
- plot(x, y, image, pitch);
-
- error = error - deltay;
- if (error < 0) {
- y = y + ystep;
- error = error + deltax;
- }
- }
- }
-}
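
Both helpers above are debug-overlay utilities: vp9_blit_text() stamps text with a built-in bitmap font and vp9_blit_line() XORs a Bresenham line into a plane. A hedged usage sketch on a frame's luma plane (the coordinates are illustrative):

/* Sketch only: declarations mirror the definitions above. */
void vp9_blit_text(const char *msg, unsigned char *address, const int pitch);
void vp9_blit_line(int x0, int x1, int y0, int y1,
                   unsigned char *image, const int pitch);

static void example_annotate(unsigned char *y_buffer, int y_stride) {
  vp9_blit_text("SEG 1", y_buffer + 2 * y_stride + 2, y_stride);
  vp9_blit_line(16, 48, 16, 40, y_buffer, y_stride);  /* (16,16) -> (48,40) */
}
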
--- a/vp8/common/treecoder.c
+++ /dev/null
@@ -1,138 +1,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include "vpx_config.h"
-
-#if defined(CONFIG_DEBUG) && CONFIG_DEBUG
-#include <assert.h>
-#endif
-#include <stdio.h>
-
-#include "treecoder.h"
-
-static void tree2tok(
- struct vp9_token_struct *const p,
- vp9_tree t,
- int i,
- int v,
- int L
-) {
- v += v;
- ++L;
-
- do {
- const vp9_tree_index j = t[i++];
-
- if (j <= 0) {
- p[-j].value = v;
- p[-j].Len = L;
- } else
- tree2tok(p, t, j, v, L);
- } while (++v & 1);
-}
-
-void vp9_tokens_from_tree(struct vp9_token_struct *p, vp9_tree t) {
- tree2tok(p, t, 0, 0, 0);
-}
-
-void vp9_tokens_from_tree_offset(struct vp9_token_struct *p, vp9_tree t,
- int offset) {
- tree2tok(p - offset, t, 0, 0, 0);
-}
-
-static void branch_counts(
- int n, /* n = size of alphabet */
- vp9_token tok [ /* n */ ],
- vp9_tree tree,
- unsigned int branch_ct [ /* n-1 */ ] [2],
- const unsigned int num_events[ /* n */ ]
-) {
- const int tree_len = n - 1;
- int t = 0;
-
-#if CONFIG_DEBUG
- assert(tree_len);
-#endif
-
- do {
- branch_ct[t][0] = branch_ct[t][1] = 0;
- } while (++t < tree_len);
-
- t = 0;
-
- do {
- int L = tok[t].Len;
- const int enc = tok[t].value;
- const unsigned int ct = num_events[t];
-
- vp9_tree_index i = 0;
-
- do {
- const int b = (enc >> --L) & 1;
- const int j = i >> 1;
-#if CONFIG_DEBUG
- assert(j < tree_len && 0 <= L);
-#endif
-
- branch_ct [j] [b] += ct;
- i = tree[ i + b];
- } while (i > 0);
-
-#if CONFIG_DEBUG
- assert(!L);
-#endif
- } while (++t < n);
-
-}
-
-
-void vp9_tree_probs_from_distribution(
- int n, /* n = size of alphabet */
- vp9_token tok [ /* n */ ],
- vp9_tree tree,
- vp9_prob probs [ /* n-1 */ ],
- unsigned int branch_ct [ /* n-1 */ ] [2],
- const unsigned int num_events[ /* n */ ],
- unsigned int Pfac,
- int rd
-) {
- const int tree_len = n - 1;
- int t = 0;
-
- branch_counts(n, tok, tree, branch_ct, num_events);
-
- do {
- const unsigned int *const c = branch_ct[t];
- const unsigned int tot = c[0] + c[1];
-
-#if CONFIG_DEBUG
- assert(tot < (1 << 24)); /* no overflow below */
-#endif
-
- if (tot) {
- const unsigned int p = ((c[0] * Pfac) + (rd ? tot >> 1 : 0)) / tot;
- probs[t] = p < 256 ? (p ? p : 1) : 255; /* agree w/old version for now */
- } else
- probs[t] = vp9_prob_half;
- } while (++t < tree_len);
-}
-
-vp9_prob vp9_bin_prob_from_distribution(const unsigned int counts[2]) {
- int tot_count = counts[0] + counts[1];
- vp9_prob prob;
- if (tot_count) {
- prob = (counts[0] * 255 + (tot_count >> 1)) / tot_count;
- prob += !prob;
- } else {
- prob = 128;
- }
- return prob;
-}
--- a/vp8/common/treecoder.h
+++ /dev/null
@@ -1,75 +1,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#ifndef __INC_TREECODER_H
-#define __INC_TREECODER_H
-
-typedef unsigned char vp9_prob;
-
-#define vp9_prob_half ( (vp9_prob) 128)
-
-typedef signed char vp9_tree_index;
-struct bool_coder_spec;
-
-typedef struct bool_coder_spec bool_coder_spec;
-typedef struct bool_writer bool_writer;
-typedef struct bool_reader bool_reader;
-
-typedef const bool_coder_spec c_bool_coder_spec;
-typedef const bool_writer c_bool_writer;
-typedef const bool_reader c_bool_reader;
-
-
-
-# define vp9_complement( x) (255 - x)
-
-
-/* We build coding trees compactly in arrays.
- Each node of the tree is a pair of vp9_tree_indices.
- Array index often references a corresponding probability table.
- Index <= 0 means done encoding/decoding and value = -Index,
- Index > 0 means need another bit, specification at index.
- Nonnegative indices are always even; processing begins at node 0. */
-
-typedef const vp9_tree_index vp9_tree[], *vp9_tree_p;
-
-
-typedef const struct vp9_token_struct {
- int value;
- int Len;
-} vp9_token;
-
-/* Construct encoding array from tree. */
-
-void vp9_tokens_from_tree(struct vp9_token_struct *, vp9_tree);
-void vp9_tokens_from_tree_offset(struct vp9_token_struct *, vp9_tree,
- int offset);
-
-
-/* Convert array of token occurrence counts into a table of probabilities
- for the associated binary encoding tree. Also writes count of branches
-   taken for each node on the tree; this facilitates decisions as to
- probability updates. */
-
-void vp9_tree_probs_from_distribution(
- int n, /* n = size of alphabet */
- vp9_token tok [ /* n */ ],
- vp9_tree tree,
- vp9_prob probs [ /* n-1 */ ],
- unsigned int branch_ct [ /* n-1 */ ] [2],
- const unsigned int num_events[ /* n */ ],
- unsigned int Pfactor,
- int Round
-);
-
-vp9_prob vp9_bin_prob_from_distribution(const unsigned int counts[2]);
-
-#endif
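
The compact tree layout described above can be illustrated with a small, hedged example for a four-symbol alphabet coded as {A, {B, {C, D}}}: terminal entries hold the negated symbol value, and positive entries hold the array index of the next node pair (the include path assumes the post-rename vp9/ location).

#include "vp9/common/treecoder.h"

/* Sketch only: 2*(n-1) entries for an n = 4 symbol alphabet. */
static const vp9_tree_index example_tree[2 * (4 - 1)] = {
  0, 2,     /* node at index 0: bit 0 -> symbol 0 (A), bit 1 -> node at 2 */
  -1, 4,    /* node at index 2: bit 0 -> symbol 1 (B), bit 1 -> node at 4 */
  -2, -3,   /* node at index 4: bit 0 -> symbol 2 (C), bit 1 -> symbol 3 (D) */
};

static void example_tokens(void) {
  struct vp9_token_struct tok[4];
  vp9_tokens_from_tree(tok, example_tree);
  /* Resulting codes: A = 0 (1 bit), B = 10, C = 110, D = 111,
   * i.e. tok[0] = {0, 1} ... tok[3] = {7, 3}. */
}

Feeding per-symbol counts into vp9_tree_probs_from_distribution() with this same tree then yields one probability per internal node, ordered as in branch_ct.
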
--- a/vp8/common/type_aliases.h
+++ /dev/null
@@ -1,120 +1,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-
-/****************************************************************************
-*
-* Module Title : type_aliases.h
-*
-* Description : Standard type aliases
-*
-****************************************************************************/
-#ifndef __INC_TYPE_ALIASES_H
-#define __INC_TYPE_ALIASES_H
-
-/****************************************************************************
-* Macros
-****************************************************************************/
-#define EXPORT
-#define IMPORT extern /* Used to declare imported data & routines */
-#define PRIVATE static /* Used to declare & define module-local data */
-#define LOCAL static /* Used to define all persistent routine-local data */
-#define STD_IN_PATH 0 /* Standard input path */
-#define STD_OUT_PATH 1 /* Standard output path */
-#define STD_ERR_PATH 2 /* Standard error path */
-#define STD_IN_FILE stdin /* Standard input file pointer */
-#define STD_OUT_FILE stdout /* Standard output file pointer */
-#define STD_ERR_FILE stderr /* Standard error file pointer */
-#define max_int 0x7FFFFFFF
-
-#define __export
-#define _export
-
-#define CCONV
-
-#ifndef NULL
-#ifdef __cplusplus
-#define NULL 0
-#else
-#define NULL ((void *)0)
-#endif
-#endif
-
-#ifndef FALSE
-#define FALSE 0
-#endif
-
-#ifndef TRUE
-#define TRUE 1
-#endif
-
-/****************************************************************************
-* Typedefs
-****************************************************************************/
-#ifndef TYPE_INT8
-#define TYPE_INT8
-typedef signed char INT8;
-#endif
-
-#ifndef TYPE_INT16
-/*#define TYPE_INT16*/
-typedef signed short INT16;
-#endif
-
-#ifndef TYPE_INT32
-/*#define TYPE_INT32*/
-typedef signed int INT32;
-#endif
-
-#ifndef TYPE_UINT8
-/*#define TYPE_UINT8*/
-typedef unsigned char UINT8;
-#endif
-
-#ifndef TYPE_UINT32
-/*#define TYPE_UINT32*/
-typedef unsigned int UINT32;
-#endif
-
-#ifndef TYPE_UINT16
-/*#define TYPE_UINT16*/
-typedef unsigned short UINT16;
-#endif
-
-#ifndef TYPE_BOOL
-/*#define TYPE_BOOL*/
-typedef int BOOL;
-#endif
-
-typedef unsigned char BOOLEAN;
-
-#ifdef _MSC_VER
-typedef __int64 INT64;
-#ifndef INT64_MAX
-#define INT64_MAX LLONG_MAX
-#endif
-#else
-
-#ifndef TYPE_INT64
-#ifdef _TMS320C6X
-/* for now we only have 40bits */
-typedef long INT64;
-#else
-typedef long long INT64;
-#endif
-#endif
-
-#endif
-
-/* Floating point */
-typedef double FLOAT64;
-typedef float FLOAT32;
-
-#endif
--- a/vp8/common/x86/filter_sse2.c
+++ /dev/null
@@ -1,289 +1,0 @@
-/*
- * Copyright (c) 2012 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include <assert.h> // for alignment checks
-#include <emmintrin.h> // SSE2
-#include "vp8/common/filter.h"
-#include "vpx_ports/mem.h" // for DECLARE_ALIGNED
-#include "vpx_rtcd.h"
-
-// TODO(cd): After cleanup, commit faster versions for non-4x4 sizes. This is
-// just a quick partial snapshot so that others can already use some
-// speedup.
-// TODO(cd): Use vectorized 8-tap filtering code as a speedup over the pure C
-// 6-tap filtering.
-// TODO(cd): Add some comments, better variable naming.
-// TODO(cd): Maybe use _mm_maddubs_epi16 if smaller filter coefficients (no sum
-// of positives above 128), or have higher-precision filter
-// coefficients.
-
-DECLARE_ALIGNED(16, static const unsigned int, rounding_c[4]) = {
- VP9_FILTER_WEIGHT >> 1,
- VP9_FILTER_WEIGHT >> 1,
- VP9_FILTER_WEIGHT >> 1,
- VP9_FILTER_WEIGHT >> 1,
-};
-
-// Creating a macro to do more than four pixels at once to hide instruction
-// latency is actually slower :-(
-#define DO_FOUR_PIXELS(result, src_ptr, offset) \
- { \
-    /* Do shifted load to achieve required shuffles through unpacking */ \
- const __m128i src0 = _mm_loadu_si128((const __m128i *)(src_ptr + offset + 0)); \
- const __m128i src1 = _mm_loadu_si128((const __m128i *)(src_ptr + offset + 1)); \
- const __m128i src2 = _mm_loadu_si128((const __m128i *)(src_ptr + offset + 2)); \
- const __m128i src3 = _mm_loadu_si128((const __m128i *)(src_ptr + offset + 3)); \
- const __m128i src01 = _mm_unpacklo_epi8(src0, src1); \
- const __m128i src01_16 = _mm_unpacklo_epi8(src01, zero); \
- const __m128i src23 = _mm_unpacklo_epi8(src2, src3); \
- const __m128i src23_16 = _mm_unpacklo_epi8(src23, zero); \
-    /* Shift by 4 bytes through shuffle to get additional shifted loads */ \
- const __m128i src4 = _mm_shuffle_epi32(src0, _MM_SHUFFLE(3, 3, 2, 1)); \
- const __m128i src5 = _mm_shuffle_epi32(src1, _MM_SHUFFLE(3, 3, 2, 1)); \
- const __m128i src6 = _mm_shuffle_epi32(src2, _MM_SHUFFLE(3, 3, 2, 1)); \
- const __m128i src7 = _mm_shuffle_epi32(src3, _MM_SHUFFLE(3, 3, 2, 1)); \
- const __m128i src45 = _mm_unpacklo_epi8(src4, src5); \
- const __m128i src45_16 = _mm_unpacklo_epi8(src45, zero); \
- const __m128i src67 = _mm_unpacklo_epi8(src6, src7); \
- const __m128i src67_16 = _mm_unpacklo_epi8(src67, zero); \
- /* multiply accumulate them */ \
- const __m128i mad01 = _mm_madd_epi16(src01_16, fil01); \
- const __m128i mad23 = _mm_madd_epi16(src23_16, fil23); \
- const __m128i mad45 = _mm_madd_epi16(src45_16, fil45); \
- const __m128i mad67 = _mm_madd_epi16(src67_16, fil67); \
- const __m128i mad0123 = _mm_add_epi32(mad01, mad23); \
- const __m128i mad4567 = _mm_add_epi32(mad45, mad67); \
- __m128i mad_all = _mm_add_epi32(mad0123, mad4567); \
- mad_all = _mm_add_epi32(mad_all, rounding); \
- result = _mm_srai_epi32(mad_all, VP9_FILTER_SHIFT); \
- }
-
-void vp9_filter_block2d_4x4_8_sse2
-(
- const unsigned char *src_ptr, const unsigned int src_stride,
- const short *HFilter_aligned16, const short *VFilter_aligned16,
- unsigned char *dst_ptr, unsigned int dst_stride
-) {
- __m128i intermediateA, intermediateB, intermediateC;
-
- const int kInterp_Extend = 4;
-
- const __m128i zero = _mm_set1_epi16(0);
- const __m128i rounding = _mm_load_si128((const __m128i *)rounding_c);
-
- // check alignment
- assert(0 == ((long)HFilter_aligned16)%16);
- assert(0 == ((long)VFilter_aligned16)%16);
-
- {
- __m128i transpose3_0;
- __m128i transpose3_1;
- __m128i transpose3_2;
- __m128i transpose3_3;
-
- // Horizontal pass (src -> intermediate).
- {
- const __m128i HFilter = _mm_load_si128((const __m128i *)HFilter_aligned16);
- // get first two columns filter coefficients
- __m128i fil01 = _mm_shuffle_epi32(HFilter, _MM_SHUFFLE(0, 0, 0, 0));
- __m128i fil23 = _mm_shuffle_epi32(HFilter, _MM_SHUFFLE(1, 1, 1, 1));
- __m128i fil45 = _mm_shuffle_epi32(HFilter, _MM_SHUFFLE(2, 2, 2, 2));
- __m128i fil67 = _mm_shuffle_epi32(HFilter, _MM_SHUFFLE(3, 3, 3, 3));
- src_ptr -= (kInterp_Extend - 1) * src_stride + (kInterp_Extend - 1);
-
- {
- __m128i mad_all0;
- __m128i mad_all1;
- __m128i mad_all2;
- __m128i mad_all3;
- DO_FOUR_PIXELS(mad_all0, src_ptr, 0*src_stride)
- DO_FOUR_PIXELS(mad_all1, src_ptr, 1*src_stride)
- DO_FOUR_PIXELS(mad_all2, src_ptr, 2*src_stride)
- DO_FOUR_PIXELS(mad_all3, src_ptr, 3*src_stride)
- mad_all0 = _mm_packs_epi32(mad_all0, mad_all1);
- mad_all2 = _mm_packs_epi32(mad_all2, mad_all3);
- intermediateA = _mm_packus_epi16(mad_all0, mad_all2);
- // --
- src_ptr += src_stride*4;
- // --
- DO_FOUR_PIXELS(mad_all0, src_ptr, 0*src_stride)
- DO_FOUR_PIXELS(mad_all1, src_ptr, 1*src_stride)
- DO_FOUR_PIXELS(mad_all2, src_ptr, 2*src_stride)
- DO_FOUR_PIXELS(mad_all3, src_ptr, 3*src_stride)
- mad_all0 = _mm_packs_epi32(mad_all0, mad_all1);
- mad_all2 = _mm_packs_epi32(mad_all2, mad_all3);
- intermediateB = _mm_packus_epi16(mad_all0, mad_all2);
- // --
- src_ptr += src_stride*4;
- // --
- DO_FOUR_PIXELS(mad_all0, src_ptr, 0*src_stride)
- DO_FOUR_PIXELS(mad_all1, src_ptr, 1*src_stride)
- DO_FOUR_PIXELS(mad_all2, src_ptr, 2*src_stride)
- mad_all0 = _mm_packs_epi32(mad_all0, mad_all1);
- mad_all2 = _mm_packs_epi32(mad_all2, mad_all2);
- intermediateC = _mm_packus_epi16(mad_all0, mad_all2);
- }
- }
-
- // Transpose result (intermediate -> transpose3_x)
- {
- // 00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33
- // 40 41 42 43 50 51 52 53 60 61 62 63 70 71 72 73
- // 80 81 82 83 90 91 92 93 A0 A1 A2 A3 xx xx xx xx
- const __m128i transpose0_0 = _mm_unpacklo_epi8(intermediateA, intermediateB);
- const __m128i transpose0_1 = _mm_unpackhi_epi8(intermediateA, intermediateB);
- const __m128i transpose0_2 = _mm_unpacklo_epi8(intermediateC, intermediateC);
- const __m128i transpose0_3 = _mm_unpackhi_epi8(intermediateC, intermediateC);
- // 00 40 01 41 02 42 03 43 10 50 11 51 12 52 13 53
- // 20 60 21 61 22 62 23 63 30 70 31 71 32 72 33 73
- // 80 xx 81 xx 82 xx 83 xx 90 xx 91 xx 92 xx 93 xx
- // A0 xx A1 xx A2 xx A3 xx xx xx xx xx xx xx xx xx
- const __m128i transpose1_0 = _mm_unpacklo_epi8(transpose0_0, transpose0_1);
- const __m128i transpose1_1 = _mm_unpackhi_epi8(transpose0_0, transpose0_1);
- const __m128i transpose1_2 = _mm_unpacklo_epi8(transpose0_2, transpose0_3);
- const __m128i transpose1_3 = _mm_unpackhi_epi8(transpose0_2, transpose0_3);
- // 00 20 40 60 01 21 41 61 02 22 42 62 03 23 43 63
- // 10 30 50 70 11 31 51 71 12 32 52 72 13 33 53 73
- // 80 A0 xx xx 81 A1 xx xx 82 A2 xx xx 83 A3 xx xx
- // 90 xx xx xx 91 xx xx xx 92 xx xx xx 93 xx xx xx
- const __m128i transpose2_0 = _mm_unpacklo_epi8(transpose1_0, transpose1_1);
- const __m128i transpose2_1 = _mm_unpackhi_epi8(transpose1_0, transpose1_1);
- const __m128i transpose2_2 = _mm_unpacklo_epi8(transpose1_2, transpose1_3);
- const __m128i transpose2_3 = _mm_unpackhi_epi8(transpose1_2, transpose1_3);
- // 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71
- // 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73
- // 80 90 A0 xx xx xx xx xx 81 91 A1 xx xx xx xx xx
- // 82 92 A2 xx xx xx xx xx 83 93 A3 xx xx xx xx xx
- transpose3_0 = _mm_castps_si128(
- _mm_shuffle_ps(_mm_castsi128_ps(transpose2_0),
- _mm_castsi128_ps(transpose2_2),
- _MM_SHUFFLE(1, 0, 1, 0)));
- transpose3_1 = _mm_castps_si128(
- _mm_shuffle_ps(_mm_castsi128_ps(transpose2_0),
- _mm_castsi128_ps(transpose2_2),
- _MM_SHUFFLE(3, 2, 3, 2)));
- transpose3_2 = _mm_castps_si128(
- _mm_shuffle_ps(_mm_castsi128_ps(transpose2_1),
- _mm_castsi128_ps(transpose2_3),
- _MM_SHUFFLE(1, 0, 1, 0)));
- transpose3_3 = _mm_castps_si128(
- _mm_shuffle_ps(_mm_castsi128_ps(transpose2_1),
- _mm_castsi128_ps(transpose2_3),
- _MM_SHUFFLE(3, 2, 3, 2)));
- // 00 10 20 30 40 50 60 70 80 90 A0 xx xx xx xx xx
- // 01 11 21 31 41 51 61 71 81 91 A1 xx xx xx xx xx
- // 02 12 22 32 42 52 62 72 82 92 A2 xx xx xx xx xx
- // 03 13 23 33 43 53 63 73 83 93 A3 xx xx xx xx xx
- }
-
- // Vertical pass (transpose3_x -> dst).
- {
- const __m128i VFilter = _mm_load_si128((const __m128i *)VFilter_aligned16);
- // get first two columns filter coefficients
- __m128i fil01 = _mm_shuffle_epi32(VFilter, _MM_SHUFFLE(0, 0, 0, 0));
- __m128i fil23 = _mm_shuffle_epi32(VFilter, _MM_SHUFFLE(1, 1, 1, 1));
- __m128i fil45 = _mm_shuffle_epi32(VFilter, _MM_SHUFFLE(2, 2, 2, 2));
- __m128i fil67 = _mm_shuffle_epi32(VFilter, _MM_SHUFFLE(3, 3, 3, 3));
- __m128i col0, col1, col2, col3;
- DECLARE_ALIGNED(16, unsigned char, temp[32]);
- {
- _mm_store_si128((__m128i *)temp, transpose3_0);
- DO_FOUR_PIXELS(col0, temp, 0);
- }
- {
- _mm_store_si128((__m128i *)temp, transpose3_1);
- DO_FOUR_PIXELS(col1, temp, 0);
- }
- {
- _mm_store_si128((__m128i *)temp, transpose3_2);
- DO_FOUR_PIXELS(col2, temp, 0);
- }
- {
- _mm_store_si128((__m128i *)temp, transpose3_3);
- DO_FOUR_PIXELS(col3, temp, 0);
- }
- // transpose
- {
- __m128i T0 = _mm_unpacklo_epi32(col0, col1);
- __m128i T1 = _mm_unpacklo_epi32(col2, col3);
- __m128i T2 = _mm_unpackhi_epi32(col0, col1);
- __m128i T3 = _mm_unpackhi_epi32(col2, col3);
- col0 = _mm_unpacklo_epi64(T0, T1);
- col1 = _mm_unpackhi_epi64(T0, T1);
- col2 = _mm_unpacklo_epi64(T2, T3);
- col3 = _mm_unpackhi_epi64(T2, T3);
- }
- // saturate to 8 bit
- {
- col0 = _mm_packs_epi32(col0, col0);
- col0 = _mm_packus_epi16(col0, col0);
- col1 = _mm_packs_epi32(col1, col1);
- col1 = _mm_packus_epi16(col1, col1);
- col2 = _mm_packs_epi32 (col2, col2);
- col2 = _mm_packus_epi16(col2, col2);
- col3 = _mm_packs_epi32 (col3, col3);
- col3 = _mm_packus_epi16(col3, col3);
- }
- // store
- {
- *((unsigned int *)&dst_ptr[dst_stride * 0]) = _mm_cvtsi128_si32(col0);
- *((unsigned int *)&dst_ptr[dst_stride * 1]) = _mm_cvtsi128_si32(col1);
- *((unsigned int *)&dst_ptr[dst_stride * 2]) = _mm_cvtsi128_si32(col2);
- *((unsigned int *)&dst_ptr[dst_stride * 3]) = _mm_cvtsi128_si32(col3);
- }
- }
- }
-}
-
-void vp9_filter_block2d_8x4_8_sse2
-(
- const unsigned char *src_ptr, const unsigned int src_stride,
- const short *HFilter_aligned16, const short *VFilter_aligned16,
- unsigned char *dst_ptr, unsigned int dst_stride
-) {
- int j;
- for (j=0; j<8; j+=4) {
- vp9_filter_block2d_4x4_8_sse2(src_ptr + j, src_stride,
- HFilter_aligned16, VFilter_aligned16,
- dst_ptr + j, dst_stride);
- }
-}
-
-void vp9_filter_block2d_8x8_8_sse2
-(
- const unsigned char *src_ptr, const unsigned int src_stride,
- const short *HFilter_aligned16, const short *VFilter_aligned16,
- unsigned char *dst_ptr, unsigned int dst_stride
-) {
- int i, j;
- for (i=0; i<8; i+=4) {
- for (j=0; j<8; j+=4) {
- vp9_filter_block2d_4x4_8_sse2(src_ptr + j + i*src_stride, src_stride,
- HFilter_aligned16, VFilter_aligned16,
- dst_ptr + j + i*dst_stride, dst_stride);
- }
- }
-}
-
-void vp9_filter_block2d_16x16_8_sse2
-(
- const unsigned char *src_ptr, const unsigned int src_stride,
- const short *HFilter_aligned16, const short *VFilter_aligned16,
- unsigned char *dst_ptr, unsigned int dst_stride
-) {
- int i, j;
- for (i=0; i<16; i+=4) {
- for (j=0; j<16; j+=4) {
- vp9_filter_block2d_4x4_8_sse2(src_ptr + j + i*src_stride, src_stride,
- HFilter_aligned16, VFilter_aligned16,
- dst_ptr + j + i*dst_stride, dst_stride);
- }
- }
-}
--- a/vp8/common/x86/filter_sse4.c
+++ /dev/null
@@ -1,362 +1,0 @@
-/*
- * Copyright (c) 2012 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include <assert.h> // for alignment checks
-#include <smmintrin.h> // SSE4.1
-#include "vp8/common/filter.h"
-#include "vpx_ports/mem.h" // for DECLARE_ALIGNED
-#include "vpx_rtcd.h"
-
-// TODO(cd): After cleanup, commit faster versions for non-4x4 sizes. This is
-//           just a quick partial snapshot so that others can already use some
-//           speedup.
-// TODO(cd): Use vectorized 8 tap filtering code as speedup to pure C 6 tap
-// filtering.
-// TODO(cd): Reduce source size by using macros instead of current code
-// duplication.
-// TODO(cd): Add some comments, better variable naming.
-// TODO(cd): Maybe use _mm_maddubs_epi16 if smaller filter coefficients (no sum
-//           of positive coefficients above 128), or have higher precision
-//           filter coefficients.
-
-DECLARE_ALIGNED(16, static const unsigned char, mask0123_c[16]) = {
- 0x00, 0x01,
- 0x01, 0x02,
- 0x02, 0x03,
- 0x03, 0x04,
- 0x02, 0x03,
- 0x03, 0x04,
- 0x04, 0x05,
- 0x05, 0x06,
-};
-DECLARE_ALIGNED(16, static const unsigned char, mask4567_c[16]) = {
- 0x04, 0x05,
- 0x05, 0x06,
- 0x06, 0x07,
- 0x07, 0x08,
- 0x06, 0x07,
- 0x07, 0x08,
- 0x08, 0x09,
- 0x09, 0x0A,
-};
-DECLARE_ALIGNED(16, static const unsigned int, rounding_c[4]) = {
- VP9_FILTER_WEIGHT >> 1,
- VP9_FILTER_WEIGHT >> 1,
- VP9_FILTER_WEIGHT >> 1,
- VP9_FILTER_WEIGHT >> 1,
-};
-DECLARE_ALIGNED(16, static const unsigned char, transpose_c[16]) = {
- 0, 4, 8, 12,
- 1, 5, 9, 13,
- 2, 6, 10, 14,
- 3, 7, 11, 15
-};
-
-// Creating a macro to do more than four pixels at once to hide instruction
-// latency is actually slower :-(
-#define DO_FOUR_PIXELS(result, offset) \
- { \
- /*load pixels*/ \
- __m128i src = _mm_loadu_si128((const __m128i *)(src_ptr + offset)); \
- /* extract the ones used for first column */ \
- __m128i src0123 = _mm_shuffle_epi8(src, mask0123); \
- __m128i src4567 = _mm_shuffle_epi8(src, mask4567); \
- __m128i src01_16 = _mm_unpacklo_epi8(src0123, zero); \
- __m128i src23_16 = _mm_unpackhi_epi8(src0123, zero); \
- __m128i src45_16 = _mm_unpacklo_epi8(src4567, zero); \
- __m128i src67_16 = _mm_unpackhi_epi8(src4567, zero); \
- /* multiply accumulate them */ \
- __m128i mad01 = _mm_madd_epi16(src01_16, fil01); \
- __m128i mad23 = _mm_madd_epi16(src23_16, fil23); \
- __m128i mad45 = _mm_madd_epi16(src45_16, fil45); \
- __m128i mad67 = _mm_madd_epi16(src67_16, fil67); \
- __m128i mad0123 = _mm_add_epi32(mad01, mad23); \
- __m128i mad4567 = _mm_add_epi32(mad45, mad67); \
- __m128i mad_all = _mm_add_epi32(mad0123, mad4567); \
- mad_all = _mm_add_epi32(mad_all, rounding); \
- result = _mm_srai_epi32(mad_all, VP9_FILTER_SHIFT); \
- }
-
-void vp9_filter_block2d_4x4_8_sse4_1
-(
- const unsigned char *src_ptr, const unsigned int src_stride,
- const short *HFilter_aligned16, const short *VFilter_aligned16,
- unsigned char *dst_ptr, unsigned int dst_stride
-) {
- __m128i intermediateA, intermediateB, intermediateC;
-
- const int kInterp_Extend = 4;
-
- const __m128i zero = _mm_set1_epi16(0);
- const __m128i mask0123 = _mm_load_si128((const __m128i *)mask0123_c);
- const __m128i mask4567 = _mm_load_si128((const __m128i *)mask4567_c);
- const __m128i rounding = _mm_load_si128((const __m128i *)rounding_c);
- const __m128i transpose = _mm_load_si128((const __m128i *)transpose_c);
-
- // check alignment
- assert(0 == ((long)HFilter_aligned16)%16);
- assert(0 == ((long)VFilter_aligned16)%16);
-
- {
- __m128i transpose3_0;
- __m128i transpose3_1;
- __m128i transpose3_2;
- __m128i transpose3_3;
-
- // Horizontal pass (src -> intermediate).
- {
- const __m128i HFilter = _mm_load_si128((const __m128i *)HFilter_aligned16);
- // get first two columns filter coefficients
- __m128i fil01 = _mm_shuffle_epi32(HFilter, _MM_SHUFFLE(0, 0, 0, 0));
- __m128i fil23 = _mm_shuffle_epi32(HFilter, _MM_SHUFFLE(1, 1, 1, 1));
- __m128i fil45 = _mm_shuffle_epi32(HFilter, _MM_SHUFFLE(2, 2, 2, 2));
- __m128i fil67 = _mm_shuffle_epi32(HFilter, _MM_SHUFFLE(3, 3, 3, 3));
- src_ptr -= (kInterp_Extend - 1) * src_stride + (kInterp_Extend - 1);
-
- {
- __m128i mad_all0;
- __m128i mad_all1;
- __m128i mad_all2;
- __m128i mad_all3;
- DO_FOUR_PIXELS(mad_all0, 0*src_stride)
- DO_FOUR_PIXELS(mad_all1, 1*src_stride)
- DO_FOUR_PIXELS(mad_all2, 2*src_stride)
- DO_FOUR_PIXELS(mad_all3, 3*src_stride)
- mad_all0 = _mm_packs_epi32(mad_all0, mad_all1);
- mad_all2 = _mm_packs_epi32(mad_all2, mad_all3);
- intermediateA = _mm_packus_epi16(mad_all0, mad_all2);
- // --
- src_ptr += src_stride*4;
- // --
- DO_FOUR_PIXELS(mad_all0, 0*src_stride)
- DO_FOUR_PIXELS(mad_all1, 1*src_stride)
- DO_FOUR_PIXELS(mad_all2, 2*src_stride)
- DO_FOUR_PIXELS(mad_all3, 3*src_stride)
- mad_all0 = _mm_packs_epi32(mad_all0, mad_all1);
- mad_all2 = _mm_packs_epi32(mad_all2, mad_all3);
- intermediateB = _mm_packus_epi16(mad_all0, mad_all2);
- // --
- src_ptr += src_stride*4;
- // --
- DO_FOUR_PIXELS(mad_all0, 0*src_stride)
- DO_FOUR_PIXELS(mad_all1, 1*src_stride)
- DO_FOUR_PIXELS(mad_all2, 2*src_stride)
- mad_all0 = _mm_packs_epi32(mad_all0, mad_all1);
- mad_all2 = _mm_packs_epi32(mad_all2, mad_all2);
- intermediateC = _mm_packus_epi16(mad_all0, mad_all2);
- }
- }
-
- // Transpose result (intermediate -> transpose3_x)
- {
- // 00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33
- // 40 41 42 43 50 51 52 53 60 61 62 63 70 71 72 73
- // 80 81 82 83 90 91 92 93 A0 A1 A2 A3 xx xx xx xx
- const __m128i transpose1_0 = _mm_shuffle_epi8(intermediateA, transpose);
- const __m128i transpose1_1 = _mm_shuffle_epi8(intermediateB, transpose);
- const __m128i transpose1_2 = _mm_shuffle_epi8(intermediateC, transpose);
- // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
- // 40 50 60 70 41 51 61 71 42 52 62 72 43 53 63 73
- // 80 90 A0 xx 81 91 A1 xx 82 92 A2 xx 83 93 A3 xx
- const __m128i transpose2_0 = _mm_unpacklo_epi32(transpose1_0, transpose1_1);
- const __m128i transpose2_1 = _mm_unpackhi_epi32(transpose1_0, transpose1_1);
- // 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71
- // 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73
- transpose3_0 = _mm_castps_si128(
- _mm_shuffle_ps(_mm_castsi128_ps(transpose2_0),
- _mm_castsi128_ps(transpose1_2),
- _MM_SHUFFLE(0, 0, 1, 0)));
- transpose3_1 = _mm_castps_si128(
- _mm_shuffle_ps(_mm_castsi128_ps(transpose2_0),
- _mm_castsi128_ps(transpose1_2),
- _MM_SHUFFLE(1, 1, 3, 2)));
- transpose3_2 = _mm_castps_si128(
- _mm_shuffle_ps(_mm_castsi128_ps(transpose2_1),
- _mm_castsi128_ps(transpose1_2),
- _MM_SHUFFLE(2, 2, 1, 0)));
- transpose3_3 = _mm_castps_si128(
- _mm_shuffle_ps(_mm_castsi128_ps(transpose2_1),
- _mm_castsi128_ps(transpose1_2),
- _MM_SHUFFLE(3, 3, 3, 2)));
- // 00 10 20 30 40 50 60 70 80 90 A0 xx xx xx xx xx
- // 01 11 21 31 41 51 61 71 81 91 A1 xx xx xx xx xx
- // 02 12 22 32 42 52 62 72 82 92 A2 xx xx xx xx xx
- // 03 13 23 33 43 53 63 73 83 93 A3 xx xx xx xx xx
- }
-
- // Vertical pass (transpose3_x -> dst).
- {
- const __m128i VFilter = _mm_load_si128((const __m128i *)VFilter_aligned16);
- // get first two columns filter coefficients
- __m128i fil01 = _mm_shuffle_epi32(VFilter, _MM_SHUFFLE(0, 0, 0, 0));
- __m128i fil23 = _mm_shuffle_epi32(VFilter, _MM_SHUFFLE(1, 1, 1, 1));
- __m128i fil45 = _mm_shuffle_epi32(VFilter, _MM_SHUFFLE(2, 2, 2, 2));
- __m128i fil67 = _mm_shuffle_epi32(VFilter, _MM_SHUFFLE(3, 3, 3, 3));
- __m128i col0, col1, col2, col3;
- {
- //load pixels
- __m128i src = transpose3_0;
- // extract the ones used for first column
- __m128i src0123 = _mm_shuffle_epi8(src, mask0123);
- __m128i src4567 = _mm_shuffle_epi8(src, mask4567);
- __m128i src01_16 = _mm_unpacklo_epi8(src0123, zero);
- __m128i src23_16 = _mm_unpackhi_epi8(src0123, zero);
- __m128i src45_16 = _mm_unpacklo_epi8(src4567, zero);
- __m128i src67_16 = _mm_unpackhi_epi8(src4567, zero);
- // multiply accumulate them
- __m128i mad01 = _mm_madd_epi16(src01_16, fil01);
- __m128i mad23 = _mm_madd_epi16(src23_16, fil23);
- __m128i mad45 = _mm_madd_epi16(src45_16, fil45);
- __m128i mad67 = _mm_madd_epi16(src67_16, fil67);
- __m128i mad0123 = _mm_add_epi32(mad01, mad23);
- __m128i mad4567 = _mm_add_epi32(mad45, mad67);
- __m128i mad_all = _mm_add_epi32(mad0123, mad4567);
- mad_all = _mm_add_epi32(mad_all, rounding);
- mad_all = _mm_srai_epi32(mad_all, VP9_FILTER_SHIFT);
- mad_all = _mm_packs_epi32(mad_all, mad_all);
- col0 = _mm_packus_epi16(mad_all, mad_all);
- }
- {
- //load pixels
- __m128i src = transpose3_1;
- // extract the ones used for first column
- __m128i src0123 = _mm_shuffle_epi8(src, mask0123);
- __m128i src4567 = _mm_shuffle_epi8(src, mask4567);
- __m128i src01_16 = _mm_unpacklo_epi8(src0123, zero);
- __m128i src23_16 = _mm_unpackhi_epi8(src0123, zero);
- __m128i src45_16 = _mm_unpacklo_epi8(src4567, zero);
- __m128i src67_16 = _mm_unpackhi_epi8(src4567, zero);
- // multiply accumulate them
- __m128i mad01 = _mm_madd_epi16(src01_16, fil01);
- __m128i mad23 = _mm_madd_epi16(src23_16, fil23);
- __m128i mad45 = _mm_madd_epi16(src45_16, fil45);
- __m128i mad67 = _mm_madd_epi16(src67_16, fil67);
- __m128i mad0123 = _mm_add_epi32(mad01, mad23);
- __m128i mad4567 = _mm_add_epi32(mad45, mad67);
- __m128i mad_all = _mm_add_epi32(mad0123, mad4567);
- mad_all = _mm_add_epi32(mad_all, rounding);
- mad_all = _mm_srai_epi32(mad_all, VP9_FILTER_SHIFT);
- mad_all = _mm_packs_epi32(mad_all, mad_all);
- col1 = _mm_packus_epi16(mad_all, mad_all);
- }
- {
- //load pixels
- __m128i src = transpose3_2;
- // extract the ones used for first column
- __m128i src0123 = _mm_shuffle_epi8(src, mask0123);
- __m128i src4567 = _mm_shuffle_epi8(src, mask4567);
- __m128i src01_16 = _mm_unpacklo_epi8(src0123, zero);
- __m128i src23_16 = _mm_unpackhi_epi8(src0123, zero);
- __m128i src45_16 = _mm_unpacklo_epi8(src4567, zero);
- __m128i src67_16 = _mm_unpackhi_epi8(src4567, zero);
- // multiply accumulate them
- __m128i mad01 = _mm_madd_epi16(src01_16, fil01);
- __m128i mad23 = _mm_madd_epi16(src23_16, fil23);
- __m128i mad45 = _mm_madd_epi16(src45_16, fil45);
- __m128i mad67 = _mm_madd_epi16(src67_16, fil67);
- __m128i mad0123 = _mm_add_epi32(mad01, mad23);
- __m128i mad4567 = _mm_add_epi32(mad45, mad67);
- __m128i mad_all = _mm_add_epi32(mad0123, mad4567);
- mad_all = _mm_add_epi32(mad_all, rounding);
- mad_all = _mm_srai_epi32(mad_all, VP9_FILTER_SHIFT);
- mad_all = _mm_packs_epi32(mad_all, mad_all);
- col2 = _mm_packus_epi16(mad_all, mad_all);
- }
- {
- //load pixels
- __m128i src = transpose3_3;
- // extract the ones used for first column
- __m128i src0123 = _mm_shuffle_epi8(src, mask0123);
- __m128i src4567 = _mm_shuffle_epi8(src, mask4567);
- __m128i src01_16 = _mm_unpacklo_epi8(src0123, zero);
- __m128i src23_16 = _mm_unpackhi_epi8(src0123, zero);
- __m128i src45_16 = _mm_unpacklo_epi8(src4567, zero);
- __m128i src67_16 = _mm_unpackhi_epi8(src4567, zero);
- // multiply accumulate them
- __m128i mad01 = _mm_madd_epi16(src01_16, fil01);
- __m128i mad23 = _mm_madd_epi16(src23_16, fil23);
- __m128i mad45 = _mm_madd_epi16(src45_16, fil45);
- __m128i mad67 = _mm_madd_epi16(src67_16, fil67);
- __m128i mad0123 = _mm_add_epi32(mad01, mad23);
- __m128i mad4567 = _mm_add_epi32(mad45, mad67);
- __m128i mad_all = _mm_add_epi32(mad0123, mad4567);
- mad_all = _mm_add_epi32(mad_all, rounding);
- mad_all = _mm_srai_epi32(mad_all, VP9_FILTER_SHIFT);
- mad_all = _mm_packs_epi32(mad_all, mad_all);
- col3 = _mm_packus_epi16(mad_all, mad_all);
- }
- {
- __m128i col01 = _mm_unpacklo_epi8(col0, col1);
- __m128i col23 = _mm_unpacklo_epi8(col2, col3);
- __m128i col0123 = _mm_unpacklo_epi16(col01, col23);
-      //TODO(cd): look into Ronald's comment:
-      //    Future suggestion: I believe here, too, you can merge the
-      //    packs_epi32() and packus_epi16() for the 4 cols above, so that
-      //    you get the data in a single register, and then use pshufb
-      //    (shuffle_epi8()) instead of the unpacks here. Should be
-      //    2+3+2 instructions faster. (A sketch of this idea follows the
-      //    function below.)
- *((unsigned int *)&dst_ptr[dst_stride * 0]) =
- _mm_extract_epi32(col0123, 0);
- *((unsigned int *)&dst_ptr[dst_stride * 1]) =
- _mm_extract_epi32(col0123, 1);
- *((unsigned int *)&dst_ptr[dst_stride * 2]) =
- _mm_extract_epi32(col0123, 2);
- *((unsigned int *)&dst_ptr[dst_stride * 3]) =
- _mm_extract_epi32(col0123, 3);
- }
- }
- }
-}
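As an illustration of the pshufb suggestion in the TODO above, the four per-column results could stay as 32-bit words (skipping the per-column packs), be packed into a single register, and then be reordered into row order with the same transpose_c mask defined earlier, replacing the unpack sequence before the stores. A minimal sketch under those assumptions (SSE4.1 for _mm_extract_epi32; names and layout are illustrative, not code from this tree):

#include <smmintrin.h>  /* SSE4.1: _mm_extract_epi32 (_mm_shuffle_epi8 is SSSE3) */

/* Sketch of the suggested store: pack all four column results together, then
 * one byte shuffle (the transpose_c mask) puts the bytes into row order. */
static void store_4x4_via_pshufb(unsigned char *dst, int dst_stride,
                                 __m128i col0, __m128i col1,
                                 __m128i col2, __m128i col3,
                                 __m128i transpose) {
  const __m128i c01 = _mm_packs_epi32(col0, col1);   /* 8 x int16 */
  const __m128i c23 = _mm_packs_epi32(col2, col3);   /* 8 x int16 */
  const __m128i all = _mm_packus_epi16(c01, c23);    /* bytes: col0..col3 */
  const __m128i rows = _mm_shuffle_epi8(all, transpose);
  *(unsigned int *)&dst[0 * dst_stride] = _mm_extract_epi32(rows, 0);
  *(unsigned int *)&dst[1 * dst_stride] = _mm_extract_epi32(rows, 1);
  *(unsigned int *)&dst[2 * dst_stride] = _mm_extract_epi32(rows, 2);
  *(unsigned int *)&dst[3 * dst_stride] = _mm_extract_epi32(rows, 3);
}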
-
-void vp9_filter_block2d_8x4_8_sse4_1
-(
- const unsigned char *src_ptr, const unsigned int src_stride,
- const short *HFilter_aligned16, const short *VFilter_aligned16,
- unsigned char *dst_ptr, unsigned int dst_stride
-) {
- int j;
- for (j=0; j<8; j+=4) {
- vp9_filter_block2d_4x4_8_sse4_1(src_ptr + j, src_stride,
- HFilter_aligned16, VFilter_aligned16,
- dst_ptr + j, dst_stride);
- }
-}
-
-void vp9_filter_block2d_8x8_8_sse4_1
-(
- const unsigned char *src_ptr, const unsigned int src_stride,
- const short *HFilter_aligned16, const short *VFilter_aligned16,
- unsigned char *dst_ptr, unsigned int dst_stride
-) {
- int i, j;
- for (i=0; i<8; i+=4) {
- for (j=0; j<8; j+=4) {
- vp9_filter_block2d_4x4_8_sse4_1(src_ptr + j + i*src_stride, src_stride,
- HFilter_aligned16, VFilter_aligned16,
- dst_ptr + j + i*dst_stride, dst_stride);
- }
- }
-}
-
-void vp9_filter_block2d_16x16_8_sse4_1
-(
- const unsigned char *src_ptr, const unsigned int src_stride,
- const short *HFilter_aligned16, const short *VFilter_aligned16,
- unsigned char *dst_ptr, unsigned int dst_stride
-) {
- int i, j;
- for (i=0; i<16; i+=4) {
- for (j=0; j<16; j+=4) {
- vp9_filter_block2d_4x4_8_sse4_1(src_ptr + j + i*src_stride, src_stride,
- HFilter_aligned16, VFilter_aligned16,
- dst_ptr + j + i*dst_stride, dst_stride);
- }
- }
-}
--- a/vp8/common/x86/idct_x86.h
+++ /dev/null
@@ -1,64 +1,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#ifndef IDCT_X86_H
-#define IDCT_X86_H
-
-/* Note:
- *
- * This platform is commonly built for runtime CPU detection. If you modify
- * any of the function mappings present in this file, be sure to also update
- * them in the function pointer initialization code
- */
-
-#if HAVE_MMX
-extern prototype_idct(vp9_short_idct4x4llm_1_mmx);
-extern prototype_idct(vp9_short_idct4x4llm_mmx);
-extern prototype_idct_scalar_add(vp9_dc_only_idct_add_mmx);
-
-extern prototype_second_order(vp9_short_inv_walsh4x4_mmx);
-extern prototype_second_order(vp9_short_inv_walsh4x4_1_mmx);
-
-#if !CONFIG_RUNTIME_CPU_DETECT
-#undef vp9_idct_idct1
-#define vp9_idct_idct1 vp9_short_idct4x4llm_1_mmx
-
-#undef vp9_idct_idct16
-#define vp9_idct_idct16 vp9_short_idct4x4llm_mmx
-
-#undef vp9_idct_idct1_scalar_add
-#define vp9_idct_idct1_scalar_add vp9_dc_only_idct_add_mmx
-
-#undef vp9_idct_iwalsh16
-#define vp9_idct_iwalsh16 vp9_short_inv_walsh4x4_mmx
-
-#undef vp9_idct_iwalsh1
-#define vp9_idct_iwalsh1 vp9_short_inv_walsh4x4_1_mmx
-
-#endif
-#endif
-
-#if HAVE_SSE2
-
-extern prototype_second_order(vp9_short_inv_walsh4x4_sse2);
-
-#if !CONFIG_RUNTIME_CPU_DETECT
-
-#undef vp9_idct_iwalsh16
-#define vp9_idct_iwalsh16 vp9_short_inv_walsh4x4_sse2
-
-#endif
-
-#endif
-
-
-
-#endif
--- a/vp8/common/x86/idctllm_mmx.asm
+++ /dev/null
@@ -1,241 +1,0 @@
-;
-; Copyright (c) 2012 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-%include "third_party/x86inc/x86inc.asm"
-
-SECTION_RODATA
-align 16
-x_s1sqr2: times 4 dw 0x8A8C
-align 16
-x_c1sqr2less1: times 4 dw 0x4E7B
-align 16
-pw_16: times 4 dw 16
-
-SECTION .text
-
-
-; /****************************************************************************
-; * Notes:
-; *
-; * This implementation makes use of 16 bit fixed point version of two multiply
-; * constants:
-; * 1. sqrt(2) * cos (pi/8)
-; * 2. sqrt(2) * sin (pi/8)
-; * Because the first constant is bigger than 1, to maintain the same 16 bit
-; * fixed point precision as the second one, we use a trick of
-; * x * a = x + x*(a-1)
-; * so
-; * x * sqrt(2) * cos (pi/8) = x + x * (sqrt(2) *cos(pi/8)-1).
-; *
-; * For the second constant, because its 16 bit fixed point version is 35468,
-; * which is larger than 32767, it becomes a negative number in a signed
-; * 16 bit multiply.
-; * (x * (unsigned)35468 >> 16) = x * (signed)35468 >> 16 + x
-; *
-; **************************************************************************/
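A worked numeric example of the two tricks described in the notes above, assuming an arithmetic right shift on signed values (which is what taking pmulhw's high half corresponds to); the constants match x_c1sqr2less1 (0x4E7B = 20091) and x_s1sqr2 (0x8A8C = 35468, i.e. -30068 as a signed 16-bit value):

#include <assert.h>
#include <stdint.h>

int main(void) {
  const int32_t x = 1000;
  /* x * sqrt(2)*cos(pi/8): multiply by the fractional part (0x4E7B = 20091),
   * then add x back, since the constant itself is larger than 1. */
  int32_t a = x + ((x * 20091) >> 16);   /* ~= x * 1.30656 = 1306 */
  /* x * sqrt(2)*sin(pi/8): 0x8A8C = 35468 wraps to -30068 in a signed 16 bit
   * multiply, so x has to be added back to the high half of the product. */
  int32_t b = ((x * -30068) >> 16) + x;
  assert(b == ((x * 35468) >> 16));      /* both evaluate to 541 for x = 1000 */
  (void)a;
  return 0;
}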
-
-INIT_MMX
-
-;void short_idct4x4llm_mmx(short *input, short *output, int pitch)
-cglobal short_idct4x4llm_mmx, 3,3,0, inp, out, pit
- mova m0, [inpq +0]
- mova m1, [inpq +8]
-
- mova m2, [inpq+16]
- mova m3, [inpq+24]
-
- psubw m0, m2 ; b1= 0-2
- paddw m2, m2 ;
-
- mova m5, m1
- paddw m2, m0 ; a1 =0+2
-
- pmulhw m5, [x_s1sqr2] ;
- paddw m5, m1 ; ip1 * sin(pi/8) * sqrt(2)
-
- mova m7, m3 ;
- pmulhw m7, [x_c1sqr2less1] ;
-
- paddw m7, m3 ; ip3 * cos(pi/8) * sqrt(2)
- psubw m7, m5 ; c1
-
- mova m5, m1
- mova m4, m3
-
- pmulhw m5, [x_c1sqr2less1]
- paddw m5, m1
-
- pmulhw m3, [x_s1sqr2]
- paddw m3, m4
-
- paddw m3, m5 ; d1
- mova m6, m2 ; a1
-
- mova m4, m0 ; b1
- paddw m2, m3 ;0
-
- paddw m4, m7 ;1
- psubw m0, m7 ;2
-
- psubw m6, m3 ;3
-
- mova m1, m2 ; 03 02 01 00
- mova m3, m4 ; 23 22 21 20
-
- punpcklwd m1, m0 ; 11 01 10 00
- punpckhwd m2, m0 ; 13 03 12 02
-
- punpcklwd m3, m6 ; 31 21 30 20
- punpckhwd m4, m6 ; 33 23 32 22
-
- mova m0, m1 ; 11 01 10 00
- mova m5, m2 ; 13 03 12 02
-
- punpckldq m0, m3 ; 30 20 10 00
- punpckhdq m1, m3 ; 31 21 11 01
-
- punpckldq m2, m4 ; 32 22 12 02
- punpckhdq m5, m4 ; 33 23 13 03
-
- mova m3, m5 ; 33 23 13 03
-
- psubw m0, m2 ; b1= 0-2
- paddw m2, m2 ;
-
- mova m5, m1
- paddw m2, m0 ; a1 =0+2
-
- pmulhw m5, [x_s1sqr2] ;
- paddw m5, m1 ; ip1 * sin(pi/8) * sqrt(2)
-
- mova m7, m3 ;
- pmulhw m7, [x_c1sqr2less1] ;
-
- paddw m7, m3 ; ip3 * cos(pi/8) * sqrt(2)
- psubw m7, m5 ; c1
-
- mova m5, m1
- mova m4, m3
-
- pmulhw m5, [x_c1sqr2less1]
- paddw m5, m1
-
- pmulhw m3, [x_s1sqr2]
- paddw m3, m4
-
- paddw m3, m5 ; d1
- paddw m0, [pw_16]
-
- paddw m2, [pw_16]
- mova m6, m2 ; a1
-
- mova m4, m0 ; b1
- paddw m2, m3 ;0
-
- paddw m4, m7 ;1
- psubw m0, m7 ;2
-
- psubw m6, m3 ;3
- psraw m2, 5
-
- psraw m0, 5
- psraw m4, 5
-
- psraw m6, 5
-
- mova m1, m2 ; 03 02 01 00
- mova m3, m4 ; 23 22 21 20
-
- punpcklwd m1, m0 ; 11 01 10 00
- punpckhwd m2, m0 ; 13 03 12 02
-
- punpcklwd m3, m6 ; 31 21 30 20
- punpckhwd m4, m6 ; 33 23 32 22
-
- mova m0, m1 ; 11 01 10 00
- mova m5, m2 ; 13 03 12 02
-
- punpckldq m0, m3 ; 30 20 10 00
- punpckhdq m1, m3 ; 31 21 11 01
-
- punpckldq m2, m4 ; 32 22 12 02
- punpckhdq m5, m4 ; 33 23 13 03
-
- mova [outq], m0
-
-  mova            [outq+pitq], m1
- mova [outq+pitq*2], m2
-
- add outq, pitq
- mova [outq+pitq*2], m5
- RET
-
-;void short_idct4x4llm_1_mmx(short *input, short *output, int pitch)
-cglobal short_idct4x4llm_1_mmx,3,3,0,inp,out,pit
- movh m0, [inpq]
- paddw m0, [pw_16]
- psraw m0, 5
- punpcklwd m0, m0
- punpckldq m0, m0
-
- mova [outq], m0
- mova [outq+pitq], m0
-
- mova [outq+pitq*2], m0
-  add             outq, pitq
-
- mova [outq+pitq*2], m0
- RET
-
-
-;void dc_only_idct_add_mmx(short input_dc, unsigned char *pred_ptr, unsigned char *dst_ptr, int pitch, int stride)
-cglobal dc_only_idct_add_mmx, 4,5,0,in_dc,pred,dst,pit,stride
-%if ARCH_X86_64
- movsxd strideq, dword stridem
-%else
- mov strideq, stridem
-%endif
- pxor m0, m0
-
- movh m5, in_dcq ; dc
- paddw m5, [pw_16]
-
- psraw m5, 5
-
- punpcklwd m5, m5
- punpckldq m5, m5
-
- movh m1, [predq]
- punpcklbw m1, m0
- paddsw m1, m5
- packuswb m1, m0 ; pack and unpack to saturate
- movh [dstq], m1
-
- movh m2, [predq+pitq]
- punpcklbw m2, m0
- paddsw m2, m5
- packuswb m2, m0 ; pack and unpack to saturate
- movh [dstq+strideq], m2
-
- movh m3, [predq+2*pitq]
- punpcklbw m3, m0
- paddsw m3, m5
- packuswb m3, m0 ; pack and unpack to saturate
- movh [dstq+2*strideq], m3
-
- add dstq, strideq
- add predq, pitq
- movh m4, [predq+2*pitq]
- punpcklbw m4, m0
- paddsw m4, m5
- packuswb m4, m0 ; pack and unpack to saturate
- movh [dstq+2*strideq], m4
- RET
-
--- a/vp8/common/x86/idctllm_sse2.asm
+++ /dev/null
@@ -1,712 +1,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
-%include "vpx_ports/x86_abi_support.asm"
-
-;void vp9_idct_dequant_0_2x_sse2
-; (
-; short *qcoeff - 0
-; short *dequant - 1
-; unsigned char *pre - 2
-; unsigned char *dst - 3
-; int dst_stride - 4
-; int blk_stride - 5
-; )
-
-global sym(vp9_idct_dequant_0_2x_sse2)
-sym(vp9_idct_dequant_0_2x_sse2):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 6
- GET_GOT rbx
- ; end prolog
-
- mov rdx, arg(1) ; dequant
- mov rax, arg(0) ; qcoeff
-
- movd xmm4, [rax]
- movd xmm5, [rdx]
-
- pinsrw xmm4, [rax+32], 4
- pinsrw xmm5, [rdx], 4
-
- pmullw xmm4, xmm5
-
- ; Zero out xmm5, for use unpacking
- pxor xmm5, xmm5
-
- ; clear coeffs
- movd [rax], xmm5
- movd [rax+32], xmm5
-;pshufb
- pshuflw xmm4, xmm4, 00000000b
- pshufhw xmm4, xmm4, 00000000b
-
- mov rax, arg(2) ; pre
- paddw xmm4, [GLOBAL(fours)]
-
- movsxd rcx, dword ptr arg(5) ; blk_stride
- psraw xmm4, 3
-
- movq xmm0, [rax]
- movq xmm1, [rax+rcx]
- movq xmm2, [rax+2*rcx]
- lea rcx, [3*rcx]
- movq xmm3, [rax+rcx]
-
- punpcklbw xmm0, xmm5
- punpcklbw xmm1, xmm5
- punpcklbw xmm2, xmm5
- punpcklbw xmm3, xmm5
-
- mov rax, arg(3) ; dst
- movsxd rdx, dword ptr arg(4) ; dst_stride
-
- ; Add to predict buffer
- paddw xmm0, xmm4
- paddw xmm1, xmm4
- paddw xmm2, xmm4
- paddw xmm3, xmm4
-
- ; pack up before storing
- packuswb xmm0, xmm5
- packuswb xmm1, xmm5
- packuswb xmm2, xmm5
- packuswb xmm3, xmm5
-
- ; store blocks back out
- movq [rax], xmm0
- movq [rax + rdx], xmm1
-
- lea rax, [rax + 2*rdx]
-
- movq [rax], xmm2
- movq [rax + rdx], xmm3
-
- ; begin epilog
- RESTORE_GOT
- UNSHADOW_ARGS
- pop rbp
- ret
-
-global sym(vp9_idct_dequant_full_2x_sse2)
-sym(vp9_idct_dequant_full_2x_sse2):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 7
- SAVE_XMM 7
- GET_GOT rbx
- push rsi
- push rdi
- ; end prolog
-
- ; special case when 2 blocks have 0 or 1 coeffs
- ; dc is set as first coeff, so no need to load qcoeff
- mov rax, arg(0) ; qcoeff
- mov rsi, arg(2) ; pre
- mov rdi, arg(3) ; dst
- movsxd rcx, dword ptr arg(5) ; blk_stride
-
- ; Zero out xmm7, for use unpacking
- pxor xmm7, xmm7
-
- mov rdx, arg(1) ; dequant
-
- ; note the transpose of xmm1 and xmm2, necessary for shuffle
-    ; to spit out sensible data
- movdqa xmm0, [rax]
- movdqa xmm2, [rax+16]
- movdqa xmm1, [rax+32]
- movdqa xmm3, [rax+48]
-
- ; Clear out coeffs
- movdqa [rax], xmm7
- movdqa [rax+16], xmm7
- movdqa [rax+32], xmm7
- movdqa [rax+48], xmm7
-
- ; dequantize qcoeff buffer
- pmullw xmm0, [rdx]
- pmullw xmm2, [rdx+16]
- pmullw xmm1, [rdx]
- pmullw xmm3, [rdx+16]
-
- ; repack so block 0 row x and block 1 row x are together
- movdqa xmm4, xmm0
- punpckldq xmm0, xmm1
- punpckhdq xmm4, xmm1
-
- pshufd xmm0, xmm0, 11011000b
- pshufd xmm1, xmm4, 11011000b
-
- movdqa xmm4, xmm2
- punpckldq xmm2, xmm3
- punpckhdq xmm4, xmm3
-
- pshufd xmm2, xmm2, 11011000b
- pshufd xmm3, xmm4, 11011000b
-
- ; first pass
- psubw xmm0, xmm2 ; b1 = 0-2
- paddw xmm2, xmm2 ;
-
- movdqa xmm5, xmm1
- paddw xmm2, xmm0 ; a1 = 0+2
-
- pmulhw xmm5, [GLOBAL(x_s1sqr2)]
- paddw xmm5, xmm1 ; ip1 * sin(pi/8) * sqrt(2)
-
- movdqa xmm7, xmm3
- pmulhw xmm7, [GLOBAL(x_c1sqr2less1)]
-
- paddw xmm7, xmm3 ; ip3 * cos(pi/8) * sqrt(2)
- psubw xmm7, xmm5 ; c1
-
- movdqa xmm5, xmm1
- movdqa xmm4, xmm3
-
- pmulhw xmm5, [GLOBAL(x_c1sqr2less1)]
- paddw xmm5, xmm1
-
- pmulhw xmm3, [GLOBAL(x_s1sqr2)]
- paddw xmm3, xmm4
-
- paddw xmm3, xmm5 ; d1
- movdqa xmm6, xmm2 ; a1
-
- movdqa xmm4, xmm0 ; b1
- paddw xmm2, xmm3 ;0
-
- paddw xmm4, xmm7 ;1
- psubw xmm0, xmm7 ;2
-
- psubw xmm6, xmm3 ;3
-
- ; transpose for the second pass
- movdqa xmm7, xmm2 ; 103 102 101 100 003 002 001 000
- punpcklwd xmm2, xmm0 ; 007 003 006 002 005 001 004 000
- punpckhwd xmm7, xmm0 ; 107 103 106 102 105 101 104 100
-
- movdqa xmm5, xmm4 ; 111 110 109 108 011 010 009 008
- punpcklwd xmm4, xmm6 ; 015 011 014 010 013 009 012 008
- punpckhwd xmm5, xmm6 ; 115 111 114 110 113 109 112 108
-
-
- movdqa xmm1, xmm2 ; 007 003 006 002 005 001 004 000
- punpckldq xmm2, xmm4 ; 013 009 005 001 012 008 004 000
- punpckhdq xmm1, xmm4 ; 015 011 007 003 014 010 006 002
-
- movdqa xmm6, xmm7 ; 107 103 106 102 105 101 104 100
- punpckldq xmm7, xmm5 ; 113 109 105 101 112 108 104 100
- punpckhdq xmm6, xmm5 ; 115 111 107 103 114 110 106 102
-
-
- movdqa xmm5, xmm2 ; 013 009 005 001 012 008 004 000
- punpckldq xmm2, xmm7 ; 112 108 012 008 104 100 004 000
- punpckhdq xmm5, xmm7 ; 113 109 013 009 105 101 005 001
-
- movdqa xmm7, xmm1 ; 015 011 007 003 014 010 006 002
- punpckldq xmm1, xmm6 ; 114 110 014 010 106 102 006 002
- punpckhdq xmm7, xmm6 ; 115 111 015 011 107 103 007 003
-
- pshufd xmm0, xmm2, 11011000b
- pshufd xmm2, xmm1, 11011000b
-
- pshufd xmm1, xmm5, 11011000b
- pshufd xmm3, xmm7, 11011000b
-
- ; second pass
- psubw xmm0, xmm2 ; b1 = 0-2
- paddw xmm2, xmm2
-
- movdqa xmm5, xmm1
- paddw xmm2, xmm0 ; a1 = 0+2
-
- pmulhw xmm5, [GLOBAL(x_s1sqr2)]
- paddw xmm5, xmm1 ; ip1 * sin(pi/8) * sqrt(2)
-
- movdqa xmm7, xmm3
- pmulhw xmm7, [GLOBAL(x_c1sqr2less1)]
-
- paddw xmm7, xmm3 ; ip3 * cos(pi/8) * sqrt(2)
- psubw xmm7, xmm5 ; c1
-
- movdqa xmm5, xmm1
- movdqa xmm4, xmm3
-
- pmulhw xmm5, [GLOBAL(x_c1sqr2less1)]
- paddw xmm5, xmm1
-
- pmulhw xmm3, [GLOBAL(x_s1sqr2)]
- paddw xmm3, xmm4
-
- paddw xmm3, xmm5 ; d1
- paddw xmm0, [GLOBAL(fours)]
-
- paddw xmm2, [GLOBAL(fours)]
- movdqa xmm6, xmm2 ; a1
-
- movdqa xmm4, xmm0 ; b1
- paddw xmm2, xmm3 ;0
-
- paddw xmm4, xmm7 ;1
- psubw xmm0, xmm7 ;2
-
- psubw xmm6, xmm3 ;3
- psraw xmm2, 3
-
- psraw xmm0, 3
- psraw xmm4, 3
-
- psraw xmm6, 3
-
- ; transpose to save
- movdqa xmm7, xmm2 ; 103 102 101 100 003 002 001 000
- punpcklwd xmm2, xmm0 ; 007 003 006 002 005 001 004 000
- punpckhwd xmm7, xmm0 ; 107 103 106 102 105 101 104 100
-
- movdqa xmm5, xmm4 ; 111 110 109 108 011 010 009 008
- punpcklwd xmm4, xmm6 ; 015 011 014 010 013 009 012 008
- punpckhwd xmm5, xmm6 ; 115 111 114 110 113 109 112 108
-
-
- movdqa xmm1, xmm2 ; 007 003 006 002 005 001 004 000
- punpckldq xmm2, xmm4 ; 013 009 005 001 012 008 004 000
- punpckhdq xmm1, xmm4 ; 015 011 007 003 014 010 006 002
-
- movdqa xmm6, xmm7 ; 107 103 106 102 105 101 104 100
- punpckldq xmm7, xmm5 ; 113 109 105 101 112 108 104 100
- punpckhdq xmm6, xmm5 ; 115 111 107 103 114 110 106 102
-
-
- movdqa xmm5, xmm2 ; 013 009 005 001 012 008 004 000
- punpckldq xmm2, xmm7 ; 112 108 012 008 104 100 004 000
- punpckhdq xmm5, xmm7 ; 113 109 013 009 105 101 005 001
-
- movdqa xmm7, xmm1 ; 015 011 007 003 014 010 006 002
- punpckldq xmm1, xmm6 ; 114 110 014 010 106 102 006 002
- punpckhdq xmm7, xmm6 ; 115 111 015 011 107 103 007 003
-
- pshufd xmm0, xmm2, 11011000b
- pshufd xmm2, xmm1, 11011000b
-
- pshufd xmm1, xmm5, 11011000b
- pshufd xmm3, xmm7, 11011000b
-
- pxor xmm7, xmm7
-
- ; Load up predict blocks
- movq xmm4, [rsi]
- movq xmm5, [rsi+rcx]
-
- punpcklbw xmm4, xmm7
- punpcklbw xmm5, xmm7
-
- paddw xmm0, xmm4
- paddw xmm1, xmm5
-
- movq xmm4, [rsi+2*rcx]
- lea rcx, [3*rcx]
- movq xmm5, [rsi+rcx]
-
- punpcklbw xmm4, xmm7
- punpcklbw xmm5, xmm7
-
- paddw xmm2, xmm4
- paddw xmm3, xmm5
-
-.finish:
-
- ; pack up before storing
- packuswb xmm0, xmm7
- packuswb xmm1, xmm7
- packuswb xmm2, xmm7
- packuswb xmm3, xmm7
-
- ; Load destination stride before writing out,
- ; doesn't need to persist
- movsxd rdx, dword ptr arg(4) ; dst_stride
-
- ; store blocks back out
- movq [rdi], xmm0
- movq [rdi + rdx], xmm1
-
- lea rdi, [rdi + 2*rdx]
-
- movq [rdi], xmm2
- movq [rdi + rdx], xmm3
-
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_GOT
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
-
-;void vp9_idct_dequant_dc_0_2x_sse2
-; (
-; short *qcoeff - 0
-; short *dequant - 1
-; unsigned char *pre - 2
-; unsigned char *dst - 3
-; int dst_stride - 4
-; short *dc - 5
-; )
-global sym(vp9_idct_dequant_dc_0_2x_sse2)
-sym(vp9_idct_dequant_dc_0_2x_sse2):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 7
- GET_GOT rbx
- push rsi
- push rdi
- ; end prolog
-
- ; special case when 2 blocks have 0 or 1 coeffs
- ; dc is set as first coeff, so no need to load qcoeff
- mov rax, arg(0) ; qcoeff
- mov rsi, arg(2) ; pre
- mov rdi, arg(3) ; dst
- mov rdx, arg(5) ; dc
-
- ; Zero out xmm5, for use unpacking
- pxor xmm5, xmm5
-
- ; load up 2 dc words here == 2*16 = doubleword
- movd xmm4, [rdx]
-
- ; Load up predict blocks
- movq xmm0, [rsi]
- movq xmm1, [rsi+16]
- movq xmm2, [rsi+32]
- movq xmm3, [rsi+48]
-
- ; Duplicate and expand dc across
- punpcklwd xmm4, xmm4
- punpckldq xmm4, xmm4
-
- ; Rounding to dequant and downshift
- paddw xmm4, [GLOBAL(fours)]
- psraw xmm4, 3
-
- ; Predict buffer needs to be expanded from bytes to words
- punpcklbw xmm0, xmm5
- punpcklbw xmm1, xmm5
- punpcklbw xmm2, xmm5
- punpcklbw xmm3, xmm5
-
- ; Add to predict buffer
- paddw xmm0, xmm4
- paddw xmm1, xmm4
- paddw xmm2, xmm4
- paddw xmm3, xmm4
-
- ; pack up before storing
- packuswb xmm0, xmm5
- packuswb xmm1, xmm5
- packuswb xmm2, xmm5
- packuswb xmm3, xmm5
-
- ; Load destination stride before writing out,
- ; doesn't need to persist
- movsxd rdx, dword ptr arg(4) ; dst_stride
-
- ; store blocks back out
- movq [rdi], xmm0
- movq [rdi + rdx], xmm1
-
- lea rdi, [rdi + 2*rdx]
-
- movq [rdi], xmm2
- movq [rdi + rdx], xmm3
-
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_GOT
- UNSHADOW_ARGS
- pop rbp
- ret
-
-global sym(vp9_idct_dequant_dc_full_2x_sse2)
-sym(vp9_idct_dequant_dc_full_2x_sse2):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 7
- SAVE_XMM 7
- GET_GOT rbx
- push rsi
- push rdi
- ; end prolog
-
- ; special case when 2 blocks have 0 or 1 coeffs
- ; dc is set as first coeff, so no need to load qcoeff
- mov rax, arg(0) ; qcoeff
- mov rsi, arg(2) ; pre
- mov rdi, arg(3) ; dst
-
- ; Zero out xmm7, for use unpacking
- pxor xmm7, xmm7
-
- mov rdx, arg(1) ; dequant
-
- ; note the transpose of xmm1 and xmm2, necessary for shuffle
-    ; to spit out sensible data
- movdqa xmm0, [rax]
- movdqa xmm2, [rax+16]
- movdqa xmm1, [rax+32]
- movdqa xmm3, [rax+48]
-
- ; Clear out coeffs
- movdqa [rax], xmm7
- movdqa [rax+16], xmm7
- movdqa [rax+32], xmm7
- movdqa [rax+48], xmm7
-
- ; dequantize qcoeff buffer
- pmullw xmm0, [rdx]
- pmullw xmm2, [rdx+16]
- pmullw xmm1, [rdx]
- pmullw xmm3, [rdx+16]
-
- ; DC component
- mov rdx, arg(5)
-
- ; repack so block 0 row x and block 1 row x are together
- movdqa xmm4, xmm0
- punpckldq xmm0, xmm1
- punpckhdq xmm4, xmm1
-
- pshufd xmm0, xmm0, 11011000b
- pshufd xmm1, xmm4, 11011000b
-
- movdqa xmm4, xmm2
- punpckldq xmm2, xmm3
- punpckhdq xmm4, xmm3
-
- pshufd xmm2, xmm2, 11011000b
- pshufd xmm3, xmm4, 11011000b
-
- ; insert DC component
- pinsrw xmm0, [rdx], 0
- pinsrw xmm0, [rdx+2], 4
-
- ; first pass
- psubw xmm0, xmm2 ; b1 = 0-2
- paddw xmm2, xmm2 ;
-
- movdqa xmm5, xmm1
- paddw xmm2, xmm0 ; a1 = 0+2
-
- pmulhw xmm5, [GLOBAL(x_s1sqr2)]
- paddw xmm5, xmm1 ; ip1 * sin(pi/8) * sqrt(2)
-
- movdqa xmm7, xmm3
- pmulhw xmm7, [GLOBAL(x_c1sqr2less1)]
-
- paddw xmm7, xmm3 ; ip3 * cos(pi/8) * sqrt(2)
- psubw xmm7, xmm5 ; c1
-
- movdqa xmm5, xmm1
- movdqa xmm4, xmm3
-
- pmulhw xmm5, [GLOBAL(x_c1sqr2less1)]
- paddw xmm5, xmm1
-
- pmulhw xmm3, [GLOBAL(x_s1sqr2)]
- paddw xmm3, xmm4
-
- paddw xmm3, xmm5 ; d1
- movdqa xmm6, xmm2 ; a1
-
- movdqa xmm4, xmm0 ; b1
- paddw xmm2, xmm3 ;0
-
- paddw xmm4, xmm7 ;1
- psubw xmm0, xmm7 ;2
-
- psubw xmm6, xmm3 ;3
-
- ; transpose for the second pass
- movdqa xmm7, xmm2 ; 103 102 101 100 003 002 001 000
- punpcklwd xmm2, xmm0 ; 007 003 006 002 005 001 004 000
- punpckhwd xmm7, xmm0 ; 107 103 106 102 105 101 104 100
-
- movdqa xmm5, xmm4 ; 111 110 109 108 011 010 009 008
- punpcklwd xmm4, xmm6 ; 015 011 014 010 013 009 012 008
- punpckhwd xmm5, xmm6 ; 115 111 114 110 113 109 112 108
-
-
- movdqa xmm1, xmm2 ; 007 003 006 002 005 001 004 000
- punpckldq xmm2, xmm4 ; 013 009 005 001 012 008 004 000
- punpckhdq xmm1, xmm4 ; 015 011 007 003 014 010 006 002
-
- movdqa xmm6, xmm7 ; 107 103 106 102 105 101 104 100
- punpckldq xmm7, xmm5 ; 113 109 105 101 112 108 104 100
- punpckhdq xmm6, xmm5 ; 115 111 107 103 114 110 106 102
-
-
- movdqa xmm5, xmm2 ; 013 009 005 001 012 008 004 000
- punpckldq xmm2, xmm7 ; 112 108 012 008 104 100 004 000
- punpckhdq xmm5, xmm7 ; 113 109 013 009 105 101 005 001
-
- movdqa xmm7, xmm1 ; 015 011 007 003 014 010 006 002
- punpckldq xmm1, xmm6 ; 114 110 014 010 106 102 006 002
- punpckhdq xmm7, xmm6 ; 115 111 015 011 107 103 007 003
-
- pshufd xmm0, xmm2, 11011000b
- pshufd xmm2, xmm1, 11011000b
-
- pshufd xmm1, xmm5, 11011000b
- pshufd xmm3, xmm7, 11011000b
-
- ; second pass
- psubw xmm0, xmm2 ; b1 = 0-2
- paddw xmm2, xmm2
-
- movdqa xmm5, xmm1
- paddw xmm2, xmm0 ; a1 = 0+2
-
- pmulhw xmm5, [GLOBAL(x_s1sqr2)]
- paddw xmm5, xmm1 ; ip1 * sin(pi/8) * sqrt(2)
-
- movdqa xmm7, xmm3
- pmulhw xmm7, [GLOBAL(x_c1sqr2less1)]
-
- paddw xmm7, xmm3 ; ip3 * cos(pi/8) * sqrt(2)
- psubw xmm7, xmm5 ; c1
-
- movdqa xmm5, xmm1
- movdqa xmm4, xmm3
-
- pmulhw xmm5, [GLOBAL(x_c1sqr2less1)]
- paddw xmm5, xmm1
-
- pmulhw xmm3, [GLOBAL(x_s1sqr2)]
- paddw xmm3, xmm4
-
- paddw xmm3, xmm5 ; d1
- paddw xmm0, [GLOBAL(fours)]
-
- paddw xmm2, [GLOBAL(fours)]
- movdqa xmm6, xmm2 ; a1
-
- movdqa xmm4, xmm0 ; b1
- paddw xmm2, xmm3 ;0
-
- paddw xmm4, xmm7 ;1
- psubw xmm0, xmm7 ;2
-
- psubw xmm6, xmm3 ;3
- psraw xmm2, 3
-
- psraw xmm0, 3
- psraw xmm4, 3
-
- psraw xmm6, 3
-
- ; transpose to save
- movdqa xmm7, xmm2 ; 103 102 101 100 003 002 001 000
- punpcklwd xmm2, xmm0 ; 007 003 006 002 005 001 004 000
- punpckhwd xmm7, xmm0 ; 107 103 106 102 105 101 104 100
-
- movdqa xmm5, xmm4 ; 111 110 109 108 011 010 009 008
- punpcklwd xmm4, xmm6 ; 015 011 014 010 013 009 012 008
- punpckhwd xmm5, xmm6 ; 115 111 114 110 113 109 112 108
-
-
- movdqa xmm1, xmm2 ; 007 003 006 002 005 001 004 000
- punpckldq xmm2, xmm4 ; 013 009 005 001 012 008 004 000
- punpckhdq xmm1, xmm4 ; 015 011 007 003 014 010 006 002
-
- movdqa xmm6, xmm7 ; 107 103 106 102 105 101 104 100
- punpckldq xmm7, xmm5 ; 113 109 105 101 112 108 104 100
- punpckhdq xmm6, xmm5 ; 115 111 107 103 114 110 106 102
-
-
- movdqa xmm5, xmm2 ; 013 009 005 001 012 008 004 000
- punpckldq xmm2, xmm7 ; 112 108 012 008 104 100 004 000
- punpckhdq xmm5, xmm7 ; 113 109 013 009 105 101 005 001
-
- movdqa xmm7, xmm1 ; 015 011 007 003 014 010 006 002
- punpckldq xmm1, xmm6 ; 114 110 014 010 106 102 006 002
- punpckhdq xmm7, xmm6 ; 115 111 015 011 107 103 007 003
-
- pshufd xmm0, xmm2, 11011000b
- pshufd xmm2, xmm1, 11011000b
-
- pshufd xmm1, xmm5, 11011000b
- pshufd xmm3, xmm7, 11011000b
-
- pxor xmm7, xmm7
-
- ; Load up predict blocks
- movq xmm4, [rsi]
- movq xmm5, [rsi+16]
-
- punpcklbw xmm4, xmm7
- punpcklbw xmm5, xmm7
-
- paddw xmm0, xmm4
- paddw xmm1, xmm5
-
- movq xmm4, [rsi+32]
- movq xmm5, [rsi+48]
-
- punpcklbw xmm4, xmm7
- punpcklbw xmm5, xmm7
-
- paddw xmm2, xmm4
- paddw xmm3, xmm5
-
-.finish:
-
- ; pack up before storing
- packuswb xmm0, xmm7
- packuswb xmm1, xmm7
- packuswb xmm2, xmm7
- packuswb xmm3, xmm7
-
- ; Load destination stride before writing out,
- ; doesn't need to persist
- movsxd rdx, dword ptr arg(4) ; dst_stride
-
- ; store blocks back out
- movq [rdi], xmm0
- movq [rdi + rdx], xmm1
-
- lea rdi, [rdi + 2*rdx]
-
- movq [rdi], xmm2
- movq [rdi + rdx], xmm3
-
-
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_GOT
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
-
-SECTION_RODATA
-align 16
-fours:
- times 8 dw 0x0004
-align 16
-x_s1sqr2:
- times 8 dw 0x8A8C
-align 16
-x_c1sqr2less1:
- times 8 dw 0x4E7B
--- a/vp8/common/x86/iwalsh_mmx.asm
+++ /dev/null
@@ -1,173 +1,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
-%include "vpx_ports/x86_abi_support.asm"
-
-;void vp9_short_inv_walsh4x4_1_mmx(short *input, short *output)
-global sym(vp9_short_inv_walsh4x4_1_mmx)
-sym(vp9_short_inv_walsh4x4_1_mmx):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 2
- push rsi
- push rdi
- ; end prolog
-
- mov rsi, arg(0)
- mov rax, 3
-
- mov rdi, arg(1)
- add rax, [rsi] ;input[0] + 3
-
- movd mm0, eax
-
- punpcklwd mm0, mm0 ;x x val val
-
- punpckldq mm0, mm0 ;val val val val
-
- psraw mm0, 3 ;(input[0] + 3) >> 3
-
- movq [rdi + 0], mm0
- movq [rdi + 8], mm0
- movq [rdi + 16], mm0
- movq [rdi + 24], mm0
-
- ; begin epilog
- pop rdi
- pop rsi
- UNSHADOW_ARGS
- pop rbp
- ret
-
-;void vp9_short_inv_walsh4x4_mmx(short *input, short *output)
-global sym(vp9_short_inv_walsh4x4_mmx)
-sym(vp9_short_inv_walsh4x4_mmx):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 2
- push rsi
- push rdi
- ; end prolog
-
- mov rax, 3
- mov rsi, arg(0)
- mov rdi, arg(1)
- shl rax, 16
-
- movq mm0, [rsi + 0] ;ip[0]
- movq mm1, [rsi + 8] ;ip[4]
- or rax, 3 ;00030003h
-
- movq mm2, [rsi + 16] ;ip[8]
- movq mm3, [rsi + 24] ;ip[12]
-
- movq mm7, rax
- movq mm4, mm0
-
- punpcklwd mm7, mm7 ;0003000300030003h
- movq mm5, mm1
-
- paddw mm4, mm3 ;ip[0] + ip[12] aka al
- paddw mm5, mm2 ;ip[4] + ip[8] aka bl
-
- movq mm6, mm4 ;temp al
-
- paddw mm4, mm5 ;al + bl
- psubw mm6, mm5 ;al - bl
-
- psubw mm0, mm3 ;ip[0] - ip[12] aka d1
- psubw mm1, mm2 ;ip[4] - ip[8] aka c1
-
- movq mm5, mm0 ;temp dl
-
- paddw mm0, mm1 ;dl + cl
- psubw mm5, mm1 ;dl - cl
-
- ; 03 02 01 00
- ; 13 12 11 10
- ; 23 22 21 20
- ; 33 32 31 30
-
- movq mm3, mm4 ; 03 02 01 00
- punpcklwd mm4, mm0 ; 11 01 10 00
- punpckhwd mm3, mm0 ; 13 03 12 02
-
- movq mm1, mm6 ; 23 22 21 20
- punpcklwd mm6, mm5 ; 31 21 30 20
- punpckhwd mm1, mm5 ; 33 23 32 22
-
- movq mm0, mm4 ; 11 01 10 00
- movq mm2, mm3 ; 13 03 12 02
-
- punpckldq mm0, mm6 ; 30 20 10 00 aka ip[0]
- punpckhdq mm4, mm6 ; 31 21 11 01 aka ip[4]
-
- punpckldq mm2, mm1 ; 32 22 12 02 aka ip[8]
- punpckhdq mm3, mm1 ; 33 23 13 03 aka ip[12]
-;~~~~~~~~~~~~~~~~~~~~~
- movq mm1, mm0
- movq mm5, mm4
-
- paddw mm1, mm3 ;ip[0] + ip[12] aka al
- paddw mm5, mm2 ;ip[4] + ip[8] aka bl
-
- movq mm6, mm1 ;temp al
-
- paddw mm1, mm5 ;al + bl
- psubw mm6, mm5 ;al - bl
-
- psubw mm0, mm3 ;ip[0] - ip[12] aka d1
- psubw mm4, mm2 ;ip[4] - ip[8] aka c1
-
- movq mm5, mm0 ;temp dl
-
- paddw mm0, mm4 ;dl + cl
- psubw mm5, mm4 ;dl - cl
-;~~~~~~~~~~~~~~~~~~~~~
- movq mm3, mm1 ; 03 02 01 00
- punpcklwd mm1, mm0 ; 11 01 10 00
- punpckhwd mm3, mm0 ; 13 03 12 02
-
- movq mm4, mm6 ; 23 22 21 20
- punpcklwd mm6, mm5 ; 31 21 30 20
- punpckhwd mm4, mm5 ; 33 23 32 22
-
- movq mm0, mm1 ; 11 01 10 00
- movq mm2, mm3 ; 13 03 12 02
-
- punpckldq mm0, mm6 ; 30 20 10 00 aka ip[0]
- punpckhdq mm1, mm6 ; 31 21 11 01 aka ip[4]
-
- punpckldq mm2, mm4 ; 32 22 12 02 aka ip[8]
- punpckhdq mm3, mm4 ; 33 23 13 03 aka ip[12]
-
- paddw mm0, mm7
- paddw mm1, mm7
- paddw mm2, mm7
- paddw mm3, mm7
-
- psraw mm0, 3
- psraw mm1, 3
- psraw mm2, 3
- psraw mm3, 3
-
- movq [rdi + 0], mm0
- movq [rdi + 8], mm1
- movq [rdi + 16], mm2
- movq [rdi + 24], mm3
-
- ; begin epilog
- pop rdi
- pop rsi
- UNSHADOW_ARGS
- pop rbp
- ret
-
--- a/vp8/common/x86/iwalsh_sse2.asm
+++ /dev/null
@@ -1,119 +1,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
-%include "vpx_ports/x86_abi_support.asm"
-
-;void vp9_short_inv_walsh4x4_sse2(short *input, short *output)
-global sym(vp9_short_inv_walsh4x4_sse2)
-sym(vp9_short_inv_walsh4x4_sse2):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 2
- SAVE_XMM 6
- push rsi
- push rdi
- ; end prolog
-
- mov rsi, arg(0)
- mov rdi, arg(1)
- mov rax, 3
-
- movdqa xmm0, [rsi + 0] ;ip[4] ip[0]
- movdqa xmm1, [rsi + 16] ;ip[12] ip[8]
-
- shl rax, 16
- or rax, 3 ;00030003h
-
- pshufd xmm2, xmm1, 4eh ;ip[8] ip[12]
- movdqa xmm3, xmm0 ;ip[4] ip[0]
-
- paddw xmm0, xmm2 ;ip[4]+ip[8] ip[0]+ip[12] aka b1 a1
- psubw xmm3, xmm2 ;ip[4]-ip[8] ip[0]-ip[12] aka c1 d1
-
- movdqa xmm4, xmm0
- punpcklqdq xmm0, xmm3 ;d1 a1
- punpckhqdq xmm4, xmm3 ;c1 b1
- movd xmm6, eax
-
- movdqa xmm1, xmm4 ;c1 b1
- paddw xmm4, xmm0 ;dl+cl a1+b1 aka op[4] op[0]
- psubw xmm0, xmm1 ;d1-c1 a1-b1 aka op[12] op[8]
-
-;;;temp output
-;; movdqu [rdi + 0], xmm4
-;; movdqu [rdi + 16], xmm3
-
-;~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- ; 13 12 11 10 03 02 01 00
- ;
- ; 33 32 31 30 23 22 21 20
- ;
- movdqa xmm3, xmm4 ; 13 12 11 10 03 02 01 00
- punpcklwd xmm4, xmm0 ; 23 03 22 02 21 01 20 00
- punpckhwd xmm3, xmm0 ; 33 13 32 12 31 11 30 10
- movdqa xmm1, xmm4 ; 23 03 22 02 21 01 20 00
- punpcklwd xmm4, xmm3 ; 31 21 11 01 30 20 10 00
- punpckhwd xmm1, xmm3 ; 33 23 13 03 32 22 12 02
- ;~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- pshufd xmm2, xmm1, 4eh ;ip[8] ip[12]
- movdqa xmm3, xmm4 ;ip[4] ip[0]
-
- pshufd xmm6, xmm6, 0 ;03 03 03 03 03 03 03 03
-
- paddw xmm4, xmm2 ;ip[4]+ip[8] ip[0]+ip[12] aka b1 a1
- psubw xmm3, xmm2 ;ip[4]-ip[8] ip[0]-ip[12] aka c1 d1
-
- movdqa xmm5, xmm4
- punpcklqdq xmm4, xmm3 ;d1 a1
- punpckhqdq xmm5, xmm3 ;c1 b1
-
- movdqa xmm1, xmm5 ;c1 b1
- paddw xmm5, xmm4 ;dl+cl a1+b1 aka op[4] op[0]
- psubw xmm4, xmm1 ;d1-c1 a1-b1 aka op[12] op[8]
-;~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- ; 13 12 11 10 03 02 01 00
- ;
- ; 33 32 31 30 23 22 21 20
- ;
- movdqa xmm0, xmm5 ; 13 12 11 10 03 02 01 00
- punpcklwd xmm5, xmm4 ; 23 03 22 02 21 01 20 00
- punpckhwd xmm0, xmm4 ; 33 13 32 12 31 11 30 10
- movdqa xmm1, xmm5 ; 23 03 22 02 21 01 20 00
- punpcklwd xmm5, xmm0 ; 31 21 11 01 30 20 10 00
- punpckhwd xmm1, xmm0 ; 33 23 13 03 32 22 12 02
-;~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- paddw xmm5, xmm6
- paddw xmm1, xmm6
-
- psraw xmm5, 3
- psraw xmm1, 3
-
- movdqa [rdi + 0], xmm5
- movdqa [rdi + 16], xmm1
-
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
-
-SECTION_RODATA
-align 16
-x_s1sqr2:
- times 4 dw 0x8A8C
-align 16
-x_c1sqr2less1:
- times 4 dw 0x4E7B
-align 16
-fours:
- times 4 dw 0x0004
--- a/vp8/common/x86/loopfilter_mmx.asm
+++ /dev/null
@@ -1,969 +1,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
-%include "vpx_ports/x86_abi_support.asm"
-
-
-;void vp9_loop_filter_horizontal_edge_mmx
-;(
-; unsigned char *src_ptr,
-; int src_pixel_step,
-; const char *blimit,
-; const char *limit,
-; const char *thresh,
-; int count
-;)
-global sym(vp9_loop_filter_horizontal_edge_mmx)
-sym(vp9_loop_filter_horizontal_edge_mmx):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 6
- GET_GOT rbx
- push rsi
- push rdi
- ; end prolog
-
- ALIGN_STACK 16, rax
- sub rsp, 32 ; reserve 32 bytes
- %define t0 [rsp + 0] ;__declspec(align(16)) char t0[8];
- %define t1 [rsp + 16] ;__declspec(align(16)) char t1[8];
-
- mov rsi, arg(0) ;src_ptr
- movsxd rax, dword ptr arg(1) ;src_pixel_step ; destination pitch?
-
- movsxd rcx, dword ptr arg(5) ;count
-.next8_h:
- mov rdx, arg(3) ;limit
- movq mm7, [rdx]
- mov rdi, rsi ; rdi points to row +1 for indirect addressing
- add rdi, rax
-
- ; calculate breakout conditions
- movq mm2, [rdi+2*rax] ; q3
- movq mm1, [rsi+2*rax] ; q2
- movq mm6, mm1 ; q2
- psubusb mm1, mm2 ; q2-=q3
- psubusb mm2, mm6 ; q3-=q2
- por mm1, mm2 ; abs(q3-q2)
- psubusb mm1, mm7 ;
-
-
- movq mm4, [rsi+rax] ; q1
- movq mm3, mm4 ; q1
- psubusb mm4, mm6 ; q1-=q2
- psubusb mm6, mm3 ; q2-=q1
- por mm4, mm6 ; abs(q2-q1)
-
- psubusb mm4, mm7
- por mm1, mm4
-
- movq mm4, [rsi] ; q0
- movq mm0, mm4 ; q0
- psubusb mm4, mm3 ; q0-=q1
- psubusb mm3, mm0 ; q1-=q0
- por mm4, mm3 ; abs(q0-q1)
- movq t0, mm4 ; save to t0
- psubusb mm4, mm7
- por mm1, mm4
-
-
- neg rax ; negate pitch to deal with above border
-
- movq mm2, [rsi+4*rax] ; p3
- movq mm4, [rdi+4*rax] ; p2
- movq mm5, mm4 ; p2
- psubusb mm4, mm2 ; p2-=p3
- psubusb mm2, mm5 ; p3-=p2
- por mm4, mm2 ; abs(p3 - p2)
- psubusb mm4, mm7
- por mm1, mm4
-
-
- movq mm4, [rsi+2*rax] ; p1
- movq mm3, mm4 ; p1
- psubusb mm4, mm5 ; p1-=p2
- psubusb mm5, mm3 ; p2-=p1
- por mm4, mm5 ; abs(p2 - p1)
- psubusb mm4, mm7
- por mm1, mm4
-
- movq mm2, mm3 ; p1
-
- movq mm4, [rsi+rax] ; p0
- movq mm5, mm4 ; p0
- psubusb mm4, mm3 ; p0-=p1
- psubusb mm3, mm5 ; p1-=p0
- por mm4, mm3 ; abs(p1 - p0)
- movq t1, mm4 ; save to t1
- psubusb mm4, mm7
- por mm1, mm4
-
- movq mm3, [rdi] ; q1
- movq mm4, mm3 ; q1
- psubusb mm3, mm2 ; q1-=p1
- psubusb mm2, mm4 ; p1-=q1
- por mm2, mm3 ; abs(p1-q1)
- pand mm2, [GLOBAL(tfe)] ; set lsb of each byte to zero
- psrlw mm2, 1 ; abs(p1-q1)/2
-
- movq mm6, mm5 ; p0
- movq mm3, [rsi] ; q0
- psubusb mm5, mm3 ; p0-=q0
- psubusb mm3, mm6 ; q0-=p0
- por mm5, mm3 ; abs(p0 - q0)
- paddusb mm5, mm5 ; abs(p0-q0)*2
- paddusb mm5, mm2 ; abs (p0 - q0) *2 + abs(p1-q1)/2
-
- mov rdx, arg(2) ;blimit ; get blimit
- movq mm7, [rdx] ; blimit
-
- psubusb mm5, mm7 ; abs (p0 - q0) *2 + abs(p1-q1)/2 > blimit
- por mm1, mm5
- pxor mm5, mm5
- pcmpeqb mm1, mm5 ; mask mm1
-
- ; calculate high edge variance
- mov rdx, arg(4) ;thresh ; get thresh
- movq mm7, [rdx] ;
- movq mm4, t0 ; get abs (q1 - q0)
- psubusb mm4, mm7
- movq mm3, t1 ; get abs (p1 - p0)
- psubusb mm3, mm7
- paddb mm4, mm3 ; abs(q1 - q0) > thresh || abs(p1 - p0) > thresh
-
- pcmpeqb mm4, mm5
-
- pcmpeqb mm5, mm5
- pxor mm4, mm5
-
-
- ; start work on filters
- movq mm2, [rsi+2*rax] ; p1
- movq mm7, [rdi] ; q1
- pxor mm2, [GLOBAL(t80)] ; p1 offset to convert to signed values
- pxor mm7, [GLOBAL(t80)] ; q1 offset to convert to signed values
- psubsb mm2, mm7 ; p1 - q1
- pand mm2, mm4 ; high var mask (hvm)(p1 - q1)
- pxor mm6, [GLOBAL(t80)] ; offset to convert to signed values
- pxor mm0, [GLOBAL(t80)] ; offset to convert to signed values
- movq mm3, mm0 ; q0
- psubsb mm0, mm6 ; q0 - p0
- paddsb mm2, mm0 ; 1 * (q0 - p0) + hvm(p1 - q1)
- paddsb mm2, mm0 ; 2 * (q0 - p0) + hvm(p1 - q1)
- paddsb mm2, mm0 ; 3 * (q0 - p0) + hvm(p1 - q1)
- pand mm1, mm2 ; mask filter values we don't care about
- movq mm2, mm1
- paddsb mm1, [GLOBAL(t4)] ; 3* (q0 - p0) + hvm(p1 - q1) + 4
- paddsb mm2, [GLOBAL(t3)] ; 3* (q0 - p0) + hvm(p1 - q1) + 3
-
- pxor mm0, mm0 ;
- pxor mm5, mm5
- punpcklbw mm0, mm2 ;
- punpckhbw mm5, mm2 ;
- psraw mm0, 11 ;
- psraw mm5, 11
- packsswb mm0, mm5
- movq mm2, mm0 ; (3* (q0 - p0) + hvm(p1 - q1) + 3) >> 3;
-
- pxor mm0, mm0 ; 0
- movq mm5, mm1 ; abcdefgh
- punpcklbw mm0, mm1 ; e0f0g0h0
- psraw mm0, 11 ; sign extended shift right by 3
- pxor mm1, mm1 ; 0
- punpckhbw mm1, mm5 ; a0b0c0d0
- psraw mm1, 11 ; sign extended shift right by 3
- movq mm5, mm0 ; save results
-
- packsswb mm0, mm1 ; (3* (q0 - p0) + hvm(p1 - q1) + 4) >>3
- paddsw mm5, [GLOBAL(ones)]
- paddsw mm1, [GLOBAL(ones)]
- psraw mm5, 1 ; partial shifted one more time for 2nd tap
- psraw mm1, 1 ; partial shifted one more time for 2nd tap
- packsswb mm5, mm1 ; (3* (q0 - p0) + hvm(p1 - q1) + 4) >>4
- pandn mm4, mm5 ; high edge variance additive
-
- paddsb mm6, mm2 ; p0+= p0 add
- pxor mm6, [GLOBAL(t80)] ; unoffset
- movq [rsi+rax], mm6 ; write back
-
- movq mm6, [rsi+2*rax] ; p1
- pxor mm6, [GLOBAL(t80)] ; reoffset
- paddsb mm6, mm4 ; p1+= p1 add
- pxor mm6, [GLOBAL(t80)] ; unoffset
- movq [rsi+2*rax], mm6 ; write back
-
- psubsb mm3, mm0 ; q0-= q0 add
- pxor mm3, [GLOBAL(t80)] ; unoffset
- movq [rsi], mm3 ; write back
-
- psubsb mm7, mm4 ; q1-= q1 add
- pxor mm7, [GLOBAL(t80)] ; unoffset
- movq [rdi], mm7 ; write back
-
- add rsi,8
- neg rax
- dec rcx
- jnz .next8_h
-
- add rsp, 32
- pop rsp
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_GOT
- UNSHADOW_ARGS
- pop rbp
- ret
-
-
-;void vp9_loop_filter_vertical_edge_mmx
-;(
-; unsigned char *src_ptr,
-; int src_pixel_step,
-; const char *blimit,
-; const char *limit,
-; const char *thresh,
-; int count
-;)
-global sym(vp9_loop_filter_vertical_edge_mmx)
-sym(vp9_loop_filter_vertical_edge_mmx):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 6
- GET_GOT rbx
- push rsi
- push rdi
- ; end prolog
-
- ALIGN_STACK 16, rax
- sub rsp, 64 ; reserve 64 bytes
- %define t0 [rsp + 0] ;__declspec(align(16)) char t0[8];
- %define t1 [rsp + 16] ;__declspec(align(16)) char t1[8];
- %define srct [rsp + 32] ;__declspec(align(16)) char srct[32];
-
- mov rsi, arg(0) ;src_ptr
- movsxd rax, dword ptr arg(1) ;src_pixel_step ; destination pitch?
-
- lea rsi, [rsi + rax*4 - 4]
-
- movsxd rcx, dword ptr arg(5) ;count
-.next8_v:
- mov rdi, rsi ; rdi points to row +1 for indirect addressing
- add rdi, rax
-
-
- ;transpose
- movq mm6, [rsi+2*rax] ; 67 66 65 64 63 62 61 60
- movq mm7, mm6 ; 77 76 75 74 73 72 71 70
-
- punpckhbw mm7, [rdi+2*rax] ; 77 67 76 66 75 65 74 64
- punpcklbw mm6, [rdi+2*rax] ; 73 63 72 62 71 61 70 60
-
- movq mm4, [rsi] ; 47 46 45 44 43 42 41 40
- movq mm5, mm4 ; 47 46 45 44 43 42 41 40
-
- punpckhbw mm5, [rsi+rax] ; 57 47 56 46 55 45 54 44
- punpcklbw mm4, [rsi+rax] ; 53 43 52 42 51 41 50 40
-
- movq mm3, mm5 ; 57 47 56 46 55 45 54 44
- punpckhwd mm5, mm7 ; 77 67 57 47 76 66 56 46
-
- punpcklwd mm3, mm7 ; 75 65 55 45 74 64 54 44
- movq mm2, mm4 ; 53 43 52 42 51 41 50 40
-
- punpckhwd mm4, mm6 ; 73 63 53 43 72 62 52 42
- punpcklwd mm2, mm6 ; 71 61 51 41 70 60 50 40
-
- neg rax
- movq mm6, [rsi+rax*2] ; 27 26 25 24 23 22 21 20
-
- movq mm1, mm6 ; 27 26 25 24 23 22 21 20
-        punpckhbw   mm6,        [rsi+rax]       ; 37 27 36 26 35 25 34 24
-
- punpcklbw mm1, [rsi+rax] ; 33 23 32 22 31 21 30 20
- movq mm7, [rsi+rax*4]; ; 07 06 05 04 03 02 01 00
-
- punpckhbw mm7, [rdi+rax*4] ; 17 07 16 06 15 05 14 04
- movq mm0, mm7 ; 17 07 16 06 15 05 14 04
-
- punpckhwd mm7, mm6 ; 37 27 17 07 36 26 16 06
- punpcklwd mm0, mm6 ; 35 25 15 05 34 24 14 04
-
- movq mm6, mm7 ; 37 27 17 07 36 26 16 06
- punpckhdq mm7, mm5 ; 77 67 57 47 37 27 17 07 = q3
-
- punpckldq mm6, mm5 ; 76 66 56 46 36 26 16 06 = q2
-
- movq mm5, mm6 ; 76 66 56 46 36 26 16 06
- psubusb mm5, mm7 ; q2-q3
-
- psubusb mm7, mm6 ; q3-q2
- por mm7, mm5; ; mm7=abs (q3-q2)
-
- movq mm5, mm0 ; 35 25 15 05 34 24 14 04
- punpckhdq mm5, mm3 ; 75 65 55 45 35 25 15 05 = q1
-
-        punpckldq   mm0,        mm3             ; 74 64 54 44 34 24 14 04 = q0
- movq mm3, mm5 ; 75 65 55 45 35 25 15 05 = q1
-
- psubusb mm3, mm6 ; q1-q2
- psubusb mm6, mm5 ; q2-q1
-
- por mm6, mm3 ; mm6=abs(q2-q1)
- lea rdx, srct
-
- movq [rdx+24], mm5 ; save q1
- movq [rdx+16], mm0 ; save q0
-
- movq mm3, [rsi+rax*4] ; 07 06 05 04 03 02 01 00
- punpcklbw mm3, [rdi+rax*4] ; 13 03 12 02 11 01 10 00
-
- movq mm0, mm3 ; 13 03 12 02 11 01 10 00
- punpcklwd mm0, mm1 ; 31 21 11 01 30 20 10 00
-
- punpckhwd mm3, mm1 ; 33 23 13 03 32 22 12 02
- movq mm1, mm0 ; 31 21 11 01 30 20 10 00
-
- punpckldq mm0, mm2 ; 70 60 50 40 30 20 10 00 =p3
- punpckhdq mm1, mm2 ; 71 61 51 41 31 21 11 01 =p2
-
- movq mm2, mm1 ; 71 61 51 41 31 21 11 01 =p2
- psubusb mm2, mm0 ; p2-p3
-
- psubusb mm0, mm1 ; p3-p2
- por mm0, mm2 ; mm0=abs(p3-p2)
-
- movq mm2, mm3 ; 33 23 13 03 32 22 12 02
- punpckldq mm2, mm4 ; 72 62 52 42 32 22 12 02 = p1
-
- punpckhdq mm3, mm4 ; 73 63 53 43 33 23 13 03 = p0
- movq [rdx+8], mm3 ; save p0
-
- movq [rdx], mm2 ; save p1
- movq mm5, mm2 ; mm5 = p1
-
- psubusb mm2, mm1 ; p1-p2
- psubusb mm1, mm5 ; p2-p1
-
- por mm1, mm2 ; mm1=abs(p2-p1)
- mov rdx, arg(3) ;limit
-
- movq mm4, [rdx] ; mm4 = limit
- psubusb mm7, mm4
-
- psubusb mm0, mm4
- psubusb mm1, mm4
-
- psubusb mm6, mm4
- por mm7, mm6
-
- por mm0, mm1
- por mm0, mm7 ; abs(q3-q2) > limit || abs(p3-p2) > limit ||abs(p2-p1) > limit || abs(q2-q1) > limit
-
- movq mm1, mm5 ; p1
-
- movq mm7, mm3 ; mm3=mm7=p0
- psubusb mm7, mm5 ; p0 - p1
-
- psubusb mm5, mm3 ; p1 - p0
- por mm5, mm7 ; abs(p1-p0)
-
- movq t0, mm5 ; save abs(p1-p0)
- lea rdx, srct
-
- psubusb mm5, mm4
- por mm0, mm5 ; mm0=mask
-
- movq mm5, [rdx+16] ; mm5=q0
- movq mm7, [rdx+24] ; mm7=q1
-
- movq mm6, mm5 ; mm6=q0
- movq mm2, mm7 ; q1
- psubusb mm5, mm7 ; q0-q1
-
- psubusb mm7, mm6 ; q1-q0
- por mm7, mm5 ; abs(q1-q0)
-
- movq t1, mm7 ; save abs(q1-q0)
- psubusb mm7, mm4
-
- por mm0, mm7 ; mask
-
- movq mm5, mm2 ; q1
- psubusb mm5, mm1 ; q1-=p1
- psubusb mm1, mm2 ; p1-=q1
- por mm5, mm1 ; abs(p1-q1)
- pand mm5, [GLOBAL(tfe)] ; set lsb of each byte to zero
- psrlw mm5, 1 ; abs(p1-q1)/2
-
- mov rdx, arg(2) ;blimit ;
-
- movq mm4, [rdx] ;blimit
- movq mm1, mm3 ; mm1=mm3=p0
-
- movq mm7, mm6 ; mm7=mm6=q0
- psubusb mm1, mm7 ; p0-q0
-
- psubusb mm7, mm3 ; q0-p0
- por mm1, mm7 ; abs(q0-p0)
- paddusb mm1, mm1 ; abs(q0-p0)*2
- paddusb mm1, mm5 ; abs (p0 - q0) *2 + abs(p1-q1)/2
-
- psubusb mm1, mm4 ; abs (p0 - q0) *2 + abs(p1-q1)/2 > blimit
- por mm1, mm0; ; mask
-
- pxor mm0, mm0
- pcmpeqb mm1, mm0
-
- ; calculate high edge variance
- mov rdx, arg(4) ;thresh ; get thresh
- movq mm7, [rdx]
- ;
- movq mm4, t0 ; get abs (q1 - q0)
- psubusb mm4, mm7
-
- movq mm3, t1 ; get abs (p1 - p0)
- psubusb mm3, mm7
-
- por mm4, mm3 ; abs(q1 - q0) > thresh || abs(p1 - p0) > thresh
- pcmpeqb mm4, mm0
-
- pcmpeqb mm0, mm0
- pxor mm4, mm0
-
-
-
- ; start work on filters
- lea rdx, srct
-
- movq mm2, [rdx] ; p1
- movq mm7, [rdx+24] ; q1
-
- movq mm6, [rdx+8] ; p0
- movq mm0, [rdx+16] ; q0
-
- pxor mm2, [GLOBAL(t80)] ; p1 offset to convert to signed values
- pxor mm7, [GLOBAL(t80)] ; q1 offset to convert to signed values
-
- psubsb mm2, mm7 ; p1 - q1
- pand mm2, mm4 ; high var mask (hvm)(p1 - q1)
-
- pxor mm6, [GLOBAL(t80)] ; offset to convert to signed values
- pxor mm0, [GLOBAL(t80)] ; offset to convert to signed values
-
- movq mm3, mm0 ; q0
- psubsb mm0, mm6 ; q0 - p0
-
- paddsb mm2, mm0 ; 1 * (q0 - p0) + hvm(p1 - q1)
- paddsb mm2, mm0 ; 2 * (q0 - p0) + hvm(p1 - q1)
-
- paddsb mm2, mm0 ; 3 * (q0 - p0) + hvm(p1 - q1)
- pand mm1, mm2 ; mask filter values we don't care about
-
- movq mm2, mm1
- paddsb mm1, [GLOBAL(t4)] ; 3* (q0 - p0) + hvm(p1 - q1) + 4
-
- paddsb mm2, [GLOBAL(t3)] ; 3* (q0 - p0) + hvm(p1 - q1) + 3
- pxor mm0, mm0 ;
-
- pxor mm5, mm5
- punpcklbw mm0, mm2 ;
-
- punpckhbw mm5, mm2 ;
- psraw mm0, 11 ;
-
- psraw mm5, 11
- packsswb mm0, mm5
-
- movq mm2, mm0 ; (3* (q0 - p0) + hvm(p1 - q1) + 3) >> 3;
-
- pxor mm0, mm0 ; 0
- movq mm5, mm1 ; abcdefgh
-
- punpcklbw mm0, mm1 ; e0f0g0h0
- psraw mm0, 11 ; sign extended shift right by 3
-
- pxor mm1, mm1 ; 0
- punpckhbw mm1, mm5 ; a0b0c0d0
-
- psraw mm1, 11 ; sign extended shift right by 3
- movq mm5, mm0 ; save results
-
- packsswb mm0, mm1 ; (3* (q0 - p0) + hvm(p1 - q1) + 4) >>3
- paddsw mm5, [GLOBAL(ones)]
-
- paddsw mm1, [GLOBAL(ones)]
- psraw mm5, 1 ; partial shifted one more time for 2nd tap
-
- psraw mm1, 1 ; partial shifted one more time for 2nd tap
- packsswb mm5, mm1 ; (3* (q0 - p0) + hvm(p1 - q1) + 4) >>4
-
- pandn mm4, mm5 ; high edge variance additive
-
- paddsb mm6, mm2 ; p0+= p0 add
- pxor mm6, [GLOBAL(t80)] ; unoffset
-
- ; mm6=p0 ;
- movq mm1, [rdx] ; p1
- pxor mm1, [GLOBAL(t80)] ; reoffset
-
- paddsb mm1, mm4 ; p1+= p1 add
- pxor mm1, [GLOBAL(t80)] ; unoffset
- ; mm6 = p0 mm1 = p1
-
- psubsb mm3, mm0 ; q0-= q0 add
- pxor mm3, [GLOBAL(t80)] ; unoffset
-
- ; mm3 = q0
- psubsb mm7, mm4 ; q1-= q1 add
- pxor mm7, [GLOBAL(t80)] ; unoffset
- ; mm7 = q1
-
-        ; transpose and write back
- ; mm1 = 72 62 52 42 32 22 12 02
- ; mm6 = 73 63 53 43 33 23 13 03
- ; mm3 = 74 64 54 44 34 24 14 04
- ; mm7 = 75 65 55 45 35 25 15 05
-
- movq mm2, mm1 ; 72 62 52 42 32 22 12 02
- punpcklbw mm2, mm6 ; 33 32 23 22 13 12 03 02
-
- movq mm4, mm3 ; 74 64 54 44 34 24 14 04
- punpckhbw mm1, mm6 ; 73 72 63 62 53 52 43 42
-
- punpcklbw mm4, mm7 ; 35 34 25 24 15 14 05 04
- punpckhbw mm3, mm7 ; 75 74 65 64 55 54 45 44
-
- movq mm6, mm2 ; 33 32 23 22 13 12 03 02
- punpcklwd mm2, mm4 ; 15 14 13 12 05 04 03 02
-
- punpckhwd mm6, mm4 ; 35 34 33 32 25 24 23 22
- movq mm5, mm1 ; 73 72 63 62 53 52 43 42
-
- punpcklwd mm1, mm3 ; 55 54 53 52 45 44 43 42
- punpckhwd mm5, mm3 ; 75 74 73 72 65 64 63 62
-
-
- ; mm2 = 15 14 13 12 05 04 03 02
- ; mm6 = 35 34 33 32 25 24 23 22
- ; mm5 = 55 54 53 52 45 44 43 42
- ; mm1 = 75 74 73 72 65 64 63 62
-
-
-
- movd [rsi+rax*4+2], mm2
- psrlq mm2, 32
-
- movd [rdi+rax*4+2], mm2
- movd [rsi+rax*2+2], mm6
-
- psrlq mm6, 32
- movd [rsi+rax+2],mm6
-
- movd [rsi+2], mm1
- psrlq mm1, 32
-
- movd [rdi+2], mm1
- neg rax
-
- movd [rdi+rax+2],mm5
- psrlq mm5, 32
-
- movd [rdi+rax*2+2], mm5
-
- lea rsi, [rsi+rax*8]
- dec rcx
- jnz .next8_v
-
- add rsp, 64
- pop rsp
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_GOT
- UNSHADOW_ARGS
- pop rbp
- ret
-
-
-;void vp9_loop_filter_simple_horizontal_edge_mmx
-;(
-; unsigned char *src_ptr,
-; int src_pixel_step,
-; const char *blimit
-;)
-global sym(vp9_loop_filter_simple_horizontal_edge_mmx)
-sym(vp9_loop_filter_simple_horizontal_edge_mmx):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 3
- GET_GOT rbx
- push rsi
- push rdi
- ; end prolog
-
- mov rsi, arg(0) ;src_ptr
- movsxd rax, dword ptr arg(1) ;src_pixel_step ; destination pitch?
-
- mov rcx, 2 ; count
-.nexts8_h:
- mov rdx, arg(2) ;blimit ; get blimit
- movq mm3, [rdx] ;
-
- mov rdi, rsi ; rdi points to row +1 for indirect addressing
- add rdi, rax
- neg rax
-
- ; calculate mask
- movq mm1, [rsi+2*rax] ; p1
- movq mm0, [rdi] ; q1
- movq mm2, mm1
- movq mm7, mm0
- movq mm4, mm0
- psubusb mm0, mm1 ; q1-=p1
- psubusb mm1, mm4 ; p1-=q1
- por mm1, mm0 ; abs(p1-q1)
- pand mm1, [GLOBAL(tfe)] ; set lsb of each byte to zero
- psrlw mm1, 1 ; abs(p1-q1)/2
-
- movq mm5, [rsi+rax] ; p0
- movq mm4, [rsi] ; q0
- movq mm0, mm4 ; q0
- movq mm6, mm5 ; p0
- psubusb mm5, mm4 ; p0-=q0
- psubusb mm4, mm6 ; q0-=p0
- por mm5, mm4 ; abs(p0 - q0)
- paddusb mm5, mm5 ; abs(p0-q0)*2
- paddusb mm5, mm1 ; abs (p0 - q0) *2 + abs(p1-q1)/2
-
- psubusb mm5, mm3 ; abs(p0 - q0) *2 + abs(p1-q1)/2 > blimit
- pxor mm3, mm3
- pcmpeqb mm5, mm3
-
- ; start work on filters
- pxor mm2, [GLOBAL(t80)] ; p1 offset to convert to signed values
- pxor mm7, [GLOBAL(t80)] ; q1 offset to convert to signed values
- psubsb mm2, mm7 ; p1 - q1
-
- pxor mm6, [GLOBAL(t80)] ; offset to convert to signed values
- pxor mm0, [GLOBAL(t80)] ; offset to convert to signed values
- movq mm3, mm0 ; q0
- psubsb mm0, mm6 ; q0 - p0
- paddsb mm2, mm0 ; p1 - q1 + 1 * (q0 - p0)
- paddsb mm2, mm0 ; p1 - q1 + 2 * (q0 - p0)
- paddsb mm2, mm0 ; p1 - q1 + 3 * (q0 - p0)
- pand mm5, mm2 ; mask filter values we don't care about
-
- ; do + 4 side
- paddsb mm5, [GLOBAL(t4)] ; 3* (q0 - p0) + (p1 - q1) + 4
-
- movq mm0, mm5 ; get a copy of filters
- psllw mm0, 8 ; shift left 8
-        psraw       mm0, 3                  ; arithmetic shift right 3
- psrlw mm0, 8
- movq mm1, mm5 ; get a copy of filters
- psraw mm1, 11 ; arithmetic shift right 11
- psllw mm1, 8 ; shift left 8 to put it back
-
- por mm0, mm1 ; put the two together to get result
-
- psubsb mm3, mm0 ; q0-= q0 add
- pxor mm3, [GLOBAL(t80)] ; unoffset
- movq [rsi], mm3 ; write back
-
-
- ; now do +3 side
- psubsb mm5, [GLOBAL(t1s)] ; +3 instead of +4
-
- movq mm0, mm5 ; get a copy of filters
- psllw mm0, 8 ; shift left 8
-        psraw       mm0, 3                  ; arithmetic shift right 3
- psrlw mm0, 8
- psraw mm5, 11 ; arithmetic shift right 11
- psllw mm5, 8 ; shift left 8 to put it back
- por mm0, mm5 ; put the two together to get result
-
-
- paddsb mm6, mm0 ; p0+= p0 add
- pxor mm6, [GLOBAL(t80)] ; unoffset
- movq [rsi+rax], mm6 ; write back
-
- add rsi,8
- neg rax
- dec rcx
- jnz .nexts8_h
-
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_GOT
- UNSHADOW_ARGS
- pop rbp
- ret
-
-
-;void vp9_loop_filter_simple_vertical_edge_mmx
-;(
-; unsigned char *src_ptr,
-; int src_pixel_step,
-; const char *blimit
-;)
-global sym(vp9_loop_filter_simple_vertical_edge_mmx)
-sym(vp9_loop_filter_simple_vertical_edge_mmx):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 3
- GET_GOT rbx
- push rsi
- push rdi
- ; end prolog
-
- ALIGN_STACK 16, rax
- sub rsp, 32 ; reserve 32 bytes
- %define t0 [rsp + 0] ;__declspec(align(16)) char t0[8];
- %define t1 [rsp + 16] ;__declspec(align(16)) char t1[8];
-
- mov rsi, arg(0) ;src_ptr
- movsxd rax, dword ptr arg(1) ;src_pixel_step ; destination pitch?
-
-        lea         rsi,        [rsi + rax*4 - 2]
- mov rcx, 2 ; count
-.nexts8_v:
-
- lea rdi, [rsi + rax];
- movd mm0, [rdi + rax * 2] ; xx xx xx xx 73 72 71 70
-
- movd mm6, [rsi + rax * 2] ; xx xx xx xx 63 62 61 60
- punpcklbw mm6, mm0 ; 73 63 72 62 71 61 70 60
-
- movd mm0, [rsi + rax] ; xx xx xx xx 53 52 51 50
- movd mm4, [rsi] ; xx xx xx xx 43 42 41 40
-
- punpcklbw mm4, mm0 ; 53 43 52 42 51 41 50 40
- movq mm5, mm4 ; 53 43 52 42 51 41 50 40
-
- punpcklwd mm4, mm6 ; 71 61 51 41 70 60 50 40
- punpckhwd mm5, mm6 ; 73 63 53 43 72 62 52 42
-
- neg rax
-
- movd mm7, [rsi + rax] ; xx xx xx xx 33 32 31 30
- movd mm6, [rsi + rax * 2] ; xx xx xx xx 23 22 21 20
-
- punpcklbw mm6, mm7 ; 33 23 32 22 31 21 30 20
- movd mm1, [rdi + rax * 4] ; xx xx xx xx 13 12 11 10
-
- movd mm0, [rsi + rax * 4] ; xx xx xx xx 03 02 01 00
- punpcklbw mm0, mm1 ; 13 03 12 02 11 01 10 00
-
- movq mm2, mm0 ; 13 03 12 02 11 01 10 00
- punpcklwd mm0, mm6 ; 31 21 11 01 30 20 10 00
-
- punpckhwd mm2, mm6 ; 33 23 13 03 32 22 12 02
- movq mm1, mm0 ; 13 03 12 02 11 01 10 00
-
- punpckldq mm0, mm4 ; 70 60 50 40 30 20 10 00 = p1
- movq mm3, mm2 ; 33 23 13 03 32 22 12 02
-
- punpckhdq mm1, mm4 ; 71 61 51 41 31 21 11 01 = p0
- punpckldq mm2, mm5 ; 72 62 52 42 32 22 12 02 = q0
-
- punpckhdq mm3, mm5 ; 73 63 53 43 33 23 13 03 = q1
-
-
- ; calculate mask
- movq mm6, mm0 ; p1
- movq mm7, mm3 ; q1
- psubusb mm7, mm6 ; q1-=p1
- psubusb mm6, mm3 ; p1-=q1
- por mm6, mm7 ; abs(p1-q1)
- pand mm6, [GLOBAL(tfe)] ; set lsb of each byte to zero
- psrlw mm6, 1 ; abs(p1-q1)/2
-
- movq mm5, mm1 ; p0
- movq mm4, mm2 ; q0
-
- psubusb mm5, mm2 ; p0-=q0
- psubusb mm4, mm1 ; q0-=p0
-
- por mm5, mm4 ; abs(p0 - q0)
- paddusb mm5, mm5 ; abs(p0-q0)*2
- paddusb mm5, mm6 ; abs (p0 - q0) *2 + abs(p1-q1)/2
-
- mov rdx, arg(2) ;blimit ; get blimit
- movq mm7, [rdx]
-
- psubusb mm5, mm7 ; abs(p0 - q0) *2 + abs(p1-q1)/2 > blimit
- pxor mm7, mm7
- pcmpeqb mm5, mm7 ; mm5 = mask
-
- ; start work on filters
- movq t0, mm0
- movq t1, mm3
-
- pxor mm0, [GLOBAL(t80)] ; p1 offset to convert to signed values
- pxor mm3, [GLOBAL(t80)] ; q1 offset to convert to signed values
-
- psubsb mm0, mm3 ; p1 - q1
- movq mm6, mm1 ; p0
-
- movq mm7, mm2 ; q0
- pxor mm6, [GLOBAL(t80)] ; offset to convert to signed values
-
- pxor mm7, [GLOBAL(t80)] ; offset to convert to signed values
-        movq        mm3, mm7                ; q0 offset to signed values
-
- psubsb mm7, mm6 ; q0 - p0
- paddsb mm0, mm7 ; p1 - q1 + 1 * (q0 - p0)
-
- paddsb mm0, mm7 ; p1 - q1 + 2 * (q0 - p0)
- paddsb mm0, mm7 ; p1 - q1 + 3 * (q0 - p0)
-
- pand mm5, mm0 ; mask filter values we don't care about
-
- paddsb mm5, [GLOBAL(t4)] ; 3* (q0 - p0) + (p1 - q1) + 4
-
- movq mm0, mm5 ; get a copy of filters
- psllw mm0, 8 ; shift left 8
-        psraw       mm0, 3                  ; arithmetic shift right 3
- psrlw mm0, 8
-
- movq mm7, mm5 ; get a copy of filters
- psraw mm7, 11 ; arithmetic shift right 11
- psllw mm7, 8 ; shift left 8 to put it back
-
- por mm0, mm7 ; put the two together to get result
-
- psubsb mm3, mm0 ; q0-= q0sz add
- pxor mm3, [GLOBAL(t80)] ; unoffset
-
- ; now do +3 side
- psubsb mm5, [GLOBAL(t1s)] ; +3 instead of +4
-
- movq mm0, mm5 ; get a copy of filters
- psllw mm0, 8 ; shift left 8
-        psraw       mm0, 3                  ; arithmetic shift right 3
- psrlw mm0, 8
-
- psraw mm5, 11 ; arithmetic shift right 11
- psllw mm5, 8 ; shift left 8 to put it back
- por mm0, mm5 ; put the two together to get result
-
- paddsb mm6, mm0 ; p0+= p0 add
- pxor mm6, [GLOBAL(t80)] ; unoffset
-
-
- movq mm0, t0
- movq mm4, t1
-
- ; mm0 = 70 60 50 40 30 20 10 00
- ; mm6 = 71 61 51 41 31 21 11 01
- ; mm3 = 72 62 52 42 32 22 12 02
- ; mm4 = 73 63 53 43 33 23 13 03
- ; transpose back to write out
-
- movq mm1, mm0 ;
- punpcklbw mm0, mm6 ; 31 30 21 20 11 10 01 00
-
- punpckhbw mm1, mm6 ; 71 70 61 60 51 50 41 40
- movq mm2, mm3 ;
-
- punpcklbw mm2, mm4 ; 33 32 23 22 13 12 03 02
- movq mm5, mm1 ; 71 70 61 60 51 50 41 40
-
- punpckhbw mm3, mm4 ; 73 72 63 62 53 52 43 42
- movq mm6, mm0 ; 31 30 21 20 11 10 01 00
-
- punpcklwd mm0, mm2 ; 13 12 11 10 03 02 01 00
- punpckhwd mm6, mm2 ; 33 32 31 30 23 22 21 20
-
- movd [rsi+rax*4], mm0 ; write 03 02 01 00
- punpcklwd mm1, mm3 ; 53 52 51 50 43 42 41 40
-
- psrlq mm0, 32 ; xx xx xx xx 13 12 11 10
- punpckhwd mm5, mm3 ; 73 72 71 70 63 62 61 60
-
- movd [rdi+rax*4], mm0 ; write 13 12 11 10
- movd [rsi+rax*2], mm6 ; write 23 22 21 20
-
- psrlq mm6, 32 ; 33 32 31 30
- movd [rsi], mm1 ; write 43 42 41 40
-
- movd [rsi + rax], mm6 ; write 33 32 31 30
- neg rax
-
- movd [rsi + rax*2], mm5 ; write 63 62 61 60
- psrlq mm1, 32 ; 53 52 51 50
-
- movd [rdi], mm1 ; write out 53 52 51 50
- psrlq mm5, 32 ; 73 72 71 70
-
- movd [rdi + rax*2], mm5 ; write 73 72 71 70
-
- lea rsi, [rsi+rax*8] ; next 8
-
- dec rcx
- jnz .nexts8_v
-
- add rsp, 32
- pop rsp
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_GOT
- UNSHADOW_ARGS
- pop rbp
- ret
-
-
-
-;void fast_loop_filter_vertical_edges_mmx(unsigned char *y_ptr,
-; int y_stride,
-; loop_filter_info *lfi)
-;{
-;
-;
-; vp9_loop_filter_simple_vertical_edge_mmx(y_ptr+4, y_stride, lfi->flim,lfi->lim,lfi->thr,2);
-; vp9_loop_filter_simple_vertical_edge_mmx(y_ptr+8, y_stride, lfi->flim,lfi->lim,lfi->thr,2);
-; vp9_loop_filter_simple_vertical_edge_mmx(y_ptr+12, y_stride, lfi->flim,lfi->lim,lfi->thr,2);
-;}
-
-SECTION_RODATA
-align 16
-tfe:
- times 8 db 0xfe
-align 16
-t80:
- times 8 db 0x80
-align 16
-t1s:
- times 8 db 0x01
-align 16
-t3:
- times 8 db 0x03
-align 16
-t4:
- times 8 db 0x04
-align 16
-ones:
- times 4 dw 0x0001
-align 16
-s27:
- times 4 dw 0x1b00
-align 16
-s18:
- times 4 dw 0x1200
-align 16
-s9:
- times 4 dw 0x0900
-align 16
-s63:
- times 4 dw 0x003f
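
For reference while reading the SIMD above: a minimal scalar sketch of the per-pixel
math these routines vectorize, reconstructed from the instruction comments (the
mask/hev conditions and the 3*(q0-p0) filter). Helper names are illustrative, and the
per-step signed saturation done by psubsb/paddsb is collapsed into single clamps here.

    /* mask: abs(p3-p2), abs(p2-p1), abs(p1-p0), abs(q1-q0), abs(q2-q1) and
     *       abs(q3-q2) all <= limit, and abs(p0-q0)*2 + abs(p1-q1)/2 <= blimit
     * hev:  abs(p1-p0) > thresh || abs(q1-q0) > thresh */
    static signed char sclamp(int v) {            /* saturate to signed 8 bits */
      return (signed char)(v < -128 ? -128 : v > 127 ? 127 : v);
    }

    static void filter4_px(unsigned char *p1, unsigned char *p0,
                           unsigned char *q0, unsigned char *q1,
                           int mask, int hev) {   /* mask, hev: 0 or -1 */
      /* xor with 0x80 is the pxor [GLOBAL(t80)] "convert to signed" step */
      signed char ps1 = *p1 ^ 0x80, ps0 = *p0 ^ 0x80;
      signed char qs0 = *q0 ^ 0x80, qs1 = *q1 ^ 0x80;
      signed char f, f1, f2;

      f  = sclamp((hev ? ps1 - qs1 : 0) + 3 * (qs0 - ps0)) & mask;
      f1 = sclamp(f + 4) >> 3;                    /* applied to q0 */
      f2 = sclamp(f + 3) >> 3;                    /* applied to p0 */
      *q0 = sclamp(qs0 - f1) ^ 0x80;              /* "unoffset" */
      *p0 = sclamp(ps0 + f2) ^ 0x80;

      f = (f1 + 1) >> 1;                          /* the (...+4)>>4 outer-tap add */
      if (!hev) {                                 /* only where edge variance is low */
        *q1 = sclamp(qs1 - f) ^ 0x80;
        *p1 = sclamp(ps1 + f) ^ 0x80;
      }
    }
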
--- a/vp8/common/x86/loopfilter_sse2.asm
+++ /dev/null
@@ -1,1238 +1,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
-%include "vpx_ports/x86_abi_support.asm"
-
-; Use of pmaxub instead of psubusb to compute filter mask was seen
-; in ffvp8
-
-%macro LFH_FILTER_AND_HEV_MASK 1
-%if %1
- movdqa xmm2, [rdi+2*rax] ; q3
- movdqa xmm1, [rsi+2*rax] ; q2
- movdqa xmm4, [rsi+rax] ; q1
- movdqa xmm5, [rsi] ; q0
- neg rax ; negate pitch to deal with above border
-%else
- movlps xmm2, [rsi + rcx*2] ; q3
- movlps xmm1, [rsi + rcx] ; q2
- movlps xmm4, [rsi] ; q1
- movlps xmm5, [rsi + rax] ; q0
-
- movhps xmm2, [rdi + rcx*2]
- movhps xmm1, [rdi + rcx]
- movhps xmm4, [rdi]
- movhps xmm5, [rdi + rax]
-
- lea rsi, [rsi + rax*4]
- lea rdi, [rdi + rax*4]
-
- movdqa XMMWORD PTR [rsp], xmm1 ; store q2
- movdqa XMMWORD PTR [rsp + 16], xmm4 ; store q1
-%endif
-
- movdqa xmm6, xmm1 ; q2
- movdqa xmm3, xmm4 ; q1
-
- psubusb xmm1, xmm2 ; q2-=q3
- psubusb xmm2, xmm6 ; q3-=q2
-
- psubusb xmm4, xmm6 ; q1-=q2
- psubusb xmm6, xmm3 ; q2-=q1
-
- por xmm4, xmm6 ; abs(q2-q1)
- por xmm1, xmm2 ; abs(q3-q2)
-
- movdqa xmm0, xmm5 ; q0
- pmaxub xmm1, xmm4
-
- psubusb xmm5, xmm3 ; q0-=q1
- psubusb xmm3, xmm0 ; q1-=q0
-
- por xmm5, xmm3 ; abs(q0-q1)
- movdqa t0, xmm5 ; save to t0
-
- pmaxub xmm1, xmm5
-
-%if %1
- movdqa xmm2, [rsi+4*rax] ; p3
- movdqa xmm4, [rdi+4*rax] ; p2
- movdqa xmm6, [rsi+2*rax] ; p1
-%else
- movlps xmm2, [rsi + rax] ; p3
- movlps xmm4, [rsi] ; p2
- movlps xmm6, [rsi + rcx] ; p1
-
- movhps xmm2, [rdi + rax]
- movhps xmm4, [rdi]
- movhps xmm6, [rdi + rcx]
-
- movdqa XMMWORD PTR [rsp + 32], xmm4 ; store p2
- movdqa XMMWORD PTR [rsp + 48], xmm6 ; store p1
-%endif
-
- movdqa xmm5, xmm4 ; p2
- movdqa xmm3, xmm6 ; p1
-
- psubusb xmm4, xmm2 ; p2-=p3
- psubusb xmm2, xmm5 ; p3-=p2
-
- psubusb xmm3, xmm5 ; p1-=p2
- pmaxub xmm1, xmm4 ; abs(p3 - p2)
-
- psubusb xmm5, xmm6 ; p2-=p1
- pmaxub xmm1, xmm2 ; abs(p3 - p2)
-
- pmaxub xmm1, xmm5 ; abs(p2 - p1)
- movdqa xmm2, xmm6 ; p1
-
- pmaxub xmm1, xmm3 ; abs(p2 - p1)
-%if %1
- movdqa xmm4, [rsi+rax] ; p0
- movdqa xmm3, [rdi] ; q1
-%else
- movlps xmm4, [rsi + rcx*2] ; p0
- movhps xmm4, [rdi + rcx*2]
- movdqa xmm3, q1 ; q1
-%endif
-
- movdqa xmm5, xmm4 ; p0
- psubusb xmm4, xmm6 ; p0-=p1
-
- psubusb xmm6, xmm5 ; p1-=p0
-
- por xmm6, xmm4 ; abs(p1 - p0)
- mov rdx, arg(2) ; get blimit
-
- movdqa t1, xmm6 ; save to t1
-
- movdqa xmm4, xmm3 ; q1
- pmaxub xmm1, xmm6
-
- psubusb xmm3, xmm2 ; q1-=p1
- psubusb xmm2, xmm4 ; p1-=q1
-
- psubusb xmm1, xmm7
- por xmm2, xmm3 ; abs(p1-q1)
-
- movdqa xmm7, XMMWORD PTR [rdx] ; blimit
-
- movdqa xmm3, xmm0 ; q0
- pand xmm2, [GLOBAL(tfe)] ; set lsb of each byte to zero
-
- mov rdx, arg(4) ; hev get thresh
-
- movdqa xmm6, xmm5 ; p0
- psrlw xmm2, 1 ; abs(p1-q1)/2
-
- psubusb xmm5, xmm3 ; p0-=q0
-
- psubusb xmm3, xmm6 ; q0-=p0
- por xmm5, xmm3 ; abs(p0 - q0)
-
- paddusb xmm5, xmm5 ; abs(p0-q0)*2
-
- movdqa xmm4, t0 ; hev get abs (q1 - q0)
-
- movdqa xmm3, t1 ; get abs (p1 - p0)
-
- paddusb xmm5, xmm2 ; abs (p0 - q0) *2 + abs(p1-q1)/2
-
- movdqa xmm2, XMMWORD PTR [rdx] ; hev
-
- psubusb xmm5, xmm7 ; abs (p0 - q0) *2 + abs(p1-q1)/2 > blimit
- psubusb xmm4, xmm2 ; hev
-
- psubusb xmm3, xmm2 ; hev
- por xmm1, xmm5
-
- pxor xmm7, xmm7
- paddb xmm4, xmm3 ; hev abs(q1 - q0) > thresh || abs(p1 - p0) > thresh
-
- pcmpeqb xmm4, xmm5 ; hev
- pcmpeqb xmm3, xmm3 ; hev
-
- pcmpeqb xmm1, xmm7 ; mask xmm1
- pxor xmm4, xmm3 ; hev
-%endmacro
-
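
The pmaxub idea noted at the top of this file (borrowed from ffvp8) folds all of the
per-pair 'limit' checks into a single compare: keep a running byte-wise maximum of the
absolute neighbour differences, then one saturating subtract against limit. A rough
SSE2-intrinsics sketch of that part of the mask (the blimit term is folded in the same
way afterwards); this is illustrative, not code from this file:

    #include <emmintrin.h>

    static __m128i absdiff_u8(__m128i a, __m128i b) {
      return _mm_or_si128(_mm_subs_epu8(a, b), _mm_subs_epu8(b, a));
    }

    /* 0xff in every lane whose neighbour differences are all <= limit */
    static __m128i filter_mask_u8(__m128i p3, __m128i p2, __m128i p1, __m128i p0,
                                  __m128i q0, __m128i q1, __m128i q2, __m128i q3,
                                  __m128i limit) {
      __m128i m = absdiff_u8(q3, q2);
      m = _mm_max_epu8(m, absdiff_u8(q2, q1));
      m = _mm_max_epu8(m, absdiff_u8(q1, q0));
      m = _mm_max_epu8(m, absdiff_u8(p3, p2));
      m = _mm_max_epu8(m, absdiff_u8(p2, p1));
      m = _mm_max_epu8(m, absdiff_u8(p1, p0));
      m = _mm_subs_epu8(m, limit);                /* 0 where every diff <= limit */
      return _mm_cmpeq_epi8(m, _mm_setzero_si128());
    }
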
-%macro B_FILTER 1
-%if %1 == 0
- movdqa xmm2, p1 ; p1
- movdqa xmm7, q1 ; q1
-%elif %1 == 1
- movdqa xmm2, [rsi+2*rax] ; p1
- movdqa xmm7, [rdi] ; q1
-%elif %1 == 2
- lea rdx, srct
-
- movdqa xmm2, [rdx] ; p1
- movdqa xmm7, [rdx+48] ; q1
- movdqa xmm6, [rdx+16] ; p0
- movdqa xmm0, [rdx+32] ; q0
-%endif
-
- pxor xmm2, [GLOBAL(t80)] ; p1 offset to convert to signed values
- pxor xmm7, [GLOBAL(t80)] ; q1 offset to convert to signed values
-
- psubsb xmm2, xmm7 ; p1 - q1
- pxor xmm6, [GLOBAL(t80)] ; offset to convert to signed values
-
- pand xmm2, xmm4 ; high var mask (hvm)(p1 - q1)
- pxor xmm0, [GLOBAL(t80)] ; offset to convert to signed values
-
- movdqa xmm3, xmm0 ; q0
- psubsb xmm0, xmm6 ; q0 - p0
-
- paddsb xmm2, xmm0 ; 1 * (q0 - p0) + hvm(p1 - q1)
-
- paddsb xmm2, xmm0 ; 2 * (q0 - p0) + hvm(p1 - q1)
-
- paddsb xmm2, xmm0 ; 3 * (q0 - p0) + hvm(p1 - q1)
-
- pand xmm1, xmm2 ; mask filter values we don't care about
-
- movdqa xmm2, xmm1
-
- paddsb xmm1, [GLOBAL(t4)] ; 3* (q0 - p0) + hvm(p1 - q1) + 4
- paddsb xmm2, [GLOBAL(t3)] ; 3* (q0 - p0) + hvm(p1 - q1) + 3
-
- punpckhbw xmm5, xmm2 ; axbxcxdx
- punpcklbw xmm2, xmm2 ; exfxgxhx
-
- punpcklbw xmm0, xmm1 ; exfxgxhx
- psraw xmm5, 11 ; sign extended shift right by 3
-
- punpckhbw xmm1, xmm1 ; axbxcxdx
- psraw xmm2, 11 ; sign extended shift right by 3
-
- packsswb xmm2, xmm5 ; (3* (q0 - p0) + hvm(p1 - q1) + 3) >> 3;
- psraw xmm0, 11 ; sign extended shift right by 3
-
- psraw xmm1, 11 ; sign extended shift right by 3
- movdqa xmm5, xmm0 ; save results
-
- packsswb xmm0, xmm1 ; (3* (q0 - p0) + hvm(p1 - q1) + 4) >>3
- paddsw xmm5, [GLOBAL(ones)]
-
- paddsw xmm1, [GLOBAL(ones)]
- psraw xmm5, 1 ; partial shifted one more time for 2nd tap
-
- psraw xmm1, 1 ; partial shifted one more time for 2nd tap
-
- paddsb xmm6, xmm2 ; p0+= p0 add
- packsswb xmm5, xmm1 ; (3* (q0 - p0) + hvm(p1 - q1) + 4) >>4
-
-%if %1 == 0
- movdqa xmm1, p1 ; p1
-%elif %1 == 1
- movdqa xmm1, [rsi+2*rax] ; p1
-%elif %1 == 2
- movdqa xmm1, [rdx] ; p1
-%endif
- pandn xmm4, xmm5 ; high edge variance additive
- pxor xmm6, [GLOBAL(t80)] ; unoffset
-
- pxor xmm1, [GLOBAL(t80)] ; reoffset
- psubsb xmm3, xmm0 ; q0-= q0 add
-
- paddsb xmm1, xmm4 ; p1+= p1 add
- pxor xmm3, [GLOBAL(t80)] ; unoffset
-
- pxor xmm1, [GLOBAL(t80)] ; unoffset
- psubsb xmm7, xmm4 ; q1-= q1 add
-
- pxor xmm7, [GLOBAL(t80)] ; unoffset
-%if %1 == 0
- lea rsi, [rsi + rcx*2]
- lea rdi, [rdi + rcx*2]
- movq MMWORD PTR [rsi], xmm6 ; p0
- movhps MMWORD PTR [rdi], xmm6
- movq MMWORD PTR [rsi + rax], xmm1 ; p1
- movhps MMWORD PTR [rdi + rax], xmm1
- movq MMWORD PTR [rsi + rcx], xmm3 ; q0
- movhps MMWORD PTR [rdi + rcx], xmm3
- movq MMWORD PTR [rsi + rcx*2],xmm7 ; q1
- movhps MMWORD PTR [rdi + rcx*2],xmm7
-%elif %1 == 1
- movdqa [rsi+rax], xmm6 ; write back
- movdqa [rsi+2*rax], xmm1 ; write back
- movdqa [rsi], xmm3 ; write back
- movdqa [rdi], xmm7 ; write back
-%endif
-
-%endmacro
-
-
-;void vp9_loop_filter_horizontal_edge_sse2
-;(
-; unsigned char *src_ptr,
-; int src_pixel_step,
-; const char *blimit,
-; const char *limit,
-; const char *thresh,
-; int count
-;)
-global sym(vp9_loop_filter_horizontal_edge_sse2)
-sym(vp9_loop_filter_horizontal_edge_sse2):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 6
- SAVE_XMM 7
- GET_GOT rbx
- push rsi
- push rdi
- ; end prolog
-
- ALIGN_STACK 16, rax
- sub rsp, 32 ; reserve 32 bytes
- %define t0 [rsp + 0] ;__declspec(align(16)) char t0[16];
- %define t1 [rsp + 16] ;__declspec(align(16)) char t1[16];
-
- mov rsi, arg(0) ;src_ptr
- movsxd rax, dword ptr arg(1) ;src_pixel_step
-
- mov rdx, arg(3) ;limit
- movdqa xmm7, XMMWORD PTR [rdx]
-
- lea rdi, [rsi+rax] ; rdi points to row +1 for indirect addressing
-
- ; calculate breakout conditions and high edge variance
- LFH_FILTER_AND_HEV_MASK 1
- ; filter and write back the result
- B_FILTER 1
-
- add rsp, 32
- pop rsp
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_GOT
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
-
-
-;void vp9_loop_filter_horizontal_edge_uv_sse2
-;(
-; unsigned char *src_ptr,
-; int src_pixel_step,
-; const char *blimit,
-; const char *limit,
-; const char *thresh,
-; int count
-;)
-global sym(vp9_loop_filter_horizontal_edge_uv_sse2)
-sym(vp9_loop_filter_horizontal_edge_uv_sse2):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 6
- SAVE_XMM 7
- GET_GOT rbx
- push rsi
- push rdi
- ; end prolog
-
- ALIGN_STACK 16, rax
- sub rsp, 96 ; reserve 96 bytes
- %define q2 [rsp + 0] ;__declspec(align(16)) char q2[16];
- %define q1 [rsp + 16] ;__declspec(align(16)) char q1[16];
- %define p2 [rsp + 32] ;__declspec(align(16)) char p2[16];
- %define p1 [rsp + 48] ;__declspec(align(16)) char p1[16];
- %define t0 [rsp + 64] ;__declspec(align(16)) char t0[16];
- %define t1 [rsp + 80] ;__declspec(align(16)) char t1[16];
-
- mov rsi, arg(0) ; u
- mov rdi, arg(5) ; v
- movsxd rax, dword ptr arg(1) ; src_pixel_step
- mov rcx, rax
- neg rax ; negate pitch to deal with above border
-
- mov rdx, arg(3) ;limit
- movdqa xmm7, XMMWORD PTR [rdx]
-
- lea rsi, [rsi + rcx]
- lea rdi, [rdi + rcx]
-
- ; calculate breakout conditions and high edge variance
- LFH_FILTER_AND_HEV_MASK 0
- ; filter and write back the result
- B_FILTER 0
-
- add rsp, 96
- pop rsp
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_GOT
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
-
-
-%macro TRANSPOSE_16X8 2
- movq xmm4, QWORD PTR [rsi] ; xx xx xx xx xx xx xx xx 07 06 05 04 03 02 01 00
- movq xmm1, QWORD PTR [rdi] ; xx xx xx xx xx xx xx xx 17 16 15 14 13 12 11 10
- movq xmm0, QWORD PTR [rsi+2*rax] ; xx xx xx xx xx xx xx xx 27 26 25 24 23 22 21 20
- movq xmm7, QWORD PTR [rdi+2*rax] ; xx xx xx xx xx xx xx xx 37 36 35 34 33 32 31 30
- movq xmm5, QWORD PTR [rsi+4*rax] ; xx xx xx xx xx xx xx xx 47 46 45 44 43 42 41 40
- movq xmm2, QWORD PTR [rdi+4*rax] ; xx xx xx xx xx xx xx xx 57 56 55 54 53 52 51 50
-
- punpcklbw xmm4, xmm1 ; 17 07 16 06 15 05 14 04 13 03 12 02 11 01 10 00
-
- movq xmm1, QWORD PTR [rdi+2*rcx] ; xx xx xx xx xx xx xx xx 77 76 75 74 73 72 71 70
-
- movdqa xmm3, xmm4 ; 17 07 16 06 15 05 14 04 13 03 12 02 11 01 10 00
-        punpcklbw   xmm0,       xmm7                    ; 37 27 36 26 35 25 34 24 33 23 32 22 31 21 30 20
-
- movq xmm7, QWORD PTR [rsi+2*rcx] ; xx xx xx xx xx xx xx xx 67 66 65 64 63 62 61 60
-
- punpcklbw xmm5, xmm2 ; 57 47 56 46 55 45 54 44 53 43 52 42 51 41 50 40
-%if %1
- lea rsi, [rsi+rax*8]
-%else
- mov rsi, arg(5) ; v_ptr
-%endif
-
- movdqa xmm6, xmm5 ; 57 47 56 46 55 45 54 44 53 43 52 42 51 41 50 40
- punpcklbw xmm7, xmm1 ; 77 67 76 66 75 65 74 64 73 63 72 62 71 61 70 60
-
- punpcklwd xmm5, xmm7 ; 73 63 53 43 72 62 52 42 71 61 51 41 70 60 50 40
-
- punpckhwd xmm6, xmm7 ; 77 67 57 47 76 66 56 46 75 65 55 45 74 64 54 44
-%if %1
- lea rdi, [rdi+rax*8]
-%else
- lea rsi, [rsi - 4]
-%endif
-
- punpcklwd xmm3, xmm0 ; 33 23 13 03 32 22 12 02 31 21 11 01 30 20 10 00
-%if %1
- lea rdx, srct
-%else
- lea rdi, [rsi + rax] ; rdi points to row +1 for indirect addressing
-%endif
-
- movdqa xmm2, xmm3 ; 33 23 13 03 32 22 12 02 31 21 11 01 30 20 10 00
- punpckhwd xmm4, xmm0 ; 37 27 17 07 36 26 16 06 35 25 15 05 34 24 14 04
-
- movdqa xmm7, xmm4 ; 37 27 17 07 36 26 16 06 35 25 15 05 34 24 14 04
- punpckhdq xmm3, xmm5 ; 73 63 53 43 33 23 13 03 72 62 52 42 32 22 12 02
-
- punpckhdq xmm7, xmm6 ; 77 67 57 47 37 27 17 07 76 66 56 46 36 26 16 06
-
- punpckldq xmm4, xmm6 ; 75 65 55 45 35 25 15 05 74 64 54 44 34 24 14 04
-
- punpckldq xmm2, xmm5 ; 71 61 51 41 31 21 11 01 70 60 50 40 30 20 10 00
-
- movdqa t0, xmm2 ; save to free XMM2
- movq xmm2, QWORD PTR [rsi] ; xx xx xx xx xx xx xx xx 87 86 85 84 83 82 81 80
- movq xmm6, QWORD PTR [rdi] ; xx xx xx xx xx xx xx xx 97 96 95 94 93 92 91 90
- movq xmm0, QWORD PTR [rsi+2*rax] ; xx xx xx xx xx xx xx xx a7 a6 a5 a4 a3 a2 a1 a0
- movq xmm5, QWORD PTR [rdi+2*rax] ; xx xx xx xx xx xx xx xx b7 b6 b5 b4 b3 b2 b1 b0
- movq xmm1, QWORD PTR [rsi+4*rax] ; xx xx xx xx xx xx xx xx c7 c6 c5 c4 c3 c2 c1 c0
-
- punpcklbw xmm2, xmm6 ; 97 87 96 86 95 85 94 84 93 83 92 82 91 81 90 80
-
- movq xmm6, QWORD PTR [rdi+4*rax] ; xx xx xx xx xx xx xx xx d7 d6 d5 d4 d3 d2 d1 d0
-
- punpcklbw xmm0, xmm5 ; b7 a7 b6 a6 b5 a5 b4 a4 b3 a3 b2 a2 b1 a1 b0 a0
-
- movq xmm5, QWORD PTR [rsi+2*rcx] ; xx xx xx xx xx xx xx xx e7 e6 e5 e4 e3 e2 e1 e0
-
-        punpcklbw   xmm1,       xmm6                    ; d7 c7 d6 c6 d5 c5 d4 c4 d3 c3 d2 c2 d1 c1 d0 c0
-
- movq xmm6, QWORD PTR [rdi+2*rcx] ; xx xx xx xx xx xx xx xx f7 f6 f5 f4 f3 f2 f1 f0
-
- punpcklbw xmm5, xmm6 ; f7 e7 f6 e6 f5 e5 f4 e4 f3 e3 f2 e2 f1 e1 f0 e0
-
- movdqa xmm6, xmm1 ;
- punpckhwd xmm6, xmm5 ; f7 e7 d7 c7 f6 e6 d6 c6 f5 e5 d5 c5 f4 e4 d4 c4
-
- punpcklwd xmm1, xmm5 ; f3 e3 d3 c3 f2 e2 d2 c2 f1 e1 d1 c1 f0 e0 d0 c0
- movdqa xmm5, xmm2 ; 97 87 96 86 95 85 94 84 93 83 92 82 91 81 90 80
-
- punpcklwd xmm5, xmm0 ; b3 a3 93 83 b2 a2 92 82 b1 a1 91 81 b0 a0 90 80
-
- punpckhwd xmm2, xmm0 ; b7 a7 97 87 b6 a6 96 86 b5 a5 95 85 b4 a4 94 84
-
- movdqa xmm0, xmm5
- punpckldq xmm0, xmm1 ; f1 e1 d1 c1 b1 a1 91 81 f0 e0 d0 c0 b0 a0 90 80
-
- punpckhdq xmm5, xmm1 ; f3 e3 d3 c3 b3 a3 93 83 f2 e2 d2 c2 b2 a2 92 82
- movdqa xmm1, xmm2 ; b7 a7 97 87 b6 a6 96 86 b5 a5 95 85 b4 a4 94 84
-
- punpckldq xmm1, xmm6 ; f5 e5 d5 c5 b5 a5 95 85 f4 e4 d4 c4 b4 a4 94 84
-
- punpckhdq xmm2, xmm6 ; f7 e7 d7 c7 b7 a7 97 87 f6 e6 d6 c6 b6 a6 96 86
- movdqa xmm6, xmm7 ; 77 67 57 47 37 27 17 07 76 66 56 46 36 26 16 06
-
- punpcklqdq xmm6, xmm2 ; f6 e6 d6 c6 b6 a6 96 86 76 66 56 46 36 26 16 06
-
- punpckhqdq xmm7, xmm2 ; f7 e7 d7 c7 b7 a7 97 87 77 67 57 47 37 27 17 07
-%if %2
- movdqa xmm2, xmm3 ; 73 63 53 43 33 23 13 03 72 62 52 42 32 22 12 02
- punpcklqdq xmm2, xmm5 ; f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02
-
- punpckhqdq xmm3, xmm5 ; f3 e3 d3 c3 b3 a3 93 83 73 63 53 43 33 23 13 03
-
- movdqa [rdx], xmm2 ; save 2
-
- movdqa xmm5, xmm4 ; 75 65 55 45 35 25 15 05 74 64 54 44 34 24 14 04
- punpcklqdq xmm4, xmm1 ; f4 e4 d4 c4 b4 a4 94 84 74 64 54 44 34 24 14 04
-
- movdqa [rdx+16], xmm3 ; save 3
-
- punpckhqdq xmm5, xmm1 ; f5 e5 d5 c5 b5 a5 95 85 75 65 55 45 35 25 15 05
-
- movdqa [rdx+32], xmm4 ; save 4
- movdqa [rdx+48], xmm5 ; save 5
- movdqa xmm1, t0 ; get
-
- movdqa xmm2, xmm1 ;
- punpckhqdq xmm1, xmm0 ; f1 e1 d1 c1 b1 a1 91 81 71 61 51 41 31 21 11 01
-
- punpcklqdq xmm2, xmm0 ; f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00
-%else
- movdqa [rdx+112], xmm7 ; save 7
-
- movdqa [rdx+96], xmm6 ; save 6
-
- movdqa xmm2, xmm3 ; 73 63 53 43 33 23 13 03 72 62 52 42 32 22 12 02
- punpckhqdq xmm3, xmm5 ; f3 e3 d3 c3 b3 a3 93 83 73 63 53 43 33 23 13 03
-
- punpcklqdq xmm2, xmm5 ; f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02
-
- movdqa [rdx+32], xmm2 ; save 2
-
- movdqa xmm5, xmm4 ; 75 65 55 45 35 25 15 05 74 64 54 44 34 24 14 04
- punpcklqdq xmm4, xmm1 ; f4 e4 d4 c4 b4 a4 94 84 74 64 54 44 34 24 14 04
-
- movdqa [rdx+48], xmm3 ; save 3
-
- punpckhqdq xmm5, xmm1 ; f5 e5 d5 c5 b5 a5 95 85 75 65 55 45 35 25 15 05
-
- movdqa [rdx+64], xmm4 ; save 4
- movdqa [rdx+80], xmm5 ; save 5
- movdqa xmm1, t0 ; get
-
- movdqa xmm2, xmm1
- punpckhqdq xmm1, xmm0 ; f1 e1 d1 c1 b1 a1 91 81 71 61 51 41 31 21 11 01
-
- punpcklqdq xmm2, xmm0 ; f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00
-
- movdqa [rdx+16], xmm1
-
- movdqa [rdx], xmm2
-%endif
-%endmacro
-
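
TRANSPOSE_16X8 exists so the vertical-edge filters can reuse the horizontal filter
logic: 16 rows of 8 pixels are gathered into 8 rows of 16, filtered across what used
to be columns, and BV_TRANSPOSE below turns the four modified lines back for the
write-out. The scalar equivalent of the gather is simply (illustrative only):

    /* scalar view of the 16x8 -> 8x16 gather done by TRANSPOSE_16X8 */
    static void transpose_16x8(const unsigned char *src, int pitch,
                               unsigned char dst[8][16]) {
      int r, c;
      for (r = 0; r < 16; ++r)
        for (c = 0; c < 8; ++c)
          dst[c][r] = src[r * pitch + c];
    }
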
-%macro LFV_FILTER_MASK_HEV_MASK 1
- movdqa xmm0, xmm6 ; q2
- psubusb xmm0, xmm7 ; q2-q3
-
- psubusb xmm7, xmm6 ; q3-q2
- movdqa xmm4, xmm5 ; q1
-
- por xmm7, xmm0 ; abs (q3-q2)
- psubusb xmm4, xmm6 ; q1-q2
-
- movdqa xmm0, xmm1
- psubusb xmm6, xmm5 ; q2-q1
-
- por xmm6, xmm4 ; abs (q2-q1)
- psubusb xmm0, xmm2 ; p2 - p3;
-
- psubusb xmm2, xmm1 ; p3 - p2;
- por xmm0, xmm2 ; abs(p2-p3)
-%if %1
- movdqa xmm2, [rdx] ; p1
-%else
- movdqa xmm2, [rdx+32] ; p1
-%endif
- movdqa xmm5, xmm2 ; p1
- pmaxub xmm0, xmm7
-
- psubusb xmm5, xmm1 ; p1-p2
- psubusb xmm1, xmm2 ; p2-p1
-
- movdqa xmm7, xmm3 ; p0
- psubusb xmm7, xmm2 ; p0-p1
-
- por xmm1, xmm5 ; abs(p2-p1)
- pmaxub xmm0, xmm6
-
- pmaxub xmm0, xmm1
- movdqa xmm1, xmm2 ; p1
-
- psubusb xmm2, xmm3 ; p1-p0
- lea rdx, srct
-
- por xmm2, xmm7 ; abs(p1-p0)
-
- movdqa t0, xmm2 ; save abs(p1-p0)
-
- pmaxub xmm0, xmm2
-
-%if %1
- movdqa xmm5, [rdx+32] ; q0
- movdqa xmm7, [rdx+48] ; q1
-%else
- movdqa xmm5, [rdx+64] ; q0
- movdqa xmm7, [rdx+80] ; q1
-%endif
- mov rdx, arg(3) ; limit
-
- movdqa xmm6, xmm5 ; q0
- movdqa xmm2, xmm7 ; q1
-
- psubusb xmm5, xmm7 ; q0-q1
- psubusb xmm7, xmm6 ; q1-q0
-
- por xmm7, xmm5 ; abs(q1-q0)
-
- movdqa t1, xmm7 ; save abs(q1-q0)
-
- movdqa xmm4, XMMWORD PTR [rdx]; limit
-
- pmaxub xmm0, xmm7
- mov rdx, arg(2) ; blimit
-
- psubusb xmm0, xmm4
- movdqa xmm5, xmm2 ; q1
-
- psubusb xmm5, xmm1 ; q1-=p1
- psubusb xmm1, xmm2 ; p1-=q1
-
- por xmm5, xmm1 ; abs(p1-q1)
- movdqa xmm1, xmm3 ; p0
-
- pand xmm5, [GLOBAL(tfe)] ; set lsb of each byte to zero
- psubusb xmm1, xmm6 ; p0-q0
-
- psrlw xmm5, 1 ; abs(p1-q1)/2
- psubusb xmm6, xmm3 ; q0-p0
-
- movdqa xmm4, XMMWORD PTR [rdx]; blimit
-
- mov rdx, arg(4) ; get thresh
-
- por xmm1, xmm6 ; abs(q0-p0)
-
- movdqa xmm6, t0 ; get abs (q1 - q0)
-
- paddusb xmm1, xmm1 ; abs(q0-p0)*2
-
- movdqa xmm3, t1 ; get abs (p1 - p0)
-
- movdqa xmm7, XMMWORD PTR [rdx]
-
- paddusb xmm1, xmm5 ; abs (p0 - q0) *2 + abs(p1-q1)/2
- psubusb xmm6, xmm7 ; abs(q1 - q0) > thresh
-
- psubusb xmm3, xmm7 ; abs(p1 - p0)> thresh
-
- psubusb xmm1, xmm4 ; abs (p0 - q0) *2 + abs(p1-q1)/2 > blimit
- por xmm6, xmm3 ; abs(q1 - q0) > thresh || abs(p1 - p0) > thresh
-
- por xmm1, xmm0 ; mask
- pcmpeqb xmm6, xmm0
-
- pxor xmm0, xmm0
- pcmpeqb xmm4, xmm4
-
- pcmpeqb xmm1, xmm0
- pxor xmm4, xmm6
-%endmacro
-
-%macro BV_TRANSPOSE 0
- ; xmm1 = f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02
- ; xmm6 = f3 e3 d3 c3 b3 a3 93 83 73 63 53 43 33 23 13 03
- ; xmm3 = f4 e4 d4 c4 b4 a4 94 84 74 64 54 44 34 24 14 04
- ; xmm7 = f5 e5 d5 c5 b5 a5 95 85 75 65 55 45 35 25 15 05
- movdqa xmm2, xmm1 ; f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02
- punpcklbw xmm2, xmm6 ; 73 72 63 62 53 52 43 42 33 32 23 22 13 12 03 02
-
- movdqa xmm4, xmm3 ; f4 e4 d4 c4 b4 a4 94 84 74 64 54 44 34 24 14 04
- punpckhbw xmm1, xmm6 ; f3 f2 e3 e2 d3 d2 c3 c2 b3 b2 a3 a2 93 92 83 82
-
- punpcklbw xmm4, xmm7 ; 75 74 65 64 55 54 45 44 35 34 25 24 15 14 05 04
-
- punpckhbw xmm3, xmm7 ; f5 f4 e5 e4 d5 d4 c5 c4 b5 b4 a5 a4 95 94 85 84
-
- movdqa xmm6, xmm2 ; 73 72 63 62 53 52 43 42 33 32 23 22 13 12 03 02
- punpcklwd xmm2, xmm4 ; 35 34 33 32 25 24 23 22 15 14 13 12 05 04 03 02
-
- punpckhwd xmm6, xmm4 ; 75 74 73 72 65 64 63 62 55 54 53 52 45 44 43 42
- movdqa xmm5, xmm1 ; f3 f2 e3 e2 d3 d2 c3 c2 b3 b2 a3 a2 93 92 83 82
-
- punpcklwd xmm1, xmm3 ; b5 b4 b3 b2 a5 a4 a3 a2 95 94 93 92 85 84 83 82
-
- punpckhwd xmm5, xmm3 ; f5 f4 f3 f2 e5 e4 e3 e2 d5 d4 d3 d2 c5 c4 c3 c2
- ; xmm2 = 35 34 33 32 25 24 23 22 15 14 13 12 05 04 03 02
- ; xmm6 = 75 74 73 72 65 64 63 62 55 54 53 52 45 44 43 42
- ; xmm1 = b5 b4 b3 b2 a5 a4 a3 a2 95 94 93 92 85 84 83 82
- ; xmm5 = f5 f4 f3 f2 e5 e4 e3 e2 d5 d4 d3 d2 c5 c4 c3 c2
-%endmacro
-
-%macro BV_WRITEBACK 2
- movd [rsi+2], %1
- psrldq %1, 4
-
- movd [rdi+2], %1
- psrldq %1, 4
-
- movd [rsi+2*rax+2], %1
- psrldq %1, 4
-
- movd [rdi+2*rax+2], %1
-
- movd [rsi+4*rax+2], %2
- psrldq %2, 4
-
- movd [rdi+4*rax+2], %2
- psrldq %2, 4
-
- movd [rsi+2*rcx+2], %2
- psrldq %2, 4
-
- movd [rdi+2*rcx+2], %2
-%endmacro
-
-
-;void vp9_loop_filter_vertical_edge_sse2
-;(
-; unsigned char *src_ptr,
-; int src_pixel_step,
-; const char *blimit,
-; const char *limit,
-; const char *thresh,
-; int count
-;)
-global sym(vp9_loop_filter_vertical_edge_sse2)
-sym(vp9_loop_filter_vertical_edge_sse2):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 6
- SAVE_XMM 7
- GET_GOT rbx
- push rsi
- push rdi
- ; end prolog
-
- ALIGN_STACK 16, rax
- sub rsp, 96 ; reserve 96 bytes
- %define t0 [rsp + 0] ;__declspec(align(16)) char t0[16];
- %define t1 [rsp + 16] ;__declspec(align(16)) char t1[16];
- %define srct [rsp + 32] ;__declspec(align(16)) char srct[64];
-
- mov rsi, arg(0) ; src_ptr
- movsxd rax, dword ptr arg(1) ; src_pixel_step
-
- lea rsi, [rsi - 4]
- lea rdi, [rsi + rax] ; rdi points to row +1 for indirect addressing
- lea rcx, [rax*2+rax]
-
- ;transpose 16x8 to 8x16, and store the 8-line result on stack.
- TRANSPOSE_16X8 1, 1
-
- ; calculate filter mask and high edge variance
- LFV_FILTER_MASK_HEV_MASK 1
-
- ; start work on filters
- B_FILTER 2
-
-        ; transpose and write back - only works on q1, q0, p0, p1
- BV_TRANSPOSE
- ; store 16-line result
-
- lea rdx, [rax]
- neg rdx
-
- BV_WRITEBACK xmm1, xmm5
-
- lea rsi, [rsi+rdx*8]
- lea rdi, [rdi+rdx*8]
- BV_WRITEBACK xmm2, xmm6
-
- add rsp, 96
- pop rsp
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_GOT
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
-
-
-;void vp9_loop_filter_vertical_edge_uv_sse2
-;(
-; unsigned char *u,
-; int src_pixel_step,
-; const char *blimit,
-; const char *limit,
-; const char *thresh,
-; unsigned char *v
-;)
-global sym(vp9_loop_filter_vertical_edge_uv_sse2)
-sym(vp9_loop_filter_vertical_edge_uv_sse2):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 6
- SAVE_XMM 7
- GET_GOT rbx
- push rsi
- push rdi
- ; end prolog
-
- ALIGN_STACK 16, rax
- sub rsp, 96 ; reserve 96 bytes
- %define t0 [rsp + 0] ;__declspec(align(16)) char t0[16];
- %define t1 [rsp + 16] ;__declspec(align(16)) char t1[16];
- %define srct [rsp + 32] ;__declspec(align(16)) char srct[64];
-
- mov rsi, arg(0) ; u_ptr
- movsxd rax, dword ptr arg(1) ; src_pixel_step
-
- lea rsi, [rsi - 4]
- lea rdi, [rsi + rax] ; rdi points to row +1 for indirect addressing
- lea rcx, [rax+2*rax]
-
- lea rdx, srct
-
- ;transpose 16x8 to 8x16, and store the 8-line result on stack.
- TRANSPOSE_16X8 0, 1
-
- ; calculate filter mask and high edge variance
- LFV_FILTER_MASK_HEV_MASK 1
-
- ; start work on filters
- B_FILTER 2
-
-        ; transpose and write back - only works on q1, q0, p0, p1
- BV_TRANSPOSE
-
- lea rdi, [rsi + rax] ; rdi points to row +1 for indirect addressing
-
- ; store 16-line result
- BV_WRITEBACK xmm1, xmm5
-
- mov rsi, arg(0) ; u_ptr
- lea rsi, [rsi - 4]
- lea rdi, [rsi + rax] ; rdi points to row +1 for indirect addressing
- BV_WRITEBACK xmm2, xmm6
-
- add rsp, 96
- pop rsp
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_GOT
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
-
-;void vp9_loop_filter_simple_horizontal_edge_sse2
-;(
-; unsigned char *src_ptr,
-; int src_pixel_step,
-;    const char *blimit
-;)
-global sym(vp9_loop_filter_simple_horizontal_edge_sse2)
-sym(vp9_loop_filter_simple_horizontal_edge_sse2):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 3
- SAVE_XMM 7
- GET_GOT rbx
- push rsi
- push rdi
- ; end prolog
-
- mov rsi, arg(0) ;src_ptr
- movsxd rax, dword ptr arg(1) ;src_pixel_step ; destination pitch?
- mov rdx, arg(2) ;blimit
- movdqa xmm3, XMMWORD PTR [rdx]
-
- mov rdi, rsi ; rdi points to row +1 for indirect addressing
- add rdi, rax
- neg rax
-
- ; calculate mask
- movdqa xmm1, [rsi+2*rax] ; p1
- movdqa xmm0, [rdi] ; q1
- movdqa xmm2, xmm1
- movdqa xmm7, xmm0
- movdqa xmm4, xmm0
- psubusb xmm0, xmm1 ; q1-=p1
- psubusb xmm1, xmm4 ; p1-=q1
- por xmm1, xmm0 ; abs(p1-q1)
- pand xmm1, [GLOBAL(tfe)] ; set lsb of each byte to zero
- psrlw xmm1, 1 ; abs(p1-q1)/2
-
- movdqa xmm5, [rsi+rax] ; p0
- movdqa xmm4, [rsi] ; q0
- movdqa xmm0, xmm4 ; q0
- movdqa xmm6, xmm5 ; p0
- psubusb xmm5, xmm4 ; p0-=q0
- psubusb xmm4, xmm6 ; q0-=p0
- por xmm5, xmm4 ; abs(p0 - q0)
- paddusb xmm5, xmm5 ; abs(p0-q0)*2
- paddusb xmm5, xmm1 ; abs (p0 - q0) *2 + abs(p1-q1)/2
-
- psubusb xmm5, xmm3 ; abs(p0 - q0) *2 + abs(p1-q1)/2 > blimit
- pxor xmm3, xmm3
- pcmpeqb xmm5, xmm3
-
- ; start work on filters
- pxor xmm2, [GLOBAL(t80)] ; p1 offset to convert to signed values
- pxor xmm7, [GLOBAL(t80)] ; q1 offset to convert to signed values
- psubsb xmm2, xmm7 ; p1 - q1
-
- pxor xmm6, [GLOBAL(t80)] ; offset to convert to signed values
- pxor xmm0, [GLOBAL(t80)] ; offset to convert to signed values
- movdqa xmm3, xmm0 ; q0
- psubsb xmm0, xmm6 ; q0 - p0
- paddsb xmm2, xmm0 ; p1 - q1 + 1 * (q0 - p0)
- paddsb xmm2, xmm0 ; p1 - q1 + 2 * (q0 - p0)
- paddsb xmm2, xmm0 ; p1 - q1 + 3 * (q0 - p0)
- pand xmm5, xmm2 ; mask filter values we don't care about
-
- ; do + 4 side
- paddsb xmm5, [GLOBAL(t4)] ; 3* (q0 - p0) + (p1 - q1) + 4
-
- movdqa xmm0, xmm5 ; get a copy of filters
- psllw xmm0, 8 ; shift left 8
-        psraw       xmm0, 3                 ; arithmetic shift right 3
- psrlw xmm0, 8
- movdqa xmm1, xmm5 ; get a copy of filters
- psraw xmm1, 11 ; arithmetic shift right 11
- psllw xmm1, 8 ; shift left 8 to put it back
-
- por xmm0, xmm1 ; put the two together to get result
-
- psubsb xmm3, xmm0 ; q0-= q0 add
- pxor xmm3, [GLOBAL(t80)] ; unoffset
- movdqa [rsi], xmm3 ; write back
-
- ; now do +3 side
- psubsb xmm5, [GLOBAL(t1s)] ; +3 instead of +4
-
- movdqa xmm0, xmm5 ; get a copy of filters
- psllw xmm0, 8 ; shift left 8
-        psraw       xmm0, 3                 ; arithmetic shift right 3
- psrlw xmm0, 8
- psraw xmm5, 11 ; arithmetic shift right 11
- psllw xmm5, 8 ; shift left 8 to put it back
- por xmm0, xmm5 ; put the two together to get result
-
-
- paddsb xmm6, xmm0 ; p0+= p0 add
- pxor xmm6, [GLOBAL(t80)] ; unoffset
- movdqa [rsi+rax], xmm6 ; write back
-
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_GOT
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
-
-
-;void vp9_loop_filter_simple_vertical_edge_sse2
-;(
-; unsigned char *src_ptr,
-; int src_pixel_step,
-;    const char *blimit
-;)
-global sym(vp9_loop_filter_simple_vertical_edge_sse2)
-sym(vp9_loop_filter_simple_vertical_edge_sse2):
- push rbp ; save old base pointer value.
- mov rbp, rsp ; set new base pointer value.
- SHADOW_ARGS_TO_STACK 3
- SAVE_XMM 7
- GET_GOT rbx ; save callee-saved reg
- push rsi
- push rdi
- ; end prolog
-
- ALIGN_STACK 16, rax
- sub rsp, 32 ; reserve 32 bytes
- %define t0 [rsp + 0] ;__declspec(align(16)) char t0[16];
- %define t1 [rsp + 16] ;__declspec(align(16)) char t1[16];
-
- mov rsi, arg(0) ;src_ptr
- movsxd rax, dword ptr arg(1) ;src_pixel_step ; destination pitch?
-
- lea rsi, [rsi - 2 ]
- lea rdi, [rsi + rax]
- lea rdx, [rsi + rax*4]
- lea rcx, [rdx + rax]
-
- movd xmm0, [rsi] ; (high 96 bits unused) 03 02 01 00
- movd xmm1, [rdx] ; (high 96 bits unused) 43 42 41 40
- movd xmm2, [rdi] ; 13 12 11 10
- movd xmm3, [rcx] ; 53 52 51 50
- punpckldq xmm0, xmm1 ; (high 64 bits unused) 43 42 41 40 03 02 01 00
- punpckldq xmm2, xmm3 ; 53 52 51 50 13 12 11 10
-
- movd xmm4, [rsi + rax*2] ; 23 22 21 20
- movd xmm5, [rdx + rax*2] ; 63 62 61 60
- movd xmm6, [rdi + rax*2] ; 33 32 31 30
- movd xmm7, [rcx + rax*2] ; 73 72 71 70
- punpckldq xmm4, xmm5 ; 63 62 61 60 23 22 21 20
- punpckldq xmm6, xmm7 ; 73 72 71 70 33 32 31 30
-
- punpcklbw xmm0, xmm2 ; 53 43 52 42 51 41 50 40 13 03 12 02 11 01 10 00
- punpcklbw xmm4, xmm6 ; 73 63 72 62 71 61 70 60 33 23 32 22 31 21 30 20
-
- movdqa xmm1, xmm0
- punpcklwd xmm0, xmm4 ; 33 23 13 03 32 22 12 02 31 21 11 01 30 20 10 00
- punpckhwd xmm1, xmm4 ; 73 63 53 43 72 62 52 42 71 61 51 41 70 60 50 40
-
- movdqa xmm2, xmm0
- punpckldq xmm0, xmm1 ; 71 61 51 41 31 21 11 01 70 60 50 40 30 20 10 00
- punpckhdq xmm2, xmm1 ; 73 63 53 43 33 23 13 03 72 62 52 42 32 22 12 02
-
- movdqa t0, xmm0 ; save to t0
- movdqa t1, xmm2 ; save to t1
-
- lea rsi, [rsi + rax*8]
- lea rdi, [rsi + rax]
- lea rdx, [rsi + rax*4]
- lea rcx, [rdx + rax]
-
- movd xmm4, [rsi] ; 83 82 81 80
- movd xmm1, [rdx] ; c3 c2 c1 c0
- movd xmm6, [rdi] ; 93 92 91 90
- movd xmm3, [rcx] ; d3 d2 d1 d0
- punpckldq xmm4, xmm1 ; c3 c2 c1 c0 83 82 81 80
- punpckldq xmm6, xmm3 ; d3 d2 d1 d0 93 92 91 90
-
- movd xmm0, [rsi + rax*2] ; a3 a2 a1 a0
- movd xmm5, [rdx + rax*2] ; e3 e2 e1 e0
- movd xmm2, [rdi + rax*2] ; b3 b2 b1 b0
- movd xmm7, [rcx + rax*2] ; f3 f2 f1 f0
- punpckldq xmm0, xmm5 ; e3 e2 e1 e0 a3 a2 a1 a0
- punpckldq xmm2, xmm7 ; f3 f2 f1 f0 b3 b2 b1 b0
-
- punpcklbw xmm4, xmm6 ; d3 c3 d2 c2 d1 c1 d0 c0 93 83 92 82 91 81 90 80
- punpcklbw xmm0, xmm2 ; f3 e3 f2 e2 f1 e1 f0 e0 b3 a3 b2 a2 b1 a1 b0 a0
-
- movdqa xmm1, xmm4
- punpcklwd xmm4, xmm0 ; b3 a3 93 83 b2 a2 92 82 b1 a1 91 81 b0 a0 90 80
- punpckhwd xmm1, xmm0 ; f3 e3 d3 c3 f2 e2 d2 c2 f1 e1 d1 c1 f0 e0 d0 c0
-
- movdqa xmm6, xmm4
- punpckldq xmm4, xmm1 ; f1 e1 d1 c1 b1 a1 91 81 f0 e0 d0 c0 b0 a0 90 80
- punpckhdq xmm6, xmm1 ; f3 e3 d3 c3 b3 a3 93 83 f2 e2 d2 c2 b2 a2 92 82
-
- movdqa xmm0, t0 ; 71 61 51 41 31 21 11 01 70 60 50 40 30 20 10 00
- movdqa xmm2, t1 ; 73 63 53 43 33 23 13 03 72 62 52 42 32 22 12 02
- movdqa xmm1, xmm0
- movdqa xmm3, xmm2
-
- punpcklqdq xmm0, xmm4 ; p1 f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00
- punpckhqdq xmm1, xmm4 ; p0 f1 e1 d1 c1 b1 a1 91 81 71 61 51 41 31 21 11 01
- punpcklqdq xmm2, xmm6 ; q0 f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02
- punpckhqdq xmm3, xmm6 ; q1 f3 e3 d3 c3 b3 a3 93 83 73 63 53 43 33 23 13 03
-
- ; calculate mask
- movdqa xmm6, xmm0 ; p1
- movdqa xmm7, xmm3 ; q1
- psubusb xmm7, xmm0 ; q1-=p1
- psubusb xmm6, xmm3 ; p1-=q1
- por xmm6, xmm7 ; abs(p1-q1)
- pand xmm6, [GLOBAL(tfe)] ; set lsb of each byte to zero
- psrlw xmm6, 1 ; abs(p1-q1)/2
-
- movdqa xmm5, xmm1 ; p0
- movdqa xmm4, xmm2 ; q0
- psubusb xmm5, xmm2 ; p0-=q0
- psubusb xmm4, xmm1 ; q0-=p0
- por xmm5, xmm4 ; abs(p0 - q0)
- paddusb xmm5, xmm5 ; abs(p0-q0)*2
- paddusb xmm5, xmm6 ; abs (p0 - q0) *2 + abs(p1-q1)/2
-
- mov rdx, arg(2) ;blimit
- movdqa xmm7, XMMWORD PTR [rdx]
-
- psubusb xmm5, xmm7 ; abs(p0 - q0) *2 + abs(p1-q1)/2 > blimit
- pxor xmm7, xmm7
-        pcmpeqb     xmm5, xmm7              ; xmm5 = mask
-
- ; start work on filters
- movdqa t0, xmm0
- movdqa t1, xmm3
-
- pxor xmm0, [GLOBAL(t80)] ; p1 offset to convert to signed values
- pxor xmm3, [GLOBAL(t80)] ; q1 offset to convert to signed values
-
- psubsb xmm0, xmm3 ; p1 - q1
- movdqa xmm6, xmm1 ; p0
-
- movdqa xmm7, xmm2 ; q0
- pxor xmm6, [GLOBAL(t80)] ; offset to convert to signed values
-
- pxor xmm7, [GLOBAL(t80)] ; offset to convert to signed values
-        movdqa      xmm3, xmm7              ; q0 offset to signed values
-
- psubsb xmm7, xmm6 ; q0 - p0
- paddsb xmm0, xmm7 ; p1 - q1 + 1 * (q0 - p0)
-
- paddsb xmm0, xmm7 ; p1 - q1 + 2 * (q0 - p0)
- paddsb xmm0, xmm7 ; p1 - q1 + 3 * (q0 - p0)
-
- pand xmm5, xmm0 ; mask filter values we don't care about
-
-
- paddsb xmm5, [GLOBAL(t4)] ; 3* (q0 - p0) + (p1 - q1) + 4
-
- movdqa xmm0, xmm5 ; get a copy of filters
- psllw xmm0, 8 ; shift left 8
-
-        psraw       xmm0, 3                 ; arithmetic shift right 3
- psrlw xmm0, 8
-
- movdqa xmm7, xmm5 ; get a copy of filters
- psraw xmm7, 11 ; arithmetic shift right 11
-
- psllw xmm7, 8 ; shift left 8 to put it back
- por xmm0, xmm7 ; put the two together to get result
-
- psubsb xmm3, xmm0 ; q0-= q0sz add
- pxor xmm3, [GLOBAL(t80)] ; unoffset q0
-
- ; now do +3 side
- psubsb xmm5, [GLOBAL(t1s)] ; +3 instead of +4
- movdqa xmm0, xmm5 ; get a copy of filters
-
- psllw xmm0, 8 ; shift left 8
-        psraw       xmm0, 3                 ; arithmetic shift right 3
-
- psrlw xmm0, 8
- psraw xmm5, 11 ; arithmetic shift right 11
-
- psllw xmm5, 8 ; shift left 8 to put it back
- por xmm0, xmm5 ; put the two together to get result
-
- paddsb xmm6, xmm0 ; p0+= p0 add
- pxor xmm6, [GLOBAL(t80)] ; unoffset p0
-
- movdqa xmm0, t0 ; p1
- movdqa xmm4, t1 ; q1
-
- ; transpose back to write out
- ; p1 f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00
- ; p0 f1 e1 d1 c1 b1 a1 91 81 71 61 51 41 31 21 11 01
- ; q0 f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02
- ; q1 f3 e3 d3 c3 b3 a3 93 83 73 63 53 43 33 23 13 03
- movdqa xmm1, xmm0
- punpcklbw xmm0, xmm6 ; 71 70 61 60 51 50 41 40 31 30 21 20 11 10 01 00
- punpckhbw xmm1, xmm6 ; f1 f0 e1 e0 d1 d0 c1 c0 b1 b0 a1 a0 91 90 81 80
-
- movdqa xmm5, xmm3
- punpcklbw xmm3, xmm4 ; 73 72 63 62 53 52 43 42 33 32 23 22 13 12 03 02
- punpckhbw xmm5, xmm4 ; f3 f2 e3 e2 d3 d2 c3 c2 b3 b2 a3 a2 93 92 83 82
-
- movdqa xmm2, xmm0
- punpcklwd xmm0, xmm3 ; 33 32 31 30 23 22 21 20 13 12 11 10 03 02 01 00
- punpckhwd xmm2, xmm3 ; 73 72 71 70 63 62 61 60 53 52 51 50 43 42 41 40
-
- movdqa xmm3, xmm1
- punpcklwd xmm1, xmm5 ; b3 b2 b1 b0 a3 a2 a1 a0 93 92 91 90 83 82 81 80
- punpckhwd xmm3, xmm5 ; f3 f2 f1 f0 e3 e2 e1 e0 d3 d2 d1 d0 c3 c2 c1 c0
-
- ; write out order: xmm0 xmm2 xmm1 xmm3
- lea rdx, [rsi + rax*4]
-
- movd [rsi], xmm1 ; write the second 8-line result
- psrldq xmm1, 4
- movd [rdi], xmm1
- psrldq xmm1, 4
- movd [rsi + rax*2], xmm1
- psrldq xmm1, 4
- movd [rdi + rax*2], xmm1
-
- movd [rdx], xmm3
- psrldq xmm3, 4
- movd [rcx], xmm3
- psrldq xmm3, 4
- movd [rdx + rax*2], xmm3
- psrldq xmm3, 4
- movd [rcx + rax*2], xmm3
-
- neg rax
- lea rsi, [rsi + rax*8]
- neg rax
- lea rdi, [rsi + rax]
- lea rdx, [rsi + rax*4]
- lea rcx, [rdx + rax]
-
- movd [rsi], xmm0 ; write the first 8-line result
- psrldq xmm0, 4
- movd [rdi], xmm0
- psrldq xmm0, 4
- movd [rsi + rax*2], xmm0
- psrldq xmm0, 4
- movd [rdi + rax*2], xmm0
-
- movd [rdx], xmm2
- psrldq xmm2, 4
- movd [rcx], xmm2
- psrldq xmm2, 4
- movd [rdx + rax*2], xmm2
- psrldq xmm2, 4
- movd [rcx + rax*2], xmm2
-
- add rsp, 32
- pop rsp
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_GOT
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
-
-SECTION_RODATA
-align 16
-tfe:
- times 16 db 0xfe
-align 16
-t80:
- times 16 db 0x80
-align 16
-t1s:
- times 16 db 0x01
-align 16
-t3:
- times 16 db 0x03
-align 16
-t4:
- times 16 db 0x04
-align 16
-ones:
- times 8 dw 0x0001
-align 16
-s9:
- times 8 dw 0x0900
-align 16
-s63:
- times 8 dw 0x003f
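
A note on the shift sequences in the "simple" filters above: the packed signed-byte
filter value is shifted right by 3 without unpacking to words. The even bytes go
through psllw 8 / psraw 3 / psrlw 8 and the odd bytes through psraw 11 / psllw 8,
after which the two halves are OR'd back together. An equivalent written with SSE2
intrinsics (a sketch, not code from this file):

    #include <emmintrin.h>

    /* arithmetic >>3 of every signed byte in f, without widening to 16 bits */
    static __m128i signed_byte_sra3(__m128i f) {
      __m128i lo, hi;
      lo = _mm_slli_epi16(f, 8);            /* even bytes into the high half   */
      lo = _mm_srai_epi16(lo, 3);           /* arithmetic shift                */
      lo = _mm_srli_epi16(lo, 8);           /* back down to the low byte       */
      hi = _mm_srai_epi16(f, 11);           /* odd bytes: >>3, sign-extended   */
      hi = _mm_slli_epi16(hi, 8);           /* restore their byte position     */
      return _mm_or_si128(lo, hi);
    }
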
--- a/vp8/common/x86/loopfilter_x86.c
+++ /dev/null
@@ -1,543 +1,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include <emmintrin.h> // SSE2
-#include "vpx_config.h"
-#include "vp8/common/loopfilter.h"
-
-prototype_loopfilter(vp9_loop_filter_vertical_edge_mmx);
-prototype_loopfilter(vp9_loop_filter_horizontal_edge_mmx);
-
-prototype_loopfilter(vp9_loop_filter_vertical_edge_sse2);
-prototype_loopfilter(vp9_loop_filter_horizontal_edge_sse2);
-
-extern loop_filter_uvfunction vp9_loop_filter_horizontal_edge_uv_sse2;
-extern loop_filter_uvfunction vp9_loop_filter_vertical_edge_uv_sse2;
-
-#if HAVE_MMX
-/* Horizontal MB filtering */
-void vp9_loop_filter_mbh_mmx(unsigned char *y_ptr,
- unsigned char *u_ptr, unsigned char *v_ptr,
- int y_stride, int uv_stride,
- struct loop_filter_info *lfi) {
-}
-
-/* Vertical MB Filtering */
-void vp9_loop_filter_mbv_mmx(unsigned char *y_ptr,
- unsigned char *u_ptr, unsigned char *v_ptr,
- int y_stride, int uv_stride,
- struct loop_filter_info *lfi) {
-}
-
-/* Horizontal B Filtering */
-void vp9_loop_filter_bh_mmx(unsigned char *y_ptr,
- unsigned char *u_ptr, unsigned char *v_ptr,
- int y_stride, int uv_stride,
- struct loop_filter_info *lfi) {
-
-}
-
-void vp9_loop_filter_bhs_mmx(unsigned char *y_ptr, int y_stride,
- const unsigned char *blimit) {
- vp9_loop_filter_simple_horizontal_edge_mmx(y_ptr + 4 * y_stride,
- y_stride, blimit);
- vp9_loop_filter_simple_horizontal_edge_mmx(y_ptr + 8 * y_stride,
- y_stride, blimit);
- vp9_loop_filter_simple_horizontal_edge_mmx(y_ptr + 12 * y_stride,
- y_stride, blimit);
-}
-
-/* Vertical B Filtering */
-void vp9_loop_filter_bv_mmx(unsigned char *y_ptr,
- unsigned char *u_ptr, unsigned char *v_ptr,
- int y_stride, int uv_stride,
- struct loop_filter_info *lfi) {
- vp9_loop_filter_vertical_edge_mmx(y_ptr + 4, y_stride,
- lfi->blim, lfi->lim, lfi->hev_thr, 2);
- vp9_loop_filter_vertical_edge_mmx(y_ptr + 8, y_stride,
- lfi->blim, lfi->lim, lfi->hev_thr, 2);
- vp9_loop_filter_vertical_edge_mmx(y_ptr + 12, y_stride,
- lfi->blim, lfi->lim, lfi->hev_thr, 2);
-
- if (u_ptr)
- vp9_loop_filter_vertical_edge_mmx(u_ptr + 4, uv_stride,
- lfi->blim, lfi->lim, lfi->hev_thr, 1);
-
- if (v_ptr)
- vp9_loop_filter_vertical_edge_mmx(v_ptr + 4, uv_stride,
- lfi->blim, lfi->lim, lfi->hev_thr, 1);
-}
-
-void vp9_loop_filter_bvs_mmx(unsigned char *y_ptr, int y_stride,
- const unsigned char *blimit) {
- vp9_loop_filter_simple_vertical_edge_mmx(y_ptr + 4, y_stride, blimit);
- vp9_loop_filter_simple_vertical_edge_mmx(y_ptr + 8, y_stride, blimit);
- vp9_loop_filter_simple_vertical_edge_mmx(y_ptr + 12, y_stride, blimit);
-}
-#endif
-
-#if HAVE_SSE2
-void vp9_mbloop_filter_horizontal_edge_c_sse2(unsigned char *s,
- int p,
- const unsigned char *_blimit,
- const unsigned char *_limit,
- const unsigned char *_thresh,
- int count) {
- DECLARE_ALIGNED(16, unsigned char, flat_op2[16]);
- DECLARE_ALIGNED(16, unsigned char, flat_op1[16]);
- DECLARE_ALIGNED(16, unsigned char, flat_op0[16]);
- DECLARE_ALIGNED(16, unsigned char, flat_oq2[16]);
- DECLARE_ALIGNED(16, unsigned char, flat_oq1[16]);
- DECLARE_ALIGNED(16, unsigned char, flat_oq0[16]);
- __m128i mask, hev, flat;
- __m128i thresh, limit, blimit;
- const __m128i zero = _mm_set1_epi16(0);
- __m128i p4, p3, p2, p1, p0, q0, q1, q2, q3, q4;
-
- thresh = _mm_shuffle_epi32(_mm_cvtsi32_si128(_thresh[0] * 0x01010101), 0);
- limit = _mm_shuffle_epi32(_mm_cvtsi32_si128(_limit[0] * 0x01010101), 0);
- blimit = _mm_shuffle_epi32(_mm_cvtsi32_si128(_blimit[0] * 0x01010101), 0);
-
- p4 = _mm_loadu_si128((__m128i *)(s - 5 * p));
- p3 = _mm_loadu_si128((__m128i *)(s - 4 * p));
- p2 = _mm_loadu_si128((__m128i *)(s - 3 * p));
- p1 = _mm_loadu_si128((__m128i *)(s - 2 * p));
- p0 = _mm_loadu_si128((__m128i *)(s - 1 * p));
- q0 = _mm_loadu_si128((__m128i *)(s - 0 * p));
- q1 = _mm_loadu_si128((__m128i *)(s + 1 * p));
- q2 = _mm_loadu_si128((__m128i *)(s + 2 * p));
- q3 = _mm_loadu_si128((__m128i *)(s + 3 * p));
- q4 = _mm_loadu_si128((__m128i *)(s + 4 * p));
- {
- const __m128i abs_p1p0 = _mm_or_si128(_mm_subs_epu8(p1, p0),
- _mm_subs_epu8(p0, p1));
- const __m128i abs_q1q0 = _mm_or_si128(_mm_subs_epu8(q1, q0),
- _mm_subs_epu8(q0, q1));
- const __m128i one = _mm_set1_epi8(1);
- const __m128i fe = _mm_set1_epi8(0xfe);
- const __m128i ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0);
- __m128i abs_p0q0 = _mm_or_si128(_mm_subs_epu8(p0, q0),
- _mm_subs_epu8(q0, p0));
- __m128i abs_p1q1 = _mm_or_si128(_mm_subs_epu8(p1, q1),
- _mm_subs_epu8(q1, p1));
- __m128i work;
- flat = _mm_max_epu8(abs_p1p0, abs_q1q0);
- hev = _mm_subs_epu8(flat, thresh);
- hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);
-
- abs_p0q0 =_mm_adds_epu8(abs_p0q0, abs_p0q0);
- abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
- mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit);
- mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
- // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1;
- mask = _mm_max_epu8(flat, mask);
- // mask |= (abs(p1 - p0) > limit) * -1;
- // mask |= (abs(q1 - q0) > limit) * -1;
- work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p2, p1),
- _mm_subs_epu8(p1, p2)),
- _mm_or_si128(_mm_subs_epu8(p3, p2),
- _mm_subs_epu8(p2, p3)));
- mask = _mm_max_epu8(work, mask);
- work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(q2, q1),
- _mm_subs_epu8(q1, q2)),
- _mm_or_si128(_mm_subs_epu8(q3, q2),
- _mm_subs_epu8(q2, q3)));
- mask = _mm_max_epu8(work, mask);
- mask = _mm_subs_epu8(mask, limit);
- mask = _mm_cmpeq_epi8(mask, zero);
-
- work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p2, p0),
- _mm_subs_epu8(p0, p2)),
- _mm_or_si128(_mm_subs_epu8(q2, q0),
- _mm_subs_epu8(q0, q2)));
- flat = _mm_max_epu8(work, flat);
- work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p3, p0),
- _mm_subs_epu8(p0, p3)),
- _mm_or_si128(_mm_subs_epu8(q3, q0),
- _mm_subs_epu8(q0, q3)));
- flat = _mm_max_epu8(work, flat);
- work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p4, p0),
- _mm_subs_epu8(p0, p4)),
- _mm_or_si128(_mm_subs_epu8(q4, q0),
- _mm_subs_epu8(q0, q4)));
- flat = _mm_max_epu8(work, flat);
- flat = _mm_subs_epu8(flat, one);
- flat = _mm_cmpeq_epi8(flat, zero);
- flat = _mm_and_si128(flat, mask);
- }
- {
- const __m128i four = _mm_set1_epi16(4);
- unsigned char *src = s;
- int i = 0;
- do {
- __m128i workp_a, workp_b, workp_shft;
- p4 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 5 * p)), zero);
- p3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 4 * p)), zero);
- p2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 3 * p)), zero);
- p1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 2 * p)), zero);
- p0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 1 * p)), zero);
- q0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 0 * p)), zero);
- q1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 1 * p)), zero);
- q2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 2 * p)), zero);
- q3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 3 * p)), zero);
- q4 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 4 * p)), zero);
-
- workp_a = _mm_add_epi16(_mm_add_epi16(p4, p3), _mm_add_epi16(p2, p1));
- workp_a = _mm_add_epi16(_mm_add_epi16(workp_a, four), p0);
- workp_b = _mm_add_epi16(_mm_add_epi16(q0, p2), p4);
- workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
- _mm_storel_epi64((__m128i *)&flat_op2[i*8],
- _mm_packus_epi16(workp_shft, workp_shft));
-
- workp_b = _mm_add_epi16(_mm_add_epi16(q0, q1), p1);
- workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
- _mm_storel_epi64((__m128i *)&flat_op1[i*8],
- _mm_packus_epi16(workp_shft, workp_shft));
-
- workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p4), q2);
- workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p1), p0);
- workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
- _mm_storel_epi64((__m128i *)&flat_op0[i*8],
- _mm_packus_epi16(workp_shft, workp_shft));
-
- workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3), q3);
- workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p0), q0);
- workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
- _mm_storel_epi64((__m128i *)&flat_oq0[i*8],
- _mm_packus_epi16(workp_shft, workp_shft));
-
- workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p2), q4);
- workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q0), q1);
- workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
- _mm_storel_epi64((__m128i *)&flat_oq1[i*8],
- _mm_packus_epi16(workp_shft, workp_shft));
-
- workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p1), q4);
- workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q1), q2);
- workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
- _mm_storel_epi64((__m128i *)&flat_oq2[i*8],
- _mm_packus_epi16(workp_shft, workp_shft));
-
- src += 8;
- } while (++i < count);
- }
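  // Expanding the running workp_a/workp_b sums above, each flat output is an
  // 8-tap average with +4 rounding and a >> 3, for example:
  //   flat_op2 = (p4*2 + p3 + p2*2 + p1 + p0 + q0 + 4) >> 3
  //   flat_op1 = (p4 + p3 + p2 + p1*2 + p0 + q0 + q1 + 4) >> 3
  //   flat_op0 = (p3 + p2 + p1 + p0*2 + q0 + q1 + q2 + 4) >> 3
  //   flat_oq0 = (p2 + p1 + p0 + q0*2 + q1 + q2 + q3 + 4) >> 3
  // with flat_oq1/flat_oq2 mirroring flat_op1/flat_op2 about the edge.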
- // lp filter
- {
- const __m128i t4 = _mm_set1_epi8(4);
- const __m128i t3 = _mm_set1_epi8(3);
- const __m128i t80 = _mm_set1_epi8(0x80);
- const __m128i te0 = _mm_set1_epi8(0xe0);
- const __m128i t1f = _mm_set1_epi8(0x1f);
- const __m128i t1 = _mm_set1_epi8(0x1);
- const __m128i t7f = _mm_set1_epi8(0x7f);
-
- const __m128i ps1 = _mm_xor_si128(_mm_loadu_si128((__m128i *)(s - 2 * p)),
- t80);
- const __m128i ps0 = _mm_xor_si128(_mm_loadu_si128((__m128i *)(s - 1 * p)),
- t80);
- const __m128i qs0 = _mm_xor_si128(_mm_loadu_si128((__m128i *)(s + 0 * p)),
- t80);
- const __m128i qs1 = _mm_xor_si128(_mm_loadu_si128((__m128i *)(s + 1 * p)),
- t80);
- __m128i filt;
- __m128i work_a;
- __m128i filter1, filter2;
-
- filt = _mm_and_si128(_mm_subs_epi8(ps1, qs1), hev);
- work_a = _mm_subs_epi8(qs0, ps0);
- filt = _mm_adds_epi8(filt, work_a);
- filt = _mm_adds_epi8(filt, work_a);
- filt = _mm_adds_epi8(filt, work_a);
- /* (vp9_filter + 3 * (qs0 - ps0)) & mask */
- filt = _mm_and_si128(filt, mask);
-
- filter1 = _mm_adds_epi8(filt, t4);
- filter2 = _mm_adds_epi8(filt, t3);
-
- /* Filter1 >> 3 */
- work_a = _mm_cmpgt_epi8(zero, filter1);
- filter1 = _mm_srli_epi16(filter1, 3);
- work_a = _mm_and_si128(work_a, te0);
- filter1 = _mm_and_si128(filter1, t1f);
- filter1 = _mm_or_si128(filter1, work_a);
-
- /* Filter2 >> 3 */
- work_a = _mm_cmpgt_epi8(zero, filter2);
- filter2 = _mm_srli_epi16(filter2, 3);
- work_a = _mm_and_si128(work_a, te0);
- filter2 = _mm_and_si128(filter2, t1f);
- filter2 = _mm_or_si128(filter2, work_a);
-
- /* filt >> 1 */
- filt = _mm_adds_epi8(filter1, t1);
- work_a = _mm_cmpgt_epi8(zero, filt);
- filt = _mm_srli_epi16(filt, 1);
- work_a = _mm_and_si128(work_a, t80);
- filt = _mm_and_si128(filt, t7f);
- filt = _mm_or_si128(filt, work_a);
-
- filt = _mm_andnot_si128(hev, filt);
-
- work_a = _mm_xor_si128(_mm_subs_epi8(qs0, filter1), t80);
- q0 = _mm_load_si128((__m128i *)flat_oq0);
- work_a = _mm_andnot_si128(flat, work_a);
- q0 = _mm_and_si128(flat, q0);
- q0 = _mm_or_si128(work_a, q0);
-
- work_a = _mm_xor_si128(_mm_subs_epi8(qs1, filt), t80);
- q1 = _mm_load_si128((__m128i *)flat_oq1);
- work_a = _mm_andnot_si128(flat, work_a);
- q1 = _mm_and_si128(flat, q1);
- q1 = _mm_or_si128(work_a, q1);
-
- work_a = _mm_loadu_si128((__m128i *)(s + 2 * p));
- q2 = _mm_load_si128((__m128i *)flat_oq2);
- work_a = _mm_andnot_si128(flat, work_a);
- q2 = _mm_and_si128(flat, q2);
- q2 = _mm_or_si128(work_a, q2);
-
- work_a = _mm_xor_si128(_mm_adds_epi8(ps0, filter2), t80);
- p0 = _mm_load_si128((__m128i *)flat_op0);
- work_a = _mm_andnot_si128(flat, work_a);
- p0 = _mm_and_si128(flat, p0);
- p0 = _mm_or_si128(work_a, p0);
-
- work_a = _mm_xor_si128(_mm_adds_epi8(ps1, filt), t80);
- p1 = _mm_load_si128((__m128i *)flat_op1);
- work_a = _mm_andnot_si128(flat, work_a);
- p1 = _mm_and_si128(flat, p1);
- p1 = _mm_or_si128(work_a, p1);
-
- work_a = _mm_loadu_si128((__m128i *)(s - 3 * p));
- p2 = _mm_load_si128((__m128i *)flat_op2);
- work_a = _mm_andnot_si128(flat, work_a);
- p2 = _mm_and_si128(flat, p2);
- p2 = _mm_or_si128(work_a, p2);
-
- if (count == 1) {
- _mm_storel_epi64((__m128i *)(s - 3 * p), p2);
- _mm_storel_epi64((__m128i *)(s - 2 * p), p1);
- _mm_storel_epi64((__m128i *)(s - 1 * p), p0);
- _mm_storel_epi64((__m128i *)(s + 0 * p), q0);
- _mm_storel_epi64((__m128i *)(s + 1 * p), q1);
- _mm_storel_epi64((__m128i *)(s + 2 * p), q2);
- } else {
- _mm_storeu_si128((__m128i *)(s - 3 * p), p2);
- _mm_storeu_si128((__m128i *)(s - 2 * p), p1);
- _mm_storeu_si128((__m128i *)(s - 1 * p), p0);
- _mm_storeu_si128((__m128i *)(s + 0 * p), q0);
- _mm_storeu_si128((__m128i *)(s + 1 * p), q1);
- _mm_storeu_si128((__m128i *)(s + 2 * p), q2);
- }
- }
-}
-
-static __inline void transpose(unsigned char *src[], int in_p,
- unsigned char *dst[], int out_p,
- int num_8x8_to_transpose) {
- int idx8x8 = 0;
- __m128i x0, x1, x2, x3, x4, x5, x6, x7;
- do {
- unsigned char *in = src[idx8x8];
- unsigned char *out = dst[idx8x8];
-
- x0 = _mm_loadl_epi64((__m128i *)(in + 0*in_p)); // 00 01 02 03 04 05 06 07
- x1 = _mm_loadl_epi64((__m128i *)(in + 1*in_p)); // 10 11 12 13 14 15 16 17
- x2 = _mm_loadl_epi64((__m128i *)(in + 2*in_p)); // 20 21 22 23 24 25 26 27
- x3 = _mm_loadl_epi64((__m128i *)(in + 3*in_p)); // 30 31 32 33 34 35 36 37
- x4 = _mm_loadl_epi64((__m128i *)(in + 4*in_p)); // 40 41 42 43 44 45 46 47
- x5 = _mm_loadl_epi64((__m128i *)(in + 5*in_p)); // 50 51 52 53 54 55 56 57
- x6 = _mm_loadl_epi64((__m128i *)(in + 6*in_p)); // 60 61 62 63 64 65 66 67
- x7 = _mm_loadl_epi64((__m128i *)(in + 7*in_p)); // 70 71 72 73 74 75 76 77
- // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17
- x0 = _mm_unpacklo_epi8(x0, x1);
- // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37
- x1 = _mm_unpacklo_epi8(x2, x3);
- // 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57
- x2 = _mm_unpacklo_epi8(x4, x5);
- // 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77
- x3 = _mm_unpacklo_epi8(x6, x7);
- // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
- x4 = _mm_unpacklo_epi16(x0, x1);
- // 40 50 60 70 41 51 61 71 42 52 62 72 43 53 63 73
- x5 = _mm_unpacklo_epi16(x2, x3);
- // 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71
- x6 = _mm_unpacklo_epi32(x4, x5);
- // 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73
- x7 = _mm_unpackhi_epi32(x4, x5);
-
- _mm_storel_pd((double *)(out + 0*out_p),
- _mm_castsi128_pd(x6)); // 00 10 20 30 40 50 60 70
- _mm_storeh_pd((double *)(out + 1*out_p),
- _mm_castsi128_pd(x6)); // 01 11 21 31 41 51 61 71
- _mm_storel_pd((double *)(out + 2*out_p),
- _mm_castsi128_pd(x7)); // 02 12 22 32 42 52 62 72
- _mm_storeh_pd((double *)(out + 3*out_p),
- _mm_castsi128_pd(x7)); // 03 13 23 33 43 53 63 73
-
- // 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37
- x4 = _mm_unpackhi_epi16(x0, x1);
- // 44 54 64 74 45 55 65 75 46 56 66 76 47 57 67 77
- x5 = _mm_unpackhi_epi16(x2, x3);
- // 04 14 24 34 44 54 64 74 05 15 25 35 45 55 65 75
- x6 = _mm_unpacklo_epi32(x4, x5);
- // 06 16 26 36 46 56 66 76 07 17 27 37 47 57 67 77
- x7 = _mm_unpackhi_epi32(x4, x5);
-
- _mm_storel_pd((double *)(out + 4*out_p),
- _mm_castsi128_pd(x6)); // 04 14 24 34 44 54 64 74
- _mm_storeh_pd((double *)(out + 5*out_p),
- _mm_castsi128_pd(x6)); // 05 15 25 35 45 55 65 75
- _mm_storel_pd((double *)(out + 6*out_p),
- _mm_castsi128_pd(x7)); // 06 16 26 36 46 56 66 76
- _mm_storeh_pd((double *)(out + 7*out_p),
- _mm_castsi128_pd(x7)); // 07 17 27 37 47 57 67 77
- } while (++idx8x8 < num_8x8_to_transpose);
-}
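For reference, the unpack/store sequence above is a plain 8x8 byte transpose
applied to each tile; a scalar equivalent (illustrative sketch only, the
helper name is hypothetical) is:

static void transpose_8x8_scalar(const unsigned char *in, int in_p,
                                 unsigned char *out, int out_p) {
  int i, j;
  for (i = 0; i < 8; ++i)
    for (j = 0; j < 8; ++j)
      out[j * out_p + i] = in[i * in_p + j];  // output row j = input column j
}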
-
-void vp9_mbloop_filter_vertical_edge_c_sse2(unsigned char *s,
- int p,
- const unsigned char *blimit,
- const unsigned char *limit,
- const unsigned char *thresh,
- int count) {
- DECLARE_ALIGNED(16, unsigned char, t_dst[16 * 16]);
- unsigned char *src[4];
- unsigned char *dst[4];
-
- src[0] = s - 5;
- src[1] = s - 5 + 8;
- src[2] = s - 5 + p*8;
- src[3] = s - 5 + p*8 + 8;
-
- dst[0] = t_dst;
- dst[1] = t_dst + 16*8;
- dst[2] = t_dst + 8;
- dst[3] = t_dst + 16*8 + 8;
-
- // 16x16->16x16 or 16x8->8x16
- transpose(src, p, dst, 16, (1 << count));
-
- vp9_mbloop_filter_horizontal_edge_c_sse2(t_dst + 5*16, 16, blimit, limit,
- thresh, count);
-
- dst[0] = s - 5;
- dst[1] = s - 5 + p*8;
-
- src[0] = t_dst;
- src[1] = t_dst + 8;
-
- // 16x8->8x16 or 8x8->8x8
- transpose(src, 16, dst, p, (1 << (count - 1)));
-}
-
-/* Horizontal MB filtering */
-void vp9_loop_filter_mbh_sse2(unsigned char *y_ptr,
- unsigned char *u_ptr, unsigned char *v_ptr,
- int y_stride, int uv_stride,
- struct loop_filter_info *lfi) {
- vp9_mbloop_filter_horizontal_edge_c_sse2(y_ptr, y_stride, lfi->mblim,
- lfi->lim, lfi->hev_thr, 2);
-
- /* TODO: write sse2 version with u,v interleaved */
- if (u_ptr)
- vp9_mbloop_filter_horizontal_edge_c_sse2(u_ptr, uv_stride, lfi->mblim,
- lfi->lim, lfi->hev_thr, 1);
-
- if (v_ptr)
- vp9_mbloop_filter_horizontal_edge_c_sse2(v_ptr, uv_stride, lfi->mblim,
- lfi->lim, lfi->hev_thr, 1);
-}
-
-void vp9_loop_filter_bh8x8_sse2(unsigned char *y_ptr, unsigned char *u_ptr,
- unsigned char *v_ptr, int y_stride, int uv_stride,
- struct loop_filter_info *lfi) {
- vp9_mbloop_filter_horizontal_edge_c_sse2(
- y_ptr + 8 * y_stride, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
-}
-
-/* Vertical MB Filtering */
-void vp9_loop_filter_mbv_sse2(unsigned char *y_ptr, unsigned char *u_ptr,
- unsigned char *v_ptr, int y_stride, int uv_stride,
- struct loop_filter_info *lfi) {
- vp9_mbloop_filter_vertical_edge_c_sse2(y_ptr, y_stride, lfi->mblim, lfi->lim,
- lfi->hev_thr, 2);
-
- /* TODO: write sse2 version with u,v interleaved */
- if (u_ptr)
- vp9_mbloop_filter_vertical_edge_c_sse2(u_ptr, uv_stride, lfi->mblim,
- lfi->lim, lfi->hev_thr, 1);
-
- if (v_ptr)
- vp9_mbloop_filter_vertical_edge_c_sse2(v_ptr, uv_stride, lfi->mblim,
- lfi->lim, lfi->hev_thr, 1);
-}
-
-void vp9_loop_filter_bv8x8_sse2(unsigned char *y_ptr, unsigned char *u_ptr,
- unsigned char *v_ptr, int y_stride, int uv_stride,
- struct loop_filter_info *lfi) {
- vp9_mbloop_filter_vertical_edge_c_sse2(
- y_ptr + 8, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
-}
-
-/* Horizontal B Filtering */
-void vp9_loop_filter_bh_sse2(unsigned char *y_ptr,
- unsigned char *u_ptr, unsigned char *v_ptr,
- int y_stride, int uv_stride,
- struct loop_filter_info *lfi) {
- vp9_loop_filter_horizontal_edge_sse2(y_ptr + 4 * y_stride, y_stride,
- lfi->blim, lfi->lim, lfi->hev_thr, 2);
- vp9_loop_filter_horizontal_edge_sse2(y_ptr + 8 * y_stride, y_stride,
- lfi->blim, lfi->lim, lfi->hev_thr, 2);
- vp9_loop_filter_horizontal_edge_sse2(y_ptr + 12 * y_stride, y_stride,
- lfi->blim, lfi->lim, lfi->hev_thr, 2);
-
- if (u_ptr)
- vp9_loop_filter_horizontal_edge_uv_sse2(u_ptr + 4 * uv_stride, uv_stride,
- lfi->blim, lfi->lim, lfi->hev_thr,
- v_ptr + 4 * uv_stride);
-}
-
-void vp9_loop_filter_bhs_sse2(unsigned char *y_ptr, int y_stride,
- const unsigned char *blimit) {
- vp9_loop_filter_simple_horizontal_edge_sse2(y_ptr + 4 * y_stride,
- y_stride, blimit);
- vp9_loop_filter_simple_horizontal_edge_sse2(y_ptr + 8 * y_stride,
- y_stride, blimit);
- vp9_loop_filter_simple_horizontal_edge_sse2(y_ptr + 12 * y_stride,
- y_stride, blimit);
-}
-
-/* Vertical B Filtering */
-void vp9_loop_filter_bv_sse2(unsigned char *y_ptr,
- unsigned char *u_ptr, unsigned char *v_ptr,
- int y_stride, int uv_stride,
- struct loop_filter_info *lfi) {
- vp9_loop_filter_vertical_edge_sse2(y_ptr + 4, y_stride,
- lfi->blim, lfi->lim, lfi->hev_thr, 2);
- vp9_loop_filter_vertical_edge_sse2(y_ptr + 8, y_stride,
- lfi->blim, lfi->lim, lfi->hev_thr, 2);
- vp9_loop_filter_vertical_edge_sse2(y_ptr + 12, y_stride,
- lfi->blim, lfi->lim, lfi->hev_thr, 2);
-
- if (u_ptr)
- vp9_loop_filter_vertical_edge_uv_sse2(u_ptr + 4, uv_stride,
- lfi->blim, lfi->lim, lfi->hev_thr,
- v_ptr + 4);
-}
-
-void vp9_loop_filter_bvs_sse2(unsigned char *y_ptr, int y_stride,
- const unsigned char *blimit) {
- vp9_loop_filter_simple_vertical_edge_sse2(y_ptr + 4, y_stride, blimit);
- vp9_loop_filter_simple_vertical_edge_sse2(y_ptr + 8, y_stride, blimit);
- vp9_loop_filter_simple_vertical_edge_sse2(y_ptr + 12, y_stride, blimit);
-}
-
-#endif
--- a/vp8/common/x86/loopfilter_x86.h
+++ /dev/null
@@ -1,43 +1,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#ifndef LOOPFILTER_X86_H
-#define LOOPFILTER_X86_H
-
-/* Note:
- *
- * This platform is commonly built for runtime CPU detection. If you modify
- * any of the function mappings present in this file, be sure to also update
- * them in the function pointer initialization code
- */
-
-#if HAVE_MMX
-extern prototype_loopfilter_block(vp9_loop_filter_mbv_mmx);
-extern prototype_loopfilter_block(vp9_loop_filter_bv_mmx);
-extern prototype_loopfilter_block(vp9_loop_filter_mbh_mmx);
-extern prototype_loopfilter_block(vp9_loop_filter_bh_mmx);
-extern prototype_simple_loopfilter(vp9_loop_filter_simple_vertical_edge_mmx);
-extern prototype_simple_loopfilter(vp9_loop_filter_bvs_mmx);
-extern prototype_simple_loopfilter(vp9_loop_filter_simple_horizontal_edge_mmx);
-extern prototype_simple_loopfilter(vp9_loop_filter_bhs_mmx);
-#endif
-
-#if HAVE_SSE2
-extern prototype_loopfilter_block(vp9_loop_filter_mbv_sse2);
-extern prototype_loopfilter_block(vp9_loop_filter_bv_sse2);
-extern prototype_loopfilter_block(vp9_loop_filter_mbh_sse2);
-extern prototype_loopfilter_block(vp9_loop_filter_bh_sse2);
-extern prototype_simple_loopfilter(vp9_loop_filter_simple_vertical_edge_sse2);
-extern prototype_simple_loopfilter(vp9_loop_filter_bvs_sse2);
-extern prototype_simple_loopfilter(vp9_loop_filter_simple_horizontal_edge_sse2);
-extern prototype_simple_loopfilter(vp9_loop_filter_bhs_sse2);
-#endif
-
-#endif // LOOPFILTER_X86_H
--- a/vp8/common/x86/mask_sse3.asm
+++ /dev/null
@@ -1,484 +1,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
-%include "vpx_ports/x86_abi_support.asm"
-
-;void vp8_makemask_sse3(
-; unsigned char *y,
-; unsigned char *u,
-; unsigned char *v,
-; unsigned char *ym,
-; unsigned char *uvm,
-; int yp,
-; int uvp,
-; int ys,
-; int us,
-; int vs,
-; int yt,
-; int ut,
-; int vt)
-global sym(vp8_makemask_sse3)
-sym(vp8_makemask_sse3):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 14
- push rsi
- push rdi
- ; end prolog
-
- mov rsi, arg(0) ;y
- mov rdi, arg(1) ;u
- mov rcx, arg(2) ;v
- mov rax, arg(3) ;ym
- movsxd rbx, dword arg(4) ;yp
- movsxd rdx, dword arg(5) ;uvp
-
- pxor xmm0,xmm0
-
- ;make 16 copies of the center y value
- movd xmm1, arg(6)
- pshufb xmm1, xmm0
-
- ; make 16 copies of the center u value
- movd xmm2, arg(7)
- pshufb xmm2, xmm0
-
- ; make 16 copies of the center v value
- movd xmm3, arg(8)
- pshufb xmm3, xmm0
- unpcklpd xmm2, xmm3
-
- ;make 16 copies of the y tolerance
- movd xmm3, arg(9)
- pshufb xmm3, xmm0
-
- ;make 16 copies of the u tolerance
- movd xmm4, arg(10)
- pshufb xmm4, xmm0
-
- ;make 16 copies of the v tolerance
- movd xmm5, arg(11)
- pshufb xmm5, xmm0
- unpckhpd xmm4, xmm5
-
- mov r8,8
-
-NextPairOfRows:
-
- ;grab the y source values
- movdqu xmm0, [rsi]
-
- ;compute abs difference between source and y target
- movdqa xmm6, xmm1
- movdqa xmm7, xmm0
- psubusb xmm0, xmm1
- psubusb xmm6, xmm7
- por xmm0, xmm6
-
-    ;check whether the number is < y tolerance
- movdqa xmm6, xmm3
- pcmpgtb xmm6, xmm0
-
- ;grab the y source values
- add rsi, rbx
- movdqu xmm0, [rsi]
-
- ;compute abs difference between source and y target
- movdqa xmm11, xmm1
- movdqa xmm7, xmm0
- psubusb xmm0, xmm1
- psubusb xmm11, xmm7
- por xmm0, xmm11
-
-    ;check whether the number is < y tolerance
- movdqa xmm11, xmm3
- pcmpgtb xmm11, xmm0
-
-
- ;grab the u and v source values
- movdqu xmm7, [rdi]
- movdqu xmm8, [rcx]
- unpcklpd xmm7, xmm8
-
- ;compute abs difference between source and uv targets
- movdqa xmm9, xmm2
- movdqa xmm10, xmm7
- psubusb xmm7, xmm2
- psubusb xmm9, xmm10
- por xmm7, xmm9
-
- ;check whether the number is < tolerance
- movdqa xmm0, xmm4
- pcmpgtb xmm0, xmm7
-
- ;double u and v masks
- movdqa xmm8, xmm0
- punpckhbw xmm0, xmm0
- punpcklbw xmm8, xmm8
-
- ;mask row 0 and output
- pand xmm6, xmm8
- pand xmm6, xmm0
- movdqa [rax],xmm6
-
- ;mask row 1 and output
- pand xmm11, xmm8
- pand xmm11, xmm0
- movdqa [rax+16],xmm11
-
-
- ; to the next row or set of rows
- add rsi, rbx
- add rdi, rdx
- add rcx, rdx
- add rax,32
- dec r8
- jnz NextPairOfRows
-
-
- ; begin epilog
- pop rdi
- pop rsi
- UNSHADOW_ARGS
- pop rbp
- ret
-
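Judging from the argument comments and the byte compares in the row loop,
vp8_makemask_sse3 builds a luma-resolution mask that is 0xff only where y, u
and v all lie within their tolerances of the chosen centre colour, each chroma
sample covering a 2x2 block of luma positions. A rough scalar model (helper
name hypothetical, signed-byte compare corner cases ignored):

#include <stdlib.h>

static void makemask_sketch(const unsigned char *y, const unsigned char *u,
                            const unsigned char *v, unsigned char *ym,
                            int yp, int uvp,
                            int yc, int uc, int vc, int yt, int ut, int vt) {
  int r, c;
  for (r = 0; r < 16; ++r) {
    for (c = 0; c < 16; ++c) {
      int ok = abs(y[r * yp + c] - yc) < yt &&
               abs(u[(r >> 1) * uvp + (c >> 1)] - uc) < ut &&
               abs(v[(r >> 1) * uvp + (c >> 1)] - vc) < vt;
      ym[r * 16 + c] = ok ? 0xff : 0;  // 0xff marks pixels inside the mask
    }
  }
}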
-;GROW_HORIZ (register for result, source register or mem local)
-; takes source and shifts left and ors with source
-; then shifts right and ors with source
-%macro GROW_HORIZ 2
- movdqa %1, %2
- movdqa xmm14, %1
- movdqa xmm15, %1
- pslldq xmm14, 1
- psrldq xmm15, 1
- por %1,xmm14
- por %1,xmm15
-%endmacro
-;GROW_VERT (result, center row, above row, below row)
-%macro GROW_VERT 4
- movdqa %1,%2
- por %1,%3
- por %1,%4
-%endmacro
-
-;GROW_NEXTLINE (new line to grow, new source, line to write)
-%macro GROW_NEXTLINE 3
- GROW_HORIZ %1, %2
- GROW_VERT xmm3, xmm0, xmm1, xmm2
- movdqa %3,xmm3
-%endmacro
-
-
-;void vp8_growmaskmb_sse3(
-; unsigned char *om,
-; unsigned char *nm)
-global sym(vp8_growmaskmb_sse3)
-sym(vp8_growmaskmb_sse3):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 2
- push rsi
- push rdi
- ; end prolog
-
- mov rsi, arg(0) ;src
- mov rdi, arg(1) ;rst
-
- GROW_HORIZ xmm0, [rsi]
- GROW_HORIZ xmm1, [rsi+16]
- GROW_HORIZ xmm2, [rsi+32]
-
- GROW_VERT xmm3, xmm0, xmm1, xmm2
- por xmm0,xmm1
- movdqa [rdi], xmm0
- movdqa [rdi+16],xmm3
-
- GROW_NEXTLINE xmm0,[rsi+48],[rdi+32]
- GROW_NEXTLINE xmm1,[rsi+64],[rdi+48]
- GROW_NEXTLINE xmm2,[rsi+80],[rdi+64]
- GROW_NEXTLINE xmm0,[rsi+96],[rdi+80]
- GROW_NEXTLINE xmm1,[rsi+112],[rdi+96]
- GROW_NEXTLINE xmm2,[rsi+128],[rdi+112]
- GROW_NEXTLINE xmm0,[rsi+144],[rdi+128]
- GROW_NEXTLINE xmm1,[rsi+160],[rdi+144]
- GROW_NEXTLINE xmm2,[rsi+176],[rdi+160]
- GROW_NEXTLINE xmm0,[rsi+192],[rdi+176]
- GROW_NEXTLINE xmm1,[rsi+208],[rdi+192]
- GROW_NEXTLINE xmm2,[rsi+224],[rdi+208]
- GROW_NEXTLINE xmm0,[rsi+240],[rdi+224]
-
- por xmm0,xmm2
- movdqa [rdi+240], xmm0
-
- ; begin epilog
- pop rdi
- pop rsi
- UNSHADOW_ARGS
- pop rbp
- ret
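Per the GROW_HORIZ/GROW_VERT comments, vp8_growmaskmb_sse3 dilates a 16x16
byte mask: each output byte is the OR of its 3x3 neighbourhood, clamped at the
block borders. A scalar sketch (hypothetical helper name):

static void grow_mask_16x16_sketch(const unsigned char *om, unsigned char *nm) {
  int r, c, dr, dc;
  for (r = 0; r < 16; ++r) {
    for (c = 0; c < 16; ++c) {
      unsigned char v = 0;
      for (dr = -1; dr <= 1; ++dr)
        for (dc = -1; dc <= 1; ++dc) {
          int rr = r + dr, cc = c + dc;
          if (rr >= 0 && rr < 16 && cc >= 0 && cc < 16)
            v |= om[rr * 16 + cc];  // OR in every in-range neighbour
        }
      nm[r * 16 + c] = v;
    }
  }
}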
-
-
-
-;unsigned int vp8_sad16x16_masked_wmt(
-; unsigned char *src_ptr,
-; int src_stride,
-; unsigned char *ref_ptr,
-; int ref_stride,
-; unsigned char *mask)
-global sym(vp8_sad16x16_masked_wmt)
-sym(vp8_sad16x16_masked_wmt):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 5
- push rsi
- push rdi
- ; end prolog
- mov rsi, arg(0) ;src_ptr
- mov rdi, arg(2) ;ref_ptr
-
- mov rbx, arg(4) ;mask
- movsxd rax, dword ptr arg(1) ;src_stride
- movsxd rdx, dword ptr arg(3) ;ref_stride
-
- mov rcx, 16
-
- pxor xmm3, xmm3
-
-NextSadRow:
- movdqu xmm0, [rsi]
- movdqu xmm1, [rdi]
- movdqu xmm2, [rbx]
- pand xmm0, xmm2
- pand xmm1, xmm2
-
- psadbw xmm0, xmm1
- paddw xmm3, xmm0
-
- add rsi, rax
- add rdi, rdx
- add rbx, 16
-
- dec rcx
- jnz NextSadRow
-
- movdqa xmm4 , xmm3
- psrldq xmm4, 8
- paddw xmm3, xmm4
- movq rax, xmm3
- ; begin epilog
- pop rdi
- pop rsi
- UNSHADOW_ARGS
- pop rbp
- ret
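The loop above masks both source and reference before taking absolute
differences; in scalar terms (illustrative sketch, helper name hypothetical):

static unsigned int sad16x16_masked_sketch(const unsigned char *src,
                                           int src_stride,
                                           const unsigned char *ref,
                                           int ref_stride,
                                           const unsigned char *mask) {
  unsigned int sad = 0;
  int r, c;
  for (r = 0; r < 16; ++r) {
    for (c = 0; c < 16; ++c) {
      int d = (src[c] & mask[c]) - (ref[c] & mask[c]);
      sad += (unsigned int)(d < 0 ? -d : d);
    }
    src += src_stride;
    ref += ref_stride;
    mask += 16;  // mask rows are stored contiguously, 16 bytes apart
  }
  return sad;
}

The unmasked variant that follows ORs with the mask instead, so fully masked
positions compare equal and only the complementary region contributes.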
-
-
-;unsigned int vp8_sad16x16_unmasked_wmt(
-; unsigned char *src_ptr,
-; int src_stride,
-; unsigned char *ref_ptr,
-; int ref_stride,
-; unsigned char *mask)
-global sym(vp8_sad16x16_unmasked_wmt)
-sym(vp8_sad16x16_unmasked_wmt):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 5
- push rsi
- push rdi
- ; end prolog
- mov rsi, arg(0) ;src_ptr
- mov rdi, arg(2) ;ref_ptr
-
- mov rbx, arg(4) ;mask
- movsxd rax, dword ptr arg(1) ;src_stride
- movsxd rdx, dword ptr arg(3) ;ref_stride
-
- mov rcx, 16
-
- pxor xmm3, xmm3
-
-next_vp8_sad16x16_unmasked_wmt:
- movdqu xmm0, [rsi]
- movdqu xmm1, [rdi]
- movdqu xmm2, [rbx]
- por xmm0, xmm2
- por xmm1, xmm2
-
- psadbw xmm0, xmm1
- paddw xmm3, xmm0
-
- add rsi, rax
- add rdi, rdx
- add rbx, 16
-
- dec rcx
- jnz next_vp8_sad16x16_unmasked_wmt
-
- movdqa xmm4 , xmm3
- psrldq xmm4, 8
- paddw xmm3, xmm4
- movq rax, xmm3
- ; begin epilog
- pop rdi
- pop rsi
- UNSHADOW_ARGS
- pop rbp
- ret
-
-
-;unsigned int vp8_masked_predictor_wmt(
-; unsigned char *masked,
-; unsigned char *unmasked,
-; int src_stride,
-; unsigned char *dst_ptr,
-; int dst_stride,
-; unsigned char *mask)
-global sym(vp8_masked_predictor_wmt)
-sym(vp8_masked_predictor_wmt):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 6
- push rsi
- push rdi
- ; end prolog
- mov rsi, arg(0) ;src_ptr
- mov rdi, arg(1) ;ref_ptr
-
- mov rbx, arg(5) ;mask
- movsxd rax, dword ptr arg(2) ;src_stride
- mov r11, arg(3) ; destination
- movsxd rdx, dword ptr arg(4) ;dst_stride
-
- mov rcx, 16
-
- pxor xmm3, xmm3
-
-next_vp8_masked_predictor_wmt:
- movdqu xmm0, [rsi]
- movdqu xmm1, [rdi]
- movdqu xmm2, [rbx]
-
- pand xmm0, xmm2
- pandn xmm2, xmm1
- por xmm0, xmm2
- movdqu [r11], xmm0
-
- add r11, rdx
- add rsi, rax
- add rdi, rdx
- add rbx, 16
-
- dec rcx
- jnz next_vp8_masked_predictor_wmt
-
- ; begin epilog
- pop rdi
- pop rsi
- UNSHADOW_ARGS
- pop rbp
- ret
-
-;unsigned int vp8_masked_predictor_uv_wmt(
-; unsigned char *masked,
-; unsigned char *unmasked,
-; int src_stride,
-; unsigned char *dst_ptr,
-; int dst_stride,
-; unsigned char *mask)
-global sym(vp8_masked_predictor_uv_wmt)
-sym(vp8_masked_predictor_uv_wmt):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 6
- push rsi
- push rdi
- ; end prolog
- mov rsi, arg(0) ;src_ptr
- mov rdi, arg(1) ;ref_ptr
-
- mov rbx, arg(5) ;mask
- movsxd rax, dword ptr arg(2) ;src_stride
- mov r11, arg(3) ; destination
- movsxd rdx, dword ptr arg(4) ;dst_stride
-
- mov rcx, 8
-
- pxor xmm3, xmm3
-
-next_vp8_masked_predictor_uv_wmt:
- movq xmm0, [rsi]
- movq xmm1, [rdi]
- movq xmm2, [rbx]
-
- pand xmm0, xmm2
- pandn xmm2, xmm1
- por xmm0, xmm2
- movq [r11], xmm0
-
- add r11, rdx
- add rsi, rax
- add rdi, rax
- add rbx, 8
-
- dec rcx
- jnz next_vp8_masked_predictor_uv_wmt
-
- ; begin epilog
- pop rdi
- pop rsi
- UNSHADOW_ARGS
- pop rbp
- ret
-
-
-;unsigned int vp8_uv_from_y_mask(
-; unsigned char *ymask,
-; unsigned char *uvmask)
-global sym(vp8_uv_from_y_mask)
-sym(vp8_uv_from_y_mask):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 6
- push rsi
- push rdi
- ; end prolog
- mov rsi, arg(0) ;src_ptr
- mov rdi, arg(1) ;dst_ptr
-
-
- mov rcx, 8
-
- pxor xmm3, xmm3
-
-next_p8_uv_from_y_mask:
- movdqu xmm0, [rsi]
- pshufb xmm0, [shuf1b] ;[GLOBAL(shuf1b)]
- movq [rdi],xmm0
- add rdi, 8
- add rsi,32
-
- dec rcx
- jnz next_p8_uv_from_y_mask
-
- ; begin epilog
- pop rdi
- pop rsi
- UNSHADOW_ARGS
- pop rbp
- ret
-
-SECTION_RODATA
-align 16
-shuf1b:
- db 0, 2, 4, 6, 8, 10, 12, 14, 0, 0, 0, 0, 0, 0, 0, 0
-
--- a/vp8/common/x86/postproc_mmx.asm
+++ /dev/null
@@ -1,534 +1,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
-%include "vpx_ports/x86_abi_support.asm"
-
-%define VP9_FILTER_WEIGHT 128
-%define VP9_FILTER_SHIFT 7
-
-;void vp9_post_proc_down_and_across_mmx
-;(
-; unsigned char *src_ptr,
-; unsigned char *dst_ptr,
-; int src_pixels_per_line,
-; int dst_pixels_per_line,
-; int rows,
-; int cols,
-; int flimit
-;)
-global sym(vp9_post_proc_down_and_across_mmx)
-sym(vp9_post_proc_down_and_across_mmx):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 7
- GET_GOT rbx
- push rsi
- push rdi
- ; end prolog
-
-%if ABI_IS_32BIT=1 && CONFIG_PIC=1
- ; move the global rd onto the stack, since we don't have enough registers
- ; to do PIC addressing
- movq mm0, [GLOBAL(rd)]
- sub rsp, 8
- movq [rsp], mm0
-%define RD [rsp]
-%else
-%define RD [GLOBAL(rd)]
-%endif
-
- push rbx
- lea rbx, [GLOBAL(Blur)]
- movd mm2, dword ptr arg(6) ;flimit
- punpcklwd mm2, mm2
- punpckldq mm2, mm2
-
- mov rsi, arg(0) ;src_ptr
- mov rdi, arg(1) ;dst_ptr
-
- movsxd rcx, DWORD PTR arg(4) ;rows
- movsxd rax, DWORD PTR arg(2) ;src_pixels_per_line ; destination pitch?
- pxor mm0, mm0 ; mm0 = 00000000
-
-.nextrow:
-
- xor rdx, rdx ; clear out rdx for use as loop counter
-.nextcol:
-
- pxor mm7, mm7 ; mm7 = 00000000
- movq mm6, [rbx + 32 ] ; mm6 = kernel 2 taps
-        movq        mm3, [rsi]              ; mm3 = r0 p0..p7
- punpcklbw mm3, mm0 ; mm3 = p0..p3
- movq mm1, mm3 ; mm1 = p0..p3
- pmullw mm3, mm6 ; mm3 *= kernel 2 modifiers
-
- movq mm6, [rbx + 48] ; mm6 = kernel 3 taps
-        movq        mm5, [rsi + rax]        ; mm5 = r1 p0..p7
- punpcklbw mm5, mm0 ; mm5 = r1 p0..p3
- pmullw mm6, mm5 ; mm6 *= p0..p3 * kernel 3 modifiers
- paddusw mm3, mm6 ; mm3 += mm6
-
- ; thresholding
- movq mm7, mm1 ; mm7 = r0 p0..p3
- psubusw mm7, mm5 ; mm7 = r0 p0..p3 - r1 p0..p3
- psubusw mm5, mm1 ; mm5 = r1 p0..p3 - r0 p0..p3
- paddusw mm7, mm5 ; mm7 = abs(r0 p0..p3 - r1 p0..p3)
- pcmpgtw mm7, mm2
-
- movq mm6, [rbx + 64 ] ; mm6 = kernel 4 modifiers
-        movq        mm5, [rsi + 2*rax]      ; mm5 = r2 p0..p7
- punpcklbw mm5, mm0 ; mm5 = r2 p0..p3
-        pmullw      mm6, mm5                ; mm6 *= kernel 4 modifiers
-        paddusw     mm3, mm6                ; mm3 += mm6
-
- ; thresholding
- movq mm6, mm1 ; mm6 = r0 p0..p3
- psubusw mm6, mm5 ; mm6 = r0 p0..p3 - r2 p0..p3
-        psubusw     mm5, mm1                ; mm5 = r2 p0..p3 - r0 p0..p3
- paddusw mm6, mm5 ; mm6 = abs(r0 p0..p3 - r2 p0..p3)
- pcmpgtw mm6, mm2
- por mm7, mm6 ; accumulate thresholds
-
-
- neg rax
- movq mm6, [rbx ] ; kernel 0 taps
-        movq        mm5, [rsi+2*rax]        ; mm5 = r-2 p0..p7
- punpcklbw mm5, mm0 ; mm5 = r-2 p0..p3
- pmullw mm6, mm5 ; mm5 *= kernel 0 modifiers
- paddusw mm3, mm6 ; mm3 += mm5
-
- ; thresholding
- movq mm6, mm1 ; mm6 = r0 p0..p3
- psubusw mm6, mm5 ; mm6 = p0..p3 - r-2 p0..p3
- psubusw mm5, mm1 ; mm5 = r-2 p0..p3 - p0..p3
- paddusw mm6, mm5 ; mm6 = abs(r0 p0..p3 - r-2 p0..p3)
- pcmpgtw mm6, mm2
- por mm7, mm6 ; accumulate thresholds
-
- movq mm6, [rbx + 16] ; kernel 1 taps
- movq mm4, [rsi+rax] ; mm4 = r-1 p0..p7
- punpcklbw mm4, mm0 ; mm4 = r-1 p0..p3
- pmullw mm6, mm4 ; mm4 *= kernel 1 modifiers.
- paddusw mm3, mm6 ; mm3 += mm5
-
- ; thresholding
- movq mm6, mm1 ; mm6 = r0 p0..p3
- psubusw mm6, mm4 ; mm6 = p0..p3 - r-2 p0..p3
- psubusw mm4, mm1 ; mm5 = r-1 p0..p3 - p0..p3
- paddusw mm6, mm4 ; mm6 = abs(r0 p0..p3 - r-1 p0..p3)
- pcmpgtw mm6, mm2
- por mm7, mm6 ; accumulate thresholds
-
-
- paddusw mm3, RD ; mm3 += round value
- psraw mm3, VP9_FILTER_SHIFT ; mm3 /= 128
-
- pand mm1, mm7 ; mm1 select vals > thresh from source
- pandn mm7, mm3 ; mm7 select vals < thresh from blurred result
- paddusw mm1, mm7 ; combination
-
- packuswb mm1, mm0 ; pack to bytes
-
- movd [rdi], mm1 ;
- neg rax ; pitch is positive
-
-
- add rsi, 4
- add rdi, 4
- add rdx, 4
-
- cmp edx, dword ptr arg(5) ;cols
- jl .nextcol
-        ; done with all the cols, start the across filtering in place
- sub rsi, rdx
- sub rdi, rdx
-
-
- push rax
- xor rdx, rdx
- mov rax, [rdi-4];
-
-.acrossnextcol:
- pxor mm7, mm7 ; mm7 = 00000000
- movq mm6, [rbx + 32 ] ;
- movq mm4, [rdi+rdx] ; mm4 = p0..p7
- movq mm3, mm4 ; mm3 = p0..p7
- punpcklbw mm3, mm0 ; mm3 = p0..p3
- movq mm1, mm3 ; mm1 = p0..p3
- pmullw mm3, mm6 ; mm3 *= kernel 2 modifiers
-
- movq mm6, [rbx + 48]
- psrlq mm4, 8 ; mm4 = p1..p7
- movq mm5, mm4 ; mm5 = p1..p7
- punpcklbw mm5, mm0 ; mm5 = p1..p4
- pmullw mm6, mm5 ; mm6 *= p1..p4 * kernel 3 modifiers
- paddusw mm3, mm6 ; mm3 += mm6
-
- ; thresholding
- movq mm7, mm1 ; mm7 = p0..p3
- psubusw mm7, mm5 ; mm7 = p0..p3 - p1..p4
- psubusw mm5, mm1 ; mm5 = p1..p4 - p0..p3
- paddusw mm7, mm5 ; mm7 = abs(p0..p3 - p1..p4)
- pcmpgtw mm7, mm2
-
- movq mm6, [rbx + 64 ]
- psrlq mm4, 8 ; mm4 = p2..p7
- movq mm5, mm4 ; mm5 = p2..p7
- punpcklbw mm5, mm0 ; mm5 = p2..p5
- pmullw mm6, mm5 ; mm5 *= kernel 4 modifiers
- paddusw mm3, mm6 ; mm3 += mm5
-
- ; thresholding
- movq mm6, mm1 ; mm6 = p0..p3
- psubusw mm6, mm5 ; mm6 = p0..p3 - p1..p4
- psubusw mm5, mm1 ; mm5 = p1..p4 - p0..p3
- paddusw mm6, mm5 ; mm6 = abs(p0..p3 - p1..p4)
- pcmpgtw mm6, mm2
- por mm7, mm6 ; accumulate thresholds
-
-
- movq mm6, [rbx ]
- movq mm4, [rdi+rdx-2] ; mm4 = p-2..p5
- movq mm5, mm4 ; mm5 = p-2..p5
- punpcklbw mm5, mm0 ; mm5 = p-2..p1
- pmullw mm6, mm5 ; mm5 *= kernel 0 modifiers
- paddusw mm3, mm6 ; mm3 += mm5
-
- ; thresholding
- movq mm6, mm1 ; mm6 = p0..p3
- psubusw mm6, mm5 ; mm6 = p0..p3 - p1..p4
- psubusw mm5, mm1 ; mm5 = p1..p4 - p0..p3
- paddusw mm6, mm5 ; mm6 = abs(p0..p3 - p1..p4)
- pcmpgtw mm6, mm2
- por mm7, mm6 ; accumulate thresholds
-
- movq mm6, [rbx + 16]
- psrlq mm4, 8 ; mm4 = p-1..p5
- punpcklbw mm4, mm0 ; mm4 = p-1..p2
- pmullw mm6, mm4 ; mm4 *= kernel 1 modifiers.
- paddusw mm3, mm6 ; mm3 += mm5
-
- ; thresholding
- movq mm6, mm1 ; mm6 = p0..p3
- psubusw mm6, mm4 ; mm6 = p0..p3 - p1..p4
- psubusw mm4, mm1 ; mm5 = p1..p4 - p0..p3
- paddusw mm6, mm4 ; mm6 = abs(p0..p3 - p1..p4)
- pcmpgtw mm6, mm2
- por mm7, mm6 ; accumulate thresholds
-
- paddusw mm3, RD ; mm3 += round value
- psraw mm3, VP9_FILTER_SHIFT ; mm3 /= 128
-
- pand mm1, mm7 ; mm1 select vals > thresh from source
- pandn mm7, mm3 ; mm7 select vals < thresh from blurred result
- paddusw mm1, mm7 ; combination
-
- packuswb mm1, mm0 ; pack to bytes
- mov DWORD PTR [rdi+rdx-4], eax ; store previous four bytes
- movd eax, mm1
-
- add rdx, 4
- cmp edx, dword ptr arg(5) ;cols
- jl .acrossnextcol;
-
- mov DWORD PTR [rdi+rdx-4], eax
- pop rax
-
-        ; done with this row
- add rsi,rax ; next line
- movsxd rax, dword ptr arg(3) ;dst_pixels_per_line ; destination pitch?
- add rdi,rax ; next destination
- movsxd rax, dword ptr arg(2) ;src_pixels_per_line ; destination pitch?
-
- dec rcx ; decrement count
- jnz .nextrow ; next row
- pop rbx
-
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_GOT
- UNSHADOW_ARGS
- pop rbp
- ret
-%undef RD
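Given the Blur kernel table (16/16/64/16/16), VP9_FILTER_SHIFT of 7 and the rd
rounding of 64, the down pass above is a [1 1 4 1 1]/8 vertical blur applied
only where every tap stays within flimit of the centre pixel; the across pass
then repeats the same rule horizontally in place. A rough scalar sketch of one
output pixel (hypothetical helper, not part of the tree):

#include <stdlib.h>

static unsigned char down_blur_pixel_sketch(const unsigned char *p, int pitch,
                                            int flimit) {
  const int c = p[0];
  int k, sum = 64 * c + 64;             /* centre tap plus rd rounding */
  for (k = -2; k <= 2; ++k) {
    if (k == 0) continue;
    if (abs(p[k * pitch] - c) > flimit)
      return (unsigned char)c;          /* any tap over flimit: keep source */
    sum += 16 * p[k * pitch];
  }
  return (unsigned char)(sum >> 7);     /* VP9_FILTER_SHIFT */
}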
-
-
-;void vp9_mbpost_proc_down_mmx(unsigned char *dst,
-; int pitch, int rows, int cols,int flimit)
-extern sym(vp9_rv)
-global sym(vp9_mbpost_proc_down_mmx)
-sym(vp9_mbpost_proc_down_mmx):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 5
- GET_GOT rbx
- push rsi
- push rdi
- ; end prolog
-
- ALIGN_STACK 16, rax
- sub rsp, 136
-
- ; unsigned char d[16][8] at [rsp]
- ; create flimit2 at [rsp+128]
- mov eax, dword ptr arg(4) ;flimit
- mov [rsp+128], eax
- mov [rsp+128+4], eax
-%define flimit2 [rsp+128]
-
-%if ABI_IS_32BIT=0
- lea r8, [GLOBAL(sym(vp9_rv))]
-%endif
-
- ;rows +=8;
- add dword ptr arg(2), 8
-
- ;for(c=0; c<cols; c+=4)
-.loop_col:
- mov rsi, arg(0) ;s
- pxor mm0, mm0 ;
-
- movsxd rax, dword ptr arg(1) ;pitch ;
- neg rax ; rax = -pitch
-
-        lea         rsi,        [rsi + rax*8];             ; rsi = s[-pitch*8]
- neg rax
-
-
- pxor mm5, mm5
- pxor mm6, mm6 ;
-
- pxor mm7, mm7 ;
- mov rdi, rsi
-
- mov rcx, 15 ;
-
-.loop_initvar:
- movd mm1, DWORD PTR [rdi];
- punpcklbw mm1, mm0 ;
-
- paddw mm5, mm1 ;
- pmullw mm1, mm1 ;
-
- movq mm2, mm1 ;
- punpcklwd mm1, mm0 ;
-
- punpckhwd mm2, mm0 ;
- paddd mm6, mm1 ;
-
- paddd mm7, mm2 ;
- lea rdi, [rdi+rax] ;
-
- dec rcx
- jne .loop_initvar
- ;save the var and sum
- xor rdx, rdx
-.loop_row:
- movd mm1, DWORD PTR [rsi] ; [s-pitch*8]
- movd mm2, DWORD PTR [rdi] ; [s+pitch*7]
-
- punpcklbw mm1, mm0
- punpcklbw mm2, mm0
-
- paddw mm5, mm2
- psubw mm5, mm1
-
- pmullw mm2, mm2
- movq mm4, mm2
-
- punpcklwd mm2, mm0
- punpckhwd mm4, mm0
-
- paddd mm6, mm2
- paddd mm7, mm4
-
- pmullw mm1, mm1
- movq mm2, mm1
-
- punpcklwd mm1, mm0
- psubd mm6, mm1
-
- punpckhwd mm2, mm0
- psubd mm7, mm2
-
-
- movq mm3, mm6
- pslld mm3, 4
-
- psubd mm3, mm6
- movq mm1, mm5
-
- movq mm4, mm5
- pmullw mm1, mm1
-
- pmulhw mm4, mm4
- movq mm2, mm1
-
- punpcklwd mm1, mm4
- punpckhwd mm2, mm4
-
- movq mm4, mm7
- pslld mm4, 4
-
- psubd mm4, mm7
-
- psubd mm3, mm1
- psubd mm4, mm2
-
- psubd mm3, flimit2
- psubd mm4, flimit2
-
- psrad mm3, 31
- psrad mm4, 31
-
- packssdw mm3, mm4
- packsswb mm3, mm0
-
- movd mm1, DWORD PTR [rsi+rax*8]
-
- movq mm2, mm1
- punpcklbw mm1, mm0
-
- paddw mm1, mm5
- mov rcx, rdx
-
- and rcx, 127
-%if ABI_IS_32BIT=1 && CONFIG_PIC=1
- push rax
- lea rax, [GLOBAL(sym(vp9_rv))]
- movq mm4, [rax + rcx*2] ;vp9_rv[rcx*2]
- pop rax
-%elif ABI_IS_32BIT=0
- movq mm4, [r8 + rcx*2] ;vp9_rv[rcx*2]
-%else
- movq mm4, [sym(vp9_rv) + rcx*2]
-%endif
- paddw mm1, mm4
- ;paddw xmm1, eight8s
- psraw mm1, 4
-
- packuswb mm1, mm0
- pand mm1, mm3
-
- pandn mm3, mm2
- por mm1, mm3
-
- and rcx, 15
- movd DWORD PTR [rsp+rcx*4], mm1 ;d[rcx*4]
-
- mov rcx, rdx
- sub rcx, 8
-
- and rcx, 15
- movd mm1, DWORD PTR [rsp+rcx*4] ;d[rcx*4]
-
- movd [rsi], mm1
- lea rsi, [rsi+rax]
-
- lea rdi, [rdi+rax]
- add rdx, 1
-
- cmp edx, dword arg(2) ;rows
- jl .loop_row
-
-
- add dword arg(0), 4 ; s += 4
- sub dword arg(3), 4 ; cols -= 4
- cmp dword arg(3), 0
- jg .loop_col
-
- add rsp, 136
- pop rsp
-
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_GOT
- UNSHADOW_ARGS
- pop rbp
- ret
-%undef flimit2
-
-
-;void vp9_plane_add_noise_mmx (unsigned char *Start, unsigned char *noise,
-; unsigned char blackclamp[16],
-; unsigned char whiteclamp[16],
-; unsigned char bothclamp[16],
-; unsigned int Width, unsigned int Height, int Pitch)
-extern sym(rand)
-global sym(vp9_plane_add_noise_mmx)
-sym(vp9_plane_add_noise_mmx):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 8
- GET_GOT rbx
- push rsi
- push rdi
- ; end prolog
-
-.addnoise_loop:
- call sym(rand) WRT_PLT
- mov rcx, arg(1) ;noise
- and rax, 0xff
- add rcx, rax
-
- ; we rely on the fact that the clamping vectors are stored contiguously
- ; in black/white/both order. Note that we have to reload this here because
- ; rdx could be trashed by rand()
- mov rdx, arg(2) ; blackclamp
-
-
- mov rdi, rcx
- movsxd rcx, dword arg(5) ;[Width]
- mov rsi, arg(0) ;Pos
- xor rax,rax
-
-.addnoise_nextset:
- movq mm1,[rsi+rax] ; get the source
-
- psubusb mm1, [rdx] ;blackclamp ; clamp both sides so we don't outrange adding noise
- paddusb mm1, [rdx+32] ;bothclamp
- psubusb mm1, [rdx+16] ;whiteclamp
-
- movq mm2,[rdi+rax] ; get the noise for this line
- paddb mm1,mm2 ; add it in
- movq [rsi+rax],mm1 ; store the result
-
- add rax,8 ; move to the next line
-
- cmp rax, rcx
- jl .addnoise_nextset
-
- movsxd rax, dword arg(7) ; Pitch
- add arg(0), rax ; Start += Pitch
- sub dword arg(6), 1 ; Height -= 1
- jg .addnoise_loop
-
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_GOT
- UNSHADOW_ARGS
- pop rbp
- ret
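In scalar terms, each byte handled by the loop above is first pulled inside
the clamping range with saturating arithmetic so the wrapping add of the noise
byte cannot overflow (sketch only, helper name hypothetical):

static unsigned char add_noise_byte_sketch(unsigned char v, unsigned char noise,
                                           unsigned char black, unsigned char both,
                                           unsigned char white) {
  int t = v;
  t -= black;  if (t < 0) t = 0;        /* psubusb blackclamp */
  t += both;   if (t > 255) t = 255;    /* paddusb bothclamp  */
  t -= white;  if (t < 0) t = 0;        /* psubusb whiteclamp */
  return (unsigned char)(t + noise);    /* paddb: wrapping byte add */
}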
-
-
-SECTION_RODATA
-align 16
-Blur:
- times 16 dw 16
- times 8 dw 64
- times 16 dw 16
- times 8 dw 0
-
-rd:
- times 4 dw 0x40
--- a/vp8/common/x86/postproc_sse2.asm
+++ /dev/null
@@ -1,695 +1,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
-%include "vpx_ports/x86_abi_support.asm"
-
-;void vp9_post_proc_down_and_across_xmm
-;(
-; unsigned char *src_ptr,
-; unsigned char *dst_ptr,
-; int src_pixels_per_line,
-; int dst_pixels_per_line,
-; int rows,
-; int cols,
-; int flimit
-;)
-global sym(vp9_post_proc_down_and_across_xmm)
-sym(vp9_post_proc_down_and_across_xmm):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 7
- SAVE_XMM 7
- GET_GOT rbx
- push rsi
- push rdi
- ; end prolog
-
-%if ABI_IS_32BIT=1 && CONFIG_PIC=1
- ALIGN_STACK 16, rax
- ; move the global rd onto the stack, since we don't have enough registers
- ; to do PIC addressing
- movdqa xmm0, [GLOBAL(rd42)]
- sub rsp, 16
- movdqa [rsp], xmm0
-%define RD42 [rsp]
-%else
-%define RD42 [GLOBAL(rd42)]
-%endif
-
-
- movd xmm2, dword ptr arg(6) ;flimit
- punpcklwd xmm2, xmm2
- punpckldq xmm2, xmm2
- punpcklqdq xmm2, xmm2
-
- mov rsi, arg(0) ;src_ptr
- mov rdi, arg(1) ;dst_ptr
-
- movsxd rcx, DWORD PTR arg(4) ;rows
- movsxd rax, DWORD PTR arg(2) ;src_pixels_per_line ; destination pitch?
- pxor xmm0, xmm0 ; mm0 = 00000000
-
-.nextrow:
-
- xor rdx, rdx ; clear out rdx for use as loop counter
-.nextcol:
- movq xmm3, QWORD PTR [rsi] ; mm4 = r0 p0..p7
- punpcklbw xmm3, xmm0 ; mm3 = p0..p3
- movdqa xmm1, xmm3 ; mm1 = p0..p3
- psllw xmm3, 2 ;
-
- movq xmm5, QWORD PTR [rsi + rax] ; mm4 = r1 p0..p7
- punpcklbw xmm5, xmm0 ; mm5 = r1 p0..p3
- paddusw xmm3, xmm5 ; mm3 += mm6
-
- ; thresholding
- movdqa xmm7, xmm1 ; mm7 = r0 p0..p3
- psubusw xmm7, xmm5 ; mm7 = r0 p0..p3 - r1 p0..p3
- psubusw xmm5, xmm1 ; mm5 = r1 p0..p3 - r0 p0..p3
- paddusw xmm7, xmm5 ; mm7 = abs(r0 p0..p3 - r1 p0..p3)
- pcmpgtw xmm7, xmm2
-
- movq xmm5, QWORD PTR [rsi + 2*rax] ; mm4 = r2 p0..p7
- punpcklbw xmm5, xmm0 ; mm5 = r2 p0..p3
- paddusw xmm3, xmm5 ; mm3 += mm5
-
- ; thresholding
- movdqa xmm6, xmm1 ; mm6 = r0 p0..p3
- psubusw xmm6, xmm5 ; mm6 = r0 p0..p3 - r2 p0..p3
-        psubusw     xmm5, xmm1              ; mm5 = r2 p0..p3 - r0 p0..p3
- paddusw xmm6, xmm5 ; mm6 = abs(r0 p0..p3 - r2 p0..p3)
- pcmpgtw xmm6, xmm2
- por xmm7, xmm6 ; accumulate thresholds
-
-
- neg rax
- movq xmm5, QWORD PTR [rsi+2*rax] ; mm4 = r-2 p0..p7
- punpcklbw xmm5, xmm0 ; mm5 = r-2 p0..p3
- paddusw xmm3, xmm5 ; mm3 += mm5
-
- ; thresholding
- movdqa xmm6, xmm1 ; mm6 = r0 p0..p3
- psubusw xmm6, xmm5 ; mm6 = p0..p3 - r-2 p0..p3
- psubusw xmm5, xmm1 ; mm5 = r-2 p0..p3 - p0..p3
- paddusw xmm6, xmm5 ; mm6 = abs(r0 p0..p3 - r-2 p0..p3)
- pcmpgtw xmm6, xmm2
- por xmm7, xmm6 ; accumulate thresholds
-
- movq xmm4, QWORD PTR [rsi+rax] ; mm4 = r-1 p0..p7
- punpcklbw xmm4, xmm0 ; mm4 = r-1 p0..p3
- paddusw xmm3, xmm4 ; mm3 += mm5
-
- ; thresholding
- movdqa xmm6, xmm1 ; mm6 = r0 p0..p3
- psubusw xmm6, xmm4 ; mm6 = p0..p3 - r-2 p0..p3
- psubusw xmm4, xmm1 ; mm5 = r-1 p0..p3 - p0..p3
- paddusw xmm6, xmm4 ; mm6 = abs(r0 p0..p3 - r-1 p0..p3)
- pcmpgtw xmm6, xmm2
- por xmm7, xmm6 ; accumulate thresholds
-
-
- paddusw xmm3, RD42 ; mm3 += round value
- psraw xmm3, 3 ; mm3 /= 8
-
- pand xmm1, xmm7 ; mm1 select vals > thresh from source
- pandn xmm7, xmm3 ; mm7 select vals < thresh from blurred result
- paddusw xmm1, xmm7 ; combination
-
- packuswb xmm1, xmm0 ; pack to bytes
- movq QWORD PTR [rdi], xmm1 ;
-
- neg rax ; pitch is positive
- add rsi, 8
- add rdi, 8
-
- add rdx, 8
- cmp edx, dword arg(5) ;cols
-
- jl .nextcol
-
-        ; done with all the cols, start the across filtering in place
- sub rsi, rdx
- sub rdi, rdx
-
- xor rdx, rdx
- movq mm0, QWORD PTR [rdi-8];
-
-.acrossnextcol:
- movq xmm7, QWORD PTR [rdi +rdx -2]
- movd xmm4, DWORD PTR [rdi +rdx +6]
-
- pslldq xmm4, 8
- por xmm4, xmm7
-
- movdqa xmm3, xmm4
- psrldq xmm3, 2
- punpcklbw xmm3, xmm0 ; mm3 = p0..p3
- movdqa xmm1, xmm3 ; mm1 = p0..p3
- psllw xmm3, 2
-
-
- movdqa xmm5, xmm4
- psrldq xmm5, 3
- punpcklbw xmm5, xmm0 ; mm5 = p1..p4
- paddusw xmm3, xmm5 ; mm3 += mm6
-
- ; thresholding
- movdqa xmm7, xmm1 ; mm7 = p0..p3
- psubusw xmm7, xmm5 ; mm7 = p0..p3 - p1..p4
- psubusw xmm5, xmm1 ; mm5 = p1..p4 - p0..p3
- paddusw xmm7, xmm5 ; mm7 = abs(p0..p3 - p1..p4)
- pcmpgtw xmm7, xmm2
-
- movdqa xmm5, xmm4
- psrldq xmm5, 4
- punpcklbw xmm5, xmm0 ; mm5 = p2..p5
- paddusw xmm3, xmm5 ; mm3 += mm5
-
- ; thresholding
- movdqa xmm6, xmm1 ; mm6 = p0..p3
- psubusw xmm6, xmm5 ; mm6 = p0..p3 - p1..p4
- psubusw xmm5, xmm1 ; mm5 = p1..p4 - p0..p3
- paddusw xmm6, xmm5 ; mm6 = abs(p0..p3 - p1..p4)
- pcmpgtw xmm6, xmm2
- por xmm7, xmm6 ; accumulate thresholds
-
-
- movdqa xmm5, xmm4 ; mm5 = p-2..p5
- punpcklbw xmm5, xmm0 ; mm5 = p-2..p1
- paddusw xmm3, xmm5 ; mm3 += mm5
-
- ; thresholding
- movdqa xmm6, xmm1 ; mm6 = p0..p3
- psubusw xmm6, xmm5 ; mm6 = p0..p3 - p1..p4
- psubusw xmm5, xmm1 ; mm5 = p1..p4 - p0..p3
- paddusw xmm6, xmm5 ; mm6 = abs(p0..p3 - p1..p4)
- pcmpgtw xmm6, xmm2
- por xmm7, xmm6 ; accumulate thresholds
-
- psrldq xmm4, 1 ; mm4 = p-1..p5
- punpcklbw xmm4, xmm0 ; mm4 = p-1..p2
- paddusw xmm3, xmm4 ; mm3 += mm5
-
- ; thresholding
- movdqa xmm6, xmm1 ; mm6 = p0..p3
- psubusw xmm6, xmm4 ; mm6 = p0..p3 - p1..p4
- psubusw xmm4, xmm1 ; mm5 = p1..p4 - p0..p3
- paddusw xmm6, xmm4 ; mm6 = abs(p0..p3 - p1..p4)
- pcmpgtw xmm6, xmm2
- por xmm7, xmm6 ; accumulate thresholds
-
- paddusw xmm3, RD42 ; mm3 += round value
- psraw xmm3, 3 ; mm3 /= 8
-
- pand xmm1, xmm7 ; mm1 select vals > thresh from source
- pandn xmm7, xmm3 ; mm7 select vals < thresh from blurred result
- paddusw xmm1, xmm7 ; combination
-
- packuswb xmm1, xmm0 ; pack to bytes
- movq QWORD PTR [rdi+rdx-8], mm0 ; store previous four bytes
- movdq2q mm0, xmm1
-
- add rdx, 8
- cmp edx, dword arg(5) ;cols
- jl .acrossnextcol;
-
- ; last 8 pixels
- movq QWORD PTR [rdi+rdx-8], mm0
-
-        ; done with this row
- add rsi,rax ; next line
- mov eax, dword arg(3) ;dst_pixels_per_line ; destination pitch?
- add rdi,rax ; next destination
- mov eax, dword arg(2) ;src_pixels_per_line ; destination pitch?
-
- dec rcx ; decrement count
- jnz .nextrow ; next row
-
-%if ABI_IS_32BIT=1 && CONFIG_PIC=1
- add rsp,16
- pop rsp
-%endif
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_GOT
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
-%undef RD42
-
-
-;void vp9_mbpost_proc_down_xmm(unsigned char *dst,
-; int pitch, int rows, int cols,int flimit)
-extern sym(vp9_rv)
-global sym(vp9_mbpost_proc_down_xmm)
-sym(vp9_mbpost_proc_down_xmm):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 5
- SAVE_XMM 7
- GET_GOT rbx
- push rsi
- push rdi
- ; end prolog
-
- ALIGN_STACK 16, rax
- sub rsp, 128+16
-
- ; unsigned char d[16][8] at [rsp]
- ; create flimit2 at [rsp+128]
- mov eax, dword ptr arg(4) ;flimit
- mov [rsp+128], eax
- mov [rsp+128+4], eax
- mov [rsp+128+8], eax
- mov [rsp+128+12], eax
-%define flimit4 [rsp+128]
-
-%if ABI_IS_32BIT=0
- lea r8, [GLOBAL(sym(vp9_rv))]
-%endif
-
- ;rows +=8;
- add dword arg(2), 8
-
- ;for(c=0; c<cols; c+=8)
-.loop_col:
- mov rsi, arg(0) ; s
- pxor xmm0, xmm0 ;
-
- movsxd rax, dword ptr arg(1) ;pitch ;
- neg rax ; rax = -pitch
-
-        lea         rsi,        [rsi + rax*8];             ; rsi = s[-pitch*8]
- neg rax
-
-
- pxor xmm5, xmm5
- pxor xmm6, xmm6 ;
-
- pxor xmm7, xmm7 ;
- mov rdi, rsi
-
- mov rcx, 15 ;
-
-.loop_initvar:
- movq xmm1, QWORD PTR [rdi];
- punpcklbw xmm1, xmm0 ;
-
- paddw xmm5, xmm1 ;
- pmullw xmm1, xmm1 ;
-
- movdqa xmm2, xmm1 ;
- punpcklwd xmm1, xmm0 ;
-
- punpckhwd xmm2, xmm0 ;
- paddd xmm6, xmm1 ;
-
- paddd xmm7, xmm2 ;
- lea rdi, [rdi+rax] ;
-
- dec rcx
- jne .loop_initvar
- ;save the var and sum
- xor rdx, rdx
-.loop_row:
- movq xmm1, QWORD PTR [rsi] ; [s-pitch*8]
- movq xmm2, QWORD PTR [rdi] ; [s+pitch*7]
-
- punpcklbw xmm1, xmm0
- punpcklbw xmm2, xmm0
-
- paddw xmm5, xmm2
- psubw xmm5, xmm1
-
- pmullw xmm2, xmm2
- movdqa xmm4, xmm2
-
- punpcklwd xmm2, xmm0
- punpckhwd xmm4, xmm0
-
- paddd xmm6, xmm2
- paddd xmm7, xmm4
-
- pmullw xmm1, xmm1
- movdqa xmm2, xmm1
-
- punpcklwd xmm1, xmm0
- psubd xmm6, xmm1
-
- punpckhwd xmm2, xmm0
- psubd xmm7, xmm2
-
-
- movdqa xmm3, xmm6
- pslld xmm3, 4
-
- psubd xmm3, xmm6
- movdqa xmm1, xmm5
-
- movdqa xmm4, xmm5
- pmullw xmm1, xmm1
-
- pmulhw xmm4, xmm4
- movdqa xmm2, xmm1
-
- punpcklwd xmm1, xmm4
- punpckhwd xmm2, xmm4
-
- movdqa xmm4, xmm7
- pslld xmm4, 4
-
- psubd xmm4, xmm7
-
- psubd xmm3, xmm1
- psubd xmm4, xmm2
-
- psubd xmm3, flimit4
- psubd xmm4, flimit4
-
- psrad xmm3, 31
- psrad xmm4, 31
-
- packssdw xmm3, xmm4
- packsswb xmm3, xmm0
-
- movq xmm1, QWORD PTR [rsi+rax*8]
-
- movq xmm2, xmm1
- punpcklbw xmm1, xmm0
-
- paddw xmm1, xmm5
- mov rcx, rdx
-
- and rcx, 127
-%if ABI_IS_32BIT=1 && CONFIG_PIC=1
- push rax
- lea rax, [GLOBAL(sym(vp9_rv))]
- movdqu xmm4, [rax + rcx*2] ;vp9_rv[rcx*2]
- pop rax
-%elif ABI_IS_32BIT=0
- movdqu xmm4, [r8 + rcx*2] ;vp9_rv[rcx*2]
-%else
- movdqu xmm4, [sym(vp9_rv) + rcx*2]
-%endif
-
- paddw xmm1, xmm4
- ;paddw xmm1, eight8s
- psraw xmm1, 4
-
- packuswb xmm1, xmm0
- pand xmm1, xmm3
-
- pandn xmm3, xmm2
- por xmm1, xmm3
-
- and rcx, 15
- movq QWORD PTR [rsp + rcx*8], xmm1 ;d[rcx*8]
-
- mov rcx, rdx
- sub rcx, 8
-
- and rcx, 15
- movq mm0, [rsp + rcx*8] ;d[rcx*8]
-
- movq [rsi], mm0
- lea rsi, [rsi+rax]
-
- lea rdi, [rdi+rax]
- add rdx, 1
-
- cmp edx, dword arg(2) ;rows
- jl .loop_row
-
- add dword arg(0), 8 ; s += 8
- sub dword arg(3), 8 ; cols -= 8
- cmp dword arg(3), 0
- jg .loop_col
-
- add rsp, 128+16
- pop rsp
-
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_GOT
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
-%undef flimit4
-
-
-;void vp9_mbpost_proc_across_ip_xmm(unsigned char *src,
-; int pitch, int rows, int cols,int flimit)
-global sym(vp9_mbpost_proc_across_ip_xmm)
-sym(vp9_mbpost_proc_across_ip_xmm):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 5
- SAVE_XMM 7
- GET_GOT rbx
- push rsi
- push rdi
- ; end prolog
-
- ALIGN_STACK 16, rax
- sub rsp, 16
-
- ; create flimit4 at [rsp]
- mov eax, dword ptr arg(4) ;flimit
- mov [rsp], eax
- mov [rsp+4], eax
- mov [rsp+8], eax
- mov [rsp+12], eax
-%define flimit4 [rsp]
-
-
- ;for(r=0;r<rows;r++)
-.ip_row_loop:
-
- xor rdx, rdx ;sumsq=0;
- xor rcx, rcx ;sum=0;
- mov rsi, arg(0); s
- mov rdi, -8
-.ip_var_loop:
- ;for(i=-8;i<=6;i++)
- ;{
- ; sumsq += s[i]*s[i];
- ; sum += s[i];
- ;}
- movzx eax, byte [rsi+rdi]
- add ecx, eax
- mul al
- add edx, eax
- add rdi, 1
- cmp rdi, 6
- jle .ip_var_loop
-
-
- ;mov rax, sumsq
- ;movd xmm7, rax
- movd xmm7, edx
-
- ;mov rax, sum
- ;movd xmm6, rax
- movd xmm6, ecx
-
- mov rsi, arg(0) ;s
- xor rcx, rcx
-
- movsxd rdx, dword arg(3) ;cols
- add rdx, 8
- pxor mm0, mm0
- pxor mm1, mm1
-
- pxor xmm0, xmm0
-.nextcol4:
-
- movd xmm1, DWORD PTR [rsi+rcx-8] ; -8 -7 -6 -5
- movd xmm2, DWORD PTR [rsi+rcx+7] ; +7 +8 +9 +10
-
- punpcklbw xmm1, xmm0 ; expanding
- punpcklbw xmm2, xmm0 ; expanding
-
- punpcklwd xmm1, xmm0 ; expanding to dwords
- punpcklwd xmm2, xmm0 ; expanding to dwords
-
- psubd xmm2, xmm1 ; 7--8 8--7 9--6 10--5
- paddd xmm1, xmm1 ; -8*2 -7*2 -6*2 -5*2
-
- paddd xmm1, xmm2 ; 7+-8 8+-7 9+-6 10+-5
- pmaddwd xmm1, xmm2 ; squared of 7+-8 8+-7 9+-6 10+-5
-
- paddd xmm6, xmm2
- paddd xmm7, xmm1
-
- pshufd xmm6, xmm6, 0 ; duplicate the last ones
- pshufd xmm7, xmm7, 0 ; duplicate the last ones
-
- psrldq xmm1, 4 ; 8--7 9--6 10--5 0000
- psrldq xmm2, 4 ; 8--7 9--6 10--5 0000
-
- pshufd xmm3, xmm1, 3 ; 0000 8--7 8--7 8--7 squared
- pshufd xmm4, xmm2, 3 ; 0000 8--7 8--7 8--7 squared
-
- paddd xmm6, xmm4
- paddd xmm7, xmm3
-
- pshufd xmm3, xmm1, 01011111b ; 0000 0000 9--6 9--6 squared
- pshufd xmm4, xmm2, 01011111b ; 0000 0000 9--6 9--6 squared
-
- paddd xmm7, xmm3
- paddd xmm6, xmm4
-
- pshufd xmm3, xmm1, 10111111b ; 0000 0000 8--7 8--7 squared
- pshufd xmm4, xmm2, 10111111b ; 0000 0000 8--7 8--7 squared
-
- paddd xmm7, xmm3
- paddd xmm6, xmm4
-
- movdqa xmm3, xmm6
- pmaddwd xmm3, xmm3
-
- movdqa xmm5, xmm7
- pslld xmm5, 4
-
- psubd xmm5, xmm7
- psubd xmm5, xmm3
-
- psubd xmm5, flimit4
- psrad xmm5, 31
-
- packssdw xmm5, xmm0
- packsswb xmm5, xmm0
-
- movd xmm1, DWORD PTR [rsi+rcx]
- movq xmm2, xmm1
-
- punpcklbw xmm1, xmm0
- punpcklwd xmm1, xmm0
-
- paddd xmm1, xmm6
- paddd xmm1, [GLOBAL(four8s)]
-
- psrad xmm1, 4
- packssdw xmm1, xmm0
-
- packuswb xmm1, xmm0
- pand xmm1, xmm5
-
- pandn xmm5, xmm2
- por xmm5, xmm1
-
- movd [rsi+rcx-8], mm0
- movq mm0, mm1
-
- movdq2q mm1, xmm5
- psrldq xmm7, 12
-
- psrldq xmm6, 12
- add rcx, 4
-
- cmp rcx, rdx
- jl .nextcol4
-
- ;s+=pitch;
- movsxd rax, dword arg(1)
- add arg(0), rax
-
- sub dword arg(2), 1 ;rows-=1
- cmp dword arg(2), 0
- jg .ip_row_loop
-
- add rsp, 16
- pop rsp
-
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_GOT
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
-%undef flimit4
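Following the pseudo-code comments and the arithmetic above, the across-in-place
filter keeps a sliding 15-sample sum and sum of squares (s[c-7]..s[c+7]) and
replaces a pixel with the windowed average only where the local variance stays
below flimit. A scalar sketch of the per-pixel decision (helper name
hypothetical):

static unsigned char across_ip_pixel_sketch(const unsigned char *s, int c,
                                            int sum, int sumsq, int flimit) {
  /* sum and sumsq cover the 15 samples s[c-7]..s[c+7] */
  if (sumsq * 15 - sum * sum < flimit)
    return (unsigned char)((sum + s[c] + 8) >> 4);  /* rounded 16-way average */
  return s[c];
}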
-
-
-;void vp9_plane_add_noise_wmt (unsigned char *Start, unsigned char *noise,
-; unsigned char blackclamp[16],
-; unsigned char whiteclamp[16],
-; unsigned char bothclamp[16],
-; unsigned int Width, unsigned int Height, int Pitch)
-extern sym(rand)
-global sym(vp9_plane_add_noise_wmt)
-sym(vp9_plane_add_noise_wmt):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 8
- GET_GOT rbx
- push rsi
- push rdi
- ; end prolog
-
-.addnoise_loop:
- call sym(rand) WRT_PLT
- mov rcx, arg(1) ;noise
- and rax, 0xff
- add rcx, rax
-
- ; we rely on the fact that the clamping vectors are stored contiguously
- ; in black/white/both order. Note that we have to reload this here because
- ; rdx could be trashed by rand()
- mov rdx, arg(2) ; blackclamp
-
-
- mov rdi, rcx
- movsxd rcx, dword arg(5) ;[Width]
- mov rsi, arg(0) ;Pos
- xor rax,rax
-
-.addnoise_nextset:
- movdqu xmm1,[rsi+rax] ; get the source
-
- psubusb xmm1, [rdx] ;blackclamp ; clamp both sides so we don't outrange adding noise
- paddusb xmm1, [rdx+32] ;bothclamp
- psubusb xmm1, [rdx+16] ;whiteclamp
-
- movdqu xmm2,[rdi+rax] ; get the noise for this line
- paddb xmm1,xmm2 ; add it in
- movdqu [rsi+rax],xmm1 ; store the result
-
- add rax,16 ; move to the next line
-
- cmp rax, rcx
- jl .addnoise_nextset
-
- movsxd rax, dword arg(7) ; Pitch
- add arg(0), rax ; Start += Pitch
- sub dword arg(6), 1 ; Height -= 1
- jg .addnoise_loop
-
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_GOT
- UNSHADOW_ARGS
- pop rbp
- ret
-
-
-SECTION_RODATA
-align 16
-rd42:
- times 8 dw 0x04
-four8s:
- times 4 dd 8
--- a/vp8/common/x86/postproc_x86.h
+++ /dev/null
@@ -1,64 +1,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#ifndef POSTPROC_X86_H
-#define POSTPROC_X86_H
-
-/* Note:
- *
- * This platform is commonly built for runtime CPU detection. If you modify
- * any of the function mappings present in this file, be sure to also update
- * them in the function pointer initialization code
- */
-
-#if HAVE_MMX
-extern prototype_postproc_inplace(vp9_mbpost_proc_down_mmx);
-extern prototype_postproc(vp9_post_proc_down_and_across_mmx);
-extern prototype_postproc_addnoise(vp9_plane_add_noise_mmx);
-
-#if !CONFIG_RUNTIME_CPU_DETECT
-#undef vp9_postproc_down
-#define vp9_postproc_down vp9_mbpost_proc_down_mmx
-
-#undef vp9_postproc_downacross
-#define vp9_postproc_downacross vp9_post_proc_down_and_across_mmx
-
-#undef vp9_postproc_addnoise
-#define vp9_postproc_addnoise vp9_plane_add_noise_mmx
-
-#endif
-#endif
-
-
-#if HAVE_SSE2
-extern prototype_postproc_inplace(vp9_mbpost_proc_down_xmm);
-extern prototype_postproc_inplace(vp9_mbpost_proc_across_ip_xmm);
-extern prototype_postproc(vp9_post_proc_down_and_across_xmm);
-extern prototype_postproc_addnoise(vp9_plane_add_noise_wmt);
-
-#if !CONFIG_RUNTIME_CPU_DETECT
-#undef vp9_postproc_down
-#define vp9_postproc_down vp9_mbpost_proc_down_xmm
-
-#undef vp9_postproc_across
-#define vp9_postproc_across vp9_mbpost_proc_across_ip_xmm
-
-#undef vp9_postproc_downacross
-#define vp9_postproc_downacross vp9_post_proc_down_and_across_xmm
-
-#undef vp9_postproc_addnoise
-#define vp9_postproc_addnoise vp9_plane_add_noise_wmt
-
-
-#endif
-#endif
-
-#endif
--- a/vp8/common/x86/recon_mmx.asm
+++ /dev/null
@@ -1,321 +1,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
-%include "vpx_ports/x86_abi_support.asm"
-;void vp9_recon_b_mmx(unsigned char *s, short *q, unsigned char *d, int stride)
-global sym(vp9_recon_b_mmx)
-sym(vp9_recon_b_mmx):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 4
- push rsi
- push rdi
- ; end prolog
-
- mov rsi, arg(0) ;s
- mov rdi, arg(2) ;d
- mov rdx, arg(1) ;q
- movsxd rax, dword ptr arg(3) ;stride
- pxor mm0, mm0
-
- movd mm1, [rsi]
- punpcklbw mm1, mm0
- paddsw mm1, [rdx]
- packuswb mm1, mm0 ; pack and unpack to saturate
- movd [rdi], mm1
-
- movd mm2, [rsi+16]
- punpcklbw mm2, mm0
- paddsw mm2, [rdx+32]
- packuswb mm2, mm0 ; pack and unpack to saturate
- movd [rdi+rax], mm2
-
- movd mm3, [rsi+32]
- punpcklbw mm3, mm0
- paddsw mm3, [rdx+64]
- packuswb mm3, mm0 ; pack and unpack to saturate
- movd [rdi+2*rax], mm3
-
- add rdi, rax
- movd mm4, [rsi+48]
- punpcklbw mm4, mm0
- paddsw mm4, [rdx+96]
- packuswb mm4, mm0 ; pack and unpack to saturate
- movd [rdi+2*rax], mm4
-
- ; begin epilog
- pop rdi
- pop rsi
- UNSHADOW_ARGS
- pop rbp
- ret
-
-
-;void copy_mem8x8_mmx(
-; unsigned char *src,
-; int src_stride,
-; unsigned char *dst,
-; int dst_stride
-; )
-global sym(vp9_copy_mem8x8_mmx)
-sym(vp9_copy_mem8x8_mmx):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 4
- push rsi
- push rdi
- ; end prolog
-
- mov rsi, arg(0) ;src;
- movq mm0, [rsi]
-
- movsxd rax, dword ptr arg(1) ;src_stride;
- mov rdi, arg(2) ;dst;
-
- movq mm1, [rsi+rax]
- movq mm2, [rsi+rax*2]
-
- movsxd rcx, dword ptr arg(3) ;dst_stride
- lea rsi, [rsi+rax*2]
-
- movq [rdi], mm0
- add rsi, rax
-
- movq [rdi+rcx], mm1
- movq [rdi+rcx*2], mm2
-
-
- lea rdi, [rdi+rcx*2]
- movq mm3, [rsi]
-
- add rdi, rcx
- movq mm4, [rsi+rax]
-
- movq mm5, [rsi+rax*2]
- movq [rdi], mm3
-
- lea rsi, [rsi+rax*2]
- movq [rdi+rcx], mm4
-
- movq [rdi+rcx*2], mm5
- lea rdi, [rdi+rcx*2]
-
- movq mm0, [rsi+rax]
- movq mm1, [rsi+rax*2]
-
- movq [rdi+rcx], mm0
- movq [rdi+rcx*2],mm1
-
- ; begin epilog
- pop rdi
- pop rsi
- UNSHADOW_ARGS
- pop rbp
- ret
-
-
-;void copy_mem8x4_mmx(
-; unsigned char *src,
-; int src_stride,
-; unsigned char *dst,
-; int dst_stride
-; )
-global sym(vp9_copy_mem8x4_mmx)
-sym(vp9_copy_mem8x4_mmx):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 4
- push rsi
- push rdi
- ; end prolog
-
- mov rsi, arg(0) ;src;
- movq mm0, [rsi]
-
- movsxd rax, dword ptr arg(1) ;src_stride;
- mov rdi, arg(2) ;dst;
-
- movq mm1, [rsi+rax]
- movq mm2, [rsi+rax*2]
-
- movsxd rcx, dword ptr arg(3) ;dst_stride
- lea rsi, [rsi+rax*2]
-
- movq [rdi], mm0
- movq [rdi+rcx], mm1
-
- movq [rdi+rcx*2], mm2
- lea rdi, [rdi+rcx*2]
-
- movq mm3, [rsi+rax]
- movq [rdi+rcx], mm3
-
- ; begin epilog
- pop rdi
- pop rsi
- UNSHADOW_ARGS
- pop rbp
- ret
-
-
-;void copy_mem16x16_mmx(
-; unsigned char *src,
-; int src_stride,
-; unsigned char *dst,
-; int dst_stride
-; )
-global sym(vp9_copy_mem16x16_mmx)
-sym(vp9_copy_mem16x16_mmx):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 4
- push rsi
- push rdi
- ; end prolog
-
- mov rsi, arg(0) ;src;
- movsxd rax, dword ptr arg(1) ;src_stride;
-
- mov rdi, arg(2) ;dst;
- movsxd rcx, dword ptr arg(3) ;dst_stride
-
- movq mm0, [rsi]
- movq mm3, [rsi+8];
-
- movq mm1, [rsi+rax]
- movq mm4, [rsi+rax+8]
-
- movq mm2, [rsi+rax*2]
- movq mm5, [rsi+rax*2+8]
-
- lea rsi, [rsi+rax*2]
- add rsi, rax
-
- movq [rdi], mm0
- movq [rdi+8], mm3
-
- movq [rdi+rcx], mm1
- movq [rdi+rcx+8], mm4
-
- movq [rdi+rcx*2], mm2
- movq [rdi+rcx*2+8], mm5
-
- lea rdi, [rdi+rcx*2]
- add rdi, rcx
-
- movq mm0, [rsi]
- movq mm3, [rsi+8];
-
- movq mm1, [rsi+rax]
- movq mm4, [rsi+rax+8]
-
- movq mm2, [rsi+rax*2]
- movq mm5, [rsi+rax*2+8]
-
- lea rsi, [rsi+rax*2]
- add rsi, rax
-
- movq [rdi], mm0
- movq [rdi+8], mm3
-
- movq [rdi+rcx], mm1
- movq [rdi+rcx+8], mm4
-
- movq [rdi+rcx*2], mm2
- movq [rdi+rcx*2+8], mm5
-
- lea rdi, [rdi+rcx*2]
- add rdi, rcx
-
- movq mm0, [rsi]
- movq mm3, [rsi+8];
-
- movq mm1, [rsi+rax]
- movq mm4, [rsi+rax+8]
-
- movq mm2, [rsi+rax*2]
- movq mm5, [rsi+rax*2+8]
-
- lea rsi, [rsi+rax*2]
- add rsi, rax
-
- movq [rdi], mm0
- movq [rdi+8], mm3
-
- movq [rdi+rcx], mm1
- movq [rdi+rcx+8], mm4
-
- movq [rdi+rcx*2], mm2
- movq [rdi+rcx*2+8], mm5
-
- lea rdi, [rdi+rcx*2]
- add rdi, rcx
-
- movq mm0, [rsi]
- movq mm3, [rsi+8];
-
- movq mm1, [rsi+rax]
- movq mm4, [rsi+rax+8]
-
- movq mm2, [rsi+rax*2]
- movq mm5, [rsi+rax*2+8]
-
- lea rsi, [rsi+rax*2]
- add rsi, rax
-
- movq [rdi], mm0
- movq [rdi+8], mm3
-
- movq [rdi+rcx], mm1
- movq [rdi+rcx+8], mm4
-
- movq [rdi+rcx*2], mm2
- movq [rdi+rcx*2+8], mm5
-
- lea rdi, [rdi+rcx*2]
- add rdi, rcx
-
- movq mm0, [rsi]
- movq mm3, [rsi+8];
-
- movq mm1, [rsi+rax]
- movq mm4, [rsi+rax+8]
-
- movq mm2, [rsi+rax*2]
- movq mm5, [rsi+rax*2+8]
-
- lea rsi, [rsi+rax*2]
- add rsi, rax
-
- movq [rdi], mm0
- movq [rdi+8], mm3
-
- movq [rdi+rcx], mm1
- movq [rdi+rcx+8], mm4
-
- movq [rdi+rcx*2], mm2
- movq [rdi+rcx*2+8], mm5
-
- lea rdi, [rdi+rcx*2]
- add rdi, rcx
-
- movq mm0, [rsi]
- movq mm3, [rsi+8];
-
- movq [rdi], mm0
- movq [rdi+8], mm3
-
- ; begin epilog
- pop rdi
- pop rsi
- UNSHADOW_ARGS
- pop rbp
- ret
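For readers tracing the MMX code being moved here, a minimal scalar sketch (hypothetical, not part of this patch or the tree) of what vp9_recon_b_mmx above computes: the 16-bit residual q is added to the 8-bit predictor s with saturation, one 4-pixel row per destination stride. The predictor rows are 16 bytes apart and the residual rows 16 shorts apart, mirroring the [rsi+16]/[rdx+32] addressing in the asm; the name recon_b_ref_c is made up for illustration.

static void recon_b_ref_c(const unsigned char *s, const short *q,
                          unsigned char *d, int stride) {
  int r, c;
  for (r = 0; r < 4; r++) {
    for (c = 0; c < 4; c++) {
      /* predictor rows are 16 bytes apart, residual rows 16 shorts apart */
      int v = s[r * 16 + c] + q[r * 16 + c];
      d[r * stride + c] = (unsigned char)(v < 0 ? 0 : (v > 255 ? 255 : v));
    }
  }
}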
--- a/vp8/common/x86/recon_sse2.asm
+++ /dev/null
@@ -1,688 +1,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
-%include "vpx_ports/x86_abi_support.asm"
-;void vp9_recon2b_sse2(unsigned char *s, short *q, unsigned char *d, int stride)
-global sym(vp9_recon2b_sse2)
-sym(vp9_recon2b_sse2):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 4
- push rsi
- push rdi
- ; end prolog
-
- mov rsi, arg(0) ;s
- mov rdi, arg(2) ;d
- mov rdx, arg(1) ;q
- movsxd rax, dword ptr arg(3) ;stride
- pxor xmm0, xmm0
-
- movq xmm1, MMWORD PTR [rsi]
- punpcklbw xmm1, xmm0
- paddsw xmm1, XMMWORD PTR [rdx]
- packuswb xmm1, xmm0 ; pack and unpack to saturate
- movq MMWORD PTR [rdi], xmm1
-
-
- movq xmm2, MMWORD PTR [rsi+8]
- punpcklbw xmm2, xmm0
- paddsw xmm2, XMMWORD PTR [rdx+16]
- packuswb xmm2, xmm0 ; pack and unpack to saturate
- movq MMWORD PTR [rdi+rax], xmm2
-
-
- movq xmm3, MMWORD PTR [rsi+16]
- punpcklbw xmm3, xmm0
- paddsw xmm3, XMMWORD PTR [rdx+32]
- packuswb xmm3, xmm0 ; pack and unpack to saturate
- movq MMWORD PTR [rdi+rax*2], xmm3
-
- add rdi, rax
- movq xmm4, MMWORD PTR [rsi+24]
- punpcklbw xmm4, xmm0
- paddsw xmm4, XMMWORD PTR [rdx+48]
- packuswb xmm4, xmm0 ; pack and unpack to saturate
- movq MMWORD PTR [rdi+rax*2], xmm4
-
- ; begin epilog
- pop rdi
- pop rsi
- UNSHADOW_ARGS
- pop rbp
- ret
-
-
-;void vp9_recon4b_sse2(unsigned char *s, short *q, unsigned char *d, int stride)
-global sym(vp9_recon4b_sse2)
-sym(vp9_recon4b_sse2):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 4
- SAVE_XMM 7
- push rsi
- push rdi
- ; end prolog
-
- mov rsi, arg(0) ;s
- mov rdi, arg(2) ;d
- mov rdx, arg(1) ;q
- movsxd rax, dword ptr arg(3) ;stride
- pxor xmm0, xmm0
-
- movdqa xmm1, XMMWORD PTR [rsi]
- movdqa xmm5, xmm1
- punpcklbw xmm1, xmm0
- punpckhbw xmm5, xmm0
- paddsw xmm1, XMMWORD PTR [rdx]
- paddsw xmm5, XMMWORD PTR [rdx+16]
- packuswb xmm1, xmm5 ; pack and unpack to saturate
- movdqa XMMWORD PTR [rdi], xmm1
-
-
- movdqa xmm2, XMMWORD PTR [rsi+16]
- movdqa xmm6, xmm2
- punpcklbw xmm2, xmm0
- punpckhbw xmm6, xmm0
- paddsw xmm2, XMMWORD PTR [rdx+32]
- paddsw xmm6, XMMWORD PTR [rdx+48]
- packuswb xmm2, xmm6 ; pack and unpack to saturate
- movdqa XMMWORD PTR [rdi+rax], xmm2
-
-
- movdqa xmm3, XMMWORD PTR [rsi+32]
- movdqa xmm7, xmm3
- punpcklbw xmm3, xmm0
- punpckhbw xmm7, xmm0
- paddsw xmm3, XMMWORD PTR [rdx+64]
- paddsw xmm7, XMMWORD PTR [rdx+80]
- packuswb xmm3, xmm7 ; pack and unpack to saturate
- movdqa XMMWORD PTR [rdi+rax*2], xmm3
-
- add rdi, rax
- movdqa xmm4, XMMWORD PTR [rsi+48]
- movdqa xmm5, xmm4
- punpcklbw xmm4, xmm0
- punpckhbw xmm5, xmm0
- paddsw xmm4, XMMWORD PTR [rdx+96]
- paddsw xmm5, XMMWORD PTR [rdx+112]
- packuswb xmm4, xmm5 ; pack and unpack to saturate
- movdqa XMMWORD PTR [rdi+rax*2], xmm4
-
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
-
-
-;void vp9_copy_mem16x16_sse2(
-; unsigned char *src,
-; int src_stride,
-; unsigned char *dst,
-; int dst_stride
-; )
-global sym(vp9_copy_mem16x16_sse2)
-sym(vp9_copy_mem16x16_sse2):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 4
- push rsi
- push rdi
- ; end prolog
-
- mov rsi, arg(0) ;src;
- movdqu xmm0, [rsi]
-
- movsxd rax, dword ptr arg(1) ;src_stride;
- mov rdi, arg(2) ;dst;
-
- movdqu xmm1, [rsi+rax]
- movdqu xmm2, [rsi+rax*2]
-
- movsxd rcx, dword ptr arg(3) ;dst_stride
- lea rsi, [rsi+rax*2]
-
- movdqa [rdi], xmm0
- add rsi, rax
-
- movdqa [rdi+rcx], xmm1
- movdqa [rdi+rcx*2],xmm2
-
- lea rdi, [rdi+rcx*2]
- movdqu xmm3, [rsi]
-
- add rdi, rcx
- movdqu xmm4, [rsi+rax]
-
- movdqu xmm5, [rsi+rax*2]
- lea rsi, [rsi+rax*2]
-
- movdqa [rdi], xmm3
- add rsi, rax
-
- movdqa [rdi+rcx], xmm4
- movdqa [rdi+rcx*2],xmm5
-
- lea rdi, [rdi+rcx*2]
- movdqu xmm0, [rsi]
-
- add rdi, rcx
- movdqu xmm1, [rsi+rax]
-
- movdqu xmm2, [rsi+rax*2]
- lea rsi, [rsi+rax*2]
-
- movdqa [rdi], xmm0
- add rsi, rax
-
- movdqa [rdi+rcx], xmm1
-
- movdqa [rdi+rcx*2], xmm2
- movdqu xmm3, [rsi]
-
- movdqu xmm4, [rsi+rax]
- lea rdi, [rdi+rcx*2]
-
- add rdi, rcx
- movdqu xmm5, [rsi+rax*2]
-
- lea rsi, [rsi+rax*2]
- movdqa [rdi], xmm3
-
- add rsi, rax
- movdqa [rdi+rcx], xmm4
-
- movdqa [rdi+rcx*2],xmm5
- movdqu xmm0, [rsi]
-
- lea rdi, [rdi+rcx*2]
- movdqu xmm1, [rsi+rax]
-
- add rdi, rcx
- movdqu xmm2, [rsi+rax*2]
-
- lea rsi, [rsi+rax*2]
- movdqa [rdi], xmm0
-
- movdqa [rdi+rcx], xmm1
- movdqa [rdi+rcx*2],xmm2
-
- movdqu xmm3, [rsi+rax]
- lea rdi, [rdi+rcx*2]
-
- movdqa [rdi+rcx], xmm3
-
- ; begin epilog
- pop rdi
- pop rsi
- UNSHADOW_ARGS
- pop rbp
- ret
-
-
-;void vp9_intra_pred_uv_dc_mmx2(
-; unsigned char *dst,
-;    int dst_stride,
-;    unsigned char *src,
-;    int src_stride
-; )
-global sym(vp9_intra_pred_uv_dc_mmx2)
-sym(vp9_intra_pred_uv_dc_mmx2):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 4
- push rsi
- push rdi
- ; end prolog
-
- ; from top
- mov rsi, arg(2) ;src;
- movsxd rax, dword ptr arg(3) ;src_stride;
- sub rsi, rax
- pxor mm0, mm0
- movq mm1, [rsi]
- psadbw mm1, mm0
-
- ; from left
- dec rsi
- lea rdi, [rax*3]
- movzx ecx, byte [rsi+rax]
- movzx edx, byte [rsi+rax*2]
- add ecx, edx
- movzx edx, byte [rsi+rdi]
- add ecx, edx
- lea rsi, [rsi+rax*4]
- movzx edx, byte [rsi]
- add ecx, edx
- movzx edx, byte [rsi+rax]
- add ecx, edx
- movzx edx, byte [rsi+rax*2]
- add ecx, edx
- movzx edx, byte [rsi+rdi]
- add ecx, edx
- movzx edx, byte [rsi+rax*4]
- add ecx, edx
-
- ; add up
- pextrw edx, mm1, 0x0
- lea edx, [edx+ecx+8]
- sar edx, 4
- movd mm1, edx
- pshufw mm1, mm1, 0x0
- packuswb mm1, mm1
-
- ; write out
- mov rdi, arg(0) ;dst;
- movsxd rcx, dword ptr arg(1) ;dst_stride
- lea rax, [rcx*3]
-
- movq [rdi ], mm1
- movq [rdi+rcx ], mm1
- movq [rdi+rcx*2], mm1
- movq [rdi+rax ], mm1
- lea rdi, [rdi+rcx*4]
- movq [rdi ], mm1
- movq [rdi+rcx ], mm1
- movq [rdi+rcx*2], mm1
- movq [rdi+rax ], mm1
-
- ; begin epilog
- pop rdi
- pop rsi
- UNSHADOW_ARGS
- pop rbp
- ret
-
-;void vp9_intra_pred_uv_dctop_mmx2(
-; unsigned char *dst,
-;    int dst_stride,
-;    unsigned char *src,
-;    int src_stride
-; )
-global sym(vp9_intra_pred_uv_dctop_mmx2)
-sym(vp9_intra_pred_uv_dctop_mmx2):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 4
- GET_GOT rbx
- push rsi
- push rdi
- ; end prolog
-
- ; from top
- mov rsi, arg(2) ;src;
- movsxd rax, dword ptr arg(3) ;src_stride;
- sub rsi, rax
- pxor mm0, mm0
- movq mm1, [rsi]
- psadbw mm1, mm0
-
- ; add up
- paddw mm1, [GLOBAL(dc_4)]
- psraw mm1, 3
- pshufw mm1, mm1, 0x0
- packuswb mm1, mm1
-
- ; write out
- mov rdi, arg(0) ;dst;
- movsxd rcx, dword ptr arg(1) ;dst_stride
- lea rax, [rcx*3]
-
- movq [rdi ], mm1
- movq [rdi+rcx ], mm1
- movq [rdi+rcx*2], mm1
- movq [rdi+rax ], mm1
- lea rdi, [rdi+rcx*4]
- movq [rdi ], mm1
- movq [rdi+rcx ], mm1
- movq [rdi+rcx*2], mm1
- movq [rdi+rax ], mm1
-
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_GOT
- UNSHADOW_ARGS
- pop rbp
- ret
-
-;void vp9_intra_pred_uv_dcleft_mmx2(
-; unsigned char *dst,
-;    int dst_stride,
-;    unsigned char *src,
-;    int src_stride
-; )
-global sym(vp9_intra_pred_uv_dcleft_mmx2)
-sym(vp9_intra_pred_uv_dcleft_mmx2):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 4
- push rsi
- push rdi
- ; end prolog
-
- ; from left
- mov rsi, arg(2) ;src;
- movsxd rax, dword ptr arg(3) ;src_stride;
- dec rsi
- lea rdi, [rax*3]
- movzx ecx, byte [rsi]
- movzx edx, byte [rsi+rax]
- add ecx, edx
- movzx edx, byte [rsi+rax*2]
- add ecx, edx
- movzx edx, byte [rsi+rdi]
- add ecx, edx
- lea rsi, [rsi+rax*4]
- movzx edx, byte [rsi]
- add ecx, edx
- movzx edx, byte [rsi+rax]
- add ecx, edx
- movzx edx, byte [rsi+rax*2]
- add ecx, edx
- movzx edx, byte [rsi+rdi]
- lea edx, [ecx+edx+4]
-
- ; add up
- shr edx, 3
- movd mm1, edx
- pshufw mm1, mm1, 0x0
- packuswb mm1, mm1
-
- ; write out
- mov rdi, arg(0) ;dst;
- movsxd rcx, dword ptr arg(1) ;dst_stride
- lea rax, [rcx*3]
-
- movq [rdi ], mm1
- movq [rdi+rcx ], mm1
- movq [rdi+rcx*2], mm1
- movq [rdi+rax ], mm1
- lea rdi, [rdi+rcx*4]
- movq [rdi ], mm1
- movq [rdi+rcx ], mm1
- movq [rdi+rcx*2], mm1
- movq [rdi+rax ], mm1
-
- ; begin epilog
- pop rdi
- pop rsi
- UNSHADOW_ARGS
- pop rbp
- ret
-
-;void vp9_intra_pred_uv_dc128_mmx(
-; unsigned char *dst,
-;    int dst_stride,
-;    unsigned char *src,
-;    int src_stride
-; )
-global sym(vp9_intra_pred_uv_dc128_mmx)
-sym(vp9_intra_pred_uv_dc128_mmx):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 4
- GET_GOT rbx
- ; end prolog
-
- ; write out
- movq mm1, [GLOBAL(dc_128)]
- mov rax, arg(0) ;dst;
- movsxd rdx, dword ptr arg(1) ;dst_stride
- lea rcx, [rdx*3]
-
- movq [rax ], mm1
- movq [rax+rdx ], mm1
- movq [rax+rdx*2], mm1
- movq [rax+rcx ], mm1
- lea rax, [rax+rdx*4]
- movq [rax ], mm1
- movq [rax+rdx ], mm1
- movq [rax+rdx*2], mm1
- movq [rax+rcx ], mm1
-
- ; begin epilog
- RESTORE_GOT
- UNSHADOW_ARGS
- pop rbp
- ret
-
-;void vp9_intra_pred_uv_tm_sse2(
-; unsigned char *dst,
-;    int dst_stride,
-;    unsigned char *src,
-;    int src_stride
-; )
-%macro vp9_intra_pred_uv_tm 1
-global sym(vp9_intra_pred_uv_tm_%1)
-sym(vp9_intra_pred_uv_tm_%1):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 4
- GET_GOT rbx
- push rsi
- push rdi
- ; end prolog
-
- ; read top row
- mov edx, 4
- mov rsi, arg(2) ;src;
- movsxd rax, dword ptr arg(3) ;src_stride;
- sub rsi, rax
- pxor xmm0, xmm0
-%ifidn %1, ssse3
- movdqa xmm2, [GLOBAL(dc_1024)]
-%endif
- movq xmm1, [rsi]
- punpcklbw xmm1, xmm0
-
-    ; set up left ptrs and subtract topleft
- movd xmm3, [rsi-1]
- lea rsi, [rsi+rax-1]
-%ifidn %1, sse2
- punpcklbw xmm3, xmm0
- pshuflw xmm3, xmm3, 0x0
- punpcklqdq xmm3, xmm3
-%else
- pshufb xmm3, xmm2
-%endif
- psubw xmm1, xmm3
-
- ; set up dest ptrs
- mov rdi, arg(0) ;dst;
- movsxd rcx, dword ptr arg(1) ;dst_stride
-
-.vp9_intra_pred_uv_tm_%1_loop:
- movd xmm3, [rsi]
- movd xmm5, [rsi+rax]
-%ifidn %1, sse2
- punpcklbw xmm3, xmm0
- punpcklbw xmm5, xmm0
- pshuflw xmm3, xmm3, 0x0
- pshuflw xmm5, xmm5, 0x0
- punpcklqdq xmm3, xmm3
- punpcklqdq xmm5, xmm5
-%else
- pshufb xmm3, xmm2
- pshufb xmm5, xmm2
-%endif
- paddw xmm3, xmm1
- paddw xmm5, xmm1
- packuswb xmm3, xmm5
- movq [rdi ], xmm3
-    movhps      [rdi+rcx], xmm3
- lea rsi, [rsi+rax*2]
- lea rdi, [rdi+rcx*2]
- dec edx
- jnz .vp9_intra_pred_uv_tm_%1_loop
-
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_GOT
- UNSHADOW_ARGS
- pop rbp
- ret
-%endmacro
-
-vp9_intra_pred_uv_tm sse2
-vp9_intra_pred_uv_tm ssse3
-
-;void vp9_intra_pred_uv_ve_mmx(
-; unsigned char *dst,
-;    int dst_stride,
-;    unsigned char *src,
-;    int src_stride
-; )
-global sym(vp9_intra_pred_uv_ve_mmx)
-sym(vp9_intra_pred_uv_ve_mmx):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 4
- ; end prolog
-
- ; read from top
- mov rax, arg(2) ;src;
- movsxd rdx, dword ptr arg(3) ;src_stride;
- sub rax, rdx
- movq mm1, [rax]
-
- ; write out
- mov rax, arg(0) ;dst;
- movsxd rdx, dword ptr arg(1) ;dst_stride
- lea rcx, [rdx*3]
-
- movq [rax ], mm1
- movq [rax+rdx ], mm1
- movq [rax+rdx*2], mm1
- movq [rax+rcx ], mm1
- lea rax, [rax+rdx*4]
- movq [rax ], mm1
- movq [rax+rdx ], mm1
- movq [rax+rdx*2], mm1
- movq [rax+rcx ], mm1
-
- ; begin epilog
- UNSHADOW_ARGS
- pop rbp
- ret
-
-;void vp9_intra_pred_uv_ho_mmx2(
-; unsigned char *dst,
-;    int dst_stride,
-;    unsigned char *src,
-;    int src_stride
-; )
-%macro vp9_intra_pred_uv_ho 1
-global sym(vp9_intra_pred_uv_ho_%1)
-sym(vp9_intra_pred_uv_ho_%1):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 4
- push rsi
- push rdi
-%ifidn %1, ssse3
-%ifndef GET_GOT_SAVE_ARG
- push rbx
-%endif
- GET_GOT rbx
-%endif
- ; end prolog
-
- ; read from left and write out
-%ifidn %1, mmx2
- mov edx, 4
-%endif
- mov rsi, arg(2) ;src;
- movsxd rax, dword ptr arg(3) ;src_stride;
- mov rdi, arg(0) ;dst;
- movsxd rcx, dword ptr arg(1) ;dst_stride
-%ifidn %1, ssse3
- lea rdx, [rcx*3]
- movdqa xmm2, [GLOBAL(dc_00001111)]
- lea rbx, [rax*3]
-%endif
- dec rsi
-%ifidn %1, mmx2
-.vp9_intra_pred_uv_ho_%1_loop:
- movd mm0, [rsi]
- movd mm1, [rsi+rax]
- punpcklbw mm0, mm0
- punpcklbw mm1, mm1
- pshufw mm0, mm0, 0x0
- pshufw mm1, mm1, 0x0
- movq [rdi ], mm0
- movq [rdi+rcx], mm1
- lea rsi, [rsi+rax*2]
- lea rdi, [rdi+rcx*2]
- dec edx
- jnz .vp9_intra_pred_uv_ho_%1_loop
-%else
- movd xmm0, [rsi]
- movd xmm3, [rsi+rax]
- movd xmm1, [rsi+rax*2]
- movd xmm4, [rsi+rbx]
- punpcklbw xmm0, xmm3
- punpcklbw xmm1, xmm4
- pshufb xmm0, xmm2
- pshufb xmm1, xmm2
- movq [rdi ], xmm0
- movhps [rdi+rcx], xmm0
- movq [rdi+rcx*2], xmm1
- movhps [rdi+rdx], xmm1
- lea rsi, [rsi+rax*4]
- lea rdi, [rdi+rcx*4]
- movd xmm0, [rsi]
- movd xmm3, [rsi+rax]
- movd xmm1, [rsi+rax*2]
- movd xmm4, [rsi+rbx]
- punpcklbw xmm0, xmm3
- punpcklbw xmm1, xmm4
- pshufb xmm0, xmm2
- pshufb xmm1, xmm2
- movq [rdi ], xmm0
- movhps [rdi+rcx], xmm0
- movq [rdi+rcx*2], xmm1
- movhps [rdi+rdx], xmm1
-%endif
-
- ; begin epilog
-%ifidn %1, ssse3
- RESTORE_GOT
-%ifndef GET_GOT_SAVE_ARG
- pop rbx
-%endif
-%endif
- pop rdi
- pop rsi
- UNSHADOW_ARGS
- pop rbp
- ret
-%endmacro
-
-vp9_intra_pred_uv_ho mmx2
-vp9_intra_pred_uv_ho ssse3
-
-SECTION_RODATA
-dc_128:
- times 8 db 128
-dc_4:
- times 4 dw 4
-align 16
-dc_1024:
- times 8 dw 0x400
-align 16
-dc_00001111:
- times 8 db 0
- times 8 db 1
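As a reading aid for the psadbw/lea/sar sequence in vp9_intra_pred_uv_dc_mmx2 above, the 8x8 UV DC predictor reduces to: sum the 8 pixels above and the 8 pixels to the left, add 8, shift right by 4, and fill the block. The following scalar sketch is hypothetical (the function name is made up) and is not part of this patch.

static void intra_pred_uv_dc_ref_c(unsigned char *dst, int dst_stride,
                                   const unsigned char *src, int src_stride) {
  int i, j, sum = 8;                       /* rounding term, the "+8" in the asm */
  unsigned char dc;
  for (i = 0; i < 8; i++)
    sum += src[-src_stride + i];           /* row above the block */
  for (i = 0; i < 8; i++)
    sum += src[i * src_stride - 1];        /* column left of the block */
  dc = (unsigned char)(sum >> 4);          /* sar edx, 4 */
  for (i = 0; i < 8; i++)
    for (j = 0; j < 8; j++)
      dst[i * dst_stride + j] = dc;
}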
--- a/vp8/common/x86/recon_wrapper_sse2.c
+++ /dev/null
@@ -1,101 +1,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "vpx_ports/config.h"
-#include "vpx_mem/vpx_mem.h"
-#include "vp8/common/blockd.h"
-
-#define build_intra_predictors_mbuv_prototype(sym) \
- void sym(unsigned char *dst, int dst_stride, \
- const unsigned char *src, int src_stride)
-typedef build_intra_predictors_mbuv_prototype((*build_intra_pred_mbuv_fn_t));
-
-extern build_intra_predictors_mbuv_prototype(vp9_intra_pred_uv_dc_mmx2);
-extern build_intra_predictors_mbuv_prototype(vp9_intra_pred_uv_dctop_mmx2);
-extern build_intra_predictors_mbuv_prototype(vp9_intra_pred_uv_dcleft_mmx2);
-extern build_intra_predictors_mbuv_prototype(vp9_intra_pred_uv_dc128_mmx);
-extern build_intra_predictors_mbuv_prototype(vp9_intra_pred_uv_ho_mmx2);
-extern build_intra_predictors_mbuv_prototype(vp9_intra_pred_uv_ho_ssse3);
-extern build_intra_predictors_mbuv_prototype(vp9_intra_pred_uv_ve_mmx);
-extern build_intra_predictors_mbuv_prototype(vp9_intra_pred_uv_tm_sse2);
-extern build_intra_predictors_mbuv_prototype(vp9_intra_pred_uv_tm_ssse3);
-
-static void build_intra_predictors_mbuv_x86(MACROBLOCKD *xd,
- unsigned char *dst_u,
- unsigned char *dst_v,
- int dst_stride,
- build_intra_pred_mbuv_fn_t tm_fn,
- build_intra_pred_mbuv_fn_t ho_fn) {
- int mode = xd->mode_info_context->mbmi.uv_mode;
- build_intra_pred_mbuv_fn_t fn;
- int src_stride = xd->dst.uv_stride;
-
- switch (mode) {
- case V_PRED:
- fn = vp9_intra_pred_uv_ve_mmx;
- break;
- case H_PRED:
- fn = ho_fn;
- break;
- case TM_PRED:
- fn = tm_fn;
- break;
- case DC_PRED:
- if (xd->up_available) {
- if (xd->left_available) {
- fn = vp9_intra_pred_uv_dc_mmx2;
- break;
- } else {
- fn = vp9_intra_pred_uv_dctop_mmx2;
- break;
- }
- } else if (xd->left_available) {
- fn = vp9_intra_pred_uv_dcleft_mmx2;
- break;
- } else {
- fn = vp9_intra_pred_uv_dc128_mmx;
- break;
- }
- break;
- default:
- return;
- }
-
- fn(dst_u, dst_stride, xd->dst.u_buffer, src_stride);
- fn(dst_v, dst_stride, xd->dst.v_buffer, src_stride);
-}
-
-void vp9_build_intra_predictors_mbuv_sse2(MACROBLOCKD *xd) {
- build_intra_predictors_mbuv_x86(xd, &xd->predictor[256],
- &xd->predictor[320], 8,
- vp9_intra_pred_uv_tm_sse2,
- vp9_intra_pred_uv_ho_mmx2);
-}
-
-void vp9_build_intra_predictors_mbuv_ssse3(MACROBLOCKD *xd) {
- build_intra_predictors_mbuv_x86(xd, &xd->predictor[256],
- &xd->predictor[320], 8,
- vp9_intra_pred_uv_tm_ssse3,
- vp9_intra_pred_uv_ho_ssse3);
-}
-
-void vp9_build_intra_predictors_mbuv_s_sse2(MACROBLOCKD *xd) {
- build_intra_predictors_mbuv_x86(xd, xd->dst.u_buffer,
- xd->dst.v_buffer, xd->dst.uv_stride,
- vp9_intra_pred_uv_tm_sse2,
- vp9_intra_pred_uv_ho_mmx2);
-}
-
-void vp9_build_intra_predictors_mbuv_s_ssse3(MACROBLOCKD *xd) {
- build_intra_predictors_mbuv_x86(xd, xd->dst.u_buffer,
- xd->dst.v_buffer, xd->dst.uv_stride,
- vp9_intra_pred_uv_tm_ssse3,
- vp9_intra_pred_uv_ho_ssse3);
-}
--- a/vp8/common/x86/sadmxn_x86.c
+++ /dev/null
@@ -1,92 +1,0 @@
-/*
- * Copyright (c) 2012 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include <emmintrin.h> // SSE2
-#include "./vpx_config.h"
-#include "./vpx_rtcd.h"
-
-
-#if CONFIG_NEWBESTREFMV
-
-
-#if HAVE_SSE2
-unsigned int vp9_sad16x3_sse2(
- const unsigned char *src_ptr,
- int src_stride,
- const unsigned char *ref_ptr,
- int ref_stride,
- int max_sad) {
- __m128i s0, s1, s2;
- __m128i r0, r1, r2;
- __m128i sad;
-
- (void)max_sad;
-
- s0 = _mm_loadu_si128((const __m128i *)(src_ptr + 0 * src_stride));
- s1 = _mm_loadu_si128((const __m128i *)(src_ptr + 1 * src_stride));
- s2 = _mm_loadu_si128((const __m128i *)(src_ptr + 2 * src_stride));
-
- r0 = _mm_loadu_si128((const __m128i *)(ref_ptr + 0 * src_stride));
- r1 = _mm_loadu_si128((const __m128i *)(ref_ptr + 1 * src_stride));
- r2 = _mm_loadu_si128((const __m128i *)(ref_ptr + 2 * src_stride));
-
- sad = _mm_sad_epu8(s0, r0);
- sad = _mm_add_epi16(sad, _mm_sad_epu8(s1, r1));
- sad = _mm_add_epi16(sad, _mm_sad_epu8(s2, r2));
- sad = _mm_add_epi16(sad, _mm_srli_si128(sad, 8));
-
- return _mm_cvtsi128_si32(sad);
-}
-
-unsigned int vp9_sad3x16_sse2(
- const unsigned char *src_ptr,
- int src_stride,
- const unsigned char *ref_ptr,
- int ref_stride,
- int max_sad) {
- int r;
- __m128i s0, s1, s2, s3;
- __m128i r0, r1, r2, r3;
- __m128i sad = _mm_set1_epi16(0);
- for (r = 0; r < 16; r += 4) {
- s0 = _mm_cvtsi32_si128 (*(const int *)(src_ptr + 0 * src_stride));
- s1 = _mm_cvtsi32_si128 (*(const int *)(src_ptr + 1 * src_stride));
- s2 = _mm_cvtsi32_si128 (*(const int *)(src_ptr + 2 * src_stride));
- s3 = _mm_cvtsi32_si128 (*(const int *)(src_ptr + 3 * src_stride));
- r0 = _mm_cvtsi32_si128 (*(const int *)(ref_ptr + 0 * src_stride));
- r1 = _mm_cvtsi32_si128 (*(const int *)(ref_ptr + 1 * src_stride));
- r2 = _mm_cvtsi32_si128 (*(const int *)(ref_ptr + 2 * src_stride));
- r3 = _mm_cvtsi32_si128 (*(const int *)(ref_ptr + 3 * src_stride));
-
- s0 = _mm_unpacklo_epi8(s0, s1);
- r0 = _mm_unpacklo_epi8(r0, r1);
- s2 = _mm_unpacklo_epi8(s2, s3);
- r2 = _mm_unpacklo_epi8(r2, r3);
- s0 = _mm_unpacklo_epi64(s0, s2);
- r0 = _mm_unpacklo_epi64(r0, r2);
-
- // throw out byte 3
- s0 = _mm_slli_epi64(s0, 16);
- r0 = _mm_slli_epi64(r0, 16);
-
- sad = _mm_add_epi16(sad, _mm_sad_epu8(s0, r0));
-
- src_ptr += src_stride*4;
- ref_ptr += ref_stride*4;
- }
-
- sad = _mm_add_epi16(sad, _mm_srli_si128(sad, 8));
- return _mm_cvtsi128_si32(sad);
-}
-
-#endif
-
-
-#endif // CONFIG_NEWBESTREFMV
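A plain-C equivalent of vp9_sad16x3_sse2 above may help when reading the intrinsics. This is a hypothetical sketch only; note that the SSE2 code indexes the reference rows with src_stride, so the sketch assumes the two strides are interchangeable here.

static unsigned int sad16x3_ref_c(const unsigned char *src, int src_stride,
                                  const unsigned char *ref, int ref_stride) {
  unsigned int sad = 0;
  int r, c;
  for (r = 0; r < 3; r++) {
    for (c = 0; c < 16; c++) {
      /* absolute difference, accumulated as _mm_sad_epu8 does */
      int d = src[r * src_stride + c] - ref[r * ref_stride + c];
      sad += d < 0 ? -d : d;
    }
  }
  return sad;
}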
--- a/vp8/common/x86/subpixel_8t_ssse3.asm
+++ /dev/null
@@ -1,550 +1,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
-%include "vpx_ports/x86_abi_support.asm"
-
-;/************************************************************************************
-; Notes: the filter_block1d8/16_v8/h8 routines below apply an 8 tap filter vertically or
-; horizontally to the input pixels. The input pixel array has output_height rows. Each
-; function handles 8 or 16 pixels in the horizontal direction, calculating one row per
-; iteration to take advantage of the 128-bit operations.
-;
-; This is an implementation of some of the SSE optimizations first seen in ffvp8
-;
-;*************************************************************************************/
-
-;void vp9_filter_block1d8_v8_ssse3
-;(
-; unsigned char *src_ptr,
-; unsigned int src_pitch,
-; unsigned char *output_ptr,
-; unsigned int out_pitch,
-; unsigned int output_height,
-; short *filter
-;)
-global sym(vp9_filter_block1d8_v8_ssse3)
-sym(vp9_filter_block1d8_v8_ssse3):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 6
- SAVE_XMM 7
- push rsi
- push rdi
- push rbx
- ; end prolog
-
- ALIGN_STACK 16, rax
- sub rsp, 16*5
- %define k0k1 [rsp + 16*0]
- %define k2k3 [rsp + 16*1]
- %define k4k5 [rsp + 16*2]
- %define k6k7 [rsp + 16*3]
- %define krd [rsp + 16*4]
-
- mov rdx, arg(5) ;filter ptr
- mov rsi, arg(0) ;src_ptr
- mov rdi, arg(2) ;output_ptr
- mov rcx, 0x0400040
-
- movdqa xmm4, [rdx] ;load filters
- movd xmm5, rcx
- packsswb xmm4, xmm4
- pshuflw xmm0, xmm4, 0b ;k0_k1
- pshuflw xmm1, xmm4, 01010101b ;k2_k3
- pshuflw xmm2, xmm4, 10101010b ;k4_k5
- pshuflw xmm3, xmm4, 11111111b ;k6_k7
-
- punpcklqdq xmm0, xmm0
- punpcklqdq xmm1, xmm1
- punpcklqdq xmm2, xmm2
- punpcklqdq xmm3, xmm3
-
- movdqa k0k1, xmm0
- movdqa k2k3, xmm1
- pshufd xmm5, xmm5, 0
- movdqa k4k5, xmm2
- movdqa k6k7, xmm3
- movdqa krd, xmm5
-
- movsxd rdx, DWORD PTR arg(1) ;pixels_per_line
-
-%if ABI_IS_32BIT=0
- movsxd r8, DWORD PTR arg(3) ;out_pitch
-%endif
- mov rax, rsi
- movsxd rcx, DWORD PTR arg(4) ;output_height
- add rax, rdx
-
- lea rbx, [rdx + rdx*4]
- add rbx, rdx ;pitch * 6
-
-.vp9_filter_block1d8_v8_ssse3_loop:
- movq xmm0, [rsi] ;A
- movq xmm1, [rsi + rdx] ;B
- movq xmm2, [rsi + rdx * 2] ;C
- movq xmm3, [rax + rdx * 2] ;D
- movq xmm4, [rsi + rdx * 4] ;E
- movq xmm5, [rax + rdx * 4] ;F
-
- punpcklbw xmm0, xmm1 ;A B
- punpcklbw xmm2, xmm3 ;C D
- punpcklbw xmm4, xmm5 ;E F
-
- movq xmm6, [rsi + rbx] ;G
- movq xmm7, [rax + rbx] ;H
-
- pmaddubsw xmm0, k0k1
- pmaddubsw xmm2, k2k3
- punpcklbw xmm6, xmm7 ;G H
- pmaddubsw xmm4, k4k5
- pmaddubsw xmm6, k6k7
-
- paddsw xmm0, xmm2
- paddsw xmm0, krd
- paddsw xmm4, xmm6
- paddsw xmm0, xmm4
-
- psraw xmm0, 7
- packuswb xmm0, xmm0
-
- add rsi, rdx
- add rax, rdx
-
- movq [rdi], xmm0
-
-%if ABI_IS_32BIT
- add rdi, DWORD PTR arg(3) ;out_pitch
-%else
- add rdi, r8
-%endif
- dec rcx
- jnz .vp9_filter_block1d8_v8_ssse3_loop
-
- add rsp, 16*5
- pop rsp
- pop rbx
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
-
-;void vp9_filter_block1d16_v8_ssse3
-;(
-; unsigned char *src_ptr,
-; unsigned int src_pitch,
-; unsigned char *output_ptr,
-; unsigned int out_pitch,
-; unsigned int output_height,
-; short *filter
-;)
-global sym(vp9_filter_block1d16_v8_ssse3)
-sym(vp9_filter_block1d16_v8_ssse3):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 6
- SAVE_XMM 7
- push rsi
- push rdi
- push rbx
- ; end prolog
-
- ALIGN_STACK 16, rax
- sub rsp, 16*5
- %define k0k1 [rsp + 16*0]
- %define k2k3 [rsp + 16*1]
- %define k4k5 [rsp + 16*2]
- %define k6k7 [rsp + 16*3]
- %define krd [rsp + 16*4]
-
- mov rdx, arg(5) ;filter ptr
- mov rsi, arg(0) ;src_ptr
- mov rdi, arg(2) ;output_ptr
- mov rcx, 0x0400040
-
- movdqa xmm4, [rdx] ;load filters
- movd xmm5, rcx
- packsswb xmm4, xmm4
- pshuflw xmm0, xmm4, 0b ;k0_k1
- pshuflw xmm1, xmm4, 01010101b ;k2_k3
- pshuflw xmm2, xmm4, 10101010b ;k4_k5
- pshuflw xmm3, xmm4, 11111111b ;k6_k7
-
- punpcklqdq xmm0, xmm0
- punpcklqdq xmm1, xmm1
- punpcklqdq xmm2, xmm2
- punpcklqdq xmm3, xmm3
-
- movdqa k0k1, xmm0
- movdqa k2k3, xmm1
- pshufd xmm5, xmm5, 0
- movdqa k4k5, xmm2
- movdqa k6k7, xmm3
- movdqa krd, xmm5
-
- movsxd rdx, DWORD PTR arg(1) ;pixels_per_line
-
-%if ABI_IS_32BIT=0
- movsxd r8, DWORD PTR arg(3) ;out_pitch
-%endif
- mov rax, rsi
- movsxd rcx, DWORD PTR arg(4) ;output_height
- add rax, rdx
-
- lea rbx, [rdx + rdx*4]
- add rbx, rdx ;pitch * 6
-
-.vp9_filter_block1d16_v8_ssse3_loop:
- movq xmm0, [rsi] ;A
- movq xmm1, [rsi + rdx] ;B
- movq xmm2, [rsi + rdx * 2] ;C
- movq xmm3, [rax + rdx * 2] ;D
- movq xmm4, [rsi + rdx * 4] ;E
- movq xmm5, [rax + rdx * 4] ;F
-
- punpcklbw xmm0, xmm1 ;A B
- punpcklbw xmm2, xmm3 ;C D
- punpcklbw xmm4, xmm5 ;E F
-
- movq xmm6, [rsi + rbx] ;G
- movq xmm7, [rax + rbx] ;H
-
- pmaddubsw xmm0, k0k1
- pmaddubsw xmm2, k2k3
- punpcklbw xmm6, xmm7 ;G H
- pmaddubsw xmm4, k4k5
- pmaddubsw xmm6, k6k7
-
- paddsw xmm0, xmm2
- paddsw xmm0, krd
- paddsw xmm4, xmm6
- paddsw xmm0, xmm4
-
- psraw xmm0, 7
- packuswb xmm0, xmm0
-
- movq [rdi], xmm0
-
- movq xmm0, [rsi + 8] ;A
- movq xmm1, [rsi + rdx + 8] ;B
- movq xmm2, [rsi + rdx * 2 + 8] ;C
- movq xmm3, [rax + rdx * 2 + 8] ;D
- movq xmm4, [rsi + rdx * 4 + 8] ;E
- movq xmm5, [rax + rdx * 4 + 8] ;F
-
- punpcklbw xmm0, xmm1 ;A B
- punpcklbw xmm2, xmm3 ;C D
- punpcklbw xmm4, xmm5 ;E F
-
-
- movq xmm6, [rsi + rbx + 8] ;G
- movq xmm7, [rax + rbx + 8] ;H
- punpcklbw xmm6, xmm7 ;G H
-
-
- pmaddubsw xmm0, k0k1
- pmaddubsw xmm2, k2k3
- pmaddubsw xmm4, k4k5
- pmaddubsw xmm6, k6k7
-
- paddsw xmm0, xmm2
- paddsw xmm4, xmm6
- paddsw xmm0, krd
- paddsw xmm0, xmm4
-
- psraw xmm0, 7
- packuswb xmm0, xmm0
-
- add rsi, rdx
- add rax, rdx
-
- movq [rdi+8], xmm0
-
-%if ABI_IS_32BIT
- add rdi, DWORD PTR arg(3) ;out_pitch
-%else
- add rdi, r8
-%endif
- dec rcx
- jnz .vp9_filter_block1d16_v8_ssse3_loop
-
- add rsp, 16*5
- pop rsp
- pop rbx
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
-
-;void vp9_filter_block1d8_h8_ssse3
-;(
-; unsigned char *src_ptr,
-; unsigned int src_pixels_per_line,
-; unsigned char *output_ptr,
-; unsigned int output_pitch,
-; unsigned int output_height,
-; short *filter
-;)
-global sym(vp9_filter_block1d8_h8_ssse3)
-sym(vp9_filter_block1d8_h8_ssse3):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 6
- SAVE_XMM 7
- GET_GOT rbx
- push rsi
- push rdi
- ; end prolog
-
- ALIGN_STACK 16, rax
- sub rsp, 16*5
- %define k0k1 [rsp + 16*0]
- %define k2k3 [rsp + 16*1]
- %define k4k5 [rsp + 16*2]
- %define k6k7 [rsp + 16*3]
- %define krd [rsp + 16*4]
-
- mov rdx, arg(5) ;filter ptr
- mov rsi, arg(0) ;src_ptr
- mov rdi, arg(2) ;output_ptr
- mov rcx, 0x0400040
-
- movdqa xmm4, [rdx] ;load filters
- movd xmm5, rcx
- packsswb xmm4, xmm4
- pshuflw xmm0, xmm4, 0b ;k0_k1
- pshuflw xmm1, xmm4, 01010101b ;k2_k3
- pshuflw xmm2, xmm4, 10101010b ;k4_k5
- pshuflw xmm3, xmm4, 11111111b ;k6_k7
-
- punpcklqdq xmm0, xmm0
- punpcklqdq xmm1, xmm1
- punpcklqdq xmm2, xmm2
- punpcklqdq xmm3, xmm3
-
- movdqa k0k1, xmm0
- movdqa k2k3, xmm1
- pshufd xmm5, xmm5, 0
- movdqa k4k5, xmm2
- movdqa k6k7, xmm3
-; movdqa krd, xmm5
-
- movsxd rax, dword ptr arg(1) ;src_pixels_per_line
- movsxd rdx, dword ptr arg(3) ;output_pitch
- movsxd rcx, dword ptr arg(4) ;output_height
-
-.filter_block1d8_h8_rowloop_ssse3:
- movq xmm0, [rsi - 3] ; -3 -2 -1 0 1 2 3 4
-
-; movq xmm3, [rsi + 4] ; 4 5 6 7 8 9 10 11
- movq xmm3, [rsi + 5] ; 5 6 7 8 9 10 11 12
-;note: if we create a k0_k7 filter, we can save a pshufb
-; punpcklbw xmm0, xmm3 ; -3 4 -2 5 -1 6 0 7 1 8 2 9 3 10 4 11
- punpcklqdq xmm0, xmm3
-
- movdqa xmm1, xmm0
- pshufb xmm0, [GLOBAL(shuf_t0t1)]
- pmaddubsw xmm0, k0k1
-
- movdqa xmm2, xmm1
- pshufb xmm1, [GLOBAL(shuf_t2t3)]
- pmaddubsw xmm1, k2k3
-
- movdqa xmm4, xmm2
- pshufb xmm2, [GLOBAL(shuf_t4t5)]
- pmaddubsw xmm2, k4k5
-
- pshufb xmm4, [GLOBAL(shuf_t6t7)]
- pmaddubsw xmm4, k6k7
-
- paddsw xmm0, xmm1
- paddsw xmm0, xmm2
- paddsw xmm0, xmm5
- paddsw xmm0, xmm4
- psraw xmm0, 7
- packuswb xmm0, xmm0
-
- lea rsi, [rsi + rax]
- movq [rdi], xmm0
-
- lea rdi, [rdi + rdx]
- dec rcx
- jnz .filter_block1d8_h8_rowloop_ssse3
-
- add rsp, 16*5
- pop rsp
-
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_GOT
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
-
-;void vp9_filter_block1d16_h8_ssse3
-;(
-; unsigned char *src_ptr,
-; unsigned int src_pixels_per_line,
-; unsigned char *output_ptr,
-; unsigned int output_pitch,
-; unsigned int output_height,
-; short *filter
-;)
-global sym(vp9_filter_block1d16_h8_ssse3)
-sym(vp9_filter_block1d16_h8_ssse3):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 6
- SAVE_XMM 7
- GET_GOT rbx
- push rsi
- push rdi
- ; end prolog
-
- ALIGN_STACK 16, rax
- sub rsp, 16*5
- %define k0k1 [rsp + 16*0]
- %define k2k3 [rsp + 16*1]
- %define k4k5 [rsp + 16*2]
- %define k6k7 [rsp + 16*3]
- %define krd [rsp + 16*4]
-
- mov rdx, arg(5) ;filter ptr
- mov rsi, arg(0) ;src_ptr
- mov rdi, arg(2) ;output_ptr
- mov rcx, 0x0400040
-
- movdqa xmm4, [rdx] ;load filters
- movd xmm5, rcx
- packsswb xmm4, xmm4
- pshuflw xmm0, xmm4, 0b ;k0_k1
- pshuflw xmm1, xmm4, 01010101b ;k2_k3
- pshuflw xmm2, xmm4, 10101010b ;k4_k5
- pshuflw xmm3, xmm4, 11111111b ;k6_k7
-
- punpcklqdq xmm0, xmm0
- punpcklqdq xmm1, xmm1
- punpcklqdq xmm2, xmm2
- punpcklqdq xmm3, xmm3
-
- movdqa k0k1, xmm0
- movdqa k2k3, xmm1
- pshufd xmm5, xmm5, 0
- movdqa k4k5, xmm2
- movdqa k6k7, xmm3
- movdqa krd, xmm5
-
- movsxd rax, dword ptr arg(1) ;src_pixels_per_line
- movsxd rdx, dword ptr arg(3) ;output_pitch
- movsxd rcx, dword ptr arg(4) ;output_height
-
-.filter_block1d16_h8_rowloop_ssse3:
- movq xmm0, [rsi - 3] ; -3 -2 -1 0 1 2 3 4
-
-; movq xmm3, [rsi + 4] ; 4 5 6 7 8 9 10 11
- movq xmm3, [rsi + 5] ; 5 6 7 8 9 10 11 12
-;note: if we create a k0_k7 filter, we can save a pshufb
-; punpcklbw xmm0, xmm3 ; -3 4 -2 5 -1 6 0 7 1 8 2 9 3 10 4 11
- punpcklqdq xmm0, xmm3
-
- movdqa xmm1, xmm0
- pshufb xmm0, [GLOBAL(shuf_t0t1)]
- pmaddubsw xmm0, k0k1
-
- movdqa xmm2, xmm1
- pshufb xmm1, [GLOBAL(shuf_t2t3)]
- pmaddubsw xmm1, k2k3
-
- movdqa xmm4, xmm2
- pshufb xmm2, [GLOBAL(shuf_t4t5)]
- pmaddubsw xmm2, k4k5
-
- pshufb xmm4, [GLOBAL(shuf_t6t7)]
- pmaddubsw xmm4, k6k7
-
- paddsw xmm0, xmm1
- paddsw xmm0, xmm4
- paddsw xmm0, xmm2
- paddsw xmm0, krd
- psraw xmm0, 7
- packuswb xmm0, xmm0
-
-
- movq xmm3, [rsi + 5]
-; movq xmm7, [rsi + 12]
- movq xmm7, [rsi + 13]
-;note: same as above
-; punpcklbw xmm3, xmm7
- punpcklqdq xmm3, xmm7
-
- movdqa xmm1, xmm3
- pshufb xmm3, [GLOBAL(shuf_t0t1)]
- pmaddubsw xmm3, k0k1
-
- movdqa xmm2, xmm1
- pshufb xmm1, [GLOBAL(shuf_t2t3)]
- pmaddubsw xmm1, k2k3
-
- movdqa xmm4, xmm2
- pshufb xmm2, [GLOBAL(shuf_t4t5)]
- pmaddubsw xmm2, k4k5
-
- pshufb xmm4, [GLOBAL(shuf_t6t7)]
- pmaddubsw xmm4, k6k7
-
- paddsw xmm3, xmm1
- paddsw xmm3, xmm2
- paddsw xmm3, krd
- paddsw xmm3, xmm4
- psraw xmm3, 7
- packuswb xmm3, xmm3
- punpcklqdq xmm0, xmm3
-
- lea rsi, [rsi + rax]
- movdqa [rdi], xmm0
-
- lea rdi, [rdi + rdx]
- dec rcx
- jnz .filter_block1d16_h8_rowloop_ssse3
-
- add rsp, 16*5
- pop rsp
-
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_GOT
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
-
-
-SECTION_RODATA
-align 16
-shuf_t0t1:
- db 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8
-align 16
-shuf_t2t3:
- db 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10
-align 16
-shuf_t4t5:
- db 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12
-align 16
-shuf_t6t7:
- db 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14
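The per-pixel operation behind the vp9_filter_block1d*_v8/h8_ssse3 routines above can be summarised by the scalar sketch below (hypothetical, illustration only): an 8-tap multiply-accumulate, the 0x0040 rounding term loaded into krd, an arithmetic shift by 7, and an unsigned-saturating pack. The intermediate signed saturation of the real pmaddubsw/paddsw sequence is ignored here.

static unsigned char filter8_px(const unsigned char *p, int pitch,
                                const short *filter) {
  int k, sum = 64;                 /* rounding term, 1 << 6, i.e. the krd constant */
  for (k = 0; k < 8; k++)
    sum += p[k * pitch] * filter[k];
  sum >>= 7;                       /* psraw xmm0, 7 */
  if (sum < 0) sum = 0;            /* packuswb saturation */
  if (sum > 255) sum = 255;
  return (unsigned char)sum;
}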
--- a/vp8/common/x86/subpixel_mmx.asm
+++ /dev/null
@@ -1,727 +1,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
-%include "vpx_ports/x86_abi_support.asm"
-
-
-%define BLOCK_HEIGHT_WIDTH 4
-%define vp9_filter_weight 128
-%define VP9_FILTER_SHIFT 7
-
-
-;void vp9_filter_block1d_h6_mmx
-;(
-; unsigned char *src_ptr,
-; unsigned short *output_ptr,
-; unsigned int src_pixels_per_line,
-; unsigned int pixel_step,
-; unsigned int output_height,
-; unsigned int output_width,
-; short * vp9_filter
-;)
-global sym(vp9_filter_block1d_h6_mmx)
-sym(vp9_filter_block1d_h6_mmx):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 7
- GET_GOT rbx
- push rsi
- push rdi
- ; end prolog
-
- mov rdx, arg(6) ;vp9_filter
-
- movq mm1, [rdx + 16] ; do both the negative taps first!!!
- movq mm2, [rdx + 32] ;
- movq mm6, [rdx + 48] ;
- movq mm7, [rdx + 64] ;
-
- mov rdi, arg(1) ;output_ptr
- mov rsi, arg(0) ;src_ptr
- movsxd rcx, dword ptr arg(4) ;output_height
- movsxd rax, dword ptr arg(5) ;output_width ; destination pitch?
- pxor mm0, mm0 ; mm0 = 00000000
-
-.nextrow:
- movq mm3, [rsi-2] ; mm3 = p-2..p5
- movq mm4, mm3 ; mm4 = p-2..p5
- psrlq mm3, 8 ; mm3 = p-1..p5
- punpcklbw mm3, mm0 ; mm3 = p-1..p2
- pmullw mm3, mm1 ; mm3 *= kernel 1 modifiers.
-
- movq mm5, mm4 ; mm5 = p-2..p5
- punpckhbw mm4, mm0 ; mm5 = p2..p5
- pmullw mm4, mm7 ; mm5 *= kernel 4 modifiers
- paddsw mm3, mm4 ; mm3 += mm5
-
- movq mm4, mm5 ; mm4 = p-2..p5;
- psrlq mm5, 16 ; mm5 = p0..p5;
- punpcklbw mm5, mm0 ; mm5 = p0..p3
- pmullw mm5, mm2 ; mm5 *= kernel 2 modifiers
- paddsw mm3, mm5 ; mm3 += mm5
-
- movq mm5, mm4 ; mm5 = p-2..p5
- psrlq mm4, 24 ; mm4 = p1..p5
- punpcklbw mm4, mm0 ; mm4 = p1..p4
- pmullw mm4, mm6 ; mm5 *= kernel 3 modifiers
- paddsw mm3, mm4 ; mm3 += mm5
-
- ; do outer positive taps
- movd mm4, [rsi+3]
- punpcklbw mm4, mm0 ; mm5 = p3..p6
- pmullw mm4, [rdx+80] ; mm5 *= kernel 0 modifiers
- paddsw mm3, mm4 ; mm3 += mm5
-
- punpcklbw mm5, mm0 ; mm5 = p-2..p1
- pmullw mm5, [rdx] ; mm5 *= kernel 5 modifiers
- paddsw mm3, mm5 ; mm3 += mm5
-
- paddsw mm3, [GLOBAL(rd)] ; mm3 += round value
- psraw mm3, VP9_FILTER_SHIFT ; mm3 /= 128
- packuswb mm3, mm0 ; pack and unpack to saturate
- punpcklbw mm3, mm0 ;
-
- movq [rdi], mm3 ; store the results in the destination
-
-%if ABI_IS_32BIT
- add rsi, dword ptr arg(2) ;src_pixels_per_line ; next line
- add rdi, rax;
-%else
- movsxd r8, dword ptr arg(2) ;src_pixels_per_line
- add rdi, rax;
-
- add rsi, r8 ; next line
-%endif
-
- dec rcx ; decrement count
- jnz .nextrow ; next row
-
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_GOT
- UNSHADOW_ARGS
- pop rbp
- ret
-
-
-;void vp9_filter_block1dc_v6_mmx
-;(
-; short *src_ptr,
-; unsigned char *output_ptr,
-; int output_pitch,
-; unsigned int pixels_per_line,
-; unsigned int pixel_step,
-; unsigned int output_height,
-; unsigned int output_width,
-; short * vp9_filter
-;)
-global sym(vp9_filter_block1dc_v6_mmx)
-sym(vp9_filter_block1dc_v6_mmx):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 8
- GET_GOT rbx
- push rsi
- push rdi
- ; end prolog
-
- movq mm5, [GLOBAL(rd)]
- push rbx
- mov rbx, arg(7) ;vp9_filter
- movq mm1, [rbx + 16] ; do both the negative taps first!!!
- movq mm2, [rbx + 32] ;
- movq mm6, [rbx + 48] ;
- movq mm7, [rbx + 64] ;
-
- movsxd rdx, dword ptr arg(3) ;pixels_per_line
- mov rdi, arg(1) ;output_ptr
- mov rsi, arg(0) ;src_ptr
- sub rsi, rdx
- sub rsi, rdx
- movsxd rcx, DWORD PTR arg(5) ;output_height
- movsxd rax, DWORD PTR arg(2) ;output_pitch ; destination pitch?
- pxor mm0, mm0 ; mm0 = 00000000
-
-
-.nextrow_cv:
- movq mm3, [rsi+rdx] ; mm3 = p0..p8 = row -1
- pmullw mm3, mm1 ; mm3 *= kernel 1 modifiers.
-
-
- movq mm4, [rsi + 4*rdx] ; mm4 = p0..p3 = row 2
- pmullw mm4, mm7 ; mm4 *= kernel 4 modifiers.
- paddsw mm3, mm4 ; mm3 += mm4
-
- movq mm4, [rsi + 2*rdx] ; mm4 = p0..p3 = row 0
- pmullw mm4, mm2 ; mm4 *= kernel 2 modifiers.
- paddsw mm3, mm4 ; mm3 += mm4
-
- movq mm4, [rsi] ; mm4 = p0..p3 = row -2
- pmullw mm4, [rbx] ; mm4 *= kernel 0 modifiers.
- paddsw mm3, mm4 ; mm3 += mm4
-
-
- add rsi, rdx ; move source forward 1 line to avoid 3 * pitch
- movq mm4, [rsi + 2*rdx] ; mm4 = p0..p3 = row 1
- pmullw mm4, mm6 ; mm4 *= kernel 3 modifiers.
- paddsw mm3, mm4 ; mm3 += mm4
-
- movq mm4, [rsi + 4*rdx] ; mm4 = p0..p3 = row 3
- pmullw mm4, [rbx +80] ; mm4 *= kernel 3 modifiers.
- paddsw mm3, mm4 ; mm3 += mm4
-
-
- paddsw mm3, mm5 ; mm3 += round value
- psraw mm3, VP9_FILTER_SHIFT ; mm3 /= 128
- packuswb mm3, mm0 ; pack and saturate
-
- movd [rdi],mm3 ; store the results in the destination
- ; the subsequent iterations repeat 3 out of 4 of these reads. Since the
-        ; recon block should be in cache, this shouldn't cost much. It's obviously
-        ; avoidable.
- lea rdi, [rdi+rax] ;
- dec rcx ; decrement count
- jnz .nextrow_cv ; next row
-
- pop rbx
-
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_GOT
- UNSHADOW_ARGS
- pop rbp
- ret
-
-
-;void vp9_bilinear_predict8x8_mmx
-;(
-; unsigned char *src_ptr,
-; int src_pixels_per_line,
-; int xoffset,
-; int yoffset,
-; unsigned char *dst_ptr,
-; int dst_pitch
-;)
-global sym(vp9_bilinear_predict8x8_mmx)
-sym(vp9_bilinear_predict8x8_mmx):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 6
- GET_GOT rbx
- push rsi
- push rdi
- ; end prolog
-
- ;const short *HFilter = bilinear_filters_mmx[xoffset];
- ;const short *VFilter = bilinear_filters_mmx[yoffset];
-
- movsxd rax, dword ptr arg(2) ;xoffset
- mov rdi, arg(4) ;dst_ptr ;
-
- shl rax, 5 ; offset * 32
- lea rcx, [GLOBAL(sym(vp9_bilinear_filters_8x_mmx))]
-
- add rax, rcx ; HFilter
- mov rsi, arg(0) ;src_ptr ;
-
- movsxd rdx, dword ptr arg(5) ;dst_pitch
- movq mm1, [rax] ;
-
- movq mm2, [rax+16] ;
- movsxd rax, dword ptr arg(3) ;yoffset
-
- pxor mm0, mm0 ;
-
- shl rax, 5 ; offset*32
- add rax, rcx ; VFilter
-
- lea rcx, [rdi+rdx*8] ;
- movsxd rdx, dword ptr arg(1) ;src_pixels_per_line ;
-
-
-
- ; get the first horizontal line done ;
- movq mm3, [rsi] ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14
- movq mm4, mm3 ; make a copy of current line
-
- punpcklbw mm3, mm0 ; xx 00 01 02 03 04 05 06
- punpckhbw mm4, mm0 ;
-
- pmullw mm3, mm1 ;
- pmullw mm4, mm1 ;
-
- movq mm5, [rsi+1] ;
- movq mm6, mm5 ;
-
- punpcklbw mm5, mm0 ;
- punpckhbw mm6, mm0 ;
-
- pmullw mm5, mm2 ;
- pmullw mm6, mm2 ;
-
- paddw mm3, mm5 ;
- paddw mm4, mm6 ;
-
- paddw mm3, [GLOBAL(rd)] ; xmm3 += round value
- psraw mm3, VP9_FILTER_SHIFT ; xmm3 /= 128
-
- paddw mm4, [GLOBAL(rd)] ;
- psraw mm4, VP9_FILTER_SHIFT ;
-
- movq mm7, mm3 ;
- packuswb mm7, mm4 ;
-
- add rsi, rdx ; next line
-.next_row_8x8:
- movq mm3, [rsi] ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14
- movq mm4, mm3 ; make a copy of current line
-
- punpcklbw mm3, mm0 ; xx 00 01 02 03 04 05 06
- punpckhbw mm4, mm0 ;
-
- pmullw mm3, mm1 ;
- pmullw mm4, mm1 ;
-
- movq mm5, [rsi+1] ;
- movq mm6, mm5 ;
-
- punpcklbw mm5, mm0 ;
- punpckhbw mm6, mm0 ;
-
- pmullw mm5, mm2 ;
- pmullw mm6, mm2 ;
-
- paddw mm3, mm5 ;
- paddw mm4, mm6 ;
-
- movq mm5, mm7 ;
- movq mm6, mm7 ;
-
- punpcklbw mm5, mm0 ;
- punpckhbw mm6, mm0
-
- pmullw mm5, [rax] ;
- pmullw mm6, [rax] ;
-
- paddw mm3, [GLOBAL(rd)] ; xmm3 += round value
- psraw mm3, VP9_FILTER_SHIFT ; xmm3 /= 128
-
- paddw mm4, [GLOBAL(rd)] ;
- psraw mm4, VP9_FILTER_SHIFT ;
-
- movq mm7, mm3 ;
- packuswb mm7, mm4 ;
-
-
- pmullw mm3, [rax+16] ;
- pmullw mm4, [rax+16] ;
-
- paddw mm3, mm5 ;
- paddw mm4, mm6 ;
-
-
- paddw mm3, [GLOBAL(rd)] ; xmm3 += round value
- psraw mm3, VP9_FILTER_SHIFT ; xmm3 /= 128
-
- paddw mm4, [GLOBAL(rd)] ;
- psraw mm4, VP9_FILTER_SHIFT ;
-
- packuswb mm3, mm4
-
- movq [rdi], mm3 ; store the results in the destination
-
-%if ABI_IS_32BIT
- add rsi, rdx ; next line
- add rdi, dword ptr arg(5) ;dst_pitch ;
-%else
- movsxd r8, dword ptr arg(5) ;dst_pitch
- add rsi, rdx ; next line
- add rdi, r8 ;dst_pitch
-%endif
- cmp rdi, rcx ;
- jne .next_row_8x8
-
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_GOT
- UNSHADOW_ARGS
- pop rbp
- ret
-
-
-;void vp9_bilinear_predict8x4_mmx
-;(
-; unsigned char *src_ptr,
-; int src_pixels_per_line,
-; int xoffset,
-; int yoffset,
-; unsigned char *dst_ptr,
-; int dst_pitch
-;)
-global sym(vp9_bilinear_predict8x4_mmx)
-sym(vp9_bilinear_predict8x4_mmx):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 6
- GET_GOT rbx
- push rsi
- push rdi
- ; end prolog
-
- ;const short *HFilter = bilinear_filters_mmx[xoffset];
- ;const short *VFilter = bilinear_filters_mmx[yoffset];
-
- movsxd rax, dword ptr arg(2) ;xoffset
- mov rdi, arg(4) ;dst_ptr ;
-
- lea rcx, [GLOBAL(sym(vp9_bilinear_filters_8x_mmx))]
- shl rax, 5
-
- mov rsi, arg(0) ;src_ptr ;
- add rax, rcx
-
- movsxd rdx, dword ptr arg(5) ;dst_pitch
- movq mm1, [rax] ;
-
- movq mm2, [rax+16] ;
- movsxd rax, dword ptr arg(3) ;yoffset
-
- pxor mm0, mm0 ;
- shl rax, 5
-
- add rax, rcx
- lea rcx, [rdi+rdx*4] ;
-
- movsxd rdx, dword ptr arg(1) ;src_pixels_per_line ;
-
- ; get the first horizontal line done ;
- movq mm3, [rsi] ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14
- movq mm4, mm3 ; make a copy of current line
-
- punpcklbw mm3, mm0 ; xx 00 01 02 03 04 05 06
- punpckhbw mm4, mm0 ;
-
- pmullw mm3, mm1 ;
- pmullw mm4, mm1 ;
-
- movq mm5, [rsi+1] ;
- movq mm6, mm5 ;
-
- punpcklbw mm5, mm0 ;
- punpckhbw mm6, mm0 ;
-
- pmullw mm5, mm2 ;
- pmullw mm6, mm2 ;
-
- paddw mm3, mm5 ;
- paddw mm4, mm6 ;
-
- paddw mm3, [GLOBAL(rd)] ; xmm3 += round value
- psraw mm3, VP9_FILTER_SHIFT ; xmm3 /= 128
-
- paddw mm4, [GLOBAL(rd)] ;
- psraw mm4, VP9_FILTER_SHIFT ;
-
- movq mm7, mm3 ;
- packuswb mm7, mm4 ;
-
- add rsi, rdx ; next line
-.next_row_8x4:
- movq mm3, [rsi] ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14
- movq mm4, mm3 ; make a copy of current line
-
- punpcklbw mm3, mm0 ; xx 00 01 02 03 04 05 06
- punpckhbw mm4, mm0 ;
-
- pmullw mm3, mm1 ;
- pmullw mm4, mm1 ;
-
- movq mm5, [rsi+1] ;
- movq mm6, mm5 ;
-
- punpcklbw mm5, mm0 ;
- punpckhbw mm6, mm0 ;
-
- pmullw mm5, mm2 ;
- pmullw mm6, mm2 ;
-
- paddw mm3, mm5 ;
- paddw mm4, mm6 ;
-
- movq mm5, mm7 ;
- movq mm6, mm7 ;
-
- punpcklbw mm5, mm0 ;
- punpckhbw mm6, mm0
-
- pmullw mm5, [rax] ;
- pmullw mm6, [rax] ;
-
- paddw mm3, [GLOBAL(rd)] ; xmm3 += round value
- psraw mm3, VP9_FILTER_SHIFT ; xmm3 /= 128
-
- paddw mm4, [GLOBAL(rd)] ;
- psraw mm4, VP9_FILTER_SHIFT ;
-
- movq mm7, mm3 ;
- packuswb mm7, mm4 ;
-
-
- pmullw mm3, [rax+16] ;
- pmullw mm4, [rax+16] ;
-
- paddw mm3, mm5 ;
- paddw mm4, mm6 ;
-
-
- paddw mm3, [GLOBAL(rd)] ; xmm3 += round value
- psraw mm3, VP9_FILTER_SHIFT ; xmm3 /= 128
-
- paddw mm4, [GLOBAL(rd)] ;
- psraw mm4, VP9_FILTER_SHIFT ;
-
- packuswb mm3, mm4
-
- movq [rdi], mm3 ; store the results in the destination
-
-%if ABI_IS_32BIT
- add rsi, rdx ; next line
- add rdi, dword ptr arg(5) ;dst_pitch ;
-%else
- movsxd r8, dword ptr arg(5) ;dst_pitch
- add rsi, rdx ; next line
- add rdi, r8
-%endif
- cmp rdi, rcx ;
- jne .next_row_8x4
-
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_GOT
- UNSHADOW_ARGS
- pop rbp
- ret
-
-
-;void vp9_bilinear_predict4x4_mmx
-;(
-; unsigned char *src_ptr,
-; int src_pixels_per_line,
-; int xoffset,
-; int yoffset,
-; unsigned char *dst_ptr,
-; int dst_pitch
-;)
-global sym(vp9_bilinear_predict4x4_mmx)
-sym(vp9_bilinear_predict4x4_mmx):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 6
- GET_GOT rbx
- push rsi
- push rdi
- ; end prolog
-
- ;const short *HFilter = bilinear_filters_mmx[xoffset];
- ;const short *VFilter = bilinear_filters_mmx[yoffset];
-
- movsxd rax, dword ptr arg(2) ;xoffset
- mov rdi, arg(4) ;dst_ptr ;
-
- lea rcx, [GLOBAL(sym(vp9_bilinear_filters_8x_mmx))]
- shl rax, 5
-
- add rax, rcx ; HFilter
- mov rsi, arg(0) ;src_ptr ;
-
-        movsxd      rdx, dword ptr arg(5) ;dst_pitch
- movq mm1, [rax] ;
-
- movq mm2, [rax+16] ;
- movsxd rax, dword ptr arg(3) ;yoffset
-
- pxor mm0, mm0 ;
- shl rax, 5
-
- add rax, rcx
- lea rcx, [rdi+rdx*4] ;
-
- movsxd rdx, dword ptr arg(1) ;src_pixels_per_line ;
-
- ; get the first horizontal line done ;
- movd mm3, [rsi] ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14
- punpcklbw mm3, mm0 ; xx 00 01 02 03 04 05 06
-
- pmullw mm3, mm1 ;
- movd mm5, [rsi+1] ;
-
- punpcklbw mm5, mm0 ;
- pmullw mm5, mm2 ;
-
- paddw mm3, mm5 ;
- paddw mm3, [GLOBAL(rd)] ; xmm3 += round value
-
- psraw mm3, VP9_FILTER_SHIFT ; xmm3 /= 128
-
- movq mm7, mm3 ;
- packuswb mm7, mm0 ;
-
- add rsi, rdx ; next line
-.next_row_4x4:
- movd mm3, [rsi] ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14
- punpcklbw mm3, mm0 ; xx 00 01 02 03 04 05 06
-
- pmullw mm3, mm1 ;
- movd mm5, [rsi+1] ;
-
- punpcklbw mm5, mm0 ;
- pmullw mm5, mm2 ;
-
- paddw mm3, mm5 ;
-
- movq mm5, mm7 ;
- punpcklbw mm5, mm0 ;
-
- pmullw mm5, [rax] ;
- paddw mm3, [GLOBAL(rd)] ; xmm3 += round value
-
- psraw mm3, VP9_FILTER_SHIFT ; xmm3 /= 128
- movq mm7, mm3 ;
-
- packuswb mm7, mm0 ;
-
- pmullw mm3, [rax+16] ;
- paddw mm3, mm5 ;
-
-
- paddw mm3, [GLOBAL(rd)] ; xmm3 += round value
- psraw mm3, VP9_FILTER_SHIFT ; xmm3 /= 128
-
- packuswb mm3, mm0
- movd [rdi], mm3 ; store the results in the destination
-
-%if ABI_IS_32BIT
- add rsi, rdx ; next line
- add rdi, dword ptr arg(5) ;dst_pitch ;
-%else
- movsxd r8, dword ptr arg(5) ;dst_pitch ;
- add rsi, rdx ; next line
- add rdi, r8
-%endif
-
- cmp rdi, rcx ;
- jne .next_row_4x4
-
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_GOT
- UNSHADOW_ARGS
- pop rbp
- ret
-
-
-
-SECTION_RODATA
-align 16
-rd:
- times 4 dw 0x40
-
-align 16
-global HIDDEN_DATA(sym(vp9_six_tap_mmx))
-sym(vp9_six_tap_mmx):
- times 8 dw 0
- times 8 dw 0
- times 8 dw 128
- times 8 dw 0
- times 8 dw 0
- times 8 dw 0
-
- times 8 dw 0
- times 8 dw -6
- times 8 dw 123
- times 8 dw 12
- times 8 dw -1
- times 8 dw 0
-
- times 8 dw 2
- times 8 dw -11
- times 8 dw 108
- times 8 dw 36
- times 8 dw -8
- times 8 dw 1
-
- times 8 dw 0
- times 8 dw -9
- times 8 dw 93
- times 8 dw 50
- times 8 dw -6
- times 8 dw 0
-
- times 8 dw 3
- times 8 dw -16
- times 8 dw 77
- times 8 dw 77
- times 8 dw -16
- times 8 dw 3
-
- times 8 dw 0
- times 8 dw -6
- times 8 dw 50
- times 8 dw 93
- times 8 dw -9
- times 8 dw 0
-
- times 8 dw 1
- times 8 dw -8
- times 8 dw 36
- times 8 dw 108
- times 8 dw -11
- times 8 dw 2
-
- times 8 dw 0
- times 8 dw -1
- times 8 dw 12
- times 8 dw 123
- times 8 dw -6
- times 8 dw 0
-
-
-align 16
-global HIDDEN_DATA(sym(vp9_bilinear_filters_8x_mmx))
-sym(vp9_bilinear_filters_8x_mmx):
- times 8 dw 128
- times 8 dw 0
-
- times 8 dw 112
- times 8 dw 16
-
- times 8 dw 96
- times 8 dw 32
-
- times 8 dw 80
- times 8 dw 48
-
- times 8 dw 64
- times 8 dw 64
-
- times 8 dw 48
- times 8 dw 80
-
- times 8 dw 32
- times 8 dw 96
-
- times 8 dw 16
- times 8 dw 112
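vp9_bilinear_predict8x8_mmx above is easier to follow against this scalar outline (hypothetical, not code from the tree): a horizontal 2-tap pass into an intermediate buffer, then a vertical 2-tap pass, each rounded by 64 (the rd constant) and shifted by VP9_FILTER_SHIFT. Here hf and vf stand for one {128 - 16*offset, 16*offset} pair from vp9_bilinear_filters_8x_mmx; the function name is made up for illustration.

static void bilinear_predict8x8_ref_c(const unsigned char *src, int src_pitch,
                                      const short *hf, const short *vf,
                                      unsigned char *dst, int dst_pitch) {
  unsigned short tmp[9][8];        /* 8 output rows need 9 horizontally filtered rows */
  int r, c;
  for (r = 0; r < 9; r++)
    for (c = 0; c < 8; c++)
      tmp[r][c] = (unsigned short)((src[r * src_pitch + c] * hf[0] +
                                    src[r * src_pitch + c + 1] * hf[1] + 64) >> 7);
  for (r = 0; r < 8; r++)
    for (c = 0; c < 8; c++)
      dst[r * dst_pitch + c] =
          (unsigned char)((tmp[r][c] * vf[0] + tmp[r + 1][c] * vf[1] + 64) >> 7);
}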
--- a/vp8/common/x86/subpixel_sse2.asm
+++ /dev/null
@@ -1,1372 +1,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
-%include "vpx_ports/x86_abi_support.asm"
-
-%define BLOCK_HEIGHT_WIDTH 4
-%define VP9_FILTER_WEIGHT 128
-%define VP9_FILTER_SHIFT 7
-
-
-;/************************************************************************************
-; Notes: filter_block1d_h6 applies a 6 tap filter horizontally to the input pixels. The
-; input pixel array has output_height rows. This routine assumes that output_height is an
-; even number. This function handles 8 pixels in the horizontal direction, calculating one
-; row per iteration to take advantage of the 128-bit operations.
-;*************************************************************************************/
-;void vp9_filter_block1d8_h6_sse2
-;(
-; unsigned char *src_ptr,
-; unsigned short *output_ptr,
-; unsigned int src_pixels_per_line,
-; unsigned int pixel_step,
-; unsigned int output_height,
-; unsigned int output_width,
-; short *vp9_filter
-;)
-global sym(vp9_filter_block1d8_h6_sse2)
-sym(vp9_filter_block1d8_h6_sse2):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 7
- SAVE_XMM 7
- GET_GOT rbx
- push rsi
- push rdi
- ; end prolog
-
- mov rdx, arg(6) ;vp9_filter
- mov rsi, arg(0) ;src_ptr
-
- mov rdi, arg(1) ;output_ptr
-
- movsxd rcx, dword ptr arg(4) ;output_height
- movsxd rax, dword ptr arg(2) ;src_pixels_per_line ; Pitch for Source
-%if ABI_IS_32BIT=0
- movsxd r8, dword ptr arg(5) ;output_width
-%endif
- pxor xmm0, xmm0 ; clear xmm0 for unpack
-
-.filter_block1d8_h6_rowloop:
- movq xmm3, MMWORD PTR [rsi - 2]
- movq xmm1, MMWORD PTR [rsi + 6]
-
- prefetcht2 [rsi+rax-2]
-
- pslldq xmm1, 8
- por xmm1, xmm3
-
- movdqa xmm4, xmm1
- movdqa xmm5, xmm1
-
- movdqa xmm6, xmm1
- movdqa xmm7, xmm1
-
- punpcklbw xmm3, xmm0 ; xx05 xx04 xx03 xx02 xx01 xx01 xx-1 xx-2
- psrldq xmm4, 1 ; xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 -1
-
- pmullw xmm3, XMMWORD PTR [rdx] ; x[-2] * H[-2]; Tap 1
- punpcklbw xmm4, xmm0 ; xx06 xx05 xx04 xx03 xx02 xx01 xx00 xx-1
-
- psrldq xmm5, 2 ; xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00
- pmullw xmm4, XMMWORD PTR [rdx+16] ; x[-1] * H[-1]; Tap 2
-
-
- punpcklbw xmm5, xmm0 ; xx07 xx06 xx05 xx04 xx03 xx02 xx01 xx00
- psrldq xmm6, 3 ; xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01
-
- pmullw xmm5, [rdx+32] ; x[ 0] * H[ 0]; Tap 3
-
- punpcklbw xmm6, xmm0 ; xx08 xx07 xx06 xx05 xx04 xx03 xx02 xx01
- psrldq xmm7, 4 ; xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02
-
- pmullw xmm6, [rdx+48] ; x[ 1] * h[ 1] ; Tap 4
-
- punpcklbw xmm7, xmm0 ; xx09 xx08 xx07 xx06 xx05 xx04 xx03 xx02
- psrldq xmm1, 5 ; xx xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03
-
-
- pmullw xmm7, [rdx+64] ; x[ 2] * h[ 2] ; Tap 5
-
- punpcklbw xmm1, xmm0 ; xx0a xx09 xx08 xx07 xx06 xx05 xx04 xx03
- pmullw xmm1, [rdx+80] ; x[ 3] * h[ 3] ; Tap 6
-
-
- paddsw xmm4, xmm7
- paddsw xmm4, xmm5
-
- paddsw xmm4, xmm3
- paddsw xmm4, xmm6
-
- paddsw xmm4, xmm1
- paddsw xmm4, [GLOBAL(rd)]
-
- psraw xmm4, 7
-
- packuswb xmm4, xmm0
- punpcklbw xmm4, xmm0
-
- movdqa XMMWORD Ptr [rdi], xmm4
- lea rsi, [rsi + rax]
-
-%if ABI_IS_32BIT
- add rdi, DWORD Ptr arg(5) ;[output_width]
-%else
- add rdi, r8
-%endif
- dec rcx
-
- jnz .filter_block1d8_h6_rowloop ; next row
-
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_GOT
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
-
-
-;void vp9_filter_block1d16_h6_sse2
-;(
-; unsigned char *src_ptr,
-; unsigned short *output_ptr,
-; unsigned int src_pixels_per_line,
-; unsigned int pixel_step,
-; unsigned int output_height,
-; unsigned int output_width,
-; short *vp9_filter
-;)
-;/************************************************************************************
-; Notes: filter_block1d_h6 applies a 6 tap filter horizontally to the input pixels. The
-; input pixel array has output_height rows. This routine assumes that output_height is an
-; even number. This function handles 16 pixels in the horizontal direction, calculating one
-; row per iteration to take advantage of the 128-bit operations.
-;*************************************************************************************/
-global sym(vp9_filter_block1d16_h6_sse2)
-sym(vp9_filter_block1d16_h6_sse2):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 7
- SAVE_XMM 7
- GET_GOT rbx
- push rsi
- push rdi
- ; end prolog
-
- mov rdx, arg(6) ;vp9_filter
- mov rsi, arg(0) ;src_ptr
-
- mov rdi, arg(1) ;output_ptr
-
- movsxd rcx, dword ptr arg(4) ;output_height
- movsxd rax, dword ptr arg(2) ;src_pixels_per_line ; Pitch for Source
-%if ABI_IS_32BIT=0
- movsxd r8, dword ptr arg(5) ;output_width
-%endif
-
- pxor xmm0, xmm0 ; clear xmm0 for unpack
-
-.filter_block1d16_h6_sse2_rowloop:
- movq xmm3, MMWORD PTR [rsi - 2]
- movq xmm1, MMWORD PTR [rsi + 6]
-
- movq xmm2, MMWORD PTR [rsi +14]
- pslldq xmm2, 8
-
- por xmm2, xmm1
- prefetcht2 [rsi+rax-2]
-
- pslldq xmm1, 8
- por xmm1, xmm3
-
- movdqa xmm4, xmm1
- movdqa xmm5, xmm1
-
- movdqa xmm6, xmm1
- movdqa xmm7, xmm1
-
- punpcklbw xmm3, xmm0 ; xx05 xx04 xx03 xx02 xx01 xx01 xx-1 xx-2
- psrldq xmm4, 1 ; xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 -1
-
- pmullw xmm3, XMMWORD PTR [rdx] ; x[-2] * H[-2]; Tap 1
- punpcklbw xmm4, xmm0 ; xx06 xx05 xx04 xx03 xx02 xx01 xx00 xx-1
-
- psrldq xmm5, 2 ; xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00
- pmullw xmm4, XMMWORD PTR [rdx+16] ; x[-1] * H[-1]; Tap 2
-
-
- punpcklbw xmm5, xmm0 ; xx07 xx06 xx05 xx04 xx03 xx02 xx01 xx00
- psrldq xmm6, 3 ; xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01
-
- pmullw xmm5, [rdx+32] ; x[ 0] * H[ 0]; Tap 3
-
- punpcklbw xmm6, xmm0 ; xx08 xx07 xx06 xx05 xx04 xx03 xx02 xx01
- psrldq xmm7, 4 ; xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02
-
- pmullw xmm6, [rdx+48] ; x[ 1] * h[ 1] ; Tap 4
-
- punpcklbw xmm7, xmm0 ; xx09 xx08 xx07 xx06 xx05 xx04 xx03 xx02
- psrldq xmm1, 5 ; xx xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03
-
-
- pmullw xmm7, [rdx+64] ; x[ 2] * h[ 2] ; Tap 5
-
- punpcklbw xmm1, xmm0 ; xx0a xx09 xx08 xx07 xx06 xx05 xx04 xx03
- pmullw xmm1, [rdx+80] ; x[ 3] * h[ 3] ; Tap 6
-
- paddsw xmm4, xmm7
- paddsw xmm4, xmm5
-
- paddsw xmm4, xmm3
- paddsw xmm4, xmm6
-
- paddsw xmm4, xmm1
- paddsw xmm4, [GLOBAL(rd)]
-
- psraw xmm4, 7
-
- packuswb xmm4, xmm0
- punpcklbw xmm4, xmm0
-
- movdqa XMMWORD Ptr [rdi], xmm4
-
- movdqa xmm3, xmm2
- movdqa xmm4, xmm2
-
- movdqa xmm5, xmm2
- movdqa xmm6, xmm2
-
- movdqa xmm7, xmm2
-
- punpcklbw xmm3, xmm0 ; xx05 xx04 xx03 xx02 xx01 xx01 xx-1 xx-2
- psrldq xmm4, 1 ; xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 -1
-
- pmullw xmm3, XMMWORD PTR [rdx] ; x[-2] * H[-2]; Tap 1
- punpcklbw xmm4, xmm0 ; xx06 xx05 xx04 xx03 xx02 xx01 xx00 xx-1
-
- psrldq xmm5, 2 ; xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00
- pmullw xmm4, XMMWORD PTR [rdx+16] ; x[-1] * H[-1]; Tap 2
-
-
- punpcklbw xmm5, xmm0 ; xx07 xx06 xx05 xx04 xx03 xx02 xx01 xx00
- psrldq xmm6, 3 ; xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01
-
- pmullw xmm5, [rdx+32] ; x[ 0] * H[ 0]; Tap 3
-
- punpcklbw xmm6, xmm0 ; xx08 xx07 xx06 xx05 xx04 xx03 xx02 xx01
- psrldq xmm7, 4 ; xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02
-
- pmullw xmm6, [rdx+48] ; x[ 1] * h[ 1] ; Tap 4
-
- punpcklbw xmm7, xmm0 ; xx09 xx08 xx07 xx06 xx05 xx04 xx03 xx02
- psrldq xmm2, 5 ; xx xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03
-
- pmullw xmm7, [rdx+64] ; x[ 2] * h[ 2] ; Tap 5
-
- punpcklbw xmm2, xmm0 ; xx0a xx09 xx08 xx07 xx06 xx05 xx04 xx03
- pmullw xmm2, [rdx+80] ; x[ 3] * h[ 3] ; Tap 6
-
-
- paddsw xmm4, xmm7
- paddsw xmm4, xmm5
-
- paddsw xmm4, xmm3
- paddsw xmm4, xmm6
-
- paddsw xmm4, xmm2
- paddsw xmm4, [GLOBAL(rd)]
-
- psraw xmm4, 7
-
- packuswb xmm4, xmm0
- punpcklbw xmm4, xmm0
-
- movdqa XMMWORD Ptr [rdi+16], xmm4
-
- lea rsi, [rsi + rax]
-%if ABI_IS_32BIT
- add rdi, DWORD Ptr arg(5) ;[output_width]
-%else
- add rdi, r8
-%endif
-
- dec rcx
- jnz .filter_block1d16_h6_sse2_rowloop ; next row
-
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_GOT
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
-
-
-;void vp9_filter_block1d8_v6_sse2
-;(
-; short *src_ptr,
-; unsigned char *output_ptr,
-;    int dst_pitch,
-; unsigned int pixels_per_line,
-; unsigned int pixel_step,
-; unsigned int output_height,
-; unsigned int output_width,
-; short * vp9_filter
-;)
-;/************************************************************************************
-; Notes: filter_block1d8_v6 applies a 6 tap filter vertically to the input pixels. The
-; input pixel array has output_height rows.
-;*************************************************************************************/
-global sym(vp9_filter_block1d8_v6_sse2)
-sym(vp9_filter_block1d8_v6_sse2):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 8
- SAVE_XMM 7
- GET_GOT rbx
- push rsi
- push rdi
- ; end prolog
-
- mov rax, arg(7) ;vp9_filter
- movsxd rdx, dword ptr arg(3) ;pixels_per_line
-
- mov rdi, arg(1) ;output_ptr
- mov rsi, arg(0) ;src_ptr
-
- sub rsi, rdx
- sub rsi, rdx
-
- movsxd rcx, DWORD PTR arg(5) ;[output_height]
- pxor xmm0, xmm0 ; clear xmm0
-
- movdqa xmm7, XMMWORD PTR [GLOBAL(rd)]
-%if ABI_IS_32BIT=0
-        movsxd      r8, dword ptr arg(2) ; dst_pitch
-%endif
-
-.vp9_filter_block1d8_v6_sse2_loop:
- movdqa xmm1, XMMWORD PTR [rsi]
- pmullw xmm1, [rax]
-
- movdqa xmm2, XMMWORD PTR [rsi + rdx]
- pmullw xmm2, [rax + 16]
-
- movdqa xmm3, XMMWORD PTR [rsi + rdx * 2]
- pmullw xmm3, [rax + 32]
-
- movdqa xmm5, XMMWORD PTR [rsi + rdx * 4]
- pmullw xmm5, [rax + 64]
-
- add rsi, rdx
- movdqa xmm4, XMMWORD PTR [rsi + rdx * 2]
-
- pmullw xmm4, [rax + 48]
- movdqa xmm6, XMMWORD PTR [rsi + rdx * 4]
-
- pmullw xmm6, [rax + 80]
-
- paddsw xmm2, xmm5
- paddsw xmm2, xmm3
-
- paddsw xmm2, xmm1
- paddsw xmm2, xmm4
-
- paddsw xmm2, xmm6
- paddsw xmm2, xmm7
-
- psraw xmm2, 7
- packuswb xmm2, xmm0 ; pack and saturate
-
- movq QWORD PTR [rdi], xmm2 ; store the results in the destination
-%if ABI_IS_32BIT
- add rdi, DWORD PTR arg(2) ;[dst_ptich]
-%else
- add rdi, r8
-%endif
- dec rcx ; decrement count
- jnz .vp9_filter_block1d8_v6_sse2_loop ; next row
-
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_GOT
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
-
-
-;void vp9_filter_block1d16_v6_sse2
-;(
-; unsigned short *src_ptr,
-; unsigned char *output_ptr,
-; int dst_ptich,
-; unsigned int pixels_per_line,
-; unsigned int pixel_step,
-; unsigned int output_height,
-; unsigned int output_width,
-; const short *vp9_filter
-;)
-;/************************************************************************************
-; Notes: filter_block1d16_v6 applies a 6 tap filter vertically to the input pixels. The
-; input pixel array has output_height rows.
-;*************************************************************************************/
-global sym(vp9_filter_block1d16_v6_sse2)
-sym(vp9_filter_block1d16_v6_sse2):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 8
- SAVE_XMM 7
- GET_GOT rbx
- push rsi
- push rdi
- ; end prolog
-
- mov rax, arg(7) ;vp9_filter
- movsxd rdx, dword ptr arg(3) ;pixels_per_line
-
- mov rdi, arg(1) ;output_ptr
- mov rsi, arg(0) ;src_ptr
-
- sub rsi, rdx
- sub rsi, rdx
-
- movsxd rcx, DWORD PTR arg(5) ;[output_height]
-%if ABI_IS_32BIT=0
- movsxd r8, dword ptr arg(2) ; dst_ptich
-%endif
-
-.vp9_filter_block1d16_v6_sse2_loop:
-; The order for adding 6-tap is 2 5 3 1 4 6. Read in data in that order.
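-; Lines 4 and 6 are read after rsi has been advanced by one pitch because
-; scaled addressing has no *3 or *5 form: (rsi+rdx) + rdx*2 reaches line 4
-; and (rsi+rdx) + rdx*4 reaches line 6.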
- movdqa xmm1, XMMWORD PTR [rsi + rdx] ; line 2
- movdqa xmm2, XMMWORD PTR [rsi + rdx + 16]
- pmullw xmm1, [rax + 16]
- pmullw xmm2, [rax + 16]
-
- movdqa xmm3, XMMWORD PTR [rsi + rdx * 4] ; line 5
- movdqa xmm4, XMMWORD PTR [rsi + rdx * 4 + 16]
- pmullw xmm3, [rax + 64]
- pmullw xmm4, [rax + 64]
-
- movdqa xmm5, XMMWORD PTR [rsi + rdx * 2] ; line 3
- movdqa xmm6, XMMWORD PTR [rsi + rdx * 2 + 16]
- pmullw xmm5, [rax + 32]
- pmullw xmm6, [rax + 32]
-
- movdqa xmm7, XMMWORD PTR [rsi] ; line 1
- movdqa xmm0, XMMWORD PTR [rsi + 16]
- pmullw xmm7, [rax]
- pmullw xmm0, [rax]
-
- paddsw xmm1, xmm3
- paddsw xmm2, xmm4
- paddsw xmm1, xmm5
- paddsw xmm2, xmm6
- paddsw xmm1, xmm7
- paddsw xmm2, xmm0
-
- add rsi, rdx
-
- movdqa xmm3, XMMWORD PTR [rsi + rdx * 2] ; line 4
- movdqa xmm4, XMMWORD PTR [rsi + rdx * 2 + 16]
- pmullw xmm3, [rax + 48]
- pmullw xmm4, [rax + 48]
-
- movdqa xmm5, XMMWORD PTR [rsi + rdx * 4] ; line 6
- movdqa xmm6, XMMWORD PTR [rsi + rdx * 4 + 16]
- pmullw xmm5, [rax + 80]
- pmullw xmm6, [rax + 80]
-
- movdqa xmm7, XMMWORD PTR [GLOBAL(rd)]
- pxor xmm0, xmm0 ; clear xmm0
-
- paddsw xmm1, xmm3
- paddsw xmm2, xmm4
- paddsw xmm1, xmm5
- paddsw xmm2, xmm6
-
- paddsw xmm1, xmm7
- paddsw xmm2, xmm7
-
- psraw xmm1, 7
- psraw xmm2, 7
-
- packuswb xmm1, xmm2 ; pack and saturate
- movdqa XMMWORD PTR [rdi], xmm1 ; store the results in the destination
-%if ABI_IS_32BIT
- add rdi, DWORD PTR arg(2) ;[dst_ptich]
-%else
- add rdi, r8
-%endif
- dec rcx ; decrement count
- jnz .vp9_filter_block1d16_v6_sse2_loop ; next row
-
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_GOT
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
-
-
-;void vp9_filter_block1d8_h6_only_sse2
-;(
-; unsigned char *src_ptr,
-; unsigned int src_pixels_per_line,
-; unsigned char *output_ptr,
-; int dst_ptich,
-; unsigned int output_height,
-; const short *vp9_filter
-;)
-; First-pass filter only when yoffset==0
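-; (the vertical pass is skipped, so the rounded 6-tap horizontal result is
-;  packed and written straight to output_ptr)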
-global sym(vp9_filter_block1d8_h6_only_sse2)
-sym(vp9_filter_block1d8_h6_only_sse2):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 6
- SAVE_XMM 7
- GET_GOT rbx
- push rsi
- push rdi
- ; end prolog
-
- mov rdx, arg(5) ;vp9_filter
- mov rsi, arg(0) ;src_ptr
-
- mov rdi, arg(2) ;output_ptr
-
- movsxd rcx, dword ptr arg(4) ;output_height
- movsxd rax, dword ptr arg(1) ;src_pixels_per_line ; Pitch for Source
-%if ABI_IS_32BIT=0
- movsxd r8, dword ptr arg(3) ;dst_ptich
-%endif
- pxor xmm0, xmm0 ; clear xmm0 for unpack
-
-.filter_block1d8_h6_only_rowloop:
- movq xmm3, MMWORD PTR [rsi - 2]
- movq xmm1, MMWORD PTR [rsi + 6]
-
- prefetcht2 [rsi+rax-2]
-
- pslldq xmm1, 8
- por xmm1, xmm3
-
- movdqa xmm4, xmm1
- movdqa xmm5, xmm1
-
- movdqa xmm6, xmm1
- movdqa xmm7, xmm1
-
- punpcklbw xmm3, xmm0 ; xx05 xx04 xx03 xx02 xx01 xx00 xx-1 xx-2
- psrldq xmm4, 1 ; xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 -1
-
- pmullw xmm3, XMMWORD PTR [rdx] ; x[-2] * H[-2]; Tap 1
- punpcklbw xmm4, xmm0 ; xx06 xx05 xx04 xx03 xx02 xx01 xx00 xx-1
-
- psrldq xmm5, 2 ; xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00
- pmullw xmm4, XMMWORD PTR [rdx+16] ; x[-1] * H[-1]; Tap 2
-
-
- punpcklbw xmm5, xmm0 ; xx07 xx06 xx05 xx04 xx03 xx02 xx01 xx00
- psrldq xmm6, 3 ; xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01
-
- pmullw xmm5, [rdx+32] ; x[ 0] * H[ 0]; Tap 3
-
- punpcklbw xmm6, xmm0 ; xx08 xx07 xx06 xx05 xx04 xx03 xx02 xx01
- psrldq xmm7, 4 ; xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02
-
- pmullw xmm6, [rdx+48] ; x[ 1] * h[ 1] ; Tap 4
-
- punpcklbw xmm7, xmm0 ; xx09 xx08 xx07 xx06 xx05 xx04 xx03 xx02
- psrldq xmm1, 5 ; xx xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03
-
-
- pmullw xmm7, [rdx+64] ; x[ 2] * h[ 2] ; Tap 5
-
- punpcklbw xmm1, xmm0 ; xx0a xx09 xx08 xx07 xx06 xx05 xx04 xx03
- pmullw xmm1, [rdx+80] ; x[ 3] * h[ 3] ; Tap 6
-
-
- paddsw xmm4, xmm7
- paddsw xmm4, xmm5
-
- paddsw xmm4, xmm3
- paddsw xmm4, xmm6
-
- paddsw xmm4, xmm1
- paddsw xmm4, [GLOBAL(rd)]
-
- psraw xmm4, 7
-
- packuswb xmm4, xmm0
-
- movq QWORD PTR [rdi], xmm4 ; store the results in the destination
- lea rsi, [rsi + rax]
-
-%if ABI_IS_32BIT
- add rdi, DWORD Ptr arg(3) ;dst_ptich
-%else
- add rdi, r8
-%endif
- dec rcx
-
- jnz .filter_block1d8_h6_only_rowloop ; next row
-
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_GOT
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
-
-
-;void vp9_filter_block1d16_h6_only_sse2
-;(
-; unsigned char *src_ptr,
-; unsigned int src_pixels_per_line,
-; unsigned char *output_ptr,
-; int dst_ptich,
-; unsigned int output_height,
-; const short *vp9_filter
-;)
-; First-pass filter only when yoffset==0
-global sym(vp9_filter_block1d16_h6_only_sse2)
-sym(vp9_filter_block1d16_h6_only_sse2):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 6
- SAVE_XMM 7
- GET_GOT rbx
- push rsi
- push rdi
- ; end prolog
-
- mov rdx, arg(5) ;vp9_filter
- mov rsi, arg(0) ;src_ptr
-
- mov rdi, arg(2) ;output_ptr
-
- movsxd rcx, dword ptr arg(4) ;output_height
- movsxd rax, dword ptr arg(1) ;src_pixels_per_line ; Pitch for Source
-%if ABI_IS_32BIT=0
- movsxd r8, dword ptr arg(3) ;dst_ptich
-%endif
-
- pxor xmm0, xmm0 ; clear xmm0 for unpack
-
-.filter_block1d16_h6_only_sse2_rowloop:
- movq xmm3, MMWORD PTR [rsi - 2]
- movq xmm1, MMWORD PTR [rsi + 6]
-
- movq xmm2, MMWORD PTR [rsi +14]
- pslldq xmm2, 8
-
- por xmm2, xmm1
- prefetcht2 [rsi+rax-2]
-
- pslldq xmm1, 8
- por xmm1, xmm3
-
- movdqa xmm4, xmm1
- movdqa xmm5, xmm1
-
- movdqa xmm6, xmm1
- movdqa xmm7, xmm1
-
- punpcklbw xmm3, xmm0 ; xx05 xx04 xx03 xx02 xx01 xx00 xx-1 xx-2
- psrldq xmm4, 1 ; xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 -1
-
- pmullw xmm3, XMMWORD PTR [rdx] ; x[-2] * H[-2]; Tap 1
- punpcklbw xmm4, xmm0 ; xx06 xx05 xx04 xx03 xx02 xx01 xx00 xx-1
-
- psrldq xmm5, 2 ; xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00
- pmullw xmm4, XMMWORD PTR [rdx+16] ; x[-1] * H[-1]; Tap 2
-
- punpcklbw xmm5, xmm0 ; xx07 xx06 xx05 xx04 xx03 xx02 xx01 xx00
- psrldq xmm6, 3 ; xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01
-
- pmullw xmm5, [rdx+32] ; x[ 0] * H[ 0]; Tap 3
-
- punpcklbw xmm6, xmm0 ; xx08 xx07 xx06 xx05 xx04 xx03 xx02 xx01
- psrldq xmm7, 4 ; xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02
-
- pmullw xmm6, [rdx+48] ; x[ 1] * h[ 1] ; Tap 4
-
- punpcklbw xmm7, xmm0 ; xx09 xx08 xx07 xx06 xx05 xx04 xx03 xx02
- psrldq xmm1, 5 ; xx xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03
-
- pmullw xmm7, [rdx+64] ; x[ 2] * h[ 2] ; Tap 5
-
- punpcklbw xmm1, xmm0 ; xx0a xx09 xx08 xx07 xx06 xx05 xx04 xx03
- pmullw xmm1, [rdx+80] ; x[ 3] * h[ 3] ; Tap 6
-
- paddsw xmm4, xmm7
- paddsw xmm4, xmm5
-
- paddsw xmm4, xmm3
- paddsw xmm4, xmm6
-
- paddsw xmm4, xmm1
- paddsw xmm4, [GLOBAL(rd)]
-
- psraw xmm4, 7
-
- packuswb xmm4, xmm0 ; lower 8 bytes
-
- movq QWORD Ptr [rdi], xmm4 ; store the results in the destination
-
- movdqa xmm3, xmm2
- movdqa xmm4, xmm2
-
- movdqa xmm5, xmm2
- movdqa xmm6, xmm2
-
- movdqa xmm7, xmm2
-
- punpcklbw xmm3, xmm0 ; xx05 xx04 xx03 xx02 xx01 xx01 xx-1 xx-2
- psrldq xmm4, 1 ; xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 -1
-
- pmullw xmm3, XMMWORD PTR [rdx] ; x[-2] * H[-2]; Tap 1
- punpcklbw xmm4, xmm0 ; xx06 xx05 xx04 xx03 xx02 xx01 xx00 xx-1
-
- psrldq xmm5, 2 ; xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00
- pmullw xmm4, XMMWORD PTR [rdx+16] ; x[-1] * H[-1]; Tap 2
-
- punpcklbw xmm5, xmm0 ; xx07 xx06 xx05 xx04 xx03 xx02 xx01 xx00
- psrldq xmm6, 3 ; xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01
-
- pmullw xmm5, [rdx+32] ; x[ 0] * H[ 0]; Tap 3
-
- punpcklbw xmm6, xmm0 ; xx08 xx07 xx06 xx05 xx04 xx03 xx02 xx01
- psrldq xmm7, 4 ; xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02
-
- pmullw xmm6, [rdx+48] ; x[ 1] * h[ 1] ; Tap 4
-
- punpcklbw xmm7, xmm0 ; xx09 xx08 xx07 xx06 xx05 xx04 xx03 xx02
- psrldq xmm2, 5 ; xx xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03
-
- pmullw xmm7, [rdx+64] ; x[ 2] * h[ 2] ; Tap 5
-
- punpcklbw xmm2, xmm0 ; xx0a xx09 xx08 xx07 xx06 xx05 xx04 xx03
- pmullw xmm2, [rdx+80] ; x[ 3] * h[ 3] ; Tap 6
-
- paddsw xmm4, xmm7
- paddsw xmm4, xmm5
-
- paddsw xmm4, xmm3
- paddsw xmm4, xmm6
-
- paddsw xmm4, xmm2
- paddsw xmm4, [GLOBAL(rd)]
-
- psraw xmm4, 7
-
- packuswb xmm4, xmm0 ; higher 8 bytes
-
- movq QWORD Ptr [rdi+8], xmm4 ; store the results in the destination
-
- lea rsi, [rsi + rax]
-%if ABI_IS_32BIT
- add rdi, DWORD Ptr arg(3) ;dst_ptich
-%else
- add rdi, r8
-%endif
-
- dec rcx
- jnz .filter_block1d16_h6_only_sse2_rowloop ; next row
-
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_GOT
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
-
-
-;void vp9_filter_block1d8_v6_only_sse2
-;(
-; unsigned char *src_ptr,
-; unsigned int src_pixels_per_line,
-; unsigned char *output_ptr,
-; int dst_ptich,
-; unsigned int output_height,
-; const short *vp9_filter
-;)
-; Second-pass filter only when xoffset==0
-global sym(vp9_filter_block1d8_v6_only_sse2)
-sym(vp9_filter_block1d8_v6_only_sse2):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 6
- SAVE_XMM 7
- GET_GOT rbx
- push rsi
- push rdi
- ; end prolog
-
- mov rsi, arg(0) ;src_ptr
- mov rdi, arg(2) ;output_ptr
-
- movsxd rcx, dword ptr arg(4) ;output_height
- movsxd rdx, dword ptr arg(1) ;src_pixels_per_line
-
- mov rax, arg(5) ;vp9_filter
-
- pxor xmm0, xmm0 ; clear xmm0
-
- movdqa xmm7, XMMWORD PTR [GLOBAL(rd)]
-%if ABI_IS_32BIT=0
- movsxd r8, dword ptr arg(3) ; dst_ptich
-%endif
-
-.vp9_filter_block1d8_v6_only_sse2_loop:
- movq xmm1, MMWORD PTR [rsi]
- movq xmm2, MMWORD PTR [rsi + rdx]
- movq xmm3, MMWORD PTR [rsi + rdx * 2]
- movq xmm5, MMWORD PTR [rsi + rdx * 4]
- add rsi, rdx
- movq xmm4, MMWORD PTR [rsi + rdx * 2]
- movq xmm6, MMWORD PTR [rsi + rdx * 4]
-
- punpcklbw xmm1, xmm0
- pmullw xmm1, [rax]
-
- punpcklbw xmm2, xmm0
- pmullw xmm2, [rax + 16]
-
- punpcklbw xmm3, xmm0
- pmullw xmm3, [rax + 32]
-
- punpcklbw xmm5, xmm0
- pmullw xmm5, [rax + 64]
-
- punpcklbw xmm4, xmm0
- pmullw xmm4, [rax + 48]
-
- punpcklbw xmm6, xmm0
- pmullw xmm6, [rax + 80]
-
- paddsw xmm2, xmm5
- paddsw xmm2, xmm3
-
- paddsw xmm2, xmm1
- paddsw xmm2, xmm4
-
- paddsw xmm2, xmm6
- paddsw xmm2, xmm7
-
- psraw xmm2, 7
- packuswb xmm2, xmm0 ; pack and saturate
-
- movq QWORD PTR [rdi], xmm2 ; store the results in the destination
-%if ABI_IS_32BIT
- add rdi, DWORD PTR arg(3) ;[dst_ptich]
-%else
- add rdi, r8
-%endif
- dec rcx ; decrement count
- jnz .vp9_filter_block1d8_v6_only_sse2_loop ; next row
-
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_GOT
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
-
-
-;void vp9_unpack_block1d16_h6_sse2
-;(
-; unsigned char *src_ptr,
-; unsigned short *output_ptr,
-; unsigned int src_pixels_per_line,
-; unsigned int output_height,
-; unsigned int output_width
-;)
-global sym(vp9_unpack_block1d16_h6_sse2)
-sym(vp9_unpack_block1d16_h6_sse2):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 5
- GET_GOT rbx
- push rsi
- push rdi
- ; end prolog
-
- mov rsi, arg(0) ;src_ptr
- mov rdi, arg(1) ;output_ptr
-
- movsxd rcx, dword ptr arg(3) ;output_height
- movsxd rax, dword ptr arg(2) ;src_pixels_per_line ; Pitch for Source
-
- pxor xmm0, xmm0 ; clear xmm0 for unpack
-%if ABI_IS_32BIT=0
- movsxd r8, dword ptr arg(4) ;output_width ; pitch for the destination buffer
-%endif
-
-.unpack_block1d16_h6_sse2_rowloop:
- movq xmm1, MMWORD PTR [rsi] ; 07 06 05 04 03 02 01 00
- movq xmm3, MMWORD PTR [rsi+8] ; 0f 0e 0d 0c 0b 0a 09 08
-
- punpcklbw xmm3, xmm0 ; xx0f xx0e xx0d xx0c xx0b xx0a xx09 xx08
- punpcklbw xmm1, xmm0 ; xx07 xx06 xx05 xx04 xx03 xx02 xx01 xx00
-
- movdqa XMMWORD Ptr [rdi], xmm1
- movdqa XMMWORD Ptr [rdi + 16], xmm3
-
- lea rsi, [rsi + rax]
-%if ABI_IS_32BIT
- add rdi, DWORD Ptr arg(4) ;[output_width]
-%else
- add rdi, r8
-%endif
- dec rcx
- jnz .unpack_block1d16_h6_sse2_rowloop ; next row
-
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_GOT
- UNSHADOW_ARGS
- pop rbp
- ret
-
-
-;void vp9_bilinear_predict16x16_sse2
-;(
-; unsigned char *src_ptr,
-; int src_pixels_per_line,
-; int xoffset,
-; int yoffset,
-; unsigned char *dst_ptr,
-; int dst_pitch
-;)
-extern sym(vp9_bilinear_filters_mmx)
-global sym(vp9_bilinear_predict16x16_sse2)
-sym(vp9_bilinear_predict16x16_sse2):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 6
- SAVE_XMM 7
- GET_GOT rbx
- push rsi
- push rdi
- ; end prolog
-
- ;const short *HFilter = bilinear_filters_mmx[xoffset]
- ;const short *VFilter = bilinear_filters_mmx[yoffset]
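- ;Each entry of the filter table is 32 bytes (hence the shl by 5 below): the
- ;first 16 bytes hold the first tap and the next 16 bytes the second tap,
- ;each broadcast across eight 16-bit lanes for use with pmullw.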
-
- lea rcx, [GLOBAL(sym(vp9_bilinear_filters_mmx))]
- movsxd rax, dword ptr arg(2) ;xoffset
-
- cmp rax, 0 ;skip first_pass filter if xoffset=0
- je .b16x16_sp_only
-
- shl rax, 5
- add rax, rcx ;HFilter
-
- mov rdi, arg(4) ;dst_ptr
- mov rsi, arg(0) ;src_ptr
- movsxd rdx, dword ptr arg(5) ;dst_pitch
-
- movdqa xmm1, [rax]
- movdqa xmm2, [rax+16]
-
- movsxd rax, dword ptr arg(3) ;yoffset
-
- cmp rax, 0 ;skip second_pass filter if yoffset=0
- je .b16x16_fp_only
-
- shl rax, 5
- add rax, rcx ;VFilter
-
- lea rcx, [rdi+rdx*8]
- lea rcx, [rcx+rdx*8]
- movsxd rdx, dword ptr arg(1) ;src_pixels_per_line
-
- pxor xmm0, xmm0
-
-%if ABI_IS_32BIT=0
- movsxd r8, dword ptr arg(5) ;dst_pitch
-%endif
- ; get the first horizontal line done
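- ; (its packed first-pass result is carried in xmm7 so that each source row
- ;  is filtered horizontally only once; the loop below blends the previous
- ;  row's result with the current row's to form the vertical pass)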
- movdqu xmm3, [rsi] ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14
- movdqa xmm4, xmm3 ; make a copy of current line
-
- punpcklbw xmm3, xmm0 ; xx 00 01 02 03 04 05 06
- punpckhbw xmm4, xmm0
-
- pmullw xmm3, xmm1
- pmullw xmm4, xmm1
-
- movdqu xmm5, [rsi+1]
- movdqa xmm6, xmm5
-
- punpcklbw xmm5, xmm0
- punpckhbw xmm6, xmm0
-
- pmullw xmm5, xmm2
- pmullw xmm6, xmm2
-
- paddw xmm3, xmm5
- paddw xmm4, xmm6
-
- paddw xmm3, [GLOBAL(rd)] ; xmm3 += round value
- psraw xmm3, VP9_FILTER_SHIFT ; xmm3 /= 128
-
- paddw xmm4, [GLOBAL(rd)]
- psraw xmm4, VP9_FILTER_SHIFT
-
- movdqa xmm7, xmm3
- packuswb xmm7, xmm4
-
- add rsi, rdx ; next line
-.next_row:
- movdqu xmm3, [rsi] ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14
- movdqa xmm4, xmm3 ; make a copy of current line
-
- punpcklbw xmm3, xmm0 ; xx 00 01 02 03 04 05 06
- punpckhbw xmm4, xmm0
-
- pmullw xmm3, xmm1
- pmullw xmm4, xmm1
-
- movdqu xmm5, [rsi+1]
- movdqa xmm6, xmm5
-
- punpcklbw xmm5, xmm0
- punpckhbw xmm6, xmm0
-
- pmullw xmm5, xmm2
- pmullw xmm6, xmm2
-
- paddw xmm3, xmm5
- paddw xmm4, xmm6
-
- movdqa xmm5, xmm7
- movdqa xmm6, xmm7
-
- punpcklbw xmm5, xmm0
- punpckhbw xmm6, xmm0
-
- pmullw xmm5, [rax]
- pmullw xmm6, [rax]
-
- paddw xmm3, [GLOBAL(rd)] ; xmm3 += round value
- psraw xmm3, VP9_FILTER_SHIFT ; xmm3 /= 128
-
- paddw xmm4, [GLOBAL(rd)]
- psraw xmm4, VP9_FILTER_SHIFT
-
- movdqa xmm7, xmm3
- packuswb xmm7, xmm4
-
- pmullw xmm3, [rax+16]
- pmullw xmm4, [rax+16]
-
- paddw xmm3, xmm5
- paddw xmm4, xmm6
-
- paddw xmm3, [GLOBAL(rd)] ; xmm3 += round value
- psraw xmm3, VP9_FILTER_SHIFT ; xmm3 /= 128
-
- paddw xmm4, [GLOBAL(rd)]
- psraw xmm4, VP9_FILTER_SHIFT
-
- packuswb xmm3, xmm4
- movdqa [rdi], xmm3 ; store the results in the destination
-
- add rsi, rdx ; next line
-%if ABI_IS_32BIT
- add rdi, DWORD PTR arg(5) ;dst_pitch
-%else
- add rdi, r8
-%endif
-
- cmp rdi, rcx
- jne .next_row
-
- jmp .done
-
-.b16x16_sp_only:
- movsxd rax, dword ptr arg(3) ;yoffset
- shl rax, 5
- add rax, rcx ;VFilter
-
- mov rdi, arg(4) ;dst_ptr
- mov rsi, arg(0) ;src_ptr
- movsxd rdx, dword ptr arg(5) ;dst_pitch
-
- movdqa xmm1, [rax]
- movdqa xmm2, [rax+16]
-
- lea rcx, [rdi+rdx*8]
- lea rcx, [rcx+rdx*8]
- movsxd rax, dword ptr arg(1) ;src_pixels_per_line
-
- pxor xmm0, xmm0
-
- ; get the first horizontal line done
- movdqu xmm7, [rsi] ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14
-
- add rsi, rax ; next line
-.next_row_spo:
- movdqu xmm3, [rsi] ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14
-
- movdqa xmm5, xmm7
- movdqa xmm6, xmm7
-
- movdqa xmm4, xmm3 ; make a copy of current line
- movdqa xmm7, xmm3
-
- punpcklbw xmm5, xmm0
- punpckhbw xmm6, xmm0
- punpcklbw xmm3, xmm0 ; xx 00 01 02 03 04 05 06
- punpckhbw xmm4, xmm0
-
- pmullw xmm5, xmm1
- pmullw xmm6, xmm1
- pmullw xmm3, xmm2
- pmullw xmm4, xmm2
-
- paddw xmm3, xmm5
- paddw xmm4, xmm6
-
- paddw xmm3, [GLOBAL(rd)] ; xmm3 += round value
- psraw xmm3, VP9_FILTER_SHIFT ; xmm3 /= 128
-
- paddw xmm4, [GLOBAL(rd)]
- psraw xmm4, VP9_FILTER_SHIFT
-
- packuswb xmm3, xmm4
- movdqa [rdi], xmm3 ; store the results in the destination
-
- add rsi, rax ; next line
- add rdi, rdx ;dst_pitch
- cmp rdi, rcx
- jne .next_row_spo
-
- jmp .done
-
-.b16x16_fp_only:
- lea rcx, [rdi+rdx*8]
- lea rcx, [rcx+rdx*8]
- movsxd rax, dword ptr arg(1) ;src_pixels_per_line
- pxor xmm0, xmm0
-
-.next_row_fpo:
- movdqu xmm3, [rsi] ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14
- movdqa xmm4, xmm3 ; make a copy of current line
-
- punpcklbw xmm3, xmm0 ; xx 00 01 02 03 04 05 06
- punpckhbw xmm4, xmm0
-
- pmullw xmm3, xmm1
- pmullw xmm4, xmm1
-
- movdqu xmm5, [rsi+1]
- movdqa xmm6, xmm5
-
- punpcklbw xmm5, xmm0
- punpckhbw xmm6, xmm0
-
- pmullw xmm5, xmm2
- pmullw xmm6, xmm2
-
- paddw xmm3, xmm5
- paddw xmm4, xmm6
-
- paddw xmm3, [GLOBAL(rd)] ; xmm3 += round value
- psraw xmm3, VP9_FILTER_SHIFT ; xmm3 /= 128
-
- paddw xmm4, [GLOBAL(rd)]
- psraw xmm4, VP9_FILTER_SHIFT
-
- packuswb xmm3, xmm4
- movdqa [rdi], xmm3 ; store the results in the destination
-
- add rsi, rax ; next line
- add rdi, rdx ; dst_pitch
- cmp rdi, rcx
- jne .next_row_fpo
-
-.done:
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_GOT
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
-
-
-;void vp9_bilinear_predict8x8_sse2
-;(
-; unsigned char *src_ptr,
-; int src_pixels_per_line,
-; int xoffset,
-; int yoffset,
-; unsigned char *dst_ptr,
-; int dst_pitch
-;)
-extern sym(vp9_bilinear_filters_mmx)
-global sym(vp9_bilinear_predict8x8_sse2)
-sym(vp9_bilinear_predict8x8_sse2):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 6
- SAVE_XMM 7
- GET_GOT rbx
- push rsi
- push rdi
- ; end prolog
-
- ALIGN_STACK 16, rax
- sub rsp, 144 ; reserve 144 bytes
-
- ;const short *HFilter = bilinear_filters_mmx[xoffset]
- ;const short *VFilter = bilinear_filters_mmx[yoffset]
- lea rcx, [GLOBAL(sym(vp9_bilinear_filters_mmx))]
-
- mov rsi, arg(0) ;src_ptr
- movsxd rdx, dword ptr arg(1) ;src_pixels_per_line
-
- ;Read in 9 lines of unaligned data and put them on the stack. This gives a
- ;big performance boost.
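- ;Nine rows are needed because the two-tap vertical pass uses one source row
- ;beyond the eight output rows; staging them in the 144-byte (9 x 16) area
- ;lets the filter loop below read them with aligned movdqa loads.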
- movdqu xmm0, [rsi]
- lea rax, [rdx + rdx*2]
- movdqu xmm1, [rsi+rdx]
- movdqu xmm2, [rsi+rdx*2]
- add rsi, rax
- movdqu xmm3, [rsi]
- movdqu xmm4, [rsi+rdx]
- movdqu xmm5, [rsi+rdx*2]
- add rsi, rax
- movdqu xmm6, [rsi]
- movdqu xmm7, [rsi+rdx]
-
- movdqa XMMWORD PTR [rsp], xmm0
-
- movdqu xmm0, [rsi+rdx*2]
-
- movdqa XMMWORD PTR [rsp+16], xmm1
- movdqa XMMWORD PTR [rsp+32], xmm2
- movdqa XMMWORD PTR [rsp+48], xmm3
- movdqa XMMWORD PTR [rsp+64], xmm4
- movdqa XMMWORD PTR [rsp+80], xmm5
- movdqa XMMWORD PTR [rsp+96], xmm6
- movdqa XMMWORD PTR [rsp+112], xmm7
- movdqa XMMWORD PTR [rsp+128], xmm0
-
- movsxd rax, dword ptr arg(2) ;xoffset
- shl rax, 5
- add rax, rcx ;HFilter
-
- mov rdi, arg(4) ;dst_ptr
- movsxd rdx, dword ptr arg(5) ;dst_pitch
-
- movdqa xmm1, [rax]
- movdqa xmm2, [rax+16]
-
- movsxd rax, dword ptr arg(3) ;yoffset
- shl rax, 5
- add rax, rcx ;VFilter
-
- lea rcx, [rdi+rdx*8]
-
- movdqa xmm5, [rax]
- movdqa xmm6, [rax+16]
-
- pxor xmm0, xmm0
-
- ; get the first horizontal line done
- movdqa xmm3, XMMWORD PTR [rsp]
- movdqa xmm4, xmm3 ; make a copy of current line
- psrldq xmm4, 1
-
- punpcklbw xmm3, xmm0 ; 00 01 02 03 04 05 06 07
- punpcklbw xmm4, xmm0 ; 01 02 03 04 05 06 07 08
-
- pmullw xmm3, xmm1
- pmullw xmm4, xmm2
-
- paddw xmm3, xmm4
-
- paddw xmm3, [GLOBAL(rd)] ; xmm3 += round value
- psraw xmm3, VP9_FILTER_SHIFT ; xmm3 /= 128
-
- movdqa xmm7, xmm3
- add rsp, 16 ; next line
-.next_row8x8:
- movdqa xmm3, XMMWORD PTR [rsp] ; 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15
- movdqa xmm4, xmm3 ; make a copy of current line
- psrldq xmm4, 1
-
- punpcklbw xmm3, xmm0 ; 00 01 02 03 04 05 06 07
- punpcklbw xmm4, xmm0 ; 01 02 03 04 05 06 07 08
-
- pmullw xmm3, xmm1
- pmullw xmm4, xmm2
-
- paddw xmm3, xmm4
- pmullw xmm7, xmm5
-
- paddw xmm3, [GLOBAL(rd)] ; xmm3 += round value
- psraw xmm3, VP9_FILTER_SHIFT ; xmm3 /= 128
-
- movdqa xmm4, xmm3
-
- pmullw xmm3, xmm6
- paddw xmm3, xmm7
-
- movdqa xmm7, xmm4
-
- paddw xmm3, [GLOBAL(rd)] ; xmm3 += round value
- psraw xmm3, VP9_FILTER_SHIFT ; xmm3 /= 128
-
- packuswb xmm3, xmm0
- movq [rdi], xmm3 ; store the results in the destination
-
- add rsp, 16 ; next line
- add rdi, rdx
-
- cmp rdi, rcx
- jne .next_row8x8
-
- ;add rsp, 144
- pop rsp
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_GOT
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
-
-
-SECTION_RODATA
-align 16
-rd:
- times 8 dw 0x40
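For reference, a minimal scalar sketch (in C) of the per-pixel 6-tap arithmetic that the SSE2 routines above vectorize: taps that sum to 128, a rounding bias of 64 (the rd constant), an arithmetic shift right by 7 and a saturate to 8 bits, matching the paddsw / psraw 7 / packuswb sequence. The helper names below are illustrative only and are not part of the deleted file.

#include <stdint.h>

/* Saturate an intermediate sum to the 0..255 range, as packuswb does. */
static uint8_t clamp8(int v) {
  return (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));
}

/* One output pixel of a 6-tap pass: src points at x[0], pitch is 1 for a
   horizontal pass or the row stride for a vertical pass, and taps[] is one
   row of 128-scaled filter coefficients. */
uint8_t sixtap_pixel(const uint8_t *src, int pitch, const short taps[6]) {
  int sum = 0;
  int k;
  for (k = 0; k < 6; ++k)
    sum += taps[k] * src[(k - 2) * pitch];  /* x[-2] .. x[3] */
  return clamp8((sum + 64) >> 7);           /* + rd, >> 7 */
}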
--- a/vp8/common/x86/subpixel_ssse3.asm
+++ /dev/null
@@ -1,1515 +1,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
-%include "vpx_ports/x86_abi_support.asm"
-
-%define BLOCK_HEIGHT_WIDTH 4
-%define VP9_FILTER_WEIGHT 128
-%define VP9_FILTER_SHIFT 7
-
-
-;/************************************************************************************
-; Notes: filter_block1d_h6 applies a 6 tap filter horizontally to the input pixels. The
-; input pixel array has output_height rows. This routine assumes that output_height is an
-; even number. This function handles 8 pixels in the horizontal direction, calculating one
-; row per iteration to take advantage of the 128-bit operations.
-;
-; This is an implementation of some of the SSE optimizations first seen in ffvp8
-;
-;*************************************************************************************/
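-; Each pmaddubsw multiplies interleaved pixel pairs by a packed pair of taps
-; from the k0_k5, k2_k4 and k1_k3 tables and adds the two products in one
-; step; the three partial sums are then combined, rounded by 64 (rd) and
-; shifted right by 7 before being packed back to 8 bits.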
-;void vp9_filter_block1d8_h6_ssse3
-;(
-; unsigned char *src_ptr,
-; unsigned int src_pixels_per_line,
-; unsigned char *output_ptr,
-; unsigned int output_pitch,
-; unsigned int output_height,
-; unsigned int vp9_filter_index
-;)
-global sym(vp9_filter_block1d8_h6_ssse3)
-sym(vp9_filter_block1d8_h6_ssse3):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 6
- SAVE_XMM 7
- GET_GOT rbx
- push rsi
- push rdi
- ; end prolog
-
- movsxd rdx, DWORD PTR arg(5) ;table index
- xor rsi, rsi
- shl rdx, 4
-
- movdqa xmm7, [GLOBAL(rd)]
-
- lea rax, [GLOBAL(k0_k5)]
- add rax, rdx
- mov rdi, arg(2) ;output_ptr
-
- cmp esi, DWORD PTR [rax]
- je vp9_filter_block1d8_h4_ssse3
-
- movdqa xmm4, XMMWORD PTR [rax] ;k0_k5
- movdqa xmm5, XMMWORD PTR [rax+256] ;k2_k4
- movdqa xmm6, XMMWORD PTR [rax+128] ;k1_k3
-
- mov rsi, arg(0) ;src_ptr
- movsxd rax, dword ptr arg(1) ;src_pixels_per_line
- movsxd rcx, dword ptr arg(4) ;output_height
-
- movsxd rdx, dword ptr arg(3) ;output_pitch
-
- sub rdi, rdx
-;xmm3 free
-.filter_block1d8_h6_rowloop_ssse3:
- movq xmm0, MMWORD PTR [rsi - 2] ; -2 -1 0 1 2 3 4 5
-
- movq xmm2, MMWORD PTR [rsi + 3] ; 3 4 5 6 7 8 9 10
-
- punpcklbw xmm0, xmm2 ; -2 3 -1 4 0 5 1 6 2 7 3 8 4 9 5 10
-
- movdqa xmm1, xmm0
- pmaddubsw xmm0, xmm4
-
- movdqa xmm2, xmm1
- pshufb xmm1, [GLOBAL(shuf2bfrom1)]
-
- pshufb xmm2, [GLOBAL(shuf3bfrom1)]
- pmaddubsw xmm1, xmm5
-
- lea rdi, [rdi + rdx]
- pmaddubsw xmm2, xmm6
-
- lea rsi, [rsi + rax]
- dec rcx
-
- paddsw xmm0, xmm1
- paddsw xmm2, xmm7
-
- paddsw xmm0, xmm2
-
- psraw xmm0, 7
-
- packuswb xmm0, xmm0
-
- movq MMWORD Ptr [rdi], xmm0
- jnz .filter_block1d8_h6_rowloop_ssse3
-
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_GOT
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
-
-vp9_filter_block1d8_h4_ssse3:
- movdqa xmm5, XMMWORD PTR [rax+256] ;k2_k4
- movdqa xmm6, XMMWORD PTR [rax+128] ;k1_k3
-
- movdqa xmm3, XMMWORD PTR [GLOBAL(shuf2bfrom1)]
- movdqa xmm4, XMMWORD PTR [GLOBAL(shuf3bfrom1)]
-
- mov rsi, arg(0) ;src_ptr
-
- movsxd rax, dword ptr arg(1) ;src_pixels_per_line
- movsxd rcx, dword ptr arg(4) ;output_height
-
- movsxd rdx, dword ptr arg(3) ;output_pitch
-
- sub rdi, rdx
-
-.filter_block1d8_h4_rowloop_ssse3:
- movq xmm0, MMWORD PTR [rsi - 2] ; -2 -1 0 1 2 3 4 5
-
- movq xmm1, MMWORD PTR [rsi + 3] ; 3 4 5 6 7 8 9 10
-
- punpcklbw xmm0, xmm1 ; -2 3 -1 4 0 5 1 6 2 7 3 8 4 9 5 10
-
- movdqa xmm2, xmm0
- pshufb xmm0, xmm3
-
- pshufb xmm2, xmm4
- pmaddubsw xmm0, xmm5
-
- lea rdi, [rdi + rdx]
- pmaddubsw xmm2, xmm6
-
- lea rsi, [rsi + rax]
- dec rcx
-
- paddsw xmm0, xmm7
-
- paddsw xmm0, xmm2
-
- psraw xmm0, 7
-
- packuswb xmm0, xmm0
-
- movq MMWORD Ptr [rdi], xmm0
-
- jnz .filter_block1d8_h4_rowloop_ssse3
-
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_GOT
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
-;void vp9_filter_block1d16_h6_ssse3
-;(
-; unsigned char *src_ptr,
-; unsigned int src_pixels_per_line,
-; unsigned char *output_ptr,
-; unsigned int output_pitch,
-; unsigned int output_height,
-; unsigned int vp9_filter_index
-;)
-global sym(vp9_filter_block1d16_h6_ssse3)
-sym(vp9_filter_block1d16_h6_ssse3):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 6
- SAVE_XMM 7
- GET_GOT rbx
- push rsi
- push rdi
- ; end prolog
-
- movsxd rdx, DWORD PTR arg(5) ;table index
- xor rsi, rsi
- shl rdx, 4 ;
-
- lea rax, [GLOBAL(k0_k5)]
- add rax, rdx
-
- mov rdi, arg(2) ;output_ptr
-
- mov rsi, arg(0) ;src_ptr
-
- movdqa xmm4, XMMWORD PTR [rax] ;k0_k5
- movdqa xmm5, XMMWORD PTR [rax+256] ;k2_k4
- movdqa xmm6, XMMWORD PTR [rax+128] ;k1_k3
-
- movsxd rax, dword ptr arg(1) ;src_pixels_per_line
- movsxd rcx, dword ptr arg(4) ;output_height
- movsxd rdx, dword ptr arg(3) ;output_pitch
-
-.filter_block1d16_h6_rowloop_ssse3:
- movq xmm0, MMWORD PTR [rsi - 2] ; -2 -1 0 1 2 3 4 5
-
- movq xmm3, MMWORD PTR [rsi + 3] ; 3 4 5 6 7 8 9 10
-
- punpcklbw xmm0, xmm3 ; -2 3 -1 4 0 5 1 6 2 7 3 8 4 9 5 10
-
- movdqa xmm1, xmm0
- pmaddubsw xmm0, xmm4
-
- movdqa xmm2, xmm1
- pshufb xmm1, [GLOBAL(shuf2bfrom1)]
-
- pshufb xmm2, [GLOBAL(shuf3bfrom1)]
- movq xmm3, MMWORD PTR [rsi + 6]
-
- pmaddubsw xmm1, xmm5
- movq xmm7, MMWORD PTR [rsi + 11]
-
- pmaddubsw xmm2, xmm6
- punpcklbw xmm3, xmm7
-
- paddsw xmm0, xmm1
- movdqa xmm1, xmm3
-
- pmaddubsw xmm3, xmm4
- paddsw xmm0, xmm2
-
- movdqa xmm2, xmm1
- paddsw xmm0, [GLOBAL(rd)]
-
- pshufb xmm1, [GLOBAL(shuf2bfrom1)]
- pshufb xmm2, [GLOBAL(shuf3bfrom1)]
-
- psraw xmm0, 7
- pmaddubsw xmm1, xmm5
-
- pmaddubsw xmm2, xmm6
- packuswb xmm0, xmm0
-
- lea rsi, [rsi + rax]
- paddsw xmm3, xmm1
-
- paddsw xmm3, xmm2
-
- paddsw xmm3, [GLOBAL(rd)]
-
- psraw xmm3, 7
-
- packuswb xmm3, xmm3
-
- punpcklqdq xmm0, xmm3
-
- movdqa XMMWORD Ptr [rdi], xmm0
-
- lea rdi, [rdi + rdx]
- dec rcx
- jnz .filter_block1d16_h6_rowloop_ssse3
-
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_GOT
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
-
-;void vp9_filter_block1d4_h6_ssse3
-;(
-; unsigned char *src_ptr,
-; unsigned int src_pixels_per_line,
-; unsigned char *output_ptr,
-; unsigned int output_pitch,
-; unsigned int output_height,
-; unsigned int vp9_filter_index
-;)
-global sym(vp9_filter_block1d4_h6_ssse3)
-sym(vp9_filter_block1d4_h6_ssse3):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 6
- SAVE_XMM 7
- GET_GOT rbx
- push rsi
- push rdi
- ; end prolog
-
- movsxd rdx, DWORD PTR arg(5) ;table index
- xor rsi, rsi
- shl rdx, 4 ;
-
- lea rax, [GLOBAL(k0_k5)]
- add rax, rdx
- movdqa xmm7, [GLOBAL(rd)]
-
- cmp esi, DWORD PTR [rax]
- je .vp9_filter_block1d4_h4_ssse3
-
- movdqa xmm4, XMMWORD PTR [rax] ;k0_k5
- movdqa xmm5, XMMWORD PTR [rax+256] ;k2_k4
- movdqa xmm6, XMMWORD PTR [rax+128] ;k1_k3
-
- mov rsi, arg(0) ;src_ptr
- mov rdi, arg(2) ;output_ptr
- movsxd rax, dword ptr arg(1) ;src_pixels_per_line
- movsxd rcx, dword ptr arg(4) ;output_height
-
- movsxd rdx, dword ptr arg(3) ;output_pitch
-
-;xmm3 free
-.filter_block1d4_h6_rowloop_ssse3:
- movdqu xmm0, XMMWORD PTR [rsi - 2]
-
- movdqa xmm1, xmm0
- pshufb xmm0, [GLOBAL(shuf1b)]
-
- movdqa xmm2, xmm1
- pshufb xmm1, [GLOBAL(shuf2b)]
- pmaddubsw xmm0, xmm4
- pshufb xmm2, [GLOBAL(shuf3b)]
- pmaddubsw xmm1, xmm5
-
-;--
- pmaddubsw xmm2, xmm6
-
- lea rsi, [rsi + rax]
-;--
- paddsw xmm0, xmm1
- paddsw xmm0, xmm7
- pxor xmm1, xmm1
- paddsw xmm0, xmm2
- psraw xmm0, 7
- packuswb xmm0, xmm0
-
- movd DWORD PTR [rdi], xmm0
-
- add rdi, rdx
- dec rcx
- jnz .filter_block1d4_h6_rowloop_ssse3
-
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_GOT
- UNSHADOW_ARGS
- pop rbp
- ret
-
-.vp9_filter_block1d4_h4_ssse3:
- movdqa xmm5, XMMWORD PTR [rax+256] ;k2_k4
- movdqa xmm6, XMMWORD PTR [rax+128] ;k1_k3
- movdqa xmm0, XMMWORD PTR [GLOBAL(shuf2b)]
- movdqa xmm3, XMMWORD PTR [GLOBAL(shuf3b)]
-
- mov rsi, arg(0) ;src_ptr
- mov rdi, arg(2) ;output_ptr
- movsxd rax, dword ptr arg(1) ;src_pixels_per_line
- movsxd rcx, dword ptr arg(4) ;output_height
-
- movsxd rdx, dword ptr arg(3) ;output_pitch
-
-.filter_block1d4_h4_rowloop_ssse3:
- movdqu xmm1, XMMWORD PTR [rsi - 2]
-
- movdqa xmm2, xmm1
- pshufb xmm1, xmm0 ;;[GLOBAL(shuf2b)]
- pshufb xmm2, xmm3 ;;[GLOBAL(shuf3b)]
- pmaddubsw xmm1, xmm5
-
-;--
- pmaddubsw xmm2, xmm6
-
- lea rsi, [rsi + rax]
-;--
- paddsw xmm1, xmm7
- paddsw xmm1, xmm2
- psraw xmm1, 7
- packuswb xmm1, xmm1
-
- movd DWORD PTR [rdi], xmm1
-
- add rdi, rdx
- dec rcx
- jnz .filter_block1d4_h4_rowloop_ssse3
-
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_GOT
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
-
-
-
-;void vp9_filter_block1d16_v6_ssse3
-;(
-; unsigned char *src_ptr,
-; unsigned int src_pitch,
-; unsigned char *output_ptr,
-; unsigned int out_pitch,
-; unsigned int output_height,
-; unsigned int vp9_filter_index
-;)
-global sym(vp9_filter_block1d16_v6_ssse3)
-sym(vp9_filter_block1d16_v6_ssse3):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 6
- SAVE_XMM 7
- GET_GOT rbx
- push rsi
- push rdi
- ; end prolog
-
- movsxd rdx, DWORD PTR arg(5) ;table index
- xor rsi, rsi
- shl rdx, 4 ;
-
- lea rax, [GLOBAL(k0_k5)]
- add rax, rdx
-
- cmp esi, DWORD PTR [rax]
- je .vp9_filter_block1d16_v4_ssse3
-
- movdqa xmm5, XMMWORD PTR [rax] ;k0_k5
- movdqa xmm6, XMMWORD PTR [rax+256] ;k2_k4
- movdqa xmm7, XMMWORD PTR [rax+128] ;k1_k3
-
- mov rsi, arg(0) ;src_ptr
- movsxd rdx, DWORD PTR arg(1) ;pixels_per_line
- mov rdi, arg(2) ;output_ptr
-
-%if ABI_IS_32BIT=0
- movsxd r8, DWORD PTR arg(3) ;out_pitch
-%endif
- mov rax, rsi
- movsxd rcx, DWORD PTR arg(4) ;output_height
- add rax, rdx
-
-
-.vp9_filter_block1d16_v6_ssse3_loop:
- movq xmm1, MMWORD PTR [rsi] ;A
- movq xmm2, MMWORD PTR [rsi + rdx] ;B
- movq xmm3, MMWORD PTR [rsi + rdx * 2] ;C
- movq xmm4, MMWORD PTR [rax + rdx * 2] ;D
- movq xmm0, MMWORD PTR [rsi + rdx * 4] ;E
-
- punpcklbw xmm2, xmm4 ;B D
- punpcklbw xmm3, xmm0 ;C E
-
- movq xmm0, MMWORD PTR [rax + rdx * 4] ;F
-
- pmaddubsw xmm3, xmm6
- punpcklbw xmm1, xmm0 ;A F
- pmaddubsw xmm2, xmm7
- pmaddubsw xmm1, xmm5
-
- paddsw xmm2, xmm3
- paddsw xmm2, xmm1
- paddsw xmm2, [GLOBAL(rd)]
- psraw xmm2, 7
- packuswb xmm2, xmm2
-
- movq MMWORD PTR [rdi], xmm2 ;store the results
-
- movq xmm1, MMWORD PTR [rsi + 8] ;A
- movq xmm2, MMWORD PTR [rsi + rdx + 8] ;B
- movq xmm3, MMWORD PTR [rsi + rdx * 2 + 8] ;C
- movq xmm4, MMWORD PTR [rax + rdx * 2 + 8] ;D
- movq xmm0, MMWORD PTR [rsi + rdx * 4 + 8] ;E
-
- punpcklbw xmm2, xmm4 ;B D
- punpcklbw xmm3, xmm0 ;C E
-
- movq xmm0, MMWORD PTR [rax + rdx * 4 + 8] ;F
- pmaddubsw xmm3, xmm6
- punpcklbw xmm1, xmm0 ;A F
- pmaddubsw xmm2, xmm7
- pmaddubsw xmm1, xmm5
-
- add rsi, rdx
- add rax, rdx
-;--
-;--
- paddsw xmm2, xmm3
- paddsw xmm2, xmm1
- paddsw xmm2, [GLOBAL(rd)]
- psraw xmm2, 7
- packuswb xmm2, xmm2
-
- movq MMWORD PTR [rdi+8], xmm2
-
-%if ABI_IS_32BIT
- add rdi, DWORD PTR arg(3) ;out_pitch
-%else
- add rdi, r8
-%endif
- dec rcx
- jnz .vp9_filter_block1d16_v6_ssse3_loop
-
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_GOT
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
-
-.vp9_filter_block1d16_v4_ssse3:
- movdqa xmm6, XMMWORD PTR [rax+256] ;k2_k4
- movdqa xmm7, XMMWORD PTR [rax+128] ;k1_k3
-
- mov rsi, arg(0) ;src_ptr
- movsxd rdx, DWORD PTR arg(1) ;pixels_per_line
- mov rdi, arg(2) ;output_ptr
-
-%if ABI_IS_32BIT=0
- movsxd r8, DWORD PTR arg(3) ;out_pitch
-%endif
- mov rax, rsi
- movsxd rcx, DWORD PTR arg(4) ;output_height
- add rax, rdx
-
-.vp9_filter_block1d16_v4_ssse3_loop:
- movq xmm2, MMWORD PTR [rsi + rdx] ;B
- movq xmm3, MMWORD PTR [rsi + rdx * 2] ;C
- movq xmm4, MMWORD PTR [rax + rdx * 2] ;D
- movq xmm0, MMWORD PTR [rsi + rdx * 4] ;E
-
- punpcklbw xmm2, xmm4 ;B D
- punpcklbw xmm3, xmm0 ;C E
-
- pmaddubsw xmm3, xmm6
- pmaddubsw xmm2, xmm7
- movq xmm5, MMWORD PTR [rsi + rdx + 8] ;B
- movq xmm1, MMWORD PTR [rsi + rdx * 2 + 8] ;C
- movq xmm4, MMWORD PTR [rax + rdx * 2 + 8] ;D
- movq xmm0, MMWORD PTR [rsi + rdx * 4 + 8] ;E
-
- paddsw xmm2, [GLOBAL(rd)]
- paddsw xmm2, xmm3
- psraw xmm2, 7
- packuswb xmm2, xmm2
-
- punpcklbw xmm5, xmm4 ;B D
- punpcklbw xmm1, xmm0 ;C E
-
- pmaddubsw xmm1, xmm6
- pmaddubsw xmm5, xmm7
-
- movdqa xmm4, [GLOBAL(rd)]
- add rsi, rdx
- add rax, rdx
-;--
-;--
- paddsw xmm5, xmm1
- paddsw xmm5, xmm4
- psraw xmm5, 7
- packuswb xmm5, xmm5
-
- punpcklqdq xmm2, xmm5
-
- movdqa XMMWORD PTR [rdi], xmm2
-
-%if ABI_IS_32BIT
- add rdi, DWORD PTR arg(3) ;out_pitch
-%else
- add rdi, r8
-%endif
- dec rcx
- jnz .vp9_filter_block1d16_v4_ssse3_loop
-
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_GOT
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
-
-;void vp9_filter_block1d8_v6_ssse3
-;(
-; unsigned char *src_ptr,
-; unsigned int src_pitch,
-; unsigned char *output_ptr,
-; unsigned int out_pitch,
-; unsigned int output_height,
-; unsigned int vp9_filter_index
-;)
-global sym(vp9_filter_block1d8_v6_ssse3)
-sym(vp9_filter_block1d8_v6_ssse3):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 6
- SAVE_XMM 7
- GET_GOT rbx
- push rsi
- push rdi
- ; end prolog
-
- movsxd rdx, DWORD PTR arg(5) ;table index
- xor rsi, rsi
- shl rdx, 4 ;
-
- lea rax, [GLOBAL(k0_k5)]
- add rax, rdx
-
- movsxd rdx, DWORD PTR arg(1) ;pixels_per_line
- mov rdi, arg(2) ;output_ptr
-%if ABI_IS_32BIT=0
- movsxd r8, DWORD PTR arg(3) ; out_pitch
-%endif
- movsxd rcx, DWORD PTR arg(4) ;[output_height]
-
- cmp esi, DWORD PTR [rax]
- je .vp9_filter_block1d8_v4_ssse3
-
- movdqa xmm5, XMMWORD PTR [rax] ;k0_k5
- movdqa xmm6, XMMWORD PTR [rax+256] ;k2_k4
- movdqa xmm7, XMMWORD PTR [rax+128] ;k1_k3
-
- mov rsi, arg(0) ;src_ptr
-
- mov rax, rsi
- add rax, rdx
-
-.vp9_filter_block1d8_v6_ssse3_loop:
- movq xmm1, MMWORD PTR [rsi] ;A
- movq xmm2, MMWORD PTR [rsi + rdx] ;B
- movq xmm3, MMWORD PTR [rsi + rdx * 2] ;C
- movq xmm4, MMWORD PTR [rax + rdx * 2] ;D
- movq xmm0, MMWORD PTR [rsi + rdx * 4] ;E
-
- punpcklbw xmm2, xmm4 ;B D
- punpcklbw xmm3, xmm0 ;C E
-
- movq xmm0, MMWORD PTR [rax + rdx * 4] ;F
- movdqa xmm4, [GLOBAL(rd)]
-
- pmaddubsw xmm3, xmm6
- punpcklbw xmm1, xmm0 ;A F
- pmaddubsw xmm2, xmm7
- pmaddubsw xmm1, xmm5
- add rsi, rdx
- add rax, rdx
-;--
-;--
- paddsw xmm2, xmm3
- paddsw xmm2, xmm1
- paddsw xmm2, xmm4
- psraw xmm2, 7
- packuswb xmm2, xmm2
-
- movq MMWORD PTR [rdi], xmm2
-
-%if ABI_IS_32BIT
- add rdi, DWORD PTR arg(3) ;[out_pitch]
-%else
- add rdi, r8
-%endif
- dec rcx
- jnz .vp9_filter_block1d8_v6_ssse3_loop
-
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_GOT
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
-
-.vp9_filter_block1d8_v4_ssse3:
- movdqa xmm6, XMMWORD PTR [rax+256] ;k2_k4
- movdqa xmm7, XMMWORD PTR [rax+128] ;k1_k3
- movdqa xmm5, [GLOBAL(rd)]
-
- mov rsi, arg(0) ;src_ptr
-
- mov rax, rsi
- add rax, rdx
-
-.vp9_filter_block1d8_v4_ssse3_loop:
- movq xmm2, MMWORD PTR [rsi + rdx] ;B
- movq xmm3, MMWORD PTR [rsi + rdx * 2] ;C
- movq xmm4, MMWORD PTR [rax + rdx * 2] ;D
- movq xmm0, MMWORD PTR [rsi + rdx * 4] ;E
-
- punpcklbw xmm2, xmm4 ;B D
- punpcklbw xmm3, xmm0 ;C E
-
- pmaddubsw xmm3, xmm6
- pmaddubsw xmm2, xmm7
- add rsi, rdx
- add rax, rdx
-;--
-;--
- paddsw xmm2, xmm3
- paddsw xmm2, xmm5
- psraw xmm2, 7
- packuswb xmm2, xmm2
-
- movq MMWORD PTR [rdi], xmm2
-
-%if ABI_IS_32BIT
- add rdi, DWORD PTR arg(3) ;[out_pitch]
-%else
- add rdi, r8
-%endif
- dec rcx
- jnz .vp9_filter_block1d8_v4_ssse3_loop
-
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_GOT
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
-;void vp9_filter_block1d4_v6_ssse3
-;(
-; unsigned char *src_ptr,
-; unsigned int src_pitch,
-; unsigned char *output_ptr,
-; unsigned int out_pitch,
-; unsigned int output_height,
-; unsigned int vp9_filter_index
-;)
-global sym(vp9_filter_block1d4_v6_ssse3)
-sym(vp9_filter_block1d4_v6_ssse3):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 6
- GET_GOT rbx
- push rsi
- push rdi
- ; end prolog
-
- movsxd rdx, DWORD PTR arg(5) ;table index
- xor rsi, rsi
- shl rdx, 4 ;
-
- lea rax, [GLOBAL(k0_k5)]
- add rax, rdx
-
- movsxd rdx, DWORD PTR arg(1) ;pixels_per_line
- mov rdi, arg(2) ;output_ptr
-%if ABI_IS_32BIT=0
- movsxd r8, DWORD PTR arg(3) ; out_pitch
-%endif
- movsxd rcx, DWORD PTR arg(4) ;[output_height]
-
- cmp esi, DWORD PTR [rax]
- je .vp9_filter_block1d4_v4_ssse3
-
- movq mm5, MMWORD PTR [rax] ;k0_k5
- movq mm6, MMWORD PTR [rax+256] ;k2_k4
- movq mm7, MMWORD PTR [rax+128] ;k1_k3
-
- mov rsi, arg(0) ;src_ptr
-
- mov rax, rsi
- add rax, rdx
-
-.vp9_filter_block1d4_v6_ssse3_loop:
- movd mm1, DWORD PTR [rsi] ;A
- movd mm2, DWORD PTR [rsi + rdx] ;B
- movd mm3, DWORD PTR [rsi + rdx * 2] ;C
- movd mm4, DWORD PTR [rax + rdx * 2] ;D
- movd mm0, DWORD PTR [rsi + rdx * 4] ;E
-
- punpcklbw mm2, mm4 ;B D
- punpcklbw mm3, mm0 ;C E
-
- movd mm0, DWORD PTR [rax + rdx * 4] ;F
-
- movq mm4, [GLOBAL(rd)]
-
- pmaddubsw mm3, mm6
- punpcklbw mm1, mm0 ;A F
- pmaddubsw mm2, mm7
- pmaddubsw mm1, mm5
- add rsi, rdx
- add rax, rdx
-;--
-;--
- paddsw mm2, mm3
- paddsw mm2, mm1
- paddsw mm2, mm4
- psraw mm2, 7
- packuswb mm2, mm2
-
- movd DWORD PTR [rdi], mm2
-
-%if ABI_IS_32BIT
- add rdi, DWORD PTR arg(3) ;[out_pitch]
-%else
- add rdi, r8
-%endif
- dec rcx
- jnz .vp9_filter_block1d4_v6_ssse3_loop
-
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_GOT
- UNSHADOW_ARGS
- pop rbp
- ret
-
-.vp9_filter_block1d4_v4_ssse3:
- movq mm6, MMWORD PTR [rax+256] ;k2_k4
- movq mm7, MMWORD PTR [rax+128] ;k1_k3
- movq mm5, MMWORD PTR [GLOBAL(rd)]
-
- mov rsi, arg(0) ;src_ptr
-
- mov rax, rsi
- add rax, rdx
-
-.vp9_filter_block1d4_v4_ssse3_loop:
- movd mm2, DWORD PTR [rsi + rdx] ;B
- movd mm3, DWORD PTR [rsi + rdx * 2] ;C
- movd mm4, DWORD PTR [rax + rdx * 2] ;D
- movd mm0, DWORD PTR [rsi + rdx * 4] ;E
-
- punpcklbw mm2, mm4 ;B D
- punpcklbw mm3, mm0 ;C E
-
- pmaddubsw mm3, mm6
- pmaddubsw mm2, mm7
- add rsi, rdx
- add rax, rdx
-;--
-;--
- paddsw mm2, mm3
- paddsw mm2, mm5
- psraw mm2, 7
- packuswb mm2, mm2
-
- movd DWORD PTR [rdi], mm2
-
-%if ABI_IS_32BIT
- add rdi, DWORD PTR arg(3) ;[out_pitch]
-%else
- add rdi, r8
-%endif
- dec rcx
- jnz .vp9_filter_block1d4_v4_ssse3_loop
-
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_GOT
- UNSHADOW_ARGS
- pop rbp
- ret
-
-;void vp9_bilinear_predict16x16_ssse3
-;(
-; unsigned char *src_ptr,
-; int src_pixels_per_line,
-; int xoffset,
-; int yoffset,
-; unsigned char *dst_ptr,
-; int dst_pitch
-;)
-global sym(vp9_bilinear_predict16x16_ssse3)
-sym(vp9_bilinear_predict16x16_ssse3):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 6
- SAVE_XMM 7
- GET_GOT rbx
- push rsi
- push rdi
- ; end prolog
-
- lea rcx, [GLOBAL(bilinear_filters_ssse3)]
- movsxd rax, dword ptr arg(2) ; xoffset
-
- cmp rax, 0 ; skip first_pass filter if xoffset=0
- je .b16x16_sp_only
-
- shl rax, 4
- lea rax, [rax + rcx] ; HFilter
-
- mov rdi, arg(4) ; dst_ptr
- mov rsi, arg(0) ; src_ptr
- movsxd rdx, dword ptr arg(5) ; dst_pitch
-
- movdqa xmm1, [rax]
-
- movsxd rax, dword ptr arg(3) ; yoffset
-
- cmp rax, 0 ; skip second_pass filter if yoffset=0
- je .b16x16_fp_only
-
- shl rax, 4
- lea rax, [rax + rcx] ; VFilter
-
- lea rcx, [rdi+rdx*8]
- lea rcx, [rcx+rdx*8]
- movsxd rdx, dword ptr arg(1) ; src_pixels_per_line
-
- movdqa xmm2, [rax]
-
-%if ABI_IS_32BIT=0
- movsxd r8, dword ptr arg(5) ; dst_pitch
-%endif
- movq xmm3, [rsi] ; 00 01 02 03 04 05 06 07
- movq xmm5, [rsi+1] ; 01 02 03 04 05 06 07 08
-
- punpcklbw xmm3, xmm5 ; 00 01 01 02 02 03 03 04 04 05 05 06 06 07 07 08
- movq xmm4, [rsi+8] ; 08 09 10 11 12 13 14 15
-
- movq xmm5, [rsi+9] ; 09 10 11 12 13 14 15 16
-
- lea rsi, [rsi + rdx] ; next line
-
- pmaddubsw xmm3, xmm1 ; 00 02 04 06 08 10 12 14
-
- punpcklbw xmm4, xmm5 ; 08 09 09 10 10 11 11 12 12 13 13 14 14 15 15 16
- pmaddubsw xmm4, xmm1 ; 01 03 05 07 09 11 13 15
-
- paddw xmm3, [GLOBAL(rd)] ; xmm3 += round value
- psraw xmm3, VP9_FILTER_SHIFT ; xmm3 /= 128
-
- paddw xmm4, [GLOBAL(rd)] ; xmm4 += round value
- psraw xmm4, VP9_FILTER_SHIFT ; xmm4 /= 128
-
- movdqa xmm7, xmm3
- packuswb xmm7, xmm4 ; 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15
-
-.next_row:
- movq xmm6, [rsi] ; 00 01 02 03 04 05 06 07
- movq xmm5, [rsi+1] ; 01 02 03 04 05 06 07 08
-
- punpcklbw xmm6, xmm5
- movq xmm4, [rsi+8] ; 08 09 10 11 12 13 14 15
-
- movq xmm5, [rsi+9] ; 09 10 11 12 13 14 15 16
- lea rsi, [rsi + rdx] ; next line
-
- pmaddubsw xmm6, xmm1
-
- punpcklbw xmm4, xmm5
- pmaddubsw xmm4, xmm1
-
- paddw xmm6, [GLOBAL(rd)] ; xmm6 += round value
- psraw xmm6, VP9_FILTER_SHIFT ; xmm6 /= 128
-
- paddw xmm4, [GLOBAL(rd)] ; xmm4 += round value
- psraw xmm4, VP9_FILTER_SHIFT ; xmm4 /= 128
-
- packuswb xmm6, xmm4
- movdqa xmm5, xmm7
-
- punpcklbw xmm5, xmm6
- pmaddubsw xmm5, xmm2
-
- punpckhbw xmm7, xmm6
- pmaddubsw xmm7, xmm2
-
- paddw xmm5, [GLOBAL(rd)] ; xmm5 += round value
- psraw xmm5, VP9_FILTER_SHIFT ; xmm5 /= 128
-
- paddw xmm7, [GLOBAL(rd)] ; xmm7 += round value
- psraw xmm7, VP9_FILTER_SHIFT ; xmm7 /= 128
-
- packuswb xmm5, xmm7
- movdqa xmm7, xmm6
-
- movdqa [rdi], xmm5 ; store the results in the destination
-%if ABI_IS_32BIT
- add rdi, DWORD PTR arg(5) ; dst_pitch
-%else
- add rdi, r8
-%endif
-
- cmp rdi, rcx
- jne .next_row
-
- jmp .done
-
-.b16x16_sp_only:
- movsxd rax, dword ptr arg(3) ; yoffset
- shl rax, 4
- lea rax, [rax + rcx] ; VFilter
-
- mov rdi, arg(4) ; dst_ptr
- mov rsi, arg(0) ; src_ptr
- movsxd rdx, dword ptr arg(5) ; dst_pitch
-
- movdqa xmm1, [rax] ; VFilter
-
- lea rcx, [rdi+rdx*8]
- lea rcx, [rcx+rdx*8]
- movsxd rax, dword ptr arg(1) ; src_pixels_per_line
-
- ; get the first horizontal line done
- movq xmm4, [rsi] ; load row 0
- movq xmm2, [rsi + 8] ; load row 0
-
- lea rsi, [rsi + rax] ; next line
-.next_row_sp:
- movq xmm3, [rsi] ; load row + 1
- movq xmm5, [rsi + 8] ; load row + 1
-
- punpcklbw xmm4, xmm3
- punpcklbw xmm2, xmm5
-
- pmaddubsw xmm4, xmm1
- movq xmm7, [rsi + rax] ; load row + 2
-
- pmaddubsw xmm2, xmm1
- movq xmm6, [rsi + rax + 8] ; load row + 2
-
- punpcklbw xmm3, xmm7
- punpcklbw xmm5, xmm6
-
- pmaddubsw xmm3, xmm1
- paddw xmm4, [GLOBAL(rd)]
-
- pmaddubsw xmm5, xmm1
- paddw xmm2, [GLOBAL(rd)]
-
- psraw xmm4, VP9_FILTER_SHIFT
- psraw xmm2, VP9_FILTER_SHIFT
-
- packuswb xmm4, xmm2
- paddw xmm3, [GLOBAL(rd)]
-
- movdqa [rdi], xmm4 ; store row 0
- paddw xmm5, [GLOBAL(rd)]
-
- psraw xmm3, VP9_FILTER_SHIFT
- psraw xmm5, VP9_FILTER_SHIFT
-
- packuswb xmm3, xmm5
- movdqa xmm4, xmm7
-
- movdqa [rdi + rdx],xmm3 ; store row 1
- lea rsi, [rsi + 2*rax]
-
- movdqa xmm2, xmm6
- lea rdi, [rdi + 2*rdx]
-
- cmp rdi, rcx
- jne .next_row_sp
-
- jmp .done
-
-.b16x16_fp_only:
- lea rcx, [rdi+rdx*8]
- lea rcx, [rcx+rdx*8]
- movsxd rax, dword ptr arg(1) ; src_pixels_per_line
-
-.next_row_fp:
- movq xmm2, [rsi] ; 00 01 02 03 04 05 06 07
- movq xmm4, [rsi+1] ; 01 02 03 04 05 06 07 08
-
- punpcklbw xmm2, xmm4
- movq xmm3, [rsi+8] ; 08 09 10 11 12 13 14 15
-
- pmaddubsw xmm2, xmm1
- movq xmm4, [rsi+9] ; 09 10 11 12 13 14 15 16
-
- lea rsi, [rsi + rax] ; next line
- punpcklbw xmm3, xmm4
-
- pmaddubsw xmm3, xmm1
- movq xmm5, [rsi]
-
- paddw xmm2, [GLOBAL(rd)]
- movq xmm7, [rsi+1]
-
- movq xmm6, [rsi+8]
- psraw xmm2, VP9_FILTER_SHIFT
-
- punpcklbw xmm5, xmm7
- movq xmm7, [rsi+9]
-
- paddw xmm3, [GLOBAL(rd)]
- pmaddubsw xmm5, xmm1
-
- psraw xmm3, VP9_FILTER_SHIFT
- punpcklbw xmm6, xmm7
-
- packuswb xmm2, xmm3
- pmaddubsw xmm6, xmm1
-
- movdqa [rdi], xmm2 ; store the results in the destination
- paddw xmm5, [GLOBAL(rd)]
-
- lea rdi, [rdi + rdx] ; dst_pitch
- psraw xmm5, VP9_FILTER_SHIFT
-
- paddw xmm6, [GLOBAL(rd)]
- psraw xmm6, VP9_FILTER_SHIFT
-
- packuswb xmm5, xmm6
- lea rsi, [rsi + rax] ; next line
-
- movdqa [rdi], xmm5 ; store the results in the destination
- lea rdi, [rdi + rdx] ; dst_pitch
-
- cmp rdi, rcx
-
- jne .next_row_fp
-
-.done:
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_GOT
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
-
-;void vp9_bilinear_predict8x8_ssse3
-;(
-; unsigned char *src_ptr,
-; int src_pixels_per_line,
-; int xoffset,
-; int yoffset,
-; unsigned char *dst_ptr,
-; int dst_pitch
-;)
-global sym(vp9_bilinear_predict8x8_ssse3)
-sym(vp9_bilinear_predict8x8_ssse3):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 6
- SAVE_XMM 7
- GET_GOT rbx
- push rsi
- push rdi
- ; end prolog
-
- ALIGN_STACK 16, rax
- sub rsp, 144 ; reserve 144 bytes
-
- lea rcx, [GLOBAL(bilinear_filters_ssse3)]
-
- mov rsi, arg(0) ;src_ptr
- movsxd rdx, dword ptr arg(1) ;src_pixels_per_line
-
- ;Read in 9 lines of unaligned data and put them on the stack. This gives a
- ;big performance boost.
- movdqu xmm0, [rsi]
- lea rax, [rdx + rdx*2]
- movdqu xmm1, [rsi+rdx]
- movdqu xmm2, [rsi+rdx*2]
- add rsi, rax
- movdqu xmm3, [rsi]
- movdqu xmm4, [rsi+rdx]
- movdqu xmm5, [rsi+rdx*2]
- add rsi, rax
- movdqu xmm6, [rsi]
- movdqu xmm7, [rsi+rdx]
-
- movdqa XMMWORD PTR [rsp], xmm0
-
- movdqu xmm0, [rsi+rdx*2]
-
- movdqa XMMWORD PTR [rsp+16], xmm1
- movdqa XMMWORD PTR [rsp+32], xmm2
- movdqa XMMWORD PTR [rsp+48], xmm3
- movdqa XMMWORD PTR [rsp+64], xmm4
- movdqa XMMWORD PTR [rsp+80], xmm5
- movdqa XMMWORD PTR [rsp+96], xmm6
- movdqa XMMWORD PTR [rsp+112], xmm7
- movdqa XMMWORD PTR [rsp+128], xmm0
-
- movsxd rax, dword ptr arg(2) ; xoffset
- cmp rax, 0 ; skip first_pass filter if xoffset=0
- je .b8x8_sp_only
-
- shl rax, 4
- add rax, rcx ; HFilter
-
- mov rdi, arg(4) ; dst_ptr
- movsxd rdx, dword ptr arg(5) ; dst_pitch
-
- movdqa xmm0, [rax]
-
- movsxd rax, dword ptr arg(3) ; yoffset
- cmp rax, 0 ; skip second_pass filter if yoffset=0
- je .b8x8_fp_only
-
- shl rax, 4
- lea rax, [rax + rcx] ; VFilter
-
- lea rcx, [rdi+rdx*8]
-
- movdqa xmm1, [rax]
-
- ; get the first horizontal line done
- movdqa xmm3, [rsp] ; 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15
- movdqa xmm5, xmm3 ; 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15 xx
-
- psrldq xmm5, 1
- lea rsp, [rsp + 16] ; next line
-
- punpcklbw xmm3, xmm5 ; 00 01 01 02 02 03 03 04 04 05 05 06 06 07 07 08
- pmaddubsw xmm3, xmm0 ; 00 02 04 06 08 10 12 14
-
- paddw xmm3, [GLOBAL(rd)] ; xmm3 += round value
- psraw xmm3, VP9_FILTER_SHIFT ; xmm3 /= 128
-
- movdqa xmm7, xmm3
- packuswb xmm7, xmm7 ; 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15
-
-.next_row:
- movdqa xmm6, [rsp] ; 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15
- lea rsp, [rsp + 16] ; next line
-
- movdqa xmm5, xmm6
-
- psrldq xmm5, 1
-
- punpcklbw xmm6, xmm5
- pmaddubsw xmm6, xmm0
-
- paddw xmm6, [GLOBAL(rd)] ; xmm6 += round value
- psraw xmm6, VP9_FILTER_SHIFT ; xmm6 /= 128
-
- packuswb xmm6, xmm6
-
- punpcklbw xmm7, xmm6
- pmaddubsw xmm7, xmm1
-
- paddw xmm7, [GLOBAL(rd)] ; xmm7 += round value
- psraw xmm7, VP9_FILTER_SHIFT ; xmm7 /= 128
-
- packuswb xmm7, xmm7
-
- movq [rdi], xmm7 ; store the results in the destination
- lea rdi, [rdi + rdx]
-
- movdqa xmm7, xmm6
-
- cmp rdi, rcx
- jne .next_row
-
- jmp .done8x8
-
-.b8x8_sp_only:
- movsxd rax, dword ptr arg(3) ; yoffset
- shl rax, 4
- lea rax, [rax + rcx] ; VFilter
-
- mov rdi, arg(4) ;dst_ptr
- movsxd rdx, dword ptr arg(5) ; dst_pitch
-
- movdqa xmm0, [rax] ; VFilter
-
- movq xmm1, XMMWORD PTR [rsp]
- movq xmm2, XMMWORD PTR [rsp+16]
-
- movq xmm3, XMMWORD PTR [rsp+32]
- punpcklbw xmm1, xmm2
-
- movq xmm4, XMMWORD PTR [rsp+48]
- punpcklbw xmm2, xmm3
-
- movq xmm5, XMMWORD PTR [rsp+64]
- punpcklbw xmm3, xmm4
-
- movq xmm6, XMMWORD PTR [rsp+80]
- punpcklbw xmm4, xmm5
-
- movq xmm7, XMMWORD PTR [rsp+96]
- punpcklbw xmm5, xmm6
-
- pmaddubsw xmm1, xmm0
- pmaddubsw xmm2, xmm0
-
- pmaddubsw xmm3, xmm0
- pmaddubsw xmm4, xmm0
-
- pmaddubsw xmm5, xmm0
- punpcklbw xmm6, xmm7
-
- pmaddubsw xmm6, xmm0
- paddw xmm1, [GLOBAL(rd)]
-
- paddw xmm2, [GLOBAL(rd)]
- psraw xmm1, VP9_FILTER_SHIFT
-
- paddw xmm3, [GLOBAL(rd)]
- psraw xmm2, VP9_FILTER_SHIFT
-
- paddw xmm4, [GLOBAL(rd)]
- psraw xmm3, VP9_FILTER_SHIFT
-
- paddw xmm5, [GLOBAL(rd)]
- psraw xmm4, VP9_FILTER_SHIFT
-
- paddw xmm6, [GLOBAL(rd)]
- psraw xmm5, VP9_FILTER_SHIFT
-
- psraw xmm6, VP9_FILTER_SHIFT
- packuswb xmm1, xmm1
-
- packuswb xmm2, xmm2
- movq [rdi], xmm1
-
- packuswb xmm3, xmm3
- movq [rdi+rdx], xmm2
-
- packuswb xmm4, xmm4
- movq xmm1, XMMWORD PTR [rsp+112]
-
- lea rdi, [rdi + 2*rdx]
- movq xmm2, XMMWORD PTR [rsp+128]
-
- packuswb xmm5, xmm5
- movq [rdi], xmm3
-
- packuswb xmm6, xmm6
- movq [rdi+rdx], xmm4
-
- lea rdi, [rdi + 2*rdx]
- punpcklbw xmm7, xmm1
-
- movq [rdi], xmm5
- pmaddubsw xmm7, xmm0
-
- movq [rdi+rdx], xmm6
- punpcklbw xmm1, xmm2
-
- pmaddubsw xmm1, xmm0
- paddw xmm7, [GLOBAL(rd)]
-
- psraw xmm7, VP9_FILTER_SHIFT
- paddw xmm1, [GLOBAL(rd)]
-
- psraw xmm1, VP9_FILTER_SHIFT
- packuswb xmm7, xmm7
-
- packuswb xmm1, xmm1
- lea rdi, [rdi + 2*rdx]
-
- movq [rdi], xmm7
-
- movq [rdi+rdx], xmm1
- lea rsp, [rsp + 144]
-
- jmp .done8x8
-
-.b8x8_fp_only:
- lea rcx, [rdi+rdx*8]
-
-.next_row_fp:
- movdqa xmm1, XMMWORD PTR [rsp]
- movdqa xmm3, XMMWORD PTR [rsp+16]
-
- movdqa xmm2, xmm1
- movdqa xmm5, XMMWORD PTR [rsp+32]
-
- psrldq xmm2, 1
- movdqa xmm7, XMMWORD PTR [rsp+48]
-
- movdqa xmm4, xmm3
- psrldq xmm4, 1
-
- movdqa xmm6, xmm5
- psrldq xmm6, 1
-
- punpcklbw xmm1, xmm2
- pmaddubsw xmm1, xmm0
-
- punpcklbw xmm3, xmm4
- pmaddubsw xmm3, xmm0
-
- punpcklbw xmm5, xmm6
- pmaddubsw xmm5, xmm0
-
- movdqa xmm2, xmm7
- psrldq xmm2, 1
-
- punpcklbw xmm7, xmm2
- pmaddubsw xmm7, xmm0
-
- paddw xmm1, [GLOBAL(rd)]
- psraw xmm1, VP9_FILTER_SHIFT
-
- paddw xmm3, [GLOBAL(rd)]
- psraw xmm3, VP9_FILTER_SHIFT
-
- paddw xmm5, [GLOBAL(rd)]
- psraw xmm5, VP9_FILTER_SHIFT
-
- paddw xmm7, [GLOBAL(rd)]
- psraw xmm7, VP9_FILTER_SHIFT
-
- packuswb xmm1, xmm1
- packuswb xmm3, xmm3
-
- packuswb xmm5, xmm5
- movq [rdi], xmm1
-
- packuswb xmm7, xmm7
- movq [rdi+rdx], xmm3
-
- lea rdi, [rdi + 2*rdx]
- movq [rdi], xmm5
-
- lea rsp, [rsp + 4*16]
- movq [rdi+rdx], xmm7
-
- lea rdi, [rdi + 2*rdx]
- cmp rdi, rcx
-
- jne .next_row_fp
-
- lea rsp, [rsp + 16]
-
-.done8x8:
- ;add rsp, 144
- pop rsp
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_GOT
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
-
-SECTION_RODATA
-align 16
-shuf1b:
- db 0, 5, 1, 6, 2, 7, 3, 8, 4, 9, 5, 10, 6, 11, 7, 12
-shuf2b:
- db 2, 4, 3, 5, 4, 6, 5, 7, 6, 8, 7, 9, 8, 10, 9, 11
-shuf3b:
- db 1, 3, 2, 4, 3, 5, 4, 6, 5, 7, 6, 8, 7, 9, 8, 10
-
-align 16
-shuf2bfrom1:
- db 4, 8, 6, 1, 8, 3, 1, 5, 3, 7, 5, 9, 7,11, 9,13
-align 16
-shuf3bfrom1:
- db 2, 6, 4, 8, 6, 1, 8, 3, 1, 5, 3, 7, 5, 9, 7,11
-
-align 16
-rd:
- times 8 dw 0x40
-
-align 16
-k0_k5:
- times 8 db 0, 0 ;placeholder
- times 8 db 0, 0
- times 8 db 2, 1
- times 8 db 0, 0
- times 8 db 3, 3
- times 8 db 0, 0
- times 8 db 1, 2
- times 8 db 0, 0
-k1_k3:
- times 8 db 0, 0 ;placeholder
- times 8 db -6, 12
- times 8 db -11, 36
- times 8 db -9, 50
- times 8 db -16, 77
- times 8 db -6, 93
- times 8 db -8, 108
- times 8 db -1, 123
-k2_k4:
- times 8 db 128, 0 ;placeholder
- times 8 db 123, -1
- times 8 db 108, -8
- times 8 db 93, -6
- times 8 db 77, -16
- times 8 db 50, -9
- times 8 db 36, -11
- times 8 db 12, -6
-align 16
-bilinear_filters_ssse3:
- times 8 db 128, 0
- times 8 db 120, 8
- times 8 db 112, 16
- times 8 db 104, 24
- times 8 db 96, 32
- times 8 db 88, 40
- times 8 db 80, 48
- times 8 db 72, 56
- times 8 db 64, 64
- times 8 db 56, 72
- times 8 db 48, 80
- times 8 db 40, 88
- times 8 db 32, 96
- times 8 db 24, 104
- times 8 db 16, 112
- times 8 db 8, 120
-
--- a/vp8/common/x86/subpixel_x86.h
+++ /dev/null
@@ -1,122 +1,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#ifndef SUBPIXEL_X86_H
-#define SUBPIXEL_X86_H
-
-/* Note:
- *
- * This platform is commonly built for runtime CPU detection. If you modify
- * any of the function mappings present in this file, be sure to also update
- * them in the function pointer initialization code
- */
-
-#if HAVE_MMX
-extern prototype_subpixel_predict(vp9_sixtap_predict16x16_mmx);
-extern prototype_subpixel_predict(vp9_sixtap_predict8x8_mmx);
-extern prototype_subpixel_predict(vp9_sixtap_predict8x4_mmx);
-extern prototype_subpixel_predict(vp9_sixtap_predict4x4_mmx);
-extern prototype_subpixel_predict(vp9_bilinear_predict16x16_mmx);
-extern prototype_subpixel_predict(vp9_bilinear_predict8x8_mmx);
-extern prototype_subpixel_predict(vp9_bilinear_predict8x4_mmx);
-extern prototype_subpixel_predict(vp9_bilinear_predict4x4_mmx);
-
-
-#if !CONFIG_RUNTIME_CPU_DETECT
-#undef vp9_subpix_sixtap16x16
-#define vp9_subpix_sixtap16x16 vp9_sixtap_predict16x16_mmx
-
-#undef vp9_subpix_sixtap8x8
-#define vp9_subpix_sixtap8x8 vp9_sixtap_predict8x8_mmx
-
-#undef vp9_subpix_sixtap8x4
-#define vp9_subpix_sixtap8x4 vp9_sixtap_predict8x4_mmx
-
-#undef vp9_subpix_sixtap4x4
-#define vp9_subpix_sixtap4x4 vp9_sixtap_predict4x4_mmx
-
-#undef vp9_subpix_bilinear16x16
-#define vp9_subpix_bilinear16x16 vp9_bilinear_predict16x16_mmx
-
-#undef vp9_subpix_bilinear8x8
-#define vp9_subpix_bilinear8x8 vp9_bilinear_predict8x8_mmx
-
-#undef vp9_subpix_bilinear8x4
-#define vp9_subpix_bilinear8x4 vp9_bilinear_predict8x4_mmx
-
-#undef vp9_subpix_bilinear4x4
-#define vp9_subpix_bilinear4x4 vp9_bilinear_predict4x4_mmx
-
-#endif
-#endif
-
-
-#if HAVE_SSE2
-extern prototype_subpixel_predict(vp9_sixtap_predict16x16_sse2);
-extern prototype_subpixel_predict(vp9_sixtap_predict8x8_sse2);
-extern prototype_subpixel_predict(vp9_sixtap_predict8x4_sse2);
-extern prototype_subpixel_predict(vp9_bilinear_predict16x16_sse2);
-extern prototype_subpixel_predict(vp9_bilinear_predict8x8_sse2);
-
-
-#if !CONFIG_RUNTIME_CPU_DETECT
-#undef vp9_subpix_sixtap16x16
-#define vp9_subpix_sixtap16x16 vp9_sixtap_predict16x16_sse2
-
-#undef vp9_subpix_sixtap8x8
-#define vp9_subpix_sixtap8x8 vp9_sixtap_predict8x8_sse2
-
-#undef vp9_subpix_sixtap8x4
-#define vp9_subpix_sixtap8x4 vp9_sixtap_predict8x4_sse2
-
-#undef vp9_subpix_bilinear16x16
-#define vp9_subpix_bilinear16x16 vp9_bilinear_predict16x16_sse2
-
-#undef vp9_subpix_bilinear8x8
-#define vp9_subpix_bilinear8x8 vp9_bilinear_predict8x8_sse2
-
-#endif
-#endif
-
-#if HAVE_SSSE3
-extern prototype_subpixel_predict(vp9_sixtap_predict16x16_ssse3);
-extern prototype_subpixel_predict(vp9_sixtap_predict8x8_ssse3);
-extern prototype_subpixel_predict(vp9_sixtap_predict8x4_ssse3);
-extern prototype_subpixel_predict(vp9_sixtap_predict4x4_ssse3);
-extern prototype_subpixel_predict(vp9_bilinear_predict16x16_ssse3);
-extern prototype_subpixel_predict(vp9_bilinear_predict8x8_ssse3);
-
-#if !CONFIG_RUNTIME_CPU_DETECT
-#undef vp9_subpix_sixtap16x16
-#define vp9_subpix_sixtap16x16 vp9_sixtap_predict16x16_ssse3
-
-#undef vp9_subpix_sixtap8x8
-#define vp9_subpix_sixtap8x8 vp9_sixtap_predict8x8_ssse3
-
-#undef vp9_subpix_sixtap8x4
-#define vp9_subpix_sixtap8x4 vp9_sixtap_predict8x4_ssse3
-
-#undef vp9_subpix_sixtap4x4
-#define vp9_subpix_sixtap4x4 vp9_sixtap_predict4x4_ssse3
-
-
-#undef vp9_subpix_bilinear16x16
-#define vp9_subpix_bilinear16x16 vp9_bilinear_predict16x16_ssse3
-
-#undef vp9_subpix_bilinear8x8
-#define vp9_subpix_bilinear8x8 vp9_bilinear_predict8x8_ssse3
-
-#endif
-#endif
-
-
-
-#endif
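
The header removed above is the compile-time half of the dispatch story: when runtime CPU detection is disabled, the generic vp9_subpix_* names are re-pointed with #undef/#define directly at the best SIMD symbol for the build target; with detection enabled they stay generic and an init routine fills a function-pointer table instead (see the x86_systemdependent.c hunk further down). A toy version of the static-mapping pattern, with made-up names; CONFIG_RUNTIME_CPU_DETECT itself comes from vpx_config.h:

/* Compile-time (static) dispatch sketch; blur/blur_c/blur_sse2 are made up. */
void blur_c(unsigned char *buf, int n);     /* portable fallback            */
void blur_sse2(unsigned char *buf, int n);  /* hand-optimized SIMD version  */

#define blur blur_c                         /* generic default mapping      */

#if !CONFIG_RUNTIME_CPU_DETECT
/* Static mapping: callers of blur() bind to the SSE2 symbol at build time,
 * exactly as the #undef/#define pairs in the deleted header do.             */
#undef  blur
#define blur blur_sse2
#endif
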
--- a/vp8/common/x86/vp8_asm_stubs.c
+++ /dev/null
@@ -1,602 +1,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include "vpx_ports/config.h"
-#include "vpx_ports/mem.h"
-#include "vp8/common/subpixel.h"
-
-extern const short vp9_six_tap_mmx[16][6 * 8];
-
-extern const short vp9_bilinear_filters_8x_mmx[16][2 * 8];
-
-extern void vp9_filter_block1d_h6_mmx(unsigned char *src_ptr,
- unsigned short *output_ptr,
- unsigned int src_pixels_per_line,
- unsigned int pixel_step,
- unsigned int output_height,
- unsigned int output_width,
- const short *vp9_filter);
-
-extern void vp9_filter_block1dc_v6_mmx(unsigned short *src_ptr,
- unsigned char *output_ptr,
- int output_pitch,
- unsigned int pixels_per_line,
- unsigned int pixel_step,
- unsigned int output_height,
- unsigned int output_width,
- const short *vp9_filter);
-
-extern void vp9_filter_block1d8_h6_sse2(unsigned char *src_ptr,
- unsigned short *output_ptr,
- unsigned int src_pixels_per_line,
- unsigned int pixel_step,
- unsigned int output_height,
- unsigned int output_width,
- const short *vp9_filter);
-
-extern void vp9_filter_block1d16_h6_sse2(unsigned char *src_ptr,
- unsigned short *output_ptr,
- unsigned int src_pixels_per_line,
- unsigned int pixel_step,
- unsigned int output_height,
- unsigned int output_width,
- const short *vp9_filter);
-
-extern void vp9_filter_block1d8_v6_sse2(unsigned short *src_ptr,
- unsigned char *output_ptr,
- int dst_ptich,
- unsigned int pixels_per_line,
- unsigned int pixel_step,
- unsigned int output_height,
- unsigned int output_width,
- const short *vp9_filter);
-
-extern void vp9_filter_block1d16_v6_sse2(unsigned short *src_ptr,
- unsigned char *output_ptr,
- int dst_ptich,
- unsigned int pixels_per_line,
- unsigned int pixel_step,
- unsigned int output_height,
- unsigned int output_width,
- const short *vp9_filter);
-
-extern void vp9_unpack_block1d16_h6_sse2(unsigned char *src_ptr,
- unsigned short *output_ptr,
- unsigned int src_pixels_per_line,
- unsigned int output_height,
- unsigned int output_width);
-
-extern void vp9_filter_block1d8_h6_only_sse2(unsigned char *src_ptr,
- unsigned int src_pixels_per_line,
- unsigned char *output_ptr,
- int dst_pitch,
- unsigned int output_height,
- const short *vp9_filter);
-
-extern void vp9_filter_block1d16_h6_only_sse2(unsigned char *src_ptr,
- unsigned int src_pixels_per_lin,
- unsigned char *output_ptr,
- int dst_pitch,
- unsigned int output_height,
- const short *vp9_filter);
-
-extern void vp9_filter_block1d8_v6_only_sse2(unsigned char *src_ptr,
- unsigned int src_pixels_per_line,
- unsigned char *output_ptr,
- int dst_pitch,
- unsigned int output_height,
- const short *vp9_filter);
-
-extern prototype_subpixel_predict(vp9_bilinear_predict8x8_mmx);
-
-#if HAVE_MMX
-void vp9_sixtap_predict4x4_mmx(unsigned char *src_ptr,
- int src_pixels_per_line,
- int xoffset,
- int yoffset,
- unsigned char *dst_ptr,
- int dst_pitch) {
-#ifdef ANNOUNCE_FUNCTION
- printf("vp9_sixtap_predict4x4_mmx\n");
-#endif
- /* Temp data buffer used in filtering */
- DECLARE_ALIGNED_ARRAY(16, unsigned short, fdata2, 16 * 16);
- const short *hfilter, *vfilter;
- hfilter = vp9_six_tap_mmx[xoffset];
- vp9_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line), fdata2,
- src_pixels_per_line, 1, 9, 8, hfilter);
- vfilter = vp9_six_tap_mmx[yoffset];
- vp9_filter_block1dc_v6_mmx(fdata2 + 8, dst_ptr, dst_pitch,
- 8, 4, 4, 4, vfilter);
-}
-
-void vp9_sixtap_predict16x16_mmx(unsigned char *src_ptr,
- int src_pixels_per_line,
- int xoffset,
- int yoffset,
- unsigned char *dst_ptr,
- int dst_pitch) {
-#ifdef ANNOUNCE_FUNCTION
- printf("vp9_sixtap_predict16x16_mmx\n");
-#endif
- /* Temp data buffer used in filtering */
- DECLARE_ALIGNED_ARRAY(16, unsigned short, fdata2, 24 * 24);
- const short *hfilter, *vfilter;
-
- hfilter = vp9_six_tap_mmx[xoffset];
- vp9_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line),
- fdata2, src_pixels_per_line, 1, 21, 32,
- hfilter);
- vp9_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line) + 4,
- fdata2 + 4, src_pixels_per_line, 1, 21, 32,
- hfilter);
- vp9_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line) + 8,
- fdata2 + 8, src_pixels_per_line, 1, 21, 32,
- hfilter);
- vp9_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line) + 12,
- fdata2 + 12, src_pixels_per_line, 1, 21, 32,
- hfilter);
-
- vfilter = vp9_six_tap_mmx[yoffset];
- vp9_filter_block1dc_v6_mmx(fdata2 + 32, dst_ptr, dst_pitch,
- 32, 16, 16, 16, vfilter);
- vp9_filter_block1dc_v6_mmx(fdata2 + 36, dst_ptr + 4, dst_pitch,
- 32, 16, 16, 16, vfilter);
- vp9_filter_block1dc_v6_mmx(fdata2 + 40, dst_ptr + 8, dst_pitch,
- 32, 16, 16, 16, vfilter);
- vp9_filter_block1dc_v6_mmx(fdata2 + 44, dst_ptr + 12, dst_pitch,
- 32, 16, 16, 16, vfilter);
-}
-
-void vp9_sixtap_predict8x8_mmx(unsigned char *src_ptr,
- int src_pixels_per_line,
- int xoffset,
- int yoffset,
- unsigned char *dst_ptr,
- int dst_pitch) {
-#ifdef ANNOUNCE_FUNCTION
- printf("vp9_sixtap_predict8x8_mmx\n");
-#endif
- /* Temp data buffer used in filtering */
- DECLARE_ALIGNED_ARRAY(16, unsigned short, fdata2, 256);
- const short *hfilter, *vfilter;
-
- hfilter = vp9_six_tap_mmx[xoffset];
- vp9_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line),
- fdata2, src_pixels_per_line, 1, 13, 16,
- hfilter);
- vp9_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line) + 4,
- fdata2 + 4, src_pixels_per_line, 1, 13, 16,
- hfilter);
-
- vfilter = vp9_six_tap_mmx[yoffset];
- vp9_filter_block1dc_v6_mmx(fdata2 + 16, dst_ptr, dst_pitch,
- 16, 8, 8, 8, vfilter);
- vp9_filter_block1dc_v6_mmx(fdata2 + 20, dst_ptr + 4, dst_pitch,
- 16, 8, 8, 8, vfilter);
-}
-
-void vp9_sixtap_predict8x4_mmx(unsigned char *src_ptr,
- int src_pixels_per_line,
- int xoffset,
- int yoffset,
- unsigned char *dst_ptr,
- int dst_pitch) {
-#ifdef ANNOUNCE_FUNCTION
- printf("vp9_sixtap_predict8x4_mmx\n");
-#endif
- /* Temp data buffer used in filtering */
- DECLARE_ALIGNED_ARRAY(16, unsigned short, fdata2, 256);
- const short *hfilter, *vfilter;
-
- hfilter = vp9_six_tap_mmx[xoffset];
- vp9_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line),
- fdata2, src_pixels_per_line, 1, 9, 16, hfilter);
- vp9_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line) + 4,
- fdata2 + 4, src_pixels_per_line, 1, 9, 16, hfilter);
-
- vfilter = vp9_six_tap_mmx[yoffset];
- vp9_filter_block1dc_v6_mmx(fdata2 + 16, dst_ptr, dst_pitch,
- 16, 8, 4, 8, vfilter);
- vp9_filter_block1dc_v6_mmx(fdata2 + 20, dst_ptr + 4, dst_pitch,
- 16, 8, 4, 8, vfilter);
-}
-
-void vp9_bilinear_predict16x16_mmx(unsigned char *src_ptr,
- int src_pixels_per_line,
- int xoffset,
- int yoffset,
- unsigned char *dst_ptr,
- int dst_pitch) {
- vp9_bilinear_predict8x8_mmx(src_ptr,
- src_pixels_per_line, xoffset, yoffset,
- dst_ptr, dst_pitch);
- vp9_bilinear_predict8x8_mmx(src_ptr + 8,
- src_pixels_per_line, xoffset, yoffset,
- dst_ptr + 8, dst_pitch);
- vp9_bilinear_predict8x8_mmx(src_ptr + 8 * src_pixels_per_line,
- src_pixels_per_line, xoffset, yoffset,
- dst_ptr + dst_pitch * 8, dst_pitch);
- vp9_bilinear_predict8x8_mmx(src_ptr + 8 * src_pixels_per_line + 8,
- src_pixels_per_line, xoffset, yoffset,
- dst_ptr + dst_pitch * 8 + 8, dst_pitch);
-}
-#endif
-
-#if HAVE_SSE2
-void vp9_sixtap_predict16x16_sse2(unsigned char *src_ptr,
- int src_pixels_per_line,
- int xoffset,
- int yoffset,
- unsigned char *dst_ptr,
- int dst_pitch) {
- /* Temp data buffer used in filtering */
- DECLARE_ALIGNED_ARRAY(16, unsigned short, fdata2, 24 * 24);
- const short *hfilter, *vfilter;
-#ifdef ANNOUNCE_FUNCTION
- printf("vp9_sixtap_predict16x16_sse2\n");
-#endif
-
- if (xoffset) {
- if (yoffset) {
- hfilter = vp9_six_tap_mmx[xoffset];
- vp9_filter_block1d16_h6_sse2(src_ptr - (2 * src_pixels_per_line), fdata2,
- src_pixels_per_line, 1, 21, 32, hfilter);
- vfilter = vp9_six_tap_mmx[yoffset];
- vp9_filter_block1d16_v6_sse2(fdata2 + 32, dst_ptr, dst_pitch,
- 32, 16, 16, dst_pitch, vfilter);
- } else {
- /* First-pass only */
- hfilter = vp9_six_tap_mmx[xoffset];
- vp9_filter_block1d16_h6_only_sse2(src_ptr, src_pixels_per_line,
- dst_ptr, dst_pitch, 16, hfilter);
- }
- } else {
- /* Second-pass only */
- vfilter = vp9_six_tap_mmx[yoffset];
- vp9_unpack_block1d16_h6_sse2(src_ptr - (2 * src_pixels_per_line), fdata2,
- src_pixels_per_line, 21, 32);
- vp9_filter_block1d16_v6_sse2(fdata2 + 32, dst_ptr, dst_pitch,
- 32, 16, 16, dst_pitch, vfilter);
- }
-}
-
-void vp9_sixtap_predict8x8_sse2(unsigned char *src_ptr,
- int src_pixels_per_line,
- int xoffset,
- int yoffset,
- unsigned char *dst_ptr,
- int dst_pitch) {
- /* Temp data buffer used in filtering */
- DECLARE_ALIGNED_ARRAY(16, unsigned short, fdata2, 256);
- const short *hfilter, *vfilter;
-#ifdef ANNOUNCE_FUNCTION
- printf("vp9_sixtap_predict8x8_sse2\n");
-#endif
-
- if (xoffset) {
- if (yoffset) {
- hfilter = vp9_six_tap_mmx[xoffset];
- vp9_filter_block1d8_h6_sse2(src_ptr - (2 * src_pixels_per_line), fdata2,
- src_pixels_per_line, 1, 13, 16, hfilter);
- vfilter = vp9_six_tap_mmx[yoffset];
- vp9_filter_block1d8_v6_sse2(fdata2 + 16, dst_ptr, dst_pitch,
- 16, 8, 8, dst_pitch, vfilter);
- } else {
- /* First-pass only */
- hfilter = vp9_six_tap_mmx[xoffset];
- vp9_filter_block1d8_h6_only_sse2(src_ptr, src_pixels_per_line,
- dst_ptr, dst_pitch, 8, hfilter);
- }
- } else {
- /* Second-pass only */
- vfilter = vp9_six_tap_mmx[yoffset];
- vp9_filter_block1d8_v6_only_sse2(src_ptr - (2 * src_pixels_per_line),
- src_pixels_per_line,
- dst_ptr, dst_pitch, 8, vfilter);
- }
-}
-
-void vp9_sixtap_predict8x4_sse2(unsigned char *src_ptr,
- int src_pixels_per_line,
- int xoffset,
- int yoffset,
- unsigned char *dst_ptr,
- int dst_pitch) {
- /* Temp data buffer used in filtering */
- DECLARE_ALIGNED_ARRAY(16, unsigned short, fdata2, 256);
- const short *hfilter, *vfilter;
-#ifdef ANNOUNCE_FUNCTION
- printf("vp9_sixtap_predict8x4_sse2\n");
-#endif
-
- if (xoffset) {
- if (yoffset) {
- hfilter = vp9_six_tap_mmx[xoffset];
- vp9_filter_block1d8_h6_sse2(src_ptr - (2 * src_pixels_per_line), fdata2,
- src_pixels_per_line, 1, 9, 16, hfilter);
- vfilter = vp9_six_tap_mmx[yoffset];
- vp9_filter_block1d8_v6_sse2(fdata2 + 16, dst_ptr, dst_pitch,
- 16, 8, 4, dst_pitch, vfilter);
- } else {
- /* First-pass only */
- hfilter = vp9_six_tap_mmx[xoffset];
- vp9_filter_block1d8_h6_only_sse2(src_ptr, src_pixels_per_line,
- dst_ptr, dst_pitch, 4, hfilter);
- }
- } else {
- /* Second-pass only */
- vfilter = vp9_six_tap_mmx[yoffset];
- vp9_filter_block1d8_v6_only_sse2(src_ptr - (2 * src_pixels_per_line),
- src_pixels_per_line,
- dst_ptr, dst_pitch, 4, vfilter);
- }
-}
-#endif
-
-#if HAVE_SSSE3
-extern void vp9_filter_block1d8_h6_ssse3(unsigned char *src_ptr,
- unsigned int src_pixels_per_line,
- unsigned char *output_ptr,
- unsigned int output_pitch,
- unsigned int output_height,
- unsigned int vp9_filter_index);
-
-extern void vp9_filter_block1d16_h6_ssse3(unsigned char *src_ptr,
- unsigned int src_pixels_per_line,
- unsigned char *output_ptr,
- unsigned int output_pitch,
- unsigned int output_height,
- unsigned int vp9_filter_index);
-
-extern void vp9_filter_block1d16_v6_ssse3(unsigned char *src_ptr,
- unsigned int src_pitch,
- unsigned char *output_ptr,
- unsigned int out_pitch,
- unsigned int output_height,
- unsigned int vp9_filter_index);
-
-extern void vp9_filter_block1d8_v6_ssse3(unsigned char *src_ptr,
- unsigned int src_pitch,
- unsigned char *output_ptr,
- unsigned int out_pitch,
- unsigned int output_height,
- unsigned int vp9_filter_index);
-
-extern void vp9_filter_block1d4_h6_ssse3(unsigned char *src_ptr,
- unsigned int src_pixels_per_line,
- unsigned char *output_ptr,
- unsigned int output_pitch,
- unsigned int output_height,
- unsigned int vp9_filter_index);
-
-extern void vp9_filter_block1d4_v6_ssse3(unsigned char *src_ptr,
- unsigned int src_pitch,
- unsigned char *output_ptr,
- unsigned int out_pitch,
- unsigned int output_height,
- unsigned int vp9_filter_index);
-
-void vp9_sixtap_predict16x16_ssse3(unsigned char *src_ptr,
- int src_pixels_per_line,
- int xoffset,
- int yoffset,
- unsigned char *dst_ptr,
- int dst_pitch) {
- DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 24 * 24);
-#ifdef ANNOUNCE_FUNCTION
- printf("vp9_sixtap_predict16x16_ssse3\n");
-#endif
-
- if (xoffset) {
- if (yoffset) {
- vp9_filter_block1d16_h6_ssse3(src_ptr - (2 * src_pixels_per_line),
- src_pixels_per_line,
- fdata2, 16, 21, xoffset);
- vp9_filter_block1d16_v6_ssse3(fdata2, 16, dst_ptr, dst_pitch,
- 16, yoffset);
- } else {
- /* First-pass only */
- vp9_filter_block1d16_h6_ssse3(src_ptr, src_pixels_per_line,
- dst_ptr, dst_pitch, 16, xoffset);
- }
- } else {
- /* Second-pass only */
- vp9_filter_block1d16_v6_ssse3(src_ptr - (2 * src_pixels_per_line),
- src_pixels_per_line,
- dst_ptr, dst_pitch, 16, yoffset);
- }
-}
-
-void vp9_sixtap_predict8x8_ssse3(unsigned char *src_ptr,
- int src_pixels_per_line,
- int xoffset,
- int yoffset,
- unsigned char *dst_ptr,
- int dst_pitch) {
- DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 256);
-#ifdef ANNOUNCE_FUNCTION
- printf("vp9_sixtap_predict8x8_ssse3\n");
-#endif
-
- if (xoffset) {
- if (yoffset) {
- vp9_filter_block1d8_h6_ssse3(src_ptr - (2 * src_pixels_per_line),
- src_pixels_per_line, fdata2, 8, 13, xoffset);
- vp9_filter_block1d8_v6_ssse3(fdata2, 8, dst_ptr, dst_pitch, 8, yoffset);
- } else {
- vp9_filter_block1d8_h6_ssse3(src_ptr, src_pixels_per_line,
- dst_ptr, dst_pitch, 8, xoffset);
- }
- } else {
- /* Second-pass only */
- vp9_filter_block1d8_v6_ssse3(src_ptr - (2 * src_pixels_per_line),
- src_pixels_per_line,
- dst_ptr, dst_pitch, 8, yoffset);
- }
-}
-
-void vp9_sixtap_predict8x4_ssse3(unsigned char *src_ptr,
- int src_pixels_per_line,
- int xoffset,
- int yoffset,
- unsigned char *dst_ptr,
- int dst_pitch) {
- DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 256);
-#ifdef ANNOUNCE_FUNCTION
- printf("vp9_sixtap_predict8x4_ssse3\n");
-#endif
-
- if (xoffset) {
- if (yoffset) {
- vp9_filter_block1d8_h6_ssse3(src_ptr - (2 * src_pixels_per_line),
- src_pixels_per_line, fdata2, 8, 9, xoffset);
- vp9_filter_block1d8_v6_ssse3(fdata2, 8, dst_ptr, dst_pitch, 4, yoffset);
- } else {
- /* First-pass only */
- vp9_filter_block1d8_h6_ssse3(src_ptr, src_pixels_per_line,
- dst_ptr, dst_pitch, 4, xoffset);
- }
- } else {
- /* Second-pass only */
- vp9_filter_block1d8_v6_ssse3(src_ptr - (2 * src_pixels_per_line),
- src_pixels_per_line,
- dst_ptr, dst_pitch, 4, yoffset);
- }
-}
-
-void vp9_sixtap_predict4x4_ssse3(unsigned char *src_ptr,
- int src_pixels_per_line,
- int xoffset,
- int yoffset,
- unsigned char *dst_ptr,
- int dst_pitch) {
- DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 4 * 9);
-#ifdef ANNOUNCE_FUNCTION
- printf("vp9_sixtap_predict4x4_ssse3\n");
-#endif
-
- if (xoffset) {
- if (yoffset) {
- vp9_filter_block1d4_h6_ssse3(src_ptr - (2 * src_pixels_per_line),
- src_pixels_per_line, fdata2, 4, 9, xoffset);
- vp9_filter_block1d4_v6_ssse3(fdata2, 4, dst_ptr, dst_pitch, 4, yoffset);
- } else {
- vp9_filter_block1d4_h6_ssse3(src_ptr, src_pixels_per_line,
- dst_ptr, dst_pitch, 4, xoffset);
- }
- } else {
- vp9_filter_block1d4_v6_ssse3(src_ptr - (2 * src_pixels_per_line),
- src_pixels_per_line,
- dst_ptr, dst_pitch, 4, yoffset);
- }
-}
-
-void vp9_filter_block1d16_v8_ssse3(const unsigned char *src_ptr,
- const unsigned int src_pitch,
- unsigned char *output_ptr,
- unsigned int out_pitch,
- unsigned int output_height,
- const short *filter);
-
-void vp9_filter_block1d16_h8_ssse3(const unsigned char *src_ptr,
- const unsigned int src_pitch,
- unsigned char *output_ptr,
- unsigned int out_pitch,
- unsigned int output_height,
- const short *filter);
-
-void vp9_filter_block2d_16x16_8_ssse3(const unsigned char *src_ptr,
- const unsigned int src_stride,
- const short *hfilter_aligned16,
- const short *vfilter_aligned16,
- unsigned char *dst_ptr,
- unsigned int dst_stride) {
- if (hfilter_aligned16[3] != 128 && vfilter_aligned16[3] != 128) {
- DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 23 * 16);
-
- vp9_filter_block1d16_h8_ssse3(src_ptr - (3 * src_stride), src_stride,
- fdata2, 16, 23, hfilter_aligned16);
- vp9_filter_block1d16_v8_ssse3(fdata2, 16, dst_ptr, dst_stride, 16,
- vfilter_aligned16);
- } else {
- if (hfilter_aligned16[3] != 128) {
- vp9_filter_block1d16_h8_ssse3(src_ptr, src_stride, dst_ptr, dst_stride,
- 16, hfilter_aligned16);
- } else {
- vp9_filter_block1d16_v8_ssse3(src_ptr - (3 * src_stride), src_stride,
- dst_ptr, dst_stride, 16, vfilter_aligned16);
- }
- }
-}
-
-void vp9_filter_block1d8_v8_ssse3(const unsigned char *src_ptr,
- const unsigned int src_pitch,
- unsigned char *output_ptr,
- unsigned int out_pitch,
- unsigned int output_height,
- const short *filter);
-
-void vp9_filter_block1d8_h8_ssse3(const unsigned char *src_ptr,
- const unsigned int src_pitch,
- unsigned char *output_ptr,
- unsigned int out_pitch,
- unsigned int output_height,
- const short *filter);
-
-void vp9_filter_block2d_8x8_8_ssse3(const unsigned char *src_ptr,
- const unsigned int src_stride,
- const short *hfilter_aligned16,
- const short *vfilter_aligned16,
- unsigned char *dst_ptr,
- unsigned int dst_stride) {
- if (hfilter_aligned16[3] != 128 && vfilter_aligned16[3] != 128) {
- DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 23 * 16);
-
- vp9_filter_block1d8_h8_ssse3(src_ptr - (3 * src_stride), src_stride,
- fdata2, 16, 15, hfilter_aligned16);
- vp9_filter_block1d8_v8_ssse3(fdata2, 16, dst_ptr, dst_stride, 8,
- vfilter_aligned16);
- } else {
- if (hfilter_aligned16[3] != 128) {
- vp9_filter_block1d8_h8_ssse3(src_ptr, src_stride, dst_ptr, dst_stride, 8,
- hfilter_aligned16);
- } else {
- vp9_filter_block1d8_v8_ssse3(src_ptr - (3 * src_stride), src_stride,
- dst_ptr, dst_stride, 8, vfilter_aligned16);
- }
- }
-}
-
-void vp9_filter_block2d_8x4_8_ssse3(const unsigned char *src_ptr,
- const unsigned int src_stride,
- const short *hfilter_aligned16,
- const short *vfilter_aligned16,
- unsigned char *dst_ptr,
- unsigned int dst_stride) {
- if (hfilter_aligned16[3] !=128 && vfilter_aligned16[3] != 128) {
- DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 23 * 16);
-
- vp9_filter_block1d8_h8_ssse3(src_ptr - (3 * src_stride), src_stride,
- fdata2, 16, 11, hfilter_aligned16);
- vp9_filter_block1d8_v8_ssse3(fdata2, 16, dst_ptr, dst_stride, 4,
- vfilter_aligned16);
- } else {
- if (hfilter_aligned16[3] != 128) {
- vp9_filter_block1d8_h8_ssse3(src_ptr, src_stride, dst_ptr, dst_stride, 4,
- hfilter_aligned16);
- } else {
- vp9_filter_block1d8_v8_ssse3(src_ptr - (3 * src_stride), src_stride,
- dst_ptr, dst_stride, 4, vfilter_aligned16);
- }
- }
-}
-#endif
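
Every stub deleted above has the same shape: when both offsets are fractional, a horizontal six-tap pass fills a 16-bit intermediate buffer five rows taller than the block (the source read starts two rows above it), and a vertical six-tap pass then reduces that buffer to the final 8-bit block; the SSE2/SSSE3 wrappers skip a pass when the corresponding offset is zero, and the 8-tap helpers at the bottom skip one when a filter's center tap is 128 (the pass-through case). A plain-C reference of the two-pass shape is sketched below. The names, the scalar kernel, and the per-pass rounding (+64, >> 7, clamp) are illustrative assumptions, not the deleted SIMD code, which keeps its intermediate in unsigned shorts and uses its own pointer conventions.

/* Hypothetical scalar reference of the two-pass six-tap pattern above. */
static unsigned char clamp255(int v) {
  return (unsigned char)(v < 0 ? 0 : (v > 255 ? 255 : v));
}

/* One 6-tap pass; 'step' is 1 for horizontal filtering and the row stride
 * for vertical filtering. Taps span positions -2 .. +3 around each pixel,
 * so the caller must provide 2 valid samples before and 3 after the block
 * in the filtered direction.                                               */
static void sixtap_pass(const unsigned char *src, int src_stride, int step,
                        unsigned char *dst, int dst_stride,
                        int w, int h, const short k[6]) {
  for (int y = 0; y < h; ++y, src += src_stride, dst += dst_stride) {
    for (int x = 0; x < w; ++x) {
      int sum = 64;                        /* rounding term for >> 7 below  */
      for (int t = 0; t < 6; ++t)
        sum += src[x + (t - 2) * step] * k[t];
      dst[x] = clamp255(sum >> 7);         /* taps assumed to sum to 128    */
    }
  }
}

/* Two-pass driver mirroring the structure of the deleted vp9_sixtap_predict*
 * stubs for block sizes up to 16x16.                                        */
static void sixtap_predict_ref(const unsigned char *src, int src_stride,
                               const short hk[6], const short vk[6],
                               unsigned char *dst, int dst_stride,
                               int w, int h) {
  unsigned char tmp[16 * 21];              /* w x (h + 5) intermediate      */
  /* First pass: start two rows above the block, produce h + 5 rows.        */
  sixtap_pass(src - 2 * src_stride, src_stride, 1, tmp, w, w, h + 5, hk);
  /* Second pass: vertical filter centered on the block's own rows.         */
  sixtap_pass(tmp + 2 * w, w, w, dst, dst_stride, w, h, vk);
}
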
--- a/vp8/common/x86/x86_systemdependent.c
+++ /dev/null
@@ -1,108 +1,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "vpx_config.h"
-#include "vpx_ports/x86.h"
-#include "vp8/common/subpixel.h"
-#include "vp8/common/loopfilter.h"
-#include "vp8/common/idct.h"
-#include "vp8/common/pragmas.h"
-#include "vp8/common/onyxc_int.h"
-
-void vp9_arch_x86_common_init(VP9_COMMON *ctx) {
-#if CONFIG_RUNTIME_CPU_DETECT
- VP9_COMMON_RTCD *rtcd = &ctx->rtcd;
- int flags = x86_simd_caps();
-
- /* Note:
- *
- * This platform can be built without runtime CPU detection as well. If
- * you modify any of the function mappings present in this file, be sure
- * to also update them in static mappings (<arch>/filename_<arch>.h)
- */
-
- /* Override default functions with fastest ones for this CPU. */
-#if HAVE_MMX
-// The commented functions need to be re-written for vpx.
- if (flags & HAS_MMX) {
- rtcd->idct.idct1 = vp9_short_idct4x4llm_1_mmx;
- rtcd->idct.idct16 = vp9_short_idct4x4llm_mmx;
- rtcd->idct.idct1_scalar_add = vp9_dc_only_idct_add_mmx;
- // rtcd->idct.iwalsh16 = vp9_short_inv_walsh4x4_mmx;
- // rtcd->idct.iwalsh1 = vp9_short_inv_walsh4x4_1_mmx;
-
- /* Disabled due to unsupported enhanced interpolation/high_prec mv
- rtcd->subpix.sixtap16x16 = vp9_sixtap_predict16x16_mmx;
- rtcd->subpix.sixtap8x8 = vp9_sixtap_predict8x8_mmx;
- rtcd->subpix.sixtap8x4 = vp9_sixtap_predict8x4_mmx;
- rtcd->subpix.sixtap4x4 = vp9_sixtap_predict4x4_mmx;
- */
- rtcd->subpix.bilinear16x16 = vp9_bilinear_predict16x16_mmx;
- rtcd->subpix.bilinear8x8 = vp9_bilinear_predict8x8_mmx;
- rtcd->subpix.bilinear8x4 = vp9_bilinear_predict8x4_mmx;
- rtcd->subpix.bilinear4x4 = vp9_bilinear_predict4x4_mmx;
-
-#if CONFIG_POSTPROC
- rtcd->postproc.down = vp9_mbpost_proc_down_mmx;
- /*rtcd->postproc.across = vp9_mbpost_proc_across_ip_c;*/
- rtcd->postproc.downacross = vp9_post_proc_down_and_across_mmx;
- rtcd->postproc.addnoise = vp9_plane_add_noise_mmx;
-#endif
- }
-
-#endif
-#if HAVE_SSE2
-
- if (flags & HAS_SSE2) {
-
-
- // rtcd->idct.iwalsh16 = vp9_short_inv_walsh4x4_sse2;
-
- /* Disabled due to unsupported enhanced interpolation/high_prec mv
- rtcd->subpix.sixtap16x16 = vp9_sixtap_predict16x16_sse2;
- rtcd->subpix.sixtap8x8 = vp9_sixtap_predict8x8_sse2;
- rtcd->subpix.sixtap8x4 = vp9_sixtap_predict8x4_sse2;
- */
- rtcd->subpix.bilinear16x16 = vp9_bilinear_predict16x16_sse2;
- rtcd->subpix.bilinear8x8 = vp9_bilinear_predict8x8_sse2;
-
-#if CONFIG_POSTPROC
- rtcd->postproc.down = vp9_mbpost_proc_down_xmm;
- rtcd->postproc.across = vp9_mbpost_proc_across_ip_xmm;
- rtcd->postproc.downacross = vp9_post_proc_down_and_across_xmm;
- rtcd->postproc.addnoise = vp9_plane_add_noise_wmt;
-#endif
- }
-
-#endif
-
-#if HAVE_SSSE3
-
- if (flags & HAS_SSSE3) {
- /* Disabled due to unsupported enhanced interpolation/high_prec mv
- rtcd->subpix.sixtap16x16 = vp9_sixtap_predict16x16_ssse3;
- rtcd->subpix.sixtap8x8 = vp9_sixtap_predict8x8_ssse3;
- rtcd->subpix.sixtap8x4 = vp9_sixtap_predict8x4_ssse3;
- rtcd->subpix.sixtap4x4 = vp9_sixtap_predict4x4_ssse3;
- rtcd->subpix.bilinear16x16 = vp9_bilinear_predict16x16_ssse3;
- rtcd->subpix.bilinear8x8 = vp9_bilinear_predict8x8_ssse3;
- */
-
- /* these are disabled because of unsupported diagonal pred modes
- rtcd->recon.build_intra_predictors_mbuv =
- vp9_build_intra_predictors_mbuv_ssse3;
- rtcd->recon.build_intra_predictors_mbuv_s =
- vp9_build_intra_predictors_mbuv_s_ssse3;
- */
- }
-#endif
-
-#endif
-}
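
The deleted x86_systemdependent.c is the runtime half of the dispatch scheme: vp9_arch_x86_common_init() probes the CPU once through x86_simd_caps() and then overwrites entries of the rtcd function-pointer table with the fastest routine the host supports, leaving the C defaults in place otherwise. A minimal sketch of that pattern follows; the table layout, capability flag, and function names are invented for illustration.

/* Runtime-dispatch sketch; everything here is hypothetical except the idea. */
typedef void (*bilinear_fn)(const unsigned char *src, int src_stride,
                            unsigned char *dst, int dst_stride);

struct subpix_rtcd {
  bilinear_fn bilinear8x8;                 /* one entry per specialized call */
};

static void bilinear8x8_c(const unsigned char *src, int src_stride,
                          unsigned char *dst, int dst_stride) {
  (void)src; (void)src_stride; (void)dst; (void)dst_stride;  /* stub body   */
}

static void bilinear8x8_mmx(const unsigned char *src, int src_stride,
                            unsigned char *dst, int dst_stride) {
  bilinear8x8_c(src, src_stride, dst, dst_stride);  /* stand-in for SIMD    */
}

#define HAS_MMX_CAP 0x01                   /* placeholder capability flag   */

static int simd_caps(void) {
  return 0;                                /* would wrap CPUID in real code */
}

void arch_init(struct subpix_rtcd *rtcd) {
  rtcd->bilinear8x8 = bilinear8x8_c;       /* always start from the C default */
  if (simd_caps() & HAS_MMX_CAP)
    rtcd->bilinear8x8 = bilinear8x8_mmx;   /* override only when supported    */
}
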
--- a/vp8/decoder/arm/armv6/dequant_dc_idct_v6.asm
+++ /dev/null
@@ -1,218 +1,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license and patent
-; grant that can be found in the LICENSE file in the root of the source
-; tree. All contributing project authors may be found in the AUTHORS
-; file in the root of the source tree.
-;
-
-
- EXPORT |vp8_dequant_dc_idct_add_v6|
-
- AREA |.text|, CODE, READONLY
-
-;void vp8_dequant_dc_idct_v6(short *input, short *dq, unsigned char *pred,
-; unsigned char *dest, int pitch, int stride, int Dc)
-; r0 = input
-; r1 = dq
-; r2 = pred
-; r3 = dest
-; sp + 36 = pitch ; +4 = 40
-; sp + 40 = stride ; +4 = 44
-; sp + 44 = Dc ; +4 = 48
-
-
-|vp8_dequant_dc_idct_add_v6| PROC
- stmdb sp!, {r4-r11, lr}
-
- ldr r6, [sp, #44]
-
- ldr r4, [r0] ;input
- ldr r5, [r1], #4 ;dq
-
- sub sp, sp, #4
- str r3, [sp]
-
- smultt r7, r4, r5
-
- ldr r4, [r0, #4] ;input
- ldr r5, [r1], #4 ;dq
-
- strh r6, [r0], #2
- strh r7, [r0], #2
-
- smulbb r6, r4, r5
- smultt r7, r4, r5
-
- ldr r4, [r0, #4] ;input
- ldr r5, [r1], #4 ;dq
-
- strh r6, [r0], #2
- strh r7, [r0], #2
-
- mov r12, #3
-
-vp8_dequant_dc_add_loop
- smulbb r6, r4, r5
- smultt r7, r4, r5
-
- ldr r4, [r0, #4] ;input
- ldr r5, [r1], #4 ;dq
-
- strh r6, [r0], #2
- strh r7, [r0], #2
-
- smulbb r6, r4, r5
- smultt r7, r4, r5
-
- subs r12, r12, #1
-
- ldrne r4, [r0, #4]
- ldrne r5, [r1], #4
-
- strh r6, [r0], #2
- strh r7, [r0], #2
-
- bne vp8_dequant_dc_add_loop
-
- sub r0, r0, #32
- mov r1, r0
-
-; short_idct4x4llm_v6_dual
- ldr r3, cospi8sqrt2minus1
- ldr r4, sinpi8sqrt2
- ldr r6, [r0, #8]
- mov r5, #2
-vp8_dequant_dc_idct_loop1_v6
- ldr r12, [r0, #24]
- ldr r14, [r0, #16]
- smulwt r9, r3, r6
- smulwb r7, r3, r6
- smulwt r10, r4, r6
- smulwb r8, r4, r6
- pkhbt r7, r7, r9, lsl #16
- smulwt r11, r3, r12
- pkhbt r8, r8, r10, lsl #16
- uadd16 r6, r6, r7
- smulwt r7, r4, r12
- smulwb r9, r3, r12
- smulwb r10, r4, r12
- subs r5, r5, #1
- pkhbt r9, r9, r11, lsl #16
- ldr r11, [r0], #4
- pkhbt r10, r10, r7, lsl #16
- uadd16 r7, r12, r9
- usub16 r7, r8, r7
- uadd16 r6, r6, r10
- uadd16 r10, r11, r14
- usub16 r8, r11, r14
- uadd16 r9, r10, r6
- usub16 r10, r10, r6
- uadd16 r6, r8, r7
- usub16 r7, r8, r7
- str r6, [r1, #8]
- ldrne r6, [r0, #8]
- str r7, [r1, #16]
- str r10, [r1, #24]
- str r9, [r1], #4
- bne vp8_dequant_dc_idct_loop1_v6
-
- mov r5, #2
- sub r0, r1, #8
-vp8_dequant_dc_idct_loop2_v6
- ldr r6, [r0], #4
- ldr r7, [r0], #4
- ldr r8, [r0], #4
- ldr r9, [r0], #4
- smulwt r1, r3, r6
- smulwt r12, r4, r6
- smulwt lr, r3, r8
- smulwt r10, r4, r8
- pkhbt r11, r8, r6, lsl #16
- pkhbt r1, lr, r1, lsl #16
- pkhbt r12, r10, r12, lsl #16
- pkhtb r6, r6, r8, asr #16
- uadd16 r6, r1, r6
- pkhbt lr, r9, r7, lsl #16
- uadd16 r10, r11, lr
- usub16 lr, r11, lr
- pkhtb r8, r7, r9, asr #16
- subs r5, r5, #1
- smulwt r1, r3, r8
- smulwb r7, r3, r8
- smulwt r11, r4, r8
- smulwb r9, r4, r8
- pkhbt r1, r7, r1, lsl #16
- uadd16 r8, r1, r8
- pkhbt r11, r9, r11, lsl #16
- usub16 r1, r12, r8
- uadd16 r8, r11, r6
- ldr r9, c0x00040004
- ldr r12, [sp, #40]
- uadd16 r6, r10, r8
- usub16 r7, r10, r8
- uadd16 r7, r7, r9
- uadd16 r6, r6, r9
- uadd16 r10, r14, r1
- usub16 r1, r14, r1
- uadd16 r10, r10, r9
- uadd16 r1, r1, r9
- ldr r11, [r2], r12
- mov r8, r7, asr #3
- pkhtb r9, r8, r10, asr #19
- mov r8, r1, asr #3
- pkhtb r8, r8, r6, asr #19
- uxtb16 lr, r11, ror #8
- qadd16 r9, r9, lr
- uxtb16 lr, r11
- qadd16 r8, r8, lr
- usat16 r9, #8, r9
- usat16 r8, #8, r8
- orr r9, r8, r9, lsl #8
- ldr r11, [r2], r12
- ldr lr, [sp]
- ldr r12, [sp, #44]
- mov r7, r7, lsl #16
- mov r1, r1, lsl #16
- mov r10, r10, lsl #16
- mov r6, r6, lsl #16
- mov r7, r7, asr #3
- pkhtb r7, r7, r10, asr #19
- mov r1, r1, asr #3
- pkhtb r1, r1, r6, asr #19
- uxtb16 r8, r11, ror #8
- qadd16 r7, r7, r8
- uxtb16 r8, r11
- qadd16 r1, r1, r8
- usat16 r7, #8, r7
- usat16 r1, #8, r1
- orr r1, r1, r7, lsl #8
- str r9, [lr], r12
- str r1, [lr], r12
- str lr, [sp]
- bne vp8_dequant_dc_idct_loop2_v6
-
-; vpx_memset
- sub r0, r0, #32
- add sp, sp, #4
-
- mov r12, #0
- str r12, [r0]
- str r12, [r0, #4]
- str r12, [r0, #8]
- str r12, [r0, #12]
- str r12, [r0, #16]
- str r12, [r0, #20]
- str r12, [r0, #24]
- str r12, [r0, #28]
-
- ldmia sp!, {r4 - r11, pc}
- ENDP ; |vp8_dequant_dc_idct_add_v6|
-
-; Constant Pool
-cospi8sqrt2minus1 DCD 0x00004E7B
-sinpi8sqrt2 DCD 0x00008A8C
-c0x00040004 DCD 0x00040004
-
- END
--- a/vp8/decoder/arm/armv6/dequant_idct_v6.asm
+++ /dev/null
@@ -1,196 +1,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license and patent
-; grant that can be found in the LICENSE file in the root of the source
-; tree. All contributing project authors may be found in the AUTHORS
-; file in the root of the source tree.
-;
-
- EXPORT |vp8_dequant_idct_add_v6|
-
- AREA |.text|, CODE, READONLY
-;void vp8_dequant_idct_v6(short *input, short *dq, unsigned char *pred,
-; unsigned char *dest, int pitch, int stride)
-; r0 = input
-; r1 = dq
-; r2 = pred
-; r3 = dest
-; sp + 36 = pitch ; +4 = 40
-; sp + 40 = stride ; +4 = 44
-
-
-|vp8_dequant_idct_add_v6| PROC
- stmdb sp!, {r4-r11, lr}
-
- ldr r4, [r0] ;input
- ldr r5, [r1], #4 ;dq
-
- sub sp, sp, #4
- str r3, [sp]
-
- mov r12, #4
-
-vp8_dequant_add_loop
- smulbb r6, r4, r5
- smultt r7, r4, r5
-
- ldr r4, [r0, #4] ;input
- ldr r5, [r1], #4 ;dq
-
- strh r6, [r0], #2
- strh r7, [r0], #2
-
- smulbb r6, r4, r5
- smultt r7, r4, r5
-
- subs r12, r12, #1
-
- ldrne r4, [r0, #4]
- ldrne r5, [r1], #4
-
- strh r6, [r0], #2
- strh r7, [r0], #2
-
- bne vp8_dequant_add_loop
-
- sub r0, r0, #32
- mov r1, r0
-
-; short_idct4x4llm_v6_dual
- ldr r3, cospi8sqrt2minus1
- ldr r4, sinpi8sqrt2
- ldr r6, [r0, #8]
- mov r5, #2
-vp8_dequant_idct_loop1_v6
- ldr r12, [r0, #24]
- ldr r14, [r0, #16]
- smulwt r9, r3, r6
- smulwb r7, r3, r6
- smulwt r10, r4, r6
- smulwb r8, r4, r6
- pkhbt r7, r7, r9, lsl #16
- smulwt r11, r3, r12
- pkhbt r8, r8, r10, lsl #16
- uadd16 r6, r6, r7
- smulwt r7, r4, r12
- smulwb r9, r3, r12
- smulwb r10, r4, r12
- subs r5, r5, #1
- pkhbt r9, r9, r11, lsl #16
- ldr r11, [r0], #4
- pkhbt r10, r10, r7, lsl #16
- uadd16 r7, r12, r9
- usub16 r7, r8, r7
- uadd16 r6, r6, r10
- uadd16 r10, r11, r14
- usub16 r8, r11, r14
- uadd16 r9, r10, r6
- usub16 r10, r10, r6
- uadd16 r6, r8, r7
- usub16 r7, r8, r7
- str r6, [r1, #8]
- ldrne r6, [r0, #8]
- str r7, [r1, #16]
- str r10, [r1, #24]
- str r9, [r1], #4
- bne vp8_dequant_idct_loop1_v6
-
- mov r5, #2
- sub r0, r1, #8
-vp8_dequant_idct_loop2_v6
- ldr r6, [r0], #4
- ldr r7, [r0], #4
- ldr r8, [r0], #4
- ldr r9, [r0], #4
- smulwt r1, r3, r6
- smulwt r12, r4, r6
- smulwt lr, r3, r8
- smulwt r10, r4, r8
- pkhbt r11, r8, r6, lsl #16
- pkhbt r1, lr, r1, lsl #16
- pkhbt r12, r10, r12, lsl #16
- pkhtb r6, r6, r8, asr #16
- uadd16 r6, r1, r6
- pkhbt lr, r9, r7, lsl #16
- uadd16 r10, r11, lr
- usub16 lr, r11, lr
- pkhtb r8, r7, r9, asr #16
- subs r5, r5, #1
- smulwt r1, r3, r8
- smulwb r7, r3, r8
- smulwt r11, r4, r8
- smulwb r9, r4, r8
- pkhbt r1, r7, r1, lsl #16
- uadd16 r8, r1, r8
- pkhbt r11, r9, r11, lsl #16
- usub16 r1, r12, r8
- uadd16 r8, r11, r6
- ldr r9, c0x00040004
- ldr r12, [sp, #40]
- uadd16 r6, r10, r8
- usub16 r7, r10, r8
- uadd16 r7, r7, r9
- uadd16 r6, r6, r9
- uadd16 r10, r14, r1
- usub16 r1, r14, r1
- uadd16 r10, r10, r9
- uadd16 r1, r1, r9
- ldr r11, [r2], r12
- mov r8, r7, asr #3
- pkhtb r9, r8, r10, asr #19
- mov r8, r1, asr #3
- pkhtb r8, r8, r6, asr #19
- uxtb16 lr, r11, ror #8
- qadd16 r9, r9, lr
- uxtb16 lr, r11
- qadd16 r8, r8, lr
- usat16 r9, #8, r9
- usat16 r8, #8, r8
- orr r9, r8, r9, lsl #8
- ldr r11, [r2], r12
- ldr lr, [sp]
- ldr r12, [sp, #44]
- mov r7, r7, lsl #16
- mov r1, r1, lsl #16
- mov r10, r10, lsl #16
- mov r6, r6, lsl #16
- mov r7, r7, asr #3
- pkhtb r7, r7, r10, asr #19
- mov r1, r1, asr #3
- pkhtb r1, r1, r6, asr #19
- uxtb16 r8, r11, ror #8
- qadd16 r7, r7, r8
- uxtb16 r8, r11
- qadd16 r1, r1, r8
- usat16 r7, #8, r7
- usat16 r1, #8, r1
- orr r1, r1, r7, lsl #8
- str r9, [lr], r12
- str r1, [lr], r12
- str lr, [sp]
- bne vp8_dequant_idct_loop2_v6
-
-; vpx_memset
- sub r0, r0, #32
- add sp, sp, #4
-
- mov r12, #0
- str r12, [r0]
- str r12, [r0, #4]
- str r12, [r0, #8]
- str r12, [r0, #12]
- str r12, [r0, #16]
- str r12, [r0, #20]
- str r12, [r0, #24]
- str r12, [r0, #28]
-
- ldmia sp!, {r4 - r11, pc}
- ENDP ; |vp8_dequant_idct_add_v6|
-
-; Constant Pool
-cospi8sqrt2minus1 DCD 0x00004E7B
-sinpi8sqrt2 DCD 0x00008A8C
-c0x00040004 DCD 0x00040004
-
- END
--- a/vp8/decoder/arm/armv6/dequantize_v6.asm
+++ /dev/null
@@ -1,69 +1,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
- EXPORT |vp8_dequantize_b_loop_v6|
-
- AREA |.text|, CODE, READONLY ; name this block of code
-;-------------------------------
-;void vp8_dequantize_b_loop_v6(short *Q, short *DQC, short *DQ);
-; r0 short *Q,
-; r1 short *DQC
-; r2 short *DQ
-|vp8_dequantize_b_loop_v6| PROC
- stmdb sp!, {r4-r9, lr}
-
- ldr r3, [r0] ;load Q
- ldr r4, [r1] ;load DQC
- ldr r5, [r0, #4]
- ldr r6, [r1, #4]
-
- mov r12, #2 ;loop counter
-
-dequant_loop
- smulbb r7, r3, r4 ;multiply
- smultt r8, r3, r4
- smulbb r9, r5, r6
- smultt lr, r5, r6
-
- ldr r3, [r0, #8]
- ldr r4, [r1, #8]
- ldr r5, [r0, #12]
- ldr r6, [r1, #12]
-
- strh r7, [r2], #2 ;store result
- smulbb r7, r3, r4 ;multiply
- strh r8, [r2], #2
- smultt r8, r3, r4
- strh r9, [r2], #2
- smulbb r9, r5, r6
- strh lr, [r2], #2
- smultt lr, r5, r6
-
- subs r12, r12, #1
-
- add r0, r0, #16
- add r1, r1, #16
-
- ldrne r3, [r0]
- strh r7, [r2], #2 ;store result
- ldrne r4, [r1]
- strh r8, [r2], #2
- ldrne r5, [r0, #4]
- strh r9, [r2], #2
- ldrne r6, [r1, #4]
- strh lr, [r2], #2
-
- bne dequant_loop
-
- ldmia sp!, {r4-r9, pc}
- ENDP ;|vp8_dequantize_b_loop_v6|
-
- END
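
The ARMv6 loop above (and the NEON loop plus C wrappers removed further down) all perform the same operation: multiply each of the 16 quantized coefficients of a 4x4 block by its per-position dequantization factor. A one-loop C equivalent with the same argument order as |vp8_dequantize_b_loop_v6|, for reference:

/* Scalar equivalent of the dequantization loop; illustrative only. */
void dequantize_b_ref(const short *Q, const short *DQC, short *DQ) {
  for (int i = 0; i < 16; ++i)             /* 4x4 block = 16 coefficients */
    DQ[i] = (short)(Q[i] * DQC[i]);
}
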
--- a/vp8/decoder/arm/armv6/idct_blk_v6.c
+++ /dev/null
@@ -1,136 +1,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "vpx_ports/config.h"
-#include "vp8/common/idct.h"
-#include "vp8/decoder/dequantize.h"
-
-void vp8_dequant_dc_idct_add_y_block_v6
-(short *q, short *dq, unsigned char *pre,
- unsigned char *dst, int stride, char *eobs, short *dc) {
- int i;
-
- for (i = 0; i < 4; i++) {
- if (eobs[0] > 1)
- vp8_dequant_dc_idct_add_v6(q, dq, pre, dst, 16, stride, dc[0]);
- else
- vp8_dc_only_idct_add_v6(dc[0], pre, dst, 16, stride);
-
- if (eobs[1] > 1)
- vp8_dequant_dc_idct_add_v6(q + 16, dq, pre + 4, dst + 4, 16, stride, dc[1]);
- else
- vp8_dc_only_idct_add_v6(dc[1], pre + 4, dst + 4, 16, stride);
-
- if (eobs[2] > 1)
- vp8_dequant_dc_idct_add_v6(q + 32, dq, pre + 8, dst + 8, 16, stride, dc[2]);
- else
- vp8_dc_only_idct_add_v6(dc[2], pre + 8, dst + 8, 16, stride);
-
- if (eobs[3] > 1)
- vp8_dequant_dc_idct_add_v6(q + 48, dq, pre + 12, dst + 12, 16, stride, dc[3]);
- else
- vp8_dc_only_idct_add_v6(dc[3], pre + 12, dst + 12, 16, stride);
-
- q += 64;
- dc += 4;
- pre += 64;
- dst += 4 * stride;
- eobs += 4;
- }
-}
-
-void vp8_dequant_idct_add_y_block_v6
-(short *q, short *dq, unsigned char *pre,
- unsigned char *dst, int stride, char *eobs) {
- int i;
-
- for (i = 0; i < 4; i++) {
- if (eobs[0] > 1)
- vp8_dequant_idct_add_v6(q, dq, pre, dst, 16, stride);
- else {
- vp8_dc_only_idct_add_v6(q[0]*dq[0], pre, dst, 16, stride);
- ((int *)q)[0] = 0;
- }
-
- if (eobs[1] > 1)
- vp8_dequant_idct_add_v6(q + 16, dq, pre + 4, dst + 4, 16, stride);
- else {
- vp8_dc_only_idct_add_v6(q[16]*dq[0], pre + 4, dst + 4, 16, stride);
- ((int *)(q + 16))[0] = 0;
- }
-
- if (eobs[2] > 1)
- vp8_dequant_idct_add_v6(q + 32, dq, pre + 8, dst + 8, 16, stride);
- else {
- vp8_dc_only_idct_add_v6(q[32]*dq[0], pre + 8, dst + 8, 16, stride);
- ((int *)(q + 32))[0] = 0;
- }
-
- if (eobs[3] > 1)
- vp8_dequant_idct_add_v6(q + 48, dq, pre + 12, dst + 12, 16, stride);
- else {
- vp8_dc_only_idct_add_v6(q[48]*dq[0], pre + 12, dst + 12, 16, stride);
- ((int *)(q + 48))[0] = 0;
- }
-
- q += 64;
- pre += 64;
- dst += 4 * stride;
- eobs += 4;
- }
-}
-
-void vp8_dequant_idct_add_uv_block_v6
-(short *q, short *dq, unsigned char *pre,
- unsigned char *dstu, unsigned char *dstv, int stride, char *eobs) {
- int i;
-
- for (i = 0; i < 2; i++) {
- if (eobs[0] > 1)
- vp8_dequant_idct_add_v6(q, dq, pre, dstu, 8, stride);
- else {
- vp8_dc_only_idct_add_v6(q[0]*dq[0], pre, dstu, 8, stride);
- ((int *)q)[0] = 0;
- }
-
- if (eobs[1] > 1)
- vp8_dequant_idct_add_v6(q + 16, dq, pre + 4, dstu + 4, 8, stride);
- else {
- vp8_dc_only_idct_add_v6(q[16]*dq[0], pre + 4, dstu + 4, 8, stride);
- ((int *)(q + 16))[0] = 0;
- }
-
- q += 32;
- pre += 32;
- dstu += 4 * stride;
- eobs += 2;
- }
-
- for (i = 0; i < 2; i++) {
- if (eobs[0] > 1)
- vp8_dequant_idct_add_v6(q, dq, pre, dstv, 8, stride);
- else {
- vp8_dc_only_idct_add_v6(q[0]*dq[0], pre, dstv, 8, stride);
- ((int *)q)[0] = 0;
- }
-
- if (eobs[1] > 1)
- vp8_dequant_idct_add_v6(q + 16, dq, pre + 4, dstv + 4, 8, stride);
- else {
- vp8_dc_only_idct_add_v6(q[16]*dq[0], pre + 4, dstv + 4, 8, stride);
- ((int *)(q + 16))[0] = 0;
- }
-
- q += 32;
- pre += 32;
- dstv += 4 * stride;
- eobs += 2;
- }
-}
--- a/vp8/decoder/arm/dequantize_arm.c
+++ /dev/null
@@ -1,44 +1,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include "vpx_ports/config.h"
-#include "vp8/decoder/dequantize.h"
-#include "vp8/common/idct.h"
-#include "vpx_mem/vpx_mem.h"
-
-#if HAVE_ARMV7
-extern void vp9_dequantize_b_loop_neon(short *Q, short *DQC, short *DQ);
-#endif
-
-#if HAVE_ARMV6
-extern void vp9_dequantize_b_loop_v6(short *Q, short *DQC, short *DQ);
-#endif
-
-#if HAVE_ARMV7
-
-void vp9_dequantize_b_neon(BLOCKD *d) {
- short *DQ = d->dqcoeff;
- short *Q = d->qcoeff;
- short *DQC = d->dequant;
-
- vp9_dequantize_b_loop_neon(Q, DQC, DQ);
-}
-#endif
-
-#if HAVE_ARMV6
-void vp9_dequantize_b_v6(BLOCKD *d) {
- short *DQ = d->dqcoeff;
- short *Q = d->qcoeff;
- short *DQC = d->dequant;
-
- vp9_dequantize_b_loop_v6(Q, DQC, DQ);
-}
-#endif
--- a/vp8/decoder/arm/neon/dequant_idct_neon.asm
+++ /dev/null
@@ -1,129 +1,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
- EXPORT |vp8_dequant_idct_add_neon|
- ARM
- REQUIRE8
- PRESERVE8
-
- AREA ||.text||, CODE, READONLY, ALIGN=2
-;void vp8_dequant_idct_neon(short *input, short *dq, unsigned char *pred,
-; unsigned char *dest, int pitch, int stride)
-; r0 short *input,
-; r1 short *dq,
-; r2 unsigned char *pred
-; r3 unsigned char *dest
-; sp int pitch
-; sp+4 int stride
-
-|vp8_dequant_idct_add_neon| PROC
- vld1.16 {q3, q4}, [r0]
- vld1.16 {q5, q6}, [r1]
- ldr r1, [sp] ; pitch
- vld1.32 {d14[0]}, [r2], r1
- vld1.32 {d14[1]}, [r2], r1
- vld1.32 {d15[0]}, [r2], r1
- vld1.32 {d15[1]}, [r2]
-
- ldr r1, [sp, #4] ; stride
-
- adr r12, cospi8sqrt2minus1 ; pointer to the first constant
-
- vmul.i16 q1, q3, q5 ;input for short_idct4x4llm_neon
- vmul.i16 q2, q4, q6
-
-;|short_idct4x4llm_neon| PROC
- vld1.16 {d0}, [r12]
- vswp d3, d4 ;q2(vp[4] vp[12])
-
- vqdmulh.s16 q3, q2, d0[2]
- vqdmulh.s16 q4, q2, d0[0]
-
- vqadd.s16 d12, d2, d3 ;a1
- vqsub.s16 d13, d2, d3 ;b1
-
- vshr.s16 q3, q3, #1
- vshr.s16 q4, q4, #1
-
- vqadd.s16 q3, q3, q2
- vqadd.s16 q4, q4, q2
-
- vqsub.s16 d10, d6, d9 ;c1
- vqadd.s16 d11, d7, d8 ;d1
-
- vqadd.s16 d2, d12, d11
- vqadd.s16 d3, d13, d10
- vqsub.s16 d4, d13, d10
- vqsub.s16 d5, d12, d11
-
- vtrn.32 d2, d4
- vtrn.32 d3, d5
- vtrn.16 d2, d3
- vtrn.16 d4, d5
-
-; memset(input, 0, 32) -- 32bytes
- vmov.i16 q14, #0
-
- vswp d3, d4
- vqdmulh.s16 q3, q2, d0[2]
- vqdmulh.s16 q4, q2, d0[0]
-
- vqadd.s16 d12, d2, d3 ;a1
- vqsub.s16 d13, d2, d3 ;b1
-
- vmov q15, q14
-
- vshr.s16 q3, q3, #1
- vshr.s16 q4, q4, #1
-
- vqadd.s16 q3, q3, q2
- vqadd.s16 q4, q4, q2
-
- vqsub.s16 d10, d6, d9 ;c1
- vqadd.s16 d11, d7, d8 ;d1
-
- vqadd.s16 d2, d12, d11
- vqadd.s16 d3, d13, d10
- vqsub.s16 d4, d13, d10
- vqsub.s16 d5, d12, d11
-
- vst1.16 {q14, q15}, [r0]
-
- vrshr.s16 d2, d2, #3
- vrshr.s16 d3, d3, #3
- vrshr.s16 d4, d4, #3
- vrshr.s16 d5, d5, #3
-
- vtrn.32 d2, d4
- vtrn.32 d3, d5
- vtrn.16 d2, d3
- vtrn.16 d4, d5
-
- vaddw.u8 q1, q1, d14
- vaddw.u8 q2, q2, d15
-
- vqmovun.s16 d0, q1
- vqmovun.s16 d1, q2
-
- vst1.32 {d0[0]}, [r3], r1
- vst1.32 {d0[1]}, [r3], r1
- vst1.32 {d1[0]}, [r3], r1
- vst1.32 {d1[1]}, [r3]
-
- bx lr
-
- ENDP ; |vp8_dequant_idct_add_neon|
-
-; Constant Pool
-cospi8sqrt2minus1 DCD 0x4e7b4e7b
-sinpi8sqrt2 DCD 0x8a8c8a8c
-
- END
--- a/vp8/decoder/arm/neon/dequantizeb_neon.asm
+++ /dev/null
@@ -1,34 +1,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
- EXPORT |vp8_dequantize_b_loop_neon|
- ARM
- REQUIRE8
- PRESERVE8
-
- AREA ||.text||, CODE, READONLY, ALIGN=2
-; r0 short *Q,
-; r1 short *DQC
-; r2 short *DQ
-|vp8_dequantize_b_loop_neon| PROC
- vld1.16 {q0, q1}, [r0]
- vld1.16 {q2, q3}, [r1]
-
- vmul.i16 q4, q0, q2
- vmul.i16 q5, q1, q3
-
- vst1.16 {q4, q5}, [r2]
-
- bx lr
-
- ENDP
-
- END
--- a/vp8/decoder/arm/neon/idct_blk_neon.c
+++ /dev/null
@@ -1,110 +1,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "vpx_ports/config.h"
-#include "vp8/common/idct.h"
-#include "vp8/decoder/dequantize.h"
-
-/* place these declarations here because we don't want to maintain them
- * outside of this scope
- */
-void idct_dequant_dc_full_2x_neon
-(short *input, short *dq, unsigned char *pre, unsigned char *dst,
- int stride, short *dc);
-void idct_dequant_dc_0_2x_neon
-(short *dc, unsigned char *pre, unsigned char *dst, int stride);
-void idct_dequant_full_2x_neon
-(short *q, short *dq, unsigned char *pre, unsigned char *dst,
- int pitch, int stride);
-void idct_dequant_0_2x_neon
-(short *q, short dq, unsigned char *pre, int pitch,
- unsigned char *dst, int stride);
-
-void vp8_dequant_dc_idct_add_y_block_neon
-(short *q, short *dq, unsigned char *pre,
- unsigned char *dst, int stride, char *eobs, short *dc) {
- int i;
-
- for (i = 0; i < 4; i++) {
- if (((short *)eobs)[0] & 0xfefe)
- idct_dequant_dc_full_2x_neon(q, dq, pre, dst, stride, dc);
- else
- idct_dequant_dc_0_2x_neon(dc, pre, dst, stride);
-
- if (((short *)eobs)[1] & 0xfefe)
- idct_dequant_dc_full_2x_neon(q + 32, dq, pre + 8, dst + 8, stride, dc + 2);
- else
- idct_dequant_dc_0_2x_neon(dc + 2, pre + 8, dst + 8, stride);
-
- q += 64;
- dc += 4;
- pre += 64;
- dst += 4 * stride;
- eobs += 4;
- }
-}
-
-void vp8_dequant_idct_add_y_block_neon
-(short *q, short *dq, unsigned char *pre,
- unsigned char *dst, int stride, char *eobs) {
- int i;
-
- for (i = 0; i < 4; i++) {
- if (((short *)eobs)[0] & 0xfefe)
- idct_dequant_full_2x_neon(q, dq, pre, dst, 16, stride);
- else
- idct_dequant_0_2x_neon(q, dq[0], pre, 16, dst, stride);
-
- if (((short *)eobs)[1] & 0xfefe)
- idct_dequant_full_2x_neon(q + 32, dq, pre + 8, dst + 8, 16, stride);
- else
- idct_dequant_0_2x_neon(q + 32, dq[0], pre + 8, 16, dst + 8, stride);
-
- q += 64;
- pre += 64;
- dst += 4 * stride;
- eobs += 4;
- }
-}
-
-void vp8_dequant_idct_add_uv_block_neon
-(short *q, short *dq, unsigned char *pre,
- unsigned char *dstu, unsigned char *dstv, int stride, char *eobs) {
- if (((short *)eobs)[0] & 0xfefe)
- idct_dequant_full_2x_neon(q, dq, pre, dstu, 8, stride);
- else
- idct_dequant_0_2x_neon(q, dq[0], pre, 8, dstu, stride);
-
- q += 32;
- pre += 32;
- dstu += 4 * stride;
-
- if (((short *)eobs)[1] & 0xfefe)
- idct_dequant_full_2x_neon(q, dq, pre, dstu, 8, stride);
- else
- idct_dequant_0_2x_neon(q, dq[0], pre, 8, dstu, stride);
-
- q += 32;
- pre += 32;
-
- if (((short *)eobs)[2] & 0xfefe)
- idct_dequant_full_2x_neon(q, dq, pre, dstv, 8, stride);
- else
- idct_dequant_0_2x_neon(q, dq[0], pre, 8, dstv, stride);
-
- q += 32;
- pre += 32;
- dstv += 4 * stride;
-
- if (((short *)eobs)[3] & 0xfefe)
- idct_dequant_full_2x_neon(q, dq, pre, dstv, 8, stride);
- else
- idct_dequant_0_2x_neon(q, dq[0], pre, 8, dstv, stride);
-}
--- a/vp8/decoder/arm/neon/idct_dequant_0_2x_neon.asm
+++ /dev/null
@@ -1,79 +1,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license and patent
-; grant that can be found in the LICENSE file in the root of the source
-; tree. All contributing project authors may be found in the AUTHORS
-; file in the root of the source tree.
-;
-
-
- EXPORT |idct_dequant_0_2x_neon|
- ARM
- REQUIRE8
- PRESERVE8
-
- AREA ||.text||, CODE, READONLY, ALIGN=2
-;void idct_dequant_0_2x_neon(short *q, short dq, unsigned char *pre,
-; int pitch, unsigned char *dst, int stride);
-; r0 *q
-; r1 dq
-; r2 *pre
-; r3 pitch
-; sp *dst
-; sp+4 stride
-|idct_dequant_0_2x_neon| PROC
- add r12, r2, #4
- vld1.32 {d2[0]}, [r2], r3
- vld1.32 {d2[1]}, [r2], r3
- vld1.32 {d4[0]}, [r2], r3
- vld1.32 {d4[1]}, [r2]
- vld1.32 {d8[0]}, [r12], r3
- vld1.32 {d8[1]}, [r12], r3
- vld1.32 {d10[0]}, [r12], r3
- vld1.32 {d10[1]}, [r12]
-
- ldrh r12, [r0] ; lo q
- ldrh r2, [r0, #32] ; hi q
- mov r3, #0
- strh r3, [r0]
- strh r3, [r0, #32]
-
- sxth r12, r12 ; lo
- mul r0, r12, r1
- add r0, r0, #4
- asr r0, r0, #3
- vdup.16 q0, r0
- sxth r2, r2 ; hi
- mul r0, r2, r1
- add r0, r0, #4
- asr r0, r0, #3
- vdup.16 q3, r0
-
- vaddw.u8 q1, q0, d2 ; lo
- vaddw.u8 q2, q0, d4
- vaddw.u8 q4, q3, d8 ; hi
- vaddw.u8 q5, q3, d10
-
- ldr r2, [sp] ; dst
- ldr r3, [sp, #4] ; stride
-
- vqmovun.s16 d2, q1 ; lo
- vqmovun.s16 d4, q2
- vqmovun.s16 d8, q4 ; hi
- vqmovun.s16 d10, q5
-
- add r0, r2, #4
- vst1.32 {d2[0]}, [r2], r3 ; lo
- vst1.32 {d2[1]}, [r2], r3
- vst1.32 {d4[0]}, [r2], r3
- vst1.32 {d4[1]}, [r2]
- vst1.32 {d8[0]}, [r0], r3 ; hi
- vst1.32 {d8[1]}, [r0], r3
- vst1.32 {d10[0]}, [r0], r3
- vst1.32 {d10[1]}, [r0]
-
- bx lr
-
- ENDP ; |idct_dequant_0_2x_neon|
- END
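
The routine above is the DC-only fast path: when a 4x4 block carries nothing but its DC coefficient, dequantization and the inverse transform collapse to one rounded constant, (q[0] * dq + 4) >> 3, added to every predictor pixel, and the coefficient is cleared for the next block. The NEON code handles two horizontally adjacent blocks per call; a single-block scalar sketch with hypothetical names:

/* DC-only 4x4 reconstruction, scalar sketch of the NEON fast path above. */
static unsigned char clip255(int v) {
  return (unsigned char)(v < 0 ? 0 : (v > 255 ? 255 : v));
}

void idct_dequant_dc_only_ref(short *q, short dq,
                              const unsigned char *pred, int pitch,
                              unsigned char *dst, int stride) {
  int dc = (q[0] * dq + 4) >> 3;           /* dequantize, then IDCT rounding */
  q[0] = 0;                                /* real code also clears the input */
  for (int y = 0; y < 4; ++y)
    for (int x = 0; x < 4; ++x)
      dst[y * stride + x] = clip255(pred[y * pitch + x] + dc);
}
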
--- a/vp8/decoder/arm/neon/idct_dequant_dc_0_2x_neon.asm
+++ /dev/null
@@ -1,69 +1,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license and patent
-; grant that can be found in the LICENSE file in the root of the source
-; tree. All contributing project authors may be found in the AUTHORS
-; file in the root of the source tree.
-;
-
-
- EXPORT |idct_dequant_dc_0_2x_neon|
- ARM
- REQUIRE8
- PRESERVE8
-
- AREA ||.text||, CODE, READONLY, ALIGN=2
-;void idct_dequant_dc_0_2x_neon(short *dc, unsigned char *pre,
-; unsigned char *dst, int stride);
-; r0 *dc
-; r1 *pre
-; r2 *dst
-; r3 stride
-|idct_dequant_dc_0_2x_neon| PROC
- ldr r0, [r0] ; *dc
- mov r12, #16
-
- vld1.32 {d2[0]}, [r1], r12 ; lo
- vld1.32 {d2[1]}, [r1], r12
- vld1.32 {d4[0]}, [r1], r12
- vld1.32 {d4[1]}, [r1]
- sub r1, r1, #44
- vld1.32 {d8[0]}, [r1], r12 ; hi
- vld1.32 {d8[1]}, [r1], r12
- vld1.32 {d10[0]}, [r1], r12
- vld1.32 {d10[1]}, [r1]
-
- sxth r1, r0 ; lo *dc
- add r1, r1, #4
- asr r1, r1, #3
- vdup.16 q0, r1
- sxth r0, r0, ror #16 ; hi *dc
- add r0, r0, #4
- asr r0, r0, #3
- vdup.16 q3, r0
-
- vaddw.u8 q1, q0, d2 ; lo
- vaddw.u8 q2, q0, d4
- vaddw.u8 q4, q3, d8 ; hi
- vaddw.u8 q5, q3, d10
-
- vqmovun.s16 d2, q1 ; lo
- vqmovun.s16 d4, q2
- vqmovun.s16 d8, q4 ; hi
- vqmovun.s16 d10, q5
-
- add r0, r2, #4
- vst1.32 {d2[0]}, [r2], r3 ; lo
- vst1.32 {d2[1]}, [r2], r3
- vst1.32 {d4[0]}, [r2], r3
- vst1.32 {d4[1]}, [r2]
- vst1.32 {d8[0]}, [r0], r3 ; hi
- vst1.32 {d8[1]}, [r0], r3
- vst1.32 {d10[0]}, [r0], r3
- vst1.32 {d10[1]}, [r0]
-
- bx lr
-
- ENDP ;|idct_dequant_dc_0_2x_neon|
- END
--- a/vp8/decoder/arm/neon/idct_dequant_dc_full_2x_neon.asm
+++ /dev/null
@@ -1,205 +1,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
- EXPORT |idct_dequant_dc_full_2x_neon|
- ARM
- REQUIRE8
- PRESERVE8
-
- AREA ||.text||, CODE, READONLY, ALIGN=2
-;void idct_dequant_dc_full_2x_neon(short *q, short *dq, unsigned char *pre,
-; unsigned char *dst, int stride, short *dc);
-; r0 *q,
-; r1 *dq,
-; r2 *pre
-; r3 *dst
-; sp stride
-; sp+4 *dc
-|idct_dequant_dc_full_2x_neon| PROC
- vld1.16 {q0, q1}, [r1] ; dq (same l/r)
- vld1.16 {q2, q3}, [r0] ; l q
- mov r1, #16 ; pitch
- add r0, r0, #32
- vld1.16 {q4, q5}, [r0] ; r q
- add r12, r2, #4
- ; interleave the predictors
- vld1.32 {d28[0]}, [r2], r1 ; l pre
- vld1.32 {d28[1]}, [r12], r1 ; r pre
- vld1.32 {d29[0]}, [r2], r1
- vld1.32 {d29[1]}, [r12], r1
- vld1.32 {d30[0]}, [r2], r1
- vld1.32 {d30[1]}, [r12], r1
- vld1.32 {d31[0]}, [r2]
- ldr r1, [sp, #4]
- vld1.32 {d31[1]}, [r12]
-
- adr r2, cospi8sqrt2minus1 ; pointer to the first constant
-
- ldrh r12, [r1], #2 ; lo *dc
- ldrh r1, [r1] ; hi *dc
-
- ; dequant: q[i] = q[i] * dq[i]
- vmul.i16 q2, q2, q0
- vmul.i16 q3, q3, q1
- vmul.i16 q4, q4, q0
- vmul.i16 q5, q5, q1
-
- ; move dc up to neon and overwrite first element
- vmov.16 d4[0], r12
- vmov.16 d8[0], r1
-
- vld1.16 {d0}, [r2]
-
- ; q2: l0r0 q3: l8r8
- ; q4: l4r4 q5: l12r12
- vswp d5, d8
- vswp d7, d10
-
- ; _CONSTANTS_ * 4,12 >> 16
- ; q6: 4 * sinpi : c1/temp1
- ; q7: 12 * sinpi : d1/temp2
- ; q8: 4 * cospi
- ; q9: 12 * cospi
- vqdmulh.s16 q6, q4, d0[2] ; sinpi8sqrt2
- vqdmulh.s16 q7, q5, d0[2]
- vqdmulh.s16 q8, q4, d0[0] ; cospi8sqrt2minus1
- vqdmulh.s16 q9, q5, d0[0]
-
- vqadd.s16 q10, q2, q3 ; a1 = 0 + 8
- vqsub.s16 q11, q2, q3 ; b1 = 0 - 8
-
- ; vqdmulh only accepts signed values. this was a problem because
- ; our constant had the high bit set, and was treated as a negative value.
- ; vqdmulh also doubles the value before it shifts by 16. we need to
- ; compensate for this. in the case of sinpi8sqrt2, the lowest bit is 0,
- ; so we can shift the constant without losing precision. this avoids
- ; shifting again afterward, and also avoids the sign issue. win win!
- ; for cospi8sqrt2minus1 the lowest bit is 1, so we lose precision if we
- ; pre-shift it
- vshr.s16 q8, q8, #1
- vshr.s16 q9, q9, #1
-
- ; q4: 4 + 4 * cospi : d1/temp1
- ; q5: 12 + 12 * cospi : c1/temp2
- vqadd.s16 q4, q4, q8
- vqadd.s16 q5, q5, q9
-
- ; c1 = temp1 - temp2
- ; d1 = temp1 + temp2
- vqsub.s16 q2, q6, q5
- vqadd.s16 q3, q4, q7
-
- ; [0]: a1+d1
- ; [1]: b1+c1
- ; [2]: b1-c1
- ; [3]: a1-d1
- vqadd.s16 q4, q10, q3
- vqadd.s16 q5, q11, q2
- vqsub.s16 q6, q11, q2
- vqsub.s16 q7, q10, q3
-
- ; rotate
- vtrn.32 q4, q6
- vtrn.32 q5, q7
- vtrn.16 q4, q5
- vtrn.16 q6, q7
- ; idct loop 2
- ; q4: l 0, 4, 8,12 r 0, 4, 8,12
- ; q5: l 1, 5, 9,13 r 1, 5, 9,13
- ; q6: l 2, 6,10,14 r 2, 6,10,14
- ; q7: l 3, 7,11,15 r 3, 7,11,15
-
- ; q8: 1 * sinpi : c1/temp1
- ; q9: 3 * sinpi : d1/temp2
- ; q10: 1 * cospi
- ; q11: 3 * cospi
- vqdmulh.s16 q8, q5, d0[2] ; sinpi8sqrt2
- vqdmulh.s16 q9, q7, d0[2]
- vqdmulh.s16 q10, q5, d0[0] ; cospi8sqrt2minus1
- vqdmulh.s16 q11, q7, d0[0]
-
- vqadd.s16 q2, q4, q6 ; a1 = 0 + 2
- vqsub.s16 q3, q4, q6 ; b1 = 0 - 2
-
- ; see note on shifting above
- vshr.s16 q10, q10, #1
- vshr.s16 q11, q11, #1
-
- ; q10: 1 + 1 * cospi : d1/temp1
- ; q11: 3 + 3 * cospi : c1/temp2
- vqadd.s16 q10, q5, q10
- vqadd.s16 q11, q7, q11
-
- ; q8: c1 = temp1 - temp2
- ; q9: d1 = temp1 + temp2
- vqsub.s16 q8, q8, q11
- vqadd.s16 q9, q10, q9
-
- ; a1+d1
- ; b1+c1
- ; b1-c1
- ; a1-d1
- vqadd.s16 q4, q2, q9
- vqadd.s16 q5, q3, q8
- vqsub.s16 q6, q3, q8
- vqsub.s16 q7, q2, q9
-
- ; +4 >> 3 (rounding)
- vrshr.s16 q4, q4, #3 ; lo
- vrshr.s16 q5, q5, #3
- vrshr.s16 q6, q6, #3 ; hi
- vrshr.s16 q7, q7, #3
-
- vtrn.32 q4, q6
- vtrn.32 q5, q7
- vtrn.16 q4, q5
- vtrn.16 q6, q7
-
- ; adding pre
- ; input is still packed. pre was read interleaved
- vaddw.u8 q4, q4, d28
- vaddw.u8 q5, q5, d29
- vaddw.u8 q6, q6, d30
- vaddw.u8 q7, q7, d31
-
- vmov.i16 q14, #0
- vmov q15, q14
- vst1.16 {q14, q15}, [r0] ; write over high input
- sub r0, r0, #32
- vst1.16 {q14, q15}, [r0] ; write over low input
-
- ;saturate and narrow
- vqmovun.s16 d0, q4 ; lo
- vqmovun.s16 d1, q5
- vqmovun.s16 d2, q6 ; hi
- vqmovun.s16 d3, q7
-
- ldr r1, [sp] ; stride
- add r2, r3, #4 ; hi
- vst1.32 {d0[0]}, [r3], r1 ; lo
- vst1.32 {d0[1]}, [r2], r1 ; hi
- vst1.32 {d1[0]}, [r3], r1
- vst1.32 {d1[1]}, [r2], r1
- vst1.32 {d2[0]}, [r3], r1
- vst1.32 {d2[1]}, [r2], r1
- vst1.32 {d3[0]}, [r3]
- vst1.32 {d3[1]}, [r2]
-
- bx lr
-
- ENDP ; |idct_dequant_dc_full_2x_neon|
-
-; Constant Pool
-cospi8sqrt2minus1 DCD 0x4e7b
-; because the lowest bit in 0x8a8c is 0, we can pre-shift this
-sinpi8sqrt2 DCD 0x4546
-
- END
--- a/vp8/decoder/arm/neon/idct_dequant_full_2x_neon.asm
+++ /dev/null
@@ -1,197 +1,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
- EXPORT |idct_dequant_full_2x_neon|
- ARM
- REQUIRE8
- PRESERVE8
-
- AREA ||.text||, CODE, READONLY, ALIGN=2
-;void idct_dequant_full_2x_neon(short *q, short *dq, unsigned char *pre,
-; unsigned char *dst, int pitch, int stride);
-; r0 *q,
-; r1 *dq,
-; r2 *pre
-; r3 *dst
-; sp pitch
-; sp+4 stride
-|idct_dequant_full_2x_neon| PROC
- vld1.16 {q0, q1}, [r1] ; dq (same l/r)
- vld1.16 {q2, q3}, [r0] ; l q
- ldr r1, [sp] ; pitch
- add r0, r0, #32
- vld1.16 {q4, q5}, [r0] ; r q
- add r12, r2, #4
- ; interleave the predictors
- vld1.32 {d28[0]}, [r2], r1 ; l pre
- vld1.32 {d28[1]}, [r12], r1 ; r pre
- vld1.32 {d29[0]}, [r2], r1
- vld1.32 {d29[1]}, [r12], r1
- vld1.32 {d30[0]}, [r2], r1
- vld1.32 {d30[1]}, [r12], r1
- vld1.32 {d31[0]}, [r2]
- vld1.32 {d31[1]}, [r12]
-
- adr r2, cospi8sqrt2minus1 ; pointer to the first constant
-
- ; dequant: q[i] = q[i] * dq[i]
- vmul.i16 q2, q2, q0
- vmul.i16 q3, q3, q1
- vmul.i16 q4, q4, q0
- vmul.i16 q5, q5, q1
-
- vld1.16 {d0}, [r2]
-
- ; q2: l0r0 q3: l8r8
- ; q4: l4r4 q5: l12r12
- vswp d5, d8
- vswp d7, d10
-
- ; _CONSTANTS_ * 4,12 >> 16
- ; q6: 4 * sinpi : c1/temp1
- ; q7: 12 * sinpi : d1/temp2
- ; q8: 4 * cospi
- ; q9: 12 * cospi
- vqdmulh.s16 q6, q4, d0[2] ; sinpi8sqrt2
- vqdmulh.s16 q7, q5, d0[2]
- vqdmulh.s16 q8, q4, d0[0] ; cospi8sqrt2minus1
- vqdmulh.s16 q9, q5, d0[0]
-
- vqadd.s16 q10, q2, q3 ; a1 = 0 + 8
- vqsub.s16 q11, q2, q3 ; b1 = 0 - 8
-
- ; vqdmulh only accepts signed values. this was a problem because
- ; our constant had the high bit set, and was treated as a negative value.
- ; vqdmulh also doubles the value before it shifts by 16. we need to
- ; compensate for this. in the case of sinpi8sqrt2, the lowest bit is 0,
- ; so we can shift the constant without losing precision. this avoids
- ; shifting again afterward, and also avoids the sign issue. win win!
- ; for cospi8sqrt2minus1 the lowest bit is 1, so we lose precision if we
- ; pre-shift it
- vshr.s16 q8, q8, #1
- vshr.s16 q9, q9, #1
-
- ; q4: 4 + 4 * cospi : d1/temp1
- ; q5: 12 + 12 * cospi : c1/temp2
- vqadd.s16 q4, q4, q8
- vqadd.s16 q5, q5, q9
-
- ; c1 = temp1 - temp2
- ; d1 = temp1 + temp2
- vqsub.s16 q2, q6, q5
- vqadd.s16 q3, q4, q7
-
- ; [0]: a1+d1
- ; [1]: b1+c1
- ; [2]: b1-c1
- ; [3]: a1-d1
- vqadd.s16 q4, q10, q3
- vqadd.s16 q5, q11, q2
- vqsub.s16 q6, q11, q2
- vqsub.s16 q7, q10, q3
-
- ; rotate
- vtrn.32 q4, q6
- vtrn.32 q5, q7
- vtrn.16 q4, q5
- vtrn.16 q6, q7
- ; idct loop 2
- ; q4: l 0, 4, 8,12 r 0, 4, 8,12
- ; q5: l 1, 5, 9,13 r 1, 5, 9,13
- ; q6: l 2, 6,10,14 r 2, 6,10,14
- ; q7: l 3, 7,11,15 r 3, 7,11,15
-
- ; q8: 1 * sinpi : c1/temp1
- ; q9: 3 * sinpi : d1/temp2
- ; q10: 1 * cospi
- ; q11: 3 * cospi
- vqdmulh.s16 q8, q5, d0[2] ; sinpi8sqrt2
- vqdmulh.s16 q9, q7, d0[2]
- vqdmulh.s16 q10, q5, d0[0] ; cospi8sqrt2minus1
- vqdmulh.s16 q11, q7, d0[0]
-
- vqadd.s16 q2, q4, q6 ; a1 = 0 + 2
- vqsub.s16 q3, q4, q6 ; b1 = 0 - 2
-
- ; see note on shifting above
- vshr.s16 q10, q10, #1
- vshr.s16 q11, q11, #1
-
- ; q10: 1 + 1 * cospi : d1/temp1
- ; q11: 3 + 3 * cospi : c1/temp2
- vqadd.s16 q10, q5, q10
- vqadd.s16 q11, q7, q11
-
- ; q8: c1 = temp1 - temp2
- ; q9: d1 = temp1 + temp2
- vqsub.s16 q8, q8, q11
- vqadd.s16 q9, q10, q9
-
- ; a1+d1
- ; b1+c1
- ; b1-c1
- ; a1-d1
- vqadd.s16 q4, q2, q9
- vqadd.s16 q5, q3, q8
- vqsub.s16 q6, q3, q8
- vqsub.s16 q7, q2, q9
-
- ; +4 >> 3 (rounding)
- vrshr.s16 q4, q4, #3 ; lo
- vrshr.s16 q5, q5, #3
- vrshr.s16 q6, q6, #3 ; hi
- vrshr.s16 q7, q7, #3
-
- vtrn.32 q4, q6
- vtrn.32 q5, q7
- vtrn.16 q4, q5
- vtrn.16 q6, q7
-
- ; adding pre
- ; input is still packed. pre was read interleaved
- vaddw.u8 q4, q4, d28
- vaddw.u8 q5, q5, d29
- vaddw.u8 q6, q6, d30
- vaddw.u8 q7, q7, d31
-
- vmov.i16 q14, #0
- vmov q15, q14
- vst1.16 {q14, q15}, [r0] ; write over high input
- sub r0, r0, #32
- vst1.16 {q14, q15}, [r0] ; write over low input
-
- ;saturate and narrow
- vqmovun.s16 d0, q4 ; lo
- vqmovun.s16 d1, q5
- vqmovun.s16 d2, q6 ; hi
- vqmovun.s16 d3, q7
-
- ldr r1, [sp, #4] ; stride
- add r2, r3, #4 ; hi
- vst1.32 {d0[0]}, [r3], r1 ; lo
- vst1.32 {d0[1]}, [r2], r1 ; hi
- vst1.32 {d1[0]}, [r3], r1
- vst1.32 {d1[1]}, [r2], r1
- vst1.32 {d2[0]}, [r3], r1
- vst1.32 {d2[1]}, [r2], r1
- vst1.32 {d3[0]}, [r3]
- vst1.32 {d3[1]}, [r2]
-
- bx lr
-
- ENDP ; |idct_dequant_full_2x_neon|
-
-; Constant Pool
-cospi8sqrt2minus1 DCD 0x4e7b
-; because the lowest bit in 0x8a8c is 0, we can pre-shift this
-sinpi8sqrt2 DCD 0x4546
-
- END
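The vqdmulh comments above boil down to a small fixed-point identity. The standalone C sketch below re-derives it with the usual VP8 Q16 constants (sinpi8sqrt2 = 0x8a8c, cospi8sqrt2minus1 = 0x4e7b); the helper names are illustrative and this is a demonstration, not codec code.

/* Scalar sketch of the vqdmulh constant trick: vqdmulh computes
 * (2 * a * b) >> 16, so a Q16 constant with the high bit set would be read
 * as negative.  0x8a8c is even, so it can be pre-shifted to 0x4546 with no
 * loss; 0x4e7b is odd, so the full constant is kept and the doubled product
 * is halved afterwards instead. */
#include <stdio.h>

static short mul_q16(short a, int q16_const) {
  /* reference result: plain (a * c) >> 16 with the full Q16 constant */
  return (short)((a * q16_const) >> 16);
}

static short vqdmulh_like(short a, short c) {
  /* what vqdmulh does: doubling multiply, keep the high half */
  return (short)((2 * a * c) >> 16);
}

int main(void) {
  short a = 1234;
  /* sinpi8sqrt2: pre-shifted constant, no fixup needed afterwards */
  printf("%d %d\n", mul_q16(a, 0x8a8c), vqdmulh_like(a, 0x4546));
  /* cospi8sqrt2minus1: full constant, halve the doubled product afterwards */
  printf("%d %d\n", mul_q16(a, 0x4e7b), vqdmulh_like(a, 0x4e7b) >> 1);
  return 0;
}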
--- a/vp8/decoder/asm_dec_offsets.c
+++ /dev/null
@@ -1,39 +1,0 @@
-/*
- * Copyright (c) 2011 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include "vpx_ports/asm_offsets.h"
-#include "onyxd_int.h"
-
-BEGIN
-
-DEFINE(detok_scan, offsetof(DETOK, scan));
-DEFINE(detok_ptr_block2leftabove, offsetof(DETOK, ptr_block2leftabove));
-DEFINE(detok_coef_tree_ptr, offsetof(DETOK, vp9_coef_tree_ptr));
-DEFINE(detok_norm_ptr, offsetof(DETOK, norm_ptr));
-DEFINE(detok_ptr_coef_bands_x, offsetof(DETOK, ptr_coef_bands_x));
-
-DEFINE(detok_A, offsetof(DETOK, A));
-DEFINE(detok_L, offsetof(DETOK, L));
-
-DEFINE(detok_qcoeff_start_ptr, offsetof(DETOK, qcoeff_start_ptr));
-DEFINE(detok_coef_probs, offsetof(DETOK, coef_probs));
-DEFINE(detok_eob, offsetof(DETOK, eob));
-
-DEFINE(bool_decoder_user_buffer_end, offsetof(BOOL_DECODER, user_buffer_end));
-DEFINE(bool_decoder_user_buffer, offsetof(BOOL_DECODER, user_buffer));
-DEFINE(bool_decoder_value, offsetof(BOOL_DECODER, value));
-DEFINE(bool_decoder_count, offsetof(BOOL_DECODER, count));
-DEFINE(bool_decoder_range, offsetof(BOOL_DECODER, range));
-
-END
-
-/* add asserts for any offset that is not supported by assembly code */
-/* add asserts for any size that is not supported by assembly code */
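For context, the DEFINE() lines above follow the asm-offsets pattern: every struct member offset the assembly needs is exported as a named constant from a compiled object. The sketch below shows the idea with an illustrative struct and a printf-based DEFINE stand-in; the real build instead extracts the constants from the object file with obj_int_extract.

/* Illustrative asm-offsets sketch: print each offsetof() as a named
 * constant.  The struct layout mirrors BOOL_DECODER but is a local demo
 * type, and the DEFINE macro here is not the libvpx one. */
#include <stddef.h>
#include <stdio.h>

typedef struct {
  const unsigned char *user_buffer_end;
  const unsigned char *user_buffer;
  size_t value;
  int count;
  unsigned int range;
} demo_bool_decoder;

#define DEFINE(sym, val) printf("%-34s EQU %zu\n", #sym, (size_t)(val))

int main(void) {
  DEFINE(bool_decoder_user_buffer_end,
         offsetof(demo_bool_decoder, user_buffer_end));
  DEFINE(bool_decoder_user_buffer, offsetof(demo_bool_decoder, user_buffer));
  DEFINE(bool_decoder_value, offsetof(demo_bool_decoder, value));
  DEFINE(bool_decoder_count, offsetof(demo_bool_decoder, count));
  DEFINE(bool_decoder_range, offsetof(demo_bool_decoder, range));
  return 0;
}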
--- a/vp8/decoder/dboolhuff.c
+++ /dev/null
@@ -1,100 +1,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include "dboolhuff.h"
-#include "vpx_ports/mem.h"
-#include "vpx_mem/vpx_mem.h"
-
-int vp9_start_decode(BOOL_DECODER *br,
- const unsigned char *source,
- unsigned int source_sz) {
- br->user_buffer_end = source + source_sz;
- br->user_buffer = source;
- br->value = 0;
- br->count = -8;
- br->range = 255;
-
- if (source_sz && !source)
- return 1;
-
- /* Populate the buffer */
- vp9_bool_decoder_fill(br);
-
- return 0;
-}
-
-
-void vp9_bool_decoder_fill(BOOL_DECODER *br) {
- const unsigned char *bufptr;
- const unsigned char *bufend;
- VP9_BD_VALUE value;
- int count;
- bufend = br->user_buffer_end;
- bufptr = br->user_buffer;
- value = br->value;
- count = br->count;
-
- VP9DX_BOOL_DECODER_FILL(count, value, bufptr, bufend);
-
- br->user_buffer = bufptr;
- br->value = value;
- br->count = count;
-}
-
-
-static int get_unsigned_bits(unsigned num_values) {
- int cat = 0;
- if ((num_values--) <= 1) return 0;
- while (num_values > 0) {
- cat++;
- num_values >>= 1;
- }
- return cat;
-}
-
-int vp9_inv_recenter_nonneg(int v, int m) {
- if (v > (m << 1)) return v;
- else if ((v & 1) == 0) return (v >> 1) + m;
- else return m - ((v + 1) >> 1);
-}
-
-int vp9_decode_uniform(BOOL_DECODER *br, int n) {
- int v;
- int l = get_unsigned_bits(n);
- int m = (1 << l) - n;
- if (!l) return 0;
- v = decode_value(br, l - 1);
- if (v < m)
- return v;
- else
- return (v << 1) - m + decode_value(br, 1);
-}
-
-int vp9_decode_term_subexp(BOOL_DECODER *br, int k, int num_syms) {
- int i = 0, mk = 0, word;
- while (1) {
- int b = (i ? k + i - 1 : k);
- int a = (1 << b);
- if (num_syms <= mk + 3 * a) {
- word = vp9_decode_uniform(br, num_syms - mk) + mk;
- break;
- } else {
- if (decode_value(br, 1)) {
- i++;
- mk += a;
- } else {
- word = decode_value(br, b) + mk;
- break;
- }
- }
- }
- return word;
-}
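A quick standalone check of the "uniform" code consumed by vp9_decode_uniform(): for n possible values, l = ceil(log2(n)) and m = (1 << l) - n, so the first m values cost l-1 bits and the remaining n-m values cost l bits. The sketch below only re-derives that bit budget; it does not read a bitstream.

/* Bit-budget check for the near-uniform code used by vp9_decode_uniform(). */
#include <stdio.h>

static int bits_for(unsigned num_values) {  /* same idea as get_unsigned_bits() */
  int cat = 0;
  if (num_values-- <= 1) return 0;
  while (num_values > 0) { cat++; num_values >>= 1; }
  return cat;
}

int main(void) {
  int n = 5;                  /* e.g. 5 possible symbols */
  int l = bits_for(n);        /* 3 */
  int m = (1 << l) - n;       /* 3 */
  int v;
  for (v = 0; v < n; v++)
    printf("value %d -> %d bits\n", v, v < m ? l - 1 : l);
  return 0;
}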
--- a/vp8/decoder/dboolhuff.h
+++ /dev/null
@@ -1,153 +1,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#ifndef DBOOLHUFF_H
-#define DBOOLHUFF_H
-#include <stddef.h>
-#include <limits.h>
-#include "vpx_ports/config.h"
-#include "vpx_ports/mem.h"
-#include "vpx/vpx_integer.h"
-
-typedef size_t VP9_BD_VALUE;
-
-# define VP9_BD_VALUE_SIZE ((int)sizeof(VP9_BD_VALUE)*CHAR_BIT)
-/*This is meant to be a large, positive constant that can still be efficiently
- loaded as an immediate (on platforms like ARM, for example).
- Even relatively modest values like 100 would work fine.*/
-# define VP9_LOTS_OF_BITS (0x40000000)
-
-typedef struct {
- const unsigned char *user_buffer_end;
- const unsigned char *user_buffer;
- VP9_BD_VALUE value;
- int count;
- unsigned int range;
-} BOOL_DECODER;
-
-DECLARE_ALIGNED(16, extern const unsigned char, vp9_norm[256]);
-
-int vp9_start_decode(BOOL_DECODER *br,
- const unsigned char *source,
- unsigned int source_sz);
-
-void vp9_bool_decoder_fill(BOOL_DECODER *br);
-
-int vp9_decode_uniform(BOOL_DECODER *br, int n);
-int vp9_decode_term_subexp(BOOL_DECODER *br, int k, int num_syms);
-int vp9_inv_recenter_nonneg(int v, int m);
-
-/*The refill loop is used in several places, so define it in a macro to make
- sure they're all consistent.
- An inline function would be cleaner, but has a significant penalty, because
- multiple BOOL_DECODER fields must be modified, and the compiler is not smart
- enough to eliminate the stores to those fields and the subsequent reloads
- from them when inlining the function.*/
-#define VP9DX_BOOL_DECODER_FILL(_count,_value,_bufptr,_bufend) \
- do \
- { \
- int shift = VP9_BD_VALUE_SIZE - 8 - ((_count) + 8); \
- int loop_end, x; \
- size_t bits_left = ((_bufend)-(_bufptr))*CHAR_BIT; \
- \
- x = shift + CHAR_BIT - bits_left; \
- loop_end = 0; \
- if(x >= 0) \
- { \
- (_count) += VP9_LOTS_OF_BITS; \
- loop_end = x; \
- if(!bits_left) break; \
- } \
- while(shift >= loop_end) \
- { \
- (_count) += CHAR_BIT; \
- (_value) |= (VP9_BD_VALUE)*(_bufptr)++ << shift; \
- shift -= CHAR_BIT; \
- } \
- } \
- while(0) \
-
-
-static int decode_bool(BOOL_DECODER *br, int probability) {
- unsigned int bit = 0;
- VP9_BD_VALUE value;
- unsigned int split;
- VP9_BD_VALUE bigsplit;
- int count;
- unsigned int range;
-
- split = 1 + (((br->range - 1) * probability) >> 8);
-
- if (br->count < 0)
- vp9_bool_decoder_fill(br);
-
- value = br->value;
- count = br->count;
-
- bigsplit = (VP9_BD_VALUE)split << (VP9_BD_VALUE_SIZE - 8);
-
- range = split;
-
- if (value >= bigsplit) {
- range = br->range - split;
- value = value - bigsplit;
- bit = 1;
- }
-
- {
- register unsigned int shift = vp9_norm[range];
- range <<= shift;
- value <<= shift;
- count -= shift;
- }
- br->value = value;
- br->count = count;
- br->range = range;
-
- return bit;
-}
-
-static int decode_value(BOOL_DECODER *br, int bits) {
- int z = 0;
- int bit;
-
- for (bit = bits - 1; bit >= 0; bit--) {
- z |= (decode_bool(br, 0x80) << bit);
- }
-
- return z;
-}
-
-static int bool_error(BOOL_DECODER *br) {
- /* Check if we have reached the end of the buffer.
- *
- * Variable 'count' stores the number of bits in the 'value' buffer, minus
- * 8. The top byte is part of the algorithm, and the remainder is buffered
- * to be shifted into it. So if count == 8, the top 16 bits of 'value' are
- * occupied, 8 for the algorithm and 8 in the buffer.
- *
- * When reading a byte from the user's buffer, count is filled with 8 and
- * one byte is filled into the value buffer. When we reach the end of the
- * data, count is additionally filled with VP9_LOTS_OF_BITS. So when
- * count == VP9_LOTS_OF_BITS - 1, the user's data has been exhausted.
- */
- if ((br->count > VP9_BD_VALUE_SIZE) && (br->count < VP9_LOTS_OF_BITS)) {
- /* We have tried to decode bits after the end of
- * stream was encountered.
- */
- return 1;
- }
-
- /* No error. */
- return 0;
-}
-
-#endif
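The long bool_error() comment above is easiest to read as a predicate over 'count'. The sketch below restates it with a few illustrative counter values; the constants mirror the header, but the sample values are assumptions meant to show the three regimes, not traces from the real decoder.

/* End-of-stream bookkeeping described in bool_error(): 'count' is the number
 * of buffered bits minus 8; once the input is exhausted, the refill loop pads
 * it with LOTS_OF_BITS, so a later count that is above the value width but
 * below LOTS_OF_BITS means bits were consumed past the end of the data. */
#include <assert.h>
#include <limits.h>
#include <stdio.h>

#define BD_VALUE_SIZE ((int)sizeof(size_t) * CHAR_BIT)
#define LOTS_OF_BITS  0x40000000

static int past_end(int count) {
  return count > BD_VALUE_SIZE && count < LOTS_OF_BITS;  /* same predicate */
}

int main(void) {
  assert(!past_end(24));                   /* normal operation */
  assert(!past_end(LOTS_OF_BITS + 16));    /* just padded, data not yet over-read */
  assert(past_end(LOTS_OF_BITS - 100));    /* kept decoding past the end */
  printf("ok\n");
  return 0;
}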
--- a/vp8/decoder/decodemv.c
+++ /dev/null
@@ -1,1199 +1,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include "treereader.h"
-#include "vp8/common/entropymv.h"
-#include "vp8/common/entropymode.h"
-#include "onyxd_int.h"
-#include "vp8/common/findnearmv.h"
-
-#include "vp8/common/seg_common.h"
-#include "vp8/common/pred_common.h"
-#include "vp8/common/entropy.h"
-#include "vp8/decoder/decodemv.h"
-#if CONFIG_DEBUG
-#include <assert.h>
-#endif
-
-// #define DEBUG_DEC_MV
-#ifdef DEBUG_DEC_MV
-int dec_mvcount = 0;
-#endif
-
-static int read_bmode(vp9_reader *bc, const vp9_prob *p) {
- return treed_read(bc, vp9_bmode_tree, p);
-}
-
-static int read_ymode(vp9_reader *bc, const vp9_prob *p) {
- return treed_read(bc, vp9_ymode_tree, p);
-}
-
-#if CONFIG_SUPERBLOCKS
-static int read_kf_sb_ymode(vp9_reader *bc, const vp9_prob *p) {
- return treed_read(bc, vp9_uv_mode_tree, p);
-}
-#endif
-
-static int read_kf_mb_ymode(vp9_reader *bc, const vp9_prob *p) {
- return treed_read(bc, vp9_kf_ymode_tree, p);
-}
-
-static int read_i8x8_mode(vp9_reader *bc, const vp9_prob *p) {
- return treed_read(bc, vp9_i8x8_mode_tree, p);
-}
-
-static int read_uv_mode(vp9_reader *bc, const vp9_prob *p) {
- return treed_read(bc, vp9_uv_mode_tree, p);
-}
-
-// This function reads the current macroblock's segment id from the bitstream.
-// It should only be called if a segment map update is indicated.
-static void read_mb_segid(vp9_reader *r, MB_MODE_INFO *mi,
- MACROBLOCKD *xd) {
- /* Is segmentation enabled */
- if (xd->segmentation_enabled && xd->update_mb_segmentation_map) {
- /* If so then read the segment id. */
- if (vp9_read(r, xd->mb_segment_tree_probs[0]))
- mi->segment_id =
- (unsigned char)(2 + vp9_read(r, xd->mb_segment_tree_probs[2]));
- else
- mi->segment_id =
- (unsigned char)(vp9_read(r, xd->mb_segment_tree_probs[1]));
- }
-}
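read_mb_segid() above walks a two-level tree: mb_segment_tree_probs[0] separates segments {0,1} from {2,3}, and probs[1] / probs[2] pick within each pair. A minimal sketch with a stubbed bit source standing in for vp9_read() is below.

/* Two-level segment-id tree, with a stub bit reader for illustration. */
#include <stdio.h>

static int next_bit;                         /* stand-in for the bool decoder */
static int read_bool(int prob) { (void)prob; return next_bit; }

static int read_segment_id(const unsigned char probs[3]) {
  if (read_bool(probs[0]))
    return 2 + read_bool(probs[2]);          /* segment 2 or 3 */
  else
    return read_bool(probs[1]);              /* segment 0 or 1 */
}

int main(void) {
  const unsigned char probs[3] = {128, 128, 128};
  next_bit = 0; printf("%d\n", read_segment_id(probs));  /* prints 0 */
  next_bit = 1; printf("%d\n", read_segment_id(probs));  /* prints 3 */
  return 0;
}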
-
-#if CONFIG_NEW_MVREF
-int vp9_read_mv_ref_id(vp9_reader *r,
- vp9_prob * ref_id_probs) {
- int ref_index = 0;
-
- if (vp9_read(r, ref_id_probs[0])) {
- ref_index++;
- if (vp9_read(r, ref_id_probs[1])) {
- ref_index++;
- if (vp9_read(r, ref_id_probs[2]))
- ref_index++;
- }
- }
- return ref_index;
-}
-#endif
-
-extern const int vp9_i8x8_block[4];
-static void kfread_modes(VP9D_COMP *pbi,
- MODE_INFO *m,
- int mb_row,
- int mb_col,
- BOOL_DECODER* const bc) {
- VP9_COMMON *const cm = &pbi->common;
- const int mis = pbi->common.mode_info_stride;
- int map_index = mb_row * pbi->common.mb_cols + mb_col;
- MB_PREDICTION_MODE y_mode;
-
- // Read the Macroblock segmentation map if it is being updated explicitly
- // this frame (reset to 0 by default).
- m->mbmi.segment_id = 0;
- if (pbi->mb.update_mb_segmentation_map) {
- read_mb_segid(bc, &m->mbmi, &pbi->mb);
- pbi->common.last_frame_seg_map[map_index] = m->mbmi.segment_id;
- }
-
- m->mbmi.mb_skip_coeff = 0;
- if (pbi->common.mb_no_coeff_skip &&
- (!vp9_segfeature_active(&pbi->mb,
- m->mbmi.segment_id, SEG_LVL_EOB) ||
- (vp9_get_segdata(&pbi->mb,
- m->mbmi.segment_id, SEG_LVL_EOB) != 0))) {
- MACROBLOCKD *const xd = &pbi->mb;
- m->mbmi.mb_skip_coeff =
- vp9_read(bc, vp9_get_pred_prob(cm, xd, PRED_MBSKIP));
- } else {
- if (vp9_segfeature_active(&pbi->mb,
- m->mbmi.segment_id, SEG_LVL_EOB) &&
- (vp9_get_segdata(&pbi->mb,
- m->mbmi.segment_id, SEG_LVL_EOB) == 0)) {
- m->mbmi.mb_skip_coeff = 1;
- } else
- m->mbmi.mb_skip_coeff = 0;
- }
-
-#if CONFIG_SUPERBLOCKS
- if (m->mbmi.encoded_as_sb) {
- y_mode = (MB_PREDICTION_MODE) read_kf_sb_ymode(bc,
- pbi->common.sb_kf_ymode_prob[pbi->common.kf_ymode_probs_index]);
- } else
-#endif
- y_mode = (MB_PREDICTION_MODE) read_kf_mb_ymode(bc,
- pbi->common.kf_ymode_prob[pbi->common.kf_ymode_probs_index]);
-#if CONFIG_COMP_INTRA_PRED
- m->mbmi.second_mode = (MB_PREDICTION_MODE)(DC_PRED - 1);
-#endif
-
- m->mbmi.ref_frame = INTRA_FRAME;
-
- if ((m->mbmi.mode = y_mode) == B_PRED) {
- int i = 0;
-#if CONFIG_COMP_INTRA_PRED
- int use_comp_pred = vp9_read(bc, 128);
-#endif
- do {
- const B_PREDICTION_MODE A = above_block_mode(m, i, mis);
- const B_PREDICTION_MODE L = left_block_mode(m, i);
-
- m->bmi[i].as_mode.first =
- (B_PREDICTION_MODE) read_bmode(
- bc, pbi->common.kf_bmode_prob [A] [L]);
-#if CONFIG_COMP_INTRA_PRED
- if (use_comp_pred) {
- m->bmi[i].as_mode.second =
- (B_PREDICTION_MODE) read_bmode(
- bc, pbi->common.kf_bmode_prob [A] [L]);
- } else {
- m->bmi[i].as_mode.second = (B_PREDICTION_MODE)(B_DC_PRED - 1);
- }
-#endif
- } while (++i < 16);
- }
- if ((m->mbmi.mode = y_mode) == I8X8_PRED) {
- int i;
- int mode8x8;
- for (i = 0; i < 4; i++) {
- int ib = vp9_i8x8_block[i];
- mode8x8 = read_i8x8_mode(bc, pbi->common.fc.i8x8_mode_prob);
- m->bmi[ib + 0].as_mode.first = mode8x8;
- m->bmi[ib + 1].as_mode.first = mode8x8;
- m->bmi[ib + 4].as_mode.first = mode8x8;
- m->bmi[ib + 5].as_mode.first = mode8x8;
-#if CONFIG_COMP_INTRA_PRED
- m->bmi[ib + 0].as_mode.second = (MB_PREDICTION_MODE)(DC_PRED - 1);
- m->bmi[ib + 1].as_mode.second = (MB_PREDICTION_MODE)(DC_PRED - 1);
- m->bmi[ib + 4].as_mode.second = (MB_PREDICTION_MODE)(DC_PRED - 1);
- m->bmi[ib + 5].as_mode.second = (MB_PREDICTION_MODE)(DC_PRED - 1);
-#endif
- }
- } else
- m->mbmi.uv_mode = (MB_PREDICTION_MODE)read_uv_mode(bc,
- pbi->common.kf_uv_mode_prob[m->mbmi.mode]);
-#if CONFIG_COMP_INTRA_PRED
- m->mbmi.second_uv_mode = (MB_PREDICTION_MODE)(DC_PRED - 1);
-#endif
-
-#if CONFIG_SUPERBLOCKS
- if (m->mbmi.encoded_as_sb)
- m->mbmi.txfm_size = TX_8X8;
- else
-#endif
- if (cm->txfm_mode == TX_MODE_SELECT && m->mbmi.mb_skip_coeff == 0 &&
- m->mbmi.mode <= I8X8_PRED) {
- // FIXME(rbultje) code ternary symbol once all experiments are merged
- m->mbmi.txfm_size = vp9_read(bc, cm->prob_tx[0]);
- if (m->mbmi.txfm_size != TX_4X4 && m->mbmi.mode != I8X8_PRED)
- m->mbmi.txfm_size += vp9_read(bc, cm->prob_tx[1]);
- } else if (cm->txfm_mode >= ALLOW_16X16 && m->mbmi.mode <= TM_PRED) {
- m->mbmi.txfm_size = TX_16X16;
- } else if (cm->txfm_mode >= ALLOW_8X8 && m->mbmi.mode != B_PRED) {
- m->mbmi.txfm_size = TX_8X8;
- } else {
- m->mbmi.txfm_size = TX_4X4;
- }
-}
-
-static int read_nmv_component(vp9_reader *r,
- int rv,
- const nmv_component *mvcomp) {
- int v, s, z, c, o, d;
- s = vp9_read(r, mvcomp->sign);
- c = treed_read(r, vp9_mv_class_tree, mvcomp->classes);
- if (c == MV_CLASS_0) {
- d = treed_read(r, vp9_mv_class0_tree, mvcomp->class0);
- } else {
- int i, b;
- d = 0;
- b = c + CLASS0_BITS - 1; /* number of bits */
- for (i = 0; i < b; ++i)
- d |= (vp9_read(r, mvcomp->bits[i]) << i);
- }
- o = d << 3;
-
- z = vp9_get_mv_mag(c, o);
- v = (s ? -(z + 8) : (z + 8));
- return v;
-}
-
-static int read_nmv_component_fp(vp9_reader *r,
- int v,
- int rv,
- const nmv_component *mvcomp,
- int usehp) {
- int s, z, c, o, d, e, f;
- s = v < 0;
- z = (s ? -v : v) - 1; /* magnitude - 1 */
- z &= ~7;
-
- c = vp9_get_mv_class(z, &o);
- d = o >> 3;
-
- if (c == MV_CLASS_0) {
- f = treed_read(r, vp9_mv_fp_tree, mvcomp->class0_fp[d]);
- } else {
- f = treed_read(r, vp9_mv_fp_tree, mvcomp->fp);
- }
- o += (f << 1);
-
- if (usehp) {
- if (c == MV_CLASS_0) {
- e = vp9_read(r, mvcomp->class0_hp);
- } else {
- e = vp9_read(r, mvcomp->hp);
- }
- o += e;
- } else {
- ++o; /* Note if hp is not used, the default value of the hp bit is 1 */
- }
- z = vp9_get_mv_mag(c, o);
- v = (s ? -(z + 1) : (z + 1));
- return v;
-}
-
-static void read_nmv(vp9_reader *r, MV *mv, const MV *ref,
- const nmv_context *mvctx) {
- MV_JOINT_TYPE j = treed_read(r, vp9_mv_joint_tree, mvctx->joints);
- mv->row = mv-> col = 0;
- if (j == MV_JOINT_HZVNZ || j == MV_JOINT_HNZVNZ) {
- mv->row = read_nmv_component(r, ref->row, &mvctx->comps[0]);
- }
- if (j == MV_JOINT_HNZVZ || j == MV_JOINT_HNZVNZ) {
- mv->col = read_nmv_component(r, ref->col, &mvctx->comps[1]);
- }
-}
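read_nmv() above only reads the components that the joint symbol marks as nonzero. The sketch below just tabulates that mapping; the enum ordering is written out for illustration and should be checked against the real MV_JOINT_TYPE definition.

/* Which MV components are coded for each joint class, per read_nmv(). */
#include <stdio.h>

enum {
  MV_JOINT_ZERO,    /* both components zero    */
  MV_JOINT_HNZVZ,   /* only the column nonzero */
  MV_JOINT_HZVNZ,   /* only the row nonzero    */
  MV_JOINT_HNZVNZ   /* both components nonzero */
};

int main(void) {
  int j;
  for (j = MV_JOINT_ZERO; j <= MV_JOINT_HNZVNZ; j++) {
    int read_row = (j == MV_JOINT_HZVNZ || j == MV_JOINT_HNZVNZ);
    int read_col = (j == MV_JOINT_HNZVZ || j == MV_JOINT_HNZVNZ);
    printf("joint %d: read row=%d col=%d\n", j, read_row, read_col);
  }
  return 0;
}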
-
-static void read_nmv_fp(vp9_reader *r, MV *mv, const MV *ref,
- const nmv_context *mvctx, int usehp) {
- MV_JOINT_TYPE j = vp9_get_mv_joint(*mv);
- usehp = usehp && vp9_use_nmv_hp(ref);
- if (j == MV_JOINT_HZVNZ || j == MV_JOINT_HNZVNZ) {
- mv->row = read_nmv_component_fp(r, mv->row, ref->row, &mvctx->comps[0],
- usehp);
- }
- if (j == MV_JOINT_HNZVZ || j == MV_JOINT_HNZVNZ) {
- mv->col = read_nmv_component_fp(r, mv->col, ref->col, &mvctx->comps[1],
- usehp);
- }
- //printf(" %d: %d %d ref: %d %d\n", usehp, mv->row, mv-> col, ref->row, ref->col);
-}
-
-static void update_nmv(vp9_reader *bc, vp9_prob *const p,
- const vp9_prob upd_p) {
- if (vp9_read(bc, upd_p)) {
-#ifdef LOW_PRECISION_MV_UPDATE
- *p = (vp9_read_literal(bc, 7) << 1) | 1;
-#else
- *p = (vp9_read_literal(bc, 8));
-#endif
- }
-}
-
-static void read_nmvprobs(vp9_reader *bc, nmv_context *mvctx,
- int usehp) {
- int i, j, k;
-#ifdef MV_GROUP_UPDATE
- if (!vp9_read_bit(bc)) return;
-#endif
- for (j = 0; j < MV_JOINTS - 1; ++j) {
- update_nmv(bc, &mvctx->joints[j],
- VP9_NMV_UPDATE_PROB);
- }
- for (i = 0; i < 2; ++i) {
- update_nmv(bc, &mvctx->comps[i].sign,
- VP9_NMV_UPDATE_PROB);
- for (j = 0; j < MV_CLASSES - 1; ++j) {
- update_nmv(bc, &mvctx->comps[i].classes[j],
- VP9_NMV_UPDATE_PROB);
- }
- for (j = 0; j < CLASS0_SIZE - 1; ++j) {
- update_nmv(bc, &mvctx->comps[i].class0[j],
- VP9_NMV_UPDATE_PROB);
- }
- for (j = 0; j < MV_OFFSET_BITS; ++j) {
- update_nmv(bc, &mvctx->comps[i].bits[j],
- VP9_NMV_UPDATE_PROB);
- }
- }
-
- for (i = 0; i < 2; ++i) {
- for (j = 0; j < CLASS0_SIZE; ++j) {
- for (k = 0; k < 3; ++k)
- update_nmv(bc, &mvctx->comps[i].class0_fp[j][k],
- VP9_NMV_UPDATE_PROB);
- }
- for (j = 0; j < 3; ++j) {
- update_nmv(bc, &mvctx->comps[i].fp[j],
- VP9_NMV_UPDATE_PROB);
- }
- }
-
- if (usehp) {
- for (i = 0; i < 2; ++i) {
- update_nmv(bc, &mvctx->comps[i].class0_hp,
- VP9_NMV_UPDATE_PROB);
- update_nmv(bc, &mvctx->comps[i].hp,
- VP9_NMV_UPDATE_PROB);
- }
- }
-}
-
-// Read the reference frame
-static MV_REFERENCE_FRAME read_ref_frame(VP9D_COMP *pbi,
- vp9_reader *const bc,
- unsigned char segment_id) {
- MV_REFERENCE_FRAME ref_frame;
- int seg_ref_active;
- int seg_ref_count = 0;
-
- VP9_COMMON *const cm = &pbi->common;
- MACROBLOCKD *const xd = &pbi->mb;
-
- seg_ref_active = vp9_segfeature_active(xd,
- segment_id,
- SEG_LVL_REF_FRAME);
-
-  // If segment level coding is enabled, check whether the segment allows
-  // more than one possible reference frame
- if (seg_ref_active) {
- seg_ref_count = vp9_check_segref(xd, segment_id, INTRA_FRAME) +
- vp9_check_segref(xd, segment_id, LAST_FRAME) +
- vp9_check_segref(xd, segment_id, GOLDEN_FRAME) +
- vp9_check_segref(xd, segment_id, ALTREF_FRAME);
- }
-
- // Segment reference frame features not available or allows for
- // multiple reference frame options
- if (!seg_ref_active || (seg_ref_count > 1)) {
- // Values used in prediction model coding
- unsigned char prediction_flag;
- vp9_prob pred_prob;
- MV_REFERENCE_FRAME pred_ref;
-
-    // Get the context probability for the prediction flag
- pred_prob = vp9_get_pred_prob(cm, xd, PRED_REF);
-
- // Read the prediction status flag
- prediction_flag = (unsigned char)vp9_read(bc, pred_prob);
-
- // Store the prediction flag.
- vp9_set_pred_flag(xd, PRED_REF, prediction_flag);
-
- // Get the predicted reference frame.
- pred_ref = vp9_get_pred_ref(cm, xd);
-
- // If correctly predicted then use the predicted value
- if (prediction_flag) {
- ref_frame = pred_ref;
- }
- // else decode the explicitly coded value
- else {
- vp9_prob mod_refprobs[PREDICTION_PROBS];
- vpx_memcpy(mod_refprobs,
- cm->mod_refprobs[pred_ref], sizeof(mod_refprobs));
-
-      // If segment coding is enabled, blank out options that can't occur by
- // setting the branch probability to 0.
- if (seg_ref_active) {
- mod_refprobs[INTRA_FRAME] *=
- vp9_check_segref(xd, segment_id, INTRA_FRAME);
- mod_refprobs[LAST_FRAME] *=
- vp9_check_segref(xd, segment_id, LAST_FRAME);
- mod_refprobs[GOLDEN_FRAME] *=
- (vp9_check_segref(xd, segment_id, GOLDEN_FRAME) *
- vp9_check_segref(xd, segment_id, ALTREF_FRAME));
- }
-
- // Default to INTRA_FRAME (value 0)
- ref_frame = INTRA_FRAME;
-
- // Do we need to decode the Intra/Inter branch
- if (mod_refprobs[0])
- ref_frame = (MV_REFERENCE_FRAME) vp9_read(bc, mod_refprobs[0]);
- else
- ref_frame++;
-
- if (ref_frame) {
- // Do we need to decode the Last/Gf_Arf branch
- if (mod_refprobs[1])
- ref_frame += vp9_read(bc, mod_refprobs[1]);
- else
- ref_frame++;
-
- if (ref_frame > 1) {
- // Do we need to decode the GF/Arf branch
- if (mod_refprobs[2])
- ref_frame += vp9_read(bc, mod_refprobs[2]);
- else {
- if (seg_ref_active) {
- if ((pred_ref == GOLDEN_FRAME) ||
- !vp9_check_segref(xd, segment_id, GOLDEN_FRAME)) {
- ref_frame = ALTREF_FRAME;
- } else
- ref_frame = GOLDEN_FRAME;
- } else
- ref_frame = (pred_ref == GOLDEN_FRAME)
- ? ALTREF_FRAME : GOLDEN_FRAME;
- }
- }
- }
- }
- }
-
- // Segment reference frame features are enabled
- else {
-    // The reference frame for the mb is considered correctly predicted
- // if it is signaled at the segment level for the purposes of the
- // common prediction model
- vp9_set_pred_flag(xd, PRED_REF, 1);
- ref_frame = vp9_get_pred_ref(cm, xd);
- }
-
- return (MV_REFERENCE_FRAME)ref_frame;
-}
-
-#if CONFIG_SUPERBLOCKS
-static MB_PREDICTION_MODE read_sb_mv_ref(vp9_reader *bc, const vp9_prob *p) {
- return (MB_PREDICTION_MODE) treed_read(bc, vp9_sb_mv_ref_tree, p);
-}
-#endif
-
-static MB_PREDICTION_MODE read_mv_ref(vp9_reader *bc, const vp9_prob *p) {
- return (MB_PREDICTION_MODE) treed_read(bc, vp9_mv_ref_tree, p);
-}
-
-static B_PREDICTION_MODE sub_mv_ref(vp9_reader *bc, const vp9_prob *p) {
- return (B_PREDICTION_MODE) treed_read(bc, vp9_sub_mv_ref_tree, p);
-}
-
-#ifdef VPX_MODE_COUNT
-unsigned int vp9_mv_cont_count[5][4] = {
- { 0, 0, 0, 0 },
- { 0, 0, 0, 0 },
- { 0, 0, 0, 0 },
- { 0, 0, 0, 0 },
- { 0, 0, 0, 0 }
-};
-#endif
-
-static const unsigned char mbsplit_fill_count[4] = {8, 8, 4, 1};
-static const unsigned char mbsplit_fill_offset[4][16] = {
- { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15},
- { 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15},
- { 0, 1, 4, 5, 2, 3, 6, 7, 8, 9, 12, 13, 10, 11, 14, 15},
- { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}
-};
-
-static void read_switchable_interp_probs(VP9D_COMP* const pbi,
- BOOL_DECODER* const bc) {
- VP9_COMMON *const cm = &pbi->common;
- int i, j;
- for (j = 0; j <= VP9_SWITCHABLE_FILTERS; ++j) {
- for (i = 0; i < VP9_SWITCHABLE_FILTERS - 1; ++i) {
- cm->fc.switchable_interp_prob[j][i] = vp9_read_literal(bc, 8);
- }
- }
- //printf("DECODER: %d %d\n", cm->fc.switchable_interp_prob[0],
- //cm->fc.switchable_interp_prob[1]);
-}
-
-static void mb_mode_mv_init(VP9D_COMP *pbi, vp9_reader *bc) {
- VP9_COMMON *const cm = &pbi->common;
- nmv_context *const nmvc = &pbi->common.fc.nmvc;
- MACROBLOCKD *const xd = &pbi->mb;
-
- if (cm->frame_type == KEY_FRAME) {
- if (!cm->kf_ymode_probs_update)
- cm->kf_ymode_probs_index = vp9_read_literal(bc, 3);
- } else {
-#if CONFIG_PRED_FILTER
- cm->pred_filter_mode = (vp9_prob)vp9_read_literal(bc, 2);
-
- if (cm->pred_filter_mode == 2)
- cm->prob_pred_filter_off = (vp9_prob)vp9_read_literal(bc, 8);
-#endif
- if (cm->mcomp_filter_type == SWITCHABLE)
- read_switchable_interp_probs(pbi, bc);
- // Decode the baseline probabilities for decoding reference frame
- cm->prob_intra_coded = (vp9_prob)vp9_read_literal(bc, 8);
- cm->prob_last_coded = (vp9_prob)vp9_read_literal(bc, 8);
- cm->prob_gf_coded = (vp9_prob)vp9_read_literal(bc, 8);
-
- // Computes a modified set of probabilities for use when reference
- // frame prediction fails.
- vp9_compute_mod_refprobs(cm);
-
- pbi->common.comp_pred_mode = vp9_read(bc, 128);
- if (cm->comp_pred_mode)
- cm->comp_pred_mode += vp9_read(bc, 128);
- if (cm->comp_pred_mode == HYBRID_PREDICTION) {
- int i;
- for (i = 0; i < COMP_PRED_CONTEXTS; i++)
- cm->prob_comppred[i] = (vp9_prob)vp9_read_literal(bc, 8);
- }
-
- if (vp9_read_bit(bc)) {
- int i = 0;
-
- do {
- cm->fc.ymode_prob[i] = (vp9_prob) vp9_read_literal(bc, 8);
- } while (++i < VP9_YMODES - 1);
- }
-
-#if CONFIG_NEW_MVREF
-    // Temporary default probabilities for decoding the MV ref id signal
- vpx_memset(xd->mb_mv_ref_id_probs, 192, sizeof(xd->mb_mv_ref_id_probs));
-#endif
-
- read_nmvprobs(bc, nmvc, xd->allow_high_precision_mv);
- }
-}
-
-// This function either reads the segment id for the current macroblock from
-// the bitstream or, if the value is temporally predicted, asserts the predicted
-// value
-static void read_mb_segment_id(VP9D_COMP *pbi,
- int mb_row, int mb_col,
- BOOL_DECODER* const bc) {
- VP9_COMMON *const cm = &pbi->common;
- MACROBLOCKD *const xd = &pbi->mb;
- MODE_INFO *mi = xd->mode_info_context;
- MB_MODE_INFO *mbmi = &mi->mbmi;
- int index = mb_row * pbi->common.mb_cols + mb_col;
-
- if (xd->segmentation_enabled) {
- if (xd->update_mb_segmentation_map) {
- // Is temporal coding of the segment id for this mb enabled.
- if (cm->temporal_update) {
- // Get the context based probability for reading the
- // prediction status flag
- vp9_prob pred_prob =
- vp9_get_pred_prob(cm, xd, PRED_SEG_ID);
-
- // Read the prediction status flag
- unsigned char seg_pred_flag =
- (unsigned char)vp9_read(bc, pred_prob);
-
- // Store the prediction flag.
- vp9_set_pred_flag(xd, PRED_SEG_ID, seg_pred_flag);
-
- // If the value is flagged as correctly predicted
- // then use the predicted value
- if (seg_pred_flag) {
- mbmi->segment_id = vp9_get_pred_mb_segid(cm, xd, index);
- }
- // Else .... decode it explicitly
- else {
- read_mb_segid(bc, mbmi, xd);
- }
- }
- // Normal unpredicted coding mode
- else {
- read_mb_segid(bc, mbmi, xd);
- }
-#if CONFIG_SUPERBLOCKS
- if (mbmi->encoded_as_sb) {
- cm->last_frame_seg_map[index] = mbmi->segment_id;
- if (mb_col + 1 < cm->mb_cols)
- cm->last_frame_seg_map[index + 1] = mbmi->segment_id;
- if (mb_row + 1 < cm->mb_rows) {
- cm->last_frame_seg_map[index + cm->mb_cols] = mbmi->segment_id;
- if (mb_col + 1 < cm->mb_cols)
- cm->last_frame_seg_map[index + cm->mb_cols + 1] = mbmi->segment_id;
- }
- } else
-#endif
- {
- cm->last_frame_seg_map[index] = mbmi->segment_id;
- }
- } else {
-#if CONFIG_SUPERBLOCKS
- if (mbmi->encoded_as_sb) {
- mbmi->segment_id = cm->last_frame_seg_map[index];
- if (mb_col < cm->mb_cols - 1)
- mbmi->segment_id = mbmi->segment_id &&
- cm->last_frame_seg_map[index + 1];
- if (mb_row < cm->mb_rows - 1) {
- mbmi->segment_id = mbmi->segment_id &&
- cm->last_frame_seg_map[index + cm->mb_cols];
- if (mb_col < cm->mb_cols - 1)
- mbmi->segment_id = mbmi->segment_id &&
- cm->last_frame_seg_map[index + cm->mb_cols + 1];
- }
- } else
-#endif
- {
- mbmi->segment_id = cm->last_frame_seg_map[index];
- }
- }
- } else {
- // The encoder explicitly sets the segment_id to 0
- // when segmentation is disabled
- mbmi->segment_id = 0;
- }
-}
-
-static void read_mb_modes_mv(VP9D_COMP *pbi, MODE_INFO *mi, MB_MODE_INFO *mbmi,
- MODE_INFO *prev_mi,
- int mb_row, int mb_col,
- BOOL_DECODER* const bc) {
- VP9_COMMON *const cm = &pbi->common;
- nmv_context *const nmvc = &pbi->common.fc.nmvc;
- const int mis = pbi->common.mode_info_stride;
- MACROBLOCKD *const xd = &pbi->mb;
-
- int_mv *const mv = &mbmi->mv;
- int mb_to_left_edge;
- int mb_to_right_edge;
- int mb_to_top_edge;
- int mb_to_bottom_edge;
-
- mb_to_top_edge = xd->mb_to_top_edge;
- mb_to_bottom_edge = xd->mb_to_bottom_edge;
- mb_to_top_edge -= LEFT_TOP_MARGIN;
- mb_to_bottom_edge += RIGHT_BOTTOM_MARGIN;
- mbmi->need_to_clamp_mvs = 0;
- mbmi->need_to_clamp_secondmv = 0;
- mbmi->second_ref_frame = 0;
- /* Distance of Mb to the various image edges.
-   * These are specified in 1/8th-pel units as they are always compared to MV values that are in 1/8th-pel units
- */
- xd->mb_to_left_edge =
- mb_to_left_edge = -((mb_col * 16) << 3);
- mb_to_left_edge -= LEFT_TOP_MARGIN;
-
- xd->mb_to_right_edge =
- mb_to_right_edge = ((pbi->common.mb_cols - 1 - mb_col) * 16) << 3;
- mb_to_right_edge += RIGHT_BOTTOM_MARGIN;
-
- // Make sure the MACROBLOCKD mode info pointer is pointed at the
- // correct entry for the current macroblock.
- xd->mode_info_context = mi;
- xd->prev_mode_info_context = prev_mi;
-
- // Read the macroblock segment id.
- read_mb_segment_id(pbi, mb_row, mb_col, bc);
-
- if (pbi->common.mb_no_coeff_skip &&
- (!vp9_segfeature_active(xd,
- mbmi->segment_id, SEG_LVL_EOB) ||
- (vp9_get_segdata(xd, mbmi->segment_id, SEG_LVL_EOB) != 0))) {
- // Read the macroblock coeff skip flag if this feature is in use,
- // else default to 0
- mbmi->mb_skip_coeff = vp9_read(bc, vp9_get_pred_prob(cm, xd, PRED_MBSKIP));
- } else {
- if (vp9_segfeature_active(xd,
- mbmi->segment_id, SEG_LVL_EOB) &&
- (vp9_get_segdata(xd, mbmi->segment_id, SEG_LVL_EOB) == 0)) {
- mbmi->mb_skip_coeff = 1;
- } else
- mbmi->mb_skip_coeff = 0;
- }
-
- // Read the reference frame
- mbmi->ref_frame = read_ref_frame(pbi, bc, mbmi->segment_id);
-
- // If reference frame is an Inter frame
- if (mbmi->ref_frame) {
- int rct[4];
- int_mv nearest, nearby, best_mv;
- int_mv nearest_second, nearby_second, best_mv_second;
- vp9_prob mv_ref_p [VP9_MVREFS - 1];
-
-#if CONFIG_NEWBESTREFMV
- int recon_y_stride, recon_yoffset;
- int recon_uv_stride, recon_uvoffset;
-#endif
-
- vp9_find_near_mvs(xd, mi,
- prev_mi,
- &nearest, &nearby, &best_mv, rct,
- mbmi->ref_frame, cm->ref_frame_sign_bias);
-
-#if CONFIG_NEWBESTREFMV
- {
- int ref_fb_idx;
- MV_REFERENCE_FRAME ref_frame = mbmi->ref_frame;
-
- /* Select the appropriate reference frame for this MB */
- if (ref_frame == LAST_FRAME)
- ref_fb_idx = cm->lst_fb_idx;
- else if (ref_frame == GOLDEN_FRAME)
- ref_fb_idx = cm->gld_fb_idx;
- else
- ref_fb_idx = cm->alt_fb_idx;
-
- recon_y_stride = cm->yv12_fb[ref_fb_idx].y_stride ;
- recon_uv_stride = cm->yv12_fb[ref_fb_idx].uv_stride;
-
- recon_yoffset = (mb_row * recon_y_stride * 16) + (mb_col * 16);
- recon_uvoffset = (mb_row * recon_uv_stride * 8) + (mb_col * 8);
-
- xd->pre.y_buffer = cm->yv12_fb[ref_fb_idx].y_buffer + recon_yoffset;
- xd->pre.u_buffer = cm->yv12_fb[ref_fb_idx].u_buffer + recon_uvoffset;
- xd->pre.v_buffer = cm->yv12_fb[ref_fb_idx].v_buffer + recon_uvoffset;
-
- vp9_find_mv_refs(xd, mi, prev_mi,
- ref_frame, mbmi->ref_mvs[ref_frame],
- cm->ref_frame_sign_bias);
-
- vp9_find_best_ref_mvs(xd,
- xd->pre.y_buffer,
- recon_y_stride,
- mbmi->ref_mvs[ref_frame],
- &best_mv, &nearest, &nearby);
- }
-#endif
-
- vp9_mv_ref_probs(&pbi->common, mv_ref_p, rct);
-
- // Is the segment level mode feature enabled for this segment
- if (vp9_segfeature_active(xd, mbmi->segment_id, SEG_LVL_MODE)) {
- mbmi->mode =
- vp9_get_segdata(xd, mbmi->segment_id, SEG_LVL_MODE);
- } else {
-#if CONFIG_SUPERBLOCKS
- if (mbmi->encoded_as_sb) {
- mbmi->mode = read_sb_mv_ref(bc, mv_ref_p);
- } else
-#endif
- mbmi->mode = read_mv_ref(bc, mv_ref_p);
-
- vp9_accum_mv_refs(&pbi->common, mbmi->mode, rct);
- }
-
-#if CONFIG_PRED_FILTER
- if (mbmi->mode >= NEARESTMV && mbmi->mode < SPLITMV) {
- // Is the prediction filter enabled
- if (cm->pred_filter_mode == 2)
- mbmi->pred_filter_enabled =
- vp9_read(bc, cm->prob_pred_filter_off);
- else
- mbmi->pred_filter_enabled = cm->pred_filter_mode;
- }
-#endif
- if (mbmi->mode >= NEARESTMV && mbmi->mode <= SPLITMV)
- {
- if (cm->mcomp_filter_type == SWITCHABLE) {
- mbmi->interp_filter = vp9_switchable_interp[
- treed_read(bc, vp9_switchable_interp_tree,
- vp9_get_pred_probs(cm, xd, PRED_SWITCHABLE_INTERP))];
- } else {
- mbmi->interp_filter = cm->mcomp_filter_type;
- }
- }
-
- if (cm->comp_pred_mode == COMP_PREDICTION_ONLY ||
- (cm->comp_pred_mode == HYBRID_PREDICTION &&
- vp9_read(bc, vp9_get_pred_prob(cm, xd, PRED_COMP)))) {
- /* Since we have 3 reference frames, we can only have 3 unique
-       * combinations of 2 different reference frames
- * (A-G, G-L or A-L). In the bitstream, we use this to simply
- * derive the second reference frame from the first reference
- * frame, by saying it's the next one in the enumerator, and
- * if that's > n_refs, then the second reference frame is the
- * first one in the enumerator. */
- mbmi->second_ref_frame = mbmi->ref_frame + 1;
- if (mbmi->second_ref_frame == 4)
- mbmi->second_ref_frame = 1;
-#if CONFIG_NEWBESTREFMV
- if (mbmi->second_ref_frame) {
- int second_ref_fb_idx;
- /* Select the appropriate reference frame for this MB */
- if (mbmi->second_ref_frame == LAST_FRAME)
- second_ref_fb_idx = cm->lst_fb_idx;
- else if (mbmi->second_ref_frame ==
- GOLDEN_FRAME)
- second_ref_fb_idx = cm->gld_fb_idx;
- else
- second_ref_fb_idx = cm->alt_fb_idx;
-
- xd->second_pre.y_buffer =
- cm->yv12_fb[second_ref_fb_idx].y_buffer + recon_yoffset;
- xd->second_pre.u_buffer =
- cm->yv12_fb[second_ref_fb_idx].u_buffer + recon_uvoffset;
- xd->second_pre.v_buffer =
- cm->yv12_fb[second_ref_fb_idx].v_buffer + recon_uvoffset;
- vp9_find_near_mvs(xd, mi, prev_mi,
- &nearest_second, &nearby_second, &best_mv_second,
- rct,
- mbmi->second_ref_frame,
- cm->ref_frame_sign_bias);
-
- vp9_find_mv_refs(xd, mi, prev_mi,
- mbmi->second_ref_frame,
- mbmi->ref_mvs[mbmi->second_ref_frame],
- cm->ref_frame_sign_bias);
-
- vp9_find_best_ref_mvs(xd,
- xd->second_pre.y_buffer,
- recon_y_stride,
- mbmi->ref_mvs[mbmi->second_ref_frame],
- &best_mv_second,
- &nearest_second,
- &nearby_second);
- }
-#else
- vp9_find_near_mvs(xd, mi, prev_mi,
- &nearest_second, &nearby_second, &best_mv_second,
- rct,
- mbmi->second_ref_frame,
- pbi->common.ref_frame_sign_bias);
-#endif
- } else {
- mbmi->second_ref_frame = 0;
- }
-
- mbmi->uv_mode = DC_PRED;
- switch (mbmi->mode) {
- case SPLITMV: {
- const int s = mbmi->partitioning =
- treed_read(bc, vp9_mbsplit_tree, cm->fc.mbsplit_prob);
- const int num_p = vp9_mbsplit_count [s];
- int j = 0;
- cm->fc.mbsplit_counts[s]++;
-
- mbmi->need_to_clamp_mvs = 0;
- do { /* for each subset j */
- int_mv leftmv, abovemv, second_leftmv, second_abovemv;
- int_mv blockmv, secondmv;
- int k; /* first block in subset j */
- int mv_contz;
- int blockmode;
-
- k = vp9_mbsplit_offset[s][j];
-
- leftmv.as_int = left_block_mv(mi, k);
- abovemv.as_int = above_block_mv(mi, k, mis);
- if (mbmi->second_ref_frame) {
- second_leftmv.as_int = left_block_second_mv(mi, k);
- second_abovemv.as_int = above_block_second_mv(mi, k, mis);
- }
- mv_contz = vp9_mv_cont(&leftmv, &abovemv);
- blockmode = sub_mv_ref(bc, cm->fc.sub_mv_ref_prob [mv_contz]);
- cm->fc.sub_mv_ref_counts[mv_contz][blockmode - LEFT4X4]++;
-
- switch (blockmode) {
- case NEW4X4:
- read_nmv(bc, &blockmv.as_mv, &best_mv.as_mv, nmvc);
- read_nmv_fp(bc, &blockmv.as_mv, &best_mv.as_mv, nmvc,
- xd->allow_high_precision_mv);
- vp9_increment_nmv(&blockmv.as_mv, &best_mv.as_mv,
- &cm->fc.NMVcount, xd->allow_high_precision_mv);
- blockmv.as_mv.row += best_mv.as_mv.row;
- blockmv.as_mv.col += best_mv.as_mv.col;
-
- if (mbmi->second_ref_frame) {
- read_nmv(bc, &secondmv.as_mv, &best_mv_second.as_mv, nmvc);
- read_nmv_fp(bc, &secondmv.as_mv, &best_mv_second.as_mv, nmvc,
- xd->allow_high_precision_mv);
- vp9_increment_nmv(&secondmv.as_mv, &best_mv_second.as_mv,
- &cm->fc.NMVcount, xd->allow_high_precision_mv);
- secondmv.as_mv.row += best_mv_second.as_mv.row;
- secondmv.as_mv.col += best_mv_second.as_mv.col;
- }
-#ifdef VPX_MODE_COUNT
- vp9_mv_cont_count[mv_contz][3]++;
-#endif
- break;
- case LEFT4X4:
- blockmv.as_int = leftmv.as_int;
- if (mbmi->second_ref_frame)
- secondmv.as_int = second_leftmv.as_int;
-#ifdef VPX_MODE_COUNT
- vp9_mv_cont_count[mv_contz][0]++;
-#endif
- break;
- case ABOVE4X4:
- blockmv.as_int = abovemv.as_int;
- if (mbmi->second_ref_frame)
- secondmv.as_int = second_abovemv.as_int;
-#ifdef VPX_MODE_COUNT
- vp9_mv_cont_count[mv_contz][1]++;
-#endif
- break;
- case ZERO4X4:
- blockmv.as_int = 0;
- if (mbmi->second_ref_frame)
- secondmv.as_int = 0;
-#ifdef VPX_MODE_COUNT
- vp9_mv_cont_count[mv_contz][2]++;
-#endif
- break;
- default:
- break;
- }
-
- mbmi->need_to_clamp_mvs |= check_mv_bounds(&blockmv,
- mb_to_left_edge,
- mb_to_right_edge,
- mb_to_top_edge,
- mb_to_bottom_edge);
- if (mbmi->second_ref_frame) {
- mbmi->need_to_clamp_mvs |= check_mv_bounds(&secondmv,
- mb_to_left_edge,
- mb_to_right_edge,
- mb_to_top_edge,
- mb_to_bottom_edge);
- }
-
- {
- /* Fill (uniform) modes, mvs of jth subset.
- Must do it here because ensuing subsets can
- refer back to us via "left" or "above". */
- const unsigned char *fill_offset;
- unsigned int fill_count = mbsplit_fill_count[s];
-
- fill_offset = &mbsplit_fill_offset[s][(unsigned char)j * mbsplit_fill_count[s]];
-
- do {
- mi->bmi[ *fill_offset].as_mv.first.as_int = blockmv.as_int;
- if (mbmi->second_ref_frame)
- mi->bmi[ *fill_offset].as_mv.second.as_int = secondmv.as_int;
- fill_offset++;
- } while (--fill_count);
- }
-
- } while (++j < num_p);
- }
-
- mv->as_int = mi->bmi[15].as_mv.first.as_int;
- mbmi->mv[1].as_int = mi->bmi[15].as_mv.second.as_int;
-
- break; /* done with SPLITMV */
-
- case NEARMV:
- mv->as_int = nearby.as_int;
-      /* Clip "next_nearest" so that it does not extend too far out of the image */
- clamp_mv(mv, mb_to_left_edge, mb_to_right_edge,
- mb_to_top_edge, mb_to_bottom_edge);
- if (mbmi->second_ref_frame) {
- mbmi->mv[1].as_int = nearby_second.as_int;
- clamp_mv(&mbmi->mv[1], mb_to_left_edge, mb_to_right_edge,
- mb_to_top_edge, mb_to_bottom_edge);
- }
- break;
-
- case NEARESTMV:
- mv->as_int = nearest.as_int;
-      /* Clip "next_nearest" so that it does not extend too far out of the image */
- clamp_mv(mv, mb_to_left_edge, mb_to_right_edge,
- mb_to_top_edge, mb_to_bottom_edge);
- if (mbmi->second_ref_frame) {
- mbmi->mv[1].as_int = nearest_second.as_int;
- clamp_mv(&mbmi->mv[1], mb_to_left_edge, mb_to_right_edge,
- mb_to_top_edge, mb_to_bottom_edge);
- }
- break;
-
- case ZEROMV:
- mv->as_int = 0;
- if (mbmi->second_ref_frame)
- mbmi->mv[1].as_int = 0;
- break;
-
- case NEWMV:
-
-#if CONFIG_NEW_MVREF
- {
- int best_index;
- MV_REFERENCE_FRAME ref_frame = mbmi->ref_frame;
-
-        // Read the index of the chosen reference MV.
- best_index =
- vp9_read_mv_ref_id(bc, xd->mb_mv_ref_id_probs[ref_frame]);
-
- best_mv.as_int = mbmi->ref_mvs[ref_frame][best_index].as_int;
- }
-#endif
-
- read_nmv(bc, &mv->as_mv, &best_mv.as_mv, nmvc);
- read_nmv_fp(bc, &mv->as_mv, &best_mv.as_mv, nmvc,
- xd->allow_high_precision_mv);
- vp9_increment_nmv(&mv->as_mv, &best_mv.as_mv, &cm->fc.NMVcount,
- xd->allow_high_precision_mv);
-
- mv->as_mv.row += best_mv.as_mv.row;
- mv->as_mv.col += best_mv.as_mv.col;
-
- /* Don't need to check this on NEARMV and NEARESTMV modes
- * since those modes clamp the MV. The NEWMV mode does not,
- * so signal to the prediction stage whether special
- * handling may be required.
- */
- mbmi->need_to_clamp_mvs = check_mv_bounds(mv,
- mb_to_left_edge,
- mb_to_right_edge,
- mb_to_top_edge,
- mb_to_bottom_edge);
-
- if (mbmi->second_ref_frame) {
-#if CONFIG_NEW_MVREF
- {
- int best_index;
- MV_REFERENCE_FRAME ref_frame = mbmi->second_ref_frame;
-
-          // Read the index of the chosen reference MV.
- best_index =
- vp9_read_mv_ref_id(bc, xd->mb_mv_ref_id_probs[ref_frame]);
- best_mv_second.as_int = mbmi->ref_mvs[ref_frame][best_index].as_int;
- }
-#endif
-
- read_nmv(bc, &mbmi->mv[1].as_mv, &best_mv_second.as_mv, nmvc);
- read_nmv_fp(bc, &mbmi->mv[1].as_mv, &best_mv_second.as_mv, nmvc,
- xd->allow_high_precision_mv);
- vp9_increment_nmv(&mbmi->mv[1].as_mv, &best_mv_second.as_mv,
- &cm->fc.NMVcount, xd->allow_high_precision_mv);
- mbmi->mv[1].as_mv.row += best_mv_second.as_mv.row;
- mbmi->mv[1].as_mv.col += best_mv_second.as_mv.col;
- mbmi->need_to_clamp_secondmv |=
- check_mv_bounds(&mbmi->mv[1],
- mb_to_left_edge, mb_to_right_edge,
- mb_to_top_edge, mb_to_bottom_edge);
- }
- break;
- default:
-;
-#if CONFIG_DEBUG
- assert(0);
-#endif
- }
- } else {
- /* required for left and above block mv */
- mbmi->mv[0].as_int = 0;
-
- if (vp9_segfeature_active(xd, mbmi->segment_id, SEG_LVL_MODE))
- mbmi->mode = (MB_PREDICTION_MODE)
- vp9_get_segdata(xd, mbmi->segment_id, SEG_LVL_MODE);
- else {
- // FIXME write using SB mode tree
- mbmi->mode = (MB_PREDICTION_MODE)
- read_ymode(bc, pbi->common.fc.ymode_prob);
- pbi->common.fc.ymode_counts[mbmi->mode]++;
- }
-#if CONFIG_COMP_INTRA_PRED
- mbmi->second_mode = (MB_PREDICTION_MODE)(DC_PRED - 1);
-#endif
-
- // If MB mode is BPRED read the block modes
- if (mbmi->mode == B_PRED) {
- int j = 0;
-#if CONFIG_COMP_INTRA_PRED
- int use_comp_pred = vp9_read(bc, 128);
-#endif
- do {
- mi->bmi[j].as_mode.first = (B_PREDICTION_MODE)read_bmode(bc, pbi->common.fc.bmode_prob);
- /*
- {
- int p;
- for (p = 0; p < VP9_BINTRAMODES - 1; ++p)
- printf(" %d", pbi->common.fc.bmode_prob[p]);
- printf("\nbmode[%d][%d]: %d\n", pbi->common.current_video_frame, j, mi->bmi[j].as_mode.first);
- }
- */
- pbi->common.fc.bmode_counts[mi->bmi[j].as_mode.first]++;
-#if CONFIG_COMP_INTRA_PRED
- if (use_comp_pred) {
- mi->bmi[j].as_mode.second = (B_PREDICTION_MODE)read_bmode(bc, pbi->common.fc.bmode_prob);
- } else {
- mi->bmi[j].as_mode.second = (B_PREDICTION_MODE)(B_DC_PRED - 1);
- }
-#endif
- } while (++j < 16);
- }
-
- if (mbmi->mode == I8X8_PRED) {
- int i;
- int mode8x8;
- for (i = 0; i < 4; i++) {
- int ib = vp9_i8x8_block[i];
- mode8x8 = read_i8x8_mode(bc, pbi->common.fc.i8x8_mode_prob);
- mi->bmi[ib + 0].as_mode.first = mode8x8;
- mi->bmi[ib + 1].as_mode.first = mode8x8;
- mi->bmi[ib + 4].as_mode.first = mode8x8;
- mi->bmi[ib + 5].as_mode.first = mode8x8;
- pbi->common.fc.i8x8_mode_counts[mode8x8]++;
-#if CONFIG_COMP_INTRA_PRED
- mi->bmi[ib + 0].as_mode.second = (MB_PREDICTION_MODE)(DC_PRED - 1);
- mi->bmi[ib + 1].as_mode.second = (MB_PREDICTION_MODE)(DC_PRED - 1);
- mi->bmi[ib + 4].as_mode.second = (MB_PREDICTION_MODE)(DC_PRED - 1);
- mi->bmi[ib + 5].as_mode.second = (MB_PREDICTION_MODE)(DC_PRED - 1);
-#endif
- }
- } else {
- mbmi->uv_mode = (MB_PREDICTION_MODE)read_uv_mode(
- bc, pbi->common.fc.uv_mode_prob[mbmi->mode]);
- pbi->common.fc.uv_mode_counts[mbmi->mode][mbmi->uv_mode]++;
- }
-
-#if CONFIG_COMP_INTRA_PRED
- mbmi->second_uv_mode = (MB_PREDICTION_MODE)(DC_PRED - 1);
-#endif
- }
-
-#if CONFIG_SUPERBLOCKS
- if (mbmi->encoded_as_sb)
- mbmi->txfm_size = TX_8X8;
- else
-#endif
- if (cm->txfm_mode == TX_MODE_SELECT && mbmi->mb_skip_coeff == 0 &&
- ((mbmi->ref_frame == INTRA_FRAME && mbmi->mode <= I8X8_PRED) ||
- (mbmi->ref_frame != INTRA_FRAME && !(mbmi->mode == SPLITMV &&
- mbmi->partitioning == PARTITIONING_4X4)))) {
- // FIXME(rbultje) code ternary symbol once all experiments are merged
- mbmi->txfm_size = vp9_read(bc, cm->prob_tx[0]);
- if (mbmi->txfm_size != TX_4X4 && mbmi->mode != I8X8_PRED &&
- mbmi->mode != SPLITMV)
- mbmi->txfm_size += vp9_read(bc, cm->prob_tx[1]);
- } else if (cm->txfm_mode >= ALLOW_16X16 &&
- ((mbmi->ref_frame == INTRA_FRAME && mbmi->mode <= TM_PRED) ||
- (mbmi->ref_frame != INTRA_FRAME && mbmi->mode != SPLITMV))) {
- mbmi->txfm_size = TX_16X16;
- } else if (cm->txfm_mode >= ALLOW_8X8 &&
- (!(mbmi->ref_frame == INTRA_FRAME && mbmi->mode == B_PRED) &&
- !(mbmi->ref_frame != INTRA_FRAME && mbmi->mode == SPLITMV &&
- mbmi->partitioning == PARTITIONING_4X4))) {
- mbmi->txfm_size = TX_8X8;
- } else {
- mbmi->txfm_size = TX_4X4;
- }
-}
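The compound-prediction comment inside read_mb_modes_mv() (three inter references, so the second one is implied by the first) reduces to a one-line wraparound. A sketch, assuming the usual reference-frame numbering (LAST=1, GOLDEN=2, ALTREF=3), is below.

/* Second reference frame derived from the first, as described above. */
#include <stdio.h>

enum { INTRA_FRAME, LAST_FRAME, GOLDEN_FRAME, ALTREF_FRAME };

static int second_ref(int first_ref) {
  int second = first_ref + 1;
  if (second == 4)          /* ALTREF wraps around to LAST */
    second = 1;
  return second;
}

int main(void) {
  int r;
  for (r = LAST_FRAME; r <= ALTREF_FRAME; r++)
    printf("first %d -> second %d\n", r, second_ref(r));  /* L-G, G-A, A-L */
  return 0;
}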
-
-void vp9_decode_mode_mvs_init(VP9D_COMP *pbi, BOOL_DECODER* const bc) {
- VP9_COMMON *cm = &pbi->common;
-
- vpx_memset(cm->mbskip_pred_probs, 0, sizeof(cm->mbskip_pred_probs));
- if (pbi->common.mb_no_coeff_skip) {
- int k;
- for (k = 0; k < MBSKIP_CONTEXTS; ++k)
- cm->mbskip_pred_probs[k] = (vp9_prob)vp9_read_literal(bc, 8);
- }
-
- mb_mode_mv_init(pbi, bc);
-}
-void vp9_decode_mb_mode_mv(VP9D_COMP *pbi,
- MACROBLOCKD *xd,
- int mb_row,
- int mb_col,
- BOOL_DECODER* const bc) {
- MODE_INFO *mi = xd->mode_info_context;
- MODE_INFO *prev_mi = xd->prev_mode_info_context;
-
- if (pbi->common.frame_type == KEY_FRAME)
- kfread_modes(pbi, mi, mb_row, mb_col, bc);
- else
- read_mb_modes_mv(pbi, mi, &mi->mbmi, prev_mi, mb_row, mb_col, bc);
-}
--- a/vp8/decoder/decodemv.h
+++ /dev/null
@@ -1,19 +1,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include "onyxd_int.h"
-
-void vp9_decode_mb_mode_mv(VP9D_COMP* const pbi,
- MACROBLOCKD* const xd,
- int mb_row,
- int mb_col,
- BOOL_DECODER* const bc);
-void vp9_decode_mode_mvs_init(VP9D_COMP* const pbi, BOOL_DECODER* const bc);
--- a/vp8/decoder/decodframe.c
+++ /dev/null
@@ -1,1337 +1,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include "onyxd_int.h"
-#include "vp8/common/header.h"
-#include "vp8/common/reconintra.h"
-#include "vp8/common/reconintra4x4.h"
-#include "vp8/common/reconinter.h"
-#include "detokenize.h"
-#include "vp8/common/invtrans.h"
-#include "vp8/common/alloccommon.h"
-#include "vp8/common/entropymode.h"
-#include "vp8/common/quant_common.h"
-#include "vpx_scale/vpxscale.h"
-#include "vpx_scale/yv12extend.h"
-#include "vp8/common/setupintrarecon.h"
-
-#include "decodemv.h"
-#include "vp8/common/extend.h"
-#include "vp8/common/modecont.h"
-#include "vpx_mem/vpx_mem.h"
-#include "vp8/common/idct.h"
-#include "dboolhuff.h"
-
-#include "vp8/common/seg_common.h"
-#include "vp8/common/entropy.h"
-#include "vpx_rtcd.h"
-
-#include <assert.h>
-#include <stdio.h>
-
-
-#define COEFCOUNT_TESTING
-
-static int merge_index(int v, int n, int modulus) {
- int max1 = (n - 1 - modulus / 2) / modulus + 1;
- if (v < max1) v = v * modulus + modulus / 2;
- else {
- int w;
- v -= max1;
- w = v;
- v += (v + modulus - modulus / 2) / modulus;
- while (v % modulus == modulus / 2 ||
- w != v - (v + modulus - modulus / 2) / modulus) v++;
- }
- return v;
-}
-
-static int inv_remap_prob(int v, int m) {
- const int n = 256;
- const int modulus = MODULUS_PARAM;
- int i;
- v = merge_index(v, n - 1, modulus);
- if ((m << 1) <= n) {
- i = vp9_inv_recenter_nonneg(v + 1, m);
- } else {
- i = n - 1 - vp9_inv_recenter_nonneg(v + 1, n - 1 - m);
- }
- return i;
-}
-
-static vp9_prob read_prob_diff_update(vp9_reader *const bc, int oldp) {
- int delp = vp9_decode_term_subexp(bc, SUBEXP_PARAM, 255);
- return (vp9_prob)inv_remap_prob(delp, oldp);
-}
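inv_remap_prob() relies on vp9_inv_recenter_nonneg() undoing an encoder-side recentering of the probability around its old value. The round-trip check below reconstructs a plausible forward map from the inverse shown above; the forward function is an assumption for illustration, not code copied from the encoder.

/* Round-trip check: recenter a value around m, then invert it. */
#include <assert.h>
#include <stdio.h>

static int recenter_nonneg(int v, int m) {       /* assumed forward map */
  if (v > (m << 1)) return v;
  else if (v >= m)  return (v - m) << 1;
  else              return ((m - v) << 1) - 1;
}

static int inv_recenter_nonneg(int v, int m) {   /* mirrors the decoder */
  if (v > (m << 1)) return v;
  else if ((v & 1) == 0) return (v >> 1) + m;
  else return m - ((v + 1) >> 1);
}

int main(void) {
  int m, v;
  for (m = 0; m < 32; m++)
    for (v = 0; v < 64; v++)
      assert(inv_recenter_nonneg(recenter_nonneg(v, m), m) == v);
  printf("round-trip ok\n");
  return 0;
}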
-
-void vp9_init_de_quantizer(VP9D_COMP *pbi) {
- int i;
- int Q;
- VP9_COMMON *const pc = &pbi->common;
-
- for (Q = 0; Q < QINDEX_RANGE; Q++) {
- pc->Y1dequant[Q][0] = (short)vp9_dc_quant(Q, pc->y1dc_delta_q);
- pc->Y2dequant[Q][0] = (short)vp9_dc2quant(Q, pc->y2dc_delta_q);
- pc->UVdequant[Q][0] = (short)vp9_dc_uv_quant(Q, pc->uvdc_delta_q);
-
-    /* all the ac values */
- for (i = 1; i < 16; i++) {
- int rc = vp9_default_zig_zag1d[i];
-
- pc->Y1dequant[Q][rc] = (short)vp9_ac_yquant(Q);
- pc->Y2dequant[Q][rc] = (short)vp9_ac2quant(Q, pc->y2ac_delta_q);
- pc->UVdequant[Q][rc] = (short)vp9_ac_uv_quant(Q, pc->uvac_delta_q);
- }
- }
-}
-
-static void mb_init_dequantizer(VP9D_COMP *pbi, MACROBLOCKD *xd) {
- int i;
- int QIndex;
- VP9_COMMON *const pc = &pbi->common;
- int segment_id = xd->mode_info_context->mbmi.segment_id;
-
- // Set the Q baseline allowing for any segment level adjustment
- if (vp9_segfeature_active(xd, segment_id, SEG_LVL_ALT_Q)) {
- /* Abs Value */
- if (xd->mb_segment_abs_delta == SEGMENT_ABSDATA)
- QIndex = vp9_get_segdata(xd, segment_id, SEG_LVL_ALT_Q);
-
- /* Delta Value */
- else {
- QIndex = pc->base_qindex +
- vp9_get_segdata(xd, segment_id, SEG_LVL_ALT_Q);
- QIndex = (QIndex >= 0) ? ((QIndex <= MAXQ) ? QIndex : MAXQ) : 0; /* Clamp to valid range */
- }
- } else
- QIndex = pc->base_qindex;
- xd->q_index = QIndex;
-
- /* Set up the block level dequant pointers */
- for (i = 0; i < 16; i++) {
- xd->block[i].dequant = pc->Y1dequant[QIndex];
- }
-
-#if CONFIG_LOSSLESS
- if (!QIndex) {
- pbi->common.rtcd.idct.idct1 = vp9_short_inv_walsh4x4_1_x8_c;
- pbi->common.rtcd.idct.idct16 = vp9_short_inv_walsh4x4_x8_c;
- pbi->common.rtcd.idct.idct1_scalar_add = vp9_dc_only_inv_walsh_add_c;
- pbi->common.rtcd.idct.iwalsh1 = vp9_short_inv_walsh4x4_1_lossless_c;
- pbi->common.rtcd.idct.iwalsh16 = vp9_short_inv_walsh4x4_lossless_c;
- pbi->idct_add = vp9_dequant_idct_add_lossless_c;
- pbi->dc_idct_add = vp9_dequant_dc_idct_add_lossless_c;
- pbi->dc_idct_add_y_block = vp9_dequant_dc_idct_add_y_block_lossless_c;
- pbi->idct_add_y_block = vp9_dequant_idct_add_y_block_lossless_c;
- pbi->idct_add_uv_block = vp9_dequant_idct_add_uv_block_lossless_c;
- } else {
- pbi->common.rtcd.idct.idct1 = vp9_short_idct4x4llm_1_c;
- pbi->common.rtcd.idct.idct16 = vp9_short_idct4x4llm_c;
- pbi->common.rtcd.idct.idct1_scalar_add = vp9_dc_only_idct_add_c;
- pbi->common.rtcd.idct.iwalsh1 = vp9_short_inv_walsh4x4_1_c;
- pbi->common.rtcd.idct.iwalsh16 = vp9_short_inv_walsh4x4_c;
- pbi->idct_add = vp9_dequant_idct_add;
- pbi->dc_idct_add = vp9_dequant_dc_idct_add;
- pbi->dc_idct_add_y_block = vp9_dequant_dc_idct_add_y_block;
- pbi->idct_add_y_block = vp9_dequant_idct_add_y_block;
- pbi->idct_add_uv_block = vp9_dequant_idct_add_uv_block;
- }
-#else
- pbi->idct_add = vp9_dequant_idct_add;
- pbi->dc_idct_add = vp9_dequant_dc_idct_add;
- pbi->dc_idct_add_y_block = vp9_dequant_dc_idct_add_y_block;
- pbi->idct_add_y_block = vp9_dequant_idct_add_y_block;
- pbi->idct_add_uv_block = vp9_dequant_idct_add_uv_block;
-#endif
-
- for (i = 16; i < 24; i++) {
- xd->block[i].dequant = pc->UVdequant[QIndex];
- }
-
- xd->block[24].dequant = pc->Y2dequant[QIndex];
-
-}
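The segment-level quantizer selection at the top of mb_init_dequantizer() is an absolute-or-delta choice followed by a clamp. A small sketch is below; MAXQ and the helper name are illustrative stand-ins for the real definitions in quant_common.h.

/* Segment Q index: absolute value, or delta against the frame base Q. */
#include <stdio.h>

#define MAXQ 255  /* illustrative upper bound for the Q index range */

static int segment_qindex(int base_qindex, int seg_data, int abs_data) {
  int q = abs_data ? seg_data                /* SEGMENT_ABSDATA   */
                   : base_qindex + seg_data; /* delta against base */
  if (q < 0) q = 0;                          /* clamp to valid range */
  if (q > MAXQ) q = MAXQ;
  return q;
}

int main(void) {
  printf("%d\n", segment_qindex(60, 40, 1));   /* absolute: 40      */
  printf("%d\n", segment_qindex(60, -80, 0));  /* delta, clamped: 0 */
  return 0;
}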
-
-#if CONFIG_RUNTIME_CPU_DETECT
-#define RTCD_VTABLE(x) (&(pbi)->common.rtcd.x)
-#else
-#define RTCD_VTABLE(x) NULL
-#endif
-
-/* skip_recon_mb() is modified: instead of writing the result to the predictor buffer and then copying it
- * to the dst buffer, we write the result directly to the dst buffer. This eliminates an unnecessary copy.
- */
-static void skip_recon_mb(VP9D_COMP *pbi, MACROBLOCKD *xd) {
- if (xd->mode_info_context->mbmi.ref_frame == INTRA_FRAME) {
-#if CONFIG_SUPERBLOCKS
- if (xd->mode_info_context->mbmi.encoded_as_sb) {
- vp9_build_intra_predictors_sbuv_s(xd);
- vp9_build_intra_predictors_sby_s(xd);
- } else {
-#endif
- vp9_build_intra_predictors_mbuv_s(xd);
- vp9_build_intra_predictors_mby_s(xd);
-#if CONFIG_SUPERBLOCKS
- }
-#endif
- } else {
-#if CONFIG_SUPERBLOCKS
- if (xd->mode_info_context->mbmi.encoded_as_sb) {
- vp9_build_inter32x32_predictors_sb(xd, xd->dst.y_buffer,
- xd->dst.u_buffer, xd->dst.v_buffer,
- xd->dst.y_stride, xd->dst.uv_stride);
- } else {
-#endif
- vp9_build_1st_inter16x16_predictors_mb(xd, xd->dst.y_buffer,
- xd->dst.u_buffer, xd->dst.v_buffer,
- xd->dst.y_stride, xd->dst.uv_stride);
-
- if (xd->mode_info_context->mbmi.second_ref_frame) {
- vp9_build_2nd_inter16x16_predictors_mb(xd, xd->dst.y_buffer,
- xd->dst.u_buffer, xd->dst.v_buffer,
- xd->dst.y_stride, xd->dst.uv_stride);
- }
-#if CONFIG_SUPERBLOCKS
- }
-#endif
- }
-}
-
-static void decode_macroblock(VP9D_COMP *pbi, MACROBLOCKD *xd,
- int mb_row, unsigned int mb_col,
- BOOL_DECODER* const bc) {
- int eobtotal = 0;
- MB_PREDICTION_MODE mode;
- int i;
- int tx_size;
- TX_TYPE tx_type;
- VP9_COMMON *pc = &pbi->common;
-#if CONFIG_SUPERBLOCKS
- int orig_skip_flag = xd->mode_info_context->mbmi.mb_skip_coeff;
-#endif
-
- // re-initialize macroblock dequantizer before detokenization
- if (xd->segmentation_enabled)
- mb_init_dequantizer(pbi, xd);
-
- tx_size = xd->mode_info_context->mbmi.txfm_size;
- mode = xd->mode_info_context->mbmi.mode;
-
- if (xd->mode_info_context->mbmi.mb_skip_coeff) {
- vp9_reset_mb_tokens_context(xd);
-#if CONFIG_SUPERBLOCKS
- if (xd->mode_info_context->mbmi.encoded_as_sb &&
- (mb_col < pc->mb_cols - 1 || mb_row < pc->mb_rows - 1)) {
- if (mb_col < pc->mb_cols - 1)
- xd->above_context++;
- if (mb_row < pc->mb_rows - 1)
- xd->left_context++;
- vp9_reset_mb_tokens_context(xd);
- if (mb_col < pc->mb_cols - 1)
- xd->above_context--;
- if (mb_row < pc->mb_rows - 1)
- xd->left_context--;
- }
-#endif
- } else if (!bool_error(bc)) {
- for (i = 0; i < 25; i++) {
- xd->block[i].eob = 0;
- xd->eobs[i] = 0;
- }
- if (tx_size == TX_16X16) {
- eobtotal = vp9_decode_mb_tokens_16x16(pbi, xd, bc);
- } else if (tx_size == TX_8X8) {
- eobtotal = vp9_decode_mb_tokens_8x8(pbi, xd, bc);
- } else {
- eobtotal = vp9_decode_mb_tokens(pbi, xd, bc);
- }
- }
-
- //mode = xd->mode_info_context->mbmi.mode;
- if (pbi->common.frame_type != KEY_FRAME)
- vp9_setup_interp_filters(xd, xd->mode_info_context->mbmi.interp_filter,
- &pbi->common);
-
- if (eobtotal == 0 && mode != B_PRED && mode != SPLITMV
- && mode != I8X8_PRED
- && !bool_error(bc)) {
- /* Special case: Force the loopfilter to skip when eobtotal and
- * mb_skip_coeff are zero.
-     */
- xd->mode_info_context->mbmi.mb_skip_coeff = 1;
-
-#if CONFIG_SUPERBLOCKS
- if (!xd->mode_info_context->mbmi.encoded_as_sb || orig_skip_flag)
-#endif
- {
- skip_recon_mb(pbi, xd);
- return;
- }
- }
-
- // moved to be performed before detokenization
-// if (xd->segmentation_enabled)
-// mb_init_dequantizer(pbi, xd);
-
- /* do prediction */
- if (xd->mode_info_context->mbmi.ref_frame == INTRA_FRAME) {
-#if CONFIG_SUPERBLOCKS
- if (xd->mode_info_context->mbmi.encoded_as_sb) {
- vp9_build_intra_predictors_sby_s(xd);
- vp9_build_intra_predictors_sbuv_s(xd);
- } else
-#endif
- if (mode != I8X8_PRED) {
- vp9_build_intra_predictors_mbuv(xd);
- if (mode != B_PRED) {
- vp9_build_intra_predictors_mby(xd);
- }
- }
- } else {
-#if CONFIG_SUPERBLOCKS
- if (xd->mode_info_context->mbmi.encoded_as_sb) {
- vp9_build_inter32x32_predictors_sb(xd, xd->dst.y_buffer,
- xd->dst.u_buffer, xd->dst.v_buffer,
- xd->dst.y_stride, xd->dst.uv_stride);
- } else
-#endif
- vp9_build_inter_predictors_mb(xd);
- }
-
- /* dequantization and idct */
- if (mode == I8X8_PRED) {
- for (i = 0; i < 4; i++) {
- int ib = vp9_i8x8_block[i];
- const int iblock[4] = {0, 1, 4, 5};
- int j;
- int i8x8mode;
- BLOCKD *b;
-
- int idx = (ib & 0x02) ? (ib + 2) : ib;
-
- short *q = xd->block[idx].qcoeff;
- short *dq = xd->block[0].dequant;
- unsigned char *pre = xd->block[ib].predictor;
- unsigned char *dst = *(xd->block[ib].base_dst) + xd->block[ib].dst;
- int stride = xd->dst.y_stride;
-
- b = &xd->block[ib];
- i8x8mode = b->bmi.as_mode.first;
- vp9_intra8x8_predict(b, i8x8mode, b->predictor);
-
- if (xd->mode_info_context->mbmi.txfm_size == TX_8X8) {
- tx_type = get_tx_type(xd, &xd->block[idx]);
- if (tx_type != DCT_DCT) {
- vp9_ht_dequant_idct_add_8x8_c(tx_type,
- q, dq, pre, dst, 16, stride);
- } else {
- vp9_dequant_idct_add_8x8_c(q, dq, pre, dst, 16, stride);
- }
- q += 64;
- } else {
- for (j = 0; j < 4; j++) {
- b = &xd->block[ib + iblock[j]];
- vp9_dequant_idct_add(b->qcoeff, b->dequant, b->predictor,
- *(b->base_dst) + b->dst, 16, b->dst_stride);
- }
- }
- b = &xd->block[16 + i];
- vp9_intra_uv4x4_predict(b, i8x8mode, b->predictor);
- pbi->idct_add(b->qcoeff, b->dequant, b->predictor,
- *(b->base_dst) + b->dst, 8, b->dst_stride);
- b = &xd->block[20 + i];
- vp9_intra_uv4x4_predict(b, i8x8mode, b->predictor);
- pbi->idct_add(b->qcoeff, b->dequant, b->predictor,
- *(b->base_dst) + b->dst, 8, b->dst_stride);
- }
- } else if (mode == B_PRED) {
- for (i = 0; i < 16; i++) {
- BLOCKD *b = &xd->block[i];
- int b_mode = xd->mode_info_context->bmi[i].as_mode.first;
-#if CONFIG_COMP_INTRA_PRED
- int b_mode2 = xd->mode_info_context->bmi[i].as_mode.second;
-
- if (b_mode2 == (B_PREDICTION_MODE)(B_DC_PRED - 1)) {
-#endif
- vp9_intra4x4_predict(b, b_mode, b->predictor);
-#if CONFIG_COMP_INTRA_PRED
- } else {
- vp9_comp_intra4x4_predict(b, b_mode, b_mode2, b->predictor);
- }
-#endif
-
- tx_type = get_tx_type(xd, b);
- if (tx_type != DCT_DCT) {
- vp9_ht_dequant_idct_add_c(tx_type, b->qcoeff,
- b->dequant, b->predictor,
- *(b->base_dst) + b->dst, 16, b->dst_stride);
- } else {
- vp9_dequant_idct_add(b->qcoeff, b->dequant, b->predictor,
- *(b->base_dst) + b->dst, 16, b->dst_stride);
- }
- }
- } else if (mode == SPLITMV) {
- if (tx_size == TX_8X8) {
- vp9_dequant_idct_add_y_block_8x8(xd->qcoeff, xd->block[0].dequant,
- xd->predictor, xd->dst.y_buffer,
- xd->dst.y_stride, xd->eobs, xd);
- } else {
- pbi->idct_add_y_block(xd->qcoeff, xd->block[0].dequant,
- xd->predictor, xd->dst.y_buffer,
- xd->dst.y_stride, xd->eobs);
- }
- } else {
- BLOCKD *b = &xd->block[24];
-
- if (tx_size == TX_16X16) {
- BLOCKD *bd = &xd->block[0];
- tx_type = get_tx_type(xd, bd);
- if (tx_type != DCT_DCT) {
- vp9_ht_dequant_idct_add_16x16_c(tx_type, xd->qcoeff,
- xd->block[0].dequant, xd->predictor,
- xd->dst.y_buffer, 16, xd->dst.y_stride);
- } else {
- vp9_dequant_idct_add_16x16(xd->qcoeff, xd->block[0].dequant,
- xd->predictor, xd->dst.y_buffer,
- 16, xd->dst.y_stride);
- }
- } else if (tx_size == TX_8X8) {
-#if CONFIG_SUPERBLOCKS
- void *orig = xd->mode_info_context;
- int n, num = xd->mode_info_context->mbmi.encoded_as_sb ? 4 : 1;
- for (n = 0; n < num; n++) {
- int x_idx = n & 1, y_idx = n >> 1;
- if (num == 4 && (mb_col + x_idx >= pc->mb_cols ||
- mb_row + y_idx >= pc->mb_rows))
- continue;
-
- if (n != 0) {
- for (i = 0; i < 25; i++) {
- xd->block[i].eob = 0;
- xd->eobs[i] = 0;
- }
- xd->above_context = pc->above_context + mb_col + (n & 1);
- xd->left_context = pc->left_context + (n >> 1);
- xd->mode_info_context = orig;
- xd->mode_info_context += (n & 1);
- xd->mode_info_context += (n >> 1) * pc->mode_info_stride;
- if (!orig_skip_flag) {
- eobtotal = vp9_decode_mb_tokens_8x8(pbi, xd, bc);
- if (eobtotal == 0) // skip loopfilter
- xd->mode_info_context->mbmi.mb_skip_coeff = 1;
- } else {
- vp9_reset_mb_tokens_context(xd);
- }
- }
-
- if (xd->mode_info_context->mbmi.mb_skip_coeff)
- continue; // only happens for SBs, which are already in dest buffer
-#endif
- vp9_dequantize_b_2x2(b);
- IDCT_INVOKE(RTCD_VTABLE(idct), ihaar2)(&b->dqcoeff[0], b->diff, 8);
-      ((int *)b->qcoeff)[0] = 0;  // 2nd order blocks are set to 0 after the inverse transform
- ((int *)b->qcoeff)[1] = 0;
- ((int *)b->qcoeff)[2] = 0;
- ((int *)b->qcoeff)[3] = 0;
- ((int *)b->qcoeff)[4] = 0;
- ((int *)b->qcoeff)[5] = 0;
- ((int *)b->qcoeff)[6] = 0;
- ((int *)b->qcoeff)[7] = 0;
-#if CONFIG_SUPERBLOCKS
- if (xd->mode_info_context->mbmi.encoded_as_sb) {
- vp9_dequant_dc_idct_add_y_block_8x8_inplace_c(xd->qcoeff,
- xd->block[0].dequant,
- xd->dst.y_buffer + (n >> 1) * 16 * xd->dst.y_stride + (n & 1) * 16,
- xd->dst.y_stride, xd->eobs, xd->block[24].diff, xd);
- // do UV inline also
- vp9_dequant_idct_add_uv_block_8x8_inplace_c(xd->qcoeff + 16 * 16,
- xd->block[16].dequant,
- xd->dst.u_buffer + (n >> 1) * 8 * xd->dst.uv_stride + (n & 1) * 8,
- xd->dst.v_buffer + (n >> 1) * 8 * xd->dst.uv_stride + (n & 1) * 8,
- xd->dst.uv_stride, xd->eobs + 16, xd);
- } else
-#endif
- vp9_dequant_dc_idct_add_y_block_8x8(xd->qcoeff,
- xd->block[0].dequant, xd->predictor, xd->dst.y_buffer,
- xd->dst.y_stride, xd->eobs, xd->block[24].diff, xd);
-#if CONFIG_SUPERBLOCKS
- }
- xd->mode_info_context = orig;
-#endif
- } else {
- vp9_dequantize_b(b);
- if (xd->eobs[24] > 1) {
- IDCT_INVOKE(RTCD_VTABLE(idct), iwalsh16)(&b->dqcoeff[0], b->diff);
- ((int *)b->qcoeff)[0] = 0;
- ((int *)b->qcoeff)[1] = 0;
- ((int *)b->qcoeff)[2] = 0;
- ((int *)b->qcoeff)[3] = 0;
- ((int *)b->qcoeff)[4] = 0;
- ((int *)b->qcoeff)[5] = 0;
- ((int *)b->qcoeff)[6] = 0;
- ((int *)b->qcoeff)[7] = 0;
- } else {
- IDCT_INVOKE(RTCD_VTABLE(idct), iwalsh1)(&b->dqcoeff[0], b->diff);
- ((int *)b->qcoeff)[0] = 0;
- }
-
- pbi->dc_idct_add_y_block(xd->qcoeff, xd->block[0].dequant, xd->predictor,
- xd->dst.y_buffer, xd->dst.y_stride, xd->eobs,
- xd->block[24].diff);
- }
- }
-
-#if CONFIG_SUPERBLOCKS
- if (!xd->mode_info_context->mbmi.encoded_as_sb) {
-#endif
- if ((tx_size == TX_8X8 &&
- xd->mode_info_context->mbmi.mode != I8X8_PRED &&
- xd->mode_info_context->mbmi.mode != SPLITMV)
- || tx_size == TX_16X16
- )
- vp9_dequant_idct_add_uv_block_8x8
- (xd->qcoeff + 16 * 16, xd->block[16].dequant,
- xd->predictor + 16 * 16, xd->dst.u_buffer, xd->dst.v_buffer,
-       xd->dst.uv_stride, xd->eobs + 16, xd);
- else if (xd->mode_info_context->mbmi.mode != I8X8_PRED)
- pbi->idct_add_uv_block(xd->qcoeff + 16 * 16, xd->block[16].dequant,
- xd->predictor + 16 * 16, xd->dst.u_buffer, xd->dst.v_buffer,
- xd->dst.uv_stride, xd->eobs + 16);
-#if CONFIG_SUPERBLOCKS
- }
-#endif
-}
-
-
-static int get_delta_q(vp9_reader *bc, int prev, int *q_update) {
- int ret_val = 0;
-
- if (vp9_read_bit(bc)) {
- ret_val = vp9_read_literal(bc, 4);
-
- if (vp9_read_bit(bc))
- ret_val = -ret_val;
- }
-
- /* Trigger a quantizer update if the delta-q value has changed */
- if (ret_val != prev)
- *q_update = 1;
-
- return ret_val;
-}
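
For reference, here is a minimal standalone sketch of the delta-q syntax that get_delta_q() above reads: an update flag, then a 4-bit magnitude, then a sign bit. The bit_reader type and next_bit() helper are illustrative stand-ins for the codec's boolean decoder, not its actual API.

#include <stdio.h>

typedef struct { const unsigned char *bits; int pos; } bit_reader;  /* stand-in, not the real boolean decoder */

static int next_bit(bit_reader *r) { return r->bits[r->pos++]; }

static int read_delta_q_sketch(bit_reader *r, int prev, int *q_update) {
  int val = 0;
  if (next_bit(r)) {                 /* update flag */
    int i;
    for (i = 0; i < 4; i++)          /* 4-bit magnitude, MSB first */
      val = (val << 1) | next_bit(r);
    if (next_bit(r))                 /* sign flag */
      val = -val;
  }
  if (val != prev)                   /* a changed delta triggers a quantizer rebuild */
    *q_update = 1;
  return val;
}

int main(void) {
  const unsigned char stream[] = { 1, 0, 1, 0, 1, 1 };  /* update, 0b0101 = 5, negative */
  bit_reader r = { stream, 0 };
  int q_update = 0;
  int dq = read_delta_q_sketch(&r, 0, &q_update);
  printf("delta-q = %d, q_update = %d\n", dq, q_update);  /* -5, 1 */
  return 0;
}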
-
-#ifdef PACKET_TESTING
-#include <stdio.h>
-FILE *vpxlog = 0;
-#endif
-
-/* Decode a row of Superblocks (2x2 region of MBs) */
-static void
-decode_sb_row(VP9D_COMP *pbi, VP9_COMMON *pc, int mbrow, MACROBLOCKD *xd,
- BOOL_DECODER* const bc) {
- int i;
- int sb_col;
- int mb_row, mb_col;
- int recon_yoffset, recon_uvoffset;
- int ref_fb_idx = pc->lst_fb_idx;
- int dst_fb_idx = pc->new_fb_idx;
- int recon_y_stride = pc->yv12_fb[ref_fb_idx].y_stride;
- int recon_uv_stride = pc->yv12_fb[ref_fb_idx].uv_stride;
- int row_delta[4] = { 0, +1, 0, -1};
- int col_delta[4] = { +1, -1, +1, +1};
- int sb_cols = (pc->mb_cols + 1) >> 1;
-
-  // For a SB there are 2 left contexts, each pertaining to a MB row within the SB
- vpx_memset(pc->left_context, 0, sizeof(pc->left_context));
-
- mb_row = mbrow;
- mb_col = 0;
-
- for (sb_col = 0; sb_col < sb_cols; sb_col++) {
- MODE_INFO *mi = xd->mode_info_context;
-
-#if CONFIG_SUPERBLOCKS
- mi->mbmi.encoded_as_sb = vp9_read(bc, pc->sb_coded);
-#endif
-
- // Process the 4 MBs within the SB in the order:
- // top-left, top-right, bottom-left, bottom-right
- for (i = 0; i < 4; i++) {
- int dy = row_delta[i];
- int dx = col_delta[i];
- int offset_extended = dy * xd->mode_info_stride + dx;
-
- xd->mb_index = i;
-
- mi = xd->mode_info_context;
- if ((mb_row >= pc->mb_rows) || (mb_col >= pc->mb_cols)) {
- // MB lies outside frame, skip on to next
- mb_row += dy;
- mb_col += dx;
- xd->mode_info_context += offset_extended;
- xd->prev_mode_info_context += offset_extended;
- continue;
- }
-
- // Set above context pointer
- xd->above_context = pc->above_context + mb_col;
- xd->left_context = pc->left_context + (i >> 1);
-
- /* Distance of Mb to the various image edges.
- * These are specified to 8th pel as they are always compared to
- * values that are in 1/8th pel units
- */
- xd->mb_to_top_edge = -((mb_row * 16)) << 3;
- xd->mb_to_bottom_edge = ((pc->mb_rows - 1 - mb_row) * 16) << 3;
-
- xd->mb_to_left_edge = -((mb_col * 16) << 3);
- xd->mb_to_right_edge = ((pc->mb_cols - 1 - mb_col) * 16) << 3;
-
- xd->up_available = (mb_row != 0);
- xd->left_available = (mb_col != 0);
-
-
- recon_yoffset = (mb_row * recon_y_stride * 16) + (mb_col * 16);
- recon_uvoffset = (mb_row * recon_uv_stride * 8) + (mb_col * 8);
-
- xd->dst.y_buffer = pc->yv12_fb[dst_fb_idx].y_buffer + recon_yoffset;
- xd->dst.u_buffer = pc->yv12_fb[dst_fb_idx].u_buffer + recon_uvoffset;
- xd->dst.v_buffer = pc->yv12_fb[dst_fb_idx].v_buffer + recon_uvoffset;
-
-#if CONFIG_SUPERBLOCKS
- if (i)
- mi->mbmi.encoded_as_sb = 0;
-#endif
- vp9_decode_mb_mode_mv(pbi, xd, mb_row, mb_col, bc);
-
- update_blockd_bmi(xd);
-
- /* Select the appropriate reference frame for this MB */
- if (xd->mode_info_context->mbmi.ref_frame == LAST_FRAME)
- ref_fb_idx = pc->lst_fb_idx;
- else if (xd->mode_info_context->mbmi.ref_frame == GOLDEN_FRAME)
- ref_fb_idx = pc->gld_fb_idx;
- else
- ref_fb_idx = pc->alt_fb_idx;
-
- xd->pre.y_buffer = pc->yv12_fb[ref_fb_idx].y_buffer + recon_yoffset;
- xd->pre.u_buffer = pc->yv12_fb[ref_fb_idx].u_buffer + recon_uvoffset;
- xd->pre.v_buffer = pc->yv12_fb[ref_fb_idx].v_buffer + recon_uvoffset;
-
- if (xd->mode_info_context->mbmi.second_ref_frame) {
- int second_ref_fb_idx;
-
- /* Select the appropriate reference frame for this MB */
- if (xd->mode_info_context->mbmi.second_ref_frame == LAST_FRAME)
- second_ref_fb_idx = pc->lst_fb_idx;
- else if (xd->mode_info_context->mbmi.second_ref_frame ==
- GOLDEN_FRAME)
- second_ref_fb_idx = pc->gld_fb_idx;
- else
- second_ref_fb_idx = pc->alt_fb_idx;
-
- xd->second_pre.y_buffer =
- pc->yv12_fb[second_ref_fb_idx].y_buffer + recon_yoffset;
- xd->second_pre.u_buffer =
- pc->yv12_fb[second_ref_fb_idx].u_buffer + recon_uvoffset;
- xd->second_pre.v_buffer =
- pc->yv12_fb[second_ref_fb_idx].v_buffer + recon_uvoffset;
- }
-
- if (xd->mode_info_context->mbmi.ref_frame != INTRA_FRAME) {
- /* propagate errors from reference frames */
- xd->corrupted |= pc->yv12_fb[ref_fb_idx].corrupted;
- }
-
-#if CONFIG_SUPERBLOCKS
- if (xd->mode_info_context->mbmi.encoded_as_sb) {
- if (mb_col < pc->mb_cols - 1)
- mi[1] = mi[0];
- if (mb_row < pc->mb_rows - 1) {
- mi[pc->mode_info_stride] = mi[0];
- if (mb_col < pc->mb_cols - 1)
- mi[pc->mode_info_stride + 1] = mi[0];
- }
- }
-#endif
- vp9_intra_prediction_down_copy(xd);
- decode_macroblock(pbi, xd, mb_row, mb_col, bc);
-
- /* check if the boolean decoder has suffered an error */
- xd->corrupted |= bool_error(bc);
-
-#if CONFIG_SUPERBLOCKS
- if (mi->mbmi.encoded_as_sb) {
- assert(!i);
- mb_col += 2;
- xd->mode_info_context += 2;
- xd->prev_mode_info_context += 2;
- break;
- }
-#endif
-
- // skip to next MB
- xd->mode_info_context += offset_extended;
- xd->prev_mode_info_context += offset_extended;
- mb_row += dy;
- mb_col += dx;
- }
- }
-
- /* skip prediction column */
- xd->mode_info_context += 1 - (pc->mb_cols & 0x1) + xd->mode_info_stride;
- xd->prev_mode_info_context += 1 - (pc->mb_cols & 0x1) + xd->mode_info_stride;
-}
-
-static unsigned int read_partition_size(const unsigned char *cx_size) {
- const unsigned int size =
- cx_size[0] + (cx_size[1] << 8) + (cx_size[2] << 16);
- return size;
-}
-
-static int read_is_valid(const unsigned char *start,
- size_t len,
- const unsigned char *end) {
- return (start + len > start && start + len <= end);
-}
-
-
-static void setup_token_decoder(VP9D_COMP *pbi,
- const unsigned char *cx_data,
- BOOL_DECODER* const bool_decoder) {
- VP9_COMMON *pc = &pbi->common;
- const unsigned char *user_data_end = pbi->Source + pbi->source_sz;
- const unsigned char *partition;
-
- ptrdiff_t partition_size;
- ptrdiff_t bytes_left;
-
- // Set up pointers to token partition
- partition = cx_data;
- bytes_left = user_data_end - partition;
- partition_size = bytes_left;
-
- /* Validate the calculated partition length. If the buffer
- * described by the partition can't be fully read, then restrict
- * it to the portion that can be (for EC mode) or throw an error.
- */
- if (!read_is_valid(partition, partition_size, user_data_end)) {
- vpx_internal_error(&pc->error, VPX_CODEC_CORRUPT_FRAME,
- "Truncated packet or corrupt partition "
- "%d length", 1);
- }
-
- if (vp9_start_decode(bool_decoder, partition, partition_size))
- vpx_internal_error(&pc->error, VPX_CODEC_MEM_ERROR,
- "Failed to allocate bool decoder %d", 1);
-}
-
-static void init_frame(VP9D_COMP *pbi) {
- VP9_COMMON *const pc = &pbi->common;
- MACROBLOCKD *const xd = &pbi->mb;
-
- if (pc->frame_type == KEY_FRAME) {
- /* Various keyframe initializations */
- vp9_init_mv_probs(pc);
-
- vp9_init_mbmode_probs(pc);
- vp9_default_bmode_probs(pc->fc.bmode_prob);
-
- vp9_default_coef_probs(pc);
- vp9_kf_default_bmode_probs(pc->kf_bmode_prob);
-
- // Reset the segment feature data to the default stats:
- // Features disabled, 0, with delta coding (Default state).
- vp9_clearall_segfeatures(xd);
-
- xd->mb_segment_abs_delta = SEGMENT_DELTADATA;
-
-    /* reset the mode ref deltas for the loop filter */
- vpx_memset(xd->ref_lf_deltas, 0, sizeof(xd->ref_lf_deltas));
- vpx_memset(xd->mode_lf_deltas, 0, sizeof(xd->mode_lf_deltas));
-
- /* All buffers are implicitly updated on key frames. */
- pc->refresh_golden_frame = 1;
- pc->refresh_alt_ref_frame = 1;
- pc->copy_buffer_to_gf = 0;
- pc->copy_buffer_to_arf = 0;
-
- /* Note that Golden and Altref modes cannot be used on a key frame so
- * ref_frame_sign_bias[] is undefined and meaningless
- */
- pc->ref_frame_sign_bias[GOLDEN_FRAME] = 0;
- pc->ref_frame_sign_bias[ALTREF_FRAME] = 0;
-
- vp9_init_mode_contexts(&pbi->common);
- vpx_memcpy(&pc->lfc, &pc->fc, sizeof(pc->fc));
- vpx_memcpy(&pc->lfc_a, &pc->fc, sizeof(pc->fc));
-
- vpx_memcpy(pbi->common.fc.vp8_mode_contexts,
- pbi->common.fc.mode_context,
- sizeof(pbi->common.fc.mode_context));
- vpx_memset(pc->prev_mip, 0,
- (pc->mb_cols + 1) * (pc->mb_rows + 1)* sizeof(MODE_INFO));
- vpx_memset(pc->mip, 0,
- (pc->mb_cols + 1) * (pc->mb_rows + 1)* sizeof(MODE_INFO));
-
- vp9_update_mode_info_border(pc, pc->mip);
- vp9_update_mode_info_in_image(pc, pc->mi);
-
- } else {
-
- if (!pc->use_bilinear_mc_filter)
- pc->mcomp_filter_type = EIGHTTAP;
- else
- pc->mcomp_filter_type = BILINEAR;
-
- /* To enable choice of different interpolation filters */
- vp9_setup_interp_filters(xd, pc->mcomp_filter_type, pc);
- }
-
- xd->mode_info_context = pc->mi;
- xd->prev_mode_info_context = pc->prev_mi;
- xd->frame_type = pc->frame_type;
- xd->mode_info_context->mbmi.mode = DC_PRED;
- xd->mode_info_stride = pc->mode_info_stride;
- xd->corrupted = 0; /* init without corruption */
-
- xd->fullpixel_mask = 0xffffffff;
- if (pc->full_pixel)
- xd->fullpixel_mask = 0xfffffff8;
-
-}
-
-#if 0
-static void read_coef_probs2(VP9D_COMP *pbi) {
- const vp9_prob grpupd = 192;
- int i, j, k, l;
- vp9_reader *const bc = &pbi->bc;
- VP9_COMMON *const pc = &pbi->common;
- for (l = 0; l < ENTROPY_NODES; l++) {
- if (vp9_read(bc, grpupd)) {
- // printf("Decoding %d\n", l);
- for (i = 0; i < BLOCK_TYPES; i++)
- for (j = !i; j < COEF_BANDS; j++)
- for (k = 0; k < PREV_COEF_CONTEXTS; k++) {
- if (k >= 3 && ((i == 0 && j == 1) ||
- (i > 0 && j == 0)))
- continue;
- {
- vp9_prob *const p = pc->fc.coef_probs [i][j][k] + l;
- int u = vp9_read(bc, COEF_UPDATE_PROB);
- if (u) *p = read_prob_diff_update(bc, *p);
- }
- }
- }
- }
- if (pbi->common.txfm_mode == ALLOW_8X8) {
- for (l = 0; l < ENTROPY_NODES; l++) {
- if (vp9_read(bc, grpupd)) {
- for (i = 0; i < BLOCK_TYPES_8X8; i++)
- for (j = !i; j < COEF_BANDS; j++)
- for (k = 0; k < PREV_COEF_CONTEXTS; k++) {
- if (k >= 3 && ((i == 0 && j == 1) ||
- (i > 0 && j == 0)))
- continue;
- {
- vp9_prob *const p = pc->fc.coef_probs_8x8 [i][j][k] + l;
-
- int u = vp9_read(bc, COEF_UPDATE_PROB_8X8);
- if (u) *p = read_prob_diff_update(bc, *p);
- }
- }
- }
- }
- }
-}
-#endif
-
-static void read_coef_probs_common(
- BOOL_DECODER* const bc,
- vp9_prob coef_probs[BLOCK_TYPES][COEF_BANDS]
- [PREV_COEF_CONTEXTS][ENTROPY_NODES]) {
- int i, j, k, l;
-
- if (vp9_read_bit(bc)) {
- for (i = 0; i < BLOCK_TYPES; i++) {
- for (j = !i; j < COEF_BANDS; j++) {
- /* NB: This j loop starts from 1 on block type i == 0 */
- for (k = 0; k < PREV_COEF_CONTEXTS; k++) {
- if (k >= 3 && ((i == 0 && j == 1) ||
- (i > 0 && j == 0)))
- continue;
- for (l = 0; l < ENTROPY_NODES; l++) {
- vp9_prob *const p = coef_probs[i][j][k] + l;
-
- if (vp9_read(bc, COEF_UPDATE_PROB)) {
- *p = read_prob_diff_update(bc, *p);
- }
- }
- }
- }
- }
- }
-}
-
-static void read_coef_probs(VP9D_COMP *pbi, BOOL_DECODER* const bc) {
- VP9_COMMON *const pc = &pbi->common;
-
- read_coef_probs_common(bc, pc->fc.coef_probs);
- read_coef_probs_common(bc, pc->fc.hybrid_coef_probs);
-
- if (pbi->common.txfm_mode != ONLY_4X4) {
- read_coef_probs_common(bc, pc->fc.coef_probs_8x8);
- read_coef_probs_common(bc, pc->fc.hybrid_coef_probs_8x8);
- }
- if (pbi->common.txfm_mode > ALLOW_8X8) {
- read_coef_probs_common(bc, pc->fc.coef_probs_16x16);
- read_coef_probs_common(bc, pc->fc.hybrid_coef_probs_16x16);
- }
-}
-
-int vp9_decode_frame(VP9D_COMP *pbi) {
- BOOL_DECODER header_bc, residual_bc;
- VP9_COMMON *const pc = &pbi->common;
- MACROBLOCKD *const xd = &pbi->mb;
- const unsigned char *data = (const unsigned char *)pbi->Source;
- const unsigned char *data_end = data + pbi->source_sz;
- ptrdiff_t first_partition_length_in_bytes = 0;
-
- int mb_row;
- int i, j;
- int corrupt_tokens = 0;
-
- /* start with no corruption of current frame */
- xd->corrupted = 0;
- pc->yv12_fb[pc->new_fb_idx].corrupted = 0;
-
- if (data_end - data < 3) {
- vpx_internal_error(&pc->error, VPX_CODEC_CORRUPT_FRAME,
- "Truncated packet");
- } else {
- pc->last_frame_type = pc->frame_type;
- pc->frame_type = (FRAME_TYPE)(data[0] & 1);
- pc->version = (data[0] >> 1) & 7;
- pc->show_frame = (data[0] >> 4) & 1;
- first_partition_length_in_bytes =
- (data[0] | (data[1] << 8) | (data[2] << 16)) >> 5;
-
- if ((data + first_partition_length_in_bytes > data_end
- || data + first_partition_length_in_bytes < data))
- vpx_internal_error(&pc->error, VPX_CODEC_CORRUPT_FRAME,
- "Truncated packet or corrupt partition 0 length");
-
- data += 3;
-
- vp9_setup_version(pc);
-
- if (pc->frame_type == KEY_FRAME) {
- const int Width = pc->Width;
- const int Height = pc->Height;
-
- /* vet via sync code */
- /* When error concealment is enabled we should only check the sync
- * code if we have enough bits available
- */
- if (data + 3 < data_end) {
- if (data[0] != 0x9d || data[1] != 0x01 || data[2] != 0x2a)
- vpx_internal_error(&pc->error, VPX_CODEC_UNSUP_BITSTREAM,
- "Invalid frame sync code");
- }
-
- /* If error concealment is enabled we should only parse the new size
- * if we have enough data. Otherwise we will end up with the wrong
- * size.
- */
- if (data + 6 < data_end) {
- pc->Width = (data[3] | (data[4] << 8)) & 0x3fff;
- pc->horiz_scale = data[4] >> 6;
- pc->Height = (data[5] | (data[6] << 8)) & 0x3fff;
- pc->vert_scale = data[6] >> 6;
- }
- data += 7;
-
- if (Width != pc->Width || Height != pc->Height) {
- if (pc->Width <= 0) {
- pc->Width = Width;
- vpx_internal_error(&pc->error, VPX_CODEC_CORRUPT_FRAME,
- "Invalid frame width");
- }
-
- if (pc->Height <= 0) {
- pc->Height = Height;
- vpx_internal_error(&pc->error, VPX_CODEC_CORRUPT_FRAME,
- "Invalid frame height");
- }
-
- if (vp9_alloc_frame_buffers(pc, pc->Width, pc->Height))
- vpx_internal_error(&pc->error, VPX_CODEC_MEM_ERROR,
- "Failed to allocate frame buffers");
- }
- }
- }
-
- if ((!pbi->decoded_key_frame && pc->frame_type != KEY_FRAME) ||
- pc->Width == 0 || pc->Height == 0) {
- return -1;
- }
-
- init_frame(pbi);
-
- if (vp9_start_decode(&header_bc, data, first_partition_length_in_bytes))
- vpx_internal_error(&pc->error, VPX_CODEC_MEM_ERROR,
- "Failed to allocate bool decoder 0");
- if (pc->frame_type == KEY_FRAME) {
- pc->clr_type = (YUV_TYPE)vp9_read_bit(&header_bc);
- pc->clamp_type = (CLAMP_TYPE)vp9_read_bit(&header_bc);
- }
-
- /* Is segmentation enabled */
- xd->segmentation_enabled = (unsigned char)vp9_read_bit(&header_bc);
-
- if (xd->segmentation_enabled) {
- // Read whether or not the segmentation map is being explicitly
- // updated this frame.
- xd->update_mb_segmentation_map = (unsigned char)vp9_read_bit(&header_bc);
-
- // If so what method will be used.
- if (xd->update_mb_segmentation_map) {
- // Which macro block level features are enabled
-
- // Read the probs used to decode the segment id for each macro
- // block.
- for (i = 0; i < MB_FEATURE_TREE_PROBS; i++) {
- xd->mb_segment_tree_probs[i] = vp9_read_bit(&header_bc) ?
- (vp9_prob)vp9_read_literal(&header_bc, 8) : 255;
- }
-
- // Read the prediction probs needed to decode the segment id
- pc->temporal_update = (unsigned char)vp9_read_bit(&header_bc);
- for (i = 0; i < PREDICTION_PROBS; i++) {
- if (pc->temporal_update) {
- pc->segment_pred_probs[i] = vp9_read_bit(&header_bc) ?
- (vp9_prob)vp9_read_literal(&header_bc, 8) : 255;
- } else {
- pc->segment_pred_probs[i] = 255;
- }
- }
- }
- // Is the segment data being updated
- xd->update_mb_segmentation_data = (unsigned char)vp9_read_bit(&header_bc);
-
- if (xd->update_mb_segmentation_data) {
- int data;
-
- xd->mb_segment_abs_delta = (unsigned char)vp9_read_bit(&header_bc);
-
- vp9_clearall_segfeatures(xd);
-
-      // For each segment...
- for (i = 0; i < MAX_MB_SEGMENTS; i++) {
-        // For each of the segment's features...
- for (j = 0; j < SEG_LVL_MAX; j++) {
- // Is the feature enabled
- if (vp9_read_bit(&header_bc)) {
- // Update the feature data and mask
- vp9_enable_segfeature(xd, i, j);
-
- data = (signed char)vp9_read_literal(
- &header_bc, vp9_seg_feature_data_bits(j));
-
- // Is the segment data signed..
- if (vp9_is_segfeature_signed(j)) {
- if (vp9_read_bit(&header_bc))
- data = - data;
- }
- } else
- data = 0;
-
- vp9_set_segdata(xd, i, j, data);
- }
- }
- }
- }
-
- // Read common prediction model status flag probability updates for the
- // reference frame
- if (pc->frame_type == KEY_FRAME) {
- // Set the prediction probabilities to defaults
- pc->ref_pred_probs[0] = 120;
- pc->ref_pred_probs[1] = 80;
- pc->ref_pred_probs[2] = 40;
- } else {
- for (i = 0; i < PREDICTION_PROBS; i++) {
- if (vp9_read_bit(&header_bc))
- pc->ref_pred_probs[i] = (vp9_prob)vp9_read_literal(&header_bc, 8);
- }
- }
-
-#if CONFIG_SUPERBLOCKS
- pc->sb_coded = vp9_read_literal(&header_bc, 8);
-#endif
-
- /* Read the loop filter level and type */
- pc->txfm_mode = vp9_read_literal(&header_bc, 2);
- if (pc->txfm_mode == TX_MODE_SELECT) {
- pc->prob_tx[0] = vp9_read_literal(&header_bc, 8);
- pc->prob_tx[1] = vp9_read_literal(&header_bc, 8);
- }
-
- pc->filter_type = (LOOPFILTERTYPE) vp9_read_bit(&header_bc);
- pc->filter_level = vp9_read_literal(&header_bc, 6);
- pc->sharpness_level = vp9_read_literal(&header_bc, 3);
-
- /* Read in loop filter deltas applied at the MB level based on mode or ref frame. */
- xd->mode_ref_lf_delta_update = 0;
- xd->mode_ref_lf_delta_enabled = (unsigned char)vp9_read_bit(&header_bc);
-
- if (xd->mode_ref_lf_delta_enabled) {
- /* Do the deltas need to be updated */
- xd->mode_ref_lf_delta_update = (unsigned char)vp9_read_bit(&header_bc);
-
- if (xd->mode_ref_lf_delta_update) {
- /* Send update */
- for (i = 0; i < MAX_REF_LF_DELTAS; i++) {
- if (vp9_read_bit(&header_bc)) {
- /*sign = vp9_read_bit( &header_bc );*/
- xd->ref_lf_deltas[i] = (signed char)vp9_read_literal(&header_bc, 6);
-
- if (vp9_read_bit(&header_bc)) /* Apply sign */
- xd->ref_lf_deltas[i] = xd->ref_lf_deltas[i] * -1;
- }
- }
-
- /* Send update */
- for (i = 0; i < MAX_MODE_LF_DELTAS; i++) {
- if (vp9_read_bit(&header_bc)) {
- /*sign = vp9_read_bit( &header_bc );*/
- xd->mode_lf_deltas[i] = (signed char)vp9_read_literal(&header_bc, 6);
-
- if (vp9_read_bit(&header_bc)) /* Apply sign */
- xd->mode_lf_deltas[i] = xd->mode_lf_deltas[i] * -1;
- }
- }
- }
- }
-
- // Dummy read for now
- vp9_read_literal(&header_bc, 2);
-
- setup_token_decoder(pbi, data + first_partition_length_in_bytes,
- &residual_bc);
-
- /* Read the default quantizers. */
- {
- int Q, q_update;
-
- Q = vp9_read_literal(&header_bc, QINDEX_BITS);
- pc->base_qindex = Q;
- q_update = 0;
- /* AC 1st order Q = default */
- pc->y1dc_delta_q = get_delta_q(&header_bc, pc->y1dc_delta_q, &q_update);
- pc->y2dc_delta_q = get_delta_q(&header_bc, pc->y2dc_delta_q, &q_update);
- pc->y2ac_delta_q = get_delta_q(&header_bc, pc->y2ac_delta_q, &q_update);
- pc->uvdc_delta_q = get_delta_q(&header_bc, pc->uvdc_delta_q, &q_update);
- pc->uvac_delta_q = get_delta_q(&header_bc, pc->uvac_delta_q, &q_update);
-
- if (q_update)
- vp9_init_de_quantizer(pbi);
-
- /* MB level dequantizer setup */
- mb_init_dequantizer(pbi, &pbi->mb);
- }
-
- /* Determine if the golden frame or ARF buffer should be updated and how.
- * For all non key frames the GF and ARF refresh flags and sign bias
- * flags must be set explicitly.
- */
- if (pc->frame_type != KEY_FRAME) {
- /* Should the GF or ARF be updated from the current frame */
- pc->refresh_golden_frame = vp9_read_bit(&header_bc);
- pc->refresh_alt_ref_frame = vp9_read_bit(&header_bc);
-
- if (pc->refresh_alt_ref_frame) {
- vpx_memcpy(&pc->fc, &pc->lfc_a, sizeof(pc->fc));
- vpx_memcpy(pc->fc.vp8_mode_contexts,
- pc->fc.mode_context_a,
- sizeof(pc->fc.vp8_mode_contexts));
- } else {
- vpx_memcpy(&pc->fc, &pc->lfc, sizeof(pc->fc));
- vpx_memcpy(pc->fc.vp8_mode_contexts,
- pc->fc.mode_context,
- sizeof(pc->fc.vp8_mode_contexts));
- }
-
- /* Buffer to buffer copy flags. */
- pc->copy_buffer_to_gf = 0;
-
- if (!pc->refresh_golden_frame)
- pc->copy_buffer_to_gf = vp9_read_literal(&header_bc, 2);
-
- pc->copy_buffer_to_arf = 0;
-
- if (!pc->refresh_alt_ref_frame)
- pc->copy_buffer_to_arf = vp9_read_literal(&header_bc, 2);
-
- pc->ref_frame_sign_bias[GOLDEN_FRAME] = vp9_read_bit(&header_bc);
- pc->ref_frame_sign_bias[ALTREF_FRAME] = vp9_read_bit(&header_bc);
-
- /* Is high precision mv allowed */
- xd->allow_high_precision_mv = (unsigned char)vp9_read_bit(&header_bc);
- // Read the type of subpel filter to use
- if (vp9_read_bit(&header_bc)) {
- pc->mcomp_filter_type = SWITCHABLE;
- } else {
- pc->mcomp_filter_type = vp9_read_literal(&header_bc, 2);
- }
-    /* To enable choice of different interpolation filters */
- vp9_setup_interp_filters(xd, pc->mcomp_filter_type, pc);
- }
-
- pc->refresh_entropy_probs = vp9_read_bit(&header_bc);
- if (pc->refresh_entropy_probs == 0) {
- vpx_memcpy(&pc->lfc, &pc->fc, sizeof(pc->fc));
- }
-
- pc->refresh_last_frame = (pc->frame_type == KEY_FRAME)
- || vp9_read_bit(&header_bc);
-
- if (0) {
- FILE *z = fopen("decodestats.stt", "a");
- fprintf(z, "%6d F:%d,G:%d,A:%d,L:%d,Q:%d\n",
- pc->current_video_frame,
- pc->frame_type,
- pc->refresh_golden_frame,
- pc->refresh_alt_ref_frame,
- pc->refresh_last_frame,
- pc->base_qindex);
- fclose(z);
- }
-
- vp9_copy(pbi->common.fc.pre_coef_probs,
- pbi->common.fc.coef_probs);
- vp9_copy(pbi->common.fc.pre_hybrid_coef_probs,
- pbi->common.fc.hybrid_coef_probs);
- vp9_copy(pbi->common.fc.pre_coef_probs_8x8,
- pbi->common.fc.coef_probs_8x8);
- vp9_copy(pbi->common.fc.pre_hybrid_coef_probs_8x8,
- pbi->common.fc.hybrid_coef_probs_8x8);
- vp9_copy(pbi->common.fc.pre_coef_probs_16x16,
- pbi->common.fc.coef_probs_16x16);
- vp9_copy(pbi->common.fc.pre_hybrid_coef_probs_16x16,
- pbi->common.fc.hybrid_coef_probs_16x16);
- vp9_copy(pbi->common.fc.pre_ymode_prob, pbi->common.fc.ymode_prob);
- vp9_copy(pbi->common.fc.pre_uv_mode_prob, pbi->common.fc.uv_mode_prob);
- vp9_copy(pbi->common.fc.pre_bmode_prob, pbi->common.fc.bmode_prob);
- vp9_copy(pbi->common.fc.pre_i8x8_mode_prob, pbi->common.fc.i8x8_mode_prob);
- vp9_copy(pbi->common.fc.pre_sub_mv_ref_prob, pbi->common.fc.sub_mv_ref_prob);
- vp9_copy(pbi->common.fc.pre_mbsplit_prob, pbi->common.fc.mbsplit_prob);
- pbi->common.fc.pre_nmvc = pbi->common.fc.nmvc;
- vp9_zero(pbi->common.fc.coef_counts);
- vp9_zero(pbi->common.fc.hybrid_coef_counts);
- vp9_zero(pbi->common.fc.coef_counts_8x8);
- vp9_zero(pbi->common.fc.hybrid_coef_counts_8x8);
- vp9_zero(pbi->common.fc.coef_counts_16x16);
- vp9_zero(pbi->common.fc.hybrid_coef_counts_16x16);
- vp9_zero(pbi->common.fc.ymode_counts);
- vp9_zero(pbi->common.fc.uv_mode_counts);
- vp9_zero(pbi->common.fc.bmode_counts);
- vp9_zero(pbi->common.fc.i8x8_mode_counts);
- vp9_zero(pbi->common.fc.sub_mv_ref_counts);
- vp9_zero(pbi->common.fc.mbsplit_counts);
- vp9_zero(pbi->common.fc.NMVcount);
- vp9_zero(pbi->common.fc.mv_ref_ct);
- vp9_zero(pbi->common.fc.mv_ref_ct_a);
-
- read_coef_probs(pbi, &header_bc);
-
- vpx_memcpy(&xd->pre, &pc->yv12_fb[pc->lst_fb_idx], sizeof(YV12_BUFFER_CONFIG));
- vpx_memcpy(&xd->dst, &pc->yv12_fb[pc->new_fb_idx], sizeof(YV12_BUFFER_CONFIG));
-
- // Create the segmentation map structure and set to 0
- if (!pc->last_frame_seg_map)
- CHECK_MEM_ERROR(pc->last_frame_seg_map,
- vpx_calloc((pc->mb_rows * pc->mb_cols), 1));
-
-  /* set up the new frame for intra coded blocks */
- vp9_setup_intra_recon(&pc->yv12_fb[pc->new_fb_idx]);
-
- vp9_setup_block_dptrs(xd);
-
- vp9_build_block_doffsets(xd);
-
- /* clear out the coeff buffer */
- vpx_memset(xd->qcoeff, 0, sizeof(xd->qcoeff));
-
- /* Read the mb_no_coeff_skip flag */
- pc->mb_no_coeff_skip = (int)vp9_read_bit(&header_bc);
-
- vp9_decode_mode_mvs_init(pbi, &header_bc);
-
- vpx_memset(pc->above_context, 0, sizeof(ENTROPY_CONTEXT_PLANES) * pc->mb_cols);
-
-  // Reset the macroblock mode info context to the start of the list
- xd->mode_info_context = pc->mi;
- xd->prev_mode_info_context = pc->prev_mi;
-
- /* Decode a row of superblocks */
- for (mb_row = 0; mb_row < pc->mb_rows; mb_row += 2) {
- decode_sb_row(pbi, pc, mb_row, xd, &residual_bc);
- }
- corrupt_tokens |= xd->corrupted;
-
- /* Collect information about decoder corruption. */
- /* 1. Check first boolean decoder for errors. */
- pc->yv12_fb[pc->new_fb_idx].corrupted = bool_error(&header_bc);
- /* 2. Check the macroblock information */
- pc->yv12_fb[pc->new_fb_idx].corrupted |= corrupt_tokens;
-
- if (!pbi->decoded_key_frame) {
- if (pc->frame_type == KEY_FRAME &&
- !pc->yv12_fb[pc->new_fb_idx].corrupted)
- pbi->decoded_key_frame = 1;
- else
- vpx_internal_error(&pbi->common.error, VPX_CODEC_CORRUPT_FRAME,
- "A stream must start with a complete key frame");
- }
-
- vp9_adapt_coef_probs(pc);
- if (pc->frame_type != KEY_FRAME) {
- vp9_adapt_mode_probs(pc);
- vp9_adapt_nmv_probs(pc, xd->allow_high_precision_mv);
- vp9_update_mode_context(&pbi->common);
- }
-
- /* If this was a kf or Gf note the Q used */
- if ((pc->frame_type == KEY_FRAME) ||
- pc->refresh_golden_frame || pc->refresh_alt_ref_frame) {
- pc->last_kf_gf_q = pc->base_qindex;
- }
- if (pc->refresh_entropy_probs) {
- if (pc->refresh_alt_ref_frame)
- vpx_memcpy(&pc->lfc_a, &pc->fc, sizeof(pc->fc));
- else
- vpx_memcpy(&pc->lfc, &pc->fc, sizeof(pc->fc));
- }
-
-#ifdef PACKET_TESTING
- {
- FILE *f = fopen("decompressor.VP8", "ab");
- unsigned int size = residual_bc.pos + header_bc.pos + 8;
- fwrite((void *) &size, 4, 1, f);
- fwrite((void *) pbi->Source, size, 1, f);
- fclose(f);
- }
-#endif
- // printf("Frame %d Done\n", frame_count++);
-
- return 0;
-}
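
As an aside, the 3-byte uncompressed frame tag parsed at the top of vp9_decode_frame() packs the frame type, version, show_frame flag and the first partition size. A hedged standalone sketch of that layout follows; the frame_tag struct and parse_frame_tag() names are illustrative only.

#include <stdio.h>

typedef struct {
  int frame_type;          /* bit 0: 0 = key frame, 1 = inter frame */
  int version;             /* bits 1..3 */
  int show_frame;          /* bit 4 */
  unsigned partition_size; /* bits 5..23: size of the first partition in bytes */
} frame_tag;

static frame_tag parse_frame_tag(const unsigned char d[3]) {
  frame_tag t;
  t.frame_type = d[0] & 1;
  t.version = (d[0] >> 1) & 7;
  t.show_frame = (d[0] >> 4) & 1;
  t.partition_size =
      ((unsigned)d[0] | ((unsigned)d[1] << 8) | ((unsigned)d[2] << 16)) >> 5;
  return t;
}

int main(void) {
  const unsigned char tag[3] = { 0x10, 0x02, 0x00 };  /* key frame, shown, 16-byte partition */
  frame_tag t = parse_frame_tag(tag);
  printf("type=%d version=%d show=%d size=%u\n",
         t.frame_type, t.version, t.show_frame, t.partition_size);
  return 0;
}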
--- a/vp8/decoder/dequantize.c
+++ /dev/null
@@ -1,543 +1,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include "vpx_ports/config.h"
-#include "dequantize.h"
-#include "vp8/common/idct.h"
-#include "vpx_mem/vpx_mem.h"
-#include "onyxd_int.h"
-
-extern void vp9_short_idct4x4llm_c(short *input, short *output, int pitch);
-extern void vp9_short_idct4x4llm_1_c(short *input, short *output, int pitch);
-extern void vp9_short_idct8x8_c(short *input, short *output, int pitch);
-extern void vp9_short_idct8x8_1_c(short *input, short *output, int pitch);
-
-#if CONFIG_LOSSLESS
-extern void vp9_short_inv_walsh4x4_x8_c(short *input, short *output,
- int pitch);
-extern void vp9_short_inv_walsh4x4_1_x8_c(short *input, short *output,
- int pitch);
-#endif
-
-#ifdef DEC_DEBUG
-extern int dec_debug;
-#endif
-
-void vp9_dequantize_b_c(BLOCKD *d) {
-
- int i;
- short *DQ = d->dqcoeff;
- short *Q = d->qcoeff;
- short *DQC = d->dequant;
-
- for (i = 0; i < 16; i++) {
- DQ[i] = Q[i] * DQC[i];
- }
-}
-
-
-void vp9_ht_dequant_idct_add_c(TX_TYPE tx_type, short *input, short *dq,
- unsigned char *pred, unsigned char *dest,
- int pitch, int stride) {
- short output[16];
- short *diff_ptr = output;
- int r, c;
- int i;
-
- for (i = 0; i < 16; i++) {
- input[i] = dq[i] * input[i];
- }
-
- vp9_ihtllm_c(input, output, 4 << 1, tx_type, 4);
-
- vpx_memset(input, 0, 32);
-
- for (r = 0; r < 4; r++) {
- for (c = 0; c < 4; c++) {
- int a = diff_ptr[c] + pred[c];
-
- if (a < 0)
- a = 0;
-
- if (a > 255)
- a = 255;
-
- dest[c] = (unsigned char) a;
- }
-
- dest += stride;
- diff_ptr += 4;
- pred += pitch;
- }
-}
-
-void vp9_ht_dequant_idct_add_8x8_c(TX_TYPE tx_type, short *input, short *dq,
- unsigned char *pred, unsigned char *dest,
- int pitch, int stride) {
- short output[64];
- short *diff_ptr = output;
- int b, r, c;
- int i;
- unsigned char *origdest = dest;
- unsigned char *origpred = pred;
-
- input[0] = dq[0] * input[0];
- for (i = 1; i < 64; i++) {
- input[i] = dq[1] * input[i];
- }
-
- vp9_ihtllm_c(input, output, 16, tx_type, 8);
-
- vpx_memset(input, 0, 128);
-
- for (b = 0; b < 4; b++) {
- for (r = 0; r < 4; r++) {
- for (c = 0; c < 4; c++) {
- int a = diff_ptr[c] + pred[c];
-
- if (a < 0)
- a = 0;
-
- if (a > 255)
- a = 255;
-
- dest[c] = (unsigned char) a;
- }
-
- dest += stride;
- diff_ptr += 8;
- pred += pitch;
- }
- // shift buffer pointers to next 4x4 block in the submacroblock
- diff_ptr = output + (b + 1) / 2 * 4 * 8 + ((b + 1) % 2) * 4;
- dest = origdest + (b + 1) / 2 * 4 * stride + ((b + 1) % 2) * 4;
- pred = origpred + (b + 1) / 2 * 4 * pitch + ((b + 1) % 2) * 4;
- }
-}
-
-void vp9_dequant_idct_add_c(short *input, short *dq, unsigned char *pred,
- unsigned char *dest, int pitch, int stride) {
- short output[16];
- short *diff_ptr = output;
- int r, c;
- int i;
-
- for (i = 0; i < 16; i++) {
- input[i] = dq[i] * input[i];
- }
-
- /* the idct halves ( >> 1) the pitch */
- vp9_short_idct4x4llm_c(input, output, 4 << 1);
-
- vpx_memset(input, 0, 32);
-
- for (r = 0; r < 4; r++) {
- for (c = 0; c < 4; c++) {
- int a = diff_ptr[c] + pred[c];
-
- if (a < 0)
- a = 0;
-
- if (a > 255)
- a = 255;
-
- dest[c] = (unsigned char) a;
- }
-
- dest += stride;
- diff_ptr += 4;
- pred += pitch;
- }
-}
-
-void vp9_dequant_dc_idct_add_c(short *input, short *dq, unsigned char *pred,
- unsigned char *dest, int pitch, int stride,
- int Dc) {
- int i;
- short output[16];
- short *diff_ptr = output;
- int r, c;
-
- input[0] = (short)Dc;
-
- for (i = 1; i < 16; i++) {
- input[i] = dq[i] * input[i];
- }
-
- /* the idct halves ( >> 1) the pitch */
- vp9_short_idct4x4llm_c(input, output, 4 << 1);
-
- vpx_memset(input, 0, 32);
-
- for (r = 0; r < 4; r++) {
- for (c = 0; c < 4; c++) {
- int a = diff_ptr[c] + pred[c];
-
- if (a < 0)
- a = 0;
-
- if (a > 255)
- a = 255;
-
- dest[c] = (unsigned char) a;
- }
-
- dest += stride;
- diff_ptr += 4;
- pred += pitch;
- }
-}
-
-#if CONFIG_LOSSLESS
-void vp9_dequant_idct_add_lossless_c(short *input, short *dq,
- unsigned char *pred, unsigned char *dest,
- int pitch, int stride) {
- short output[16];
- short *diff_ptr = output;
- int r, c;
- int i;
-
- for (i = 0; i < 16; i++) {
- input[i] = dq[i] * input[i];
- }
-
- vp9_short_inv_walsh4x4_x8_c(input, output, 4 << 1);
-
- vpx_memset(input, 0, 32);
-
- for (r = 0; r < 4; r++) {
- for (c = 0; c < 4; c++) {
- int a = diff_ptr[c] + pred[c];
-
- if (a < 0)
- a = 0;
-
- if (a > 255)
- a = 255;
-
- dest[c] = (unsigned char) a;
- }
-
- dest += stride;
- diff_ptr += 4;
- pred += pitch;
- }
-}
-
-void vp9_dequant_dc_idct_add_lossless_c(short *input, short *dq,
- unsigned char *pred,
- unsigned char *dest,
- int pitch, int stride, int dc) {
- int i;
- short output[16];
- short *diff_ptr = output;
- int r, c;
-
- input[0] = (short)dc;
-
- for (i = 1; i < 16; i++) {
- input[i] = dq[i] * input[i];
- }
-
- vp9_short_inv_walsh4x4_x8_c(input, output, 4 << 1);
- vpx_memset(input, 0, 32);
-
- for (r = 0; r < 4; r++) {
- for (c = 0; c < 4; c++) {
- int a = diff_ptr[c] + pred[c];
-
- if (a < 0)
- a = 0;
-
- if (a > 255)
- a = 255;
-
- dest[c] = (unsigned char) a;
- }
-
- dest += stride;
- diff_ptr += 4;
- pred += pitch;
- }
-}
-#endif
-
-void vp9_dequantize_b_2x2_c(BLOCKD *d) {
- int i;
- short *DQ = d->dqcoeff;
- short *Q = d->qcoeff;
- short *DQC = d->dequant;
-
- for (i = 0; i < 16; i++) {
- DQ[i] = (short)((Q[i] * DQC[i]));
- }
-#ifdef DEC_DEBUG
- if (dec_debug) {
- int j;
- printf("Dequantize 2x2\n");
- for (j = 0; j < 16; j++) printf("%d ", Q[j]);
- printf("\n");
- for (j = 0; j < 16; j++) printf("%d ", DQ[j]);
- printf("\n");
- }
-#endif
-}
-
-void vp9_dequant_idct_add_8x8_c(short *input, short *dq, unsigned char *pred,
- unsigned char *dest, int pitch, int stride) {
- short output[64];
- short *diff_ptr = output;
- int r, c, b;
- int i;
- unsigned char *origdest = dest;
- unsigned char *origpred = pred;
-
-#ifdef DEC_DEBUG
- if (dec_debug) {
- int j;
- printf("Input 8x8\n");
- for (j = 0; j < 64; j++) {
- printf("%d ", input[j]);
- if (j % 8 == 7) printf("\n");
- }
- }
-#endif
-
- input[0] = input[0] * dq[0];
-
- // recover quantizer for 4 4x4 blocks
- for (i = 1; i < 64; i++) {
- input[i] = input[i] * dq[1];
- }
-#ifdef DEC_DEBUG
- if (dec_debug) {
- int j;
- printf("Input DQ 8x8\n");
- for (j = 0; j < 64; j++) {
- printf("%d ", input[j]);
- if (j % 8 == 7) printf("\n");
- }
- }
-#endif
-
- // the idct halves ( >> 1) the pitch
- vp9_short_idct8x8_c(input, output, 16);
-#ifdef DEC_DEBUG
- if (dec_debug) {
- int j;
- printf("Output 8x8\n");
- for (j = 0; j < 64; j++) {
- printf("%d ", output[j]);
- if (j % 8 == 7) printf("\n");
- }
- }
-#endif
-
-  vpx_memset(input, 0, 128);  // clear the 64 dequantized coefficients (128 bytes)
-
- for (b = 0; b < 4; b++) {
- for (r = 0; r < 4; r++) {
- for (c = 0; c < 4; c++) {
- int a = diff_ptr[c] + pred[c];
-
- if (a < 0)
- a = 0;
-
- if (a > 255)
- a = 255;
-
- dest[c] = (unsigned char) a;
- }
-
- dest += stride;
- diff_ptr += 8;
- pred += pitch;
- }
- diff_ptr = output + (b + 1) / 2 * 4 * 8 + (b + 1) % 2 * 4;
- dest = origdest + (b + 1) / 2 * 4 * stride + (b + 1) % 2 * 4;
- pred = origpred + (b + 1) / 2 * 4 * pitch + (b + 1) % 2 * 4;
- }
-#ifdef DEC_DEBUG
- if (dec_debug) {
- int k, j;
- printf("Final 8x8\n");
- for (j = 0; j < 8; j++) {
- for (k = 0; k < 8; k++) {
- printf("%d ", origdest[k]);
- }
- printf("\n");
- origdest += stride;
- }
- }
-#endif
-}
-
-void vp9_dequant_dc_idct_add_8x8_c(short *input, short *dq, unsigned char *pred,
- unsigned char *dest, int pitch, int stride,
-                                   int Dc) { // Dc for the 1st order transform in some rare cases
- short output[64];
- short *diff_ptr = output;
- int r, c, b;
- int i;
- unsigned char *origdest = dest;
- unsigned char *origpred = pred;
-
-  input[0] = (short)Dc;  // Dc is already the reconstructed (dequantized) DC value,
-                         // so it needs no further dequantization here
-#ifdef DEC_DEBUG
- if (dec_debug) {
- int j;
- printf("Input 8x8\n");
- for (j = 0; j < 64; j++) {
- printf("%d ", input[j]);
- if (j % 8 == 7) printf("\n");
- }
- }
-#endif
- for (i = 1; i < 64; i++) {
- input[i] = input[i] * dq[1];
- }
-
-#ifdef DEC_DEBUG
- if (dec_debug) {
- int j;
- printf("Input DQ 8x8\n");
- for (j = 0; j < 64; j++) {
- printf("%d ", input[j]);
- if (j % 8 == 7) printf("\n");
- }
- }
-#endif
-
- // the idct halves ( >> 1) the pitch
- vp9_short_idct8x8_c(input, output, 16);
-#ifdef DEC_DEBUG
- if (dec_debug) {
- int j;
- printf("Output 8x8\n");
- for (j = 0; j < 64; j++) {
- printf("%d ", output[j]);
- if (j % 8 == 7) printf("\n");
- }
- }
-#endif
- vpx_memset(input, 0, 128);
-
- for (b = 0; b < 4; b++) {
- for (r = 0; r < 4; r++) {
- for (c = 0; c < 4; c++) {
- int a = diff_ptr[c] + pred[c];
-
- if (a < 0)
- a = 0;
-
- if (a > 255)
- a = 255;
-
- dest[c] = (unsigned char) a;
- }
-
- dest += stride;
- diff_ptr += 8;
- pred += pitch;
- }
- diff_ptr = output + (b + 1) / 2 * 4 * 8 + (b + 1) % 2 * 4;
- dest = origdest + (b + 1) / 2 * 4 * stride + (b + 1) % 2 * 4;
- pred = origpred + (b + 1) / 2 * 4 * pitch + (b + 1) % 2 * 4;
- }
-#ifdef DEC_DEBUG
- if (dec_debug) {
- int k, j;
- printf("Final 8x8\n");
- for (j = 0; j < 8; j++) {
- for (k = 0; k < 8; k++) {
- printf("%d ", origdest[k]);
- }
- printf("\n");
- origdest += stride;
- }
- }
-#endif
-}
-
-void vp9_ht_dequant_idct_add_16x16_c(TX_TYPE tx_type, short *input, short *dq,
- unsigned char *pred, unsigned char *dest,
- int pitch, int stride) {
- short output[256];
- short *diff_ptr = output;
- int r, c, i;
-
- input[0]= input[0] * dq[0];
-
-  // dequantize the remaining AC coefficients
- for (i = 1; i < 256; i++)
- input[i] = input[i] * dq[1];
-
- // inverse hybrid transform
- vp9_ihtllm_c(input, output, 32, tx_type, 16);
-
- // the idct halves ( >> 1) the pitch
- // vp9_short_idct16x16_c(input, output, 32);
-
- vpx_memset(input, 0, 512);
-
- for (r = 0; r < 16; r++) {
- for (c = 0; c < 16; c++) {
- int a = diff_ptr[c] + pred[c];
-
- if (a < 0)
- a = 0;
- else if (a > 255)
- a = 255;
-
- dest[c] = (unsigned char) a;
- }
-
- dest += stride;
- diff_ptr += 16;
- pred += pitch;
- }
-}
-
-void vp9_dequant_idct_add_16x16_c(short *input, short *dq, unsigned char *pred,
- unsigned char *dest, int pitch, int stride) {
- short output[256];
- short *diff_ptr = output;
- int r, c, i;
-
- input[0]= input[0] * dq[0];
-
-  // dequantize the remaining AC coefficients
- for (i = 1; i < 256; i++)
- input[i] = input[i] * dq[1];
-
- // the idct halves ( >> 1) the pitch
- vp9_short_idct16x16_c(input, output, 32);
-
- vpx_memset(input, 0, 512);
-
- for (r = 0; r < 16; r++) {
- for (c = 0; c < 16; c++) {
- int a = diff_ptr[c] + pred[c];
-
- if (a < 0)
- a = 0;
- else if (a > 255)
- a = 255;
-
- dest[c] = (unsigned char) a;
- }
-
- dest += stride;
- diff_ptr += 16;
- pred += pitch;
- }
-}
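
For illustration, a minimal sketch of the reconstruction step shared by the dequant/IDCT/add routines above: the inverse-transformed residual is added to the prediction and clamped to the 8-bit pixel range. The helper names below are hypothetical.

#include <stdio.h>

static unsigned char clamp_pixel(int v) {
  return (unsigned char)(v < 0 ? 0 : (v > 255 ? 255 : v));
}

/* Adds an n x n residual block to the prediction and writes clamped pixels. */
static void add_residual(const short *residual, const unsigned char *pred,
                         unsigned char *dest, int n, int pitch, int stride) {
  int r, c;
  for (r = 0; r < n; r++) {
    for (c = 0; c < n; c++)
      dest[c] = clamp_pixel(residual[c] + pred[c]);
    residual += n;
    pred += pitch;
    dest += stride;
  }
}

int main(void) {
  short res[16] = { 300, -40, 0 };        /* remaining entries default to zero */
  unsigned char pred[16] = { 10, 10, 10 };
  unsigned char out[16] = { 0 };
  add_residual(res, pred, out, 4, 4, 4);
  printf("%d %d %d\n", out[0], out[1], out[2]);  /* 255 0 10 */
  return 0;
}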
--- a/vp8/decoder/dequantize.h
+++ /dev/null
@@ -1,78 +1,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#ifndef DEQUANTIZE_H
-#define DEQUANTIZE_H
-#include "vp8/common/blockd.h"
-
-#if CONFIG_LOSSLESS
-extern void vp9_dequant_idct_add_lossless_c(short *input, short *dq,
- unsigned char *pred,
- unsigned char *output,
- int pitch, int stride);
-extern void vp9_dequant_dc_idct_add_lossless_c(short *input, short *dq,
- unsigned char *pred,
- unsigned char *output,
- int pitch, int stride, int dc);
-extern void vp9_dequant_dc_idct_add_y_block_lossless_c(short *q, short *dq,
- unsigned char *pre,
- unsigned char *dst,
- int stride, char *eobs,
- short *dc);
-extern void vp9_dequant_idct_add_y_block_lossless_c(short *q, short *dq,
- unsigned char *pre,
- unsigned char *dst,
- int stride, char *eobs);
-extern void vp9_dequant_idct_add_uv_block_lossless_c(short *q, short *dq,
- unsigned char *pre,
- unsigned char *dst_u,
- unsigned char *dst_v,
- int stride, char *eobs);
-#endif
-
-typedef void (*vp9_dequant_idct_add_fn_t)(short *input, short *dq,
- unsigned char *pred, unsigned char *output, int pitch, int stride);
-typedef void(*vp9_dequant_dc_idct_add_fn_t)(short *input, short *dq,
- unsigned char *pred, unsigned char *output, int pitch, int stride, int dc);
-
-typedef void(*vp9_dequant_dc_idct_add_y_block_fn_t)(short *q, short *dq,
- unsigned char *pre, unsigned char *dst, int stride, char *eobs, short *dc);
-typedef void(*vp9_dequant_idct_add_y_block_fn_t)(short *q, short *dq,
- unsigned char *pre, unsigned char *dst, int stride, char *eobs);
-typedef void(*vp9_dequant_idct_add_uv_block_fn_t)(short *q, short *dq,
- unsigned char *pre, unsigned char *dst_u, unsigned char *dst_v, int stride,
- char *eobs);
-
-void vp9_ht_dequant_idct_add_c(TX_TYPE tx_type, short *input, short *dq,
- unsigned char *pred, unsigned char *dest,
- int pitch, int stride);
-
-void vp9_ht_dequant_idct_add_8x8_c(TX_TYPE tx_type, short *input, short *dq,
- unsigned char *pred, unsigned char *dest,
- int pitch, int stride);
-
-void vp9_ht_dequant_idct_add_16x16_c(TX_TYPE tx_type, short *input, short *dq,
- unsigned char *pred, unsigned char *dest,
- int pitch, int stride);
-
-#if CONFIG_SUPERBLOCKS
-void vp9_dequant_dc_idct_add_y_block_8x8_inplace_c(short *q, short *dq,
- unsigned char *dst,
- int stride, char *eobs,
- short *dc, MACROBLOCKD *xd);
-void vp9_dequant_idct_add_uv_block_8x8_inplace_c(short *q, short *dq,
- unsigned char *dstu,
- unsigned char *dstv,
- int stride, char *eobs,
- MACROBLOCKD *xd);
-#endif
-
-#endif
--- a/vp8/decoder/detokenize.c
+++ /dev/null
@@ -1,640 +1,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include "vp8/common/type_aliases.h"
-#include "vp8/common/blockd.h"
-#include "onyxd_int.h"
-#include "vpx_mem/vpx_mem.h"
-#include "vpx_ports/mem.h"
-#include "detokenize.h"
-
-#include "vp8/common/seg_common.h"
-
-#define BOOL_DATA UINT8
-
-#define OCB_X PREV_COEF_CONTEXTS * ENTROPY_NODES
-
-DECLARE_ALIGNED(16, static const int, coef_bands_x[16]) = {
- 0 * OCB_X, 1 * OCB_X, 2 * OCB_X, 3 * OCB_X,
- 6 * OCB_X, 4 * OCB_X, 5 * OCB_X, 6 * OCB_X,
- 6 * OCB_X, 6 * OCB_X, 6 * OCB_X, 6 * OCB_X,
- 6 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X
-};
-DECLARE_ALIGNED(16, static const int, coef_bands_x_8x8[64]) = {
- 0 * OCB_X, 1 * OCB_X, 2 * OCB_X, 3 * OCB_X, 5 * OCB_X, 4 * OCB_X, 4 * OCB_X, 5 * OCB_X,
- 5 * OCB_X, 3 * OCB_X, 6 * OCB_X, 3 * OCB_X, 5 * OCB_X, 4 * OCB_X, 6 * OCB_X, 6 * OCB_X,
- 6 * OCB_X, 5 * OCB_X, 5 * OCB_X, 6 * OCB_X, 6 * OCB_X, 6 * OCB_X, 6 * OCB_X, 6 * OCB_X,
- 6 * OCB_X, 6 * OCB_X, 6 * OCB_X, 6 * OCB_X, 6 * OCB_X, 6 * OCB_X, 6 * OCB_X, 6 * OCB_X,
- 6 * OCB_X, 6 * OCB_X, 6 * OCB_X, 6 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X,
- 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X,
- 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X,
- 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X,
-};
-
-DECLARE_ALIGNED(16, static const int, coef_bands_x_16x16[256]) = {
- 0 * OCB_X, 1 * OCB_X, 2 * OCB_X, 3 * OCB_X, 5 * OCB_X, 4 * OCB_X, 4 * OCB_X, 5 * OCB_X, 5 * OCB_X, 3 * OCB_X, 6 * OCB_X, 3 * OCB_X, 5 * OCB_X, 4 * OCB_X, 6 * OCB_X, 6 * OCB_X,
- 6 * OCB_X, 5 * OCB_X, 5 * OCB_X, 6 * OCB_X, 6 * OCB_X, 6 * OCB_X, 6 * OCB_X, 6 * OCB_X, 6 * OCB_X, 6 * OCB_X, 6 * OCB_X, 6 * OCB_X, 6 * OCB_X, 6 * OCB_X, 6 * OCB_X, 6 * OCB_X,
- 6 * OCB_X, 6 * OCB_X, 6 * OCB_X, 6 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X,
- 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X,
- 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X,
- 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X,
- 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X,
- 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X,
- 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X,
- 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X,
- 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X,
- 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X,
- 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X,
- 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X,
- 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X,
- 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X
-};
-
-#define EOB_CONTEXT_NODE 0
-#define ZERO_CONTEXT_NODE 1
-#define ONE_CONTEXT_NODE 2
-#define LOW_VAL_CONTEXT_NODE 3
-#define TWO_CONTEXT_NODE 4
-#define THREE_CONTEXT_NODE 5
-#define HIGH_LOW_CONTEXT_NODE 6
-#define CAT_ONE_CONTEXT_NODE 7
-#define CAT_THREEFOUR_CONTEXT_NODE 8
-#define CAT_THREE_CONTEXT_NODE 9
-#define CAT_FIVE_CONTEXT_NODE 10
-
-#define CAT1_MIN_VAL 5
-#define CAT2_MIN_VAL 7
-#define CAT3_MIN_VAL 11
-#define CAT4_MIN_VAL 19
-#define CAT5_MIN_VAL 35
-#define CAT6_MIN_VAL 67
-#define CAT1_PROB0 159
-#define CAT2_PROB0 145
-#define CAT2_PROB1 165
-
-#define CAT3_PROB0 140
-#define CAT3_PROB1 148
-#define CAT3_PROB2 173
-
-#define CAT4_PROB0 135
-#define CAT4_PROB1 140
-#define CAT4_PROB2 155
-#define CAT4_PROB3 176
-
-#define CAT5_PROB0 130
-#define CAT5_PROB1 134
-#define CAT5_PROB2 141
-#define CAT5_PROB3 157
-#define CAT5_PROB4 180
-
-static const unsigned char cat6_prob[14] =
-{ 254, 254, 252, 249, 243, 230, 196, 177, 153, 140, 133, 130, 129, 0 };
-
-void vp9_reset_mb_tokens_context(MACROBLOCKD *xd) {
- /* Clear entropy contexts for Y2 blocks */
- if ((xd->mode_info_context->mbmi.mode != B_PRED &&
- xd->mode_info_context->mbmi.mode != I8X8_PRED &&
- xd->mode_info_context->mbmi.mode != SPLITMV)
- || xd->mode_info_context->mbmi.txfm_size == TX_16X16
- ) {
- vpx_memset(xd->above_context, 0, sizeof(ENTROPY_CONTEXT_PLANES));
- vpx_memset(xd->left_context, 0, sizeof(ENTROPY_CONTEXT_PLANES));
- } else {
- vpx_memset(xd->above_context, 0, sizeof(ENTROPY_CONTEXT_PLANES) - 1);
- vpx_memset(xd->left_context, 0, sizeof(ENTROPY_CONTEXT_PLANES) - 1);
- }
-}
-
-DECLARE_ALIGNED(16, extern const unsigned char, vp9_norm[256]);
-
-// #define PREV_CONTEXT_INC(val) (2+((val)>2))
-// #define PREV_CONTEXT_INC(val) (vp9_prev_token_class[(val)])
-#define PREV_CONTEXT_INC(val) (vp9_prev_token_class[(val)>10?10:(val)])
-
-static int get_token(int v) {
- if (v < 0) v = -v;
- if (v == 0) return ZERO_TOKEN;
- else if (v == 1) return ONE_TOKEN;
- else if (v == 2) return TWO_TOKEN;
- else if (v == 3) return THREE_TOKEN;
- else if (v == 4) return FOUR_TOKEN;
- else if (v <= 6) return DCT_VAL_CATEGORY1;
- else if (v <= 10) return DCT_VAL_CATEGORY2;
- else if (v <= 18) return DCT_VAL_CATEGORY3;
- else if (v <= 34) return DCT_VAL_CATEGORY4;
- else if (v <= 66) return DCT_VAL_CATEGORY5;
- else return DCT_VAL_CATEGORY6;
-}
-
-void static count_tokens_adaptive_scan(const MACROBLOCKD *xd, INT16 *qcoeff_ptr,
- int block, PLANE_TYPE type,
- TX_TYPE tx_type,
- ENTROPY_CONTEXT *a, ENTROPY_CONTEXT *l,
- int eob, int seg_eob,
- FRAME_CONTEXT *fc) {
- int c, pt, token, band;
- const int *scan;
-
- switch(tx_type) {
- case ADST_DCT :
- scan = vp9_row_scan;
- break;
-
- case DCT_ADST :
- scan = vp9_col_scan;
- break;
-
- default :
- scan = vp9_default_zig_zag1d;
- break;
- }
-
- VP9_COMBINEENTROPYCONTEXTS(pt, *a, *l);
- for (c = !type; c < eob; ++c) {
- int rc = scan[c];
- int v = qcoeff_ptr[rc];
- band = vp9_coef_bands[c];
- token = get_token(v);
- if (tx_type != DCT_DCT)
- fc->hybrid_coef_counts[type][band][pt][token]++;
- else
- fc->coef_counts[type][band][pt][token]++;
- pt = vp9_prev_token_class[token];
- }
-
- if (eob < seg_eob) {
- band = vp9_coef_bands[c];
- if (tx_type != DCT_DCT)
- fc->hybrid_coef_counts[type][band][pt][DCT_EOB_TOKEN]++;
- else
- fc->coef_counts[type][band][pt][DCT_EOB_TOKEN]++;
- }
-}
-
-void static count_tokens(INT16 *qcoeff_ptr, int block, PLANE_TYPE type,
- ENTROPY_CONTEXT *a, ENTROPY_CONTEXT *l,
- int eob, int seg_eob, FRAME_CONTEXT *const fc) {
- int c, pt, token, band;
- VP9_COMBINEENTROPYCONTEXTS(pt, *a, *l);
- for (c = !type; c < eob; ++c) {
- int rc = vp9_default_zig_zag1d[c];
- int v = qcoeff_ptr[rc];
- band = vp9_coef_bands[c];
- token = get_token(v);
- fc->coef_counts[type][band][pt][token]++;
- pt = vp9_prev_token_class[token];
- }
- if (eob < seg_eob) {
- band = vp9_coef_bands[c];
- fc->coef_counts[type][band][pt][DCT_EOB_TOKEN]++;
- }
-}
-
-void static count_tokens_8x8(INT16 *qcoeff_ptr, int block, PLANE_TYPE type,
- TX_TYPE tx_type,
- ENTROPY_CONTEXT *a, ENTROPY_CONTEXT *l,
- int eob, int seg_eob, FRAME_CONTEXT *fc) {
- int c, pt, token, band;
- VP9_COMBINEENTROPYCONTEXTS(pt, *a, *l);
- for (c = !type; c < eob; ++c) {
- int rc = (type == 1 ? vp9_default_zig_zag1d[c] : vp9_default_zig_zag1d_8x8[c]);
- int v = qcoeff_ptr[rc];
- band = (type == 1 ? vp9_coef_bands[c] : vp9_coef_bands_8x8[c]);
- token = get_token(v);
- if (tx_type != DCT_DCT)
- fc->hybrid_coef_counts_8x8[type][band][pt][token]++;
- else
- fc->coef_counts_8x8[type][band][pt][token]++;
- pt = vp9_prev_token_class[token];
- }
- if (eob < seg_eob) {
- band = (type == 1 ? vp9_coef_bands[c] : vp9_coef_bands_8x8[c]);
- if (tx_type != DCT_DCT)
- fc->hybrid_coef_counts_8x8[type][band][pt][DCT_EOB_TOKEN]++;
- else
- fc->coef_counts_8x8[type][band][pt][DCT_EOB_TOKEN]++;
- }
-}
-
-void static count_tokens_16x16(INT16 *qcoeff_ptr, int block, PLANE_TYPE type,
- TX_TYPE tx_type,
- ENTROPY_CONTEXT *a, ENTROPY_CONTEXT *l,
- int eob, int seg_eob, FRAME_CONTEXT *fc) {
- int c, pt, token;
- VP9_COMBINEENTROPYCONTEXTS(pt, *a, *l);
- for (c = !type; c < eob; ++c) {
- int rc = vp9_default_zig_zag1d_16x16[c];
- int v = qcoeff_ptr[rc];
- int band = vp9_coef_bands_16x16[c];
- token = get_token(v);
- if (tx_type != DCT_DCT)
- fc->hybrid_coef_counts_16x16[type][band][pt][token]++;
- else
- fc->coef_counts_16x16[type][band][pt][token]++;
- pt = vp9_prev_token_class[token];
- }
- if (eob < seg_eob) {
- int band = vp9_coef_bands_16x16[c];
- if (tx_type != DCT_DCT)
- fc->hybrid_coef_counts_16x16[type][band][pt][DCT_EOB_TOKEN]++;
- else
- fc->coef_counts_16x16[type][band][pt][DCT_EOB_TOKEN]++;
- }
-}
-
-static int get_signed(BOOL_DECODER *br, int value_to_sign) {
- const int split = (br->range + 1) >> 1;
- const VP9_BD_VALUE bigsplit = (VP9_BD_VALUE)split << (VP9_BD_VALUE_SIZE - 8);
- int v;
-
- if (br->count < 0)
- vp9_bool_decoder_fill(br);
-
- if (br->value < bigsplit) {
- br->range = split;
- v = value_to_sign;
- } else {
- br->range = br->range - split;
- br->value = br->value - bigsplit;
- v = -value_to_sign;
- }
- br->range += br->range;
- br->value += br->value;
- --br->count;
-
- return v;
-}
-
-#define WRITE_COEF_CONTINUE(val) \
- { \
- prob = coef_probs + (ENTROPY_NODES*PREV_CONTEXT_INC(val));\
- qcoeff_ptr[scan[c]] = (INT16) get_signed(br, val); \
- c++; \
- continue; \
- }
-
-#define ADJUST_COEF(prob, bits_count) \
- do { \
- if (vp9_read(br, prob)) \
- val += (UINT16)(1 << bits_count);\
- } while (0);
-
-static int decode_coefs(VP9D_COMP *dx, const MACROBLOCKD *xd,
- BOOL_DECODER* const br,
- ENTROPY_CONTEXT *a, ENTROPY_CONTEXT *l,
- PLANE_TYPE type,
- TX_TYPE tx_type,
- int seg_eob, INT16 *qcoeff_ptr, int i,
- const int *const scan, int block_type,
- const int *coef_bands) {
- FRAME_CONTEXT *const fc = &dx->common.fc;
- int tmp, c = (type == PLANE_TYPE_Y_NO_DC);
- const vp9_prob *prob, *coef_probs;
-
- switch (block_type) {
- default:
- case TX_4X4:
- coef_probs =
- tx_type != DCT_DCT ? fc->hybrid_coef_probs[type][0][0] :
- fc->coef_probs[type][0][0];
- break;
- case TX_8X8:
- coef_probs =
- tx_type != DCT_DCT ? fc->hybrid_coef_probs_8x8[type][0][0] :
- fc->coef_probs_8x8[type][0][0];
- break;
- case TX_16X16:
- coef_probs =
- tx_type != DCT_DCT ? fc->hybrid_coef_probs_16x16[type][0][0] :
- fc->coef_probs_16x16[type][0][0];
- break;
- }
-
- VP9_COMBINEENTROPYCONTEXTS(tmp, *a, *l);
- prob = coef_probs + tmp * ENTROPY_NODES;
-
- while (1) {
- int val;
- const uint8_t *cat6 = cat6_prob;
- if (c == seg_eob) break;
- prob += coef_bands[c];
- if (!vp9_read(br, prob[EOB_CONTEXT_NODE]))
- break;
-SKIP_START:
- if (c == seg_eob) break;
- if (!vp9_read(br, prob[ZERO_CONTEXT_NODE])) {
- ++c;
- prob = coef_probs + coef_bands[c];
- goto SKIP_START;
- }
- // ONE_CONTEXT_NODE_0_
- if (!vp9_read(br, prob[ONE_CONTEXT_NODE])) {
- prob = coef_probs + ENTROPY_NODES;
- qcoeff_ptr[scan[c]] = (INT16) get_signed(br, 1);
- ++c;
- continue;
- }
- // LOW_VAL_CONTEXT_NODE_0_
- if (!vp9_read(br, prob[LOW_VAL_CONTEXT_NODE])) {
- if (!vp9_read(br, prob[TWO_CONTEXT_NODE])) {
- WRITE_COEF_CONTINUE(2);
- }
- if (!vp9_read(br, prob[THREE_CONTEXT_NODE])) {
- WRITE_COEF_CONTINUE(3);
- }
- WRITE_COEF_CONTINUE(4);
- }
- // HIGH_LOW_CONTEXT_NODE_0_
- if (!vp9_read(br, prob[HIGH_LOW_CONTEXT_NODE])) {
- if (!vp9_read(br, prob[CAT_ONE_CONTEXT_NODE])) {
- val = CAT1_MIN_VAL;
- ADJUST_COEF(CAT1_PROB0, 0);
- WRITE_COEF_CONTINUE(val);
- }
- val = CAT2_MIN_VAL;
- ADJUST_COEF(CAT2_PROB1, 1);
- ADJUST_COEF(CAT2_PROB0, 0);
- WRITE_COEF_CONTINUE(val);
- }
- // CAT_THREEFOUR_CONTEXT_NODE_0_
- if (!vp9_read(br, prob[CAT_THREEFOUR_CONTEXT_NODE])) {
- if (!vp9_read(br, prob[CAT_THREE_CONTEXT_NODE])) {
- val = CAT3_MIN_VAL;
- ADJUST_COEF(CAT3_PROB2, 2);
- ADJUST_COEF(CAT3_PROB1, 1);
- ADJUST_COEF(CAT3_PROB0, 0);
- WRITE_COEF_CONTINUE(val);
- }
- val = CAT4_MIN_VAL;
- ADJUST_COEF(CAT4_PROB3, 3);
- ADJUST_COEF(CAT4_PROB2, 2);
- ADJUST_COEF(CAT4_PROB1, 1);
- ADJUST_COEF(CAT4_PROB0, 0);
- WRITE_COEF_CONTINUE(val);
- }
- // CAT_FIVE_CONTEXT_NODE_0_:
- if (!vp9_read(br, prob[CAT_FIVE_CONTEXT_NODE])) {
- val = CAT5_MIN_VAL;
- ADJUST_COEF(CAT5_PROB4, 4);
- ADJUST_COEF(CAT5_PROB3, 3);
- ADJUST_COEF(CAT5_PROB2, 2);
- ADJUST_COEF(CAT5_PROB1, 1);
- ADJUST_COEF(CAT5_PROB0, 0);
- WRITE_COEF_CONTINUE(val);
- }
- val = 0;
- while (*cat6) {
- val = (val << 1) | vp9_read(br, *cat6++);
- }
- val += CAT6_MIN_VAL;
- WRITE_COEF_CONTINUE(val);
- }
-
- if (block_type == TX_4X4) {
- count_tokens_adaptive_scan(xd, qcoeff_ptr, i, type,
- tx_type,
- a, l, c, seg_eob, fc);
- }
- else if (block_type == TX_8X8)
- count_tokens_8x8(qcoeff_ptr, i, type,
- tx_type,
- a, l, c, seg_eob, fc);
- else
- count_tokens_16x16(qcoeff_ptr, i, type,
- tx_type,
- a, l, c, seg_eob, fc);
- return c;
-}
-
-int vp9_decode_mb_tokens_16x16(VP9D_COMP *pbi, MACROBLOCKD *xd,
- BOOL_DECODER* const bc) {
- ENTROPY_CONTEXT* const A = (ENTROPY_CONTEXT *)xd->above_context;
- ENTROPY_CONTEXT* const L = (ENTROPY_CONTEXT *)xd->left_context;
-
- char* const eobs = xd->eobs;
- PLANE_TYPE type;
- int c, i, eobtotal = 0, seg_eob;
- const int segment_id = xd->mode_info_context->mbmi.segment_id;
- const int seg_active = vp9_segfeature_active(xd, segment_id, SEG_LVL_EOB);
- INT16 *qcoeff_ptr = &xd->qcoeff[0];
- TX_TYPE tx_type = get_tx_type(xd, &xd->block[0]);
-
- type = PLANE_TYPE_Y_WITH_DC;
-
- if (seg_active)
- seg_eob = vp9_get_segdata(xd, segment_id, SEG_LVL_EOB);
- else
- seg_eob = 256;
-
- // Luma block
- {
- const int* const scan = vp9_default_zig_zag1d_16x16;
- c = decode_coefs(pbi, xd, bc, A, L, type,
- tx_type,
- seg_eob, qcoeff_ptr,
- 0, scan, TX_16X16, coef_bands_x_16x16);
- eobs[0] = c;
- A[0] = L[0] = (c != !type);
- A[1] = A[2] = A[3] = A[0];
- L[1] = L[2] = L[3] = L[0];
- eobtotal += c;
- }
-
- // 8x8 chroma blocks
- qcoeff_ptr += 256;
- type = PLANE_TYPE_UV;
- tx_type = DCT_DCT;
- if (seg_active)
- seg_eob = vp9_get_segdata(xd, segment_id, SEG_LVL_EOB);
- else
- seg_eob = 64;
- for (i = 16; i < 24; i += 4) {
- ENTROPY_CONTEXT* const a = A + vp9_block2above_8x8[i];
- ENTROPY_CONTEXT* const l = L + vp9_block2left_8x8[i];
- const int* const scan = vp9_default_zig_zag1d_8x8;
-
- c = decode_coefs(pbi, xd, bc, a, l, type,
- tx_type,
- seg_eob, qcoeff_ptr,
- i, scan, TX_8X8, coef_bands_x_8x8);
- a[0] = l[0] = ((eobs[i] = c) != !type);
- a[1] = a[0];
- l[1] = l[0];
-
- eobtotal += c;
- qcoeff_ptr += 64;
- }
- vpx_memset(&A[8], 0, sizeof(A[8]));
- vpx_memset(&L[8], 0, sizeof(L[8]));
- return eobtotal;
-}
-
-int vp9_decode_mb_tokens_8x8(VP9D_COMP *pbi, MACROBLOCKD *xd,
- BOOL_DECODER* const bc) {
- ENTROPY_CONTEXT *const A = (ENTROPY_CONTEXT *)xd->above_context;
- ENTROPY_CONTEXT *const L = (ENTROPY_CONTEXT *)xd->left_context;
-
- char *const eobs = xd->eobs;
- PLANE_TYPE type;
- int c, i, eobtotal = 0, seg_eob;
- const int segment_id = xd->mode_info_context->mbmi.segment_id;
- const int seg_active = vp9_segfeature_active(xd, segment_id, SEG_LVL_EOB);
- INT16 *qcoeff_ptr = &xd->qcoeff[0];
- TX_TYPE tx_type = DCT_DCT;
-
- int bufthred = (xd->mode_info_context->mbmi.mode == I8X8_PRED ||
- xd->mode_info_context->mbmi.mode == SPLITMV) ? 16 : 24;
- if (xd->mode_info_context->mbmi.mode != B_PRED &&
- xd->mode_info_context->mbmi.mode != SPLITMV &&
- xd->mode_info_context->mbmi.mode != I8X8_PRED) {
- ENTROPY_CONTEXT *const a = A + vp9_block2above_8x8[24];
- ENTROPY_CONTEXT *const l = L + vp9_block2left_8x8[24];
- const int *const scan = vp9_default_zig_zag1d;
- type = PLANE_TYPE_Y2;
-
- if (seg_active)
- seg_eob = vp9_get_segdata(xd, segment_id, SEG_LVL_EOB);
- else
- seg_eob = 4;
- c = decode_coefs(pbi, xd, bc, a, l, type,
- tx_type,
- seg_eob, qcoeff_ptr + 24 * 16,
- 24, scan, TX_8X8, coef_bands_x);
- a[0] = l[0] = ((eobs[24] = c) != !type);
-
- eobtotal += c - 4;
-
- type = PLANE_TYPE_Y_NO_DC;
- } else
- type = PLANE_TYPE_Y_WITH_DC;
-
- if (seg_active)
- seg_eob = vp9_get_segdata(xd, segment_id, SEG_LVL_EOB);
- else
- seg_eob = 64;
-
- for (i = 0; i < bufthred ; i += 4) {
- ENTROPY_CONTEXT *const a = A + vp9_block2above_8x8[i];
- ENTROPY_CONTEXT *const l = L + vp9_block2left_8x8[i];
- const int *const scan = vp9_default_zig_zag1d_8x8;
- tx_type = DCT_DCT;
-
- if (i == 16)
- type = PLANE_TYPE_UV;
- if (type == PLANE_TYPE_Y_WITH_DC) {
- tx_type = get_tx_type(xd, xd->block + i);
- }
-
- c = decode_coefs(pbi, xd, bc, a, l, type,
- tx_type,
- seg_eob, qcoeff_ptr,
- i, scan, TX_8X8, coef_bands_x_8x8);
- a[0] = l[0] = ((eobs[i] = c) != !type);
- a[1] = a[0];
- l[1] = l[0];
-
- eobtotal += c;
- qcoeff_ptr += 64;
- }
-
- if (bufthred == 16) {
- type = PLANE_TYPE_UV;
- tx_type = DCT_DCT;
- seg_eob = 16;
-
- // use 4x4 transform for U, V components in I8X8 prediction mode
- for (i = 16; i < 24; i++) {
- ENTROPY_CONTEXT *const a = A + vp9_block2above[i];
- ENTROPY_CONTEXT *const l = L + vp9_block2left[i];
- const int *scan = vp9_default_zig_zag1d;
-
- c = decode_coefs(pbi, xd, bc, a, l, type,
- tx_type,
- seg_eob, qcoeff_ptr,
- i, scan, TX_4X4, coef_bands_x);
- a[0] = l[0] = ((eobs[i] = c) != !type);
-
- eobtotal += c;
- qcoeff_ptr += 16;
- }
- }
-
- return eobtotal;
-}
-
-
-int vp9_decode_mb_tokens(VP9D_COMP *dx, MACROBLOCKD *xd,
- BOOL_DECODER* const bc) {
- ENTROPY_CONTEXT *const A = (ENTROPY_CONTEXT *)xd->above_context;
- ENTROPY_CONTEXT *const L = (ENTROPY_CONTEXT *)xd->left_context;
-
- char *const eobs = xd->eobs;
- const int *scan = vp9_default_zig_zag1d;
- PLANE_TYPE type;
- int c, i, eobtotal = 0, seg_eob = 16;
- INT16 *qcoeff_ptr = &xd->qcoeff[0];
-
- int segment_id = xd->mode_info_context->mbmi.segment_id;
- if (vp9_segfeature_active(xd, segment_id, SEG_LVL_EOB))
- seg_eob = vp9_get_segdata(xd, segment_id, SEG_LVL_EOB);
-
- if (xd->mode_info_context->mbmi.mode != B_PRED &&
- xd->mode_info_context->mbmi.mode != I8X8_PRED &&
- xd->mode_info_context->mbmi.mode != SPLITMV) {
- ENTROPY_CONTEXT *const a = A + vp9_block2above[24];
- ENTROPY_CONTEXT *const l = L + vp9_block2left[24];
- type = PLANE_TYPE_Y2;
-
- c = decode_coefs(dx, xd, bc, a, l, type,
- DCT_DCT,
- seg_eob, qcoeff_ptr + 24 * 16, 24,
- scan, TX_4X4, coef_bands_x);
- a[0] = l[0] = ((eobs[24] = c) != !type);
- eobtotal += c - 16;
-
- type = PLANE_TYPE_Y_NO_DC;
- } else {
- type = PLANE_TYPE_Y_WITH_DC;
- }
-
- for (i = 0; i < 24; ++i) {
- ENTROPY_CONTEXT *const a = A + vp9_block2above[i];
- ENTROPY_CONTEXT *const l = L + vp9_block2left[i];
- TX_TYPE tx_type = DCT_DCT;
- if (i == 16)
- type = PLANE_TYPE_UV;
-
- tx_type = get_tx_type(xd, &xd->block[i]);
- switch(tx_type) {
- case ADST_DCT :
- scan = vp9_row_scan;
- break;
-
- case DCT_ADST :
- scan = vp9_col_scan;
- break;
-
- default :
- scan = vp9_default_zig_zag1d;
- break;
- }
-
- c = decode_coefs(dx, xd, bc, a, l, type, tx_type,
- seg_eob, qcoeff_ptr,
- i, scan, TX_4X4, coef_bands_x);
- a[0] = l[0] = ((eobs[i] = c) != !type);
-
- eobtotal += c;
- qcoeff_ptr += 16;
- }
-
- return eobtotal;
-}
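Note on the token write-back in the decode loops above: each magnitude is stored through the active scan table, qcoeff_ptr[scan[c]] = (INT16) get_signed(br, val), so tokens are decoded in scan order but land in raster order inside the block. A minimal standalone C sketch of that scatter step, assuming the conventional 4x4 zig-zag order; the table and helper names below are illustrative, not part of this change:

    #include <stdint.h>
    #include <string.h>

    /* Raster position of the c-th coefficient in the usual 4x4 zig-zag scan. */
    static const int zigzag4x4[16] = {
      0, 1,  4,  8,
      5, 2,  3,  6,
      9, 12, 13, 10,
      7, 11, 14, 15
    };

    /* Mirror of qcoeff_ptr[scan[c]] = value: coefficients decoded in scan order
     * are scattered into a zero-filled 4x4 block held in raster order. */
    static void scan_to_raster(const int16_t *values_in_scan_order, int eob,
                               int16_t block[16]) {
      int c;
      memset(block, 0, 16 * sizeof(block[0]));
      for (c = 0; c < eob; ++c)
        block[zigzag4x4[c]] = values_in_scan_order[c];
    }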
--- a/vp8/decoder/detokenize.h
+++ /dev/null
@@ -1,25 +1,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#ifndef DETOKENIZE_H
-#define DETOKENIZE_H
-
-#include "onyxd_int.h"
-
-void vp9_reset_mb_tokens_context(MACROBLOCKD* const);
-int vp9_decode_mb_tokens(VP9D_COMP* const, MACROBLOCKD* const,
- BOOL_DECODER* const);
-int vp9_decode_mb_tokens_8x8(VP9D_COMP* const, MACROBLOCKD* const,
- BOOL_DECODER* const);
-int vp9_decode_mb_tokens_16x16(VP9D_COMP* const, MACROBLOCKD* const,
- BOOL_DECODER* const);
-
-#endif /* DETOKENIZE_H */
--- a/vp8/decoder/idct_blk.c
+++ /dev/null
@@ -1,292 +1,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "vpx_ports/config.h"
-#include "vp8/common/idct.h"
-#include "dequantize.h"
-
-void vp9_dequant_dc_idct_add_c(short *input, short *dq, unsigned char *pred,
- unsigned char *dest, int pitch, int stride,
- int Dc);
-void vp9_dequant_idct_add_c(short *input, short *dq, unsigned char *pred,
- unsigned char *dest, int pitch, int stride);
-void vp9_dc_only_idct_add_c(short input_dc, unsigned char *pred_ptr,
- unsigned char *dst_ptr, int pitch, int stride);
-#if CONFIG_LOSSLESS
-void vp9_dequant_idct_add_lossless_c(short *input, short *dq,
- unsigned char *pred, unsigned char *dest,
- int pitch, int stride);
-void vp9_dc_only_idct_add_lossless_c(short input_dc, unsigned char *pred_ptr,
- unsigned char *dst_ptr,
- int pitch, int stride);
-#endif
-
-void vp9_dequant_dc_idct_add_y_block_c(short *q, short *dq,
- unsigned char *pre,
- unsigned char *dst,
- int stride, char *eobs,
- short *dc) {
- int i, j;
-
- for (i = 0; i < 4; i++) {
- for (j = 0; j < 4; j++) {
- if (*eobs++ > 1)
- vp9_dequant_dc_idct_add_c(q, dq, pre, dst, 16, stride, dc[0]);
- else
- vp9_dc_only_idct_add_c(dc[0], pre, dst, 16, stride);
-
- q += 16;
- pre += 4;
- dst += 4;
- dc++;
- }
-
- pre += 64 - 16;
- dst += 4 * stride - 16;
- }
-}
-
-void vp9_dequant_idct_add_y_block_c(short *q, short *dq,
- unsigned char *pre,
- unsigned char *dst,
- int stride, char *eobs) {
- int i, j;
-
- for (i = 0; i < 4; i++) {
- for (j = 0; j < 4; j++) {
- if (*eobs++ > 1)
- vp9_dequant_idct_add_c(q, dq, pre, dst, 16, stride);
- else {
- vp9_dc_only_idct_add_c(q[0]*dq[0], pre, dst, 16, stride);
- ((int *)q)[0] = 0;
- }
-
- q += 16;
- pre += 4;
- dst += 4;
- }
-
- pre += 64 - 16;
- dst += 4 * stride - 16;
- }
-}
-
-void vp9_dequant_idct_add_uv_block_c(short *q, short *dq, unsigned char *pre,
- unsigned char *dstu, unsigned char *dstv,
- int stride, char *eobs) {
- int i, j;
-
- for (i = 0; i < 2; i++) {
- for (j = 0; j < 2; j++) {
- if (*eobs++ > 1)
- vp9_dequant_idct_add_c(q, dq, pre, dstu, 8, stride);
- else {
- vp9_dc_only_idct_add_c(q[0]*dq[0], pre, dstu, 8, stride);
- ((int *)q)[0] = 0;
- }
-
- q += 16;
- pre += 4;
- dstu += 4;
- }
-
- pre += 32 - 8;
- dstu += 4 * stride - 8;
- }
-
- for (i = 0; i < 2; i++) {
- for (j = 0; j < 2; j++) {
- if (*eobs++ > 1)
- vp9_dequant_idct_add_c(q, dq, pre, dstv, 8, stride);
- else {
- vp9_dc_only_idct_add_c(q[0]*dq[0], pre, dstv, 8, stride);
- ((int *)q)[0] = 0;
- }
-
- q += 16;
- pre += 4;
- dstv += 4;
- }
-
- pre += 32 - 8;
- dstv += 4 * stride - 8;
- }
-}
-
-
-void vp9_dequant_dc_idct_add_y_block_8x8_c(short *q, short *dq,
- unsigned char *pre,
- unsigned char *dst,
- int stride, char *eobs, short *dc,
- MACROBLOCKD *xd) {
- vp9_dequant_dc_idct_add_8x8_c(q, dq, pre, dst, 16, stride, dc[0]);
- vp9_dequant_dc_idct_add_8x8_c(&q[64], dq, pre + 8, dst + 8, 16, stride, dc[1]);
- vp9_dequant_dc_idct_add_8x8_c(&q[128], dq, pre + 8 * 16,
- dst + 8 * stride, 16, stride, dc[4]);
- vp9_dequant_dc_idct_add_8x8_c(&q[192], dq, pre + 8 * 16 + 8,
- dst + 8 * stride + 8, 16, stride, dc[8]);
-}
-
-#if CONFIG_SUPERBLOCKS
-void vp9_dequant_dc_idct_add_y_block_8x8_inplace_c(short *q, short *dq,
- unsigned char *dst,
- int stride, char *eobs,
- short *dc, MACROBLOCKD *xd) {
- vp9_dequant_dc_idct_add_8x8_c(q, dq, dst, dst, stride, stride, dc[0]);
- vp9_dequant_dc_idct_add_8x8_c(&q[64], dq, dst + 8,
- dst + 8, stride, stride, dc[1]);
- vp9_dequant_dc_idct_add_8x8_c(&q[128], dq, dst + 8 * stride,
- dst + 8 * stride, stride, stride, dc[4]);
- vp9_dequant_dc_idct_add_8x8_c(&q[192], dq, dst + 8 * stride + 8,
- dst + 8 * stride + 8, stride, stride, dc[8]);
-}
-#endif
-
-void vp9_dequant_idct_add_y_block_8x8_c(short *q, short *dq,
- unsigned char *pre,
- unsigned char *dst,
- int stride, char *eobs,
- MACROBLOCKD *xd) {
- unsigned char *origdest = dst;
- unsigned char *origpred = pre;
-
- vp9_dequant_idct_add_8x8_c(q, dq, pre, dst, 16, stride);
- vp9_dequant_idct_add_8x8_c(&q[64], dq, origpred + 8,
- origdest + 8, 16, stride);
- vp9_dequant_idct_add_8x8_c(&q[128], dq, origpred + 8 * 16,
- origdest + 8 * stride, 16, stride);
- vp9_dequant_idct_add_8x8_c(&q[192], dq, origpred + 8 * 16 + 8,
- origdest + 8 * stride + 8, 16, stride);
-}
-
-void vp9_dequant_idct_add_uv_block_8x8_c(short *q, short *dq,
- unsigned char *pre,
- unsigned char *dstu,
- unsigned char *dstv,
- int stride, char *eobs,
- MACROBLOCKD *xd) {
- vp9_dequant_idct_add_8x8_c(q, dq, pre, dstu, 8, stride);
-
- q += 64;
- pre += 64;
-
- vp9_dequant_idct_add_8x8_c(q, dq, pre, dstv, 8, stride);
-}
-
-#if CONFIG_SUPERBLOCKS
-void vp9_dequant_idct_add_uv_block_8x8_inplace_c(short *q, short *dq,
- unsigned char *dstu,
- unsigned char *dstv,
- int stride, char *eobs,
- MACROBLOCKD *xd) {
- vp9_dequant_idct_add_8x8_c(q, dq, dstu, dstu, stride, stride);
-
- q += 64;
-
- vp9_dequant_idct_add_8x8_c(q, dq, dstv, dstv, stride, stride);
-}
-#endif
-
-#if CONFIG_LOSSLESS
-void vp9_dequant_dc_idct_add_y_block_lossless_c(short *q, short *dq,
- unsigned char *pre,
- unsigned char *dst,
- int stride, char *eobs,
- short *dc) {
- int i, j;
-
- for (i = 0; i < 4; i++) {
- for (j = 0; j < 4; j++) {
- if (*eobs++ > 1)
- vp9_dequant_dc_idct_add_lossless_c(q, dq, pre, dst, 16, stride, dc[0]);
- else
- vp9_dc_only_inv_walsh_add_c(dc[0], pre, dst, 16, stride);
-
- q += 16;
- pre += 4;
- dst += 4;
- dc++;
- }
-
- pre += 64 - 16;
- dst += 4 * stride - 16;
- }
-}
-
-void vp9_dequant_idct_add_y_block_lossless_c(short *q, short *dq,
- unsigned char *pre,
- unsigned char *dst,
- int stride, char *eobs) {
- int i, j;
-
- for (i = 0; i < 4; i++) {
- for (j = 0; j < 4; j++) {
- if (*eobs++ > 1)
- vp9_dequant_idct_add_lossless_c(q, dq, pre, dst, 16, stride);
- else {
- vp9_dc_only_inv_walsh_add_c(q[0]*dq[0], pre, dst, 16, stride);
- ((int *)q)[0] = 0;
- }
-
- q += 16;
- pre += 4;
- dst += 4;
- }
-
- pre += 64 - 16;
- dst += 4 * stride - 16;
- }
-}
-
-void vp9_dequant_idct_add_uv_block_lossless_c(short *q, short *dq,
- unsigned char *pre,
- unsigned char *dstu,
- unsigned char *dstv,
- int stride, char *eobs) {
- int i, j;
-
- for (i = 0; i < 2; i++) {
- for (j = 0; j < 2; j++) {
- if (*eobs++ > 1)
- vp9_dequant_idct_add_lossless_c(q, dq, pre, dstu, 8, stride);
- else {
- vp9_dc_only_inv_walsh_add_c(q[0]*dq[0], pre, dstu, 8, stride);
- ((int *)q)[0] = 0;
- }
-
- q += 16;
- pre += 4;
- dstu += 4;
- }
-
- pre += 32 - 8;
- dstu += 4 * stride - 8;
- }
-
- for (i = 0; i < 2; i++) {
- for (j = 0; j < 2; j++) {
- if (*eobs++ > 1)
- vp9_dequant_idct_add_lossless_c(q, dq, pre, dstv, 8, stride);
- else {
- vp9_dc_only_inv_walsh_add_c(q[0]*dq[0], pre, dstv, 8, stride);
- ((int *)q)[0] = 0;
- }
-
- q += 16;
- pre += 4;
- dstv += 4;
- }
-
- pre += 32 - 8;
- dstv += 4 * stride - 8;
- }
-}
-#endif
-
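Every loop in the file above applies the same per-block rule: an eob greater than 1 means AC coefficients are present, so the full dequantize + IDCT + add path runs; otherwise only the DC term contributes, a cheaper DC-only add is used, and the leading coefficients are cleared. A rough standalone sketch of that dispatch for the non-DC-split case (DC taken as q[0]*dq[0]); the function names are stand-ins, not the real kernels:

    #include <string.h>

    /* Stubs standing in for the real dequant/IDCT kernels. */
    static void full_idct_add(short *q, const short *dq, const unsigned char *pred,
                              unsigned char *dst, int pitch, int stride) { /* ... */ }
    static void dc_only_add(short dc, const unsigned char *pred, unsigned char *dst,
                            int pitch, int stride) { /* ... */ }

    /* One 4x4 sub-block: take the cheap path when only DC is coded. */
    static void dequant_idct_add_block(short *q, const short *dq,
                                       const unsigned char *pred, unsigned char *dst,
                                       int pitch, int stride, char eob) {
      if (eob > 1) {
        full_idct_add(q, dq, pred, dst, pitch, stride);
      } else {
        dc_only_add((short)(q[0] * dq[0]), pred, dst, pitch, stride);
        /* Same effect as ((int *)q)[0] = 0 above, assuming 16-bit coefficients. */
        memset(q, 0, 2 * sizeof(q[0]));
      }
    }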
--- a/vp8/decoder/onyxd_if.c
+++ /dev/null
@@ -1,506 +1,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include "vp8/common/onyxc_int.h"
-#if CONFIG_POSTPROC
-#include "vp8/common/postproc.h"
-#endif
-#include "vp8/common/onyxd.h"
-#include "onyxd_int.h"
-#include "vpx_mem/vpx_mem.h"
-#include "vp8/common/alloccommon.h"
-#include "vpx_scale/yv12extend.h"
-#include "vp8/common/loopfilter.h"
-#include "vp8/common/swapyv12buffer.h"
-#include <stdio.h>
-#include <assert.h>
-
-#include "vp8/common/quant_common.h"
-#include "vpx_scale/vpxscale.h"
-#include "vp8/common/systemdependent.h"
-#include "vpx_ports/vpx_timer.h"
-#include "detokenize.h"
-#if ARCH_ARM
-#include "vpx_ports/arm.h"
-#endif
-
-extern void vp9_init_de_quantizer(VP9D_COMP *pbi);
-static int get_free_fb(VP9_COMMON *cm);
-static void ref_cnt_fb(int *buf, int *idx, int new_idx);
-
-#if CONFIG_DEBUG
-static void recon_write_yuv_frame(char *name, YV12_BUFFER_CONFIG *s) {
- FILE *yuv_file = fopen((char *)name, "ab");
- unsigned char *src = s->y_buffer;
- int h = s->y_height;
-
- do {
- fwrite(src, s->y_width, 1, yuv_file);
- src += s->y_stride;
- } while (--h);
-
- src = s->u_buffer;
- h = s->uv_height;
-
- do {
- fwrite(src, s->uv_width, 1, yuv_file);
- src += s->uv_stride;
- } while (--h);
-
- src = s->v_buffer;
- h = s->uv_height;
-
- do {
- fwrite(src, s->uv_width, 1, yuv_file);
- src += s->uv_stride;
- } while (--h);
-
- fclose(yuv_file);
-}
-#endif
-#define WRITE_RECON_BUFFER 0
-#if WRITE_RECON_BUFFER
-void write_dx_frame_to_file(YV12_BUFFER_CONFIG *frame, int this_frame) {
-
- // write the frame
- FILE *yframe;
- int i;
- char filename[255];
-
- sprintf(filename, "dx\\y%04d.raw", this_frame);
- yframe = fopen(filename, "wb");
-
- for (i = 0; i < frame->y_height; i++)
- fwrite(frame->y_buffer + i * frame->y_stride,
- frame->y_width, 1, yframe);
-
- fclose(yframe);
- sprintf(filename, "dx\\u%04d.raw", this_frame);
- yframe = fopen(filename, "wb");
-
- for (i = 0; i < frame->uv_height; i++)
- fwrite(frame->u_buffer + i * frame->uv_stride,
- frame->uv_width, 1, yframe);
-
- fclose(yframe);
- sprintf(filename, "dx\\v%04d.raw", this_frame);
- yframe = fopen(filename, "wb");
-
- for (i = 0; i < frame->uv_height; i++)
- fwrite(frame->v_buffer + i * frame->uv_stride,
- frame->uv_width, 1, yframe);
-
- fclose(yframe);
-}
-#endif
-
-void vp9_initialize_dec(void) {
- static int init_done = 0;
-
- if (!init_done) {
- vp9_initialize_common();
- vp9_init_quant_tables();
- vp8_scale_machine_specific_config();
- init_done = 1;
- }
-}
-
-VP9D_PTR vp9_create_decompressor(VP9D_CONFIG *oxcf) {
- VP9D_COMP *pbi = vpx_memalign(32, sizeof(VP9D_COMP));
-
- if (!pbi)
- return NULL;
-
- vpx_memset(pbi, 0, sizeof(VP9D_COMP));
-
- if (setjmp(pbi->common.error.jmp)) {
- pbi->common.error.setjmp = 0;
- vp9_remove_decompressor(pbi);
- return 0;
- }
-
- pbi->common.error.setjmp = 1;
- vp9_initialize_dec();
-
- vp9_create_common(&pbi->common);
-
- pbi->common.current_video_frame = 0;
- pbi->ready_for_new_data = 1;
-
- /* vp9_init_de_quantizer() is first called here. Add check in
- * frame_init_dequantizer() to avoid unnecessary calling of
- * vp9_init_de_quantizer() for every frame.
- */
- vp9_init_de_quantizer(pbi);
-
- vp9_loop_filter_init(&pbi->common);
-
- pbi->common.error.setjmp = 0;
-
- pbi->decoded_key_frame = 0;
-
- return (VP9D_PTR) pbi;
-}
-
-void vp9_remove_decompressor(VP9D_PTR ptr) {
- VP9D_COMP *pbi = (VP9D_COMP *) ptr;
-
- if (!pbi)
- return;
-
-  // Delete segmentation map
- if (pbi->common.last_frame_seg_map != 0)
- vpx_free(pbi->common.last_frame_seg_map);
-
- vp9_remove_common(&pbi->common);
- vpx_free(pbi->mbc);
- vpx_free(pbi);
-}
-
-
-vpx_codec_err_t vp9_get_reference_dec(VP9D_PTR ptr, VP9_REFFRAME ref_frame_flag,
- YV12_BUFFER_CONFIG *sd) {
- VP9D_COMP *pbi = (VP9D_COMP *) ptr;
- VP9_COMMON *cm = &pbi->common;
- int ref_fb_idx;
-
- if (ref_frame_flag == VP9_LAST_FLAG)
- ref_fb_idx = cm->lst_fb_idx;
- else if (ref_frame_flag == VP9_GOLD_FLAG)
- ref_fb_idx = cm->gld_fb_idx;
- else if (ref_frame_flag == VP9_ALT_FLAG)
- ref_fb_idx = cm->alt_fb_idx;
- else {
- vpx_internal_error(&pbi->common.error, VPX_CODEC_ERROR,
- "Invalid reference frame");
- return pbi->common.error.error_code;
- }
-
- if (cm->yv12_fb[ref_fb_idx].y_height != sd->y_height ||
- cm->yv12_fb[ref_fb_idx].y_width != sd->y_width ||
- cm->yv12_fb[ref_fb_idx].uv_height != sd->uv_height ||
- cm->yv12_fb[ref_fb_idx].uv_width != sd->uv_width) {
- vpx_internal_error(&pbi->common.error, VPX_CODEC_ERROR,
- "Incorrect buffer dimensions");
- } else
- vp8_yv12_copy_frame_ptr(&cm->yv12_fb[ref_fb_idx], sd);
-
- return pbi->common.error.error_code;
-}
-
-
-vpx_codec_err_t vp9_set_reference_dec(VP9D_PTR ptr, VP9_REFFRAME ref_frame_flag,
- YV12_BUFFER_CONFIG *sd) {
- VP9D_COMP *pbi = (VP9D_COMP *) ptr;
- VP9_COMMON *cm = &pbi->common;
- int *ref_fb_ptr = NULL;
- int free_fb;
-
- if (ref_frame_flag == VP9_LAST_FLAG)
- ref_fb_ptr = &cm->lst_fb_idx;
- else if (ref_frame_flag == VP9_GOLD_FLAG)
- ref_fb_ptr = &cm->gld_fb_idx;
- else if (ref_frame_flag == VP9_ALT_FLAG)
- ref_fb_ptr = &cm->alt_fb_idx;
- else {
- vpx_internal_error(&pbi->common.error, VPX_CODEC_ERROR,
- "Invalid reference frame");
- return pbi->common.error.error_code;
- }
-
- if (cm->yv12_fb[*ref_fb_ptr].y_height != sd->y_height ||
- cm->yv12_fb[*ref_fb_ptr].y_width != sd->y_width ||
- cm->yv12_fb[*ref_fb_ptr].uv_height != sd->uv_height ||
- cm->yv12_fb[*ref_fb_ptr].uv_width != sd->uv_width) {
- vpx_internal_error(&pbi->common.error, VPX_CODEC_ERROR,
- "Incorrect buffer dimensions");
- } else {
- /* Find an empty frame buffer. */
- free_fb = get_free_fb(cm);
- /* Decrease fb_idx_ref_cnt since it will be increased again in
- * ref_cnt_fb() below. */
- cm->fb_idx_ref_cnt[free_fb]--;
-
- /* Manage the reference counters and copy image. */
- ref_cnt_fb(cm->fb_idx_ref_cnt, ref_fb_ptr, free_fb);
- vp8_yv12_copy_frame_ptr(sd, &cm->yv12_fb[*ref_fb_ptr]);
- }
-
- return pbi->common.error.error_code;
-}
-
-/*For ARM NEON, d8-d15 are callee-saved registers, and need to be saved by us.*/
-#if HAVE_ARMV7
-extern void vp9_push_neon(int64_t *store);
-extern void vp9_pop_neon(int64_t *store);
-#endif
-
-static int get_free_fb(VP9_COMMON *cm) {
- int i;
- for (i = 0; i < NUM_YV12_BUFFERS; i++)
- if (cm->fb_idx_ref_cnt[i] == 0)
- break;
-
- assert(i < NUM_YV12_BUFFERS);
- cm->fb_idx_ref_cnt[i] = 1;
- return i;
-}
-
-static void ref_cnt_fb(int *buf, int *idx, int new_idx) {
- if (buf[*idx] > 0)
- buf[*idx]--;
-
- *idx = new_idx;
-
- buf[new_idx]++;
-}
-
-/* If any buffer copy / swapping is signalled it should be done here. */
-static int swap_frame_buffers(VP9_COMMON *cm) {
- int err = 0;
-
- /* The alternate reference frame or golden frame can be updated
- * using the new, last, or golden/alt ref frame. If it
- * is updated using the newly decoded frame it is a refresh.
- * An update using the last or golden/alt ref frame is a copy.
- */
- if (cm->copy_buffer_to_arf) {
- int new_fb = 0;
-
- if (cm->copy_buffer_to_arf == 1)
- new_fb = cm->lst_fb_idx;
- else if (cm->copy_buffer_to_arf == 2)
- new_fb = cm->gld_fb_idx;
- else
- err = -1;
-
- ref_cnt_fb(cm->fb_idx_ref_cnt, &cm->alt_fb_idx, new_fb);
- }
-
- if (cm->copy_buffer_to_gf) {
- int new_fb = 0;
-
- if (cm->copy_buffer_to_gf == 1)
- new_fb = cm->lst_fb_idx;
- else if (cm->copy_buffer_to_gf == 2)
- new_fb = cm->alt_fb_idx;
- else
- err = -1;
-
- ref_cnt_fb(cm->fb_idx_ref_cnt, &cm->gld_fb_idx, new_fb);
- }
-
- if (cm->refresh_golden_frame)
- ref_cnt_fb(cm->fb_idx_ref_cnt, &cm->gld_fb_idx, cm->new_fb_idx);
-
- if (cm->refresh_alt_ref_frame)
- ref_cnt_fb(cm->fb_idx_ref_cnt, &cm->alt_fb_idx, cm->new_fb_idx);
-
- if (cm->refresh_last_frame) {
- ref_cnt_fb(cm->fb_idx_ref_cnt, &cm->lst_fb_idx, cm->new_fb_idx);
-
- cm->frame_to_show = &cm->yv12_fb[cm->lst_fb_idx];
- } else
- cm->frame_to_show = &cm->yv12_fb[cm->new_fb_idx];
-
- cm->fb_idx_ref_cnt[cm->new_fb_idx]--;
-
- return err;
-}
-
-int vp9_receive_compressed_data(VP9D_PTR ptr, unsigned long size,
- const unsigned char *source,
- int64_t time_stamp) {
-#if HAVE_ARMV7
- int64_t dx_store_reg[8];
-#endif
- VP9D_COMP *pbi = (VP9D_COMP *) ptr;
- VP9_COMMON *cm = &pbi->common;
- int retcode = 0;
-
- /*if(pbi->ready_for_new_data == 0)
- return -1;*/
-
- if (ptr == 0) {
- return -1;
- }
-
- pbi->common.error.error_code = VPX_CODEC_OK;
-
- pbi->Source = source;
- pbi->source_sz = size;
-
- if (pbi->source_sz == 0) {
- /* This is used to signal that we are missing frames.
- * We do not know if the missing frame(s) was supposed to update
-     * any of the reference buffers, but we act conservatively and
- * mark only the last buffer as corrupted.
- */
- cm->yv12_fb[cm->lst_fb_idx].corrupted = 1;
- }
-
-#if HAVE_ARMV7
-#if CONFIG_RUNTIME_CPU_DETECT
- if (cm->rtcd.flags & HAS_NEON)
-#endif
- {
- vp9_push_neon(dx_store_reg);
- }
-#endif
-
- cm->new_fb_idx = get_free_fb(cm);
-
- if (setjmp(pbi->common.error.jmp)) {
-#if HAVE_ARMV7
-#if CONFIG_RUNTIME_CPU_DETECT
- if (cm->rtcd.flags & HAS_NEON)
-#endif
- {
- vp9_pop_neon(dx_store_reg);
- }
-#endif
- pbi->common.error.setjmp = 0;
-
- /* We do not know if the missing frame(s) was supposed to update
-     * any of the reference buffers, but we act conservatively and
- * mark only the last buffer as corrupted.
- */
- cm->yv12_fb[cm->lst_fb_idx].corrupted = 1;
-
- if (cm->fb_idx_ref_cnt[cm->new_fb_idx] > 0)
- cm->fb_idx_ref_cnt[cm->new_fb_idx]--;
- return -1;
- }
-
- pbi->common.error.setjmp = 1;
-
- retcode = vp9_decode_frame(pbi);
-
- if (retcode < 0) {
-#if HAVE_ARMV7
-#if CONFIG_RUNTIME_CPU_DETECT
- if (cm->rtcd.flags & HAS_NEON)
-#endif
- {
- vp9_pop_neon(dx_store_reg);
- }
-#endif
- pbi->common.error.error_code = VPX_CODEC_ERROR;
- pbi->common.error.setjmp = 0;
- if (cm->fb_idx_ref_cnt[cm->new_fb_idx] > 0)
- cm->fb_idx_ref_cnt[cm->new_fb_idx]--;
- return retcode;
- }
-
- {
- if (swap_frame_buffers(cm)) {
-#if HAVE_ARMV7
-#if CONFIG_RUNTIME_CPU_DETECT
- if (cm->rtcd.flags & HAS_NEON)
-#endif
- {
- vp9_pop_neon(dx_store_reg);
- }
-#endif
- pbi->common.error.error_code = VPX_CODEC_ERROR;
- pbi->common.error.setjmp = 0;
- return -1;
- }
-
-#if WRITE_RECON_BUFFER
- if (cm->show_frame)
- write_dx_frame_to_file(cm->frame_to_show,
- cm->current_video_frame);
- else
- write_dx_frame_to_file(cm->frame_to_show,
- cm->current_video_frame + 1000);
-#endif
-
- if (cm->filter_level) {
- /* Apply the loop filter if appropriate. */
- vp9_loop_filter_frame(cm, &pbi->mb);
- }
- vp8_yv12_extend_frame_borders_ptr(cm->frame_to_show);
- }
-
-#if CONFIG_DEBUG
- if (cm->show_frame)
- recon_write_yuv_frame("recon.yuv", cm->frame_to_show);
-#endif
-
- vp9_clear_system_state();
-
- if (cm->show_frame) {
- vpx_memcpy(cm->prev_mip, cm->mip,
- (cm->mb_cols + 1) * (cm->mb_rows + 1)* sizeof(MODE_INFO));
- } else {
- vpx_memset(cm->prev_mip, 0,
- (cm->mb_cols + 1) * (cm->mb_rows + 1)* sizeof(MODE_INFO));
- }
-
- /*vp9_print_modes_and_motion_vectors(cm->mi, cm->mb_rows,cm->mb_cols,
- cm->current_video_frame);*/
-
- if (cm->show_frame)
- cm->current_video_frame++;
-
- pbi->ready_for_new_data = 0;
- pbi->last_time_stamp = time_stamp;
- pbi->source_sz = 0;
-
-#if HAVE_ARMV7
-#if CONFIG_RUNTIME_CPU_DETECT
- if (cm->rtcd.flags & HAS_NEON)
-#endif
- {
- vp9_pop_neon(dx_store_reg);
- }
-#endif
- pbi->common.error.setjmp = 0;
- return retcode;
-}
-
-int vp9_get_raw_frame(VP9D_PTR ptr, YV12_BUFFER_CONFIG *sd,
- int64_t *time_stamp, int64_t *time_end_stamp,
- vp9_ppflags_t *flags) {
- int ret = -1;
- VP9D_COMP *pbi = (VP9D_COMP *) ptr;
-
- if (pbi->ready_for_new_data == 1)
- return ret;
-
- /* ie no raw frame to show!!! */
- if (pbi->common.show_frame == 0)
- return ret;
-
- pbi->ready_for_new_data = 1;
- *time_stamp = pbi->last_time_stamp;
- *time_end_stamp = 0;
-
- sd->clrtype = pbi->common.clr_type;
-#if CONFIG_POSTPROC
- ret = vp9_post_proc_frame(&pbi->common, sd, flags);
-#else
-
- if (pbi->common.frame_to_show) {
- *sd = *pbi->common.frame_to_show;
- sd->y_width = pbi->common.Width;
- sd->y_height = pbi->common.Height;
- sd->uv_height = pbi->common.Height / 2;
- ret = 0;
- } else {
- ret = -1;
- }
-
-#endif /*!CONFIG_POSTPROC*/
- vp9_clear_system_state();
- return ret;
-}
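get_free_fb() and ref_cnt_fb() above implement a small reference-counted pool of frame buffers: a free slot is one whose count has dropped to zero, and repointing one of the named references (last / golden / altref) releases the old slot and claims the new one; swap_frame_buffers() is just a sequence of such repointings driven by the refresh/copy flags. A condensed, self-contained sketch of the bookkeeping; the pool size and helper names are illustrative:

    #include <assert.h>

    #define NUM_BUFFERS 8

    static int ref_cnt[NUM_BUFFERS];  /* one reference count per frame buffer */

    /* Claim an unreferenced buffer, as in get_free_fb(). */
    static int get_free_buffer(void) {
      int i;
      for (i = 0; i < NUM_BUFFERS; i++)
        if (ref_cnt[i] == 0)
          break;
      assert(i < NUM_BUFFERS);
      ref_cnt[i] = 1;
      return i;
    }

    /* Repoint a named reference (last/golden/altref) at another buffer,
     * as in ref_cnt_fb(): drop the old slot, take the new one. */
    static void repoint_reference(int *idx, int new_idx) {
      if (ref_cnt[*idx] > 0)
        ref_cnt[*idx]--;
      *idx = new_idx;
      ref_cnt[new_idx]++;
    }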
--- a/vp8/decoder/onyxd_int.h
+++ /dev/null
@@ -1,106 +1,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#ifndef __INC_ONYXD_INT_H
-#define __INC_ONYXD_INT_H
-#include "vpx_ports/config.h"
-#include "vp8/common/onyxd.h"
-#include "treereader.h"
-#include "vp8/common/onyxc_int.h"
-#include "dequantize.h"
-
-// #define DEC_DEBUG
-
-typedef struct {
- int ithread;
- void *ptr1;
- void *ptr2;
-} DECODETHREAD_DATA;
-
-typedef struct {
- MACROBLOCKD mbd;
- int mb_row;
- int current_mb_col;
- short *coef_ptr;
-} MB_ROW_DEC;
-
-typedef struct {
- int const *scan;
- int const *scan_8x8;
- UINT8 const *ptr_block2leftabove;
- vp9_tree_index const *vp9_coef_tree_ptr;
- unsigned char *norm_ptr;
- UINT8 *ptr_coef_bands_x;
- UINT8 *ptr_coef_bands_x_8x8;
-
- ENTROPY_CONTEXT_PLANES *A;
- ENTROPY_CONTEXT_PLANES *L;
-
- INT16 *qcoeff_start_ptr;
-
- vp9_prob const *coef_probs[BLOCK_TYPES];
- vp9_prob const *coef_probs_8x8[BLOCK_TYPES_8X8];
- vp9_prob const *coef_probs_16X16[BLOCK_TYPES_16X16];
-
- UINT8 eob[25];
-
-} DETOK;
-
-typedef struct VP9Decompressor {
- DECLARE_ALIGNED(16, MACROBLOCKD, mb);
-
- DECLARE_ALIGNED(16, VP9_COMMON, common);
-
- VP9D_CONFIG oxcf;
-
-
- const unsigned char *Source;
- unsigned int source_sz;
-
- vp9_reader *mbc;
- int64_t last_time_stamp;
- int ready_for_new_data;
-
- DETOK detoken;
-
- vp9_dequant_idct_add_fn_t idct_add;
- vp9_dequant_dc_idct_add_fn_t dc_idct_add;
- vp9_dequant_dc_idct_add_y_block_fn_t dc_idct_add_y_block;
- vp9_dequant_idct_add_y_block_fn_t idct_add_y_block;
- vp9_dequant_idct_add_uv_block_fn_t idct_add_uv_block;
-
- vp9_prob prob_skip_false;
-
- int decoded_key_frame;
-
-} VP9D_COMP;
-
-int vp9_decode_frame(VP9D_COMP *cpi);
-
-
-#if CONFIG_DEBUG
-#define CHECK_MEM_ERROR(lval,expr) do {\
- lval = (expr); \
- if(!lval) \
- vpx_internal_error(&pbi->common.error, VPX_CODEC_MEM_ERROR,\
- "Failed to allocate "#lval" at %s:%d", \
- __FILE__,__LINE__);\
- } while(0)
-#else
-#define CHECK_MEM_ERROR(lval,expr) do {\
- lval = (expr); \
- if(!lval) \
- vpx_internal_error(&pbi->common.error, VPX_CODEC_MEM_ERROR,\
- "Failed to allocate "#lval);\
- } while(0)
-#endif
-
-#endif // __INC_ONYXD_INT_H
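CHECK_MEM_ERROR above is the usual allocate-and-bail pattern: evaluate the allocation once, and if it yields NULL report through vpx_internal_error(), which (when a setjmp handler is armed, as in onyxd_if.c) jumps back to the caller's error path. A hypothetical use, assuming a pbi in scope as the macro requires; the field and count here are illustrative only:

    CHECK_MEM_ERROR(pbi->mbc,
                    vpx_calloc(num_token_partitions, sizeof(*pbi->mbc)));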
--- a/vp8/decoder/reconintra_mt.h
+++ /dev/null
@@ -1,15 +1,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#ifndef __INC_RECONINTRA_MT_H
-#define __INC_RECONINTRA_MT_H
-
-#endif
--- a/vp8/decoder/treereader.h
+++ /dev/null
@@ -1,37 +1,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#ifndef tree_reader_h
-#define tree_reader_h 1
-
-#include "vp8/common/treecoder.h"
-
-#include "dboolhuff.h"
-
-typedef BOOL_DECODER vp9_reader;
-
-#define vp9_read decode_bool
-#define vp9_read_literal decode_value
-#define vp9_read_bit(R) vp9_read(R, vp9_prob_half)
-
-/* Intent of tree data structure is to make decoding trivial. */
-
-static int treed_read(vp9_reader *const r, /* !!! must return a 0 or 1 !!! */
- vp9_tree t,
- const vp9_prob *const p) {
- register vp9_tree_index i = 0;
-
- while ((i = t[ i + vp9_read(r, p[i >> 1])]) > 0);
-
- return -i;
-}
-
-#endif /* tree_reader_h */
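treed_read() above walks a binary token tree: entries t[i] and t[i+1] are the two children of a node, positive entries index further pairs, non-positive entries are negated leaf values, and p[i >> 1] supplies the branch probability at that node. A self-contained toy version with the bool decoder replaced by a fixed bit sequence; the tree, symbols and reader below are illustrative, not taken from the codec's tables:

    #include <stdio.h>

    typedef signed char tree_index;

    /* Toy 3-symbol tree: bit 0 -> symbol 0; bit 1 then 0 -> 1; bit 1 then 1 -> 2.
     * Leaves are stored negated, so symbol 0 appears as 0. */
    static const tree_index toy_tree[4] = { 0, 2, -1, -2 };

    /* Stand-in for vp9_read(): pull bits from a fixed array, ignoring probabilities. */
    static const int bits[] = { 1, 0 };
    static int bit_pos;
    static int read_bit(void) { return bits[bit_pos++]; }

    /* Same control flow as treed_read(), with the bool decoder stubbed out. */
    static int toy_treed_read(const tree_index *t) {
      tree_index i = 0;
      while ((i = t[i + read_bit()]) > 0)
        ;
      return -i;
    }

    int main(void) {
      printf("decoded symbol: %d\n", toy_treed_read(toy_tree));  /* prints 1 */
      return 0;
    }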
--- a/vp8/decoder/x86/dequantize_mmx.asm
+++ /dev/null
@@ -1,406 +1,0 @@
-;
-; Copyright (c) 2012 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-%include "third_party/x86inc/x86inc.asm"
-
-SECTION_RODATA
-align 16
-x_s1sqr2: times 4 dw 0x8A8C
-align 16
-x_c1sqr2less1: times 4 dw 0x4E7B
-align 16
-pw_16: times 4 dw 16
-
-SECTION .text
-
-INIT_MMX
-
-
-;void dequantize_b_impl_mmx(short *sq, short *dq, short *q)
-cglobal dequantize_b_impl_mmx, 3,3,0,sq,dq,arg3
- mova m1, [sqq]
- pmullw m1, [arg3q+0] ; mm4 *= kernel 0 modifiers.
- mova [dqq+ 0], m1
-
- mova m1, [sqq+8]
- pmullw m1, [arg3q+8] ; mm4 *= kernel 0 modifiers.
- mova [dqq+ 8], m1
-
- mova m1, [sqq+16]
- pmullw m1, [arg3q+16] ; mm4 *= kernel 0 modifiers.
- mova [dqq+16], m1
-
- mova m1, [sqq+24]
- pmullw m1, [arg3q+24] ; mm4 *= kernel 0 modifiers.
- mova [dqq+24], m1
- RET
-
-
-;void dequant_idct_add_mmx(short *input, short *dq, unsigned char *pred, unsigned char *dest, int pitch, int stride)
-cglobal dequant_idct_add_mmx, 4,6,0,inp,dq,pred,dest,pit,stride
-
-%if ARCH_X86_64
- movsxd strideq, dword stridem
- movsxd pitq, dword pitm
-%else
- mov strideq, stridem
- mov pitq, pitm
-%endif
-
- mova m0, [inpq+ 0]
- pmullw m0, [dqq]
-
- mova m1, [inpq+ 8]
- pmullw m1, [dqq+ 8]
-
- mova m2, [inpq+16]
- pmullw m2, [dqq+16]
-
- mova m3, [inpq+24]
- pmullw m3, [dqq+24]
-
- pxor m7, m7
- mova [inpq], m7
- mova [inpq+8], m7
- mova [inpq+16], m7
- mova [inpq+24], m7
-
-
- psubw m0, m2 ; b1= 0-2
- paddw m2, m2 ;
-
- mova m5, m1
- paddw m2, m0 ; a1 =0+2
-
- pmulhw m5, [x_s1sqr2];
- paddw m5, m1 ; ip1 * sin(pi/8) * sqrt(2)
-
- mova m7, m3 ;
- pmulhw m7, [x_c1sqr2less1];
-
- paddw m7, m3 ; ip3 * cos(pi/8) * sqrt(2)
- psubw m7, m5 ; c1
-
- mova m5, m1
- mova m4, m3
-
- pmulhw m5, [x_c1sqr2less1]
- paddw m5, m1
-
- pmulhw m3, [x_s1sqr2]
- paddw m3, m4
-
- paddw m3, m5 ; d1
- mova m6, m2 ; a1
-
- mova m4, m0 ; b1
- paddw m2, m3 ;0
-
- paddw m4, m7 ;1
- psubw m0, m7 ;2
-
- psubw m6, m3 ;3
-
- mova m1, m2 ; 03 02 01 00
- mova m3, m4 ; 23 22 21 20
-
- punpcklwd m1, m0 ; 11 01 10 00
- punpckhwd m2, m0 ; 13 03 12 02
-
- punpcklwd m3, m6 ; 31 21 30 20
- punpckhwd m4, m6 ; 33 23 32 22
-
- mova m0, m1 ; 11 01 10 00
- mova m5, m2 ; 13 03 12 02
-
- punpckldq m0, m3 ; 30 20 10 00
- punpckhdq m1, m3 ; 31 21 11 01
-
- punpckldq m2, m4 ; 32 22 12 02
- punpckhdq m5, m4 ; 33 23 13 03
-
- mova m3, m5 ; 33 23 13 03
-
- psubw m0, m2 ; b1= 0-2
- paddw m2, m2 ;
-
- mova m5, m1
- paddw m2, m0 ; a1 =0+2
-
- pmulhw m5, [x_s1sqr2];
- paddw m5, m1 ; ip1 * sin(pi/8) * sqrt(2)
-
- mova m7, m3 ;
- pmulhw m7, [x_c1sqr2less1];
-
- paddw m7, m3 ; ip3 * cos(pi/8) * sqrt(2)
- psubw m7, m5 ; c1
-
- mova m5, m1
- mova m4, m3
-
- pmulhw m5, [x_c1sqr2less1]
- paddw m5, m1
-
- pmulhw m3, [x_s1sqr2]
- paddw m3, m4
-
- paddw m3, m5 ; d1
- paddw m0, [pw_16]
-
- paddw m2, [pw_16]
- mova m6, m2 ; a1
-
- mova m4, m0 ; b1
- paddw m2, m3 ;0
-
- paddw m4, m7 ;1
- psubw m0, m7 ;2
-
- psubw m6, m3 ;3
- psraw m2, 5
-
- psraw m0, 5
- psraw m4, 5
-
- psraw m6, 5
-
- mova m1, m2 ; 03 02 01 00
- mova m3, m4 ; 23 22 21 20
-
- punpcklwd m1, m0 ; 11 01 10 00
- punpckhwd m2, m0 ; 13 03 12 02
-
- punpcklwd m3, m6 ; 31 21 30 20
- punpckhwd m4, m6 ; 33 23 32 22
-
- mova m0, m1 ; 11 01 10 00
- mova m5, m2 ; 13 03 12 02
-
- punpckldq m0, m3 ; 30 20 10 00
- punpckhdq m1, m3 ; 31 21 11 01
-
- punpckldq m2, m4 ; 32 22 12 02
- punpckhdq m5, m4 ; 33 23 13 03
-
- pxor m7, m7
-
- movh m4, [predq]
- punpcklbw m4, m7
- paddsw m0, m4
- packuswb m0, m7
- movh [destq], m0
-
- movh m4, [predq+pitq]
- punpcklbw m4, m7
- paddsw m1, m4
- packuswb m1, m7
- movh [destq+strideq], m1
-
- movh m4, [predq+2*pitq]
- punpcklbw m4, m7
- paddsw m2, m4
- packuswb m2, m7
- movh [destq+strideq*2], m2
-
- add destq, strideq
- add predq, pitq
-
- movh m4, [predq+2*pitq]
- punpcklbw m4, m7
- paddsw m5, m4
- packuswb m5, m7
- movh [destq+strideq*2], m5
- RET
-
-
-;void dequant_dc_idct_add_mmx(short *input, short *dq, unsigned char *pred, unsigned char *dest, int pitch, int stride, int Dc)
-cglobal dequant_dc_idct_add_mmx, 4,7,0,inp,dq,pred,dest,pit,stride,Dc
-
-%if ARCH_X86_64
- movsxd strideq, dword stridem
- movsxd pitq, dword pitm
-%else
- mov strideq, stridem
- mov pitq, pitm
-%endif
-
- mov Dcq, Dcm
- mova m0, [inpq+ 0]
- pmullw m0, [dqq+ 0]
-
- mova m1, [inpq+ 8]
- pmullw m1, [dqq+ 8]
-
- mova m2, [inpq+16]
- pmullw m2, [dqq+16]
-
- mova m3, [inpq+24]
- pmullw m3, [dqq+24]
-
- pxor m7, m7
- mova [inpq+ 0], m7
- mova [inpq+ 8], m7
- mova [inpq+16], m7
- mova [inpq+24], m7
-
- ; move lower word of Dc to lower word of m0
- psrlq m0, 16
- psllq m0, 16
- and Dcq, 0xFFFF ; If Dc < 0, we don't want the full dword precision.
- movh m7, Dcq
- por m0, m7
- psubw m0, m2 ; b1= 0-2
- paddw m2, m2 ;
-
- mova m5, m1
- paddw m2, m0 ; a1 =0+2
-
- pmulhw m5, [x_s1sqr2];
- paddw m5, m1 ; ip1 * sin(pi/8) * sqrt(2)
-
- mova m7, m3 ;
- pmulhw m7, [x_c1sqr2less1];
-
- paddw m7, m3 ; ip3 * cos(pi/8) * sqrt(2)
- psubw m7, m5 ; c1
-
- mova m5, m1
- mova m4, m3
-
- pmulhw m5, [x_c1sqr2less1]
- paddw m5, m1
-
- pmulhw m3, [x_s1sqr2]
- paddw m3, m4
-
- paddw m3, m5 ; d1
- mova m6, m2 ; a1
-
- mova m4, m0 ; b1
- paddw m2, m3 ;0
-
- paddw m4, m7 ;1
- psubw m0, m7 ;2
-
- psubw m6, m3 ;3
-
- mova m1, m2 ; 03 02 01 00
- mova m3, m4 ; 23 22 21 20
-
- punpcklwd m1, m0 ; 11 01 10 00
- punpckhwd m2, m0 ; 13 03 12 02
-
- punpcklwd m3, m6 ; 31 21 30 20
- punpckhwd m4, m6 ; 33 23 32 22
-
- mova m0, m1 ; 11 01 10 00
- mova m5, m2 ; 13 03 12 02
-
- punpckldq m0, m3 ; 30 20 10 00
- punpckhdq m1, m3 ; 31 21 11 01
-
- punpckldq m2, m4 ; 32 22 12 02
- punpckhdq m5, m4 ; 33 23 13 03
-
- mova m3, m5 ; 33 23 13 03
-
- psubw m0, m2 ; b1= 0-2
- paddw m2, m2 ;
-
- mova m5, m1
- paddw m2, m0 ; a1 =0+2
-
- pmulhw m5, [x_s1sqr2];
- paddw m5, m1 ; ip1 * sin(pi/8) * sqrt(2)
-
- mova m7, m3 ;
- pmulhw m7, [x_c1sqr2less1];
-
- paddw m7, m3 ; ip3 * cos(pi/8) * sqrt(2)
- psubw m7, m5 ; c1
-
- mova m5, m1
- mova m4, m3
-
- pmulhw m5, [x_c1sqr2less1]
- paddw m5, m1
-
- pmulhw m3, [x_s1sqr2]
- paddw m3, m4
-
- paddw m3, m5 ; d1
- paddw m0, [pw_16]
-
- paddw m2, [pw_16]
- mova m6, m2 ; a1
-
- mova m4, m0 ; b1
- paddw m2, m3 ;0
-
- paddw m4, m7 ;1
- psubw m0, m7 ;2
-
- psubw m6, m3 ;3
- psraw m2, 5
-
- psraw m0, 5
- psraw m4, 5
-
- psraw m6, 5
-
- mova m1, m2 ; 03 02 01 00
- mova m3, m4 ; 23 22 21 20
-
- punpcklwd m1, m0 ; 11 01 10 00
- punpckhwd m2, m0 ; 13 03 12 02
-
- punpcklwd m3, m6 ; 31 21 30 20
- punpckhwd m4, m6 ; 33 23 32 22
-
- mova m0, m1 ; 11 01 10 00
- mova m5, m2 ; 13 03 12 02
-
- punpckldq m0, m3 ; 30 20 10 00
- punpckhdq m1, m3 ; 31 21 11 01
-
- punpckldq m2, m4 ; 32 22 12 02
- punpckhdq m5, m4 ; 33 23 13 03
-
- pxor m7, m7
-
- movh m4, [predq]
- punpcklbw m4, m7
- paddsw m0, m4
- packuswb m0, m7
- movh [destq], m0
-
- movh m4, [predq+pitq]
- punpcklbw m4, m7
- paddsw m1, m4
- packuswb m1, m7
- movh [destq+strideq], m1
-
- movh m4, [predq+2*pitq]
- punpcklbw m4, m7
- paddsw m2, m4
- packuswb m2, m7
- movh [destq+strideq*2], m2
-
- add destq, strideq
- add predq, pitq
-
- movh m4, [predq+2*pitq]
- punpcklbw m4, m7
- paddsw m5, m4
- packuswb m5, m7
- movh [destq+strideq*2], m5
- RET
-
--- a/vp8/decoder/x86/idct_blk_mmx.c
+++ /dev/null
@@ -1,143 +1,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "vpx_ports/config.h"
-#include "vp8/common/idct.h"
-#include "vp8/decoder/dequantize.h"
-
-void vp9_dequant_dc_idct_add_y_block_mmx(short *q, short *dq,
- unsigned char *pre,
- unsigned char *dst,
- int stride, char *eobs, short *dc) {
- int i;
-
- for (i = 0; i < 4; i++) {
- if (eobs[0] > 1)
- vp9_dequant_dc_idct_add_mmx(q, dq, pre, dst, 16, stride, dc[0]);
- else
- vp9_dc_only_idct_add_mmx(dc[0], pre, dst, 16, stride);
-
- if (eobs[1] > 1)
- vp9_dequant_dc_idct_add_mmx(q + 16, dq, pre + 4,
- dst + 4, 16, stride, dc[1]);
- else
- vp9_dc_only_idct_add_mmx(dc[1], pre + 4, dst + 4, 16, stride);
-
- if (eobs[2] > 1)
- vp9_dequant_dc_idct_add_mmx(q + 32, dq, pre + 8,
- dst + 8, 16, stride, dc[2]);
- else
- vp9_dc_only_idct_add_mmx(dc[2], pre + 8, dst + 8, 16, stride);
-
- if (eobs[3] > 1)
- vp9_dequant_dc_idct_add_mmx(q + 48, dq, pre + 12,
- dst + 12, 16, stride, dc[3]);
- else
- vp9_dc_only_idct_add_mmx(dc[3], pre + 12, dst + 12, 16, stride);
-
- q += 64;
- dc += 4;
- pre += 64;
- dst += 4 * stride;
- eobs += 4;
- }
-}
-
-void vp9_dequant_idct_add_y_block_mmx(short *q, short *dq,
- unsigned char *pre,
- unsigned char *dst,
- int stride, char *eobs) {
- int i;
-
- for (i = 0; i < 4; i++) {
- if (eobs[0] > 1)
- vp9_dequant_idct_add_mmx(q, dq, pre, dst, 16, stride);
- else {
- vp9_dc_only_idct_add_mmx(q[0]*dq[0], pre, dst, 16, stride);
- ((int *)q)[0] = 0;
- }
-
- if (eobs[1] > 1)
- vp9_dequant_idct_add_mmx(q + 16, dq, pre + 4, dst + 4, 16, stride);
- else {
- vp9_dc_only_idct_add_mmx(q[16]*dq[0], pre + 4, dst + 4, 16, stride);
- ((int *)(q + 16))[0] = 0;
- }
-
- if (eobs[2] > 1)
- vp9_dequant_idct_add_mmx(q + 32, dq, pre + 8, dst + 8, 16, stride);
- else {
- vp9_dc_only_idct_add_mmx(q[32]*dq[0], pre + 8, dst + 8, 16, stride);
- ((int *)(q + 32))[0] = 0;
- }
-
- if (eobs[3] > 1)
- vp9_dequant_idct_add_mmx(q + 48, dq, pre + 12, dst + 12, 16, stride);
- else {
- vp9_dc_only_idct_add_mmx(q[48]*dq[0], pre + 12, dst + 12, 16, stride);
- ((int *)(q + 48))[0] = 0;
- }
-
- q += 64;
- pre += 64;
- dst += 4 * stride;
- eobs += 4;
- }
-}
-
-void vp9_dequant_idct_add_uv_block_mmx(short *q, short *dq,
- unsigned char *pre,
- unsigned char *dstu,
- unsigned char *dstv,
- int stride, char *eobs) {
- int i;
-
- for (i = 0; i < 2; i++) {
- if (eobs[0] > 1)
- vp9_dequant_idct_add_mmx(q, dq, pre, dstu, 8, stride);
- else {
- vp9_dc_only_idct_add_mmx(q[0]*dq[0], pre, dstu, 8, stride);
- ((int *)q)[0] = 0;
- }
-
- if (eobs[1] > 1)
- vp9_dequant_idct_add_mmx(q + 16, dq, pre + 4, dstu + 4, 8, stride);
- else {
- vp9_dc_only_idct_add_mmx(q[16]*dq[0], pre + 4, dstu + 4, 8, stride);
- ((int *)(q + 16))[0] = 0;
- }
-
- q += 32;
- pre += 32;
- dstu += 4 * stride;
- eobs += 2;
- }
-
- for (i = 0; i < 2; i++) {
- if (eobs[0] > 1)
- vp9_dequant_idct_add_mmx(q, dq, pre, dstv, 8, stride);
- else {
- vp9_dc_only_idct_add_mmx(q[0]*dq[0], pre, dstv, 8, stride);
- ((int *)q)[0] = 0;
- }
-
- if (eobs[1] > 1)
- vp9_dequant_idct_add_mmx(q + 16, dq, pre + 4, dstv + 4, 8, stride);
- else {
- vp9_dc_only_idct_add_mmx(q[16]*dq[0], pre + 4, dstv + 4, 8, stride);
- ((int *)(q + 16))[0] = 0;
- }
-
- q += 32;
- pre += 32;
- dstv += 4 * stride;
- eobs += 2;
- }
-}
--- a/vp8/decoder/x86/idct_blk_sse2.c
+++ /dev/null
@@ -1,116 +1,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "vpx_ports/config.h"
-#include "vp8/common/idct.h"
-#include "vp8/decoder/dequantize.h"
-
-void vp9_idct_dequant_dc_0_2x_sse2(short *q, short *dq,
- unsigned char *pre, unsigned char *dst,
- int dst_stride, short *dc);
-
-void vp9_idct_dequant_dc_full_2x_sse2(short *q, short *dq,
- unsigned char *pre, unsigned char *dst,
- int dst_stride, short *dc);
-
-void vp9_idct_dequant_0_2x_sse2(short *q, short *dq,
- unsigned char *pre, unsigned char *dst,
- int dst_stride, int blk_stride);
-
-void vp9_idct_dequant_full_2x_sse2(short *q, short *dq,
- unsigned char *pre, unsigned char *dst,
- int dst_stride, int blk_stride);
-
-void vp9_dequant_dc_idct_add_y_block_sse2(short *q, short *dq,
- unsigned char *pre,
- unsigned char *dst,
- int stride, char *eobs, short *dc) {
- int i;
-
- for (i = 0; i < 4; i++) {
- if (((short *)(eobs))[0] & 0xfefe)
- vp9_idct_dequant_dc_full_2x_sse2(q, dq, pre, dst, stride, dc);
- else
- vp9_idct_dequant_dc_0_2x_sse2(q, dq, pre, dst, stride, dc);
-
- if (((short *)(eobs))[1] & 0xfefe)
- vp9_idct_dequant_dc_full_2x_sse2(q + 32, dq, pre + 8, dst + 8,
- stride, dc + 2);
- else
- vp9_idct_dequant_dc_0_2x_sse2(q + 32, dq, pre + 8, dst + 8,
- stride, dc + 2);
-
- q += 64;
- dc += 4;
- pre += 64;
- dst += stride * 4;
- eobs += 4;
- }
-}
-
-void vp9_dequant_idct_add_y_block_sse2(short *q, short *dq,
- unsigned char *pre, unsigned char *dst,
- int stride, char *eobs) {
- int i;
-
- for (i = 0; i < 4; i++) {
- if (((short *)(eobs))[0] & 0xfefe)
- vp9_idct_dequant_full_2x_sse2(q, dq, pre, dst, stride, 16);
- else
- vp9_idct_dequant_0_2x_sse2(q, dq, pre, dst, stride, 16);
-
- if (((short *)(eobs))[1] & 0xfefe)
- vp9_idct_dequant_full_2x_sse2(q + 32, dq, pre + 8, dst + 8, stride, 16);
- else
- vp9_idct_dequant_0_2x_sse2(q + 32, dq, pre + 8, dst + 8, stride, 16);
-
- q += 64;
- pre += 64;
- dst += stride * 4;
- eobs += 4;
- }
-}
-
-void vp9_dequant_idct_add_uv_block_sse2(short *q, short *dq,
- unsigned char *pre,
- unsigned char *dstu,
- unsigned char *dstv,
- int stride, char *eobs) {
- if (((short *)(eobs))[0] & 0xfefe)
- vp9_idct_dequant_full_2x_sse2(q, dq, pre, dstu, stride, 8);
- else
- vp9_idct_dequant_0_2x_sse2(q, dq, pre, dstu, stride, 8);
-
- q += 32;
- pre += 32;
- dstu += stride * 4;
-
- if (((short *)(eobs))[1] & 0xfefe)
- vp9_idct_dequant_full_2x_sse2(q, dq, pre, dstu, stride, 8);
- else
- vp9_idct_dequant_0_2x_sse2(q, dq, pre, dstu, stride, 8);
-
- q += 32;
- pre += 32;
-
- if (((short *)(eobs))[2] & 0xfefe)
- vp9_idct_dequant_full_2x_sse2(q, dq, pre, dstv, stride, 8);
- else
- vp9_idct_dequant_0_2x_sse2(q, dq, pre, dstv, stride, 8);
-
- q += 32;
- pre += 32;
- dstv += stride * 4;
-
- if (((short *)(eobs))[3] & 0xfefe)
- vp9_idct_dequant_full_2x_sse2(q, dq, pre, dstv, stride, 8);
- else
- vp9_idct_dequant_0_2x_sse2(q, dq, pre, dstv, stride, 8);
-}
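The SSE2 paths above handle two horizontally adjacent 4x4 blocks per call, so they test two eob bytes at once: reading the pair as a 16-bit word and masking with 0xfefe is non-zero exactly when at least one of the two eobs exceeds 1, because 0xfe zeroes out the values 0 and 1. A small sketch that checks this equivalence; the helper name is illustrative, and memcpy stands in for the pointer cast used above:

    #include <assert.h>
    #include <stdint.h>
    #include <string.h>

    /* Word test used above: non-zero iff eobs[0] > 1 or eobs[1] > 1. */
    static int pair_has_coded_ac(const char *eobs) {
      uint16_t pair;
      memcpy(&pair, eobs, sizeof(pair));
      return (pair & 0xfefe) != 0;
    }

    int main(void) {
      int e0, e1;
      for (e0 = 0; e0 <= 16; e0++) {
        for (e1 = 0; e1 <= 16; e1++) {
          char eobs[2] = { (char)e0, (char)e1 };
          assert(pair_has_coded_ac(eobs) == (e0 > 1 || e1 > 1));
        }
      }
      return 0;
    }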
--- a/vp8/decoder/x86/x86_dsystemdependent.c
+++ /dev/null
@@ -1,26 +1,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "vpx_ports/config.h"
-#include "vpx_ports/x86.h"
-#include "vp8/decoder/onyxd_int.h"
-
-#if HAVE_MMX
-void vp9_dequantize_b_impl_mmx(short *sq, short *dq, short *q);
-
-void vp9_dequantize_b_mmx(BLOCKD *d) {
- short *sq = (short *) d->qcoeff;
- short *dq = (short *) d->dqcoeff;
- short *q = (short *) d->dequant;
- vp9_dequantize_b_impl_mmx(sq, dq, q);
-}
-#endif
-
-
--- a/vp8/encoder/arm/arm_csystemdependent.c
+++ /dev/null
@@ -1,129 +1,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include "vpx_ports/config.h"
-#include "vpx_ports/arm.h"
-#include "vp8/encoder/variance.h"
-#include "vp8/encoder/onyx_int.h"
-
-extern void (*vp9_yv12_copy_partial_frame_ptr)(YV12_BUFFER_CONFIG *src_ybc, YV12_BUFFER_CONFIG *dst_ybc, int Fraction);
-extern void vp9_yv12_copy_partial_frame(YV12_BUFFER_CONFIG *src_ybc, YV12_BUFFER_CONFIG *dst_ybc, int Fraction);
-extern void vpxyv12_copy_partial_frame_neon(YV12_BUFFER_CONFIG *src_ybc, YV12_BUFFER_CONFIG *dst_ybc, int Fraction);
-
-void vp9_arch_arm_encoder_init(VP9_COMP *cpi) {
-#if CONFIG_RUNTIME_CPU_DETECT
- int flags = cpi->common.rtcd.flags;
-
-#if HAVE_ARMV5TE
- if (flags & HAS_EDSP) {
- }
-#endif
-
-#if HAVE_ARMV6
- if (flags & HAS_MEDIA) {
- cpi->rtcd.variance.sad16x16 = vp9_sad16x16_armv6;
- /*cpi->rtcd.variance.sad16x8 = vp9_sad16x8_c;
- cpi->rtcd.variance.sad8x16 = vp9_sad8x16_c;
- cpi->rtcd.variance.sad8x8 = vp9_sad8x8_c;
- cpi->rtcd.variance.sad4x4 = vp9_sad4x4_c;*/
-
- /*cpi->rtcd.variance.var4x4 = vp9_variance4x4_c;*/
- cpi->rtcd.variance.var8x8 = vp9_variance8x8_armv6;
- /*cpi->rtcd.variance.var8x16 = vp9_variance8x16_c;
- cpi->rtcd.variance.var16x8 = vp9_variance16x8_c;*/
- cpi->rtcd.variance.var16x16 = vp9_variance16x16_armv6;
-
- /*cpi->rtcd.variance.subpixvar4x4 = vp9_sub_pixel_variance4x4_c;*/
- cpi->rtcd.variance.subpixvar8x8 = vp9_sub_pixel_variance8x8_armv6;
- /*cpi->rtcd.variance.subpixvar8x16 = vp9_sub_pixel_variance8x16_c;
- cpi->rtcd.variance.subpixvar16x8 = vp9_sub_pixel_variance16x8_c;*/
- cpi->rtcd.variance.subpixvar16x16 = vp9_sub_pixel_variance16x16_armv6;
- cpi->rtcd.variance.halfpixvar16x16_h = vp9_variance_halfpixvar16x16_h_armv6;
- cpi->rtcd.variance.halfpixvar16x16_v = vp9_variance_halfpixvar16x16_v_armv6;
- cpi->rtcd.variance.halfpixvar16x16_hv = vp9_variance_halfpixvar16x16_hv_armv6;
-
- cpi->rtcd.variance.mse16x16 = vp9_mse16x16_armv6;
- /*cpi->rtcd.variance.getmbss = vp9_get_mb_ss_c;*/
-
- cpi->rtcd.fdct.short4x4 = vp9_short_fdct4x4_armv6;
- cpi->rtcd.fdct.short8x4 = vp9_short_fdct8x4_armv6;
- cpi->rtcd.fdct.fast4x4 = vp9_short_fdct4x4_armv6;
- cpi->rtcd.fdct.fast8x4 = vp9_short_fdct8x4_armv6;
- cpi->rtcd.fdct.walsh_short4x4 = vp9_short_walsh4x4_armv6;
-
- /*cpi->rtcd.encodemb.berr = vp9_block_error_c;
- cpi->rtcd.encodemb.mberr = vp9_mbblock_error_c;
- cpi->rtcd.encodemb.mbuverr = vp9_mbuverror_c;*/
- cpi->rtcd.encodemb.subb = vp9_subtract_b_armv6;
- cpi->rtcd.encodemb.submby = vp9_subtract_mby_armv6;
- cpi->rtcd.encodemb.submbuv = vp9_subtract_mbuv_armv6;
-
- /*cpi->rtcd.quantize.quantb = vp8_regular_quantize_b;*/
- cpi->rtcd.quantize.fastquantb = vp8_fast_quantize_b_armv6;
- }
-#endif
-
-#if HAVE_ARMV7
- if (flags & HAS_NEON) {
- cpi->rtcd.variance.sad16x16 = vp9_sad16x16_neon;
- cpi->rtcd.variance.sad16x8 = vp9_sad16x8_neon;
- cpi->rtcd.variance.sad8x16 = vp9_sad8x16_neon;
- cpi->rtcd.variance.sad8x8 = vp9_sad8x8_neon;
- cpi->rtcd.variance.sad4x4 = vp9_sad4x4_neon;
-
- /*cpi->rtcd.variance.var4x4 = vp9_variance4x4_c;*/
- cpi->rtcd.variance.var8x8 = vp9_variance8x8_neon;
- cpi->rtcd.variance.var8x16 = vp9_variance8x16_neon;
- cpi->rtcd.variance.var16x8 = vp9_variance16x8_neon;
- cpi->rtcd.variance.var16x16 = vp9_variance16x16_neon;
-
- /*cpi->rtcd.variance.subpixvar4x4 = vp9_sub_pixel_variance4x4_c;*/
- cpi->rtcd.variance.subpixvar8x8 = vp9_sub_pixel_variance8x8_neon;
- /*cpi->rtcd.variance.subpixvar8x16 = vp9_sub_pixel_variance8x16_c;
- cpi->rtcd.variance.subpixvar16x8 = vp9_sub_pixel_variance16x8_c;*/
- cpi->rtcd.variance.subpixvar16x16 = vp9_sub_pixel_variance16x16_neon;
- cpi->rtcd.variance.halfpixvar16x16_h = vp9_variance_halfpixvar16x16_h_neon;
- cpi->rtcd.variance.halfpixvar16x16_v = vp9_variance_halfpixvar16x16_v_neon;
- cpi->rtcd.variance.halfpixvar16x16_hv = vp9_variance_halfpixvar16x16_hv_neon;
-
- cpi->rtcd.variance.mse16x16 = vp9_mse16x16_neon;
- /*cpi->rtcd.variance.getmbss = vp9_get_mb_ss_c;*/
-
- cpi->rtcd.fdct.short4x4 = vp9_short_fdct4x4_neon;
- cpi->rtcd.fdct.short8x4 = vp9_short_fdct8x4_neon;
- cpi->rtcd.fdct.fast4x4 = vp9_short_fdct4x4_neon;
- cpi->rtcd.fdct.fast8x4 = vp9_short_fdct8x4_neon;
- cpi->rtcd.fdct.walsh_short4x4 = vp9_short_walsh4x4_neon;
-
- /*cpi->rtcd.encodemb.berr = vp9_block_error_c;
- cpi->rtcd.encodemb.mberr = vp9_mbblock_error_c;
- cpi->rtcd.encodemb.mbuverr = vp9_mbuverror_c;*/
- cpi->rtcd.encodemb.subb = vp9_subtract_b_neon;
- cpi->rtcd.encodemb.submby = vp9_subtract_mby_neon;
- cpi->rtcd.encodemb.submbuv = vp9_subtract_mbuv_neon;
-
- /*cpi->rtcd.quantize.quantb = vp8_regular_quantize_b;
- cpi->rtcd.quantize.quantb_pair = vp8_regular_quantize_b_pair;*/
- cpi->rtcd.quantize.fastquantb = vp8_fast_quantize_b_neon;
- cpi->rtcd.quantize.fastquantb_pair = vp8_fast_quantize_b_pair_neon;
- }
-#endif
-
-#if HAVE_ARMV7
-#if CONFIG_RUNTIME_CPU_DETECT
- if (flags & HAS_NEON)
-#endif
- {
- vp9_yv12_copy_partial_frame_ptr = vpxyv12_copy_partial_frame_neon;
- }
-#endif
-#endif
-}
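vp9_arch_arm_encoder_init() above is the runtime CPU detection (RTCD) pattern: the rtcd tables are pre-filled with the portable C kernels, and each capability flag the CPU reports (EDSP / MEDIA / NEON) swaps in the matching optimized entry points. A compact sketch of the same idea; every name, flag value and kernel here is hypothetical:

    typedef unsigned int (*sad_fn)(const unsigned char *a, int a_stride,
                                   const unsigned char *b, int b_stride);

    /* Portable fallback kernel. */
    static unsigned int sad16x16_c(const unsigned char *a, int as,
                                   const unsigned char *b, int bs) {
      unsigned int sum = 0;
      int r, c;
      for (r = 0; r < 16; r++, a += as, b += bs)
        for (c = 0; c < 16; c++)
          sum += (unsigned int)(a[c] > b[c] ? a[c] - b[c] : b[c] - a[c]);
      return sum;
    }

    /* Stand-in for an optimized kernel (a real one would use NEON intrinsics). */
    static unsigned int sad16x16_fast(const unsigned char *a, int as,
                                      const unsigned char *b, int bs) {
      return sad16x16_c(a, as, b, bs);
    }

    #define HAS_FAST_SIMD 0x01  /* illustrative capability bit */

    struct rtcd_table { sad_fn sad16x16; };

    static void init_rtcd(struct rtcd_table *rtcd, int cpu_flags) {
      rtcd->sad16x16 = sad16x16_c;          /* safe default */
      if (cpu_flags & HAS_FAST_SIMD)
        rtcd->sad16x16 = sad16x16_fast;     /* upgrade when the CPU supports it */
    }

    int main(void) {
      static unsigned char a[16 * 16], b[16 * 16];
      struct rtcd_table rtcd;
      init_rtcd(&rtcd, HAS_FAST_SIMD);
      return (int)rtcd.sad16x16(a, 16, b, 16);  /* 0: both blocks are zero */
    }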
--- a/vp8/encoder/arm/armv5te/boolhuff_armv5te.asm
+++ /dev/null
@@ -1,286 +1,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
- EXPORT |vp8_start_encode|
- EXPORT |vp9_encode_bool|
- EXPORT |vp8_stop_encode|
- EXPORT |vp8_encode_value|
-
- INCLUDE asm_enc_offsets.asm
-
- ARM
- REQUIRE8
- PRESERVE8
-
- AREA |.text|, CODE, READONLY
-
-; r0 BOOL_CODER *br
-; r1 unsigned char *source
-
-|vp8_start_encode| PROC
- mov r12, #0
- mov r3, #255
- mvn r2, #23
- str r12, [r0, #vp9_writer_lowvalue]
- str r3, [r0, #vp9_writer_range]
- str r12, [r0, #vp9_writer_value]
- str r2, [r0, #vp9_writer_count]
- str r12, [r0, #vp9_writer_pos]
- str r1, [r0, #vp9_writer_buffer]
- bx lr
- ENDP
-
-; r0 BOOL_CODER *br
-; r1 int bit
-; r2 int probability
-|vp9_encode_bool| PROC
- push {r4-r9, lr}
-
- mov r4, r2
-
- ldr r2, [r0, #vp9_writer_lowvalue]
- ldr r5, [r0, #vp9_writer_range]
- ldr r3, [r0, #vp9_writer_count]
-
- sub r7, r5, #1 ; range-1
-
- cmp r1, #0
- mul r6, r4, r7 ; ((range-1) * probability)
-
- mov r7, #1
- add r4, r7, r6, lsr #8 ; 1 + (((range-1) * probability) >> 8)
-
- addne r2, r2, r4 ; if (bit) lowvalue += split
- subne r4, r5, r4 ; if (bit) range = range-split
-
- ; Counting the leading zeros is used to normalize range.
- clz r6, r4
- sub r6, r6, #24 ; shift
-
- ; Flag is set on the sum of count. This flag is used later
- ; to determine if count >= 0
- adds r3, r3, r6 ; count += shift
- lsl r5, r4, r6 ; range <<= shift
- bmi token_count_lt_zero ; if(count >= 0)
-
- sub r6, r6, r3 ; offset = shift - count
- sub r4, r6, #1 ; offset-1
- lsls r4, r2, r4 ; if((lowvalue<<(offset-1)) & 0x80000000 )
- bpl token_high_bit_not_set
-
- ldr r4, [r0, #vp9_writer_pos] ; x
- sub r4, r4, #1 ; x = w->pos-1
- b token_zero_while_start
-token_zero_while_loop
- mov r9, #0
- strb r9, [r7, r4] ; w->buffer[x] =(unsigned char)0
- sub r4, r4, #1 ; x--
-token_zero_while_start
- cmp r4, #0
- ldrge r7, [r0, #vp9_writer_buffer]
- ldrb r1, [r7, r4]
- cmpge r1, #0xff
- beq token_zero_while_loop
-
- ldr r7, [r0, #vp9_writer_buffer]
- ldrb r9, [r7, r4] ; w->buffer[x]
- add r9, r9, #1
- strb r9, [r7, r4] ; w->buffer[x] + 1
-token_high_bit_not_set
- rsb r4, r6, #24 ; 24-offset
- ldr r9, [r0, #vp9_writer_buffer]
- lsr r7, r2, r4 ; lowvalue >> (24-offset)
- ldr r4, [r0, #vp9_writer_pos] ; w->pos
- lsl r2, r2, r6 ; lowvalue <<= offset
- mov r6, r3 ; shift = count
- add r1, r4, #1 ; w->pos++
- bic r2, r2, #0xff000000 ; lowvalue &= 0xffffff
- str r1, [r0, #vp9_writer_pos]
- sub r3, r3, #8 ; count -= 8
- strb r7, [r9, r4] ; w->buffer[w->pos++]
-
-token_count_lt_zero
- lsl r2, r2, r6 ; lowvalue <<= shift
-
- str r2, [r0, #vp9_writer_lowvalue]
- str r5, [r0, #vp9_writer_range]
- str r3, [r0, #vp9_writer_count]
- pop {r4-r9, pc}
- ENDP
-
-; r0 BOOL_CODER *br
-|vp8_stop_encode| PROC
- push {r4-r10, lr}
-
- ldr r2, [r0, #vp9_writer_lowvalue]
- ldr r5, [r0, #vp9_writer_range]
- ldr r3, [r0, #vp9_writer_count]
-
- mov r10, #32
-
-stop_encode_loop
- sub r7, r5, #1 ; range-1
-
- mov r4, r7, lsl #7 ; ((range-1) * 128)
-
- mov r7, #1
- add r4, r7, r4, lsr #8 ; 1 + (((range-1) * 128) >> 8)
-
- ; Counting the leading zeros is used to normalize range.
- clz r6, r4
- sub r6, r6, #24 ; shift
-
- ; Flag is set on the sum of count. This flag is used later
- ; to determine if count >= 0
- adds r3, r3, r6 ; count += shift
- lsl r5, r4, r6 ; range <<= shift
- bmi token_count_lt_zero_se ; if(count >= 0)
-
- sub r6, r6, r3 ; offset = shift - count
- sub r4, r6, #1 ; offset-1
- lsls r4, r2, r4 ; if((lowvalue<<(offset-1)) & 0x80000000 )
- bpl token_high_bit_not_set_se
-
- ldr r4, [r0, #vp9_writer_pos] ; x
- sub r4, r4, #1 ; x = w->pos-1
- b token_zero_while_start_se
-token_zero_while_loop_se
- mov r9, #0
- strb r9, [r7, r4] ; w->buffer[x] =(unsigned char)0
- sub r4, r4, #1 ; x--
-token_zero_while_start_se
- cmp r4, #0
- ldrge r7, [r0, #vp9_writer_buffer]
- ldrb r1, [r7, r4]
- cmpge r1, #0xff
- beq token_zero_while_loop_se
-
- ldr r7, [r0, #vp9_writer_buffer]
- ldrb r9, [r7, r4] ; w->buffer[x]
- add r9, r9, #1
- strb r9, [r7, r4] ; w->buffer[x] + 1
-token_high_bit_not_set_se
- rsb r4, r6, #24 ; 24-offset
- ldr r9, [r0, #vp9_writer_buffer]
- lsr r7, r2, r4 ; lowvalue >> (24-offset)
- ldr r4, [r0, #vp9_writer_pos] ; w->pos
- lsl r2, r2, r6 ; lowvalue <<= offset
- mov r6, r3 ; shift = count
- add r1, r4, #1 ; w->pos++
- bic r2, r2, #0xff000000 ; lowvalue &= 0xffffff
- str r1, [r0, #vp9_writer_pos]
- sub r3, r3, #8 ; count -= 8
- strb r7, [r9, r4] ; w->buffer[w->pos++]
-
-token_count_lt_zero_se
- lsl r2, r2, r6 ; lowvalue <<= shift
-
- subs r10, r10, #1
- bne stop_encode_loop
-
- str r2, [r0, #vp9_writer_lowvalue]
- str r5, [r0, #vp9_writer_range]
- str r3, [r0, #vp9_writer_count]
- pop {r4-r10, pc}
-
- ENDP
-
-; r0 BOOL_CODER *br
-; r1 int data
-; r2 int bits
-|vp8_encode_value| PROC
- push {r4-r11, lr}
-
- mov r10, r2
-
- ldr r2, [r0, #vp9_writer_lowvalue]
- ldr r5, [r0, #vp9_writer_range]
- ldr r3, [r0, #vp9_writer_count]
-
- rsb r4, r10, #32 ; 32-n
-
- ; v is kept in r1 during the token pack loop
- lsl r1, r1, r4 ; r1 = v << 32 - n
-
-encode_value_loop
- sub r7, r5, #1 ; range-1
-
- ; Decisions are made based on the bit value shifted
- ; off of v, so set a flag here based on this.
-    ; This value is referred to as "bb"
- lsls r1, r1, #1 ; bit = v >> n
- mov r4, r7, lsl #7 ; ((range-1) * 128)
-
- mov r7, #1
- add r4, r7, r4, lsr #8 ; 1 + (((range-1) * 128) >> 8)
-
- addcs r2, r2, r4 ; if (bit) lowvalue += split
- subcs r4, r5, r4 ; if (bit) range = range-split
-
- ; Counting the leading zeros is used to normalize range.
- clz r6, r4
- sub r6, r6, #24 ; shift
-
- ; Flag is set on the sum of count. This flag is used later
- ; to determine if count >= 0
- adds r3, r3, r6 ; count += shift
- lsl r5, r4, r6 ; range <<= shift
- bmi token_count_lt_zero_ev ; if(count >= 0)
-
- sub r6, r6, r3 ; offset = shift - count
- sub r4, r6, #1 ; offset-1
- lsls r4, r2, r4 ; if((lowvalue<<(offset-1)) & 0x80000000 )
- bpl token_high_bit_not_set_ev
-
- ldr r4, [r0, #vp9_writer_pos] ; x
- sub r4, r4, #1 ; x = w->pos-1
- b token_zero_while_start_ev
-token_zero_while_loop_ev
- mov r9, #0
- strb r9, [r7, r4] ; w->buffer[x] =(unsigned char)0
- sub r4, r4, #1 ; x--
-token_zero_while_start_ev
- cmp r4, #0
- ldrge r7, [r0, #vp9_writer_buffer]
- ldrb r11, [r7, r4]
- cmpge r11, #0xff
- beq token_zero_while_loop_ev
-
- ldr r7, [r0, #vp9_writer_buffer]
- ldrb r9, [r7, r4] ; w->buffer[x]
- add r9, r9, #1
- strb r9, [r7, r4] ; w->buffer[x] + 1
-token_high_bit_not_set_ev
- rsb r4, r6, #24 ; 24-offset
- ldr r9, [r0, #vp9_writer_buffer]
- lsr r7, r2, r4 ; lowvalue >> (24-offset)
- ldr r4, [r0, #vp9_writer_pos] ; w->pos
- lsl r2, r2, r6 ; lowvalue <<= offset
- mov r6, r3 ; shift = count
- add r11, r4, #1 ; w->pos++
- bic r2, r2, #0xff000000 ; lowvalue &= 0xffffff
- str r11, [r0, #vp9_writer_pos]
- sub r3, r3, #8 ; count -= 8
- strb r7, [r9, r4] ; w->buffer[w->pos++]
-
-token_count_lt_zero_ev
- lsl r2, r2, r6 ; lowvalue <<= shift
-
- subs r10, r10, #1
- bne encode_value_loop
-
- str r2, [r0, #vp9_writer_lowvalue]
- str r5, [r0, #vp9_writer_range]
- str r3, [r0, #vp9_writer_count]
- pop {r4-r11, pc}
- ENDP
-
- END
--- a/vp8/encoder/arm/armv5te/vp8_packtokens_armv5.asm
+++ /dev/null
@@ -1,291 +1,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
- EXPORT |vp8cx_pack_tokens_armv5|
-
- INCLUDE asm_enc_offsets.asm
-
- ARM
- REQUIRE8
- PRESERVE8
-
- AREA |.text|, CODE, READONLY
-
-; r0 vp9_writer *w
-; r1 const TOKENEXTRA *p
-; r2 int xcount
-; r3 vp8_coef_encodings
-; s0 vp8_extra_bits
-; s1 vp8_coef_tree
-|vp8cx_pack_tokens_armv5| PROC
- push {r4-r11, lr}
-
- ; Add size of xcount * sizeof (TOKENEXTRA) to get stop
- ; sizeof (TOKENEXTRA) is 8
- sub sp, sp, #12
- add r2, r1, r2, lsl #3 ; stop = p + xcount*sizeof(TOKENEXTRA)
- str r2, [sp, #0]
- str r3, [sp, #8] ; save vp8_coef_encodings
- ldr r2, [r0, #vp9_writer_lowvalue]
- ldr r5, [r0, #vp9_writer_range]
- ldr r3, [r0, #vp9_writer_count]
- b check_p_lt_stop
-
-while_p_lt_stop
- ldrb r6, [r1, #tokenextra_token] ; t
- ldr r4, [sp, #8] ; vp8_coef_encodings
- mov lr, #0
- add r4, r4, r6, lsl #3 ; a = vp8_coef_encodings + t
- ldr r9, [r1, #tokenextra_context_tree] ; pp
-
- ldrb r7, [r1, #tokenextra_skip_eob_node]
-
- ldr r6, [r4, #vp9_token_value] ; v
- ldr r8, [r4, #vp9_token_len] ; n
-
- ; vp8 specific skip_eob_node
- cmp r7, #0
- movne lr, #2 ; i = 2
- subne r8, r8, #1 ; --n
-
- rsb r4, r8, #32 ; 32-n
- ldr r10, [sp, #52] ; vp8_coef_tree
-
- ; v is kept in r12 during the token pack loop
- lsl r12, r6, r4 ; r12 = v << 32 - n
-
-; loop start
-token_loop
- ldrb r4, [r9, lr, asr #1] ; pp [i>>1]
- sub r7, r5, #1 ; range-1
-
- ; Decisions are made based on the bit value shifted
- ; off of v, so set a flag here based on this.
-    ; This value is referred to as "bb"
- lsls r12, r12, #1 ; bb = v >> n
- mul r6, r4, r7 ; ((range-1) * pp[i>>1]))
-
- ; bb can only be 0 or 1. So only execute this statement
- ; if bb == 1, otherwise it will act like i + 0
- addcs lr, lr, #1 ; i + bb
-
- mov r7, #1
- ldrsb lr, [r10, lr] ; i = vp8_coef_tree[i+bb]
- add r4, r7, r6, lsr #8 ; 1 + (((range-1) * pp[i>>1]) >> 8)
-
- addcs r2, r2, r4 ; if (bb) lowvalue += split
- subcs r4, r5, r4 ; if (bb) range = range-split
-
- ; Counting the leading zeros is used to normalize range.
- clz r6, r4
- sub r6, r6, #24 ; shift
-
- ; Flag is set on the sum of count. This flag is used later
- ; to determine if count >= 0
- adds r3, r3, r6 ; count += shift
- lsl r5, r4, r6 ; range <<= shift
- bmi token_count_lt_zero ; if(count >= 0)
-
- sub r6, r6, r3 ; offset = shift - count
- sub r4, r6, #1 ; offset-1
- lsls r4, r2, r4 ; if((lowvalue<<(offset-1)) & 0x80000000 )
- bpl token_high_bit_not_set
-
- ldr r4, [r0, #vp9_writer_pos] ; x
- sub r4, r4, #1 ; x = w->pos-1
- b token_zero_while_start
-token_zero_while_loop
- mov r10, #0
- strb r10, [r7, r4] ; w->buffer[x] =(unsigned char)0
- sub r4, r4, #1 ; x--
-token_zero_while_start
- cmp r4, #0
- ldrge r7, [r0, #vp9_writer_buffer]
- ldrb r11, [r7, r4]
- cmpge r11, #0xff
- beq token_zero_while_loop
-
- ldr r7, [r0, #vp9_writer_buffer]
- ldrb r10, [r7, r4] ; w->buffer[x]
- add r10, r10, #1
- strb r10, [r7, r4] ; w->buffer[x] + 1
-token_high_bit_not_set
- rsb r4, r6, #24 ; 24-offset
- ldr r10, [r0, #vp9_writer_buffer]
- lsr r7, r2, r4 ; lowvalue >> (24-offset)
- ldr r4, [r0, #vp9_writer_pos] ; w->pos
- lsl r2, r2, r6 ; lowvalue <<= offset
- mov r6, r3 ; shift = count
- add r11, r4, #1 ; w->pos++
- bic r2, r2, #0xff000000 ; lowvalue &= 0xffffff
- str r11, [r0, #vp9_writer_pos]
- sub r3, r3, #8 ; count -= 8
- strb r7, [r10, r4] ; w->buffer[w->pos++]
-
-    ; r10 held vp8_coef_tree earlier in the loop, but is
-    ; used as a temp variable here. So after r10 is used, reload
-    ; vp8_coef_tree into r10
- ldr r10, [sp, #52] ; vp8_coef_tree
-
-token_count_lt_zero
- lsl r2, r2, r6 ; lowvalue <<= shift
-
- subs r8, r8, #1 ; --n
- bne token_loop
-
- ldrb r6, [r1, #tokenextra_token] ; t
- ldr r7, [sp, #48] ; vp8_extra_bits
- ; Add t * sizeof (vp9_extra_bit_struct) to get the desired
-    ; element. Here sizeof (vp9_extra_bit_struct) == 16
- add r12, r7, r6, lsl #4 ; b = vp8_extra_bits + t
-
- ldr r4, [r12, #vp9_extra_bit_struct_base_val]
- cmp r4, #0
- beq skip_extra_bits
-
-; if( b->base_val)
- ldr r8, [r12, #vp9_extra_bit_struct_len] ; L
- ldrsh lr, [r1, #tokenextra_extra] ; e = p->Extra
- cmp r8, #0 ; if( L)
- beq no_extra_bits
-
- ldr r9, [r12, #vp9_extra_bit_struct_prob]
- asr r7, lr, #1 ; v=e>>1
-
- ldr r10, [r12, #vp9_extra_bit_struct_tree]
- str r10, [sp, #4] ; b->tree
-
- rsb r4, r8, #32
- lsl r12, r7, r4
-
- mov lr, #0 ; i = 0
-
-extra_bits_loop
- ldrb r4, [r9, lr, asr #1] ; pp[i>>1]
- sub r7, r5, #1 ; range-1
- lsls r12, r12, #1 ; v >> n
- mul r6, r4, r7 ; (range-1) * pp[i>>1]
- addcs lr, lr, #1 ; i + bb
-
- mov r7, #1
- ldrsb lr, [r10, lr] ; i = b->tree[i+bb]
- add r4, r7, r6, lsr #8 ; split = 1 + (((range-1) * pp[i>>1]) >> 8)
-
- addcs r2, r2, r4 ; if (bb) lowvalue += split
- subcs r4, r5, r4 ; if (bb) range = range-split
-
- clz r6, r4
- sub r6, r6, #24
-
- adds r3, r3, r6 ; count += shift
- lsl r5, r4, r6 ; range <<= shift
- bmi extra_count_lt_zero ; if(count >= 0)
-
- sub r6, r6, r3 ; offset= shift - count
- sub r4, r6, #1 ; offset-1
- lsls r4, r2, r4 ; if((lowvalue<<(offset-1)) & 0x80000000 )
- bpl extra_high_bit_not_set
-
- ldr r4, [r0, #vp9_writer_pos] ; x
- sub r4, r4, #1 ; x = w->pos - 1
- b extra_zero_while_start
-extra_zero_while_loop
- mov r10, #0
- strb r10, [r7, r4] ; w->buffer[x] =(unsigned char)0
- sub r4, r4, #1 ; x--
-extra_zero_while_start
- cmp r4, #0
- ldrge r7, [r0, #vp9_writer_buffer]
- ldrb r11, [r7, r4]
- cmpge r11, #0xff
- beq extra_zero_while_loop
-
- ldr r7, [r0, #vp9_writer_buffer]
- ldrb r10, [r7, r4]
- add r10, r10, #1
- strb r10, [r7, r4]
-extra_high_bit_not_set
- rsb r4, r6, #24 ; 24-offset
- ldr r10, [r0, #vp9_writer_buffer]
- lsr r7, r2, r4 ; lowvalue >> (24-offset)
- ldr r4, [r0, #vp9_writer_pos]
- lsl r2, r2, r6 ; lowvalue <<= offset
- mov r6, r3 ; shift = count
- add r11, r4, #1 ; w->pos++
- bic r2, r2, #0xff000000 ; lowvalue &= 0xffffff
- str r11, [r0, #vp9_writer_pos]
- sub r3, r3, #8 ; count -= 8
- strb r7, [r10, r4] ; w->buffer[w->pos++]=(lowvalue >> (24-offset))
- ldr r10, [sp, #4] ; b->tree
-extra_count_lt_zero
- lsl r2, r2, r6
-
- subs r8, r8, #1 ; --n
- bne extra_bits_loop ; while (n)
-
-no_extra_bits
- ldr lr, [r1, #4] ; e = p->Extra
- add r4, r5, #1 ; range + 1
- tst lr, #1
- lsr r4, r4, #1 ; split = (range + 1) >> 1
- addne r2, r2, r4 ; lowvalue += split
- subne r4, r5, r4 ; range = range-split
- tst r2, #0x80000000 ; lowvalue & 0x80000000
- lsl r5, r4, #1 ; range <<= 1
- beq end_high_bit_not_set
-
- ldr r4, [r0, #vp9_writer_pos]
- mov r7, #0
- sub r4, r4, #1
- b end_zero_while_start
-end_zero_while_loop
- strb r7, [r6, r4]
- sub r4, r4, #1 ; x--
-end_zero_while_start
- cmp r4, #0
- ldrge r6, [r0, #vp9_writer_buffer]
- ldrb r12, [r6, r4]
- cmpge r12, #0xff
- beq end_zero_while_loop
-
- ldr r6, [r0, #vp9_writer_buffer]
- ldrb r7, [r6, r4]
- add r7, r7, #1
- strb r7, [r6, r4]
-end_high_bit_not_set
- adds r3, r3, #1 ; ++count
- lsl r2, r2, #1 ; lowvalue <<= 1
- bne end_count_zero
-
- ldr r4, [r0, #vp9_writer_pos]
- mvn r3, #7
- ldr r7, [r0, #vp9_writer_buffer]
- lsr r6, r2, #24 ; lowvalue >> 24
- add r12, r4, #1 ; w->pos++
- bic r2, r2, #0xff000000 ; lowvalue &= 0xffffff
- str r12, [r0, #0x10]
- strb r6, [r7, r4]
-end_count_zero
-skip_extra_bits
- add r1, r1, #TOKENEXTRA_SZ ; ++p
-check_p_lt_stop
- ldr r4, [sp, #0] ; stop
- cmp r1, r4 ; while( p < stop)
- bcc while_p_lt_stop
-
- str r2, [r0, #vp9_writer_lowvalue]
- str r5, [r0, #vp9_writer_range]
- str r3, [r0, #vp9_writer_count]
- add sp, sp, #12
- pop {r4-r11, pc}
- ENDP
-
- END
--- a/vp8/encoder/arm/armv5te/vp8_packtokens_mbrow_armv5.asm
+++ /dev/null
@@ -1,327 +1,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
- EXPORT |vp8cx_pack_mb_row_tokens_armv5|
-
- INCLUDE asm_enc_offsets.asm
-
- ARM
- REQUIRE8
- PRESERVE8
-
- AREA |.text|, CODE, READONLY
-
-; r0 VP8_COMP *cpi
-; r1 vp9_writer *w
-; r2 vp8_coef_encodings
-; r3 vp8_extra_bits
-; s0 vp8_coef_tree
-
-|vp8cx_pack_mb_row_tokens_armv5| PROC
- push {r4-r11, lr}
- sub sp, sp, #24
-
- ; Compute address of cpi->common.mb_rows
- ldr r4, _VP8_COMP_common_
- ldr r6, _VP8_COMMON_MBrows_
- add r4, r0, r4
-
- ldr r5, [r4, r6] ; load up mb_rows
-
- str r2, [sp, #20] ; save vp8_coef_encodings
- str r5, [sp, #12] ; save mb_rows
- str r3, [sp, #8] ; save vp8_extra_bits
-
- ldr r4, _VP8_COMP_tplist_
- add r4, r0, r4
- ldr r7, [r4, #0] ; dereference cpi->tp_list
-
- mov r0, r1 ; keep same as other loops
-
- ldr r2, [r0, #vp9_writer_lowvalue]
- ldr r5, [r0, #vp9_writer_range]
- ldr r3, [r0, #vp9_writer_count]
-
-mb_row_loop
-
- ldr r1, [r7, #tokenlist_start]
- ldr r9, [r7, #tokenlist_stop]
- str r9, [sp, #0] ; save stop for later comparison
- str r7, [sp, #16] ; tokenlist address for next time
-
- b check_p_lt_stop
-
-    ; actual work gets done here!
-
-while_p_lt_stop
- ldrb r6, [r1, #tokenextra_token] ; t
- ldr r4, [sp, #20] ; vp8_coef_encodings
- mov lr, #0
- add r4, r4, r6, lsl #3 ; a = vp8_coef_encodings + t
- ldr r9, [r1, #tokenextra_context_tree] ; pp
-
- ldrb r7, [r1, #tokenextra_skip_eob_node]
-
- ldr r6, [r4, #vp9_token_value] ; v
- ldr r8, [r4, #vp9_token_len] ; n
-
- ; vp8 specific skip_eob_node
- cmp r7, #0
- movne lr, #2 ; i = 2
- subne r8, r8, #1 ; --n
-
- rsb r4, r8, #32 ; 32-n
- ldr r10, [sp, #60] ; vp8_coef_tree
-
- ; v is kept in r12 during the token pack loop
- lsl r12, r6, r4 ; r12 = v << 32 - n
-
-; loop start
-token_loop
- ldrb r4, [r9, lr, asr #1] ; pp [i>>1]
- sub r7, r5, #1 ; range-1
-
- ; Decisions are made based on the bit value shifted
- ; off of v, so set a flag here based on this.
-    ; This value is referred to as "bb"
- lsls r12, r12, #1 ; bb = v >> n
- mul r6, r4, r7 ; ((range-1) * pp[i>>1]))
-
- ; bb can only be 0 or 1. So only execute this statement
- ; if bb == 1, otherwise it will act like i + 0
- addcs lr, lr, #1 ; i + bb
-
- mov r7, #1
- ldrsb lr, [r10, lr] ; i = vp8_coef_tree[i+bb]
- add r4, r7, r6, lsr #8 ; 1 + (((range-1) * pp[i>>1]) >> 8)
-
- addcs r2, r2, r4 ; if (bb) lowvalue += split
- subcs r4, r5, r4 ; if (bb) range = range-split
-
- ; Counting the leading zeros is used to normalize range.
- clz r6, r4
- sub r6, r6, #24 ; shift
-
- ; Flag is set on the sum of count. This flag is used later
- ; to determine if count >= 0
- adds r3, r3, r6 ; count += shift
- lsl r5, r4, r6 ; range <<= shift
- bmi token_count_lt_zero ; if(count >= 0)
-
- sub r6, r6, r3 ; offset = shift - count
- sub r4, r6, #1 ; offset-1
- lsls r4, r2, r4 ; if((lowvalue<<(offset-1)) & 0x80000000 )
- bpl token_high_bit_not_set
-
- ldr r4, [r0, #vp9_writer_pos] ; x
- sub r4, r4, #1 ; x = w->pos-1
- b token_zero_while_start
-token_zero_while_loop
- mov r10, #0
- strb r10, [r7, r4] ; w->buffer[x] =(unsigned char)0
- sub r4, r4, #1 ; x--
-token_zero_while_start
- cmp r4, #0
- ldrge r7, [r0, #vp9_writer_buffer]
- ldrb r11, [r7, r4]
- cmpge r11, #0xff
- beq token_zero_while_loop
-
- ldr r7, [r0, #vp9_writer_buffer]
- ldrb r10, [r7, r4] ; w->buffer[x]
- add r10, r10, #1
- strb r10, [r7, r4] ; w->buffer[x] + 1
-token_high_bit_not_set
- rsb r4, r6, #24 ; 24-offset
- ldr r10, [r0, #vp9_writer_buffer]
- lsr r7, r2, r4 ; lowvalue >> (24-offset)
- ldr r4, [r0, #vp9_writer_pos] ; w->pos
- lsl r2, r2, r6 ; lowvalue <<= offset
- mov r6, r3 ; shift = count
- add r11, r4, #1 ; w->pos++
- bic r2, r2, #0xff000000 ; lowvalue &= 0xffffff
- str r11, [r0, #vp9_writer_pos]
- sub r3, r3, #8 ; count -= 8
- strb r7, [r10, r4] ; w->buffer[w->pos++]
-
-    ; r10 held vp8_coef_tree earlier in the loop, but is
-    ; used as a temp variable here. So after r10 is used, reload
-    ; vp8_coef_tree into r10
- ldr r10, [sp, #60] ; vp8_coef_tree
-
-token_count_lt_zero
- lsl r2, r2, r6 ; lowvalue <<= shift
-
- subs r8, r8, #1 ; --n
- bne token_loop
-
- ldrb r6, [r1, #tokenextra_token] ; t
- ldr r7, [sp, #8] ; vp8_extra_bits
- ; Add t * sizeof (vp9_extra_bit_struct) to get the desired
-    ; element. Here sizeof (vp9_extra_bit_struct) == 16
- add r12, r7, r6, lsl #4 ; b = vp8_extra_bits + t
-
- ldr r4, [r12, #vp9_extra_bit_struct_base_val]
- cmp r4, #0
- beq skip_extra_bits
-
-; if( b->base_val)
- ldr r8, [r12, #vp9_extra_bit_struct_len] ; L
- ldrsh lr, [r1, #tokenextra_extra] ; e = p->Extra
- cmp r8, #0 ; if( L)
- beq no_extra_bits
-
- ldr r9, [r12, #vp9_extra_bit_struct_prob]
- asr r7, lr, #1 ; v=e>>1
-
- ldr r10, [r12, #vp9_extra_bit_struct_tree]
- str r10, [sp, #4] ; b->tree
-
- rsb r4, r8, #32
- lsl r12, r7, r4
-
- mov lr, #0 ; i = 0
-
-extra_bits_loop
- ldrb r4, [r9, lr, asr #1] ; pp[i>>1]
- sub r7, r5, #1 ; range-1
- lsls r12, r12, #1 ; v >> n
- mul r6, r4, r7 ; (range-1) * pp[i>>1]
- addcs lr, lr, #1 ; i + bb
-
- mov r7, #1
- ldrsb lr, [r10, lr] ; i = b->tree[i+bb]
- add r4, r7, r6, lsr #8 ; split = 1 + (((range-1) * pp[i>>1]) >> 8)
-
- addcs r2, r2, r4 ; if (bb) lowvalue += split
- subcs r4, r5, r4 ; if (bb) range = range-split
-
- clz r6, r4
- sub r6, r6, #24
-
- adds r3, r3, r6 ; count += shift
- lsl r5, r4, r6 ; range <<= shift
- bmi extra_count_lt_zero ; if(count >= 0)
-
- sub r6, r6, r3 ; offset= shift - count
- sub r4, r6, #1 ; offset-1
- lsls r4, r2, r4 ; if((lowvalue<<(offset-1)) & 0x80000000 )
- bpl extra_high_bit_not_set
-
- ldr r4, [r0, #vp9_writer_pos] ; x
- sub r4, r4, #1 ; x = w->pos - 1
- b extra_zero_while_start
-extra_zero_while_loop
- mov r10, #0
- strb r10, [r7, r4] ; w->buffer[x] =(unsigned char)0
- sub r4, r4, #1 ; x--
-extra_zero_while_start
- cmp r4, #0
- ldrge r7, [r0, #vp9_writer_buffer]
- ldrb r11, [r7, r4]
- cmpge r11, #0xff
- beq extra_zero_while_loop
-
- ldr r7, [r0, #vp9_writer_buffer]
- ldrb r10, [r7, r4]
- add r10, r10, #1
- strb r10, [r7, r4]
-extra_high_bit_not_set
- rsb r4, r6, #24 ; 24-offset
- ldr r10, [r0, #vp9_writer_buffer]
- lsr r7, r2, r4 ; lowvalue >> (24-offset)
- ldr r4, [r0, #vp9_writer_pos]
- lsl r2, r2, r6 ; lowvalue <<= offset
- mov r6, r3 ; shift = count
- add r11, r4, #1 ; w->pos++
- bic r2, r2, #0xff000000 ; lowvalue &= 0xffffff
- str r11, [r0, #vp9_writer_pos]
- sub r3, r3, #8 ; count -= 8
- strb r7, [r10, r4] ; w->buffer[w->pos++]=(lowvalue >> (24-offset))
- ldr r10, [sp, #4] ; b->tree
-extra_count_lt_zero
- lsl r2, r2, r6
-
- subs r8, r8, #1 ; --n
- bne extra_bits_loop ; while (n)
-
-no_extra_bits
- ldr lr, [r1, #4] ; e = p->Extra
- add r4, r5, #1 ; range + 1
- tst lr, #1
- lsr r4, r4, #1 ; split = (range + 1) >> 1
- addne r2, r2, r4 ; lowvalue += split
- subne r4, r5, r4 ; range = range-split
- tst r2, #0x80000000 ; lowvalue & 0x80000000
- lsl r5, r4, #1 ; range <<= 1
- beq end_high_bit_not_set
-
- ldr r4, [r0, #vp9_writer_pos]
- mov r7, #0
- sub r4, r4, #1
- b end_zero_while_start
-end_zero_while_loop
- strb r7, [r6, r4]
- sub r4, r4, #1 ; x--
-end_zero_while_start
- cmp r4, #0
- ldrge r6, [r0, #vp9_writer_buffer]
- ldrb r12, [r6, r4]
- cmpge r12, #0xff
- beq end_zero_while_loop
-
- ldr r6, [r0, #vp9_writer_buffer]
- ldrb r7, [r6, r4]
- add r7, r7, #1
- strb r7, [r6, r4]
-end_high_bit_not_set
- adds r3, r3, #1 ; ++count
- lsl r2, r2, #1 ; lowvalue <<= 1
- bne end_count_zero
-
- ldr r4, [r0, #vp9_writer_pos]
- mvn r3, #7
- ldr r7, [r0, #vp9_writer_buffer]
- lsr r6, r2, #24 ; lowvalue >> 24
- add r12, r4, #1 ; w->pos++
- bic r2, r2, #0xff000000 ; lowvalue &= 0xffffff
- str r12, [r0, #0x10]
- strb r6, [r7, r4]
-end_count_zero
-skip_extra_bits
- add r1, r1, #TOKENEXTRA_SZ ; ++p
-check_p_lt_stop
- ldr r4, [sp, #0] ; stop
- cmp r1, r4 ; while( p < stop)
- bcc while_p_lt_stop
-
- ldr r6, [sp, #12] ; mb_rows
- ldr r7, [sp, #16] ; tokenlist address
- subs r6, r6, #1
- add r7, r7, #TOKENLIST_SZ ; next element in the array
- str r6, [sp, #12]
- bne mb_row_loop
-
- str r2, [r0, #vp9_writer_lowvalue]
- str r5, [r0, #vp9_writer_range]
- str r3, [r0, #vp9_writer_count]
- add sp, sp, #24
- pop {r4-r11, pc}
- ENDP
-
-_VP8_COMP_common_
- DCD vp8_comp_common
-_VP8_COMMON_MBrows_
- DCD vp8_common_mb_rows
-_VP8_COMP_tplist_
- DCD vp8_comp_tplist
-
- END
--- a/vp8/encoder/arm/armv5te/vp8_packtokens_partitions_armv5.asm
+++ /dev/null
@@ -1,465 +1,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
- EXPORT |vp8cx_pack_tokens_into_partitions_armv5|
-
- INCLUDE asm_enc_offsets.asm
-
- ARM
- REQUIRE8
- PRESERVE8
-
- AREA |.text|, CODE, READONLY
-
-; r0 VP8_COMP *cpi
-; r1 unsigned char *cx_data
-; r2 int num_part
-; r3 *size
-; s0 vp8_coef_encodings
-; s1 vp8_extra_bits,
-; s2 const vp9_tree_index *,
-
-|vp8cx_pack_tokens_into_partitions_armv5| PROC
- push {r4-r11, lr}
- sub sp, sp, #44
-
- ; Compute address of cpi->common.mb_rows
- ldr r4, _VP8_COMP_common_
- ldr r6, _VP8_COMMON_MBrows_
- add r4, r0, r4
-
- ldr r5, [r4, r6] ; load up mb_rows
-
- str r5, [sp, #36] ; save mb_rows
- str r1, [sp, #24] ; save cx_data
- str r2, [sp, #20] ; save num_part
- str r3, [sp, #8] ; save *size
-
- ; *size = 3*(num_part -1 );
- sub r2, r2, #1 ; num_part - 1
- add r2, r2, r2, lsl #1 ; 3*(num_part - 1)
- str r2, [r3]
-
- add r2, r2, r1 ; cx_data + *size
- str r2, [sp, #40] ; ptr
-
- ldr r4, _VP8_COMP_tplist_
- add r4, r0, r4
- ldr r7, [r4, #0] ; dereference cpi->tp_list
- str r7, [sp, #32] ; store start of cpi->tp_list
-
- ldr r11, _VP8_COMP_bc2_ ; load up vp9_writer out of cpi
- add r0, r0, r11
-
- mov r11, #0
- str r11, [sp, #28] ; i
-
-numparts_loop
- ldr r10, [sp, #40] ; ptr
- ldr r5, [sp, #36] ; move mb_rows to the counting section
- sub r5, r5, r11 ; move start point with each partition
- ; mb_rows starts at i
- str r5, [sp, #12]
-
- ; Reset all of the VP8 Writer data for each partition that
- ; is processed.
- ; start_encode
- mov r2, #0 ; vp9_writer_lowvalue
- mov r5, #255 ; vp9_writer_range
- mvn r3, #23 ; vp9_writer_count
-
- str r2, [r0, #vp9_writer_value]
- str r2, [r0, #vp9_writer_pos]
- str r10, [r0, #vp9_writer_buffer]
-
-mb_row_loop
-
- ldr r1, [r7, #tokenlist_start]
- ldr r9, [r7, #tokenlist_stop]
- str r9, [sp, #0] ; save stop for later comparison
- str r7, [sp, #16] ; tokenlist address for next time
-
- b check_p_lt_stop
-
- ; actual work gets done here!
-
-while_p_lt_stop
- ldrb r6, [r1, #tokenextra_token] ; t
- ldr r4, [sp, #80] ; vp8_coef_encodings
- mov lr, #0
- add r4, r4, r6, lsl #3 ; a = vp8_coef_encodings + t
- ldr r9, [r1, #tokenextra_context_tree] ; pp
-
- ldrb r7, [r1, #tokenextra_skip_eob_node]
-
- ldr r6, [r4, #vp9_token_value] ; v
- ldr r8, [r4, #vp9_token_len] ; n
-
- ; vp8 specific skip_eob_node
- cmp r7, #0
- movne lr, #2 ; i = 2
- subne r8, r8, #1 ; --n
-
- rsb r4, r8, #32 ; 32-n
- ldr r10, [sp, #88] ; vp8_coef_tree
-
- ; v is kept in r12 during the token pack loop
- lsl r12, r6, r4 ; r12 = v << 32 - n
-
-; loop start
-token_loop
- ldrb r4, [r9, lr, asr #1] ; pp [i>>1]
- sub r7, r5, #1 ; range-1
-
- ; Decisions are made based on the bit value shifted
- ; off of v, so set a flag here based on this.
-    ; This value is referred to as "bb"
- lsls r12, r12, #1 ; bb = v >> n
- mul r6, r4, r7 ; ((range-1) * pp[i>>1]))
-
- ; bb can only be 0 or 1. So only execute this statement
- ; if bb == 1, otherwise it will act like i + 0
- addcs lr, lr, #1 ; i + bb
-
- mov r7, #1
- ldrsb lr, [r10, lr] ; i = vp8_coef_tree[i+bb]
- add r4, r7, r6, lsr #8 ; 1 + (((range-1) * pp[i>>1]) >> 8)
-
- addcs r2, r2, r4 ; if (bb) lowvalue += split
- subcs r4, r5, r4 ; if (bb) range = range-split
-
- ; Counting the leading zeros is used to normalize range.
- clz r6, r4
- sub r6, r6, #24 ; shift
-
- ; Flag is set on the sum of count. This flag is used later
- ; to determine if count >= 0
- adds r3, r3, r6 ; count += shift
- lsl r5, r4, r6 ; range <<= shift
- bmi token_count_lt_zero ; if(count >= 0)
-
- sub r6, r6, r3 ; offset = shift - count
- sub r4, r6, #1 ; offset-1
- lsls r4, r2, r4 ; if((lowvalue<<(offset-1)) & 0x80000000 )
- bpl token_high_bit_not_set
-
- ldr r4, [r0, #vp9_writer_pos] ; x
- sub r4, r4, #1 ; x = w->pos-1
- b token_zero_while_start
-token_zero_while_loop
- mov r10, #0
- strb r10, [r7, r4] ; w->buffer[x] =(unsigned char)0
- sub r4, r4, #1 ; x--
-token_zero_while_start
- cmp r4, #0
- ldrge r7, [r0, #vp9_writer_buffer]
- ldrb r11, [r7, r4]
- cmpge r11, #0xff
- beq token_zero_while_loop
-
- ldr r7, [r0, #vp9_writer_buffer]
- ldrb r10, [r7, r4] ; w->buffer[x]
- add r10, r10, #1
- strb r10, [r7, r4] ; w->buffer[x] + 1
-token_high_bit_not_set
- rsb r4, r6, #24 ; 24-offset
- ldr r10, [r0, #vp9_writer_buffer]
- lsr r7, r2, r4 ; lowvalue >> (24-offset)
- ldr r4, [r0, #vp9_writer_pos] ; w->pos
- lsl r2, r2, r6 ; lowvalue <<= offset
- mov r6, r3 ; shift = count
- add r11, r4, #1 ; w->pos++
- bic r2, r2, #0xff000000 ; lowvalue &= 0xffffff
- str r11, [r0, #vp9_writer_pos]
- sub r3, r3, #8 ; count -= 8
- strb r7, [r10, r4] ; w->buffer[w->pos++]
-
-    ; r10 held vp8_coef_tree earlier in the loop, but is
-    ; used as a temp variable here. So after r10 is used, reload
-    ; vp8_coef_tree into r10
- ldr r10, [sp, #88] ; vp8_coef_tree
-
-token_count_lt_zero
- lsl r2, r2, r6 ; lowvalue <<= shift
-
- subs r8, r8, #1 ; --n
- bne token_loop
-
- ldrb r6, [r1, #tokenextra_token] ; t
- ldr r7, [sp, #84] ; vp8_extra_bits
- ; Add t * sizeof (vp9_extra_bit_struct) to get the desired
-    ; element. Here sizeof (vp9_extra_bit_struct) == 16
- add r12, r7, r6, lsl #4 ; b = vp8_extra_bits + t
-
- ldr r4, [r12, #vp9_extra_bit_struct_base_val]
- cmp r4, #0
- beq skip_extra_bits
-
-; if( b->base_val)
- ldr r8, [r12, #vp9_extra_bit_struct_len] ; L
- ldrsh lr, [r1, #tokenextra_extra] ; e = p->Extra
- cmp r8, #0 ; if( L)
- beq no_extra_bits
-
- ldr r9, [r12, #vp9_extra_bit_struct_prob]
- asr r7, lr, #1 ; v=e>>1
-
- ldr r10, [r12, #vp9_extra_bit_struct_tree]
- str r10, [sp, #4] ; b->tree
-
- rsb r4, r8, #32
- lsl r12, r7, r4
-
- mov lr, #0 ; i = 0
-
-extra_bits_loop
- ldrb r4, [r9, lr, asr #1] ; pp[i>>1]
- sub r7, r5, #1 ; range-1
- lsls r12, r12, #1 ; v >> n
- mul r6, r4, r7 ; (range-1) * pp[i>>1]
- addcs lr, lr, #1 ; i + bb
-
- mov r7, #1
- ldrsb lr, [r10, lr] ; i = b->tree[i+bb]
- add r4, r7, r6, lsr #8 ; split = 1 + (((range-1) * pp[i>>1]) >> 8)
-
- addcs r2, r2, r4 ; if (bb) lowvalue += split
- subcs r4, r5, r4 ; if (bb) range = range-split
-
- clz r6, r4
- sub r6, r6, #24
-
- adds r3, r3, r6 ; count += shift
- lsl r5, r4, r6 ; range <<= shift
- bmi extra_count_lt_zero ; if(count >= 0)
-
- sub r6, r6, r3 ; offset= shift - count
- sub r4, r6, #1 ; offset-1
- lsls r4, r2, r4 ; if((lowvalue<<(offset-1)) & 0x80000000 )
- bpl extra_high_bit_not_set
-
- ldr r4, [r0, #vp9_writer_pos] ; x
- sub r4, r4, #1 ; x = w->pos - 1
- b extra_zero_while_start
-extra_zero_while_loop
- mov r10, #0
- strb r10, [r7, r4] ; w->buffer[x] =(unsigned char)0
- sub r4, r4, #1 ; x--
-extra_zero_while_start
- cmp r4, #0
- ldrge r7, [r0, #vp9_writer_buffer]
- ldrb r11, [r7, r4]
- cmpge r11, #0xff
- beq extra_zero_while_loop
-
- ldr r7, [r0, #vp9_writer_buffer]
- ldrb r10, [r7, r4]
- add r10, r10, #1
- strb r10, [r7, r4]
-extra_high_bit_not_set
- rsb r4, r6, #24 ; 24-offset
- ldr r10, [r0, #vp9_writer_buffer]
- lsr r7, r2, r4 ; lowvalue >> (24-offset)
- ldr r4, [r0, #vp9_writer_pos]
- lsl r2, r2, r6 ; lowvalue <<= offset
- mov r6, r3 ; shift = count
- add r11, r4, #1 ; w->pos++
- bic r2, r2, #0xff000000 ; lowvalue &= 0xffffff
- str r11, [r0, #vp9_writer_pos]
- sub r3, r3, #8 ; count -= 8
- strb r7, [r10, r4] ; w->buffer[w->pos++]=(lowvalue >> (24-offset))
- ldr r10, [sp, #4] ; b->tree
-extra_count_lt_zero
- lsl r2, r2, r6
-
- subs r8, r8, #1 ; --n
- bne extra_bits_loop ; while (n)
-
-no_extra_bits
- ldr lr, [r1, #4] ; e = p->Extra
- add r4, r5, #1 ; range + 1
- tst lr, #1
- lsr r4, r4, #1 ; split = (range + 1) >> 1
- addne r2, r2, r4 ; lowvalue += split
- subne r4, r5, r4 ; range = range-split
- tst r2, #0x80000000 ; lowvalue & 0x80000000
- lsl r5, r4, #1 ; range <<= 1
- beq end_high_bit_not_set
-
- ldr r4, [r0, #vp9_writer_pos]
- mov r7, #0
- sub r4, r4, #1
- b end_zero_while_start
-end_zero_while_loop
- strb r7, [r6, r4]
- sub r4, r4, #1 ; x--
-end_zero_while_start
- cmp r4, #0
- ldrge r6, [r0, #vp9_writer_buffer]
- ldrb r12, [r6, r4]
- cmpge r12, #0xff
- beq end_zero_while_loop
-
- ldr r6, [r0, #vp9_writer_buffer]
- ldrb r7, [r6, r4]
- add r7, r7, #1
- strb r7, [r6, r4]
-end_high_bit_not_set
- adds r3, r3, #1 ; ++count
- lsl r2, r2, #1 ; lowvalue <<= 1
- bne end_count_zero
-
- ldr r4, [r0, #vp9_writer_pos]
- mvn r3, #7
- ldr r7, [r0, #vp9_writer_buffer]
- lsr r6, r2, #24 ; lowvalue >> 24
- add r12, r4, #1 ; w->pos++
- bic r2, r2, #0xff000000 ; lowvalue &= 0xffffff
- str r12, [r0, #0x10]
- strb r6, [r7, r4]
-end_count_zero
-skip_extra_bits
- add r1, r1, #TOKENEXTRA_SZ ; ++p
-check_p_lt_stop
- ldr r4, [sp, #0] ; stop
- cmp r1, r4 ; while( p < stop)
- bcc while_p_lt_stop
-
- ldr r10, [sp, #20] ; num_parts
- mov r1, #TOKENLIST_SZ
- mul r1, r10, r1
-
- ldr r6, [sp, #12] ; mb_rows
- ldr r7, [sp, #16] ; tokenlist address
- subs r6, r6, r10
- add r7, r7, r1 ; next element in the array
- str r6, [sp, #12]
- bgt mb_row_loop
-
- mov r12, #32
-
-stop_encode_loop
- sub r7, r5, #1 ; range-1
-
- mov r4, r7, lsl #7 ; ((range-1) * 128)
-
- mov r7, #1
- add r4, r7, r4, lsr #8 ; 1 + (((range-1) * 128) >> 8)
-
- ; Counting the leading zeros is used to normalize range.
- clz r6, r4
- sub r6, r6, #24 ; shift
-
- ; Flag is set on the sum of count. This flag is used later
- ; to determine if count >= 0
- adds r3, r3, r6 ; count += shift
- lsl r5, r4, r6 ; range <<= shift
- bmi token_count_lt_zero_se ; if(count >= 0)
-
- sub r6, r6, r3 ; offset = shift - count
- sub r4, r6, #1 ; offset-1
- lsls r4, r2, r4 ; if((lowvalue<<(offset-1)) & 0x80000000 )
- bpl token_high_bit_not_set_se
-
- ldr r4, [r0, #vp9_writer_pos] ; x
- sub r4, r4, #1 ; x = w->pos-1
- b token_zero_while_start_se
-token_zero_while_loop_se
- mov r10, #0
- strb r10, [r7, r4] ; w->buffer[x] =(unsigned char)0
- sub r4, r4, #1 ; x--
-token_zero_while_start_se
- cmp r4, #0
- ldrge r7, [r0, #vp9_writer_buffer]
- ldrb r11, [r7, r4]
- cmpge r11, #0xff
- beq token_zero_while_loop_se
-
- ldr r7, [r0, #vp9_writer_buffer]
- ldrb r10, [r7, r4] ; w->buffer[x]
- add r10, r10, #1
- strb r10, [r7, r4] ; w->buffer[x] + 1
-token_high_bit_not_set_se
- rsb r4, r6, #24 ; 24-offset
- ldr r10, [r0, #vp9_writer_buffer]
- lsr r7, r2, r4 ; lowvalue >> (24-offset)
- ldr r4, [r0, #vp9_writer_pos] ; w->pos
- lsl r2, r2, r6 ; lowvalue <<= offset
- mov r6, r3 ; shift = count
- add r11, r4, #1 ; w->pos++
- bic r2, r2, #0xff000000 ; lowvalue &= 0xffffff
- str r11, [r0, #vp9_writer_pos]
- sub r3, r3, #8 ; count -= 8
- strb r7, [r10, r4] ; w->buffer[w->pos++]
-
-token_count_lt_zero_se
- lsl r2, r2, r6 ; lowvalue <<= shift
-
- subs r12, r12, #1
- bne stop_encode_loop
-
- ldr r10, [sp, #8] ; *size
- ldr r11, [r10]
- ldr r4, [r0, #vp9_writer_pos] ; w->pos
- add r11, r11, r4 ; *size += w->pos
- str r11, [r10]
-
- ldr r9, [sp, #20] ; num_parts
- sub r9, r9, #1
- ldr r10, [sp, #28] ; i
- cmp r10, r9 ; if(i<(num_part - 1))
- bge skip_write_partition
-
- ldr r12, [sp, #40] ; ptr
- add r12, r12, r4 ; ptr += w->pos
- str r12, [sp, #40]
-
- ldr r9, [sp, #24] ; cx_data
- mov r8, r4, asr #8
- strb r4, [r9, #0]
- strb r8, [r9, #1]
- mov r4, r4, asr #16
- strb r4, [r9, #2]
-
- add r9, r9, #3 ; cx_data += 3
- str r9, [sp, #24]
-
-skip_write_partition
-
- ldr r11, [sp, #28] ; i
- ldr r10, [sp, #20] ; num_parts
-
- add r11, r11, #1 ; i++
- str r11, [sp, #28]
-
- ldr r7, [sp, #32] ; cpi->tp_list[i]
- mov r1, #TOKENLIST_SZ
- add r7, r7, r1 ; next element in cpi->tp_list
- str r7, [sp, #32] ; cpi->tp_list[i+1]
-
- cmp r10, r11
- bgt numparts_loop
-
-
- add sp, sp, #44
- pop {r4-r11, pc}
- ENDP
-
-_VP8_COMP_common_
- DCD vp8_comp_common
-_VP8_COMMON_MBrows_
- DCD vp8_common_mb_rows
-_VP8_COMP_tplist_
- DCD vp8_comp_tplist
-_VP8_COMP_bc2_
- DCD vp8_comp_bc2
-
- END
--- a/vp8/encoder/arm/armv6/vp8_fast_quantize_b_armv6.asm
+++ /dev/null
@@ -1,224 +1,0 @@
-;
-; Copyright (c) 2011 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
- EXPORT |vp8_fast_quantize_b_armv6|
-
- INCLUDE asm_enc_offsets.asm
-
- ARM
- REQUIRE8
- PRESERVE8
-
- AREA ||.text||, CODE, READONLY, ALIGN=2
-
-; r0 BLOCK *b
-; r1 BLOCKD *d
-|vp8_fast_quantize_b_armv6| PROC
- stmfd sp!, {r1, r4-r11, lr}
-
- ldr r3, [r0, #vp8_block_coeff] ; coeff
- ldr r4, [r0, #vp8_block_quant_fast] ; quant_fast
- ldr r5, [r0, #vp8_block_round] ; round
- ldr r6, [r1, #vp8_blockd_qcoeff] ; qcoeff
- ldr r7, [r1, #vp8_blockd_dqcoeff] ; dqcoeff
- ldr r8, [r1, #vp8_blockd_dequant] ; dequant
-
- ldr r2, loop_count ; loop_count=0x1000000. 'lsls' instruction
- ; is used to update the counter so that
- ; it can be used to mark nonzero
- ; quantized coefficient pairs.
-
- mov r1, #0 ; flags for quantized coeffs
-
- ; PART 1: quantization and dequantization loop
-loop
- ldr r9, [r3], #4 ; [z1 | z0]
- ldr r10, [r5], #4 ; [r1 | r0]
- ldr r11, [r4], #4 ; [q1 | q0]
-
- ssat16 lr, #1, r9 ; [sz1 | sz0]
- eor r9, r9, lr ; [z1 ^ sz1 | z0 ^ sz0]
- ssub16 r9, r9, lr ; x = (z ^ sz) - sz
- sadd16 r9, r9, r10 ; [x1+r1 | x0+r0]
-
- ldr r12, [r3], #4 ; [z3 | z2]
-
- smulbb r0, r9, r11 ; [(x0+r0)*q0]
- smultt r9, r9, r11 ; [(x1+r1)*q1]
-
- ldr r10, [r5], #4 ; [r3 | r2]
-
- ssat16 r11, #1, r12 ; [sz3 | sz2]
- eor r12, r12, r11 ; [z3 ^ sz3 | z2 ^ sz2]
- pkhtb r0, r9, r0, asr #16 ; [y1 | y0]
- ldr r9, [r4], #4 ; [q3 | q2]
- ssub16 r12, r12, r11 ; x = (z ^ sz) - sz
-
- sadd16 r12, r12, r10 ; [x3+r3 | x2+r2]
-
- eor r0, r0, lr ; [(y1 ^ sz1) | (y0 ^ sz0)]
-
- smulbb r10, r12, r9 ; [(x2+r2)*q2]
- smultt r12, r12, r9 ; [(x3+r3)*q3]
-
- ssub16 r0, r0, lr ; x = (y ^ sz) - sz
-
- cmp r0, #0 ; check if zero
- orrne r1, r1, r2, lsr #24 ; add flag for nonzero coeffs
-
- str r0, [r6], #4 ; *qcoeff++ = x
- ldr r9, [r8], #4 ; [dq1 | dq0]
-
- pkhtb r10, r12, r10, asr #16 ; [y3 | y2]
- eor r10, r10, r11 ; [(y3 ^ sz3) | (y2 ^ sz2)]
- ssub16 r10, r10, r11 ; x = (y ^ sz) - sz
-
- cmp r10, #0 ; check if zero
- orrne r1, r1, r2, lsr #23 ; add flag for nonzero coeffs
-
- str r10, [r6], #4 ; *qcoeff++ = x
- ldr r11, [r8], #4 ; [dq3 | dq2]
-
- smulbb r12, r0, r9 ; [x0*dq0]
- smultt r0, r0, r9 ; [x1*dq1]
-
- smulbb r9, r10, r11 ; [x2*dq2]
- smultt r10, r10, r11 ; [x3*dq3]
-
- lsls r2, r2, #2 ; update loop counter
- strh r12, [r7, #0] ; dqcoeff[0] = [x0*dq0]
- strh r0, [r7, #2] ; dqcoeff[1] = [x1*dq1]
- strh r9, [r7, #4] ; dqcoeff[2] = [x2*dq2]
- strh r10, [r7, #6] ; dqcoeff[3] = [x3*dq3]
- add r7, r7, #8 ; dqcoeff += 8
- bne loop
-
- ; PART 2: check position for eob...
- mov lr, #0 ; init eob
- cmp r1, #0 ; coeffs after quantization?
- ldr r11, [sp, #0] ; restore BLOCKD pointer
- beq end ; skip eob calculations if all zero
-
- ldr r0, [r11, #vp8_blockd_qcoeff]
-
- ; check shortcut for nonzero qcoeffs
- tst r1, #0x80
- bne quant_coeff_15_14
- tst r1, #0x20
- bne quant_coeff_13_11
- tst r1, #0x8
- bne quant_coeff_12_7
- tst r1, #0x40
- bne quant_coeff_10_9
- tst r1, #0x10
- bne quant_coeff_8_3
- tst r1, #0x2
- bne quant_coeff_6_5
- tst r1, #0x4
- bne quant_coeff_4_2
- b quant_coeff_1_0
-
-quant_coeff_15_14
- ldrh r2, [r0, #30] ; rc=15, i=15
- mov lr, #16
- cmp r2, #0
- bne end
-
- ldrh r3, [r0, #28] ; rc=14, i=14
- mov lr, #15
- cmp r3, #0
- bne end
-
-quant_coeff_13_11
- ldrh r2, [r0, #22] ; rc=11, i=13
- mov lr, #14
- cmp r2, #0
- bne end
-
-quant_coeff_12_7
- ldrh r3, [r0, #14] ; rc=7, i=12
- mov lr, #13
- cmp r3, #0
- bne end
-
- ldrh r2, [r0, #20] ; rc=10, i=11
- mov lr, #12
- cmp r2, #0
- bne end
-
-quant_coeff_10_9
- ldrh r3, [r0, #26] ; rc=13, i=10
- mov lr, #11
- cmp r3, #0
- bne end
-
- ldrh r2, [r0, #24] ; rc=12, i=9
- mov lr, #10
- cmp r2, #0
- bne end
-
-quant_coeff_8_3
- ldrh r3, [r0, #18] ; rc=9, i=8
- mov lr, #9
- cmp r3, #0
- bne end
-
- ldrh r2, [r0, #12] ; rc=6, i=7
- mov lr, #8
- cmp r2, #0
- bne end
-
-quant_coeff_6_5
- ldrh r3, [r0, #6] ; rc=3, i=6
- mov lr, #7
- cmp r3, #0
- bne end
-
- ldrh r2, [r0, #4] ; rc=2, i=5
- mov lr, #6
- cmp r2, #0
- bne end
-
-quant_coeff_4_2
- ldrh r3, [r0, #10] ; rc=5, i=4
- mov lr, #5
- cmp r3, #0
- bne end
-
- ldrh r2, [r0, #16] ; rc=8, i=3
- mov lr, #4
- cmp r2, #0
- bne end
-
- ldrh r3, [r0, #8] ; rc=4, i=2
- mov lr, #3
- cmp r3, #0
- bne end
-
-quant_coeff_1_0
- ldrh r2, [r0, #2] ; rc=1, i=1
- mov lr, #2
- cmp r2, #0
- bne end
-
- mov lr, #1 ; rc=0, i=0
-
-end
- str lr, [r11, #vp8_blockd_eob]
- ldmfd sp!, {r1, r4-r11, pc}
-
- ENDP
-
-loop_count
- DCD 0x1000000
-
- END
-
--- a/vp8/encoder/arm/armv6/vp8_mse16x16_armv6.asm
+++ /dev/null
@@ -1,138 +1,0 @@
-;
-; Copyright (c) 2011 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
- EXPORT |vp8_mse16x16_armv6|
-
- ARM
-
- AREA ||.text||, CODE, READONLY, ALIGN=2
-
-; r0 unsigned char *src_ptr
-; r1 int source_stride
-; r2 unsigned char *ref_ptr
-; r3 int recon_stride
-; stack unsigned int *sse
-;
-;note: Based on vp9_variance16x16_armv6. In this function, sum is never used.
-;      So, we can remove that part of the calculation.
-
-|vp8_mse16x16_armv6| PROC
-
- push {r4-r9, lr}
-
- pld [r0, r1, lsl #0]
- pld [r2, r3, lsl #0]
-
- mov r12, #16 ; set loop counter to 16 (=block height)
- mov r4, #0 ; initialize sse = 0
-
-loop
- ; 1st 4 pixels
- ldr r5, [r0, #0x0] ; load 4 src pixels
- ldr r6, [r2, #0x0] ; load 4 ref pixels
-
- mov lr, #0 ; constant zero
-
- usub8 r8, r5, r6 ; calculate difference
- pld [r0, r1, lsl #1]
- sel r7, r8, lr ; select bytes with positive difference
- usub8 r9, r6, r5 ; calculate difference with reversed operands
- pld [r2, r3, lsl #1]
- sel r8, r9, lr ; select bytes with negative difference
-
- ; calculate partial sums
- usad8 r5, r7, lr ; calculate sum of positive differences
- usad8 r6, r8, lr ; calculate sum of negative differences
- orr r8, r8, r7 ; differences of all 4 pixels
-
- ldr r5, [r0, #0x4] ; load 4 src pixels
-
- ; calculate sse
- uxtb16 r6, r8 ; byte (two pixels) to halfwords
- uxtb16 r7, r8, ror #8 ; another two pixels to halfwords
- smlad r4, r6, r6, r4 ; dual signed multiply, add and accumulate (1)
-
- ; 2nd 4 pixels
- ldr r6, [r2, #0x4] ; load 4 ref pixels
- smlad r4, r7, r7, r4 ; dual signed multiply, add and accumulate (2)
-
- usub8 r8, r5, r6 ; calculate difference
- sel r7, r8, lr ; select bytes with positive difference
- usub8 r9, r6, r5 ; calculate difference with reversed operands
- sel r8, r9, lr ; select bytes with negative difference
-
- ; calculate partial sums
- usad8 r5, r7, lr ; calculate sum of positive differences
- usad8 r6, r8, lr ; calculate sum of negative differences
- orr r8, r8, r7 ; differences of all 4 pixels
- ldr r5, [r0, #0x8] ; load 4 src pixels
- ; calculate sse
- uxtb16 r6, r8 ; byte (two pixels) to halfwords
- uxtb16 r7, r8, ror #8 ; another two pixels to halfwords
- smlad r4, r6, r6, r4 ; dual signed multiply, add and accumulate (1)
-
- ; 3rd 4 pixels
- ldr r6, [r2, #0x8] ; load 4 ref pixels
- smlad r4, r7, r7, r4 ; dual signed multiply, add and accumulate (2)
-
- usub8 r8, r5, r6 ; calculate difference
- sel r7, r8, lr ; select bytes with positive difference
- usub8 r9, r6, r5 ; calculate difference with reversed operands
- sel r8, r9, lr ; select bytes with negative difference
-
- ; calculate partial sums
- usad8 r5, r7, lr ; calculate sum of positive differences
- usad8 r6, r8, lr ; calculate sum of negative differences
- orr r8, r8, r7 ; differences of all 4 pixels
-
- ldr r5, [r0, #0xc] ; load 4 src pixels
-
- ; calculate sse
- uxtb16 r6, r8 ; byte (two pixels) to halfwords
- uxtb16 r7, r8, ror #8 ; another two pixels to halfwords
- smlad r4, r6, r6, r4 ; dual signed multiply, add and accumulate (1)
-
- ; 4th 4 pixels
- ldr r6, [r2, #0xc] ; load 4 ref pixels
- smlad r4, r7, r7, r4 ; dual signed multiply, add and accumulate (2)
-
- usub8 r8, r5, r6 ; calculate difference
- add r0, r0, r1 ; set src_ptr to next row
- sel r7, r8, lr ; select bytes with positive difference
- usub8 r9, r6, r5 ; calculate difference with reversed operands
- add r2, r2, r3 ; set dst_ptr to next row
- sel r8, r9, lr ; select bytes with negative difference
-
- ; calculate partial sums
- usad8 r5, r7, lr ; calculate sum of positive differences
- usad8 r6, r8, lr ; calculate sum of negative differences
- orr r8, r8, r7 ; differences of all 4 pixels
-
- subs r12, r12, #1 ; next row
-
- ; calculate sse
- uxtb16 r6, r8 ; byte (two pixels) to halfwords
- uxtb16 r7, r8, ror #8 ; another two pixels to halfwords
- smlad r4, r6, r6, r4 ; dual signed multiply, add and accumulate (1)
- smlad r4, r7, r7, r4 ; dual signed multiply, add and accumulate (2)
-
- bne loop
-
- ; return stuff
- ldr r1, [sp, #28] ; get address of sse
- mov r0, r4 ; return sse
- str r4, [r1] ; store sse
-
- pop {r4-r9, pc}
-
- ENDP
-
- END
--- a/vp8/encoder/arm/armv6/vp8_sad16x16_armv6.asm
+++ /dev/null
@@ -1,96 +1,0 @@
-;
-; Copyright (c) 2011 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
- EXPORT |vp8_sad16x16_armv6|
-
- ARM
- REQUIRE8
- PRESERVE8
-
- AREA ||.text||, CODE, READONLY, ALIGN=2
-
-; r0 const unsigned char *src_ptr
-; r1 int src_stride
-; r2 const unsigned char *ref_ptr
-; r3 int ref_stride
-; stack max_sad (not used)
-|vp8_sad16x16_armv6| PROC
- stmfd sp!, {r4-r12, lr}
-
- pld [r0, r1, lsl #0]
- pld [r2, r3, lsl #0]
- pld [r0, r1, lsl #1]
- pld [r2, r3, lsl #1]
-
- mov r4, #0 ; sad = 0;
- mov r5, #8 ; loop count
-
-loop
- ; 1st row
- ldr r6, [r0, #0x0] ; load 4 src pixels (1A)
- ldr r8, [r2, #0x0] ; load 4 ref pixels (1A)
- ldr r7, [r0, #0x4] ; load 4 src pixels (1A)
- ldr r9, [r2, #0x4] ; load 4 ref pixels (1A)
- ldr r10, [r0, #0x8] ; load 4 src pixels (1B)
- ldr r11, [r0, #0xC] ; load 4 src pixels (1B)
-
- usada8 r4, r8, r6, r4 ; calculate sad for 4 pixels
- usad8 r8, r7, r9 ; calculate sad for 4 pixels
-
- ldr r12, [r2, #0x8] ; load 4 ref pixels (1B)
- ldr lr, [r2, #0xC] ; load 4 ref pixels (1B)
-
- add r0, r0, r1 ; set src pointer to next row
- add r2, r2, r3 ; set dst pointer to next row
-
- pld [r0, r1, lsl #1]
- pld [r2, r3, lsl #1]
-
- usada8 r4, r10, r12, r4 ; calculate sad for 4 pixels
- usada8 r8, r11, lr, r8 ; calculate sad for 4 pixels
-
- ldr r6, [r0, #0x0] ; load 4 src pixels (2A)
- ldr r7, [r0, #0x4] ; load 4 src pixels (2A)
- add r4, r4, r8 ; add partial sad values
-
- ; 2nd row
- ldr r8, [r2, #0x0] ; load 4 ref pixels (2A)
- ldr r9, [r2, #0x4] ; load 4 ref pixels (2A)
- ldr r10, [r0, #0x8] ; load 4 src pixels (2B)
- ldr r11, [r0, #0xC] ; load 4 src pixels (2B)
-
- usada8 r4, r6, r8, r4 ; calculate sad for 4 pixels
- usad8 r8, r7, r9 ; calculate sad for 4 pixels
-
- ldr r12, [r2, #0x8] ; load 4 ref pixels (2B)
- ldr lr, [r2, #0xC] ; load 4 ref pixels (2B)
-
- add r0, r0, r1 ; set src pointer to next row
- add r2, r2, r3 ; set dst pointer to next row
-
- usada8 r4, r10, r12, r4 ; calculate sad for 4 pixels
- usada8 r8, r11, lr, r8 ; calculate sad for 4 pixels
-
- pld [r0, r1, lsl #1]
- pld [r2, r3, lsl #1]
-
- subs r5, r5, #1 ; decrement loop counter
- add r4, r4, r8 ; add partial sad values
-
- bne loop
-
- mov r0, r4 ; return sad
- ldmfd sp!, {r4-r12, pc}
-
- ENDP
-
- END
-
--- a/vp8/encoder/arm/armv6/vp8_short_fdct4x4_armv6.asm
+++ /dev/null
@@ -1,262 +1,0 @@
-;
-; Copyright (c) 2011 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
- EXPORT |vp8_short_fdct4x4_armv6|
-
- ARM
- REQUIRE8
- PRESERVE8
-
- AREA |.text|, CODE, READONLY
-; void vp8_short_fdct4x4_c(short *input, short *output, int pitch)
-|vp8_short_fdct4x4_armv6| PROC
-
- stmfd sp!, {r4 - r12, lr}
-
- ; PART 1
-
- ; coeffs 0-3
- ldrd r4, r5, [r0] ; [i1 | i0] [i3 | i2]
-
- ldr r10, c7500
- ldr r11, c14500
- ldr r12, c0x22a453a0 ; [2217*4 | 5352*4]
- ldr lr, c0x00080008
- ror r5, r5, #16 ; [i2 | i3]
-
- qadd16 r6, r4, r5 ; [i1+i2 | i0+i3] = [b1 | a1] without shift
- qsub16 r7, r4, r5 ; [i1-i2 | i0-i3] = [c1 | d1] without shift
-
- add r0, r0, r2 ; update input pointer
-
- qadd16 r7, r7, r7 ; 2*[c1|d1] --> we can use smlad and smlsd
- ; with 2217*4 and 5352*4 without losing the
- ; sign bit (overflow)
-
- smuad r4, r6, lr ; o0 = (i1+i2)*8 + (i0+i3)*8
- smusd r5, r6, lr ; o2 = (i1+i2)*8 - (i0+i3)*8
-
- smlad r6, r7, r12, r11 ; o1 = (c1 * 2217 + d1 * 5352 + 14500)
- smlsdx r7, r7, r12, r10 ; o3 = (d1 * 2217 - c1 * 5352 + 7500)
-
- ldrd r8, r9, [r0] ; [i5 | i4] [i7 | i6]
-
- pkhbt r3, r4, r6, lsl #4 ; [o1 | o0], keep in register for PART 2
- pkhbt r6, r5, r7, lsl #4 ; [o3 | o2]
-
- str r6, [r1, #4]
-
- ; coeffs 4-7
- ror r9, r9, #16 ; [i6 | i7]
-
- qadd16 r6, r8, r9 ; [i5+i6 | i4+i7] = [b1 | a1] without shift
- qsub16 r7, r8, r9 ; [i5-i6 | i4-i7] = [c1 | d1] without shift
-
- add r0, r0, r2 ; update input pointer
-
- qadd16 r7, r7, r7 ; 2x[c1|d1] --> we can use smlad and smlsd
- ; with 2217*4 and 5352*4 without losing the
- ; sign bit (overflow)
-
- smuad r9, r6, lr ; o4 = (i5+i6)*8 + (i4+i7)*8
- smusd r8, r6, lr ; o6 = (i5+i6)*8 - (i4+i7)*8
-
- smlad r6, r7, r12, r11 ; o5 = (c1 * 2217 + d1 * 5352 + 14500)
- smlsdx r7, r7, r12, r10 ; o7 = (d1 * 2217 - c1 * 5352 + 7500)
-
- ldrd r4, r5, [r0] ; [i9 | i8] [i11 | i10]
-
- pkhbt r9, r9, r6, lsl #4 ; [o5 | o4], keep in register for PART 2
- pkhbt r6, r8, r7, lsl #4 ; [o7 | o6]
-
- str r6, [r1, #12]
-
- ; coeffs 8-11
- ror r5, r5, #16 ; [i10 | i11]
-
- qadd16 r6, r4, r5 ; [i9+i10 | i8+i11]=[b1 | a1] without shift
- qsub16 r7, r4, r5 ; [i9-i10 | i8-i11]=[c1 | d1] without shift
-
- add r0, r0, r2 ; update input pointer
-
- qadd16 r7, r7, r7 ; 2x[c1|d1] --> we can use smlad and smlsd
- ; with 2217*4 and 5352*4 without losing the
- ; sign bit (overflow)
-
- smuad r2, r6, lr ; o8 = (i9+i10)*8 + (i8+i11)*8
- smusd r8, r6, lr ; o10 = (i9+i10)*8 - (i8+i11)*8
-
- smlad r6, r7, r12, r11 ; o9 = (c1 * 2217 + d1 * 5352 + 14500)
- smlsdx r7, r7, r12, r10 ; o11 = (d1 * 2217 - c1 * 5352 + 7500)
-
- ldrd r4, r5, [r0] ; [i13 | i12] [i15 | i14]
-
- pkhbt r2, r2, r6, lsl #4 ; [o9 | o8], keep in register for PART 2
- pkhbt r6, r8, r7, lsl #4 ; [o11 | o10]
-
- str r6, [r1, #20]
-
- ; coeffs 12-15
- ror r5, r5, #16 ; [i14 | i15]
-
- qadd16 r6, r4, r5 ; [i13+i14 | i12+i15]=[b1|a1] without shift
- qsub16 r7, r4, r5 ; [i13-i14 | i12-i15]=[c1|d1] without shift
-
- qadd16 r7, r7, r7 ; 2x[c1|d1] --> we can use smlad and smlsd
- ; with 2217*4 and 5352*4 without losing the
- ; sign bit (overflow)
-
- smuad r4, r6, lr ; o12 = (i13+i14)*8 + (i12+i15)*8
- smusd r5, r6, lr ; o14 = (i13+i14)*8 - (i12+i15)*8
-
- smlad r6, r7, r12, r11 ; o13 = (c1 * 2217 + d1 * 5352 + 14500)
- smlsdx r7, r7, r12, r10 ; o15 = (d1 * 2217 - c1 * 5352 + 7500)
-
- pkhbt r0, r4, r6, lsl #4 ; [o13 | o12], keep in register for PART 2
- pkhbt r6, r5, r7, lsl #4 ; [o15 | o14]
-
- str r6, [r1, #28]
-
-
- ; PART 2 -------------------------------------------------
- ldr r11, c12000
- ldr r10, c51000
- ldr lr, c0x00070007
-
- qadd16 r4, r3, r0 ; a1 = [i1+i13 | i0+i12]
- qadd16 r5, r9, r2 ; b1 = [i5+i9 | i4+i8]
- qsub16 r6, r9, r2 ; c1 = [i5-i9 | i4-i8]
- qsub16 r7, r3, r0 ; d1 = [i1-i13 | i0-i12]
-
- qadd16 r4, r4, lr ; a1 + 7
-
- add r0, r11, #0x10000 ; add (d!=0)
-
- qadd16 r2, r4, r5 ; a1 + b1 + 7
- qsub16 r3, r4, r5 ; a1 - b1 + 7
-
- ldr r12, c0x08a914e8 ; [2217 | 5352]
-
- lsl r8, r2, #16 ; prepare bottom halfword for scaling
- asr r2, r2, #4 ; scale top halfword
- lsl r9, r3, #16 ; prepare bottom halfword for scaling
- asr r3, r3, #4 ; scale top halfword
- pkhtb r4, r2, r8, asr #20 ; pack and scale bottom halfword
- pkhtb r5, r3, r9, asr #20 ; pack and scale bottom halfword
-
- smulbt r2, r6, r12 ; [ ------ | c1*2217]
- str r4, [r1, #0] ; [ o1 | o0]
- smultt r3, r6, r12 ; [c1*2217 | ------ ]
- str r5, [r1, #16] ; [ o9 | o8]
-
- smlabb r8, r7, r12, r2 ; [ ------ | d1*5352]
- smlatb r9, r7, r12, r3 ; [d1*5352 | ------ ]
-
- smulbb r2, r6, r12 ; [ ------ | c1*5352]
- smultb r3, r6, r12 ; [c1*5352 | ------ ]
-
- lsls r6, r7, #16 ; d1 != 0 ?
- addeq r8, r8, r11 ; c1_b*2217+d1_b*5352+12000 + (d==0)
- addne r8, r8, r0 ; c1_b*2217+d1_b*5352+12000 + (d!=0)
- asrs r6, r7, #16
- addeq r9, r9, r11 ; c1_t*2217+d1_t*5352+12000 + (d==0)
- addne r9, r9, r0 ; c1_t*2217+d1_t*5352+12000 + (d!=0)
-
- smlabt r4, r7, r12, r10 ; [ ------ | d1*2217] + 51000
- smlatt r5, r7, r12, r10 ; [d1*2217 | ------ ] + 51000
-
- pkhtb r9, r9, r8, asr #16
-
- sub r4, r4, r2
- sub r5, r5, r3
-
- ldr r3, [r1, #4] ; [i3 | i2]
-
- pkhtb r5, r5, r4, asr #16 ; [o13|o12]
-
-    str     r9, [r1, #8]            ; [o5 | o4]
-
- ldr r9, [r1, #12] ; [i7 | i6]
- ldr r8, [r1, #28] ; [i15|i14]
- ldr r2, [r1, #20] ; [i11|i10]
- str r5, [r1, #24] ; [o13|o12]
-
- qadd16 r4, r3, r8 ; a1 = [i3+i15 | i2+i14]
- qadd16 r5, r9, r2 ; b1 = [i7+i11 | i6+i10]
-
- qadd16 r4, r4, lr ; a1 + 7
-
- qsub16 r6, r9, r2 ; c1 = [i7-i11 | i6-i10]
- qadd16 r2, r4, r5 ; a1 + b1 + 7
- qsub16 r7, r3, r8 ; d1 = [i3-i15 | i2-i14]
- qsub16 r3, r4, r5 ; a1 - b1 + 7
-
- lsl r8, r2, #16 ; prepare bottom halfword for scaling
- asr r2, r2, #4 ; scale top halfword
- lsl r9, r3, #16 ; prepare bottom halfword for scaling
- asr r3, r3, #4 ; scale top halfword
- pkhtb r4, r2, r8, asr #20 ; pack and scale bottom halfword
- pkhtb r5, r3, r9, asr #20 ; pack and scale bottom halfword
-
- smulbt r2, r6, r12 ; [ ------ | c1*2217]
- str r4, [r1, #4] ; [ o3 | o2]
- smultt r3, r6, r12 ; [c1*2217 | ------ ]
- str r5, [r1, #20] ; [ o11 | o10]
-
- smlabb r8, r7, r12, r2 ; [ ------ | d1*5352]
- smlatb r9, r7, r12, r3 ; [d1*5352 | ------ ]
-
- smulbb r2, r6, r12 ; [ ------ | c1*5352]
- smultb r3, r6, r12 ; [c1*5352 | ------ ]
-
- lsls r6, r7, #16 ; d1 != 0 ?
- addeq r8, r8, r11 ; c1_b*2217+d1_b*5352+12000 + (d==0)
- addne r8, r8, r0 ; c1_b*2217+d1_b*5352+12000 + (d!=0)
-
- asrs r6, r7, #16
- addeq r9, r9, r11 ; c1_t*2217+d1_t*5352+12000 + (d==0)
- addne r9, r9, r0 ; c1_t*2217+d1_t*5352+12000 + (d!=0)
-
- smlabt r4, r7, r12, r10 ; [ ------ | d1*2217] + 51000
- smlatt r5, r7, r12, r10 ; [d1*2217 | ------ ] + 51000
-
- pkhtb r9, r9, r8, asr #16
-
- sub r4, r4, r2
- sub r5, r5, r3
-
- str r9, [r1, #12] ; [o7 | o6]
- pkhtb r5, r5, r4, asr #16 ; [o15|o14]
-
- str r5, [r1, #28] ; [o15|o14]
-
- ldmfd sp!, {r4 - r12, pc}
-
- ENDP
-
-; Used constants
-c7500
- DCD 7500
-c14500
- DCD 14500
-c0x22a453a0
- DCD 0x22a453a0
-c0x00080008
- DCD 0x00080008
-c12000
- DCD 12000
-c51000
- DCD 51000
-c0x00070007
- DCD 0x00070007
-c0x08a914e8
- DCD 0x08a914e8
-
- END
--- a/vp8/encoder/arm/armv6/vp8_subtract_armv6.asm
+++ /dev/null
@@ -1,265 +1,0 @@
-;
-; Copyright (c) 2011 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
- EXPORT |vp8_subtract_mby_armv6|
- EXPORT |vp8_subtract_mbuv_armv6|
- EXPORT |vp8_subtract_b_armv6|
-
- INCLUDE asm_enc_offsets.asm
-
- ARM
- REQUIRE8
- PRESERVE8
-
- AREA ||.text||, CODE, READONLY, ALIGN=2
-
-; r0 BLOCK *be
-; r1 BLOCKD *bd
-; r2 int pitch
-|vp8_subtract_b_armv6| PROC
-
- stmfd sp!, {r4-r9}
-
- ldr r4, [r0, #vp8_block_base_src]
- ldr r5, [r0, #vp8_block_src]
- ldr r6, [r0, #vp8_block_src_diff]
-
- ldr r3, [r4]
- ldr r7, [r0, #vp8_block_src_stride]
- add r3, r3, r5 ; src = *base_src + src
- ldr r8, [r1, #vp8_blockd_predictor]
-
- mov r9, #4 ; loop count
-
-loop_block
-
- ldr r0, [r3], r7 ; src
- ldr r1, [r8], r2 ; pred
-
- uxtb16 r4, r0 ; [s2 | s0]
- uxtb16 r5, r1 ; [p2 | p0]
- uxtb16 r0, r0, ror #8 ; [s3 | s1]
- uxtb16 r1, r1, ror #8 ; [p3 | p1]
-
- usub16 r4, r4, r5 ; [d2 | d0]
- usub16 r5, r0, r1 ; [d3 | d1]
-
- subs r9, r9, #1 ; decrement loop counter
-
- pkhbt r0, r4, r5, lsl #16 ; [d1 | d0]
- pkhtb r1, r5, r4, asr #16 ; [d3 | d2]
-
- str r0, [r6, #0] ; diff
- str r1, [r6, #4] ; diff
-
- add r6, r6, r2, lsl #1 ; update diff pointer
- bne loop_block
-
- ldmfd sp!, {r4-r9}
- mov pc, lr
-
- ENDP
-
-
-; r0 short *diff
-; r1 unsigned char *usrc
-; r2 unsigned char *vsrc
-; r3 unsigned char *pred
-; stack int stride
-|vp8_subtract_mbuv_armv6| PROC
-
- stmfd sp!, {r4-r12, lr}
-
- add r0, r0, #512 ; set *diff point to Cb
- add r3, r3, #256 ; set *pred point to Cb
-
- mov r4, #8 ; loop count
- ldr r5, [sp, #40] ; stride
-
- ; Subtract U block
-loop_u
- ldr r6, [r1] ; src (A)
- ldr r7, [r3], #4 ; pred (A)
-
- uxtb16 r8, r6 ; [s2 | s0] (A)
- uxtb16 r9, r7 ; [p2 | p0] (A)
- uxtb16 r10, r6, ror #8 ; [s3 | s1] (A)
- uxtb16 r11, r7, ror #8 ; [p3 | p1] (A)
-
- usub16 r6, r8, r9 ; [d2 | d0] (A)
- usub16 r7, r10, r11 ; [d3 | d1] (A)
-
- ldr r10, [r1, #4] ; src (B)
- ldr r11, [r3], #4 ; pred (B)
-
- pkhbt r8, r6, r7, lsl #16 ; [d1 | d0] (A)
- pkhtb r9, r7, r6, asr #16 ; [d3 | d2] (A)
-
- str r8, [r0], #4 ; diff (A)
- uxtb16 r8, r10 ; [s2 | s0] (B)
- str r9, [r0], #4 ; diff (A)
-
- uxtb16 r9, r11 ; [p2 | p0] (B)
- uxtb16 r10, r10, ror #8 ; [s3 | s1] (B)
- uxtb16 r11, r11, ror #8 ; [p3 | p1] (B)
-
- usub16 r6, r8, r9 ; [d2 | d0] (B)
- usub16 r7, r10, r11 ; [d3 | d1] (B)
-
- add r1, r1, r5 ; update usrc pointer
-
- pkhbt r8, r6, r7, lsl #16 ; [d1 | d0] (B)
- pkhtb r9, r7, r6, asr #16 ; [d3 | d2] (B)
-
- str r8, [r0], #4 ; diff (B)
- subs r4, r4, #1 ; update loop counter
- str r9, [r0], #4 ; diff (B)
-
- bne loop_u
-
- mov r4, #8 ; loop count
-
- ; Subtract V block
-loop_v
- ldr r6, [r2] ; src (A)
- ldr r7, [r3], #4 ; pred (A)
-
- uxtb16 r8, r6 ; [s2 | s0] (A)
- uxtb16 r9, r7 ; [p2 | p0] (A)
- uxtb16 r10, r6, ror #8 ; [s3 | s1] (A)
- uxtb16 r11, r7, ror #8 ; [p3 | p1] (A)
-
- usub16 r6, r8, r9 ; [d2 | d0] (A)
- usub16 r7, r10, r11 ; [d3 | d1] (A)
-
- ldr r10, [r2, #4] ; src (B)
- ldr r11, [r3], #4 ; pred (B)
-
- pkhbt r8, r6, r7, lsl #16 ; [d1 | d0] (A)
- pkhtb r9, r7, r6, asr #16 ; [d3 | d2] (A)
-
- str r8, [r0], #4 ; diff (A)
- uxtb16 r8, r10 ; [s2 | s0] (B)
- str r9, [r0], #4 ; diff (A)
-
- uxtb16 r9, r11 ; [p2 | p0] (B)
- uxtb16 r10, r10, ror #8 ; [s3 | s1] (B)
- uxtb16 r11, r11, ror #8 ; [p3 | p1] (B)
-
- usub16 r6, r8, r9 ; [d2 | d0] (B)
- usub16 r7, r10, r11 ; [d3 | d1] (B)
-
- add r2, r2, r5 ; update vsrc pointer
-
- pkhbt r8, r6, r7, lsl #16 ; [d1 | d0] (B)
- pkhtb r9, r7, r6, asr #16 ; [d3 | d2] (B)
-
- str r8, [r0], #4 ; diff (B)
- subs r4, r4, #1 ; update loop counter
- str r9, [r0], #4 ; diff (B)
-
- bne loop_v
-
- ldmfd sp!, {r4-r12, pc}
-
- ENDP
-
-
-; r0 short *diff
-; r1 unsigned char *src
-; r2 unsigned char *pred
-; r3 int stride
-|vp8_subtract_mby_armv6| PROC
-
- stmfd sp!, {r4-r11}
-
- mov r4, #16
-loop
- ldr r6, [r1] ; src (A)
- ldr r7, [r2], #4 ; pred (A)
-
- uxtb16 r8, r6 ; [s2 | s0] (A)
- uxtb16 r9, r7 ; [p2 | p0] (A)
- uxtb16 r10, r6, ror #8 ; [s3 | s1] (A)
- uxtb16 r11, r7, ror #8 ; [p3 | p1] (A)
-
- usub16 r6, r8, r9 ; [d2 | d0] (A)
- usub16 r7, r10, r11 ; [d3 | d1] (A)
-
- ldr r10, [r1, #4] ; src (B)
- ldr r11, [r2], #4 ; pred (B)
-
- pkhbt r8, r6, r7, lsl #16 ; [d1 | d0] (A)
- pkhtb r9, r7, r6, asr #16 ; [d3 | d2] (A)
-
- str r8, [r0], #4 ; diff (A)
- uxtb16 r8, r10 ; [s2 | s0] (B)
- str r9, [r0], #4 ; diff (A)
-
- uxtb16 r9, r11 ; [p2 | p0] (B)
- uxtb16 r10, r10, ror #8 ; [s3 | s1] (B)
- uxtb16 r11, r11, ror #8 ; [p3 | p1] (B)
-
- usub16 r6, r8, r9 ; [d2 | d0] (B)
- usub16 r7, r10, r11 ; [d3 | d1] (B)
-
- ldr r10, [r1, #8] ; src (C)
- ldr r11, [r2], #4 ; pred (C)
-
- pkhbt r8, r6, r7, lsl #16 ; [d1 | d0] (B)
- pkhtb r9, r7, r6, asr #16 ; [d3 | d2] (B)
-
- str r8, [r0], #4 ; diff (B)
- uxtb16 r8, r10 ; [s2 | s0] (C)
- str r9, [r0], #4 ; diff (B)
-
- uxtb16 r9, r11 ; [p2 | p0] (C)
- uxtb16 r10, r10, ror #8 ; [s3 | s1] (C)
- uxtb16 r11, r11, ror #8 ; [p3 | p1] (C)
-
- usub16 r6, r8, r9 ; [d2 | d0] (C)
- usub16 r7, r10, r11 ; [d3 | d1] (C)
-
- ldr r10, [r1, #12] ; src (D)
- ldr r11, [r2], #4 ; pred (D)
-
- pkhbt r8, r6, r7, lsl #16 ; [d1 | d0] (C)
- pkhtb r9, r7, r6, asr #16 ; [d3 | d2] (C)
-
- str r8, [r0], #4 ; diff (C)
- uxtb16 r8, r10 ; [s2 | s0] (D)
- str r9, [r0], #4 ; diff (C)
-
- uxtb16 r9, r11 ; [p2 | p0] (D)
- uxtb16 r10, r10, ror #8 ; [s3 | s1] (D)
- uxtb16 r11, r11, ror #8 ; [p3 | p1] (D)
-
- usub16 r6, r8, r9 ; [d2 | d0] (D)
- usub16 r7, r10, r11 ; [d3 | d1] (D)
-
- add r1, r1, r3 ; update src pointer
-
- pkhbt r8, r6, r7, lsl #16 ; [d1 | d0] (D)
- pkhtb r9, r7, r6, asr #16 ; [d3 | d2] (D)
-
- str r8, [r0], #4 ; diff (D)
- subs r4, r4, #1 ; update loop counter
- str r9, [r0], #4 ; diff (D)
-
- bne loop
-
- ldmfd sp!, {r4-r11}
- mov pc, lr
-
- ENDP
-
- END
-
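For reference, the ARMv6 subtract kernels removed above all perform the same basic operation: a signed per-pixel difference between source and prediction (the mbuv variant simply walks the two 8x8 chroma planes, which is why it first advances the diff and pred pointers past the luma data). A minimal C sketch of the 16x16 luma case, with illustrative names, assuming a contiguous 16x16 prediction buffer as used above:

    /* diff[i] = src[i] - pred[i] over a 16x16 block; sketch only. */
    static void subtract_mby_sketch(short *diff, const unsigned char *src,
                                    const unsigned char *pred, int stride) {
        int r, c;
        for (r = 0; r < 16; r++) {
            for (c = 0; c < 16; c++)
                diff[c] = (short)(src[c] - pred[c]);
            diff += 16;          /* diff and pred are contiguous 16x16 */
            pred += 16;
            src  += stride;      /* source uses the frame stride */
        }
    }
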
--- a/vp8/encoder/arm/armv6/vp8_variance16x16_armv6.asm
+++ /dev/null
@@ -1,154 +1,0 @@
-;
-; Copyright (c) 2011 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
- EXPORT |vp9_variance16x16_armv6|
-
- ARM
- REQUIRE8
- PRESERVE8
-
- AREA ||.text||, CODE, READONLY, ALIGN=2
-
-; r0 unsigned char *src_ptr
-; r1 int source_stride
-; r2 unsigned char *ref_ptr
-; r3 int recon_stride
-; stack unsigned int *sse
-|vp9_variance16x16_armv6| PROC
-
- stmfd sp!, {r4-r12, lr}
-
- pld [r0, r1, lsl #0]
- pld [r2, r3, lsl #0]
-
- mov r8, #0 ; initialize sum = 0
- mov r11, #0 ; initialize sse = 0
- mov r12, #16 ; set loop counter to 16 (=block height)
-
-loop
- ; 1st 4 pixels
- ldr r4, [r0, #0] ; load 4 src pixels
- ldr r5, [r2, #0] ; load 4 ref pixels
-
- mov lr, #0 ; constant zero
-
- usub8 r6, r4, r5 ; calculate difference
- pld [r0, r1, lsl #1]
- sel r7, r6, lr ; select bytes with positive difference
- usub8 r9, r5, r4 ; calculate difference with reversed operands
- pld [r2, r3, lsl #1]
- sel r6, r9, lr ; select bytes with negative difference
-
- ; calculate partial sums
- usad8 r4, r7, lr ; calculate sum of positive differences
- usad8 r5, r6, lr ; calculate sum of negative differences
- orr r6, r6, r7 ; differences of all 4 pixels
- ; calculate total sum
- adds r8, r8, r4 ; add positive differences to sum
-    subs    r8, r8, r5          ; subtract negative differences from sum
-
- ; calculate sse
- uxtb16 r5, r6 ; byte (two pixels) to halfwords
- uxtb16 r10, r6, ror #8 ; another two pixels to halfwords
- smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1)
-
- ; 2nd 4 pixels
- ldr r4, [r0, #4] ; load 4 src pixels
- ldr r5, [r2, #4] ; load 4 ref pixels
- smlad r11, r10, r10, r11 ; dual signed multiply, add and accumulate (2)
-
- usub8 r6, r4, r5 ; calculate difference
- sel r7, r6, lr ; select bytes with positive difference
- usub8 r9, r5, r4 ; calculate difference with reversed operands
- sel r6, r9, lr ; select bytes with negative difference
-
- ; calculate partial sums
- usad8 r4, r7, lr ; calculate sum of positive differences
- usad8 r5, r6, lr ; calculate sum of negative differences
- orr r6, r6, r7 ; differences of all 4 pixels
-
- ; calculate total sum
- add r8, r8, r4 ; add positive differences to sum
-    sub     r8, r8, r5          ; subtract negative differences from sum
-
- ; calculate sse
- uxtb16 r5, r6 ; byte (two pixels) to halfwords
- uxtb16 r10, r6, ror #8 ; another two pixels to halfwords
- smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1)
-
- ; 3rd 4 pixels
- ldr r4, [r0, #8] ; load 4 src pixels
- ldr r5, [r2, #8] ; load 4 ref pixels
- smlad r11, r10, r10, r11 ; dual signed multiply, add and accumulate (2)
-
- usub8 r6, r4, r5 ; calculate difference
- sel r7, r6, lr ; select bytes with positive difference
- usub8 r9, r5, r4 ; calculate difference with reversed operands
- sel r6, r9, lr ; select bytes with negative difference
-
- ; calculate partial sums
- usad8 r4, r7, lr ; calculate sum of positive differences
- usad8 r5, r6, lr ; calculate sum of negative differences
- orr r6, r6, r7 ; differences of all 4 pixels
-
- ; calculate total sum
- add r8, r8, r4 ; add positive differences to sum
-    sub     r8, r8, r5          ; subtract negative differences from sum
-
- ; calculate sse
- uxtb16 r5, r6 ; byte (two pixels) to halfwords
- uxtb16 r10, r6, ror #8 ; another two pixels to halfwords
- smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1)
-
- ; 4th 4 pixels
- ldr r4, [r0, #12] ; load 4 src pixels
- ldr r5, [r2, #12] ; load 4 ref pixels
- smlad r11, r10, r10, r11 ; dual signed multiply, add and accumulate (2)
-
- usub8 r6, r4, r5 ; calculate difference
- add r0, r0, r1 ; set src_ptr to next row
- sel r7, r6, lr ; select bytes with positive difference
- usub8 r9, r5, r4 ; calculate difference with reversed operands
- add r2, r2, r3 ; set dst_ptr to next row
- sel r6, r9, lr ; select bytes with negative difference
-
- ; calculate partial sums
- usad8 r4, r7, lr ; calculate sum of positive differences
- usad8 r5, r6, lr ; calculate sum of negative differences
- orr r6, r6, r7 ; differences of all 4 pixels
-
- ; calculate total sum
- add r8, r8, r4 ; add positive differences to sum
-    sub     r8, r8, r5          ; subtract negative differences from sum
-
- ; calculate sse
- uxtb16 r5, r6 ; byte (two pixels) to halfwords
- uxtb16 r10, r6, ror #8 ; another two pixels to halfwords
- smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1)
- smlad r11, r10, r10, r11 ; dual signed multiply, add and accumulate (2)
-
-
- subs r12, r12, #1
-
- bne loop
-
- ; return stuff
- ldr r6, [sp, #40] ; get address of sse
- mul r0, r8, r8 ; sum * sum
- str r11, [r6] ; store sse
- sub r0, r11, r0, asr #8 ; return (sse - ((sum * sum) >> 8))
-
- ldmfd sp!, {r4-r12, pc}
-
- ENDP
-
- END
-
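The variance kernels in this and the next file follow one pattern: accumulate the signed sum and the sum of squares of the per-pixel differences, then return sse - sum*sum/N, where N is the pixel count (a shift by 8 for the 16x16 kernel above, by 6 for the 8x8 version below). A scalar sketch with illustrative names:

    static unsigned int variance16x16_sketch(const unsigned char *src, int src_stride,
                                             const unsigned char *ref, int ref_stride,
                                             unsigned int *sse) {
        int r, c, sum = 0;
        unsigned int sq = 0;
        for (r = 0; r < 16; r++) {
            for (c = 0; c < 16; c++) {
                int d = src[c] - ref[c];     /* per-pixel difference */
                sum += d;
                sq  += d * d;
            }
            src += src_stride;
            ref += ref_stride;
        }
        *sse = sq;                                                 /* raw SSE out-param */
        return sq - (unsigned int)(((long long)sum * sum) >> 8);   /* N = 256 */
    }
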
--- a/vp8/encoder/arm/armv6/vp8_variance8x8_armv6.asm
+++ /dev/null
@@ -1,101 +1,0 @@
-;
-; Copyright (c) 2011 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
- EXPORT |vp9_variance8x8_armv6|
-
- ARM
-
- AREA ||.text||, CODE, READONLY, ALIGN=2
-
-; r0 unsigned char *src_ptr
-; r1 int source_stride
-; r2 unsigned char *ref_ptr
-; r3 int recon_stride
-; stack unsigned int *sse
-|vp9_variance8x8_armv6| PROC
-
- push {r4-r10, lr}
-
- pld [r0, r1, lsl #0]
- pld [r2, r3, lsl #0]
-
- mov r12, #8 ; set loop counter to 8 (=block height)
- mov r4, #0 ; initialize sum = 0
- mov r5, #0 ; initialize sse = 0
-
-loop
- ; 1st 4 pixels
- ldr r6, [r0, #0x0] ; load 4 src pixels
- ldr r7, [r2, #0x0] ; load 4 ref pixels
-
- mov lr, #0 ; constant zero
-
- usub8 r8, r6, r7 ; calculate difference
- pld [r0, r1, lsl #1]
- sel r10, r8, lr ; select bytes with positive difference
- usub8 r9, r7, r6 ; calculate difference with reversed operands
- pld [r2, r3, lsl #1]
- sel r8, r9, lr ; select bytes with negative difference
-
- ; calculate partial sums
- usad8 r6, r10, lr ; calculate sum of positive differences
- usad8 r7, r8, lr ; calculate sum of negative differences
- orr r8, r8, r10 ; differences of all 4 pixels
- ; calculate total sum
- add r4, r4, r6 ; add positive differences to sum
-    sub     r4, r4, r7          ; subtract negative differences from sum
-
- ; calculate sse
- uxtb16 r7, r8 ; byte (two pixels) to halfwords
- uxtb16 r10, r8, ror #8 ; another two pixels to halfwords
- smlad r5, r7, r7, r5 ; dual signed multiply, add and accumulate (1)
-
- ; 2nd 4 pixels
- ldr r6, [r0, #0x4] ; load 4 src pixels
- ldr r7, [r2, #0x4] ; load 4 ref pixels
- smlad r5, r10, r10, r5 ; dual signed multiply, add and accumulate (2)
-
- usub8 r8, r6, r7 ; calculate difference
- add r0, r0, r1 ; set src_ptr to next row
- sel r10, r8, lr ; select bytes with positive difference
- usub8 r9, r7, r6 ; calculate difference with reversed operands
- add r2, r2, r3 ; set dst_ptr to next row
- sel r8, r9, lr ; select bytes with negative difference
-
- ; calculate partial sums
- usad8 r6, r10, lr ; calculate sum of positive differences
- usad8 r7, r8, lr ; calculate sum of negative differences
- orr r8, r8, r10 ; differences of all 4 pixels
-
- ; calculate total sum
- add r4, r4, r6 ; add positive differences to sum
-    sub     r4, r4, r7          ; subtract negative differences from sum
-
- ; calculate sse
- uxtb16 r7, r8 ; byte (two pixels) to halfwords
- uxtb16 r10, r8, ror #8 ; another two pixels to halfwords
- smlad r5, r7, r7, r5 ; dual signed multiply, add and accumulate (1)
- subs r12, r12, #1 ; next row
- smlad r5, r10, r10, r5 ; dual signed multiply, add and accumulate (2)
-
- bne loop
-
- ; return stuff
- ldr r8, [sp, #32] ; get address of sse
- mul r1, r4, r4 ; sum * sum
- str r5, [r8] ; store sse
- sub r0, r5, r1, ASR #6 ; return (sse - ((sum * sum) >> 6))
-
- pop {r4-r10, pc}
-
- ENDP
-
- END
--- a/vp8/encoder/arm/armv6/vp8_variance_halfpixvar16x16_h_armv6.asm
+++ /dev/null
@@ -1,182 +1,0 @@
-;
-; Copyright (c) 2011 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
- EXPORT |vp9_variance_halfpixvar16x16_h_armv6|
-
- ARM
- REQUIRE8
- PRESERVE8
-
- AREA ||.text||, CODE, READONLY, ALIGN=2
-
-; r0 unsigned char *src_ptr
-; r1 int source_stride
-; r2 unsigned char *ref_ptr
-; r3 int recon_stride
-; stack unsigned int *sse
-|vp9_variance_halfpixvar16x16_h_armv6| PROC
-
- stmfd sp!, {r4-r12, lr}
-
- pld [r0, r1, lsl #0]
- pld [r2, r3, lsl #0]
-
- mov r8, #0 ; initialize sum = 0
- ldr r10, c80808080
- mov r11, #0 ; initialize sse = 0
- mov r12, #16 ; set loop counter to 16 (=block height)
- mov lr, #0 ; constant zero
-loop
- ; 1st 4 pixels
- ldr r4, [r0, #0] ; load 4 src pixels
- ldr r6, [r0, #1] ; load 4 src pixels with 1 byte offset
- ldr r5, [r2, #0] ; load 4 ref pixels
-
- ; bilinear interpolation
- mvn r6, r6
- uhsub8 r4, r4, r6
- eor r4, r4, r10
-
- usub8 r6, r4, r5 ; calculate difference
- pld [r0, r1, lsl #1]
- sel r7, r6, lr ; select bytes with positive difference
- usub8 r6, r5, r4 ; calculate difference with reversed operands
- pld [r2, r3, lsl #1]
- sel r6, r6, lr ; select bytes with negative difference
-
- ; calculate partial sums
- usad8 r4, r7, lr ; calculate sum of positive differences
- usad8 r5, r6, lr ; calculate sum of negative differences
- orr r6, r6, r7 ; differences of all 4 pixels
- ; calculate total sum
- adds r8, r8, r4 ; add positive differences to sum
-    subs    r8, r8, r5          ; subtract negative differences from sum
-
- ; calculate sse
- uxtb16 r5, r6 ; byte (two pixels) to halfwords
- uxtb16 r7, r6, ror #8 ; another two pixels to halfwords
- smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1)
-
- ; 2nd 4 pixels
- ldr r4, [r0, #4] ; load 4 src pixels
- ldr r6, [r0, #5] ; load 4 src pixels with 1 byte offset
- ldr r5, [r2, #4] ; load 4 ref pixels
-
- ; bilinear interpolation
- mvn r6, r6
- uhsub8 r4, r4, r6
- eor r4, r4, r10
-
- smlad r11, r7, r7, r11 ; dual signed multiply, add and accumulate (2)
-
- usub8 r6, r4, r5 ; calculate difference
- sel r7, r6, lr ; select bytes with positive difference
- usub8 r6, r5, r4 ; calculate difference with reversed operands
- sel r6, r6, lr ; select bytes with negative difference
-
- ; calculate partial sums
- usad8 r4, r7, lr ; calculate sum of positive differences
- usad8 r5, r6, lr ; calculate sum of negative differences
- orr r6, r6, r7 ; differences of all 4 pixels
-
- ; calculate total sum
- add r8, r8, r4 ; add positive differences to sum
-    sub     r8, r8, r5          ; subtract negative differences from sum
-
- ; calculate sse
- uxtb16 r5, r6 ; byte (two pixels) to halfwords
- uxtb16 r7, r6, ror #8 ; another two pixels to halfwords
- smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1)
-
- ; 3rd 4 pixels
- ldr r4, [r0, #8] ; load 4 src pixels
- ldr r6, [r0, #9] ; load 4 src pixels with 1 byte offset
- ldr r5, [r2, #8] ; load 4 ref pixels
-
- ; bilinear interpolation
- mvn r6, r6
- uhsub8 r4, r4, r6
- eor r4, r4, r10
-
- smlad r11, r7, r7, r11 ; dual signed multiply, add and accumulate (2)
-
- usub8 r6, r4, r5 ; calculate difference
- sel r7, r6, lr ; select bytes with positive difference
- usub8 r6, r5, r4 ; calculate difference with reversed operands
- sel r6, r6, lr ; select bytes with negative difference
-
- ; calculate partial sums
- usad8 r4, r7, lr ; calculate sum of positive differences
- usad8 r5, r6, lr ; calculate sum of negative differences
- orr r6, r6, r7 ; differences of all 4 pixels
-
- ; calculate total sum
- add r8, r8, r4 ; add positive differences to sum
-    sub     r8, r8, r5          ; subtract negative differences from sum
-
- ; calculate sse
- uxtb16 r5, r6 ; byte (two pixels) to halfwords
- uxtb16 r7, r6, ror #8 ; another two pixels to halfwords
- smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1)
-
- ; 4th 4 pixels
- ldr r4, [r0, #12] ; load 4 src pixels
- ldr r6, [r0, #13] ; load 4 src pixels with 1 byte offset
- ldr r5, [r2, #12] ; load 4 ref pixels
-
- ; bilinear interpolation
- mvn r6, r6
- uhsub8 r4, r4, r6
- eor r4, r4, r10
-
- smlad r11, r7, r7, r11 ; dual signed multiply, add and accumulate (2)
-
- usub8 r6, r4, r5 ; calculate difference
- add r0, r0, r1 ; set src_ptr to next row
- sel r7, r6, lr ; select bytes with positive difference
- usub8 r6, r5, r4 ; calculate difference with reversed operands
- add r2, r2, r3 ; set dst_ptr to next row
- sel r6, r6, lr ; select bytes with negative difference
-
- ; calculate partial sums
- usad8 r4, r7, lr ; calculate sum of positive differences
- usad8 r5, r6, lr ; calculate sum of negative differences
- orr r6, r6, r7 ; differences of all 4 pixels
-
- ; calculate total sum
- add r8, r8, r4 ; add positive differences to sum
-    sub     r8, r8, r5          ; subtract negative differences from sum
-
- ; calculate sse
- uxtb16 r5, r6 ; byte (two pixels) to halfwords
- uxtb16 r7, r6, ror #8 ; another two pixels to halfwords
- smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1)
- smlad r11, r7, r7, r11 ; dual signed multiply, add and accumulate (2)
-
- subs r12, r12, #1
-
- bne loop
-
- ; return stuff
- ldr r6, [sp, #40] ; get address of sse
- mul r0, r8, r8 ; sum * sum
- str r11, [r6] ; store sse
- sub r0, r11, r0, asr #8 ; return (sse - ((sum * sum) >> 8))
-
- ldmfd sp!, {r4-r12, pc}
-
- ENDP
-
-c80808080
- DCD 0x80808080
-
- END
-
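The three half-pixel variance kernels in this group interpolate the source before differencing it against the reference. The horizontal variant above averages each pixel with its right-hand neighbour; the MVN/UHSUB8/EOR sequence evaluates the rounded average bytewise through the identity (a + b + 1) >> 1 == ((a + b - 255) >> 1) + 128. The vertical variant further below averages with the pixel one row down instead. A scalar sketch of the horizontal case, with illustrative names:

    static unsigned int halfpix_h_variance16x16_sketch(const unsigned char *src,
                                                       int src_stride,
                                                       const unsigned char *ref,
                                                       int ref_stride,
                                                       unsigned int *sse) {
        int r, c, sum = 0;
        unsigned int sq = 0;
        for (r = 0; r < 16; r++) {
            for (c = 0; c < 16; c++) {
                int p = (src[c] + src[c + 1] + 1) >> 1;   /* horizontal half-pel */
                int d = p - ref[c];
                sum += d;
                sq  += d * d;
            }
            src += src_stride;
            ref += ref_stride;
        }
        *sse = sq;
        return sq - (unsigned int)(((long long)sum * sum) >> 8);
    }
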
--- a/vp8/encoder/arm/armv6/vp8_variance_halfpixvar16x16_hv_armv6.asm
+++ /dev/null
@@ -1,222 +1,0 @@
-;
-; Copyright (c) 2011 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
- EXPORT |vp9_variance_halfpixvar16x16_hv_armv6|
-
- ARM
- REQUIRE8
- PRESERVE8
-
- AREA ||.text||, CODE, READONLY, ALIGN=2
-
-; r0 unsigned char *src_ptr
-; r1 int source_stride
-; r2 unsigned char *ref_ptr
-; r3 int recon_stride
-; stack unsigned int *sse
-|vp9_variance_halfpixvar16x16_hv_armv6| PROC
-
- stmfd sp!, {r4-r12, lr}
-
- pld [r0, r1, lsl #0]
- pld [r2, r3, lsl #0]
-
- mov r8, #0 ; initialize sum = 0
- ldr r10, c80808080
- mov r11, #0 ; initialize sse = 0
- mov r12, #16 ; set loop counter to 16 (=block height)
- mov lr, #0 ; constant zero
-loop
- add r9, r0, r1 ; pointer to pixels on the next row
- ; 1st 4 pixels
- ldr r4, [r0, #0] ; load source pixels a, row N
- ldr r6, [r0, #1] ; load source pixels b, row N
- ldr r5, [r9, #0] ; load source pixels c, row N+1
- ldr r7, [r9, #1] ; load source pixels d, row N+1
-
- ; x = (a + b + 1) >> 1, interpolate pixels horizontally on row N
- mvn r6, r6
- uhsub8 r4, r4, r6
- eor r4, r4, r10
- ; y = (c + d + 1) >> 1, interpolate pixels horizontally on row N+1
- mvn r7, r7
- uhsub8 r5, r5, r7
- eor r5, r5, r10
- ; z = (x + y + 1) >> 1, interpolate half pixel values vertically
- mvn r5, r5
- uhsub8 r4, r4, r5
- ldr r5, [r2, #0] ; load 4 ref pixels
- eor r4, r4, r10
-
- usub8 r6, r4, r5 ; calculate difference
- pld [r0, r1, lsl #1]
- sel r7, r6, lr ; select bytes with positive difference
- usub8 r6, r5, r4 ; calculate difference with reversed operands
- pld [r2, r3, lsl #1]
- sel r6, r6, lr ; select bytes with negative difference
-
- ; calculate partial sums
- usad8 r4, r7, lr ; calculate sum of positive differences
- usad8 r5, r6, lr ; calculate sum of negative differences
- orr r6, r6, r7 ; differences of all 4 pixels
- ; calculate total sum
- adds r8, r8, r4 ; add positive differences to sum
-    subs    r8, r8, r5          ; subtract negative differences from sum
-
- ; calculate sse
- uxtb16 r5, r6 ; byte (two pixels) to halfwords
- uxtb16 r7, r6, ror #8 ; another two pixels to halfwords
- smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1)
-
- ; 2nd 4 pixels
- ldr r4, [r0, #4] ; load source pixels a, row N
- ldr r6, [r0, #5] ; load source pixels b, row N
- ldr r5, [r9, #4] ; load source pixels c, row N+1
-
- smlad r11, r7, r7, r11 ; dual signed multiply, add and accumulate (2)
-
- ldr r7, [r9, #5] ; load source pixels d, row N+1
-
- ; x = (a + b + 1) >> 1, interpolate pixels horizontally on row N
- mvn r6, r6
- uhsub8 r4, r4, r6
- eor r4, r4, r10
- ; y = (c + d + 1) >> 1, interpolate pixels horizontally on row N+1
- mvn r7, r7
- uhsub8 r5, r5, r7
- eor r5, r5, r10
- ; z = (x + y + 1) >> 1, interpolate half pixel values vertically
- mvn r5, r5
- uhsub8 r4, r4, r5
- ldr r5, [r2, #4] ; load 4 ref pixels
- eor r4, r4, r10
-
- usub8 r6, r4, r5 ; calculate difference
- sel r7, r6, lr ; select bytes with positive difference
- usub8 r6, r5, r4 ; calculate difference with reversed operands
- sel r6, r6, lr ; select bytes with negative difference
-
- ; calculate partial sums
- usad8 r4, r7, lr ; calculate sum of positive differences
- usad8 r5, r6, lr ; calculate sum of negative differences
- orr r6, r6, r7 ; differences of all 4 pixels
-
- ; calculate total sum
- add r8, r8, r4 ; add positive differences to sum
-    sub     r8, r8, r5          ; subtract negative differences from sum
-
- ; calculate sse
- uxtb16 r5, r6 ; byte (two pixels) to halfwords
- uxtb16 r7, r6, ror #8 ; another two pixels to halfwords
- smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1)
-
- ; 3rd 4 pixels
- ldr r4, [r0, #8] ; load source pixels a, row N
- ldr r6, [r0, #9] ; load source pixels b, row N
- ldr r5, [r9, #8] ; load source pixels c, row N+1
-
- smlad r11, r7, r7, r11 ; dual signed multiply, add and accumulate (2)
-
- ldr r7, [r9, #9] ; load source pixels d, row N+1
-
- ; x = (a + b + 1) >> 1, interpolate pixels horizontally on row N
- mvn r6, r6
- uhsub8 r4, r4, r6
- eor r4, r4, r10
- ; y = (c + d + 1) >> 1, interpolate pixels horizontally on row N+1
- mvn r7, r7
- uhsub8 r5, r5, r7
- eor r5, r5, r10
- ; z = (x + y + 1) >> 1, interpolate half pixel values vertically
- mvn r5, r5
- uhsub8 r4, r4, r5
- ldr r5, [r2, #8] ; load 4 ref pixels
- eor r4, r4, r10
-
- usub8 r6, r4, r5 ; calculate difference
- sel r7, r6, lr ; select bytes with positive difference
- usub8 r6, r5, r4 ; calculate difference with reversed operands
- sel r6, r6, lr ; select bytes with negative difference
-
- ; calculate partial sums
- usad8 r4, r7, lr ; calculate sum of positive differences
- usad8 r5, r6, lr ; calculate sum of negative differences
- orr r6, r6, r7 ; differences of all 4 pixels
-
- ; calculate total sum
- add r8, r8, r4 ; add positive differences to sum
-    sub     r8, r8, r5          ; subtract negative differences from sum
-
- ; calculate sse
- uxtb16 r5, r6 ; byte (two pixels) to halfwords
- uxtb16 r7, r6, ror #8 ; another two pixels to halfwords
- smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1)
-
- ; 4th 4 pixels
- ldr r4, [r0, #12] ; load source pixels a, row N
- ldr r6, [r0, #13] ; load source pixels b, row N
- ldr r5, [r9, #12] ; load source pixels c, row N+1
- smlad r11, r7, r7, r11 ; dual signed multiply, add and accumulate (2)
- ldr r7, [r9, #13] ; load source pixels d, row N+1
-
- ; x = (a + b + 1) >> 1, interpolate pixels horizontally on row N
- mvn r6, r6
- uhsub8 r4, r4, r6
- eor r4, r4, r10
- ; y = (c + d + 1) >> 1, interpolate pixels horizontally on row N+1
- mvn r7, r7
- uhsub8 r5, r5, r7
- eor r5, r5, r10
- ; z = (x + y + 1) >> 1, interpolate half pixel values vertically
- mvn r5, r5
- uhsub8 r4, r4, r5
- ldr r5, [r2, #12] ; load 4 ref pixels
- eor r4, r4, r10
-
- usub8 r6, r4, r5 ; calculate difference
- add r0, r0, r1 ; set src_ptr to next row
- sel r7, r6, lr ; select bytes with positive difference
- usub8 r6, r5, r4 ; calculate difference with reversed operands
- add r2, r2, r3 ; set dst_ptr to next row
- sel r6, r6, lr ; select bytes with negative difference
-
- ; calculate partial sums
- usad8 r4, r7, lr ; calculate sum of positive differences
- usad8 r5, r6, lr ; calculate sum of negative differences
- orr r6, r6, r7 ; differences of all 4 pixels
-
- ; calculate total sum
- add r8, r8, r4 ; add positive differences to sum
-    sub     r8, r8, r5          ; subtract negative differences from sum
-
- ; calculate sse
- uxtb16 r5, r6 ; byte (two pixels) to halfwords
- uxtb16 r7, r6, ror #8 ; another two pixels to halfwords
- smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1)
- subs r12, r12, #1
- smlad r11, r7, r7, r11 ; dual signed multiply, add and accumulate (2)
-
- bne loop
-
- ; return stuff
- ldr r6, [sp, #40] ; get address of sse
- mul r0, r8, r8 ; sum * sum
- str r11, [r6] ; store sse
- sub r0, r11, r0, asr #8 ; return (sse - ((sum * sum) >> 8))
-
- ldmfd sp!, {r4-r12, pc}
-
- ENDP
-
-c80808080
- DCD 0x80808080
-
- END
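As its comments note, the horizontal+vertical kernel above builds its predictor in two rounding stages rather than as one four-tap average, so the result can differ by one from (a + b + c + d + 2) >> 2. In scalar form:

    /* Two-stage rounded average used for the hv half-pel predictor (sketch). */
    static unsigned char halfpel_hv_sketch(unsigned char a, unsigned char b,
                                           unsigned char c, unsigned char d) {
        int x = (a + b + 1) >> 1;                 /* row N:   horizontal average */
        int y = (c + d + 1) >> 1;                 /* row N+1: horizontal average */
        return (unsigned char)((x + y + 1) >> 1); /* vertical average of the two */
    }
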
--- a/vp8/encoder/arm/armv6/vp8_variance_halfpixvar16x16_v_armv6.asm
+++ /dev/null
@@ -1,184 +1,0 @@
-;
-; Copyright (c) 2011 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
- EXPORT |vp9_variance_halfpixvar16x16_v_armv6|
-
- ARM
- REQUIRE8
- PRESERVE8
-
- AREA ||.text||, CODE, READONLY, ALIGN=2
-
-; r0 unsigned char *src_ptr
-; r1 int source_stride
-; r2 unsigned char *ref_ptr
-; r3 int recon_stride
-; stack unsigned int *sse
-|vp9_variance_halfpixvar16x16_v_armv6| PROC
-
- stmfd sp!, {r4-r12, lr}
-
- pld [r0, r1, lsl #0]
- pld [r2, r3, lsl #0]
-
- mov r8, #0 ; initialize sum = 0
- ldr r10, c80808080
- mov r11, #0 ; initialize sse = 0
- mov r12, #16 ; set loop counter to 16 (=block height)
- mov lr, #0 ; constant zero
-loop
- add r9, r0, r1 ; set src pointer to next row
- ; 1st 4 pixels
- ldr r4, [r0, #0] ; load 4 src pixels
- ldr r6, [r9, #0] ; load 4 src pixels from next row
- ldr r5, [r2, #0] ; load 4 ref pixels
-
- ; bilinear interpolation
- mvn r6, r6
- uhsub8 r4, r4, r6
- eor r4, r4, r10
-
- usub8 r6, r4, r5 ; calculate difference
- pld [r0, r1, lsl #1]
- sel r7, r6, lr ; select bytes with positive difference
- usub8 r6, r5, r4 ; calculate difference with reversed operands
- pld [r2, r3, lsl #1]
- sel r6, r6, lr ; select bytes with negative difference
-
- ; calculate partial sums
- usad8 r4, r7, lr ; calculate sum of positive differences
- usad8 r5, r6, lr ; calculate sum of negative differences
- orr r6, r6, r7 ; differences of all 4 pixels
- ; calculate total sum
- adds r8, r8, r4 ; add positive differences to sum
-    subs    r8, r8, r5          ; subtract negative differences from sum
-
- ; calculate sse
- uxtb16 r5, r6 ; byte (two pixels) to halfwords
- uxtb16 r7, r6, ror #8 ; another two pixels to halfwords
- smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1)
-
- ; 2nd 4 pixels
- ldr r4, [r0, #4] ; load 4 src pixels
- ldr r6, [r9, #4] ; load 4 src pixels from next row
- ldr r5, [r2, #4] ; load 4 ref pixels
-
- ; bilinear interpolation
- mvn r6, r6
- uhsub8 r4, r4, r6
- eor r4, r4, r10
-
- smlad r11, r7, r7, r11 ; dual signed multiply, add and accumulate (2)
-
- usub8 r6, r4, r5 ; calculate difference
- sel r7, r6, lr ; select bytes with positive difference
- usub8 r6, r5, r4 ; calculate difference with reversed operands
- sel r6, r6, lr ; select bytes with negative difference
-
- ; calculate partial sums
- usad8 r4, r7, lr ; calculate sum of positive differences
- usad8 r5, r6, lr ; calculate sum of negative differences
- orr r6, r6, r7 ; differences of all 4 pixels
-
- ; calculate total sum
- add r8, r8, r4 ; add positive differences to sum
-    sub     r8, r8, r5          ; subtract negative differences from sum
-
- ; calculate sse
- uxtb16 r5, r6 ; byte (two pixels) to halfwords
- uxtb16 r7, r6, ror #8 ; another two pixels to halfwords
- smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1)
-
- ; 3rd 4 pixels
- ldr r4, [r0, #8] ; load 4 src pixels
- ldr r6, [r9, #8] ; load 4 src pixels from next row
- ldr r5, [r2, #8] ; load 4 ref pixels
-
- ; bilinear interpolation
- mvn r6, r6
- uhsub8 r4, r4, r6
- eor r4, r4, r10
-
- smlad r11, r7, r7, r11 ; dual signed multiply, add and accumulate (2)
-
- usub8 r6, r4, r5 ; calculate difference
- sel r7, r6, lr ; select bytes with positive difference
- usub8 r6, r5, r4 ; calculate difference with reversed operands
- sel r6, r6, lr ; select bytes with negative difference
-
- ; calculate partial sums
- usad8 r4, r7, lr ; calculate sum of positive differences
- usad8 r5, r6, lr ; calculate sum of negative differences
- orr r6, r6, r7 ; differences of all 4 pixels
-
- ; calculate total sum
- add r8, r8, r4 ; add positive differences to sum
-    sub     r8, r8, r5          ; subtract negative differences from sum
-
- ; calculate sse
- uxtb16 r5, r6 ; byte (two pixels) to halfwords
- uxtb16 r7, r6, ror #8 ; another two pixels to halfwords
- smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1)
-
- ; 4th 4 pixels
- ldr r4, [r0, #12] ; load 4 src pixels
- ldr r6, [r9, #12] ; load 4 src pixels from next row
- ldr r5, [r2, #12] ; load 4 ref pixels
-
- ; bilinear interpolation
- mvn r6, r6
- uhsub8 r4, r4, r6
- eor r4, r4, r10
-
- smlad r11, r7, r7, r11 ; dual signed multiply, add and accumulate (2)
-
- usub8 r6, r4, r5 ; calculate difference
- add r0, r0, r1 ; set src_ptr to next row
- sel r7, r6, lr ; select bytes with positive difference
- usub8 r6, r5, r4 ; calculate difference with reversed operands
- add r2, r2, r3 ; set dst_ptr to next row
- sel r6, r6, lr ; select bytes with negative difference
-
- ; calculate partial sums
- usad8 r4, r7, lr ; calculate sum of positive differences
- usad8 r5, r6, lr ; calculate sum of negative differences
- orr r6, r6, r7 ; differences of all 4 pixels
-
- ; calculate total sum
- add r8, r8, r4 ; add positive differences to sum
-    sub     r8, r8, r5          ; subtract negative differences from sum
-
- ; calculate sse
- uxtb16 r5, r6 ; byte (two pixels) to halfwords
- uxtb16 r7, r6, ror #8 ; another two pixels to halfwords
- smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1)
- smlad r11, r7, r7, r11 ; dual signed multiply, add and accumulate (2)
-
-
- subs r12, r12, #1
-
- bne loop
-
- ; return stuff
- ldr r6, [sp, #40] ; get address of sse
- mul r0, r8, r8 ; sum * sum
- str r11, [r6] ; store sse
- sub r0, r11, r0, asr #8 ; return (sse - ((sum * sum) >> 8))
-
- ldmfd sp!, {r4-r12, pc}
-
- ENDP
-
-c80808080
- DCD 0x80808080
-
- END
-
--- a/vp8/encoder/arm/armv6/walsh_v6.asm
+++ /dev/null
@@ -1,212 +1,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
- EXPORT |vp8_short_walsh4x4_armv6|
-
- ARM
- REQUIRE8
- PRESERVE8
-
- AREA |.text|, CODE, READONLY ; name this block of code
-
-;short vp8_short_walsh4x4_armv6(short *input, short *output, int pitch)
-; r0 short *input,
-; r1 short *output,
-; r2 int pitch
-|vp8_short_walsh4x4_armv6| PROC
-
- stmdb sp!, {r4 - r11, lr}
-
- ldrd r4, r5, [r0], r2
- ldr lr, c00040004
- ldrd r6, r7, [r0], r2
-
- ; 0-3
- qadd16 r3, r4, r5 ; [d1|a1] [1+3 | 0+2]
- qsub16 r4, r4, r5 ; [c1|b1] [1-3 | 0-2]
-
- ldrd r8, r9, [r0], r2
- ; 4-7
- qadd16 r5, r6, r7 ; [d1|a1] [5+7 | 4+6]
- qsub16 r6, r6, r7 ; [c1|b1] [5-7 | 4-6]
-
- ldrd r10, r11, [r0]
- ; 8-11
- qadd16 r7, r8, r9 ; [d1|a1] [9+11 | 8+10]
- qsub16 r8, r8, r9 ; [c1|b1] [9-11 | 8-10]
-
- ; 12-15
- qadd16 r9, r10, r11 ; [d1|a1] [13+15 | 12+14]
- qsub16 r10, r10, r11 ; [c1|b1] [13-15 | 12-14]
-
-
- lsls r2, r3, #16
- smuad r11, r3, lr ; A0 = a1<<2 + d1<<2
- addne r11, r11, #1 ; A0 += (a1!=0)
-
- lsls r2, r7, #16
- smuad r12, r7, lr ; C0 = a1<<2 + d1<<2
- addne r12, r12, #1 ; C0 += (a1!=0)
-
- add r0, r11, r12 ; a1_0 = A0 + C0
- sub r11, r11, r12 ; b1_0 = A0 - C0
-
- lsls r2, r5, #16
- smuad r12, r5, lr ; B0 = a1<<2 + d1<<2
- addne r12, r12, #1 ; B0 += (a1!=0)
-
- lsls r2, r9, #16
- smuad r2, r9, lr ; D0 = a1<<2 + d1<<2
- addne r2, r2, #1 ; D0 += (a1!=0)
-
- add lr, r12, r2 ; d1_0 = B0 + D0
- sub r12, r12, r2 ; c1_0 = B0 - D0
-
- ; op[0,4,8,12]
- adds r2, r0, lr ; a2 = a1_0 + d1_0
- addmi r2, r2, #1 ; += a2 < 0
- add r2, r2, #3 ; += 3
- subs r0, r0, lr ; d2 = a1_0 - d1_0
- mov r2, r2, asr #3 ; >> 3
- strh r2, [r1] ; op[0]
-
- addmi r0, r0, #1 ; += a2 < 0
- add r0, r0, #3 ; += 3
- ldr lr, c00040004
- mov r0, r0, asr #3 ; >> 3
- strh r0, [r1, #24] ; op[12]
-
- adds r2, r11, r12 ; b2 = b1_0 + c1_0
- addmi r2, r2, #1 ; += a2 < 0
- add r2, r2, #3 ; += 3
- subs r0, r11, r12 ; c2 = b1_0 - c1_0
- mov r2, r2, asr #3 ; >> 3
- strh r2, [r1, #8] ; op[4]
-
- addmi r0, r0, #1 ; += a2 < 0
- add r0, r0, #3 ; += 3
- smusd r3, r3, lr ; A3 = a1<<2 - d1<<2
- smusd r7, r7, lr ; C3 = a1<<2 - d1<<2
- mov r0, r0, asr #3 ; >> 3
- strh r0, [r1, #16] ; op[8]
-
-
- ; op[3,7,11,15]
- add r0, r3, r7 ; a1_3 = A3 + C3
- sub r3, r3, r7 ; b1_3 = A3 - C3
-
- smusd r5, r5, lr ; B3 = a1<<2 - d1<<2
- smusd r9, r9, lr ; D3 = a1<<2 - d1<<2
- add r7, r5, r9 ; d1_3 = B3 + D3
- sub r5, r5, r9 ; c1_3 = B3 - D3
-
- adds r2, r0, r7 ; a2 = a1_3 + d1_3
- addmi r2, r2, #1 ; += a2 < 0
- add r2, r2, #3 ; += 3
- adds r9, r3, r5 ; b2 = b1_3 + c1_3
- mov r2, r2, asr #3 ; >> 3
- strh r2, [r1, #6] ; op[3]
-
- addmi r9, r9, #1 ; += a2 < 0
- add r9, r9, #3 ; += 3
- subs r2, r3, r5 ; c2 = b1_3 - c1_3
- mov r9, r9, asr #3 ; >> 3
- strh r9, [r1, #14] ; op[7]
-
- addmi r2, r2, #1 ; += a2 < 0
- add r2, r2, #3 ; += 3
- subs r9, r0, r7 ; d2 = a1_3 - d1_3
- mov r2, r2, asr #3 ; >> 3
- strh r2, [r1, #22] ; op[11]
-
- addmi r9, r9, #1 ; += a2 < 0
- add r9, r9, #3 ; += 3
- smuad r3, r4, lr ; A1 = b1<<2 + c1<<2
- smuad r5, r8, lr ; C1 = b1<<2 + c1<<2
- mov r9, r9, asr #3 ; >> 3
- strh r9, [r1, #30] ; op[15]
-
- ; op[1,5,9,13]
- add r0, r3, r5 ; a1_1 = A1 + C1
- sub r3, r3, r5 ; b1_1 = A1 - C1
-
- smuad r7, r6, lr ; B1 = b1<<2 + c1<<2
- smuad r9, r10, lr ; D1 = b1<<2 + c1<<2
- add r5, r7, r9 ; d1_1 = B1 + D1
- sub r7, r7, r9 ; c1_1 = B1 - D1
-
- adds r2, r0, r5 ; a2 = a1_1 + d1_1
- addmi r2, r2, #1 ; += a2 < 0
- add r2, r2, #3 ; += 3
- adds r9, r3, r7 ; b2 = b1_1 + c1_1
- mov r2, r2, asr #3 ; >> 3
- strh r2, [r1, #2] ; op[1]
-
- addmi r9, r9, #1 ; += a2 < 0
- add r9, r9, #3 ; += 3
- subs r2, r3, r7 ; c2 = b1_1 - c1_1
- mov r9, r9, asr #3 ; >> 3
- strh r9, [r1, #10] ; op[5]
-
- addmi r2, r2, #1 ; += a2 < 0
- add r2, r2, #3 ; += 3
- subs r9, r0, r5 ; d2 = a1_1 - d1_1
- mov r2, r2, asr #3 ; >> 3
- strh r2, [r1, #18] ; op[9]
-
- addmi r9, r9, #1 ; += a2 < 0
- add r9, r9, #3 ; += 3
- smusd r4, r4, lr ; A2 = b1<<2 - c1<<2
- smusd r8, r8, lr ; C2 = b1<<2 - c1<<2
- mov r9, r9, asr #3 ; >> 3
- strh r9, [r1, #26] ; op[13]
-
-
- ; op[2,6,10,14]
- add r11, r4, r8 ; a1_2 = A2 + C2
- sub r12, r4, r8 ; b1_2 = A2 - C2
-
- smusd r6, r6, lr ; B2 = b1<<2 - c1<<2
- smusd r10, r10, lr ; D2 = b1<<2 - c1<<2
- add r4, r6, r10 ; d1_2 = B2 + D2
- sub r8, r6, r10 ; c1_2 = B2 - D2
-
- adds r2, r11, r4 ; a2 = a1_2 + d1_2
- addmi r2, r2, #1 ; += a2 < 0
- add r2, r2, #3 ; += 3
- adds r9, r12, r8 ; b2 = b1_2 + c1_2
- mov r2, r2, asr #3 ; >> 3
- strh r2, [r1, #4] ; op[2]
-
- addmi r9, r9, #1 ; += a2 < 0
- add r9, r9, #3 ; += 3
- subs r2, r12, r8 ; c2 = b1_2 - c1_2
- mov r9, r9, asr #3 ; >> 3
- strh r9, [r1, #12] ; op[6]
-
- addmi r2, r2, #1 ; += a2 < 0
- add r2, r2, #3 ; += 3
- subs r9, r11, r4 ; d2 = a1_2 - d1_2
- mov r2, r2, asr #3 ; >> 3
- strh r2, [r1, #20] ; op[10]
-
- addmi r9, r9, #1 ; += a2 < 0
- add r9, r9, #3 ; += 3
- mov r9, r9, asr #3 ; >> 3
- strh r9, [r1, #28] ; op[14]
-
-
- ldmia sp!, {r4 - r11, pc}
- ENDP ; |vp8_short_walsh4x4_armv6|
-
-c00040004
- DCD 0x00040004
-
- END
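The ARMv6 Walsh transform above is a separable 4x4 Hadamard-style butterfly with extra fixed-point details: the smuad/smusd constants scale terms by four, a +1 is added when certain intermediate terms are non-zero, and each output is rounded with (x + 3 + (x < 0)) >> 3. Setting those details and the exact output ordering aside, the underlying row/column butterfly is sketched below (illustrative names, rounding omitted):

    static void walsh4x4_sketch(const short *input, short *output, int pitch) {
        int i, tmp[16];
        const short *ip = input;
        int *op = tmp;

        for (i = 0; i < 4; i++) {               /* row pass */
            int a = ip[0] + ip[2], b = ip[1] + ip[3];
            int c = ip[1] - ip[3], d = ip[0] - ip[2];
            op[0] = a + b;
            op[1] = d + c;
            op[2] = a - b;
            op[3] = d - c;
            ip += pitch / 2;                    /* pitch is given in bytes */
            op += 4;
        }
        for (i = 0; i < 4; i++) {               /* column pass */
            int a = tmp[i] + tmp[i + 8],      b = tmp[i + 4] + tmp[i + 12];
            int c = tmp[i + 4] - tmp[i + 12], d = tmp[i] - tmp[i + 8];
            output[i]      = (short)(a + b);
            output[i + 4]  = (short)(d + c);
            output[i + 8]  = (short)(a - b);
            output[i + 12] = (short)(d - c);
        }
    }
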
--- a/vp8/encoder/arm/boolhuff_arm.c
+++ /dev/null
@@ -1,33 +1,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include "vp8/encoder/boolhuff.h"
-#include "vp8/common/blockd.h"
-
-const unsigned int vp9_prob_cost[256] = {
- 2047, 2047, 1791, 1641, 1535, 1452, 1385, 1328, 1279, 1235, 1196, 1161, 1129, 1099, 1072, 1046,
- 1023, 1000, 979, 959, 940, 922, 905, 889, 873, 858, 843, 829, 816, 803, 790, 778,
- 767, 755, 744, 733, 723, 713, 703, 693, 684, 675, 666, 657, 649, 641, 633, 625,
- 617, 609, 602, 594, 587, 580, 573, 567, 560, 553, 547, 541, 534, 528, 522, 516,
- 511, 505, 499, 494, 488, 483, 477, 472, 467, 462, 457, 452, 447, 442, 437, 433,
- 428, 424, 419, 415, 410, 406, 401, 397, 393, 389, 385, 381, 377, 373, 369, 365,
- 361, 357, 353, 349, 346, 342, 338, 335, 331, 328, 324, 321, 317, 314, 311, 307,
- 304, 301, 297, 294, 291, 288, 285, 281, 278, 275, 272, 269, 266, 263, 260, 257,
- 255, 252, 249, 246, 243, 240, 238, 235, 232, 229, 227, 224, 221, 219, 216, 214,
- 211, 208, 206, 203, 201, 198, 196, 194, 191, 189, 186, 184, 181, 179, 177, 174,
- 172, 170, 168, 165, 163, 161, 159, 156, 154, 152, 150, 148, 145, 143, 141, 139,
- 137, 135, 133, 131, 129, 127, 125, 123, 121, 119, 117, 115, 113, 111, 109, 107,
- 105, 103, 101, 99, 97, 95, 93, 92, 90, 88, 86, 84, 82, 81, 79, 77,
- 75, 73, 72, 70, 68, 66, 65, 63, 61, 60, 58, 56, 55, 53, 51, 50,
- 48, 46, 45, 43, 41, 40, 38, 37, 35, 33, 32, 30, 29, 27, 25, 24,
- 22, 21, 19, 18, 16, 15, 13, 12, 10, 9, 7, 6, 4, 3, 1, 1
-};
-
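The vp9_prob_cost table removed here gives, for each probability p in 1..255, the approximate cost in 1/256-bit units of coding a symbol of probability p/256 with the boolean coder, i.e. roughly 256 * -log2(p/256) clamped to 2047. A floating-point sketch of how such a table can be regenerated; the shipped table was built with fixed-point arithmetic, so individual entries can differ from these values by a few units:

    #include <math.h>

    /* Approximate regeneration of a boolean-coder cost table (sketch only). */
    static void build_prob_cost_sketch(unsigned int cost[256]) {
        int p;
        cost[0] = 2047;                            /* probability 0 never occurs */
        for (p = 1; p < 256; p++) {
            double bits = -log2(p / 256.0);        /* information content in bits */
            unsigned int c = (unsigned int)(256.0 * bits + 0.5);
            cost[p] = c > 2047 ? 2047 : c;         /* clamp like the stored table */
        }
    }
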
--- a/vp8/encoder/arm/dct_arm.c
+++ /dev/null
@@ -1,21 +1,0 @@
-/*
- * Copyright (c) 2011 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "vpx_config.h"
-#include "./vpx_rtcd.h"
-
-#if HAVE_ARMV6
-
-void vp9_short_fdct8x4_armv6(short *input, short *output, int pitch) {
- vp9_short_fdct4x4_armv6(input, output, pitch);
- vp9_short_fdct4x4_armv6(input + 4, output + 16, pitch);
-}
-
-#endif /* HAVE_ARMV6 */
--- a/vp8/encoder/arm/dct_arm.h
+++ /dev/null
@@ -1,65 +1,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#ifndef DCT_ARM_H
-#define DCT_ARM_H
-
-#if HAVE_ARMV6
-extern prototype_fdct(vp9_short_walsh4x4_armv6);
-extern prototype_fdct(vp9_short_fdct4x4_armv6);
-extern prototype_fdct(vp9_short_fdct8x4_armv6);
-
-#if !CONFIG_RUNTIME_CPU_DETECT
-#undef vp8_fdct_walsh_short4x4
-#define vp8_fdct_walsh_short4x4 vp9_short_walsh4x4_armv6
-
-#undef vp8_fdct_short4x4
-#define vp8_fdct_short4x4 vp9_short_fdct4x4_armv6
-
-#undef vp8_fdct_short8x4
-#define vp8_fdct_short8x4 vp9_short_fdct8x4_armv6
-
-#undef vp8_fdct_fast4x4
-#define vp8_fdct_fast4x4 vp9_short_fdct4x4_armv6
-
-#undef vp8_fdct_fast8x4
-#define vp8_fdct_fast8x4 vp9_short_fdct8x4_armv6
-#endif
-
-#endif /* HAVE_ARMV6 */
-
-#if HAVE_ARMV7
-extern prototype_fdct(vp9_short_fdct4x4_neon);
-extern prototype_fdct(vp9_short_fdct8x4_neon);
-extern prototype_fdct(vp8_fast_fdct4x4_neon);
-extern prototype_fdct(vp8_fast_fdct8x4_neon);
-extern prototype_fdct(vp9_short_walsh4x4_neon);
-
-#if !CONFIG_RUNTIME_CPU_DETECT
-#undef vp8_fdct_short4x4
-#define vp8_fdct_short4x4 vp9_short_fdct4x4_neon
-
-#undef vp8_fdct_short8x4
-#define vp8_fdct_short8x4 vp9_short_fdct8x4_neon
-
-#undef vp8_fdct_fast4x4
-#define vp8_fdct_fast4x4 vp9_short_fdct4x4_neon
-
-#undef vp8_fdct_fast8x4
-#define vp8_fdct_fast8x4 vp9_short_fdct8x4_neon
-
-#undef vp8_fdct_walsh_short4x4
-#define vp8_fdct_walsh_short4x4 vp9_short_walsh4x4_neon
-#endif
-
-#endif
-
-#endif
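This header (and encodemb_arm.h below) exists purely to plumb platform-specific symbols into the generic encoder hooks: when runtime CPU detection is disabled, the generic macro for each transform is simply redefined at compile time to the ARMv6 or NEON implementation. A stripped-down illustration of the same pattern, with hypothetical names:

    /* Hypothetical illustration of the static-dispatch pattern used above. */
    void my_fdct4x4_c(short *input, short *output, int pitch);
    void my_fdct4x4_armv6(short *input, short *output, int pitch);

    #define my_fdct4x4 my_fdct4x4_c               /* generic default */

    #if HAVE_ARMV6 && !CONFIG_RUNTIME_CPU_DETECT
    #undef  my_fdct4x4
    #define my_fdct4x4 my_fdct4x4_armv6           /* compile-time override */
    #endif
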
--- a/vp8/encoder/arm/encodemb_arm.h
+++ /dev/null
@@ -1,64 +1,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#ifndef ENCODEMB_ARM_H
-#define ENCODEMB_ARM_H
-
-#if HAVE_ARMV6
-extern prototype_subb(vp9_subtract_b_armv6);
-extern prototype_submby(vp9_subtract_mby_armv6);
-extern prototype_submbuv(vp9_subtract_mbuv_armv6);
-
-#if !CONFIG_RUNTIME_CPU_DETECT
-#undef vp8_encodemb_subb
-#define vp8_encodemb_subb vp9_subtract_b_armv6
-
-#undef vp8_encodemb_submby
-#define vp8_encodemb_submby vp9_subtract_mby_armv6
-
-#undef vp8_encodemb_submbuv
-#define vp8_encodemb_submbuv vp9_subtract_mbuv_armv6
-#endif
-
-#endif /* HAVE_ARMV6 */
-
-#if HAVE_ARMV7
-// extern prototype_berr(vp9_block_error_c);
-// extern prototype_mberr(vp9_mbblock_error_c);
-// extern prototype_mbuverr(vp9_mbuverror_c);
-
-extern prototype_subb(vp9_subtract_b_neon);
-extern prototype_submby(vp9_subtract_mby_neon);
-extern prototype_submbuv(vp9_subtract_mbuv_neon);
-
-// #undef vp8_encodemb_berr
-// #define vp8_encodemb_berr vp9_block_error_c
-
-// #undef vp8_encodemb_mberr
-// #define vp8_encodemb_mberr vp9_mbblock_error_c
-
-// #undef vp8_encodemb_mbuverr
-// #define vp8_encodemb_mbuverr vp9_mbuverror_c
-
-#if !CONFIG_RUNTIME_CPU_DETECT
-#undef vp8_encodemb_subb
-#define vp8_encodemb_subb vp9_subtract_b_neon
-
-#undef vp8_encodemb_submby
-#define vp8_encodemb_submby vp9_subtract_mby_neon
-
-#undef vp8_encodemb_submbuv
-#define vp8_encodemb_submbuv vp9_subtract_mbuv_neon
-#endif
-
-#endif
-
-#endif
--- a/vp8/encoder/arm/neon/fastquantizeb_neon.asm
+++ /dev/null
@@ -1,261 +1,0 @@
-;
-; Copyright (c) 2011 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
- EXPORT |vp8_fast_quantize_b_neon|
- EXPORT |vp8_fast_quantize_b_pair_neon|
-
- INCLUDE asm_enc_offsets.asm
-
- ARM
- REQUIRE8
- PRESERVE8
-
- AREA ||.text||, CODE, READONLY, ALIGN=4
-
-;vp8_fast_quantize_b_pair_neon(BLOCK *b1, BLOCK *b2, BLOCKD *d1, BLOCKD *d2);
-|vp8_fast_quantize_b_pair_neon| PROC
-
- stmfd sp!, {r4-r9}
- vstmdb sp!, {q4-q7}
-
- ldr r4, [r0, #vp8_block_coeff]
- ldr r5, [r0, #vp8_block_quant_fast]
- ldr r6, [r0, #vp8_block_round]
-
- vld1.16 {q0, q1}, [r4@128] ; load z
-
- ldr r7, [r2, #vp8_blockd_qcoeff]
-
- vabs.s16 q4, q0 ; calculate x = abs(z)
- vabs.s16 q5, q1
-
- ;right shift 15 to get sign, all 0 if it is positive, all 1 if it is negative
- vshr.s16 q2, q0, #15 ; sz
- vshr.s16 q3, q1, #15
-
- vld1.s16 {q6, q7}, [r6@128] ; load round_ptr [0-15]
- vld1.s16 {q8, q9}, [r5@128] ; load quant_ptr [0-15]
-
- ldr r4, [r1, #vp8_block_coeff]
-
- vadd.s16 q4, q6 ; x + Round
- vadd.s16 q5, q7
-
- vld1.16 {q0, q1}, [r4@128] ; load z2
-
- vqdmulh.s16 q4, q8 ; y = ((Round+abs(z)) * Quant) >> 16
- vqdmulh.s16 q5, q9
-
- vabs.s16 q10, q0 ; calculate x2 = abs(z_2)
- vabs.s16 q11, q1
- vshr.s16 q12, q0, #15 ; sz2
- vshr.s16 q13, q1, #15
-
- ;modify data to have its original sign
- veor.s16 q4, q2 ; y^sz
- veor.s16 q5, q3
-
- vadd.s16 q10, q6 ; x2 + Round
- vadd.s16 q11, q7
-
- ldr r8, [r2, #vp8_blockd_dequant]
-
- vqdmulh.s16 q10, q8 ; y2 = ((Round+abs(z)) * Quant) >> 16
- vqdmulh.s16 q11, q9
-
- vshr.s16 q4, #1 ; right shift 1 after vqdmulh
- vshr.s16 q5, #1
-
- vld1.s16 {q6, q7}, [r8@128] ;load dequant_ptr[i]
-
- vsub.s16 q4, q2 ; x1=(y^sz)-sz = (y^sz)-(-1) (2's complement)
- vsub.s16 q5, q3
-
- vshr.s16 q10, #1 ; right shift 1 after vqdmulh
- vshr.s16 q11, #1
-
- ldr r9, [r2, #vp8_blockd_dqcoeff]
-
- veor.s16 q10, q12 ; y2^sz2
- veor.s16 q11, q13
-
- vst1.s16 {q4, q5}, [r7] ; store: qcoeff = x1
-
-
- vsub.s16 q10, q12 ; x2=(y^sz)-sz = (y^sz)-(-1) (2's complement)
- vsub.s16 q11, q13
-
- ldr r6, [r3, #vp8_blockd_qcoeff]
-
- vmul.s16 q2, q6, q4 ; x * Dequant
- vmul.s16 q3, q7, q5
-
- ldr r0, _inv_zig_zag_ ; load ptr of inverse zigzag table
-
- vceq.s16 q8, q8 ; set q8 to all 1
-
- vst1.s16 {q10, q11}, [r6] ; store: qcoeff = x2
-
- vmul.s16 q12, q6, q10 ; x2 * Dequant
- vmul.s16 q13, q7, q11
-
- vld1.16 {q6, q7}, [r0@128] ; load inverse scan order
-
- vtst.16 q14, q4, q8 ; now find eob
- vtst.16 q15, q5, q8 ; non-zero element is set to all 1
-
- vst1.s16 {q2, q3}, [r9] ; store dqcoeff = x * Dequant
-
- ldr r7, [r3, #vp8_blockd_dqcoeff]
-
- vand q0, q6, q14 ; get all valid numbers from scan array
- vand q1, q7, q15
-
- vst1.s16 {q12, q13}, [r7] ; store dqcoeff = x * Dequant
-
- vtst.16 q2, q10, q8 ; now find eob
- vtst.16 q3, q11, q8 ; non-zero element is set to all 1
-
- vmax.u16 q0, q0, q1 ; find maximum value in q0, q1
-
- vand q10, q6, q2 ; get all valid numbers from scan array
- vand q11, q7, q3
- vmax.u16 q10, q10, q11 ; find maximum value in q10, q11
-
- vmax.u16 d0, d0, d1
- vmax.u16 d20, d20, d21
- vmovl.u16 q0, d0
- vmovl.u16 q10, d20
-
-
- vmax.u32 d0, d0, d1
- vmax.u32 d20, d20, d21
- vpmax.u32 d0, d0, d0
- vpmax.u32 d20, d20, d20
-
- add r4, r2, #vp8_blockd_eob
- add r5, r3, #vp8_blockd_eob
-
- vst1.32 {d0[0]}, [r4@32]
- vst1.32 {d20[0]}, [r5@32]
-
- vldmia sp!, {q4-q7}
- ldmfd sp!, {r4-r9}
- bx lr
-
- ENDP
-
-;void vp8_fast_quantize_b_c(BLOCK *b, BLOCKD *d)
-|vp8_fast_quantize_b_neon| PROC
-
- stmfd sp!, {r4-r7}
-
- ldr r3, [r0, #vp8_block_coeff]
- ldr r4, [r0, #vp8_block_quant_fast]
- ldr r5, [r0, #vp8_block_round]
-
- vld1.16 {q0, q1}, [r3@128] ; load z
- vorr.s16 q14, q0, q1 ; check if all zero (step 1)
- ldr r6, [r1, #vp8_blockd_qcoeff]
- ldr r7, [r1, #vp8_blockd_dqcoeff]
- vorr.s16 d28, d28, d29 ; check if all zero (step 2)
-
- vabs.s16 q12, q0 ; calculate x = abs(z)
- vabs.s16 q13, q1
-
- ;right shift 15 to get sign, all 0 if it is positive, all 1 if it is negative
- vshr.s16 q2, q0, #15 ; sz
- vmov r2, r3, d28 ; check if all zero (step 3)
- vshr.s16 q3, q1, #15
-
- vld1.s16 {q14, q15}, [r5@128]; load round_ptr [0-15]
- vld1.s16 {q8, q9}, [r4@128] ; load quant_ptr [0-15]
-
- vadd.s16 q12, q14 ; x + Round
- vadd.s16 q13, q15
-
- ldr r0, _inv_zig_zag_ ; load ptr of inverse zigzag table
-
- vqdmulh.s16 q12, q8 ; y = ((Round+abs(z)) * Quant) >> 16
- vqdmulh.s16 q13, q9
-
- vld1.16 {q10, q11}, [r0@128]; load inverse scan order
-
- vceq.s16 q8, q8 ; set q8 to all 1
-
- ldr r4, [r1, #vp8_blockd_dequant]
-
- vshr.s16 q12, #1 ; right shift 1 after vqdmulh
- vshr.s16 q13, #1
-
- orr r2, r2, r3 ; check if all zero (step 4)
- cmp r2, #0 ; check if all zero (step 5)
- beq zero_output ; check if all zero (step 6)
-
- ;modify data to have its original sign
- veor.s16 q12, q2 ; y^sz
- veor.s16 q13, q3
-
- vsub.s16 q12, q2 ; x1=(y^sz)-sz = (y^sz)-(-1) (2's complement)
- vsub.s16 q13, q3
-
- vld1.s16 {q2, q3}, [r4@128] ; load dequant_ptr[i]
-
- vtst.16 q14, q12, q8 ; now find eob
- vtst.16 q15, q13, q8 ; non-zero element is set to all 1
-
- vst1.s16 {q12, q13}, [r6@128]; store: qcoeff = x1
-
- vand q10, q10, q14 ; get all valid numbers from scan array
- vand q11, q11, q15
-
-
- vmax.u16 q0, q10, q11 ; find maximum value in q0, q1
- vmax.u16 d0, d0, d1
- vmovl.u16 q0, d0
-
- vmul.s16 q2, q12 ; x * Dequant
- vmul.s16 q3, q13
-
- vmax.u32 d0, d0, d1
- vpmax.u32 d0, d0, d0
-
- vst1.s16 {q2, q3}, [r7@128] ; store dqcoeff = x * Dequant
-
- add r4, r1, #vp8_blockd_eob
- vst1.32 {d0[0]}, [r4@32]
-
- ldmfd sp!, {r4-r7}
- bx lr
-
-zero_output
- str r2, [r1, #vp8_blockd_eob]
- vst1.s16 {q0, q1}, [r6@128] ; qcoeff = 0
- vst1.s16 {q0, q1}, [r7@128] ; dqcoeff = 0
-
- ldmfd sp!, {r4-r7}
- bx lr
-
- ENDP
-
-; default inverse zigzag table is defined in vp8/common/entropy.c
-_inv_zig_zag_
- DCD inv_zig_zag
-
- ALIGN 16 ; enable use of @128 bit aligned loads
-inv_zig_zag
- DCW 0x0001, 0x0002, 0x0006, 0x0007
- DCW 0x0003, 0x0005, 0x0008, 0x000d
- DCW 0x0004, 0x0009, 0x000c, 0x000e
- DCW 0x000a, 0x000b, 0x000f, 0x0010
-
- END
-
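A scalar view of what the NEON fast quantizer above computes for a 4x4 block: each coefficient has the rounding constant added to its magnitude, is scaled by the quantizer in Q16 (which the assembly does with VQDMULH plus a one-bit shift), gets its sign restored, and is multiplied back by the dequantizer; the end-of-block value is one past the last non-zero coefficient in zig-zag order. A simplified sketch that does not use the real BLOCK/BLOCKD layout:

    #include <stdlib.h>   /* abs() */

    static void fast_quantize_b_sketch(const short *coeff, const short *round,
                                       const short *quant, const short *dequant,
                                       const int *scan,        /* zig-zag order */
                                       short *qcoeff, short *dqcoeff, int *eob) {
        int i, last = -1;
        for (i = 0; i < 16; i++) {
            int rc = scan[i];
            int z  = coeff[rc];
            int sz = (z < 0) ? -1 : 0;
            int x  = ((abs(z) + round[rc]) * quant[rc]) >> 16; /* Q16 quantize */
            x = (x ^ sz) - sz;                                 /* restore sign */
            qcoeff[rc]  = (short)x;
            dqcoeff[rc] = (short)(x * dequant[rc]);            /* dequantize */
            if (x)
                last = i;                                      /* track last non-zero */
        }
        *eob = last + 1;
    }
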
--- a/vp8/encoder/arm/neon/picklpf_arm.c
+++ /dev/null
@@ -1,49 +1,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include "vp8/common/onyxc_int.h"
-#include "vp8/encoder/onyx_int.h"
-#include "vp8/encoder/quantize.h"
-#include "vpx_mem/vpx_mem.h"
-#include "vpx_scale/yv12extend.h"
-#include "vpx_scale/vpxscale.h"
-#include "vp8/common/alloccommon.h"
-
-extern void vp8_memcpy_neon(unsigned char *dst_ptr, unsigned char *src_ptr, int sz);
-
-
-void
-vpxyv12_copy_partial_frame_neon(YV12_BUFFER_CONFIG *src_ybc, YV12_BUFFER_CONFIG *dst_ybc, int Fraction) {
- unsigned char *src_y, *dst_y;
- int yheight;
- int ystride;
- int border;
- int yoffset;
- int linestocopy;
-
- border = src_ybc->border;
- yheight = src_ybc->y_height;
- ystride = src_ybc->y_stride;
-
- linestocopy = (yheight >> (Fraction + 4));
-
- if (linestocopy < 1)
- linestocopy = 1;
-
- linestocopy <<= 4;
-
- yoffset = ystride * ((yheight >> 5) * 16 - 8);
- src_y = src_ybc->y_buffer + yoffset;
- dst_y = dst_ybc->y_buffer + yoffset;
-
- // vpx_memcpy (dst_y, src_y, ystride * (linestocopy +16));
- vp8_memcpy_neon((unsigned char *)dst_y, (unsigned char *)src_y, (int)(ystride * (linestocopy + 16)));
-}
--- a/vp8/encoder/arm/neon/sad16_neon.asm
+++ /dev/null
@@ -1,207 +1,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
- EXPORT |vp8_sad16x16_neon|
- EXPORT |vp8_sad16x8_neon|
-
- ARM
- REQUIRE8
- PRESERVE8
-
- AREA ||.text||, CODE, READONLY, ALIGN=2
-
-; r0 unsigned char *src_ptr
-; r1 int src_stride
-; r2 unsigned char *ref_ptr
-; r3 int ref_stride
-|vp8_sad16x16_neon| PROC
-;;
- vld1.8 {q0}, [r0], r1
- vld1.8 {q4}, [r2], r3
-
- vld1.8 {q1}, [r0], r1
- vld1.8 {q5}, [r2], r3
-
- vabdl.u8 q12, d0, d8
- vabdl.u8 q13, d1, d9
-
- vld1.8 {q2}, [r0], r1
- vld1.8 {q6}, [r2], r3
-
- vabal.u8 q12, d2, d10
- vabal.u8 q13, d3, d11
-
- vld1.8 {q3}, [r0], r1
- vld1.8 {q7}, [r2], r3
-
- vabal.u8 q12, d4, d12
- vabal.u8 q13, d5, d13
-
-;;
- vld1.8 {q0}, [r0], r1
- vld1.8 {q4}, [r2], r3
-
- vabal.u8 q12, d6, d14
- vabal.u8 q13, d7, d15
-
- vld1.8 {q1}, [r0], r1
- vld1.8 {q5}, [r2], r3
-
- vabal.u8 q12, d0, d8
- vabal.u8 q13, d1, d9
-
- vld1.8 {q2}, [r0], r1
- vld1.8 {q6}, [r2], r3
-
- vabal.u8 q12, d2, d10
- vabal.u8 q13, d3, d11
-
- vld1.8 {q3}, [r0], r1
- vld1.8 {q7}, [r2], r3
-
- vabal.u8 q12, d4, d12
- vabal.u8 q13, d5, d13
-
-;;
- vld1.8 {q0}, [r0], r1
- vld1.8 {q4}, [r2], r3
-
- vabal.u8 q12, d6, d14
- vabal.u8 q13, d7, d15
-
- vld1.8 {q1}, [r0], r1
- vld1.8 {q5}, [r2], r3
-
- vabal.u8 q12, d0, d8
- vabal.u8 q13, d1, d9
-
- vld1.8 {q2}, [r0], r1
- vld1.8 {q6}, [r2], r3
-
- vabal.u8 q12, d2, d10
- vabal.u8 q13, d3, d11
-
- vld1.8 {q3}, [r0], r1
- vld1.8 {q7}, [r2], r3
-
- vabal.u8 q12, d4, d12
- vabal.u8 q13, d5, d13
-
-;;
- vld1.8 {q0}, [r0], r1
- vld1.8 {q4}, [r2], r3
-
- vabal.u8 q12, d6, d14
- vabal.u8 q13, d7, d15
-
- vld1.8 {q1}, [r0], r1
- vld1.8 {q5}, [r2], r3
-
- vabal.u8 q12, d0, d8
- vabal.u8 q13, d1, d9
-
- vld1.8 {q2}, [r0], r1
- vld1.8 {q6}, [r2], r3
-
- vabal.u8 q12, d2, d10
- vabal.u8 q13, d3, d11
-
- vld1.8 {q3}, [r0]
- vld1.8 {q7}, [r2]
-
- vabal.u8 q12, d4, d12
- vabal.u8 q13, d5, d13
-
- vabal.u8 q12, d6, d14
- vabal.u8 q13, d7, d15
-
- vadd.u16 q0, q12, q13
-
- vpaddl.u16 q1, q0
- vpaddl.u32 q0, q1
-
- vadd.u32 d0, d0, d1
-
- vmov.32 r0, d0[0]
-
- bx lr
-
- ENDP
-
-;==============================
-;unsigned int vp8_sad16x8_c(
-; unsigned char *src_ptr,
-; int src_stride,
-; unsigned char *ref_ptr,
-; int ref_stride)
-|vp8_sad16x8_neon| PROC
- vld1.8 {q0}, [r0], r1
- vld1.8 {q4}, [r2], r3
-
- vld1.8 {q1}, [r0], r1
- vld1.8 {q5}, [r2], r3
-
- vabdl.u8 q12, d0, d8
- vabdl.u8 q13, d1, d9
-
- vld1.8 {q2}, [r0], r1
- vld1.8 {q6}, [r2], r3
-
- vabal.u8 q12, d2, d10
- vabal.u8 q13, d3, d11
-
- vld1.8 {q3}, [r0], r1
- vld1.8 {q7}, [r2], r3
-
- vabal.u8 q12, d4, d12
- vabal.u8 q13, d5, d13
-
- vld1.8 {q0}, [r0], r1
- vld1.8 {q4}, [r2], r3
-
- vabal.u8 q12, d6, d14
- vabal.u8 q13, d7, d15
-
- vld1.8 {q1}, [r0], r1
- vld1.8 {q5}, [r2], r3
-
- vabal.u8 q12, d0, d8
- vabal.u8 q13, d1, d9
-
- vld1.8 {q2}, [r0], r1
- vld1.8 {q6}, [r2], r3
-
- vabal.u8 q12, d2, d10
- vabal.u8 q13, d3, d11
-
- vld1.8 {q3}, [r0], r1
- vld1.8 {q7}, [r2], r3
-
- vabal.u8 q12, d4, d12
- vabal.u8 q13, d5, d13
-
- vabal.u8 q12, d6, d14
- vabal.u8 q13, d7, d15
-
- vadd.u16 q0, q12, q13
-
- vpaddl.u16 q1, q0
- vpaddl.u32 q0, q1
-
- vadd.u32 d0, d0, d1
-
- vmov.32 r0, d0[0]
-
- bx lr
-
- ENDP
-
- END
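The NEON SAD routines above, and the 8x8/8x16/4x4 variants in the next file, are plain sums of absolute differences over a block; the vector code only unrolls the rows and widens the accumulator to 16-bit lanes. A scalar equivalent parameterised on block size (illustrative names), where sad_sketch(src, stride, ref, stride, 16, 16) corresponds to the 16x16 case:

    #include <stdlib.h>   /* abs() */

    static unsigned int sad_sketch(const unsigned char *src, int src_stride,
                                   const unsigned char *ref, int ref_stride,
                                   int width, int height) {
        unsigned int sad = 0;
        int r, c;
        for (r = 0; r < height; r++) {
            for (c = 0; c < width; c++)
                sad += abs(src[c] - ref[c]);       /* per-pixel |difference| */
            src += src_stride;
            ref += ref_stride;
        }
        return sad;
    }
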
--- a/vp8/encoder/arm/neon/sad8_neon.asm
+++ /dev/null
@@ -1,209 +1,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
- EXPORT |vp8_sad8x8_neon|
- EXPORT |vp8_sad8x16_neon|
- EXPORT |vp8_sad4x4_neon|
-
- ARM
- REQUIRE8
- PRESERVE8
-
- AREA ||.text||, CODE, READONLY, ALIGN=2
-; unsigned int vp8_sad8x8_c(
-; unsigned char *src_ptr,
-; int src_stride,
-; unsigned char *ref_ptr,
-; int ref_stride)
-
-|vp8_sad8x8_neon| PROC
- vld1.8 {d0}, [r0], r1
- vld1.8 {d8}, [r2], r3
-
- vld1.8 {d2}, [r0], r1
- vld1.8 {d10}, [r2], r3
-
- vabdl.u8 q12, d0, d8
-
- vld1.8 {d4}, [r0], r1
- vld1.8 {d12}, [r2], r3
-
- vabal.u8 q12, d2, d10
-
- vld1.8 {d6}, [r0], r1
- vld1.8 {d14}, [r2], r3
-
- vabal.u8 q12, d4, d12
-
- vld1.8 {d0}, [r0], r1
- vld1.8 {d8}, [r2], r3
-
- vabal.u8 q12, d6, d14
-
- vld1.8 {d2}, [r0], r1
- vld1.8 {d10}, [r2], r3
-
- vabal.u8 q12, d0, d8
-
- vld1.8 {d4}, [r0], r1
- vld1.8 {d12}, [r2], r3
-
- vabal.u8 q12, d2, d10
-
- vld1.8 {d6}, [r0], r1
- vld1.8 {d14}, [r2], r3
-
- vabal.u8 q12, d4, d12
- vabal.u8 q12, d6, d14
-
- vpaddl.u16 q1, q12
- vpaddl.u32 q0, q1
- vadd.u32 d0, d0, d1
-
- vmov.32 r0, d0[0]
-
- bx lr
-
- ENDP
-
-;============================
-;unsigned int vp8_sad8x16_c(
-; unsigned char *src_ptr,
-; int src_stride,
-; unsigned char *ref_ptr,
-; int ref_stride)
-
-|vp8_sad8x16_neon| PROC
- vld1.8 {d0}, [r0], r1
- vld1.8 {d8}, [r2], r3
-
- vld1.8 {d2}, [r0], r1
- vld1.8 {d10}, [r2], r3
-
- vabdl.u8 q12, d0, d8
-
- vld1.8 {d4}, [r0], r1
- vld1.8 {d12}, [r2], r3
-
- vabal.u8 q12, d2, d10
-
- vld1.8 {d6}, [r0], r1
- vld1.8 {d14}, [r2], r3
-
- vabal.u8 q12, d4, d12
-
- vld1.8 {d0}, [r0], r1
- vld1.8 {d8}, [r2], r3
-
- vabal.u8 q12, d6, d14
-
- vld1.8 {d2}, [r0], r1
- vld1.8 {d10}, [r2], r3
-
- vabal.u8 q12, d0, d8
-
- vld1.8 {d4}, [r0], r1
- vld1.8 {d12}, [r2], r3
-
- vabal.u8 q12, d2, d10
-
- vld1.8 {d6}, [r0], r1
- vld1.8 {d14}, [r2], r3
-
- vabal.u8 q12, d4, d12
-
- vld1.8 {d0}, [r0], r1
- vld1.8 {d8}, [r2], r3
-
- vabal.u8 q12, d6, d14
-
- vld1.8 {d2}, [r0], r1
- vld1.8 {d10}, [r2], r3
-
- vabal.u8 q12, d0, d8
-
- vld1.8 {d4}, [r0], r1
- vld1.8 {d12}, [r2], r3
-
- vabal.u8 q12, d2, d10
-
- vld1.8 {d6}, [r0], r1
- vld1.8 {d14}, [r2], r3
-
- vabal.u8 q12, d4, d12
-
- vld1.8 {d0}, [r0], r1
- vld1.8 {d8}, [r2], r3
-
- vabal.u8 q12, d6, d14
-
- vld1.8 {d2}, [r0], r1
- vld1.8 {d10}, [r2], r3
-
- vabal.u8 q12, d0, d8
-
- vld1.8 {d4}, [r0], r1
- vld1.8 {d12}, [r2], r3
-
- vabal.u8 q12, d2, d10
-
- vld1.8 {d6}, [r0], r1
- vld1.8 {d14}, [r2], r3
-
- vabal.u8 q12, d4, d12
- vabal.u8 q12, d6, d14
-
- vpaddl.u16 q1, q12
- vpaddl.u32 q0, q1
- vadd.u32 d0, d0, d1
-
- vmov.32 r0, d0[0]
-
- bx lr
-
- ENDP
-
-;===========================
-;unsigned int vp8_sad4x4_c(
-; unsigned char *src_ptr,
-; int src_stride,
-; unsigned char *ref_ptr,
-; int ref_stride)
-
-|vp8_sad4x4_neon| PROC
- vld1.8 {d0}, [r0], r1
- vld1.8 {d8}, [r2], r3
-
- vld1.8 {d2}, [r0], r1
- vld1.8 {d10}, [r2], r3
-
- vabdl.u8 q12, d0, d8
-
- vld1.8 {d4}, [r0], r1
- vld1.8 {d12}, [r2], r3
-
- vabal.u8 q12, d2, d10
-
- vld1.8 {d6}, [r0], r1
- vld1.8 {d14}, [r2], r3
-
- vabal.u8 q12, d4, d12
- vabal.u8 q12, d6, d14
-
- vpaddl.u16 d1, d24
- vpaddl.u32 d0, d1
- vmov.32 r0, d0[0]
-
- bx lr
-
- ENDP
-
- END
--- a/vp8/encoder/arm/neon/shortfdct_neon.asm
+++ /dev/null
@@ -1,221 +1,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
- EXPORT |vp8_short_fdct4x4_neon|
- EXPORT |vp8_short_fdct8x4_neon|
-
- ARM
- REQUIRE8
- PRESERVE8
-
- AREA ||.text||, CODE, READONLY, ALIGN=4
-
-
- ALIGN 16 ; enable use of @128 bit aligned loads
-coeff
- DCW 5352, 5352, 5352, 5352
- DCW 2217, 2217, 2217, 2217
- DCD 14500, 14500, 14500, 14500
- DCD 7500, 7500, 7500, 7500
- DCD 12000, 12000, 12000, 12000
- DCD 51000, 51000, 51000, 51000
-
-;void vp8_short_fdct4x4_c(short *input, short *output, int pitch)
-|vp8_short_fdct4x4_neon| PROC
-
- ; Part one
- vld1.16 {d0}, [r0@64], r2
- adr r12, coeff
- vld1.16 {d1}, [r0@64], r2
- vld1.16 {q8}, [r12@128]! ; d16=5352, d17=2217
- vld1.16 {d2}, [r0@64], r2
- vld1.32 {q9, q10}, [r12@128]! ; q9=14500, q10=7500
- vld1.16 {d3}, [r0@64], r2
-
- ; transpose d0=ip[0], d1=ip[1], d2=ip[2], d3=ip[3]
- vtrn.32 d0, d2
- vtrn.32 d1, d3
- vld1.32 {q11,q12}, [r12@128] ; q11=12000, q12=51000
- vtrn.16 d0, d1
- vtrn.16 d2, d3
-
- vadd.s16 d4, d0, d3 ; a1 = ip[0] + ip[3]
- vadd.s16 d5, d1, d2 ; b1 = ip[1] + ip[2]
- vsub.s16 d6, d1, d2 ; c1 = ip[1] - ip[2]
- vsub.s16 d7, d0, d3 ; d1 = ip[0] - ip[3]
-
- vshl.s16 q2, q2, #3 ; (a1, b1) << 3
- vshl.s16 q3, q3, #3 ; (c1, d1) << 3
-
- vadd.s16 d0, d4, d5 ; op[0] = a1 + b1
- vsub.s16 d2, d4, d5 ; op[2] = a1 - b1
-
- vmlal.s16 q9, d7, d16 ; d1*5352 + 14500
- vmlal.s16 q10, d7, d17 ; d1*2217 + 7500
- vmlal.s16 q9, d6, d17 ; c1*2217 + d1*5352 + 14500
- vmlsl.s16 q10, d6, d16 ; d1*2217 - c1*5352 + 7500
-
- vshrn.s32 d1, q9, #12 ; op[1] = (c1*2217 + d1*5352 + 14500)>>12
- vshrn.s32 d3, q10, #12 ; op[3] = (d1*2217 - c1*5352 + 7500)>>12
-
-
- ; Part two
-
- ; transpose d0=ip[0], d1=ip[4], d2=ip[8], d3=ip[12]
- vtrn.32 d0, d2
- vtrn.32 d1, d3
- vtrn.16 d0, d1
- vtrn.16 d2, d3
-
- vmov.s16 d26, #7
-
- vadd.s16 d4, d0, d3 ; a1 = ip[0] + ip[12]
- vadd.s16 d5, d1, d2 ; b1 = ip[4] + ip[8]
- vsub.s16 d6, d1, d2 ; c1 = ip[4] - ip[8]
- vadd.s16 d4, d4, d26 ; a1 + 7
- vsub.s16 d7, d0, d3 ; d1 = ip[0] - ip[12]
-
- vadd.s16 d0, d4, d5 ; op[0] = a1 + b1 + 7
- vsub.s16 d2, d4, d5 ; op[8] = a1 - b1 + 7
-
- vmlal.s16 q11, d7, d16 ; d1*5352 + 12000
- vmlal.s16 q12, d7, d17 ; d1*2217 + 51000
-
- vceq.s16 d4, d7, #0
-
- vshr.s16 d0, d0, #4
- vshr.s16 d2, d2, #4
-
- vmlal.s16 q11, d6, d17 ; c1*2217 + d1*5352 + 12000
- vmlsl.s16 q12, d6, d16 ; d1*2217 - c1*5352 + 51000
-
- vmvn.s16 d4, d4
- vshrn.s32 d1, q11, #16 ; op[4] = (c1*2217 + d1*5352 + 12000)>>16
- vsub.s16 d1, d1, d4 ; op[4] += (d1!=0)
- vshrn.s32 d3, q12, #16 ; op[12]= (d1*2217 - c1*5352 + 51000)>>16
-
- vst1.16 {q0, q1}, [r1@128]
-
- bx lr
-
- ENDP
-
-;void vp8_short_fdct8x4_c(short *input, short *output, int pitch)
-|vp8_short_fdct8x4_neon| PROC
-
- ; Part one
-
- vld1.16 {q0}, [r0@128], r2
- adr r12, coeff
- vld1.16 {q1}, [r0@128], r2
- vld1.16 {q8}, [r12@128]! ; d16=5352, d17=2217
- vld1.16 {q2}, [r0@128], r2
- vld1.32 {q9, q10}, [r12@128]! ; q9=14500, q10=7500
- vld1.16 {q3}, [r0@128], r2
-
- ; transpose q0=ip[0], q1=ip[1], q2=ip[2], q3=ip[3]
- vtrn.32 q0, q2 ; [A0|B0]
- vtrn.32 q1, q3 ; [A1|B1]
- vtrn.16 q0, q1 ; [A2|B2]
- vtrn.16 q2, q3 ; [A3|B3]
-
- vadd.s16 q11, q0, q3 ; a1 = ip[0] + ip[3]
- vadd.s16 q12, q1, q2 ; b1 = ip[1] + ip[2]
- vsub.s16 q13, q1, q2 ; c1 = ip[1] - ip[2]
- vsub.s16 q14, q0, q3 ; d1 = ip[0] - ip[3]
-
- vshl.s16 q11, q11, #3 ; a1 << 3
- vshl.s16 q12, q12, #3 ; b1 << 3
- vshl.s16 q13, q13, #3 ; c1 << 3
- vshl.s16 q14, q14, #3 ; d1 << 3
-
- vadd.s16 q0, q11, q12 ; [A0 | B0] = a1 + b1
- vsub.s16 q2, q11, q12 ; [A2 | B2] = a1 - b1
-
- vmov.s16 q11, q9 ; 14500
- vmov.s16 q12, q10 ; 7500
-
- vmlal.s16 q9, d28, d16 ; A[1] = d1*5352 + 14500
- vmlal.s16 q10, d28, d17 ; A[3] = d1*2217 + 7500
- vmlal.s16 q11, d29, d16 ; B[1] = d1*5352 + 14500
- vmlal.s16 q12, d29, d17 ; B[3] = d1*2217 + 7500
-
- vmlal.s16 q9, d26, d17 ; A[1] = c1*2217 + d1*5352 + 14500
- vmlsl.s16 q10, d26, d16 ; A[3] = d1*2217 - c1*5352 + 7500
- vmlal.s16 q11, d27, d17 ; B[1] = c1*2217 + d1*5352 + 14500
- vmlsl.s16 q12, d27, d16 ; B[3] = d1*2217 - c1*5352 + 7500
-
- vshrn.s32 d2, q9, #12 ; A[1] = (c1*2217 + d1*5352 + 14500)>>12
- vshrn.s32 d6, q10, #12 ; A[3] = (d1*2217 - c1*5352 + 7500)>>12
- vshrn.s32 d3, q11, #12 ; B[1] = (c1*2217 + d1*5352 + 14500)>>12
- vshrn.s32 d7, q12, #12 ; B[3] = (d1*2217 - c1*5352 + 7500)>>12
-
-
- ; Part two
- vld1.32 {q9,q10}, [r12@128] ; q9=12000, q10=51000
-
- ; transpose q0=ip[0], q1=ip[4], q2=ip[8], q3=ip[12]
- vtrn.32 q0, q2 ; q0=[A0 | B0]
- vtrn.32 q1, q3 ; q1=[A4 | B4]
- vtrn.16 q0, q1 ; q2=[A8 | B8]
- vtrn.16 q2, q3 ; q3=[A12|B12]
-
- vmov.s16 q15, #7
-
- vadd.s16 q11, q0, q3 ; a1 = ip[0] + ip[12]
- vadd.s16 q12, q1, q2 ; b1 = ip[4] + ip[8]
- vadd.s16 q11, q11, q15 ; a1 + 7
- vsub.s16 q13, q1, q2 ; c1 = ip[4] - ip[8]
- vsub.s16 q14, q0, q3 ; d1 = ip[0] - ip[12]
-
- vadd.s16 q0, q11, q12 ; a1 + b1 + 7
- vsub.s16 q1, q11, q12 ; a1 - b1 + 7
-
- vmov.s16 q11, q9 ; 12000
- vmov.s16 q12, q10 ; 51000
-
- vshr.s16 d0, d0, #4 ; A[0] = (a1 + b1 + 7)>>4
- vshr.s16 d4, d1, #4 ; B[0] = (a1 + b1 + 7)>>4
- vshr.s16 d2, d2, #4 ; A[8] = (a1 + b1 + 7)>>4
- vshr.s16 d6, d3, #4 ; B[8] = (a1 + b1 + 7)>>4
-
-
- vmlal.s16 q9, d28, d16 ; A[4] = d1*5352 + 12000
- vmlal.s16 q10, d28, d17 ; A[12] = d1*2217 + 51000
- vmlal.s16 q11, d29, d16 ; B[4] = d1*5352 + 12000
- vmlal.s16 q12, d29, d17 ; B[12] = d1*2217 + 51000
-
- vceq.s16 q14, q14, #0
-
- vmlal.s16 q9, d26, d17 ; A[4] = c1*2217 + d1*5352 + 12000
- vmlsl.s16 q10, d26, d16 ; A[12] = d1*2217 - c1*5352 + 51000
- vmlal.s16 q11, d27, d17 ; B[4] = c1*2217 + d1*5352 + 12000
- vmlsl.s16 q12, d27, d16 ; B[12] = d1*2217 - c1*5352 + 51000
-
- vmvn.s16 q14, q14
-
- vshrn.s32 d1, q9, #16 ; A[4] = (c1*2217 + d1*5352 + 12000)>>16
- vshrn.s32 d3, q10, #16 ; A[12]= (d1*2217 - c1*5352 + 51000)>>16
- vsub.s16 d1, d1, d28 ; A[4] += (d1!=0)
-
- vshrn.s32 d5, q11, #16 ; B[4] = (c1*2217 + d1*5352 + 12000)>>16
- vshrn.s32 d7, q12, #16 ; B[12]= (d1*2217 - c1*5352 + 51000)>>16
- vsub.s16 d5, d5, d29 ; B[4] += (d1!=0)
-
- vst1.16 {q0, q1}, [r1@128]! ; block A
- vst1.16 {q2, q3}, [r1@128]! ; block B
-
- bx lr
-
- ENDP
-
- END
-
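The constants at the top of the deleted shortfdct_neon.asm (5352/2217 and the rounding terms 14500, 7500, 12000, 51000) and its per-instruction comments describe the scalar 4x4 forward DCT below. This is a sketch reconstructed from those comments for readability; the name is illustrative and it is not the code that replaces the file:

/* 4x4 forward DCT as annotated in the NEON source: rows first (<<3 scaling,
   >>12 for the odd outputs), then columns (+7 >>4 for the even outputs,
   >>16 plus the (d1 != 0) correction for op[4]). */
static void short_fdct4x4_sketch(short *input, short *output, int pitch) {
  int i, a1, b1, c1, d1;
  short *ip = input;
  short *op = output;

  for (i = 0; i < 4; i++) {              /* part one: rows */
    a1 = (ip[0] + ip[3]) << 3;
    b1 = (ip[1] + ip[2]) << 3;
    c1 = (ip[1] - ip[2]) << 3;
    d1 = (ip[0] - ip[3]) << 3;

    op[0] = a1 + b1;
    op[2] = a1 - b1;
    op[1] = (c1 * 2217 + d1 * 5352 + 14500) >> 12;
    op[3] = (d1 * 2217 - c1 * 5352 + 7500) >> 12;

    ip += pitch / 2;
    op += 4;
  }

  ip = output;
  op = output;
  for (i = 0; i < 4; i++) {              /* part two: columns */
    a1 = ip[0] + ip[12];
    b1 = ip[4] + ip[8];
    c1 = ip[4] - ip[8];
    d1 = ip[0] - ip[12];

    op[0]  = (a1 + b1 + 7) >> 4;
    op[8]  = (a1 - b1 + 7) >> 4;
    op[4]  = ((c1 * 2217 + d1 * 5352 + 12000) >> 16) + (d1 != 0);
    op[12] = (d1 * 2217 - c1 * 5352 + 51000) >> 16;

    ip++;
    op++;
  }
}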
--- a/vp8/encoder/arm/neon/subtract_neon.asm
+++ /dev/null
@@ -1,185 +1,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
- EXPORT |vp8_subtract_b_neon|
- EXPORT |vp8_subtract_mby_neon|
- EXPORT |vp8_subtract_mbuv_neon|
-
- INCLUDE asm_enc_offsets.asm
-
- ARM
- REQUIRE8
- PRESERVE8
-
- AREA ||.text||, CODE, READONLY, ALIGN=2
-
-;void vp8_subtract_b_neon(BLOCK *be, BLOCKD *bd, int pitch)
-|vp8_subtract_b_neon| PROC
-
- stmfd sp!, {r4-r7}
-
- ldr r3, [r0, #vp8_block_base_src]
- ldr r4, [r0, #vp8_block_src]
- ldr r5, [r0, #vp8_block_src_diff]
- ldr r3, [r3]
- ldr r6, [r0, #vp8_block_src_stride]
- add r3, r3, r4 ; src = *base_src + src
- ldr r7, [r1, #vp8_blockd_predictor]
-
- vld1.8 {d0}, [r3], r6 ;load src
- vld1.8 {d1}, [r7], r2 ;load pred
- vld1.8 {d2}, [r3], r6
- vld1.8 {d3}, [r7], r2
- vld1.8 {d4}, [r3], r6
- vld1.8 {d5}, [r7], r2
- vld1.8 {d6}, [r3], r6
- vld1.8 {d7}, [r7], r2
-
- vsubl.u8 q10, d0, d1
- vsubl.u8 q11, d2, d3
- vsubl.u8 q12, d4, d5
- vsubl.u8 q13, d6, d7
-
- mov r2, r2, lsl #1
-
- vst1.16 {d20}, [r5], r2 ;store diff
- vst1.16 {d22}, [r5], r2
- vst1.16 {d24}, [r5], r2
- vst1.16 {d26}, [r5], r2
-
- ldmfd sp!, {r4-r7}
- bx lr
-
- ENDP
-
-
-;==========================================
-;void vp8_subtract_mby_neon(short *diff, unsigned char *src, unsigned char *pred, int stride)
-|vp8_subtract_mby_neon| PROC
- mov r12, #4
-
-subtract_mby_loop
- vld1.8 {q0}, [r1], r3 ;load src
- vld1.8 {q1}, [r2]! ;load pred
- vld1.8 {q2}, [r1], r3
- vld1.8 {q3}, [r2]!
- vld1.8 {q4}, [r1], r3
- vld1.8 {q5}, [r2]!
- vld1.8 {q6}, [r1], r3
- vld1.8 {q7}, [r2]!
-
- vsubl.u8 q8, d0, d2
- vsubl.u8 q9, d1, d3
- vsubl.u8 q10, d4, d6
- vsubl.u8 q11, d5, d7
- vsubl.u8 q12, d8, d10
- vsubl.u8 q13, d9, d11
- vsubl.u8 q14, d12, d14
- vsubl.u8 q15, d13, d15
-
- vst1.16 {q8}, [r0]! ;store diff
- vst1.16 {q9}, [r0]!
- vst1.16 {q10}, [r0]!
- vst1.16 {q11}, [r0]!
- vst1.16 {q12}, [r0]!
- vst1.16 {q13}, [r0]!
- vst1.16 {q14}, [r0]!
- vst1.16 {q15}, [r0]!
-
- subs r12, r12, #1
- bne subtract_mby_loop
-
- bx lr
- ENDP
-
-;=================================
-;void vp8_subtract_mbuv_neon(short *diff, unsigned char *usrc, unsigned char *vsrc, unsigned char *pred, int stride)
-|vp8_subtract_mbuv_neon| PROC
- ldr r12, [sp]
-
-;u
- add r0, r0, #512 ; short *udiff = diff + 256;
- add r3, r3, #256 ; unsigned char *upred = pred + 256;
-
- vld1.8 {d0}, [r1], r12 ;load src
- vld1.8 {d1}, [r3]! ;load pred
- vld1.8 {d2}, [r1], r12
- vld1.8 {d3}, [r3]!
- vld1.8 {d4}, [r1], r12
- vld1.8 {d5}, [r3]!
- vld1.8 {d6}, [r1], r12
- vld1.8 {d7}, [r3]!
- vld1.8 {d8}, [r1], r12
- vld1.8 {d9}, [r3]!
- vld1.8 {d10}, [r1], r12
- vld1.8 {d11}, [r3]!
- vld1.8 {d12}, [r1], r12
- vld1.8 {d13}, [r3]!
- vld1.8 {d14}, [r1], r12
- vld1.8 {d15}, [r3]!
-
- vsubl.u8 q8, d0, d1
- vsubl.u8 q9, d2, d3
- vsubl.u8 q10, d4, d5
- vsubl.u8 q11, d6, d7
- vsubl.u8 q12, d8, d9
- vsubl.u8 q13, d10, d11
- vsubl.u8 q14, d12, d13
- vsubl.u8 q15, d14, d15
-
- vst1.16 {q8}, [r0]! ;store diff
- vst1.16 {q9}, [r0]!
- vst1.16 {q10}, [r0]!
- vst1.16 {q11}, [r0]!
- vst1.16 {q12}, [r0]!
- vst1.16 {q13}, [r0]!
- vst1.16 {q14}, [r0]!
- vst1.16 {q15}, [r0]!
-
-;v
- vld1.8 {d0}, [r2], r12 ;load src
- vld1.8 {d1}, [r3]! ;load pred
- vld1.8 {d2}, [r2], r12
- vld1.8 {d3}, [r3]!
- vld1.8 {d4}, [r2], r12
- vld1.8 {d5}, [r3]!
- vld1.8 {d6}, [r2], r12
- vld1.8 {d7}, [r3]!
- vld1.8 {d8}, [r2], r12
- vld1.8 {d9}, [r3]!
- vld1.8 {d10}, [r2], r12
- vld1.8 {d11}, [r3]!
- vld1.8 {d12}, [r2], r12
- vld1.8 {d13}, [r3]!
- vld1.8 {d14}, [r2], r12
- vld1.8 {d15}, [r3]!
-
- vsubl.u8 q8, d0, d1
- vsubl.u8 q9, d2, d3
- vsubl.u8 q10, d4, d5
- vsubl.u8 q11, d6, d7
- vsubl.u8 q12, d8, d9
- vsubl.u8 q13, d10, d11
- vsubl.u8 q14, d12, d13
- vsubl.u8 q15, d14, d15
-
- vst1.16 {q8}, [r0]! ;store diff
- vst1.16 {q9}, [r0]!
- vst1.16 {q10}, [r0]!
- vst1.16 {q11}, [r0]!
- vst1.16 {q12}, [r0]!
- vst1.16 {q13}, [r0]!
- vst1.16 {q14}, [r0]!
- vst1.16 {q15}, [r0]!
-
- bx lr
- ENDP
-
- END
--- a/vp8/encoder/arm/neon/variance_neon.asm
+++ /dev/null
@@ -1,276 +1,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
- EXPORT |vp9_variance16x16_neon|
- EXPORT |vp9_variance16x8_neon|
- EXPORT |vp9_variance8x16_neon|
- EXPORT |vp9_variance8x8_neon|
-
- ARM
- REQUIRE8
- PRESERVE8
-
- AREA ||.text||, CODE, READONLY, ALIGN=2
-
-; r0 unsigned char *src_ptr
-; r1 int source_stride
-; r2 unsigned char *ref_ptr
-; r3 int recon_stride
-; stack unsigned int *sse
-|vp9_variance16x16_neon| PROC
- vmov.i8 q8, #0 ;q8 - sum
- vmov.i8 q9, #0 ;q9, q10 - sse
- vmov.i8 q10, #0
-
- mov r12, #8
-
-variance16x16_neon_loop
- vld1.8 {q0}, [r0], r1 ;Load up source and reference
- vld1.8 {q2}, [r2], r3
- vld1.8 {q1}, [r0], r1
- vld1.8 {q3}, [r2], r3
-
- vsubl.u8 q11, d0, d4 ;calculate diff
- vsubl.u8 q12, d1, d5
- vsubl.u8 q13, d2, d6
- vsubl.u8 q14, d3, d7
-
-    ;VPADAL adds adjacent pairs of elements of a vector and accumulates
-    ;the results into the elements of the destination vector. The explanation
-    ;in the ARM guide is wrong.
- vpadal.s16 q8, q11 ;calculate sum
- vmlal.s16 q9, d22, d22 ;calculate sse
- vmlal.s16 q10, d23, d23
-
- subs r12, r12, #1
-
- vpadal.s16 q8, q12
- vmlal.s16 q9, d24, d24
- vmlal.s16 q10, d25, d25
- vpadal.s16 q8, q13
- vmlal.s16 q9, d26, d26
- vmlal.s16 q10, d27, d27
- vpadal.s16 q8, q14
- vmlal.s16 q9, d28, d28
- vmlal.s16 q10, d29, d29
-
- bne variance16x16_neon_loop
-
- vadd.u32 q10, q9, q10 ;accumulate sse
- vpaddl.s32 q0, q8 ;accumulate sum
-
- ldr r12, [sp] ;load *sse from stack
-
- vpaddl.u32 q1, q10
- vadd.s64 d0, d0, d1
- vadd.u64 d1, d2, d3
-
- ;vmov.32 r0, d0[0] ;this instruction costs a lot
- ;vmov.32 r1, d1[0]
- ;mul r0, r0, r0
- ;str r1, [r12]
- ;sub r0, r1, r0, asr #8
-
-    ;sum is in [-255x256, 255x256], so sum*sum fits in 32 bits. The right shift
-    ;must sign-extend, i.e. vshr.s, so s32 has to be used to get the correct result.
- vmull.s32 q5, d0, d0
- vst1.32 {d1[0]}, [r12] ;store sse
- vshr.s32 d10, d10, #8
- vsub.s32 d0, d1, d10
-
- vmov.32 r0, d0[0] ;return
- bx lr
-
- ENDP
-
-;================================
-;unsigned int vp9_variance16x8_c(
-; unsigned char *src_ptr,
-; int source_stride,
-; unsigned char *ref_ptr,
-; int recon_stride,
-; unsigned int *sse)
-|vp9_variance16x8_neon| PROC
- vmov.i8 q8, #0 ;q8 - sum
- vmov.i8 q9, #0 ;q9, q10 - sse
- vmov.i8 q10, #0
-
- mov r12, #4
-
-variance16x8_neon_loop
- vld1.8 {q0}, [r0], r1 ;Load up source and reference
- vld1.8 {q2}, [r2], r3
- vld1.8 {q1}, [r0], r1
- vld1.8 {q3}, [r2], r3
-
- vsubl.u8 q11, d0, d4 ;calculate diff
- vsubl.u8 q12, d1, d5
- vsubl.u8 q13, d2, d6
- vsubl.u8 q14, d3, d7
-
- vpadal.s16 q8, q11 ;calculate sum
- vmlal.s16 q9, d22, d22 ;calculate sse
- vmlal.s16 q10, d23, d23
-
- subs r12, r12, #1
-
- vpadal.s16 q8, q12
- vmlal.s16 q9, d24, d24
- vmlal.s16 q10, d25, d25
- vpadal.s16 q8, q13
- vmlal.s16 q9, d26, d26
- vmlal.s16 q10, d27, d27
- vpadal.s16 q8, q14
- vmlal.s16 q9, d28, d28
- vmlal.s16 q10, d29, d29
-
- bne variance16x8_neon_loop
-
- vadd.u32 q10, q9, q10 ;accumulate sse
- vpaddl.s32 q0, q8 ;accumulate sum
-
- ldr r12, [sp] ;load *sse from stack
-
- vpaddl.u32 q1, q10
- vadd.s64 d0, d0, d1
- vadd.u64 d1, d2, d3
-
- vmull.s32 q5, d0, d0
- vst1.32 {d1[0]}, [r12] ;store sse
- vshr.s32 d10, d10, #7
- vsub.s32 d0, d1, d10
-
- vmov.32 r0, d0[0] ;return
- bx lr
-
- ENDP
-
-;=================================
-;unsigned int vp9_variance8x16_c(
-; unsigned char *src_ptr,
-; int source_stride,
-; unsigned char *ref_ptr,
-; int recon_stride,
-; unsigned int *sse)
-
-|vp9_variance8x16_neon| PROC
- vmov.i8 q8, #0 ;q8 - sum
- vmov.i8 q9, #0 ;q9, q10 - sse
- vmov.i8 q10, #0
-
- mov r12, #8
-
-variance8x16_neon_loop
- vld1.8 {d0}, [r0], r1 ;Load up source and reference
- vld1.8 {d4}, [r2], r3
- vld1.8 {d2}, [r0], r1
- vld1.8 {d6}, [r2], r3
-
- vsubl.u8 q11, d0, d4 ;calculate diff
- vsubl.u8 q12, d2, d6
-
- vpadal.s16 q8, q11 ;calculate sum
- vmlal.s16 q9, d22, d22 ;calculate sse
- vmlal.s16 q10, d23, d23
-
- subs r12, r12, #1
-
- vpadal.s16 q8, q12
- vmlal.s16 q9, d24, d24
- vmlal.s16 q10, d25, d25
-
- bne variance8x16_neon_loop
-
- vadd.u32 q10, q9, q10 ;accumulate sse
- vpaddl.s32 q0, q8 ;accumulate sum
-
- ldr r12, [sp] ;load *sse from stack
-
- vpaddl.u32 q1, q10
- vadd.s64 d0, d0, d1
- vadd.u64 d1, d2, d3
-
- vmull.s32 q5, d0, d0
- vst1.32 {d1[0]}, [r12] ;store sse
- vshr.s32 d10, d10, #7
- vsub.s32 d0, d1, d10
-
- vmov.32 r0, d0[0] ;return
- bx lr
-
- ENDP
-
-;==================================
-; r0 unsigned char *src_ptr
-; r1 int source_stride
-; r2 unsigned char *ref_ptr
-; r3 int recon_stride
-; stack unsigned int *sse
-|vp9_variance8x8_neon| PROC
- vmov.i8 q8, #0 ;q8 - sum
- vmov.i8 q9, #0 ;q9, q10 - sse
- vmov.i8 q10, #0
-
- mov r12, #2
-
-variance8x8_neon_loop
- vld1.8 {d0}, [r0], r1 ;Load up source and reference
- vld1.8 {d4}, [r2], r3
- vld1.8 {d1}, [r0], r1
- vld1.8 {d5}, [r2], r3
- vld1.8 {d2}, [r0], r1
- vld1.8 {d6}, [r2], r3
- vld1.8 {d3}, [r0], r1
- vld1.8 {d7}, [r2], r3
-
- vsubl.u8 q11, d0, d4 ;calculate diff
- vsubl.u8 q12, d1, d5
- vsubl.u8 q13, d2, d6
- vsubl.u8 q14, d3, d7
-
- vpadal.s16 q8, q11 ;calculate sum
- vmlal.s16 q9, d22, d22 ;calculate sse
- vmlal.s16 q10, d23, d23
-
- subs r12, r12, #1
-
- vpadal.s16 q8, q12
- vmlal.s16 q9, d24, d24
- vmlal.s16 q10, d25, d25
- vpadal.s16 q8, q13
- vmlal.s16 q9, d26, d26
- vmlal.s16 q10, d27, d27
- vpadal.s16 q8, q14
- vmlal.s16 q9, d28, d28
- vmlal.s16 q10, d29, d29
-
- bne variance8x8_neon_loop
-
- vadd.u32 q10, q9, q10 ;accumulate sse
- vpaddl.s32 q0, q8 ;accumulate sum
-
- ldr r12, [sp] ;load *sse from stack
-
- vpaddl.u32 q1, q10
- vadd.s64 d0, d0, d1
- vadd.u64 d1, d2, d3
-
- vmull.s32 q5, d0, d0
- vst1.32 {d1[0]}, [r12] ;store sse
- vshr.s32 d10, d10, #6
- vsub.s32 d0, d1, d10
-
- vmov.32 r0, d0[0] ;return
- bx lr
-
- ENDP
-
- END
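Each of the deleted variance kernels ends with the same reduction: accumulate the sum and the SSE, then return SSE minus sum squared divided by the pixel count (the vshr.s32 by #8, #7 or #6 for 256, 128 and 64 pixels respectively). A scalar sketch of that contract, for reference only and with illustrative names:

/* variance = sse - sum*sum / (w*h); the NEON code performs the division with
   an arithmetic shift (#8 for 16x16, #7 for 16x8 / 8x16, #6 for 8x8). */
static unsigned int variance_sketch(const unsigned char *src, int src_stride,
                                    const unsigned char *ref, int ref_stride,
                                    int w, int h, unsigned int *sse) {
  int r, c, diff, sum = 0;
  unsigned int sse_acc = 0;

  for (r = 0; r < h; r++) {
    for (c = 0; c < w; c++) {
      diff = src[c] - ref[c];
      sum += diff;
      sse_acc += diff * diff;
    }
    src += src_stride;
    ref += ref_stride;
  }
  *sse = sse_acc;
  /* widen before squaring; |sum| can reach 255*256 for a 16x16 block */
  return sse_acc - (unsigned int)(((long long)sum * sum) / (w * h));
}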
--- a/vp8/encoder/arm/neon/vp8_memcpy_neon.asm
+++ /dev/null
@@ -1,68 +1,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
- EXPORT |vp8_memcpy_neon|
-
- ARM
- REQUIRE8
- PRESERVE8
-
- AREA ||.text||, CODE, READONLY, ALIGN=2
-;=========================================
-;void vp8_memcpy_neon(unsigned char *dst_ptr, unsigned char *src_ptr, int sz);
-|vp8_memcpy_neon| PROC
- ;pld [r1] ;preload pred data
- ;pld [r1, #128]
- ;pld [r1, #256]
- ;pld [r1, #384]
-
- mov r12, r2, lsr #8 ;copy 256 bytes data at one time
-
-memcpy_neon_loop
- vld1.8 {q0, q1}, [r1]! ;load src data
- subs r12, r12, #1
- vld1.8 {q2, q3}, [r1]!
- vst1.8 {q0, q1}, [r0]! ;copy to dst_ptr
- vld1.8 {q4, q5}, [r1]!
- vst1.8 {q2, q3}, [r0]!
- vld1.8 {q6, q7}, [r1]!
- vst1.8 {q4, q5}, [r0]!
- vld1.8 {q8, q9}, [r1]!
- vst1.8 {q6, q7}, [r0]!
- vld1.8 {q10, q11}, [r1]!
- vst1.8 {q8, q9}, [r0]!
- vld1.8 {q12, q13}, [r1]!
- vst1.8 {q10, q11}, [r0]!
- vld1.8 {q14, q15}, [r1]!
- vst1.8 {q12, q13}, [r0]!
- vst1.8 {q14, q15}, [r0]!
-
- ;pld [r1] ;preload pred data -- need to adjust for real device
- ;pld [r1, #128]
- ;pld [r1, #256]
- ;pld [r1, #384]
-
- bne memcpy_neon_loop
-
- ands r3, r2, #0xff ;extra copy
- beq done_copy_neon_loop
-
-extra_copy_neon_loop
- vld1.8 {q0}, [r1]! ;load src data
- subs r3, r3, #16
- vst1.8 {q0}, [r0]!
- bne extra_copy_neon_loop
-
-done_copy_neon_loop
- bx lr
- ENDP
-
- END
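vp8_memcpy_neon above is just a specialised copy: 256-byte blocks in the main loop and a 16-byte tail loop for whatever remains (the routine assumes the size is a multiple of 16). A scalar equivalent, shown only to document that contract; the name is illustrative:

#include <string.h>

/* Copy sz bytes in 256-byte blocks, then the remainder in 16-byte steps,
   mirroring the structure of the deleted NEON loop. sz is assumed to be a
   multiple of 16, as in the original. */
static void memcpy_like_neon_sketch(unsigned char *dst, unsigned char *src, int sz) {
  int blocks = sz >> 8;
  int extra = sz & 0xff;

  while (blocks--) {
    memcpy(dst, src, 256);
    dst += 256;
    src += 256;
  }
  while (extra > 0) {
    memcpy(dst, src, 16);
    dst += 16;
    src += 16;
    extra -= 16;
  }
}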
--- a/vp8/encoder/arm/neon/vp8_mse16x16_neon.asm
+++ /dev/null
@@ -1,116 +1,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
- EXPORT |vp8_mse16x16_neon|
- EXPORT |vp8_get4x4sse_cs_neon|
-
- ARM
- REQUIRE8
- PRESERVE8
-
- AREA ||.text||, CODE, READONLY, ALIGN=2
-;============================
-; r0 unsigned char *src_ptr
-; r1 int source_stride
-; r2 unsigned char *ref_ptr
-; r3 int recon_stride
-; stack unsigned int *sse
-;note: in this function the sum is never used, so that part of the calculation
-;from vp9_variance() can be removed.
-
-|vp8_mse16x16_neon| PROC
- vmov.i8 q7, #0 ;q7, q8, q9, q10 - sse
- vmov.i8 q8, #0
- vmov.i8 q9, #0
- vmov.i8 q10, #0
-
- mov r12, #8
-
-mse16x16_neon_loop
- vld1.8 {q0}, [r0], r1 ;Load up source and reference
- vld1.8 {q2}, [r2], r3
- vld1.8 {q1}, [r0], r1
- vld1.8 {q3}, [r2], r3
-
- vsubl.u8 q11, d0, d4
- vsubl.u8 q12, d1, d5
- vsubl.u8 q13, d2, d6
- vsubl.u8 q14, d3, d7
-
- vmlal.s16 q7, d22, d22
- vmlal.s16 q8, d23, d23
-
- subs r12, r12, #1
-
- vmlal.s16 q9, d24, d24
- vmlal.s16 q10, d25, d25
- vmlal.s16 q7, d26, d26
- vmlal.s16 q8, d27, d27
- vmlal.s16 q9, d28, d28
- vmlal.s16 q10, d29, d29
-
- bne mse16x16_neon_loop
-
- vadd.u32 q7, q7, q8
- vadd.u32 q9, q9, q10
-
- ldr r12, [sp] ;load *sse from stack
-
- vadd.u32 q10, q7, q9
- vpaddl.u32 q1, q10
- vadd.u64 d0, d2, d3
-
- vst1.32 {d0[0]}, [r12]
- vmov.32 r0, d0[0]
-
- bx lr
-
- ENDP
-
-
-;=============================
-; r0 unsigned char *src_ptr,
-; r1 int source_stride,
-; r2 unsigned char *ref_ptr,
-; r3 int recon_stride
-|vp8_get4x4sse_cs_neon| PROC
- vld1.8 {d0}, [r0], r1 ;Load up source and reference
- vld1.8 {d4}, [r2], r3
- vld1.8 {d1}, [r0], r1
- vld1.8 {d5}, [r2], r3
- vld1.8 {d2}, [r0], r1
- vld1.8 {d6}, [r2], r3
- vld1.8 {d3}, [r0], r1
- vld1.8 {d7}, [r2], r3
-
- vsubl.u8 q11, d0, d4
- vsubl.u8 q12, d1, d5
- vsubl.u8 q13, d2, d6
- vsubl.u8 q14, d3, d7
-
- vmull.s16 q7, d22, d22
- vmull.s16 q8, d24, d24
- vmull.s16 q9, d26, d26
- vmull.s16 q10, d28, d28
-
- vadd.u32 q7, q7, q8
- vadd.u32 q9, q9, q10
- vadd.u32 q9, q7, q9
-
- vpaddl.u32 q1, q9
- vadd.u64 d0, d2, d3
-
- vmov.32 r0, d0[0]
- bx lr
-
- ENDP
-
- END
--- a/vp8/encoder/arm/neon/vp8_shortwalsh4x4_neon.asm
+++ /dev/null
@@ -1,103 +1,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
- EXPORT |vp8_short_walsh4x4_neon|
-
- ARM
- REQUIRE8
- PRESERVE8
-
- AREA ||.text||, CODE, READONLY, ALIGN=2
-;void vp8_short_walsh4x4_neon(short *input, short *output, int pitch)
-; r0 short *input,
-; r1 short *output,
-; r2 int pitch
-|vp8_short_walsh4x4_neon| PROC
-
- vld1.16 {d0}, [r0@64], r2 ; load input
- vld1.16 {d1}, [r0@64], r2
- vld1.16 {d2}, [r0@64], r2
- vld1.16 {d3}, [r0@64]
-
- ;First for-loop
- ;transpose d0, d1, d2, d3. Then, d0=ip[0], d1=ip[1], d2=ip[2], d3=ip[3]
- vtrn.32 d0, d2
- vtrn.32 d1, d3
-
- vmov.s32 q15, #3 ; add 3 to all values
-
- vtrn.16 d0, d1
- vtrn.16 d2, d3
-
- vadd.s16 d4, d0, d2 ; ip[0] + ip[2]
- vadd.s16 d5, d1, d3 ; ip[1] + ip[3]
- vsub.s16 d6, d1, d3 ; ip[1] - ip[3]
- vsub.s16 d7, d0, d2 ; ip[0] - ip[2]
-
- vshl.s16 d4, d4, #2 ; a1 = (ip[0] + ip[2]) << 2
- vshl.s16 d5, d5, #2 ; d1 = (ip[1] + ip[3]) << 2
- vshl.s16 d6, d6, #2 ; c1 = (ip[1] - ip[3]) << 2
- vceq.s16 d16, d4, #0 ; a1 == 0
- vshl.s16 d7, d7, #2 ; b1 = (ip[0] - ip[2]) << 2
-
- vadd.s16 d0, d4, d5 ; a1 + d1
- vmvn d16, d16 ; a1 != 0
- vsub.s16 d3, d4, d5 ; op[3] = a1 - d1
- vadd.s16 d1, d7, d6 ; op[1] = b1 + c1
- vsub.s16 d2, d7, d6 ; op[2] = b1 - c1
- vsub.s16 d0, d0, d16 ; op[0] = a1 + d1 + (a1 != 0)
-
- ;Second for-loop
- ;transpose d0, d1, d2, d3, Then, d0=ip[0], d1=ip[4], d2=ip[8], d3=ip[12]
- vtrn.32 d1, d3
- vtrn.32 d0, d2
- vtrn.16 d2, d3
- vtrn.16 d0, d1
-
- vaddl.s16 q8, d0, d2 ; a1 = ip[0]+ip[8]
- vaddl.s16 q9, d1, d3 ; d1 = ip[4]+ip[12]
- vsubl.s16 q10, d1, d3 ; c1 = ip[4]-ip[12]
- vsubl.s16 q11, d0, d2 ; b1 = ip[0]-ip[8]
-
- vadd.s32 q0, q8, q9 ; a2 = a1 + d1
- vadd.s32 q1, q11, q10 ; b2 = b1 + c1
- vsub.s32 q2, q11, q10 ; c2 = b1 - c1
- vsub.s32 q3, q8, q9 ; d2 = a1 - d1
-
- vclt.s32 q8, q0, #0
- vclt.s32 q9, q1, #0
- vclt.s32 q10, q2, #0
- vclt.s32 q11, q3, #0
-
- ; subtract -1 (or 0)
- vsub.s32 q0, q0, q8 ; a2 += a2 < 0
- vsub.s32 q1, q1, q9 ; b2 += b2 < 0
- vsub.s32 q2, q2, q10 ; c2 += c2 < 0
- vsub.s32 q3, q3, q11 ; d2 += d2 < 0
-
- vadd.s32 q8, q0, q15 ; a2 + 3
- vadd.s32 q9, q1, q15 ; b2 + 3
- vadd.s32 q10, q2, q15 ; c2 + 3
- vadd.s32 q11, q3, q15 ; d2 + 3
-
- ; vrshrn? would add 1 << 3-1 = 2
- vshrn.s32 d0, q8, #3
- vshrn.s32 d1, q9, #3
- vshrn.s32 d2, q10, #3
- vshrn.s32 d3, q11, #3
-
- vst1.16 {q0, q1}, [r1@128]
-
- bx lr
-
- ENDP
-
- END
--- a/vp8/encoder/arm/neon/vp8_subpixelvariance16x16_neon.asm
+++ /dev/null
@@ -1,425 +1,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
- EXPORT |vp9_sub_pixel_variance16x16_neon_func|
- ARM
- REQUIRE8
- PRESERVE8
-
- AREA ||.text||, CODE, READONLY, ALIGN=2
-; r0 unsigned char *src_ptr,
-; r1 int src_pixels_per_line,
-; r2 int xoffset,
-; r3 int yoffset,
-; stack(r4) unsigned char *dst_ptr,
-; stack(r5) int dst_pixels_per_line,
-; stack(r6) unsigned int *sse
-;note: most of the code is copied from bilinear_predict16x16_neon and vp9_variance16x16_neon.
-
-|vp9_sub_pixel_variance16x16_neon_func| PROC
- push {r4-r6, lr}
-
- ldr r12, _BilinearTaps_coeff_
- ldr r4, [sp, #16] ;load *dst_ptr from stack
- ldr r5, [sp, #20] ;load dst_pixels_per_line from stack
- ldr r6, [sp, #24] ;load *sse from stack
-
- cmp r2, #0 ;skip first_pass filter if xoffset=0
- beq secondpass_bfilter16x16_only
-
- add r2, r12, r2, lsl #3 ;calculate filter location
-
- cmp r3, #0 ;skip second_pass filter if yoffset=0
-
- vld1.s32 {d31}, [r2] ;load first_pass filter
-
- beq firstpass_bfilter16x16_only
-
- sub sp, sp, #272 ;reserve space on stack for temporary storage
- vld1.u8 {d2, d3, d4}, [r0], r1 ;load src data
- mov lr, sp
- vld1.u8 {d5, d6, d7}, [r0], r1
-
- mov r2, #3 ;loop counter
- vld1.u8 {d8, d9, d10}, [r0], r1
-
- vdup.8 d0, d31[0] ;first_pass filter (d0 d1)
- vld1.u8 {d11, d12, d13}, [r0], r1
-
- vdup.8 d1, d31[4]
-
-;First Pass: output_height lines x output_width columns (17x16)
-vp8e_filt_blk2d_fp16x16_loop_neon
- pld [r0]
- pld [r0, r1]
- pld [r0, r1, lsl #1]
-
- vmull.u8 q7, d2, d0 ;(src_ptr[0] * Filter[0])
- vmull.u8 q8, d3, d0
- vmull.u8 q9, d5, d0
- vmull.u8 q10, d6, d0
- vmull.u8 q11, d8, d0
- vmull.u8 q12, d9, d0
- vmull.u8 q13, d11, d0
- vmull.u8 q14, d12, d0
-
- vext.8 d2, d2, d3, #1 ;construct src_ptr[1]
- vext.8 d5, d5, d6, #1
- vext.8 d8, d8, d9, #1
- vext.8 d11, d11, d12, #1
-
- vmlal.u8 q7, d2, d1 ;(src_ptr[0] * Filter[1])
- vmlal.u8 q9, d5, d1
- vmlal.u8 q11, d8, d1
- vmlal.u8 q13, d11, d1
-
- vext.8 d3, d3, d4, #1
- vext.8 d6, d6, d7, #1
- vext.8 d9, d9, d10, #1
- vext.8 d12, d12, d13, #1
-
- vmlal.u8 q8, d3, d1 ;(src_ptr[0] * Filter[1])
- vmlal.u8 q10, d6, d1
- vmlal.u8 q12, d9, d1
- vmlal.u8 q14, d12, d1
-
- subs r2, r2, #1
-
- vqrshrn.u16 d14, q7, #7 ;shift/round/saturate to u8
- vqrshrn.u16 d15, q8, #7
- vqrshrn.u16 d16, q9, #7
- vqrshrn.u16 d17, q10, #7
- vqrshrn.u16 d18, q11, #7
- vqrshrn.u16 d19, q12, #7
- vqrshrn.u16 d20, q13, #7
-
- vld1.u8 {d2, d3, d4}, [r0], r1 ;load src data
- vqrshrn.u16 d21, q14, #7
- vld1.u8 {d5, d6, d7}, [r0], r1
-
- vst1.u8 {d14, d15, d16, d17}, [lr]! ;store result
- vld1.u8 {d8, d9, d10}, [r0], r1
- vst1.u8 {d18, d19, d20, d21}, [lr]!
- vld1.u8 {d11, d12, d13}, [r0], r1
-
- bne vp8e_filt_blk2d_fp16x16_loop_neon
-
-;First-pass filtering for the remaining 5 lines
- vld1.u8 {d14, d15, d16}, [r0], r1
-
- vmull.u8 q9, d2, d0 ;(src_ptr[0] * Filter[0])
- vmull.u8 q10, d3, d0
- vmull.u8 q11, d5, d0
- vmull.u8 q12, d6, d0
- vmull.u8 q13, d8, d0
- vmull.u8 q14, d9, d0
-
- vext.8 d2, d2, d3, #1 ;construct src_ptr[1]
- vext.8 d5, d5, d6, #1
- vext.8 d8, d8, d9, #1
-
- vmlal.u8 q9, d2, d1 ;(src_ptr[0] * Filter[1])
- vmlal.u8 q11, d5, d1
- vmlal.u8 q13, d8, d1
-
- vext.8 d3, d3, d4, #1
- vext.8 d6, d6, d7, #1
- vext.8 d9, d9, d10, #1
-
- vmlal.u8 q10, d3, d1 ;(src_ptr[0] * Filter[1])
- vmlal.u8 q12, d6, d1
- vmlal.u8 q14, d9, d1
-
- vmull.u8 q1, d11, d0
- vmull.u8 q2, d12, d0
- vmull.u8 q3, d14, d0
- vmull.u8 q4, d15, d0
-
- vext.8 d11, d11, d12, #1 ;construct src_ptr[1]
- vext.8 d14, d14, d15, #1
-
- vmlal.u8 q1, d11, d1 ;(src_ptr[0] * Filter[1])
- vmlal.u8 q3, d14, d1
-
- vext.8 d12, d12, d13, #1
- vext.8 d15, d15, d16, #1
-
- vmlal.u8 q2, d12, d1 ;(src_ptr[0] * Filter[1])
- vmlal.u8 q4, d15, d1
-
- vqrshrn.u16 d10, q9, #7 ;shift/round/saturate to u8
- vqrshrn.u16 d11, q10, #7
- vqrshrn.u16 d12, q11, #7
- vqrshrn.u16 d13, q12, #7
- vqrshrn.u16 d14, q13, #7
- vqrshrn.u16 d15, q14, #7
- vqrshrn.u16 d16, q1, #7
- vqrshrn.u16 d17, q2, #7
- vqrshrn.u16 d18, q3, #7
- vqrshrn.u16 d19, q4, #7
-
- vst1.u8 {d10, d11, d12, d13}, [lr]! ;store result
- vst1.u8 {d14, d15, d16, d17}, [lr]!
- vst1.u8 {d18, d19}, [lr]!
-
-;Second pass: 16x16
-;secondpass_filter
- add r3, r12, r3, lsl #3
- sub lr, lr, #272
-
- vld1.u32 {d31}, [r3] ;load second_pass filter
-
- sub sp, sp, #256
- mov r3, sp
-
- vld1.u8 {d22, d23}, [lr]! ;load src data
-
- vdup.8 d0, d31[0] ;second_pass filter parameters (d0 d1)
- vdup.8 d1, d31[4]
- mov r12, #4 ;loop counter
-
-vp8e_filt_blk2d_sp16x16_loop_neon
- vld1.u8 {d24, d25}, [lr]!
- vmull.u8 q1, d22, d0 ;(src_ptr[0] * Filter[0])
- vld1.u8 {d26, d27}, [lr]!
- vmull.u8 q2, d23, d0
- vld1.u8 {d28, d29}, [lr]!
- vmull.u8 q3, d24, d0
- vld1.u8 {d30, d31}, [lr]!
-
- vmull.u8 q4, d25, d0
- vmull.u8 q5, d26, d0
- vmull.u8 q6, d27, d0
- vmull.u8 q7, d28, d0
- vmull.u8 q8, d29, d0
-
- vmlal.u8 q1, d24, d1 ;(src_ptr[pixel_step] * Filter[1])
- vmlal.u8 q2, d25, d1
- vmlal.u8 q3, d26, d1
- vmlal.u8 q4, d27, d1
- vmlal.u8 q5, d28, d1
- vmlal.u8 q6, d29, d1
- vmlal.u8 q7, d30, d1
- vmlal.u8 q8, d31, d1
-
- subs r12, r12, #1
-
- vqrshrn.u16 d2, q1, #7 ;shift/round/saturate to u8
- vqrshrn.u16 d3, q2, #7
- vqrshrn.u16 d4, q3, #7
- vqrshrn.u16 d5, q4, #7
- vqrshrn.u16 d6, q5, #7
- vqrshrn.u16 d7, q6, #7
- vqrshrn.u16 d8, q7, #7
- vqrshrn.u16 d9, q8, #7
-
- vst1.u8 {d2, d3}, [r3]! ;store result
- vst1.u8 {d4, d5}, [r3]!
- vst1.u8 {d6, d7}, [r3]!
- vmov q11, q15
- vst1.u8 {d8, d9}, [r3]!
-
- bne vp8e_filt_blk2d_sp16x16_loop_neon
-
- b sub_pixel_variance16x16_neon
-
-;--------------------
-firstpass_bfilter16x16_only
- mov r2, #4 ;loop counter
- sub sp, sp, #528 ;reserve space on stack for temporary storage
- vdup.8 d0, d31[0] ;first_pass filter (d0 d1)
- vdup.8 d1, d31[4]
- mov r3, sp
-
-;First Pass: output_height lines x output_width columns (16x16)
-vp8e_filt_blk2d_fpo16x16_loop_neon
- vld1.u8 {d2, d3, d4}, [r0], r1 ;load src data
- vld1.u8 {d5, d6, d7}, [r0], r1
- vld1.u8 {d8, d9, d10}, [r0], r1
- vld1.u8 {d11, d12, d13}, [r0], r1
-
- pld [r0]
- pld [r0, r1]
- pld [r0, r1, lsl #1]
-
- vmull.u8 q7, d2, d0 ;(src_ptr[0] * Filter[0])
- vmull.u8 q8, d3, d0
- vmull.u8 q9, d5, d0
- vmull.u8 q10, d6, d0
- vmull.u8 q11, d8, d0
- vmull.u8 q12, d9, d0
- vmull.u8 q13, d11, d0
- vmull.u8 q14, d12, d0
-
- vext.8 d2, d2, d3, #1 ;construct src_ptr[1]
- vext.8 d5, d5, d6, #1
- vext.8 d8, d8, d9, #1
- vext.8 d11, d11, d12, #1
-
- vmlal.u8 q7, d2, d1 ;(src_ptr[0] * Filter[1])
- vmlal.u8 q9, d5, d1
- vmlal.u8 q11, d8, d1
- vmlal.u8 q13, d11, d1
-
- vext.8 d3, d3, d4, #1
- vext.8 d6, d6, d7, #1
- vext.8 d9, d9, d10, #1
- vext.8 d12, d12, d13, #1
-
- vmlal.u8 q8, d3, d1 ;(src_ptr[0] * Filter[1])
- vmlal.u8 q10, d6, d1
- vmlal.u8 q12, d9, d1
- vmlal.u8 q14, d12, d1
-
- subs r2, r2, #1
-
- vqrshrn.u16 d14, q7, #7 ;shift/round/saturate to u8
- vqrshrn.u16 d15, q8, #7
- vqrshrn.u16 d16, q9, #7
- vqrshrn.u16 d17, q10, #7
- vqrshrn.u16 d18, q11, #7
- vqrshrn.u16 d19, q12, #7
- vqrshrn.u16 d20, q13, #7
- vst1.u8 {d14, d15}, [r3]! ;store result
- vqrshrn.u16 d21, q14, #7
-
- vst1.u8 {d16, d17}, [r3]!
- vst1.u8 {d18, d19}, [r3]!
- vst1.u8 {d20, d21}, [r3]!
-
- bne vp8e_filt_blk2d_fpo16x16_loop_neon
-
- b sub_pixel_variance16x16_neon
-
-;---------------------
-secondpass_bfilter16x16_only
-;Second pass: 16x16
-;secondpass_filter
- sub sp, sp, #528 ;reserve space on stack for temporary storage
- add r3, r12, r3, lsl #3
- mov r12, #4 ;loop counter
- vld1.u32 {d31}, [r3] ;load second_pass filter
- vld1.u8 {d22, d23}, [r0], r1 ;load src data
- mov r3, sp
-
- vdup.8 d0, d31[0] ;second_pass filter parameters (d0 d1)
- vdup.8 d1, d31[4]
-
-vp8e_filt_blk2d_spo16x16_loop_neon
- vld1.u8 {d24, d25}, [r0], r1
- vmull.u8 q1, d22, d0 ;(src_ptr[0] * Filter[0])
- vld1.u8 {d26, d27}, [r0], r1
- vmull.u8 q2, d23, d0
- vld1.u8 {d28, d29}, [r0], r1
- vmull.u8 q3, d24, d0
- vld1.u8 {d30, d31}, [r0], r1
-
- vmull.u8 q4, d25, d0
- vmull.u8 q5, d26, d0
- vmull.u8 q6, d27, d0
- vmull.u8 q7, d28, d0
- vmull.u8 q8, d29, d0
-
- vmlal.u8 q1, d24, d1 ;(src_ptr[pixel_step] * Filter[1])
- vmlal.u8 q2, d25, d1
- vmlal.u8 q3, d26, d1
- vmlal.u8 q4, d27, d1
- vmlal.u8 q5, d28, d1
- vmlal.u8 q6, d29, d1
- vmlal.u8 q7, d30, d1
- vmlal.u8 q8, d31, d1
-
- vqrshrn.u16 d2, q1, #7 ;shift/round/saturate to u8
- vqrshrn.u16 d3, q2, #7
- vqrshrn.u16 d4, q3, #7
- vqrshrn.u16 d5, q4, #7
- vqrshrn.u16 d6, q5, #7
- vqrshrn.u16 d7, q6, #7
- vqrshrn.u16 d8, q7, #7
- vqrshrn.u16 d9, q8, #7
-
- vst1.u8 {d2, d3}, [r3]! ;store result
- subs r12, r12, #1
- vst1.u8 {d4, d5}, [r3]!
- vmov q11, q15
- vst1.u8 {d6, d7}, [r3]!
- vst1.u8 {d8, d9}, [r3]!
-
- bne vp8e_filt_blk2d_spo16x16_loop_neon
-
- b sub_pixel_variance16x16_neon
-
-;----------------------------
-;variance16x16
-sub_pixel_variance16x16_neon
- vmov.i8 q8, #0 ;q8 - sum
- vmov.i8 q9, #0 ;q9, q10 - sse
- vmov.i8 q10, #0
-
- sub r3, r3, #256
- mov r12, #8
-
-sub_pixel_variance16x16_neon_loop
- vld1.8 {q0}, [r3]! ;Load up source and reference
- vld1.8 {q2}, [r4], r5
- vld1.8 {q1}, [r3]!
- vld1.8 {q3}, [r4], r5
-
- vsubl.u8 q11, d0, d4 ;diff
- vsubl.u8 q12, d1, d5
- vsubl.u8 q13, d2, d6
- vsubl.u8 q14, d3, d7
-
- vpadal.s16 q8, q11 ;sum
- vmlal.s16 q9, d22, d22 ;sse
- vmlal.s16 q10, d23, d23
-
- subs r12, r12, #1
-
- vpadal.s16 q8, q12
- vmlal.s16 q9, d24, d24
- vmlal.s16 q10, d25, d25
- vpadal.s16 q8, q13
- vmlal.s16 q9, d26, d26
- vmlal.s16 q10, d27, d27
- vpadal.s16 q8, q14
- vmlal.s16 q9, d28, d28
- vmlal.s16 q10, d29, d29
-
- bne sub_pixel_variance16x16_neon_loop
-
- vadd.u32 q10, q9, q10 ;accumulate sse
- vpaddl.s32 q0, q8 ;accumulate sum
-
- vpaddl.u32 q1, q10
- vadd.s64 d0, d0, d1
- vadd.u64 d1, d2, d3
-
- vmull.s32 q5, d0, d0
- vst1.32 {d1[0]}, [r6] ;store sse
- vshr.s32 d10, d10, #8
- vsub.s32 d0, d1, d10
-
- add sp, sp, #528
- vmov.32 r0, d0[0] ;return
-
- pop {r4-r6,pc}
-
- ENDP
-
-;-----------------
-
-_BilinearTaps_coeff_
- DCD bilinear_taps_coeff
-bilinear_taps_coeff
- DCD 128, 0, 112, 16, 96, 32, 80, 48, 64, 64, 48, 80, 32, 96, 16, 112
-
- END
--- a/vp8/encoder/arm/neon/vp8_subpixelvariance16x16s_neon.asm
+++ /dev/null
@@ -1,572 +1,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
- EXPORT |vp9_variance_halfpixvar16x16_h_neon|
- EXPORT |vp9_variance_halfpixvar16x16_v_neon|
- EXPORT |vp9_variance_halfpixvar16x16_hv_neon|
- EXPORT |vp9_sub_pixel_variance16x16s_neon|
- ARM
- REQUIRE8
- PRESERVE8
-
- AREA ||.text||, CODE, READONLY, ALIGN=2
-
-;================================================
-;unsigned int vp9_variance_halfpixvar16x16_h_neon
-;(
-; unsigned char *src_ptr, r0
-; int src_pixels_per_line, r1
-; unsigned char *dst_ptr, r2
-; int dst_pixels_per_line, r3
-; unsigned int *sse
-;);
-;================================================
-|vp9_variance_halfpixvar16x16_h_neon| PROC
- push {lr}
-
- mov r12, #4 ;loop counter
- ldr lr, [sp, #4] ;load *sse from stack
- vmov.i8 q8, #0 ;q8 - sum
- vmov.i8 q9, #0 ;q9, q10 - sse
- vmov.i8 q10, #0
-
-;First Pass: output_height lines x output_width columns (16x16)
-vp8_filt_fpo16x16s_4_0_loop_neon
- vld1.u8 {d0, d1, d2, d3}, [r0], r1 ;load src data
- vld1.8 {q11}, [r2], r3
- vld1.u8 {d4, d5, d6, d7}, [r0], r1
- vld1.8 {q12}, [r2], r3
- vld1.u8 {d8, d9, d10, d11}, [r0], r1
- vld1.8 {q13}, [r2], r3
- vld1.u8 {d12, d13, d14, d15}, [r0], r1
-
- ;pld [r0]
- ;pld [r0, r1]
- ;pld [r0, r1, lsl #1]
-
- vext.8 q1, q0, q1, #1 ;construct src_ptr[1]
- vext.8 q3, q2, q3, #1
- vext.8 q5, q4, q5, #1
- vext.8 q7, q6, q7, #1
-
- vrhadd.u8 q0, q0, q1 ;(src_ptr[0]+src_ptr[1])/round/shift right 1
- vld1.8 {q14}, [r2], r3
- vrhadd.u8 q1, q2, q3
- vrhadd.u8 q2, q4, q5
- vrhadd.u8 q3, q6, q7
-
- vsubl.u8 q4, d0, d22 ;diff
- vsubl.u8 q5, d1, d23
- vsubl.u8 q6, d2, d24
- vsubl.u8 q7, d3, d25
- vsubl.u8 q0, d4, d26
- vsubl.u8 q1, d5, d27
- vsubl.u8 q2, d6, d28
- vsubl.u8 q3, d7, d29
-
- vpadal.s16 q8, q4 ;sum
- vmlal.s16 q9, d8, d8 ;sse
- vmlal.s16 q10, d9, d9
-
- subs r12, r12, #1
-
- vpadal.s16 q8, q5
- vmlal.s16 q9, d10, d10
- vmlal.s16 q10, d11, d11
- vpadal.s16 q8, q6
- vmlal.s16 q9, d12, d12
- vmlal.s16 q10, d13, d13
- vpadal.s16 q8, q7
- vmlal.s16 q9, d14, d14
- vmlal.s16 q10, d15, d15
-
- vpadal.s16 q8, q0 ;sum
- vmlal.s16 q9, d0, d0 ;sse
- vmlal.s16 q10, d1, d1
- vpadal.s16 q8, q1
- vmlal.s16 q9, d2, d2
- vmlal.s16 q10, d3, d3
- vpadal.s16 q8, q2
- vmlal.s16 q9, d4, d4
- vmlal.s16 q10, d5, d5
- vpadal.s16 q8, q3
- vmlal.s16 q9, d6, d6
- vmlal.s16 q10, d7, d7
-
- bne vp8_filt_fpo16x16s_4_0_loop_neon
-
- vadd.u32 q10, q9, q10 ;accumulate sse
- vpaddl.s32 q0, q8 ;accumulate sum
-
- vpaddl.u32 q1, q10
- vadd.s64 d0, d0, d1
- vadd.u64 d1, d2, d3
-
- vmull.s32 q5, d0, d0
- vst1.32 {d1[0]}, [lr] ;store sse
- vshr.s32 d10, d10, #8
- vsub.s32 d0, d1, d10
-
- vmov.32 r0, d0[0] ;return
- pop {pc}
- ENDP
-
-;================================================
-;unsigned int vp9_variance_halfpixvar16x16_v_neon
-;(
-; unsigned char *src_ptr, r0
-; int src_pixels_per_line, r1
-; unsigned char *dst_ptr, r2
-; int dst_pixels_per_line, r3
-; unsigned int *sse
-;);
-;================================================
-|vp9_variance_halfpixvar16x16_v_neon| PROC
- push {lr}
-
- mov r12, #4 ;loop counter
-
- vld1.u8 {q0}, [r0], r1 ;load src data
- ldr lr, [sp, #4] ;load *sse from stack
-
- vmov.i8 q8, #0 ;q8 - sum
- vmov.i8 q9, #0 ;q9, q10 - sse
- vmov.i8 q10, #0
-
-vp8_filt_spo16x16s_0_4_loop_neon
- vld1.u8 {q2}, [r0], r1
- vld1.8 {q1}, [r2], r3
- vld1.u8 {q4}, [r0], r1
- vld1.8 {q3}, [r2], r3
- vld1.u8 {q6}, [r0], r1
- vld1.8 {q5}, [r2], r3
- vld1.u8 {q15}, [r0], r1
-
- vrhadd.u8 q0, q0, q2
- vld1.8 {q7}, [r2], r3
- vrhadd.u8 q2, q2, q4
- vrhadd.u8 q4, q4, q6
- vrhadd.u8 q6, q6, q15
-
- vsubl.u8 q11, d0, d2 ;diff
- vsubl.u8 q12, d1, d3
- vsubl.u8 q13, d4, d6
- vsubl.u8 q14, d5, d7
- vsubl.u8 q0, d8, d10
- vsubl.u8 q1, d9, d11
- vsubl.u8 q2, d12, d14
- vsubl.u8 q3, d13, d15
-
- vpadal.s16 q8, q11 ;sum
- vmlal.s16 q9, d22, d22 ;sse
- vmlal.s16 q10, d23, d23
-
- subs r12, r12, #1
-
- vpadal.s16 q8, q12
- vmlal.s16 q9, d24, d24
- vmlal.s16 q10, d25, d25
- vpadal.s16 q8, q13
- vmlal.s16 q9, d26, d26
- vmlal.s16 q10, d27, d27
- vpadal.s16 q8, q14
- vmlal.s16 q9, d28, d28
- vmlal.s16 q10, d29, d29
-
- vpadal.s16 q8, q0 ;sum
- vmlal.s16 q9, d0, d0 ;sse
- vmlal.s16 q10, d1, d1
- vpadal.s16 q8, q1
- vmlal.s16 q9, d2, d2
- vmlal.s16 q10, d3, d3
- vpadal.s16 q8, q2
- vmlal.s16 q9, d4, d4
- vmlal.s16 q10, d5, d5
-
- vmov q0, q15
-
- vpadal.s16 q8, q3
- vmlal.s16 q9, d6, d6
- vmlal.s16 q10, d7, d7
-
- bne vp8_filt_spo16x16s_0_4_loop_neon
-
- vadd.u32 q10, q9, q10 ;accumulate sse
- vpaddl.s32 q0, q8 ;accumulate sum
-
- vpaddl.u32 q1, q10
- vadd.s64 d0, d0, d1
- vadd.u64 d1, d2, d3
-
- vmull.s32 q5, d0, d0
- vst1.32 {d1[0]}, [lr] ;store sse
- vshr.s32 d10, d10, #8
- vsub.s32 d0, d1, d10
-
- vmov.32 r0, d0[0] ;return
- pop {pc}
- ENDP
-
-;================================================
-;unsigned int vp9_variance_halfpixvar16x16_hv_neon
-;(
-; unsigned char *src_ptr, r0
-; int src_pixels_per_line, r1
-; unsigned char *dst_ptr, r2
-; int dst_pixels_per_line, r3
-; unsigned int *sse
-;);
-;================================================
-|vp9_variance_halfpixvar16x16_hv_neon| PROC
- push {lr}
-
- vld1.u8 {d0, d1, d2, d3}, [r0], r1 ;load src data
-
- ldr lr, [sp, #4] ;load *sse from stack
- vmov.i8 q13, #0 ;q8 - sum
- vext.8 q1, q0, q1, #1 ;construct src_ptr[1]
-
- vmov.i8 q14, #0 ;q9, q10 - sse
- vmov.i8 q15, #0
-
- mov r12, #4 ;loop counter
- vrhadd.u8 q0, q0, q1 ;(src_ptr[0]+src_ptr[1])/round/shift right 1
-
-;First Pass: output_height lines x output_width columns (17x16)
-vp8_filt16x16s_4_4_loop_neon
- vld1.u8 {d4, d5, d6, d7}, [r0], r1
- vld1.u8 {d8, d9, d10, d11}, [r0], r1
- vld1.u8 {d12, d13, d14, d15}, [r0], r1
- vld1.u8 {d16, d17, d18, d19}, [r0], r1
-
- ;pld [r0]
- ;pld [r0, r1]
- ;pld [r0, r1, lsl #1]
-
- vext.8 q3, q2, q3, #1 ;construct src_ptr[1]
- vext.8 q5, q4, q5, #1
- vext.8 q7, q6, q7, #1
- vext.8 q9, q8, q9, #1
-
- vrhadd.u8 q1, q2, q3 ;(src_ptr[0]+src_ptr[1])/round/shift right 1
- vrhadd.u8 q2, q4, q5
- vrhadd.u8 q3, q6, q7
- vrhadd.u8 q4, q8, q9
-
- vld1.8 {q5}, [r2], r3
- vrhadd.u8 q0, q0, q1
- vld1.8 {q6}, [r2], r3
- vrhadd.u8 q1, q1, q2
- vld1.8 {q7}, [r2], r3
- vrhadd.u8 q2, q2, q3
- vld1.8 {q8}, [r2], r3
- vrhadd.u8 q3, q3, q4
-
- vsubl.u8 q9, d0, d10 ;diff
- vsubl.u8 q10, d1, d11
- vsubl.u8 q11, d2, d12
- vsubl.u8 q12, d3, d13
-
- vsubl.u8 q0, d4, d14 ;diff
- vsubl.u8 q1, d5, d15
- vsubl.u8 q5, d6, d16
- vsubl.u8 q6, d7, d17
-
- vpadal.s16 q13, q9 ;sum
- vmlal.s16 q14, d18, d18 ;sse
- vmlal.s16 q15, d19, d19
-
- vpadal.s16 q13, q10 ;sum
- vmlal.s16 q14, d20, d20 ;sse
- vmlal.s16 q15, d21, d21
-
- vpadal.s16 q13, q11 ;sum
- vmlal.s16 q14, d22, d22 ;sse
- vmlal.s16 q15, d23, d23
-
- vpadal.s16 q13, q12 ;sum
- vmlal.s16 q14, d24, d24 ;sse
- vmlal.s16 q15, d25, d25
-
- subs r12, r12, #1
-
- vpadal.s16 q13, q0 ;sum
- vmlal.s16 q14, d0, d0 ;sse
- vmlal.s16 q15, d1, d1
-
- vpadal.s16 q13, q1 ;sum
- vmlal.s16 q14, d2, d2 ;sse
- vmlal.s16 q15, d3, d3
-
- vpadal.s16 q13, q5 ;sum
- vmlal.s16 q14, d10, d10 ;sse
- vmlal.s16 q15, d11, d11
-
- vmov q0, q4
-
- vpadal.s16 q13, q6 ;sum
- vmlal.s16 q14, d12, d12 ;sse
- vmlal.s16 q15, d13, d13
-
- bne vp8_filt16x16s_4_4_loop_neon
-
- vadd.u32 q15, q14, q15 ;accumulate sse
- vpaddl.s32 q0, q13 ;accumulate sum
-
- vpaddl.u32 q1, q15
- vadd.s64 d0, d0, d1
- vadd.u64 d1, d2, d3
-
- vmull.s32 q5, d0, d0
- vst1.32 {d1[0]}, [lr] ;store sse
- vshr.s32 d10, d10, #8
- vsub.s32 d0, d1, d10
-
- vmov.32 r0, d0[0] ;return
- pop {pc}
- ENDP
-
-;==============================
-; r0 unsigned char *src_ptr,
-; r1 int src_pixels_per_line,
-; r2 int xoffset,
-; r3 int yoffset,
-; stack unsigned char *dst_ptr,
-; stack int dst_pixels_per_line,
-; stack unsigned int *sse
-;note: in vp8_find_best_half_pixel_step() (called when 8 < Speed < 15) and in the first call of
-;vp8_find_best_sub_pixel_step() (called when Speed <= 8), xoffset/yoffset can only be 4 or 0.
-;That means the filter is either bypassed or its coefficients are {64, 64}; this simplified
-;routine only works in that situation.
-;note: it can happen that both xoffset and yoffset are zero; that case can be handled in C code later.
-
-|vp9_sub_pixel_variance16x16s_neon| PROC
- push {r4, lr}
-
- ldr r4, [sp, #8] ;load *dst_ptr from stack
- ldr r12, [sp, #12] ;load dst_pixels_per_line from stack
- ldr lr, [sp, #16] ;load *sse from stack
-
- cmp r2, #0 ;skip first_pass filter if xoffset=0
- beq secondpass_bfilter16x16s_only
-
- cmp r3, #0 ;skip second_pass filter if yoffset=0
- beq firstpass_bfilter16x16s_only
-
- vld1.u8 {d0, d1, d2, d3}, [r0], r1 ;load src data
- sub sp, sp, #256 ;reserve space on stack for temporary storage
- vext.8 q1, q0, q1, #1 ;construct src_ptr[1]
- mov r3, sp
- mov r2, #4 ;loop counter
- vrhadd.u8 q0, q0, q1 ;(src_ptr[0]+src_ptr[1])/round/shift right 1
-
-;First Pass: output_height lines x output_width columns (17x16)
-vp8e_filt_blk2d_fp16x16s_loop_neon
- vld1.u8 {d4, d5, d6, d7}, [r0], r1
- vld1.u8 {d8, d9, d10, d11}, [r0], r1
- vld1.u8 {d12, d13, d14, d15}, [r0], r1
- vld1.u8 {d16, d17, d18, d19}, [r0], r1
-
- ;pld [r0]
- ;pld [r0, r1]
- ;pld [r0, r1, lsl #1]
-
- vext.8 q3, q2, q3, #1 ;construct src_ptr[1]
- vext.8 q5, q4, q5, #1
- vext.8 q7, q6, q7, #1
- vext.8 q9, q8, q9, #1
-
- vrhadd.u8 q1, q2, q3 ;(src_ptr[0]+src_ptr[1])/round/shift right 1
- vrhadd.u8 q2, q4, q5
- vrhadd.u8 q3, q6, q7
- vrhadd.u8 q4, q8, q9
-
- vrhadd.u8 q0, q0, q1
- vrhadd.u8 q1, q1, q2
- vrhadd.u8 q2, q2, q3
- vrhadd.u8 q3, q3, q4
-
- subs r2, r2, #1
- vst1.u8 {d0, d1 ,d2, d3}, [r3]! ;store result
- vmov q0, q4
- vst1.u8 {d4, d5, d6, d7}, [r3]!
-
- bne vp8e_filt_blk2d_fp16x16s_loop_neon
-
- b sub_pixel_variance16x16s_neon
-
-;--------------------
-firstpass_bfilter16x16s_only
- mov r2, #2 ;loop counter
- sub sp, sp, #256 ;reserve space on stack for temporary storage
- mov r3, sp
-
-;First Pass: output_height lines x output_width columns (16x16)
-vp8e_filt_blk2d_fpo16x16s_loop_neon
- vld1.u8 {d0, d1, d2, d3}, [r0], r1 ;load src data
- vld1.u8 {d4, d5, d6, d7}, [r0], r1
- vld1.u8 {d8, d9, d10, d11}, [r0], r1
- vld1.u8 {d12, d13, d14, d15}, [r0], r1
-
- ;pld [r0]
- ;pld [r0, r1]
- ;pld [r0, r1, lsl #1]
-
- vext.8 q1, q0, q1, #1 ;construct src_ptr[1]
- vld1.u8 {d16, d17, d18, d19}, [r0], r1
- vext.8 q3, q2, q3, #1
- vld1.u8 {d20, d21, d22, d23}, [r0], r1
- vext.8 q5, q4, q5, #1
- vld1.u8 {d24, d25, d26, d27}, [r0], r1
- vext.8 q7, q6, q7, #1
- vld1.u8 {d28, d29, d30, d31}, [r0], r1
- vext.8 q9, q8, q9, #1
- vext.8 q11, q10, q11, #1
- vext.8 q13, q12, q13, #1
- vext.8 q15, q14, q15, #1
-
- vrhadd.u8 q0, q0, q1 ;(src_ptr[0]+src_ptr[1])/round/shift right 1
- vrhadd.u8 q1, q2, q3
- vrhadd.u8 q2, q4, q5
- vrhadd.u8 q3, q6, q7
- vrhadd.u8 q4, q8, q9
- vrhadd.u8 q5, q10, q11
- vrhadd.u8 q6, q12, q13
- vrhadd.u8 q7, q14, q15
-
- subs r2, r2, #1
-
- vst1.u8 {d0, d1, d2, d3}, [r3]! ;store result
- vst1.u8 {d4, d5, d6, d7}, [r3]!
- vst1.u8 {d8, d9, d10, d11}, [r3]!
- vst1.u8 {d12, d13, d14, d15}, [r3]!
-
- bne vp8e_filt_blk2d_fpo16x16s_loop_neon
-
- b sub_pixel_variance16x16s_neon
-
-;---------------------
-secondpass_bfilter16x16s_only
- sub sp, sp, #256 ;reserve space on stack for temporary storage
-
- mov r2, #2 ;loop counter
- vld1.u8 {d0, d1}, [r0], r1 ;load src data
- mov r3, sp
-
-vp8e_filt_blk2d_spo16x16s_loop_neon
- vld1.u8 {d2, d3}, [r0], r1
- vld1.u8 {d4, d5}, [r0], r1
- vld1.u8 {d6, d7}, [r0], r1
- vld1.u8 {d8, d9}, [r0], r1
-
- vrhadd.u8 q0, q0, q1
- vld1.u8 {d10, d11}, [r0], r1
- vrhadd.u8 q1, q1, q2
- vld1.u8 {d12, d13}, [r0], r1
- vrhadd.u8 q2, q2, q3
- vld1.u8 {d14, d15}, [r0], r1
- vrhadd.u8 q3, q3, q4
- vld1.u8 {d16, d17}, [r0], r1
- vrhadd.u8 q4, q4, q5
- vrhadd.u8 q5, q5, q6
- vrhadd.u8 q6, q6, q7
- vrhadd.u8 q7, q7, q8
-
- subs r2, r2, #1
-
- vst1.u8 {d0, d1, d2, d3}, [r3]! ;store result
- vmov q0, q8
- vst1.u8 {d4, d5, d6, d7}, [r3]!
- vst1.u8 {d8, d9, d10, d11}, [r3]! ;store result
- vst1.u8 {d12, d13, d14, d15}, [r3]!
-
- bne vp8e_filt_blk2d_spo16x16s_loop_neon
-
- b sub_pixel_variance16x16s_neon
-
-;----------------------------
-;variance16x16
-sub_pixel_variance16x16s_neon
- vmov.i8 q8, #0 ;q8 - sum
- vmov.i8 q9, #0 ;q9, q10 - sse
- vmov.i8 q10, #0
-
- sub r3, r3, #256
- mov r2, #4
-
-sub_pixel_variance16x16s_neon_loop
- vld1.8 {q0}, [r3]! ;Load up source and reference
- vld1.8 {q1}, [r4], r12
- vld1.8 {q2}, [r3]!
- vld1.8 {q3}, [r4], r12
- vld1.8 {q4}, [r3]!
- vld1.8 {q5}, [r4], r12
- vld1.8 {q6}, [r3]!
- vld1.8 {q7}, [r4], r12
-
- vsubl.u8 q11, d0, d2 ;diff
- vsubl.u8 q12, d1, d3
- vsubl.u8 q13, d4, d6
- vsubl.u8 q14, d5, d7
- vsubl.u8 q0, d8, d10
- vsubl.u8 q1, d9, d11
- vsubl.u8 q2, d12, d14
- vsubl.u8 q3, d13, d15
-
- vpadal.s16 q8, q11 ;sum
- vmlal.s16 q9, d22, d22 ;sse
- vmlal.s16 q10, d23, d23
-
- subs r2, r2, #1
-
- vpadal.s16 q8, q12
- vmlal.s16 q9, d24, d24
- vmlal.s16 q10, d25, d25
- vpadal.s16 q8, q13
- vmlal.s16 q9, d26, d26
- vmlal.s16 q10, d27, d27
- vpadal.s16 q8, q14
- vmlal.s16 q9, d28, d28
- vmlal.s16 q10, d29, d29
-
- vpadal.s16 q8, q0 ;sum
- vmlal.s16 q9, d0, d0 ;sse
- vmlal.s16 q10, d1, d1
- vpadal.s16 q8, q1
- vmlal.s16 q9, d2, d2
- vmlal.s16 q10, d3, d3
- vpadal.s16 q8, q2
- vmlal.s16 q9, d4, d4
- vmlal.s16 q10, d5, d5
- vpadal.s16 q8, q3
- vmlal.s16 q9, d6, d6
- vmlal.s16 q10, d7, d7
-
- bne sub_pixel_variance16x16s_neon_loop
-
- vadd.u32 q10, q9, q10 ;accumulate sse
- vpaddl.s32 q0, q8 ;accumulate sum
-
- vpaddl.u32 q1, q10
- vadd.s64 d0, d0, d1
- vadd.u64 d1, d2, d3
-
- vmull.s32 q5, d0, d0
- vst1.32 {d1[0]}, [lr] ;store sse
- vshr.s32 d10, d10, #8
- vsub.s32 d0, d1, d10
-
- add sp, sp, #256
- vmov.32 r0, d0[0] ;return
-
- pop {r4, pc}
- ENDP
-
- END
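The half-pixel variants deleted above differ from the plain variance kernel only in a pre-filter step: each predicted pixel is the rounded average of two neighbours (what vrhadd.u8 computes), taken horizontally, vertically, or both. A sketch of the horizontal case, with an illustrative name:

/* One 16-pixel row of horizontal half-pel prediction: (a + b + 1) >> 1.
   The vertical variant averages src[c] with src[c + stride]; the hv variant
   applies the horizontal pass and then averages consecutive filtered rows. */
static void half_pix_h_row_sketch(const unsigned char *src, unsigned char *pred) {
  int c;
  for (c = 0; c < 16; c++)
    pred[c] = (unsigned char)((src[c] + src[c + 1] + 1) >> 1);
}

The variance of the filtered block against the reference is then accumulated exactly as in the plain variance kernels.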
--- a/vp8/encoder/arm/neon/vp8_subpixelvariance8x8_neon.asm
+++ /dev/null
@@ -1,224 +1,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
- EXPORT |vp9_sub_pixel_variance8x8_neon|
- ARM
- REQUIRE8
- PRESERVE8
-
- AREA ||.text||, CODE, READONLY, ALIGN=2
-; r0 unsigned char *src_ptr,
-; r1 int src_pixels_per_line,
-; r2 int xoffset,
-; r3 int yoffset,
-; stack(r4) unsigned char *dst_ptr,
-; stack(r5) int dst_pixels_per_line,
-; stack(r6) unsigned int *sse
-;note: most of the code is copied from bilinear_predict8x8_neon and vp9_variance8x8_neon.
-
-|vp9_sub_pixel_variance8x8_neon| PROC
- push {r4-r5, lr}
-
- ldr r12, _BilinearTaps_coeff_
- ldr r4, [sp, #12] ;load *dst_ptr from stack
- ldr r5, [sp, #16] ;load dst_pixels_per_line from stack
- ldr lr, [sp, #20] ;load *sse from stack
-
- cmp r2, #0 ;skip first_pass filter if xoffset=0
- beq skip_firstpass_filter
-
-;First pass: output_height lines x output_width columns (9x8)
- add r2, r12, r2, lsl #3 ;calculate filter location
-
- vld1.u8 {q1}, [r0], r1 ;load src data
- vld1.u32 {d31}, [r2] ;load first_pass filter
- vld1.u8 {q2}, [r0], r1
- vdup.8 d0, d31[0] ;first_pass filter (d0 d1)
- vld1.u8 {q3}, [r0], r1
- vdup.8 d1, d31[4]
- vld1.u8 {q4}, [r0], r1
-
- vmull.u8 q6, d2, d0 ;(src_ptr[0] * Filter[0])
- vmull.u8 q7, d4, d0
- vmull.u8 q8, d6, d0
- vmull.u8 q9, d8, d0
-
-    vext.8          d3, d2, d3, #1          ;construct src_ptr[1]
- vext.8 d5, d4, d5, #1
- vext.8 d7, d6, d7, #1
- vext.8 d9, d8, d9, #1
-
- vmlal.u8 q6, d3, d1 ;(src_ptr[1] * Filter[1])
- vmlal.u8 q7, d5, d1
- vmlal.u8 q8, d7, d1
- vmlal.u8 q9, d9, d1
-
- vld1.u8 {q1}, [r0], r1 ;load src data
- vqrshrn.u16 d22, q6, #7 ;shift/round/saturate to u8
- vld1.u8 {q2}, [r0], r1
- vqrshrn.u16 d23, q7, #7
- vld1.u8 {q3}, [r0], r1
- vqrshrn.u16 d24, q8, #7
- vld1.u8 {q4}, [r0], r1
- vqrshrn.u16 d25, q9, #7
-
-    ;first_pass filtering on the remaining 5 lines of data
- vld1.u8 {q5}, [r0], r1
-
- vmull.u8 q6, d2, d0 ;(src_ptr[0] * Filter[0])
- vmull.u8 q7, d4, d0
- vmull.u8 q8, d6, d0
- vmull.u8 q9, d8, d0
- vmull.u8 q10, d10, d0
-
-    vext.8          d3, d2, d3, #1          ;construct src_ptr[1]
- vext.8 d5, d4, d5, #1
- vext.8 d7, d6, d7, #1
- vext.8 d9, d8, d9, #1
- vext.8 d11, d10, d11, #1
-
- vmlal.u8 q6, d3, d1 ;(src_ptr[1] * Filter[1])
- vmlal.u8 q7, d5, d1
- vmlal.u8 q8, d7, d1
- vmlal.u8 q9, d9, d1
- vmlal.u8 q10, d11, d1
-
- vqrshrn.u16 d26, q6, #7 ;shift/round/saturate to u8
- vqrshrn.u16 d27, q7, #7
- vqrshrn.u16 d28, q8, #7
- vqrshrn.u16 d29, q9, #7
- vqrshrn.u16 d30, q10, #7
-
-;Second pass: 8x8
-secondpass_filter
- cmp r3, #0 ;skip second_pass filter if yoffset=0
- ;skip_secondpass_filter
- beq sub_pixel_variance8x8_neon
-
- add r3, r12, r3, lsl #3
-
- vld1.u32 {d31}, [r3] ;load second_pass filter
-
- vdup.8 d0, d31[0] ;second_pass filter parameters (d0 d1)
- vdup.8 d1, d31[4]
-
- vmull.u8 q1, d22, d0 ;(src_ptr[0] * Filter[0])
- vmull.u8 q2, d23, d0
- vmull.u8 q3, d24, d0
- vmull.u8 q4, d25, d0
- vmull.u8 q5, d26, d0
- vmull.u8 q6, d27, d0
- vmull.u8 q7, d28, d0
- vmull.u8 q8, d29, d0
-
- vmlal.u8 q1, d23, d1 ;(src_ptr[pixel_step] * Filter[1])
- vmlal.u8 q2, d24, d1
- vmlal.u8 q3, d25, d1
- vmlal.u8 q4, d26, d1
- vmlal.u8 q5, d27, d1
- vmlal.u8 q6, d28, d1
- vmlal.u8 q7, d29, d1
- vmlal.u8 q8, d30, d1
-
- vqrshrn.u16 d22, q1, #7 ;shift/round/saturate to u8
- vqrshrn.u16 d23, q2, #7
- vqrshrn.u16 d24, q3, #7
- vqrshrn.u16 d25, q4, #7
- vqrshrn.u16 d26, q5, #7
- vqrshrn.u16 d27, q6, #7
- vqrshrn.u16 d28, q7, #7
- vqrshrn.u16 d29, q8, #7
-
- b sub_pixel_variance8x8_neon
-
-;--------------------
-skip_firstpass_filter
- vld1.u8 {d22}, [r0], r1 ;load src data
- vld1.u8 {d23}, [r0], r1
- vld1.u8 {d24}, [r0], r1
- vld1.u8 {d25}, [r0], r1
- vld1.u8 {d26}, [r0], r1
- vld1.u8 {d27}, [r0], r1
- vld1.u8 {d28}, [r0], r1
- vld1.u8 {d29}, [r0], r1
- vld1.u8 {d30}, [r0], r1
-
- b secondpass_filter
-
-;----------------------
-;vp9_variance8x8_neon
-sub_pixel_variance8x8_neon
- vmov.i8 q8, #0 ;q8 - sum
- vmov.i8 q9, #0 ;q9, q10 - sse
- vmov.i8 q10, #0
-
- mov r12, #2
-
-sub_pixel_variance8x8_neon_loop
- vld1.8 {d0}, [r4], r5 ;load dst data
- subs r12, r12, #1
- vld1.8 {d1}, [r4], r5
- vld1.8 {d2}, [r4], r5
- vsubl.u8 q4, d22, d0 ;calculate diff
- vld1.8 {d3}, [r4], r5
-
- vsubl.u8 q5, d23, d1
- vsubl.u8 q6, d24, d2
-
- vpadal.s16 q8, q4 ;sum
- vmlal.s16 q9, d8, d8 ;sse
- vmlal.s16 q10, d9, d9
-
- vsubl.u8 q7, d25, d3
-
- vpadal.s16 q8, q5
- vmlal.s16 q9, d10, d10
- vmlal.s16 q10, d11, d11
-
- vmov q11, q13
-
- vpadal.s16 q8, q6
- vmlal.s16 q9, d12, d12
- vmlal.s16 q10, d13, d13
-
- vmov q12, q14
-
- vpadal.s16 q8, q7
- vmlal.s16 q9, d14, d14
- vmlal.s16 q10, d15, d15
-
- bne sub_pixel_variance8x8_neon_loop
-
- vadd.u32 q10, q9, q10 ;accumulate sse
- vpaddl.s32 q0, q8 ;accumulate sum
-
- vpaddl.u32 q1, q10
- vadd.s64 d0, d0, d1
- vadd.u64 d1, d2, d3
-
- vmull.s32 q5, d0, d0
- vst1.32 {d1[0]}, [lr] ;store sse
- vshr.s32 d10, d10, #6
- vsub.s32 d0, d1, d10
-
- vmov.32 r0, d0[0] ;return
- pop {r4-r5, pc}
-
- ENDP
-
-;-----------------
-
-_BilinearTaps_coeff_
- DCD bilinear_taps_coeff
-bilinear_taps_coeff
- DCD 128, 0, 112, 16, 96, 32, 80, 48, 64, 64, 48, 80, 32, 96, 16, 112
-
- END
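The bilinear_taps_coeff table above holds the pairs {128 - 16*offset, 16*offset} for offsets 0..7, and every first- or second-pass output pixel in these sub-pixel routines is a rounded 7-bit blend of two pixels (the vqrshrn.u16 #7 step). A one-line sketch of that tap, with an illustrative name:

/* Bilinear tap as used by the deleted sub-pixel variance code:
   Filter[0] = 128 - 16*offset, Filter[1] = 16*offset, result rounded by (+64) >> 7. */
static unsigned char bilinear_tap_sketch(unsigned char a, unsigned char b, int offset) {
  int f0 = 128 - (offset << 4);
  int f1 = offset << 4;
  return (unsigned char)((a * f0 + b * f1 + 64) >> 7);
}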
--- a/vp8/encoder/arm/quantize_arm.c
+++ /dev/null
@@ -1,59 +1,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include <math.h>
-#include "vpx_mem/vpx_mem.h"
-
-#include "vp8/encoder/quantize.h"
-#include "vp8/common/entropy.h"
-
-
-#if HAVE_ARMV7
-
-/* The vp8_quantize_mbX functions here differ from the corresponding ones in
- * quantize.c only by using the quantize_b_pair function pointer instead of
- * the regular quantize_b function pointer. */
-void vp8_quantize_mby_neon(MACROBLOCK *x) {
- int i;
- int has_2nd_order = (x->e_mbd.mode_info_context->mbmi.mode != B_PRED
- && x->e_mbd.mode_info_context->mbmi.mode != SPLITMV);
-
- for (i = 0; i < 16; i += 2)
- x->quantize_b_pair(&x->block[i], &x->block[i + 1],
- &x->e_mbd.block[i], &x->e_mbd.block[i + 1]);
-
- if (has_2nd_order)
- x->quantize_b(&x->block[24], &x->e_mbd.block[24]);
-}
-
-void vp8_quantize_mb_neon(MACROBLOCK *x) {
- int i;
- int has_2nd_order = (x->e_mbd.mode_info_context->mbmi.mode != B_PRED
- && x->e_mbd.mode_info_context->mbmi.mode != SPLITMV);
-
- for (i = 0; i < 24; i += 2)
- x->quantize_b_pair(&x->block[i], &x->block[i + 1],
- &x->e_mbd.block[i], &x->e_mbd.block[i + 1]);
-
- if (has_2nd_order)
- x->quantize_b(&x->block[i], &x->e_mbd.block[i]);
-}
-
-
-void vp8_quantize_mbuv_neon(MACROBLOCK *x) {
- int i;
-
- for (i = 16; i < 24; i += 2)
- x->quantize_b_pair(&x->block[i], &x->block[i + 1],
- &x->e_mbd.block[i], &x->e_mbd.block[i + 1]);
-}
-
-#endif /* HAVE_ARMV7 */
--- a/vp8/encoder/arm/quantize_arm.h
+++ /dev/null
@@ -1,52 +1,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#ifndef QUANTIZE_ARM_H
-#define QUANTIZE_ARM_H
-
-#if HAVE_ARMV6
-
-extern prototype_quantize_block(vp8_fast_quantize_b_armv6);
-
-#if !CONFIG_RUNTIME_CPU_DETECT
-#undef vp8_quantize_fastquantb
-#define vp8_quantize_fastquantb vp8_fast_quantize_b_armv6
-#endif
-
-#endif /* HAVE_ARMV6 */
-
-
-#if HAVE_ARMV7
-
-extern prototype_quantize_block(vp8_fast_quantize_b_neon);
-extern prototype_quantize_block_pair(vp8_fast_quantize_b_pair_neon);
-
-#if !CONFIG_RUNTIME_CPU_DETECT
-#undef vp8_quantize_fastquantb
-#define vp8_quantize_fastquantb vp8_fast_quantize_b_neon
-
-#undef vp8_quantize_fastquantb_pair
-#define vp8_quantize_fastquantb_pair vp8_fast_quantize_b_pair_neon
-
-#undef vp8_quantize_mb
-#define vp8_quantize_mb vp8_quantize_mb_neon
-
-#undef vp8_quantize_mbuv
-#define vp8_quantize_mbuv vp8_quantize_mbuv_neon
-
-#undef vp8_quantize_mby
-#define vp8_quantize_mby vp8_quantize_mby_neon
-#endif
-
-#endif /* HAVE_ARMV7 */
-
-#endif
-
--- a/vp8/encoder/arm/variance_arm.c
+++ /dev/null
@@ -1,112 +1,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "vpx_config.h"
-#include "vp8/encoder/variance.h"
-#include "vp8/common/filter.h"
-#include "vp8/common/arm/bilinearfilter_arm.h"
-
-#define HALFNDX 8
-
-#if HAVE_ARMV6
-
-unsigned int vp9_sub_pixel_variance8x8_armv6
-(
- const unsigned char *src_ptr,
- int src_pixels_per_line,
- int xoffset,
- int yoffset,
- const unsigned char *dst_ptr,
- int dst_pixels_per_line,
- unsigned int *sse
-) {
- unsigned short first_pass[10 * 8];
- unsigned char second_pass[8 * 8];
- const short *HFilter, *VFilter;
-
- HFilter = vp8_bilinear_filters[xoffset];
- VFilter = vp8_bilinear_filters[yoffset];
-
- vp9_filter_block2d_bil_first_pass_armv6(src_ptr, first_pass,
- src_pixels_per_line,
- 9, 8, HFilter);
- vp9_filter_block2d_bil_second_pass_armv6(first_pass, second_pass,
- 8, 8, 8, VFilter);
-
- return vp9_variance8x8_armv6(second_pass, 8, dst_ptr,
- dst_pixels_per_line, sse);
-}
-
-unsigned int vp9_sub_pixel_variance16x16_armv6
-(
- const unsigned char *src_ptr,
- int src_pixels_per_line,
- int xoffset,
- int yoffset,
- const unsigned char *dst_ptr,
- int dst_pixels_per_line,
- unsigned int *sse
-) {
- unsigned short first_pass[36 * 16];
- unsigned char second_pass[20 * 16];
- const short *HFilter, *VFilter;
- unsigned int var;
-
- if (xoffset == HALFNDX && yoffset == 0) {
- var = vp9_variance_halfpixvar16x16_h_armv6(src_ptr, src_pixels_per_line,
- dst_ptr, dst_pixels_per_line, sse);
- } else if (xoffset == 0 && yoffset == HALFNDX) {
- var = vp9_variance_halfpixvar16x16_v_armv6(src_ptr, src_pixels_per_line,
- dst_ptr, dst_pixels_per_line, sse);
- } else if (xoffset == HALFNDX && yoffset == HALFNDX) {
- var = vp9_variance_halfpixvar16x16_hv_armv6(src_ptr, src_pixels_per_line,
- dst_ptr, dst_pixels_per_line, sse);
- } else {
- HFilter = vp8_bilinear_filters[xoffset];
- VFilter = vp8_bilinear_filters[yoffset];
-
- vp9_filter_block2d_bil_first_pass_armv6(src_ptr, first_pass,
- src_pixels_per_line,
- 17, 16, HFilter);
- vp9_filter_block2d_bil_second_pass_armv6(first_pass, second_pass,
- 16, 16, 16, VFilter);
-
- var = vp9_variance16x16_armv6(second_pass, 16, dst_ptr,
- dst_pixels_per_line, sse);
- }
- return var;
-}
-
-#endif /* HAVE_ARMV6 */
-
-
-#if HAVE_ARMV7
-
-unsigned int vp9_sub_pixel_variance16x16_neon
-(
- const unsigned char *src_ptr,
- int src_pixels_per_line,
- int xoffset,
- int yoffset,
- const unsigned char *dst_ptr,
- int dst_pixels_per_line,
- unsigned int *sse
-) {
- if (xoffset == HALFNDX && yoffset == 0)
- return vp9_variance_halfpixvar16x16_h_neon(src_ptr, src_pixels_per_line, dst_ptr, dst_pixels_per_line, sse);
- else if (xoffset == 0 && yoffset == HALFNDX)
- return vp9_variance_halfpixvar16x16_v_neon(src_ptr, src_pixels_per_line, dst_ptr, dst_pixels_per_line, sse);
- else if (xoffset == HALFNDX && yoffset == HALFNDX)
- return vp9_variance_halfpixvar16x16_hv_neon(src_ptr, src_pixels_per_line, dst_ptr, dst_pixels_per_line, sse);
- else
- return vp9_sub_pixel_variance16x16_neon_func(src_ptr, src_pixels_per_line, xoffset, yoffset, dst_ptr, dst_pixels_per_line, sse);
-}
-
-#endif
--- a/vp8/encoder/arm/variance_arm.h
+++ /dev/null
@@ -1,132 +1,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#ifndef VARIANCE_ARM_H
-#define VARIANCE_ARM_H
-
-#if HAVE_ARMV6
-
-extern prototype_sad(vp9_sad16x16_armv6);
-extern prototype_variance(vp9_variance16x16_armv6);
-extern prototype_variance(vp9_variance8x8_armv6);
-extern prototype_subpixvariance(vp9_sub_pixel_variance16x16_armv6);
-extern prototype_subpixvariance(vp9_sub_pixel_variance8x8_armv6);
-extern prototype_variance(vp9_variance_halfpixvar16x16_h_armv6);
-extern prototype_variance(vp9_variance_halfpixvar16x16_v_armv6);
-extern prototype_variance(vp9_variance_halfpixvar16x16_hv_armv6);
-extern prototype_variance(vp9_mse16x16_armv6);
-
-#if !CONFIG_RUNTIME_CPU_DETECT
-
-#undef vp9_variance_sad16x16
-#define vp9_variance_sad16x16 vp9_sad16x16_armv6
-
-#undef vp9_variance_subpixvar16x16
-#define vp9_variance_subpixvar16x16 vp9_sub_pixel_variance16x16_armv6
-
-#undef vp9_variance_subpixvar8x8
-#define vp9_variance_subpixvar8x8 vp9_sub_pixel_variance8x8_armv6
-
-#undef vp9_variance_var16x16
-#define vp9_variance_var16x16 vp9_variance16x16_armv6
-
-#undef vp9_variance_mse16x16
-#define vp9_variance_mse16x16 vp9_mse16x16_armv6
-
-#undef vp9_variance_var8x8
-#define vp9_variance_var8x8 vp9_variance8x8_armv6
-
-#undef vp9_variance_halfpixvar16x16_h
-#define vp9_variance_halfpixvar16x16_h vp9_variance_halfpixvar16x16_h_armv6
-
-#undef vp9_variance_halfpixvar16x16_v
-#define vp9_variance_halfpixvar16x16_v vp9_variance_halfpixvar16x16_v_armv6
-
-#undef vp9_variance_halfpixvar16x16_hv
-#define vp9_variance_halfpixvar16x16_hv vp9_variance_halfpixvar16x16_hv_armv6
-
-#endif /* !CONFIG_RUNTIME_CPU_DETECT */
-
-#endif /* HAVE_ARMV6 */
-
-
-#if HAVE_ARMV7
-extern prototype_sad(vp9_sad4x4_neon);
-extern prototype_sad(vp9_sad8x8_neon);
-extern prototype_sad(vp9_sad8x16_neon);
-extern prototype_sad(vp9_sad16x8_neon);
-extern prototype_sad(vp9_sad16x16_neon);
-
-extern prototype_variance(vp9_variance8x8_neon);
-extern prototype_variance(vp9_variance8x16_neon);
-extern prototype_variance(vp9_variance16x8_neon);
-extern prototype_variance(vp9_variance16x16_neon);
-
-extern prototype_subpixvariance(vp9_sub_pixel_variance8x8_neon);
-extern prototype_subpixvariance(vp9_sub_pixel_variance16x16_neon);
-extern prototype_subpixvariance(vp9_sub_pixel_variance16x16_neon_func);
-extern prototype_variance(vp9_variance_halfpixvar16x16_h_neon);
-extern prototype_variance(vp9_variance_halfpixvar16x16_v_neon);
-extern prototype_variance(vp9_variance_halfpixvar16x16_hv_neon);
-
-extern prototype_variance(vp9_mse16x16_neon);
-
-#if !CONFIG_RUNTIME_CPU_DETECT
-#undef vp9_variance_sad4x4
-#define vp9_variance_sad4x4 vp9_sad4x4_neon
-
-#undef vp9_variance_sad8x8
-#define vp9_variance_sad8x8 vp9_sad8x8_neon
-
-#undef vp9_variance_sad8x16
-#define vp9_variance_sad8x16 vp9_sad8x16_neon
-
-#undef vp9_variance_sad16x8
-#define vp9_variance_sad16x8 vp9_sad16x8_neon
-
-#undef vp9_variance_sad16x16
-#define vp9_variance_sad16x16 vp9_sad16x16_neon
-
-#undef vp9_variance_var8x8
-#define vp9_variance_var8x8 vp9_variance8x8_neon
-
-#undef vp9_variance_var8x16
-#define vp9_variance_var8x16 vp9_variance8x16_neon
-
-#undef vp9_variance_var16x8
-#define vp9_variance_var16x8 vp9_variance16x8_neon
-
-#undef vp9_variance_var16x16
-#define vp9_variance_var16x16 vp9_variance16x16_neon
-
-#undef vp9_variance_subpixvar8x8
-#define vp9_variance_subpixvar8x8 vp9_sub_pixel_variance8x8_neon
-
-#undef vp9_variance_subpixvar16x16
-#define vp9_variance_subpixvar16x16 vp9_sub_pixel_variance16x16_neon
-
-#undef vp9_variance_halfpixvar16x16_h
-#define vp9_variance_halfpixvar16x16_h vp9_variance_halfpixvar16x16_h_neon
-
-#undef vp9_variance_halfpixvar16x16_v
-#define vp9_variance_halfpixvar16x16_v vp9_variance_halfpixvar16x16_v_neon
-
-#undef vp9_variance_halfpixvar16x16_hv
-#define vp9_variance_halfpixvar16x16_hv vp9_variance_halfpixvar16x16_hv_neon
-
-#undef vp9_variance_mse16x16
-#define vp9_variance_mse16x16 vp9_mse16x16_neon
-
-#endif
-
-#endif
-
-#endif
--- a/vp8/encoder/asm_enc_offsets.c
+++ /dev/null
@@ -1,90 +1,0 @@
-/*
- * Copyright (c) 2011 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include "vpx_ports/asm_offsets.h"
-#include "vpx_config.h"
-#include "block.h"
-#include "vp8/common/blockd.h"
-#include "onyx_int.h"
-#include "treewriter.h"
-#include "tokenize.h"
-
-BEGIN
-
-/* regular quantize */
-DEFINE(vp9_block_coeff, offsetof(BLOCK, coeff));
-DEFINE(vp9_block_zbin, offsetof(BLOCK, zbin));
-DEFINE(vp9_block_round, offsetof(BLOCK, round));
-DEFINE(vp9_block_quant, offsetof(BLOCK, quant));
-DEFINE(vp9_block_quant_fast, offsetof(BLOCK, quant_fast));
-DEFINE(vp9_block_zbin_extra, offsetof(BLOCK, zbin_extra));
-DEFINE(vp9_block_zrun_zbin_boost, offsetof(BLOCK, zrun_zbin_boost));
-DEFINE(vp9_block_quant_shift, offsetof(BLOCK, quant_shift));
-
-DEFINE(vp9_blockd_qcoeff, offsetof(BLOCKD, qcoeff));
-DEFINE(vp9_blockd_dequant, offsetof(BLOCKD, dequant));
-DEFINE(vp9_blockd_dqcoeff, offsetof(BLOCKD, dqcoeff));
-DEFINE(vp9_blockd_eob, offsetof(BLOCKD, eob));
-
-/* subtract */
-DEFINE(vp9_block_base_src, offsetof(BLOCK, base_src));
-DEFINE(vp9_block_src, offsetof(BLOCK, src));
-DEFINE(vp9_block_src_diff, offsetof(BLOCK, src_diff));
-DEFINE(vp9_block_src_stride, offsetof(BLOCK, src_stride));
-
-DEFINE(vp9_blockd_predictor, offsetof(BLOCKD, predictor));
-
-/* pack tokens */
-DEFINE(vp9_writer_lowvalue, offsetof(vp9_writer, lowvalue));
-DEFINE(vp9_writer_range, offsetof(vp9_writer, range));
-DEFINE(vp9_writer_value, offsetof(vp9_writer, value));
-DEFINE(vp9_writer_count, offsetof(vp9_writer, count));
-DEFINE(vp9_writer_pos, offsetof(vp9_writer, pos));
-DEFINE(vp9_writer_buffer, offsetof(vp9_writer, buffer));
-
-DEFINE(tokenextra_token, offsetof(TOKENEXTRA, Token));
-DEFINE(tokenextra_extra, offsetof(TOKENEXTRA, Extra));
-DEFINE(tokenextra_context_tree, offsetof(TOKENEXTRA, context_tree));
-DEFINE(tokenextra_skip_eob_node, offsetof(TOKENEXTRA, skip_eob_node));
-DEFINE(TOKENEXTRA_SZ, sizeof(TOKENEXTRA));
-
-DEFINE(vp9_extra_bit_struct_sz, sizeof(vp9_extra_bit_struct));
-
-DEFINE(vp9_token_value, offsetof(vp9_token, value));
-DEFINE(vp9_token_len, offsetof(vp9_token, Len));
-
-DEFINE(vp9_extra_bit_struct_tree, offsetof(vp9_extra_bit_struct, tree));
-DEFINE(vp9_extra_bit_struct_prob, offsetof(vp9_extra_bit_struct, prob));
-DEFINE(vp9_extra_bit_struct_len, offsetof(vp9_extra_bit_struct, Len));
-DEFINE(vp9_extra_bit_struct_base_val, offsetof(vp9_extra_bit_struct, base_val));
-
-DEFINE(vp9_comp_tplist, offsetof(VP9_COMP, tplist));
-DEFINE(vp9_comp_common, offsetof(VP9_COMP, common));
-
-DEFINE(tokenlist_start, offsetof(TOKENLIST, start));
-DEFINE(tokenlist_stop, offsetof(TOKENLIST, stop));
-DEFINE(TOKENLIST_SZ, sizeof(TOKENLIST));
-
-DEFINE(vp9_common_mb_rows, offsetof(VP9_COMMON, mb_rows));
-
-END
-
-/* add asserts for any offset that is not supported by assembly code
- * add asserts for any size that is not supported by assembly code
-
- * These are used in vp8cx_pack_tokens. They are hard coded so if their sizes
- * change they will have to be adjusted.
- */
-
-#if HAVE_ARMV5TE
-ct_assert(TOKENEXTRA_SZ, sizeof(TOKENEXTRA) == 8)
-ct_assert(vp9_extra_bit_struct_sz, sizeof(vp9_extra_bit_struct) == 16)
-#endif
--- a/vp8/encoder/bitstream.c
+++ /dev/null
@@ -1,2394 +1,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include "vp8/common/header.h"
-#include "encodemv.h"
-#include "vp8/common/entropymode.h"
-#include "vp8/common/findnearmv.h"
-#include "mcomp.h"
-#include "vp8/common/systemdependent.h"
-#include <assert.h>
-#include <stdio.h>
-#include <limits.h>
-#include "vp8/common/pragmas.h"
-#include "vpx/vpx_encoder.h"
-#include "vpx_mem/vpx_mem.h"
-#include "bitstream.h"
-#include "segmentation.h"
-
-#include "vp8/common/seg_common.h"
-#include "vp8/common/pred_common.h"
-#include "vp8/common/entropy.h"
-#include "vp8/encoder/encodemv.h"
-#include "vp8/common/entropymv.h"
-
-#if CONFIG_NEWBESTREFMV
-#include "vp8/common/mvref_common.h"
-#endif
-
-#if defined(SECTIONBITS_OUTPUT)
-unsigned __int64 Sectionbits[500];
-#endif
-
-#ifdef ENTROPY_STATS
-int intra_mode_stats [VP9_BINTRAMODES] [VP9_BINTRAMODES] [VP9_BINTRAMODES];
-unsigned int tree_update_hist [BLOCK_TYPES]
- [COEF_BANDS]
- [PREV_COEF_CONTEXTS]
- [ENTROPY_NODES][2];
-unsigned int hybrid_tree_update_hist [BLOCK_TYPES]
- [COEF_BANDS]
- [PREV_COEF_CONTEXTS]
- [ENTROPY_NODES][2];
-unsigned int tree_update_hist_8x8 [BLOCK_TYPES_8X8]
- [COEF_BANDS]
- [PREV_COEF_CONTEXTS]
- [ENTROPY_NODES] [2];
-unsigned int hybrid_tree_update_hist_8x8 [BLOCK_TYPES_8X8]
- [COEF_BANDS]
- [PREV_COEF_CONTEXTS]
- [ENTROPY_NODES] [2];
-unsigned int tree_update_hist_16x16 [BLOCK_TYPES_16X16]
- [COEF_BANDS]
- [PREV_COEF_CONTEXTS]
- [ENTROPY_NODES] [2];
-unsigned int hybrid_tree_update_hist_16x16 [BLOCK_TYPES_16X16]
- [COEF_BANDS]
- [PREV_COEF_CONTEXTS]
- [ENTROPY_NODES] [2];
-
-extern unsigned int active_section;
-#endif
-
-#ifdef MODE_STATS
-int count_mb_seg[4] = { 0, 0, 0, 0 };
-#endif
-
-#define vp9_cost_upd ((int)(vp9_cost_one(upd) - vp9_cost_zero(upd)) >> 8)
-#define vp9_cost_upd256 ((int)(vp9_cost_one(upd) - vp9_cost_zero(upd)))
-
-#define SEARCH_NEWP
-static int update_bits[255];
-
-static void compute_update_table() {
- int i;
- for (i = 0; i < 255; i++)
- update_bits[i] = vp9_count_term_subexp(i, SUBEXP_PARAM, 255);
-}
-
-static int split_index(int i, int n, int modulus) {
- int max1 = (n - 1 - modulus / 2) / modulus + 1;
- if (i % modulus == modulus / 2) i = i / modulus;
- else i = max1 + i - (i + modulus - modulus / 2) / modulus;
- return i;
-}
-
-static int remap_prob(int v, int m) {
- const int n = 256;
- const int modulus = MODULUS_PARAM;
- int i;
- if ((m << 1) <= n)
- i = vp9_recenter_nonneg(v, m) - 1;
- else
- i = vp9_recenter_nonneg(n - 1 - v, n - 1 - m) - 1;
-
- i = split_index(i, n - 1, modulus);
- return i;
-}
-
-static void write_prob_diff_update(vp9_writer *const bc,
- vp9_prob newp, vp9_prob oldp) {
- int delp = remap_prob(newp, oldp);
- vp9_encode_term_subexp(bc, delp, SUBEXP_PARAM, 255);
-}
-
-static int prob_diff_update_cost(vp9_prob newp, vp9_prob oldp) {
- int delp = remap_prob(newp, oldp);
- return update_bits[delp] * 256;
-}
-
-static void update_mode(
- vp9_writer *const bc,
- int n,
- vp9_token tok [/* n */],
- vp9_tree tree,
- vp9_prob Pnew [/* n-1 */],
- vp9_prob Pcur [/* n-1 */],
- unsigned int bct [/* n-1 */] [2],
- const unsigned int num_events[/* n */]
-) {
- unsigned int new_b = 0, old_b = 0;
- int i = 0;
-
- vp9_tree_probs_from_distribution(
- n--, tok, tree,
- Pnew, bct, num_events,
- 256, 1
- );
-
- do {
- new_b += cost_branch(bct[i], Pnew[i]);
- old_b += cost_branch(bct[i], Pcur[i]);
- } while (++i < n);
-
- if (new_b + (n << 8) < old_b) {
- int i = 0;
-
- vp9_write_bit(bc, 1);
-
- do {
- const vp9_prob p = Pnew[i];
-
- vp9_write_literal(bc, Pcur[i] = p ? p : 1, 8);
- } while (++i < n);
- } else
- vp9_write_bit(bc, 0);
-}
-
-static void update_mbintra_mode_probs(VP9_COMP* const cpi,
- vp9_writer* const bc) {
- VP9_COMMON *const cm = &cpi->common;
-
- {
- vp9_prob Pnew [VP9_YMODES - 1];
- unsigned int bct [VP9_YMODES - 1] [2];
-
- update_mode(
- bc, VP9_YMODES, vp9_ymode_encodings, vp9_ymode_tree,
- Pnew, cm->fc.ymode_prob, bct, (unsigned int *)cpi->ymode_count
- );
- }
-}
-
-static int get_prob(int num, int den) {
- int p;
- if (den <= 0)
- return 128;
- p = (num * 255 + (den >> 1)) / den;
- if (p > 255)
- return 255;
- else if (p < 1)
- return 1;
- return p;
-}
-
-static int get_binary_prob(int n0, int n1) {
- return get_prob(n0, n0 + n1);
-}
-
-void vp9_update_skip_probs(VP9_COMP *cpi) {
- VP9_COMMON *const pc = &cpi->common;
- int prob_skip_false[3] = {0, 0, 0};
- int k;
-
- for (k = 0; k < MBSKIP_CONTEXTS; ++k) {
- pc->mbskip_pred_probs[k] = get_binary_prob(cpi->skip_false_count[k],
- cpi->skip_true_count[k]);
- }
-}
-
-static void update_switchable_interp_probs(VP9_COMP *cpi,
- vp9_writer* const bc) {
- VP9_COMMON *const pc = &cpi->common;
- unsigned int branch_ct[32][2];
- int i, j;
- for (j = 0; j <= VP9_SWITCHABLE_FILTERS; ++j) {
- vp9_tree_probs_from_distribution(
- VP9_SWITCHABLE_FILTERS,
- vp9_switchable_interp_encodings, vp9_switchable_interp_tree,
- pc->fc.switchable_interp_prob[j], branch_ct,
- cpi->switchable_interp_count[j], 256, 1);
- for (i = 0; i < VP9_SWITCHABLE_FILTERS - 1; ++i) {
- if (pc->fc.switchable_interp_prob[j][i] < 1)
- pc->fc.switchable_interp_prob[j][i] = 1;
- vp9_write_literal(bc, pc->fc.switchable_interp_prob[j][i], 8);
- }
- }
-}
-
-// This function updates the reference frame prediction stats
-static void update_refpred_stats(VP9_COMP *cpi) {
- VP9_COMMON *const cm = &cpi->common;
- int i;
- int tot_count;
- vp9_prob new_pred_probs[PREDICTION_PROBS];
- int old_cost, new_cost;
-
- // Set the prediction probability structures to defaults
- if (cm->frame_type == KEY_FRAME) {
- // Set the prediction probabilities to defaults
- cm->ref_pred_probs[0] = 120;
- cm->ref_pred_probs[1] = 80;
- cm->ref_pred_probs[2] = 40;
-
- vpx_memset(cpi->ref_pred_probs_update, 0,
- sizeof(cpi->ref_pred_probs_update));
- } else {
- // From the prediction counts set the probabilities for each context
- for (i = 0; i < PREDICTION_PROBS; i++) {
- new_pred_probs[i] = get_binary_prob(cpi->ref_pred_count[i][0],
- cpi->ref_pred_count[i][1]);
-
- // Decide whether or not to update the reference frame probs.
- // Returned costs are in 1/256 bit units.
- old_cost =
- (cpi->ref_pred_count[i][0] * vp9_cost_zero(cm->ref_pred_probs[i])) +
- (cpi->ref_pred_count[i][1] * vp9_cost_one(cm->ref_pred_probs[i]));
-
- new_cost =
- (cpi->ref_pred_count[i][0] * vp9_cost_zero(new_pred_probs[i])) +
- (cpi->ref_pred_count[i][1] * vp9_cost_one(new_pred_probs[i]));
-
- // Cost saving must be >= 8 bits (2048 in these units)
- if ((old_cost - new_cost) >= 2048) {
- cpi->ref_pred_probs_update[i] = 1;
- cm->ref_pred_probs[i] = new_pred_probs[i];
- } else
- cpi->ref_pred_probs_update[i] = 0;
-
- }
- }
-}
-
-static void update_mvcount(VP9_COMP *cpi, MACROBLOCK *x,
- int_mv *best_ref_mv, int_mv *second_best_ref_mv) {
- MB_MODE_INFO * mbmi = &x->e_mbd.mode_info_context->mbmi;
- MV mv;
-
- if (mbmi->mode == SPLITMV) {
- int i;
-
- for (i = 0; i < x->partition_info->count; i++) {
- if (x->partition_info->bmi[i].mode == NEW4X4) {
- if (x->e_mbd.allow_high_precision_mv) {
- mv.row = (x->partition_info->bmi[i].mv.as_mv.row
- - best_ref_mv->as_mv.row);
- mv.col = (x->partition_info->bmi[i].mv.as_mv.col
- - best_ref_mv->as_mv.col);
- vp9_increment_nmv(&mv, &best_ref_mv->as_mv, &cpi->NMVcount, 1);
- if (x->e_mbd.mode_info_context->mbmi.second_ref_frame) {
- mv.row = (x->partition_info->bmi[i].second_mv.as_mv.row
- - second_best_ref_mv->as_mv.row);
- mv.col = (x->partition_info->bmi[i].second_mv.as_mv.col
- - second_best_ref_mv->as_mv.col);
- vp9_increment_nmv(&mv, &second_best_ref_mv->as_mv,
- &cpi->NMVcount, 1);
- }
- } else {
- mv.row = (x->partition_info->bmi[i].mv.as_mv.row
- - best_ref_mv->as_mv.row);
- mv.col = (x->partition_info->bmi[i].mv.as_mv.col
- - best_ref_mv->as_mv.col);
- vp9_increment_nmv(&mv, &best_ref_mv->as_mv, &cpi->NMVcount, 0);
- if (x->e_mbd.mode_info_context->mbmi.second_ref_frame) {
- mv.row = (x->partition_info->bmi[i].second_mv.as_mv.row
- - second_best_ref_mv->as_mv.row);
- mv.col = (x->partition_info->bmi[i].second_mv.as_mv.col
- - second_best_ref_mv->as_mv.col);
- vp9_increment_nmv(&mv, &second_best_ref_mv->as_mv,
- &cpi->NMVcount, 0);
- }
- }
- }
- }
- } else if (mbmi->mode == NEWMV) {
- if (x->e_mbd.allow_high_precision_mv) {
- mv.row = (mbmi->mv[0].as_mv.row - best_ref_mv->as_mv.row);
- mv.col = (mbmi->mv[0].as_mv.col - best_ref_mv->as_mv.col);
- vp9_increment_nmv(&mv, &best_ref_mv->as_mv, &cpi->NMVcount, 1);
- if (mbmi->second_ref_frame) {
- mv.row = (mbmi->mv[1].as_mv.row - second_best_ref_mv->as_mv.row);
- mv.col = (mbmi->mv[1].as_mv.col - second_best_ref_mv->as_mv.col);
- vp9_increment_nmv(&mv, &second_best_ref_mv->as_mv, &cpi->NMVcount, 1);
- }
- } else {
- mv.row = (mbmi->mv[0].as_mv.row - best_ref_mv->as_mv.row);
- mv.col = (mbmi->mv[0].as_mv.col - best_ref_mv->as_mv.col);
- vp9_increment_nmv(&mv, &best_ref_mv->as_mv, &cpi->NMVcount, 0);
- if (mbmi->second_ref_frame) {
- mv.row = (mbmi->mv[1].as_mv.row - second_best_ref_mv->as_mv.row);
- mv.col = (mbmi->mv[1].as_mv.col - second_best_ref_mv->as_mv.col);
- vp9_increment_nmv(&mv, &second_best_ref_mv->as_mv, &cpi->NMVcount, 0);
- }
- }
- }
-}
-
-static void write_ymode(vp9_writer *bc, int m, const vp9_prob *p) {
- write_token(bc, vp9_ymode_tree, p, vp9_ymode_encodings + m);
-}
-
-static void kfwrite_ymode(vp9_writer *bc, int m, const vp9_prob *p) {
- write_token(bc, vp9_kf_ymode_tree, p, vp9_kf_ymode_encodings + m);
-}
-
-#if CONFIG_SUPERBLOCKS
-static void sb_kfwrite_ymode(vp9_writer *bc, int m, const vp9_prob *p) {
- write_token(bc, vp9_uv_mode_tree, p, vp9_sb_kf_ymode_encodings + m);
-}
-#endif
-
-static void write_i8x8_mode(vp9_writer *bc, int m, const vp9_prob *p) {
- write_token(bc, vp9_i8x8_mode_tree, p, vp9_i8x8_mode_encodings + m);
-}
-
-static void write_uv_mode(vp9_writer *bc, int m, const vp9_prob *p) {
- write_token(bc, vp9_uv_mode_tree, p, vp9_uv_mode_encodings + m);
-}
-
-
-static void write_bmode(vp9_writer *bc, int m, const vp9_prob *p) {
- write_token(bc, vp9_bmode_tree, p, vp9_bmode_encodings + m);
-}
-
-static void write_split(vp9_writer *bc, int x, const vp9_prob *p) {
- write_token(bc, vp9_mbsplit_tree, p, vp9_mbsplit_encodings + x);
-}
-
-static int prob_update_savings(const unsigned int *ct,
- const vp9_prob oldp, const vp9_prob newp,
- const vp9_prob upd) {
- const int old_b = cost_branch256(ct, oldp);
- const int new_b = cost_branch256(ct, newp);
- const int update_b = 2048 + vp9_cost_upd256;
- return (old_b - new_b - update_b);
-}
-
-static int prob_diff_update_savings(const unsigned int *ct,
- const vp9_prob oldp, const vp9_prob newp,
- const vp9_prob upd) {
- const int old_b = cost_branch256(ct, oldp);
- const int new_b = cost_branch256(ct, newp);
- const int update_b = (newp == oldp ? 0 :
- prob_diff_update_cost(newp, oldp) + vp9_cost_upd256);
- return (old_b - new_b - update_b);
-}
-
-static int prob_diff_update_savings_search(const unsigned int *ct,
- const vp9_prob oldp, vp9_prob *bestp,
- const vp9_prob upd) {
- const int old_b = cost_branch256(ct, oldp);
- int new_b, update_b, savings, bestsavings, step;
- vp9_prob newp, bestnewp;
-
- bestsavings = 0;
- bestnewp = oldp;
-
- step = (*bestp > oldp ? -1 : 1);
- for (newp = *bestp; newp != oldp; newp += step) {
- new_b = cost_branch256(ct, newp);
- update_b = prob_diff_update_cost(newp, oldp) + vp9_cost_upd256;
- savings = old_b - new_b - update_b;
- if (savings > bestsavings) {
- bestsavings = savings;
- bestnewp = newp;
- }
- }
- *bestp = bestnewp;
- return bestsavings;
-}
-
-static void pack_mb_tokens(vp9_writer* const bc,
- TOKENEXTRA **tp,
- const TOKENEXTRA *const stop) {
- unsigned int split;
- unsigned int shift;
- int count = bc->count;
- unsigned int range = bc->range;
- unsigned int lowvalue = bc->lowvalue;
- TOKENEXTRA *p = *tp;
-
- while (p < stop) {
- const int t = p->Token;
- vp9_token *const a = vp9_coef_encodings + t;
- const vp9_extra_bit_struct *const b = vp9_extra_bits + t;
- int i = 0;
- const unsigned char *pp = p->context_tree;
- int v = a->value;
- int n = a->Len;
-
- if (t == EOSB_TOKEN)
- {
- ++p;
- break;
- }
-
- /* skip one or two nodes */
- if (p->skip_eob_node) {
- n -= p->skip_eob_node;
- i = 2 * p->skip_eob_node;
- }
-
- do {
- const int bb = (v >> --n) & 1;
- split = 1 + (((range - 1) * pp[i >> 1]) >> 8);
- i = vp9_coef_tree[i + bb];
-
- if (bb) {
- lowvalue += split;
- range = range - split;
- } else {
- range = split;
- }
-
- shift = vp9_norm[range];
- range <<= shift;
- count += shift;
-
- if (count >= 0) {
- int offset = shift - count;
-
- if ((lowvalue << (offset - 1)) & 0x80000000) {
- int x = bc->pos - 1;
-
- while (x >= 0 && bc->buffer[x] == 0xff) {
- bc->buffer[x] = (unsigned char)0;
- x--;
- }
-
- bc->buffer[x] += 1;
- }
-
- bc->buffer[bc->pos++] = (lowvalue >> (24 - offset));
- lowvalue <<= offset;
- shift = count;
- lowvalue &= 0xffffff;
- count -= 8;
- }
-
- lowvalue <<= shift;
- } while (n);
-
-
- if (b->base_val) {
- const int e = p->Extra, L = b->Len;
-
- if (L) {
- const unsigned char *pp = b->prob;
- int v = e >> 1;
- int n = L; /* number of bits in v, assumed nonzero */
- int i = 0;
-
- do {
- const int bb = (v >> --n) & 1;
- split = 1 + (((range - 1) * pp[i >> 1]) >> 8);
- i = b->tree[i + bb];
-
- if (bb) {
- lowvalue += split;
- range = range - split;
- } else {
- range = split;
- }
-
- shift = vp9_norm[range];
- range <<= shift;
- count += shift;
-
- if (count >= 0) {
- int offset = shift - count;
-
- if ((lowvalue << (offset - 1)) & 0x80000000) {
- int x = bc->pos - 1;
-
- while (x >= 0 && bc->buffer[x] == 0xff) {
- bc->buffer[x] = (unsigned char)0;
- x--;
- }
-
- bc->buffer[x] += 1;
- }
-
- bc->buffer[bc->pos++] = (lowvalue >> (24 - offset));
- lowvalue <<= offset;
- shift = count;
- lowvalue &= 0xffffff;
- count -= 8;
- }
-
- lowvalue <<= shift;
- } while (n);
- }
-
-
- {
-
- split = (range + 1) >> 1;
-
- if (e & 1) {
- lowvalue += split;
- range = range - split;
- } else {
- range = split;
- }
-
- range <<= 1;
-
- if ((lowvalue & 0x80000000)) {
- int x = bc->pos - 1;
-
- while (x >= 0 && bc->buffer[x] == 0xff) {
- bc->buffer[x] = (unsigned char)0;
- x--;
- }
-
- bc->buffer[x] += 1;
-
- }
-
- lowvalue <<= 1;
-
- if (!++count) {
- count = -8;
- bc->buffer[bc->pos++] = (lowvalue >> 24);
- lowvalue &= 0xffffff;
- }
- }
-
- }
- ++p;
- }
-
- bc->count = count;
- bc->lowvalue = lowvalue;
- bc->range = range;
- *tp = p;
-}
-
-static void write_partition_size(unsigned char *cx_data, int size) {
- signed char csize;
-
- csize = size & 0xff;
- *cx_data = csize;
- csize = (size >> 8) & 0xff;
- *(cx_data + 1) = csize;
- csize = (size >> 16) & 0xff;
- *(cx_data + 2) = csize;
-
-}
-
-static void write_mv_ref
-(
- vp9_writer *bc, MB_PREDICTION_MODE m, const vp9_prob *p
-) {
-#if CONFIG_DEBUG
- assert(NEARESTMV <= m && m <= SPLITMV);
-#endif
- write_token(bc, vp9_mv_ref_tree, p,
- vp9_mv_ref_encoding_array - NEARESTMV + m);
-}
-
-#if CONFIG_SUPERBLOCKS
-static void write_sb_mv_ref(vp9_writer *bc, MB_PREDICTION_MODE m,
- const vp9_prob *p) {
-#if CONFIG_DEBUG
- assert(NEARESTMV <= m && m < SPLITMV);
-#endif
- write_token(bc, vp9_sb_mv_ref_tree, p,
- vp9_sb_mv_ref_encoding_array - NEARESTMV + m);
-}
-#endif
-
-static void write_sub_mv_ref
-(
- vp9_writer *bc, B_PREDICTION_MODE m, const vp9_prob *p
-) {
-#if CONFIG_DEBUG
- assert(LEFT4X4 <= m && m <= NEW4X4);
-#endif
- write_token(bc, vp9_sub_mv_ref_tree, p,
- vp9_sub_mv_ref_encoding_array - LEFT4X4 + m);
-}
-
-static void write_nmv(vp9_writer *bc, const MV *mv, const int_mv *ref,
- const nmv_context *nmvc, int usehp) {
- MV e;
- e.row = mv->row - ref->as_mv.row;
- e.col = mv->col - ref->as_mv.col;
-
- vp9_encode_nmv(bc, &e, &ref->as_mv, nmvc);
- vp9_encode_nmv_fp(bc, &e, &ref->as_mv, nmvc, usehp);
-}
-
-#if CONFIG_NEW_MVREF
-static int vp9_cost_mv_ref_id(vp9_prob * ref_id_probs, int mv_ref_id) {
- int cost;
-
- // Encode the index for the MV reference.
- switch (mv_ref_id) {
- case 0:
- cost = vp9_cost_zero(ref_id_probs[0]);
- break;
- case 1:
- cost = vp9_cost_one(ref_id_probs[0]);
- cost += vp9_cost_zero(ref_id_probs[1]);
- break;
- case 2:
- cost = vp9_cost_one(ref_id_probs[0]);
- cost += vp9_cost_one(ref_id_probs[1]);
- cost += vp9_cost_zero(ref_id_probs[2]);
- break;
- case 3:
- cost = vp9_cost_one(ref_id_probs[0]);
- cost += vp9_cost_one(ref_id_probs[1]);
- cost += vp9_cost_one(ref_id_probs[2]);
- break;
-
- // TRAP.. This should not happen
- default:
- assert(0);
- break;
- }
-
- return cost;
-}
-
-static void vp9_write_mv_ref_id(vp9_writer *w,
- vp9_prob * ref_id_probs,
- int mv_ref_id) {
- // Encode the index for the MV reference.
- switch (mv_ref_id) {
- case 0:
- vp9_write(w, 0, ref_id_probs[0]);
- break;
- case 1:
- vp9_write(w, 1, ref_id_probs[0]);
- vp9_write(w, 0, ref_id_probs[1]);
- break;
- case 2:
- vp9_write(w, 1, ref_id_probs[0]);
- vp9_write(w, 1, ref_id_probs[1]);
- vp9_write(w, 0, ref_id_probs[2]);
- break;
- case 3:
- vp9_write(w, 1, ref_id_probs[0]);
- vp9_write(w, 1, ref_id_probs[1]);
- vp9_write(w, 1, ref_id_probs[2]);
- break;
-
- // TRAP.. This should not happen
- default:
- assert(0);
- break;
- }
-}
-
-// Estimate the cost of coding the vector using each reference candidate
-static unsigned int pick_best_mv_ref(MACROBLOCK *x,
- MV_REFERENCE_FRAME ref_frame,
- int_mv target_mv,
- int_mv * mv_ref_list,
- int_mv * best_ref) {
- int i;
- int best_index = 0;
- int cost, cost2;
- int zero_seen = (mv_ref_list[0].as_int) ? FALSE : TRUE;
- MACROBLOCKD *xd = &x->e_mbd;
- int max_mv = MV_MAX;
-
- cost = vp9_cost_mv_ref_id(xd->mb_mv_ref_id_probs[ref_frame], 0) +
- vp9_mv_bit_cost(&target_mv,
- &mv_ref_list[0],
- XMVCOST, 96,
- xd->allow_high_precision_mv);
-
-
- // Use 4 for now : for (i = 1; i < MAX_MV_REFS; ++i ) {
- for (i = 1; i < 4; ++i) {
- // If we see a 0,0 reference vector for a second time we have reached
- // the end of the list of valid candidate vectors.
- if (!mv_ref_list[i].as_int)
- if (zero_seen)
- break;
- else
- zero_seen = TRUE;
-
- // Check for cases where the reference choice would give rise to an
- // uncodable/out of range residual for row or col.
- if ((abs(target_mv.as_mv.row - mv_ref_list[i].as_mv.row) > max_mv) ||
- (abs(target_mv.as_mv.col - mv_ref_list[i].as_mv.col) > max_mv)) {
- continue;
- }
-
- cost2 = vp9_cost_mv_ref_id(xd->mb_mv_ref_id_probs[ref_frame], i) +
- vp9_mv_bit_cost(&target_mv,
- &mv_ref_list[i],
- XMVCOST, 96,
- xd->allow_high_precision_mv);
-
- if (cost2 < cost) {
- cost = cost2;
- best_index = i;
- }
- }
-
- (*best_ref).as_int = mv_ref_list[best_index].as_int;
-
- return best_index;
-}
-#endif
-
-// This function writes the current macro block's segment id to the bitstream.
-// It should only be called if a segment map update is indicated.
-static void write_mb_segid(vp9_writer *bc,
- const MB_MODE_INFO *mi, const MACROBLOCKD *xd) {
- // Encode the MB segment id.
- int seg_id = mi->segment_id;
-#if CONFIG_SUPERBLOCKS
- if (mi->encoded_as_sb) {
- if (xd->mb_to_right_edge > 0)
- seg_id = seg_id && xd->mode_info_context[1].mbmi.segment_id;
- if (xd->mb_to_bottom_edge > 0) {
- seg_id = seg_id &&
- xd->mode_info_context[xd->mode_info_stride].mbmi.segment_id;
- if (xd->mb_to_right_edge > 0)
- seg_id = seg_id &&
- xd->mode_info_context[xd->mode_info_stride + 1].mbmi.segment_id;
- }
- }
-#endif
- if (xd->segmentation_enabled && xd->update_mb_segmentation_map) {
- switch (seg_id) {
- case 0:
- vp9_write(bc, 0, xd->mb_segment_tree_probs[0]);
- vp9_write(bc, 0, xd->mb_segment_tree_probs[1]);
- break;
- case 1:
- vp9_write(bc, 0, xd->mb_segment_tree_probs[0]);
- vp9_write(bc, 1, xd->mb_segment_tree_probs[1]);
- break;
- case 2:
- vp9_write(bc, 1, xd->mb_segment_tree_probs[0]);
- vp9_write(bc, 0, xd->mb_segment_tree_probs[2]);
- break;
- case 3:
- vp9_write(bc, 1, xd->mb_segment_tree_probs[0]);
- vp9_write(bc, 1, xd->mb_segment_tree_probs[2]);
- break;
-
- // TRAP.. This should not happen
- default:
- vp9_write(bc, 0, xd->mb_segment_tree_probs[0]);
- vp9_write(bc, 0, xd->mb_segment_tree_probs[1]);
- break;
- }
- }
-}
-
-// This function encodes the reference frame
-static void encode_ref_frame(vp9_writer *const bc,
- VP9_COMMON *const cm,
- MACROBLOCKD *xd,
- int segment_id,
- MV_REFERENCE_FRAME rf) {
- int seg_ref_active;
- int seg_ref_count = 0;
- seg_ref_active = vp9_segfeature_active(xd,
- segment_id,
- SEG_LVL_REF_FRAME);
-
- if (seg_ref_active) {
- seg_ref_count = vp9_check_segref(xd, segment_id, INTRA_FRAME) +
- vp9_check_segref(xd, segment_id, LAST_FRAME) +
- vp9_check_segref(xd, segment_id, GOLDEN_FRAME) +
- vp9_check_segref(xd, segment_id, ALTREF_FRAME);
- }
-
- // If segment level coding of this signal is disabled...
- // or the segment allows multiple reference frame options
- if (!seg_ref_active || (seg_ref_count > 1)) {
- // Values used in prediction model coding
- unsigned char prediction_flag;
- vp9_prob pred_prob;
- MV_REFERENCE_FRAME pred_rf;
-
-    // Get the context probability for the prediction flag
- pred_prob = vp9_get_pred_prob(cm, xd, PRED_REF);
-
- // Get the predicted value.
- pred_rf = vp9_get_pred_ref(cm, xd);
-
-    // Did the chosen reference frame match its predicted value?
- prediction_flag =
- (xd->mode_info_context->mbmi.ref_frame == pred_rf);
-
- vp9_set_pred_flag(xd, PRED_REF, prediction_flag);
- vp9_write(bc, prediction_flag, pred_prob);
-
- // If not predicted correctly then code value explicitly
- if (!prediction_flag) {
- vp9_prob mod_refprobs[PREDICTION_PROBS];
-
- vpx_memcpy(mod_refprobs,
- cm->mod_refprobs[pred_rf], sizeof(mod_refprobs));
-
-      // If segment coding is enabled, blank out options that can't occur by
- // setting the branch probability to 0.
- if (seg_ref_active) {
- mod_refprobs[INTRA_FRAME] *=
- vp9_check_segref(xd, segment_id, INTRA_FRAME);
- mod_refprobs[LAST_FRAME] *=
- vp9_check_segref(xd, segment_id, LAST_FRAME);
- mod_refprobs[GOLDEN_FRAME] *=
- (vp9_check_segref(xd, segment_id, GOLDEN_FRAME) *
- vp9_check_segref(xd, segment_id, ALTREF_FRAME));
- }
-
- if (mod_refprobs[0]) {
- vp9_write(bc, (rf != INTRA_FRAME), mod_refprobs[0]);
- }
-
- // Inter coded
- if (rf != INTRA_FRAME) {
- if (mod_refprobs[1]) {
- vp9_write(bc, (rf != LAST_FRAME), mod_refprobs[1]);
- }
-
- if (rf != LAST_FRAME) {
- if (mod_refprobs[2]) {
- vp9_write(bc, (rf != GOLDEN_FRAME), mod_refprobs[2]);
- }
- }
- }
- }
- }
-
-  // If using the prediction model we have nothing further to do because
-  // the reference frame is fully coded by the segment.
-}
-
-// Update the probabilities used to encode reference frame data
-static void update_ref_probs(VP9_COMP *const cpi) {
- VP9_COMMON *const cm = &cpi->common;
-
- const int *const rfct = cpi->count_mb_ref_frame_usage;
- const int rf_intra = rfct[INTRA_FRAME];
- const int rf_inter = rfct[LAST_FRAME] +
- rfct[GOLDEN_FRAME] + rfct[ALTREF_FRAME];
-
- cm->prob_intra_coded = get_binary_prob(rf_intra, rf_inter);
- cm->prob_last_coded = get_prob(rfct[LAST_FRAME], rf_inter);
- cm->prob_gf_coded = get_binary_prob(rfct[GOLDEN_FRAME], rfct[ALTREF_FRAME]);
-
- // Compute a modified set of probabilities to use when prediction of the
- // reference frame fails
- vp9_compute_mod_refprobs(cm);
-}
-
-static void pack_inter_mode_mvs(VP9_COMP *const cpi, vp9_writer *const bc) {
- int i;
- VP9_COMMON *const pc = &cpi->common;
- const nmv_context *nmvc = &pc->fc.nmvc;
- MACROBLOCK *x = &cpi->mb;
- MACROBLOCKD *xd = &cpi->mb.e_mbd;
- MODE_INFO *m;
- MODE_INFO *prev_m;
- TOKENEXTRA *tok = cpi->tok;
- TOKENEXTRA *tok_end = tok + cpi->tok_count;
-
- const int mis = pc->mode_info_stride;
- int mb_row, mb_col;
- int row, col;
-
- // Values used in prediction model coding
- vp9_prob pred_prob;
- unsigned char prediction_flag;
-
- int row_delta[4] = { 0, +1, 0, -1};
- int col_delta[4] = { +1, -1, +1, +1};
-
- cpi->mb.partition_info = cpi->mb.pi;
-
- mb_row = 0;
- for (row = 0; row < pc->mb_rows; row += 2) {
- m = pc->mi + row * mis;
- prev_m = pc->prev_mi + row * mis;
-
- mb_col = 0;
- for (col = 0; col < pc->mb_cols; col += 2) {
- int i;
-
- // Process the 4 MBs in the order:
- // top-left, top-right, bottom-left, bottom-right
-#if CONFIG_SUPERBLOCKS
- vp9_write(bc, m->mbmi.encoded_as_sb, pc->sb_coded);
-#endif
- for (i = 0; i < 4; i++) {
- MB_MODE_INFO *mi;
- MV_REFERENCE_FRAME rf;
- MB_PREDICTION_MODE mode;
- int segment_id;
-
- int dy = row_delta[i];
- int dx = col_delta[i];
- int offset_extended = dy * mis + dx;
-
- if ((mb_row >= pc->mb_rows) || (mb_col >= pc->mb_cols)) {
- // MB lies outside frame, move on
- mb_row += dy;
- mb_col += dx;
- m += offset_extended;
- prev_m += offset_extended;
- cpi->mb.partition_info += offset_extended;
- continue;
- }
-
- mi = &m->mbmi;
- rf = mi->ref_frame;
- mode = mi->mode;
- segment_id = mi->segment_id;
-
- // Distance of Mb to the various image edges.
-        // These are specified in 1/8th pel as they are always compared to MV
-        // values that are in 1/8th pel units.
- xd->mb_to_left_edge = -((mb_col * 16) << 3);
- xd->mb_to_right_edge = ((pc->mb_cols - 1 - mb_col) * 16) << 3;
- xd->mb_to_top_edge = -((mb_row * 16)) << 3;
- xd->mb_to_bottom_edge = ((pc->mb_rows - 1 - mb_row) * 16) << 3;
-
- // Make sure the MacroBlockD mode info pointer is set correctly
- xd->mode_info_context = m;
- xd->prev_mode_info_context = prev_m;
-
-#ifdef ENTROPY_STATS
- active_section = 9;
-#endif
- if (cpi->mb.e_mbd.update_mb_segmentation_map) {
- // Is temporal coding of the segment map enabled
- if (pc->temporal_update) {
- prediction_flag = vp9_get_pred_flag(xd, PRED_SEG_ID);
- pred_prob = vp9_get_pred_prob(pc, xd, PRED_SEG_ID);
-
- // Code the segment id prediction flag for this mb
- vp9_write(bc, prediction_flag, pred_prob);
-
-          // If the mb segment id wasn't predicted, code it explicitly
- if (!prediction_flag)
- write_mb_segid(bc, mi, &cpi->mb.e_mbd);
- } else {
- // Normal unpredicted coding
- write_mb_segid(bc, mi, &cpi->mb.e_mbd);
- }
- }
-
- if (pc->mb_no_coeff_skip &&
- (!vp9_segfeature_active(xd, segment_id, SEG_LVL_EOB) ||
- (vp9_get_segdata(xd, segment_id, SEG_LVL_EOB) != 0))) {
- int skip_coeff = mi->mb_skip_coeff;
-#if CONFIG_SUPERBLOCKS
- if (mi->encoded_as_sb) {
- skip_coeff &= m[1].mbmi.mb_skip_coeff;
- skip_coeff &= m[mis].mbmi.mb_skip_coeff;
- skip_coeff &= m[mis + 1].mbmi.mb_skip_coeff;
- }
-#endif
- vp9_write(bc, skip_coeff,
- vp9_get_pred_prob(pc, xd, PRED_MBSKIP));
- }
-
- // Encode the reference frame.
- encode_ref_frame(bc, pc, xd, segment_id, rf);
-
- if (rf == INTRA_FRAME) {
-#ifdef ENTROPY_STATS
- active_section = 6;
-#endif
-
- // TODO(rbultje) write using SB tree structure
-
- if (!vp9_segfeature_active(xd, segment_id, SEG_LVL_MODE)) {
- write_ymode(bc, mode, pc->fc.ymode_prob);
- }
-
- if (mode == B_PRED) {
- int j = 0;
-#if CONFIG_COMP_INTRA_PRED
- int uses_second =
- m->bmi[0].as_mode.second !=
- (B_PREDICTION_MODE)(B_DC_PRED - 1);
- vp9_write(bc, uses_second, 128);
-#endif
- do {
-#if CONFIG_COMP_INTRA_PRED
- B_PREDICTION_MODE mode2 = m->bmi[j].as_mode.second;
-#endif
- write_bmode(bc, m->bmi[j].as_mode.first,
- pc->fc.bmode_prob);
- /*
- if (!cpi->dummy_packing) {
- int p;
- for (p = 0; p < VP9_BINTRAMODES - 1; ++p)
- printf(" %d", pc->fc.bmode_prob[p]);
- printf("\nbmode[%d][%d]: %d\n", pc->current_video_frame, j, m->bmi[j].as_mode.first);
- }
- */
-#if CONFIG_COMP_INTRA_PRED
- if (uses_second) {
- write_bmode(bc, mode2, pc->fc.bmode_prob);
- }
-#endif
- } while (++j < 16);
- }
- if (mode == I8X8_PRED) {
- write_i8x8_mode(bc, m->bmi[0].as_mode.first,
- pc->fc.i8x8_mode_prob);
- write_i8x8_mode(bc, m->bmi[2].as_mode.first,
- pc->fc.i8x8_mode_prob);
- write_i8x8_mode(bc, m->bmi[8].as_mode.first,
- pc->fc.i8x8_mode_prob);
- write_i8x8_mode(bc, m->bmi[10].as_mode.first,
- pc->fc.i8x8_mode_prob);
- } else {
- write_uv_mode(bc, mi->uv_mode,
- pc->fc.uv_mode_prob[mode]);
- }
- } else {
- int_mv best_mv, best_second_mv;
- int ct[4];
-
- vp9_prob mv_ref_p [VP9_MVREFS - 1];
-
- {
- int_mv n1, n2;
-
- // Only used for context just now and soon to be deprecated.
- vp9_find_near_mvs(xd, m, prev_m, &n1, &n2, &best_mv, ct,
- rf, cpi->common.ref_frame_sign_bias);
-#if CONFIG_NEWBESTREFMV
- best_mv.as_int = mi->ref_mvs[rf][0].as_int;
-#endif
-
- vp9_mv_ref_probs(&cpi->common, mv_ref_p, ct);
-
-#ifdef ENTROPY_STATS
- accum_mv_refs(mode, ct);
-#endif
- }
-
-#ifdef ENTROPY_STATS
- active_section = 3;
-#endif
-
- // Is the segment coding of mode enabled
- if (!vp9_segfeature_active(xd, segment_id, SEG_LVL_MODE)) {
-#if CONFIG_SUPERBLOCKS
- if (mi->encoded_as_sb) {
- write_sb_mv_ref(bc, mode, mv_ref_p);
- } else
-#endif
- {
- write_mv_ref(bc, mode, mv_ref_p);
- }
- vp9_accum_mv_refs(&cpi->common, mode, ct);
- }
-
-#if CONFIG_PRED_FILTER
- // Is the prediction filter enabled
- if (mode >= NEARESTMV && mode < SPLITMV) {
- if (cpi->common.pred_filter_mode == 2)
- vp9_write(bc, mi->pred_filter_enabled,
- pc->prob_pred_filter_off);
- else
- assert(mi->pred_filter_enabled ==
- cpi->common.pred_filter_mode);
- }
-#endif
- if (mode >= NEARESTMV && mode <= SPLITMV)
- {
- if (cpi->common.mcomp_filter_type == SWITCHABLE) {
- write_token(bc, vp9_switchable_interp_tree,
- vp9_get_pred_probs(&cpi->common, xd,
- PRED_SWITCHABLE_INTERP),
- vp9_switchable_interp_encodings +
- vp9_switchable_interp_map[mi->interp_filter]);
- } else {
- assert (mi->interp_filter ==
- cpi->common.mcomp_filter_type);
- }
- }
- if (mi->second_ref_frame &&
- (mode == NEWMV || mode == SPLITMV)) {
- int_mv n1, n2;
-
- // Only used for context just now and soon to be deprecated.
- vp9_find_near_mvs(xd, m, prev_m,
- &n1, &n2, &best_second_mv, ct,
- mi->second_ref_frame,
- cpi->common.ref_frame_sign_bias);
-
-#if CONFIG_NEWBESTREFMV
- best_second_mv.as_int =
- mi->ref_mvs[mi->second_ref_frame][0].as_int;
-#endif
- }
-
- // does the feature use compound prediction or not
- // (if not specified at the frame/segment level)
- if (cpi->common.comp_pred_mode == HYBRID_PREDICTION) {
- vp9_write(bc, mi->second_ref_frame != INTRA_FRAME,
- vp9_get_pred_prob(pc, xd, PRED_COMP));
- }
-
- {
- switch (mode) { /* new, split require MVs */
- case NEWMV:
-#ifdef ENTROPY_STATS
- active_section = 5;
-#endif
-
-#if CONFIG_NEW_MVREF
- {
- unsigned int best_index;
-
- // Choose the best mv reference
- best_index = pick_best_mv_ref(x, rf, mi->mv[0],
- mi->ref_mvs[rf], &best_mv);
-
- // Encode the index of the choice.
- vp9_write_mv_ref_id(bc,
- xd->mb_mv_ref_id_probs[rf], best_index);
-
- cpi->best_ref_index_counts[rf][best_index]++;
-
- }
-#endif
-
- write_nmv(bc, &mi->mv[0].as_mv, &best_mv,
- (const nmv_context*) nmvc,
- xd->allow_high_precision_mv);
-
- if (mi->second_ref_frame) {
-#if CONFIG_NEW_MVREF
- unsigned int best_index;
- MV_REFERENCE_FRAME sec_ref_frame = mi->second_ref_frame;
-
- best_index =
- pick_best_mv_ref(x, sec_ref_frame, mi->mv[1],
- mi->ref_mvs[sec_ref_frame],
- &best_second_mv);
-
- // Encode the index of the choice.
- vp9_write_mv_ref_id(bc,
- xd->mb_mv_ref_id_probs[sec_ref_frame],
- best_index);
-
- cpi->best_ref_index_counts[sec_ref_frame][best_index]++;
-#endif
- write_nmv(bc, &mi->mv[1].as_mv, &best_second_mv,
- (const nmv_context*) nmvc,
- xd->allow_high_precision_mv);
- }
- break;
- case SPLITMV: {
- int j = 0;
-
-#ifdef MODE_STATS
- ++count_mb_seg [mi->partitioning];
-#endif
-
- write_split(bc, mi->partitioning, cpi->common.fc.mbsplit_prob);
- cpi->mbsplit_count[mi->partitioning]++;
-
- do {
- B_PREDICTION_MODE blockmode;
- int_mv blockmv;
- const int *const L =
- vp9_mbsplits [mi->partitioning];
- int k = -1; /* first block in subset j */
- int mv_contz;
- int_mv leftmv, abovemv;
-
- blockmode = cpi->mb.partition_info->bmi[j].mode;
- blockmv = cpi->mb.partition_info->bmi[j].mv;
-#if CONFIG_DEBUG
- while (j != L[++k])
- if (k >= 16)
- assert(0);
-#else
- while (j != L[++k]);
-#endif
- leftmv.as_int = left_block_mv(m, k);
- abovemv.as_int = above_block_mv(m, k, mis);
- mv_contz = vp9_mv_cont(&leftmv, &abovemv);
-
- write_sub_mv_ref(bc, blockmode,
- cpi->common.fc.sub_mv_ref_prob [mv_contz]);
- cpi->sub_mv_ref_count[mv_contz][blockmode - LEFT4X4]++;
- if (blockmode == NEW4X4) {
-#ifdef ENTROPY_STATS
- active_section = 11;
-#endif
- write_nmv(bc, &blockmv.as_mv, &best_mv,
- (const nmv_context*) nmvc,
- xd->allow_high_precision_mv);
-
- if (mi->second_ref_frame) {
- write_nmv(bc,
- &cpi->mb.partition_info->bmi[j].second_mv.as_mv,
- &best_second_mv,
- (const nmv_context*) nmvc,
- xd->allow_high_precision_mv);
- }
- }
- } while (++j < cpi->mb.partition_info->count);
- }
- break;
- default:
- break;
- }
- }
-
- // Update the mvcounts used to tune mv probs but only if this is
- // the real pack run.
- if ( !cpi->dummy_packing ) {
- update_mvcount(cpi, x, &best_mv, &best_second_mv);
- }
- }
-
- if (
-#if CONFIG_SUPERBLOCKS
- !mi->encoded_as_sb &&
-#endif
- ((rf == INTRA_FRAME && mode <= I8X8_PRED) ||
- (rf != INTRA_FRAME && !(mode == SPLITMV &&
- mi->partitioning == PARTITIONING_4X4))) &&
- pc->txfm_mode == TX_MODE_SELECT &&
- !((pc->mb_no_coeff_skip && mi->mb_skip_coeff) ||
- (vp9_segfeature_active(xd, segment_id, SEG_LVL_EOB) &&
- vp9_get_segdata(xd, segment_id, SEG_LVL_EOB) == 0))) {
- TX_SIZE sz = mi->txfm_size;
- // FIXME(rbultje) code ternary symbol once all experiments are merged
- vp9_write(bc, sz != TX_4X4, pc->prob_tx[0]);
- if (sz != TX_4X4 && mode != I8X8_PRED && mode != SPLITMV)
- vp9_write(bc, sz != TX_8X8, pc->prob_tx[1]);
- }
-
-#ifdef ENTROPY_STATS
- active_section = 1;
-#endif
- assert(tok < tok_end);
- pack_mb_tokens(bc, &tok, tok_end);
-
-#if CONFIG_SUPERBLOCKS
- if (m->mbmi.encoded_as_sb) {
- assert(!i);
- mb_col += 2;
- m += 2;
- cpi->mb.partition_info += 2;
- prev_m += 2;
- break;
- }
-#endif
-
- // Next MB
- mb_row += dy;
- mb_col += dx;
- m += offset_extended;
- prev_m += offset_extended;
- cpi->mb.partition_info += offset_extended;
-#if CONFIG_DEBUG
- assert((prev_m - cpi->common.prev_mip) == (m - cpi->common.mip));
- assert((prev_m - cpi->common.prev_mi) == (m - cpi->common.mi));
-#endif
- }
- }
-
- // Next SB
- mb_row += 2;
- m += mis + (1 - (pc->mb_cols & 0x1));
- prev_m += mis + (1 - (pc->mb_cols & 0x1));
- cpi->mb.partition_info += mis + (1 - (pc->mb_cols & 0x1));
- }
-}
-
-
-static void write_mb_modes_kf(const VP9_COMMON *c,
- const MACROBLOCKD *xd,
- const MODE_INFO *m,
- int mode_info_stride,
- vp9_writer *const bc) {
- const int mis = mode_info_stride;
- int ym;
- int segment_id;
-
- ym = m->mbmi.mode;
- segment_id = m->mbmi.segment_id;
-
- if (xd->update_mb_segmentation_map) {
- write_mb_segid(bc, &m->mbmi, xd);
- }
-
- if (c->mb_no_coeff_skip &&
- (!vp9_segfeature_active(xd, segment_id, SEG_LVL_EOB) ||
- (vp9_get_segdata(xd, segment_id, SEG_LVL_EOB) != 0))) {
- int skip_coeff = m->mbmi.mb_skip_coeff;
-#if CONFIG_SUPERBLOCKS
- if (m->mbmi.encoded_as_sb) {
- skip_coeff &= m[1].mbmi.mb_skip_coeff;
- skip_coeff &= m[mis].mbmi.mb_skip_coeff;
- skip_coeff &= m[mis + 1].mbmi.mb_skip_coeff;
- }
-#endif
- vp9_write(bc, skip_coeff,
- vp9_get_pred_prob(c, xd, PRED_MBSKIP));
- }
-
-#if CONFIG_SUPERBLOCKS
- if (m->mbmi.encoded_as_sb) {
- sb_kfwrite_ymode(bc, ym,
- c->sb_kf_ymode_prob[c->kf_ymode_probs_index]);
- } else
-#endif
- {
- kfwrite_ymode(bc, ym,
- c->kf_ymode_prob[c->kf_ymode_probs_index]);
- }
-
- if (ym == B_PRED) {
- const int mis = c->mode_info_stride;
- int i = 0;
-#if CONFIG_COMP_INTRA_PRED
- int uses_second =
- m->bmi[0].as_mode.second !=
- (B_PREDICTION_MODE)(B_DC_PRED - 1);
- vp9_write(bc, uses_second, 128);
-#endif
- do {
- const B_PREDICTION_MODE A = above_block_mode(m, i, mis);
- const B_PREDICTION_MODE L = left_block_mode(m, i);
- const int bm = m->bmi[i].as_mode.first;
-#if CONFIG_COMP_INTRA_PRED
- const int bm2 = m->bmi[i].as_mode.second;
-#endif
-
-#ifdef ENTROPY_STATS
- ++intra_mode_stats [A] [L] [bm];
-#endif
-
- write_bmode(bc, bm, c->kf_bmode_prob [A] [L]);
- // printf(" mode: %d\n", bm);
-#if CONFIG_COMP_INTRA_PRED
- if (uses_second) {
- write_bmode(bc, bm2, c->kf_bmode_prob [A] [L]);
- }
-#endif
- } while (++i < 16);
- }
- if (ym == I8X8_PRED) {
- write_i8x8_mode(bc, m->bmi[0].as_mode.first,
- c->fc.i8x8_mode_prob);
- // printf(" mode: %d\n", m->bmi[0].as_mode.first); fflush(stdout);
- write_i8x8_mode(bc, m->bmi[2].as_mode.first,
- c->fc.i8x8_mode_prob);
- // printf(" mode: %d\n", m->bmi[2].as_mode.first); fflush(stdout);
- write_i8x8_mode(bc, m->bmi[8].as_mode.first,
- c->fc.i8x8_mode_prob);
- // printf(" mode: %d\n", m->bmi[8].as_mode.first); fflush(stdout);
- write_i8x8_mode(bc, m->bmi[10].as_mode.first,
- c->fc.i8x8_mode_prob);
- // printf(" mode: %d\n", m->bmi[10].as_mode.first); fflush(stdout);
- } else
- write_uv_mode(bc, m->mbmi.uv_mode, c->kf_uv_mode_prob[ym]);
-
- if (
-#if CONFIG_SUPERBLOCKS
- !m->mbmi.encoded_as_sb &&
-#endif
- ym <= I8X8_PRED && c->txfm_mode == TX_MODE_SELECT &&
- !((c->mb_no_coeff_skip && m->mbmi.mb_skip_coeff) ||
- (vp9_segfeature_active(xd, segment_id, SEG_LVL_EOB) &&
- vp9_get_segdata(xd, segment_id, SEG_LVL_EOB) == 0))) {
- TX_SIZE sz = m->mbmi.txfm_size;
- // FIXME(rbultje) code ternary symbol once all experiments are merged
- vp9_write(bc, sz != TX_4X4, c->prob_tx[0]);
- if (sz != TX_4X4 && ym <= TM_PRED)
- vp9_write(bc, sz != TX_8X8, c->prob_tx[1]);
- }
-}
-
-static void write_kfmodes(VP9_COMP* const cpi, vp9_writer* const bc) {
- VP9_COMMON *const c = &cpi->common;
- const int mis = c->mode_info_stride;
- MACROBLOCKD *xd = &cpi->mb.e_mbd;
- MODE_INFO *m;
- int i;
- int row, col;
- int mb_row, mb_col;
- int row_delta[4] = { 0, +1, 0, -1};
- int col_delta[4] = { +1, -1, +1, +1};
- TOKENEXTRA *tok = cpi->tok;
- TOKENEXTRA *tok_end = tok + cpi->tok_count;
-
- mb_row = 0;
- for (row = 0; row < c->mb_rows; row += 2) {
- m = c->mi + row * mis;
-
- mb_col = 0;
- for (col = 0; col < c->mb_cols; col += 2) {
-#if CONFIG_SUPERBLOCKS
- vp9_write(bc, m->mbmi.encoded_as_sb, c->sb_coded);
-#endif
- // Process the 4 MBs in the order:
- // top-left, top-right, bottom-left, bottom-right
- for (i = 0; i < 4; i++) {
- int dy = row_delta[i];
- int dx = col_delta[i];
- int offset_extended = dy * mis + dx;
-
- if ((mb_row >= c->mb_rows) || (mb_col >= c->mb_cols)) {
- // MB lies outside frame, move on
- mb_row += dy;
- mb_col += dx;
- m += offset_extended;
- continue;
- }
-
- // Make sure the MacroBlockD mode info pointer is set correctly
- xd->mode_info_context = m;
-
- write_mb_modes_kf(c, xd, m, mis, bc);
-#ifdef ENTROPY_STATS
- active_section = 8;
-#endif
- assert(tok < tok_end);
- pack_mb_tokens(bc, &tok, tok_end);
-
-#if CONFIG_SUPERBLOCKS
- if (m->mbmi.encoded_as_sb) {
- assert(!i);
- mb_col += 2;
- m += 2;
- break;
- }
-#endif
- // Next MB
- mb_row += dy;
- mb_col += dx;
- m += offset_extended;
- }
- }
- mb_row += 2;
- }
-}
-
-
-/* This function is used for debugging probability trees. */
-static void print_prob_tree(vp9_prob
- coef_probs[BLOCK_TYPES][COEF_BANDS][PREV_COEF_CONTEXTS][ENTROPY_NODES]) {
- /* print coef probability tree */
- int i, j, k, l;
- FILE *f = fopen("enc_tree_probs.txt", "a");
- fprintf(f, "{\n");
- for (i = 0; i < BLOCK_TYPES; i++) {
- fprintf(f, " {\n");
- for (j = 0; j < COEF_BANDS; j++) {
- fprintf(f, " {\n");
- for (k = 0; k < PREV_COEF_CONTEXTS; k++) {
- fprintf(f, " {");
- for (l = 0; l < ENTROPY_NODES; l++) {
- fprintf(f, "%3u, ",
- (unsigned int)(coef_probs [i][j][k][l]));
- }
- fprintf(f, " }\n");
- }
- fprintf(f, " }\n");
- }
- fprintf(f, " }\n");
- }
- fprintf(f, "}\n");
- fclose(f);
-}
-
-static void build_coeff_contexts(VP9_COMP *cpi) {
- int i = 0, j, k;
-#ifdef ENTROPY_STATS
- int t = 0;
-#endif
- for (i = 0; i < BLOCK_TYPES; ++i) {
- for (j = 0; j < COEF_BANDS; ++j) {
- for (k = 0; k < PREV_COEF_CONTEXTS; ++k) {
- if (k >= 3 && ((i == 0 && j == 1) || (i > 0 && j == 0)))
- continue;
- vp9_tree_probs_from_distribution(
- MAX_ENTROPY_TOKENS, vp9_coef_encodings, vp9_coef_tree,
- cpi->frame_coef_probs [i][j][k],
- cpi->frame_branch_ct [i][j][k],
- cpi->coef_counts [i][j][k],
- 256, 1
- );
-#ifdef ENTROPY_STATS
- if (!cpi->dummy_packing)
- for (t = 0; t < MAX_ENTROPY_TOKENS; ++t)
- context_counters[i][j][k][t] += cpi->coef_counts[i][j][k][t];
-#endif
- }
- }
- }
- for (i = 0; i < BLOCK_TYPES; ++i) {
- for (j = 0; j < COEF_BANDS; ++j) {
- for (k = 0; k < PREV_COEF_CONTEXTS; ++k) {
- if (k >= 3 && ((i == 0 && j == 1) || (i > 0 && j == 0)))
- continue;
- vp9_tree_probs_from_distribution(
- MAX_ENTROPY_TOKENS, vp9_coef_encodings, vp9_coef_tree,
- cpi->frame_hybrid_coef_probs [i][j][k],
- cpi->frame_hybrid_branch_ct [i][j][k],
- cpi->hybrid_coef_counts [i][j][k],
- 256, 1
- );
-#ifdef ENTROPY_STATS
- if (!cpi->dummy_packing)
- for (t = 0; t < MAX_ENTROPY_TOKENS; ++t)
- hybrid_context_counters[i][j][k][t] += cpi->hybrid_coef_counts[i][j][k][t];
-#endif
- }
- }
- }
-
- if (cpi->common.txfm_mode != ONLY_4X4) {
- for (i = 0; i < BLOCK_TYPES_8X8; ++i) {
- for (j = 0; j < COEF_BANDS; ++j) {
- for (k = 0; k < PREV_COEF_CONTEXTS; ++k) {
- /* at every context */
- /* calc probs and branch cts for this frame only */
- // vp9_prob new_p [ENTROPY_NODES];
- // unsigned int branch_ct [ENTROPY_NODES] [2];
- if (k >= 3 && ((i == 0 && j == 1) || (i > 0 && j == 0)))
- continue;
- vp9_tree_probs_from_distribution(
- MAX_ENTROPY_TOKENS, vp9_coef_encodings, vp9_coef_tree,
- cpi->frame_coef_probs_8x8 [i][j][k],
- cpi->frame_branch_ct_8x8 [i][j][k],
- cpi->coef_counts_8x8 [i][j][k],
- 256, 1
- );
-#ifdef ENTROPY_STATS
- if (!cpi->dummy_packing)
- for (t = 0; t < MAX_ENTROPY_TOKENS; ++t)
- context_counters_8x8[i][j][k][t] += cpi->coef_counts_8x8[i][j][k][t];
-#endif
- }
- }
- }
- for (i = 0; i < BLOCK_TYPES_8X8; ++i) {
- for (j = 0; j < COEF_BANDS; ++j) {
- for (k = 0; k < PREV_COEF_CONTEXTS; ++k) {
- /* at every context */
- /* calc probs and branch cts for this frame only */
- // vp9_prob new_p [ENTROPY_NODES];
- // unsigned int branch_ct [ENTROPY_NODES] [2];
- if (k >= 3 && ((i == 0 && j == 1) || (i > 0 && j == 0)))
- continue;
- vp9_tree_probs_from_distribution(
- MAX_ENTROPY_TOKENS, vp9_coef_encodings, vp9_coef_tree,
- cpi->frame_hybrid_coef_probs_8x8 [i][j][k],
- cpi->frame_hybrid_branch_ct_8x8 [i][j][k],
- cpi->hybrid_coef_counts_8x8 [i][j][k],
- 256, 1
- );
-#ifdef ENTROPY_STATS
- if (!cpi->dummy_packing)
- for (t = 0; t < MAX_ENTROPY_TOKENS; ++t)
- hybrid_context_counters_8x8[i][j][k][t] += cpi->hybrid_coef_counts_8x8[i][j][k][t];
-#endif
- }
- }
- }
- }
-
- if (cpi->common.txfm_mode > ALLOW_8X8) {
- for (i = 0; i < BLOCK_TYPES_16X16; ++i) {
- for (j = 0; j < COEF_BANDS; ++j) {
- for (k = 0; k < PREV_COEF_CONTEXTS; ++k) {
- if (k >= 3 && ((i == 0 && j == 1) || (i > 0 && j == 0)))
- continue;
- vp9_tree_probs_from_distribution(
- MAX_ENTROPY_TOKENS, vp9_coef_encodings, vp9_coef_tree,
- cpi->frame_coef_probs_16x16[i][j][k],
- cpi->frame_branch_ct_16x16[i][j][k],
- cpi->coef_counts_16x16[i][j][k], 256, 1);
-#ifdef ENTROPY_STATS
- if (!cpi->dummy_packing)
- for (t = 0; t < MAX_ENTROPY_TOKENS; ++t)
- context_counters_16x16[i][j][k][t] += cpi->coef_counts_16x16[i][j][k][t];
-#endif
- }
- }
- }
- }
- for (i = 0; i < BLOCK_TYPES_16X16; ++i) {
- for (j = 0; j < COEF_BANDS; ++j) {
- for (k = 0; k < PREV_COEF_CONTEXTS; ++k) {
- if (k >= 3 && ((i == 0 && j == 1) || (i > 0 && j == 0)))
- continue;
- vp9_tree_probs_from_distribution(
- MAX_ENTROPY_TOKENS, vp9_coef_encodings, vp9_coef_tree,
- cpi->frame_hybrid_coef_probs_16x16[i][j][k],
- cpi->frame_hybrid_branch_ct_16x16[i][j][k],
- cpi->hybrid_coef_counts_16x16[i][j][k], 256, 1);
-#ifdef ENTROPY_STATS
- if (!cpi->dummy_packing)
- for (t = 0; t < MAX_ENTROPY_TOKENS; ++t)
- hybrid_context_counters_16x16[i][j][k][t] += cpi->hybrid_coef_counts_16x16[i][j][k][t];
-#endif
- }
- }
- }
-}
-
-static void update_coef_probs_common(
- vp9_writer* const bc,
- vp9_prob new_frame_coef_probs[BLOCK_TYPES][COEF_BANDS]
- [PREV_COEF_CONTEXTS][ENTROPY_NODES],
- vp9_prob old_frame_coef_probs[BLOCK_TYPES][COEF_BANDS]
- [PREV_COEF_CONTEXTS][ENTROPY_NODES],
- unsigned int frame_branch_ct[BLOCK_TYPES][COEF_BANDS]
- [PREV_COEF_CONTEXTS][ENTROPY_NODES][2]) {
- int i, j, k, t;
- int update[2] = {0, 0};
- int savings;
- // vp9_prob bestupd = find_coef_update_prob(cpi);
-
-  /* dry run to see if there is any update at all needed */
- savings = 0;
- for (i = 0; i < BLOCK_TYPES; ++i) {
- for (j = !i; j < COEF_BANDS; ++j) {
- int prev_coef_savings[ENTROPY_NODES] = {0};
- for (k = 0; k < PREV_COEF_CONTEXTS; ++k) {
- for (t = 0; t < ENTROPY_NODES; ++t) {
- vp9_prob newp = new_frame_coef_probs[i][j][k][t];
- const vp9_prob oldp = old_frame_coef_probs[i][j][k][t];
- const vp9_prob upd = COEF_UPDATE_PROB;
- int s = prev_coef_savings[t];
- int u = 0;
- if (k >= 3 && ((i == 0 && j == 1) || (i > 0 && j == 0)))
- continue;
-#if defined(SEARCH_NEWP)
- s = prob_diff_update_savings_search(
- frame_branch_ct[i][j][k][t],
- oldp, &newp, upd);
- if (s > 0 && newp != oldp)
- u = 1;
- if (u)
- savings += s - (int)(vp9_cost_zero(upd));
- else
- savings -= (int)(vp9_cost_zero(upd));
-#else
- s = prob_update_savings(
- frame_branch_ct[i][j][k][t],
- oldp, newp, upd);
- if (s > 0)
- u = 1;
- if (u)
- savings += s;
-#endif
-
- update[u]++;
- }
- }
- }
- }
-
- // printf("Update %d %d, savings %d\n", update[0], update[1], savings);
- /* Is coef updated at all */
- if (update[1] == 0 || savings < 0) {
- vp9_write_bit(bc, 0);
- } else {
- vp9_write_bit(bc, 1);
- for (i = 0; i < BLOCK_TYPES; ++i) {
- for (j = !i; j < COEF_BANDS; ++j) {
- int prev_coef_savings[ENTROPY_NODES] = {0};
- for (k = 0; k < PREV_COEF_CONTEXTS; ++k) {
- // calc probs and branch cts for this frame only
- for (t = 0; t < ENTROPY_NODES; ++t) {
- vp9_prob newp = new_frame_coef_probs[i][j][k][t];
- vp9_prob *oldp = old_frame_coef_probs[i][j][k] + t;
- const vp9_prob upd = COEF_UPDATE_PROB;
- int s = prev_coef_savings[t];
- int u = 0;
- if (k >= 3 && ((i == 0 && j == 1) || (i > 0 && j == 0)))
- continue;
-
-#if defined(SEARCH_NEWP)
- s = prob_diff_update_savings_search(
- frame_branch_ct[i][j][k][t],
- *oldp, &newp, upd);
- if (s > 0 && newp != *oldp)
- u = 1;
-#else
- s = prob_update_savings(
- frame_branch_ct[i][j][k][t],
- *oldp, newp, upd);
- if (s > 0)
- u = 1;
-#endif
- vp9_write(bc, u, upd);
-#ifdef ENTROPY_STATS
- if (!cpi->dummy_packing)
- ++ tree_update_hist [i][j][k][t] [u];
-#endif
- if (u) {
- /* send/use new probability */
- write_prob_diff_update(bc, newp, *oldp);
- *oldp = newp;
- }
- }
- }
- }
- }
- }
-}
-
-static void update_coef_probs(VP9_COMP* const cpi, vp9_writer* const bc) {
- vp9_clear_system_state();
-
-  // Build the coefficient contexts based on counts collected in the encode loop
- build_coeff_contexts(cpi);
-
- update_coef_probs_common(bc,
- cpi->frame_coef_probs,
- cpi->common.fc.coef_probs,
- cpi->frame_branch_ct);
-
- update_coef_probs_common(bc,
- cpi->frame_hybrid_coef_probs,
- cpi->common.fc.hybrid_coef_probs,
- cpi->frame_hybrid_branch_ct);
-
- /* do not do this if not even allowed */
- if (cpi->common.txfm_mode != ONLY_4X4) {
- update_coef_probs_common(bc,
- cpi->frame_coef_probs_8x8,
- cpi->common.fc.coef_probs_8x8,
- cpi->frame_branch_ct_8x8);
-
- update_coef_probs_common(bc,
- cpi->frame_hybrid_coef_probs_8x8,
- cpi->common.fc.hybrid_coef_probs_8x8,
- cpi->frame_hybrid_branch_ct_8x8);
- }
-
- if (cpi->common.txfm_mode > ALLOW_8X8) {
- update_coef_probs_common(bc,
- cpi->frame_coef_probs_16x16,
- cpi->common.fc.coef_probs_16x16,
- cpi->frame_branch_ct_16x16);
- update_coef_probs_common(bc,
- cpi->frame_hybrid_coef_probs_16x16,
- cpi->common.fc.hybrid_coef_probs_16x16,
- cpi->frame_hybrid_branch_ct_16x16);
- }
-}
-
-#ifdef PACKET_TESTING
-FILE *vpxlogc = 0;
-#endif
-
-static void put_delta_q(vp9_writer *bc, int delta_q) {
- if (delta_q != 0) {
- vp9_write_bit(bc, 1);
- vp9_write_literal(bc, abs(delta_q), 4);
-
- if (delta_q < 0)
- vp9_write_bit(bc, 1);
- else
- vp9_write_bit(bc, 0);
- } else
- vp9_write_bit(bc, 0);
-}
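put_delta_q() above writes one presence bit, then a 4-bit magnitude followed by a sign bit when the delta is non-zero. A hypothetical reader for that layout, with the bool-reader passed in as function pointers (placeholders, not the library's decoder API):

static int read_delta_q(void *br,
                        int (*read_bit)(void *br),
                        int (*read_literal)(void *br, int bits)) {
  if (!read_bit(br))
    return 0;                               /* delta_q == 0, nothing follows */
  {
    const int value = read_literal(br, 4);  /* |delta_q|, at most 15         */
    return read_bit(br) ? -value : value;   /* sign bit: 1 means negative    */
  }
}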
-
-static void decide_kf_ymode_entropy(VP9_COMP *cpi) {
-
- int mode_cost[MB_MODE_COUNT];
- int cost;
- int bestcost = INT_MAX;
- int bestindex = 0;
- int i, j;
-
- for (i = 0; i < 8; i++) {
- vp9_cost_tokens(mode_cost, cpi->common.kf_ymode_prob[i], vp9_kf_ymode_tree);
- cost = 0;
- for (j = 0; j < VP9_YMODES; j++) {
- cost += mode_cost[j] * cpi->ymode_count[j];
- }
-#if CONFIG_SUPERBLOCKS
- vp9_cost_tokens(mode_cost, cpi->common.sb_kf_ymode_prob[i],
- vp9_sb_ymode_tree);
- for (j = 0; j < VP9_I32X32_MODES; j++) {
- cost += mode_cost[j] * cpi->sb_ymode_count[j];
- }
-#endif
- if (cost < bestcost) {
- bestindex = i;
- bestcost = cost;
- }
- }
- cpi->common.kf_ymode_probs_index = bestindex;
-
-}
-static void segment_reference_frames(VP9_COMP *cpi) {
- VP9_COMMON *oci = &cpi->common;
- MODE_INFO *mi = oci->mi;
- int ref[MAX_MB_SEGMENTS] = {0};
- int i, j;
- int mb_index = 0;
- MACROBLOCKD *const xd = &cpi->mb.e_mbd;
-
- for (i = 0; i < oci->mb_rows; i++) {
- for (j = 0; j < oci->mb_cols; j++, mb_index++) {
- ref[mi[mb_index].mbmi.segment_id] |= (1 << mi[mb_index].mbmi.ref_frame);
- }
- mb_index++;
- }
- for (i = 0; i < MAX_MB_SEGMENTS; i++) {
- vp9_enable_segfeature(xd, i, SEG_LVL_REF_FRAME);
- vp9_set_segdata(xd, i, SEG_LVL_REF_FRAME, ref[i]);
- }
-}
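segment_reference_frames() above ORs one bit per reference frame into a mask for each segment and stores it as SEG_LVL_REF_FRAME data. A tiny illustration, assuming the usual libvpx reference ordering (INTRA_FRAME = 0, LAST_FRAME = 1, GOLDEN_FRAME = 2, ALTREF_FRAME = 3); the helper below is illustrative only:

enum { INTRA_FRAME = 0, LAST_FRAME = 1, GOLDEN_FRAME = 2, ALTREF_FRAME = 3 };

static int refs_used_by_segment(const int *ref_frames, int num_mbs) {
  int mask = 0, i;
  for (i = 0; i < num_mbs; i++)
    mask |= 1 << ref_frames[i];
  /* e.g. a segment containing only LAST and GOLDEN blocks yields
   * (1 << LAST_FRAME) | (1 << GOLDEN_FRAME) = 6 */
  return mask;
}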
-
-void vp9_pack_bitstream(VP9_COMP *cpi, unsigned char *dest,
- unsigned long *size) {
- int i, j;
- VP9_HEADER oh;
- VP9_COMMON *const pc = &cpi->common;
- vp9_writer header_bc, residual_bc;
- MACROBLOCKD *const xd = &cpi->mb.e_mbd;
- int extra_bytes_packed = 0;
-
- unsigned char *cx_data = dest;
-
- oh.show_frame = (int) pc->show_frame;
- oh.type = (int)pc->frame_type;
- oh.version = pc->version;
- oh.first_partition_length_in_bytes = 0;
-
- cx_data += 3;
-
-#if defined(SECTIONBITS_OUTPUT)
- Sectionbits[active_section = 1] += sizeof(VP9_HEADER) * 8 * 256;
-#endif
-
- compute_update_table();
-
-  /* vp9_kf_default_bmode_probs() is called in vp9_setup_key_frame() once
-   * for each key frame before the frame is encoded. pc->kf_bmode_prob is not
-   * changed anywhere else, so there is no need to call it again here. --yw
-   * vp9_kf_default_bmode_probs( pc->kf_bmode_prob);
-   */
-
-  /* every keyframe sends the start code, width, height, scale factor, clamp
-   * and color type.
-   */
- if (oh.type == KEY_FRAME) {
- int v;
-
- // Start / synch code
- cx_data[0] = 0x9D;
- cx_data[1] = 0x01;
- cx_data[2] = 0x2a;
-
- v = (pc->horiz_scale << 14) | pc->Width;
- cx_data[3] = v;
- cx_data[4] = v >> 8;
-
- v = (pc->vert_scale << 14) | pc->Height;
- cx_data[5] = v;
- cx_data[6] = v >> 8;
-
- extra_bytes_packed = 7;
- cx_data += extra_bytes_packed;
-
- vp9_start_encode(&header_bc, cx_data);
-
- // signal clr type
- vp9_write_bit(&header_bc, pc->clr_type);
- vp9_write_bit(&header_bc, pc->clamp_type);
-
- } else {
- vp9_start_encode(&header_bc, cx_data);
- }
-
- // Signal whether or not Segmentation is enabled
- vp9_write_bit(&header_bc, (xd->segmentation_enabled) ? 1 : 0);
-
- // Indicate which features are enabled
- if (xd->segmentation_enabled) {
- // Indicate whether or not the segmentation map is being updated.
- vp9_write_bit(&header_bc, (xd->update_mb_segmentation_map) ? 1 : 0);
-
- // If it is, then indicate the method that will be used.
- if (xd->update_mb_segmentation_map) {
- // Select the coding strategy (temporal or spatial)
- vp9_choose_segmap_coding_method(cpi);
- // Send the tree probabilities used to decode unpredicted
- // macro-block segments
- for (i = 0; i < MB_FEATURE_TREE_PROBS; i++) {
- int data = xd->mb_segment_tree_probs[i];
-
- if (data != 255) {
- vp9_write_bit(&header_bc, 1);
- vp9_write_literal(&header_bc, data, 8);
- } else {
- vp9_write_bit(&header_bc, 0);
- }
- }
-
- // Write out the chosen coding method.
- vp9_write_bit(&header_bc, (pc->temporal_update) ? 1 : 0);
- if (pc->temporal_update) {
- for (i = 0; i < PREDICTION_PROBS; i++) {
- int data = pc->segment_pred_probs[i];
-
- if (data != 255) {
- vp9_write_bit(&header_bc, 1);
- vp9_write_literal(&header_bc, data, 8);
- } else {
- vp9_write_bit(&header_bc, 0);
- }
- }
- }
- }
-
- vp9_write_bit(&header_bc, (xd->update_mb_segmentation_data) ? 1 : 0);
-
- // segment_reference_frames(cpi);
-
- if (xd->update_mb_segmentation_data) {
- signed char Data;
-
- vp9_write_bit(&header_bc, (xd->mb_segment_abs_delta) ? 1 : 0);
-
-      // For each segment id...
- for (i = 0; i < MAX_MB_SEGMENTS; i++) {
- // For each segmentation codable feature...
- for (j = 0; j < SEG_LVL_MAX; j++) {
- Data = vp9_get_segdata(xd, i, j);
-
- // If the feature is enabled...
- if (vp9_segfeature_active(xd, i, j)) {
- vp9_write_bit(&header_bc, 1);
-
- // Is the segment data signed..
- if (vp9_is_segfeature_signed(j)) {
- // Encode the relevant feature data
- if (Data < 0) {
- Data = - Data;
- vp9_write_literal(&header_bc, Data,
- vp9_seg_feature_data_bits(j));
- vp9_write_bit(&header_bc, 1);
- } else {
- vp9_write_literal(&header_bc, Data,
- vp9_seg_feature_data_bits(j));
- vp9_write_bit(&header_bc, 0);
- }
- }
- // Unsigned data element so no sign bit needed
- else
- vp9_write_literal(&header_bc, Data,
- vp9_seg_feature_data_bits(j));
- } else
- vp9_write_bit(&header_bc, 0);
- }
- }
- }
- }
-
- // Encode the common prediction model status flag probability updates for
- // the reference frame
- update_refpred_stats(cpi);
- if (pc->frame_type != KEY_FRAME) {
- for (i = 0; i < PREDICTION_PROBS; i++) {
- if (cpi->ref_pred_probs_update[i]) {
- vp9_write_bit(&header_bc, 1);
- vp9_write_literal(&header_bc, pc->ref_pred_probs[i], 8);
- } else {
- vp9_write_bit(&header_bc, 0);
- }
- }
- }
-
-#if CONFIG_SUPERBLOCKS
- {
- /* sb mode probability */
- const int sb_max = (((pc->mb_rows + 1) >> 1) * ((pc->mb_cols + 1) >> 1));
-
- pc->sb_coded = get_prob(sb_max - cpi->sb_count, sb_max);
- vp9_write_literal(&header_bc, pc->sb_coded, 8);
- }
-#endif
-
- {
- if (pc->txfm_mode == TX_MODE_SELECT) {
- pc->prob_tx[0] = get_prob(cpi->txfm_count[0] + cpi->txfm_count_8x8p[0],
- cpi->txfm_count[0] + cpi->txfm_count[1] + cpi->txfm_count[2] +
- cpi->txfm_count_8x8p[0] + cpi->txfm_count_8x8p[1]);
- pc->prob_tx[1] = get_prob(cpi->txfm_count[1], cpi->txfm_count[1] + cpi->txfm_count[2]);
- } else {
- pc->prob_tx[0] = 128;
- pc->prob_tx[1] = 128;
- }
- vp9_write_literal(&header_bc, pc->txfm_mode, 2);
- if (pc->txfm_mode == TX_MODE_SELECT) {
- vp9_write_literal(&header_bc, pc->prob_tx[0], 8);
- vp9_write_literal(&header_bc, pc->prob_tx[1], 8);
- }
- }
-
- // Encode the loop filter level and type
- vp9_write_bit(&header_bc, pc->filter_type);
- vp9_write_literal(&header_bc, pc->filter_level, 6);
- vp9_write_literal(&header_bc, pc->sharpness_level, 3);
-
- // Write out loop filter deltas applied at the MB level based on mode or ref frame (if they are enabled).
- vp9_write_bit(&header_bc, (xd->mode_ref_lf_delta_enabled) ? 1 : 0);
-
- if (xd->mode_ref_lf_delta_enabled) {
- // Do the deltas need to be updated
- int send_update = xd->mode_ref_lf_delta_update;
-
- vp9_write_bit(&header_bc, send_update);
- if (send_update) {
- int Data;
-
- // Send update
- for (i = 0; i < MAX_REF_LF_DELTAS; i++) {
- Data = xd->ref_lf_deltas[i];
-
- // Frame level data
- if (xd->ref_lf_deltas[i] != xd->last_ref_lf_deltas[i]) {
- xd->last_ref_lf_deltas[i] = xd->ref_lf_deltas[i];
- vp9_write_bit(&header_bc, 1);
-
- if (Data > 0) {
- vp9_write_literal(&header_bc, (Data & 0x3F), 6);
- vp9_write_bit(&header_bc, 0); // sign
- } else {
- Data = -Data;
- vp9_write_literal(&header_bc, (Data & 0x3F), 6);
- vp9_write_bit(&header_bc, 1); // sign
- }
- } else {
- vp9_write_bit(&header_bc, 0);
- }
- }
-
- // Send update
- for (i = 0; i < MAX_MODE_LF_DELTAS; i++) {
- Data = xd->mode_lf_deltas[i];
-
- if (xd->mode_lf_deltas[i] != xd->last_mode_lf_deltas[i]) {
- xd->last_mode_lf_deltas[i] = xd->mode_lf_deltas[i];
- vp9_write_bit(&header_bc, 1);
-
- if (Data > 0) {
- vp9_write_literal(&header_bc, (Data & 0x3F), 6);
- vp9_write_bit(&header_bc, 0); // sign
- } else {
- Data = -Data;
- vp9_write_literal(&header_bc, (Data & 0x3F), 6);
- vp9_write_bit(&header_bc, 1); // sign
- }
- } else {
- vp9_write_bit(&header_bc, 0);
- }
- }
- }
- }
-
-  // signal here if multi token partition is enabled
- // vp9_write_literal(&header_bc, pc->multi_token_partition, 2);
- vp9_write_literal(&header_bc, 0, 2);
-
- // Frame Q baseline quantizer index
- vp9_write_literal(&header_bc, pc->base_qindex, QINDEX_BITS);
-
-  // Transmit DC, second order and UV quantizer delta information
- put_delta_q(&header_bc, pc->y1dc_delta_q);
- put_delta_q(&header_bc, pc->y2dc_delta_q);
- put_delta_q(&header_bc, pc->y2ac_delta_q);
- put_delta_q(&header_bc, pc->uvdc_delta_q);
- put_delta_q(&header_bc, pc->uvac_delta_q);
-
- // When there is a key frame all reference buffers are updated using the new key frame
- if (pc->frame_type != KEY_FRAME) {
- // Should the GF or ARF be updated using the transmitted frame or buffer
- vp9_write_bit(&header_bc, pc->refresh_golden_frame);
- vp9_write_bit(&header_bc, pc->refresh_alt_ref_frame);
-
- // For inter frames the current default behavior is that when
- // cm->refresh_golden_frame is set we copy the old GF over to
- // the ARF buffer. This is purely an encoder decision at present.
- if (pc->refresh_golden_frame)
- pc->copy_buffer_to_arf = 2;
-
-    // If not being updated from the current frame, signal whether the GF or ARF should be updated from another buffer
- if (!pc->refresh_golden_frame)
- vp9_write_literal(&header_bc, pc->copy_buffer_to_gf, 2);
-
- if (!pc->refresh_alt_ref_frame)
- vp9_write_literal(&header_bc, pc->copy_buffer_to_arf, 2);
-
- // Indicate reference frame sign bias for Golden and ARF frames (always 0 for last frame buffer)
- vp9_write_bit(&header_bc, pc->ref_frame_sign_bias[GOLDEN_FRAME]);
- vp9_write_bit(&header_bc, pc->ref_frame_sign_bias[ALTREF_FRAME]);
-
- // Signal whether to allow high MV precision
- vp9_write_bit(&header_bc, (xd->allow_high_precision_mv) ? 1 : 0);
- if (pc->mcomp_filter_type == SWITCHABLE) {
- /* Check to see if only one of the filters is actually used */
- int count[VP9_SWITCHABLE_FILTERS];
- int i, j, c = 0;
- for (i = 0; i < VP9_SWITCHABLE_FILTERS; ++i) {
- count[i] = 0;
- for (j = 0; j <= VP9_SWITCHABLE_FILTERS; ++j) {
- count[i] += cpi->switchable_interp_count[j][i];
- }
- c += (count[i] > 0);
- }
- if (c == 1) {
- /* Only one filter is used. So set the filter at frame level */
- for (i = 0; i < VP9_SWITCHABLE_FILTERS; ++i) {
- if (count[i]) {
- pc->mcomp_filter_type = vp9_switchable_interp[i];
- break;
- }
- }
- }
- }
- // Signal the type of subpel filter to use
- vp9_write_bit(&header_bc, (pc->mcomp_filter_type == SWITCHABLE));
- if (pc->mcomp_filter_type != SWITCHABLE)
- vp9_write_literal(&header_bc, (pc->mcomp_filter_type), 2);
- }
-
- vp9_write_bit(&header_bc, pc->refresh_entropy_probs);
-
- if (pc->frame_type != KEY_FRAME)
- vp9_write_bit(&header_bc, pc->refresh_last_frame);
-
-#ifdef ENTROPY_STATS
- if (pc->frame_type == INTER_FRAME)
- active_section = 0;
- else
- active_section = 7;
-#endif
-
- vp9_clear_system_state(); // __asm emms;
-
- vp9_copy(cpi->common.fc.pre_coef_probs, cpi->common.fc.coef_probs);
- vp9_copy(cpi->common.fc.pre_hybrid_coef_probs, cpi->common.fc.hybrid_coef_probs);
- vp9_copy(cpi->common.fc.pre_coef_probs_8x8, cpi->common.fc.coef_probs_8x8);
- vp9_copy(cpi->common.fc.pre_hybrid_coef_probs_8x8, cpi->common.fc.hybrid_coef_probs_8x8);
- vp9_copy(cpi->common.fc.pre_coef_probs_16x16, cpi->common.fc.coef_probs_16x16);
- vp9_copy(cpi->common.fc.pre_hybrid_coef_probs_16x16, cpi->common.fc.hybrid_coef_probs_16x16);
- vp9_copy(cpi->common.fc.pre_ymode_prob, cpi->common.fc.ymode_prob);
- vp9_copy(cpi->common.fc.pre_uv_mode_prob, cpi->common.fc.uv_mode_prob);
- vp9_copy(cpi->common.fc.pre_bmode_prob, cpi->common.fc.bmode_prob);
- vp9_copy(cpi->common.fc.pre_sub_mv_ref_prob, cpi->common.fc.sub_mv_ref_prob);
- vp9_copy(cpi->common.fc.pre_mbsplit_prob, cpi->common.fc.mbsplit_prob);
- vp9_copy(cpi->common.fc.pre_i8x8_mode_prob, cpi->common.fc.i8x8_mode_prob);
- cpi->common.fc.pre_nmvc = cpi->common.fc.nmvc;
- vp9_zero(cpi->sub_mv_ref_count);
- vp9_zero(cpi->mbsplit_count);
- vp9_zero(cpi->common.fc.mv_ref_ct)
- vp9_zero(cpi->common.fc.mv_ref_ct_a)
-
- update_coef_probs(cpi, &header_bc);
-
-#ifdef ENTROPY_STATS
- active_section = 2;
-#endif
-
- // Write out the mb_no_coeff_skip flag
- vp9_write_bit(&header_bc, pc->mb_no_coeff_skip);
- if (pc->mb_no_coeff_skip) {
- int k;
-
- vp9_update_skip_probs(cpi);
- for (k = 0; k < MBSKIP_CONTEXTS; ++k)
- vp9_write_literal(&header_bc, pc->mbskip_pred_probs[k], 8);
- }
-
- if (pc->frame_type == KEY_FRAME) {
- if (!pc->kf_ymode_probs_update) {
- vp9_write_literal(&header_bc, pc->kf_ymode_probs_index, 3);
- }
- } else {
- // Update the probabilities used to encode reference frame data
- update_ref_probs(cpi);
-
-#ifdef ENTROPY_STATS
- active_section = 1;
-#endif
-
-#if CONFIG_PRED_FILTER
- // Write the prediction filter mode used for this frame
- vp9_write_literal(&header_bc, pc->pred_filter_mode, 2);
-
- // Write prediction filter on/off probability if signaling at MB level
- if (pc->pred_filter_mode == 2)
- vp9_write_literal(&header_bc, pc->prob_pred_filter_off, 8);
-
-#endif
- if (pc->mcomp_filter_type == SWITCHABLE)
- update_switchable_interp_probs(cpi, &header_bc);
-
- vp9_write_literal(&header_bc, pc->prob_intra_coded, 8);
- vp9_write_literal(&header_bc, pc->prob_last_coded, 8);
- vp9_write_literal(&header_bc, pc->prob_gf_coded, 8);
-
- {
- const int comp_pred_mode = cpi->common.comp_pred_mode;
- const int use_compound_pred = (comp_pred_mode != SINGLE_PREDICTION_ONLY);
- const int use_hybrid_pred = (comp_pred_mode == HYBRID_PREDICTION);
-
- vp9_write(&header_bc, use_compound_pred, 128);
- if (use_compound_pred) {
- vp9_write(&header_bc, use_hybrid_pred, 128);
- if (use_hybrid_pred) {
- for (i = 0; i < COMP_PRED_CONTEXTS; i++) {
- pc->prob_comppred[i] = get_binary_prob(cpi->single_pred_count[i],
- cpi->comp_pred_count[i]);
- vp9_write_literal(&header_bc, pc->prob_comppred[i], 8);
- }
- }
- }
- }
-
- update_mbintra_mode_probs(cpi, &header_bc);
-
-#if CONFIG_NEW_MVREF
-    // Temporary default probabilities for encoding the MV ref id signal
- vpx_memset(xd->mb_mv_ref_id_probs, 192, sizeof(xd->mb_mv_ref_id_probs));
-#endif
-
- vp9_write_nmvprobs(cpi, xd->allow_high_precision_mv, &header_bc);
- }
-
- vp9_stop_encode(&header_bc);
-
- oh.first_partition_length_in_bytes = header_bc.pos;
-
- /* update frame tag */
- {
- int v = (oh.first_partition_length_in_bytes << 5) |
- (oh.show_frame << 4) |
- (oh.version << 1) |
- oh.type;
-
- dest[0] = v;
- dest[1] = v >> 8;
- dest[2] = v >> 16;
- }
-
- *size = VP9_HEADER_SIZE + extra_bytes_packed + header_bc.pos;
- vp9_start_encode(&residual_bc, cx_data + header_bc.pos);
-
- if (pc->frame_type == KEY_FRAME) {
- decide_kf_ymode_entropy(cpi);
- write_kfmodes(cpi, &residual_bc);
- } else {
- pack_inter_mode_mvs(cpi, &residual_bc);
- vp9_update_mode_context(&cpi->common);
- }
-
-
- vp9_stop_encode(&residual_bc);
-
- *size += residual_bc.pos;
-
-}
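The uncompressed frame tag written near the end of vp9_pack_bitstream() packs the first-partition size and three flags into 24 bits, stored least-significant byte first. A self-contained example with made-up field values:

#include <stdio.h>

int main(void) {
  /* assumed values: a 100-byte first partition of a shown key frame */
  unsigned first_partition_len = 100, show_frame = 1, version = 0, type = 0;
  unsigned v = (first_partition_len << 5) | (show_frame << 4) |
               (version << 1) | type;
  unsigned char dest[3];
  dest[0] = v & 0xff;
  dest[1] = (v >> 8) & 0xff;
  dest[2] = (v >> 16) & 0xff;
  printf("%02x %02x %02x\n", dest[0], dest[1], dest[2]);  /* prints: 90 0c 00 */
  return 0;
}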
-
-#ifdef ENTROPY_STATS
-void print_tree_update_probs() {
- int i, j, k, l;
- FILE *f = fopen("coefupdprob.h", "w");
- int Sum;
- fprintf(f, "\n/* Update probabilities for token entropy tree. */\n\n");
-
- fprintf(f, "const vp9_prob\n"
- "vp9_coef_update_probs[BLOCK_TYPES]\n"
- " [COEF_BANDS]\n"
- " [PREV_COEF_CONTEXTS]\n"
- " [ENTROPY_NODES] = {\n");
- for (i = 0; i < BLOCK_TYPES; i++) {
- fprintf(f, " { \n");
- for (j = 0; j < COEF_BANDS; j++) {
- fprintf(f, " {\n");
- for (k = 0; k < PREV_COEF_CONTEXTS; k++) {
- fprintf(f, " {");
- for (l = 0; l < ENTROPY_NODES; l++) {
-          fprintf(f, "%3d, ",
- get_binary_prob(tree_update_hist[i][j][k][l][0],
- tree_update_hist[i][j][k][l][1]));
- }
- fprintf(f, "},\n");
- }
- fprintf(f, " },\n");
- }
- fprintf(f, " },\n");
- }
- fprintf(f, "};\n");
-
- fprintf(f, "const vp9_prob\n"
- "vp9_coef_update_probs_8x8[BLOCK_TYPES_8X8]\n"
- " [COEF_BANDS]\n"
- " [PREV_COEF_CONTEXTS]\n"
- " [ENTROPY_NODES] = {\n");
- for (i = 0; i < BLOCK_TYPES_8X8; i++) {
- fprintf(f, " { \n");
- for (j = 0; j < COEF_BANDS; j++) {
- fprintf(f, " {\n");
- for (k = 0; k < PREV_COEF_CONTEXTS; k++) {
- fprintf(f, " {");
- for (l = 0; l < MAX_ENTROPY_TOKENS - 1; l++) {
-          fprintf(f, "%3d, ",
- get_binary_prob(tree_update_hist_8x8[i][j][k][l][0],
- tree_update_hist_8x8[i][j][k][l][1]));
- }
- fprintf(f, "},\n");
- }
- fprintf(f, " },\n");
- }
- fprintf(f, " },\n");
-  }
-  fprintf(f, "};\n");
-
- fprintf(f, "const vp9_prob\n"
- "vp9_coef_update_probs_16x16[BLOCK_TYPES_16X16]\n"
- " [COEF_BANDS]\n"
- " [PREV_COEF_CONTEXTS]\n"
- " [ENTROPY_NODES] = {\n");
- for (i = 0; i < BLOCK_TYPES_16X16; i++) {
- fprintf(f, " { \n");
- for (j = 0; j < COEF_BANDS; j++) {
- fprintf(f, " {\n");
- for (k = 0; k < PREV_COEF_CONTEXTS; k++) {
- fprintf(f, " {");
- for (l = 0; l < MAX_ENTROPY_TOKENS - 1; l++) {
-          fprintf(f, "%3d, ",
- get_binary_prob(tree_update_hist_16x16[i][j][k][l][0],
- tree_update_hist_16x16[i][j][k][l][1]));
- }
- fprintf(f, "},\n");
- }
- fprintf(f, " },\n");
- }
- fprintf(f, " },\n");
-  }
-  fprintf(f, "};\n");
-
- fclose(f);
- f = fopen("treeupdate.bin", "wb");
- fwrite(tree_update_hist, sizeof(tree_update_hist), 1, f);
- fwrite(tree_update_hist_8x8, sizeof(tree_update_hist_8x8), 1, f);
- fwrite(tree_update_hist_16x16, sizeof(tree_update_hist_16x16), 1, f);
- fclose(f);
-}
-#endif
--- a/vp8/encoder/bitstream.h
+++ /dev/null
@@ -1,17 +1,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#ifndef __INC_BITSTREAM_H
-#define __INC_BITSTREAM_H
-
-void vp9_update_skip_probs(VP9_COMP *cpi);
-
-#endif
--- a/vp8/encoder/block.h
+++ /dev/null
@@ -1,184 +1,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#ifndef __INC_BLOCK_H
-#define __INC_BLOCK_H
-
-#include "vp8/common/onyx.h"
-#include "vp8/common/entropymv.h"
-#include "vp8/common/entropy.h"
-#include "vpx_ports/mem.h"
-#include "vp8/common/onyxc_int.h"
-
-// motion search site
-typedef struct {
- MV mv;
- int offset;
-} search_site;
-
-typedef struct block {
- // 16 Y blocks, 4 U blocks, 4 V blocks each with 16 entries
- short *src_diff;
- short *coeff;
-
- // 16 Y blocks, 4 U blocks, 4 V blocks each with 16 entries
- short *quant;
- short *quant_fast; // fast quant deprecated for now
- unsigned char *quant_shift;
- short *zbin;
- short *zbin_8x8;
- short *zbin_16x16;
- short *zrun_zbin_boost;
- short *zrun_zbin_boost_8x8;
- short *zrun_zbin_boost_16x16;
- short *round;
-
- // Zbin Over Quant value
- short zbin_extra;
-
- unsigned char **base_src;
- unsigned char **base_second_src;
- int src;
- int src_stride;
-
- int eob_max_offset;
- int eob_max_offset_8x8;
- int eob_max_offset_16x16;
-} BLOCK;
-
-typedef struct {
- int count;
- struct {
- B_PREDICTION_MODE mode;
- int_mv mv;
- int_mv second_mv;
- } bmi[16];
-} PARTITION_INFO;
-
-// Structure to hold snapshot of coding context during the mode picking process
-// TODO Do we need all of these?
-typedef struct {
- MODE_INFO mic;
- PARTITION_INFO partition_info;
- int_mv best_ref_mv;
- int_mv second_best_ref_mv;
-#if CONFIG_NEWBESTREFMV || CONFIG_NEW_MVREF
- int_mv ref_mvs[MAX_REF_FRAMES][MAX_MV_REFS];
-#endif
- int rate;
- int distortion;
- int64_t intra_error;
- int best_mode_index;
- int rddiv;
- int rdmult;
- int hybrid_pred_diff;
- int comp_pred_diff;
- int single_pred_diff;
- int64_t txfm_rd_diff[NB_TXFM_MODES];
-} PICK_MODE_CONTEXT;
-
-typedef struct macroblock {
- DECLARE_ALIGNED(16, short, src_diff[400]); // 16x16 Y 8x8 U 8x8 V 4x4 2nd Y
- DECLARE_ALIGNED(16, short, coeff[400]); // 16x16 Y 8x8 U 8x8 V 4x4 2nd Y
- DECLARE_ALIGNED(16, unsigned char, thismb[256]); // 16x16 Y
-
- unsigned char *thismb_ptr;
- // 16 Y blocks, 4 U blocks, 4 V blocks,
- // 1 DC 2nd order block each with 16 entries
- BLOCK block[25];
-
- YV12_BUFFER_CONFIG src;
-
- MACROBLOCKD e_mbd;
- PARTITION_INFO *partition_info; /* work pointer */
- PARTITION_INFO *pi; /* Corresponds to upper left visible macroblock */
- PARTITION_INFO *pip; /* Base of allocated array */
-
- search_site *ss;
- int ss_count;
- int searches_per_step;
-
- int errorperbit;
- int sadperbit16;
- int sadperbit4;
- int rddiv;
- int rdmult;
- unsigned int *mb_activity_ptr;
- int *mb_norm_activity_ptr;
- signed int act_zbin_adj;
-
- int nmvjointcost[MV_JOINTS];
- int nmvcosts[2][MV_VALS];
- int *nmvcost[2];
- int nmvcosts_hp[2][MV_VALS];
- int *nmvcost_hp[2];
-
- int nmvjointsadcost[MV_JOINTS];
- int nmvsadcosts[2][MV_VALS];
- int *nmvsadcost[2];
- int nmvsadcosts_hp[2][MV_VALS];
- int *nmvsadcost_hp[2];
-
- int mbmode_cost[2][MB_MODE_COUNT];
- int intra_uv_mode_cost[2][MB_MODE_COUNT];
- int bmode_costs[VP9_BINTRAMODES][VP9_BINTRAMODES][VP9_BINTRAMODES];
- int i8x8_mode_costs[MB_MODE_COUNT];
- int inter_bmode_costs[B_MODE_COUNT];
- int switchable_interp_costs[VP9_SWITCHABLE_FILTERS + 1]
- [VP9_SWITCHABLE_FILTERS];
-
- // These define limits to motion vector components to prevent them
- // from extending outside the UMV borders
- int mv_col_min;
- int mv_col_max;
- int mv_row_min;
- int mv_row_max;
-
- int skip;
-
- int encode_breakout;
-
- // char * gf_active_ptr;
- signed char *gf_active_ptr;
-
- unsigned char *active_ptr;
-
- unsigned int token_costs[TX_SIZE_MAX][BLOCK_TYPES][COEF_BANDS]
- [PREV_COEF_CONTEXTS][MAX_ENTROPY_TOKENS];
- unsigned int hybrid_token_costs[TX_SIZE_MAX][BLOCK_TYPES][COEF_BANDS]
- [PREV_COEF_CONTEXTS][MAX_ENTROPY_TOKENS];
-
- int optimize;
-
- // Structure to hold context for each of the 4 MBs within a SB:
- // when encoded as 4 independent MBs:
- PICK_MODE_CONTEXT mb_context[4];
-#if CONFIG_SUPERBLOCKS
- // when 4 MBs share coding parameters:
- PICK_MODE_CONTEXT sb_context[4];
-#endif
-
- void (*vp9_short_fdct4x4)(short *input, short *output, int pitch);
- void (*vp9_short_fdct8x4)(short *input, short *output, int pitch);
- void (*short_walsh4x4)(short *input, short *output, int pitch);
- void (*quantize_b_4x4)(BLOCK *b, BLOCKD *d);
- void (*quantize_b_4x4_pair)(BLOCK *b1, BLOCK *b2, BLOCKD *d0, BLOCKD *d1);
- void (*vp9_short_fdct8x8)(short *input, short *output, int pitch);
- void (*vp9_short_fdct16x16)(short *input, short *output, int pitch);
- void (*short_fhaar2x2)(short *input, short *output, int pitch);
- void (*quantize_b_16x16)(BLOCK *b, BLOCKD *d);
- void (*quantize_b_8x8)(BLOCK *b, BLOCKD *d);
- void (*quantize_b_2x2)(BLOCK *b, BLOCKD *d);
-
-} MACROBLOCK;
-
-
-#endif
--- a/vp8/encoder/boolhuff.c
+++ /dev/null
@@ -1,153 +1,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include "boolhuff.h"
-
-#if defined(SECTIONBITS_OUTPUT)
-unsigned __int64 Sectionbits[500];
-
-#endif
-
-#ifdef ENTROPY_STATS
-unsigned int active_section = 0;
-#endif
-
-const unsigned int vp9_prob_cost[256] = {
- 2047, 2047, 1791, 1641, 1535, 1452, 1385, 1328, 1279, 1235, 1196, 1161, 1129, 1099, 1072, 1046,
- 1023, 1000, 979, 959, 940, 922, 905, 889, 873, 858, 843, 829, 816, 803, 790, 778,
- 767, 755, 744, 733, 723, 713, 703, 693, 684, 675, 666, 657, 649, 641, 633, 625,
- 617, 609, 602, 594, 587, 580, 573, 567, 560, 553, 547, 541, 534, 528, 522, 516,
- 511, 505, 499, 494, 488, 483, 477, 472, 467, 462, 457, 452, 447, 442, 437, 433,
- 428, 424, 419, 415, 410, 406, 401, 397, 393, 389, 385, 381, 377, 373, 369, 365,
- 361, 357, 353, 349, 346, 342, 338, 335, 331, 328, 324, 321, 317, 314, 311, 307,
- 304, 301, 297, 294, 291, 288, 285, 281, 278, 275, 272, 269, 266, 263, 260, 257,
- 255, 252, 249, 246, 243, 240, 238, 235, 232, 229, 227, 224, 221, 219, 216, 214,
- 211, 208, 206, 203, 201, 198, 196, 194, 191, 189, 186, 184, 181, 179, 177, 174,
- 172, 170, 168, 165, 163, 161, 159, 156, 154, 152, 150, 148, 145, 143, 141, 139,
- 137, 135, 133, 131, 129, 127, 125, 123, 121, 119, 117, 115, 113, 111, 109, 107,
- 105, 103, 101, 99, 97, 95, 93, 92, 90, 88, 86, 84, 82, 81, 79, 77,
- 75, 73, 72, 70, 68, 66, 65, 63, 61, 60, 58, 56, 55, 53, 51, 50,
- 48, 46, 45, 43, 41, 40, 38, 37, 35, 33, 32, 30, 29, 27, 25, 24,
- 22, 21, 19, 18, 16, 15, 13, 12, 10, 9, 7, 6, 4, 3, 1, 1
-};
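The table above stores, for each probability p in 1..255, roughly the cost of coding the more probable branch, i.e. about -log2(p/256) expressed in 1/256-bit units (entry 128 is ~256, one bit; entry 1 is ~2048, eight bits). A rough reconstruction under that interpretation; the shipped table differs from this by small rounding offsets, so treat it as a sketch:

#include <math.h>
#include <stdio.h>

int main(void) {
  int p;
  for (p = 1; p < 256; p++) {
    const int approx = (int)(256.0 * log2(256.0 / p));
    printf("vp9_prob_cost[%3d] ~ %4d\n", p, approx);
  }
  return 0;
}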
-
-void vp9_start_encode(BOOL_CODER *br, unsigned char *source) {
-
- br->lowvalue = 0;
- br->range = 255;
- br->value = 0;
- br->count = -24;
- br->buffer = source;
- br->pos = 0;
-}
-
-void vp9_stop_encode(BOOL_CODER *br) {
- int i;
-
- for (i = 0; i < 32; i++)
- encode_bool(br, 0, 128);
-}
-
-
-void vp9_encode_value(BOOL_CODER *br, int data, int bits) {
- int bit;
-
- for (bit = bits - 1; bit >= 0; bit--)
- encode_bool(br, (1 & (data >> bit)), 0x80);
-}
-
-int vp9_recenter_nonneg(int v, int m) {
- if (v > (m << 1)) return v;
- else if (v >= m) return ((v - m) << 1);
- else return ((m - v) << 1) - 1;
-}
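vp9_recenter_nonneg() above remaps a value relative to a predictor m so that values close to m become small non-negative codes, which the subexponential routines below can then write in few bits. A small harness exercising it (the function is re-declared so the sketch is self-contained):

#include <stdio.h>

int vp9_recenter_nonneg(int v, int m);   /* defined above */

int main(void) {
  /* with predictor m = 5: exact hit -> 0, +1 -> 2, -1 -> 1,
   * and anything larger than 2*m is passed through unchanged */
  printf("%d %d %d %d\n",
         vp9_recenter_nonneg(5, 5),    /* 0  */
         vp9_recenter_nonneg(6, 5),    /* 2  */
         vp9_recenter_nonneg(4, 5),    /* 1  */
         vp9_recenter_nonneg(11, 5));  /* 11 */
  return 0;
}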
-
-static int get_unsigned_bits(unsigned num_values) {
- int cat = 0;
- if ((num_values--) <= 1) return 0;
- while (num_values > 0) {
- cat++;
- num_values >>= 1;
- }
- return cat;
-}
-
-void vp9_encode_uniform(BOOL_CODER *br, int v, int n) {
- int l = get_unsigned_bits(n);
- int m;
- if (l == 0) return;
- m = (1 << l) - n;
- if (v < m)
- vp9_encode_value(br, v, l - 1);
- else {
- vp9_encode_value(br, m + ((v - m) >> 1), l - 1);
- vp9_encode_value(br, (v - m) & 1, 1);
- }
-}
-
-int vp9_count_uniform(int v, int n) {
- int l = get_unsigned_bits(n);
- int m;
- if (l == 0) return 0;
- m = (1 << l) - n;
- if (v < m)
- return l - 1;
- else
- return l;
-}
-
-void vp9_encode_term_subexp(BOOL_CODER *br, int word, int k, int num_syms) {
- int i = 0;
- int mk = 0;
- while (1) {
- int b = (i ? k + i - 1 : k);
- int a = (1 << b);
- if (num_syms <= mk + 3 * a) {
- vp9_encode_uniform(br, word - mk, num_syms - mk);
- break;
- } else {
- int t = (word >= mk + a);
- vp9_encode_value(br, t, 1);
- if (t) {
- i = i + 1;
- mk += a;
- } else {
- vp9_encode_value(br, word - mk, b);
- break;
- }
- }
- }
-}
-
-int vp9_count_term_subexp(int word, int k, int num_syms) {
- int count = 0;
- int i = 0;
- int mk = 0;
- while (1) {
- int b = (i ? k + i - 1 : k);
- int a = (1 << b);
- if (num_syms <= mk + 3 * a) {
- count += vp9_count_uniform(word - mk, num_syms - mk);
- break;
- } else {
- int t = (word >= mk + a);
- count++;
- if (t) {
- i = i + 1;
- mk += a;
- } else {
- count += b;
- break;
- }
- }
- }
- return count;
-}
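vp9_count_term_subexp() mirrors vp9_encode_term_subexp() and returns the number of raw bits the encoder would spend on a given symbol. A small usage sketch with arbitrarily chosen parameters (k = 3, a 254-symbol alphabet) that just prints those counts; small symbols fall in the first bucket and cost the fewest bits:

#include <stdio.h>

int vp9_count_term_subexp(int word, int k, int num_syms);   /* defined above */

int main(void) {
  int word;
  for (word = 0; word < 20; word++)
    printf("symbol %2d -> %d bits\n",
           word, vp9_count_term_subexp(word, 3, 254));
  return 0;
}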
--- a/vp8/encoder/boolhuff.h
+++ /dev/null
@@ -1,111 +1,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-
-/****************************************************************************
-*
-* Module Title : boolhuff.h
-*
-* Description : Bool Coder header file.
-*
-****************************************************************************/
-#ifndef __INC_BOOLHUFF_H
-#define __INC_BOOLHUFF_H
-
-#include "vpx_ports/mem.h"
-
-typedef struct {
- unsigned int lowvalue;
- unsigned int range;
- unsigned int value;
- int count;
- unsigned int pos;
- unsigned char *buffer;
-
-  // Variables used to track bit costs without outputting to the bitstream
- unsigned int measure_cost;
- unsigned long bit_counter;
-} BOOL_CODER;
-
-extern void vp9_start_encode(BOOL_CODER *bc, unsigned char *buffer);
-
-extern void vp9_encode_value(BOOL_CODER *br, int data, int bits);
-extern void vp9_stop_encode(BOOL_CODER *bc);
-extern const unsigned int vp9_prob_cost[256];
-
-extern void vp9_encode_uniform(BOOL_CODER *bc, int v, int n);
-extern void vp9_encode_term_subexp(BOOL_CODER *bc, int v, int k, int n);
-extern int vp9_count_uniform(int v, int n);
-extern int vp9_count_term_subexp(int v, int k, int n);
-extern int vp9_recenter_nonneg(int v, int m);
-
-DECLARE_ALIGNED(16, extern const unsigned char, vp9_norm[256]);
-
-
-static void encode_bool(BOOL_CODER *br, int bit, int probability) {
- unsigned int split;
- int count = br->count;
- unsigned int range = br->range;
- unsigned int lowvalue = br->lowvalue;
- register unsigned int shift;
-
-#ifdef ENTROPY_STATS
-#if defined(SECTIONBITS_OUTPUT)
-
- if (bit)
- Sectionbits[active_section] += vp9_prob_cost[255 - probability];
- else
- Sectionbits[active_section] += vp9_prob_cost[probability];
-
-#endif
-#endif
-
- split = 1 + (((range - 1) * probability) >> 8);
-
- range = split;
-
- if (bit) {
- lowvalue += split;
- range = br->range - split;
- }
-
- shift = vp9_norm[range];
-
- range <<= shift;
- count += shift;
-
- if (count >= 0) {
- int offset = shift - count;
-
- if ((lowvalue << (offset - 1)) & 0x80000000) {
- int x = br->pos - 1;
-
- while (x >= 0 && br->buffer[x] == 0xff) {
- br->buffer[x] = (unsigned char)0;
- x--;
- }
-
- br->buffer[x] += 1;
- }
-
- br->buffer[br->pos++] = (lowvalue >> (24 - offset));
- lowvalue <<= offset;
- shift = count;
- lowvalue &= 0xffffff;
- count -= 8;
- }
-
- lowvalue <<= shift;
- br->count = count;
- br->lowvalue = lowvalue;
- br->range = range;
-}
-
-#endif
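encode_bool() above narrows the coder's range in proportion to the probability of the bit being 0. A standalone illustration of the split arithmetic for one example state (range 255, probability 192):

#include <stdio.h>

int main(void) {
  const unsigned range = 255;        /* current coder range              */
  const unsigned probability = 192;  /* chance of a 0, scaled to 1..255  */
  const unsigned split = 1 + (((range - 1) * probability) >> 8);

  /* coding a 0 keeps the lower sub-range (191 of 255, under one bit after
   * renormalisation); coding a 1 keeps the upper sub-range (64 of 255,
   * roughly two bits) */
  printf("split=%u  range_if_0=%u  range_if_1=%u\n",
         split, split, range - split);   /* 191, 191, 64 */
  return 0;
}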
--- a/vp8/encoder/dct.c
+++ /dev/null
@@ -1,1109 +1,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include <assert.h>
-#include <math.h>
-#include "vpx_ports/config.h"
-#include "vp8/common/idct.h"
-#include "vp8/common/systemdependent.h"
-
-#include "vp8/common/blockd.h"
-
-// TODO: these transforms can be converted into integer forms to reduce
-// the complexity
-static const float dct_4[16] = {
- 0.500000000000000, 0.500000000000000, 0.500000000000000, 0.500000000000000,
- 0.653281482438188, 0.270598050073099, -0.270598050073099, -0.653281482438188,
- 0.500000000000000, -0.500000000000000, -0.500000000000000, 0.500000000000000,
- 0.270598050073099, -0.653281482438188, 0.653281482438188, -0.270598050073099
-};
-
-static const float adst_4[16] = {
- 0.228013428883779, 0.428525073124360, 0.577350269189626, 0.656538502008139,
- 0.577350269189626, 0.577350269189626, 0.000000000000000, -0.577350269189626,
- 0.656538502008139, -0.228013428883779, -0.577350269189626, 0.428525073124359,
- 0.428525073124360, -0.656538502008139, 0.577350269189626, -0.228013428883779
-};
-
-static const float dct_8[64] = {
- 0.353553390593274, 0.353553390593274, 0.353553390593274, 0.353553390593274,
- 0.353553390593274, 0.353553390593274, 0.353553390593274, 0.353553390593274,
- 0.490392640201615, 0.415734806151273, 0.277785116509801, 0.097545161008064,
- -0.097545161008064, -0.277785116509801, -0.415734806151273, -0.490392640201615,
- 0.461939766255643, 0.191341716182545, -0.191341716182545, -0.461939766255643,
- -0.461939766255643, -0.191341716182545, 0.191341716182545, 0.461939766255643,
- 0.415734806151273, -0.097545161008064, -0.490392640201615, -0.277785116509801,
- 0.277785116509801, 0.490392640201615, 0.097545161008064, -0.415734806151273,
- 0.353553390593274, -0.353553390593274, -0.353553390593274, 0.353553390593274,
- 0.353553390593274, -0.353553390593274, -0.353553390593274, 0.353553390593274,
- 0.277785116509801, -0.490392640201615, 0.097545161008064, 0.415734806151273,
- -0.415734806151273, -0.097545161008064, 0.490392640201615, -0.277785116509801,
- 0.191341716182545, -0.461939766255643, 0.461939766255643, -0.191341716182545,
- -0.191341716182545, 0.461939766255643, -0.461939766255643, 0.191341716182545,
- 0.097545161008064, -0.277785116509801, 0.415734806151273, -0.490392640201615,
- 0.490392640201615, -0.415734806151273, 0.277785116509801, -0.097545161008064
-};
-
-static const float adst_8[64] = {
- 0.089131608307533, 0.175227946595735, 0.255357107325376, 0.326790388032145,
- 0.387095214016349, 0.434217976756762, 0.466553967085785, 0.483002021635509,
- 0.255357107325376, 0.434217976756762, 0.483002021635509, 0.387095214016349,
- 0.175227946595735, -0.089131608307533, -0.326790388032145, -0.466553967085785,
- 0.387095214016349, 0.466553967085785, 0.175227946595735, -0.255357107325376,
- -0.483002021635509, -0.326790388032145, 0.089131608307533, 0.434217976756762,
- 0.466553967085785, 0.255357107325376, -0.326790388032145, -0.434217976756762,
- 0.089131608307533, 0.483002021635509, 0.175227946595735, -0.387095214016348,
- 0.483002021635509, -0.089131608307533, -0.466553967085785, 0.175227946595735,
- 0.434217976756762, -0.255357107325376, -0.387095214016348, 0.326790388032145,
- 0.434217976756762, -0.387095214016348, -0.089131608307533, 0.466553967085786,
- -0.326790388032145, -0.175227946595735, 0.483002021635509, -0.255357107325375,
- 0.326790388032145, -0.483002021635509, 0.387095214016349, -0.089131608307534,
- -0.255357107325377, 0.466553967085785, -0.434217976756762, 0.175227946595736,
- 0.175227946595735, -0.326790388032145, 0.434217976756762, -0.483002021635509,
- 0.466553967085785, -0.387095214016348, 0.255357107325376, -0.089131608307532
-};
-
-/* Converted the transforms to integers. */
-static const int16_t dct_i4[16] = {
- 16384, 16384, 16384, 16384,
- 21407, 8867, -8867, -21407,
- 16384, -16384, -16384, 16384,
- 8867, -21407, 21407, -8867
-};
-
-static const int16_t adst_i4[16] = {
- 7472, 14042, 18919, 21513,
- 18919, 18919, 0, -18919,
- 21513, -7472, -18919, 14042,
- 14042, -21513, 18919, -7472
-};
-
-static const int16_t dct_i8[64] = {
- 11585, 11585, 11585, 11585,
- 11585, 11585, 11585, 11585,
- 16069, 13623, 9102, 3196,
- -3196, -9102, -13623, -16069,
- 15137, 6270, -6270, -15137,
- -15137, -6270, 6270, 15137,
- 13623, -3196, -16069, -9102,
- 9102, 16069, 3196, -13623,
- 11585, -11585, -11585, 11585,
- 11585, -11585, -11585, 11585,
- 9102, -16069, 3196, 13623,
- -13623, -3196, 16069, -9102,
- 6270, -15137, 15137, -6270,
- -6270, 15137, -15137, 6270,
- 3196, -9102, 13623, -16069,
- 16069, -13623, 9102, -3196
-};
-
-static const int16_t adst_i8[64] = {
- 2921, 5742, 8368, 10708,
- 12684, 14228, 15288, 15827,
- 8368, 14228, 15827, 12684,
- 5742, -2921, -10708, -15288,
- 12684, 15288, 5742, -8368,
- -15827, -10708, 2921, 14228,
- 15288, 8368, -10708, -14228,
- 2921, 15827, 5742, -12684,
- 15827, -2921, -15288, 5742,
- 14228, -8368, -12684, 10708,
- 14228, -12684, -2921, 15288,
- -10708, -5742, 15827, -8368,
- 10708, -15827, 12684, -2921,
- -8368, 15288, -14228, 5742,
- 5742, -10708, 14228, -15827,
- 15288, -12684, 8368, -2921
-};
-
-static const float dct_16[256] = {
- 0.250000, 0.250000, 0.250000, 0.250000, 0.250000, 0.250000, 0.250000, 0.250000,
- 0.250000, 0.250000, 0.250000, 0.250000, 0.250000, 0.250000, 0.250000, 0.250000,
- 0.351851, 0.338330, 0.311806, 0.273300, 0.224292, 0.166664, 0.102631, 0.034654,
- -0.034654, -0.102631, -0.166664, -0.224292, -0.273300, -0.311806, -0.338330, -0.351851,
- 0.346760, 0.293969, 0.196424, 0.068975, -0.068975, -0.196424, -0.293969, -0.346760,
- -0.346760, -0.293969, -0.196424, -0.068975, 0.068975, 0.196424, 0.293969, 0.346760,
- 0.338330, 0.224292, 0.034654, -0.166664, -0.311806, -0.351851, -0.273300, -0.102631,
- 0.102631, 0.273300, 0.351851, 0.311806, 0.166664, -0.034654, -0.224292, -0.338330,
- 0.326641, 0.135299, -0.135299, -0.326641, -0.326641, -0.135299, 0.135299, 0.326641,
- 0.326641, 0.135299, -0.135299, -0.326641, -0.326641, -0.135299, 0.135299, 0.326641,
- 0.311806, 0.034654, -0.273300, -0.338330, -0.102631, 0.224292, 0.351851, 0.166664,
- -0.166664, -0.351851, -0.224292, 0.102631, 0.338330, 0.273300, -0.034654, -0.311806,
- 0.293969, -0.068975, -0.346760, -0.196424, 0.196424, 0.346760, 0.068975, -0.293969,
- -0.293969, 0.068975, 0.346760, 0.196424, -0.196424, -0.346760, -0.068975, 0.293969,
- 0.273300, -0.166664, -0.338330, 0.034654, 0.351851, 0.102631, -0.311806, -0.224292,
- 0.224292, 0.311806, -0.102631, -0.351851, -0.034654, 0.338330, 0.166664, -0.273300,
- 0.250000, -0.250000, -0.250000, 0.250000, 0.250000, -0.250000, -0.250000, 0.250000,
- 0.250000, -0.250000, -0.250000, 0.250000, 0.250000, -0.250000, -0.250000, 0.250000,
- 0.224292, -0.311806, -0.102631, 0.351851, -0.034654, -0.338330, 0.166664, 0.273300,
- -0.273300, -0.166664, 0.338330, 0.034654, -0.351851, 0.102631, 0.311806, -0.224292,
- 0.196424, -0.346760, 0.068975, 0.293969, -0.293969, -0.068975, 0.346760, -0.196424,
- -0.196424, 0.346760, -0.068975, -0.293969, 0.293969, 0.068975, -0.346760, 0.196424,
- 0.166664, -0.351851, 0.224292, 0.102631, -0.338330, 0.273300, 0.034654, -0.311806,
- 0.311806, -0.034654, -0.273300, 0.338330, -0.102631, -0.224292, 0.351851, -0.166664,
- 0.135299, -0.326641, 0.326641, -0.135299, -0.135299, 0.326641, -0.326641, 0.135299,
- 0.135299, -0.326641, 0.326641, -0.135299, -0.135299, 0.326641, -0.326641, 0.135299,
- 0.102631, -0.273300, 0.351851, -0.311806, 0.166664, 0.034654, -0.224292, 0.338330,
- -0.338330, 0.224292, -0.034654, -0.166664, 0.311806, -0.351851, 0.273300, -0.102631,
- 0.068975, -0.196424, 0.293969, -0.346760, 0.346760, -0.293969, 0.196424, -0.068975,
- -0.068975, 0.196424, -0.293969, 0.346760, -0.346760, 0.293969, -0.196424, 0.068975,
- 0.034654, -0.102631, 0.166664, -0.224292, 0.273300, -0.311806, 0.338330, -0.351851,
- 0.351851, -0.338330, 0.311806, -0.273300, 0.224292, -0.166664, 0.102631, -0.034654
-};
-
-static const float adst_16[256] = {
- 0.033094, 0.065889, 0.098087, 0.129396, 0.159534, 0.188227, 0.215215, 0.240255,
- 0.263118, 0.283599, 0.301511, 0.316693, 0.329007, 0.338341, 0.344612, 0.347761,
- 0.098087, 0.188227, 0.263118, 0.316693, 0.344612, 0.344612, 0.316693, 0.263118,
- 0.188227, 0.098087, 0.000000, -0.098087, -0.188227, -0.263118, -0.316693, -0.344612,
- 0.159534, 0.283599, 0.344612, 0.329007, 0.240255, 0.098087, -0.065889, -0.215215,
- -0.316693, -0.347761, -0.301511, -0.188227, -0.033094, 0.129396, 0.263118, 0.338341,
- 0.215215, 0.338341, 0.316693, 0.159534, -0.065889, -0.263118, -0.347761, -0.283599,
- -0.098087, 0.129396, 0.301511, 0.344612, 0.240255, 0.033094, -0.188227, -0.329007,
- 0.263118, 0.344612, 0.188227, -0.098087, -0.316693, -0.316693, -0.098087, 0.188227,
- 0.344612, 0.263118, 0.000000, -0.263118, -0.344612, -0.188227, 0.098087, 0.316693,
- 0.301511, 0.301511, 0.000000, -0.301511, -0.301511, -0.000000, 0.301511, 0.301511,
- 0.000000, -0.301511, -0.301511, -0.000000, 0.301511, 0.301511, 0.000000, -0.301511,
- 0.329007, 0.215215, -0.188227, -0.338341, -0.033094, 0.316693, 0.240255, -0.159534,
- -0.344612, -0.065889, 0.301511, 0.263118, -0.129396, -0.347761, -0.098087, 0.283599,
- 0.344612, 0.098087, -0.316693, -0.188227, 0.263118, 0.263118, -0.188227, -0.316693,
- 0.098087, 0.344612, 0.000000, -0.344612, -0.098087, 0.316693, 0.188227, -0.263118,
- 0.347761, -0.033094, -0.344612, 0.065889, 0.338341, -0.098087, -0.329007, 0.129396,
- 0.316693, -0.159534, -0.301511, 0.188227, 0.283599, -0.215215, -0.263118, 0.240255,
- 0.338341, -0.159534, -0.263118, 0.283599, 0.129396, -0.344612, 0.033094, 0.329007,
- -0.188227, -0.240255, 0.301511, 0.098087, -0.347761, 0.065889, 0.316693, -0.215215,
- 0.316693, -0.263118, -0.098087, 0.344612, -0.188227, -0.188227, 0.344612, -0.098087,
- -0.263118, 0.316693, 0.000000, -0.316693, 0.263118, 0.098087, -0.344612, 0.188227,
- 0.283599, -0.329007, 0.098087, 0.215215, -0.347761, 0.188227, 0.129396, -0.338341,
- 0.263118, 0.033094, -0.301511, 0.316693, -0.065889, -0.240255, 0.344612, -0.159534,
- 0.240255, -0.347761, 0.263118, -0.033094, -0.215215, 0.344612, -0.283599, 0.065889,
- 0.188227, -0.338341, 0.301511, -0.098087, -0.159534, 0.329007, -0.316693, 0.129396,
- 0.188227, -0.316693, 0.344612, -0.263118, 0.098087, 0.098087, -0.263118, 0.344612,
- -0.316693, 0.188227, 0.000000, -0.188227, 0.316693, -0.344612, 0.263118, -0.098087,
- 0.129396, -0.240255, 0.316693, -0.347761, 0.329007, -0.263118, 0.159534, -0.033094,
- -0.098087, 0.215215, -0.301511, 0.344612, -0.338341, 0.283599, -0.188227, 0.065889,
- 0.065889, -0.129396, 0.188227, -0.240255, 0.283599, -0.316693, 0.338341, -0.347761,
- 0.344612, -0.329007, 0.301511, -0.263118, 0.215215, -0.159534, 0.098087, -0.033094
-};
-
-/* Converted the transforms to integers. */
-static const int16_t dct_i16[256] = {
- 8192, 8192, 8192, 8192, 8192, 8192, 8192, 8192,
- 8192, 8192, 8192, 8192, 8192, 8192, 8192, 8192,
- 11529, 11086, 10217, 8955, 7350, 5461, 3363, 1136,
- -1136, -3363, -5461, -7350, -8955, -10217, -11086, -11529,
- 11363, 9633, 6436, 2260, -2260, -6436, -9633, -11363,
- -11363, -9633, -6436, -2260, 2260, 6436, 9633, 11363,
- 11086, 7350, 1136, -5461, -10217, -11529, -8955, -3363,
- 3363, 8955, 11529, 10217, 5461, -1136, -7350, -11086,
- 10703, 4433, -4433, -10703, -10703, -4433, 4433, 10703,
- 10703, 4433, -4433, -10703, -10703, -4433, 4433, 10703,
- 10217, 1136, -8955, -11086, -3363, 7350, 11529, 5461,
- -5461, -11529, -7350, 3363, 11086, 8955, -1136, -10217,
- 9633, -2260, -11363, -6436, 6436, 11363, 2260, -9633,
- -9633, 2260, 11363, 6436, -6436, -11363, -2260, 9633,
- 8955, -5461, -11086, 1136, 11529, 3363, -10217, -7350,
- 7350, 10217, -3363, -11529, -1136, 11086, 5461, -8955,
- 8192, -8192, -8192, 8192, 8192, -8192, -8192, 8192,
- 8192, -8192, -8192, 8192, 8192, -8192, -8192, 8192,
- 7350, -10217, -3363, 11529, -1136, -11086, 5461, 8955,
- -8955, -5461, 11086, 1136, -11529, 3363, 10217, -7350,
- 6436, -11363, 2260, 9633, -9633, -2260, 11363, -6436,
- -6436, 11363, -2260, -9633, 9633, 2260, -11363, 6436,
- 5461, -11529, 7350, 3363, -11086, 8955, 1136, -10217,
- 10217, -1136, -8955, 11086, -3363, -7350, 11529, -5461,
- 4433, -10703, 10703, -4433, -4433, 10703, -10703, 4433,
- 4433, -10703, 10703, -4433, -4433, 10703, -10703, 4433,
- 3363, -8955, 11529, -10217, 5461, 1136, -7350, 11086,
- -11086, 7350, -1136, -5461, 10217, -11529, 8955, -3363,
- 2260, -6436, 9633, -11363, 11363, -9633, 6436, -2260,
- -2260, 6436, -9633, 11363, -11363, 9633, -6436, 2260,
- 1136, -3363, 5461, -7350, 8955, -10217, 11086, -11529,
- 11529, -11086, 10217, -8955, 7350, -5461, 3363, -1136
-};
-
-static const int16_t adst_i16[256] = {
- 1084, 2159, 3214, 4240, 5228, 6168, 7052, 7873,
- 8622, 9293, 9880, 10377, 10781, 11087, 11292, 11395,
- 3214, 6168, 8622, 10377, 11292, 11292, 10377, 8622,
- 6168, 3214, 0, -3214, -6168, -8622, -10377, -11292,
- 5228, 9293, 11292, 10781, 7873, 3214, -2159, -7052,
- -10377, -11395, -9880, -6168, -1084, 4240, 8622, 11087,
- 7052, 11087, 10377, 5228, -2159, -8622, -11395, -9293,
- -3214, 4240, 9880, 11292, 7873, 1084, -6168, -10781,
- 8622, 11292, 6168, -3214, -10377, -10377, -3214, 6168,
- 11292, 8622, 0, -8622, -11292, -6168, 3214, 10377,
- 9880, 9880, 0, -9880, -9880, 0, 9880, 9880,
- 0, -9880, -9880, 0, 9880, 9880, 0, -9880,
- 10781, 7052, -6168, -11087, -1084, 10377, 7873, -5228,
- -11292, -2159, 9880, 8622, -4240, -11395, -3214, 9293,
- 11292, 3214, -10377, -6168, 8622, 8622, -6168, -10377,
- 3214, 11292, 0, -11292, -3214, 10377, 6168, -8622,
- 11395, -1084, -11292, 2159, 11087, -3214, -10781, 4240,
- 10377, -5228, -9880, 6168, 9293, -7052, -8622, 7873,
- 11087, -5228, -8622, 9293, 4240, -11292, 1084, 10781,
- -6168, -7873, 9880, 3214, -11395, 2159, 10377, -7052,
- 10377, -8622, -3214, 11292, -6168, -6168, 11292, -3214,
- -8622, 10377, 0, -10377, 8622, 3214, -11292, 6168,
- 9293, -10781, 3214, 7052, -11395, 6168, 4240, -11087,
- 8622, 1084, -9880, 10377, -2159, -7873, 11292, -5228,
- 7873, -11395, 8622, -1084, -7052, 11292, -9293, 2159,
- 6168, -11087, 9880, -3214, -5228, 10781, -10377, 4240,
- 6168, -10377, 11292, -8622, 3214, 3214, -8622, 11292,
- -10377, 6168, 0, -6168, 10377, -11292, 8622, -3214,
- 4240, -7873, 10377, -11395, 10781, -8622, 5228, -1084,
- -3214, 7052, -9880, 11292, -11087, 9293, -6168, 2159,
- 2159, -4240, 6168, -7873, 9293, -10377, 11087, -11395,
- 11292, -10781, 9880, -8622, 7052, -5228, 3214, -1084
-};
-
-static const int xC1S7 = 16069;
-static const int xC2S6 = 15137;
-static const int xC3S5 = 13623;
-static const int xC4S4 = 11585;
-static const int xC5S3 = 9102;
-static const int xC6S2 = 6270;
-static const int xC7S1 = 3196;
-
-#define SHIFT_BITS 14
-#define DOROUND(X) X += (1<<(SHIFT_BITS-1));
-
-#define FINAL_SHIFT 3
-#define FINAL_ROUNDING (1<<(FINAL_SHIFT -1))
-#define IN_SHIFT (FINAL_SHIFT+1)
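The constants above are cosine/sine values scaled to Q14 (for example xC4S4 = 11585 is approximately cos(pi/4) * 2^14), and DOROUND adds half an LSB before the SHIFT_BITS right shift so that products round to nearest. A worked example of one such multiply:

#include <stdio.h>

int main(void) {
  const int xC4S4 = 11585;   /* cos(pi/4) in Q14                  */
  int y = 100 * xC4S4;       /* multiply in fixed point           */
  y += 1 << (14 - 1);        /* DOROUND: add half an LSB          */
  y >>= 14;                  /* SHIFT_BITS: back to integer scale */
  printf("%d\n", y);         /* 71, close to 100 * 0.70711 = 70.7 */
  return 0;
}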
-
-
-void vp9_short_fdct8x8_c(short *InputData, short *OutputData, int pitch) {
- int loop;
- int short_pitch = pitch >> 1;
- int is07, is12, is34, is56;
- int is0734, is1256;
- int id07, id12, id34, id56;
- int irot_input_x, irot_input_y;
- int icommon_product1; // Re-used product (c4s4 * (s12 - s56))
- int icommon_product2; // Re-used product (c4s4 * (d12 + d56))
- int temp1, temp2; // intermediate variable for computation
-
- int InterData[64];
- int *ip = InterData;
- short *op = OutputData;
-
- for (loop = 0; loop < 8; loop++) {
- // Pre calculate some common sums and differences.
- is07 = (InputData[0] + InputData[7]) << IN_SHIFT;
- is12 = (InputData[1] + InputData[2]) << IN_SHIFT;
- is34 = (InputData[3] + InputData[4]) << IN_SHIFT;
- is56 = (InputData[5] + InputData[6]) << IN_SHIFT;
- id07 = (InputData[0] - InputData[7]) << IN_SHIFT;
- id12 = (InputData[1] - InputData[2]) << IN_SHIFT;
- id34 = (InputData[3] - InputData[4]) << IN_SHIFT;
- id56 = (InputData[5] - InputData[6]) << IN_SHIFT;
-
- is0734 = is07 + is34;
- is1256 = is12 + is56;
-
- // Pre-Calculate some common product terms.
- icommon_product1 = xC4S4 * (is12 - is56);
- DOROUND(icommon_product1)
- icommon_product1 >>= SHIFT_BITS;
-
- icommon_product2 = xC4S4 * (id12 + id56);
- DOROUND(icommon_product2)
- icommon_product2 >>= SHIFT_BITS;
-
-
- ip[0] = (xC4S4 * (is0734 + is1256));
- DOROUND(ip[0]);
- ip[0] >>= SHIFT_BITS;
-
- ip[4] = (xC4S4 * (is0734 - is1256));
- DOROUND(ip[4]);
- ip[4] >>= SHIFT_BITS;
-
- // Define inputs to rotation for outputs 2 and 6
- irot_input_x = id12 - id56;
- irot_input_y = is07 - is34;
-
- // Apply rotation for outputs 2 and 6.
- temp1 = xC6S2 * irot_input_x;
- DOROUND(temp1);
- temp1 >>= SHIFT_BITS;
- temp2 = xC2S6 * irot_input_y;
- DOROUND(temp2);
- temp2 >>= SHIFT_BITS;
- ip[2] = temp1 + temp2;
-
- temp1 = xC6S2 * irot_input_y;
- DOROUND(temp1);
- temp1 >>= SHIFT_BITS;
- temp2 = xC2S6 * irot_input_x;
- DOROUND(temp2);
- temp2 >>= SHIFT_BITS;
- ip[6] = temp1 - temp2;
-
- // Define inputs to rotation for outputs 1 and 7
- irot_input_x = icommon_product1 + id07;
- irot_input_y = -(id34 + icommon_product2);
-
- // Apply rotation for outputs 1 and 7.
- temp1 = xC1S7 * irot_input_x;
- DOROUND(temp1);
- temp1 >>= SHIFT_BITS;
- temp2 = xC7S1 * irot_input_y;
- DOROUND(temp2);
- temp2 >>= SHIFT_BITS;
- ip[1] = temp1 - temp2;
-
- temp1 = xC7S1 * irot_input_x;
- DOROUND(temp1);
- temp1 >>= SHIFT_BITS;
- temp2 = xC1S7 * irot_input_y;
- DOROUND(temp2);
- temp2 >>= SHIFT_BITS;
- ip[7] = temp1 + temp2;
-
- // Define inputs to rotation for outputs 3 and 5
- irot_input_x = id07 - icommon_product1;
- irot_input_y = id34 - icommon_product2;
-
- // Apply rotation for outputs 3 and 5.
- temp1 = xC3S5 * irot_input_x;
- DOROUND(temp1);
- temp1 >>= SHIFT_BITS;
- temp2 = xC5S3 * irot_input_y;
- DOROUND(temp2);
- temp2 >>= SHIFT_BITS;
- ip[3] = temp1 - temp2;
-
-
- temp1 = xC5S3 * irot_input_x;
- DOROUND(temp1);
- temp1 >>= SHIFT_BITS;
- temp2 = xC3S5 * irot_input_y;
- DOROUND(temp2);
- temp2 >>= SHIFT_BITS;
- ip[5] = temp1 + temp2;
-
- // Increment data pointer for next row
- InputData += short_pitch;
- ip += 8;
- }
-
- // Performed DCT on rows, now transform the columns
- ip = InterData;
- for (loop = 0; loop < 8; loop++) {
- // Pre calculate some common sums and differences.
- is07 = ip[0 * 8] + ip[7 * 8];
- is12 = ip[1 * 8] + ip[2 * 8];
- is34 = ip[3 * 8] + ip[4 * 8];
- is56 = ip[5 * 8] + ip[6 * 8];
-
- id07 = ip[0 * 8] - ip[7 * 8];
- id12 = ip[1 * 8] - ip[2 * 8];
- id34 = ip[3 * 8] - ip[4 * 8];
- id56 = ip[5 * 8] - ip[6 * 8];
-
- is0734 = is07 + is34;
- is1256 = is12 + is56;
-
- // Pre-Calculate some common product terms
- icommon_product1 = xC4S4 * (is12 - is56);
- icommon_product2 = xC4S4 * (id12 + id56);
- DOROUND(icommon_product1)
- DOROUND(icommon_product2)
- icommon_product1 >>= SHIFT_BITS;
- icommon_product2 >>= SHIFT_BITS;
-
-
- temp1 = xC4S4 * (is0734 + is1256);
- temp2 = xC4S4 * (is0734 - is1256);
- DOROUND(temp1);
- DOROUND(temp2);
- temp1 >>= SHIFT_BITS;
-
- temp2 >>= SHIFT_BITS;
- op[0 * 8] = (temp1 + FINAL_ROUNDING) >> FINAL_SHIFT;
- op[4 * 8] = (temp2 + FINAL_ROUNDING) >> FINAL_SHIFT;
-
- // Define inputs to rotation for outputs 2 and 6
- irot_input_x = id12 - id56;
- irot_input_y = is07 - is34;
-
- // Apply rotation for outputs 2 and 6.
- temp1 = xC6S2 * irot_input_x;
- DOROUND(temp1);
- temp1 >>= SHIFT_BITS;
- temp2 = xC2S6 * irot_input_y;
- DOROUND(temp2);
- temp2 >>= SHIFT_BITS;
- op[2 * 8] = (temp1 + temp2 + FINAL_ROUNDING) >> FINAL_SHIFT;
-
- temp1 = xC6S2 * irot_input_y;
- DOROUND(temp1);
- temp1 >>= SHIFT_BITS;
- temp2 = xC2S6 * irot_input_x;
- DOROUND(temp2);
- temp2 >>= SHIFT_BITS;
- op[6 * 8] = (temp1 - temp2 + FINAL_ROUNDING) >> FINAL_SHIFT;
-
- // Define inputs to rotation for outputs 1 and 7
- irot_input_x = icommon_product1 + id07;
- irot_input_y = -(id34 + icommon_product2);
-
- // Apply rotation for outputs 1 and 7.
- temp1 = xC1S7 * irot_input_x;
- DOROUND(temp1);
- temp1 >>= SHIFT_BITS;
- temp2 = xC7S1 * irot_input_y;
- DOROUND(temp2);
- temp2 >>= SHIFT_BITS;
- op[1 * 8] = (temp1 - temp2 + FINAL_ROUNDING) >> FINAL_SHIFT;
-
- temp1 = xC7S1 * irot_input_x;
- DOROUND(temp1);
- temp1 >>= SHIFT_BITS;
- temp2 = xC1S7 * irot_input_y;
- DOROUND(temp2);
- temp2 >>= SHIFT_BITS;
- op[7 * 8] = (temp1 + temp2 + FINAL_ROUNDING) >> FINAL_SHIFT;
-
- // Define inputs to rotation for outputs 3 and 5
- irot_input_x = id07 - icommon_product1;
- irot_input_y = id34 - icommon_product2;
-
- // Apply rotation for outputs 3 and 5.
- temp1 = xC3S5 * irot_input_x;
- DOROUND(temp1);
- temp1 >>= SHIFT_BITS;
- temp2 = xC5S3 * irot_input_y;
- DOROUND(temp2);
- temp2 >>= SHIFT_BITS;
- op[3 * 8] = (temp1 - temp2 + FINAL_ROUNDING) >> FINAL_SHIFT;
-
-
- temp1 = xC5S3 * irot_input_x;
- DOROUND(temp1);
- temp1 >>= SHIFT_BITS;
- temp2 = xC3S5 * irot_input_y;
- DOROUND(temp2);
- temp2 >>= SHIFT_BITS;
- op[5 * 8] = (temp1 + temp2 + FINAL_ROUNDING) >> FINAL_SHIFT;
-
- // Increment data pointer for next column.
- ip++;
- op++;
- }
-}
-
-void vp9_short_fhaar2x2_c(short *input, short *output, int pitch) {
- /* [1 1; 1 -1] orthogonal transform */
- /* use position: 0,1, 4, 8 */
- int i;
- short *ip1 = input;
- short *op1 = output;
- for (i = 0; i < 16; i++) {
- op1[i] = 0;
- }
-
- op1[0] = (ip1[0] + ip1[1] + ip1[4] + ip1[8] + 1) >> 1;
- op1[1] = (ip1[0] - ip1[1] + ip1[4] - ip1[8]) >> 1;
- op1[4] = (ip1[0] + ip1[1] - ip1[4] - ip1[8]) >> 1;
- op1[8] = (ip1[0] - ip1[1] - ip1[4] + ip1[8]) >> 1;
-}
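The 2x2 Haar above combines the four values stored at positions 0, 1, 4 and 8 of the block with a plain add/subtract butterfly and clears everything else. A worked example with arbitrary inputs:

/* ip1[0] = 10, ip1[1] = 6, ip1[4] = 4, ip1[8] = 2:
 *   op1[0] = (10 + 6 + 4 + 2 + 1) >> 1 = 11   (rounded sum)
 *   op1[1] = (10 - 6 + 4 - 2) >> 1     = 3
 *   op1[4] = (10 + 6 - 4 - 2) >> 1     = 5
 *   op1[8] = (10 - 6 - 4 + 2) >> 1     = 1
 * all other output positions are set to 0 */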
-
-/* For test */
-#define TEST_INT 1
-#if TEST_INT
-#define vp9_fht_int_c vp9_fht_c
-#else
-#define vp9_fht_float_c vp9_fht_c
-#endif
-
-void vp9_fht_float_c(const int16_t *input, int pitch, int16_t *output,
- TX_TYPE tx_type, int tx_dim) {
- vp9_clear_system_state(); // Make it simd safe : __asm emms;
- {
- int i, j, k;
- float bufa[256], bufb[256]; // buffers are for floating-point test purpose
- // the implementation could be simplified in
- // conjunction with integer transform
- const int16_t *ip = input;
- int16_t *op = output;
-
- float *pfa = &bufa[0];
- float *pfb = &bufb[0];
-
- // pointers to vertical and horizontal transforms
- const float *ptv, *pth;
-
- assert(tx_type != DCT_DCT);
- // load and convert residual array into floating-point
- for (j = 0; j < tx_dim; j++) {
- for (i = 0; i < tx_dim; i++) {
- pfa[i] = (float)ip[i];
- }
- pfa += tx_dim;
- ip += pitch / 2;
- }
-
- // vertical transformation
- pfa = &bufa[0];
- pfb = &bufb[0];
-
- switch (tx_type) {
- case ADST_ADST :
- case ADST_DCT :
- ptv = (tx_dim == 4) ? &adst_4[0] :
- ((tx_dim == 8) ? &adst_8[0] : &adst_16[0]);
- break;
-
- default :
- ptv = (tx_dim == 4) ? &dct_4[0] :
- ((tx_dim == 8) ? &dct_8[0] : &dct_16[0]);
- break;
- }
-
- for (j = 0; j < tx_dim; j++) {
- for (i = 0; i < tx_dim; i++) {
- pfb[i] = 0;
- for (k = 0; k < tx_dim; k++) {
- pfb[i] += ptv[k] * pfa[(k * tx_dim)];
- }
- pfa += 1;
- }
- pfb += tx_dim;
- ptv += tx_dim;
- pfa = &bufa[0];
- }
-
- // horizontal transformation
- pfa = &bufa[0];
- pfb = &bufb[0];
-
- switch (tx_type) {
- case ADST_ADST :
- case DCT_ADST :
- pth = (tx_dim == 4) ? &adst_4[0] :
- ((tx_dim == 8) ? &adst_8[0] : &adst_16[0]);
- break;
-
- default :
- pth = (tx_dim == 4) ? &dct_4[0] :
- ((tx_dim == 8) ? &dct_8[0] : &dct_16[0]);
- break;
- }
-
- for (j = 0; j < tx_dim; j++) {
- for (i = 0; i < tx_dim; i++) {
- pfa[i] = 0;
- for (k = 0; k < tx_dim; k++) {
- pfa[i] += pfb[k] * pth[k];
- }
- pth += tx_dim;
- }
-
- pfa += tx_dim;
- pfb += tx_dim;
- // pth -= tx_dim * tx_dim;
-
- switch (tx_type) {
- case ADST_ADST :
- case DCT_ADST :
- pth = (tx_dim == 4) ? &adst_4[0] :
- ((tx_dim == 8) ? &adst_8[0] : &adst_16[0]);
- break;
-
- default :
- pth = (tx_dim == 4) ? &dct_4[0] :
- ((tx_dim == 8) ? &dct_8[0] : &dct_16[0]);
- break;
- }
- }
-
- // convert to short integer format and load BLOCKD buffer
- op = output;
- pfa = &bufa[0];
-
- for (j = 0; j < tx_dim; j++) {
- for (i = 0; i < tx_dim; i++) {
-        op[i] = (pfa[i] > 0) ? (int16_t)(8 * pfa[i] + 0.49)
-                             : -(int16_t)(-8 * pfa[i] + 0.49);
- }
- op += tx_dim;
- pfa += tx_dim;
- }
- }
- vp9_clear_system_state(); // Make it simd safe : __asm emms;
-}
-
-/* Converted the transforms to integer form. */
-#define VERTICAL_SHIFT 11
-#define VERTICAL_ROUNDING ((1 << (VERTICAL_SHIFT - 1)) - 1)
-#define HORIZONTAL_SHIFT 16
-#define HORIZONTAL_ROUNDING ((1 << (HORIZONTAL_SHIFT - 1)) - 1)
-void vp9_fht_int_c(const int16_t *input, int pitch, int16_t *output,
- TX_TYPE tx_type, int tx_dim) {
- int i, j, k;
- int16_t imbuf[256];
-
- const int16_t *ip = input;
- int16_t *op = output;
- int16_t *im = &imbuf[0];
-
- /* pointers to vertical and horizontal transforms. */
- const int16_t *ptv = NULL, *pth = NULL;
-
- switch (tx_type) {
- case ADST_ADST :
- ptv = pth = (tx_dim == 4) ? &adst_i4[0]
- : ((tx_dim == 8) ? &adst_i8[0]
- : &adst_i16[0]);
- break;
- case ADST_DCT :
- ptv = (tx_dim == 4) ? &adst_i4[0]
- : ((tx_dim == 8) ? &adst_i8[0] : &adst_i16[0]);
- pth = (tx_dim == 4) ? &dct_i4[0]
- : ((tx_dim == 8) ? &dct_i8[0] : &dct_i16[0]);
- break;
- case DCT_ADST :
- ptv = (tx_dim == 4) ? &dct_i4[0]
- : ((tx_dim == 8) ? &dct_i8[0] : &dct_i16[0]);
- pth = (tx_dim == 4) ? &adst_i4[0]
- : ((tx_dim == 8) ? &adst_i8[0] : &adst_i16[0]);
- break;
- case DCT_DCT :
- ptv = pth = (tx_dim == 4) ? &dct_i4[0]
- : ((tx_dim == 8) ? &dct_i8[0] : &dct_i16[0]);
- break;
- default:
- assert(0);
- break;
- }
-
- /* vertical transformation */
- for (j = 0; j < tx_dim; j++) {
- for (i = 0; i < tx_dim; i++) {
- int temp = 0;
-
- for (k = 0; k < tx_dim; k++) {
- temp += ptv[k] * ip[(k * (pitch >> 1))];
- }
-
- im[i] = (int16_t)((temp + VERTICAL_ROUNDING) >> VERTICAL_SHIFT);
- ip++;
- }
- im += tx_dim; // 16
- ptv += tx_dim;
- ip = input;
- }
-
- /* horizontal transformation */
- im = &imbuf[0];
-
- for (j = 0; j < tx_dim; j++) {
- const int16_t *pthc = pth;
-
- for (i = 0; i < tx_dim; i++) {
- int temp = 0;
-
- for (k = 0; k < tx_dim; k++) {
- temp += im[k] * pthc[k];
- }
-
- op[i] = (int16_t)((temp + HORIZONTAL_ROUNDING) >> HORIZONTAL_SHIFT);
- pthc += tx_dim;
- }
-
- im += tx_dim; // 16
- op += tx_dim;
- }
-}
-
-void vp9_short_fdct4x4_c(short *input, short *output, int pitch) {
- int i;
- int a1, b1, c1, d1;
- short *ip = input;
- short *op = output;
-
- for (i = 0; i < 4; i++) {
- a1 = ((ip[0] + ip[3]) << 5);
- b1 = ((ip[1] + ip[2]) << 5);
- c1 = ((ip[1] - ip[2]) << 5);
- d1 = ((ip[0] - ip[3]) << 5);
-
- op[0] = a1 + b1;
- op[2] = a1 - b1;
-
- op[1] = (c1 * 2217 + d1 * 5352 + 14500) >> 12;
- op[3] = (d1 * 2217 - c1 * 5352 + 7500) >> 12;
-
- ip += pitch / 2;
- op += 4;
-
- }
- ip = output;
- op = output;
- for (i = 0; i < 4; i++) {
- a1 = ip[0] + ip[12];
- b1 = ip[4] + ip[8];
- c1 = ip[4] - ip[8];
- d1 = ip[0] - ip[12];
-
- op[0] = (a1 + b1 + 7) >> 4;
- op[8] = (a1 - b1 + 7) >> 4;
-
- op[4] = ((c1 * 2217 + d1 * 5352 + 12000) >> 16) + (d1 != 0);
- op[12] = (d1 * 2217 - c1 * 5352 + 51000) >> 16;
-
- ip++;
- op++;
- }
-}
-
-void vp9_short_fdct8x4_c(short *input, short *output, int pitch)
-{
- vp9_short_fdct4x4_c(input, output, pitch);
- vp9_short_fdct4x4_c(input + 4, output + 16, pitch);
-}
-
-void vp9_short_walsh4x4_c(short *input, short *output, int pitch) {
- int i;
- int a1, b1, c1, d1;
- short *ip = input;
- short *op = output;
- int pitch_short = pitch >> 1;
-
- for (i = 0; i < 4; i++) {
- a1 = ip[0 * pitch_short] + ip[3 * pitch_short];
- b1 = ip[1 * pitch_short] + ip[2 * pitch_short];
- c1 = ip[1 * pitch_short] - ip[2 * pitch_short];
- d1 = ip[0 * pitch_short] - ip[3 * pitch_short];
-
- op[0] = (a1 + b1 + 1) >> 1;
- op[4] = (c1 + d1) >> 1;
- op[8] = (a1 - b1) >> 1;
- op[12] = (d1 - c1) >> 1;
-
- ip++;
- op++;
- }
- ip = output;
- op = output;
-
- for (i = 0; i < 4; i++) {
- a1 = ip[0] + ip[3];
- b1 = ip[1] + ip[2];
- c1 = ip[1] - ip[2];
- d1 = ip[0] - ip[3];
-
- op[0] = (a1 + b1 + 1) >> 1;
- op[1] = (c1 + d1) >> 1;
- op[2] = (a1 - b1) >> 1;
- op[3] = (d1 - c1) >> 1;
-
- ip += 4;
- op += 4;
- }
-}
-
-#if CONFIG_LOSSLESS
-void vp9_short_walsh4x4_lossless_c(short *input, short *output, int pitch) {
- int i;
- int a1, b1, c1, d1;
- short *ip = input;
- short *op = output;
- int pitch_short = pitch >> 1;
-
- for (i = 0; i < 4; i++) {
- a1 = (ip[0 * pitch_short] + ip[3 * pitch_short]) >> Y2_WHT_UPSCALE_FACTOR;
- b1 = (ip[1 * pitch_short] + ip[2 * pitch_short]) >> Y2_WHT_UPSCALE_FACTOR;
- c1 = (ip[1 * pitch_short] - ip[2 * pitch_short]) >> Y2_WHT_UPSCALE_FACTOR;
- d1 = (ip[0 * pitch_short] - ip[3 * pitch_short]) >> Y2_WHT_UPSCALE_FACTOR;
-
- op[0] = (a1 + b1 + 1) >> 1;
- op[4] = (c1 + d1) >> 1;
- op[8] = (a1 - b1) >> 1;
- op[12] = (d1 - c1) >> 1;
-
- ip++;
- op++;
- }
- ip = output;
- op = output;
-
- for (i = 0; i < 4; i++) {
- a1 = ip[0] + ip[3];
- b1 = ip[1] + ip[2];
- c1 = ip[1] - ip[2];
- d1 = ip[0] - ip[3];
-
- op[0] = ((a1 + b1 + 1) >> 1) << Y2_WHT_UPSCALE_FACTOR;
- op[1] = ((c1 + d1) >> 1) << Y2_WHT_UPSCALE_FACTOR;
- op[2] = ((a1 - b1) >> 1) << Y2_WHT_UPSCALE_FACTOR;
- op[3] = ((d1 - c1) >> 1) << Y2_WHT_UPSCALE_FACTOR;
-
- ip += 4;
- op += 4;
- }
-}
-
-void vp9_short_walsh4x4_x8_c(short *input, short *output, int pitch) {
- int i;
- int a1, b1, c1, d1;
- short *ip = input;
- short *op = output;
- int pitch_short = pitch >> 1;
-
- for (i = 0; i < 4; i++) {
- a1 = ip[0 * pitch_short] + ip[3 * pitch_short];
- b1 = ip[1 * pitch_short] + ip[2 * pitch_short];
- c1 = ip[1 * pitch_short] - ip[2 * pitch_short];
- d1 = ip[0 * pitch_short] - ip[3 * pitch_short];
-
- op[0] = (a1 + b1 + 1) >> 1;
- op[4] = (c1 + d1) >> 1;
- op[8] = (a1 - b1) >> 1;
- op[12] = (d1 - c1) >> 1;
-
- ip++;
- op++;
- }
- ip = output;
- op = output;
-
- for (i = 0; i < 4; i++) {
- a1 = ip[0] + ip[3];
- b1 = ip[1] + ip[2];
- c1 = ip[1] - ip[2];
- d1 = ip[0] - ip[3];
-
- op[0] = ((a1 + b1 + 1) >> 1) << WHT_UPSCALE_FACTOR;
- op[1] = ((c1 + d1) >> 1) << WHT_UPSCALE_FACTOR;
- op[2] = ((a1 - b1) >> 1) << WHT_UPSCALE_FACTOR;
- op[3] = ((d1 - c1) >> 1) << WHT_UPSCALE_FACTOR;
-
- ip += 4;
- op += 4;
- }
-}
-
-void vp9_short_walsh8x4_x8_c(short *input, short *output, int pitch) {
- vp9_short_walsh4x4_x8_c(input, output, pitch);
- vp9_short_walsh4x4_x8_c(input + 4, output + 16, pitch);
-}
-#endif
-
-static const double C1 = 0.995184726672197;
-static const double C2 = 0.98078528040323;
-static const double C3 = 0.956940335732209;
-static const double C4 = 0.923879532511287;
-static const double C5 = 0.881921264348355;
-static const double C6 = 0.831469612302545;
-static const double C7 = 0.773010453362737;
-static const double C8 = 0.707106781186548;
-static const double C9 = 0.634393284163646;
-static const double C10 = 0.555570233019602;
-static const double C11 = 0.471396736825998;
-static const double C12 = 0.38268343236509;
-static const double C13 = 0.290284677254462;
-static const double C14 = 0.195090322016128;
-static const double C15 = 0.098017140329561;
-
-static void dct16x16_1d(double input[16], double output[16]) {
- vp9_clear_system_state(); // Make it simd safe : __asm emms;
- {
- double step[16];
- double intermediate[16];
- double temp1, temp2;
-
- // step 1
- step[ 0] = input[0] + input[15];
- step[ 1] = input[1] + input[14];
- step[ 2] = input[2] + input[13];
- step[ 3] = input[3] + input[12];
- step[ 4] = input[4] + input[11];
- step[ 5] = input[5] + input[10];
- step[ 6] = input[6] + input[ 9];
- step[ 7] = input[7] + input[ 8];
- step[ 8] = input[7] - input[ 8];
- step[ 9] = input[6] - input[ 9];
- step[10] = input[5] - input[10];
- step[11] = input[4] - input[11];
- step[12] = input[3] - input[12];
- step[13] = input[2] - input[13];
- step[14] = input[1] - input[14];
- step[15] = input[0] - input[15];
-
- // step 2
- output[0] = step[0] + step[7];
- output[1] = step[1] + step[6];
- output[2] = step[2] + step[5];
- output[3] = step[3] + step[4];
- output[4] = step[3] - step[4];
- output[5] = step[2] - step[5];
- output[6] = step[1] - step[6];
- output[7] = step[0] - step[7];
-
- temp1 = step[ 8]*C7;
- temp2 = step[15]*C9;
- output[ 8] = temp1 + temp2;
-
- temp1 = step[ 9]*C11;
- temp2 = step[14]*C5;
- output[ 9] = temp1 - temp2;
-
- temp1 = step[10]*C3;
- temp2 = step[13]*C13;
- output[10] = temp1 + temp2;
-
- temp1 = step[11]*C15;
- temp2 = step[12]*C1;
- output[11] = temp1 - temp2;
-
- temp1 = step[11]*C1;
- temp2 = step[12]*C15;
- output[12] = temp2 + temp1;
-
- temp1 = step[10]*C13;
- temp2 = step[13]*C3;
- output[13] = temp2 - temp1;
-
- temp1 = step[ 9]*C5;
- temp2 = step[14]*C11;
- output[14] = temp2 + temp1;
-
- temp1 = step[ 8]*C9;
- temp2 = step[15]*C7;
- output[15] = temp2 - temp1;
-
- // step 3
- step[ 0] = output[0] + output[3];
- step[ 1] = output[1] + output[2];
- step[ 2] = output[1] - output[2];
- step[ 3] = output[0] - output[3];
-
- temp1 = output[4]*C14;
- temp2 = output[7]*C2;
- step[ 4] = temp1 + temp2;
-
- temp1 = output[5]*C10;
- temp2 = output[6]*C6;
- step[ 5] = temp1 + temp2;
-
- temp1 = output[5]*C6;
- temp2 = output[6]*C10;
- step[ 6] = temp2 - temp1;
-
- temp1 = output[4]*C2;
- temp2 = output[7]*C14;
- step[ 7] = temp2 - temp1;
-
- step[ 8] = output[ 8] + output[11];
- step[ 9] = output[ 9] + output[10];
- step[10] = output[ 9] - output[10];
- step[11] = output[ 8] - output[11];
-
- step[12] = output[12] + output[15];
- step[13] = output[13] + output[14];
- step[14] = output[13] - output[14];
- step[15] = output[12] - output[15];
-
- // step 4
- output[ 0] = (step[ 0] + step[ 1]);
- output[ 8] = (step[ 0] - step[ 1]);
-
- temp1 = step[2]*C12;
- temp2 = step[3]*C4;
- temp1 = temp1 + temp2;
- output[ 4] = 2*(temp1*C8);
-
- temp1 = step[2]*C4;
- temp2 = step[3]*C12;
- temp1 = temp2 - temp1;
- output[12] = 2*(temp1*C8);
-
- output[ 2] = 2*((step[4] + step[ 5])*C8);
- output[14] = 2*((step[7] - step[ 6])*C8);
-
- temp1 = step[4] - step[5];
- temp2 = step[6] + step[7];
- output[ 6] = (temp1 + temp2);
- output[10] = (temp1 - temp2);
-
- intermediate[8] = step[8] + step[14];
- intermediate[9] = step[9] + step[15];
-
- temp1 = intermediate[8]*C12;
- temp2 = intermediate[9]*C4;
- temp1 = temp1 - temp2;
- output[3] = 2*(temp1*C8);
-
- temp1 = intermediate[8]*C4;
- temp2 = intermediate[9]*C12;
- temp1 = temp2 + temp1;
- output[13] = 2*(temp1*C8);
-
- output[ 9] = 2*((step[10] + step[11])*C8);
-
- intermediate[11] = step[10] - step[11];
- intermediate[12] = step[12] + step[13];
- intermediate[13] = step[12] - step[13];
- intermediate[14] = step[ 8] - step[14];
- intermediate[15] = step[ 9] - step[15];
-
- output[15] = (intermediate[11] + intermediate[12]);
- output[ 1] = -(intermediate[11] - intermediate[12]);
-
- output[ 7] = 2*(intermediate[13]*C8);
-
- temp1 = intermediate[14]*C12;
- temp2 = intermediate[15]*C4;
- temp1 = temp1 - temp2;
- output[11] = -2*(temp1*C8);
-
- temp1 = intermediate[14]*C4;
- temp2 = intermediate[15]*C12;
- temp1 = temp2 + temp1;
- output[ 5] = 2*(temp1*C8);
- }
- vp9_clear_system_state(); // Make it simd safe : __asm emms;
-}
-
-void vp9_short_fdct16x16_c(short *input, short *out, int pitch) {
- vp9_clear_system_state(); // Make it simd safe : __asm emms;
- {
- int shortpitch = pitch >> 1;
- int i, j;
- double output[256];
- // First transform columns
- for (i = 0; i < 16; i++) {
- double temp_in[16], temp_out[16];
- for (j = 0; j < 16; j++)
- temp_in[j] = input[j*shortpitch + i];
- dct16x16_1d(temp_in, temp_out);
- for (j = 0; j < 16; j++)
- output[j*16 + i] = temp_out[j];
- }
- // Then transform rows
- for (i = 0; i < 16; ++i) {
- double temp_in[16], temp_out[16];
- for (j = 0; j < 16; ++j)
- temp_in[j] = output[j + i*16];
- dct16x16_1d(temp_in, temp_out);
- for (j = 0; j < 16; ++j)
- output[j + i*16] = temp_out[j];
- }
- // Scale by some magic number
- for (i = 0; i < 256; i++)
- out[i] = (short)round(output[i]/2);
- }
- vp9_clear_system_state(); // Make it simd safe : __asm emms;
-}
--- a/vp8/encoder/encodeframe.c
+++ /dev/null
@@ -1,2342 +1,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include "vpx_ports/config.h"
-#include "encodemb.h"
-#include "encodemv.h"
-#include "vp8/common/common.h"
-#include "onyx_int.h"
-#include "vp8/common/extend.h"
-#include "vp8/common/entropymode.h"
-#include "vp8/common/quant_common.h"
-#include "segmentation.h"
-#include "vp8/common/setupintrarecon.h"
-#include "vp8/common/reconintra4x4.h"
-#include "encodeintra.h"
-#include "vp8/common/reconinter.h"
-#include "vp8/common/invtrans.h"
-#include "rdopt.h"
-#include "vp8/common/findnearmv.h"
-#include "vp8/common/reconintra.h"
-#include "vp8/common/seg_common.h"
-#include "vpx_rtcd.h"
-#include <stdio.h>
-#include <math.h>
-#include <limits.h>
-#include "vp8/common/subpixel.h"
-#include "vpx_ports/vpx_timer.h"
-#include "vp8/common/pred_common.h"
-
-#define DBG_PRNT_SEGMAP 0
-#if CONFIG_NEWBESTREFMV
-#include "vp8/common/mvref_common.h"
-#endif
-
-
-#if CONFIG_RUNTIME_CPU_DETECT
-#define RTCD(x) &cpi->common.rtcd.x
-#define IF_RTCD(x) (x)
-#else
-#define RTCD(x) NULL
-#define IF_RTCD(x) NULL
-#endif
-
-#ifdef ENC_DEBUG
-int enc_debug = 0;
-int mb_row_debug, mb_col_debug;
-#endif
-
-extern void vp9_initialize_me_consts(VP9_COMP *cpi, int QIndex);
-
-extern void vp9_auto_select_speed(VP9_COMP *cpi);
-
-int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
- int recon_yoffset, int recon_uvoffset,
- int *returnrate, int *returndistortion);
-
-extern void vp9_pick_mode_inter_macroblock(VP9_COMP *cpi, MACROBLOCK *x,
- int recon_yoffset,
- int recon_uvoffset, int *r, int *d);
-
-void vp9_build_block_offsets(MACROBLOCK *x);
-
-void vp9_setup_block_ptrs(MACROBLOCK *x);
-
-void vp9_encode_inter_macroblock(VP9_COMP *cpi, MACROBLOCK *x, TOKENEXTRA **t,
- int recon_yoffset, int recon_uvoffset,
- int output_enabled);
-
-void vp9_encode_inter_superblock(VP9_COMP *cpi, MACROBLOCK *x, TOKENEXTRA **t,
- int recon_yoffset, int recon_uvoffset,
- int mb_col, int mb_row);
-
-void vp9_encode_intra_macro_block(VP9_COMP *cpi, MACROBLOCK *x,
- TOKENEXTRA **t, int output_enabled);
-
-void vp9_encode_intra_super_block(VP9_COMP *cpi, MACROBLOCK *x,
- TOKENEXTRA **t, int mb_col);
-
-static void adjust_act_zbin(VP9_COMP *cpi, MACROBLOCK *x);
-
-#ifdef MODE_STATS
-unsigned int inter_y_modes[MB_MODE_COUNT];
-unsigned int inter_uv_modes[VP9_UV_MODES];
-unsigned int inter_b_modes[B_MODE_COUNT];
-unsigned int y_modes[VP9_YMODES];
-unsigned int i8x8_modes[VP9_I8X8_MODES];
-unsigned int uv_modes[VP9_UV_MODES];
-unsigned int uv_modes_y[VP9_YMODES][VP9_UV_MODES];
-unsigned int b_modes[B_MODE_COUNT];
-#endif
-
-
-/* activity_avg must be positive, or flat regions could get a zero weight
- * (infinite lambda), which confounds analysis.
- * This also avoids the need for divide by zero checks in
- * vp9_activity_masking().
- */
-#define VP9_ACTIVITY_AVG_MIN (64)
-
-/* This is used as a reference when computing the source variance for the
- * purposes of activity masking.
- * Eventually this should be replaced by custom no-reference routines,
- * which will be faster.
- */
-static const unsigned char VP9_VAR_OFFS[16] = {
- 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128
-};
-
-
-// Original activity measure from Tim T's code.
-static unsigned int tt_activity_measure(VP9_COMP *cpi, MACROBLOCK *x) {
- unsigned int act;
- unsigned int sse;
- /* TODO: This could also be done over smaller areas (8x8), but that would
- * require extensive changes elsewhere, as lambda is assumed to be fixed
- * over an entire MB in most of the code.
- * Another option is to compute four 8x8 variances, and pick a single
- * lambda using a non-linear combination (e.g., the smallest, or second
- * smallest, etc.).
- */
- act = vp9_variance16x16(x->src.y_buffer, x->src.y_stride, VP9_VAR_OFFS, 0,
- &sse);
- act = act << 4;
-
- /* If the region is flat, lower the activity some more. */
- if (act < 8 << 12)
- act = act < 5 << 12 ? act : 5 << 12;
-
- return act;
-}
-
-// Stub for alternative experimental activity measures.
-static unsigned int alt_activity_measure(VP9_COMP *cpi,
- MACROBLOCK *x, int use_dc_pred) {
- return vp9_encode_intra(cpi, x, use_dc_pred);
-}
-
-
-// Measure the activity of the current macroblock
-// What we measure here is TBD so abstracted to this function
-#define ALT_ACT_MEASURE 1
-static unsigned int mb_activity_measure(VP9_COMP *cpi, MACROBLOCK *x,
- int mb_row, int mb_col) {
- unsigned int mb_activity;
-
- if (ALT_ACT_MEASURE) {
- int use_dc_pred = (mb_col || mb_row) && (!mb_col || !mb_row);
-
-    // Or use an alternative.
- mb_activity = alt_activity_measure(cpi, x, use_dc_pred);
- } else {
- // Original activity measure from Tim T's code.
- mb_activity = tt_activity_measure(cpi, x);
- }
-
- if (mb_activity < VP9_ACTIVITY_AVG_MIN)
- mb_activity = VP9_ACTIVITY_AVG_MIN;
-
- return mb_activity;
-}
-
-// Calculate an "average" mb activity value for the frame
-#define ACT_MEDIAN 0
-static void calc_av_activity(VP9_COMP *cpi, int64_t activity_sum) {
-#if ACT_MEDIAN
- // Find median: Simple n^2 algorithm for experimentation
- {
- unsigned int median;
- unsigned int i, j;
- unsigned int *sortlist;
- unsigned int tmp;
-
- // Create a list to sort to
- CHECK_MEM_ERROR(sortlist,
- vpx_calloc(sizeof(unsigned int),
- cpi->common.MBs));
-
- // Copy map to sort list
- vpx_memcpy(sortlist, cpi->mb_activity_map,
- sizeof(unsigned int) * cpi->common.MBs);
-
-
- // Ripple each value down to its correct position
- for (i = 1; i < cpi->common.MBs; i ++) {
- for (j = i; j > 0; j --) {
- if (sortlist[j] < sortlist[j - 1]) {
- // Swap values
- tmp = sortlist[j - 1];
- sortlist[j - 1] = sortlist[j];
- sortlist[j] = tmp;
- } else
- break;
- }
- }
-
- // Even number MBs so estimate median as mean of two either side.
- median = (1 + sortlist[cpi->common.MBs >> 1] +
- sortlist[(cpi->common.MBs >> 1) + 1]) >> 1;
-
- cpi->activity_avg = median;
-
- vpx_free(sortlist);
- }
-#else
- // Simple mean for now
- cpi->activity_avg = (unsigned int)(activity_sum / cpi->common.MBs);
-#endif
-
- if (cpi->activity_avg < VP9_ACTIVITY_AVG_MIN)
- cpi->activity_avg = VP9_ACTIVITY_AVG_MIN;
-
- // Experimental code: return fixed value normalized for several clips
- if (ALT_ACT_MEASURE)
- cpi->activity_avg = 100000;
-}
-
-#define USE_ACT_INDEX 0
-#define OUTPUT_NORM_ACT_STATS 0
-
-#if USE_ACT_INDEX
-// Calculate an activity index for each mb
-static void calc_activity_index(VP9_COMP *cpi, MACROBLOCK *x) {
- VP9_COMMON *const cm = &cpi->common;
- int mb_row, mb_col;
-
- int64_t act;
- int64_t a;
- int64_t b;
-
-#if OUTPUT_NORM_ACT_STATS
- FILE *f = fopen("norm_act.stt", "a");
- fprintf(f, "\n%12d\n", cpi->activity_avg);
-#endif
-
- // Reset pointers to start of activity map
- x->mb_activity_ptr = cpi->mb_activity_map;
-
- // Calculate normalized mb activity number.
- for (mb_row = 0; mb_row < cm->mb_rows; mb_row++) {
- // for each macroblock col in image
- for (mb_col = 0; mb_col < cm->mb_cols; mb_col++) {
- // Read activity from the map
- act = *(x->mb_activity_ptr);
-
- // Calculate a normalized activity number
- a = act + 4 * cpi->activity_avg;
- b = 4 * act + cpi->activity_avg;
-
- if (b >= a)
- *(x->activity_ptr) = (int)((b + (a >> 1)) / a) - 1;
- else
- *(x->activity_ptr) = 1 - (int)((a + (b >> 1)) / b);
-
-#if OUTPUT_NORM_ACT_STATS
- fprintf(f, " %6d", *(x->mb_activity_ptr));
-#endif
- // Increment activity map pointers
- x->mb_activity_ptr++;
- }
-
-#if OUTPUT_NORM_ACT_STATS
- fprintf(f, "\n");
-#endif
-
- }
-
-#if OUTPUT_NORM_ACT_STATS
- fclose(f);
-#endif
-
-}
-#endif
-
-// Loop through all MBs: note the activity of each, compute the frame's
-// average activity, and calculate a normalized activity for each MB
-static void build_activity_map(VP9_COMP *cpi) {
- MACROBLOCK *const x = &cpi->mb;
- MACROBLOCKD *xd = &x->e_mbd;
- VP9_COMMON *const cm = &cpi->common;
-
-#if ALT_ACT_MEASURE
- YV12_BUFFER_CONFIG *new_yv12 = &cm->yv12_fb[cm->new_fb_idx];
- int recon_yoffset;
- int recon_y_stride = new_yv12->y_stride;
-#endif
-
- int mb_row, mb_col;
- unsigned int mb_activity;
- int64_t activity_sum = 0;
-
- // for each macroblock row in image
- for (mb_row = 0; mb_row < cm->mb_rows; mb_row++) {
-#if ALT_ACT_MEASURE
- // reset above block coeffs
- xd->up_available = (mb_row != 0);
- recon_yoffset = (mb_row * recon_y_stride * 16);
-#endif
- // for each macroblock col in image
- for (mb_col = 0; mb_col < cm->mb_cols; mb_col++) {
-#if ALT_ACT_MEASURE
- xd->dst.y_buffer = new_yv12->y_buffer + recon_yoffset;
- xd->left_available = (mb_col != 0);
- recon_yoffset += 16;
-#endif
- // Copy current mb to a buffer
- vp9_copy_mem16x16(x->src.y_buffer, x->src.y_stride, x->thismb, 16);
-
- // measure activity
- mb_activity = mb_activity_measure(cpi, x, mb_row, mb_col);
-
- // Keep frame sum
- activity_sum += mb_activity;
-
- // Store MB level activity details.
- *x->mb_activity_ptr = mb_activity;
-
- // Increment activity map pointer
- x->mb_activity_ptr++;
-
- // adjust to the next column of source macroblocks
- x->src.y_buffer += 16;
- }
-
-
- // adjust to the next row of mbs
- x->src.y_buffer += 16 * x->src.y_stride - 16 * cm->mb_cols;
-
-#if ALT_ACT_MEASURE
- // extend the recon for intra prediction
- vp9_extend_mb_row(new_yv12, xd->dst.y_buffer + 16,
- xd->dst.u_buffer + 8, xd->dst.v_buffer + 8);
-#endif
-
- }
-
- // Calculate an "average" MB activity
- calc_av_activity(cpi, activity_sum);
-
-#if USE_ACT_INDEX
- // Calculate an activity index number of each mb
- calc_activity_index(cpi, x);
-#endif
-
-}
-
-// Macroblock activity masking
-void vp9_activity_masking(VP9_COMP *cpi, MACROBLOCK *x) {
-#if USE_ACT_INDEX
- x->rdmult += *(x->mb_activity_ptr) * (x->rdmult >> 2);
- x->errorperbit = x->rdmult * 100 / (110 * x->rddiv);
- x->errorperbit += (x->errorperbit == 0);
-#else
- int64_t a;
- int64_t b;
- int64_t act = *(x->mb_activity_ptr);
-
- // Apply the masking to the RD multiplier.
- a = act + (2 * cpi->activity_avg);
- b = (2 * act) + cpi->activity_avg;
-
- x->rdmult = (unsigned int)(((int64_t)x->rdmult * b + (a >> 1)) / a);
- x->errorperbit = x->rdmult * 100 / (110 * x->rddiv);
- x->errorperbit += (x->errorperbit == 0);
-#endif
-
- // Activity based Zbin adjustment
- adjust_act_zbin(cpi, x);
-}
-
-static void update_state(VP9_COMP *cpi, MACROBLOCK *x, PICK_MODE_CONTEXT *ctx) {
- int i;
- MACROBLOCKD *xd = &x->e_mbd;
- MODE_INFO *mi = &ctx->mic;
- MB_MODE_INFO * mbmi = &xd->mode_info_context->mbmi;
- int mb_mode = mi->mbmi.mode;
- int mb_mode_index = ctx->best_mode_index;
-
-#if CONFIG_DEBUG
- assert(mb_mode < MB_MODE_COUNT);
- assert(mb_mode_index < MAX_MODES);
- assert(mi->mbmi.ref_frame < MAX_REF_FRAMES);
-#endif
-
-  // Restore the coding context of the MB to that which was in place
- // when the mode was picked for it
- vpx_memcpy(xd->mode_info_context, mi, sizeof(MODE_INFO));
-#if CONFIG_SUPERBLOCKS
- if (mi->mbmi.encoded_as_sb) {
- const int mis = cpi->common.mode_info_stride;
- if (xd->mb_to_right_edge > 0)
- vpx_memcpy(xd->mode_info_context + 1, mi, sizeof(MODE_INFO));
- if (xd->mb_to_bottom_edge > 0) {
- vpx_memcpy(xd->mode_info_context + mis, mi, sizeof(MODE_INFO));
- if (xd->mb_to_right_edge > 0)
- vpx_memcpy(xd->mode_info_context + mis + 1, mi, sizeof(MODE_INFO));
- }
- }
-#endif
-
- if (mb_mode == B_PRED) {
- for (i = 0; i < 16; i++) {
- xd->block[i].bmi.as_mode = xd->mode_info_context->bmi[i].as_mode;
- assert(xd->block[i].bmi.as_mode.first < MB_MODE_COUNT);
- }
- } else if (mb_mode == I8X8_PRED) {
- for (i = 0; i < 16; i++) {
- xd->block[i].bmi = xd->mode_info_context->bmi[i];
- }
- } else if (mb_mode == SPLITMV) {
- vpx_memcpy(x->partition_info, &ctx->partition_info,
- sizeof(PARTITION_INFO));
-
- mbmi->mv[0].as_int = x->partition_info->bmi[15].mv.as_int;
- mbmi->mv[1].as_int = x->partition_info->bmi[15].second_mv.as_int;
- }
-
- {
- int segment_id = mbmi->segment_id;
- if (!vp9_segfeature_active(xd, segment_id, SEG_LVL_EOB) ||
- vp9_get_segdata(xd, segment_id, SEG_LVL_EOB)) {
- for (i = 0; i < NB_TXFM_MODES; i++) {
- cpi->rd_tx_select_diff[i] += ctx->txfm_rd_diff[i];
- }
- }
- }
-
- if (cpi->common.frame_type == KEY_FRAME) {
- // Restore the coding modes to that held in the coding context
- // if (mb_mode == B_PRED)
- // for (i = 0; i < 16; i++)
- // {
- // xd->block[i].bmi.as_mode =
- // xd->mode_info_context->bmi[i].as_mode;
- // assert(xd->mode_info_context->bmi[i].as_mode < MB_MODE_COUNT);
- // }
-#if CONFIG_INTERNAL_STATS
- static const int kf_mode_index[] = {
- THR_DC /*DC_PRED*/,
- THR_V_PRED /*V_PRED*/,
- THR_H_PRED /*H_PRED*/,
- THR_D45_PRED /*D45_PRED*/,
- THR_D135_PRED /*D135_PRED*/,
- THR_D117_PRED /*D117_PRED*/,
- THR_D153_PRED /*D153_PRED*/,
- THR_D27_PRED /*D27_PRED*/,
- THR_D63_PRED /*D63_PRED*/,
- THR_TM /*TM_PRED*/,
- THR_I8X8_PRED /*I8X8_PRED*/,
- THR_B_PRED /*B_PRED*/,
- };
- cpi->mode_chosen_counts[kf_mode_index[mb_mode]]++;
-#endif
- } else {
- /*
- // Reduce the activation RD thresholds for the best choice mode
- if ((cpi->rd_baseline_thresh[mb_mode_index] > 0) &&
- (cpi->rd_baseline_thresh[mb_mode_index] < (INT_MAX >> 2)))
- {
- int best_adjustment = (cpi->rd_thresh_mult[mb_mode_index] >> 2);
-
- cpi->rd_thresh_mult[mb_mode_index] =
- (cpi->rd_thresh_mult[mb_mode_index]
- >= (MIN_THRESHMULT + best_adjustment)) ?
- cpi->rd_thresh_mult[mb_mode_index] - best_adjustment :
- MIN_THRESHMULT;
- cpi->rd_threshes[mb_mode_index] =
- (cpi->rd_baseline_thresh[mb_mode_index] >> 7)
- * cpi->rd_thresh_mult[mb_mode_index];
-
- }
- */
- // Note how often each mode chosen as best
- cpi->mode_chosen_counts[mb_mode_index]++;
-
- cpi->prediction_error += ctx->distortion;
- cpi->intra_error += ctx->intra_error;
-
- cpi->rd_comp_pred_diff[0] += ctx->single_pred_diff;
- cpi->rd_comp_pred_diff[1] += ctx->comp_pred_diff;
- cpi->rd_comp_pred_diff[2] += ctx->hybrid_pred_diff;
- }
-}
-
-static void pick_mb_modes(VP9_COMP *cpi,
- VP9_COMMON *cm,
- int mb_row,
- int mb_col,
- MACROBLOCK *x,
- MACROBLOCKD *xd,
- TOKENEXTRA **tp,
- int *totalrate,
- int *totaldist) {
- int i;
- int map_index;
- int recon_yoffset, recon_uvoffset;
- int ref_fb_idx = cm->lst_fb_idx;
- int dst_fb_idx = cm->new_fb_idx;
- int recon_y_stride = cm->yv12_fb[ref_fb_idx].y_stride;
- int recon_uv_stride = cm->yv12_fb[ref_fb_idx].uv_stride;
- ENTROPY_CONTEXT_PLANES left_context[2];
- ENTROPY_CONTEXT_PLANES above_context[2];
- ENTROPY_CONTEXT_PLANES *initial_above_context_ptr = cm->above_context
- + mb_col;
-
- // Offsets to move pointers from MB to MB within a SB in raster order
- int row_delta[4] = { 0, +1, 0, -1};
- int col_delta[4] = { +1, -1, +1, +1};
-
- /* Function should not modify L & A contexts; save and restore on exit */
- vpx_memcpy(left_context,
- cm->left_context,
- sizeof(left_context));
- vpx_memcpy(above_context,
- initial_above_context_ptr,
- sizeof(above_context));
-
- /* Encode MBs in raster order within the SB */
- for (i = 0; i < 4; i++) {
- int dy = row_delta[i];
- int dx = col_delta[i];
- int offset_unextended = dy * cm->mb_cols + dx;
- int offset_extended = dy * xd->mode_info_stride + dx;
- MB_MODE_INFO * mbmi = &xd->mode_info_context->mbmi;
-
- // TODO Many of the index items here can be computed more efficiently!
-
- if ((mb_row >= cm->mb_rows) || (mb_col >= cm->mb_cols)) {
- // MB lies outside frame, move on
- mb_row += dy;
- mb_col += dx;
-
- // Update pointers
- x->src.y_buffer += 16 * (dx + dy * x->src.y_stride);
- x->src.u_buffer += 8 * (dx + dy * x->src.uv_stride);
- x->src.v_buffer += 8 * (dx + dy * x->src.uv_stride);
-
- x->gf_active_ptr += offset_unextended;
- x->partition_info += offset_extended;
- xd->mode_info_context += offset_extended;
- xd->prev_mode_info_context += offset_extended;
-#if CONFIG_DEBUG
- assert((xd->prev_mode_info_context - cpi->common.prev_mip) ==
- (xd->mode_info_context - cpi->common.mip));
-#endif
- continue;
- }
-
- // Index of the MB in the SB 0..3
- xd->mb_index = i;
-
- map_index = (mb_row * cpi->common.mb_cols) + mb_col;
- x->mb_activity_ptr = &cpi->mb_activity_map[map_index];
-
- // set above context pointer
- xd->above_context = cm->above_context + mb_col;
-
- // Restore the appropriate left context depending on which
- // row in the SB the MB is situated
- xd->left_context = cm->left_context + (i >> 1);
-
- // Set up distance of MB to edge of frame in 1/8th pel units
- xd->mb_to_top_edge = -((mb_row * 16) << 3);
- xd->mb_to_left_edge = -((mb_col * 16) << 3);
- xd->mb_to_bottom_edge = ((cm->mb_rows - 1 - mb_row) * 16) << 3;
- xd->mb_to_right_edge = ((cm->mb_cols - 1 - mb_col) * 16) << 3;
-
- // Set up limit values for MV components to prevent them from
- // extending beyond the UMV borders assuming 16x16 block size
- x->mv_row_min = -((mb_row * 16) + VP8BORDERINPIXELS - INTERP_EXTEND);
- x->mv_col_min = -((mb_col * 16) + VP8BORDERINPIXELS - INTERP_EXTEND);
- x->mv_row_max = ((cm->mb_rows - mb_row) * 16 +
- (VP8BORDERINPIXELS - 16 - INTERP_EXTEND));
- x->mv_col_max = ((cm->mb_cols - mb_col) * 16 +
- (VP8BORDERINPIXELS - 16 - INTERP_EXTEND));
-
- xd->up_available = (mb_row != 0);
- xd->left_available = (mb_col != 0);
-
- recon_yoffset = (mb_row * recon_y_stride * 16) + (mb_col * 16);
- recon_uvoffset = (mb_row * recon_uv_stride * 8) + (mb_col * 8);
-
- xd->dst.y_buffer = cm->yv12_fb[dst_fb_idx].y_buffer + recon_yoffset;
- xd->dst.u_buffer = cm->yv12_fb[dst_fb_idx].u_buffer + recon_uvoffset;
- xd->dst.v_buffer = cm->yv12_fb[dst_fb_idx].v_buffer + recon_uvoffset;
-
- // Copy current MB to a work buffer
- vp9_copy_mem16x16(x->src.y_buffer, x->src.y_stride, x->thismb, 16);
-
- x->rddiv = cpi->RDDIV;
- x->rdmult = cpi->RDMULT;
-
- if (cpi->oxcf.tuning == VP8_TUNE_SSIM)
- vp9_activity_masking(cpi, x);
-
- // Is segmentation enabled
- if (xd->segmentation_enabled) {
- // Code to set segment id in xd->mbmi.segment_id
- if (xd->update_mb_segmentation_map)
- mbmi->segment_id = cpi->segmentation_map[map_index];
- else
- mbmi->segment_id = cm->last_frame_seg_map[map_index];
- if (mbmi->segment_id > 3)
- mbmi->segment_id = 0;
-
- vp9_mb_init_quantizer(cpi, x);
- } else
- // Set to Segment 0 by default
- mbmi->segment_id = 0;
-
- x->active_ptr = cpi->active_map + map_index;
-
-#if CONFIG_SUPERBLOCKS
- xd->mode_info_context->mbmi.encoded_as_sb = 0;
-#endif
-
- cpi->update_context = 0; // TODO Do we need this now??
-
- vp9_intra_prediction_down_copy(xd);
-
- // Find best coding mode & reconstruct the MB so it is available
- // as a predictor for MBs that follow in the SB
- if (cm->frame_type == KEY_FRAME) {
- int r, d;
- vp9_rd_pick_intra_mode(cpi, x, &r, &d);
- *totalrate += r;
- *totaldist += d;
-
- // Dummy encode, do not do the tokenization
- vp9_encode_intra_macro_block(cpi, x, tp, 0);
- // Note the encoder may have changed the segment_id
-
- // Save the coding context
- vpx_memcpy(&x->mb_context[i].mic, xd->mode_info_context,
- sizeof(MODE_INFO));
- } else {
- int seg_id, r, d;
-
- if (xd->segmentation_enabled && cpi->seg0_cnt > 0 &&
- !vp9_segfeature_active(xd, 0, SEG_LVL_REF_FRAME) &&
- vp9_segfeature_active(xd, 1, SEG_LVL_REF_FRAME) &&
- vp9_check_segref(xd, 1, INTRA_FRAME) +
- vp9_check_segref(xd, 1, LAST_FRAME) +
- vp9_check_segref(xd, 1, GOLDEN_FRAME) +
- vp9_check_segref(xd, 1, ALTREF_FRAME) == 1) {
- cpi->seg0_progress = (cpi->seg0_idx << 16) / cpi->seg0_cnt;
- } else {
- cpi->seg0_progress = (((mb_col & ~1) * 2 + (mb_row & ~1) * cm->mb_cols + i) << 16) / cm->MBs;
- }
-
- vp9_pick_mode_inter_macroblock(cpi, x, recon_yoffset,
- recon_uvoffset, &r, &d);
- *totalrate += r;
- *totaldist += d;
-
- // Dummy encode, do not do the tokenization
- vp9_encode_inter_macroblock(cpi, x, tp,
- recon_yoffset, recon_uvoffset, 0);
-
- seg_id = mbmi->segment_id;
- if (cpi->mb.e_mbd.segmentation_enabled && seg_id == 0) {
- cpi->seg0_idx++;
- }
- if (!xd->segmentation_enabled ||
- !vp9_segfeature_active(xd, seg_id, SEG_LVL_REF_FRAME) ||
- vp9_check_segref(xd, seg_id, INTRA_FRAME) +
- vp9_check_segref(xd, seg_id, LAST_FRAME) +
- vp9_check_segref(xd, seg_id, GOLDEN_FRAME) +
- vp9_check_segref(xd, seg_id, ALTREF_FRAME) > 1) {
- // Get the prediction context and status
- int pred_flag = vp9_get_pred_flag(xd, PRED_REF);
- int pred_context = vp9_get_pred_context(cm, xd, PRED_REF);
-
- // Count prediction success
- cpi->ref_pred_count[pred_context][pred_flag]++;
- }
- }
-
- // Next MB
- mb_row += dy;
- mb_col += dx;
-
- x->src.y_buffer += 16 * (dx + dy * x->src.y_stride);
- x->src.u_buffer += 8 * (dx + dy * x->src.uv_stride);
- x->src.v_buffer += 8 * (dx + dy * x->src.uv_stride);
-
- x->gf_active_ptr += offset_unextended;
- x->partition_info += offset_extended;
- xd->mode_info_context += offset_extended;
- xd->prev_mode_info_context += offset_extended;
-
-#if CONFIG_DEBUG
- assert((xd->prev_mode_info_context - cpi->common.prev_mip) ==
- (xd->mode_info_context - cpi->common.mip));
-#endif
- }
-
- /* Restore L & A coding context to those in place on entry */
- vpx_memcpy(cm->left_context,
- left_context,
- sizeof(left_context));
- vpx_memcpy(initial_above_context_ptr,
- above_context,
- sizeof(above_context));
-}
-
-#if CONFIG_SUPERBLOCKS
-static void pick_sb_modes (VP9_COMP *cpi,
- VP9_COMMON *cm,
- int mb_row,
- int mb_col,
- MACROBLOCK *x,
- MACROBLOCKD *xd,
- TOKENEXTRA **tp,
- int *totalrate,
- int *totaldist)
-{
- int map_index;
- int recon_yoffset, recon_uvoffset;
- int ref_fb_idx = cm->lst_fb_idx;
- int dst_fb_idx = cm->new_fb_idx;
- int recon_y_stride = cm->yv12_fb[ref_fb_idx].y_stride;
- int recon_uv_stride = cm->yv12_fb[ref_fb_idx].uv_stride;
- ENTROPY_CONTEXT_PLANES left_context[2];
- ENTROPY_CONTEXT_PLANES above_context[2];
- ENTROPY_CONTEXT_PLANES *initial_above_context_ptr = cm->above_context
- + mb_col;
-
- /* Function should not modify L & A contexts; save and restore on exit */
- vpx_memcpy (left_context,
- cm->left_context,
- sizeof(left_context));
- vpx_memcpy (above_context,
- initial_above_context_ptr,
- sizeof(above_context));
-
- map_index = (mb_row * cpi->common.mb_cols) + mb_col;
- x->mb_activity_ptr = &cpi->mb_activity_map[map_index];
-
- /* set above context pointer */
- xd->above_context = cm->above_context + mb_col;
-
- /* Restore the appropriate left context depending on which
- * row in the SB the MB is situated */
- xd->left_context = cm->left_context;
-
- // Set up distance of MB to edge of frame in 1/8th pel units
- xd->mb_to_top_edge = -((mb_row * 16) << 3);
- xd->mb_to_left_edge = -((mb_col * 16) << 3);
- xd->mb_to_bottom_edge = ((cm->mb_rows - 1 - mb_row) * 16) << 3;
- xd->mb_to_right_edge = ((cm->mb_cols - 1 - mb_col) * 16) << 3;
-
- /* Set up limit values for MV components to prevent them from
- * extending beyond the UMV borders assuming 16x16 block size */
- x->mv_row_min = -((mb_row * 16) + VP8BORDERINPIXELS - INTERP_EXTEND);
- x->mv_col_min = -((mb_col * 16) + VP8BORDERINPIXELS - INTERP_EXTEND);
- x->mv_row_max = ((cm->mb_rows - mb_row) * 16 +
- (VP8BORDERINPIXELS - 32 - INTERP_EXTEND));
- x->mv_col_max = ((cm->mb_cols - mb_col) * 16 +
- (VP8BORDERINPIXELS - 32 - INTERP_EXTEND));
-
- xd->up_available = (mb_row != 0);
- xd->left_available = (mb_col != 0);
-
- recon_yoffset = (mb_row * recon_y_stride * 16) + (mb_col * 16);
- recon_uvoffset = (mb_row * recon_uv_stride * 8) + (mb_col * 8);
-
- xd->dst.y_buffer = cm->yv12_fb[dst_fb_idx].y_buffer + recon_yoffset;
- xd->dst.u_buffer = cm->yv12_fb[dst_fb_idx].u_buffer + recon_uvoffset;
- xd->dst.v_buffer = cm->yv12_fb[dst_fb_idx].v_buffer + recon_uvoffset;
-#if 0 // FIXME
- /* Copy current MB to a work buffer */
- vp9_copy_mem16x16(x->src.y_buffer, x->src.y_stride, x->thismb, 16);
-#endif
- x->rddiv = cpi->RDDIV;
- x->rdmult = cpi->RDMULT;
- if(cpi->oxcf.tuning == VP8_TUNE_SSIM)
- vp9_activity_masking(cpi, x);
- /* Is segmentation enabled */
- if (xd->segmentation_enabled)
- {
- /* Code to set segment id in xd->mbmi.segment_id */
- if (xd->update_mb_segmentation_map)
- xd->mode_info_context->mbmi.segment_id =
- cpi->segmentation_map[map_index] &&
- cpi->segmentation_map[map_index + 1] &&
- cpi->segmentation_map[map_index + cm->mb_cols] &&
- cpi->segmentation_map[map_index + cm->mb_cols + 1];
- else
- xd->mode_info_context->mbmi.segment_id =
- cm->last_frame_seg_map[map_index] &&
- cm->last_frame_seg_map[map_index + 1] &&
- cm->last_frame_seg_map[map_index + cm->mb_cols] &&
- cm->last_frame_seg_map[map_index + cm->mb_cols + 1];
- if (xd->mode_info_context->mbmi.segment_id > 3)
- xd->mode_info_context->mbmi.segment_id = 0;
-
- vp9_mb_init_quantizer(cpi, x);
- }
- else
- /* Set to Segment 0 by default */
- xd->mode_info_context->mbmi.segment_id = 0;
-
- x->active_ptr = cpi->active_map + map_index;
-
- cpi->update_context = 0; // TODO Do we need this now??
-
- /* Find best coding mode & reconstruct the MB so it is available
- * as a predictor for MBs that follow in the SB */
- if (cm->frame_type == KEY_FRAME)
- {
- vp9_rd_pick_intra_mode_sb(cpi, x,
- totalrate,
- totaldist);
-
- /* Save the coding context */
- vpx_memcpy(&x->sb_context[0].mic, xd->mode_info_context,
- sizeof(MODE_INFO));
- } else {
- if (xd->segmentation_enabled && cpi->seg0_cnt > 0 &&
- !vp9_segfeature_active(xd, 0, SEG_LVL_REF_FRAME) &&
- vp9_segfeature_active(xd, 1, SEG_LVL_REF_FRAME) &&
- vp9_check_segref(xd, 1, INTRA_FRAME) +
- vp9_check_segref(xd, 1, LAST_FRAME) +
- vp9_check_segref(xd, 1, GOLDEN_FRAME) +
- vp9_check_segref(xd, 1, ALTREF_FRAME) == 1) {
- cpi->seg0_progress = (cpi->seg0_idx << 16) / cpi->seg0_cnt;
- } else {
- cpi->seg0_progress =
- (((mb_col & ~1) * 2 + (mb_row & ~1) * cm->mb_cols) << 16) / cm->MBs;
- }
-
- vp9_rd_pick_inter_mode_sb(cpi, x,
- recon_yoffset,
- recon_uvoffset,
- totalrate,
- totaldist);
- }
-
- /* Restore L & A coding context to those in place on entry */
- vpx_memcpy (cm->left_context,
- left_context,
- sizeof(left_context));
- vpx_memcpy (initial_above_context_ptr,
- above_context,
- sizeof(above_context));
-}
-#endif
-
-static void encode_sb(VP9_COMP *cpi,
- VP9_COMMON *cm,
- int mbrow,
- int mbcol,
- MACROBLOCK *x,
- MACROBLOCKD *xd,
- TOKENEXTRA **tp) {
- int i;
- int map_index;
- int mb_row, mb_col;
- int recon_yoffset, recon_uvoffset;
- int ref_fb_idx = cm->lst_fb_idx;
- int dst_fb_idx = cm->new_fb_idx;
- int recon_y_stride = cm->yv12_fb[ref_fb_idx].y_stride;
- int recon_uv_stride = cm->yv12_fb[ref_fb_idx].uv_stride;
- int row_delta[4] = { 0, +1, 0, -1};
- int col_delta[4] = { +1, -1, +1, +1};
-
- mb_row = mbrow;
- mb_col = mbcol;
-
- /* Encode MBs in raster order within the SB */
- for (i = 0; i < 4; i++) {
- int dy = row_delta[i];
- int dx = col_delta[i];
- int offset_extended = dy * xd->mode_info_stride + dx;
- int offset_unextended = dy * cm->mb_cols + dx;
- MB_MODE_INFO * mbmi = &xd->mode_info_context->mbmi;
-
- if ((mb_row >= cm->mb_rows) || (mb_col >= cm->mb_cols)) {
- // MB lies outside frame, move on
- mb_row += dy;
- mb_col += dx;
-
- x->src.y_buffer += 16 * (dx + dy * x->src.y_stride);
- x->src.u_buffer += 8 * (dx + dy * x->src.uv_stride);
- x->src.v_buffer += 8 * (dx + dy * x->src.uv_stride);
-
- x->gf_active_ptr += offset_unextended;
- x->partition_info += offset_extended;
- xd->mode_info_context += offset_extended;
- xd->prev_mode_info_context += offset_extended;
-
-#if CONFIG_DEBUG
- assert((xd->prev_mode_info_context - cpi->common.prev_mip) ==
- (xd->mode_info_context - cpi->common.mip));
-#endif
- continue;
- }
-
- xd->mb_index = i;
-
-#ifdef ENC_DEBUG
- enc_debug = (cpi->common.current_video_frame == 0 &&
- mb_row == 0 && mb_col == 0);
- mb_col_debug = mb_col;
- mb_row_debug = mb_row;
-#endif
-
- // Restore MB state to that when it was picked
-#if CONFIG_SUPERBLOCKS
- if (xd->mode_info_context->mbmi.encoded_as_sb) {
- update_state(cpi, x, &x->sb_context[i]);
- cpi->sb_count++;
- } else
-#endif
- update_state(cpi, x, &x->mb_context[i]);
-
- map_index = (mb_row * cpi->common.mb_cols) + mb_col;
- x->mb_activity_ptr = &cpi->mb_activity_map[map_index];
-
- // reset above block coeffs
- xd->above_context = cm->above_context + mb_col;
- xd->left_context = cm->left_context + (i >> 1);
-
- // Set up distance of MB to edge of the frame in 1/8th pel units
- xd->mb_to_top_edge = -((mb_row * 16) << 3);
- xd->mb_to_left_edge = -((mb_col * 16) << 3);
- xd->mb_to_bottom_edge = ((cm->mb_rows - 1 - mb_row) * 16) << 3;
- xd->mb_to_right_edge = ((cm->mb_cols - 1 - mb_col) * 16) << 3;
-
-#if CONFIG_SUPERBLOCKS
- if (xd->mode_info_context->mbmi.encoded_as_sb) {
- // Set up limit values for MV components to prevent them from
- // extending beyond the UMV borders assuming 32x32 block size
- x->mv_row_min = -((mb_row * 16) + VP8BORDERINPIXELS - INTERP_EXTEND);
- x->mv_col_min = -((mb_col * 16) + VP8BORDERINPIXELS - INTERP_EXTEND);
- x->mv_row_max = ((cm->mb_rows - mb_row) * 16 +
- (VP8BORDERINPIXELS - 32 - INTERP_EXTEND));
- x->mv_col_max = ((cm->mb_cols - mb_col) * 16 +
- (VP8BORDERINPIXELS - 32 - INTERP_EXTEND));
- } else {
-#endif
- // Set up limit values for MV components to prevent them from
- // extending beyond the UMV borders assuming 16x16 block size
- x->mv_row_min = -((mb_row * 16) + VP8BORDERINPIXELS - INTERP_EXTEND);
- x->mv_col_min = -((mb_col * 16) + VP8BORDERINPIXELS - INTERP_EXTEND);
- x->mv_row_max = ((cm->mb_rows - mb_row) * 16 +
- (VP8BORDERINPIXELS - 16 - INTERP_EXTEND));
- x->mv_col_max = ((cm->mb_cols - mb_col) * 16 +
- (VP8BORDERINPIXELS - 16 - INTERP_EXTEND));
-#if CONFIG_SUPERBLOCKS
- }
-#endif
-
- xd->up_available = (mb_row != 0);
- xd->left_available = (mb_col != 0);
-
- recon_yoffset = (mb_row * recon_y_stride * 16) + (mb_col * 16);
- recon_uvoffset = (mb_row * recon_uv_stride * 8) + (mb_col * 8);
-
- xd->dst.y_buffer = cm->yv12_fb[dst_fb_idx].y_buffer + recon_yoffset;
- xd->dst.u_buffer = cm->yv12_fb[dst_fb_idx].u_buffer + recon_uvoffset;
- xd->dst.v_buffer = cm->yv12_fb[dst_fb_idx].v_buffer + recon_uvoffset;
-
- // Copy current MB to a work buffer
- vp9_copy_mem16x16(x->src.y_buffer, x->src.y_stride, x->thismb, 16);
-
- if (cpi->oxcf.tuning == VP8_TUNE_SSIM)
- vp9_activity_masking(cpi, x);
-
- // Is segmentation enabled
- if (xd->segmentation_enabled) {
- vp9_mb_init_quantizer(cpi, x);
- }
-
- x->active_ptr = cpi->active_map + map_index;
-
- cpi->update_context = 0;
-
-#if CONFIG_SUPERBLOCKS
- if (!xd->mode_info_context->mbmi.encoded_as_sb)
-#endif
- vp9_intra_prediction_down_copy(xd);
-
- if (cm->frame_type == KEY_FRAME) {
-#if CONFIG_SUPERBLOCKS
- if (xd->mode_info_context->mbmi.encoded_as_sb)
- vp9_encode_intra_super_block(cpi, x, tp, mb_col);
- else
-#endif
- vp9_encode_intra_macro_block(cpi, x, tp, 1);
- // Note the encoder may have changed the segment_id
-
-#ifdef MODE_STATS
- y_modes[mbmi->mode]++;
-#endif
- } else {
- unsigned char *segment_id;
- int seg_ref_active;
-
- if (xd->mode_info_context->mbmi.ref_frame) {
- unsigned char pred_context;
-
- pred_context = vp9_get_pred_context(cm, xd, PRED_COMP);
-
- if (xd->mode_info_context->mbmi.second_ref_frame == INTRA_FRAME)
- cpi->single_pred_count[pred_context]++;
- else
- cpi->comp_pred_count[pred_context]++;
- }
-
-#if CONFIG_SUPERBLOCKS
- if (xd->mode_info_context->mbmi.encoded_as_sb)
- vp9_encode_inter_superblock(cpi, x, tp, recon_yoffset, recon_uvoffset,
- mb_col, mb_row);
- else
-#endif
- vp9_encode_inter_macroblock(cpi, x, tp,
- recon_yoffset, recon_uvoffset, 1);
- // Note the encoder may have changed the segment_id
-
-#ifdef MODE_STATS
- inter_y_modes[mbmi->mode]++;
-
- if (mbmi->mode == SPLITMV) {
- int b;
-
- for (b = 0; b < x->partition_info->count; b++) {
- inter_b_modes[x->partition_info->bmi[b].mode]++;
- }
- }
-
-#endif
-
- // If we have just a single reference frame coded for a segment then
- // exclude from the reference frame counts used to work out
-      // probabilities. NOTE: At the moment we don't support custom trees
- // for the reference frame coding for each segment but this is a
- // possible future action.
- segment_id = &mbmi->segment_id;
- seg_ref_active = vp9_segfeature_active(xd, *segment_id,
- SEG_LVL_REF_FRAME);
- if (!seg_ref_active ||
- ((vp9_check_segref(xd, *segment_id, INTRA_FRAME) +
- vp9_check_segref(xd, *segment_id, LAST_FRAME) +
- vp9_check_segref(xd, *segment_id, GOLDEN_FRAME) +
- vp9_check_segref(xd, *segment_id, ALTREF_FRAME)) > 1)) {
- {
- cpi->count_mb_ref_frame_usage[mbmi->ref_frame]++;
- }
- }
-
- // Count of last ref frame 0,0 usage
- if ((mbmi->mode == ZEROMV) && (mbmi->ref_frame == LAST_FRAME))
- cpi->inter_zz_count++;
- }
-
-#if CONFIG_SUPERBLOCKS
- if (xd->mode_info_context->mbmi.encoded_as_sb) {
- x->src.y_buffer += 32;
- x->src.u_buffer += 16;
- x->src.v_buffer += 16;
-
- x->gf_active_ptr += 2;
- x->partition_info += 2;
- xd->mode_info_context += 2;
- xd->prev_mode_info_context += 2;
-
- (*tp)->Token = EOSB_TOKEN;
- (*tp)++;
- if (mb_row < cm->mb_rows) cpi->tplist[mb_row].stop = *tp;
- break;
- }
-#endif
-
- // Next MB
- mb_row += dy;
- mb_col += dx;
-
- x->src.y_buffer += 16 * (dx + dy * x->src.y_stride);
- x->src.u_buffer += 8 * (dx + dy * x->src.uv_stride);
- x->src.v_buffer += 8 * (dx + dy * x->src.uv_stride);
-
- x->gf_active_ptr += offset_unextended;
- x->partition_info += offset_extended;
- xd->mode_info_context += offset_extended;
- xd->prev_mode_info_context += offset_extended;
-
-#if CONFIG_DEBUG
- assert((xd->prev_mode_info_context - cpi->common.prev_mip) ==
- (xd->mode_info_context - cpi->common.mip));
-#endif
- (*tp)->Token = EOSB_TOKEN;
- (*tp)++;
- if (mb_row < cm->mb_rows) cpi->tplist[mb_row].stop = *tp;
- }
-
- // debug output
-#if DBG_PRNT_SEGMAP
- {
- FILE *statsfile;
- statsfile = fopen("segmap2.stt", "a");
- fprintf(statsfile, "\n");
- fclose(statsfile);
- }
-#endif
-}
-
-static
-void encode_sb_row(VP9_COMP *cpi,
- VP9_COMMON *cm,
- int mb_row,
- MACROBLOCK *x,
- MACROBLOCKD *xd,
- TOKENEXTRA **tp,
- int *totalrate) {
- int mb_col;
- int mb_cols = cm->mb_cols;
-
- // Initialize the left context for the new SB row
- vpx_memset(cm->left_context, 0, sizeof(cm->left_context));
-
- // Code each SB in the row
- for (mb_col = 0; mb_col < mb_cols; mb_col += 2) {
- int mb_rate = 0, mb_dist = 0;
-#if CONFIG_SUPERBLOCKS
- int sb_rate = INT_MAX, sb_dist;
-#endif
-
-#if CONFIG_DEBUG
- MODE_INFO *mic = xd->mode_info_context;
- PARTITION_INFO *pi = x->partition_info;
- signed char *gfa = x->gf_active_ptr;
- unsigned char *yb = x->src.y_buffer;
- unsigned char *ub = x->src.u_buffer;
- unsigned char *vb = x->src.v_buffer;
-#endif
-
-#if CONFIG_SUPERBLOCKS
- // Pick modes assuming the SB is coded as 4 independent MBs
- xd->mode_info_context->mbmi.encoded_as_sb = 0;
-#endif
- pick_mb_modes(cpi, cm, mb_row, mb_col, x, xd, tp, &mb_rate, &mb_dist);
-#if CONFIG_SUPERBLOCKS
- mb_rate += vp9_cost_bit(cm->sb_coded, 0);
-#endif
-
- x->src.y_buffer -= 32;
- x->src.u_buffer -= 16;
- x->src.v_buffer -= 16;
-
- x->gf_active_ptr -= 2;
- x->partition_info -= 2;
- xd->mode_info_context -= 2;
- xd->prev_mode_info_context -= 2;
-
-#if CONFIG_DEBUG
- assert(x->gf_active_ptr == gfa);
- assert(x->partition_info == pi);
- assert(xd->mode_info_context == mic);
- assert(x->src.y_buffer == yb);
- assert(x->src.u_buffer == ub);
- assert(x->src.v_buffer == vb);
-#endif
-
-#if CONFIG_SUPERBLOCKS
- if (!((( mb_cols & 1) && mb_col == mb_cols - 1) ||
- ((cm->mb_rows & 1) && mb_row == cm->mb_rows - 1))) {
- /* Pick a mode assuming that it applies to all 4 of the MBs in the SB */
- xd->mode_info_context->mbmi.encoded_as_sb = 1;
- pick_sb_modes(cpi, cm, mb_row, mb_col, x, xd, tp, &sb_rate, &sb_dist);
- sb_rate += vp9_cost_bit(cm->sb_coded, 1);
- }
-
- /* Decide whether to encode as a SB or 4xMBs */
- if (sb_rate < INT_MAX &&
- RDCOST(x->rdmult, x->rddiv, sb_rate, sb_dist) <
- RDCOST(x->rdmult, x->rddiv, mb_rate, mb_dist)) {
- xd->mode_info_context->mbmi.encoded_as_sb = 1;
- xd->mode_info_context[1].mbmi.encoded_as_sb = 1;
- xd->mode_info_context[cm->mode_info_stride].mbmi.encoded_as_sb = 1;
- xd->mode_info_context[1 + cm->mode_info_stride].mbmi.encoded_as_sb = 1;
- *totalrate += sb_rate;
- } else
-#endif
- {
-#if CONFIG_SUPERBLOCKS
- xd->mode_info_context->mbmi.encoded_as_sb = 0;
- if (cm->mb_cols - 1 > mb_col)
- xd->mode_info_context[1].mbmi.encoded_as_sb = 0;
- if (cm->mb_rows - 1 > mb_row) {
- xd->mode_info_context[cm->mode_info_stride].mbmi.encoded_as_sb = 0;
- if (cm->mb_cols - 1 > mb_col)
- xd->mode_info_context[1 + cm->mode_info_stride].mbmi.encoded_as_sb = 0;
- }
-#endif
- *totalrate += mb_rate;
- }
-
- /* Encode SB using best computed mode(s) */
- encode_sb(cpi, cm, mb_row, mb_col, x, xd, tp);
-
-#if CONFIG_DEBUG
- assert(x->gf_active_ptr == gfa + 2);
- assert(x->partition_info == pi + 2);
- assert(xd->mode_info_context == mic + 2);
- assert(x->src.y_buffer == yb + 32);
- assert(x->src.u_buffer == ub + 16);
- assert(x->src.v_buffer == vb + 16);
-#endif
- }
-
- // this is to account for the border
- x->gf_active_ptr += mb_cols - (mb_cols & 0x1);
- x->partition_info += xd->mode_info_stride + 1 - (mb_cols & 0x1);
- xd->mode_info_context += xd->mode_info_stride + 1 - (mb_cols & 0x1);
- xd->prev_mode_info_context += xd->mode_info_stride + 1 - (mb_cols & 0x1);
-
-#if CONFIG_DEBUG
- assert((xd->prev_mode_info_context - cpi->common.prev_mip) ==
- (xd->mode_info_context - cpi->common.mip));
-#endif
-}
-
-static void init_encode_frame_mb_context(VP9_COMP *cpi) {
- MACROBLOCK *const x = &cpi->mb;
- VP9_COMMON *const cm = &cpi->common;
- MACROBLOCKD *const xd = &x->e_mbd;
-
- // GF active flags data structure
- x->gf_active_ptr = (signed char *)cpi->gf_active_flags;
-
- // Activity map pointer
- x->mb_activity_ptr = cpi->mb_activity_map;
-
- x->act_zbin_adj = 0;
- cpi->seg0_idx = 0;
- vpx_memset(cpi->ref_pred_count, 0, sizeof(cpi->ref_pred_count));
-
- x->partition_info = x->pi;
-
- xd->mode_info_context = cm->mi;
- xd->mode_info_stride = cm->mode_info_stride;
- xd->prev_mode_info_context = cm->prev_mi;
-
- xd->frame_type = cm->frame_type;
-
- xd->frames_since_golden = cm->frames_since_golden;
- xd->frames_till_alt_ref_frame = cm->frames_till_alt_ref_frame;
-
- // reset intra mode contexts
- if (cm->frame_type == KEY_FRAME)
- vp9_init_mbmode_probs(cm);
-
- // Copy data over into macro block data structures.
- x->src = * cpi->Source;
- xd->pre = cm->yv12_fb[cm->lst_fb_idx];
- xd->dst = cm->yv12_fb[cm->new_fb_idx];
-
- // set up frame for intra coded blocks
- vp9_setup_intra_recon(&cm->yv12_fb[cm->new_fb_idx]);
-
- vp9_build_block_offsets(x);
-
- vp9_setup_block_dptrs(&x->e_mbd);
-
- vp9_setup_block_ptrs(x);
-
- xd->mode_info_context->mbmi.mode = DC_PRED;
- xd->mode_info_context->mbmi.uv_mode = DC_PRED;
-
- vp9_zero(cpi->count_mb_ref_frame_usage)
- vp9_zero(cpi->bmode_count)
- vp9_zero(cpi->ymode_count)
- vp9_zero(cpi->i8x8_mode_count)
- vp9_zero(cpi->y_uv_mode_count)
- vp9_zero(cpi->sub_mv_ref_count)
- vp9_zero(cpi->mbsplit_count)
- vp9_zero(cpi->common.fc.mv_ref_ct)
- vp9_zero(cpi->common.fc.mv_ref_ct_a)
-#if CONFIG_SUPERBLOCKS
- vp9_zero(cpi->sb_ymode_count)
- cpi->sb_count = 0;
-#endif
-
- vpx_memset(cm->above_context, 0,
- sizeof(ENTROPY_CONTEXT_PLANES) * cm->mb_cols);
-
- xd->fullpixel_mask = 0xffffffff;
- if (cm->full_pixel)
- xd->fullpixel_mask = 0xfffffff8;
-}
-
-static void encode_frame_internal(VP9_COMP *cpi) {
- int mb_row;
- MACROBLOCK *const x = &cpi->mb;
- VP9_COMMON *const cm = &cpi->common;
- MACROBLOCKD *const xd = &x->e_mbd;
-
- TOKENEXTRA *tp = cpi->tok;
- int totalrate;
-
- //printf("encode_frame_internal\n");
-
- // Compute a modified set of reference frame probabilities to use when
- // prediction fails. These are based on the current general estimates for
- // this frame which may be updated with each iteration of the recode loop.
- vp9_compute_mod_refprobs(cm);
-
-#if CONFIG_NEW_MVREF
- // temp stats reset
- vp9_zero( cpi->best_ref_index_counts );
-#endif
-
-// debug output
-#if DBG_PRNT_SEGMAP
- {
- FILE *statsfile;
- statsfile = fopen("segmap2.stt", "a");
- fprintf(statsfile, "\n");
- fclose(statsfile);
- }
-#endif
-
- totalrate = 0;
-
- // Functions setup for all frame types so we can use MC in AltRef
- vp9_setup_interp_filters(xd, cm->mcomp_filter_type, cm);
-
- // Reset frame count of inter 0,0 motion vector usage.
- cpi->inter_zz_count = 0;
-
- cpi->prediction_error = 0;
- cpi->intra_error = 0;
- cpi->skip_true_count[0] = cpi->skip_true_count[1] = cpi->skip_true_count[2] = 0;
- cpi->skip_false_count[0] = cpi->skip_false_count[1] = cpi->skip_false_count[2] = 0;
-
-#if CONFIG_PRED_FILTER
- if (cm->current_video_frame == 0) {
- // Initially assume that we'll signal the prediction filter
- // state at the frame level and that it is off.
- cpi->common.pred_filter_mode = 0;
- cpi->common.prob_pred_filter_off = 128;
- }
- cpi->pred_filter_on_count = 0;
- cpi->pred_filter_off_count = 0;
-#endif
- vp9_zero(cpi->switchable_interp_count);
-
- xd->mode_info_context = cm->mi;
- xd->prev_mode_info_context = cm->prev_mi;
-
- vp9_zero(cpi->NMVcount);
- vp9_zero(cpi->coef_counts);
- vp9_zero(cpi->hybrid_coef_counts);
- vp9_zero(cpi->coef_counts_8x8);
- vp9_zero(cpi->hybrid_coef_counts_8x8);
- vp9_zero(cpi->coef_counts_16x16);
- vp9_zero(cpi->hybrid_coef_counts_16x16);
-
- vp9_frame_init_quantizer(cpi);
-
- vp9_initialize_rd_consts(cpi, cm->base_qindex + cm->y1dc_delta_q);
- vp9_initialize_me_consts(cpi, cm->base_qindex);
-
- if (cpi->oxcf.tuning == VP8_TUNE_SSIM) {
- // Initialize encode frame context.
- init_encode_frame_mb_context(cpi);
-
- // Build a frame level activity map
- build_activity_map(cpi);
- }
-
-  // re-initialize encode frame context.
- init_encode_frame_mb_context(cpi);
-
- vpx_memset(cpi->rd_comp_pred_diff, 0, sizeof(cpi->rd_comp_pred_diff));
- vpx_memset(cpi->single_pred_count, 0, sizeof(cpi->single_pred_count));
- vpx_memset(cpi->comp_pred_count, 0, sizeof(cpi->comp_pred_count));
- vpx_memset(cpi->txfm_count, 0, sizeof(cpi->txfm_count));
- vpx_memset(cpi->txfm_count_8x8p, 0, sizeof(cpi->txfm_count_8x8p));
- vpx_memset(cpi->rd_tx_select_diff, 0, sizeof(cpi->rd_tx_select_diff));
- {
- struct vpx_usec_timer emr_timer;
- vpx_usec_timer_start(&emr_timer);
-
- {
- // For each row of SBs in the frame
- for (mb_row = 0; mb_row < cm->mb_rows; mb_row += 2) {
- int offset = (cm->mb_cols + 1) & ~0x1;
-
- encode_sb_row(cpi, cm, mb_row, x, xd, &tp, &totalrate);
-
- // adjust to the next row of SBs
- x->src.y_buffer += 32 * x->src.y_stride - 16 * offset;
- x->src.u_buffer += 16 * x->src.uv_stride - 8 * offset;
- x->src.v_buffer += 16 * x->src.uv_stride - 8 * offset;
- }
-
- cpi->tok_count = tp - cpi->tok;
- }
-
- vpx_usec_timer_mark(&emr_timer);
- cpi->time_encode_mb_row += vpx_usec_timer_elapsed(&emr_timer);
-
- }
-
- // 256 rate units to the bit,
- // projected_frame_size in units of BYTES
- cpi->projected_frame_size = totalrate >> 8;
-
-
-#if 0
- // Keep record of the total distortion this time around for future use
- cpi->last_frame_distortion = cpi->frame_distortion;
-#endif
-
-}
-
-static int check_dual_ref_flags(VP9_COMP *cpi) {
- MACROBLOCKD *xd = &cpi->mb.e_mbd;
- int ref_flags = cpi->ref_frame_flags;
-
- if (vp9_segfeature_active(xd, 1, SEG_LVL_REF_FRAME)) {
- if ((ref_flags & (VP9_LAST_FLAG | VP9_GOLD_FLAG)) == (VP9_LAST_FLAG | VP9_GOLD_FLAG) &&
- vp9_check_segref(xd, 1, LAST_FRAME))
- return 1;
- if ((ref_flags & (VP9_GOLD_FLAG | VP9_ALT_FLAG)) == (VP9_GOLD_FLAG | VP9_ALT_FLAG) &&
- vp9_check_segref(xd, 1, GOLDEN_FRAME))
- return 1;
- if ((ref_flags & (VP9_ALT_FLAG | VP9_LAST_FLAG)) == (VP9_ALT_FLAG | VP9_LAST_FLAG) &&
- vp9_check_segref(xd, 1, ALTREF_FRAME))
- return 1;
- return 0;
- } else {
- return (!!(ref_flags & VP9_GOLD_FLAG) +
- !!(ref_flags & VP9_LAST_FLAG) +
- !!(ref_flags & VP9_ALT_FLAG)) >= 2;
- }
-}
-
-static void reset_skip_txfm_size(VP9_COMP *cpi, TX_SIZE txfm_max) {
- VP9_COMMON *cm = &cpi->common;
- int mb_row, mb_col, mis = cm->mode_info_stride, segment_id;
- MODE_INFO *mi, *mi_ptr = cm->mi;
-#if CONFIG_SUPERBLOCKS
- MODE_INFO *sb_mi_ptr = cm->mi, *sb_mi;
- MB_MODE_INFO *sb_mbmi;
-#endif
- MB_MODE_INFO *mbmi;
- MACROBLOCK *x = &cpi->mb;
- MACROBLOCKD *xd = &x->e_mbd;
-
- for (mb_row = 0; mb_row < cm->mb_rows; mb_row++, mi_ptr += mis) {
- mi = mi_ptr;
-#if CONFIG_SUPERBLOCKS
- sb_mi = sb_mi_ptr;
-#endif
- for (mb_col = 0; mb_col < cm->mb_cols; mb_col++, mi++) {
- mbmi = &mi->mbmi;
-#if CONFIG_SUPERBLOCKS
- sb_mbmi = &sb_mi->mbmi;
-#endif
- if (
-#if CONFIG_SUPERBLOCKS
- !sb_mbmi->encoded_as_sb &&
-#endif
- mbmi->txfm_size > txfm_max) {
- segment_id = mbmi->segment_id;
- xd->mode_info_context = mi;
- assert((vp9_segfeature_active(xd, segment_id, SEG_LVL_EOB) &&
- vp9_get_segdata(xd, segment_id, SEG_LVL_EOB) == 0) ||
- (cm->mb_no_coeff_skip && mbmi->mb_skip_coeff));
- mbmi->txfm_size = txfm_max;
- }
-#if CONFIG_SUPERBLOCKS
- if (mb_col & 1)
- sb_mi += 2;
-#endif
- }
-#if CONFIG_SUPERBLOCKS
- if (mb_row & 1)
- sb_mi_ptr += 2 * mis;
-#endif
- }
-}
-
-void vp9_encode_frame(VP9_COMP *cpi) {
- if (cpi->sf.RD) {
- int i, frame_type, pred_type;
- TXFM_MODE txfm_type;
-
- /*
- * This code does a single RD pass over the whole frame assuming
- * either compound, single or hybrid prediction as per whatever has
- * worked best for that type of frame in the past.
- * It also predicts whether another coding mode would have worked
- * better that this coding mode. If that is the case, it remembers
- * that for subsequent frames.
- * It does the same analysis for transform size selection also.
- */
- if (cpi->common.frame_type == KEY_FRAME)
- frame_type = 0;
- else if (cpi->is_src_frame_alt_ref && cpi->common.refresh_golden_frame)
- frame_type = 3;
- else if (cpi->common.refresh_golden_frame || cpi->common.refresh_alt_ref_frame)
- frame_type = 1;
- else
- frame_type = 2;
-
- /* prediction (compound, single or hybrid) mode selection */
- if (frame_type == 3)
- pred_type = SINGLE_PREDICTION_ONLY;
- else if (cpi->rd_prediction_type_threshes[frame_type][1] >
- cpi->rd_prediction_type_threshes[frame_type][0] &&
- cpi->rd_prediction_type_threshes[frame_type][1] >
- cpi->rd_prediction_type_threshes[frame_type][2] &&
- check_dual_ref_flags(cpi) && cpi->static_mb_pct == 100)
- pred_type = COMP_PREDICTION_ONLY;
- else if (cpi->rd_prediction_type_threshes[frame_type][0] >
- cpi->rd_prediction_type_threshes[frame_type][2])
- pred_type = SINGLE_PREDICTION_ONLY;
- else
- pred_type = HYBRID_PREDICTION;
-
- /* transform size (4x4, 8x8, 16x16 or select-per-mb) selection */
-#if CONFIG_LOSSLESS
- if (cpi->oxcf.lossless) {
- txfm_type = ONLY_4X4;
- } else
-#endif
- /* FIXME (rbultje)
- * this is a hack (no really), basically to work around the complete
- * nonsense coefficient cost prediction for keyframes. The probabilities
- * are reset to defaults, and thus we basically have no idea how expensive
- * a 4x4 vs. 8x8 will really be. The result is that any estimate at which
- * of the two is better is utterly bogus.
- * I'd like to eventually remove this hack, but in order to do that, we
- * need to move the frame reset code from the frame encode init to the
- * bitstream write code, or alternatively keep a backup of the previous
- * keyframe's probabilities as an estimate of what the current keyframe's
- * coefficient cost distributions may look like. */
- if (frame_type == 0) {
- txfm_type = ALLOW_16X16;
- } else
-#if 0
- /* FIXME (rbultje)
- * this code is disabled for a similar reason as the code above; the
- * problem is that each time we "revert" to 4x4 only (or even 8x8 only),
- * the coefficient probabilities for 16x16 (and 8x8) start lagging behind,
- * thus leading to them lagging further behind and not being chosen for
- * subsequent frames either. This is essentially a local minimum problem
- * that we can probably fix by estimating real costs more closely within
- * a frame, perhaps by re-calculating costs on-the-fly as frame encoding
- * progresses. */
- if (cpi->rd_tx_select_threshes[frame_type][TX_MODE_SELECT] >
- cpi->rd_tx_select_threshes[frame_type][ONLY_4X4] &&
- cpi->rd_tx_select_threshes[frame_type][TX_MODE_SELECT] >
- cpi->rd_tx_select_threshes[frame_type][ALLOW_16X16] &&
- cpi->rd_tx_select_threshes[frame_type][TX_MODE_SELECT] >
- cpi->rd_tx_select_threshes[frame_type][ALLOW_8X8]) {
- txfm_type = TX_MODE_SELECT;
- } else if (cpi->rd_tx_select_threshes[frame_type][ONLY_4X4] >
- cpi->rd_tx_select_threshes[frame_type][ALLOW_8X8]
- && cpi->rd_tx_select_threshes[frame_type][ONLY_4X4] >
- cpi->rd_tx_select_threshes[frame_type][ALLOW_16X16]
- ) {
- txfm_type = ONLY_4X4;
- } else if (cpi->rd_tx_select_threshes[frame_type][ALLOW_16X16] >=
- cpi->rd_tx_select_threshes[frame_type][ALLOW_8X8]) {
- txfm_type = ALLOW_16X16;
- } else
- txfm_type = ALLOW_8X8;
-#else
- txfm_type = cpi->rd_tx_select_threshes[frame_type][ALLOW_16X16] >=
- cpi->rd_tx_select_threshes[frame_type][TX_MODE_SELECT] ?
- ALLOW_16X16 : TX_MODE_SELECT;
-#endif
- cpi->common.txfm_mode = txfm_type;
- if (txfm_type != TX_MODE_SELECT) {
- cpi->common.prob_tx[0] = 128;
- cpi->common.prob_tx[1] = 128;
- }
- cpi->common.comp_pred_mode = pred_type;
- encode_frame_internal(cpi);
-
- for (i = 0; i < NB_PREDICTION_TYPES; ++i) {
- const int diff = cpi->rd_comp_pred_diff[i] / cpi->common.MBs;
- cpi->rd_prediction_type_threshes[frame_type][i] += diff;
- cpi->rd_prediction_type_threshes[frame_type][i] >>= 1;
- }
-
- for (i = 0; i < NB_TXFM_MODES; ++i) {
- int64_t pd = cpi->rd_tx_select_diff[i];
- int diff;
- if (i == TX_MODE_SELECT)
- pd -= RDCOST(cpi->mb.rdmult, cpi->mb.rddiv, 2048 * (TX_SIZE_MAX - 1), 0);
- diff = pd / cpi->common.MBs;
- cpi->rd_tx_select_threshes[frame_type][i] += diff;
- cpi->rd_tx_select_threshes[frame_type][i] /= 2;
- }
-
- if (cpi->common.comp_pred_mode == HYBRID_PREDICTION) {
- int single_count_zero = 0;
- int comp_count_zero = 0;
-
- for (i = 0; i < COMP_PRED_CONTEXTS; i++) {
- single_count_zero += cpi->single_pred_count[i];
- comp_count_zero += cpi->comp_pred_count[i];
- }
-
- if (comp_count_zero == 0) {
- cpi->common.comp_pred_mode = SINGLE_PREDICTION_ONLY;
- } else if (single_count_zero == 0) {
- cpi->common.comp_pred_mode = COMP_PREDICTION_ONLY;
- }
- }
-
- if (cpi->common.txfm_mode == TX_MODE_SELECT) {
- const int count4x4 = cpi->txfm_count[TX_4X4] + cpi->txfm_count_8x8p[TX_4X4];
- const int count8x8 = cpi->txfm_count[TX_8X8];
- const int count8x8_8x8p = cpi->txfm_count_8x8p[TX_8X8];
- const int count16x16 = cpi->txfm_count[TX_16X16];
-
- if (count4x4 == 0 && count16x16 == 0) {
- cpi->common.txfm_mode = ALLOW_8X8;
- reset_skip_txfm_size(cpi, TX_8X8);
- } else if (count8x8 == 0 && count16x16 == 0 && count8x8_8x8p == 0) {
- cpi->common.txfm_mode = ONLY_4X4;
- reset_skip_txfm_size(cpi, TX_4X4);
- } else if (count8x8 == 0 && count4x4 == 0) {
- cpi->common.txfm_mode = ALLOW_16X16;
- }
- }
- } else {
- encode_frame_internal(cpi);
- }
-
-}
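A minimal sketch of the feedback loop implemented above, shown in isolation; the helper name and signature below are illustrative and not part of the encoder:

#include <stdint.h>

/* Fold one frame's measured per-MB RD difference for a prediction type
 * (or transform mode) into the running threshold for that frame type,
 * as the two update loops in vp9_encode_frame() do above. */
static void fold_rd_threshold(int64_t *thresh, int64_t rd_diff, int num_mbs) {
  *thresh += rd_diff / num_mbs;  /* add this frame's per-MB difference    */
  *thresh >>= 1;                 /* halve: exponentially decaying average */
}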
-
-void vp9_setup_block_ptrs(MACROBLOCK *x) {
- int r, c;
- int i;
-
- for (r = 0; r < 4; r++) {
- for (c = 0; c < 4; c++) {
- x->block[r * 4 + c].src_diff = x->src_diff + r * 4 * 16 + c * 4;
- }
- }
-
- for (r = 0; r < 2; r++) {
- for (c = 0; c < 2; c++) {
- x->block[16 + r * 2 + c].src_diff = x->src_diff + 256 + r * 4 * 8 + c * 4;
- }
- }
-
-
- for (r = 0; r < 2; r++) {
- for (c = 0; c < 2; c++) {
- x->block[20 + r * 2 + c].src_diff = x->src_diff + 320 + r * 4 * 8 + c * 4;
- }
- }
-
- x->block[24].src_diff = x->src_diff + 384;
-
-
- for (i = 0; i < 25; i++) {
- x->block[i].coeff = x->coeff + i * 16;
- }
-}
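For readability, the buffer layout that vp9_setup_block_ptrs() establishes, restated as constants; the names below are illustrative only and do not exist in the source:

/* Layout of the 400-entry src_diff buffer implied by the code above. */
enum {
  SRC_DIFF_Y_OFFSET  = 0,    /* blocks 0..15: 16x16 luma as 16 4x4 blocks */
  SRC_DIFF_U_OFFSET  = 256,  /* blocks 16..19: 8x8 U plane                */
  SRC_DIFF_V_OFFSET  = 320,  /* blocks 20..23: 8x8 V plane                */
  SRC_DIFF_Y2_OFFSET = 384   /* block 24: 4x4 second-order (DC) block     */
};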
-
-void vp9_build_block_offsets(MACROBLOCK *x) {
- int block = 0;
- int br, bc;
-
- vp9_build_block_doffsets(&x->e_mbd);
-
- // y blocks
- x->thismb_ptr = &x->thismb[0];
- for (br = 0; br < 4; br++) {
- for (bc = 0; bc < 4; bc++) {
- BLOCK *this_block = &x->block[block];
- // this_block->base_src = &x->src.y_buffer;
- // this_block->src_stride = x->src.y_stride;
- // this_block->src = 4 * br * this_block->src_stride + 4 * bc;
- this_block->base_src = &x->thismb_ptr;
- this_block->src_stride = 16;
- this_block->src = 4 * br * 16 + 4 * bc;
- ++block;
- }
- }
-
- // u blocks
- for (br = 0; br < 2; br++) {
- for (bc = 0; bc < 2; bc++) {
- BLOCK *this_block = &x->block[block];
- this_block->base_src = &x->src.u_buffer;
- this_block->src_stride = x->src.uv_stride;
- this_block->src = 4 * br * this_block->src_stride + 4 * bc;
- ++block;
- }
- }
-
- // v blocks
- for (br = 0; br < 2; br++) {
- for (bc = 0; bc < 2; bc++) {
- BLOCK *this_block = &x->block[block];
- this_block->base_src = &x->src.v_buffer;
- this_block->src_stride = x->src.uv_stride;
- this_block->src = 4 * br * this_block->src_stride + 4 * bc;
- ++block;
- }
- }
-}
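One non-obvious point in the function above, noted as an aside:

/* Luma blocks read from the contiguous 16x16 copy in x->thismb
 * (stride 16), so the block at (br, bc) starts at
 * x->thismb_ptr[4 * br * 16 + 4 * bc]; the chroma blocks still point
 * into x->src.u_buffer / x->src.v_buffer with the frame's uv stride. */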
-
-static void sum_intra_stats(VP9_COMP *cpi, MACROBLOCK *x) {
- const MACROBLOCKD *xd = &x->e_mbd;
- const MB_PREDICTION_MODE m = xd->mode_info_context->mbmi.mode;
- const MB_PREDICTION_MODE uvm = xd->mode_info_context->mbmi.uv_mode;
-
-#ifdef MODE_STATS
- const int is_key = cpi->common.frame_type == KEY_FRAME;
-
- ++ (is_key ? uv_modes : inter_uv_modes)[uvm];
- ++ uv_modes_y[m][uvm];
-
- if (m == B_PRED) {
- unsigned int *const bct = is_key ? b_modes : inter_b_modes;
-
- int b = 0;
-
- do {
- ++ bct[xd->block[b].bmi.as_mode.first];
- } while (++b < 16);
- }
-
- if (m == I8X8_PRED) {
- i8x8_modes[xd->block[0].bmi.as_mode.first]++;
- i8x8_modes[xd->block[2].bmi.as_mode.first]++;
- i8x8_modes[xd->block[8].bmi.as_mode.first]++;
- i8x8_modes[xd->block[10].bmi.as_mode.first]++;
- }
-#endif
-
-#if CONFIG_SUPERBLOCKS
- if (xd->mode_info_context->mbmi.encoded_as_sb) {
- ++cpi->sb_ymode_count[m];
- } else
-#endif
- ++cpi->ymode_count[m];
- if (m != I8X8_PRED)
- ++cpi->y_uv_mode_count[m][uvm];
- else {
- cpi->i8x8_mode_count[xd->block[0].bmi.as_mode.first]++;
- cpi->i8x8_mode_count[xd->block[2].bmi.as_mode.first]++;
- cpi->i8x8_mode_count[xd->block[8].bmi.as_mode.first]++;
- cpi->i8x8_mode_count[xd->block[10].bmi.as_mode.first]++;
- }
- if (m == B_PRED) {
- int b = 0;
- do {
- ++ cpi->bmode_count[xd->block[b].bmi.as_mode.first];
- } while (++b < 16);
- }
-}
-
-// Experimental stub function to create a per MB zbin adjustment based on
-// some previously calculated measure of MB activity.
-static void adjust_act_zbin(VP9_COMP *cpi, MACROBLOCK *x) {
-#if USE_ACT_INDEX
- x->act_zbin_adj = *(x->mb_activity_ptr);
-#else
- int64_t a;
- int64_t b;
- int64_t act = *(x->mb_activity_ptr);
-
- // Apply the masking to the RD multiplier.
- a = act + 4 * cpi->activity_avg;
- b = 4 * act + cpi->activity_avg;
-
- if (act > cpi->activity_avg)
- x->act_zbin_adj = (int)(((int64_t)b + (a >> 1)) / a) - 1;
- else
- x->act_zbin_adj = 1 - (int)(((int64_t)a + (b >> 1)) / b);
-#endif
-}
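A worked instance of the non-USE_ACT_INDEX branch above, with made-up numbers purely for illustration:

#include <stdint.h>

static int example_act_zbin_adj(void) {
  /* An MB four times as "active" as the frame average. */
  const int64_t avg = 100, act = 4 * avg;
  const int64_t a = act + 4 * avg;          /* 800  */
  const int64_t b = 4 * act + avg;          /* 1700 */
  return (int)((b + (a >> 1)) / a) - 1;     /* 2100 / 800 - 1 = 1 */
}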
-
-#if CONFIG_SUPERBLOCKS
-static void update_sb_skip_coeff_state(VP9_COMP *cpi,
- MACROBLOCK *x,
- ENTROPY_CONTEXT_PLANES ta[4],
- ENTROPY_CONTEXT_PLANES tl[4],
- TOKENEXTRA *t[4],
- TOKENEXTRA **tp,
- int skip[4])
-{
- TOKENEXTRA tokens[4][16 * 24];
- int n_tokens[4], n;
-
- // if there were no skips, we don't need to do anything
- if (!skip[0] && !skip[1] && !skip[2] && !skip[3])
- return;
-
- // if we don't do coeff skipping for this frame, we don't
- // need to do anything here
- if (!cpi->common.mb_no_coeff_skip)
- return;
-
- // if all 4 MBs skipped coeff coding, nothing to be done
- if (skip[0] && skip[1] && skip[2] && skip[3])
- return;
-
- // so the situation now is that we want to skip coeffs
- // for some MBs, but not all, and we didn't code EOB
- // coefficients for them. However, the skip flag for this
- // SB will be 0 overall, so we need to insert EOBs in the
- // middle of the token tree. Do so here.
- n_tokens[0] = t[1] - t[0];
- n_tokens[1] = t[2] - t[1];
- n_tokens[2] = t[3] - t[2];
- n_tokens[3] = *tp - t[3];
- if (n_tokens[0])
- memcpy(tokens[0], t[0], n_tokens[0] * sizeof(*t[0]));
- if (n_tokens[1])
- memcpy(tokens[1], t[1], n_tokens[1] * sizeof(*t[0]));
- if (n_tokens[2])
- memcpy(tokens[2], t[2], n_tokens[2] * sizeof(*t[0]));
- if (n_tokens[3])
- memcpy(tokens[3], t[3], n_tokens[3] * sizeof(*t[0]));
-
- // reset pointer, stuff EOBs where necessary
- *tp = t[0];
- for (n = 0; n < 4; n++) {
- if (skip[n]) {
- x->e_mbd.above_context = &ta[n];
- x->e_mbd.left_context = &tl[n];
- vp9_stuff_mb(cpi, &x->e_mbd, tp, 0);
- } else {
- if (n_tokens[n]) {
- memcpy(*tp, tokens[n], sizeof(*t[0]) * n_tokens[n]);
- }
- (*tp) += n_tokens[n];
- }
- }
-}
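A hypothetical case may make the splicing above concrete:

/* Suppose MBs 0 and 2 of the superblock were marked skipped while 1 and
 * 3 were not.  After the loop above, the superblock's token stream reads
 *   [stuffed EOBs for MB 0][saved tokens of MB 1]
 *   [stuffed EOBs for MB 2][saved tokens of MB 3]
 * so the whole SB can be coded with its overall skip flag clear. */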
-
-void vp9_encode_intra_super_block(VP9_COMP *cpi,
- MACROBLOCK *x,
- TOKENEXTRA **t,
- int mb_col) {
- const int output_enabled = 1;
- int n;
- MACROBLOCKD *xd = &x->e_mbd;
- VP9_COMMON *cm = &cpi->common;
- const uint8_t *src = x->src.y_buffer;
- uint8_t *dst = xd->dst.y_buffer;
- const uint8_t *usrc = x->src.u_buffer;
- uint8_t *udst = xd->dst.u_buffer;
- const uint8_t *vsrc = x->src.v_buffer;
- uint8_t *vdst = xd->dst.v_buffer;
- int src_y_stride = x->src.y_stride, dst_y_stride = xd->dst.y_stride;
- int src_uv_stride = x->src.uv_stride, dst_uv_stride = xd->dst.uv_stride;
- const VP9_ENCODER_RTCD *rtcd = IF_RTCD(&cpi->rtcd);
- TOKENEXTRA *tp[4];
- int skip[4];
- MODE_INFO *mi = x->e_mbd.mode_info_context;
- ENTROPY_CONTEXT_PLANES ta[4], tl[4];
-
- if ((cpi->oxcf.tuning == VP8_TUNE_SSIM) && output_enabled) {
- adjust_act_zbin(cpi, x);
- vp9_update_zbin_extra(cpi, x);
- }
-
- vp9_build_intra_predictors_sby_s(&x->e_mbd);
- vp9_build_intra_predictors_sbuv_s(&x->e_mbd);
-
- assert(x->e_mbd.mode_info_context->mbmi.txfm_size == TX_8X8);
- for (n = 0; n < 4; n++) {
- int x_idx = n & 1, y_idx = n >> 1;
-
- xd->above_context = cm->above_context + mb_col + (n & 1);
- xd->left_context = cm->left_context + (n >> 1);
-
- vp9_subtract_mby_s_c(x->src_diff,
- src + x_idx * 16 + y_idx * 16 * src_y_stride,
- src_y_stride,
- dst + x_idx * 16 + y_idx * 16 * dst_y_stride,
- dst_y_stride);
- vp9_subtract_mbuv_s_c(x->src_diff,
- usrc + x_idx * 8 + y_idx * 8 * src_uv_stride,
- vsrc + x_idx * 8 + y_idx * 8 * src_uv_stride,
- src_uv_stride,
- udst + x_idx * 8 + y_idx * 8 * dst_uv_stride,
- vdst + x_idx * 8 + y_idx * 8 * dst_uv_stride,
- dst_uv_stride);
- vp9_transform_mb_8x8(x);
- vp9_quantize_mb_8x8(x);
- if (x->optimize) {
- vp9_optimize_mby_8x8(x, rtcd);
- vp9_optimize_mbuv_8x8(x, rtcd);
- }
- vp9_inverse_transform_mb_8x8(IF_RTCD(&rtcd->common->idct), &x->e_mbd);
- vp9_recon_mby_s_c(&x->e_mbd, dst + x_idx * 16 + y_idx * 16 * dst_y_stride);
- vp9_recon_mbuv_s_c(&x->e_mbd,
- udst + x_idx * 8 + y_idx * 8 * dst_uv_stride,
- vdst + x_idx * 8 + y_idx * 8 * dst_uv_stride);
-
- if (output_enabled) {
- memcpy(&ta[n], xd->above_context, sizeof(ta[n]));
- memcpy(&tl[n], xd->left_context, sizeof(tl[n]));
- tp[n] = *t;
- xd->mode_info_context = mi + x_idx + y_idx * cm->mode_info_stride;
- vp9_tokenize_mb(cpi, &x->e_mbd, t, 0);
- skip[n] = xd->mode_info_context->mbmi.mb_skip_coeff;
- }
- }
-
- if (output_enabled) {
- // Tokenize
- xd->mode_info_context = mi;
- sum_intra_stats(cpi, x);
- update_sb_skip_coeff_state(cpi, x, ta, tl, tp, t, skip);
- }
-}
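The quadrant indexing used in the loop above (and again in the inter superblock path below), spelled out for reference:

/* For n = 0..3 over the four 16x16 quadrants of a 32x32 superblock:
 *   x_idx = n & 1   ->  0 1 0 1   (left / right)
 *   y_idx = n >> 1  ->  0 0 1 1   (top / bottom)
 * Luma offsets step by 16 pixels per quadrant, chroma offsets by 8. */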
-#endif /* CONFIG_SUPERBLOCKS */
-
-void vp9_encode_intra_macro_block(VP9_COMP *cpi,
- MACROBLOCK *x,
- TOKENEXTRA **t,
- int output_enabled) {
- MB_MODE_INFO * mbmi = &x->e_mbd.mode_info_context->mbmi;
- if ((cpi->oxcf.tuning == VP8_TUNE_SSIM) && output_enabled) {
- adjust_act_zbin(cpi, x);
- vp9_update_zbin_extra(cpi, x);
- }
- if (mbmi->mode == I8X8_PRED) {
- vp9_encode_intra8x8mby(IF_RTCD(&cpi->rtcd), x);
- vp9_encode_intra8x8mbuv(IF_RTCD(&cpi->rtcd), x);
- } else if (mbmi->mode == B_PRED) {
- vp9_encode_intra4x4mby(IF_RTCD(&cpi->rtcd), x);
- } else {
- vp9_encode_intra16x16mby(IF_RTCD(&cpi->rtcd), x);
- }
-
- if (mbmi->mode != I8X8_PRED) {
- vp9_encode_intra16x16mbuv(IF_RTCD(&cpi->rtcd), x);
- }
-
- if (output_enabled) {
- int segment_id = mbmi->segment_id;
-
- // Tokenize
- sum_intra_stats(cpi, x);
- vp9_tokenize_mb(cpi, &x->e_mbd, t, 0);
-
- if (cpi->common.txfm_mode == TX_MODE_SELECT &&
- !((cpi->common.mb_no_coeff_skip && mbmi->mb_skip_coeff) ||
- (vp9_segfeature_active(&x->e_mbd, segment_id, SEG_LVL_EOB) &&
- vp9_get_segdata(&x->e_mbd, segment_id, SEG_LVL_EOB) == 0))) {
- if (mbmi->mode != B_PRED && mbmi->mode != I8X8_PRED) {
- cpi->txfm_count[mbmi->txfm_size]++;
- } else if (mbmi->mode == I8X8_PRED) {
- cpi->txfm_count_8x8p[mbmi->txfm_size]++;
- }
- } else if (cpi->common.txfm_mode >= ALLOW_16X16 && mbmi->mode <= TM_PRED) {
- mbmi->txfm_size = TX_16X16;
- } else
- if (cpi->common.txfm_mode >= ALLOW_8X8 && mbmi->mode != B_PRED) {
- mbmi->txfm_size = TX_8X8;
- } else {
- mbmi->txfm_size = TX_4X4;
- }
- }
-#if CONFIG_NEWBESTREFMV
- else
- vp9_tokenize_mb(cpi, &x->e_mbd, t, 1);
-#endif
-}
-
-extern void vp9_fix_contexts(MACROBLOCKD *xd);
-
-void vp9_encode_inter_macroblock(VP9_COMP *cpi, MACROBLOCK *x,
- TOKENEXTRA **t, int recon_yoffset,
- int recon_uvoffset, int output_enabled) {
- VP9_COMMON *cm = &cpi->common;
- MACROBLOCKD *const xd = &x->e_mbd;
- MB_MODE_INFO * mbmi = &xd->mode_info_context->mbmi;
- unsigned char *segment_id = &mbmi->segment_id;
- int seg_ref_active;
- unsigned char ref_pred_flag;
-
- x->skip = 0;
-#if CONFIG_SUPERBLOCKS
- assert(!xd->mode_info_context->mbmi.encoded_as_sb);
-#endif
-
- vp9_setup_interp_filters(xd, mbmi->interp_filter, cm);
- if (cpi->oxcf.tuning == VP8_TUNE_SSIM) {
- // Adjust the zbin based on this MB rate.
- adjust_act_zbin(cpi, x);
- }
-
- {
- // Experimental code. Special case for gf and arf zeromv modes.
- // Increase zbin size to suppress noise
- cpi->zbin_mode_boost = 0;
- if (cpi->zbin_mode_boost_enabled) {
- if (mbmi->ref_frame != INTRA_FRAME) {
- if (mbmi->mode == ZEROMV) {
- if (mbmi->ref_frame != LAST_FRAME)
- cpi->zbin_mode_boost = GF_ZEROMV_ZBIN_BOOST;
- else
- cpi->zbin_mode_boost = LF_ZEROMV_ZBIN_BOOST;
- } else if (mbmi->mode == SPLITMV)
- cpi->zbin_mode_boost = 0;
- else
- cpi->zbin_mode_boost = MV_ZBIN_BOOST;
- }
- }
-
- vp9_update_zbin_extra(cpi, x);
- }
-
- seg_ref_active = vp9_segfeature_active(xd, *segment_id, SEG_LVL_REF_FRAME);
-
- // SET VARIOUS PREDICTION FLAGS
-
- // Did the chosen reference frame match its predicted value.
- ref_pred_flag = ((mbmi->ref_frame == vp9_get_pred_ref(cm, xd)));
- vp9_set_pred_flag(xd, PRED_REF, ref_pred_flag);
-
- if (mbmi->ref_frame == INTRA_FRAME) {
- if (mbmi->mode == B_PRED) {
- vp9_encode_intra16x16mbuv(IF_RTCD(&cpi->rtcd), x);
- vp9_encode_intra4x4mby(IF_RTCD(&cpi->rtcd), x);
- } else if (mbmi->mode == I8X8_PRED) {
- vp9_encode_intra8x8mby(IF_RTCD(&cpi->rtcd), x);
- vp9_encode_intra8x8mbuv(IF_RTCD(&cpi->rtcd), x);
- } else {
- vp9_encode_intra16x16mbuv(IF_RTCD(&cpi->rtcd), x);
- vp9_encode_intra16x16mby(IF_RTCD(&cpi->rtcd), x);
- }
-
- if (output_enabled)
- sum_intra_stats(cpi, x);
- } else {
- int ref_fb_idx;
-
- if (mbmi->ref_frame == LAST_FRAME)
- ref_fb_idx = cpi->common.lst_fb_idx;
- else if (mbmi->ref_frame == GOLDEN_FRAME)
- ref_fb_idx = cpi->common.gld_fb_idx;
- else
- ref_fb_idx = cpi->common.alt_fb_idx;
-
- xd->pre.y_buffer = cpi->common.yv12_fb[ref_fb_idx].y_buffer + recon_yoffset;
- xd->pre.u_buffer = cpi->common.yv12_fb[ref_fb_idx].u_buffer + recon_uvoffset;
- xd->pre.v_buffer = cpi->common.yv12_fb[ref_fb_idx].v_buffer + recon_uvoffset;
-
- if (mbmi->second_ref_frame) {
- int second_ref_fb_idx;
-
- if (mbmi->second_ref_frame == LAST_FRAME)
- second_ref_fb_idx = cpi->common.lst_fb_idx;
- else if (mbmi->second_ref_frame == GOLDEN_FRAME)
- second_ref_fb_idx = cpi->common.gld_fb_idx;
- else
- second_ref_fb_idx = cpi->common.alt_fb_idx;
-
- xd->second_pre.y_buffer = cpi->common.yv12_fb[second_ref_fb_idx].y_buffer +
- recon_yoffset;
- xd->second_pre.u_buffer = cpi->common.yv12_fb[second_ref_fb_idx].u_buffer +
- recon_uvoffset;
- xd->second_pre.v_buffer = cpi->common.yv12_fb[second_ref_fb_idx].v_buffer +
- recon_uvoffset;
- }
-
- if (!x->skip) {
- vp9_encode_inter16x16(IF_RTCD(&cpi->rtcd), x);
-
- // Clear mb_skip_coeff if mb_no_coeff_skip is not set
- if (!cpi->common.mb_no_coeff_skip)
- mbmi->mb_skip_coeff = 0;
-
- } else {
- vp9_build_1st_inter16x16_predictors_mb(xd, xd->dst.y_buffer,
- xd->dst.u_buffer, xd->dst.v_buffer,
- xd->dst.y_stride,
- xd->dst.uv_stride);
- }
- }
-
- if (!x->skip) {
-#ifdef ENC_DEBUG
- if (enc_debug) {
- int i;
- printf("Segment=%d [%d, %d]: %d %d:\n", mbmi->segment_id, mb_col_debug,
- mb_row_debug, xd->mb_to_left_edge, xd->mb_to_top_edge);
- for (i = 0; i < 400; i++) {
- printf("%3d ", xd->qcoeff[i]);
- if (i % 16 == 15) printf("\n");
- }
- printf("\n");
- printf("eobs = ");
- for (i = 0; i < 25; i++)
- printf("%d:%d ", i, xd->block[i].eob);
- printf("\n");
- fflush(stdout);
- }
-#endif
-
- vp9_tokenize_mb(cpi, xd, t, !output_enabled);
-
-#ifdef ENC_DEBUG
- if (enc_debug) {
- printf("Tokenized\n");
- fflush(stdout);
- }
-#endif
- } else {
- int mb_skip_context =
- cpi->common.mb_no_coeff_skip ?
- (x->e_mbd.mode_info_context - 1)->mbmi.mb_skip_coeff +
- (x->e_mbd.mode_info_context - cpi->common.mode_info_stride)->mbmi.mb_skip_coeff :
- 0;
- if (cpi->common.mb_no_coeff_skip) {
- mbmi->mb_skip_coeff = 1;
- if (output_enabled)
- cpi->skip_true_count[mb_skip_context]++;
- vp9_fix_contexts(xd);
- } else {
- vp9_stuff_mb(cpi, xd, t, !output_enabled);
- mbmi->mb_skip_coeff = 0;
- if (output_enabled)
- cpi->skip_false_count[mb_skip_context]++;
- }
- }
-
- if (output_enabled) {
- int segment_id = mbmi->segment_id;
- if (cpi->common.txfm_mode == TX_MODE_SELECT &&
- !((cpi->common.mb_no_coeff_skip && mbmi->mb_skip_coeff) ||
- (vp9_segfeature_active(&x->e_mbd, segment_id, SEG_LVL_EOB) &&
- vp9_get_segdata(&x->e_mbd, segment_id, SEG_LVL_EOB) == 0))) {
- if (mbmi->mode != B_PRED && mbmi->mode != I8X8_PRED &&
- mbmi->mode != SPLITMV) {
- cpi->txfm_count[mbmi->txfm_size]++;
- } else if (mbmi->mode == I8X8_PRED ||
- (mbmi->mode == SPLITMV &&
- mbmi->partitioning != PARTITIONING_4X4)) {
- cpi->txfm_count_8x8p[mbmi->txfm_size]++;
- }
- } else if (mbmi->mode != B_PRED && mbmi->mode != I8X8_PRED &&
- mbmi->mode != SPLITMV && cpi->common.txfm_mode >= ALLOW_16X16) {
- mbmi->txfm_size = TX_16X16;
- } else if (mbmi->mode != B_PRED &&
- !(mbmi->mode == SPLITMV &&
- mbmi->partitioning == PARTITIONING_4X4) &&
- cpi->common.txfm_mode >= ALLOW_8X8) {
- mbmi->txfm_size = TX_8X8;
- } else {
- mbmi->txfm_size = TX_4X4;
- }
- }
-}
-
-#if CONFIG_SUPERBLOCKS
-void vp9_encode_inter_superblock(VP9_COMP *cpi, MACROBLOCK *x, TOKENEXTRA **t,
- int recon_yoffset, int recon_uvoffset,
- int mb_col, int mb_row) {
- const int output_enabled = 1;
- VP9_COMMON *cm = &cpi->common;
- MACROBLOCKD *xd = &x->e_mbd;
- const uint8_t *src = x->src.y_buffer;
- uint8_t *dst = xd->dst.y_buffer;
- const uint8_t *usrc = x->src.u_buffer;
- uint8_t *udst = xd->dst.u_buffer;
- const uint8_t *vsrc = x->src.v_buffer;
- uint8_t *vdst = xd->dst.v_buffer;
- int src_y_stride = x->src.y_stride, dst_y_stride = xd->dst.y_stride;
- int src_uv_stride = x->src.uv_stride, dst_uv_stride = xd->dst.uv_stride;
- const VP9_ENCODER_RTCD *rtcd = IF_RTCD(&cpi->rtcd);
- unsigned int segment_id = xd->mode_info_context->mbmi.segment_id;
- int seg_ref_active;
- unsigned char ref_pred_flag;
- int n;
- TOKENEXTRA *tp[4];
- int skip[4];
- MODE_INFO *mi = x->e_mbd.mode_info_context;
- ENTROPY_CONTEXT_PLANES ta[4], tl[4];
-
- x->skip = 0;
-
- if (cpi->oxcf.tuning == VP8_TUNE_SSIM) {
- // Adjust the zbin based on this MB rate.
- adjust_act_zbin(cpi, x);
- }
-
- {
- // Experimental code. Special case for gf and arf zeromv modes.
- // Increase zbin size to suppress noise
- cpi->zbin_mode_boost = 0;
- if (cpi->zbin_mode_boost_enabled) {
- if (xd->mode_info_context->mbmi.ref_frame != INTRA_FRAME) {
- if (xd->mode_info_context->mbmi.mode == ZEROMV) {
- if (xd->mode_info_context->mbmi.ref_frame != LAST_FRAME)
- cpi->zbin_mode_boost = GF_ZEROMV_ZBIN_BOOST;
- else
- cpi->zbin_mode_boost = LF_ZEROMV_ZBIN_BOOST;
- } else if (xd->mode_info_context->mbmi.mode == SPLITMV)
- cpi->zbin_mode_boost = 0;
- else
- cpi->zbin_mode_boost = MV_ZBIN_BOOST;
- }
- }
-
- vp9_update_zbin_extra(cpi, x);
- }
-
- seg_ref_active = vp9_segfeature_active(xd, segment_id, SEG_LVL_REF_FRAME);
-
- // SET VARIOUS PREDICTION FLAGS
-
- // Did the chosen reference frame match its predicted value.
- ref_pred_flag = ((xd->mode_info_context->mbmi.ref_frame ==
- vp9_get_pred_ref(cm, xd)));
- vp9_set_pred_flag(xd, PRED_REF, ref_pred_flag);
-
- if (xd->mode_info_context->mbmi.ref_frame == INTRA_FRAME) {
- vp9_build_intra_predictors_sby_s(&x->e_mbd);
- vp9_build_intra_predictors_sbuv_s(&x->e_mbd);
- } else {
- int ref_fb_idx;
-
- if (xd->mode_info_context->mbmi.ref_frame == LAST_FRAME)
- ref_fb_idx = cpi->common.lst_fb_idx;
- else if (xd->mode_info_context->mbmi.ref_frame == GOLDEN_FRAME)
- ref_fb_idx = cpi->common.gld_fb_idx;
- else
- ref_fb_idx = cpi->common.alt_fb_idx;
-
- xd->pre.y_buffer = cpi->common.yv12_fb[ref_fb_idx].y_buffer + recon_yoffset;
- xd->pre.u_buffer = cpi->common.yv12_fb[ref_fb_idx].u_buffer + recon_uvoffset;
- xd->pre.v_buffer = cpi->common.yv12_fb[ref_fb_idx].v_buffer + recon_uvoffset;
-
- if (xd->mode_info_context->mbmi.second_ref_frame) {
- int second_ref_fb_idx;
-
- if (xd->mode_info_context->mbmi.second_ref_frame == LAST_FRAME)
- second_ref_fb_idx = cpi->common.lst_fb_idx;
- else if (xd->mode_info_context->mbmi.second_ref_frame == GOLDEN_FRAME)
- second_ref_fb_idx = cpi->common.gld_fb_idx;
- else
- second_ref_fb_idx = cpi->common.alt_fb_idx;
-
- xd->second_pre.y_buffer = cpi->common.yv12_fb[second_ref_fb_idx].y_buffer +
- recon_yoffset;
- xd->second_pre.u_buffer = cpi->common.yv12_fb[second_ref_fb_idx].u_buffer +
- recon_uvoffset;
- xd->second_pre.v_buffer = cpi->common.yv12_fb[second_ref_fb_idx].v_buffer +
- recon_uvoffset;
- }
-
- vp9_build_inter32x32_predictors_sb(xd, xd->dst.y_buffer,
- xd->dst.u_buffer, xd->dst.v_buffer,
- xd->dst.y_stride, xd->dst.uv_stride);
- }
-
- assert(x->e_mbd.mode_info_context->mbmi.txfm_size == TX_8X8);
- for (n = 0; n < 4; n++) {
- int x_idx = n & 1, y_idx = n >> 1;
-
- vp9_subtract_mby_s_c(x->src_diff,
- src + x_idx * 16 + y_idx * 16 * src_y_stride,
- src_y_stride,
- dst + x_idx * 16 + y_idx * 16 * dst_y_stride,
- dst_y_stride);
- vp9_subtract_mbuv_s_c(x->src_diff,
- usrc + x_idx * 8 + y_idx * 8 * src_uv_stride,
- vsrc + x_idx * 8 + y_idx * 8 * src_uv_stride,
- src_uv_stride,
- udst + x_idx * 8 + y_idx * 8 * dst_uv_stride,
- vdst + x_idx * 8 + y_idx * 8 * dst_uv_stride,
- dst_uv_stride);
- vp9_transform_mb_8x8(x);
- vp9_quantize_mb_8x8(x);
- if (x->optimize) {
- vp9_optimize_mby_8x8(x, rtcd);
- vp9_optimize_mbuv_8x8(x, rtcd);
- }
- vp9_inverse_transform_mb_8x8(IF_RTCD(&rtcd->common->idct), &x->e_mbd);
- vp9_recon_mby_s_c(&x->e_mbd,
- dst + x_idx * 16 + y_idx * 16 * dst_y_stride);
- vp9_recon_mbuv_s_c(&x->e_mbd,
- udst + x_idx * 8 + y_idx * 8 * dst_uv_stride,
- vdst + x_idx * 8 + y_idx * 8 * dst_uv_stride);
-
- if (!x->skip) {
- if (output_enabled) {
- xd->left_context = cm->left_context + (n >> 1);
- xd->above_context = cm->above_context + mb_col + (n & 1);
- memcpy(&ta[n], xd->above_context, sizeof(ta[n]));
- memcpy(&tl[n], xd->left_context, sizeof(tl[n]));
- tp[n] = *t;
- xd->mode_info_context = mi + x_idx + y_idx * cm->mode_info_stride;
- vp9_tokenize_mb(cpi, &x->e_mbd, t, 0);
- skip[n] = xd->mode_info_context->mbmi.mb_skip_coeff;
- }
- } else {
- int mb_skip_context =
- cpi->common.mb_no_coeff_skip ?
- (x->e_mbd.mode_info_context - 1)->mbmi.mb_skip_coeff +
- (x->e_mbd.mode_info_context - cpi->common.mode_info_stride)->mbmi.mb_skip_coeff :
- 0;
- if (cpi->common.mb_no_coeff_skip) {
- skip[n] = xd->mode_info_context->mbmi.mb_skip_coeff = 1;
- xd->left_context = cm->left_context + (n >> 1);
- xd->above_context = cm->above_context + mb_col + (n & 1);
- memcpy(&ta[n], xd->above_context, sizeof(ta[n]));
- memcpy(&tl[n], xd->left_context, sizeof(tl[n]));
- tp[n] = *t;
- cpi->skip_true_count[mb_skip_context]++;
- vp9_fix_contexts(xd);
- } else {
- vp9_stuff_mb(cpi, xd, t, 0);
- xd->mode_info_context->mbmi.mb_skip_coeff = 0;
- cpi->skip_false_count[mb_skip_context]++;
- }
- }
- }
-
- xd->mode_info_context = mi;
- update_sb_skip_coeff_state(cpi, x, ta, tl, tp, t, skip);
-}
-#endif
--- a/vp8/encoder/encodeintra.c
+++ /dev/null
@@ -1,289 +1,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "vpx_ports/config.h"
-#include "vpx_rtcd.h"
-#include "vp8/common/idct.h"
-#include "quantize.h"
-#include "vp8/common/reconintra.h"
-#include "vp8/common/reconintra4x4.h"
-#include "encodemb.h"
-#include "vp8/common/invtrans.h"
-#include "encodeintra.h"
-
-#if CONFIG_RUNTIME_CPU_DETECT
-#define IF_RTCD(x) (x)
-#else
-#define IF_RTCD(x) NULL
-#endif
-
-int vp9_encode_intra(VP9_COMP *cpi, MACROBLOCK *x, int use_16x16_pred) {
- int i;
- int intra_pred_var = 0;
- MB_MODE_INFO * mbmi = &x->e_mbd.mode_info_context->mbmi;
- (void) cpi;
-
- if (use_16x16_pred) {
- mbmi->mode = DC_PRED;
-#if CONFIG_COMP_INTRA_PRED
- mbmi->second_mode = (MB_PREDICTION_MODE)(DC_PRED - 1);
-#endif
- mbmi->uv_mode = DC_PRED;
- mbmi->ref_frame = INTRA_FRAME;
-
- vp9_encode_intra16x16mby(IF_RTCD(&cpi->rtcd), x);
- } else {
- for (i = 0; i < 16; i++) {
- x->e_mbd.block[i].bmi.as_mode.first = B_DC_PRED;
- vp9_encode_intra4x4block(IF_RTCD(&cpi->rtcd), x, i);
- }
- }
-
- intra_pred_var = vp9_get_mb_ss(x->src_diff);
-
- return intra_pred_var;
-}
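A hedged usage sketch; the caller lives in the first-pass code (not part of this hunk) and the variable names below are placeholders:

/* The return value is the residual sum of squares after DC-only intra
 * prediction, i.e. a cheap measure of how well intra coding would do. */
int intra_error = vp9_encode_intra(cpi, x, use_dc_pred);
if (intra_error < motion_error) {
  /* intra prediction looks cheaper than the best inter candidate */
}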
-
-void vp9_encode_intra4x4block(const VP9_ENCODER_RTCD *rtcd,
- MACROBLOCK *x, int ib) {
- BLOCKD *b = &x->e_mbd.block[ib];
- BLOCK *be = &x->block[ib];
- TX_TYPE tx_type;
-
-#if CONFIG_COMP_INTRA_PRED
- if (b->bmi.as_mode.second == (B_PREDICTION_MODE)(B_DC_PRED - 1)) {
-#endif
- vp9_intra4x4_predict(b, b->bmi.as_mode.first, b->predictor);
-#if CONFIG_COMP_INTRA_PRED
- } else {
- vp9_comp_intra4x4_predict(b, b->bmi.as_mode.first, b->bmi.as_mode.second,
- b->predictor);
- }
-#endif
-
- vp9_subtract_b(be, b, 16);
-
- tx_type = get_tx_type(&x->e_mbd, b);
- if (tx_type != DCT_DCT) {
- vp9_fht(be->src_diff, 32, be->coeff, tx_type, 4);
- vp9_ht_quantize_b_4x4(be, b, tx_type);
- vp9_ihtllm_c(b->dqcoeff, b->diff, 32, tx_type, 4);
- } else {
- x->vp9_short_fdct4x4(be->src_diff, be->coeff, 32);
- x->quantize_b_4x4(be, b) ;
- vp9_inverse_transform_b_4x4(IF_RTCD(&rtcd->common->idct), b, 32);
- }
-
- vp9_recon_b(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride);
-}
-
-void vp9_encode_intra4x4mby(const VP9_ENCODER_RTCD *rtcd, MACROBLOCK *mb) {
- int i;
-
- for (i = 0; i < 16; i++)
- vp9_encode_intra4x4block(rtcd, mb, i);
- return;
-}
-
-void vp9_encode_intra16x16mby(const VP9_ENCODER_RTCD *rtcd, MACROBLOCK *x) {
- MACROBLOCKD *xd = &x->e_mbd;
- BLOCK *b = &x->block[0];
- TX_SIZE tx_size = xd->mode_info_context->mbmi.txfm_size;
- TX_TYPE tx_type;
-
-#if CONFIG_COMP_INTRA_PRED
- if (xd->mode_info_context->mbmi.second_mode == (MB_PREDICTION_MODE)(DC_PRED - 1))
-#endif
- vp9_build_intra_predictors_mby(xd);
-#if CONFIG_COMP_INTRA_PRED
- else
- vp9_build_comp_intra_predictors_mby(xd);
-#endif
-
- vp9_subtract_mby(x->src_diff, *(b->base_src), xd->predictor, b->src_stride);
-
- if (tx_size == TX_16X16) {
- BLOCKD *bd = &xd->block[0];
- tx_type = get_tx_type(xd, bd);
- if (tx_type != DCT_DCT) {
- vp9_fht(b->src_diff, 32, b->coeff, tx_type, 16);
- vp9_quantize_mby_16x16(x);
- if (x->optimize)
- vp9_optimize_mby_16x16(x, rtcd);
- vp9_ihtllm_c(bd->dqcoeff, bd->diff, 32, tx_type, 16);
- } else {
- vp9_transform_mby_16x16(x);
- vp9_quantize_mby_16x16(x);
- if (x->optimize)
- vp9_optimize_mby_16x16(x, rtcd);
- vp9_inverse_transform_mby_16x16(IF_RTCD(&rtcd->common->idct), xd);
- }
- } else if (tx_size == TX_8X8) {
- vp9_transform_mby_8x8(x);
- vp9_quantize_mby_8x8(x);
- if (x->optimize)
- vp9_optimize_mby_8x8(x, rtcd);
- vp9_inverse_transform_mby_8x8(IF_RTCD(&rtcd->common->idct), xd);
- } else {
- vp9_transform_mby_4x4(x);
- vp9_quantize_mby_4x4(x);
- if (x->optimize)
- vp9_optimize_mby_4x4(x, rtcd);
- vp9_inverse_transform_mby_4x4(IF_RTCD(&rtcd->common->idct), xd);
- }
-
- vp9_recon_mby(xd);
-}
-
-void vp9_encode_intra16x16mbuv(const VP9_ENCODER_RTCD *rtcd, MACROBLOCK *x) {
- MACROBLOCKD *xd = &x->e_mbd;
- TX_SIZE tx_size = xd->mode_info_context->mbmi.txfm_size;
-
-#if CONFIG_COMP_INTRA_PRED
- if (xd->mode_info_context->mbmi.second_uv_mode == (MB_PREDICTION_MODE)(DC_PRED - 1)) {
-#endif
- vp9_build_intra_predictors_mbuv(xd);
-#if CONFIG_COMP_INTRA_PRED
- } else {
- vp9_build_comp_intra_predictors_mbuv(xd);
- }
-#endif
-
- vp9_subtract_mbuv(x->src_diff, x->src.u_buffer, x->src.v_buffer,
- xd->predictor, x->src.uv_stride);
-
- if (tx_size == TX_4X4) {
- vp9_transform_mbuv_4x4(x);
- vp9_quantize_mbuv_4x4(x);
- if (x->optimize)
- vp9_optimize_mbuv_4x4(x, rtcd);
- vp9_inverse_transform_mbuv_4x4(IF_RTCD(&rtcd->common->idct), xd);
- } else /* 16x16 or 8x8 */ {
- vp9_transform_mbuv_8x8(x);
- vp9_quantize_mbuv_8x8(x);
- if (x->optimize)
- vp9_optimize_mbuv_8x8(x, rtcd);
- vp9_inverse_transform_mbuv_8x8(IF_RTCD(&rtcd->common->idct), xd);
- }
-
- vp9_recon_intra_mbuv(xd);
-}
-
-void vp9_encode_intra8x8(const VP9_ENCODER_RTCD *rtcd,
- MACROBLOCK *x, int ib) {
- MACROBLOCKD *xd = &x->e_mbd;
- BLOCKD *b = &xd->block[ib];
- BLOCK *be = &x->block[ib];
- const int iblock[4] = {0, 1, 4, 5};
- int i;
- TX_TYPE tx_type;
-
-#if CONFIG_COMP_INTRA_PRED
- if (b->bmi.as_mode.second == (MB_PREDICTION_MODE)(DC_PRED - 1)) {
-#endif
- vp9_intra8x8_predict(b, b->bmi.as_mode.first, b->predictor);
-#if CONFIG_COMP_INTRA_PRED
- } else {
- vp9_comp_intra8x8_predict(b, b->bmi.as_mode.first, b->bmi.as_mode.second,
- b->predictor);
- }
-#endif
-
- if (xd->mode_info_context->mbmi.txfm_size == TX_8X8) {
- int idx = (ib & 0x02) ? (ib + 2) : ib;
-
- // generate residual blocks
- vp9_subtract_4b_c(be, b, 16);
-
- tx_type = get_tx_type(xd, xd->block + idx);
- if (tx_type != DCT_DCT) {
- vp9_fht(be->src_diff, 32, (x->block + idx)->coeff,
- tx_type, 8);
- x->quantize_b_8x8(x->block + idx, xd->block + idx);
- vp9_ihtllm_c(xd->block[idx].dqcoeff, xd->block[ib].diff, 32,
- tx_type, 8);
- } else {
- x->vp9_short_fdct8x8(be->src_diff, (x->block + idx)->coeff, 32);
- x->quantize_b_8x8(x->block + idx, xd->block + idx);
- vp9_idct_idct8(xd->block[idx].dqcoeff, xd->block[ib].diff, 32);
- }
- } else {
- for (i = 0; i < 4; i++) {
- b = &xd->block[ib + iblock[i]];
- be = &x->block[ib + iblock[i]];
- vp9_subtract_b(be, b, 16);
- x->vp9_short_fdct4x4(be->src_diff, be->coeff, 32);
- x->quantize_b_4x4(be, b);
- vp9_inverse_transform_b_4x4(IF_RTCD(&rtcd->common->idct), b, 32);
- }
- }
-
- // reconstruct submacroblock
- for (i = 0; i < 4; i++) {
- b = &xd->block[ib + iblock[i]];
- vp9_recon_b_c(b->predictor, b->diff, *(b->base_dst) + b->dst,
- b->dst_stride);
- }
-}
-
-void vp9_encode_intra8x8mby(const VP9_ENCODER_RTCD *rtcd, MACROBLOCK *x) {
- int i, ib;
-
- for (i = 0; i < 4; i++) {
- ib = vp9_i8x8_block[i];
- vp9_encode_intra8x8(rtcd, x, ib);
- }
-}
-
-void vp9_encode_intra_uv4x4(const VP9_ENCODER_RTCD *rtcd,
- MACROBLOCK *x, int ib,
- int mode, int second) {
- BLOCKD *b = &x->e_mbd.block[ib];
- BLOCK *be = &x->block[ib];
-
-#if CONFIG_COMP_INTRA_PRED
- if (second == -1) {
-#endif
- vp9_intra_uv4x4_predict(b, mode, b->predictor);
-#if CONFIG_COMP_INTRA_PRED
- } else {
- vp9_comp_intra_uv4x4_predict(b, mode, second, b->predictor);
- }
-#endif
-
- vp9_subtract_b(be, b, 8);
-
- x->vp9_short_fdct4x4(be->src_diff, be->coeff, 16);
- x->quantize_b_4x4(be, b);
- vp9_inverse_transform_b_4x4(IF_RTCD(&rtcd->common->idct), b, 16);
-
- vp9_recon_uv_b_c(b->predictor, b->diff, *(b->base_dst) + b->dst,
- b->dst_stride);
-}
-
-void vp9_encode_intra8x8mbuv(const VP9_ENCODER_RTCD *rtcd, MACROBLOCK *x) {
- int i, ib, mode, second;
- BLOCKD *b;
-
- for (i = 0; i < 4; i++) {
- ib = vp9_i8x8_block[i];
- b = &x->e_mbd.block[ib];
- mode = b->bmi.as_mode.first;
-#if CONFIG_COMP_INTRA_PRED
- second = b->bmi.as_mode.second;
-#else
- second = -1;
-#endif
-    /* u */
-    vp9_encode_intra_uv4x4(rtcd, x, i + 16, mode, second);
-    /* v */
- vp9_encode_intra_uv4x4(rtcd, x, i + 20, mode, second);
- }
-}
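For reference, the block-index mapping assumed by the two calls above (a restatement, nothing new):

/* Each of the four 8x8 luma partitions at vp9_i8x8_block[i] drives one
 * 4x4 chroma block: blocks 16..19 hold U (index i + 16) and blocks
 * 20..23 hold V (index i + 20), matching the layout set up in
 * vp9_setup_block_ptrs(). */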
--- a/vp8/encoder/encodeintra.h
+++ /dev/null
@@ -1,27 +1,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#ifndef __ENCODEINTRA_H_
-#define __ENCODEINTRA_H_
-
-#include "onyx_int.h"
-
-int vp9_encode_intra(VP9_COMP *cpi, MACROBLOCK *x, int use_16x16_pred);
-void vp9_encode_intra16x16mby(const VP9_ENCODER_RTCD *, MACROBLOCK *x);
-void vp9_encode_intra16x16mbuv(const VP9_ENCODER_RTCD *, MACROBLOCK *x);
-void vp9_encode_intra4x4mby(const VP9_ENCODER_RTCD *, MACROBLOCK *mb);
-void vp9_encode_intra4x4block(const VP9_ENCODER_RTCD *rtcd,
- MACROBLOCK *x, int ib);
-void vp9_encode_intra8x8mby(const VP9_ENCODER_RTCD *rtcd, MACROBLOCK *x);
-void vp9_encode_intra8x8mbuv(const VP9_ENCODER_RTCD *rtcd, MACROBLOCK *x);
-void vp9_encode_intra8x8(const VP9_ENCODER_RTCD *rtcd,
- MACROBLOCK *x, int ib);
-
-#endif // __ENCODEINTRA_H_
--- a/vp8/encoder/encodemb.c
+++ /dev/null
@@ -1,950 +1,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "vpx_ports/config.h"
-#include "encodemb.h"
-#include "vp8/common/reconinter.h"
-#include "quantize.h"
-#include "tokenize.h"
-#include "vp8/common/invtrans.h"
-#include "vp8/common/reconintra.h"
-#include "vpx_mem/vpx_mem.h"
-#include "rdopt.h"
-#include "vp8/common/systemdependent.h"
-#include "vpx_rtcd.h"
-
-#if CONFIG_RUNTIME_CPU_DETECT
-#define IF_RTCD(x) (x)
-#else
-#define IF_RTCD(x) NULL
-#endif
-
-void vp9_subtract_b_c(BLOCK *be, BLOCKD *bd, int pitch) {
- unsigned char *src_ptr = (*(be->base_src) + be->src);
- short *diff_ptr = be->src_diff;
- unsigned char *pred_ptr = bd->predictor;
- int src_stride = be->src_stride;
-
- int r, c;
-
- for (r = 0; r < 4; r++) {
- for (c = 0; c < 4; c++) {
- diff_ptr[c] = src_ptr[c] - pred_ptr[c];
- }
-
- diff_ptr += pitch;
- pred_ptr += pitch;
- src_ptr += src_stride;
- }
-}
-
-void vp9_subtract_4b_c(BLOCK *be, BLOCKD *bd, int pitch) {
- unsigned char *src_ptr = (*(be->base_src) + be->src);
- short *diff_ptr = be->src_diff;
- unsigned char *pred_ptr = bd->predictor;
- int src_stride = be->src_stride;
- int r, c;
-
- for (r = 0; r < 8; r++) {
- for (c = 0; c < 8; c++) {
- diff_ptr[c] = src_ptr[c] - pred_ptr[c];
- }
- diff_ptr += pitch;
- pred_ptr += pitch;
- src_ptr += src_stride;
- }
-}
-
-void vp9_subtract_mbuv_s_c(short *diff, const unsigned char *usrc,
- const unsigned char *vsrc, int src_stride,
- const unsigned char *upred,
- const unsigned char *vpred, int dst_stride) {
- short *udiff = diff + 256;
- short *vdiff = diff + 320;
- int r, c;
-
- for (r = 0; r < 8; r++) {
- for (c = 0; c < 8; c++) {
- udiff[c] = usrc[c] - upred[c];
- }
-
- udiff += 8;
- upred += dst_stride;
- usrc += src_stride;
- }
-
- for (r = 0; r < 8; r++) {
- for (c = 0; c < 8; c++) {
- vdiff[c] = vsrc[c] - vpred[c];
- }
-
- vdiff += 8;
- vpred += dst_stride;
- vsrc += src_stride;
- }
-}
-
-void vp9_subtract_mbuv_c(short *diff, unsigned char *usrc,
- unsigned char *vsrc, unsigned char *pred, int stride) {
- unsigned char *upred = pred + 256;
- unsigned char *vpred = pred + 320;
-
- vp9_subtract_mbuv_s_c(diff, usrc, vsrc, stride, upred, vpred, 8);
-}
-
-void vp9_subtract_mby_s_c(short *diff, const unsigned char *src, int src_stride,
- const unsigned char *pred, int dst_stride) {
- int r, c;
-
- for (r = 0; r < 16; r++) {
- for (c = 0; c < 16; c++) {
- diff[c] = src[c] - pred[c];
- }
-
- diff += 16;
- pred += dst_stride;
- src += src_stride;
- }
-}
-
-void vp9_subtract_mby_c(short *diff, unsigned char *src,
- unsigned char *pred, int stride) {
- vp9_subtract_mby_s_c(diff, src, stride, pred, 16);
-}
-
-static void subtract_mb(const VP9_ENCODER_RTCD *rtcd, MACROBLOCK *x) {
- BLOCK *b = &x->block[0];
-
- vp9_subtract_mby(x->src_diff, *(b->base_src), x->e_mbd.predictor,
- b->src_stride);
- vp9_subtract_mbuv(x->src_diff, x->src.u_buffer, x->src.v_buffer,
- x->e_mbd.predictor, x->src.uv_stride);
-}
-
-static void build_dcblock_4x4(MACROBLOCK *x) {
- short *src_diff_ptr = &x->src_diff[384];
- int i;
-
- for (i = 0; i < 16; i++) {
- src_diff_ptr[i] = x->coeff[i * 16];
- }
-}
-
-void vp9_transform_mby_4x4(MACROBLOCK *x) {
- int i;
-
- for (i = 0; i < 16; i += 2) {
- x->vp9_short_fdct8x4(&x->block[i].src_diff[0],
- &x->block[i].coeff[0], 32);
- }
-
- if (x->e_mbd.mode_info_context->mbmi.mode != SPLITMV) {
- // build dc block from 16 y dc values
- build_dcblock_4x4(x);
-
- // do 2nd order transform on the dc block
- x->short_walsh4x4(&x->block[24].src_diff[0],
- &x->block[24].coeff[0], 8);
- }
-}
-
-void vp9_transform_mbuv_4x4(MACROBLOCK *x) {
- int i;
-
- for (i = 16; i < 24; i += 2) {
- x->vp9_short_fdct8x4(&x->block[i].src_diff[0],
- &x->block[i].coeff[0], 16);
- }
-}
-
-static void transform_mb_4x4(MACROBLOCK *x) {
- vp9_transform_mby_4x4(x);
- vp9_transform_mbuv_4x4(x);
-}
-
-static void build_dcblock_8x8(MACROBLOCK *x) {
- int16_t *src_diff_ptr = x->block[24].src_diff;
- int i;
-
- for (i = 0; i < 16; i++) {
- src_diff_ptr[i] = 0;
- }
- src_diff_ptr[0] = x->coeff[0 * 16];
- src_diff_ptr[1] = x->coeff[4 * 16];
- src_diff_ptr[4] = x->coeff[8 * 16];
- src_diff_ptr[8] = x->coeff[12 * 16];
-}
-
-void vp9_transform_mby_8x8(MACROBLOCK *x) {
- int i;
-
- for (i = 0; i < 9; i += 8) {
- x->vp9_short_fdct8x8(&x->block[i].src_diff[0],
- &x->block[i].coeff[0], 32);
- }
- for (i = 2; i < 11; i += 8) {
- x->vp9_short_fdct8x8(&x->block[i].src_diff[0],
- &x->block[i + 2].coeff[0], 32);
- }
-
- if (x->e_mbd.mode_info_context->mbmi.mode != SPLITMV) {
- // build dc block from 2x2 y dc values
- build_dcblock_8x8(x);
-
- // do 2nd order transform on the dc block
- x->short_fhaar2x2(&x->block[24].src_diff[0],
- &x->block[24].coeff[0], 8);
- }
-}
-
-void vp9_transform_mbuv_8x8(MACROBLOCK *x) {
- int i;
-
- for (i = 16; i < 24; i += 4) {
- x->vp9_short_fdct8x8(&x->block[i].src_diff[0],
- &x->block[i].coeff[0], 16);
- }
-}
-
-void vp9_transform_mb_8x8(MACROBLOCK *x) {
- vp9_transform_mby_8x8(x);
- vp9_transform_mbuv_8x8(x);
-}
-
-void vp9_transform_mby_16x16(MACROBLOCK *x) {
- vp9_clear_system_state();
- x->vp9_short_fdct16x16(&x->block[0].src_diff[0],
- &x->block[0].coeff[0], 32);
-}
-
-void vp9_transform_mb_16x16(MACROBLOCK *x) {
- vp9_transform_mby_16x16(x);
- vp9_transform_mbuv_8x8(x);
-}
-
-#define RDTRUNC(RM,DM,R,D) ( (128+(R)*(RM)) & 0xFF )
-#define RDTRUNC_8x8(RM,DM,R,D) ( (128+(R)*(RM)) & 0xFF )
-typedef struct vp9_token_state vp9_token_state;
-
-struct vp9_token_state {
- int rate;
- int error;
- int next;
- signed char token;
- short qc;
-};
-
-// TODO: experiment to find optimal multiplier values
-#define Y1_RD_MULT 4
-#define UV_RD_MULT 2
-#define Y2_RD_MULT 4
-
-static const int plane_rd_mult[4] = {
- Y1_RD_MULT,
- Y2_RD_MULT,
- UV_RD_MULT,
- Y1_RD_MULT
-};
-
-#define UPDATE_RD_COST()\
-{\
- rd_cost0 = RDCOST(rdmult, rddiv, rate0, error0);\
- rd_cost1 = RDCOST(rdmult, rddiv, rate1, error1);\
- if (rd_cost0 == rd_cost1) {\
- rd_cost0 = RDTRUNC(rdmult, rddiv, rate0, error0);\
- rd_cost1 = RDTRUNC(rdmult, rddiv, rate1, error1);\
- }\
-}
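A brief observation on the macro above (not from the source comments):

/* When rd_cost0 == rd_cost1, the costs are recomputed with RDTRUNC,
 * which keeps only (128 + R*RM) & 0xFF, so ties are broken
 * deterministically by low-order bits of the rate term rather than by
 * evaluation order. */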
-
-static void optimize_b(MACROBLOCK *mb, int i, PLANE_TYPE type,
- ENTROPY_CONTEXT *a, ENTROPY_CONTEXT *l,
- const VP9_ENCODER_RTCD *rtcd, int tx_size) {
- BLOCK *b;
- BLOCKD *d;
- vp9_token_state tokens[65][2];
- uint64_t best_mask[2];
- const short *dequant_ptr;
- const short *coeff_ptr;
- short *qcoeff_ptr;
- short *dqcoeff_ptr;
- int eob;
- int i0;
- int rc;
- int x;
- int sz = 0;
- int next;
- int rdmult;
- int rddiv;
- int final_eob;
- int64_t rd_cost0, rd_cost1;
- int rate0, rate1;
- int error0, error1;
- int t0, t1;
- int best;
- int band;
- int pt;
- int err_mult = plane_rd_mult[type];
- int default_eob;
- int const *scan, *bands;
-
- b = &mb->block[i];
- d = &mb->e_mbd.block[i];
- switch (tx_size) {
- default:
- case TX_4X4:
- scan = vp9_default_zig_zag1d;
- bands = vp9_coef_bands;
- default_eob = 16;
- // TODO: this isn't called (for intra4x4 modes), but will be left in
- // since it could be used later
- {
- TX_TYPE tx_type = get_tx_type(&mb->e_mbd, d);
- if (tx_type != DCT_DCT) {
- switch (tx_type) {
- case ADST_DCT:
- scan = vp9_row_scan;
- break;
-
- case DCT_ADST:
- scan = vp9_col_scan;
- break;
-
- default:
- scan = vp9_default_zig_zag1d;
- break;
- }
- } else {
- scan = vp9_default_zig_zag1d;
- }
- }
- break;
- case TX_8X8:
- scan = vp9_default_zig_zag1d_8x8;
- bands = vp9_coef_bands_8x8;
- default_eob = 64;
- break;
- }
-
- dequant_ptr = d->dequant;
- coeff_ptr = b->coeff;
- qcoeff_ptr = d->qcoeff;
- dqcoeff_ptr = d->dqcoeff;
- i0 = (type == PLANE_TYPE_Y_NO_DC);
- eob = d->eob;
-
- /* Now set up a Viterbi trellis to evaluate alternative roundings. */
- rdmult = mb->rdmult * err_mult;
- if (mb->e_mbd.mode_info_context->mbmi.ref_frame == INTRA_FRAME)
- rdmult = (rdmult * 9) >> 4;
- rddiv = mb->rddiv;
- best_mask[0] = best_mask[1] = 0;
- /* Initialize the sentinel node of the trellis. */
- tokens[eob][0].rate = 0;
- tokens[eob][0].error = 0;
- tokens[eob][0].next = default_eob;
- tokens[eob][0].token = DCT_EOB_TOKEN;
- tokens[eob][0].qc = 0;
- *(tokens[eob] + 1) = *(tokens[eob] + 0);
- next = eob;
- for (i = eob; i-- > i0;) {
- int base_bits;
- int d2;
- int dx;
-
- rc = scan[i];
- x = qcoeff_ptr[rc];
- /* Only add a trellis state for non-zero coefficients. */
- if (x) {
- int shortcut = 0;
- error0 = tokens[next][0].error;
- error1 = tokens[next][1].error;
- /* Evaluate the first possibility for this state. */
- rate0 = tokens[next][0].rate;
- rate1 = tokens[next][1].rate;
- t0 = (vp9_dct_value_tokens_ptr + x)->Token;
- /* Consider both possible successor states. */
- if (next < default_eob) {
- band = bands[i + 1];
- pt = vp9_prev_token_class[t0];
- rate0 +=
- mb->token_costs[tx_size][type][band][pt][tokens[next][0].token];
- rate1 +=
- mb->token_costs[tx_size][type][band][pt][tokens[next][1].token];
- }
- UPDATE_RD_COST();
- /* And pick the best. */
- best = rd_cost1 < rd_cost0;
- base_bits = *(vp9_dct_value_cost_ptr + x);
- dx = dqcoeff_ptr[rc] - coeff_ptr[rc];
- d2 = dx * dx;
- tokens[i][0].rate = base_bits + (best ? rate1 : rate0);
- tokens[i][0].error = d2 + (best ? error1 : error0);
- tokens[i][0].next = next;
- tokens[i][0].token = t0;
- tokens[i][0].qc = x;
- best_mask[0] |= best << i;
- /* Evaluate the second possibility for this state. */
- rate0 = tokens[next][0].rate;
- rate1 = tokens[next][1].rate;
-
- if ((abs(x)*dequant_ptr[rc != 0] > abs(coeff_ptr[rc])) &&
- (abs(x)*dequant_ptr[rc != 0] < abs(coeff_ptr[rc]) + dequant_ptr[rc != 0]))
- shortcut = 1;
- else
- shortcut = 0;
-
- if (shortcut) {
- sz = -(x < 0);
- x -= 2 * sz + 1;
- }
-
- /* Consider both possible successor states. */
- if (!x) {
- /* If we reduced this coefficient to zero, check to see if
- * we need to move the EOB back here.
- */
- t0 = tokens[next][0].token == DCT_EOB_TOKEN ?
- DCT_EOB_TOKEN : ZERO_TOKEN;
- t1 = tokens[next][1].token == DCT_EOB_TOKEN ?
- DCT_EOB_TOKEN : ZERO_TOKEN;
- } else {
- t0 = t1 = (vp9_dct_value_tokens_ptr + x)->Token;
- }
- if (next < default_eob) {
- band = bands[i + 1];
- if (t0 != DCT_EOB_TOKEN) {
- pt = vp9_prev_token_class[t0];
- rate0 += mb->token_costs[tx_size][type][band][pt][
- tokens[next][0].token];
- }
- if (t1 != DCT_EOB_TOKEN) {
- pt = vp9_prev_token_class[t1];
- rate1 += mb->token_costs[tx_size][type][band][pt][
- tokens[next][1].token];
- }
- }
-
- UPDATE_RD_COST();
- /* And pick the best. */
- best = rd_cost1 < rd_cost0;
- base_bits = *(vp9_dct_value_cost_ptr + x);
-
- if (shortcut) {
- dx -= (dequant_ptr[rc != 0] + sz) ^ sz;
- d2 = dx * dx;
- }
- tokens[i][1].rate = base_bits + (best ? rate1 : rate0);
- tokens[i][1].error = d2 + (best ? error1 : error0);
- tokens[i][1].next = next;
- tokens[i][1].token = best ? t1 : t0;
- tokens[i][1].qc = x;
- best_mask[1] |= best << i;
- /* Finally, make this the new head of the trellis. */
- next = i;
- }
- /* There's no choice to make for a zero coefficient, so we don't
- * add a new trellis node, but we do need to update the costs.
- */
- else {
- band = bands[i + 1];
- t0 = tokens[next][0].token;
- t1 = tokens[next][1].token;
- /* Update the cost of each path if we're past the EOB token. */
- if (t0 != DCT_EOB_TOKEN) {
- tokens[next][0].rate += mb->token_costs[tx_size][type][band][0][t0];
- tokens[next][0].token = ZERO_TOKEN;
- }
- if (t1 != DCT_EOB_TOKEN) {
- tokens[next][1].rate += mb->token_costs[tx_size][type][band][0][t1];
- tokens[next][1].token = ZERO_TOKEN;
- }
- /* Don't update next, because we didn't add a new node. */
- }
- }
-
- /* Now pick the best path through the whole trellis. */
- band = bands[i + 1];
- VP9_COMBINEENTROPYCONTEXTS(pt, *a, *l);
- rate0 = tokens[next][0].rate;
- rate1 = tokens[next][1].rate;
- error0 = tokens[next][0].error;
- error1 = tokens[next][1].error;
- t0 = tokens[next][0].token;
- t1 = tokens[next][1].token;
- rate0 += mb->token_costs[tx_size][type][band][pt][t0];
- rate1 += mb->token_costs[tx_size][type][band][pt][t1];
- UPDATE_RD_COST();
- best = rd_cost1 < rd_cost0;
- final_eob = i0 - 1;
- for (i = next; i < eob; i = next) {
- x = tokens[i][best].qc;
- if (x)
- final_eob = i;
- rc = scan[i];
- qcoeff_ptr[rc] = x;
- dqcoeff_ptr[rc] = (x * dequant_ptr[rc != 0]);
-
- next = tokens[i][best].next;
- best = (best_mask[best] >> i) & 1;
- }
- final_eob++;
-
- d->eob = final_eob;
- *a = *l = (d->eob != !type);
-}
-
-/**************************************************************************
-Our inverse Hadamard transform is effectively a weighted sum of all 16
-inputs, each with weight +1 or -1. Its last stage scales by (sum+1)>>2, and
-the DC-only idct is (dc+16)>>5. So if all the sums lie between -65 and 63,
-the output after the inverse WHT and idct will be all zero. A sum of
-absolute values smaller than 65 guarantees that all 16 different (+1/-1)
-weighted sums in the WHT fall between -65 and +65.
-**************************************************************************/
-#define SUM_2ND_COEFF_THRESH 65
-
-static void check_reset_2nd_coeffs(MACROBLOCKD *xd,
- ENTROPY_CONTEXT *a, ENTROPY_CONTEXT *l) {
- int sum = 0;
- int i;
- BLOCKD *bd = &xd->block[24];
- if (bd->dequant[0] >= SUM_2ND_COEFF_THRESH
- && bd->dequant[1] >= SUM_2ND_COEFF_THRESH)
- return;
-
- for (i = 0; i < bd->eob; i++) {
- int coef = bd->dqcoeff[vp9_default_zig_zag1d[i]];
- sum += (coef >= 0) ? coef : -coef;
- if (sum >= SUM_2ND_COEFF_THRESH)
- return;
- }
-
- if (sum < SUM_2ND_COEFF_THRESH) {
- for (i = 0; i < bd->eob; i++) {
- int rc = vp9_default_zig_zag1d[i];
- bd->qcoeff[rc] = 0;
- bd->dqcoeff[rc] = 0;
- }
- bd->eob = 0;
- *a = *l = (bd->eob != 0);
- }
-}
-
-#define SUM_2ND_COEFF_THRESH_8X8 32
-static void check_reset_8x8_2nd_coeffs(MACROBLOCKD *xd,
- ENTROPY_CONTEXT *a, ENTROPY_CONTEXT *l) {
- int sum = 0;
- BLOCKD *bd = &xd->block[24];
- int coef;
-
- coef = bd->dqcoeff[0];
- sum += (coef >= 0) ? coef : -coef;
- coef = bd->dqcoeff[1];
- sum += (coef >= 0) ? coef : -coef;
- coef = bd->dqcoeff[4];
- sum += (coef >= 0) ? coef : -coef;
- coef = bd->dqcoeff[8];
- sum += (coef >= 0) ? coef : -coef;
-
- if (sum < SUM_2ND_COEFF_THRESH_8X8) {
- bd->qcoeff[0] = 0;
- bd->dqcoeff[0] = 0;
- bd->qcoeff[1] = 0;
- bd->dqcoeff[1] = 0;
- bd->qcoeff[4] = 0;
- bd->dqcoeff[4] = 0;
- bd->qcoeff[8] = 0;
- bd->dqcoeff[8] = 0;
- bd->eob = 0;
- *a = *l = (bd->eob != 0);
- }
-}
-
-void vp9_optimize_mby_4x4(MACROBLOCK *x, const VP9_ENCODER_RTCD *rtcd) {
- int b;
- PLANE_TYPE type;
- int has_2nd_order;
- ENTROPY_CONTEXT_PLANES t_above, t_left;
- ENTROPY_CONTEXT *ta;
- ENTROPY_CONTEXT *tl;
- MB_PREDICTION_MODE mode = x->e_mbd.mode_info_context->mbmi.mode;
-
- if (!x->e_mbd.above_context || !x->e_mbd.left_context)
- return;
-
- vpx_memcpy(&t_above, x->e_mbd.above_context, sizeof(ENTROPY_CONTEXT_PLANES));
- vpx_memcpy(&t_left, x->e_mbd.left_context, sizeof(ENTROPY_CONTEXT_PLANES));
-
- ta = (ENTROPY_CONTEXT *)&t_above;
- tl = (ENTROPY_CONTEXT *)&t_left;
-
- has_2nd_order = (mode != B_PRED && mode != I8X8_PRED && mode != SPLITMV);
- type = has_2nd_order ? PLANE_TYPE_Y_NO_DC : PLANE_TYPE_Y_WITH_DC;
-
- for (b = 0; b < 16; b++) {
- optimize_b(x, b, type,
- ta + vp9_block2above[b], tl + vp9_block2left[b], rtcd, TX_4X4);
- }
-
- if (has_2nd_order) {
- b = 24;
- optimize_b(x, b, PLANE_TYPE_Y2,
- ta + vp9_block2above[b], tl + vp9_block2left[b], rtcd, TX_4X4);
- check_reset_2nd_coeffs(&x->e_mbd,
- ta + vp9_block2above[b], tl + vp9_block2left[b]);
- }
-}
-
-void vp9_optimize_mbuv_4x4(MACROBLOCK *x, const VP9_ENCODER_RTCD *rtcd) {
- int b;
- ENTROPY_CONTEXT_PLANES t_above, t_left;
- ENTROPY_CONTEXT *ta;
- ENTROPY_CONTEXT *tl;
-
- if (!x->e_mbd.above_context || !x->e_mbd.left_context)
- return;
-
- vpx_memcpy(&t_above, x->e_mbd.above_context, sizeof(ENTROPY_CONTEXT_PLANES));
- vpx_memcpy(&t_left, x->e_mbd.left_context, sizeof(ENTROPY_CONTEXT_PLANES));
-
- ta = (ENTROPY_CONTEXT *)&t_above;
- tl = (ENTROPY_CONTEXT *)&t_left;
-
- for (b = 16; b < 24; b++) {
- optimize_b(x, b, PLANE_TYPE_UV,
- ta + vp9_block2above[b], tl + vp9_block2left[b], rtcd, TX_4X4);
- }
-}
-
-static void optimize_mb_4x4(MACROBLOCK *x, const VP9_ENCODER_RTCD *rtcd) {
- vp9_optimize_mby_4x4(x, rtcd);
- vp9_optimize_mbuv_4x4(x, rtcd);
-}
-
-void vp9_optimize_mby_8x8(MACROBLOCK *x, const VP9_ENCODER_RTCD *rtcd) {
- int b;
- PLANE_TYPE type;
- ENTROPY_CONTEXT_PLANES t_above, t_left;
- ENTROPY_CONTEXT *ta;
- ENTROPY_CONTEXT *tl;
- int has_2nd_order = x->e_mbd.mode_info_context->mbmi.mode != SPLITMV;
-
- if (!x->e_mbd.above_context || !x->e_mbd.left_context)
- return;
-
- vpx_memcpy(&t_above, x->e_mbd.above_context, sizeof(ENTROPY_CONTEXT_PLANES));
- vpx_memcpy(&t_left, x->e_mbd.left_context, sizeof(ENTROPY_CONTEXT_PLANES));
-
- ta = (ENTROPY_CONTEXT *)&t_above;
- tl = (ENTROPY_CONTEXT *)&t_left;
- type = has_2nd_order ? PLANE_TYPE_Y_NO_DC : PLANE_TYPE_Y_WITH_DC;
- for (b = 0; b < 16; b += 4) {
- optimize_b(x, b, type,
- ta + vp9_block2above_8x8[b], tl + vp9_block2left_8x8[b],
- rtcd, TX_8X8);
- ta[vp9_block2above_8x8[b] + 1] = ta[vp9_block2above_8x8[b]];
- tl[vp9_block2left_8x8[b] + 1] = tl[vp9_block2left_8x8[b]];
- }
-
-  // 8x8 always has a 2nd-order Haar block
- if (has_2nd_order) {
- check_reset_8x8_2nd_coeffs(&x->e_mbd,
- ta + vp9_block2above_8x8[24],
- tl + vp9_block2left_8x8[24]);
- }
-}
-
-void vp9_optimize_mbuv_8x8(MACROBLOCK *x, const VP9_ENCODER_RTCD *rtcd) {
- int b;
- ENTROPY_CONTEXT_PLANES t_above, t_left;
- ENTROPY_CONTEXT *ta;
- ENTROPY_CONTEXT *tl;
-
- if (!x->e_mbd.above_context || !x->e_mbd.left_context)
- return;
-
- vpx_memcpy(&t_above, x->e_mbd.above_context, sizeof(ENTROPY_CONTEXT_PLANES));
- vpx_memcpy(&t_left, x->e_mbd.left_context, sizeof(ENTROPY_CONTEXT_PLANES));
-
- ta = (ENTROPY_CONTEXT *)&t_above;
- tl = (ENTROPY_CONTEXT *)&t_left;
-
- for (b = 16; b < 24; b += 4) {
- optimize_b(x, b, PLANE_TYPE_UV,
- ta + vp9_block2above_8x8[b], tl + vp9_block2left_8x8[b],
- rtcd, TX_8X8);
- ta[vp9_block2above_8x8[b] + 1] = ta[vp9_block2above_8x8[b]];
- tl[vp9_block2left_8x8[b] + 1] = tl[vp9_block2left_8x8[b]];
- }
-}
-
-static void optimize_mb_8x8(MACROBLOCK *x, const VP9_ENCODER_RTCD *rtcd) {
- vp9_optimize_mby_8x8(x, rtcd);
- vp9_optimize_mbuv_8x8(x, rtcd);
-}
-
-static void optimize_b_16x16(MACROBLOCK *mb, int i, PLANE_TYPE type,
- ENTROPY_CONTEXT *a, ENTROPY_CONTEXT *l,
- const VP9_ENCODER_RTCD *rtcd) {
- BLOCK *b = &mb->block[i];
- BLOCKD *d = &mb->e_mbd.block[i];
- vp9_token_state tokens[257][2];
- unsigned best_index[257][2];
- const short *dequant_ptr = d->dequant, *coeff_ptr = b->coeff;
-  short *qcoeff_ptr = d->qcoeff;
-  short *dqcoeff_ptr = d->dqcoeff;
- int eob = d->eob, final_eob, sz = 0;
- int rc, x, next;
- int64_t rdmult, rddiv, rd_cost0, rd_cost1;
- int rate0, rate1, error0, error1, t0, t1;
- int best, band, pt;
- int err_mult = plane_rd_mult[type];
-
- /* Now set up a Viterbi trellis to evaluate alternative roundings. */
- rdmult = mb->rdmult * err_mult;
- if (mb->e_mbd.mode_info_context->mbmi.ref_frame == INTRA_FRAME)
- rdmult = (rdmult * 9)>>4;
- rddiv = mb->rddiv;
- memset(best_index, 0, sizeof(best_index));
- /* Initialize the sentinel node of the trellis. */
- tokens[eob][0].rate = 0;
- tokens[eob][0].error = 0;
- tokens[eob][0].next = 256;
- tokens[eob][0].token = DCT_EOB_TOKEN;
- tokens[eob][0].qc = 0;
- *(tokens[eob] + 1) = *(tokens[eob] + 0);
- next = eob;
- for (i = eob; i-- > 0;) {
- int base_bits, d2, dx;
-
- rc = vp9_default_zig_zag1d_16x16[i];
- x = qcoeff_ptr[rc];
- /* Only add a trellis state for non-zero coefficients. */
- if (x) {
- int shortcut = 0;
- error0 = tokens[next][0].error;
- error1 = tokens[next][1].error;
- /* Evaluate the first possibility for this state. */
- rate0 = tokens[next][0].rate;
- rate1 = tokens[next][1].rate;
- t0 = (vp9_dct_value_tokens_ptr + x)->Token;
- /* Consider both possible successor states. */
- if (next < 256) {
- band = vp9_coef_bands_16x16[i + 1];
- pt = vp9_prev_token_class[t0];
- rate0 += mb->token_costs[TX_16X16][type][band][pt][tokens[next][0].token];
- rate1 += mb->token_costs[TX_16X16][type][band][pt][tokens[next][1].token];
- }
- UPDATE_RD_COST();
- /* And pick the best. */
- best = rd_cost1 < rd_cost0;
- base_bits = *(vp9_dct_value_cost_ptr + x);
- dx = dqcoeff_ptr[rc] - coeff_ptr[rc];
- d2 = dx*dx;
- tokens[i][0].rate = base_bits + (best ? rate1 : rate0);
- tokens[i][0].error = d2 + (best ? error1 : error0);
- tokens[i][0].next = next;
- tokens[i][0].token = t0;
- tokens[i][0].qc = x;
- best_index[i][0] = best;
- /* Evaluate the second possibility for this state. */
- rate0 = tokens[next][0].rate;
- rate1 = tokens[next][1].rate;
-
-      if ((abs(x)*dequant_ptr[rc != 0] > abs(coeff_ptr[rc])) &&
-          (abs(x)*dequant_ptr[rc != 0] < abs(coeff_ptr[rc]) + dequant_ptr[rc != 0]))
-        shortcut = 1;
-      else
-        shortcut = 0;
-
- if (shortcut) {
- sz = -(x < 0);
- x -= 2*sz + 1;
- }
-
- /* Consider both possible successor states. */
- if (!x) {
- /* If we reduced this coefficient to zero, check to see if
- * we need to move the EOB back here.
- */
- t0 = tokens[next][0].token == DCT_EOB_TOKEN ?
- DCT_EOB_TOKEN : ZERO_TOKEN;
- t1 = tokens[next][1].token == DCT_EOB_TOKEN ?
- DCT_EOB_TOKEN : ZERO_TOKEN;
-      } else {
-        t0 = t1 = (vp9_dct_value_tokens_ptr + x)->Token;
-      }
- if (next < 256) {
- band = vp9_coef_bands_16x16[i + 1];
- if (t0 != DCT_EOB_TOKEN) {
- pt = vp9_prev_token_class[t0];
- rate0 += mb->token_costs[TX_16X16][type][band][pt]
- [tokens[next][0].token];
- }
- if (t1!=DCT_EOB_TOKEN) {
- pt = vp9_prev_token_class[t1];
- rate1 += mb->token_costs[TX_16X16][type][band][pt]
- [tokens[next][1].token];
- }
- }
- UPDATE_RD_COST();
- /* And pick the best. */
- best = rd_cost1 < rd_cost0;
- base_bits = *(vp9_dct_value_cost_ptr + x);
-
-      if (shortcut) {
-        dx -= (dequant_ptr[rc != 0] + sz) ^ sz;
-        d2 = dx * dx;
- }
- tokens[i][1].rate = base_bits + (best ? rate1 : rate0);
- tokens[i][1].error = d2 + (best ? error1 : error0);
- tokens[i][1].next = next;
- tokens[i][1].token = best ? t1 : t0;
- tokens[i][1].qc = x;
- best_index[i][1] = best;
- /* Finally, make this the new head of the trellis. */
- next = i;
- }
- /* There's no choice to make for a zero coefficient, so we don't
- * add a new trellis node, but we do need to update the costs.
- */
- else {
- band = vp9_coef_bands_16x16[i + 1];
- t0 = tokens[next][0].token;
- t1 = tokens[next][1].token;
- /* Update the cost of each path if we're past the EOB token. */
- if (t0 != DCT_EOB_TOKEN) {
- tokens[next][0].rate += mb->token_costs[TX_16X16][type][band][0][t0];
- tokens[next][0].token = ZERO_TOKEN;
- }
- if (t1 != DCT_EOB_TOKEN) {
- tokens[next][1].rate += mb->token_costs[TX_16X16][type][band][0][t1];
- tokens[next][1].token = ZERO_TOKEN;
- }
- /* Don't update next, because we didn't add a new node. */
- }
- }
-
- /* Now pick the best path through the whole trellis. */
- band = vp9_coef_bands_16x16[i + 1];
- VP9_COMBINEENTROPYCONTEXTS(pt, *a, *l);
- rate0 = tokens[next][0].rate;
- rate1 = tokens[next][1].rate;
- error0 = tokens[next][0].error;
- error1 = tokens[next][1].error;
- t0 = tokens[next][0].token;
- t1 = tokens[next][1].token;
- rate0 += mb->token_costs[TX_16X16][type][band][pt][t0];
- rate1 += mb->token_costs[TX_16X16][type][band][pt][t1];
- UPDATE_RD_COST();
- best = rd_cost1 < rd_cost0;
- final_eob = -1;
-
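- /* Walk the winning path from the trellis head, writing back the chosen
- * quantized/dequantized coefficients and tracking the last non-zero
- * position, which becomes the new EOB.
- */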
- for (i = next; i < eob; i = next) {
- x = tokens[i][best].qc;
- if (x)
- final_eob = i;
- rc = vp9_default_zig_zag1d_16x16[i];
- qcoeff_ptr[rc] = x;
- dqcoeff_ptr[rc] = (x * dequant_ptr[rc!=0]);
-
- next = tokens[i][best].next;
- best = best_index[i][best];
- }
- final_eob++;
-
- d->eob = final_eob;
- *a = *l = (d->eob != !type);
-}
-
-void vp9_optimize_mby_16x16(MACROBLOCK *x, const VP9_ENCODER_RTCD *rtcd) {
- ENTROPY_CONTEXT_PLANES t_above, t_left;
- ENTROPY_CONTEXT *ta, *tl;
-
- if (!x->e_mbd.above_context || !x->e_mbd.left_context)
- return;
-
- vpx_memcpy(&t_above, x->e_mbd.above_context, sizeof(ENTROPY_CONTEXT_PLANES));
- vpx_memcpy(&t_left, x->e_mbd.left_context, sizeof(ENTROPY_CONTEXT_PLANES));
-
- ta = (ENTROPY_CONTEXT *)&t_above;
- tl = (ENTROPY_CONTEXT *)&t_left;
- optimize_b_16x16(x, 0, PLANE_TYPE_Y_WITH_DC, ta, tl, rtcd);
-}
-
-static void optimize_mb_16x16(MACROBLOCK *x, const VP9_ENCODER_RTCD *rtcd) {
- vp9_optimize_mby_16x16(x, rtcd);
- vp9_optimize_mbuv_8x8(x, rtcd);
-}
-
-void vp9_encode_inter16x16(const VP9_ENCODER_RTCD *rtcd, MACROBLOCK *x) {
- MACROBLOCKD *xd = &x->e_mbd;
- TX_SIZE tx_size = xd->mode_info_context->mbmi.txfm_size;
-
- vp9_build_inter_predictors_mb(xd);
- subtract_mb(rtcd, x);
-
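- /* Transform, quantize, optionally trellis-optimize and inverse
- * transform the residual at the transform size chosen for this
- * macroblock.
- */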
- if (tx_size == TX_16X16) {
- vp9_transform_mb_16x16(x);
- vp9_quantize_mb_16x16(x);
- if (x->optimize)
- optimize_mb_16x16(x, rtcd);
- vp9_inverse_transform_mb_16x16(IF_RTCD(&rtcd->common->idct), xd);
- } else if (tx_size == TX_8X8) {
- if (xd->mode_info_context->mbmi.mode == SPLITMV) {
- assert(xd->mode_info_context->mbmi.partitioning != PARTITIONING_4X4);
- vp9_transform_mby_8x8(x);
- vp9_transform_mbuv_4x4(x);
- vp9_quantize_mby_8x8(x);
- vp9_quantize_mbuv_4x4(x);
- if (x->optimize) {
- vp9_optimize_mby_8x8(x, rtcd);
- vp9_optimize_mbuv_4x4(x, rtcd);
- }
- vp9_inverse_transform_mby_8x8(IF_RTCD(&rtcd->common->idct), xd);
- vp9_inverse_transform_mbuv_4x4(IF_RTCD(&rtcd->common->idct), xd);
- } else {
- vp9_transform_mb_8x8(x);
- vp9_quantize_mb_8x8(x);
- if (x->optimize)
- optimize_mb_8x8(x, rtcd);
- vp9_inverse_transform_mb_8x8(IF_RTCD(&rtcd->common->idct), xd);
- }
- } else {
- transform_mb_4x4(x);
- vp9_quantize_mb_4x4(x);
- if (x->optimize)
- optimize_mb_4x4(x, rtcd);
- vp9_inverse_transform_mb_4x4(IF_RTCD(&rtcd->common->idct), xd);
- }
-
- vp9_recon_mb(xd);
-}
-
-/* this function is used by first pass only */
-void vp9_encode_inter16x16y(const VP9_ENCODER_RTCD *rtcd, MACROBLOCK *x) {
- MACROBLOCKD *xd = &x->e_mbd;
- BLOCK *b = &x->block[0];
-
-#if CONFIG_PRED_FILTER
- // Disable the prediction filter for firstpass
- xd->mode_info_context->mbmi.pred_filter_enabled = 0;
-#endif
-
- vp9_build_1st_inter16x16_predictors_mby(xd, xd->predictor, 16, 0);
-
- vp9_subtract_mby(x->src_diff, *(b->base_src), xd->predictor, b->src_stride);
-
- vp9_transform_mby_4x4(x);
- vp9_quantize_mby_4x4(x);
- vp9_inverse_transform_mby_4x4(IF_RTCD(&rtcd->common->idct), xd);
-
- vp9_recon_mby(xd);
-}
--- a/vp8/encoder/encodemb.h
+++ /dev/null
@@ -1,70 +1,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#ifndef __INC_ENCODEMB_H
-#define __INC_ENCODEMB_H
-
-#include "vpx_ports/config.h"
-#include "block.h"
-
-typedef struct {
- MB_PREDICTION_MODE mode;
- MV_REFERENCE_FRAME ref_frame;
- MV_REFERENCE_FRAME second_ref_frame;
-#if CONFIG_PRED_FILTER
- int pred_filter_flag;
-#endif
-} MODE_DEFINITION;
-
-
-#if CONFIG_RUNTIME_CPU_DETECT
-#define ENCODEMB_INVOKE(ctx,fn) (ctx)->fn
-#else
-#define ENCODEMB_INVOKE(ctx,fn) vp9_encodemb_##fn
-#endif
-
-
-
-#include "onyx_int.h"
-struct VP9_ENCODER_RTCD;
-void vp9_encode_inter16x16(const struct VP9_ENCODER_RTCD *rtcd, MACROBLOCK *x);
-
-void vp9_transform_mbuv_4x4(MACROBLOCK *x);
-void vp9_transform_mby_4x4(MACROBLOCK *x);
-
-void vp9_optimize_mby_4x4(MACROBLOCK *x, const struct VP9_ENCODER_RTCD *rtcd);
-void vp9_optimize_mbuv_4x4(MACROBLOCK *x, const struct VP9_ENCODER_RTCD *rtcd);
-void vp9_encode_inter16x16y(const struct VP9_ENCODER_RTCD *rtcd, MACROBLOCK *x);
-
-void vp9_transform_mb_8x8(MACROBLOCK *mb);
-void vp9_transform_mby_8x8(MACROBLOCK *x);
-void vp9_transform_mbuv_8x8(MACROBLOCK *x);
-void vp9_build_dcblock_8x8(MACROBLOCK *b);
-void vp9_optimize_mby_8x8(MACROBLOCK *x, const struct VP9_ENCODER_RTCD *rtcd);
-void vp9_optimize_mbuv_8x8(MACROBLOCK *x, const struct VP9_ENCODER_RTCD *rtcd);
-
-void vp9_transform_mb_16x16(MACROBLOCK *mb);
-void vp9_transform_mby_16x16(MACROBLOCK *x);
-void vp9_optimize_mby_16x16(MACROBLOCK *x, const struct VP9_ENCODER_RTCD *rtcd);
-
-void vp9_subtract_4b_c(BLOCK *be, BLOCKD *bd, int pitch);
-
-#if CONFIG_SUPERBLOCKS
-void vp9_subtract_mbuv_s_c(short *diff, const unsigned char *usrc,
- const unsigned char *vsrc, int src_stride,
- const unsigned char *upred,
- const unsigned char *vpred, int dst_stride);
-void vp9_subtract_mby_s_c(short *diff, const unsigned char *src,
- int src_stride, const unsigned char *pred,
- int dst_stride);
-#endif
-
-#endif
--- a/vp8/encoder/encodemv.c
+++ /dev/null
@@ -1,547 +1,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include "vp8/common/common.h"
-#include "encodemv.h"
-#include "vp8/common/entropymode.h"
-#include "vp8/common/systemdependent.h"
-
-#include <math.h>
-
-#ifdef ENTROPY_STATS
-extern unsigned int active_section;
-#endif
-
-#ifdef NMV_STATS
-nmv_context_counts tnmvcounts;
-#endif
-
-static void encode_nmv_component(vp9_writer* const bc,
- int v,
- int r,
- const nmv_component* const mvcomp) {
- int s, z, c, o, d;
- assert (v != 0); /* should not be zero */
- s = v < 0;
- vp9_write(bc, s, mvcomp->sign);
- z = (s ? -v : v) - 1; /* magnitude - 1 */
-
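- /* Split the magnitude into a class and an in-class offset; the class
- * determines how many offset bits are coded below.
- */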
- c = vp9_get_mv_class(z, &o);
-
- write_token(bc, vp9_mv_class_tree, mvcomp->classes,
- vp9_mv_class_encodings + c);
-
- d = (o >> 3); /* int mv data */
-
- if (c == MV_CLASS_0) {
- write_token(bc, vp9_mv_class0_tree, mvcomp->class0,
- vp9_mv_class0_encodings + d);
- } else {
- int i, b;
- b = c + CLASS0_BITS - 1; /* number of bits */
- for (i = 0; i < b; ++i)
- vp9_write(bc, ((d >> i) & 1), mvcomp->bits[i]);
- }
-}
-
-static void encode_nmv_component_fp(vp9_writer *bc,
- int v,
- int r,
- const nmv_component* const mvcomp,
- int usehp) {
- int s, z, c, o, d, f, e;
- assert (v != 0); /* should not be zero */
- s = v < 0;
- z = (s ? -v : v) - 1; /* magnitude - 1 */
-
- c = vp9_get_mv_class(z, &o);
-
- d = (o >> 3); /* int mv data */
- f = (o >> 1) & 3; /* fractional pel mv data */
- e = (o & 1); /* high precision mv data */
-
- /* Code the fractional pel bits */
- if (c == MV_CLASS_0) {
- write_token(bc, vp9_mv_fp_tree, mvcomp->class0_fp[d],
- vp9_mv_fp_encodings + f);
- } else {
- write_token(bc, vp9_mv_fp_tree, mvcomp->fp,
- vp9_mv_fp_encodings + f);
- }
- /* Code the high precision bit */
- if (usehp) {
- if (c == MV_CLASS_0) {
- vp9_write(bc, e, mvcomp->class0_hp);
- } else {
- vp9_write(bc, e, mvcomp->hp);
- }
- }
-}
-
-static void build_nmv_component_cost_table(int *mvcost,
- const nmv_component* const mvcomp,
- int usehp) {
- int i, v;
- int sign_cost[2], class_cost[MV_CLASSES], class0_cost[CLASS0_SIZE];
- int bits_cost[MV_OFFSET_BITS][2];
- int class0_fp_cost[CLASS0_SIZE][4], fp_cost[4];
- int class0_hp_cost[2], hp_cost[2];
-
- sign_cost[0] = vp9_cost_zero(mvcomp->sign);
- sign_cost[1] = vp9_cost_one(mvcomp->sign);
- vp9_cost_tokens(class_cost, mvcomp->classes, vp9_mv_class_tree);
- vp9_cost_tokens(class0_cost, mvcomp->class0, vp9_mv_class0_tree);
- for (i = 0; i < MV_OFFSET_BITS; ++i) {
- bits_cost[i][0] = vp9_cost_zero(mvcomp->bits[i]);
- bits_cost[i][1] = vp9_cost_one(mvcomp->bits[i]);
- }
-
- for (i = 0; i < CLASS0_SIZE; ++i)
- vp9_cost_tokens(class0_fp_cost[i], mvcomp->class0_fp[i], vp9_mv_fp_tree);
- vp9_cost_tokens(fp_cost, mvcomp->fp, vp9_mv_fp_tree);
-
- if (usehp) {
- class0_hp_cost[0] = vp9_cost_zero(mvcomp->class0_hp);
- class0_hp_cost[1] = vp9_cost_one(mvcomp->class0_hp);
- hp_cost[0] = vp9_cost_zero(mvcomp->hp);
- hp_cost[1] = vp9_cost_one(mvcomp->hp);
- }
- mvcost[0] = 0;
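- /* Build the cost of every non-zero magnitude; the negative index for
- * -v implies the caller passes a pointer into the middle of its cost
- * buffer, and the two signs differ only by the sign-bit cost.
- */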
- for (v = 1; v <= MV_MAX; ++v) {
- int z, c, o, d, e, f, cost = 0;
- z = v - 1;
- c = vp9_get_mv_class(z, &o);
- cost += class_cost[c];
- d = (o >> 3); /* int mv data */
- f = (o >> 1) & 3; /* fractional pel mv data */
- e = (o & 1); /* high precision mv data */
- if (c == MV_CLASS_0) {
- cost += class0_cost[d];
- } else {
- int i, b;
- b = c + CLASS0_BITS - 1; /* number of bits */
- for (i = 0; i < b; ++i)
- cost += bits_cost[i][((d >> i) & 1)];
- }
- if (c == MV_CLASS_0) {
- cost += class0_fp_cost[d][f];
- } else {
- cost += fp_cost[f];
- }
- if (usehp) {
- if (c == MV_CLASS_0) {
- cost += class0_hp_cost[e];
- } else {
- cost += hp_cost[e];
- }
- }
- mvcost[v] = cost + sign_cost[0];
- mvcost[-v] = cost + sign_cost[1];
- }
-}
-
-static int update_nmv_savings(const unsigned int ct[2],
- const vp9_prob cur_p,
- const vp9_prob new_p,
- const vp9_prob upd_p) {
-
-#ifdef LOW_PRECISION_MV_UPDATE
- vp9_prob mod_p = new_p | 1;
-#else
- vp9_prob mod_p = new_p;
-#endif
- const int cur_b = cost_branch256(ct, cur_p);
- const int mod_b = cost_branch256(ct, mod_p);
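- /* Cost of signalling an update: 7 bits for the probability value
- * (8 without LOW_PRECISION_MV_UPDATE) plus the differential cost of
- * the update flag, all in 1/256-bit units.
- */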
- const int cost = 7 * 256 +
-#ifndef LOW_PRECISION_MV_UPDATE
- 256 +
-#endif
- (vp9_cost_one(upd_p) - vp9_cost_zero(upd_p));
- if (cur_b - mod_b - cost > 0) {
- return cur_b - mod_b - cost;
- } else {
- return -vp9_cost_zero(upd_p);
- }
-}
-
-static int update_nmv(
- vp9_writer *const bc,
- const unsigned int ct[2],
- vp9_prob *const cur_p,
- const vp9_prob new_p,
- const vp9_prob upd_p) {
-
-#ifdef LOW_PRECISION_MV_UPDATE
- vp9_prob mod_p = new_p | 1;
-#else
- vp9_prob mod_p = new_p;
-#endif
-
- const int cur_b = cost_branch256(ct, *cur_p);
- const int mod_b = cost_branch256(ct, mod_p);
- const int cost = 7 * 256 +
-#ifndef LOW_PRECISION_MV_UPDATE
- 256 +
-#endif
- (vp9_cost_one(upd_p) - vp9_cost_zero(upd_p));
-
- if (cur_b - mod_b > cost) {
- *cur_p = mod_p;
- vp9_write(bc, 1, upd_p);
-#ifdef LOW_PRECISION_MV_UPDATE
- vp9_write_literal(bc, mod_p >> 1, 7);
-#else
- vp9_write_literal(bc, mod_p, 8);
-#endif
- return 1;
- } else {
- vp9_write(bc, 0, upd_p);
- return 0;
- }
-}
-
-#ifdef NMV_STATS
-void init_nmvstats() {
- vp9_zero(tnmvcounts);
-}
-
-void print_nmvstats() {
- nmv_context prob;
- unsigned int branch_ct_joint[MV_JOINTS - 1][2];
- unsigned int branch_ct_sign[2][2];
- unsigned int branch_ct_classes[2][MV_CLASSES - 1][2];
- unsigned int branch_ct_class0[2][CLASS0_SIZE - 1][2];
- unsigned int branch_ct_bits[2][MV_OFFSET_BITS][2];
- unsigned int branch_ct_class0_fp[2][CLASS0_SIZE][4 - 1][2];
- unsigned int branch_ct_fp[2][4 - 1][2];
- unsigned int branch_ct_class0_hp[2][2];
- unsigned int branch_ct_hp[2][2];
- int i, j, k;
- vp9_counts_to_nmv_context(&tnmvcounts, &prob, 1,
- branch_ct_joint, branch_ct_sign, branch_ct_classes,
- branch_ct_class0, branch_ct_bits,
- branch_ct_class0_fp, branch_ct_fp,
- branch_ct_class0_hp, branch_ct_hp);
-
- printf("\nCounts =\n { ");
- for (j = 0; j < MV_JOINTS; ++j)
- printf("%d, ", tnmvcounts.joints[j]);
- printf("},\n");
- for (i = 0; i < 2; ++i) {
- printf(" {\n");
- printf(" %d/%d,\n", tnmvcounts.comps[i].sign[0],
- tnmvcounts.comps[i].sign[1]);
- printf(" { ");
- for (j = 0; j < MV_CLASSES; ++j)
- printf("%d, ", tnmvcounts.comps[i].classes[j]);
- printf("},\n");
- printf(" { ");
- for (j = 0; j < CLASS0_SIZE; ++j)
- printf("%d, ", tnmvcounts.comps[i].class0[j]);
- printf("},\n");
- printf(" { ");
- for (j = 0; j < MV_OFFSET_BITS; ++j)
- printf("%d/%d, ", tnmvcounts.comps[i].bits[j][0],
- tnmvcounts.comps[i].bits[j][1]);
- printf("},\n");
-
- printf(" {");
- for (j = 0; j < CLASS0_SIZE; ++j) {
- printf("{");
- for (k = 0; k < 4; ++k)
- printf("%d, ", tnmvcounts.comps[i].class0_fp[j][k]);
- printf("}, ");
- }
- printf("},\n");
-
- printf(" { ");
- for (j = 0; j < 4; ++j)
- printf("%d, ", tnmvcounts.comps[i].fp[j]);
- printf("},\n");
-
- printf(" %d/%d,\n",
- tnmvcounts.comps[i].class0_hp[0],
- tnmvcounts.comps[i].class0_hp[1]);
- printf(" %d/%d,\n",
- tnmvcounts.comps[i].hp[0],
- tnmvcounts.comps[i].hp[1]);
- printf(" },\n");
- }
-
- printf("\nProbs =\n { ");
- for (j = 0; j < MV_JOINTS - 1; ++j)
- printf("%d, ", prob.joints[j]);
- printf("},\n");
- for (i = 0; i < 2; ++i) {
- printf(" {\n");
- printf(" %d,\n", prob.comps[i].sign);
- printf(" { ");
- for (j = 0; j < MV_CLASSES - 1; ++j)
- printf("%d, ", prob.comps[i].classes[j]);
- printf("},\n");
- printf(" { ");
- for (j = 0; j < CLASS0_SIZE - 1; ++j)
- printf("%d, ", prob.comps[i].class0[j]);
- printf("},\n");
- printf(" { ");
- for (j = 0; j < MV_OFFSET_BITS; ++j)
- printf("%d, ", prob.comps[i].bits[j]);
- printf("},\n");
- printf(" { ");
- for (j = 0; j < CLASS0_SIZE; ++j) {
- printf("{");
- for (k = 0; k < 3; ++k)
- printf("%d, ", prob.comps[i].class0_fp[j][k]);
- printf("}, ");
- }
- printf("},\n");
- printf(" { ");
- for (j = 0; j < 3; ++j)
- printf("%d, ", prob.comps[i].fp[j]);
- printf("},\n");
-
- printf(" %d,\n", prob.comps[i].class0_hp);
- printf(" %d,\n", prob.comps[i].hp);
- printf(" },\n");
- }
-}
-
-static void add_nmvcount(nmv_context_counts* const dst,
- const nmv_context_counts* const src) {
- int i, j, k;
- for (j = 0; j < MV_JOINTS; ++j) {
- dst->joints[j] += src->joints[j];
- }
- for (i = 0; i < 2; ++i) {
- for (j = 0; j < MV_VALS; ++j) {
- dst->comps[i].mvcount[j] += src->comps[i].mvcount[j];
- }
- dst->comps[i].sign[0] += src->comps[i].sign[0];
- dst->comps[i].sign[1] += src->comps[i].sign[1];
- for (j = 0; j < MV_CLASSES; ++j) {
- dst->comps[i].classes[j] += src->comps[i].classes[j];
- }
- for (j = 0; j < CLASS0_SIZE; ++j) {
- dst->comps[i].class0[j] += src->comps[i].class0[j];
- }
- for (j = 0; j < MV_OFFSET_BITS; ++j) {
- dst->comps[i].bits[j][0] += src->comps[i].bits[j][0];
- dst->comps[i].bits[j][1] += src->comps[i].bits[j][1];
- }
- }
- for (i = 0; i < 2; ++i) {
- for (j = 0; j < CLASS0_SIZE; ++j) {
- for (k = 0; k < 4; ++k) {
- dst->comps[i].class0_fp[j][k] += src->comps[i].class0_fp[j][k];
- }
- }
- for (j = 0; j < 4; ++j) {
- dst->comps[i].fp[j] += src->comps[i].fp[j];
- }
- dst->comps[i].class0_hp[0] += src->comps[i].class0_hp[0];
- dst->comps[i].class0_hp[1] += src->comps[i].class0_hp[1];
- dst->comps[i].hp[0] += src->comps[i].hp[0];
- dst->comps[i].hp[1] += src->comps[i].hp[1];
- }
-}
-#endif
-
-void vp9_write_nmvprobs(VP9_COMP* const cpi, int usehp, vp9_writer* const bc) {
- int i, j;
- nmv_context prob;
- unsigned int branch_ct_joint[MV_JOINTS - 1][2];
- unsigned int branch_ct_sign[2][2];
- unsigned int branch_ct_classes[2][MV_CLASSES - 1][2];
- unsigned int branch_ct_class0[2][CLASS0_SIZE - 1][2];
- unsigned int branch_ct_bits[2][MV_OFFSET_BITS][2];
- unsigned int branch_ct_class0_fp[2][CLASS0_SIZE][4 - 1][2];
- unsigned int branch_ct_fp[2][4 - 1][2];
- unsigned int branch_ct_class0_hp[2][2];
- unsigned int branch_ct_hp[2][2];
- int savings = 0;
-
-#ifdef NMV_STATS
- if (!cpi->dummy_packing)
- add_nmvcount(&tnmvcounts, &cpi->NMVcount);
-#endif
- vp9_counts_to_nmv_context(&cpi->NMVcount, &prob, usehp,
- branch_ct_joint, branch_ct_sign, branch_ct_classes,
- branch_ct_class0, branch_ct_bits,
- branch_ct_class0_fp, branch_ct_fp,
- branch_ct_class0_hp, branch_ct_hp);
- /* write updates if they help */
-#ifdef MV_GROUP_UPDATE
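- /* First pass over all fields: total up the potential rate savings and,
- * if nothing is saved, signal "no update" with a single bit and return
- * before writing any individual updates.
- */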
- for (j = 0; j < MV_JOINTS - 1; ++j) {
- savings += update_nmv_savings(branch_ct_joint[j],
- cpi->common.fc.nmvc.joints[j],
- prob.joints[j],
- VP9_NMV_UPDATE_PROB);
- }
- for (i = 0; i < 2; ++i) {
- savings += update_nmv_savings(branch_ct_sign[i],
- cpi->common.fc.nmvc.comps[i].sign,
- prob.comps[i].sign,
- VP9_NMV_UPDATE_PROB);
- for (j = 0; j < MV_CLASSES - 1; ++j) {
- savings += update_nmv_savings(branch_ct_classes[i][j],
- cpi->common.fc.nmvc.comps[i].classes[j],
- prob.comps[i].classes[j],
- VP9_NMV_UPDATE_PROB);
- }
- for (j = 0; j < CLASS0_SIZE - 1; ++j) {
- savings += update_nmv_savings(branch_ct_class0[i][j],
- cpi->common.fc.nmvc.comps[i].class0[j],
- prob.comps[i].class0[j],
- VP9_NMV_UPDATE_PROB);
- }
- for (j = 0; j < MV_OFFSET_BITS; ++j) {
- savings += update_nmv_savings(branch_ct_bits[i][j],
- cpi->common.fc.nmvc.comps[i].bits[j],
- prob.comps[i].bits[j],
- VP9_NMV_UPDATE_PROB);
- }
- }
- for (i = 0; i < 2; ++i) {
- for (j = 0; j < CLASS0_SIZE; ++j) {
- int k;
- for (k = 0; k < 3; ++k) {
- savings += update_nmv_savings(branch_ct_class0_fp[i][j][k],
- cpi->common.fc.nmvc.comps[i].class0_fp[j][k],
- prob.comps[i].class0_fp[j][k],
- VP9_NMV_UPDATE_PROB);
- }
- }
- for (j = 0; j < 3; ++j) {
- savings += update_nmv_savings(branch_ct_fp[i][j],
- cpi->common.fc.nmvc.comps[i].fp[j],
- prob.comps[i].fp[j],
- VP9_NMV_UPDATE_PROB);
- }
- }
- if (usehp) {
- for (i = 0; i < 2; ++i) {
- savings += update_nmv_savings(branch_ct_class0_hp[i],
- cpi->common.fc.nmvc.comps[i].class0_hp,
- prob.comps[i].class0_hp,
- VP9_NMV_UPDATE_PROB);
- savings += update_nmv_savings(branch_ct_hp[i],
- cpi->common.fc.nmvc.comps[i].hp,
- prob.comps[i].hp,
- VP9_NMV_UPDATE_PROB);
- }
- }
- if (savings <= 0) {
- vp9_write_bit(bc, 0);
- return;
- }
- vp9_write_bit(bc, 1);
-#endif
-
- for (j = 0; j < MV_JOINTS - 1; ++j) {
- update_nmv(bc, branch_ct_joint[j],
- &cpi->common.fc.nmvc.joints[j],
- prob.joints[j],
- VP9_NMV_UPDATE_PROB);
- }
- for (i = 0; i < 2; ++i) {
- update_nmv(bc, branch_ct_sign[i],
- &cpi->common.fc.nmvc.comps[i].sign,
- prob.comps[i].sign,
- VP9_NMV_UPDATE_PROB);
- for (j = 0; j < MV_CLASSES - 1; ++j) {
- update_nmv(bc, branch_ct_classes[i][j],
- &cpi->common.fc.nmvc.comps[i].classes[j],
- prob.comps[i].classes[j],
- VP9_NMV_UPDATE_PROB);
- }
- for (j = 0; j < CLASS0_SIZE - 1; ++j) {
- update_nmv(bc, branch_ct_class0[i][j],
- &cpi->common.fc.nmvc.comps[i].class0[j],
- prob.comps[i].class0[j],
- VP9_NMV_UPDATE_PROB);
- }
- for (j = 0; j < MV_OFFSET_BITS; ++j) {
- update_nmv(bc, branch_ct_bits[i][j],
- &cpi->common.fc.nmvc.comps[i].bits[j],
- prob.comps[i].bits[j],
- VP9_NMV_UPDATE_PROB);
- }
- }
- for (i = 0; i < 2; ++i) {
- for (j = 0; j < CLASS0_SIZE; ++j) {
- int k;
- for (k = 0; k < 3; ++k) {
- update_nmv(bc, branch_ct_class0_fp[i][j][k],
- &cpi->common.fc.nmvc.comps[i].class0_fp[j][k],
- prob.comps[i].class0_fp[j][k],
- VP9_NMV_UPDATE_PROB);
- }
- }
- for (j = 0; j < 3; ++j) {
- update_nmv(bc, branch_ct_fp[i][j],
- &cpi->common.fc.nmvc.comps[i].fp[j],
- prob.comps[i].fp[j],
- VP9_NMV_UPDATE_PROB);
- }
- }
- if (usehp) {
- for (i = 0; i < 2; ++i) {
- update_nmv(bc, branch_ct_class0_hp[i],
- &cpi->common.fc.nmvc.comps[i].class0_hp,
- prob.comps[i].class0_hp,
- VP9_NMV_UPDATE_PROB);
- update_nmv(bc, branch_ct_hp[i],
- &cpi->common.fc.nmvc.comps[i].hp,
- prob.comps[i].hp,
- VP9_NMV_UPDATE_PROB);
- }
- }
-}
-
-void vp9_encode_nmv(vp9_writer* const bc, const MV* const mv,
- const MV* const ref, const nmv_context* const mvctx) {
- MV_JOINT_TYPE j = vp9_get_mv_joint(*mv);
- write_token(bc, vp9_mv_joint_tree, mvctx->joints,
- vp9_mv_joint_encodings + j);
- if (j == MV_JOINT_HZVNZ || j == MV_JOINT_HNZVNZ) {
- encode_nmv_component(bc, mv->row, ref->row, &mvctx->comps[0]);
- }
- if (j == MV_JOINT_HNZVZ || j == MV_JOINT_HNZVNZ) {
- encode_nmv_component(bc, mv->col, ref->col, &mvctx->comps[1]);
- }
-}
-
-void vp9_encode_nmv_fp(vp9_writer* const bc, const MV* const mv,
- const MV* const ref, const nmv_context* const mvctx,
- int usehp) {
- MV_JOINT_TYPE j = vp9_get_mv_joint(*mv);
- usehp = usehp && vp9_use_nmv_hp(ref);
- if (j == MV_JOINT_HZVNZ || j == MV_JOINT_HNZVNZ) {
- encode_nmv_component_fp(bc, mv->row, ref->row, &mvctx->comps[0], usehp);
- }
- if (j == MV_JOINT_HNZVZ || j == MV_JOINT_HNZVNZ) {
- encode_nmv_component_fp(bc, mv->col, ref->col, &mvctx->comps[1], usehp);
- }
-}
-
-void vp9_build_nmv_cost_table(int *mvjoint,
- int *mvcost[2],
- const nmv_context* const mvctx,
- int usehp,
- int mvc_flag_v,
- int mvc_flag_h) {
- vp9_clear_system_state();
- vp9_cost_tokens(mvjoint, mvctx->joints, vp9_mv_joint_tree);
- if (mvc_flag_v)
- build_nmv_component_cost_table(mvcost[0], &mvctx->comps[0], usehp);
- if (mvc_flag_h)
- build_nmv_component_cost_table(mvcost[1], &mvctx->comps[1], usehp);
-}
--- a/vp8/encoder/encodemv.h
+++ /dev/null
@@ -1,30 +1,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#ifndef __INC_ENCODEMV_H
-#define __INC_ENCODEMV_H
-
-#include "onyx_int.h"
-
-void vp9_write_nmvprobs(VP9_COMP* const, int usehp, vp9_writer* const);
-void vp9_encode_nmv(vp9_writer* const w, const MV* const mv,
- const MV* const ref, const nmv_context* const mvctx);
-void vp9_encode_nmv_fp(vp9_writer* const w, const MV* const mv,
- const MV* const ref, const nmv_context *mvctx,
- int usehp);
-void vp9_build_nmv_cost_table(int *mvjoint,
- int *mvcost[2],
- const nmv_context *mvctx,
- int usehp,
- int mvc_flag_v,
- int mvc_flag_h);
-
-#endif
--- a/vp8/encoder/firstpass.c
+++ /dev/null
@@ -1,2533 +1,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "math.h"
-#include "limits.h"
-#include "block.h"
-#include "onyx_int.h"
-#include "variance.h"
-#include "encodeintra.h"
-#include "vp8/common/setupintrarecon.h"
-#include "mcomp.h"
-#include "firstpass.h"
-#include "vpx_scale/vpxscale.h"
-#include "encodemb.h"
-#include "vp8/common/extend.h"
-#include "vp8/common/systemdependent.h"
-#include "vpx_scale/yv12extend.h"
-#include "vpx_mem/vpx_mem.h"
-#include "vp8/common/swapyv12buffer.h"
-#include <stdio.h>
-#include "rdopt.h"
-#include "ratectrl.h"
-#include "vp8/common/quant_common.h"
-#include "vp8/common/entropymv.h"
-#include "encodemv.h"
-
-#define OUTPUT_FPF 0
-
-#if CONFIG_RUNTIME_CPU_DETECT
-#define IF_RTCD(x) (x)
-#else
-#define IF_RTCD(x) NULL
-#endif
-
-extern void vp9_build_block_offsets(MACROBLOCK *x);
-
-extern void vp9_setup_block_ptrs(MACROBLOCK *x);
-
-extern void vp9_frame_init_quantizer(VP9_COMP *cpi);
-
-extern void vp9_set_mbmode_and_mvs(MACROBLOCK *x, MB_PREDICTION_MODE mb,
- int_mv *mv);
-
-extern void vp9_alloc_compressor_data(VP9_COMP *cpi);
-
-#define IIFACTOR 12.5
-#define IIKFACTOR1 12.5
-#define IIKFACTOR2 15.0
-#define RMAX 128.0
-#define GF_RMAX 96.0
-#define ERR_DIVISOR 150.0
-
-#define KF_MB_INTRA_MIN 300
-#define GF_MB_INTRA_MIN 200
-
-#define DOUBLE_DIVIDE_CHECK(X) ((X)<0?(X)-.000001:(X)+.000001)
-
-#define POW1 (double)cpi->oxcf.two_pass_vbrbias/100.0
-#define POW2 (double)cpi->oxcf.two_pass_vbrbias/100.0
-
-static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame);
-
-static int select_cq_level(int qindex) {
- int ret_val = QINDEX_RANGE - 1;
- int i;
-
- double target_q = (vp9_convert_qindex_to_q(qindex) * 0.5847) + 1.0;
-
- for (i = 0; i < QINDEX_RANGE; i++) {
- if (target_q <= vp9_convert_qindex_to_q(i)) {
- ret_val = i;
- break;
- }
- }
-
- return ret_val;
-}
-
-
-// Resets the first pass stats read position to the given point in the in-memory stats buffer
-static void reset_fpf_position(VP9_COMP *cpi, FIRSTPASS_STATS *Position) {
- cpi->twopass.stats_in = Position;
-}
-
-static int lookup_next_frame_stats(VP9_COMP *cpi, FIRSTPASS_STATS *next_frame) {
- if (cpi->twopass.stats_in >= cpi->twopass.stats_in_end)
- return EOF;
-
- *next_frame = *cpi->twopass.stats_in;
- return 1;
-}
-
-// Read frame stats at an offset from the current position
-static int read_frame_stats(VP9_COMP *cpi,
- FIRSTPASS_STATS *frame_stats,
- int offset) {
- FIRSTPASS_STATS *fps_ptr = cpi->twopass.stats_in;
-
- // Check legality of offset
- if (offset >= 0) {
- if (&fps_ptr[offset] >= cpi->twopass.stats_in_end)
- return EOF;
- } else if (offset < 0) {
- if (&fps_ptr[offset] < cpi->twopass.stats_in_start)
- return EOF;
- }
-
- *frame_stats = fps_ptr[offset];
- return 1;
-}
-
-static int input_stats(VP9_COMP *cpi, FIRSTPASS_STATS *fps) {
- if (cpi->twopass.stats_in >= cpi->twopass.stats_in_end)
- return EOF;
-
- *fps = *cpi->twopass.stats_in;
- cpi->twopass.stats_in =
- (void *)((char *)cpi->twopass.stats_in + sizeof(FIRSTPASS_STATS));
- return 1;
-}
-
-static void output_stats(const VP9_COMP *cpi,
- struct vpx_codec_pkt_list *pktlist,
- FIRSTPASS_STATS *stats) {
- struct vpx_codec_cx_pkt pkt;
- pkt.kind = VPX_CODEC_STATS_PKT;
- pkt.data.twopass_stats.buf = stats;
- pkt.data.twopass_stats.sz = sizeof(FIRSTPASS_STATS);
- vpx_codec_pkt_list_add(pktlist, &pkt);
-
-// TEMP debug code
-#if OUTPUT_FPF
-
- {
- FILE *fpfile;
- fpfile = fopen("firstpass.stt", "a");
-
- fprintf(fpfile, "%12.0f %12.0f %12.0f %12.0f %12.0f %12.4f %12.4f"
- "%12.4f %12.4f %12.4f %12.4f %12.4f %12.4f %12.4f"
- "%12.0f %12.0f %12.4f %12.0f %12.0f %12.4f\n",
- stats->frame,
- stats->intra_error,
- stats->coded_error,
- stats->sr_coded_error,
- stats->ssim_weighted_pred_err,
- stats->pcnt_inter,
- stats->pcnt_motion,
- stats->pcnt_second_ref,
- stats->pcnt_neutral,
- stats->MVr,
- stats->mvr_abs,
- stats->MVc,
- stats->mvc_abs,
- stats->MVrv,
- stats->MVcv,
- stats->mv_in_out_count,
- stats->new_mv_count,
- stats->count,
- stats->duration);
- fclose(fpfile);
- }
-#endif
-}
-
-static void zero_stats(FIRSTPASS_STATS *section) {
- section->frame = 0.0;
- section->intra_error = 0.0;
- section->coded_error = 0.0;
- section->sr_coded_error = 0.0;
- section->ssim_weighted_pred_err = 0.0;
- section->pcnt_inter = 0.0;
- section->pcnt_motion = 0.0;
- section->pcnt_second_ref = 0.0;
- section->pcnt_neutral = 0.0;
- section->MVr = 0.0;
- section->mvr_abs = 0.0;
- section->MVc = 0.0;
- section->mvc_abs = 0.0;
- section->MVrv = 0.0;
- section->MVcv = 0.0;
- section->mv_in_out_count = 0.0;
- section->new_mv_count = 0.0;
- section->count = 0.0;
- section->duration = 1.0;
-}
-
-static void accumulate_stats(FIRSTPASS_STATS *section, FIRSTPASS_STATS *frame) {
- section->frame += frame->frame;
- section->intra_error += frame->intra_error;
- section->coded_error += frame->coded_error;
- section->sr_coded_error += frame->sr_coded_error;
- section->ssim_weighted_pred_err += frame->ssim_weighted_pred_err;
- section->pcnt_inter += frame->pcnt_inter;
- section->pcnt_motion += frame->pcnt_motion;
- section->pcnt_second_ref += frame->pcnt_second_ref;
- section->pcnt_neutral += frame->pcnt_neutral;
- section->MVr += frame->MVr;
- section->mvr_abs += frame->mvr_abs;
- section->MVc += frame->MVc;
- section->mvc_abs += frame->mvc_abs;
- section->MVrv += frame->MVrv;
- section->MVcv += frame->MVcv;
- section->mv_in_out_count += frame->mv_in_out_count;
- section->new_mv_count += frame->new_mv_count;
- section->count += frame->count;
- section->duration += frame->duration;
-}
-
-static void subtract_stats(FIRSTPASS_STATS *section, FIRSTPASS_STATS *frame) {
- section->frame -= frame->frame;
- section->intra_error -= frame->intra_error;
- section->coded_error -= frame->coded_error;
- section->sr_coded_error -= frame->sr_coded_error;
- section->ssim_weighted_pred_err -= frame->ssim_weighted_pred_err;
- section->pcnt_inter -= frame->pcnt_inter;
- section->pcnt_motion -= frame->pcnt_motion;
- section->pcnt_second_ref -= frame->pcnt_second_ref;
- section->pcnt_neutral -= frame->pcnt_neutral;
- section->MVr -= frame->MVr;
- section->mvr_abs -= frame->mvr_abs;
- section->MVc -= frame->MVc;
- section->mvc_abs -= frame->mvc_abs;
- section->MVrv -= frame->MVrv;
- section->MVcv -= frame->MVcv;
- section->mv_in_out_count -= frame->mv_in_out_count;
- section->new_mv_count -= frame->new_mv_count;
- section->count -= frame->count;
- section->duration -= frame->duration;
-}
-
-static void avg_stats(FIRSTPASS_STATS *section) {
- if (section->count < 1.0)
- return;
-
- section->intra_error /= section->count;
- section->coded_error /= section->count;
- section->sr_coded_error /= section->count;
- section->ssim_weighted_pred_err /= section->count;
- section->pcnt_inter /= section->count;
- section->pcnt_second_ref /= section->count;
- section->pcnt_neutral /= section->count;
- section->pcnt_motion /= section->count;
- section->MVr /= section->count;
- section->mvr_abs /= section->count;
- section->MVc /= section->count;
- section->mvc_abs /= section->count;
- section->MVrv /= section->count;
- section->MVcv /= section->count;
- section->mv_in_out_count /= section->count;
- section->duration /= section->count;
-}
-
-// Calculate a modified error score used when distributing bits between easier and harder frames
-static double calculate_modified_err(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
- double av_err = (cpi->twopass.total_stats->ssim_weighted_pred_err /
- cpi->twopass.total_stats->count);
- double this_err = this_frame->ssim_weighted_pred_err;
- double modified_err;
-
- if (this_err > av_err)
- modified_err = av_err * pow((this_err / DOUBLE_DIVIDE_CHECK(av_err)), POW1);
- else
- modified_err = av_err * pow((this_err / DOUBLE_DIVIDE_CHECK(av_err)), POW2);
-
- return modified_err;
-}
-
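-// Per-pixel weight as a function of luma level: very dark pixels
-// (levels 0-32) get a small fixed weight of 0.02, the weight then ramps
-// up linearly to 1.0 by level 64, and all brighter levels are weighted 1.0.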
-static const double weight_table[256] = {
- 0.020000, 0.020000, 0.020000, 0.020000, 0.020000, 0.020000, 0.020000, 0.020000,
- 0.020000, 0.020000, 0.020000, 0.020000, 0.020000, 0.020000, 0.020000, 0.020000,
- 0.020000, 0.020000, 0.020000, 0.020000, 0.020000, 0.020000, 0.020000, 0.020000,
- 0.020000, 0.020000, 0.020000, 0.020000, 0.020000, 0.020000, 0.020000, 0.020000,
- 0.020000, 0.031250, 0.062500, 0.093750, 0.125000, 0.156250, 0.187500, 0.218750,
- 0.250000, 0.281250, 0.312500, 0.343750, 0.375000, 0.406250, 0.437500, 0.468750,
- 0.500000, 0.531250, 0.562500, 0.593750, 0.625000, 0.656250, 0.687500, 0.718750,
- 0.750000, 0.781250, 0.812500, 0.843750, 0.875000, 0.906250, 0.937500, 0.968750,
- 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
- 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
- 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
- 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
- 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
- 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
- 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
- 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
- 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
- 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
- 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
- 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
- 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
- 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
- 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
- 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
- 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
- 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
- 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
- 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
- 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
- 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
- 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
- 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000
-};
-
-static double simple_weight(YV12_BUFFER_CONFIG *source) {
- int i, j;
-
- unsigned char *src = source->y_buffer;
- double sum_weights = 0.0;
-
- // Loop through the raw Y plane examining pixel levels and creating a weight for the image
- i = source->y_height;
- do {
- j = source->y_width;
- do {
- sum_weights += weight_table[ *src];
- src++;
- } while (--j);
- src -= source->y_width;
- src += source->y_stride;
- } while (--i);
-
- sum_weights /= (source->y_height * source->y_width);
-
- return sum_weights;
-}
-
-
-// This function returns the current per frame maximum bitrate target
-static int frame_max_bits(VP9_COMP *cpi) {
- // Max allocation for a single frame based on the max section guidelines passed in and how many bits are left
- int max_bits;
-
- // For VBR base this on the bits and frames left plus the two_pass_vbrmax_section rate passed in by the user
- max_bits = (int)(((double)cpi->twopass.bits_left /
- (cpi->twopass.total_stats->count -
- (double)cpi->common.current_video_frame)) *
- ((double)cpi->oxcf.two_pass_vbrmax_section / 100.0));
-
- // Trap case where we are out of bits
- if (max_bits < 0)
- max_bits = 0;
-
- return max_bits;
-}
-
-void vp9_init_first_pass(VP9_COMP *cpi) {
- zero_stats(cpi->twopass.total_stats);
-}
-
-void vp9_end_first_pass(VP9_COMP *cpi) {
- output_stats(cpi, cpi->output_pkt_list, cpi->twopass.total_stats);
-}
-
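-// Measure the error of a zero (0,0) motion vector: the 16x16 MSE between
-// the source block and the co-located block in the reconstruction buffer.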
-static void zz_motion_search(VP9_COMP *cpi, MACROBLOCK *x, YV12_BUFFER_CONFIG *recon_buffer, int *best_motion_err, int recon_yoffset) {
- MACROBLOCKD *const xd = &x->e_mbd;
- BLOCK *b = &x->block[0];
- BLOCKD *d = &x->e_mbd.block[0];
-
- unsigned char *src_ptr = (*(b->base_src) + b->src);
- int src_stride = b->src_stride;
- unsigned char *ref_ptr;
- int ref_stride = d->pre_stride;
-
- // Set up pointers for this macro block recon buffer
- xd->pre.y_buffer = recon_buffer->y_buffer + recon_yoffset;
-
- ref_ptr = (unsigned char *)(*(d->base_pre) + d->pre);
-
- vp9_mse16x16(src_ptr, src_stride, ref_ptr, ref_stride,
- (unsigned int *)(best_motion_err));
-}
-
-static void first_pass_motion_search(VP9_COMP *cpi, MACROBLOCK *x,
- int_mv *ref_mv, MV *best_mv,
- YV12_BUFFER_CONFIG *recon_buffer,
- int *best_motion_err, int recon_yoffset) {
- MACROBLOCKD *const xd = &x->e_mbd;
- BLOCK *b = &x->block[0];
- BLOCKD *d = &x->e_mbd.block[0];
- int num00;
-
- int_mv tmp_mv;
- int_mv ref_mv_full;
-
- int tmp_err;
- int step_param = 3;
- int further_steps = (MAX_MVSEARCH_STEPS - 1) - step_param;
- int n;
- vp9_variance_fn_ptr_t v_fn_ptr = cpi->fn_ptr[BLOCK_16X16];
- int new_mv_mode_penalty = 256;
-
- // override the default variance function to use MSE
- v_fn_ptr.vf = vp9_mse16x16;
-
- // Set up pointers for this macro block recon buffer
- xd->pre.y_buffer = recon_buffer->y_buffer + recon_yoffset;
-
- // Initial step/diamond search centred on best mv
- tmp_mv.as_int = 0;
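- // The reference mv is in 1/8-pel units; convert it to full-pel units
- // as the starting point for the full-pixel diamond search.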
- ref_mv_full.as_mv.col = ref_mv->as_mv.col >> 3;
- ref_mv_full.as_mv.row = ref_mv->as_mv.row >> 3;
- tmp_err = cpi->diamond_search_sad(x, b, d, &ref_mv_full, &tmp_mv, step_param,
- x->sadperbit16, &num00, &v_fn_ptr,
- XMVCOST, ref_mv);
- if (tmp_err < INT_MAX - new_mv_mode_penalty)
- tmp_err += new_mv_mode_penalty;
-
- if (tmp_err < *best_motion_err) {
- *best_motion_err = tmp_err;
- best_mv->row = tmp_mv.as_mv.row;
- best_mv->col = tmp_mv.as_mv.col;
- }
-
- // Further step/diamond searches as necessary
- n = num00;
- num00 = 0;
-
- while (n < further_steps) {
- n++;
-
- if (num00)
- num00--;
- else {
- tmp_err = cpi->diamond_search_sad(x, b, d, &ref_mv_full, &tmp_mv,
- step_param + n, x->sadperbit16,
- &num00, &v_fn_ptr,
- XMVCOST, ref_mv);
- if (tmp_err < INT_MAX - new_mv_mode_penalty)
- tmp_err += new_mv_mode_penalty;
-
- if (tmp_err < *best_motion_err) {
- *best_motion_err = tmp_err;
- best_mv->row = tmp_mv.as_mv.row;
- best_mv->col = tmp_mv.as_mv.col;
- }
- }
- }
-}
-
-void vp9_first_pass(VP9_COMP *cpi) {
- int mb_row, mb_col;
- MACROBLOCK *const x = &cpi->mb;
- VP9_COMMON *const cm = &cpi->common;
- MACROBLOCKD *const xd = &x->e_mbd;
-
- int recon_yoffset, recon_uvoffset;
- YV12_BUFFER_CONFIG *lst_yv12 = &cm->yv12_fb[cm->lst_fb_idx];
- YV12_BUFFER_CONFIG *new_yv12 = &cm->yv12_fb[cm->new_fb_idx];
- YV12_BUFFER_CONFIG *gld_yv12 = &cm->yv12_fb[cm->gld_fb_idx];
- int recon_y_stride = lst_yv12->y_stride;
- int recon_uv_stride = lst_yv12->uv_stride;
- int64_t intra_error = 0;
- int64_t coded_error = 0;
- int64_t sr_coded_error = 0;
-
- int sum_mvr = 0, sum_mvc = 0;
- int sum_mvr_abs = 0, sum_mvc_abs = 0;
- int sum_mvrs = 0, sum_mvcs = 0;
- int mvcount = 0;
- int intercount = 0;
- int second_ref_count = 0;
- int intrapenalty = 256;
- int neutral_count = 0;
- int new_mv_count = 0;
- int sum_in_vectors = 0;
- uint32_t lastmv_as_int = 0;
-
- int_mv zero_ref_mv;
-
- zero_ref_mv.as_int = 0;
-
- vp9_clear_system_state(); // __asm emms;
-
- x->src = * cpi->Source;
- xd->pre = *lst_yv12;
- xd->dst = *new_yv12;
-
- x->partition_info = x->pi;
-
- xd->mode_info_context = cm->mi;
-
- vp9_build_block_offsets(x);
-
- vp9_setup_block_dptrs(&x->e_mbd);
-
- vp9_setup_block_ptrs(x);
-
- // set up the new frame for intra coded blocks
- vp9_setup_intra_recon(new_yv12);
- vp9_frame_init_quantizer(cpi);
-
- // Initialise the MV cost table to the defaults
- // if( cm->current_video_frame == 0)
- // if ( 0 )
- {
- int flag[2] = {1, 1};
- vp9_init_mv_probs(cm);
- vp9_initialize_rd_consts(cpi, cm->base_qindex + cm->y1dc_delta_q);
- }
-
- // for each macroblock row in image
- for (mb_row = 0; mb_row < cm->mb_rows; mb_row++) {
- int_mv best_ref_mv;
-
- best_ref_mv.as_int = 0;
-
- // reset above block coeffs
- xd->up_available = (mb_row != 0);
- recon_yoffset = (mb_row * recon_y_stride * 16);
- recon_uvoffset = (mb_row * recon_uv_stride * 8);
-
- // Set up limit values for motion vectors to prevent them extending outside the UMV borders
- x->mv_row_min = -((mb_row * 16) + (VP8BORDERINPIXELS - 16));
- x->mv_row_max = ((cm->mb_rows - 1 - mb_row) * 16) + (VP8BORDERINPIXELS - 16);
-
-
- // for each macroblock col in image
- for (mb_col = 0; mb_col < cm->mb_cols; mb_col++) {
- int this_error;
- int gf_motion_error = INT_MAX;
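- // Use DC prediction when exactly one of mb_row/mb_col is zero, i.e.
- // along the top row or left column but not at the corner macroblock.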
- int use_dc_pred = (mb_col || mb_row) && (!mb_col || !mb_row);
-
- xd->dst.y_buffer = new_yv12->y_buffer + recon_yoffset;
- xd->dst.u_buffer = new_yv12->u_buffer + recon_uvoffset;
- xd->dst.v_buffer = new_yv12->v_buffer + recon_uvoffset;
- xd->left_available = (mb_col != 0);
-
- // Copy current mb to a buffer
- vp9_copy_mem16x16(x->src.y_buffer, x->src.y_stride, x->thismb, 16);
-
- // do intra 16x16 prediction
- this_error = vp9_encode_intra(cpi, x, use_dc_pred);
-
- // "intrapenalty" below deals with situations where the intra and inter error scores are very low (eg a plain black frame)
- // We do not have special cases in first pass for 0,0 and nearest etc so all inter modes carry an overhead cost estimate fot the mv.
- // When the error score is very low this causes us to pick all or lots of INTRA modes and throw lots of key frames.
- // This penalty adds a cost matching that of a 0,0 mv to the intra case.
- this_error += intrapenalty;
-
- // Cumulative intra error total
- intra_error += (int64_t)this_error;
-
- // Set up limit values for motion vectors to prevent them extending outside the UMV borders
- x->mv_col_min = -((mb_col * 16) + (VP8BORDERINPIXELS - 16));
- x->mv_col_max = ((cm->mb_cols - 1 - mb_col) * 16) + (VP8BORDERINPIXELS - 16);
-
- // Other than for the first frame do a motion search
- if (cm->current_video_frame > 0) {
- int tmp_err;
- int motion_error = INT_MAX;
- int_mv mv, tmp_mv;
-
- // Simple 0,0 motion with no mv overhead
- zz_motion_search(cpi, x, lst_yv12, &motion_error, recon_yoffset);
- mv.as_int = tmp_mv.as_int = 0;
-
- // Test last reference frame using the previous best mv as the
- // starting point (best reference) for the search
- first_pass_motion_search(cpi, x, &best_ref_mv,
- &mv.as_mv, lst_yv12,
- &motion_error, recon_yoffset);
-
- // If the current best reference mv is not centred on 0,0 then do a 0,0 based search as well
- if (best_ref_mv.as_int) {
- tmp_err = INT_MAX;
- first_pass_motion_search(cpi, x, &zero_ref_mv, &tmp_mv.as_mv,
- lst_yv12, &tmp_err, recon_yoffset);
-
- if (tmp_err < motion_error) {
- motion_error = tmp_err;
- mv.as_int = tmp_mv.as_int;
- }
- }
-
- // Experimental search in an older reference frame
- if (cm->current_video_frame > 1) {
- // Simple 0,0 motion with no mv overhead
- zz_motion_search(cpi, x, gld_yv12,
- &gf_motion_error, recon_yoffset);
-
- first_pass_motion_search(cpi, x, &zero_ref_mv,
- &tmp_mv.as_mv, gld_yv12,
- &gf_motion_error, recon_yoffset);
-
- if ((gf_motion_error < motion_error) &&
- (gf_motion_error < this_error)) {
- second_ref_count++;
- }
-
- // Reset to last frame as reference buffer
- xd->pre.y_buffer = lst_yv12->y_buffer + recon_yoffset;
- xd->pre.u_buffer = lst_yv12->u_buffer + recon_uvoffset;
- xd->pre.v_buffer = lst_yv12->v_buffer + recon_uvoffset;
-
- // In accumulating a score for the older reference frame
- // take the best of the motion predicted score and
- // the intra coded error (just as will be done for
- // accumulation of "coded_error" for the last frame).
- if (gf_motion_error < this_error)
- sr_coded_error += gf_motion_error;
- else
- sr_coded_error += this_error;
- } else
- sr_coded_error += motion_error;
-
- /* Intra assumed best */
- best_ref_mv.as_int = 0;
-
- if (motion_error <= this_error) {
- // Keep a count of cases where the inter and intra were
- // very close and very low. This helps with scene cut
- // detection for example in cropped clips with black bars
- // at the sides or top and bottom.
- if ((((this_error - intrapenalty) * 9) <=
- (motion_error * 10)) &&
- (this_error < (2 * intrapenalty))) {
- neutral_count++;
- }
-
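- // Scale the winning full-pel motion vector back to 1/8-pel units
- // before it is used for prediction and the mv statistics.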
- mv.as_mv.row <<= 3;
- mv.as_mv.col <<= 3;
- this_error = motion_error;
- vp9_set_mbmode_and_mvs(x, NEWMV, &mv);
- xd->mode_info_context->mbmi.txfm_size = TX_4X4;
- vp9_encode_inter16x16y(IF_RTCD(&cpi->rtcd), x);
- sum_mvr += mv.as_mv.row;
- sum_mvr_abs += abs(mv.as_mv.row);
- sum_mvc += mv.as_mv.col;
- sum_mvc_abs += abs(mv.as_mv.col);
- sum_mvrs += mv.as_mv.row * mv.as_mv.row;
- sum_mvcs += mv.as_mv.col * mv.as_mv.col;
- intercount++;
-
- best_ref_mv.as_int = mv.as_int;
-
- // Was the vector non-zero
- if (mv.as_int) {
- mvcount++;
-
- // Was it different from the last non zero vector
- if (mv.as_int != lastmv_as_int)
- new_mv_count++;
- lastmv_as_int = mv.as_int;
-
- // Does the Row vector point inwards or outwards
- if (mb_row < cm->mb_rows / 2) {
- if (mv.as_mv.row > 0)
- sum_in_vectors--;
- else if (mv.as_mv.row < 0)
- sum_in_vectors++;
- } else if (mb_row > cm->mb_rows / 2) {
- if (mv.as_mv.row > 0)
- sum_in_vectors++;
- else if (mv.as_mv.row < 0)
- sum_in_vectors--;
- }
-
- // Does the Column vector point inwards or outwards
- if (mb_col < cm->mb_cols / 2) {
- if (mv.as_mv.col > 0)
- sum_in_vectors--;
- else if (mv.as_mv.col < 0)
- sum_in_vectors++;
- } else if (mb_col > cm->mb_cols / 2) {
- if (mv.as_mv.col > 0)
- sum_in_vectors++;
- else if (mv.as_mv.col < 0)
- sum_in_vectors--;
- }
- }
- }
- } else
- sr_coded_error += (int64_t)this_error;
-
- coded_error += (int64_t)this_error;
-
- // adjust to the next column of macroblocks
- x->src.y_buffer += 16;
- x->src.u_buffer += 8;
- x->src.v_buffer += 8;
-
- recon_yoffset += 16;
- recon_uvoffset += 8;
- }
-
- // adjust to the next row of mbs
- x->src.y_buffer += 16 * x->src.y_stride - 16 * cm->mb_cols;
- x->src.u_buffer += 8 * x->src.uv_stride - 8 * cm->mb_cols;
- x->src.v_buffer += 8 * x->src.uv_stride - 8 * cm->mb_cols;
-
- // extend the recon for intra prediction
- vp9_extend_mb_row(new_yv12, xd->dst.y_buffer + 16,
- xd->dst.u_buffer + 8, xd->dst.v_buffer + 8);
- vp9_clear_system_state(); // __asm emms;
- }
-
- vp9_clear_system_state(); // __asm emms;
- {
- double weight = 0.0;
-
- FIRSTPASS_STATS fps;
-
- fps.frame = cm->current_video_frame;
- fps.intra_error = intra_error >> 8;
- fps.coded_error = coded_error >> 8;
- fps.sr_coded_error = sr_coded_error >> 8;
- weight = simple_weight(cpi->Source);
-
-
- if (weight < 0.1)
- weight = 0.1;
-
- fps.ssim_weighted_pred_err = fps.coded_error * weight;
-
- fps.pcnt_inter = 0.0;
- fps.pcnt_motion = 0.0;
- fps.MVr = 0.0;
- fps.mvr_abs = 0.0;
- fps.MVc = 0.0;
- fps.mvc_abs = 0.0;
- fps.MVrv = 0.0;
- fps.MVcv = 0.0;
- fps.mv_in_out_count = 0.0;
- fps.new_mv_count = 0.0;
- fps.count = 1.0;
-
- fps.pcnt_inter = 1.0 * (double)intercount / cm->MBs;
- fps.pcnt_second_ref = 1.0 * (double)second_ref_count / cm->MBs;
- fps.pcnt_neutral = 1.0 * (double)neutral_count / cm->MBs;
-
- if (mvcount > 0) {
- fps.MVr = (double)sum_mvr / (double)mvcount;
- fps.mvr_abs = (double)sum_mvr_abs / (double)mvcount;
- fps.MVc = (double)sum_mvc / (double)mvcount;
- fps.mvc_abs = (double)sum_mvc_abs / (double)mvcount;
- fps.MVrv = ((double)sum_mvrs - (fps.MVr * fps.MVr / (double)mvcount)) / (double)mvcount;
- fps.MVcv = ((double)sum_mvcs - (fps.MVc * fps.MVc / (double)mvcount)) / (double)mvcount;
- fps.mv_in_out_count = (double)sum_in_vectors / (double)(mvcount * 2);
- fps.new_mv_count = new_mv_count;
-
- fps.pcnt_motion = 1.0 * (double)mvcount / cpi->common.MBs;
- }
-
- // TODO: handle the case when duration is set to 0, or something less
- // than the full time between subsequent source timestamps.
- fps.duration = cpi->source->ts_end - cpi->source->ts_start;
-
- // don't want to do output stats with a stack variable!
- memcpy(cpi->twopass.this_frame_stats,
- &fps,
- sizeof(FIRSTPASS_STATS));
- output_stats(cpi, cpi->output_pkt_list, cpi->twopass.this_frame_stats);
- accumulate_stats(cpi->twopass.total_stats, &fps);
- }
-
- // Copy the previous Last Frame back into gf and arf buffers if
- // the prediction is good enough... but also don't allow it to lag too far.
- if ((cpi->twopass.sr_update_lag > 3) ||
- ((cm->current_video_frame > 0) &&
- (cpi->twopass.this_frame_stats->pcnt_inter > 0.20) &&
- ((cpi->twopass.this_frame_stats->intra_error /
- cpi->twopass.this_frame_stats->coded_error) > 2.0))) {
- vp8_yv12_copy_frame_ptr(lst_yv12, gld_yv12);
- cpi->twopass.sr_update_lag = 1;
- } else
- cpi->twopass.sr_update_lag++;
-
- // swap frame pointers so last frame refers to the frame we just compressed
- vp9_swap_yv12_buffer(lst_yv12, new_yv12);
- vp8_yv12_extend_frame_borders(lst_yv12);
-
- // Special case for the first frame. Copy into the GF buffer as a second reference.
- if (cm->current_video_frame == 0) {
- vp8_yv12_copy_frame_ptr(lst_yv12, gld_yv12);
- }
-
-
- // use this to see what the first pass reconstruction looks like
- if (0) {
- char filename[512];
- FILE *recon_file;
- sprintf(filename, "enc%04d.yuv", (int) cm->current_video_frame);
-
- if (cm->current_video_frame == 0)
- recon_file = fopen(filename, "wb");
- else
- recon_file = fopen(filename, "ab");
-
- if (fwrite(lst_yv12->buffer_alloc, lst_yv12->frame_size, 1, recon_file));
- fclose(recon_file);
- }
-
- cm->current_video_frame++;
-
-}
-
-// Estimate a cost per mb attributable to overheads such as the coding of
-// modes and motion vectors.
-// Currently simplistic in its assumptions for testing.
-//
-
-
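-// Cost in bits of an event with the given probability: -log2(prob).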
-static double bitcost(double prob) {
- return -(log(prob) / log(2.0));
-}
-
-static long long estimate_modemvcost(VP9_COMP *cpi,
- FIRSTPASS_STATS *fpstats) {
- int mv_cost;
- int mode_cost;
-
- double av_pct_inter = fpstats->pcnt_inter / fpstats->count;
- double av_pct_motion = fpstats->pcnt_motion / fpstats->count;
- double av_intra = (1.0 - av_pct_inter);
-
- double zz_cost;
- double motion_cost;
- double intra_cost;
-
- zz_cost = bitcost(av_pct_inter - av_pct_motion);
- motion_cost = bitcost(av_pct_motion);
- intra_cost = bitcost(av_intra);
-
- // Estimate of extra bits per mv overhead for mbs
- // << 9 is the normalization to the (bits * 512) used in vp9_bits_per_mb
- mv_cost = ((int)(fpstats->new_mv_count / fpstats->count) * 8) << 9;
-
- // Crude estimate of overhead cost from modes
- // << 9 is the normalization to (bits * 512) used in vp9_bits_per_mb
- mode_cost =
- (int)((((av_pct_inter - av_pct_motion) * zz_cost) +
- (av_pct_motion * motion_cost) +
- (av_intra * intra_cost)) * cpi->common.MBs) << 9;
-
- // return mv_cost + mode_cost;
- // TODO PGW Fix overhead costs for extended Q range
- return 0;
-}
-
-static double calc_correction_factor(double err_per_mb,
- double err_divisor,
- double pt_low,
- double pt_high,
- int Q) {
- double power_term;
- double error_term = err_per_mb / err_divisor;
- double correction_factor;
-
- // Adjustment based on actual quantizer to power term.
- power_term = (vp9_convert_qindex_to_q(Q) * 0.01) + pt_low;
- power_term = (power_term > pt_high) ? pt_high : power_term;
-
- // Adjustments to error term
- // TBD
-
- // Calculate correction factor
- correction_factor = pow(error_term, power_term);
-
- // Clip range
- correction_factor =
- (correction_factor < 0.05)
- ? 0.05 : (correction_factor > 2.0) ? 2.0 : correction_factor;
-
- return correction_factor;
-}
-
-// Given a current maxQ value sets a range for future values.
-// PGW TODO..
- // This code removes direct dependency on QIndex to determine the range
-// (now uses the actual quantizer) but has not been tuned.
-static void adjust_maxq_qrange(VP9_COMP *cpi) {
- int i;
- double q;
-
- // Set the max corresponding to cpi->avg_q * 2.0
- q = cpi->avg_q * 2.0;
- cpi->twopass.maxq_max_limit = cpi->worst_quality;
- for (i = cpi->best_quality; i <= cpi->worst_quality; i++) {
- cpi->twopass.maxq_max_limit = i;
- if (vp9_convert_qindex_to_q(i) >= q)
- break;
- }
-
- // Set the min corresponding to cpi->avg_q * 0.5
- q = cpi->avg_q * 0.5;
- cpi->twopass.maxq_min_limit = cpi->best_quality;
- for (i = cpi->worst_quality; i >= cpi->best_quality; i--) {
- cpi->twopass.maxq_min_limit = i;
- if (vp9_convert_qindex_to_q(i) <= q)
- break;
- }
-}
-
-static int estimate_max_q(VP9_COMP *cpi,
- FIRSTPASS_STATS *fpstats,
- int section_target_bandwitdh,
- int overhead_bits) {
- int Q;
- int num_mbs = cpi->common.MBs;
- int target_norm_bits_per_mb;
-
- double section_err = (fpstats->coded_error / fpstats->count);
- double sr_err_diff;
- double sr_correction;
- double err_per_mb = section_err / num_mbs;
- double err_correction_factor;
- double speed_correction = 1.0;
- int overhead_bits_per_mb;
-
- if (section_target_bandwitdh <= 0)
- return cpi->twopass.maxq_max_limit; // Highest value allowed
-
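- // Target bits per MB in the same (bits * 512) units used by
- // vp9_bits_per_mb; the split at 1 << 20 appears to guard the
- // 512 * bandwidth product against 32-bit overflow.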
- target_norm_bits_per_mb =
- (section_target_bandwitdh < (1 << 20))
- ? (512 * section_target_bandwitdh) / num_mbs
- : 512 * (section_target_bandwitdh / num_mbs);
-
- // Look at the drop in prediction quality between the last frame
- // and the GF buffer (which contained an older frame).
- sr_err_diff =
- (fpstats->sr_coded_error - fpstats->coded_error) /
- (fpstats->count * cpi->common.MBs);
- sr_correction = (sr_err_diff / 32.0);
- sr_correction = pow(sr_correction, 0.25);
- if (sr_correction < 0.75)
- sr_correction = 0.75;
- else if (sr_correction > 1.25)
- sr_correction = 1.25;
-
- // Calculate a corrective factor based on a rolling ratio of bits spent
- // vs target bits
- if ((cpi->rolling_target_bits > 0) &&
- (cpi->active_worst_quality < cpi->worst_quality)) {
- double rolling_ratio;
-
- rolling_ratio = (double)cpi->rolling_actual_bits /
- (double)cpi->rolling_target_bits;
-
- if (rolling_ratio < 0.95)
- cpi->twopass.est_max_qcorrection_factor -= 0.005;
- else if (rolling_ratio > 1.05)
- cpi->twopass.est_max_qcorrection_factor += 0.005;
-
- cpi->twopass.est_max_qcorrection_factor =
- (cpi->twopass.est_max_qcorrection_factor < 0.1)
- ? 0.1
- : (cpi->twopass.est_max_qcorrection_factor > 10.0)
- ? 10.0 : cpi->twopass.est_max_qcorrection_factor;
- }
-
- // Corrections for higher compression speed settings
- // (reduced compression expected)
- if (cpi->compressor_speed == 1) {
- if (cpi->oxcf.cpu_used <= 5)
- speed_correction = 1.04 + (cpi->oxcf.cpu_used * 0.04);
- else
- speed_correction = 1.25;
- }
-
- // Estimate of overhead bits per mb
- // Correction to overhead bits for min allowed Q.
- // PGW TODO.. This code is broken for the extended Q range
- // for now overhead set to 0.
- overhead_bits_per_mb = overhead_bits / num_mbs;
- overhead_bits_per_mb *= pow(0.98, (double)cpi->twopass.maxq_min_limit);
-
- // Try and pick a max Q that will be high enough to encode the
- // content at the given rate.
- for (Q = cpi->twopass.maxq_min_limit; Q < cpi->twopass.maxq_max_limit; Q++) {
- int bits_per_mb_at_this_q;
-
- err_correction_factor =
- calc_correction_factor(err_per_mb, ERR_DIVISOR, 0.4, 0.90, Q) *
- sr_correction * speed_correction *
- cpi->twopass.est_max_qcorrection_factor;
-
- if (err_correction_factor < 0.05)
- err_correction_factor = 0.05;
- else if (err_correction_factor > 5.0)
- err_correction_factor = 5.0;
-
- bits_per_mb_at_this_q =
- vp9_bits_per_mb(INTER_FRAME, Q) + overhead_bits_per_mb;
-
- bits_per_mb_at_this_q = (int)(.5 + err_correction_factor *
- (double)bits_per_mb_at_this_q);
-
- // Mode and motion overhead
- // As Q rises in real encode loop rd code will force overhead down
- // We make a crude adjustment for this here as *.98 per Q step.
- // PGW TODO.. This code is broken for the extended Q range
- // for now overhead set to 0.
- // overhead_bits_per_mb = (int)((double)overhead_bits_per_mb * 0.98);
-
- if (bits_per_mb_at_this_q <= target_norm_bits_per_mb)
- break;
- }
-
- // Restriction on active max q for constrained quality mode.
- if ((cpi->oxcf.end_usage == USAGE_CONSTRAINED_QUALITY) &&
- (Q < cpi->cq_target_quality)) {
- Q = cpi->cq_target_quality;
- }
-
- // Adjust the maxq_min_limit and maxq_max_limit limits based on the
- // average q observed in the clip for non kf/gf/arf frames.
- // Give the average a chance to settle though.
- // PGW TODO.. This code is broken for the extended Q range
- if ((cpi->ni_frames >
- ((unsigned int)cpi->twopass.total_stats->count >> 8)) &&
- (cpi->ni_frames > 150)) {
- adjust_maxq_qrange(cpi);
- }
-
- return Q;
-}
-
-// For cq mode estimate a cq level that matches the observed
-// complexity and data rate.
-static int estimate_cq(VP9_COMP *cpi,
- FIRSTPASS_STATS *fpstats,
- int section_target_bandwitdh,
- int overhead_bits) {
- int Q;
- int num_mbs = cpi->common.MBs;
- int target_norm_bits_per_mb;
-
- double section_err = (fpstats->coded_error / fpstats->count);
- double err_per_mb = section_err / num_mbs;
- double err_correction_factor;
- double sr_err_diff;
- double sr_correction;
- double speed_correction = 1.0;
- double clip_iiratio;
- double clip_iifactor;
- int overhead_bits_per_mb;
-
-
- target_norm_bits_per_mb = (section_target_bandwitdh < (1 << 20))
- ? (512 * section_target_bandwitdh) / num_mbs
- : 512 * (section_target_bandwitdh / num_mbs);
-
- // Estimate of overhead bits per mb
- overhead_bits_per_mb = overhead_bits / num_mbs;
-
- // Corrections for higher compression speed settings
- // (reduced compression expected)
- if (cpi->compressor_speed == 1) {
- if (cpi->oxcf.cpu_used <= 5)
- speed_correction = 1.04 + (cpi->oxcf.cpu_used * 0.04);
- else
- speed_correction = 1.25;
- }
-
- // Look at the drop in prediction quality between the last frame
- // and the GF buffer (which contained an older frame).
- sr_err_diff =
- (fpstats->sr_coded_error - fpstats->coded_error) /
- (fpstats->count * cpi->common.MBs);
- sr_correction = (sr_err_diff / 32.0);
- sr_correction = pow(sr_correction, 0.25);
- if (sr_correction < 0.75)
- sr_correction = 0.75;
- else if (sr_correction > 1.25)
- sr_correction = 1.25;
-
- // II ratio correction factor for clip as a whole
- clip_iiratio = cpi->twopass.total_stats->intra_error /
- DOUBLE_DIVIDE_CHECK(cpi->twopass.total_stats->coded_error);
- clip_iifactor = 1.0 - ((clip_iiratio - 10.0) * 0.025);
- if (clip_iifactor < 0.80)
- clip_iifactor = 0.80;
-
- // Try and pick a Q that can encode the content at the given rate.
- for (Q = 0; Q < MAXQ; Q++) {
- int bits_per_mb_at_this_q;
-
- // Error per MB based correction factor
- err_correction_factor =
- calc_correction_factor(err_per_mb, 100.0, 0.4, 0.90, Q) *
- sr_correction * speed_correction * clip_iifactor;
-
- if (err_correction_factor < 0.05)
- err_correction_factor = 0.05;
- else if (err_correction_factor > 5.0)
- err_correction_factor = 5.0;
-
- bits_per_mb_at_this_q =
- vp9_bits_per_mb(INTER_FRAME, Q) + overhead_bits_per_mb;
-
- bits_per_mb_at_this_q = (int)(.5 + err_correction_factor *
- (double)bits_per_mb_at_this_q);
-
- // Mode and motion overhead
-    // As Q rises in the real encode loop, the rd code will force overhead down.
- // We make a crude adjustment for this here as *.98 per Q step.
- // PGW TODO.. This code is broken for the extended Q range
- // for now overhead set to 0.
- overhead_bits_per_mb = (int)((double)overhead_bits_per_mb * 0.98);
-
- if (bits_per_mb_at_this_q <= target_norm_bits_per_mb)
- break;
- }
-
- // Clip value to range "best allowed to (worst allowed - 1)"
- Q = select_cq_level(Q);
- if (Q >= cpi->worst_quality)
- Q = cpi->worst_quality - 1;
- if (Q < cpi->best_quality)
- Q = cpi->best_quality;
-
- return Q;
-}
-
-
-extern void vp9_new_frame_rate(VP9_COMP *cpi, double framerate);
-
-void vp9_init_second_pass(VP9_COMP *cpi) {
- FIRSTPASS_STATS this_frame;
- FIRSTPASS_STATS *start_pos;
-
- double lower_bounds_min_rate = FRAME_OVERHEAD_BITS * cpi->oxcf.frame_rate;
- double two_pass_min_rate = (double)(cpi->oxcf.target_bandwidth
- * cpi->oxcf.two_pass_vbrmin_section / 100);
-
- if (two_pass_min_rate < lower_bounds_min_rate)
- two_pass_min_rate = lower_bounds_min_rate;
-
- zero_stats(cpi->twopass.total_stats);
- zero_stats(cpi->twopass.total_left_stats);
-
- if (!cpi->twopass.stats_in_end)
- return;
-
- *cpi->twopass.total_stats = *cpi->twopass.stats_in_end;
- *cpi->twopass.total_left_stats = *cpi->twopass.total_stats;
-
-  // Each frame can have a different duration, as the frame rate in the source
-  // isn't guaranteed to be constant. The frame rate prior to the first frame
-  // encoded in the second pass is a guess. However, the sum duration is not;
-  // it is calculated from the actual durations of all frames from the first
-  // pass.
- vp9_new_frame_rate(cpi,
- 10000000.0 * cpi->twopass.total_stats->count /
- cpi->twopass.total_stats->duration);
-
- cpi->output_frame_rate = cpi->oxcf.frame_rate;
- cpi->twopass.bits_left = (int64_t)(cpi->twopass.total_stats->duration *
- cpi->oxcf.target_bandwidth / 10000000.0);
- cpi->twopass.bits_left -= (int64_t)(cpi->twopass.total_stats->duration *
- two_pass_min_rate / 10000000.0);
-
- // Calculate a minimum intra value to be used in determining the IIratio
- // scores used in the second pass. We have this minimum to make sure
- // that clips that are static but "low complexity" in the intra domain
- // are still boosted appropriately for KF/GF/ARF
- cpi->twopass.kf_intra_err_min = KF_MB_INTRA_MIN * cpi->common.MBs;
- cpi->twopass.gf_intra_err_min = GF_MB_INTRA_MIN * cpi->common.MBs;
-
- // This variable monitors how far behind the second ref update is lagging
- cpi->twopass.sr_update_lag = 1;
-
- // Scan the first pass file and calculate an average Intra / Inter error score ratio for the sequence
- {
- double sum_iiratio = 0.0;
- double IIRatio;
-
- start_pos = cpi->twopass.stats_in; // Note starting "file" position
-
- while (input_stats(cpi, &this_frame) != EOF) {
- IIRatio = this_frame.intra_error / DOUBLE_DIVIDE_CHECK(this_frame.coded_error);
- IIRatio = (IIRatio < 1.0) ? 1.0 : (IIRatio > 20.0) ? 20.0 : IIRatio;
- sum_iiratio += IIRatio;
- }
-
- cpi->twopass.avg_iiratio = sum_iiratio / DOUBLE_DIVIDE_CHECK((double)cpi->twopass.total_stats->count);
-
- // Reset file position
- reset_fpf_position(cpi, start_pos);
- }
-
- // Scan the first pass file and calculate a modified total error based upon the bias/power function
- // used to allocate bits
- {
- start_pos = cpi->twopass.stats_in; // Note starting "file" position
-
- cpi->twopass.modified_error_total = 0.0;
- cpi->twopass.modified_error_used = 0.0;
-
- while (input_stats(cpi, &this_frame) != EOF) {
- cpi->twopass.modified_error_total += calculate_modified_err(cpi, &this_frame);
- }
- cpi->twopass.modified_error_left = cpi->twopass.modified_error_total;
-
- reset_fpf_position(cpi, start_pos); // Reset file position
-
- }
-}
-
-void vp9_end_second_pass(VP9_COMP *cpi) {
-}
-
-// This function gives an estimate of how badly we believe
-// the prediction quality is decaying from frame to frame.
-static double get_prediction_decay_rate(VP9_COMP *cpi,
- FIRSTPASS_STATS *next_frame) {
- double prediction_decay_rate;
- double second_ref_decay;
- double mb_sr_err_diff;
-
- // Initial basis is the % mbs inter coded
- prediction_decay_rate = next_frame->pcnt_inter;
-
- // Look at the observed drop in prediction quality between the last frame
- // and the GF buffer (which contains an older frame).
- mb_sr_err_diff =
- (next_frame->sr_coded_error - next_frame->coded_error) /
- (cpi->common.MBs);
- second_ref_decay = 1.0 - (mb_sr_err_diff / 512.0);
- second_ref_decay = pow(second_ref_decay, 0.5);
- if (second_ref_decay < 0.85)
- second_ref_decay = 0.85;
- else if (second_ref_decay > 1.0)
- second_ref_decay = 1.0;
-
- if (second_ref_decay < prediction_decay_rate)
- prediction_decay_rate = second_ref_decay;
-
- return prediction_decay_rate;
-}
-
-// Function to test for a condition where a complex transition is followed
-// by a static section. For example in slide shows where there is a fade
-// between slides. This is to help with more optimal kf and gf positioning.
-static int detect_transition_to_still(
- VP9_COMP *cpi,
- int frame_interval,
- int still_interval,
- double loop_decay_rate,
- double last_decay_rate) {
- BOOL trans_to_still = FALSE;
-
- // Break clause to detect very still sections after motion
- // For example a static image after a fade or other transition
- // instead of a clean scene cut.
- if ((frame_interval > MIN_GF_INTERVAL) &&
- (loop_decay_rate >= 0.999) &&
- (last_decay_rate < 0.9)) {
- int j;
- FIRSTPASS_STATS *position = cpi->twopass.stats_in;
- FIRSTPASS_STATS tmp_next_frame;
- double zz_inter;
-
- // Look ahead a few frames to see if static condition
- // persists...
- for (j = 0; j < still_interval; j++) {
- if (EOF == input_stats(cpi, &tmp_next_frame))
- break;
-
- zz_inter =
- (tmp_next_frame.pcnt_inter - tmp_next_frame.pcnt_motion);
- if (zz_inter < 0.999)
- break;
- }
- // Reset file position
- reset_fpf_position(cpi, position);
-
- // Only if it does do we signal a transition to still
- if (j == still_interval)
- trans_to_still = TRUE;
- }
-
- return trans_to_still;
-}
-
-// This function detects a flash through the high relative pcnt_second_ref
-// score in the frame following a flash frame. The offset passed in should
-// reflect this
-static BOOL detect_flash(VP9_COMP *cpi, int offset) {
- FIRSTPASS_STATS next_frame;
-
- BOOL flash_detected = FALSE;
-
- // Read the frame data.
- // The return is FALSE (no flash detected) if not a valid frame
- if (read_frame_stats(cpi, &next_frame, offset) != EOF) {
- // What we are looking for here is a situation where there is a
- // brief break in prediction (such as a flash) but subsequent frames
- // are reasonably well predicted by an earlier (pre flash) frame.
- // The recovery after a flash is indicated by a high pcnt_second_ref
-    // compared to pcnt_inter.
- if ((next_frame.pcnt_second_ref > next_frame.pcnt_inter) &&
- (next_frame.pcnt_second_ref >= 0.5)) {
- flash_detected = TRUE;
- }
- }
-
- return flash_detected;
-}
-
-// Update the motion related elements to the GF arf boost calculation
-static void accumulate_frame_motion_stats(
- VP9_COMP *cpi,
- FIRSTPASS_STATS *this_frame,
- double *this_frame_mv_in_out,
- double *mv_in_out_accumulator,
- double *abs_mv_in_out_accumulator,
- double *mv_ratio_accumulator) {
- // double this_frame_mv_in_out;
- double this_frame_mvr_ratio;
- double this_frame_mvc_ratio;
- double motion_pct;
-
- // Accumulate motion stats.
- motion_pct = this_frame->pcnt_motion;
-
- // Accumulate Motion In/Out of frame stats
- *this_frame_mv_in_out = this_frame->mv_in_out_count * motion_pct;
- *mv_in_out_accumulator += this_frame->mv_in_out_count * motion_pct;
- *abs_mv_in_out_accumulator +=
- fabs(this_frame->mv_in_out_count * motion_pct);
-
- // Accumulate a measure of how uniform (or conversely how random)
- // the motion field is. (A ratio of absmv / mv)
- if (motion_pct > 0.05) {
- this_frame_mvr_ratio = fabs(this_frame->mvr_abs) /
- DOUBLE_DIVIDE_CHECK(fabs(this_frame->MVr));
-
- this_frame_mvc_ratio = fabs(this_frame->mvc_abs) /
- DOUBLE_DIVIDE_CHECK(fabs(this_frame->MVc));
-
- *mv_ratio_accumulator +=
- (this_frame_mvr_ratio < this_frame->mvr_abs)
- ? (this_frame_mvr_ratio * motion_pct)
- : this_frame->mvr_abs * motion_pct;
-
- *mv_ratio_accumulator +=
- (this_frame_mvc_ratio < this_frame->mvc_abs)
- ? (this_frame_mvc_ratio * motion_pct)
- : this_frame->mvc_abs * motion_pct;
-
- }
-}
-
-// Calculate a baseline boost number for the current frame.
-static double calc_frame_boost(
- VP9_COMP *cpi,
- FIRSTPASS_STATS *this_frame,
- double this_frame_mv_in_out) {
- double frame_boost;
-
- // Underlying boost factor is based on inter intra error ratio
- if (this_frame->intra_error > cpi->twopass.gf_intra_err_min)
- frame_boost = (IIFACTOR * this_frame->intra_error /
- DOUBLE_DIVIDE_CHECK(this_frame->coded_error));
- else
- frame_boost = (IIFACTOR * cpi->twopass.gf_intra_err_min /
- DOUBLE_DIVIDE_CHECK(this_frame->coded_error));
-
-  // Increase boost for frames where new data is coming into the frame
-  // (e.g. zoom out). Slightly reduce boost if there is a net balance
- // of motion out of the frame (zoom in).
- // The range for this_frame_mv_in_out is -1.0 to +1.0
- if (this_frame_mv_in_out > 0.0)
- frame_boost += frame_boost * (this_frame_mv_in_out * 2.0);
- // In extreme case boost is halved
- else
- frame_boost += frame_boost * (this_frame_mv_in_out / 2.0);
-
- // Clip to maximum
- if (frame_boost > GF_RMAX)
- frame_boost = GF_RMAX;
-
- return frame_boost;
-}
-
-static int calc_arf_boost(
- VP9_COMP *cpi,
- int offset,
- int f_frames,
- int b_frames,
- int *f_boost,
- int *b_boost) {
- FIRSTPASS_STATS this_frame;
-
- int i;
- double boost_score = 0.0;
- double mv_ratio_accumulator = 0.0;
- double decay_accumulator = 1.0;
- double this_frame_mv_in_out = 0.0;
- double mv_in_out_accumulator = 0.0;
- double abs_mv_in_out_accumulator = 0.0;
- int arf_boost;
- BOOL flash_detected = FALSE;
-
- // Search forward from the proposed arf/next gf position
- for (i = 0; i < f_frames; i++) {
- if (read_frame_stats(cpi, &this_frame, (i + offset)) == EOF)
- break;
-
- // Update the motion related elements to the boost calculation
- accumulate_frame_motion_stats(cpi, &this_frame,
- &this_frame_mv_in_out, &mv_in_out_accumulator,
- &abs_mv_in_out_accumulator, &mv_ratio_accumulator);
-
-    // We want to discount the flash frame itself and the recovery
- // frame that follows as both will have poor scores.
- flash_detected = detect_flash(cpi, (i + offset)) ||
- detect_flash(cpi, (i + offset + 1));
-
- // Cumulative effect of prediction quality decay
- if (!flash_detected) {
- decay_accumulator =
- decay_accumulator *
- get_prediction_decay_rate(cpi, &this_frame);
- decay_accumulator =
- decay_accumulator < 0.1 ? 0.1 : decay_accumulator;
- }
-
- boost_score += (decay_accumulator *
- calc_frame_boost(cpi, &this_frame, this_frame_mv_in_out));
- }
-
- *f_boost = boost_score;
-
- // Reset for backward looking loop
- boost_score = 0.0;
- mv_ratio_accumulator = 0.0;
- decay_accumulator = 1.0;
- this_frame_mv_in_out = 0.0;
- mv_in_out_accumulator = 0.0;
- abs_mv_in_out_accumulator = 0.0;
-
- // Search backward towards last gf position
- for (i = -1; i >= -b_frames; i--) {
- if (read_frame_stats(cpi, &this_frame, (i + offset)) == EOF)
- break;
-
- // Update the motion related elements to the boost calculation
- accumulate_frame_motion_stats(cpi, &this_frame,
- &this_frame_mv_in_out, &mv_in_out_accumulator,
- &abs_mv_in_out_accumulator, &mv_ratio_accumulator);
-
-    // We want to discount the flash frame itself and the recovery
- // frame that follows as both will have poor scores.
- flash_detected = detect_flash(cpi, (i + offset)) ||
- detect_flash(cpi, (i + offset + 1));
-
- // Cumulative effect of prediction quality decay
- if (!flash_detected) {
- decay_accumulator =
- decay_accumulator *
- get_prediction_decay_rate(cpi, &this_frame);
- decay_accumulator =
- decay_accumulator < 0.1 ? 0.1 : decay_accumulator;
- }
-
- boost_score += (decay_accumulator *
- calc_frame_boost(cpi, &this_frame, this_frame_mv_in_out));
-
- }
- *b_boost = boost_score;
-
- arf_boost = (*f_boost + *b_boost);
- if (arf_boost < ((b_frames + f_frames) * 20))
- arf_boost = ((b_frames + f_frames) * 20);
-
- return arf_boost;
-}
-
-static void configure_arnr_filter(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
- int half_gf_int;
- int frames_after_arf;
- int frames_bwd = cpi->oxcf.arnr_max_frames - 1;
- int frames_fwd = cpi->oxcf.arnr_max_frames - 1;
-
- // Define the arnr filter width for this group of frames:
- // We only filter frames that lie within a distance of half
- // the GF interval from the ARF frame. We also have to trap
- // cases where the filter extends beyond the end of clip.
- // Note: this_frame->frame has been updated in the loop
- // so it now points at the ARF frame.
- half_gf_int = cpi->baseline_gf_interval >> 1;
- frames_after_arf = cpi->twopass.total_stats->count -
- this_frame->frame - 1;
-
- switch (cpi->oxcf.arnr_type) {
- case 1: // Backward filter
- frames_fwd = 0;
- if (frames_bwd > half_gf_int)
- frames_bwd = half_gf_int;
- break;
-
- case 2: // Forward filter
- if (frames_fwd > half_gf_int)
- frames_fwd = half_gf_int;
- if (frames_fwd > frames_after_arf)
- frames_fwd = frames_after_arf;
- frames_bwd = 0;
- break;
-
- case 3: // Centered filter
- default:
- frames_fwd >>= 1;
- if (frames_fwd > frames_after_arf)
- frames_fwd = frames_after_arf;
- if (frames_fwd > half_gf_int)
- frames_fwd = half_gf_int;
-
- frames_bwd = frames_fwd;
-
- // For even length filter there is one more frame backward
- // than forward: e.g. len=6 ==> bbbAff, len=7 ==> bbbAfff.
- if (frames_bwd < half_gf_int)
- frames_bwd += (cpi->oxcf.arnr_max_frames + 1) & 0x1;
- break;
- }
-
- cpi->active_arnr_frames = frames_bwd + 1 + frames_fwd;
-}
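
A minimal standalone sketch of the centered-filter arithmetic above, using a hypothetical request of arnr_max_frames = 6 and generous interval limits; it only restates the calculation already performed by configure_arnr_filter.

    /* Sketch: centered ARNR window sizing (illustrative helper, not in the tree). */
    static int arnr_centered_window(int arnr_max_frames, int half_gf_int,
                                    int frames_after_arf) {
      int frames_fwd = arnr_max_frames - 1;
      int frames_bwd;

      frames_fwd >>= 1;                      /* half the span goes forward */
      if (frames_fwd > frames_after_arf)
        frames_fwd = frames_after_arf;       /* trap the end-of-clip case */
      if (frames_fwd > half_gf_int)
        frames_fwd = half_gf_int;            /* stay within half the GF interval */

      frames_bwd = frames_fwd;
      if (frames_bwd < half_gf_int)          /* even lengths take one extra back */
        frames_bwd += (arnr_max_frames + 1) & 0x1;

      /* arnr_max_frames = 6 gives fwd = 2, bwd = 3: six frames, "bbbAff". */
      return frames_bwd + 1 + frames_fwd;
    }
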
-
-// Analyse and define a gf/arf group.
-static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
- FIRSTPASS_STATS next_frame;
- FIRSTPASS_STATS *start_pos;
- int i;
- double boost_score = 0.0;
- double old_boost_score = 0.0;
- double gf_group_err = 0.0;
- double gf_first_frame_err = 0.0;
- double mod_frame_err = 0.0;
-
- double mv_ratio_accumulator = 0.0;
- double decay_accumulator = 1.0;
- double zero_motion_accumulator = 1.0;
-
- double loop_decay_rate = 1.00; // Starting decay rate
- double last_loop_decay_rate = 1.00;
-
- double this_frame_mv_in_out = 0.0;
- double mv_in_out_accumulator = 0.0;
- double abs_mv_in_out_accumulator = 0.0;
-
- int max_bits = frame_max_bits(cpi); // Max for a single frame
-
- unsigned int allow_alt_ref =
- cpi->oxcf.play_alternate && cpi->oxcf.lag_in_frames;
-
- int f_boost = 0;
- int b_boost = 0;
- BOOL flash_detected;
-
- cpi->twopass.gf_group_bits = 0;
-
- vp9_clear_system_state(); // __asm emms;
-
- start_pos = cpi->twopass.stats_in;
-
- vpx_memset(&next_frame, 0, sizeof(next_frame)); // assure clean
-
- // Load stats for the current frame.
- mod_frame_err = calculate_modified_err(cpi, this_frame);
-
- // Note the error of the frame at the start of the group (this will be
-  // the GF frame error if we code a normal gf).
- gf_first_frame_err = mod_frame_err;
-
- // Special treatment if the current frame is a key frame (which is also
-  // a gf). If it is, then its error score (and hence bit allocation) needs
-  // to be subtracted out from the calculation for the GF group.
- if (cpi->common.frame_type == KEY_FRAME)
- gf_group_err -= gf_first_frame_err;
-
- // Scan forward to try and work out how many frames the next gf group
- // should contain and what level of boost is appropriate for the GF
- // or ARF that will be coded with the group
- i = 0;
-
- while (((i < cpi->twopass.static_scene_max_gf_interval) ||
- ((cpi->twopass.frames_to_key - i) < MIN_GF_INTERVAL)) &&
- (i < cpi->twopass.frames_to_key)) {
- i++; // Increment the loop counter
-
- // Accumulate error score of frames in this gf group
- mod_frame_err = calculate_modified_err(cpi, this_frame);
- gf_group_err += mod_frame_err;
-
- if (EOF == input_stats(cpi, &next_frame))
- break;
-
- // Test for the case where there is a brief flash but the prediction
- // quality back to an earlier frame is then restored.
- flash_detected = detect_flash(cpi, 0);
-
- // Update the motion related elements to the boost calculation
- accumulate_frame_motion_stats(cpi, &next_frame,
- &this_frame_mv_in_out, &mv_in_out_accumulator,
- &abs_mv_in_out_accumulator, &mv_ratio_accumulator);
-
- // Cumulative effect of prediction quality decay
- if (!flash_detected) {
- last_loop_decay_rate = loop_decay_rate;
- loop_decay_rate = get_prediction_decay_rate(cpi, &next_frame);
- decay_accumulator = decay_accumulator * loop_decay_rate;
-
- // Monitor for static sections.
- if ((next_frame.pcnt_inter - next_frame.pcnt_motion) <
- zero_motion_accumulator) {
- zero_motion_accumulator =
- (next_frame.pcnt_inter - next_frame.pcnt_motion);
- }
-
- // Break clause to detect very still sections after motion
-      // (for example a static image after a fade or other transition).
- if (detect_transition_to_still(cpi, i, 5, loop_decay_rate,
- last_loop_decay_rate)) {
- allow_alt_ref = FALSE;
- break;
- }
- }
-
- // Calculate a boost number for this frame
- boost_score +=
- (decay_accumulator *
- calc_frame_boost(cpi, &next_frame, this_frame_mv_in_out));
-
- // Break out conditions.
- if (
- // Break at cpi->max_gf_interval unless almost totally static
- (i >= cpi->max_gf_interval && (zero_motion_accumulator < 0.995)) ||
- (
-        // Don't break out with a very short interval
- (i > MIN_GF_INTERVAL) &&
-        // Don't break out very close to a key frame
- ((cpi->twopass.frames_to_key - i) >= MIN_GF_INTERVAL) &&
- ((boost_score > 125.0) || (next_frame.pcnt_inter < 0.75)) &&
- (!flash_detected) &&
- ((mv_ratio_accumulator > 100.0) ||
- (abs_mv_in_out_accumulator > 3.0) ||
- (mv_in_out_accumulator < -2.0) ||
- ((boost_score - old_boost_score) < 12.5))
- )) {
- boost_score = old_boost_score;
- break;
- }
-
- vpx_memcpy(this_frame, &next_frame, sizeof(*this_frame));
-
- old_boost_score = boost_score;
- }
-
-  // Don't allow a gf too near the next kf
- if ((cpi->twopass.frames_to_key - i) < MIN_GF_INTERVAL) {
- while (i < cpi->twopass.frames_to_key) {
- i++;
-
- if (EOF == input_stats(cpi, this_frame))
- break;
-
- if (i < cpi->twopass.frames_to_key) {
- mod_frame_err = calculate_modified_err(cpi, this_frame);
- gf_group_err += mod_frame_err;
- }
- }
- }
-
- // Set the interval till the next gf or arf.
- cpi->baseline_gf_interval = i;
-
-  // Should we use the alternate reference frame?
- if (allow_alt_ref &&
- (i < cpi->oxcf.lag_in_frames) &&
- (i >= MIN_GF_INTERVAL) &&
-      // don't use ARF very near the next kf
- (i <= (cpi->twopass.frames_to_key - MIN_GF_INTERVAL)) &&
- ((next_frame.pcnt_inter > 0.75) ||
- (next_frame.pcnt_second_ref > 0.5)) &&
- ((mv_in_out_accumulator / (double)i > -0.2) ||
- (mv_in_out_accumulator > -2.0)) &&
- (boost_score > 100)) {
-    // Alternative boost calculation for alt ref
- cpi->gfu_boost = calc_arf_boost(cpi, 0, (i - 1), (i - 1), &f_boost, &b_boost);
- cpi->source_alt_ref_pending = TRUE;
-
- configure_arnr_filter(cpi, this_frame);
- } else {
- cpi->gfu_boost = (int)boost_score;
- cpi->source_alt_ref_pending = FALSE;
- }
-
- // Now decide how many bits should be allocated to the GF group as a
- // proportion of those remaining in the kf group.
- // The final key frame group in the clip is treated as a special case
- // where cpi->twopass.kf_group_bits is tied to cpi->twopass.bits_left.
- // This is also important for short clips where there may only be one
- // key frame.
- if (cpi->twopass.frames_to_key >= (int)(cpi->twopass.total_stats->count -
- cpi->common.current_video_frame)) {
- cpi->twopass.kf_group_bits =
- (cpi->twopass.bits_left > 0) ? cpi->twopass.bits_left : 0;
- }
-
- // Calculate the bits to be allocated to the group as a whole
- if ((cpi->twopass.kf_group_bits > 0) &&
- (cpi->twopass.kf_group_error_left > 0)) {
- cpi->twopass.gf_group_bits =
- (int)((double)cpi->twopass.kf_group_bits *
- (gf_group_err / (double)cpi->twopass.kf_group_error_left));
- } else
- cpi->twopass.gf_group_bits = 0;
-
- cpi->twopass.gf_group_bits =
- (cpi->twopass.gf_group_bits < 0)
- ? 0
- : (cpi->twopass.gf_group_bits > cpi->twopass.kf_group_bits)
- ? cpi->twopass.kf_group_bits : cpi->twopass.gf_group_bits;
-
- // Clip cpi->twopass.gf_group_bits based on user supplied data rate
- // variability limit (cpi->oxcf.two_pass_vbrmax_section)
- if (cpi->twopass.gf_group_bits > max_bits * cpi->baseline_gf_interval)
- cpi->twopass.gf_group_bits = max_bits * cpi->baseline_gf_interval;
-
- // Reset the file position
- reset_fpf_position(cpi, start_pos);
-
- // Update the record of error used so far (only done once per gf group)
- cpi->twopass.modified_error_used += gf_group_err;
-
- // Assign bits to the arf or gf.
- for (i = 0; i <= (cpi->source_alt_ref_pending && cpi->common.frame_type != KEY_FRAME); i++) {
- int boost;
- int allocation_chunks;
- int Q = (cpi->oxcf.fixed_q < 0) ? cpi->last_q[INTER_FRAME] : cpi->oxcf.fixed_q;
- int gf_bits;
-
- boost = (cpi->gfu_boost * vp9_gfboost_qadjust(Q)) / 100;
-
- // Set max and minimum boost and hence minimum allocation
- if (boost > ((cpi->baseline_gf_interval + 1) * 200))
- boost = ((cpi->baseline_gf_interval + 1) * 200);
- else if (boost < 125)
- boost = 125;
-
- if (cpi->source_alt_ref_pending && i == 0)
- allocation_chunks =
- ((cpi->baseline_gf_interval + 1) * 100) + boost;
- else
- allocation_chunks =
- (cpi->baseline_gf_interval * 100) + (boost - 100);
-
- // Prevent overflow
- if (boost > 1028) {
- int divisor = boost >> 10;
- boost /= divisor;
- allocation_chunks /= divisor;
- }
-
- // Calculate the number of bits to be spent on the gf or arf based on
- // the boost number
- gf_bits = (int)((double)boost *
- (cpi->twopass.gf_group_bits /
- (double)allocation_chunks));
-
- // If the frame that is to be boosted is simpler than the average for
- // the gf/arf group then use an alternative calculation
- // based on the error score of the frame itself
- if (mod_frame_err < gf_group_err / (double)cpi->baseline_gf_interval) {
- double alt_gf_grp_bits;
- int alt_gf_bits;
-
- alt_gf_grp_bits =
- (double)cpi->twopass.kf_group_bits *
- (mod_frame_err * (double)cpi->baseline_gf_interval) /
- DOUBLE_DIVIDE_CHECK((double)cpi->twopass.kf_group_error_left);
-
- alt_gf_bits = (int)((double)boost * (alt_gf_grp_bits /
- (double)allocation_chunks));
-
- if (gf_bits > alt_gf_bits) {
- gf_bits = alt_gf_bits;
- }
- }
- // Else if it is harder than other frames in the group make sure it at
- // least receives an allocation in keeping with its relative error
- // score, otherwise it may be worse off than an "un-boosted" frame
- else {
- int alt_gf_bits =
- (int)((double)cpi->twopass.kf_group_bits *
- mod_frame_err /
- DOUBLE_DIVIDE_CHECK((double)cpi->twopass.kf_group_error_left));
-
- if (alt_gf_bits > gf_bits) {
- gf_bits = alt_gf_bits;
- }
- }
-
-    // Don't allow a negative value for gf_bits
- if (gf_bits < 0)
- gf_bits = 0;
-
- gf_bits += cpi->min_frame_bandwidth; // Add in minimum for a frame
-
- if (i == 0) {
- cpi->twopass.gf_bits = gf_bits;
- }
- if (i == 1 || (!cpi->source_alt_ref_pending && (cpi->common.frame_type != KEY_FRAME))) {
- cpi->per_frame_bandwidth = gf_bits; // Per frame bit target for this frame
- }
- }
-
- {
-    // Adjust KF group bits and error remaining
- cpi->twopass.kf_group_error_left -= gf_group_err;
- cpi->twopass.kf_group_bits -= cpi->twopass.gf_group_bits;
-
- if (cpi->twopass.kf_group_bits < 0)
- cpi->twopass.kf_group_bits = 0;
-
- // Note the error score left in the remaining frames of the group.
- // For normal GFs we want to remove the error score for the first frame
- // of the group (except in Key frame case where this has already
- // happened)
- if (!cpi->source_alt_ref_pending && cpi->common.frame_type != KEY_FRAME)
- cpi->twopass.gf_group_error_left = gf_group_err - gf_first_frame_err;
- else
- cpi->twopass.gf_group_error_left = gf_group_err;
-
- cpi->twopass.gf_group_bits -= cpi->twopass.gf_bits - cpi->min_frame_bandwidth;
-
- if (cpi->twopass.gf_group_bits < 0)
- cpi->twopass.gf_group_bits = 0;
-
- // This condition could fail if there are two kfs very close together
-    // despite (MIN_GF_INTERVAL) and would cause a divide by 0 in the
- // calculation of cpi->twopass.alt_extra_bits.
- if (cpi->baseline_gf_interval >= 3) {
- int boost = (cpi->source_alt_ref_pending)
- ? b_boost : cpi->gfu_boost;
-
- if (boost >= 150) {
- int pct_extra;
-
- pct_extra = (boost - 100) / 50;
- pct_extra = (pct_extra > 20) ? 20 : pct_extra;
-
- cpi->twopass.alt_extra_bits =
- (cpi->twopass.gf_group_bits * pct_extra) / 100;
- cpi->twopass.gf_group_bits -= cpi->twopass.alt_extra_bits;
- cpi->twopass.alt_extra_bits /=
- ((cpi->baseline_gf_interval - 1) >> 1);
- } else
- cpi->twopass.alt_extra_bits = 0;
- } else
- cpi->twopass.alt_extra_bits = 0;
- }
-
- if (cpi->common.frame_type != KEY_FRAME) {
- FIRSTPASS_STATS sectionstats;
-
-    zero_stats(&sectionstats);
- reset_fpf_position(cpi, start_pos);
-
- for (i = 0; i < cpi->baseline_gf_interval; i++) {
- input_stats(cpi, &next_frame);
-      accumulate_stats(&sectionstats, &next_frame);
- }
-
-    avg_stats(&sectionstats);
-
- cpi->twopass.section_intra_rating =
- sectionstats.intra_error /
- DOUBLE_DIVIDE_CHECK(sectionstats.coded_error);
-
- reset_fpf_position(cpi, start_pos);
- }
-}
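
A minimal sketch of the proportional bit split above, with illustrative numbers; in the real path the boost comes from calc_arf_boost() or the accumulated boost_score, and the group bits from the kf group budget.

    #include <stdint.h>

    /* Sketch: share of the GF group bits handed to the boosted (gf/arf) frame.
     * e.g. baseline_gf_interval = 12, boost = 400, ARF pending:
     * allocation_chunks = (12 + 1) * 100 + 400 = 1700, so the frame gets
     * about 400 / 1700 (roughly 23.5%) of the group's bits. */
    static int sketch_gf_bits(int64_t gf_group_bits, int boost,
                              int baseline_gf_interval, int arf_pending) {
      int allocation_chunks = arf_pending
          ? (baseline_gf_interval + 1) * 100 + boost
          : baseline_gf_interval * 100 + (boost - 100);
      return (int)((double)boost *
                   ((double)gf_group_bits / (double)allocation_chunks));
    }
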
-
-// Allocate bits to a normal frame that is neither a gf, an arf, nor a key frame.
-static void assign_std_frame_bits(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
- int target_frame_size; // gf_group_error_left
-
- double modified_err;
- double err_fraction; // What portion of the remaining GF group error is used by this frame
-
- int max_bits = frame_max_bits(cpi); // Max for a single frame
-
- // Calculate modified prediction error used in bit allocation
- modified_err = calculate_modified_err(cpi, this_frame);
-
- if (cpi->twopass.gf_group_error_left > 0)
- err_fraction = modified_err / cpi->twopass.gf_group_error_left; // What portion of the remaining GF group error is used by this frame
- else
- err_fraction = 0.0;
-
- target_frame_size = (int)((double)cpi->twopass.gf_group_bits * err_fraction); // How many of those bits available for allocation should we give it?
-
-  // Clip the target size to 0 - max_bits (or cpi->twopass.gf_group_bits) at the top end.
- if (target_frame_size < 0)
- target_frame_size = 0;
- else {
- if (target_frame_size > max_bits)
- target_frame_size = max_bits;
-
- if (target_frame_size > cpi->twopass.gf_group_bits)
- target_frame_size = cpi->twopass.gf_group_bits;
- }
-
- cpi->twopass.gf_group_error_left -= modified_err; // Adjust error remaining
- cpi->twopass.gf_group_bits -= target_frame_size; // Adjust bits remaining
-
- if (cpi->twopass.gf_group_bits < 0)
- cpi->twopass.gf_group_bits = 0;
-
- target_frame_size += cpi->min_frame_bandwidth; // Add in the minimum number of bits that is set aside for every frame.
-
-
- cpi->per_frame_bandwidth = target_frame_size; // Per frame bit target for this frame
-}
-
-// Make a damped adjustment to the active max q.
-static int adjust_active_maxq(int old_maxqi, int new_maxqi) {
- int i;
- int ret_val = new_maxqi;
- double old_q;
- double new_q;
- double target_q;
-
- old_q = vp9_convert_qindex_to_q(old_maxqi);
- new_q = vp9_convert_qindex_to_q(new_maxqi);
-
- target_q = ((old_q * 7.0) + new_q) / 8.0;
-
- if (target_q > old_q) {
- for (i = old_maxqi; i <= new_maxqi; i++) {
- if (vp9_convert_qindex_to_q(i) >= target_q) {
- ret_val = i;
- break;
- }
- }
- } else {
- for (i = old_maxqi; i >= new_maxqi; i--) {
- if (vp9_convert_qindex_to_q(i) <= target_q) {
- ret_val = i;
- break;
- }
- }
- }
-
- return ret_val;
-}
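
A small worked example of the damping above: the active max Q moves only one eighth of the way toward the new estimate per call, working in the real quantizer domain rather than qindex units.

    /* Sketch: damped target for the active max Q (values are illustrative).
     * old_q = 40.0, new_q = 60.0  ->  target_q = (7 * 40 + 60) / 8 = 42.5,
     * and adjust_active_maxq() then steps the qindex from old_maxqi until
     * vp9_convert_qindex_to_q() first reaches 42.5. */
    static double sketch_damped_target_q(double old_q, double new_q) {
      return (old_q * 7.0 + new_q) / 8.0;
    }
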
-
-void vp9_second_pass(VP9_COMP *cpi) {
- int tmp_q;
- int frames_left = (int)(cpi->twopass.total_stats->count - cpi->common.current_video_frame);
-
- FIRSTPASS_STATS this_frame;
- FIRSTPASS_STATS this_frame_copy;
-
- double this_frame_error;
- double this_frame_intra_error;
- double this_frame_coded_error;
-
- FIRSTPASS_STATS *start_pos;
-
- int overhead_bits;
-
- if (!cpi->twopass.stats_in) {
- return;
- }
-
- vp9_clear_system_state();
-
- vpx_memset(&this_frame, 0, sizeof(FIRSTPASS_STATS));
-
- if (EOF == input_stats(cpi, &this_frame))
- return;
-
- this_frame_error = this_frame.ssim_weighted_pred_err;
- this_frame_intra_error = this_frame.intra_error;
- this_frame_coded_error = this_frame.coded_error;
-
- start_pos = cpi->twopass.stats_in;
-
-  // Keyframe and section processing.
- if (cpi->twopass.frames_to_key == 0) {
- // Define next KF group and assign bits to it
- vpx_memcpy(&this_frame_copy, &this_frame, sizeof(this_frame));
- find_next_key_frame(cpi, &this_frame_copy);
- }
-
- // Is this a GF / ARF (Note that a KF is always also a GF)
- if (cpi->frames_till_gf_update_due == 0) {
- // Define next gf group and assign bits to it
- vpx_memcpy(&this_frame_copy, &this_frame, sizeof(this_frame));
- define_gf_group(cpi, &this_frame_copy);
-
-    // If we are going to code an altref frame at the end of the group and the current frame is not a key frame....
-    // If the previous group used an arf, this frame has already benefited from that arf boost and should not be given extra bits.
-    // If the previous group was NOT coded using an arf, we may want to apply some boost to this GF as well.
- if (cpi->source_alt_ref_pending && (cpi->common.frame_type != KEY_FRAME)) {
-      // Assign a standard frame's worth of bits from those allocated to the GF group
- int bak = cpi->per_frame_bandwidth;
- vpx_memcpy(&this_frame_copy, &this_frame, sizeof(this_frame));
- assign_std_frame_bits(cpi, &this_frame_copy);
- cpi->per_frame_bandwidth = bak;
- }
- }
-
- // Otherwise this is an ordinary frame
- else {
- // Assign bits from those allocated to the GF group
- vpx_memcpy(&this_frame_copy, &this_frame, sizeof(this_frame));
- assign_std_frame_bits(cpi, &this_frame_copy);
- }
-
- // Keep a globally available copy of this and the next frame's iiratio.
- cpi->twopass.this_iiratio = this_frame_intra_error /
- DOUBLE_DIVIDE_CHECK(this_frame_coded_error);
- {
- FIRSTPASS_STATS next_frame;
- if (lookup_next_frame_stats(cpi, &next_frame) != EOF) {
- cpi->twopass.next_iiratio = next_frame.intra_error /
- DOUBLE_DIVIDE_CHECK(next_frame.coded_error);
- }
- }
-
- // Set nominal per second bandwidth for this frame
- cpi->target_bandwidth = cpi->per_frame_bandwidth * cpi->output_frame_rate;
- if (cpi->target_bandwidth < 0)
- cpi->target_bandwidth = 0;
-
-
- // Account for mv, mode and other overheads.
- overhead_bits = estimate_modemvcost(
- cpi, cpi->twopass.total_left_stats);
-
- // Special case code for first frame.
- if (cpi->common.current_video_frame == 0) {
- cpi->twopass.est_max_qcorrection_factor = 1.0;
-
- // Set a cq_level in constrained quality mode.
- if (cpi->oxcf.end_usage == USAGE_CONSTRAINED_QUALITY) {
- int est_cq;
-
- est_cq =
- estimate_cq(cpi,
- cpi->twopass.total_left_stats,
- (int)(cpi->twopass.bits_left / frames_left),
- overhead_bits);
-
- cpi->cq_target_quality = cpi->oxcf.cq_level;
- if (est_cq > cpi->cq_target_quality)
- cpi->cq_target_quality = est_cq;
- }
-
- // guess at maxq needed in 2nd pass
- cpi->twopass.maxq_max_limit = cpi->worst_quality;
- cpi->twopass.maxq_min_limit = cpi->best_quality;
-
- tmp_q = estimate_max_q(
- cpi,
- cpi->twopass.total_left_stats,
- (int)(cpi->twopass.bits_left / frames_left),
- overhead_bits);
-
- cpi->active_worst_quality = tmp_q;
- cpi->ni_av_qi = tmp_q;
- cpi->avg_q = vp9_convert_qindex_to_q(tmp_q);
-
- // Limit the maxq value returned subsequently.
- // This increases the risk of overspend or underspend if the initial
- // estimate for the clip is bad, but helps prevent excessive
- // variation in Q, especially near the end of a clip
- // where for example a small overspend may cause Q to crash
- adjust_maxq_qrange(cpi);
- }
-
-  // The last few frames of a clip almost always have too few or too many
-  // bits and, for the sake of overly exact rate control, we don't want to make
-  // radical adjustments to the allowed quantizer range just to use up a
-  // few surplus bits or get beneath the target rate.
- else if ((cpi->common.current_video_frame <
- (((unsigned int)cpi->twopass.total_stats->count * 255) >> 8)) &&
- ((cpi->common.current_video_frame + cpi->baseline_gf_interval) <
- (unsigned int)cpi->twopass.total_stats->count)) {
- if (frames_left < 1)
- frames_left = 1;
-
- tmp_q = estimate_max_q(
- cpi,
- cpi->twopass.total_left_stats,
- (int)(cpi->twopass.bits_left / frames_left),
- overhead_bits);
-
- // Make a damped adjustment to active max Q
- cpi->active_worst_quality =
- adjust_active_maxq(cpi->active_worst_quality, tmp_q);
- }
-
- cpi->twopass.frames_to_key--;
-
-  // Update the total stats remaining structure
- subtract_stats(cpi->twopass.total_left_stats, &this_frame);
-}
-
-
-static BOOL test_candidate_kf(VP9_COMP *cpi, FIRSTPASS_STATS *last_frame, FIRSTPASS_STATS *this_frame, FIRSTPASS_STATS *next_frame) {
- BOOL is_viable_kf = FALSE;
-
- // Does the frame satisfy the primary criteria of a key frame
- // If so, then examine how well it predicts subsequent frames
- if ((this_frame->pcnt_second_ref < 0.10) &&
- (next_frame->pcnt_second_ref < 0.10) &&
- ((this_frame->pcnt_inter < 0.05) ||
- (
- ((this_frame->pcnt_inter - this_frame->pcnt_neutral) < .35) &&
- ((this_frame->intra_error / DOUBLE_DIVIDE_CHECK(this_frame->coded_error)) < 2.5) &&
- ((fabs(last_frame->coded_error - this_frame->coded_error) / DOUBLE_DIVIDE_CHECK(this_frame->coded_error) > .40) ||
- (fabs(last_frame->intra_error - this_frame->intra_error) / DOUBLE_DIVIDE_CHECK(this_frame->intra_error) > .40) ||
- ((next_frame->intra_error / DOUBLE_DIVIDE_CHECK(next_frame->coded_error)) > 3.5)
- )
- )
- )
- ) {
- int i;
- FIRSTPASS_STATS *start_pos;
-
- FIRSTPASS_STATS local_next_frame;
-
- double boost_score = 0.0;
- double old_boost_score = 0.0;
- double decay_accumulator = 1.0;
- double next_iiratio;
-
- vpx_memcpy(&local_next_frame, next_frame, sizeof(*next_frame));
-
- // Note the starting file position so we can reset to it
- start_pos = cpi->twopass.stats_in;
-
- // Examine how well the key frame predicts subsequent frames
- for (i = 0; i < 16; i++) {
- next_iiratio = (IIKFACTOR1 * local_next_frame.intra_error / DOUBLE_DIVIDE_CHECK(local_next_frame.coded_error));
-
- if (next_iiratio > RMAX)
- next_iiratio = RMAX;
-
- // Cumulative effect of decay in prediction quality
- if (local_next_frame.pcnt_inter > 0.85)
- decay_accumulator = decay_accumulator * local_next_frame.pcnt_inter;
- else
- decay_accumulator = decay_accumulator * ((0.85 + local_next_frame.pcnt_inter) / 2.0);
-
- // decay_accumulator = decay_accumulator * local_next_frame.pcnt_inter;
-
- // Keep a running total
- boost_score += (decay_accumulator * next_iiratio);
-
- // Test various breakout clauses
- if ((local_next_frame.pcnt_inter < 0.05) ||
- (next_iiratio < 1.5) ||
- (((local_next_frame.pcnt_inter -
- local_next_frame.pcnt_neutral) < 0.20) &&
- (next_iiratio < 3.0)) ||
- ((boost_score - old_boost_score) < 3.0) ||
- (local_next_frame.intra_error < 200)
- ) {
- break;
- }
-
- old_boost_score = boost_score;
-
- // Get the next frame details
- if (EOF == input_stats(cpi, &local_next_frame))
- break;
- }
-
-    // If there is tolerable prediction for at least the next 3 frames then break out, else discard this potential key frame and move on.
- if (boost_score > 30.0 && (i > 3))
- is_viable_kf = TRUE;
- else {
- // Reset the file position
- reset_fpf_position(cpi, start_pos);
-
- is_viable_kf = FALSE;
- }
- }
-
- return is_viable_kf;
-}
-static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
- int i, j;
- FIRSTPASS_STATS last_frame;
- FIRSTPASS_STATS first_frame;
- FIRSTPASS_STATS next_frame;
- FIRSTPASS_STATS *start_position;
-
- double decay_accumulator = 1.0;
- double zero_motion_accumulator = 1.0;
- double boost_score = 0;
- double old_boost_score = 0.0;
- double loop_decay_rate;
-
- double kf_mod_err = 0.0;
- double kf_group_err = 0.0;
- double kf_group_intra_err = 0.0;
- double kf_group_coded_err = 0.0;
- double recent_loop_decay[8] = {1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0};
-
- vpx_memset(&next_frame, 0, sizeof(next_frame)); // assure clean
-
- vp9_clear_system_state(); // __asm emms;
- start_position = cpi->twopass.stats_in;
-
- cpi->common.frame_type = KEY_FRAME;
-
- // is this a forced key frame by interval
- cpi->this_key_frame_forced = cpi->next_key_frame_forced;
-
- // Clear the alt ref active flag as this can never be active on a key frame
- cpi->source_alt_ref_active = FALSE;
-
- // Kf is always a gf so clear frames till next gf counter
- cpi->frames_till_gf_update_due = 0;
-
- cpi->twopass.frames_to_key = 1;
-
- // Take a copy of the initial frame details
- vpx_memcpy(&first_frame, this_frame, sizeof(*this_frame));
-
-  cpi->twopass.kf_group_bits = 0;  // Total bits available to kf group
- cpi->twopass.kf_group_error_left = 0; // Group modified error score.
-
- kf_mod_err = calculate_modified_err(cpi, this_frame);
-
- // find the next keyframe
- i = 0;
- while (cpi->twopass.stats_in < cpi->twopass.stats_in_end) {
- // Accumulate kf group error
- kf_group_err += calculate_modified_err(cpi, this_frame);
-
- // These figures keep intra and coded error counts for all frames including key frames in the group.
- // The effect of the key frame itself can be subtracted out using the first_frame data collected above
- kf_group_intra_err += this_frame->intra_error;
- kf_group_coded_err += this_frame->coded_error;
-
-    // Load the next frame's stats.
- vpx_memcpy(&last_frame, this_frame, sizeof(*this_frame));
- input_stats(cpi, this_frame);
-
- // Provided that we are not at the end of the file...
- if (cpi->oxcf.auto_key
- && lookup_next_frame_stats(cpi, &next_frame) != EOF) {
- // Normal scene cut check
- if (test_candidate_kf(cpi, &last_frame, this_frame, &next_frame)) {
- break;
- }
-
- // How fast is prediction quality decaying
- loop_decay_rate = get_prediction_decay_rate(cpi, &next_frame);
-
- // We want to know something about the recent past... rather than
-      // as used elsewhere where we are concerned with decay in prediction
- // quality since the last GF or KF.
- recent_loop_decay[i % 8] = loop_decay_rate;
- decay_accumulator = 1.0;
- for (j = 0; j < 8; j++) {
- decay_accumulator = decay_accumulator * recent_loop_decay[j];
- }
-
-      // Special check for a transition or high motion followed by a
-      // static scene.
- if (detect_transition_to_still(cpi, i,
- (cpi->key_frame_frequency - i),
- loop_decay_rate,
- decay_accumulator)) {
- break;
- }
-
-
- // Step on to the next frame
- cpi->twopass.frames_to_key++;
-
- // If we don't have a real key frame within the next two
- // forcekeyframeevery intervals then break out of the loop.
- if (cpi->twopass.frames_to_key >= 2 * (int)cpi->key_frame_frequency)
- break;
- } else
- cpi->twopass.frames_to_key++;
-
- i++;
- }
-
- // If there is a max kf interval set by the user we must obey it.
-  // We already break out of the loop above at 2x max.
- // This code centers the extra kf if the actual natural
- // interval is between 1x and 2x
- if (cpi->oxcf.auto_key
- && cpi->twopass.frames_to_key > (int)cpi->key_frame_frequency) {
- FIRSTPASS_STATS *current_pos = cpi->twopass.stats_in;
- FIRSTPASS_STATS tmp_frame;
-
- cpi->twopass.frames_to_key /= 2;
-
- // Copy first frame details
- vpx_memcpy(&tmp_frame, &first_frame, sizeof(first_frame));
-
- // Reset to the start of the group
- reset_fpf_position(cpi, start_position);
-
- kf_group_err = 0;
- kf_group_intra_err = 0;
- kf_group_coded_err = 0;
-
- // Rescan to get the correct error data for the forced kf group
- for (i = 0; i < cpi->twopass.frames_to_key; i++) {
- // Accumulate kf group errors
- kf_group_err += calculate_modified_err(cpi, &tmp_frame);
- kf_group_intra_err += tmp_frame.intra_error;
- kf_group_coded_err += tmp_frame.coded_error;
-
-      // Load the next frame's stats.
- input_stats(cpi, &tmp_frame);
- }
-
- // Reset to the start of the group
- reset_fpf_position(cpi, current_pos);
-
- cpi->next_key_frame_forced = TRUE;
- } else
- cpi->next_key_frame_forced = FALSE;
-
- // Special case for the last frame of the file
- if (cpi->twopass.stats_in >= cpi->twopass.stats_in_end) {
- // Accumulate kf group error
- kf_group_err += calculate_modified_err(cpi, this_frame);
-
- // These figures keep intra and coded error counts for all frames including key frames in the group.
- // The effect of the key frame itself can be subtracted out using the first_frame data collected above
- kf_group_intra_err += this_frame->intra_error;
- kf_group_coded_err += this_frame->coded_error;
- }
-
- // Calculate the number of bits that should be assigned to the kf group.
- if ((cpi->twopass.bits_left > 0) && (cpi->twopass.modified_error_left > 0.0)) {
- // Max for a single normal frame (not key frame)
- int max_bits = frame_max_bits(cpi);
-
- // Maximum bits for the kf group
- int64_t max_grp_bits;
-
- // Default allocation based on bits left and relative
- // complexity of the section
- cpi->twopass.kf_group_bits = (int64_t)(cpi->twopass.bits_left *
- (kf_group_err /
- cpi->twopass.modified_error_left));
-
- // Clip based on maximum per frame rate defined by the user.
- max_grp_bits = (int64_t)max_bits * (int64_t)cpi->twopass.frames_to_key;
- if (cpi->twopass.kf_group_bits > max_grp_bits)
- cpi->twopass.kf_group_bits = max_grp_bits;
- } else
- cpi->twopass.kf_group_bits = 0;
-
- // Reset the first pass file position
- reset_fpf_position(cpi, start_position);
-
-  // Determine how big to make this key frame based on how well the subsequent frames use inter blocks.
- decay_accumulator = 1.0;
- boost_score = 0.0;
- loop_decay_rate = 1.00; // Starting decay rate
-
- for (i = 0; i < cpi->twopass.frames_to_key; i++) {
- double r;
-
- if (EOF == input_stats(cpi, &next_frame))
- break;
-
- if (next_frame.intra_error > cpi->twopass.kf_intra_err_min)
- r = (IIKFACTOR2 * next_frame.intra_error /
- DOUBLE_DIVIDE_CHECK(next_frame.coded_error));
- else
- r = (IIKFACTOR2 * cpi->twopass.kf_intra_err_min /
- DOUBLE_DIVIDE_CHECK(next_frame.coded_error));
-
- if (r > RMAX)
- r = RMAX;
-
- // Monitor for static sections.
- if ((next_frame.pcnt_inter - next_frame.pcnt_motion) <
- zero_motion_accumulator) {
- zero_motion_accumulator =
- (next_frame.pcnt_inter - next_frame.pcnt_motion);
- }
-
- // How fast is prediction quality decaying
- if (!detect_flash(cpi, 0)) {
- loop_decay_rate = get_prediction_decay_rate(cpi, &next_frame);
- decay_accumulator = decay_accumulator * loop_decay_rate;
- decay_accumulator = decay_accumulator < 0.1 ? 0.1 : decay_accumulator;
- }
-
- boost_score += (decay_accumulator * r);
-
- if ((i > MIN_GF_INTERVAL) &&
- ((boost_score - old_boost_score) < 6.25)) {
- break;
- }
-
- old_boost_score = boost_score;
- }
-
- {
- FIRSTPASS_STATS sectionstats;
-
-    zero_stats(&sectionstats);
- reset_fpf_position(cpi, start_position);
-
- for (i = 0; i < cpi->twopass.frames_to_key; i++) {
- input_stats(cpi, &next_frame);
-      accumulate_stats(&sectionstats, &next_frame);
- }
-
-    avg_stats(&sectionstats);
-
- cpi->twopass.section_intra_rating =
- sectionstats.intra_error
- / DOUBLE_DIVIDE_CHECK(sectionstats.coded_error);
- }
-
- // Reset the first pass file position
- reset_fpf_position(cpi, start_position);
-
- // Work out how many bits to allocate for the key frame itself
- if (1) {
- int kf_boost = boost_score;
- int allocation_chunks;
- int alt_kf_bits;
-
- if (kf_boost < 300) {
- kf_boost += (cpi->twopass.frames_to_key * 3);
- if (kf_boost > 300)
- kf_boost = 300;
- }
-
- if (kf_boost < 250) // Min KF boost
- kf_boost = 250;
-
- // Make a note of baseline boost and the zero motion
- // accumulator value for use elsewhere.
- cpi->kf_boost = kf_boost;
- cpi->kf_zeromotion_pct = (int)(zero_motion_accumulator * 100.0);
-
- // We do three calculations for kf size.
- // The first is based on the error score for the whole kf group.
-    // The second (optionally) on the key frame's own error if this is
-    // smaller than the average for the group.
-    // The final one ensures that the frame receives at least the
-    // allocation it would have received based on its own error score vs
-    // the error score remaining.
-    // Special case if the sequence appears almost totally static.
- // In this case we want to spend almost all of the bits on the
- // key frame.
- // cpi->twopass.frames_to_key-1 because key frame itself is taken
- // care of by kf_boost.
- if (zero_motion_accumulator >= 0.99) {
- allocation_chunks =
- ((cpi->twopass.frames_to_key - 1) * 10) + kf_boost;
- } else {
- allocation_chunks =
- ((cpi->twopass.frames_to_key - 1) * 100) + kf_boost;
- }
-
- // Prevent overflow
- if (kf_boost > 1028) {
- int divisor = kf_boost >> 10;
- kf_boost /= divisor;
- allocation_chunks /= divisor;
- }
-
- cpi->twopass.kf_group_bits = (cpi->twopass.kf_group_bits < 0) ? 0 : cpi->twopass.kf_group_bits;
-
- // Calculate the number of bits to be spent on the key frame
- cpi->twopass.kf_bits = (int)((double)kf_boost * ((double)cpi->twopass.kf_group_bits / (double)allocation_chunks));
-
- // If the key frame is actually easier than the average for the
-    // kf group (which does sometimes happen, e.g. a blank intro frame),
-    // then use an alternate calculation based on the kf error score
- // which should give a smaller key frame.
- if (kf_mod_err < kf_group_err / cpi->twopass.frames_to_key) {
- double alt_kf_grp_bits =
- ((double)cpi->twopass.bits_left *
- (kf_mod_err * (double)cpi->twopass.frames_to_key) /
- DOUBLE_DIVIDE_CHECK(cpi->twopass.modified_error_left));
-
- alt_kf_bits = (int)((double)kf_boost *
- (alt_kf_grp_bits / (double)allocation_chunks));
-
- if (cpi->twopass.kf_bits > alt_kf_bits) {
- cpi->twopass.kf_bits = alt_kf_bits;
- }
- }
- // Else if it is much harder than other frames in the group make sure
- // it at least receives an allocation in keeping with its relative
- // error score
- else {
- alt_kf_bits =
- (int)((double)cpi->twopass.bits_left *
- (kf_mod_err /
- DOUBLE_DIVIDE_CHECK(cpi->twopass.modified_error_left)));
-
- if (alt_kf_bits > cpi->twopass.kf_bits) {
- cpi->twopass.kf_bits = alt_kf_bits;
- }
- }
-
- cpi->twopass.kf_group_bits -= cpi->twopass.kf_bits;
- cpi->twopass.kf_bits += cpi->min_frame_bandwidth; // Add in the minimum frame allowance
-
-    cpi->per_frame_bandwidth = cpi->twopass.kf_bits; // Per frame bit target for this frame
- cpi->target_bandwidth = cpi->twopass.kf_bits * cpi->output_frame_rate; // Convert to a per second bitrate
- }
-
- // Note the total error score of the kf group minus the key frame itself
- cpi->twopass.kf_group_error_left = (int)(kf_group_err - kf_mod_err);
-
- // Adjust the count of total modified error left.
- // The count of bits left is adjusted elsewhere based on real coded frame sizes
- cpi->twopass.modified_error_left -= kf_group_err;
-}
--- a/vp8/encoder/firstpass.h
+++ /dev/null
@@ -1,23 +1,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#if !defined __INC_FIRSTPASS_H
-#define __INC_FIRSTPASS_H
-
-extern void vp9_init_first_pass(VP9_COMP *cpi);
-extern void vp9_first_pass(VP9_COMP *cpi);
-extern void vp9_end_first_pass(VP9_COMP *cpi);
-
-extern void vp9_init_second_pass(VP9_COMP *cpi);
-extern void vp9_second_pass(VP9_COMP *cpi);
-extern void vp9_end_second_pass(VP9_COMP *cpi);
-
-#endif
--- a/vp8/encoder/generic/csystemdependent.c
+++ /dev/null
@@ -1,48 +1,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include "vpx_ports/config.h"
-#include "vp8/encoder/variance.h"
-#include "vp8/encoder/onyx_int.h"
-
-
-void vp9_arch_x86_encoder_init(VP9_COMP *cpi);
-void vp9_arch_arm_encoder_init(VP9_COMP *cpi);
-
-void (*vp9_yv12_copy_partial_frame_ptr)(YV12_BUFFER_CONFIG *src_ybc,
- YV12_BUFFER_CONFIG *dst_ybc,
- int fraction);
-extern void vp9_yv12_copy_partial_frame(YV12_BUFFER_CONFIG *src_ybc,
- YV12_BUFFER_CONFIG *dst_ybc,
- int fraction);
-
-void vp9_cmachine_specific_config(VP9_COMP *cpi) {
-#if CONFIG_RUNTIME_CPU_DETECT
- cpi->rtcd.common = &cpi->common.rtcd;
-
- cpi->rtcd.search.full_search = vp9_full_search_sad;
- cpi->rtcd.search.refining_search = vp9_refining_search_sad;
- cpi->rtcd.search.diamond_search = vp9_diamond_search_sad;
- cpi->rtcd.temporal.apply = vp9_temporal_filter_apply_c;
-#endif
-
- vp9_yv12_copy_partial_frame_ptr = vp9_yv12_copy_partial_frame;
-
-#if ARCH_X86 || ARCH_X86_64
- vp9_arch_x86_encoder_init(cpi);
-#endif
-
-#if ARCH_ARM
- vp9_arch_arm_encoder_init(cpi);
-#endif
-
-
-}
--- a/vp8/encoder/lookahead.c
+++ /dev/null
@@ -1,191 +1,0 @@
-/*
- * Copyright (c) 2011 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-#include <assert.h>
-#include <stdlib.h>
-#include "vpx_config.h"
-#include "lookahead.h"
-#include "vp8/common/extend.h"
-
-#define MAX_LAG_BUFFERS 25
-
-struct lookahead_ctx {
- unsigned int max_sz; /* Absolute size of the queue */
- unsigned int sz; /* Number of buffers currently in the queue */
- unsigned int read_idx; /* Read index */
- unsigned int write_idx; /* Write index */
- struct lookahead_entry *buf; /* Buffer list */
-};
-
-
-/* Return the buffer at the given absolute index and increment the index */
-static struct lookahead_entry *
-pop(struct lookahead_ctx *ctx,
- unsigned int *idx) {
- unsigned int index = *idx;
- struct lookahead_entry *buf = ctx->buf + index;
-
- assert(index < ctx->max_sz);
- if (++index >= ctx->max_sz)
- index -= ctx->max_sz;
- *idx = index;
- return buf;
-}
-
-
-void
-vp9_lookahead_destroy(struct lookahead_ctx *ctx) {
- if (ctx) {
- if (ctx->buf) {
- int i;
-
- for (i = 0; i < ctx->max_sz; i++)
- vp8_yv12_de_alloc_frame_buffer(&ctx->buf[i].img);
- free(ctx->buf);
- }
- free(ctx);
- }
-}
-
-
-struct lookahead_ctx *
-vp9_lookahead_init(unsigned int width,
- unsigned int height,
- unsigned int depth) {
- struct lookahead_ctx *ctx = NULL;
- int i;
-
- /* Clamp the lookahead queue depth */
- if (depth < 1)
- depth = 1;
- else if (depth > MAX_LAG_BUFFERS)
- depth = MAX_LAG_BUFFERS;
-
- /* Align the buffer dimensions */
- width = (width + 15) &~15;
- height = (height + 15) &~15;
-
- /* Allocate the lookahead structures */
- ctx = calloc(1, sizeof(*ctx));
- if (ctx) {
- ctx->max_sz = depth;
- ctx->buf = calloc(depth, sizeof(*ctx->buf));
- if (!ctx->buf)
- goto bail;
- for (i = 0; i < depth; i++)
- if (vp8_yv12_alloc_frame_buffer(&ctx->buf[i].img,
- width, height, VP8BORDERINPIXELS))
- goto bail;
- }
- return ctx;
-bail:
- vp9_lookahead_destroy(ctx);
- return NULL;
-}
-
-
-int
-vp9_lookahead_push(struct lookahead_ctx *ctx,
- YV12_BUFFER_CONFIG *src,
- int64_t ts_start,
- int64_t ts_end,
- unsigned int flags,
- unsigned char *active_map) {
- struct lookahead_entry *buf;
- int row, col, active_end;
- int mb_rows = (src->y_height + 15) >> 4;
- int mb_cols = (src->y_width + 15) >> 4;
-
- if (ctx->sz + 1 > ctx->max_sz)
- return 1;
- ctx->sz++;
- buf = pop(ctx, &ctx->write_idx);
-
- // Only do this partial copy if the following conditions are all met:
-  // 1. Lookahead queue has a size of 1.
-  // 2. Active map is provided.
-  // 3. This is not a key frame, golden frame, nor altref frame.
- if (ctx->max_sz == 1 && active_map && !flags) {
- for (row = 0; row < mb_rows; ++row) {
- col = 0;
-
- while (1) {
- // Find the first active macroblock in this row.
- for (; col < mb_cols; ++col) {
- if (active_map[col])
- break;
- }
-
- // No more active macroblock in this row.
- if (col == mb_cols)
- break;
-
- // Find the end of active region in this row.
- active_end = col;
-
- for (; active_end < mb_cols; ++active_end) {
- if (!active_map[active_end])
- break;
- }
-
- // Only copy this active region.
- vp9_copy_and_extend_frame_with_rect(src, &buf->img,
- row << 4,
- col << 4, 16,
- (active_end - col) << 4);
-
- // Start again from the end of this active region.
- col = active_end;
- }
-
- active_map += mb_cols;
- }
- } else {
- vp9_copy_and_extend_frame(src, &buf->img);
- }
- buf->ts_start = ts_start;
- buf->ts_end = ts_end;
- buf->flags = flags;
- return 0;
-}
-
-
-struct lookahead_entry *
-vp9_lookahead_pop(struct lookahead_ctx *ctx,
- int drain) {
- struct lookahead_entry *buf = NULL;
-
- if (ctx->sz && (drain || ctx->sz == ctx->max_sz)) {
- buf = pop(ctx, &ctx->read_idx);
- ctx->sz--;
- }
- return buf;
-}
-
-
-struct lookahead_entry *
-vp9_lookahead_peek(struct lookahead_ctx *ctx,
- int index) {
- struct lookahead_entry *buf = NULL;
-
- assert(index < ctx->max_sz);
- if (index < ctx->sz) {
- index += ctx->read_idx;
- if (index >= ctx->max_sz)
- index -= ctx->max_sz;
- buf = ctx->buf + index;
- }
- return buf;
-}
-
-
-unsigned int
-vp9_lookahead_depth(struct lookahead_ctx *ctx) {
- return ctx->sz;
-}
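
A minimal usage sketch of the lookahead queue defined above, using only the functions shown in this file; get_next_source_frame() and encode_frame() are hypothetical placeholders for the surrounding encoder.

    #include "lookahead.h"

    YV12_BUFFER_CONFIG *get_next_source_frame(void);                /* placeholder */
    void encode_frame(const YV12_BUFFER_CONFIG *img,
                      int64_t ts_start, int64_t ts_end);            /* placeholder */

    /* Sketch: producer/consumer use of the lookahead queue. */
    void sketch_lookahead_usage(void) {
      struct lookahead_ctx *ctx = vp9_lookahead_init(1280, 720, 25 /* depth */);
      YV12_BUFFER_CONFIG *src;
      struct lookahead_entry *ent;
      int64_t ts = 0;

      if (!ctx)
        return;

      while ((src = get_next_source_frame()) != NULL) {
        /* vp9_lookahead_push() returns nonzero when the queue is full. */
        if (vp9_lookahead_push(ctx, src, ts, ts + 1, 0 /* flags */, NULL))
          break;
        ts++;

        /* Pops return NULL until the queue reaches its configured depth. */
        if ((ent = vp9_lookahead_pop(ctx, 0 /* drain */)) != NULL)
          encode_frame(&ent->img, ent->ts_start, ent->ts_end);
      }

      /* Drain whatever is still queued at the end of the stream. */
      while ((ent = vp9_lookahead_pop(ctx, 1)) != NULL)
        encode_frame(&ent->img, ent->ts_start, ent->ts_end);

      vp9_lookahead_destroy(ctx);
    }
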
--- a/vp8/encoder/lookahead.h
+++ /dev/null
@@ -1,105 +1,0 @@
-/*
- * Copyright (c) 2011 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-#ifndef LOOKAHEAD_H
-#define LOOKAHEAD_H
-#include "vpx_scale/yv12config.h"
-#include "vpx/vpx_integer.h"
-
-struct lookahead_entry {
- YV12_BUFFER_CONFIG img;
- int64_t ts_start;
- int64_t ts_end;
- unsigned int flags;
-};
-
-
-struct lookahead_ctx;
-
-/**\brief Initializes the lookahead stage
- *
- * The lookahead stage is a queue of frame buffers on which some analysis
- * may be done when buffers are enqueued.
- *
- *
- */
-struct lookahead_ctx *vp9_lookahead_init(unsigned int width,
- unsigned int height,
- unsigned int depth
- );
-
-
-/**\brief Destroys the lookahead stage
- *
- */
-void vp9_lookahead_destroy(struct lookahead_ctx *ctx);
-
-
-/**\brief Enqueue a source buffer
- *
- * This function will copy the source image into a new framebuffer with
- * the expected stride/border.
- *
- * If active_map is non-NULL and there is only one frame in the queue, then copy
- * only active macroblocks.
- *
- * \param[in] ctx Pointer to the lookahead context
- * \param[in] src Pointer to the image to enqueue
- * \param[in] ts_start Timestamp for the start of this frame
- * \param[in] ts_end Timestamp for the end of this frame
- * \param[in] flags Flags set on this frame
- * \param[in] active_map Map that specifies which macroblock is active
- */
-int
-vp9_lookahead_push(struct lookahead_ctx *ctx,
- YV12_BUFFER_CONFIG *src,
- int64_t ts_start,
- int64_t ts_end,
- unsigned int flags,
- unsigned char *active_map);
-
-
-/**\brief Get the next source buffer to encode
- *
- *
- * \param[in] ctx Pointer to the lookahead context
- * \param[in] drain Flag indicating the buffer should be drained
- * (return a buffer regardless of the current queue depth)
- *
- * \retval NULL, if drain set and queue is empty
- * \retval NULL, if drain not set and queue not of the configured depth
- *
- */
-struct lookahead_entry *
-vp9_lookahead_pop(struct lookahead_ctx *ctx,
- int drain);
-
-
-/**\brief Get a future source buffer to encode
- *
- * \param[in] ctx Pointer to the lookahead context
- * \param[in] index Index of the frame to be returned, 0 == next frame
- *
- * \retval NULL, if no buffer exists at the specified index
- *
- */
-struct lookahead_entry *
-vp9_lookahead_peek(struct lookahead_ctx *ctx,
- int index);
-
-
-/**\brief Get the number of frames currently in the lookahead queue
- *
- * \param[in] ctx Pointer to the lookahead context
- */
-unsigned int
-vp9_lookahead_depth(struct lookahead_ctx *ctx);
-
-
-#endif
--- a/vp8/encoder/mbgraph.c
+++ /dev/null
@@ -1,480 +1,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include <limits.h>
-#include <vp8/encoder/encodeintra.h>
-#include <vp8/encoder/rdopt.h>
-#include <vp8/common/setupintrarecon.h>
-#include <vp8/common/blockd.h>
-#include <vp8/common/reconinter.h>
-#include <vp8/common/systemdependent.h>
-#include <vpx_mem/vpx_mem.h>
-#include <vp8/encoder/segmentation.h>
-
-static unsigned int do_16x16_motion_iteration(VP9_COMP *cpi,
- int_mv *ref_mv,
- int_mv *dst_mv) {
- MACROBLOCK *const x = &cpi->mb;
- MACROBLOCKD *const xd = &x->e_mbd;
- BLOCK *b = &x->block[0];
- BLOCKD *d = &xd->block[0];
- vp9_variance_fn_ptr_t v_fn_ptr = cpi->fn_ptr[BLOCK_16X16];
- unsigned int best_err;
- int step_param, further_steps;
-
- int tmp_col_min = x->mv_col_min;
- int tmp_col_max = x->mv_col_max;
- int tmp_row_min = x->mv_row_min;
- int tmp_row_max = x->mv_row_max;
- int_mv ref_full;
-
- // Further step/diamond searches as necessary
- if (cpi->Speed < 8) {
- step_param = cpi->sf.first_step + ((cpi->Speed > 5) ? 1 : 0);
- further_steps = (cpi->sf.max_step_search_steps - 1) - step_param;
- } else {
- step_param = cpi->sf.first_step + 2;
- further_steps = 0;
- }
-
- vp9_clamp_mv_min_max(x, ref_mv);
-
- ref_full.as_mv.col = ref_mv->as_mv.col >> 3;
- ref_full.as_mv.row = ref_mv->as_mv.row >> 3;
-
- /*cpi->sf.search_method == HEX*/
- best_err = vp9_hex_search(
- x, b, d,
- &ref_full, dst_mv,
- step_param,
- x->errorperbit,
- &v_fn_ptr,
- NULLMVCOST,
- NULLMVCOST,
- ref_mv);
-
- // Try sub-pixel MC
- // if (bestsme > error_thresh && bestsme < INT_MAX)
- {
- int distortion;
- unsigned int sse;
- best_err = cpi->find_fractional_mv_step(
- x, b, d,
- dst_mv, ref_mv,
- x->errorperbit, &v_fn_ptr,
- NULLMVCOST,
- &distortion, &sse);
- }
-
-#if CONFIG_PRED_FILTER
- // Disable the prediction filter
- xd->mode_info_context->mbmi.pred_filter_enabled = 0;
-#endif
-
- vp9_set_mbmode_and_mvs(x, NEWMV, dst_mv);
- vp9_build_1st_inter16x16_predictors_mby(xd, xd->predictor, 16, 0);
- best_err = vp9_sad16x16(xd->dst.y_buffer, xd->dst.y_stride,
- xd->predictor, 16, INT_MAX);
-
- /* restore UMV window */
- x->mv_col_min = tmp_col_min;
- x->mv_col_max = tmp_col_max;
- x->mv_row_min = tmp_row_min;
- x->mv_row_max = tmp_row_max;
-
- return best_err;
-}
-
-static int do_16x16_motion_search
-(
- VP9_COMP *cpi,
- int_mv *ref_mv,
- int_mv *dst_mv,
- YV12_BUFFER_CONFIG *buf,
- int buf_mb_y_offset,
- YV12_BUFFER_CONFIG *ref,
- int mb_y_offset
-) {
- MACROBLOCK *const x = &cpi->mb;
- MACROBLOCKD *const xd = &x->e_mbd;
- unsigned int err, tmp_err;
- int_mv tmp_mv;
- int n;
-
- for (n = 0; n < 16; n++) {
- BLOCKD *d = &xd->block[n];
- BLOCK *b = &x->block[n];
-
- b->base_src = &buf->y_buffer;
- b->src_stride = buf->y_stride;
- b->src = buf->y_stride * (n & 12) + (n & 3) * 4 + buf_mb_y_offset;
-
- d->base_pre = &ref->y_buffer;
- d->pre_stride = ref->y_stride;
- d->pre = ref->y_stride * (n & 12) + (n & 3) * 4 + mb_y_offset;
- }
-
- // Try zero MV first
- // FIXME should really use something like near/nearest MV and/or MV prediction
- xd->pre.y_buffer = ref->y_buffer + mb_y_offset;
- xd->pre.y_stride = ref->y_stride;
- err = vp9_sad16x16(ref->y_buffer + mb_y_offset, ref->y_stride,
- xd->dst.y_buffer, xd->dst.y_stride, INT_MAX);
- dst_mv->as_int = 0;
-
- // Test last reference frame using the previous best mv as the
- // starting point (best reference) for the search
- tmp_err = do_16x16_motion_iteration(cpi, ref_mv, &tmp_mv);
- if (tmp_err < err) {
- err = tmp_err;
- dst_mv->as_int = tmp_mv.as_int;
- }
-
- // If the current best reference mv is not centred on 0,0 then do a 0,0 based search as well
- if (ref_mv->as_int) {
- int tmp_err;
- int_mv zero_ref_mv, tmp_mv;
-
- zero_ref_mv.as_int = 0;
- tmp_err = do_16x16_motion_iteration(cpi, &zero_ref_mv, &tmp_mv);
- if (tmp_err < err) {
- dst_mv->as_int = tmp_mv.as_int;
- err = tmp_err;
- }
- }
-
- return err;
-}
-
-static int do_16x16_zerozero_search
-(
- VP9_COMP *cpi,
- int_mv *dst_mv,
- YV12_BUFFER_CONFIG *buf,
- int buf_mb_y_offset,
- YV12_BUFFER_CONFIG *ref,
- int mb_y_offset
-) {
- MACROBLOCK *const x = &cpi->mb;
- MACROBLOCKD *const xd = &x->e_mbd;
- unsigned int err;
- int n;
-
- for (n = 0; n < 16; n++) {
- BLOCKD *d = &xd->block[n];
- BLOCK *b = &x->block[n];
-
- b->base_src = &buf->y_buffer;
- b->src_stride = buf->y_stride;
- b->src = buf->y_stride * (n & 12) + (n & 3) * 4 + buf_mb_y_offset;
-
- d->base_pre = &ref->y_buffer;
- d->pre_stride = ref->y_stride;
- d->pre = ref->y_stride * (n & 12) + (n & 3) * 4 + mb_y_offset;
- }
-
- // Try zero MV first
- // FIXME should really use something like near/nearest MV and/or MV prediction
- xd->pre.y_buffer = ref->y_buffer + mb_y_offset;
- xd->pre.y_stride = ref->y_stride;
- // VARIANCE_INVOKE(&cpi->rtcd.variance, satd16x16)
- err = vp9_sad16x16(ref->y_buffer + mb_y_offset, ref->y_stride,
- xd->dst.y_buffer, xd->dst.y_stride, INT_MAX);
-
- dst_mv->as_int = 0;
-
- return err;
-}
-static int find_best_16x16_intra
-(
- VP9_COMP *cpi,
- YV12_BUFFER_CONFIG *buf,
- int mb_y_offset,
- MB_PREDICTION_MODE *pbest_mode
-) {
- MACROBLOCK *const x = &cpi->mb;
- MACROBLOCKD *const xd = &x->e_mbd;
- MB_PREDICTION_MODE best_mode = -1, mode;
- int best_err = INT_MAX;
-
- // calculate SATD for each intra prediction mode;
- // we're intentionally not doing 4x4, we just want a rough estimate
- for (mode = DC_PRED; mode <= TM_PRED; mode++) {
- unsigned int err;
-
- xd->mode_info_context->mbmi.mode = mode;
- vp9_build_intra_predictors_mby(xd);
- err = vp9_sad16x16(xd->predictor, 16, buf->y_buffer + mb_y_offset,
- buf->y_stride, best_err);
- // find best
- if (err < best_err) {
- best_err = err;
- best_mode = mode;
- }
- }
-
- if (pbest_mode)
- *pbest_mode = best_mode;
-
- return best_err;
-}
-
-static void update_mbgraph_mb_stats
-(
- VP9_COMP *cpi,
- MBGRAPH_MB_STATS *stats,
- YV12_BUFFER_CONFIG *buf,
- int mb_y_offset,
- YV12_BUFFER_CONFIG *golden_ref,
- int_mv *prev_golden_ref_mv,
- int gld_y_offset,
- YV12_BUFFER_CONFIG *alt_ref,
- int_mv *prev_alt_ref_mv,
- int arf_y_offset
-) {
- MACROBLOCK *const x = &cpi->mb;
- MACROBLOCKD *const xd = &x->e_mbd;
- int intra_error;
-
- // FIXME in practice we're completely ignoring chroma here
- xd->dst.y_buffer = buf->y_buffer + mb_y_offset;
-
- // do intra 16x16 prediction
- intra_error = find_best_16x16_intra(cpi, buf, mb_y_offset, &stats->ref[INTRA_FRAME].m.mode);
- if (intra_error <= 0)
- intra_error = 1;
- stats->ref[INTRA_FRAME].err = intra_error;
-
- // Golden frame MV search, if it exists and is different than last frame
- if (golden_ref) {
- int g_motion_error = do_16x16_motion_search(cpi, prev_golden_ref_mv,
- &stats->ref[GOLDEN_FRAME].m.mv,
- buf, mb_y_offset,
- golden_ref, gld_y_offset);
- stats->ref[GOLDEN_FRAME].err = g_motion_error;
- } else {
- stats->ref[GOLDEN_FRAME].err = INT_MAX;
- stats->ref[GOLDEN_FRAME].m.mv.as_int = 0;
- }
-
- // Alt-ref frame MV search, if it exists and is different than last/golden frame
- if (alt_ref) {
- // int a_motion_error = do_16x16_motion_search(cpi, prev_alt_ref_mv,
- // &stats->ref[ALTREF_FRAME].m.mv,
- // buf, mb_y_offset,
- // alt_ref, arf_y_offset);
-
- int a_motion_error =
- do_16x16_zerozero_search(cpi,
- &stats->ref[ALTREF_FRAME].m.mv,
- buf, mb_y_offset,
- alt_ref, arf_y_offset);
-
- stats->ref[ALTREF_FRAME].err = a_motion_error;
- } else {
- stats->ref[ALTREF_FRAME].err = INT_MAX;
- stats->ref[ALTREF_FRAME].m.mv.as_int = 0;
- }
-}
-
-static void update_mbgraph_frame_stats
-(
- VP9_COMP *cpi,
- MBGRAPH_FRAME_STATS *stats,
- YV12_BUFFER_CONFIG *buf,
- YV12_BUFFER_CONFIG *golden_ref,
- YV12_BUFFER_CONFIG *alt_ref
-) {
- MACROBLOCK *const x = &cpi->mb;
- VP9_COMMON *const cm = &cpi->common;
- MACROBLOCKD *const xd = &x->e_mbd;
- int mb_col, mb_row, offset = 0;
- int mb_y_offset = 0, arf_y_offset = 0, gld_y_offset = 0;
- int_mv arf_top_mv, gld_top_mv;
- MODE_INFO mi_local;
-
- // Set up limit values for motion vectors to prevent them extending outside the UMV borders
- arf_top_mv.as_int = 0;
- gld_top_mv.as_int = 0;
- x->mv_row_min = -(VP8BORDERINPIXELS - 16 - INTERP_EXTEND);
- x->mv_row_max = (cm->mb_rows - 1) * 16 + VP8BORDERINPIXELS - 16 - INTERP_EXTEND;
- xd->up_available = 0;
- xd->dst.y_stride = buf->y_stride;
- xd->pre.y_stride = buf->y_stride;
- xd->dst.uv_stride = buf->uv_stride;
- xd->mode_info_context = &mi_local;
-
- for (mb_row = 0; mb_row < cm->mb_rows; mb_row++) {
- int_mv arf_left_mv, gld_left_mv;
- int mb_y_in_offset = mb_y_offset;
- int arf_y_in_offset = arf_y_offset;
- int gld_y_in_offset = gld_y_offset;
-
- // Set up limit values for motion vectors to prevent them extending outside the UMV borders
- arf_left_mv.as_int = arf_top_mv.as_int;
- gld_left_mv.as_int = gld_top_mv.as_int;
- x->mv_col_min = -(VP8BORDERINPIXELS - 16 - INTERP_EXTEND);
- x->mv_col_max = (cm->mb_cols - 1) * 16 + VP8BORDERINPIXELS - 16 - INTERP_EXTEND;
- xd->left_available = 0;
-
- for (mb_col = 0; mb_col < cm->mb_cols; mb_col++) {
- MBGRAPH_MB_STATS *mb_stats = &stats->mb_stats[offset + mb_col];
-
- update_mbgraph_mb_stats(cpi, mb_stats, buf, mb_y_in_offset,
- golden_ref, &gld_left_mv, gld_y_in_offset,
- alt_ref, &arf_left_mv, arf_y_in_offset);
- arf_left_mv.as_int = mb_stats->ref[ALTREF_FRAME].m.mv.as_int;
- gld_left_mv.as_int = mb_stats->ref[GOLDEN_FRAME].m.mv.as_int;
- if (mb_col == 0) {
- arf_top_mv.as_int = arf_left_mv.as_int;
- gld_top_mv.as_int = gld_left_mv.as_int;
- }
- xd->left_available = 1;
- mb_y_in_offset += 16;
- gld_y_in_offset += 16;
- arf_y_in_offset += 16;
- x->mv_col_min -= 16;
- x->mv_col_max -= 16;
- }
- xd->up_available = 1;
- mb_y_offset += buf->y_stride * 16;
- gld_y_offset += golden_ref->y_stride * 16;
- if (alt_ref)
- arf_y_offset += alt_ref->y_stride * 16;
- x->mv_row_min -= 16;
- x->mv_row_max -= 16;
- offset += cm->mb_cols;
- }
-}
-
-// void separate_arf_mbs_byzz
-static void separate_arf_mbs(VP9_COMP *cpi) {
- VP9_COMMON *const cm = &cpi->common;
- int mb_col, mb_row, offset, i;
- int ncnt[4];
- int n_frames = cpi->mbgraph_n_frames;
-
- int *arf_not_zz;
-
- CHECK_MEM_ERROR(arf_not_zz,
- vpx_calloc(cm->mb_rows * cm->mb_cols * sizeof(*arf_not_zz), 1));
-
- vpx_memset(arf_not_zz, 0, sizeof(arf_not_zz));
-
- // We are not interested in results beyond the alt ref itself.
- if (n_frames > cpi->frames_till_gf_update_due)
- n_frames = cpi->frames_till_gf_update_due;
-
- // defer cost to reference frames
- for (i = n_frames - 1; i >= 0; i--) {
- MBGRAPH_FRAME_STATS *frame_stats = &cpi->mbgraph_stats[i];
-
- for (offset = 0, mb_row = 0; mb_row < cm->mb_rows;
- offset += cm->mb_cols, mb_row++) {
- for (mb_col = 0; mb_col < cm->mb_cols; mb_col++) {
- MBGRAPH_MB_STATS *mb_stats =
- &frame_stats->mb_stats[offset + mb_col];
-
- int altref_err = mb_stats->ref[ALTREF_FRAME].err;
- int intra_err = mb_stats->ref[INTRA_FRAME ].err;
- int golden_err = mb_stats->ref[GOLDEN_FRAME].err;
-
- // Test for altref vs intra and gf and that its mv was 0,0.
- if ((altref_err > 1000) ||
- (altref_err > intra_err) ||
- (altref_err > golden_err)) {
- arf_not_zz[offset + mb_col]++;
- }
- }
- }
- }
-
- vpx_memset(ncnt, 0, sizeof(ncnt));
- for (offset = 0, mb_row = 0; mb_row < cm->mb_rows;
- offset += cm->mb_cols, mb_row++) {
- for (mb_col = 0; mb_col < cm->mb_cols; mb_col++) {
- // If any of the blocks in the sequence failed then the MB
- // goes in segment 0
- if (arf_not_zz[offset + mb_col]) {
- ncnt[0]++;
- cpi->segmentation_map[offset + mb_col] = 0;
- } else {
- ncnt[1]++;
- cpi->segmentation_map[offset + mb_col] = 1;
- }
- }
- }
-
- // Only bother with segmentation if over 10% of the MBs are in the static segment
- // if ( ncnt[1] && (ncnt[0] / ncnt[1] < 10) )
- if (1) {
- // Note % of blocks that are marked as static
- if (cm->MBs)
- cpi->static_mb_pct = (ncnt[1] * 100) / cm->MBs;
-
- // This error case should not be reachable as this function should
- // never be called with the common data structure uninitialized.
- else
- cpi->static_mb_pct = 0;
-
- cpi->seg0_cnt = ncnt[0];
- vp9_enable_segmentation((VP9_PTR) cpi);
- } else {
- cpi->static_mb_pct = 0;
- vp9_disable_segmentation((VP9_PTR) cpi);
- }
-
- // Free locally allocated storage
- vpx_free(arf_not_zz);
-}
-
-void vp9_update_mbgraph_stats
-(
- VP9_COMP *cpi
-) {
- VP9_COMMON *const cm = &cpi->common;
- int i, n_frames = vp9_lookahead_depth(cpi->lookahead);
- YV12_BUFFER_CONFIG *golden_ref = &cm->yv12_fb[cm->gld_fb_idx];
-
- // we need to look ahead beyond where the ARF transitions into
- // being a GF - so exit if we don't look ahead beyond that
- if (n_frames <= cpi->frames_till_gf_update_due)
- return;
- if (n_frames > cpi->common.frames_till_alt_ref_frame)
- n_frames = cpi->common.frames_till_alt_ref_frame;
- if (n_frames > MAX_LAG_BUFFERS)
- n_frames = MAX_LAG_BUFFERS;
-
- cpi->mbgraph_n_frames = n_frames;
- for (i = 0; i < n_frames; i++) {
- MBGRAPH_FRAME_STATS *frame_stats = &cpi->mbgraph_stats[i];
- vpx_memset(frame_stats->mb_stats, 0,
- cm->mb_rows * cm->mb_cols * sizeof(*cpi->mbgraph_stats[i].mb_stats));
- }
-
- // do motion search to find contribution of each reference to data
- // later on in this GF group
- // FIXME really, the GF/last MC search should be done forward, and
- // the ARF MC search backwards, to get optimal results for MV caching
- for (i = 0; i < n_frames; i++) {
- MBGRAPH_FRAME_STATS *frame_stats = &cpi->mbgraph_stats[i];
- struct lookahead_entry *q_cur =
- vp9_lookahead_peek(cpi->lookahead, i);
-
- assert(q_cur != NULL);
-
- update_mbgraph_frame_stats(cpi, frame_stats, &q_cur->img,
- golden_ref, cpi->Source);
- }
-
- vp9_clear_system_state(); // __asm emms;
-
- separate_arf_mbs(cpi);
-}
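
separate_arf_mbs() above boils the per-macroblock graph statistics down to a two-segment map: a macroblock is flagged into segment 0 if, in any frame of the group, its zero-MV alt-ref error exceeds 1000 or is worse than its intra or golden error, and otherwise lands in the "static" segment 1, whose share of the frame becomes cpi->static_mb_pct. The self-contained sketch below replays that flag-and-count step for a single frame of made-up error values; the thresholds mirror the code above, while the tiny 2x3 grid, the numbers and the helper struct are purely illustrative.

#include <stdio.h>

/* Per-macroblock errors for one frame of the group (values are made up). */
struct mb_errs { int altref, intra, golden; };

int main(void) {
  /* A tiny 2x3 macroblock grid, i.e. 6 MBs, over a single frame. */
  enum { MB_ROWS = 2, MB_COLS = 3, MBS = MB_ROWS * MB_COLS };
  const struct mb_errs stats[MBS] = {
    { 200,  900,  800}, {  50,   40,   70}, {1200, 3000, 2500},
    { 400,  500,  600}, { 700,  650,  900}, {  30,  100,  100},
  };
  int arf_not_zz[MBS] = {0};
  int segmentation_map[MBS];
  int ncnt[2] = {0, 0};
  int i;

  /* Same test as separate_arf_mbs(): flag MBs whose zero-MV alt-ref error is
   * large or beaten by the intra or golden error. */
  for (i = 0; i < MBS; i++)
    if (stats[i].altref > 1000 || stats[i].altref > stats[i].intra ||
        stats[i].altref > stats[i].golden)
      arf_not_zz[i]++;

  /* Flagged MBs go to segment 0, the rest to the "static" segment 1. */
  for (i = 0; i < MBS; i++) {
    segmentation_map[i] = arf_not_zz[i] ? 0 : 1;
    ncnt[segmentation_map[i]]++;
  }

  printf("static MBs: %d of %d (%d%%)\n", ncnt[1], MBS, ncnt[1] * 100 / MBS);
  return 0;
}

With these numbers three of the six macroblocks pass the test, so the static segment covers 50% of the frame.
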
--- a/vp8/encoder/mbgraph.h
+++ /dev/null
@@ -1,16 +1,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#ifndef __INC_MBGRAPH_H__
-#define __INC_MBGRAPH_H__ 1
-
-extern void vp9_update_mbgraph_stats(VP9_COMP *cpi);
-
-#endif /* __INC_MBGRAPH_H__ */
--- a/vp8/encoder/mcomp.c
+++ /dev/null
@@ -1,2203 +1,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include "vp8/encoder/onyx_int.h"
-#include "mcomp.h"
-#include "vpx_mem/vpx_mem.h"
-#include "vpx_ports/config.h"
-#include <stdio.h>
-#include <limits.h>
-#include <math.h>
-#include "vp8/common/findnearmv.h"
-
-#ifdef ENTROPY_STATS
-static int mv_ref_ct [31] [4] [2];
-static int mv_mode_cts [4] [2];
-#endif
-
-void vp9_clamp_mv_min_max(MACROBLOCK *x, int_mv *ref_mv) {
- int col_min = (ref_mv->as_mv.col >> 3) - MAX_FULL_PEL_VAL +
- ((ref_mv->as_mv.col & 7) ? 1 : 0);
- int row_min = (ref_mv->as_mv.row >> 3) - MAX_FULL_PEL_VAL +
- ((ref_mv->as_mv.row & 7) ? 1 : 0);
- int col_max = (ref_mv->as_mv.col >> 3) + MAX_FULL_PEL_VAL;
- int row_max = (ref_mv->as_mv.row >> 3) + MAX_FULL_PEL_VAL;
-
- /* Get intersection of UMV window and valid MV window to reduce # of checks in diamond search. */
- if (x->mv_col_min < col_min)
- x->mv_col_min = col_min;
- if (x->mv_col_max > col_max)
- x->mv_col_max = col_max;
- if (x->mv_row_min < row_min)
- x->mv_row_min = row_min;
- if (x->mv_row_max > row_max)
- x->mv_row_max = row_max;
-}
-
-int vp9_mv_bit_cost(int_mv *mv, int_mv *ref, DEC_MVCOSTS,
- int Weight, int ishp) {
- MV v;
- v.row = (mv->as_mv.row - ref->as_mv.row);
- v.col = (mv->as_mv.col - ref->as_mv.col);
- return ((mvjcost[vp9_get_mv_joint(v)] +
- mvcost[0][v.row] + mvcost[1][v.col]) *
- Weight) >> 7;
-}
-
-static int mv_err_cost(int_mv *mv, int_mv *ref, DEC_MVCOSTS,
- int error_per_bit, int ishp) {
- if (mvcost) {
- MV v;
- v.row = (mv->as_mv.row - ref->as_mv.row);
- v.col = (mv->as_mv.col - ref->as_mv.col);
- return ((mvjcost[vp9_get_mv_joint(v)] +
- mvcost[0][v.row] + mvcost[1][v.col]) *
- error_per_bit + 128) >> 8;
- }
- return 0;
-}
-
-static int mvsad_err_cost(int_mv *mv, int_mv *ref, DEC_MVSADCOSTS,
- int error_per_bit) {
-
- if (mvsadcost) {
- MV v;
- v.row = (mv->as_mv.row - ref->as_mv.row);
- v.col = (mv->as_mv.col - ref->as_mv.col);
- return ((mvjsadcost[vp9_get_mv_joint(v)] +
- mvsadcost[0][v.row] + mvsadcost[1][v.col]) *
- error_per_bit + 128) >> 8;
- }
- return 0;
-}
-
-void vp9_init_dsmotion_compensation(MACROBLOCK *x, int stride) {
- int Len;
- int search_site_count = 0;
-
-
- // Generate offsets for 4 search sites per step.
- Len = MAX_FIRST_STEP;
- x->ss[search_site_count].mv.col = 0;
- x->ss[search_site_count].mv.row = 0;
- x->ss[search_site_count].offset = 0;
- search_site_count++;
-
- while (Len > 0) {
-
- // Compute offsets for search sites.
- x->ss[search_site_count].mv.col = 0;
- x->ss[search_site_count].mv.row = -Len;
- x->ss[search_site_count].offset = -Len * stride;
- search_site_count++;
-
- // Compute offsets for search sites.
- x->ss[search_site_count].mv.col = 0;
- x->ss[search_site_count].mv.row = Len;
- x->ss[search_site_count].offset = Len * stride;
- search_site_count++;
-
- // Compute offsets for search sites.
- x->ss[search_site_count].mv.col = -Len;
- x->ss[search_site_count].mv.row = 0;
- x->ss[search_site_count].offset = -Len;
- search_site_count++;
-
- // Compute offsets for search sites.
- x->ss[search_site_count].mv.col = Len;
- x->ss[search_site_count].mv.row = 0;
- x->ss[search_site_count].offset = Len;
- search_site_count++;
-
- // Contract.
- Len /= 2;
- }
-
- x->ss_count = search_site_count;
- x->searches_per_step = 4;
-}
-
-void vp9_init3smotion_compensation(MACROBLOCK *x, int stride) {
- int Len;
- int search_site_count = 0;
-
- // Generate offsets for 8 search sites per step.
- Len = MAX_FIRST_STEP;
- x->ss[search_site_count].mv.col = 0;
- x->ss[search_site_count].mv.row = 0;
- x->ss[search_site_count].offset = 0;
- search_site_count++;
-
- while (Len > 0) {
-
- // Compute offsets for search sites.
- x->ss[search_site_count].mv.col = 0;
- x->ss[search_site_count].mv.row = -Len;
- x->ss[search_site_count].offset = -Len * stride;
- search_site_count++;
-
- // Compute offsets for search sites.
- x->ss[search_site_count].mv.col = 0;
- x->ss[search_site_count].mv.row = Len;
- x->ss[search_site_count].offset = Len * stride;
- search_site_count++;
-
- // Compute offsets for search sites.
- x->ss[search_site_count].mv.col = -Len;
- x->ss[search_site_count].mv.row = 0;
- x->ss[search_site_count].offset = -Len;
- search_site_count++;
-
- // Compute offsets for search sites.
- x->ss[search_site_count].mv.col = Len;
- x->ss[search_site_count].mv.row = 0;
- x->ss[search_site_count].offset = Len;
- search_site_count++;
-
- // Compute offsets for search sites.
- x->ss[search_site_count].mv.col = -Len;
- x->ss[search_site_count].mv.row = -Len;
- x->ss[search_site_count].offset = -Len * stride - Len;
- search_site_count++;
-
- // Compute offsets for search sites.
- x->ss[search_site_count].mv.col = Len;
- x->ss[search_site_count].mv.row = -Len;
- x->ss[search_site_count].offset = -Len * stride + Len;
- search_site_count++;
-
- // Compute offsets for search sites.
- x->ss[search_site_count].mv.col = -Len;
- x->ss[search_site_count].mv.row = Len;
- x->ss[search_site_count].offset = Len * stride - Len;
- search_site_count++;
-
- // Compute offsets for search sites.
- x->ss[search_site_count].mv.col = Len;
- x->ss[search_site_count].mv.row = Len;
- x->ss[search_site_count].offset = Len * stride + Len;
- search_site_count++;
-
- // Contract.
- Len /= 2;
- }
-
- x->ss_count = search_site_count;
- x->searches_per_step = 8;
-}
-
-/*
- * To avoid the penalty of cache-line-crossing reads, preload the reference
- * area into a small buffer that is aligned so reads from it never cross a
- * cache line. This reduces the CPU cycles spent reading ref data in the
- * sub-pixel filter functions.
- * TODO: Currently, since the sub-pixel search range here is -3 ~ 3, copy a
- * 22 rows x 32 cols area, which is enough for a 16x16 macroblock. Later, for
- * SPLITMV, we could reduce the area.
- */
-
-/* estimated cost of a motion vector (r,c) */
-#define MVC(r, c) \
- (mvcost ? \
- ((mvjcost[((r) != rr) * 2 + ((c) != rc)] + \
- mvcost[0][((r) - rr)] + mvcost[1][((c) - rc)]) * \
- error_per_bit + 128) >> 8 : 0)
-
-#define SP(x) (((x) & 7) << 1) // convert motion vector component to offset
- // for svf calc
-
-#define IFMVCV(r, c, s, e) \
- if (c >= minc && c <= maxc && r >= minr && r <= maxr) \
- s \
- else \
- e;
-
-/* pointer to predictor base of a motion vector */
-#define PRE(r, c) (y + (((r) >> 3) * y_stride + ((c) >> 3) -(offset)))
-
-/* returns subpixel variance error function */
-#define DIST(r, c) \
- vfp->svf(PRE(r, c), y_stride, SP(c), SP(r), z, b->src_stride, &sse)
-
-/* checks if (r, c) has better score than previous best */
-#define CHECK_BETTER(v, r, c) \
- IFMVCV(r, c, { \
- thismse = (DIST(r, c)); \
- if ((v = MVC(r, c) + thismse) < besterr) { \
- besterr = v; \
- br = r; \
- bc = c; \
- *distortion = thismse; \
- *sse1 = sse; \
- } \
- }, \
- v = INT_MAX;)
-
-#define MIN(x,y) (((x)<(y))?(x):(y))
-#define MAX(x,y) (((x)>(y))?(x):(y))
-
-int vp9_find_best_sub_pixel_step_iteratively(MACROBLOCK *x, BLOCK *b, BLOCKD *d,
- int_mv *bestmv, int_mv *ref_mv,
- int error_per_bit,
- const vp9_variance_fn_ptr_t *vfp,
- DEC_MVCOSTS,
- int *distortion,
- unsigned int *sse1) {
- unsigned char *z = (*(b->base_src) + b->src);
- MACROBLOCKD *xd = &x->e_mbd;
-
- int rr, rc, br, bc, hstep;
- int tr, tc;
- unsigned int besterr = INT_MAX;
- unsigned int left, right, up, down, diag;
- unsigned int sse;
- unsigned int whichdir;
- unsigned int halfiters = 4;
- unsigned int quarteriters = 4;
- unsigned int eighthiters = 4;
- int thismse;
- int maxc, minc, maxr, minr;
- int y_stride;
- int offset;
- int usehp = xd->allow_high_precision_mv;
-
-#if !CONFIG_SUPERBLOCKS && (ARCH_X86 || ARCH_X86_64)
- unsigned char *y0 = *(d->base_pre) + d->pre + (bestmv->as_mv.row) * d->pre_stride + bestmv->as_mv.col;
- unsigned char *y;
- int buf_r1, buf_r2, buf_c1, buf_c2;
-
- // Clamping to avoid out-of-range data access
- buf_r1 = ((bestmv->as_mv.row - INTERP_EXTEND) < x->mv_row_min) ?
- (bestmv->as_mv.row - x->mv_row_min) : INTERP_EXTEND - 1;
- buf_r2 = ((bestmv->as_mv.row + INTERP_EXTEND) > x->mv_row_max) ?
- (x->mv_row_max - bestmv->as_mv.row) : INTERP_EXTEND - 1;
- buf_c1 = ((bestmv->as_mv.col - INTERP_EXTEND) < x->mv_col_min) ?
- (bestmv->as_mv.col - x->mv_col_min) : INTERP_EXTEND - 1;
- buf_c2 = ((bestmv->as_mv.col + INTERP_EXTEND) > x->mv_col_max) ?
- (x->mv_col_max - bestmv->as_mv.col) : INTERP_EXTEND - 1;
- y_stride = 32;
-
- /* Copy to intermediate buffer before searching. */
- vfp->copymem(y0 - buf_c1 - d->pre_stride * buf_r1, d->pre_stride, xd->y_buf, y_stride, 16 + buf_r1 + buf_r2);
- y = xd->y_buf + y_stride * buf_r1 + buf_c1;
-#else
- unsigned char *y = *(d->base_pre) + d->pre + (bestmv->as_mv.row) * d->pre_stride + bestmv->as_mv.col;
- y_stride = d->pre_stride;
-#endif
-
- rr = ref_mv->as_mv.row;
- rc = ref_mv->as_mv.col;
- br = bestmv->as_mv.row << 3;
- bc = bestmv->as_mv.col << 3;
- hstep = 4;
- minc = MAX(x->mv_col_min << 3, (ref_mv->as_mv.col) - ((1 << MV_MAX_BITS) - 1));
- maxc = MIN(x->mv_col_max << 3, (ref_mv->as_mv.col) + ((1 << MV_MAX_BITS) - 1));
- minr = MAX(x->mv_row_min << 3, (ref_mv->as_mv.row) - ((1 << MV_MAX_BITS) - 1));
- maxr = MIN(x->mv_row_max << 3, (ref_mv->as_mv.row) + ((1 << MV_MAX_BITS) - 1));
-
- tr = br;
- tc = bc;
-
-
- offset = (bestmv->as_mv.row) * y_stride + bestmv->as_mv.col;
-
- // central mv
- bestmv->as_mv.row <<= 3;
- bestmv->as_mv.col <<= 3;
-
- // calculate central point error
- besterr = vfp->vf(y, y_stride, z, b->src_stride, sse1);
- *distortion = besterr;
- besterr += mv_err_cost(bestmv, ref_mv, MVCOSTS,
- error_per_bit, xd->allow_high_precision_mv);
-
- // TODO: Each subsequent iteration checks at least one point in common
- // with the last iteration (it could be 2 if the diagonal was selected).
- while (--halfiters) {
- // 1/2 pel
- CHECK_BETTER(left, tr, tc - hstep);
- CHECK_BETTER(right, tr, tc + hstep);
- CHECK_BETTER(up, tr - hstep, tc);
- CHECK_BETTER(down, tr + hstep, tc);
-
- whichdir = (left < right ? 0 : 1) + (up < down ? 0 : 2);
-
- switch (whichdir) {
- case 0:
- CHECK_BETTER(diag, tr - hstep, tc - hstep);
- break;
- case 1:
- CHECK_BETTER(diag, tr - hstep, tc + hstep);
- break;
- case 2:
- CHECK_BETTER(diag, tr + hstep, tc - hstep);
- break;
- case 3:
- CHECK_BETTER(diag, tr + hstep, tc + hstep);
- break;
- }
-
- // no reason to check the same one again.
- if (tr == br && tc == bc)
- break;
-
- tr = br;
- tc = bc;
- }
-
- // TODO: Each subsequent iteration checks at least one point in common with
- // the last iteration (it could be 2 if the diagonal was selected). 1/4 pel
- hstep >>= 1;
- while (--quarteriters) {
- CHECK_BETTER(left, tr, tc - hstep);
- CHECK_BETTER(right, tr, tc + hstep);
- CHECK_BETTER(up, tr - hstep, tc);
- CHECK_BETTER(down, tr + hstep, tc);
-
- whichdir = (left < right ? 0 : 1) + (up < down ? 0 : 2);
-
- switch (whichdir) {
- case 0:
- CHECK_BETTER(diag, tr - hstep, tc - hstep);
- break;
- case 1:
- CHECK_BETTER(diag, tr - hstep, tc + hstep);
- break;
- case 2:
- CHECK_BETTER(diag, tr + hstep, tc - hstep);
- break;
- case 3:
- CHECK_BETTER(diag, tr + hstep, tc + hstep);
- break;
- }
-
- // no reason to check the same one again.
- if (tr == br && tc == bc)
- break;
-
- tr = br;
- tc = bc;
- }
-
- if (xd->allow_high_precision_mv) {
- usehp = vp9_use_nmv_hp(&ref_mv->as_mv);
- } else {
- usehp = 0;
- }
-
- if (usehp) {
- hstep >>= 1;
- while (--eighthiters) {
- CHECK_BETTER(left, tr, tc - hstep);
- CHECK_BETTER(right, tr, tc + hstep);
- CHECK_BETTER(up, tr - hstep, tc);
- CHECK_BETTER(down, tr + hstep, tc);
-
- whichdir = (left < right ? 0 : 1) + (up < down ? 0 : 2);
-
- switch (whichdir) {
- case 0:
- CHECK_BETTER(diag, tr - hstep, tc - hstep);
- break;
- case 1:
- CHECK_BETTER(diag, tr - hstep, tc + hstep);
- break;
- case 2:
- CHECK_BETTER(diag, tr + hstep, tc - hstep);
- break;
- case 3:
- CHECK_BETTER(diag, tr + hstep, tc + hstep);
- break;
- }
-
- // no reason to check the same one again.
- if (tr == br && tc == bc)
- break;
-
- tr = br;
- tc = bc;
- }
- }
- bestmv->as_mv.row = br;
- bestmv->as_mv.col = bc;
-
- if ((abs(bestmv->as_mv.col - ref_mv->as_mv.col) > (MAX_FULL_PEL_VAL << 3)) ||
- (abs(bestmv->as_mv.row - ref_mv->as_mv.row) > (MAX_FULL_PEL_VAL << 3)))
- return INT_MAX;
-
- return besterr;
-}
-#undef MVC
-#undef PRE
-#undef DIST
-#undef IFMVCV
-#undef CHECK_BETTER
-#undef MIN
-#undef MAX
-
-int vp9_find_best_sub_pixel_step(MACROBLOCK *x, BLOCK *b, BLOCKD *d,
- int_mv *bestmv, int_mv *ref_mv,
- int error_per_bit,
- const vp9_variance_fn_ptr_t *vfp,
- DEC_MVCOSTS, int *distortion,
- unsigned int *sse1) {
- int bestmse = INT_MAX;
- int_mv startmv;
- int_mv this_mv;
- int_mv orig_mv;
- int yrow_movedback = 0, ycol_movedback = 0;
- unsigned char *z = (*(b->base_src) + b->src);
- int left, right, up, down, diag;
- unsigned int sse;
- int whichdir;
- int thismse;
- int y_stride;
- MACROBLOCKD *xd = &x->e_mbd;
- int usehp = xd->allow_high_precision_mv;
-
-#if !CONFIG_SUPERBLOCKS && (ARCH_X86 || ARCH_X86_64)
- unsigned char *y0 = *(d->base_pre) + d->pre + (bestmv->as_mv.row) * d->pre_stride + bestmv->as_mv.col;
- unsigned char *y;
-
- y_stride = 32;
- /* Copy 18 rows x 32 cols area to intermediate buffer before searching. */
- vfp->copymem(y0 - 1 - d->pre_stride, d->pre_stride, xd->y_buf, y_stride, 18);
- y = xd->y_buf + y_stride + 1;
-#else
- unsigned char *y = *(d->base_pre) + d->pre + (bestmv->as_mv.row) * d->pre_stride + bestmv->as_mv.col;
- y_stride = d->pre_stride;
-#endif
-
- // central mv
- bestmv->as_mv.row <<= 3;
- bestmv->as_mv.col <<= 3;
- startmv = *bestmv;
- orig_mv = *bestmv;
-
- // calculate central point error
- bestmse = vfp->vf(y, y_stride, z, b->src_stride, sse1);
- *distortion = bestmse;
- bestmse += mv_err_cost(bestmv, ref_mv, MVCOSTS, error_per_bit,
- xd->allow_high_precision_mv);
-
- // go left then right and check error
- this_mv.as_mv.row = startmv.as_mv.row;
- this_mv.as_mv.col = ((startmv.as_mv.col - 8) | 4);
- thismse = vfp->svf_halfpix_h(y - 1, y_stride, z, b->src_stride, &sse);
- left = thismse + mv_err_cost(&this_mv, ref_mv, MVCOSTS, error_per_bit,
- xd->allow_high_precision_mv);
-
- if (left < bestmse) {
- *bestmv = this_mv;
- bestmse = left;
- *distortion = thismse;
- *sse1 = sse;
- }
-
- this_mv.as_mv.col += 8;
- thismse = vfp->svf_halfpix_h(y, y_stride, z, b->src_stride, &sse);
- right = thismse + mv_err_cost(&this_mv, ref_mv, MVCOSTS, error_per_bit,
- xd->allow_high_precision_mv);
-
- if (right < bestmse) {
- *bestmv = this_mv;
- bestmse = right;
- *distortion = thismse;
- *sse1 = sse;
- }
-
- // go up then down and check error
- this_mv.as_mv.col = startmv.as_mv.col;
- this_mv.as_mv.row = ((startmv.as_mv.row - 8) | 4);
- thismse = vfp->svf_halfpix_v(y - y_stride, y_stride, z, b->src_stride, &sse);
- up = thismse + mv_err_cost(&this_mv, ref_mv, MVCOSTS, error_per_bit,
- xd->allow_high_precision_mv);
-
- if (up < bestmse) {
- *bestmv = this_mv;
- bestmse = up;
- *distortion = thismse;
- *sse1 = sse;
- }
-
- this_mv.as_mv.row += 8;
- thismse = vfp->svf_halfpix_v(y, y_stride, z, b->src_stride, &sse);
- down = thismse + mv_err_cost(&this_mv, ref_mv, MVCOSTS, error_per_bit,
- xd->allow_high_precision_mv);
-
- if (down < bestmse) {
- *bestmv = this_mv;
- bestmse = down;
- *distortion = thismse;
- *sse1 = sse;
- }
-
-
- // now check 1 more diagonal
- whichdir = (left < right ? 0 : 1) + (up < down ? 0 : 2);
- // for(whichdir =0;whichdir<4;whichdir++)
- // {
- this_mv = startmv;
-
- switch (whichdir) {
- case 0:
- this_mv.as_mv.col = (this_mv.as_mv.col - 8) | 4;
- this_mv.as_mv.row = (this_mv.as_mv.row - 8) | 4;
- thismse = vfp->svf_halfpix_hv(y - 1 - y_stride, y_stride, z, b->src_stride, &sse);
- break;
- case 1:
- this_mv.as_mv.col += 4;
- this_mv.as_mv.row = (this_mv.as_mv.row - 8) | 4;
- thismse = vfp->svf_halfpix_hv(y - y_stride, y_stride, z, b->src_stride, &sse);
- break;
- case 2:
- this_mv.as_mv.col = (this_mv.as_mv.col - 8) | 4;
- this_mv.as_mv.row += 4;
- thismse = vfp->svf_halfpix_hv(y - 1, y_stride, z, b->src_stride, &sse);
- break;
- case 3:
- default:
- this_mv.as_mv.col += 4;
- this_mv.as_mv.row += 4;
- thismse = vfp->svf_halfpix_hv(y, y_stride, z, b->src_stride, &sse);
- break;
- }
-
- diag = thismse + mv_err_cost(&this_mv, ref_mv, MVCOSTS, error_per_bit,
- xd->allow_high_precision_mv);
-
- if (diag < bestmse) {
- *bestmv = this_mv;
- bestmse = diag;
- *distortion = thismse;
- *sse1 = sse;
- }
-
-// }
-
-
- // time to check quarter pels.
- if (bestmv->as_mv.row < startmv.as_mv.row) {
- y -= y_stride;
- yrow_movedback = 1;
- }
-
- if (bestmv->as_mv.col < startmv.as_mv.col) {
- y--;
- ycol_movedback = 1;
- }
-
- startmv = *bestmv;
-
-
-
- // go left then right and check error
- this_mv.as_mv.row = startmv.as_mv.row;
-
- if (startmv.as_mv.col & 7) {
- this_mv.as_mv.col = startmv.as_mv.col - 2;
- thismse = vfp->svf(y, y_stride,
- SP(this_mv.as_mv.col), SP(this_mv.as_mv.row),
- z, b->src_stride, &sse);
- } else {
- this_mv.as_mv.col = (startmv.as_mv.col - 8) | 6;
- thismse = vfp->svf(y - 1, y_stride, SP(6), SP(this_mv.as_mv.row), z,
- b->src_stride, &sse);
- }
-
- left = thismse + mv_err_cost(&this_mv, ref_mv, MVCOSTS, error_per_bit,
- xd->allow_high_precision_mv);
-
- if (left < bestmse) {
- *bestmv = this_mv;
- bestmse = left;
- *distortion = thismse;
- *sse1 = sse;
- }
-
- this_mv.as_mv.col += 4;
- thismse = vfp->svf(y, y_stride,
- SP(this_mv.as_mv.col), SP(this_mv.as_mv.row),
- z, b->src_stride, &sse);
- right = thismse + mv_err_cost(&this_mv, ref_mv, MVCOSTS, error_per_bit,
- xd->allow_high_precision_mv);
-
- if (right < bestmse) {
- *bestmv = this_mv;
- bestmse = right;
- *distortion = thismse;
- *sse1 = sse;
- }
-
- // go up then down and check error
- this_mv.as_mv.col = startmv.as_mv.col;
-
- if (startmv.as_mv.row & 7) {
- this_mv.as_mv.row = startmv.as_mv.row - 2;
- thismse = vfp->svf(y, y_stride,
- SP(this_mv.as_mv.col), SP(this_mv.as_mv.row),
- z, b->src_stride, &sse);
- } else {
- this_mv.as_mv.row = (startmv.as_mv.row - 8) | 6;
- thismse = vfp->svf(y - y_stride, y_stride, SP(this_mv.as_mv.col), SP(6),
- z, b->src_stride, &sse);
- }
-
- up = thismse + mv_err_cost(&this_mv, ref_mv, MVCOSTS, error_per_bit,
- xd->allow_high_precision_mv);
-
- if (up < bestmse) {
- *bestmv = this_mv;
- bestmse = up;
- *distortion = thismse;
- *sse1 = sse;
- }
-
- this_mv.as_mv.row += 4;
- thismse = vfp->svf(y, y_stride, SP(this_mv.as_mv.col), SP(this_mv.as_mv.row),
- z, b->src_stride, &sse);
- down = thismse + mv_err_cost(&this_mv, ref_mv, MVCOSTS, error_per_bit,
- xd->allow_high_precision_mv);
-
- if (down < bestmse) {
- *bestmv = this_mv;
- bestmse = down;
- *distortion = thismse;
- *sse1 = sse;
- }
-
-
- // now check 1 more diagonal
- whichdir = (left < right ? 0 : 1) + (up < down ? 0 : 2);
-
-// for(whichdir=0;whichdir<4;whichdir++)
-// {
- this_mv = startmv;
-
- switch (whichdir) {
- case 0:
-
- if (startmv.as_mv.row & 7) {
- this_mv.as_mv.row -= 2;
-
- if (startmv.as_mv.col & 7) {
- this_mv.as_mv.col -= 2;
- thismse = vfp->svf(y, y_stride, SP(this_mv.as_mv.col), SP(this_mv.as_mv.row), z, b->src_stride, &sse);
- } else {
- this_mv.as_mv.col = (startmv.as_mv.col - 8) | 6;
- thismse = vfp->svf(y - 1, y_stride, SP(6), SP(this_mv.as_mv.row), z, b->src_stride, &sse);
- }
- } else {
- this_mv.as_mv.row = (startmv.as_mv.row - 8) | 6;
-
- if (startmv.as_mv.col & 7) {
- this_mv.as_mv.col -= 2;
- thismse = vfp->svf(y - y_stride, y_stride, SP(this_mv.as_mv.col), SP(6), z, b->src_stride, &sse);
- } else {
- this_mv.as_mv.col = (startmv.as_mv.col - 8) | 6;
- thismse = vfp->svf(y - y_stride - 1, y_stride, SP(6), SP(6), z, b->src_stride, &sse);
- }
- }
-
- break;
- case 1:
- this_mv.as_mv.col += 2;
-
- if (startmv.as_mv.row & 7) {
- this_mv.as_mv.row -= 2;
- thismse = vfp->svf(y, y_stride, SP(this_mv.as_mv.col), SP(this_mv.as_mv.row), z, b->src_stride, &sse);
- } else {
- this_mv.as_mv.row = (startmv.as_mv.row - 8) | 6;
- thismse = vfp->svf(y - y_stride, y_stride, SP(this_mv.as_mv.col), SP(6), z, b->src_stride, &sse);
- }
-
- break;
- case 2:
- this_mv.as_mv.row += 2;
-
- if (startmv.as_mv.col & 7) {
- this_mv.as_mv.col -= 2;
- thismse = vfp->svf(y, y_stride, SP(this_mv.as_mv.col), SP(this_mv.as_mv.row),
- z, b->src_stride, &sse);
- } else {
- this_mv.as_mv.col = (startmv.as_mv.col - 8) | 6;
- thismse = vfp->svf(y - 1, y_stride, SP(6), SP(this_mv.as_mv.row), z,
- b->src_stride, &sse);
- }
-
- break;
- case 3:
- this_mv.as_mv.col += 2;
- this_mv.as_mv.row += 2;
- thismse = vfp->svf(y, y_stride,
- SP(this_mv.as_mv.col), SP(this_mv.as_mv.row),
- z, b->src_stride, &sse);
- break;
- }
-
- diag = thismse + mv_err_cost(&this_mv, ref_mv, MVCOSTS, error_per_bit,
- xd->allow_high_precision_mv);
-
- if (diag < bestmse) {
- *bestmv = this_mv;
- bestmse = diag;
- *distortion = thismse;
- *sse1 = sse;
- }
-
- if (x->e_mbd.allow_high_precision_mv) {
- usehp = vp9_use_nmv_hp(&ref_mv->as_mv);
- } else {
- usehp = 0;
- }
- if (!usehp)
- return bestmse;
-
- /* Now do 1/8th pixel */
- if (bestmv->as_mv.row < orig_mv.as_mv.row && !yrow_movedback) {
- y -= y_stride;
- yrow_movedback = 1;
- }
-
- if (bestmv->as_mv.col < orig_mv.as_mv.col && !ycol_movedback) {
- y--;
- ycol_movedback = 1;
- }
-
- startmv = *bestmv;
-
- // go left then right and check error
- this_mv.as_mv.row = startmv.as_mv.row;
-
- if (startmv.as_mv.col & 7) {
- this_mv.as_mv.col = startmv.as_mv.col - 1;
- thismse = vfp->svf(y, y_stride,
- SP(this_mv.as_mv.col), SP(this_mv.as_mv.row),
- z, b->src_stride, &sse);
- } else {
- this_mv.as_mv.col = (startmv.as_mv.col - 8) | 7;
- thismse = vfp->svf(y - 1, y_stride, SP(7), SP(this_mv.as_mv.row),
- z, b->src_stride, &sse);
- }
-
- left = thismse + mv_err_cost(&this_mv, ref_mv, MVCOSTS, error_per_bit,
- xd->allow_high_precision_mv);
-
- if (left < bestmse) {
- *bestmv = this_mv;
- bestmse = left;
- *distortion = thismse;
- *sse1 = sse;
- }
-
- this_mv.as_mv.col += 2;
- thismse = vfp->svf(y, y_stride, SP(this_mv.as_mv.col), SP(this_mv.as_mv.row),
- z, b->src_stride, &sse);
- right = thismse + mv_err_cost(&this_mv, ref_mv, MVCOSTS, error_per_bit,
- xd->allow_high_precision_mv);
-
- if (right < bestmse) {
- *bestmv = this_mv;
- bestmse = right;
- *distortion = thismse;
- *sse1 = sse;
- }
-
- // go up then down and check error
- this_mv.as_mv.col = startmv.as_mv.col;
-
- if (startmv.as_mv.row & 7) {
- this_mv.as_mv.row = startmv.as_mv.row - 1;
- thismse = vfp->svf(y, y_stride, SP(this_mv.as_mv.col), SP(this_mv.as_mv.row), z, b->src_stride, &sse);
- } else {
- this_mv.as_mv.row = (startmv.as_mv.row - 8) | 7;
- thismse = vfp->svf(y - y_stride, y_stride, SP(this_mv.as_mv.col), SP(7), z, b->src_stride, &sse);
- }
-
- up = thismse + mv_err_cost(&this_mv, ref_mv, MVCOSTS, error_per_bit,
- xd->allow_high_precision_mv);
-
- if (up < bestmse) {
- *bestmv = this_mv;
- bestmse = up;
- *distortion = thismse;
- *sse1 = sse;
- }
-
- this_mv.as_mv.row += 2;
- thismse = vfp->svf(y, y_stride, SP(this_mv.as_mv.col), SP(this_mv.as_mv.row), z, b->src_stride, &sse);
- down = thismse + mv_err_cost(&this_mv, ref_mv, MVCOSTS, error_per_bit,
- xd->allow_high_precision_mv);
-
- if (down < bestmse) {
- *bestmv = this_mv;
- bestmse = down;
- *distortion = thismse;
- *sse1 = sse;
- }
-
- // now check 1 more diagonal
- whichdir = (left < right ? 0 : 1) + (up < down ? 0 : 2);
-
-// for(whichdir=0;whichdir<4;whichdir++)
-// {
- this_mv = startmv;
-
- switch (whichdir) {
- case 0:
-
- if (startmv.as_mv.row & 7) {
- this_mv.as_mv.row -= 1;
-
- if (startmv.as_mv.col & 7) {
- this_mv.as_mv.col -= 1;
- thismse = vfp->svf(y, y_stride, SP(this_mv.as_mv.col), SP(this_mv.as_mv.row), z, b->src_stride, &sse);
- } else {
- this_mv.as_mv.col = (startmv.as_mv.col - 8) | 7;
- thismse = vfp->svf(y - 1, y_stride, SP(7), SP(this_mv.as_mv.row), z, b->src_stride, &sse);
- }
- } else {
- this_mv.as_mv.row = (startmv.as_mv.row - 8) | 7;
-
- if (startmv.as_mv.col & 7) {
- this_mv.as_mv.col -= 1;
- thismse = vfp->svf(y - y_stride, y_stride, SP(this_mv.as_mv.col), SP(7), z, b->src_stride, &sse);
- } else {
- this_mv.as_mv.col = (startmv.as_mv.col - 8) | 7;
- thismse = vfp->svf(y - y_stride - 1, y_stride, SP(7), SP(7), z, b->src_stride, &sse);
- }
- }
-
- break;
- case 1:
- this_mv.as_mv.col += 1;
-
- if (startmv.as_mv.row & 7) {
- this_mv.as_mv.row -= 1;
- thismse = vfp->svf(y, y_stride, SP(this_mv.as_mv.col), SP(this_mv.as_mv.row), z, b->src_stride, &sse);
- } else {
- this_mv.as_mv.row = (startmv.as_mv.row - 8) | 7;
- thismse = vfp->svf(y - y_stride, y_stride, SP(this_mv.as_mv.col), SP(7), z, b->src_stride, &sse);
- }
-
- break;
- case 2:
- this_mv.as_mv.row += 1;
-
- if (startmv.as_mv.col & 7) {
- this_mv.as_mv.col -= 1;
- thismse = vfp->svf(y, y_stride, SP(this_mv.as_mv.col), SP(this_mv.as_mv.row), z, b->src_stride, &sse);
- } else {
- this_mv.as_mv.col = (startmv.as_mv.col - 8) | 7;
- thismse = vfp->svf(y - 1, y_stride, SP(7), SP(this_mv.as_mv.row), z, b->src_stride, &sse);
- }
-
- break;
- case 3:
- this_mv.as_mv.col += 1;
- this_mv.as_mv.row += 1;
- thismse = vfp->svf(y, y_stride, SP(this_mv.as_mv.col), SP(this_mv.as_mv.row), z, b->src_stride, &sse);
- break;
- }
-
- diag = thismse + mv_err_cost(&this_mv, ref_mv, MVCOSTS, error_per_bit,
- xd->allow_high_precision_mv);
-
- if (diag < bestmse) {
- *bestmv = this_mv;
- bestmse = diag;
- *distortion = thismse;
- *sse1 = sse;
- }
-
- return bestmse;
-}
-
-#undef SP
-
-int vp9_find_best_half_pixel_step(MACROBLOCK *x, BLOCK *b, BLOCKD *d,
- int_mv *bestmv, int_mv *ref_mv,
- int error_per_bit,
- const vp9_variance_fn_ptr_t *vfp,
- DEC_MVCOSTS,
- int *distortion,
- unsigned int *sse1) {
- int bestmse = INT_MAX;
- int_mv startmv;
- int_mv this_mv;
- unsigned char *z = (*(b->base_src) + b->src);
- int left, right, up, down, diag;
- unsigned int sse;
- int whichdir;
- int thismse;
- int y_stride;
- MACROBLOCKD *xd = &x->e_mbd;
-
-#if !CONFIG_SUPERBLOCKS && (ARCH_X86 || ARCH_X86_64)
- unsigned char *y0 = *(d->base_pre) + d->pre +
- (bestmv->as_mv.row) * d->pre_stride + bestmv->as_mv.col;
- unsigned char *y;
-
- y_stride = 32;
- /* Copy 18 rows x 32 cols area to intermediate buffer before searching. */
- vfp->copymem(y0 - 1 - d->pre_stride, d->pre_stride, xd->y_buf, y_stride, 18);
- y = xd->y_buf + y_stride + 1;
-#else
- unsigned char *y = *(d->base_pre) + d->pre +
- (bestmv->as_mv.row) * d->pre_stride + bestmv->as_mv.col;
- y_stride = d->pre_stride;
-#endif
-
- // central mv
- bestmv->as_mv.row <<= 3;
- bestmv->as_mv.col <<= 3;
- startmv = *bestmv;
-
- // calculate central point error
- bestmse = vfp->vf(y, y_stride, z, b->src_stride, sse1);
- *distortion = bestmse;
- bestmse += mv_err_cost(bestmv, ref_mv, MVCOSTS, error_per_bit,
- xd->allow_high_precision_mv);
-
- // go left then right and check error
- this_mv.as_mv.row = startmv.as_mv.row;
- this_mv.as_mv.col = ((startmv.as_mv.col - 8) | 4);
- thismse = vfp->svf_halfpix_h(y - 1, y_stride, z, b->src_stride, &sse);
- left = thismse + mv_err_cost(&this_mv, ref_mv, MVCOSTS, error_per_bit,
- xd->allow_high_precision_mv);
-
- if (left < bestmse) {
- *bestmv = this_mv;
- bestmse = left;
- *distortion = thismse;
- *sse1 = sse;
- }
-
- this_mv.as_mv.col += 8;
- thismse = vfp->svf_halfpix_h(y, y_stride, z, b->src_stride, &sse);
- right = thismse + mv_err_cost(&this_mv, ref_mv, MVCOSTS, error_per_bit,
- xd->allow_high_precision_mv);
-
- if (right < bestmse) {
- *bestmv = this_mv;
- bestmse = right;
- *distortion = thismse;
- *sse1 = sse;
- }
-
- // go up then down and check error
- this_mv.as_mv.col = startmv.as_mv.col;
- this_mv.as_mv.row = ((startmv.as_mv.row - 8) | 4);
- thismse = vfp->svf_halfpix_v(y - y_stride, y_stride, z, b->src_stride, &sse);
- up = thismse + mv_err_cost(&this_mv, ref_mv, MVCOSTS, error_per_bit,
- xd->allow_high_precision_mv);
-
- if (up < bestmse) {
- *bestmv = this_mv;
- bestmse = up;
- *distortion = thismse;
- *sse1 = sse;
- }
-
- this_mv.as_mv.row += 8;
- thismse = vfp->svf_halfpix_v(y, y_stride, z, b->src_stride, &sse);
- down = thismse + mv_err_cost(&this_mv, ref_mv, MVCOSTS, error_per_bit,
- xd->allow_high_precision_mv);
-
- if (down < bestmse) {
- *bestmv = this_mv;
- bestmse = down;
- *distortion = thismse;
- *sse1 = sse;
- }
-
- // now check 1 more diagonal -
- whichdir = (left < right ? 0 : 1) + (up < down ? 0 : 2);
- this_mv = startmv;
-
- switch (whichdir) {
- case 0:
- this_mv.as_mv.col = (this_mv.as_mv.col - 8) | 4;
- this_mv.as_mv.row = (this_mv.as_mv.row - 8) | 4;
- thismse = vfp->svf_halfpix_hv(y - 1 - y_stride, y_stride, z, b->src_stride, &sse);
- break;
- case 1:
- this_mv.as_mv.col += 4;
- this_mv.as_mv.row = (this_mv.as_mv.row - 8) | 4;
- thismse = vfp->svf_halfpix_hv(y - y_stride, y_stride, z, b->src_stride, &sse);
- break;
- case 2:
- this_mv.as_mv.col = (this_mv.as_mv.col - 8) | 4;
- this_mv.as_mv.row += 4;
- thismse = vfp->svf_halfpix_hv(y - 1, y_stride, z, b->src_stride, &sse);
- break;
- case 3:
- default:
- this_mv.as_mv.col += 4;
- this_mv.as_mv.row += 4;
- thismse = vfp->svf_halfpix_hv(y, y_stride, z, b->src_stride, &sse);
- break;
- }
-
- diag = thismse + mv_err_cost(&this_mv, ref_mv, MVCOSTS, error_per_bit,
- xd->allow_high_precision_mv);
-
- if (diag < bestmse) {
- *bestmv = this_mv;
- bestmse = diag;
- *distortion = thismse;
- *sse1 = sse;
- }
-
- return bestmse;
-}
-
-#define CHECK_BOUNDS(range) \
- {\
- all_in = 1;\
- all_in &= ((br-range) >= x->mv_row_min);\
- all_in &= ((br+range) <= x->mv_row_max);\
- all_in &= ((bc-range) >= x->mv_col_min);\
- all_in &= ((bc+range) <= x->mv_col_max);\
- }
-
-#define CHECK_POINT \
- {\
- if (this_mv.as_mv.col < x->mv_col_min) continue;\
- if (this_mv.as_mv.col > x->mv_col_max) continue;\
- if (this_mv.as_mv.row < x->mv_row_min) continue;\
- if (this_mv.as_mv.row > x->mv_row_max) continue;\
- }
-
-#define CHECK_BETTER \
- {\
- if (thissad < bestsad)\
- {\
- thissad += mvsad_err_cost(&this_mv, &fcenter_mv, MVSADCOSTS, sad_per_bit);\
- if (thissad < bestsad)\
- {\
- bestsad = thissad;\
- best_site = i;\
- }\
- }\
- }
-
-static const MV next_chkpts[6][3] = {
- {{ -2, 0}, { -1, -2}, {1, -2}},
- {{ -1, -2}, {1, -2}, {2, 0}},
- {{1, -2}, {2, 0}, {1, 2}},
- {{2, 0}, {1, 2}, { -1, 2}},
- {{1, 2}, { -1, 2}, { -2, 0}},
- {{ -1, 2}, { -2, 0}, { -1, -2}}
-};
-
-int vp9_hex_search
-(
- MACROBLOCK *x,
- BLOCK *b,
- BLOCKD *d,
- int_mv *ref_mv,
- int_mv *best_mv,
- int search_param,
- int sad_per_bit,
- const vp9_variance_fn_ptr_t *vfp,
- DEC_MVSADCOSTS,
- DEC_MVCOSTS,
- int_mv *center_mv
-) {
- MV hex[6] = { { -1, -2}, {1, -2}, {2, 0}, {1, 2}, { -1, 2}, { -2, 0} };
- MV neighbors[4] = {{0, -1}, { -1, 0}, {1, 0}, {0, 1}};
- int i, j;
-
- unsigned char *what = (*(b->base_src) + b->src);
- int what_stride = b->src_stride;
- int in_what_stride = d->pre_stride;
- int br, bc;
- int_mv this_mv;
- unsigned int bestsad = 0x7fffffff;
- unsigned int thissad;
- unsigned char *base_offset;
- unsigned char *this_offset;
- int k = -1;
- int all_in;
- int best_site = -1;
-
- int_mv fcenter_mv;
- fcenter_mv.as_mv.row = center_mv->as_mv.row >> 3;
- fcenter_mv.as_mv.col = center_mv->as_mv.col >> 3;
-
- // adjust ref_mv to make sure it is within MV range
- clamp_mv(ref_mv, x->mv_col_min, x->mv_col_max, x->mv_row_min, x->mv_row_max);
- br = ref_mv->as_mv.row;
- bc = ref_mv->as_mv.col;
-
- // Work out the start point for the search
- base_offset = (unsigned char *)(*(d->base_pre) + d->pre);
- this_offset = base_offset + (br * (d->pre_stride)) + bc;
- this_mv.as_mv.row = br;
- this_mv.as_mv.col = bc;
- bestsad = vfp->sdf(what, what_stride, this_offset,
- in_what_stride, 0x7fffffff)
- + mvsad_err_cost(&this_mv, &fcenter_mv, MVSADCOSTS, sad_per_bit);
-
- // hex search
- // j=0
- CHECK_BOUNDS(2)
-
- if (all_in) {
- for (i = 0; i < 6; i++) {
- this_mv.as_mv.row = br + hex[i].row;
- this_mv.as_mv.col = bc + hex[i].col;
- this_offset = base_offset + (this_mv.as_mv.row * in_what_stride) + this_mv.as_mv.col;
- thissad = vfp->sdf(what, what_stride, this_offset, in_what_stride, bestsad);
- CHECK_BETTER
- }
- } else {
- for (i = 0; i < 6; i++) {
- this_mv.as_mv.row = br + hex[i].row;
- this_mv.as_mv.col = bc + hex[i].col;
- CHECK_POINT
- this_offset = base_offset + (this_mv.as_mv.row * in_what_stride) + this_mv.as_mv.col;
- thissad = vfp->sdf(what, what_stride, this_offset, in_what_stride, bestsad);
- CHECK_BETTER
- }
- }
-
- if (best_site == -1)
- goto cal_neighbors;
- else {
- br += hex[best_site].row;
- bc += hex[best_site].col;
- k = best_site;
- }
-
- for (j = 1; j < 127; j++) {
- best_site = -1;
- CHECK_BOUNDS(2)
-
- if (all_in) {
- for (i = 0; i < 3; i++) {
- this_mv.as_mv.row = br + next_chkpts[k][i].row;
- this_mv.as_mv.col = bc + next_chkpts[k][i].col;
- this_offset = base_offset + (this_mv.as_mv.row * (in_what_stride)) + this_mv.as_mv.col;
- thissad = vfp->sdf(what, what_stride, this_offset, in_what_stride, bestsad);
- CHECK_BETTER
- }
- } else {
- for (i = 0; i < 3; i++) {
- this_mv.as_mv.row = br + next_chkpts[k][i].row;
- this_mv.as_mv.col = bc + next_chkpts[k][i].col;
- CHECK_POINT
- this_offset = base_offset + (this_mv.as_mv.row * (in_what_stride)) + this_mv.as_mv.col;
- thissad = vfp->sdf(what, what_stride, this_offset, in_what_stride, bestsad);
- CHECK_BETTER
- }
- }
-
- if (best_site == -1)
- break;
- else {
- br += next_chkpts[k][best_site].row;
- bc += next_chkpts[k][best_site].col;
- k += 5 + best_site;
- if (k >= 12) k -= 12;
- else if (k >= 6) k -= 6;
- }
- }
-
- // check 4 1-away neighbors
-cal_neighbors:
- for (j = 0; j < 32; j++) {
- best_site = -1;
- CHECK_BOUNDS(1)
-
- if (all_in) {
- for (i = 0; i < 4; i++) {
- this_mv.as_mv.row = br + neighbors[i].row;
- this_mv.as_mv.col = bc + neighbors[i].col;
- this_offset = base_offset + (this_mv.as_mv.row * (in_what_stride)) + this_mv.as_mv.col;
- thissad = vfp->sdf(what, what_stride, this_offset, in_what_stride, bestsad);
- CHECK_BETTER
- }
- } else {
- for (i = 0; i < 4; i++) {
- this_mv.as_mv.row = br + neighbors[i].row;
- this_mv.as_mv.col = bc + neighbors[i].col;
- CHECK_POINT
- this_offset = base_offset + (this_mv.as_mv.row * (in_what_stride)) + this_mv.as_mv.col;
- thissad = vfp->sdf(what, what_stride, this_offset, in_what_stride, bestsad);
- CHECK_BETTER
- }
- }
-
- if (best_site == -1)
- break;
- else {
- br += neighbors[best_site].row;
- bc += neighbors[best_site].col;
- }
- }
-
- best_mv->as_mv.row = br;
- best_mv->as_mv.col = bc;
-
- return bestsad;
-}
-#undef CHECK_BOUNDS
-#undef CHECK_POINT
-#undef CHECK_BETTER
-
-int vp9_diamond_search_sad(MACROBLOCK *x, BLOCK *b, BLOCKD *d,
- int_mv *ref_mv, int_mv *best_mv,
- int search_param, int sad_per_bit, int *num00,
- vp9_variance_fn_ptr_t *fn_ptr, DEC_MVCOSTS,
- int_mv *center_mv) {
- int i, j, step;
-
- unsigned char *what = (*(b->base_src) + b->src);
- int what_stride = b->src_stride;
- unsigned char *in_what;
- int in_what_stride = d->pre_stride;
- unsigned char *best_address;
-
- int tot_steps;
- int_mv this_mv;
-
- int bestsad = INT_MAX;
- int best_site = 0;
- int last_site = 0;
-
- int ref_row, ref_col;
- int this_row_offset, this_col_offset;
- search_site *ss;
-
- unsigned char *check_here;
- int thissad;
- MACROBLOCKD *xd = &x->e_mbd;
- int_mv fcenter_mv;
-
- int *mvjsadcost = x->nmvjointsadcost;
- int *mvsadcost[2] = {x->nmvsadcost[0], x->nmvsadcost[1]};
-
- fcenter_mv.as_mv.row = center_mv->as_mv.row >> 3;
- fcenter_mv.as_mv.col = center_mv->as_mv.col >> 3;
-
- clamp_mv(ref_mv, x->mv_col_min, x->mv_col_max, x->mv_row_min, x->mv_row_max);
- ref_row = ref_mv->as_mv.row;
- ref_col = ref_mv->as_mv.col;
- *num00 = 0;
- best_mv->as_mv.row = ref_row;
- best_mv->as_mv.col = ref_col;
-
- // Work out the start point for the search
- in_what = (unsigned char *)(*(d->base_pre) + d->pre + (ref_row * (d->pre_stride)) + ref_col);
- best_address = in_what;
-
- // Check the starting position
- bestsad = fn_ptr->sdf(what, what_stride, in_what,
- in_what_stride, 0x7fffffff)
- + mvsad_err_cost(best_mv, &fcenter_mv, MVSADCOSTS, sad_per_bit);
-
- // search_param determines the length of the initial step and hence the number of iterations
- // 0 = initial step (MAX_FIRST_STEP) pel : 1 = (MAX_FIRST_STEP/2) pel, 2 = (MAX_FIRST_STEP/4) pel... etc.
- ss = &x->ss[search_param * x->searches_per_step];
- tot_steps = (x->ss_count / x->searches_per_step) - search_param;
-
- i = 1;
-
- for (step = 0; step < tot_steps; step++) {
- for (j = 0; j < x->searches_per_step; j++) {
- // Trap illegal vectors
- this_row_offset = best_mv->as_mv.row + ss[i].mv.row;
- this_col_offset = best_mv->as_mv.col + ss[i].mv.col;
-
- if ((this_col_offset > x->mv_col_min) && (this_col_offset < x->mv_col_max) &&
- (this_row_offset > x->mv_row_min) && (this_row_offset < x->mv_row_max))
-
- {
- check_here = ss[i].offset + best_address;
- thissad = fn_ptr->sdf(what, what_stride, check_here, in_what_stride, bestsad);
-
- if (thissad < bestsad) {
- this_mv.as_mv.row = this_row_offset;
- this_mv.as_mv.col = this_col_offset;
- thissad += mvsad_err_cost(&this_mv, &fcenter_mv,
- MVSADCOSTS, sad_per_bit);
-
- if (thissad < bestsad) {
- bestsad = thissad;
- best_site = i;
- }
- }
- }
-
- i++;
- }
-
- if (best_site != last_site) {
- best_mv->as_mv.row += ss[best_site].mv.row;
- best_mv->as_mv.col += ss[best_site].mv.col;
- best_address += ss[best_site].offset;
- last_site = best_site;
- } else if (best_address == in_what)
- (*num00)++;
- }
-
- this_mv.as_mv.row = best_mv->as_mv.row << 3;
- this_mv.as_mv.col = best_mv->as_mv.col << 3;
-
- if (bestsad == INT_MAX)
- return INT_MAX;
-
- return
- fn_ptr->vf(what, what_stride, best_address, in_what_stride,
- (unsigned int *)(&thissad)) +
- mv_err_cost(&this_mv, center_mv, MVCOSTS, x->errorperbit,
- xd->allow_high_precision_mv);
-}
-
-int vp9_diamond_search_sadx4(MACROBLOCK *x, BLOCK *b, BLOCKD *d,
- int_mv *ref_mv, int_mv *best_mv, int search_param,
- int sad_per_bit, int *num00,
- vp9_variance_fn_ptr_t *fn_ptr,
- DEC_MVCOSTS, int_mv *center_mv) {
- int i, j, step;
-
- unsigned char *what = (*(b->base_src) + b->src);
- int what_stride = b->src_stride;
- unsigned char *in_what;
- int in_what_stride = d->pre_stride;
- unsigned char *best_address;
-
- int tot_steps;
- int_mv this_mv;
-
- int bestsad = INT_MAX;
- int best_site = 0;
- int last_site = 0;
-
- int ref_row;
- int ref_col;
- int this_row_offset;
- int this_col_offset;
- search_site *ss;
-
- unsigned char *check_here;
- unsigned int thissad;
- MACROBLOCKD *xd = &x->e_mbd;
- int_mv fcenter_mv;
-
- int *mvjsadcost = x->nmvjointsadcost;
- int *mvsadcost[2] = {x->nmvsadcost[0], x->nmvsadcost[1]};
-
- fcenter_mv.as_mv.row = center_mv->as_mv.row >> 3;
- fcenter_mv.as_mv.col = center_mv->as_mv.col >> 3;
-
- clamp_mv(ref_mv, x->mv_col_min, x->mv_col_max, x->mv_row_min, x->mv_row_max);
- ref_row = ref_mv->as_mv.row;
- ref_col = ref_mv->as_mv.col;
- *num00 = 0;
- best_mv->as_mv.row = ref_row;
- best_mv->as_mv.col = ref_col;
-
- // Work out the start point for the search
- in_what = (unsigned char *)(*(d->base_pre) + d->pre + (ref_row * (d->pre_stride)) + ref_col);
- best_address = in_what;
-
- // Check the starting position
- bestsad = fn_ptr->sdf(what, what_stride,
- in_what, in_what_stride, 0x7fffffff)
- + mvsad_err_cost(best_mv, &fcenter_mv, MVSADCOSTS, sad_per_bit);
-
- // search_param determines the length of the initial step and hence the number of iterations
- // 0 = initial step (MAX_FIRST_STEP) pel : 1 = (MAX_FIRST_STEP/2) pel, 2 = (MAX_FIRST_STEP/4) pel... etc.
- ss = &x->ss[search_param * x->searches_per_step];
- tot_steps = (x->ss_count / x->searches_per_step) - search_param;
-
- i = 1;
-
- for (step = 0; step < tot_steps; step++) {
- int all_in = 1, t;
-
- // To know whether all neighbor points are within the bounds, 4 bounds checks
- // are enough instead of checking 4 bounds for each point.
- all_in &= ((best_mv->as_mv.row + ss[i].mv.row) > x->mv_row_min);
- all_in &= ((best_mv->as_mv.row + ss[i + 1].mv.row) < x->mv_row_max);
- all_in &= ((best_mv->as_mv.col + ss[i + 2].mv.col) > x->mv_col_min);
- all_in &= ((best_mv->as_mv.col + ss[i + 3].mv.col) < x->mv_col_max);
-
- if (all_in) {
- unsigned int sad_array[4];
-
- for (j = 0; j < x->searches_per_step; j += 4) {
- unsigned char *block_offset[4];
-
- for (t = 0; t < 4; t++)
- block_offset[t] = ss[i + t].offset + best_address;
-
- fn_ptr->sdx4df(what, what_stride, block_offset, in_what_stride,
- sad_array);
-
- for (t = 0; t < 4; t++, i++) {
- if (sad_array[t] < bestsad) {
- this_mv.as_mv.row = best_mv->as_mv.row + ss[i].mv.row;
- this_mv.as_mv.col = best_mv->as_mv.col + ss[i].mv.col;
- sad_array[t] += mvsad_err_cost(&this_mv, &fcenter_mv,
- MVSADCOSTS, sad_per_bit);
-
- if (sad_array[t] < bestsad) {
- bestsad = sad_array[t];
- best_site = i;
- }
- }
- }
- }
- } else {
- for (j = 0; j < x->searches_per_step; j++) {
- // Trap illegal vectors
- this_row_offset = best_mv->as_mv.row + ss[i].mv.row;
- this_col_offset = best_mv->as_mv.col + ss[i].mv.col;
-
- if ((this_col_offset > x->mv_col_min) && (this_col_offset < x->mv_col_max) &&
- (this_row_offset > x->mv_row_min) && (this_row_offset < x->mv_row_max)) {
- check_here = ss[i].offset + best_address;
- thissad = fn_ptr->sdf(what, what_stride, check_here, in_what_stride, bestsad);
-
- if (thissad < bestsad) {
- this_mv.as_mv.row = this_row_offset;
- this_mv.as_mv.col = this_col_offset;
- thissad += mvsad_err_cost(&this_mv, &fcenter_mv,
- MVSADCOSTS, sad_per_bit);
-
- if (thissad < bestsad) {
- bestsad = thissad;
- best_site = i;
- }
- }
- }
- i++;
- }
- }
-
- if (best_site != last_site) {
- best_mv->as_mv.row += ss[best_site].mv.row;
- best_mv->as_mv.col += ss[best_site].mv.col;
- best_address += ss[best_site].offset;
- last_site = best_site;
- } else if (best_address == in_what)
- (*num00)++;
- }
-
- this_mv.as_mv.row = best_mv->as_mv.row << 3;
- this_mv.as_mv.col = best_mv->as_mv.col << 3;
-
- if (bestsad == INT_MAX)
- return INT_MAX;
-
- return
- fn_ptr->vf(what, what_stride, best_address, in_what_stride,
- (unsigned int *)(&thissad)) +
- mv_err_cost(&this_mv, center_mv, MVCOSTS, x->errorperbit,
- xd->allow_high_precision_mv);
-}
-
-/* do_refine: If last step (1-away) of n-step search doesn't pick the center
- point as the best match, we will do a final 1-away diamond
- refining search */
-int vp9_full_pixel_diamond(VP9_COMP *cpi, MACROBLOCK *x, BLOCK *b,
- BLOCKD *d, int_mv *mvp_full, int step_param,
- int sadpb, int further_steps,
- int do_refine, vp9_variance_fn_ptr_t *fn_ptr,
- int_mv *ref_mv, int_mv *dst_mv) {
- int_mv temp_mv;
- int thissme, n, num00;
- int bestsme = cpi->diamond_search_sad(x, b, d, mvp_full, &temp_mv,
- step_param, sadpb, &num00,
- fn_ptr, XMVCOST, ref_mv);
- dst_mv->as_int = temp_mv.as_int;
-
- n = num00;
- num00 = 0;
-
- /* If there won't be more n-step search, check to see if refining search is needed. */
- if (n > further_steps)
- do_refine = 0;
-
- while (n < further_steps) {
- n++;
-
- if (num00)
- num00--;
- else {
- thissme = cpi->diamond_search_sad(x, b, d, mvp_full, &temp_mv,
- step_param + n, sadpb, &num00,
- fn_ptr, XMVCOST, ref_mv);
-
- /* check to see if refining search is needed. */
- if (num00 > (further_steps - n))
- do_refine = 0;
-
- if (thissme < bestsme) {
- bestsme = thissme;
- dst_mv->as_int = temp_mv.as_int;
- }
- }
- }
-
- /* final 1-away diamond refining search */
- if (do_refine == 1) {
- int search_range = 8;
- int_mv best_mv;
- best_mv.as_int = dst_mv->as_int;
- thissme = cpi->refining_search_sad(x, b, d, &best_mv, sadpb, search_range,
- fn_ptr, XMVCOST, ref_mv);
-
- if (thissme < bestsme) {
- bestsme = thissme;
- dst_mv->as_int = best_mv.as_int;
- }
- }
- return bestsme;
-}
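
The driver above just repeats the diamond search at progressively finer step sizes, reusing num00 (the number of iterations in which the previous pass stayed at its centre) to skip redundant passes, and optionally finishes with a 1-away refinement. A minimal sketch of that control flow, with hypothetical diamond/refine callbacks standing in for cpi->diamond_search_sad and cpi->refining_search_sad:

/* Hypothetical stand-ins for the encoder's search callbacks. */
typedef int (*diamond_fn)(int step_param, int *num00);
typedef int (*refine_fn)(int search_range);

/* Sketch of the n-step driver: rerun the diamond search at finer steps,
 * skip passes the previous call already proved were centre-stuck, and
 * optionally finish with a small refining search. */
static int full_pixel_diamond_sketch(diamond_fn diamond, refine_fn refine,
                                     int step_param, int further_steps,
                                     int do_refine) {
  int num00 = 0;
  int best = diamond(step_param, &num00);
  int n = num00;

  if (n > further_steps)
    do_refine = 0;            /* centre held through every remaining scale */

  num00 = 0;
  while (n < further_steps) {
    n++;
    if (num00) {
      num00--;                /* previous pass already covered this step */
    } else {
      int sme = diamond(step_param + n, &num00);
      if (num00 > further_steps - n)
        do_refine = 0;
      if (sme < best)
        best = sme;
    }
  }

  if (do_refine) {
    int sme = refine(8);      /* final 1-away refinement, range 8 */
    if (sme < best)
      best = sme;
  }
  return best;
}
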
-
-int vp9_full_search_sad(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *ref_mv,
- int sad_per_bit, int distance,
- vp9_variance_fn_ptr_t *fn_ptr, DEC_MVCOSTS,
- int_mv *center_mv) {
- unsigned char *what = (*(b->base_src) + b->src);
- int what_stride = b->src_stride;
- unsigned char *in_what;
- int in_what_stride = d->pre_stride;
- int mv_stride = d->pre_stride;
- unsigned char *bestaddress;
- int_mv *best_mv = &d->bmi.as_mv.first;
- int_mv this_mv;
- int bestsad = INT_MAX;
- int r, c;
-
- unsigned char *check_here;
- int thissad;
- MACROBLOCKD *xd = &x->e_mbd;
-
- int ref_row = ref_mv->as_mv.row;
- int ref_col = ref_mv->as_mv.col;
-
- int row_min = ref_row - distance;
- int row_max = ref_row + distance;
- int col_min = ref_col - distance;
- int col_max = ref_col + distance;
- int_mv fcenter_mv;
-
- int *mvjsadcost = x->nmvjointsadcost;
- int *mvsadcost[2] = {x->nmvsadcost[0], x->nmvsadcost[1]};
-
- fcenter_mv.as_mv.row = center_mv->as_mv.row >> 3;
- fcenter_mv.as_mv.col = center_mv->as_mv.col >> 3;
-
- // Work out the mid point for the search
- in_what = *(d->base_pre) + d->pre;
- bestaddress = in_what + (ref_row * d->pre_stride) + ref_col;
-
- best_mv->as_mv.row = ref_row;
- best_mv->as_mv.col = ref_col;
-
- // Baseline value at the centre
- bestsad = fn_ptr->sdf(what, what_stride, bestaddress,
- in_what_stride, 0x7fffffff)
- + mvsad_err_cost(best_mv, &fcenter_mv, MVSADCOSTS, sad_per_bit);
-
-  // Apply further limits to prevent us from using vectors that stretch beyond the UMV border
- if (col_min < x->mv_col_min)
- col_min = x->mv_col_min;
-
- if (col_max > x->mv_col_max)
- col_max = x->mv_col_max;
-
- if (row_min < x->mv_row_min)
- row_min = x->mv_row_min;
-
- if (row_max > x->mv_row_max)
- row_max = x->mv_row_max;
-
- for (r = row_min; r < row_max; r++) {
- this_mv.as_mv.row = r;
- check_here = r * mv_stride + in_what + col_min;
-
- for (c = col_min; c < col_max; c++) {
- thissad = fn_ptr->sdf(what, what_stride, check_here, in_what_stride, bestsad);
-
- this_mv.as_mv.col = c;
- thissad += mvsad_err_cost(&this_mv, &fcenter_mv,
- MVSADCOSTS, sad_per_bit);
-
- if (thissad < bestsad) {
- bestsad = thissad;
- best_mv->as_mv.row = r;
- best_mv->as_mv.col = c;
- bestaddress = check_here;
- }
-
- check_here++;
- }
- }
-
- this_mv.as_mv.row = best_mv->as_mv.row << 3;
- this_mv.as_mv.col = best_mv->as_mv.col << 3;
-
- if (bestsad < INT_MAX)
- return
- fn_ptr->vf(what, what_stride, bestaddress, in_what_stride,
- (unsigned int *)(&thissad)) +
- mv_err_cost(&this_mv, center_mv, MVCOSTS, x->errorperbit,
- xd->allow_high_precision_mv);
- else
- return INT_MAX;
-}
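
vp9_full_search_sad above is the exhaustive variant: every integer offset inside the (UMV-clamped) window gets a SAD plus a motion-vector cost, and the cheapest one wins. A self-contained sketch of the same scan, with a plain 16x16 SAD and a hypothetical mv_cost callback standing in for mvsad_err_cost, assuming the caller has already clamped the window:

#include <limits.h>
#include <stdlib.h>

/* Plain 16x16 SAD, standing in for fn_ptr->sdf. */
static unsigned int sad16(const unsigned char *src, int src_stride,
                          const unsigned char *ref, int ref_stride) {
  unsigned int sad = 0;
  int r, c;
  for (r = 0; r < 16; r++)
    for (c = 0; c < 16; c++)
      sad += abs(src[r * src_stride + c] - ref[r * ref_stride + c]);
  return sad;
}

/* Exhaustive window scan: score every candidate as SAD + mv cost and keep
 * the minimum. Returns the best score and writes the winning offset. */
static unsigned int full_search_sketch(const unsigned char *src, int src_stride,
                                       const unsigned char *ref, int ref_stride,
                                       int row_min, int row_max,
                                       int col_min, int col_max,
                                       unsigned int (*mv_cost)(int r, int c),
                                       int *best_r, int *best_c) {
  unsigned int best = UINT_MAX;
  int r, c;
  for (r = row_min; r < row_max; r++) {
    for (c = col_min; c < col_max; c++) {
      unsigned int s = sad16(src, src_stride,
                             ref + r * ref_stride + c, ref_stride)
                       + mv_cost(r, c);
      if (s < best) {
        best = s;
        *best_r = r;
        *best_c = c;
      }
    }
  }
  return best;
}
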
-
-int vp9_full_search_sadx3(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *ref_mv,
- int sad_per_bit, int distance,
- vp9_variance_fn_ptr_t *fn_ptr, DEC_MVCOSTS,
- int_mv *center_mv) {
- unsigned char *what = (*(b->base_src) + b->src);
- int what_stride = b->src_stride;
- unsigned char *in_what;
- int in_what_stride = d->pre_stride;
- int mv_stride = d->pre_stride;
- unsigned char *bestaddress;
- int_mv *best_mv = &d->bmi.as_mv.first;
- int_mv this_mv;
- int bestsad = INT_MAX;
- int r, c;
-
- unsigned char *check_here;
- unsigned int thissad;
- MACROBLOCKD *xd = &x->e_mbd;
-
- int ref_row = ref_mv->as_mv.row;
- int ref_col = ref_mv->as_mv.col;
-
- int row_min = ref_row - distance;
- int row_max = ref_row + distance;
- int col_min = ref_col - distance;
- int col_max = ref_col + distance;
-
- unsigned int sad_array[3];
- int_mv fcenter_mv;
-
- int *mvjsadcost = x->nmvjointsadcost;
- int *mvsadcost[2] = {x->nmvsadcost[0], x->nmvsadcost[1]};
-
- fcenter_mv.as_mv.row = center_mv->as_mv.row >> 3;
- fcenter_mv.as_mv.col = center_mv->as_mv.col >> 3;
-
- // Work out the mid point for the search
- in_what = *(d->base_pre) + d->pre;
- bestaddress = in_what + (ref_row * d->pre_stride) + ref_col;
-
- best_mv->as_mv.row = ref_row;
- best_mv->as_mv.col = ref_col;
-
- // Baseline value at the centre
- bestsad = fn_ptr->sdf(what, what_stride,
- bestaddress, in_what_stride, 0x7fffffff)
- + mvsad_err_cost(best_mv, &fcenter_mv, MVSADCOSTS, sad_per_bit);
-
-  // Apply further limits to prevent us from using vectors that stretch beyond the UMV border
- if (col_min < x->mv_col_min)
- col_min = x->mv_col_min;
-
- if (col_max > x->mv_col_max)
- col_max = x->mv_col_max;
-
- if (row_min < x->mv_row_min)
- row_min = x->mv_row_min;
-
- if (row_max > x->mv_row_max)
- row_max = x->mv_row_max;
-
- for (r = row_min; r < row_max; r++) {
- this_mv.as_mv.row = r;
- check_here = r * mv_stride + in_what + col_min;
- c = col_min;
-
- while ((c + 2) < col_max) {
- int i;
-
- fn_ptr->sdx3f(what, what_stride, check_here, in_what_stride, sad_array);
-
- for (i = 0; i < 3; i++) {
- thissad = sad_array[i];
-
- if (thissad < bestsad) {
- this_mv.as_mv.col = c;
- thissad += mvsad_err_cost(&this_mv, &fcenter_mv,
- MVSADCOSTS, sad_per_bit);
-
- if (thissad < bestsad) {
- bestsad = thissad;
- best_mv->as_mv.row = r;
- best_mv->as_mv.col = c;
- bestaddress = check_here;
- }
- }
-
- check_here++;
- c++;
- }
- }
-
- while (c < col_max) {
- thissad = fn_ptr->sdf(what, what_stride, check_here, in_what_stride, bestsad);
-
- if (thissad < bestsad) {
- this_mv.as_mv.col = c;
- thissad += mvsad_err_cost(&this_mv, &fcenter_mv,
- MVSADCOSTS, sad_per_bit);
-
- if (thissad < bestsad) {
- bestsad = thissad;
- best_mv->as_mv.row = r;
- best_mv->as_mv.col = c;
- bestaddress = check_here;
- }
- }
-
- check_here++;
- c++;
- }
-
- }
-
- this_mv.as_mv.row = best_mv->as_mv.row << 3;
- this_mv.as_mv.col = best_mv->as_mv.col << 3;
-
- if (bestsad < INT_MAX)
- return
- fn_ptr->vf(what, what_stride, bestaddress, in_what_stride,
- (unsigned int *)(&thissad)) +
- mv_err_cost(&this_mv, center_mv, MVCOSTS, x->errorperbit,
- xd->allow_high_precision_mv);
- else
- return INT_MAX;
-}
-
-int vp9_full_search_sadx8(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *ref_mv,
- int sad_per_bit, int distance,
- vp9_variance_fn_ptr_t *fn_ptr,
- DEC_MVCOSTS,
- int_mv *center_mv) {
- unsigned char *what = (*(b->base_src) + b->src);
- int what_stride = b->src_stride;
- unsigned char *in_what;
- int in_what_stride = d->pre_stride;
- int mv_stride = d->pre_stride;
- unsigned char *bestaddress;
- int_mv *best_mv = &d->bmi.as_mv.first;
- int_mv this_mv;
- int bestsad = INT_MAX;
- int r, c;
-
- unsigned char *check_here;
- unsigned int thissad;
- MACROBLOCKD *xd = &x->e_mbd;
-
- int ref_row = ref_mv->as_mv.row;
- int ref_col = ref_mv->as_mv.col;
-
- int row_min = ref_row - distance;
- int row_max = ref_row + distance;
- int col_min = ref_col - distance;
- int col_max = ref_col + distance;
-
- DECLARE_ALIGNED_ARRAY(16, unsigned short, sad_array8, 8);
- unsigned int sad_array[3];
- int_mv fcenter_mv;
-
- int *mvjsadcost = x->nmvjointsadcost;
- int *mvsadcost[2] = {x->nmvsadcost[0], x->nmvsadcost[1]};
-
- fcenter_mv.as_mv.row = center_mv->as_mv.row >> 3;
- fcenter_mv.as_mv.col = center_mv->as_mv.col >> 3;
-
- // Work out the mid point for the search
- in_what = *(d->base_pre) + d->pre;
- bestaddress = in_what + (ref_row * d->pre_stride) + ref_col;
-
- best_mv->as_mv.row = ref_row;
- best_mv->as_mv.col = ref_col;
-
- // Baseline value at the centre
- bestsad = fn_ptr->sdf(what, what_stride,
- bestaddress, in_what_stride, 0x7fffffff)
- + mvsad_err_cost(best_mv, &fcenter_mv, MVSADCOSTS, sad_per_bit);
-
-  // Apply further limits to prevent us from using vectors that stretch beyond the UMV border
- if (col_min < x->mv_col_min)
- col_min = x->mv_col_min;
-
- if (col_max > x->mv_col_max)
- col_max = x->mv_col_max;
-
- if (row_min < x->mv_row_min)
- row_min = x->mv_row_min;
-
- if (row_max > x->mv_row_max)
- row_max = x->mv_row_max;
-
- for (r = row_min; r < row_max; r++) {
- this_mv.as_mv.row = r;
- check_here = r * mv_stride + in_what + col_min;
- c = col_min;
-
- while ((c + 7) < col_max) {
- int i;
-
- fn_ptr->sdx8f(what, what_stride, check_here, in_what_stride, sad_array8);
-
- for (i = 0; i < 8; i++) {
- thissad = (unsigned int)sad_array8[i];
-
- if (thissad < bestsad) {
- this_mv.as_mv.col = c;
- thissad += mvsad_err_cost(&this_mv, &fcenter_mv,
- MVSADCOSTS, sad_per_bit);
-
- if (thissad < bestsad) {
- bestsad = thissad;
- best_mv->as_mv.row = r;
- best_mv->as_mv.col = c;
- bestaddress = check_here;
- }
- }
-
- check_here++;
- c++;
- }
- }
-
- while ((c + 2) < col_max) {
- int i;
-
- fn_ptr->sdx3f(what, what_stride, check_here, in_what_stride, sad_array);
-
- for (i = 0; i < 3; i++) {
- thissad = sad_array[i];
-
- if (thissad < bestsad) {
- this_mv.as_mv.col = c;
- thissad += mvsad_err_cost(&this_mv, &fcenter_mv,
- MVSADCOSTS, sad_per_bit);
-
- if (thissad < bestsad) {
- bestsad = thissad;
- best_mv->as_mv.row = r;
- best_mv->as_mv.col = c;
- bestaddress = check_here;
- }
- }
-
- check_here++;
- c++;
- }
- }
-
- while (c < col_max) {
- thissad = fn_ptr->sdf(what, what_stride, check_here, in_what_stride, bestsad);
-
- if (thissad < bestsad) {
- this_mv.as_mv.col = c;
- thissad += mvsad_err_cost(&this_mv, &fcenter_mv,
- MVSADCOSTS, sad_per_bit);
-
- if (thissad < bestsad) {
- bestsad = thissad;
- best_mv->as_mv.row = r;
- best_mv->as_mv.col = c;
- bestaddress = check_here;
- }
- }
-
- check_here++;
- c++;
- }
- }
-
- this_mv.as_mv.row = best_mv->as_mv.row << 3;
- this_mv.as_mv.col = best_mv->as_mv.col << 3;
-
- if (bestsad < INT_MAX)
- return
- fn_ptr->vf(what, what_stride, bestaddress, in_what_stride,
- (unsigned int *)(&thissad)) +
- mv_err_cost(&this_mv, center_mv, MVCOSTS, x->errorperbit,
- xd->allow_high_precision_mv);
- else
- return INT_MAX;
-}
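
The sadx3/sadx8 variants above differ from the plain scan only in how many consecutive candidate columns are scored per call; the row loop is ordinary strip mining. A sketch of just that loop shape, with hypothetical batch_sad8/batch_sad3/single_sad hooks standing in for fn_ptr->sdx8f, fn_ptr->sdx3f and fn_ptr->sdf, and a score callback for the SAD-plus-mv-cost bookkeeping:

/* Strip-mined candidate loop: 8 at a time while they fit, then 3 at a
 * time, then one by one for the remainder of the row. */
static void scan_row_sketch(int col_min, int col_max,
                            void (*batch_sad8)(int c, unsigned int out[8]),
                            void (*batch_sad3)(int c, unsigned int out[3]),
                            unsigned int (*single_sad)(int c),
                            void (*score)(int c, unsigned int sad)) {
  unsigned int sads[8];
  int c = col_min;

  while (c + 7 < col_max) {          /* 8 candidates per SIMD call */
    int i;
    batch_sad8(c, sads);
    for (i = 0; i < 8; i++, c++)
      score(c, sads[i]);
  }
  while (c + 2 < col_max) {          /* 3 candidates per call */
    int i;
    batch_sad3(c, sads);
    for (i = 0; i < 3; i++, c++)
      score(c, sads[i]);
  }
  while (c < col_max) {              /* scalar remainder */
    score(c, single_sad(c));
    c++;
  }
}
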
-
-int vp9_refining_search_sad(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *ref_mv,
- int error_per_bit, int search_range,
- vp9_variance_fn_ptr_t *fn_ptr, DEC_MVCOSTS,
- int_mv *center_mv) {
- MV neighbors[4] = {{ -1, 0}, {0, -1}, {0, 1}, {1, 0}};
- int i, j;
- short this_row_offset, this_col_offset;
-
- int what_stride = b->src_stride;
- int in_what_stride = d->pre_stride;
- unsigned char *what = (*(b->base_src) + b->src);
- unsigned char *best_address = (unsigned char *)(*(d->base_pre) + d->pre +
- (ref_mv->as_mv.row * (d->pre_stride)) + ref_mv->as_mv.col);
- unsigned char *check_here;
- unsigned int thissad;
- int_mv this_mv;
- unsigned int bestsad = INT_MAX;
- MACROBLOCKD *xd = &x->e_mbd;
- int_mv fcenter_mv;
-
- int *mvjsadcost = x->nmvjointsadcost;
- int *mvsadcost[2] = {x->nmvsadcost[0], x->nmvsadcost[1]};
-
- fcenter_mv.as_mv.row = center_mv->as_mv.row >> 3;
- fcenter_mv.as_mv.col = center_mv->as_mv.col >> 3;
-
- bestsad = fn_ptr->sdf(what, what_stride, best_address, in_what_stride, 0x7fffffff) +
- mvsad_err_cost(ref_mv, &fcenter_mv, MVSADCOSTS, error_per_bit);
-
- for (i = 0; i < search_range; i++) {
- int best_site = -1;
-
- for (j = 0; j < 4; j++) {
- this_row_offset = ref_mv->as_mv.row + neighbors[j].row;
- this_col_offset = ref_mv->as_mv.col + neighbors[j].col;
-
- if ((this_col_offset > x->mv_col_min) && (this_col_offset < x->mv_col_max) &&
- (this_row_offset > x->mv_row_min) && (this_row_offset < x->mv_row_max)) {
- check_here = (neighbors[j].row) * in_what_stride + neighbors[j].col + best_address;
- thissad = fn_ptr->sdf(what, what_stride, check_here, in_what_stride, bestsad);
-
- if (thissad < bestsad) {
- this_mv.as_mv.row = this_row_offset;
- this_mv.as_mv.col = this_col_offset;
- thissad += mvsad_err_cost(&this_mv, &fcenter_mv, MVSADCOSTS, error_per_bit);
-
- if (thissad < bestsad) {
- bestsad = thissad;
- best_site = j;
- }
- }
- }
- }
-
- if (best_site == -1)
- break;
- else {
- ref_mv->as_mv.row += neighbors[best_site].row;
- ref_mv->as_mv.col += neighbors[best_site].col;
- best_address += (neighbors[best_site].row) * in_what_stride + neighbors[best_site].col;
- }
- }
-
- this_mv.as_mv.row = ref_mv->as_mv.row << 3;
- this_mv.as_mv.col = ref_mv->as_mv.col << 3;
-
- if (bestsad < INT_MAX)
- return
- fn_ptr->vf(what, what_stride, best_address, in_what_stride,
- (unsigned int *)(&thissad)) +
- mv_err_cost(&this_mv, center_mv, MVCOSTS, x->errorperbit,
- xd->allow_high_precision_mv);
- else
- return INT_MAX;
-}
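
The refining search above is a steepest-descent walk over the four axis-aligned neighbours, stopping as soon as the centre is already the best of the five. Stripped of the SAD plumbing and bounds checks, the control flow reduces to roughly this (cost() is a hypothetical stand-in for fn_ptr->sdf plus mvsad_err_cost):

/* 4-neighbour refinement sketch: repeatedly move to the cheapest of the
 * four axis-aligned neighbours; stop when the centre already wins or the
 * search range is exhausted. Bounds clamping omitted for brevity. */
static void refining_search_sketch(int *row, int *col, int search_range,
                                   unsigned int (*cost)(int r, int c)) {
  static const int dr[4] = { -1, 0, 0, 1 };
  static const int dc[4] = { 0, -1, 1, 0 };
  unsigned int best = cost(*row, *col);
  int i;

  for (i = 0; i < search_range; i++) {
    int best_site = -1, j;
    for (j = 0; j < 4; j++) {
      unsigned int s = cost(*row + dr[j], *col + dc[j]);
      if (s < best) {
        best = s;
        best_site = j;
      }
    }
    if (best_site == -1)
      break;                         /* centre is the local minimum */
    *row += dr[best_site];
    *col += dc[best_site];
  }
}
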
-
-int vp9_refining_search_sadx4(MACROBLOCK *x, BLOCK *b, BLOCKD *d,
- int_mv *ref_mv, int error_per_bit,
- int search_range, vp9_variance_fn_ptr_t *fn_ptr,
- DEC_MVCOSTS, int_mv *center_mv) {
- MV neighbors[4] = {{ -1, 0}, {0, -1}, {0, 1}, {1, 0}};
- int i, j;
- short this_row_offset, this_col_offset;
-
- int what_stride = b->src_stride;
- int in_what_stride = d->pre_stride;
- unsigned char *what = (*(b->base_src) + b->src);
- unsigned char *best_address = (unsigned char *)(*(d->base_pre) + d->pre +
- (ref_mv->as_mv.row * (d->pre_stride)) + ref_mv->as_mv.col);
- unsigned char *check_here;
- unsigned int thissad;
- int_mv this_mv;
- unsigned int bestsad = INT_MAX;
- MACROBLOCKD *xd = &x->e_mbd;
- int_mv fcenter_mv;
-
- int *mvjsadcost = x->nmvjointsadcost;
- int *mvsadcost[2] = {x->nmvsadcost[0], x->nmvsadcost[1]};
-
- fcenter_mv.as_mv.row = center_mv->as_mv.row >> 3;
- fcenter_mv.as_mv.col = center_mv->as_mv.col >> 3;
-
- bestsad = fn_ptr->sdf(what, what_stride, best_address, in_what_stride, 0x7fffffff) +
- mvsad_err_cost(ref_mv, &fcenter_mv, MVSADCOSTS, error_per_bit);
-
- for (i = 0; i < search_range; i++) {
- int best_site = -1;
- int all_in = 1;
-
- all_in &= ((ref_mv->as_mv.row - 1) > x->mv_row_min);
- all_in &= ((ref_mv->as_mv.row + 1) < x->mv_row_max);
- all_in &= ((ref_mv->as_mv.col - 1) > x->mv_col_min);
- all_in &= ((ref_mv->as_mv.col + 1) < x->mv_col_max);
-
- if (all_in) {
- unsigned int sad_array[4];
- unsigned char *block_offset[4];
- block_offset[0] = best_address - in_what_stride;
- block_offset[1] = best_address - 1;
- block_offset[2] = best_address + 1;
- block_offset[3] = best_address + in_what_stride;
-
- fn_ptr->sdx4df(what, what_stride, block_offset, in_what_stride, sad_array);
-
- for (j = 0; j < 4; j++) {
- if (sad_array[j] < bestsad) {
- this_mv.as_mv.row = ref_mv->as_mv.row + neighbors[j].row;
- this_mv.as_mv.col = ref_mv->as_mv.col + neighbors[j].col;
- sad_array[j] += mvsad_err_cost(&this_mv, &fcenter_mv, MVSADCOSTS, error_per_bit);
-
- if (sad_array[j] < bestsad) {
- bestsad = sad_array[j];
- best_site = j;
- }
- }
- }
- } else {
- for (j = 0; j < 4; j++) {
- this_row_offset = ref_mv->as_mv.row + neighbors[j].row;
- this_col_offset = ref_mv->as_mv.col + neighbors[j].col;
-
- if ((this_col_offset > x->mv_col_min) && (this_col_offset < x->mv_col_max) &&
- (this_row_offset > x->mv_row_min) && (this_row_offset < x->mv_row_max)) {
- check_here = (neighbors[j].row) * in_what_stride + neighbors[j].col + best_address;
- thissad = fn_ptr->sdf(what, what_stride, check_here, in_what_stride, bestsad);
-
- if (thissad < bestsad) {
- this_mv.as_mv.row = this_row_offset;
- this_mv.as_mv.col = this_col_offset;
- thissad += mvsad_err_cost(&this_mv, &fcenter_mv, MVSADCOSTS, error_per_bit);
-
- if (thissad < bestsad) {
- bestsad = thissad;
- best_site = j;
- }
- }
- }
- }
- }
-
- if (best_site == -1)
- break;
- else {
- ref_mv->as_mv.row += neighbors[best_site].row;
- ref_mv->as_mv.col += neighbors[best_site].col;
- best_address += (neighbors[best_site].row) * in_what_stride + neighbors[best_site].col;
- }
- }
-
- this_mv.as_mv.row = ref_mv->as_mv.row << 3;
- this_mv.as_mv.col = ref_mv->as_mv.col << 3;
-
- if (bestsad < INT_MAX)
- return
- fn_ptr->vf(what, what_stride, best_address, in_what_stride,
- (unsigned int *)(&thissad)) +
- mv_err_cost(&this_mv, center_mv, MVCOSTS, x->errorperbit,
- xd->allow_high_precision_mv);
- else
- return INT_MAX;
-}
-
-
-
-#ifdef ENTROPY_STATS
-void print_mode_context(void) {
- FILE *f = fopen("modecont.c", "a");
- int i, j;
-
- fprintf(f, "#include \"entropy.h\"\n");
- fprintf(f, "const int vp9_mode_contexts[6][4] =");
- fprintf(f, "{\n");
- for (j = 0; j < 6; j++) {
- fprintf(f, " {/* %d */ ", j);
- fprintf(f, " ");
- for (i = 0; i < 4; i++) {
- int this_prob;
- int count;
-
- // context probs
- count = mv_ref_ct[j][i][0] + mv_ref_ct[j][i][1];
- if (count)
- this_prob = 256 * mv_ref_ct[j][i][0] / count;
- else
- this_prob = 128;
-
- if (this_prob == 0)
- this_prob = 1;
- fprintf(f, "%5d, ", this_prob);
- }
- fprintf(f, " },\n");
- }
-
- fprintf(f, "};\n");
- fclose(f);
-}
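
Each dumped context probability above is just the zero-branch count scaled to an 8-bit probability, with 128 as the no-data default and 1 as the floor; for instance:

/* Probability derivation used when dumping mode contexts: scale the
 * zero-branch count to [0, 256), default to 128 with no data, floor at 1. */
static int context_prob(int count0, int count1) {
  int count = count0 + count1;
  int prob = count ? 256 * count0 / count : 128;
  return prob == 0 ? 1 : prob;
}
/* context_prob(30, 10) == 192; context_prob(0, 0) == 128. */
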
-
-/* MV ref count ENTROPY_STATS stats code */
-void init_mv_ref_counts() {
- vpx_memset(mv_ref_ct, 0, sizeof(mv_ref_ct));
- vpx_memset(mv_mode_cts, 0, sizeof(mv_mode_cts));
-}
-
-void accum_mv_refs(MB_PREDICTION_MODE m, const int ct[4]) {
- if (m == ZEROMV) {
- ++mv_ref_ct [ct[0]] [0] [0];
- ++mv_mode_cts[0][0];
- } else {
- ++mv_ref_ct [ct[0]] [0] [1];
- ++mv_mode_cts[0][1];
-
- if (m == NEARESTMV) {
- ++mv_ref_ct [ct[1]] [1] [0];
- ++mv_mode_cts[1][0];
- } else {
- ++mv_ref_ct [ct[1]] [1] [1];
- ++mv_mode_cts[1][1];
-
- if (m == NEARMV) {
- ++mv_ref_ct [ct[2]] [2] [0];
- ++mv_mode_cts[2][0];
- } else {
- ++mv_ref_ct [ct[2]] [2] [1];
- ++mv_mode_cts[2][1];
-
- if (m == NEWMV) {
- ++mv_ref_ct [ct[3]] [3] [0];
- ++mv_mode_cts[3][0];
- } else {
- ++mv_ref_ct [ct[3]] [3] [1];
- ++mv_mode_cts[3][1];
- }
- }
- }
- }
-}
-
-#endif  /* END MV ref count ENTROPY_STATS stats code */
--- a/vp8/encoder/mcomp.h
+++ /dev/null
@@ -1,159 +1,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#ifndef __INC_MCOMP_H
-#define __INC_MCOMP_H
-
-#include "block.h"
-#include "variance.h"
-
-#define MVCOSTS mvjcost, mvcost
-#define MVSADCOSTS mvjsadcost, mvsadcost
-#define DEC_MVCOSTS int *mvjcost, int *mvcost[2]
-#define DEC_MVSADCOSTS int *mvjsadcost, int *mvsadcost[2]
-#define NULLMVCOST NULL, NULL
-#define XMVCOST x->nmvjointcost, (x->e_mbd.allow_high_precision_mv?x->nmvcost_hp:x->nmvcost)
-
-#ifdef ENTROPY_STATS
-extern void init_mv_ref_counts();
-extern void accum_mv_refs(MB_PREDICTION_MODE, const int near_mv_ref_cts[4]);
-#endif
-
-
-#define MAX_MVSEARCH_STEPS 8 // The maximum number of steps in a step search given the largest allowed initial step
-#define MAX_FULL_PEL_VAL ((1 << (MAX_MVSEARCH_STEPS)) - 1) // Max full pel mv specified in 1 pel units
-#define MAX_FIRST_STEP (1 << (MAX_MVSEARCH_STEPS-1)) // Maximum size of the first step in full pel units
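
With MAX_MVSEARCH_STEPS at 8, these limits work out to a 255 full-pel maximum vector and a 128-pel first step; a quick self-check of that arithmetic:

#include <assert.h>

/* Sanity check of the derived search limits for 8 step-search steps. */
enum { STEPS = 8,
       FULL_PEL_MAX = (1 << STEPS) - 1,    /* 255 full-pel units */
       FIRST_STEP   = 1 << (STEPS - 1) };  /* 128 full-pel units */

int main(void) {
  assert(FULL_PEL_MAX == 255 && FIRST_STEP == 128);
  return 0;
}
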
-
-extern void vp9_clamp_mv_min_max(MACROBLOCK *x, int_mv *ref_mv);
-extern int vp9_mv_bit_cost(int_mv *mv, int_mv *ref, DEC_MVCOSTS,
- int Weight, int ishp);
-extern void vp9_init_dsmotion_compensation(MACROBLOCK *x, int stride);
-extern void vp9_init3smotion_compensation(MACROBLOCK *x, int stride);
-// Runs a sequence of diamond searches at progressively smaller step sizes for RD
-struct VP9_COMP;
-int vp9_full_pixel_diamond(struct VP9_COMP *cpi, MACROBLOCK *x, BLOCK *b,
- BLOCKD *d, int_mv *mvp_full, int step_param,
- int sadpb, int further_steps, int do_refine,
- vp9_variance_fn_ptr_t *fn_ptr,
- int_mv *ref_mv, int_mv *dst_mv);
-
-extern int vp9_hex_search
-(
- MACROBLOCK *x,
- BLOCK *b,
- BLOCKD *d,
- int_mv *ref_mv,
- int_mv *best_mv,
- int search_param,
- int error_per_bit,
- const vp9_variance_fn_ptr_t *vf,
- DEC_MVSADCOSTS,
- DEC_MVCOSTS,
- int_mv *center_mv
-);
-
-typedef int (fractional_mv_step_fp)
-(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *bestmv, int_mv *ref_mv,
- int error_per_bit, const vp9_variance_fn_ptr_t *vfp, DEC_MVCOSTS,
- int *distortion, unsigned int *sse);
-extern fractional_mv_step_fp vp9_find_best_sub_pixel_step_iteratively;
-extern fractional_mv_step_fp vp9_find_best_sub_pixel_step;
-extern fractional_mv_step_fp vp9_find_best_half_pixel_step;
-
-#define prototype_full_search_sad(sym)\
- int (sym)\
- (\
- MACROBLOCK *x, \
- BLOCK *b, \
- BLOCKD *d, \
- int_mv *ref_mv, \
- int sad_per_bit, \
- int distance, \
- vp9_variance_fn_ptr_t *fn_ptr, \
- DEC_MVSADCOSTS, \
- int_mv *center_mv \
- )
-
-#define prototype_refining_search_sad(sym)\
- int (sym)\
- (\
- MACROBLOCK *x, \
- BLOCK *b, \
- BLOCKD *d, \
- int_mv *ref_mv, \
- int sad_per_bit, \
- int distance, \
- vp9_variance_fn_ptr_t *fn_ptr, \
- DEC_MVSADCOSTS, \
- int_mv *center_mv \
- )
-
-#define prototype_diamond_search_sad(sym)\
- int (sym)\
- (\
- MACROBLOCK *x, \
- BLOCK *b, \
- BLOCKD *d, \
- int_mv *ref_mv, \
- int_mv *best_mv, \
- int search_param, \
- int sad_per_bit, \
- int *num00, \
- vp9_variance_fn_ptr_t *fn_ptr, \
- DEC_MVSADCOSTS, \
- int_mv *center_mv \
- )
-
-#if ARCH_X86 || ARCH_X86_64
-#include "x86/mcomp_x86.h"
-#endif
-
-typedef prototype_full_search_sad(*vp9_full_search_fn_t);
-extern prototype_full_search_sad(vp9_full_search_sad);
-extern prototype_full_search_sad(vp9_full_search_sadx3);
-extern prototype_full_search_sad(vp9_full_search_sadx8);
-
-typedef prototype_refining_search_sad(*vp9_refining_search_fn_t);
-extern prototype_refining_search_sad(vp9_refining_search_sad);
-extern prototype_refining_search_sad(vp9_refining_search_sadx4);
-
-typedef prototype_diamond_search_sad(*vp9_diamond_search_fn_t);
-extern prototype_diamond_search_sad(vp9_diamond_search_sad);
-extern prototype_diamond_search_sad(vp9_diamond_search_sadx4);
-
-#ifndef vp9_search_full_search
-#define vp9_search_full_search vp9_full_search_sad
-#endif
-extern prototype_full_search_sad(vp9_search_full_search);
-
-#ifndef vp9_search_refining_search
-#define vp9_search_refining_search vp9_refining_search_sad
-#endif
-extern prototype_refining_search_sad(vp9_search_refining_search);
-
-#ifndef vp9_search_diamond_search
-#define vp9_search_diamond_search vp9_diamond_search_sad
-#endif
-extern prototype_diamond_search_sad(vp9_search_diamond_search);
-
-typedef struct {
- prototype_full_search_sad(*full_search);
- prototype_refining_search_sad(*refining_search);
- prototype_diamond_search_sad(*diamond_search);
-} vp9_search_rtcd_vtable_t;
-
-#if CONFIG_RUNTIME_CPU_DETECT
-#define SEARCH_INVOKE(ctx,fn) (ctx)->fn
-#else
-#define SEARCH_INVOKE(ctx,fn) vp9_search_##fn
-#endif
-
-#endif
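
The prototype_*/SEARCH_INVOKE machinery in this header is the usual runtime-CPU-detection idiom: one prototype macro declares every flavour with the same signature, a vtable carries the chosen pointers when CONFIG_RUNTIME_CPU_DETECT is on, and the invoke macro collapses to a fixed default symbol otherwise. A stripped-down sketch of the same pattern, with illustrative names only:

/* One prototype macro keeps every flavour's signature identical. */
#define prototype_search(sym) int (sym)(const unsigned char *src, int stride)

prototype_search(search_sad_c);             /* plain C flavour */
prototype_search(search_sad_sse2);          /* SIMD flavour    */

#ifndef search_full_search                  /* default binding; a platform */
#define search_full_search search_sad_c     /* header may override it      */
#endif

typedef struct {
  prototype_search(*full_search);           /* filled in after CPU probing */
} search_vtable_t;

#if CONFIG_RUNTIME_CPU_DETECT
#define SEARCH_INVOKE(ctx, fn) (ctx)->fn    /* indirect, per-CPU choice */
#else
#define SEARCH_INVOKE(ctx, fn) search_##fn  /* expands to the default   */
#endif
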
--- a/vp8/encoder/modecosts.c
+++ /dev/null
@@ -1,49 +1,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include "vp8/common/blockd.h"
-#include "onyx_int.h"
-#include "treewriter.h"
-#include "vp8/common/entropymode.h"
-
-
-void vp9_init_mode_costs(VP9_COMP *c) {
- VP9_COMMON *x = &c->common;
- const vp9_tree_p T = vp9_bmode_tree;
- int i, j;
-
- for (i = 0; i < VP9_BINTRAMODES; i++) {
- for (j = 0; j < VP9_BINTRAMODES; j++) {
- vp9_cost_tokens((int *)c->mb.bmode_costs[i][j],
- x->kf_bmode_prob[i][j], T);
- }
- }
-
- vp9_cost_tokens((int *)c->mb.inter_bmode_costs, x->fc.bmode_prob, T);
- vp9_cost_tokens((int *)c->mb.inter_bmode_costs,
- x->fc.sub_mv_ref_prob[0], vp9_sub_mv_ref_tree);
-
- vp9_cost_tokens(c->mb.mbmode_cost[1], x->fc.ymode_prob, vp9_ymode_tree);
- vp9_cost_tokens(c->mb.mbmode_cost[0],
- x->kf_ymode_prob[c->common.kf_ymode_probs_index],
- vp9_kf_ymode_tree);
- vp9_cost_tokens(c->mb.intra_uv_mode_cost[1],
- x->fc.uv_mode_prob[VP9_YMODES - 1], vp9_uv_mode_tree);
- vp9_cost_tokens(c->mb.intra_uv_mode_cost[0],
- x->kf_uv_mode_prob[VP9_YMODES - 1], vp9_uv_mode_tree);
- vp9_cost_tokens(c->mb.i8x8_mode_costs,
- x->fc.i8x8_mode_prob, vp9_i8x8_mode_tree);
-
- for (i = 0; i <= VP9_SWITCHABLE_FILTERS; ++i)
- vp9_cost_tokens((int *)c->mb.switchable_interp_costs[i],
- x->fc.switchable_interp_prob[i],
- vp9_switchable_interp_tree);
-}
--- a/vp8/encoder/modecosts.h
+++ /dev/null
@@ -1,17 +1,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#ifndef __INC_MODECOSTS_H
-#define __INC_MODECOSTS_H
-
-void vp9_init_mode_costs(VP9_COMP *x);
-
-#endif
--- a/vp8/encoder/onyx_if.c
+++ /dev/null
@@ -1,4486 +1,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include "vpx_config.h"
-#include "vp8/common/onyxc_int.h"
-#include "onyx_int.h"
-#include "vp8/common/systemdependent.h"
-#include "quantize.h"
-#include "vp8/common/alloccommon.h"
-#include "mcomp.h"
-#include "firstpass.h"
-#include "psnr.h"
-#include "vpx_scale/vpxscale.h"
-#include "vp8/common/extend.h"
-#include "ratectrl.h"
-#include "vp8/common/quant_common.h"
-#include "segmentation.h"
-#include "vpx_scale/yv12extend.h"
-#if CONFIG_POSTPROC
-#include "vp8/common/postproc.h"
-#endif
-#include "vpx_mem/vpx_mem.h"
-#include "vp8/common/swapyv12buffer.h"
-#include "vpx_ports/vpx_timer.h"
-#include "temporal_filter.h"
-
-#include "vp8/common/seg_common.h"
-#include "mbgraph.h"
-#include "vp8/common/pred_common.h"
-#include "vp8/encoder/rdopt.h"
-#include "bitstream.h"
-#include "ratectrl.h"
-
-#if CONFIG_NEWBESTREFMV
-#include "vp8/common/mvref_common.h"
-#endif
-
-#if ARCH_ARM
-#include "vpx_ports/arm.h"
-#endif
-
-#include <math.h>
-#include <stdio.h>
-#include <limits.h>
-
-#if CONFIG_RUNTIME_CPU_DETECT
-#define IF_RTCD(x) (x)
-#define RTCD(x) &cpi->common.rtcd.x
-#else
-#define IF_RTCD(x) NULL
-#define RTCD(x) NULL
-#endif
-
-extern void vp9_pick_filter_level_fast(YV12_BUFFER_CONFIG *sd, VP9_COMP *cpi);
-
-extern void vp9_set_alt_lf_level(VP9_COMP *cpi, int filt_val);
-
-extern void vp9_pick_filter_level(YV12_BUFFER_CONFIG *sd, VP9_COMP *cpi);
-
-extern void vp9_cmachine_specific_config(VP9_COMP *cpi);
-
-extern void vp9_deblock_frame(YV12_BUFFER_CONFIG *source,
- YV12_BUFFER_CONFIG *post,
- int filt_lvl, int low_var_thresh, int flag);
-
-extern void print_tree_update_probs();
-
-#if HAVE_ARMV7
-extern void vp8_yv12_copy_frame_func_neon(YV12_BUFFER_CONFIG *src_ybc,
- YV12_BUFFER_CONFIG *dst_ybc);
-
-extern void vp8_yv12_copy_src_frame_func_neon(YV12_BUFFER_CONFIG *src_ybc,
- YV12_BUFFER_CONFIG *dst_ybc);
-#endif
-
-int vp9_calc_ss_err(YV12_BUFFER_CONFIG *source, YV12_BUFFER_CONFIG *dest);
-
-extern void vp9_temporal_filter_prepare_c(VP9_COMP *cpi, int distance);
-
-static void set_default_lf_deltas(VP9_COMP *cpi);
-
-#define DEFAULT_INTERP_FILTER EIGHTTAP /* SWITCHABLE for better performance */
-#define SEARCH_BEST_FILTER 0 /* to search exhaustively for
- best filter */
-#define RESET_FOREACH_FILTER 0 /* whether to reset the encoder state
- before trying each new filter */
-#define SHARP_FILTER_QTHRESH 0 /* Q threshold for 8-tap sharp filter */
-
-#define ALTREF_HIGH_PRECISION_MV 1 /* whether to use high precision mv
- for altref computation */
-#define HIGH_PRECISION_MV_QTHRESH 200 /* Q threshold for use of high precision
- mv. Choose a very high value for
- now so that HIGH_PRECISION is always
- chosen */
-
-#if CONFIG_INTERNAL_STATS
-#include "math.h"
-
-extern double vp9_calc_ssim(YV12_BUFFER_CONFIG *source,
- YV12_BUFFER_CONFIG *dest, int lumamask,
- double *weight);
-
-
-extern double vp9_calc_ssimg(YV12_BUFFER_CONFIG *source,
- YV12_BUFFER_CONFIG *dest, double *ssim_y,
- double *ssim_u, double *ssim_v);
-
-
-#endif
-
-// #define OUTPUT_YUV_REC
-
-#ifdef OUTPUT_YUV_SRC
-FILE *yuv_file;
-#endif
-#ifdef OUTPUT_YUV_REC
-FILE *yuv_rec_file;
-#endif
-
-#if 0
-FILE *framepsnr;
-FILE *kf_list;
-FILE *keyfile;
-#endif
-
-#if 0
-extern int skip_true_count;
-extern int skip_false_count;
-#endif
-
-
-#ifdef ENTROPY_STATS
-extern int intra_mode_stats[VP9_BINTRAMODES][VP9_BINTRAMODES][VP9_BINTRAMODES];
-#endif
-
-#ifdef NMV_STATS
-extern void init_nmvstats();
-extern void print_nmvstats();
-#endif
-
-#ifdef SPEEDSTATS
-unsigned int frames_at_speed[16] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
-#endif
-
-#if defined(SECTIONBITS_OUTPUT)
-extern unsigned __int64 Sectionbits[500];
-#endif
-#ifdef MODE_STATS
-extern INT64 Sectionbits[500];
-extern unsigned int y_modes[VP9_YMODES];
-extern unsigned int i8x8_modes[VP9_I8X8_MODES];
-extern unsigned int uv_modes[VP9_UV_MODES];
-extern unsigned int uv_modes_y[VP9_YMODES][VP9_UV_MODES];
-extern unsigned int b_modes[B_MODE_COUNT];
-extern unsigned int inter_y_modes[MB_MODE_COUNT];
-extern unsigned int inter_uv_modes[VP9_UV_MODES];
-extern unsigned int inter_b_modes[B_MODE_COUNT];
-#endif
-
-extern void vp9_init_quantizer(VP9_COMP *cpi);
-
-static int base_skip_false_prob[QINDEX_RANGE][3];
-
-// Tables relating active max Q to active min Q
-static int kf_low_motion_minq[QINDEX_RANGE];
-static int kf_high_motion_minq[QINDEX_RANGE];
-static int gf_low_motion_minq[QINDEX_RANGE];
-static int gf_high_motion_minq[QINDEX_RANGE];
-static int inter_minq[QINDEX_RANGE];
-
-// Functions to compute the active minq lookup table entries based on a
-// formulaic approach to facilitate easier adjustment of the Q tables.
-// The formulae were derived from computing a 3rd order polynomial best
-// fit to the original data (after plotting real maxq vs minq (not q index))
-static int calculate_minq_index(double maxq,
- double x3, double x2, double x, double c) {
- int i;
- double minqtarget;
- double thisq;
-
- minqtarget = ((x3 * maxq * maxq * maxq) +
- (x2 * maxq * maxq) +
- (x * maxq) +
- c);
-
- if (minqtarget > maxq)
- minqtarget = maxq;
-
- for (i = 0; i < QINDEX_RANGE; i++) {
- thisq = vp9_convert_qindex_to_q(i);
- if (minqtarget <= vp9_convert_qindex_to_q(i))
- return i;
- }
- return QINDEX_RANGE - 1;
-}
-
-static void init_minq_luts(void) {
- int i;
- double maxq;
-
- for (i = 0; i < QINDEX_RANGE; i++) {
- maxq = vp9_convert_qindex_to_q(i);
-
-
- kf_low_motion_minq[i] = calculate_minq_index(maxq,
- 0.0000003,
- -0.000015,
- 0.074,
- 0.0);
- kf_high_motion_minq[i] = calculate_minq_index(maxq,
- 0.0000004,
- -0.000125,
- 0.14,
- 0.0);
- gf_low_motion_minq[i] = calculate_minq_index(maxq,
- 0.0000015,
- -0.0009,
- 0.33,
- 0.0);
- gf_high_motion_minq[i] = calculate_minq_index(maxq,
- 0.0000021,
- -0.00125,
- 0.45,
- 0.0);
- inter_minq[i] = calculate_minq_index(maxq,
- 0.00000271,
- -0.00113,
- 0.697,
- 0.0);
-
- }
-}
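
Each entry of these tables maps a real maximum Q to the smallest q index whose real Q reaches a fitted cubic target. A minimal sketch of that mapping, with a hypothetical q_from_index() standing in for vp9_convert_qindex_to_q():

/* Evaluate the fitted cubic at maxq, clamp the target to maxq, then return
 * the first q index whose real Q value reaches it. */
static int minq_index_sketch(double maxq, double x3, double x2, double x1,
                             double c, int range, double (*q_from_index)(int)) {
  double target = x3 * maxq * maxq * maxq + x2 * maxq * maxq + x1 * maxq + c;
  int i;

  if (target > maxq)
    target = maxq;
  for (i = 0; i < range; i++)
    if (q_from_index(i) >= target)
      return i;
  return range - 1;
}
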
-
-static void init_base_skip_probs(void) {
- int i;
- double q;
- int skip_prob, t;
-
- for (i = 0; i < QINDEX_RANGE; i++) {
- q = vp9_convert_qindex_to_q(i);
-
-    // Exponential decay calculation of baseline skip prob with clamping
- // Based on crude best fit of old table.
- t = (int)(564.25 * pow(2.71828, (-0.012 * q)));
-
- skip_prob = t;
- if (skip_prob < 1)
- skip_prob = 1;
- else if (skip_prob > 255)
- skip_prob = 255;
- base_skip_false_prob[i][1] = skip_prob;
-
- skip_prob = t * 0.75;
- if (skip_prob < 1)
- skip_prob = 1;
- else if (skip_prob > 255)
- skip_prob = 255;
- base_skip_false_prob[i][2] = skip_prob;
-
- skip_prob = t * 1.25;
- if (skip_prob < 1)
- skip_prob = 1;
- else if (skip_prob > 255)
- skip_prob = 255;
- base_skip_false_prob[i][0] = skip_prob;
- }
-}
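
The baseline skip probability above is an exponential decay in real Q, scaled per skip context and clamped to the legal [1, 255] probability range; restated compactly (same 564.25 * e^(-0.012*q) fit):

#include <math.h>

/* Clamp a probability to the 8-bit range used by the entropy coder. */
static int clamp_prob(double p) {
  if (p < 1) return 1;
  if (p > 255) return 255;
  return (int)p;
}

/* Baseline "not skipped" probability for one q index's real Q value,
 * scaled by 1.25 / 1.0 / 0.75 for the three skip contexts. */
static void base_skip_probs_for_q(double q, int probs[3]) {
  double t = 564.25 * exp(-0.012 * q);
  probs[0] = clamp_prob(t * 1.25);
  probs[1] = clamp_prob(t);
  probs[2] = clamp_prob(t * 0.75);
}
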
-
-static void update_base_skip_probs(VP9_COMP *cpi) {
- VP9_COMMON *cm = &cpi->common;
-
- if (cm->frame_type != KEY_FRAME) {
- vp9_update_skip_probs(cpi);
-
- if (cm->refresh_alt_ref_frame) {
- int k;
- for (k = 0; k < MBSKIP_CONTEXTS; ++k)
- cpi->last_skip_false_probs[2][k] = cm->mbskip_pred_probs[k];
- cpi->last_skip_probs_q[2] = cm->base_qindex;
- } else if (cpi->common.refresh_golden_frame) {
- int k;
- for (k = 0; k < MBSKIP_CONTEXTS; ++k)
- cpi->last_skip_false_probs[1][k] = cm->mbskip_pred_probs[k];
- cpi->last_skip_probs_q[1] = cm->base_qindex;
- } else {
- int k;
- for (k = 0; k < MBSKIP_CONTEXTS; ++k)
- cpi->last_skip_false_probs[0][k] = cm->mbskip_pred_probs[k];
- cpi->last_skip_probs_q[0] = cm->base_qindex;
-
- // update the baseline table for the current q
- for (k = 0; k < MBSKIP_CONTEXTS; ++k)
- cpi->base_skip_false_prob[cm->base_qindex][k] =
- cm->mbskip_pred_probs[k];
- }
- }
-
-}
-
-void vp9_initialize_enc() {
- static int init_done = 0;
-
- if (!init_done) {
- vp8_scale_machine_specific_config();
- vp9_initialize_common();
- vp9_tokenize_initialize();
- vp9_init_quant_tables();
- vp9_init_me_luts();
- init_minq_luts();
- init_base_skip_probs();
- init_done = 1;
- }
-}
-#ifdef PACKET_TESTING
-extern FILE *vpxlogc;
-#endif
-
-static void setup_features(VP9_COMP *cpi) {
- MACROBLOCKD *xd = &cpi->mb.e_mbd;
-
- // Set up default state for MB feature flags
-
- xd->segmentation_enabled = 0; // Default segmentation disabled
-
- xd->update_mb_segmentation_map = 0;
- xd->update_mb_segmentation_data = 0;
- vpx_memset(xd->mb_segment_tree_probs, 255, sizeof(xd->mb_segment_tree_probs));
-
- vp9_clearall_segfeatures(xd);
-
- xd->mode_ref_lf_delta_enabled = 0;
- xd->mode_ref_lf_delta_update = 0;
- vpx_memset(xd->ref_lf_deltas, 0, sizeof(xd->ref_lf_deltas));
- vpx_memset(xd->mode_lf_deltas, 0, sizeof(xd->mode_lf_deltas));
- vpx_memset(xd->last_ref_lf_deltas, 0, sizeof(xd->ref_lf_deltas));
- vpx_memset(xd->last_mode_lf_deltas, 0, sizeof(xd->mode_lf_deltas));
-
- set_default_lf_deltas(cpi);
-
-}
-
-
-static void dealloc_compressor_data(VP9_COMP *cpi) {
- vpx_free(cpi->tplist);
- cpi->tplist = NULL;
-
- // Delete last frame MV storage buffers
- vpx_free(cpi->lfmv);
- cpi->lfmv = 0;
-
- vpx_free(cpi->lf_ref_frame_sign_bias);
- cpi->lf_ref_frame_sign_bias = 0;
-
- vpx_free(cpi->lf_ref_frame);
- cpi->lf_ref_frame = 0;
-
-  // Delete segmentation map
- vpx_free(cpi->segmentation_map);
- cpi->segmentation_map = 0;
- vpx_free(cpi->common.last_frame_seg_map);
- cpi->common.last_frame_seg_map = 0;
- vpx_free(cpi->coding_context.last_frame_seg_map_copy);
- cpi->coding_context.last_frame_seg_map_copy = 0;
-
- vpx_free(cpi->active_map);
- cpi->active_map = 0;
-
- vp9_de_alloc_frame_buffers(&cpi->common);
-
- vp8_yv12_de_alloc_frame_buffer(&cpi->last_frame_uf);
- vp8_yv12_de_alloc_frame_buffer(&cpi->scaled_source);
-#if VP9_TEMPORAL_ALT_REF
- vp8_yv12_de_alloc_frame_buffer(&cpi->alt_ref_buffer);
-#endif
- vp9_lookahead_destroy(cpi->lookahead);
-
- vpx_free(cpi->tok);
- cpi->tok = 0;
-
- // Structure used to monitor GF usage
- vpx_free(cpi->gf_active_flags);
- cpi->gf_active_flags = 0;
-
- // Activity mask based per mb zbin adjustments
- vpx_free(cpi->mb_activity_map);
- cpi->mb_activity_map = 0;
- vpx_free(cpi->mb_norm_activity_map);
- cpi->mb_norm_activity_map = 0;
-
- vpx_free(cpi->mb.pip);
- cpi->mb.pip = 0;
-
- vpx_free(cpi->twopass.total_stats);
- cpi->twopass.total_stats = 0;
-
- vpx_free(cpi->twopass.total_left_stats);
- cpi->twopass.total_left_stats = 0;
-
- vpx_free(cpi->twopass.this_frame_stats);
- cpi->twopass.this_frame_stats = 0;
-}
-
-// Computes a q delta (in "q index" terms) to get from a starting q value
-// to a target q value
-static int compute_qdelta(VP9_COMP *cpi, double qstart, double qtarget) {
- int i;
- int start_index = cpi->worst_quality;
- int target_index = cpi->worst_quality;
-
- // Convert the average q value to an index.
- for (i = cpi->best_quality; i < cpi->worst_quality; i++) {
- start_index = i;
- if (vp9_convert_qindex_to_q(i) >= qstart)
- break;
- }
-
- // Convert the q target to an index
- for (i = cpi->best_quality; i < cpi->worst_quality; i++) {
- target_index = i;
- if (vp9_convert_qindex_to_q(i) >= qtarget)
- break;
- }
-
- return target_index - start_index;
-}
-
-static void init_seg_features(VP9_COMP *cpi) {
- VP9_COMMON *cm = &cpi->common;
- MACROBLOCKD *xd = &cpi->mb.e_mbd;
-
- int high_q = (int)(cpi->avg_q > 48.0);
- int qi_delta;
-
- // Disable and clear down for KF
- if (cm->frame_type == KEY_FRAME) {
- // Clear down the global segmentation map
- vpx_memset(cpi->segmentation_map, 0, (cm->mb_rows * cm->mb_cols));
- xd->update_mb_segmentation_map = 0;
- xd->update_mb_segmentation_data = 0;
- cpi->static_mb_pct = 0;
-
- // Disable segmentation
- vp9_disable_segmentation((VP9_PTR)cpi);
-
- // Clear down the segment features.
- vp9_clearall_segfeatures(xd);
- }
-
- // If this is an alt ref frame
- else if (cm->refresh_alt_ref_frame) {
- // Clear down the global segmentation map
- vpx_memset(cpi->segmentation_map, 0, (cm->mb_rows * cm->mb_cols));
- xd->update_mb_segmentation_map = 0;
- xd->update_mb_segmentation_data = 0;
- cpi->static_mb_pct = 0;
-
- // Disable segmentation and individual segment features by default
- vp9_disable_segmentation((VP9_PTR)cpi);
- vp9_clearall_segfeatures(xd);
-
- // Scan frames from current to arf frame.
- // This function re-enables segmentation if appropriate.
- vp9_update_mbgraph_stats(cpi);
-
- // If segmentation was enabled set those features needed for the
- // arf itself.
- if (xd->segmentation_enabled) {
- xd->update_mb_segmentation_map = 1;
- xd->update_mb_segmentation_data = 1;
-
- qi_delta = compute_qdelta(cpi, cpi->avg_q, (cpi->avg_q * 0.875));
- vp9_set_segdata(xd, 1, SEG_LVL_ALT_Q, (qi_delta - 2));
- vp9_set_segdata(xd, 1, SEG_LVL_ALT_LF, -2);
-
- vp9_enable_segfeature(xd, 1, SEG_LVL_ALT_Q);
- vp9_enable_segfeature(xd, 1, SEG_LVL_ALT_LF);
-
- // Where relevant assume segment data is delta data
- xd->mb_segment_abs_delta = SEGMENT_DELTADATA;
-
- }
- }
- // All other frames if segmentation has been enabled
- else if (xd->segmentation_enabled) {
- // First normal frame in a valid gf or alt ref group
- if (cpi->common.frames_since_golden == 0) {
-      // Set up segment features for normal frames in an arf group
- if (cpi->source_alt_ref_active) {
- xd->update_mb_segmentation_map = 0;
- xd->update_mb_segmentation_data = 1;
- xd->mb_segment_abs_delta = SEGMENT_DELTADATA;
-
- qi_delta = compute_qdelta(cpi, cpi->avg_q,
- (cpi->avg_q * 1.125));
- vp9_set_segdata(xd, 1, SEG_LVL_ALT_Q, (qi_delta + 2));
- vp9_set_segdata(xd, 1, SEG_LVL_ALT_Q, 0);
- vp9_enable_segfeature(xd, 1, SEG_LVL_ALT_Q);
-
- vp9_set_segdata(xd, 1, SEG_LVL_ALT_LF, -2);
- vp9_enable_segfeature(xd, 1, SEG_LVL_ALT_LF);
-
- // Segment coding disabled for compred testing
- if (high_q || (cpi->static_mb_pct == 100)) {
- // set_segref(xd, 1, LAST_FRAME);
- vp9_set_segref(xd, 1, ALTREF_FRAME);
- vp9_enable_segfeature(xd, 1, SEG_LVL_REF_FRAME);
-
- vp9_set_segdata(xd, 1, SEG_LVL_MODE, ZEROMV);
- vp9_enable_segfeature(xd, 1, SEG_LVL_MODE);
-
- // EOB segment coding not fixed for 8x8 yet
- vp9_set_segdata(xd, 1, SEG_LVL_EOB, 0);
- vp9_enable_segfeature(xd, 1, SEG_LVL_EOB);
- }
- }
- // Disable segmentation and clear down features if alt ref
- // is not active for this group
- else {
- vp9_disable_segmentation((VP9_PTR)cpi);
-
- vpx_memset(cpi->segmentation_map, 0,
- (cm->mb_rows * cm->mb_cols));
-
- xd->update_mb_segmentation_map = 0;
- xd->update_mb_segmentation_data = 0;
-
- vp9_clearall_segfeatures(xd);
- }
- }
-
- // Special case where we are coding over the top of a previous
- // alt ref frame
- // Segment coding disabled for compred testing
- else if (cpi->is_src_frame_alt_ref) {
- // Enable mode and ref frame features for segment 0 as well
- vp9_enable_segfeature(xd, 0, SEG_LVL_REF_FRAME);
- vp9_enable_segfeature(xd, 0, SEG_LVL_MODE);
- vp9_enable_segfeature(xd, 1, SEG_LVL_REF_FRAME);
- vp9_enable_segfeature(xd, 1, SEG_LVL_MODE);
-
- // All mbs should use ALTREF_FRAME, ZEROMV exclusively
- vp9_clear_segref(xd, 0);
- vp9_set_segref(xd, 0, ALTREF_FRAME);
- vp9_clear_segref(xd, 1);
- vp9_set_segref(xd, 1, ALTREF_FRAME);
- vp9_set_segdata(xd, 0, SEG_LVL_MODE, ZEROMV);
- vp9_set_segdata(xd, 1, SEG_LVL_MODE, ZEROMV);
-
- // Skip all MBs if high Q
- if (high_q) {
- vp9_enable_segfeature(xd, 0, SEG_LVL_EOB);
- vp9_set_segdata(xd, 0, SEG_LVL_EOB, 0);
- vp9_enable_segfeature(xd, 1, SEG_LVL_EOB);
- vp9_set_segdata(xd, 1, SEG_LVL_EOB, 0);
- }
-      // Enable data update
- xd->update_mb_segmentation_data = 1;
- }
- // All other frames.
- else {
- // No updates.. leave things as they are.
- xd->update_mb_segmentation_map = 0;
- xd->update_mb_segmentation_data = 0;
- }
- }
-}
-
-// DEBUG: Print out the segment id of each MB in the current frame.
-static void print_seg_map(VP9_COMP *cpi) {
- VP9_COMMON *cm = &cpi->common;
- int row, col;
- int map_index = 0;
- FILE *statsfile;
-
- statsfile = fopen("segmap.stt", "a");
-
- fprintf(statsfile, "%10d\n",
- cm->current_video_frame);
-
- for (row = 0; row < cpi->common.mb_rows; row++) {
- for (col = 0; col < cpi->common.mb_cols; col++) {
- fprintf(statsfile, "%10d",
- cpi->segmentation_map[map_index]);
- map_index++;
- }
- fprintf(statsfile, "\n");
- }
- fprintf(statsfile, "\n");
-
- fclose(statsfile);
-}
-
-static void update_reference_segmentation_map(VP9_COMP *cpi) {
- VP9_COMMON *cm = &cpi->common;
- int row, col, sb_rows = (cm->mb_rows + 1) >> 1, sb_cols = (cm->mb_cols + 1) >> 1;
- MODE_INFO *mi = cm->mi;
- uint8_t *segmap = cpi->segmentation_map;
- uint8_t *segcache = cm->last_frame_seg_map;
-
- for (row = 0; row < sb_rows; row++) {
- for (col = 0; col < sb_cols; col++) {
- MODE_INFO *miptr = mi + col * 2;
- uint8_t *cache = segcache + col * 2;
-#if CONFIG_SUPERBLOCKS
- if (miptr->mbmi.encoded_as_sb) {
- cache[0] = miptr->mbmi.segment_id;
- if (!(cm->mb_cols & 1) || col < sb_cols - 1)
- cache[1] = miptr->mbmi.segment_id;
- if (!(cm->mb_rows & 1) || row < sb_rows - 1) {
- cache[cm->mb_cols] = miptr->mbmi.segment_id;
- if (!(cm->mb_cols & 1) || col < sb_cols - 1)
- cache[cm->mb_cols + 1] = miptr->mbmi.segment_id;
- }
- } else
-#endif
- {
- cache[0] = miptr[0].mbmi.segment_id;
- if (!(cm->mb_cols & 1) || col < sb_cols - 1)
- cache[1] = miptr[1].mbmi.segment_id;
- if (!(cm->mb_rows & 1) || row < sb_rows - 1) {
- cache[cm->mb_cols] = miptr[cm->mode_info_stride].mbmi.segment_id;
- if (!(cm->mb_cols & 1) || col < sb_cols - 1)
-              cache[cm->mb_cols + 1] = miptr[cm->mode_info_stride + 1].mbmi.segment_id;
- }
- }
- }
- segmap += 2 * cm->mb_cols;
- segcache += 2 * cm->mb_cols;
- mi += 2 * cm->mode_info_stride;
- }
-}
-
-static void set_default_lf_deltas(VP9_COMP *cpi) {
- cpi->mb.e_mbd.mode_ref_lf_delta_enabled = 1;
- cpi->mb.e_mbd.mode_ref_lf_delta_update = 1;
-
- vpx_memset(cpi->mb.e_mbd.ref_lf_deltas, 0, sizeof(cpi->mb.e_mbd.ref_lf_deltas));
- vpx_memset(cpi->mb.e_mbd.mode_lf_deltas, 0, sizeof(cpi->mb.e_mbd.mode_lf_deltas));
-
- // Test of ref frame deltas
- cpi->mb.e_mbd.ref_lf_deltas[INTRA_FRAME] = 2;
- cpi->mb.e_mbd.ref_lf_deltas[LAST_FRAME] = 0;
- cpi->mb.e_mbd.ref_lf_deltas[GOLDEN_FRAME] = -2;
- cpi->mb.e_mbd.ref_lf_deltas[ALTREF_FRAME] = -2;
-
- cpi->mb.e_mbd.mode_lf_deltas[0] = 4; // BPRED
- cpi->mb.e_mbd.mode_lf_deltas[1] = -2; // Zero
- cpi->mb.e_mbd.mode_lf_deltas[2] = 2; // New mv
- cpi->mb.e_mbd.mode_lf_deltas[3] = 4; // Split mv
-}
-
-void vp9_set_speed_features(VP9_COMP *cpi) {
- SPEED_FEATURES *sf = &cpi->sf;
- int Mode = cpi->compressor_speed;
- int Speed = cpi->Speed;
- int i;
- VP9_COMMON *cm = &cpi->common;
-
-  // Only modes 0 and 1 supported for now in experimental code base
- if (Mode > 1)
- Mode = 1;
-
- // Initialise default mode frequency sampling variables
- for (i = 0; i < MAX_MODES; i ++) {
- cpi->mode_check_freq[i] = 0;
- cpi->mode_test_hit_counts[i] = 0;
- cpi->mode_chosen_counts[i] = 0;
- }
-
- // best quality defaults
- sf->RD = 1;
- sf->search_method = NSTEP;
- sf->improved_dct = 1;
- sf->auto_filter = 1;
- sf->recode_loop = 1;
- sf->quarter_pixel_search = 1;
- sf->half_pixel_search = 1;
- sf->iterative_sub_pixel = 1;
-#if CONFIG_LOSSLESS
- sf->optimize_coefficients = 0;
-#else
- sf->optimize_coefficients = 1;
-#endif
- sf->no_skip_block4x4_search = 1;
-
- sf->first_step = 0;
- sf->max_step_search_steps = MAX_MVSEARCH_STEPS;
- sf->improved_mv_pred = 1;
-
- // default thresholds to 0
- for (i = 0; i < MAX_MODES; i++)
- sf->thresh_mult[i] = 0;
-
- switch (Mode) {
- case 0: // best quality mode
-#if CONFIG_PRED_FILTER
- sf->thresh_mult[THR_ZEROMV ] = 0;
- sf->thresh_mult[THR_ZEROMV_FILT ] = 0;
- sf->thresh_mult[THR_ZEROG ] = 0;
- sf->thresh_mult[THR_ZEROG_FILT ] = 0;
- sf->thresh_mult[THR_ZEROA ] = 0;
- sf->thresh_mult[THR_ZEROA_FILT ] = 0;
- sf->thresh_mult[THR_NEARESTMV ] = 0;
- sf->thresh_mult[THR_NEARESTMV_FILT] = 0;
- sf->thresh_mult[THR_NEARESTG ] = 0;
- sf->thresh_mult[THR_NEARESTG_FILT ] = 0;
- sf->thresh_mult[THR_NEARESTA ] = 0;
- sf->thresh_mult[THR_NEARESTA_FILT ] = 0;
- sf->thresh_mult[THR_NEARMV ] = 0;
- sf->thresh_mult[THR_NEARMV_FILT ] = 0;
- sf->thresh_mult[THR_NEARG ] = 0;
- sf->thresh_mult[THR_NEARG_FILT ] = 0;
- sf->thresh_mult[THR_NEARA ] = 0;
- sf->thresh_mult[THR_NEARA_FILT ] = 0;
-
- sf->thresh_mult[THR_DC ] = 0;
-
- sf->thresh_mult[THR_V_PRED ] = 1000;
- sf->thresh_mult[THR_H_PRED ] = 1000;
- sf->thresh_mult[THR_D45_PRED ] = 1000;
- sf->thresh_mult[THR_D135_PRED] = 1000;
- sf->thresh_mult[THR_D117_PRED] = 1000;
- sf->thresh_mult[THR_D153_PRED] = 1000;
- sf->thresh_mult[THR_D27_PRED ] = 1000;
- sf->thresh_mult[THR_D63_PRED ] = 1000;
- sf->thresh_mult[THR_B_PRED ] = 2000;
- sf->thresh_mult[THR_I8X8_PRED] = 2000;
- sf->thresh_mult[THR_TM ] = 1000;
-
- sf->thresh_mult[THR_NEWMV ] = 1000;
- sf->thresh_mult[THR_NEWG ] = 1000;
- sf->thresh_mult[THR_NEWA ] = 1000;
- sf->thresh_mult[THR_NEWMV_FILT ] = 1000;
- sf->thresh_mult[THR_NEWG_FILT ] = 1000;
- sf->thresh_mult[THR_NEWA_FILT ] = 1000;
-#else
- sf->thresh_mult[THR_ZEROMV ] = 0;
- sf->thresh_mult[THR_ZEROG ] = 0;
- sf->thresh_mult[THR_ZEROA ] = 0;
- sf->thresh_mult[THR_NEARESTMV] = 0;
- sf->thresh_mult[THR_NEARESTG ] = 0;
- sf->thresh_mult[THR_NEARESTA ] = 0;
- sf->thresh_mult[THR_NEARMV ] = 0;
- sf->thresh_mult[THR_NEARG ] = 0;
- sf->thresh_mult[THR_NEARA ] = 0;
-
- sf->thresh_mult[THR_DC ] = 0;
-
- sf->thresh_mult[THR_V_PRED ] = 1000;
- sf->thresh_mult[THR_H_PRED ] = 1000;
- sf->thresh_mult[THR_D45_PRED ] = 1000;
- sf->thresh_mult[THR_D135_PRED] = 1000;
- sf->thresh_mult[THR_D117_PRED] = 1000;
- sf->thresh_mult[THR_D153_PRED] = 1000;
- sf->thresh_mult[THR_D27_PRED ] = 1000;
- sf->thresh_mult[THR_D63_PRED ] = 1000;
- sf->thresh_mult[THR_B_PRED ] = 2000;
- sf->thresh_mult[THR_I8X8_PRED] = 2000;
- sf->thresh_mult[THR_TM ] = 1000;
-
- sf->thresh_mult[THR_NEWMV ] = 1000;
- sf->thresh_mult[THR_NEWG ] = 1000;
- sf->thresh_mult[THR_NEWA ] = 1000;
-#endif
- sf->thresh_mult[THR_SPLITMV ] = 2500;
- sf->thresh_mult[THR_SPLITG ] = 5000;
- sf->thresh_mult[THR_SPLITA ] = 5000;
-
- sf->thresh_mult[THR_COMP_ZEROLG ] = 0;
- sf->thresh_mult[THR_COMP_NEARESTLG] = 0;
- sf->thresh_mult[THR_COMP_NEARLG ] = 0;
- sf->thresh_mult[THR_COMP_ZEROLA ] = 0;
- sf->thresh_mult[THR_COMP_NEARESTLA] = 0;
- sf->thresh_mult[THR_COMP_NEARLA ] = 0;
- sf->thresh_mult[THR_COMP_ZEROGA ] = 0;
- sf->thresh_mult[THR_COMP_NEARESTGA] = 0;
- sf->thresh_mult[THR_COMP_NEARGA ] = 0;
-
- sf->thresh_mult[THR_COMP_NEWLG ] = 1000;
- sf->thresh_mult[THR_COMP_NEWLA ] = 1000;
- sf->thresh_mult[THR_COMP_NEWGA ] = 1000;
-
- sf->thresh_mult[THR_COMP_SPLITLA ] = 2500;
- sf->thresh_mult[THR_COMP_SPLITGA ] = 5000;
- sf->thresh_mult[THR_COMP_SPLITLG ] = 5000;
-
- sf->first_step = 0;
- sf->max_step_search_steps = MAX_MVSEARCH_STEPS;
- sf->search_best_filter = SEARCH_BEST_FILTER;
- break;
- case 1:
-#if CONFIG_PRED_FILTER
- sf->thresh_mult[THR_NEARESTMV] = 0;
- sf->thresh_mult[THR_NEARESTMV_FILT] = 0;
- sf->thresh_mult[THR_ZEROMV ] = 0;
- sf->thresh_mult[THR_ZEROMV_FILT ] = 0;
- sf->thresh_mult[THR_DC ] = 0;
- sf->thresh_mult[THR_NEARMV ] = 0;
- sf->thresh_mult[THR_NEARMV_FILT ] = 0;
- sf->thresh_mult[THR_V_PRED ] = 1000;
- sf->thresh_mult[THR_H_PRED ] = 1000;
- sf->thresh_mult[THR_D45_PRED ] = 1000;
- sf->thresh_mult[THR_D135_PRED] = 1000;
- sf->thresh_mult[THR_D117_PRED] = 1000;
- sf->thresh_mult[THR_D153_PRED] = 1000;
- sf->thresh_mult[THR_D27_PRED ] = 1000;
- sf->thresh_mult[THR_D63_PRED ] = 1000;
- sf->thresh_mult[THR_B_PRED ] = 2500;
- sf->thresh_mult[THR_I8X8_PRED] = 2500;
- sf->thresh_mult[THR_TM ] = 1000;
-
- sf->thresh_mult[THR_NEARESTG ] = 1000;
- sf->thresh_mult[THR_NEARESTG_FILT ] = 1000;
- sf->thresh_mult[THR_NEARESTA ] = 1000;
- sf->thresh_mult[THR_NEARESTA_FILT ] = 1000;
-
- sf->thresh_mult[THR_ZEROG ] = 1000;
- sf->thresh_mult[THR_ZEROA ] = 1000;
- sf->thresh_mult[THR_NEARG ] = 1000;
- sf->thresh_mult[THR_NEARA ] = 1000;
- sf->thresh_mult[THR_ZEROG_FILT ] = 1000;
- sf->thresh_mult[THR_ZEROA_FILT ] = 1000;
- sf->thresh_mult[THR_NEARG_FILT ] = 1000;
- sf->thresh_mult[THR_NEARA_FILT ] = 1000;
-
- sf->thresh_mult[THR_ZEROMV ] = 0;
- sf->thresh_mult[THR_ZEROG ] = 0;
- sf->thresh_mult[THR_ZEROA ] = 0;
- sf->thresh_mult[THR_NEARESTMV] = 0;
- sf->thresh_mult[THR_NEARESTG ] = 0;
- sf->thresh_mult[THR_NEARESTA ] = 0;
- sf->thresh_mult[THR_NEARMV ] = 0;
- sf->thresh_mult[THR_NEARG ] = 0;
- sf->thresh_mult[THR_NEARA ] = 0;
- sf->thresh_mult[THR_ZEROMV_FILT ] = 0;
- sf->thresh_mult[THR_ZEROG_FILT ] = 0;
- sf->thresh_mult[THR_ZEROA_FILT ] = 0;
- sf->thresh_mult[THR_NEARESTMV_FILT] = 0;
- sf->thresh_mult[THR_NEARESTG_FILT ] = 0;
- sf->thresh_mult[THR_NEARESTA_FILT ] = 0;
- sf->thresh_mult[THR_NEARMV_FILT ] = 0;
- sf->thresh_mult[THR_NEARG_FILT ] = 0;
- sf->thresh_mult[THR_NEARA_FILT ] = 0;
-
- sf->thresh_mult[THR_NEWMV ] = 1000;
- sf->thresh_mult[THR_NEWG ] = 1000;
- sf->thresh_mult[THR_NEWA ] = 1000;
- sf->thresh_mult[THR_NEWMV_FILT ] = 1000;
- sf->thresh_mult[THR_NEWG_FILT ] = 1000;
- sf->thresh_mult[THR_NEWA_FILT ] = 1000;
-#else
- sf->thresh_mult[THR_NEARESTMV] = 0;
- sf->thresh_mult[THR_ZEROMV ] = 0;
- sf->thresh_mult[THR_DC ] = 0;
- sf->thresh_mult[THR_NEARMV ] = 0;
- sf->thresh_mult[THR_V_PRED ] = 1000;
- sf->thresh_mult[THR_H_PRED ] = 1000;
- sf->thresh_mult[THR_D45_PRED ] = 1000;
- sf->thresh_mult[THR_D135_PRED] = 1000;
- sf->thresh_mult[THR_D117_PRED] = 1000;
- sf->thresh_mult[THR_D153_PRED] = 1000;
- sf->thresh_mult[THR_D27_PRED ] = 1000;
- sf->thresh_mult[THR_D63_PRED ] = 1000;
- sf->thresh_mult[THR_B_PRED ] = 2500;
- sf->thresh_mult[THR_I8X8_PRED] = 2500;
- sf->thresh_mult[THR_TM ] = 1000;
-
- sf->thresh_mult[THR_NEARESTG ] = 1000;
- sf->thresh_mult[THR_NEARESTA ] = 1000;
-
- sf->thresh_mult[THR_ZEROG ] = 1000;
- sf->thresh_mult[THR_ZEROA ] = 1000;
- sf->thresh_mult[THR_NEARG ] = 1000;
- sf->thresh_mult[THR_NEARA ] = 1000;
-
- sf->thresh_mult[THR_ZEROMV ] = 0;
- sf->thresh_mult[THR_ZEROG ] = 0;
- sf->thresh_mult[THR_ZEROA ] = 0;
- sf->thresh_mult[THR_NEARESTMV] = 0;
- sf->thresh_mult[THR_NEARESTG ] = 0;
- sf->thresh_mult[THR_NEARESTA ] = 0;
- sf->thresh_mult[THR_NEARMV ] = 0;
- sf->thresh_mult[THR_NEARG ] = 0;
- sf->thresh_mult[THR_NEARA ] = 0;
-
- sf->thresh_mult[THR_NEWMV ] = 1000;
- sf->thresh_mult[THR_NEWG ] = 1000;
- sf->thresh_mult[THR_NEWA ] = 1000;
-#endif
- sf->thresh_mult[THR_SPLITMV ] = 1700;
- sf->thresh_mult[THR_SPLITG ] = 4500;
- sf->thresh_mult[THR_SPLITA ] = 4500;
-
- sf->thresh_mult[THR_COMP_ZEROLG ] = 0;
- sf->thresh_mult[THR_COMP_NEARESTLG] = 0;
- sf->thresh_mult[THR_COMP_NEARLG ] = 0;
- sf->thresh_mult[THR_COMP_ZEROLA ] = 0;
- sf->thresh_mult[THR_COMP_NEARESTLA] = 0;
- sf->thresh_mult[THR_COMP_NEARLA ] = 0;
- sf->thresh_mult[THR_COMP_ZEROGA ] = 0;
- sf->thresh_mult[THR_COMP_NEARESTGA] = 0;
- sf->thresh_mult[THR_COMP_NEARGA ] = 0;
-
- sf->thresh_mult[THR_COMP_NEWLG ] = 1000;
- sf->thresh_mult[THR_COMP_NEWLA ] = 1000;
- sf->thresh_mult[THR_COMP_NEWGA ] = 1000;
-
- sf->thresh_mult[THR_COMP_SPLITLA ] = 1700;
- sf->thresh_mult[THR_COMP_SPLITGA ] = 4500;
- sf->thresh_mult[THR_COMP_SPLITLG ] = 4500;
-
- if (Speed > 0) {
- /* Disable coefficient optimization above speed 0 */
- sf->optimize_coefficients = 0;
- sf->no_skip_block4x4_search = 0;
-
- sf->first_step = 1;
-
- cpi->mode_check_freq[THR_SPLITG] = 2;
- cpi->mode_check_freq[THR_SPLITA] = 2;
- cpi->mode_check_freq[THR_SPLITMV] = 0;
-
- cpi->mode_check_freq[THR_COMP_SPLITGA] = 2;
- cpi->mode_check_freq[THR_COMP_SPLITLG] = 2;
- cpi->mode_check_freq[THR_COMP_SPLITLA] = 0;
- }
-
- if (Speed > 1) {
- cpi->mode_check_freq[THR_SPLITG] = 4;
- cpi->mode_check_freq[THR_SPLITA] = 4;
- cpi->mode_check_freq[THR_SPLITMV] = 2;
-
- cpi->mode_check_freq[THR_COMP_SPLITGA] = 4;
- cpi->mode_check_freq[THR_COMP_SPLITLG] = 4;
- cpi->mode_check_freq[THR_COMP_SPLITLA] = 2;
-
- sf->thresh_mult[THR_TM ] = 1500;
- sf->thresh_mult[THR_V_PRED ] = 1500;
- sf->thresh_mult[THR_H_PRED ] = 1500;
- sf->thresh_mult[THR_D45_PRED ] = 1500;
- sf->thresh_mult[THR_D135_PRED] = 1500;
- sf->thresh_mult[THR_D117_PRED] = 1500;
- sf->thresh_mult[THR_D153_PRED] = 1500;
- sf->thresh_mult[THR_D27_PRED ] = 1500;
- sf->thresh_mult[THR_D63_PRED ] = 1500;
- sf->thresh_mult[THR_B_PRED ] = 5000;
- sf->thresh_mult[THR_I8X8_PRED] = 5000;
-
- if (cpi->ref_frame_flags & VP9_LAST_FLAG) {
- sf->thresh_mult[THR_NEWMV ] = 2000;
-#if CONFIG_PRED_FILTER
- sf->thresh_mult[THR_NEWMV_FILT ] = 2000;
-#endif
- sf->thresh_mult[THR_SPLITMV ] = 10000;
- sf->thresh_mult[THR_COMP_SPLITLG ] = 20000;
- }
-
- if (cpi->ref_frame_flags & VP9_GOLD_FLAG) {
- sf->thresh_mult[THR_NEARESTG ] = 1500;
- sf->thresh_mult[THR_ZEROG ] = 1500;
- sf->thresh_mult[THR_NEARG ] = 1500;
- sf->thresh_mult[THR_NEWG ] = 2000;
-#if CONFIG_PRED_FILTER
- sf->thresh_mult[THR_NEARESTG_FILT ] = 1500;
- sf->thresh_mult[THR_ZEROG_FILT ] = 1500;
- sf->thresh_mult[THR_NEARG_FILT ] = 1500;
- sf->thresh_mult[THR_NEWG_FILT ] = 2000;
-#endif
- sf->thresh_mult[THR_SPLITG ] = 20000;
- sf->thresh_mult[THR_COMP_SPLITGA ] = 20000;
- }
-
- if (cpi->ref_frame_flags & VP9_ALT_FLAG) {
- sf->thresh_mult[THR_NEARESTA ] = 1500;
- sf->thresh_mult[THR_ZEROA ] = 1500;
- sf->thresh_mult[THR_NEARA ] = 1500;
- sf->thresh_mult[THR_NEWA ] = 2000;
-#if CONFIG_PRED_FILTER
- sf->thresh_mult[THR_NEARESTA_FILT ] = 1500;
- sf->thresh_mult[THR_ZEROA_FILT ] = 1500;
- sf->thresh_mult[THR_NEARA_FILT ] = 1500;
- sf->thresh_mult[THR_NEWA_FILT ] = 2000;
-#endif
- sf->thresh_mult[THR_SPLITA ] = 20000;
- sf->thresh_mult[THR_COMP_SPLITLA ] = 10000;
- }
-
- sf->thresh_mult[THR_COMP_ZEROLG ] = 1500;
- sf->thresh_mult[THR_COMP_NEARESTLG] = 1500;
- sf->thresh_mult[THR_COMP_NEARLG ] = 1500;
- sf->thresh_mult[THR_COMP_ZEROLA ] = 1500;
- sf->thresh_mult[THR_COMP_NEARESTLA] = 1500;
- sf->thresh_mult[THR_COMP_NEARLA ] = 1500;
- sf->thresh_mult[THR_COMP_ZEROGA ] = 1500;
- sf->thresh_mult[THR_COMP_NEARESTGA] = 1500;
- sf->thresh_mult[THR_COMP_NEARGA ] = 1500;
-
- sf->thresh_mult[THR_COMP_NEWLG ] = 2000;
- sf->thresh_mult[THR_COMP_NEWLA ] = 2000;
- sf->thresh_mult[THR_COMP_NEWGA ] = 2000;
- }
-
- if (Speed > 2) {
- cpi->mode_check_freq[THR_SPLITG] = 15;
- cpi->mode_check_freq[THR_SPLITA] = 15;
- cpi->mode_check_freq[THR_SPLITMV] = 7;
-
- cpi->mode_check_freq[THR_COMP_SPLITGA] = 15;
- cpi->mode_check_freq[THR_COMP_SPLITLG] = 15;
- cpi->mode_check_freq[THR_COMP_SPLITLA] = 7;
-
- sf->thresh_mult[THR_TM ] = 2000;
- sf->thresh_mult[THR_V_PRED ] = 2000;
- sf->thresh_mult[THR_H_PRED ] = 2000;
- sf->thresh_mult[THR_D45_PRED ] = 2000;
- sf->thresh_mult[THR_D135_PRED] = 2000;
- sf->thresh_mult[THR_D117_PRED] = 2000;
- sf->thresh_mult[THR_D153_PRED] = 2000;
- sf->thresh_mult[THR_D27_PRED ] = 2000;
- sf->thresh_mult[THR_D63_PRED ] = 2000;
- sf->thresh_mult[THR_B_PRED ] = 7500;
- sf->thresh_mult[THR_I8X8_PRED] = 7500;
-
- if (cpi->ref_frame_flags & VP9_LAST_FLAG) {
- sf->thresh_mult[THR_NEWMV ] = 2000;
-#if CONFIG_PRED_FILTER
- sf->thresh_mult[THR_NEWMV_FILT ] = 2000;
-#endif
- sf->thresh_mult[THR_SPLITMV ] = 25000;
- sf->thresh_mult[THR_COMP_SPLITLG ] = 50000;
- }
-
- if (cpi->ref_frame_flags & VP9_GOLD_FLAG) {
- sf->thresh_mult[THR_NEARESTG ] = 2000;
- sf->thresh_mult[THR_ZEROG ] = 2000;
- sf->thresh_mult[THR_NEARG ] = 2000;
- sf->thresh_mult[THR_NEWG ] = 2500;
-#if CONFIG_PRED_FILTER
- sf->thresh_mult[THR_NEARESTG_FILT ] = 2000;
- sf->thresh_mult[THR_ZEROG_FILT ] = 2000;
- sf->thresh_mult[THR_NEARG_FILT ] = 2000;
- sf->thresh_mult[THR_NEWG_FILT ] = 2500;
-#endif
- sf->thresh_mult[THR_SPLITG ] = 50000;
- sf->thresh_mult[THR_COMP_SPLITGA ] = 50000;
- }
-
- if (cpi->ref_frame_flags & VP9_ALT_FLAG) {
- sf->thresh_mult[THR_NEARESTA ] = 2000;
- sf->thresh_mult[THR_ZEROA ] = 2000;
- sf->thresh_mult[THR_NEARA ] = 2000;
- sf->thresh_mult[THR_NEWA ] = 2500;
-#if CONFIG_PRED_FILTER
- sf->thresh_mult[THR_NEARESTA_FILT ] = 2000;
- sf->thresh_mult[THR_ZEROA_FILT ] = 2000;
- sf->thresh_mult[THR_NEARA_FILT ] = 2000;
- sf->thresh_mult[THR_NEWA_FILT ] = 2500;
-#endif
- sf->thresh_mult[THR_SPLITA ] = 50000;
- sf->thresh_mult[THR_COMP_SPLITLA ] = 25000;
- }
-
- sf->thresh_mult[THR_COMP_ZEROLG ] = 2000;
- sf->thresh_mult[THR_COMP_NEARESTLG] = 2000;
- sf->thresh_mult[THR_COMP_NEARLG ] = 2000;
- sf->thresh_mult[THR_COMP_ZEROLA ] = 2000;
- sf->thresh_mult[THR_COMP_NEARESTLA] = 2000;
- sf->thresh_mult[THR_COMP_NEARLA ] = 2000;
- sf->thresh_mult[THR_COMP_ZEROGA ] = 2000;
- sf->thresh_mult[THR_COMP_NEARESTGA] = 2000;
- sf->thresh_mult[THR_COMP_NEARGA ] = 2000;
-
- sf->thresh_mult[THR_COMP_NEWLG ] = 2500;
- sf->thresh_mult[THR_COMP_NEWLA ] = 2500;
- sf->thresh_mult[THR_COMP_NEWGA ] = 2500;
-
- sf->improved_dct = 0;
-
- // Only do recode loop on key frames, golden frames and
- // alt ref frames
- sf->recode_loop = 2;
-
- }
-
- break;
-
- }; /* switch */
-
- /* disable frame modes if flags not set */
- if (!(cpi->ref_frame_flags & VP9_LAST_FLAG)) {
- sf->thresh_mult[THR_NEWMV ] = INT_MAX;
- sf->thresh_mult[THR_NEARESTMV] = INT_MAX;
- sf->thresh_mult[THR_ZEROMV ] = INT_MAX;
- sf->thresh_mult[THR_NEARMV ] = INT_MAX;
-#if CONFIG_PRED_FILTER
- sf->thresh_mult[THR_NEWMV_FILT ] = INT_MAX;
- sf->thresh_mult[THR_NEARESTMV_FILT] = INT_MAX;
- sf->thresh_mult[THR_ZEROMV_FILT ] = INT_MAX;
- sf->thresh_mult[THR_NEARMV_FILT ] = INT_MAX;
-#endif
- sf->thresh_mult[THR_SPLITMV ] = INT_MAX;
- }
-
- if (!(cpi->ref_frame_flags & VP9_GOLD_FLAG)) {
- sf->thresh_mult[THR_NEARESTG ] = INT_MAX;
- sf->thresh_mult[THR_ZEROG ] = INT_MAX;
- sf->thresh_mult[THR_NEARG ] = INT_MAX;
- sf->thresh_mult[THR_NEWG ] = INT_MAX;
-#if CONFIG_PRED_FILTER
- sf->thresh_mult[THR_NEARESTG_FILT ] = INT_MAX;
- sf->thresh_mult[THR_ZEROG_FILT ] = INT_MAX;
- sf->thresh_mult[THR_NEARG_FILT ] = INT_MAX;
- sf->thresh_mult[THR_NEWG_FILT ] = INT_MAX;
-#endif
- sf->thresh_mult[THR_SPLITG ] = INT_MAX;
- }
-
- if (!(cpi->ref_frame_flags & VP9_ALT_FLAG)) {
- sf->thresh_mult[THR_NEARESTA ] = INT_MAX;
- sf->thresh_mult[THR_ZEROA ] = INT_MAX;
- sf->thresh_mult[THR_NEARA ] = INT_MAX;
- sf->thresh_mult[THR_NEWA ] = INT_MAX;
-#if CONFIG_PRED_FILTER
- sf->thresh_mult[THR_NEARESTA_FILT ] = INT_MAX;
- sf->thresh_mult[THR_ZEROA_FILT ] = INT_MAX;
- sf->thresh_mult[THR_NEARA_FILT ] = INT_MAX;
- sf->thresh_mult[THR_NEWA_FILT ] = INT_MAX;
-#endif
- sf->thresh_mult[THR_SPLITA ] = INT_MAX;
- }
-
- if ((cpi->ref_frame_flags & (VP9_LAST_FLAG | VP9_GOLD_FLAG)) != (VP9_LAST_FLAG | VP9_GOLD_FLAG)) {
- sf->thresh_mult[THR_COMP_ZEROLG ] = INT_MAX;
- sf->thresh_mult[THR_COMP_NEARESTLG] = INT_MAX;
- sf->thresh_mult[THR_COMP_NEARLG ] = INT_MAX;
- sf->thresh_mult[THR_COMP_NEWLG ] = INT_MAX;
- sf->thresh_mult[THR_COMP_SPLITLG ] = INT_MAX;
- }
-
- if ((cpi->ref_frame_flags & (VP9_LAST_FLAG | VP9_ALT_FLAG)) != (VP9_LAST_FLAG | VP9_ALT_FLAG)) {
- sf->thresh_mult[THR_COMP_ZEROLA ] = INT_MAX;
- sf->thresh_mult[THR_COMP_NEARESTLA] = INT_MAX;
- sf->thresh_mult[THR_COMP_NEARLA ] = INT_MAX;
- sf->thresh_mult[THR_COMP_NEWLA ] = INT_MAX;
- sf->thresh_mult[THR_COMP_SPLITLA ] = INT_MAX;
- }
-
- if ((cpi->ref_frame_flags & (VP9_GOLD_FLAG | VP9_ALT_FLAG)) != (VP9_GOLD_FLAG | VP9_ALT_FLAG)) {
- sf->thresh_mult[THR_COMP_ZEROGA ] = INT_MAX;
- sf->thresh_mult[THR_COMP_NEARESTGA] = INT_MAX;
- sf->thresh_mult[THR_COMP_NEARGA ] = INT_MAX;
- sf->thresh_mult[THR_COMP_NEWGA ] = INT_MAX;
- sf->thresh_mult[THR_COMP_SPLITGA ] = INT_MAX;
- }
-
- // Slow quant, dct and trellis not worthwhile for first pass
- // so make sure they are always turned off.
- if (cpi->pass == 1) {
- sf->optimize_coefficients = 0;
- sf->improved_dct = 0;
- }
-
- if (cpi->sf.search_method == NSTEP) {
- vp9_init3smotion_compensation(&cpi->mb,
- cm->yv12_fb[cm->lst_fb_idx].y_stride);
- } else if (cpi->sf.search_method == DIAMOND) {
- vp9_init_dsmotion_compensation(&cpi->mb,
- cm->yv12_fb[cm->lst_fb_idx].y_stride);
- }
-
- cpi->mb.vp9_short_fdct16x16 = vp9_short_fdct16x16;
- cpi->mb.vp9_short_fdct8x8 = vp9_short_fdct8x8;
- cpi->mb.vp9_short_fdct8x4 = vp9_short_fdct8x4;
- cpi->mb.vp9_short_fdct4x4 = vp9_short_fdct4x4;
- cpi->mb.short_walsh4x4 = vp9_short_walsh4x4;
- cpi->mb.short_fhaar2x2 = vp9_short_fhaar2x2;
-
-#if CONFIG_LOSSLESS
- if (cpi->oxcf.lossless) {
- cpi->mb.vp9_short_fdct8x4 = vp9_short_walsh8x4_x8;
- cpi->mb.vp9_short_fdct4x4 = vp9_short_walsh4x4_x8;
- cpi->mb.short_walsh4x4 = vp9_short_walsh4x4;
- cpi->mb.short_fhaar2x2 = vp9_short_fhaar2x2;
- cpi->mb.short_walsh4x4 = vp9_short_walsh4x4_lossless;
- }
-#endif
-
-
-
- cpi->mb.quantize_b_4x4 = vp9_regular_quantize_b_4x4;
- cpi->mb.quantize_b_4x4_pair = vp9_regular_quantize_b_4x4_pair;
- cpi->mb.quantize_b_8x8 = vp9_regular_quantize_b_8x8;
- cpi->mb.quantize_b_16x16 = vp9_regular_quantize_b_16x16;
- cpi->mb.quantize_b_2x2 = vp9_regular_quantize_b_2x2;
-
- vp9_init_quantizer(cpi);
-
-#if CONFIG_RUNTIME_CPU_DETECT
- cpi->mb.e_mbd.rtcd = &cpi->common.rtcd;
-#endif
-
- if (cpi->sf.iterative_sub_pixel == 1) {
- cpi->find_fractional_mv_step = vp9_find_best_sub_pixel_step_iteratively;
- } else if (cpi->sf.quarter_pixel_search) {
- cpi->find_fractional_mv_step = vp9_find_best_sub_pixel_step;
- } else if (cpi->sf.half_pixel_search) {
- cpi->find_fractional_mv_step = vp9_find_best_half_pixel_step;
- }
-
- if (cpi->sf.optimize_coefficients == 1 && cpi->pass != 1)
- cpi->mb.optimize = 1;
- else
- cpi->mb.optimize = 0;
-
-#ifdef SPEEDSTATS
- frames_at_speed[cpi->Speed]++;
-#endif
-}
-static void alloc_raw_frame_buffers(VP9_COMP *cpi) {
- int width = (cpi->oxcf.Width + 15) & ~15;
- int height = (cpi->oxcf.Height + 15) & ~15;
-
- cpi->lookahead = vp9_lookahead_init(cpi->oxcf.Width, cpi->oxcf.Height,
- cpi->oxcf.lag_in_frames);
- if (!cpi->lookahead)
- vpx_internal_error(&cpi->common.error, VPX_CODEC_MEM_ERROR,
- "Failed to allocate lag buffers");
-
-#if VP9_TEMPORAL_ALT_REF
-
- if (vp8_yv12_alloc_frame_buffer(&cpi->alt_ref_buffer,
- width, height, VP8BORDERINPIXELS))
- vpx_internal_error(&cpi->common.error, VPX_CODEC_MEM_ERROR,
- "Failed to allocate altref buffer");
-
-#endif
-}
-
-static int alloc_partition_data(VP9_COMP *cpi) {
- vpx_free(cpi->mb.pip);
-
- cpi->mb.pip = vpx_calloc((cpi->common.mb_cols + 1) *
- (cpi->common.mb_rows + 1),
- sizeof(PARTITION_INFO));
- if (!cpi->mb.pip)
- return 1;
-
- cpi->mb.pi = cpi->mb.pip + cpi->common.mode_info_stride + 1;
-
- return 0;
-}
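-
-// Note: alloc_partition_data() returns nonzero on failure; the caller below
-// converts that into a VPX_CODEC_MEM_ERROR via vpx_internal_error().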
-
-void vp9_alloc_compressor_data(VP9_COMP *cpi) {
- VP9_COMMON *cm = &cpi->common;
-
- int width = cm->Width;
- int height = cm->Height;
-
- if (vp9_alloc_frame_buffers(cm, width, height))
- vpx_internal_error(&cpi->common.error, VPX_CODEC_MEM_ERROR,
- "Failed to allocate frame buffers");
-
- if (alloc_partition_data(cpi))
- vpx_internal_error(&cpi->common.error, VPX_CODEC_MEM_ERROR,
- "Failed to allocate partition data");
-
-
- if ((width & 0xf) != 0)
- width += 16 - (width & 0xf);
-
- if ((height & 0xf) != 0)
- height += 16 - (height & 0xf);
-
-
- if (vp8_yv12_alloc_frame_buffer(&cpi->last_frame_uf,
- width, height, VP8BORDERINPIXELS))
- vpx_internal_error(&cpi->common.error, VPX_CODEC_MEM_ERROR,
- "Failed to allocate last frame buffer");
-
- if (vp8_yv12_alloc_frame_buffer(&cpi->scaled_source,
- width, height, VP8BORDERINPIXELS))
- vpx_internal_error(&cpi->common.error, VPX_CODEC_MEM_ERROR,
- "Failed to allocate scaled source buffer");
-
-
- vpx_free(cpi->tok);
-
- {
- unsigned int tokens = cm->mb_rows * cm->mb_cols * 24 * 16;
-
- CHECK_MEM_ERROR(cpi->tok, vpx_calloc(tokens, sizeof(*cpi->tok)));
- }
-
-  // Data used for real-time VC mode to see if the GF needs refreshing
- cpi->inter_zz_count = 0;
- cpi->gf_bad_count = 0;
- cpi->gf_update_recommended = 0;
-
-
-  // Structures used to monitor GF usage
- vpx_free(cpi->gf_active_flags);
- CHECK_MEM_ERROR(cpi->gf_active_flags,
- vpx_calloc(1, cm->mb_rows * cm->mb_cols));
- cpi->gf_active_count = cm->mb_rows * cm->mb_cols;
-
- vpx_free(cpi->mb_activity_map);
- CHECK_MEM_ERROR(cpi->mb_activity_map,
- vpx_calloc(sizeof(unsigned int),
- cm->mb_rows * cm->mb_cols));
-
- vpx_free(cpi->mb_norm_activity_map);
- CHECK_MEM_ERROR(cpi->mb_norm_activity_map,
- vpx_calloc(sizeof(unsigned int),
- cm->mb_rows * cm->mb_cols));
-
- vpx_free(cpi->twopass.total_stats);
-
- cpi->twopass.total_stats = vpx_calloc(1, sizeof(FIRSTPASS_STATS));
-
- vpx_free(cpi->twopass.total_left_stats);
- cpi->twopass.total_left_stats = vpx_calloc(1, sizeof(FIRSTPASS_STATS));
-
- vpx_free(cpi->twopass.this_frame_stats);
-
- cpi->twopass.this_frame_stats = vpx_calloc(1, sizeof(FIRSTPASS_STATS));
-
- if (!cpi->twopass.total_stats ||
- !cpi->twopass.total_left_stats ||
- !cpi->twopass.this_frame_stats)
- vpx_internal_error(&cpi->common.error, VPX_CODEC_MEM_ERROR,
- "Failed to allocate firstpass stats");
-
- vpx_free(cpi->tplist);
-
- CHECK_MEM_ERROR(cpi->tplist,
- vpx_malloc(sizeof(TOKENLIST) * (cpi->common.mb_rows)));
-}
-
-
-// TODO: perhaps change the number of steps exposed to the outside world when
-// setting max and min limits. This will also likely want refining for the
-// extended Q range.
-//
-// Table that converts the 0-63 Q range passed in from outside to the Qindex
-// range used internally.
-static const int q_trans[] = {
- 0, 4, 8, 12, 16, 20, 24, 28,
- 32, 36, 40, 44, 48, 52, 56, 60,
- 64, 68, 72, 76, 80, 84, 88, 92,
- 96, 100, 104, 108, 112, 116, 120, 124,
- 128, 132, 136, 140, 144, 148, 152, 156,
- 160, 164, 168, 172, 176, 180, 184, 188,
- 192, 196, 200, 204, 208, 212, 216, 220,
- 224, 228, 232, 236, 240, 244, 249, 255,
-};
-
-int vp9_reverse_trans(int x) {
- int i;
-
- for (i = 0; i < 64; i++)
- if (q_trans[i] >= x)
- return i;
-
- return 63;
-}
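-
-// Example of the mapping above: an external Q of 10 becomes q_trans[10] == 40
-// internally, and vp9_reverse_trans(40) walks the table back to 10; values
-// that fall between table entries round up to the next index.
-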
-void vp9_new_frame_rate(VP9_COMP *cpi, double framerate) {
- if (framerate < .1)
- framerate = 30;
-
- cpi->oxcf.frame_rate = framerate;
- cpi->output_frame_rate = cpi->oxcf.frame_rate;
- cpi->per_frame_bandwidth = (int)(cpi->oxcf.target_bandwidth / cpi->output_frame_rate);
- cpi->av_per_frame_bandwidth = (int)(cpi->oxcf.target_bandwidth / cpi->output_frame_rate);
- cpi->min_frame_bandwidth = (int)(cpi->av_per_frame_bandwidth * cpi->oxcf.two_pass_vbrmin_section / 100);
-
- if (cpi->min_frame_bandwidth < FRAME_OVERHEAD_BITS)
- cpi->min_frame_bandwidth = FRAME_OVERHEAD_BITS;
-
- // Set Maximum gf/arf interval
- cpi->max_gf_interval = ((int)(cpi->output_frame_rate / 2.0) + 2);
-
- if (cpi->max_gf_interval < 12)
- cpi->max_gf_interval = 12;
-
- // Extended interval for genuinely static scenes
- cpi->twopass.static_scene_max_gf_interval = cpi->key_frame_frequency >> 1;
-
-  // Special conditions when the alt ref frame is enabled in lagged compress mode
- if (cpi->oxcf.play_alternate && cpi->oxcf.lag_in_frames) {
- if (cpi->max_gf_interval > cpi->oxcf.lag_in_frames - 1)
- cpi->max_gf_interval = cpi->oxcf.lag_in_frames - 1;
-
- if (cpi->twopass.static_scene_max_gf_interval > cpi->oxcf.lag_in_frames - 1)
- cpi->twopass.static_scene_max_gf_interval = cpi->oxcf.lag_in_frames - 1;
- }
-
- if (cpi->max_gf_interval > cpi->twopass.static_scene_max_gf_interval)
- cpi->max_gf_interval = cpi->twopass.static_scene_max_gf_interval;
-}
-
-
-static int
-rescale(int val, int num, int denom) {
- int64_t llnum = num;
- int64_t llden = denom;
- int64_t llval = val;
-
- return llval * llnum / llden;
-}
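-
-// rescale() is used below in vp9_change_config() to turn buffer levels,
-// expressed in milliseconds by the encoder configuration, into bits:
-// level_ms * target_bandwidth / 1000.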
-
-
-static void init_config(VP9_PTR ptr, VP9_CONFIG *oxcf) {
- VP9_COMP *cpi = (VP9_COMP *)(ptr);
- VP9_COMMON *cm = &cpi->common;
-
- cpi->oxcf = *oxcf;
-
- cpi->goldfreq = 7;
-
- cm->version = oxcf->Version;
- vp9_setup_version(cm);
-
- // change includes all joint functionality
- vp9_change_config(ptr, oxcf);
-
- // Initialize active best and worst q and average q values.
- cpi->active_worst_quality = cpi->oxcf.worst_allowed_q;
- cpi->active_best_quality = cpi->oxcf.best_allowed_q;
- cpi->avg_frame_qindex = cpi->oxcf.worst_allowed_q;
-
- // Initialise the starting buffer levels
- cpi->buffer_level = cpi->oxcf.starting_buffer_level;
- cpi->bits_off_target = cpi->oxcf.starting_buffer_level;
-
- cpi->rolling_target_bits = cpi->av_per_frame_bandwidth;
- cpi->rolling_actual_bits = cpi->av_per_frame_bandwidth;
- cpi->long_rolling_target_bits = cpi->av_per_frame_bandwidth;
- cpi->long_rolling_actual_bits = cpi->av_per_frame_bandwidth;
-
- cpi->total_actual_bits = 0;
- cpi->total_target_vs_actual = 0;
-
- cpi->static_mb_pct = 0;
-
-#if VP9_TEMPORAL_ALT_REF
- {
- int i;
-
- cpi->fixed_divide[0] = 0;
-
- for (i = 1; i < 512; i++)
- cpi->fixed_divide[i] = 0x80000 / i;
- }
-#endif
-}
-
-
-void vp9_change_config(VP9_PTR ptr, VP9_CONFIG *oxcf) {
- VP9_COMP *cpi = (VP9_COMP *)(ptr);
- VP9_COMMON *cm = &cpi->common;
-
- if (!cpi)
- return;
-
- if (!oxcf)
- return;
-
- if (cm->version != oxcf->Version) {
- cm->version = oxcf->Version;
- vp9_setup_version(cm);
- }
-
- cpi->oxcf = *oxcf;
-
- switch (cpi->oxcf.Mode) {
- // Real time and one pass deprecated in test code base
- case MODE_FIRSTPASS:
- cpi->pass = 1;
- cpi->compressor_speed = 1;
- break;
-
- case MODE_SECONDPASS:
- cpi->pass = 2;
- cpi->compressor_speed = 1;
-
- if (cpi->oxcf.cpu_used < -5) {
- cpi->oxcf.cpu_used = -5;
- }
-
- if (cpi->oxcf.cpu_used > 5)
- cpi->oxcf.cpu_used = 5;
-
- break;
-
- case MODE_SECONDPASS_BEST:
- cpi->pass = 2;
- cpi->compressor_speed = 0;
- break;
- }
-
- cpi->oxcf.worst_allowed_q = q_trans[oxcf->worst_allowed_q];
- cpi->oxcf.best_allowed_q = q_trans[oxcf->best_allowed_q];
- cpi->oxcf.cq_level = q_trans[cpi->oxcf.cq_level];
-
-#if CONFIG_LOSSLESS
- cpi->oxcf.lossless = oxcf->lossless;
- if (cpi->oxcf.lossless) {
- cpi->common.rtcd.idct.idct1 = vp9_short_inv_walsh4x4_1_x8_c;
- cpi->common.rtcd.idct.idct16 = vp9_short_inv_walsh4x4_x8_c;
- cpi->common.rtcd.idct.idct1_scalar_add = vp9_dc_only_inv_walsh_add_c;
- cpi->common.rtcd.idct.iwalsh1 = vp9_short_inv_walsh4x4_1_c;
- cpi->common.rtcd.idct.iwalsh16 = vp9_short_inv_walsh4x4_lossless_c;
- }
-#endif
-
- cpi->baseline_gf_interval = DEFAULT_GF_INTERVAL;
-
- cpi->ref_frame_flags = VP9_ALT_FLAG | VP9_GOLD_FLAG | VP9_LAST_FLAG;
-
- // cpi->use_golden_frame_only = 0;
- // cpi->use_last_frame_only = 0;
- cm->refresh_golden_frame = 0;
- cm->refresh_last_frame = 1;
- cm->refresh_entropy_probs = 1;
-
- setup_features(cpi);
- cpi->mb.e_mbd.allow_high_precision_mv = 0; // Default mv precision adaptation
-
- {
- int i;
-
- for (i = 0; i < MAX_MB_SEGMENTS; i++)
- cpi->segment_encode_breakout[i] = cpi->oxcf.encode_breakout;
- }
-
- // At the moment the first order values may not be > MAXQ
- if (cpi->oxcf.fixed_q > MAXQ)
- cpi->oxcf.fixed_q = MAXQ;
-
- // local file playback mode == really big buffer
- if (cpi->oxcf.end_usage == USAGE_LOCAL_FILE_PLAYBACK) {
- cpi->oxcf.starting_buffer_level = 60000;
- cpi->oxcf.optimal_buffer_level = 60000;
- cpi->oxcf.maximum_buffer_size = 240000;
- }
-
- // Convert target bandwidth from Kbit/s to Bit/s
- cpi->oxcf.target_bandwidth *= 1000;
-
- cpi->oxcf.starting_buffer_level =
- rescale(cpi->oxcf.starting_buffer_level,
- cpi->oxcf.target_bandwidth, 1000);
-
- // Set or reset optimal and maximum buffer levels.
- if (cpi->oxcf.optimal_buffer_level == 0)
- cpi->oxcf.optimal_buffer_level = cpi->oxcf.target_bandwidth / 8;
- else
- cpi->oxcf.optimal_buffer_level =
- rescale(cpi->oxcf.optimal_buffer_level,
- cpi->oxcf.target_bandwidth, 1000);
-
- if (cpi->oxcf.maximum_buffer_size == 0)
- cpi->oxcf.maximum_buffer_size = cpi->oxcf.target_bandwidth / 8;
- else
- cpi->oxcf.maximum_buffer_size =
- rescale(cpi->oxcf.maximum_buffer_size,
- cpi->oxcf.target_bandwidth, 1000);
-
- // Set up frame rate and related parameters rate control values.
- vp9_new_frame_rate(cpi, cpi->oxcf.frame_rate);
-
- // Set absolute upper and lower quality limits
- cpi->worst_quality = cpi->oxcf.worst_allowed_q;
- cpi->best_quality = cpi->oxcf.best_allowed_q;
-
- // active values should only be modified if out of new range
- if (cpi->active_worst_quality > cpi->oxcf.worst_allowed_q) {
- cpi->active_worst_quality = cpi->oxcf.worst_allowed_q;
- }
- // less likely
- else if (cpi->active_worst_quality < cpi->oxcf.best_allowed_q) {
- cpi->active_worst_quality = cpi->oxcf.best_allowed_q;
- }
- if (cpi->active_best_quality < cpi->oxcf.best_allowed_q) {
- cpi->active_best_quality = cpi->oxcf.best_allowed_q;
- }
- // less likely
- else if (cpi->active_best_quality > cpi->oxcf.worst_allowed_q) {
- cpi->active_best_quality = cpi->oxcf.worst_allowed_q;
- }
-
- cpi->buffered_mode = (cpi->oxcf.optimal_buffer_level > 0) ? TRUE : FALSE;
-
- cpi->cq_target_quality = cpi->oxcf.cq_level;
-
- if (!cm->use_bilinear_mc_filter)
- cm->mcomp_filter_type = DEFAULT_INTERP_FILTER;
- else
- cm->mcomp_filter_type = BILINEAR;
-
- cpi->target_bandwidth = cpi->oxcf.target_bandwidth;
-
- cm->Width = cpi->oxcf.Width;
- cm->Height = cpi->oxcf.Height;
-
- cm->horiz_scale = cpi->horiz_scale;
- cm->vert_scale = cpi->vert_scale;
-
- // VP8 sharpness level mapping 0-7 (vs 0-10 in general VPx dialogs)
- if (cpi->oxcf.Sharpness > 7)
- cpi->oxcf.Sharpness = 7;
-
- cm->sharpness_level = cpi->oxcf.Sharpness;
-
- if (cm->horiz_scale != NORMAL || cm->vert_scale != NORMAL) {
- int UNINITIALIZED_IS_SAFE(hr), UNINITIALIZED_IS_SAFE(hs);
- int UNINITIALIZED_IS_SAFE(vr), UNINITIALIZED_IS_SAFE(vs);
-
- Scale2Ratio(cm->horiz_scale, &hr, &hs);
- Scale2Ratio(cm->vert_scale, &vr, &vs);
-
- // always go to the next whole number
- cm->Width = (hs - 1 + cpi->oxcf.Width * hr) / hs;
- cm->Height = (vs - 1 + cpi->oxcf.Height * vr) / vs;
- }
-
- if (((cm->Width + 15) & 0xfffffff0) !=
- cm->yv12_fb[cm->lst_fb_idx].y_width ||
- ((cm->Height + 15) & 0xfffffff0) !=
- cm->yv12_fb[cm->lst_fb_idx].y_height ||
- cm->yv12_fb[cm->lst_fb_idx].y_width == 0) {
- alloc_raw_frame_buffers(cpi);
- vp9_alloc_compressor_data(cpi);
- }
-
- if (cpi->oxcf.fixed_q >= 0) {
- cpi->last_q[0] = cpi->oxcf.fixed_q;
- cpi->last_q[1] = cpi->oxcf.fixed_q;
- cpi->last_boosted_qindex = cpi->oxcf.fixed_q;
- }
-
- cpi->Speed = cpi->oxcf.cpu_used;
-
-  // force allow_lag to 0 if lag_in_frames is 0
- if (cpi->oxcf.lag_in_frames == 0) {
- cpi->oxcf.allow_lag = 0;
- }
- // Limit on lag buffers as these are not currently dynamically allocated
- else if (cpi->oxcf.lag_in_frames > MAX_LAG_BUFFERS)
- cpi->oxcf.lag_in_frames = MAX_LAG_BUFFERS;
-
- // YX Temp
- cpi->alt_ref_source = NULL;
- cpi->is_src_frame_alt_ref = 0;
-
-#if 0
- // Experimental RD Code
- cpi->frame_distortion = 0;
- cpi->last_frame_distortion = 0;
-#endif
-
-}
-
-#define M_LOG2_E 0.693147180559945309417
-#define log2f(x) (log (x) / (float) M_LOG2_E)
-
-static void cal_nmvjointsadcost(int *mvjointsadcost) {
- mvjointsadcost[0] = 600;
- mvjointsadcost[1] = 300;
- mvjointsadcost[2] = 300;
-  mvjointsadcost[3] = 300;
-}
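-
-// The four entries above correspond to the nmv joint classes (both components
-// zero, horizontal-only, vertical-only, and both non-zero).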
-
-static void cal_nmvsadcosts(int *mvsadcost[2]) {
- int i = 1;
-
- mvsadcost [0] [0] = 0;
- mvsadcost [1] [0] = 0;
-
- do {
- double z = 256 * (2 * (log2f(8 * i) + .6));
- mvsadcost [0][i] = (int) z;
- mvsadcost [1][i] = (int) z;
- mvsadcost [0][-i] = (int) z;
- mvsadcost [1][-i] = (int) z;
- } while (++i <= MV_MAX);
-}
-
-static void cal_nmvsadcosts_hp(int *mvsadcost[2]) {
- int i = 1;
-
- mvsadcost [0] [0] = 0;
- mvsadcost [1] [0] = 0;
-
- do {
- double z = 256 * (2 * (log2f(8 * i) + .6));
- mvsadcost [0][i] = (int) z;
- mvsadcost [1][i] = (int) z;
- mvsadcost [0][-i] = (int) z;
- mvsadcost [1][-i] = (int) z;
- } while (++i <= MV_MAX);
-}
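-
-// Both SAD-cost tables above approximate the bit cost of a motion-vector
-// component of magnitude i with a logarithmic model, mirrored for negative
-// offsets; the _hp variant is used for the high-precision motion-vector search.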
-
-VP9_PTR vp9_create_compressor(VP9_CONFIG *oxcf) {
- int i;
- volatile union {
- VP9_COMP *cpi;
- VP9_PTR ptr;
- } ctx;
-
- VP9_COMP *cpi;
- VP9_COMMON *cm;
-
- cpi = ctx.cpi = vpx_memalign(32, sizeof(VP9_COMP));
- // Check that the CPI instance is valid
- if (!cpi)
- return 0;
-
- cm = &cpi->common;
-
- vpx_memset(cpi, 0, sizeof(VP9_COMP));
-
- if (setjmp(cm->error.jmp)) {
- VP9_PTR ptr = ctx.ptr;
-
- ctx.cpi->common.error.setjmp = 0;
- vp9_remove_compressor(&ptr);
- return 0;
- }
-
- cpi->common.error.setjmp = 1;
-
- CHECK_MEM_ERROR(cpi->mb.ss, vpx_calloc(sizeof(search_site), (MAX_MVSEARCH_STEPS * 8) + 1));
-
- vp9_create_common(&cpi->common);
- vp9_cmachine_specific_config(cpi);
-
- init_config((VP9_PTR)cpi, oxcf);
-
- memcpy(cpi->base_skip_false_prob, base_skip_false_prob, sizeof(base_skip_false_prob));
- cpi->common.current_video_frame = 0;
- cpi->kf_overspend_bits = 0;
- cpi->kf_bitrate_adjustment = 0;
- cpi->frames_till_gf_update_due = 0;
- cpi->gf_overspend_bits = 0;
- cpi->non_gf_bitrate_adjustment = 0;
- cm->prob_last_coded = 128;
- cm->prob_gf_coded = 128;
- cm->prob_intra_coded = 63;
-#if CONFIG_SUPERBLOCKS
- cm->sb_coded = 200;
-#endif
- for (i = 0; i < COMP_PRED_CONTEXTS; i++)
- cm->prob_comppred[i] = 128;
- for (i = 0; i < TX_SIZE_MAX - 1; i++)
- cm->prob_tx[i] = 128;
-
-  // Prime the recent reference frame usage counters.
- // Hereafter they will be maintained as a sort of moving average
- cpi->recent_ref_frame_usage[INTRA_FRAME] = 1;
- cpi->recent_ref_frame_usage[LAST_FRAME] = 1;
- cpi->recent_ref_frame_usage[GOLDEN_FRAME] = 1;
- cpi->recent_ref_frame_usage[ALTREF_FRAME] = 1;
-
- // Set reference frame sign bias for ALTREF frame to 1 (for now)
- cpi->common.ref_frame_sign_bias[ALTREF_FRAME] = 1;
-
- cpi->baseline_gf_interval = DEFAULT_GF_INTERVAL;
-
- cpi->gold_is_last = 0;
- cpi->alt_is_last = 0;
- cpi->gold_is_alt = 0;
-
- // allocate memory for storing last frame's MVs for MV prediction.
- CHECK_MEM_ERROR(cpi->lfmv, vpx_calloc((cpi->common.mb_rows + 2) * (cpi->common.mb_cols + 2), sizeof(int_mv)));
- CHECK_MEM_ERROR(cpi->lf_ref_frame_sign_bias, vpx_calloc((cpi->common.mb_rows + 2) * (cpi->common.mb_cols + 2), sizeof(int)));
- CHECK_MEM_ERROR(cpi->lf_ref_frame, vpx_calloc((cpi->common.mb_rows + 2) * (cpi->common.mb_cols + 2), sizeof(int)));
-
- // Create the encoder segmentation map and set all entries to 0
- CHECK_MEM_ERROR(cpi->segmentation_map, vpx_calloc((cpi->common.mb_rows * cpi->common.mb_cols), 1));
-
- // And a copy in common for temporal coding
- CHECK_MEM_ERROR(cm->last_frame_seg_map,
- vpx_calloc((cpi->common.mb_rows * cpi->common.mb_cols), 1));
-
-  // And a placeholder copy in the coding context
-  // for use if we want to save and restore it
- CHECK_MEM_ERROR(cpi->coding_context.last_frame_seg_map_copy,
- vpx_calloc((cpi->common.mb_rows * cpi->common.mb_cols), 1));
-
- CHECK_MEM_ERROR(cpi->active_map, vpx_calloc(cpi->common.mb_rows * cpi->common.mb_cols, 1));
- vpx_memset(cpi->active_map, 1, (cpi->common.mb_rows * cpi->common.mb_cols));
- cpi->active_map_enabled = 0;
-
- for (i = 0; i < (sizeof(cpi->mbgraph_stats) /
- sizeof(cpi->mbgraph_stats[0])); i++) {
- CHECK_MEM_ERROR(cpi->mbgraph_stats[i].mb_stats,
- vpx_calloc(cpi->common.mb_rows * cpi->common.mb_cols *
- sizeof(*cpi->mbgraph_stats[i].mb_stats),
- 1));
- }
-
-#ifdef ENTROPY_STATS
- if (cpi->pass != 1)
- init_context_counters();
-#endif
-#ifdef MODE_STATS
- vp9_zero(y_modes);
- vp9_zero(i8x8_modes);
- vp9_zero(uv_modes);
- vp9_zero(uv_modes_y);
- vp9_zero(b_modes);
- vp9_zero(inter_y_modes);
- vp9_zero(inter_uv_modes);
- vp9_zero(inter_b_modes);
-#endif
-#ifdef NMV_STATS
- init_nmvstats();
-#endif
-
-  /* Initialize the feed-forward activity masking. */
- cpi->activity_avg = 90 << 12;
-
- cpi->frames_since_key = 8; // Give a sensible default for the first frame.
- cpi->key_frame_frequency = cpi->oxcf.key_freq;
- cpi->this_key_frame_forced = FALSE;
- cpi->next_key_frame_forced = FALSE;
-
- cpi->source_alt_ref_pending = FALSE;
- cpi->source_alt_ref_active = FALSE;
- cpi->common.refresh_alt_ref_frame = 0;
-
- cpi->b_calculate_psnr = CONFIG_INTERNAL_STATS;
-#if CONFIG_INTERNAL_STATS
- cpi->b_calculate_ssimg = 0;
-
- cpi->count = 0;
- cpi->bytes = 0;
-
- if (cpi->b_calculate_psnr) {
- cpi->total_sq_error = 0.0;
- cpi->total_sq_error2 = 0.0;
- cpi->total_y = 0.0;
- cpi->total_u = 0.0;
- cpi->total_v = 0.0;
- cpi->total = 0.0;
- cpi->totalp_y = 0.0;
- cpi->totalp_u = 0.0;
- cpi->totalp_v = 0.0;
- cpi->totalp = 0.0;
- cpi->tot_recode_hits = 0;
- cpi->summed_quality = 0;
- cpi->summed_weights = 0;
- }
-
- if (cpi->b_calculate_ssimg) {
- cpi->total_ssimg_y = 0;
- cpi->total_ssimg_u = 0;
- cpi->total_ssimg_v = 0;
- cpi->total_ssimg_all = 0;
- }
-
-#endif
-
-#ifndef LLONG_MAX
-#define LLONG_MAX 9223372036854775807LL
-#endif
- cpi->first_time_stamp_ever = LLONG_MAX;
-
- cpi->frames_till_gf_update_due = 0;
- cpi->key_frame_count = 1;
-
- cpi->ni_av_qi = cpi->oxcf.worst_allowed_q;
- cpi->ni_tot_qi = 0;
- cpi->ni_frames = 0;
- cpi->tot_q = 0.0;
- cpi->avg_q = vp9_convert_qindex_to_q(cpi->oxcf.worst_allowed_q);
- cpi->total_byte_count = 0;
-
- cpi->rate_correction_factor = 1.0;
- cpi->key_frame_rate_correction_factor = 1.0;
- cpi->gf_rate_correction_factor = 1.0;
- cpi->twopass.est_max_qcorrection_factor = 1.0;
-
- cal_nmvjointsadcost(cpi->mb.nmvjointsadcost);
- cpi->mb.nmvcost[0] = &cpi->mb.nmvcosts[0][MV_MAX];
- cpi->mb.nmvcost[1] = &cpi->mb.nmvcosts[1][MV_MAX];
- cpi->mb.nmvsadcost[0] = &cpi->mb.nmvsadcosts[0][MV_MAX];
- cpi->mb.nmvsadcost[1] = &cpi->mb.nmvsadcosts[1][MV_MAX];
- cal_nmvsadcosts(cpi->mb.nmvsadcost);
-
- cpi->mb.nmvcost_hp[0] = &cpi->mb.nmvcosts_hp[0][MV_MAX];
- cpi->mb.nmvcost_hp[1] = &cpi->mb.nmvcosts_hp[1][MV_MAX];
- cpi->mb.nmvsadcost_hp[0] = &cpi->mb.nmvsadcosts_hp[0][MV_MAX];
- cpi->mb.nmvsadcost_hp[1] = &cpi->mb.nmvsadcosts_hp[1][MV_MAX];
- cal_nmvsadcosts_hp(cpi->mb.nmvsadcost_hp);
-
- for (i = 0; i < KEY_FRAME_CONTEXT; i++) {
- cpi->prior_key_frame_distance[i] = (int)cpi->output_frame_rate;
- }
-
-#ifdef OUTPUT_YUV_SRC
- yuv_file = fopen("bd.yuv", "ab");
-#endif
-#ifdef OUTPUT_YUV_REC
- yuv_rec_file = fopen("rec.yuv", "wb");
-#endif
-
-#if 0
- framepsnr = fopen("framepsnr.stt", "a");
- kf_list = fopen("kf_list.stt", "w");
-#endif
-
- cpi->output_pkt_list = oxcf->output_pkt_list;
-
- if (cpi->pass == 1) {
- vp9_init_first_pass(cpi);
- } else if (cpi->pass == 2) {
- size_t packet_sz = sizeof(FIRSTPASS_STATS);
- int packets = oxcf->two_pass_stats_in.sz / packet_sz;
-
- cpi->twopass.stats_in_start = oxcf->two_pass_stats_in.buf;
- cpi->twopass.stats_in = cpi->twopass.stats_in_start;
- cpi->twopass.stats_in_end = (void *)((char *)cpi->twopass.stats_in
- + (packets - 1) * packet_sz);
- vp9_init_second_pass(cpi);
- }
-
- vp9_set_speed_features(cpi);
-
- // Set starting values of RD threshold multipliers (128 = *1)
- for (i = 0; i < MAX_MODES; i++) {
- cpi->rd_thresh_mult[i] = 128;
- }
-
-#ifdef ENTROPY_STATS
- init_mv_ref_counts();
-#endif
-
-#define BFP(BT, SDF, VF, SVF, SVFHH, SVFHV, SVFHHV, SDX3F, SDX8F, SDX4DF) \
- cpi->fn_ptr[BT].sdf = SDF; \
- cpi->fn_ptr[BT].vf = VF; \
- cpi->fn_ptr[BT].svf = SVF; \
- cpi->fn_ptr[BT].svf_halfpix_h = SVFHH; \
- cpi->fn_ptr[BT].svf_halfpix_v = SVFHV; \
- cpi->fn_ptr[BT].svf_halfpix_hv = SVFHHV; \
- cpi->fn_ptr[BT].sdx3f = SDX3F; \
- cpi->fn_ptr[BT].sdx8f = SDX8F; \
- cpi->fn_ptr[BT].sdx4df = SDX4DF;
-
-
-#if CONFIG_SUPERBLOCKS
- BFP(BLOCK_32X32, vp9_sad32x32, vp9_variance32x32, vp9_sub_pixel_variance32x32,
- vp9_variance_halfpixvar32x32_h, vp9_variance_halfpixvar32x32_v,
- vp9_variance_halfpixvar32x32_hv, vp9_sad32x32x3, vp9_sad32x32x8,
- vp9_sad32x32x4d)
-#endif
-
- BFP(BLOCK_16X16, vp9_sad16x16, vp9_variance16x16, vp9_sub_pixel_variance16x16,
- vp9_variance_halfpixvar16x16_h, vp9_variance_halfpixvar16x16_v,
- vp9_variance_halfpixvar16x16_hv, vp9_sad16x16x3, vp9_sad16x16x8,
- vp9_sad16x16x4d)
-
- BFP(BLOCK_16X8, vp9_sad16x8, vp9_variance16x8, vp9_sub_pixel_variance16x8,
- NULL, NULL, NULL, vp9_sad16x8x3, vp9_sad16x8x8, vp9_sad16x8x4d)
-
- BFP(BLOCK_8X16, vp9_sad8x16, vp9_variance8x16, vp9_sub_pixel_variance8x16,
- NULL, NULL, NULL, vp9_sad8x16x3, vp9_sad8x16x8, vp9_sad8x16x4d)
-
- BFP(BLOCK_8X8, vp9_sad8x8, vp9_variance8x8, vp9_sub_pixel_variance8x8,
- NULL, NULL, NULL, vp9_sad8x8x3, vp9_sad8x8x8, vp9_sad8x8x4d)
-
- BFP(BLOCK_4X4, vp9_sad4x4, vp9_variance4x4, vp9_sub_pixel_variance4x4,
- NULL, NULL, NULL, vp9_sad4x4x3, vp9_sad4x4x8, vp9_sad4x4x4d)
-
-#if ARCH_X86 || ARCH_X86_64
- cpi->fn_ptr[BLOCK_16X16].copymem = vp9_copy32xn;
- cpi->fn_ptr[BLOCK_16X8].copymem = vp9_copy32xn;
- cpi->fn_ptr[BLOCK_8X16].copymem = vp9_copy32xn;
- cpi->fn_ptr[BLOCK_8X8].copymem = vp9_copy32xn;
- cpi->fn_ptr[BLOCK_4X4].copymem = vp9_copy32xn;
-#endif
-
- cpi->full_search_sad = SEARCH_INVOKE(&cpi->rtcd.search, full_search);
- cpi->diamond_search_sad = SEARCH_INVOKE(&cpi->rtcd.search, diamond_search);
- cpi->refining_search_sad = SEARCH_INVOKE(&cpi->rtcd.search, refining_search);
-
- // make sure frame 1 is okay
- cpi->error_bins[0] = cpi->common.MBs;
-
- /* vp9_init_quantizer() is first called here. Add check in
- * vp9_frame_init_quantizer() so that vp9_init_quantizer is only
- * called later when needed. This will avoid unnecessary calls of
- * vp9_init_quantizer() for every frame.
- */
- vp9_init_quantizer(cpi);
-
- vp9_loop_filter_init(cm);
-
- cpi->common.error.setjmp = 0;
-
- vp9_zero(cpi->y_uv_mode_count)
-
- return (VP9_PTR) cpi;
-}
-
-void vp9_remove_compressor(VP9_PTR *ptr) {
- VP9_COMP *cpi = (VP9_COMP *)(*ptr);
- int i;
-
- if (!cpi)
- return;
-
- if (cpi && (cpi->common.current_video_frame > 0)) {
- if (cpi->pass == 2) {
- vp9_end_second_pass(cpi);
- }
-
-#ifdef ENTROPY_STATS
- if (cpi->pass != 1) {
- print_context_counters();
- print_tree_update_probs();
- print_mode_context();
- }
-#endif
-#ifdef NMV_STATS
- if (cpi->pass != 1)
- print_nmvstats();
-#endif
-
-#if CONFIG_INTERNAL_STATS
-
- vp9_clear_system_state();
-
- // printf("\n8x8-4x4:%d-%d\n", cpi->t8x8_count, cpi->t4x4_count);
- if (cpi->pass != 1) {
- FILE *f = fopen("opsnr.stt", "a");
- double time_encoded = (cpi->last_end_time_stamp_seen
- - cpi->first_time_stamp_ever) / 10000000.000;
- double total_encode_time = (cpi->time_receive_data + cpi->time_compress_data) / 1000.000;
- double dr = (double)cpi->bytes * (double) 8 / (double)1000 / time_encoded;
-#if defined(MODE_STATS)
- print_mode_contexts(&cpi->common);
-#endif
- if (cpi->b_calculate_psnr) {
- YV12_BUFFER_CONFIG *lst_yv12 = &cpi->common.yv12_fb[cpi->common.lst_fb_idx];
- double samples = 3.0 / 2 * cpi->count * lst_yv12->y_width * lst_yv12->y_height;
- double total_psnr = vp9_mse2psnr(samples, 255.0, cpi->total_sq_error);
- double total_psnr2 = vp9_mse2psnr(samples, 255.0, cpi->total_sq_error2);
- double total_ssim = 100 * pow(cpi->summed_quality / cpi->summed_weights, 8.0);
-
- fprintf(f, "Bitrate\tAVGPsnr\tGLBPsnr\tAVPsnrP\tGLPsnrP\tVPXSSIM\t Time(ms)\n");
- fprintf(f, "%7.2f\t%7.3f\t%7.3f\t%7.3f\t%7.3f\t%7.3f\t%8.0f\n",
- dr, cpi->total / cpi->count, total_psnr, cpi->totalp / cpi->count, total_psnr2, total_ssim,
- total_encode_time);
-// fprintf(f, "%7.3f\t%7.3f\t%7.3f\t%7.3f\t%7.3f\t%7.3f\t%8.0f %10ld\n",
-// dr, cpi->total / cpi->count, total_psnr, cpi->totalp / cpi->count, total_psnr2, total_ssim,
-// total_encode_time, cpi->tot_recode_hits);
- }
-
- if (cpi->b_calculate_ssimg) {
- fprintf(f, "BitRate\tSSIM_Y\tSSIM_U\tSSIM_V\tSSIM_A\t Time(ms)\n");
- fprintf(f, "%7.2f\t%6.4f\t%6.4f\t%6.4f\t%6.4f\t%8.0f\n", dr,
- cpi->total_ssimg_y / cpi->count, cpi->total_ssimg_u / cpi->count,
- cpi->total_ssimg_v / cpi->count, cpi->total_ssimg_all / cpi->count, total_encode_time);
-// fprintf(f, "%7.3f\t%6.4f\t%6.4f\t%6.4f\t%6.4f\t%8.0f %10ld\n", dr,
-// cpi->total_ssimg_y / cpi->count, cpi->total_ssimg_u / cpi->count,
-// cpi->total_ssimg_v / cpi->count, cpi->total_ssimg_all / cpi->count, total_encode_time, cpi->tot_recode_hits);
- }
-
- fclose(f);
- }
-
-#endif
-
-
-#ifdef MODE_STATS
- {
- extern int count_mb_seg[4];
- char modes_stats_file[250];
- FILE *f;
- double dr = (double)cpi->oxcf.frame_rate * (double)cpi->bytes * (double)8 / (double)cpi->count / (double)1000;
- sprintf(modes_stats_file, "modes_q%03d.stt", cpi->common.base_qindex);
- f = fopen(modes_stats_file, "w");
- fprintf(f, "intra_mode in Intra Frames:\n");
- {
- int i;
- fprintf(f, "Y: ");
- for (i = 0; i < VP9_YMODES; i++) fprintf(f, " %8d,", y_modes[i]);
- fprintf(f, "\n");
- }
- {
- int i;
- fprintf(f, "I8: ");
- for (i = 0; i < VP9_I8X8_MODES; i++) fprintf(f, " %8d,", i8x8_modes[i]);
- fprintf(f, "\n");
- }
- {
- int i;
- fprintf(f, "UV: ");
- for (i = 0; i < VP9_UV_MODES; i++) fprintf(f, " %8d,", uv_modes[i]);
- fprintf(f, "\n");
- }
- {
- int i, j;
- fprintf(f, "KeyFrame Y-UV:\n");
- for (i = 0; i < VP9_YMODES; i++) {
- fprintf(f, "%2d:", i);
- for (j = 0; j < VP9_UV_MODES; j++) fprintf(f, "%8d, ", uv_modes_y[i][j]);
- fprintf(f, "\n");
- }
- }
- {
- int i, j;
- fprintf(f, "Inter Y-UV:\n");
- for (i = 0; i < VP9_YMODES; i++) {
- fprintf(f, "%2d:", i);
- for (j = 0; j < VP9_UV_MODES; j++) fprintf(f, "%8d, ", cpi->y_uv_mode_count[i][j]);
- fprintf(f, "\n");
- }
- }
- {
- int i;
-
- fprintf(f, "B: ");
- for (i = 0; i < VP9_BINTRAMODES; i++)
- fprintf(f, "%8d, ", b_modes[i]);
-
- fprintf(f, "\n");
-
- }
-
- fprintf(f, "Modes in Inter Frames:\n");
- {
- int i;
- fprintf(f, "Y: ");
- for (i = 0; i < MB_MODE_COUNT; i++) fprintf(f, " %8d,", inter_y_modes[i]);
- fprintf(f, "\n");
- }
- {
- int i;
- fprintf(f, "UV: ");
- for (i = 0; i < VP9_UV_MODES; i++) fprintf(f, " %8d,", inter_uv_modes[i]);
- fprintf(f, "\n");
- }
- {
- int i;
- fprintf(f, "B: ");
- for (i = 0; i < B_MODE_COUNT; i++) fprintf(f, "%8d, ", inter_b_modes[i]);
- fprintf(f, "\n");
- }
- fprintf(f, "P:%8d, %8d, %8d, %8d\n", count_mb_seg[0], count_mb_seg[1], count_mb_seg[2], count_mb_seg[3]);
- fprintf(f, "PB:%8d, %8d, %8d, %8d\n", inter_b_modes[LEFT4X4], inter_b_modes[ABOVE4X4], inter_b_modes[ZERO4X4], inter_b_modes[NEW4X4]);
- fclose(f);
- }
-#endif
-
-#ifdef ENTROPY_STATS
- {
- int i, j, k;
- FILE *fmode = fopen("modecontext.c", "w");
-
- fprintf(fmode, "\n#include \"entropymode.h\"\n\n");
- fprintf(fmode, "const unsigned int vp9_kf_default_bmode_counts ");
- fprintf(fmode, "[VP9_BINTRAMODES] [VP9_BINTRAMODES] [VP9_BINTRAMODES] =\n{\n");
-
- for (i = 0; i < 10; i++) {
-
- fprintf(fmode, " { // Above Mode : %d\n", i);
-
- for (j = 0; j < 10; j++) {
-
- fprintf(fmode, " {");
-
- for (k = 0; k < VP9_BINTRAMODES; k++) {
- if (!intra_mode_stats[i][j][k])
- fprintf(fmode, " %5d, ", 1);
- else
- fprintf(fmode, " %5d, ", intra_mode_stats[i][j][k]);
- }
-
- fprintf(fmode, "}, // left_mode %d\n", j);
-
- }
-
- fprintf(fmode, " },\n");
-
- }
-
- fprintf(fmode, "};\n");
- fclose(fmode);
- }
-#endif
-
-
-#if defined(SECTIONBITS_OUTPUT)
-
- if (0) {
- int i;
- FILE *f = fopen("tokenbits.stt", "a");
-
- for (i = 0; i < 28; i++)
- fprintf(f, "%8d", (int)(Sectionbits[i] / 256));
-
- fprintf(f, "\n");
- fclose(f);
- }
-
-#endif
-
-#if 0
- {
- printf("\n_pick_loop_filter_level:%d\n", cpi->time_pick_lpf / 1000);
-    printf("\n_frames receive_data encode_mb_row compress_frame Total\n");
- printf("%6d %10ld %10ld %10ld %10ld\n", cpi->common.current_video_frame, cpi->time_receive_data / 1000, cpi->time_encode_mb_row / 1000, cpi->time_compress_data / 1000, (cpi->time_receive_data + cpi->time_compress_data) / 1000);
- }
-#endif
-
- }
-
- dealloc_compressor_data(cpi);
- vpx_free(cpi->mb.ss);
- vpx_free(cpi->tok);
-
- for (i = 0; i < sizeof(cpi->mbgraph_stats) / sizeof(cpi->mbgraph_stats[0]); i++) {
- vpx_free(cpi->mbgraph_stats[i].mb_stats);
- }
-
- vp9_remove_common(&cpi->common);
- vpx_free(cpi);
- *ptr = 0;
-
-#ifdef OUTPUT_YUV_SRC
- fclose(yuv_file);
-#endif
-#ifdef OUTPUT_YUV_REC
- fclose(yuv_rec_file);
-#endif
-
-#if 0
-
- if (keyfile)
- fclose(keyfile);
-
- if (framepsnr)
- fclose(framepsnr);
-
- if (kf_list)
- fclose(kf_list);
-
-#endif
-
-}
-
-
-static uint64_t calc_plane_error(unsigned char *orig, int orig_stride,
- unsigned char *recon, int recon_stride,
- unsigned int cols, unsigned int rows) {
- unsigned int row, col;
- uint64_t total_sse = 0;
- int diff;
-
- for (row = 0; row + 16 <= rows; row += 16) {
- for (col = 0; col + 16 <= cols; col += 16) {
- unsigned int sse;
-
- vp9_mse16x16(orig + col, orig_stride, recon + col, recon_stride, &sse);
- total_sse += sse;
- }
-
- /* Handle odd-sized width */
- if (col < cols) {
- unsigned int border_row, border_col;
- unsigned char *border_orig = orig;
- unsigned char *border_recon = recon;
-
- for (border_row = 0; border_row < 16; border_row++) {
- for (border_col = col; border_col < cols; border_col++) {
- diff = border_orig[border_col] - border_recon[border_col];
- total_sse += diff * diff;
- }
-
- border_orig += orig_stride;
- border_recon += recon_stride;
- }
- }
-
- orig += orig_stride * 16;
- recon += recon_stride * 16;
- }
-
- /* Handle odd-sized height */
- for (; row < rows; row++) {
- for (col = 0; col < cols; col++) {
- diff = orig[col] - recon[col];
- total_sse += diff * diff;
- }
-
- orig += orig_stride;
- recon += recon_stride;
- }
-
- return total_sse;
-}
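-
-// calc_plane_error() accumulates SSE over 16x16 tiles via vp9_mse16x16 and
-// falls back to per-pixel sums for any right/bottom remainder; it feeds the
-// PSNR packet generation below.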
-
-
-static void generate_psnr_packet(VP9_COMP *cpi) {
- YV12_BUFFER_CONFIG *orig = cpi->Source;
- YV12_BUFFER_CONFIG *recon = cpi->common.frame_to_show;
- struct vpx_codec_cx_pkt pkt;
- uint64_t sse;
- int i;
- unsigned int width = cpi->common.Width;
- unsigned int height = cpi->common.Height;
-
- pkt.kind = VPX_CODEC_PSNR_PKT;
- sse = calc_plane_error(orig->y_buffer, orig->y_stride,
- recon->y_buffer, recon->y_stride,
- width, height);
- pkt.data.psnr.sse[0] = sse;
- pkt.data.psnr.sse[1] = sse;
- pkt.data.psnr.samples[0] = width * height;
- pkt.data.psnr.samples[1] = width * height;
-
- width = (width + 1) / 2;
- height = (height + 1) / 2;
-
- sse = calc_plane_error(orig->u_buffer, orig->uv_stride,
- recon->u_buffer, recon->uv_stride,
- width, height);
- pkt.data.psnr.sse[0] += sse;
- pkt.data.psnr.sse[2] = sse;
- pkt.data.psnr.samples[0] += width * height;
- pkt.data.psnr.samples[2] = width * height;
-
- sse = calc_plane_error(orig->v_buffer, orig->uv_stride,
- recon->v_buffer, recon->uv_stride,
- width, height);
- pkt.data.psnr.sse[0] += sse;
- pkt.data.psnr.sse[3] = sse;
- pkt.data.psnr.samples[0] += width * height;
- pkt.data.psnr.samples[3] = width * height;
-
- for (i = 0; i < 4; i++)
- pkt.data.psnr.psnr[i] = vp9_mse2psnr(pkt.data.psnr.samples[i], 255.0,
- pkt.data.psnr.sse[i]);
-
- vpx_codec_pkt_list_add(cpi->output_pkt_list, &pkt);
-}
-
-
-int vp9_use_as_reference(VP9_PTR ptr, int ref_frame_flags) {
- VP9_COMP *cpi = (VP9_COMP *)(ptr);
-
- if (ref_frame_flags > 7)
- return -1;
-
- cpi->ref_frame_flags = ref_frame_flags;
- return 0;
-}
-int vp9_update_reference(VP9_PTR ptr, int ref_frame_flags) {
- VP9_COMP *cpi = (VP9_COMP *)(ptr);
-
- if (ref_frame_flags > 7)
- return -1;
-
- cpi->common.refresh_golden_frame = 0;
- cpi->common.refresh_alt_ref_frame = 0;
- cpi->common.refresh_last_frame = 0;
-
- if (ref_frame_flags & VP9_LAST_FLAG)
- cpi->common.refresh_last_frame = 1;
-
- if (ref_frame_flags & VP9_GOLD_FLAG)
- cpi->common.refresh_golden_frame = 1;
-
- if (ref_frame_flags & VP9_ALT_FLAG)
- cpi->common.refresh_alt_ref_frame = 1;
-
- return 0;
-}
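-
-// Both entry points above take ref_frame_flags as a bitmask of VP9_LAST_FLAG,
-// VP9_GOLD_FLAG and VP9_ALT_FLAG; any value above 7 is rejected as invalid.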
-
-int vp9_get_reference_enc(VP9_PTR ptr, VP9_REFFRAME ref_frame_flag,
- YV12_BUFFER_CONFIG *sd) {
- VP9_COMP *cpi = (VP9_COMP *)(ptr);
- VP9_COMMON *cm = &cpi->common;
- int ref_fb_idx;
-
- if (ref_frame_flag == VP9_LAST_FLAG)
- ref_fb_idx = cm->lst_fb_idx;
- else if (ref_frame_flag == VP9_GOLD_FLAG)
- ref_fb_idx = cm->gld_fb_idx;
- else if (ref_frame_flag == VP9_ALT_FLAG)
- ref_fb_idx = cm->alt_fb_idx;
- else
- return -1;
-
- vp8_yv12_copy_frame_ptr(&cm->yv12_fb[ref_fb_idx], sd);
-
- return 0;
-}
-
-int vp9_set_reference_enc(VP9_PTR ptr, VP9_REFFRAME ref_frame_flag,
- YV12_BUFFER_CONFIG *sd) {
- VP9_COMP *cpi = (VP9_COMP *)(ptr);
- VP9_COMMON *cm = &cpi->common;
-
- int ref_fb_idx;
-
- if (ref_frame_flag == VP9_LAST_FLAG)
- ref_fb_idx = cm->lst_fb_idx;
- else if (ref_frame_flag == VP9_GOLD_FLAG)
- ref_fb_idx = cm->gld_fb_idx;
- else if (ref_frame_flag == VP9_ALT_FLAG)
- ref_fb_idx = cm->alt_fb_idx;
- else
- return -1;
-
- vp8_yv12_copy_frame_ptr(sd, &cm->yv12_fb[ref_fb_idx]);
-
- return 0;
-}
-int vp9_update_entropy(VP9_PTR comp, int update) {
- VP9_COMP *cpi = (VP9_COMP *) comp;
- VP9_COMMON *cm = &cpi->common;
- cm->refresh_entropy_probs = update;
-
- return 0;
-}
-
-
-#ifdef OUTPUT_YUV_SRC
-void vp9_write_yuv_frame(YV12_BUFFER_CONFIG *s) {
- unsigned char *src = s->y_buffer;
- int h = s->y_height;
-
- do {
- fwrite(src, s->y_width, 1, yuv_file);
- src += s->y_stride;
- } while (--h);
-
- src = s->u_buffer;
- h = s->uv_height;
-
- do {
- fwrite(src, s->uv_width, 1, yuv_file);
- src += s->uv_stride;
- } while (--h);
-
- src = s->v_buffer;
- h = s->uv_height;
-
- do {
- fwrite(src, s->uv_width, 1, yuv_file);
- src += s->uv_stride;
- } while (--h);
-}
-#endif
-
-#ifdef OUTPUT_YUV_REC
-void vp9_write_yuv_rec_frame(VP9_COMMON *cm) {
- YV12_BUFFER_CONFIG *s = cm->frame_to_show;
- unsigned char *src = s->y_buffer;
- int h = cm->Height;
-
- do {
- fwrite(src, s->y_width, 1, yuv_rec_file);
- src += s->y_stride;
- } while (--h);
-
- src = s->u_buffer;
- h = (cm->Height + 1) / 2;
-
- do {
- fwrite(src, s->uv_width, 1, yuv_rec_file);
- src += s->uv_stride;
- } while (--h);
-
- src = s->v_buffer;
- h = (cm->Height + 1) / 2;
-
- do {
- fwrite(src, s->uv_width, 1, yuv_rec_file);
- src += s->uv_stride;
- } while (--h);
-}
-#endif
-
-static void update_alt_ref_frame_stats(VP9_COMP *cpi) {
- VP9_COMMON *cm = &cpi->common;
-
- // Update data structure that monitors level of reference to last GF
- vpx_memset(cpi->gf_active_flags, 1, (cm->mb_rows * cm->mb_cols));
- cpi->gf_active_count = cm->mb_rows * cm->mb_cols;
-
-  // this frame refreshing means following frames don't, unless specified by the user
- cpi->common.frames_since_golden = 0;
-
- // Clear the alternate reference update pending flag.
- cpi->source_alt_ref_pending = FALSE;
-
-  // Set the alternate reference frame active flag
- cpi->source_alt_ref_active = TRUE;
-
-
-}
-static void update_golden_frame_stats(VP9_COMP *cpi) {
- VP9_COMMON *cm = &cpi->common;
-
- // Update the Golden frame usage counts.
- if (cm->refresh_golden_frame) {
- // Update data structure that monitors level of reference to last GF
- vpx_memset(cpi->gf_active_flags, 1, (cm->mb_rows * cm->mb_cols));
- cpi->gf_active_count = cm->mb_rows * cm->mb_cols;
-
-    // this frame refreshing means following frames don't, unless specified by the user
- cm->refresh_golden_frame = 0;
- cpi->common.frames_since_golden = 0;
-
- // if ( cm->frame_type == KEY_FRAME )
- // {
- cpi->recent_ref_frame_usage[INTRA_FRAME] = 1;
- cpi->recent_ref_frame_usage[LAST_FRAME] = 1;
- cpi->recent_ref_frame_usage[GOLDEN_FRAME] = 1;
- cpi->recent_ref_frame_usage[ALTREF_FRAME] = 1;
- // }
- // else
- // {
-    //    //  Carry a portion of the count over to the beginning of the next gf sequence
- // cpi->recent_ref_frame_usage[INTRA_FRAME] >>= 5;
- // cpi->recent_ref_frame_usage[LAST_FRAME] >>= 5;
- // cpi->recent_ref_frame_usage[GOLDEN_FRAME] >>= 5;
- // cpi->recent_ref_frame_usage[ALTREF_FRAME] >>= 5;
- // }
-
- // ******** Fixed Q test code only ************
- // If we are going to use the ALT reference for the next group of frames set a flag to say so.
- if (cpi->oxcf.fixed_q >= 0 &&
- cpi->oxcf.play_alternate && !cpi->common.refresh_alt_ref_frame) {
- cpi->source_alt_ref_pending = TRUE;
- cpi->frames_till_gf_update_due = cpi->baseline_gf_interval;
- }
-
- if (!cpi->source_alt_ref_pending)
- cpi->source_alt_ref_active = FALSE;
-
- // Decrement count down till next gf
- if (cpi->frames_till_gf_update_due > 0)
- cpi->frames_till_gf_update_due--;
-
- } else if (!cpi->common.refresh_alt_ref_frame) {
- // Decrement count down till next gf
- if (cpi->frames_till_gf_update_due > 0)
- cpi->frames_till_gf_update_due--;
-
- if (cpi->common.frames_till_alt_ref_frame)
- cpi->common.frames_till_alt_ref_frame--;
-
- cpi->common.frames_since_golden++;
-
- if (cpi->common.frames_since_golden > 1) {
- cpi->recent_ref_frame_usage[INTRA_FRAME] += cpi->count_mb_ref_frame_usage[INTRA_FRAME];
- cpi->recent_ref_frame_usage[LAST_FRAME] += cpi->count_mb_ref_frame_usage[LAST_FRAME];
- cpi->recent_ref_frame_usage[GOLDEN_FRAME] += cpi->count_mb_ref_frame_usage[GOLDEN_FRAME];
- cpi->recent_ref_frame_usage[ALTREF_FRAME] += cpi->count_mb_ref_frame_usage[ALTREF_FRAME];
- }
- }
-}
-
-static int find_fp_qindex() {
- int i;
-
- for (i = 0; i < QINDEX_RANGE; i++) {
- if (vp9_convert_qindex_to_q(i) >= 30.0) {
- break;
- }
- }
-
- if (i == QINDEX_RANGE)
- i--;
-
- return i;
-}
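-
-// find_fp_qindex() picks the lowest qindex whose effective Q reaches 30.0;
-// Pass1Encode() below locks the quantizer to that value for first-pass stats.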
-
-static void Pass1Encode(VP9_COMP *cpi, unsigned long *size, unsigned char *dest, unsigned int *frame_flags) {
- (void) size;
- (void) dest;
- (void) frame_flags;
-
-
- vp9_set_quantizer(cpi, find_fp_qindex());
- vp9_first_pass(cpi);
-}
-
-#define WRITE_RECON_BUFFER 0
-#if WRITE_RECON_BUFFER
-void write_cx_frame_to_file(YV12_BUFFER_CONFIG *frame, int this_frame) {
-
- // write the frame
- FILE *yframe;
- int i;
- char filename[255];
-
- sprintf(filename, "cx\\y%04d.raw", this_frame);
- yframe = fopen(filename, "wb");
-
- for (i = 0; i < frame->y_height; i++)
- fwrite(frame->y_buffer + i * frame->y_stride,
- frame->y_width, 1, yframe);
-
- fclose(yframe);
- sprintf(filename, "cx\\u%04d.raw", this_frame);
- yframe = fopen(filename, "wb");
-
- for (i = 0; i < frame->uv_height; i++)
- fwrite(frame->u_buffer + i * frame->uv_stride,
- frame->uv_width, 1, yframe);
-
- fclose(yframe);
- sprintf(filename, "cx\\v%04d.raw", this_frame);
- yframe = fopen(filename, "wb");
-
- for (i = 0; i < frame->uv_height; i++)
- fwrite(frame->v_buffer + i * frame->uv_stride,
- frame->uv_width, 1, yframe);
-
- fclose(yframe);
-}
-#endif
-
-static double compute_edge_pixel_proportion(YV12_BUFFER_CONFIG *frame) {
-#define EDGE_THRESH 128
- int i, j;
- int num_edge_pels = 0;
- int num_pels = (frame->y_height - 2) * (frame->y_width - 2);
- unsigned char *prev = frame->y_buffer + 1;
- unsigned char *curr = frame->y_buffer + 1 + frame->y_stride;
- unsigned char *next = frame->y_buffer + 1 + 2 * frame->y_stride;
- for (i = 1; i < frame->y_height - 1; i++) {
- for (j = 1; j < frame->y_width - 1; j++) {
- /* Sobel hor and ver gradients */
- int v = 2 * (curr[1] - curr[-1]) + (prev[1] - prev[-1]) + (next[1] - next[-1]);
- int h = 2 * (prev[0] - next[0]) + (prev[1] - next[1]) + (prev[-1] - next[-1]);
- h = (h < 0 ? -h : h);
- v = (v < 0 ? -v : v);
- if (h > EDGE_THRESH || v > EDGE_THRESH) num_edge_pels++;
- curr++;
- prev++;
- next++;
- }
- curr += frame->y_stride - frame->y_width + 2;
- prev += frame->y_stride - frame->y_width + 2;
- next += frame->y_stride - frame->y_width + 2;
- }
- return (double)num_edge_pels / (double)num_pels;
-}
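-
-// compute_edge_pixel_proportion() applies 3x3 Sobel gradients to the interior
-// luma pixels and returns the fraction whose horizontal or vertical response
-// exceeds EDGE_THRESH (128).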
-
-// Function to test for conditions that indicate we should loop
-// back and recode a frame.
-static BOOL recode_loop_test(VP9_COMP *cpi,
- int high_limit, int low_limit,
- int q, int maxq, int minq) {
- BOOL force_recode = FALSE;
- VP9_COMMON *cm = &cpi->common;
-
-  // Is frame recode allowed at all?
-  // Yes if either recode mode 1 is selected, or mode 2 is selected
-  // and the frame is a key frame, golden frame or alt ref frame.
- if ((cpi->sf.recode_loop == 1) ||
- ((cpi->sf.recode_loop == 2) &&
- ((cm->frame_type == KEY_FRAME) ||
- cm->refresh_golden_frame ||
- cm->refresh_alt_ref_frame))) {
- // General over and under shoot tests
- if (((cpi->projected_frame_size > high_limit) && (q < maxq)) ||
- ((cpi->projected_frame_size < low_limit) && (q > minq))) {
- force_recode = TRUE;
- }
- // Special Constrained quality tests
- else if (cpi->oxcf.end_usage == USAGE_CONSTRAINED_QUALITY) {
- // Undershoot and below auto cq level
- if ((q > cpi->cq_target_quality) &&
- (cpi->projected_frame_size <
- ((cpi->this_frame_target * 7) >> 3))) {
- force_recode = TRUE;
- }
- // Severe undershoot and between auto and user cq level
- else if ((q > cpi->oxcf.cq_level) &&
- (cpi->projected_frame_size < cpi->min_frame_bandwidth) &&
- (cpi->active_best_quality > cpi->oxcf.cq_level)) {
- force_recode = TRUE;
- cpi->active_best_quality = cpi->oxcf.cq_level;
- }
- }
- }
-
- return force_recode;
-}
-
-static void update_reference_frames(VP9_COMMON *cm) {
- YV12_BUFFER_CONFIG *yv12_fb = cm->yv12_fb;
-
- // At this point the new frame has been encoded.
- // If any buffer copy / swapping is signaled it should be done here.
-
- if (cm->frame_type == KEY_FRAME) {
- yv12_fb[cm->new_fb_idx].flags |= VP9_GOLD_FLAG | VP9_ALT_FLAG;
-
- yv12_fb[cm->gld_fb_idx].flags &= ~VP9_GOLD_FLAG;
- yv12_fb[cm->alt_fb_idx].flags &= ~VP9_ALT_FLAG;
-
- cm->alt_fb_idx = cm->gld_fb_idx = cm->new_fb_idx;
- } else { /* For non key frames */
- if (cm->refresh_alt_ref_frame) {
- assert(!cm->copy_buffer_to_arf);
-
- cm->yv12_fb[cm->new_fb_idx].flags |= VP9_ALT_FLAG;
- cm->yv12_fb[cm->alt_fb_idx].flags &= ~VP9_ALT_FLAG;
- cm->alt_fb_idx = cm->new_fb_idx;
- } else if (cm->copy_buffer_to_arf) {
- assert(!(cm->copy_buffer_to_arf & ~0x3));
-
- if (cm->copy_buffer_to_arf == 1) {
- if (cm->alt_fb_idx != cm->lst_fb_idx) {
- yv12_fb[cm->lst_fb_idx].flags |= VP9_ALT_FLAG;
- yv12_fb[cm->alt_fb_idx].flags &= ~VP9_ALT_FLAG;
- cm->alt_fb_idx = cm->lst_fb_idx;
- }
- } else { /* if (cm->copy_buffer_to_arf == 2) */
- if (cm->alt_fb_idx != cm->gld_fb_idx) {
- yv12_fb[cm->gld_fb_idx].flags |= VP9_ALT_FLAG;
- yv12_fb[cm->alt_fb_idx].flags &= ~VP9_ALT_FLAG;
- cm->alt_fb_idx = cm->gld_fb_idx;
- }
- }
- }
-
- if (cm->refresh_golden_frame) {
- assert(!cm->copy_buffer_to_gf);
-
- cm->yv12_fb[cm->new_fb_idx].flags |= VP9_GOLD_FLAG;
- cm->yv12_fb[cm->gld_fb_idx].flags &= ~VP9_GOLD_FLAG;
- cm->gld_fb_idx = cm->new_fb_idx;
- } else if (cm->copy_buffer_to_gf) {
- assert(!(cm->copy_buffer_to_arf & ~0x3));
-
- if (cm->copy_buffer_to_gf == 1) {
- if (cm->gld_fb_idx != cm->lst_fb_idx) {
- yv12_fb[cm->lst_fb_idx].flags |= VP9_GOLD_FLAG;
- yv12_fb[cm->gld_fb_idx].flags &= ~VP9_GOLD_FLAG;
- cm->gld_fb_idx = cm->lst_fb_idx;
- }
- } else { /* if (cm->copy_buffer_to_gf == 2) */
- if (cm->alt_fb_idx != cm->gld_fb_idx) {
- yv12_fb[cm->alt_fb_idx].flags |= VP9_GOLD_FLAG;
- yv12_fb[cm->gld_fb_idx].flags &= ~VP9_GOLD_FLAG;
- cm->gld_fb_idx = cm->alt_fb_idx;
- }
- }
- }
- }
-
- if (cm->refresh_last_frame) {
- cm->yv12_fb[cm->new_fb_idx].flags |= VP9_LAST_FLAG;
- cm->yv12_fb[cm->lst_fb_idx].flags &= ~VP9_LAST_FLAG;
- cm->lst_fb_idx = cm->new_fb_idx;
- }
-}
-
-static void loopfilter_frame(VP9_COMP *cpi, VP9_COMMON *cm) {
- if (cm->no_lpf) {
- cm->filter_level = 0;
- }
-#if CONFIG_LOSSLESS
- else if (cpi->oxcf.lossless) {
- cm->filter_level = 0;
- }
-#endif
- else {
- struct vpx_usec_timer timer;
-
- vp9_clear_system_state();
-
- vpx_usec_timer_start(&timer);
- if (cpi->sf.auto_filter == 0)
- vp9_pick_filter_level_fast(cpi->Source, cpi);
- else
- vp9_pick_filter_level(cpi->Source, cpi);
-
- vpx_usec_timer_mark(&timer);
- cpi->time_pick_lpf += vpx_usec_timer_elapsed(&timer);
- }
-
- if (cm->filter_level > 0) {
- vp9_set_alt_lf_level(cpi, cm->filter_level);
- vp9_loop_filter_frame(cm, &cpi->mb.e_mbd);
- }
-
- vp8_yv12_extend_frame_borders_ptr(cm->frame_to_show);
-
-}
-
-#if CONFIG_PRED_FILTER
-void select_pred_filter_mode(VP9_COMP *cpi) {
- VP9_COMMON *cm = &cpi->common;
-
- int prob_pred_filter_off = cm->prob_pred_filter_off;
-
- // Force filter on/off if probability is extreme
- if (prob_pred_filter_off >= 255 * 0.95)
- cm->pred_filter_mode = 0; // Off at the frame level
- else if (prob_pred_filter_off <= 255 * 0.05)
- cm->pred_filter_mode = 1; // On at the frame level
- else
- cm->pred_filter_mode = 2; // Selectable at the MB level
-}
-
-void update_pred_filt_prob(VP9_COMP *cpi) {
- VP9_COMMON *cm = &cpi->common;
- int prob_pred_filter_off;
-
- // Based on the selection in the previous frame determine what mode
- // to use for the current frame and work out the signaling probability
- if (cpi->pred_filter_on_count + cpi->pred_filter_off_count) {
- prob_pred_filter_off = cpi->pred_filter_off_count * 256 /
- (cpi->pred_filter_on_count + cpi->pred_filter_off_count);
-
- if (prob_pred_filter_off < 1)
- prob_pred_filter_off = 1;
-
- if (prob_pred_filter_off > 255)
- prob_pred_filter_off = 255;
-
- cm->prob_pred_filter_off = prob_pred_filter_off;
- } else
- cm->prob_pred_filter_off = 128;
- /*
- {
- FILE *fp = fopen("filt_use.txt", "a");
- fprintf (fp, "%d %d prob=%d\n", cpi->pred_filter_off_count,
- cpi->pred_filter_on_count, cm->prob_pred_filter_off);
- fclose(fp);
- }
- */
-}
-#endif
-
-static void encode_frame_to_data_rate
-(
- VP9_COMP *cpi,
- unsigned long *size,
- unsigned char *dest,
- unsigned int *frame_flags
-) {
- VP9_COMMON *cm = &cpi->common;
- MACROBLOCKD *xd = &cpi->mb.e_mbd;
-
- int Q;
- int frame_over_shoot_limit;
- int frame_under_shoot_limit;
-
- int Loop = FALSE;
- int loop_count;
- int this_q;
- int last_zbin_oq;
-
- int q_low;
- int q_high;
- int zbin_oq_high;
- int zbin_oq_low = 0;
-
- int top_index;
- int bottom_index;
- int active_worst_qchanged = FALSE;
-
- int overshoot_seen = FALSE;
- int undershoot_seen = FALSE;
-
- int loop_size_estimate = 0;
-
- SPEED_FEATURES *sf = &cpi->sf;
-#if RESET_FOREACH_FILTER
- int q_low0;
- int q_high0;
- int zbin_oq_high0;
- int zbin_oq_low0 = 0;
- int Q0;
- int last_zbin_oq0;
- int active_best_quality0;
- int active_worst_quality0;
- double rate_correction_factor0;
- double gf_rate_correction_factor0;
-#endif
-
- /* list of filters to search over */
- int mcomp_filters_to_search[] = {
- EIGHTTAP, EIGHTTAP_SHARP, SIXTAP, SWITCHABLE
- };
- int mcomp_filters = sizeof(mcomp_filters_to_search) /
- sizeof(*mcomp_filters_to_search);
- int mcomp_filter_index = 0;
- INT64 mcomp_filter_cost[4];
-
- // Clear down mmx registers to allow floating point in what follows
- vp9_clear_system_state();
-
-
- // For an alt ref frame in 2 pass we skip the call to the second
- // pass function that sets the target bandwidth so must set it here
- if (cpi->common.refresh_alt_ref_frame) {
- cpi->per_frame_bandwidth = cpi->twopass.gf_bits; // Per frame bit target for the alt ref frame
- cpi->target_bandwidth = cpi->twopass.gf_bits * cpi->output_frame_rate; // per second target bitrate
- }
-
-  // By default, turn off buffer-to-buffer copying
- cm->copy_buffer_to_gf = 0;
- cm->copy_buffer_to_arf = 0;
-
- // Clear zbin over-quant value and mode boost values.
- cpi->zbin_over_quant = 0;
- cpi->zbin_mode_boost = 0;
-
- // Enable or disable mode based tweaking of the zbin
- // For 2 Pass Only used where GF/ARF prediction quality
- // is above a threshold
- cpi->zbin_mode_boost = 0;
-#if CONFIG_LOSSLESS
- cpi->zbin_mode_boost_enabled = FALSE;
-#else
- cpi->zbin_mode_boost_enabled = TRUE;
-#endif
- if (cpi->gfu_boost <= 400) {
- cpi->zbin_mode_boost_enabled = FALSE;
- }
-
- // Current default encoder behaviour for the altref sign bias
- if (cpi->source_alt_ref_active)
- cpi->common.ref_frame_sign_bias[ALTREF_FRAME] = 1;
- else
- cpi->common.ref_frame_sign_bias[ALTREF_FRAME] = 0;
-
- // Check to see if a key frame is signalled
- // For two pass with auto key frame enabled cm->frame_type may already be set, but not for one pass.
- if ((cm->current_video_frame == 0) ||
- (cm->frame_flags & FRAMEFLAGS_KEY) ||
- (cpi->oxcf.auto_key && (cpi->frames_since_key % cpi->key_frame_frequency == 0))) {
- // Key frame from VFW/auto-keyframe/first frame
- cm->frame_type = KEY_FRAME;
- }
-
- // Set default state for segment based loop filter update flags
- xd->mode_ref_lf_delta_update = 0;
-
- // Set various flags etc to special state if it is a key frame
- if (cm->frame_type == KEY_FRAME) {
- int i;
-
- // Reset the loop filter deltas and segmentation map
- setup_features(cpi);
-
- // If segmentation is enabled force a map update for key frames
- if (xd->segmentation_enabled) {
- xd->update_mb_segmentation_map = 1;
- xd->update_mb_segmentation_data = 1;
- }
-
- // The alternate reference frame cannot be active for a key frame
- cpi->source_alt_ref_active = FALSE;
-
- // Reset the RD threshold multipliers to default of * 1 (128)
- for (i = 0; i < MAX_MODES; i++) {
- cpi->rd_thresh_mult[i] = 128;
- }
- }
-
- // Test code for new segment features
- init_seg_features(cpi);
-
- // Decide how big to make the frame
- vp9_pick_frame_size(cpi);
-
- vp9_clear_system_state();
-
- // Set an active best quality and if necessary active worst quality
- Q = cpi->active_worst_quality;
-
- if (cm->frame_type == KEY_FRAME) {
- int high = 2000;
- int low = 400;
-
- if (cpi->kf_boost > high)
- cpi->active_best_quality = kf_low_motion_minq[Q];
- else if (cpi->kf_boost < low)
- cpi->active_best_quality = kf_high_motion_minq[Q];
- else {
- int gap = high - low;
- int offset = high - cpi->kf_boost;
- int qdiff = kf_high_motion_minq[Q] - kf_low_motion_minq[Q];
- int adjustment = ((offset * qdiff) + (gap >> 1)) / gap;
-
- cpi->active_best_quality = kf_low_motion_minq[Q] + adjustment;
- }
-
-    // Make an adjustment based on the % of the image that is static.
- // The main impact of this is at lower Q to prevent overly large key
- // frames unless a lot of the image is static.
- if (cpi->kf_zeromotion_pct < 64)
- cpi->active_best_quality += 4 - (cpi->kf_zeromotion_pct >> 4);
-
- // Special case for key frames forced because we have reached
- // the maximum key frame interval. Here force the Q to a range
- // based on the ambient Q to reduce the risk of popping
- if (cpi->this_key_frame_forced) {
- int delta_qindex;
- int qindex = cpi->last_boosted_qindex;
-
- delta_qindex = compute_qdelta(cpi, qindex,
- (qindex * 0.75));
-
- cpi->active_best_quality = qindex + delta_qindex;
- if (cpi->active_best_quality < cpi->best_quality)
- cpi->active_best_quality = cpi->best_quality;
- }
- }
-
- else if (cm->refresh_golden_frame || cpi->common.refresh_alt_ref_frame) {
- int high = 2000;
- int low = 400;
-
- // Use the lower of cpi->active_worst_quality and recent
- // average Q as basis for GF/ARF Q limit unless last frame was
- // a key frame.
- if ((cpi->frames_since_key > 1) &&
- (cpi->avg_frame_qindex < cpi->active_worst_quality)) {
- Q = cpi->avg_frame_qindex;
- }
-
- // For constrained quality, don't allow Q less than the cq level
- if ((cpi->oxcf.end_usage == USAGE_CONSTRAINED_QUALITY) &&
- (Q < cpi->cq_target_quality)) {
- Q = cpi->cq_target_quality;
- }
-
- if (cpi->gfu_boost > high)
- cpi->active_best_quality = gf_low_motion_minq[Q];
- else if (cpi->gfu_boost < low)
- cpi->active_best_quality = gf_high_motion_minq[Q];
- else {
- int gap = high - low;
- int offset = high - cpi->gfu_boost;
- int qdiff = gf_high_motion_minq[Q] - gf_low_motion_minq[Q];
- int adjustment = ((offset * qdiff) + (gap >> 1)) / gap;
-
- cpi->active_best_quality = gf_low_motion_minq[Q] + adjustment;
- }
-
- // Constrained quality use slightly lower active best.
- if (cpi->oxcf.end_usage == USAGE_CONSTRAINED_QUALITY) {
- cpi->active_best_quality =
- cpi->active_best_quality * 15 / 16;
- }
- } else {
- cpi->active_best_quality = inter_minq[Q];
-
- // For the constant/constrained quality mode we don't want
- // q to fall below the cq level.
- if ((cpi->oxcf.end_usage == USAGE_CONSTRAINED_QUALITY) &&
- (cpi->active_best_quality < cpi->cq_target_quality)) {
- // If we are strongly undershooting the target rate in the last
- // frames then use the user passed in cq value not the auto
- // cq value.
- if (cpi->rolling_actual_bits < cpi->min_frame_bandwidth)
- cpi->active_best_quality = cpi->oxcf.cq_level;
- else
- cpi->active_best_quality = cpi->cq_target_quality;
- }
- }
-
- // Clip the active best and worst quality values to limits
- if (cpi->active_worst_quality > cpi->worst_quality)
- cpi->active_worst_quality = cpi->worst_quality;
-
- if (cpi->active_best_quality < cpi->best_quality)
- cpi->active_best_quality = cpi->best_quality;
-
- if (cpi->active_best_quality > cpi->worst_quality)
- cpi->active_best_quality = cpi->worst_quality;
-
- if (cpi->active_worst_quality < cpi->active_best_quality)
- cpi->active_worst_quality = cpi->active_best_quality;
-
- // Special case code to try to match quality with forced key frames
- if ((cm->frame_type == KEY_FRAME) && cpi->this_key_frame_forced) {
- Q = cpi->last_boosted_qindex;
- } else {
- // Determine initial Q to try
- Q = vp9_regulate_q(cpi, cpi->this_frame_target);
- }
- last_zbin_oq = cpi->zbin_over_quant;
-
- // Set highest allowed value for Zbin over quant
- if (cm->frame_type == KEY_FRAME)
- zbin_oq_high = 0; // ZBIN_OQ_MAX/16
- else if (cm->refresh_alt_ref_frame || (cm->refresh_golden_frame && !cpi->source_alt_ref_active))
- zbin_oq_high = 16;
- else
- zbin_oq_high = ZBIN_OQ_MAX;
-
- vp9_compute_frame_size_bounds(cpi, &frame_under_shoot_limit,
- &frame_over_shoot_limit);
-
- // Limit Q range for the adaptive loop.
- bottom_index = cpi->active_best_quality;
- top_index = cpi->active_worst_quality;
- q_low = cpi->active_best_quality;
- q_high = cpi->active_worst_quality;
-
- loop_count = 0;
-
- if (cm->frame_type != KEY_FRAME) {
- /* TODO: Decide this more intelligently */
- if (sf->search_best_filter) {
- cm->mcomp_filter_type = mcomp_filters_to_search[0];
- mcomp_filter_index = 0;
- } else {
- cm->mcomp_filter_type = DEFAULT_INTERP_FILTER;
- }
- /* TODO: Decide this more intelligently */
- xd->allow_high_precision_mv = (Q < HIGH_PRECISION_MV_QTHRESH);
- }
-
-#if CONFIG_POSTPROC
-
- if (cpi->oxcf.noise_sensitivity > 0) {
- unsigned char *src;
- int l = 0;
-
- switch (cpi->oxcf.noise_sensitivity) {
- case 1:
- l = 20;
- break;
- case 2:
- l = 40;
- break;
- case 3:
- l = 60;
- break;
- case 4:
-
- case 5:
- l = 100;
- break;
- case 6:
- l = 150;
- break;
- }
-
-
- if (cm->frame_type == KEY_FRAME) {
- vp9_de_noise(cpi->Source, cpi->Source, l, 1, 0, RTCD(postproc));
- } else {
- vp9_de_noise(cpi->Source, cpi->Source, l, 1, 0, RTCD(postproc));
-
- src = cpi->Source->y_buffer;
-
- if (cpi->Source->y_stride < 0) {
- src += cpi->Source->y_stride * (cpi->Source->y_height - 1);
- }
- }
- }
-
-#endif
-
-#ifdef OUTPUT_YUV_SRC
- vp9_write_yuv_frame(cpi->Source);
-#endif
-
-#if RESET_FOREACH_FILTER
- if (sf->search_best_filter) {
- q_low0 = q_low;
- q_high0 = q_high;
- Q0 = Q;
- zbin_oq_low0 = zbin_oq_low;
- zbin_oq_high0 = zbin_oq_high;
- last_zbin_oq0 = last_zbin_oq;
- rate_correction_factor0 = cpi->rate_correction_factor;
- gf_rate_correction_factor0 = cpi->gf_rate_correction_factor;
- active_best_quality0 = cpi->active_best_quality;
- active_worst_quality0 = cpi->active_worst_quality;
- }
-#endif
- do {
- vp9_clear_system_state(); // __asm emms;
-
- vp9_set_quantizer(cpi, Q);
- this_q = Q;
-
- if (loop_count == 0) {
-
- // setup skip prob for costing in mode/mv decision
- if (cpi->common.mb_no_coeff_skip) {
- int k;
- for (k = 0; k < MBSKIP_CONTEXTS; k++)
- cm->mbskip_pred_probs[k] = cpi->base_skip_false_prob[Q][k];
-
- if (cm->frame_type != KEY_FRAME) {
- if (cpi->common.refresh_alt_ref_frame) {
- for (k = 0; k < MBSKIP_CONTEXTS; k++) {
- if (cpi->last_skip_false_probs[2][k] != 0)
- cm->mbskip_pred_probs[k] = cpi->last_skip_false_probs[2][k];
- }
- } else if (cpi->common.refresh_golden_frame) {
- for (k = 0; k < MBSKIP_CONTEXTS; k++) {
- if (cpi->last_skip_false_probs[1][k] != 0)
- cm->mbskip_pred_probs[k] = cpi->last_skip_false_probs[1][k];
- }
- } else {
- int k;
- for (k = 0; k < MBSKIP_CONTEXTS; k++) {
- if (cpi->last_skip_false_probs[0][k] != 0)
- cm->mbskip_pred_probs[k] = cpi->last_skip_false_probs[0][k];
- }
- }
-
- // as this is for cost estimate, let's make sure it does not
- // get extreme either way
- {
- int k;
- for (k = 0; k < MBSKIP_CONTEXTS; ++k) {
- if (cm->mbskip_pred_probs[k] < 5)
- cm->mbskip_pred_probs[k] = 5;
-
- if (cm->mbskip_pred_probs[k] > 250)
- cm->mbskip_pred_probs[k] = 250;
-
- if (cpi->is_src_frame_alt_ref)
- cm->mbskip_pred_probs[k] = 1;
- }
- }
- }
- }
-
- // Set up entropy depending on frame type.
- if (cm->frame_type == KEY_FRAME)
- vp9_setup_key_frame(cpi);
- else
- vp9_setup_inter_frame(cpi);
- }
-
- // transform / motion compensation build reconstruction frame
-
- vp9_encode_frame(cpi);
-
- // Update the skip mb flag probabilities based on the distribution
- // seen in the last encoder iteration.
- update_base_skip_probs(cpi);
-
- vp9_clear_system_state(); // __asm emms;
-
-#if CONFIG_PRED_FILTER
- // Update prediction filter on/off probability based on
- // selection made for the current frame
- if (cm->frame_type != KEY_FRAME)
- update_pred_filt_prob(cpi);
-#endif
-
- // Dummy pack of the bitstream using up to date stats to get an
- // accurate estimate of output frame size to determine if we need
- // to recode.
- vp9_save_coding_context(cpi);
- cpi->dummy_packing = 1;
- vp9_pack_bitstream(cpi, dest, size);
- cpi->projected_frame_size = (*size) << 3;
- vp9_restore_coding_context(cpi);
-
- if (frame_over_shoot_limit == 0)
- frame_over_shoot_limit = 1;
- active_worst_qchanged = FALSE;
-
- // Special case handling for forced key frames
- if ((cm->frame_type == KEY_FRAME) && cpi->this_key_frame_forced) {
- int last_q = Q;
- int kf_err = vp9_calc_ss_err(cpi->Source,
- &cm->yv12_fb[cm->new_fb_idx]);
-
- int high_err_target = cpi->ambient_err;
- int low_err_target = (cpi->ambient_err >> 1);
-
- // Prevent possible divide by zero error below for perfect KF
- kf_err += (!kf_err);
-
- // The key frame is not good enough or we can afford
- // to make it better without undue risk of popping.
- if (((kf_err > high_err_target) &&
- (cpi->projected_frame_size <= frame_over_shoot_limit)) ||
- ((kf_err > low_err_target) &&
- (cpi->projected_frame_size <= frame_under_shoot_limit))) {
- // Lower q_high
- q_high = (Q > q_low) ? (Q - 1) : q_low;
-
- // Adjust Q
- Q = (Q * high_err_target) / kf_err;
- if (Q < ((q_high + q_low) >> 1))
- Q = (q_high + q_low) >> 1;
- }
- // The key frame is much better than the previous frame
- else if ((kf_err < low_err_target) &&
- (cpi->projected_frame_size >= frame_under_shoot_limit)) {
- // Raise q_low
- q_low = (Q < q_high) ? (Q + 1) : q_high;
-
- // Adjust Q
- Q = (Q * low_err_target) / kf_err;
- if (Q > ((q_high + q_low + 1) >> 1))
- Q = (q_high + q_low + 1) >> 1;
- }
-
- // Clamp Q to upper and lower limits:
- if (Q > q_high)
- Q = q_high;
- else if (Q < q_low)
- Q = q_low;
-
- Loop = ((Q != last_q)) ? TRUE : FALSE;
- }
-
- // Is the projected frame size out of range, and are we allowed to attempt a recode?
- else if (recode_loop_test(cpi,
- frame_over_shoot_limit, frame_under_shoot_limit,
- Q, top_index, bottom_index)) {
- int last_q = Q;
- int Retries = 0;
-
- // Frame size out of permitted range:
- // Update correction factor & compute new Q to try...
-
- // Frame is too large
- if (cpi->projected_frame_size > cpi->this_frame_target) {
- q_low = (Q < q_high) ? (Q + 1) : q_high; // Raise q_low to at least the current value
-
- if (cpi->zbin_over_quant > 0) // If we are using over quant do the same for zbin_oq_low
- zbin_oq_low = (cpi->zbin_over_quant < zbin_oq_high) ? (cpi->zbin_over_quant + 1) : zbin_oq_high;
-
- if (undershoot_seen || (loop_count > 1)) {
- // Update rate_correction_factor unless cpi->active_worst_quality has changed.
- if (!active_worst_qchanged)
- vp9_update_rate_correction_factors(cpi, 1);
-
- Q = (q_high + q_low + 1) / 2;
-
- // Adjust cpi->zbin_over_quant (only allowed when Q is max)
- if (Q < MAXQ)
- cpi->zbin_over_quant = 0;
- else {
- zbin_oq_low = (cpi->zbin_over_quant < zbin_oq_high) ? (cpi->zbin_over_quant + 1) : zbin_oq_high;
- cpi->zbin_over_quant = (zbin_oq_high + zbin_oq_low) / 2;
- }
- } else {
- // Update rate_correction_factor unless cpi->active_worst_quality has changed.
- if (!active_worst_qchanged)
- vp9_update_rate_correction_factors(cpi, 0);
-
- Q = vp9_regulate_q(cpi, cpi->this_frame_target);
-
- while (((Q < q_low) || (cpi->zbin_over_quant < zbin_oq_low)) && (Retries < 10)) {
- vp9_update_rate_correction_factors(cpi, 0);
- Q = vp9_regulate_q(cpi, cpi->this_frame_target);
- Retries++;
- }
- }
-
- overshoot_seen = TRUE;
- }
- // Frame is too small
- else {
- if (cpi->zbin_over_quant == 0)
- q_high = (Q > q_low) ? (Q - 1) : q_low; // Lower q_high if not using over quant
- else // else lower zbin_oq_high
- zbin_oq_high = (cpi->zbin_over_quant > zbin_oq_low) ? (cpi->zbin_over_quant - 1) : zbin_oq_low;
-
- if (overshoot_seen || (loop_count > 1)) {
- // Update rate_correction_factor unless cpi->active_worst_quality has changed.
- if (!active_worst_qchanged)
- vp9_update_rate_correction_factors(cpi, 1);
-
- Q = (q_high + q_low) / 2;
-
- // Adjust cpi->zbin_over_quant (only allowed when Q is max)
- if (Q < MAXQ)
- cpi->zbin_over_quant = 0;
- else
- cpi->zbin_over_quant = (zbin_oq_high + zbin_oq_low) / 2;
- } else {
- // Update rate_correction_factor unless cpi->active_worst_quality has changed.
- if (!active_worst_qchanged)
- vp9_update_rate_correction_factors(cpi, 0);
-
- Q = vp9_regulate_q(cpi, cpi->this_frame_target);
-
- // Special case reset of q_low for constrained quality.
- // This should only trigger where there is very substantial
- // undershoot on a frame and the auto cq level is above
- // the user passed in value.
- if ((cpi->oxcf.end_usage == USAGE_CONSTRAINED_QUALITY) &&
- (Q < q_low)) {
- q_low = Q;
- }
-
- while (((Q > q_high) || (cpi->zbin_over_quant > zbin_oq_high)) && (Retries < 10)) {
- vp9_update_rate_correction_factors(cpi, 0);
- Q = vp9_regulate_q(cpi, cpi->this_frame_target);
- Retries++;
- }
- }
-
- undershoot_seen = TRUE;
- }
-
- // Clamp Q to upper and lower limits:
- if (Q > q_high)
- Q = q_high;
- else if (Q < q_low)
- Q = q_low;
-
- // Clamp cpi->zbin_over_quant
- cpi->zbin_over_quant = (cpi->zbin_over_quant < zbin_oq_low) ?
- zbin_oq_low : (cpi->zbin_over_quant > zbin_oq_high) ?
- zbin_oq_high : cpi->zbin_over_quant;
-
- // Loop = ((Q != last_q) || (last_zbin_oq != cpi->zbin_over_quant)) ? TRUE : FALSE;
- Loop = ((Q != last_q)) ? TRUE : FALSE;
- last_zbin_oq = cpi->zbin_over_quant;
- } else
- Loop = FALSE;
-
- if (cpi->is_src_frame_alt_ref)
- Loop = FALSE;
-
- if (cm->frame_type != KEY_FRAME &&
- !sf->search_best_filter &&
- cm->mcomp_filter_type == SWITCHABLE) {
- int interp_factor = Q / 3; /* denominator is 256 */
- int count[VP9_SWITCHABLE_FILTERS];
- int tot_count = 0, c = 0, thr;
- int i, j;
- for (i = 0; i < VP9_SWITCHABLE_FILTERS; ++i) {
- count[i] = 0;
- for (j = 0; j <= VP9_SWITCHABLE_FILTERS; ++j) {
- count[i] += cpi->switchable_interp_count[j][i];
- }
- tot_count += count[i];
- }
-
- thr = ((tot_count * interp_factor + 128) >> 8);
- for (i = 0; i < VP9_SWITCHABLE_FILTERS; ++i) {
- c += (count[i] >= thr);
- }
- if (c == 1) {
- /* Mostly one filter is used. So set the filter at frame level */
- for (i = 0; i < VP9_SWITCHABLE_FILTERS; ++i) {
- if (count[i]) {
- cm->mcomp_filter_type = vp9_switchable_interp[i];
- Loop = TRUE; /* Make sure to loop since the filter changed */
- break;
- }
- }
- }
- }
-
- if (Loop == FALSE && cm->frame_type != KEY_FRAME && sf->search_best_filter) {
- if (mcomp_filter_index < mcomp_filters) {
- INT64 err = vp9_calc_ss_err(cpi->Source,
- &cm->yv12_fb[cm->new_fb_idx]);
- INT64 rate = cpi->projected_frame_size << 8;
- mcomp_filter_cost[mcomp_filter_index] =
- (RDCOST(cpi->RDMULT, cpi->RDDIV, rate, err));
- mcomp_filter_index++;
- if (mcomp_filter_index < mcomp_filters) {
- cm->mcomp_filter_type = mcomp_filters_to_search[mcomp_filter_index];
- loop_count = -1;
- Loop = TRUE;
- } else {
- int f;
- INT64 best_cost = mcomp_filter_cost[0];
- int mcomp_best_filter = mcomp_filters_to_search[0];
- for (f = 1; f < mcomp_filters; f++) {
- if (mcomp_filter_cost[f] < best_cost) {
- mcomp_best_filter = mcomp_filters_to_search[f];
- best_cost = mcomp_filter_cost[f];
- }
- }
- if (mcomp_best_filter != mcomp_filters_to_search[mcomp_filters - 1]) {
- loop_count = -1;
- Loop = TRUE;
- cm->mcomp_filter_type = mcomp_best_filter;
- }
- /*
- printf(" best filter = %d, ( ", mcomp_best_filter);
- for (f=0;f<mcomp_filters; f++) printf("%d ", mcomp_filter_cost[f]);
- printf(")\n");
- */
- }
-#if RESET_FOREACH_FILTER
- if (Loop == TRUE) {
- overshoot_seen = FALSE;
- undershoot_seen = FALSE;
- zbin_oq_low = zbin_oq_low0;
- zbin_oq_high = zbin_oq_high0;
- q_low = q_low0;
- q_high = q_high0;
- Q = Q0;
- cpi->zbin_over_quant = last_zbin_oq = last_zbin_oq0;
- cpi->rate_correction_factor = rate_correction_factor0;
- cpi->gf_rate_correction_factor = gf_rate_correction_factor0;
- cpi->active_best_quality = active_best_quality0;
- cpi->active_worst_quality = active_worst_quality0;
- }
-#endif
- }
- }
-
- if (Loop == TRUE) {
- loop_count++;
-#if CONFIG_INTERNAL_STATS
- cpi->tot_recode_hits++;
-#endif
- }
- } while (Loop == TRUE);
-
- // Special case code to reduce pulsing when key frames are forced at a
- // fixed interval. Note the reconstruction error if it is the frame before
- // the forced key frame.
- if (cpi->next_key_frame_forced && (cpi->twopass.frames_to_key == 0)) {
- cpi->ambient_err = vp9_calc_ss_err(cpi->Source,
- &cm->yv12_fb[cm->new_fb_idx]);
- }
-
- // This frame's MVs are saved and will be used in the next frame's MV
- // prediction. The last frame has one more line (added to the bottom) and
- // one more column (added to the right) than cm->mip. The edge elements
- // are initialized to 0.
- if (cm->show_frame) { // do not save for altref frame
- int mb_row;
- int mb_col;
- MODE_INFO *tmp = cm->mip;
-
- if (cm->frame_type != KEY_FRAME) {
- for (mb_row = 0; mb_row < cm->mb_rows + 1; mb_row ++) {
- for (mb_col = 0; mb_col < cm->mb_cols + 1; mb_col ++) {
- if (tmp->mbmi.ref_frame != INTRA_FRAME)
- cpi->lfmv[mb_col + mb_row * (cm->mode_info_stride + 1)].as_int = tmp->mbmi.mv[0].as_int;
-
- cpi->lf_ref_frame_sign_bias[mb_col + mb_row * (cm->mode_info_stride + 1)] = cm->ref_frame_sign_bias[tmp->mbmi.ref_frame];
- cpi->lf_ref_frame[mb_col + mb_row * (cm->mode_info_stride + 1)] = tmp->mbmi.ref_frame;
- tmp++;
- }
- }
- }
- }
-
- // Update the GF usage maps.
- // This is done after completing the compression of a frame, when all modes
- // etc. are finalized, but before the loop filter.
- vp9_update_gf_useage_maps(cpi, cm, &cpi->mb);
-
- if (cm->frame_type == KEY_FRAME)
- cm->refresh_last_frame = 1;
-
-#if 0
- {
- FILE *f = fopen("gfactive.stt", "a");
- fprintf(f, "%8d %8d %8d %8d %8d\n", cm->current_video_frame, (100 * cpi->gf_active_count) / (cpi->common.mb_rows * cpi->common.mb_cols), cpi->this_iiratio, cpi->next_iiratio, cm->refresh_golden_frame);
- fclose(f);
- }
-#endif
-
- cm->frame_to_show = &cm->yv12_fb[cm->new_fb_idx];
-
-#if WRITE_RECON_BUFFER
- if (cm->show_frame)
- write_cx_frame_to_file(cm->frame_to_show,
- cm->current_video_frame);
- else
- write_cx_frame_to_file(cm->frame_to_show,
- cm->current_video_frame + 1000);
-#endif
-
- // Pick the loop filter level for the frame.
- loopfilter_frame(cpi, cm);
-
- // build the bitstream
- cpi->dummy_packing = 0;
- vp9_pack_bitstream(cpi, dest, size);
-
- if (cpi->mb.e_mbd.update_mb_segmentation_map) {
- update_reference_segmentation_map(cpi);
- }
-
-#if CONFIG_PRED_FILTER
- // Select the prediction filtering mode to use for the
- // next frame based on the current frame selections
- if (cm->frame_type != KEY_FRAME)
- select_pred_filter_mode(cpi);
-#endif
-
- update_reference_frames(cm);
- vp9_copy(cpi->common.fc.coef_counts, cpi->coef_counts);
- vp9_copy(cpi->common.fc.hybrid_coef_counts, cpi->hybrid_coef_counts);
- vp9_copy(cpi->common.fc.coef_counts_8x8, cpi->coef_counts_8x8);
- vp9_copy(cpi->common.fc.hybrid_coef_counts_8x8, cpi->hybrid_coef_counts_8x8);
- vp9_copy(cpi->common.fc.coef_counts_16x16, cpi->coef_counts_16x16);
- vp9_copy(cpi->common.fc.hybrid_coef_counts_16x16,
- cpi->hybrid_coef_counts_16x16);
- vp9_adapt_coef_probs(&cpi->common);
- if (cpi->common.frame_type != KEY_FRAME) {
- vp9_copy(cpi->common.fc.ymode_counts, cpi->ymode_count);
- vp9_copy(cpi->common.fc.uv_mode_counts, cpi->y_uv_mode_count);
- vp9_copy(cpi->common.fc.bmode_counts, cpi->bmode_count);
- vp9_copy(cpi->common.fc.i8x8_mode_counts, cpi->i8x8_mode_count);
- vp9_copy(cpi->common.fc.sub_mv_ref_counts, cpi->sub_mv_ref_count);
- vp9_copy(cpi->common.fc.mbsplit_counts, cpi->mbsplit_count);
- vp9_adapt_mode_probs(&cpi->common);
-
- cpi->common.fc.NMVcount = cpi->NMVcount;
- vp9_adapt_nmv_probs(&cpi->common, cpi->mb.e_mbd.allow_high_precision_mv);
- vp9_update_mode_context(&cpi->common);
- }
-
- /* Move storing frame_type out of the above loop since it is also
- * needed in motion search besides loopfilter */
- cm->last_frame_type = cm->frame_type;
-
- // Keep a copy of the size estimate used in the loop
- loop_size_estimate = cpi->projected_frame_size;
-
- // Update rate control heuristics
- cpi->total_byte_count += (*size);
- cpi->projected_frame_size = (*size) << 3;
-
- if (!active_worst_qchanged)
- vp9_update_rate_correction_factors(cpi, 2);
-
- cpi->last_q[cm->frame_type] = cm->base_qindex;
-
- // Keep a record of the last boosted (KF/GF/ARF) Q value.
- // If the current frame is coded at a lower Q then we also update it.
- // If all mbs in this group are skipped, only update if the Q value is
- // better than that already stored.
- // This is used to help set quality in forced key frames to reduce popping.
- if ((cm->base_qindex < cpi->last_boosted_qindex) ||
- ((cpi->static_mb_pct < 100) &&
- ((cm->frame_type == KEY_FRAME) ||
- cm->refresh_alt_ref_frame ||
- (cm->refresh_golden_frame && !cpi->is_src_frame_alt_ref)))) {
- cpi->last_boosted_qindex = cm->base_qindex;
- }
-
- if (cm->frame_type == KEY_FRAME) {
- vp9_adjust_key_frame_context(cpi);
- }
-
- // Keep a record of ambient average Q.
- if (cm->frame_type != KEY_FRAME)
- cpi->avg_frame_qindex = (2 + 3 * cpi->avg_frame_qindex + cm->base_qindex) >> 2;
-
- // Keep a record from which we can calculate the average Q excluding GF updates and key frames
- if ((cm->frame_type != KEY_FRAME) && !cm->refresh_golden_frame && !cm->refresh_alt_ref_frame) {
- cpi->ni_frames++;
- cpi->tot_q += vp9_convert_qindex_to_q(Q);
- cpi->avg_q = cpi->tot_q / (double)cpi->ni_frames;
-
- // Calculate the average Q for normal inter frames (not key or GFU
- // frames).
- cpi->ni_tot_qi += Q;
- cpi->ni_av_qi = (cpi->ni_tot_qi / cpi->ni_frames);
- }
-
- // Update the buffer level variable.
- // Non-viewable frames are a special case and are treated as pure overhead.
- if (!cm->show_frame)
- cpi->bits_off_target -= cpi->projected_frame_size;
- else
- cpi->bits_off_target += cpi->av_per_frame_bandwidth - cpi->projected_frame_size;
-
- // Clip the buffer level at the maximum buffer size
- if (cpi->bits_off_target > cpi->oxcf.maximum_buffer_size)
- cpi->bits_off_target = cpi->oxcf.maximum_buffer_size;
-
- // Rolling monitors of whether we are over- or under-spending, used to help regulate min and max Q in two-pass encoding.
- cpi->rolling_target_bits = ((cpi->rolling_target_bits * 3) + cpi->this_frame_target + 2) / 4;
- cpi->rolling_actual_bits = ((cpi->rolling_actual_bits * 3) + cpi->projected_frame_size + 2) / 4;
- cpi->long_rolling_target_bits = ((cpi->long_rolling_target_bits * 31) + cpi->this_frame_target + 16) / 32;
- cpi->long_rolling_actual_bits = ((cpi->long_rolling_actual_bits * 31) + cpi->projected_frame_size + 16) / 32;
-
- // Actual bits spent
- cpi->total_actual_bits += cpi->projected_frame_size;
-
- // Debug stats
- cpi->total_target_vs_actual += (cpi->this_frame_target - cpi->projected_frame_size);
-
- cpi->buffer_level = cpi->bits_off_target;
-
- // Update bits left to the kf and gf groups to account for overshoot or undershoot on these frames
- if (cm->frame_type == KEY_FRAME) {
- cpi->twopass.kf_group_bits += cpi->this_frame_target - cpi->projected_frame_size;
-
- if (cpi->twopass.kf_group_bits < 0)
- cpi->twopass.kf_group_bits = 0;
- } else if (cm->refresh_golden_frame || cm->refresh_alt_ref_frame) {
- cpi->twopass.gf_group_bits += cpi->this_frame_target - cpi->projected_frame_size;
-
- if (cpi->twopass.gf_group_bits < 0)
- cpi->twopass.gf_group_bits = 0;
- }
-
- // Update the skip mb flag probabilities based on the distribution seen
- // in this frame.
- update_base_skip_probs(cpi);
-
-#if 0 //CONFIG_NEW_MVREF && CONFIG_INTERNAL_STATS
- {
- FILE *f = fopen("mv_ref_dist.stt", "a");
- unsigned int i;
- for (i = 0; i < MAX_MV_REFS; ++i) {
- fprintf(f, "%10d", cpi->best_ref_index_counts[0][i]);
- }
- fprintf(f, "\n" );
-
- fclose(f);
- }
-#endif
-
-#if 0// 1 && CONFIG_INTERNAL_STATS
- {
- FILE *f = fopen("tmp.stt", "a");
- int recon_err;
-
- vp9_clear_system_state(); // __asm emms;
-
- recon_err = vp9_calc_ss_err(cpi->Source,
- &cm->yv12_fb[cm->new_fb_idx]);
-
- if (cpi->twopass.total_left_stats->coded_error != 0.0)
- fprintf(f, "%10d %10d %10d %10d %10d %10d %10d %10d"
- "%7.2f %7.2f %7.2f %7.2f %7.2f %7.2f %7.2f"
- "%6d %5d %5d %5d %8d %8.2f %10d %10.3f"
- "%10.3f %8d %10d %10d %10d\n",
- cpi->common.current_video_frame, cpi->this_frame_target,
- cpi->projected_frame_size, loop_size_estimate,
- (cpi->projected_frame_size - cpi->this_frame_target),
- (int)cpi->total_target_vs_actual,
- (cpi->oxcf.starting_buffer_level - cpi->bits_off_target),
- (int)cpi->total_actual_bits,
- vp9_convert_qindex_to_q(cm->base_qindex),
- (double)vp9_dc_quant(cm->base_qindex, 0) / 4.0,
- vp9_convert_qindex_to_q(cpi->active_best_quality),
- vp9_convert_qindex_to_q(cpi->active_worst_quality),
- cpi->avg_q,
- vp9_convert_qindex_to_q(cpi->ni_av_qi),
- vp9_convert_qindex_to_q(cpi->cq_target_quality),
- cpi->zbin_over_quant,
- // cpi->avg_frame_qindex, cpi->zbin_over_quant,
- cm->refresh_golden_frame, cm->refresh_alt_ref_frame,
- cm->frame_type, cpi->gfu_boost,
- cpi->twopass.est_max_qcorrection_factor,
- (int)cpi->twopass.bits_left,
- cpi->twopass.total_left_stats->coded_error,
- (double)cpi->twopass.bits_left /
- cpi->twopass.total_left_stats->coded_error,
- cpi->tot_recode_hits, recon_err, cpi->kf_boost,
- cpi->kf_zeromotion_pct);
- else
- fprintf(f, "%10d %10d %10d %10d %10d %10d %10d %10d"
- "%7.2f %7.2f %7.2f %7.2f %7.2f %7.2f %7.2f"
- "%6d %5d %5d %5d %8d %8.2f %10d %10.3f"
- "%8d %10d %10d %10d\n",
- cpi->common.current_video_frame,
- cpi->this_frame_target, cpi->projected_frame_size,
- loop_size_estimate,
- (cpi->projected_frame_size - cpi->this_frame_target),
- (int)cpi->total_target_vs_actual,
- (cpi->oxcf.starting_buffer_level - cpi->bits_off_target),
- (int)cpi->total_actual_bits,
- vp9_convert_qindex_to_q(cm->base_qindex),
- (double)vp9_dc_quant(cm->base_qindex, 0) / 4.0,
- vp9_convert_qindex_to_q(cpi->active_best_quality),
- vp9_convert_qindex_to_q(cpi->active_worst_quality),
- cpi->avg_q,
- vp9_convert_qindex_to_q(cpi->ni_av_qi),
- vp9_convert_qindex_to_q(cpi->cq_target_quality),
- cpi->zbin_over_quant,
- // cpi->avg_frame_qindex, cpi->zbin_over_quant,
- cm->refresh_golden_frame, cm->refresh_alt_ref_frame,
- cm->frame_type, cpi->gfu_boost,
- cpi->twopass.est_max_qcorrection_factor,
- (int)cpi->twopass.bits_left,
- cpi->twopass.total_left_stats->coded_error,
- cpi->tot_recode_hits, recon_err, cpi->kf_boost,
- cpi->kf_zeromotion_pct);
-
- fclose(f);
-
- if (0) {
- FILE *fmodes = fopen("Modes.stt", "a");
- int i;
-
- fprintf(fmodes, "%6d:%1d:%1d:%1d ",
- cpi->common.current_video_frame,
- cm->frame_type, cm->refresh_golden_frame,
- cm->refresh_alt_ref_frame);
-
- for (i = 0; i < MAX_MODES; i++)
- fprintf(fmodes, "%5d ", cpi->mode_chosen_counts[i]);
-
- fprintf(fmodes, "\n");
-
- fclose(fmodes);
- }
- }
-
-#endif
-
-#if 0
- // Debug stats for segment feature experiments.
- print_seg_map(cpi);
-#endif
-
- // If this was a kf or gf, note the Q
- if ((cm->frame_type == KEY_FRAME) || cm->refresh_golden_frame || cm->refresh_alt_ref_frame)
- cm->last_kf_gf_q = cm->base_qindex;
-
- if (cm->refresh_golden_frame == 1)
- cm->frame_flags = cm->frame_flags | FRAMEFLAGS_GOLDEN;
- else
- cm->frame_flags = cm->frame_flags&~FRAMEFLAGS_GOLDEN;
-
- if (cm->refresh_alt_ref_frame == 1)
- cm->frame_flags = cm->frame_flags | FRAMEFLAGS_ALTREF;
- else
- cm->frame_flags = cm->frame_flags&~FRAMEFLAGS_ALTREF;
-
-
- if (cm->refresh_last_frame & cm->refresh_golden_frame) // both refreshed
- cpi->gold_is_last = 1;
- else if (cm->refresh_last_frame ^ cm->refresh_golden_frame) // 1 refreshed but not the other
- cpi->gold_is_last = 0;
-
- if (cm->refresh_last_frame & cm->refresh_alt_ref_frame) // both refreshed
- cpi->alt_is_last = 1;
- else if (cm->refresh_last_frame ^ cm->refresh_alt_ref_frame) // 1 refreshed but not the other
- cpi->alt_is_last = 0;
-
- if (cm->refresh_alt_ref_frame & cm->refresh_golden_frame) // both refreshed
- cpi->gold_is_alt = 1;
- else if (cm->refresh_alt_ref_frame ^ cm->refresh_golden_frame) // 1 refreshed but not the other
- cpi->gold_is_alt = 0;
-
- cpi->ref_frame_flags = VP9_ALT_FLAG | VP9_GOLD_FLAG | VP9_LAST_FLAG;
-
- if (cpi->gold_is_last)
- cpi->ref_frame_flags &= ~VP9_GOLD_FLAG;
-
- if (cpi->alt_is_last)
- cpi->ref_frame_flags &= ~VP9_ALT_FLAG;
-
- if (cpi->gold_is_alt)
- cpi->ref_frame_flags &= ~VP9_ALT_FLAG;
-
- if (cpi->oxcf.play_alternate && cm->refresh_alt_ref_frame && (cm->frame_type != KEY_FRAME))
- // Update the alternate reference frame stats as appropriate.
- update_alt_ref_frame_stats(cpi);
- else
- // Update the Golden frame stats as appropriate.
- update_golden_frame_stats(cpi);
-
- if (cm->frame_type == KEY_FRAME) {
- // Tell the caller that the frame was coded as a key frame
- *frame_flags = cm->frame_flags | FRAMEFLAGS_KEY;
-
- // As this frame is a key frame the next defaults to an inter frame.
- cm->frame_type = INTER_FRAME;
- } else {
- *frame_flags = cm->frame_flags&~FRAMEFLAGS_KEY;
- }
-
- // Clear the one shot update flags for segmentation map and mode/ref loop filter deltas.
- xd->update_mb_segmentation_map = 0;
- xd->update_mb_segmentation_data = 0;
- xd->mode_ref_lf_delta_update = 0;
-
-
- // Don't increment frame counters if this was an altref buffer update, not a real frame
- if (cm->show_frame) {
- cm->current_video_frame++;
- cpi->frames_since_key++;
- }
-
- // reset to normal state now that we are done.
-
-
-
-#if 0
- {
- char filename[512];
- FILE *recon_file;
- sprintf(filename, "enc%04d.yuv", (int) cm->current_video_frame);
- recon_file = fopen(filename, "wb");
- fwrite(cm->yv12_fb[cm->lst_fb_idx].buffer_alloc,
- cm->yv12_fb[cm->lst_fb_idx].frame_size, 1, recon_file);
- fclose(recon_file);
- }
-#endif
-#ifdef OUTPUT_YUV_REC
- vp9_write_yuv_rec_frame(cm);
-#endif
-
- if (cm->show_frame) {
- vpx_memcpy(cm->prev_mip, cm->mip,
- (cm->mb_cols + 1) * (cm->mb_rows + 1)* sizeof(MODE_INFO));
- } else {
- vpx_memset(cm->prev_mip, 0,
- (cm->mb_cols + 1) * (cm->mb_rows + 1)* sizeof(MODE_INFO));
- }
-}
-
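The recode loop in encode_frame_to_data_rate() above narrows a [q_low, q_high] interval around the target frame size: an overshoot raises q_low, an undershoot lowers q_high, and the next Q is taken near the midpoint before being clamped. The following is a minimal, self-contained sketch of one such step; adjust_q_for_recode is a hypothetical helper invented for illustration, and it omits the rate-correction factors and zbin over-quant handling the real loop also applies.

#include <stdio.h>

/* One bisection-style step of the recode Q adjustment (illustrative only). */
static int adjust_q_for_recode(int q, int *q_low, int *q_high,
                               int projected_size, int target_size) {
  if (projected_size > target_size) {
    /* Frame too large: the current Q becomes a lower bound. */
    *q_low = (q < *q_high) ? (q + 1) : *q_high;
    q = (*q_high + *q_low + 1) / 2;
  } else {
    /* Frame too small: the current Q becomes an upper bound. */
    *q_high = (q > *q_low) ? (q - 1) : *q_low;
    q = (*q_high + *q_low) / 2;
  }
  /* Clamp to the active range, as the loop does before re-encoding. */
  if (q > *q_high) q = *q_high;
  if (q < *q_low)  q = *q_low;
  return q;
}

int main(void) {
  int q_low = 20, q_high = 50, q = 30;
  q = adjust_q_for_recode(q, &q_low, &q_high, 120000, 90000);
  printf("next Q = %d (range %d..%d)\n", q, q_low, q_high);  /* 41 (31..50) */
  return 0;
}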
-static void Pass2Encode(VP9_COMP *cpi, unsigned long *size,
- unsigned char *dest, unsigned int *frame_flags) {
-
- if (!cpi->common.refresh_alt_ref_frame)
- vp9_second_pass(cpi);
-
- encode_frame_to_data_rate(cpi, size, dest, frame_flags);
- cpi->twopass.bits_left -= 8 * *size;
-
- if (!cpi->common.refresh_alt_ref_frame) {
- double lower_bounds_min_rate = FRAME_OVERHEAD_BITS * cpi->oxcf.frame_rate;
- double two_pass_min_rate = (double)(cpi->oxcf.target_bandwidth
- * cpi->oxcf.two_pass_vbrmin_section / 100);
-
- if (two_pass_min_rate < lower_bounds_min_rate)
- two_pass_min_rate = lower_bounds_min_rate;
-
- cpi->twopass.bits_left += (int64_t)(two_pass_min_rate / cpi->oxcf.frame_rate);
- }
-}
-
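Pass2Encode() above debits the actual frame size from twopass.bits_left and, for non-alt-ref frames, credits back the guaranteed per-frame minimum: the larger of the VBR minimum-section percentage of the target bandwidth and a fixed per-frame overhead rate. A small sketch of that per-frame credit follows; FRAME_OVERHEAD_BITS_SKETCH stands in for the encoder's own FRAME_OVERHEAD_BITS constant, which is defined elsewhere in the tree.

#include <stdint.h>
#include <stdio.h>

/* Stand-in value for the encoder's FRAME_OVERHEAD_BITS (defined elsewhere). */
#define FRAME_OVERHEAD_BITS_SKETCH 300

/* Per-frame bits credited back to twopass.bits_left (illustrative sketch). */
static int64_t two_pass_min_credit(double target_bandwidth,
                                   double vbrmin_section_pct,
                                   double frame_rate) {
  double lower_bound_min_rate = FRAME_OVERHEAD_BITS_SKETCH * frame_rate;
  double min_rate = target_bandwidth * vbrmin_section_pct / 100.0;

  if (min_rate < lower_bound_min_rate)
    min_rate = lower_bound_min_rate;

  /* Per-second minimum rate converted back to a per-frame bit count. */
  return (int64_t)(min_rate / frame_rate);
}

int main(void) {
  /* 500 kbit/s target, 5% minimum section, 30 fps -> 833 bits per frame. */
  printf("credit = %lld bits/frame\n",
         (long long)two_pass_min_credit(500000.0, 5.0, 30.0));
  return 0;
}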
-// For ARM NEON, d8-d15 are callee-saved registers, and need to be saved by us.
-#if HAVE_ARMV7
-extern void vp9_push_neon(int64_t *store);
-extern void vp9_pop_neon(int64_t *store);
-#endif
-
-
-int vp9_receive_raw_frame(VP9_PTR ptr, unsigned int frame_flags,
- YV12_BUFFER_CONFIG *sd, int64_t time_stamp,
- int64_t end_time) {
-#if HAVE_ARMV7
- int64_t store_reg[8];
-#endif
- VP9_COMP *cpi = (VP9_COMP *) ptr;
- VP9_COMMON *cm = &cpi->common;
- struct vpx_usec_timer timer;
- int res = 0;
-
-#if HAVE_ARMV7
-#if CONFIG_RUNTIME_CPU_DETECT
- if (cm->rtcd.flags & HAS_NEON)
-#endif
- {
- vp9_push_neon(store_reg);
- }
-#endif
-
- vpx_usec_timer_start(&timer);
- if (vp9_lookahead_push(cpi->lookahead, sd, time_stamp, end_time, frame_flags,
- cpi->active_map_enabled ? cpi->active_map : NULL))
- res = -1;
- cm->clr_type = sd->clrtype;
- vpx_usec_timer_mark(&timer);
- cpi->time_receive_data += vpx_usec_timer_elapsed(&timer);
-
-#if HAVE_ARMV7
-#if CONFIG_RUNTIME_CPU_DETECT
- if (cm->rtcd.flags & HAS_NEON)
-#endif
- {
- vp9_pop_neon(store_reg);
- }
-#endif
-
- return res;
-}
-
-
-static int frame_is_reference(const VP9_COMP *cpi) {
- const VP9_COMMON *cm = &cpi->common;
- const MACROBLOCKD *xd = &cpi->mb.e_mbd;
-
- return cm->frame_type == KEY_FRAME || cm->refresh_last_frame
- || cm->refresh_golden_frame || cm->refresh_alt_ref_frame
- || cm->copy_buffer_to_gf || cm->copy_buffer_to_arf
- || cm->refresh_entropy_probs
- || xd->mode_ref_lf_delta_update
- || xd->update_mb_segmentation_map || xd->update_mb_segmentation_data;
-}
-
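frame_is_reference() above reports whether the just-coded frame updates any reference buffer or persistent decoder state; vp9_get_compressed_data() uses its negation to mark a frame droppable. Below is a simplified, self-contained sketch of the same predicate, with the buffer-copy and segmentation/loop-filter flags collapsed into single fields; the struct and function names are invented for illustration.

#include <stdio.h>

/* Simplified stand-in for the encoder's per-frame update flags. */
struct frame_updates {
  int is_key_frame;
  int refresh_last, refresh_golden, refresh_alt_ref;
  int refresh_entropy_probs;
  int updates_segmentation_or_lf_deltas;  /* any persistent map/delta update */
};

/* A frame is a reference if it changes anything a later frame depends on. */
static int frame_is_reference_sketch(const struct frame_updates *u) {
  return u->is_key_frame || u->refresh_last || u->refresh_golden ||
         u->refresh_alt_ref || u->refresh_entropy_probs ||
         u->updates_segmentation_or_lf_deltas;
}

int main(void) {
  struct frame_updates u = { 0, 1, 0, 0, 1, 0 };
  printf("droppable = %d\n", !frame_is_reference_sketch(&u));  /* prints 0 */
  return 0;
}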
-
-int vp9_get_compressed_data(VP9_PTR ptr, unsigned int *frame_flags,
- unsigned long *size, unsigned char *dest,
- int64_t *time_stamp, int64_t *time_end, int flush) {
-#if HAVE_ARMV7
- int64_t store_reg[8];
-#endif
- VP9_COMP *cpi = (VP9_COMP *) ptr;
- VP9_COMMON *cm = &cpi->common;
- struct vpx_usec_timer cmptimer;
- YV12_BUFFER_CONFIG *force_src_buffer = NULL;
-
- if (!cpi)
- return -1;
-
-#if HAVE_ARMV7
-#if CONFIG_RUNTIME_CPU_DETECT
- if (cm->rtcd.flags & HAS_NEON)
-#endif
- {
- vp9_push_neon(store_reg);
- }
-#endif
-
- vpx_usec_timer_start(&cmptimer);
-
- cpi->source = NULL;
-
- cpi->mb.e_mbd.allow_high_precision_mv = ALTREF_HIGH_PRECISION_MV;
- // Should we code an alternate reference frame
- if (cpi->oxcf.play_alternate &&
- cpi->source_alt_ref_pending) {
- if ((cpi->source = vp9_lookahead_peek(cpi->lookahead,
- cpi->frames_till_gf_update_due))) {
- cpi->alt_ref_source = cpi->source;
- if (cpi->oxcf.arnr_max_frames > 0) {
- vp9_temporal_filter_prepare_c(cpi,
- cpi->frames_till_gf_update_due);
- force_src_buffer = &cpi->alt_ref_buffer;
- }
- cm->frames_till_alt_ref_frame = cpi->frames_till_gf_update_due;
- cm->refresh_alt_ref_frame = 1;
- cm->refresh_golden_frame = 0;
- cm->refresh_last_frame = 0;
- cm->show_frame = 0;
- cpi->source_alt_ref_pending = FALSE; // Clear pending alt ref flag.
- cpi->is_src_frame_alt_ref = 0;
- }
- }
-
- if (!cpi->source) {
- if ((cpi->source = vp9_lookahead_pop(cpi->lookahead, flush))) {
- cm->show_frame = 1;
-
- cpi->is_src_frame_alt_ref = cpi->alt_ref_source
- && (cpi->source == cpi->alt_ref_source);
-
- if (cpi->is_src_frame_alt_ref)
- cpi->alt_ref_source = NULL;
- }
- }
-
- if (cpi->source) {
- cpi->un_scaled_source =
- cpi->Source = force_src_buffer ? force_src_buffer : &cpi->source->img;
- *time_stamp = cpi->source->ts_start;
- *time_end = cpi->source->ts_end;
- *frame_flags = cpi->source->flags;
- } else {
- *size = 0;
- if (flush && cpi->pass == 1 && !cpi->twopass.first_pass_done) {
- vp9_end_first_pass(cpi); /* get last stats packet */
- cpi->twopass.first_pass_done = 1;
- }
-
-#if HAVE_ARMV7
-#if CONFIG_RUNTIME_CPU_DETECT
- if (cm->rtcd.flags & HAS_NEON)
-#endif
- {
- vp9_pop_neon(store_reg);
- }
-#endif
- return -1;
- }
-
- if (cpi->source->ts_start < cpi->first_time_stamp_ever) {
- cpi->first_time_stamp_ever = cpi->source->ts_start;
- cpi->last_end_time_stamp_seen = cpi->source->ts_start;
- }
-
- // adjust frame rates based on timestamps given
- if (!cm->refresh_alt_ref_frame) {
- int64_t this_duration;
- int step = 0;
-
- if (cpi->source->ts_start == cpi->first_time_stamp_ever) {
- this_duration = cpi->source->ts_end - cpi->source->ts_start;
- step = 1;
- } else {
- int64_t last_duration;
-
- this_duration = cpi->source->ts_end - cpi->last_end_time_stamp_seen;
- last_duration = cpi->last_end_time_stamp_seen
- - cpi->last_time_stamp_seen;
- // do a step update if the duration changes by 10%
- if (last_duration)
- step = ((this_duration - last_duration) * 10 / last_duration);
- }
-
- if (this_duration) {
- if (step)
- vp9_new_frame_rate(cpi, 10000000.0 / this_duration);
- else {
- double avg_duration, interval;
-
- /* Average this frame's rate into the last second's average
- * frame rate. If we haven't seen 1 second yet, then average
- * over the whole interval seen.
- */
- interval = cpi->source->ts_end - cpi->first_time_stamp_ever;
- if (interval > 10000000.0)
- interval = 10000000;
-
- avg_duration = 10000000.0 / cpi->oxcf.frame_rate;
- avg_duration *= (interval - avg_duration + this_duration);
- avg_duration /= interval;
-
- vp9_new_frame_rate(cpi, 10000000.0 / avg_duration);
- }
- }
-
- cpi->last_time_stamp_seen = cpi->source->ts_start;
- cpi->last_end_time_stamp_seen = cpi->source->ts_end;
- }
-
- // start with a 0 size frame
- *size = 0;
-
- // Clear down mmx registers
- vp9_clear_system_state(); // __asm emms;
-
- cm->frame_type = INTER_FRAME;
- cm->frame_flags = *frame_flags;
-
-#if 0
-
- if (cm->refresh_alt_ref_frame) {
- // cm->refresh_golden_frame = 1;
- cm->refresh_golden_frame = 0;
- cm->refresh_last_frame = 0;
- } else {
- cm->refresh_golden_frame = 0;
- cm->refresh_last_frame = 1;
- }
-
-#endif
- /* find a free buffer for the new frame */
- {
- int i = 0;
- for (; i < NUM_YV12_BUFFERS; i++) {
- if (!cm->yv12_fb[i].flags) {
- cm->new_fb_idx = i;
- break;
- }
- }
-
- assert(i < NUM_YV12_BUFFERS);
- }
- if (cpi->pass == 1) {
- Pass1Encode(cpi, size, dest, frame_flags);
- } else if (cpi->pass == 2) {
- Pass2Encode(cpi, size, dest, frame_flags);
- } else {
- encode_frame_to_data_rate(cpi, size, dest, frame_flags);
- }
-
- if (cm->refresh_entropy_probs) {
- if (cm->refresh_alt_ref_frame)
- vpx_memcpy(&cm->lfc_a, &cm->fc, sizeof(cm->fc));
- else
- vpx_memcpy(&cm->lfc, &cm->fc, sizeof(cm->fc));
- }
-
- // if it's a dropped frame, honor the requests on subsequent frames
- if (*size > 0) {
- cpi->droppable = !frame_is_reference(cpi);
-
- // return to normal state
- cm->refresh_entropy_probs = 1;
- cm->refresh_alt_ref_frame = 0;
- cm->refresh_golden_frame = 0;
- cm->refresh_last_frame = 1;
- cm->frame_type = INTER_FRAME;
-
- }
-
- vpx_usec_timer_mark(&cmptimer);
- cpi->time_compress_data += vpx_usec_timer_elapsed(&cmptimer);
-
- if (cpi->b_calculate_psnr && cpi->pass != 1 && cm->show_frame) {
- generate_psnr_packet(cpi);
- }
-
-#if CONFIG_INTERNAL_STATS
-
- if (cpi->pass != 1) {
- cpi->bytes += *size;
-
- if (cm->show_frame) {
-
- cpi->count++;
-
- if (cpi->b_calculate_psnr) {
- double ye, ue, ve;
- double frame_psnr;
- YV12_BUFFER_CONFIG *orig = cpi->Source;
- YV12_BUFFER_CONFIG *recon = cpi->common.frame_to_show;
- YV12_BUFFER_CONFIG *pp = &cm->post_proc_buffer;
- int y_samples = orig->y_height * orig->y_width;
- int uv_samples = orig->uv_height * orig->uv_width;
- int t_samples = y_samples + 2 * uv_samples;
- int64_t sq_error;
-
- ye = calc_plane_error(orig->y_buffer, orig->y_stride,
- recon->y_buffer, recon->y_stride, orig->y_width,
- orig->y_height);
-
- ue = calc_plane_error(orig->u_buffer, orig->uv_stride,
- recon->u_buffer, recon->uv_stride, orig->uv_width,
- orig->uv_height);
-
- ve = calc_plane_error(orig->v_buffer, orig->uv_stride,
- recon->v_buffer, recon->uv_stride, orig->uv_width,
- orig->uv_height);
-
- sq_error = ye + ue + ve;
-
- frame_psnr = vp9_mse2psnr(t_samples, 255.0, sq_error);
-
- cpi->total_y += vp9_mse2psnr(y_samples, 255.0, ye);
- cpi->total_u += vp9_mse2psnr(uv_samples, 255.0, ue);
- cpi->total_v += vp9_mse2psnr(uv_samples, 255.0, ve);
- cpi->total_sq_error += sq_error;
- cpi->total += frame_psnr;
- {
- double frame_psnr2, frame_ssim2 = 0;
- double weight = 0;
-#if CONFIG_POSTPROC
- vp9_deblock(cm->frame_to_show, &cm->post_proc_buffer, cm->filter_level * 10 / 6, 1, 0, IF_RTCD(&cm->rtcd.postproc));
-#endif
- vp9_clear_system_state();
-
- ye = calc_plane_error(orig->y_buffer, orig->y_stride,
- pp->y_buffer, pp->y_stride, orig->y_width,
- orig->y_height);
-
- ue = calc_plane_error(orig->u_buffer, orig->uv_stride,
- pp->u_buffer, pp->uv_stride, orig->uv_width,
- orig->uv_height);
-
- ve = calc_plane_error(orig->v_buffer, orig->uv_stride,
- pp->v_buffer, pp->uv_stride, orig->uv_width,
- orig->uv_height);
-
- sq_error = ye + ue + ve;
-
- frame_psnr2 = vp9_mse2psnr(t_samples, 255.0, sq_error);
-
- cpi->totalp_y += vp9_mse2psnr(y_samples, 255.0, ye);
- cpi->totalp_u += vp9_mse2psnr(uv_samples, 255.0, ue);
- cpi->totalp_v += vp9_mse2psnr(uv_samples, 255.0, ve);
- cpi->total_sq_error2 += sq_error;
- cpi->totalp += frame_psnr2;
-
- frame_ssim2 = vp9_calc_ssim(cpi->Source,
- &cm->post_proc_buffer, 1, &weight);
-
- cpi->summed_quality += frame_ssim2 * weight;
- cpi->summed_weights += weight;
-#if 0
- {
- FILE *f = fopen("q_used.stt", "a");
- fprintf(f, "%5d : Y%f7.3:U%f7.3:V%f7.3:F%f7.3:S%7.3f\n",
- cpi->common.current_video_frame, y2, u2, v2,
- frame_psnr2, frame_ssim2);
- fclose(f);
- }
-#endif
- }
- }
-
- if (cpi->b_calculate_ssimg) {
- double y, u, v, frame_all;
- frame_all = vp9_calc_ssimg(cpi->Source, cm->frame_to_show,
- &y, &u, &v);
- cpi->total_ssimg_y += y;
- cpi->total_ssimg_u += u;
- cpi->total_ssimg_v += v;
- cpi->total_ssimg_all += frame_all;
- }
-
- }
- }
-
-#endif
-
-#if HAVE_ARMV7
-#if CONFIG_RUNTIME_CPU_DETECT
- if (cm->rtcd.flags & HAS_NEON)
-#endif
- {
- vp9_pop_neon(store_reg);
- }
-#endif
-
- return 0;
-}
-
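When timestamps drift gradually (the step test above stays below 10%), vp9_get_compressed_data() folds the new frame duration into roughly one second of history and re-derives the frame rate from the averaged duration; the constants imply a timebase of 10,000,000 ticks per second. The sketch below reproduces just that arithmetic as a standalone function; the name and the example numbers are illustrative.

#include <stdio.h>

/* Running frame-rate estimate from per-frame durations (illustrative).
 * Durations and the interval are in ticks of 1/10,000,000 second, and the
 * interval is capped at one second, matching the code above. */
static double update_frame_rate(double cur_frame_rate,
                                double interval_ticks,
                                double this_duration_ticks) {
  double avg_duration = 10000000.0 / cur_frame_rate;

  if (interval_ticks > 10000000.0)
    interval_ticks = 10000000.0;

  avg_duration *= (interval_ticks - avg_duration + this_duration_ticks);
  avg_duration /= interval_ticks;

  return 10000000.0 / avg_duration;
}

int main(void) {
  /* A nominal 30 fps stream in which one frame lasted 40 ms (~25 fps). */
  printf("new rate = %.3f fps\n",
         update_frame_rate(30.0, 10000000.0, 400000.0));  /* ~29.8 fps */
  return 0;
}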
-int vp9_get_preview_raw_frame(VP9_PTR comp, YV12_BUFFER_CONFIG *dest,
- vp9_ppflags_t *flags) {
- VP9_COMP *cpi = (VP9_COMP *) comp;
-
- if (cpi->common.refresh_alt_ref_frame)
- return -1;
- else {
- int ret;
-#if CONFIG_POSTPROC
- ret = vp9_post_proc_frame(&cpi->common, dest, flags);
-#else
-
- if (cpi->common.frame_to_show) {
- *dest = *cpi->common.frame_to_show;
- dest->y_width = cpi->common.Width;
- dest->y_height = cpi->common.Height;
- dest->uv_height = cpi->common.Height / 2;
- ret = 0;
- } else {
- ret = -1;
- }
-
-#endif // !CONFIG_POSTPROC
- vp9_clear_system_state();
- return ret;
- }
-}
-
-int vp9_set_roimap(VP9_PTR comp, unsigned char *map, unsigned int rows,
- unsigned int cols, int delta_q[4], int delta_lf[4],
- unsigned int threshold[4]) {
- VP9_COMP *cpi = (VP9_COMP *) comp;
- signed char feature_data[SEG_LVL_MAX][MAX_MB_SEGMENTS];
- MACROBLOCKD *xd = &cpi->mb.e_mbd;
- int i;
-
- if (cpi->common.mb_rows != rows || cpi->common.mb_cols != cols)
- return -1;
-
- if (!map) {
- vp9_disable_segmentation((VP9_PTR)cpi);
- return 0;
- }
-
- // Set the segmentation Map
- vp9_set_segmentation_map((VP9_PTR)cpi, map);
-
- // Activate segmentation.
- vp9_enable_segmentation((VP9_PTR)cpi);
-
- // Set up the quant segment data
- feature_data[SEG_LVL_ALT_Q][0] = delta_q[0];
- feature_data[SEG_LVL_ALT_Q][1] = delta_q[1];
- feature_data[SEG_LVL_ALT_Q][2] = delta_q[2];
- feature_data[SEG_LVL_ALT_Q][3] = delta_q[3];
-
- // Set up the loop filter segment data
- feature_data[SEG_LVL_ALT_LF][0] = delta_lf[0];
- feature_data[SEG_LVL_ALT_LF][1] = delta_lf[1];
- feature_data[SEG_LVL_ALT_LF][2] = delta_lf[2];
- feature_data[SEG_LVL_ALT_LF][3] = delta_lf[3];
-
- cpi->segment_encode_breakout[0] = threshold[0];
- cpi->segment_encode_breakout[1] = threshold[1];
- cpi->segment_encode_breakout[2] = threshold[2];
- cpi->segment_encode_breakout[3] = threshold[3];
-
- // Enable the loop and quant changes in the feature mask
- for (i = 0; i < 4; i++) {
- if (delta_q[i])
- vp9_enable_segfeature(xd, i, SEG_LVL_ALT_Q);
- else
- vp9_disable_segfeature(xd, i, SEG_LVL_ALT_Q);
-
- if (delta_lf[i])
- vp9_enable_segfeature(xd, i, SEG_LVL_ALT_LF);
- else
- vp9_disable_segfeature(xd, i, SEG_LVL_ALT_LF);
- }
-
- // Initialise the feature data structure
- // SEGMENT_DELTADATA 0, SEGMENT_ABSDATA 1
- vp9_set_segment_data((VP9_PTR)cpi, &feature_data[0][0], SEGMENT_DELTADATA);
-
- return 0;
-}
-
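vp9_set_roimap() above expects a per-macroblock segment-id map whose dimensions match cm->mb_rows x cm->mb_cols, plus four-entry delta_q, delta_lf and threshold tables indexed by segment id. The sketch below only shows a caller preparing those inputs; the resolution, deltas and region are arbitrary example values, and the final call is left as a comment since it needs a live encoder instance.

#include <stdio.h>
#include <stdlib.h>

int main(void) {
  const unsigned int mb_rows = 45, mb_cols = 80;   /* 1280x720 in 16x16 MBs */
  unsigned char *map = calloc((size_t)mb_rows * mb_cols, 1);
  int delta_q[4]  = { 0, -8, 0, 0 };   /* segment 1 coded at a lower Q */
  int delta_lf[4] = { 0,  2, 0, 0 };   /* and with a loop filter adjustment */
  unsigned int threshold[4] = { 0, 0, 0, 0 };
  unsigned int r, c, marked = 0;

  if (!map)
    return 1;

  /* Mark a centred region of interest as segment 1; the rest stays segment 0. */
  for (r = mb_rows / 4; r < 3 * mb_rows / 4; ++r) {
    for (c = mb_cols / 4; c < 3 * mb_cols / 4; ++c) {
      map[r * mb_cols + c] = 1;
      ++marked;
    }
  }

  /* With an encoder instance `comp`, the caller would then invoke:
   *   vp9_set_roimap(comp, map, mb_rows, mb_cols, delta_q, delta_lf, threshold);
   * which enables segmentation and, for segments with non-zero deltas,
   * the ALT_Q / ALT_LF features. */
  printf("marked %u of %u macroblocks as ROI\n", marked, mb_rows * mb_cols);

  free(map);
  (void)delta_q; (void)delta_lf; (void)threshold;
  return 0;
}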
-int vp9_set_active_map(VP9_PTR comp, unsigned char *map,
- unsigned int rows, unsigned int cols) {
- VP9_COMP *cpi = (VP9_COMP *) comp;
-
- if (rows == cpi->common.mb_rows && cols == cpi->common.mb_cols) {
- if (map) {
- vpx_memcpy(cpi->active_map, map, rows * cols);
- cpi->active_map_enabled = 1;
- } else
- cpi->active_map_enabled = 0;
-
- return 0;
- } else {
- // cpi->active_map_enabled = 0;
- return -1;
- }
-}
-
-int vp9_set_internal_size(VP9_PTR comp,
- VPX_SCALING horiz_mode, VPX_SCALING vert_mode) {
- VP9_COMP *cpi = (VP9_COMP *) comp;
-
- if (horiz_mode <= ONETWO)
- cpi->common.horiz_scale = horiz_mode;
- else
- return -1;
-
- if (vert_mode <= ONETWO)
- cpi->common.vert_scale = vert_mode;
- else
- return -1;
-
- return 0;
-}
-
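vp9_set_internal_size() above accepts any VPX_SCALING mode up to ONETWO for each axis. The fractions implied by the mode names (four-five, three-five, one-two) are shown in the sketch below; the helper and its table are illustrative, and the encoder's own scaling code remains authoritative for the exact arithmetic.

#include <stdio.h>

/* Mirrors the VPX_SCALING ordering used above: NORMAL..ONETWO. */
enum scaling_mode { NORMAL = 0, FOURFIVE = 1, THREEFIVE = 2, ONETWO = 3 };

/* Illustrative mapping from scaling mode to a scaled dimension. */
static int scaled_dimension(int dim, enum scaling_mode mode) {
  static const int num[] = { 1, 4, 3, 1 };
  static const int den[] = { 1, 5, 5, 2 };
  return dim * num[mode] / den[mode];
}

int main(void) {
  printf("1280 with horiz_scale = ONETWO   -> %d\n",
         scaled_dimension(1280, ONETWO));     /* 640 */
  printf(" 720 with vert_scale  = FOURFIVE -> %d\n",
         scaled_dimension(720, FOURFIVE));    /* 576 */
  return 0;
}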
-
-
-int vp9_calc_ss_err(YV12_BUFFER_CONFIG *source, YV12_BUFFER_CONFIG *dest) {
- int i, j;
- int Total = 0;
-
- unsigned char *src = source->y_buffer;
- unsigned char *dst = dest->y_buffer;
-
- // Loop through the Y plane raw and reconstruction data, summing the squared differences
- for (i = 0; i < source->y_height; i += 16) {
- for (j = 0; j < source->y_width; j += 16) {
- unsigned int sse;
- Total += vp9_mse16x16(src + j, source->y_stride, dst + j, dest->y_stride,
- &sse);
- }
-
- src += 16 * source->y_stride;
- dst += 16 * dest->y_stride;
- }
-
- return Total;
-}
-
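vp9_calc_ss_err() above accumulates 16x16 SSE blocks over the luma plane; elsewhere in this file the same kind of sum is turned into PSNR via vp9_mse2psnr(samples, 255.0, sse). The sketch below shows that conversion in isolation; the function name and the cap used for a zero-error frame are illustrative choices, not the library's exact values.

#include <math.h>
#include <stdio.h>

/* PSNR from an accumulated sum of squared errors (illustrative sketch). */
static double sse_to_psnr(double samples, double peak, double sse) {
  if (sse <= 0.0)
    return 99.0;  /* arbitrary ceiling for a perfect reconstruction */
  return 10.0 * log10(peak * peak * samples / sse);
}

int main(void) {
  /* 1280x720 luma plane with an average squared error of 4 per pixel. */
  double samples = 1280.0 * 720.0;
  printf("PSNR = %.2f dB\n", sse_to_psnr(samples, 255.0, 4.0 * samples));
  return 0;
}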
-
-int vp9_get_quantizer(VP9_PTR c) {
- VP9_COMP *cpi = (VP9_COMP *) c;
- return cpi->common.base_qindex;
-}
--- a/vp8/encoder/onyx_int.h
+++ /dev/null
@@ -1,788 +1,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#ifndef __INC_ONYX_INT_H
-#define __INC_ONYX_INT_H
-
-#include <stdio.h>
-#include "vpx_ports/config.h"
-#include "vp8/common/onyx.h"
-#include "treewriter.h"
-#include "tokenize.h"
-#include "vp8/common/onyxc_int.h"
-#include "variance.h"
-#include "encodemb.h"
-#include "quantize.h"
-#include "vp8/common/entropy.h"
-#include "vp8/common/entropymode.h"
-#include "vpx_ports/mem.h"
-#include "vpx/internal/vpx_codec_internal.h"
-#include "mcomp.h"
-#include "temporal_filter.h"
-#include "vp8/common/findnearmv.h"
-#include "lookahead.h"
-
-// #define SPEEDSTATS 1
-#define MIN_GF_INTERVAL 4
-#define DEFAULT_GF_INTERVAL 7
-
-#define KEY_FRAME_CONTEXT 5
-
-#define MAX_LAG_BUFFERS 25
-
-#define AF_THRESH 25
-#define AF_THRESH2 100
-#define ARF_DECAY_THRESH 12
-
-#if CONFIG_PRED_FILTER
-#define MAX_MODES 54
-#else // CONFIG_PRED_FILTER
-#define MAX_MODES 42
-#endif // CONFIG_PRED_FILTER
-
-#define MIN_THRESHMULT 32
-#define MAX_THRESHMULT 512
-
-#define GF_ZEROMV_ZBIN_BOOST 12
-#define LF_ZEROMV_ZBIN_BOOST 6
-#define MV_ZBIN_BOOST 4
-#define ZBIN_OQ_MAX 192
-
-#define VP9_TEMPORAL_ALT_REF 1
-
-typedef struct {
- nmv_context nmvc;
- int nmvjointcost[MV_JOINTS];
- int nmvcosts[2][MV_VALS];
- int nmvcosts_hp[2][MV_VALS];
-
-#ifdef MODE_STATS
- // Stats
- int y_modes[VP9_YMODES];
- int uv_modes[VP9_UV_MODES];
- int i8x8_modes[VP9_I8X8_MODES];
- int b_modes[B_MODE_COUNT];
- int inter_y_modes[MB_MODE_COUNT];
- int inter_uv_modes[VP9_UV_MODES];
- int inter_b_modes[B_MODE_COUNT];
-#endif
-
- vp9_prob segment_pred_probs[PREDICTION_PROBS];
- unsigned char ref_pred_probs_update[PREDICTION_PROBS];
- vp9_prob ref_pred_probs[PREDICTION_PROBS];
- vp9_prob prob_comppred[COMP_PRED_CONTEXTS];
-
- unsigned char *last_frame_seg_map_copy;
-
- // 0 = Intra, Last, GF, ARF
- signed char last_ref_lf_deltas[MAX_REF_LF_DELTAS];
- // 0 = BPRED, ZERO_MV, MV, SPLIT
- signed char last_mode_lf_deltas[MAX_MODE_LF_DELTAS];
-
- vp9_prob coef_probs[BLOCK_TYPES]
- [COEF_BANDS][PREV_COEF_CONTEXTS][ENTROPY_NODES];
- vp9_prob hybrid_coef_probs[BLOCK_TYPES]
- [COEF_BANDS][PREV_COEF_CONTEXTS][ENTROPY_NODES];
-
- vp9_prob coef_probs_8x8[BLOCK_TYPES_8X8]
- [COEF_BANDS][PREV_COEF_CONTEXTS][ENTROPY_NODES];
- vp9_prob hybrid_coef_probs_8x8[BLOCK_TYPES_8X8]
- [COEF_BANDS][PREV_COEF_CONTEXTS][ENTROPY_NODES];
-
- vp9_prob coef_probs_16x16[BLOCK_TYPES_16X16]
- [COEF_BANDS][PREV_COEF_CONTEXTS][ENTROPY_NODES];
- vp9_prob hybrid_coef_probs_16x16[BLOCK_TYPES_16X16]
- [COEF_BANDS][PREV_COEF_CONTEXTS][ENTROPY_NODES];
-
- vp9_prob ymode_prob [VP9_YMODES - 1]; /* interframe intra mode probs */
- vp9_prob uv_mode_prob [VP9_YMODES][VP9_UV_MODES - 1];
- vp9_prob bmode_prob [VP9_BINTRAMODES - 1];
- vp9_prob i8x8_mode_prob [VP9_I8X8_MODES - 1];
- vp9_prob sub_mv_ref_prob [SUBMVREF_COUNT][VP9_SUBMVREFS - 1];
- vp9_prob mbsplit_prob [VP9_NUMMBSPLITS - 1];
-
- vp9_prob switchable_interp_prob[VP9_SWITCHABLE_FILTERS + 1]
- [VP9_SWITCHABLE_FILTERS - 1];
-
- int mv_ref_ct[6][4][2];
- int mode_context[6][4];
- int mv_ref_ct_a[6][4][2];
- int mode_context_a[6][4];
-
-} CODING_CONTEXT;
-
-typedef struct {
- double frame;
- double intra_error;
- double coded_error;
- double sr_coded_error;
- double ssim_weighted_pred_err;
- double pcnt_inter;
- double pcnt_motion;
- double pcnt_second_ref;
- double pcnt_neutral;
- double MVr;
- double mvr_abs;
- double MVc;
- double mvc_abs;
- double MVrv;
- double MVcv;
- double mv_in_out_count;
- double new_mv_count;
- double duration;
- double count;
-}
-FIRSTPASS_STATS;
-
-typedef struct {
- int frames_so_far;
- double frame_intra_error;
- double frame_coded_error;
- double frame_pcnt_inter;
- double frame_pcnt_motion;
- double frame_mvr;
- double frame_mvr_abs;
- double frame_mvc;
- double frame_mvc_abs;
-
-} ONEPASS_FRAMESTATS;
-
-typedef struct {
- struct {
- int err;
- union {
- int_mv mv;
- MB_PREDICTION_MODE mode;
- } m;
- } ref[MAX_REF_FRAMES];
-} MBGRAPH_MB_STATS;
-
-typedef struct {
- MBGRAPH_MB_STATS *mb_stats;
-} MBGRAPH_FRAME_STATS;
-
-#if CONFIG_PRED_FILTER
-typedef enum {
- THR_ZEROMV,
- THR_ZEROMV_FILT,
- THR_DC,
-
- THR_NEARESTMV,
- THR_NEARESTMV_FILT,
- THR_NEARMV,
- THR_NEARMV_FILT,
-
- THR_ZEROG,
- THR_ZEROG_FILT,
- THR_NEARESTG,
- THR_NEARESTG_FILT,
-
- THR_ZEROA,
- THR_ZEROA_FILT,
- THR_NEARESTA,
- THR_NEARESTA_FILT,
-
- THR_NEARG,
- THR_NEARG_FILT,
- THR_NEARA,
- THR_NEARA_FILT,
-
- THR_V_PRED,
- THR_H_PRED,
- THR_D45_PRED,
- THR_D135_PRED,
- THR_D117_PRED,
- THR_D153_PRED,
- THR_D27_PRED,
- THR_D63_PRED,
- THR_TM,
-
- THR_NEWMV,
- THR_NEWMV_FILT,
- THR_NEWG,
- THR_NEWG_FILT,
- THR_NEWA,
- THR_NEWA_FILT,
-
- THR_SPLITMV,
- THR_SPLITG,
- THR_SPLITA,
-
- THR_B_PRED,
- THR_I8X8_PRED,
-
- THR_COMP_ZEROLG,
- THR_COMP_NEARESTLG,
- THR_COMP_NEARLG,
-
- THR_COMP_ZEROLA,
- THR_COMP_NEARESTLA,
- THR_COMP_NEARLA,
-
- THR_COMP_ZEROGA,
- THR_COMP_NEARESTGA,
- THR_COMP_NEARGA,
-
- THR_COMP_NEWLG,
- THR_COMP_NEWLA,
- THR_COMP_NEWGA,
-
- THR_COMP_SPLITLG,
- THR_COMP_SPLITLA,
- THR_COMP_SPLITGA,
-}
-THR_MODES;
-#else
-typedef enum {
- THR_ZEROMV,
- THR_DC,
-
- THR_NEARESTMV,
- THR_NEARMV,
-
- THR_ZEROG,
- THR_NEARESTG,
-
- THR_ZEROA,
- THR_NEARESTA,
-
- THR_NEARG,
- THR_NEARA,
-
- THR_V_PRED,
- THR_H_PRED,
- THR_D45_PRED,
- THR_D135_PRED,
- THR_D117_PRED,
- THR_D153_PRED,
- THR_D27_PRED,
- THR_D63_PRED,
- THR_TM,
-
- THR_NEWMV,
- THR_NEWG,
- THR_NEWA,
-
- THR_SPLITMV,
- THR_SPLITG,
- THR_SPLITA,
-
- THR_B_PRED,
- THR_I8X8_PRED,
-
- THR_COMP_ZEROLG,
- THR_COMP_NEARESTLG,
- THR_COMP_NEARLG,
-
- THR_COMP_ZEROLA,
- THR_COMP_NEARESTLA,
- THR_COMP_NEARLA,
-
- THR_COMP_ZEROGA,
- THR_COMP_NEARESTGA,
- THR_COMP_NEARGA,
-
- THR_COMP_NEWLG,
- THR_COMP_NEWLA,
- THR_COMP_NEWGA,
-
- THR_COMP_SPLITLG,
- THR_COMP_SPLITLA,
- THR_COMP_SPLITGA
-}
-THR_MODES;
-#endif
-
-typedef enum {
- DIAMOND = 0,
- NSTEP = 1,
- HEX = 2
-} SEARCH_METHODS;
-
-typedef struct {
- int RD;
- SEARCH_METHODS search_method;
- int improved_dct;
- int auto_filter;
- int recode_loop;
- int iterative_sub_pixel;
- int half_pixel_search;
- int quarter_pixel_search;
- int thresh_mult[MAX_MODES];
- int max_step_search_steps;
- int first_step;
- int optimize_coefficients;
- int no_skip_block4x4_search;
- int improved_mv_pred;
- int search_best_filter;
-
-} SPEED_FEATURES;
-
-typedef struct {
- MACROBLOCK mb;
- int totalrate;
-} MB_ROW_COMP;
-
-typedef struct {
- TOKENEXTRA *start;
- TOKENEXTRA *stop;
-} TOKENLIST;
-
-typedef struct {
- int ithread;
- void *ptr1;
- void *ptr2;
-} ENCODETHREAD_DATA;
-typedef struct {
- int ithread;
- void *ptr1;
-} LPFTHREAD_DATA;
-
-
-typedef struct VP9_ENCODER_RTCD {
- VP9_COMMON_RTCD *common;
- vp9_search_rtcd_vtable_t search;
- vp9_temporal_rtcd_vtable_t temporal;
-} VP9_ENCODER_RTCD;
-
-enum BlockSize {
- BLOCK_16X8 = PARTITIONING_16X8,
- BLOCK_8X16 = PARTITIONING_8X16,
- BLOCK_8X8 = PARTITIONING_8X8,
- BLOCK_4X4 = PARTITIONING_4X4,
- BLOCK_16X16,
- BLOCK_MAX_SEGMENTS,
- BLOCK_32X32 = BLOCK_MAX_SEGMENTS,
- BLOCK_MAX_SB_SEGMENTS,
-};
-
-typedef struct VP9_COMP {
-
- DECLARE_ALIGNED(16, short, Y1quant[QINDEX_RANGE][16]);
- DECLARE_ALIGNED(16, unsigned char, Y1quant_shift[QINDEX_RANGE][16]);
- DECLARE_ALIGNED(16, short, Y1zbin[QINDEX_RANGE][16]);
- DECLARE_ALIGNED(16, short, Y1round[QINDEX_RANGE][16]);
-
- DECLARE_ALIGNED(16, short, Y2quant[QINDEX_RANGE][16]);
- DECLARE_ALIGNED(16, unsigned char, Y2quant_shift[QINDEX_RANGE][16]);
- DECLARE_ALIGNED(16, short, Y2zbin[QINDEX_RANGE][16]);
- DECLARE_ALIGNED(16, short, Y2round[QINDEX_RANGE][16]);
-
- DECLARE_ALIGNED(16, short, UVquant[QINDEX_RANGE][16]);
- DECLARE_ALIGNED(16, unsigned char, UVquant_shift[QINDEX_RANGE][16]);
- DECLARE_ALIGNED(16, short, UVzbin[QINDEX_RANGE][16]);
- DECLARE_ALIGNED(16, short, UVround[QINDEX_RANGE][16]);
-
- DECLARE_ALIGNED(16, short, zrun_zbin_boost_y1[QINDEX_RANGE][16]);
- DECLARE_ALIGNED(16, short, zrun_zbin_boost_y2[QINDEX_RANGE][16]);
- DECLARE_ALIGNED(16, short, zrun_zbin_boost_uv[QINDEX_RANGE][16]);
-
- DECLARE_ALIGNED(64, short, Y1zbin_8x8[QINDEX_RANGE][64]);
- DECLARE_ALIGNED(64, short, Y2zbin_8x8[QINDEX_RANGE][64]);
- DECLARE_ALIGNED(64, short, UVzbin_8x8[QINDEX_RANGE][64]);
- DECLARE_ALIGNED(64, short, zrun_zbin_boost_y1_8x8[QINDEX_RANGE][64]);
- DECLARE_ALIGNED(64, short, zrun_zbin_boost_y2_8x8[QINDEX_RANGE][64]);
- DECLARE_ALIGNED(64, short, zrun_zbin_boost_uv_8x8[QINDEX_RANGE][64]);
-
- DECLARE_ALIGNED(16, short, Y1zbin_16x16[QINDEX_RANGE][256]);
- DECLARE_ALIGNED(16, short, Y2zbin_16x16[QINDEX_RANGE][256]);
- DECLARE_ALIGNED(16, short, UVzbin_16x16[QINDEX_RANGE][256]);
- DECLARE_ALIGNED(16, short, zrun_zbin_boost_y1_16x16[QINDEX_RANGE][256]);
- DECLARE_ALIGNED(16, short, zrun_zbin_boost_y2_16x16[QINDEX_RANGE][256]);
- DECLARE_ALIGNED(16, short, zrun_zbin_boost_uv_16x16[QINDEX_RANGE][256]);
-
- MACROBLOCK mb;
- VP9_COMMON common;
- VP9_CONFIG oxcf;
-
- struct lookahead_ctx *lookahead;
- struct lookahead_entry *source;
- struct lookahead_entry *alt_ref_source;
-
- YV12_BUFFER_CONFIG *Source;
- YV12_BUFFER_CONFIG *un_scaled_source;
- YV12_BUFFER_CONFIG scaled_source;
-
- int source_alt_ref_pending; // frame in src_buffers has been identified to be encoded as an alt ref
- int source_alt_ref_active; // an alt ref frame has been encoded and is usable
-
- int is_src_frame_alt_ref; // source of frame to encode is an exact copy of an alt ref frame
-
- int gold_is_last; // golden frame same as last frame (short circuit gold searches)
- int alt_is_last; // Alt reference frame same as last (short circuit altref search)
- int gold_is_alt; // don't do both alt and gold search (just do gold).
-
- // int refresh_alt_ref_frame;
- YV12_BUFFER_CONFIG last_frame_uf;
-
- TOKENEXTRA *tok;
- unsigned int tok_count;
-
-
- unsigned int frames_since_key;
- unsigned int key_frame_frequency;
- unsigned int this_key_frame_forced;
- unsigned int next_key_frame_forced;
-
- // Ambient reconstruction err target for force key frames
- int ambient_err;
-
- unsigned int mode_check_freq[MAX_MODES];
- unsigned int mode_test_hit_counts[MAX_MODES];
- unsigned int mode_chosen_counts[MAX_MODES];
-
- int rd_thresh_mult[MAX_MODES];
- int rd_baseline_thresh[MAX_MODES];
- int rd_threshes[MAX_MODES];
- int64_t rd_comp_pred_diff[NB_PREDICTION_TYPES];
- int rd_prediction_type_threshes[4][NB_PREDICTION_TYPES];
- int comp_pred_count[COMP_PRED_CONTEXTS];
- int single_pred_count[COMP_PRED_CONTEXTS];
- // FIXME contextualize
- int txfm_count[TX_SIZE_MAX];
- int txfm_count_8x8p[TX_SIZE_MAX - 1];
- int64_t rd_tx_select_diff[NB_TXFM_MODES];
- int rd_tx_select_threshes[4][NB_TXFM_MODES];
-
- int RDMULT;
- int RDDIV;
-
- CODING_CONTEXT coding_context;
-
- // Rate targeting variables
- int64_t prediction_error;
- int64_t last_prediction_error;
- int64_t intra_error;
- int64_t last_intra_error;
-
- int this_frame_target;
- int projected_frame_size;
- int last_q[2]; // Separate values for Intra/Inter
- int last_boosted_qindex; // Last boosted GF/KF/ARF q
-
- double rate_correction_factor;
- double key_frame_rate_correction_factor;
- double gf_rate_correction_factor;
-
- int frames_till_gf_update_due; // Count down till next GF
- int current_gf_interval; // GF interval chosen when we coded the last GF
-
- int gf_overspend_bits; // Total bits overspent because of GF boost (cumulative)
-
- int non_gf_bitrate_adjustment; // Used in the few frames following a GF to recover the extra bits spent in that GF
-
- int kf_overspend_bits; // Extra bits spent on key frames that need to be recovered on inter frames
- int kf_bitrate_adjustment; // Current number of bits to try to recover on each inter frame.
- int max_gf_interval;
- int baseline_gf_interval;
- int active_arnr_frames; // <= cpi->oxcf.arnr_max_frames
-
- int64_t key_frame_count;
- int prior_key_frame_distance[KEY_FRAME_CONTEXT];
- int per_frame_bandwidth; // Current section per frame bandwidth target
- int av_per_frame_bandwidth; // Average frame size target for clip
- int min_frame_bandwidth; // Minimum allocation that should be used for any frame
- int inter_frame_target;
- double output_frame_rate;
- int64_t last_time_stamp_seen;
- int64_t last_end_time_stamp_seen;
- int64_t first_time_stamp_ever;
-
- int ni_av_qi;
- int ni_tot_qi;
- int ni_frames;
- int avg_frame_qindex;
- double tot_q;
- double avg_q;
-
- int zbin_over_quant;
- int zbin_mode_boost;
- int zbin_mode_boost_enabled;
-
- int64_t total_byte_count;
-
- int buffered_mode;
-
- int buffer_level;
- int bits_off_target;
-
- int rolling_target_bits;
- int rolling_actual_bits;
-
- int long_rolling_target_bits;
- int long_rolling_actual_bits;
-
- int64_t total_actual_bits;
- int total_target_vs_actual; // debug stats
-
- int worst_quality;
- int active_worst_quality;
- int best_quality;
- int active_best_quality;
-
- int cq_target_quality;
-
-#if CONFIG_SUPERBLOCKS
- int sb_count;
- int sb_ymode_count [VP9_I32X32_MODES];
-#endif
- int ymode_count [VP9_YMODES]; /* intra MB type cts this frame */
- int bmode_count [VP9_BINTRAMODES];
- int i8x8_mode_count [VP9_I8X8_MODES];
- int sub_mv_ref_count [SUBMVREF_COUNT][VP9_SUBMVREFS];
- int mbsplit_count [VP9_NUMMBSPLITS];
- // int uv_mode_count[VP9_UV_MODES]; /* intra MB type cts this frame */
- int y_uv_mode_count[VP9_YMODES][VP9_UV_MODES];
-
- nmv_context_counts NMVcount;
-
- unsigned int coef_counts [BLOCK_TYPES] [COEF_BANDS] [PREV_COEF_CONTEXTS] [MAX_ENTROPY_TOKENS]; /* for this frame */
- vp9_prob frame_coef_probs [BLOCK_TYPES] [COEF_BANDS] [PREV_COEF_CONTEXTS] [ENTROPY_NODES];
- unsigned int frame_branch_ct [BLOCK_TYPES] [COEF_BANDS] [PREV_COEF_CONTEXTS] [ENTROPY_NODES][2];
- unsigned int hybrid_coef_counts [BLOCK_TYPES] [COEF_BANDS] [PREV_COEF_CONTEXTS] [MAX_ENTROPY_TOKENS]; /* for this frame */
- vp9_prob frame_hybrid_coef_probs [BLOCK_TYPES] [COEF_BANDS] [PREV_COEF_CONTEXTS] [ENTROPY_NODES];
- unsigned int frame_hybrid_branch_ct [BLOCK_TYPES] [COEF_BANDS] [PREV_COEF_CONTEXTS] [ENTROPY_NODES][2];
-
- unsigned int coef_counts_8x8 [BLOCK_TYPES_8X8] [COEF_BANDS] [PREV_COEF_CONTEXTS] [MAX_ENTROPY_TOKENS]; /* for this frame */
- vp9_prob frame_coef_probs_8x8 [BLOCK_TYPES_8X8] [COEF_BANDS] [PREV_COEF_CONTEXTS] [ENTROPY_NODES];
- unsigned int frame_branch_ct_8x8 [BLOCK_TYPES_8X8] [COEF_BANDS] [PREV_COEF_CONTEXTS] [ENTROPY_NODES][2];
- unsigned int hybrid_coef_counts_8x8 [BLOCK_TYPES_8X8] [COEF_BANDS] [PREV_COEF_CONTEXTS] [MAX_ENTROPY_TOKENS]; /* for this frame */
- vp9_prob frame_hybrid_coef_probs_8x8 [BLOCK_TYPES_8X8] [COEF_BANDS] [PREV_COEF_CONTEXTS] [ENTROPY_NODES];
- unsigned int frame_hybrid_branch_ct_8x8 [BLOCK_TYPES_8X8] [COEF_BANDS] [PREV_COEF_CONTEXTS] [ENTROPY_NODES][2];
-
- unsigned int coef_counts_16x16 [BLOCK_TYPES_16X16] [COEF_BANDS] [PREV_COEF_CONTEXTS] [MAX_ENTROPY_TOKENS]; /* for this frame */
- vp9_prob frame_coef_probs_16x16 [BLOCK_TYPES_16X16] [COEF_BANDS] [PREV_COEF_CONTEXTS] [ENTROPY_NODES];
- unsigned int frame_branch_ct_16x16 [BLOCK_TYPES_16X16] [COEF_BANDS] [PREV_COEF_CONTEXTS] [ENTROPY_NODES][2];
- unsigned int hybrid_coef_counts_16x16 [BLOCK_TYPES_16X16] [COEF_BANDS] [PREV_COEF_CONTEXTS] [MAX_ENTROPY_TOKENS]; /* for this frame */
- vp9_prob frame_hybrid_coef_probs_16x16 [BLOCK_TYPES_16X16] [COEF_BANDS] [PREV_COEF_CONTEXTS] [ENTROPY_NODES];
- unsigned int frame_hybrid_branch_ct_16x16 [BLOCK_TYPES_16X16] [COEF_BANDS] [PREV_COEF_CONTEXTS] [ENTROPY_NODES][2];
-
- int gfu_boost;
- int last_boost;
- int kf_boost;
- int kf_zeromotion_pct;
-
- int target_bandwidth;
- struct vpx_codec_pkt_list *output_pkt_list;
-
-#if 0
- // Experimental code for lagged and one pass
- ONEPASS_FRAMESTATS one_pass_frame_stats[MAX_LAG_BUFFERS];
- int one_pass_frame_index;
-#endif
- MBGRAPH_FRAME_STATS mbgraph_stats[MAX_LAG_BUFFERS];
- int mbgraph_n_frames; // number of frames filled in the above
- int static_mb_pct; // % forced skip mbs by segmentation
- int seg0_progress, seg0_idx, seg0_cnt;
- int ref_pred_count[3][2];
-
- int decimation_factor;
- int decimation_count;
-
- // for real time encoding
- int avg_encode_time; // microsecond
- int avg_pick_mode_time; // microsecond
- int Speed;
- unsigned int cpu_freq; // MHz
- int compressor_speed;
-
- int interquantizer;
- int goldfreq;
- int auto_worst_q;
- int cpu_used;
- int horiz_scale;
- int vert_scale;
- int pass;
-
- vp9_prob last_skip_false_probs[3][MBSKIP_CONTEXTS];
- int last_skip_probs_q[3];
-
- int recent_ref_frame_usage[MAX_REF_FRAMES];
- int count_mb_ref_frame_usage[MAX_REF_FRAMES];
- int ref_frame_flags;
-
- unsigned char ref_pred_probs_update[PREDICTION_PROBS];
-
- SPEED_FEATURES sf;
- int error_bins[1024];
-
- // Data used for real time conferencing mode to help determine if it would be good to update the gf
- int inter_zz_count;
- int gf_bad_count;
- int gf_update_recommended;
- int skip_true_count[3];
- int skip_false_count[3];
-
- unsigned char *segmentation_map;
-
- // segment threshold for encode breakout
- int segment_encode_breakout[MAX_MB_SEGMENTS];
-
- unsigned char *active_map;
- unsigned int active_map_enabled;
-
- TOKENLIST *tplist;
-
- fractional_mv_step_fp *find_fractional_mv_step;
- vp9_full_search_fn_t full_search_sad;
- vp9_refining_search_fn_t refining_search_sad;
- vp9_diamond_search_fn_t diamond_search_sad;
- vp9_variance_fn_ptr_t fn_ptr[BLOCK_MAX_SB_SEGMENTS];
- uint64_t time_receive_data;
- uint64_t time_compress_data;
- uint64_t time_pick_lpf;
- uint64_t time_encode_mb_row;
-
- int base_skip_false_prob[QINDEX_RANGE][3];
-
- struct twopass_rc {
- unsigned int section_intra_rating;
- unsigned int next_iiratio;
- unsigned int this_iiratio;
- FIRSTPASS_STATS *total_stats;
- FIRSTPASS_STATS *this_frame_stats;
- FIRSTPASS_STATS *stats_in, *stats_in_end, *stats_in_start;
- FIRSTPASS_STATS *total_left_stats;
- int first_pass_done;
- int64_t bits_left;
- int64_t clip_bits_total;
- double avg_iiratio;
- double modified_error_total;
- double modified_error_used;
- double modified_error_left;
- double kf_intra_err_min;
- double gf_intra_err_min;
- int frames_to_key;
- int maxq_max_limit;
- int maxq_min_limit;
- int static_scene_max_gf_interval;
- int kf_bits;
- int gf_group_error_left; // Remaining error from uncoded frames in a gf group. Two pass use only
-
- // Projected total bits available for a key frame group of frames
- int64_t kf_group_bits;
-
- // Error score of frames still to be coded in kf group
- int64_t kf_group_error_left;
-
- int gf_group_bits; // Projected Bits available for a group of frames including 1 GF or ARF
- int gf_bits; // Bits for the golden frame or ARF - 2 pass only
- int alt_extra_bits;
-
- int sr_update_lag;
- double est_max_qcorrection_factor;
- } twopass;
-
-#if CONFIG_RUNTIME_CPU_DETECT
- VP9_ENCODER_RTCD rtcd;
-#endif
-#if VP9_TEMPORAL_ALT_REF
- YV12_BUFFER_CONFIG alt_ref_buffer;
- YV12_BUFFER_CONFIG *frames[MAX_LAG_BUFFERS];
- int fixed_divide[512];
-#endif
-
-#if CONFIG_INTERNAL_STATS
- int count;
- double total_y;
- double total_u;
- double total_v;
- double total;
- double total_sq_error;
- double totalp_y;
- double totalp_u;
- double totalp_v;
- double totalp;
- double total_sq_error2;
- int bytes;
- double summed_quality;
- double summed_weights;
- unsigned int tot_recode_hits;
-
-
- double total_ssimg_y;
- double total_ssimg_u;
- double total_ssimg_v;
- double total_ssimg_all;
-
- int b_calculate_ssimg;
-#endif
- int b_calculate_psnr;
-
- // Per MB activity measurement
- unsigned int activity_avg;
- unsigned int *mb_activity_map;
- int *mb_norm_activity_map;
-
- // Record of which MBs still refer to last golden frame either
- // directly or through 0,0
- unsigned char *gf_active_flags;
- int gf_active_count;
-
- int output_partition;
-
- // Store last frame's MV info for next frame MV prediction
- int_mv *lfmv;
- int *lf_ref_frame_sign_bias;
- int *lf_ref_frame;
-
- /* force next frame to intra when kf_auto says so */
- int force_next_frame_intra;
-
- int droppable;
-
- // TODO Do we still need this??
- int update_context;
-
- int dummy_packing; /* flag to indicate if packing is dummy */
-
-#if CONFIG_PRED_FILTER
- int pred_filter_on_count;
- int pred_filter_off_count;
-#endif
- unsigned int switchable_interp_count[VP9_SWITCHABLE_FILTERS + 1]
- [VP9_SWITCHABLE_FILTERS];
-
-#if CONFIG_NEW_MVREF
- unsigned int best_ref_index_counts[MAX_REF_FRAMES][MAX_MV_REFS];
-#endif
-
-} VP9_COMP;
-
-void vp9_encode_frame(VP9_COMP *cpi);
-
-void vp9_pack_bitstream(VP9_COMP *cpi, unsigned char *dest,
- unsigned long *size);
-
-void vp9_activity_masking(VP9_COMP *cpi, MACROBLOCK *x);
-
-void vp9_tokenize_mb(VP9_COMP *, MACROBLOCKD *, TOKENEXTRA **, int dry_run);
-void vp9_stuff_mb(VP9_COMP *cpi, MACROBLOCKD *xd, TOKENEXTRA **t, int dry_run);
-
-void vp9_set_speed_features(VP9_COMP *cpi);
-
-#if CONFIG_DEBUG
-#define CHECK_MEM_ERROR(lval,expr) do {\
- lval = (expr); \
- if(!lval) \
- vpx_internal_error(&cpi->common.error, VPX_CODEC_MEM_ERROR,\
- "Failed to allocate "#lval" at %s:%d", \
- __FILE__,__LINE__);\
- } while(0)
-#else
-#define CHECK_MEM_ERROR(lval,expr) do {\
- lval = (expr); \
- if(!lval) \
- vpx_internal_error(&cpi->common.error, VPX_CODEC_MEM_ERROR,\
- "Failed to allocate "#lval);\
- } while(0)
-#endif
-#endif // __INC_ONYX_INT_H
--- a/vp8/encoder/picklpf.c
+++ /dev/null
@@ -1,420 +1,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include "vp8/common/onyxc_int.h"
-#include "onyx_int.h"
-#include "quantize.h"
-#include "vpx_mem/vpx_mem.h"
-#include "vpx_scale/yv12extend.h"
-#include "vpx_scale/vpxscale.h"
-#include "vp8/common/alloccommon.h"
-#include "vp8/common/loopfilter.h"
-#if ARCH_ARM
-#include "vpx_ports/arm.h"
-#endif
-
-extern int vp9_calc_ss_err(YV12_BUFFER_CONFIG *source,
- YV12_BUFFER_CONFIG *dest);
-#if HAVE_ARMV7
-extern void vp8_yv12_copy_frame_yonly_no_extend_frame_borders_neon(YV12_BUFFER_CONFIG *src_ybc, YV12_BUFFER_CONFIG *dst_ybc);
-#endif
-
-#if CONFIG_RUNTIME_CPU_DETECT
-#define IF_RTCD(x) (x)
-#else
-#define IF_RTCD(x) NULL
-#endif
-
-extern void(*vp9_yv12_copy_partial_frame_ptr)(YV12_BUFFER_CONFIG *src_ybc,
- YV12_BUFFER_CONFIG *dst_ybc,
- int fraction);
-
-void vp9_yv12_copy_partial_frame(YV12_BUFFER_CONFIG *src_ybc,
- YV12_BUFFER_CONFIG *dst_ybc, int Fraction) {
- unsigned char *src_y, *dst_y;
- int yheight;
- int ystride;
- int border;
- int yoffset;
- int linestocopy;
-
- border = src_ybc->border;
- yheight = src_ybc->y_height;
- ystride = src_ybc->y_stride;
-
- linestocopy = (yheight >> (Fraction + 4));
-
- if (linestocopy < 1)
- linestocopy = 1;
-
- linestocopy <<= 4;
-
- yoffset = ystride * ((yheight >> 5) * 16 - 8);
- src_y = src_ybc->y_buffer + yoffset;
- dst_y = dst_ybc->y_buffer + yoffset;
-
- vpx_memcpy(dst_y, src_y, ystride * (linestocopy + 16));
-}
-
-static int calc_partial_ssl_err(YV12_BUFFER_CONFIG *source,
- YV12_BUFFER_CONFIG *dest, int Fraction) {
- int i, j;
- int Total = 0;
- int srcoffset, dstoffset;
- unsigned char *src = source->y_buffer;
- unsigned char *dst = dest->y_buffer;
-
- int linestocopy = (source->y_height >> (Fraction + 4));
-
- if (linestocopy < 1)
- linestocopy = 1;
-
- linestocopy <<= 4;
-
-
- srcoffset = source->y_stride * (dest->y_height >> 5) * 16;
- dstoffset = dest->y_stride * (dest->y_height >> 5) * 16;
-
- src += srcoffset;
- dst += dstoffset;
-
- // Loop through the Y plane raw and reconstruction data, summing the squared differences
- for (i = 0; i < linestocopy; i += 16) {
- for (j = 0; j < source->y_width; j += 16) {
- unsigned int sse;
- Total += vp9_mse16x16(src + j, source->y_stride, dst + j, dest->y_stride,
- &sse);
- }
-
- src += 16 * source->y_stride;
- dst += 16 * dest->y_stride;
- }
-
- return Total;
-}
-
-// Enforce a minimum filter level based upon baseline Q
-static int get_min_filter_level(VP9_COMP *cpi, int base_qindex) {
- int min_filter_level;
- /*int q = (int) vp9_convert_qindex_to_q(base_qindex);
-
- if (cpi->source_alt_ref_active && cpi->common.refresh_golden_frame && !cpi->common.refresh_alt_ref_frame)
- min_filter_level = 0;
- else
- {
- if (q <= 10)
- min_filter_level = 0;
- else if (q <= 64)
- min_filter_level = 1;
- else
- min_filter_level = (q >> 6);
- }
- */
- min_filter_level = 0;
-
- return min_filter_level;
-}
-
-// Enforce a maximum filter level based upon baseline Q
-static int get_max_filter_level(VP9_COMP *cpi, int base_qindex) {
- // PGW August 2006: Highest filter values almost always a bad idea
-
- // jbb chg: 20100118 - not so any more with this overquant stuff; allow high values
- // with lots of intra coming in.
- int max_filter_level = MAX_LOOP_FILTER;// * 3 / 4;
- (void)base_qindex;
-
- if (cpi->twopass.section_intra_rating > 8)
- max_filter_level = MAX_LOOP_FILTER * 3 / 4;
-
- return max_filter_level;
-}
-
-void vp9_pick_filter_level_fast(YV12_BUFFER_CONFIG *sd, VP9_COMP *cpi) {
- VP9_COMMON *cm = &cpi->common;
-
- int best_err = 0;
- int filt_err = 0;
- int min_filter_level = get_min_filter_level(cpi, cm->base_qindex);
- int max_filter_level = get_max_filter_level(cpi, cm->base_qindex);
- int filt_val;
- int best_filt_val = cm->filter_level;
-
- // Make a copy of the unfiltered / processed recon buffer
- vp9_yv12_copy_partial_frame_ptr(cm->frame_to_show, &cpi->last_frame_uf, 3);
-
- if (cm->frame_type == KEY_FRAME)
- cm->sharpness_level = 0;
- else
- cm->sharpness_level = cpi->oxcf.Sharpness;
-
- if (cm->sharpness_level != cm->last_sharpness_level) {
- vp9_loop_filter_update_sharpness(&cm->lf_info, cm->sharpness_level);
- cm->last_sharpness_level = cm->sharpness_level;
- }
-
- // Start the search at the previous frame filter level unless it is now out of range.
- if (cm->filter_level < min_filter_level)
- cm->filter_level = min_filter_level;
- else if (cm->filter_level > max_filter_level)
- cm->filter_level = max_filter_level;
-
- filt_val = cm->filter_level;
- best_filt_val = filt_val;
-
- // Get the err using the previous frame's filter value.
- vp9_loop_filter_partial_frame(cm, &cpi->mb.e_mbd, filt_val);
-
- best_err = calc_partial_ssl_err(sd, cm->frame_to_show, 3);
-
- // Re-instate the unfiltered frame
- vp9_yv12_copy_partial_frame_ptr(&cpi->last_frame_uf, cm->frame_to_show, 3);
-
- filt_val -= (1 + ((filt_val > 10) ? 1 : 0));
-
- // Search lower filter levels
- while (filt_val >= min_filter_level) {
- // Apply the loop filter
- vp9_loop_filter_partial_frame(cm, &cpi->mb.e_mbd, filt_val);
-
- // Get the err for filtered frame
- filt_err = calc_partial_ssl_err(sd, cm->frame_to_show, 3);
-
- // Re-instate the unfiltered frame
- vp9_yv12_copy_partial_frame_ptr(&cpi->last_frame_uf, cm->frame_to_show, 3);
-
-
- // Update the best case record or exit loop.
- if (filt_err < best_err) {
- best_err = filt_err;
- best_filt_val = filt_val;
- } else
- break;
-
- // Adjust filter level
- filt_val -= (1 + ((filt_val > 10) ? 1 : 0));
- }
-
- // Search up (note that we have already done filt_val = cm->filter_level)
- filt_val = cm->filter_level + (1 + ((filt_val > 10) ? 1 : 0));
-
- if (best_filt_val == cm->filter_level) {
- // Resist raising filter level for very small gains
- best_err -= (best_err >> 10);
-
- while (filt_val < max_filter_level) {
- // Apply the loop filter
- vp9_loop_filter_partial_frame(cm, &cpi->mb.e_mbd, filt_val);
-
- // Get the err for filtered frame
- filt_err = calc_partial_ssl_err(sd, cm->frame_to_show, 3);
-
- // Re-instate the unfiltered frame
- vp9_yv12_copy_partial_frame_ptr(&cpi->last_frame_uf,
- cm->frame_to_show, 3);
-
- // Update the best case record or exit loop.
- if (filt_err < best_err) {
- // Do not raise filter level if improvement is < 1 part in 4096
- best_err = filt_err - (filt_err >> 10);
-
- best_filt_val = filt_val;
- } else
- break;
-
- // Adjust filter level
- filt_val += (1 + ((filt_val > 10) ? 1 : 0));
- }
- }
-
- cm->filter_level = best_filt_val;
-
- if (cm->filter_level < min_filter_level)
- cm->filter_level = min_filter_level;
-
- if (cm->filter_level > max_filter_level)
- cm->filter_level = max_filter_level;
-}
-
-// Stub function for now; Alt LF not used
-void vp9_set_alt_lf_level(VP9_COMP *cpi, int filt_val) {
-}
-
-void vp9_pick_filter_level(YV12_BUFFER_CONFIG *sd, VP9_COMP *cpi) {
- VP9_COMMON *cm = &cpi->common;
-
- int best_err = 0;
- int filt_err = 0;
- int min_filter_level = get_min_filter_level(cpi, cm->base_qindex);
- int max_filter_level = get_max_filter_level(cpi, cm->base_qindex);
-
- int filter_step;
- int filt_high = 0;
- int filt_mid = cm->filter_level; // Start search at previous frame filter level
- int filt_low = 0;
- int filt_best;
- int filt_direction = 0;
-
- int Bias = 0; // Bias against raising loop filter and in favour of lowering it
-
- // Make a copy of the unfiltered / processed recon buffer
-#if HAVE_ARMV7
-#if CONFIG_RUNTIME_CPU_DETECT
- if (cm->rtcd.flags & HAS_NEON)
-#endif
- {
- vp8_yv12_copy_frame_yonly_no_extend_frame_borders_neon(cm->frame_to_show, &cpi->last_frame_uf);
- }
-#if CONFIG_RUNTIME_CPU_DETECT
- else
-#endif
-#endif
-#if !HAVE_ARMV7 || CONFIG_RUNTIME_CPU_DETECT
- {
- vp8_yv12_copy_frame_ptr(cm->frame_to_show, &cpi->last_frame_uf);
- }
-#endif
-
- if (cm->frame_type == KEY_FRAME)
- cm->sharpness_level = 0;
- else
- cm->sharpness_level = cpi->oxcf.Sharpness;
-
- // Start the search at the previous frame filter level unless it is now out of range.
- filt_mid = cm->filter_level;
-
- if (filt_mid < min_filter_level)
- filt_mid = min_filter_level;
- else if (filt_mid > max_filter_level)
- filt_mid = max_filter_level;
-
- // Define the initial step size
- filter_step = (filt_mid < 16) ? 4 : filt_mid / 4;
-
- // Get baseline error score
- vp9_set_alt_lf_level(cpi, filt_mid);
- vp9_loop_filter_frame_yonly(cm, &cpi->mb.e_mbd, filt_mid);
-
- best_err = vp9_calc_ss_err(sd, cm->frame_to_show);
- filt_best = filt_mid;
-
- // Re-instate the unfiltered frame
-#if HAVE_ARMV7
-#if CONFIG_RUNTIME_CPU_DETECT
- if (cm->rtcd.flags & HAS_NEON)
-#endif
- {
- vp8_yv12_copy_frame_yonly_no_extend_frame_borders_neon(&cpi->last_frame_uf, cm->frame_to_show);
- }
-#if CONFIG_RUNTIME_CPU_DETECT
- else
-#endif
-#endif
-#if !HAVE_ARMV7 || CONFIG_RUNTIME_CPU_DETECT
- {
- vp8_yv12_copy_frame_yonly_ptr(&cpi->last_frame_uf, cm->frame_to_show);
- }
-#endif
-
- while (filter_step > 0) {
- Bias = (best_err >> (15 - (filt_mid / 8))) * filter_step; // PGW change 12/12/06 for small images
-
- // jbb chg: 20100118 - in sections with lots of new material coming in, don't bias as much to a low filter value
- if (cpi->twopass.section_intra_rating < 20)
- Bias = Bias * cpi->twopass.section_intra_rating / 20;
-
- // yx, bias less for large block size
- if (cpi->common.txfm_mode != ONLY_4X4)
- Bias >>= 1;
-
- filt_high = ((filt_mid + filter_step) > max_filter_level) ? max_filter_level : (filt_mid + filter_step);
- filt_low = ((filt_mid - filter_step) < min_filter_level) ? min_filter_level : (filt_mid - filter_step);
-
- if ((filt_direction <= 0) && (filt_low != filt_mid)) {
- // Get Low filter error score
- vp9_set_alt_lf_level(cpi, filt_low);
- vp9_loop_filter_frame_yonly(cm, &cpi->mb.e_mbd, filt_low);
-
- filt_err = vp9_calc_ss_err(sd, cm->frame_to_show);
-
- // Re-instate the unfiltered frame
-#if HAVE_ARMV7
-#if CONFIG_RUNTIME_CPU_DETECT
- if (cm->rtcd.flags & HAS_NEON)
-#endif
- {
- vp8_yv12_copy_frame_yonly_no_extend_frame_borders_neon(&cpi->last_frame_uf, cm->frame_to_show);
- }
-#if CONFIG_RUNTIME_CPU_DETECT
- else
-#endif
-#endif
-#if !HAVE_ARMV7 || CONFIG_RUNTIME_CPU_DETECT
- {
- vp8_yv12_copy_frame_yonly_ptr(&cpi->last_frame_uf, cm->frame_to_show);
- }
-#endif
-
- // If value is close to the best so far then bias towards a lower loop filter value.
- if ((filt_err - Bias) < best_err) {
- // Was it actually better than the previous best?
- if (filt_err < best_err)
- best_err = filt_err;
-
- filt_best = filt_low;
- }
- }
-
- // Now look at filt_high
- if ((filt_direction >= 0) && (filt_high != filt_mid)) {
- vp9_set_alt_lf_level(cpi, filt_high);
- vp9_loop_filter_frame_yonly(cm, &cpi->mb.e_mbd, filt_high);
-
- filt_err = vp9_calc_ss_err(sd, cm->frame_to_show);
-
- // Re-instate the unfiltered frame
-#if HAVE_ARMV7
-#if CONFIG_RUNTIME_CPU_DETECT
- if (cm->rtcd.flags & HAS_NEON)
-#endif
- {
- vp8_yv12_copy_frame_yonly_no_extend_frame_borders_neon(&cpi->last_frame_uf, cm->frame_to_show);
- }
-#if CONFIG_RUNTIME_CPU_DETECT
- else
-#endif
-#endif
-#if !HAVE_ARMV7 || CONFIG_RUNTIME_CPU_DETECT
- {
- vp8_yv12_copy_frame_yonly_ptr(&cpi->last_frame_uf, cm->frame_to_show);
- }
-#endif
-
- // Was it better than the previous best?
- if (filt_err < (best_err - Bias)) {
- best_err = filt_err;
- filt_best = filt_high;
- }
- }
-
- // Half the step distance if the best filter value was the same as last time
- if (filt_best == filt_mid) {
- filter_step = filter_step / 2;
- filt_direction = 0;
- } else {
- filt_direction = (filt_best < filt_mid) ? -1 : 1;
- filt_mid = filt_best;
- }
- }
-
- cm->filter_level = filt_best;
-}
-
--- a/vp8/encoder/ppc/csystemdependent.c
+++ /dev/null
@@ -1,155 +1,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include "vp8/encoder/variance.h"
-#include "vp8/encoder/onyx_int.h"
-
-SADFunction *vp9_sad16x16;
-SADFunction *vp9_sad16x8;
-SADFunction *vp9_sad8x16;
-SADFunction *vp9_sad8x8;
-SADFunction *vp9_sad4x4;
-
-variance_function *vp9_variance4x4;
-variance_function *vp9_variance8x8;
-variance_function *vp9_variance8x16;
-variance_function *vp9_variance16x8;
-variance_function *vp9_variance16x16;
-
-variance_function *vp9_mse16x16;
-
-sub_pixel_variance_function *vp9_sub_pixel_variance4x4;
-sub_pixel_variance_function *vp9_sub_pixel_variance8x8;
-sub_pixel_variance_function *vp9_sub_pixel_variance8x16;
-sub_pixel_variance_function *vp9_sub_pixel_variance16x8;
-sub_pixel_variance_function *vp9_sub_pixel_variance16x16;
-
-int (*vp9_block_error)(short *coeff, short *dqcoeff);
-int (*vp9_mbblock_error)(MACROBLOCK *mb, int dc);
-
-int (*vp9_mbuverror)(MACROBLOCK *mb);
-unsigned int (*vp9_get_mb_ss)(short *);
-void (*vp9_short_fdct4x4)(short *input, short *output, int pitch);
-void (*vp9_short_fdct8x4)(short *input, short *output, int pitch);
-void (*vp8_fast_fdct4x4)(short *input, short *output, int pitch);
-void (*vp8_fast_fdct8x4)(short *input, short *output, int pitch);
-void (*short_walsh4x4)(short *input, short *output, int pitch);
-
-void (*vp9_subtract_b)(BLOCK *be, BLOCKD *bd, int pitch);
-void (*vp9_subtract_mby)(short *diff, unsigned char *src, unsigned char *pred, int stride);
-void (*vp9_subtract_mbuv)(short *diff, unsigned char *usrc, unsigned char *vsrc, unsigned char *pred, int stride);
-void (*vp8_fast_quantize_b)(BLOCK *b, BLOCKD *d);
-
-// c imports
-extern int block_error_c(short *coeff, short *dqcoeff);
-extern int vp9_mbblock_error_c(MACROBLOCK *mb, int dc);
-
-extern int vp9_mbuverror_c(MACROBLOCK *mb);
-extern unsigned int vp8_get8x8var_c(unsigned char *src_ptr, int source_stride, unsigned char *ref_ptr, int recon_stride, unsigned int *SSE, int *Sum);
-extern void short_fdct4x4_c(short *input, short *output, int pitch);
-extern void short_fdct8x4_c(short *input, short *output, int pitch);
-extern void vp9_short_walsh4x4_c(short *input, short *output, int pitch);
-
-extern void vp9_subtract_b_c(BLOCK *be, BLOCKD *bd, int pitch);
-extern void subtract_mby_c(short *diff, unsigned char *src, unsigned char *pred, int stride);
-extern void subtract_mbuv_c(short *diff, unsigned char *usrc, unsigned char *vsrc, unsigned char *pred, int stride);
-extern void vp8_fast_quantize_b_c(BLOCK *b, BLOCKD *d);
-
-extern SADFunction sad16x16_c;
-extern SADFunction sad16x8_c;
-extern SADFunction sad8x16_c;
-extern SADFunction sad8x8_c;
-extern SADFunction sad4x4_c;
-
-extern variance_function variance16x16_c;
-extern variance_function variance8x16_c;
-extern variance_function variance16x8_c;
-extern variance_function variance8x8_c;
-extern variance_function variance4x4_c;
-extern variance_function mse16x16_c;
-
-extern sub_pixel_variance_function sub_pixel_variance4x4_c;
-extern sub_pixel_variance_function sub_pixel_variance8x8_c;
-extern sub_pixel_variance_function sub_pixel_variance8x16_c;
-extern sub_pixel_variance_function sub_pixel_variance16x8_c;
-extern sub_pixel_variance_function sub_pixel_variance16x16_c;
-
-extern unsigned int vp9_get_mb_ss_c(short *);
-
-// ppc
-extern int vp9_block_error_ppc(short *coeff, short *dqcoeff);
-
-extern void vp9_short_fdct4x4_ppc(short *input, short *output, int pitch);
-extern void vp9_short_fdct8x4_ppc(short *input, short *output, int pitch);
-
-extern void vp9_subtract_mby_ppc(short *diff, unsigned char *src, unsigned char *pred, int stride);
-extern void vp9_subtract_mbuv_ppc(short *diff, unsigned char *usrc, unsigned char *vsrc, unsigned char *pred, int stride);
-
-extern SADFunction vp9_sad16x16_ppc;
-extern SADFunction vp9_sad16x8_ppc;
-extern SADFunction vp9_sad8x16_ppc;
-extern SADFunction vp9_sad8x8_ppc;
-extern SADFunction vp9_sad4x4_ppc;
-
-extern variance_function vp9_variance16x16_ppc;
-extern variance_function vp9_variance8x16_ppc;
-extern variance_function vp9_variance16x8_ppc;
-extern variance_function vp9_variance8x8_ppc;
-extern variance_function vp9_variance4x4_ppc;
-extern variance_function vp9_mse16x16_ppc;
-
-extern sub_pixel_variance_function vp9_sub_pixel_variance4x4_ppc;
-extern sub_pixel_variance_function vp9_sub_pixel_variance8x8_ppc;
-extern sub_pixel_variance_function vp9_sub_pixel_variance8x16_ppc;
-extern sub_pixel_variance_function vp9_sub_pixel_variance16x8_ppc;
-extern sub_pixel_variance_function vp9_sub_pixel_variance16x16_ppc;
-
-extern unsigned int vp8_get8x8var_ppc(unsigned char *src_ptr, int source_stride, unsigned char *ref_ptr, int recon_stride, unsigned int *SSE, int *Sum);
-extern unsigned int vp8_get16x16var_ppc(unsigned char *src_ptr, int source_stride, unsigned char *ref_ptr, int recon_stride, unsigned int *SSE, int *Sum);
-
-void vp9_cmachine_specific_config(void) {
- // Pure C:
- vp9_mbuverror = vp9_mbuverror_c;
- vp8_fast_quantize_b = vp8_fast_quantize_b_c;
- vp9_short_fdct4x4 = vp9_short_fdct4x4_ppc;
- vp9_short_fdct8x4 = vp9_short_fdct8x4_ppc;
- vp8_fast_fdct4x4 = vp9_short_fdct4x4_ppc;
- vp8_fast_fdct8x4 = vp9_short_fdct8x4_ppc;
- short_walsh4x4 = vp9_short_walsh4x4_c;
-
- vp9_variance4x4 = vp9_variance4x4_ppc;
- vp9_variance8x8 = vp9_variance8x8_ppc;
- vp9_variance8x16 = vp9_variance8x16_ppc;
- vp9_variance16x8 = vp9_variance16x8_ppc;
- vp9_variance16x16 = vp9_variance16x16_ppc;
- vp9_mse16x16 = vp9_mse16x16_ppc;
-
- vp9_sub_pixel_variance4x4 = vp9_sub_pixel_variance4x4_ppc;
- vp9_sub_pixel_variance8x8 = vp9_sub_pixel_variance8x8_ppc;
- vp9_sub_pixel_variance8x16 = vp9_sub_pixel_variance8x16_ppc;
- vp9_sub_pixel_variance16x8 = vp9_sub_pixel_variance16x8_ppc;
- vp9_sub_pixel_variance16x16 = vp9_sub_pixel_variance16x16_ppc;
-
- vp9_get_mb_ss = vp9_get_mb_ss_c;
-
- vp9_sad16x16 = vp9_sad16x16_ppc;
- vp9_sad16x8 = vp9_sad16x8_ppc;
- vp9_sad8x16 = vp9_sad8x16_ppc;
- vp9_sad8x8 = vp9_sad8x8_ppc;
- vp9_sad4x4 = vp9_sad4x4_ppc;
-
- vp9_block_error = vp9_block_error_ppc;
- vp9_mbblock_error = vp9_mbblock_error_c;
-
- vp9_subtract_b = vp9_subtract_b_c;
- vp9_subtract_mby = vp9_subtract_mby_ppc;
- vp9_subtract_mbuv = vp9_subtract_mbuv_ppc;
-}
--- a/vp8/encoder/ppc/encodemb_altivec.asm
+++ /dev/null
@@ -1,153 +1,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
- .globl vp8_subtract_mbuv_ppc
- .globl vp8_subtract_mby_ppc
-
-;# r3 short *diff
-;# r4 unsigned char *usrc
-;# r5 unsigned char *vsrc
-;# r6 unsigned char *pred
-;# r7 int stride
-vp8_subtract_mbuv_ppc:
- mfspr r11, 256 ;# get old VRSAVE
- oris r12, r11, 0xf000
- mtspr 256, r12 ;# set VRSAVE
-
- li r9, 256
- add r3, r3, r9
- add r3, r3, r9
- add r6, r6, r9
-
- li r10, 16
- li r9, 4
- mtctr r9
-
- vspltisw v0, 0
-
-mbu_loop:
- lvsl v5, 0, r4 ;# permutate value for alignment
- lvx v1, 0, r4 ;# src
- lvx v2, 0, r6 ;# pred
-
- add r4, r4, r7
- addi r6, r6, 16
-
- vperm v1, v1, v0, v5
-
- vmrghb v3, v0, v1 ;# unpack high src to short
- vmrghb v4, v0, v2 ;# unpack high pred to short
-
- lvsl v5, 0, r4 ;# permutate value for alignment
- lvx v1, 0, r4 ;# src
-
- add r4, r4, r7
-
- vsubshs v3, v3, v4
-
- stvx v3, 0, r3 ;# store out diff
-
- vperm v1, v1, v0, v5
-
- vmrghb v3, v0, v1 ;# unpack high src to short
- vmrglb v4, v0, v2 ;# unpack high pred to short
-
- vsubshs v3, v3, v4
-
- stvx v3, r10, r3 ;# store out diff
-
- addi r3, r3, 32
-
- bdnz mbu_loop
-
- mtctr r9
-
-mbv_loop:
- lvsl v5, 0, r5 ;# permutate value for alignment
- lvx v1, 0, r5 ;# src
- lvx v2, 0, r6 ;# pred
-
- add r5, r5, r7
- addi r6, r6, 16
-
- vperm v1, v1, v0, v5
-
- vmrghb v3, v0, v1 ;# unpack high src to short
- vmrghb v4, v0, v2 ;# unpack high pred to short
-
- lvsl v5, 0, r5 ;# permutate value for alignment
- lvx v1, 0, r5 ;# src
-
- add r5, r5, r7
-
- vsubshs v3, v3, v4
-
- stvx v3, 0, r3 ;# store out diff
-
- vperm v1, v1, v0, v5
-
- vmrghb v3, v0, v1 ;# unpack high src to short
- vmrglb v4, v0, v2 ;# unpack high pred to short
-
- vsubshs v3, v3, v4
-
- stvx v3, r10, r3 ;# store out diff
-
- addi r3, r3, 32
-
- bdnz mbv_loop
-
- mtspr 256, r11 ;# reset old VRSAVE
-
- blr
-
-;# r3 short *diff
-;# r4 unsigned char *src
-;# r5 unsigned char *pred
-;# r6 int stride
-vp8_subtract_mby_ppc:
- mfspr r11, 256 ;# get old VRSAVE
- oris r12, r11, 0xf800
- mtspr 256, r12 ;# set VRSAVE
-
- li r10, 16
- mtctr r10
-
- vspltisw v0, 0
-
-mby_loop:
- lvx v1, 0, r4 ;# src
- lvx v2, 0, r5 ;# pred
-
- add r4, r4, r6
- addi r5, r5, 16
-
- vmrghb v3, v0, v1 ;# unpack high src to short
- vmrghb v4, v0, v2 ;# unpack high pred to short
-
- vsubshs v3, v3, v4
-
- stvx v3, 0, r3 ;# store out diff
-
- vmrglb v3, v0, v1 ;# unpack low src to short
- vmrglb v4, v0, v2 ;# unpack low pred to short
-
- vsubshs v3, v3, v4
-
- stvx v3, r10, r3 ;# store out diff
-
- addi r3, r3, 32
-
- bdnz mby_loop
-
- mtspr 256, r11 ;# reset old VRSAVE
-
- blr
--- a/vp8/encoder/ppc/fdct_altivec.asm
+++ /dev/null
@@ -1,205 +1,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
- .globl vp8_short_fdct4x4_ppc
- .globl vp8_short_fdct8x4_ppc
-
-.macro load_c V, LABEL, OFF, R0, R1
- lis \R0, \LABEL@ha
- la \R1, \LABEL@l(\R0)
- lvx \V, \OFF, \R1
-.endm
-
-;# Forward and inverse DCTs are nearly identical; only differences are
-;# in normalization (fwd is twice unitary, inv is half unitary)
-;# and that they are of course transposes of each other.
-;#
-;# The following three accomplish most of the implementation and
-;# are used only by ppc_idct.c and ppc_fdct.c.
-.macro prologue
- mfspr r11, 256 ;# get old VRSAVE
- oris r12, r11, 0xfffc
- mtspr 256, r12 ;# set VRSAVE
-
- stwu r1,-32(r1) ;# create space on the stack
-
- li r6, 16
-
- load_c v0, dct_tab, 0, r9, r10
- lvx v1, r6, r10
- addi r10, r10, 32
- lvx v2, 0, r10
- lvx v3, r6, r10
-
- load_c v4, ppc_dctperm_tab, 0, r9, r10
- load_c v5, ppc_dctperm_tab, r6, r9, r10
-
- load_c v6, round_tab, 0, r10, r9
-.endm
-
-.macro epilogue
- addi r1, r1, 32 ;# recover stack
-
- mtspr 256, r11 ;# reset old VRSAVE
-.endm
-
-;# Do horiz xf on two rows of coeffs v8 = a0 a1 a2 a3 b0 b1 b2 b3.
-;# a/A are the even rows 0,2 b/B are the odd rows 1,3
-;# For fwd transform, indices are horizontal positions, then frequencies.
-;# For inverse transform, frequencies then positions.
-;# The two resulting A0..A3 B0..B3 are later combined
-;# and vertically transformed.
-
-.macro two_rows_horiz Dst
- vperm v9, v8, v8, v4 ;# v9 = a2 a3 a0 a1 b2 b3 b0 b1
-
- vmsumshm v10, v0, v8, v6
- vmsumshm v10, v1, v9, v10
- vsraw v10, v10, v7 ;# v10 = A0 A1 B0 B1
-
- vmsumshm v11, v2, v8, v6
- vmsumshm v11, v3, v9, v11
- vsraw v11, v11, v7 ;# v11 = A2 A3 B2 B3
-
- vpkuwum v10, v10, v11 ;# v10 = A0 A1 B0 B1 A2 A3 B2 B3
- vperm \Dst, v10, v10, v5 ;# Dest = A0 B0 A1 B1 A2 B2 A3 B3
-.endm
-
-;# Vertical xf on two rows. DCT values in comments are for inverse transform;
-;# forward transform uses transpose.
-
-.macro two_rows_vert Ceven, Codd
- vspltw v8, \Ceven, 0 ;# v8 = c00 c10 or c02 c12 four times
- vspltw v9, \Codd, 0 ;# v9 = c20 c30 or c22 c32 ""
- vmsumshm v8, v8, v12, v6
- vmsumshm v8, v9, v13, v8
- vsraw v10, v8, v7
-
- vspltw v8, \Codd, 1 ;# v8 = c01 c11 or c03 c13
- vspltw v9, \Ceven, 1 ;# v9 = c21 c31 or c23 c33
- vmsumshm v8, v8, v12, v6
- vmsumshm v8, v9, v13, v8
- vsraw v8, v8, v7
-
- vpkuwum v8, v10, v8 ;# v8 = rows 0,1 or 2,3
-.endm
-
-.macro two_rows_h Dest
- stw r0, 0(r8)
- lwz r0, 4(r3)
- stw r0, 4(r8)
- lwzux r0, r3,r5
- stw r0, 8(r8)
- lwz r0, 4(r3)
- stw r0, 12(r8)
- lvx v8, 0,r8
- two_rows_horiz \Dest
-.endm
-
- .align 2
-;# r3 short *input
-;# r4 short *output
-;# r5 int pitch
-vp8_short_fdct4x4_ppc:
-
- prologue
-
- vspltisw v7, 14 ;# == 14, fits in 5 signed bits
- addi r8, r1, 0
-
-
- lwz r0, 0(r3)
- two_rows_h v12 ;# v12 = H00 H10 H01 H11 H02 H12 H03 H13
-
- lwzux r0, r3, r5
- two_rows_h v13 ;# v13 = H20 H30 H21 H31 H22 H32 H23 H33
-
- lvx v6, r6, r9 ;# v6 = Vround
- vspltisw v7, -16 ;# == 16 == -16, only low 5 bits matter
-
- two_rows_vert v0, v1
- stvx v8, 0, r4
- two_rows_vert v2, v3
- stvx v8, r6, r4
-
- epilogue
-
- blr
-
- .align 2
-;# r3 short *input
-;# r4 short *output
-;# r5 int pitch
-vp8_short_fdct8x4_ppc:
- prologue
-
- vspltisw v7, 14 ;# == 14, fits in 5 signed bits
- addi r8, r1, 0
- addi r10, r3, 0
-
- lwz r0, 0(r3)
- two_rows_h v12 ;# v12 = H00 H10 H01 H11 H02 H12 H03 H13
-
- lwzux r0, r3, r5
- two_rows_h v13 ;# v13 = H20 H30 H21 H31 H22 H32 H23 H33
-
- lvx v6, r6, r9 ;# v6 = Vround
- vspltisw v7, -16 ;# == 16 == -16, only low 5 bits matter
-
- two_rows_vert v0, v1
- stvx v8, 0, r4
- two_rows_vert v2, v3
- stvx v8, r6, r4
-
- ;# Next block
- addi r3, r10, 8
- addi r4, r4, 32
- lvx v6, 0, r9 ;# v6 = Hround
-
- vspltisw v7, 14 ;# == 14, fits in 5 signed bits
- addi r8, r1, 0
-
- lwz r0, 0(r3)
- two_rows_h v12 ;# v12 = H00 H10 H01 H11 H02 H12 H03 H13
-
- lwzux r0, r3, r5
- two_rows_h v13 ;# v13 = H20 H30 H21 H31 H22 H32 H23 H33
-
- lvx v6, r6, r9 ;# v6 = Vround
- vspltisw v7, -16 ;# == 16 == -16, only low 5 bits matter
-
- two_rows_vert v0, v1
- stvx v8, 0, r4
- two_rows_vert v2, v3
- stvx v8, r6, r4
-
- epilogue
-
- blr
-
- .data
- .align 4
-ppc_dctperm_tab:
- .byte 4,5,6,7, 0,1,2,3, 12,13,14,15, 8,9,10,11
- .byte 0,1,4,5, 2,3,6,7, 8,9,12,13, 10,11,14,15
-
- .align 4
-dct_tab:
- .short 23170, 23170,-12540,-30274, 23170, 23170,-12540,-30274
- .short 23170, 23170, 30274, 12540, 23170, 23170, 30274, 12540
-
- .short 23170,-23170, 30274,-12540, 23170,-23170, 30274,-12540
- .short -23170, 23170, 12540,-30274,-23170, 23170, 12540,-30274
-
- .align 4
-round_tab:
- .long (1 << (14-1)), (1 << (14-1)), (1 << (14-1)), (1 << (14-1))
- .long (1 << (16-1)), (1 << (16-1)), (1 << (16-1)), (1 << (16-1))
--- a/vp8/encoder/ppc/rdopt_altivec.asm
+++ /dev/null
@@ -1,51 +1,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
- .globl vp8_block_error_ppc
-
- .align 2
-;# r3 short *Coeff
-;# r4 short *dqcoeff
-vp8_block_error_ppc:
- mfspr r11, 256 ;# get old VRSAVE
- oris r12, r11, 0xf800
- mtspr 256, r12 ;# set VRSAVE
-
- stwu r1,-32(r1) ;# create space on the stack
-
- stw r5, 12(r1) ;# transfer dc to vector register
-
- lvx v0, 0, r3 ;# Coeff
- lvx v1, 0, r4 ;# dqcoeff
-
- li r10, 16
-
- vspltisw v3, 0
-
- vsubshs v0, v0, v1
-
- vmsumshm v2, v0, v0, v3 ;# multiply differences
-
- lvx v0, r10, r3 ;# Coeff
- lvx v1, r10, r4 ;# dqcoeff
-
- vsubshs v0, v0, v1
-
- vmsumshm v1, v0, v0, v2 ;# multiply differences
- vsumsws v1, v1, v3 ;# sum up
-
- stvx v1, 0, r1
- lwz r3, 12(r1) ;# return value
-
- addi r1, r1, 32 ;# recover stack
- mtspr 256, r11 ;# reset old VRSAVE
-
- blr
--- a/vp8/encoder/ppc/sad_altivec.asm
+++ /dev/null
@@ -1,277 +1,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
- .globl vp8_sad16x16_ppc
- .globl vp8_sad16x8_ppc
- .globl vp8_sad8x16_ppc
- .globl vp8_sad8x8_ppc
- .globl vp8_sad4x4_ppc
-
-.macro load_aligned_16 V R O
- lvsl v3, 0, \R ;# permutate value for alignment
-
- lvx v1, 0, \R
- lvx v2, \O, \R
-
- vperm \V, v1, v2, v3
-.endm
-
-.macro prologue
- mfspr r11, 256 ;# get old VRSAVE
- oris r12, r11, 0xffc0
- mtspr 256, r12 ;# set VRSAVE
-
- stwu r1, -32(r1) ;# create space on the stack
-
- li r10, 16 ;# load offset and loop counter
-
- vspltisw v8, 0 ;# zero out total to start
-.endm
-
-.macro epilogue
- addi r1, r1, 32 ;# recover stack
-
- mtspr 256, r11 ;# reset old VRSAVE
-.endm
-
-.macro SAD_16
- ;# v6 = abs (v4 - v5)
- vsububs v6, v4, v5
- vsububs v7, v5, v4
- vor v6, v6, v7
-
- ;# v8 += abs (v4 - v5)
- vsum4ubs v8, v6, v8
-.endm
-
-.macro sad_16_loop loop_label
- lvsl v3, 0, r5 ;# only needs to be done once per block
-
- ;# preload a line of data before getting into the loop
- lvx v4, 0, r3
- lvx v1, 0, r5
- lvx v2, r10, r5
-
- add r5, r5, r6
- add r3, r3, r4
-
- vperm v5, v1, v2, v3
-
- .align 4
-\loop_label:
- ;# compute difference on first row
- vsububs v6, v4, v5
- vsububs v7, v5, v4
-
- ;# load up next set of data
- lvx v9, 0, r3
- lvx v1, 0, r5
- lvx v2, r10, r5
-
- ;# perform abs() of difference
- vor v6, v6, v7
- add r3, r3, r4
-
- ;# add to the running tally
- vsum4ubs v8, v6, v8
-
- ;# now onto the next line
- vperm v5, v1, v2, v3
- add r5, r5, r6
- lvx v4, 0, r3
-
- ;# compute difference on second row
- vsububs v6, v9, v5
- lvx v1, 0, r5
- vsububs v7, v5, v9
- lvx v2, r10, r5
- vor v6, v6, v7
- add r3, r3, r4
- vsum4ubs v8, v6, v8
- vperm v5, v1, v2, v3
- add r5, r5, r6
-
- bdnz \loop_label
-
- vspltisw v7, 0
-
- vsumsws v8, v8, v7
-
- stvx v8, 0, r1
- lwz r3, 12(r1)
-.endm
-
-.macro sad_8_loop loop_label
- .align 4
-\loop_label:
- ;# only one of the inputs should need to be aligned.
- load_aligned_16 v4, r3, r10
- load_aligned_16 v5, r5, r10
-
- ;# move onto the next line
- add r3, r3, r4
- add r5, r5, r6
-
- ;# only one of the inputs should need to be aligned.
- load_aligned_16 v6, r3, r10
- load_aligned_16 v7, r5, r10
-
- ;# move onto the next line
- add r3, r3, r4
- add r5, r5, r6
-
- vmrghb v4, v4, v6
- vmrghb v5, v5, v7
-
- SAD_16
-
- bdnz \loop_label
-
- vspltisw v7, 0
-
- vsumsws v8, v8, v7
-
- stvx v8, 0, r1
- lwz r3, 12(r1)
-.endm
-
- .align 2
-;# r3 unsigned char *src_ptr
-;# r4 int src_stride
-;# r5 unsigned char *ref_ptr
-;# r6 int ref_stride
-;#
-;# r3 return value
-vp8_sad16x16_ppc:
-
- prologue
-
- li r9, 8
- mtctr r9
-
- sad_16_loop sad16x16_loop
-
- epilogue
-
- blr
-
- .align 2
-;# r3 unsigned char *src_ptr
-;# r4 int src_stride
-;# r5 unsigned char *ref_ptr
-;# r6 int ref_stride
-;#
-;# r3 return value
-vp8_sad16x8_ppc:
-
- prologue
-
- li r9, 4
- mtctr r9
-
- sad_16_loop sad16x8_loop
-
- epilogue
-
- blr
-
- .align 2
-;# r3 unsigned char *src_ptr
-;# r4 int src_stride
-;# r5 unsigned char *ref_ptr
-;# r6 int ref_stride
-;#
-;# r3 return value
-vp8_sad8x16_ppc:
-
- prologue
-
- li r9, 8
- mtctr r9
-
- sad_8_loop sad8x16_loop
-
- epilogue
-
- blr
-
- .align 2
-;# r3 unsigned char *src_ptr
-;# r4 int src_stride
-;# r5 unsigned char *ref_ptr
-;# r6 int ref_stride
-;#
-;# r3 return value
-vp8_sad8x8_ppc:
-
- prologue
-
- li r9, 4
- mtctr r9
-
- sad_8_loop sad8x8_loop
-
- epilogue
-
- blr
-
-.macro transfer_4x4 I P
- lwz r0, 0(\I)
- add \I, \I, \P
-
- lwz r7, 0(\I)
- add \I, \I, \P
-
- lwz r8, 0(\I)
- add \I, \I, \P
-
- lwz r9, 0(\I)
-
- stw r0, 0(r1)
- stw r7, 4(r1)
- stw r8, 8(r1)
- stw r9, 12(r1)
-.endm
-
- .align 2
-;# r3 unsigned char *src_ptr
-;# r4 int src_stride
-;# r5 unsigned char *ref_ptr
-;# r6 int ref_stride
-;#
-;# r3 return value
-vp8_sad4x4_ppc:
-
- prologue
-
- transfer_4x4 r3, r4
- lvx v4, 0, r1
-
- transfer_4x4 r5, r6
- lvx v5, 0, r1
-
- vspltisw v8, 0 ;# zero out total to start
-
- ;# v6 = abs (v4 - v5)
- vsububs v6, v4, v5
- vsububs v7, v5, v4
- vor v6, v6, v7
-
- ;# v8 += abs (v4 - v5)
- vsum4ubs v7, v6, v8
- vsumsws v7, v7, v8
-
- stvx v7, 0, r1
- lwz r3, 12(r1)
-
- epilogue
-
- blr
--- a/vp8/encoder/ppc/variance_altivec.asm
+++ /dev/null
@@ -1,375 +1,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
- .globl vp8_get8x8var_ppc
- .globl vp8_get16x16var_ppc
- .globl vp8_mse16x16_ppc
- .globl vp9_variance16x16_ppc
- .globl vp9_variance16x8_ppc
- .globl vp9_variance8x16_ppc
- .globl vp9_variance8x8_ppc
- .globl vp9_variance4x4_ppc
-
-.macro load_aligned_16 V R O
- lvsl v3, 0, \R ;# permutate value for alignment
-
- lvx v1, 0, \R
- lvx v2, \O, \R
-
- vperm \V, v1, v2, v3
-.endm
-
-.macro prologue
- mfspr r11, 256 ;# get old VRSAVE
- oris r12, r11, 0xffc0
- mtspr 256, r12 ;# set VRSAVE
-
- stwu r1, -32(r1) ;# create space on the stack
-
- li r10, 16 ;# load offset and loop counter
-
- vspltisw v7, 0 ;# zero for merging
- vspltisw v8, 0 ;# zero out total to start
- vspltisw v9, 0 ;# zero out total for dif^2
-.endm
-
-.macro epilogue
- addi r1, r1, 32 ;# recover stack
-
- mtspr 256, r11 ;# reset old VRSAVE
-.endm
-
-.macro compute_sum_sse
- ;# Compute sum first. Unpack so that signed subtract
- ;# can be used. Only have a halfword signed
- ;# subtract. Do high, then low.
- vmrghb v2, v7, v4
- vmrghb v3, v7, v5
- vsubshs v2, v2, v3
- vsum4shs v8, v2, v8
-
- vmrglb v2, v7, v4
- vmrglb v3, v7, v5
- vsubshs v2, v2, v3
- vsum4shs v8, v2, v8
-
- ;# Now compute sse.
- vsububs v2, v4, v5
- vsububs v3, v5, v4
- vor v2, v2, v3
-
- vmsumubm v9, v2, v2, v9
-.endm
-
-.macro variance_16 DS loop_label store_sum
-\loop_label:
- ;# only one of the inputs should need to be aligned.
- load_aligned_16 v4, r3, r10
- load_aligned_16 v5, r5, r10
-
- ;# move onto the next line
- add r3, r3, r4
- add r5, r5, r6
-
- compute_sum_sse
-
- bdnz \loop_label
-
- vsumsws v8, v8, v7
- vsumsws v9, v9, v7
-
- stvx v8, 0, r1
- lwz r3, 12(r1)
-
- stvx v9, 0, r1
- lwz r4, 12(r1)
-
-.if \store_sum
- stw r3, 0(r8) ;# sum
-.endif
- stw r4, 0(r7) ;# sse
-
- mullw r3, r3, r3 ;# sum*sum
- srawi r3, r3, \DS ;# (sum*sum) >> DS
- subf r3, r3, r4 ;# sse - ((sum*sum) >> DS)
-.endm
-
-.macro variance_8 DS loop_label store_sum
-\loop_label:
- ;# only one of the inputs should need to be aligned.
- load_aligned_16 v4, r3, r10
- load_aligned_16 v5, r5, r10
-
- ;# move onto the next line
- add r3, r3, r4
- add r5, r5, r6
-
- ;# only one of the inputs should need to be aligned.
- load_aligned_16 v6, r3, r10
- load_aligned_16 v0, r5, r10
-
- ;# move onto the next line
- add r3, r3, r4
- add r5, r5, r6
-
- vmrghb v4, v4, v6
- vmrghb v5, v5, v0
-
- compute_sum_sse
-
- bdnz \loop_label
-
- vsumsws v8, v8, v7
- vsumsws v9, v9, v7
-
- stvx v8, 0, r1
- lwz r3, 12(r1)
-
- stvx v9, 0, r1
- lwz r4, 12(r1)
-
-.if \store_sum
- stw r3, 0(r8) ;# sum
-.endif
- stw r4, 0(r7) ;# sse
-
- mullw r3, r3, r3 ;# sum*sum
- srawi r3, r3, \DS ;# (sum*sum) >> DS
- subf r3, r3, r4 ;# sse - ((sum*sum) >> DS)
-.endm
-
- .align 2
-;# r3 unsigned char *src_ptr
-;# r4 int source_stride
-;# r5 unsigned char *ref_ptr
-;# r6 int recon_stride
-;# r7 unsigned int *SSE
-;# r8 int *Sum
-;#
-;# r3 return value
-vp8_get8x8var_ppc:
-
- prologue
-
- li r9, 4
- mtctr r9
-
- variance_8 6, get8x8var_loop, 1
-
- epilogue
-
- blr
-
- .align 2
-;# r3 unsigned char *src_ptr
-;# r4 int source_stride
-;# r5 unsigned char *ref_ptr
-;# r6 int recon_stride
-;# r7 unsigned int *SSE
-;# r8 int *Sum
-;#
-;# r3 return value
-vp8_get16x16var_ppc:
-
- prologue
-
- mtctr r10
-
- variance_16 8, get16x16var_loop, 1
-
- epilogue
-
- blr
-
- .align 2
-;# r3 unsigned char *src_ptr
-;# r4 int source_stride
-;# r5 unsigned char *ref_ptr
-;# r6 int recon_stride
-;# r7 unsigned int *sse
-;#
-;# r3 return value
-vp8_mse16x16_ppc:
- prologue
-
- mtctr r10
-
-mse16x16_loop:
- ;# only one of the inputs should need to be aligned.
- load_aligned_16 v4, r3, r10
- load_aligned_16 v5, r5, r10
-
- ;# move onto the next line
- add r3, r3, r4
- add r5, r5, r6
-
- ;# Now compute sse.
- vsububs v2, v4, v5
- vsububs v3, v5, v4
- vor v2, v2, v3
-
- vmsumubm v9, v2, v2, v9
-
- bdnz mse16x16_loop
-
- vsumsws v9, v9, v7
-
- stvx v9, 0, r1
- lwz r3, 12(r1)
-
- stvx v9, 0, r1
- lwz r3, 12(r1)
-
- stw r3, 0(r7) ;# sse
-
- epilogue
-
- blr
-
- .align 2
-;# r3 unsigned char *src_ptr
-;# r4 int source_stride
-;# r5 unsigned char *ref_ptr
-;# r6 int recon_stride
-;# r7 unsigned int *sse
-;#
-;# r3 return value
-vp9_variance16x16_ppc:
-
- prologue
-
- mtctr r10
-
- variance_16 8, variance16x16_loop, 0
-
- epilogue
-
- blr
-
- .align 2
-;# r3 unsigned char *src_ptr
-;# r4 int source_stride
-;# r5 unsigned char *ref_ptr
-;# r6 int recon_stride
-;# r7 unsigned int *sse
-;#
-;# r3 return value
-vp9_variance16x8_ppc:
-
- prologue
-
- li r9, 8
- mtctr r9
-
- variance_16 7, variance16x8_loop, 0
-
- epilogue
-
- blr
-
- .align 2
-;# r3 unsigned char *src_ptr
-;# r4 int source_stride
-;# r5 unsigned char *ref_ptr
-;# r6 int recon_stride
-;# r7 unsigned int *sse
-;#
-;# r3 return value
-vp9_variance8x16_ppc:
-
- prologue
-
- li r9, 8
- mtctr r9
-
- variance_8 7, variance8x16_loop, 0
-
- epilogue
-
- blr
-
- .align 2
-;# r3 unsigned char *src_ptr
-;# r4 int source_stride
-;# r5 unsigned char *ref_ptr
-;# r6 int recon_stride
-;# r7 unsigned int *sse
-;#
-;# r3 return value
-vp9_variance8x8_ppc:
-
- prologue
-
- li r9, 4
- mtctr r9
-
- variance_8 6, variance8x8_loop, 0
-
- epilogue
-
- blr
-
-.macro transfer_4x4 I P
- lwz r0, 0(\I)
- add \I, \I, \P
-
- lwz r10,0(\I)
- add \I, \I, \P
-
- lwz r8, 0(\I)
- add \I, \I, \P
-
- lwz r9, 0(\I)
-
- stw r0, 0(r1)
- stw r10, 4(r1)
- stw r8, 8(r1)
- stw r9, 12(r1)
-.endm
-
- .align 2
-;# r3 unsigned char *src_ptr
-;# r4 int source_stride
-;# r5 unsigned char *ref_ptr
-;# r6 int recon_stride
-;# r7 unsigned int *sse
-;#
-;# r3 return value
-vp9_variance4x4_ppc:
-
- prologue
-
- transfer_4x4 r3, r4
- lvx v4, 0, r1
-
- transfer_4x4 r5, r6
- lvx v5, 0, r1
-
- compute_sum_sse
-
- vsumsws v8, v8, v7
- vsumsws v9, v9, v7
-
- stvx v8, 0, r1
- lwz r3, 12(r1)
-
- stvx v9, 0, r1
- lwz r4, 12(r1)
-
- stw r4, 0(r7) ;# sse
-
- mullw r3, r3, r3 ;# sum*sum
- srawi r3, r3, 4 ;# (sum*sum) >> 4
- subf r3, r3, r4 ;# sse - ((sum*sum) >> 4)
-
- epilogue
-
- blr
--- a/vp8/encoder/ppc/variance_subpixel_altivec.asm
+++ /dev/null
@@ -1,865 +1,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
- .globl vp9_sub_pixel_variance4x4_ppc
- .globl vp9_sub_pixel_variance8x8_ppc
- .globl vp9_sub_pixel_variance8x16_ppc
- .globl vp9_sub_pixel_variance16x8_ppc
- .globl vp9_sub_pixel_variance16x16_ppc
-
-.macro load_c V, LABEL, OFF, R0, R1
- lis \R0, \LABEL@ha
- la \R1, \LABEL@l(\R0)
- lvx \V, \OFF, \R1
-.endm
-
-.macro load_vfilter V0, V1
- load_c \V0, vfilter_b, r6, r12, r10
-
- addi r6, r6, 16
- lvx \V1, r6, r10
-.endm
-
-.macro HProlog jump_label
- ;# load up horizontal filter
- slwi. r5, r5, 4 ;# index into horizontal filter array
-
- ;# index to the next set of vectors in the row.
- li r10, 16
-
- ;# downshift by 7 ( divide by 128 ) at the end
- vspltish v19, 7
-
- ;# If there isn't any filtering to be done for the horizontal, then
- ;# just skip to the second pass.
- beq \jump_label
-
- load_c v20, hfilter_b, r5, r12, r0
-
- ;# setup constants
- ;# v14 permutation value for alignment
- load_c v28, b_hperm_b, 0, r12, r0
-
- ;# index to the next set of vectors in the row.
- li r12, 32
-
- ;# rounding added in on the multiply
- vspltisw v21, 8
- vspltisw v18, 3
- vslw v18, v21, v18 ;# 0x00000040000000400000004000000040
-
- slwi. r6, r6, 5 ;# index into vertical filter array
-.endm
-
-;# Filters a horizontal line
-;# expects:
-;# r3 src_ptr
-;# r4 pitch
-;# r10 16
-;# r12 32
-;# v17 perm input
-;# v18 rounding
-;# v19 shift
-;# v20 filter taps
-;# v21 tmp
-;# v22 tmp
-;# v23 tmp
-;# v24 tmp
-;# v25 tmp
-;# v26 tmp
-;# v27 tmp
-;# v28 perm output
-;#
-
-.macro hfilter_8 V, hp, lp, increment_counter
- lvsl v17, 0, r3 ;# permutate value for alignment
-
- ;# input to filter is 9 bytes wide, output is 8 bytes.
- lvx v21, 0, r3
- lvx v22, r10, r3
-
-.if \increment_counter
- add r3, r3, r4
-.endif
- vperm v21, v21, v22, v17
-
- vperm v24, v21, v21, \hp ;# v24 = 0123 1234 2345 3456
- vperm v25, v21, v21, \lp ;# v25 = 4567 5678 6789 789A
-
- vmsummbm v24, v20, v24, v18
- vmsummbm v25, v20, v25, v18
-
- vpkswus v24, v24, v25 ;# v24 = 0 4 8 C 1 5 9 D (16-bit)
-
- vsrh v24, v24, v19 ;# divide v0, v1 by 128
-
- vpkuhus \V, v24, v24 ;# \V = scrambled 8-bit result
-.endm
-
-.macro vfilter_16 P0 P1
- vmuleub v22, \P0, v20 ;# 64 + 4 positive taps
- vadduhm v22, v18, v22
- vmuloub v23, \P0, v20
- vadduhm v23, v18, v23
-
- vmuleub v24, \P1, v21
- vadduhm v22, v22, v24 ;# Re = evens, saturation unnecessary
- vmuloub v25, \P1, v21
- vadduhm v23, v23, v25 ;# Ro = odds
-
- vsrh v22, v22, v19 ;# divide by 128
- vsrh v23, v23, v19 ;# v22 v23 = evens, odds
- vmrghh \P0, v22, v23 ;# merge to 16-bit result in order
- vmrglh v23, v22, v23
- vpkuhus \P0, \P0, v23 ;# P0 = 8-bit result
-.endm
-
-.macro compute_sum_sse src, ref, sum, sse, t1, t2, z0
- ;# Compute sum first. Unpack so that signed subtract
- ;# can be used. Only have a halfword signed
- ;# subtract. Do high, then low.
- vmrghb \t1, \z0, \src
- vmrghb \t2, \z0, \ref
- vsubshs \t1, \t1, \t2
- vsum4shs \sum, \t1, \sum
-
- vmrglb \t1, \z0, \src
- vmrglb \t2, \z0, \ref
- vsubshs \t1, \t1, \t2
- vsum4shs \sum, \t1, \sum
-
- ;# Now compute sse.
- vsububs \t1, \src, \ref
- vsububs \t2, \ref, \src
- vor \t1, \t1, \t2
-
- vmsumubm \sse, \t1, \t1, \sse
-.endm
-
-.macro variance_final sum, sse, z0, DS
- vsumsws \sum, \sum, \z0
- vsumsws \sse, \sse, \z0
-
- stvx \sum, 0, r1
- lwz r3, 12(r1)
-
- stvx \sse, 0, r1
- lwz r4, 12(r1)
-
- stw r4, 0(r9) ;# sse
-
- mullw r3, r3, r3 ;# sum*sum
- srawi r3, r3, \DS ;# (sum*sum) >> DS
- subf r3, r3, r4 ;# sse - ((sum*sum) >> DS)
-.endm
-
-.macro compute_sum_sse_16 V, increment_counter
- load_and_align_16 v16, r7, r8, \increment_counter
- compute_sum_sse \V, v16, v18, v19, v20, v21, v23
-.endm
-
-.macro load_and_align_16 V, R, P, increment_counter
- lvsl v17, 0, \R ;# permutate value for alignment
-
- ;# input to filter is 21 bytes wide, output is 16 bytes.
- ;# input can span three vectors if not aligned correctly.
- lvx v21, 0, \R
- lvx v22, r10, \R
-
-.if \increment_counter
- add \R, \R, \P
-.endif
-
- vperm \V, v21, v22, v17
-.endm
-
- .align 2
-;# r3 unsigned char *src_ptr
-;# r4 int src_pixels_per_line
-;# r5 int xoffset
-;# r6 int yoffset
-;# r7 unsigned char *dst_ptr
-;# r8 int dst_pixels_per_line
-;# r9 unsigned int *sse
-;#
-;# r3 return value
-vp9_sub_pixel_variance4x4_ppc:
- mfspr r11, 256 ;# get old VRSAVE
- oris r12, r11, 0xf830
- ori r12, r12, 0xfff8
- mtspr 256, r12 ;# set VRSAVE
-
- stwu r1,-32(r1) ;# create space on the stack
-
- HProlog second_pass_4x4_pre_copy_b
-
- ;# Load up permutation constants
- load_c v10, b_0123_b, 0, r12, r0
- load_c v11, b_4567_b, 0, r12, r0
-
- hfilter_8 v0, v10, v11, 1
- hfilter_8 v1, v10, v11, 1
- hfilter_8 v2, v10, v11, 1
- hfilter_8 v3, v10, v11, 1
-
- ;# Finished filtering main horizontal block. If there is no
- ;# vertical filtering, jump to storing the data. Otherwise
- ;# load up and filter the additional line that is needed
- ;# for the vertical filter.
- beq compute_sum_sse_4x4_b
-
- hfilter_8 v4, v10, v11, 0
-
- b second_pass_4x4_b
-
-second_pass_4x4_pre_copy_b:
- slwi r6, r6, 5 ;# index into vertical filter array
-
- load_and_align_16 v0, r3, r4, 1
- load_and_align_16 v1, r3, r4, 1
- load_and_align_16 v2, r3, r4, 1
- load_and_align_16 v3, r3, r4, 1
- load_and_align_16 v4, r3, r4, 0
-
-second_pass_4x4_b:
- vspltish v20, 8
- vspltish v18, 3
- vslh v18, v20, v18 ;# 0x0040 0040 0040 0040 0040 0040 0040 0040
-
- load_vfilter v20, v21
-
- vfilter_16 v0, v1
- vfilter_16 v1, v2
- vfilter_16 v2, v3
- vfilter_16 v3, v4
-
-compute_sum_sse_4x4_b:
- vspltish v18, 0 ;# sum
- vspltish v19, 0 ;# sse
- vspltish v23, 0 ;# unpack
- li r10, 16
-
- load_and_align_16 v4, r7, r8, 1
- load_and_align_16 v5, r7, r8, 1
- load_and_align_16 v6, r7, r8, 1
- load_and_align_16 v7, r7, r8, 1
-
- vmrghb v0, v0, v1
- vmrghb v1, v2, v3
-
- vmrghb v2, v4, v5
- vmrghb v3, v6, v7
-
- load_c v10, b_hilo_b, 0, r12, r0
-
- vperm v0, v0, v1, v10
- vperm v1, v2, v3, v10
-
- compute_sum_sse v0, v1, v18, v19, v20, v21, v23
-
- variance_final v18, v19, v23, 4
-
- addi r1, r1, 32 ;# recover stack
- mtspr 256, r11 ;# reset old VRSAVE
-
- blr
-
- .align 2
-;# r3 unsigned char *src_ptr
-;# r4 int src_pixels_per_line
-;# r5 int xoffset
-;# r6 int yoffset
-;# r7 unsigned char *dst_ptr
-;# r8 int dst_pixels_per_line
-;# r9 unsigned int *sse
-;#
-;# r3 return value
-vp9_sub_pixel_variance8x8_ppc:
- mfspr r11, 256 ;# get old VRSAVE
- oris r12, r11, 0xfff0
- ori r12, r12, 0xffff
- mtspr 256, r12 ;# set VRSAVE
-
- stwu r1,-32(r1) ;# create space on the stack
-
- HProlog second_pass_8x8_pre_copy_b
-
- ;# Load up permutation constants
- load_c v10, b_0123_b, 0, r12, r0
- load_c v11, b_4567_b, 0, r12, r0
-
- hfilter_8 v0, v10, v11, 1
- hfilter_8 v1, v10, v11, 1
- hfilter_8 v2, v10, v11, 1
- hfilter_8 v3, v10, v11, 1
- hfilter_8 v4, v10, v11, 1
- hfilter_8 v5, v10, v11, 1
- hfilter_8 v6, v10, v11, 1
- hfilter_8 v7, v10, v11, 1
-
- ;# Finished filtering main horizontal block. If there is no
- ;# vertical filtering, jump to storing the data. Otherwise
- ;# load up and filter the additional line that is needed
- ;# for the vertical filter.
- beq compute_sum_sse_8x8_b
-
- hfilter_8 v8, v10, v11, 0
-
- b second_pass_8x8_b
-
-second_pass_8x8_pre_copy_b:
- slwi. r6, r6, 5 ;# index into vertical filter array
-
- load_and_align_16 v0, r3, r4, 1
- load_and_align_16 v1, r3, r4, 1
- load_and_align_16 v2, r3, r4, 1
- load_and_align_16 v3, r3, r4, 1
- load_and_align_16 v4, r3, r4, 1
- load_and_align_16 v5, r3, r4, 1
- load_and_align_16 v6, r3, r4, 1
- load_and_align_16 v7, r3, r4, 1
- load_and_align_16 v8, r3, r4, 0
-
- beq compute_sum_sse_8x8_b
-
-second_pass_8x8_b:
- vspltish v20, 8
- vspltish v18, 3
- vslh v18, v20, v18 ;# 0x0040 0040 0040 0040 0040 0040 0040 0040
-
- load_vfilter v20, v21
-
- vfilter_16 v0, v1
- vfilter_16 v1, v2
- vfilter_16 v2, v3
- vfilter_16 v3, v4
- vfilter_16 v4, v5
- vfilter_16 v5, v6
- vfilter_16 v6, v7
- vfilter_16 v7, v8
-
-compute_sum_sse_8x8_b:
- vspltish v18, 0 ;# sum
- vspltish v19, 0 ;# sse
- vspltish v23, 0 ;# unpack
- li r10, 16
-
- vmrghb v0, v0, v1
- vmrghb v1, v2, v3
- vmrghb v2, v4, v5
- vmrghb v3, v6, v7
-
- load_and_align_16 v4, r7, r8, 1
- load_and_align_16 v5, r7, r8, 1
- load_and_align_16 v6, r7, r8, 1
- load_and_align_16 v7, r7, r8, 1
- load_and_align_16 v8, r7, r8, 1
- load_and_align_16 v9, r7, r8, 1
- load_and_align_16 v10, r7, r8, 1
- load_and_align_16 v11, r7, r8, 0
-
- vmrghb v4, v4, v5
- vmrghb v5, v6, v7
- vmrghb v6, v8, v9
- vmrghb v7, v10, v11
-
- compute_sum_sse v0, v4, v18, v19, v20, v21, v23
- compute_sum_sse v1, v5, v18, v19, v20, v21, v23
- compute_sum_sse v2, v6, v18, v19, v20, v21, v23
- compute_sum_sse v3, v7, v18, v19, v20, v21, v23
-
- variance_final v18, v19, v23, 6
-
- addi r1, r1, 32 ;# recover stack
- mtspr 256, r11 ;# reset old VRSAVE
- blr
-
- .align 2
-;# r3 unsigned char *src_ptr
-;# r4 int src_pixels_per_line
-;# r5 int xoffset
-;# r6 int yoffset
-;# r7 unsigned char *dst_ptr
-;# r8 int dst_pixels_per_line
-;# r9 unsigned int *sse
-;#
-;# r3 return value
-vp9_sub_pixel_variance8x16_ppc:
- mfspr r11, 256 ;# get old VRSAVE
- oris r12, r11, 0xffff
- ori r12, r12, 0xfffc
- mtspr 256, r12 ;# set VRSAVE
-
- stwu r1,-32(r1) ;# create space on the stack
-
- HProlog second_pass_8x16_pre_copy_b
-
- ;# Load up permutation constants
- load_c v29, b_0123_b, 0, r12, r0
- load_c v30, b_4567_b, 0, r12, r0
-
- hfilter_8 v0, v29, v30, 1
- hfilter_8 v1, v29, v30, 1
- hfilter_8 v2, v29, v30, 1
- hfilter_8 v3, v29, v30, 1
- hfilter_8 v4, v29, v30, 1
- hfilter_8 v5, v29, v30, 1
- hfilter_8 v6, v29, v30, 1
- hfilter_8 v7, v29, v30, 1
- hfilter_8 v8, v29, v30, 1
- hfilter_8 v9, v29, v30, 1
- hfilter_8 v10, v29, v30, 1
- hfilter_8 v11, v29, v30, 1
- hfilter_8 v12, v29, v30, 1
- hfilter_8 v13, v29, v30, 1
- hfilter_8 v14, v29, v30, 1
- hfilter_8 v15, v29, v30, 1
-
- ;# Finished filtering main horizontal block. If there is no
- ;# vertical filtering, jump to storing the data. Otherwise
- ;# load up and filter the additional line that is needed
- ;# for the vertical filter.
- beq compute_sum_sse_8x16_b
-
- hfilter_8 v16, v29, v30, 0
-
- b second_pass_8x16_b
-
-second_pass_8x16_pre_copy_b:
- slwi. r6, r6, 5 ;# index into vertical filter array
-
- load_and_align_16 v0, r3, r4, 1
- load_and_align_16 v1, r3, r4, 1
- load_and_align_16 v2, r3, r4, 1
- load_and_align_16 v3, r3, r4, 1
- load_and_align_16 v4, r3, r4, 1
- load_and_align_16 v5, r3, r4, 1
- load_and_align_16 v6, r3, r4, 1
- load_and_align_16 v7, r3, r4, 1
- load_and_align_16 v8, r3, r4, 1
- load_and_align_16 v9, r3, r4, 1
- load_and_align_16 v10, r3, r4, 1
- load_and_align_16 v11, r3, r4, 1
- load_and_align_16 v12, r3, r4, 1
- load_and_align_16 v13, r3, r4, 1
- load_and_align_16 v14, r3, r4, 1
- load_and_align_16 v15, r3, r4, 1
- load_and_align_16 v16, r3, r4, 0
-
- beq compute_sum_sse_8x16_b
-
-second_pass_8x16_b:
- vspltish v20, 8
- vspltish v18, 3
- vslh v18, v20, v18 ;# 0x0040 0040 0040 0040 0040 0040 0040 0040
-
- load_vfilter v20, v21
-
- vfilter_16 v0, v1
- vfilter_16 v1, v2
- vfilter_16 v2, v3
- vfilter_16 v3, v4
- vfilter_16 v4, v5
- vfilter_16 v5, v6
- vfilter_16 v6, v7
- vfilter_16 v7, v8
- vfilter_16 v8, v9
- vfilter_16 v9, v10
- vfilter_16 v10, v11
- vfilter_16 v11, v12
- vfilter_16 v12, v13
- vfilter_16 v13, v14
- vfilter_16 v14, v15
- vfilter_16 v15, v16
-
-compute_sum_sse_8x16_b:
- vspltish v18, 0 ;# sum
- vspltish v19, 0 ;# sse
- vspltish v23, 0 ;# unpack
- li r10, 16
-
- vmrghb v0, v0, v1
- vmrghb v1, v2, v3
- vmrghb v2, v4, v5
- vmrghb v3, v6, v7
- vmrghb v4, v8, v9
- vmrghb v5, v10, v11
- vmrghb v6, v12, v13
- vmrghb v7, v14, v15
-
- load_and_align_16 v8, r7, r8, 1
- load_and_align_16 v9, r7, r8, 1
- load_and_align_16 v10, r7, r8, 1
- load_and_align_16 v11, r7, r8, 1
- load_and_align_16 v12, r7, r8, 1
- load_and_align_16 v13, r7, r8, 1
- load_and_align_16 v14, r7, r8, 1
- load_and_align_16 v15, r7, r8, 1
-
- vmrghb v8, v8, v9
- vmrghb v9, v10, v11
- vmrghb v10, v12, v13
- vmrghb v11, v14, v15
-
- compute_sum_sse v0, v8, v18, v19, v20, v21, v23
- compute_sum_sse v1, v9, v18, v19, v20, v21, v23
- compute_sum_sse v2, v10, v18, v19, v20, v21, v23
- compute_sum_sse v3, v11, v18, v19, v20, v21, v23
-
- load_and_align_16 v8, r7, r8, 1
- load_and_align_16 v9, r7, r8, 1
- load_and_align_16 v10, r7, r8, 1
- load_and_align_16 v11, r7, r8, 1
- load_and_align_16 v12, r7, r8, 1
- load_and_align_16 v13, r7, r8, 1
- load_and_align_16 v14, r7, r8, 1
- load_and_align_16 v15, r7, r8, 0
-
- vmrghb v8, v8, v9
- vmrghb v9, v10, v11
- vmrghb v10, v12, v13
- vmrghb v11, v14, v15
-
- compute_sum_sse v4, v8, v18, v19, v20, v21, v23
- compute_sum_sse v5, v9, v18, v19, v20, v21, v23
- compute_sum_sse v6, v10, v18, v19, v20, v21, v23
- compute_sum_sse v7, v11, v18, v19, v20, v21, v23
-
- variance_final v18, v19, v23, 7
-
- addi r1, r1, 32 ;# recover stack
- mtspr 256, r11 ;# reset old VRSAVE
- blr
-
-;# Filters a horizontal line
-;# expects:
-;# r3 src_ptr
-;# r4 pitch
-;# r10 16
-;# r12 32
-;# v17 perm input
-;# v18 rounding
-;# v19 shift
-;# v20 filter taps
-;# v21 tmp
-;# v22 tmp
-;# v23 tmp
-;# v24 tmp
-;# v25 tmp
-;# v26 tmp
-;# v27 tmp
-;# v28 perm output
-;#
-.macro hfilter_16 V, increment_counter
-
- lvsl v17, 0, r3 ;# permutate value for alignment
-
- ;# input to filter is 21 bytes wide, output is 16 bytes.
-    ;# the input can span three vectors if not aligned correctly.
- lvx v21, 0, r3
- lvx v22, r10, r3
- lvx v23, r12, r3
-
-.if \increment_counter
- add r3, r3, r4
-.endif
- vperm v21, v21, v22, v17
- vperm v22, v22, v23, v17 ;# v8 v9 = 21 input pixels left-justified
-
- ;# set 0
- vmsummbm v24, v20, v21, v18 ;# taps times elements
-
- ;# set 1
- vsldoi v23, v21, v22, 1
- vmsummbm v25, v20, v23, v18
-
- ;# set 2
- vsldoi v23, v21, v22, 2
- vmsummbm v26, v20, v23, v18
-
- ;# set 3
- vsldoi v23, v21, v22, 3
- vmsummbm v27, v20, v23, v18
-
- vpkswus v24, v24, v25 ;# v24 = 0 4 8 C 1 5 9 D (16-bit)
- vpkswus v25, v26, v27 ;# v25 = 2 6 A E 3 7 B F
-
- vsrh v24, v24, v19 ;# divide v0, v1 by 128
- vsrh v25, v25, v19
-
- vpkuhus \V, v24, v25 ;# \V = scrambled 8-bit result
- vperm \V, \V, v0, v28 ;# \V = correctly-ordered result
-.endm
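In plainer terms, the horizontal pass above is a two-tap bilinear filter: each output pixel is the weighted sum of two adjacent source pixels using the 128-scaled taps from hfilter_b, rounded and shifted right by 7. A minimal scalar sketch in C, assuming the usual rounding constant of 64 (hypothetical helper, not part of this change):

    #include <stdint.h>

    /* Scalar model of the first-pass horizontal filter: tap0 and tap1 come
     * from one row of hfilter_b (e.g. 112 and 16 for xoffset 1) and always
     * sum to 128, so the weighted sum is rounded and shifted right by 7. */
    static void hfilter_row_c(const uint8_t *src, uint8_t *dst, int width,
                              int tap0, int tap1) {
      int i;
      for (i = 0; i < width; i++)
        dst[i] = (uint8_t)((src[i] * tap0 + src[i + 1] * tap1 + 64) >> 7);
    }
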
-
- .align 2
-;# r3 unsigned char *src_ptr
-;# r4 int src_pixels_per_line
-;# r5 int xoffset
-;# r6 int yoffset
-;# r7 unsigned char *dst_ptr
-;# r8 int dst_pixels_per_line
-;# r9 unsigned int *sse
-;#
-;# r3 return value
-vp9_sub_pixel_variance16x8_ppc:
- mfspr r11, 256 ;# get old VRSAVE
- oris r12, r11, 0xffff
- ori r12, r12, 0xfff8
- mtspr 256, r12 ;# set VRSAVE
-
- stwu r1, -32(r1) ;# create space on the stack
-
- HProlog second_pass_16x8_pre_copy_b
-
- hfilter_16 v0, 1
- hfilter_16 v1, 1
- hfilter_16 v2, 1
- hfilter_16 v3, 1
- hfilter_16 v4, 1
- hfilter_16 v5, 1
- hfilter_16 v6, 1
- hfilter_16 v7, 1
-
- ;# Finished filtering main horizontal block. If there is no
-    ;# vertical filtering, jump to computing the sum and sse. Otherwise
- ;# load up and filter the additional line that is needed
- ;# for the vertical filter.
- beq compute_sum_sse_16x8_b
-
- hfilter_16 v8, 0
-
- b second_pass_16x8_b
-
-second_pass_16x8_pre_copy_b:
- slwi. r6, r6, 5 ;# index into vertical filter array
-
- load_and_align_16 v0, r3, r4, 1
- load_and_align_16 v1, r3, r4, 1
- load_and_align_16 v2, r3, r4, 1
- load_and_align_16 v3, r3, r4, 1
- load_and_align_16 v4, r3, r4, 1
- load_and_align_16 v5, r3, r4, 1
- load_and_align_16 v6, r3, r4, 1
- load_and_align_16 v7, r3, r4, 1
- load_and_align_16 v8, r3, r4, 1
-
- beq compute_sum_sse_16x8_b
-
-second_pass_16x8_b:
- vspltish v20, 8
- vspltish v18, 3
- vslh v18, v20, v18 ;# 0x0040 0040 0040 0040 0040 0040 0040 0040
-
- load_vfilter v20, v21
-
- vfilter_16 v0, v1
- vfilter_16 v1, v2
- vfilter_16 v2, v3
- vfilter_16 v3, v4
- vfilter_16 v4, v5
- vfilter_16 v5, v6
- vfilter_16 v6, v7
- vfilter_16 v7, v8
-
-compute_sum_sse_16x8_b:
- vspltish v18, 0 ;# sum
- vspltish v19, 0 ;# sse
- vspltish v23, 0 ;# unpack
- li r10, 16
-
- compute_sum_sse_16 v0, 1
- compute_sum_sse_16 v1, 1
- compute_sum_sse_16 v2, 1
- compute_sum_sse_16 v3, 1
- compute_sum_sse_16 v4, 1
- compute_sum_sse_16 v5, 1
- compute_sum_sse_16 v6, 1
- compute_sum_sse_16 v7, 0
-
- variance_final v18, v19, v23, 7
-
- addi r1, r1, 32 ;# recover stack
-
- mtspr 256, r11 ;# reset old VRSAVE
-
- blr
-
- .align 2
-;# r3 unsigned char *src_ptr
-;# r4 int src_pixels_per_line
-;# r5 int xoffset
-;# r6 int yoffset
-;# r7 unsigned char *dst_ptr
-;# r8 int dst_pixels_per_line
-;# r9 unsigned int *sse
-;#
-;# r3 return value
-vp9_sub_pixel_variance16x16_ppc:
- mfspr r11, 256 ;# get old VRSAVE
- oris r12, r11, 0xffff
- ori r12, r12, 0xfff8
- mtspr 256, r12 ;# set VRSAVE
-
- stwu r1, -32(r1) ;# create space on the stack
-
- HProlog second_pass_16x16_pre_copy_b
-
- hfilter_16 v0, 1
- hfilter_16 v1, 1
- hfilter_16 v2, 1
- hfilter_16 v3, 1
- hfilter_16 v4, 1
- hfilter_16 v5, 1
- hfilter_16 v6, 1
- hfilter_16 v7, 1
- hfilter_16 v8, 1
- hfilter_16 v9, 1
- hfilter_16 v10, 1
- hfilter_16 v11, 1
- hfilter_16 v12, 1
- hfilter_16 v13, 1
- hfilter_16 v14, 1
- hfilter_16 v15, 1
-
- ;# Finished filtering main horizontal block. If there is no
-    ;# vertical filtering, jump to computing the sum and sse. Otherwise
- ;# load up and filter the additional line that is needed
- ;# for the vertical filter.
- beq compute_sum_sse_16x16_b
-
- hfilter_16 v16, 0
-
- b second_pass_16x16_b
-
-second_pass_16x16_pre_copy_b:
- slwi. r6, r6, 5 ;# index into vertical filter array
-
- load_and_align_16 v0, r3, r4, 1
- load_and_align_16 v1, r3, r4, 1
- load_and_align_16 v2, r3, r4, 1
- load_and_align_16 v3, r3, r4, 1
- load_and_align_16 v4, r3, r4, 1
- load_and_align_16 v5, r3, r4, 1
- load_and_align_16 v6, r3, r4, 1
- load_and_align_16 v7, r3, r4, 1
- load_and_align_16 v8, r3, r4, 1
- load_and_align_16 v9, r3, r4, 1
- load_and_align_16 v10, r3, r4, 1
- load_and_align_16 v11, r3, r4, 1
- load_and_align_16 v12, r3, r4, 1
- load_and_align_16 v13, r3, r4, 1
- load_and_align_16 v14, r3, r4, 1
- load_and_align_16 v15, r3, r4, 1
- load_and_align_16 v16, r3, r4, 0
-
- beq compute_sum_sse_16x16_b
-
-second_pass_16x16_b:
- vspltish v20, 8
- vspltish v18, 3
- vslh v18, v20, v18 ;# 0x0040 0040 0040 0040 0040 0040 0040 0040
-
- load_vfilter v20, v21
-
- vfilter_16 v0, v1
- vfilter_16 v1, v2
- vfilter_16 v2, v3
- vfilter_16 v3, v4
- vfilter_16 v4, v5
- vfilter_16 v5, v6
- vfilter_16 v6, v7
- vfilter_16 v7, v8
- vfilter_16 v8, v9
- vfilter_16 v9, v10
- vfilter_16 v10, v11
- vfilter_16 v11, v12
- vfilter_16 v12, v13
- vfilter_16 v13, v14
- vfilter_16 v14, v15
- vfilter_16 v15, v16
-
-compute_sum_sse_16x16_b:
- vspltish v18, 0 ;# sum
- vspltish v19, 0 ;# sse
- vspltish v23, 0 ;# unpack
- li r10, 16
-
- compute_sum_sse_16 v0, 1
- compute_sum_sse_16 v1, 1
- compute_sum_sse_16 v2, 1
- compute_sum_sse_16 v3, 1
- compute_sum_sse_16 v4, 1
- compute_sum_sse_16 v5, 1
- compute_sum_sse_16 v6, 1
- compute_sum_sse_16 v7, 1
- compute_sum_sse_16 v8, 1
- compute_sum_sse_16 v9, 1
- compute_sum_sse_16 v10, 1
- compute_sum_sse_16 v11, 1
- compute_sum_sse_16 v12, 1
- compute_sum_sse_16 v13, 1
- compute_sum_sse_16 v14, 1
- compute_sum_sse_16 v15, 0
-
- variance_final v18, v19, v23, 8
-
- addi r1, r1, 32 ;# recover stack
-
- mtspr 256, r11 ;# reset old VRSAVE
-
- blr
-
- .data
-
- .align 4
-hfilter_b:
- .byte 128, 0, 0, 0,128, 0, 0, 0,128, 0, 0, 0,128, 0, 0, 0
- .byte 112, 16, 0, 0,112, 16, 0, 0,112, 16, 0, 0,112, 16, 0, 0
- .byte 96, 32, 0, 0, 96, 32, 0, 0, 96, 32, 0, 0, 96, 32, 0, 0
- .byte 80, 48, 0, 0, 80, 48, 0, 0, 80, 48, 0, 0, 80, 48, 0, 0
- .byte 64, 64, 0, 0, 64, 64, 0, 0, 64, 64, 0, 0, 64, 64, 0, 0
- .byte 48, 80, 0, 0, 48, 80, 0, 0, 48, 80, 0, 0, 48, 80, 0, 0
- .byte 32, 96, 0, 0, 32, 96, 0, 0, 32, 96, 0, 0, 32, 96, 0, 0
- .byte 16,112, 0, 0, 16,112, 0, 0, 16,112, 0, 0, 16,112, 0, 0
-
- .align 4
-vfilter_b:
- .byte 128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128
- .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
- .byte 112,112,112,112,112,112,112,112,112,112,112,112,112,112,112,112
- .byte 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
- .byte 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96
- .byte 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32
- .byte 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80
- .byte 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48
- .byte 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64
- .byte 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64
- .byte 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48
- .byte 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80
- .byte 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32
- .byte 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96
- .byte 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
- .byte 112,112,112,112,112,112,112,112,112,112,112,112,112,112,112,112
-
- .align 4
-b_hperm_b:
- .byte 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15
-
- .align 4
-b_0123_b:
- .byte 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6
-
- .align 4
-b_4567_b:
- .byte 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10
-
-b_hilo_b:
- .byte 0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23
--- a/vp8/encoder/psnr.c
+++ /dev/null
@@ -1,30 +1,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include "vpx_scale/yv12config.h"
-#include "math.h"
-#include "vp8/common/systemdependent.h" /* for vp9_clear_system_state() */
-
-#define MAX_PSNR 100
-
-double vp9_mse2psnr(double Samples, double Peak, double Mse) {
- double psnr;
-
- if ((double)Mse > 0.0)
- psnr = 10.0 * log10(Peak * Peak * Samples / Mse);
- else
- psnr = MAX_PSNR; // Limit to prevent / 0
-
- if (psnr > MAX_PSNR)
- psnr = MAX_PSNR;
-
- return psnr;
-}
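For reference, Samples here is the pixel count and Mse is in practice the accumulated sum of squared errors, so Peak * Peak * Samples / Mse reduces to the conventional PSNR expression. A small stand-alone sketch (hypothetical, not part of this change):

    #include <math.h>
    #include <stdio.h>

    /* A 640x480 frame whose accumulated squared error is 1,000,000 has a
     * mean squared error of about 3.26 per pixel, i.e. a PSNR near 43 dB. */
    int main(void) {
      double samples = 640.0 * 480.0;
      double sse = 1000000.0;
      double psnr = 10.0 * log10(255.0 * 255.0 * samples / sse);
      printf("%.2f dB\n", psnr);   /* roughly 43.0 */
      return 0;
    }
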
--- a/vp8/encoder/psnr.h
+++ /dev/null
@@ -1,17 +1,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#ifndef __INC_PSNR_H
-#define __INC_PSNR_H
-
-extern double vp9_mse2psnr(double Samples, double Peak, double Mse);
-
-#endif
--- a/vp8/encoder/quantize.c
+++ /dev/null
@@ -1,716 +1,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include <math.h>
-#include "vpx_mem/vpx_mem.h"
-
-#include "onyx_int.h"
-#include "quantize.h"
-#include "vp8/common/quant_common.h"
-
-#include "vp8/common/seg_common.h"
-
-#ifdef ENC_DEBUG
-extern int enc_debug;
-#endif
-
-void vp9_ht_quantize_b_4x4(BLOCK *b, BLOCKD *d, TX_TYPE tx_type) {
- int i, rc, eob;
- int zbin;
- int x, y, z, sz;
- short *zbin_boost_ptr = b->zrun_zbin_boost;
- short *coeff_ptr = b->coeff;
- short *zbin_ptr = b->zbin;
- short *round_ptr = b->round;
- short *quant_ptr = b->quant;
- unsigned char *quant_shift_ptr = b->quant_shift;
- short *qcoeff_ptr = d->qcoeff;
- short *dqcoeff_ptr = d->dqcoeff;
- short *dequant_ptr = d->dequant;
- short zbin_oq_value = b->zbin_extra;
-
-  int const *pt_scan;
-
- switch (tx_type) {
- case ADST_DCT :
- pt_scan = vp9_row_scan;
- break;
-
- case DCT_ADST :
- pt_scan = vp9_col_scan;
- break;
-
- default :
- pt_scan = vp9_default_zig_zag1d;
- break;
- }
-
- vpx_memset(qcoeff_ptr, 0, 32);
- vpx_memset(dqcoeff_ptr, 0, 32);
-
- eob = -1;
-
- for (i = 0; i < b->eob_max_offset; i++) {
- rc = pt_scan[i];
- z = coeff_ptr[rc];
-
- zbin = zbin_ptr[rc] + *zbin_boost_ptr + zbin_oq_value;
- zbin_boost_ptr ++;
-
- sz = (z >> 31); // sign of z
- x = (z ^ sz) - sz; // x = abs(z)
-
- if (x >= zbin) {
- x += round_ptr[rc];
- y = (((x * quant_ptr[rc]) >> 16) + x)
- >> quant_shift_ptr[rc]; // quantize (x)
- x = (y ^ sz) - sz; // get the sign back
- qcoeff_ptr[rc] = x; // write to destination
- dqcoeff_ptr[rc] = x * dequant_ptr[rc]; // dequantized value
-
- if (y) {
- eob = i; // last nonzero coeffs
- zbin_boost_ptr = b->zrun_zbin_boost; // reset zero runlength
- }
- }
- }
-
- d->eob = eob + 1;
-}
-
-void vp9_regular_quantize_b_4x4(BLOCK *b, BLOCKD *d) {
- int i, rc, eob;
- int zbin;
- int x, y, z, sz;
- short *zbin_boost_ptr = b->zrun_zbin_boost;
- short *coeff_ptr = b->coeff;
- short *zbin_ptr = b->zbin;
- short *round_ptr = b->round;
- short *quant_ptr = b->quant;
- unsigned char *quant_shift_ptr = b->quant_shift;
- short *qcoeff_ptr = d->qcoeff;
- short *dqcoeff_ptr = d->dqcoeff;
- short *dequant_ptr = d->dequant;
- short zbin_oq_value = b->zbin_extra;
-
- vpx_memset(qcoeff_ptr, 0, 32);
- vpx_memset(dqcoeff_ptr, 0, 32);
-
- eob = -1;
-
- for (i = 0; i < b->eob_max_offset; i++) {
- rc = vp9_default_zig_zag1d[i];
- z = coeff_ptr[rc];
-
- zbin = zbin_ptr[rc] + *zbin_boost_ptr + zbin_oq_value;
- zbin_boost_ptr ++;
-
- sz = (z >> 31); // sign of z
- x = (z ^ sz) - sz; // x = abs(z)
-
- if (x >= zbin) {
- x += round_ptr[rc];
-
- y = (((x * quant_ptr[rc]) >> 16) + x)
- >> quant_shift_ptr[rc]; // quantize (x)
- x = (y ^ sz) - sz; // get the sign back
- qcoeff_ptr[rc] = x; // write to destination
- dqcoeff_ptr[rc] = x * dequant_ptr[rc]; // dequantized value
-
- if (y) {
- eob = i; // last nonzero coeffs
- zbin_boost_ptr = b->zrun_zbin_boost; // reset zero runlength
- }
- }
- }
-
- d->eob = eob + 1;
-}
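The per-coefficient arithmetic is the same in every block-size variant in this file. A stripped-down sketch of the single-coefficient step, using hypothetical helper and parameter names with the field meanings seen above, may make the zero-bin / round / reciprocal-multiply sequence easier to follow:

    /* Quantize a single coefficient z. zbin already includes the zero-run
     * boost and zbin_extra; quant/shift is the reciprocal pair produced by
     * invert_quant() below. Returns nonzero if the coefficient survives. */
    static int quantize_coeff(int z, int zbin, int round, int quant, int shift,
                              int dequant, short *qcoeff, short *dqcoeff) {
      int sz = z >> 31;            /* 0 for positive z, -1 for negative */
      int x = (z ^ sz) - sz;       /* abs(z) */
      int y = 0;

      if (x >= zbin) {
        x += round;
        y = (((x * quant) >> 16) + x) >> shift;   /* approximately x / dequant */
        x = (y ^ sz) - sz;                        /* restore the sign */
        *qcoeff = (short)x;
        *dqcoeff = (short)(x * dequant);
      }
      return y != 0;
    }
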
-
-void vp9_quantize_mby_4x4_c(MACROBLOCK *x) {
- int i;
- int has_2nd_order = x->e_mbd.mode_info_context->mbmi.mode != SPLITMV;
-
- for (i = 0; i < 16; i++)
- x->quantize_b_4x4(&x->block[i], &x->e_mbd.block[i]);
-
- if (has_2nd_order)
- x->quantize_b_4x4(&x->block[24], &x->e_mbd.block[24]);
-}
-
-void vp9_quantize_mbuv_4x4_c(MACROBLOCK *x) {
- int i;
-
- for (i = 16; i < 24; i++)
- x->quantize_b_4x4(&x->block[i], &x->e_mbd.block[i]);
-}
-
-void vp9_quantize_mb_4x4_c(MACROBLOCK *x) {
- vp9_quantize_mby_4x4_c(x);
- vp9_quantize_mbuv_4x4_c(x);
-}
-
-void vp9_regular_quantize_b_2x2(BLOCK *b, BLOCKD *d) {
- int i, rc, eob;
- int zbin;
- int x, y, z, sz;
- short *zbin_boost_ptr = b->zrun_zbin_boost;
- int zbin_zrun_index = 0;
- short *coeff_ptr = b->coeff;
- short *zbin_ptr = b->zbin;
- short *round_ptr = b->round;
- short *quant_ptr = b->quant;
- unsigned char *quant_shift_ptr = b->quant_shift;
- short *qcoeff_ptr = d->qcoeff;
- short *dqcoeff_ptr = d->dqcoeff;
- short *dequant_ptr = d->dequant;
- short zbin_oq_value = b->zbin_extra;
- // double q2nd = 4;
- vpx_memset(qcoeff_ptr, 0, 32);
- vpx_memset(dqcoeff_ptr, 0, 32);
-
- eob = -1;
-
- for (i = 0; i < b->eob_max_offset_8x8; i++) {
- rc = vp9_default_zig_zag1d[i];
- z = coeff_ptr[rc];
-
- zbin_boost_ptr = &b->zrun_zbin_boost[zbin_zrun_index];
- zbin_zrun_index += 4;
- zbin = (zbin_ptr[rc] + *zbin_boost_ptr + zbin_oq_value);
-
- sz = (z >> 31); // sign of z
- x = (z ^ sz) - sz; // x = abs(z)
-
- if (x >= zbin) {
- x += (round_ptr[rc]);
- y = ((int)((int)(x * quant_ptr[rc]) >> 16) + x)
- >> quant_shift_ptr[rc]; // quantize (x)
- x = (y ^ sz) - sz; // get the sign back
- qcoeff_ptr[rc] = x; // write to destination
- dqcoeff_ptr[rc] = x * dequant_ptr[rc]; // dequantized value
-
- if (y) {
- eob = i; // last nonzero coeffs
- zbin_zrun_index = 0;
- }
- }
- }
-
- d->eob = eob + 1;
-}
-
-void vp9_regular_quantize_b_8x8(BLOCK *b, BLOCKD *d) {
- int i, rc, eob;
- int zbin;
- int x, y, z, sz;
- short *zbin_boost_ptr = b->zrun_zbin_boost_8x8;
- short *coeff_ptr = b->coeff;
- short *zbin_ptr = b->zbin_8x8;
- short *round_ptr = b->round;
- short *quant_ptr = b->quant;
- unsigned char *quant_shift_ptr = b->quant_shift;
- short *qcoeff_ptr = d->qcoeff;
- short *dqcoeff_ptr = d->dqcoeff;
- short *dequant_ptr = d->dequant;
- short zbin_oq_value = b->zbin_extra;
-
- vpx_memset(qcoeff_ptr, 0, 64 * sizeof(short));
- vpx_memset(dqcoeff_ptr, 0, 64 * sizeof(short));
-
- eob = -1;
-
- for (i = 0; i < b->eob_max_offset_8x8; i++) {
- rc = vp9_default_zig_zag1d_8x8[i];
- z = coeff_ptr[rc];
-
- zbin = (zbin_ptr[rc != 0] + *zbin_boost_ptr + zbin_oq_value);
- zbin_boost_ptr++;
-
- sz = (z >> 31); // sign of z
- x = (z ^ sz) - sz; // x = abs(z)
-
- if (x >= zbin) {
- x += (round_ptr[rc != 0]);
- y = ((int)(((int)(x * quant_ptr[rc != 0]) >> 16) + x))
- >> quant_shift_ptr[rc != 0]; // quantize (x)
- x = (y ^ sz) - sz; // get the sign back
- qcoeff_ptr[rc] = x; // write to destination
- dqcoeff_ptr[rc] = x * dequant_ptr[rc != 0]; // dequantized value
-
- if (y) {
- eob = i; // last nonzero coeffs
- zbin_boost_ptr = b->zrun_zbin_boost_8x8;
- }
- }
- }
-
- d->eob = eob + 1;
-}
-
-void vp9_quantize_mby_8x8(MACROBLOCK *x) {
- int i;
- int has_2nd_order = x->e_mbd.mode_info_context->mbmi.mode != SPLITMV;
-
- for (i = 0; i < 16; i ++) {
- x->e_mbd.block[i].eob = 0;
- }
- x->e_mbd.block[24].eob = 0;
- for (i = 0; i < 16; i += 4)
- x->quantize_b_8x8(&x->block[i], &x->e_mbd.block[i]);
-
- if (has_2nd_order)
- x->quantize_b_2x2(&x->block[24], &x->e_mbd.block[24]);
-}
-
-void vp9_quantize_mbuv_8x8(MACROBLOCK *x) {
- int i;
-
- for (i = 16; i < 24; i ++)
- x->e_mbd.block[i].eob = 0;
- for (i = 16; i < 24; i += 4)
- x->quantize_b_8x8(&x->block[i], &x->e_mbd.block[i]);
-}
-
-void vp9_quantize_mb_8x8(MACROBLOCK *x) {
- vp9_quantize_mby_8x8(x);
- vp9_quantize_mbuv_8x8(x);
-}
-
-void vp9_quantize_mby_16x16(MACROBLOCK *x) {
- int i;
-
- for (i = 0; i < 16; i++)
- x->e_mbd.block[i].eob = 0;
- x->e_mbd.block[24].eob = 0;
- x->quantize_b_16x16(&x->block[0], &x->e_mbd.block[0]);
-}
-
-void vp9_quantize_mb_16x16(MACROBLOCK *x) {
- vp9_quantize_mby_16x16(x);
- vp9_quantize_mbuv_8x8(x);
-}
-
-void vp9_regular_quantize_b_16x16(BLOCK *b, BLOCKD *d) {
- int i, rc, eob;
- int zbin;
- int x, y, z, sz;
- short *zbin_boost_ptr = b->zrun_zbin_boost_16x16;
- short *coeff_ptr = b->coeff;
- short *zbin_ptr = b->zbin_16x16;
- short *round_ptr = b->round;
- short *quant_ptr = b->quant;
- unsigned char *quant_shift_ptr = b->quant_shift;
- short *qcoeff_ptr = d->qcoeff;
- short *dqcoeff_ptr = d->dqcoeff;
- short *dequant_ptr = d->dequant;
- short zbin_oq_value = b->zbin_extra;
-
- vpx_memset(qcoeff_ptr, 0, 256*sizeof(short));
- vpx_memset(dqcoeff_ptr, 0, 256*sizeof(short));
-
- eob = -1;
- for (i = 0; i < b->eob_max_offset_16x16; i++) {
- rc = vp9_default_zig_zag1d_16x16[i];
- z = coeff_ptr[rc];
-
- zbin = (zbin_ptr[rc!=0] + *zbin_boost_ptr + zbin_oq_value);
- zbin_boost_ptr ++;
-
- sz = (z >> 31); // sign of z
- x = (z ^ sz) - sz; // x = abs(z)
-
- if (x >= zbin) {
- x += (round_ptr[rc!=0]);
- y = ((int)(((int)(x * quant_ptr[rc!=0]) >> 16) + x))
- >> quant_shift_ptr[rc!=0]; // quantize (x)
- x = (y ^ sz) - sz; // get the sign back
- qcoeff_ptr[rc] = x; // write to destination
- dqcoeff_ptr[rc] = x * dequant_ptr[rc!=0]; // dequantized value
-
- if (y) {
- eob = i; // last nonzero coeffs
- zbin_boost_ptr = b->zrun_zbin_boost_16x16;
- }
- }
- }
-
- d->eob = eob + 1;
-}
-
-/* The quantize_b_pair function pointer in the MACROBLOCK structure is set to
- * one of these two C functions if the corresponding optimized routine is not
- * available. The NEON optimized version currently implements the fast
- * quantization for a pair of blocks. */
-void vp9_regular_quantize_b_4x4_pair(BLOCK *b1, BLOCK *b2,
- BLOCKD *d1, BLOCKD *d2) {
- vp9_regular_quantize_b_4x4(b1, d1);
- vp9_regular_quantize_b_4x4(b2, d2);
-}
-
-static void invert_quant(short *quant,
- unsigned char *shift, short d) {
- unsigned t;
- int l;
- t = d;
- for (l = 0; t > 1; l++)
- t >>= 1;
- t = 1 + (1 << (16 + l)) / d;
- *quant = (short)(t - (1 << 16));
- *shift = l;
-}
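invert_quant() above replaces the per-coefficient division by a dequantizer step d with a multiply and a shift: shift is floor(log2(d)), and quant is chosen so that (((x * quant) >> 16) + x) >> shift reproduces x / d for the coefficient magnitudes used here. A small self-contained check (hypothetical, not part of this change):

    #include <stdio.h>

    int main(void) {
      int d = 124;               /* example dequantizer step */
      unsigned t = d;
      int l, x;
      short quant;
      unsigned char shift;

      for (l = 0; t > 1; l++)    /* same construction as invert_quant() */
        t >>= 1;
      t = 1 + (1 << (16 + l)) / d;
      quant = (short)(t - (1 << 16));
      shift = (unsigned char)l;

      for (x = 0; x < 4096; x++) {
        int y = (((x * quant) >> 16) + x) >> shift;
        if (y != x / d)
          printf("mismatch at x = %d\n", x);
      }
      printf("quant = %d, shift = %d\n", quant, shift);
      return 0;
    }
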
-
-void vp9_init_quantizer(VP9_COMP *cpi) {
- int i;
- int quant_val;
- int Q;
- static const int zbin_boost[16] = { 0, 0, 8, 10, 12, 14, 16, 20,
- 24, 28, 32, 36, 40, 44, 44, 44
- };
-
- static const int zbin_boost_8x8[64] = { 0, 0, 0, 8, 8, 8, 10, 12,
- 14, 16, 18, 20, 22, 24, 26, 28,
- 30, 32, 34, 36, 38, 40, 42, 44,
- 46, 48, 48, 48, 48, 48, 48, 48,
- 48, 48, 48, 48, 48, 48, 48, 48,
- 48, 48, 48, 48, 48, 48, 48, 48,
- 48, 48, 48, 48, 48, 48, 48, 48,
- 48, 48, 48, 48, 48, 48, 48, 48
- };
- static const int zbin_boost_16x16[256] = {
- 0, 0, 0, 8, 8, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28,
- 30, 32, 34, 36, 38, 40, 42, 44, 46, 48, 48, 48, 48, 48, 48, 48,
- 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
- 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
- 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
- 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
- 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
- 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
- 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
- 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
- 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
- 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
- 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
- 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
- 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
- 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
- };
- int qrounding_factor = 48;
-
-
- for (Q = 0; Q < QINDEX_RANGE; Q++) {
- int qzbin_factor = (vp9_dc_quant(Q, 0) < 148) ? 84 : 80;
-
-#if CONFIG_LOSSLESS
- if (cpi->oxcf.lossless) {
- if (Q == 0) {
- qzbin_factor = 64;
- qrounding_factor = 64;
- }
- }
-#endif
-
- // dc values
- quant_val = vp9_dc_quant(Q, cpi->common.y1dc_delta_q);
- invert_quant(cpi->Y1quant[Q] + 0,
- cpi->Y1quant_shift[Q] + 0, quant_val);
- cpi->Y1zbin[Q][0] = ((qzbin_factor * quant_val) + 64) >> 7;
- cpi->Y1zbin_8x8[Q][0] = ((qzbin_factor * quant_val) + 64) >> 7;
- cpi->Y1zbin_16x16[Q][0] = ((qzbin_factor * quant_val) + 64) >> 7;
- cpi->Y1round[Q][0] = (qrounding_factor * quant_val) >> 7;
- cpi->common.Y1dequant[Q][0] = quant_val;
- cpi->zrun_zbin_boost_y1[Q][0] = (quant_val * zbin_boost[0]) >> 7;
- cpi->zrun_zbin_boost_y1_8x8[Q][0] =
- ((quant_val * zbin_boost_8x8[0]) + 64) >> 7;
- cpi->zrun_zbin_boost_y1_16x16[Q][0] = ((quant_val * zbin_boost_16x16[0]) + 64) >> 7;
-
-
- quant_val = vp9_dc2quant(Q, cpi->common.y2dc_delta_q);
- invert_quant(cpi->Y2quant[Q] + 0,
- cpi->Y2quant_shift[Q] + 0, quant_val);
- cpi->Y2zbin[Q][0] = ((qzbin_factor * quant_val) + 64) >> 7;
- cpi->Y2zbin_8x8[Q][0] = ((qzbin_factor * quant_val) + 64) >> 7;
- cpi->Y2zbin_16x16[Q][0] = ((qzbin_factor * quant_val) + 64) >> 7;
- cpi->Y2round[Q][0] = (qrounding_factor * quant_val) >> 7;
- cpi->common.Y2dequant[Q][0] = quant_val;
- cpi->zrun_zbin_boost_y2[Q][0] = (quant_val * zbin_boost[0]) >> 7;
- cpi->zrun_zbin_boost_y2_8x8[Q][0] =
- ((quant_val * zbin_boost_8x8[0]) + 64) >> 7;
- cpi->zrun_zbin_boost_y2_16x16[Q][0] = ((quant_val * zbin_boost_16x16[0]) + 64) >> 7;
-
- quant_val = vp9_dc_uv_quant(Q, cpi->common.uvdc_delta_q);
- invert_quant(cpi->UVquant[Q] + 0,
- cpi->UVquant_shift[Q] + 0, quant_val);
- cpi->UVzbin[Q][0] = ((qzbin_factor * quant_val) + 64) >> 7;
- cpi->UVzbin_8x8[Q][0] = ((qzbin_factor * quant_val) + 64) >> 7;
- cpi->UVzbin_16x16[Q][0] = ((qzbin_factor * quant_val) + 64) >> 7;
- cpi->UVround[Q][0] = (qrounding_factor * quant_val) >> 7;
- cpi->common.UVdequant[Q][0] = quant_val;
- cpi->zrun_zbin_boost_uv[Q][0] = (quant_val * zbin_boost[0]) >> 7;
- cpi->zrun_zbin_boost_uv_8x8[Q][0] =
- ((quant_val * zbin_boost_8x8[0]) + 64) >> 7;
- cpi->zrun_zbin_boost_uv_16x16[Q][0] = ((quant_val * zbin_boost_16x16[0]) + 64) >> 7;
-
-    // all the 4x4 ac values
- for (i = 1; i < 16; i++) {
- int rc = vp9_default_zig_zag1d[i];
-
- quant_val = vp9_ac_yquant(Q);
- invert_quant(cpi->Y1quant[Q] + rc,
- cpi->Y1quant_shift[Q] + rc, quant_val);
- cpi->Y1zbin[Q][rc] = ((qzbin_factor * quant_val) + 64) >> 7;
- cpi->Y1round[Q][rc] = (qrounding_factor * quant_val) >> 7;
- cpi->common.Y1dequant[Q][rc] = quant_val;
- cpi->zrun_zbin_boost_y1[Q][i] =
- ((quant_val * zbin_boost[i]) + 64) >> 7;
-
- quant_val = vp9_ac2quant(Q, cpi->common.y2ac_delta_q);
- invert_quant(cpi->Y2quant[Q] + rc,
- cpi->Y2quant_shift[Q] + rc, quant_val);
- cpi->Y2zbin[Q][rc] = ((qzbin_factor * quant_val) + 64) >> 7;
- cpi->Y2round[Q][rc] = (qrounding_factor * quant_val) >> 7;
- cpi->common.Y2dequant[Q][rc] = quant_val;
- cpi->zrun_zbin_boost_y2[Q][i] =
- ((quant_val * zbin_boost[i]) + 64) >> 7;
-
- quant_val = vp9_ac_uv_quant(Q, cpi->common.uvac_delta_q);
- invert_quant(cpi->UVquant[Q] + rc,
- cpi->UVquant_shift[Q] + rc, quant_val);
- cpi->UVzbin[Q][rc] = ((qzbin_factor * quant_val) + 64) >> 7;
- cpi->UVround[Q][rc] = (qrounding_factor * quant_val) >> 7;
- cpi->common.UVdequant[Q][rc] = quant_val;
- cpi->zrun_zbin_boost_uv[Q][i] =
- ((quant_val * zbin_boost[i]) + 64) >> 7;
- }
-
-    // 8x8 structures... only zbin separated out for now
-    // This needs cleaning up for 8x8, especially if we are to add
-    // support for non-flat Q matrices
- for (i = 1; i < 64; i++) {
- int rc = vp9_default_zig_zag1d_8x8[i];
-
- quant_val = vp9_ac_yquant(Q);
- cpi->Y1zbin_8x8[Q][rc] = ((qzbin_factor * quant_val) + 64) >> 7;
- cpi->zrun_zbin_boost_y1_8x8[Q][i] =
- ((quant_val * zbin_boost_8x8[i]) + 64) >> 7;
-
- quant_val = vp9_ac2quant(Q, cpi->common.y2ac_delta_q);
- cpi->Y2zbin_8x8[Q][rc] = ((qzbin_factor * quant_val) + 64) >> 7;
- cpi->zrun_zbin_boost_y2_8x8[Q][i] =
- ((quant_val * zbin_boost_8x8[i]) + 64) >> 7;
-
- quant_val = vp9_ac_uv_quant(Q, cpi->common.uvac_delta_q);
- cpi->UVzbin_8x8[Q][rc] = ((qzbin_factor * quant_val) + 64) >> 7;
- cpi->zrun_zbin_boost_uv_8x8[Q][i] =
- ((quant_val * zbin_boost_8x8[i]) + 64) >> 7;
- }
-
- // 16x16 structures. Same comment above applies.
- for (i = 1; i < 256; i++) {
- int rc = vp9_default_zig_zag1d_16x16[i];
-
- quant_val = vp9_ac_yquant(Q);
- cpi->Y1zbin_16x16[Q][rc] = ((qzbin_factor * quant_val) + 64) >> 7;
- cpi->zrun_zbin_boost_y1_16x16[Q][i] = ((quant_val * zbin_boost_16x16[i]) + 64) >> 7;
-
- quant_val = vp9_ac2quant(Q, cpi->common.y2ac_delta_q);
- cpi->Y2zbin_16x16[Q][rc] = ((qzbin_factor * quant_val) + 64) >> 7;
- cpi->zrun_zbin_boost_y2_16x16[Q][i] = ((quant_val * zbin_boost_16x16[i]) + 64) >> 7;
-
- quant_val = vp9_ac_uv_quant(Q, cpi->common.uvac_delta_q);
- cpi->UVzbin_16x16[Q][rc] = ((qzbin_factor * quant_val) + 64) >> 7;
- cpi->zrun_zbin_boost_uv_16x16[Q][i] = ((quant_val * zbin_boost_16x16[i]) + 64) >> 7;
- }
- }
-}
-
-void vp9_mb_init_quantizer(VP9_COMP *cpi, MACROBLOCK *x) {
- int i;
- int QIndex;
- MACROBLOCKD *xd = &x->e_mbd;
- int zbin_extra;
- int segment_id = xd->mode_info_context->mbmi.segment_id;
-
- // Select the baseline MB Q index allowing for any segment level change.
- if (vp9_segfeature_active(xd, segment_id, SEG_LVL_ALT_Q)) {
- // Abs Value
- if (xd->mb_segment_abs_delta == SEGMENT_ABSDATA)
- QIndex = vp9_get_segdata(xd, segment_id, SEG_LVL_ALT_Q);
-
- // Delta Value
- else {
- QIndex = cpi->common.base_qindex +
- vp9_get_segdata(xd, segment_id, SEG_LVL_ALT_Q);
-
- // Clamp to valid range
- QIndex = (QIndex >= 0) ? ((QIndex <= MAXQ) ? QIndex : MAXQ) : 0;
- }
- } else
- QIndex = cpi->common.base_qindex;
-
- // Y
- zbin_extra = (cpi->common.Y1dequant[QIndex][1] *
- (cpi->zbin_over_quant +
- cpi->zbin_mode_boost +
- x->act_zbin_adj)) >> 7;
-
- for (i = 0; i < 16; i++) {
- x->block[i].quant = cpi->Y1quant[QIndex];
- x->block[i].quant_shift = cpi->Y1quant_shift[QIndex];
- x->block[i].zbin = cpi->Y1zbin[QIndex];
- x->block[i].zbin_8x8 = cpi->Y1zbin_8x8[QIndex];
- x->block[i].zbin_16x16 = cpi->Y1zbin_16x16[QIndex];
- x->block[i].round = cpi->Y1round[QIndex];
- x->e_mbd.block[i].dequant = cpi->common.Y1dequant[QIndex];
- x->block[i].zrun_zbin_boost = cpi->zrun_zbin_boost_y1[QIndex];
- x->block[i].zrun_zbin_boost_8x8 = cpi->zrun_zbin_boost_y1_8x8[QIndex];
- x->block[i].zrun_zbin_boost_16x16 = cpi->zrun_zbin_boost_y1_16x16[QIndex];
- x->block[i].zbin_extra = (short)zbin_extra;
-
- // Segment max eob offset feature.
- if (vp9_segfeature_active(xd, segment_id, SEG_LVL_EOB)) {
- x->block[i].eob_max_offset =
- vp9_get_segdata(xd, segment_id, SEG_LVL_EOB);
- x->block[i].eob_max_offset_8x8 =
- vp9_get_segdata(xd, segment_id, SEG_LVL_EOB);
- x->block[i].eob_max_offset_16x16 =
- vp9_get_segdata(xd, segment_id, SEG_LVL_EOB);
- } else {
- x->block[i].eob_max_offset = 16;
- x->block[i].eob_max_offset_8x8 = 64;
- x->block[i].eob_max_offset_16x16 = 256;
- }
- }
-
- // UV
- zbin_extra = (cpi->common.UVdequant[QIndex][1] *
- (cpi->zbin_over_quant +
- cpi->zbin_mode_boost +
- x->act_zbin_adj)) >> 7;
-
- for (i = 16; i < 24; i++) {
- x->block[i].quant = cpi->UVquant[QIndex];
- x->block[i].quant_shift = cpi->UVquant_shift[QIndex];
- x->block[i].zbin = cpi->UVzbin[QIndex];
- x->block[i].zbin_8x8 = cpi->UVzbin_8x8[QIndex];
- x->block[i].zbin_16x16 = cpi->UVzbin_16x16[QIndex];
- x->block[i].round = cpi->UVround[QIndex];
- x->e_mbd.block[i].dequant = cpi->common.UVdequant[QIndex];
- x->block[i].zrun_zbin_boost = cpi->zrun_zbin_boost_uv[QIndex];
- x->block[i].zrun_zbin_boost_8x8 = cpi->zrun_zbin_boost_uv_8x8[QIndex];
- x->block[i].zrun_zbin_boost_16x16 = cpi->zrun_zbin_boost_uv_16x16[QIndex];
-
- x->block[i].zbin_extra = (short)zbin_extra;
-
- // Segment max eob offset feature.
- if (vp9_segfeature_active(xd, segment_id, SEG_LVL_EOB)) {
- x->block[i].eob_max_offset =
- vp9_get_segdata(xd, segment_id, SEG_LVL_EOB);
- x->block[i].eob_max_offset_8x8 =
- vp9_get_segdata(xd, segment_id, SEG_LVL_EOB);
- } else {
- x->block[i].eob_max_offset = 16;
- x->block[i].eob_max_offset_8x8 = 64;
- }
- }
-
- // Y2
- zbin_extra = (cpi->common.Y2dequant[QIndex][1] *
- ((cpi->zbin_over_quant / 2) +
- cpi->zbin_mode_boost +
- x->act_zbin_adj)) >> 7;
-
- x->block[24].quant = cpi->Y2quant[QIndex];
- x->block[24].quant_shift = cpi->Y2quant_shift[QIndex];
- x->block[24].zbin = cpi->Y2zbin[QIndex];
- x->block[24].zbin_8x8 = cpi->Y2zbin_8x8[QIndex];
- x->block[24].zbin_16x16 = cpi->Y2zbin_16x16[QIndex];
- x->block[24].round = cpi->Y2round[QIndex];
- x->e_mbd.block[24].dequant = cpi->common.Y2dequant[QIndex];
- x->block[24].zrun_zbin_boost = cpi->zrun_zbin_boost_y2[QIndex];
- x->block[24].zrun_zbin_boost_8x8 = cpi->zrun_zbin_boost_y2_8x8[QIndex];
- x->block[24].zrun_zbin_boost_16x16 = cpi->zrun_zbin_boost_y2_16x16[QIndex];
- x->block[24].zbin_extra = (short)zbin_extra;
-
- // TBD perhaps not use for Y2
- // Segment max eob offset feature.
- if (vp9_segfeature_active(xd, segment_id, SEG_LVL_EOB)) {
- x->block[24].eob_max_offset =
- vp9_get_segdata(xd, segment_id, SEG_LVL_EOB);
- x->block[24].eob_max_offset_8x8 =
- vp9_get_segdata(xd, segment_id, SEG_LVL_EOB);
- } else {
- x->block[24].eob_max_offset = 16;
- x->block[24].eob_max_offset_8x8 = 4;
- }
-
- /* save this macroblock QIndex for vp9_update_zbin_extra() */
- x->e_mbd.q_index = QIndex;
-}
-
-void vp9_update_zbin_extra(VP9_COMP *cpi, MACROBLOCK *x) {
- int i;
- int QIndex = x->e_mbd.q_index;
- int zbin_extra;
-
- // Y
- zbin_extra = (cpi->common.Y1dequant[QIndex][1] *
- (cpi->zbin_over_quant +
- cpi->zbin_mode_boost +
- x->act_zbin_adj)) >> 7;
- for (i = 0; i < 16; i++) {
- x->block[i].zbin_extra = (short)zbin_extra;
- }
-
- // UV
- zbin_extra = (cpi->common.UVdequant[QIndex][1] *
- (cpi->zbin_over_quant +
- cpi->zbin_mode_boost +
- x->act_zbin_adj)) >> 7;
-
- for (i = 16; i < 24; i++) {
- x->block[i].zbin_extra = (short)zbin_extra;
- }
-
- // Y2
- zbin_extra = (cpi->common.Y2dequant[QIndex][1] *
- ((cpi->zbin_over_quant / 2) +
- cpi->zbin_mode_boost +
- x->act_zbin_adj)) >> 7;
-
- x->block[24].zbin_extra = (short)zbin_extra;
-}
-
-void vp9_frame_init_quantizer(VP9_COMP *cpi) {
- // Clear Zbin mode boost for default case
- cpi->zbin_mode_boost = 0;
-
- // MB level quantizer setup
- vp9_mb_init_quantizer(cpi, &cpi->mb);
-}
-
-void vp9_set_quantizer(struct VP9_COMP *cpi, int Q) {
- VP9_COMMON *cm = &cpi->common;
-
- cm->base_qindex = Q;
-
-  // if any of the delta_q values change, the update flag will
-  // have to be set.
- cm->y1dc_delta_q = 0;
- cm->y2ac_delta_q = 0;
- cm->uvdc_delta_q = 0;
- cm->uvac_delta_q = 0;
- cm->y2dc_delta_q = 0;
-
-  // The quantizer has to be reinitialized if any delta_q changes.
-  // As none change here for now, this is inactive code.
- // if(update)
- // vp9_init_quantizer(cpi);
-}
--- a/vp8/encoder/quantize.h
+++ /dev/null
@@ -1,97 +1,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#ifndef __INC_QUANTIZE_H
-#define __INC_QUANTIZE_H
-
-#include "block.h"
-
-#define prototype_quantize_block(sym) \
- void (sym)(BLOCK *b,BLOCKD *d)
-
-#define prototype_quantize_block_pair(sym) \
- void (sym)(BLOCK *b1, BLOCK *b2, BLOCKD *d1, BLOCKD *d2)
-
-#define prototype_quantize_mb(sym) \
- void (sym)(MACROBLOCK *x)
-
-#if ARCH_X86 || ARCH_X86_64
-#include "x86/quantize_x86.h"
-#endif
-
-#if ARCH_ARM
-#include "arm/quantize_arm.h"
-#endif
-
-#define prototype_quantize_block_type(sym) \
- void (sym)(BLOCK *b, BLOCKD *d, TX_TYPE type)
-extern prototype_quantize_block_type(vp9_ht_quantize_b_4x4);
-
-#ifndef vp9_quantize_quantb_4x4
-#define vp9_quantize_quantb_4x4 vp9_regular_quantize_b_4x4
-#endif
-extern prototype_quantize_block(vp9_quantize_quantb_4x4);
-
-#ifndef vp9_quantize_quantb_4x4_pair
-#define vp9_quantize_quantb_4x4_pair vp9_regular_quantize_b_4x4_pair
-#endif
-extern prototype_quantize_block_pair(vp9_quantize_quantb_4x4_pair);
-
-#ifndef vp9_quantize_quantb_8x8
-#define vp9_quantize_quantb_8x8 vp9_regular_quantize_b_8x8
-#endif
-extern prototype_quantize_block(vp9_quantize_quantb_8x8);
-
-#ifndef vp9_quantize_quantb_16x16
-#define vp9_quantize_quantb_16x16 vp9_regular_quantize_b_16x16
-#endif
-extern prototype_quantize_block(vp9_quantize_quantb_16x16);
-
-#ifndef vp9_quantize_quantb_2x2
-#define vp9_quantize_quantb_2x2 vp9_regular_quantize_b_2x2
-#endif
-extern prototype_quantize_block(vp9_quantize_quantb_2x2);
-
-#ifndef vp9_quantize_mb_4x4
-#define vp9_quantize_mb_4x4 vp9_quantize_mb_4x4_c
-#endif
-extern prototype_quantize_mb(vp9_quantize_mb_4x4);
-void vp9_quantize_mb_8x8(MACROBLOCK *x);
-
-#ifndef vp9_quantize_mbuv_4x4
-#define vp9_quantize_mbuv_4x4 vp9_quantize_mbuv_4x4_c
-#endif
-extern prototype_quantize_mb(vp9_quantize_mbuv_4x4);
-
-#ifndef vp9_quantize_mby_4x4
-#define vp9_quantize_mby_4x4 vp9_quantize_mby_4x4_c
-#endif
-extern prototype_quantize_mb(vp9_quantize_mby_4x4);
-
-extern prototype_quantize_mb(vp9_quantize_mby_8x8);
-extern prototype_quantize_mb(vp9_quantize_mbuv_8x8);
-
-void vp9_quantize_mb_16x16(MACROBLOCK *x);
-extern prototype_quantize_block(vp9_quantize_quantb_16x16);
-extern prototype_quantize_mb(vp9_quantize_mby_16x16);
-
-struct VP9_COMP;
-
-extern void vp9_set_quantizer(struct VP9_COMP *cpi, int Q);
-
-extern void vp9_frame_init_quantizer(struct VP9_COMP *cpi);
-
-extern void vp9_update_zbin_extra(struct VP9_COMP *cpi, MACROBLOCK *x);
-
-extern void vp9_mb_init_quantizer(struct VP9_COMP *cpi, MACROBLOCK *x);
-
-extern void vp9_init_quantizer(struct VP9_COMP *cpi);
-
-#endif
--- a/vp8/encoder/ratectrl.c
+++ /dev/null
@@ -1,698 +1,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include <stdlib.h>
-#include <stdio.h>
-#include <string.h>
-#include <limits.h>
-#include <assert.h>
-
-#include "math.h"
-#include "vp8/common/alloccommon.h"
-#include "vp8/common/common.h"
-#include "ratectrl.h"
-#include "vp8/common/entropymode.h"
-#include "vpx_mem/vpx_mem.h"
-#include "vp8/common/systemdependent.h"
-#include "encodemv.h"
-#include "vp8/common/quant_common.h"
-
-#define MIN_BPB_FACTOR 0.005
-#define MAX_BPB_FACTOR 50
-
-#ifdef MODE_STATS
-extern unsigned int y_modes[VP9_YMODES];
-extern unsigned int uv_modes[VP9_UV_MODES];
-extern unsigned int b_modes[B_MODE_COUNT];
-
-extern unsigned int inter_y_modes[MB_MODE_COUNT];
-extern unsigned int inter_uv_modes[VP9_UV_MODES];
-extern unsigned int inter_b_modes[B_MODE_COUNT];
-#endif
-
-// Bits Per MB at different Q (Multiplied by 512)
-#define BPER_MB_NORMBITS 9
-
-// % adjustment to target kf size based on separation from previous frame
-static const int kf_boost_seperation_adjustment[16] = {
- 30, 40, 50, 55, 60, 65, 70, 75,
- 80, 85, 90, 95, 100, 100, 100, 100,
-};
-
-static const int gf_adjust_table[101] = {
- 100,
- 115, 130, 145, 160, 175, 190, 200, 210, 220, 230,
- 240, 260, 270, 280, 290, 300, 310, 320, 330, 340,
- 350, 360, 370, 380, 390, 400, 400, 400, 400, 400,
- 400, 400, 400, 400, 400, 400, 400, 400, 400, 400,
- 400, 400, 400, 400, 400, 400, 400, 400, 400, 400,
- 400, 400, 400, 400, 400, 400, 400, 400, 400, 400,
- 400, 400, 400, 400, 400, 400, 400, 400, 400, 400,
- 400, 400, 400, 400, 400, 400, 400, 400, 400, 400,
- 400, 400, 400, 400, 400, 400, 400, 400, 400, 400,
- 400, 400, 400, 400, 400, 400, 400, 400, 400, 400,
-};
-
-static const int gf_intra_usage_adjustment[20] = {
- 125, 120, 115, 110, 105, 100, 95, 85, 80, 75,
- 70, 65, 60, 55, 50, 50, 50, 50, 50, 50,
-};
-
-static const int gf_interval_table[101] = {
- 7,
- 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
- 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
- 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
- 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
- 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
- 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
- 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
- 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
- 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
- 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
-};
-
-static const unsigned int prior_key_frame_weight[KEY_FRAME_CONTEXT] = { 1, 2, 3, 4, 5 };
-
-// These functions use formulaic calculations to make playing with the
-// quantizer tables easier. If necessary they can be replaced by lookup
-// tables if and when things settle down in the experimental bitstream.
-double vp9_convert_qindex_to_q(int qindex) {
- // Convert the index to a real Q value (scaled down to match old Q values)
- return (double)vp9_ac_yquant(qindex) / 4.0;
-}
-
-int vp9_gfboost_qadjust(int qindex) {
- int retval;
- double q;
-
- q = vp9_convert_qindex_to_q(qindex);
- retval = (int)((0.00000828 * q * q * q) +
- (-0.0055 * q * q) +
- (1.32 * q) + 79.3);
- return retval;
-}
-
-static int kfboost_qadjust(int qindex) {
- int retval;
- double q;
-
- q = vp9_convert_qindex_to_q(qindex);
- retval = (int)((0.00000973 * q * q * q) +
- (-0.00613 * q * q) +
- (1.316 * q) + 121.2);
- return retval;
-}
-
-int vp9_bits_per_mb(FRAME_TYPE frame_type, int qindex) {
- if (frame_type == KEY_FRAME)
- return (int)(4500000 / vp9_convert_qindex_to_q(qindex));
- else
- return (int)(2850000 / vp9_convert_qindex_to_q(qindex));
-}
-
-
-void vp9_save_coding_context(VP9_COMP *cpi) {
- CODING_CONTEXT *const cc = &cpi->coding_context;
- VP9_COMMON *cm = &cpi->common;
- MACROBLOCKD *xd = &cpi->mb.e_mbd;
-
- // Stores a snapshot of key state variables which can subsequently be
- // restored with a call to vp9_restore_coding_context. These functions are
- // intended for use in a re-code loop in vp9_compress_frame where the
- // quantizer value is adjusted between loop iterations.
-
- cc->nmvc = cm->fc.nmvc;
- vp9_copy(cc->nmvjointcost, cpi->mb.nmvjointcost);
- vp9_copy(cc->nmvcosts, cpi->mb.nmvcosts);
- vp9_copy(cc->nmvcosts_hp, cpi->mb.nmvcosts_hp);
-
- vp9_copy(cc->mv_ref_ct, cm->fc.mv_ref_ct);
- vp9_copy(cc->mode_context, cm->fc.mode_context);
- vp9_copy(cc->mv_ref_ct_a, cm->fc.mv_ref_ct_a);
- vp9_copy(cc->mode_context_a, cm->fc.mode_context_a);
-
- vp9_copy(cc->ymode_prob, cm->fc.ymode_prob);
- vp9_copy(cc->bmode_prob, cm->fc.bmode_prob);
- vp9_copy(cc->uv_mode_prob, cm->fc.uv_mode_prob);
- vp9_copy(cc->i8x8_mode_prob, cm->fc.i8x8_mode_prob);
- vp9_copy(cc->sub_mv_ref_prob, cm->fc.sub_mv_ref_prob);
- vp9_copy(cc->mbsplit_prob, cm->fc.mbsplit_prob);
-
- // Stats
-#ifdef MODE_STATS
- vp9_copy(cc->y_modes, y_modes);
- vp9_copy(cc->uv_modes, uv_modes);
- vp9_copy(cc->b_modes, b_modes);
- vp9_copy(cc->inter_y_modes, inter_y_modes);
- vp9_copy(cc->inter_uv_modes, inter_uv_modes);
- vp9_copy(cc->inter_b_modes, inter_b_modes);
-#endif
-
- vp9_copy(cc->segment_pred_probs, cm->segment_pred_probs);
- vp9_copy(cc->ref_pred_probs_update, cpi->ref_pred_probs_update);
- vp9_copy(cc->ref_pred_probs, cm->ref_pred_probs);
- vp9_copy(cc->prob_comppred, cm->prob_comppred);
-
- vpx_memcpy(cpi->coding_context.last_frame_seg_map_copy,
- cm->last_frame_seg_map, (cm->mb_rows * cm->mb_cols));
-
- vp9_copy(cc->last_ref_lf_deltas, xd->last_ref_lf_deltas);
- vp9_copy(cc->last_mode_lf_deltas, xd->last_mode_lf_deltas);
-
- vp9_copy(cc->coef_probs, cm->fc.coef_probs);
- vp9_copy(cc->hybrid_coef_probs, cm->fc.hybrid_coef_probs);
- vp9_copy(cc->coef_probs_8x8, cm->fc.coef_probs_8x8);
- vp9_copy(cc->hybrid_coef_probs_8x8, cm->fc.hybrid_coef_probs_8x8);
- vp9_copy(cc->coef_probs_16x16, cm->fc.coef_probs_16x16);
- vp9_copy(cc->hybrid_coef_probs_16x16, cm->fc.hybrid_coef_probs_16x16);
- vp9_copy(cc->switchable_interp_prob, cm->fc.switchable_interp_prob);
-}
-
-void vp9_restore_coding_context(VP9_COMP *cpi) {
- CODING_CONTEXT *const cc = &cpi->coding_context;
- VP9_COMMON *cm = &cpi->common;
- MACROBLOCKD *xd = &cpi->mb.e_mbd;
-
- // Restore key state variables to the snapshot state stored in the
- // previous call to vp9_save_coding_context.
-
- cm->fc.nmvc = cc->nmvc;
- vp9_copy(cpi->mb.nmvjointcost, cc->nmvjointcost);
- vp9_copy(cpi->mb.nmvcosts, cc->nmvcosts);
- vp9_copy(cpi->mb.nmvcosts_hp, cc->nmvcosts_hp);
-
- vp9_copy(cm->fc.mv_ref_ct, cc->mv_ref_ct);
- vp9_copy(cm->fc.mode_context, cc->mode_context);
- vp9_copy(cm->fc.mv_ref_ct_a, cc->mv_ref_ct_a);
- vp9_copy(cm->fc.mode_context_a, cc->mode_context_a);
-
- vp9_copy(cm->fc.ymode_prob, cc->ymode_prob);
- vp9_copy(cm->fc.bmode_prob, cc->bmode_prob);
- vp9_copy(cm->fc.i8x8_mode_prob, cc->i8x8_mode_prob);
- vp9_copy(cm->fc.uv_mode_prob, cc->uv_mode_prob);
- vp9_copy(cm->fc.sub_mv_ref_prob, cc->sub_mv_ref_prob);
- vp9_copy(cm->fc.mbsplit_prob, cc->mbsplit_prob);
-
- // Stats
-#ifdef MODE_STATS
- vp9_copy(y_modes, cc->y_modes);
- vp9_copy(uv_modes, cc->uv_modes);
- vp9_copy(b_modes, cc->b_modes);
- vp9_copy(inter_y_modes, cc->inter_y_modes);
- vp9_copy(inter_uv_modes, cc->inter_uv_modes);
- vp9_copy(inter_b_modes, cc->inter_b_modes);
-#endif
-
- vp9_copy(cm->segment_pred_probs, cc->segment_pred_probs);
- vp9_copy(cpi->ref_pred_probs_update, cc->ref_pred_probs_update);
- vp9_copy(cm->ref_pred_probs, cc->ref_pred_probs);
- vp9_copy(cm->prob_comppred, cc->prob_comppred);
-
- vpx_memcpy(cm->last_frame_seg_map,
- cpi->coding_context.last_frame_seg_map_copy,
- (cm->mb_rows * cm->mb_cols));
-
- vp9_copy(xd->last_ref_lf_deltas, cc->last_ref_lf_deltas);
- vp9_copy(xd->last_mode_lf_deltas, cc->last_mode_lf_deltas);
-
- vp9_copy(cm->fc.coef_probs, cc->coef_probs);
- vp9_copy(cm->fc.hybrid_coef_probs, cc->hybrid_coef_probs);
- vp9_copy(cm->fc.coef_probs_8x8, cc->coef_probs_8x8);
- vp9_copy(cm->fc.hybrid_coef_probs_8x8, cc->hybrid_coef_probs_8x8);
- vp9_copy(cm->fc.coef_probs_16x16, cc->coef_probs_16x16);
- vp9_copy(cm->fc.hybrid_coef_probs_16x16, cc->hybrid_coef_probs_16x16);
- vp9_copy(cm->fc.switchable_interp_prob, cc->switchable_interp_prob);
-}
-
-
-void vp9_setup_key_frame(VP9_COMP *cpi) {
- VP9_COMMON *cm = &cpi->common;
- // Setup for Key frame:
- vp9_default_coef_probs(& cpi->common);
- vp9_kf_default_bmode_probs(cpi->common.kf_bmode_prob);
- vp9_init_mbmode_probs(& cpi->common);
- vp9_default_bmode_probs(cm->fc.bmode_prob);
-
- vp9_init_mv_probs(& cpi->common);
-
- // cpi->common.filter_level = 0; // Reset every key frame.
- cpi->common.filter_level = cpi->common.base_qindex * 3 / 8;
-
- // interval before next GF
- cpi->frames_till_gf_update_due = cpi->baseline_gf_interval;
-
- cpi->common.refresh_golden_frame = TRUE;
- cpi->common.refresh_alt_ref_frame = TRUE;
-
- vp9_init_mode_contexts(&cpi->common);
- vpx_memcpy(&cpi->common.lfc, &cpi->common.fc, sizeof(cpi->common.fc));
- vpx_memcpy(&cpi->common.lfc_a, &cpi->common.fc, sizeof(cpi->common.fc));
-
- vpx_memset(cm->prev_mip, 0,
- (cm->mb_cols + 1) * (cm->mb_rows + 1)* sizeof(MODE_INFO));
- vpx_memset(cm->mip, 0,
- (cm->mb_cols + 1) * (cm->mb_rows + 1)* sizeof(MODE_INFO));
-
- vp9_update_mode_info_border(cm, cm->mip);
- vp9_update_mode_info_in_image(cm, cm->mi);
-}
-
-void vp9_setup_inter_frame(VP9_COMP *cpi) {
- if (cpi->common.refresh_alt_ref_frame) {
- vpx_memcpy(&cpi->common.fc,
- &cpi->common.lfc_a,
- sizeof(cpi->common.fc));
- vpx_memcpy(cpi->common.fc.vp8_mode_contexts,
- cpi->common.fc.mode_context_a,
- sizeof(cpi->common.fc.vp8_mode_contexts));
- } else {
- vpx_memcpy(&cpi->common.fc,
- &cpi->common.lfc,
- sizeof(cpi->common.fc));
- vpx_memcpy(cpi->common.fc.vp8_mode_contexts,
- cpi->common.fc.mode_context,
- sizeof(cpi->common.fc.vp8_mode_contexts));
- }
-}
-
-
-static int estimate_bits_at_q(int frame_kind, int Q, int MBs,
- double correction_factor) {
- int Bpm = (int)(.5 + correction_factor * vp9_bits_per_mb(frame_kind, Q));
-
- /* Attempt to retain reasonable accuracy without overflow. The cutoff is
- * chosen such that the maximum product of Bpm and MBs fits 31 bits. The
- * largest Bpm takes 20 bits.
- */
- if (MBs > (1 << 11))
- return (Bpm >> BPER_MB_NORMBITS) * MBs;
- else
- return (Bpm * MBs) >> BPER_MB_NORMBITS;
-}
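To make the fixed-point bookkeeping concrete (illustrative numbers, assuming the chosen qindex maps to a real Q of 16): vp9_bits_per_mb() returns bits per macroblock scaled up by 1 << BPER_MB_NORMBITS, so for a key frame it returns 4500000 / 16 = 281250, about 549 bits per macroblock. A 640x480 frame has 1200 macroblocks, which is below the 1 << 11 cutoff, so with a correction factor of 1.0 estimate_bits_at_q() computes (281250 * 1200) >> 9, roughly 659,000 bits, and the intermediate product still fits comfortably in 31 bits.
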
-
-
-static void calc_iframe_target_size(VP9_COMP *cpi) {
- // boost defaults to half second
- int target;
-
- // Clear down mmx registers to allow floating point in what follows
- vp9_clear_system_state(); // __asm emms;
-
- // New Two pass RC
- target = cpi->per_frame_bandwidth;
-
- if (cpi->oxcf.rc_max_intra_bitrate_pct) {
- unsigned int max_rate = cpi->per_frame_bandwidth
- * cpi->oxcf.rc_max_intra_bitrate_pct / 100;
-
- if (target > max_rate)
- target = max_rate;
- }
-
- cpi->this_frame_target = target;
-
-}
-
-
-// Do the best we can to define the parameters for the next GF based
-// on what information we have available.
-//
-// In this experimental code only two pass is supported
-// so we just use the interval determined in the two pass code.
-static void calc_gf_params(VP9_COMP *cpi) {
- // Set the gf interval
- cpi->frames_till_gf_update_due = cpi->baseline_gf_interval;
-}
-
-
-static void calc_pframe_target_size(VP9_COMP *cpi) {
- int min_frame_target;
-
- min_frame_target = 0;
-
- min_frame_target = cpi->min_frame_bandwidth;
-
- if (min_frame_target < (cpi->av_per_frame_bandwidth >> 5))
- min_frame_target = cpi->av_per_frame_bandwidth >> 5;
-
-
- // Special alt reference frame case
- if (cpi->common.refresh_alt_ref_frame) {
- // Per frame bit target for the alt ref frame
- cpi->per_frame_bandwidth = cpi->twopass.gf_bits;
- cpi->this_frame_target = cpi->per_frame_bandwidth;
- }
-
- // Normal frames (gf,and inter)
- else {
- cpi->this_frame_target = cpi->per_frame_bandwidth;
- }
-
-  // Sanity check that the total sum of adjustments is not above the maximum allowed.
-  // That is, having allowed for KF and GF penalties, we have not pushed the
-  // current inter-frame target too low. If the adjustment applied here cannot recover
-  // all the extra bits spent on the KF or GF, the remainder will have to be recovered
-  // over a longer time span via other buffer / rate control mechanisms.
- if (cpi->this_frame_target < min_frame_target)
- cpi->this_frame_target = min_frame_target;
-
- if (!cpi->common.refresh_alt_ref_frame)
- // Note the baseline target data rate for this inter frame.
- cpi->inter_frame_target = cpi->this_frame_target;
-
- // Adjust target frame size for Golden Frames:
- if (cpi->frames_till_gf_update_due == 0) {
- // int Boost = 0;
- int Q = (cpi->oxcf.fixed_q < 0) ? cpi->last_q[INTER_FRAME] : cpi->oxcf.fixed_q;
-
- cpi->common.refresh_golden_frame = TRUE;
-
- calc_gf_params(cpi);
-
-    // If we are using an alternate ref instead of a gf then do not apply the boost here;
-    // it will instead be applied to the altref update.
-    // Jim's modified boost
- if (!cpi->source_alt_ref_active) {
- if (cpi->oxcf.fixed_q < 0) {
- // The spend on the GF is defined in the two pass code
- // for two pass encodes
- cpi->this_frame_target = cpi->per_frame_bandwidth;
- } else
- cpi->this_frame_target =
- (estimate_bits_at_q(1, Q, cpi->common.MBs, 1.0)
- * cpi->last_boost) / 100;
-
- }
-    // If there is an active ARF at this location, use the minimum
-    // bits on this frame even if it is a constructed arf.
-    // The active maximum quantizer ensures that an appropriate
-    // number of bits will be spent if needed for constructed ARFs.
- else {
- cpi->this_frame_target = 0;
- }
-
- cpi->current_gf_interval = cpi->frames_till_gf_update_due;
- }
-}
-
-
-void vp9_update_rate_correction_factors(VP9_COMP *cpi, int damp_var) {
- int Q = cpi->common.base_qindex;
- int correction_factor = 100;
- double rate_correction_factor;
- double adjustment_limit;
-
- int projected_size_based_on_q = 0;
-
- // Clear down mmx registers to allow floating point in what follows
- vp9_clear_system_state(); // __asm emms;
-
- if (cpi->common.frame_type == KEY_FRAME) {
- rate_correction_factor = cpi->key_frame_rate_correction_factor;
- } else {
- if (cpi->common.refresh_alt_ref_frame || cpi->common.refresh_golden_frame)
- rate_correction_factor = cpi->gf_rate_correction_factor;
- else
- rate_correction_factor = cpi->rate_correction_factor;
- }
-
- // Work out how big we would have expected the frame to be at this Q given the current correction factor.
- // Stay in double to avoid int overflow when values are large
- projected_size_based_on_q =
- (int)(((.5 + rate_correction_factor *
- vp9_bits_per_mb(cpi->common.frame_type, Q)) *
- cpi->common.MBs) / (1 << BPER_MB_NORMBITS));
-
- // Make some allowance for cpi->zbin_over_quant
- if (cpi->zbin_over_quant > 0) {
- int Z = cpi->zbin_over_quant;
- double Factor = 0.99;
- double factor_adjustment = 0.01 / 256.0; // (double)ZBIN_OQ_MAX;
-
- while (Z > 0) {
- Z--;
- projected_size_based_on_q =
- (int)(Factor * projected_size_based_on_q);
- Factor += factor_adjustment;
-
- if (Factor >= 0.999)
- Factor = 0.999;
- }
- }
-
- // Work out a size correction factor.
- // if ( cpi->this_frame_target > 0 )
- // correction_factor = (100 * cpi->projected_frame_size) / cpi->this_frame_target;
- if (projected_size_based_on_q > 0)
- correction_factor = (100 * cpi->projected_frame_size) / projected_size_based_on_q;
-
- // More heavily damped adjustment used if we have been oscillating either side of target
- switch (damp_var) {
- case 0:
- adjustment_limit = 0.75;
- break;
- case 1:
- adjustment_limit = 0.375;
- break;
- case 2:
- default:
- adjustment_limit = 0.25;
- break;
- }
-
- // if ( (correction_factor > 102) && (Q < cpi->active_worst_quality) )
- if (correction_factor > 102) {
- // We are not already at the worst allowable quality
- correction_factor = (int)(100.5 + ((correction_factor - 100) * adjustment_limit));
- rate_correction_factor = ((rate_correction_factor * correction_factor) / 100);
-
- // Keep rate_correction_factor within limits
- if (rate_correction_factor > MAX_BPB_FACTOR)
- rate_correction_factor = MAX_BPB_FACTOR;
- }
- // else if ( (correction_factor < 99) && (Q > cpi->active_best_quality) )
- else if (correction_factor < 99) {
- // We are not already at the best allowable quality
- correction_factor = (int)(100.5 - ((100 - correction_factor) * adjustment_limit));
- rate_correction_factor = ((rate_correction_factor * correction_factor) / 100);
-
- // Keep rate_correction_factor within limits
- if (rate_correction_factor < MIN_BPB_FACTOR)
- rate_correction_factor = MIN_BPB_FACTOR;
- }
-
- if (cpi->common.frame_type == KEY_FRAME)
- cpi->key_frame_rate_correction_factor = rate_correction_factor;
- else {
- if (cpi->common.refresh_alt_ref_frame || cpi->common.refresh_golden_frame)
- cpi->gf_rate_correction_factor = rate_correction_factor;
- else
- cpi->rate_correction_factor = rate_correction_factor;
- }
-}
-
-
-int vp9_regulate_q(VP9_COMP *cpi, int target_bits_per_frame) {
- int Q = cpi->active_worst_quality;
-
- int i;
- int last_error = INT_MAX;
- int target_bits_per_mb;
- int bits_per_mb_at_this_q;
- double correction_factor;
-
- // Reset Zbin OQ value
- cpi->zbin_over_quant = 0;
-
- // Select the appropriate correction factor based upon type of frame.
- if (cpi->common.frame_type == KEY_FRAME)
- correction_factor = cpi->key_frame_rate_correction_factor;
- else {
- if (cpi->common.refresh_alt_ref_frame || cpi->common.refresh_golden_frame)
- correction_factor = cpi->gf_rate_correction_factor;
- else
- correction_factor = cpi->rate_correction_factor;
- }
-
- // Calculate required scaling factor based on target frame size and size of frame produced using previous Q
- if (target_bits_per_frame >= (INT_MAX >> BPER_MB_NORMBITS))
- target_bits_per_mb = (target_bits_per_frame / cpi->common.MBs) << BPER_MB_NORMBITS; // Case where we would overflow int
- else
- target_bits_per_mb = (target_bits_per_frame << BPER_MB_NORMBITS) / cpi->common.MBs;
-
- i = cpi->active_best_quality;
-
- do {
- bits_per_mb_at_this_q =
- (int)(.5 + correction_factor *
- vp9_bits_per_mb(cpi->common.frame_type, i));
-
- if (bits_per_mb_at_this_q <= target_bits_per_mb) {
- if ((target_bits_per_mb - bits_per_mb_at_this_q) <= last_error)
- Q = i;
- else
- Q = i - 1;
-
- break;
- } else
- last_error = bits_per_mb_at_this_q - target_bits_per_mb;
- } while (++i <= cpi->active_worst_quality);
-
-
- // If we are at MAXQ then enable Q over-run which seeks to claw back additional bits through things like
- // the RD multiplier and zero bin size.
- if (Q >= MAXQ) {
- int zbin_oqmax;
-
- double Factor = 0.99;
- double factor_adjustment = 0.01 / 256.0; // (double)ZBIN_OQ_MAX;
-
- if (cpi->common.frame_type == KEY_FRAME)
- zbin_oqmax = 0; // ZBIN_OQ_MAX/16
- else if (cpi->common.refresh_alt_ref_frame || (cpi->common.refresh_golden_frame && !cpi->source_alt_ref_active))
- zbin_oqmax = 16;
- else
- zbin_oqmax = ZBIN_OQ_MAX;
-
-    // Each increment in the zbin is assumed to have a fixed effect on bitrate. This is of course not true.
-    // The effect will be highly clip dependent and may well have sudden steps.
-    // The idea here is to achieve higher effective quantizers than the normal maximum by expanding the zero
-    // bin and hence decreasing the number of low-magnitude non-zero coefficients.
- while (cpi->zbin_over_quant < zbin_oqmax) {
- cpi->zbin_over_quant++;
-
- if (cpi->zbin_over_quant > zbin_oqmax)
- cpi->zbin_over_quant = zbin_oqmax;
-
- // Adjust bits_per_mb_at_this_q estimate
- bits_per_mb_at_this_q = (int)(Factor * bits_per_mb_at_this_q);
- Factor += factor_adjustment;
-
- if (Factor >= 0.999)
- Factor = 0.999;
-
- if (bits_per_mb_at_this_q <= target_bits_per_mb) // Break out if we get down to the target rate
- break;
- }
-
- }
-
- return Q;
-}
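For reference, a minimal standalone sketch of the Q over-run behaviour described in the comments above; every name and number here is illustrative and not part of the library. Each zbin step scales the bits-per-MB estimate by a factor that starts at 0.99 and saturates at 0.999, so successive steps claw back progressively fewer bits.

/* Hypothetical helper: count zbin steps until the estimate reaches the target. */
static int zbin_steps_needed(int bits_per_mb, int target_bits_per_mb,
                             int zbin_oqmax) {
  double factor = 0.99;
  int steps = 0;
  while (steps < zbin_oqmax && bits_per_mb > target_bits_per_mb) {
    steps++;
    bits_per_mb = (int)(factor * bits_per_mb);  /* shrink the estimate */
    factor += 0.01 / 256.0;                     /* same step size as above */
    if (factor >= 0.999)
      factor = 0.999;                           /* diminishing returns */
  }
  return steps;
}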
-
-
-static int estimate_keyframe_frequency(VP9_COMP *cpi) {
- int i;
-
- // Average key frame frequency
- int av_key_frame_frequency = 0;
-
- /* First key frame at start of sequence is a special case. We have no
- * frequency data.
- */
- if (cpi->key_frame_count == 1) {
- /* Assume a default of 1 kf every 2 seconds, or the max kf interval,
- * whichever is smaller.
- */
- int key_freq = cpi->oxcf.key_freq > 0 ? cpi->oxcf.key_freq : 1;
- av_key_frame_frequency = (int)cpi->output_frame_rate * 2;
-
- if (cpi->oxcf.auto_key && av_key_frame_frequency > key_freq)
- av_key_frame_frequency = cpi->oxcf.key_freq;
-
- cpi->prior_key_frame_distance[KEY_FRAME_CONTEXT - 1]
- = av_key_frame_frequency;
- } else {
- unsigned int total_weight = 0;
- int last_kf_interval =
- (cpi->frames_since_key > 0) ? cpi->frames_since_key : 1;
-
- /* reset keyframe context and calculate weighted average of last
- * KEY_FRAME_CONTEXT keyframes
- */
- for (i = 0; i < KEY_FRAME_CONTEXT; i++) {
- if (i < KEY_FRAME_CONTEXT - 1)
- cpi->prior_key_frame_distance[i]
- = cpi->prior_key_frame_distance[i + 1];
- else
- cpi->prior_key_frame_distance[i] = last_kf_interval;
-
- av_key_frame_frequency += prior_key_frame_weight[i]
- * cpi->prior_key_frame_distance[i];
- total_weight += prior_key_frame_weight[i];
- }
-
- av_key_frame_frequency /= total_weight;
-
- }
- return av_key_frame_frequency;
-}
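The weighted average above can be stated compactly; a hedged sketch where the arrays stand in for prior_key_frame_distance[] and prior_key_frame_weight[]:

/* Illustrative only: weighted mean of recent keyframe intervals. */
static int weighted_kf_interval(const int *intervals, const int *weights,
                                int n) {
  int i, sum = 0, total_weight = 0;
  for (i = 0; i < n; i++) {
    sum += weights[i] * intervals[i];
    total_weight += weights[i];
  }
  return total_weight > 0 ? sum / total_weight : 0;
}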
-
-
-void vp9_adjust_key_frame_context(VP9_COMP *cpi) {
- // Clear down mmx registers to allow floating point in what follows
- vp9_clear_system_state();
-
- cpi->frames_since_key = 0;
- cpi->key_frame_count++;
-}
-
-
-void vp9_compute_frame_size_bounds(VP9_COMP *cpi, int *frame_under_shoot_limit,
- int *frame_over_shoot_limit) {
- // Set-up bounds on acceptable frame size:
- if (cpi->oxcf.fixed_q >= 0) {
- // Fixed Q scenario: frame size never outranges target (there is no target!)
- *frame_under_shoot_limit = 0;
- *frame_over_shoot_limit = INT_MAX;
- } else {
- if (cpi->common.frame_type == KEY_FRAME) {
- *frame_over_shoot_limit = cpi->this_frame_target * 9 / 8;
- *frame_under_shoot_limit = cpi->this_frame_target * 7 / 8;
- } else {
- if (cpi->common.refresh_alt_ref_frame || cpi->common.refresh_golden_frame) {
- *frame_over_shoot_limit = cpi->this_frame_target * 9 / 8;
- *frame_under_shoot_limit = cpi->this_frame_target * 7 / 8;
- } else {
- // Strong overshoot limit for constrained quality
- if (cpi->oxcf.end_usage == USAGE_CONSTRAINED_QUALITY) {
- *frame_over_shoot_limit = cpi->this_frame_target * 11 / 8;
- *frame_under_shoot_limit = cpi->this_frame_target * 2 / 8;
- } else {
- *frame_over_shoot_limit = cpi->this_frame_target * 11 / 8;
- *frame_under_shoot_limit = cpi->this_frame_target * 5 / 8;
- }
- }
- }
-
- // For very small rate targets where the fractional adjustment
- // (e.g. * 7/8) may be tiny, make sure there is at least a minimum
- // range.
- *frame_over_shoot_limit += 200;
- *frame_under_shoot_limit -= 200;
- if (*frame_under_shoot_limit < 0)
- *frame_under_shoot_limit = 0;
- }
-}
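To give a sense of scale (numbers purely illustrative): a key frame with this_frame_target = 10000 bits gets an overshoot limit of 10000 * 9 / 8 + 200 = 11450 bits and an undershoot limit of 10000 * 7 / 8 - 200 = 8550 bits; the fixed +/-200 keeps the window from collapsing when the target itself is tiny.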
-
-
-// return of 0 means drop frame
-int vp9_pick_frame_size(VP9_COMP *cpi) {
- VP9_COMMON *cm = &cpi->common;
-
- if (cm->frame_type == KEY_FRAME)
- calc_iframe_target_size(cpi);
- else
- calc_pframe_target_size(cpi);
-
- return 1;
-}
--- a/vp8/encoder/ratectrl.h
+++ /dev/null
@@ -1,37 +1,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#if !defined __INC_RATECTRL_H
-
-#include "onyx_int.h"
-
-#define FRAME_OVERHEAD_BITS 200
-
-extern void vp9_save_coding_context(VP9_COMP *cpi);
-extern void vp9_restore_coding_context(VP9_COMP *cpi);
-
-extern void vp9_setup_key_frame(VP9_COMP *cpi);
-extern void vp9_update_rate_correction_factors(VP9_COMP *cpi, int damp_var);
-extern int vp9_regulate_q(VP9_COMP *cpi, int target_bits_per_frame);
-extern void vp9_adjust_key_frame_context(VP9_COMP *cpi);
-extern void vp9_compute_frame_size_bounds(VP9_COMP *cpi,
- int *frame_under_shoot_limit,
- int *frame_over_shoot_limit);
-
-// return of 0 means drop frame
-extern int vp9_pick_frame_size(VP9_COMP *cpi);
-
-extern double vp9_convert_qindex_to_q(int qindex);
-extern int vp9_gfboost_qadjust(int qindex);
-extern int vp9_bits_per_mb(FRAME_TYPE frame_type, int qindex);
-void vp9_setup_inter_frame(VP9_COMP *cpi);
-
-#endif
--- a/vp8/encoder/rdopt.c
+++ /dev/null
@@ -1,4854 +1,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include <stdio.h>
-#include <math.h>
-#include <limits.h>
-#include <assert.h>
-#include "vp8/common/pragmas.h"
-
-#include "tokenize.h"
-#include "treewriter.h"
-#include "onyx_int.h"
-#include "modecosts.h"
-#include "encodeintra.h"
-#include "vp8/common/entropymode.h"
-#include "vp8/common/reconinter.h"
-#include "vp8/common/reconintra.h"
-#include "vp8/common/reconintra4x4.h"
-#include "vp8/common/findnearmv.h"
-#include "vp8/common/quant_common.h"
-#include "encodemb.h"
-#include "quantize.h"
-#include "vp8/common/idct.h"
-#include "variance.h"
-#include "mcomp.h"
-#include "rdopt.h"
-#include "ratectrl.h"
-#include "vpx_mem/vpx_mem.h"
-#include "vp8/common/systemdependent.h"
-#include "vp8/encoder/encodemv.h"
-
-#include "vp8/common/seg_common.h"
-#include "vp8/common/pred_common.h"
-#include "vp8/common/entropy.h"
-#include "vpx_rtcd.h"
-#if CONFIG_NEWBESTREFMV
-#include "vp8/common/mvref_common.h"
-#endif
-
-#if CONFIG_RUNTIME_CPU_DETECT
-#define IF_RTCD(x) (x)
-#else
-#define IF_RTCD(x) NULL
-#endif
-
-extern void vp9_mb_init_quantizer(VP9_COMP *cpi, MACROBLOCK *x);
-extern void vp9_update_zbin_extra(VP9_COMP *cpi, MACROBLOCK *x);
-
-#define MAXF(a,b) (((a) > (b)) ? (a) : (b))
-
-#define INVALID_MV 0x80008000
-
-/* Factor to weigh the rate for switchable interp filters */
-#define SWITCHABLE_INTERP_RATE_FACTOR 1
-
-static const int auto_speed_thresh[17] = {
- 1000,
- 200,
- 150,
- 130,
- 150,
- 125,
- 120,
- 115,
- 115,
- 115,
- 115,
- 115,
- 115,
- 115,
- 115,
- 115,
- 105
-};
-
-#if CONFIG_PRED_FILTER
-const MODE_DEFINITION vp9_mode_order[MAX_MODES] = {
- {ZEROMV, LAST_FRAME, 0, 0},
- {ZEROMV, LAST_FRAME, 0, 1},
- {DC_PRED, INTRA_FRAME, 0, 0},
-
- {NEARESTMV, LAST_FRAME, 0, 0},
- {NEARESTMV, LAST_FRAME, 0, 1},
- {NEARMV, LAST_FRAME, 0, 0},
- {NEARMV, LAST_FRAME, 0, 1},
-
- {ZEROMV, GOLDEN_FRAME, 0, 0},
- {ZEROMV, GOLDEN_FRAME, 0, 1},
- {NEARESTMV, GOLDEN_FRAME, 0, 0},
- {NEARESTMV, GOLDEN_FRAME, 0, 1},
-
- {ZEROMV, ALTREF_FRAME, 0, 0},
- {ZEROMV, ALTREF_FRAME, 0, 1},
- {NEARESTMV, ALTREF_FRAME, 0, 0},
- {NEARESTMV, ALTREF_FRAME, 0, 1},
-
- {NEARMV, GOLDEN_FRAME, 0, 0},
- {NEARMV, GOLDEN_FRAME, 0, 1},
- {NEARMV, ALTREF_FRAME, 0, 0},
- {NEARMV, ALTREF_FRAME, 0, 1},
-
- {V_PRED, INTRA_FRAME, 0, 0},
- {H_PRED, INTRA_FRAME, 0, 0},
- {D45_PRED, INTRA_FRAME, 0, 0},
- {D135_PRED, INTRA_FRAME, 0, 0},
- {D117_PRED, INTRA_FRAME, 0, 0},
- {D153_PRED, INTRA_FRAME, 0, 0},
- {D27_PRED, INTRA_FRAME, 0, 0},
- {D63_PRED, INTRA_FRAME, 0, 0},
-
- {TM_PRED, INTRA_FRAME, 0, 0},
-
- {NEWMV, LAST_FRAME, 0, 0},
- {NEWMV, LAST_FRAME, 0, 1},
- {NEWMV, GOLDEN_FRAME, 0, 0},
- {NEWMV, GOLDEN_FRAME, 0, 1},
- {NEWMV, ALTREF_FRAME, 0, 0},
- {NEWMV, ALTREF_FRAME, 0, 1},
-
- {SPLITMV, LAST_FRAME, 0, 0},
- {SPLITMV, GOLDEN_FRAME, 0, 0},
- {SPLITMV, ALTREF_FRAME, 0, 0},
-
- {B_PRED, INTRA_FRAME, 0, 0},
- {I8X8_PRED, INTRA_FRAME, 0, 0},
-
- /* compound prediction modes */
- {ZEROMV, LAST_FRAME, GOLDEN_FRAME, 0},
- {NEARESTMV, LAST_FRAME, GOLDEN_FRAME, 0},
- {NEARMV, LAST_FRAME, GOLDEN_FRAME, 0},
-
- {ZEROMV, ALTREF_FRAME, LAST_FRAME, 0},
- {NEARESTMV, ALTREF_FRAME, LAST_FRAME, 0},
- {NEARMV, ALTREF_FRAME, LAST_FRAME, 0},
-
- {ZEROMV, GOLDEN_FRAME, ALTREF_FRAME, 0},
- {NEARESTMV, GOLDEN_FRAME, ALTREF_FRAME, 0},
- {NEARMV, GOLDEN_FRAME, ALTREF_FRAME, 0},
-
- {NEWMV, LAST_FRAME, GOLDEN_FRAME, 0},
- {NEWMV, ALTREF_FRAME, LAST_FRAME, 0},
- {NEWMV, GOLDEN_FRAME, ALTREF_FRAME, 0},
-
- {SPLITMV, LAST_FRAME, GOLDEN_FRAME, 0},
- {SPLITMV, ALTREF_FRAME, LAST_FRAME, 0},
- {SPLITMV, GOLDEN_FRAME, ALTREF_FRAME, 0}
-};
-#else
-const MODE_DEFINITION vp9_mode_order[MAX_MODES] = {
- {ZEROMV, LAST_FRAME, 0},
- {DC_PRED, INTRA_FRAME, 0},
-
- {NEARESTMV, LAST_FRAME, 0},
- {NEARMV, LAST_FRAME, 0},
-
- {ZEROMV, GOLDEN_FRAME, 0},
- {NEARESTMV, GOLDEN_FRAME, 0},
-
- {ZEROMV, ALTREF_FRAME, 0},
- {NEARESTMV, ALTREF_FRAME, 0},
-
- {NEARMV, GOLDEN_FRAME, 0},
- {NEARMV, ALTREF_FRAME, 0},
-
- {V_PRED, INTRA_FRAME, 0},
- {H_PRED, INTRA_FRAME, 0},
- {D45_PRED, INTRA_FRAME, 0},
- {D135_PRED, INTRA_FRAME, 0},
- {D117_PRED, INTRA_FRAME, 0},
- {D153_PRED, INTRA_FRAME, 0},
- {D27_PRED, INTRA_FRAME, 0},
- {D63_PRED, INTRA_FRAME, 0},
-
- {TM_PRED, INTRA_FRAME, 0},
-
- {NEWMV, LAST_FRAME, 0},
- {NEWMV, GOLDEN_FRAME, 0},
- {NEWMV, ALTREF_FRAME, 0},
-
- {SPLITMV, LAST_FRAME, 0},
- {SPLITMV, GOLDEN_FRAME, 0},
- {SPLITMV, ALTREF_FRAME, 0},
-
- {B_PRED, INTRA_FRAME, 0},
- {I8X8_PRED, INTRA_FRAME, 0},
-
- /* compound prediction modes */
- {ZEROMV, LAST_FRAME, GOLDEN_FRAME},
- {NEARESTMV, LAST_FRAME, GOLDEN_FRAME},
- {NEARMV, LAST_FRAME, GOLDEN_FRAME},
-
- {ZEROMV, ALTREF_FRAME, LAST_FRAME},
- {NEARESTMV, ALTREF_FRAME, LAST_FRAME},
- {NEARMV, ALTREF_FRAME, LAST_FRAME},
-
- {ZEROMV, GOLDEN_FRAME, ALTREF_FRAME},
- {NEARESTMV, GOLDEN_FRAME, ALTREF_FRAME},
- {NEARMV, GOLDEN_FRAME, ALTREF_FRAME},
-
- {NEWMV, LAST_FRAME, GOLDEN_FRAME},
- {NEWMV, ALTREF_FRAME, LAST_FRAME },
- {NEWMV, GOLDEN_FRAME, ALTREF_FRAME},
-
- {SPLITMV, LAST_FRAME, GOLDEN_FRAME},
- {SPLITMV, ALTREF_FRAME, LAST_FRAME },
- {SPLITMV, GOLDEN_FRAME, ALTREF_FRAME}
-};
-#endif
-
-static void fill_token_costs(
- unsigned int (*c)[COEF_BANDS][PREV_COEF_CONTEXTS][MAX_ENTROPY_TOKENS],
- const vp9_prob(*p)[COEF_BANDS][PREV_COEF_CONTEXTS][ENTROPY_NODES],
- int block_type_counts) {
- int i, j, k;
-
- for (i = 0; i < block_type_counts; i++)
- for (j = 0; j < COEF_BANDS; j++)
- for (k = 0; k < PREV_COEF_CONTEXTS; k++) {
- if (k == 0 && ((j > 0 && i > 0) || (j > 1 && i == 0)))
- vp9_cost_tokens_skip((int *)(c[i][j][k]),
- p[i][j][k],
- vp9_coef_tree);
- else
- vp9_cost_tokens((int *)(c[i][j][k]),
- p[i][j][k],
- vp9_coef_tree);
- }
-}
-
-
-static int rd_iifactor[32] = { 4, 4, 3, 2, 1, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, };
-
-// 3* dc_qlookup[Q]*dc_qlookup[Q];
-
-/* values are now correlated to quantizer */
-static int sad_per_bit16lut[QINDEX_RANGE];
-static int sad_per_bit4lut[QINDEX_RANGE];
-
-void vp9_init_me_luts() {
- int i;
-
- // Initialize the sad lut tables using a formulaic calculation for now
- // This is to make it easier to resolve the impact of experimental changes
- // to the quantizer tables.
- for (i = 0; i < QINDEX_RANGE; i++) {
- sad_per_bit16lut[i] =
- (int)((0.0418 * vp9_convert_qindex_to_q(i)) + 2.4107);
- sad_per_bit4lut[i] = (int)((0.063 * vp9_convert_qindex_to_q(i)) + 2.742);
- }
-}
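As a hedged numeric example of the lut formula above: if vp9_convert_qindex_to_q(i) returned 40 for some index i (an assumed value, not a table entry), sad_per_bit16lut[i] would be (int)(0.0418 * 40 + 2.4107) = (int)4.0827 = 4.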
-
-static int compute_rd_mult(int qindex) {
- int q;
-
- q = vp9_dc_quant(qindex, 0);
- return (11 * q * q) >> 6;
-}
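A worked example with an assumed quantizer value: if vp9_dc_quant(qindex, 0) returned q = 32, compute_rd_mult() would give (11 * 32 * 32) >> 6 = 11264 >> 6 = 176.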
-
-void vp9_initialize_me_consts(VP9_COMP *cpi, int QIndex) {
- cpi->mb.sadperbit16 = sad_per_bit16lut[QIndex];
- cpi->mb.sadperbit4 = sad_per_bit4lut[QIndex];
-}
-
-
-void vp9_initialize_rd_consts(VP9_COMP *cpi, int QIndex) {
- int q, i;
-
- vp9_clear_system_state(); // __asm emms;
-
- // Further tests required to see if optimum is different
- // for key frames, golden frames and arf frames.
- // if (cpi->common.refresh_golden_frame ||
- // cpi->common.refresh_alt_ref_frame)
- QIndex = (QIndex < 0) ? 0 : ((QIndex > MAXQ) ? MAXQ : QIndex);
-
- cpi->RDMULT = compute_rd_mult(QIndex);
-
- // Extend the rate multiplier alongside quantizer zbin increases
- if (cpi->zbin_over_quant > 0) {
- double oq_factor;
-
- // Experimental code using the same basic equation as used for Q above
- // The units of cpi->zbin_over_quant are 1/128 of Q bin size
- oq_factor = 1.0 + ((double)0.0015625 * cpi->zbin_over_quant);
- cpi->RDMULT = (int)((double)cpi->RDMULT * oq_factor * oq_factor);
- }
-
- if (cpi->pass == 2 && (cpi->common.frame_type != KEY_FRAME)) {
- if (cpi->twopass.next_iiratio > 31)
- cpi->RDMULT += (cpi->RDMULT * rd_iifactor[31]) >> 4;
- else
- cpi->RDMULT +=
- (cpi->RDMULT * rd_iifactor[cpi->twopass.next_iiratio]) >> 4;
- }
-
- if (cpi->RDMULT < 7)
- cpi->RDMULT = 7;
-
- cpi->mb.errorperbit = (cpi->RDMULT / 110);
- cpi->mb.errorperbit += (cpi->mb.errorperbit == 0);
-
- vp9_set_speed_features(cpi);
-
- q = (int)pow(vp9_dc_quant(QIndex, 0) >> 2, 1.25);
- q = q << 2;
- cpi->RDMULT = cpi->RDMULT << 4;
-
- if (q < 8)
- q = 8;
-
- if (cpi->RDMULT > 1000) {
- cpi->RDDIV = 1;
- cpi->RDMULT /= 100;
-
- for (i = 0; i < MAX_MODES; i++) {
- if (cpi->sf.thresh_mult[i] < INT_MAX) {
- cpi->rd_threshes[i] = cpi->sf.thresh_mult[i] * q / 100;
- } else {
- cpi->rd_threshes[i] = INT_MAX;
- }
-
- cpi->rd_baseline_thresh[i] = cpi->rd_threshes[i];
- }
- } else {
- cpi->RDDIV = 100;
-
- for (i = 0; i < MAX_MODES; i++) {
- if (cpi->sf.thresh_mult[i] < (INT_MAX / q)) {
- cpi->rd_threshes[i] = cpi->sf.thresh_mult[i] * q;
- } else {
- cpi->rd_threshes[i] = INT_MAX;
- }
-
- cpi->rd_baseline_thresh[i] = cpi->rd_threshes[i];
- }
- }
-
- fill_token_costs(
- cpi->mb.token_costs[TX_4X4],
- (const vp9_prob( *)[8][PREV_COEF_CONTEXTS][11]) cpi->common.fc.coef_probs,
- BLOCK_TYPES);
- fill_token_costs(
- cpi->mb.hybrid_token_costs[TX_4X4],
- (const vp9_prob( *)[8][PREV_COEF_CONTEXTS][11])
- cpi->common.fc.hybrid_coef_probs,
- BLOCK_TYPES);
-
- fill_token_costs(
- cpi->mb.token_costs[TX_8X8],
- (const vp9_prob( *)[8][PREV_COEF_CONTEXTS][11]) cpi->common.fc.coef_probs_8x8,
- BLOCK_TYPES_8X8);
- fill_token_costs(
- cpi->mb.hybrid_token_costs[TX_8X8],
- (const vp9_prob( *)[8][PREV_COEF_CONTEXTS][11])
- cpi->common.fc.hybrid_coef_probs_8x8,
- BLOCK_TYPES_8X8);
-
- fill_token_costs(
- cpi->mb.token_costs[TX_16X16],
- (const vp9_prob(*)[8][PREV_COEF_CONTEXTS][11]) cpi->common.fc.coef_probs_16x16,
- BLOCK_TYPES_16X16);
- fill_token_costs(
- cpi->mb.hybrid_token_costs[TX_16X16],
- (const vp9_prob(*)[8][PREV_COEF_CONTEXTS][11])
- cpi->common.fc.hybrid_coef_probs_16x16,
- BLOCK_TYPES_16X16);
-
- /*rough estimate for costing*/
- cpi->common.kf_ymode_probs_index = cpi->common.base_qindex >> 4;
- vp9_init_mode_costs(cpi);
-
- if (cpi->common.frame_type != KEY_FRAME)
- {
- vp9_build_nmv_cost_table(
- cpi->mb.nmvjointcost,
- cpi->mb.e_mbd.allow_high_precision_mv ?
- cpi->mb.nmvcost_hp : cpi->mb.nmvcost,
- &cpi->common.fc.nmvc,
- cpi->mb.e_mbd.allow_high_precision_mv, 1, 1);
- }
-}
-
-void vp9_auto_select_speed(VP9_COMP *cpi) {
- int milliseconds_for_compress = (int)(1000000 / cpi->oxcf.frame_rate);
-
- milliseconds_for_compress = milliseconds_for_compress * (16 - cpi->oxcf.cpu_used) / 16;
-
- /*
- // this is done during parameter valid check
- if( cpi->oxcf.cpu_used > 16)
- cpi->oxcf.cpu_used = 16;
- if( cpi->oxcf.cpu_used < -16)
- cpi->oxcf.cpu_used = -16;
- */
-
- if (cpi->avg_pick_mode_time < milliseconds_for_compress &&
- (cpi->avg_encode_time - cpi->avg_pick_mode_time) <
- milliseconds_for_compress) {
- if (cpi->avg_pick_mode_time == 0) {
- cpi->Speed = 4;
- } else {
- if (milliseconds_for_compress * 100 < cpi->avg_encode_time * 95) {
- cpi->Speed += 2;
- cpi->avg_pick_mode_time = 0;
- cpi->avg_encode_time = 0;
-
- if (cpi->Speed > 16) {
- cpi->Speed = 16;
- }
- }
-
- if (milliseconds_for_compress * 100 >
- cpi->avg_encode_time * auto_speed_thresh[cpi->Speed]) {
- cpi->Speed -= 1;
- cpi->avg_pick_mode_time = 0;
- cpi->avg_encode_time = 0;
-
- // In real-time mode, cpi->speed is in [4, 16].
- if (cpi->Speed < 4) { // if ( cpi->Speed < 0 )
- cpi->Speed = 4; // cpi->Speed = 0;
- }
- }
- }
- } else {
- cpi->Speed += 4;
-
- if (cpi->Speed > 16)
- cpi->Speed = 16;
-
-
- cpi->avg_pick_mode_time = 0;
- cpi->avg_encode_time = 0;
- }
-}
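For illustration (assumed settings, not defaults): with frame_rate = 30 and cpu_used = 8, the per-frame budget works out to (int)(1000000 / 30) * (16 - 8) / 16 = 33333 * 8 / 16 = 16666, and the pick-mode and encode timings are compared against that figure to nudge cpi->Speed up or down.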
-
-int vp9_block_error_c(short *coeff, short *dqcoeff, int block_size) {
- int i, error = 0;
-
- for (i = 0; i < block_size; i++) {
- int this_diff = coeff[i] - dqcoeff[i];
- error += this_diff * this_diff;
- }
-
- return error;
-}
-
-int vp9_mbblock_error_c(MACROBLOCK *mb, int dc) {
- BLOCK *be;
- BLOCKD *bd;
- int i, j;
- int berror, error = 0;
-
- for (i = 0; i < 16; i++) {
- be = &mb->block[i];
- bd = &mb->e_mbd.block[i];
-
- berror = 0;
-
- for (j = dc; j < 16; j++) {
- int this_diff = be->coeff[j] - bd->dqcoeff[j];
- berror += this_diff * this_diff;
- }
-
- error += berror;
- }
-
- return error;
-}
-
-int vp9_mbuverror_c(MACROBLOCK *mb) {
- BLOCK *be;
- BLOCKD *bd;
-
- int i, error = 0;
-
- for (i = 16; i < 24; i++) {
- be = &mb->block[i];
- bd = &mb->e_mbd.block[i];
-
- error += vp9_block_error_c(be->coeff, bd->dqcoeff, 16);
- }
-
- return error;
-}
-
-int vp9_uvsse(MACROBLOCK *x) {
- unsigned char *uptr, *vptr;
- unsigned char *upred_ptr = (*(x->block[16].base_src) + x->block[16].src);
- unsigned char *vpred_ptr = (*(x->block[20].base_src) + x->block[20].src);
- int uv_stride = x->block[16].src_stride;
-
- unsigned int sse1 = 0;
- unsigned int sse2 = 0;
- int mv_row = x->e_mbd.mode_info_context->mbmi.mv[0].as_mv.row;
- int mv_col = x->e_mbd.mode_info_context->mbmi.mv[0].as_mv.col;
- int offset;
- int pre_stride = x->e_mbd.block[16].pre_stride;
-
- if (mv_row < 0)
- mv_row -= 1;
- else
- mv_row += 1;
-
- if (mv_col < 0)
- mv_col -= 1;
- else
- mv_col += 1;
-
- mv_row /= 2;
- mv_col /= 2;
-
- offset = (mv_row >> 3) * pre_stride + (mv_col >> 3);
- uptr = x->e_mbd.pre.u_buffer + offset;
- vptr = x->e_mbd.pre.v_buffer + offset;
-
- if ((mv_row | mv_col) & 7) {
- vp9_sub_pixel_variance8x8(uptr, pre_stride, (mv_col & 7) << 1,
- (mv_row & 7) << 1, upred_ptr, uv_stride, &sse2);
- vp9_sub_pixel_variance8x8(vptr, pre_stride, (mv_col & 7) << 1,
- (mv_row & 7) << 1, vpred_ptr, uv_stride, &sse1);
- sse2 += sse1;
- } else {
- vp9_variance8x8(uptr, pre_stride, upred_ptr, uv_stride, &sse2);
- vp9_variance8x8(vptr, pre_stride, vpred_ptr, uv_stride, &sse1);
- sse2 += sse1;
- }
- return sse2;
-
-}
-
-static int cost_coeffs_2x2(MACROBLOCK *mb,
- BLOCKD *b, PLANE_TYPE type,
- ENTROPY_CONTEXT *a, ENTROPY_CONTEXT *l) {
- int c = (type == PLANE_TYPE_Y_NO_DC); /* start at coef 0, unless Y with Y2 */
- int eob = b->eob;
- int pt; /* surrounding block/prev coef predictor */
- int cost = 0;
- short *qcoeff_ptr = b->qcoeff;
-
- VP9_COMBINEENTROPYCONTEXTS(pt, *a, *l);
- assert(eob <= 4);
-
- for (; c < eob; c++) {
- int v = qcoeff_ptr[vp9_default_zig_zag1d[c]];
- int t = vp9_dct_value_tokens_ptr[v].Token;
- cost += mb->token_costs[TX_8X8][type][vp9_coef_bands[c]][pt][t];
- cost += vp9_dct_value_cost_ptr[v];
- pt = vp9_prev_token_class[t];
- }
-
- if (c < 4)
- cost += mb->token_costs[TX_8X8][type][vp9_coef_bands[c]]
- [pt] [DCT_EOB_TOKEN];
-
- pt = (c != !type); // is eob first coefficient;
- *a = *l = pt;
- return cost;
-}
-
-static int cost_coeffs(MACROBLOCK *mb, BLOCKD *b, PLANE_TYPE type,
- ENTROPY_CONTEXT *a, ENTROPY_CONTEXT *l,
- int tx_size) {
- const int eob = b->eob;
- int c = (type == PLANE_TYPE_Y_NO_DC); /* start at coef 0, unless Y with Y2 */
- int cost = 0, default_eob, seg_eob;
- int pt; /* surrounding block/prev coef predictor */
- int const *scan, *band;
- short *qcoeff_ptr = b->qcoeff;
- MACROBLOCKD *xd = &mb->e_mbd;
- MB_MODE_INFO *mbmi = &mb->e_mbd.mode_info_context->mbmi;
- TX_TYPE tx_type = DCT_DCT;
- int segment_id = mbmi->segment_id;
-
- switch (tx_size) {
- case TX_4X4:
- scan = vp9_default_zig_zag1d;
- band = vp9_coef_bands;
- default_eob = 16;
- if (type == PLANE_TYPE_Y_WITH_DC) {
- tx_type = get_tx_type_4x4(xd, b);
- if (tx_type != DCT_DCT) {
- switch (tx_type) {
- case ADST_DCT:
- scan = vp9_row_scan;
- break;
-
- case DCT_ADST:
- scan = vp9_col_scan;
- break;
-
- default:
- scan = vp9_default_zig_zag1d;
- break;
- }
- }
- }
-
- break;
- case TX_8X8:
- scan = vp9_default_zig_zag1d_8x8;
- band = vp9_coef_bands_8x8;
- default_eob = 64;
- if (type == PLANE_TYPE_Y_WITH_DC) {
- BLOCKD *bb;
- int ib = (b - xd->block);
- if (ib < 16) {
- ib = (ib & 8) + ((ib & 4) >> 1);
- bb = xd->block + ib;
- tx_type = get_tx_type_8x8(xd, bb);
- }
- }
- break;
- case TX_16X16:
- scan = vp9_default_zig_zag1d_16x16;
- band = vp9_coef_bands_16x16;
- default_eob = 256;
- if (type == PLANE_TYPE_Y_WITH_DC) {
- tx_type = get_tx_type_16x16(xd, b);
- }
- break;
- default:
- break;
- }
- if (vp9_segfeature_active(&mb->e_mbd, segment_id, SEG_LVL_EOB))
- seg_eob = vp9_get_segdata(&mb->e_mbd, segment_id, SEG_LVL_EOB);
- else
- seg_eob = default_eob;
-
- VP9_COMBINEENTROPYCONTEXTS(pt, *a, *l);
-
- if (tx_type != DCT_DCT) {
- for (; c < eob; c++) {
- int v = qcoeff_ptr[scan[c]];
- int t = vp9_dct_value_tokens_ptr[v].Token;
- cost += mb->hybrid_token_costs[tx_size][type][band[c]][pt][t];
- cost += vp9_dct_value_cost_ptr[v];
- pt = vp9_prev_token_class[t];
- }
- if (c < seg_eob)
- cost += mb->hybrid_token_costs[tx_size][type][band[c]]
- [pt][DCT_EOB_TOKEN];
- } else {
- for (; c < eob; c++) {
- int v = qcoeff_ptr[scan[c]];
- int t = vp9_dct_value_tokens_ptr[v].Token;
- cost += mb->token_costs[tx_size][type][band[c]][pt][t];
- cost += vp9_dct_value_cost_ptr[v];
- pt = vp9_prev_token_class[t];
- }
- if (c < seg_eob)
- cost += mb->token_costs[tx_size][type][band[c]]
- [pt][DCT_EOB_TOKEN];
- }
-
- pt = (c != !type); // is eob first coefficient;
- *a = *l = pt;
- return cost;
-}
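Restated as a formula (a paraphrase of the loop above, not a separate spec equation): cost = Σ_{c < eob} [ token_cost(tx_size, type, band[c], ctx_c, token_c) + value_cost(coeff_c) ], plus the cost of an explicit DCT_EOB_TOKEN when eob < seg_eob, where ctx_c is the running context updated from the previous token via vp9_prev_token_class.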
-
-static int rdcost_mby_4x4(MACROBLOCK *mb) {
- int cost = 0;
- int b;
- MACROBLOCKD *xd = &mb->e_mbd;
- ENTROPY_CONTEXT_PLANES t_above, t_left;
- ENTROPY_CONTEXT *ta;
- ENTROPY_CONTEXT *tl;
-
- vpx_memcpy(&t_above, xd->above_context, sizeof(ENTROPY_CONTEXT_PLANES));
- vpx_memcpy(&t_left, xd->left_context, sizeof(ENTROPY_CONTEXT_PLANES));
-
- ta = (ENTROPY_CONTEXT *)&t_above;
- tl = (ENTROPY_CONTEXT *)&t_left;
-
- for (b = 0; b < 16; b++)
- cost += cost_coeffs(mb, xd->block + b, PLANE_TYPE_Y_NO_DC,
- ta + vp9_block2above[b], tl + vp9_block2left[b],
- TX_4X4);
-
- cost += cost_coeffs(mb, xd->block + 24, PLANE_TYPE_Y2,
- ta + vp9_block2above[24], tl + vp9_block2left[24],
- TX_4X4);
-
- return cost;
-}
-
-static void macro_block_yrd_4x4(MACROBLOCK *mb,
- int *Rate,
- int *Distortion,
- const VP9_ENCODER_RTCD *rtcd,
- int *skippable) {
- int b;
- MACROBLOCKD *const xd = &mb->e_mbd;
- BLOCK *const mb_y2 = mb->block + 24;
- BLOCKD *const x_y2 = xd->block + 24;
- short *Y2DCPtr = mb_y2->src_diff;
- BLOCK *beptr;
- int d;
-
- vp9_subtract_mby(mb->src_diff, *(mb->block[0].base_src), xd->predictor,
- mb->block[0].src_stride);
-
- // Fdct and building the 2nd order block
- for (beptr = mb->block; beptr < mb->block + 16; beptr += 2) {
- mb->vp9_short_fdct8x4(beptr->src_diff, beptr->coeff, 32);
- *Y2DCPtr++ = beptr->coeff[0];
- *Y2DCPtr++ = beptr->coeff[16];
- }
-
- // 2nd order fdct
- mb->short_walsh4x4(mb_y2->src_diff, mb_y2->coeff, 8);
-
- // Quantization
- for (b = 0; b < 16; b++) {
- mb->quantize_b_4x4(&mb->block[b], &xd->block[b]);
- }
-
- // DC prediction and quantization of the 2nd order block
- mb->quantize_b_4x4(mb_y2, x_y2);
-
- // Distortion
- d = vp9_mbblock_error(mb, 1);
-
- d += vp9_block_error(mb_y2->coeff, x_y2->dqcoeff, 16);
-
- *Distortion = (d >> 2);
- // rate
- *Rate = rdcost_mby_4x4(mb);
- *skippable = vp9_mby_is_skippable_4x4(&mb->e_mbd, 1);
-}
-
-static int rdcost_mby_8x8(MACROBLOCK *mb, int backup) {
- int cost = 0;
- int b;
- MACROBLOCKD *xd = &mb->e_mbd;
- ENTROPY_CONTEXT_PLANES t_above, t_left;
- ENTROPY_CONTEXT *ta;
- ENTROPY_CONTEXT *tl;
-
- if (backup) {
- vpx_memcpy(&t_above,xd->above_context, sizeof(ENTROPY_CONTEXT_PLANES));
- vpx_memcpy(&t_left, xd->left_context, sizeof(ENTROPY_CONTEXT_PLANES));
-
- ta = (ENTROPY_CONTEXT *)&t_above;
- tl = (ENTROPY_CONTEXT *)&t_left;
- } else {
- ta = (ENTROPY_CONTEXT *)mb->e_mbd.above_context;
- tl = (ENTROPY_CONTEXT *)mb->e_mbd.left_context;
- }
-
- for (b = 0; b < 16; b += 4)
- cost += cost_coeffs(mb, xd->block + b, PLANE_TYPE_Y_NO_DC,
- ta + vp9_block2above_8x8[b], tl + vp9_block2left_8x8[b],
- TX_8X8);
-
- cost += cost_coeffs_2x2(mb, xd->block + 24, PLANE_TYPE_Y2,
- ta + vp9_block2above[24], tl + vp9_block2left[24]);
- return cost;
-}
-
-static void macro_block_yrd_8x8(MACROBLOCK *mb,
- int *Rate,
- int *Distortion,
- const VP9_ENCODER_RTCD *rtcd,
- int *skippable) {
- MACROBLOCKD *const xd = &mb->e_mbd;
- BLOCK *const mb_y2 = mb->block + 24;
- BLOCKD *const x_y2 = xd->block + 24;
- int d;
-
- vp9_subtract_mby(mb->src_diff, *(mb->block[0].base_src), xd->predictor,
- mb->block[0].src_stride);
-
- vp9_transform_mby_8x8(mb);
- vp9_quantize_mby_8x8(mb);
-
- /* remove 1st order dc to properly combine 1st/2nd order distortion */
- mb->coeff[0] = 0;
- mb->coeff[64] = 0;
- mb->coeff[128] = 0;
- mb->coeff[192] = 0;
- xd->dqcoeff[0] = 0;
- xd->dqcoeff[64] = 0;
- xd->dqcoeff[128] = 0;
- xd->dqcoeff[192] = 0;
-
- d = vp9_mbblock_error(mb, 0);
- d += vp9_block_error(mb_y2->coeff, x_y2->dqcoeff, 16);
-
- *Distortion = (d >> 2);
- // rate
- *Rate = rdcost_mby_8x8(mb, 1);
- *skippable = vp9_mby_is_skippable_8x8(&mb->e_mbd, 1);
-}
-
-static int rdcost_mby_16x16(MACROBLOCK *mb) {
- int cost;
- MACROBLOCKD *xd = &mb->e_mbd;
- ENTROPY_CONTEXT_PLANES t_above, t_left;
- ENTROPY_CONTEXT *ta, *tl;
-
- vpx_memcpy(&t_above, xd->above_context, sizeof(ENTROPY_CONTEXT_PLANES));
- vpx_memcpy(&t_left, xd->left_context, sizeof(ENTROPY_CONTEXT_PLANES));
-
- ta = (ENTROPY_CONTEXT *)&t_above;
- tl = (ENTROPY_CONTEXT *)&t_left;
-
- cost = cost_coeffs(mb, xd->block, PLANE_TYPE_Y_WITH_DC, ta, tl, TX_16X16);
- return cost;
-}
-
-static void macro_block_yrd_16x16(MACROBLOCK *mb, int *Rate, int *Distortion,
- const VP9_ENCODER_RTCD *rtcd, int *skippable) {
- int d;
- MACROBLOCKD *xd = &mb->e_mbd;
- BLOCKD *b = &mb->e_mbd.block[0];
- BLOCK *be = &mb->block[0];
- TX_TYPE tx_type;
-
- vp9_subtract_mby(mb->src_diff, *(mb->block[0].base_src), mb->e_mbd.predictor,
- mb->block[0].src_stride);
-
- tx_type = get_tx_type_16x16(xd, b);
- if (tx_type != DCT_DCT) {
- vp9_fht(be->src_diff, 32, be->coeff, tx_type, 16);
- } else
- vp9_transform_mby_16x16(mb);
-
- vp9_quantize_mby_16x16(mb);
- // TODO(jingning) is it possible to quickly determine whether to force
- // trailing coefficients to be zero, instead of running trellis
- // optimization in the rate-distortion optimization loop?
- if (mb->e_mbd.mode_info_context->mbmi.mode < I8X8_PRED)
- vp9_optimize_mby_16x16(mb, rtcd);
-
- d = vp9_mbblock_error(mb, 0);
-
- *Distortion = (d >> 2);
- // rate
- *Rate = rdcost_mby_16x16(mb);
- *skippable = vp9_mby_is_skippable_16x16(&mb->e_mbd);
-}
-
-static void macro_block_yrd(VP9_COMP *cpi, MACROBLOCK *x, int *rate,
- int *distortion, int *skippable,
- int64_t txfm_cache[NB_TXFM_MODES]) {
- VP9_COMMON *cm = &cpi->common;
- MB_MODE_INFO *mbmi = &x->e_mbd.mode_info_context->mbmi;
-
- MACROBLOCKD *xd = &x->e_mbd;
- int can_skip = cm->mb_no_coeff_skip;
- vp9_prob skip_prob = can_skip ? vp9_get_pred_prob(cm, xd, PRED_MBSKIP) : 128;
- int s0, s1;
- int r4x4, r4x4s, r8x8, r8x8s, d4x4, d8x8, s4x4, s8x8;
- int64_t rd4x4, rd8x8, rd4x4s, rd8x8s;
- int d16x16, r16x16, r16x16s, s16x16;
- int64_t rd16x16, rd16x16s;
-
- // FIXME don't do sub x3
- if (skip_prob == 0)
- skip_prob = 1;
- s0 = vp9_cost_bit(skip_prob, 0);
- s1 = vp9_cost_bit(skip_prob, 1);
- macro_block_yrd_16x16(x, &r16x16, &d16x16, IF_RTCD(&cpi->rtcd), &s16x16);
- if (can_skip) {
- if (s16x16) {
- rd16x16 = RDCOST(x->rdmult, x->rddiv, s1, d16x16);
- } else {
- rd16x16 = RDCOST(x->rdmult, x->rddiv, r16x16 + s0, d16x16);
- }
- } else {
- rd16x16 = RDCOST(x->rdmult, x->rddiv, r16x16, d16x16);
- }
- r16x16s = r16x16 + vp9_cost_one(cm->prob_tx[0]) + vp9_cost_one(cm->prob_tx[1]);
- if (can_skip) {
- if (s16x16) {
- rd16x16s = RDCOST(x->rdmult, x->rddiv, s1, d16x16);
- } else {
- rd16x16s = RDCOST(x->rdmult, x->rddiv, r16x16s + s0, d16x16);
- }
- } else {
- rd16x16s = RDCOST(x->rdmult, x->rddiv, r16x16s, d16x16);
- }
- macro_block_yrd_8x8(x, &r8x8, &d8x8, IF_RTCD(&cpi->rtcd), &s8x8);
- if (can_skip) {
- if (s8x8) {
- rd8x8 = RDCOST(x->rdmult, x->rddiv, s1, d8x8);
- } else {
- rd8x8 = RDCOST(x->rdmult, x->rddiv, r8x8 + s0, d8x8);
- }
- } else {
- rd8x8 = RDCOST(x->rdmult, x->rddiv, r8x8, d8x8);
- }
- r8x8s = r8x8 + vp9_cost_one(cm->prob_tx[0]);
- r8x8s += vp9_cost_zero(cm->prob_tx[1]);
- if (can_skip) {
- if (s8x8) {
- rd8x8s = RDCOST(x->rdmult, x->rddiv, s1, d8x8);
- } else {
- rd8x8s = RDCOST(x->rdmult, x->rddiv, r8x8s + s0, d8x8);
- }
- } else {
- rd8x8s = RDCOST(x->rdmult, x->rddiv, r8x8s, d8x8);
- }
- macro_block_yrd_4x4(x, &r4x4, &d4x4, IF_RTCD(&cpi->rtcd), &s4x4);
- if (can_skip) {
- if (s4x4) {
- rd4x4 = RDCOST(x->rdmult, x->rddiv, s1, d4x4);
- } else {
- rd4x4 = RDCOST(x->rdmult, x->rddiv, r4x4 + s0, d4x4);
- }
- } else {
- rd4x4 = RDCOST(x->rdmult, x->rddiv, r4x4, d4x4);
- }
- r4x4s = r4x4 + vp9_cost_zero(cm->prob_tx[0]);
- if (can_skip) {
- if (s4x4) {
- rd4x4s = RDCOST(x->rdmult, x->rddiv, s1, d4x4);
- } else {
- rd4x4s = RDCOST(x->rdmult, x->rddiv, r4x4s + s0, d4x4);
- }
- } else {
- rd4x4s = RDCOST(x->rdmult, x->rddiv, r4x4s, d4x4);
- }
-
- if ( cpi->common.txfm_mode == ALLOW_16X16 ||
- (cpi->common.txfm_mode == TX_MODE_SELECT &&
- rd16x16s < rd8x8s && rd16x16s < rd4x4s)) {
- mbmi->txfm_size = TX_16X16;
- *skippable = s16x16;
- *distortion = d16x16;
- *rate = (cpi->common.txfm_mode == ALLOW_16X16) ? r16x16 : r16x16s;
- } else
- if ( cpi->common.txfm_mode == ALLOW_8X8 ||
- (cpi->common.txfm_mode == TX_MODE_SELECT && rd8x8s < rd4x4s)) {
- mbmi->txfm_size = TX_8X8;
- *skippable = s8x8;
- *distortion = d8x8;
- *rate = (cpi->common.txfm_mode == ALLOW_8X8) ? r8x8 : r8x8s;
- } else {
- assert(cpi->common.txfm_mode == ONLY_4X4 ||
- (cpi->common.txfm_mode == TX_MODE_SELECT && rd4x4s <= rd8x8s));
- mbmi->txfm_size = TX_4X4;
- *skippable = s4x4;
- *distortion = d4x4;
- *rate = (cpi->common.txfm_mode == ONLY_4X4) ? r4x4 : r4x4s;
- }
-
- txfm_cache[ONLY_4X4] = rd4x4;
- txfm_cache[ALLOW_8X8] = rd8x8;
- txfm_cache[ALLOW_16X16] = rd16x16;
- if (rd16x16s < rd8x8s && rd16x16s < rd4x4s)
- txfm_cache[TX_MODE_SELECT] = rd16x16s;
- else
- txfm_cache[TX_MODE_SELECT] = rd4x4s < rd8x8s ? rd4x4s : rd8x8s;
-
-}
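All of the per-transform-size decisions above follow the same Lagrangian pattern: each candidate gets a cost of the form D + lambda * R and the cheapest one is kept. A minimal sketch of that pattern, with rd_cost() standing in for the library's RDCOST macro rather than reproducing it:

#include <stdint.h>

/* Illustrative Lagrangian cost; lambda is approximated as rdmult / rddiv. */
static int64_t rd_cost(int64_t rdmult, int64_t rddiv, int64_t rate,
                       int64_t distortion) {
  return (rate * rdmult) / rddiv + distortion;
}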
-
-static void copy_predictor(unsigned char *dst, const unsigned char *predictor) {
- const unsigned int *p = (const unsigned int *)predictor;
- unsigned int *d = (unsigned int *)dst;
- d[0] = p[0];
- d[4] = p[4];
- d[8] = p[8];
- d[12] = p[12];
-}
-
-#if CONFIG_SUPERBLOCKS
-static void super_block_yrd_8x8(MACROBLOCK *x,
- int *rate,
- int *distortion,
- const VP9_ENCODER_RTCD *rtcd, int *skip)
-{
- MACROBLOCKD *const xd = &x->e_mbd;
- BLOCK *const by2 = x->block + 24;
- BLOCKD *const bdy2 = xd->block + 24;
- int d = 0, r = 0, n;
- const uint8_t *src = x->src.y_buffer, *dst = xd->dst.y_buffer;
- int src_y_stride = x->src.y_stride, dst_y_stride = xd->dst.y_stride;
- ENTROPY_CONTEXT_PLANES *ta = xd->above_context;
- ENTROPY_CONTEXT_PLANES *tl = xd->left_context;
- ENTROPY_CONTEXT_PLANES t_above[2];
- ENTROPY_CONTEXT_PLANES t_left[2];
- int skippable = 1;
-
- vpx_memcpy(t_above, xd->above_context, sizeof(t_above));
- vpx_memcpy(t_left, xd->left_context, sizeof(t_left));
-
- for (n = 0; n < 4; n++) {
- int x_idx = n & 1, y_idx = n >> 1;
-
- vp9_subtract_mby_s_c(x->src_diff,
- src + x_idx * 16 + y_idx * 16 * src_y_stride,
- src_y_stride,
- dst + x_idx * 16 + y_idx * 16 * dst_y_stride,
- dst_y_stride);
- vp9_transform_mby_8x8(x);
- vp9_quantize_mby_8x8(x);
-
- /* remove 1st order dc to properly combine 1st/2nd order distortion */
- x->coeff[ 0] = 0;
- x->coeff[ 64] = 0;
- x->coeff[128] = 0;
- x->coeff[192] = 0;
- xd->dqcoeff[ 0] = 0;
- xd->dqcoeff[ 64] = 0;
- xd->dqcoeff[128] = 0;
- xd->dqcoeff[192] = 0;
-
- d += vp9_mbblock_error(x, 0);
- d += vp9_block_error(by2->coeff, bdy2->dqcoeff, 16);
- xd->above_context = ta + x_idx;
- xd->left_context = tl + y_idx;
- r += rdcost_mby_8x8(x, 0);
- skippable = skippable && vp9_mby_is_skippable_8x8(xd, 1);
- }
-
- *distortion = (d >> 2);
- *rate = r;
- if (skip) *skip = skippable;
- xd->above_context = ta;
- xd->left_context = tl;
- vpx_memcpy(xd->above_context, &t_above, sizeof(t_above));
- vpx_memcpy(xd->left_context, &t_left, sizeof(t_left));
-}
-#endif
-
-static void copy_predictor_8x8(unsigned char *dst, const unsigned char *predictor) {
- const unsigned int *p = (const unsigned int *)predictor;
- unsigned int *d = (unsigned int *)dst;
- d[0] = p[0];
- d[1] = p[1];
- d[4] = p[4];
- d[5] = p[5];
- d[8] = p[8];
- d[9] = p[9];
- d[12] = p[12];
- d[13] = p[13];
- d[16] = p[16];
- d[17] = p[17];
- d[20] = p[20];
- d[21] = p[21];
- d[24] = p[24];
- d[25] = p[25];
- d[28] = p[28];
- d[29] = p[29];
-}
-
-static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, BLOCK *be,
- BLOCKD *b, B_PREDICTION_MODE *best_mode,
-#if CONFIG_COMP_INTRA_PRED
- B_PREDICTION_MODE *best_second_mode,
- int allow_comp,
-#endif
- int *bmode_costs,
- ENTROPY_CONTEXT *a, ENTROPY_CONTEXT *l,
- int *bestrate, int *bestratey,
- int *bestdistortion) {
- B_PREDICTION_MODE mode;
- MACROBLOCKD *xd = &x->e_mbd;
-
-#if CONFIG_COMP_INTRA_PRED
- B_PREDICTION_MODE mode2;
-#endif
- int64_t best_rd = INT64_MAX;
- int rate = 0;
- int distortion;
-
- ENTROPY_CONTEXT ta = *a, tempa = *a;
- ENTROPY_CONTEXT tl = *l, templ = *l;
- TX_TYPE tx_type = DCT_DCT;
- TX_TYPE best_tx_type = DCT_DCT;
- /*
- * The predictor buffer is a 2d buffer with a stride of 16. Create
- * a temp buffer that meets the stride requirements, but we are only
- * interested in the left 4x4 block
- * */
- DECLARE_ALIGNED_ARRAY(16, unsigned char, best_predictor, 16 * 4);
- DECLARE_ALIGNED_ARRAY(16, short, best_dqcoeff, 16);
-
- for (mode = B_DC_PRED; mode <= B_HU_PRED; mode++) {
-#if CONFIG_COMP_INTRA_PRED
- for (mode2 = (allow_comp ? 0 : (B_DC_PRED - 1));
- mode2 != (allow_comp ? (mode + 1) : 0); mode2++) {
-#endif
- int64_t this_rd;
- int ratey;
-
- b->bmi.as_mode.first = mode;
- rate = bmode_costs[mode];
-
-#if CONFIG_COMP_INTRA_PRED
- if (mode2 == (B_PREDICTION_MODE)(B_DC_PRED - 1)) {
-#endif
- vp9_intra4x4_predict(b, mode, b->predictor);
-#if CONFIG_COMP_INTRA_PRED
- } else {
- vp9_comp_intra4x4_predict(b, mode, mode2, b->predictor);
- rate += bmode_costs[mode2];
- }
-#endif
- vp9_subtract_b(be, b, 16);
-
- b->bmi.as_mode.first = mode;
- tx_type = get_tx_type_4x4(xd, b);
- if (tx_type != DCT_DCT) {
- vp9_fht(be->src_diff, 32, be->coeff, tx_type, 4);
- vp9_ht_quantize_b_4x4(be, b, tx_type);
- } else {
- x->vp9_short_fdct4x4(be->src_diff, be->coeff, 32);
- x->quantize_b_4x4(be, b);
- }
-
- tempa = ta;
- templ = tl;
-
- ratey = cost_coeffs(x, b, PLANE_TYPE_Y_WITH_DC, &tempa, &templ, TX_4X4);
- rate += ratey;
- distortion = vp9_block_error(be->coeff, b->dqcoeff, 16) >> 2;
-
- this_rd = RDCOST(x->rdmult, x->rddiv, rate, distortion);
-
- if (this_rd < best_rd) {
- *bestrate = rate;
- *bestratey = ratey;
- *bestdistortion = distortion;
- best_rd = this_rd;
- *best_mode = mode;
- best_tx_type = tx_type;
-
-#if CONFIG_COMP_INTRA_PRED
- *best_second_mode = mode2;
-#endif
- *a = tempa;
- *l = templ;
- copy_predictor(best_predictor, b->predictor);
- vpx_memcpy(best_dqcoeff, b->dqcoeff, 32);
- }
-#if CONFIG_COMP_INTRA_PRED
- }
-#endif
- }
- b->bmi.as_mode.first = (B_PREDICTION_MODE)(*best_mode);
-#if CONFIG_COMP_INTRA_PRED
- b->bmi.as_mode.second = (B_PREDICTION_MODE)(*best_second_mode);
-#endif
-
- // inverse transform
- if (best_tx_type != DCT_DCT)
- vp9_ihtllm_c(best_dqcoeff, b->diff, 32, best_tx_type, 4);
- else
- IDCT_INVOKE(IF_RTCD(&cpi->rtcd.common->idct), idct16)(
- best_dqcoeff, b->diff, 32);
-
- vp9_recon_b(best_predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride);
-
- return best_rd;
-}
-
-static int64_t rd_pick_intra4x4mby_modes(VP9_COMP *cpi, MACROBLOCK *mb, int *Rate,
- int *rate_y, int *Distortion, int64_t best_rd,
-#if CONFIG_COMP_INTRA_PRED
- int allow_comp,
-#endif
- int update_contexts) {
- int i;
- MACROBLOCKD *const xd = &mb->e_mbd;
- int cost = mb->mbmode_cost [xd->frame_type] [B_PRED];
- int distortion = 0;
- int tot_rate_y = 0;
- int64_t total_rd = 0;
- ENTROPY_CONTEXT_PLANES t_above, t_left;
- ENTROPY_CONTEXT *ta, *tl;
- int *bmode_costs;
-
- if (update_contexts) {
- ta = (ENTROPY_CONTEXT *)xd->above_context;
- tl = (ENTROPY_CONTEXT *)xd->left_context;
- } else {
- vpx_memcpy(&t_above, xd->above_context,
- sizeof(ENTROPY_CONTEXT_PLANES));
- vpx_memcpy(&t_left, xd->left_context,
- sizeof(ENTROPY_CONTEXT_PLANES));
-
- ta = (ENTROPY_CONTEXT *)&t_above;
- tl = (ENTROPY_CONTEXT *)&t_left;
- }
-
- xd->mode_info_context->mbmi.mode = B_PRED;
- bmode_costs = mb->inter_bmode_costs;
-
- for (i = 0; i < 16; i++) {
- MODE_INFO *const mic = xd->mode_info_context;
- const int mis = xd->mode_info_stride;
- B_PREDICTION_MODE UNINITIALIZED_IS_SAFE(best_mode);
-#if CONFIG_COMP_INTRA_PRED
- B_PREDICTION_MODE UNINITIALIZED_IS_SAFE(best_second_mode);
-#endif
- int UNINITIALIZED_IS_SAFE(r), UNINITIALIZED_IS_SAFE(ry), UNINITIALIZED_IS_SAFE(d);
-
- if (xd->frame_type == KEY_FRAME) {
- const B_PREDICTION_MODE A = above_block_mode(mic, i, mis);
- const B_PREDICTION_MODE L = left_block_mode(mic, i);
-
- bmode_costs = mb->bmode_costs[A][L];
- }
-
- total_rd += rd_pick_intra4x4block(
- cpi, mb, mb->block + i, xd->block + i, &best_mode,
-#if CONFIG_COMP_INTRA_PRED
- & best_second_mode, allow_comp,
-#endif
- bmode_costs, ta + vp9_block2above[i],
- tl + vp9_block2left[i], &r, &ry, &d);
-
- cost += r;
- distortion += d;
- tot_rate_y += ry;
-
- mic->bmi[i].as_mode.first = best_mode;
-#if CONFIG_COMP_INTRA_PRED
- mic->bmi[i].as_mode.second = best_second_mode;
-#endif
-
- if (total_rd >= best_rd)
- break;
- }
-
- if (total_rd >= best_rd)
- return INT64_MAX;
-
-#if CONFIG_COMP_INTRA_PRED
- cost += vp9_cost_bit(128, allow_comp);
-#endif
- *Rate = cost;
- *rate_y += tot_rate_y;
- *Distortion = distortion;
-
- return RDCOST(mb->rdmult, mb->rddiv, cost, distortion);
-}
-
-#if CONFIG_SUPERBLOCKS
-static int64_t rd_pick_intra_sby_mode(VP9_COMP *cpi,
- MACROBLOCK *x,
- int *rate,
- int *rate_tokenonly,
- int *distortion,
- int *skippable) {
- MB_PREDICTION_MODE mode;
- MB_PREDICTION_MODE UNINITIALIZED_IS_SAFE(mode_selected);
- int this_rate, this_rate_tokenonly;
- int this_distortion, s;
- int64_t best_rd = INT64_MAX, this_rd;
-
- /* Y Search for 32x32 intra prediction mode */
- for (mode = DC_PRED; mode <= TM_PRED; mode++) {
- x->e_mbd.mode_info_context->mbmi.mode = mode;
- vp9_build_intra_predictors_sby_s(&x->e_mbd);
-
- super_block_yrd_8x8(x, &this_rate_tokenonly,
- &this_distortion, IF_RTCD(&cpi->rtcd), &s);
- this_rate = this_rate_tokenonly +
- x->mbmode_cost[x->e_mbd.frame_type]
- [x->e_mbd.mode_info_context->mbmi.mode];
- this_rd = RDCOST(x->rdmult, x->rddiv, this_rate, this_distortion);
-
- if (this_rd < best_rd) {
- mode_selected = mode;
- best_rd = this_rd;
- *rate = this_rate;
- *rate_tokenonly = this_rate_tokenonly;
- *distortion = this_distortion;
- *skippable = s;
- }
- }
-
- x->e_mbd.mode_info_context->mbmi.mode = mode_selected;
-
- return best_rd;
-}
-#endif
-
-static int64_t rd_pick_intra16x16mby_mode(VP9_COMP *cpi,
- MACROBLOCK *x,
- int *Rate,
- int *rate_y,
- int *Distortion,
- int *skippable,
- int64_t txfm_cache[NB_TXFM_MODES]) {
- MB_PREDICTION_MODE mode;
- TX_SIZE txfm_size;
- MB_PREDICTION_MODE UNINITIALIZED_IS_SAFE(mode_selected);
-#if CONFIG_COMP_INTRA_PRED
- MB_PREDICTION_MODE mode2;
- MB_PREDICTION_MODE UNINITIALIZED_IS_SAFE(mode2_selected);
-#endif
- MB_MODE_INFO * mbmi = &x->e_mbd.mode_info_context->mbmi;
- int rate, ratey;
- int distortion, skip;
- int64_t best_rd = INT64_MAX;
- int64_t this_rd;
- MACROBLOCKD *xd = &x->e_mbd;
-
- int i;
- for (i = 0; i < NB_TXFM_MODES; i++)
- txfm_cache[i] = INT64_MAX;
-
- // Y Search for 16x16 intra prediction mode
- for (mode = DC_PRED; mode <= TM_PRED; mode++) {
- int64_t local_txfm_cache[NB_TXFM_MODES];
-
- mbmi->mode = mode;
-
-#if CONFIG_COMP_INTRA_PRED
- for (mode2 = DC_PRED - 1; mode2 != TM_PRED + 1; mode2++) {
- mbmi->second_mode = mode2;
- if (mode2 == (MB_PREDICTION_MODE)(DC_PRED - 1)) {
-#endif
- vp9_build_intra_predictors_mby(&x->e_mbd);
-#if CONFIG_COMP_INTRA_PRED
- } else {
- continue; // i.e. disable for now
- vp9_build_comp_intra_predictors_mby(&x->e_mbd);
- }
-#endif
-
- macro_block_yrd(cpi, x, &ratey, &distortion, &skip, local_txfm_cache);
-
- // FIXME add compoundmode cost
- // FIXME add rate for mode2
- rate = ratey + x->mbmode_cost[x->e_mbd.frame_type][mbmi->mode];
-
- this_rd = RDCOST(x->rdmult, x->rddiv, rate, distortion);
-
-
- if (this_rd < best_rd) {
- mode_selected = mode;
- txfm_size = mbmi->txfm_size;
-#if CONFIG_COMP_INTRA_PRED
- mode2_selected = mode2;
-#endif
- best_rd = this_rd;
- *Rate = rate;
- *rate_y = ratey;
- *Distortion = distortion;
- *skippable = skip;
- }
-
- for (i = 0; i < NB_TXFM_MODES; i++) {
- int64_t adj_rd = this_rd + local_txfm_cache[i] -
- local_txfm_cache[cpi->common.txfm_mode];
- if (adj_rd < txfm_cache[i]) {
- txfm_cache[i] = adj_rd;
- }
- }
-
-#if CONFIG_COMP_INTRA_PRED
- }
-#endif
- }
-
- mbmi->txfm_size = txfm_size;
- mbmi->mode = mode_selected;
-
-#if CONFIG_COMP_INTRA_PRED
- mbmi->second_mode = mode2_selected;
-#endif
- return best_rd;
-}
-
-
-static int64_t rd_pick_intra8x8block(VP9_COMP *cpi, MACROBLOCK *x, int ib,
- B_PREDICTION_MODE *best_mode,
-#if CONFIG_COMP_INTRA_PRED
- B_PREDICTION_MODE *best_second_mode,
-#endif
- int *mode_costs,
- ENTROPY_CONTEXT *a, ENTROPY_CONTEXT *l,
- int *bestrate, int *bestratey,
- int *bestdistortion) {
- MB_PREDICTION_MODE mode;
-#if CONFIG_COMP_INTRA_PRED
- MB_PREDICTION_MODE mode2;
-#endif
- MACROBLOCKD *xd = &x->e_mbd;
- int64_t best_rd = INT64_MAX;
- int distortion, rate = 0;
- BLOCK *be = x->block + ib;
- BLOCKD *b = xd->block + ib;
- ENTROPY_CONTEXT ta0, ta1, besta0 = 0, besta1 = 0;
- ENTROPY_CONTEXT tl0, tl1, bestl0 = 0, bestl1 = 0;
-
- /*
- * The predictor buffer is a 2d buffer with a stride of 16. Create
- * a temp buffer that meets the stride requirements, but we are only
- * interested in the left 8x8 block
- * */
- DECLARE_ALIGNED_ARRAY(16, unsigned char, best_predictor, 16 * 8);
- DECLARE_ALIGNED_ARRAY(16, short, best_dqcoeff, 16 * 4);
-
- // perform transformation of dimension 8x8
- // note the input and output index mapping
- int idx = (ib & 0x02) ? (ib + 2) : ib;
-
- for (mode = DC_PRED; mode <= TM_PRED; mode++) {
-#if CONFIG_COMP_INTRA_PRED
- for (mode2 = DC_PRED - 1; mode2 != TM_PRED + 1; mode2++) {
-#endif
- int64_t this_rd;
- int rate_t;
-
- // FIXME rate for compound mode and second intrapred mode
- rate = mode_costs[mode];
- b->bmi.as_mode.first = mode;
-
-#if CONFIG_COMP_INTRA_PRED
- if (mode2 == (MB_PREDICTION_MODE)(DC_PRED - 1)) {
-#endif
- vp9_intra8x8_predict(b, mode, b->predictor);
-#if CONFIG_COMP_INTRA_PRED
- } else {
- continue; // i.e. disable for now
- vp9_comp_intra8x8_predict(b, mode, mode2, b->predictor);
- }
-#endif
-
- vp9_subtract_4b_c(be, b, 16);
-
- if (xd->mode_info_context->mbmi.txfm_size == TX_8X8) {
- TX_TYPE tx_type = get_tx_type_8x8(xd, b);
- if (tx_type != DCT_DCT)
- vp9_fht(be->src_diff, 32, (x->block + idx)->coeff, tx_type, 8);
- else
- x->vp9_short_fdct8x8(be->src_diff, (x->block + idx)->coeff, 32);
- x->quantize_b_8x8(x->block + idx, xd->block + idx);
-
- // compute quantization mse of 8x8 block
- distortion = vp9_block_error_c((x->block + idx)->coeff,
- (xd->block + idx)->dqcoeff, 64);
- ta0 = a[vp9_block2above_8x8[idx]];
- tl0 = l[vp9_block2left_8x8[idx]];
-
- rate_t = cost_coeffs(x, xd->block + idx, PLANE_TYPE_Y_WITH_DC,
- &ta0, &tl0, TX_8X8);
-
- rate += rate_t;
- ta1 = ta0;
- tl1 = tl0;
- } else {
- x->vp9_short_fdct8x4(be->src_diff, be->coeff, 32);
- x->vp9_short_fdct8x4((be + 4)->src_diff, (be + 4)->coeff, 32);
-
- x->quantize_b_4x4_pair(x->block + ib, x->block + ib + 1,
- xd->block + ib, xd->block + ib + 1);
- x->quantize_b_4x4_pair(x->block + ib + 4, x->block + ib + 5,
- xd->block + ib + 4, xd->block + ib + 5);
-
- distortion = vp9_block_error_c((x->block + ib)->coeff,
- (xd->block + ib)->dqcoeff, 16);
- distortion += vp9_block_error_c((x->block + ib + 1)->coeff,
- (xd->block + ib + 1)->dqcoeff, 16);
- distortion += vp9_block_error_c((x->block + ib + 4)->coeff,
- (xd->block + ib + 4)->dqcoeff, 16);
- distortion += vp9_block_error_c((x->block + ib + 5)->coeff,
- (xd->block + ib + 5)->dqcoeff, 16);
-
- ta0 = a[vp9_block2above[ib]];
- ta1 = a[vp9_block2above[ib + 1]];
- tl0 = l[vp9_block2left[ib]];
- tl1 = l[vp9_block2left[ib + 4]];
- rate_t = cost_coeffs(x, xd->block + ib, PLANE_TYPE_Y_WITH_DC,
- &ta0, &tl0, TX_4X4);
- rate_t += cost_coeffs(x, xd->block + ib + 1, PLANE_TYPE_Y_WITH_DC,
- &ta1, &tl0, TX_4X4);
- rate_t += cost_coeffs(x, xd->block + ib + 4, PLANE_TYPE_Y_WITH_DC,
- &ta0, &tl1, TX_4X4);
- rate_t += cost_coeffs(x, xd->block + ib + 5, PLANE_TYPE_Y_WITH_DC,
- &ta1, &tl1, TX_4X4);
- rate += rate_t;
- }
-
- distortion >>= 2;
- this_rd = RDCOST(x->rdmult, x->rddiv, rate, distortion);
- if (this_rd < best_rd) {
- *bestrate = rate;
- *bestratey = rate_t;
- *bestdistortion = distortion;
- besta0 = ta0;
- besta1 = ta1;
- bestl0 = tl0;
- bestl1 = tl1;
- best_rd = this_rd;
- *best_mode = mode;
-#if CONFIG_COMP_INTRA_PRED
- *best_second_mode = mode2;
-#endif
- copy_predictor_8x8(best_predictor, b->predictor);
- vpx_memcpy(best_dqcoeff, b->dqcoeff, 64);
- vpx_memcpy(best_dqcoeff + 32, b->dqcoeff + 64, 64);
-#if CONFIG_COMP_INTRA_PRED
- }
-#endif
- }
- }
- b->bmi.as_mode.first = (*best_mode);
-#if CONFIG_COMP_INTRA_PRED
- b->bmi.as_mode.second = (*best_second_mode);
-#endif
- vp9_encode_intra8x8(IF_RTCD(&cpi->rtcd), x, ib);
-
- if (xd->mode_info_context->mbmi.txfm_size == TX_8X8) {
- a[vp9_block2above_8x8[idx]] = besta0;
- a[vp9_block2above_8x8[idx] + 1] = besta1;
- l[vp9_block2left_8x8[idx]] = bestl0;
- l[vp9_block2left_8x8[idx] + 1] = bestl1;
- } else {
- a[vp9_block2above[ib]] = besta0;
- a[vp9_block2above[ib + 1]] = besta1;
- l[vp9_block2left[ib]] = bestl0;
- l[vp9_block2left[ib + 4]] = bestl1;
- }
-
- return best_rd;
-}
-
-static int64_t rd_pick_intra8x8mby_modes(VP9_COMP *cpi, MACROBLOCK *mb,
- int *Rate, int *rate_y,
- int *Distortion, int64_t best_rd) {
- MACROBLOCKD *const xd = &mb->e_mbd;
- int i, ib;
- int cost = mb->mbmode_cost [xd->frame_type] [I8X8_PRED];
- int distortion = 0;
- int tot_rate_y = 0;
- long long total_rd = 0;
- ENTROPY_CONTEXT_PLANES t_above, t_left;
- ENTROPY_CONTEXT *ta, *tl;
- int *i8x8mode_costs;
-
- vpx_memcpy(&t_above, xd->above_context, sizeof(ENTROPY_CONTEXT_PLANES));
- vpx_memcpy(&t_left, xd->left_context, sizeof(ENTROPY_CONTEXT_PLANES));
-
- ta = (ENTROPY_CONTEXT *)&t_above;
- tl = (ENTROPY_CONTEXT *)&t_left;
-
- xd->mode_info_context->mbmi.mode = I8X8_PRED;
- i8x8mode_costs = mb->i8x8_mode_costs;
-
- for (i = 0; i < 4; i++) {
- MODE_INFO *const mic = xd->mode_info_context;
- B_PREDICTION_MODE UNINITIALIZED_IS_SAFE(best_mode);
-#if CONFIG_COMP_INTRA_PRED
- B_PREDICTION_MODE UNINITIALIZED_IS_SAFE(best_second_mode);
-#endif
- int UNINITIALIZED_IS_SAFE(r), UNINITIALIZED_IS_SAFE(ry), UNINITIALIZED_IS_SAFE(d);
-
- ib = vp9_i8x8_block[i];
- total_rd += rd_pick_intra8x8block(
- cpi, mb, ib, &best_mode,
-#if CONFIG_COMP_INTRA_PRED
- & best_second_mode,
-#endif
- i8x8mode_costs, ta, tl, &r, &ry, &d);
- cost += r;
- distortion += d;
- tot_rate_y += ry;
- mic->bmi[ib].as_mode.first = best_mode;
-#if CONFIG_COMP_INTRA_PRED
- mic->bmi[ib].as_mode.second = best_second_mode;
-#endif
- }
- *Rate = cost;
- *rate_y += tot_rate_y;
- *Distortion = distortion;
- return RDCOST(mb->rdmult, mb->rddiv, cost, distortion);
-}
-
-static int rd_cost_mbuv(MACROBLOCK *mb) {
- int b;
- int cost = 0;
- MACROBLOCKD *xd = &mb->e_mbd;
- ENTROPY_CONTEXT_PLANES t_above, t_left;
- ENTROPY_CONTEXT *ta, *tl;
-
- vpx_memcpy(&t_above, xd->above_context, sizeof(ENTROPY_CONTEXT_PLANES));
- vpx_memcpy(&t_left, xd->left_context, sizeof(ENTROPY_CONTEXT_PLANES));
-
- ta = (ENTROPY_CONTEXT *)&t_above;
- tl = (ENTROPY_CONTEXT *)&t_left;
-
- for (b = 16; b < 24; b++)
- cost += cost_coeffs(mb, xd->block + b, PLANE_TYPE_UV,
- ta + vp9_block2above[b], tl + vp9_block2left[b],
- TX_4X4);
-
- return cost;
-}
-
-
-static int64_t rd_inter16x16_uv(VP9_COMP *cpi, MACROBLOCK *x, int *rate,
- int *distortion, int fullpixel, int *skip) {
- vp9_subtract_mbuv(x->src_diff, x->src.u_buffer, x->src.v_buffer,
- x->e_mbd.predictor, x->src.uv_stride);
-
- vp9_transform_mbuv_4x4(x);
- vp9_quantize_mbuv_4x4(x);
-
- *rate = rd_cost_mbuv(x);
- *distortion = vp9_mbuverror(x) / 4;
- *skip = vp9_mbuv_is_skippable_4x4(&x->e_mbd);
-
- return RDCOST(x->rdmult, x->rddiv, *rate, *distortion);
-}
-
-static int rd_cost_mbuv_8x8(MACROBLOCK *mb, int backup) {
- int b;
- int cost = 0;
- MACROBLOCKD *xd = &mb->e_mbd;
- ENTROPY_CONTEXT_PLANES t_above, t_left;
- ENTROPY_CONTEXT *ta, *tl;
-
- if (backup) {
- vpx_memcpy(&t_above, xd->above_context, sizeof(ENTROPY_CONTEXT_PLANES));
- vpx_memcpy(&t_left, xd->left_context, sizeof(ENTROPY_CONTEXT_PLANES));
-
- ta = (ENTROPY_CONTEXT *)&t_above;
- tl = (ENTROPY_CONTEXT *)&t_left;
- } else {
- ta = (ENTROPY_CONTEXT *)mb->e_mbd.above_context;
- tl = (ENTROPY_CONTEXT *)mb->e_mbd.left_context;
- }
-
- for (b = 16; b < 24; b += 4)
- cost += cost_coeffs(mb, xd->block + b, PLANE_TYPE_UV,
- ta + vp9_block2above_8x8[b],
- tl + vp9_block2left_8x8[b], TX_8X8);
-
- return cost;
-}
-
-#if CONFIG_SUPERBLOCKS
-static int64_t rd_inter32x32_uv_8x8(VP9_COMP *cpi, MACROBLOCK *x, int *rate,
- int *distortion, int fullpixel, int *skip) {
- MACROBLOCKD *xd = &x->e_mbd;
- int n, r = 0, d = 0;
- const uint8_t *usrc = x->src.u_buffer, *udst = xd->dst.u_buffer;
- const uint8_t *vsrc = x->src.v_buffer, *vdst = xd->dst.v_buffer;
- int src_uv_stride = x->src.uv_stride, dst_uv_stride = xd->dst.uv_stride;
- int skippable = 1;
- ENTROPY_CONTEXT_PLANES t_above[2], t_left[2];
- ENTROPY_CONTEXT_PLANES *ta = xd->above_context;
- ENTROPY_CONTEXT_PLANES *tl = xd->left_context;
-
- memcpy(t_above, xd->above_context, sizeof(t_above));
- memcpy(t_left, xd->left_context, sizeof(t_left));
-
- for (n = 0; n < 4; n++) {
- int x_idx = n & 1, y_idx = n >> 1;
-
- vp9_subtract_mbuv_s_c(x->src_diff,
- usrc + x_idx * 8 + y_idx * 8 * src_uv_stride,
- vsrc + x_idx * 8 + y_idx * 8 * src_uv_stride,
- src_uv_stride,
- udst + x_idx * 8 + y_idx * 8 * dst_uv_stride,
- vdst + x_idx * 8 + y_idx * 8 * dst_uv_stride,
- dst_uv_stride);
-
- vp9_transform_mbuv_8x8(x);
- vp9_quantize_mbuv_8x8(x);
-
- xd->above_context = ta + x_idx;
- xd->left_context = tl + y_idx;
- r += rd_cost_mbuv_8x8(x, 0);
- d += vp9_mbuverror(x) / 4;
- skippable = skippable && vp9_mbuv_is_skippable_8x8(xd);
- }
-
- *rate = r;
- *distortion = d;
- if (skip) *skip = skippable;
- xd->left_context = tl;
- xd->above_context = ta;
- memcpy(xd->above_context, t_above, sizeof(t_above));
- memcpy(xd->left_context, t_left, sizeof(t_left));
-
- return RDCOST(x->rdmult, x->rddiv, *rate, *distortion);
-}
-#endif
-
-static int64_t rd_inter16x16_uv_8x8(VP9_COMP *cpi, MACROBLOCK *x, int *rate,
- int *distortion, int fullpixel, int *skip) {
- vp9_subtract_mbuv(x->src_diff, x->src.u_buffer, x->src.v_buffer,
- x->e_mbd.predictor, x->src.uv_stride);
-
- vp9_transform_mbuv_8x8(x);
- vp9_quantize_mbuv_8x8(x);
-
- *rate = rd_cost_mbuv_8x8(x, 1);
- *distortion = vp9_mbuverror(x) / 4;
- *skip = vp9_mbuv_is_skippable_8x8(&x->e_mbd);
-
- return RDCOST(x->rdmult, x->rddiv, *rate, *distortion);
-}
-
-
-static int64_t rd_inter4x4_uv(VP9_COMP *cpi, MACROBLOCK *x, int *rate,
- int *distortion, int *skippable, int fullpixel) {
- vp9_build_inter4x4_predictors_mbuv(&x->e_mbd);
- vp9_subtract_mbuv(x->src_diff, x->src.u_buffer, x->src.v_buffer,
- x->e_mbd.predictor, x->src.uv_stride);
-
- vp9_transform_mbuv_4x4(x);
- vp9_quantize_mbuv_4x4(x);
-
- *rate = rd_cost_mbuv(x);
- *distortion = vp9_mbuverror(x) / 4;
- *skippable = vp9_mbuv_is_skippable_4x4(&x->e_mbd);
-
- return RDCOST(x->rdmult, x->rddiv, *rate, *distortion);
-}
-
-static void rd_pick_intra_mbuv_mode(VP9_COMP *cpi,
- MACROBLOCK *x,
- int *rate,
- int *rate_tokenonly,
- int *distortion,
- int *skippable) {
- MB_PREDICTION_MODE mode;
- MB_PREDICTION_MODE UNINITIALIZED_IS_SAFE(mode_selected);
-#if CONFIG_COMP_INTRA_PRED
- MB_PREDICTION_MODE mode2;
- MB_PREDICTION_MODE UNINITIALIZED_IS_SAFE(mode2_selected);
-#endif
- MACROBLOCKD *xd = &x->e_mbd;
- MB_MODE_INFO * mbmi = &x->e_mbd.mode_info_context->mbmi;
- int64_t best_rd = INT64_MAX;
- int UNINITIALIZED_IS_SAFE(d), UNINITIALIZED_IS_SAFE(r);
- int rate_to, UNINITIALIZED_IS_SAFE(skip);
-
- for (mode = DC_PRED; mode <= TM_PRED; mode++) {
-#if CONFIG_COMP_INTRA_PRED
- for (mode2 = DC_PRED - 1; mode2 != TM_PRED + 1; mode2++) {
-#endif
- int rate;
- int distortion;
- int64_t this_rd;
-
- mbmi->uv_mode = mode;
-#if CONFIG_COMP_INTRA_PRED
- mbmi->second_uv_mode = mode2;
- if (mode2 == (MB_PREDICTION_MODE)(DC_PRED - 1)) {
-#endif
- vp9_build_intra_predictors_mbuv(&x->e_mbd);
-#if CONFIG_COMP_INTRA_PRED
- } else {
- continue;
- vp9_build_comp_intra_predictors_mbuv(&x->e_mbd);
- }
-#endif
-
- vp9_subtract_mbuv(x->src_diff, x->src.u_buffer, x->src.v_buffer,
- x->e_mbd.predictor, x->src.uv_stride);
- vp9_transform_mbuv_4x4(x);
- vp9_quantize_mbuv_4x4(x);
-
- rate_to = rd_cost_mbuv(x);
- rate = rate_to
- + x->intra_uv_mode_cost[x->e_mbd.frame_type][mbmi->uv_mode];
-
- distortion = vp9_mbuverror(x) / 4;
-
- this_rd = RDCOST(x->rdmult, x->rddiv, rate, distortion);
-
- if (this_rd < best_rd) {
- skip = vp9_mbuv_is_skippable_4x4(xd);
- best_rd = this_rd;
- d = distortion;
- r = rate;
- *rate_tokenonly = rate_to;
- mode_selected = mode;
-#if CONFIG_COMP_INTRA_PRED
- mode2_selected = mode2;
- }
-#endif
- }
- }
-
- *rate = r;
- *distortion = d;
- *skippable = skip;
-
- mbmi->uv_mode = mode_selected;
-#if CONFIG_COMP_INTRA_PRED
- mbmi->second_uv_mode = mode2_selected;
-#endif
-}
-
-static void rd_pick_intra_mbuv_mode_8x8(VP9_COMP *cpi,
- MACROBLOCK *x,
- int *rate,
- int *rate_tokenonly,
- int *distortion,
- int *skippable) {
- MACROBLOCKD *xd = &x->e_mbd;
- MB_PREDICTION_MODE mode;
- MB_PREDICTION_MODE UNINITIALIZED_IS_SAFE(mode_selected);
- MB_MODE_INFO * mbmi = &x->e_mbd.mode_info_context->mbmi;
- int64_t best_rd = INT64_MAX;
- int UNINITIALIZED_IS_SAFE(d), UNINITIALIZED_IS_SAFE(r);
- int rate_to, UNINITIALIZED_IS_SAFE(skip);
-
- for (mode = DC_PRED; mode <= TM_PRED; mode++) {
- int rate;
- int distortion;
- int64_t this_rd;
-
- mbmi->uv_mode = mode;
- vp9_build_intra_predictors_mbuv(&x->e_mbd);
- vp9_subtract_mbuv(x->src_diff, x->src.u_buffer, x->src.v_buffer,
- x->e_mbd.predictor, x->src.uv_stride);
- vp9_transform_mbuv_8x8(x);
-
- vp9_quantize_mbuv_8x8(x);
-
- rate_to = rd_cost_mbuv_8x8(x, 1);
- rate = rate_to + x->intra_uv_mode_cost[x->e_mbd.frame_type][mbmi->uv_mode];
-
- distortion = vp9_mbuverror(x) / 4;
- this_rd = RDCOST(x->rdmult, x->rddiv, rate, distortion);
-
- if (this_rd < best_rd) {
- skip = vp9_mbuv_is_skippable_8x8(xd);
- best_rd = this_rd;
- d = distortion;
- r = rate;
- *rate_tokenonly = rate_to;
- mode_selected = mode;
- }
- }
- *rate = r;
- *distortion = d;
- *skippable = skip;
- mbmi->uv_mode = mode_selected;
-}
-
-#if CONFIG_SUPERBLOCKS
-static void super_block_uvrd_8x8(MACROBLOCK *x,
- int *rate,
- int *distortion,
- const VP9_ENCODER_RTCD *rtcd,
- int *skippable) {
- MACROBLOCKD *const xd = &x->e_mbd;
- int d = 0, r = 0, n, s = 1;
- const uint8_t *usrc = x->src.u_buffer, *udst = xd->dst.u_buffer;
- const uint8_t *vsrc = x->src.v_buffer, *vdst = xd->dst.v_buffer;
- int src_uv_stride = x->src.uv_stride, dst_uv_stride = xd->dst.uv_stride;
- ENTROPY_CONTEXT_PLANES t_above[2], t_left[2];
- ENTROPY_CONTEXT_PLANES *ta = xd->above_context;
- ENTROPY_CONTEXT_PLANES *tl = xd->left_context;
-
- memcpy(t_above, xd->above_context, sizeof(t_above));
- memcpy(t_left, xd->left_context, sizeof(t_left));
-
- for (n = 0; n < 4; n++) {
- int x_idx = n & 1, y_idx = n >> 1;
-
- vp9_subtract_mbuv_s_c(x->src_diff,
- usrc + x_idx * 8 + y_idx * 8 * src_uv_stride,
- vsrc + x_idx * 8 + y_idx * 8 * src_uv_stride,
- src_uv_stride,
- udst + x_idx * 8 + y_idx * 8 * dst_uv_stride,
- vdst + x_idx * 8 + y_idx * 8 * dst_uv_stride,
- dst_uv_stride);
- vp9_transform_mbuv_8x8(x);
- vp9_quantize_mbuv_8x8(x);
- s &= vp9_mbuv_is_skippable_8x8(xd);
-
- d += vp9_mbuverror(x) >> 2;
- xd->above_context = ta + x_idx;
- xd->left_context = tl + y_idx;
- r += rd_cost_mbuv_8x8(x, 0);
- }
-
- xd->above_context = ta;
- xd->left_context = tl;
- *distortion = d;
- *rate = r;
- *skippable = s;
-
- xd->left_context = tl;
- xd->above_context = ta;
- memcpy(xd->above_context, t_above, sizeof(t_above));
- memcpy(xd->left_context, t_left, sizeof(t_left));
-}
-
-static int64_t rd_pick_intra_sbuv_mode(VP9_COMP *cpi,
- MACROBLOCK *x,
- int *rate,
- int *rate_tokenonly,
- int *distortion,
- int *skippable) {
- MB_PREDICTION_MODE mode;
- MB_PREDICTION_MODE UNINITIALIZED_IS_SAFE(mode_selected);
- int64_t best_rd = INT64_MAX, this_rd;
- int this_rate_tokenonly, this_rate;
- int this_distortion, s;
-
- for (mode = DC_PRED; mode <= TM_PRED; mode++) {
- x->e_mbd.mode_info_context->mbmi.uv_mode = mode;
- vp9_build_intra_predictors_sbuv_s(&x->e_mbd);
-
- super_block_uvrd_8x8(x, &this_rate_tokenonly,
- &this_distortion, IF_RTCD(&cpi->rtcd), &s);
- this_rate = this_rate_tokenonly +
- x->mbmode_cost[x->e_mbd.frame_type]
- [x->e_mbd.mode_info_context->mbmi.mode];
- this_rd = RDCOST(x->rdmult, x->rddiv, this_rate, this_distortion);
-
- if (this_rd < best_rd) {
- mode_selected = mode;
- best_rd = this_rd;
- *rate = this_rate;
- *rate_tokenonly = this_rate_tokenonly;
- *distortion = this_distortion;
- *skippable = s;
- }
- }
-
- x->e_mbd.mode_info_context->mbmi.uv_mode = mode_selected;
-
- return best_rd;
-}
-#endif
-
-int vp9_cost_mv_ref(VP9_COMP *cpi,
- MB_PREDICTION_MODE m,
- const int near_mv_ref_ct[4]) {
- MACROBLOCKD *xd = &cpi->mb.e_mbd;
- int segment_id = xd->mode_info_context->mbmi.segment_id;
-
- // If the mode coding is done entirely at the segment level
- // we should not account for it at the per mb level in rd code.
- // Note that if the segment level coding is expanded from single mode
- // to multiple mode masks as per reference frame coding we will need
- // to do something different here.
- if (!vp9_segfeature_active(xd, segment_id, SEG_LVL_MODE)) {
- VP9_COMMON *pc = &cpi->common;
-
- vp9_prob p [VP9_MVREFS - 1];
- assert(NEARESTMV <= m && m <= SPLITMV);
- vp9_mv_ref_probs(pc, p, near_mv_ref_ct);
- return cost_token(vp9_mv_ref_tree, p,
- vp9_mv_ref_encoding_array - NEARESTMV + m);
- } else
- return 0;
-}
-
-void vp9_set_mbmode_and_mvs(MACROBLOCK *x, MB_PREDICTION_MODE mb, int_mv *mv) {
- x->e_mbd.mode_info_context->mbmi.mode = mb;
- x->e_mbd.mode_info_context->mbmi.mv[0].as_int = mv->as_int;
-}
-
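-// For every 4x4 block carrying the given label, choose its motion vector(s)
-// according to this_mode (NEW/LEFT/ABOVE/ZERO 4X4), write them into the
-// BLOCKD and partition info, and return the mode + MV signalling cost.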
-static int labels2mode(
- MACROBLOCK *x,
- int const *labelings, int which_label,
- B_PREDICTION_MODE this_mode,
- int_mv *this_mv, int_mv *this_second_mv,
- int_mv seg_mvs[MAX_REF_FRAMES - 1],
- int_mv *best_ref_mv,
- int_mv *second_best_ref_mv,
- DEC_MVCOSTS) {
- MACROBLOCKD *const xd = &x->e_mbd;
- MODE_INFO *const mic = xd->mode_info_context;
- MB_MODE_INFO * mbmi = &mic->mbmi;
- const int mis = xd->mode_info_stride;
-
- int i, cost = 0, thismvcost = 0;
-
- /* We have to be careful retrieving previously-encoded motion vectors.
- Ones from this macroblock have to be pulled from the BLOCKD array
- as they have not yet made it to the bmi array in our MB_MODE_INFO. */
- for (i = 0; i < 16; ++i) {
- BLOCKD *const d = xd->block + i;
- const int row = i >> 2, col = i & 3;
-
- B_PREDICTION_MODE m;
-
- if (labelings[i] != which_label)
- continue;
-
- if (col && labelings[i] == labelings[i - 1])
- m = LEFT4X4;
- else if (row && labelings[i] == labelings[i - 4])
- m = ABOVE4X4;
- else {
-      // The only time we should do costing for a new motion vector or mode
-      // is when we are on a new label  (jbb May 08, 2007)
- switch (m = this_mode) {
- case NEW4X4 :
- if (mbmi->second_ref_frame) {
- this_mv->as_int = seg_mvs[mbmi->ref_frame - 1].as_int;
- this_second_mv->as_int =
- seg_mvs[mbmi->second_ref_frame - 1].as_int;
- }
-
- thismvcost = vp9_mv_bit_cost(this_mv, best_ref_mv, MVCOSTS,
- 102, xd->allow_high_precision_mv);
- if (mbmi->second_ref_frame) {
- thismvcost += vp9_mv_bit_cost(this_second_mv, second_best_ref_mv,
- MVCOSTS, 102,
- xd->allow_high_precision_mv);
- }
- break;
- case LEFT4X4:
- this_mv->as_int = col ? d[-1].bmi.as_mv.first.as_int : left_block_mv(mic, i);
- if (mbmi->second_ref_frame)
- this_second_mv->as_int = col ? d[-1].bmi.as_mv.second.as_int : left_block_second_mv(mic, i);
- break;
- case ABOVE4X4:
- this_mv->as_int = row ? d[-4].bmi.as_mv.first.as_int : above_block_mv(mic, i, mis);
- if (mbmi->second_ref_frame)
- this_second_mv->as_int = row ? d[-4].bmi.as_mv.second.as_int : above_block_second_mv(mic, i, mis);
- break;
- case ZERO4X4:
- this_mv->as_int = 0;
- if (mbmi->second_ref_frame)
- this_second_mv->as_int = 0;
- break;
- default:
- break;
- }
-
- if (m == ABOVE4X4) { // replace above with left if same
- int_mv left_mv, left_second_mv;
-
- left_second_mv.as_int = 0;
- left_mv.as_int = col ? d[-1].bmi.as_mv.first.as_int :
- left_block_mv(mic, i);
- if (mbmi->second_ref_frame)
- left_second_mv.as_int = col ? d[-1].bmi.as_mv.second.as_int :
- left_block_second_mv(mic, i);
-
- if (left_mv.as_int == this_mv->as_int &&
- (!mbmi->second_ref_frame ||
- left_second_mv.as_int == this_second_mv->as_int))
- m = LEFT4X4;
- }
-
- cost = x->inter_bmode_costs[ m];
- }
-
- d->bmi.as_mv.first.as_int = this_mv->as_int;
- if (mbmi->second_ref_frame)
- d->bmi.as_mv.second.as_int = this_second_mv->as_int;
-
- x->partition_info->bmi[i].mode = m;
- x->partition_info->bmi[i].mv.as_int = this_mv->as_int;
- if (mbmi->second_ref_frame)
- x->partition_info->bmi[i].second_mv.as_int = this_second_mv->as_int;
- }
-
- cost += thismvcost;
- return cost;
-}
-
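-// Predict, transform (4x4), quantize and cost the blocks belonging to one
-// SPLITMV label; fills in the token rate and distortion for that label and
-// returns its RD cost.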
-static int64_t encode_inter_mb_segment(MACROBLOCK *x,
- int const *labels,
- int which_label,
- int *labelyrate,
- int *distortion,
- ENTROPY_CONTEXT *ta,
- ENTROPY_CONTEXT *tl,
- const VP9_ENCODER_RTCD *rtcd) {
- int i;
- MACROBLOCKD *xd = &x->e_mbd;
-
- *labelyrate = 0;
- *distortion = 0;
- for (i = 0; i < 16; i++) {
- if (labels[i] == which_label) {
- BLOCKD *bd = &x->e_mbd.block[i];
- BLOCK *be = &x->block[i];
- int thisdistortion;
-
- vp9_build_inter_predictors_b(bd, 16, xd->subpixel_predict);
- if (xd->mode_info_context->mbmi.second_ref_frame)
- vp9_build_2nd_inter_predictors_b(bd, 16, xd->subpixel_predict_avg);
- vp9_subtract_b(be, bd, 16);
- x->vp9_short_fdct4x4(be->src_diff, be->coeff, 32);
- x->quantize_b_4x4(be, bd);
- thisdistortion = vp9_block_error(be->coeff, bd->dqcoeff, 16);
- *distortion += thisdistortion;
- *labelyrate += cost_coeffs(x, bd, PLANE_TYPE_Y_WITH_DC,
- ta + vp9_block2above[i],
- tl + vp9_block2left[i], TX_4X4);
- }
- }
- *distortion >>= 2;
- return RDCOST(x->rdmult, x->rddiv, *labelyrate, *distortion);
-}
-
-static int64_t encode_inter_mb_segment_8x8(MACROBLOCK *x,
- int const *labels,
- int which_label,
- int *labelyrate,
- int *distortion,
- int64_t *otherrd,
- ENTROPY_CONTEXT *ta,
- ENTROPY_CONTEXT *tl,
- const VP9_ENCODER_RTCD *rtcd) {
- int i, j;
- MACROBLOCKD *xd = &x->e_mbd;
- const int iblock[4] = { 0, 1, 4, 5 };
- int othercost = 0, otherdist = 0;
- ENTROPY_CONTEXT_PLANES tac, tlc;
- ENTROPY_CONTEXT *tacp = (ENTROPY_CONTEXT *) &tac,
- *tlcp = (ENTROPY_CONTEXT *) &tlc;
-
- if (otherrd) {
- memcpy(&tac, ta, sizeof(ENTROPY_CONTEXT_PLANES));
- memcpy(&tlc, tl, sizeof(ENTROPY_CONTEXT_PLANES));
- }
-
- *distortion = 0;
- *labelyrate = 0;
- for (i = 0; i < 4; i++) {
- int ib = vp9_i8x8_block[i];
-
- if (labels[ib] == which_label) {
- int idx = (ib & 8) + ((ib & 2) << 1);
- BLOCKD *bd = &xd->block[ib], *bd2 = &xd->block[idx];
- BLOCK *be = &x->block[ib], *be2 = &x->block[idx];
- int thisdistortion;
-
- vp9_build_inter_predictors4b(xd, bd, 16);
- if (xd->mode_info_context->mbmi.second_ref_frame)
- vp9_build_2nd_inter_predictors4b(xd, bd, 16);
- vp9_subtract_4b_c(be, bd, 16);
-
- if (xd->mode_info_context->mbmi.txfm_size == TX_4X4) {
- if (otherrd) {
- x->vp9_short_fdct8x8(be->src_diff, be2->coeff, 32);
- x->quantize_b_8x8(be2, bd2);
- thisdistortion = vp9_block_error_c(be2->coeff, bd2->dqcoeff, 64);
- otherdist += thisdistortion;
- othercost += cost_coeffs(x, bd2, PLANE_TYPE_Y_WITH_DC,
- tacp + vp9_block2above_8x8[idx],
- tlcp + vp9_block2left_8x8[idx], TX_8X8);
- }
- for (j = 0; j < 4; j += 2) {
- bd = &xd->block[ib + iblock[j]];
- be = &x->block[ib + iblock[j]];
- x->vp9_short_fdct8x4(be->src_diff, be->coeff, 32);
- x->quantize_b_4x4_pair(be, be + 1, bd, bd + 1);
- thisdistortion = vp9_block_error_c(be->coeff, bd->dqcoeff, 32);
- *distortion += thisdistortion;
- *labelyrate += cost_coeffs(x, bd, PLANE_TYPE_Y_WITH_DC,
- ta + vp9_block2above[ib + iblock[j]],
- tl + vp9_block2left[ib + iblock[j]],
- TX_4X4);
- *labelyrate += cost_coeffs(x, bd + 1, PLANE_TYPE_Y_WITH_DC,
- ta + vp9_block2above[ib + iblock[j] + 1],
- tl + vp9_block2left[ib + iblock[j]],
- TX_4X4);
- }
- } else /* 8x8 */ {
- if (otherrd) {
- for (j = 0; j < 4; j += 2) {
- BLOCKD *bd3 = &xd->block[ib + iblock[j]];
- BLOCK *be3 = &x->block[ib + iblock[j]];
- x->vp9_short_fdct8x4(be3->src_diff, be3->coeff, 32);
- x->quantize_b_4x4_pair(be3, be3 + 1, bd3, bd3 + 1);
- thisdistortion = vp9_block_error_c(be3->coeff, bd3->dqcoeff, 32);
- otherdist += thisdistortion;
- othercost += cost_coeffs(x, bd3, PLANE_TYPE_Y_WITH_DC,
- tacp + vp9_block2above[ib + iblock[j]],
- tlcp + vp9_block2left[ib + iblock[j]],
- TX_4X4);
- othercost += cost_coeffs(x, bd3 + 1, PLANE_TYPE_Y_WITH_DC,
- tacp + vp9_block2above[ib + iblock[j] + 1],
- tlcp + vp9_block2left[ib + iblock[j]],
- TX_4X4);
- }
- }
- x->vp9_short_fdct8x8(be->src_diff, be2->coeff, 32);
- x->quantize_b_8x8(be2, bd2);
- thisdistortion = vp9_block_error_c(be2->coeff, bd2->dqcoeff, 64);
- *distortion += thisdistortion;
- *labelyrate += cost_coeffs(x, bd2, PLANE_TYPE_Y_WITH_DC,
- ta + vp9_block2above_8x8[idx],
- tl + vp9_block2left_8x8[idx], TX_8X8);
- }
- }
- }
- *distortion >>= 2;
- if (otherrd) {
- otherdist >>= 2;
- *otherrd = RDCOST(x->rdmult, x->rddiv, othercost, otherdist);
- }
- return RDCOST(x->rdmult, x->rddiv, *labelyrate, *distortion);
-}
-
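-// Per-partitioning shift applied to the best diamond-search SAD before it is
-// compared against the threshold that triggers a full search (see
-// rd_check_segment_txsize below).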
-static const unsigned int segmentation_to_sseshift[4] = {3, 3, 2, 0};
-
-
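-// Working state for the SPLITMV rate-distortion search: the best
-// partitioning found so far together with its per-block modes, MVs, eobs
-// and rate/distortion totals.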
-typedef struct {
- int_mv *ref_mv, *second_ref_mv;
- int_mv mvp;
-
- int64_t segment_rd;
- SPLITMV_PARTITIONING_TYPE segment_num;
- TX_SIZE txfm_size;
- int r;
- int d;
- int segment_yrate;
- B_PREDICTION_MODE modes[16];
- int_mv mvs[16], second_mvs[16];
- int eobs[16];
-
- int mvthresh;
- int *mdcounts;
-
- int_mv sv_mvp[4]; // save 4 mvp from 8x8
- int sv_istep[2]; // save 2 initial step_param for 16x8/8x16
-
-} BEST_SEG_INFO;
-
-static __inline
-int mv_check_bounds(MACROBLOCK *x, int_mv *mv) {
- int r = 0;
- r |= (mv->as_mv.row >> 3) < x->mv_row_min;
- r |= (mv->as_mv.row >> 3) > x->mv_row_max;
- r |= (mv->as_mv.col >> 3) < x->mv_col_min;
- r |= (mv->as_mv.col >> 3) > x->mv_col_max;
- return r;
-}
-
-static void rd_check_segment_txsize(VP9_COMP *cpi, MACROBLOCK *x,
- BEST_SEG_INFO *bsi,
- SPLITMV_PARTITIONING_TYPE segmentation,
- TX_SIZE tx_size, int64_t *otherrds,
- int64_t *rds, int *completed,
- /* 16 = n_blocks */
- int_mv seg_mvs[16 /* n_blocks */]
- [MAX_REF_FRAMES - 1]) {
- int i, j;
- int const *labels;
- int br = 0, bd = 0;
- B_PREDICTION_MODE this_mode;
- MB_MODE_INFO * mbmi = &x->e_mbd.mode_info_context->mbmi;
-
- int label_count;
- int64_t this_segment_rd = 0, other_segment_rd;
- int label_mv_thresh;
- int rate = 0;
- int sbr = 0, sbd = 0;
- int segmentyrate = 0;
- int best_eobs[16] = { 0 };
-
- vp9_variance_fn_ptr_t *v_fn_ptr;
-
- ENTROPY_CONTEXT_PLANES t_above, t_left;
- ENTROPY_CONTEXT *ta, *tl;
- ENTROPY_CONTEXT_PLANES t_above_b, t_left_b;
- ENTROPY_CONTEXT *ta_b, *tl_b;
-
- vpx_memcpy(&t_above, x->e_mbd.above_context, sizeof(ENTROPY_CONTEXT_PLANES));
- vpx_memcpy(&t_left, x->e_mbd.left_context, sizeof(ENTROPY_CONTEXT_PLANES));
-
- ta = (ENTROPY_CONTEXT *)&t_above;
- tl = (ENTROPY_CONTEXT *)&t_left;
- ta_b = (ENTROPY_CONTEXT *)&t_above_b;
- tl_b = (ENTROPY_CONTEXT *)&t_left_b;
-
- v_fn_ptr = &cpi->fn_ptr[segmentation];
- labels = vp9_mbsplits[segmentation];
- label_count = vp9_mbsplit_count[segmentation];
-
-  // A multiplier of 64 would make this threshold so big that we would very
-  // rarely check MVs on segments; setting it to 1 makes the MV threshold
-  // roughly equal to what it is for macroblocks.
- label_mv_thresh = 1 * bsi->mvthresh / label_count;
-
- // Segmentation method overheads
- rate = cost_token(vp9_mbsplit_tree, vp9_mbsplit_probs,
- vp9_mbsplit_encodings + segmentation);
- rate += vp9_cost_mv_ref(cpi, SPLITMV, bsi->mdcounts);
- this_segment_rd += RDCOST(x->rdmult, x->rddiv, rate, 0);
- br += rate;
- other_segment_rd = this_segment_rd;
-
- mbmi->txfm_size = tx_size;
- for (i = 0; i < label_count && this_segment_rd < bsi->segment_rd; i++) {
- int_mv mode_mv[B_MODE_COUNT], second_mode_mv[B_MODE_COUNT];
- int64_t best_label_rd = INT64_MAX, best_other_rd = INT64_MAX;
- B_PREDICTION_MODE mode_selected = ZERO4X4;
- int bestlabelyrate = 0;
-
- // search for the best motion vector on this segment
- for (this_mode = LEFT4X4; this_mode <= NEW4X4; this_mode ++) {
- int64_t this_rd, other_rd;
- int distortion;
- int labelyrate;
- ENTROPY_CONTEXT_PLANES t_above_s, t_left_s;
- ENTROPY_CONTEXT *ta_s;
- ENTROPY_CONTEXT *tl_s;
-
- vpx_memcpy(&t_above_s, &t_above, sizeof(ENTROPY_CONTEXT_PLANES));
- vpx_memcpy(&t_left_s, &t_left, sizeof(ENTROPY_CONTEXT_PLANES));
-
- ta_s = (ENTROPY_CONTEXT *)&t_above_s;
- tl_s = (ENTROPY_CONTEXT *)&t_left_s;
-
- // motion search for newmv (single predictor case only)
- if (!mbmi->second_ref_frame && this_mode == NEW4X4) {
- int sseshift, n;
- int step_param = 0;
- int further_steps;
- int thissme, bestsme = INT_MAX;
- BLOCK *c;
- BLOCKD *e;
-
-        /* Is the best so far sufficiently good that we can't justify doing
-         * a new motion search? */
- if (best_label_rd < label_mv_thresh)
- break;
-
- if (cpi->compressor_speed) {
- if (segmentation == PARTITIONING_8X16 ||
- segmentation == PARTITIONING_16X8) {
- bsi->mvp.as_int = bsi->sv_mvp[i].as_int;
- if (i == 1 && segmentation == PARTITIONING_16X8)
- bsi->mvp.as_int = bsi->sv_mvp[2].as_int;
-
- step_param = bsi->sv_istep[i];
- }
-
- // use previous block's result as next block's MV predictor.
- if (segmentation == PARTITIONING_4X4 && i > 0) {
- bsi->mvp.as_int = x->e_mbd.block[i - 1].bmi.as_mv.first.as_int;
- if (i == 4 || i == 8 || i == 12)
- bsi->mvp.as_int = x->e_mbd.block[i - 4].bmi.as_mv.first.as_int;
- step_param = 2;
- }
- }
-
- further_steps = (MAX_MVSEARCH_STEPS - 1) - step_param;
-
- {
- int sadpb = x->sadperbit4;
- int_mv mvp_full;
-
- mvp_full.as_mv.row = bsi->mvp.as_mv.row >> 3;
- mvp_full.as_mv.col = bsi->mvp.as_mv.col >> 3;
-
- // find first label
- n = vp9_mbsplit_offset[segmentation][i];
-
- c = &x->block[n];
- e = &x->e_mbd.block[n];
-
- bestsme = vp9_full_pixel_diamond(cpi, x, c, e, &mvp_full, step_param,
- sadpb, further_steps, 0, v_fn_ptr,
- bsi->ref_mv, &mode_mv[NEW4X4]);
-
- sseshift = segmentation_to_sseshift[segmentation];
-
- // Should we do a full search (best quality only)
- if ((cpi->compressor_speed == 0) && (bestsme >> sseshift) > 4000) {
- /* Check if mvp_full is within the range. */
- clamp_mv(&mvp_full, x->mv_col_min, x->mv_col_max,
- x->mv_row_min, x->mv_row_max);
-
- thissme = cpi->full_search_sad(x, c, e, &mvp_full,
- sadpb, 16, v_fn_ptr,
- XMVCOST, bsi->ref_mv);
-
- if (thissme < bestsme) {
- bestsme = thissme;
- mode_mv[NEW4X4].as_int = e->bmi.as_mv.first.as_int;
- } else {
- /* The full search result is actually worse so re-instate the
- * previous best vector */
- e->bmi.as_mv.first.as_int = mode_mv[NEW4X4].as_int;
- }
- }
- }
-
- if (bestsme < INT_MAX) {
- int distortion;
- unsigned int sse;
- cpi->find_fractional_mv_step(x, c, e, &mode_mv[NEW4X4],
- bsi->ref_mv, x->errorperbit, v_fn_ptr,
- XMVCOST, &distortion, &sse);
-
-          // save motion search result for use in compound prediction
- seg_mvs[i][mbmi->ref_frame - 1].as_int = mode_mv[NEW4X4].as_int;
- }
- } /* NEW4X4 */
- else if (mbmi->second_ref_frame && this_mode == NEW4X4) {
- /* motion search not completed? Then skip newmv for this block with
- * comppred */
- if (seg_mvs[i][mbmi->second_ref_frame - 1].as_int == INVALID_MV ||
- seg_mvs[i][mbmi->ref_frame - 1].as_int == INVALID_MV) {
- continue;
- }
- }
-
- rate = labels2mode(x, labels, i, this_mode, &mode_mv[this_mode],
- &second_mode_mv[this_mode], seg_mvs[i],
- bsi->ref_mv, bsi->second_ref_mv, XMVCOST);
-
- // Trap vectors that reach beyond the UMV borders
- if (((mode_mv[this_mode].as_mv.row >> 3) < x->mv_row_min) ||
- ((mode_mv[this_mode].as_mv.row >> 3) > x->mv_row_max) ||
- ((mode_mv[this_mode].as_mv.col >> 3) < x->mv_col_min) ||
- ((mode_mv[this_mode].as_mv.col >> 3) > x->mv_col_max)) {
- continue;
- }
- if (mbmi->second_ref_frame &&
- mv_check_bounds(x, &second_mode_mv[this_mode]))
- continue;
-
- if (segmentation == PARTITIONING_4X4) {
- this_rd = encode_inter_mb_segment(x, labels, i, &labelyrate,
- &distortion,
- ta_s, tl_s, IF_RTCD(&cpi->rtcd));
- other_rd = this_rd;
- } else {
- this_rd = encode_inter_mb_segment_8x8(x, labels, i, &labelyrate,
- &distortion, &other_rd,
- ta_s, tl_s, IF_RTCD(&cpi->rtcd));
- }
- this_rd += RDCOST(x->rdmult, x->rddiv, rate, 0);
- rate += labelyrate;
-
- if (this_rd < best_label_rd) {
- sbr = rate;
- sbd = distortion;
- bestlabelyrate = labelyrate;
- mode_selected = this_mode;
- best_label_rd = this_rd;
- if (x->e_mbd.mode_info_context->mbmi.txfm_size == TX_4X4) {
- for (j = 0; j < 16; j++)
- if (labels[j] == i)
- best_eobs[j] = x->e_mbd.block[j].eob;
- } else {
- for (j = 0; j < 4; j++) {
- int ib = vp9_i8x8_block[j], idx = j * 4;
-
- if (labels[ib] == i)
- best_eobs[idx] = x->e_mbd.block[idx].eob;
- }
- }
- if (other_rd < best_other_rd)
- best_other_rd = other_rd;
-
- vpx_memcpy(ta_b, ta_s, sizeof(ENTROPY_CONTEXT_PLANES));
- vpx_memcpy(tl_b, tl_s, sizeof(ENTROPY_CONTEXT_PLANES));
-
- }
- } /*for each 4x4 mode*/
-
- vpx_memcpy(ta, ta_b, sizeof(ENTROPY_CONTEXT_PLANES));
- vpx_memcpy(tl, tl_b, sizeof(ENTROPY_CONTEXT_PLANES));
-
- labels2mode(x, labels, i, mode_selected, &mode_mv[mode_selected],
- &second_mode_mv[mode_selected], seg_mvs[i],
- bsi->ref_mv, bsi->second_ref_mv, XMVCOST);
-
- br += sbr;
- bd += sbd;
- segmentyrate += bestlabelyrate;
- this_segment_rd += best_label_rd;
- other_segment_rd += best_other_rd;
- if (rds)
- rds[i] = this_segment_rd;
- if (otherrds)
- otherrds[i] = other_segment_rd;
- } /* for each label */
-
- if (this_segment_rd < bsi->segment_rd) {
- bsi->r = br;
- bsi->d = bd;
- bsi->segment_yrate = segmentyrate;
- bsi->segment_rd = this_segment_rd;
- bsi->segment_num = segmentation;
- bsi->txfm_size = mbmi->txfm_size;
-
- // store everything needed to come back to this!!
- for (i = 0; i < 16; i++) {
- BLOCKD *bd = &x->e_mbd.block[i];
-
- bsi->mvs[i].as_mv = x->partition_info->bmi[i].mv.as_mv;
- if (mbmi->second_ref_frame)
- bsi->second_mvs[i].as_mv = x->partition_info->bmi[i].second_mv.as_mv;
- bsi->modes[i] = x->partition_info->bmi[i].mode;
- bsi->eobs[i] = best_eobs[i];
- }
- }
-
- if (completed) {
- *completed = i;
- }
-}
-
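-// Evaluate one SPLITMV partitioning, trying the 4x4 and/or 8x8 transform as
-// allowed by the current transform mode, and update the per-transform-mode
-// RD cache (txfm_cache) once all labels have been completed.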
-static void rd_check_segment(VP9_COMP *cpi, MACROBLOCK *x,
- BEST_SEG_INFO *bsi,
- unsigned int segmentation,
- /* 16 = n_blocks */
- int_mv seg_mvs[16][MAX_REF_FRAMES - 1],
- int64_t txfm_cache[NB_TXFM_MODES]) {
- int i, n, c = vp9_mbsplit_count[segmentation];
-
- if (segmentation == PARTITIONING_4X4) {
- int64_t rd[16];
-
- rd_check_segment_txsize(cpi, x, bsi, segmentation, TX_4X4, NULL,
- rd, &n, seg_mvs);
- if (n == c) {
- for (i = 0; i < NB_TXFM_MODES; i++) {
- if (rd[c - 1] < txfm_cache[i])
- txfm_cache[i] = rd[c - 1];
- }
- }
- } else {
- int64_t diff, base_rd;
- int cost4x4 = vp9_cost_bit(cpi->common.prob_tx[0], 0);
- int cost8x8 = vp9_cost_bit(cpi->common.prob_tx[0], 1);
-
- if (cpi->common.txfm_mode == TX_MODE_SELECT) {
- int64_t rd4x4[4], rd8x8[4];
- int n4x4, n8x8, nmin;
- BEST_SEG_INFO bsi4x4, bsi8x8;
-
- /* factor in cost of cost4x4/8x8 in decision */
- vpx_memcpy(&bsi4x4, bsi, sizeof(*bsi));
- vpx_memcpy(&bsi8x8, bsi, sizeof(*bsi));
- rd_check_segment_txsize(cpi, x, &bsi4x4, segmentation,
- TX_4X4, NULL, rd4x4, &n4x4, seg_mvs);
- rd_check_segment_txsize(cpi, x, &bsi8x8, segmentation,
- TX_8X8, NULL, rd8x8, &n8x8, seg_mvs);
- if (bsi4x4.segment_num == segmentation) {
- bsi4x4.segment_rd += RDCOST(x->rdmult, x->rddiv, cost4x4, 0);
- if (bsi4x4.segment_rd < bsi->segment_rd)
- vpx_memcpy(bsi, &bsi4x4, sizeof(*bsi));
- }
- if (bsi8x8.segment_num == segmentation) {
- bsi8x8.segment_rd += RDCOST(x->rdmult, x->rddiv, cost8x8, 0);
- if (bsi8x8.segment_rd < bsi->segment_rd)
- vpx_memcpy(bsi, &bsi8x8, sizeof(*bsi));
- }
- n = n4x4 > n8x8 ? n4x4 : n8x8;
- if (n == c) {
- nmin = n4x4 < n8x8 ? n4x4 : n8x8;
- diff = rd8x8[nmin - 1] - rd4x4[nmin - 1];
- if (n == n4x4) {
- base_rd = rd4x4[c - 1];
- } else {
- base_rd = rd8x8[c - 1] - diff;
- }
- }
- } else {
- int64_t rd[4], otherrd[4];
-
- if (cpi->common.txfm_mode == ONLY_4X4) {
- rd_check_segment_txsize(cpi, x, bsi, segmentation, TX_4X4, otherrd,
- rd, &n, seg_mvs);
- if (n == c) {
- base_rd = rd[c - 1];
- diff = otherrd[c - 1] - rd[c - 1];
- }
- } else /* use 8x8 transform */ {
- rd_check_segment_txsize(cpi, x, bsi, segmentation, TX_8X8, otherrd,
- rd, &n, seg_mvs);
- if (n == c) {
- diff = rd[c - 1] - otherrd[c - 1];
- base_rd = otherrd[c - 1];
- }
- }
- }
-
- if (n == c) {
- if (base_rd < txfm_cache[ONLY_4X4]) {
- txfm_cache[ONLY_4X4] = base_rd;
- }
- if (base_rd + diff < txfm_cache[1]) {
- txfm_cache[ALLOW_8X8] = txfm_cache[ALLOW_16X16] = base_rd + diff;
- }
- if (diff < 0) {
- base_rd += diff + RDCOST(x->rdmult, x->rddiv, cost8x8, 0);
- } else {
- base_rd += RDCOST(x->rdmult, x->rddiv, cost4x4, 0);
- }
- if (base_rd < txfm_cache[TX_MODE_SELECT]) {
- txfm_cache[TX_MODE_SELECT] = base_rd;
- }
- }
- }
-}
-
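-// Convert a motion search range sr (in full pels) into an initial diamond
-// search step parameter: the wider the range, the smaller step_param, which
-// leaves more search steps available.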
-static __inline void cal_step_param(int sr, int *sp) {
- int step = 0;
-
- if (sr > MAX_FIRST_STEP) sr = MAX_FIRST_STEP;
- else if (sr < 1) sr = 1;
-
- while (sr >>= 1)
- step++;
-
- *sp = MAX_MVSEARCH_STEPS - 1 - step;
-}
-
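-// Pick the best SPLITMV partitioning (16x8, 8x16, 8x8 or 4x4) by RD cost and
-// write the winning per-block modes and MVs back into the macroblock. In
-// non-best-quality modes the 8x8 result seeds the 16x8/8x16 searches and
-// gates whether the 4x4 search is done at all.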
-static int rd_pick_best_mbsegmentation(VP9_COMP *cpi, MACROBLOCK *x,
- int_mv *best_ref_mv,
- int_mv *second_best_ref_mv,
- int64_t best_rd,
- int *mdcounts,
- int *returntotrate,
- int *returnyrate,
- int *returndistortion,
- int *skippable, int mvthresh,
- int_mv seg_mvs[NB_PARTITIONINGS]
- [16 /* n_blocks */]
- [MAX_REF_FRAMES - 1],
- int64_t txfm_cache[NB_TXFM_MODES]) {
- int i;
- BEST_SEG_INFO bsi;
- MB_MODE_INFO * mbmi = &x->e_mbd.mode_info_context->mbmi;
-
- vpx_memset(&bsi, 0, sizeof(bsi));
- for (i = 0; i < NB_TXFM_MODES; i++)
- txfm_cache[i] = INT64_MAX;
-
- bsi.segment_rd = best_rd;
- bsi.ref_mv = best_ref_mv;
- bsi.second_ref_mv = second_best_ref_mv;
- bsi.mvp.as_int = best_ref_mv->as_int;
- bsi.mvthresh = mvthresh;
- bsi.mdcounts = mdcounts;
- bsi.txfm_size = TX_4X4;
-
- for (i = 0; i < 16; i++)
- bsi.modes[i] = ZERO4X4;
-
- if (cpi->compressor_speed == 0) {
- /* for now, we will keep the original segmentation order
- when in best quality mode */
- rd_check_segment(cpi, x, &bsi, PARTITIONING_16X8,
- seg_mvs[PARTITIONING_16X8], txfm_cache);
- rd_check_segment(cpi, x, &bsi, PARTITIONING_8X16,
- seg_mvs[PARTITIONING_8X16], txfm_cache);
- rd_check_segment(cpi, x, &bsi, PARTITIONING_8X8,
- seg_mvs[PARTITIONING_8X8], txfm_cache);
- rd_check_segment(cpi, x, &bsi, PARTITIONING_4X4,
- seg_mvs[PARTITIONING_4X4], txfm_cache);
- } else {
- int sr;
-
- rd_check_segment(cpi, x, &bsi, PARTITIONING_8X8,
- seg_mvs[PARTITIONING_8X8], txfm_cache);
-
- if (bsi.segment_rd < best_rd) {
- int tmp_col_min = x->mv_col_min;
- int tmp_col_max = x->mv_col_max;
- int tmp_row_min = x->mv_row_min;
- int tmp_row_max = x->mv_row_max;
-
- vp9_clamp_mv_min_max(x, best_ref_mv);
-
- /* Get 8x8 result */
- bsi.sv_mvp[0].as_int = bsi.mvs[0].as_int;
- bsi.sv_mvp[1].as_int = bsi.mvs[2].as_int;
- bsi.sv_mvp[2].as_int = bsi.mvs[8].as_int;
- bsi.sv_mvp[3].as_int = bsi.mvs[10].as_int;
-
-      /* Use the 8x8 result as the 16x8/8x16 predictor MV. Adjust the search
-       * range according to the closeness of the 2 MVs. */
- /* block 8X16 */
- sr = MAXF((abs(bsi.sv_mvp[0].as_mv.row - bsi.sv_mvp[2].as_mv.row)) >> 3,
- (abs(bsi.sv_mvp[0].as_mv.col - bsi.sv_mvp[2].as_mv.col)) >> 3);
- cal_step_param(sr, &bsi.sv_istep[0]);
-
- sr = MAXF((abs(bsi.sv_mvp[1].as_mv.row - bsi.sv_mvp[3].as_mv.row)) >> 3,
- (abs(bsi.sv_mvp[1].as_mv.col - bsi.sv_mvp[3].as_mv.col)) >> 3);
- cal_step_param(sr, &bsi.sv_istep[1]);
-
- rd_check_segment(cpi, x, &bsi, PARTITIONING_8X16,
- seg_mvs[PARTITIONING_8X16], txfm_cache);
-
- /* block 16X8 */
- sr = MAXF((abs(bsi.sv_mvp[0].as_mv.row - bsi.sv_mvp[1].as_mv.row)) >> 3,
- (abs(bsi.sv_mvp[0].as_mv.col - bsi.sv_mvp[1].as_mv.col)) >> 3);
- cal_step_param(sr, &bsi.sv_istep[0]);
-
- sr = MAXF((abs(bsi.sv_mvp[2].as_mv.row - bsi.sv_mvp[3].as_mv.row)) >> 3,
- (abs(bsi.sv_mvp[2].as_mv.col - bsi.sv_mvp[3].as_mv.col)) >> 3);
- cal_step_param(sr, &bsi.sv_istep[1]);
-
- rd_check_segment(cpi, x, &bsi, PARTITIONING_16X8,
- seg_mvs[PARTITIONING_16X8], txfm_cache);
-
- /* If 8x8 is better than 16x8/8x16, then do 4x4 search */
-      /* Do not skip the 4x4 search if speed=0 (good quality) */
- if (cpi->sf.no_skip_block4x4_search ||
- bsi.segment_num == PARTITIONING_8X8) {
- /* || (sv_segment_rd8x8-bsi.segment_rd) < sv_segment_rd8x8>>5) */
- bsi.mvp.as_int = bsi.sv_mvp[0].as_int;
- rd_check_segment(cpi, x, &bsi, PARTITIONING_4X4,
- seg_mvs[PARTITIONING_4X4], txfm_cache);
- }
-
- /* restore UMV window */
- x->mv_col_min = tmp_col_min;
- x->mv_col_max = tmp_col_max;
- x->mv_row_min = tmp_row_min;
- x->mv_row_max = tmp_row_max;
- }
- }
-
- /* set it to the best */
- for (i = 0; i < 16; i++) {
- BLOCKD *bd = &x->e_mbd.block[i];
-
- bd->bmi.as_mv.first.as_int = bsi.mvs[i].as_int;
- if (mbmi->second_ref_frame)
- bd->bmi.as_mv.second.as_int = bsi.second_mvs[i].as_int;
- bd->eob = bsi.eobs[i];
- }
-
- *returntotrate = bsi.r;
- *returndistortion = bsi.d;
- *returnyrate = bsi.segment_yrate;
- *skippable = bsi.txfm_size == TX_4X4 ?
- vp9_mby_is_skippable_4x4(&x->e_mbd, 0) :
- vp9_mby_is_skippable_8x8(&x->e_mbd, 0);
-
- /* save partitions */
- mbmi->txfm_size = bsi.txfm_size;
- mbmi->partitioning = bsi.segment_num;
- x->partition_info->count = vp9_mbsplit_count[bsi.segment_num];
-
- for (i = 0; i < x->partition_info->count; i++) {
- int j;
-
- j = vp9_mbsplit_offset[bsi.segment_num][i];
-
- x->partition_info->bmi[i].mode = bsi.modes[j];
- x->partition_info->bmi[i].mv.as_mv = bsi.mvs[j].as_mv;
- if (mbmi->second_ref_frame)
- x->partition_info->bmi[i].second_mv.as_mv = bsi.second_mvs[j].as_mv;
- }
- /*
- * used to set mbmi->mv.as_int
- */
- x->partition_info->bmi[15].mv.as_int = bsi.mvs[15].as_int;
- if (mbmi->second_ref_frame)
- x->partition_info->bmi[15].second_mv.as_int = bsi.second_mvs[15].as_int;
-
- return bsi.segment_rd;
-}
-
-/* Order arr in increasing order. */
-static void insertsortmv(int arr[], int len) {
- int i, j, k;
-
- for (i = 1; i <= len - 1; i++) {
- for (j = 0; j < i; j++) {
- if (arr[j] > arr[i]) {
- int temp;
-
- temp = arr[i];
-
- for (k = i; k > j; k--)
- arr[k] = arr[k - 1];
-
- arr[j] = temp;
- }
- }
- }
-}
-
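-/* Order arr in increasing order, keeping the original positions in idx. */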
-static void insertsortsad(int arr[], int idx[], int len) {
- int i, j, k;
-
- for (i = 1; i <= len - 1; i++) {
- for (j = 0; j < i; j++) {
- if (arr[j] > arr[i]) {
- int temp, tempi;
-
- temp = arr[i];
- tempi = idx[i];
-
- for (k = i; k > j; k--) {
- arr[k] = arr[k - 1];
- idx[k] = idx[k - 1];
- }
-
- arr[j] = temp;
- idx[j] = tempi;
- }
- }
- }
-}
-
-// The improved MV prediction
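-// Collect up to 8 candidate MVs (3 neighbours in the current frame, 5
-// co-located/neighbouring blocks in the last frame). Prefer the lowest-SAD
-// candidate whose reference frame matches; otherwise fall back to the
-// component-wise median of all candidates. *sr returns a suggested search
-// range (0 lets the caller decide).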
-void vp9_mv_pred(VP9_COMP *cpi, MACROBLOCKD *xd, const MODE_INFO *here,
- int_mv *mvp, int refframe, int *ref_frame_sign_bias,
- int *sr, int near_sadidx[]) {
- const MODE_INFO *above = here - xd->mode_info_stride;
- const MODE_INFO *left = here - 1;
- const MODE_INFO *aboveleft = above - 1;
- int_mv near_mvs[8];
- int near_ref[8];
- int_mv mv;
- int vcnt = 0;
- int find = 0;
- int mb_offset;
-
- int mvx[8];
- int mvy[8];
- int i;
-
- mv.as_int = 0;
-
- if (here->mbmi.ref_frame != INTRA_FRAME) {
- near_mvs[0].as_int = near_mvs[1].as_int = near_mvs[2].as_int = near_mvs[3].as_int = near_mvs[4].as_int = near_mvs[5].as_int = near_mvs[6].as_int = near_mvs[7].as_int = 0;
- near_ref[0] = near_ref[1] = near_ref[2] = near_ref[3] = near_ref[4] = near_ref[5] = near_ref[6] = near_ref[7] = 0;
-
-    // Read in 3 nearby blocks' MVs from the current frame as prediction
-    // candidates.
- if (above->mbmi.ref_frame != INTRA_FRAME) {
- near_mvs[vcnt].as_int = above->mbmi.mv[0].as_int;
- mv_bias(ref_frame_sign_bias[above->mbmi.ref_frame], refframe, &near_mvs[vcnt], ref_frame_sign_bias);
- near_ref[vcnt] = above->mbmi.ref_frame;
- }
- vcnt++;
- if (left->mbmi.ref_frame != INTRA_FRAME) {
- near_mvs[vcnt].as_int = left->mbmi.mv[0].as_int;
- mv_bias(ref_frame_sign_bias[left->mbmi.ref_frame], refframe, &near_mvs[vcnt], ref_frame_sign_bias);
- near_ref[vcnt] = left->mbmi.ref_frame;
- }
- vcnt++;
- if (aboveleft->mbmi.ref_frame != INTRA_FRAME) {
- near_mvs[vcnt].as_int = aboveleft->mbmi.mv[0].as_int;
- mv_bias(ref_frame_sign_bias[aboveleft->mbmi.ref_frame], refframe, &near_mvs[vcnt], ref_frame_sign_bias);
- near_ref[vcnt] = aboveleft->mbmi.ref_frame;
- }
- vcnt++;
-
-    // Read in 5 nearby blocks' MVs from the last frame.
- if (cpi->common.last_frame_type != KEY_FRAME) {
- mb_offset = (-xd->mb_to_top_edge / 128 + 1) * (xd->mode_info_stride + 1) + (-xd->mb_to_left_edge / 128 + 1);
-
- // current in last frame
- if (cpi->lf_ref_frame[mb_offset] != INTRA_FRAME) {
- near_mvs[vcnt].as_int = cpi->lfmv[mb_offset].as_int;
- mv_bias(cpi->lf_ref_frame_sign_bias[mb_offset], refframe, &near_mvs[vcnt], ref_frame_sign_bias);
- near_ref[vcnt] = cpi->lf_ref_frame[mb_offset];
- }
- vcnt++;
-
- // above in last frame
- if (cpi->lf_ref_frame[mb_offset - xd->mode_info_stride - 1] != INTRA_FRAME) {
- near_mvs[vcnt].as_int = cpi->lfmv[mb_offset - xd->mode_info_stride - 1].as_int;
- mv_bias(cpi->lf_ref_frame_sign_bias[mb_offset - xd->mode_info_stride - 1], refframe, &near_mvs[vcnt], ref_frame_sign_bias);
- near_ref[vcnt] = cpi->lf_ref_frame[mb_offset - xd->mode_info_stride - 1];
- }
- vcnt++;
-
- // left in last frame
- if (cpi->lf_ref_frame[mb_offset - 1] != INTRA_FRAME) {
- near_mvs[vcnt].as_int = cpi->lfmv[mb_offset - 1].as_int;
- mv_bias(cpi->lf_ref_frame_sign_bias[mb_offset - 1], refframe, &near_mvs[vcnt], ref_frame_sign_bias);
- near_ref[vcnt] = cpi->lf_ref_frame[mb_offset - 1];
- }
- vcnt++;
-
- // right in last frame
- if (cpi->lf_ref_frame[mb_offset + 1] != INTRA_FRAME) {
- near_mvs[vcnt].as_int = cpi->lfmv[mb_offset + 1].as_int;
- mv_bias(cpi->lf_ref_frame_sign_bias[mb_offset + 1], refframe, &near_mvs[vcnt], ref_frame_sign_bias);
- near_ref[vcnt] = cpi->lf_ref_frame[mb_offset + 1];
- }
- vcnt++;
-
- // below in last frame
- if (cpi->lf_ref_frame[mb_offset + xd->mode_info_stride + 1] != INTRA_FRAME) {
- near_mvs[vcnt].as_int = cpi->lfmv[mb_offset + xd->mode_info_stride + 1].as_int;
- mv_bias(cpi->lf_ref_frame_sign_bias[mb_offset + xd->mode_info_stride + 1], refframe, &near_mvs[vcnt], ref_frame_sign_bias);
- near_ref[vcnt] = cpi->lf_ref_frame[mb_offset + xd->mode_info_stride + 1];
- }
- vcnt++;
- }
-
- for (i = 0; i < vcnt; i++) {
- if (near_ref[near_sadidx[i]] != INTRA_FRAME) {
- if (here->mbmi.ref_frame == near_ref[near_sadidx[i]]) {
- mv.as_int = near_mvs[near_sadidx[i]].as_int;
- find = 1;
- if (i < 3)
- *sr = 3;
- else
- *sr = 2;
- break;
- }
- }
- }
-
- if (!find) {
- for (i = 0; i < vcnt; i++) {
- mvx[i] = near_mvs[i].as_mv.row;
- mvy[i] = near_mvs[i].as_mv.col;
- }
-
- insertsortmv(mvx, vcnt);
- insertsortmv(mvy, vcnt);
- mv.as_mv.row = mvx[vcnt / 2];
- mv.as_mv.col = mvy[vcnt / 2];
-
- find = 1;
- // sr is set to 0 to allow calling function to decide the search range.
- *sr = 0;
- }
- }
-
- /* Set up return values */
- mvp->as_int = mv.as_int;
- clamp_mv2(mvp, xd);
-}
-
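-// Compute the SADs between the current MB and its nearby blocks (3 in the
-// current frame, up to 5 in the last frame) and sort near_sadidx by SAD so
-// that vp9_mv_pred() can rank its MV candidates.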
-static void cal_sad(VP9_COMP *cpi, MACROBLOCKD *xd, MACROBLOCK *x,
- int recon_yoffset, int near_sadidx[],
- enum BlockSize block_size) {
-  /* near_sad index layout: 0 - above (current frame), 1 - left (cf),
-   * 2 - above-left (cf), 3 - co-located (last frame), 4 - above (lf),
-   * 5 - left (lf), 6 - right (lf), 7 - below (lf) */
- int near_sad[8] = {0};
- BLOCK *b = &x->block[0];
- unsigned char *src_y_ptr = *(b->base_src);
- const unsigned char *dst_y_ptr = xd->dst.y_buffer;
- const int bs = (block_size == BLOCK_16X16) ? 16 : 32;
- const int dst_y_str = xd->dst.y_stride;
-
-  // Calculate the SAD for the 3 nearby MBs in the current frame.
- if (xd->mb_to_top_edge == 0 && xd->mb_to_left_edge == 0) {
- near_sad[0] = near_sad[1] = near_sad[2] = INT_MAX;
- } else if (xd->mb_to_top_edge == 0) {
- // only has left MB for sad calculation.
- near_sad[0] = near_sad[2] = INT_MAX;
- near_sad[1] = cpi->fn_ptr[block_size].sdf(src_y_ptr, b->src_stride,
- dst_y_ptr - bs,
- dst_y_str, 0x7fffffff);
- } else if (xd->mb_to_left_edge == 0) {
-    // only has above MB for sad calculation.
- near_sad[1] = near_sad[2] = INT_MAX;
- near_sad[0] = cpi->fn_ptr[block_size].sdf(src_y_ptr, b->src_stride,
- dst_y_ptr - dst_y_str * bs,
- dst_y_str, 0x7fffffff);
- } else {
- near_sad[0] = cpi->fn_ptr[block_size].sdf(src_y_ptr, b->src_stride,
- dst_y_ptr - dst_y_str * bs,
- dst_y_str, 0x7fffffff);
- near_sad[1] = cpi->fn_ptr[block_size].sdf(src_y_ptr, b->src_stride,
- dst_y_ptr - bs,
- dst_y_str, 0x7fffffff);
- near_sad[2] = cpi->fn_ptr[block_size].sdf(src_y_ptr, b->src_stride,
- dst_y_ptr - dst_y_str * bs - bs,
- dst_y_str, 0x7fffffff);
- }
-
- if (cpi->common.last_frame_type != KEY_FRAME) {
-    // Calculate the SAD for the 5 nearby MBs in the last frame.
- unsigned char *pre_y_buffer = cpi->common.yv12_fb[cpi->common.lst_fb_idx].y_buffer + recon_yoffset;
- const int pre_y_str = cpi->common.yv12_fb[cpi->common.lst_fb_idx].y_stride;
-
- if (xd->mb_to_top_edge == 0) near_sad[4] = INT_MAX;
- if (xd->mb_to_left_edge == 0) near_sad[5] = INT_MAX;
- if (xd->mb_to_right_edge == 0) near_sad[6] = INT_MAX;
- if (xd->mb_to_bottom_edge == 0) near_sad[7] = INT_MAX;
-
- near_sad[3] = cpi->fn_ptr[block_size].sdf(src_y_ptr, b->src_stride,
- pre_y_buffer,
- pre_y_str, 0x7fffffff);
- if (near_sad[4] != INT_MAX)
- near_sad[4] = cpi->fn_ptr[block_size].sdf(src_y_ptr, b->src_stride,
- pre_y_buffer - pre_y_str * bs,
- pre_y_str, 0x7fffffff);
- if (near_sad[5] != INT_MAX)
- near_sad[5] = cpi->fn_ptr[block_size].sdf(src_y_ptr, b->src_stride,
- pre_y_buffer - bs,
- pre_y_str, 0x7fffffff);
- if (near_sad[6] != INT_MAX)
- near_sad[6] = cpi->fn_ptr[block_size].sdf(src_y_ptr, b->src_stride,
- pre_y_buffer + bs,
- pre_y_str, 0x7fffffff);
- if (near_sad[7] != INT_MAX)
- near_sad[7] = cpi->fn_ptr[block_size].sdf(src_y_ptr, b->src_stride,
- pre_y_buffer + pre_y_str * bs,
- pre_y_str, 0x7fffffff);
- }
-
- if (cpi->common.last_frame_type != KEY_FRAME) {
- insertsortsad(near_sad, near_sadidx, 8);
- } else {
- insertsortsad(near_sad, near_sadidx, 3);
- }
-}
-
-static void set_i8x8_block_modes(MACROBLOCK *x, int modes[2][4]) {
- int i;
- MACROBLOCKD *xd = &x->e_mbd;
- for (i = 0; i < 4; i++) {
- int ib = vp9_i8x8_block[i];
- xd->mode_info_context->bmi[ib + 0].as_mode.first = modes[0][i];
- xd->mode_info_context->bmi[ib + 1].as_mode.first = modes[0][i];
- xd->mode_info_context->bmi[ib + 4].as_mode.first = modes[0][i];
- xd->mode_info_context->bmi[ib + 5].as_mode.first = modes[0][i];
-#if CONFIG_COMP_INTRA_PRED
- xd->mode_info_context->bmi[ib + 0].as_mode.second = modes[1][i];
- xd->mode_info_context->bmi[ib + 1].as_mode.second = modes[1][i];
- xd->mode_info_context->bmi[ib + 4].as_mode.second = modes[1][i];
- xd->mode_info_context->bmi[ib + 5].as_mode.second = modes[1][i];
-#endif
- // printf("%d,%d,%d,%d %d,%d,%d,%d\n",
- // modes[0][0], modes[0][1], modes[0][2], modes[0][3],
- // modes[1][0], modes[1][1], modes[1][2], modes[1][3]);
- }
-
- for (i = 0; i < 16; i++) {
- xd->block[i].bmi = xd->mode_info_context->bmi[i];
- }
-}
-
-extern void vp9_calc_ref_probs(int *count, vp9_prob *probs);
-static void estimate_curframe_refprobs(VP9_COMP *cpi, vp9_prob mod_refprobs[3], int pred_ref) {
- int norm_cnt[MAX_REF_FRAMES];
- const int *const rfct = cpi->count_mb_ref_frame_usage;
- int intra_count = rfct[INTRA_FRAME];
- int last_count = rfct[LAST_FRAME];
- int gf_count = rfct[GOLDEN_FRAME];
- int arf_count = rfct[ALTREF_FRAME];
-
- // Work out modified reference frame probabilities to use where prediction
- // of the reference frame fails
- if (pred_ref == INTRA_FRAME) {
- norm_cnt[0] = 0;
- norm_cnt[1] = last_count;
- norm_cnt[2] = gf_count;
- norm_cnt[3] = arf_count;
- vp9_calc_ref_probs(norm_cnt, mod_refprobs);
- mod_refprobs[0] = 0; // This branch implicit
- } else if (pred_ref == LAST_FRAME) {
- norm_cnt[0] = intra_count;
- norm_cnt[1] = 0;
- norm_cnt[2] = gf_count;
- norm_cnt[3] = arf_count;
- vp9_calc_ref_probs(norm_cnt, mod_refprobs);
- mod_refprobs[1] = 0; // This branch implicit
- } else if (pred_ref == GOLDEN_FRAME) {
- norm_cnt[0] = intra_count;
- norm_cnt[1] = last_count;
- norm_cnt[2] = 0;
- norm_cnt[3] = arf_count;
- vp9_calc_ref_probs(norm_cnt, mod_refprobs);
- mod_refprobs[2] = 0; // This branch implicit
- } else {
- norm_cnt[0] = intra_count;
- norm_cnt[1] = last_count;
- norm_cnt[2] = gf_count;
- norm_cnt[3] = 0;
- vp9_calc_ref_probs(norm_cnt, mod_refprobs);
- mod_refprobs[2] = 0; // This branch implicit
- }
-}
-
-static __inline unsigned weighted_cost(vp9_prob *tab0, vp9_prob *tab1, int idx, int val, int weight) {
- unsigned cost0 = tab0[idx] ? vp9_cost_bit(tab0[idx], val) : 0;
- unsigned cost1 = tab1[idx] ? vp9_cost_bit(tab1[idx], val) : 0;
- // weight is 16-bit fixed point, so this basically calculates:
- // 0.5 + weight * cost1 + (1.0 - weight) * cost0
- return (0x8000 + weight * cost1 + (0x10000 - weight) * cost0) >> 16;
-}
-
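-// Estimate the cost of signalling each candidate reference frame for this
-// MB, blending the prediction probability carried over from the last frame
-// with the counts gathered so far in the current frame (weighted by
-// cpi->seg0_progress).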
-static void estimate_ref_frame_costs(VP9_COMP *cpi, int segment_id, unsigned int *ref_costs) {
- VP9_COMMON *cm = &cpi->common;
- MACROBLOCKD *xd = &cpi->mb.e_mbd;
- vp9_prob *mod_refprobs;
-
- unsigned int cost;
- int pred_ref;
- int pred_flag;
- int pred_ctx;
- int i;
- int tot_count;
-
- vp9_prob pred_prob, new_pred_prob;
- int seg_ref_active;
- int seg_ref_count = 0;
- seg_ref_active = vp9_segfeature_active(xd,
- segment_id,
- SEG_LVL_REF_FRAME);
-
- if (seg_ref_active) {
- seg_ref_count = vp9_check_segref(xd, segment_id, INTRA_FRAME) +
- vp9_check_segref(xd, segment_id, LAST_FRAME) +
- vp9_check_segref(xd, segment_id, GOLDEN_FRAME) +
- vp9_check_segref(xd, segment_id, ALTREF_FRAME);
- }
-
- // Get the predicted reference for this mb
- pred_ref = vp9_get_pred_ref(cm, xd);
-
- // Get the context probability for the prediction flag (based on last frame)
- pred_prob = vp9_get_pred_prob(cm, xd, PRED_REF);
-
- // Predict probability for current frame based on stats so far
- pred_ctx = vp9_get_pred_context(cm, xd, PRED_REF);
- tot_count = cpi->ref_pred_count[pred_ctx][0] + cpi->ref_pred_count[pred_ctx][1];
- if (tot_count) {
- new_pred_prob =
- (cpi->ref_pred_count[pred_ctx][0] * 255 + (tot_count >> 1)) / tot_count;
- new_pred_prob += !new_pred_prob;
- } else
- new_pred_prob = 128;
-
- // Get the set of probabilities to use if prediction fails
- mod_refprobs = cm->mod_refprobs[pred_ref];
-
- // For each possible selected reference frame work out a cost.
- for (i = 0; i < MAX_REF_FRAMES; i++) {
- if (seg_ref_active && seg_ref_count == 1) {
- cost = 0;
- } else {
- pred_flag = (i == pred_ref);
-
- // Get the prediction for the current mb
- cost = weighted_cost(&pred_prob, &new_pred_prob, 0,
- pred_flag, cpi->seg0_progress);
- if (cost > 1024) cost = 768; // i.e. account for 4 bits max.
-
- // for incorrectly predicted cases
- if (! pred_flag) {
- vp9_prob curframe_mod_refprobs[3];
-
- if (cpi->seg0_progress) {
- estimate_curframe_refprobs(cpi, curframe_mod_refprobs, pred_ref);
- } else {
- vpx_memset(curframe_mod_refprobs, 0, sizeof(curframe_mod_refprobs));
- }
-
- cost += weighted_cost(mod_refprobs, curframe_mod_refprobs, 0,
- (i != INTRA_FRAME), cpi->seg0_progress);
- if (i != INTRA_FRAME) {
- cost += weighted_cost(mod_refprobs, curframe_mod_refprobs, 1,
- (i != LAST_FRAME), cpi->seg0_progress);
- if (i != LAST_FRAME) {
- cost += weighted_cost(mod_refprobs, curframe_mod_refprobs, 2,
- (i != GOLDEN_FRAME), cpi->seg0_progress);
- }
- }
- }
- }
-
- ref_costs[i] = cost;
- }
-}
-
-static void store_coding_context(MACROBLOCK *x, PICK_MODE_CONTEXT *ctx,
- int mode_index,
- PARTITION_INFO *partition,
- int_mv *ref_mv,
- int_mv *second_ref_mv,
- int single_pred_diff,
- int comp_pred_diff,
- int hybrid_pred_diff,
- int64_t txfm_size_diff[NB_TXFM_MODES]) {
- MACROBLOCKD *xd = &x->e_mbd;
- MB_MODE_INFO *mbmi = &xd->mode_info_context->mbmi;
-
- // Take a snapshot of the coding context so it can be
- // restored if we decide to encode this way
- ctx->best_mode_index = mode_index;
- vpx_memcpy(&ctx->mic, xd->mode_info_context,
- sizeof(MODE_INFO));
- if (partition)
- vpx_memcpy(&ctx->partition_info, partition,
- sizeof(PARTITION_INFO));
- ctx->best_ref_mv.as_int = ref_mv->as_int;
- ctx->second_best_ref_mv.as_int = second_ref_mv->as_int;
-
- // ctx[mb_index].rddiv = x->rddiv;
- // ctx[mb_index].rdmult = x->rdmult;
-
- ctx->single_pred_diff = single_pred_diff;
- ctx->comp_pred_diff = comp_pred_diff;
- ctx->hybrid_pred_diff = hybrid_pred_diff;
-
- if (txfm_size_diff) {
- memcpy(ctx->txfm_rd_diff, txfm_size_diff, sizeof(ctx->txfm_rd_diff));
- } else {
- memset(ctx->txfm_rd_diff, 0, sizeof(ctx->txfm_rd_diff));
- }
-}
-
-static void inter_mode_cost(VP9_COMP *cpi, MACROBLOCK *x, int this_mode,
- int *rate2, int *distortion2, int *rate_y,
- int *distortion, int* rate_uv, int *distortion_uv,
- int *skippable, int64_t txfm_cache[NB_TXFM_MODES]) {
- int y_skippable, uv_skippable;
-
- // Y cost and distortion
- macro_block_yrd(cpi, x, rate_y, distortion, &y_skippable, txfm_cache);
-
- *rate2 += *rate_y;
- *distortion2 += *distortion;
-
- // UV cost and distortion
- if (x->e_mbd.mode_info_context->mbmi.txfm_size != TX_4X4)
- rd_inter16x16_uv_8x8(cpi, x, rate_uv, distortion_uv,
- cpi->common.full_pixel, &uv_skippable);
- else
- rd_inter16x16_uv(cpi, x, rate_uv, distortion_uv, cpi->common.full_pixel,
- &uv_skippable);
- *rate2 += *rate_uv;
- *distortion2 += *distortion_uv;
- *skippable = y_skippable && uv_skippable;
-}
-
-#define MIN(x,y) (((x)<(y))?(x):(y))
-#define MAX(x,y) (((x)>(y))?(x):(y))
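-// Point the prediction buffers at one reference frame and find its
-// nearest/near/best reference MVs for use in the mode loop below.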
-static void setup_buffer_inter(VP9_COMP *cpi, MACROBLOCK *x,
- int idx, int frame_type,
- int recon_yoffset, int recon_uvoffset,
- int_mv frame_nearest_mv[4],
- int_mv frame_near_mv[4],
- int_mv frame_best_ref_mv[4],
- int frame_mdcounts[4][4],
- unsigned char *y_buffer[4],
- unsigned char *u_buffer[4],
- unsigned char *v_buffer[4]) {
- YV12_BUFFER_CONFIG *yv12 = &cpi->common.yv12_fb[idx];
- MACROBLOCKD *xd = &x->e_mbd;
- MB_MODE_INFO * mbmi = &xd->mode_info_context->mbmi;
-
-
- vp9_find_near_mvs(xd, xd->mode_info_context,
- xd->prev_mode_info_context,
- &frame_nearest_mv[frame_type], &frame_near_mv[frame_type],
- &frame_best_ref_mv[frame_type], frame_mdcounts[frame_type],
- frame_type, cpi->common.ref_frame_sign_bias);
-
- y_buffer[frame_type] = yv12->y_buffer + recon_yoffset;
- u_buffer[frame_type] = yv12->u_buffer + recon_uvoffset;
- v_buffer[frame_type] = yv12->v_buffer + recon_uvoffset;
-
-#if CONFIG_NEWBESTREFMV
- vp9_find_mv_refs(xd, xd->mode_info_context,
- xd->prev_mode_info_context,
- frame_type,
- mbmi->ref_mvs[frame_type],
- cpi->common.ref_frame_sign_bias);
-
- vp9_find_best_ref_mvs(xd, y_buffer[frame_type],
- yv12->y_stride,
- mbmi->ref_mvs[frame_type],
- &frame_best_ref_mv[frame_type],
- &frame_nearest_mv[frame_type],
- &frame_near_mv[frame_type]);
-#endif
-}
-
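-// Rate-distortion evaluation of a single inter mode, including the NEWMV
-// motion search. Returns INT64_MAX if the mode cannot be used, the RD cost
-// if the encode-breakout path decided to skip the block, or 0 if the caller
-// should compute the RD cost itself.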
-static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
- enum BlockSize block_size,
- int *saddone, int near_sadidx[],
- int mdcounts[4], int64_t txfm_cache[],
- int *rate2, int *distortion, int *skippable,
- int *compmode_cost,
- int *rate_y, int *distortion_y,
- int *rate_uv, int *distortion_uv,
- int *mode_excluded, int *disable_skip,
- int recon_yoffset, int mode_index,
- int_mv frame_mv[MB_MODE_COUNT][MAX_REF_FRAMES],
- int_mv frame_best_ref_mv[4]) {
- VP9_COMMON *cm = &cpi->common;
- MACROBLOCKD *xd = &x->e_mbd;
- MB_MODE_INFO *mbmi = &xd->mode_info_context->mbmi;
- BLOCK *b = &x->block[0];
- BLOCKD *d = &xd->block[0];
- const int is_comp_pred = (mbmi->second_ref_frame != 0);
- const int num_refs = is_comp_pred ? 2 : 1;
- const int this_mode = mbmi->mode;
- int i;
- int refs[2] = { mbmi->ref_frame, mbmi->second_ref_frame };
- int_mv cur_mv[2];
- int_mv mvp;
- int64_t this_rd = 0;
-
- switch (this_mode) {
- case NEWMV:
- if (is_comp_pred) {
- if (frame_mv[NEWMV][refs[0]].as_int == INVALID_MV ||
- frame_mv[NEWMV][refs[1]].as_int == INVALID_MV)
- return INT64_MAX;
- *rate2 += vp9_mv_bit_cost(&frame_mv[NEWMV][refs[0]],
- &frame_best_ref_mv[refs[0]],
- XMVCOST, 96,
- x->e_mbd.allow_high_precision_mv);
- *rate2 += vp9_mv_bit_cost(&frame_mv[NEWMV][refs[1]],
- &frame_best_ref_mv[refs[1]],
- XMVCOST, 96,
- x->e_mbd.allow_high_precision_mv);
- } else {
- int bestsme = INT_MAX;
- int further_steps, step_param = cpi->sf.first_step;
- int sadpb = x->sadperbit16;
- int_mv mvp_full, tmp_mv;
-        // search range obtained from mv_pred(); it uses step_param levels (0-7)
- int sr = 0;
-
- int tmp_col_min = x->mv_col_min;
- int tmp_col_max = x->mv_col_max;
- int tmp_row_min = x->mv_row_min;
- int tmp_row_max = x->mv_row_max;
-
- vp9_clamp_mv_min_max(x, &frame_best_ref_mv[refs[0]]);
-
- if (!*saddone) {
- cal_sad(cpi, xd, x, recon_yoffset, &near_sadidx[0], block_size);
- *saddone = 1;
- }
-
- vp9_mv_pred(cpi, &x->e_mbd, x->e_mbd.mode_info_context, &mvp,
- mbmi->ref_frame, cpi->common.ref_frame_sign_bias,
- &sr, &near_sadidx[0]);
-
- mvp_full.as_mv.col = mvp.as_mv.col >> 3;
- mvp_full.as_mv.row = mvp.as_mv.row >> 3;
-
- // adjust search range according to sr from mv prediction
- step_param = MAX(step_param, sr);
-
- // Further step/diamond searches as necessary
- further_steps = (cpi->sf.max_step_search_steps - 1) - step_param;
-
- bestsme = vp9_full_pixel_diamond(cpi, x, b, d, &mvp_full, step_param,
- sadpb, further_steps, 1,
- &cpi->fn_ptr[block_size],
- &frame_best_ref_mv[refs[0]], &tmp_mv);
-
- x->mv_col_min = tmp_col_min;
- x->mv_col_max = tmp_col_max;
- x->mv_row_min = tmp_row_min;
- x->mv_row_max = tmp_row_max;
-
- if (bestsme < INT_MAX) {
- int dis; /* TODO: use dis in distortion calculation later. */
- unsigned int sse;
- cpi->find_fractional_mv_step(x, b, d, &tmp_mv,
- &frame_best_ref_mv[refs[0]],
- x->errorperbit,
- &cpi->fn_ptr[block_size],
- XMVCOST, &dis, &sse);
- }
- d->bmi.as_mv.first.as_int = tmp_mv.as_int;
- frame_mv[NEWMV][refs[0]].as_int = d->bmi.as_mv.first.as_int;
-
- // Add the new motion vector cost to our rolling cost variable
- *rate2 += vp9_mv_bit_cost(&tmp_mv, &frame_best_ref_mv[refs[0]],
- XMVCOST, 96, xd->allow_high_precision_mv);
- }
- break;
- case NEARESTMV:
- case NEARMV:
- // Do not bother proceeding if the vector (from newmv, nearest or
- // near) is 0,0 as this should then be coded using the zeromv mode.
- for (i = 0; i < num_refs; ++i)
- if (frame_mv[this_mode][refs[i]].as_int == 0)
- return INT64_MAX;
- case ZEROMV:
- default:
- break;
- }
- for (i = 0; i < num_refs; ++i) {
- cur_mv[i] = frame_mv[this_mode][refs[i]];
-    // Clip "next_nearest" so that it does not extend too far out of the image
- clamp_mv2(&cur_mv[i], xd);
- if (mv_check_bounds(x, &cur_mv[i]))
- return INT64_MAX;
- mbmi->mv[i].as_int = cur_mv[i].as_int;
- }
-
-#if CONFIG_PRED_FILTER
- // Filtered prediction:
- mbmi->pred_filter_enabled = vp9_mode_order[mode_index].pred_filter_flag;
- *rate2 += vp9_cost_bit(cpi->common.prob_pred_filter_off,
- mbmi->pred_filter_enabled);
-#endif
- if (cpi->common.mcomp_filter_type == SWITCHABLE) {
- const int c = vp9_get_pred_context(cm, xd, PRED_SWITCHABLE_INTERP);
- const int m = vp9_switchable_interp_map[mbmi->interp_filter];
- *rate2 += SWITCHABLE_INTERP_RATE_FACTOR * x->switchable_interp_costs[c][m];
- }
-
- /* We don't include the cost of the second reference here, because there
- * are only three options: Last/Golden, ARF/Last or Golden/ARF, or in other
- * words if you present them in that order, the second one is always known
- * if the first is known */
- *compmode_cost = vp9_cost_bit(vp9_get_pred_prob(cm, xd, PRED_COMP),
- is_comp_pred);
- *rate2 += vp9_cost_mv_ref(cpi, this_mode, mdcounts);
-
- if (block_size == BLOCK_16X16) {
- vp9_build_1st_inter16x16_predictors_mby(xd, xd->predictor, 16, 0);
- if (is_comp_pred)
- vp9_build_2nd_inter16x16_predictors_mby(xd, xd->predictor, 16);
- } else {
-#if CONFIG_SUPERBLOCKS
- vp9_build_inter32x32_predictors_sb(xd,
- xd->dst.y_buffer,
- xd->dst.u_buffer,
- xd->dst.v_buffer,
- xd->dst.y_stride,
- xd->dst.uv_stride);
-#endif
- }
-
- if (cpi->active_map_enabled && x->active_ptr[0] == 0)
- x->skip = 1;
- else if (x->encode_breakout) {
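-    // Encode breakout: if the inter prediction alone is already good enough
-    // (prediction SSE below a quantizer-derived threshold and the residual
-    // close to a small uniform change), skip encoding this macroblock and
-    // charge a nominal rate instead.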
- unsigned int sse, var;
- int threshold = (xd->block[0].dequant[1]
- * xd->block[0].dequant[1] >> 4);
-
- if (threshold < x->encode_breakout)
- threshold = x->encode_breakout;
-
- if (block_size == BLOCK_16X16) {
- var = vp9_variance16x16(*(b->base_src), b->src_stride,
- xd->predictor, 16, &sse);
- } else {
-#if CONFIG_SUPERBLOCKS
- var = vp9_variance32x32(*(b->base_src), b->src_stride,
- xd->dst.y_buffer, xd->dst.y_stride, &sse);
-#endif
- }
-
- if (sse < threshold) {
- unsigned int q2dc = xd->block[24].dequant[0];
-      /* If there is no codeable 2nd order dc
-         or a very small uniform pixel change */
- if ((sse - var < q2dc * q2dc >> 4) ||
- (sse / 2 > var && sse - var < 64)) {
- // Check u and v to make sure skip is ok
- int sse2;
-
- if (block_size == BLOCK_16X16) {
- sse2 = vp9_uvsse(x);
- } else {
- unsigned int sse2u, sse2v;
- var = vp9_variance16x16(x->src.u_buffer, x->src.uv_stride,
- xd->dst.u_buffer, xd->dst.uv_stride, &sse2u);
- var = vp9_variance16x16(x->src.v_buffer, x->src.uv_stride,
- xd->dst.v_buffer, xd->dst.uv_stride, &sse2v);
- sse2 = sse2u + sse2v;
- }
-
- if (sse2 * 2 < threshold) {
- x->skip = 1;
- *distortion = sse + sse2;
- *rate2 = 500;
-
- /* for best_yrd calculation */
- *rate_uv = 0;
- *distortion_uv = sse2;
-
- *disable_skip = 1;
- this_rd = RDCOST(x->rdmult, x->rddiv, *rate2, *distortion);
- }
- }
- }
- }
-
- if (!x->skip) {
- if (block_size == BLOCK_16X16) {
- vp9_build_1st_inter16x16_predictors_mbuv(xd, &xd->predictor[256],
- &xd->predictor[320], 8);
- if (is_comp_pred)
- vp9_build_2nd_inter16x16_predictors_mbuv(xd, &xd->predictor[256],
- &xd->predictor[320], 8);
- inter_mode_cost(cpi, x, this_mode, rate2, distortion,
- rate_y, distortion_y, rate_uv, distortion_uv,
- skippable, txfm_cache);
- } else {
-#if CONFIG_SUPERBLOCKS
- int skippable_y, skippable_uv;
-
- // Y cost and distortion - FIXME support other transform sizes
- super_block_yrd_8x8(x, rate_y, distortion_y,
- IF_RTCD(&cpi->rtcd), &skippable_y);
- *rate2 += *rate_y;
- *distortion += *distortion_y;
-
- rd_inter32x32_uv_8x8(cpi, x, rate_uv, distortion_uv,
- cm->full_pixel, &skippable_uv);
-
- *rate2 += *rate_uv;
- *distortion += *distortion_uv;
- *skippable = skippable_y && skippable_uv;
-#endif
- }
- }
- if (is_comp_pred) {
- *mode_excluded = (cpi->common.comp_pred_mode == SINGLE_PREDICTION_ONLY);
- } else {
- *mode_excluded = (cpi->common.comp_pred_mode == COMP_PREDICTION_ONLY);
- }
-
- return this_rd; // if 0, this will be re-calculated by caller
-}
-
-void vp9_rd_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
- int recon_yoffset, int recon_uvoffset,
- int *returnrate, int *returndistortion,
- int64_t *returnintra) {
- VP9_COMMON *cm = &cpi->common;
- MACROBLOCKD *xd = &x->e_mbd;
- union b_mode_info best_bmodes[16];
- MB_MODE_INFO best_mbmode;
- PARTITION_INFO best_partition;
- int_mv best_ref_mv, second_best_ref_mv;
- MB_PREDICTION_MODE this_mode;
- MB_MODE_INFO * mbmi = &xd->mode_info_context->mbmi;
- int i, best_mode_index = 0;
- int mode8x8[2][4];
- unsigned char segment_id = mbmi->segment_id;
-
- int mode_index;
- int mdcounts[4];
- int rate, distortion;
- int rate2, distortion2;
- int64_t best_txfm_rd[NB_TXFM_MODES];
- int64_t best_txfm_diff[NB_TXFM_MODES];
- int64_t best_pred_diff[NB_PREDICTION_TYPES];
- int64_t best_pred_rd[NB_PREDICTION_TYPES];
- int64_t best_rd = INT64_MAX, best_intra_rd = INT64_MAX;
-#if CONFIG_PRED_FILTER
- int64_t best_overall_rd = INT64_MAX;
-#endif
- int uv_intra_rate, uv_intra_distortion, uv_intra_rate_tokenonly;
- int uv_intra_skippable = 0;
- int uv_intra_rate_8x8 = 0, uv_intra_distortion_8x8 = 0, uv_intra_rate_tokenonly_8x8 = 0;
- int uv_intra_skippable_8x8 = 0;
- int rate_y, UNINITIALIZED_IS_SAFE(rate_uv);
- int distortion_uv = INT_MAX;
- int64_t best_yrd = INT64_MAX;
-#if CONFIG_PRED_FILTER
- int best_filter_state;
-#endif
- int switchable_filter_index = 0;
-
- MB_PREDICTION_MODE uv_intra_mode;
- MB_PREDICTION_MODE uv_intra_mode_8x8 = 0;
-
- int near_sadidx[8] = {0, 1, 2, 3, 4, 5, 6, 7};
- int saddone = 0;
-
- int_mv frame_mv[MB_MODE_COUNT][MAX_REF_FRAMES];
- int_mv frame_best_ref_mv[4];
- int frame_mdcounts[4][4];
- unsigned char *y_buffer[4], *u_buffer[4], *v_buffer[4];
-
- unsigned int ref_costs[MAX_REF_FRAMES];
- int_mv seg_mvs[NB_PARTITIONINGS][16 /* n_blocks */][MAX_REF_FRAMES - 1];
-
- vpx_memset(mode8x8, 0, sizeof(mode8x8));
- vpx_memset(&frame_mv, 0, sizeof(frame_mv));
- vpx_memset(&best_mbmode, 0, sizeof(best_mbmode));
- vpx_memset(&best_bmodes, 0, sizeof(best_bmodes));
- vpx_memset(&x->mb_context[xd->mb_index], 0, sizeof(PICK_MODE_CONTEXT));
-
- for (i = 0; i < MAX_REF_FRAMES; i++)
- frame_mv[NEWMV][i].as_int = INVALID_MV;
- for (i = 0; i < NB_PREDICTION_TYPES; ++i)
- best_pred_rd[i] = INT64_MAX;
- for (i = 0; i < NB_TXFM_MODES; i++)
- best_txfm_rd[i] = INT64_MAX;
-
- for (i = 0; i < NB_PARTITIONINGS; i++) {
- int j, k;
-
- for (j = 0; j < 16; j++)
- for (k = 0; k < MAX_REF_FRAMES - 1; k++)
- seg_mvs[i][j][k].as_int = INVALID_MV;
- }
-
- if (cpi->ref_frame_flags & VP9_LAST_FLAG) {
- setup_buffer_inter(cpi, x, cpi->common.lst_fb_idx, LAST_FRAME,
- recon_yoffset, recon_uvoffset, frame_mv[NEARESTMV],
- frame_mv[NEARMV], frame_best_ref_mv,
- frame_mdcounts, y_buffer, u_buffer, v_buffer);
- }
-
- if (cpi->ref_frame_flags & VP9_GOLD_FLAG) {
- setup_buffer_inter(cpi, x, cpi->common.gld_fb_idx, GOLDEN_FRAME,
- recon_yoffset, recon_uvoffset, frame_mv[NEARESTMV],
- frame_mv[NEARMV], frame_best_ref_mv,
- frame_mdcounts, y_buffer, u_buffer, v_buffer);
- }
-
- if (cpi->ref_frame_flags & VP9_ALT_FLAG) {
- setup_buffer_inter(cpi, x, cpi->common.alt_fb_idx, ALTREF_FRAME,
- recon_yoffset, recon_uvoffset, frame_mv[NEARESTMV],
- frame_mv[NEARMV], frame_best_ref_mv,
- frame_mdcounts, y_buffer, u_buffer, v_buffer);
- }
-
- *returnintra = INT64_MAX;
-
- x->skip = 0;
-
- mbmi->ref_frame = INTRA_FRAME;
-
- /* Initialize zbin mode boost for uv costing */
- cpi->zbin_mode_boost = 0;
- vp9_update_zbin_extra(cpi, x);
-
- rd_pick_intra_mbuv_mode(cpi, x, &uv_intra_rate,
- &uv_intra_rate_tokenonly, &uv_intra_distortion,
- &uv_intra_skippable);
- uv_intra_mode = mbmi->uv_mode;
-
- /* rough estimate for now */
- if (cpi->common.txfm_mode != ONLY_4X4) {
- rd_pick_intra_mbuv_mode_8x8(cpi, x, &uv_intra_rate_8x8,
- &uv_intra_rate_tokenonly_8x8,
- &uv_intra_distortion_8x8,
- &uv_intra_skippable_8x8);
- uv_intra_mode_8x8 = mbmi->uv_mode;
- }
-
- // Get estimates of reference frame costs for each reference frame
- // that depend on the current prediction etc.
- estimate_ref_frame_costs(cpi, segment_id, ref_costs);
-
- for (mode_index = 0; mode_index < MAX_MODES;
- mode_index += (!switchable_filter_index)) {
- int64_t this_rd = INT64_MAX;
- int disable_skip = 0, skippable = 0;
- int other_cost = 0;
- int compmode_cost = 0;
- int mode_excluded = 0;
- int64_t txfm_cache[NB_TXFM_MODES] = { 0 };
-
-    // These variables hold our rolling total cost and distortion for this mode
- rate2 = 0;
- distortion2 = 0;
- rate_y = 0;
- rate_uv = 0;
-
- this_mode = vp9_mode_order[mode_index].mode;
- mbmi->mode = this_mode;
- mbmi->uv_mode = DC_PRED;
- mbmi->ref_frame = vp9_mode_order[mode_index].ref_frame;
- mbmi->second_ref_frame = vp9_mode_order[mode_index].second_ref_frame;
-#if CONFIG_PRED_FILTER
- mbmi->pred_filter_enabled = 0;
-#endif
- if (cpi->common.mcomp_filter_type == SWITCHABLE &&
- this_mode >= NEARESTMV && this_mode <= SPLITMV) {
- mbmi->interp_filter =
- vp9_switchable_interp[switchable_filter_index++];
- if (switchable_filter_index == VP9_SWITCHABLE_FILTERS)
- switchable_filter_index = 0;
- } else {
- mbmi->interp_filter = cpi->common.mcomp_filter_type;
- }
- vp9_setup_interp_filters(xd, mbmi->interp_filter, &cpi->common);
-
- // Test best rd so far against threshold for trying this mode.
- if (best_rd <= cpi->rd_threshes[mode_index])
- continue;
-
- // current coding mode under rate-distortion optimization test loop
-#if CONFIG_COMP_INTRA_PRED
- mbmi->second_mode = (MB_PREDICTION_MODE)(DC_PRED - 1);
- mbmi->second_uv_mode = (MB_PREDICTION_MODE)(DC_PRED - 1);
-#endif
-
- // If the segment reference frame feature is enabled....
- // then do nothing if the current ref frame is not allowed..
- if (vp9_segfeature_active(xd, segment_id, SEG_LVL_REF_FRAME) &&
- !vp9_check_segref(xd, segment_id, mbmi->ref_frame)) {
- continue;
- // If the segment mode feature is enabled....
- // then do nothing if the current mode is not allowed..
- } else if (vp9_segfeature_active(xd, segment_id, SEG_LVL_MODE) &&
- (this_mode !=
- vp9_get_segdata(xd, segment_id, SEG_LVL_MODE))) {
- continue;
-      // Disable this drop-out case if either the mode or the ref frame
-      // segment level feature is enabled for this segment. This is to
-      // prevent the possibility that we end up unable to pick any mode.
- } else if (!vp9_segfeature_active(xd, segment_id, SEG_LVL_REF_FRAME) &&
- !vp9_segfeature_active(xd, segment_id, SEG_LVL_MODE)) {
- // Only consider ZEROMV/ALTREF_FRAME for alt ref frame,
- // unless ARNR filtering is enabled in which case we want
- // an unfiltered alternative
- if (cpi->is_src_frame_alt_ref && (cpi->oxcf.arnr_max_frames == 0)) {
- if (this_mode != ZEROMV ||
- mbmi->ref_frame != ALTREF_FRAME) {
- continue;
- }
- }
- }
-
- /* everything but intra */
- if (mbmi->ref_frame) {
- int ref = mbmi->ref_frame;
-
- xd->pre.y_buffer = y_buffer[ref];
- xd->pre.u_buffer = u_buffer[ref];
- xd->pre.v_buffer = v_buffer[ref];
- best_ref_mv = frame_best_ref_mv[ref];
- vpx_memcpy(mdcounts, frame_mdcounts[ref], sizeof(mdcounts));
- }
-
- if (mbmi->second_ref_frame) {
- int ref = mbmi->second_ref_frame;
-
- xd->second_pre.y_buffer = y_buffer[ref];
- xd->second_pre.u_buffer = u_buffer[ref];
- xd->second_pre.v_buffer = v_buffer[ref];
- second_best_ref_mv = frame_best_ref_mv[ref];
- }
-
- // Experimental code. Special case for gf and arf zeromv modes.
- // Increase zbin size to suppress noise
- if (cpi->zbin_mode_boost_enabled) {
- if (vp9_mode_order[mode_index].ref_frame == INTRA_FRAME)
- cpi->zbin_mode_boost = 0;
- else {
- if (vp9_mode_order[mode_index].mode == ZEROMV) {
- if (vp9_mode_order[mode_index].ref_frame != LAST_FRAME)
- cpi->zbin_mode_boost = GF_ZEROMV_ZBIN_BOOST;
- else
- cpi->zbin_mode_boost = LF_ZEROMV_ZBIN_BOOST;
- } else if (vp9_mode_order[mode_index].mode == SPLITMV)
- cpi->zbin_mode_boost = 0;
- else
- cpi->zbin_mode_boost = MV_ZBIN_BOOST;
- }
-
- vp9_update_zbin_extra(cpi, x);
- }
-
- // Intra
- if (!mbmi->ref_frame) {
- switch (this_mode) {
- default:
- case DC_PRED:
- case V_PRED:
- case H_PRED:
- case TM_PRED:
- case D45_PRED:
- case D135_PRED:
- case D117_PRED:
- case D153_PRED:
- case D27_PRED:
- case D63_PRED:
- mbmi->ref_frame = INTRA_FRAME;
- // FIXME compound intra prediction
- vp9_build_intra_predictors_mby(&x->e_mbd);
- macro_block_yrd(cpi, x, &rate_y, &distortion, &skippable, txfm_cache);
- rate2 += rate_y;
- distortion2 += distortion;
- rate2 += x->mbmode_cost[xd->frame_type][mbmi->mode];
- if (mbmi->txfm_size != TX_4X4) {
- rate2 += uv_intra_rate_8x8;
- rate_uv = uv_intra_rate_tokenonly_8x8;
- distortion2 += uv_intra_distortion_8x8;
- distortion_uv = uv_intra_distortion_8x8;
- skippable = skippable && uv_intra_skippable_8x8;
- } else {
- rate2 += uv_intra_rate;
- rate_uv = uv_intra_rate_tokenonly;
- distortion2 += uv_intra_distortion;
- distortion_uv = uv_intra_distortion;
- skippable = skippable && uv_intra_skippable;
- }
- break;
- case B_PRED: {
- int64_t tmp_rd;
-
-        // Note the rate value returned here includes the cost of coding
-        // the B_PRED mode: x->mbmode_cost[xd->frame_type][B_PRED];
- mbmi->txfm_size = TX_4X4;
- tmp_rd = rd_pick_intra4x4mby_modes(cpi, x, &rate, &rate_y, &distortion, best_yrd,
-#if CONFIG_COMP_INTRA_PRED
- 0,
-#endif
- 0);
- rate2 += rate;
- distortion2 += distortion;
-
- if (tmp_rd < best_yrd) {
- rate2 += uv_intra_rate;
- rate_uv = uv_intra_rate_tokenonly;
- distortion2 += uv_intra_distortion;
- distortion_uv = uv_intra_distortion;
- } else {
- this_rd = INT64_MAX;
- disable_skip = 1;
- }
- }
- break;
- case I8X8_PRED: {
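-          // Evaluate 8x8 intra prediction with both the 4x4 and 8x8 transform
-          // and keep the cheaper option; cost0/cost1 are the bit costs of
-          // signaling the transform size when TX_MODE_SELECT is in use.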
- int cost0 = vp9_cost_bit(cm->prob_tx[0], 0);
- int cost1 = vp9_cost_bit(cm->prob_tx[0], 1);
- int64_t tmp_rd_4x4s, tmp_rd_8x8s;
- int64_t tmp_rd_4x4, tmp_rd_8x8, tmp_rd;
- int r4x4, tok4x4, d4x4, r8x8, tok8x8, d8x8;
- mbmi->txfm_size = TX_4X4;
- tmp_rd_4x4 = rd_pick_intra8x8mby_modes(cpi, x, &r4x4, &tok4x4,
- &d4x4, best_yrd);
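-          // bmi entries 0, 2, 8 and 10 are the top-left 4x4 blocks of the
-          // four 8x8 quadrants, so they carry the chosen 8x8 modes.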
- mode8x8[0][0] = xd->mode_info_context->bmi[0].as_mode.first;
- mode8x8[0][1] = xd->mode_info_context->bmi[2].as_mode.first;
- mode8x8[0][2] = xd->mode_info_context->bmi[8].as_mode.first;
- mode8x8[0][3] = xd->mode_info_context->bmi[10].as_mode.first;
-#if CONFIG_COMP_INTRA_PRED
- mode8x8[1][0] = xd->mode_info_context->bmi[0].as_mode.second;
- mode8x8[1][1] = xd->mode_info_context->bmi[2].as_mode.second;
- mode8x8[1][2] = xd->mode_info_context->bmi[8].as_mode.second;
- mode8x8[1][3] = xd->mode_info_context->bmi[10].as_mode.second;
-#endif
- mbmi->txfm_size = TX_8X8;
- tmp_rd_8x8 = rd_pick_intra8x8mby_modes(cpi, x, &r8x8, &tok8x8,
- &d8x8, best_yrd);
- txfm_cache[ONLY_4X4] = tmp_rd_4x4;
- txfm_cache[ALLOW_8X8] = tmp_rd_8x8;
- txfm_cache[ALLOW_16X16] = tmp_rd_8x8;
- tmp_rd_4x4s = tmp_rd_4x4 + RDCOST(x->rdmult, x->rddiv, cost0, 0);
- tmp_rd_8x8s = tmp_rd_8x8 + RDCOST(x->rdmult, x->rddiv, cost1, 0);
-          txfm_cache[TX_MODE_SELECT] =
-              tmp_rd_4x4s < tmp_rd_8x8s ? tmp_rd_4x4s : tmp_rd_8x8s;
- if (cm->txfm_mode == TX_MODE_SELECT) {
- if (tmp_rd_4x4s < tmp_rd_8x8s) {
- rate = r4x4 + cost0;
- rate_y = tok4x4 + cost0;
- distortion = d4x4;
- mbmi->txfm_size = TX_4X4;
- tmp_rd = tmp_rd_4x4s;
- } else {
- rate = r8x8 + cost1;
- rate_y = tok8x8 + cost1;
- distortion = d8x8;
- mbmi->txfm_size = TX_8X8;
- tmp_rd = tmp_rd_8x8s;
-
- mode8x8[0][0] = xd->mode_info_context->bmi[0].as_mode.first;
- mode8x8[0][1] = xd->mode_info_context->bmi[2].as_mode.first;
- mode8x8[0][2] = xd->mode_info_context->bmi[8].as_mode.first;
- mode8x8[0][3] = xd->mode_info_context->bmi[10].as_mode.first;
-#if CONFIG_COMP_INTRA_PRED
- mode8x8[1][0] = xd->mode_info_context->bmi[0].as_mode.second;
- mode8x8[1][1] = xd->mode_info_context->bmi[2].as_mode.second;
- mode8x8[1][2] = xd->mode_info_context->bmi[8].as_mode.second;
- mode8x8[1][3] = xd->mode_info_context->bmi[10].as_mode.second;
-#endif
- }
- } else if (cm->txfm_mode == ONLY_4X4) {
- rate = r4x4;
- rate_y = tok4x4;
- distortion = d4x4;
- mbmi->txfm_size = TX_4X4;
- tmp_rd = tmp_rd_4x4;
- } else {
- rate = r8x8;
- rate_y = tok8x8;
- distortion = d8x8;
- mbmi->txfm_size = TX_8X8;
- tmp_rd = tmp_rd_8x8;
-
- mode8x8[0][0] = xd->mode_info_context->bmi[0].as_mode.first;
- mode8x8[0][1] = xd->mode_info_context->bmi[2].as_mode.first;
- mode8x8[0][2] = xd->mode_info_context->bmi[8].as_mode.first;
- mode8x8[0][3] = xd->mode_info_context->bmi[10].as_mode.first;
-#if CONFIG_COMP_INTRA_PRED
- mode8x8[1][0] = xd->mode_info_context->bmi[0].as_mode.second;
- mode8x8[1][1] = xd->mode_info_context->bmi[2].as_mode.second;
- mode8x8[1][2] = xd->mode_info_context->bmi[8].as_mode.second;
- mode8x8[1][3] = xd->mode_info_context->bmi[10].as_mode.second;
-#endif
- }
-
- rate2 += rate;
- distortion2 += distortion;
-
-        /* TODO: uv rate may be over-estimated here since there is a UV intra
-           mode coded in I8X8_PRED prediction */
- if (tmp_rd < best_yrd) {
- rate2 += uv_intra_rate;
- rate_uv = uv_intra_rate_tokenonly;
- distortion2 += uv_intra_distortion;
- distortion_uv = uv_intra_distortion;
- } else {
- this_rd = INT64_MAX;
- disable_skip = 1;
- }
- }
- break;
- }
- }
- // Split MV. The code is very different from the other inter modes so
- // special case it.
- else if (this_mode == SPLITMV) {
- const int is_comp_pred = mbmi->second_ref_frame != 0;
- int64_t tmp_rd, this_rd_thresh;
- int_mv *second_ref = is_comp_pred ? &second_best_ref_mv : NULL;
-
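-      // Use the NEWMV rd threshold that matches the current reference frame
-      // as the threshold for the split-mv search.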
- this_rd_thresh =
- (mbmi->ref_frame == LAST_FRAME) ?
- cpi->rd_threshes[THR_NEWMV] : cpi->rd_threshes[THR_NEWA];
- this_rd_thresh =
- (mbmi->ref_frame == GOLDEN_FRAME) ?
- cpi->rd_threshes[THR_NEWG] : this_rd_thresh;
-
- tmp_rd = rd_pick_best_mbsegmentation(cpi, x, &best_ref_mv,
- second_ref, best_yrd, mdcounts,
- &rate, &rate_y, &distortion,
- &skippable,
- this_rd_thresh, seg_mvs,
- txfm_cache);
- rate2 += rate;
- distortion2 += distortion;
-
- if (cpi->common.mcomp_filter_type == SWITCHABLE)
- rate2 += SWITCHABLE_INTERP_RATE_FACTOR * x->switchable_interp_costs
- [vp9_get_pred_context(&cpi->common, xd, PRED_SWITCHABLE_INTERP)]
- [vp9_switchable_interp_map[mbmi->interp_filter]];
- // If even the 'Y' rd value of split is higher than best so far
-      // then don't bother looking at UV
- if (tmp_rd < best_yrd) {
- int uv_skippable;
-
- rd_inter4x4_uv(cpi, x, &rate_uv, &distortion_uv, &uv_skippable,
- cpi->common.full_pixel);
- rate2 += rate_uv;
- distortion2 += distortion_uv;
- skippable = skippable && uv_skippable;
- } else {
- this_rd = INT64_MAX;
- disable_skip = 1;
- }
-
- if (is_comp_pred)
- mode_excluded = cpi->common.comp_pred_mode == SINGLE_PREDICTION_ONLY;
- else
- mode_excluded = cpi->common.comp_pred_mode == COMP_PREDICTION_ONLY;
-
- compmode_cost =
- vp9_cost_bit(vp9_get_pred_prob(cm, xd, PRED_COMP), is_comp_pred);
- mbmi->mode = this_mode;
- }
- else {
- this_rd = handle_inter_mode(cpi, x, BLOCK_16X16,
- &saddone, near_sadidx, mdcounts, txfm_cache,
- &rate2, &distortion2, &skippable,
- &compmode_cost, &rate_y, &distortion,
- &rate_uv, &distortion_uv,
- &mode_excluded, &disable_skip, recon_yoffset,
- mode_index, frame_mv, frame_best_ref_mv);
- if (this_rd == INT64_MAX)
- continue;
- }
-
- if (cpi->common.comp_pred_mode == HYBRID_PREDICTION)
- rate2 += compmode_cost;
-
- // Estimate the reference frame signaling cost and add it
- // to the rolling cost variable.
- rate2 += ref_costs[mbmi->ref_frame];
-
- if (!disable_skip) {
-      // Test for the condition where skip block will be activated
-      // because there are no non-zero coefficients and make any
-      // necessary adjustment for rate. Ignore if skip is coded at
-      // segment level as the cost won't have been added in.
- if (cpi->common.mb_no_coeff_skip) {
- int mb_skip_allowed;
-
-        // Is MB-level skip allowed for this MB?
- mb_skip_allowed =
- !vp9_segfeature_active(xd, segment_id, SEG_LVL_EOB) ||
- vp9_get_segdata(xd, segment_id, SEG_LVL_EOB);
-
- if (skippable) {
- mbmi->mb_skip_coeff = 1;
-
- // Back out the coefficient coding costs
- rate2 -= (rate_y + rate_uv);
- // for best_yrd calculation
- rate_uv = 0;
-
- if (mb_skip_allowed) {
- int prob_skip_cost;
-
- // Cost the skip mb case
- vp9_prob skip_prob =
- vp9_get_pred_prob(cm, &x->e_mbd, PRED_MBSKIP);
-
- if (skip_prob) {
- prob_skip_cost = vp9_cost_bit(skip_prob, 1);
- rate2 += prob_skip_cost;
- other_cost += prob_skip_cost;
- }
- }
- }
- // Add in the cost of the no skip flag.
- else {
- mbmi->mb_skip_coeff = 0;
- if (mb_skip_allowed) {
- int prob_skip_cost = vp9_cost_bit(
- vp9_get_pred_prob(cm, &x->e_mbd, PRED_MBSKIP), 0);
- rate2 += prob_skip_cost;
- other_cost += prob_skip_cost;
- }
- }
- }
-
- // Calculate the final RD estimate for this mode.
- this_rd = RDCOST(x->rdmult, x->rddiv, rate2, distortion2);
- }
-
- // Keep record of best intra distortion
- if ((mbmi->ref_frame == INTRA_FRAME) &&
- (this_rd < best_intra_rd)) {
- best_intra_rd = this_rd;
- *returnintra = distortion2;
- }
-
- if (!disable_skip && mbmi->ref_frame == INTRA_FRAME)
- for (i = 0; i < NB_PREDICTION_TYPES; ++i)
- best_pred_rd[i] = MIN(best_pred_rd[i], this_rd);
-
-#if CONFIG_PRED_FILTER
- // Keep track of the best mode irrespective of prediction filter state
- if (this_rd < best_overall_rd) {
- best_overall_rd = this_rd;
- best_filter_state = mbmi->pred_filter_enabled;
- }
-
- // Ignore modes where the prediction filter state doesn't
- // match the state signaled at the frame level
- if ((cm->pred_filter_mode == 2) ||
- (cm->pred_filter_mode ==
- mbmi->pred_filter_enabled)) {
-#endif
-    // Did this mode help, i.e. is it the new best mode?
- if (this_rd < best_rd || x->skip) {
- if (!mode_excluded) {
- // Note index of best mode so far
- best_mode_index = mode_index;
-
- if (this_mode <= B_PRED) {
- if (mbmi->txfm_size != TX_4X4
- && this_mode != B_PRED
- && this_mode != I8X8_PRED)
- mbmi->uv_mode = uv_intra_mode_8x8;
- else
- mbmi->uv_mode = uv_intra_mode;
- /* required for left and above block mv */
- mbmi->mv[0].as_int = 0;
- }
-
- other_cost += ref_costs[mbmi->ref_frame];
-
- /* Calculate the final y RD estimate for this mode */
- best_yrd = RDCOST(x->rdmult, x->rddiv, (rate2 - rate_uv - other_cost),
- (distortion2 - distortion_uv));
-
- *returnrate = rate2;
- *returndistortion = distortion2;
- best_rd = this_rd;
- vpx_memcpy(&best_mbmode, mbmi, sizeof(MB_MODE_INFO));
- vpx_memcpy(&best_partition, x->partition_info, sizeof(PARTITION_INFO));
-
- if ((this_mode == B_PRED)
- || (this_mode == I8X8_PRED)
- || (this_mode == SPLITMV))
- for (i = 0; i < 16; i++) {
- best_bmodes[i] = xd->block[i].bmi;
- }
- }
-
- // Testing this mode gave rise to an improvement in best error score.
- // Lower threshold a bit for next time
- cpi->rd_thresh_mult[mode_index] =
- (cpi->rd_thresh_mult[mode_index] >= (MIN_THRESHMULT + 2)) ?
- cpi->rd_thresh_mult[mode_index] - 2 : MIN_THRESHMULT;
- cpi->rd_threshes[mode_index] =
- (cpi->rd_baseline_thresh[mode_index] >> 7) *
- cpi->rd_thresh_mult[mode_index];
- }
- // If the mode did not help improve the best error case then raise the
- // threshold for testing that mode next time around.
- else {
- cpi->rd_thresh_mult[mode_index] += 4;
-
- if (cpi->rd_thresh_mult[mode_index] > MAX_THRESHMULT)
- cpi->rd_thresh_mult[mode_index] = MAX_THRESHMULT;
-
-      cpi->rd_threshes[mode_index] =
-          (cpi->rd_baseline_thresh[mode_index] >> 7) *
-          cpi->rd_thresh_mult[mode_index];
- }
-
- /* keep record of best compound/single-only prediction */
- if (!disable_skip &&
- mbmi->ref_frame != INTRA_FRAME) {
- int64_t single_rd, hybrid_rd;
- int single_rate, hybrid_rate;
-
- if (cpi->common.comp_pred_mode == HYBRID_PREDICTION) {
- single_rate = rate2 - compmode_cost;
- hybrid_rate = rate2;
- } else {
- single_rate = rate2;
- hybrid_rate = rate2 + compmode_cost;
- }
-
- single_rd = RDCOST(x->rdmult, x->rddiv, single_rate, distortion2);
- hybrid_rd = RDCOST(x->rdmult, x->rddiv, hybrid_rate, distortion2);
-
- if (mbmi->second_ref_frame == INTRA_FRAME &&
- single_rd < best_pred_rd[SINGLE_PREDICTION_ONLY]) {
- best_pred_rd[SINGLE_PREDICTION_ONLY] = single_rd;
- } else if (mbmi->second_ref_frame != INTRA_FRAME &&
- single_rd < best_pred_rd[COMP_PREDICTION_ONLY]) {
- best_pred_rd[COMP_PREDICTION_ONLY] = single_rd;
- }
- if (hybrid_rd < best_pred_rd[HYBRID_PREDICTION])
- best_pred_rd[HYBRID_PREDICTION] = hybrid_rd;
- }
-
- /* keep record of best txfm size */
- if (!mode_excluded && this_rd != INT64_MAX) {
- for (i = 0; i < NB_TXFM_MODES; i++) {
- int64_t adj_rd;
- if (this_mode != B_PRED) {
- adj_rd = this_rd + txfm_cache[i] - txfm_cache[cm->txfm_mode];
- } else {
- adj_rd = this_rd;
- }
- if (adj_rd < best_txfm_rd[i])
- best_txfm_rd[i] = adj_rd;
- }
- }
-#if CONFIG_PRED_FILTER
- }
-#endif
-
- if (x->skip && !mode_excluded)
- break;
- }
-
-#if CONFIG_PRED_FILTER
- // Update counts for prediction filter usage
- if (best_filter_state != 0)
- ++cpi->pred_filter_on_count;
- else
- ++cpi->pred_filter_off_count;
-#endif
- if (cpi->common.mcomp_filter_type == SWITCHABLE &&
- best_mbmode.mode >= NEARESTMV &&
- best_mbmode.mode <= SPLITMV) {
- ++cpi->switchable_interp_count
- [vp9_get_pred_context(&cpi->common, xd, PRED_SWITCHABLE_INTERP)]
- [vp9_switchable_interp_map[best_mbmode.interp_filter]];
- }
-
- // Reduce the activation RD thresholds for the best choice mode
- if ((cpi->rd_baseline_thresh[best_mode_index] > 0) &&
- (cpi->rd_baseline_thresh[best_mode_index] < (INT_MAX >> 2))) {
- int best_adjustment = (cpi->rd_thresh_mult[best_mode_index] >> 2);
-
- cpi->rd_thresh_mult[best_mode_index] =
- (cpi->rd_thresh_mult[best_mode_index] >=
- (MIN_THRESHMULT + best_adjustment)) ?
- cpi->rd_thresh_mult[best_mode_index] - best_adjustment : MIN_THRESHMULT;
- cpi->rd_threshes[best_mode_index] =
- (cpi->rd_baseline_thresh[best_mode_index] >> 7) *
- cpi->rd_thresh_mult[best_mode_index];
- }
-
-  // This code forces Altref,0,0 and skip for the frame that overlays an
-  // altref unless Altref is filtered. However, this is unsafe if
- // segment level coding of ref frame or mode is enabled for this
- // segment.
- if (!vp9_segfeature_active(xd, segment_id, SEG_LVL_REF_FRAME) &&
- !vp9_segfeature_active(xd, segment_id, SEG_LVL_MODE) &&
- cpi->is_src_frame_alt_ref &&
- (cpi->oxcf.arnr_max_frames == 0) &&
- (best_mbmode.mode != ZEROMV || best_mbmode.ref_frame != ALTREF_FRAME)) {
- mbmi->mode = ZEROMV;
- if (cm->txfm_mode != TX_MODE_SELECT)
- mbmi->txfm_size = cm->txfm_mode;
- else
- mbmi->txfm_size = TX_16X16;
- mbmi->ref_frame = ALTREF_FRAME;
- mbmi->mv[0].as_int = 0;
- mbmi->uv_mode = DC_PRED;
- mbmi->mb_skip_coeff =
- (cpi->common.mb_no_coeff_skip) ? 1 : 0;
- mbmi->partitioning = 0;
-
- vpx_memset(best_pred_diff, 0, sizeof(best_pred_diff));
- vpx_memset(best_txfm_diff, 0, sizeof(best_txfm_diff));
- goto end;
- }
-
- // macroblock modes
- vpx_memcpy(mbmi, &best_mbmode, sizeof(MB_MODE_INFO));
- if (best_mbmode.mode == B_PRED) {
- for (i = 0; i < 16; i++) {
- xd->mode_info_context->bmi[i].as_mode = best_bmodes[i].as_mode;
- xd->block[i].bmi.as_mode = xd->mode_info_context->bmi[i].as_mode;
- }
- }
-
- if (best_mbmode.mode == I8X8_PRED)
- set_i8x8_block_modes(x, mode8x8);
-
- if (best_mbmode.mode == SPLITMV) {
- for (i = 0; i < 16; i++)
-      xd->mode_info_context->bmi[i].as_mv.first.as_int =
-          best_bmodes[i].as_mv.first.as_int;
- if (mbmi->second_ref_frame)
- for (i = 0; i < 16; i++)
-        xd->mode_info_context->bmi[i].as_mv.second.as_int =
-            best_bmodes[i].as_mv.second.as_int;
-
- vpx_memcpy(x->partition_info, &best_partition, sizeof(PARTITION_INFO));
-
- mbmi->mv[0].as_int = x->partition_info->bmi[15].mv.as_int;
- mbmi->mv[1].as_int = x->partition_info->bmi[15].second_mv.as_int;
- }
-
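-  // Express the per-prediction-type and per-transform-mode best rd values
-  // as differences from the overall best rd before storing them.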
- for (i = 0; i < NB_PREDICTION_TYPES; ++i) {
- if (best_pred_rd[i] == INT64_MAX)
- best_pred_diff[i] = INT_MIN;
- else
- best_pred_diff[i] = best_rd - best_pred_rd[i];
- }
-
- if (!x->skip) {
- for (i = 0; i < NB_TXFM_MODES; i++) {
- if (best_txfm_rd[i] == INT64_MAX)
- best_txfm_diff[i] = INT_MIN;
- else
- best_txfm_diff[i] = best_rd - best_txfm_rd[i];
- }
- } else {
- vpx_memset(best_txfm_diff, 0, sizeof(best_txfm_diff));
- }
-
-end:
- store_coding_context(x, &x->mb_context[xd->mb_index], best_mode_index, &best_partition,
- &frame_best_ref_mv[xd->mode_info_context->mbmi.ref_frame],
- &frame_best_ref_mv[xd->mode_info_context->mbmi.second_ref_frame],
- best_pred_diff[0], best_pred_diff[1], best_pred_diff[2],
- best_txfm_diff);
-}
-
-#if CONFIG_SUPERBLOCKS
-void vp9_rd_pick_intra_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
- int *returnrate,
- int *returndist) {
- VP9_COMMON *cm = &cpi->common;
- MACROBLOCKD *xd = &x->e_mbd;
- int rate_y, rate_uv;
- int rate_y_tokenonly, rate_uv_tokenonly;
- int error_y, error_uv;
- int dist_y, dist_uv;
- int y_skip, uv_skip;
-
- xd->mode_info_context->mbmi.txfm_size = TX_8X8;
-
- error_uv = rd_pick_intra_sbuv_mode(cpi, x, &rate_uv, &rate_uv_tokenonly,
- &dist_uv, &uv_skip);
- error_y = rd_pick_intra_sby_mode(cpi, x, &rate_y, &rate_y_tokenonly,
- &dist_y, &y_skip);
-
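-  // If both planes are skippable, replace the token costs with the cost of
-  // signaling the skip flag.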
- if (cpi->common.mb_no_coeff_skip && y_skip && uv_skip) {
- *returnrate = rate_y + rate_uv - rate_y_tokenonly - rate_uv_tokenonly +
- vp9_cost_bit(vp9_get_pred_prob(cm, xd, PRED_MBSKIP), 1);
- *returndist = dist_y + (dist_uv >> 2);
- } else {
- *returnrate = rate_y + rate_uv;
- if (cpi->common.mb_no_coeff_skip)
- *returnrate += vp9_cost_bit(vp9_get_pred_prob(cm, xd, PRED_MBSKIP), 0);
- *returndist = dist_y + (dist_uv >> 2);
- }
-}
-#endif
-
-void vp9_rd_pick_intra_mode(VP9_COMP *cpi, MACROBLOCK *x,
- int *returnrate, int *returndist) {
- VP9_COMMON *cm = &cpi->common;
- MACROBLOCKD *xd = &x->e_mbd;
- MB_MODE_INFO * mbmi = &x->e_mbd.mode_info_context->mbmi;
- int64_t error4x4, error16x16;
-#if CONFIG_COMP_INTRA_PRED
- int64_t error4x4d;
- int rate4x4d, dist4x4d;
-#endif
- int rate4x4, rate16x16 = 0, rateuv, rateuv8x8;
- int dist4x4, dist16x16, distuv, distuv8x8;
- int rate;
- int rate4x4_tokenonly = 0;
- int rate16x16_tokenonly = 0;
- int rateuv_tokenonly = 0, rateuv8x8_tokenonly = 0;
- int64_t error8x8;
-  int rate8x8_tokenonly = 0;
- int rate8x8, dist8x8;
- int mode16x16;
- int mode8x8[2][4];
- int dist;
- int modeuv, modeuv8x8, uv_intra_skippable, uv_intra_skippable_8x8;
- int y_intra16x16_skippable;
- int64_t txfm_cache[NB_TXFM_MODES];
- TX_SIZE txfm_size_16x16;
- int i;
-
- mbmi->ref_frame = INTRA_FRAME;
- rd_pick_intra_mbuv_mode(cpi, x, &rateuv, &rateuv_tokenonly, &distuv,
- &uv_intra_skippable);
- modeuv = mbmi->uv_mode;
- if (cpi->common.txfm_mode != ONLY_4X4) {
- rd_pick_intra_mbuv_mode_8x8(cpi, x, &rateuv8x8, &rateuv8x8_tokenonly,
- &distuv8x8, &uv_intra_skippable_8x8);
- modeuv8x8 = mbmi->uv_mode;
- } else {
- uv_intra_skippable_8x8 = uv_intra_skippable;
- rateuv8x8 = rateuv;
- distuv8x8 = distuv;
- rateuv8x8_tokenonly = rateuv_tokenonly;
- modeuv8x8 = modeuv;
- }
-
- // current macroblock under rate-distortion optimization test loop
- error16x16 = rd_pick_intra16x16mby_mode(cpi, x, &rate16x16,
- &rate16x16_tokenonly, &dist16x16,
- &y_intra16x16_skippable, txfm_cache);
- mode16x16 = mbmi->mode;
- txfm_size_16x16 = mbmi->txfm_size;
-
- // FIXME(rbultje) support transform-size selection
- mbmi->txfm_size = (cm->txfm_mode == ONLY_4X4) ? TX_4X4 : TX_8X8;
- error8x8 = rd_pick_intra8x8mby_modes(cpi, x, &rate8x8, &rate8x8_tokenonly,
- &dist8x8, error16x16);
-  mode8x8[0][0] = xd->mode_info_context->bmi[0].as_mode.first;
-  mode8x8[0][1] = xd->mode_info_context->bmi[2].as_mode.first;
-  mode8x8[0][2] = xd->mode_info_context->bmi[8].as_mode.first;
-  mode8x8[0][3] = xd->mode_info_context->bmi[10].as_mode.first;
-#if CONFIG_COMP_INTRA_PRED
- mode8x8[1][0] = xd->mode_info_context->bmi[0].as_mode.second;
- mode8x8[1][1] = xd->mode_info_context->bmi[2].as_mode.second;
- mode8x8[1][2] = xd->mode_info_context->bmi[8].as_mode.second;
- mode8x8[1][3] = xd->mode_info_context->bmi[10].as_mode.second;
-#endif
-
- error4x4 = rd_pick_intra4x4mby_modes(cpi, x,
- &rate4x4, &rate4x4_tokenonly,
- &dist4x4, error16x16,
-#if CONFIG_COMP_INTRA_PRED
- 0,
-#endif
- 0);
-#if CONFIG_COMP_INTRA_PRED
- error4x4d = rd_pick_intra4x4mby_modes(cpi, x,
- &rate4x4d, &rate4x4_tokenonly,
- &dist4x4d, error16x16, 1, 0);
-#endif
-
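-  // Choose between the 16x16, 8x8 and 4x4 intra alternatives. If the whole
-  // macroblock is skippable, code it as a skipped 16x16 block.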
- mbmi->mb_skip_coeff = 0;
- if (cpi->common.mb_no_coeff_skip &&
- y_intra16x16_skippable && uv_intra_skippable_8x8) {
- mbmi->mb_skip_coeff = 1;
- mbmi->mode = mode16x16;
- mbmi->uv_mode = modeuv;
- rate = rateuv8x8 + rate16x16 - rateuv8x8_tokenonly - rate16x16_tokenonly +
- vp9_cost_bit(vp9_get_pred_prob(cm, xd, PRED_MBSKIP), 1);
- dist = dist16x16 + (distuv8x8 >> 2);
- mbmi->txfm_size = txfm_size_16x16;
- memset(x->mb_context[xd->mb_index].txfm_rd_diff, 0,
- sizeof(x->mb_context[xd->mb_index].txfm_rd_diff));
- } else if (error8x8 > error16x16) {
- if (error4x4 < error16x16) {
- rate = rateuv;
-#if CONFIG_COMP_INTRA_PRED
- rate += (error4x4d < error4x4) ? rate4x4d : rate4x4;
- if (error4x4d >= error4x4) // FIXME save original modes etc.
- error4x4 = rd_pick_intra4x4mby_modes(cpi, x, &rate4x4,
- &rate4x4_tokenonly,
- &dist4x4, error16x16, 0,
- cpi->update_context);
-#else
- rate += rate4x4;
-#endif
- mbmi->mode = B_PRED;
- mbmi->txfm_size = TX_4X4;
- dist = dist4x4 + (distuv >> 2);
- memset(x->mb_context[xd->mb_index].txfm_rd_diff, 0,
- sizeof(x->mb_context[xd->mb_index].txfm_rd_diff));
- } else {
- mbmi->txfm_size = txfm_size_16x16;
- mbmi->mode = mode16x16;
- rate = rate16x16 + rateuv8x8;
- dist = dist16x16 + (distuv8x8 >> 2);
- for (i = 0; i < NB_TXFM_MODES; i++) {
- x->mb_context[xd->mb_index].txfm_rd_diff[i] = error16x16 - txfm_cache[i];
- }
- }
- if (cpi->common.mb_no_coeff_skip)
- rate += vp9_cost_bit(vp9_get_pred_prob(cm, xd, PRED_MBSKIP), 0);
- } else {
- if (error4x4 < error8x8) {
- rate = rateuv;
-#if CONFIG_COMP_INTRA_PRED
- rate += (error4x4d < error4x4) ? rate4x4d : rate4x4;
- if (error4x4d >= error4x4) // FIXME save original modes etc.
- error4x4 = rd_pick_intra4x4mby_modes(cpi, x, &rate4x4,
- &rate4x4_tokenonly,
- &dist4x4, error16x16, 0,
- cpi->update_context);
-#else
- rate += rate4x4;
-#endif
- mbmi->mode = B_PRED;
- mbmi->txfm_size = TX_4X4;
- dist = dist4x4 + (distuv >> 2);
- memset(x->mb_context[xd->mb_index].txfm_rd_diff, 0,
- sizeof(x->mb_context[xd->mb_index].txfm_rd_diff));
- } else {
- // FIXME(rbultje) support transform-size selection
- mbmi->mode = I8X8_PRED;
- mbmi->txfm_size = (cm->txfm_mode == ONLY_4X4) ? TX_4X4 : TX_8X8;
- set_i8x8_block_modes(x, mode8x8);
- rate = rate8x8 + rateuv;
- dist = dist8x8 + (distuv >> 2);
- memset(x->mb_context[xd->mb_index].txfm_rd_diff, 0,
- sizeof(x->mb_context[xd->mb_index].txfm_rd_diff));
- }
- if (cpi->common.mb_no_coeff_skip)
- rate += vp9_cost_bit(vp9_get_pred_prob(cm, xd, PRED_MBSKIP), 0);
- }
-
- *returnrate = rate;
- *returndist = dist;
-}
-
-#if CONFIG_SUPERBLOCKS
-int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
- int recon_yoffset, int recon_uvoffset,
- int *returnrate, int *returndistortion) {
- VP9_COMMON *cm = &cpi->common;
- MACROBLOCKD *xd = &x->e_mbd;
- MB_MODE_INFO *mbmi = &xd->mode_info_context->mbmi;
- MB_PREDICTION_MODE this_mode;
- MV_REFERENCE_FRAME ref_frame;
- unsigned char segment_id = xd->mode_info_context->mbmi.segment_id;
- int comp_pred;
- int_mv frame_mv[MB_MODE_COUNT][MAX_REF_FRAMES];
- int_mv frame_best_ref_mv[4];
- int frame_mdcounts[4][4];
- unsigned char *y_buffer[4];
- unsigned char *u_buffer[4];
- unsigned char *v_buffer[4];
- static const int flag_list[4] = { 0, VP9_LAST_FLAG, VP9_GOLD_FLAG,
- VP9_ALT_FLAG };
- int idx_list[4] = { 0, cpi->common.lst_fb_idx, cpi->common.gld_fb_idx,
- cpi->common.alt_fb_idx };
- int mdcounts[4];
- int near_sadidx[8] = { 0, 1, 2, 3, 4, 5, 6, 7 };
- int saddone = 0;
- int64_t best_rd = INT64_MAX;
- int64_t best_comp_rd = INT64_MAX;
- int64_t best_single_rd = INT64_MAX;
- int64_t best_hybrid_rd = INT64_MAX;
- int64_t best_yrd = INT64_MAX;
- MB_MODE_INFO best_mbmode;
- int mode_index, best_mode_index;
- unsigned int ref_costs[MAX_REF_FRAMES];
-
- x->skip = 0;
- xd->mode_info_context->mbmi.segment_id = segment_id;
- estimate_ref_frame_costs(cpi, segment_id, ref_costs);
- vpx_memset(&best_mbmode, 0, sizeof(best_mbmode));
-
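-  // Set up prediction buffers, candidate MVs and mode context counts for
-  // every reference frame that is available for this frame.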
- for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ref_frame++) {
- if (cpi->ref_frame_flags & flag_list[ref_frame]) {
- setup_buffer_inter(cpi, x, idx_list[ref_frame], ref_frame,
- recon_yoffset, recon_uvoffset, frame_mv[NEARESTMV],
- frame_mv[NEARMV], frame_best_ref_mv,
- frame_mdcounts, y_buffer, u_buffer, v_buffer);
- }
- frame_mv[NEWMV][ref_frame].as_int = INVALID_MV;
- frame_mv[ZEROMV][ref_frame].as_int = 0;
- }
-
- for (mode_index = 0; mode_index < MAX_MODES; mode_index++) {
- int mode_excluded;
- int64_t this_rd = INT64_MAX;
- int disable_skip = 0;
- int other_cost = 0;
- int compmode_cost = 0;
- int rate2 = 0, rate_y = 0, rate_uv = 0;
- int distortion2 = 0, distortion_y = 0, distortion_uv = 0;
- int skippable;
- int64_t txfm_cache[NB_TXFM_MODES];
-
- // Test best rd so far against threshold for trying this mode.
- if (best_rd <= cpi->rd_threshes[mode_index]) {
- continue;
- }
-
- this_mode = vp9_mode_order[mode_index].mode;
- ref_frame = vp9_mode_order[mode_index].ref_frame;
- mbmi->ref_frame = ref_frame;
- comp_pred = vp9_mode_order[mode_index].second_ref_frame != INTRA_FRAME;
- mbmi->mode = this_mode;
- mbmi->uv_mode = DC_PRED;
-#if CONFIG_COMP_INTRA_PRED
- mbmi->second_mode = (MB_PREDICTION_MODE)(DC_PRED - 1);
- mbmi->second_uv_mode = (MB_PREDICTION_MODE)(DC_PRED - 1);
-#endif
-
- if (!(cpi->ref_frame_flags & flag_list[ref_frame]))
- continue;
-
- // not yet supported or not superblocky
- // TODO(rbultje): support intra coding
- if (ref_frame == INTRA_FRAME || this_mode == SPLITMV)
- continue;
-
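-    // For compound prediction the second reference is the next frame in
-    // the list (LAST if the first reference is the alt-ref frame).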
- if (comp_pred) {
- int second_ref;
-
- if (ref_frame == ALTREF_FRAME) {
- second_ref = LAST_FRAME;
- } else {
- second_ref = ref_frame + 1;
- }
- if (!(cpi->ref_frame_flags & flag_list[second_ref]))
- continue;
- mbmi->second_ref_frame = second_ref;
-
- xd->second_pre.y_buffer = y_buffer[second_ref];
- xd->second_pre.u_buffer = u_buffer[second_ref];
- xd->second_pre.v_buffer = v_buffer[second_ref];
- mode_excluded = cm->comp_pred_mode == SINGLE_PREDICTION_ONLY;
- } else {
- mbmi->second_ref_frame = INTRA_FRAME;
- mode_excluded = cm->comp_pred_mode == COMP_PREDICTION_ONLY;
- }
-
- xd->pre.y_buffer = y_buffer[ref_frame];
- xd->pre.u_buffer = u_buffer[ref_frame];
- xd->pre.v_buffer = v_buffer[ref_frame];
- vpx_memcpy(mdcounts, frame_mdcounts[ref_frame], sizeof(mdcounts));
-
- // If the segment reference frame feature is enabled....
- // then do nothing if the current ref frame is not allowed..
- if (vp9_segfeature_active(xd, segment_id, SEG_LVL_REF_FRAME) &&
- !vp9_check_segref(xd, segment_id, ref_frame)) {
- continue;
- // If the segment mode feature is enabled....
- // then do nothing if the current mode is not allowed..
- } else if (vp9_segfeature_active(xd, segment_id, SEG_LVL_MODE) &&
- (this_mode != vp9_get_segdata(xd, segment_id, SEG_LVL_MODE))) {
- continue;
- // Disable this drop out case if either the mode or ref frame
- // segment level feature is enabled for this segment. This is to
- // prevent the possibility that we end up unable to pick any mode.
- } else if (!vp9_segfeature_active(xd, segment_id, SEG_LVL_REF_FRAME) &&
- !vp9_segfeature_active(xd, segment_id, SEG_LVL_MODE)) {
- // Only consider ZEROMV/ALTREF_FRAME for alt ref frame,
- // unless ARNR filtering is enabled in which case we want
- // an unfiltered alternative
- if (cpi->is_src_frame_alt_ref && (cpi->oxcf.arnr_max_frames == 0)) {
- if (this_mode != ZEROMV || ref_frame != ALTREF_FRAME) {
- continue;
- }
- }
- }
-
- this_rd = handle_inter_mode(cpi, x, BLOCK_32X32,
- &saddone, near_sadidx, mdcounts, txfm_cache,
- &rate2, &distortion2, &skippable,
- &compmode_cost, &rate_y, &distortion_y,
- &rate_uv, &distortion_uv,
- &mode_excluded, &disable_skip, recon_yoffset,
- mode_index, frame_mv, frame_best_ref_mv);
- if (this_rd == INT64_MAX)
- continue;
-
- if (cpi->common.comp_pred_mode == HYBRID_PREDICTION) {
- rate2 += compmode_cost;
- }
-
- // Estimate the reference frame signaling cost and add it
- // to the rolling cost variable.
- rate2 += ref_costs[xd->mode_info_context->mbmi.ref_frame];
-
- if (!disable_skip) {
-      // Test for the condition where skip block will be activated
-      // because there are no non-zero coefficients and make any
-      // necessary adjustment for rate. Ignore if skip is coded at
-      // segment level as the cost won't have been added in.
- if (cpi->common.mb_no_coeff_skip) {
- int mb_skip_allowed;
-
-        // Is MB-level skip allowed for this MB?
- mb_skip_allowed =
- !vp9_segfeature_active(xd, segment_id, SEG_LVL_EOB) ||
- vp9_get_segdata(xd, segment_id, SEG_LVL_EOB);
-
- if (skippable) {
- // Back out the coefficient coding costs
- rate2 -= (rate_y + rate_uv);
- // for best_yrd calculation
- rate_uv = 0;
-
- if (mb_skip_allowed) {
- int prob_skip_cost;
-
- // Cost the skip mb case
- vp9_prob skip_prob =
- vp9_get_pred_prob(cm, xd, PRED_MBSKIP);
-
- if (skip_prob) {
- prob_skip_cost = vp9_cost_bit(skip_prob, 1);
- rate2 += prob_skip_cost;
- other_cost += prob_skip_cost;
- }
- }
- }
- // Add in the cost of the no skip flag.
- else if (mb_skip_allowed) {
- int prob_skip_cost = vp9_cost_bit(vp9_get_pred_prob(cm, xd,
- PRED_MBSKIP), 0);
- rate2 += prob_skip_cost;
- other_cost += prob_skip_cost;
- }
- }
-
- // Calculate the final RD estimate for this mode.
- this_rd = RDCOST(x->rdmult, x->rddiv, rate2, distortion2);
- }
-
-#if 0
- // Keep record of best intra distortion
- if ((xd->mode_info_context->mbmi.ref_frame == INTRA_FRAME) &&
- (this_rd < best_intra_rd)) {
- best_intra_rd = this_rd;
- *returnintra = distortion2;
- }
-#endif
-
- if (!disable_skip && xd->mode_info_context->mbmi.ref_frame == INTRA_FRAME) {
- if (this_rd < best_comp_rd)
- best_comp_rd = this_rd;
- if (this_rd < best_single_rd)
- best_single_rd = this_rd;
- if (this_rd < best_hybrid_rd)
- best_hybrid_rd = this_rd;
- }
-
-    // Did this mode help, i.e. is it the new best mode?
- if (this_rd < best_rd || x->skip) {
- if (!mode_excluded) {
- // Note index of best mode so far
- best_mode_index = mode_index;
-
-#if 0
- if (this_mode <= B_PRED) {
- xd->mode_info_context->mbmi.uv_mode = uv_intra_mode_8x8;
- /* required for left and above block mv */
- xd->mode_info_context->mbmi.mv.as_int = 0;
- }
-#endif
-
- other_cost += ref_costs[xd->mode_info_context->mbmi.ref_frame];
-
- /* Calculate the final y RD estimate for this mode */
- best_yrd = RDCOST(x->rdmult, x->rddiv, (rate2 - rate_uv - other_cost),
- (distortion2 - distortion_uv));
-
- *returnrate = rate2;
- *returndistortion = distortion2;
- best_rd = this_rd;
- vpx_memcpy(&best_mbmode, mbmi, sizeof(MB_MODE_INFO));
- }
-#if 0
-      // Testing this mode gave rise to an improvement in best error score.
-      // Lower threshold a bit for next time
-      cpi->rd_thresh_mult[mode_index] =
-          (cpi->rd_thresh_mult[mode_index] >= (MIN_THRESHMULT + 2)) ?
-          cpi->rd_thresh_mult[mode_index] - 2 : MIN_THRESHMULT;
-      cpi->rd_threshes[mode_index] =
-          (cpi->rd_baseline_thresh[mode_index] >> 7) * cpi->rd_thresh_mult[mode_index];
-#endif
- }
-    // If the mode did not help improve the best error case then raise the
-    // threshold for testing that mode next time around.
- else {
-#if 0
- cpi->rd_thresh_mult[mode_index] += 4;
-
- if (cpi->rd_thresh_mult[mode_index] > MAX_THRESHMULT)
- cpi->rd_thresh_mult[mode_index] = MAX_THRESHMULT;
-
-      cpi->rd_threshes[mode_index] =
-          (cpi->rd_baseline_thresh[mode_index] >> 7) *
-          cpi->rd_thresh_mult[mode_index];
-#endif
- }
-
- /* keep record of best compound/single-only prediction */
- if (!disable_skip && mbmi->ref_frame != INTRA_FRAME) {
- int single_rd, hybrid_rd, single_rate, hybrid_rate;
-
- if (cpi->common.comp_pred_mode == HYBRID_PREDICTION) {
- single_rate = rate2 - compmode_cost;
- hybrid_rate = rate2;
- } else {
- single_rate = rate2;
- hybrid_rate = rate2 + compmode_cost;
- }
-
- single_rd = RDCOST(x->rdmult, x->rddiv, single_rate, distortion2);
- hybrid_rd = RDCOST(x->rdmult, x->rddiv, hybrid_rate, distortion2);
-
- if (mbmi->second_ref_frame == INTRA_FRAME && single_rd < best_single_rd) {
- best_single_rd = single_rd;
- } else if (mbmi->second_ref_frame != INTRA_FRAME &&
- single_rd < best_comp_rd) {
- best_comp_rd = single_rd;
- }
- if (hybrid_rd < best_hybrid_rd) {
- best_hybrid_rd = hybrid_rd;
- }
- }
-
- if (x->skip && !mode_excluded)
- break;
- }
-
- // TODO(rbultje) integrate with RD thresholding
-#if 0
- // Reduce the activation RD thresholds for the best choice mode
- if ((cpi->rd_baseline_thresh[best_mode_index] > 0) &&
- (cpi->rd_baseline_thresh[best_mode_index] < (INT_MAX >> 2))) {
- int best_adjustment = (cpi->rd_thresh_mult[best_mode_index] >> 2);
-
- cpi->rd_thresh_mult[best_mode_index] =
- (cpi->rd_thresh_mult[best_mode_index] >= (MIN_THRESHMULT + best_adjustment)) ?
- cpi->rd_thresh_mult[best_mode_index] - best_adjustment : MIN_THRESHMULT;
- cpi->rd_threshes[best_mode_index] =
- (cpi->rd_baseline_thresh[best_mode_index] >> 7) * cpi->rd_thresh_mult[best_mode_index];
- }
-#endif
-
-  // This code forces Altref,0,0 and skip for the frame that overlays an
-  // altref unless Altref is filtered. However, this is unsafe if
- // segment level coding of ref frame or mode is enabled for this
- // segment.
- if (!vp9_segfeature_active(xd, segment_id, SEG_LVL_REF_FRAME) &&
- !vp9_segfeature_active(xd, segment_id, SEG_LVL_MODE) &&
- cpi->is_src_frame_alt_ref &&
- (cpi->oxcf.arnr_max_frames == 0) &&
- (best_mbmode.mode != ZEROMV || best_mbmode.ref_frame != ALTREF_FRAME)) {
- mbmi->mode = ZEROMV;
- mbmi->ref_frame = ALTREF_FRAME;
- mbmi->second_ref_frame = 0;
- mbmi->mv[0].as_int = 0;
- mbmi->uv_mode = DC_PRED;
- mbmi->mb_skip_coeff = (cpi->common.mb_no_coeff_skip) ? 1 : 0;
- mbmi->partitioning = 0;
- mbmi->txfm_size = TX_8X8;
-
- if (best_rd != INT64_MAX)
- store_coding_context(x, &x->sb_context[0], best_mode_index, NULL,
- &frame_best_ref_mv[mbmi->ref_frame],
- &frame_best_ref_mv[mbmi->second_ref_frame],
- 0, 0, 0, NULL);
- return best_rd;
- }
-
- // macroblock modes
- vpx_memcpy(mbmi, &best_mbmode, sizeof(MB_MODE_INFO));
- mbmi->txfm_size = TX_8X8;
-
- if (best_rd != INT64_MAX)
- store_coding_context(x, &x->sb_context[0], best_mode_index, NULL,
- &frame_best_ref_mv[mbmi->ref_frame],
- &frame_best_ref_mv[mbmi->second_ref_frame],
- (best_single_rd == INT64_MAX) ? INT_MIN :
- (best_rd - best_single_rd),
- (best_comp_rd == INT64_MAX) ? INT_MIN :
- (best_rd - best_comp_rd),
- (best_hybrid_rd == INT64_MAX) ? INT_MIN :
- (best_rd - best_hybrid_rd),
- NULL);
-
- return best_rd;
-}
-#endif
-
-void vp9_pick_mode_inter_macroblock(VP9_COMP *cpi, MACROBLOCK *x,
- int recon_yoffset,
- int recon_uvoffset,
- int *totalrate, int *totaldist) {
- MACROBLOCKD *const xd = &x->e_mbd;
- MB_MODE_INFO * mbmi = &x->e_mbd.mode_info_context->mbmi;
- int rate, distortion;
- int64_t intra_error = 0;
- unsigned char *segment_id = &mbmi->segment_id;
-
- if (xd->segmentation_enabled)
- x->encode_breakout = cpi->segment_encode_breakout[*segment_id];
- else
- x->encode_breakout = cpi->oxcf.encode_breakout;
-
- // if (cpi->sf.RD)
- // For now this codebase is limited to a single rd encode path
- {
- int zbin_mode_boost_enabled = cpi->zbin_mode_boost_enabled;
-
- vp9_rd_pick_inter_mode(cpi, x, recon_yoffset, recon_uvoffset, &rate,
- &distortion, &intra_error);
-
- /* restore cpi->zbin_mode_boost_enabled */
- cpi->zbin_mode_boost_enabled = zbin_mode_boost_enabled;
- }
- // else
- // The non rd encode path has been deleted from this code base
- // to simplify development
- // vp9_pick_inter_mode
-
-  // Store metrics so they can be added into the totals if this mode is picked
- x->mb_context[xd->mb_index].distortion = distortion;
- x->mb_context[xd->mb_index].intra_error = intra_error;
-
- *totalrate = rate;
- *totaldist = distortion;
-}
--- a/vp8/encoder/rdopt.h
+++ /dev/null
@@ -1,41 +1,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#ifndef __INC_RDOPT_H
-#define __INC_RDOPT_H
-
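-// Rate-distortion cost of a coding choice: the rate R is weighted by the
-// rate multiplier RM (in 1/256th units, rounded) and added to the
-// distortion D weighted by DM.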
-#define RDCOST(RM,DM,R,D) ( ((128+((int64_t)R)*(RM)) >> 8) + ((int64_t)DM)*(D) )
-#define RDCOST_8x8(RM,DM,R,D) ( ((128+((int64_t)R)*(RM)) >> 8) + ((int64_t)DM)*(D) )
-
-extern void vp9_initialize_rd_consts(VP9_COMP *cpi, int Qvalue);
-
-extern void vp9_rd_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
- int recon_yoffset, int recon_uvoffset,
- int *returnrate, int *returndistortion,
- int64_t *returnintra);
-
-extern void vp9_rd_pick_intra_mode(VP9_COMP *cpi, MACROBLOCK *x,
- int *r, int *d);
-
-extern void vp9_rd_pick_intra_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
- int *r, int *d);
-
-extern void vp9_mv_pred(VP9_COMP *cpi, MACROBLOCKD *xd,
- const MODE_INFO *here, int_mv *mvp,
- int refframe, int *ref_frame_sign_bias,
- int *sr, int near_sadidx[]);
-
-extern void vp9_init_me_luts();
-
-extern void vp9_set_mbmode_and_mvs(MACROBLOCK *x,
- MB_PREDICTION_MODE mb, int_mv *mv);
-
-#endif
--- a/vp8/encoder/sad_c.c
+++ /dev/null
@@ -1,480 +1,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include <stdlib.h>
-#include "vp8/common/sadmxn.h"
-#include "vpx_ports/config.h"
-#include "vpx/vpx_integer.h"
-
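-// Plain C sum-of-absolute-differences kernels. Each vp9_sadWxH_c() returns
-// the SAD of a WxH block; the max_sad argument is unused in these C versions.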
-unsigned int vp9_sad32x32_c(const unsigned char *src_ptr,
- int src_stride,
- const unsigned char *ref_ptr,
- int ref_stride,
- int max_sad) {
- return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, 32, 32);
-}
-
-unsigned int vp9_sad16x16_c(const unsigned char *src_ptr,
- int src_stride,
- const unsigned char *ref_ptr,
- int ref_stride,
- int max_sad) {
- return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, 16, 16);
-}
-
-unsigned int vp9_sad8x8_c(const unsigned char *src_ptr,
- int src_stride,
- const unsigned char *ref_ptr,
- int ref_stride,
- int max_sad) {
- return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, 8, 8);
-}
-
-
-unsigned int vp9_sad16x8_c(const unsigned char *src_ptr,
- int src_stride,
- const unsigned char *ref_ptr,
- int ref_stride,
- int max_sad) {
- return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, 16, 8);
-}
-
-unsigned int vp9_sad8x16_c(const unsigned char *src_ptr,
- int src_stride,
- const unsigned char *ref_ptr,
- int ref_stride,
- int max_sad) {
- return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, 8, 16);
-}
-
-
-unsigned int vp9_sad4x4_c(const unsigned char *src_ptr,
- int src_stride,
- const unsigned char *ref_ptr,
- int ref_stride,
- int max_sad) {
- return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, 4, 4);
-}
-
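-// The x3 and x8 variants evaluate the same source block against 3 or 8
-// reference positions, each offset by one pixel horizontally.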
-void vp9_sad32x32x3_c(const unsigned char *src_ptr,
- int src_stride,
- const unsigned char *ref_ptr,
- int ref_stride,
-                      unsigned int *sad_array) {
- sad_array[0] = vp9_sad32x32_c(src_ptr, src_stride,
- ref_ptr, ref_stride, 0x7fffffff);
- sad_array[1] = vp9_sad32x32_c(src_ptr, src_stride,
- ref_ptr + 1, ref_stride, 0x7fffffff);
- sad_array[2] = vp9_sad32x32_c(src_ptr, src_stride,
- ref_ptr + 2, ref_stride, 0x7fffffff);
-}
-
-void vp9_sad32x32x8_c(const unsigned char *src_ptr,
- int src_stride,
- const unsigned char *ref_ptr,
- int ref_stride,
-                      unsigned short *sad_array) {
- sad_array[0] = (unsigned short)vp9_sad32x32_c(src_ptr, src_stride,
- ref_ptr, ref_stride,
- 0x7fffffff);
- sad_array[1] = (unsigned short)vp9_sad32x32_c(src_ptr, src_stride,
- ref_ptr + 1, ref_stride,
- 0x7fffffff);
- sad_array[2] = (unsigned short)vp9_sad32x32_c(src_ptr, src_stride,
- ref_ptr + 2, ref_stride,
- 0x7fffffff);
- sad_array[3] = (unsigned short)vp9_sad32x32_c(src_ptr, src_stride,
- ref_ptr + 3, ref_stride,
- 0x7fffffff);
- sad_array[4] = (unsigned short)vp9_sad32x32_c(src_ptr, src_stride,
- ref_ptr + 4, ref_stride,
- 0x7fffffff);
- sad_array[5] = (unsigned short)vp9_sad32x32_c(src_ptr, src_stride,
- ref_ptr + 5, ref_stride,
- 0x7fffffff);
- sad_array[6] = (unsigned short)vp9_sad32x32_c(src_ptr, src_stride,
- ref_ptr + 6, ref_stride,
- 0x7fffffff);
- sad_array[7] = (unsigned short)vp9_sad32x32_c(src_ptr, src_stride,
- ref_ptr + 7, ref_stride,
- 0x7fffffff);
-}
-
-void vp9_sad16x16x3_c(const unsigned char *src_ptr,
- int src_stride,
- const unsigned char *ref_ptr,
- int ref_stride,
- unsigned int *sad_array) {
- sad_array[0] = vp9_sad16x16_c(src_ptr, src_stride,
- ref_ptr, ref_stride, 0x7fffffff);
- sad_array[1] = vp9_sad16x16_c(src_ptr, src_stride,
- ref_ptr + 1, ref_stride, 0x7fffffff);
- sad_array[2] = vp9_sad16x16_c(src_ptr, src_stride,
- ref_ptr + 2, ref_stride, 0x7fffffff);
-}
-
-void vp9_sad16x16x8_c(const unsigned char *src_ptr,
- int src_stride,
- const unsigned char *ref_ptr,
- int ref_stride,
- unsigned short *sad_array) {
- sad_array[0] = (unsigned short)vp9_sad16x16_c(src_ptr, src_stride,
- ref_ptr, ref_stride,
- 0x7fffffff);
- sad_array[1] = (unsigned short)vp9_sad16x16_c(src_ptr, src_stride,
- ref_ptr + 1, ref_stride,
- 0x7fffffff);
- sad_array[2] = (unsigned short)vp9_sad16x16_c(src_ptr, src_stride,
- ref_ptr + 2, ref_stride,
- 0x7fffffff);
- sad_array[3] = (unsigned short)vp9_sad16x16_c(src_ptr, src_stride,
- ref_ptr + 3, ref_stride,
- 0x7fffffff);
- sad_array[4] = (unsigned short)vp9_sad16x16_c(src_ptr, src_stride,
- ref_ptr + 4, ref_stride,
- 0x7fffffff);
- sad_array[5] = (unsigned short)vp9_sad16x16_c(src_ptr, src_stride,
- ref_ptr + 5, ref_stride,
- 0x7fffffff);
- sad_array[6] = (unsigned short)vp9_sad16x16_c(src_ptr, src_stride,
- ref_ptr + 6, ref_stride,
- 0x7fffffff);
- sad_array[7] = (unsigned short)vp9_sad16x16_c(src_ptr, src_stride,
- ref_ptr + 7, ref_stride,
- 0x7fffffff);
-}
-
-void vp9_sad16x8x3_c(const unsigned char *src_ptr,
- int src_stride,
- const unsigned char *ref_ptr,
- int ref_stride,
- unsigned int *sad_array) {
- sad_array[0] = vp9_sad16x8_c(src_ptr, src_stride,
- ref_ptr, ref_stride, 0x7fffffff);
- sad_array[1] = vp9_sad16x8_c(src_ptr, src_stride,
- ref_ptr + 1, ref_stride, 0x7fffffff);
- sad_array[2] = vp9_sad16x8_c(src_ptr, src_stride,
- ref_ptr + 2, ref_stride, 0x7fffffff);
-}
-
-void vp9_sad16x8x8_c(const unsigned char *src_ptr,
- int src_stride,
- const unsigned char *ref_ptr,
- int ref_stride,
- unsigned short *sad_array) {
- sad_array[0] = (unsigned short)vp9_sad16x8_c(src_ptr, src_stride,
- ref_ptr, ref_stride,
- 0x7fffffff);
- sad_array[1] = (unsigned short)vp9_sad16x8_c(src_ptr, src_stride,
- ref_ptr + 1, ref_stride,
- 0x7fffffff);
- sad_array[2] = (unsigned short)vp9_sad16x8_c(src_ptr, src_stride,
- ref_ptr + 2, ref_stride,
- 0x7fffffff);
- sad_array[3] = (unsigned short)vp9_sad16x8_c(src_ptr, src_stride,
- ref_ptr + 3, ref_stride,
- 0x7fffffff);
- sad_array[4] = (unsigned short)vp9_sad16x8_c(src_ptr, src_stride,
- ref_ptr + 4, ref_stride,
- 0x7fffffff);
- sad_array[5] = (unsigned short)vp9_sad16x8_c(src_ptr, src_stride,
- ref_ptr + 5, ref_stride,
- 0x7fffffff);
- sad_array[6] = (unsigned short)vp9_sad16x8_c(src_ptr, src_stride,
- ref_ptr + 6, ref_stride,
- 0x7fffffff);
- sad_array[7] = (unsigned short)vp9_sad16x8_c(src_ptr, src_stride,
- ref_ptr + 7, ref_stride,
- 0x7fffffff);
-}
-
-void vp9_sad8x8x3_c(const unsigned char *src_ptr,
- int src_stride,
- const unsigned char *ref_ptr,
- int ref_stride,
- unsigned int *sad_array) {
- sad_array[0] = vp9_sad8x8_c(src_ptr, src_stride,
- ref_ptr, ref_stride, 0x7fffffff);
- sad_array[1] = vp9_sad8x8_c(src_ptr, src_stride,
- ref_ptr + 1, ref_stride, 0x7fffffff);
- sad_array[2] = vp9_sad8x8_c(src_ptr, src_stride,
- ref_ptr + 2, ref_stride, 0x7fffffff);
-}
-
-void vp9_sad8x8x8_c(const unsigned char *src_ptr,
- int src_stride,
- const unsigned char *ref_ptr,
- int ref_stride,
- unsigned short *sad_array) {
- sad_array[0] = (unsigned short)vp9_sad8x8_c(src_ptr, src_stride,
- ref_ptr, ref_stride,
- 0x7fffffff);
- sad_array[1] = (unsigned short)vp9_sad8x8_c(src_ptr, src_stride,
- ref_ptr + 1, ref_stride,
- 0x7fffffff);
- sad_array[2] = (unsigned short)vp9_sad8x8_c(src_ptr, src_stride,
- ref_ptr + 2, ref_stride,
- 0x7fffffff);
- sad_array[3] = (unsigned short)vp9_sad8x8_c(src_ptr, src_stride,
- ref_ptr + 3, ref_stride,
- 0x7fffffff);
- sad_array[4] = (unsigned short)vp9_sad8x8_c(src_ptr, src_stride,
- ref_ptr + 4, ref_stride,
- 0x7fffffff);
- sad_array[5] = (unsigned short)vp9_sad8x8_c(src_ptr, src_stride,
- ref_ptr + 5, ref_stride,
- 0x7fffffff);
- sad_array[6] = (unsigned short)vp9_sad8x8_c(src_ptr, src_stride,
- ref_ptr + 6, ref_stride,
- 0x7fffffff);
- sad_array[7] = (unsigned short)vp9_sad8x8_c(src_ptr, src_stride,
- ref_ptr + 7, ref_stride,
- 0x7fffffff);
-}
-
-void vp9_sad8x16x3_c(const unsigned char *src_ptr,
- int src_stride,
- const unsigned char *ref_ptr,
- int ref_stride,
- unsigned int *sad_array) {
- sad_array[0] = vp9_sad8x16_c(src_ptr, src_stride,
- ref_ptr, ref_stride, 0x7fffffff);
- sad_array[1] = vp9_sad8x16_c(src_ptr, src_stride,
- ref_ptr + 1, ref_stride, 0x7fffffff);
- sad_array[2] = vp9_sad8x16_c(src_ptr, src_stride,
- ref_ptr + 2, ref_stride, 0x7fffffff);
-}
-
-void vp9_sad8x16x8_c(const unsigned char *src_ptr,
- int src_stride,
- const unsigned char *ref_ptr,
- int ref_stride,
- unsigned short *sad_array) {
- sad_array[0] = (unsigned short)vp9_sad8x16_c(src_ptr, src_stride,
- ref_ptr, ref_stride,
- 0x7fffffff);
- sad_array[1] = (unsigned short)vp9_sad8x16_c(src_ptr, src_stride,
- ref_ptr + 1, ref_stride,
- 0x7fffffff);
- sad_array[2] = (unsigned short)vp9_sad8x16_c(src_ptr, src_stride,
- ref_ptr + 2, ref_stride,
- 0x7fffffff);
- sad_array[3] = (unsigned short)vp9_sad8x16_c(src_ptr, src_stride,
- ref_ptr + 3, ref_stride,
- 0x7fffffff);
- sad_array[4] = (unsigned short)vp9_sad8x16_c(src_ptr, src_stride,
- ref_ptr + 4, ref_stride,
- 0x7fffffff);
- sad_array[5] = (unsigned short)vp9_sad8x16_c(src_ptr, src_stride,
- ref_ptr + 5, ref_stride,
- 0x7fffffff);
- sad_array[6] = (unsigned short)vp9_sad8x16_c(src_ptr, src_stride,
- ref_ptr + 6, ref_stride,
- 0x7fffffff);
- sad_array[7] = (unsigned short)vp9_sad8x16_c(src_ptr, src_stride,
- ref_ptr + 7, ref_stride,
- 0x7fffffff);
-}
-
-void vp9_sad4x4x3_c(const unsigned char *src_ptr,
- int src_stride,
- const unsigned char *ref_ptr,
- int ref_stride,
- unsigned int *sad_array) {
- sad_array[0] = vp9_sad4x4_c(src_ptr, src_stride,
- ref_ptr, ref_stride, 0x7fffffff);
- sad_array[1] = vp9_sad4x4_c(src_ptr, src_stride,
- ref_ptr + 1, ref_stride, 0x7fffffff);
- sad_array[2] = vp9_sad4x4_c(src_ptr, src_stride,
- ref_ptr + 2, ref_stride, 0x7fffffff);
-}
-
-void vp9_sad4x4x8_c(const unsigned char *src_ptr,
- int src_stride,
- const unsigned char *ref_ptr,
- int ref_stride,
- unsigned short *sad_array) {
- sad_array[0] = (unsigned short)vp9_sad4x4_c(src_ptr, src_stride,
- ref_ptr, ref_stride,
- 0x7fffffff);
- sad_array[1] = (unsigned short)vp9_sad4x4_c(src_ptr, src_stride,
- ref_ptr + 1, ref_stride,
- 0x7fffffff);
- sad_array[2] = (unsigned short)vp9_sad4x4_c(src_ptr, src_stride,
- ref_ptr + 2, ref_stride,
- 0x7fffffff);
- sad_array[3] = (unsigned short)vp9_sad4x4_c(src_ptr, src_stride,
- ref_ptr + 3, ref_stride,
- 0x7fffffff);
- sad_array[4] = (unsigned short)vp9_sad4x4_c(src_ptr, src_stride,
- ref_ptr + 4, ref_stride,
- 0x7fffffff);
- sad_array[5] = (unsigned short)vp9_sad4x4_c(src_ptr, src_stride,
- ref_ptr + 5, ref_stride,
- 0x7fffffff);
- sad_array[6] = (unsigned short)vp9_sad4x4_c(src_ptr, src_stride,
- ref_ptr + 6, ref_stride,
- 0x7fffffff);
- sad_array[7] = (unsigned short)vp9_sad4x4_c(src_ptr, src_stride,
- ref_ptr + 7, ref_stride,
- 0x7fffffff);
-}
-
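-// The x4d variants compute the SAD against four independent reference
-// pointers in one call.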
-void vp9_sad32x32x4d_c(const unsigned char *src_ptr,
- int src_stride,
- unsigned char *ref_ptr[],
- int ref_stride,
-                       unsigned int *sad_array) {
- sad_array[0] = vp9_sad32x32_c(src_ptr, src_stride,
- ref_ptr[0], ref_stride, 0x7fffffff);
- sad_array[1] = vp9_sad32x32_c(src_ptr, src_stride,
- ref_ptr[1], ref_stride, 0x7fffffff);
- sad_array[2] = vp9_sad32x32_c(src_ptr, src_stride,
- ref_ptr[2], ref_stride, 0x7fffffff);
- sad_array[3] = vp9_sad32x32_c(src_ptr, src_stride,
- ref_ptr[3], ref_stride, 0x7fffffff);
-}
-
-void vp9_sad16x16x4d_c(const unsigned char *src_ptr,
- int src_stride,
- unsigned char *ref_ptr[],
- int ref_stride,
- unsigned int *sad_array) {
- sad_array[0] = vp9_sad16x16_c(src_ptr, src_stride,
- ref_ptr[0], ref_stride, 0x7fffffff);
- sad_array[1] = vp9_sad16x16_c(src_ptr, src_stride,
- ref_ptr[1], ref_stride, 0x7fffffff);
- sad_array[2] = vp9_sad16x16_c(src_ptr, src_stride,
- ref_ptr[2], ref_stride, 0x7fffffff);
- sad_array[3] = vp9_sad16x16_c(src_ptr, src_stride,
- ref_ptr[3], ref_stride, 0x7fffffff);
-}
-
-void vp9_sad16x8x4d_c(const unsigned char *src_ptr,
- int src_stride,
- unsigned char *ref_ptr[],
- int ref_stride,
- unsigned int *sad_array) {
- sad_array[0] = vp9_sad16x8_c(src_ptr, src_stride,
- ref_ptr[0], ref_stride, 0x7fffffff);
- sad_array[1] = vp9_sad16x8_c(src_ptr, src_stride,
- ref_ptr[1], ref_stride, 0x7fffffff);
- sad_array[2] = vp9_sad16x8_c(src_ptr, src_stride,
- ref_ptr[2], ref_stride, 0x7fffffff);
- sad_array[3] = vp9_sad16x8_c(src_ptr, src_stride,
- ref_ptr[3], ref_stride, 0x7fffffff);
-}
-
-void vp9_sad8x8x4d_c(const unsigned char *src_ptr,
- int src_stride,
- unsigned char *ref_ptr[],
- int ref_stride,
- unsigned int *sad_array) {
- sad_array[0] = vp9_sad8x8_c(src_ptr, src_stride,
- ref_ptr[0], ref_stride, 0x7fffffff);
- sad_array[1] = vp9_sad8x8_c(src_ptr, src_stride,
- ref_ptr[1], ref_stride, 0x7fffffff);
- sad_array[2] = vp9_sad8x8_c(src_ptr, src_stride,
- ref_ptr[2], ref_stride, 0x7fffffff);
- sad_array[3] = vp9_sad8x8_c(src_ptr, src_stride,
- ref_ptr[3], ref_stride, 0x7fffffff);
-}
-
-void vp9_sad8x16x4d_c(const unsigned char *src_ptr,
- int src_stride,
- unsigned char *ref_ptr[],
- int ref_stride,
- unsigned int *sad_array) {
- sad_array[0] = vp9_sad8x16_c(src_ptr, src_stride,
- ref_ptr[0], ref_stride, 0x7fffffff);
- sad_array[1] = vp9_sad8x16_c(src_ptr, src_stride,
- ref_ptr[1], ref_stride, 0x7fffffff);
- sad_array[2] = vp9_sad8x16_c(src_ptr, src_stride,
- ref_ptr[2], ref_stride, 0x7fffffff);
- sad_array[3] = vp9_sad8x16_c(src_ptr, src_stride,
- ref_ptr[3], ref_stride, 0x7fffffff);
-}
-
-void vp9_sad4x4x4d_c(const unsigned char *src_ptr,
- int src_stride,
- unsigned char *ref_ptr[],
- int ref_stride,
- unsigned int *sad_array) {
- sad_array[0] = vp9_sad4x4_c(src_ptr, src_stride,
- ref_ptr[0], ref_stride, 0x7fffffff);
- sad_array[1] = vp9_sad4x4_c(src_ptr, src_stride,
- ref_ptr[1], ref_stride, 0x7fffffff);
- sad_array[2] = vp9_sad4x4_c(src_ptr, src_stride,
- ref_ptr[2], ref_stride, 0x7fffffff);
- sad_array[3] = vp9_sad4x4_c(src_ptr, src_stride,
- ref_ptr[3], ref_stride, 0x7fffffff);
-}
-
-/* Copy a 32-pixel-wide region (two macroblocks side by side) of the
-   given height to a buffer */
-void vp9_copy32xn_c(unsigned char *src_ptr,
- int src_stride,
- unsigned char *dst_ptr,
- int dst_stride,
- int height) {
- int r;
-
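-  // Copy one 32-byte row per iteration: byte by byte when fast unaligned
-  // accesses are not available, otherwise as eight 32-bit words.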
- for (r = 0; r < height; r++) {
-#if !(CONFIG_FAST_UNALIGNED)
- dst_ptr[0] = src_ptr[0];
- dst_ptr[1] = src_ptr[1];
- dst_ptr[2] = src_ptr[2];
- dst_ptr[3] = src_ptr[3];
- dst_ptr[4] = src_ptr[4];
- dst_ptr[5] = src_ptr[5];
- dst_ptr[6] = src_ptr[6];
- dst_ptr[7] = src_ptr[7];
- dst_ptr[8] = src_ptr[8];
- dst_ptr[9] = src_ptr[9];
- dst_ptr[10] = src_ptr[10];
- dst_ptr[11] = src_ptr[11];
- dst_ptr[12] = src_ptr[12];
- dst_ptr[13] = src_ptr[13];
- dst_ptr[14] = src_ptr[14];
- dst_ptr[15] = src_ptr[15];
- dst_ptr[16] = src_ptr[16];
- dst_ptr[17] = src_ptr[17];
- dst_ptr[18] = src_ptr[18];
- dst_ptr[19] = src_ptr[19];
- dst_ptr[20] = src_ptr[20];
- dst_ptr[21] = src_ptr[21];
- dst_ptr[22] = src_ptr[22];
- dst_ptr[23] = src_ptr[23];
- dst_ptr[24] = src_ptr[24];
- dst_ptr[25] = src_ptr[25];
- dst_ptr[26] = src_ptr[26];
- dst_ptr[27] = src_ptr[27];
- dst_ptr[28] = src_ptr[28];
- dst_ptr[29] = src_ptr[29];
- dst_ptr[30] = src_ptr[30];
- dst_ptr[31] = src_ptr[31];
-#else
- ((uint32_t *)dst_ptr)[0] = ((uint32_t *)src_ptr)[0];
- ((uint32_t *)dst_ptr)[1] = ((uint32_t *)src_ptr)[1];
- ((uint32_t *)dst_ptr)[2] = ((uint32_t *)src_ptr)[2];
- ((uint32_t *)dst_ptr)[3] = ((uint32_t *)src_ptr)[3];
- ((uint32_t *)dst_ptr)[4] = ((uint32_t *)src_ptr)[4];
- ((uint32_t *)dst_ptr)[5] = ((uint32_t *)src_ptr)[5];
- ((uint32_t *)dst_ptr)[6] = ((uint32_t *)src_ptr)[6];
- ((uint32_t *)dst_ptr)[7] = ((uint32_t *)src_ptr)[7];
-#endif
- src_ptr += src_stride;
- dst_ptr += dst_stride;
-
- }
-}
--- a/vp8/encoder/satd_c.c
+++ /dev/null
@@ -1,47 +1,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include <stdlib.h>
-#include "vpx_ports/mem.h"
-#include "./vpx_rtcd.h"
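-
-// 16x16 SATD: transform the 16x16 residual in 4x4 blocks with a
-// Walsh-Hadamard transform and sum the absolute transform coefficients.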
-unsigned int vp9_satd16x16_c(const unsigned char *src_ptr,
- int src_stride,
- const unsigned char *ref_ptr,
- int ref_stride,
- unsigned int *psatd) {
- int r, c, i;
- unsigned int satd = 0;
- DECLARE_ALIGNED(16, short, diff_in[256]);
- DECLARE_ALIGNED(16, short, diff_out[16]);
- short *in;
-
- for (r = 0; r < 16; r++) {
- for (c = 0; c < 16; c++) {
- diff_in[r * 16 + c] = src_ptr[c] - ref_ptr[c];
- }
- src_ptr += src_stride;
- ref_ptr += ref_stride;
- }
-
- in = diff_in;
- for (r = 0; r < 16; r += 4) {
- for (c = 0; c < 16; c += 4) {
- vp9_short_walsh4x4_c(in + c, diff_out, 32);
- for (i = 0; i < 16; i++)
- satd += abs(diff_out[i]);
- }
- in += 64;
- }
-
- if (psatd)
- *psatd = satd;
-
- return satd;
-}
--- a/vp8/encoder/segmentation.c
+++ /dev/null
@@ -1,327 +1,0 @@
-/*
- * Copyright (c) 2012 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include "limits.h"
-#include "vpx_mem/vpx_mem.h"
-#include "segmentation.h"
-#include "vp8/common/pred_common.h"
-
-void vp9_update_gf_useage_maps(VP9_COMP *cpi, VP9_COMMON *cm, MACROBLOCK *x) {
- int mb_row, mb_col;
-
- MODE_INFO *this_mb_mode_info = cm->mi;
-
- x->gf_active_ptr = (signed char *)cpi->gf_active_flags;
-
- if ((cm->frame_type == KEY_FRAME) || (cm->refresh_golden_frame)) {
-    // Reset GF usage monitors
- vpx_memset(cpi->gf_active_flags, 1, (cm->mb_rows * cm->mb_cols));
- cpi->gf_active_count = cm->mb_rows * cm->mb_cols;
- } else {
- // for each macroblock row in image
- for (mb_row = 0; mb_row < cm->mb_rows; mb_row++) {
- // for each macroblock col in image
- for (mb_col = 0; mb_col < cm->mb_cols; mb_col++) {
-
- // If using golden then set GF active flag if not already set.
- // If using last frame 0,0 mode then leave flag as it is
- // else if using non 0,0 motion or intra modes then clear
- // flag if it is currently set
- if ((this_mb_mode_info->mbmi.ref_frame == GOLDEN_FRAME) ||
- (this_mb_mode_info->mbmi.ref_frame == ALTREF_FRAME)) {
- if (*(x->gf_active_ptr) == 0) {
- *(x->gf_active_ptr) = 1;
- cpi->gf_active_count++;
- }
- } else if ((this_mb_mode_info->mbmi.mode != ZEROMV) &&
- *(x->gf_active_ptr)) {
- *(x->gf_active_ptr) = 0;
- cpi->gf_active_count--;
- }
-
- x->gf_active_ptr++; // Step onto next entry
- this_mb_mode_info++; // skip to next mb
-
- }
-
- // this is to account for the border
- this_mb_mode_info++;
- }
- }
-}
-
-void vp9_enable_segmentation(VP9_PTR ptr) {
- VP9_COMP *cpi = (VP9_COMP *)(ptr);
-
- // Set the appropriate feature bit
- cpi->mb.e_mbd.segmentation_enabled = 1;
- cpi->mb.e_mbd.update_mb_segmentation_map = 1;
- cpi->mb.e_mbd.update_mb_segmentation_data = 1;
-}
-
-void vp9_disable_segmentation(VP9_PTR ptr) {
- VP9_COMP *cpi = (VP9_COMP *)(ptr);
-
- // Clear the appropriate feature bit
- cpi->mb.e_mbd.segmentation_enabled = 0;
-}
-
-void vp9_set_segmentation_map(VP9_PTR ptr,
- unsigned char *segmentation_map) {
- VP9_COMP *cpi = (VP9_COMP *)(ptr);
-
- // Copy in the new segmentation map
- vpx_memcpy(cpi->segmentation_map, segmentation_map,
- (cpi->common.mb_rows * cpi->common.mb_cols));
-
- // Signal that the map should be updated.
- cpi->mb.e_mbd.update_mb_segmentation_map = 1;
- cpi->mb.e_mbd.update_mb_segmentation_data = 1;
-}
-
-void vp9_set_segment_data(VP9_PTR ptr,
- signed char *feature_data,
- unsigned char abs_delta) {
- VP9_COMP *cpi = (VP9_COMP *)(ptr);
-
- cpi->mb.e_mbd.mb_segment_abs_delta = abs_delta;
-
- vpx_memcpy(cpi->mb.e_mbd.segment_feature_data, feature_data,
- sizeof(cpi->mb.e_mbd.segment_feature_data));
-
- // TBD ?? Set the feature mask
- // vpx_memcpy(cpi->mb.e_mbd.segment_feature_mask, 0,
- // sizeof(cpi->mb.e_mbd.segment_feature_mask));
-}
-
-// Based on a set of segment counts, calculate a probability tree
-static void calc_segtree_probs(MACROBLOCKD *xd,
- int *segcounts,
- vp9_prob *segment_tree_probs) {
- int count1, count2;
- int tot_count;
- int i;
-
-  // Blank the structure to start with
- vpx_memset(segment_tree_probs, 0,
- MB_FEATURE_TREE_PROBS * sizeof(*segment_tree_probs));
-
- // Total count for all segments
- count1 = segcounts[0] + segcounts[1];
- count2 = segcounts[2] + segcounts[3];
- tot_count = count1 + count2;
-
- // Work out probabilities of each segment
- if (tot_count)
- segment_tree_probs[0] = (count1 * 255) / tot_count;
- if (count1 > 0)
- segment_tree_probs[1] = (segcounts[0] * 255) / count1;
- if (count2 > 0)
- segment_tree_probs[2] = (segcounts[2] * 255) / count2;
-
- // Clamp probabilities to minimum allowed value
- for (i = 0; i < MB_FEATURE_TREE_PROBS; i++) {
- if (segment_tree_probs[i] == 0)
- segment_tree_probs[i] = 1;
- }
-}
-
-// Based on a set of segment counts and probabilities, calculate a cost estimate
-static int cost_segmap(MACROBLOCKD *xd,
- int *segcounts,
- vp9_prob *probs) {
- int cost;
- int count1, count2;
-
- // Cost the top node of the tree
- count1 = segcounts[0] + segcounts[1];
- count2 = segcounts[2] + segcounts[3];
- cost = count1 * vp9_cost_zero(probs[0]) +
- count2 * vp9_cost_one(probs[0]);
-
- // Now add the cost of each individual segment branch
- if (count1 > 0)
- cost += segcounts[0] * vp9_cost_zero(probs[1]) +
- segcounts[1] * vp9_cost_one(probs[1]);
-
- if (count2 > 0)
- cost += segcounts[2] * vp9_cost_zero(probs[2]) +
- segcounts[3] * vp9_cost_one(probs[2]);
-
- return cost;
-
-}
-
-void vp9_choose_segmap_coding_method(VP9_COMP *cpi) {
- VP9_COMMON *const cm = &cpi->common;
- MACROBLOCKD *const xd = &cpi->mb.e_mbd;
-
- const int mis = cm->mode_info_stride;
- int i;
- int tot_count;
- int no_pred_cost;
- int t_pred_cost = INT_MAX;
- int pred_context;
-
- int mb_row, mb_col;
- int segmap_index = 0;
- unsigned char segment_id;
-
- int temporal_predictor_count[PREDICTION_PROBS][2];
- int no_pred_segcounts[MAX_MB_SEGMENTS];
- int t_unpred_seg_counts[MAX_MB_SEGMENTS];
-
- vp9_prob no_pred_tree[MB_FEATURE_TREE_PROBS];
- vp9_prob t_pred_tree[MB_FEATURE_TREE_PROBS];
- vp9_prob t_nopred_prob[PREDICTION_PROBS];
-
- // Set default state for the segment tree probabilities and the
- // temporal coding probabilities
- vpx_memset(xd->mb_segment_tree_probs, 255,
- sizeof(xd->mb_segment_tree_probs));
- vpx_memset(cm->segment_pred_probs, 255,
- sizeof(cm->segment_pred_probs));
-
- vpx_memset(no_pred_segcounts, 0, sizeof(no_pred_segcounts));
- vpx_memset(t_unpred_seg_counts, 0, sizeof(t_unpred_seg_counts));
- vpx_memset(temporal_predictor_count, 0, sizeof(temporal_predictor_count));
-
- // First of all generate stats regarding how well the last segment map
- // predicts this one
-
- // Initialize macroblock decoder mode info context for the first mb
- // in the frame
- xd->mode_info_context = cm->mi;
-
- for (mb_row = 0; mb_row < cm->mb_rows; mb_row += 2) {
- for (mb_col = 0; mb_col < cm->mb_cols; mb_col += 2) {
- for (i = 0; i < 4; i++) {
- static const int dx[4] = { +1, -1, +1, +1 };
- static const int dy[4] = { 0, +1, 0, -1 };
- int x_idx = i & 1, y_idx = i >> 1;
-
- if (mb_col + x_idx >= cm->mb_cols ||
- mb_row + y_idx >= cm->mb_rows) {
- goto end;
- }
-
- xd->mb_to_top_edge = -((mb_row * 16) << 3);
- xd->mb_to_bottom_edge = ((cm->mb_rows - 1 - mb_row) * 16) << 3;
- xd->mb_to_left_edge = -((mb_col * 16) << 3);
- xd->mb_to_right_edge = ((cm->mb_cols - 1 - mb_row) * 16) << 3;
-
- segmap_index = (mb_row + y_idx) * cm->mb_cols + mb_col + x_idx;
- segment_id = xd->mode_info_context->mbmi.segment_id;
-#if CONFIG_SUPERBLOCKS
- if (xd->mode_info_context->mbmi.encoded_as_sb) {
- if (mb_col + 1 < cm->mb_cols)
- segment_id = segment_id &&
- xd->mode_info_context[1].mbmi.segment_id;
- if (mb_row + 1 < cm->mb_rows) {
- segment_id = segment_id &&
- xd->mode_info_context[mis].mbmi.segment_id;
- if (mb_col + 1 < cm->mb_cols)
- segment_id = segment_id &&
- xd->mode_info_context[mis + 1].mbmi.segment_id;
- }
- }
-#endif
-
- // Count the number of hits on each segment with no prediction
- no_pred_segcounts[segment_id]++;
-
- // Temporal prediction not allowed on key frames
- if (cm->frame_type != KEY_FRAME) {
- // Test to see if the segment id matches the predicted value.
- int seg_predicted =
- (segment_id == vp9_get_pred_mb_segid(cm, xd, segmap_index));
-
- // Get the segment id prediction context
- pred_context =
- vp9_get_pred_context(cm, xd, PRED_SEG_ID);
-
- // Store the prediction status for this mb and update counts
- // as appropriate
- vp9_set_pred_flag(xd, PRED_SEG_ID, seg_predicted);
- temporal_predictor_count[pred_context][seg_predicted]++;
-
- if (!seg_predicted)
- // Update the "unpredicted" segment count
- t_unpred_seg_counts[segment_id]++;
- }
-
-#if CONFIG_SUPERBLOCKS
- if (xd->mode_info_context->mbmi.encoded_as_sb) {
- assert(!i);
- xd->mode_info_context += 2;
- break;
- }
-#endif
- end:
- xd->mode_info_context += dx[i] + dy[i] * cm->mode_info_stride;
- }
- }
-
- // this is to account for the border in mode_info_context
- xd->mode_info_context -= mb_col;
- xd->mode_info_context += cm->mode_info_stride * 2;
- }
-
- // Work out probability tree for coding segments without prediction
- // and the cost.
- calc_segtree_probs(xd, no_pred_segcounts, no_pred_tree);
- no_pred_cost = cost_segmap(xd, no_pred_segcounts, no_pred_tree);
-
- // Key frames cannot use temporal prediction
- if (cm->frame_type != KEY_FRAME) {
- // Work out probability tree for coding those segments not
- // predicted using the temporal method and the cost.
- calc_segtree_probs(xd, t_unpred_seg_counts, t_pred_tree);
- t_pred_cost = cost_segmap(xd, t_unpred_seg_counts, t_pred_tree);
-
- // Add in the cost of the signalling for each prediction context
- for (i = 0; i < PREDICTION_PROBS; i++) {
- tot_count = temporal_predictor_count[i][0] +
- temporal_predictor_count[i][1];
-
- // Work out the context probabilities for the segment
- // prediction flag
- if (tot_count) {
- t_nopred_prob[i] = (temporal_predictor_count[i][0] * 255) /
- tot_count;
-
- // Clamp to minimum allowed value
- if (t_nopred_prob[i] < 1)
- t_nopred_prob[i] = 1;
- } else
- t_nopred_prob[i] = 1;
-
- // Add in the predictor signaling cost
- t_pred_cost += (temporal_predictor_count[i][0] *
- vp9_cost_zero(t_nopred_prob[i])) +
- (temporal_predictor_count[i][1] *
- vp9_cost_one(t_nopred_prob[i]));
- }
- }
-
- // Now choose which coding method to use.
- if (t_pred_cost < no_pred_cost) {
- cm->temporal_update = 1;
- vpx_memcpy(xd->mb_segment_tree_probs,
- t_pred_tree, sizeof(t_pred_tree));
- vpx_memcpy(&cm->segment_pred_probs,
- t_nopred_prob, sizeof(t_nopred_prob));
- } else {
- cm->temporal_update = 0;
- vpx_memcpy(xd->mb_segment_tree_probs,
- no_pred_tree, sizeof(no_pred_tree));
- }
-}
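For context, the removed calc_segtree_probs() turns four per-segment hit counts into the three node probabilities of a binary segment tree, clamping each to a minimum of 1 so the bool coder never sees a zero probability. A standalone sketch of that arithmetic with a hypothetical count vector (plain ints in place of vp9_prob):

#include <stdio.h>

static void segtree_probs_sketch(const int segcounts[4], int probs[3]) {
  const int count1 = segcounts[0] + segcounts[1];
  const int count2 = segcounts[2] + segcounts[3];
  const int tot = count1 + count2;
  int i;
  probs[0] = probs[1] = probs[2] = 0;
  if (tot)        probs[0] = (count1 * 255) / tot;          /* segments 0/1 vs 2/3 */
  if (count1 > 0) probs[1] = (segcounts[0] * 255) / count1; /* segment 0 vs 1 */
  if (count2 > 0) probs[2] = (segcounts[2] * 255) / count2; /* segment 2 vs 3 */
  for (i = 0; i < 3; i++)
    if (probs[i] == 0) probs[i] = 1;  /* clamp to the minimum legal value */
}

int main(void) {
  const int counts[4] = { 10, 5, 3, 2 };  /* hypothetical per-segment hit counts */
  int probs[3];
  segtree_probs_sketch(counts, probs);
  printf("%d %d %d\n", probs[0], probs[1], probs[2]);  /* 191 170 153 */
  return 0;
}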
--- a/vp8/encoder/segmentation.h
+++ /dev/null
@@ -1,46 +1,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include "string.h"
-#include "vp8/common/blockd.h"
-#include "onyx_int.h"
-
-#ifndef __INC_SEGMENTATION_H__
-#define __INC_SEGMENTATION_H__ 1
-
-extern void vp9_update_gf_useage_maps(VP9_COMP *cpi, VP9_COMMON *cm,
- MACROBLOCK *x);
-
-extern void vp9_enable_segmentation(VP9_PTR ptr);
-extern void vp9_disable_segmentation(VP9_PTR ptr);
-
-// Valid values for a segment are 0 to 3
-// Segmentation map is arranged as [Rows][Columns]
-extern void vp9_set_segmentation_map(VP9_PTR ptr,
- unsigned char *segmentation_map);
-
-// The values given for each segment can be either deltas (from the default
-// value chosen for the frame) or absolute values.
-//
-// Valid range for abs values is (0-127 for MB_LVL_ALT_Q), (0-63 for
-// SEGMENT_ALT_LF)
-// Valid range for delta values are (+/-127 for MB_LVL_ALT_Q), (+/-63 for
-// SEGMENT_ALT_LF)
-//
-// abs_delta = SEGMENT_DELTADATA (deltas) abs_delta = SEGMENT_ABSDATA (use
-// the absolute values given).
-//
-extern void vp9_set_segment_data(VP9_PTR ptr, signed char *feature_data,
- unsigned char abs_delta);
-
-extern void vp9_choose_segmap_coding_method(VP9_COMP *cpi);
-
-#endif /* __INC_SEGMENTATION_H__ */
--- a/vp8/encoder/ssim.c
+++ /dev/null
@@ -1,147 +1,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include "onyx_int.h"
-
-void vp9_ssim_parms_16x16_c(unsigned char *s, int sp, unsigned char *r,
- int rp, unsigned long *sum_s, unsigned long *sum_r,
- unsigned long *sum_sq_s, unsigned long *sum_sq_r,
- unsigned long *sum_sxr) {
- int i, j;
- for (i = 0; i < 16; i++, s += sp, r += rp) {
- for (j = 0; j < 16; j++) {
- *sum_s += s[j];
- *sum_r += r[j];
- *sum_sq_s += s[j] * s[j];
- *sum_sq_r += r[j] * r[j];
- *sum_sxr += s[j] * r[j];
- }
- }
-}
-void vp9_ssim_parms_8x8_c(unsigned char *s, int sp, unsigned char *r, int rp,
- unsigned long *sum_s, unsigned long *sum_r,
- unsigned long *sum_sq_s, unsigned long *sum_sq_r,
- unsigned long *sum_sxr) {
- int i, j;
- for (i = 0; i < 8; i++, s += sp, r += rp) {
- for (j = 0; j < 8; j++) {
- *sum_s += s[j];
- *sum_r += r[j];
- *sum_sq_s += s[j] * s[j];
- *sum_sq_r += r[j] * r[j];
- *sum_sxr += s[j] * r[j];
- }
- }
-}
-
-const static int64_t cc1 = 26634; // (64^2*(.01*255)^2)
-const static int64_t cc2 = 239708; // (64^2*(.03*255)^2)
-
-static double similarity(unsigned long sum_s, unsigned long sum_r,
- unsigned long sum_sq_s, unsigned long sum_sq_r,
- unsigned long sum_sxr, int count) {
- int64_t ssim_n, ssim_d;
- int64_t c1, c2;
-
- // scale the constants by number of pixels
- c1 = (cc1 * count * count) >> 12;
- c2 = (cc2 * count * count) >> 12;
-
- ssim_n = (2 * sum_s * sum_r + c1) * ((int64_t) 2 * count * sum_sxr -
- (int64_t) 2 * sum_s * sum_r + c2);
-
- ssim_d = (sum_s * sum_s + sum_r * sum_r + c1) *
- ((int64_t)count * sum_sq_s - (int64_t)sum_s * sum_s +
- (int64_t)count * sum_sq_r - (int64_t) sum_r * sum_r + c2);
-
- return ssim_n * 1.0 / ssim_d;
-}
-
-static double ssim_16x16(unsigned char *s, int sp, unsigned char *r, int rp) {
- unsigned long sum_s = 0, sum_r = 0, sum_sq_s = 0, sum_sq_r = 0, sum_sxr = 0;
- vp9_ssim_parms_16x16(s, sp, r, rp, &sum_s, &sum_r, &sum_sq_s, &sum_sq_r,
- &sum_sxr);
- return similarity(sum_s, sum_r, sum_sq_s, sum_sq_r, sum_sxr, 256);
-}
-static double ssim_8x8(unsigned char *s, int sp, unsigned char *r, int rp) {
- unsigned long sum_s = 0, sum_r = 0, sum_sq_s = 0, sum_sq_r = 0, sum_sxr = 0;
- vp9_ssim_parms_8x8(s, sp, r, rp, &sum_s, &sum_r, &sum_sq_s, &sum_sq_r,
- &sum_sxr);
- return similarity(sum_s, sum_r, sum_sq_s, sum_sq_r, sum_sxr, 64);
-}
-
-// We are using an 8x8 moving window whose starting locations lie on the 4x4
-// pixel grid. Such an arrangement allows the windows to overlap block
-// boundaries and thus penalize blocking artifacts.
-double vp9_ssim2(unsigned char *img1, unsigned char *img2, int stride_img1,
- int stride_img2, int width, int height) {
- int i, j;
- int samples = 0;
- double ssim_total = 0;
-
-  // sample points start at each 4x4 location
- for (i = 0; i < height - 8; i += 4, img1 += stride_img1 * 4, img2 += stride_img2 * 4) {
- for (j = 0; j < width - 8; j += 4) {
- double v = ssim_8x8(img1 + j, stride_img1, img2 + j, stride_img2);
- ssim_total += v;
- samples++;
- }
- }
- ssim_total /= samples;
- return ssim_total;
-}
-double vp9_calc_ssim(YV12_BUFFER_CONFIG *source, YV12_BUFFER_CONFIG *dest,
- int lumamask, double *weight) {
- double a, b, c;
- double ssimv;
-
- a = vp9_ssim2(source->y_buffer, dest->y_buffer,
- source->y_stride, dest->y_stride, source->y_width,
- source->y_height);
-
- b = vp9_ssim2(source->u_buffer, dest->u_buffer,
- source->uv_stride, dest->uv_stride, source->uv_width,
- source->uv_height);
-
- c = vp9_ssim2(source->v_buffer, dest->v_buffer,
- source->uv_stride, dest->uv_stride, source->uv_width,
- source->uv_height);
-
- ssimv = a * .8 + .1 * (b + c);
-
- *weight = 1;
-
- return ssimv;
-}
-
-double vp9_calc_ssimg(YV12_BUFFER_CONFIG *source, YV12_BUFFER_CONFIG *dest,
- double *ssim_y, double *ssim_u, double *ssim_v) {
- double ssim_all = 0;
- double a, b, c;
-
- a = vp9_ssim2(source->y_buffer, dest->y_buffer,
- source->y_stride, dest->y_stride, source->y_width,
- source->y_height);
-
- b = vp9_ssim2(source->u_buffer, dest->u_buffer,
- source->uv_stride, dest->uv_stride, source->uv_width,
- source->uv_height);
-
- c = vp9_ssim2(source->v_buffer, dest->v_buffer,
- source->uv_stride, dest->uv_stride, source->uv_width,
- source->uv_height);
- *ssim_y = a;
- *ssim_u = b;
- *ssim_v = c;
- ssim_all = (a * 4 + b + c) / 6;
-
- return ssim_all;
-}
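For context, the removed similarity() evaluates the standard SSIM formula in scaled 64-bit integer arithmetic; the five accumulated sums are just the window means, variances and covariance in disguise. A floating-point sketch of the same relation (illustrative only, it does not reproduce the fixed-point scaling used above):

#include <stdio.h>

/* SSIM for one window of `count` pixels, from the same five sums the removed
 * parms functions accumulate. */
static double ssim_window_sketch(unsigned long sum_s, unsigned long sum_r,
                                 unsigned long sum_sq_s, unsigned long sum_sq_r,
                                 unsigned long sum_sxr, int count) {
  const double c1 = (0.01 * 255) * (0.01 * 255);  /* stabilizing constants */
  const double c2 = (0.03 * 255) * (0.03 * 255);
  const double mean_s = (double)sum_s / count;
  const double mean_r = (double)sum_r / count;
  const double var_s  = (double)sum_sq_s / count - mean_s * mean_s;
  const double var_r  = (double)sum_sq_r / count - mean_r * mean_r;
  const double cov    = (double)sum_sxr / count - mean_s * mean_r;
  return ((2 * mean_s * mean_r + c1) * (2 * cov + c2)) /
         ((mean_s * mean_s + mean_r * mean_r + c1) * (var_s + var_r + c2));
}

int main(void) {
  /* identical, constant 8x8 windows (all pixels = 100) give SSIM = 1.0 */
  const unsigned long s = 64UL * 100, sq = 64UL * 100 * 100;
  printf("%f\n", ssim_window_sketch(s, s, sq, sq, sq, 64));
  return 0;
}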
--- a/vp8/encoder/temporal_filter.c
+++ /dev/null
@@ -1,516 +1,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include "vp8/common/onyxc_int.h"
-#include "onyx_int.h"
-#include "vp8/common/systemdependent.h"
-#include "quantize.h"
-#include "vp8/common/alloccommon.h"
-#include "mcomp.h"
-#include "firstpass.h"
-#include "psnr.h"
-#include "vpx_scale/vpxscale.h"
-#include "vp8/common/extend.h"
-#include "ratectrl.h"
-#include "vp8/common/quant_common.h"
-#include "segmentation.h"
-#include "vpx_scale/yv12extend.h"
-#include "vpx_mem/vpx_mem.h"
-#include "vp8/common/swapyv12buffer.h"
-#include "vpx_ports/vpx_timer.h"
-
-#include <math.h>
-#include <limits.h>
-
-#define ALT_REF_MC_ENABLED 1 // dis/enable MC in AltRef filtering
-#define ALT_REF_SUBPEL_ENABLED 1 // dis/enable subpel in MC AltRef filtering
-
-#if VP9_TEMPORAL_ALT_REF
-
-
-static void temporal_filter_predictors_mb_c
-(
- MACROBLOCKD *xd,
- unsigned char *y_mb_ptr,
- unsigned char *u_mb_ptr,
- unsigned char *v_mb_ptr,
- int stride,
- int mv_row,
- int mv_col,
- unsigned char *pred
-) {
- int offset;
- unsigned char *yptr, *uptr, *vptr;
- int omv_row, omv_col;
-
- // Y
- yptr = y_mb_ptr + (mv_row >> 3) * stride + (mv_col >> 3);
-
- if ((mv_row | mv_col) & 7) {
- xd->subpixel_predict16x16(yptr, stride,
- (mv_col & 7) << 1, (mv_row & 7) << 1, &pred[0], 16);
- } else {
- vp9_copy_mem16x16(yptr, stride, &pred[0], 16);
- }
-
- // U & V
- omv_row = mv_row;
- omv_col = mv_col;
- mv_row >>= 1;
- mv_col >>= 1;
- stride = (stride + 1) >> 1;
- offset = (mv_row >> 3) * stride + (mv_col >> 3);
- uptr = u_mb_ptr + offset;
- vptr = v_mb_ptr + offset;
-
- if ((omv_row | omv_col) & 15) {
- xd->subpixel_predict8x8(uptr, stride,
- (omv_col & 15), (omv_row & 15), &pred[256], 8);
- xd->subpixel_predict8x8(vptr, stride,
- (omv_col & 15), (omv_row & 15), &pred[320], 8);
- }
- else {
- vp9_copy_mem8x8(uptr, stride, &pred[256], 8);
- vp9_copy_mem8x8(vptr, stride, &pred[320], 8);
- }
-}
-void vp9_temporal_filter_apply_c
-(
- unsigned char *frame1,
- unsigned int stride,
- unsigned char *frame2,
- unsigned int block_size,
- int strength,
- int filter_weight,
- unsigned int *accumulator,
- unsigned short *count
-) {
- unsigned int i, j, k;
- int modifier;
- int byte = 0;
-
- for (i = 0, k = 0; i < block_size; i++) {
- for (j = 0; j < block_size; j++, k++) {
-
- int src_byte = frame1[byte];
- int pixel_value = *frame2++;
-
- modifier = src_byte - pixel_value;
- // This is an integer approximation of:
-      // float coeff = (3.0 * modifier * modifier) / pow(2, strength);
- // modifier = (int)roundf(coeff > 16 ? 0 : 16-coeff);
- modifier *= modifier;
- modifier *= 3;
- modifier += 1 << (strength - 1);
- modifier >>= strength;
-
- if (modifier > 16)
- modifier = 16;
-
- modifier = 16 - modifier;
- modifier *= filter_weight;
-
- count[k] += modifier;
- accumulator[k] += modifier * pixel_value;
-
- byte++;
- }
-
- byte += stride - block_size;
- }
-}
-
-#if ALT_REF_MC_ENABLED
-
-static int temporal_filter_find_matching_mb_c
-(
- VP9_COMP *cpi,
- YV12_BUFFER_CONFIG *arf_frame,
- YV12_BUFFER_CONFIG *frame_ptr,
- int mb_offset,
- int error_thresh
-) {
- MACROBLOCK *x = &cpi->mb;
- int step_param;
- int further_steps;
- int sadpb = x->sadperbit16;
- int bestsme = INT_MAX;
-
- BLOCK *b = &x->block[0];
- BLOCKD *d = &x->e_mbd.block[0];
- int_mv best_ref_mv1;
- int_mv best_ref_mv1_full; /* full-pixel value of best_ref_mv1 */
-
- // Save input state
- unsigned char **base_src = b->base_src;
- int src = b->src;
- int src_stride = b->src_stride;
- unsigned char **base_pre = d->base_pre;
- int pre = d->pre;
- int pre_stride = d->pre_stride;
-
- best_ref_mv1.as_int = 0;
- best_ref_mv1_full.as_mv.col = best_ref_mv1.as_mv.col >> 3;
- best_ref_mv1_full.as_mv.row = best_ref_mv1.as_mv.row >> 3;
-
- // Setup frame pointers
- b->base_src = &arf_frame->y_buffer;
- b->src_stride = arf_frame->y_stride;
- b->src = mb_offset;
-
- d->base_pre = &frame_ptr->y_buffer;
- d->pre_stride = frame_ptr->y_stride;
- d->pre = mb_offset;
-
- // Further step/diamond searches as necessary
- if (cpi->Speed < 8) {
- step_param = cpi->sf.first_step +
- ((cpi->Speed > 5) ? 1 : 0);
- further_steps =
- (cpi->sf.max_step_search_steps - 1) - step_param;
- } else {
- step_param = cpi->sf.first_step + 2;
- further_steps = 0;
- }
-
- /*cpi->sf.search_method == HEX*/
- // TODO Check that the 16x16 vf & sdf are selected here
- // Ignore mv costing by sending NULL pointer instead of cost arrays
- bestsme = vp9_hex_search(x, b, d, &best_ref_mv1_full, &d->bmi.as_mv.first,
- step_param, sadpb, &cpi->fn_ptr[BLOCK_16X16],
- NULLMVCOST, NULLMVCOST,
- &best_ref_mv1);
-
-#if ALT_REF_SUBPEL_ENABLED
- // Try sub-pixel MC?
- // if (bestsme > error_thresh && bestsme < INT_MAX)
- {
- int distortion;
- unsigned int sse;
- // Ignore mv costing by sending NULL pointer instead of cost array
- bestsme = cpi->find_fractional_mv_step(x, b, d, &d->bmi.as_mv.first,
- &best_ref_mv1,
- x->errorperbit,
- &cpi->fn_ptr[BLOCK_16X16],
- NULLMVCOST,
- &distortion, &sse);
- }
-#endif
-
-  // Restore input state
- b->base_src = base_src;
- b->src = src;
- b->src_stride = src_stride;
- d->base_pre = base_pre;
- d->pre = pre;
- d->pre_stride = pre_stride;
-
- return bestsme;
-}
-#endif
-
-static void temporal_filter_iterate_c
-(
- VP9_COMP *cpi,
- int frame_count,
- int alt_ref_index,
- int strength
-) {
- int byte;
- int frame;
- int mb_col, mb_row;
- unsigned int filter_weight;
- int mb_cols = cpi->common.mb_cols;
- int mb_rows = cpi->common.mb_rows;
- int mb_y_offset = 0;
- int mb_uv_offset = 0;
- DECLARE_ALIGNED_ARRAY(16, unsigned int, accumulator, 16 * 16 + 8 * 8 + 8 * 8);
- DECLARE_ALIGNED_ARRAY(16, unsigned short, count, 16 * 16 + 8 * 8 + 8 * 8);
- MACROBLOCKD *mbd = &cpi->mb.e_mbd;
- YV12_BUFFER_CONFIG *f = cpi->frames[alt_ref_index];
- unsigned char *dst1, *dst2;
- DECLARE_ALIGNED_ARRAY(16, unsigned char, predictor, 16 * 16 + 8 * 8 + 8 * 8);
-
- // Save input state
- unsigned char *y_buffer = mbd->pre.y_buffer;
- unsigned char *u_buffer = mbd->pre.u_buffer;
- unsigned char *v_buffer = mbd->pre.v_buffer;
-
- for (mb_row = 0; mb_row < mb_rows; mb_row++) {
-#if ALT_REF_MC_ENABLED
-    // Source frames are extended to 16 pixels. This is different from the
-    // L/A/G reference frames, which have a border of 32 (VP8BORDERINPIXELS).
- // A 6/8 tap filter is used for motion search. This requires 2 pixels
- // before and 3 pixels after. So the largest Y mv on a border would
- // then be 16 - INTERP_EXTEND. The UV blocks are half the size of the Y and
- // therefore only extended by 8. The largest mv that a UV block
- // can support is 8 - INTERP_EXTEND. A UV mv is half of a Y mv.
-    // can support is 8 - INTERP_EXTEND. A UV mv is half of a Y mv, i.e.
-    // (16 - INTERP_EXTEND) >> 1, which is greater than 8 - INTERP_EXTEND.
- // can be on a border is therefore 16 - (2*INTERP_EXTEND+1).
- cpi->mb.mv_row_min = -((mb_row * 16) + (17 - 2 * INTERP_EXTEND));
- cpi->mb.mv_row_max = ((cpi->common.mb_rows - 1 - mb_row) * 16)
- + (17 - 2 * INTERP_EXTEND);
-#endif
-
- for (mb_col = 0; mb_col < mb_cols; mb_col++) {
- int i, j, k;
- int stride;
-
- vpx_memset(accumulator, 0, 384 * sizeof(unsigned int));
- vpx_memset(count, 0, 384 * sizeof(unsigned short));
-
-#if ALT_REF_MC_ENABLED
- cpi->mb.mv_col_min = -((mb_col * 16) + (17 - 2 * INTERP_EXTEND));
- cpi->mb.mv_col_max = ((cpi->common.mb_cols - 1 - mb_col) * 16)
- + (17 - 2 * INTERP_EXTEND);
-#endif
-
- for (frame = 0; frame < frame_count; frame++) {
- if (cpi->frames[frame] == NULL)
- continue;
-
- mbd->block[0].bmi.as_mv.first.as_mv.row = 0;
- mbd->block[0].bmi.as_mv.first.as_mv.col = 0;
-
- if (frame == alt_ref_index) {
- filter_weight = 2;
- } else {
- int err = 0;
-#if ALT_REF_MC_ENABLED
-#define THRESH_LOW 10000
-#define THRESH_HIGH 20000
-
- // Find best match in this frame by MC
- err = temporal_filter_find_matching_mb_c
- (cpi,
- cpi->frames[alt_ref_index],
- cpi->frames[frame],
- mb_y_offset,
- THRESH_LOW);
-#endif
-          // Assign a higher weight to the matching MB if its error
-          // score is lower. If MC is not applied, the default behavior
-          // is to weight all MBs equally.
- filter_weight = err < THRESH_LOW
- ? 2 : err < THRESH_HIGH ? 1 : 0;
- }
-
- if (filter_weight != 0) {
- // Construct the predictors
- temporal_filter_predictors_mb_c
- (mbd,
- cpi->frames[frame]->y_buffer + mb_y_offset,
- cpi->frames[frame]->u_buffer + mb_uv_offset,
- cpi->frames[frame]->v_buffer + mb_uv_offset,
- cpi->frames[frame]->y_stride,
- mbd->block[0].bmi.as_mv.first.as_mv.row,
- mbd->block[0].bmi.as_mv.first.as_mv.col,
- predictor);
-
- // Apply the filter (YUV)
- TEMPORAL_INVOKE(&cpi->rtcd.temporal, apply)
- (f->y_buffer + mb_y_offset,
- f->y_stride,
- predictor,
- 16,
- strength,
- filter_weight,
- accumulator,
- count);
-
- TEMPORAL_INVOKE(&cpi->rtcd.temporal, apply)
- (f->u_buffer + mb_uv_offset,
- f->uv_stride,
- predictor + 256,
- 8,
- strength,
- filter_weight,
- accumulator + 256,
- count + 256);
-
- TEMPORAL_INVOKE(&cpi->rtcd.temporal, apply)
- (f->v_buffer + mb_uv_offset,
- f->uv_stride,
- predictor + 320,
- 8,
- strength,
- filter_weight,
- accumulator + 320,
- count + 320);
- }
- }
-
- // Normalize filter output to produce AltRef frame
- dst1 = cpi->alt_ref_buffer.y_buffer;
- stride = cpi->alt_ref_buffer.y_stride;
- byte = mb_y_offset;
- for (i = 0, k = 0; i < 16; i++) {
- for (j = 0; j < 16; j++, k++) {
- unsigned int pval = accumulator[k] + (count[k] >> 1);
- pval *= cpi->fixed_divide[count[k]];
- pval >>= 19;
-
- dst1[byte] = (unsigned char)pval;
-
- // move to next pixel
- byte++;
- }
-
- byte += stride - 16;
- }
-
- dst1 = cpi->alt_ref_buffer.u_buffer;
- dst2 = cpi->alt_ref_buffer.v_buffer;
- stride = cpi->alt_ref_buffer.uv_stride;
- byte = mb_uv_offset;
- for (i = 0, k = 256; i < 8; i++) {
- for (j = 0; j < 8; j++, k++) {
- int m = k + 64;
-
- // U
- unsigned int pval = accumulator[k] + (count[k] >> 1);
- pval *= cpi->fixed_divide[count[k]];
- pval >>= 19;
- dst1[byte] = (unsigned char)pval;
-
- // V
- pval = accumulator[m] + (count[m] >> 1);
- pval *= cpi->fixed_divide[count[m]];
- pval >>= 19;
- dst2[byte] = (unsigned char)pval;
-
- // move to next pixel
- byte++;
- }
-
- byte += stride - 8;
- }
-
- mb_y_offset += 16;
- mb_uv_offset += 8;
- }
-
- mb_y_offset += 16 * (f->y_stride - mb_cols);
- mb_uv_offset += 8 * (f->uv_stride - mb_cols);
- }
-
- // Restore input state
- mbd->pre.y_buffer = y_buffer;
- mbd->pre.u_buffer = u_buffer;
- mbd->pre.v_buffer = v_buffer;
-}
-
-void vp9_temporal_filter_prepare_c
-(
- VP9_COMP *cpi,
- int distance
-) {
- int frame = 0;
-
- int num_frames_backward = 0;
- int num_frames_forward = 0;
- int frames_to_blur_backward = 0;
- int frames_to_blur_forward = 0;
- int frames_to_blur = 0;
- int start_frame = 0;
-
- int strength = cpi->oxcf.arnr_strength;
-
- int blur_type = cpi->oxcf.arnr_type;
-
- int max_frames = cpi->active_arnr_frames;
-
- num_frames_backward = distance;
- num_frames_forward = vp9_lookahead_depth(cpi->lookahead)
- - (num_frames_backward + 1);
-
- switch (blur_type) {
- case 1:
- /////////////////////////////////////////
- // Backward Blur
-
- frames_to_blur_backward = num_frames_backward;
-
- if (frames_to_blur_backward >= max_frames)
- frames_to_blur_backward = max_frames - 1;
-
- frames_to_blur = frames_to_blur_backward + 1;
- break;
-
- case 2:
- /////////////////////////////////////////
- // Forward Blur
-
- frames_to_blur_forward = num_frames_forward;
-
- if (frames_to_blur_forward >= max_frames)
- frames_to_blur_forward = max_frames - 1;
-
- frames_to_blur = frames_to_blur_forward + 1;
- break;
-
- case 3:
- default:
- /////////////////////////////////////////
- // Center Blur
- frames_to_blur_forward = num_frames_forward;
- frames_to_blur_backward = num_frames_backward;
-
- if (frames_to_blur_forward > frames_to_blur_backward)
- frames_to_blur_forward = frames_to_blur_backward;
-
- if (frames_to_blur_backward > frames_to_blur_forward)
- frames_to_blur_backward = frames_to_blur_forward;
-
- // When max_frames is even we have 1 more frame backward than forward
- if (frames_to_blur_forward > (max_frames - 1) / 2)
- frames_to_blur_forward = ((max_frames - 1) / 2);
-
- if (frames_to_blur_backward > (max_frames / 2))
- frames_to_blur_backward = (max_frames / 2);
-
- frames_to_blur = frames_to_blur_backward + frames_to_blur_forward + 1;
- break;
- }
-
- start_frame = distance + frames_to_blur_forward;
-
-#ifdef DEBUGFWG
- // DEBUG FWG
- printf("max:%d FBCK:%d FFWD:%d ftb:%d ftbbck:%d ftbfwd:%d sei:%d lasei:%d start:%d"
-, max_frames
-, num_frames_backward
-, num_frames_forward
-, frames_to_blur
-, frames_to_blur_backward
-, frames_to_blur_forward
-, cpi->source_encode_index
-, cpi->last_alt_ref_sei
-, start_frame);
-#endif
-
-  // Set up frame pointers; NULL indicates a frame not included in the filter
- vpx_memset(cpi->frames, 0, max_frames * sizeof(YV12_BUFFER_CONFIG *));
- for (frame = 0; frame < frames_to_blur; frame++) {
- int which_buffer = start_frame - frame;
- struct lookahead_entry *buf = vp9_lookahead_peek(cpi->lookahead,
- which_buffer);
- cpi->frames[frames_to_blur - 1 - frame] = &buf->img;
- }
-
- temporal_filter_iterate_c(
- cpi,
- frames_to_blur,
- frames_to_blur_backward,
- strength);
-}
-#endif
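For context, the weighting loop in the removed vp9_temporal_filter_apply_c implements the integer approximation described in its comment: the squared pixel difference is scaled by 3, rounded, shifted down by `strength`, clamped to 16, and the complement becomes the blending weight. A standalone sketch of that per-pixel weight (hypothetical helper name; assumes strength >= 1):

#include <stdio.h>

static int temporal_weight_sketch(int src, int pred, int strength,
                                  int filter_weight) {
  int modifier = src - pred;
  modifier *= modifier;             /* d^2 */
  modifier *= 3;                    /* 3 * d^2 */
  modifier += 1 << (strength - 1);  /* rounding term */
  modifier >>= strength;            /* / 2^strength */
  if (modifier > 16)
    modifier = 16;
  /* 0 for a large difference, 16 * filter_weight for identical pixels */
  return (16 - modifier) * filter_weight;
}

int main(void) {
  /* e.g. strength 6, alt-ref weight 2: identical pixels get weight 32;
   * a difference of 10 gets (16 - ((300 + 32) >> 6)) * 2 = (16 - 5) * 2 = 22. */
  printf("%d %d\n", temporal_weight_sketch(100, 100, 6, 2),
                    temporal_weight_sketch(110, 100, 6, 2));
  return 0;
}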
--- a/vp8/encoder/temporal_filter.h
+++ /dev/null
@@ -1,47 +1,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#ifndef __INC_TEMPORAL_FILTER_H
-#define __INC_TEMPORAL_FILTER_H
-
-#define prototype_apply(sym)\
- void (sym) \
- ( \
- unsigned char *frame1, \
- unsigned int stride, \
- unsigned char *frame2, \
- unsigned int block_size, \
- int strength, \
- int filter_weight, \
- unsigned int *accumulator, \
- unsigned short *count \
- )
-
-#if ARCH_X86 || ARCH_X86_64
-#include "x86/temporal_filter_x86.h"
-#endif
-
-#ifndef vp9_temporal_filter_apply
-#define vp9_temporal_filter_apply vp9_temporal_filter_apply_c
-#endif
-extern prototype_apply(vp9_temporal_filter_apply);
-
-typedef struct {
- prototype_apply(*apply);
-} vp9_temporal_rtcd_vtable_t;
-
-#if CONFIG_RUNTIME_CPU_DETECT
-#define TEMPORAL_INVOKE(ctx,fn) (ctx)->fn
-#else
-#define TEMPORAL_INVOKE(ctx,fn) vp9_temporal_filter_##fn
-#endif
-
-#endif // __INC_TEMPORAL_FILTER_H
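For context, the removed header wires vp9_temporal_filter_apply into the library's old dispatch pattern: with CONFIG_RUNTIME_CPU_DETECT the call goes through a vtable of function pointers, otherwise the macro binds directly to the C symbol. A minimal sketch of the same pattern; the names (filter_apply_c, FILTER_INVOKE, the config flag) are invented for illustration:

#include <stdio.h>
#define FILTER_RUNTIME_CPU_DETECT 1   /* hypothetical config flag */

static int filter_apply_c(int x)    { return x + 1; }  /* generic C version */
static int filter_apply_simd(int x) { return x + 1; }  /* stand-in for an optimized version */

typedef struct { int (*apply)(int); } filter_vtable;   /* analogue of vp9_temporal_rtcd_vtable_t */

#if FILTER_RUNTIME_CPU_DETECT
#define FILTER_INVOKE(ctx, fn) (ctx)->fn        /* dispatch through the vtable at run time */
#else
#define FILTER_INVOKE(ctx, fn) filter_##fn##_c  /* bind directly to the C symbol at compile time */
#endif

int main(void) {
  filter_vtable vt;
  vt.apply = 1 /* e.g. cpu_has_simd() */ ? filter_apply_simd : filter_apply_c;
  printf("%d\n", FILTER_INVOKE(&vt, apply)(41));  /* prints 42 either way */
  return 0;
}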
--- a/vp8/encoder/tokenize.c
+++ /dev/null
@@ -1,868 +1,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include <math.h>
-#include <stdio.h>
-#include <string.h>
-#include <assert.h>
-#include "onyx_int.h"
-#include "tokenize.h"
-#include "vpx_mem/vpx_mem.h"
-
-#include "vp8/common/pred_common.h"
-#include "vp8/common/seg_common.h"
-#include "vp8/common/entropy.h"
-
-/* Global event counters used for accumulating statistics across several
- compressions, then generating context.c = initial stats. */
-
-#ifdef ENTROPY_STATS
-INT64 context_counters[BLOCK_TYPES] [COEF_BANDS] [PREV_COEF_CONTEXTS] [MAX_ENTROPY_TOKENS];
-INT64 hybrid_context_counters[BLOCK_TYPES] [COEF_BANDS] [PREV_COEF_CONTEXTS] [MAX_ENTROPY_TOKENS];
-
-INT64 context_counters_8x8[BLOCK_TYPES_8X8] [COEF_BANDS] [PREV_COEF_CONTEXTS] [MAX_ENTROPY_TOKENS];
-INT64 hybrid_context_counters_8x8[BLOCK_TYPES_8X8] [COEF_BANDS] [PREV_COEF_CONTEXTS] [MAX_ENTROPY_TOKENS];
-
-INT64 context_counters_16x16[BLOCK_TYPES_16X16] [COEF_BANDS] [PREV_COEF_CONTEXTS] [MAX_ENTROPY_TOKENS];
-INT64 hybrid_context_counters_16x16[BLOCK_TYPES_16X16] [COEF_BANDS] [PREV_COEF_CONTEXTS] [MAX_ENTROPY_TOKENS];
-
-extern unsigned int tree_update_hist[BLOCK_TYPES][COEF_BANDS]
- [PREV_COEF_CONTEXTS][ENTROPY_NODES][2];
-extern unsigned int hybrid_tree_update_hist[BLOCK_TYPES][COEF_BANDS]
- [PREV_COEF_CONTEXTS][ENTROPY_NODES][2];
-extern unsigned int tree_update_hist_8x8[BLOCK_TYPES_8X8][COEF_BANDS]
- [PREV_COEF_CONTEXTS][ENTROPY_NODES] [2];
-extern unsigned int hybrid_tree_update_hist_8x8[BLOCK_TYPES_8X8][COEF_BANDS]
- [PREV_COEF_CONTEXTS][ENTROPY_NODES] [2];
-extern unsigned int tree_update_hist_16x16[BLOCK_TYPES_16X16][COEF_BANDS]
- [PREV_COEF_CONTEXTS][ENTROPY_NODES] [2];
-extern unsigned int hybrid_tree_update_hist_16x16[BLOCK_TYPES_16X16][COEF_BANDS]
- [PREV_COEF_CONTEXTS][ENTROPY_NODES] [2];
-#endif /* ENTROPY_STATS */
-
-void vp9_stuff_mb(VP9_COMP *cpi, MACROBLOCKD *xd, TOKENEXTRA **t, int dry_run);
-void vp9_fix_contexts(MACROBLOCKD *xd);
-
-static TOKENVALUE dct_value_tokens[DCT_MAX_VALUE * 2];
-const TOKENVALUE *vp9_dct_value_tokens_ptr;
-static int dct_value_cost[DCT_MAX_VALUE * 2];
-const int *vp9_dct_value_cost_ptr;
-
-static void fill_value_tokens() {
-
- TOKENVALUE *const t = dct_value_tokens + DCT_MAX_VALUE;
- vp9_extra_bit_struct *const e = vp9_extra_bits;
-
- int i = -DCT_MAX_VALUE;
- int sign = 1;
-
- do {
- if (!i)
- sign = 0;
-
- {
- const int a = sign ? -i : i;
- int eb = sign;
-
- if (a > 4) {
- int j = 4;
-
- while (++j < 11 && e[j].base_val <= a) {}
-
- t[i].Token = --j;
- eb |= (a - e[j].base_val) << 1;
- } else
- t[i].Token = a;
-
- t[i].Extra = eb;
- }
-
-    // initialize the cost for extra bits for all possible coefficient values.
- {
- int cost = 0;
- vp9_extra_bit_struct *p = vp9_extra_bits + t[i].Token;
-
- if (p->base_val) {
- const int extra = t[i].Extra;
- const int Length = p->Len;
-
- if (Length)
- cost += treed_cost(p->tree, p->prob, extra >> 1, Length);
-
- cost += vp9_cost_bit(vp9_prob_half, extra & 1); /* sign */
- dct_value_cost[i + DCT_MAX_VALUE] = cost;
- }
-
- }
-
- } while (++i < DCT_MAX_VALUE);
-
- vp9_dct_value_tokens_ptr = dct_value_tokens + DCT_MAX_VALUE;
- vp9_dct_value_cost_ptr = dct_value_cost + DCT_MAX_VALUE;
-}
-
-static void tokenize_b(VP9_COMP *cpi,
- MACROBLOCKD *xd,
- const BLOCKD * const b,
- TOKENEXTRA **tp,
- PLANE_TYPE type,
- ENTROPY_CONTEXT *a,
- ENTROPY_CONTEXT *l,
- TX_SIZE tx_size,
- int dry_run) {
- int pt; /* near block/prev token context index */
- int c = (type == PLANE_TYPE_Y_NO_DC) ? 1 : 0;
- const int eob = b->eob; /* one beyond last nonzero coeff */
- TOKENEXTRA *t = *tp; /* store tokens starting here */
- const short *qcoeff_ptr = b->qcoeff;
- int seg_eob;
- int segment_id = xd->mode_info_context->mbmi.segment_id;
- const int *bands, *scan;
- unsigned int (*counts)[COEF_BANDS][PREV_COEF_CONTEXTS][MAX_ENTROPY_TOKENS];
- vp9_prob (*probs)[COEF_BANDS][PREV_COEF_CONTEXTS][ENTROPY_NODES];
- const TX_TYPE tx_type = (type == PLANE_TYPE_Y_WITH_DC) ?
- get_tx_type(xd, b) : DCT_DCT;
-
- VP9_COMBINEENTROPYCONTEXTS(pt, *a, *l);
- switch (tx_size) {
- default:
- case TX_4X4:
- seg_eob = 16;
- bands = vp9_coef_bands;
- scan = vp9_default_zig_zag1d;
- if (tx_type != DCT_DCT) {
- counts = cpi->hybrid_coef_counts;
- probs = cpi->common.fc.hybrid_coef_probs;
- if (tx_type == ADST_DCT) {
- scan = vp9_row_scan;
- } else if (tx_type == DCT_ADST) {
- scan = vp9_col_scan;
- }
- } else {
- counts = cpi->coef_counts;
- probs = cpi->common.fc.coef_probs;
- }
- break;
- case TX_8X8:
- if (type == PLANE_TYPE_Y2) {
- seg_eob = 4;
- bands = vp9_coef_bands;
- scan = vp9_default_zig_zag1d;
- } else {
- seg_eob = 64;
- bands = vp9_coef_bands_8x8;
- scan = vp9_default_zig_zag1d_8x8;
- }
- if (tx_type != DCT_DCT) {
- counts = cpi->hybrid_coef_counts_8x8;
- probs = cpi->common.fc.hybrid_coef_probs_8x8;
- } else {
- counts = cpi->coef_counts_8x8;
- probs = cpi->common.fc.coef_probs_8x8;
- }
- break;
- case TX_16X16:
- seg_eob = 256;
- bands = vp9_coef_bands_16x16;
- scan = vp9_default_zig_zag1d_16x16;
- if (tx_type != DCT_DCT) {
- counts = cpi->hybrid_coef_counts_16x16;
- probs = cpi->common.fc.hybrid_coef_probs_16x16;
- } else {
- counts = cpi->coef_counts_16x16;
- probs = cpi->common.fc.coef_probs_16x16;
- }
- break;
- }
-
- if (vp9_segfeature_active(xd, segment_id, SEG_LVL_EOB))
- seg_eob = vp9_get_segdata(xd, segment_id, SEG_LVL_EOB);
-
- do {
- const int band = bands[c];
- int token;
-
- if (c < eob) {
- const int rc = scan[c];
- const int v = qcoeff_ptr[rc];
-
- assert(-DCT_MAX_VALUE <= v && v < DCT_MAX_VALUE);
-
- t->Extra = vp9_dct_value_tokens_ptr[v].Extra;
- token = vp9_dct_value_tokens_ptr[v].Token;
- } else {
- token = DCT_EOB_TOKEN;
- }
-
- t->Token = token;
- t->context_tree = probs[type][band][pt];
- t->skip_eob_node = (pt == 0) && ((band > 0 && type != PLANE_TYPE_Y_NO_DC) ||
- (band > 1 && type == PLANE_TYPE_Y_NO_DC));
- assert(vp9_coef_encodings[t->Token].Len - t->skip_eob_node > 0);
- if (!dry_run) {
- ++counts[type][band][pt][token];
- }
- pt = vp9_prev_token_class[token];
- ++t;
- } while (c < eob && ++c < seg_eob);
-
- *tp = t;
- *a = *l = (c != !type); /* 0 <-> all coeff data is zero */
-}
-
-int vp9_mby_is_skippable_4x4(MACROBLOCKD *xd, int has_y2_block) {
- int skip = 1;
- int i = 0;
-
- if (has_y2_block) {
- for (i = 0; i < 16; i++)
- skip &= (xd->block[i].eob < 2);
- skip &= (!xd->block[24].eob);
- } else {
- for (i = 0; i < 16; i++)
- skip &= (!xd->block[i].eob);
- }
- return skip;
-}
-
-int vp9_mbuv_is_skippable_4x4(MACROBLOCKD *xd) {
- int skip = 1;
- int i;
-
- for (i = 16; i < 24; i++)
- skip &= (!xd->block[i].eob);
- return skip;
-}
-
-static int mb_is_skippable_4x4(MACROBLOCKD *xd, int has_y2_block) {
- return (vp9_mby_is_skippable_4x4(xd, has_y2_block) &
- vp9_mbuv_is_skippable_4x4(xd));
-}
-
-int vp9_mby_is_skippable_8x8(MACROBLOCKD *xd, int has_y2_block) {
- int skip = 1;
- int i = 0;
-
- if (has_y2_block) {
- for (i = 0; i < 16; i += 4)
- skip &= (xd->block[i].eob < 2);
- skip &= (!xd->block[24].eob);
- } else {
- for (i = 0; i < 16; i += 4)
- skip &= (!xd->block[i].eob);
- }
- return skip;
-}
-
-int vp9_mbuv_is_skippable_8x8(MACROBLOCKD *xd) {
- return (!xd->block[16].eob) & (!xd->block[20].eob);
-}
-
-static int mb_is_skippable_8x8(MACROBLOCKD *xd, int has_y2_block) {
- return (vp9_mby_is_skippable_8x8(xd, has_y2_block) &
- vp9_mbuv_is_skippable_8x8(xd));
-}
-
-static int mb_is_skippable_8x8_4x4uv(MACROBLOCKD *xd, int has_y2_block) {
- return (vp9_mby_is_skippable_8x8(xd, has_y2_block) &
- vp9_mbuv_is_skippable_4x4(xd));
-}
-
-int vp9_mby_is_skippable_16x16(MACROBLOCKD *xd) {
- int skip = 1;
- skip &= !xd->block[0].eob;
- return skip;
-}
-
-static int mb_is_skippable_16x16(MACROBLOCKD *xd) {
- return (vp9_mby_is_skippable_16x16(xd) & vp9_mbuv_is_skippable_8x8(xd));
-}
-
-void vp9_tokenize_mb(VP9_COMP *cpi,
- MACROBLOCKD *xd,
- TOKENEXTRA **t,
- int dry_run) {
- PLANE_TYPE plane_type;
- int has_y2_block;
- int b;
- int tx_size = xd->mode_info_context->mbmi.txfm_size;
- int mb_skip_context = vp9_get_pred_context(&cpi->common, xd, PRED_MBSKIP);
- TOKENEXTRA *t_backup = *t;
- ENTROPY_CONTEXT * A = (ENTROPY_CONTEXT *) xd->above_context;
- ENTROPY_CONTEXT * L = (ENTROPY_CONTEXT *) xd->left_context;
-
-  // If the MB is going to be skipped because of a segment-level flag,
-  // exclude it from the skip count stats used to calculate the
-  // transmitted skip probability.
- int skip_inc;
- int segment_id = xd->mode_info_context->mbmi.segment_id;
-
- if (!vp9_segfeature_active(xd, segment_id, SEG_LVL_EOB) ||
- (vp9_get_segdata(xd, segment_id, SEG_LVL_EOB) != 0)) {
- skip_inc = 1;
- } else
- skip_inc = 0;
-
- has_y2_block = (tx_size != TX_16X16
- && xd->mode_info_context->mbmi.mode != B_PRED
- && xd->mode_info_context->mbmi.mode != I8X8_PRED
- && xd->mode_info_context->mbmi.mode != SPLITMV);
-
- switch (tx_size) {
- case TX_16X16:
- xd->mode_info_context->mbmi.mb_skip_coeff = mb_is_skippable_16x16(xd);
- break;
- case TX_8X8:
- if (xd->mode_info_context->mbmi.mode == I8X8_PRED ||
- xd->mode_info_context->mbmi.mode == SPLITMV)
- xd->mode_info_context->mbmi.mb_skip_coeff = mb_is_skippable_8x8_4x4uv(xd, 0);
- else
- xd->mode_info_context->mbmi.mb_skip_coeff = mb_is_skippable_8x8(xd, has_y2_block);
- break;
-
- default:
- xd->mode_info_context->mbmi.mb_skip_coeff = mb_is_skippable_4x4(xd, has_y2_block);
- break;
- }
-
- if (xd->mode_info_context->mbmi.mb_skip_coeff) {
- if (!dry_run)
- cpi->skip_true_count[mb_skip_context] += skip_inc;
- if (!cpi->common.mb_no_coeff_skip) {
- vp9_stuff_mb(cpi, xd, t, dry_run);
- } else {
- vp9_fix_contexts(xd);
- }
- if (dry_run)
- *t = t_backup;
- return;
- }
-
- if (!dry_run)
- cpi->skip_false_count[mb_skip_context] += skip_inc;
-
- if (has_y2_block) {
- if (tx_size == TX_8X8) {
- tokenize_b(cpi, xd, xd->block + 24, t, PLANE_TYPE_Y2,
- A + vp9_block2above_8x8[24], L + vp9_block2left_8x8[24],
- TX_8X8, dry_run);
- } else {
- tokenize_b(cpi, xd, xd->block + 24, t, PLANE_TYPE_Y2,
- A + vp9_block2above[24], L + vp9_block2left[24],
- TX_4X4, dry_run);
- }
-
- plane_type = PLANE_TYPE_Y_NO_DC;
- } else
- plane_type = PLANE_TYPE_Y_WITH_DC;
-
- if (tx_size == TX_16X16) {
- tokenize_b(cpi, xd, xd->block, t, PLANE_TYPE_Y_WITH_DC,
- A, L, TX_16X16, dry_run);
- A[1] = A[2] = A[3] = A[0];
- L[1] = L[2] = L[3] = L[0];
-
- for (b = 16; b < 24; b += 4) {
- tokenize_b(cpi, xd, xd->block + b, t, PLANE_TYPE_UV,
- A + vp9_block2above_8x8[b], L + vp9_block2left_8x8[b],
- TX_8X8, dry_run);
- A[vp9_block2above_8x8[b] + 1] = A[vp9_block2above_8x8[b]];
- L[vp9_block2left_8x8[b] + 1] = L[vp9_block2left_8x8[b]];
- }
- vpx_memset(&A[8], 0, sizeof(A[8]));
- vpx_memset(&L[8], 0, sizeof(L[8]));
- } else if (tx_size == TX_8X8) {
- for (b = 0; b < 16; b += 4) {
- tokenize_b(cpi, xd, xd->block + b, t, plane_type,
- A + vp9_block2above_8x8[b], L + vp9_block2left_8x8[b],
- TX_8X8, dry_run);
- A[vp9_block2above_8x8[b] + 1] = A[vp9_block2above_8x8[b]];
- L[vp9_block2left_8x8[b] + 1] = L[vp9_block2left_8x8[b]];
- }
- if (xd->mode_info_context->mbmi.mode == I8X8_PRED ||
- xd->mode_info_context->mbmi.mode == SPLITMV) {
- for (b = 16; b < 24; b++) {
- tokenize_b(cpi, xd, xd->block + b, t, PLANE_TYPE_UV,
- A + vp9_block2above[b], L + vp9_block2left[b],
- TX_4X4, dry_run);
- }
- } else {
- for (b = 16; b < 24; b += 4) {
- tokenize_b(cpi, xd, xd->block + b, t, PLANE_TYPE_UV,
- A + vp9_block2above_8x8[b], L + vp9_block2left_8x8[b],
- TX_8X8, dry_run);
- A[vp9_block2above_8x8[b] + 1] = A[vp9_block2above_8x8[b]];
- L[vp9_block2left_8x8[b] + 1] = L[vp9_block2left_8x8[b]];
- }
- }
- } else {
- for (b = 0; b < 16; b++) {
- tokenize_b(cpi, xd, xd->block + b, t, plane_type,
- A + vp9_block2above[b], L + vp9_block2left[b],
- TX_4X4, dry_run);
- }
-
- for (b = 16; b < 24; b++) {
- tokenize_b(cpi, xd, xd->block + b, t, PLANE_TYPE_UV,
- A + vp9_block2above[b], L + vp9_block2left[b],
- TX_4X4, dry_run);
- }
- }
- if (dry_run)
- *t = t_backup;
-}
-
-
-#ifdef ENTROPY_STATS
-void init_context_counters(void) {
- FILE *f = fopen("context.bin", "rb");
- if (!f) {
- vpx_memset(context_counters, 0, sizeof(context_counters));
- vpx_memset(context_counters_8x8, 0, sizeof(context_counters_8x8));
- vpx_memset(context_counters_16x16, 0, sizeof(context_counters_16x16));
- } else {
- fread(context_counters, sizeof(context_counters), 1, f);
- fread(context_counters_8x8, sizeof(context_counters_8x8), 1, f);
- fread(context_counters_16x16, sizeof(context_counters_16x16), 1, f);
- fclose(f);
- }
-
- f = fopen("treeupdate.bin", "rb");
- if (!f) {
- vpx_memset(tree_update_hist, 0, sizeof(tree_update_hist));
- vpx_memset(tree_update_hist_8x8, 0, sizeof(tree_update_hist_8x8));
- vpx_memset(tree_update_hist_16x16, 0, sizeof(tree_update_hist_16x16));
- } else {
- fread(tree_update_hist, sizeof(tree_update_hist), 1, f);
- fread(tree_update_hist_8x8, sizeof(tree_update_hist_8x8), 1, f);
- fread(tree_update_hist_16x16, sizeof(tree_update_hist_16x16), 1, f);
- fclose(f);
- }
-}
-
-void print_context_counters() {
- int type, band, pt, t;
- FILE *f = fopen("context.c", "w");
-
- fprintf(f, "#include \"entropy.h\"\n");
- fprintf(f, "\n/* *** GENERATED FILE: DO NOT EDIT *** */\n\n");
- fprintf(f, "static const unsigned int\n"
- "vp9_default_coef_counts[BLOCK_TYPES]\n"
- " [COEF_BANDS]\n"
- " [PREV_COEF_CONTEXTS]\n"
- " [MAX_ENTROPY_TOKENS]={\n");
-
-# define Comma( X) (X? ",":"")
- type = 0;
- do {
- fprintf(f, "%s\n { /* block Type %d */", Comma(type), type);
- band = 0;
- do {
- fprintf(f, "%s\n { /* Coeff Band %d */", Comma(band), band);
- pt = 0;
- do {
- fprintf(f, "%s\n {", Comma(pt));
-
- t = 0;
- do {
- const INT64 x = context_counters [type] [band] [pt] [t];
- const int y = (int) x;
- assert(x == (INT64) y); /* no overflow handling yet */
- fprintf(f, "%s %d", Comma(t), y);
- } while (++t < MAX_ENTROPY_TOKENS);
- fprintf(f, "}");
- } while (++pt < PREV_COEF_CONTEXTS);
- fprintf(f, "\n }");
- } while (++band < COEF_BANDS);
- fprintf(f, "\n }");
- } while (++type < BLOCK_TYPES);
- fprintf(f, "\n};\n");
-
- fprintf(f, "static const unsigned int\nvp9_default_coef_counts_8x8"
- "[BLOCK_TYPES_8X8] [COEF_BANDS]"
- "[PREV_COEF_CONTEXTS] [MAX_ENTROPY_TOKENS] = {");
- type = 0;
- do {
- fprintf(f, "%s\n { /* block Type %d */", Comma(type), type);
- band = 0;
- do {
- fprintf(f, "%s\n { /* Coeff Band %d */", Comma(band), band);
- pt = 0;
- do {
- fprintf(f, "%s\n {", Comma(pt));
- t = 0;
- do {
- const INT64 x = context_counters_8x8 [type] [band] [pt] [t];
- const int y = (int) x;
-
- assert(x == (INT64) y); /* no overflow handling yet */
- fprintf(f, "%s %d", Comma(t), y);
-
- } while (++t < MAX_ENTROPY_TOKENS);
-
- fprintf(f, "}");
- } while (++pt < PREV_COEF_CONTEXTS);
-
- fprintf(f, "\n }");
-
- } while (++band < COEF_BANDS);
-
- fprintf(f, "\n }");
- } while (++type < BLOCK_TYPES_8X8);
- fprintf(f, "\n};\n");
-
- fprintf(f, "static const unsigned int\nvp9_default_coef_counts_16x16"
- "[BLOCK_TYPES_16X16] [COEF_BANDS]"
- "[PREV_COEF_CONTEXTS] [MAX_ENTROPY_TOKENS] = {");
- type = 0;
- do {
- fprintf(f, "%s\n { /* block Type %d */", Comma(type), type);
- band = 0;
- do {
- fprintf(f, "%s\n { /* Coeff Band %d */", Comma(band), band);
- pt = 0;
- do {
- fprintf(f, "%s\n {", Comma(pt));
- t = 0;
- do {
- const INT64 x = context_counters_16x16 [type] [band] [pt] [t];
- const int y = (int) x;
-
- assert(x == (INT64) y); /* no overflow handling yet */
- fprintf(f, "%s %d", Comma(t), y);
-
- } while (++t < MAX_ENTROPY_TOKENS);
-
- fprintf(f, "}");
- } while (++pt < PREV_COEF_CONTEXTS);
-
- fprintf(f, "\n }");
-
- } while (++band < COEF_BANDS);
-
- fprintf(f, "\n }");
- } while (++type < BLOCK_TYPES_16X16);
- fprintf(f, "\n};\n");
-
- fprintf(f, "static const vp9_prob\n"
- "vp9_default_coef_probs[BLOCK_TYPES] [COEF_BANDS] \n"
- "[PREV_COEF_CONTEXTS] [ENTROPY_NODES] = {");
- type = 0;
- do {
- fprintf(f, "%s\n { /* block Type %d */", Comma(type), type);
- band = 0;
- do {
- fprintf(f, "%s\n { /* Coeff Band %d */", Comma(band), band);
- pt = 0;
- do {
- unsigned int branch_ct [ENTROPY_NODES] [2];
- unsigned int coef_counts[MAX_ENTROPY_TOKENS];
- vp9_prob coef_probs[ENTROPY_NODES];
- for (t = 0; t < MAX_ENTROPY_TOKENS; ++t)
- coef_counts[t] = context_counters [type] [band] [pt] [t];
- vp9_tree_probs_from_distribution(
- MAX_ENTROPY_TOKENS, vp9_coef_encodings, vp9_coef_tree,
- coef_probs, branch_ct, coef_counts, 256, 1);
- fprintf(f, "%s\n {", Comma(pt));
-
- t = 0;
- do {
- fprintf(f, "%s %d", Comma(t), coef_probs[t]);
-
- } while (++t < ENTROPY_NODES);
-
- fprintf(f, "}");
- } while (++pt < PREV_COEF_CONTEXTS);
- fprintf(f, "\n }");
- } while (++band < COEF_BANDS);
- fprintf(f, "\n }");
- } while (++type < BLOCK_TYPES);
- fprintf(f, "\n};\n");
-
- fprintf(f, "static const vp9_prob\n"
- "vp9_default_coef_probs_8x8[BLOCK_TYPES_8X8] [COEF_BANDS]\n"
- "[PREV_COEF_CONTEXTS] [ENTROPY_NODES] = {");
- type = 0;
- do {
- fprintf(f, "%s\n { /* block Type %d */", Comma(type), type);
- band = 0;
- do {
- fprintf(f, "%s\n { /* Coeff Band %d */", Comma(band), band);
- pt = 0;
- do {
- unsigned int branch_ct [ENTROPY_NODES] [2];
- unsigned int coef_counts[MAX_ENTROPY_TOKENS];
- vp9_prob coef_probs[ENTROPY_NODES];
- for (t = 0; t < MAX_ENTROPY_TOKENS; ++t)
- coef_counts[t] = context_counters_8x8[type] [band] [pt] [t];
- vp9_tree_probs_from_distribution(
- MAX_ENTROPY_TOKENS, vp9_coef_encodings, vp9_coef_tree,
- coef_probs, branch_ct, coef_counts, 256, 1);
- fprintf(f, "%s\n {", Comma(pt));
-
- t = 0;
- do {
- fprintf(f, "%s %d", Comma(t), coef_probs[t]);
- } while (++t < ENTROPY_NODES);
- fprintf(f, "}");
- } while (++pt < PREV_COEF_CONTEXTS);
- fprintf(f, "\n }");
- } while (++band < COEF_BANDS);
- fprintf(f, "\n }");
- } while (++type < BLOCK_TYPES_8X8);
- fprintf(f, "\n};\n");
-
- fprintf(f, "static const vp9_prob\n"
- "vp9_default_coef_probs_16x16[BLOCK_TYPES_16X16] [COEF_BANDS]\n"
- "[PREV_COEF_CONTEXTS] [ENTROPY_NODES] = {");
- type = 0;
- do {
- fprintf(f, "%s\n { /* block Type %d */", Comma(type), type);
- band = 0;
- do {
- fprintf(f, "%s\n { /* Coeff Band %d */", Comma(band), band);
- pt = 0;
- do {
- unsigned int branch_ct [ENTROPY_NODES] [2];
- unsigned int coef_counts[MAX_ENTROPY_TOKENS];
- vp9_prob coef_probs[ENTROPY_NODES];
- for (t = 0; t < MAX_ENTROPY_TOKENS; ++t)
- coef_counts[t] = context_counters_16x16[type] [band] [pt] [t];
- vp9_tree_probs_from_distribution(
- MAX_ENTROPY_TOKENS, vp9_coef_encodings, vp9_coef_tree,
- coef_probs, branch_ct, coef_counts, 256, 1);
- fprintf(f, "%s\n {", Comma(pt));
-
- t = 0;
- do {
- fprintf(f, "%s %d", Comma(t), coef_probs[t]);
- } while (++t < ENTROPY_NODES);
- fprintf(f, "}");
- } while (++pt < PREV_COEF_CONTEXTS);
- fprintf(f, "\n }");
- } while (++band < COEF_BANDS);
- fprintf(f, "\n }");
- } while (++type < BLOCK_TYPES_16X16);
- fprintf(f, "\n};\n");
-
- fclose(f);
-
- f = fopen("context.bin", "wb");
- fwrite(context_counters, sizeof(context_counters), 1, f);
- fwrite(context_counters_8x8, sizeof(context_counters_8x8), 1, f);
- fwrite(context_counters_16x16, sizeof(context_counters_16x16), 1, f);
- fclose(f);
-}
-#endif
-
-void vp9_tokenize_initialize() {
- fill_value_tokens();
-}
-
-static __inline void stuff_b(VP9_COMP *cpi,
- MACROBLOCKD *xd,
- const BLOCKD * const b,
- TOKENEXTRA **tp,
- PLANE_TYPE type,
- ENTROPY_CONTEXT *a,
- ENTROPY_CONTEXT *l,
- TX_SIZE tx_size,
- int dry_run) {
- const int *bands;
- unsigned int (*counts)[COEF_BANDS][PREV_COEF_CONTEXTS][MAX_ENTROPY_TOKENS];
- vp9_prob (*probs)[COEF_BANDS][PREV_COEF_CONTEXTS][ENTROPY_NODES];
- int pt, band;
- TOKENEXTRA *t = *tp;
- const TX_TYPE tx_type = (type == PLANE_TYPE_Y_WITH_DC) ?
- get_tx_type(xd, b) : DCT_DCT;
- VP9_COMBINEENTROPYCONTEXTS(pt, *a, *l);
-
- switch (tx_size) {
- default:
- case TX_4X4:
- bands = vp9_coef_bands;
- if (tx_type != DCT_DCT) {
- counts = cpi->hybrid_coef_counts;
- probs = cpi->common.fc.hybrid_coef_probs;
- } else {
- counts = cpi->coef_counts;
- probs = cpi->common.fc.coef_probs;
- }
- break;
- case TX_8X8:
- bands = vp9_coef_bands_8x8;
- if (tx_type != DCT_DCT) {
- counts = cpi->hybrid_coef_counts_8x8;
- probs = cpi->common.fc.hybrid_coef_probs_8x8;
- } else {
- counts = cpi->coef_counts_8x8;
- probs = cpi->common.fc.coef_probs_8x8;
- }
- break;
- case TX_16X16:
- bands = vp9_coef_bands_16x16;
- if (tx_type != DCT_DCT) {
- counts = cpi->hybrid_coef_counts_16x16;
- probs = cpi->common.fc.hybrid_coef_probs_16x16;
- } else {
- counts = cpi->coef_counts_16x16;
- probs = cpi->common.fc.coef_probs_16x16;
- }
- break;
- }
- band = bands[(type == PLANE_TYPE_Y_NO_DC) ? 1 : 0];
- t->Token = DCT_EOB_TOKEN;
- t->context_tree = probs[type][band][pt];
- t->skip_eob_node = 0;
- ++t;
- *tp = t;
- *a = *l = 0;
- if (!dry_run) {
- ++counts[type][band][pt][DCT_EOB_TOKEN];
- }
-}
-
-static void stuff_mb_8x8(VP9_COMP *cpi, MACROBLOCKD *xd,
- TOKENEXTRA **t, int dry_run) {
- ENTROPY_CONTEXT *A = (ENTROPY_CONTEXT *)xd->above_context;
- ENTROPY_CONTEXT *L = (ENTROPY_CONTEXT *)xd->left_context;
- PLANE_TYPE plane_type;
- int b;
- const int has_y2_block = (xd->mode_info_context->mbmi.mode != B_PRED &&
- xd->mode_info_context->mbmi.mode != I8X8_PRED &&
- xd->mode_info_context->mbmi.mode != SPLITMV);
-
- if (has_y2_block) {
- stuff_b(cpi, xd, xd->block + 24, t, PLANE_TYPE_Y2,
- A + vp9_block2above_8x8[24], L + vp9_block2left_8x8[24],
- TX_8X8, dry_run);
- plane_type = PLANE_TYPE_Y_NO_DC;
- } else {
- plane_type = PLANE_TYPE_Y_WITH_DC;
- }
-
- for (b = 0; b < 16; b += 4) {
- stuff_b(cpi, xd, xd->block + b, t, plane_type, A + vp9_block2above_8x8[b],
- L + vp9_block2left_8x8[b], TX_8X8, dry_run);
- A[vp9_block2above_8x8[b] + 1] = A[vp9_block2above_8x8[b]];
- L[vp9_block2left_8x8[b] + 1] = L[vp9_block2left_8x8[b]];
- }
-
- for (b = 16; b < 24; b += 4) {
- stuff_b(cpi, xd, xd->block + b, t, PLANE_TYPE_UV,
- A + vp9_block2above_8x8[b], L + vp9_block2left_8x8[b],
- TX_8X8, dry_run);
- A[vp9_block2above_8x8[b] + 1] = A[vp9_block2above_8x8[b]];
- L[vp9_block2left_8x8[b] + 1] = L[vp9_block2left_8x8[b]];
- }
-}
-
-static void stuff_mb_16x16(VP9_COMP *cpi, MACROBLOCKD *xd,
- TOKENEXTRA **t, int dry_run) {
- ENTROPY_CONTEXT * A = (ENTROPY_CONTEXT *)xd->above_context;
- ENTROPY_CONTEXT * L = (ENTROPY_CONTEXT *)xd->left_context;
- int b;
-
- stuff_b(cpi, xd, xd->block, t, PLANE_TYPE_Y_WITH_DC, A, L, TX_16X16, dry_run);
- A[1] = A[2] = A[3] = A[0];
- L[1] = L[2] = L[3] = L[0];
- for (b = 16; b < 24; b += 4) {
- stuff_b(cpi, xd, xd->block + b, t, PLANE_TYPE_UV, A + vp9_block2above[b],
- L + vp9_block2above_8x8[b], TX_8X8, dry_run);
- A[vp9_block2above_8x8[b] + 1] = A[vp9_block2above_8x8[b]];
- L[vp9_block2left_8x8[b] + 1] = L[vp9_block2left_8x8[b]];
- }
- vpx_memset(&A[8], 0, sizeof(A[8]));
- vpx_memset(&L[8], 0, sizeof(L[8]));
-}
-
-static void stuff_mb_4x4(VP9_COMP *cpi, MACROBLOCKD *xd,
- TOKENEXTRA **t, int dry_run) {
- ENTROPY_CONTEXT *A = (ENTROPY_CONTEXT *)xd->above_context;
- ENTROPY_CONTEXT *L = (ENTROPY_CONTEXT *)xd->left_context;
- int b;
- PLANE_TYPE plane_type;
- const int has_y2_block = (xd->mode_info_context->mbmi.mode != B_PRED &&
- xd->mode_info_context->mbmi.mode != I8X8_PRED &&
- xd->mode_info_context->mbmi.mode != SPLITMV);
-
- if (has_y2_block) {
- stuff_b(cpi, xd, xd->block + 24, t, PLANE_TYPE_Y2, A + vp9_block2above[24],
- L + vp9_block2left[24], TX_4X4, dry_run);
- plane_type = PLANE_TYPE_Y_NO_DC;
- } else {
- plane_type = PLANE_TYPE_Y_WITH_DC;
- }
-
- for (b = 0; b < 16; b++)
- stuff_b(cpi, xd, xd->block + b, t, plane_type, A + vp9_block2above[b],
- L + vp9_block2left[b], TX_4X4, dry_run);
-
- for (b = 16; b < 24; b++)
- stuff_b(cpi, xd, xd->block + b, t, PLANE_TYPE_UV, A + vp9_block2above[b],
- L + vp9_block2left[b], TX_4X4, dry_run);
-}
-
-static void stuff_mb_8x8_4x4uv(VP9_COMP *cpi, MACROBLOCKD *xd,
- TOKENEXTRA **t, int dry_run) {
- ENTROPY_CONTEXT *A = (ENTROPY_CONTEXT *)xd->above_context;
- ENTROPY_CONTEXT *L = (ENTROPY_CONTEXT *)xd->left_context;
- int b;
-
- for (b = 0; b < 16; b += 4) {
- stuff_b(cpi, xd, xd->block + b, t, PLANE_TYPE_Y_WITH_DC,
- A + vp9_block2above_8x8[b], L + vp9_block2left_8x8[b],
- TX_8X8, dry_run);
- A[vp9_block2above_8x8[b] + 1] = A[vp9_block2above_8x8[b]];
- L[vp9_block2left_8x8[b] + 1] = L[vp9_block2left_8x8[b]];
- }
-
- for (b = 16; b < 24; b++)
- stuff_b(cpi, xd, xd->block + b, t, PLANE_TYPE_UV, A + vp9_block2above[b],
- L + vp9_block2left[b], TX_4X4, dry_run);
-}
-
-void vp9_stuff_mb(VP9_COMP *cpi, MACROBLOCKD *xd, TOKENEXTRA **t, int dry_run) {
- TX_SIZE tx_size = xd->mode_info_context->mbmi.txfm_size;
- TOKENEXTRA * const t_backup = *t;
-
- if (tx_size == TX_16X16) {
- stuff_mb_16x16(cpi, xd, t, dry_run);
- } else if (tx_size == TX_8X8) {
- if (xd->mode_info_context->mbmi.mode == I8X8_PRED ||
- xd->mode_info_context->mbmi.mode == SPLITMV) {
- stuff_mb_8x8_4x4uv(cpi, xd, t, dry_run);
- } else {
- stuff_mb_8x8(cpi, xd, t, dry_run);
- }
- } else {
- stuff_mb_4x4(cpi, xd, t, dry_run);
- }
-
- if (dry_run) {
- *t = t_backup;
- }
-}
-
-void vp9_fix_contexts(MACROBLOCKD *xd) {
- /* Clear entropy contexts for Y2 blocks */
- if ((xd->mode_info_context->mbmi.mode != B_PRED
- && xd->mode_info_context->mbmi.mode != I8X8_PRED
- && xd->mode_info_context->mbmi.mode != SPLITMV)
- || xd->mode_info_context->mbmi.txfm_size == TX_16X16
- ) {
- vpx_memset(xd->above_context, 0, sizeof(ENTROPY_CONTEXT_PLANES));
- vpx_memset(xd->left_context, 0, sizeof(ENTROPY_CONTEXT_PLANES));
- } else {
- vpx_memset(xd->above_context, 0, sizeof(ENTROPY_CONTEXT_PLANES) - 1);
- vpx_memset(xd->left_context, 0, sizeof(ENTROPY_CONTEXT_PLANES) - 1);
- }
-}
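For context, the removed fill_value_tokens() precomputes, for every possible coefficient value, a token plus packed extra bits (the sign in bit 0, and the offset above the token's base value in the upper bits). A standalone sketch of that mapping; the category base values below follow the usual VP8-style table but are an assumption for illustration, not copied from this patch:

#include <stdio.h>
#include <stdlib.h>

/* tokens 0..4 code the value directly; tokens 5..10 are categories with the
 * (assumed) base values 5, 7, 11, 19, 35, 67. */
static const int base_val[12] = { 0, 1, 2, 3, 4, 5, 7, 11, 19, 35, 67, 0 };

static void tokenize_value_sketch(int v, int *token, int *extra) {
  const int sign = v < 0;
  const int a = sign ? -v : v;
  int eb = sign;
  if (a > 4) {
    int j = 4;
    while (j + 1 < 11 && base_val[j + 1] <= a)  /* largest category base <= a */
      j++;
    *token = j;
    eb |= (a - base_val[j]) << 1;               /* offset above the base, shifted past the sign */
  } else {
    *token = a;                                 /* small magnitudes are their own token */
  }
  *extra = eb;
}

int main(void) {
  int t, e;
  tokenize_value_sketch(-9, &t, &e);
  /* category with base 7: extra = ((9 - 7) << 1) | 1 = 5 */
  printf("token %d extra %d\n", t, e);
  return 0;
}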
--- a/vp8/encoder/tokenize.h
+++ /dev/null
@@ -1,59 +1,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#ifndef tokenize_h
-#define tokenize_h
-
-#include "vp8/common/entropy.h"
-#include "block.h"
-
-void vp9_tokenize_initialize();
-
-typedef struct {
- short Token;
- short Extra;
-} TOKENVALUE;
-
-typedef struct {
- const vp9_prob *context_tree;
- short Extra;
- unsigned char Token;
- unsigned char skip_eob_node;
-} TOKENEXTRA;
-
-int rd_cost_mby(MACROBLOCKD *);
-
-extern int vp9_mby_is_skippable_4x4(MACROBLOCKD *xd, int has_y2_block);
-extern int vp9_mbuv_is_skippable_4x4(MACROBLOCKD *xd);
-extern int vp9_mby_is_skippable_8x8(MACROBLOCKD *xd, int has_y2_block);
-extern int vp9_mbuv_is_skippable_8x8(MACROBLOCKD *xd);
-extern int vp9_mby_is_skippable_16x16(MACROBLOCKD *xd);
-
-#ifdef ENTROPY_STATS
-void init_context_counters();
-void print_context_counters();
-
-extern INT64 context_counters[BLOCK_TYPES][COEF_BANDS]
- [PREV_COEF_CONTEXTS][MAX_ENTROPY_TOKENS];
-extern INT64 context_counters_8x8[BLOCK_TYPES_8X8][COEF_BANDS]
- [PREV_COEF_CONTEXTS][MAX_ENTROPY_TOKENS];
-extern INT64 context_counters_16x16[BLOCK_TYPES_16X16][COEF_BANDS]
- [PREV_COEF_CONTEXTS][MAX_ENTROPY_TOKENS];
-#endif
-
-extern const int *vp9_dct_value_cost_ptr;
-/* TODO: The Token field should be broken out into a separate char array to
- * improve cache locality, since it's needed for costing when the rest of the
- * fields are not.
- */
-extern const TOKENVALUE *vp9_dct_value_tokens_ptr;
-
-#endif /* tokenize_h */
--- a/vp8/encoder/treewriter.c
+++ /dev/null
@@ -1,39 +1,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include "treewriter.h"
-
-static void cost(
- int *const C,
- vp9_tree T,
- const vp9_prob *const P,
- int i,
- int c
-) {
- const vp9_prob p = P [i >> 1];
-
- do {
- const vp9_tree_index j = T[i];
- const int d = c + vp9_cost_bit(p, i & 1);
-
- if (j <= 0)
- C[-j] = d;
- else
- cost(C, T, P, j, d);
- } while (++i & 1);
-}
-void vp9_cost_tokens(int *c, const vp9_prob *p, vp9_tree t) {
- cost(c, t, p, 0, 0);
-}
-
-void vp9_cost_tokens_skip(int *c, const vp9_prob *p, vp9_tree t) {
- cost(c, t, p, 2, 0);
-}
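
The recursion in cost() above walks the token tree once and records, for every leaf, the accumulated bit cost of the branch decisions leading to it; vp9_cost_tokens starts at the root, while vp9_cost_tokens_skip starts at index 2, i.e. past the tree's first decision (the EOB branch in the coefficient tree). Below is a self-contained sketch of the same recursion on a toy three-symbol tree. It uses -256*log2(p) as a stand-in for the vp9_prob_cost table, which is an approximation for illustration only.

#include <math.h>
#include <stdio.h>

typedef int tree_index;
typedef unsigned char prob;

/* Approximate cost of one arithmetic-coded bit in 1/256-bit units.
   The real coder uses the vp9_prob_cost lookup table. */
static int cost_bit(prob p, int bit) {
  const double pr = (bit ? 256 - p : p) / 256.0;
  return (int)(-256.0 * log2(pr) + 0.5);
}

/* Same recursion as cost() in treewriter.c: non-positive tree entries are
   -symbol (leaves), positive entries index the next node pair. */
static void fill_costs(int *costs, const tree_index *tree,
                       const prob *probs, int i, int acc) {
  const prob p = probs[i >> 1];
  do {
    const tree_index j = tree[i];
    const int d = acc + cost_bit(p, i & 1);
    if (j <= 0)
      costs[-j] = d;
    else
      fill_costs(costs, tree, probs, j, d);
  } while (++i & 1);
}

int main(void) {
  /* Three symbols A(0), B(1), C(2): the root splits A vs {B, C}.
     The leaf for symbol 0 is -0 == 0. */
  static const tree_index tree[4] = { 0, 2, -1, -2 };
  static const prob probs[2] = { 128, 200 };  /* illustrative values */
  int costs[3];
  fill_costs(costs, tree, probs, 0, 0);
  printf("cost(A)=%d cost(B)=%d cost(C)=%d\n", costs[0], costs[1], costs[2]);
  return 0;
}
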
--- a/vp8/encoder/treewriter.h
+++ /dev/null
@@ -1,108 +1,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#ifndef __INC_TREEWRITER_H
-#define __INC_TREEWRITER_H
-
-/* Trees map alphabets into huffman-like codes suitable for an arithmetic
- bit coder. Timothy S Murphy 11 October 2004 */
-
-#include "vp8/common/treecoder.h"
-
-#include "boolhuff.h" /* for now */
-
-typedef BOOL_CODER vp9_writer;
-
-#define vp9_write encode_bool
-#define vp9_write_literal vp9_encode_value
-#define vp9_write_bit(W, V) vp9_write(W, V, vp9_prob_half)
-
-/* Approximate length of an encoded bool in 256ths of a bit at given prob */
-
-#define vp9_cost_zero(x) (vp9_prob_cost[x])
-#define vp9_cost_one(x) vp9_cost_zero(vp9_complement(x))
-
-#define vp9_cost_bit(x, b) vp9_cost_zero((b) ? vp9_complement(x) : (x))
-
-/* VP8BC version is scaled by 2^20 rather than 2^8; see bool_coder.h */
-
-
-/* Both of these return bits, not scaled bits. */
-
-static __inline unsigned int cost_branch(const unsigned int ct[2],
- vp9_prob p) {
- /* Imitate existing calculation */
- return ((ct[0] * vp9_cost_zero(p))
- + (ct[1] * vp9_cost_one(p))) >> 8;
-}
-
-static __inline unsigned int cost_branch256(const unsigned int ct[2],
- vp9_prob p) {
- /* Imitate existing calculation */
- return ((ct[0] * vp9_cost_zero(p))
- + (ct[1] * vp9_cost_one(p)));
-}
-
-/* Small functions to write explicit values and tokens, as well as
- estimate their lengths. */
-
-static __inline void treed_write(vp9_writer *const w,
- vp9_tree t,
- const vp9_prob *const p,
- int v,
- /* number of bits in v, assumed nonzero */
- int n) {
- vp9_tree_index i = 0;
-
- do {
- const int b = (v >> --n) & 1;
- vp9_write(w, b, p[i >> 1]);
- i = t[i + b];
- } while (n);
-}
-
-static __inline void write_token(vp9_writer *const w,
- vp9_tree t,
- const vp9_prob *const p,
- vp9_token *const x) {
- treed_write(w, t, p, x->value, x->Len);
-}
-
-static __inline int treed_cost(vp9_tree t,
- const vp9_prob *const p,
- int v,
- /* number of bits in v, assumed nonzero */
- int n) {
- int c = 0;
- vp9_tree_index i = 0;
-
- do {
- const int b = (v >> --n) & 1;
- c += vp9_cost_bit(p[i >> 1], b);
- i = t[i + b];
- } while (n);
-
- return c;
-}
-
-static __inline int cost_token(vp9_tree t,
- const vp9_prob *const p,
- vp9_token *const x) {
- return treed_cost(t, p, x->value, x->Len);
-}
-
-/* Fill array of costs for all possible token values. */
-
-void vp9_cost_tokens(int *Costs, const vp9_prob *, vp9_tree);
-
-void vp9_cost_tokens_skip(int *c, const vp9_prob *p, vp9_tree t);
-
-#endif
--- a/vp8/encoder/variance.h
+++ /dev/null
@@ -1,84 +1,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#ifndef VARIANCE_H
-#define VARIANCE_H
-
-typedef unsigned int(*vp9_sad_fn_t)(const unsigned char *src_ptr,
- int source_stride,
- const unsigned char *ref_ptr,
- int ref_stride,
- unsigned int max_sad);
-
-typedef void (*vp9_copy32xn_fn_t)(const unsigned char *src_ptr,
- int source_stride,
- const unsigned char *ref_ptr,
- int ref_stride,
- int n);
-
-typedef void (*vp9_sad_multi_fn_t)(const unsigned char *src_ptr,
- int source_stride,
- const unsigned char *ref_ptr,
- int ref_stride,
- unsigned int *sad_array);
-
-typedef void (*vp9_sad_multi1_fn_t)(const unsigned char *src_ptr,
- int source_stride,
- const unsigned char *ref_ptr,
- int ref_stride,
- unsigned short *sad_array);
-
-typedef void (*vp9_sad_multi_d_fn_t)(const unsigned char *src_ptr,
- int source_stride,
- const unsigned char * const ref_ptr[],
- int ref_stride, unsigned int *sad_array);
-
-typedef unsigned int (*vp9_variance_fn_t)(const unsigned char *src_ptr,
- int source_stride,
- const unsigned char *ref_ptr,
- int ref_stride,
- unsigned int *sse);
-
-typedef unsigned int (*vp9_subpixvariance_fn_t)(const unsigned char *src_ptr,
- int source_stride,
- int xoffset,
- int yoffset,
- const unsigned char *ref_ptr,
- int Refstride,
- unsigned int *sse);
-
-typedef void (*vp9_ssimpf_fn_t)(unsigned char *s, int sp, unsigned char *r,
- int rp, unsigned long *sum_s,
- unsigned long *sum_r, unsigned long *sum_sq_s,
- unsigned long *sum_sq_r,
- unsigned long *sum_sxr);
-
-typedef unsigned int (*vp9_getmbss_fn_t)(const short *);
-
-typedef unsigned int (*vp9_get16x16prederror_fn_t)(const unsigned char *src_ptr,
- int source_stride,
- const unsigned char *ref_ptr,
- int ref_stride);
-
-typedef struct variance_vtable {
- vp9_sad_fn_t sdf;
- vp9_variance_fn_t vf;
- vp9_subpixvariance_fn_t svf;
- vp9_variance_fn_t svf_halfpix_h;
- vp9_variance_fn_t svf_halfpix_v;
- vp9_variance_fn_t svf_halfpix_hv;
- vp9_sad_multi_fn_t sdx3f;
- vp9_sad_multi1_fn_t sdx8f;
- vp9_sad_multi_d_fn_t sdx4df;
- vp9_copy32xn_fn_t copymem;
-} vp9_variance_fn_ptr_t;
-
-#endif
--- a/vp8/encoder/variance_c.c
+++ /dev/null
@@ -1,540 +1,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include "variance.h"
-#include "vp8/common/filter.h"
-
-
-unsigned int vp9_get_mb_ss_c(const short *src_ptr) {
- unsigned int i, sum = 0;
-
- for (i = 0; i < 256; i++) {
- sum += (src_ptr[i] * src_ptr[i]);
- }
-
- return sum;
-}
-
-
-static void variance(const unsigned char *src_ptr,
- int source_stride,
- const unsigned char *ref_ptr,
- int recon_stride,
- int w,
- int h,
- unsigned int *sse,
- int *sum) {
- int i, j;
- int diff;
-
- *sum = 0;
- *sse = 0;
-
- for (i = 0; i < h; i++) {
- for (j = 0; j < w; j++) {
- diff = src_ptr[j] - ref_ptr[j];
- *sum += diff;
- *sse += diff * diff;
- }
-
- src_ptr += source_stride;
- ref_ptr += recon_stride;
- }
-}
-
-#if CONFIG_SUPERBLOCKS
-unsigned int vp9_variance32x32_c(const unsigned char *src_ptr,
- int source_stride,
- const unsigned char *ref_ptr,
- int recon_stride,
- unsigned int *sse) {
- unsigned int var;
- int avg;
-
- variance(src_ptr, source_stride, ref_ptr, recon_stride, 32, 32, &var, &avg);
- *sse = var;
- return (var - ((avg * avg) >> 10));
-}
-#endif
-
-unsigned int vp9_variance16x16_c(const unsigned char *src_ptr,
- int source_stride,
- const unsigned char *ref_ptr,
- int recon_stride,
- unsigned int *sse) {
- unsigned int var;
- int avg;
-
- variance(src_ptr, source_stride, ref_ptr, recon_stride, 16, 16, &var, &avg);
- *sse = var;
- return (var - ((avg * avg) >> 8));
-}
-
-unsigned int vp9_variance8x16_c(const unsigned char *src_ptr,
- int source_stride,
- const unsigned char *ref_ptr,
- int recon_stride,
- unsigned int *sse) {
- unsigned int var;
- int avg;
-
- variance(src_ptr, source_stride, ref_ptr, recon_stride, 8, 16, &var, &avg);
- *sse = var;
- return (var - ((avg * avg) >> 7));
-}
-
-unsigned int vp9_variance16x8_c(const unsigned char *src_ptr,
- int source_stride,
- const unsigned char *ref_ptr,
- int recon_stride,
- unsigned int *sse) {
- unsigned int var;
- int avg;
-
- variance(src_ptr, source_stride, ref_ptr, recon_stride, 16, 8, &var, &avg);
- *sse = var;
- return (var - ((avg * avg) >> 7));
-}
-
-
-unsigned int vp9_variance8x8_c(const unsigned char *src_ptr,
- int source_stride,
- const unsigned char *ref_ptr,
- int recon_stride,
- unsigned int *sse) {
- unsigned int var;
- int avg;
-
- variance(src_ptr, source_stride, ref_ptr, recon_stride, 8, 8, &var, &avg);
- *sse = var;
- return (var - ((avg * avg) >> 6));
-}
-
-unsigned int vp9_variance4x4_c(const unsigned char *src_ptr,
- int source_stride,
- const unsigned char *ref_ptr,
- int recon_stride,
- unsigned int *sse) {
- unsigned int var;
- int avg;
-
- variance(src_ptr, source_stride, ref_ptr, recon_stride, 4, 4, &var, &avg);
- *sse = var;
- return (var - ((avg * avg) >> 4));
-}
-
-
-unsigned int vp9_mse16x16_c(const unsigned char *src_ptr,
- int source_stride,
- const unsigned char *ref_ptr,
- int recon_stride,
- unsigned int *sse) {
- unsigned int var;
- int avg;
-
- variance(src_ptr, source_stride, ref_ptr, recon_stride, 16, 16, &var, &avg);
- *sse = var;
- return var;
-}
-
-
-/****************************************************************************
- *
- * ROUTINE : filter_block2d_bil_first_pass
- *
- * INPUTS : UINT8 *src_ptr : Pointer to source block.
- * UINT32 src_pixels_per_line : Stride of input block.
- * UINT32 pixel_step : Offset between filter input samples (see notes).
- * UINT32 output_height : Input block height.
- * UINT32 output_width : Input block width.
- * INT16 *vp9_filter : Array of 2 bi-linear filter taps.
- *
- * OUTPUTS : UINT16 *output_ptr : Pointer to filtered block.
- *
- * RETURNS : void
- *
- * FUNCTION : Applies a 1-D 2-tap bi-linear filter to the source block in
- * either horizontal or vertical direction to produce the
- * filtered output block. Used to implement first-pass
- * of 2-D separable filter.
- *
- * SPECIAL NOTES : Produces UINT16 output to retain precision for next pass.
- * Two filter taps should sum to VP9_FILTER_WEIGHT.
- * pixel_step defines whether the filter is applied
- * horizontally (pixel_step=1) or vertically (pixel_step=stride).
- * It defines the offset required to move from one input
- * to the next.
- *
- ****************************************************************************/
-static void var_filter_block2d_bil_first_pass(const unsigned char *src_ptr,
- unsigned short *output_ptr,
- unsigned int src_pixels_per_line,
- int pixel_step,
- unsigned int output_height,
- unsigned int output_width,
- const short *vp9_filter) {
- unsigned int i, j;
-
- for (i = 0; i < output_height; i++) {
- for (j = 0; j < output_width; j++) {
- // Apply bilinear filter
- output_ptr[j] = (((int)src_ptr[0] * vp9_filter[0]) +
- ((int)src_ptr[pixel_step] * vp9_filter[1]) +
- (VP9_FILTER_WEIGHT / 2)) >> VP9_FILTER_SHIFT;
- src_ptr++;
- }
-
- // Next row...
- src_ptr += src_pixels_per_line - output_width;
- output_ptr += output_width;
- }
-}
-
-/****************************************************************************
- *
- * ROUTINE : filter_block2d_bil_second_pass
- *
- * INPUTS : UINT16 *src_ptr : Pointer to source block.
- * UINT32 src_pixels_per_line : Stride of input block.
- * UINT32 pixel_step : Offset between filter input samples (see notes).
- * UINT32 output_height : Input block height.
- * UINT32 output_width : Input block width.
- * INT16 *vp9_filter : Array of 2 bi-linear filter taps.
- *
- * OUTPUTS : UINT8 *output_ptr : Pointer to filtered block.
- *
- * RETURNS : void
- *
- * FUNCTION : Applies a 1-D 2-tap bi-linear filter to the source block in
- * either horizontal or vertical direction to produce the
- * filtered output block. Used to implement second-pass
- * of 2-D separable filter.
- *
- * SPECIAL NOTES : Requires 16-bit input as produced by filter_block2d_bil_first_pass.
- * Two filter taps should sum to VP9_FILTER_WEIGHT.
- * pixel_step defines whether the filter is applied
- * horizontally (pixel_step=1) or vertically (pixel_step=stride).
- * It defines the offset required to move from one input
- * to the next.
- *
- ****************************************************************************/
-static void var_filter_block2d_bil_second_pass(const unsigned short *src_ptr,
- unsigned char *output_ptr,
- unsigned int src_pixels_per_line,
- unsigned int pixel_step,
- unsigned int output_height,
- unsigned int output_width,
- const short *vp9_filter) {
- unsigned int i, j;
- int Temp;
-
- for (i = 0; i < output_height; i++) {
- for (j = 0; j < output_width; j++) {
- // Apply filter
- Temp = ((int)src_ptr[0] * vp9_filter[0]) +
- ((int)src_ptr[pixel_step] * vp9_filter[1]) +
- (VP9_FILTER_WEIGHT / 2);
- output_ptr[j] = (unsigned int)(Temp >> VP9_FILTER_SHIFT);
- src_ptr++;
- }
-
- // Next row...
- src_ptr += src_pixels_per_line - output_width;
- output_ptr += output_width;
- }
-}
-
-
-unsigned int vp9_sub_pixel_variance4x4_c(const unsigned char *src_ptr,
- int src_pixels_per_line,
- int xoffset,
- int yoffset,
- const unsigned char *dst_ptr,
- int dst_pixels_per_line,
- unsigned int *sse) {
- unsigned char temp2[20 * 16];
- const short *HFilter, *VFilter;
- unsigned short FData3[5 * 4]; // Temp data buffer used in filtering
-
- HFilter = vp9_bilinear_filters[xoffset];
- VFilter = vp9_bilinear_filters[yoffset];
-
- // First filter 1d Horizontal
- var_filter_block2d_bil_first_pass(src_ptr, FData3, src_pixels_per_line, 1, 5, 4, HFilter);
-
- // Now filter Vertically
- var_filter_block2d_bil_second_pass(FData3, temp2, 4, 4, 4, 4, VFilter);
-
- return vp9_variance4x4_c(temp2, 4, dst_ptr, dst_pixels_per_line, sse);
-}
-
-
-unsigned int vp9_sub_pixel_variance8x8_c(const unsigned char *src_ptr,
- int src_pixels_per_line,
- int xoffset,
- int yoffset,
- const unsigned char *dst_ptr,
- int dst_pixels_per_line,
- unsigned int *sse) {
- unsigned short FData3[9 * 8]; // Temp data buffer used in filtering
- unsigned char temp2[20 * 16];
- const short *HFilter, *VFilter;
-
- HFilter = vp9_bilinear_filters[xoffset];
- VFilter = vp9_bilinear_filters[yoffset];
-
- var_filter_block2d_bil_first_pass(src_ptr, FData3, src_pixels_per_line, 1, 9, 8, HFilter);
- var_filter_block2d_bil_second_pass(FData3, temp2, 8, 8, 8, 8, VFilter);
-
- return vp9_variance8x8_c(temp2, 8, dst_ptr, dst_pixels_per_line, sse);
-}
-
-unsigned int vp9_sub_pixel_variance16x16_c(const unsigned char *src_ptr,
- int src_pixels_per_line,
- int xoffset,
- int yoffset,
- const unsigned char *dst_ptr,
- int dst_pixels_per_line,
- unsigned int *sse) {
- unsigned short FData3[17 * 16]; // Temp data buffer used in filtering
- unsigned char temp2[20 * 16];
- const short *HFilter, *VFilter;
-
- HFilter = vp9_bilinear_filters[xoffset];
- VFilter = vp9_bilinear_filters[yoffset];
-
- var_filter_block2d_bil_first_pass(src_ptr, FData3, src_pixels_per_line, 1, 17, 16, HFilter);
- var_filter_block2d_bil_second_pass(FData3, temp2, 16, 16, 16, 16, VFilter);
-
- return vp9_variance16x16_c(temp2, 16, dst_ptr, dst_pixels_per_line, sse);
-}
-
-#if CONFIG_SUPERBLOCKS
-unsigned int vp9_sub_pixel_variance32x32_c(const unsigned char *src_ptr,
- int src_pixels_per_line,
- int xoffset,
- int yoffset,
- const unsigned char *dst_ptr,
- int dst_pixels_per_line,
- unsigned int *sse) {
- unsigned short FData3[33 * 32]; // Temp data buffer used in filtering
- unsigned char temp2[36 * 32];
- const short *HFilter, *VFilter;
-
- HFilter = vp9_bilinear_filters[xoffset];
- VFilter = vp9_bilinear_filters[yoffset];
-
- var_filter_block2d_bil_first_pass(src_ptr, FData3, src_pixels_per_line, 1, 33, 32, HFilter);
- var_filter_block2d_bil_second_pass(FData3, temp2, 32, 32, 32, 32, VFilter);
-
- return vp9_variance32x32_c(temp2, 32, dst_ptr, dst_pixels_per_line, sse);
-}
-#endif
-
-unsigned int vp9_variance_halfpixvar16x16_h_c(const unsigned char *src_ptr,
- int source_stride,
- const unsigned char *ref_ptr,
- int recon_stride,
- unsigned int *sse) {
- return vp9_sub_pixel_variance16x16_c(src_ptr, source_stride, 8, 0,
- ref_ptr, recon_stride, sse);
-}
-
-#if CONFIG_SUPERBLOCKS
-unsigned int vp9_variance_halfpixvar32x32_h_c(const unsigned char *src_ptr,
- int source_stride,
- const unsigned char *ref_ptr,
- int recon_stride,
- unsigned int *sse) {
- return vp9_sub_pixel_variance32x32_c(src_ptr, source_stride, 8, 0,
- ref_ptr, recon_stride, sse);
-}
-#endif
-
-
-unsigned int vp9_variance_halfpixvar16x16_v_c(const unsigned char *src_ptr,
- int source_stride,
- const unsigned char *ref_ptr,
- int recon_stride,
- unsigned int *sse) {
- return vp9_sub_pixel_variance16x16_c(src_ptr, source_stride, 0, 8,
- ref_ptr, recon_stride, sse);
-}
-
-#if CONFIG_SUPERBLOCKS
-unsigned int vp9_variance_halfpixvar32x32_v_c(const unsigned char *src_ptr,
- int source_stride,
- const unsigned char *ref_ptr,
- int recon_stride,
- unsigned int *sse) {
- return vp9_sub_pixel_variance32x32_c(src_ptr, source_stride, 0, 8,
- ref_ptr, recon_stride, sse);
-}
-#endif
-
-unsigned int vp9_variance_halfpixvar16x16_hv_c(const unsigned char *src_ptr,
- int source_stride,
- const unsigned char *ref_ptr,
- int recon_stride,
- unsigned int *sse) {
- return vp9_sub_pixel_variance16x16_c(src_ptr, source_stride, 8, 8,
- ref_ptr, recon_stride, sse);
-}
-
-#if CONFIG_SUPERBLOCKS
-unsigned int vp9_variance_halfpixvar32x32_hv_c(const unsigned char *src_ptr,
- int source_stride,
- const unsigned char *ref_ptr,
- int recon_stride,
- unsigned int *sse) {
- return vp9_sub_pixel_variance32x32_c(src_ptr, source_stride, 8, 8,
- ref_ptr, recon_stride, sse);
-}
-#endif
-
-unsigned int vp9_sub_pixel_mse16x16_c(const unsigned char *src_ptr,
- int src_pixels_per_line,
- int xoffset,
- int yoffset,
- const unsigned char *dst_ptr,
- int dst_pixels_per_line,
- unsigned int *sse) {
- vp9_sub_pixel_variance16x16_c(src_ptr, src_pixels_per_line,
- xoffset, yoffset, dst_ptr,
- dst_pixels_per_line, sse);
- return *sse;
-}
-
-#if CONFIG_SUPERBLOCKS
-unsigned int vp9_sub_pixel_mse32x32_c(const unsigned char *src_ptr,
- int src_pixels_per_line,
- int xoffset,
- int yoffset,
- const unsigned char *dst_ptr,
- int dst_pixels_per_line,
- unsigned int *sse) {
- vp9_sub_pixel_variance32x32_c(src_ptr, src_pixels_per_line,
- xoffset, yoffset, dst_ptr,
- dst_pixels_per_line, sse);
- return *sse;
-}
-#endif
-
-unsigned int vp9_sub_pixel_variance16x8_c(const unsigned char *src_ptr,
- int src_pixels_per_line,
- int xoffset,
- int yoffset,
- const unsigned char *dst_ptr,
- int dst_pixels_per_line,
- unsigned int *sse) {
- unsigned short FData3[16 * 9]; // Temp data buffer used in filtering
- unsigned char temp2[20 * 16];
- const short *HFilter, *VFilter;
-
- HFilter = vp9_bilinear_filters[xoffset];
- VFilter = vp9_bilinear_filters[yoffset];
-
- var_filter_block2d_bil_first_pass(src_ptr, FData3, src_pixels_per_line, 1, 9, 16, HFilter);
- var_filter_block2d_bil_second_pass(FData3, temp2, 16, 16, 8, 16, VFilter);
-
- return vp9_variance16x8_c(temp2, 16, dst_ptr, dst_pixels_per_line, sse);
-}
-
-unsigned int vp9_sub_pixel_variance8x16_c(const unsigned char *src_ptr,
- int src_pixels_per_line,
- int xoffset,
- int yoffset,
- const unsigned char *dst_ptr,
- int dst_pixels_per_line,
- unsigned int *sse) {
- unsigned short FData3[9 * 16]; // Temp data buffer used in filtering
- unsigned char temp2[20 * 16];
- const short *HFilter, *VFilter;
-
- HFilter = vp9_bilinear_filters[xoffset];
- VFilter = vp9_bilinear_filters[yoffset];
-
- var_filter_block2d_bil_first_pass(src_ptr, FData3, src_pixels_per_line,
- 1, 17, 8, HFilter);
- var_filter_block2d_bil_second_pass(FData3, temp2, 8, 8, 16, 8, VFilter);
-
- return vp9_variance8x16_c(temp2, 8, dst_ptr, dst_pixels_per_line, sse);
-}
-
-#if CONFIG_NEWBESTREFMV
-unsigned int vp9_variance2x16_c(const unsigned char *src_ptr,
- const int source_stride,
- const unsigned char *ref_ptr,
- const int recon_stride,
- unsigned int *sse) {
- unsigned int var;
- int avg;
-
- variance(src_ptr, source_stride, ref_ptr, recon_stride, 2, 16, &var, &avg);
- *sse = var;
- return (var - ((avg * avg) >> 5));
-}
-
-unsigned int vp9_variance16x2_c(const unsigned char *src_ptr,
- const int source_stride,
- const unsigned char *ref_ptr,
- const int recon_stride,
- unsigned int *sse) {
- unsigned int var;
- int avg;
-
- variance(src_ptr, source_stride, ref_ptr, recon_stride, 16, 2, &var, &avg);
- *sse = var;
- return (var - ((avg * avg) >> 5));
-}
-
-unsigned int vp9_sub_pixel_variance16x2_c(const unsigned char *src_ptr,
- const int src_pixels_per_line,
- const int xoffset,
- const int yoffset,
- const unsigned char *dst_ptr,
- const int dst_pixels_per_line,
- unsigned int *sse) {
- unsigned short FData3[16 * 3]; // Temp data buffer used in filtering
- unsigned char temp2[20 * 16];
- const short *HFilter, *VFilter;
-
- HFilter = vp9_bilinear_filters[xoffset];
- VFilter = vp9_bilinear_filters[yoffset];
-
- var_filter_block2d_bil_first_pass(src_ptr, FData3,
- src_pixels_per_line, 1, 3, 16, HFilter);
- var_filter_block2d_bil_second_pass(FData3, temp2, 16, 16, 2, 16, VFilter);
-
- return vp9_variance16x2_c(temp2, 16, dst_ptr, dst_pixels_per_line, sse);
-}
-
-unsigned int vp9_sub_pixel_variance2x16_c(const unsigned char *src_ptr,
- const int src_pixels_per_line,
- const int xoffset,
- const int yoffset,
- const unsigned char *dst_ptr,
- const int dst_pixels_per_line,
- unsigned int *sse) {
- unsigned short FData3[2 * 17]; // Temp data buffer used in filtering
- unsigned char temp2[2 * 16];
- const short *HFilter, *VFilter;
-
- HFilter = vp9_bilinear_filters[xoffset];
- VFilter = vp9_bilinear_filters[yoffset];
-
- var_filter_block2d_bil_first_pass(src_ptr, FData3,
- src_pixels_per_line, 1, 17, 2, HFilter);
- var_filter_block2d_bil_second_pass(FData3, temp2, 2, 2, 16, 2, VFilter);
-
- return vp9_variance2x16_c(temp2, 2, dst_ptr, dst_pixels_per_line, sse);
-}
-#endif
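
All of the plain variance functions above compute the same identity, Var = SSE - Sum^2/N, where the final shift (>>4 for 4x4, >>6 for 8x8, >>8 for 16x16, and so on) is log2 of the block's pixel count; the sub-pixel variants simply run the two-pass bilinear filter first and then hand the filtered block to the matching plain function. The small standalone check below illustrates the identity on a 4x4 block; it is illustrative only and not part of the library.

#include <stdio.h>

/* Same kernel as the static variance() helper above, restated verbatim. */
static void variance(const unsigned char *src, int src_stride,
                     const unsigned char *ref, int ref_stride,
                     int w, int h, unsigned int *sse, int *sum) {
  int i, j;
  *sse = 0;
  *sum = 0;
  for (i = 0; i < h; i++) {
    for (j = 0; j < w; j++) {
      const int diff = src[j] - ref[j];
      *sum += diff;
      *sse += diff * diff;
    }
    src += src_stride;
    ref += ref_stride;
  }
}

int main(void) {
  const unsigned char src[16] = { 10, 12,  9, 11, 10, 10, 12,  9,
                                  11, 10, 13,  8, 10, 11,  9, 12 };
  const unsigned char ref[16] = { 10, 10, 10, 10, 10, 10, 10, 10,
                                  10, 10, 10, 10, 10, 10, 10, 10 };
  unsigned int sse;
  int sum;
  variance(src, 4, ref, 4, 4, 4, &sse, &sum);
  /* 4x4 block: 16 pixels, so Sum^2 is divided by 16, i.e. shifted by 4. */
  printf("sse=%u sum=%d var=%u\n", sse, sum, sse - ((sum * sum) >> 4));
  return 0;
}
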
--- a/vp8/encoder/x86/dct_mmx.asm
+++ /dev/null
@@ -1,241 +1,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
-%include "vpx_ports/x86_abi_support.asm"
-
-;void vp9_short_fdct4x4_mmx(short *input, short *output, int pitch)
-global sym(vp9_short_fdct4x4_mmx)
-sym(vp9_short_fdct4x4_mmx):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 3
- GET_GOT rbx
- push rsi
- push rdi
- ; end prolog
-
- mov rsi, arg(0) ; input
- mov rdi, arg(1) ; output
-
- movsxd rax, dword ptr arg(2) ;pitch
-
- lea rcx, [rsi + rax*2]
- ; read the input data
- movq mm0, [rsi]
- movq mm1, [rsi + rax]
-
- movq mm2, [rcx]
- movq mm4, [rcx + rax]
-
- ; transpose for the first stage
- movq mm3, mm0 ; 00 01 02 03
- movq mm5, mm2 ; 20 21 22 23
-
- punpcklwd mm0, mm1 ; 00 10 01 11
- punpckhwd mm3, mm1 ; 02 12 03 13
-
- punpcklwd mm2, mm4 ; 20 30 21 31
- punpckhwd mm5, mm4 ; 22 32 23 33
-
- movq mm1, mm0 ; 00 10 01 11
- punpckldq mm0, mm2 ; 00 10 20 30
-
- punpckhdq mm1, mm2 ; 01 11 21 31
-
- movq mm2, mm3 ; 02 12 03 13
- punpckldq mm2, mm5 ; 02 12 22 32
-
- punpckhdq mm3, mm5 ; 03 13 23 33
-
- ; mm0 0
- ; mm1 1
- ; mm2 2
- ; mm3 3
-
- ; first stage
- movq mm5, mm0
- movq mm4, mm1
-
- paddw mm0, mm3 ; a1 = 0 + 3
- paddw mm1, mm2 ; b1 = 1 + 2
-
- psubw mm4, mm2 ; c1 = 1 - 2
- psubw mm5, mm3 ; d1 = 0 - 3
-
- psllw mm5, 3
- psllw mm4, 3
-
- psllw mm0, 3
- psllw mm1, 3
-
- ; output 0 and 2
- movq mm2, mm0 ; a1
-
- paddw mm0, mm1 ; op[0] = a1 + b1
- psubw mm2, mm1 ; op[2] = a1 - b1
-
- ; output 1 and 3
- ; interleave c1, d1
- movq mm1, mm5 ; d1
- punpcklwd mm1, mm4 ; c1 d1
- punpckhwd mm5, mm4 ; c1 d1
-
- movq mm3, mm1
- movq mm4, mm5
-
- pmaddwd mm1, MMWORD PTR[GLOBAL (_5352_2217)] ; c1*2217 + d1*5352
- pmaddwd mm4, MMWORD PTR[GLOBAL (_5352_2217)] ; c1*2217 + d1*5352
-
- pmaddwd mm3, MMWORD PTR[GLOBAL(_2217_neg5352)] ; d1*2217 - c1*5352
- pmaddwd mm5, MMWORD PTR[GLOBAL(_2217_neg5352)] ; d1*2217 - c1*5352
-
- paddd mm1, MMWORD PTR[GLOBAL(_14500)]
- paddd mm4, MMWORD PTR[GLOBAL(_14500)]
- paddd mm3, MMWORD PTR[GLOBAL(_7500)]
- paddd mm5, MMWORD PTR[GLOBAL(_7500)]
-
- psrad mm1, 12 ; (c1 * 2217 + d1 * 5352 + 14500)>>12
- psrad mm4, 12 ; (c1 * 2217 + d1 * 5352 + 14500)>>12
- psrad mm3, 12 ; (d1 * 2217 - c1 * 5352 + 7500)>>12
- psrad mm5, 12 ; (d1 * 2217 - c1 * 5352 + 7500)>>12
-
- packssdw mm1, mm4 ; op[1]
- packssdw mm3, mm5 ; op[3]
-
- ; done with vertical
- ; transpose for the second stage
- movq mm4, mm0 ; 00 10 20 30
- movq mm5, mm2 ; 02 12 22 32
-
- punpcklwd mm0, mm1 ; 00 01 10 11
- punpckhwd mm4, mm1 ; 20 21 30 31
-
- punpcklwd mm2, mm3 ; 02 03 12 13
- punpckhwd mm5, mm3 ; 22 23 32 33
-
- movq mm1, mm0 ; 00 01 10 11
- punpckldq mm0, mm2 ; 00 01 02 03
-
- punpckhdq mm1, mm2 ; 01 22 12 13
-
- movq mm2, mm4 ; 20 31 30 31
- punpckldq mm2, mm5 ; 20 21 22 23
-
- punpckhdq mm4, mm5 ; 30 31 32 33
-
- ; mm0 0
- ; mm1 1
- ; mm2 2
- ; mm3 4
-
- movq mm5, mm0
- movq mm3, mm1
-
- paddw mm0, mm4 ; a1 = 0 + 3
- paddw mm1, mm2 ; b1 = 1 + 2
-
- psubw mm3, mm2 ; c1 = 1 - 2
- psubw mm5, mm4 ; d1 = 0 - 3
-
- pxor mm6, mm6 ; zero out for compare
-
- pcmpeqw mm6, mm5 ; d1 != 0
-
- pandn mm6, MMWORD PTR[GLOBAL(_cmp_mask)] ; clear upper,
- ; and keep bit 0 of lower
-
- ; output 0 and 2
- movq mm2, mm0 ; a1
-
- paddw mm0, mm1 ; a1 + b1
- psubw mm2, mm1 ; a1 - b1
-
- paddw mm0, MMWORD PTR[GLOBAL(_7w)]
- paddw mm2, MMWORD PTR[GLOBAL(_7w)]
-
- psraw mm0, 4 ; op[0] = (a1 + b1 + 7)>>4
- psraw mm2, 4 ; op[8] = (a1 - b1 + 7)>>4
-
- movq MMWORD PTR[rdi + 0 ], mm0
- movq MMWORD PTR[rdi + 16], mm2
-
- ; output 1 and 3
- ; interleave c1, d1
- movq mm1, mm5 ; d1
- punpcklwd mm1, mm3 ; c1 d1
- punpckhwd mm5, mm3 ; c1 d1
-
- movq mm3, mm1
- movq mm4, mm5
-
- pmaddwd mm1, MMWORD PTR[GLOBAL (_5352_2217)] ; c1*2217 + d1*5352
- pmaddwd mm4, MMWORD PTR[GLOBAL (_5352_2217)] ; c1*2217 + d1*5352
-
- pmaddwd mm3, MMWORD PTR[GLOBAL(_2217_neg5352)] ; d1*2217 - c1*5352
- pmaddwd mm5, MMWORD PTR[GLOBAL(_2217_neg5352)] ; d1*2217 - c1*5352
-
- paddd mm1, MMWORD PTR[GLOBAL(_12000)]
- paddd mm4, MMWORD PTR[GLOBAL(_12000)]
- paddd mm3, MMWORD PTR[GLOBAL(_51000)]
- paddd mm5, MMWORD PTR[GLOBAL(_51000)]
-
- psrad mm1, 16 ; (c1 * 2217 + d1 * 5352 + 12000)>>16
- psrad mm4, 16 ; (c1 * 2217 + d1 * 5352 + 12000)>>16
- psrad mm3, 16 ; (d1 * 2217 - c1 * 5352 + 51000)>>16
- psrad mm5, 16 ; (d1 * 2217 - c1 * 5352 + 51000)>>16
-
- packssdw mm1, mm4 ; op[4]
- packssdw mm3, mm5 ; op[12]
-
- paddw mm1, mm6 ; op[4] += (d1!=0)
-
- movq MMWORD PTR[rdi + 8 ], mm1
- movq MMWORD PTR[rdi + 24], mm3
-
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_GOT
- UNSHADOW_ARGS
- pop rbp
- ret
-
-SECTION_RODATA
-align 8
-_5352_2217:
- dw 5352
- dw 2217
- dw 5352
- dw 2217
-align 8
-_2217_neg5352:
- dw 2217
- dw -5352
- dw 2217
- dw -5352
-align 8
-_cmp_mask:
- times 4 dw 1
-align 8
-_7w:
- times 4 dw 7
-align 8
-_14500:
- times 2 dd 14500
-align 8
-_7500:
- times 2 dd 7500
-align 8
-_12000:
- times 2 dd 12000
-align 8
-_51000:
- times 2 dd 51000
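
For readers following the MMX code above (and the SSE2 version that follows), a plain C restatement of the same 4x4 forward DCT may help. It mirrors the constants and rounding annotated in the comments: 2217/5352 multipliers, +14500/+7500 and >>12 in the first pass, +12000/+51000 and >>16 plus the (d1 != 0) correction in the second. This is a readability sketch patterned on the classic VP8 C reference, not the function the build actually dispatches to.

/* pitch is in bytes, as in the assembly; input rows are pitch/2 shorts apart. */
static void short_fdct4x4_ref(const short *input, short *output, int pitch) {
  int i;
  const short *ip = input;
  short *op = output;

  /* first pass: transform the four rows */
  for (i = 0; i < 4; i++) {
    const int a1 = (ip[0] + ip[3]) << 3;
    const int b1 = (ip[1] + ip[2]) << 3;
    const int c1 = (ip[1] - ip[2]) << 3;
    const int d1 = (ip[0] - ip[3]) << 3;

    op[0] = (short)(a1 + b1);
    op[2] = (short)(a1 - b1);
    op[1] = (short)((c1 * 2217 + d1 * 5352 + 14500) >> 12);
    op[3] = (short)((d1 * 2217 - c1 * 5352 + 7500) >> 12);

    ip += pitch / 2;
    op += 4;
  }

  /* second pass: transform the columns of the intermediate result */
  ip = output;
  op = output;
  for (i = 0; i < 4; i++) {
    const int a1 = ip[0] + ip[12];
    const int b1 = ip[4] + ip[8];
    const int c1 = ip[4] - ip[8];
    const int d1 = ip[0] - ip[12];

    op[0]  = (short)((a1 + b1 + 7) >> 4);
    op[8]  = (short)((a1 - b1 + 7) >> 4);
    op[4]  = (short)(((c1 * 2217 + d1 * 5352 + 12000) >> 16) + (d1 != 0));
    op[12] = (short)((d1 * 2217 - c1 * 5352 + 51000) >> 16);

    ip++;
    op++;
  }
}
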
--- a/vp8/encoder/x86/dct_sse2.asm
+++ /dev/null
@@ -1,432 +1,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
-%include "vpx_ports/x86_abi_support.asm"
-
-%macro STACK_FRAME_CREATE 0
-%if ABI_IS_32BIT
- %define input rsi
- %define output rdi
- %define pitch rax
- push rbp
- mov rbp, rsp
- GET_GOT rbx
- push rsi
- push rdi
- ; end prolog
-
- mov rsi, arg(0)
- mov rdi, arg(1)
-
- movsxd rax, dword ptr arg(2)
- lea rcx, [rsi + rax*2]
-%else
- %ifidn __OUTPUT_FORMAT__,x64
- %define input rcx
- %define output rdx
- %define pitch r8
- SAVE_XMM 7, u
- %else
- %define input rdi
- %define output rsi
- %define pitch rdx
- %endif
-%endif
-%endmacro
-
-%macro STACK_FRAME_DESTROY 0
- %define input
- %define output
- %define pitch
-
-%if ABI_IS_32BIT
- pop rdi
- pop rsi
- RESTORE_GOT
- pop rbp
-%else
- %ifidn __OUTPUT_FORMAT__,x64
- RESTORE_XMM
- %endif
-%endif
- ret
-%endmacro
-
-;void vp9_short_fdct4x4_sse2(short *input, short *output, int pitch)
-global sym(vp9_short_fdct4x4_sse2)
-sym(vp9_short_fdct4x4_sse2):
-
- STACK_FRAME_CREATE
-
- movq xmm0, MMWORD PTR[input ] ;03 02 01 00
- movq xmm2, MMWORD PTR[input+ pitch] ;13 12 11 10
- lea input, [input+2*pitch]
- movq xmm1, MMWORD PTR[input ] ;23 22 21 20
- movq xmm3, MMWORD PTR[input+ pitch] ;33 32 31 30
-
- punpcklqdq xmm0, xmm2 ;13 12 11 10 03 02 01 00
- punpcklqdq xmm1, xmm3 ;33 32 31 30 23 22 21 20
-
- movdqa xmm2, xmm0
- punpckldq xmm0, xmm1 ;23 22 03 02 21 20 01 00
- punpckhdq xmm2, xmm1 ;33 32 13 12 31 30 11 10
- movdqa xmm1, xmm0
- punpckldq xmm0, xmm2 ;31 21 30 20 11 10 01 00
- pshufhw xmm1, xmm1, 0b1h ;22 23 02 03 xx xx xx xx
- pshufhw xmm2, xmm2, 0b1h ;32 33 12 13 xx xx xx xx
-
- punpckhdq xmm1, xmm2 ;32 33 22 23 12 13 02 03
- movdqa xmm3, xmm0
- paddw xmm0, xmm1 ;b1 a1 b1 a1 b1 a1 b1 a1
- psubw xmm3, xmm1 ;c1 d1 c1 d1 c1 d1 c1 d1
- psllw xmm0, 3 ;b1 <<= 3 a1 <<= 3
- psllw xmm3, 3 ;c1 <<= 3 d1 <<= 3
-
- movdqa xmm1, xmm0
- pmaddwd xmm0, XMMWORD PTR[GLOBAL(_mult_add)] ;a1 + b1
- pmaddwd xmm1, XMMWORD PTR[GLOBAL(_mult_sub)] ;a1 - b1
- movdqa xmm4, xmm3
- pmaddwd xmm3, XMMWORD PTR[GLOBAL(_5352_2217)] ;c1*2217 + d1*5352
- pmaddwd xmm4, XMMWORD PTR[GLOBAL(_2217_neg5352)];d1*2217 - c1*5352
-
- paddd xmm3, XMMWORD PTR[GLOBAL(_14500)]
- paddd xmm4, XMMWORD PTR[GLOBAL(_7500)]
- psrad xmm3, 12 ;(c1 * 2217 + d1 * 5352 + 14500)>>12
- psrad xmm4, 12 ;(d1 * 2217 - c1 * 5352 + 7500)>>12
-
- packssdw xmm0, xmm1 ;op[2] op[0]
- packssdw xmm3, xmm4 ;op[3] op[1]
- ; 23 22 21 20 03 02 01 00
- ;
- ; 33 32 31 30 13 12 11 10
- ;
- movdqa xmm2, xmm0
- punpcklqdq xmm0, xmm3 ;13 12 11 10 03 02 01 00
- punpckhqdq xmm2, xmm3 ;23 22 21 20 33 32 31 30
-
- movdqa xmm3, xmm0
- punpcklwd xmm0, xmm2 ;32 30 22 20 12 10 02 00
- punpckhwd xmm3, xmm2 ;33 31 23 21 13 11 03 01
- movdqa xmm2, xmm0
- punpcklwd xmm0, xmm3 ;13 12 11 10 03 02 01 00
- punpckhwd xmm2, xmm3 ;33 32 31 30 23 22 21 20
-
- movdqa xmm5, XMMWORD PTR[GLOBAL(_7)]
- pshufd xmm2, xmm2, 04eh
- movdqa xmm3, xmm0
- paddw xmm0, xmm2 ;b1 b1 b1 b1 a1 a1 a1 a1
- psubw xmm3, xmm2 ;c1 c1 c1 c1 d1 d1 d1 d1
-
- pshufd xmm0, xmm0, 0d8h ;b1 b1 a1 a1 b1 b1 a1 a1
- movdqa xmm2, xmm3 ;save d1 for compare
- pshufd xmm3, xmm3, 0d8h ;c1 c1 d1 d1 c1 c1 d1 d1
- pshuflw xmm0, xmm0, 0d8h ;b1 b1 a1 a1 b1 a1 b1 a1
- pshuflw xmm3, xmm3, 0d8h ;c1 c1 d1 d1 c1 d1 c1 d1
- pshufhw xmm0, xmm0, 0d8h ;b1 a1 b1 a1 b1 a1 b1 a1
- pshufhw xmm3, xmm3, 0d8h ;c1 d1 c1 d1 c1 d1 c1 d1
- movdqa xmm1, xmm0
- pmaddwd xmm0, XMMWORD PTR[GLOBAL(_mult_add)] ;a1 + b1
- pmaddwd xmm1, XMMWORD PTR[GLOBAL(_mult_sub)] ;a1 - b1
-
- pxor xmm4, xmm4 ;zero out for compare
- paddd xmm0, xmm5
- paddd xmm1, xmm5
- pcmpeqw xmm2, xmm4
- psrad xmm0, 4 ;(a1 + b1 + 7)>>4
- psrad xmm1, 4 ;(a1 - b1 + 7)>>4
- pandn xmm2, XMMWORD PTR[GLOBAL(_cmp_mask)] ;clear upper,
- ;and keep bit 0 of lower
-
- movdqa xmm4, xmm3
- pmaddwd xmm3, XMMWORD PTR[GLOBAL(_5352_2217)] ;c1*2217 + d1*5352
- pmaddwd xmm4, XMMWORD PTR[GLOBAL(_2217_neg5352)] ;d1*2217 - c1*5352
- paddd xmm3, XMMWORD PTR[GLOBAL(_12000)]
- paddd xmm4, XMMWORD PTR[GLOBAL(_51000)]
- packssdw xmm0, xmm1 ;op[8] op[0]
- psrad xmm3, 16 ;(c1 * 2217 + d1 * 5352 + 12000)>>16
- psrad xmm4, 16 ;(d1 * 2217 - c1 * 5352 + 51000)>>16
-
- packssdw xmm3, xmm4 ;op[12] op[4]
- movdqa xmm1, xmm0
- paddw xmm3, xmm2 ;op[4] += (d1!=0)
- punpcklqdq xmm0, xmm3 ;op[4] op[0]
- punpckhqdq xmm1, xmm3 ;op[12] op[8]
-
- movdqa XMMWORD PTR[output + 0], xmm0
- movdqa XMMWORD PTR[output + 16], xmm1
-
- STACK_FRAME_DESTROY
-
-;void vp9_short_fdct8x4_sse2(short *input, short *output, int pitch)
-global sym(vp9_short_fdct8x4_sse2)
-sym(vp9_short_fdct8x4_sse2):
-
- STACK_FRAME_CREATE
-
- ; read the input data
- movdqa xmm0, [input ]
- movdqa xmm2, [input+ pitch]
- lea input, [input+2*pitch]
- movdqa xmm4, [input ]
- movdqa xmm3, [input+ pitch]
-
- ; transpose for the first stage
- movdqa xmm1, xmm0 ; 00 01 02 03 04 05 06 07
- movdqa xmm5, xmm4 ; 20 21 22 23 24 25 26 27
-
- punpcklwd xmm0, xmm2 ; 00 10 01 11 02 12 03 13
- punpckhwd xmm1, xmm2 ; 04 14 05 15 06 16 07 17
-
- punpcklwd xmm4, xmm3 ; 20 30 21 31 22 32 23 33
- punpckhwd xmm5, xmm3 ; 24 34 25 35 26 36 27 37
-
- movdqa xmm2, xmm0 ; 00 10 01 11 02 12 03 13
- punpckldq xmm0, xmm4 ; 00 10 20 30 01 11 21 31
-
- punpckhdq xmm2, xmm4 ; 02 12 22 32 03 13 23 33
-
- movdqa xmm4, xmm1 ; 04 14 05 15 06 16 07 17
- punpckldq xmm4, xmm5 ; 04 14 24 34 05 15 25 35
-
- punpckhdq xmm1, xmm5 ; 06 16 26 36 07 17 27 37
- movdqa xmm3, xmm2 ; 02 12 22 32 03 13 23 33
-
- punpckhqdq xmm3, xmm1 ; 03 13 23 33 07 17 27 37
- punpcklqdq xmm2, xmm1 ; 02 12 22 32 06 16 26 36
-
- movdqa xmm1, xmm0 ; 00 10 20 30 01 11 21 31
- punpcklqdq xmm0, xmm4 ; 00 10 20 30 04 14 24 34
-
- punpckhqdq xmm1, xmm4 ; 01 11 21 32 05 15 25 35
-
- ; xmm0 0
- ; xmm1 1
- ; xmm2 2
- ; xmm3 3
-
- ; first stage
- movdqa xmm5, xmm0
- movdqa xmm4, xmm1
-
- paddw xmm0, xmm3 ; a1 = 0 + 3
- paddw xmm1, xmm2 ; b1 = 1 + 2
-
- psubw xmm4, xmm2 ; c1 = 1 - 2
- psubw xmm5, xmm3 ; d1 = 0 - 3
-
- psllw xmm5, 3
- psllw xmm4, 3
-
- psllw xmm0, 3
- psllw xmm1, 3
-
- ; output 0 and 2
- movdqa xmm2, xmm0 ; a1
-
- paddw xmm0, xmm1 ; op[0] = a1 + b1
- psubw xmm2, xmm1 ; op[2] = a1 - b1
-
- ; output 1 and 3
- ; interleave c1, d1
- movdqa xmm1, xmm5 ; d1
- punpcklwd xmm1, xmm4 ; c1 d1
- punpckhwd xmm5, xmm4 ; c1 d1
-
- movdqa xmm3, xmm1
- movdqa xmm4, xmm5
-
- pmaddwd xmm1, XMMWORD PTR[GLOBAL (_5352_2217)] ; c1*2217 + d1*5352
- pmaddwd xmm4, XMMWORD PTR[GLOBAL (_5352_2217)] ; c1*2217 + d1*5352
-
- pmaddwd xmm3, XMMWORD PTR[GLOBAL(_2217_neg5352)] ; d1*2217 - c1*5352
- pmaddwd xmm5, XMMWORD PTR[GLOBAL(_2217_neg5352)] ; d1*2217 - c1*5352
-
- paddd xmm1, XMMWORD PTR[GLOBAL(_14500)]
- paddd xmm4, XMMWORD PTR[GLOBAL(_14500)]
- paddd xmm3, XMMWORD PTR[GLOBAL(_7500)]
- paddd xmm5, XMMWORD PTR[GLOBAL(_7500)]
-
- psrad xmm1, 12 ; (c1 * 2217 + d1 * 5352 + 14500)>>12
- psrad xmm4, 12 ; (c1 * 2217 + d1 * 5352 + 14500)>>12
- psrad xmm3, 12 ; (d1 * 2217 - c1 * 5352 + 7500)>>12
- psrad xmm5, 12 ; (d1 * 2217 - c1 * 5352 + 7500)>>12
-
- packssdw xmm1, xmm4 ; op[1]
- packssdw xmm3, xmm5 ; op[3]
-
- ; done with vertical
- ; transpose for the second stage
- movdqa xmm4, xmm0 ; 00 10 20 30 04 14 24 34
- movdqa xmm5, xmm2 ; 02 12 22 32 06 16 26 36
-
- punpcklwd xmm0, xmm1 ; 00 01 10 11 20 21 30 31
- punpckhwd xmm4, xmm1 ; 04 05 14 15 24 25 34 35
-
- punpcklwd xmm2, xmm3 ; 02 03 12 13 22 23 32 33
- punpckhwd xmm5, xmm3 ; 06 07 16 17 26 27 36 37
-
- movdqa xmm1, xmm0 ; 00 01 10 11 20 21 30 31
- punpckldq xmm0, xmm2 ; 00 01 02 03 10 11 12 13
-
- punpckhdq xmm1, xmm2 ; 20 21 22 23 30 31 32 33
-
- movdqa xmm2, xmm4 ; 04 05 14 15 24 25 34 35
- punpckldq xmm2, xmm5 ; 04 05 06 07 14 15 16 17
-
- punpckhdq xmm4, xmm5 ; 24 25 26 27 34 35 36 37
- movdqa xmm3, xmm1 ; 20 21 22 23 30 31 32 33
-
- punpckhqdq xmm3, xmm4 ; 30 31 32 33 34 35 36 37
- punpcklqdq xmm1, xmm4 ; 20 21 22 23 24 25 26 27
-
- movdqa xmm4, xmm0 ; 00 01 02 03 10 11 12 13
- punpcklqdq xmm0, xmm2 ; 00 01 02 03 04 05 06 07
-
- punpckhqdq xmm4, xmm2 ; 10 11 12 13 14 15 16 17
-
- ; xmm0 0
- ; xmm1 4
- ; xmm2 1
- ; xmm3 3
-
- movdqa xmm5, xmm0
- movdqa xmm2, xmm1
-
- paddw xmm0, xmm3 ; a1 = 0 + 3
- paddw xmm1, xmm4 ; b1 = 1 + 2
-
- psubw xmm4, xmm2 ; c1 = 1 - 2
- psubw xmm5, xmm3 ; d1 = 0 - 3
-
- pxor xmm6, xmm6 ; zero out for compare
-
- pcmpeqw xmm6, xmm5 ; d1 != 0
-
- pandn xmm6, XMMWORD PTR[GLOBAL(_cmp_mask8x4)] ; clear upper,
- ; and keep bit 0 of lower
-
- ; output 0 and 2
- movdqa xmm2, xmm0 ; a1
-
- paddw xmm0, xmm1 ; a1 + b1
- psubw xmm2, xmm1 ; a1 - b1
-
- paddw xmm0, XMMWORD PTR[GLOBAL(_7w)]
- paddw xmm2, XMMWORD PTR[GLOBAL(_7w)]
-
- psraw xmm0, 4 ; op[0] = (a1 + b1 + 7)>>4
- psraw xmm2, 4 ; op[8] = (a1 - b1 + 7)>>4
-
- ; output 1 and 3
- ; interleave c1, d1
- movdqa xmm1, xmm5 ; d1
- punpcklwd xmm1, xmm4 ; c1 d1
- punpckhwd xmm5, xmm4 ; c1 d1
-
- movdqa xmm3, xmm1
- movdqa xmm4, xmm5
-
- pmaddwd xmm1, XMMWORD PTR[GLOBAL (_5352_2217)] ; c1*2217 + d1*5352
- pmaddwd xmm4, XMMWORD PTR[GLOBAL (_5352_2217)] ; c1*2217 + d1*5352
-
- pmaddwd xmm3, XMMWORD PTR[GLOBAL(_2217_neg5352)] ; d1*2217 - c1*5352
- pmaddwd xmm5, XMMWORD PTR[GLOBAL(_2217_neg5352)] ; d1*2217 - c1*5352
-
- paddd xmm1, XMMWORD PTR[GLOBAL(_12000)]
- paddd xmm4, XMMWORD PTR[GLOBAL(_12000)]
- paddd xmm3, XMMWORD PTR[GLOBAL(_51000)]
- paddd xmm5, XMMWORD PTR[GLOBAL(_51000)]
-
- psrad xmm1, 16 ; (c1 * 2217 + d1 * 5352 + 12000)>>16
- psrad xmm4, 16 ; (c1 * 2217 + d1 * 5352 + 12000)>>16
- psrad xmm3, 16 ; (d1 * 2217 - c1 * 5352 + 51000)>>16
- psrad xmm5, 16 ; (d1 * 2217 - c1 * 5352 + 51000)>>16
-
- packssdw xmm1, xmm4 ; op[4]
- packssdw xmm3, xmm5 ; op[12]
-
- paddw xmm1, xmm6 ; op[4] += (d1!=0)
-
- movdqa xmm4, xmm0
- movdqa xmm5, xmm2
-
- punpcklqdq xmm0, xmm1
- punpckhqdq xmm4, xmm1
-
- punpcklqdq xmm2, xmm3
- punpckhqdq xmm5, xmm3
-
- movdqa XMMWORD PTR[output + 0 ], xmm0
- movdqa XMMWORD PTR[output + 16], xmm2
- movdqa XMMWORD PTR[output + 32], xmm4
- movdqa XMMWORD PTR[output + 48], xmm5
-
- STACK_FRAME_DESTROY
-
-SECTION_RODATA
-align 16
-_5352_2217:
- dw 5352
- dw 2217
- dw 5352
- dw 2217
- dw 5352
- dw 2217
- dw 5352
- dw 2217
-align 16
-_2217_neg5352:
- dw 2217
- dw -5352
- dw 2217
- dw -5352
- dw 2217
- dw -5352
- dw 2217
- dw -5352
-align 16
-_mult_add:
- times 8 dw 1
-align 16
-_cmp_mask:
- times 4 dw 1
- times 4 dw 0
-align 16
-_cmp_mask8x4:
- times 8 dw 1
-align 16
-_mult_sub:
- dw 1
- dw -1
- dw 1
- dw -1
- dw 1
- dw -1
- dw 1
- dw -1
-align 16
-_7:
- times 4 dd 7
-align 16
-_7w:
- times 8 dw 7
-align 16
-_14500:
- times 4 dd 14500
-align 16
-_7500:
- times 4 dd 7500
-align 16
-_12000:
- times 4 dd 12000
-align 16
-_51000:
- times 4 dd 51000
--- a/vp8/encoder/x86/encodeopt.asm
+++ /dev/null
@@ -1,386 +1,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
-%include "vpx_ports/x86_abi_support.asm"
-
-;int vp9_block_error_xmm(short *coeff_ptr, short *dcoef_ptr)
-global sym(vp9_block_error_xmm)
-sym(vp9_block_error_xmm):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 2
- push rsi
- push rdi
- ; end prolog
-
- mov rsi, arg(0) ;coeff_ptr
- mov rdi, arg(1) ;dcoef_ptr
-
- movdqa xmm0, [rsi]
- movdqa xmm1, [rdi]
-
- movdqa xmm2, [rsi+16]
- movdqa xmm3, [rdi+16]
-
- psubw xmm0, xmm1
- psubw xmm2, xmm3
-
- pmaddwd xmm0, xmm0
- pmaddwd xmm2, xmm2
-
- paddd xmm0, xmm2
-
- pxor xmm5, xmm5
- movdqa xmm1, xmm0
-
- punpckldq xmm0, xmm5
- punpckhdq xmm1, xmm5
-
- paddd xmm0, xmm1
- movdqa xmm1, xmm0
-
- psrldq xmm0, 8
- paddd xmm0, xmm1
-
- movq rax, xmm0
-
- pop rdi
- pop rsi
- ; begin epilog
- UNSHADOW_ARGS
- pop rbp
- ret
-
-;int vp9_block_error_mmx(short *coeff_ptr, short *dcoef_ptr)
-global sym(vp9_block_error_mmx)
-sym(vp9_block_error_mmx):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 2
- push rsi
- push rdi
- ; end prolog
-
-
- mov rsi, arg(0) ;coeff_ptr
- pxor mm7, mm7
-
- mov rdi, arg(1) ;dcoef_ptr
- movq mm3, [rsi]
-
- movq mm4, [rdi]
- movq mm5, [rsi+8]
-
- movq mm6, [rdi+8]
- pxor mm1, mm1 ; from movd mm1, dc ; dc =0
-
- movq mm2, mm7
- psubw mm5, mm6
-
- por mm1, mm2
- pmaddwd mm5, mm5
-
- pcmpeqw mm1, mm7
- psubw mm3, mm4
-
- pand mm1, mm3
- pmaddwd mm1, mm1
-
- paddd mm1, mm5
- movq mm3, [rsi+16]
-
- movq mm4, [rdi+16]
- movq mm5, [rsi+24]
-
- movq mm6, [rdi+24]
- psubw mm5, mm6
-
- pmaddwd mm5, mm5
- psubw mm3, mm4
-
- pmaddwd mm3, mm3
- paddd mm3, mm5
-
- paddd mm1, mm3
- movq mm0, mm1
-
- psrlq mm1, 32
- paddd mm0, mm1
-
- movq rax, mm0
-
- pop rdi
- pop rsi
- ; begin epilog
- UNSHADOW_ARGS
- pop rbp
- ret
-
-
-;int vp9_mbblock_error_mmx_impl(short *coeff_ptr, short *dcoef_ptr, int dc);
-global sym(vp9_mbblock_error_mmx_impl)
-sym(vp9_mbblock_error_mmx_impl):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 3
- push rsi
- push rdi
- ; end prolog
-
-
- mov rsi, arg(0) ;coeff_ptr
- pxor mm7, mm7
-
- mov rdi, arg(1) ;dcoef_ptr
- pxor mm2, mm2
-
- movd mm1, dword ptr arg(2) ;dc
- por mm1, mm2
-
- pcmpeqw mm1, mm7
- mov rcx, 16
-
-.mberror_loop_mmx:
- movq mm3, [rsi]
- movq mm4, [rdi]
-
- movq mm5, [rsi+8]
- movq mm6, [rdi+8]
-
-
- psubw mm5, mm6
- pmaddwd mm5, mm5
-
- psubw mm3, mm4
- pand mm3, mm1
-
- pmaddwd mm3, mm3
- paddd mm2, mm5
-
- paddd mm2, mm3
- movq mm3, [rsi+16]
-
- movq mm4, [rdi+16]
- movq mm5, [rsi+24]
-
- movq mm6, [rdi+24]
- psubw mm5, mm6
-
- pmaddwd mm5, mm5
- psubw mm3, mm4
-
- pmaddwd mm3, mm3
- paddd mm2, mm5
-
- paddd mm2, mm3
- add rsi, 32
-
- add rdi, 32
- sub rcx, 1
-
- jnz .mberror_loop_mmx
-
- movq mm0, mm2
- psrlq mm2, 32
-
- paddd mm0, mm2
- movq rax, mm0
-
- pop rdi
- pop rsi
- ; begin epilog
- UNSHADOW_ARGS
- pop rbp
- ret
-
-
-;int vp9_mbblock_error_xmm_impl(short *coeff_ptr, short *dcoef_ptr, int dc);
-global sym(vp9_mbblock_error_xmm_impl)
-sym(vp9_mbblock_error_xmm_impl):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 3
- SAVE_XMM 6
- push rsi
- push rdi
- ; end prolog
-
-
- mov rsi, arg(0) ;coeff_ptr
- pxor xmm6, xmm6
-
- mov rdi, arg(1) ;dcoef_ptr
- pxor xmm4, xmm4
-
- movd xmm5, dword ptr arg(2) ;dc
- por xmm5, xmm4
-
- pcmpeqw xmm5, xmm6
- mov rcx, 16
-
-.mberror_loop:
- movdqa xmm0, [rsi]
- movdqa xmm1, [rdi]
-
- movdqa xmm2, [rsi+16]
- movdqa xmm3, [rdi+16]
-
-
- psubw xmm2, xmm3
- pmaddwd xmm2, xmm2
-
- psubw xmm0, xmm1
- pand xmm0, xmm5
-
- pmaddwd xmm0, xmm0
- add rsi, 32
-
- add rdi, 32
-
- sub rcx, 1
- paddd xmm4, xmm2
-
- paddd xmm4, xmm0
- jnz .mberror_loop
-
- movdqa xmm0, xmm4
- punpckldq xmm0, xmm6
-
- punpckhdq xmm4, xmm6
- paddd xmm0, xmm4
-
- movdqa xmm1, xmm0
- psrldq xmm0, 8
-
- paddd xmm0, xmm1
- movq rax, xmm0
-
- pop rdi
- pop rsi
- ; begin epilog
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
-
-
-;int vp9_mbuverror_mmx_impl(short *s_ptr, short *d_ptr);
-global sym(vp9_mbuverror_mmx_impl)
-sym(vp9_mbuverror_mmx_impl):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 2
- push rsi
- push rdi
- ; end prolog
-
-
- mov rsi, arg(0) ;s_ptr
- mov rdi, arg(1) ;d_ptr
-
- mov rcx, 16
- pxor mm7, mm7
-
-.mbuverror_loop_mmx:
-
- movq mm1, [rsi]
- movq mm2, [rdi]
-
- psubw mm1, mm2
- pmaddwd mm1, mm1
-
-
- movq mm3, [rsi+8]
- movq mm4, [rdi+8]
-
- psubw mm3, mm4
- pmaddwd mm3, mm3
-
-
- paddd mm7, mm1
- paddd mm7, mm3
-
-
- add rsi, 16
- add rdi, 16
-
- dec rcx
- jnz .mbuverror_loop_mmx
-
- movq mm0, mm7
- psrlq mm7, 32
-
- paddd mm0, mm7
- movq rax, mm0
-
- pop rdi
- pop rsi
- ; begin epilog
- UNSHADOW_ARGS
- pop rbp
- ret
-
-
-;int vp9_mbuverror_xmm_impl(short *s_ptr, short *d_ptr);
-global sym(vp9_mbuverror_xmm_impl)
-sym(vp9_mbuverror_xmm_impl):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 2
- push rsi
- push rdi
- ; end prolog
-
-
- mov rsi, arg(0) ;s_ptr
- mov rdi, arg(1) ;d_ptr
-
- mov rcx, 16
- pxor xmm3, xmm3
-
-.mbuverror_loop:
-
- movdqa xmm1, [rsi]
- movdqa xmm2, [rdi]
-
- psubw xmm1, xmm2
- pmaddwd xmm1, xmm1
-
- paddd xmm3, xmm1
-
- add rsi, 16
- add rdi, 16
-
- dec rcx
- jnz .mbuverror_loop
-
- pxor xmm0, xmm0
- movdqa xmm1, xmm3
-
- movdqa xmm2, xmm1
- punpckldq xmm1, xmm0
-
- punpckhdq xmm2, xmm0
- paddd xmm1, xmm2
-
- movdqa xmm2, xmm1
-
- psrldq xmm1, 8
- paddd xmm1, xmm2
-
- movq rax, xmm1
-
- pop rdi
- pop rsi
- ; begin epilog
- UNSHADOW_ARGS
- pop rbp
- ret
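
The error kernels above all reduce to a sum of squared differences between the original transform coefficients and their dequantized reconstruction; the mbblock/mbuv variants iterate the same kernel over a macroblock, and the dc argument, when non-zero, masks the DC coefficient out of the sum (used when DC is coded separately by the second-order transform). Below is a scalar sketch of the per-block kernel, for reference only.

/* Sum of squared coefficient errors over one 4x4 block (16 coefficients).
   Equivalent in spirit to vp9_block_error_xmm/_mmx above; illustration only. */
static long long block_error_ref(const short *coeff, const short *dqcoeff) {
  long long err = 0;
  int i;
  for (i = 0; i < 16; i++) {
    const int diff = coeff[i] - dqcoeff[i];
    err += (long long)diff * diff;
  }
  return err;
}
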
--- a/vp8/encoder/x86/fwalsh_sse2.asm
+++ /dev/null
@@ -1,164 +1,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
-%include "vpx_ports/x86_abi_support.asm"
-
-;void vp9_short_walsh4x4_sse2(short *input, short *output, int pitch)
-global sym(vp9_short_walsh4x4_sse2)
-sym(vp9_short_walsh4x4_sse2):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 3
- SAVE_XMM 7
- GET_GOT rbx
- push rsi
- push rdi
- ; end prolog
-
- mov rsi, arg(0) ; input
- mov rdi, arg(1) ; output
- movsxd rdx, dword ptr arg(2) ; pitch
-
- ; first for loop
- movq xmm0, MMWORD PTR [rsi] ; load input
- movq xmm1, MMWORD PTR [rsi + rdx]
- lea rsi, [rsi + rdx*2]
- movq xmm2, MMWORD PTR [rsi]
- movq xmm3, MMWORD PTR [rsi + rdx]
-
- punpcklwd xmm0, xmm1
- punpcklwd xmm2, xmm3
-
- movdqa xmm1, xmm0
- punpckldq xmm0, xmm2 ; ip[1] ip[0]
- punpckhdq xmm1, xmm2 ; ip[3] ip[2]
-
- movdqa xmm2, xmm0
- paddw xmm0, xmm1
- psubw xmm2, xmm1
-
- psllw xmm0, 2 ; d1 a1
- psllw xmm2, 2 ; c1 b1
-
- movdqa xmm1, xmm0
- punpcklqdq xmm0, xmm2 ; b1 a1
- punpckhqdq xmm1, xmm2 ; c1 d1
-
- pxor xmm6, xmm6
- movq xmm6, xmm0
- pxor xmm7, xmm7
- pcmpeqw xmm7, xmm6
- paddw xmm7, [GLOBAL(c1)]
-
- movdqa xmm2, xmm0
- paddw xmm0, xmm1 ; b1+c1 a1+d1
- psubw xmm2, xmm1 ; b1-c1 a1-d1
- paddw xmm0, xmm7 ; b1+c1 a1+d1+(a1!=0)
-
- ; second for loop
- ; input: 13 9 5 1 12 8 4 0 (xmm0)
- ; 14 10 6 2 15 11 7 3 (xmm2)
- ; after shuffle:
- ; 13 5 9 1 12 4 8 0 (xmm0)
- ; 14 6 10 2 15 7 11 3 (xmm1)
- pshuflw xmm3, xmm0, 0xd8
- pshufhw xmm0, xmm3, 0xd8
- pshuflw xmm3, xmm2, 0xd8
- pshufhw xmm1, xmm3, 0xd8
-
- movdqa xmm2, xmm0
- pmaddwd xmm0, [GLOBAL(c1)] ; d11 a11 d10 a10
- pmaddwd xmm2, [GLOBAL(cn1)] ; c11 b11 c10 b10
- movdqa xmm3, xmm1
- pmaddwd xmm1, [GLOBAL(c1)] ; d12 a12 d13 a13
- pmaddwd xmm3, [GLOBAL(cn1)] ; c12 b12 c13 b13
-
- pshufd xmm4, xmm0, 0xd8 ; d11 d10 a11 a10
- pshufd xmm5, xmm2, 0xd8 ; c11 c10 b11 b10
- pshufd xmm6, xmm1, 0x72 ; d13 d12 a13 a12
- pshufd xmm7, xmm3, 0x72 ; c13 c12 b13 b12
-
- movdqa xmm0, xmm4
- punpcklqdq xmm0, xmm5 ; b11 b10 a11 a10
- punpckhqdq xmm4, xmm5 ; c11 c10 d11 d10
- movdqa xmm1, xmm6
- punpcklqdq xmm1, xmm7 ; b13 b12 a13 a12
- punpckhqdq xmm6, xmm7 ; c13 c12 d13 d12
-
- movdqa xmm2, xmm0
- paddd xmm0, xmm4 ; b21 b20 a21 a20
- psubd xmm2, xmm4 ; c21 c20 d21 d20
- movdqa xmm3, xmm1
- paddd xmm1, xmm6 ; b23 b22 a23 a22
- psubd xmm3, xmm6 ; c23 c22 d23 d22
-
- pxor xmm4, xmm4
- movdqa xmm5, xmm4
- pcmpgtd xmm4, xmm0
- pcmpgtd xmm5, xmm2
- pand xmm4, [GLOBAL(cd1)]
- pand xmm5, [GLOBAL(cd1)]
-
- pxor xmm6, xmm6
- movdqa xmm7, xmm6
- pcmpgtd xmm6, xmm1
- pcmpgtd xmm7, xmm3
- pand xmm6, [GLOBAL(cd1)]
- pand xmm7, [GLOBAL(cd1)]
-
- paddd xmm0, xmm4
- paddd xmm2, xmm5
- paddd xmm0, [GLOBAL(cd3)]
- paddd xmm2, [GLOBAL(cd3)]
- paddd xmm1, xmm6
- paddd xmm3, xmm7
- paddd xmm1, [GLOBAL(cd3)]
- paddd xmm3, [GLOBAL(cd3)]
-
- psrad xmm0, 3
- psrad xmm1, 3
- psrad xmm2, 3
- psrad xmm3, 3
- movdqa xmm4, xmm0
- punpcklqdq xmm0, xmm1 ; a23 a22 a21 a20
- punpckhqdq xmm4, xmm1 ; b23 b22 b21 b20
- movdqa xmm5, xmm2
- punpckhqdq xmm2, xmm3 ; c23 c22 c21 c20
- punpcklqdq xmm5, xmm3 ; d23 d22 d21 d20
-
- packssdw xmm0, xmm4 ; b23 b22 b21 b20 a23 a22 a21 a20
- packssdw xmm2, xmm5 ; d23 d22 d21 d20 c23 c22 c21 c20
-
- movdqa XMMWORD PTR [rdi], xmm0
- movdqa XMMWORD PTR [rdi + 16], xmm2
-
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_GOT
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
-
-SECTION_RODATA
-align 16
-c1:
- dw 0x0001, 0x0001, 0x0001, 0x0001, 0x0001, 0x0001, 0x0001, 0x0001
-align 16
-cn1:
- dw 0x0001, 0xffff, 0x0001, 0xffff, 0x0001, 0xffff, 0x0001, 0xffff
-align 16
-cd1:
- dd 0x00000001, 0x00000001, 0x00000001, 0x00000001
-align 16
-cd3:
- dd 0x00000003, 0x00000003, 0x00000003, 0x00000003
--- a/vp8/encoder/x86/mcomp_x86.h
+++ /dev/null
@@ -1,40 +1,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#ifndef MCOMP_X86_H
-#define MCOMP_X86_H
-
-#if HAVE_SSE3
-#if !CONFIG_RUNTIME_CPU_DETECT
-
-#undef vp9_search_full_search
-#define vp9_search_full_search vp9_full_search_sadx3
-
-#undef vp9_search_refining_search
-#define vp9_search_refining_search vp9_refining_search_sadx4
-
-#undef vp9_search_diamond_search
-#define vp9_search_diamond_search vp9_diamond_search_sadx4
-
-#endif
-#endif
-
-#if HAVE_SSE4_1
-#if !CONFIG_RUNTIME_CPU_DETECT
-
-#undef vp9_search_full_search
-#define vp9_search_full_search vp9_full_search_sadx8
-
-#endif
-#endif
-
-#endif
-
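
mcomp_x86.h specializes the motion-search entry points at build time: when runtime CPU detection is disabled, the generic names are simply re-#defined to the fastest variant compiled in, so no indirection remains at the call site. The tiny self-contained illustration below shows the same pattern; the macro and function names here are made up.

#include <stdio.h>

static int sum_c(const int *v, int n) {
  int i, s = 0;
  for (i = 0; i < n; i++) s += v[i];
  return s;
}

/* Stand-in for a SIMD build of the same routine. */
static int sum_fast(const int *v, int n) { return sum_c(v, n); }

/* Generic binding, analogous to vp9_search_full_search. */
#define my_sum sum_c

/* Rebind at compile time when the fast variant is available and runtime CPU
   detection is off; HAVE_FAST_SUM and RUNTIME_CPU_DETECT are hypothetical. */
#if defined(HAVE_FAST_SUM) && !defined(RUNTIME_CPU_DETECT)
#undef my_sum
#define my_sum sum_fast
#endif

int main(void) {
  const int v[4] = { 1, 2, 3, 4 };
  (void)sum_fast;  /* silence the unused warning when the rebind is off */
  printf("%d\n", my_sum(v, 4));
  return 0;
}
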
--- a/vp8/encoder/x86/quantize_mmx.asm
+++ /dev/null
@@ -1,286 +1,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
-%include "vpx_ports/x86_abi_support.asm"
-
-;int vp9_fast_quantize_b_impl_mmx(short *coeff_ptr, short *zbin_ptr,
-; short *qcoeff_ptr,short *dequant_ptr,
-; short *scan_mask, short *round_ptr,
-; short *quant_ptr, short *dqcoeff_ptr);
-global sym(vp9_fast_quantize_b_impl_mmx)
-sym(vp9_fast_quantize_b_impl_mmx):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 8
- push rsi
- push rdi
- ; end prolog
-
-
- mov rsi, arg(0) ;coeff_ptr
- movq mm0, [rsi]
-
- mov rax, arg(1) ;zbin_ptr
- movq mm1, [rax]
-
- movq mm3, mm0
- psraw mm0, 15
-
- pxor mm3, mm0
- psubw mm3, mm0 ; abs
-
- movq mm2, mm3
- pcmpgtw mm1, mm2
-
- pandn mm1, mm2
- movq mm3, mm1
-
- mov rdx, arg(6) ;quant_ptr
- movq mm1, [rdx]
-
- mov rcx, arg(5) ;round_ptr
- movq mm2, [rcx]
-
- paddw mm3, mm2
- pmulhuw mm3, mm1
-
- pxor mm3, mm0
- psubw mm3, mm0 ;gain the sign back
-
- mov rdi, arg(2) ;qcoeff_ptr
- movq mm0, mm3
-
- movq [rdi], mm3
-
- mov rax, arg(3) ;dequant_ptr
- movq mm2, [rax]
-
- pmullw mm3, mm2
- mov rax, arg(7) ;dqcoeff_ptr
-
- movq [rax], mm3
-
- ; next 8
- movq mm4, [rsi+8]
-
- mov rax, arg(1) ;zbin_ptr
- movq mm5, [rax+8]
-
- movq mm7, mm4
- psraw mm4, 15
-
- pxor mm7, mm4
- psubw mm7, mm4 ; abs
-
- movq mm6, mm7
- pcmpgtw mm5, mm6
-
- pandn mm5, mm6
- movq mm7, mm5
-
- movq mm5, [rdx+8]
- movq mm6, [rcx+8]
-
- paddw mm7, mm6
- pmulhuw mm7, mm5
-
- pxor mm7, mm4
- psubw mm7, mm4;gain the sign back
-
- mov rdi, arg(2) ;qcoeff_ptr
-
- movq mm1, mm7
- movq [rdi+8], mm7
-
- mov rax, arg(3) ;dequant_ptr
- movq mm6, [rax+8]
-
- pmullw mm7, mm6
- mov rax, arg(7) ;dqcoeff_ptr
-
- movq [rax+8], mm7
-
-
- ; next 8
- movq mm4, [rsi+16]
-
- mov rax, arg(1) ;zbin_ptr
- movq mm5, [rax+16]
-
- movq mm7, mm4
- psraw mm4, 15
-
- pxor mm7, mm4
- psubw mm7, mm4 ; abs
-
- movq mm6, mm7
- pcmpgtw mm5, mm6
-
- pandn mm5, mm6
- movq mm7, mm5
-
- movq mm5, [rdx+16]
- movq mm6, [rcx+16]
-
- paddw mm7, mm6
- pmulhuw mm7, mm5
-
- pxor mm7, mm4
- psubw mm7, mm4;gain the sign back
-
- mov rdi, arg(2) ;qcoeff_ptr
-
- movq mm1, mm7
- movq [rdi+16], mm7
-
- mov rax, arg(3) ;dequant_ptr
- movq mm6, [rax+16]
-
- pmullw mm7, mm6
- mov rax, arg(7) ;dqcoeff_ptr
-
- movq [rax+16], mm7
-
-
- ; next 8
- movq mm4, [rsi+24]
-
- mov rax, arg(1) ;zbin_ptr
- movq mm5, [rax+24]
-
- movq mm7, mm4
- psraw mm4, 15
-
- pxor mm7, mm4
- psubw mm7, mm4 ; abs
-
- movq mm6, mm7
- pcmpgtw mm5, mm6
-
- pandn mm5, mm6
- movq mm7, mm5
-
- movq mm5, [rdx+24]
- movq mm6, [rcx+24]
-
- paddw mm7, mm6
- pmulhuw mm7, mm5
-
- pxor mm7, mm4
- psubw mm7, mm4;gain the sign back
-
- mov rdi, arg(2) ;qcoeff_ptr
-
- movq mm1, mm7
- movq [rdi+24], mm7
-
- mov rax, arg(3) ;dequant_ptr
- movq mm6, [rax+24]
-
- pmullw mm7, mm6
- mov rax, arg(7) ;dqcoeff_ptr
-
- movq [rax+24], mm7
-
-
-
- mov rdi, arg(4) ;scan_mask
- mov rsi, arg(2) ;qcoeff_ptr
-
- pxor mm5, mm5
- pxor mm7, mm7
-
- movq mm0, [rsi]
- movq mm1, [rsi+8]
-
- movq mm2, [rdi]
- movq mm3, [rdi+8];
-
- pcmpeqw mm0, mm7
- pcmpeqw mm1, mm7
-
- pcmpeqw mm6, mm6
- pxor mm0, mm6
-
- pxor mm1, mm6
- psrlw mm0, 15
-
- psrlw mm1, 15
- pmaddwd mm0, mm2
-
- pmaddwd mm1, mm3
- movq mm5, mm0
-
- paddd mm5, mm1
-
- movq mm0, [rsi+16]
- movq mm1, [rsi+24]
-
- movq mm2, [rdi+16]
- movq mm3, [rdi+24];
-
- pcmpeqw mm0, mm7
- pcmpeqw mm1, mm7
-
- pcmpeqw mm6, mm6
- pxor mm0, mm6
-
- pxor mm1, mm6
- psrlw mm0, 15
-
- psrlw mm1, 15
- pmaddwd mm0, mm2
-
- pmaddwd mm1, mm3
- paddd mm5, mm0
-
- paddd mm5, mm1
- movq mm0, mm5
-
- psrlq mm5, 32
- paddd mm0, mm5
-
- ; eob adjustment begins here
- movq rcx, mm0
- and rcx, 0xffff
-
- xor rdx, rdx
- sub rdx, rcx ; rdx=-rcx
-
- bsr rax, rcx
- inc rax
-
- sar rdx, 31
- and rax, rdx
- ; Substitute the sse assembly for the old mmx mixed assembly/C. The
- ; following is kept as reference
- ; movq rcx, mm0
- ; bsr rax, rcx
- ;
- ; mov eob, rax
- ; mov eee, rcx
- ;
- ;if(eee==0)
- ;{
- ; eob=-1;
- ;}
- ;else if(eee<0)
- ;{
- ; eob=15;
- ;}
- ;d->eob = eob+1;
-
- ; begin epilog
- pop rdi
- pop rsi
- UNSHADOW_ARGS
- pop rbp
- ret
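
Both quantizer files implement the arithmetic spelled out in their comments: take abs(z), add the rounding term, multiply by the quantizer keeping the high 16 bits, restore the sign, then form the dequantized value and track the last non-zero coefficient in zig-zag order for the EOB. The scalar sketch below follows those comments for the fast path (no zero bin); the names and layout are illustrative rather than the encoder's BLOCK/BLOCKD interface.

/* Standard 4x4 zig-zag scan order, matching the ZIGZAG_LOOP sequence below. */
static const int zigzag4x4[16] = {
  0, 1, 4, 8, 5, 2, 3, 6, 9, 12, 13, 10, 7, 11, 14, 15
};

/* Fast (no zero-bin) quantizer for one 4x4 block; sketch only. */
static int fast_quantize_ref(const short *coeff, const short *round,
                             const short *quant, const short *dequant,
                             short *qcoeff, short *dqcoeff) {
  int i, eob = -1;
  for (i = 0; i < 16; i++) {
    const int rc = zigzag4x4[i];
    const int z = coeff[rc];
    const int sz = (z < 0) ? -1 : 0;          /* sign of z */
    const int x = (z ^ sz) - sz;              /* abs(z) */
    int y = ((x + round[rc]) * quant[rc]) >> 16;
    y = (y ^ sz) - sz;                        /* put the sign back */
    qcoeff[rc] = (short)y;
    dqcoeff[rc] = (short)(y * dequant[rc]);
    if (y)
      eob = i;                                /* last non-zero in scan order */
  }
  return eob + 1;                             /* what the asm stores as d->eob */
}
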
--- a/vp8/encoder/x86/quantize_sse2.asm
+++ /dev/null
@@ -1,380 +1,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license and patent
-; grant that can be found in the LICENSE file in the root of the source
-; tree. All contributing project authors may be found in the AUTHORS
-; file in the root of the source tree.
-;
-
-
-%include "vpx_ports/x86_abi_support.asm"
-%include "asm_enc_offsets.asm"
-
-
-; void vp9_regular_quantize_b_sse2 | arg
-; (BLOCK *b, | 0
-; BLOCKD *d) | 1
-
-global sym(vp9_regular_quantize_b_sse2)
-sym(vp9_regular_quantize_b_sse2):
- push rbp
- mov rbp, rsp
- SAVE_XMM 7
- GET_GOT rbx
-
-%if ABI_IS_32BIT
- push rdi
- push rsi
-%else
- %ifidn __OUTPUT_FORMAT__,x64
- push rdi
- push rsi
- %endif
-%endif
-
- ALIGN_STACK 16, rax
- %define zrun_zbin_boost 0 ; 8
- %define abs_minus_zbin 8 ; 32
- %define temp_qcoeff 40 ; 32
- %define qcoeff 72 ; 32
- %define stack_size 104
- sub rsp, stack_size
- ; end prolog
-
-%if ABI_IS_32BIT
- mov rdi, arg(0) ; BLOCK *b
- mov rsi, arg(1) ; BLOCKD *d
-%else
- %ifidn __OUTPUT_FORMAT__,x64
- mov rdi, rcx ; BLOCK *b
- mov rsi, rdx ; BLOCKD *d
- %else
- ;mov rdi, rdi ; BLOCK *b
- ;mov rsi, rsi ; BLOCKD *d
- %endif
-%endif
-
- mov rdx, [rdi + vp9_block_coeff] ; coeff_ptr
- mov rcx, [rdi + vp9_block_zbin] ; zbin_ptr
- movd xmm7, [rdi + vp9_block_zbin_extra] ; zbin_oq_value
-
- ; z
- movdqa xmm0, [rdx]
- movdqa xmm4, [rdx + 16]
- mov rdx, [rdi + vp9_block_round] ; round_ptr
-
- pshuflw xmm7, xmm7, 0
- punpcklwd xmm7, xmm7 ; duplicated zbin_oq_value
-
- movdqa xmm1, xmm0
- movdqa xmm5, xmm4
-
- ; sz
- psraw xmm0, 15
- psraw xmm4, 15
-
- ; (z ^ sz)
- pxor xmm1, xmm0
- pxor xmm5, xmm4
-
- ; x = abs(z)
- psubw xmm1, xmm0
- psubw xmm5, xmm4
-
- movdqa xmm2, [rcx]
- movdqa xmm3, [rcx + 16]
- mov rcx, [rdi + vp9_block_quant] ; quant_ptr
-
- ; *zbin_ptr + zbin_oq_value
- paddw xmm2, xmm7
- paddw xmm3, xmm7
-
- ; x - (*zbin_ptr + zbin_oq_value)
- psubw xmm1, xmm2
- psubw xmm5, xmm3
- movdqa [rsp + abs_minus_zbin], xmm1
- movdqa [rsp + abs_minus_zbin + 16], xmm5
-
- ; add (zbin_ptr + zbin_oq_value) back
- paddw xmm1, xmm2
- paddw xmm5, xmm3
-
- movdqa xmm2, [rdx]
- movdqa xmm6, [rdx + 16]
-
- movdqa xmm3, [rcx]
- movdqa xmm7, [rcx + 16]
-
- ; x + round
- paddw xmm1, xmm2
- paddw xmm5, xmm6
-
- ; y = x * quant_ptr >> 16
- pmulhw xmm3, xmm1
- pmulhw xmm7, xmm5
-
- ; y += x
- paddw xmm1, xmm3
- paddw xmm5, xmm7
-
- movdqa [rsp + temp_qcoeff], xmm1
- movdqa [rsp + temp_qcoeff + 16], xmm5
-
- pxor xmm6, xmm6
- ; zero qcoeff
- movdqa [rsp + qcoeff], xmm6
- movdqa [rsp + qcoeff + 16], xmm6
-
- mov rdx, [rdi + vp9_block_zrun_zbin_boost] ; zbin_boost_ptr
- mov rax, [rdi + vp9_block_quant_shift] ; quant_shift_ptr
- mov [rsp + zrun_zbin_boost], rdx
-
-%macro ZIGZAG_LOOP 1
- ; x
- movsx ecx, WORD PTR[rsp + abs_minus_zbin + %1 * 2]
-
- ; if (x >= zbin)
- sub cx, WORD PTR[rdx] ; x - zbin
- lea rdx, [rdx + 2] ; zbin_boost_ptr++
- jl .rq_zigzag_loop_%1 ; x < zbin
-
- movsx edi, WORD PTR[rsp + temp_qcoeff + %1 * 2]
-
- ; downshift by quant_shift[rc]
- movsx cx, BYTE PTR[rax + %1] ; quant_shift_ptr[rc]
- sar edi, cl ; also sets Z bit
- je .rq_zigzag_loop_%1 ; !y
- mov WORD PTR[rsp + qcoeff + %1 * 2], di ;qcoeff_ptr[rc] = temp_qcoeff[rc]
- mov rdx, [rsp + zrun_zbin_boost] ; reset to b->zrun_zbin_boost
-.rq_zigzag_loop_%1:
-%endmacro
-; in vp9_default_zig_zag1d order: see vp8/common/entropy.c
-ZIGZAG_LOOP 0
-ZIGZAG_LOOP 1
-ZIGZAG_LOOP 4
-ZIGZAG_LOOP 8
-ZIGZAG_LOOP 5
-ZIGZAG_LOOP 2
-ZIGZAG_LOOP 3
-ZIGZAG_LOOP 6
-ZIGZAG_LOOP 9
-ZIGZAG_LOOP 12
-ZIGZAG_LOOP 13
-ZIGZAG_LOOP 10
-ZIGZAG_LOOP 7
-ZIGZAG_LOOP 11
-ZIGZAG_LOOP 14
-ZIGZAG_LOOP 15
-
- movdqa xmm2, [rsp + qcoeff]
- movdqa xmm3, [rsp + qcoeff + 16]
-
- mov rcx, [rsi + vp9_blockd_dequant] ; dequant_ptr
- mov rdi, [rsi + vp9_blockd_dqcoeff] ; dqcoeff_ptr
-
- ; y ^ sz
- pxor xmm2, xmm0
- pxor xmm3, xmm4
- ; x = (y ^ sz) - sz
- psubw xmm2, xmm0
- psubw xmm3, xmm4
-
- ; dequant
- movdqa xmm0, [rcx]
- movdqa xmm1, [rcx + 16]
-
- mov rcx, [rsi + vp9_blockd_qcoeff] ; qcoeff_ptr
-
- pmullw xmm0, xmm2
- pmullw xmm1, xmm3
-
- movdqa [rcx], xmm2 ; store qcoeff
- movdqa [rcx + 16], xmm3
- movdqa [rdi], xmm0 ; store dqcoeff
- movdqa [rdi + 16], xmm1
-
- ; select the last value (in zig_zag order) for EOB
- pcmpeqw xmm2, xmm6
- pcmpeqw xmm3, xmm6
- ; !
- pcmpeqw xmm6, xmm6
- pxor xmm2, xmm6
- pxor xmm3, xmm6
- ; mask inv_zig_zag
- pand xmm2, [GLOBAL(inv_zig_zag)]
- pand xmm3, [GLOBAL(inv_zig_zag + 16)]
- ; select the max value
- pmaxsw xmm2, xmm3
- pshufd xmm3, xmm2, 00001110b
- pmaxsw xmm2, xmm3
- pshuflw xmm3, xmm2, 00001110b
- pmaxsw xmm2, xmm3
- pshuflw xmm3, xmm2, 00000001b
- pmaxsw xmm2, xmm3
- movd eax, xmm2
- and eax, 0xff
- mov [rsi + vp9_blockd_eob], eax
-
- ; begin epilog
- add rsp, stack_size
- pop rsp
-%if ABI_IS_32BIT
- pop rsi
- pop rdi
-%else
- %ifidn __OUTPUT_FORMAT__,x64
- pop rsi
- pop rdi
- %endif
-%endif
- RESTORE_GOT
- RESTORE_XMM
- pop rbp
- ret
-
-; void vp9_fast_quantize_b_sse2 | arg
-; (BLOCK *b, | 0
-; BLOCKD *d) | 1
-
-global sym(vp9_fast_quantize_b_sse2)
-sym(vp9_fast_quantize_b_sse2):
- push rbp
- mov rbp, rsp
- GET_GOT rbx
-
-%if ABI_IS_32BIT
- push rdi
- push rsi
-%else
- %ifidn __OUTPUT_FORMAT__,x64
- push rdi
- push rsi
- %else
- ; these registers are used for passing arguments
- %endif
-%endif
-
- ; end prolog
-
-%if ABI_IS_32BIT
- mov rdi, arg(0) ; BLOCK *b
- mov rsi, arg(1) ; BLOCKD *d
-%else
- %ifidn __OUTPUT_FORMAT__,x64
- mov rdi, rcx ; BLOCK *b
- mov rsi, rdx ; BLOCKD *d
- %else
- ;mov rdi, rdi ; BLOCK *b
- ;mov rsi, rsi ; BLOCKD *d
- %endif
-%endif
-
- mov rax, [rdi + vp9_block_coeff]
- mov rcx, [rdi + vp9_block_round]
- mov rdx, [rdi + vp9_block_quant_fast]
-
- ; z = coeff
- movdqa xmm0, [rax]
- movdqa xmm4, [rax + 16]
-
- ; dup z so we can save sz
- movdqa xmm1, xmm0
- movdqa xmm5, xmm4
-
- ; sz = z >> 15
- psraw xmm0, 15
- psraw xmm4, 15
-
- ; x = abs(z) = (z ^ sz) - sz
- pxor xmm1, xmm0
- pxor xmm5, xmm4
- psubw xmm1, xmm0
- psubw xmm5, xmm4
-
- ; x += round
- paddw xmm1, [rcx]
- paddw xmm5, [rcx + 16]
-
- mov rax, [rsi + vp9_blockd_qcoeff]
- mov rcx, [rsi + vp9_blockd_dequant]
- mov rdi, [rsi + vp9_blockd_dqcoeff]
-
- ; y = x * quant >> 16
- pmulhw xmm1, [rdx]
- pmulhw xmm5, [rdx + 16]
-
- ; x = (y ^ sz) - sz
- pxor xmm1, xmm0
- pxor xmm5, xmm4
- psubw xmm1, xmm0
- psubw xmm5, xmm4
-
- ; qcoeff = x
- movdqa [rax], xmm1
- movdqa [rax + 16], xmm5
-
- ; x * dequant
- movdqa xmm2, xmm1
- movdqa xmm3, xmm5
- pmullw xmm2, [rcx]
- pmullw xmm3, [rcx + 16]
-
- ; dqcoeff = x * dequant
- movdqa [rdi], xmm2
- movdqa [rdi + 16], xmm3
-
- pxor xmm4, xmm4 ;clear all bits
- pcmpeqw xmm1, xmm4
- pcmpeqw xmm5, xmm4
-
- pcmpeqw xmm4, xmm4 ;set all bits
- pxor xmm1, xmm4
- pxor xmm5, xmm4
-
- pand xmm1, [GLOBAL(inv_zig_zag)]
- pand xmm5, [GLOBAL(inv_zig_zag + 16)]
-
- pmaxsw xmm1, xmm5
-
- ; now down to 8
- pshufd xmm5, xmm1, 00001110b
-
- pmaxsw xmm1, xmm5
-
- ; only 4 left
- pshuflw xmm5, xmm1, 00001110b
-
- pmaxsw xmm1, xmm5
-
- ; okay, just 2!
- pshuflw xmm5, xmm1, 00000001b
-
- pmaxsw xmm1, xmm5
-
- movd eax, xmm1
- and eax, 0xff
- mov [rsi + vp9_blockd_eob], eax
-
- ; begin epilog
-%if ABI_IS_32BIT
- pop rsi
- pop rdi
-%else
- %ifidn __OUTPUT_FORMAT__,x64
- pop rsi
- pop rdi
- %endif
-%endif
-
- RESTORE_GOT
- pop rbp
- ret
-
-SECTION_RODATA
-align 16
-inv_zig_zag:
- dw 0x0001, 0x0002, 0x0006, 0x0007
- dw 0x0003, 0x0005, 0x0008, 0x000d
- dw 0x0004, 0x0009, 0x000c, 0x000e
- dw 0x000a, 0x000b, 0x000f, 0x0010
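
As orientation for the SSE2 routine removed above, here is a rough per-coefficient C sketch of the regular quantizer it implements. The names are descriptive, not the BLOCK/BLOCKD field offsets the assembly reads, and the real code does everything except the per-coefficient zero-bin test (the ZIGZAG_LOOP macro) with SIMD:

    #include <string.h>

    /* sketch: quantize one 4x4 block of 16 coefficients in zig-zag order */
    static void regular_quantize_sketch(const short coeff[16], const short zbin[16],
                                        const short round[16], const short quant[16],
                                        const unsigned char quant_shift[16],
                                        const short zbin_boost[16], short zbin_oq,
                                        const short dequant[16], const int zigzag[16],
                                        short qcoeff[16], short dqcoeff[16], int *eob) {
        int i, boost = 0, last = -1;
        memset(qcoeff, 0, 16 * sizeof(*qcoeff));
        memset(dqcoeff, 0, 16 * sizeof(*dqcoeff));
        for (i = 0; i < 16; i++) {
            int rc = zigzag[i];
            int z  = coeff[rc];
            int sz = z >> 31;                    /* 0 or -1, psraw 15 in the assembly */
            int x  = (z ^ sz) - sz;              /* abs(z) */
            if (x - (zbin[rc] + zbin_oq) - zbin_boost[boost] >= 0) {
                int y = (((x + round[rc]) * quant[rc]) >> 16) + (x + round[rc]);
                y >>= quant_shift[rc];           /* "downshift by quant_shift[rc]" */
                if (y) {
                    qcoeff[rc]  = (short)((y ^ sz) - sz);       /* restore the sign */
                    dqcoeff[rc] = (short)(qcoeff[rc] * dequant[rc]);
                    last  = i;                   /* candidate end of block */
                    boost = 0;                   /* zero run broken: reset to b->zrun_zbin_boost */
                    continue;
                }
            }
            boost++;                             /* zbin_boost_ptr++ in the assembly */
        }
        *eob = last + 1;
    }
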
--- a/vp8/encoder/x86/quantize_sse4.asm
+++ /dev/null
@@ -1,254 +1,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license and patent
-; grant that can be found in the LICENSE file in the root of the source
-; tree. All contributing project authors may be found in the AUTHORS
-; file in the root of the source tree.
-;
-
-
-%include "vpx_ports/x86_abi_support.asm"
-%include "asm_enc_offsets.asm"
-
-
-; void vp9_regular_quantize_b_sse4 | arg
-; (BLOCK *b, | 0
-; BLOCKD *d) | 1
-
-global sym(vp9_regular_quantize_b_sse4)
-sym(vp9_regular_quantize_b_sse4):
-
-%if ABI_IS_32BIT
- push rbp
- mov rbp, rsp
- GET_GOT rbx
- push rdi
- push rsi
-
- ALIGN_STACK 16, rax
- %define qcoeff 0 ; 32
- %define stack_size 32
- sub rsp, stack_size
-%else
- %ifidn __OUTPUT_FORMAT__,x64
- SAVE_XMM 8, u
- push rdi
- push rsi
- %endif
-%endif
- ; end prolog
-
-%if ABI_IS_32BIT
- mov rdi, arg(0) ; BLOCK *b
- mov rsi, arg(1) ; BLOCKD *d
-%else
- %ifidn __OUTPUT_FORMAT__,x64
- mov rdi, rcx ; BLOCK *b
- mov rsi, rdx ; BLOCKD *d
- %else
- ;mov rdi, rdi ; BLOCK *b
- ;mov rsi, rsi ; BLOCKD *d
- %endif
-%endif
-
- mov rax, [rdi + vp9_block_coeff]
- mov rcx, [rdi + vp9_block_zbin]
- mov rdx, [rdi + vp9_block_round]
- movd xmm7, [rdi + vp9_block_zbin_extra]
-
- ; z
- movdqa xmm0, [rax]
- movdqa xmm1, [rax + 16]
-
- ; duplicate zbin_oq_value
- pshuflw xmm7, xmm7, 0
- punpcklwd xmm7, xmm7
-
- movdqa xmm2, xmm0
- movdqa xmm3, xmm1
-
- ; sz
- psraw xmm0, 15
- psraw xmm1, 15
-
- ; (z ^ sz)
- pxor xmm2, xmm0
- pxor xmm3, xmm1
-
- ; x = abs(z)
- psubw xmm2, xmm0
- psubw xmm3, xmm1
-
- ; zbin
- movdqa xmm4, [rcx]
- movdqa xmm5, [rcx + 16]
-
- ; *zbin_ptr + zbin_oq_value
- paddw xmm4, xmm7
- paddw xmm5, xmm7
-
- movdqa xmm6, xmm2
- movdqa xmm7, xmm3
-
- ; x - (*zbin_ptr + zbin_oq_value)
- psubw xmm6, xmm4
- psubw xmm7, xmm5
-
- ; round
- movdqa xmm4, [rdx]
- movdqa xmm5, [rdx + 16]
-
- mov rax, [rdi + vp9_block_quant_shift]
- mov rcx, [rdi + vp9_block_quant]
- mov rdx, [rdi + vp9_block_zrun_zbin_boost]
-
- ; x + round
- paddw xmm2, xmm4
- paddw xmm3, xmm5
-
- ; quant
- movdqa xmm4, [rcx]
- movdqa xmm5, [rcx + 16]
-
- ; y = x * quant_ptr >> 16
- pmulhw xmm4, xmm2
- pmulhw xmm5, xmm3
-
- ; y += x
- paddw xmm2, xmm4
- paddw xmm3, xmm5
-
- pxor xmm4, xmm4
-%if ABI_IS_32BIT
- movdqa [rsp + qcoeff], xmm4
- movdqa [rsp + qcoeff + 16], xmm4
-%else
- pxor xmm8, xmm8
-%endif
-
- ; quant_shift
- movdqa xmm5, [rax]
-
- ; zrun_zbin_boost
- mov rax, rdx
-
-%macro ZIGZAG_LOOP 5
- ; x
- pextrw ecx, %4, %2
-
- ; if (x >= zbin)
- sub cx, WORD PTR[rdx] ; x - zbin
- lea rdx, [rdx + 2] ; zbin_boost_ptr++
- jl .rq_zigzag_loop_%1 ; x < zbin
-
- pextrw edi, %3, %2 ; y
-
- ; downshift by quant_shift[rc]
- pextrb ecx, xmm5, %1 ; quant_shift[rc]
- sar edi, cl ; also sets Z bit
- je .rq_zigzag_loop_%1 ; !y
-%if ABI_IS_32BIT
- mov WORD PTR[rsp + qcoeff + %1 *2], di
-%else
- pinsrw %5, edi, %2 ; qcoeff[rc]
-%endif
- mov rdx, rax ; reset to b->zrun_zbin_boost
-.rq_zigzag_loop_%1:
-%endmacro
-; in vp9_default_zig_zag1d order: see vp8/common/entropy.c
-ZIGZAG_LOOP 0, 0, xmm2, xmm6, xmm4
-ZIGZAG_LOOP 1, 1, xmm2, xmm6, xmm4
-ZIGZAG_LOOP 4, 4, xmm2, xmm6, xmm4
-ZIGZAG_LOOP 8, 0, xmm3, xmm7, xmm8
-ZIGZAG_LOOP 5, 5, xmm2, xmm6, xmm4
-ZIGZAG_LOOP 2, 2, xmm2, xmm6, xmm4
-ZIGZAG_LOOP 3, 3, xmm2, xmm6, xmm4
-ZIGZAG_LOOP 6, 6, xmm2, xmm6, xmm4
-ZIGZAG_LOOP 9, 1, xmm3, xmm7, xmm8
-ZIGZAG_LOOP 12, 4, xmm3, xmm7, xmm8
-ZIGZAG_LOOP 13, 5, xmm3, xmm7, xmm8
-ZIGZAG_LOOP 10, 2, xmm3, xmm7, xmm8
-ZIGZAG_LOOP 7, 7, xmm2, xmm6, xmm4
-ZIGZAG_LOOP 11, 3, xmm3, xmm7, xmm8
-ZIGZAG_LOOP 14, 6, xmm3, xmm7, xmm8
-ZIGZAG_LOOP 15, 7, xmm3, xmm7, xmm8
-
- mov rcx, [rsi + vp9_blockd_dequant]
- mov rdi, [rsi + vp9_blockd_dqcoeff]
-
-%if ABI_IS_32BIT
- movdqa xmm4, [rsp + qcoeff]
- movdqa xmm5, [rsp + qcoeff + 16]
-%else
- %define xmm5 xmm8
-%endif
-
- ; y ^ sz
- pxor xmm4, xmm0
- pxor xmm5, xmm1
- ; x = (y ^ sz) - sz
- psubw xmm4, xmm0
- psubw xmm5, xmm1
-
- ; dequant
- movdqa xmm0, [rcx]
- movdqa xmm1, [rcx + 16]
-
- mov rcx, [rsi + vp9_blockd_qcoeff]
-
- pmullw xmm0, xmm4
- pmullw xmm1, xmm5
-
- ; store qcoeff
- movdqa [rcx], xmm4
- movdqa [rcx + 16], xmm5
-
- ; store dqcoeff
- movdqa [rdi], xmm0
- movdqa [rdi + 16], xmm1
-
- ; select the last value (in zig_zag order) for EOB
- pxor xmm6, xmm6
- pcmpeqw xmm4, xmm6
- pcmpeqw xmm5, xmm6
-
- packsswb xmm4, xmm5
- pshufb xmm4, [GLOBAL(zig_zag1d)]
- pmovmskb edx, xmm4
- xor rdi, rdi
- mov eax, -1
- xor dx, ax
- bsr eax, edx
- sub edi, edx
- sar edi, 31
- add eax, 1
- and eax, edi
-
- mov [rsi + vp9_blockd_eob], eax
-
- ; begin epilog
-%if ABI_IS_32BIT
- add rsp, stack_size
- pop rsp
-
- pop rsi
- pop rdi
- RESTORE_GOT
- pop rbp
-%else
- %undef xmm5
- %ifidn __OUTPUT_FORMAT__,x64
- pop rsi
- pop rdi
- RESTORE_XMM
- %endif
-%endif
-
- ret
-
-SECTION_RODATA
-align 16
-; vp8/common/entropy.c: vp9_default_zig_zag1d
-zig_zag1d:
- db 0, 1, 4, 8, 5, 2, 3, 6, 9, 12, 13, 10, 7, 11, 14, 15
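
The SSE4 routine above finishes the end-of-block search differently from the SSE2 one: the qcoeff words are compared against zero, packed to bytes, permuted into zig-zag order with the zig_zag1d table, and collapsed by pmovmskb into a 16-bit mask whose highest set bit gives the eob. A scalar sketch of that idea, with illustrative names:

    static int eob_from_mask(const short qcoeff[16], const unsigned char zigzag[16]) {
        unsigned int mask = 0;
        int i;
        for (i = 0; i < 16; i++)
            if (qcoeff[zigzag[i]] != 0)
                mask |= 1u << i;     /* bit i set <=> coefficient at zig-zag position i is nonzero */
        /* bsr plus the sign trick in the assembly; the branch is written out here for clarity */
        return mask ? (31 - __builtin_clz(mask)) + 1 : 0;
    }
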
--- a/vp8/encoder/x86/quantize_ssse3.asm
+++ /dev/null
@@ -1,138 +1,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license and patent
-; grant that can be found in the LICENSE file in the root of the source
-; tree. All contributing project authors may be found in the AUTHORS
-; file in the root of the source tree.
-;
-
-
-%include "vpx_ports/x86_abi_support.asm"
-%include "asm_enc_offsets.asm"
-
-
-; void vp9_fast_quantize_b_ssse3 | arg
-; (BLOCK *b, | 0
-; BLOCKD *d) | 1
-;
-
-global sym(vp9_fast_quantize_b_ssse3)
-sym(vp9_fast_quantize_b_ssse3):
- push rbp
- mov rbp, rsp
- GET_GOT rbx
-
-%if ABI_IS_32BIT
- push rdi
- push rsi
-%else
- %ifidn __OUTPUT_FORMAT__,x64
- push rdi
- push rsi
- %endif
-%endif
- ; end prolog
-
-%if ABI_IS_32BIT
- mov rdi, arg(0) ; BLOCK *b
- mov rsi, arg(1) ; BLOCKD *d
-%else
- %ifidn __OUTPUT_FORMAT__,x64
- mov rdi, rcx ; BLOCK *b
- mov rsi, rdx ; BLOCKD *d
- %else
- ;mov rdi, rdi ; BLOCK *b
- ;mov rsi, rsi ; BLOCKD *d
- %endif
-%endif
-
- mov rax, [rdi + vp9_block_coeff]
- mov rcx, [rdi + vp9_block_round]
- mov rdx, [rdi + vp9_block_quant_fast]
-
- ; coeff
- movdqa xmm0, [rax]
- movdqa xmm4, [rax + 16]
-
- ; round
- movdqa xmm2, [rcx]
- movdqa xmm3, [rcx + 16]
-
- movdqa xmm1, xmm0
- movdqa xmm5, xmm4
-
- ; sz = z >> 15
- psraw xmm0, 15
- psraw xmm4, 15
-
- pabsw xmm1, xmm1
- pabsw xmm5, xmm5
-
- paddw xmm1, xmm2
- paddw xmm5, xmm3
-
- ; quant_fast
- pmulhw xmm1, [rdx]
- pmulhw xmm5, [rdx + 16]
-
- mov rax, [rsi + vp9_blockd_qcoeff]
- mov rdi, [rsi + vp9_blockd_dequant]
- mov rcx, [rsi + vp9_blockd_dqcoeff]
-
- pxor xmm1, xmm0
- pxor xmm5, xmm4
- psubw xmm1, xmm0
- psubw xmm5, xmm4
-
- movdqa [rax], xmm1
- movdqa [rax + 16], xmm5
-
- movdqa xmm2, [rdi]
- movdqa xmm3, [rdi + 16]
-
- pxor xmm4, xmm4
- pmullw xmm2, xmm1
- pmullw xmm3, xmm5
-
- pcmpeqw xmm1, xmm4 ;non zero mask
- pcmpeqw xmm5, xmm4 ;non zero mask
- packsswb xmm1, xmm5
- pshufb xmm1, [GLOBAL(zz_shuf)]
-
- pmovmskb edx, xmm1
-
- xor rdi, rdi
- mov eax, -1
- xor dx, ax ;flip the bits for bsr
- bsr eax, edx
-
- movdqa [rcx], xmm2 ;store dqcoeff
- movdqa [rcx + 16], xmm3 ;store dqcoeff
-
- sub edi, edx ;check for all zeros in bit mask
- sar edi, 31 ;0 or -1
- add eax, 1
- and eax, edi ;if the bit mask was all zero,
- ;then eob = 0
- mov [rsi + vp9_blockd_eob], eax
-
- ; begin epilog
-%if ABI_IS_32BIT
- pop rsi
- pop rdi
-%else
- %ifidn __OUTPUT_FORMAT__,x64
- pop rsi
- pop rdi
- %endif
-%endif
-
- RESTORE_GOT
- pop rbp
- ret
-
-SECTION_RODATA
-align 16
-zz_shuf:
- db 0, 1, 4, 8, 5, 2, 3, 6, 9, 12, 13, 10, 7, 11, 14, 15
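
For comparison with the regular path, the fast quantizer above (this SSSE3 version and the SSE2 one alike) skips the zero-bin and boost logic entirely. Per coefficient it reduces to roughly the following sketch, with descriptive names:

    static void fast_quantize_sketch(const short coeff[16], const short round[16],
                                     const short quant_fast[16], const short dequant[16],
                                     short qcoeff[16], short dqcoeff[16]) {
        int i;
        for (i = 0; i < 16; i++) {
            int z  = coeff[i];
            int sz = z >> 31;                               /* sign mask */
            int x  = (z ^ sz) - sz;                         /* abs(z); pabsw in the SSSE3 code */
            int y  = ((x + round[i]) * quant_fast[i]) >> 16;
            qcoeff[i]  = (short)((y ^ sz) - sz);            /* re-apply the sign */
            dqcoeff[i] = (short)(qcoeff[i] * dequant[i]);
        }
    }

The end-of-block index is then derived from the qcoeff nonzero mask exactly as in the SSE4 sketch above, with zz_shuf providing the zig-zag reorder.
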
--- a/vp8/encoder/x86/quantize_x86.h
+++ /dev/null
@@ -1,48 +1,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license and patent
- * grant that can be found in the LICENSE file in the root of the source
- * tree. All contributing project authors may be found in the AUTHORS
- * file in the root of the source tree.
- */
-
-#ifndef QUANTIZE_X86_H
-#define QUANTIZE_X86_H
-
-
-/* Note:
- *
- * This platform is commonly built for runtime CPU detection. If you modify
- * any of the function mappings present in this file, be sure to also update
- * them in the function pointer initialization code
- */
-#if HAVE_MMX
-
-#endif /* HAVE_MMX */
-
-
-#if HAVE_SSE2
-extern prototype_quantize_block(vp9_regular_quantize_b_sse2);
-#if !CONFIG_RUNTIME_CPU_DETECT
-
-#undef vp9_quantize_quantb
-#define vp9_quantize_quantb vp9_regular_quantize_b_sse2
-#endif /* !CONFIG_RUNTIME_CPU_DETECT */
-
-#endif /* HAVE_SSE2 */
-
-
-#if HAVE_SSE4_1
-extern prototype_quantize_block(vp9_regular_quantize_b_sse4);
-
-#if !CONFIG_RUNTIME_CPU_DETECT
-
-#undef vp9_quantize_quantb
-#define vp9_quantize_quantb vp9_regular_quantize_b_sse4
-
-#endif /* !CONFIG_RUNTIME_CPU_DETECT */
-
-#endif /* HAVE_SSE4_1 */
-
-#endif /* QUANTIZE_X86_H */
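
The note in the removed header describes the usual libvpx pattern: with runtime CPU detection compiled out, the generic quantize hook is a compile-time alias for the best statically selected SIMD version, and with it compiled in, the same mapping must be repeated in the function pointer initialization code. A minimal sketch of the two modes follows; the pointer name is hypothetical and the prototype is taken from the assembly comments above:

    typedef struct block  BLOCK;      /* opaque here; the real definitions live in the encoder */
    typedef struct blockd BLOCKD;

    extern void vp9_regular_quantize_b_sse2(BLOCK *b, BLOCKD *d);

    #if !CONFIG_RUNTIME_CPU_DETECT
    #define vp9_quantize_quantb vp9_regular_quantize_b_sse2   /* static binding, as in the header */
    #else
    static void (*quantize_b_ptr)(BLOCK *b, BLOCKD *d);       /* filled in once from cpuid at init */
    #endif
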
--- a/vp8/encoder/x86/sad_mmx.asm
+++ /dev/null
@@ -1,427 +1,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
-%include "vpx_ports/x86_abi_support.asm"
-
-global sym(vp9_sad16x16_mmx)
-global sym(vp9_sad8x16_mmx)
-global sym(vp9_sad8x8_mmx)
-global sym(vp9_sad4x4_mmx)
-global sym(vp9_sad16x8_mmx)
-
-;unsigned int vp9_sad16x16_mmx(
-; unsigned char *src_ptr,
-; int src_stride,
-; unsigned char *ref_ptr,
-; int ref_stride)
-sym(vp9_sad16x16_mmx):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 4
- push rsi
- push rdi
- ; end prolog
-
- mov rsi, arg(0) ;src_ptr
- mov rdi, arg(2) ;ref_ptr
-
- movsxd rax, dword ptr arg(1) ;src_stride
- movsxd rdx, dword ptr arg(3) ;ref_stride
-
- lea rcx, [rsi+rax*8]
-
- lea rcx, [rcx+rax*8]
- pxor mm7, mm7
-
- pxor mm6, mm6
-
-.x16x16sad_mmx_loop:
-
- movq mm0, QWORD PTR [rsi]
- movq mm2, QWORD PTR [rsi+8]
-
- movq mm1, QWORD PTR [rdi]
- movq mm3, QWORD PTR [rdi+8]
-
- movq mm4, mm0
- movq mm5, mm2
-
- psubusb mm0, mm1
- psubusb mm1, mm4
-
- psubusb mm2, mm3
- psubusb mm3, mm5
-
- por mm0, mm1
- por mm2, mm3
-
- movq mm1, mm0
- movq mm3, mm2
-
- punpcklbw mm0, mm6
- punpcklbw mm2, mm6
-
- punpckhbw mm1, mm6
- punpckhbw mm3, mm6
-
- paddw mm0, mm2
- paddw mm1, mm3
-
-
- lea rsi, [rsi+rax]
- add rdi, rdx
-
- paddw mm7, mm0
- paddw mm7, mm1
-
- cmp rsi, rcx
- jne .x16x16sad_mmx_loop
-
-
- movq mm0, mm7
-
- punpcklwd mm0, mm6
- punpckhwd mm7, mm6
-
- paddw mm0, mm7
- movq mm7, mm0
-
-
- psrlq mm0, 32
- paddw mm7, mm0
-
- movq rax, mm7
-
- pop rdi
- pop rsi
- mov rsp, rbp
- ; begin epilog
- UNSHADOW_ARGS
- pop rbp
- ret
-
-
-;unsigned int vp9_sad8x16_mmx(
-; unsigned char *src_ptr,
-; int src_stride,
-; unsigned char *ref_ptr,
-; int ref_stride)
-sym(vp9_sad8x16_mmx):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 4
- push rsi
- push rdi
- ; end prolog
-
- mov rsi, arg(0) ;src_ptr
- mov rdi, arg(2) ;ref_ptr
-
- movsxd rax, dword ptr arg(1) ;src_stride
- movsxd rdx, dword ptr arg(3) ;ref_stride
-
- lea rcx, [rsi+rax*8]
-
- lea rcx, [rcx+rax*8]
- pxor mm7, mm7
-
- pxor mm6, mm6
-
-.x8x16sad_mmx_loop:
-
- movq mm0, QWORD PTR [rsi]
- movq mm1, QWORD PTR [rdi]
-
- movq mm2, mm0
- psubusb mm0, mm1
-
- psubusb mm1, mm2
- por mm0, mm1
-
- movq mm2, mm0
- punpcklbw mm0, mm6
-
- punpckhbw mm2, mm6
- lea rsi, [rsi+rax]
-
- add rdi, rdx
- paddw mm7, mm0
-
- paddw mm7, mm2
- cmp rsi, rcx
-
- jne .x8x16sad_mmx_loop
-
- movq mm0, mm7
- punpcklwd mm0, mm6
-
- punpckhwd mm7, mm6
- paddw mm0, mm7
-
- movq mm7, mm0
- psrlq mm0, 32
-
- paddw mm7, mm0
- movq rax, mm7
-
- pop rdi
- pop rsi
- mov rsp, rbp
- ; begin epilog
- UNSHADOW_ARGS
- pop rbp
- ret
-
-
-;unsigned int vp9_sad8x8_mmx(
-; unsigned char *src_ptr,
-; int src_stride,
-; unsigned char *ref_ptr,
-; int ref_stride)
-sym(vp9_sad8x8_mmx):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 4
- push rsi
- push rdi
- ; end prolog
-
- mov rsi, arg(0) ;src_ptr
- mov rdi, arg(2) ;ref_ptr
-
- movsxd rax, dword ptr arg(1) ;src_stride
- movsxd rdx, dword ptr arg(3) ;ref_stride
-
- lea rcx, [rsi+rax*8]
- pxor mm7, mm7
-
- pxor mm6, mm6
-
-.x8x8sad_mmx_loop:
-
- movq mm0, QWORD PTR [rsi]
- movq mm1, QWORD PTR [rdi]
-
- movq mm2, mm0
- psubusb mm0, mm1
-
- psubusb mm1, mm2
- por mm0, mm1
-
- movq mm2, mm0
- punpcklbw mm0, mm6
-
- punpckhbw mm2, mm6
- paddw mm0, mm2
-
- lea rsi, [rsi+rax]
- add rdi, rdx
-
- paddw mm7, mm0
- cmp rsi, rcx
-
- jne .x8x8sad_mmx_loop
-
- movq mm0, mm7
- punpcklwd mm0, mm6
-
- punpckhwd mm7, mm6
- paddw mm0, mm7
-
- movq mm7, mm0
- psrlq mm0, 32
-
- paddw mm7, mm0
- movq rax, mm7
-
- pop rdi
- pop rsi
- mov rsp, rbp
- ; begin epilog
- UNSHADOW_ARGS
- pop rbp
- ret
-
-
-;unsigned int vp9_sad4x4_mmx(
-; unsigned char *src_ptr,
-; int src_stride,
-; unsigned char *ref_ptr,
-; int ref_stride)
-sym(vp9_sad4x4_mmx):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 4
- push rsi
- push rdi
- ; end prolog
-
- mov rsi, arg(0) ;src_ptr
- mov rdi, arg(2) ;ref_ptr
-
- movsxd rax, dword ptr arg(1) ;src_stride
- movsxd rdx, dword ptr arg(3) ;ref_stride
-
- movd mm0, DWORD PTR [rsi]
- movd mm1, DWORD PTR [rdi]
-
- movd mm2, DWORD PTR [rsi+rax]
- movd mm3, DWORD PTR [rdi+rdx]
-
- punpcklbw mm0, mm2
- punpcklbw mm1, mm3
-
- movq mm2, mm0
- psubusb mm0, mm1
-
- psubusb mm1, mm2
- por mm0, mm1
-
- movq mm2, mm0
- pxor mm3, mm3
-
- punpcklbw mm0, mm3
- punpckhbw mm2, mm3
-
- paddw mm0, mm2
-
- lea rsi, [rsi+rax*2]
- lea rdi, [rdi+rdx*2]
-
- movd mm4, DWORD PTR [rsi]
- movd mm5, DWORD PTR [rdi]
-
- movd mm6, DWORD PTR [rsi+rax]
- movd mm7, DWORD PTR [rdi+rdx]
-
- punpcklbw mm4, mm6
- punpcklbw mm5, mm7
-
- movq mm6, mm4
- psubusb mm4, mm5
-
- psubusb mm5, mm6
- por mm4, mm5
-
- movq mm5, mm4
- punpcklbw mm4, mm3
-
- punpckhbw mm5, mm3
- paddw mm4, mm5
-
- paddw mm0, mm4
- movq mm1, mm0
-
- punpcklwd mm0, mm3
- punpckhwd mm1, mm3
-
- paddw mm0, mm1
- movq mm1, mm0
-
- psrlq mm0, 32
- paddw mm0, mm1
-
- movq rax, mm0
-
- pop rdi
- pop rsi
- mov rsp, rbp
- ; begin epilog
- UNSHADOW_ARGS
- pop rbp
- ret
-
-
-;unsigned int vp9_sad16x8_mmx(
-; unsigned char *src_ptr,
-; int src_stride,
-; unsigned char *ref_ptr,
-; int ref_stride)
-sym(vp9_sad16x8_mmx):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 4
- push rsi
- push rdi
- ; end prolog
-
- mov rsi, arg(0) ;src_ptr
- mov rdi, arg(2) ;ref_ptr
-
- movsxd rax, dword ptr arg(1) ;src_stride
- movsxd rdx, dword ptr arg(3) ;ref_stride
-
- lea rcx, [rsi+rax*8]
- pxor mm7, mm7
-
- pxor mm6, mm6
-
-.x16x8sad_mmx_loop:
-
- movq mm0, [rsi]
- movq mm1, [rdi]
-
- movq mm2, [rsi+8]
- movq mm3, [rdi+8]
-
- movq mm4, mm0
- movq mm5, mm2
-
- psubusb mm0, mm1
- psubusb mm1, mm4
-
- psubusb mm2, mm3
- psubusb mm3, mm5
-
- por mm0, mm1
- por mm2, mm3
-
- movq mm1, mm0
- movq mm3, mm2
-
- punpcklbw mm0, mm6
- punpckhbw mm1, mm6
-
- punpcklbw mm2, mm6
- punpckhbw mm3, mm6
-
-
- paddw mm0, mm2
- paddw mm1, mm3
-
- paddw mm0, mm1
- lea rsi, [rsi+rax]
-
- add rdi, rdx
- paddw mm7, mm0
-
- cmp rsi, rcx
- jne .x16x8sad_mmx_loop
-
- movq mm0, mm7
- punpcklwd mm0, mm6
-
- punpckhwd mm7, mm6
- paddw mm0, mm7
-
- movq mm7, mm0
- psrlq mm0, 32
-
- paddw mm7, mm0
- movq rax, mm7
-
- pop rdi
- pop rsi
- mov rsp, rbp
- ; begin epilog
- UNSHADOW_ARGS
- pop rbp
- ret
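
All of the MMX routines above compute the same quantity for different fixed block sizes; the scalar reference is simply the following sketch:

    #include <stdlib.h>

    /* sum of absolute differences over a w x h block */
    static unsigned int sad_c(const unsigned char *src, int src_stride,
                              const unsigned char *ref, int ref_stride,
                              int w, int h) {
        unsigned int sad = 0;
        int r, c;
        for (r = 0; r < h; r++) {
            for (c = 0; c < w; c++)
                sad += (unsigned int)abs(src[c] - ref[c]);
            src += src_stride;
            ref += ref_stride;
        }
        return sad;
    }

The MMX versions obtain the per-row absolute differences from psubusb in both directions followed by por, then widen the bytes and accumulate into mm7.
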
--- a/vp8/encoder/x86/sad_sse2.asm
+++ /dev/null
@@ -1,410 +1,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
-%include "vpx_ports/x86_abi_support.asm"
-
-;unsigned int vp9_sad16x16_wmt(
-; unsigned char *src_ptr,
-; int src_stride,
-; unsigned char *ref_ptr,
-; int ref_stride)
-global sym(vp9_sad16x16_wmt)
-sym(vp9_sad16x16_wmt):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 4
- SAVE_XMM 6
- push rsi
- push rdi
- ; end prolog
-
- mov rsi, arg(0) ;src_ptr
- mov rdi, arg(2) ;ref_ptr
-
- movsxd rax, dword ptr arg(1) ;src_stride
- movsxd rdx, dword ptr arg(3) ;ref_stride
-
- lea rcx, [rsi+rax*8]
-
- lea rcx, [rcx+rax*8]
- pxor xmm6, xmm6
-
-.x16x16sad_wmt_loop:
-
- movq xmm0, QWORD PTR [rsi]
- movq xmm2, QWORD PTR [rsi+8]
-
- movq xmm1, QWORD PTR [rdi]
- movq xmm3, QWORD PTR [rdi+8]
-
- movq xmm4, QWORD PTR [rsi+rax]
- movq xmm5, QWORD PTR [rdi+rdx]
-
-
- punpcklbw xmm0, xmm2
- punpcklbw xmm1, xmm3
-
- psadbw xmm0, xmm1
- movq xmm2, QWORD PTR [rsi+rax+8]
-
- movq xmm3, QWORD PTR [rdi+rdx+8]
- lea rsi, [rsi+rax*2]
-
- lea rdi, [rdi+rdx*2]
- punpcklbw xmm4, xmm2
-
- punpcklbw xmm5, xmm3
- psadbw xmm4, xmm5
-
- paddw xmm6, xmm0
- paddw xmm6, xmm4
-
- cmp rsi, rcx
- jne .x16x16sad_wmt_loop
-
- movq xmm0, xmm6
- psrldq xmm6, 8
-
- paddw xmm0, xmm6
- movq rax, xmm0
-
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
-
-;unsigned int vp9_sad8x16_wmt(
-; unsigned char *src_ptr,
-; int src_stride,
-; unsigned char *ref_ptr,
-; int ref_stride,
-; int max_err)
-global sym(vp9_sad8x16_wmt)
-sym(vp9_sad8x16_wmt):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 5
- push rbx
- push rsi
- push rdi
- ; end prolog
-
- mov rsi, arg(0) ;src_ptr
- mov rdi, arg(2) ;ref_ptr
-
- movsxd rbx, dword ptr arg(1) ;src_stride
- movsxd rdx, dword ptr arg(3) ;ref_stride
-
- lea rcx, [rsi+rbx*8]
-
- lea rcx, [rcx+rbx*8]
- pxor mm7, mm7
-
-.x8x16sad_wmt_loop:
-
- movq rax, mm7
- cmp eax, arg(4)
- jg .x8x16sad_wmt_early_exit
-
- movq mm0, QWORD PTR [rsi]
- movq mm1, QWORD PTR [rdi]
-
- movq mm2, QWORD PTR [rsi+rbx]
- movq mm3, QWORD PTR [rdi+rdx]
-
- psadbw mm0, mm1
- psadbw mm2, mm3
-
- lea rsi, [rsi+rbx*2]
- lea rdi, [rdi+rdx*2]
-
- paddw mm7, mm0
- paddw mm7, mm2
-
- cmp rsi, rcx
- jne .x8x16sad_wmt_loop
-
- movq rax, mm7
-
-.x8x16sad_wmt_early_exit:
-
- ; begin epilog
- pop rdi
- pop rsi
- pop rbx
- UNSHADOW_ARGS
- pop rbp
- ret
-
-
-;unsigned int vp9_sad8x8_wmt(
-; unsigned char *src_ptr,
-; int src_stride,
-; unsigned char *ref_ptr,
-; int ref_stride)
-global sym(vp9_sad8x8_wmt)
-sym(vp9_sad8x8_wmt):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 5
- push rbx
- push rsi
- push rdi
- ; end prolog
-
- mov rsi, arg(0) ;src_ptr
- mov rdi, arg(2) ;ref_ptr
-
- movsxd rbx, dword ptr arg(1) ;src_stride
- movsxd rdx, dword ptr arg(3) ;ref_stride
-
- lea rcx, [rsi+rbx*8]
- pxor mm7, mm7
-
-.x8x8sad_wmt_loop:
-
- movq rax, mm7
- cmp eax, arg(4)
- jg .x8x8sad_wmt_early_exit
-
- movq mm0, QWORD PTR [rsi]
- movq mm1, QWORD PTR [rdi]
-
- psadbw mm0, mm1
- lea rsi, [rsi+rbx]
-
- add rdi, rdx
- paddw mm7, mm0
-
- cmp rsi, rcx
- jne .x8x8sad_wmt_loop
-
- movq rax, mm7
-.x8x8sad_wmt_early_exit:
-
- ; begin epilog
- pop rdi
- pop rsi
- pop rbx
- UNSHADOW_ARGS
- pop rbp
- ret
-
-;unsigned int vp9_sad4x4_wmt(
-; unsigned char *src_ptr,
-; int src_stride,
-; unsigned char *ref_ptr,
-; int ref_stride)
-global sym(vp9_sad4x4_wmt)
-sym(vp9_sad4x4_wmt):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 4
- push rsi
- push rdi
- ; end prolog
-
- mov rsi, arg(0) ;src_ptr
- mov rdi, arg(2) ;ref_ptr
-
- movsxd rax, dword ptr arg(1) ;src_stride
- movsxd rdx, dword ptr arg(3) ;ref_stride
-
- movd mm0, DWORD PTR [rsi]
- movd mm1, DWORD PTR [rdi]
-
- movd mm2, DWORD PTR [rsi+rax]
- movd mm3, DWORD PTR [rdi+rdx]
-
- punpcklbw mm0, mm2
- punpcklbw mm1, mm3
-
- psadbw mm0, mm1
- lea rsi, [rsi+rax*2]
-
- lea rdi, [rdi+rdx*2]
- movd mm4, DWORD PTR [rsi]
-
- movd mm5, DWORD PTR [rdi]
- movd mm6, DWORD PTR [rsi+rax]
-
- movd mm7, DWORD PTR [rdi+rdx]
- punpcklbw mm4, mm6
-
- punpcklbw mm5, mm7
- psadbw mm4, mm5
-
- paddw mm0, mm4
- movq rax, mm0
-
- ; begin epilog
- pop rdi
- pop rsi
- UNSHADOW_ARGS
- pop rbp
- ret
-
-
-;unsigned int vp9_sad16x8_wmt(
-; unsigned char *src_ptr,
-; int src_stride,
-; unsigned char *ref_ptr,
-; int ref_stride)
-global sym(vp9_sad16x8_wmt)
-sym(vp9_sad16x8_wmt):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 5
- push rbx
- push rsi
- push rdi
- ; end prolog
-
-
- mov rsi, arg(0) ;src_ptr
- mov rdi, arg(2) ;ref_ptr
-
- movsxd rbx, dword ptr arg(1) ;src_stride
- movsxd rdx, dword ptr arg(3) ;ref_stride
-
- lea rcx, [rsi+rbx*8]
- pxor mm7, mm7
-
-.x16x8sad_wmt_loop:
-
- movq rax, mm7
- cmp eax, arg(4)
- jg .x16x8sad_wmt_early_exit
-
- movq mm0, QWORD PTR [rsi]
- movq mm2, QWORD PTR [rsi+8]
-
- movq mm1, QWORD PTR [rdi]
- movq mm3, QWORD PTR [rdi+8]
-
- movq mm4, QWORD PTR [rsi+rbx]
- movq mm5, QWORD PTR [rdi+rdx]
-
- psadbw mm0, mm1
- psadbw mm2, mm3
-
- movq mm1, QWORD PTR [rsi+rbx+8]
- movq mm3, QWORD PTR [rdi+rdx+8]
-
- psadbw mm4, mm5
- psadbw mm1, mm3
-
- lea rsi, [rsi+rbx*2]
- lea rdi, [rdi+rdx*2]
-
- paddw mm0, mm2
- paddw mm4, mm1
-
- paddw mm7, mm0
- paddw mm7, mm4
-
- cmp rsi, rcx
- jne .x16x8sad_wmt_loop
-
- movq rax, mm7
-
-.x16x8sad_wmt_early_exit:
-
- ; begin epilog
- pop rdi
- pop rsi
- pop rbx
- UNSHADOW_ARGS
- pop rbp
- ret
-
-;void vp9_copy32xn_sse2(
-; unsigned char *src_ptr,
-; int src_stride,
-; unsigned char *dst_ptr,
-; int dst_stride,
-; int height);
-global sym(vp9_copy32xn_sse2)
-sym(vp9_copy32xn_sse2):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 5
- SAVE_XMM 7
- push rsi
- push rdi
- ; end prolog
-
- mov rsi, arg(0) ;src_ptr
- mov rdi, arg(2) ;dst_ptr
-
- movsxd rax, dword ptr arg(1) ;src_stride
- movsxd rdx, dword ptr arg(3) ;dst_stride
- movsxd rcx, dword ptr arg(4) ;height
-
-.block_copy_sse2_loopx4:
- movdqu xmm0, XMMWORD PTR [rsi]
- movdqu xmm1, XMMWORD PTR [rsi + 16]
- movdqu xmm2, XMMWORD PTR [rsi + rax]
- movdqu xmm3, XMMWORD PTR [rsi + rax + 16]
-
- lea rsi, [rsi+rax*2]
-
- movdqu xmm4, XMMWORD PTR [rsi]
- movdqu xmm5, XMMWORD PTR [rsi + 16]
- movdqu xmm6, XMMWORD PTR [rsi + rax]
- movdqu xmm7, XMMWORD PTR [rsi + rax + 16]
-
- lea rsi, [rsi+rax*2]
-
- movdqa XMMWORD PTR [rdi], xmm0
- movdqa XMMWORD PTR [rdi + 16], xmm1
- movdqa XMMWORD PTR [rdi + rdx], xmm2
- movdqa XMMWORD PTR [rdi + rdx + 16], xmm3
-
- lea rdi, [rdi+rdx*2]
-
- movdqa XMMWORD PTR [rdi], xmm4
- movdqa XMMWORD PTR [rdi + 16], xmm5
- movdqa XMMWORD PTR [rdi + rdx], xmm6
- movdqa XMMWORD PTR [rdi + rdx + 16], xmm7
-
- lea rdi, [rdi+rdx*2]
-
- sub rcx, 4
- cmp rcx, 4
- jge .block_copy_sse2_loopx4
-
- cmp rcx, 0
- je .copy_is_done
-
-.block_copy_sse2_loop:
- movdqu xmm0, XMMWORD PTR [rsi]
- movdqu xmm1, XMMWORD PTR [rsi + 16]
- lea rsi, [rsi+rax]
-
- movdqa XMMWORD PTR [rdi], xmm0
- movdqa XMMWORD PTR [rdi + 16], xmm1
- lea rdi, [rdi+rdx]
-
- sub rcx, 1
- jne .block_copy_sse2_loop
-
-.copy_is_done:
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
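
Two details of the SSE2 file above are worth keeping in mind: the 8x16, 8x8 and 16x8 _wmt variants take an extra max_err argument and return early once the accumulated SAD exceeds it, and vp9_copy32xn_sse2 copies a 32-byte-wide column four rows at a time with a single-row mop-up loop. The copy structure, roughly (a sketch; the real routine uses unaligned 16-byte loads and aligned stores):

    #include <string.h>

    static void copy32xn_sketch(const unsigned char *src, int src_stride,
                                unsigned char *dst, int dst_stride, int height) {
        int r = height, i;
        for (; r >= 4; r -= 4) {                 /* .block_copy_sse2_loopx4 */
            for (i = 0; i < 4; i++) {
                memcpy(dst, src, 32);
                src += src_stride;
                dst += dst_stride;
            }
        }
        for (; r > 0; r--) {                     /* .block_copy_sse2_loop */
            memcpy(dst, src, 32);
            src += src_stride;
            dst += dst_stride;
        }
    }
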
--- a/vp8/encoder/x86/sad_sse3.asm
+++ /dev/null
@@ -1,960 +1,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-%include "vpx_ports/x86_abi_support.asm"
-
-%macro STACK_FRAME_CREATE_X3 0
-%if ABI_IS_32BIT
- %define src_ptr rsi
- %define src_stride rax
- %define ref_ptr rdi
- %define ref_stride rdx
- %define end_ptr rcx
- %define ret_var rbx
- %define result_ptr arg(4)
- %define max_err arg(4)
- %define height dword ptr arg(4)
- push rbp
- mov rbp, rsp
- push rsi
- push rdi
- push rbx
-
- mov rsi, arg(0) ; src_ptr
- mov rdi, arg(2) ; ref_ptr
-
- movsxd rax, dword ptr arg(1) ; src_stride
- movsxd rdx, dword ptr arg(3) ; ref_stride
-%else
- %ifidn __OUTPUT_FORMAT__,x64
- SAVE_XMM 7, u
- %define src_ptr rcx
- %define src_stride rdx
- %define ref_ptr r8
- %define ref_stride r9
- %define end_ptr r10
- %define ret_var r11
- %define result_ptr [rsp+xmm_stack_space+8+4*8]
- %define max_err [rsp+xmm_stack_space+8+4*8]
- %define height dword ptr [rsp+xmm_stack_space+8+4*8]
- %else
- %define src_ptr rdi
- %define src_stride rsi
- %define ref_ptr rdx
- %define ref_stride rcx
- %define end_ptr r9
- %define ret_var r10
- %define result_ptr r8
- %define max_err r8
- %define height r8
- %endif
-%endif
-
-%endmacro
-
-%macro STACK_FRAME_DESTROY_X3 0
- %define src_ptr
- %define src_stride
- %define ref_ptr
- %define ref_stride
- %define end_ptr
- %define ret_var
- %define result_ptr
- %define max_err
- %define height
-
-%if ABI_IS_32BIT
- pop rbx
- pop rdi
- pop rsi
- pop rbp
-%else
- %ifidn __OUTPUT_FORMAT__,x64
- RESTORE_XMM
- %endif
-%endif
- ret
-%endmacro
-
-%macro STACK_FRAME_CREATE_X4 0
-%if ABI_IS_32BIT
- %define src_ptr rsi
- %define src_stride rax
- %define r0_ptr rcx
- %define r1_ptr rdx
- %define r2_ptr rbx
- %define r3_ptr rdi
- %define ref_stride rbp
- %define result_ptr arg(4)
- push rbp
- mov rbp, rsp
- push rsi
- push rdi
- push rbx
-
- push rbp
- mov rdi, arg(2) ; ref_ptr_base
-
- LOAD_X4_ADDRESSES rdi, rcx, rdx, rax, rdi
-
- mov rsi, arg(0) ; src_ptr
-
- movsxd rbx, dword ptr arg(1) ; src_stride
- movsxd rbp, dword ptr arg(3) ; ref_stride
-
- xchg rbx, rax
-%else
- %ifidn __OUTPUT_FORMAT__,x64
- SAVE_XMM 7, u
- %define src_ptr rcx
- %define src_stride rdx
- %define r0_ptr rsi
- %define r1_ptr r10
- %define r2_ptr r11
- %define r3_ptr r8
- %define ref_stride r9
- %define result_ptr [rsp+xmm_stack_space+16+4*8]
- push rsi
-
- LOAD_X4_ADDRESSES r8, r0_ptr, r1_ptr, r2_ptr, r3_ptr
- %else
- %define src_ptr rdi
- %define src_stride rsi
- %define r0_ptr r9
- %define r1_ptr r10
- %define r2_ptr r11
- %define r3_ptr rdx
- %define ref_stride rcx
- %define result_ptr r8
-
- LOAD_X4_ADDRESSES rdx, r0_ptr, r1_ptr, r2_ptr, r3_ptr
-
- %endif
-%endif
-%endmacro
-
-%macro STACK_FRAME_DESTROY_X4 0
- %define src_ptr
- %define src_stride
- %define r0_ptr
- %define r1_ptr
- %define r2_ptr
- %define r3_ptr
- %define ref_stride
- %define result_ptr
-
-%if ABI_IS_32BIT
- pop rbx
- pop rdi
- pop rsi
- pop rbp
-%else
- %ifidn __OUTPUT_FORMAT__,x64
- pop rsi
- RESTORE_XMM
- %endif
-%endif
- ret
-%endmacro
-
-%macro PROCESS_16X2X3 5
-%if %1==0
- movdqa xmm0, XMMWORD PTR [%2]
- lddqu xmm5, XMMWORD PTR [%3]
- lddqu xmm6, XMMWORD PTR [%3+1]
- lddqu xmm7, XMMWORD PTR [%3+2]
-
- psadbw xmm5, xmm0
- psadbw xmm6, xmm0
- psadbw xmm7, xmm0
-%else
- movdqa xmm0, XMMWORD PTR [%2]
- lddqu xmm1, XMMWORD PTR [%3]
- lddqu xmm2, XMMWORD PTR [%3+1]
- lddqu xmm3, XMMWORD PTR [%3+2]
-
- psadbw xmm1, xmm0
- psadbw xmm2, xmm0
- psadbw xmm3, xmm0
-
- paddw xmm5, xmm1
- paddw xmm6, xmm2
- paddw xmm7, xmm3
-%endif
- movdqa xmm0, XMMWORD PTR [%2+%4]
- lddqu xmm1, XMMWORD PTR [%3+%5]
- lddqu xmm2, XMMWORD PTR [%3+%5+1]
- lddqu xmm3, XMMWORD PTR [%3+%5+2]
-
-%if %1==0 || %1==1
- lea %2, [%2+%4*2]
- lea %3, [%3+%5*2]
-%endif
-
- psadbw xmm1, xmm0
- psadbw xmm2, xmm0
- psadbw xmm3, xmm0
-
- paddw xmm5, xmm1
- paddw xmm6, xmm2
- paddw xmm7, xmm3
-%endmacro
-
-%macro PROCESS_8X2X3 5
-%if %1==0
- movq mm0, QWORD PTR [%2]
- movq mm5, QWORD PTR [%3]
- movq mm6, QWORD PTR [%3+1]
- movq mm7, QWORD PTR [%3+2]
-
- psadbw mm5, mm0
- psadbw mm6, mm0
- psadbw mm7, mm0
-%else
- movq mm0, QWORD PTR [%2]
- movq mm1, QWORD PTR [%3]
- movq mm2, QWORD PTR [%3+1]
- movq mm3, QWORD PTR [%3+2]
-
- psadbw mm1, mm0
- psadbw mm2, mm0
- psadbw mm3, mm0
-
- paddw mm5, mm1
- paddw mm6, mm2
- paddw mm7, mm3
-%endif
- movq mm0, QWORD PTR [%2+%4]
- movq mm1, QWORD PTR [%3+%5]
- movq mm2, QWORD PTR [%3+%5+1]
- movq mm3, QWORD PTR [%3+%5+2]
-
-%if %1==0 || %1==1
- lea %2, [%2+%4*2]
- lea %3, [%3+%5*2]
-%endif
-
- psadbw mm1, mm0
- psadbw mm2, mm0
- psadbw mm3, mm0
-
- paddw mm5, mm1
- paddw mm6, mm2
- paddw mm7, mm3
-%endmacro
-
-%macro LOAD_X4_ADDRESSES 5
- mov %2, [%1+REG_SZ_BYTES*0]
- mov %3, [%1+REG_SZ_BYTES*1]
-
- mov %4, [%1+REG_SZ_BYTES*2]
- mov %5, [%1+REG_SZ_BYTES*3]
-%endmacro
-
-%macro PROCESS_16X2X4 8
-%if %1==0
- movdqa xmm0, XMMWORD PTR [%2]
- lddqu xmm4, XMMWORD PTR [%3]
- lddqu xmm5, XMMWORD PTR [%4]
- lddqu xmm6, XMMWORD PTR [%5]
- lddqu xmm7, XMMWORD PTR [%6]
-
- psadbw xmm4, xmm0
- psadbw xmm5, xmm0
- psadbw xmm6, xmm0
- psadbw xmm7, xmm0
-%else
- movdqa xmm0, XMMWORD PTR [%2]
- lddqu xmm1, XMMWORD PTR [%3]
- lddqu xmm2, XMMWORD PTR [%4]
- lddqu xmm3, XMMWORD PTR [%5]
-
- psadbw xmm1, xmm0
- psadbw xmm2, xmm0
- psadbw xmm3, xmm0
-
- paddw xmm4, xmm1
- lddqu xmm1, XMMWORD PTR [%6]
- paddw xmm5, xmm2
- paddw xmm6, xmm3
-
- psadbw xmm1, xmm0
- paddw xmm7, xmm1
-%endif
- movdqa xmm0, XMMWORD PTR [%2+%7]
- lddqu xmm1, XMMWORD PTR [%3+%8]
- lddqu xmm2, XMMWORD PTR [%4+%8]
- lddqu xmm3, XMMWORD PTR [%5+%8]
-
- psadbw xmm1, xmm0
- psadbw xmm2, xmm0
- psadbw xmm3, xmm0
-
- paddw xmm4, xmm1
- lddqu xmm1, XMMWORD PTR [%6+%8]
- paddw xmm5, xmm2
- paddw xmm6, xmm3
-
-%if %1==0 || %1==1
- lea %2, [%2+%7*2]
- lea %3, [%3+%8*2]
-
- lea %4, [%4+%8*2]
- lea %5, [%5+%8*2]
-
- lea %6, [%6+%8*2]
-%endif
- psadbw xmm1, xmm0
- paddw xmm7, xmm1
-
-%endmacro
-
-%macro PROCESS_8X2X4 8
-%if %1==0
- movq mm0, QWORD PTR [%2]
- movq mm4, QWORD PTR [%3]
- movq mm5, QWORD PTR [%4]
- movq mm6, QWORD PTR [%5]
- movq mm7, QWORD PTR [%6]
-
- psadbw mm4, mm0
- psadbw mm5, mm0
- psadbw mm6, mm0
- psadbw mm7, mm0
-%else
- movq mm0, QWORD PTR [%2]
- movq mm1, QWORD PTR [%3]
- movq mm2, QWORD PTR [%4]
- movq mm3, QWORD PTR [%5]
-
- psadbw mm1, mm0
- psadbw mm2, mm0
- psadbw mm3, mm0
-
- paddw mm4, mm1
- movq mm1, QWORD PTR [%6]
- paddw mm5, mm2
- paddw mm6, mm3
-
- psadbw mm1, mm0
- paddw mm7, mm1
-%endif
- movq mm0, QWORD PTR [%2+%7]
- movq mm1, QWORD PTR [%3+%8]
- movq mm2, QWORD PTR [%4+%8]
- movq mm3, QWORD PTR [%5+%8]
-
- psadbw mm1, mm0
- psadbw mm2, mm0
- psadbw mm3, mm0
-
- paddw mm4, mm1
- movq mm1, QWORD PTR [%6+%8]
- paddw mm5, mm2
- paddw mm6, mm3
-
-%if %1==0 || %1==1
- lea %2, [%2+%7*2]
- lea %3, [%3+%8*2]
-
- lea %4, [%4+%8*2]
- lea %5, [%5+%8*2]
-
- lea %6, [%6+%8*2]
-%endif
- psadbw mm1, mm0
- paddw mm7, mm1
-
-%endmacro
-
-;void int vp9_sad16x16x3_sse3(
-; unsigned char *src_ptr,
-; int src_stride,
-; unsigned char *ref_ptr,
-; int ref_stride,
-; int *results)
-global sym(vp9_sad16x16x3_sse3)
-sym(vp9_sad16x16x3_sse3):
-
- STACK_FRAME_CREATE_X3
-
- PROCESS_16X2X3 0, src_ptr, ref_ptr, src_stride, ref_stride
- PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
- PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
- PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
- PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
- PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
- PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
- PROCESS_16X2X3 2, src_ptr, ref_ptr, src_stride, ref_stride
-
- mov rcx, result_ptr
-
- movq xmm0, xmm5
- psrldq xmm5, 8
-
- paddw xmm0, xmm5
- movd [rcx], xmm0
-;-
- movq xmm0, xmm6
- psrldq xmm6, 8
-
- paddw xmm0, xmm6
- movd [rcx+4], xmm0
-;-
- movq xmm0, xmm7
- psrldq xmm7, 8
-
- paddw xmm0, xmm7
- movd [rcx+8], xmm0
-
- STACK_FRAME_DESTROY_X3
-
-;void int vp9_sad16x8x3_sse3(
-; unsigned char *src_ptr,
-; int src_stride,
-; unsigned char *ref_ptr,
-; int ref_stride,
-; int *results)
-global sym(vp9_sad16x8x3_sse3)
-sym(vp9_sad16x8x3_sse3):
-
- STACK_FRAME_CREATE_X3
-
- PROCESS_16X2X3 0, src_ptr, ref_ptr, src_stride, ref_stride
- PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
- PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
- PROCESS_16X2X3 2, src_ptr, ref_ptr, src_stride, ref_stride
-
- mov rcx, result_ptr
-
- movq xmm0, xmm5
- psrldq xmm5, 8
-
- paddw xmm0, xmm5
- movd [rcx], xmm0
-;-
- movq xmm0, xmm6
- psrldq xmm6, 8
-
- paddw xmm0, xmm6
- movd [rcx+4], xmm0
-;-
- movq xmm0, xmm7
- psrldq xmm7, 8
-
- paddw xmm0, xmm7
- movd [rcx+8], xmm0
-
- STACK_FRAME_DESTROY_X3
-
-;void int vp9_sad8x16x3_sse3(
-; unsigned char *src_ptr,
-; int src_stride,
-; unsigned char *ref_ptr,
-; int ref_stride,
-; int *results)
-global sym(vp9_sad8x16x3_sse3)
-sym(vp9_sad8x16x3_sse3):
-
- STACK_FRAME_CREATE_X3
-
- PROCESS_8X2X3 0, src_ptr, ref_ptr, src_stride, ref_stride
- PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
- PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
- PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
- PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
- PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
- PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
- PROCESS_8X2X3 2, src_ptr, ref_ptr, src_stride, ref_stride
-
- mov rcx, result_ptr
-
- punpckldq mm5, mm6
-
- movq [rcx], mm5
- movd [rcx+8], mm7
-
- STACK_FRAME_DESTROY_X3
-
-;void int vp9_sad8x8x3_sse3(
-; unsigned char *src_ptr,
-; int src_stride,
-; unsigned char *ref_ptr,
-; int ref_stride,
-; int *results)
-global sym(vp9_sad8x8x3_sse3)
-sym(vp9_sad8x8x3_sse3):
-
- STACK_FRAME_CREATE_X3
-
- PROCESS_8X2X3 0, src_ptr, ref_ptr, src_stride, ref_stride
- PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
- PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
- PROCESS_8X2X3 2, src_ptr, ref_ptr, src_stride, ref_stride
-
- mov rcx, result_ptr
-
- punpckldq mm5, mm6
-
- movq [rcx], mm5
- movd [rcx+8], mm7
-
- STACK_FRAME_DESTROY_X3
-
-;void int vp9_sad4x4x3_sse3(
-; unsigned char *src_ptr,
-; int src_stride,
-; unsigned char *ref_ptr,
-; int ref_stride,
-; int *results)
-global sym(vp9_sad4x4x3_sse3)
-sym(vp9_sad4x4x3_sse3):
-
- STACK_FRAME_CREATE_X3
-
- movd mm0, DWORD PTR [src_ptr]
- movd mm1, DWORD PTR [ref_ptr]
-
- movd mm2, DWORD PTR [src_ptr+src_stride]
- movd mm3, DWORD PTR [ref_ptr+ref_stride]
-
- punpcklbw mm0, mm2
- punpcklbw mm1, mm3
-
- movd mm4, DWORD PTR [ref_ptr+1]
- movd mm5, DWORD PTR [ref_ptr+2]
-
- movd mm2, DWORD PTR [ref_ptr+ref_stride+1]
- movd mm3, DWORD PTR [ref_ptr+ref_stride+2]
-
- psadbw mm1, mm0
-
- punpcklbw mm4, mm2
- punpcklbw mm5, mm3
-
- psadbw mm4, mm0
- psadbw mm5, mm0
-
- lea src_ptr, [src_ptr+src_stride*2]
- lea ref_ptr, [ref_ptr+ref_stride*2]
-
- movd mm0, DWORD PTR [src_ptr]
- movd mm2, DWORD PTR [ref_ptr]
-
- movd mm3, DWORD PTR [src_ptr+src_stride]
- movd mm6, DWORD PTR [ref_ptr+ref_stride]
-
- punpcklbw mm0, mm3
- punpcklbw mm2, mm6
-
- movd mm3, DWORD PTR [ref_ptr+1]
- movd mm7, DWORD PTR [ref_ptr+2]
-
- psadbw mm2, mm0
-
- paddw mm1, mm2
-
- movd mm2, DWORD PTR [ref_ptr+ref_stride+1]
- movd mm6, DWORD PTR [ref_ptr+ref_stride+2]
-
- punpcklbw mm3, mm2
- punpcklbw mm7, mm6
-
- psadbw mm3, mm0
- psadbw mm7, mm0
-
- paddw mm3, mm4
- paddw mm7, mm5
-
- mov rcx, result_ptr
-
- punpckldq mm1, mm3
-
- movq [rcx], mm1
- movd [rcx+8], mm7
-
- STACK_FRAME_DESTROY_X3
-
-;unsigned int vp9_sad16x16_sse3(
-; unsigned char *src_ptr,
-; int src_stride,
-; unsigned char *ref_ptr,
-; int ref_stride,
-; int max_err)
-;%define lddqu movdqu
-global sym(vp9_sad16x16_sse3)
-sym(vp9_sad16x16_sse3):
-
- STACK_FRAME_CREATE_X3
-
- mov end_ptr, 4
- pxor xmm7, xmm7
-
-.vp9_sad16x16_sse3_loop:
- movdqa xmm0, XMMWORD PTR [src_ptr]
- movdqu xmm1, XMMWORD PTR [ref_ptr]
- movdqa xmm2, XMMWORD PTR [src_ptr+src_stride]
- movdqu xmm3, XMMWORD PTR [ref_ptr+ref_stride]
-
- lea src_ptr, [src_ptr+src_stride*2]
- lea ref_ptr, [ref_ptr+ref_stride*2]
-
- movdqa xmm4, XMMWORD PTR [src_ptr]
- movdqu xmm5, XMMWORD PTR [ref_ptr]
- movdqa xmm6, XMMWORD PTR [src_ptr+src_stride]
-
- psadbw xmm0, xmm1
-
- movdqu xmm1, XMMWORD PTR [ref_ptr+ref_stride]
-
- psadbw xmm2, xmm3
- psadbw xmm4, xmm5
- psadbw xmm6, xmm1
-
- lea src_ptr, [src_ptr+src_stride*2]
- lea ref_ptr, [ref_ptr+ref_stride*2]
-
- paddw xmm7, xmm0
- paddw xmm7, xmm2
- paddw xmm7, xmm4
- paddw xmm7, xmm6
-
- sub end_ptr, 1
- jne .vp9_sad16x16_sse3_loop
-
- movq xmm0, xmm7
- psrldq xmm7, 8
- paddw xmm0, xmm7
- movq rax, xmm0
-
- STACK_FRAME_DESTROY_X3
-
-;void vp9_copy32xn_sse3(
-; unsigned char *src_ptr,
-; int src_stride,
-; unsigned char *dst_ptr,
-; int dst_stride,
-; int height);
-global sym(vp9_copy32xn_sse3)
-sym(vp9_copy32xn_sse3):
-
- STACK_FRAME_CREATE_X3
-
-.block_copy_sse3_loopx4:
- lea end_ptr, [src_ptr+src_stride*2]
-
- movdqu xmm0, XMMWORD PTR [src_ptr]
- movdqu xmm1, XMMWORD PTR [src_ptr + 16]
- movdqu xmm2, XMMWORD PTR [src_ptr + src_stride]
- movdqu xmm3, XMMWORD PTR [src_ptr + src_stride + 16]
- movdqu xmm4, XMMWORD PTR [end_ptr]
- movdqu xmm5, XMMWORD PTR [end_ptr + 16]
- movdqu xmm6, XMMWORD PTR [end_ptr + src_stride]
- movdqu xmm7, XMMWORD PTR [end_ptr + src_stride + 16]
-
- lea src_ptr, [src_ptr+src_stride*4]
-
- lea end_ptr, [ref_ptr+ref_stride*2]
-
- movdqa XMMWORD PTR [ref_ptr], xmm0
- movdqa XMMWORD PTR [ref_ptr + 16], xmm1
- movdqa XMMWORD PTR [ref_ptr + ref_stride], xmm2
- movdqa XMMWORD PTR [ref_ptr + ref_stride + 16], xmm3
- movdqa XMMWORD PTR [end_ptr], xmm4
- movdqa XMMWORD PTR [end_ptr + 16], xmm5
- movdqa XMMWORD PTR [end_ptr + ref_stride], xmm6
- movdqa XMMWORD PTR [end_ptr + ref_stride + 16], xmm7
-
- lea ref_ptr, [ref_ptr+ref_stride*4]
-
- sub height, 4
- cmp height, 4
- jge .block_copy_sse3_loopx4
-
- ;Check to see if there is more rows need to be copied.
- cmp height, 0
- je .copy_is_done
-
-.block_copy_sse3_loop:
- movdqu xmm0, XMMWORD PTR [src_ptr]
- movdqu xmm1, XMMWORD PTR [src_ptr + 16]
- lea src_ptr, [src_ptr+src_stride]
-
- movdqa XMMWORD PTR [ref_ptr], xmm0
- movdqa XMMWORD PTR [ref_ptr + 16], xmm1
- lea ref_ptr, [ref_ptr+ref_stride]
-
- sub height, 1
- jne .block_copy_sse3_loop
-
-.copy_is_done:
- STACK_FRAME_DESTROY_X3
-
-;void vp9_sad16x16x4d_sse3(
-; unsigned char *src_ptr,
-; int src_stride,
-; unsigned char *ref_ptr_base,
-; int ref_stride,
-; int *results)
-global sym(vp9_sad16x16x4d_sse3)
-sym(vp9_sad16x16x4d_sse3):
-
- STACK_FRAME_CREATE_X4
-
- PROCESS_16X2X4 0, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
- PROCESS_16X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
- PROCESS_16X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
- PROCESS_16X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
- PROCESS_16X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
- PROCESS_16X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
- PROCESS_16X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
- PROCESS_16X2X4 2, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
-
-%if ABI_IS_32BIT
- pop rbp
-%endif
- mov rcx, result_ptr
-
- movq xmm0, xmm4
- psrldq xmm4, 8
-
- paddw xmm0, xmm4
- movd [rcx], xmm0
-;-
- movq xmm0, xmm5
- psrldq xmm5, 8
-
- paddw xmm0, xmm5
- movd [rcx+4], xmm0
-;-
- movq xmm0, xmm6
- psrldq xmm6, 8
-
- paddw xmm0, xmm6
- movd [rcx+8], xmm0
-;-
- movq xmm0, xmm7
- psrldq xmm7, 8
-
- paddw xmm0, xmm7
- movd [rcx+12], xmm0
-
- STACK_FRAME_DESTROY_X4
-
-;void vp9_sad16x8x4d_sse3(
-; unsigned char *src_ptr,
-; int src_stride,
-; unsigned char *ref_ptr_base,
-; int ref_stride,
-; int *results)
-global sym(vp9_sad16x8x4d_sse3)
-sym(vp9_sad16x8x4d_sse3):
-
- STACK_FRAME_CREATE_X4
-
- PROCESS_16X2X4 0, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
- PROCESS_16X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
- PROCESS_16X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
- PROCESS_16X2X4 2, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
-
-%if ABI_IS_32BIT
- pop rbp
-%endif
- mov rcx, result_ptr
-
- movq xmm0, xmm4
- psrldq xmm4, 8
-
- paddw xmm0, xmm4
- movd [rcx], xmm0
-;-
- movq xmm0, xmm5
- psrldq xmm5, 8
-
- paddw xmm0, xmm5
- movd [rcx+4], xmm0
-;-
- movq xmm0, xmm6
- psrldq xmm6, 8
-
- paddw xmm0, xmm6
- movd [rcx+8], xmm0
-;-
- movq xmm0, xmm7
- psrldq xmm7, 8
-
- paddw xmm0, xmm7
- movd [rcx+12], xmm0
-
- STACK_FRAME_DESTROY_X4
-
-;void int vp9_sad8x16x4d_sse3(
-; unsigned char *src_ptr,
-; int src_stride,
-; unsigned char *ref_ptr,
-; int ref_stride,
-; int *results)
-global sym(vp9_sad8x16x4d_sse3)
-sym(vp9_sad8x16x4d_sse3):
-
- STACK_FRAME_CREATE_X4
-
- PROCESS_8X2X4 0, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
- PROCESS_8X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
- PROCESS_8X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
- PROCESS_8X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
- PROCESS_8X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
- PROCESS_8X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
- PROCESS_8X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
- PROCESS_8X2X4 2, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
-
-%if ABI_IS_32BIT
- pop rbp
-%endif
- mov rcx, result_ptr
-
- punpckldq mm4, mm5
- punpckldq mm6, mm7
-
- movq [rcx], mm4
- movq [rcx+8], mm6
-
- STACK_FRAME_DESTROY_X4
-
-;void int vp9_sad8x8x4d_sse3(
-; unsigned char *src_ptr,
-; int src_stride,
-; unsigned char *ref_ptr,
-; int ref_stride,
-; int *results)
-global sym(vp9_sad8x8x4d_sse3)
-sym(vp9_sad8x8x4d_sse3):
-
- STACK_FRAME_CREATE_X4
-
- PROCESS_8X2X4 0, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
- PROCESS_8X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
- PROCESS_8X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
- PROCESS_8X2X4 2, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
-
-%if ABI_IS_32BIT
- pop rbp
-%endif
- mov rcx, result_ptr
-
- punpckldq mm4, mm5
- punpckldq mm6, mm7
-
- movq [rcx], mm4
- movq [rcx+8], mm6
-
- STACK_FRAME_DESTROY_X4
-
-;void int vp9_sad4x4x4d_sse3(
-; unsigned char *src_ptr,
-; int src_stride,
-; unsigned char *ref_ptr,
-; int ref_stride,
-; int *results)
-global sym(vp9_sad4x4x4d_sse3)
-sym(vp9_sad4x4x4d_sse3):
-
- STACK_FRAME_CREATE_X4
-
- movd mm0, DWORD PTR [src_ptr]
- movd mm1, DWORD PTR [r0_ptr]
-
- movd mm2, DWORD PTR [src_ptr+src_stride]
- movd mm3, DWORD PTR [r0_ptr+ref_stride]
-
- punpcklbw mm0, mm2
- punpcklbw mm1, mm3
-
- movd mm4, DWORD PTR [r1_ptr]
- movd mm5, DWORD PTR [r2_ptr]
-
- movd mm6, DWORD PTR [r3_ptr]
- movd mm2, DWORD PTR [r1_ptr+ref_stride]
-
- movd mm3, DWORD PTR [r2_ptr+ref_stride]
- movd mm7, DWORD PTR [r3_ptr+ref_stride]
-
- psadbw mm1, mm0
-
- punpcklbw mm4, mm2
- punpcklbw mm5, mm3
-
- punpcklbw mm6, mm7
- psadbw mm4, mm0
-
- psadbw mm5, mm0
- psadbw mm6, mm0
-
-
-
- lea src_ptr, [src_ptr+src_stride*2]
- lea r0_ptr, [r0_ptr+ref_stride*2]
-
- lea r1_ptr, [r1_ptr+ref_stride*2]
- lea r2_ptr, [r2_ptr+ref_stride*2]
-
- lea r3_ptr, [r3_ptr+ref_stride*2]
-
- movd mm0, DWORD PTR [src_ptr]
- movd mm2, DWORD PTR [r0_ptr]
-
- movd mm3, DWORD PTR [src_ptr+src_stride]
- movd mm7, DWORD PTR [r0_ptr+ref_stride]
-
- punpcklbw mm0, mm3
- punpcklbw mm2, mm7
-
- movd mm3, DWORD PTR [r1_ptr]
- movd mm7, DWORD PTR [r2_ptr]
-
- psadbw mm2, mm0
-%if ABI_IS_32BIT
- mov rax, rbp
-
- pop rbp
-%define ref_stride rax
-%endif
- mov rsi, result_ptr
-
- paddw mm1, mm2
- movd [rsi], mm1
-
- movd mm2, DWORD PTR [r1_ptr+ref_stride]
- movd mm1, DWORD PTR [r2_ptr+ref_stride]
-
- punpcklbw mm3, mm2
- punpcklbw mm7, mm1
-
- psadbw mm3, mm0
- psadbw mm7, mm0
-
- movd mm2, DWORD PTR [r3_ptr]
- movd mm1, DWORD PTR [r3_ptr+ref_stride]
-
- paddw mm3, mm4
- paddw mm7, mm5
-
- movd [rsi+4], mm3
- punpcklbw mm2, mm1
-
- movd [rsi+8], mm7
- psadbw mm2, mm0
-
- paddw mm2, mm6
- movd [rsi+12], mm2
-
-
- STACK_FRAME_DESTROY_X4
-
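
The SSE3 file above bundles several motion-search helpers: the *x3 routines return the SADs of the source block against ref_ptr, ref_ptr+1 and ref_ptr+2 (three horizontally adjacent candidates, as the lddqu offsets in PROCESS_16X2X3 show), and the *x4d routines return the SADs against four independent reference pointers loaded through LOAD_X4_ADDRESSES. In terms of the scalar sad_c sketched after sad_mmx.asm above, roughly:

    /* three adjacent candidates (the *x3 routines) */
    static void sad_x3_sketch(const unsigned char *src, int src_stride,
                              const unsigned char *ref, int ref_stride,
                              int w, int h, unsigned int results[3]) {
        int off;
        for (off = 0; off < 3; off++)
            results[off] = sad_c(src, src_stride, ref + off, ref_stride, w, h);
    }

    /* four independent candidates (the *x4d routines) */
    static void sad_x4d_sketch(const unsigned char *src, int src_stride,
                               const unsigned char *const ref[4], int ref_stride,
                               int w, int h, unsigned int results[4]) {
        int i;
        for (i = 0; i < 4; i++)
            results[i] = sad_c(src, src_stride, ref[i], ref_stride, w, h);
    }
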
--- a/vp8/encoder/x86/sad_sse4.asm
+++ /dev/null
@@ -1,353 +1,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
-%include "vpx_ports/x86_abi_support.asm"
-
-%macro PROCESS_16X2X8 1
-%if %1
- movdqa xmm0, XMMWORD PTR [rsi]
- movq xmm1, MMWORD PTR [rdi]
- movq xmm3, MMWORD PTR [rdi+8]
- movq xmm2, MMWORD PTR [rdi+16]
- punpcklqdq xmm1, xmm3
- punpcklqdq xmm3, xmm2
-
- movdqa xmm2, xmm1
- mpsadbw xmm1, xmm0, 0x0
- mpsadbw xmm2, xmm0, 0x5
-
- psrldq xmm0, 8
-
- movdqa xmm4, xmm3
- mpsadbw xmm3, xmm0, 0x0
- mpsadbw xmm4, xmm0, 0x5
-
- paddw xmm1, xmm2
- paddw xmm1, xmm3
- paddw xmm1, xmm4
-%else
- movdqa xmm0, XMMWORD PTR [rsi]
- movq xmm5, MMWORD PTR [rdi]
- movq xmm3, MMWORD PTR [rdi+8]
- movq xmm2, MMWORD PTR [rdi+16]
- punpcklqdq xmm5, xmm3
- punpcklqdq xmm3, xmm2
-
- movdqa xmm2, xmm5
- mpsadbw xmm5, xmm0, 0x0
- mpsadbw xmm2, xmm0, 0x5
-
- psrldq xmm0, 8
-
- movdqa xmm4, xmm3
- mpsadbw xmm3, xmm0, 0x0
- mpsadbw xmm4, xmm0, 0x5
-
- paddw xmm5, xmm2
- paddw xmm5, xmm3
- paddw xmm5, xmm4
-
- paddw xmm1, xmm5
-%endif
- movdqa xmm0, XMMWORD PTR [rsi + rax]
- movq xmm5, MMWORD PTR [rdi+ rdx]
- movq xmm3, MMWORD PTR [rdi+ rdx+8]
- movq xmm2, MMWORD PTR [rdi+ rdx+16]
- punpcklqdq xmm5, xmm3
- punpcklqdq xmm3, xmm2
-
- lea rsi, [rsi+rax*2]
- lea rdi, [rdi+rdx*2]
-
- movdqa xmm2, xmm5
- mpsadbw xmm5, xmm0, 0x0
- mpsadbw xmm2, xmm0, 0x5
-
- psrldq xmm0, 8
- movdqa xmm4, xmm3
- mpsadbw xmm3, xmm0, 0x0
- mpsadbw xmm4, xmm0, 0x5
-
- paddw xmm5, xmm2
- paddw xmm5, xmm3
- paddw xmm5, xmm4
-
- paddw xmm1, xmm5
-%endmacro
-
-%macro PROCESS_8X2X8 1
-%if %1
- movq xmm0, MMWORD PTR [rsi]
- movq xmm1, MMWORD PTR [rdi]
- movq xmm3, MMWORD PTR [rdi+8]
- punpcklqdq xmm1, xmm3
-
- movdqa xmm2, xmm1
- mpsadbw xmm1, xmm0, 0x0
- mpsadbw xmm2, xmm0, 0x5
- paddw xmm1, xmm2
-%else
- movq xmm0, MMWORD PTR [rsi]
- movq xmm5, MMWORD PTR [rdi]
- movq xmm3, MMWORD PTR [rdi+8]
- punpcklqdq xmm5, xmm3
-
- movdqa xmm2, xmm5
- mpsadbw xmm5, xmm0, 0x0
- mpsadbw xmm2, xmm0, 0x5
- paddw xmm5, xmm2
-
- paddw xmm1, xmm5
-%endif
- movq xmm0, MMWORD PTR [rsi + rax]
- movq xmm5, MMWORD PTR [rdi+ rdx]
- movq xmm3, MMWORD PTR [rdi+ rdx+8]
- punpcklqdq xmm5, xmm3
-
- lea rsi, [rsi+rax*2]
- lea rdi, [rdi+rdx*2]
-
- movdqa xmm2, xmm5
- mpsadbw xmm5, xmm0, 0x0
- mpsadbw xmm2, xmm0, 0x5
- paddw xmm5, xmm2
-
- paddw xmm1, xmm5
-%endmacro
-
-%macro PROCESS_4X2X8 1
-%if %1
- movd xmm0, [rsi]
- movq xmm1, MMWORD PTR [rdi]
- movq xmm3, MMWORD PTR [rdi+8]
- punpcklqdq xmm1, xmm3
-
- mpsadbw xmm1, xmm0, 0x0
-%else
- movd xmm0, [rsi]
- movq xmm5, MMWORD PTR [rdi]
- movq xmm3, MMWORD PTR [rdi+8]
- punpcklqdq xmm5, xmm3
-
- mpsadbw xmm5, xmm0, 0x0
-
- paddw xmm1, xmm5
-%endif
- movd xmm0, [rsi + rax]
- movq xmm5, MMWORD PTR [rdi+ rdx]
- movq xmm3, MMWORD PTR [rdi+ rdx+8]
- punpcklqdq xmm5, xmm3
-
- lea rsi, [rsi+rax*2]
- lea rdi, [rdi+rdx*2]
-
- mpsadbw xmm5, xmm0, 0x0
-
- paddw xmm1, xmm5
-%endmacro
-
-
-;void vp9_sad16x16x8_sse4(
-; const unsigned char *src_ptr,
-; int src_stride,
-; const unsigned char *ref_ptr,
-; int ref_stride,
-; unsigned short *sad_array);
-global sym(vp9_sad16x16x8_sse4)
-sym(vp9_sad16x16x8_sse4):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 5
- push rsi
- push rdi
- ; end prolog
-
- mov rsi, arg(0) ;src_ptr
- mov rdi, arg(2) ;ref_ptr
-
- movsxd rax, dword ptr arg(1) ;src_stride
- movsxd rdx, dword ptr arg(3) ;ref_stride
-
- PROCESS_16X2X8 1
- PROCESS_16X2X8 0
- PROCESS_16X2X8 0
- PROCESS_16X2X8 0
- PROCESS_16X2X8 0
- PROCESS_16X2X8 0
- PROCESS_16X2X8 0
- PROCESS_16X2X8 0
-
- mov rdi, arg(4) ;Results
- movdqa XMMWORD PTR [rdi], xmm1
-
- ; begin epilog
- pop rdi
- pop rsi
- UNSHADOW_ARGS
- pop rbp
- ret
-
-
-;void vp9_sad16x8x8_sse4(
-; const unsigned char *src_ptr,
-; int src_stride,
-; const unsigned char *ref_ptr,
-; int ref_stride,
-; unsigned short *sad_array
-;);
-global sym(vp9_sad16x8x8_sse4)
-sym(vp9_sad16x8x8_sse4):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 5
- push rsi
- push rdi
- ; end prolog
-
- mov rsi, arg(0) ;src_ptr
- mov rdi, arg(2) ;ref_ptr
-
- movsxd rax, dword ptr arg(1) ;src_stride
- movsxd rdx, dword ptr arg(3) ;ref_stride
-
- PROCESS_16X2X8 1
- PROCESS_16X2X8 0
- PROCESS_16X2X8 0
- PROCESS_16X2X8 0
-
- mov rdi, arg(4) ;Results
- movdqa XMMWORD PTR [rdi], xmm1
-
- ; begin epilog
- pop rdi
- pop rsi
- UNSHADOW_ARGS
- pop rbp
- ret
-
-
-;void vp9_sad8x8x8_sse4(
-; const unsigned char *src_ptr,
-; int src_stride,
-; const unsigned char *ref_ptr,
-; int ref_stride,
-; unsigned short *sad_array
-;);
-global sym(vp9_sad8x8x8_sse4)
-sym(vp9_sad8x8x8_sse4):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 5
- push rsi
- push rdi
- ; end prolog
-
- mov rsi, arg(0) ;src_ptr
- mov rdi, arg(2) ;ref_ptr
-
- movsxd rax, dword ptr arg(1) ;src_stride
- movsxd rdx, dword ptr arg(3) ;ref_stride
-
- PROCESS_8X2X8 1
- PROCESS_8X2X8 0
- PROCESS_8X2X8 0
- PROCESS_8X2X8 0
-
- mov rdi, arg(4) ;Results
- movdqa XMMWORD PTR [rdi], xmm1
-
- ; begin epilog
- pop rdi
- pop rsi
- UNSHADOW_ARGS
- pop rbp
- ret
-
-
-;void vp9_sad8x16x8_sse4(
-; const unsigned char *src_ptr,
-; int src_stride,
-; const unsigned char *ref_ptr,
-; int ref_stride,
-; unsigned short *sad_array
-;);
-global sym(vp9_sad8x16x8_sse4)
-sym(vp9_sad8x16x8_sse4):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 5
- push rsi
- push rdi
- ; end prolog
-
- mov rsi, arg(0) ;src_ptr
- mov rdi, arg(2) ;ref_ptr
-
- movsxd rax, dword ptr arg(1) ;src_stride
- movsxd rdx, dword ptr arg(3) ;ref_stride
-
- PROCESS_8X2X8 1
- PROCESS_8X2X8 0
- PROCESS_8X2X8 0
- PROCESS_8X2X8 0
- PROCESS_8X2X8 0
- PROCESS_8X2X8 0
- PROCESS_8X2X8 0
- PROCESS_8X2X8 0
- mov rdi, arg(4) ;Results
- movdqa XMMWORD PTR [rdi], xmm1
-
- ; begin epilog
- pop rdi
- pop rsi
- UNSHADOW_ARGS
- pop rbp
- ret
-
-
-;void vp9_sad4x4x8_c(
-; const unsigned char *src_ptr,
-; int src_stride,
-; const unsigned char *ref_ptr,
-; int ref_stride,
-; unsigned short *sad_array
-;);
-global sym(vp9_sad4x4x8_sse4)
-sym(vp9_sad4x4x8_sse4):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 5
- push rsi
- push rdi
- ; end prolog
-
- mov rsi, arg(0) ;src_ptr
- mov rdi, arg(2) ;ref_ptr
-
- movsxd rax, dword ptr arg(1) ;src_stride
- movsxd rdx, dword ptr arg(3) ;ref_stride
-
- PROCESS_4X2X8 1
- PROCESS_4X2X8 0
-
- mov rdi, arg(4) ;Results
- movdqa XMMWORD PTR [rdi], xmm1
-
- ; begin epilog
- pop rdi
- pop rsi
- UNSHADOW_ARGS
- pop rbp
- ret
-
-
-
-
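
The SSE4.1 routines above extend the same idea to eight adjacent candidates: each writes, into a 16-bit array, the SAD of the source block against the reference at horizontal offsets 0 through 7. mpsadbw produces eight such partial SADs (over a four-byte source group) per instruction, and the routines accumulate them over the block. In terms of the earlier scalar sad_c sketch, the net effect is roughly:

    static void sad_x8_sketch(const unsigned char *src, int src_stride,
                              const unsigned char *ref, int ref_stride,
                              int w, int h, unsigned short sad_array[8]) {
        int off;
        for (off = 0; off < 8; off++)
            sad_array[off] = (unsigned short)sad_c(src, src_stride,
                                                   ref + off, ref_stride, w, h);
    }
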
--- a/vp8/encoder/x86/sad_ssse3.asm
+++ /dev/null
@@ -1,370 +1,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
-%include "vpx_ports/x86_abi_support.asm"
-
-%macro PROCESS_16X2X3 1
-%if %1
- movdqa xmm0, XMMWORD PTR [rsi]
- lddqu xmm5, XMMWORD PTR [rdi]
- lddqu xmm6, XMMWORD PTR [rdi+1]
- lddqu xmm7, XMMWORD PTR [rdi+2]
-
- psadbw xmm5, xmm0
- psadbw xmm6, xmm0
- psadbw xmm7, xmm0
-%else
- movdqa xmm0, XMMWORD PTR [rsi]
- lddqu xmm1, XMMWORD PTR [rdi]
- lddqu xmm2, XMMWORD PTR [rdi+1]
- lddqu xmm3, XMMWORD PTR [rdi+2]
-
- psadbw xmm1, xmm0
- psadbw xmm2, xmm0
- psadbw xmm3, xmm0
-
- paddw xmm5, xmm1
- paddw xmm6, xmm2
- paddw xmm7, xmm3
-%endif
- movdqa xmm0, XMMWORD PTR [rsi+rax]
- lddqu xmm1, XMMWORD PTR [rdi+rdx]
- lddqu xmm2, XMMWORD PTR [rdi+rdx+1]
- lddqu xmm3, XMMWORD PTR [rdi+rdx+2]
-
- lea rsi, [rsi+rax*2]
- lea rdi, [rdi+rdx*2]
-
- psadbw xmm1, xmm0
- psadbw xmm2, xmm0
- psadbw xmm3, xmm0
-
- paddw xmm5, xmm1
- paddw xmm6, xmm2
- paddw xmm7, xmm3
-%endmacro
-
-%macro PROCESS_16X2X3_OFFSET 2
-%if %1
- movdqa xmm0, XMMWORD PTR [rsi]
- movdqa xmm4, XMMWORD PTR [rdi]
- movdqa xmm7, XMMWORD PTR [rdi+16]
-
- movdqa xmm5, xmm7
- palignr xmm5, xmm4, %2
-
- movdqa xmm6, xmm7
- palignr xmm6, xmm4, (%2+1)
-
- palignr xmm7, xmm4, (%2+2)
-
- psadbw xmm5, xmm0
- psadbw xmm6, xmm0
- psadbw xmm7, xmm0
-%else
- movdqa xmm0, XMMWORD PTR [rsi]
- movdqa xmm4, XMMWORD PTR [rdi]
- movdqa xmm3, XMMWORD PTR [rdi+16]
-
- movdqa xmm1, xmm3
- palignr xmm1, xmm4, %2
-
- movdqa xmm2, xmm3
- palignr xmm2, xmm4, (%2+1)
-
- palignr xmm3, xmm4, (%2+2)
-
- psadbw xmm1, xmm0
- psadbw xmm2, xmm0
- psadbw xmm3, xmm0
-
- paddw xmm5, xmm1
- paddw xmm6, xmm2
- paddw xmm7, xmm3
-%endif
- movdqa xmm0, XMMWORD PTR [rsi+rax]
- movdqa xmm4, XMMWORD PTR [rdi+rdx]
- movdqa xmm3, XMMWORD PTR [rdi+rdx+16]
-
- movdqa xmm1, xmm3
- palignr xmm1, xmm4, %2
-
- movdqa xmm2, xmm3
- palignr xmm2, xmm4, (%2+1)
-
- palignr xmm3, xmm4, (%2+2)
-
- lea rsi, [rsi+rax*2]
- lea rdi, [rdi+rdx*2]
-
- psadbw xmm1, xmm0
- psadbw xmm2, xmm0
- psadbw xmm3, xmm0
-
- paddw xmm5, xmm1
- paddw xmm6, xmm2
- paddw xmm7, xmm3
-%endmacro
-
-%macro PROCESS_16X16X3_OFFSET 2
-%2_aligned_by_%1:
-
- sub rdi, %1
-
- PROCESS_16X2X3_OFFSET 1, %1
- PROCESS_16X2X3_OFFSET 0, %1
- PROCESS_16X2X3_OFFSET 0, %1
- PROCESS_16X2X3_OFFSET 0, %1
- PROCESS_16X2X3_OFFSET 0, %1
- PROCESS_16X2X3_OFFSET 0, %1
- PROCESS_16X2X3_OFFSET 0, %1
- PROCESS_16X2X3_OFFSET 0, %1
-
- jmp %2_store_off
-
-%endmacro
-
-%macro PROCESS_16X8X3_OFFSET 2
-%2_aligned_by_%1:
-
- sub rdi, %1
-
- PROCESS_16X2X3_OFFSET 1, %1
- PROCESS_16X2X3_OFFSET 0, %1
- PROCESS_16X2X3_OFFSET 0, %1
- PROCESS_16X2X3_OFFSET 0, %1
-
- jmp %2_store_off
-
-%endmacro
-
-;void vp9_sad16x16x3_ssse3(
-; unsigned char *src_ptr,
-; int src_stride,
-; unsigned char *ref_ptr,
-; int ref_stride,
-; int *results)
-global sym(vp9_sad16x16x3_ssse3)
-sym(vp9_sad16x16x3_ssse3):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 5
- SAVE_XMM 7
- push rsi
- push rdi
- push rcx
- ; end prolog
-
- mov rsi, arg(0) ;src_ptr
- mov rdi, arg(2) ;ref_ptr
-
- mov rdx, 0xf
- and rdx, rdi
-
- jmp .vp9_sad16x16x3_ssse3_skiptable
-.vp9_sad16x16x3_ssse3_jumptable:
- dd .vp9_sad16x16x3_ssse3_aligned_by_0 - .vp9_sad16x16x3_ssse3_do_jump
- dd .vp9_sad16x16x3_ssse3_aligned_by_1 - .vp9_sad16x16x3_ssse3_do_jump
- dd .vp9_sad16x16x3_ssse3_aligned_by_2 - .vp9_sad16x16x3_ssse3_do_jump
- dd .vp9_sad16x16x3_ssse3_aligned_by_3 - .vp9_sad16x16x3_ssse3_do_jump
- dd .vp9_sad16x16x3_ssse3_aligned_by_4 - .vp9_sad16x16x3_ssse3_do_jump
- dd .vp9_sad16x16x3_ssse3_aligned_by_5 - .vp9_sad16x16x3_ssse3_do_jump
- dd .vp9_sad16x16x3_ssse3_aligned_by_6 - .vp9_sad16x16x3_ssse3_do_jump
- dd .vp9_sad16x16x3_ssse3_aligned_by_7 - .vp9_sad16x16x3_ssse3_do_jump
- dd .vp9_sad16x16x3_ssse3_aligned_by_8 - .vp9_sad16x16x3_ssse3_do_jump
- dd .vp9_sad16x16x3_ssse3_aligned_by_9 - .vp9_sad16x16x3_ssse3_do_jump
- dd .vp9_sad16x16x3_ssse3_aligned_by_10 - .vp9_sad16x16x3_ssse3_do_jump
- dd .vp9_sad16x16x3_ssse3_aligned_by_11 - .vp9_sad16x16x3_ssse3_do_jump
- dd .vp9_sad16x16x3_ssse3_aligned_by_12 - .vp9_sad16x16x3_ssse3_do_jump
- dd .vp9_sad16x16x3_ssse3_aligned_by_13 - .vp9_sad16x16x3_ssse3_do_jump
- dd .vp9_sad16x16x3_ssse3_aligned_by_14 - .vp9_sad16x16x3_ssse3_do_jump
- dd .vp9_sad16x16x3_ssse3_aligned_by_15 - .vp9_sad16x16x3_ssse3_do_jump
-.vp9_sad16x16x3_ssse3_skiptable:
-
- call .vp9_sad16x16x3_ssse3_do_jump
-.vp9_sad16x16x3_ssse3_do_jump:
- pop rcx ; get the address of do_jump
- mov rax, .vp9_sad16x16x3_ssse3_jumptable - .vp9_sad16x16x3_ssse3_do_jump
- add rax, rcx ; get the absolute address of vp9_sad16x16x3_ssse3_jumptable
-
- movsxd rax, dword [rax + 4*rdx] ; get the 32 bit offset from the jumptable
- add rcx, rax
-
- movsxd rax, dword ptr arg(1) ;src_stride
- movsxd rdx, dword ptr arg(3) ;ref_stride
-
- jmp rcx
-
- PROCESS_16X16X3_OFFSET 0, .vp9_sad16x16x3_ssse3
- PROCESS_16X16X3_OFFSET 1, .vp9_sad16x16x3_ssse3
- PROCESS_16X16X3_OFFSET 2, .vp9_sad16x16x3_ssse3
- PROCESS_16X16X3_OFFSET 3, .vp9_sad16x16x3_ssse3
- PROCESS_16X16X3_OFFSET 4, .vp9_sad16x16x3_ssse3
- PROCESS_16X16X3_OFFSET 5, .vp9_sad16x16x3_ssse3
- PROCESS_16X16X3_OFFSET 6, .vp9_sad16x16x3_ssse3
- PROCESS_16X16X3_OFFSET 7, .vp9_sad16x16x3_ssse3
- PROCESS_16X16X3_OFFSET 8, .vp9_sad16x16x3_ssse3
- PROCESS_16X16X3_OFFSET 9, .vp9_sad16x16x3_ssse3
- PROCESS_16X16X3_OFFSET 10, .vp9_sad16x16x3_ssse3
- PROCESS_16X16X3_OFFSET 11, .vp9_sad16x16x3_ssse3
- PROCESS_16X16X3_OFFSET 12, .vp9_sad16x16x3_ssse3
- PROCESS_16X16X3_OFFSET 13, .vp9_sad16x16x3_ssse3
- PROCESS_16X16X3_OFFSET 14, .vp9_sad16x16x3_ssse3
-
-.vp9_sad16x16x3_ssse3_aligned_by_15:
- PROCESS_16X2X3 1
- PROCESS_16X2X3 0
- PROCESS_16X2X3 0
- PROCESS_16X2X3 0
- PROCESS_16X2X3 0
- PROCESS_16X2X3 0
- PROCESS_16X2X3 0
- PROCESS_16X2X3 0
-
-.vp9_sad16x16x3_ssse3_store_off:
- mov rdi, arg(4) ;Results
-
- movq xmm0, xmm5
- psrldq xmm5, 8
-
- paddw xmm0, xmm5
- movd [rdi], xmm0
-;-
- movq xmm0, xmm6
- psrldq xmm6, 8
-
- paddw xmm0, xmm6
- movd [rdi+4], xmm0
-;-
- movq xmm0, xmm7
- psrldq xmm7, 8
-
- paddw xmm0, xmm7
- movd [rdi+8], xmm0
-
- ; begin epilog
- pop rcx
- pop rdi
- pop rsi
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
-
-;void vp9_sad16x8x3_ssse3(
-; unsigned char *src_ptr,
-; int src_stride,
-; unsigned char *ref_ptr,
-; int ref_stride,
-; int *results)
-global sym(vp9_sad16x8x3_ssse3)
-sym(vp9_sad16x8x3_ssse3):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 5
- SAVE_XMM 7
- push rsi
- push rdi
- push rcx
- ; end prolog
-
- mov rsi, arg(0) ;src_ptr
- mov rdi, arg(2) ;ref_ptr
-
- mov rdx, 0xf
- and rdx, rdi
-
- jmp .vp9_sad16x8x3_ssse3_skiptable
-.vp9_sad16x8x3_ssse3_jumptable:
- dd .vp9_sad16x8x3_ssse3_aligned_by_0 - .vp9_sad16x8x3_ssse3_do_jump
- dd .vp9_sad16x8x3_ssse3_aligned_by_1 - .vp9_sad16x8x3_ssse3_do_jump
- dd .vp9_sad16x8x3_ssse3_aligned_by_2 - .vp9_sad16x8x3_ssse3_do_jump
- dd .vp9_sad16x8x3_ssse3_aligned_by_3 - .vp9_sad16x8x3_ssse3_do_jump
- dd .vp9_sad16x8x3_ssse3_aligned_by_4 - .vp9_sad16x8x3_ssse3_do_jump
- dd .vp9_sad16x8x3_ssse3_aligned_by_5 - .vp9_sad16x8x3_ssse3_do_jump
- dd .vp9_sad16x8x3_ssse3_aligned_by_6 - .vp9_sad16x8x3_ssse3_do_jump
- dd .vp9_sad16x8x3_ssse3_aligned_by_7 - .vp9_sad16x8x3_ssse3_do_jump
- dd .vp9_sad16x8x3_ssse3_aligned_by_8 - .vp9_sad16x8x3_ssse3_do_jump
- dd .vp9_sad16x8x3_ssse3_aligned_by_9 - .vp9_sad16x8x3_ssse3_do_jump
- dd .vp9_sad16x8x3_ssse3_aligned_by_10 - .vp9_sad16x8x3_ssse3_do_jump
- dd .vp9_sad16x8x3_ssse3_aligned_by_11 - .vp9_sad16x8x3_ssse3_do_jump
- dd .vp9_sad16x8x3_ssse3_aligned_by_12 - .vp9_sad16x8x3_ssse3_do_jump
- dd .vp9_sad16x8x3_ssse3_aligned_by_13 - .vp9_sad16x8x3_ssse3_do_jump
- dd .vp9_sad16x8x3_ssse3_aligned_by_14 - .vp9_sad16x8x3_ssse3_do_jump
- dd .vp9_sad16x8x3_ssse3_aligned_by_15 - .vp9_sad16x8x3_ssse3_do_jump
-.vp9_sad16x8x3_ssse3_skiptable:
-
- call .vp9_sad16x8x3_ssse3_do_jump
-.vp9_sad16x8x3_ssse3_do_jump:
- pop rcx ; get the address of do_jump
- mov rax, .vp9_sad16x8x3_ssse3_jumptable - .vp9_sad16x8x3_ssse3_do_jump
- add rax, rcx ; get the absolute address of vp9_sad16x8x3_ssse3_jumptable
-
- movsxd rax, dword [rax + 4*rdx] ; get the 32 bit offset from the jumptable
- add rcx, rax
-
- movsxd rax, dword ptr arg(1) ;src_stride
- movsxd rdx, dword ptr arg(3) ;ref_stride
-
- jmp rcx
-
- PROCESS_16X8X3_OFFSET 0, .vp9_sad16x8x3_ssse3
- PROCESS_16X8X3_OFFSET 1, .vp9_sad16x8x3_ssse3
- PROCESS_16X8X3_OFFSET 2, .vp9_sad16x8x3_ssse3
- PROCESS_16X8X3_OFFSET 3, .vp9_sad16x8x3_ssse3
- PROCESS_16X8X3_OFFSET 4, .vp9_sad16x8x3_ssse3
- PROCESS_16X8X3_OFFSET 5, .vp9_sad16x8x3_ssse3
- PROCESS_16X8X3_OFFSET 6, .vp9_sad16x8x3_ssse3
- PROCESS_16X8X3_OFFSET 7, .vp9_sad16x8x3_ssse3
- PROCESS_16X8X3_OFFSET 8, .vp9_sad16x8x3_ssse3
- PROCESS_16X8X3_OFFSET 9, .vp9_sad16x8x3_ssse3
- PROCESS_16X8X3_OFFSET 10, .vp9_sad16x8x3_ssse3
- PROCESS_16X8X3_OFFSET 11, .vp9_sad16x8x3_ssse3
- PROCESS_16X8X3_OFFSET 12, .vp9_sad16x8x3_ssse3
- PROCESS_16X8X3_OFFSET 13, .vp9_sad16x8x3_ssse3
- PROCESS_16X8X3_OFFSET 14, .vp9_sad16x8x3_ssse3
-
-.vp9_sad16x8x3_ssse3_aligned_by_15:
-
- PROCESS_16X2X3 1
- PROCESS_16X2X3 0
- PROCESS_16X2X3 0
- PROCESS_16X2X3 0
-
-.vp9_sad16x8x3_ssse3_store_off:
- mov rdi, arg(4) ;Results
-
- movq xmm0, xmm5
- psrldq xmm5, 8
-
- paddw xmm0, xmm5
- movd [rdi], xmm0
-;-
- movq xmm0, xmm6
- psrldq xmm6, 8
-
- paddw xmm0, xmm6
- movd [rdi+4], xmm0
-;-
- movq xmm0, xmm7
- psrldq xmm7, 8
-
- paddw xmm0, xmm7
- movd [rdi+8], xmm0
-
- ; begin epilog
- pop rcx
- pop rdi
- pop rsi
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
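The SSSE3 file removed above computes the x3 variant: SADs at reference byte offsets 0, 1 and 2, as used by the sub-pixel motion search. The alignment jump table (call/pop to recover the instruction pointer, then a table of relative offsets indexed by ref_ptr & 15) only exists so the code can use aligned loads plus palignr; the arithmetic itself matches this rough scalar sketch (names are illustrative):

    #include <stdlib.h>

    /* Scalar model of the removed vp9_sad16x{16,8}x3_ssse3 kernels. */
    static void sad16xhx3_c(const unsigned char *src_ptr, int src_stride,
                            const unsigned char *ref_ptr, int ref_stride,
                            int h, unsigned int *results) {
      for (int off = 0; off < 3; ++off) {
        unsigned int sad = 0;
        for (int y = 0; y < h; ++y)
          for (int x = 0; x < 16; ++x)
            sad += abs(src_ptr[y * src_stride + x] -
                       ref_ptr[y * ref_stride + x + off]);
        results[off] = sad;
      }
    }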
--- a/vp8/encoder/x86/ssim_opt.asm
+++ /dev/null
@@ -1,216 +1,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-%include "vpx_ports/x86_abi_support.asm"
-
-; tabulate_ssim - sums sum_s,sum_r,sum_sq_s,sum_sq_r, sum_sxr
-%macro TABULATE_SSIM 0
- paddusw xmm15, xmm3 ; sum_s
- paddusw xmm14, xmm4 ; sum_r
- movdqa xmm1, xmm3
- pmaddwd xmm1, xmm1
- paddd xmm13, xmm1 ; sum_sq_s
- movdqa xmm2, xmm4
- pmaddwd xmm2, xmm2
- paddd xmm12, xmm2 ; sum_sq_r
- pmaddwd xmm3, xmm4
- paddd xmm11, xmm3 ; sum_sxr
-%endmacro
-
-; Sum across the register %1 starting with q words
-%macro SUM_ACROSS_Q 1
- movdqa xmm2,%1
- punpckldq %1,xmm0
- punpckhdq xmm2,xmm0
- paddq %1,xmm2
- movdqa xmm2,%1
- punpcklqdq %1,xmm0
- punpckhqdq xmm2,xmm0
- paddq %1,xmm2
-%endmacro
-
-; Sum across the register %1 starting with words
-%macro SUM_ACROSS_W 1
- movdqa xmm1, %1
- punpcklwd %1,xmm0
- punpckhwd xmm1,xmm0
- paddd %1, xmm1
- SUM_ACROSS_Q %1
-%endmacro
-;void vp9_ssim_parms_16x16_sse2(
-; unsigned char *s,
-; int sp,
-; unsigned char *r,
-; int rp,
-; unsigned long *sum_s,
-; unsigned long *sum_r,
-; unsigned long *sum_sq_s,
-; unsigned long *sum_sq_r,
-; unsigned long *sum_sxr);
-;
-; TODO: Use parameter passing through a structure; we probably don't need the
-; pxors (the calling app will initialize to 0) and could easily fit everything
-; in sse2 without too much hassle, and can probably do better estimates with
-; psadw or pavgb. At this point this is just meant to be a first pass for
-; calculating all the parameters needed for 16x16 ssim so we can play with
-; dssim as distortion in mode selection code.
-global sym(vp9_ssim_parms_16x16_sse2)
-sym(vp9_ssim_parms_16x16_sse2):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 9
- SAVE_XMM 15
- push rsi
- push rdi
- ; end prolog
-
- mov rsi, arg(0) ;s
- mov rcx, arg(1) ;sp
- mov rdi, arg(2) ;r
- mov rax, arg(3) ;rp
-
- pxor xmm0, xmm0
- pxor xmm15,xmm15 ;sum_s
- pxor xmm14,xmm14 ;sum_r
- pxor xmm13,xmm13 ;sum_sq_s
- pxor xmm12,xmm12 ;sum_sq_r
- pxor xmm11,xmm11 ;sum_sxr
-
- mov rdx, 16 ;row counter
-.NextRow:
-
- ;grab source and reference pixels
- movdqu xmm5, [rsi]
- movdqu xmm6, [rdi]
- movdqa xmm3, xmm5
- movdqa xmm4, xmm6
- punpckhbw xmm3, xmm0 ; high_s
- punpckhbw xmm4, xmm0 ; high_r
-
- TABULATE_SSIM
-
- movdqa xmm3, xmm5
- movdqa xmm4, xmm6
- punpcklbw xmm3, xmm0 ; low_s
- punpcklbw xmm4, xmm0 ; low_r
-
- TABULATE_SSIM
-
- add rsi, rcx ; next s row
- add rdi, rax ; next r row
-
- dec rdx ; counter
- jnz .NextRow
-
- SUM_ACROSS_W xmm15
- SUM_ACROSS_W xmm14
- SUM_ACROSS_Q xmm13
- SUM_ACROSS_Q xmm12
- SUM_ACROSS_Q xmm11
-
- mov rdi,arg(4)
- movd [rdi], xmm15;
- mov rdi,arg(5)
- movd [rdi], xmm14;
- mov rdi,arg(6)
- movd [rdi], xmm13;
- mov rdi,arg(7)
- movd [rdi], xmm12;
- mov rdi,arg(8)
- movd [rdi], xmm11;
-
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
-
-;void vp9_ssim_parms_8x8_sse2(
-; unsigned char *s,
-; int sp,
-; unsigned char *r,
-; int rp,
-; unsigned long *sum_s,
-; unsigned long *sum_r,
-; unsigned long *sum_sq_s,
-; unsigned long *sum_sq_r,
-; unsigned long *sum_sxr);
-;
-; TODO: Use parameter passing through a structure; we probably don't need the
-; pxors (the calling app will initialize to 0) and could easily fit everything
-; in sse2 without too much hassle, and can probably do better estimates with
-; psadw or pavgb. At this point this is just meant to be a first pass for
-; calculating all the parameters needed for 16x16 ssim so we can play with
-; dssim as distortion in mode selection code.
-global sym(vp9_ssim_parms_8x8_sse2)
-sym(vp9_ssim_parms_8x8_sse2):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 9
- SAVE_XMM 15
- push rsi
- push rdi
- ; end prolog
-
- mov rsi, arg(0) ;s
- mov rcx, arg(1) ;sp
- mov rdi, arg(2) ;r
- mov rax, arg(3) ;rp
-
- pxor xmm0, xmm0
- pxor xmm15,xmm15 ;sum_s
- pxor xmm14,xmm14 ;sum_r
- pxor xmm13,xmm13 ;sum_sq_s
- pxor xmm12,xmm12 ;sum_sq_r
- pxor xmm11,xmm11 ;sum_sxr
-
- mov rdx, 8 ;row counter
-.NextRow:
-
- ;grab source and reference pixels
- movq xmm3, [rsi]
- movq xmm4, [rdi]
- punpcklbw xmm3, xmm0 ; low_s
- punpcklbw xmm4, xmm0 ; low_r
-
- TABULATE_SSIM
-
- add rsi, rcx ; next s row
- add rdi, rax ; next r row
-
- dec rdx ; counter
- jnz .NextRow
-
- SUM_ACROSS_W xmm15
- SUM_ACROSS_W xmm14
- SUM_ACROSS_Q xmm13
- SUM_ACROSS_Q xmm12
- SUM_ACROSS_Q xmm11
-
- mov rdi,arg(4)
- movd [rdi], xmm15;
- mov rdi,arg(5)
- movd [rdi], xmm14;
- mov rdi,arg(6)
- movd [rdi], xmm13;
- mov rdi,arg(7)
- movd [rdi], xmm12;
- mov rdi,arg(8)
- movd [rdi], xmm11;
-
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
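The ssim_opt.asm kernels removed above only accumulate the five sums that the SSIM/DSSIM code consumes. A scalar sketch of that accumulation, with an illustrative helper name and zero-initialized outputs rather than the caller-initialized ones the TODO mentions:

    /* Scalar model of the removed vp9_ssim_parms_{16x16,8x8}_sse2 kernels:
     * accumulate the five sums consumed by the SSIM/DSSIM calculation. */
    static void ssim_parms_c(const unsigned char *s, int sp,
                             const unsigned char *r, int rp, int size,
                             unsigned long *sum_s, unsigned long *sum_r,
                             unsigned long *sum_sq_s, unsigned long *sum_sq_r,
                             unsigned long *sum_sxr) {
      *sum_s = *sum_r = *sum_sq_s = *sum_sq_r = *sum_sxr = 0;
      for (int y = 0; y < size; ++y) {
        for (int x = 0; x < size; ++x) {
          unsigned long a = s[y * sp + x];
          unsigned long b = r[y * rp + x];
          *sum_s += a;
          *sum_r += b;
          *sum_sq_s += a * a;
          *sum_sq_r += b * b;
          *sum_sxr += a * b;
        }
      }
    }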
--- a/vp8/encoder/x86/subtract_mmx.asm
+++ /dev/null
@@ -1,432 +1,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
-%include "vpx_ports/x86_abi_support.asm"
-
-;void vp9_subtract_b_mmx_impl(unsigned char *z, int src_stride,
-; short *diff, unsigned char *Predictor,
-; int pitch);
-global sym(vp9_subtract_b_mmx_impl)
-sym(vp9_subtract_b_mmx_impl):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 5
- push rsi
- push rdi
- ; end prolog
-
-
- mov rdi, arg(2) ;diff
- mov rax, arg(3) ;Predictor
- mov rsi, arg(0) ;z
- movsxd rdx, dword ptr arg(1);src_stride;
- movsxd rcx, dword ptr arg(4);pitch
- pxor mm7, mm7
-
- movd mm0, [rsi]
- movd mm1, [rax]
- punpcklbw mm0, mm7
- punpcklbw mm1, mm7
- psubw mm0, mm1
- movq [rdi], mm0
-
-
- movd mm0, [rsi+rdx]
- movd mm1, [rax+rcx]
- punpcklbw mm0, mm7
- punpcklbw mm1, mm7
- psubw mm0, mm1
- movq [rdi+rcx*2],mm0
-
-
- movd mm0, [rsi+rdx*2]
- movd mm1, [rax+rcx*2]
- punpcklbw mm0, mm7
- punpcklbw mm1, mm7
- psubw mm0, mm1
- movq [rdi+rcx*4], mm0
-
- lea rsi, [rsi+rdx*2]
- lea rcx, [rcx+rcx*2]
-
-
-
- movd mm0, [rsi+rdx]
- movd mm1, [rax+rcx]
- punpcklbw mm0, mm7
- punpcklbw mm1, mm7
- psubw mm0, mm1
- movq [rdi+rcx*2], mm0
-
- ; begin epilog
- pop rdi
- pop rsi
- UNSHADOW_ARGS
- pop rbp
- ret
-
-;void vp9_subtract_mby_mmx(short *diff, unsigned char *src, unsigned char *pred, int stride)
-global sym(vp9_subtract_mby_mmx)
-sym(vp9_subtract_mby_mmx):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 4
- push rsi
- push rdi
- ; end prolog
-
-
- mov rsi, arg(1) ;src
- mov rdi, arg(0) ;diff
-
- mov rax, arg(2) ;pred
- movsxd rdx, dword ptr arg(3) ;stride
-
- mov rcx, 16
- pxor mm0, mm0
-
-.submby_loop:
-
- movq mm1, [rsi]
- movq mm3, [rax]
-
- movq mm2, mm1
- movq mm4, mm3
-
- punpcklbw mm1, mm0
- punpcklbw mm3, mm0
-
- punpckhbw mm2, mm0
- punpckhbw mm4, mm0
-
- psubw mm1, mm3
- psubw mm2, mm4
-
- movq [rdi], mm1
- movq [rdi+8], mm2
-
-
- movq mm1, [rsi+8]
- movq mm3, [rax+8]
-
- movq mm2, mm1
- movq mm4, mm3
-
- punpcklbw mm1, mm0
- punpcklbw mm3, mm0
-
- punpckhbw mm2, mm0
- punpckhbw mm4, mm0
-
- psubw mm1, mm3
- psubw mm2, mm4
-
- movq [rdi+16], mm1
- movq [rdi+24], mm2
-
-
- add rdi, 32
- add rax, 16
-
- lea rsi, [rsi+rdx]
-
- sub rcx, 1
- jnz .submby_loop
-
- pop rdi
- pop rsi
- ; begin epilog
- UNSHADOW_ARGS
- pop rbp
- ret
-
-
-;void vp9_subtract_mbuv_mmx(short *diff, unsigned char *usrc, unsigned char *vsrc, unsigned char *pred, int stride)
-global sym(vp9_subtract_mbuv_mmx)
-sym(vp9_subtract_mbuv_mmx):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 5
- push rsi
- push rdi
- ; end prolog
-
- ;short *udiff = diff + 256;
- ;short *vdiff = diff + 320;
- ;unsigned char *upred = pred + 256;
- ;unsigned char *vpred = pred + 320;
-
- ;unsigned char *z = usrc;
- ;unsigned short *diff = udiff;
- ;unsigned char *Predictor= upred;
-
- mov rdi, arg(0) ;diff
- mov rax, arg(3) ;pred
- mov rsi, arg(1) ;z = usrc
- add rdi, 256*2 ;diff = diff + 256 (shorts)
- add rax, 256 ;Predictor = pred + 256
- movsxd rdx, dword ptr arg(4) ;stride;
- pxor mm7, mm7
-
- movq mm0, [rsi]
- movq mm1, [rax]
- movq mm3, mm0
- movq mm4, mm1
- punpcklbw mm0, mm7
- punpcklbw mm1, mm7
- punpckhbw mm3, mm7
- punpckhbw mm4, mm7
- psubw mm0, mm1
- psubw mm3, mm4
- movq [rdi], mm0
- movq [rdi+8], mm3
-
-
- movq mm0, [rsi+rdx]
- movq mm1, [rax+8]
- movq mm3, mm0
- movq mm4, mm1
- punpcklbw mm0, mm7
- punpcklbw mm1, mm7
- punpckhbw mm3, mm7
- punpckhbw mm4, mm7
- psubw mm0, mm1
- psubw mm3, mm4
- movq [rdi+16], mm0
- movq [rdi+24], mm3
-
- movq mm0, [rsi+rdx*2]
- movq mm1, [rax+16]
- movq mm3, mm0
- movq mm4, mm1
- punpcklbw mm0, mm7
- punpcklbw mm1, mm7
- punpckhbw mm3, mm7
- punpckhbw mm4, mm7
- psubw mm0, mm1
- psubw mm3, mm4
- movq [rdi+32], mm0
- movq [rdi+40], mm3
- lea rsi, [rsi+rdx*2]
-
-
- movq mm0, [rsi+rdx]
- movq mm1, [rax+24]
- movq mm3, mm0
- movq mm4, mm1
- punpcklbw mm0, mm7
- punpcklbw mm1, mm7
- punpckhbw mm3, mm7
- punpckhbw mm4, mm7
- psubw mm0, mm1
- psubw mm3, mm4
-
- movq [rdi+48], mm0
- movq [rdi+56], mm3
-
-
- add rdi, 64
- add rax, 32
- lea rsi, [rsi+rdx*2]
-
-
- movq mm0, [rsi]
- movq mm1, [rax]
- movq mm3, mm0
- movq mm4, mm1
- punpcklbw mm0, mm7
- punpcklbw mm1, mm7
- punpckhbw mm3, mm7
- punpckhbw mm4, mm7
- psubw mm0, mm1
- psubw mm3, mm4
- movq [rdi], mm0
- movq [rdi+8], mm3
-
-
- movq mm0, [rsi+rdx]
- movq mm1, [rax+8]
- movq mm3, mm0
- movq mm4, mm1
- punpcklbw mm0, mm7
- punpcklbw mm1, mm7
- punpckhbw mm3, mm7
- punpckhbw mm4, mm7
- psubw mm0, mm1
- psubw mm3, mm4
- movq [rdi+16], mm0
- movq [rdi+24], mm3
-
- movq mm0, [rsi+rdx*2]
- movq mm1, [rax+16]
- movq mm3, mm0
- movq mm4, mm1
- punpcklbw mm0, mm7
- punpcklbw mm1, mm7
- punpckhbw mm3, mm7
- punpckhbw mm4, mm7
- psubw mm0, mm1
- psubw mm3, mm4
- movq [rdi+32], mm0
- movq [rdi+40], mm3
- lea rsi, [rsi+rdx*2]
-
-
- movq mm0, [rsi+rdx]
- movq mm1, [rax+24]
- movq mm3, mm0
- movq mm4, mm1
- punpcklbw mm0, mm7
- punpcklbw mm1, mm7
- punpckhbw mm3, mm7
- punpckhbw mm4, mm7
- psubw mm0, mm1
- psubw mm3, mm4
-
- movq [rdi+48], mm0
- movq [rdi+56], mm3
-
- ;unsigned char *z = vsrc;
- ;unsigned short *diff = vdiff;
- ;unsigned char *Predictor= vpred;
-
- mov rdi, arg(0) ;diff
- mov rax, arg(3) ;pred
- mov rsi, arg(2) ;z = vsrc
- add rdi, 320*2 ;diff = diff + 320 (shorts)
- add rax, 320 ;Predictor = pred + 320
- movsxd rdx, dword ptr arg(4) ;stride;
- pxor mm7, mm7
-
- movq mm0, [rsi]
- movq mm1, [rax]
- movq mm3, mm0
- movq mm4, mm1
- punpcklbw mm0, mm7
- punpcklbw mm1, mm7
- punpckhbw mm3, mm7
- punpckhbw mm4, mm7
- psubw mm0, mm1
- psubw mm3, mm4
- movq [rdi], mm0
- movq [rdi+8], mm3
-
-
- movq mm0, [rsi+rdx]
- movq mm1, [rax+8]
- movq mm3, mm0
- movq mm4, mm1
- punpcklbw mm0, mm7
- punpcklbw mm1, mm7
- punpckhbw mm3, mm7
- punpckhbw mm4, mm7
- psubw mm0, mm1
- psubw mm3, mm4
- movq [rdi+16], mm0
- movq [rdi+24], mm3
-
- movq mm0, [rsi+rdx*2]
- movq mm1, [rax+16]
- movq mm3, mm0
- movq mm4, mm1
- punpcklbw mm0, mm7
- punpcklbw mm1, mm7
- punpckhbw mm3, mm7
- punpckhbw mm4, mm7
- psubw mm0, mm1
- psubw mm3, mm4
- movq [rdi+32], mm0
- movq [rdi+40], mm3
- lea rsi, [rsi+rdx*2]
-
-
- movq mm0, [rsi+rdx]
- movq mm1, [rax+24]
- movq mm3, mm0
- movq mm4, mm1
- punpcklbw mm0, mm7
- punpcklbw mm1, mm7
- punpckhbw mm3, mm7
- punpckhbw mm4, mm7
- psubw mm0, mm1
- psubw mm3, mm4
-
- movq [rdi+48], mm0
- movq [rdi+56], mm3
-
-
- add rdi, 64
- add rax, 32
- lea rsi, [rsi+rdx*2]
-
-
- movq mm0, [rsi]
- movq mm1, [rax]
- movq mm3, mm0
- movq mm4, mm1
- punpcklbw mm0, mm7
- punpcklbw mm1, mm7
- punpckhbw mm3, mm7
- punpckhbw mm4, mm7
- psubw mm0, mm1
- psubw mm3, mm4
- movq [rdi], mm0
- movq [rdi+8], mm3
-
-
- movq mm0, [rsi+rdx]
- movq mm1, [rax+8]
- movq mm3, mm0
- movq mm4, mm1
- punpcklbw mm0, mm7
- punpcklbw mm1, mm7
- punpckhbw mm3, mm7
- punpckhbw mm4, mm7
- psubw mm0, mm1
- psubw mm3, mm4
- movq [rdi+16], mm0
- movq [rdi+24], mm3
-
- movq mm0, [rsi+rdx*2]
- movq mm1, [rax+16]
- movq mm3, mm0
- movq mm4, mm1
- punpcklbw mm0, mm7
- punpcklbw mm1, mm7
- punpckhbw mm3, mm7
- punpckhbw mm4, mm7
- psubw mm0, mm1
- psubw mm3, mm4
- movq [rdi+32], mm0
- movq [rdi+40], mm3
- lea rsi, [rsi+rdx*2]
-
-
- movq mm0, [rsi+rdx]
- movq mm1, [rax+24]
- movq mm3, mm0
- movq mm4, mm1
- punpcklbw mm0, mm7
- punpcklbw mm1, mm7
- punpckhbw mm3, mm7
- punpckhbw mm4, mm7
- psubw mm0, mm1
- psubw mm3, mm4
-
- movq [rdi+48], mm0
- movq [rdi+56], mm3
-
- ; begin epilog
- pop rdi
- pop rsi
- UNSHADOW_ARGS
- pop rbp
- ret
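The MMX subtraction kernels removed above all compute the residual as source minus predictor, widened to 16 bits; only the block geometry differs (a 4x4 block for subtract_b, the 16x16 luma macroblock for subtract_mby, and the two 8x8 chroma planes for subtract_mbuv). A scalar sketch of the common operation, with an illustrative helper name:

    /* Scalar model of the removed subtraction kernels: residual = source
     * minus predictor, widened to 16 bits. */
    static void subtract_block_c(short *diff, int diff_stride,
                                 const unsigned char *src, int src_stride,
                                 const unsigned char *pred, int pred_stride,
                                 int w, int h) {
      for (int y = 0; y < h; ++y)
        for (int x = 0; x < w; ++x)
          diff[y * diff_stride + x] =
              (short)(src[y * src_stride + x] - pred[y * pred_stride + x]);
    }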
--- a/vp8/encoder/x86/subtract_sse2.asm
+++ /dev/null
@@ -1,356 +1,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
-%include "vpx_ports/x86_abi_support.asm"
-
-;void vp9_subtract_b_sse2_impl(unsigned char *z, int src_stride,
-; short *diff, unsigned char *Predictor,
-; int pitch);
-global sym(vp9_subtract_b_sse2_impl)
-sym(vp9_subtract_b_sse2_impl):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 5
- GET_GOT rbx
- push rsi
- push rdi
- ; end prolog
-
- mov rdi, arg(2) ;diff
- mov rax, arg(3) ;Predictor
- mov rsi, arg(0) ;z
- movsxd rdx, dword ptr arg(1);src_stride;
- movsxd rcx, dword ptr arg(4);pitch
- pxor mm7, mm7
-
- movd mm0, [rsi]
- movd mm1, [rax]
- punpcklbw mm0, mm7
- punpcklbw mm1, mm7
- psubw mm0, mm1
- movq MMWORD PTR [rdi], mm0
-
- movd mm0, [rsi+rdx]
- movd mm1, [rax+rcx]
- punpcklbw mm0, mm7
- punpcklbw mm1, mm7
- psubw mm0, mm1
- movq MMWORD PTR [rdi+rcx*2], mm0
-
- movd mm0, [rsi+rdx*2]
- movd mm1, [rax+rcx*2]
- punpcklbw mm0, mm7
- punpcklbw mm1, mm7
- psubw mm0, mm1
- movq MMWORD PTR [rdi+rcx*4], mm0
-
- lea rsi, [rsi+rdx*2]
- lea rcx, [rcx+rcx*2]
-
- movd mm0, [rsi+rdx]
- movd mm1, [rax+rcx]
- punpcklbw mm0, mm7
- punpcklbw mm1, mm7
- psubw mm0, mm1
- movq MMWORD PTR [rdi+rcx*2], mm0
-
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_GOT
- UNSHADOW_ARGS
- pop rbp
- ret
-
-
-;void vp9_subtract_mby_sse2(short *diff, unsigned char *src, unsigned char *pred, int stride)
-global sym(vp9_subtract_mby_sse2)
-sym(vp9_subtract_mby_sse2):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 4
- SAVE_XMM 7
- GET_GOT rbx
- push rsi
- push rdi
- ; end prolog
-
- mov rsi, arg(1) ;src
- mov rdi, arg(0) ;diff
-
- mov rax, arg(2) ;pred
- movsxd rdx, dword ptr arg(3) ;stride
-
- mov rcx, 8 ; do two lines at one time
-
-.submby_loop:
- movdqa xmm0, XMMWORD PTR [rsi] ; src
- movdqa xmm1, XMMWORD PTR [rax] ; pred
-
- movdqa xmm2, xmm0
- psubb xmm0, xmm1
-
- pxor xmm1, [GLOBAL(t80)] ;convert to signed values
- pxor xmm2, [GLOBAL(t80)]
- pcmpgtb xmm1, xmm2 ; obtain sign information
-
- movdqa xmm2, xmm0
- movdqa xmm3, xmm1
- punpcklbw xmm0, xmm1 ; put sign back to subtraction
- punpckhbw xmm2, xmm3 ; put sign back to subtraction
-
- movdqa XMMWORD PTR [rdi], xmm0
- movdqa XMMWORD PTR [rdi +16], xmm2
-
- movdqa xmm4, XMMWORD PTR [rsi + rdx]
- movdqa xmm5, XMMWORD PTR [rax + 16]
-
- movdqa xmm6, xmm4
- psubb xmm4, xmm5
-
- pxor xmm5, [GLOBAL(t80)] ;convert to signed values
- pxor xmm6, [GLOBAL(t80)]
- pcmpgtb xmm5, xmm6 ; obtain sign information
-
- movdqa xmm6, xmm4
- movdqa xmm7, xmm5
- punpcklbw xmm4, xmm5 ; put sign back to subtraction
- punpckhbw xmm6, xmm7 ; put sign back to subtraction
-
- movdqa XMMWORD PTR [rdi +32], xmm4
- movdqa XMMWORD PTR [rdi +48], xmm6
-
- add rdi, 64
- add rax, 32
- lea rsi, [rsi+rdx*2]
-
- sub rcx, 1
- jnz .submby_loop
-
- pop rdi
- pop rsi
- ; begin epilog
- RESTORE_GOT
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
-
-
-;void vp9_subtract_mbuv_sse2(short *diff, unsigned char *usrc, unsigned char *vsrc, unsigned char *pred, int stride)
-global sym(vp9_subtract_mbuv_sse2)
-sym(vp9_subtract_mbuv_sse2):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 5
- GET_GOT rbx
- push rsi
- push rdi
- ; end prolog
-
- mov rdi, arg(0) ;diff
- mov rax, arg(3) ;pred
- mov rsi, arg(1) ;z = usrc
- add rdi, 256*2 ;diff = diff + 256 (shorts)
- add rax, 256 ;Predictor = pred + 256
- movsxd rdx, dword ptr arg(4) ;stride;
- lea rcx, [rdx + rdx*2]
-
- ;u
- ;line 0 1
- movq xmm0, MMWORD PTR [rsi] ; src
- movq xmm2, MMWORD PTR [rsi+rdx]
- movdqa xmm1, XMMWORD PTR [rax] ; pred
- punpcklqdq xmm0, xmm2
-
- movdqa xmm2, xmm0
- psubb xmm0, xmm1 ; subtraction with sign missed
-
- pxor xmm1, [GLOBAL(t80)] ;convert to signed values
- pxor xmm2, [GLOBAL(t80)]
- pcmpgtb xmm1, xmm2 ; obtain sign information
-
- movdqa xmm2, xmm0
- movdqa xmm3, xmm1
- punpcklbw xmm0, xmm1 ; put sign back to subtraction
- punpckhbw xmm2, xmm3 ; put sign back to subtraction
-
- movdqa XMMWORD PTR [rdi], xmm0
- movdqa XMMWORD PTR [rdi +16], xmm2
-
- ;line 2 3
- movq xmm0, MMWORD PTR [rsi+rdx*2] ; src
- movq xmm2, MMWORD PTR [rsi+rcx]
- movdqa xmm1, XMMWORD PTR [rax+16] ; pred
- punpcklqdq xmm0, xmm2
-
- movdqa xmm2, xmm0
- psubb xmm0, xmm1 ; subtraction with sign missed
-
- pxor xmm1, [GLOBAL(t80)] ;convert to signed values
- pxor xmm2, [GLOBAL(t80)]
- pcmpgtb xmm1, xmm2 ; obtain sign information
-
- movdqa xmm2, xmm0
- movdqa xmm3, xmm1
- punpcklbw xmm0, xmm1 ; put sign back to subtraction
- punpckhbw xmm2, xmm3 ; put sign back to subtraction
-
- movdqa XMMWORD PTR [rdi + 32], xmm0
- movdqa XMMWORD PTR [rdi + 48], xmm2
-
- ;line 4 5
- lea rsi, [rsi + rdx*4]
-
- movq xmm0, MMWORD PTR [rsi] ; src
- movq xmm2, MMWORD PTR [rsi+rdx]
- movdqa xmm1, XMMWORD PTR [rax + 32] ; pred
- punpcklqdq xmm0, xmm2
-
- movdqa xmm2, xmm0
- psubb xmm0, xmm1 ; subtraction with sign missed
-
- pxor xmm1, [GLOBAL(t80)] ;convert to signed values
- pxor xmm2, [GLOBAL(t80)]
- pcmpgtb xmm1, xmm2 ; obtain sign information
-
- movdqa xmm2, xmm0
- movdqa xmm3, xmm1
- punpcklbw xmm0, xmm1 ; put sign back to subtraction
- punpckhbw xmm2, xmm3 ; put sign back to subtraction
-
- movdqa XMMWORD PTR [rdi + 64], xmm0
- movdqa XMMWORD PTR [rdi + 80], xmm2
-
- ;line 6 7
- movq xmm0, MMWORD PTR [rsi+rdx*2] ; src
- movq xmm2, MMWORD PTR [rsi+rcx]
- movdqa xmm1, XMMWORD PTR [rax+ 48] ; pred
- punpcklqdq xmm0, xmm2
-
- movdqa xmm2, xmm0
- psubb xmm0, xmm1 ; subtraction with sign missed
-
- pxor xmm1, [GLOBAL(t80)] ;convert to signed values
- pxor xmm2, [GLOBAL(t80)]
- pcmpgtb xmm1, xmm2 ; obtain sign information
-
- movdqa xmm2, xmm0
- movdqa xmm3, xmm1
- punpcklbw xmm0, xmm1 ; put sign back to subtraction
- punpckhbw xmm2, xmm3 ; put sign back to subtraction
-
- movdqa XMMWORD PTR [rdi + 96], xmm0
- movdqa XMMWORD PTR [rdi + 112], xmm2
-
- ;v
- mov rsi, arg(2) ;z = vsrc
- add rdi, 64*2 ;diff = diff + 320 (shorts)
- add rax, 64 ;Predictor = pred + 320
-
- ;line 0 1
- movq xmm0, MMWORD PTR [rsi] ; src
- movq xmm2, MMWORD PTR [rsi+rdx]
- movdqa xmm1, XMMWORD PTR [rax] ; pred
- punpcklqdq xmm0, xmm2
-
- movdqa xmm2, xmm0
- psubb xmm0, xmm1 ; subtraction with sign missed
-
- pxor xmm1, [GLOBAL(t80)] ;convert to signed values
- pxor xmm2, [GLOBAL(t80)]
- pcmpgtb xmm1, xmm2 ; obtain sign information
-
- movdqa xmm2, xmm0
- movdqa xmm3, xmm1
- punpcklbw xmm0, xmm1 ; put sign back to subtraction
- punpckhbw xmm2, xmm3 ; put sign back to subtraction
-
- movdqa XMMWORD PTR [rdi], xmm0
- movdqa XMMWORD PTR [rdi +16], xmm2
-
- ;line 2 3
- movq xmm0, MMWORD PTR [rsi+rdx*2] ; src
- movq xmm2, MMWORD PTR [rsi+rcx]
- movdqa xmm1, XMMWORD PTR [rax+16] ; pred
- punpcklqdq xmm0, xmm2
-
- movdqa xmm2, xmm0
- psubb xmm0, xmm1 ; subtraction with sign missed
-
- pxor xmm1, [GLOBAL(t80)] ;convert to signed values
- pxor xmm2, [GLOBAL(t80)]
- pcmpgtb xmm1, xmm2 ; obtain sign information
-
- movdqa xmm2, xmm0
- movdqa xmm3, xmm1
- punpcklbw xmm0, xmm1 ; put sign back to subtraction
- punpckhbw xmm2, xmm3 ; put sign back to subtraction
-
- movdqa XMMWORD PTR [rdi + 32], xmm0
- movdqa XMMWORD PTR [rdi + 48], xmm2
-
- ;line 4 5
- lea rsi, [rsi + rdx*4]
-
- movq xmm0, MMWORD PTR [rsi] ; src
- movq xmm2, MMWORD PTR [rsi+rdx]
- movdqa xmm1, XMMWORD PTR [rax + 32] ; pred
- punpcklqdq xmm0, xmm2
-
- movdqa xmm2, xmm0
- psubb xmm0, xmm1 ; subtraction with sign missed
-
- pxor xmm1, [GLOBAL(t80)] ;convert to signed values
- pxor xmm2, [GLOBAL(t80)]
- pcmpgtb xmm1, xmm2 ; obtain sign information
-
- movdqa xmm2, xmm0
- movdqa xmm3, xmm1
- punpcklbw xmm0, xmm1 ; put sign back to subtraction
- punpckhbw xmm2, xmm3 ; put sign back to subtraction
-
- movdqa XMMWORD PTR [rdi + 64], xmm0
- movdqa XMMWORD PTR [rdi + 80], xmm2
-
- ;line 6 7
- movq xmm0, MMWORD PTR [rsi+rdx*2] ; src
- movq xmm2, MMWORD PTR [rsi+rcx]
- movdqa xmm1, XMMWORD PTR [rax+ 48] ; pred
- punpcklqdq xmm0, xmm2
-
- movdqa xmm2, xmm0
- psubb xmm0, xmm1 ; subtraction with sign missed
-
- pxor xmm1, [GLOBAL(t80)] ;convert to signed values
- pxor xmm2, [GLOBAL(t80)]
- pcmpgtb xmm1, xmm2 ; obtain sign information
-
- movdqa xmm2, xmm0
- movdqa xmm3, xmm1
- punpcklbw xmm0, xmm1 ; put sign back to subtraction
- punpckhbw xmm2, xmm3 ; put sign back to subtraction
-
- movdqa XMMWORD PTR [rdi + 96], xmm0
- movdqa XMMWORD PTR [rdi + 112], xmm2
-
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_GOT
- UNSHADOW_ARGS
- pop rbp
- ret
-
-SECTION_RODATA
-align 16
-t80:
- times 16 db 0x80
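The SSE2 variant removed above relies on a byte-wise sign trick: psubb keeps only the low byte of src - pred, and the sign is recovered by biasing both operands with 0x80 (the t80 constant) and comparing, then interleaving the resulting 0x00/0xFF mask back in as the high byte of each 16-bit residual. A self-contained intrinsics sketch of that trick for one run of 16 pixels (the helper name is illustrative):

    #include <emmintrin.h>

    /* Sketch of the sign trick used by the removed vp9_subtract_*_sse2 code:
     * rebuild signed 16-bit residuals from an 8-bit subtraction plus a
     * pred-greater-than-src mask. */
    static void subtract_16_pixels_sse2(short *diff, const unsigned char *src,
                                        const unsigned char *pred) {
      const __m128i bias = _mm_set1_epi8((char)0x80);
      __m128i s = _mm_loadu_si128((const __m128i *)src);
      __m128i p = _mm_loadu_si128((const __m128i *)pred);
      __m128i d = _mm_sub_epi8(s, p);              /* low byte of src - pred */
      __m128i sign = _mm_cmpgt_epi8(_mm_xor_si128(p, bias),
                                    _mm_xor_si128(s, bias)); /* 0xFF if negative */
      _mm_storeu_si128((__m128i *)diff, _mm_unpacklo_epi8(d, sign));
      _mm_storeu_si128((__m128i *)(diff + 8), _mm_unpackhi_epi8(d, sign));
    }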
--- a/vp8/encoder/x86/temporal_filter_apply_sse2.asm
+++ /dev/null
@@ -1,207 +1,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
-%include "vpx_ports/x86_abi_support.asm"
-
-; void vp9_temporal_filter_apply_sse2 | arg
-; (unsigned char *frame1, | 0
-; unsigned int stride, | 1
-; unsigned char *frame2, | 2
-; unsigned int block_size, | 3
-; int strength, | 4
-; int filter_weight, | 5
-; unsigned int *accumulator, | 6
-; unsigned short *count) | 7
-global sym(vp9_temporal_filter_apply_sse2)
-sym(vp9_temporal_filter_apply_sse2):
-
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 8
- SAVE_XMM 7
- GET_GOT rbx
- push rsi
- push rdi
- ALIGN_STACK 16, rax
- %define block_size 0
- %define strength 16
- %define filter_weight 32
- %define rounding_bit 48
- %define rbp_backup 64
- %define stack_size 80
- sub rsp, stack_size
- mov [rsp + rbp_backup], rbp
- ; end prolog
-
- mov rdx, arg(3)
- mov [rsp + block_size], rdx
- movd xmm6, arg(4)
- movdqa [rsp + strength], xmm6 ; where strength is used, all 16 bytes are read
-
- ; calculate the rounding bit outside the loop
- ; 0x8000 >> (16 - strength)
- mov rdx, 16
- sub rdx, arg(4) ; 16 - strength
- movd xmm4, rdx ; can't use rdx w/ shift
- movdqa xmm5, [GLOBAL(_const_top_bit)]
- psrlw xmm5, xmm4
- movdqa [rsp + rounding_bit], xmm5
-
- mov rsi, arg(0) ; src/frame1
- mov rdx, arg(2) ; predictor frame
- mov rdi, arg(6) ; accumulator
- mov rax, arg(7) ; count
-
- ; dup the filter weight and store for later
- movd xmm0, arg(5) ; filter_weight
- pshuflw xmm0, xmm0, 0
- punpcklwd xmm0, xmm0
- movdqa [rsp + filter_weight], xmm0
-
- mov rbp, arg(1) ; stride
- pxor xmm7, xmm7 ; zero for extraction
-
- lea rcx, [rdx + 16*16*1]
- cmp dword ptr [rsp + block_size], 8
- jne .temporal_filter_apply_load_16
- lea rcx, [rdx + 8*8*1]
-
-.temporal_filter_apply_load_8:
- movq xmm0, [rsi] ; first row
- lea rsi, [rsi + rbp] ; += stride
- punpcklbw xmm0, xmm7 ; src[ 0- 7]
- movq xmm1, [rsi] ; second row
- lea rsi, [rsi + rbp] ; += stride
- punpcklbw xmm1, xmm7 ; src[ 8-15]
- jmp .temporal_filter_apply_load_finished
-
-.temporal_filter_apply_load_16:
- movdqa xmm0, [rsi] ; src (frame1)
- lea rsi, [rsi + rbp] ; += stride
- movdqa xmm1, xmm0
- punpcklbw xmm0, xmm7 ; src[ 0- 7]
- punpckhbw xmm1, xmm7 ; src[ 8-15]
-
-.temporal_filter_apply_load_finished:
- movdqa xmm2, [rdx] ; predictor (frame2)
- movdqa xmm3, xmm2
- punpcklbw xmm2, xmm7 ; pred[ 0- 7]
- punpckhbw xmm3, xmm7 ; pred[ 8-15]
-
- ; modifier = src_byte - pixel_value
- psubw xmm0, xmm2 ; src - pred[ 0- 7]
- psubw xmm1, xmm3 ; src - pred[ 8-15]
-
- ; modifier *= modifier
- pmullw xmm0, xmm0 ; modifier[ 0- 7]^2
- pmullw xmm1, xmm1 ; modifier[ 8-15]^2
-
- ; modifier *= 3
- pmullw xmm0, [GLOBAL(_const_3w)]
- pmullw xmm1, [GLOBAL(_const_3w)]
-
- ; modifier += 0x8000 >> (16 - strength)
- paddw xmm0, [rsp + rounding_bit]
- paddw xmm1, [rsp + rounding_bit]
-
- ; modifier >>= strength
- psrlw xmm0, [rsp + strength]
- psrlw xmm1, [rsp + strength]
-
- ; modifier = 16 - modifier
- ; saturation takes care of modifier > 16
- movdqa xmm3, [GLOBAL(_const_16w)]
- movdqa xmm2, [GLOBAL(_const_16w)]
- psubusw xmm3, xmm1
- psubusw xmm2, xmm0
-
- ; modifier *= filter_weight
- pmullw xmm2, [rsp + filter_weight]
- pmullw xmm3, [rsp + filter_weight]
-
- ; count
- movdqa xmm4, [rax]
- movdqa xmm5, [rax+16]
- ; += modifier
- paddw xmm4, xmm2
- paddw xmm5, xmm3
- ; write back
- movdqa [rax], xmm4
- movdqa [rax+16], xmm5
- lea rax, [rax + 16*2] ; count += 16*(sizeof(short))
-
- ; load and extract the predictor up to shorts
- pxor xmm7, xmm7
- movdqa xmm0, [rdx]
- lea rdx, [rdx + 16*1] ; pred += 16*(sizeof(char))
- movdqa xmm1, xmm0
- punpcklbw xmm0, xmm7 ; pred[ 0- 7]
- punpckhbw xmm1, xmm7 ; pred[ 8-15]
-
- ; modifier *= pixel_value
- pmullw xmm0, xmm2
- pmullw xmm1, xmm3
-
- ; expand to double words
- movdqa xmm2, xmm0
- punpcklwd xmm0, xmm7 ; [ 0- 3]
- punpckhwd xmm2, xmm7 ; [ 4- 7]
- movdqa xmm3, xmm1
- punpcklwd xmm1, xmm7 ; [ 8-11]
- punpckhwd xmm3, xmm7 ; [12-15]
-
- ; accumulator
- movdqa xmm4, [rdi]
- movdqa xmm5, [rdi+16]
- movdqa xmm6, [rdi+32]
- movdqa xmm7, [rdi+48]
- ; += modifier
- paddd xmm4, xmm0
- paddd xmm5, xmm2
- paddd xmm6, xmm1
- paddd xmm7, xmm3
- ; write back
- movdqa [rdi], xmm4
- movdqa [rdi+16], xmm5
- movdqa [rdi+32], xmm6
- movdqa [rdi+48], xmm7
- lea rdi, [rdi + 16*4] ; accumulator += 16*(sizeof(int))
-
- cmp rdx, rcx
- je .temporal_filter_apply_epilog
- pxor xmm7, xmm7 ; zero for extraction
- cmp dword ptr [rsp + block_size], 16
- je .temporal_filter_apply_load_16
- jmp .temporal_filter_apply_load_8
-
-.temporal_filter_apply_epilog:
- ; begin epilog
- mov rbp, [rsp + rbp_backup]
- add rsp, stack_size
- pop rsp
- pop rdi
- pop rsi
- RESTORE_GOT
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
-
-SECTION_RODATA
-align 16
-_const_3w:
- times 8 dw 3
-align 16
-_const_top_bit:
- times 8 dw 1<<15
-align 16
-_const_16w:
- times 8 dw 16
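Following the comments in the removed file, the temporal filter weights each predictor pixel by how well it matches the source: the squared difference is scaled by 3, rounded, shifted down by strength, inverted against 16 with saturation, and multiplied by filter_weight before being added into the count and accumulator buffers. A scalar sketch of that per-pixel logic (illustrative helper name, assuming square blocks as the assembly does):

    /* Scalar model of the removed vp9_temporal_filter_apply_sse2 kernel. */
    static void temporal_filter_apply_c(const unsigned char *frame1,
                                        unsigned int stride,
                                        const unsigned char *frame2,
                                        unsigned int block_size,
                                        int strength, int filter_weight,
                                        unsigned int *accumulator,
                                        unsigned short *count) {
      const int rounding = 0x8000 >> (16 - strength);
      for (unsigned int i = 0; i < block_size; ++i) {
        for (unsigned int j = 0; j < block_size; ++j) {
          const int src = frame1[i * stride + j];
          const int pred = frame2[i * block_size + j];
          int modifier = (src - pred) * (src - pred) * 3;
          modifier = (modifier + rounding) >> strength;
          if (modifier > 16) modifier = 16;      /* psubusw saturates at 0 */
          modifier = (16 - modifier) * filter_weight;
          count[i * block_size + j] += (unsigned short)modifier;
          accumulator[i * block_size + j] += (unsigned int)(modifier * pred);
        }
      }
    }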
--- a/vp8/encoder/x86/temporal_filter_x86.h
+++ /dev/null
@@ -1,27 +1,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#ifndef __INC_TEMPORAL_FILTER_X86_H
-#define __INC_TEMPORAL_FILTER_X86_H
-
-#if HAVE_SSE2
-extern prototype_apply(vp9_temporal_filter_apply_sse2);
-
-#if !CONFIG_RUNTIME_CPU_DETECT
-
-#undef vp9_temporal_filter_apply
-#define vp9_temporal_filter_apply vp9_temporal_filter_apply_sse2
-
-#endif
-
-#endif
-
-#endif // __INC_TEMPORAL_FILTER_X86_H
--- a/vp8/encoder/x86/variance_impl_mmx.asm
+++ /dev/null
@@ -1,851 +1,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
-%include "vpx_ports/x86_abi_support.asm"
-
-;unsigned int vp9_get_mb_ss_mmx( short *src_ptr )
-global sym(vp9_get_mb_ss_mmx)
-sym(vp9_get_mb_ss_mmx):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 7
- GET_GOT rbx
- push rsi
- push rdi
- sub rsp, 8
- ; end prolog
-
- mov rax, arg(0) ;src_ptr
- mov rcx, 16
- pxor mm4, mm4
-
-.NEXTROW:
- movq mm0, [rax]
- movq mm1, [rax+8]
- movq mm2, [rax+16]
- movq mm3, [rax+24]
- pmaddwd mm0, mm0
- pmaddwd mm1, mm1
- pmaddwd mm2, mm2
- pmaddwd mm3, mm3
-
- paddd mm4, mm0
- paddd mm4, mm1
- paddd mm4, mm2
- paddd mm4, mm3
-
- add rax, 32
- dec rcx
- ja .NEXTROW
- movq QWORD PTR [rsp], mm4
-
- ;return sum[0]+sum[1];
- movsxd rax, dword ptr [rsp]
- movsxd rcx, dword ptr [rsp+4]
- add rax, rcx
-
-
- ; begin epilog
- add rsp, 8
- pop rdi
- pop rsi
- RESTORE_GOT
- UNSHADOW_ARGS
- pop rbp
- ret
-
-
-;unsigned int vp9_get8x8var_mmx
-;(
-; unsigned char *src_ptr,
-; int source_stride,
-; unsigned char *ref_ptr,
-; int recon_stride,
-; unsigned int *SSE,
-; int *Sum
-;)
-global sym(vp9_get8x8var_mmx)
-sym(vp9_get8x8var_mmx):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 6
- push rsi
- push rdi
- push rbx
- sub rsp, 16
- ; end prolog
-
-
- pxor mm5, mm5 ; Clear mm5 (difference accumulator)
- pxor mm6, mm6 ; Clear mm6 (zero register for unpacking)
- pxor mm7, mm7 ; Clear mm7 (squared-difference accumulator)
-
- mov rax, arg(0) ;[src_ptr] ; Load base addresses
- mov rbx, arg(2) ;[ref_ptr]
- movsxd rcx, dword ptr arg(1) ;[source_stride]
- movsxd rdx, dword ptr arg(3) ;[recon_stride]
-
- ; Row 1
- movq mm0, [rax] ; Copy eight bytes to mm0
- movq mm1, [rbx] ; Copy eight bytes to mm1
- movq mm2, mm0 ; Take copies
- movq mm3, mm1 ; Take copies
-
- punpcklbw mm0, mm6 ; unpack to higher precision
- punpcklbw mm1, mm6
- punpckhbw mm2, mm6 ; unpack to higher precision
- punpckhbw mm3, mm6
- psubsw mm0, mm1 ; A-B (low order) to MM0
- psubsw mm2, mm3 ; A-B (high order) to MM2
-
- paddw mm5, mm0 ; accumulate differences in mm5
- paddw mm5, mm2 ; accumulate differences in mm5
-
- pmaddwd mm0, mm0 ; square and accumulate
- pmaddwd mm2, mm2 ; square and accumulate
- add rbx,rdx ; Inc pointer into ref data
- add rax,rcx ; Inc pointer into the new data
- movq mm1, [rbx] ; Copy eight bytes to mm1
- paddd mm7, mm0 ; accumulate in mm7
- paddd mm7, mm2 ; accumulate in mm7
-
-
- ; Row 2
- movq mm0, [rax] ; Copy eight bytes to mm0
- movq mm2, mm0 ; Take copies
- movq mm3, mm1 ; Take copies
-
- punpcklbw mm0, mm6 ; unpack to higher precision
- punpcklbw mm1, mm6
- punpckhbw mm2, mm6 ; unpack to higher precision
- punpckhbw mm3, mm6
- psubsw mm0, mm1 ; A-B (low order) to MM0
- psubsw mm2, mm3 ; A-B (high order) to MM2
-
- paddw mm5, mm0 ; accumulate differences in mm5
- paddw mm5, mm2 ; accumulate differences in mm5
-
- pmaddwd mm0, mm0 ; square and accumulate
- pmaddwd mm2, mm2 ; square and accumulate
- add rbx,rdx ; Inc pointer into ref data
- add rax,rcx ; Inc pointer into the new data
- movq mm1, [rbx] ; Copy eight bytes to mm1
- paddd mm7, mm0 ; accumulate in mm7
- paddd mm7, mm2 ; accumulate in mm7
-
- ; Row 3
- movq mm0, [rax] ; Copy eight bytes to mm0
- movq mm2, mm0 ; Take copies
- movq mm3, mm1 ; Take copies
-
- punpcklbw mm0, mm6 ; unpack to higher precision
- punpcklbw mm1, mm6
- punpckhbw mm2, mm6 ; unpack to higher precision
- punpckhbw mm3, mm6
- psubsw mm0, mm1 ; A-B (low order) to MM0
- psubsw mm2, mm3 ; A-B (high order) to MM2
-
- paddw mm5, mm0 ; accumulate differences in mm5
- paddw mm5, mm2 ; accumulate differences in mm5
-
- pmaddwd mm0, mm0 ; square and accumulate
- pmaddwd mm2, mm2 ; square and accumulate
- add rbx,rdx ; Inc pointer into ref data
- add rax,rcx ; Inc pointer into the new data
- movq mm1, [rbx] ; Copy eight bytes to mm1
- paddd mm7, mm0 ; accumulate in mm7
- paddd mm7, mm2 ; accumulate in mm7
-
- ; Row 4
- movq mm0, [rax] ; Copy eight bytes to mm0
- movq mm2, mm0 ; Take copies
- movq mm3, mm1 ; Take copies
-
- punpcklbw mm0, mm6 ; unpack to higher precision
- punpcklbw mm1, mm6
- punpckhbw mm2, mm6 ; unpack to higher precision
- punpckhbw mm3, mm6
- psubsw mm0, mm1 ; A-B (low order) to MM0
- psubsw mm2, mm3 ; A-B (high order) to MM2
-
- paddw mm5, mm0 ; accumulate differences in mm5
- paddw mm5, mm2 ; accumulate differences in mm5
-
- pmaddwd mm0, mm0 ; square and accumulate
- pmaddwd mm2, mm2 ; square and accumulate
- add rbx,rdx ; Inc pointer into ref data
- add rax,rcx ; Inc pointer into the new data
- movq mm1, [rbx] ; Copy eight bytes to mm1
- paddd mm7, mm0 ; accumulate in mm7
- paddd mm7, mm2 ; accumulate in mm7
-
- ; Row 5
- movq mm0, [rax] ; Copy eight bytes to mm0
- movq mm2, mm0 ; Take copies
- movq mm3, mm1 ; Take copies
-
- punpcklbw mm0, mm6 ; unpack to higher precision
- punpcklbw mm1, mm6
- punpckhbw mm2, mm6 ; unpack to higher precision
- punpckhbw mm3, mm6
- psubsw mm0, mm1 ; A-B (low order) to MM0
- psubsw mm2, mm3 ; A-B (high order) to MM2
-
- paddw mm5, mm0 ; accumulate differences in mm5
- paddw mm5, mm2 ; accumulate differences in mm5
-
- pmaddwd mm0, mm0 ; square and accumulate
- pmaddwd mm2, mm2 ; square and accumulate
- add rbx,rdx ; Inc pointer into ref data
- add rax,rcx ; Inc pointer into the new data
- movq mm1, [rbx] ; Copy eight bytes to mm1
- ; movq mm4, [rbx + rdx]
- paddd mm7, mm0 ; accumulate in mm7
- paddd mm7, mm2 ; accumulate in mm7
-
- ; Row 6
- movq mm0, [rax] ; Copy eight bytes to mm0
- movq mm2, mm0 ; Take copies
- movq mm3, mm1 ; Take copies
-
- punpcklbw mm0, mm6 ; unpack to higher precision
- punpcklbw mm1, mm6
- punpckhbw mm2, mm6 ; unpack to higher precision
- punpckhbw mm3, mm6
- psubsw mm0, mm1 ; A-B (low order) to MM0
- psubsw mm2, mm3 ; A-B (high order) to MM2
-
- paddw mm5, mm0 ; accumulate differences in mm5
- paddw mm5, mm2 ; accumulate differences in mm5
-
- pmaddwd mm0, mm0 ; square and accumulate
- pmaddwd mm2, mm2 ; square and accumulate
- add rbx,rdx ; Inc pointer into ref data
- add rax,rcx ; Inc pointer into the new data
- movq mm1, [rbx] ; Copy eight bytes to mm1
- paddd mm7, mm0 ; accumulate in mm7
- paddd mm7, mm2 ; accumulate in mm7
-
- ; Row 7
- movq mm0, [rax] ; Copy eight bytes to mm0
- movq mm2, mm0 ; Take copies
- movq mm3, mm1 ; Take copies
-
- punpcklbw mm0, mm6 ; unpack to higher precision
- punpcklbw mm1, mm6
- punpckhbw mm2, mm6 ; unpack to higher precision
- punpckhbw mm3, mm6
- psubsw mm0, mm1 ; A-B (low order) to MM0
- psubsw mm2, mm3 ; A-B (high order) to MM2
-
- paddw mm5, mm0 ; accumulate differences in mm5
- paddw mm5, mm2 ; accumulate differences in mm5
-
- pmaddwd mm0, mm0 ; square and accumulate
- pmaddwd mm2, mm2 ; square and accumulate
- add rbx,rdx ; Inc pointer into ref data
- add rax,rcx ; Inc pointer into the new data
- movq mm1, [rbx] ; Copy eight bytes to mm1
- paddd mm7, mm0 ; accumulate in mm7
- paddd mm7, mm2 ; accumulate in mm7
-
- ; Row 8
- movq mm0, [rax] ; Copy eight bytes to mm0
- movq mm2, mm0 ; Take copies
- movq mm3, mm1 ; Take copies
-
- punpcklbw mm0, mm6 ; unpack to higher precision
- punpcklbw mm1, mm6
- punpckhbw mm2, mm6 ; unpack to higher precision
- punpckhbw mm3, mm6
- psubsw mm0, mm1 ; A-B (low order) to MM0
- psubsw mm2, mm3 ; A-B (high order) to MM2
-
- paddw mm5, mm0 ; accumulate differences in mm5
- paddw mm5, mm2 ; accumulate differences in mm5
-
- pmaddwd mm0, mm0 ; square and accumulate
- pmaddwd mm2, mm2 ; square and accumulate
- add rbx,rdx ; Inc pointer into ref data
- add rax,rcx ; Inc pointer into the new data
- paddd mm7, mm0 ; accumulate in mm7
- paddd mm7, mm2 ; accumulate in mm7
-
- ; Now accumulate the final results.
- movq QWORD PTR [rsp+8], mm5 ; copy back accumulated results into normal memory
- movq QWORD PTR [rsp], mm7 ; copy back accumulated results into normal memory
- movsx rdx, WORD PTR [rsp+8]
- movsx rcx, WORD PTR [rsp+10]
- movsx rbx, WORD PTR [rsp+12]
- movsx rax, WORD PTR [rsp+14]
- add rdx, rcx
- add rbx, rax
- add rdx, rbx ;XSum
- movsxd rax, DWORD PTR [rsp]
- movsxd rcx, DWORD PTR [rsp+4]
- add rax, rcx ;XXSum
- mov rsi, arg(4) ;SSE
- mov rdi, arg(5) ;Sum
- mov dword ptr [rsi], eax
- mov dword ptr [rdi], edx
- xor rax, rax ; return 0
-
-
- ; begin epilog
- add rsp, 16
- pop rbx
- pop rdi
- pop rsi
- UNSHADOW_ARGS
- pop rbp
- ret
-
-
-
-;unsigned int
-;vp9_get4x4var_mmx
-;(
-; unsigned char *src_ptr,
-; int source_stride,
-; unsigned char *ref_ptr,
-; int recon_stride,
-; unsigned int *SSE,
-; int *Sum
-;)
-global sym(vp9_get4x4var_mmx)
-sym(vp9_get4x4var_mmx):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 6
- push rsi
- push rdi
- push rbx
- sub rsp, 16
- ; end prolog
-
-
- pxor mm5, mm5 ; Clear mm5 (difference accumulator)
- pxor mm6, mm6 ; Clear mm6 (zero register for unpacking)
- pxor mm7, mm7 ; Clear mm7 (squared-difference accumulator)
-
- mov rax, arg(0) ;[src_ptr] ; Load base addresses
- mov rbx, arg(2) ;[ref_ptr]
- movsxd rcx, dword ptr arg(1) ;[source_stride]
- movsxd rdx, dword ptr arg(3) ;[recon_stride]
-
- ; Row 1
- movq mm0, [rax] ; Copy eight bytes to mm0
- movq mm1, [rbx] ; Copy eight bytes to mm1
- punpcklbw mm0, mm6 ; unpack to higher precision
- punpcklbw mm1, mm6
- psubsw mm0, mm1 ; A-B (low order) to MM0
- paddw mm5, mm0 ; accumulate differences in mm5
- pmaddwd mm0, mm0 ; square and accumulate
- add rbx,rdx ; Inc pointer into ref data
- add rax,rcx ; Inc pointer into the new data
- movq mm1, [rbx] ; Copy eight bytes to mm1
- paddd mm7, mm0 ; accumulate in mm7
-
-
- ; Row 2
- movq mm0, [rax] ; Copy eight bytes to mm0
- punpcklbw mm0, mm6 ; unpack to higher precision
- punpcklbw mm1, mm6
- psubsw mm0, mm1 ; A-B (low order) to MM0
- paddw mm5, mm0 ; accumulate differences in mm5
-
- pmaddwd mm0, mm0 ; square and accumulate
- add rbx,rdx ; Inc pointer into ref data
- add rax,rcx ; Inc pointer into the new data
- movq mm1, [rbx] ; Copy eight bytes to mm1
- paddd mm7, mm0 ; accumulate in mm7
-
- ; Row 3
- movq mm0, [rax] ; Copy eight bytes to mm0
- punpcklbw mm0, mm6 ; unpack to higher precision
- punpcklbw mm1, mm6
- psubsw mm0, mm1 ; A-B (low order) to MM0
- paddw mm5, mm0 ; accumulate differences in mm5
-
- pmaddwd mm0, mm0 ; square and accumulate
- add rbx,rdx ; Inc pointer into ref data
- add rax,rcx ; Inc pointer into the new data
- movq mm1, [rbx] ; Copy eight bytes to mm1
- paddd mm7, mm0 ; accumulate in mm7
-
- ; Row 4
- movq mm0, [rax] ; Copy eight bytes to mm0
-
- punpcklbw mm0, mm6 ; unpack to higher precision
- punpcklbw mm1, mm6
- psubsw mm0, mm1 ; A-B (low order) to MM0
-
- paddw mm5, mm0 ; accumulate differences in mm5
-
- pmaddwd mm0, mm0 ; square and accumulate
- paddd mm7, mm0 ; accumulate in mm7
-
-
- ; Now accumulate the final results.
- movq QWORD PTR [rsp+8], mm5 ; copy back accumulated results into normal memory
- movq QWORD PTR [rsp], mm7 ; copy back accumulated results into normal memory
- movsx rdx, WORD PTR [rsp+8]
- movsx rcx, WORD PTR [rsp+10]
- movsx rbx, WORD PTR [rsp+12]
- movsx rax, WORD PTR [rsp+14]
- add rdx, rcx
- add rbx, rax
- add rdx, rbx ;XSum
- movsxd rax, DWORD PTR [rsp]
- movsxd rcx, DWORD PTR [rsp+4]
- add rax, rcx ;XXSum
- mov rsi, arg(4) ;SSE
- mov rdi, arg(5) ;Sum
- mov dword ptr [rsi], eax
- mov dword ptr [rdi], edx
- xor rax, rax ; return 0
-
-
- ; begin epilog
- add rsp, 16
- pop rbx
- pop rdi
- pop rsi
- UNSHADOW_ARGS
- pop rbp
- ret
-
-
-
-;unsigned int
-;vp9_get4x4sse_cs_mmx
-;(
-; unsigned char *src_ptr,
-; int source_stride,
-; unsigned char *ref_ptr,
-; int recon_stride
-;)
-global sym(vp9_get4x4sse_cs_mmx)
-sym(vp9_get4x4sse_cs_mmx):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 4
- push rsi
- push rdi
- push rbx
- ; end prolog
-
-
- pxor mm6, mm6 ; Clear mm6 (zero register for unpacking)
- pxor mm7, mm7 ; Clear mm7 (squared-difference accumulator)
-
- mov rax, arg(0) ;[src_ptr] ; Load base addresses
- mov rbx, arg(2) ;[ref_ptr]
- movsxd rcx, dword ptr arg(1) ;[source_stride]
- movsxd rdx, dword ptr arg(3) ;[recon_stride]
- ; Row 1
- movd mm0, [rax] ; Copy four bytes to mm0
- movd mm1, [rbx] ; Copy four bytes to mm1
- punpcklbw mm0, mm6 ; unpack to higher precision
- punpcklbw mm1, mm6
- psubsw mm0, mm1 ; A-B (low order) to MM0
- pmaddwd mm0, mm0 ; square and accumulate
- add rbx,rdx ; Inc pointer into ref data
- add rax,rcx ; Inc pointer into the new data
- movd mm1, [rbx] ; Copy four bytes to mm1
- paddd mm7, mm0 ; accumulate in mm7
-
- ; Row 2
- movd mm0, [rax] ; Copy four bytes to mm0
- punpcklbw mm0, mm6 ; unpack to higher precision
- punpcklbw mm1, mm6
- psubsw mm0, mm1 ; A-B (low order) to MM0
- pmaddwd mm0, mm0 ; square and accumulate
- add rbx,rdx ; Inc pointer into ref data
- add rax,rcx ; Inc pointer into the new data
- movd mm1, [rbx] ; Copy four bytes to mm1
- paddd mm7, mm0 ; accumulate in mm7
-
- ; Row 3
- movd mm0, [rax] ; Copy four bytes to mm0
- punpcklbw mm1, mm6
- punpcklbw mm0, mm6 ; unpack to higher precision
- psubsw mm0, mm1 ; A-B (low order) to MM0
-
- pmaddwd mm0, mm0 ; square and accumulate
- add rbx,rdx ; Inc pointer into ref data
- add rax,rcx ; Inc pointer into the new data
- movd mm1, [rbx] ; Copy four bytes to mm1
- paddd mm7, mm0 ; accumulate in mm7
-
- ; Row 4
- movd mm0, [rax] ; Copy four bytes to mm0
- punpcklbw mm0, mm6 ; unpack to higher precision
- punpcklbw mm1, mm6
- psubsw mm0, mm1 ; A-B (low order) to MM0
- pmaddwd mm0, mm0 ; square and accumulate
- paddd mm7, mm0 ; accumulate in mm7
-
- movq mm0, mm7 ;
- psrlq mm7, 32
-
- paddd mm0, mm7
- movq rax, mm0
-
-
- ; begin epilog
- pop rbx
- pop rdi
- pop rsi
- UNSHADOW_ARGS
- pop rbp
- ret
-
-%define mmx_filter_shift 7
-
-;void vp9_filter_block2d_bil4x4_var_mmx
-;(
-; unsigned char *ref_ptr,
-; int ref_pixels_per_line,
-; unsigned char *src_ptr,
-; int src_pixels_per_line,
-; unsigned short *HFilter,
-; unsigned short *VFilter,
-; int *sum,
-; unsigned int *sumsquared
-;)
-global sym(vp9_filter_block2d_bil4x4_var_mmx)
-sym(vp9_filter_block2d_bil4x4_var_mmx):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 8
- GET_GOT rbx
- push rsi
- push rdi
- sub rsp, 16
- ; end prolog
-
-
- pxor mm6, mm6 ;
- pxor mm7, mm7 ;
-
- mov rax, arg(4) ;HFilter ;
- mov rdx, arg(5) ;VFilter ;
-
- mov rsi, arg(0) ;ref_ptr ;
- mov rdi, arg(2) ;src_ptr ;
-
- mov rcx, 4 ;
- pxor mm0, mm0 ;
-
- movd mm1, [rsi] ;
- movd mm3, [rsi+1] ;
-
- punpcklbw mm1, mm0 ;
- pmullw mm1, [rax] ;
-
- punpcklbw mm3, mm0 ;
- pmullw mm3, [rax+8] ;
-
- paddw mm1, mm3 ;
- paddw mm1, [GLOBAL(mmx_bi_rd)] ;
-
- psraw mm1, mmx_filter_shift ;
- movq mm5, mm1
-
-%if ABI_IS_32BIT
- add rsi, dword ptr arg(1) ;ref_pixels_per_line ;
-%else
- movsxd r8, dword ptr arg(1) ;ref_pixels_per_line ;
- add rsi, r8
-%endif
-
-.filter_block2d_bil4x4_var_mmx_loop:
-
- movd mm1, [rsi] ;
- movd mm3, [rsi+1] ;
-
- punpcklbw mm1, mm0 ;
- pmullw mm1, [rax] ;
-
- punpcklbw mm3, mm0 ;
- pmullw mm3, [rax+8] ;
-
- paddw mm1, mm3 ;
- paddw mm1, [GLOBAL(mmx_bi_rd)] ;
-
- psraw mm1, mmx_filter_shift ;
- movq mm3, mm5 ;
-
- movq mm5, mm1 ;
- pmullw mm3, [rdx] ;
-
- pmullw mm1, [rdx+8] ;
- paddw mm1, mm3 ;
-
-
- paddw mm1, [GLOBAL(mmx_bi_rd)] ;
- psraw mm1, mmx_filter_shift ;
-
- movd mm3, [rdi] ;
- punpcklbw mm3, mm0 ;
-
- psubw mm1, mm3 ;
- paddw mm6, mm1 ;
-
- pmaddwd mm1, mm1 ;
- paddd mm7, mm1 ;
-
-%if ABI_IS_32BIT
- add rsi, dword ptr arg(1) ;ref_pixels_per_line ;
- add rdi, dword ptr arg(3) ;src_pixels_per_line ;
-%else
- movsxd r8, dword ptr arg(1) ;ref_pixels_per_line
- movsxd r9, dword ptr arg(3) ;src_pixels_per_line
- add rsi, r8
- add rdi, r9
-%endif
- sub rcx, 1 ;
- jnz .filter_block2d_bil4x4_var_mmx_loop ;
-
-
- pxor mm3, mm3 ;
- pxor mm2, mm2 ;
-
- punpcklwd mm2, mm6 ;
- punpckhwd mm3, mm6 ;
-
- paddd mm2, mm3 ;
- movq mm6, mm2 ;
-
- psrlq mm6, 32 ;
- paddd mm2, mm6 ;
-
- psrad mm2, 16 ;
- movq mm4, mm7 ;
-
- psrlq mm4, 32 ;
- paddd mm4, mm7 ;
-
- mov rdi, arg(6) ;sum
- mov rsi, arg(7) ;sumsquared
-
- movd dword ptr [rdi], mm2 ;
- movd dword ptr [rsi], mm4 ;
-
-
-
- ; begin epilog
- add rsp, 16
- pop rdi
- pop rsi
- RESTORE_GOT
- UNSHADOW_ARGS
- pop rbp
- ret
-
-
-
-
-;void vp9_filter_block2d_bil_var_mmx
-;(
-; unsigned char *ref_ptr,
-; int ref_pixels_per_line,
-; unsigned char *src_ptr,
-; int src_pixels_per_line,
-; unsigned int Height,
-; unsigned short *HFilter,
-; unsigned short *VFilter,
-; int *sum,
-; unsigned int *sumsquared
-;)
-global sym(vp9_filter_block2d_bil_var_mmx)
-sym(vp9_filter_block2d_bil_var_mmx):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 9
- GET_GOT rbx
- push rsi
- push rdi
- sub rsp, 16
- ; end prolog
-
- pxor mm6, mm6 ;
- pxor mm7, mm7 ;
- mov rax, arg(5) ;HFilter ;
-
- mov rdx, arg(6) ;VFilter ;
- mov rsi, arg(0) ;ref_ptr ;
-
- mov rdi, arg(2) ;src_ptr ;
- movsxd rcx, dword ptr arg(4) ;Height ;
-
- pxor mm0, mm0 ;
- movq mm1, [rsi] ;
-
- movq mm3, [rsi+1] ;
- movq mm2, mm1 ;
-
- movq mm4, mm3 ;
- punpcklbw mm1, mm0 ;
-
- punpckhbw mm2, mm0 ;
- pmullw mm1, [rax] ;
-
- pmullw mm2, [rax] ;
- punpcklbw mm3, mm0 ;
-
- punpckhbw mm4, mm0 ;
- pmullw mm3, [rax+8] ;
-
- pmullw mm4, [rax+8] ;
- paddw mm1, mm3 ;
-
- paddw mm2, mm4 ;
- paddw mm1, [GLOBAL(mmx_bi_rd)] ;
-
- psraw mm1, mmx_filter_shift ;
- paddw mm2, [GLOBAL(mmx_bi_rd)] ;
-
- psraw mm2, mmx_filter_shift ;
- movq mm5, mm1
-
- packuswb mm5, mm2 ;
-%if ABI_IS_32BIT
- add rsi, dword ptr arg(1) ;ref_pixels_per_line
-%else
- movsxd r8, dword ptr arg(1) ;ref_pixels_per_line
- add rsi, r8
-%endif
-
-.filter_block2d_bil_var_mmx_loop:
-
- movq mm1, [rsi] ;
- movq mm3, [rsi+1] ;
-
- movq mm2, mm1 ;
- movq mm4, mm3 ;
-
- punpcklbw mm1, mm0 ;
- punpckhbw mm2, mm0 ;
-
- pmullw mm1, [rax] ;
- pmullw mm2, [rax] ;
-
- punpcklbw mm3, mm0 ;
- punpckhbw mm4, mm0 ;
-
- pmullw mm3, [rax+8] ;
- pmullw mm4, [rax+8] ;
-
- paddw mm1, mm3 ;
- paddw mm2, mm4 ;
-
- paddw mm1, [GLOBAL(mmx_bi_rd)] ;
- psraw mm1, mmx_filter_shift ;
-
- paddw mm2, [GLOBAL(mmx_bi_rd)] ;
- psraw mm2, mmx_filter_shift ;
-
- movq mm3, mm5 ;
- movq mm4, mm5 ;
-
- punpcklbw mm3, mm0 ;
- punpckhbw mm4, mm0 ;
-
- movq mm5, mm1 ;
- packuswb mm5, mm2 ;
-
- pmullw mm3, [rdx] ;
- pmullw mm4, [rdx] ;
-
- pmullw mm1, [rdx+8] ;
- pmullw mm2, [rdx+8] ;
-
- paddw mm1, mm3 ;
- paddw mm2, mm4 ;
-
- paddw mm1, [GLOBAL(mmx_bi_rd)] ;
- paddw mm2, [GLOBAL(mmx_bi_rd)] ;
-
- psraw mm1, mmx_filter_shift ;
- psraw mm2, mmx_filter_shift ;
-
- movq mm3, [rdi] ;
- movq mm4, mm3 ;
-
- punpcklbw mm3, mm0 ;
- punpckhbw mm4, mm0 ;
-
- psubw mm1, mm3 ;
- psubw mm2, mm4 ;
-
- paddw mm6, mm1 ;
- pmaddwd mm1, mm1 ;
-
- paddw mm6, mm2 ;
- pmaddwd mm2, mm2 ;
-
- paddd mm7, mm1 ;
- paddd mm7, mm2 ;
-
-%if ABI_IS_32BIT
- add rsi, dword ptr arg(1) ;ref_pixels_per_line ;
- add rdi, dword ptr arg(3) ;src_pixels_per_line ;
-%else
- movsxd r8, dword ptr arg(1) ;ref_pixels_per_line ;
- movsxd r9, dword ptr arg(3) ;src_pixels_per_line ;
- add rsi, r8
- add rdi, r9
-%endif
- sub rcx, 1 ;
- jnz .filter_block2d_bil_var_mmx_loop ;
-
-
- pxor mm3, mm3 ;
- pxor mm2, mm2 ;
-
- punpcklwd mm2, mm6 ;
- punpckhwd mm3, mm6 ;
-
- paddd mm2, mm3 ;
- movq mm6, mm2 ;
-
- psrlq mm6, 32 ;
- paddd mm2, mm6 ;
-
- psrad mm2, 16 ;
- movq mm4, mm7 ;
-
- psrlq mm4, 32 ;
- paddd mm4, mm7 ;
-
- mov rdi, arg(7) ;sum
- mov rsi, arg(8) ;sumsquared
-
- movd dword ptr [rdi], mm2 ;
- movd dword ptr [rsi], mm4 ;
-
- ; begin epilog
- add rsp, 16
- pop rdi
- pop rsi
- RESTORE_GOT
- UNSHADOW_ARGS
- pop rbp
- ret
-
-
-SECTION_RODATA
-;short mmx_bi_rd[4] = { 64, 64, 64, 64};
-align 16
-mmx_bi_rd:
- times 4 dw 64
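The get-var helpers removed above (and the bilinear-filtered variants in the same file) return two accumulations: the signed sum of pixel differences and the sum of squared differences, from which the callers form the variance. A scalar sketch of the unfiltered case, with an illustrative name; the filtered variants first run a two-tap bilinear filter over the reference before differencing:

    /* Scalar model of the removed vp9_getNxNvar_mmx helpers: the caller
     * derives variance as SSE - Sum*Sum/(w*h). */
    static unsigned int get_var_c(const unsigned char *src_ptr, int source_stride,
                                  const unsigned char *ref_ptr, int recon_stride,
                                  int w, int h, unsigned int *sse, int *sum) {
      int s = 0;
      unsigned int ss = 0;
      for (int y = 0; y < h; ++y) {
        for (int x = 0; x < w; ++x) {
          int d = src_ptr[y * source_stride + x] - ref_ptr[y * recon_stride + x];
          s += d;
          ss += d * d;
        }
      }
      *sum = s;
      *sse = ss;
      return 0;  /* the assembly versions also return 0 */
    }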
--- a/vp8/encoder/x86/variance_impl_sse2.asm
+++ /dev/null
@@ -1,1367 +1,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
-%include "vpx_ports/x86_abi_support.asm"
-
-%define xmm_filter_shift 7
-
-;unsigned int vp9_get_mb_ss_sse2
-;(
-; short *src_ptr
-;)
-global sym(vp9_get_mb_ss_sse2)
-sym(vp9_get_mb_ss_sse2):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 1
- GET_GOT rbx
- push rsi
- push rdi
- sub rsp, 16
- ; end prolog
-
-
- mov rax, arg(0) ;[src_ptr]
- mov rcx, 8
- pxor xmm4, xmm4
-
-.NEXTROW:
- movdqa xmm0, [rax]
- movdqa xmm1, [rax+16]
- movdqa xmm2, [rax+32]
- movdqa xmm3, [rax+48]
- pmaddwd xmm0, xmm0
- pmaddwd xmm1, xmm1
- pmaddwd xmm2, xmm2
- pmaddwd xmm3, xmm3
-
- paddd xmm0, xmm1
- paddd xmm2, xmm3
- paddd xmm4, xmm0
- paddd xmm4, xmm2
-
- add rax, 0x40
- dec rcx
- ja .NEXTROW
-
- movdqa xmm3,xmm4
- psrldq xmm4,8
- paddd xmm4,xmm3
- movdqa xmm3,xmm4
- psrldq xmm4,4
- paddd xmm4,xmm3
- movq rax,xmm4
-
-
- ; begin epilog
- add rsp, 16
- pop rdi
- pop rsi
- RESTORE_GOT
- UNSHADOW_ARGS
- pop rbp
- ret
-
-
-;unsigned int vp9_get16x16var_sse2
-;(
-; unsigned char * src_ptr,
-; int source_stride,
-; unsigned char * ref_ptr,
-; int recon_stride,
-; unsigned int * SSE,
-; int * Sum
-;)
-global sym(vp9_get16x16var_sse2)
-sym(vp9_get16x16var_sse2):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 6
- SAVE_XMM 7
- push rbx
- push rsi
- push rdi
- ; end prolog
-
- mov rsi, arg(0) ;[src_ptr]
- mov rdi, arg(2) ;[ref_ptr]
-
- movsxd rax, DWORD PTR arg(1) ;[source_stride]
- movsxd rdx, DWORD PTR arg(3) ;[recon_stride]
-
- ; Prefetch data
- lea rcx, [rax+rax*2]
- prefetcht0 [rsi]
- prefetcht0 [rsi+rax]
- prefetcht0 [rsi+rax*2]
- prefetcht0 [rsi+rcx]
- lea rbx, [rsi+rax*4]
- prefetcht0 [rbx]
- prefetcht0 [rbx+rax]
- prefetcht0 [rbx+rax*2]
- prefetcht0 [rbx+rcx]
-
- lea rcx, [rdx+rdx*2]
- prefetcht0 [rdi]
- prefetcht0 [rdi+rdx]
- prefetcht0 [rdi+rdx*2]
- prefetcht0 [rdi+rcx]
- lea rbx, [rdi+rdx*4]
- prefetcht0 [rbx]
- prefetcht0 [rbx+rdx]
- prefetcht0 [rbx+rdx*2]
- prefetcht0 [rbx+rcx]
-
- pxor xmm0, xmm0 ; clear xmm0 for unpack
- pxor xmm7, xmm7 ; clear xmm7 for accumulating diffs
-
- pxor xmm6, xmm6 ; clear xmm6 for accumulating sse
- mov rcx, 16
-
-.var16loop:
- movdqu xmm1, XMMWORD PTR [rsi]
- movdqu xmm2, XMMWORD PTR [rdi]
-
- prefetcht0 [rsi+rax*8]
- prefetcht0 [rdi+rdx*8]
-
- movdqa xmm3, xmm1
- movdqa xmm4, xmm2
-
-
- punpcklbw xmm1, xmm0
- punpckhbw xmm3, xmm0
-
- punpcklbw xmm2, xmm0
- punpckhbw xmm4, xmm0
-
-
- psubw xmm1, xmm2
- psubw xmm3, xmm4
-
- paddw xmm7, xmm1
- pmaddwd xmm1, xmm1
-
- paddw xmm7, xmm3
- pmaddwd xmm3, xmm3
-
- paddd xmm6, xmm1
- paddd xmm6, xmm3
-
- add rsi, rax
- add rdi, rdx
-
- sub rcx, 1
- jnz .var16loop
-
-
- movdqa xmm1, xmm6
- pxor xmm6, xmm6
-
- pxor xmm5, xmm5
- punpcklwd xmm6, xmm7
-
- punpckhwd xmm5, xmm7
- psrad xmm5, 16
-
- psrad xmm6, 16
- paddd xmm6, xmm5
-
- movdqa xmm2, xmm1
- punpckldq xmm1, xmm0
-
- punpckhdq xmm2, xmm0
- movdqa xmm7, xmm6
-
- paddd xmm1, xmm2
- punpckldq xmm6, xmm0
-
- punpckhdq xmm7, xmm0
- paddd xmm6, xmm7
-
- movdqa xmm2, xmm1
- movdqa xmm7, xmm6
-
- psrldq xmm1, 8
- psrldq xmm6, 8
-
- paddd xmm7, xmm6
- paddd xmm1, xmm2
-
- mov rax, arg(5) ;[Sum]
- mov rdi, arg(4) ;[SSE]
-
- movd DWORD PTR [rax], xmm7
- movd DWORD PTR [rdi], xmm1
-
-
- ; begin epilog
- pop rdi
- pop rsi
- pop rbx
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
-
-
-
-
-;unsigned int vp9_get8x8var_sse2
-;(
-; unsigned char * src_ptr,
-; int source_stride,
-; unsigned char * ref_ptr,
-; int recon_stride,
-; unsigned int * SSE,
-; int * Sum
-;)
-global sym(vp9_get8x8var_sse2)
-sym(vp9_get8x8var_sse2):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 6
- SAVE_XMM 7
- GET_GOT rbx
- push rsi
- push rdi
- sub rsp, 16
- ; end prolog
-
- mov rsi, arg(0) ;[src_ptr]
- mov rdi, arg(2) ;[ref_ptr]
-
- movsxd rax, DWORD PTR arg(1) ;[source_stride]
- movsxd rdx, DWORD PTR arg(3) ;[recon_stride]
-
- pxor xmm0, xmm0 ; clear xmm0 for unpack
- pxor xmm7, xmm7 ; clear xmm7 for accumulating diffs
-
- movq xmm1, QWORD PTR [rsi]
- movq xmm2, QWORD PTR [rdi]
-
- punpcklbw xmm1, xmm0
- punpcklbw xmm2, xmm0
-
- psubsw xmm1, xmm2
- paddw xmm7, xmm1
-
- pmaddwd xmm1, xmm1
-
- movq xmm2, QWORD PTR[rsi + rax]
- movq xmm3, QWORD PTR[rdi + rdx]
-
- punpcklbw xmm2, xmm0
- punpcklbw xmm3, xmm0
-
- psubsw xmm2, xmm3
- paddw xmm7, xmm2
-
- pmaddwd xmm2, xmm2
- paddd xmm1, xmm2
-
-
- movq xmm2, QWORD PTR[rsi + rax * 2]
- movq xmm3, QWORD PTR[rdi + rdx * 2]
-
- punpcklbw xmm2, xmm0
- punpcklbw xmm3, xmm0
-
- psubsw xmm2, xmm3
- paddw xmm7, xmm2
-
- pmaddwd xmm2, xmm2
- paddd xmm1, xmm2
-
-
- lea rsi, [rsi + rax * 2]
- lea rdi, [rdi + rdx * 2]
- movq xmm2, QWORD PTR[rsi + rax]
- movq xmm3, QWORD PTR[rdi + rdx]
-
- punpcklbw xmm2, xmm0
- punpcklbw xmm3, xmm0
-
- psubsw xmm2, xmm3
- paddw xmm7, xmm2
-
- pmaddwd xmm2, xmm2
- paddd xmm1, xmm2
-
- movq xmm2, QWORD PTR[rsi + rax *2]
- movq xmm3, QWORD PTR[rdi + rdx *2]
-
- punpcklbw xmm2, xmm0
- punpcklbw xmm3, xmm0
-
- psubsw xmm2, xmm3
- paddw xmm7, xmm2
-
- pmaddwd xmm2, xmm2
- paddd xmm1, xmm2
-
-
- lea rsi, [rsi + rax * 2]
- lea rdi, [rdi + rdx * 2]
-
-
- movq xmm2, QWORD PTR[rsi + rax]
- movq xmm3, QWORD PTR[rdi + rdx]
-
- punpcklbw xmm2, xmm0
- punpcklbw xmm3, xmm0
-
- psubsw xmm2, xmm3
- paddw xmm7, xmm2
-
- pmaddwd xmm2, xmm2
- paddd xmm1, xmm2
-
- movq xmm2, QWORD PTR[rsi + rax *2]
- movq xmm3, QWORD PTR[rdi + rdx *2]
-
- punpcklbw xmm2, xmm0
- punpcklbw xmm3, xmm0
-
- psubsw xmm2, xmm3
- paddw xmm7, xmm2
-
- pmaddwd xmm2, xmm2
- paddd xmm1, xmm2
-
-
- lea rsi, [rsi + rax * 2]
- lea rdi, [rdi + rdx * 2]
-
- movq xmm2, QWORD PTR[rsi + rax]
- movq xmm3, QWORD PTR[rdi + rdx]
-
- punpcklbw xmm2, xmm0
- punpcklbw xmm3, xmm0
-
- psubsw xmm2, xmm3
- paddw xmm7, xmm2
-
- pmaddwd xmm2, xmm2
- paddd xmm1, xmm2
-
-
- movdqa xmm6, xmm7
- punpcklwd xmm6, xmm0
-
- punpckhwd xmm7, xmm0
- movdqa xmm2, xmm1
-
- paddw xmm6, xmm7
- punpckldq xmm1, xmm0
-
- punpckhdq xmm2, xmm0
- movdqa xmm7, xmm6
-
- paddd xmm1, xmm2
- punpckldq xmm6, xmm0
-
- punpckhdq xmm7, xmm0
- paddw xmm6, xmm7
-
- movdqa xmm2, xmm1
- movdqa xmm7, xmm6
-
- psrldq xmm1, 8
- psrldq xmm6, 8
-
- paddw xmm7, xmm6
- paddd xmm1, xmm2
-
- mov rax, arg(5) ;[Sum]
- mov rdi, arg(4) ;[SSE]
-
- movq rdx, xmm7
- movsx rcx, dx
-
- mov dword ptr [rax], ecx
- movd DWORD PTR [rdi], xmm1
-
- ; begin epilog
- add rsp, 16
- pop rdi
- pop rsi
- RESTORE_GOT
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
-
-;void vp9_filter_block2d_bil_var_sse2
-;(
-; unsigned char *ref_ptr,
-; int ref_pixels_per_line,
-; unsigned char *src_ptr,
-; int src_pixels_per_line,
-; unsigned int Height,
-; int xoffset,
-; int yoffset,
-; int *sum,
-; unsigned int *sumsquared
-;
-;)
-global sym(vp9_filter_block2d_bil_var_sse2)
-sym(vp9_filter_block2d_bil_var_sse2):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 9
- SAVE_XMM 7
- GET_GOT rbx
- push rsi
- push rdi
- push rbx
- ; end prolog
-
- pxor xmm6, xmm6 ;
- pxor xmm7, xmm7 ;
-
- lea rsi, [GLOBAL(xmm_bi_rd)] ; rounding
- movdqa xmm4, XMMWORD PTR [rsi]
-
- lea rcx, [GLOBAL(bilinear_filters_sse2)]
- movsxd rax, dword ptr arg(5) ; xoffset
-
- cmp rax, 0 ; skip first_pass filter if xoffset=0
- je filter_block2d_bil_var_sse2_sp_only
-
- shl rax, 5 ; point to filter coeff with xoffset
- lea rax, [rax + rcx] ; HFilter
-
- movsxd rdx, dword ptr arg(6) ; yoffset
-
- cmp rdx, 0 ; skip second_pass filter if yoffset=0
- je filter_block2d_bil_var_sse2_fp_only
-
- shl rdx, 5
- lea rdx, [rdx + rcx] ; VFilter
-
- mov rsi, arg(0) ;ref_ptr
- mov rdi, arg(2) ;src_ptr
- movsxd rcx, dword ptr arg(4) ;Height
-
- pxor xmm0, xmm0 ;
- movq xmm1, QWORD PTR [rsi] ;
- movq xmm3, QWORD PTR [rsi+1] ;
-
- punpcklbw xmm1, xmm0 ;
- pmullw xmm1, [rax] ;
- punpcklbw xmm3, xmm0
- pmullw xmm3, [rax+16] ;
-
- paddw xmm1, xmm3 ;
- paddw xmm1, xmm4 ;
- psraw xmm1, xmm_filter_shift ;
- movdqa xmm5, xmm1
-
- movsxd rbx, dword ptr arg(1) ;ref_pixels_per_line
- lea rsi, [rsi + rbx]
-%if ABI_IS_32BIT=0
- movsxd r9, dword ptr arg(3) ;src_pixels_per_line
-%endif
-
-filter_block2d_bil_var_sse2_loop:
- movq xmm1, QWORD PTR [rsi] ;
- movq xmm3, QWORD PTR [rsi+1] ;
-
- punpcklbw xmm1, xmm0 ;
- pmullw xmm1, [rax] ;
- punpcklbw xmm3, xmm0 ;
- pmullw xmm3, [rax+16] ;
-
- paddw xmm1, xmm3 ;
- paddw xmm1, xmm4 ;
- psraw xmm1, xmm_filter_shift ;
-
- movdqa xmm3, xmm5 ;
- movdqa xmm5, xmm1 ;
-
- pmullw xmm3, [rdx] ;
- pmullw xmm1, [rdx+16] ;
- paddw xmm1, xmm3 ;
- paddw xmm1, xmm4 ;
- psraw xmm1, xmm_filter_shift ;
-
- movq xmm3, QWORD PTR [rdi] ;
- punpcklbw xmm3, xmm0 ;
-
- psubw xmm1, xmm3 ;
- paddw xmm6, xmm1 ;
-
- pmaddwd xmm1, xmm1 ;
- paddd xmm7, xmm1 ;
-
- lea rsi, [rsi + rbx] ;ref_pixels_per_line
-%if ABI_IS_32BIT
- add rdi, dword ptr arg(3) ;src_pixels_per_line
-%else
- lea rdi, [rdi + r9]
-%endif
-
- sub rcx, 1 ;
- jnz filter_block2d_bil_var_sse2_loop ;
-
- jmp filter_block2d_bil_variance
-
-filter_block2d_bil_var_sse2_sp_only:
- movsxd rdx, dword ptr arg(6) ; yoffset
-
- cmp rdx, 0 ; skip all if both xoffset=0 and yoffset=0
- je filter_block2d_bil_var_sse2_full_pixel
-
- shl rdx, 5
- lea rdx, [rdx + rcx] ; VFilter
-
- mov rsi, arg(0) ;ref_ptr
- mov rdi, arg(2) ;src_ptr
- movsxd rcx, dword ptr arg(4) ;Height
- movsxd rax, dword ptr arg(1) ;ref_pixels_per_line
-
- pxor xmm0, xmm0 ;
- movq xmm1, QWORD PTR [rsi] ;
- punpcklbw xmm1, xmm0 ;
-
- movsxd rbx, dword ptr arg(3) ;src_pixels_per_line
- lea rsi, [rsi + rax]
-
-filter_block2d_bil_sp_only_loop:
- movq xmm3, QWORD PTR [rsi] ;
- punpcklbw xmm3, xmm0 ;
- movdqa xmm5, xmm3
-
- pmullw xmm1, [rdx] ;
- pmullw xmm3, [rdx+16] ;
- paddw xmm1, xmm3 ;
- paddw xmm1, xmm4 ;
- psraw xmm1, xmm_filter_shift ;
-
- movq xmm3, QWORD PTR [rdi] ;
- punpcklbw xmm3, xmm0 ;
-
- psubw xmm1, xmm3 ;
- paddw xmm6, xmm1 ;
-
- pmaddwd xmm1, xmm1 ;
- paddd xmm7, xmm1 ;
-
- movdqa xmm1, xmm5 ;
- lea rsi, [rsi + rax] ;ref_pixels_per_line
- lea rdi, [rdi + rbx] ;src_pixels_per_line
-
- sub rcx, 1 ;
- jnz filter_block2d_bil_sp_only_loop ;
-
- jmp filter_block2d_bil_variance
-
-filter_block2d_bil_var_sse2_full_pixel:
- mov rsi, arg(0) ;ref_ptr
- mov rdi, arg(2) ;src_ptr
- movsxd rcx, dword ptr arg(4) ;Height
- movsxd rax, dword ptr arg(1) ;ref_pixels_per_line
- movsxd rbx, dword ptr arg(3) ;src_pixels_per_line
- pxor xmm0, xmm0 ;
-
-filter_block2d_bil_full_pixel_loop:
- movq xmm1, QWORD PTR [rsi] ;
- punpcklbw xmm1, xmm0 ;
-
- movq xmm2, QWORD PTR [rdi] ;
- punpcklbw xmm2, xmm0 ;
-
- psubw xmm1, xmm2 ;
- paddw xmm6, xmm1 ;
-
- pmaddwd xmm1, xmm1 ;
- paddd xmm7, xmm1 ;
-
- lea rsi, [rsi + rax] ;ref_pixels_per_line
- lea rdi, [rdi + rbx] ;src_pixels_per_line
-
- sub rcx, 1 ;
- jnz filter_block2d_bil_full_pixel_loop ;
-
- jmp filter_block2d_bil_variance
-
-filter_block2d_bil_var_sse2_fp_only:
- mov rsi, arg(0) ;ref_ptr
- mov rdi, arg(2) ;src_ptr
- movsxd rcx, dword ptr arg(4) ;Height
- movsxd rdx, dword ptr arg(1) ;ref_pixels_per_line
-
- pxor xmm0, xmm0 ;
- movsxd rbx, dword ptr arg(3) ;src_pixels_per_line
-
-filter_block2d_bil_fp_only_loop:
- movq xmm1, QWORD PTR [rsi] ;
- movq xmm3, QWORD PTR [rsi+1] ;
-
- punpcklbw xmm1, xmm0 ;
- pmullw xmm1, [rax] ;
- punpcklbw xmm3, xmm0 ;
- pmullw xmm3, [rax+16] ;
-
- paddw xmm1, xmm3 ;
- paddw xmm1, xmm4 ;
- psraw xmm1, xmm_filter_shift ;
-
- movq xmm3, QWORD PTR [rdi] ;
- punpcklbw xmm3, xmm0 ;
-
- psubw xmm1, xmm3 ;
- paddw xmm6, xmm1 ;
-
- pmaddwd xmm1, xmm1 ;
- paddd xmm7, xmm1 ;
- lea rsi, [rsi + rdx]
- lea rdi, [rdi + rbx] ;src_pixels_per_line
-
- sub rcx, 1 ;
- jnz filter_block2d_bil_fp_only_loop ;
-
- jmp filter_block2d_bil_variance
-
-filter_block2d_bil_variance:
- movdq2q mm6, xmm6 ;
- movdq2q mm7, xmm7 ;
-
- psrldq xmm6, 8
- psrldq xmm7, 8
-
- movdq2q mm2, xmm6
- movdq2q mm3, xmm7
-
- paddw mm6, mm2
- paddd mm7, mm3
-
- pxor mm3, mm3 ;
- pxor mm2, mm2 ;
-
- punpcklwd mm2, mm6 ;
- punpckhwd mm3, mm6 ;
-
- paddd mm2, mm3 ;
- movq mm6, mm2 ;
-
- psrlq mm6, 32 ;
- paddd mm2, mm6 ;
-
- psrad mm2, 16 ;
- movq mm4, mm7 ;
-
- psrlq mm4, 32 ;
- paddd mm4, mm7 ;
-
- mov rsi, arg(7) ; sum
- mov rdi, arg(8) ; sumsquared
-
- movd [rsi], mm2 ; xsum
- movd [rdi], mm4 ; xxsum
-
- ; begin epilog
- pop rbx
- pop rdi
- pop rsi
- RESTORE_GOT
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
-
-
-;void vp9_half_horiz_vert_variance8x_h_sse2
-;(
-; unsigned char *ref_ptr,
-; int ref_pixels_per_line,
-; unsigned char *src_ptr,
-; int src_pixels_per_line,
-; unsigned int Height,
-; int *sum,
-; unsigned int *sumsquared
-;)
-global sym(vp9_half_horiz_vert_variance8x_h_sse2)
-sym(vp9_half_horiz_vert_variance8x_h_sse2):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 7
- SAVE_XMM 7
- GET_GOT rbx
- push rsi
- push rdi
- ; end prolog
-
-%if ABI_IS_32BIT=0
- movsxd r8, dword ptr arg(1) ;ref_pixels_per_line
- movsxd r9, dword ptr arg(3) ;src_pixels_per_line
-%endif
-
- pxor xmm6, xmm6 ; error accumulator
- pxor xmm7, xmm7 ; sse accumulator
- mov rsi, arg(0) ;ref_ptr ;
-
- mov rdi, arg(2) ;src_ptr ;
- movsxd rcx, dword ptr arg(4) ;Height ;
- movsxd rax, dword ptr arg(1) ;ref_pixels_per_line
-
- pxor xmm0, xmm0 ;
-
- movq xmm5, QWORD PTR [rsi] ; xmm5 = s0,s1,s2..s8
- movq xmm3, QWORD PTR [rsi+1] ; xmm3 = s1,s2,s3..s9
- pavgb xmm5, xmm3 ; xmm5 = avg(xmm5,xmm3) horizontal line 1
-
-%if ABI_IS_32BIT
- add rsi, dword ptr arg(1) ;ref_pixels_per_line ; next source
-%else
- add rsi, r8
-%endif
-
-.half_horiz_vert_variance8x_h_1:
-
- movq xmm1, QWORD PTR [rsi] ;
- movq xmm2, QWORD PTR [rsi+1] ;
- pavgb xmm1, xmm2 ; xmm1 = avg(xmm1,xmm2) horizontal line i+1
-
- pavgb xmm5, xmm1 ; xmm5 = vertical average of the above
- punpcklbw xmm5, xmm0 ; xmm5 = words of above
-
- movq xmm3, QWORD PTR [rdi] ; xmm3 = d0,d1,d2..d8
- punpcklbw xmm3, xmm0 ; xmm3 = words of above
-
- psubw xmm5, xmm3 ; xmm5 -= xmm3
- paddw xmm6, xmm5 ; xmm6 += accumulated column differences
- pmaddwd xmm5, xmm5 ; xmm5 *= xmm5
- paddd xmm7, xmm5 ; xmm7 += accumulated square column differences
-
- movdqa xmm5, xmm1 ; save xmm1 for use on the next row
-
-%if ABI_IS_32BIT
- add esi, dword ptr arg(1) ;ref_pixels_per_line ; next source
- add edi, dword ptr arg(3) ;src_pixels_per_line ; next destination
-%else
- add rsi, r8
- add rdi, r9
-%endif
-
- sub rcx, 1 ;
- jnz .half_horiz_vert_variance8x_h_1 ;
-
- movdq2q mm6, xmm6 ;
- movdq2q mm7, xmm7 ;
-
- psrldq xmm6, 8
- psrldq xmm7, 8
-
- movdq2q mm2, xmm6
- movdq2q mm3, xmm7
-
- paddw mm6, mm2
- paddd mm7, mm3
-
- pxor mm3, mm3 ;
- pxor mm2, mm2 ;
-
- punpcklwd mm2, mm6 ;
- punpckhwd mm3, mm6 ;
-
- paddd mm2, mm3 ;
- movq mm6, mm2 ;
-
- psrlq mm6, 32 ;
- paddd mm2, mm6 ;
-
- psrad mm2, 16 ;
- movq mm4, mm7 ;
-
- psrlq mm4, 32 ;
- paddd mm4, mm7 ;
-
- mov rsi, arg(5) ; sum
- mov rdi, arg(6) ; sumsquared
-
- movd [rsi], mm2 ;
- movd [rdi], mm4 ;
-
-
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_GOT
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
-
-;void vp9_half_horiz_vert_variance16x_h_sse2
-;(
-; unsigned char *ref_ptr,
-; int ref_pixels_per_line,
-; unsigned char *src_ptr,
-; int src_pixels_per_line,
-; unsigned int Height,
-; int *sum,
-; unsigned int *sumsquared
-;)
-global sym(vp9_half_horiz_vert_variance16x_h_sse2)
-sym(vp9_half_horiz_vert_variance16x_h_sse2):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 7
- SAVE_XMM 7
- GET_GOT rbx
- push rsi
- push rdi
- ; end prolog
-
- pxor xmm6, xmm6 ; error accumulator
- pxor xmm7, xmm7 ; sse accumulator
- mov rsi, arg(0) ;ref_ptr ;
-
- mov rdi, arg(2) ;src_ptr ;
- movsxd rcx, dword ptr arg(4) ;Height ;
- movsxd rax, dword ptr arg(1) ;ref_pixels_per_line
- movsxd rdx, dword ptr arg(3) ;src_pixels_per_line
-
- pxor xmm0, xmm0 ;
-
- movdqu xmm5, XMMWORD PTR [rsi]
- movdqu xmm3, XMMWORD PTR [rsi+1]
- pavgb xmm5, xmm3 ; xmm5 = avg(xmm5,xmm3) horizontal line 1
-
- lea rsi, [rsi + rax]
-
-.half_horiz_vert_variance16x_h_1:
- movdqu xmm1, XMMWORD PTR [rsi] ;
- movdqu xmm2, XMMWORD PTR [rsi+1] ;
- pavgb xmm1, xmm2 ; xmm1 = avg(xmm1,xmm2) horizontal line i+1
-
- pavgb xmm5, xmm1 ; xmm5 = vertical average of the above
-
- movdqa xmm4, xmm5
- punpcklbw xmm5, xmm0 ; xmm5 = words of above
- punpckhbw xmm4, xmm0
-
- movq xmm3, QWORD PTR [rdi] ; xmm3 = d0,d1,d2..d7
- punpcklbw xmm3, xmm0 ; xmm3 = words of above
- psubw xmm5, xmm3 ; xmm5 -= xmm3
-
- movq xmm3, QWORD PTR [rdi+8]
- punpcklbw xmm3, xmm0
- psubw xmm4, xmm3
-
- paddw xmm6, xmm5 ; xmm6 += accumulated column differences
- paddw xmm6, xmm4
- pmaddwd xmm5, xmm5 ; xmm5 *= xmm5
- pmaddwd xmm4, xmm4
- paddd xmm7, xmm5 ; xmm7 += accumulated square column differences
- paddd xmm7, xmm4
-
- movdqa xmm5, xmm1 ; save xmm1 for use on the next row
-
- lea rsi, [rsi + rax]
- lea rdi, [rdi + rdx]
-
- sub rcx, 1 ;
- jnz .half_horiz_vert_variance16x_h_1 ;
-
- pxor xmm1, xmm1
- pxor xmm5, xmm5
-
- punpcklwd xmm0, xmm6
- punpckhwd xmm1, xmm6
- psrad xmm0, 16
- psrad xmm1, 16
- paddd xmm0, xmm1
- movdqa xmm1, xmm0
-
- movdqa xmm6, xmm7
- punpckldq xmm6, xmm5
- punpckhdq xmm7, xmm5
- paddd xmm6, xmm7
-
- punpckldq xmm0, xmm5
- punpckhdq xmm1, xmm5
- paddd xmm0, xmm1
-
- movdqa xmm7, xmm6
- movdqa xmm1, xmm0
-
- psrldq xmm7, 8
- psrldq xmm1, 8
-
- paddd xmm6, xmm7
- paddd xmm0, xmm1
-
- mov rsi, arg(5) ;[Sum]
- mov rdi, arg(6) ;[SSE]
-
- movd [rsi], xmm0
- movd [rdi], xmm6
-
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_GOT
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
-
-
-;void vp9_half_vert_variance8x_h_sse2
-;(
-; unsigned char *ref_ptr,
-; int ref_pixels_per_line,
-; unsigned char *src_ptr,
-; int src_pixels_per_line,
-; unsigned int Height,
-; int *sum,
-; unsigned int *sumsquared
-;)
-global sym(vp9_half_vert_variance8x_h_sse2)
-sym(vp9_half_vert_variance8x_h_sse2):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 7
- SAVE_XMM 7
- GET_GOT rbx
- push rsi
- push rdi
- ; end prolog
-
-%if ABI_IS_32BIT=0
- movsxd r8, dword ptr arg(1) ;ref_pixels_per_line
- movsxd r9, dword ptr arg(3) ;src_pixels_per_line
-%endif
-
- pxor xmm6, xmm6 ; error accumulator
- pxor xmm7, xmm7 ; sse accumulator
- mov rsi, arg(0) ;ref_ptr ;
-
- mov rdi, arg(2) ;src_ptr ;
- movsxd rcx, dword ptr arg(4) ;Height ;
- movsxd rax, dword ptr arg(1) ;ref_pixels_per_line
-
- pxor xmm0, xmm0 ;
-.half_vert_variance8x_h_1:
- movq xmm5, QWORD PTR [rsi] ; xmm5 = s0,s1,s2..s8
- movq xmm3, QWORD PTR [rsi+rax] ; xmm3 = s0,s1,s2..s8 of the next row
-
- pavgb xmm5, xmm3 ; xmm5 = avg(xmm5,xmm3)
- punpcklbw xmm5, xmm0 ; xmm5 = words of above
-
- movq xmm3, QWORD PTR [rdi] ; xmm3 = d0,d1,d2..d8
- punpcklbw xmm3, xmm0 ; xmm3 = words of above
-
- psubw xmm5, xmm3 ; xmm5 -= xmm3
- paddw xmm6, xmm5 ; xmm6 += accumulated column differences
- pmaddwd xmm5, xmm5 ; xmm5 *= xmm5
- paddd xmm7, xmm5 ; xmm7 += accumulated square column differences
-
-%if ABI_IS_32BIT
- add esi, dword ptr arg(1) ;ref_pixels_per_line ; next source
- add edi, dword ptr arg(3) ;src_pixels_per_line ; next destination
-%else
- add rsi, r8
- add rdi, r9
-%endif
-
- sub rcx, 1 ;
- jnz .half_vert_variance8x_h_1 ;
-
- movdq2q mm6, xmm6 ;
- movdq2q mm7, xmm7 ;
-
- psrldq xmm6, 8
- psrldq xmm7, 8
-
- movdq2q mm2, xmm6
- movdq2q mm3, xmm7
-
- paddw mm6, mm2
- paddd mm7, mm3
-
- pxor mm3, mm3 ;
- pxor mm2, mm2 ;
-
- punpcklwd mm2, mm6 ;
- punpckhwd mm3, mm6 ;
-
- paddd mm2, mm3 ;
- movq mm6, mm2 ;
-
- psrlq mm6, 32 ;
- paddd mm2, mm6 ;
-
- psrad mm2, 16 ;
- movq mm4, mm7 ;
-
- psrlq mm4, 32 ;
- paddd mm4, mm7 ;
-
- mov rsi, arg(5) ; sum
- mov rdi, arg(6) ; sumsquared
-
- movd [rsi], mm2 ;
- movd [rdi], mm4 ;
-
-
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_GOT
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
-
-;void vp9_half_vert_variance16x_h_sse2
-;(
-; unsigned char *ref_ptr,
-; int ref_pixels_per_line,
-; unsigned char *src_ptr,
-; int src_pixels_per_line,
-; unsigned int Height,
-; int *sum,
-; unsigned int *sumsquared
-;)
-global sym(vp9_half_vert_variance16x_h_sse2)
-sym(vp9_half_vert_variance16x_h_sse2):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 7
- SAVE_XMM 7
- GET_GOT rbx
- push rsi
- push rdi
- ; end prolog
-
- pxor xmm6, xmm6 ; error accumulator
- pxor xmm7, xmm7 ; sse accumulator
- mov rsi, arg(0) ;ref_ptr
-
- mov rdi, arg(2) ;src_ptr
- movsxd rcx, dword ptr arg(4) ;Height
- movsxd rax, dword ptr arg(1) ;ref_pixels_per_line
- movsxd rdx, dword ptr arg(3) ;src_pixels_per_line
-
- movdqu xmm5, XMMWORD PTR [rsi]
- lea rsi, [rsi + rax ]
- pxor xmm0, xmm0
-
-.half_vert_variance16x_h_1:
- movdqu xmm3, XMMWORD PTR [rsi]
-
- pavgb xmm5, xmm3 ; xmm5 = avg(xmm5,xmm3)
- movdqa xmm4, xmm5
- punpcklbw xmm5, xmm0
- punpckhbw xmm4, xmm0
-
- movq xmm2, QWORD PTR [rdi]
- punpcklbw xmm2, xmm0
- psubw xmm5, xmm2
- movq xmm2, QWORD PTR [rdi+8]
- punpcklbw xmm2, xmm0
- psubw xmm4, xmm2
-
- paddw xmm6, xmm5 ; xmm6 += accumulated column differences
- paddw xmm6, xmm4
- pmaddwd xmm5, xmm5 ; xmm5 *= xmm5
- pmaddwd xmm4, xmm4
- paddd xmm7, xmm5 ; xmm7 += accumulated square column differences
- paddd xmm7, xmm4
-
- movdqa xmm5, xmm3
-
- lea rsi, [rsi + rax]
- lea rdi, [rdi + rdx]
-
- sub rcx, 1
- jnz .half_vert_variance16x_h_1
-
- pxor xmm1, xmm1
- pxor xmm5, xmm5
-
- punpcklwd xmm0, xmm6
- punpckhwd xmm1, xmm6
- psrad xmm0, 16
- psrad xmm1, 16
- paddd xmm0, xmm1
- movdqa xmm1, xmm0
-
- movdqa xmm6, xmm7
- punpckldq xmm6, xmm5
- punpckhdq xmm7, xmm5
- paddd xmm6, xmm7
-
- punpckldq xmm0, xmm5
- punpckhdq xmm1, xmm5
- paddd xmm0, xmm1
-
- movdqa xmm7, xmm6
- movdqa xmm1, xmm0
-
- psrldq xmm7, 8
- psrldq xmm1, 8
-
- paddd xmm6, xmm7
- paddd xmm0, xmm1
-
- mov rsi, arg(5) ;[Sum]
- mov rdi, arg(6) ;[SSE]
-
- movd [rsi], xmm0
- movd [rdi], xmm6
-
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_GOT
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
-
-
-;void vp9_half_horiz_variance8x_h_sse2
-;(
-; unsigned char *ref_ptr,
-; int ref_pixels_per_line,
-; unsigned char *src_ptr,
-; int src_pixels_per_line,
-; unsigned int Height,
-; int *sum,
-; unsigned int *sumsquared
-;)
-global sym(vp9_half_horiz_variance8x_h_sse2)
-sym(vp9_half_horiz_variance8x_h_sse2):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 7
- SAVE_XMM 7
- GET_GOT rbx
- push rsi
- push rdi
- ; end prolog
-
-%if ABI_IS_32BIT=0
- movsxd r8, dword ptr arg(1) ;ref_pixels_per_line
- movsxd r9, dword ptr arg(3) ;src_pixels_per_line
-%endif
-
- pxor xmm6, xmm6 ; error accumulator
- pxor xmm7, xmm7 ; sse accumulator
- mov rsi, arg(0) ;ref_ptr ;
-
- mov rdi, arg(2) ;src_ptr ;
- movsxd rcx, dword ptr arg(4) ;Height ;
-
- pxor xmm0, xmm0 ;
-.half_horiz_variance8x_h_1:
- movq xmm5, QWORD PTR [rsi] ; xmm5 = s0,s1,s2..s8
- movq xmm3, QWORD PTR [rsi+1] ; xmm3 = s1,s2,s3..s9
-
- pavgb xmm5, xmm3 ; xmm5 = avg(xmm5,xmm3)
- punpcklbw xmm5, xmm0 ; xmm5 = words of above
-
- movq xmm3, QWORD PTR [rdi] ; xmm3 = d0,d1,d2..d8
- punpcklbw xmm3, xmm0 ; xmm3 = words of above
-
- psubw xmm5, xmm3 ; xmm5 -= xmm3
- paddw xmm6, xmm5 ; xmm6 += accumulated column differences
- pmaddwd xmm5, xmm5 ; xmm5 *= xmm5
- paddd xmm7, xmm5 ; xmm7 += accumulated square column differences
-
-%if ABI_IS_32BIT
- add esi, dword ptr arg(1) ;ref_pixels_per_line ; next source
- add edi, dword ptr arg(3) ;src_pixels_per_line ; next destination
-%else
- add rsi, r8
- add rdi, r9
-%endif
- sub rcx, 1 ;
- jnz .half_horiz_variance8x_h_1 ;
-
- movdq2q mm6, xmm6 ;
- movdq2q mm7, xmm7 ;
-
- psrldq xmm6, 8
- psrldq xmm7, 8
-
- movdq2q mm2, xmm6
- movdq2q mm3, xmm7
-
- paddw mm6, mm2
- paddd mm7, mm3
-
- pxor mm3, mm3 ;
- pxor mm2, mm2 ;
-
- punpcklwd mm2, mm6 ;
- punpckhwd mm3, mm6 ;
-
- paddd mm2, mm3 ;
- movq mm6, mm2 ;
-
- psrlq mm6, 32 ;
- paddd mm2, mm6 ;
-
- psrad mm2, 16 ;
- movq mm4, mm7 ;
-
- psrlq mm4, 32 ;
- paddd mm4, mm7 ;
-
- mov rsi, arg(5) ; sum
- mov rdi, arg(6) ; sumsquared
-
- movd [rsi], mm2 ;
- movd [rdi], mm4 ;
-
-
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_GOT
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
-
-;void vp9_half_horiz_variance16x_h_sse2
-;(
-; unsigned char *ref_ptr,
-; int ref_pixels_per_line,
-; unsigned char *src_ptr,
-; int src_pixels_per_line,
-; unsigned int Height,
-; int *sum,
-; unsigned int *sumsquared
-;)
-global sym(vp9_half_horiz_variance16x_h_sse2)
-sym(vp9_half_horiz_variance16x_h_sse2):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 7
- SAVE_XMM 7
- GET_GOT rbx
- push rsi
- push rdi
- ; end prolog
-
- pxor xmm6, xmm6 ; error accumulator
- pxor xmm7, xmm7 ; sse accumulator
- mov rsi, arg(0) ;ref_ptr ;
-
- mov rdi, arg(2) ;src_ptr ;
- movsxd rcx, dword ptr arg(4) ;Height ;
- movsxd rax, dword ptr arg(1) ;ref_pixels_per_line
- movsxd rdx, dword ptr arg(3) ;src_pixels_per_line
-
- pxor xmm0, xmm0 ;
-
-.half_horiz_variance16x_h_1:
- movdqu xmm5, XMMWORD PTR [rsi] ; xmm5 = s0,s1,s2..s15
- movdqu xmm3, XMMWORD PTR [rsi+1] ; xmm3 = s1,s2,s3..s16
-
- pavgb xmm5, xmm3 ; xmm5 = avg(xmm5,xmm3)
- movdqa xmm1, xmm5
- punpcklbw xmm5, xmm0 ; xmm5 = words of above
- punpckhbw xmm1, xmm0
-
- movq xmm3, QWORD PTR [rdi] ; xmm3 = d0,d1,d2..d7
- punpcklbw xmm3, xmm0 ; xmm3 = words of above
- movq xmm2, QWORD PTR [rdi+8]
- punpcklbw xmm2, xmm0
-
- psubw xmm5, xmm3 ; xmm5 -= xmm3
- psubw xmm1, xmm2
- paddw xmm6, xmm5 ; xmm6 += accumulated column differences
- paddw xmm6, xmm1
- pmaddwd xmm5, xmm5 ; xmm5 *= xmm5
- pmaddwd xmm1, xmm1
- paddd xmm7, xmm5 ; xmm7 += accumulated square column differences
- paddd xmm7, xmm1
-
- lea rsi, [rsi + rax]
- lea rdi, [rdi + rdx]
-
- sub rcx, 1 ;
- jnz .half_horiz_variance16x_h_1 ;
-
- pxor xmm1, xmm1
- pxor xmm5, xmm5
-
- punpcklwd xmm0, xmm6
- punpckhwd xmm1, xmm6
- psrad xmm0, 16
- psrad xmm1, 16
- paddd xmm0, xmm1
- movdqa xmm1, xmm0
-
- movdqa xmm6, xmm7
- punpckldq xmm6, xmm5
- punpckhdq xmm7, xmm5
- paddd xmm6, xmm7
-
- punpckldq xmm0, xmm5
- punpckhdq xmm1, xmm5
- paddd xmm0, xmm1
-
- movdqa xmm7, xmm6
- movdqa xmm1, xmm0
-
- psrldq xmm7, 8
- psrldq xmm1, 8
-
- paddd xmm6, xmm7
- paddd xmm0, xmm1
-
- mov rsi, arg(5) ;[Sum]
- mov rdi, arg(6) ;[SSE]
-
- movd [rsi], xmm0
- movd [rdi], xmm6
-
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_GOT
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
-
-SECTION_RODATA
-; short xmm_bi_rd[8] = { 64, 64, 64, 64,64, 64, 64, 64};
-align 16
-xmm_bi_rd:
- times 8 dw 64
-align 16
-bilinear_filters_sse2:
- dw 128, 128, 128, 128, 128, 128, 128, 128, 0, 0, 0, 0, 0, 0, 0, 0
- dw 120, 120, 120, 120, 120, 120, 120, 120, 8, 8, 8, 8, 8, 8, 8, 8
- dw 112, 112, 112, 112, 112, 112, 112, 112, 16, 16, 16, 16, 16, 16, 16, 16
- dw 104, 104, 104, 104, 104, 104, 104, 104, 24, 24, 24, 24, 24, 24, 24, 24
- dw 96, 96, 96, 96, 96, 96, 96, 96, 32, 32, 32, 32, 32, 32, 32, 32
- dw 88, 88, 88, 88, 88, 88, 88, 88, 40, 40, 40, 40, 40, 40, 40, 40
- dw 80, 80, 80, 80, 80, 80, 80, 80, 48, 48, 48, 48, 48, 48, 48, 48
- dw 72, 72, 72, 72, 72, 72, 72, 72, 56, 56, 56, 56, 56, 56, 56, 56
- dw 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64
- dw 56, 56, 56, 56, 56, 56, 56, 56, 72, 72, 72, 72, 72, 72, 72, 72
- dw 48, 48, 48, 48, 48, 48, 48, 48, 80, 80, 80, 80, 80, 80, 80, 80
- dw 40, 40, 40, 40, 40, 40, 40, 40, 88, 88, 88, 88, 88, 88, 88, 88
- dw 32, 32, 32, 32, 32, 32, 32, 32, 96, 96, 96, 96, 96, 96, 96, 96
- dw 24, 24, 24, 24, 24, 24, 24, 24, 104, 104, 104, 104, 104, 104, 104, 104
- dw 16, 16, 16, 16, 16, 16, 16, 16, 112, 112, 112, 112, 112, 112, 112, 112
- dw 8, 8, 8, 8, 8, 8, 8, 8, 120, 120, 120, 120, 120, 120, 120, 120
--- a/vp8/encoder/x86/variance_impl_ssse3.asm
+++ /dev/null
@@ -1,372 +1,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
-%include "vpx_ports/x86_abi_support.asm"
-
-%define xmm_filter_shift 7
-
-
-;void vp9_filter_block2d_bil_var_ssse3
-;(
-; unsigned char *ref_ptr,
-; int ref_pixels_per_line,
-; unsigned char *src_ptr,
-; int src_pixels_per_line,
-; unsigned int Height,
-; int xoffset,
-; int yoffset,
-; int *sum,
-; unsigned int *sumsquared
-;
-;)
-;Note: The filter coefficient at offset=0 is 128. Since the second operand of
-;pmaddubsw holds signed bytes, the zero-offset case must be calculated separately.
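-;For example, the byte filter pair at offset 0 in bilinear_filters_ssse3 is
-;(128, 0); pmaddubsw reads its second source as signed bytes in [-128, 127],
-;so 128 would be treated as -128 and the products would be wrong. The
-;zero-offset cases are therefore handled by the sp_only / fp_only / full_pixel
-;paths below, which skip that filter pass entirely.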
-global sym(vp9_filter_block2d_bil_var_ssse3)
-sym(vp9_filter_block2d_bil_var_ssse3):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 9
- SAVE_XMM 7
- GET_GOT rbx
- push rsi
- push rdi
- ; end prolog
-
- pxor xmm6, xmm6
- pxor xmm7, xmm7
-
- lea rcx, [GLOBAL(bilinear_filters_ssse3)]
- movsxd rax, dword ptr arg(5) ; xoffset
-
- cmp rax, 0 ; skip first_pass filter if xoffset=0
- je .filter_block2d_bil_var_ssse3_sp_only
-
- shl rax, 4 ; point to filter coeff with xoffset
- lea rax, [rax + rcx] ; HFilter
-
- movsxd rdx, dword ptr arg(6) ; yoffset
-
- cmp rdx, 0 ; skip second_pass filter if yoffset=0
- je .filter_block2d_bil_var_ssse3_fp_only
-
- shl rdx, 4
- lea rdx, [rdx + rcx] ; VFilter
-
- mov rsi, arg(0) ;ref_ptr
- mov rdi, arg(2) ;src_ptr
- movsxd rcx, dword ptr arg(4) ;Height
-
- movdqu xmm0, XMMWORD PTR [rsi]
- movdqu xmm1, XMMWORD PTR [rsi+1]
- movdqa xmm2, xmm0
-
- punpcklbw xmm0, xmm1
- punpckhbw xmm2, xmm1
- pmaddubsw xmm0, [rax]
- pmaddubsw xmm2, [rax]
-
- paddw xmm0, [GLOBAL(xmm_bi_rd)]
- paddw xmm2, [GLOBAL(xmm_bi_rd)]
- psraw xmm0, xmm_filter_shift
- psraw xmm2, xmm_filter_shift
-
- packuswb xmm0, xmm2
-
-%if ABI_IS_32BIT
- add rsi, dword ptr arg(1) ;ref_pixels_per_line
-%else
- movsxd r8, dword ptr arg(1) ;ref_pixels_per_line
- movsxd r9, dword ptr arg(3) ;src_pixels_per_line
- lea rsi, [rsi + r8]
-%endif
-
-.filter_block2d_bil_var_ssse3_loop:
- movdqu xmm1, XMMWORD PTR [rsi]
- movdqu xmm2, XMMWORD PTR [rsi+1]
- movdqa xmm3, xmm1
-
- punpcklbw xmm1, xmm2
- punpckhbw xmm3, xmm2
- pmaddubsw xmm1, [rax]
- pmaddubsw xmm3, [rax]
-
- paddw xmm1, [GLOBAL(xmm_bi_rd)]
- paddw xmm3, [GLOBAL(xmm_bi_rd)]
- psraw xmm1, xmm_filter_shift
- psraw xmm3, xmm_filter_shift
- packuswb xmm1, xmm3
-
- movdqa xmm2, xmm0
- movdqa xmm0, xmm1
- movdqa xmm3, xmm2
-
- punpcklbw xmm2, xmm1
- punpckhbw xmm3, xmm1
- pmaddubsw xmm2, [rdx]
- pmaddubsw xmm3, [rdx]
-
- paddw xmm2, [GLOBAL(xmm_bi_rd)]
- paddw xmm3, [GLOBAL(xmm_bi_rd)]
- psraw xmm2, xmm_filter_shift
- psraw xmm3, xmm_filter_shift
-
- movq xmm1, QWORD PTR [rdi]
- pxor xmm4, xmm4
- punpcklbw xmm1, xmm4
- movq xmm5, QWORD PTR [rdi+8]
- punpcklbw xmm5, xmm4
-
- psubw xmm2, xmm1
- psubw xmm3, xmm5
- paddw xmm6, xmm2
- paddw xmm6, xmm3
- pmaddwd xmm2, xmm2
- pmaddwd xmm3, xmm3
- paddd xmm7, xmm2
- paddd xmm7, xmm3
-
-%if ABI_IS_32BIT
- add rsi, dword ptr arg(1) ;ref_pixels_per_line
- add rdi, dword ptr arg(3) ;src_pixels_per_line
-%else
- lea rsi, [rsi + r8]
- lea rdi, [rdi + r9]
-%endif
-
- sub rcx, 1
- jnz .filter_block2d_bil_var_ssse3_loop
-
- jmp .filter_block2d_bil_variance
-
-.filter_block2d_bil_var_ssse3_sp_only:
- movsxd rdx, dword ptr arg(6) ; yoffset
-
- cmp rdx, 0 ; skip all if both xoffset=0 and yoffset=0
- je .filter_block2d_bil_var_ssse3_full_pixel
-
- shl rdx, 4
- lea rdx, [rdx + rcx] ; VFilter
-
- mov rsi, arg(0) ;ref_ptr
- mov rdi, arg(2) ;src_ptr
- movsxd rcx, dword ptr arg(4) ;Height
- movsxd rax, dword ptr arg(1) ;ref_pixels_per_line
-
- movdqu xmm1, XMMWORD PTR [rsi]
- movdqa xmm0, xmm1
-
-%if ABI_IS_32BIT=0
- movsxd r9, dword ptr arg(3) ;src_pixels_per_line
-%endif
-
- lea rsi, [rsi + rax]
-
-.filter_block2d_bil_sp_only_loop:
- movdqu xmm3, XMMWORD PTR [rsi]
- movdqa xmm2, xmm1
- movdqa xmm0, xmm3
-
- punpcklbw xmm1, xmm3
- punpckhbw xmm2, xmm3
- pmaddubsw xmm1, [rdx]
- pmaddubsw xmm2, [rdx]
-
- paddw xmm1, [GLOBAL(xmm_bi_rd)]
- paddw xmm2, [GLOBAL(xmm_bi_rd)]
- psraw xmm1, xmm_filter_shift
- psraw xmm2, xmm_filter_shift
-
- movq xmm3, QWORD PTR [rdi]
- pxor xmm4, xmm4
- punpcklbw xmm3, xmm4
- movq xmm5, QWORD PTR [rdi+8]
- punpcklbw xmm5, xmm4
-
- psubw xmm1, xmm3
- psubw xmm2, xmm5
- paddw xmm6, xmm1
- paddw xmm6, xmm2
- pmaddwd xmm1, xmm1
- pmaddwd xmm2, xmm2
- paddd xmm7, xmm1
- paddd xmm7, xmm2
-
- movdqa xmm1, xmm0
- lea rsi, [rsi + rax] ;ref_pixels_per_line
-
-%if ABI_IS_32BIT
- add rdi, dword ptr arg(3) ;src_pixels_per_line
-%else
- lea rdi, [rdi + r9]
-%endif
-
- sub rcx, 1
- jnz .filter_block2d_bil_sp_only_loop
-
- jmp .filter_block2d_bil_variance
-
-.filter_block2d_bil_var_ssse3_full_pixel:
- mov rsi, arg(0) ;ref_ptr
- mov rdi, arg(2) ;src_ptr
- movsxd rcx, dword ptr arg(4) ;Height
- movsxd rax, dword ptr arg(1) ;ref_pixels_per_line
- movsxd rdx, dword ptr arg(3) ;src_pixels_per_line
- pxor xmm0, xmm0
-
-.filter_block2d_bil_full_pixel_loop:
- movq xmm1, QWORD PTR [rsi]
- punpcklbw xmm1, xmm0
- movq xmm2, QWORD PTR [rsi+8]
- punpcklbw xmm2, xmm0
-
- movq xmm3, QWORD PTR [rdi]
- punpcklbw xmm3, xmm0
- movq xmm4, QWORD PTR [rdi+8]
- punpcklbw xmm4, xmm0
-
- psubw xmm1, xmm3
- psubw xmm2, xmm4
- paddw xmm6, xmm1
- paddw xmm6, xmm2
- pmaddwd xmm1, xmm1
- pmaddwd xmm2, xmm2
- paddd xmm7, xmm1
- paddd xmm7, xmm2
-
- lea rsi, [rsi + rax] ;ref_pixels_per_line
- lea rdi, [rdi + rdx] ;src_pixels_per_line
- sub rcx, 1
- jnz .filter_block2d_bil_full_pixel_loop
-
- jmp .filter_block2d_bil_variance
-
-.filter_block2d_bil_var_ssse3_fp_only:
- mov rsi, arg(0) ;ref_ptr
- mov rdi, arg(2) ;src_ptr
- movsxd rcx, dword ptr arg(4) ;Height
- movsxd rdx, dword ptr arg(1) ;ref_pixels_per_line
-
- pxor xmm0, xmm0
-
-%if ABI_IS_32BIT=0
- movsxd r9, dword ptr arg(3) ;src_pixels_per_line
-%endif
-
-.filter_block2d_bil_fp_only_loop:
- movdqu xmm1, XMMWORD PTR [rsi]
- movdqu xmm2, XMMWORD PTR [rsi+1]
- movdqa xmm3, xmm1
-
- punpcklbw xmm1, xmm2
- punpckhbw xmm3, xmm2
- pmaddubsw xmm1, [rax]
- pmaddubsw xmm3, [rax]
-
- paddw xmm1, [GLOBAL(xmm_bi_rd)]
- paddw xmm3, [GLOBAL(xmm_bi_rd)]
- psraw xmm1, xmm_filter_shift
- psraw xmm3, xmm_filter_shift
-
- movq xmm2, XMMWORD PTR [rdi]
- pxor xmm4, xmm4
- punpcklbw xmm2, xmm4
- movq xmm5, QWORD PTR [rdi+8]
- punpcklbw xmm5, xmm4
-
- psubw xmm1, xmm2
- psubw xmm3, xmm5
- paddw xmm6, xmm1
- paddw xmm6, xmm3
- pmaddwd xmm1, xmm1
- pmaddwd xmm3, xmm3
- paddd xmm7, xmm1
- paddd xmm7, xmm3
-
- lea rsi, [rsi + rdx]
-%if ABI_IS_32BIT
- add rdi, dword ptr arg(3) ;src_pixels_per_line
-%else
- lea rdi, [rdi + r9]
-%endif
-
- sub rcx, 1
- jnz .filter_block2d_bil_fp_only_loop
-
- jmp .filter_block2d_bil_variance
-
-.filter_block2d_bil_variance:
- pxor xmm0, xmm0
- pxor xmm1, xmm1
- pxor xmm5, xmm5
-
- punpcklwd xmm0, xmm6
- punpckhwd xmm1, xmm6
- psrad xmm0, 16
- psrad xmm1, 16
- paddd xmm0, xmm1
- movdqa xmm1, xmm0
-
- movdqa xmm6, xmm7
- punpckldq xmm6, xmm5
- punpckhdq xmm7, xmm5
- paddd xmm6, xmm7
-
- punpckldq xmm0, xmm5
- punpckhdq xmm1, xmm5
- paddd xmm0, xmm1
-
- movdqa xmm7, xmm6
- movdqa xmm1, xmm0
-
- psrldq xmm7, 8
- psrldq xmm1, 8
-
- paddd xmm6, xmm7
- paddd xmm0, xmm1
-
- mov rsi, arg(7) ;[Sum]
- mov rdi, arg(8) ;[SSE]
-
- movd [rsi], xmm0
- movd [rdi], xmm6
-
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_GOT
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
-
-
-SECTION_RODATA
-align 16
-xmm_bi_rd:
- times 8 dw 64
-align 16
-bilinear_filters_ssse3:
- times 8 db 128, 0
- times 8 db 120, 8
- times 8 db 112, 16
- times 8 db 104, 24
- times 8 db 96, 32
- times 8 db 88, 40
- times 8 db 80, 48
- times 8 db 72, 56
- times 8 db 64, 64
- times 8 db 56, 72
- times 8 db 48, 80
- times 8 db 40, 88
- times 8 db 32, 96
- times 8 db 24, 104
- times 8 db 16, 112
- times 8 db 8, 120
--- a/vp8/encoder/x86/variance_mmx.c
+++ /dev/null
@@ -1,406 +1,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "vpx_config.h"
-#include "vp8/encoder/variance.h"
-#include "vp8/common/pragmas.h"
-#include "vpx_ports/mem.h"
-
-extern void filter_block1d_h6_mmx
-(
- const unsigned char *src_ptr,
- unsigned short *output_ptr,
- unsigned int src_pixels_per_line,
- unsigned int pixel_step,
- unsigned int output_height,
- unsigned int output_width,
- short *vp7_filter
-);
-extern void filter_block1d_v6_mmx
-(
- const short *src_ptr,
- unsigned char *output_ptr,
- unsigned int pixels_per_line,
- unsigned int pixel_step,
- unsigned int output_height,
- unsigned int output_width,
- short *vp7_filter
-);
-
-extern unsigned int vp9_get_mb_ss_mmx(const short *src_ptr);
-extern unsigned int vp9_get8x8var_mmx
-(
- const unsigned char *src_ptr,
- int source_stride,
- const unsigned char *ref_ptr,
- int recon_stride,
- unsigned int *SSE,
- int *Sum
-);
-extern unsigned int vp9_get4x4var_mmx
-(
- const unsigned char *src_ptr,
- int source_stride,
- const unsigned char *ref_ptr,
- int recon_stride,
- unsigned int *SSE,
- int *Sum
-);
-extern void vp9_filter_block2d_bil4x4_var_mmx
-(
- const unsigned char *ref_ptr,
- int ref_pixels_per_line,
- const unsigned char *src_ptr,
- int src_pixels_per_line,
- const short *HFilter,
- const short *VFilter,
- int *sum,
- unsigned int *sumsquared
-);
-extern void vp9_filter_block2d_bil_var_mmx
-(
- const unsigned char *ref_ptr,
- int ref_pixels_per_line,
- const unsigned char *src_ptr,
- int src_pixels_per_line,
- unsigned int Height,
- const short *HFilter,
- const short *VFilter,
- int *sum,
- unsigned int *sumsquared
-);
-
-
-unsigned int vp9_variance4x4_mmx(
- const unsigned char *src_ptr,
- int source_stride,
- const unsigned char *ref_ptr,
- int recon_stride,
- unsigned int *sse) {
- unsigned int var;
- int avg;
-
- vp9_get4x4var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &var, &avg);
- *sse = var;
- return (var - ((avg * avg) >> 4));
-
-}
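-
-/* The return value is the standard shortcut variance = SSE - sum^2 / N, with
-   the divide by the pixel count N folded into a shift: >> 4 here (N = 16 for
-   4x4), >> 6 for 8x8, >> 7 for 16x8 and 8x16, and >> 8 for 16x16 below. */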
-
-unsigned int vp9_variance8x8_mmx(
- const unsigned char *src_ptr,
- int source_stride,
- const unsigned char *ref_ptr,
- int recon_stride,
- unsigned int *sse) {
- unsigned int var;
- int avg;
-
- vp9_get8x8var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &var, &avg);
- *sse = var;
-
- return (var - ((avg * avg) >> 6));
-
-}
-
-unsigned int vp9_mse16x16_mmx(
- const unsigned char *src_ptr,
- int source_stride,
- const unsigned char *ref_ptr,
- int recon_stride,
- unsigned int *sse) {
- unsigned int sse0, sse1, sse2, sse3, var;
- int sum0, sum1, sum2, sum3;
-
-
- vp9_get8x8var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0);
- vp9_get8x8var_mmx(src_ptr + 8, source_stride, ref_ptr + 8, recon_stride, &sse1, &sum1);
- vp9_get8x8var_mmx(src_ptr + 8 * source_stride, source_stride, ref_ptr + 8 * recon_stride, recon_stride, &sse2, &sum2);
- vp9_get8x8var_mmx(src_ptr + 8 * source_stride + 8, source_stride, ref_ptr + 8 * recon_stride + 8, recon_stride, &sse3, &sum3);
-
- var = sse0 + sse1 + sse2 + sse3;
- *sse = var;
- return var;
-}
-
-
-unsigned int vp9_variance16x16_mmx(
- const unsigned char *src_ptr,
- int source_stride,
- const unsigned char *ref_ptr,
- int recon_stride,
- unsigned int *sse) {
- unsigned int sse0, sse1, sse2, sse3, var;
- int sum0, sum1, sum2, sum3, avg;
-
-
- vp9_get8x8var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0);
- vp9_get8x8var_mmx(src_ptr + 8, source_stride, ref_ptr + 8, recon_stride, &sse1, &sum1);
- vp9_get8x8var_mmx(src_ptr + 8 * source_stride, source_stride, ref_ptr + 8 * recon_stride, recon_stride, &sse2, &sum2);
- vp9_get8x8var_mmx(src_ptr + 8 * source_stride + 8, source_stride, ref_ptr + 8 * recon_stride + 8, recon_stride, &sse3, &sum3);
-
- var = sse0 + sse1 + sse2 + sse3;
- avg = sum0 + sum1 + sum2 + sum3;
- *sse = var;
- return (var - ((avg * avg) >> 8));
-}
-
-unsigned int vp9_variance16x8_mmx(
- const unsigned char *src_ptr,
- int source_stride,
- const unsigned char *ref_ptr,
- int recon_stride,
- unsigned int *sse) {
- unsigned int sse0, sse1, var;
- int sum0, sum1, avg;
-
- vp9_get8x8var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0);
- vp9_get8x8var_mmx(src_ptr + 8, source_stride, ref_ptr + 8, recon_stride, &sse1, &sum1);
-
- var = sse0 + sse1;
- avg = sum0 + sum1;
- *sse = var;
- return (var - ((avg * avg) >> 7));
-
-}
-
-
-unsigned int vp9_variance8x16_mmx(
- const unsigned char *src_ptr,
- int source_stride,
- const unsigned char *ref_ptr,
- int recon_stride,
- unsigned int *sse) {
- unsigned int sse0, sse1, var;
- int sum0, sum1, avg;
-
- vp9_get8x8var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0);
- vp9_get8x8var_mmx(src_ptr + 8 * source_stride, source_stride, ref_ptr + 8 * recon_stride, recon_stride, &sse1, &sum1);
-
- var = sse0 + sse1;
- avg = sum0 + sum1;
- *sse = var;
-
- return (var - ((avg * avg) >> 7));
-
-}
-
-
-
-
-///////////////////////////////////////////////////////////////////////////
-// the mmx function that does the bilinear filtering and var calculation //
-// in one pass //
-///////////////////////////////////////////////////////////////////////////
-DECLARE_ALIGNED(16, const short, vp9_bilinear_filters_mmx[16][8]) = {
- { 128, 128, 128, 128, 0, 0, 0, 0 },
- { 120, 120, 120, 120, 8, 8, 8, 8 },
- { 112, 112, 112, 112, 16, 16, 16, 16 },
- { 104, 104, 104, 104, 24, 24, 24, 24 },
- { 96, 96, 96, 96, 32, 32, 32, 32 },
- { 88, 88, 88, 88, 40, 40, 40, 40 },
- { 80, 80, 80, 80, 48, 48, 48, 48 },
- { 72, 72, 72, 72, 56, 56, 56, 56 },
- { 64, 64, 64, 64, 64, 64, 64, 64 },
- { 56, 56, 56, 56, 72, 72, 72, 72 },
- { 48, 48, 48, 48, 80, 80, 80, 80 },
- { 40, 40, 40, 40, 88, 88, 88, 88 },
- { 32, 32, 32, 32, 96, 96, 96, 96 },
- { 24, 24, 24, 24, 104, 104, 104, 104 },
- { 16, 16, 16, 16, 112, 112, 112, 112 },
- { 8, 8, 8, 8, 120, 120, 120, 120 }
-};
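-
-/* Each row i of the table holds the two bilinear taps (128 - 8 * i, 8 * i),
-   each value repeated four times so a single 8-byte load fills one MMX
-   register per tap; row 8, (64, 64), is the half-pixel filter used by the
-   vp9_variance_halfpixvar16x16_* wrappers below. */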
-
-unsigned int vp9_sub_pixel_variance4x4_mmx
-(
- const unsigned char *src_ptr,
- int src_pixels_per_line,
- int xoffset,
- int yoffset,
- const unsigned char *dst_ptr,
- int dst_pixels_per_line,
- unsigned int *sse)
-
-{
- int xsum;
- unsigned int xxsum;
- vp9_filter_block2d_bil4x4_var_mmx(
- src_ptr, src_pixels_per_line,
- dst_ptr, dst_pixels_per_line,
- vp9_bilinear_filters_mmx[xoffset], vp9_bilinear_filters_mmx[yoffset],
- &xsum, &xxsum
- );
- *sse = xxsum;
- return (xxsum - ((xsum * xsum) >> 4));
-}
-
-
-unsigned int vp9_sub_pixel_variance8x8_mmx
-(
- const unsigned char *src_ptr,
- int src_pixels_per_line,
- int xoffset,
- int yoffset,
- const unsigned char *dst_ptr,
- int dst_pixels_per_line,
- unsigned int *sse
-) {
-
- int xsum;
- unsigned int xxsum;
- vp9_filter_block2d_bil_var_mmx(
- src_ptr, src_pixels_per_line,
- dst_ptr, dst_pixels_per_line, 8,
- vp9_bilinear_filters_mmx[xoffset], vp9_bilinear_filters_mmx[yoffset],
- &xsum, &xxsum
- );
- *sse = xxsum;
- return (xxsum - ((xsum * xsum) >> 6));
-}
-
-unsigned int vp9_sub_pixel_variance16x16_mmx
-(
- const unsigned char *src_ptr,
- int src_pixels_per_line,
- int xoffset,
- int yoffset,
- const unsigned char *dst_ptr,
- int dst_pixels_per_line,
- unsigned int *sse
-) {
-
- int xsum0, xsum1;
- unsigned int xxsum0, xxsum1;
-
- vp9_filter_block2d_bil_var_mmx(
- src_ptr, src_pixels_per_line,
- dst_ptr, dst_pixels_per_line, 16,
- vp9_bilinear_filters_mmx[xoffset], vp9_bilinear_filters_mmx[yoffset],
- &xsum0, &xxsum0
- );
-
- vp9_filter_block2d_bil_var_mmx(
- src_ptr + 8, src_pixels_per_line,
- dst_ptr + 8, dst_pixels_per_line, 16,
- vp9_bilinear_filters_mmx[xoffset], vp9_bilinear_filters_mmx[yoffset],
- &xsum1, &xxsum1
- );
-
- xsum0 += xsum1;
- xxsum0 += xxsum1;
-
- *sse = xxsum0;
- return (xxsum0 - ((xsum0 * xsum0) >> 8));
-
-
-}
-
-unsigned int vp9_sub_pixel_mse16x16_mmx(
- const unsigned char *src_ptr,
- int src_pixels_per_line,
- int xoffset,
- int yoffset,
- const unsigned char *dst_ptr,
- int dst_pixels_per_line,
- unsigned int *sse
-) {
- vp9_sub_pixel_variance16x16_mmx(src_ptr, src_pixels_per_line, xoffset, yoffset, dst_ptr, dst_pixels_per_line, sse);
- return *sse;
-}
-
-unsigned int vp9_sub_pixel_variance16x8_mmx
-(
- const unsigned char *src_ptr,
- int src_pixels_per_line,
- int xoffset,
- int yoffset,
- const unsigned char *dst_ptr,
- int dst_pixels_per_line,
- unsigned int *sse
-) {
- int xsum0, xsum1;
- unsigned int xxsum0, xxsum1;
-
-
- vp9_filter_block2d_bil_var_mmx(
- src_ptr, src_pixels_per_line,
- dst_ptr, dst_pixels_per_line, 8,
- vp9_bilinear_filters_mmx[xoffset], vp9_bilinear_filters_mmx[yoffset],
- &xsum0, &xxsum0
- );
-
-
- vp9_filter_block2d_bil_var_mmx(
- src_ptr + 8, src_pixels_per_line,
- dst_ptr + 8, dst_pixels_per_line, 8,
- vp9_bilinear_filters_mmx[xoffset], vp9_bilinear_filters_mmx[yoffset],
- &xsum1, &xxsum1
- );
-
- xsum0 += xsum1;
- xxsum0 += xxsum1;
-
- *sse = xxsum0;
- return (xxsum0 - ((xsum0 * xsum0) >> 7));
-}
-
-unsigned int vp9_sub_pixel_variance8x16_mmx
-(
- const unsigned char *src_ptr,
- int src_pixels_per_line,
- int xoffset,
- int yoffset,
- const unsigned char *dst_ptr,
- int dst_pixels_per_line,
- unsigned int *sse
-) {
- int xsum;
- unsigned int xxsum;
- vp9_filter_block2d_bil_var_mmx(
- src_ptr, src_pixels_per_line,
- dst_ptr, dst_pixels_per_line, 16,
- vp9_bilinear_filters_mmx[xoffset], vp9_bilinear_filters_mmx[yoffset],
- &xsum, &xxsum
- );
- *sse = xxsum;
- return (xxsum - ((xsum * xsum) >> 7));
-}
-
-
-unsigned int vp9_variance_halfpixvar16x16_h_mmx(
- const unsigned char *src_ptr,
- int source_stride,
- const unsigned char *ref_ptr,
- int recon_stride,
- unsigned int *sse) {
- return vp9_sub_pixel_variance16x16_mmx(src_ptr, source_stride, 8, 0,
- ref_ptr, recon_stride, sse);
-}
-
-
-unsigned int vp9_variance_halfpixvar16x16_v_mmx(
- const unsigned char *src_ptr,
- int source_stride,
- const unsigned char *ref_ptr,
- int recon_stride,
- unsigned int *sse) {
- return vp9_sub_pixel_variance16x16_mmx(src_ptr, source_stride, 0, 8,
- ref_ptr, recon_stride, sse);
-}
-
-
-unsigned int vp9_variance_halfpixvar16x16_hv_mmx(
- const unsigned char *src_ptr,
- int source_stride,
- const unsigned char *ref_ptr,
- int recon_stride,
- unsigned int *sse) {
- return vp9_sub_pixel_variance16x16_mmx(src_ptr, source_stride, 8, 8,
- ref_ptr, recon_stride, sse);
-}
--- a/vp8/encoder/x86/variance_sse2.c
+++ /dev/null
@@ -1,517 +1,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "vpx_config.h"
-#include "vp8/encoder/variance.h"
-#include "vp8/common/pragmas.h"
-#include "vpx_ports/mem.h"
-
-#define HALFNDX 8
-
-extern void filter_block1d_h6_mmx(const unsigned char *src_ptr, unsigned short *output_ptr, unsigned int src_pixels_per_line, unsigned int pixel_step, unsigned int output_height, unsigned int output_width, short *vp7_filter);
-extern void filter_block1d_v6_mmx(const short *src_ptr, unsigned char *output_ptr, unsigned int pixels_per_line, unsigned int pixel_step, unsigned int output_height, unsigned int output_width, short *vp7_filter);
-extern void filter_block1d8_h6_sse2(const unsigned char *src_ptr, unsigned short *output_ptr, unsigned int src_pixels_per_line, unsigned int pixel_step, unsigned int output_height, unsigned int output_width, short *vp7_filter);
-extern void filter_block1d8_v6_sse2(const short *src_ptr, unsigned char *output_ptr, unsigned int pixels_per_line, unsigned int pixel_step, unsigned int output_height, unsigned int output_width, short *vp7_filter);
-
-extern void vp9_filter_block2d_bil4x4_var_mmx
-(
- const unsigned char *ref_ptr,
- int ref_pixels_per_line,
- const unsigned char *src_ptr,
- int src_pixels_per_line,
- const short *HFilter,
- const short *VFilter,
- int *sum,
- unsigned int *sumsquared
-);
-
-extern unsigned int vp9_get4x4var_mmx
-(
- const unsigned char *src_ptr,
- int source_stride,
- const unsigned char *ref_ptr,
- int recon_stride,
- unsigned int *SSE,
- int *Sum
-);
-
-unsigned int vp9_get_mb_ss_sse2
-(
- const short *src_ptr
-);
-unsigned int vp9_get16x16var_sse2
-(
- const unsigned char *src_ptr,
- int source_stride,
- const unsigned char *ref_ptr,
- int recon_stride,
- unsigned int *SSE,
- int *Sum
-);
-unsigned int vp9_get8x8var_sse2
-(
- const unsigned char *src_ptr,
- int source_stride,
- const unsigned char *ref_ptr,
- int recon_stride,
- unsigned int *SSE,
- int *Sum
-);
-void vp9_filter_block2d_bil_var_sse2
-(
- const unsigned char *ref_ptr,
- int ref_pixels_per_line,
- const unsigned char *src_ptr,
- int src_pixels_per_line,
- unsigned int Height,
- int xoffset,
- int yoffset,
- int *sum,
- unsigned int *sumsquared
-);
-void vp9_half_horiz_vert_variance8x_h_sse2
-(
- const unsigned char *ref_ptr,
- int ref_pixels_per_line,
- const unsigned char *src_ptr,
- int src_pixels_per_line,
- unsigned int Height,
- int *sum,
- unsigned int *sumsquared
-);
-void vp9_half_horiz_vert_variance16x_h_sse2
-(
- const unsigned char *ref_ptr,
- int ref_pixels_per_line,
- const unsigned char *src_ptr,
- int src_pixels_per_line,
- unsigned int Height,
- int *sum,
- unsigned int *sumsquared
-);
-void vp9_half_horiz_variance8x_h_sse2
-(
- const unsigned char *ref_ptr,
- int ref_pixels_per_line,
- const unsigned char *src_ptr,
- int src_pixels_per_line,
- unsigned int Height,
- int *sum,
- unsigned int *sumsquared
-);
-void vp9_half_horiz_variance16x_h_sse2
-(
- const unsigned char *ref_ptr,
- int ref_pixels_per_line,
- const unsigned char *src_ptr,
- int src_pixels_per_line,
- unsigned int Height,
- int *sum,
- unsigned int *sumsquared
-);
-void vp9_half_vert_variance8x_h_sse2
-(
- const unsigned char *ref_ptr,
- int ref_pixels_per_line,
- const unsigned char *src_ptr,
- int src_pixels_per_line,
- unsigned int Height,
- int *sum,
- unsigned int *sumsquared
-);
-void vp9_half_vert_variance16x_h_sse2
-(
- const unsigned char *ref_ptr,
- int ref_pixels_per_line,
- const unsigned char *src_ptr,
- int src_pixels_per_line,
- unsigned int Height,
- int *sum,
- unsigned int *sumsquared
-);
-
-DECLARE_ALIGNED(16, extern short, vp9_bilinear_filters_mmx[16][8]);
-
-unsigned int vp9_variance4x4_wmt(
- const unsigned char *src_ptr,
- int source_stride,
- const unsigned char *ref_ptr,
- int recon_stride,
- unsigned int *sse) {
- unsigned int var;
- int avg;
-
- vp9_get4x4var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &var, &avg);
- *sse = var;
- return (var - ((avg * avg) >> 4));
-
-}
-
-unsigned int vp9_variance8x8_wmt
-(
- const unsigned char *src_ptr,
- int source_stride,
- const unsigned char *ref_ptr,
- int recon_stride,
- unsigned int *sse) {
- unsigned int var;
- int avg;
-
- vp9_get8x8var_sse2(src_ptr, source_stride, ref_ptr, recon_stride, &var, &avg);
- *sse = var;
- return (var - ((avg * avg) >> 6));
-
-}
-
-
-unsigned int vp9_variance16x16_wmt
-(
- const unsigned char *src_ptr,
- int source_stride,
- const unsigned char *ref_ptr,
- int recon_stride,
- unsigned int *sse) {
- unsigned int sse0;
- int sum0;
-
-
- vp9_get16x16var_sse2(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0);
- *sse = sse0;
- return (sse0 - ((sum0 * sum0) >> 8));
-}
-unsigned int vp9_mse16x16_wmt(
- const unsigned char *src_ptr,
- int source_stride,
- const unsigned char *ref_ptr,
- int recon_stride,
- unsigned int *sse) {
-
- unsigned int sse0;
- int sum0;
- vp9_get16x16var_sse2(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0);
- *sse = sse0;
- return sse0;
-
-}
-
-
-unsigned int vp9_variance16x8_wmt
-(
- const unsigned char *src_ptr,
- int source_stride,
- const unsigned char *ref_ptr,
- int recon_stride,
- unsigned int *sse) {
- unsigned int sse0, sse1, var;
- int sum0, sum1, avg;
-
- vp9_get8x8var_sse2(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0);
- vp9_get8x8var_sse2(src_ptr + 8, source_stride, ref_ptr + 8, recon_stride, &sse1, &sum1);
-
- var = sse0 + sse1;
- avg = sum0 + sum1;
- *sse = var;
- return (var - ((avg * avg) >> 7));
-
-}
-
-unsigned int vp9_variance8x16_wmt
-(
- const unsigned char *src_ptr,
- int source_stride,
- const unsigned char *ref_ptr,
- int recon_stride,
- unsigned int *sse) {
- unsigned int sse0, sse1, var;
- int sum0, sum1, avg;
-
- vp9_get8x8var_sse2(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0);
- vp9_get8x8var_sse2(src_ptr + 8 * source_stride, source_stride, ref_ptr + 8 * recon_stride, recon_stride, &sse1, &sum1);
-
- var = sse0 + sse1;
- avg = sum0 + sum1;
- *sse = var;
- return (var - ((avg * avg) >> 7));
-
-}
-
-unsigned int vp9_sub_pixel_variance4x4_wmt
-(
- const unsigned char *src_ptr,
- int src_pixels_per_line,
- int xoffset,
- int yoffset,
- const unsigned char *dst_ptr,
- int dst_pixels_per_line,
- unsigned int *sse
-) {
- int xsum;
- unsigned int xxsum;
- vp9_filter_block2d_bil4x4_var_mmx(
- src_ptr, src_pixels_per_line,
- dst_ptr, dst_pixels_per_line,
- vp9_bilinear_filters_mmx[xoffset], vp9_bilinear_filters_mmx[yoffset],
- &xsum, &xxsum
- );
- *sse = xxsum;
- return (xxsum - ((xsum * xsum) >> 4));
-}
-
-
-unsigned int vp9_sub_pixel_variance8x8_wmt
-(
- const unsigned char *src_ptr,
- int src_pixels_per_line,
- int xoffset,
- int yoffset,
- const unsigned char *dst_ptr,
- int dst_pixels_per_line,
- unsigned int *sse
-) {
- int xsum;
- unsigned int xxsum;
-
- if (xoffset == HALFNDX && yoffset == 0) {
- vp9_half_horiz_variance8x_h_sse2(
- src_ptr, src_pixels_per_line,
- dst_ptr, dst_pixels_per_line, 8,
- &xsum, &xxsum);
- } else if (xoffset == 0 && yoffset == HALFNDX) {
- vp9_half_vert_variance8x_h_sse2(
- src_ptr, src_pixels_per_line,
- dst_ptr, dst_pixels_per_line, 8,
- &xsum, &xxsum);
- } else if (xoffset == HALFNDX && yoffset == HALFNDX) {
- vp9_half_horiz_vert_variance8x_h_sse2(
- src_ptr, src_pixels_per_line,
- dst_ptr, dst_pixels_per_line, 8,
- &xsum, &xxsum);
- } else {
- vp9_filter_block2d_bil_var_sse2(
- src_ptr, src_pixels_per_line,
- dst_ptr, dst_pixels_per_line, 8,
- xoffset, yoffset,
- &xsum, &xxsum);
- }
-
- *sse = xxsum;
- return (xxsum - ((xsum * xsum) >> 6));
-}
-
-unsigned int vp9_sub_pixel_variance16x16_wmt
-(
- const unsigned char *src_ptr,
- int src_pixels_per_line,
- int xoffset,
- int yoffset,
- const unsigned char *dst_ptr,
- int dst_pixels_per_line,
- unsigned int *sse
-) {
- int xsum0, xsum1;
- unsigned int xxsum0, xxsum1;
-
-
- // Note: these if statements could be avoided if the calling function
- // called the appropriate specialized function directly.
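- // HALFNDX (8) is the half-pixel position in the 16-entry offset table, so
- // the three half-pel cases can take the cheaper pavgb-based kernels; any
- // other offset falls through to the general bilinear filter below.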
- if (xoffset == HALFNDX && yoffset == 0) {
- vp9_half_horiz_variance16x_h_sse2(
- src_ptr, src_pixels_per_line,
- dst_ptr, dst_pixels_per_line, 16,
- &xsum0, &xxsum0);
- } else if (xoffset == 0 && yoffset == HALFNDX) {
- vp9_half_vert_variance16x_h_sse2(
- src_ptr, src_pixels_per_line,
- dst_ptr, dst_pixels_per_line, 16,
- &xsum0, &xxsum0);
- } else if (xoffset == HALFNDX && yoffset == HALFNDX) {
- vp9_half_horiz_vert_variance16x_h_sse2(
- src_ptr, src_pixels_per_line,
- dst_ptr, dst_pixels_per_line, 16,
- &xsum0, &xxsum0);
- } else {
- vp9_filter_block2d_bil_var_sse2(
- src_ptr, src_pixels_per_line,
- dst_ptr, dst_pixels_per_line, 16,
- xoffset, yoffset,
- &xsum0, &xxsum0
- );
-
- vp9_filter_block2d_bil_var_sse2(
- src_ptr + 8, src_pixels_per_line,
- dst_ptr + 8, dst_pixels_per_line, 16,
- xoffset, yoffset,
- &xsum1, &xxsum1
- );
- xsum0 += xsum1;
- xxsum0 += xxsum1;
- }
-
- *sse = xxsum0;
- return (xxsum0 - ((xsum0 * xsum0) >> 8));
-}
-
-unsigned int vp9_sub_pixel_mse16x16_wmt(
- const unsigned char *src_ptr,
- int src_pixels_per_line,
- int xoffset,
- int yoffset,
- const unsigned char *dst_ptr,
- int dst_pixels_per_line,
- unsigned int *sse
-) {
- vp9_sub_pixel_variance16x16_wmt(src_ptr, src_pixels_per_line, xoffset, yoffset, dst_ptr, dst_pixels_per_line, sse);
- return *sse;
-}
-
-unsigned int vp9_sub_pixel_variance16x8_wmt
-(
- const unsigned char *src_ptr,
- int src_pixels_per_line,
- int xoffset,
- int yoffset,
- const unsigned char *dst_ptr,
- int dst_pixels_per_line,
- unsigned int *sse
-
-) {
- int xsum0, xsum1;
- unsigned int xxsum0, xxsum1;
-
- if (xoffset == HALFNDX && yoffset == 0) {
- vp9_half_horiz_variance16x_h_sse2(
- src_ptr, src_pixels_per_line,
- dst_ptr, dst_pixels_per_line, 8,
- &xsum0, &xxsum0);
- } else if (xoffset == 0 && yoffset == HALFNDX) {
- vp9_half_vert_variance16x_h_sse2(
- src_ptr, src_pixels_per_line,
- dst_ptr, dst_pixels_per_line, 8,
- &xsum0, &xxsum0);
- } else if (xoffset == HALFNDX && yoffset == HALFNDX) {
- vp9_half_horiz_vert_variance16x_h_sse2(
- src_ptr, src_pixels_per_line,
- dst_ptr, dst_pixels_per_line, 8,
- &xsum0, &xxsum0);
- } else {
- vp9_filter_block2d_bil_var_sse2(
- src_ptr, src_pixels_per_line,
- dst_ptr, dst_pixels_per_line, 8,
- xoffset, yoffset,
- &xsum0, &xxsum0);
-
- vp9_filter_block2d_bil_var_sse2(
- src_ptr + 8, src_pixels_per_line,
- dst_ptr + 8, dst_pixels_per_line, 8,
- xoffset, yoffset,
- &xsum1, &xxsum1);
- xsum0 += xsum1;
- xxsum0 += xxsum1;
- }
-
- *sse = xxsum0;
- return (xxsum0 - ((xsum0 * xsum0) >> 7));
-}
-
-unsigned int vp9_sub_pixel_variance8x16_wmt
-(
- const unsigned char *src_ptr,
- int src_pixels_per_line,
- int xoffset,
- int yoffset,
- const unsigned char *dst_ptr,
- int dst_pixels_per_line,
- unsigned int *sse
-) {
- int xsum;
- unsigned int xxsum;
-
- if (xoffset == HALFNDX && yoffset == 0) {
- vp9_half_horiz_variance8x_h_sse2(
- src_ptr, src_pixels_per_line,
- dst_ptr, dst_pixels_per_line, 16,
- &xsum, &xxsum);
- } else if (xoffset == 0 && yoffset == HALFNDX) {
- vp9_half_vert_variance8x_h_sse2(
- src_ptr, src_pixels_per_line,
- dst_ptr, dst_pixels_per_line, 16,
- &xsum, &xxsum);
- } else if (xoffset == HALFNDX && yoffset == HALFNDX) {
- vp9_half_horiz_vert_variance8x_h_sse2(
- src_ptr, src_pixels_per_line,
- dst_ptr, dst_pixels_per_line, 16,
- &xsum, &xxsum);
- } else {
- vp9_filter_block2d_bil_var_sse2(
- src_ptr, src_pixels_per_line,
- dst_ptr, dst_pixels_per_line, 16,
- xoffset, yoffset,
- &xsum, &xxsum);
- }
-
- *sse = xxsum;
- return (xxsum - ((xsum * xsum) >> 7));
-}
-
-
-unsigned int vp9_variance_halfpixvar16x16_h_wmt(
- const unsigned char *src_ptr,
- int src_pixels_per_line,
- const unsigned char *dst_ptr,
- int dst_pixels_per_line,
- unsigned int *sse) {
- int xsum0;
- unsigned int xxsum0;
-
- vp9_half_horiz_variance16x_h_sse2(
- src_ptr, src_pixels_per_line,
- dst_ptr, dst_pixels_per_line, 16,
- &xsum0, &xxsum0);
-
- *sse = xxsum0;
- return (xxsum0 - ((xsum0 * xsum0) >> 8));
-}
-
-
-unsigned int vp9_variance_halfpixvar16x16_v_wmt(
- const unsigned char *src_ptr,
- int src_pixels_per_line,
- const unsigned char *dst_ptr,
- int dst_pixels_per_line,
- unsigned int *sse) {
- int xsum0;
- unsigned int xxsum0;
- vp9_half_vert_variance16x_h_sse2(
- src_ptr, src_pixels_per_line,
- dst_ptr, dst_pixels_per_line, 16,
- &xsum0, &xxsum0);
-
- *sse = xxsum0;
- return (xxsum0 - ((xsum0 * xsum0) >> 8));
-}
-
-
-unsigned int vp9_variance_halfpixvar16x16_hv_wmt(
- const unsigned char *src_ptr,
- int src_pixels_per_line,
- const unsigned char *dst_ptr,
- int dst_pixels_per_line,
- unsigned int *sse) {
- int xsum0;
- unsigned int xxsum0;
-
- vp9_half_horiz_vert_variance16x_h_sse2(
- src_ptr, src_pixels_per_line,
- dst_ptr, dst_pixels_per_line, 16,
- &xsum0, &xxsum0);
-
- *sse = xxsum0;
- return (xxsum0 - ((xsum0 * xsum0) >> 8));
-}
--- a/vp8/encoder/x86/variance_ssse3.c
+++ /dev/null
@@ -1,151 +1,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "vpx_config.h"
-#include "vp8/encoder/variance.h"
-#include "vp8/common/pragmas.h"
-#include "vpx_ports/mem.h"
-
-#define HALFNDX 8
-
-extern unsigned int vp9_get16x16var_sse2
-(
- const unsigned char *src_ptr,
- int source_stride,
- const unsigned char *ref_ptr,
- int recon_stride,
- unsigned int *SSE,
- int *Sum
-);
-extern void vp9_half_horiz_vert_variance16x_h_sse2
-(
- const unsigned char *ref_ptr,
- int ref_pixels_per_line,
- const unsigned char *src_ptr,
- int src_pixels_per_line,
- unsigned int Height,
- int *sum,
- unsigned int *sumsquared
-);
-extern void vp9_half_horiz_variance16x_h_sse2
-(
- const unsigned char *ref_ptr,
- int ref_pixels_per_line,
- const unsigned char *src_ptr,
- int src_pixels_per_line,
- unsigned int Height,
- int *sum,
- unsigned int *sumsquared
-);
-extern void vp9_half_vert_variance16x_h_sse2
-(
- const unsigned char *ref_ptr,
- int ref_pixels_per_line,
- const unsigned char *src_ptr,
- int src_pixels_per_line,
- unsigned int Height,
- int *sum,
- unsigned int *sumsquared
-);
-extern void vp9_filter_block2d_bil_var_ssse3
-(
- const unsigned char *ref_ptr,
- int ref_pixels_per_line,
- const unsigned char *src_ptr,
- int src_pixels_per_line,
- unsigned int Height,
- int xoffset,
- int yoffset,
- int *sum,
- unsigned int *sumsquared
-);
-
-unsigned int vp9_sub_pixel_variance16x16_ssse3
-(
- const unsigned char *src_ptr,
- int src_pixels_per_line,
- int xoffset,
- int yoffset,
- const unsigned char *dst_ptr,
- int dst_pixels_per_line,
- unsigned int *sse
-) {
- int xsum0;
- unsigned int xxsum0;
-
-  // Note: these if statements could be avoided if the calling function
-  // invoked the appropriate specialized variant directly.
- if (xoffset == HALFNDX && yoffset == 0) {
- vp9_half_horiz_variance16x_h_sse2(
- src_ptr, src_pixels_per_line,
- dst_ptr, dst_pixels_per_line, 16,
- &xsum0, &xxsum0);
- } else if (xoffset == 0 && yoffset == HALFNDX) {
- vp9_half_vert_variance16x_h_sse2(
- src_ptr, src_pixels_per_line,
- dst_ptr, dst_pixels_per_line, 16,
- &xsum0, &xxsum0);
- } else if (xoffset == HALFNDX && yoffset == HALFNDX) {
- vp9_half_horiz_vert_variance16x_h_sse2(
- src_ptr, src_pixels_per_line,
- dst_ptr, dst_pixels_per_line, 16,
- &xsum0, &xxsum0);
- } else {
- vp9_filter_block2d_bil_var_ssse3(
- src_ptr, src_pixels_per_line,
- dst_ptr, dst_pixels_per_line, 16,
- xoffset, yoffset,
- &xsum0, &xxsum0);
- }
-
- *sse = xxsum0;
- return (xxsum0 - ((xsum0 * xsum0) >> 8));
-}
-
-unsigned int vp9_sub_pixel_variance16x8_ssse3
-(
- const unsigned char *src_ptr,
- int src_pixels_per_line,
- int xoffset,
- int yoffset,
- const unsigned char *dst_ptr,
- int dst_pixels_per_line,
- unsigned int *sse
-
-) {
- int xsum0;
- unsigned int xxsum0;
-
- if (xoffset == HALFNDX && yoffset == 0) {
- vp9_half_horiz_variance16x_h_sse2(
- src_ptr, src_pixels_per_line,
- dst_ptr, dst_pixels_per_line, 8,
- &xsum0, &xxsum0);
- } else if (xoffset == 0 && yoffset == HALFNDX) {
- vp9_half_vert_variance16x_h_sse2(
- src_ptr, src_pixels_per_line,
- dst_ptr, dst_pixels_per_line, 8,
- &xsum0, &xxsum0);
- } else if (xoffset == HALFNDX && yoffset == HALFNDX) {
- vp9_half_horiz_vert_variance16x_h_sse2(
- src_ptr, src_pixels_per_line,
- dst_ptr, dst_pixels_per_line, 8,
- &xsum0, &xxsum0);
- } else {
- vp9_filter_block2d_bil_var_ssse3(
- src_ptr, src_pixels_per_line,
- dst_ptr, dst_pixels_per_line, 8,
- xoffset, yoffset,
- &xsum0, &xxsum0);
- }
-
- *sse = xxsum0;
- return (xxsum0 - ((xsum0 * xsum0) >> 7));
-}
--- a/vp8/encoder/x86/x86_csystemdependent.c
+++ /dev/null
@@ -1,114 +1,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include "vpx_ports/config.h"
-#include "vpx_ports/x86.h"
-#include "vp8/encoder/variance.h"
-#include "vp8/encoder/onyx_int.h"
-
-
-#if HAVE_MMX
-void vp9_short_fdct8x4_mmx(short *input, short *output, int pitch) {
- vp9_short_fdct4x4_mmx(input, output, pitch);
- vp9_short_fdct4x4_mmx(input + 4, output + 16, pitch);
-}
-
-int vp9_mbblock_error_mmx_impl(short *coeff_ptr, short *dcoef_ptr, int dc);
-int vp9_mbblock_error_mmx(MACROBLOCK *mb, int dc) {
- short *coeff_ptr = mb->block[0].coeff;
- short *dcoef_ptr = mb->e_mbd.block[0].dqcoeff;
- return vp9_mbblock_error_mmx_impl(coeff_ptr, dcoef_ptr, dc);
-}
-
-int vp9_mbuverror_mmx_impl(short *s_ptr, short *d_ptr);
-int vp9_mbuverror_mmx(MACROBLOCK *mb) {
- short *s_ptr = &mb->coeff[256];
- short *d_ptr = &mb->e_mbd.dqcoeff[256];
- return vp9_mbuverror_mmx_impl(s_ptr, d_ptr);
-}
-
-void vp9_subtract_b_mmx_impl(unsigned char *z, int src_stride,
- short *diff, unsigned char *predictor,
- int pitch);
-void vp9_subtract_b_mmx(BLOCK *be, BLOCKD *bd, int pitch) {
- unsigned char *z = *(be->base_src) + be->src;
- unsigned int src_stride = be->src_stride;
- short *diff = &be->src_diff[0];
- unsigned char *predictor = &bd->predictor[0];
- vp9_subtract_b_mmx_impl(z, src_stride, diff, predictor, pitch);
-}
-
-#endif
-
-#if HAVE_SSE2
-int vp9_mbblock_error_xmm_impl(short *coeff_ptr, short *dcoef_ptr, int dc);
-int vp9_mbblock_error_xmm(MACROBLOCK *mb, int dc) {
- short *coeff_ptr = mb->block[0].coeff;
- short *dcoef_ptr = mb->e_mbd.block[0].dqcoeff;
- return vp9_mbblock_error_xmm_impl(coeff_ptr, dcoef_ptr, dc);
-}
-
-int vp9_mbuverror_xmm_impl(short *s_ptr, short *d_ptr);
-int vp9_mbuverror_xmm(MACROBLOCK *mb) {
- short *s_ptr = &mb->coeff[256];
- short *d_ptr = &mb->e_mbd.dqcoeff[256];
- return vp9_mbuverror_xmm_impl(s_ptr, d_ptr);
-}
-
-void vp9_subtract_b_sse2_impl(unsigned char *z, int src_stride,
- short *diff, unsigned char *predictor,
- int pitch);
-void vp9_subtract_b_sse2(BLOCK *be, BLOCKD *bd, int pitch) {
- unsigned char *z = *(be->base_src) + be->src;
- unsigned int src_stride = be->src_stride;
- short *diff = &be->src_diff[0];
- unsigned char *predictor = &bd->predictor[0];
- vp9_subtract_b_sse2_impl(z, src_stride, diff, predictor, pitch);
-}
-
-#endif
-
-void vp9_arch_x86_encoder_init(VP9_COMP *cpi) {
-#if CONFIG_RUNTIME_CPU_DETECT
- int flags = x86_simd_caps();
-
- /* Note:
- *
- * This platform can be built without runtime CPU detection as well. If
- * you modify any of the function mappings present in this file, be sure
-   * to also update them in the static mappings (<arch>/filename_<arch>.h)
- */
-
- /* Override default functions with fastest ones for this CPU. */
-#if HAVE_SSE2
- if (flags & HAS_SSE2) {
- cpi->rtcd.temporal.apply = vp9_temporal_filter_apply_sse2;
-
- }
-#endif
-
-#if HAVE_SSE3
- if (flags & HAS_SSE3) {
- cpi->rtcd.search.full_search = vp9_full_search_sadx3;
- cpi->rtcd.search.diamond_search = vp9_diamond_search_sadx4;
- cpi->rtcd.search.refining_search = vp9_refining_search_sadx4;
- }
-#endif
-
-
-#if HAVE_SSE4_1
- if (flags & HAS_SSE4_1) {
- cpi->rtcd.search.full_search = vp9_full_search_sadx8;
- }
-#endif
-
-#endif
-}
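The encoder init function removed above shows the runtime CPU detection pattern: x86_simd_caps() is queried once and the default (C) entries in the RTCD tables are overwritten with the fastest variant the host supports. A small self-contained sketch of the same pattern, using hypothetical names (none of these exist in the tree):

typedef unsigned int (*sad_fn_t)(const unsigned char *a,
                                 const unsigned char *b, int n);

static unsigned int sad_c(const unsigned char *a, const unsigned char *b,
                          int n) {
  unsigned int sad = 0;
  int i;
  for (i = 0; i < n; ++i)
    sad += (unsigned int)(a[i] > b[i] ? a[i] - b[i] : b[i] - a[i]);
  return sad;
}

/* Stand-in for a hand-written SSE2 kernel; a real build would supply
 * intrinsics or assembly here. */
static unsigned int sad_sse2(const unsigned char *a, const unsigned char *b,
                             int n) {
  return sad_c(a, b, n);
}

#define MY_HAS_SSE2 0x01 /* hypothetical capability bit */

struct rtcd_table {
  sad_fn_t sad;
};

static void my_arch_init(struct rtcd_table *rtcd, int simd_caps) {
  rtcd->sad = sad_c;       /* portable default */
  if (simd_caps & MY_HAS_SSE2)
    rtcd->sad = sad_sse2;  /* override with the fastest supported variant */
}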
--- a/vp8/exports_dec
+++ /dev/null
@@ -1,2 +1,0 @@
-data vpx_codec_vp8_dx_algo
-text vpx_codec_vp8_dx
--- a/vp8/exports_enc
+++ /dev/null
@@ -1,4 +1,0 @@
-data vpx_codec_vp8_cx_algo
-text vpx_codec_vp8_cx
-data vpx_codec_vp8x_cx_algo
-text vpx_codec_vp8x_cx
--- a/vp8/vp8_common.mk
+++ /dev/null
@@ -1,179 +1,0 @@
-##
-## Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-##
-## Use of this source code is governed by a BSD-style license
-## that can be found in the LICENSE file in the root of the source
-## tree. An additional intellectual property rights grant can be found
-## in the file PATENTS. All contributing project authors may
-## be found in the AUTHORS file in the root of the source tree.
-##
-
-VP8_COMMON_SRCS-yes += vp8_common.mk
-VP8_COMMON_SRCS-yes += common/type_aliases.h
-VP8_COMMON_SRCS-yes += common/pragmas.h
-VP8_COMMON_SRCS-yes += common/ppflags.h
-VP8_COMMON_SRCS-yes += common/onyx.h
-VP8_COMMON_SRCS-yes += common/onyxd.h
-VP8_COMMON_SRCS-yes += common/alloccommon.c
-VP8_COMMON_SRCS-yes += common/asm_com_offsets.c
-VP8_COMMON_SRCS-yes += common/blockd.c
-VP8_COMMON_SRCS-yes += common/coefupdateprobs.h
-VP8_COMMON_SRCS-yes += common/debugmodes.c
-VP8_COMMON_SRCS-yes += common/entropy.c
-VP8_COMMON_SRCS-yes += common/entropymode.c
-VP8_COMMON_SRCS-yes += common/entropymv.c
-VP8_COMMON_SRCS-yes += common/extend.c
-VP8_COMMON_SRCS-yes += common/filter.c
-VP8_COMMON_SRCS-yes += common/filter.h
-VP8_COMMON_SRCS-yes += common/findnearmv.c
-VP8_COMMON_SRCS-yes += common/generic/systemdependent.c
-VP8_COMMON_SRCS-yes += common/idctllm.c
-VP8_COMMON_SRCS-yes += common/alloccommon.h
-VP8_COMMON_SRCS-yes += common/blockd.h
-VP8_COMMON_SRCS-yes += common/common.h
-VP8_COMMON_SRCS-yes += common/common_types.h
-VP8_COMMON_SRCS-yes += common/entropy.h
-VP8_COMMON_SRCS-yes += common/entropymode.h
-VP8_COMMON_SRCS-yes += common/entropymv.h
-VP8_COMMON_SRCS-yes += common/extend.h
-VP8_COMMON_SRCS-yes += common/findnearmv.h
-VP8_COMMON_SRCS-yes += common/header.h
-VP8_COMMON_SRCS-yes += common/idct.h
-VP8_COMMON_SRCS-yes += common/invtrans.h
-VP8_COMMON_SRCS-yes += common/loopfilter.h
-VP8_COMMON_SRCS-yes += common/modecont.h
-VP8_COMMON_SRCS-yes += common/mv.h
-VP8_COMMON_SRCS-yes += common/onyxc_int.h
-VP8_COMMON_SRCS-yes += common/pred_common.h
-VP8_COMMON_SRCS-yes += common/pred_common.c
-VP8_COMMON_SRCS-yes += common/quant_common.h
-VP8_COMMON_SRCS-yes += common/reconinter.h
-VP8_COMMON_SRCS-yes += common/reconintra.h
-VP8_COMMON_SRCS-yes += common/reconintra4x4.h
-VP8_COMMON_SRCS-yes += common/rtcd.c
-VP8_COMMON_SRCS-yes += common/rtcd_defs.sh
-VP8_COMMON_SRCS-yes += common/sadmxn.h
-VP8_COMMON_SRCS-yes += common/seg_common.h
-VP8_COMMON_SRCS-yes += common/seg_common.c
-VP8_COMMON_SRCS-yes += common/setupintrarecon.h
-VP8_COMMON_SRCS-yes += common/subpixel.h
-VP8_COMMON_SRCS-yes += common/swapyv12buffer.h
-VP8_COMMON_SRCS-yes += common/systemdependent.h
-VP8_COMMON_SRCS-yes += common/treecoder.h
-VP8_COMMON_SRCS-yes += common/invtrans.c
-VP8_COMMON_SRCS-yes += common/loopfilter.c
-VP8_COMMON_SRCS-yes += common/loopfilter_filters.c
-VP8_COMMON_SRCS-yes += common/mbpitch.c
-VP8_COMMON_SRCS-yes += common/modecont.c
-VP8_COMMON_SRCS-yes += common/modecontext.c
-VP8_COMMON_SRCS-yes += common/mvref_common.c
-VP8_COMMON_SRCS-yes += common/mvref_common.h
-VP8_COMMON_SRCS-yes += common/quant_common.c
-VP8_COMMON_SRCS-yes += common/recon.c
-VP8_COMMON_SRCS-yes += common/reconinter.c
-VP8_COMMON_SRCS-yes += common/reconintra.c
-VP8_COMMON_SRCS-yes += common/reconintra4x4.c
-VP8_COMMON_SRCS-yes += common/setupintrarecon.c
-VP8_COMMON_SRCS-yes += common/swapyv12buffer.c
-VP8_COMMON_SRCS-$(CONFIG_POSTPROC_VISUALIZER) += common/textblit.c
-VP8_COMMON_SRCS-yes += common/treecoder.c
-VP8_COMMON_SRCS-$(CONFIG_IMPLICIT_SEGMENTATION) += common/implicit_segmentation.c
-
-VP8_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/idct_x86.h
-VP8_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/subpixel_x86.h
-VP8_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/loopfilter_x86.h
-VP8_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/postproc_x86.h
-VP8_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/x86_systemdependent.c
-VP8_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/vp8_asm_stubs.c
-VP8_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/loopfilter_x86.c
-VP8_COMMON_SRCS-$(CONFIG_POSTPROC) += common/postproc.h
-VP8_COMMON_SRCS-$(CONFIG_POSTPROC) += common/postproc.c
-VP8_COMMON_SRCS-$(HAVE_MMX) += common/x86/idctllm_mmx.asm
-VP8_COMMON_SRCS-$(HAVE_MMX) += common/x86/iwalsh_mmx.asm
-VP8_COMMON_SRCS-$(HAVE_MMX) += common/x86/recon_mmx.asm
-VP8_COMMON_SRCS-$(HAVE_MMX) += common/x86/subpixel_mmx.asm
-VP8_COMMON_SRCS-$(HAVE_MMX) += common/x86/loopfilter_mmx.asm
-VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/idctllm_sse2.asm
-VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/recon_sse2.asm
-VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/recon_wrapper_sse2.c
-VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/subpixel_sse2.asm
-VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/loopfilter_sse2.asm
-VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/iwalsh_sse2.asm
-VP8_COMMON_SRCS-$(HAVE_SSSE3) += common/x86/subpixel_8t_ssse3.asm
-VP8_COMMON_SRCS-$(HAVE_SSSE3) += common/x86/subpixel_ssse3.asm
-ifeq ($(CONFIG_POSTPROC),yes)
-VP8_COMMON_SRCS-$(HAVE_MMX) += common/x86/postproc_mmx.asm
-VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/postproc_sse2.asm
-endif
-
-# common (c)
-ifeq ($(CONFIG_CSM),yes)
-VP8_COMMON_SRCS-yes += common/maskingmv.c
-VP8_COMMON_SRCS-$(HAVE_SSE3) += common/x86/mask_sse3.asm
-endif
-
-VP8_COMMON_SRCS-$(HAVE_SSE4_1) += common/x86/filter_sse4.c
-ifeq ($(HAVE_SSE4_1),yes)
-vp8/common/x86/filter_sse4.c.o: CFLAGS += -msse4
-endif
-
-VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/filter_sse2.c
-VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/sadmxn_x86.c
-ifeq ($(HAVE_SSE2),yes)
-vp8/common/x86/filter_sse2.c.o: CFLAGS += -msse2
-vp8/common/x86/loopfilter_x86.c.o: CFLAGS += -msse2
-vp8/common/x86/sadmxn_x86.c.o: CFLAGS += -msse2
-endif
-
-VP8_COMMON_SRCS-$(ARCH_ARM) += common/arm/arm_systemdependent.c
-VP8_COMMON_SRCS-$(ARCH_ARM) += common/arm/bilinearfilter_arm.c
-VP8_COMMON_SRCS-$(ARCH_ARM) += common/arm/bilinearfilter_arm.h
-VP8_COMMON_SRCS-$(ARCH_ARM) += common/arm/filter_arm.c
-VP8_COMMON_SRCS-$(ARCH_ARM) += common/arm/idct_arm.h
-VP8_COMMON_SRCS-$(ARCH_ARM) += common/arm/loopfilter_arm.c
-VP8_COMMON_SRCS-$(ARCH_ARM) += common/arm/loopfilter_arm.h
-VP8_COMMON_SRCS-$(ARCH_ARM) += common/arm/recon_arm.h
-VP8_COMMON_SRCS-$(ARCH_ARM) += common/arm/reconintra_arm.c
-VP8_COMMON_SRCS-$(ARCH_ARM) += common/arm/subpixel_arm.h
-
-# common (armv6)
-VP8_COMMON_SRCS-$(HAVE_ARMV6) += common/arm/armv6/bilinearfilter_v6$(ASM)
-VP8_COMMON_SRCS-$(HAVE_ARMV6) += common/arm/armv6/copymem8x4_v6$(ASM)
-VP8_COMMON_SRCS-$(HAVE_ARMV6) += common/arm/armv6/copymem8x8_v6$(ASM)
-VP8_COMMON_SRCS-$(HAVE_ARMV6) += common/arm/armv6/copymem16x16_v6$(ASM)
-VP8_COMMON_SRCS-$(HAVE_ARMV6) += common/arm/armv6/dc_only_idct_add_v6$(ASM)
-VP8_COMMON_SRCS-$(HAVE_ARMV6) += common/arm/armv6/iwalsh_v6$(ASM)
-VP8_COMMON_SRCS-$(HAVE_ARMV6) += common/arm/armv6/filter_v6$(ASM)
-VP8_COMMON_SRCS-$(HAVE_ARMV6) += common/arm/armv6/idct_v6$(ASM)
-VP8_COMMON_SRCS-$(HAVE_ARMV6) += common/arm/armv6/loopfilter_v6$(ASM)
-VP8_COMMON_SRCS-$(HAVE_ARMV6) += common/arm/armv6/recon_v6$(ASM)
-VP8_COMMON_SRCS-$(HAVE_ARMV6) += common/arm/armv6/simpleloopfilter_v6$(ASM)
-VP8_COMMON_SRCS-$(HAVE_ARMV6) += common/arm/armv6/sixtappredict8x4_v6$(ASM)
-
-# common (neon)
-VP8_COMMON_SRCS-$(HAVE_ARMV7) += common/arm/neon/bilinearpredict4x4_neon$(ASM)
-VP8_COMMON_SRCS-$(HAVE_ARMV7) += common/arm/neon/bilinearpredict8x4_neon$(ASM)
-VP8_COMMON_SRCS-$(HAVE_ARMV7) += common/arm/neon/bilinearpredict8x8_neon$(ASM)
-VP8_COMMON_SRCS-$(HAVE_ARMV7) += common/arm/neon/bilinearpredict16x16_neon$(ASM)
-VP8_COMMON_SRCS-$(HAVE_ARMV7) += common/arm/neon/copymem8x4_neon$(ASM)
-VP8_COMMON_SRCS-$(HAVE_ARMV7) += common/arm/neon/copymem8x8_neon$(ASM)
-VP8_COMMON_SRCS-$(HAVE_ARMV7) += common/arm/neon/copymem16x16_neon$(ASM)
-VP8_COMMON_SRCS-$(HAVE_ARMV7) += common/arm/neon/dc_only_idct_add_neon$(ASM)
-VP8_COMMON_SRCS-$(HAVE_ARMV7) += common/arm/neon/iwalsh_neon$(ASM)
-VP8_COMMON_SRCS-$(HAVE_ARMV7) += common/arm/neon/loopfilter_neon$(ASM)
-VP8_COMMON_SRCS-$(HAVE_ARMV7) += common/arm/neon/loopfiltersimplehorizontaledge_neon$(ASM)
-VP8_COMMON_SRCS-$(HAVE_ARMV7) += common/arm/neon/loopfiltersimpleverticaledge_neon$(ASM)
-VP8_COMMON_SRCS-$(HAVE_ARMV7) += common/arm/neon/mbloopfilter_neon$(ASM)
-VP8_COMMON_SRCS-$(HAVE_ARMV7) += common/arm/neon/recon2b_neon$(ASM)
-VP8_COMMON_SRCS-$(HAVE_ARMV7) += common/arm/neon/recon4b_neon$(ASM)
-VP8_COMMON_SRCS-$(HAVE_ARMV7) += common/arm/neon/reconb_neon$(ASM)
-VP8_COMMON_SRCS-$(HAVE_ARMV7) += common/arm/neon/shortidct4x4llm_1_neon$(ASM)
-VP8_COMMON_SRCS-$(HAVE_ARMV7) += common/arm/neon/shortidct4x4llm_neon$(ASM)
-VP8_COMMON_SRCS-$(HAVE_ARMV7) += common/arm/neon/sixtappredict4x4_neon$(ASM)
-VP8_COMMON_SRCS-$(HAVE_ARMV7) += common/arm/neon/sixtappredict8x4_neon$(ASM)
-VP8_COMMON_SRCS-$(HAVE_ARMV7) += common/arm/neon/sixtappredict8x8_neon$(ASM)
-VP8_COMMON_SRCS-$(HAVE_ARMV7) += common/arm/neon/sixtappredict16x16_neon$(ASM)
-VP8_COMMON_SRCS-$(HAVE_ARMV7) += common/arm/neon/recon16x16mb_neon$(ASM)
-VP8_COMMON_SRCS-$(HAVE_ARMV7) += common/arm/neon/buildintrapredictorsmby_neon$(ASM)
-VP8_COMMON_SRCS-$(HAVE_ARMV7) += common/arm/neon/save_neon_reg$(ASM)
-VP8_COMMON_SRCS-$(HAVE_ARMV7) += common/arm/neon/recon_neon.c
--- a/vp8/vp8_cx_iface.c
+++ /dev/null
@@ -1,1169 +1,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include "vpx/vpx_codec.h"
-#include "vpx/internal/vpx_codec_internal.h"
-#include "vpx_version.h"
-#include "vp8/encoder/onyx_int.h"
-#include "vpx/vp8e.h"
-#include "vp8/encoder/firstpass.h"
-#include "vp8/common/onyx.h"
-#include <stdlib.h>
-#include <string.h>
-
-/* This value is a sentinel for determining whether the user has set a mode
- * directly through the deprecated VP8E_SET_ENCODING_MODE control.
- */
-#define NO_MODE_SET 255
-
-struct vp8_extracfg {
- struct vpx_codec_pkt_list *pkt_list;
- vp8e_encoding_mode encoding_mode; /** best, good, realtime */
- int cpu_used; /** available cpu percentage in 1/16*/
-  unsigned int enable_auto_alt_ref; /** allow the encoder to use an alternate reference frame */
- unsigned int noise_sensitivity;
- unsigned int Sharpness;
- unsigned int static_thresh;
- unsigned int token_partitions;
- unsigned int arnr_max_frames; /* alt_ref Noise Reduction Max Frame Count */
- unsigned int arnr_strength; /* alt_ref Noise Reduction Strength */
- unsigned int arnr_type; /* alt_ref filter type */
- unsigned int experimental;
- vp8e_tuning tuning;
- unsigned int cq_level; /* constrained quality level */
- unsigned int rc_max_intra_bitrate_pct;
-
-};
-
-struct extraconfig_map {
- int usage;
- struct vp8_extracfg cfg;
-};
-
-static const struct extraconfig_map extracfg_map[] = {
- {
- 0,
- {
- NULL,
- VP8_BEST_QUALITY_ENCODING, /* Encoding Mode */
- 0, /* cpu_used */
- 0, /* enable_auto_alt_ref */
- 0, /* noise_sensitivity */
- 0, /* Sharpness */
- 0, /* static_thresh */
- VP8_ONE_TOKENPARTITION, /* token_partitions */
- 0, /* arnr_max_frames */
- 3, /* arnr_strength */
- 3, /* arnr_type*/
- 0, /* experimental mode */
- 0, /* tuning*/
- 10, /* cq_level */
- 0, /* rc_max_intra_bitrate_pct */
- }
- }
-};
-
-struct vpx_codec_alg_priv {
- vpx_codec_priv_t base;
- vpx_codec_enc_cfg_t cfg;
- struct vp8_extracfg vp8_cfg;
- VP9_CONFIG oxcf;
- VP9_PTR cpi;
- unsigned char *cx_data;
- unsigned int cx_data_sz;
- vpx_image_t preview_img;
- unsigned int next_frame_flag;
- vp8_postproc_cfg_t preview_ppcfg;
-  vpx_codec_pkt_list_decl(64) pkt_list; // changed to accommodate the maximum number of lagged frames allowed
- int deprecated_mode;
- unsigned int fixed_kf_cntr;
-};
-
-
-static vpx_codec_err_t
-update_error_state(vpx_codec_alg_priv_t *ctx,
- const struct vpx_internal_error_info *error) {
- vpx_codec_err_t res;
-
- if ((res = error->error_code))
- ctx->base.err_detail = error->has_detail
- ? error->detail
- : NULL;
-
- return res;
-}
-
-
-#undef ERROR
-#define ERROR(str) do {\
- ctx->base.err_detail = str;\
- return VPX_CODEC_INVALID_PARAM;\
- } while(0)
-
-#define RANGE_CHECK(p,memb,lo,hi) do {\
- if(!(((p)->memb == lo || (p)->memb > (lo)) && (p)->memb <= hi)) \
- ERROR(#memb " out of range ["#lo".."#hi"]");\
- } while(0)
-
-#define RANGE_CHECK_HI(p,memb,hi) do {\
- if(!((p)->memb <= (hi))) \
- ERROR(#memb " out of range [.."#hi"]");\
- } while(0)
-
-#define RANGE_CHECK_LO(p,memb,lo) do {\
- if(!((p)->memb >= (lo))) \
- ERROR(#memb " out of range ["#lo"..]");\
- } while(0)
-
-#define RANGE_CHECK_BOOL(p,memb) do {\
- if(!!((p)->memb) != (p)->memb) ERROR(#memb " expected boolean");\
- } while(0)
-
-static vpx_codec_err_t validate_config(vpx_codec_alg_priv_t *ctx,
- const vpx_codec_enc_cfg_t *cfg,
- const struct vp8_extracfg *vp8_cfg) {
- RANGE_CHECK(cfg, g_w, 1, 16383); /* 14 bits available */
- RANGE_CHECK(cfg, g_h, 1, 16383); /* 14 bits available */
- RANGE_CHECK(cfg, g_timebase.den, 1, 1000000000);
- RANGE_CHECK(cfg, g_timebase.num, 1, cfg->g_timebase.den);
- RANGE_CHECK_HI(cfg, g_profile, 3);
- RANGE_CHECK_HI(cfg, rc_max_quantizer, 63);
- RANGE_CHECK_HI(cfg, rc_min_quantizer, cfg->rc_max_quantizer);
- RANGE_CHECK_HI(cfg, g_threads, 64);
- RANGE_CHECK_HI(cfg, g_lag_in_frames, MAX_LAG_BUFFERS);
- RANGE_CHECK(cfg, rc_end_usage, VPX_VBR, VPX_CQ);
- RANGE_CHECK_HI(cfg, rc_undershoot_pct, 1000);
- RANGE_CHECK_HI(cfg, rc_overshoot_pct, 1000);
- RANGE_CHECK_HI(cfg, rc_2pass_vbr_bias_pct, 100);
- RANGE_CHECK(cfg, kf_mode, VPX_KF_DISABLED, VPX_KF_AUTO);
- // RANGE_CHECK_BOOL(cfg, g_delete_firstpassfile);
- RANGE_CHECK_BOOL(cfg, rc_resize_allowed);
- RANGE_CHECK_HI(cfg, rc_dropframe_thresh, 100);
- RANGE_CHECK_HI(cfg, rc_resize_up_thresh, 100);
- RANGE_CHECK_HI(cfg, rc_resize_down_thresh, 100);
- RANGE_CHECK(cfg, g_pass, VPX_RC_ONE_PASS, VPX_RC_LAST_PASS);
-
- /* VP8 does not support a lower bound on the keyframe interval in
- * automatic keyframe placement mode.
- */
- if (cfg->kf_mode != VPX_KF_DISABLED && cfg->kf_min_dist != cfg->kf_max_dist
- && cfg->kf_min_dist > 0)
- ERROR("kf_min_dist not supported in auto mode, use 0 "
- "or kf_max_dist instead.");
-
- RANGE_CHECK_BOOL(vp8_cfg, enable_auto_alt_ref);
- RANGE_CHECK(vp8_cfg, cpu_used, -16, 16);
-
- RANGE_CHECK(vp8_cfg, encoding_mode, VP8_BEST_QUALITY_ENCODING, VP8_REAL_TIME_ENCODING);
- RANGE_CHECK_HI(vp8_cfg, noise_sensitivity, 6);
-
- RANGE_CHECK(vp8_cfg, token_partitions, VP8_ONE_TOKENPARTITION, VP8_EIGHT_TOKENPARTITION);
- RANGE_CHECK_HI(vp8_cfg, Sharpness, 7);
- RANGE_CHECK(vp8_cfg, arnr_max_frames, 0, 15);
- RANGE_CHECK_HI(vp8_cfg, arnr_strength, 6);
- RANGE_CHECK(vp8_cfg, arnr_type, 1, 3);
- RANGE_CHECK(vp8_cfg, cq_level, 0, 63);
-
- if (cfg->g_pass == VPX_RC_LAST_PASS) {
- size_t packet_sz = sizeof(FIRSTPASS_STATS);
- int n_packets = cfg->rc_twopass_stats_in.sz / packet_sz;
- FIRSTPASS_STATS *stats;
-
- if (!cfg->rc_twopass_stats_in.buf)
- ERROR("rc_twopass_stats_in.buf not set.");
-
- if (cfg->rc_twopass_stats_in.sz % packet_sz)
- ERROR("rc_twopass_stats_in.sz indicates truncated packet.");
-
- if (cfg->rc_twopass_stats_in.sz < 2 * packet_sz)
- ERROR("rc_twopass_stats_in requires at least two packets.");
-
- stats = (void *)((char *)cfg->rc_twopass_stats_in.buf
- + (n_packets - 1) * packet_sz);
-
- if ((int)(stats->count + 0.5) != n_packets - 1)
- ERROR("rc_twopass_stats_in missing EOS stats packet");
- }
-
- return VPX_CODEC_OK;
-}
-
-
-static vpx_codec_err_t validate_img(vpx_codec_alg_priv_t *ctx,
- const vpx_image_t *img) {
- switch (img->fmt) {
- case VPX_IMG_FMT_YV12:
- case VPX_IMG_FMT_I420:
- case VPX_IMG_FMT_VPXI420:
- case VPX_IMG_FMT_VPXYV12:
- break;
- default:
- ERROR("Invalid image format. Only YV12 and I420 images are supported");
- }
-
- if ((img->d_w != ctx->cfg.g_w) || (img->d_h != ctx->cfg.g_h))
- ERROR("Image size must match encoder init configuration size");
-
- return VPX_CODEC_OK;
-}
-
-
-static vpx_codec_err_t set_vp8e_config(VP9_CONFIG *oxcf,
- vpx_codec_enc_cfg_t cfg,
- struct vp8_extracfg vp8_cfg) {
- oxcf->Version = cfg.g_profile;
- oxcf->Version |= vp8_cfg.experimental ? 0x4 : 0;
-
- oxcf->Width = cfg.g_w;
- oxcf->Height = cfg.g_h;
-  /* Derive the frame rate from the timebase; if it looks implausibly high, fall back to 30. */
- oxcf->frame_rate = (double)(cfg.g_timebase.den) / (double)(cfg.g_timebase.num);
-
- if (oxcf->frame_rate > 180) {
- oxcf->frame_rate = 30;
- }
-
- switch (cfg.g_pass) {
- case VPX_RC_ONE_PASS:
- oxcf->Mode = MODE_BESTQUALITY;
- break;
- case VPX_RC_FIRST_PASS:
- oxcf->Mode = MODE_FIRSTPASS;
- break;
- case VPX_RC_LAST_PASS:
- oxcf->Mode = MODE_SECONDPASS_BEST;
- break;
- }
-
- if (cfg.g_pass == VPX_RC_FIRST_PASS) {
- oxcf->allow_lag = 0;
- oxcf->lag_in_frames = 0;
- } else {
- oxcf->allow_lag = (cfg.g_lag_in_frames) > 0;
- oxcf->lag_in_frames = cfg.g_lag_in_frames;
- }
-
- // VBR only supported for now.
-  // CBR code has been deprecated for the experimental phase.
- // CQ mode not yet tested
- oxcf->end_usage = USAGE_LOCAL_FILE_PLAYBACK;
- /*if (cfg.rc_end_usage == VPX_CQ)
- oxcf->end_usage = USAGE_CONSTRAINED_QUALITY;
- else
- oxcf->end_usage = USAGE_LOCAL_FILE_PLAYBACK;*/
-
- oxcf->target_bandwidth = cfg.rc_target_bitrate;
- oxcf->rc_max_intra_bitrate_pct = vp8_cfg.rc_max_intra_bitrate_pct;
-
- oxcf->best_allowed_q = cfg.rc_min_quantizer;
- oxcf->worst_allowed_q = cfg.rc_max_quantizer;
- oxcf->cq_level = vp8_cfg.cq_level;
- oxcf->fixed_q = -1;
-
- oxcf->under_shoot_pct = cfg.rc_undershoot_pct;
- oxcf->over_shoot_pct = cfg.rc_overshoot_pct;
-
- oxcf->maximum_buffer_size = cfg.rc_buf_sz;
- oxcf->starting_buffer_level = cfg.rc_buf_initial_sz;
- oxcf->optimal_buffer_level = cfg.rc_buf_optimal_sz;
-
- oxcf->two_pass_vbrbias = cfg.rc_2pass_vbr_bias_pct;
- oxcf->two_pass_vbrmin_section = cfg.rc_2pass_vbr_minsection_pct;
- oxcf->two_pass_vbrmax_section = cfg.rc_2pass_vbr_maxsection_pct;
-
- oxcf->auto_key = cfg.kf_mode == VPX_KF_AUTO
- && cfg.kf_min_dist != cfg.kf_max_dist;
- // oxcf->kf_min_dist = cfg.kf_min_dis;
- oxcf->key_freq = cfg.kf_max_dist;
-
- // oxcf->delete_first_pass_file = cfg.g_delete_firstpassfile;
- // strcpy(oxcf->first_pass_file, cfg.g_firstpass_file);
-
- oxcf->cpu_used = vp8_cfg.cpu_used;
- oxcf->encode_breakout = vp8_cfg.static_thresh;
- oxcf->play_alternate = vp8_cfg.enable_auto_alt_ref;
- oxcf->noise_sensitivity = vp8_cfg.noise_sensitivity;
- oxcf->Sharpness = vp8_cfg.Sharpness;
-
- oxcf->two_pass_stats_in = cfg.rc_twopass_stats_in;
- oxcf->output_pkt_list = vp8_cfg.pkt_list;
-
- oxcf->arnr_max_frames = vp8_cfg.arnr_max_frames;
- oxcf->arnr_strength = vp8_cfg.arnr_strength;
- oxcf->arnr_type = vp8_cfg.arnr_type;
-
- oxcf->tuning = vp8_cfg.tuning;
-
-#if CONFIG_LOSSLESS
- oxcf->lossless = cfg.lossless;
-#endif
-
- /*
- printf("Current VP8 Settings: \n");
- printf("target_bandwidth: %d\n", oxcf->target_bandwidth);
- printf("noise_sensitivity: %d\n", oxcf->noise_sensitivity);
- printf("Sharpness: %d\n", oxcf->Sharpness);
- printf("cpu_used: %d\n", oxcf->cpu_used);
- printf("Mode: %d\n", oxcf->Mode);
- printf("delete_first_pass_file: %d\n", oxcf->delete_first_pass_file);
- printf("auto_key: %d\n", oxcf->auto_key);
- printf("key_freq: %d\n", oxcf->key_freq);
- printf("end_usage: %d\n", oxcf->end_usage);
- printf("under_shoot_pct: %d\n", oxcf->under_shoot_pct);
- printf("over_shoot_pct: %d\n", oxcf->over_shoot_pct);
- printf("starting_buffer_level: %d\n", oxcf->starting_buffer_level);
- printf("optimal_buffer_level: %d\n", oxcf->optimal_buffer_level);
- printf("maximum_buffer_size: %d\n", oxcf->maximum_buffer_size);
- printf("fixed_q: %d\n", oxcf->fixed_q);
- printf("worst_allowed_q: %d\n", oxcf->worst_allowed_q);
- printf("best_allowed_q: %d\n", oxcf->best_allowed_q);
- printf("two_pass_vbrbias: %d\n", oxcf->two_pass_vbrbias);
- printf("two_pass_vbrmin_section: %d\n", oxcf->two_pass_vbrmin_section);
- printf("two_pass_vbrmax_section: %d\n", oxcf->two_pass_vbrmax_section);
- printf("allow_lag: %d\n", oxcf->allow_lag);
- printf("lag_in_frames: %d\n", oxcf->lag_in_frames);
- printf("play_alternate: %d\n", oxcf->play_alternate);
- printf("Version: %d\n", oxcf->Version);
- printf("encode_breakout: %d\n", oxcf->encode_breakout);
- */
- return VPX_CODEC_OK;
-}
-
-static vpx_codec_err_t vp8e_set_config(vpx_codec_alg_priv_t *ctx,
- const vpx_codec_enc_cfg_t *cfg) {
- vpx_codec_err_t res;
-
- if ((cfg->g_w != ctx->cfg.g_w) || (cfg->g_h != ctx->cfg.g_h))
- ERROR("Cannot change width or height after initialization");
-
-  /* Prevent increasing lag_in_frames. This check is stricter than it needs
-   * to be -- the real restriction is only that lag_in_frames must not grow
-   * past its initial value, but we only track the last successful config,
-   * not the initial one.
- */
- if ((cfg->g_lag_in_frames > ctx->cfg.g_lag_in_frames))
- ERROR("Cannot increase lag_in_frames");
-
- res = validate_config(ctx, cfg, &ctx->vp8_cfg);
-
- if (!res) {
- ctx->cfg = *cfg;
- set_vp8e_config(&ctx->oxcf, ctx->cfg, ctx->vp8_cfg);
- vp9_change_config(ctx->cpi, &ctx->oxcf);
- }
-
- return res;
-}
-
-
-int vp9_reverse_trans(int q);
-
-
-static vpx_codec_err_t get_param(vpx_codec_alg_priv_t *ctx,
- int ctrl_id,
- va_list args) {
- void *arg = va_arg(args, void *);
-
-#define MAP(id, var) case id: *(RECAST(id, arg)) = var; break
-
- if (!arg)
- return VPX_CODEC_INVALID_PARAM;
-
- switch (ctrl_id) {
- MAP(VP8E_GET_LAST_QUANTIZER, vp9_get_quantizer(ctx->cpi));
- MAP(VP8E_GET_LAST_QUANTIZER_64,
- vp9_reverse_trans(vp9_get_quantizer(ctx->cpi)));
- }
-
- return VPX_CODEC_OK;
-#undef MAP
-}
-
-
-static vpx_codec_err_t set_param(vpx_codec_alg_priv_t *ctx,
- int ctrl_id,
- va_list args) {
- vpx_codec_err_t res = VPX_CODEC_OK;
- struct vp8_extracfg xcfg = ctx->vp8_cfg;
-
-#define MAP(id, var) case id: var = CAST(id, args); break;
-
- switch (ctrl_id) {
- MAP(VP8E_SET_ENCODING_MODE, ctx->deprecated_mode);
- MAP(VP8E_SET_CPUUSED, xcfg.cpu_used);
- MAP(VP8E_SET_ENABLEAUTOALTREF, xcfg.enable_auto_alt_ref);
- MAP(VP8E_SET_NOISE_SENSITIVITY, xcfg.noise_sensitivity);
- MAP(VP8E_SET_SHARPNESS, xcfg.Sharpness);
- MAP(VP8E_SET_STATIC_THRESHOLD, xcfg.static_thresh);
- MAP(VP8E_SET_TOKEN_PARTITIONS, xcfg.token_partitions);
-
- MAP(VP8E_SET_ARNR_MAXFRAMES, xcfg.arnr_max_frames);
- MAP(VP8E_SET_ARNR_STRENGTH, xcfg.arnr_strength);
- MAP(VP8E_SET_ARNR_TYPE, xcfg.arnr_type);
- MAP(VP8E_SET_TUNING, xcfg.tuning);
- MAP(VP8E_SET_CQ_LEVEL, xcfg.cq_level);
- MAP(VP8E_SET_MAX_INTRA_BITRATE_PCT, xcfg.rc_max_intra_bitrate_pct);
-
- }
-
- res = validate_config(ctx, &ctx->cfg, &xcfg);
-
- if (!res) {
- ctx->vp8_cfg = xcfg;
- set_vp8e_config(&ctx->oxcf, ctx->cfg, ctx->vp8_cfg);
- vp9_change_config(ctx->cpi, &ctx->oxcf);
- }
-
- return res;
-#undef MAP
-}
-
-
-static vpx_codec_err_t vp8e_common_init(vpx_codec_ctx_t *ctx,
- int experimental) {
- vpx_codec_err_t res = VPX_DEC_OK;
- struct vpx_codec_alg_priv *priv;
- vpx_codec_enc_cfg_t *cfg;
- unsigned int i;
-
- VP9_PTR optr;
-
- if (!ctx->priv) {
- priv = calloc(1, sizeof(struct vpx_codec_alg_priv));
-
- if (!priv) {
- return VPX_CODEC_MEM_ERROR;
- }
-
- ctx->priv = &priv->base;
- ctx->priv->sz = sizeof(*ctx->priv);
- ctx->priv->iface = ctx->iface;
- ctx->priv->alg_priv = priv;
- ctx->priv->init_flags = ctx->init_flags;
-
- if (ctx->config.enc) {
- /* Update the reference to the config structure to an
- * internal copy.
- */
- ctx->priv->alg_priv->cfg = *ctx->config.enc;
- ctx->config.enc = &ctx->priv->alg_priv->cfg;
- }
-
- cfg = &ctx->priv->alg_priv->cfg;
-
-    /* Select the extra vp8 configuration table based on the current
- * usage value. If the current usage value isn't found, use the
- * values for usage case 0.
- */
- for (i = 0;
- extracfg_map[i].usage && extracfg_map[i].usage != cfg->g_usage;
- i++);
-
- priv->vp8_cfg = extracfg_map[i].cfg;
- priv->vp8_cfg.pkt_list = &priv->pkt_list.head;
- priv->vp8_cfg.experimental = experimental;
-
- priv->cx_data_sz = priv->cfg.g_w * priv->cfg.g_h * 3 / 2 * 2;
-
- if (priv->cx_data_sz < 4096) priv->cx_data_sz = 4096;
-
- priv->cx_data = malloc(priv->cx_data_sz);
-
- if (!priv->cx_data) {
- return VPX_CODEC_MEM_ERROR;
- }
-
- priv->deprecated_mode = NO_MODE_SET;
-
- vp9_initialize_enc();
-
- res = validate_config(priv, &priv->cfg, &priv->vp8_cfg);
-
- if (!res) {
- set_vp8e_config(&ctx->priv->alg_priv->oxcf,
- ctx->priv->alg_priv->cfg,
- ctx->priv->alg_priv->vp8_cfg);
- optr = vp9_create_compressor(&ctx->priv->alg_priv->oxcf);
-
- if (!optr)
- res = VPX_CODEC_MEM_ERROR;
- else
- ctx->priv->alg_priv->cpi = optr;
- }
- }
-
- return res;
-}
-
-
-static vpx_codec_err_t vp8e_init(vpx_codec_ctx_t *ctx) {
- return vp8e_common_init(ctx, 0);
-}
-
-
-#if CONFIG_EXPERIMENTAL
-static vpx_codec_err_t vp8e_exp_init(vpx_codec_ctx_t *ctx) {
- return vp8e_common_init(ctx, 1);
-}
-#endif
-
-
-static vpx_codec_err_t vp8e_destroy(vpx_codec_alg_priv_t *ctx) {
-
- free(ctx->cx_data);
- vp9_remove_compressor(&ctx->cpi);
- free(ctx);
- return VPX_CODEC_OK;
-}
-
-static vpx_codec_err_t image2yuvconfig(const vpx_image_t *img,
- YV12_BUFFER_CONFIG *yv12) {
- vpx_codec_err_t res = VPX_CODEC_OK;
- yv12->y_buffer = img->planes[VPX_PLANE_Y];
- yv12->u_buffer = img->planes[VPX_PLANE_U];
- yv12->v_buffer = img->planes[VPX_PLANE_V];
-
- yv12->y_width = img->d_w;
- yv12->y_height = img->d_h;
- yv12->uv_width = (1 + yv12->y_width) / 2;
- yv12->uv_height = (1 + yv12->y_height) / 2;
-
- yv12->y_stride = img->stride[VPX_PLANE_Y];
- yv12->uv_stride = img->stride[VPX_PLANE_U];
-
- yv12->border = (img->stride[VPX_PLANE_Y] - img->w) / 2;
- yv12->clrtype = (img->fmt == VPX_IMG_FMT_VPXI420 || img->fmt == VPX_IMG_FMT_VPXYV12); // REG_YUV = 0
- return res;
-}
-
-static void pick_quickcompress_mode(vpx_codec_alg_priv_t *ctx,
- unsigned long duration,
- unsigned long deadline) {
- unsigned int new_qc;
-
- /* Use best quality mode if no deadline is given. */
- if (deadline)
- new_qc = MODE_GOODQUALITY;
- else
- new_qc = MODE_BESTQUALITY;
-
- if (ctx->cfg.g_pass == VPX_RC_FIRST_PASS)
- new_qc = MODE_FIRSTPASS;
- else if (ctx->cfg.g_pass == VPX_RC_LAST_PASS)
- new_qc = (new_qc == MODE_BESTQUALITY)
- ? MODE_SECONDPASS_BEST
- : MODE_SECONDPASS;
-
- if (ctx->oxcf.Mode != new_qc) {
- ctx->oxcf.Mode = new_qc;
- vp9_change_config(ctx->cpi, &ctx->oxcf);
- }
-}
-
-
-static vpx_codec_err_t vp8e_encode(vpx_codec_alg_priv_t *ctx,
- const vpx_image_t *img,
- vpx_codec_pts_t pts,
- unsigned long duration,
- vpx_enc_frame_flags_t flags,
- unsigned long deadline) {
- vpx_codec_err_t res = VPX_CODEC_OK;
-
- if (img)
- res = validate_img(ctx, img);
-
- pick_quickcompress_mode(ctx, duration, deadline);
- vpx_codec_pkt_list_init(&ctx->pkt_list);
-
- /* Handle Flags */
- if (((flags & VP8_EFLAG_NO_UPD_GF) && (flags & VP8_EFLAG_FORCE_GF))
- || ((flags & VP8_EFLAG_NO_UPD_ARF) && (flags & VP8_EFLAG_FORCE_ARF))) {
- ctx->base.err_detail = "Conflicting flags.";
- return VPX_CODEC_INVALID_PARAM;
- }
-
- if (flags & (VP8_EFLAG_NO_REF_LAST | VP8_EFLAG_NO_REF_GF
- | VP8_EFLAG_NO_REF_ARF)) {
- int ref = 7;
-
- if (flags & VP8_EFLAG_NO_REF_LAST)
- ref ^= VP9_LAST_FLAG;
-
- if (flags & VP8_EFLAG_NO_REF_GF)
- ref ^= VP9_GOLD_FLAG;
-
- if (flags & VP8_EFLAG_NO_REF_ARF)
- ref ^= VP9_ALT_FLAG;
-
- vp9_use_as_reference(ctx->cpi, ref);
- }
-
- if (flags & (VP8_EFLAG_NO_UPD_LAST | VP8_EFLAG_NO_UPD_GF
- | VP8_EFLAG_NO_UPD_ARF | VP8_EFLAG_FORCE_GF
- | VP8_EFLAG_FORCE_ARF)) {
- int upd = 7;
-
- if (flags & VP8_EFLAG_NO_UPD_LAST)
- upd ^= VP9_LAST_FLAG;
-
- if (flags & VP8_EFLAG_NO_UPD_GF)
- upd ^= VP9_GOLD_FLAG;
-
- if (flags & VP8_EFLAG_NO_UPD_ARF)
- upd ^= VP9_ALT_FLAG;
-
- vp9_update_reference(ctx->cpi, upd);
- }
-
- if (flags & VP8_EFLAG_NO_UPD_ENTROPY) {
- vp9_update_entropy(ctx->cpi, 0);
- }
-
- /* Handle fixed keyframe intervals */
- if (ctx->cfg.kf_mode == VPX_KF_AUTO
- && ctx->cfg.kf_min_dist == ctx->cfg.kf_max_dist) {
- if (++ctx->fixed_kf_cntr > ctx->cfg.kf_min_dist) {
- flags |= VPX_EFLAG_FORCE_KF;
- ctx->fixed_kf_cntr = 1;
- }
- }
-
- /* Initialize the encoder instance on the first frame*/
- if (!res && ctx->cpi) {
- unsigned int lib_flags;
- YV12_BUFFER_CONFIG sd;
- int64_t dst_time_stamp, dst_end_time_stamp;
- unsigned long size, cx_data_sz;
- unsigned char *cx_data;
-
- /* Set up internal flags */
- if (ctx->base.init_flags & VPX_CODEC_USE_PSNR)
- ((VP9_COMP *)ctx->cpi)->b_calculate_psnr = 1;
-
- // if (ctx->base.init_flags & VPX_CODEC_USE_OUTPUT_PARTITION)
- // ((VP9_COMP *)ctx->cpi)->output_partition = 1;
-
- /* Convert API flags to internal codec lib flags */
- lib_flags = (flags & VPX_EFLAG_FORCE_KF) ? FRAMEFLAGS_KEY : 0;
-
-    /* vp8 uses 10,000,000 ticks/second as its internal timestamp unit */
- dst_time_stamp = pts * 10000000 * ctx->cfg.g_timebase.num / ctx->cfg.g_timebase.den;
- dst_end_time_stamp = (pts + duration) * 10000000 * ctx->cfg.g_timebase.num / ctx->cfg.g_timebase.den;
-
- if (img != NULL) {
- res = image2yuvconfig(img, &sd);
-
- if (vp9_receive_raw_frame(ctx->cpi, ctx->next_frame_flag | lib_flags,
- &sd, dst_time_stamp, dst_end_time_stamp)) {
- VP9_COMP *cpi = (VP9_COMP *)ctx->cpi;
- res = update_error_state(ctx, &cpi->common.error);
- }
-
- /* reset for next frame */
- ctx->next_frame_flag = 0;
- }
-
- cx_data = ctx->cx_data;
- cx_data_sz = ctx->cx_data_sz;
- lib_flags = 0;
-
- while (cx_data_sz >= ctx->cx_data_sz / 2 &&
- -1 != vp9_get_compressed_data(ctx->cpi, &lib_flags, &size,
- cx_data, &dst_time_stamp,
- &dst_end_time_stamp, !img)) {
- if (size) {
- vpx_codec_pts_t round, delta;
- vpx_codec_cx_pkt_t pkt;
- VP9_COMP *cpi = (VP9_COMP *)ctx->cpi;
-
- /* Add the frame packet to the list of returned packets. */
- round = 1000000 * ctx->cfg.g_timebase.num / 2 - 1;
- delta = (dst_end_time_stamp - dst_time_stamp);
- pkt.kind = VPX_CODEC_CX_FRAME_PKT;
- pkt.data.frame.pts =
- (dst_time_stamp * ctx->cfg.g_timebase.den + round)
- / ctx->cfg.g_timebase.num / 10000000;
- pkt.data.frame.duration =
- (delta * ctx->cfg.g_timebase.den + round)
- / ctx->cfg.g_timebase.num / 10000000;
- pkt.data.frame.flags = lib_flags << 16;
-
- if (lib_flags & FRAMEFLAGS_KEY)
- pkt.data.frame.flags |= VPX_FRAME_IS_KEY;
-
- if (!cpi->common.show_frame) {
- pkt.data.frame.flags |= VPX_FRAME_IS_INVISIBLE;
-
-          // This timestamp should be as close as possible to the prior
-          // frame's PTS, so that a decoder scheduling by PTS handles the
-          // invisible frame immediately after the last visible one.
-          // Invisible frames have no duration.
- pkt.data.frame.pts = ((cpi->last_time_stamp_seen
- * ctx->cfg.g_timebase.den + round)
- / ctx->cfg.g_timebase.num / 10000000) + 1;
- pkt.data.frame.duration = 0;
- }
-
- if (cpi->droppable)
- pkt.data.frame.flags |= VPX_FRAME_IS_DROPPABLE;
-
- /*if (cpi->output_partition)
- {
- int i;
- const int num_partitions = 1;
-
- pkt.data.frame.flags |= VPX_FRAME_IS_FRAGMENT;
-
- for (i = 0; i < num_partitions; ++i)
- {
- pkt.data.frame.buf = cx_data;
- pkt.data.frame.sz = cpi->partition_sz[i];
- pkt.data.frame.partition_id = i;
- // don't set the fragment bit for the last partition
- if (i == (num_partitions - 1))
- pkt.data.frame.flags &= ~VPX_FRAME_IS_FRAGMENT;
- vpx_codec_pkt_list_add(&ctx->pkt_list.head, &pkt);
- cx_data += cpi->partition_sz[i];
- cx_data_sz -= cpi->partition_sz[i];
- }
- }
- else*/
- {
- pkt.data.frame.buf = cx_data;
- pkt.data.frame.sz = size;
- pkt.data.frame.partition_id = -1;
- vpx_codec_pkt_list_add(&ctx->pkt_list.head, &pkt);
- cx_data += size;
- cx_data_sz -= size;
- }
-
- // printf("timestamp: %lld, duration: %d\n", pkt->data.frame.pts, pkt->data.frame.duration);
- }
- }
- }
-
- return res;
-}
-
-
-static const vpx_codec_cx_pkt_t *vp8e_get_cxdata(vpx_codec_alg_priv_t *ctx,
- vpx_codec_iter_t *iter) {
- return vpx_codec_pkt_list_get(&ctx->pkt_list.head, iter);
-}
-
-static vpx_codec_err_t vp8e_set_reference(vpx_codec_alg_priv_t *ctx,
- int ctr_id,
- va_list args) {
- vpx_ref_frame_t *data = va_arg(args, vpx_ref_frame_t *);
-
- if (data) {
- vpx_ref_frame_t *frame = (vpx_ref_frame_t *)data;
- YV12_BUFFER_CONFIG sd;
-
- image2yuvconfig(&frame->img, &sd);
- vp9_set_reference_enc(ctx->cpi, frame->frame_type, &sd);
- return VPX_CODEC_OK;
- } else
- return VPX_CODEC_INVALID_PARAM;
-
-}
-
-static vpx_codec_err_t vp8e_get_reference(vpx_codec_alg_priv_t *ctx,
- int ctr_id,
- va_list args) {
-
- vpx_ref_frame_t *data = va_arg(args, vpx_ref_frame_t *);
-
- if (data) {
- vpx_ref_frame_t *frame = (vpx_ref_frame_t *)data;
- YV12_BUFFER_CONFIG sd;
-
- image2yuvconfig(&frame->img, &sd);
- vp9_get_reference_enc(ctx->cpi, frame->frame_type, &sd);
- return VPX_CODEC_OK;
- } else
- return VPX_CODEC_INVALID_PARAM;
-}
-
-static vpx_codec_err_t vp8e_set_previewpp(vpx_codec_alg_priv_t *ctx,
- int ctr_id,
- va_list args) {
-#if CONFIG_POSTPROC
- vp8_postproc_cfg_t *data = va_arg(args, vp8_postproc_cfg_t *);
- (void)ctr_id;
-
- if (data) {
- ctx->preview_ppcfg = *((vp8_postproc_cfg_t *)data);
- return VPX_CODEC_OK;
- } else
- return VPX_CODEC_INVALID_PARAM;
-#else
- (void)ctx;
- (void)ctr_id;
- (void)args;
- return VPX_CODEC_INCAPABLE;
-#endif
-}
-
-
-static vpx_image_t *vp8e_get_preview(vpx_codec_alg_priv_t *ctx) {
-
- YV12_BUFFER_CONFIG sd;
- vp9_ppflags_t flags = {0};
-
- if (ctx->preview_ppcfg.post_proc_flag) {
- flags.post_proc_flag = ctx->preview_ppcfg.post_proc_flag;
- flags.deblocking_level = ctx->preview_ppcfg.deblocking_level;
- flags.noise_level = ctx->preview_ppcfg.noise_level;
- }
-
- if (0 == vp9_get_preview_raw_frame(ctx->cpi, &sd, &flags)) {
-
- /*
- vpx_img_wrap(&ctx->preview_img, VPX_IMG_FMT_YV12,
- sd.y_width + 2*VP8BORDERINPIXELS,
- sd.y_height + 2*VP8BORDERINPIXELS,
- 1,
- sd.buffer_alloc);
- vpx_img_set_rect(&ctx->preview_img,
- VP8BORDERINPIXELS, VP8BORDERINPIXELS,
- sd.y_width, sd.y_height);
- */
-
- ctx->preview_img.bps = 12;
- ctx->preview_img.planes[VPX_PLANE_Y] = sd.y_buffer;
- ctx->preview_img.planes[VPX_PLANE_U] = sd.u_buffer;
- ctx->preview_img.planes[VPX_PLANE_V] = sd.v_buffer;
-
- if (sd.clrtype == REG_YUV)
- ctx->preview_img.fmt = VPX_IMG_FMT_I420;
- else
- ctx->preview_img.fmt = VPX_IMG_FMT_VPXI420;
-
- ctx->preview_img.x_chroma_shift = 1;
- ctx->preview_img.y_chroma_shift = 1;
-
- ctx->preview_img.d_w = sd.y_width;
- ctx->preview_img.d_h = sd.y_height;
- ctx->preview_img.stride[VPX_PLANE_Y] = sd.y_stride;
- ctx->preview_img.stride[VPX_PLANE_U] = sd.uv_stride;
- ctx->preview_img.stride[VPX_PLANE_V] = sd.uv_stride;
- ctx->preview_img.w = sd.y_width;
- ctx->preview_img.h = sd.y_height;
-
- return &ctx->preview_img;
- } else
- return NULL;
-}
-
-static vpx_codec_err_t vp8e_update_entropy(vpx_codec_alg_priv_t *ctx,
- int ctr_id,
- va_list args) {
- int update = va_arg(args, int);
- vp9_update_entropy(ctx->cpi, update);
- return VPX_CODEC_OK;
-
-}
-
-static vpx_codec_err_t vp8e_update_reference(vpx_codec_alg_priv_t *ctx,
- int ctr_id,
- va_list args) {
- int update = va_arg(args, int);
- vp9_update_reference(ctx->cpi, update);
- return VPX_CODEC_OK;
-}
-
-static vpx_codec_err_t vp8e_use_reference(vpx_codec_alg_priv_t *ctx,
- int ctr_id,
- va_list args) {
- int reference_flag = va_arg(args, int);
- vp9_use_as_reference(ctx->cpi, reference_flag);
- return VPX_CODEC_OK;
-}
-
-static vpx_codec_err_t vp8e_set_roi_map(vpx_codec_alg_priv_t *ctx,
- int ctr_id,
- va_list args) {
- vpx_roi_map_t *data = va_arg(args, vpx_roi_map_t *);
-
- if (data) {
- vpx_roi_map_t *roi = (vpx_roi_map_t *)data;
-
- if (!vp9_set_roimap(ctx->cpi, roi->roi_map, roi->rows, roi->cols,
- roi->delta_q, roi->delta_lf, roi->static_threshold))
- return VPX_CODEC_OK;
- else
- return VPX_CODEC_INVALID_PARAM;
- } else
- return VPX_CODEC_INVALID_PARAM;
-}
-
-
-static vpx_codec_err_t vp8e_set_activemap(vpx_codec_alg_priv_t *ctx,
- int ctr_id,
- va_list args) {
- vpx_active_map_t *data = va_arg(args, vpx_active_map_t *);
-
- if (data) {
-
- vpx_active_map_t *map = (vpx_active_map_t *)data;
-
- if (!vp9_set_active_map(ctx->cpi, map->active_map, map->rows, map->cols))
- return VPX_CODEC_OK;
- else
- return VPX_CODEC_INVALID_PARAM;
- } else
- return VPX_CODEC_INVALID_PARAM;
-}
-
-static vpx_codec_err_t vp8e_set_scalemode(vpx_codec_alg_priv_t *ctx,
- int ctr_id,
- va_list args) {
-
- vpx_scaling_mode_t *data = va_arg(args, vpx_scaling_mode_t *);
-
- if (data) {
- int res;
- vpx_scaling_mode_t scalemode = *(vpx_scaling_mode_t *)data;
- res = vp9_set_internal_size(ctx->cpi, scalemode.h_scaling_mode,
- scalemode.v_scaling_mode);
-
- if (!res) {
-      /* force the next frame to be a key frame so the new scaling mode takes effect */
- ctx->next_frame_flag |= FRAMEFLAGS_KEY;
- return VPX_CODEC_OK;
- } else
- return VPX_CODEC_INVALID_PARAM;
- } else
- return VPX_CODEC_INVALID_PARAM;
-}
-
-
-static vpx_codec_ctrl_fn_map_t vp8e_ctf_maps[] = {
- {VP8_SET_REFERENCE, vp8e_set_reference},
- {VP8_COPY_REFERENCE, vp8e_get_reference},
- {VP8_SET_POSTPROC, vp8e_set_previewpp},
- {VP8E_UPD_ENTROPY, vp8e_update_entropy},
- {VP8E_UPD_REFERENCE, vp8e_update_reference},
- {VP8E_USE_REFERENCE, vp8e_use_reference},
- {VP8E_SET_ROI_MAP, vp8e_set_roi_map},
- {VP8E_SET_ACTIVEMAP, vp8e_set_activemap},
- {VP8E_SET_SCALEMODE, vp8e_set_scalemode},
- {VP8E_SET_ENCODING_MODE, set_param},
- {VP8E_SET_CPUUSED, set_param},
- {VP8E_SET_NOISE_SENSITIVITY, set_param},
- {VP8E_SET_ENABLEAUTOALTREF, set_param},
- {VP8E_SET_SHARPNESS, set_param},
- {VP8E_SET_STATIC_THRESHOLD, set_param},
- {VP8E_SET_TOKEN_PARTITIONS, set_param},
- {VP8E_GET_LAST_QUANTIZER, get_param},
- {VP8E_GET_LAST_QUANTIZER_64, get_param},
- {VP8E_SET_ARNR_MAXFRAMES, set_param},
- {VP8E_SET_ARNR_STRENGTH, set_param},
- {VP8E_SET_ARNR_TYPE, set_param},
- {VP8E_SET_TUNING, set_param},
- {VP8E_SET_CQ_LEVEL, set_param},
- {VP8E_SET_MAX_INTRA_BITRATE_PCT, set_param},
- { -1, NULL},
-};
-
-static vpx_codec_enc_cfg_map_t vp8e_usage_cfg_map[] = {
- {
- 0,
- {
- 0, /* g_usage */
- 0, /* g_threads */
- 0, /* g_profile */
-
- 320, /* g_width */
- 240, /* g_height */
- {1, 30}, /* g_timebase */
-
- 0, /* g_error_resilient */
-
- VPX_RC_ONE_PASS, /* g_pass */
-
- 0, /* g_lag_in_frames */
-
- 0, /* rc_dropframe_thresh */
- 0, /* rc_resize_allowed */
-      60,                 /* rc_resize_down_thresh */
-      30,                 /* rc_resize_up_thresh */
-
- VPX_VBR, /* rc_end_usage */
-#if VPX_ENCODER_ABI_VERSION > (1 + VPX_CODEC_ABI_VERSION)
- {0}, /* rc_twopass_stats_in */
-#endif
- 256, /* rc_target_bandwidth */
- 4, /* rc_min_quantizer */
- 63, /* rc_max_quantizer */
- 100, /* rc_undershoot_pct */
- 100, /* rc_overshoot_pct */
-
- 6000, /* rc_max_buffer_size */
- 4000, /* rc_buffer_initial_size; */
- 5000, /* rc_buffer_optimal_size; */
-
- 50, /* rc_two_pass_vbrbias */
- 0, /* rc_two_pass_vbrmin_section */
- 400, /* rc_two_pass_vbrmax_section */
-
- /* keyframing settings (kf) */
- VPX_KF_AUTO, /* g_kfmode*/
- 0, /* kf_min_dist */
- 9999, /* kf_max_dist */
-
-#if VPX_ENCODER_ABI_VERSION == (1 + VPX_CODEC_ABI_VERSION)
- 1, /* g_delete_first_pass_file */
- "vp8.fpf" /* first pass filename */
-#endif
- }
- },
- { -1, {NOT_IMPLEMENTED}}
-};
-
-
-#ifndef VERSION_STRING
-#define VERSION_STRING
-#endif
-CODEC_INTERFACE(vpx_codec_vp8_cx) = {
- "WebM Project VP8 Encoder" VERSION_STRING,
- VPX_CODEC_INTERNAL_ABI_VERSION,
- VPX_CODEC_CAP_ENCODER | VPX_CODEC_CAP_PSNR |
- VPX_CODEC_CAP_OUTPUT_PARTITION,
- /* vpx_codec_caps_t caps; */
- vp8e_init, /* vpx_codec_init_fn_t init; */
- vp8e_destroy, /* vpx_codec_destroy_fn_t destroy; */
- vp8e_ctf_maps, /* vpx_codec_ctrl_fn_map_t *ctrl_maps; */
- NOT_IMPLEMENTED, /* vpx_codec_get_mmap_fn_t get_mmap; */
- NOT_IMPLEMENTED, /* vpx_codec_set_mmap_fn_t set_mmap; */
- {
- NOT_IMPLEMENTED, /* vpx_codec_peek_si_fn_t peek_si; */
- NOT_IMPLEMENTED, /* vpx_codec_get_si_fn_t get_si; */
- NOT_IMPLEMENTED, /* vpx_codec_decode_fn_t decode; */
- NOT_IMPLEMENTED, /* vpx_codec_frame_get_fn_t frame_get; */
- },
- {
- vp8e_usage_cfg_map, /* vpx_codec_enc_cfg_map_t peek_si; */
- vp8e_encode, /* vpx_codec_encode_fn_t encode; */
- vp8e_get_cxdata, /* vpx_codec_get_cx_data_fn_t frame_get; */
- vp8e_set_config,
- NOT_IMPLEMENTED,
- vp8e_get_preview,
- } /* encoder functions */
-};
-
-
-#if CONFIG_EXPERIMENTAL
-
-CODEC_INTERFACE(vpx_codec_vp8x_cx) = {
- "VP8 Experimental Encoder" VERSION_STRING,
- VPX_CODEC_INTERNAL_ABI_VERSION,
- VPX_CODEC_CAP_ENCODER | VPX_CODEC_CAP_PSNR,
- /* vpx_codec_caps_t caps; */
- vp8e_exp_init, /* vpx_codec_init_fn_t init; */
- vp8e_destroy, /* vpx_codec_destroy_fn_t destroy; */
- vp8e_ctf_maps, /* vpx_codec_ctrl_fn_map_t *ctrl_maps; */
- NOT_IMPLEMENTED, /* vpx_codec_get_mmap_fn_t get_mmap; */
- NOT_IMPLEMENTED, /* vpx_codec_set_mmap_fn_t set_mmap; */
- {
- NOT_IMPLEMENTED, /* vpx_codec_peek_si_fn_t peek_si; */
- NOT_IMPLEMENTED, /* vpx_codec_get_si_fn_t get_si; */
- NOT_IMPLEMENTED, /* vpx_codec_decode_fn_t decode; */
- NOT_IMPLEMENTED, /* vpx_codec_frame_get_fn_t frame_get; */
- },
- {
- vp8e_usage_cfg_map, /* vpx_codec_enc_cfg_map_t peek_si; */
- vp8e_encode, /* vpx_codec_encode_fn_t encode; */
- vp8e_get_cxdata, /* vpx_codec_get_cx_data_fn_t frame_get; */
- vp8e_set_config,
- NOT_IMPLEMENTED,
- vp8e_get_preview,
- } /* encoder functions */
-};
-#endif
-
-
-/*
- * BEGIN BACKWARDS COMPATIBILITY SHIM.
- */
-#define FORCE_KEY 2
-static vpx_codec_err_t api1_control(vpx_codec_alg_priv_t *ctx,
- int ctrl_id,
- va_list args) {
- vpx_codec_ctrl_fn_map_t *entry;
-
- switch (ctrl_id) {
- case VP8E_SET_FLUSHFLAG:
- /* VP8 sample code did VP8E_SET_FLUSHFLAG followed by
- * vpx_codec_get_cx_data() rather than vpx_codec_encode().
- */
- return vp8e_encode(ctx, NULL, 0, 0, 0, 0);
- case VP8E_SET_FRAMETYPE:
- ctx->base.enc.tbd |= FORCE_KEY;
- return VPX_CODEC_OK;
- }
-
- for (entry = vp8e_ctf_maps; entry && entry->fn; entry++) {
- if (!entry->ctrl_id || entry->ctrl_id == ctrl_id) {
- return entry->fn(ctx, ctrl_id, args);
- }
- }
-
- return VPX_CODEC_ERROR;
-}
-
-
-static vpx_codec_ctrl_fn_map_t api1_ctrl_maps[] = {
- {0, api1_control},
- { -1, NULL}
-};
-
-
-static vpx_codec_err_t api1_encode(vpx_codec_alg_priv_t *ctx,
- const vpx_image_t *img,
- vpx_codec_pts_t pts,
- unsigned long duration,
- vpx_enc_frame_flags_t flags,
- unsigned long deadline) {
- int force = ctx->base.enc.tbd;
-
- ctx->base.enc.tbd = 0;
- return vp8e_encode
- (ctx,
- img,
- pts,
- duration,
- flags | ((force & FORCE_KEY) ? VPX_EFLAG_FORCE_KF : 0),
- deadline);
-}
-
-
-vpx_codec_iface_t vpx_enc_vp8_algo = {
- "WebM Project VP8 Encoder (Deprecated API)" VERSION_STRING,
- VPX_CODEC_INTERNAL_ABI_VERSION,
- VPX_CODEC_CAP_ENCODER,
- /* vpx_codec_caps_t caps; */
- vp8e_init, /* vpx_codec_init_fn_t init; */
- vp8e_destroy, /* vpx_codec_destroy_fn_t destroy; */
- api1_ctrl_maps, /* vpx_codec_ctrl_fn_map_t *ctrl_maps; */
- NOT_IMPLEMENTED, /* vpx_codec_get_mmap_fn_t get_mmap; */
- NOT_IMPLEMENTED, /* vpx_codec_set_mmap_fn_t set_mmap; */
- {NOT_IMPLEMENTED}, /* decoder functions */
- {
- vp8e_usage_cfg_map, /* vpx_codec_enc_cfg_map_t peek_si; */
- api1_encode, /* vpx_codec_encode_fn_t encode; */
- vp8e_get_cxdata, /* vpx_codec_get_cx_data_fn_t frame_get; */
- vp8e_set_config,
- NOT_IMPLEMENTED,
- vp8e_get_preview,
- } /* encoder functions */
-};
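The encode path in the file above converts the caller's pts, expressed in cfg.g_timebase units, into the encoder core's fixed 10,000,000 ticks-per-second clock on input and back again when packets are emitted. A small sketch of that round trip, using plain round-to-nearest on the way back (the exact rounding term in the deleted code differs slightly; names here are illustrative):

#include <stdint.h>

#define INTERNAL_TICKS_PER_SEC 10000000

/* One API tick lasts num/den seconds (e.g. num = 1, den = 30 for 30 fps). */
static int64_t pts_to_internal(int64_t pts, int num, int den) {
  return pts * INTERNAL_TICKS_PER_SEC * num / den;
}

static int64_t internal_to_pts(int64_t ticks, int num, int den) {
  /* Round to the nearest API tick rather than truncating. */
  const int64_t round = (int64_t)INTERNAL_TICKS_PER_SEC * num / 2;
  return (ticks * den + round) / num / INTERNAL_TICKS_PER_SEC;
}

/* With a {1, 30} timebase, pts 1 maps to 333333 internal ticks and back to 1. */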
--- a/vp8/vp8_dx_iface.c
+++ /dev/null
@@ -1,717 +1,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include <stdlib.h>
-#include <string.h>
-#include "vpx/vpx_decoder.h"
-#include "vpx/vp8dx.h"
-#include "vpx/internal/vpx_codec_internal.h"
-#include "vpx_version.h"
-#include "common/onyxd.h"
-#include "decoder/onyxd_int.h"
-
-#define VP8_CAP_POSTPROC (CONFIG_POSTPROC ? VPX_CODEC_CAP_POSTPROC : 0)
-typedef vpx_codec_stream_info_t vp8_stream_info_t;
-
-/* Structures for handling memory allocations */
-typedef enum {
- VP8_SEG_ALG_PRIV = 256,
- VP8_SEG_MAX
-} mem_seg_id_t;
-#define NELEMENTS(x) ((int)(sizeof(x)/sizeof(x[0])))
-
-static unsigned long vp8_priv_sz(const vpx_codec_dec_cfg_t *si, vpx_codec_flags_t);
-
-typedef struct {
- unsigned int id;
- unsigned long sz;
- unsigned int align;
- unsigned int flags;
- unsigned long(*calc_sz)(const vpx_codec_dec_cfg_t *, vpx_codec_flags_t);
-} mem_req_t;
-
-static const mem_req_t vp8_mem_req_segs[] = {
- {VP8_SEG_ALG_PRIV, 0, 8, VPX_CODEC_MEM_ZERO, vp8_priv_sz},
- {VP8_SEG_MAX, 0, 0, 0, NULL}
-};
-
-struct vpx_codec_alg_priv {
- vpx_codec_priv_t base;
- vpx_codec_mmap_t mmaps[NELEMENTS(vp8_mem_req_segs) - 1];
- vpx_codec_dec_cfg_t cfg;
- vp8_stream_info_t si;
- int defer_alloc;
- int decoder_init;
- VP9D_PTR pbi;
- int postproc_cfg_set;
- vp8_postproc_cfg_t postproc_cfg;
-#if CONFIG_POSTPROC_VISUALIZER
- unsigned int dbg_postproc_flag;
- int dbg_color_ref_frame_flag;
- int dbg_color_mb_modes_flag;
- int dbg_color_b_modes_flag;
- int dbg_display_mv_flag;
-#endif
- vpx_image_t img;
- int img_setup;
- int img_avail;
-};
-
-static unsigned long vp8_priv_sz(const vpx_codec_dec_cfg_t *si,
- vpx_codec_flags_t flags) {
- /* Although this declaration is constant, we can't use it in the requested
- * segments list because we want to define the requested segments list
- * before defining the private type (so that the number of memory maps is
- * known)
- */
- (void)si;
- return sizeof(vpx_codec_alg_priv_t);
-}
-
-
-static void vp8_mmap_dtor(vpx_codec_mmap_t *mmap) {
- free(mmap->priv);
-}
-
-static vpx_codec_err_t vp8_mmap_alloc(vpx_codec_mmap_t *mmap) {
- vpx_codec_err_t res;
- unsigned int align;
-
- align = mmap->align ? mmap->align - 1 : 0;
-
- if (mmap->flags & VPX_CODEC_MEM_ZERO)
- mmap->priv = calloc(1, mmap->sz + align);
- else
- mmap->priv = malloc(mmap->sz + align);
-
- res = (mmap->priv) ? VPX_CODEC_OK : VPX_CODEC_MEM_ERROR;
- mmap->base = (void *)((((uintptr_t)mmap->priv) + align) & ~(uintptr_t)align);
- mmap->dtor = vp8_mmap_dtor;
- return res;
-}
-
-static vpx_codec_err_t vp8_validate_mmaps(const vp8_stream_info_t *si,
- const vpx_codec_mmap_t *mmaps,
- vpx_codec_flags_t init_flags) {
- int i;
- vpx_codec_err_t res = VPX_CODEC_OK;
-
- for (i = 0; i < NELEMENTS(vp8_mem_req_segs) - 1; i++) {
- /* Ensure the segment has been allocated */
- if (!mmaps[i].base) {
- res = VPX_CODEC_MEM_ERROR;
- break;
- }
-
- /* Verify variable size segment is big enough for the current si. */
- if (vp8_mem_req_segs[i].calc_sz) {
- vpx_codec_dec_cfg_t cfg;
-
- cfg.w = si->w;
- cfg.h = si->h;
-
- if (mmaps[i].sz < vp8_mem_req_segs[i].calc_sz(&cfg, init_flags)) {
- res = VPX_CODEC_MEM_ERROR;
- break;
- }
- }
- }
-
- return res;
-}
-
-static void vp8_init_ctx(vpx_codec_ctx_t *ctx, const vpx_codec_mmap_t *mmap) {
- int i;
-
- ctx->priv = mmap->base;
- ctx->priv->sz = sizeof(*ctx->priv);
- ctx->priv->iface = ctx->iface;
- ctx->priv->alg_priv = mmap->base;
-
- for (i = 0; i < NELEMENTS(ctx->priv->alg_priv->mmaps); i++)
- ctx->priv->alg_priv->mmaps[i].id = vp8_mem_req_segs[i].id;
-
- ctx->priv->alg_priv->mmaps[0] = *mmap;
- ctx->priv->alg_priv->si.sz = sizeof(ctx->priv->alg_priv->si);
- ctx->priv->init_flags = ctx->init_flags;
-
- if (ctx->config.dec) {
- /* Update the reference to the config structure to an internal copy. */
- ctx->priv->alg_priv->cfg = *ctx->config.dec;
- ctx->config.dec = &ctx->priv->alg_priv->cfg;
- }
-}
-
-static void *mmap_lkup(vpx_codec_alg_priv_t *ctx, unsigned int id) {
- int i;
-
- for (i = 0; i < NELEMENTS(ctx->mmaps); i++)
- if (ctx->mmaps[i].id == id)
- return ctx->mmaps[i].base;
-
- return NULL;
-}
-static void vp8_finalize_mmaps(vpx_codec_alg_priv_t *ctx) {
- /* nothing to clean up */
-}
-
-static vpx_codec_err_t vp8_init(vpx_codec_ctx_t *ctx) {
- vpx_codec_err_t res = VPX_CODEC_OK;
-
- /* This function only allocates space for the vpx_codec_alg_priv_t
- * structure. More memory may be required at the time the stream
- * information becomes known.
- */
- if (!ctx->priv) {
- vpx_codec_mmap_t mmap;
-
- mmap.id = vp8_mem_req_segs[0].id;
- mmap.sz = sizeof(vpx_codec_alg_priv_t);
- mmap.align = vp8_mem_req_segs[0].align;
- mmap.flags = vp8_mem_req_segs[0].flags;
-
- res = vp8_mmap_alloc(&mmap);
-
- if (!res) {
- vp8_init_ctx(ctx, &mmap);
-
- ctx->priv->alg_priv->defer_alloc = 1;
-      /* post-processing level initialized to do nothing */
- }
- }
-
- return res;
-}
-
-static vpx_codec_err_t vp8_destroy(vpx_codec_alg_priv_t *ctx) {
- int i;
-
- vp9_remove_decompressor(ctx->pbi);
-
- for (i = NELEMENTS(ctx->mmaps) - 1; i >= 0; i--) {
- if (ctx->mmaps[i].dtor)
- ctx->mmaps[i].dtor(&ctx->mmaps[i]);
- }
-
- return VPX_CODEC_OK;
-}
-
-static vpx_codec_err_t vp8_peek_si(const uint8_t *data,
- unsigned int data_sz,
- vpx_codec_stream_info_t *si) {
- vpx_codec_err_t res = VPX_CODEC_OK;
-
- if (data + data_sz <= data)
- res = VPX_CODEC_INVALID_PARAM;
- else {
-    /* Parse the uncompressed part of the key frame header:
-     * 3 bytes: version, frame type and an offset
-     * 3 bytes: sync code (0x9d, 0x01, 0x2a)
-     * 4 bytes: image width and height, in the lowest 14 bits
-     *         of each 2-byte value.
- */
- si->is_kf = 0;
-
- if (data_sz >= 10 && !(data[0] & 0x01)) { /* I-Frame */
- const uint8_t *c = data + 3;
- si->is_kf = 1;
-
- /* vet via sync code */
- if (c[0] != 0x9d || c[1] != 0x01 || c[2] != 0x2a)
- res = VPX_CODEC_UNSUP_BITSTREAM;
-
- si->w = (c[3] | (c[4] << 8)) & 0x3fff;
- si->h = (c[5] | (c[6] << 8)) & 0x3fff;
-
- /*printf("w=%d, h=%d\n", si->w, si->h);*/
- if (!(si->h | si->w))
- res = VPX_CODEC_UNSUP_BITSTREAM;
- } else
- res = VPX_CODEC_UNSUP_BITSTREAM;
- }
-
- return res;
-
-}
-
-static vpx_codec_err_t vp8_get_si(vpx_codec_alg_priv_t *ctx,
- vpx_codec_stream_info_t *si) {
-
- unsigned int sz;
-
- if (si->sz >= sizeof(vp8_stream_info_t))
- sz = sizeof(vp8_stream_info_t);
- else
- sz = sizeof(vpx_codec_stream_info_t);
-
- memcpy(si, &ctx->si, sz);
- si->sz = sz;
-
- return VPX_CODEC_OK;
-}
-
-
-static vpx_codec_err_t
-update_error_state(vpx_codec_alg_priv_t *ctx,
- const struct vpx_internal_error_info *error) {
- vpx_codec_err_t res;
-
- if ((res = error->error_code))
- ctx->base.err_detail = error->has_detail
- ? error->detail
- : NULL;
-
- return res;
-}
-
-static void yuvconfig2image(vpx_image_t *img,
- const YV12_BUFFER_CONFIG *yv12,
- void *user_priv) {
- /** vpx_img_wrap() doesn't allow specifying independent strides for
- * the Y, U, and V planes, nor other alignment adjustments that
- * might be representable by a YV12_BUFFER_CONFIG, so we just
- * initialize all the fields.*/
- img->fmt = yv12->clrtype == REG_YUV ?
- VPX_IMG_FMT_I420 : VPX_IMG_FMT_VPXI420;
- img->w = yv12->y_stride;
- img->h = (yv12->y_height + 2 * VP8BORDERINPIXELS + 15) & ~15;
- img->d_w = yv12->y_width;
- img->d_h = yv12->y_height;
- img->x_chroma_shift = 1;
- img->y_chroma_shift = 1;
- img->planes[VPX_PLANE_Y] = yv12->y_buffer;
- img->planes[VPX_PLANE_U] = yv12->u_buffer;
- img->planes[VPX_PLANE_V] = yv12->v_buffer;
- img->planes[VPX_PLANE_ALPHA] = NULL;
- img->stride[VPX_PLANE_Y] = yv12->y_stride;
- img->stride[VPX_PLANE_U] = yv12->uv_stride;
- img->stride[VPX_PLANE_V] = yv12->uv_stride;
- img->stride[VPX_PLANE_ALPHA] = yv12->y_stride;
- img->bps = 12;
- img->user_priv = user_priv;
- img->img_data = yv12->buffer_alloc;
- img->img_data_owner = 0;
- img->self_allocd = 0;
-}
-
-static vpx_codec_err_t vp8_decode(vpx_codec_alg_priv_t *ctx,
- const uint8_t *data,
- unsigned int data_sz,
- void *user_priv,
- long deadline) {
- vpx_codec_err_t res = VPX_CODEC_OK;
-
- ctx->img_avail = 0;
-
- /* Determine the stream parameters. Note that we rely on peek_si to
- * validate that we have a buffer that does not wrap around the top
- * of the heap.
- */
- if (!ctx->si.h)
- res = ctx->base.iface->dec.peek_si(data, data_sz, &ctx->si);
-
-
- /* Perform deferred allocations, if required */
- if (!res && ctx->defer_alloc) {
- int i;
-
- for (i = 1; !res && i < NELEMENTS(ctx->mmaps); i++) {
- vpx_codec_dec_cfg_t cfg;
-
- cfg.w = ctx->si.w;
- cfg.h = ctx->si.h;
- ctx->mmaps[i].id = vp8_mem_req_segs[i].id;
- ctx->mmaps[i].sz = vp8_mem_req_segs[i].sz;
- ctx->mmaps[i].align = vp8_mem_req_segs[i].align;
- ctx->mmaps[i].flags = vp8_mem_req_segs[i].flags;
-
- if (!ctx->mmaps[i].sz)
- ctx->mmaps[i].sz = vp8_mem_req_segs[i].calc_sz(&cfg,
- ctx->base.init_flags);
-
- res = vp8_mmap_alloc(&ctx->mmaps[i]);
- }
-
- if (!res)
- vp8_finalize_mmaps(ctx);
-
- ctx->defer_alloc = 0;
- }
-
- /* Initialize the decoder instance on the first frame*/
- if (!res && !ctx->decoder_init) {
- res = vp8_validate_mmaps(&ctx->si, ctx->mmaps, ctx->base.init_flags);
-
- if (!res) {
- VP9D_CONFIG oxcf;
- VP9D_PTR optr;
-
- vp9_initialize_dec();
-
- oxcf.Width = ctx->si.w;
- oxcf.Height = ctx->si.h;
- oxcf.Version = 9;
- oxcf.postprocess = 0;
- oxcf.max_threads = ctx->cfg.threads;
- optr = vp9_create_decompressor(&oxcf);
-
- /* If postprocessing was enabled by the application and a
- * configuration has not been provided, default it.
- */
- if (!ctx->postproc_cfg_set
- && (ctx->base.init_flags & VPX_CODEC_USE_POSTPROC)) {
- ctx->postproc_cfg.post_proc_flag =
- VP8_DEBLOCK | VP8_DEMACROBLOCK;
- ctx->postproc_cfg.deblocking_level = 4;
- ctx->postproc_cfg.noise_level = 0;
- }
-
- if (!optr)
- res = VPX_CODEC_ERROR;
- else
- ctx->pbi = optr;
- }
-
- ctx->decoder_init = 1;
- }
-
- if (!res && ctx->pbi) {
- YV12_BUFFER_CONFIG sd;
- int64_t time_stamp = 0, time_end_stamp = 0;
- vp9_ppflags_t flags = {0};
-
- if (ctx->base.init_flags & VPX_CODEC_USE_POSTPROC) {
- flags.post_proc_flag = ctx->postproc_cfg.post_proc_flag
-#if CONFIG_POSTPROC_VISUALIZER
-
- | ((ctx->dbg_color_ref_frame_flag != 0) ? VP9D_DEBUG_CLR_FRM_REF_BLKS : 0)
- | ((ctx->dbg_color_mb_modes_flag != 0) ? VP9D_DEBUG_CLR_BLK_MODES : 0)
- | ((ctx->dbg_color_b_modes_flag != 0) ? VP9D_DEBUG_CLR_BLK_MODES : 0)
- | ((ctx->dbg_display_mv_flag != 0) ? VP9D_DEBUG_DRAW_MV : 0)
-#endif
-;
- flags.deblocking_level = ctx->postproc_cfg.deblocking_level;
- flags.noise_level = ctx->postproc_cfg.noise_level;
-#if CONFIG_POSTPROC_VISUALIZER
- flags.display_ref_frame_flag = ctx->dbg_color_ref_frame_flag;
- flags.display_mb_modes_flag = ctx->dbg_color_mb_modes_flag;
- flags.display_b_modes_flag = ctx->dbg_color_b_modes_flag;
- flags.display_mv_flag = ctx->dbg_display_mv_flag;
-#endif
- }
-
- if (vp9_receive_compressed_data(ctx->pbi, data_sz, data, deadline)) {
- VP9D_COMP *pbi = (VP9D_COMP *)ctx->pbi;
- res = update_error_state(ctx, &pbi->common.error);
- }
-
- if (!res && 0 == vp9_get_raw_frame(ctx->pbi, &sd, &time_stamp,
- &time_end_stamp, &flags)) {
- yuvconfig2image(&ctx->img, &sd, user_priv);
- ctx->img_avail = 1;
- }
- }
-
- return res;
-}
-
-static vpx_image_t *vp8_get_frame(vpx_codec_alg_priv_t *ctx,
- vpx_codec_iter_t *iter) {
- vpx_image_t *img = NULL;
-
- if (ctx->img_avail) {
- /* iter acts as a flip flop, so an image is only returned on the first
- * call to get_frame.
- */
- if (!(*iter)) {
- img = &ctx->img;
- *iter = img;
- }
- }
-
- return img;
-}
-
-
-static
-vpx_codec_err_t vp8_xma_get_mmap(const vpx_codec_ctx_t *ctx,
- vpx_codec_mmap_t *mmap,
- vpx_codec_iter_t *iter) {
- vpx_codec_err_t res;
- const mem_req_t *seg_iter = *iter;
-
- /* Get address of next segment request */
- do {
- if (!seg_iter)
- seg_iter = vp8_mem_req_segs;
- else if (seg_iter->id != VP8_SEG_MAX)
- seg_iter++;
-
- *iter = (vpx_codec_iter_t)seg_iter;
-
- if (seg_iter->id != VP8_SEG_MAX) {
- mmap->id = seg_iter->id;
- mmap->sz = seg_iter->sz;
- mmap->align = seg_iter->align;
- mmap->flags = seg_iter->flags;
-
- if (!seg_iter->sz)
- mmap->sz = seg_iter->calc_sz(ctx->config.dec, ctx->init_flags);
-
- res = VPX_CODEC_OK;
- } else
- res = VPX_CODEC_LIST_END;
- } while (!mmap->sz && res != VPX_CODEC_LIST_END);
-
- return res;
-}
-
-static vpx_codec_err_t vp8_xma_set_mmap(vpx_codec_ctx_t *ctx,
- const vpx_codec_mmap_t *mmap) {
- vpx_codec_err_t res = VPX_CODEC_MEM_ERROR;
- int i, done;
-
- if (!ctx->priv) {
- if (mmap->id == VP8_SEG_ALG_PRIV) {
- if (!ctx->priv) {
- vp8_init_ctx(ctx, mmap);
- res = VPX_CODEC_OK;
- }
- }
- }
-
- done = 1;
-
- if (!res && ctx->priv->alg_priv) {
- for (i = 0; i < NELEMENTS(ctx->priv->alg_priv->mmaps); i++) {
- if (ctx->priv->alg_priv->mmaps[i].id == mmap->id)
- if (!ctx->priv->alg_priv->mmaps[i].base) {
- ctx->priv->alg_priv->mmaps[i] = *mmap;
- res = VPX_CODEC_OK;
- }
-
- done &= (ctx->priv->alg_priv->mmaps[i].base != NULL);
- }
- }
-
- if (done && !res) {
- vp8_finalize_mmaps(ctx->priv->alg_priv);
- res = ctx->iface->init(ctx);
- }
-
- return res;
-}
-
-static vpx_codec_err_t image2yuvconfig(const vpx_image_t *img,
- YV12_BUFFER_CONFIG *yv12) {
- vpx_codec_err_t res = VPX_CODEC_OK;
- yv12->y_buffer = img->planes[VPX_PLANE_Y];
- yv12->u_buffer = img->planes[VPX_PLANE_U];
- yv12->v_buffer = img->planes[VPX_PLANE_V];
-
- yv12->y_width = img->d_w;
- yv12->y_height = img->d_h;
- yv12->uv_width = yv12->y_width / 2;
- yv12->uv_height = yv12->y_height / 2;
-
- yv12->y_stride = img->stride[VPX_PLANE_Y];
- yv12->uv_stride = img->stride[VPX_PLANE_U];
-
- yv12->border = (img->stride[VPX_PLANE_Y] - img->d_w) / 2;
- yv12->clrtype = (img->fmt == VPX_IMG_FMT_VPXI420 ||
- img->fmt == VPX_IMG_FMT_VPXYV12);
-
- return res;
-}
-
-
-static vpx_codec_err_t vp9_set_reference(vpx_codec_alg_priv_t *ctx,
- int ctr_id,
- va_list args) {
-
- vpx_ref_frame_t *data = va_arg(args, vpx_ref_frame_t *);
-
- if (data) {
- vpx_ref_frame_t *frame = (vpx_ref_frame_t *)data;
- YV12_BUFFER_CONFIG sd;
-
- image2yuvconfig(&frame->img, &sd);
-
- return vp9_set_reference_dec(ctx->pbi, frame->frame_type, &sd);
- } else
- return VPX_CODEC_INVALID_PARAM;
-
-}
-
-static vpx_codec_err_t vp9_get_reference(vpx_codec_alg_priv_t *ctx,
- int ctr_id,
- va_list args) {
-
- vpx_ref_frame_t *data = va_arg(args, vpx_ref_frame_t *);
-
- if (data) {
- vpx_ref_frame_t *frame = (vpx_ref_frame_t *)data;
- YV12_BUFFER_CONFIG sd;
-
- image2yuvconfig(&frame->img, &sd);
-
- return vp9_get_reference_dec(ctx->pbi, frame->frame_type, &sd);
- } else
- return VPX_CODEC_INVALID_PARAM;
-
-}
-
-static vpx_codec_err_t vp8_set_postproc(vpx_codec_alg_priv_t *ctx,
- int ctr_id,
- va_list args) {
-#if CONFIG_POSTPROC
- vp8_postproc_cfg_t *data = va_arg(args, vp8_postproc_cfg_t *);
-
- if (data) {
- ctx->postproc_cfg_set = 1;
- ctx->postproc_cfg = *((vp8_postproc_cfg_t *)data);
- return VPX_CODEC_OK;
- } else
- return VPX_CODEC_INVALID_PARAM;
-
-#else
- return VPX_CODEC_INCAPABLE;
-#endif
-}
-
-static vpx_codec_err_t vp8_set_dbg_options(vpx_codec_alg_priv_t *ctx,
- int ctrl_id,
- va_list args) {
-#if CONFIG_POSTPROC_VISUALIZER && CONFIG_POSTPROC
- int data = va_arg(args, int);
-
-#define MAP(id, var) case id: var = data; break;
-
- switch (ctrl_id) {
- MAP(VP8_SET_DBG_COLOR_REF_FRAME, ctx->dbg_color_ref_frame_flag);
- MAP(VP8_SET_DBG_COLOR_MB_MODES, ctx->dbg_color_mb_modes_flag);
- MAP(VP8_SET_DBG_COLOR_B_MODES, ctx->dbg_color_b_modes_flag);
- MAP(VP8_SET_DBG_DISPLAY_MV, ctx->dbg_display_mv_flag);
- }
-
- return VPX_CODEC_OK;
-#else
- return VPX_CODEC_INCAPABLE;
-#endif
-}
-
-static vpx_codec_err_t vp8_get_last_ref_updates(vpx_codec_alg_priv_t *ctx,
- int ctrl_id,
- va_list args) {
- int *update_info = va_arg(args, int *);
- VP9D_COMP *pbi = (VP9D_COMP *)ctx->pbi;
-
- if (update_info) {
- *update_info = pbi->common.refresh_alt_ref_frame * (int) VP8_ALTR_FRAME
- + pbi->common.refresh_golden_frame * (int) VP8_GOLD_FRAME
- + pbi->common.refresh_last_frame * (int) VP8_LAST_FRAME;
-
- return VPX_CODEC_OK;
- } else
- return VPX_CODEC_INVALID_PARAM;
-}
-
-
-static vpx_codec_err_t vp8_get_frame_corrupted(vpx_codec_alg_priv_t *ctx,
- int ctrl_id,
- va_list args) {
-
- int *corrupted = va_arg(args, int *);
-
- if (corrupted) {
- VP9D_COMP *pbi = (VP9D_COMP *)ctx->pbi;
- *corrupted = pbi->common.frame_to_show->corrupted;
-
- return VPX_CODEC_OK;
- } else
- return VPX_CODEC_INVALID_PARAM;
-
-}
-
-static vpx_codec_ctrl_fn_map_t ctf_maps[] = {
- {VP8_SET_REFERENCE, vp9_set_reference},
- {VP8_COPY_REFERENCE, vp9_get_reference},
- {VP8_SET_POSTPROC, vp8_set_postproc},
- {VP8_SET_DBG_COLOR_REF_FRAME, vp8_set_dbg_options},
- {VP8_SET_DBG_COLOR_MB_MODES, vp8_set_dbg_options},
- {VP8_SET_DBG_COLOR_B_MODES, vp8_set_dbg_options},
- {VP8_SET_DBG_DISPLAY_MV, vp8_set_dbg_options},
- {VP8D_GET_LAST_REF_UPDATES, vp8_get_last_ref_updates},
- {VP8D_GET_FRAME_CORRUPTED, vp8_get_frame_corrupted},
- { -1, NULL},
-};
-
-
-#ifndef VERSION_STRING
-#define VERSION_STRING
-#endif
-CODEC_INTERFACE(vpx_codec_vp8_dx) = {
- "WebM Project VP8 Decoder" VERSION_STRING,
- VPX_CODEC_INTERNAL_ABI_VERSION,
- VPX_CODEC_CAP_DECODER | VP8_CAP_POSTPROC |
- VPX_CODEC_CAP_INPUT_PARTITION,
- /* vpx_codec_caps_t caps; */
- vp8_init, /* vpx_codec_init_fn_t init; */
- vp8_destroy, /* vpx_codec_destroy_fn_t destroy; */
- ctf_maps, /* vpx_codec_ctrl_fn_map_t *ctrl_maps; */
- vp8_xma_get_mmap, /* vpx_codec_get_mmap_fn_t get_mmap; */
- vp8_xma_set_mmap, /* vpx_codec_set_mmap_fn_t set_mmap; */
- {
- vp8_peek_si, /* vpx_codec_peek_si_fn_t peek_si; */
- vp8_get_si, /* vpx_codec_get_si_fn_t get_si; */
- vp8_decode, /* vpx_codec_decode_fn_t decode; */
- vp8_get_frame, /* vpx_codec_frame_get_fn_t frame_get; */
- },
- {
- /* encoder functions */
- NOT_IMPLEMENTED,
- NOT_IMPLEMENTED,
- NOT_IMPLEMENTED,
- NOT_IMPLEMENTED,
- NOT_IMPLEMENTED,
- NOT_IMPLEMENTED
- }
-};
-
-/*
- * BEGIN BACKWARDS COMPATIBILITY SHIM.
- */
-vpx_codec_iface_t vpx_codec_vp8_algo = {
- "WebM Project VP8 Decoder (Deprecated API)" VERSION_STRING,
- VPX_CODEC_INTERNAL_ABI_VERSION,
- VPX_CODEC_CAP_DECODER | VP8_CAP_POSTPROC,
- /* vpx_codec_caps_t caps; */
- vp8_init, /* vpx_codec_init_fn_t init; */
- vp8_destroy, /* vpx_codec_destroy_fn_t destroy; */
- ctf_maps, /* vpx_codec_ctrl_fn_map_t *ctrl_maps; */
- vp8_xma_get_mmap, /* vpx_codec_get_mmap_fn_t get_mmap; */
- vp8_xma_set_mmap, /* vpx_codec_set_mmap_fn_t set_mmap; */
- {
- vp8_peek_si, /* vpx_codec_peek_si_fn_t peek_si; */
- vp8_get_si, /* vpx_codec_get_si_fn_t get_si; */
- vp8_decode, /* vpx_codec_decode_fn_t decode; */
- vp8_get_frame, /* vpx_codec_frame_get_fn_t frame_get; */
- },
- {
- /* encoder functions */
- NOT_IMPLEMENTED,
- NOT_IMPLEMENTED,
- NOT_IMPLEMENTED,
- NOT_IMPLEMENTED,
- NOT_IMPLEMENTED,
- NOT_IMPLEMENTED
- }
-};
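(For reference, the uncompressed key-frame header probe that the removed vp8_peek_si() performs above can be sketched in a few lines of standalone C. This is a minimal illustration of the layout noted in its comments: a 3-byte frame tag, the 3-byte sync code 0x9d 0x01 0x2a, then width and height in the low 14 bits of two little-endian 16-bit values. The helper name is hypothetical and the snippet is not part of the patch.)

    #include <stdint.h>
    #include <stddef.h>

    /* Hypothetical sketch of the key-frame header probe done by vp8_peek_si()
     * above; returns 0 and fills w/h on success, -1 otherwise. */
    static int peek_keyframe_dimensions(const uint8_t *data, size_t data_sz,
                                        unsigned int *w, unsigned int *h) {
      const uint8_t *c;

      if (data_sz < 10 || (data[0] & 0x01))      /* key frames have bit 0 clear */
        return -1;
      c = data + 3;                              /* skip the 3-byte frame tag */
      if (c[0] != 0x9d || c[1] != 0x01 || c[2] != 0x2a)  /* sync code */
        return -1;
      *w = (c[3] | (c[4] << 8)) & 0x3fff;        /* width: low 14 bits */
      *h = (c[5] | (c[6] << 8)) & 0x3fff;        /* height: low 14 bits */
      return (*w && *h) ? 0 : -1;
    }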
--- a/vp8/vp8cx.mk
+++ /dev/null
@@ -1,120 +1,0 @@
-##
-## Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-##
-## Use of this source code is governed by a BSD-style license
-## that can be found in the LICENSE file in the root of the source
-## tree. An additional intellectual property rights grant can be found
-## in the file PATENTS. All contributing project authors may
-## be found in the AUTHORS file in the root of the source tree.
-##
-
-
-include $(SRC_PATH_BARE)/$(VP8_PREFIX)vp8_common.mk
-
-VP8_CX_EXPORTS += exports_enc
-
-VP8_CX_SRCS-yes += $(VP8_COMMON_SRCS-yes)
-VP8_CX_SRCS-no += $(VP8_COMMON_SRCS-no)
-VP8_CX_SRCS_REMOVE-yes += $(VP8_COMMON_SRCS_REMOVE-yes)
-VP8_CX_SRCS_REMOVE-no += $(VP8_COMMON_SRCS_REMOVE-no)
-
-ifeq ($(ARCH_ARM),yes)
- include $(SRC_PATH_BARE)/$(VP8_PREFIX)vp8cx_arm.mk
-endif
-
-VP8_CX_SRCS-yes += vp8_cx_iface.c
-
-# encoder
-#INCLUDES += algo/vpx_common/vpx_mem/include
-#INCLUDES += common
-#INCLUDES += common
-#INCLUDES += common
-#INCLUDES += algo/vpx_ref/cpu_id/include
-#INCLUDES += common
-#INCLUDES += encoder
-
-VP8_CX_SRCS-yes += encoder/asm_enc_offsets.c
-VP8_CX_SRCS-yes += encoder/bitstream.c
-VP8_CX_SRCS-yes += encoder/boolhuff.c
-VP8_CX_SRCS-yes += encoder/dct.c
-VP8_CX_SRCS-yes += encoder/encodeframe.c
-VP8_CX_SRCS-yes += encoder/encodeintra.c
-VP8_CX_SRCS-yes += encoder/encodemb.c
-VP8_CX_SRCS-yes += encoder/encodemv.c
-VP8_CX_SRCS-yes += encoder/firstpass.c
-VP8_CX_SRCS-yes += encoder/generic/csystemdependent.c
-VP8_CX_SRCS-yes += encoder/block.h
-VP8_CX_SRCS-yes += encoder/boolhuff.h
-VP8_CX_SRCS-yes += encoder/bitstream.h
-VP8_CX_SRCS-yes += encoder/encodeintra.h
-VP8_CX_SRCS-yes += encoder/encodemb.h
-VP8_CX_SRCS-yes += encoder/encodemv.h
-VP8_CX_SRCS-yes += encoder/firstpass.h
-VP8_CX_SRCS-yes += encoder/lookahead.c
-VP8_CX_SRCS-yes += encoder/lookahead.h
-VP8_CX_SRCS-yes += encoder/mcomp.h
-VP8_CX_SRCS-yes += encoder/modecosts.h
-VP8_CX_SRCS-yes += encoder/onyx_int.h
-VP8_CX_SRCS-yes += encoder/psnr.h
-VP8_CX_SRCS-yes += encoder/quantize.h
-VP8_CX_SRCS-yes += encoder/ratectrl.h
-VP8_CX_SRCS-yes += encoder/rdopt.h
-VP8_CX_SRCS-yes += encoder/tokenize.h
-VP8_CX_SRCS-yes += encoder/treewriter.h
-VP8_CX_SRCS-yes += encoder/variance.h
-VP8_CX_SRCS-yes += encoder/mcomp.c
-VP8_CX_SRCS-yes += encoder/modecosts.c
-VP8_CX_SRCS-yes += encoder/onyx_if.c
-VP8_CX_SRCS-yes += encoder/picklpf.c
-VP8_CX_SRCS-yes += encoder/psnr.c
-VP8_CX_SRCS-yes += encoder/quantize.c
-VP8_CX_SRCS-yes += encoder/ratectrl.c
-VP8_CX_SRCS-yes += encoder/rdopt.c
-VP8_CX_SRCS-yes += encoder/sad_c.c
-VP8_CX_SRCS-yes += encoder/satd_c.c
-VP8_CX_SRCS-yes += encoder/segmentation.c
-VP8_CX_SRCS-yes += encoder/segmentation.h
-VP8_CX_SRCS-$(CONFIG_INTERNAL_STATS) += encoder/ssim.c
-VP8_CX_SRCS-yes += encoder/tokenize.c
-VP8_CX_SRCS-yes += encoder/treewriter.c
-VP8_CX_SRCS-yes += encoder/variance_c.c
-ifeq ($(CONFIG_POSTPROC),yes)
-VP8_CX_SRCS-$(CONFIG_INTERNAL_STATS) += common/postproc.h
-VP8_CX_SRCS-$(CONFIG_INTERNAL_STATS) += common/postproc.c
-endif
-VP8_CX_SRCS-yes += encoder/temporal_filter.c
-VP8_CX_SRCS-yes += encoder/temporal_filter.h
-VP8_CX_SRCS-yes += encoder/mbgraph.c
-VP8_CX_SRCS-yes += encoder/mbgraph.h
-
-
-VP8_CX_SRCS-$(ARCH_X86)$(ARCH_X86_64) += encoder/x86/mcomp_x86.h
-VP8_CX_SRCS-$(ARCH_X86)$(ARCH_X86_64) += encoder/x86/quantize_x86.h
-VP8_CX_SRCS-$(ARCH_X86)$(ARCH_X86_64) += encoder/x86/temporal_filter_x86.h
-VP8_CX_SRCS-$(ARCH_X86)$(ARCH_X86_64) += encoder/x86/x86_csystemdependent.c
-VP8_CX_SRCS-$(HAVE_MMX) += encoder/x86/variance_mmx.c
-VP8_CX_SRCS-$(HAVE_MMX) += encoder/x86/variance_impl_mmx.asm
-VP8_CX_SRCS-$(HAVE_MMX) += encoder/x86/sad_mmx.asm
-VP8_CX_SRCS-$(HAVE_MMX) += encoder/x86/dct_mmx.asm
-VP8_CX_SRCS-$(HAVE_MMX) += encoder/x86/subtract_mmx.asm
-VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/dct_sse2.asm
-VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/variance_sse2.c
-VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/variance_impl_sse2.asm
-VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/sad_sse2.asm
-VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/fwalsh_sse2.asm
-VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/quantize_sse2.asm
-VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/subtract_sse2.asm
-VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/temporal_filter_apply_sse2.asm
-VP8_CX_SRCS-$(HAVE_SSE3) += encoder/x86/sad_sse3.asm
-VP8_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/sad_ssse3.asm
-VP8_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/variance_ssse3.c
-VP8_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/variance_impl_ssse3.asm
-VP8_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/quantize_ssse3.asm
-VP8_CX_SRCS-$(HAVE_SSE4_1) += encoder/x86/sad_sse4.asm
-VP8_CX_SRCS-$(HAVE_SSE4_1) += encoder/x86/quantize_sse4.asm
-VP8_CX_SRCS-$(ARCH_X86)$(ARCH_X86_64) += encoder/x86/quantize_mmx.asm
-VP8_CX_SRCS-$(ARCH_X86)$(ARCH_X86_64) += encoder/x86/encodeopt.asm
-VP8_CX_SRCS-$(ARCH_X86_64) += encoder/x86/ssim_opt.asm
-
-
-VP8_CX_SRCS-yes := $(filter-out $(VP8_CX_SRCS_REMOVE-yes),$(VP8_CX_SRCS-yes))
--- a/vp8/vp8cx_arm.mk
+++ /dev/null
@@ -1,63 +1,0 @@
-##
-## Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-##
-## Use of this source code is governed by a BSD-style license
-## that can be found in the LICENSE file in the root of the source
-## tree. An additional intellectual property rights grant can be found
-## in the file PATENTS. All contributing project authors may
-## be found in the AUTHORS file in the root of the source tree.
-##
-
-
-#VP8_CX_SRCS list is modified according to different platforms.
-
-#File list for arm
-# encoder
-VP8_CX_SRCS-$(ARCH_ARM) += encoder/arm/arm_csystemdependent.c
-
-VP8_CX_SRCS-$(ARCH_ARM) += encoder/arm/dct_arm.c
-VP8_CX_SRCS-$(ARCH_ARM) += encoder/arm/dct_arm.h
-VP8_CX_SRCS-$(ARCH_ARM) += encoder/arm/encodemb_arm.h
-VP8_CX_SRCS-$(ARCH_ARM) += encoder/arm/quantize_arm.c
-VP8_CX_SRCS-$(ARCH_ARM) += encoder/arm/quantize_arm.h
-VP8_CX_SRCS-$(ARCH_ARM) += encoder/arm/variance_arm.c
-VP8_CX_SRCS-$(ARCH_ARM) += encoder/arm/variance_arm.h
-
-#File list for armv5te
-# encoder
-VP8_CX_SRCS-$(HAVE_ARMV5TE) += encoder/arm/boolhuff_arm.c
-VP8_CX_SRCS_REMOVE-$(HAVE_ARMV5TE) += encoder/boolhuff.c
-VP8_CX_SRCS-$(HAVE_ARMV5TE) += encoder/arm/armv5te/boolhuff_armv5te$(ASM)
-VP8_CX_SRCS-$(HAVE_ARMV5TE) += encoder/arm/armv5te/vp8_packtokens_armv5$(ASM)
-VP8_CX_SRCS-$(HAVE_ARMV5TE) += encoder/arm/armv5te/vp8_packtokens_mbrow_armv5$(ASM)
-VP8_CX_SRCS-$(HAVE_ARMV5TE) += encoder/arm/armv5te/vp8_packtokens_partitions_armv5$(ASM)
-
-#File list for armv6
-# encoder
-VP8_CX_SRCS-$(HAVE_ARMV6) += encoder/arm/armv6/vp8_subtract_armv6$(ASM)
-VP8_CX_SRCS-$(HAVE_ARMV6) += encoder/arm/armv6/vp8_short_fdct4x4_armv6$(ASM)
-VP8_CX_SRCS-$(HAVE_ARMV6) += encoder/arm/armv6/vp8_fast_quantize_b_armv6$(ASM)
-VP8_CX_SRCS-$(HAVE_ARMV6) += encoder/arm/armv6/vp8_sad16x16_armv6$(ASM)
-VP8_CX_SRCS-$(HAVE_ARMV6) += encoder/arm/armv6/vp8_variance16x16_armv6$(ASM)
-VP8_CX_SRCS-$(HAVE_ARMV6) += encoder/arm/armv6/vp8_variance_halfpixvar16x16_h_armv6$(ASM)
-VP8_CX_SRCS-$(HAVE_ARMV6) += encoder/arm/armv6/vp8_variance_halfpixvar16x16_v_armv6$(ASM)
-VP8_CX_SRCS-$(HAVE_ARMV6) += encoder/arm/armv6/vp8_variance_halfpixvar16x16_hv_armv6$(ASM)
-VP8_CX_SRCS-$(HAVE_ARMV6) += encoder/arm/armv6/vp8_mse16x16_armv6$(ASM)
-VP8_CX_SRCS-$(HAVE_ARMV6) += encoder/arm/armv6/vp8_variance8x8_armv6$(ASM)
-VP8_CX_SRCS-$(HAVE_ARMV6) += encoder/arm/armv6/walsh_v6$(ASM)
-
-#File list for neon
-# encoder
-VP8_CX_SRCS-$(HAVE_ARMV7) += encoder/arm/neon/fastquantizeb_neon$(ASM)
-VP8_CX_SRCS-$(HAVE_ARMV7) += encoder/arm/neon/picklpf_arm.c
-VP8_CX_SRCS-$(HAVE_ARMV7) += encoder/arm/neon/sad8_neon$(ASM)
-VP8_CX_SRCS-$(HAVE_ARMV7) += encoder/arm/neon/sad16_neon$(ASM)
-VP8_CX_SRCS-$(HAVE_ARMV7) += encoder/arm/neon/shortfdct_neon$(ASM)
-VP8_CX_SRCS-$(HAVE_ARMV7) += encoder/arm/neon/subtract_neon$(ASM)
-VP8_CX_SRCS-$(HAVE_ARMV7) += encoder/arm/neon/variance_neon$(ASM)
-VP8_CX_SRCS-$(HAVE_ARMV7) += encoder/arm/neon/vp8_mse16x16_neon$(ASM)
-VP8_CX_SRCS-$(HAVE_ARMV7) += encoder/arm/neon/vp8_subpixelvariance8x8_neon$(ASM)
-VP8_CX_SRCS-$(HAVE_ARMV7) += encoder/arm/neon/vp8_subpixelvariance16x16_neon$(ASM)
-VP8_CX_SRCS-$(HAVE_ARMV7) += encoder/arm/neon/vp8_subpixelvariance16x16s_neon$(ASM)
-VP8_CX_SRCS-$(HAVE_ARMV7) += encoder/arm/neon/vp8_memcpy_neon$(ASM)
-VP8_CX_SRCS-$(HAVE_ARMV7) += encoder/arm/neon/vp8_shortwalsh4x4_neon$(ASM)
--- a/vp8/vp8dx.mk
+++ /dev/null
@@ -1,71 +1,0 @@
-##
-## Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-##
-## Use of this source code is governed by a BSD-style license
-## that can be found in the LICENSE file in the root of the source
-## tree. An additional intellectual property rights grant can be found
-## in the file PATENTS. All contributing project authors may
-## be found in the AUTHORS file in the root of the source tree.
-##
-
-
-include $(SRC_PATH_BARE)/$(VP8_PREFIX)vp8_common.mk
-
-VP8_DX_EXPORTS += exports_dec
-
-VP8_DX_SRCS-yes += $(VP8_COMMON_SRCS-yes)
-VP8_DX_SRCS-no += $(VP8_COMMON_SRCS-no)
-VP8_DX_SRCS_REMOVE-yes += $(VP8_COMMON_SRCS_REMOVE-yes)
-VP8_DX_SRCS_REMOVE-no += $(VP8_COMMON_SRCS_REMOVE-no)
-
-ifeq ($(ARCH_ARM),yes)
- include $(SRC_PATH_BARE)/$(VP8_PREFIX)vp8dx_arm.mk
-endif
-
-VP8_DX_SRCS-yes += vp8_dx_iface.c
-
-# common
-#define ARM
-#define DISABLE_THREAD
-
-#INCLUDES += algo/vpx_common/vpx_mem/include
-#INCLUDES += common
-#INCLUDES += common
-#INCLUDES += common
-#INCLUDES += common
-#INCLUDES += decoder
-
-
-
-# decoder
-#define ARM
-#define DISABLE_THREAD
-
-#INCLUDES += algo/vpx_common/vpx_mem/include
-#INCLUDES += common
-#INCLUDES += common
-#INCLUDES += common
-#INCLUDES += common
-#INCLUDES += decoder
-
-VP8_DX_SRCS-yes += decoder/asm_dec_offsets.c
-VP8_DX_SRCS-yes += decoder/dboolhuff.c
-VP8_DX_SRCS-yes += decoder/decodemv.c
-VP8_DX_SRCS-yes += decoder/decodframe.c
-VP8_DX_SRCS-yes += decoder/dequantize.c
-VP8_DX_SRCS-yes += decoder/detokenize.c
-VP8_DX_SRCS-yes += decoder/dboolhuff.h
-VP8_DX_SRCS-yes += decoder/decodemv.h
-VP8_DX_SRCS-yes += decoder/dequantize.h
-VP8_DX_SRCS-yes += decoder/detokenize.h
-VP8_DX_SRCS-yes += decoder/onyxd_int.h
-VP8_DX_SRCS-yes += decoder/treereader.h
-VP8_DX_SRCS-yes += decoder/onyxd_if.c
-VP8_DX_SRCS-yes += decoder/idct_blk.c
-
-VP8_DX_SRCS-yes := $(filter-out $(VP8_DX_SRCS_REMOVE-yes),$(VP8_DX_SRCS-yes))
-
-VP8_DX_SRCS-$(ARCH_X86)$(ARCH_X86_64) += decoder/x86/x86_dsystemdependent.c
-VP8_DX_SRCS-$(HAVE_MMX) += decoder/x86/dequantize_mmx.asm
-VP8_DX_SRCS-$(HAVE_MMX) += decoder/x86/idct_blk_mmx.c
-VP8_DX_SRCS-$(HAVE_SSE2) += decoder/x86/idct_blk_sse2.c
--- a/vp8/vp8dx_arm.mk
+++ /dev/null
@@ -1,29 +1,0 @@
-##
-## Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-##
-## Use of this source code is governed by a BSD-style license
-## that can be found in the LICENSE file in the root of the source
-## tree. An additional intellectual property rights grant can be found
-## in the file PATENTS. All contributing project authors may
-## be found in the AUTHORS file in the root of the source tree.
-##
-
-
-#VP8_DX_SRCS list is modified according to different platforms.
-
-VP8_DX_SRCS-$(ARCH_ARM) += decoder/arm/dequantize_arm.c
-
-#File list for armv6
-VP8_DX_SRCS-$(HAVE_ARMV6) += decoder/arm/armv6/dequant_dc_idct_v6$(ASM)
-VP8_DX_SRCS-$(HAVE_ARMV6) += decoder/arm/armv6/dequant_idct_v6$(ASM)
-VP8_DX_SRCS-$(HAVE_ARMV6) += decoder/arm/armv6/dequantize_v6$(ASM)
-VP8_DX_SRCS-$(HAVE_ARMV6) += decoder/arm/armv6/idct_blk_v6.c
-
-#File list for neon
-VP8_DX_SRCS-$(HAVE_ARMV7) += decoder/arm/neon/idct_dequant_dc_full_2x_neon$(ASM)
-VP8_DX_SRCS-$(HAVE_ARMV7) += decoder/arm/neon/idct_dequant_dc_0_2x_neon$(ASM)
-VP8_DX_SRCS-$(HAVE_ARMV7) += decoder/arm/neon/dequant_idct_neon$(ASM)
-VP8_DX_SRCS-$(HAVE_ARMV7) += decoder/arm/neon/idct_dequant_full_2x_neon$(ASM)
-VP8_DX_SRCS-$(HAVE_ARMV7) += decoder/arm/neon/idct_dequant_0_2x_neon$(ASM)
-VP8_DX_SRCS-$(HAVE_ARMV7) += decoder/arm/neon/dequantizeb_neon$(ASM)
-VP8_DX_SRCS-$(HAVE_ARMV7) += decoder/arm/neon/idct_blk_neon.c
--- /dev/null
+++ b/vp9/common/alloccommon.c
@@ -1,0 +1,220 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "vpx_ports/config.h"
+#include "blockd.h"
+#include "vpx_mem/vpx_mem.h"
+#include "onyxc_int.h"
+#include "findnearmv.h"
+#include "entropymode.h"
+#include "entropymv.h"
+#include "systemdependent.h"
+
+
+void vp9_update_mode_info_border(VP9_COMMON *cpi, MODE_INFO *mi_base) {
+ int stride = cpi->mode_info_stride;
+ int i;
+
+ // Clear down top border row
+ vpx_memset(mi_base, 0, sizeof(MODE_INFO) * cpi->mode_info_stride);
+
+ // Clear left border column
+ for (i = 1; i < cpi->mb_rows + 1; i++) {
+ vpx_memset(&mi_base[i * stride], 0, sizeof(MODE_INFO));
+ }
+}
+
+void vp9_update_mode_info_in_image(VP9_COMMON *cpi, MODE_INFO *mi) {
+ int i, j;
+
+  // For each mode_info element inside the image, set the in-image flag to 1
+ for (i = 0; i < cpi->mb_rows; i++) {
+ for (j = 0; j < cpi->mb_cols; j++) {
+ mi->mbmi.mb_in_image = 1;
+ mi++; // Next element in the row
+ }
+
+ mi++; // Step over border element at start of next row
+ }
+}
+
+void vp9_de_alloc_frame_buffers(VP9_COMMON *oci) {
+ int i;
+
+ for (i = 0; i < NUM_YV12_BUFFERS; i++)
+ vp8_yv12_de_alloc_frame_buffer(&oci->yv12_fb[i]);
+
+ vp8_yv12_de_alloc_frame_buffer(&oci->temp_scale_frame);
+ vp8_yv12_de_alloc_frame_buffer(&oci->post_proc_buffer);
+
+ vpx_free(oci->above_context);
+ vpx_free(oci->mip);
+ vpx_free(oci->prev_mip);
+
+ oci->above_context = 0;
+ oci->mip = 0;
+ oci->prev_mip = 0;
+
+}
+
+int vp9_alloc_frame_buffers(VP9_COMMON *oci, int width, int height) {
+ int i;
+
+ vp9_de_alloc_frame_buffers(oci);
+
+ /* our internal buffers are always multiples of 16 */
+ if ((width & 0xf) != 0)
+ width += 16 - (width & 0xf);
+
+ if ((height & 0xf) != 0)
+ height += 16 - (height & 0xf);
+
+
+ for (i = 0; i < NUM_YV12_BUFFERS; i++) {
+ oci->fb_idx_ref_cnt[i] = 0;
+ oci->yv12_fb[i].flags = 0;
+ if (vp8_yv12_alloc_frame_buffer(&oci->yv12_fb[i], width, height, VP8BORDERINPIXELS) < 0) {
+ vp9_de_alloc_frame_buffers(oci);
+ return 1;
+ }
+ }
+
+ oci->new_fb_idx = 0;
+ oci->lst_fb_idx = 1;
+ oci->gld_fb_idx = 2;
+ oci->alt_fb_idx = 3;
+
+ oci->fb_idx_ref_cnt[0] = 1;
+ oci->fb_idx_ref_cnt[1] = 1;
+ oci->fb_idx_ref_cnt[2] = 1;
+ oci->fb_idx_ref_cnt[3] = 1;
+
+ if (vp8_yv12_alloc_frame_buffer(&oci->temp_scale_frame, width, 16, VP8BORDERINPIXELS) < 0) {
+ vp9_de_alloc_frame_buffers(oci);
+ return 1;
+ }
+
+ if (vp8_yv12_alloc_frame_buffer(&oci->post_proc_buffer, width, height, VP8BORDERINPIXELS) < 0) {
+ vp9_de_alloc_frame_buffers(oci);
+ return 1;
+ }
+
+ oci->mb_rows = height >> 4;
+ oci->mb_cols = width >> 4;
+ oci->MBs = oci->mb_rows * oci->mb_cols;
+ oci->mode_info_stride = oci->mb_cols + 1;
+ oci->mip = vpx_calloc((oci->mb_cols + 1) * (oci->mb_rows + 1), sizeof(MODE_INFO));
+
+ if (!oci->mip) {
+ vp9_de_alloc_frame_buffers(oci);
+ return 1;
+ }
+
+ oci->mi = oci->mip + oci->mode_info_stride + 1;
+
+ /* allocate memory for last frame MODE_INFO array */
+
+ oci->prev_mip = vpx_calloc((oci->mb_cols + 1) * (oci->mb_rows + 1), sizeof(MODE_INFO));
+
+ if (!oci->prev_mip) {
+ vp9_de_alloc_frame_buffers(oci);
+ return 1;
+ }
+
+ oci->prev_mi = oci->prev_mip + oci->mode_info_stride + 1;
+
+ oci->above_context = vpx_calloc(sizeof(ENTROPY_CONTEXT_PLANES) * oci->mb_cols, 1);
+
+ if (!oci->above_context) {
+ vp9_de_alloc_frame_buffers(oci);
+ return 1;
+ }
+
+ vp9_update_mode_info_border(oci, oci->mip);
+ vp9_update_mode_info_in_image(oci, oci->mi);
+
+ return 0;
+}
+void vp9_setup_version(VP9_COMMON *cm) {
+ if (cm->version & 0x4) {
+ if (!CONFIG_EXPERIMENTAL)
+ vpx_internal_error(&cm->error, VPX_CODEC_UNSUP_BITSTREAM,
+ "Bitstream was created by an experimental "
+ "encoder");
+ cm->experimental = 1;
+ }
+
+ switch (cm->version & 0x3) {
+ case 0:
+ cm->no_lpf = 0;
+ cm->filter_type = NORMAL_LOOPFILTER;
+ cm->use_bilinear_mc_filter = 0;
+ cm->full_pixel = 0;
+ break;
+ case 1:
+ cm->no_lpf = 0;
+ cm->filter_type = SIMPLE_LOOPFILTER;
+ cm->use_bilinear_mc_filter = 1;
+ cm->full_pixel = 0;
+ break;
+ case 2:
+ case 3:
+ cm->no_lpf = 1;
+ cm->filter_type = NORMAL_LOOPFILTER;
+ cm->use_bilinear_mc_filter = 1;
+ cm->full_pixel = 0;
+ break;
+ // Full pel only code deprecated in experimental code base
+ // case 3:
+ // cm->no_lpf = 1;
+ // cm->filter_type = SIMPLE_LOOPFILTER;
+ // cm->use_bilinear_mc_filter = 1;
+ // cm->full_pixel = 1;
+ // break;
+ }
+}
+void vp9_create_common(VP9_COMMON *oci) {
+ vp9_machine_specific_config(oci);
+
+ vp9_init_mbmode_probs(oci);
+
+ vp9_default_bmode_probs(oci->fc.bmode_prob);
+
+ oci->txfm_mode = ONLY_4X4;
+ oci->mb_no_coeff_skip = 1;
+ oci->comp_pred_mode = HYBRID_PREDICTION;
+ oci->no_lpf = 0;
+ oci->filter_type = NORMAL_LOOPFILTER;
+ oci->use_bilinear_mc_filter = 0;
+ oci->full_pixel = 0;
+ oci->clr_type = REG_YUV;
+ oci->clamp_type = RECON_CLAMP_REQUIRED;
+
+ /* Initialise reference frame sign bias structure to defaults */
+ vpx_memset(oci->ref_frame_sign_bias, 0, sizeof(oci->ref_frame_sign_bias));
+
+ /* Default disable buffer to buffer copying */
+ oci->copy_buffer_to_gf = 0;
+ oci->copy_buffer_to_arf = 0;
+ oci->kf_ymode_probs_update = 0;
+}
+
+void vp9_remove_common(VP9_COMMON *oci) {
+ vp9_de_alloc_frame_buffers(oci);
+}
+
+void vp9_initialize_common() {
+ vp9_coef_tree_initialize();
+
+ vp9_entropy_mode_init();
+
+ vp9_entropy_mv_init();
+}
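(The MODE_INFO allocation above reserves one border row and one border column around the in-image grid: mip holds (mb_cols + 1) * (mb_rows + 1) entries, mode_info_stride is mb_cols + 1, and mi points at the first in-image element. A minimal indexing sketch under those assumptions follows; the helper name is hypothetical and the snippet is not part of the patch.)

    /* Hypothetical helper: address the in-image MODE_INFO for macroblock
     * (mb_row, mb_col), given the layout set up by vp9_alloc_frame_buffers():
     * mode_info_stride = mb_cols + 1 and mi = mip + mode_info_stride + 1. */
    static MODE_INFO *mi_at(VP9_COMMON *oci, int mb_row, int mb_col) {
      return &oci->mi[mb_row * oci->mode_info_stride + mb_col];
    }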
--- /dev/null
+++ b/vp9/common/alloccommon.h
@@ -1,0 +1,26 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef __INC_ALLOCCOMMON_H
+#define __INC_ALLOCCOMMON_H
+
+#include "onyxc_int.h"
+
+void vp9_create_common(VP9_COMMON *oci);
+void vp9_remove_common(VP9_COMMON *oci);
+void vp9_de_alloc_frame_buffers(VP9_COMMON *oci);
+int vp9_alloc_frame_buffers(VP9_COMMON *oci, int width, int height);
+void vp9_setup_version(VP9_COMMON *oci);
+
+void vp9_update_mode_info_border(VP9_COMMON *cpi, MODE_INFO *mi_base);
+void vp9_update_mode_info_in_image(VP9_COMMON *cpi, MODE_INFO *mi);
+
+#endif
--- /dev/null
+++ b/vp9/common/arm/arm_systemdependent.c
@@ -1,0 +1,92 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "vpx_ports/config.h"
+#include "vpx_ports/arm.h"
+#include "vp9/common/pragmas.h"
+#include "vp9/common/subpixel.h"
+#include "vp9/common/loopfilter.h"
+#include "vp9/common/recon.h"
+#include "vp9/common/idct.h"
+#include "vp9/common/onyxc_int.h"
+
+void vp9_arch_arm_common_init(VP9_COMMON *ctx) {
+#if CONFIG_RUNTIME_CPU_DETECT
+ VP9_COMMON_RTCD *rtcd = &ctx->rtcd;
+ int flags = arm_cpu_caps();
+ rtcd->flags = flags;
+
+ /* Override default functions with fastest ones for this CPU. */
+#if HAVE_ARMV5TE
+ if (flags & HAS_EDSP) {
+ }
+#endif
+
+// The commented functions need to be re-written for vpx.
+#if HAVE_ARMV6
+ if (flags & HAS_MEDIA) {
+ rtcd->subpix.sixtap16x16 = vp9_sixtap_predict16x16_armv6;
+ rtcd->subpix.sixtap8x8 = vp9_sixtap_predict8x8_armv6;
+ rtcd->subpix.sixtap8x4 = vp9_sixtap_predict8x4_armv6;
+ rtcd->subpix.sixtap4x4 = vp9_sixtap_predict_armv6;
+
+ rtcd->subpix.bilinear16x16 = vp9_bilinear_predict16x16_armv6;
+ rtcd->subpix.bilinear8x8 = vp9_bilinear_predict8x8_armv6;
+ rtcd->subpix.bilinear8x4 = vp9_bilinear_predict8x4_armv6;
+ rtcd->subpix.bilinear4x4 = vp9_bilinear_predict4x4_armv6;
+
+ // rtcd->idct.idct1 = vp9_short_idct4x4llm_1_v6;
+ // rtcd->idct.idct16 = vp9_short_idct4x4llm_v6_dual;
+ // rtcd->idct.iwalsh1 = vp9_short_inv_walsh4x4_1_v6;
+ // rtcd->idct.iwalsh16 = vp9_short_inv_walsh4x4_v6;
+
+ rtcd->recon.copy16x16 = vp9_copy_mem16x16_v6;
+ rtcd->recon.copy8x8 = vp9_copy_mem8x8_v6;
+ rtcd->recon.copy8x4 = vp9_copy_mem8x4_v6;
+ rtcd->recon.recon = vp9_recon_b_armv6;
+ rtcd->recon.recon2 = vp9_recon2b_armv6;
+ rtcd->recon.recon4 = vp9_recon4b_armv6;
+ }
+#endif
+
+#if HAVE_ARMV7
+ if (flags & HAS_NEON) {
+ rtcd->subpix.sixtap16x16 = vp9_sixtap_predict16x16_neon;
+ rtcd->subpix.sixtap8x8 = vp9_sixtap_predict8x8_neon;
+ rtcd->subpix.sixtap8x4 = vp9_sixtap_predict8x4_neon;
+ rtcd->subpix.sixtap4x4 = vp9_sixtap_predict_neon;
+
+ rtcd->subpix.bilinear16x16 = vp9_bilinear_predict16x16_neon;
+ rtcd->subpix.bilinear8x8 = vp9_bilinear_predict8x8_neon;
+ rtcd->subpix.bilinear8x4 = vp9_bilinear_predict8x4_neon;
+ rtcd->subpix.bilinear4x4 = vp9_bilinear_predict4x4_neon;
+
+ // rtcd->idct.idct1 = vp9_short_idct4x4llm_1_neon;
+ // rtcd->idct.idct16 = vp9_short_idct4x4llm_neon;
+ // rtcd->idct.iwalsh1 = vp9_short_inv_walsh4x4_1_neon;
+ // rtcd->idct.iwalsh16 = vp9_short_inv_walsh4x4_neon;
+
+ rtcd->recon.copy16x16 = vp9_copy_mem16x16_neon;
+ rtcd->recon.copy8x8 = vp9_copy_mem8x8_neon;
+ rtcd->recon.copy8x4 = vp9_copy_mem8x4_neon;
+ rtcd->recon.recon = vp9_recon_b_neon;
+ rtcd->recon.recon2 = vp9_recon2b_neon;
+ rtcd->recon.recon4 = vp9_recon4b_neon;
+ rtcd->recon.recon_mb = vp9_recon_mb_neon;
+ rtcd->recon.build_intra_predictors_mby =
+ vp9_build_intra_predictors_mby_neon;
+ rtcd->recon.build_intra_predictors_mby_s =
+ vp9_build_intra_predictors_mby_s_neon;
+ }
+#endif
+
+#endif
+}
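(vp9_arch_arm_common_init() above only fills in the run-time dispatch table; with CONFIG_RUNTIME_CPU_DETECT enabled, callers reach the selected routines through ctx->rtcd rather than by name. A caller-side sketch is below. The wrapper name is hypothetical, and the parameter order is an assumption based on the VP8-era subpixel interface (source pointer and stride, x/y sub-pixel offsets, destination pointer and pitch); it is not part of the patch.)

    /* Hypothetical caller: invoke whichever 16x16 six-tap predictor the ARM
     * init routine above installed for the detected CPU (ARMv6 media or NEON).
     * The prototype is assumed from the VP8-era subpixel interface. */
    static void sixtap_predict_16x16(VP9_COMMON *cm,
                                     unsigned char *src, int src_stride,
                                     int xoffset, int yoffset,
                                     unsigned char *dst, int dst_pitch) {
      cm->rtcd.subpix.sixtap16x16(src, src_stride, xoffset, yoffset,
                                  dst, dst_pitch);
    }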
--- /dev/null
+++ b/vp9/common/arm/armv6/bilinearfilter_v6.asm
@@ -1,0 +1,237 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+ EXPORT |vp9_filter_block2d_bil_first_pass_armv6|
+ EXPORT |vp9_filter_block2d_bil_second_pass_armv6|
+
+ AREA |.text|, CODE, READONLY ; name this block of code
+
+;-------------------------------------
+; r0 unsigned char *src_ptr,
+; r1 unsigned short *dst_ptr,
+; r2 unsigned int src_pitch,
+; r3 unsigned int height,
+; stack unsigned int width,
+; stack const short *vp9_filter
+;-------------------------------------
+; The output is stored transposed in the output array to make the second pass filtering easier.
+|vp9_filter_block2d_bil_first_pass_armv6| PROC
+ stmdb sp!, {r4 - r11, lr}
+
+ ldr r11, [sp, #40] ; vp9_filter address
+ ldr r4, [sp, #36] ; width
+
+ mov r12, r3 ; outer-loop counter
+
+ add r7, r2, r4 ; preload next row
+ pld [r0, r7]
+
+ sub r2, r2, r4 ; src increment for height loop
+
+ ldr r5, [r11] ; load up filter coefficients
+
+ mov r3, r3, lsl #1 ; height*2
+ add r3, r3, #2 ; plus 2 to keep the output buffer 4-byte aligned, since height is actually (height+1)
+
+ mov r11, r1 ; save dst_ptr for each row
+
+ cmp r5, #128 ; if filter coef = 128, then skip the filter
+ beq bil_null_1st_filter
+
+|bil_height_loop_1st_v6|
+ ldrb r6, [r0] ; load source data
+ ldrb r7, [r0, #1]
+ ldrb r8, [r0, #2]
+ mov lr, r4, lsr #2 ; 4-in-parallel loop counter
+
+|bil_width_loop_1st_v6|
+ ldrb r9, [r0, #3]
+ ldrb r10, [r0, #4]
+
+ pkhbt r6, r6, r7, lsl #16 ; src[1] | src[0]
+ pkhbt r7, r7, r8, lsl #16 ; src[2] | src[1]
+
+ smuad r6, r6, r5 ; apply the filter
+ pkhbt r8, r8, r9, lsl #16 ; src[3] | src[2]
+ smuad r7, r7, r5
+ pkhbt r9, r9, r10, lsl #16 ; src[4] | src[3]
+
+ smuad r8, r8, r5
+ smuad r9, r9, r5
+
+ add r0, r0, #4
+ subs lr, lr, #1
+
+ add r6, r6, #0x40 ; round_shift_and_clamp
+ add r7, r7, #0x40
+ usat r6, #16, r6, asr #7
+ usat r7, #16, r7, asr #7
+
+ strh r6, [r1], r3 ; result is transposed and stored
+
+ add r8, r8, #0x40 ; round_shift_and_clamp
+ strh r7, [r1], r3
+ add r9, r9, #0x40
+ usat r8, #16, r8, asr #7
+ usat r9, #16, r9, asr #7
+
+ strh r8, [r1], r3 ; result is transposed and stored
+
+ ldrneb r6, [r0] ; load source data
+ strh r9, [r1], r3
+
+ ldrneb r7, [r0, #1]
+ ldrneb r8, [r0, #2]
+
+ bne bil_width_loop_1st_v6
+
+ add r0, r0, r2 ; move to next input row
+ subs r12, r12, #1
+
+ add r9, r2, r4, lsl #1 ; adding back block width
+ pld [r0, r9] ; preload next row
+
+ add r11, r11, #2 ; move over to next column
+ mov r1, r11
+
+ bne bil_height_loop_1st_v6
+
+ ldmia sp!, {r4 - r11, pc}
+
+|bil_null_1st_filter|
+|bil_height_loop_null_1st|
+ mov lr, r4, lsr #2 ; loop counter
+
+|bil_width_loop_null_1st|
+ ldrb r6, [r0] ; load data
+ ldrb r7, [r0, #1]
+ ldrb r8, [r0, #2]
+ ldrb r9, [r0, #3]
+
+ strh r6, [r1], r3 ; store it to immediate buffer
+ add r0, r0, #4
+ strh r7, [r1], r3
+ subs lr, lr, #1
+ strh r8, [r1], r3
+ strh r9, [r1], r3
+
+ bne bil_width_loop_null_1st
+
+ subs r12, r12, #1
+ add r0, r0, r2 ; move to next input line
+ add r11, r11, #2 ; move over to next column
+ mov r1, r11
+
+ bne bil_height_loop_null_1st
+
+ ldmia sp!, {r4 - r11, pc}
+
+ ENDP ; |vp9_filter_block2d_bil_first_pass_armv6|
+
+
+;---------------------------------
+; r0 unsigned short *src_ptr,
+; r1 unsigned char *dst_ptr,
+; r2 int dst_pitch,
+; r3 unsigned int height,
+; stack unsigned int width,
+; stack const short *vp9_filter
+;---------------------------------
+|vp9_filter_block2d_bil_second_pass_armv6| PROC
+ stmdb sp!, {r4 - r11, lr}
+
+ ldr r11, [sp, #40] ; vp9_filter address
+ ldr r4, [sp, #36] ; width
+
+ ldr r5, [r11] ; load up filter coefficients
+ mov r12, r4 ; outer-loop counter = width, since we work on transposed data matrix
+ mov r11, r1
+
+ cmp r5, #128 ; if filter coef = 128, then skip the filter
+ beq bil_null_2nd_filter
+
+|bil_height_loop_2nd|
+ ldr r6, [r0] ; load the data
+ ldr r8, [r0, #4]
+ ldrh r10, [r0, #8]
+ mov lr, r3, lsr #2 ; loop counter
+
+|bil_width_loop_2nd|
+ pkhtb r7, r6, r8 ; src[1] | src[2]
+ pkhtb r9, r8, r10 ; src[3] | src[4]
+
+ smuad r6, r6, r5 ; apply filter
+ smuad r8, r8, r5 ; apply filter
+
+ subs lr, lr, #1
+
+ smuadx r7, r7, r5 ; apply filter
+ smuadx r9, r9, r5 ; apply filter
+
+ add r0, r0, #8
+
+ add r6, r6, #0x40 ; round_shift_and_clamp
+ add r7, r7, #0x40
+ usat r6, #8, r6, asr #7
+ usat r7, #8, r7, asr #7
+ strb r6, [r1], r2 ; the result is transposed back and stored
+
+ add r8, r8, #0x40 ; round_shift_and_clamp
+ strb r7, [r1], r2
+ add r9, r9, #0x40
+ usat r8, #8, r8, asr #7
+ usat r9, #8, r9, asr #7
+ strb r8, [r1], r2 ; the result is transposed back and stored
+
+ ldrne r6, [r0] ; load data
+ strb r9, [r1], r2
+ ldrne r8, [r0, #4]
+ ldrneh r10, [r0, #8]
+
+ bne bil_width_loop_2nd
+
+ subs r12, r12, #1
+ add r0, r0, #4 ; update src for next row
+ add r11, r11, #1
+ mov r1, r11
+
+ bne bil_height_loop_2nd
+ ldmia sp!, {r4 - r11, pc}
+
+|bil_null_2nd_filter|
+|bil_height_loop_null_2nd|
+ mov lr, r3, lsr #2
+
+|bil_width_loop_null_2nd|
+ ldr r6, [r0], #4 ; load data
+ subs lr, lr, #1
+ ldr r8, [r0], #4
+
+ strb r6, [r1], r2 ; store data
+ mov r7, r6, lsr #16
+ strb r7, [r1], r2
+ mov r9, r8, lsr #16
+ strb r8, [r1], r2
+ strb r9, [r1], r2
+
+ bne bil_width_loop_null_2nd
+
+ subs r12, r12, #1
+ add r0, r0, #4
+ add r11, r11, #1
+ mov r1, r11
+
+ bne bil_height_loop_null_2nd
+
+ ldmia sp!, {r4 - r11, pc}
+ ENDP ; |vp9_filter_block2d_bil_second_pass_armv6|
+
+ END
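(The two assembly routines above implement a separable bilinear filter; the notable design choice, called out in their comments, is that the first pass stores its 16-bit intermediate transposed so that the second pass can also walk memory linearly and transpose the result back on store. A simplified C sketch of the first pass under that assumption follows; it omits the extra padding element the assembly adds for alignment and is not part of the patch.)

    /* Simplified sketch of the first (horizontal) bilinear pass above.
     * Each output is (src[x]*f0 + src[x+1]*f1 + 64) >> 7 and is written
     * transposed, i.e. with a column stride equal to the number of rows. */
    static void bil_first_pass_c(const unsigned char *src, unsigned short *dst,
                                 unsigned int src_pitch, unsigned int height,
                                 unsigned int width, const short *filter) {
      unsigned int r, c;

      for (r = 0; r < height; r++) {
        for (c = 0; c < width; c++)
          dst[c * height + r] =                      /* transposed store */
              (unsigned short)((src[c] * filter[0] +
                                src[c + 1] * filter[1] + 64) >> 7);
        src += src_pitch;
      }
    }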
--- /dev/null
+++ b/vp9/common/arm/armv6/copymem16x16_v6.asm
@@ -1,0 +1,186 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+ EXPORT |vp9_copy_mem16x16_v6|
+ ; ARM
+ ; REQUIRE8
+ ; PRESERVE8
+
+ AREA Block, CODE, READONLY ; name this block of code
+;void copy_mem16x16_v6( unsigned char *src, int src_stride, unsigned char *dst, int dst_stride)
+;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
+|vp9_copy_mem16x16_v6| PROC
+ stmdb sp!, {r4 - r7}
+ ;push {r4-r7}
+
+ ;preload
+ pld [r0, #31] ; preload for next 16x16 block
+
+ ands r4, r0, #15
+ beq copy_mem16x16_fast
+
+ ands r4, r0, #7
+ beq copy_mem16x16_8
+
+ ands r4, r0, #3
+ beq copy_mem16x16_4
+
+ ;copy one byte each time
+ ldrb r4, [r0]
+ ldrb r5, [r0, #1]
+ ldrb r6, [r0, #2]
+ ldrb r7, [r0, #3]
+
+ mov r12, #16
+
+copy_mem16x16_1_loop
+ strb r4, [r2]
+ strb r5, [r2, #1]
+ strb r6, [r2, #2]
+ strb r7, [r2, #3]
+
+ ldrb r4, [r0, #4]
+ ldrb r5, [r0, #5]
+ ldrb r6, [r0, #6]
+ ldrb r7, [r0, #7]
+
+ subs r12, r12, #1
+
+ strb r4, [r2, #4]
+ strb r5, [r2, #5]
+ strb r6, [r2, #6]
+ strb r7, [r2, #7]
+
+ ldrb r4, [r0, #8]
+ ldrb r5, [r0, #9]
+ ldrb r6, [r0, #10]
+ ldrb r7, [r0, #11]
+
+ strb r4, [r2, #8]
+ strb r5, [r2, #9]
+ strb r6, [r2, #10]
+ strb r7, [r2, #11]
+
+ ldrb r4, [r0, #12]
+ ldrb r5, [r0, #13]
+ ldrb r6, [r0, #14]
+ ldrb r7, [r0, #15]
+
+ add r0, r0, r1
+
+ strb r4, [r2, #12]
+ strb r5, [r2, #13]
+ strb r6, [r2, #14]
+ strb r7, [r2, #15]
+
+ add r2, r2, r3
+
+ ldrneb r4, [r0]
+ ldrneb r5, [r0, #1]
+ ldrneb r6, [r0, #2]
+ ldrneb r7, [r0, #3]
+
+ pld [r0, #31] ; preload for next 16x16 block
+
+ bne copy_mem16x16_1_loop
+
+ ldmia sp!, {r4 - r7}
+ ;pop {r4-r7}
+ mov pc, lr
+
+;copy 4 bytes each time
+copy_mem16x16_4
+ ldr r4, [r0]
+ ldr r5, [r0, #4]
+ ldr r6, [r0, #8]
+ ldr r7, [r0, #12]
+
+ mov r12, #16
+
+copy_mem16x16_4_loop
+ subs r12, r12, #1
+ add r0, r0, r1
+
+ str r4, [r2]
+ str r5, [r2, #4]
+ str r6, [r2, #8]
+ str r7, [r2, #12]
+
+ add r2, r2, r3
+
+ ldrne r4, [r0]
+ ldrne r5, [r0, #4]
+ ldrne r6, [r0, #8]
+ ldrne r7, [r0, #12]
+
+ pld [r0, #31] ; preload for next 16x16 block
+
+ bne copy_mem16x16_4_loop
+
+ ldmia sp!, {r4 - r7}
+ ;pop {r4-r7}
+ mov pc, lr
+
+;copy 8 bytes each time
+copy_mem16x16_8
+ sub r1, r1, #16
+ sub r3, r3, #16
+
+ mov r12, #16
+
+copy_mem16x16_8_loop
+ ldmia r0!, {r4-r5}
+ ;ldm r0, {r4-r5}
+ ldmia r0!, {r6-r7}
+
+ add r0, r0, r1
+
+ stmia r2!, {r4-r5}
+ subs r12, r12, #1
+ ;stm r2, {r4-r5}
+ stmia r2!, {r6-r7}
+
+ add r2, r2, r3
+
+ pld [r0, #31] ; preload for next 16x16 block
+ bne copy_mem16x16_8_loop
+
+ ldmia sp!, {r4 - r7}
+ ;pop {r4-r7}
+ mov pc, lr
+
+;copy 16 bytes each time
+copy_mem16x16_fast
+ ;sub r1, r1, #16
+ ;sub r3, r3, #16
+
+ mov r12, #16
+
+copy_mem16x16_fast_loop
+ ldmia r0, {r4-r7}
+ ;ldm r0, {r4-r7}
+ add r0, r0, r1
+
+ subs r12, r12, #1
+ stmia r2, {r4-r7}
+ ;stm r2, {r4-r7}
+ add r2, r2, r3
+
+ pld [r0, #31] ; preload for next 16x16 block
+ bne copy_mem16x16_fast_loop
+
+ ldmia sp!, {r4 - r7}
+ ;pop {r4-r7}
+ mov pc, lr
+
+ ENDP ; |vp9_copy_mem16x16_v6|
+
+ END
--- /dev/null
+++ b/vp9/common/arm/armv6/copymem8x4_v6.asm
@@ -1,0 +1,128 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+ EXPORT |vp9_copy_mem8x4_v6|
+ ; ARM
+ ; REQUIRE8
+ ; PRESERVE8
+
+ AREA Block, CODE, READONLY ; name this block of code
+;void vp9_copy_mem8x4_v6( unsigned char *src, int src_stride, unsigned char *dst, int dst_stride)
+;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
+|vp9_copy_mem8x4_v6| PROC
+ ;push {r4-r5}
+ stmdb sp!, {r4-r5}
+
+ ;preload
+ pld [r0]
+ pld [r0, r1]
+ pld [r0, r1, lsl #1]
+
+ ands r4, r0, #7
+ beq copy_mem8x4_fast
+
+ ands r4, r0, #3
+ beq copy_mem8x4_4
+
+ ;copy 1 byte each time
+ ldrb r4, [r0]
+ ldrb r5, [r0, #1]
+
+ mov r12, #4
+
+copy_mem8x4_1_loop
+ strb r4, [r2]
+ strb r5, [r2, #1]
+
+ ldrb r4, [r0, #2]
+ ldrb r5, [r0, #3]
+
+ subs r12, r12, #1
+
+ strb r4, [r2, #2]
+ strb r5, [r2, #3]
+
+ ldrb r4, [r0, #4]
+ ldrb r5, [r0, #5]
+
+ strb r4, [r2, #4]
+ strb r5, [r2, #5]
+
+ ldrb r4, [r0, #6]
+ ldrb r5, [r0, #7]
+
+ add r0, r0, r1
+
+ strb r4, [r2, #6]
+ strb r5, [r2, #7]
+
+ add r2, r2, r3
+
+ ldrneb r4, [r0]
+ ldrneb r5, [r0, #1]
+
+ bne copy_mem8x4_1_loop
+
+ ldmia sp!, {r4 - r5}
+ ;pop {r4-r5}
+ mov pc, lr
+
+;copy 4 bytes each time
+copy_mem8x4_4
+ ldr r4, [r0]
+ ldr r5, [r0, #4]
+
+ mov r12, #4
+
+copy_mem8x4_4_loop
+ subs r12, r12, #1
+ add r0, r0, r1
+
+ str r4, [r2]
+ str r5, [r2, #4]
+
+ add r2, r2, r3
+
+ ldrne r4, [r0]
+ ldrne r5, [r0, #4]
+
+ bne copy_mem8x4_4_loop
+
+ ldmia sp!, {r4-r5}
+ ;pop {r4-r5}
+ mov pc, lr
+
+;copy 8 bytes each time
+copy_mem8x4_fast
+ ;sub r1, r1, #8
+ ;sub r3, r3, #8
+
+ mov r12, #4
+
+copy_mem8x4_fast_loop
+ ldmia r0, {r4-r5}
+ ;ldm r0, {r4-r5}
+ add r0, r0, r1
+
+ subs r12, r12, #1
+ stmia r2, {r4-r5}
+ ;stm r2, {r4-r5}
+ add r2, r2, r3
+
+ bne copy_mem8x4_fast_loop
+
+ ldmia sp!, {r4-r5}
+ ;pop {r4-r5}
+ mov pc, lr
+
+ ENDP ; |vp9_copy_mem8x4_v6|
+
+ END
--- /dev/null
+++ b/vp9/common/arm/armv6/copymem8x8_v6.asm
@@ -1,0 +1,128 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+ EXPORT |vp9_copy_mem8x8_v6|
+ ; ARM
+ ; REQUIRE8
+ ; PRESERVE8
+
+ AREA Block, CODE, READONLY ; name this block of code
+;void copy_mem8x8_v6( unsigned char *src, int src_stride, unsigned char *dst, int dst_stride)
+;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
+|vp9_copy_mem8x8_v6| PROC
+ ;push {r4-r5}
+ stmdb sp!, {r4-r5}
+
+ ;preload
+ pld [r0]
+ pld [r0, r1]
+ pld [r0, r1, lsl #1]
+
+ ands r4, r0, #7
+ beq copy_mem8x8_fast
+
+ ands r4, r0, #3
+ beq copy_mem8x8_4
+
+ ;copy 1 byte each time
+ ldrb r4, [r0]
+ ldrb r5, [r0, #1]
+
+ mov r12, #8
+
+copy_mem8x8_1_loop
+ strb r4, [r2]
+ strb r5, [r2, #1]
+
+ ldrb r4, [r0, #2]
+ ldrb r5, [r0, #3]
+
+ subs r12, r12, #1
+
+ strb r4, [r2, #2]
+ strb r5, [r2, #3]
+
+ ldrb r4, [r0, #4]
+ ldrb r5, [r0, #5]
+
+ strb r4, [r2, #4]
+ strb r5, [r2, #5]
+
+ ldrb r4, [r0, #6]
+ ldrb r5, [r0, #7]
+
+ add r0, r0, r1
+
+ strb r4, [r2, #6]
+ strb r5, [r2, #7]
+
+ add r2, r2, r3
+
+ ldrneb r4, [r0]
+ ldrneb r5, [r0, #1]
+
+ bne copy_mem8x8_1_loop
+
+ ldmia sp!, {r4 - r5}
+ ;pop {r4-r5}
+ mov pc, lr
+
+;copy 4 bytes each time
+copy_mem8x8_4
+ ldr r4, [r0]
+ ldr r5, [r0, #4]
+
+ mov r12, #8
+
+copy_mem8x8_4_loop
+ subs r12, r12, #1
+ add r0, r0, r1
+
+ str r4, [r2]
+ str r5, [r2, #4]
+
+ add r2, r2, r3
+
+ ldrne r4, [r0]
+ ldrne r5, [r0, #4]
+
+ bne copy_mem8x8_4_loop
+
+ ldmia sp!, {r4 - r5}
+ ;pop {r4-r5}
+ mov pc, lr
+
+;copy 8 bytes each time
+copy_mem8x8_fast
+ ;sub r1, r1, #8
+ ;sub r3, r3, #8
+
+ mov r12, #8
+
+copy_mem8x8_fast_loop
+ ldmia r0, {r4-r5}
+ ;ldm r0, {r4-r5}
+ add r0, r0, r1
+
+ subs r12, r12, #1
+ stmia r2, {r4-r5}
+ ;stm r2, {r4-r5}
+ add r2, r2, r3
+
+ bne copy_mem8x8_fast_loop
+
+ ldmia sp!, {r4-r5}
+ ;pop {r4-r5}
+ mov pc, lr
+
+ ENDP ; |vp9_copy_mem8x8_v6|
+
+ END
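(The three copy routines above, vp9_copy_mem16x16_v6, vp9_copy_mem8x4_v6 and vp9_copy_mem8x8_v6, differ only in block size and in how aggressively they batch loads based on source alignment; the observable behaviour is a plain strided copy. A C reference of that behaviour, with a hypothetical name, is below; it is not part of the patch.)

    #include <string.h>

    /* Hypothetical C reference for the strided block copies above: the ARM
     * versions pick 1-, 4-, 8- or 16-byte moves based on source alignment,
     * but the result is simply height rows of width bytes. */
    static void copy_mem_c(const unsigned char *src, int src_stride,
                           unsigned char *dst, int dst_stride,
                           int width, int height) {
      int r;

      for (r = 0; r < height; r++) {
        memcpy(dst, src, (size_t)width);
        src += src_stride;
        dst += dst_stride;
      }
    }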
--- /dev/null
+++ b/vp9/common/arm/armv6/dc_only_idct_add_v6.asm
@@ -1,0 +1,67 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license and patent
+; grant that can be found in the LICENSE file in the root of the source
+; tree. All contributing project authors may be found in the AUTHORS
+; file in the root of the source tree.
+;
+
+ EXPORT |vp8_dc_only_idct_add_v6|
+
+ AREA |.text|, CODE, READONLY
+
+;void vp8_dc_only_idct_add_v6(short input_dc, unsigned char *pred_ptr,
+; unsigned char *dst_ptr, int pitch, int stride)
+; r0 input_dc
+; r1 pred_ptr
+; r2 dest_ptr
+; r3 pitch
+; sp stride
+
+|vp8_dc_only_idct_add_v6| PROC
+ stmdb sp!, {r4 - r7, lr}
+
+ add r0, r0, #4 ; input_dc += 4
+ ldr r12, c0x0000FFFF
+ ldr r4, [r1], r3
+ ldr r6, [r1], r3
+ and r0, r12, r0, asr #3 ; (input_dc >> 3) & 0xFFFF
+ ldr lr, [sp, #20]
+ orr r0, r0, r0, lsl #16 ; a1 | a1
+
+ uxtab16 r5, r0, r4 ; a1+2 | a1+0
+ uxtab16 r4, r0, r4, ror #8 ; a1+3 | a1+1
+ uxtab16 r7, r0, r6
+ uxtab16 r6, r0, r6, ror #8
+ usat16 r5, #8, r5
+ usat16 r4, #8, r4
+ usat16 r7, #8, r7
+ usat16 r6, #8, r6
+ orr r5, r5, r4, lsl #8
+ orr r7, r7, r6, lsl #8
+ ldr r4, [r1], r3
+ ldr r6, [r1]
+ str r5, [r2], lr
+ str r7, [r2], lr
+
+ uxtab16 r5, r0, r4
+ uxtab16 r4, r0, r4, ror #8
+ uxtab16 r7, r0, r6
+ uxtab16 r6, r0, r6, ror #8
+ usat16 r5, #8, r5
+ usat16 r4, #8, r4
+ usat16 r7, #8, r7
+ usat16 r6, #8, r6
+ orr r5, r5, r4, lsl #8
+ orr r7, r7, r6, lsl #8
+ str r5, [r2], lr
+ str r7, [r2]
+
+ ldmia sp!, {r4 - r7, pc}
+
+ ENDP ; |vp8_dc_only_idct_add_v6|
+
+; Constant Pool
+c0x0000FFFF DCD 0x0000FFFF
+ END
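(The routine above handles the case where only the DC coefficient of a 4x4 block survives: the inverse transform then degenerates to adding one rounded value, (input_dc + 4) >> 3, to every predictor pixel with saturation to 8 bits. A plain C sketch of that behaviour follows; the snippet is illustrative only and not part of the patch.)

    /* C sketch of the DC-only IDCT-and-add above: add the rounded DC value
     * to each pixel of the 4x4 predictor and clamp to 8 bits. */
    static void dc_only_idct_add_c(short input_dc, const unsigned char *pred,
                                   unsigned char *dst, int pitch, int stride) {
      int a1 = (input_dc + 4) >> 3;
      int r, c;

      for (r = 0; r < 4; r++) {
        for (c = 0; c < 4; c++) {
          int v = pred[c] + a1;
          dst[c] = (unsigned char)(v < 0 ? 0 : (v > 255 ? 255 : v));
        }
        pred += pitch;
        dst += stride;
      }
    }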
--- /dev/null
+++ b/vp9/common/arm/armv6/filter_v6.asm
@@ -1,0 +1,624 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+ EXPORT |vp9_filter_block2d_first_pass_armv6|
+ EXPORT |vp9_filter_block2d_first_pass_16x16_armv6|
+ EXPORT |vp9_filter_block2d_first_pass_8x8_armv6|
+ EXPORT |vp9_filter_block2d_second_pass_armv6|
+ EXPORT |vp9_filter4_block2d_second_pass_armv6|
+ EXPORT |vp9_filter_block2d_first_pass_only_armv6|
+ EXPORT |vp9_filter_block2d_second_pass_only_armv6|
+
+ AREA |.text|, CODE, READONLY ; name this block of code
+;-------------------------------------
+; r0 unsigned char *src_ptr
+; r1 short *output_ptr
+; r2 unsigned int src_pixels_per_line
+; r3 unsigned int output_width
+; stack unsigned int output_height
+; stack const short *vp9_filter
+;-------------------------------------
+; Filter the input and write it into the output array. Applies the 6-tap FIR filter with
+; 1-byte input values and 2-byte output values.
+|vp9_filter_block2d_first_pass_armv6| PROC
+ stmdb sp!, {r4 - r11, lr}
+
+ ldr r11, [sp, #40] ; vp9_filter address
+ ldr r7, [sp, #36] ; output height
+
+ sub r2, r2, r3 ; inside loop increments input array,
+ ; so the height loop only needs to add
+ ; r2 - width to the input pointer
+
+ mov r3, r3, lsl #1 ; multiply width by 2 because using shorts
+ add r12, r3, #16 ; square off the output
+ sub sp, sp, #4
+
+ ldr r4, [r11] ; load up packed filter coefficients
+ ldr r5, [r11, #4]
+ ldr r6, [r11, #8]
+
+ str r1, [sp] ; push destination to stack
+ mov r7, r7, lsl #16 ; height is top part of counter
+
+; six tap filter
+|height_loop_1st_6|
+ ldrb r8, [r0, #-2] ; load source data
+ ldrb r9, [r0, #-1]
+ ldrb r10, [r0], #2
+ orr r7, r7, r3, lsr #2 ; construct loop counter
+
+|width_loop_1st_6|
+ ldrb r11, [r0, #-1]
+
+ pkhbt lr, r8, r9, lsl #16 ; r9 | r8
+ pkhbt r8, r9, r10, lsl #16 ; r10 | r9
+
+ ldrb r9, [r0]
+
+ smuad lr, lr, r4 ; apply the filter
+ pkhbt r10, r10, r11, lsl #16 ; r11 | r10
+ smuad r8, r8, r4
+ pkhbt r11, r11, r9, lsl #16 ; r9 | r11
+
+ smlad lr, r10, r5, lr
+ ldrb r10, [r0, #1]
+ smlad r8, r11, r5, r8
+ ldrb r11, [r0, #2]
+
+ sub r7, r7, #1
+
+ pkhbt r9, r9, r10, lsl #16 ; r10 | r9
+ pkhbt r10, r10, r11, lsl #16 ; r11 | r10
+
+ smlad lr, r9, r6, lr
+ smlad r11, r10, r6, r8
+
+ ands r10, r7, #0xff ; test loop counter
+
+ add lr, lr, #0x40 ; round_shift_and_clamp
+ ldrneb r8, [r0, #-2] ; load data for next loop
+ usat lr, #8, lr, asr #7
+ add r11, r11, #0x40
+ ldrneb r9, [r0, #-1]
+ usat r11, #8, r11, asr #7
+
+ strh lr, [r1], r12 ; result is transposed and stored, which
+ ; will make second pass filtering easier.
+ ldrneb r10, [r0], #2
+ strh r11, [r1], r12
+
+ bne width_loop_1st_6
+
+ ldr r1, [sp] ; load and update dst address
+ subs r7, r7, #0x10000
+ add r0, r0, r2 ; move to next input line
+
+ add r1, r1, #2 ; move over to next column
+ str r1, [sp]
+
+ bne height_loop_1st_6
+
+ add sp, sp, #4
+ ldmia sp!, {r4 - r11, pc}
+
+ ENDP
+
+; --------------------------
+; 16x16 version
+; -----------------------------
+|vp9_filter_block2d_first_pass_16x16_armv6| PROC
+ stmdb sp!, {r4 - r11, lr}
+
+ ldr r11, [sp, #40] ; vp9_filter address
+ ldr r7, [sp, #36] ; output height
+
+ add r4, r2, #18 ; preload next row
+ pld [r0, r4]
+
+ sub r2, r2, r3 ; inside loop increments input array,
+ ; so the height loop only needs to add
+ ; r2 - width to the input pointer
+
+ mov r3, r3, lsl #1 ; multiply width by 2 because using shorts
+ add r12, r3, #16 ; square off the output
+ sub sp, sp, #4
+
+ ldr r4, [r11] ; load up packed filter coefficients
+ ldr r5, [r11, #4]
+ ldr r6, [r11, #8]
+
+ str r1, [sp] ; push destination to stack
+ mov r7, r7, lsl #16 ; height is top part of counter
+
+; six tap filter
+|height_loop_1st_16_6|
+ ldrb r8, [r0, #-2] ; load source data
+ ldrb r9, [r0, #-1]
+ ldrb r10, [r0], #2
+ orr r7, r7, r3, lsr #2 ; construct loop counter
+
+|width_loop_1st_16_6|
+ ldrb r11, [r0, #-1]
+
+ pkhbt lr, r8, r9, lsl #16 ; r9 | r8
+ pkhbt r8, r9, r10, lsl #16 ; r10 | r9
+
+ ldrb r9, [r0]
+
+ smuad lr, lr, r4 ; apply the filter
+ pkhbt r10, r10, r11, lsl #16 ; r11 | r10
+ smuad r8, r8, r4
+ pkhbt r11, r11, r9, lsl #16 ; r9 | r11
+
+ smlad lr, r10, r5, lr
+ ldrb r10, [r0, #1]
+ smlad r8, r11, r5, r8
+ ldrb r11, [r0, #2]
+
+ sub r7, r7, #1
+
+ pkhbt r9, r9, r10, lsl #16 ; r10 | r9
+ pkhbt r10, r10, r11, lsl #16 ; r11 | r10
+
+ smlad lr, r9, r6, lr
+ smlad r11, r10, r6, r8
+
+ ands r10, r7, #0xff ; test loop counter
+
+ add lr, lr, #0x40 ; round_shift_and_clamp
+ ldrneb r8, [r0, #-2] ; load data for next loop
+ usat lr, #8, lr, asr #7
+ add r11, r11, #0x40
+ ldrneb r9, [r0, #-1]
+ usat r11, #8, r11, asr #7
+
+ strh lr, [r1], r12 ; result is transposed and stored, which
+ ; will make second pass filtering easier.
+ ldrneb r10, [r0], #2
+ strh r11, [r1], r12
+
+ bne width_loop_1st_16_6
+
+ ldr r1, [sp] ; load and update dst address
+ subs r7, r7, #0x10000
+ add r0, r0, r2 ; move to next input line
+
+ add r11, r2, #34 ; adding back block width(=16)
+ pld [r0, r11] ; preload next row
+
+ add r1, r1, #2 ; move over to next column
+ str r1, [sp]
+
+ bne height_loop_1st_16_6
+
+ add sp, sp, #4
+ ldmia sp!, {r4 - r11, pc}
+
+ ENDP
+
+; --------------------------
+; 8x8 version
+; -----------------------------
+|vp9_filter_block2d_first_pass_8x8_armv6| PROC
+ stmdb sp!, {r4 - r11, lr}
+
+ ldr r11, [sp, #40] ; vp9_filter address
+ ldr r7, [sp, #36] ; output height
+
+ add r4, r2, #10 ; preload next row
+ pld [r0, r4]
+
+ sub r2, r2, r3 ; inside loop increments input array,
+ ; so the height loop only needs to add
+ ; r2 - width to the input pointer
+
+ mov r3, r3, lsl #1 ; multiply width by 2 because using shorts
+ add r12, r3, #16 ; square off the output
+ sub sp, sp, #4
+
+ ldr r4, [r11] ; load up packed filter coefficients
+ ldr r5, [r11, #4]
+ ldr r6, [r11, #8]
+
+ str r1, [sp] ; push destination to stack
+ mov r7, r7, lsl #16 ; height is top part of counter
+
+; six tap filter
+|height_loop_1st_8_6|
+ ldrb r8, [r0, #-2] ; load source data
+ ldrb r9, [r0, #-1]
+ ldrb r10, [r0], #2
+ orr r7, r7, r3, lsr #2 ; construct loop counter
+
+|width_loop_1st_8_6|
+ ldrb r11, [r0, #-1]
+
+ pkhbt lr, r8, r9, lsl #16 ; r9 | r8
+ pkhbt r8, r9, r10, lsl #16 ; r10 | r9
+
+ ldrb r9, [r0]
+
+ smuad lr, lr, r4 ; apply the filter
+ pkhbt r10, r10, r11, lsl #16 ; r11 | r10
+ smuad r8, r8, r4
+ pkhbt r11, r11, r9, lsl #16 ; r9 | r11
+
+ smlad lr, r10, r5, lr
+ ldrb r10, [r0, #1]
+ smlad r8, r11, r5, r8
+ ldrb r11, [r0, #2]
+
+ sub r7, r7, #1
+
+ pkhbt r9, r9, r10, lsl #16 ; r10 | r9
+ pkhbt r10, r10, r11, lsl #16 ; r11 | r10
+
+ smlad lr, r9, r6, lr
+ smlad r11, r10, r6, r8
+
+ ands r10, r7, #0xff ; test loop counter
+
+ add lr, lr, #0x40 ; round_shift_and_clamp
+ ldrneb r8, [r0, #-2] ; load data for next loop
+ usat lr, #8, lr, asr #7
+ add r11, r11, #0x40
+ ldrneb r9, [r0, #-1]
+ usat r11, #8, r11, asr #7
+
+ strh lr, [r1], r12 ; result is transposed and stored, which
+ ; will make second pass filtering easier.
+ ldrneb r10, [r0], #2
+ strh r11, [r1], r12
+
+ bne width_loop_1st_8_6
+
+ ldr r1, [sp] ; load and update dst address
+ subs r7, r7, #0x10000
+ add r0, r0, r2 ; move to next input line
+
+ add r11, r2, #18 ; adding back block width(=8)
+    pld     [r0, r11]               ; preload next row
+
+ add r1, r1, #2 ; move over to next column
+ str r1, [sp]
+
+ bne height_loop_1st_8_6
+
+ add sp, sp, #4
+ ldmia sp!, {r4 - r11, pc}
+
+ ENDP
+
+;---------------------------------
+; r0 short *src_ptr,
+; r1 unsigned char *output_ptr,
+; r2 unsigned int output_pitch,
+; r3 unsigned int cnt,
+; stack const short *vp9_filter
+;---------------------------------
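+; A rough per-pixel C view of this pass (illustrative sketch only; the
+; transposed intermediate layout and dual-pixel register scheduling are
+; handled in the assembly below):
+;
+;   int t, sum = 0;
+;   for (t = 0; t < 6; t++)               /* vp9_filter: 6 packed 16-bit taps */
+;       sum += src_ptr[t] * vp9_filter[t];
+;   sum = (sum + 64) >> 7;                /* add #0x40 / usat #8, asr #7 below */
+;   *output_ptr = sum < 0 ? 0 : sum > 255 ? 255 : sum;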
+|vp9_filter_block2d_second_pass_armv6| PROC
+ stmdb sp!, {r4 - r11, lr}
+
+ ldr r11, [sp, #36] ; vp9_filter address
+ sub sp, sp, #4
+ mov r7, r3, lsl #16 ; height is top part of counter
+ str r1, [sp] ; push destination to stack
+
+ ldr r4, [r11] ; load up packed filter coefficients
+ ldr r5, [r11, #4]
+ ldr r6, [r11, #8]
+
+ pkhbt r12, r5, r4 ; pack the filter differently
+ pkhbt r11, r6, r5
+
+ sub r0, r0, #4 ; offset input buffer
+
+|height_loop_2nd|
+ ldr r8, [r0] ; load the data
+ ldr r9, [r0, #4]
+ orr r7, r7, r3, lsr #1 ; loop counter
+
+|width_loop_2nd|
+ smuad lr, r4, r8 ; apply filter
+ sub r7, r7, #1
+ smulbt r8, r4, r8
+
+ ldr r10, [r0, #8]
+
+ smlad lr, r5, r9, lr
+ smladx r8, r12, r9, r8
+
+ ldrh r9, [r0, #12]
+
+ smlad lr, r6, r10, lr
+ smladx r8, r11, r10, r8
+
+ add r0, r0, #4
+ smlatb r10, r6, r9, r8
+
+ add lr, lr, #0x40 ; round_shift_and_clamp
+ ands r8, r7, #0xff
+ usat lr, #8, lr, asr #7
+ add r10, r10, #0x40
+ strb lr, [r1], r2 ; the result is transposed back and stored
+ usat r10, #8, r10, asr #7
+
+ ldrne r8, [r0] ; load data for next loop
+ ldrne r9, [r0, #4]
+ strb r10, [r1], r2
+
+ bne width_loop_2nd
+
+ ldr r1, [sp] ; update dst for next loop
+ subs r7, r7, #0x10000
+    add     r0, r0, #16                 ; update src for next loop
+ add r1, r1, #1
+ str r1, [sp]
+
+ bne height_loop_2nd
+
+ add sp, sp, #4
+ ldmia sp!, {r4 - r11, pc}
+
+ ENDP
+
+;---------------------------------
+; r0 short *src_ptr,
+; r1 unsigned char *output_ptr,
+; r2 unsigned int output_pitch,
+; r3 unsigned int cnt,
+; stack const short *vp9_filter
+;---------------------------------
+|vp9_filter4_block2d_second_pass_armv6| PROC
+ stmdb sp!, {r4 - r11, lr}
+
+ ldr r11, [sp, #36] ; vp9_filter address
+ mov r7, r3, lsl #16 ; height is top part of counter
+
+ ldr r4, [r11] ; load up packed filter coefficients
+ add lr, r1, r3 ; save final destination pointer
+ ldr r5, [r11, #4]
+ ldr r6, [r11, #8]
+
+ pkhbt r12, r5, r4 ; pack the filter differently
+ pkhbt r11, r6, r5
+ mov r4, #0x40 ; rounding factor (for smlad{x})
+
+|height_loop_2nd_4|
+ ldrd r8, [r0, #-4] ; load the data
+ orr r7, r7, r3, lsr #1 ; loop counter
+
+|width_loop_2nd_4|
+ ldr r10, [r0, #4]!
+ smladx r6, r9, r12, r4 ; apply filter
+ pkhbt r8, r9, r8
+ smlad r5, r8, r12, r4
+ pkhbt r8, r10, r9
+ smladx r6, r10, r11, r6
+ sub r7, r7, #1
+ smlad r5, r8, r11, r5
+
+ mov r8, r9 ; shift the data for the next loop
+ mov r9, r10
+
+ usat r6, #8, r6, asr #7 ; shift and clamp
+ usat r5, #8, r5, asr #7
+
+ strb r5, [r1], r2 ; the result is transposed back and stored
+ tst r7, #0xff
+ strb r6, [r1], r2
+
+ bne width_loop_2nd_4
+
+ subs r7, r7, #0x10000
+ add r0, r0, #16 ; update src for next loop
+ sub r1, lr, r7, lsr #16 ; update dst for next loop
+
+ bne height_loop_2nd_4
+
+ ldmia sp!, {r4 - r11, pc}
+
+ ENDP
+
+;------------------------------------
+; r0 unsigned char *src_ptr
+; r1 unsigned char *output_ptr,
+; r2 unsigned int src_pixels_per_line
+; r3 unsigned int cnt,
+; stack unsigned int output_pitch,
+; stack const short *vp9_filter
+;------------------------------------
+|vp9_filter_block2d_first_pass_only_armv6| PROC
+ stmdb sp!, {r4 - r11, lr}
+
+    add     r7, r2, r3              ; preload next row
+ add r7, r7, #2
+ pld [r0, r7]
+
+ ldr r4, [sp, #36] ; output pitch
+ ldr r11, [sp, #40] ; HFilter address
+ sub sp, sp, #8
+
+ mov r7, r3
+ sub r2, r2, r3 ; inside loop increments input array,
+ ; so the height loop only needs to add
+ ; r2 - width to the input pointer
+
+ sub r4, r4, r3
+ str r4, [sp] ; save modified output pitch
+ str r2, [sp, #4]
+
+ mov r2, #0x40
+
+ ldr r4, [r11] ; load up packed filter coefficients
+ ldr r5, [r11, #4]
+ ldr r6, [r11, #8]
+
+; six tap filter
+|height_loop_1st_only_6|
+ ldrb r8, [r0, #-2] ; load data
+ ldrb r9, [r0, #-1]
+ ldrb r10, [r0], #2
+
+ mov r12, r3, lsr #1 ; loop counter
+
+|width_loop_1st_only_6|
+ ldrb r11, [r0, #-1]
+
+ pkhbt lr, r8, r9, lsl #16 ; r9 | r8
+ pkhbt r8, r9, r10, lsl #16 ; r10 | r9
+
+ ldrb r9, [r0]
+
+;; smuad lr, lr, r4
+ smlad lr, lr, r4, r2
+ pkhbt r10, r10, r11, lsl #16 ; r11 | r10
+;; smuad r8, r8, r4
+ smlad r8, r8, r4, r2
+ pkhbt r11, r11, r9, lsl #16 ; r9 | r11
+
+ smlad lr, r10, r5, lr
+ ldrb r10, [r0, #1]
+ smlad r8, r11, r5, r8
+ ldrb r11, [r0, #2]
+
+ subs r12, r12, #1
+
+ pkhbt r9, r9, r10, lsl #16 ; r10 | r9
+ pkhbt r10, r10, r11, lsl #16 ; r11 | r10
+
+ smlad lr, r9, r6, lr
+ smlad r10, r10, r6, r8
+
+;; add lr, lr, #0x40 ; round_shift_and_clamp
+ ldrneb r8, [r0, #-2] ; load data for next loop
+ usat lr, #8, lr, asr #7
+;; add r10, r10, #0x40
+ strb lr, [r1], #1 ; store the result
+ usat r10, #8, r10, asr #7
+
+ ldrneb r9, [r0, #-1]
+ strb r10, [r1], #1
+ ldrneb r10, [r0], #2
+
+ bne width_loop_1st_only_6
+
+ ldr lr, [sp] ; load back output pitch
+    ldr     r12, [sp, #4]               ; load back adjusted src stride
+    subs    r7, r7, #1
+    add     r0, r0, r12                 ; update src for next loop
+
+    add     r11, r12, r3                ; preload next row
+ add r11, r11, #2
+ pld [r0, r11]
+
+ add r1, r1, lr ; update dst for next loop
+
+ bne height_loop_1st_only_6
+
+ add sp, sp, #8
+ ldmia sp!, {r4 - r11, pc}
+ ENDP ; |vp9_filter_block2d_first_pass_only_armv6|
+
+
+;------------------------------------
+; r0 unsigned char *src_ptr,
+; r1 unsigned char *output_ptr,
+; r2 unsigned int src_pixels_per_line
+; r3 unsigned int cnt,
+; stack unsigned int output_pitch,
+; stack const short *vp9_filter
+;------------------------------------
+|vp9_filter_block2d_second_pass_only_armv6| PROC
+ stmdb sp!, {r4 - r11, lr}
+
+ ldr r11, [sp, #40] ; VFilter address
+ ldr r12, [sp, #36] ; output pitch
+
+ mov r7, r3, lsl #16 ; height is top part of counter
+ sub r0, r0, r2, lsl #1 ; need 6 elements for filtering, 2 before, 3 after
+
+ sub sp, sp, #8
+
+ ldr r4, [r11] ; load up packed filter coefficients
+ ldr r5, [r11, #4]
+ ldr r6, [r11, #8]
+
+ str r0, [sp] ; save r0 to stack
+ str r1, [sp, #4] ; save dst to stack
+
+; six tap filter
+|width_loop_2nd_only_6|
+ ldrb r8, [r0], r2 ; load data
+ orr r7, r7, r3 ; loop counter
+ ldrb r9, [r0], r2
+ ldrb r10, [r0], r2
+
+|height_loop_2nd_only_6|
+    ; filter the first column in this inner loop, then move to the next column.
+ ldrb r11, [r0], r2
+
+ pkhbt lr, r8, r9, lsl #16 ; r9 | r8
+ pkhbt r8, r9, r10, lsl #16 ; r10 | r9
+
+ ldrb r9, [r0], r2
+
+ smuad lr, lr, r4
+ pkhbt r10, r10, r11, lsl #16 ; r11 | r10
+ smuad r8, r8, r4
+ pkhbt r11, r11, r9, lsl #16 ; r9 | r11
+
+ smlad lr, r10, r5, lr
+ ldrb r10, [r0], r2
+ smlad r8, r11, r5, r8
+ ldrb r11, [r0]
+
+ sub r7, r7, #2
+ sub r0, r0, r2, lsl #2
+
+ pkhbt r9, r9, r10, lsl #16 ; r10 | r9
+ pkhbt r10, r10, r11, lsl #16 ; r11 | r10
+
+ smlad lr, r9, r6, lr
+ smlad r10, r10, r6, r8
+
+ ands r9, r7, #0xff
+
+ add lr, lr, #0x40 ; round_shift_and_clamp
+ ldrneb r8, [r0], r2 ; load data for next loop
+ usat lr, #8, lr, asr #7
+ add r10, r10, #0x40
+ strb lr, [r1], r12 ; store the result for the column
+ usat r10, #8, r10, asr #7
+
+ ldrneb r9, [r0], r2
+ strb r10, [r1], r12
+ ldrneb r10, [r0], r2
+
+ bne height_loop_2nd_only_6
+
+ ldr r0, [sp]
+ ldr r1, [sp, #4]
+ subs r7, r7, #0x10000
+ add r0, r0, #1 ; move to filter next column
+ str r0, [sp]
+ add r1, r1, #1
+ str r1, [sp, #4]
+
+ bne width_loop_2nd_only_6
+
+ add sp, sp, #8
+
+ ldmia sp!, {r4 - r11, pc}
+ ENDP ; |vp9_filter_block2d_second_pass_only_armv6|
+
+ END
--- /dev/null
+++ b/vp9/common/arm/armv6/idct_v6.asm
@@ -1,0 +1,345 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+; r0 r1 r2 r3 r4 r5 r6 r7 r8 r9 r10 r11 r12 r14
+ EXPORT |vp8_short_idct4x4llm_1_v6|
+ EXPORT |vp8_short_idct4x4llm_v6|
+ EXPORT |vp8_short_idct4x4llm_v6_scott|
+ EXPORT |vp8_short_idct4x4llm_v6_dual|
+
+ AREA |.text|, CODE, READONLY
+
+;********************************************************************************
+;* void short_idct4x4llm_1_v6(INT16 * input, INT16 * output, INT32 pitch)
+;* r0 INT16 * input
+;* r1 INT16 * output
+;* r2 INT32 pitch
+;* bench: 3/5
+;********************************************************************************
+
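+; Illustrative C outline of the DC-only case (a sketch, not build source;
+; pitch is a byte stride, matching the strd post-increments below):
+;
+;   INT16 a1 = (input[0] + 4) >> 3;
+;   int r;
+;   for (r = 0; r < 4; r++, output = (INT16 *)((char *)output + pitch))
+;       output[0] = output[1] = output[2] = output[3] = a1;
+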
+|vp8_short_idct4x4llm_1_v6| PROC ; cycles in out pit
+ ;
+ ldrsh r0, [r0] ; load input[0] 1, r0 un 2
+ add r0, r0, #4 ; 1 +4
+ stmdb sp!, {r4, r5, lr} ; make room for wide writes 1 backup
+ mov r0, r0, asr #3 ; (input[0] + 4) >> 3 1, r0 req`d ^1 >> 3
+ pkhbt r4, r0, r0, lsl #16 ; pack r0 into r4 1, r0 req`d ^1 pack
+ mov r5, r4 ; expand expand
+
+ strd r4, [r1], r2 ; *output = r0, post inc 1
+ strd r4, [r1], r2 ; 1
+ strd r4, [r1], r2 ; 1
+ strd r4, [r1] ; 1
+ ;
+ ldmia sp!, {r4, r5, pc} ; replace vars, return restore
+ ENDP ; |vp8_short_idct4x4llm_1_v6|
+;********************************************************************************
+;********************************************************************************
+;********************************************************************************
+
+;********************************************************************************
+;* void short_idct4x4llm_v6(INT16 * input, INT16 * output, INT32 pitch)
+;* r0 INT16 * input
+;* r1 INT16 * output
+;* r2 INT32 pitch
+;* bench:
+;********************************************************************************
+
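+; One column pass written out as illustrative C (the loop1 comments below use
+; the same names; loop2 repeats this along rows with (+4) >> 3 rounding, and
+; 'pitch' is applied as a byte offset in the asm, written loosely here as the
+; row step).  Constants: cospi8sqrt2minus1 = 20091 (0x4E7B),
+; sinpi8sqrt2 = 35468 (0x8A8C).
+;
+;   a1 = ip[0] + ip[8];
+;   b1 = ip[0] - ip[8];
+;   temp1 = (ip[4]  * sinpi8sqrt2) >> 16;
+;   temp2 = ip[12] + ((ip[12] * cospi8sqrt2minus1) >> 16);
+;   c1 = temp1 - temp2;
+;   temp1 = ip[4]  + ((ip[4]  * cospi8sqrt2minus1) >> 16);
+;   temp2 = (ip[12] * sinpi8sqrt2) >> 16;
+;   d1 = temp1 + temp2;
+;   op[0]         = a1 + d1;    op[pitch]     = b1 + c1;
+;   op[2 * pitch] = b1 - c1;    op[3 * pitch] = a1 - d1;
+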
+|vp8_short_idct4x4llm_v6| PROC ; cycles in out pit
+ ;
+ stmdb sp!, {r4-r11, lr} ; backup registers 1 backup
+ ;
+ mov r4, #0x00004E00 ; 1 cst
+ orr r4, r4, #0x0000007B ; cospi8sqrt2minus1
+ mov r5, #0x00008A00 ; 1 cst
+ orr r5, r5, #0x0000008C ; sinpi8sqrt2
+ ;
+ mov r6, #4 ; i=4 1 i
+loop1 ;
+ ldrsh r12, [r0, #8] ; input[4] 1, r12 unavail 2 [4]
+ ldrsh r3, [r0, #24] ; input[12] 1, r3 unavail 2 [12]
+ ldrsh r8, [r0, #16] ; input[8] 1, r8 unavail 2 [8]
+ ldrsh r7, [r0], #0x2 ; input[0] 1, r7 unavail 2 ++ [0]
+ smulwb r10, r5, r12 ; ([4] * sinpi8sqrt2) >> 16 1, r10 un 2, r12/r5 ^1 t1
+ smulwb r11, r4, r3 ; ([12] * cospi8sqrt2minus1) >> 16 1, r11 un 2, r3/r4 ^1 t2
+ add r9, r7, r8 ; a1 = [0] + [8] 1 a1
+ sub r7, r7, r8 ; b1 = [0] - [8] 1 b1
+ add r11, r3, r11 ; temp2 1
+ rsb r11, r11, r10 ; c1 = temp1 - temp2 1 c1
+ smulwb r3, r5, r3 ; ([12] * sinpi8sqrt2) >> 16 1, r3 un 2, r3/r5 ^ 1 t2
+ smulwb r10, r4, r12 ; ([4] * cospi8sqrt2minus1) >> 16 1, r10 un 2, r12/r4 ^1 t1
+ add r8, r7, r11 ; b1 + c1 1 b+c
+ strh r8, [r1, r2] ; out[pitch] = b1+c1 1
+ sub r7, r7, r11 ; b1 - c1 1 b-c
+ add r10, r12, r10 ; temp1 1
+ add r3, r10, r3 ; d1 = temp1 + temp2 1 d1
+ add r10, r9, r3 ; a1 + d1 1 a+d
+ sub r3, r9, r3 ; a1 - d1 1 a-d
+ add r8, r2, r2 ; pitch * 2 1 p*2
+ strh r7, [r1, r8] ; out[pitch*2] = b1-c1 1
+ add r7, r2, r2, lsl #1 ; pitch * 3 1 p*3
+ strh r3, [r1, r7] ; out[pitch*3] = a1-d1 1
+ subs r6, r6, #1 ; i-- 1 --
+ strh r10, [r1], #0x2 ; out[0] = a1+d1 1 ++
+ bne loop1 ; if i>0, continue
+ ;
+ sub r1, r1, #8 ; set up out for next loop 1 -4
+ ; for this iteration, input=prev output
+ mov r6, #4 ; i=4 1 i
+; b returnfull
+loop2 ;
+ ldrsh r11, [r1, #2] ; input[1] 1, r11 un 2 [1]
+ ldrsh r8, [r1, #6] ; input[3] 1, r8 un 2 [3]
+ ldrsh r3, [r1, #4] ; input[2] 1, r3 un 2 [2]
+ ldrsh r0, [r1] ; input[0] 1, r0 un 2 [0]
+ smulwb r9, r5, r11 ; ([1] * sinpi8sqrt2) >> 16 1, r9 un 2, r5/r11 ^1 t1
+ smulwb r10, r4, r8 ; ([3] * cospi8sqrt2minus1) >> 16 1, r10 un 2, r4/r8 ^1 t2
+ add r7, r0, r3 ; a1 = [0] + [2] 1 a1
+ sub r0, r0, r3 ; b1 = [0] - [2] 1 b1
+ add r10, r8, r10 ; temp2 1
+ rsb r9, r10, r9 ; c1 = temp1 - temp2 1 c1
+ smulwb r8, r5, r8 ; ([3] * sinpi8sqrt2) >> 16 1, r8 un 2, r5/r8 ^1 t2
+ smulwb r10, r4, r11 ; ([1] * cospi8sqrt2minus1) >> 16 1, r10 un 2, r4/r11 ^1 t1
+ add r3, r0, r9 ; b1+c1 1 b+c
+ add r3, r3, #4 ; b1+c1+4 1 +4
+ add r10, r11, r10 ; temp1 1
+ mov r3, r3, asr #3 ; b1+c1+4 >> 3 1, r3 ^1 >>3
+ strh r3, [r1, #2] ; out[1] = b1+c1 1
+ add r10, r10, r8 ; d1 = temp1 + temp2 1 d1
+ add r3, r7, r10 ; a1+d1 1 a+d
+ add r3, r3, #4 ; a1+d1+4 1 +4
+ sub r7, r7, r10 ; a1-d1 1 a-d
+ add r7, r7, #4 ; a1-d1+4 1 +4
+ mov r3, r3, asr #3 ; a1+d1+4 >> 3 1, r3 ^1 >>3
+ mov r7, r7, asr #3 ; a1-d1+4 >> 3 1, r7 ^1 >>3
+ strh r7, [r1, #6] ; out[3] = a1-d1 1
+ sub r0, r0, r9 ; b1-c1 1 b-c
+ add r0, r0, #4 ; b1-c1+4 1 +4
+ subs r6, r6, #1 ; i-- 1 --
+ mov r0, r0, asr #3 ; b1-c1+4 >> 3 1, r0 ^1 >>3
+ strh r0, [r1, #4] ; out[2] = b1-c1 1
+ strh r3, [r1], r2 ; out[0] = a1+d1 1
+; add r1, r1, r2 ; out += pitch 1 ++
+ bne loop2 ; if i>0, continue
+returnfull ;
+ ldmia sp!, {r4 - r11, pc} ; replace vars, return restore
+ ENDP
+
+;********************************************************************************
+;********************************************************************************
+;********************************************************************************
+
+;********************************************************************************
+;* void short_idct4x4llm_v6_scott(INT16 * input, INT16 * output, INT32 pitch)
+;* r0 INT16 * input
+;* r1 INT16 * output
+;* r2 INT32 pitch
+;* bench:
+;********************************************************************************
+
+|vp8_short_idct4x4llm_v6_scott| PROC ; cycles in out pit
+; mov r0, #0 ;
+; ldr r0, [r0] ;
+ stmdb sp!, {r4 - r11, lr} ; backup registers 1 backup
+ ;
+ mov r3, #0x00004E00 ; cos
+ orr r3, r3, #0x0000007B ; cospi8sqrt2minus1
+ mov r4, #0x00008A00 ; sin
+ orr r4, r4, #0x0000008C ; sinpi8sqrt2
+ ;
+ mov r5, #0x2 ; i i
+ ;
+short_idct4x4llm_v6_scott_loop1 ;
+ ldr r10, [r0, #(4*2)] ; i5 | i4 5,4
+ ldr r11, [r0, #(12*2)] ; i13 | i12 13,12
+ ;
+ smulwb r6, r4, r10 ; ((ip[4] * sinpi8sqrt2) >> 16) lt1
+ smulwb r7, r3, r11 ; ((ip[12] * cospi8sqrt2minus1) >> 16) lt2
+ ;
+    smulwb  r12, r3, r10                ; ((ip[4] * cospi8sqrt2minus1) >> 16)                l2t2
+ smulwb r14, r4, r11 ; ((ip[12] * sinpi8sqrt2) >> 16) l2t1
+ ;
+ add r6, r6, r7 ; partial c1 lt1-lt2
+ add r12, r12, r14 ; partial d1 l2t2+l2t1
+ ;
+ smulwt r14, r4, r10 ; ((ip[5] * sinpi8sqrt2) >> 16) ht1
+ smulwt r7, r3, r11 ; ((ip[13] * cospi8sqrt2minus1) >> 16) ht2
+ ;
+ smulwt r8, r3, r10 ; ((ip[5] * cospi8sqrt2minus1) >> 16) h2t1
+ smulwt r9, r4, r11 ; ((ip[13] * sinpi8sqrt2) >> 16) h2t2
+ ;
+ add r7, r14, r7 ; partial c1_2 ht1+ht2
+ sub r8, r8, r9 ; partial d1_2 h2t1-h2t2
+ ;
+ pkhbt r6, r6, r7, lsl #16 ; partial c1_2 | partial c1_1 pack
+ pkhbt r12, r12, r8, lsl #16 ; partial d1_2 | partial d1_1 pack
+ ;
+ usub16 r6, r6, r10 ; c1_2 | c1_1 c
+ uadd16 r12, r12, r11 ; d1_2 | d1_1 d
+ ;
+ ldr r10, [r0, #0] ; i1 | i0 1,0
+    ldr     r11, [r0, #(8*2)]           ; i9 | i8                                      9,8
+ ;
+;;;;;; add r0, r0, #0x4 ; +4
+;;;;;; add r1, r1, #0x4 ; +4
+ ;
+ uadd16 r8, r10, r11 ; i1 + i9 | i0 + i8 aka a1 a
+ usub16 r9, r10, r11 ; i1 - i9 | i0 - i8 aka b1 b
+ ;
+ uadd16 r7, r8, r12 ; a1 + d1 pair a+d
+ usub16 r14, r8, r12 ; a1 - d1 pair a-d
+ ;
+ str r7, [r1] ; op[0] = a1 + d1
+ str r14, [r1, r2] ; op[pitch*3] = a1 - d1
+ ;
+ add r0, r0, #0x4 ; op[pitch] = b1 + c1 ++
+ add r1, r1, #0x4 ; op[pitch*2] = b1 - c1 ++
+ ;
+ subs r5, r5, #0x1 ; --
+ bne short_idct4x4llm_v6_scott_loop1 ;
+ ;
+ sub r1, r1, #16 ; reset output ptr
+ mov r5, #0x4 ;
+ mov r0, r1 ; input = output
+ ;
+short_idct4x4llm_v6_scott_loop2 ;
+ ;
+ subs r5, r5, #0x1 ;
+ bne short_idct4x4llm_v6_scott_loop2 ;
+ ;
+ ldmia sp!, {r4 - r11, pc} ;
+ ENDP ;
+ ;
+;********************************************************************************
+;********************************************************************************
+;********************************************************************************
+
+;********************************************************************************
+;* void short_idct4x4llm_v6_dual(INT16 * input, INT16 * output, INT32 pitch)
+;* r0 INT16 * input
+;* r1 INT16 * output
+;* r2 INT32 pitch
+;* bench:
+;********************************************************************************
+
+|vp8_short_idct4x4llm_v6_dual| PROC ; cycles in out pit
+ ;
+ stmdb sp!, {r4-r11, lr} ; backup registers 1 backup
+ mov r3, #0x00004E00 ; cos
+ orr r3, r3, #0x0000007B ; cospi8sqrt2minus1
+ mov r4, #0x00008A00 ; sin
+ orr r4, r4, #0x0000008C ; sinpi8sqrt2
+ mov r5, #0x2 ; i=2 i
+loop1_dual
+ ldr r6, [r0, #(4*2)] ; i5 | i4 5|4
+ ldr r12, [r0, #(12*2)] ; i13 | i12 13|12
+ ldr r14, [r0, #(8*2)] ; i9 | i8 9|8
+
+ smulwt r9, r3, r6 ; (ip[5] * cospi8sqrt2minus1) >> 16 5c
+ smulwb r7, r3, r6 ; (ip[4] * cospi8sqrt2minus1) >> 16 4c
+ smulwt r10, r4, r6 ; (ip[5] * sinpi8sqrt2) >> 16 5s
+ smulwb r8, r4, r6 ; (ip[4] * sinpi8sqrt2) >> 16 4s
+ pkhbt r7, r7, r9, lsl #16 ; 5c | 4c
+ smulwt r11, r3, r12 ; (ip[13] * cospi8sqrt2minus1) >> 16 13c
+ pkhbt r8, r8, r10, lsl #16 ; 5s | 4s
+ uadd16 r6, r6, r7 ; 5c+5 | 4c+4
+ smulwt r7, r4, r12 ; (ip[13] * sinpi8sqrt2) >> 16 13s
+ smulwb r9, r3, r12 ; (ip[12] * cospi8sqrt2minus1) >> 16 12c
+ smulwb r10, r4, r12 ; (ip[12] * sinpi8sqrt2) >> 16 12s
+ subs r5, r5, #0x1 ; i-- --
+ pkhbt r9, r9, r11, lsl #16 ; 13c | 12c
+ ldr r11, [r0], #0x4 ; i1 | i0 ++ 1|0
+ pkhbt r10, r10, r7, lsl #16 ; 13s | 12s
+ uadd16 r7, r12, r9 ; 13c+13 | 12c+12
+ usub16 r7, r8, r7 ; c c
+ uadd16 r6, r6, r10 ; d d
+ uadd16 r10, r11, r14 ; a a
+ usub16 r8, r11, r14 ; b b
+ uadd16 r9, r10, r6 ; a+d a+d
+ usub16 r10, r10, r6 ; a-d a-d
+ uadd16 r6, r8, r7 ; b+c b+c
+ usub16 r7, r8, r7 ; b-c b-c
+ str r6, [r1, r2] ; o5 | o4
+ add r6, r2, r2 ; pitch * 2 p2
+ str r7, [r1, r6] ; o9 | o8
+ add r6, r6, r2 ; pitch * 3 p3
+ str r10, [r1, r6] ; o13 | o12
+ str r9, [r1], #0x4 ; o1 | o0 ++
+ bne loop1_dual ;
+ mov r5, #0x2 ; i=2 i
+ sub r0, r1, #8 ; reset input/output i/o
+loop2_dual
+ ldr r6, [r0, r2] ; i5 | i4 5|4
+ ldr r1, [r0] ; i1 | i0 1|0
+ ldr r12, [r0, #0x4] ; i3 | i2 3|2
+ add r14, r2, #0x4 ; pitch + 2 p+2
+ ldr r14, [r0, r14] ; i7 | i6 7|6
+ smulwt r9, r3, r6 ; (ip[5] * cospi8sqrt2minus1) >> 16 5c
+ smulwt r7, r3, r1 ; (ip[1] * cospi8sqrt2minus1) >> 16 1c
+ smulwt r10, r4, r6 ; (ip[5] * sinpi8sqrt2) >> 16 5s
+ smulwt r8, r4, r1 ; (ip[1] * sinpi8sqrt2) >> 16 1s
+ pkhbt r11, r6, r1, lsl #16 ; i0 | i4 0|4
+ pkhbt r7, r9, r7, lsl #16 ; 1c | 5c
+    pkhbt   r8, r10, r8, lsl #16        ; 1s | 5s = temp1 (c)                          tc1
+ pkhtb r1, r1, r6, asr #16 ; i1 | i5 1|5
+ uadd16 r1, r7, r1 ; 1c+1 | 5c+5 = temp2 (d) td2
+ pkhbt r9, r14, r12, lsl #16 ; i2 | i6 2|6
+ uadd16 r10, r11, r9 ; a a
+ usub16 r9, r11, r9 ; b b
+ pkhtb r6, r12, r14, asr #16 ; i3 | i7 3|7
+ subs r5, r5, #0x1 ; i-- --
+ smulwt r7, r3, r6 ; (ip[3] * cospi8sqrt2minus1) >> 16 3c
+ smulwt r11, r4, r6 ; (ip[3] * sinpi8sqrt2) >> 16 3s
+ smulwb r12, r3, r6 ; (ip[7] * cospi8sqrt2minus1) >> 16 7c
+ smulwb r14, r4, r6 ; (ip[7] * sinpi8sqrt2) >> 16 7s
+
+ pkhbt r7, r12, r7, lsl #16 ; 3c | 7c
+ pkhbt r11, r14, r11, lsl #16 ; 3s | 7s = temp1 (d) td1
+ uadd16 r6, r7, r6 ; 3c+3 | 7c+7 = temp2 (c) tc2
+ usub16 r12, r8, r6 ; c (o1 | o5) c
+ uadd16 r6, r11, r1 ; d (o3 | o7) d
+ uadd16 r7, r10, r6 ; a+d a+d
+ mov r8, #0x4 ; set up 4's 4
+ orr r8, r8, #0x40000 ; 4|4
+ usub16 r6, r10, r6 ; a-d a-d
+ uadd16 r6, r6, r8 ; a-d+4 3|7
+ uadd16 r7, r7, r8 ; a+d+4 0|4
+ uadd16 r10, r9, r12 ; b+c b+c
+ usub16 r1, r9, r12 ; b-c b-c
+ uadd16 r10, r10, r8 ; b+c+4 1|5
+ uadd16 r1, r1, r8 ; b-c+4 2|6
+ mov r8, r10, asr #19 ; o1 >> 3
+ strh r8, [r0, #2] ; o1
+ mov r8, r1, asr #19 ; o2 >> 3
+ strh r8, [r0, #4] ; o2
+ mov r8, r6, asr #19 ; o3 >> 3
+ strh r8, [r0, #6] ; o3
+ mov r8, r7, asr #19 ; o0 >> 3
+ strh r8, [r0], r2 ; o0 +p
+ sxth r10, r10 ;
+ mov r8, r10, asr #3 ; o5 >> 3
+ strh r8, [r0, #2] ; o5
+ sxth r1, r1 ;
+ mov r8, r1, asr #3 ; o6 >> 3
+ strh r8, [r0, #4] ; o6
+ sxth r6, r6 ;
+ mov r8, r6, asr #3 ; o7 >> 3
+ strh r8, [r0, #6] ; o7
+ sxth r7, r7 ;
+ mov r8, r7, asr #3 ; o4 >> 3
+ strh r8, [r0], r2 ; o4 +p
+;;;;; subs r5, r5, #0x1 ; i-- --
+ bne loop2_dual ;
+ ;
+ ldmia sp!, {r4 - r11, pc} ; replace vars, return restore
+ ENDP
+
+ END
--- /dev/null
+++ b/vp9/common/arm/armv6/iwalsh_v6.asm
@@ -1,0 +1,152 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+ EXPORT |vp8_short_inv_walsh4x4_v6|
+ EXPORT |vp8_short_inv_walsh4x4_1_v6|
+
+ ARM
+ REQUIRE8
+ PRESERVE8
+
+ AREA |.text|, CODE, READONLY ; name this block of code
+
+;short vp8_short_inv_walsh4x4_v6(short *input, short *output)
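+; Illustrative C view of the two passes below (a sketch only; the asm operates
+; on packed 16-bit pairs with qadd16/qsub16 and defers rounding to pass two):
+;
+;   /* columns */                        /* rows, with rounding */
+;   a1 = ip[0] + ip[12];                 a1 = ip[0] + ip[3];
+;   b1 = ip[4] + ip[8];                  b1 = ip[1] + ip[2];
+;   c1 = ip[4] - ip[8];                  c1 = ip[1] - ip[2];
+;   d1 = ip[0] - ip[12];                 d1 = ip[0] - ip[3];
+;   op[0]  = a1 + b1;                    op[0] = (a1 + b1 + 3) >> 3;
+;   op[4]  = c1 + d1;                    op[1] = (c1 + d1 + 3) >> 3;
+;   op[8]  = a1 - b1;                    op[2] = (a1 - b1 + 3) >> 3;
+;   op[12] = d1 - c1;                    op[3] = (d1 - c1 + 3) >> 3;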
+|vp8_short_inv_walsh4x4_v6| PROC
+
+ stmdb sp!, {r4 - r11, lr}
+
+ ldr r2, [r0], #4 ; [1 | 0]
+ ldr r3, [r0], #4 ; [3 | 2]
+ ldr r4, [r0], #4 ; [5 | 4]
+ ldr r5, [r0], #4 ; [7 | 6]
+ ldr r6, [r0], #4 ; [9 | 8]
+ ldr r7, [r0], #4 ; [11 | 10]
+ ldr r8, [r0], #4 ; [13 | 12]
+ ldr r9, [r0] ; [15 | 14]
+
+ qadd16 r10, r2, r8 ; a1 [1+13 | 0+12]
+ qadd16 r11, r4, r6 ; b1 [5+9 | 4+8]
+ qsub16 r12, r4, r6 ; c1 [5-9 | 4-8]
+ qsub16 lr, r2, r8 ; d1 [1-13 | 0-12]
+
+ qadd16 r2, r10, r11 ; a1 + b1 [1 | 0]
+ qadd16 r4, r12, lr ; c1 + d1 [5 | 4]
+ qsub16 r6, r10, r11 ; a1 - b1 [9 | 8]
+ qsub16 r8, lr, r12 ; d1 - c1 [13 | 12]
+
+ qadd16 r10, r3, r9 ; a1 [3+15 | 2+14]
+ qadd16 r11, r5, r7 ; b1 [7+11 | 6+10]
+ qsub16 r12, r5, r7 ; c1 [7-11 | 6-10]
+ qsub16 lr, r3, r9 ; d1 [3-15 | 2-14]
+
+ qadd16 r3, r10, r11 ; a1 + b1 [3 | 2]
+ qadd16 r5, r12, lr ; c1 + d1 [7 | 6]
+ qsub16 r7, r10, r11 ; a1 - b1 [11 | 10]
+ qsub16 r9, lr, r12 ; d1 - c1 [15 | 14]
+
+ ; first transform complete
+
+ qsubaddx r10, r2, r3 ; [c1|a1] [1-2 | 0+3]
+ qaddsubx r11, r2, r3 ; [b1|d1] [1+2 | 0-3]
+ qsubaddx r12, r4, r5 ; [c1|a1] [5-6 | 4+7]
+ qaddsubx lr, r4, r5 ; [b1|d1] [5+6 | 4-7]
+
+ qaddsubx r2, r10, r11 ; [b2|c2] [c1+d1 | a1-b1]
+ qaddsubx r3, r11, r10 ; [a2|d2] [b1+a1 | d1-c1]
+ ldr r10, c0x00030003
+ qaddsubx r4, r12, lr ; [b2|c2] [c1+d1 | a1-b1]
+ qaddsubx r5, lr, r12 ; [a2|d2] [b1+a1 | d1-c1]
+
+ qadd16 r2, r2, r10 ; [b2+3|c2+3]
+ qadd16 r3, r3, r10 ; [a2+3|d2+3]
+ qadd16 r4, r4, r10 ; [b2+3|c2+3]
+ qadd16 r5, r5, r10 ; [a2+3|d2+3]
+
+ asr r12, r2, #3 ; [1 | x]
+ pkhtb r12, r12, r3, asr #19; [1 | 0]
+ lsl lr, r3, #16 ; [~3 | x]
+ lsl r2, r2, #16 ; [~2 | x]
+ asr lr, lr, #3 ; [3 | x]
+ pkhtb lr, lr, r2, asr #19 ; [3 | 2]
+
+ asr r2, r4, #3 ; [5 | x]
+ pkhtb r2, r2, r5, asr #19 ; [5 | 4]
+ lsl r3, r5, #16 ; [~7 | x]
+ lsl r4, r4, #16 ; [~6 | x]
+ asr r3, r3, #3 ; [7 | x]
+ pkhtb r3, r3, r4, asr #19 ; [7 | 6]
+
+ str r12, [r1], #4
+ str lr, [r1], #4
+ str r2, [r1], #4
+ str r3, [r1], #4
+
+ qsubaddx r2, r6, r7 ; [c1|a1] [9-10 | 8+11]
+ qaddsubx r3, r6, r7 ; [b1|d1] [9+10 | 8-11]
+ qsubaddx r4, r8, r9 ; [c1|a1] [13-14 | 12+15]
+ qaddsubx r5, r8, r9 ; [b1|d1] [13+14 | 12-15]
+
+ qaddsubx r6, r2, r3 ; [b2|c2] [c1+d1 | a1-b1]
+ qaddsubx r7, r3, r2 ; [a2|d2] [b1+a1 | d1-c1]
+ qaddsubx r8, r4, r5 ; [b2|c2] [c1+d1 | a1-b1]
+ qaddsubx r9, r5, r4 ; [a2|d2] [b1+a1 | d1-c1]
+
+ qadd16 r6, r6, r10 ; [b2+3|c2+3]
+ qadd16 r7, r7, r10 ; [a2+3|d2+3]
+ qadd16 r8, r8, r10 ; [b2+3|c2+3]
+ qadd16 r9, r9, r10 ; [a2+3|d2+3]
+
+ asr r2, r6, #3 ; [9 | x]
+ pkhtb r2, r2, r7, asr #19 ; [9 | 8]
+ lsl r3, r7, #16 ; [~11| x]
+ lsl r4, r6, #16 ; [~10| x]
+ asr r3, r3, #3 ; [11 | x]
+ pkhtb r3, r3, r4, asr #19 ; [11 | 10]
+
+ asr r4, r8, #3 ; [13 | x]
+ pkhtb r4, r4, r9, asr #19 ; [13 | 12]
+ lsl r5, r9, #16 ; [~15| x]
+ lsl r6, r8, #16 ; [~14| x]
+ asr r5, r5, #3 ; [15 | x]
+ pkhtb r5, r5, r6, asr #19 ; [15 | 14]
+
+ str r2, [r1], #4
+ str r3, [r1], #4
+ str r4, [r1], #4
+ str r5, [r1]
+
+ ldmia sp!, {r4 - r11, pc}
+ ENDP ; |vp8_short_inv_walsh4x4_v6|
+
+
+;short vp8_short_inv_walsh4x4_1_v6(short *input, short *output)
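+; Illustratively (sketch only; the asm replicates the value and stores it four
+; shorts at a time):
+;
+;   short a1 = (input[0] + 3) >> 3;
+;   for (i = 0; i < 16; i++)
+;       output[i] = a1;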
+|vp8_short_inv_walsh4x4_1_v6| PROC
+
+ ldrsh r2, [r0] ; [0]
+ add r2, r2, #3 ; [0] + 3
+ asr r2, r2, #3 ; a1 ([0]+3) >> 3
+ lsl r2, r2, #16 ; [a1 | x]
+ orr r2, r2, r2, lsr #16 ; [a1 | a1]
+
+ str r2, [r1], #4
+ str r2, [r1], #4
+ str r2, [r1], #4
+ str r2, [r1], #4
+ str r2, [r1], #4
+ str r2, [r1], #4
+ str r2, [r1], #4
+ str r2, [r1]
+
+ bx lr
+ ENDP ; |vp8_short_inv_walsh4x4_1_v6|
+
+; Constant Pool
+c0x00030003 DCD 0x00030003
+ END
--- /dev/null
+++ b/vp9/common/arm/armv6/loopfilter_v6.asm
@@ -1,0 +1,1282 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+ EXPORT |vp9_loop_filter_horizontal_edge_armv6|
+ EXPORT |vp9_mbloop_filter_horizontal_edge_armv6|
+ EXPORT |vp9_loop_filter_vertical_edge_armv6|
+ EXPORT |vp9_mbloop_filter_vertical_edge_armv6|
+
+ AREA |.text|, CODE, READONLY ; name this block of code
+
+ MACRO
+ TRANSPOSE_MATRIX $a0, $a1, $a2, $a3, $b0, $b1, $b2, $b3
+ ; input: $a0, $a1, $a2, $a3; output: $b0, $b1, $b2, $b3
+ ; a0: 03 02 01 00
+ ; a1: 13 12 11 10
+ ; a2: 23 22 21 20
+ ; a3: 33 32 31 30
+ ; b3 b2 b1 b0
+
+ uxtb16 $b1, $a1 ; xx 12 xx 10
+ uxtb16 $b0, $a0 ; xx 02 xx 00
+ uxtb16 $b3, $a3 ; xx 32 xx 30
+ uxtb16 $b2, $a2 ; xx 22 xx 20
+ orr $b1, $b0, $b1, lsl #8 ; 12 02 10 00
+ orr $b3, $b2, $b3, lsl #8 ; 32 22 30 20
+
+ uxtb16 $a1, $a1, ror #8 ; xx 13 xx 11
+ uxtb16 $a3, $a3, ror #8 ; xx 33 xx 31
+ uxtb16 $a0, $a0, ror #8 ; xx 03 xx 01
+ uxtb16 $a2, $a2, ror #8 ; xx 23 xx 21
+ orr $a0, $a0, $a1, lsl #8 ; 13 03 11 01
+ orr $a2, $a2, $a3, lsl #8 ; 33 23 31 21
+
+ pkhtb $b2, $b3, $b1, asr #16 ; 32 22 12 02 -- p1
+ pkhbt $b0, $b1, $b3, lsl #16 ; 30 20 10 00 -- p3
+
+ pkhtb $b3, $a2, $a0, asr #16 ; 33 23 13 03 -- p0
+ pkhbt $b1, $a0, $a2, lsl #16 ; 31 21 11 01 -- p2
+ MEND
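+
+; In plain C the macro is a 4x4 byte transpose (illustrative only; byte lanes
+; are little-endian, as in the layout comments above):
+;
+;   /* a[i] holds source row i as 4 packed bytes; b[j] receives column j */
+;   for (i = 0; i < 4; i++)
+;       for (j = 0; j < 4; j++)
+;           ((unsigned char *)&b[j])[i] = ((const unsigned char *)&a[i])[j];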
+
+
+src RN r0
+pstep RN r1
+count RN r5
+
+;r0 unsigned char *src_ptr,
+;r1 int src_pixel_step,
+;r2 const char *blimit,
+;r3 const char *limit,
+;stack const char *thresh,
+;stack int count
+
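+; The breakout test built from uqsub8/orr/sel in the loops below corresponds
+; roughly to this C (illustrative; the asm evaluates 4 pixels per iteration
+; and keeps the result as a per-byte mask):
+;
+;   mask = (abs(p3 - p2) <= limit) && (abs(p2 - p1) <= limit) &&
+;          (abs(p1 - p0) <= limit) && (abs(q1 - q0) <= limit) &&
+;          (abs(q2 - q1) <= limit) && (abs(q3 - q2) <= limit) &&
+;          (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 <= blimit);
+;   hev  = (abs(p1 - p0) > thresh) || (abs(q1 - q0) > thresh);
+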
+;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
+|vp9_loop_filter_horizontal_edge_armv6| PROC
+;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
+ stmdb sp!, {r4 - r11, lr}
+
+ sub src, src, pstep, lsl #2 ; move src pointer down by 4 lines
+ ldr count, [sp, #40] ; count for 8-in-parallel
+ ldr r6, [sp, #36] ; load thresh address
+ sub sp, sp, #16 ; create temp buffer
+
+ ldr r9, [src], pstep ; p3
+ ldrb r4, [r2] ; blimit
+ ldr r10, [src], pstep ; p2
+ ldrb r2, [r3] ; limit
+ ldr r11, [src], pstep ; p1
+ orr r4, r4, r4, lsl #8
+ ldrb r3, [r6] ; thresh
+ orr r2, r2, r2, lsl #8
+ mov count, count, lsl #1 ; 4-in-parallel
+ orr r4, r4, r4, lsl #16
+ orr r3, r3, r3, lsl #8
+ orr r2, r2, r2, lsl #16
+ orr r3, r3, r3, lsl #16
+
+|Hnext8|
+ ; vp9_filter_mask() function
+ ; calculate breakout conditions
+ ldr r12, [src], pstep ; p0
+
+ uqsub8 r6, r9, r10 ; p3 - p2
+ uqsub8 r7, r10, r9 ; p2 - p3
+ uqsub8 r8, r10, r11 ; p2 - p1
+ uqsub8 r10, r11, r10 ; p1 - p2
+
+ orr r6, r6, r7 ; abs (p3-p2)
+ orr r8, r8, r10 ; abs (p2-p1)
+ uqsub8 lr, r6, r2 ; compare to limit. lr: vp9_filter_mask
+ uqsub8 r8, r8, r2 ; compare to limit
+ uqsub8 r6, r11, r12 ; p1 - p0
+ orr lr, lr, r8
+ uqsub8 r7, r12, r11 ; p0 - p1
+ ldr r9, [src], pstep ; q0
+ ldr r10, [src], pstep ; q1
+ orr r6, r6, r7 ; abs (p1-p0)
+ uqsub8 r7, r6, r2 ; compare to limit
+ uqsub8 r8, r6, r3 ; compare to thresh -- save r8 for later
+ orr lr, lr, r7
+
+ uqsub8 r6, r11, r10 ; p1 - q1
+ uqsub8 r7, r10, r11 ; q1 - p1
+ uqsub8 r11, r12, r9 ; p0 - q0
+ uqsub8 r12, r9, r12 ; q0 - p0
+ orr r6, r6, r7 ; abs (p1-q1)
+ ldr r7, c0x7F7F7F7F
+ orr r12, r11, r12 ; abs (p0-q0)
+ ldr r11, [src], pstep ; q2
+ uqadd8 r12, r12, r12 ; abs (p0-q0) * 2
+ and r6, r7, r6, lsr #1 ; abs (p1-q1) / 2
+ uqsub8 r7, r9, r10 ; q0 - q1
+ uqadd8 r12, r12, r6 ; abs (p0-q0)*2 + abs (p1-q1)/2
+ uqsub8 r6, r10, r9 ; q1 - q0
+ uqsub8 r12, r12, r4 ; compare to flimit
+ uqsub8 r9, r11, r10 ; q2 - q1
+
+ orr lr, lr, r12
+
+ ldr r12, [src], pstep ; q3
+ uqsub8 r10, r10, r11 ; q1 - q2
+ orr r6, r7, r6 ; abs (q1-q0)
+ orr r10, r9, r10 ; abs (q2-q1)
+ uqsub8 r7, r6, r2 ; compare to limit
+ uqsub8 r10, r10, r2 ; compare to limit
+ uqsub8 r6, r6, r3 ; compare to thresh -- save r6 for later
+ orr lr, lr, r7
+ orr lr, lr, r10
+
+ uqsub8 r10, r12, r11 ; q3 - q2
+ uqsub8 r9, r11, r12 ; q2 - q3
+
+ mvn r11, #0 ; r11 == -1
+
+ orr r10, r10, r9 ; abs (q3-q2)
+ uqsub8 r10, r10, r2 ; compare to limit
+
+ mov r12, #0
+ orr lr, lr, r10
+ sub src, src, pstep, lsl #2
+
+ usub8 lr, r12, lr ; use usub8 instead of ssub8
+ sel lr, r11, r12 ; filter mask: lr
+
+ cmp lr, #0
+ beq hskip_filter ; skip filtering
+
+ sub src, src, pstep, lsl #1 ; move src pointer down by 6 lines
+
+ ;vp8_hevmask() function
+ ;calculate high edge variance
+ orr r10, r6, r8 ; calculate vp8_hevmask
+
+ ldr r7, [src], pstep ; p1
+
+ usub8 r10, r12, r10 ; use usub8 instead of ssub8
+ sel r6, r12, r11 ; obtain vp8_hevmask: r6
+
+ ;vp9_filter() function
+ ldr r8, [src], pstep ; p0
+ ldr r12, c0x80808080
+ ldr r9, [src], pstep ; q0
+ ldr r10, [src], pstep ; q1
+
+ eor r7, r7, r12 ; p1 offset to convert to a signed value
+ eor r8, r8, r12 ; p0 offset to convert to a signed value
+ eor r9, r9, r12 ; q0 offset to convert to a signed value
+ eor r10, r10, r12 ; q1 offset to convert to a signed value
+
+ str r9, [sp] ; store qs0 temporarily
+ str r8, [sp, #4] ; store ps0 temporarily
+ str r10, [sp, #8] ; store qs1 temporarily
+ str r7, [sp, #12] ; store ps1 temporarily
+
+ qsub8 r7, r7, r10 ; vp9_signed_char_clamp(ps1-qs1)
+ qsub8 r8, r9, r8 ; vp9_signed_char_clamp(vp9_filter + 3 * ( qs0 - ps0))
+
+ and r7, r7, r6 ; vp9_filter (r7) &= hev
+
+ qadd8 r7, r7, r8
+ ldr r9, c0x03030303 ; r9 = 3 --modified for vp8
+
+ qadd8 r7, r7, r8
+ ldr r10, c0x04040404
+
+ qadd8 r7, r7, r8
+ and r7, r7, lr ; vp9_filter &= mask;
+
+ ;modify code for vp8 -- Filter1 = vp9_filter (r7)
+ qadd8 r8 , r7 , r9 ; Filter2 (r8) = vp9_signed_char_clamp(vp9_filter+3)
+ qadd8 r7 , r7 , r10 ; vp9_filter = vp9_signed_char_clamp(vp9_filter+4)
+
+ mov r9, #0
+ shadd8 r8 , r8 , r9 ; Filter2 >>= 3
+ shadd8 r7 , r7 , r9 ; vp9_filter >>= 3
+ shadd8 r8 , r8 , r9
+ shadd8 r7 , r7 , r9
+ shadd8 lr , r8 , r9 ; lr: Filter2
+ shadd8 r7 , r7 , r9 ; r7: filter
+
+ ;usub8 lr, r8, r10 ; s = (s==4)*-1
+ ;sel lr, r11, r9
+ ;usub8 r8, r10, r8
+ ;sel r8, r11, r9
+ ;and r8, r8, lr ; -1 for each element that equals 4
+
+ ;calculate output
+ ;qadd8 lr, r8, r7 ; u = vp9_signed_char_clamp(s + vp9_filter)
+
+ ldr r8, [sp] ; load qs0
+ ldr r9, [sp, #4] ; load ps0
+
+ ldr r10, c0x01010101
+
+ qsub8 r8 ,r8, r7 ; u = vp9_signed_char_clamp(qs0 - vp9_filter)
+ qadd8 r9, r9, lr ; u = vp9_signed_char_clamp(ps0 + Filter2)
+
+ ;end of modification for vp8
+
+ mov lr, #0
+ sadd8 r7, r7 , r10 ; vp9_filter += 1
+ shadd8 r7, r7, lr ; vp9_filter >>= 1
+
+ ldr r11, [sp, #12] ; load ps1
+ ldr r10, [sp, #8] ; load qs1
+
+ bic r7, r7, r6 ; vp9_filter &= ~hev
+ sub src, src, pstep, lsl #2
+
+ qadd8 r11, r11, r7 ; u = vp9_signed_char_clamp(ps1 + vp9_filter)
+ qsub8 r10, r10,r7 ; u = vp9_signed_char_clamp(qs1 - vp9_filter)
+
+ eor r11, r11, r12 ; *op1 = u^0x80
+ str r11, [src], pstep ; store op1
+ eor r9, r9, r12 ; *op0 = u^0x80
+ str r9, [src], pstep ; store op0 result
+ eor r8, r8, r12 ; *oq0 = u^0x80
+ str r8, [src], pstep ; store oq0 result
+ eor r10, r10, r12 ; *oq1 = u^0x80
+ str r10, [src], pstep ; store oq1
+
+ sub src, src, pstep, lsl #1
+
+|hskip_filter|
+ add src, src, #4
+ sub src, src, pstep, lsl #2
+
+ subs count, count, #1
+
+ ldrne r9, [src], pstep ; p3
+ ldrne r10, [src], pstep ; p2
+ ldrne r11, [src], pstep ; p1
+
+ bne Hnext8
+
+ add sp, sp, #16
+ ldmia sp!, {r4 - r11, pc}
+ ENDP ; |vp9_loop_filter_horizontal_edge_armv6|
+
+
+;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
+|vp9_mbloop_filter_horizontal_edge_armv6| PROC
+;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
+ stmdb sp!, {r4 - r11, lr}
+
+ sub src, src, pstep, lsl #2 ; move src pointer down by 4 lines
+ ldr count, [sp, #40] ; count for 8-in-parallel
+ ldr r6, [sp, #36] ; load thresh address
+ sub sp, sp, #16 ; create temp buffer
+
+ ldr r9, [src], pstep ; p3
+ ldrb r4, [r2] ; blimit
+ ldr r10, [src], pstep ; p2
+ ldrb r2, [r3] ; limit
+ ldr r11, [src], pstep ; p1
+ orr r4, r4, r4, lsl #8
+ ldrb r3, [r6] ; thresh
+ orr r2, r2, r2, lsl #8
+ mov count, count, lsl #1 ; 4-in-parallel
+ orr r4, r4, r4, lsl #16
+ orr r3, r3, r3, lsl #8
+ orr r2, r2, r2, lsl #16
+ orr r3, r3, r3, lsl #16
+
+|MBHnext8|
+
+ ; vp9_filter_mask() function
+ ; calculate breakout conditions
+ ldr r12, [src], pstep ; p0
+
+ uqsub8 r6, r9, r10 ; p3 - p2
+ uqsub8 r7, r10, r9 ; p2 - p3
+ uqsub8 r8, r10, r11 ; p2 - p1
+ uqsub8 r10, r11, r10 ; p1 - p2
+
+ orr r6, r6, r7 ; abs (p3-p2)
+ orr r8, r8, r10 ; abs (p2-p1)
+ uqsub8 lr, r6, r2 ; compare to limit. lr: vp9_filter_mask
+ uqsub8 r8, r8, r2 ; compare to limit
+
+ uqsub8 r6, r11, r12 ; p1 - p0
+ orr lr, lr, r8
+ uqsub8 r7, r12, r11 ; p0 - p1
+ ldr r9, [src], pstep ; q0
+ ldr r10, [src], pstep ; q1
+ orr r6, r6, r7 ; abs (p1-p0)
+ uqsub8 r7, r6, r2 ; compare to limit
+ uqsub8 r8, r6, r3 ; compare to thresh -- save r8 for later
+ orr lr, lr, r7
+
+ uqsub8 r6, r11, r10 ; p1 - q1
+ uqsub8 r7, r10, r11 ; q1 - p1
+ uqsub8 r11, r12, r9 ; p0 - q0
+ uqsub8 r12, r9, r12 ; q0 - p0
+ orr r6, r6, r7 ; abs (p1-q1)
+ ldr r7, c0x7F7F7F7F
+ orr r12, r11, r12 ; abs (p0-q0)
+ ldr r11, [src], pstep ; q2
+ uqadd8 r12, r12, r12 ; abs (p0-q0) * 2
+ and r6, r7, r6, lsr #1 ; abs (p1-q1) / 2
+ uqsub8 r7, r9, r10 ; q0 - q1
+ uqadd8 r12, r12, r6 ; abs (p0-q0)*2 + abs (p1-q1)/2
+ uqsub8 r6, r10, r9 ; q1 - q0
+ uqsub8 r12, r12, r4 ; compare to flimit
+ uqsub8 r9, r11, r10 ; q2 - q1
+
+ orr lr, lr, r12
+
+ ldr r12, [src], pstep ; q3
+
+ uqsub8 r10, r10, r11 ; q1 - q2
+ orr r6, r7, r6 ; abs (q1-q0)
+ orr r10, r9, r10 ; abs (q2-q1)
+ uqsub8 r7, r6, r2 ; compare to limit
+ uqsub8 r10, r10, r2 ; compare to limit
+ uqsub8 r6, r6, r3 ; compare to thresh -- save r6 for later
+ orr lr, lr, r7
+ orr lr, lr, r10
+
+ uqsub8 r10, r12, r11 ; q3 - q2
+ uqsub8 r9, r11, r12 ; q2 - q3
+
+ mvn r11, #0 ; r11 == -1
+
+ orr r10, r10, r9 ; abs (q3-q2)
+ uqsub8 r10, r10, r2 ; compare to limit
+
+ mov r12, #0
+
+ orr lr, lr, r10
+
+ usub8 lr, r12, lr ; use usub8 instead of ssub8
+ sel lr, r11, r12 ; filter mask: lr
+
+ cmp lr, #0
+ beq mbhskip_filter ; skip filtering
+
+ ;vp8_hevmask() function
+ ;calculate high edge variance
+ sub src, src, pstep, lsl #2 ; move src pointer down by 6 lines
+ sub src, src, pstep, lsl #1
+
+ orr r10, r6, r8
+ ldr r7, [src], pstep ; p1
+
+ usub8 r10, r12, r10
+ sel r6, r12, r11 ; hev mask: r6
+
+ ;vp8_mbfilter() function
+ ;p2, q2 are only needed at the end. Don't need to load them in now.
+ ldr r8, [src], pstep ; p0
+ ldr r12, c0x80808080
+ ldr r9, [src], pstep ; q0
+ ldr r10, [src] ; q1
+
+ eor r7, r7, r12 ; ps1
+ eor r8, r8, r12 ; ps0
+ eor r9, r9, r12 ; qs0
+ eor r10, r10, r12 ; qs1
+
+ qsub8 r12, r9, r8 ; vp9_signed_char_clamp(vp9_filter + 3 * ( qs0 - ps0))
+ str r7, [sp, #12] ; store ps1 temporarily
+ qsub8 r7, r7, r10 ; vp9_signed_char_clamp(ps1-qs1)
+ str r10, [sp, #8] ; store qs1 temporarily
+ qadd8 r7, r7, r12
+ str r9, [sp] ; store qs0 temporarily
+ qadd8 r7, r7, r12
+ str r8, [sp, #4] ; store ps0 temporarily
+ qadd8 r7, r7, r12 ; vp9_filter: r7
+
+ ldr r10, c0x03030303 ; r10 = 3 --modified for vp8
+ ldr r9, c0x04040404
+
+ and r7, r7, lr ; vp9_filter &= mask (lr is free)
+
+ mov r12, r7 ; Filter2: r12
+ and r12, r12, r6 ; Filter2 &= hev
+
+ ;modify code for vp8
+ ;save bottom 3 bits so that we round one side +4 and the other +3
+ qadd8 r8 , r12 , r9 ; Filter1 (r8) = vp9_signed_char_clamp(Filter2+4)
+ qadd8 r12 , r12 , r10 ; Filter2 (r12) = vp9_signed_char_clamp(Filter2+3)
+
+ mov r10, #0
+ shadd8 r8 , r8 , r10 ; Filter1 >>= 3
+ shadd8 r12 , r12 , r10 ; Filter2 >>= 3
+ shadd8 r8 , r8 , r10
+ shadd8 r12 , r12 , r10
+ shadd8 r8 , r8 , r10 ; r8: Filter1
+ shadd8 r12 , r12 , r10 ; r12: Filter2
+
+ ldr r9, [sp] ; load qs0
+ ldr r11, [sp, #4] ; load ps0
+
+ qsub8 r9 , r9, r8 ; qs0 = vp9_signed_char_clamp(qs0 - Filter1)
+ qadd8 r11, r11, r12 ; ps0 = vp9_signed_char_clamp(ps0 + Filter2)
+
+ ;save bottom 3 bits so that we round one side +4 and the other +3
+ ;and r8, r12, r10 ; s = Filter2 & 7 (s: r8)
+ ;qadd8 r12 , r12 , r9 ; Filter2 = vp9_signed_char_clamp(Filter2+4)
+ ;mov r10, #0
+ ;shadd8 r12 , r12 , r10 ; Filter2 >>= 3
+ ;usub8 lr, r8, r9 ; s = (s==4)*-1
+ ;sel lr, r11, r10
+ ;shadd8 r12 , r12 , r10
+ ;usub8 r8, r9, r8
+ ;sel r8, r11, r10
+ ;ldr r9, [sp] ; load qs0
+ ;ldr r11, [sp, #4] ; load ps0
+ ;shadd8 r12 , r12 , r10
+ ;and r8, r8, lr ; -1 for each element that equals 4
+ ;qadd8 r10, r8, r12 ; u = vp9_signed_char_clamp(s + Filter2)
+ ;qsub8 r9 , r9, r12 ; qs0 = vp9_signed_char_clamp(qs0 - Filter2)
+ ;qadd8 r11, r11, r10 ; ps0 = vp9_signed_char_clamp(ps0 + u)
+
+ ;end of modification for vp8
+
+ bic r12, r7, r6 ; vp9_filter &= ~hev ( r6 is free)
+ ;mov r12, r7
+
+ ;roughly 3/7th difference across boundary
+ mov lr, #0x1b ; 27
+ mov r7, #0x3f ; 63
+
+ sxtb16 r6, r12
+ sxtb16 r10, r12, ror #8
+ smlabb r8, r6, lr, r7
+ smlatb r6, r6, lr, r7
+ smlabb r7, r10, lr, r7
+ smultb r10, r10, lr
+ ssat r8, #8, r8, asr #7
+ ssat r6, #8, r6, asr #7
+ add r10, r10, #63
+ ssat r7, #8, r7, asr #7
+ ssat r10, #8, r10, asr #7
+
+ ldr lr, c0x80808080
+
+ pkhbt r6, r8, r6, lsl #16
+ pkhbt r10, r7, r10, lsl #16
+ uxtb16 r6, r6
+ uxtb16 r10, r10
+
+ sub src, src, pstep
+
+ orr r10, r6, r10, lsl #8 ; u = vp9_signed_char_clamp((63 + Filter2 * 27)>>7)
+
+ qsub8 r8, r9, r10 ; s = vp9_signed_char_clamp(qs0 - u)
+ qadd8 r10, r11, r10 ; s = vp9_signed_char_clamp(ps0 + u)
+ eor r8, r8, lr ; *oq0 = s^0x80
+ str r8, [src] ; store *oq0
+ sub src, src, pstep
+ eor r10, r10, lr ; *op0 = s^0x80
+ str r10, [src] ; store *op0
+
+ ;roughly 2/7th difference across boundary
+ mov lr, #0x12 ; 18
+ mov r7, #0x3f ; 63
+
+ sxtb16 r6, r12
+ sxtb16 r10, r12, ror #8
+ smlabb r8, r6, lr, r7
+ smlatb r6, r6, lr, r7
+ smlabb r9, r10, lr, r7
+ smlatb r10, r10, lr, r7
+ ssat r8, #8, r8, asr #7
+ ssat r6, #8, r6, asr #7
+ ssat r9, #8, r9, asr #7
+ ssat r10, #8, r10, asr #7
+
+ ldr lr, c0x80808080
+
+ pkhbt r6, r8, r6, lsl #16
+ pkhbt r10, r9, r10, lsl #16
+
+ ldr r9, [sp, #8] ; load qs1
+ ldr r11, [sp, #12] ; load ps1
+
+ uxtb16 r6, r6
+ uxtb16 r10, r10
+
+ sub src, src, pstep
+
+ orr r10, r6, r10, lsl #8 ; u = vp9_signed_char_clamp((63 + Filter2 * 18)>>7)
+
+ qadd8 r11, r11, r10 ; s = vp9_signed_char_clamp(ps1 + u)
+ qsub8 r8, r9, r10 ; s = vp9_signed_char_clamp(qs1 - u)
+ eor r11, r11, lr ; *op1 = s^0x80
+ str r11, [src], pstep ; store *op1
+ eor r8, r8, lr ; *oq1 = s^0x80
+ add src, src, pstep, lsl #1
+
+ mov r7, #0x3f ; 63
+
+ str r8, [src], pstep ; store *oq1
+
+ ;roughly 1/7th difference across boundary
+ mov lr, #0x9 ; 9
+ ldr r9, [src] ; load q2
+
+ sxtb16 r6, r12
+ sxtb16 r10, r12, ror #8
+ smlabb r8, r6, lr, r7
+ smlatb r6, r6, lr, r7
+ smlabb r12, r10, lr, r7
+ smlatb r10, r10, lr, r7
+ ssat r8, #8, r8, asr #7
+ ssat r6, #8, r6, asr #7
+ ssat r12, #8, r12, asr #7
+ ssat r10, #8, r10, asr #7
+
+ sub src, src, pstep, lsl #2
+
+ pkhbt r6, r8, r6, lsl #16
+ pkhbt r10, r12, r10, lsl #16
+
+ sub src, src, pstep
+ ldr lr, c0x80808080
+
+ ldr r11, [src] ; load p2
+
+ uxtb16 r6, r6
+ uxtb16 r10, r10
+
+ eor r9, r9, lr
+ eor r11, r11, lr
+
+ orr r10, r6, r10, lsl #8 ; u = vp9_signed_char_clamp((63 + Filter2 * 9)>>7)
+
+ qadd8 r8, r11, r10 ; s = vp9_signed_char_clamp(ps2 + u)
+ qsub8 r10, r9, r10 ; s = vp9_signed_char_clamp(qs2 - u)
+ eor r8, r8, lr ; *op2 = s^0x80
+ str r8, [src], pstep, lsl #2 ; store *op2
+ add src, src, pstep
+ eor r10, r10, lr ; *oq2 = s^0x80
+ str r10, [src], pstep, lsl #1 ; store *oq2
+
+|mbhskip_filter|
+ add src, src, #4
+ sub src, src, pstep, lsl #3
+ subs count, count, #1
+
+ ldrne r9, [src], pstep ; p3
+ ldrne r10, [src], pstep ; p2
+ ldrne r11, [src], pstep ; p1
+
+ bne MBHnext8
+
+ add sp, sp, #16
+ ldmia sp!, {r4 - r11, pc}
+    ENDP        ; |vp9_mbloop_filter_horizontal_edge_armv6|
+
+
+;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
+|vp9_loop_filter_vertical_edge_armv6| PROC
+;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
+ stmdb sp!, {r4 - r11, lr}
+
+ sub src, src, #4 ; move src pointer down by 4
+ ldr count, [sp, #40] ; count for 8-in-parallel
+ ldr r12, [sp, #36] ; load thresh address
+ sub sp, sp, #16 ; create temp buffer
+
+ ldr r6, [src], pstep ; load source data
+ ldrb r4, [r2] ; blimit
+ ldr r7, [src], pstep
+ ldrb r2, [r3] ; limit
+ ldr r8, [src], pstep
+ orr r4, r4, r4, lsl #8
+ ldrb r3, [r12] ; thresh
+ orr r2, r2, r2, lsl #8
+ ldr lr, [src], pstep
+ mov count, count, lsl #1 ; 4-in-parallel
+ orr r4, r4, r4, lsl #16
+ orr r3, r3, r3, lsl #8
+ orr r2, r2, r2, lsl #16
+ orr r3, r3, r3, lsl #16
+
+|Vnext8|
+
+ ; vp9_filter_mask() function
+ ; calculate breakout conditions
+ ; transpose the source data for 4-in-parallel operation
+ TRANSPOSE_MATRIX r6, r7, r8, lr, r9, r10, r11, r12
+
+ uqsub8 r7, r9, r10 ; p3 - p2
+ uqsub8 r8, r10, r9 ; p2 - p3
+ uqsub8 r9, r10, r11 ; p2 - p1
+ uqsub8 r10, r11, r10 ; p1 - p2
+ orr r7, r7, r8 ; abs (p3-p2)
+ orr r10, r9, r10 ; abs (p2-p1)
+ uqsub8 lr, r7, r2 ; compare to limit. lr: vp9_filter_mask
+ uqsub8 r10, r10, r2 ; compare to limit
+
+ sub src, src, pstep, lsl #2 ; move src pointer down by 4 lines
+
+ orr lr, lr, r10
+
+ uqsub8 r6, r11, r12 ; p1 - p0
+ uqsub8 r7, r12, r11 ; p0 - p1
+ add src, src, #4 ; move src pointer up by 4
+ orr r6, r6, r7 ; abs (p1-p0)
+ str r11, [sp, #12] ; save p1
+ uqsub8 r10, r6, r2 ; compare to limit
+ uqsub8 r11, r6, r3 ; compare to thresh
+ orr lr, lr, r10
+
+ ; transpose uses 8 regs(r6 - r12 and lr). Need to save reg value now
+ ; transpose the source data for 4-in-parallel operation
+ ldr r6, [src], pstep ; load source data
+ str r11, [sp] ; push r11 to stack
+ ldr r7, [src], pstep
+ str r12, [sp, #4] ; save current reg before load q0 - q3 data
+ ldr r8, [src], pstep
+ str lr, [sp, #8]
+ ldr lr, [src], pstep
+
+ TRANSPOSE_MATRIX r6, r7, r8, lr, r9, r10, r11, r12
+
+ ldr lr, [sp, #8] ; load back (f)limit accumulator
+
+ uqsub8 r6, r12, r11 ; q3 - q2
+ uqsub8 r7, r11, r12 ; q2 - q3
+ uqsub8 r12, r11, r10 ; q2 - q1
+ uqsub8 r11, r10, r11 ; q1 - q2
+ orr r6, r6, r7 ; abs (q3-q2)
+ orr r7, r12, r11 ; abs (q2-q1)
+ uqsub8 r6, r6, r2 ; compare to limit
+ uqsub8 r7, r7, r2 ; compare to limit
+ ldr r11, [sp, #4] ; load back p0
+ ldr r12, [sp, #12] ; load back p1
+ orr lr, lr, r6
+ orr lr, lr, r7
+
+ uqsub8 r6, r11, r9 ; p0 - q0
+ uqsub8 r7, r9, r11 ; q0 - p0
+ uqsub8 r8, r12, r10 ; p1 - q1
+ uqsub8 r11, r10, r12 ; q1 - p1
+ orr r6, r6, r7 ; abs (p0-q0)
+ ldr r7, c0x7F7F7F7F
+ orr r8, r8, r11 ; abs (p1-q1)
+ uqadd8 r6, r6, r6 ; abs (p0-q0) * 2
+ and r8, r7, r8, lsr #1 ; abs (p1-q1) / 2
+ uqsub8 r11, r10, r9 ; q1 - q0
+ uqadd8 r6, r8, r6 ; abs (p0-q0)*2 + abs (p1-q1)/2
+ uqsub8 r12, r9, r10 ; q0 - q1
+ uqsub8 r6, r6, r4 ; compare to flimit
+
+ orr r9, r11, r12 ; abs (q1-q0)
+ uqsub8 r8, r9, r2 ; compare to limit
+ uqsub8 r10, r9, r3 ; compare to thresh
+ orr lr, lr, r6
+ orr lr, lr, r8
+
+ mvn r11, #0 ; r11 == -1
+ mov r12, #0
+
+ usub8 lr, r12, lr
+ ldr r9, [sp] ; load the compared result
+ sel lr, r11, r12 ; filter mask: lr
+
+ cmp lr, #0
+ beq vskip_filter ; skip filtering
+
+ ;vp8_hevmask() function
+ ;calculate high edge variance
+
+ sub src, src, pstep, lsl #2 ; move src pointer down by 4 lines
+
+ orr r9, r9, r10
+
+ ldrh r7, [src, #-2]
+ ldrh r8, [src], pstep
+
+ usub8 r9, r12, r9
+ sel r6, r12, r11 ; hev mask: r6
+
+ ;vp9_filter() function
+    ; load source data to r6, r11, r12, lr
+ ldrh r9, [src, #-2]
+ ldrh r10, [src], pstep
+
+ pkhbt r12, r7, r8, lsl #16
+
+ ldrh r7, [src, #-2]
+ ldrh r8, [src], pstep
+
+ pkhbt r11, r9, r10, lsl #16
+
+ ldrh r9, [src, #-2]
+ ldrh r10, [src], pstep
+
+ ; Transpose needs 8 regs(r6 - r12, and lr). Save r6 and lr first
+ str r6, [sp]
+ str lr, [sp, #4]
+
+ pkhbt r6, r7, r8, lsl #16
+ pkhbt lr, r9, r10, lsl #16
+
+ ;transpose r12, r11, r6, lr to r7, r8, r9, r10
+ TRANSPOSE_MATRIX r12, r11, r6, lr, r7, r8, r9, r10
+
+ ;load back hev_mask r6 and filter_mask lr
+ ldr r12, c0x80808080
+ ldr r6, [sp]
+ ldr lr, [sp, #4]
+
+ eor r7, r7, r12 ; p1 offset to convert to a signed value
+ eor r8, r8, r12 ; p0 offset to convert to a signed value
+ eor r9, r9, r12 ; q0 offset to convert to a signed value
+ eor r10, r10, r12 ; q1 offset to convert to a signed value
+
+ str r9, [sp] ; store qs0 temporarily
+ str r8, [sp, #4] ; store ps0 temporarily
+ str r10, [sp, #8] ; store qs1 temporarily
+ str r7, [sp, #12] ; store ps1 temporarily
+
+ qsub8 r7, r7, r10 ; vp9_signed_char_clamp(ps1-qs1)
+ qsub8 r8, r9, r8 ; vp9_signed_char_clamp(vp9_filter + 3 * ( qs0 - ps0))
+
+ and r7, r7, r6 ; vp9_filter (r7) &= hev (r7 : filter)
+
+ qadd8 r7, r7, r8
+ ldr r9, c0x03030303 ; r9 = 3 --modified for vp8
+
+ qadd8 r7, r7, r8
+ ldr r10, c0x04040404
+
+ qadd8 r7, r7, r8
+ ;mvn r11, #0 ; r11 == -1
+
+ and r7, r7, lr ; vp9_filter &= mask
+
+ ;modify code for vp8 -- Filter1 = vp9_filter (r7)
+ qadd8 r8 , r7 , r9 ; Filter2 (r8) = vp9_signed_char_clamp(vp9_filter+3)
+ qadd8 r7 , r7 , r10 ; vp9_filter = vp9_signed_char_clamp(vp9_filter+4)
+
+ mov r9, #0
+ shadd8 r8 , r8 , r9 ; Filter2 >>= 3
+ shadd8 r7 , r7 , r9 ; vp9_filter >>= 3
+ shadd8 r8 , r8 , r9
+ shadd8 r7 , r7 , r9
+ shadd8 lr , r8 , r9 ; lr: filter2
+ shadd8 r7 , r7 , r9 ; r7: filter
+
+ ;usub8 lr, r8, r10 ; s = (s==4)*-1
+ ;sel lr, r11, r9
+ ;usub8 r8, r10, r8
+ ;sel r8, r11, r9
+ ;and r8, r8, lr ; -1 for each element that equals 4 -- r8: s
+
+ ;calculate output
+ ;qadd8 lr, r8, r7 ; u = vp9_signed_char_clamp(s + vp9_filter)
+
+ ldr r8, [sp] ; load qs0
+ ldr r9, [sp, #4] ; load ps0
+
+ ldr r10, c0x01010101
+
+ qsub8 r8, r8, r7 ; u = vp9_signed_char_clamp(qs0 - vp9_filter)
+ qadd8 r9, r9, lr ; u = vp9_signed_char_clamp(ps0 + Filter2)
+ ;end of modification for vp8
+
+ eor r8, r8, r12
+ eor r9, r9, r12
+
+ mov lr, #0
+
+ sadd8 r7, r7, r10
+ shadd8 r7, r7, lr
+
+ ldr r10, [sp, #8] ; load qs1
+ ldr r11, [sp, #12] ; load ps1
+
+ bic r7, r7, r6 ; r7: vp9_filter
+
+ qsub8 r10 , r10, r7 ; u = vp9_signed_char_clamp(qs1 - vp9_filter)
+ qadd8 r11, r11, r7 ; u = vp9_signed_char_clamp(ps1 + vp9_filter)
+ eor r10, r10, r12
+ eor r11, r11, r12
+
+ sub src, src, pstep, lsl #2
+
+ ;we can use TRANSPOSE_MATRIX macro to transpose output - input: q1, q0, p0, p1
+ ;output is b0, b1, b2, b3
+ ;b0: 03 02 01 00
+ ;b1: 13 12 11 10
+ ;b2: 23 22 21 20
+ ;b3: 33 32 31 30
+ ; p1 p0 q0 q1
+ ; (a3 a2 a1 a0)
+ TRANSPOSE_MATRIX r11, r9, r8, r10, r6, r7, r12, lr
+
+ strh r6, [src, #-2] ; store the result
+ mov r6, r6, lsr #16
+ strh r6, [src], pstep
+
+ strh r7, [src, #-2]
+ mov r7, r7, lsr #16
+ strh r7, [src], pstep
+
+ strh r12, [src, #-2]
+ mov r12, r12, lsr #16
+ strh r12, [src], pstep
+
+ strh lr, [src, #-2]
+ mov lr, lr, lsr #16
+ strh lr, [src], pstep
+
+|vskip_filter|
+ sub src, src, #4
+ subs count, count, #1
+
+ ldrne r6, [src], pstep ; load source data
+ ldrne r7, [src], pstep
+ ldrne r8, [src], pstep
+ ldrne lr, [src], pstep
+
+ bne Vnext8
+
+ add sp, sp, #16
+
+ ldmia sp!, {r4 - r11, pc}
+ ENDP ; |vp9_loop_filter_vertical_edge_armv6|
+
+
+
+;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
+|vp9_mbloop_filter_vertical_edge_armv6| PROC
+;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
+ stmdb sp!, {r4 - r11, lr}
+
+ sub src, src, #4 ; move src pointer down by 4
+ ldr count, [sp, #40] ; count for 8-in-parallel
+ ldr r12, [sp, #36] ; load thresh address
+ pld [src, #23] ; preload for next block
+ sub sp, sp, #16 ; create temp buffer
+
+ ldr r6, [src], pstep ; load source data
+ ldrb r4, [r2] ; blimit
+ pld [src, #23]
+ ldr r7, [src], pstep
+ ldrb r2, [r3] ; limit
+ pld [src, #23]
+ ldr r8, [src], pstep
+ orr r4, r4, r4, lsl #8
+ ldrb r3, [r12] ; thresh
+ orr r2, r2, r2, lsl #8
+ pld [src, #23]
+ ldr lr, [src], pstep
+ mov count, count, lsl #1 ; 4-in-parallel
+ orr r4, r4, r4, lsl #16
+ orr r3, r3, r3, lsl #8
+ orr r2, r2, r2, lsl #16
+ orr r3, r3, r3, lsl #16
+
+|MBVnext8|
+ ; vp9_filter_mask() function
+ ; calculate breakout conditions
+ ; transpose the source data for 4-in-parallel operation
+ TRANSPOSE_MATRIX r6, r7, r8, lr, r9, r10, r11, r12
+
+ uqsub8 r7, r9, r10 ; p3 - p2
+ uqsub8 r8, r10, r9 ; p2 - p3
+ uqsub8 r9, r10, r11 ; p2 - p1
+ uqsub8 r10, r11, r10 ; p1 - p2
+ orr r7, r7, r8 ; abs (p3-p2)
+ orr r10, r9, r10 ; abs (p2-p1)
+ uqsub8 lr, r7, r2 ; compare to limit. lr: vp9_filter_mask
+ uqsub8 r10, r10, r2 ; compare to limit
+
+ sub src, src, pstep, lsl #2 ; move src pointer down by 4 lines
+
+ orr lr, lr, r10
+
+ uqsub8 r6, r11, r12 ; p1 - p0
+ uqsub8 r7, r12, r11 ; p0 - p1
+ add src, src, #4 ; move src pointer up by 4
+ orr r6, r6, r7 ; abs (p1-p0)
+ str r11, [sp, #12] ; save p1
+ uqsub8 r10, r6, r2 ; compare to limit
+ uqsub8 r11, r6, r3 ; compare to thresh
+ orr lr, lr, r10
+
+ ; transpose uses 8 regs(r6 - r12 and lr). Need to save reg value now
+ ; transpose the source data for 4-in-parallel operation
+ ldr r6, [src], pstep ; load source data
+ str r11, [sp] ; push r11 to stack
+ ldr r7, [src], pstep
+ str r12, [sp, #4] ; save current reg before load q0 - q3 data
+ ldr r8, [src], pstep
+ str lr, [sp, #8]
+ ldr lr, [src], pstep
+
+
+ TRANSPOSE_MATRIX r6, r7, r8, lr, r9, r10, r11, r12
+
+ ldr lr, [sp, #8] ; load back (f)limit accumulator
+
+ uqsub8 r6, r12, r11 ; q3 - q2
+ uqsub8 r7, r11, r12 ; q2 - q3
+ uqsub8 r12, r11, r10 ; q2 - q1
+ uqsub8 r11, r10, r11 ; q1 - q2
+ orr r6, r6, r7 ; abs (q3-q2)
+ orr r7, r12, r11 ; abs (q2-q1)
+ uqsub8 r6, r6, r2 ; compare to limit
+ uqsub8 r7, r7, r2 ; compare to limit
+ ldr r11, [sp, #4] ; load back p0
+ ldr r12, [sp, #12] ; load back p1
+ orr lr, lr, r6
+ orr lr, lr, r7
+
+ uqsub8 r6, r11, r9 ; p0 - q0
+ uqsub8 r7, r9, r11 ; q0 - p0
+ uqsub8 r8, r12, r10 ; p1 - q1
+ uqsub8 r11, r10, r12 ; q1 - p1
+ orr r6, r6, r7 ; abs (p0-q0)
+ ldr r7, c0x7F7F7F7F
+ orr r8, r8, r11 ; abs (p1-q1)
+ uqadd8 r6, r6, r6 ; abs (p0-q0) * 2
+ and r8, r7, r8, lsr #1 ; abs (p1-q1) / 2
+ uqsub8 r11, r10, r9 ; q1 - q0
+ uqadd8 r6, r8, r6 ; abs (p0-q0)*2 + abs (p1-q1)/2
+ uqsub8 r12, r9, r10 ; q0 - q1
+ uqsub8 r6, r6, r4 ; compare to flimit
+
+ orr r9, r11, r12 ; abs (q1-q0)
+ uqsub8 r8, r9, r2 ; compare to limit
+ uqsub8 r10, r9, r3 ; compare to thresh
+ orr lr, lr, r6
+ orr lr, lr, r8
+
+ mvn r11, #0 ; r11 == -1
+ mov r12, #0
+
+ usub8 lr, r12, lr
+ ldr r9, [sp] ; load the compared result
+ sel lr, r11, r12 ; filter mask: lr
+
+ cmp lr, #0
+ beq mbvskip_filter ; skip filtering
+
+
+
+ ;vp8_hevmask() function
+ ;calculate high edge variance
+
+ sub src, src, pstep, lsl #2 ; move src pointer down by 4 lines
+
+ orr r9, r9, r10
+
+ ldrh r7, [src, #-2]
+ ldrh r8, [src], pstep
+
+ usub8 r9, r12, r9
+ sel r6, r12, r11 ; hev mask: r6
+
+
+ ; vp8_mbfilter() function
+ ; p2, q2 are only needed at the end. Don't need to load them in now.
+ ; Transpose needs 8 regs(r6 - r12, and lr). Save r6 and lr first
+    ; load source data to r6, r11, r12, lr
+ ldrh r9, [src, #-2]
+ ldrh r10, [src], pstep
+
+ pkhbt r12, r7, r8, lsl #16
+
+ ldrh r7, [src, #-2]
+ ldrh r8, [src], pstep
+
+ pkhbt r11, r9, r10, lsl #16
+
+ ldrh r9, [src, #-2]
+ ldrh r10, [src], pstep
+
+ str r6, [sp] ; save r6
+ str lr, [sp, #4] ; save lr
+
+ pkhbt r6, r7, r8, lsl #16
+ pkhbt lr, r9, r10, lsl #16
+
+ ;transpose r12, r11, r6, lr to p1, p0, q0, q1
+ TRANSPOSE_MATRIX r12, r11, r6, lr, r7, r8, r9, r10
+
+ ;load back hev_mask r6 and filter_mask lr
+ ldr r12, c0x80808080
+ ldr r6, [sp]
+ ldr lr, [sp, #4]
+
+ eor r7, r7, r12 ; ps1
+ eor r8, r8, r12 ; ps0
+ eor r9, r9, r12 ; qs0
+ eor r10, r10, r12 ; qs1
+
+ qsub8 r12, r9, r8 ; vp9_signed_char_clamp(vp9_filter + 3 * ( qs0 - ps0))
+ str r7, [sp, #12] ; store ps1 temporarily
+ qsub8 r7, r7, r10 ; vp9_signed_char_clamp(ps1-qs1)
+ str r10, [sp, #8] ; store qs1 temporarily
+ qadd8 r7, r7, r12
+ str r9, [sp] ; store qs0 temporarily
+ qadd8 r7, r7, r12
+ str r8, [sp, #4] ; store ps0 temporarily
+ qadd8 r7, r7, r12 ; vp9_filter: r7
+
+ ldr r10, c0x03030303 ; r10 = 3 --modified for vp8
+ ldr r9, c0x04040404
+ ;mvn r11, #0 ; r11 == -1
+
+ and r7, r7, lr ; vp9_filter &= mask (lr is free)
+
+ mov r12, r7 ; Filter2: r12
+ and r12, r12, r6 ; Filter2 &= hev
+
+ ;modify code for vp8
+ ;save bottom 3 bits so that we round one side +4 and the other +3
+ qadd8 r8 , r12 , r9 ; Filter1 (r8) = vp9_signed_char_clamp(Filter2+4)
+ qadd8 r12 , r12 , r10 ; Filter2 (r12) = vp9_signed_char_clamp(Filter2+3)
+
+ mov r10, #0
+ shadd8 r8 , r8 , r10 ; Filter1 >>= 3
+ shadd8 r12 , r12 , r10 ; Filter2 >>= 3
+ shadd8 r8 , r8 , r10
+ shadd8 r12 , r12 , r10
+ shadd8 r8 , r8 , r10 ; r8: Filter1
+ shadd8 r12 , r12 , r10 ; r12: Filter2
+
+ ldr r9, [sp] ; load qs0
+ ldr r11, [sp, #4] ; load ps0
+
+ qsub8 r9 , r9, r8 ; qs0 = vp9_signed_char_clamp(qs0 - Filter1)
+ qadd8 r11, r11, r12 ; ps0 = vp9_signed_char_clamp(ps0 + Filter2)
+
+ ;save bottom 3 bits so that we round one side +4 and the other +3
+ ;and r8, r12, r10 ; s = Filter2 & 7 (s: r8)
+ ;qadd8 r12 , r12 , r9 ; Filter2 = vp9_signed_char_clamp(Filter2+4)
+ ;mov r10, #0
+ ;shadd8 r12 , r12 , r10 ; Filter2 >>= 3
+ ;usub8 lr, r8, r9 ; s = (s==4)*-1
+ ;sel lr, r11, r10
+ ;shadd8 r12 , r12 , r10
+ ;usub8 r8, r9, r8
+ ;sel r8, r11, r10
+ ;ldr r9, [sp] ; load qs0
+ ;ldr r11, [sp, #4] ; load ps0
+ ;shadd8 r12 , r12 , r10
+ ;and r8, r8, lr ; -1 for each element that equals 4
+ ;qadd8 r10, r8, r12 ; u = vp9_signed_char_clamp(s + Filter2)
+ ;qsub8 r9 , r9, r12 ; qs0 = vp9_signed_char_clamp(qs0 - Filter2)
+ ;qadd8 r11, r11, r10 ; ps0 = vp9_signed_char_clamp(ps0 + u)
+
+ ;end of modification for vp8
+
+ bic r12, r7, r6 ;vp9_filter &= ~hev ( r6 is free)
+ ;mov r12, r7
+
+ ;roughly 3/7th difference across boundary
+ mov lr, #0x1b ; 27
+ mov r7, #0x3f ; 63
+
+ sxtb16 r6, r12
+ sxtb16 r10, r12, ror #8
+ smlabb r8, r6, lr, r7
+ smlatb r6, r6, lr, r7
+ smlabb r7, r10, lr, r7
+ smultb r10, r10, lr
+ ssat r8, #8, r8, asr #7
+ ssat r6, #8, r6, asr #7
+ add r10, r10, #63
+ ssat r7, #8, r7, asr #7
+ ssat r10, #8, r10, asr #7
+
+ ldr lr, c0x80808080
+
+ pkhbt r6, r8, r6, lsl #16
+ pkhbt r10, r7, r10, lsl #16
+ uxtb16 r6, r6
+ uxtb16 r10, r10
+
+ sub src, src, pstep, lsl #2 ; move src pointer down by 4 lines
+
+ orr r10, r6, r10, lsl #8 ; u = vp9_signed_char_clamp((63 + Filter2 * 27)>>7)
+
+ qsub8 r8, r9, r10 ; s = vp9_signed_char_clamp(qs0 - u)
+ qadd8 r10, r11, r10 ; s = vp9_signed_char_clamp(ps0 + u)
+ eor r8, r8, lr ; *oq0 = s^0x80
+ eor r10, r10, lr ; *op0 = s^0x80
+
+ strb r10, [src, #-1] ; store op0 result
+ strb r8, [src], pstep ; store oq0 result
+ mov r10, r10, lsr #8
+ mov r8, r8, lsr #8
+ strb r10, [src, #-1]
+ strb r8, [src], pstep
+ mov r10, r10, lsr #8
+ mov r8, r8, lsr #8
+ strb r10, [src, #-1]
+ strb r8, [src], pstep
+ mov r10, r10, lsr #8
+ mov r8, r8, lsr #8
+ strb r10, [src, #-1]
+ strb r8, [src], pstep
+
+ ;roughly 2/7th difference across boundary
+ mov lr, #0x12 ; 18
+ mov r7, #0x3f ; 63
+
+ sxtb16 r6, r12
+ sxtb16 r10, r12, ror #8
+ smlabb r8, r6, lr, r7
+ smlatb r6, r6, lr, r7
+ smlabb r9, r10, lr, r7
+
+ smlatb r10, r10, lr, r7
+ ssat r8, #8, r8, asr #7
+ ssat r6, #8, r6, asr #7
+ ssat r9, #8, r9, asr #7
+ ssat r10, #8, r10, asr #7
+
+ sub src, src, pstep, lsl #2 ; move src pointer down by 4 lines
+
+ pkhbt r6, r8, r6, lsl #16
+ pkhbt r10, r9, r10, lsl #16
+
+ ldr r9, [sp, #8] ; load qs1
+ ldr r11, [sp, #12] ; load ps1
+ ldr lr, c0x80808080
+
+ uxtb16 r6, r6
+ uxtb16 r10, r10
+
+ add src, src, #2
+
+ orr r10, r6, r10, lsl #8 ; u = vp9_signed_char_clamp((63 + Filter2 * 18)>>7)
+
+ qsub8 r8, r9, r10 ; s = vp9_signed_char_clamp(qs1 - u)
+ qadd8 r10, r11, r10 ; s = vp9_signed_char_clamp(ps1 + u)
+ eor r8, r8, lr ; *oq1 = s^0x80
+ eor r10, r10, lr ; *op1 = s^0x80
+
+ ldrb r11, [src, #-5] ; load p2 for 1/7th difference across boundary
+ strb r10, [src, #-4] ; store op1
+ strb r8, [src, #-1] ; store oq1
+ ldrb r9, [src], pstep ; load q2 for 1/7th difference across boundary
+
+ mov r10, r10, lsr #8
+ mov r8, r8, lsr #8
+
+ ldrb r6, [src, #-5]
+ strb r10, [src, #-4]
+ strb r8, [src, #-1]
+ ldrb r7, [src], pstep
+
+ mov r10, r10, lsr #8
+ mov r8, r8, lsr #8
+ orr r11, r11, r6, lsl #8
+ orr r9, r9, r7, lsl #8
+
+ ldrb r6, [src, #-5]
+ strb r10, [src, #-4]
+ strb r8, [src, #-1]
+ ldrb r7, [src], pstep
+
+ mov r10, r10, lsr #8
+ mov r8, r8, lsr #8
+ orr r11, r11, r6, lsl #16
+ orr r9, r9, r7, lsl #16
+
+ ldrb r6, [src, #-5]
+ strb r10, [src, #-4]
+ strb r8, [src, #-1]
+ ldrb r7, [src], pstep
+ orr r11, r11, r6, lsl #24
+ orr r9, r9, r7, lsl #24
+
+ ;roughly 1/7th difference across boundary
+ eor r9, r9, lr
+ eor r11, r11, lr
+
+ mov lr, #0x9 ; 9
+ mov r7, #0x3f ; 63
+
+ sxtb16 r6, r12
+ sxtb16 r10, r12, ror #8
+ smlabb r8, r6, lr, r7
+ smlatb r6, r6, lr, r7
+ smlabb r12, r10, lr, r7
+ smlatb r10, r10, lr, r7
+ ssat r8, #8, r8, asr #7
+ ssat r6, #8, r6, asr #7
+ ssat r12, #8, r12, asr #7
+ ssat r10, #8, r10, asr #7
+
+ sub src, src, pstep, lsl #2
+
+ pkhbt r6, r8, r6, lsl #16
+ pkhbt r10, r12, r10, lsl #16
+
+ uxtb16 r6, r6
+ uxtb16 r10, r10
+
+ ldr lr, c0x80808080
+
+ orr r10, r6, r10, lsl #8 ; u = vp9_signed_char_clamp((63 + Filter2 * 9)>>7)
+
+ qadd8 r8, r11, r10 ; s = vp9_signed_char_clamp(ps2 + u)
+ qsub8 r10, r9, r10 ; s = vp9_signed_char_clamp(qs2 - u)
+ eor r8, r8, lr ; *op2 = s^0x80
+ eor r10, r10, lr ; *oq2 = s^0x80
+
+ strb r8, [src, #-5] ; store *op2
+ strb r10, [src], pstep ; store *oq2
+ mov r8, r8, lsr #8
+ mov r10, r10, lsr #8
+ strb r8, [src, #-5]
+ strb r10, [src], pstep
+ mov r8, r8, lsr #8
+ mov r10, r10, lsr #8
+ strb r8, [src, #-5]
+ strb r10, [src], pstep
+ mov r8, r8, lsr #8
+ mov r10, r10, lsr #8
+ strb r8, [src, #-5]
+ strb r10, [src], pstep
+
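+; For reference, a hedged C outline of the three weighted taps applied above;
+; names are illustrative, and with f = vp9_filter & ~hev the macroblock filter
+; nudges the three pixel pairs by roughly 3/7, 2/7 and 1/7 of f.
+;
+;   static signed char sclamp8(int t) {
+;     return (signed char)(t < -128 ? -128 : (t > 127 ? 127 : t));
+;   }
+;
+;   static void mb_wide_filter(signed char f,
+;                              signed char *op2, signed char *op1, signed char *op0,
+;                              signed char *oq0, signed char *oq1, signed char *oq2) {
+;     signed char u;
+;     u = sclamp8((63 + f * 27) >> 7);   /* ~3/7 of f */
+;     *oq0 = sclamp8(*oq0 - u);
+;     *op0 = sclamp8(*op0 + u);
+;     u = sclamp8((63 + f * 18) >> 7);   /* ~2/7 of f */
+;     *oq1 = sclamp8(*oq1 - u);
+;     *op1 = sclamp8(*op1 + u);
+;     u = sclamp8((63 + f * 9) >> 7);    /* ~1/7 of f */
+;     *oq2 = sclamp8(*oq2 - u);
+;     *op2 = sclamp8(*op2 + u);
+;   }
+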
+ ;adjust src pointer for next loop
+ sub src, src, #2
+
+|mbvskip_filter|
+ sub src, src, #4
+ subs count, count, #1
+
+ pld [src, #23] ; preload for next block
+ ldrne r6, [src], pstep ; load source data
+ pld [src, #23]
+ ldrne r7, [src], pstep
+ pld [src, #23]
+ ldrne r8, [src], pstep
+ pld [src, #23]
+ ldrne lr, [src], pstep
+
+ bne MBVnext8
+
+ add sp, sp, #16
+
+ ldmia sp!, {r4 - r11, pc}
+ ENDP ; |vp8_mbloop_filter_vertical_edge_armv6|
+
+; Constant Pool
+c0x80808080 DCD 0x80808080
+c0x03030303 DCD 0x03030303
+c0x04040404 DCD 0x04040404
+c0x01010101 DCD 0x01010101
+c0x7F7F7F7F DCD 0x7F7F7F7F
+
+ END
--- /dev/null
+++ b/vp9/common/arm/armv6/recon_v6.asm
@@ -1,0 +1,281 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+ EXPORT |vp8_recon_b_armv6|
+ EXPORT |vp8_recon2b_armv6|
+ EXPORT |vp8_recon4b_armv6|
+
+ AREA |.text|, CODE, READONLY ; name this block of code
+prd RN r0
+dif RN r1
+dst RN r2
+stride RN r3
+
+;void recon_b(unsigned char *pred_ptr, short *diff_ptr, unsigned char *dst_ptr, int stride)
+; R0 char* pred_ptr
+; R1 short * dif_ptr
+; R2 char * dst_ptr
+; R3 int stride
+
+; Description:
+; Loop through the block adding the Pred and Diff together. Clamp and then
+; store back into the Dst.
+
+; Restrictions :
+; all buffers are expected to be 4 byte aligned coming in and
+; going out.
+;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
+;
+;
+;
+;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
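+; A hypothetical C reference of the operation described above (pred + diff,
+; clamped to 0..255); the 16-byte prediction pitch and 16-short diff pitch are
+; taken from the loads/increments below, and all names are illustrative only.
+;
+;   static unsigned char clamp_u8(int v) {
+;     return (unsigned char)(v < 0 ? 0 : (v > 255 ? 255 : v));
+;   }
+;
+;   static void recon_b_ref(const unsigned char *pred, const short *diff,
+;                           unsigned char *dst, int stride) {
+;     int r, c;
+;     for (r = 0; r < 4; r++) {
+;       for (c = 0; c < 4; c++)
+;         dst[c] = clamp_u8(pred[c] + diff[c]);
+;       pred += 16;   /* prediction buffer pitch: 16 bytes */
+;       diff += 16;   /* diff buffer pitch: 16 shorts      */
+;       dst  += stride;
+;     }
+;   }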
+|vp8_recon_b_armv6| PROC
+ stmdb sp!, {r4 - r9, lr}
+
+ ;0, 1, 2, 3
+ ldr r4, [prd], #16 ; 3 | 2 | 1 | 0
+ ldr r6, [dif, #0] ; 1 | 0
+ ldr r7, [dif, #4] ; 3 | 2
+
+ pkhbt r8, r6, r7, lsl #16 ; 2 | 0
+ pkhtb r9, r7, r6, asr #16 ; 3 | 1
+
+ uxtab16 r8, r8, r4 ; 2 | 0 + 3 | 2 | 2 | 0
+ uxtab16 r9, r9, r4, ror #8 ; 3 | 1 + 0 | 3 | 2 | 1
+
+ usat16 r8, #8, r8
+ usat16 r9, #8, r9
+ add dif, dif, #32
+ orr r8, r8, r9, lsl #8
+
+ str r8, [dst], stride
+
+ ;0, 1, 2, 3
+ ldr r4, [prd], #16 ; 3 | 2 | 1 | 0
+;; ldr r6, [dif, #8] ; 1 | 0
+;; ldr r7, [dif, #12] ; 3 | 2
+ ldr r6, [dif, #0] ; 1 | 0
+ ldr r7, [dif, #4] ; 3 | 2
+
+ pkhbt r8, r6, r7, lsl #16 ; 2 | 0
+ pkhtb r9, r7, r6, asr #16 ; 3 | 1
+
+ uxtab16 r8, r8, r4 ; 2 | 0 + 3 | 2 | 2 | 0
+ uxtab16 r9, r9, r4, ror #8 ; 3 | 1 + 0 | 3 | 2 | 1
+
+ usat16 r8, #8, r8
+ usat16 r9, #8, r9
+ add dif, dif, #32
+ orr r8, r8, r9, lsl #8
+
+ str r8, [dst], stride
+
+ ;0, 1, 2, 3
+ ldr r4, [prd], #16 ; 3 | 2 | 1 | 0
+;; ldr r6, [dif, #16] ; 1 | 0
+;; ldr r7, [dif, #20] ; 3 | 2
+ ldr r6, [dif, #0] ; 1 | 0
+ ldr r7, [dif, #4] ; 3 | 2
+
+ pkhbt r8, r6, r7, lsl #16 ; 2 | 0
+ pkhtb r9, r7, r6, asr #16 ; 3 | 1
+
+ uxtab16 r8, r8, r4 ; 2 | 0 + 3 | 2 | 2 | 0
+ uxtab16 r9, r9, r4, ror #8 ; 3 | 1 + 0 | 3 | 2 | 1
+
+ usat16 r8, #8, r8
+ usat16 r9, #8, r9
+ add dif, dif, #32
+ orr r8, r8, r9, lsl #8
+
+ str r8, [dst], stride
+
+ ;0, 1, 2, 3
+ ldr r4, [prd], #16 ; 3 | 2 | 1 | 0
+;; ldr r6, [dif, #24] ; 1 | 0
+;; ldr r7, [dif, #28] ; 3 | 2
+ ldr r6, [dif, #0] ; 1 | 0
+ ldr r7, [dif, #4] ; 3 | 2
+
+ pkhbt r8, r6, r7, lsl #16 ; 2 | 0
+ pkhtb r9, r7, r6, asr #16 ; 3 | 1
+
+ uxtab16 r8, r8, r4 ; 2 | 0 + 3 | 2 | 2 | 0
+ uxtab16 r9, r9, r4, ror #8 ; 3 | 1 + 0 | 3 | 2 | 1
+
+ usat16 r8, #8, r8
+ usat16 r9, #8, r9
+ orr r8, r8, r9, lsl #8
+
+ str r8, [dst], stride
+
+ ldmia sp!, {r4 - r9, pc}
+
+ ENDP ; |recon_b|
+
+;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
+;
+;
+;
+;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
+; R0 char *pred_ptr
+; R1 short *dif_ptr
+; R2 char *dst_ptr
+; R3 int stride
+|vp8_recon4b_armv6| PROC
+ stmdb sp!, {r4 - r9, lr}
+
+ mov lr, #4
+
+recon4b_loop
+ ;0, 1, 2, 3
+ ldr r4, [prd], #4 ; 3 | 2 | 1 | 0
+ ldr r6, [dif, #0] ; 1 | 0
+ ldr r7, [dif, #4] ; 3 | 2
+
+ pkhbt r8, r6, r7, lsl #16 ; 2 | 0
+ pkhtb r9, r7, r6, asr #16 ; 3 | 1
+
+ uxtab16 r8, r8, r4 ; 2 | 0 + 3 | 2 | 2 | 0
+ uxtab16 r9, r9, r4, ror #8 ; 3 | 1 + 0 | 3 | 2 | 1
+
+ usat16 r8, #8, r8
+ usat16 r9, #8, r9
+ orr r8, r8, r9, lsl #8
+
+ str r8, [dst]
+
+ ;4, 5, 6, 7
+ ldr r4, [prd], #4
+;; ldr r6, [dif, #32]
+;; ldr r7, [dif, #36]
+ ldr r6, [dif, #8]
+ ldr r7, [dif, #12]
+
+ pkhbt r8, r6, r7, lsl #16
+ pkhtb r9, r7, r6, asr #16
+
+ uxtab16 r8, r8, r4
+ uxtab16 r9, r9, r4, ror #8
+ usat16 r8, #8, r8
+ usat16 r9, #8, r9
+ orr r8, r8, r9, lsl #8
+
+ str r8, [dst, #4]
+
+ ;8, 9, 10, 11
+ ldr r4, [prd], #4
+;; ldr r6, [dif, #64]
+;; ldr r7, [dif, #68]
+ ldr r6, [dif, #16]
+ ldr r7, [dif, #20]
+
+ pkhbt r8, r6, r7, lsl #16
+ pkhtb r9, r7, r6, asr #16
+
+ uxtab16 r8, r8, r4
+ uxtab16 r9, r9, r4, ror #8
+ usat16 r8, #8, r8
+ usat16 r9, #8, r9
+ orr r8, r8, r9, lsl #8
+
+ str r8, [dst, #8]
+
+ ;12, 13, 14, 15
+ ldr r4, [prd], #4
+;; ldr r6, [dif, #96]
+;; ldr r7, [dif, #100]
+ ldr r6, [dif, #24]
+ ldr r7, [dif, #28]
+
+ pkhbt r8, r6, r7, lsl #16
+ pkhtb r9, r7, r6, asr #16
+
+ uxtab16 r8, r8, r4
+ uxtab16 r9, r9, r4, ror #8
+ usat16 r8, #8, r8
+ usat16 r9, #8, r9
+ orr r8, r8, r9, lsl #8
+
+ str r8, [dst, #12]
+
+ add dst, dst, stride
+;; add dif, dif, #8
+ add dif, dif, #32
+
+ subs lr, lr, #1
+ bne recon4b_loop
+
+ ldmia sp!, {r4 - r9, pc}
+
+ ENDP ; |Recon4B|
+
+;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
+;
+;
+;
+;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
+; R0 char *pred_ptr
+; R1 short *dif_ptr
+; R2 char *dst_ptr
+; R3 int stride
+|vp8_recon2b_armv6| PROC
+ stmdb sp!, {r4 - r9, lr}
+
+ mov lr, #4
+
+recon2b_loop
+ ;0, 1, 2, 3
+ ldr r4, [prd], #4
+ ldr r6, [dif, #0]
+ ldr r7, [dif, #4]
+
+ pkhbt r8, r6, r7, lsl #16
+ pkhtb r9, r7, r6, asr #16
+
+ uxtab16 r8, r8, r4
+ uxtab16 r9, r9, r4, ror #8
+ usat16 r8, #8, r8
+ usat16 r9, #8, r9
+ orr r8, r8, r9, lsl #8
+
+ str r8, [dst]
+
+ ;4, 5, 6, 7
+ ldr r4, [prd], #4
+;; ldr r6, [dif, #32]
+;; ldr r7, [dif, #36]
+ ldr r6, [dif, #8]
+ ldr r7, [dif, #12]
+
+ pkhbt r8, r6, r7, lsl #16
+ pkhtb r9, r7, r6, asr #16
+
+ uxtab16 r8, r8, r4
+ uxtab16 r9, r9, r4, ror #8
+ usat16 r8, #8, r8
+ usat16 r9, #8, r9
+ orr r8, r8, r9, lsl #8
+
+ str r8, [dst, #4]
+
+ add dst, dst, stride
+;; add dif, dif, #8
+ add dif, dif, #16
+
+ subs lr, lr, #1
+ bne recon2b_loop
+
+ ldmia sp!, {r4 - r9, pc}
+
+ ENDP ; |Recon2B|
+
+ END
--- /dev/null
+++ b/vp9/common/arm/armv6/simpleloopfilter_v6.asm
@@ -1,0 +1,286 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+ EXPORT |vp9_loop_filter_simple_horizontal_edge_armv6|
+ EXPORT |vp9_loop_filter_simple_vertical_edge_armv6|
+
+ AREA |.text|, CODE, READONLY ; name this block of code
+
+ MACRO
+ TRANSPOSE_MATRIX $a0, $a1, $a2, $a3, $b0, $b1, $b2, $b3
+ ; input: $a0, $a1, $a2, $a3; output: $b0, $b1, $b2, $b3
+ ; a0: 03 02 01 00
+ ; a1: 13 12 11 10
+ ; a2: 23 22 21 20
+ ; a3: 33 32 31 30
+ ; b3 b2 b1 b0
+
+ uxtb16 $b1, $a1 ; xx 12 xx 10
+ uxtb16 $b0, $a0 ; xx 02 xx 00
+ uxtb16 $b3, $a3 ; xx 32 xx 30
+ uxtb16 $b2, $a2 ; xx 22 xx 20
+ orr $b1, $b0, $b1, lsl #8 ; 12 02 10 00
+ orr $b3, $b2, $b3, lsl #8 ; 32 22 30 20
+
+ uxtb16 $a1, $a1, ror #8 ; xx 13 xx 11
+ uxtb16 $a3, $a3, ror #8 ; xx 33 xx 31
+ uxtb16 $a0, $a0, ror #8 ; xx 03 xx 01
+ uxtb16 $a2, $a2, ror #8 ; xx 23 xx 21
+ orr $a0, $a0, $a1, lsl #8 ; 13 03 11 01
+ orr $a2, $a2, $a3, lsl #8 ; 33 23 31 21
+
+ pkhtb $b2, $b3, $b1, asr #16 ; 32 22 12 02 -- p1
+ pkhbt $b0, $b1, $b3, lsl #16 ; 30 20 10 00 -- p3
+
+ pkhtb $b3, $a2, $a0, asr #16 ; 33 23 13 03 -- p0
+ pkhbt $b1, $a0, $a2, lsl #16 ; 31 21 11 01 -- p2
+ MEND
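+
+; In plain C terms the macro above amounts to a 4x4 byte transpose, roughly as
+; sketched here (illustration only, not part of the patch):
+;
+;   static void transpose_4x4_bytes(const unsigned char in[4][4],
+;                                   unsigned char out[4][4]) {
+;     int r, c;
+;     for (r = 0; r < 4; r++)
+;       for (c = 0; c < 4; c++)
+;         out[c][r] = in[r][c];
+;   }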
+
+
+
+src RN r0
+pstep RN r1
+
+;r0 unsigned char *src_ptr,
+;r1 int src_pixel_step,
+;r2 const char *blimit
+
+;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
+|vp9_loop_filter_simple_horizontal_edge_armv6| PROC
+;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
+ stmdb sp!, {r4 - r11, lr}
+
+ ldrb r12, [r2] ; blimit
+ ldr r3, [src, -pstep, lsl #1] ; p1
+ ldr r4, [src, -pstep] ; p0
+ ldr r5, [src] ; q0
+ ldr r6, [src, pstep] ; q1
+ orr r12, r12, r12, lsl #8 ; blimit
+ ldr r2, c0x80808080
+ orr r12, r12, r12, lsl #16 ; blimit
+    mov     r9, #4                  ; loop counter: 16 pixels, 4 at a time
+ mov lr, #0 ; need 0 in a couple places
+
+|simple_hnext8|
+ ; vp8_simple_filter_mask()
+
+ uqsub8 r7, r3, r6 ; p1 - q1
+ uqsub8 r8, r6, r3 ; q1 - p1
+ uqsub8 r10, r4, r5 ; p0 - q0
+ uqsub8 r11, r5, r4 ; q0 - p0
+ orr r8, r8, r7 ; abs(p1 - q1)
+ orr r10, r10, r11 ; abs(p0 - q0)
+ uqadd8 r10, r10, r10 ; abs(p0 - q0) * 2
+    uhadd8  r8, r8, lr                  ; abs(p1 - q1) >> 1
+ uqadd8 r10, r10, r8 ; abs(p0 - q0)*2 + abs(p1 - q1)/2
+ mvn r8, #0
+    usub8   r10, r12, r10               ; compare to blimit. usub8 sets GE flags
+ sel r10, r8, lr ; filter mask: F or 0
+ cmp r10, #0
+ beq simple_hskip_filter ; skip filtering if all masks are 0x00
+
+ ;vp8_simple_filter()
+
+ eor r3, r3, r2 ; p1 offset to convert to a signed value
+ eor r6, r6, r2 ; q1 offset to convert to a signed value
+ eor r4, r4, r2 ; p0 offset to convert to a signed value
+ eor r5, r5, r2 ; q0 offset to convert to a signed value
+
+ qsub8 r3, r3, r6 ; vp9_filter = p1 - q1
+ qsub8 r6, r5, r4 ; q0 - p0
+ qadd8 r3, r3, r6 ; += q0 - p0
+ ldr r7, c0x04040404
+ qadd8 r3, r3, r6 ; += q0 - p0
+ ldr r8, c0x03030303
+ qadd8 r3, r3, r6 ; vp9_filter = p1-q1 + 3*(q0-p0))
+ ;STALL
+ and r3, r3, r10 ; vp9_filter &= mask
+
+ qadd8 r7 , r3 , r7 ; Filter1 = vp9_filter + 4
+ qadd8 r8 , r3 , r8 ; Filter2 = vp9_filter + 3
+
+ shadd8 r7 , r7 , lr
+ shadd8 r8 , r8 , lr
+ shadd8 r7 , r7 , lr
+ shadd8 r8 , r8 , lr
+ shadd8 r7 , r7 , lr ; Filter1 >>= 3
+ shadd8 r8 , r8 , lr ; Filter2 >>= 3
+
+ qsub8 r5 ,r5, r7 ; u = q0 - Filter1
+ qadd8 r4, r4, r8 ; u = p0 + Filter2
+ eor r5, r5, r2 ; *oq0 = u^0x80
+ str r5, [src] ; store oq0 result
+ eor r4, r4, r2 ; *op0 = u^0x80
+ str r4, [src, -pstep] ; store op0 result
+
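+; A rough C model of the whole simple-filter path above, assuming the usual
+; 0x80-offset signed arithmetic; the assembly saturates each intermediate add
+; while this sketch clamps once, and all names here are illustrative.
+;
+;   #include <stdlib.h>
+;
+;   static signed char sclamp(int t) {
+;     return (signed char)(t < -128 ? -128 : (t > 127 ? 127 : t));
+;   }
+;
+;   static void simple_filter_ref(unsigned char blimit,
+;                                 unsigned char *p1, unsigned char *p0,
+;                                 unsigned char *q0, unsigned char *q1) {
+;     int mask = (abs(*p0 - *q0) * 2 + abs(*p1 - *q1) / 2 <= blimit) ? -1 : 0;
+;     signed char sp1 = (signed char)(*p1 ^ 0x80), sq1 = (signed char)(*q1 ^ 0x80);
+;     signed char sp0 = (signed char)(*p0 ^ 0x80), sq0 = (signed char)(*q0 ^ 0x80);
+;     signed char f  = (signed char)(sclamp(sp1 - sq1 + 3 * (sq0 - sp0)) & mask);
+;     signed char f1 = sclamp(f + 4) >> 3;
+;     signed char f2 = sclamp(f + 3) >> 3;
+;     *q0 = (unsigned char)(sclamp(sq0 - f1) ^ 0x80);
+;     *p0 = (unsigned char)(sclamp(sp0 + f2) ^ 0x80);
+;   }
+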
+|simple_hskip_filter|
+ subs r9, r9, #1
+ addne src, src, #4 ; next row
+
+ ldrne r3, [src, -pstep, lsl #1] ; p1
+ ldrne r4, [src, -pstep] ; p0
+ ldrne r5, [src] ; q0
+ ldrne r6, [src, pstep] ; q1
+
+ bne simple_hnext8
+
+ ldmia sp!, {r4 - r11, pc}
+ ENDP ; |vp9_loop_filter_simple_horizontal_edge_armv6|
+
+
+;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
+|vp9_loop_filter_simple_vertical_edge_armv6| PROC
+;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
+ stmdb sp!, {r4 - r11, lr}
+
+ ldrb r12, [r2] ; r12: blimit
+ ldr r2, c0x80808080
+ orr r12, r12, r12, lsl #8
+
+    ; load source data to r7, r8, r9, r10
+ ldrh r3, [src, #-2]
+ pld [src, #23] ; preload for next block
+ ldrh r4, [src], pstep
+ orr r12, r12, r12, lsl #16
+
+ ldrh r5, [src, #-2]
+ pld [src, #23]
+ ldrh r6, [src], pstep
+
+ pkhbt r7, r3, r4, lsl #16
+
+ ldrh r3, [src, #-2]
+ pld [src, #23]
+ ldrh r4, [src], pstep
+
+ pkhbt r8, r5, r6, lsl #16
+
+ ldrh r5, [src, #-2]
+ pld [src, #23]
+ ldrh r6, [src], pstep
+    mov     r11, #4                 ; loop counter: 16 pixels, 4 at a time
+
+|simple_vnext8|
+ ; vp8_simple_filter_mask() function
+ pkhbt r9, r3, r4, lsl #16
+ pkhbt r10, r5, r6, lsl #16
+
+ ;transpose r7, r8, r9, r10 to r3, r4, r5, r6
+ TRANSPOSE_MATRIX r7, r8, r9, r10, r3, r4, r5, r6
+
+ uqsub8 r7, r3, r6 ; p1 - q1
+ uqsub8 r8, r6, r3 ; q1 - p1
+ uqsub8 r9, r4, r5 ; p0 - q0
+ uqsub8 r10, r5, r4 ; q0 - p0
+ orr r7, r7, r8 ; abs(p1 - q1)
+ orr r9, r9, r10 ; abs(p0 - q0)
+ mov r8, #0
+ uqadd8 r9, r9, r9 ; abs(p0 - q0) * 2
+ uhadd8 r7, r7, r8 ; abs(p1 - q1) / 2
+ uqadd8 r7, r7, r9 ; abs(p0 - q0)*2 + abs(p1 - q1)/2
+ mvn r10, #0 ; r10 == -1
+
+    usub8   r7, r12, r7                 ; compare to blimit
+ sel lr, r10, r8 ; filter mask
+
+ cmp lr, #0
+ beq simple_vskip_filter ; skip filtering
+
+ ;vp8_simple_filter() function
+ eor r3, r3, r2 ; p1 offset to convert to a signed value
+ eor r6, r6, r2 ; q1 offset to convert to a signed value
+ eor r4, r4, r2 ; p0 offset to convert to a signed value
+ eor r5, r5, r2 ; q0 offset to convert to a signed value
+
+ qsub8 r3, r3, r6 ; vp9_filter = p1 - q1
+ qsub8 r6, r5, r4 ; q0 - p0
+
+ qadd8 r3, r3, r6 ; vp9_filter += q0 - p0
+ ldr r9, c0x03030303 ; r9 = 3
+
+ qadd8 r3, r3, r6 ; vp9_filter += q0 - p0
+ ldr r7, c0x04040404
+
+ qadd8 r3, r3, r6 ; vp9_filter = p1-q1 + 3*(q0-p0))
+ ;STALL
+ and r3, r3, lr ; vp9_filter &= mask
+
+ qadd8 r9 , r3 , r9 ; Filter2 = vp9_filter + 3
+ qadd8 r3 , r3 , r7 ; Filter1 = vp9_filter + 4
+
+ shadd8 r9 , r9 , r8
+ shadd8 r3 , r3 , r8
+ shadd8 r9 , r9 , r8
+ shadd8 r3 , r3 , r8
+ shadd8 r9 , r9 , r8 ; Filter2 >>= 3
+ shadd8 r3 , r3 , r8 ; Filter1 >>= 3
+
+ ;calculate output
+ sub src, src, pstep, lsl #2
+
+ qadd8 r4, r4, r9 ; u = p0 + Filter2
+ qsub8 r5, r5, r3 ; u = q0 - Filter1
+ eor r4, r4, r2 ; *op0 = u^0x80
+ eor r5, r5, r2 ; *oq0 = u^0x80
+
+ strb r4, [src, #-1] ; store the result
+ mov r4, r4, lsr #8
+ strb r5, [src], pstep
+ mov r5, r5, lsr #8
+
+ strb r4, [src, #-1]
+ mov r4, r4, lsr #8
+ strb r5, [src], pstep
+ mov r5, r5, lsr #8
+
+ strb r4, [src, #-1]
+ mov r4, r4, lsr #8
+ strb r5, [src], pstep
+ mov r5, r5, lsr #8
+
+ strb r4, [src, #-1]
+ strb r5, [src], pstep
+
+|simple_vskip_filter|
+ subs r11, r11, #1
+
+    ; load source data to r7, r8, r9, r10
+ ldrneh r3, [src, #-2]
+ pld [src, #23] ; preload for next block
+ ldrneh r4, [src], pstep
+
+ ldrneh r5, [src, #-2]
+ pld [src, #23]
+ ldrneh r6, [src], pstep
+
+ pkhbt r7, r3, r4, lsl #16
+
+ ldrneh r3, [src, #-2]
+ pld [src, #23]
+ ldrneh r4, [src], pstep
+
+ pkhbt r8, r5, r6, lsl #16
+
+ ldrneh r5, [src, #-2]
+ pld [src, #23]
+ ldrneh r6, [src], pstep
+
+ bne simple_vnext8
+
+ ldmia sp!, {r4 - r11, pc}
+ ENDP ; |vp9_loop_filter_simple_vertical_edge_armv6|
+
+; Constant Pool
+c0x80808080 DCD 0x80808080
+c0x03030303 DCD 0x03030303
+c0x04040404 DCD 0x04040404
+
+ END
--- /dev/null
+++ b/vp9/common/arm/armv6/sixtappredict8x4_v6.asm
@@ -1,0 +1,273 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+ EXPORT |vp8_sixtap_predict8x4_armv6|
+
+ AREA |.text|, CODE, READONLY ; name this block of code
+;-------------------------------------
+; r0 unsigned char *src_ptr,
+; r1 int src_pixels_per_line,
+; r2 int xoffset,
+; r3 int yoffset,
+; stack unsigned char *dst_ptr,
+; stack int dst_pitch
+;-------------------------------------
+;note: In the first pass, the result is stored transposed (8 lines x 9 columns) on the stack. Temporary stack size is 184 bytes.
+;Line width is 20 bytes, i.e. 9 shorts plus 2 bytes of padding for 4-byte alignment. In the second pass, data is loaded from the stack
+;and the result is written back transposed.
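+;
+; For orientation, a loose C illustration (not the shipped code path) of the
+; separable two-pass 6-tap filter this routine implements for an 8x4 block; the
+; transposed staging buffer described in the note is omitted and all names are
+; illustrative.
+;
+;   static int clamp255(int v) { return v < 0 ? 0 : (v > 255 ? 255 : v); }
+;
+;   static void sixtap_8x4_ref(const unsigned char *src, int src_pitch,
+;                              unsigned char *dst, int dst_pitch,
+;                              const short hfilter[6], const short vfilter[6]) {
+;     short tmp[9][8];              /* 4 output rows + 5 extra rows for the 6 taps */
+;     int r, c, k, sum;
+;
+;     src -= 2 * src_pitch + 2;     /* back up to the first tap position */
+;     for (r = 0; r < 9; r++)
+;       for (c = 0; c < 8; c++) {
+;         sum = 0;
+;         for (k = 0; k < 6; k++)
+;           sum += src[r * src_pitch + c + k] * hfilter[k];
+;         tmp[r][c] = (short)clamp255((sum + 64) >> 7);   /* round_shift_and_clamp */
+;       }
+;
+;     for (r = 0; r < 4; r++)
+;       for (c = 0; c < 8; c++) {
+;         sum = 0;
+;         for (k = 0; k < 6; k++)
+;           sum += tmp[r + k][c] * vfilter[k];
+;         dst[r * dst_pitch + c] = (unsigned char)clamp255((sum + 64) >> 7);
+;       }
+;   }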
+|vp8_sixtap_predict8x4_armv6| PROC
+ stmdb sp!, {r4 - r11, lr}
+ str r3, [sp, #-184]! ;reserve space on stack for temporary storage, store yoffset
+
+ cmp r2, #0 ;skip first_pass filter if xoffset=0
+ add lr, sp, #4 ;point to temporary buffer
+ beq skip_firstpass_filter
+
+;first-pass filter
+ adr r12, filter8_coeff
+ sub r0, r0, r1, lsl #1
+
+    add     r3, r1, #10                 ; preload next row
+ pld [r0, r3]
+
+ add r2, r12, r2, lsl #4 ;calculate filter location
+    add     r0, r0, #3                  ;adjust src only for loading convenience
+
+ ldr r3, [r2] ; load up packed filter coefficients
+ ldr r4, [r2, #4]
+ ldr r5, [r2, #8]
+
+ mov r2, #0x90000 ; height=9 is top part of counter
+
+ sub r1, r1, #8
+
+|first_pass_hloop_v6|
+ ldrb r6, [r0, #-5] ; load source data
+ ldrb r7, [r0, #-4]
+ ldrb r8, [r0, #-3]
+ ldrb r9, [r0, #-2]
+ ldrb r10, [r0, #-1]
+
+ orr r2, r2, #0x4 ; construct loop counter. width=8=4x2
+
+ pkhbt r6, r6, r7, lsl #16 ; r7 | r6
+ pkhbt r7, r7, r8, lsl #16 ; r8 | r7
+
+ pkhbt r8, r8, r9, lsl #16 ; r9 | r8
+ pkhbt r9, r9, r10, lsl #16 ; r10 | r9
+
+|first_pass_wloop_v6|
+ smuad r11, r6, r3 ; vp9_filter[0], vp9_filter[1]
+ smuad r12, r7, r3
+
+ ldrb r6, [r0], #1
+
+ smlad r11, r8, r4, r11 ; vp9_filter[2], vp9_filter[3]
+ ldrb r7, [r0], #1
+ smlad r12, r9, r4, r12
+
+ pkhbt r10, r10, r6, lsl #16 ; r10 | r9
+ pkhbt r6, r6, r7, lsl #16 ; r11 | r10
+ smlad r11, r10, r5, r11 ; vp9_filter[4], vp9_filter[5]
+ smlad r12, r6, r5, r12
+
+ sub r2, r2, #1
+
+ add r11, r11, #0x40 ; round_shift_and_clamp
+ tst r2, #0xff ; test loop counter
+ usat r11, #8, r11, asr #7
+ add r12, r12, #0x40
+    strh    r11, [lr], #20              ; result is transposed and stored
+ usat r12, #8, r12, asr #7
+
+ strh r12, [lr], #20
+
+ movne r11, r6
+ movne r12, r7
+
+ movne r6, r8
+ movne r7, r9
+ movne r8, r10
+ movne r9, r11
+ movne r10, r12
+
+ bne first_pass_wloop_v6
+
+ ;;add r9, ppl, #30 ; attempt to load 2 adjacent cache lines
+ ;;IF ARCHITECTURE=6
+ ;pld [src, ppl]
+ ;;pld [src, r9]
+ ;;ENDIF
+
+ subs r2, r2, #0x10000
+
+ sub lr, lr, #158
+
+ add r0, r0, r1 ; move to next input line
+
+    add     r11, r1, #18                ; preload next row. add back block width (=8), which was subtracted earlier
+ pld [r0, r11]
+
+ bne first_pass_hloop_v6
+
+;second pass filter
+secondpass_filter
+ ldr r3, [sp], #4 ; load back yoffset
+ ldr r0, [sp, #216] ; load dst address from stack 180+36
+ ldr r1, [sp, #220] ; load dst stride from stack 180+40
+
+ cmp r3, #0
+ beq skip_secondpass_filter
+
+ adr r12, filter8_coeff
+ add lr, r12, r3, lsl #4 ;calculate filter location
+
+ mov r2, #0x00080000
+
+ ldr r3, [lr] ; load up packed filter coefficients
+ ldr r4, [lr, #4]
+ ldr r5, [lr, #8]
+
+ pkhbt r12, r4, r3 ; pack the filter differently
+ pkhbt r11, r5, r4
+
+second_pass_hloop_v6
+ ldr r6, [sp] ; load the data
+ ldr r7, [sp, #4]
+
+ orr r2, r2, #2 ; loop counter
+
+second_pass_wloop_v6
+ smuad lr, r3, r6 ; apply filter
+ smulbt r10, r3, r6
+
+ ldr r8, [sp, #8]
+
+ smlad lr, r4, r7, lr
+ smladx r10, r12, r7, r10
+
+ ldrh r9, [sp, #12]
+
+ smlad lr, r5, r8, lr
+ smladx r10, r11, r8, r10
+
+ add sp, sp, #4
+ smlatb r10, r5, r9, r10
+
+ sub r2, r2, #1
+
+ add lr, lr, #0x40 ; round_shift_and_clamp
+ tst r2, #0xff
+ usat lr, #8, lr, asr #7
+ add r10, r10, #0x40
+ strb lr, [r0], r1 ; the result is transposed back and stored
+ usat r10, #8, r10, asr #7
+
+ strb r10, [r0],r1
+
+ movne r6, r7
+ movne r7, r8
+
+ bne second_pass_wloop_v6
+
+ subs r2, r2, #0x10000
+    add     sp, sp, #12                 ; update src pointer for next loop (20-8)
+ sub r0, r0, r1, lsl #2
+ add r0, r0, #1
+
+ bne second_pass_hloop_v6
+
+ add sp, sp, #20
+ ldmia sp!, {r4 - r11, pc}
+
+;--------------------
+skip_firstpass_filter
+ sub r0, r0, r1, lsl #1
+ sub r1, r1, #8
+ mov r2, #9
+
+skip_firstpass_hloop
+ ldrb r4, [r0], #1 ; load data
+ subs r2, r2, #1
+ ldrb r5, [r0], #1
+    strh    r4, [lr], #20               ; store it to intermediate buffer
+ ldrb r6, [r0], #1 ; load data
+ strh r5, [lr], #20
+ ldrb r7, [r0], #1
+ strh r6, [lr], #20
+ ldrb r8, [r0], #1
+ strh r7, [lr], #20
+ ldrb r9, [r0], #1
+ strh r8, [lr], #20
+ ldrb r10, [r0], #1
+ strh r9, [lr], #20
+ ldrb r11, [r0], #1
+ strh r10, [lr], #20
+ add r0, r0, r1 ; move to next input line
+ strh r11, [lr], #20
+
+ sub lr, lr, #158 ; move over to next column
+ bne skip_firstpass_hloop
+
+ b secondpass_filter
+
+;--------------------
+skip_secondpass_filter
+ mov r2, #8
+ add sp, sp, #4 ;start from src[0] instead of src[-2]
+
+skip_secondpass_hloop
+ ldr r6, [sp], #4
+ subs r2, r2, #1
+ ldr r8, [sp], #4
+
+ mov r7, r6, lsr #16 ; unpack
+ strb r6, [r0], r1
+ mov r9, r8, lsr #16
+ strb r7, [r0], r1
+ add sp, sp, #12 ; 20-8
+ strb r8, [r0], r1
+ strb r9, [r0], r1
+
+ sub r0, r0, r1, lsl #2
+ add r0, r0, #1
+
+ bne skip_secondpass_hloop
+
+ add sp, sp, #16 ; 180 - (160 +4)
+
+ ldmia sp!, {r4 - r11, pc}
+
+ ENDP
+
+;-----------------
+;One word is reserved per coefficient pair. The label filter8_coeff can be used to access the data.
+;Data addresses: filter8_coeff, filter8_coeff+4, filter8_coeff+8 ...
+filter8_coeff
+ DCD 0x00000000, 0x00000080, 0x00000000, 0x00000000
+ DCD 0xfffa0000, 0x000c007b, 0x0000ffff, 0x00000000
+ DCD 0xfff50002, 0x0024006c, 0x0001fff8, 0x00000000
+ DCD 0xfff70000, 0x0032005d, 0x0000fffa, 0x00000000
+ DCD 0xfff00003, 0x004d004d, 0x0003fff0, 0x00000000
+ DCD 0xfffa0000, 0x005d0032, 0x0000fff7, 0x00000000
+ DCD 0xfff80001, 0x006c0024, 0x0002fff5, 0x00000000
+ DCD 0xffff0000, 0x007b000c, 0x0000fffa, 0x00000000
+
+ ;DCD 0, 0, 128, 0, 0, 0
+ ;DCD 0, -6, 123, 12, -1, 0
+ ;DCD 2, -11, 108, 36, -8, 1
+ ;DCD 0, -9, 93, 50, -6, 0
+ ;DCD 3, -16, 77, 77, -16, 3
+ ;DCD 0, -6, 50, 93, -9, 0
+ ;DCD 1, -8, 36, 108, -11, 2
+ ;DCD 0, -1, 12, 123, -6, 0
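+
+; The packed rows relate to the commented values above as two signed 16-bit taps
+; per word, low halfword first; a small decoding sketch with illustrative names:
+;
+;   static void unpack_filter_row(const unsigned int packed[3], short taps[6]) {
+;     int i;
+;     for (i = 0; i < 3; i++) {
+;       taps[2 * i]     = (short)(packed[i] & 0xffff);           /* low halfword  */
+;       taps[2 * i + 1] = (short)((packed[i] >> 16) & 0xffff);   /* high halfword */
+;     }
+;   }
+;   /* e.g. {0xfffa0000, 0x000c007b, 0x0000ffff} decodes to {0, -6, 123, 12, -1, 0} */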
+
+ END
--- /dev/null
+++ b/vp9/common/arm/bilinearfilter_arm.c
@@ -1,0 +1,108 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include <math.h>
+#include "vp9/common/filter.h"
+#include "vp9/common/subpixel.h"
+#include "bilinearfilter_arm.h"
+
+void vp9_filter_block2d_bil_armv6
+(
+ unsigned char *src_ptr,
+ unsigned char *dst_ptr,
+ unsigned int src_pitch,
+ unsigned int dst_pitch,
+ const short *HFilter,
+ const short *VFilter,
+ int Width,
+ int Height
+) {
+ unsigned short FData[36 * 16]; /* Temp data buffer used in filtering */
+
+ /* First filter 1-D horizontally... */
+ vp9_filter_block2d_bil_first_pass_armv6(src_ptr, FData, src_pitch, Height + 1, Width, HFilter);
+
+ /* then 1-D vertically... */
+ vp9_filter_block2d_bil_second_pass_armv6(FData, dst_ptr, dst_pitch, Height, Width, VFilter);
+}
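+
+/* A hedged scalar reference of the two combined passes (not the ARM
+ * implementation; names and the exact staging differ): each pass computes
+ * (a * f[0] + b * f[1] + 64) >> 7, and the horizontal pass produces Height + 1
+ * rows so the vertical pass has a lower neighbour for every output row.
+ *
+ *   static void bil_block2d_ref(const unsigned char *src, unsigned char *dst,
+ *                               unsigned int src_pitch, unsigned int dst_pitch,
+ *                               const short *hf, const short *vf,
+ *                               int width, int height) {
+ *     unsigned short tmp[17 * 16];   // enough for the largest (16x16) block
+ *     int r, c;
+ *
+ *     for (r = 0; r < height + 1; r++)        // horizontal pass, one extra row
+ *       for (c = 0; c < width; c++)
+ *         tmp[r * width + c] =
+ *             (unsigned short)((src[r * src_pitch + c] * hf[0] +
+ *                               src[r * src_pitch + c + 1] * hf[1] + 64) >> 7);
+ *
+ *     for (r = 0; r < height; r++)            // vertical pass
+ *       for (c = 0; c < width; c++)
+ *         dst[r * dst_pitch + c] =
+ *             (unsigned char)((tmp[r * width + c] * vf[0] +
+ *                              tmp[(r + 1) * width + c] * vf[1] + 64) >> 7);
+ *   }
+ */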
+
+
+void vp9_bilinear_predict4x4_armv6
+(
+ unsigned char *src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ unsigned char *dst_ptr,
+ int dst_pitch
+) {
+ const short *HFilter;
+ const short *VFilter;
+
+ HFilter = vp8_bilinear_filters[xoffset];
+ VFilter = vp8_bilinear_filters[yoffset];
+
+ vp9_filter_block2d_bil_armv6(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, VFilter, 4, 4);
+}
+
+void vp9_bilinear_predict8x8_armv6
+(
+ unsigned char *src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ unsigned char *dst_ptr,
+ int dst_pitch
+) {
+ const short *HFilter;
+ const short *VFilter;
+
+ HFilter = vp8_bilinear_filters[xoffset];
+ VFilter = vp8_bilinear_filters[yoffset];
+
+ vp9_filter_block2d_bil_armv6(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, VFilter, 8, 8);
+}
+
+void vp9_bilinear_predict8x4_armv6
+(
+ unsigned char *src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ unsigned char *dst_ptr,
+ int dst_pitch
+) {
+ const short *HFilter;
+ const short *VFilter;
+
+ HFilter = vp8_bilinear_filters[xoffset];
+ VFilter = vp8_bilinear_filters[yoffset];
+
+ vp9_filter_block2d_bil_armv6(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, VFilter, 8, 4);
+}
+
+void vp9_bilinear_predict16x16_armv6
+(
+ unsigned char *src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ unsigned char *dst_ptr,
+ int dst_pitch
+) {
+ const short *HFilter;
+ const short *VFilter;
+
+ HFilter = vp8_bilinear_filters[xoffset];
+ VFilter = vp8_bilinear_filters[yoffset];
+
+ vp9_filter_block2d_bil_armv6(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, VFilter, 16, 16);
+}
--- /dev/null
+++ b/vp9/common/arm/bilinearfilter_arm.h
@@ -1,0 +1,35 @@
+/*
+ * Copyright (c) 2011 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef BILINEARFILTER_ARM_H
+#define BILINEARFILTER_ARM_H
+
+extern void vp9_filter_block2d_bil_first_pass_armv6
+(
+ const unsigned char *src_ptr,
+ unsigned short *dst_ptr,
+ unsigned int src_pitch,
+ unsigned int height,
+ unsigned int width,
+ const short *vp9_filter
+);
+
+extern void vp9_filter_block2d_bil_second_pass_armv6
+(
+ const unsigned short *src_ptr,
+ unsigned char *dst_ptr,
+ int dst_pitch,
+ unsigned int height,
+ unsigned int width,
+ const short *vp9_filter
+);
+
+#endif /* BILINEARFILTER_ARM_H */
--- /dev/null
+++ b/vp9/common/arm/filter_arm.c
@@ -1,0 +1,198 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "vpx_ports/config.h"
+#include <math.h>
+#include "vp9/common/filter.h"
+#include "vp9/common/subpixel.h"
+#include "vpx_ports/mem.h"
+
+extern void vp9_filter_block2d_first_pass_armv6
+(
+ unsigned char *src_ptr,
+ short *output_ptr,
+ unsigned int src_pixels_per_line,
+ unsigned int output_width,
+ unsigned int output_height,
+ const short *vp9_filter
+);
+
+// 8x8
+extern void vp9_filter_block2d_first_pass_8x8_armv6
+(
+ unsigned char *src_ptr,
+ short *output_ptr,
+ unsigned int src_pixels_per_line,
+ unsigned int output_width,
+ unsigned int output_height,
+ const short *vp9_filter
+);
+
+// 16x16
+extern void vp9_filter_block2d_first_pass_16x16_armv6
+(
+ unsigned char *src_ptr,
+ short *output_ptr,
+ unsigned int src_pixels_per_line,
+ unsigned int output_width,
+ unsigned int output_height,
+ const short *vp9_filter
+);
+
+extern void vp9_filter_block2d_second_pass_armv6
+(
+ short *src_ptr,
+ unsigned char *output_ptr,
+ unsigned int output_pitch,
+ unsigned int cnt,
+ const short *vp9_filter
+);
+
+extern void vp9_filter4_block2d_second_pass_armv6
+(
+ short *src_ptr,
+ unsigned char *output_ptr,
+ unsigned int output_pitch,
+ unsigned int cnt,
+ const short *vp9_filter
+);
+
+extern void vp9_filter_block2d_first_pass_only_armv6
+(
+ unsigned char *src_ptr,
+ unsigned char *output_ptr,
+ unsigned int src_pixels_per_line,
+ unsigned int cnt,
+ unsigned int output_pitch,
+ const short *vp9_filter
+);
+
+
+extern void vp9_filter_block2d_second_pass_only_armv6
+(
+ unsigned char *src_ptr,
+ unsigned char *output_ptr,
+ unsigned int src_pixels_per_line,
+ unsigned int cnt,
+ unsigned int output_pitch,
+ const short *vp9_filter
+);
+
+#if HAVE_ARMV6
+void vp9_sixtap_predict_armv6
+(
+ unsigned char *src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ unsigned char *dst_ptr,
+ int dst_pitch
+) {
+ const short *HFilter;
+ const short *VFilter;
+ DECLARE_ALIGNED_ARRAY(4, short, FData, 12 * 4); /* Temp data buffer used in filtering */
+
+
+ HFilter = vp8_sub_pel_filters[xoffset]; /* 6 tap */
+ VFilter = vp8_sub_pel_filters[yoffset]; /* 6 tap */
+
+ /* Vfilter is null. First pass only */
+ if (xoffset && !yoffset) {
+ /*vp9_filter_block2d_first_pass_armv6 ( src_ptr, FData+2, src_pixels_per_line, 4, 4, HFilter );
+ vp9_filter_block2d_second_pass_armv6 ( FData+2, dst_ptr, dst_pitch, 4, VFilter );*/
+
+ vp9_filter_block2d_first_pass_only_armv6(src_ptr, dst_ptr, src_pixels_per_line, 4, dst_pitch, HFilter);
+ }
+ /* Hfilter is null. Second pass only */
+ else if (!xoffset && yoffset) {
+ vp9_filter_block2d_second_pass_only_armv6(src_ptr, dst_ptr, src_pixels_per_line, 4, dst_pitch, VFilter);
+ } else {
+ /* Vfilter is a 4 tap filter */
+ if (yoffset & 0x1) {
+ vp9_filter_block2d_first_pass_armv6(src_ptr - src_pixels_per_line, FData + 1, src_pixels_per_line, 4, 7, HFilter);
+ vp9_filter4_block2d_second_pass_armv6(FData + 2, dst_ptr, dst_pitch, 4, VFilter);
+ }
+ /* Vfilter is 6 tap filter */
+ else {
+ vp9_filter_block2d_first_pass_armv6(src_ptr - (2 * src_pixels_per_line), FData, src_pixels_per_line, 4, 9, HFilter);
+ vp9_filter_block2d_second_pass_armv6(FData + 2, dst_ptr, dst_pitch, 4, VFilter);
+ }
+ }
+}
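+
+/* Note on the odd-yoffset branch above (inferred from the packed filter tables
+ * in this tree): the 6-tap filters for odd sub-pel positions have zero outer
+ * taps, so the vertical pass can run as a 4-tap filter
+ * (vp9_filter4_block2d_second_pass_armv6) and the horizontal pass only needs
+ * 7 intermediate rows instead of 9. */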
+
+void vp9_sixtap_predict8x8_armv6
+(
+ unsigned char *src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ unsigned char *dst_ptr,
+ int dst_pitch
+) {
+ const short *HFilter;
+ const short *VFilter;
+ DECLARE_ALIGNED_ARRAY(4, short, FData, 16 * 8); /* Temp data buffer used in filtering */
+
+ HFilter = vp8_sub_pel_filters[xoffset]; /* 6 tap */
+ VFilter = vp8_sub_pel_filters[yoffset]; /* 6 tap */
+
+ if (xoffset && !yoffset) {
+ vp9_filter_block2d_first_pass_only_armv6(src_ptr, dst_ptr, src_pixels_per_line, 8, dst_pitch, HFilter);
+ }
+ /* Hfilter is null. Second pass only */
+ else if (!xoffset && yoffset) {
+ vp9_filter_block2d_second_pass_only_armv6(src_ptr, dst_ptr, src_pixels_per_line, 8, dst_pitch, VFilter);
+ } else {
+ if (yoffset & 0x1) {
+ vp9_filter_block2d_first_pass_8x8_armv6(src_ptr - src_pixels_per_line, FData + 1, src_pixels_per_line, 8, 11, HFilter);
+ vp9_filter4_block2d_second_pass_armv6(FData + 2, dst_ptr, dst_pitch, 8, VFilter);
+ } else {
+ vp9_filter_block2d_first_pass_8x8_armv6(src_ptr - (2 * src_pixels_per_line), FData, src_pixels_per_line, 8, 13, HFilter);
+ vp9_filter_block2d_second_pass_armv6(FData + 2, dst_ptr, dst_pitch, 8, VFilter);
+ }
+ }
+}
+
+
+void vp9_sixtap_predict16x16_armv6
+(
+ unsigned char *src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ unsigned char *dst_ptr,
+ int dst_pitch
+) {
+ const short *HFilter;
+ const short *VFilter;
+ DECLARE_ALIGNED_ARRAY(4, short, FData, 24 * 16); /* Temp data buffer used in filtering */
+
+ HFilter = vp8_sub_pel_filters[xoffset]; /* 6 tap */
+ VFilter = vp8_sub_pel_filters[yoffset]; /* 6 tap */
+
+ if (xoffset && !yoffset) {
+ vp9_filter_block2d_first_pass_only_armv6(src_ptr, dst_ptr, src_pixels_per_line, 16, dst_pitch, HFilter);
+ }
+ /* Hfilter is null. Second pass only */
+ else if (!xoffset && yoffset) {
+ vp9_filter_block2d_second_pass_only_armv6(src_ptr, dst_ptr, src_pixels_per_line, 16, dst_pitch, VFilter);
+ } else {
+ if (yoffset & 0x1) {
+ vp9_filter_block2d_first_pass_16x16_armv6(src_ptr - src_pixels_per_line, FData + 1, src_pixels_per_line, 16, 19, HFilter);
+ vp9_filter4_block2d_second_pass_armv6(FData + 2, dst_ptr, dst_pitch, 16, VFilter);
+ } else {
+ vp9_filter_block2d_first_pass_16x16_armv6(src_ptr - (2 * src_pixels_per_line), FData, src_pixels_per_line, 16, 21, HFilter);
+ vp9_filter_block2d_second_pass_armv6(FData + 2, dst_ptr, dst_pitch, 16, VFilter);
+ }
+ }
+
+}
+#endif
--- /dev/null
+++ b/vp9/common/arm/idct_arm.h
@@ -1,0 +1,65 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef IDCT_ARM_H
+#define IDCT_ARM_H
+
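+/* With CONFIG_RUNTIME_CPU_DETECT disabled, the #undef/#define pairs below bind
+ * the generic idct/walsh hooks straight to the ARMv6 or NEON versions at
+ * compile time; with runtime detection enabled the dispatch table is expected
+ * to be filled in at initialisation instead. */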
+#if HAVE_ARMV6
+extern prototype_idct(vp9_short_idct4x4llm_1_v6);
+extern prototype_idct(vp9_short_idct4x4llm_v6_dual);
+extern prototype_idct_scalar_add(vp9_dc_only_idct_add_v6);
+extern prototype_second_order(vp9_short_inv_walsh4x4_1_v6);
+extern prototype_second_order(vp9_short_inv_walsh4x4_v6);
+
+#if !CONFIG_RUNTIME_CPU_DETECT
+#undef vp9_idct_idct1
+#define vp9_idct_idct1 vp9_short_idct4x4llm_1_v6
+
+#undef vp9_idct_idct16
+#define vp9_idct_idct16 vp9_short_idct4x4llm_v6_dual
+
+#undef vp9_idct_idct1_scalar_add
+#define vp9_idct_idct1_scalar_add vp9_dc_only_idct_add_v6
+
+#undef vp8_idct_iwalsh1
+#define vp8_idct_iwalsh1 vp9_short_inv_walsh4x4_1_v6
+
+#undef vp8_idct_iwalsh16
+#define vp8_idct_iwalsh16 vp9_short_inv_walsh4x4_v6
+#endif
+#endif
+
+#if HAVE_ARMV7
+extern prototype_idct(vp9_short_idct4x4llm_1_neon);
+extern prototype_idct(vp9_short_idct4x4llm_neon);
+extern prototype_idct_scalar_add(vp9_dc_only_idct_add_neon);
+extern prototype_second_order(vp9_short_inv_walsh4x4_1_neon);
+extern prototype_second_order(vp9_short_inv_walsh4x4_neon);
+
+#if !CONFIG_RUNTIME_CPU_DETECT
+#undef vp9_idct_idct1
+#define vp9_idct_idct1 vp9_short_idct4x4llm_1_neon
+
+#undef vp9_idct_idct16
+#define vp9_idct_idct16 vp9_short_idct4x4llm_neon
+
+#undef vp9_idct_idct1_scalar_add
+#define vp9_idct_idct1_scalar_add vp9_dc_only_idct_add_neon
+
+#undef vp8_idct_iwalsh1
+#define vp8_idct_iwalsh1 vp9_short_inv_walsh4x4_1_neon
+
+#undef vp8_idct_iwalsh16
+#define vp8_idct_iwalsh16 vp9_short_inv_walsh4x4_neon
+#endif
+#endif
+
+#endif
--- /dev/null
+++ b/vp9/common/arm/loopfilter_arm.c
@@ -1,0 +1,166 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "vpx_config.h"
+#include "vp9/common/loopfilter.h"
+#include "vp9/common/onyxc_int.h"
+
+#if HAVE_ARMV6
+extern prototype_loopfilter(vp9_loop_filter_horizontal_edge_armv6);
+extern prototype_loopfilter(vp9_loop_filter_vertical_edge_armv6);
+extern prototype_loopfilter(vp9_mbloop_filter_horizontal_edge_armv6);
+extern prototype_loopfilter(vp9_mbloop_filter_vertical_edge_armv6);
+#endif
+
+#if HAVE_ARMV7
+typedef void loopfilter_y_neon(unsigned char *src, int pitch,
+ unsigned char blimit, unsigned char limit, unsigned char thresh);
+typedef void loopfilter_uv_neon(unsigned char *u, int pitch,
+ unsigned char blimit, unsigned char limit, unsigned char thresh,
+ unsigned char *v);
+
+extern loopfilter_y_neon vp9_loop_filter_horizontal_edge_y_neon;
+extern loopfilter_y_neon vp9_loop_filter_vertical_edge_y_neon;
+extern loopfilter_y_neon vp9_mbloop_filter_horizontal_edge_y_neon;
+extern loopfilter_y_neon vp9_mbloop_filter_vertical_edge_y_neon;
+
+extern loopfilter_uv_neon vp9_loop_filter_horizontal_edge_uv_neon;
+extern loopfilter_uv_neon vp9_loop_filter_vertical_edge_uv_neon;
+extern loopfilter_uv_neon vp9_mbloop_filter_horizontal_edge_uv_neon;
+extern loopfilter_uv_neon vp9_mbloop_filter_vertical_edge_uv_neon;
+#endif
+
+#if HAVE_ARMV6
+/*ARMV6 loopfilter functions*/
+/* Horizontal MB filtering */
+void vp9_loop_filter_mbh_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
+ int y_stride, int uv_stride, loop_filter_info *lfi) {
+ vp9_mbloop_filter_horizontal_edge_armv6(y_ptr, y_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 2);
+
+ if (u_ptr)
+ vp9_mbloop_filter_horizontal_edge_armv6(u_ptr, uv_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 1);
+
+ if (v_ptr)
+ vp9_mbloop_filter_horizontal_edge_armv6(v_ptr, uv_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 1);
+}
+
+/* Vertical MB Filtering */
+void vp9_loop_filter_mbv_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
+ int y_stride, int uv_stride, loop_filter_info *lfi) {
+ vp9_mbloop_filter_vertical_edge_armv6(y_ptr, y_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 2);
+
+ if (u_ptr)
+ vp9_mbloop_filter_vertical_edge_armv6(u_ptr, uv_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 1);
+
+ if (v_ptr)
+ vp9_mbloop_filter_vertical_edge_armv6(v_ptr, uv_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 1);
+}
+
+/* Horizontal B Filtering */
+void vp9_loop_filter_bh_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
+ int y_stride, int uv_stride, loop_filter_info *lfi) {
+ vp9_loop_filter_horizontal_edge_armv6(y_ptr + 4 * y_stride, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
+ vp9_loop_filter_horizontal_edge_armv6(y_ptr + 8 * y_stride, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
+ vp9_loop_filter_horizontal_edge_armv6(y_ptr + 12 * y_stride, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
+
+ if (u_ptr)
+ vp9_loop_filter_horizontal_edge_armv6(u_ptr + 4 * uv_stride, uv_stride, lfi->blim, lfi->lim, lfi->hev_thr, 1);
+
+ if (v_ptr)
+ vp9_loop_filter_horizontal_edge_armv6(v_ptr + 4 * uv_stride, uv_stride, lfi->blim, lfi->lim, lfi->hev_thr, 1);
+}
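+
+/* The +4/+8/+12 offsets (rows here, columns in the vertical variants below)
+ * pick the three interior 4-pixel block edges of a 16x16 macroblock; the
+ * macroblock boundary itself is handled by the mbh/mbv functions above. */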
+
+void vp9_loop_filter_bhs_armv6(unsigned char *y_ptr, int y_stride,
+ const unsigned char *blimit) {
+ vp9_loop_filter_simple_horizontal_edge_armv6(y_ptr + 4 * y_stride, y_stride, blimit);
+ vp9_loop_filter_simple_horizontal_edge_armv6(y_ptr + 8 * y_stride, y_stride, blimit);
+ vp9_loop_filter_simple_horizontal_edge_armv6(y_ptr + 12 * y_stride, y_stride, blimit);
+}
+
+/* Vertical B Filtering */
+void vp9_loop_filter_bv_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
+ int y_stride, int uv_stride, loop_filter_info *lfi) {
+ vp9_loop_filter_vertical_edge_armv6(y_ptr + 4, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
+ vp9_loop_filter_vertical_edge_armv6(y_ptr + 8, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
+ vp9_loop_filter_vertical_edge_armv6(y_ptr + 12, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
+
+ if (u_ptr)
+ vp9_loop_filter_vertical_edge_armv6(u_ptr + 4, uv_stride, lfi->blim, lfi->lim, lfi->hev_thr, 1);
+
+ if (v_ptr)
+ vp9_loop_filter_vertical_edge_armv6(v_ptr + 4, uv_stride, lfi->blim, lfi->lim, lfi->hev_thr, 1);
+}
+
+void vp9_loop_filter_bvs_armv6(unsigned char *y_ptr, int y_stride,
+ const unsigned char *blimit) {
+ vp9_loop_filter_simple_vertical_edge_armv6(y_ptr + 4, y_stride, blimit);
+ vp9_loop_filter_simple_vertical_edge_armv6(y_ptr + 8, y_stride, blimit);
+ vp9_loop_filter_simple_vertical_edge_armv6(y_ptr + 12, y_stride, blimit);
+}
+#endif
+
+#if HAVE_ARMV7
+/* NEON loopfilter functions */
+/* Horizontal MB filtering */
+void vp9_loop_filter_mbh_neon(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
+ int y_stride, int uv_stride, loop_filter_info *lfi) {
+ unsigned char mblim = *lfi->mblim;
+ unsigned char lim = *lfi->lim;
+ unsigned char hev_thr = *lfi->hev_thr;
+ vp9_mbloop_filter_horizontal_edge_y_neon(y_ptr, y_stride, mblim, lim, hev_thr);
+
+ if (u_ptr)
+ vp9_mbloop_filter_horizontal_edge_uv_neon(u_ptr, uv_stride, mblim, lim, hev_thr, v_ptr);
+}
+
+/* Vertical MB Filtering */
+void vp9_loop_filter_mbv_neon(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
+ int y_stride, int uv_stride, loop_filter_info *lfi) {
+ unsigned char mblim = *lfi->mblim;
+ unsigned char lim = *lfi->lim;
+ unsigned char hev_thr = *lfi->hev_thr;
+
+ vp9_mbloop_filter_vertical_edge_y_neon(y_ptr, y_stride, mblim, lim, hev_thr);
+
+ if (u_ptr)
+ vp9_mbloop_filter_vertical_edge_uv_neon(u_ptr, uv_stride, mblim, lim, hev_thr, v_ptr);
+}
+
+/* Horizontal B Filtering */
+void vp9_loop_filter_bh_neon(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
+ int y_stride, int uv_stride, loop_filter_info *lfi) {
+ unsigned char blim = *lfi->blim;
+ unsigned char lim = *lfi->lim;
+ unsigned char hev_thr = *lfi->hev_thr;
+
+ vp9_loop_filter_horizontal_edge_y_neon(y_ptr + 4 * y_stride, y_stride, blim, lim, hev_thr);
+ vp9_loop_filter_horizontal_edge_y_neon(y_ptr + 8 * y_stride, y_stride, blim, lim, hev_thr);
+ vp9_loop_filter_horizontal_edge_y_neon(y_ptr + 12 * y_stride, y_stride, blim, lim, hev_thr);
+
+ if (u_ptr)
+ vp9_loop_filter_horizontal_edge_uv_neon(u_ptr + 4 * uv_stride, uv_stride, blim, lim, hev_thr, v_ptr + 4 * uv_stride);
+}
+
+/* Vertical B Filtering */
+void vp9_loop_filter_bv_neon(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
+ int y_stride, int uv_stride, loop_filter_info *lfi) {
+ unsigned char blim = *lfi->blim;
+ unsigned char lim = *lfi->lim;
+ unsigned char hev_thr = *lfi->hev_thr;
+
+ vp9_loop_filter_vertical_edge_y_neon(y_ptr + 4, y_stride, blim, lim, hev_thr);
+ vp9_loop_filter_vertical_edge_y_neon(y_ptr + 8, y_stride, blim, lim, hev_thr);
+ vp9_loop_filter_vertical_edge_y_neon(y_ptr + 12, y_stride, blim, lim, hev_thr);
+
+ if (u_ptr)
+ vp9_loop_filter_vertical_edge_uv_neon(u_ptr + 4, uv_stride, blim, lim, hev_thr, v_ptr + 4);
+}
+#endif
--- /dev/null
+++ b/vp9/common/arm/loopfilter_arm.h
@@ -1,0 +1,41 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef LOOPFILTER_ARM_H
+#define LOOPFILTER_ARM_H
+
+#include "vpx_config.h"
+
+#if HAVE_ARMV6
+extern prototype_loopfilter_block(vp9_loop_filter_mbv_armv6);
+extern prototype_loopfilter_block(vp9_loop_filter_bv_armv6);
+extern prototype_loopfilter_block(vp9_loop_filter_mbh_armv6);
+extern prototype_loopfilter_block(vp9_loop_filter_bh_armv6);
+extern prototype_simple_loopfilter(vp9_loop_filter_bvs_armv6);
+extern prototype_simple_loopfilter(vp9_loop_filter_bhs_armv6);
+extern prototype_simple_loopfilter(vp9_loop_filter_simple_horizontal_edge_armv6);
+extern prototype_simple_loopfilter(vp9_loop_filter_simple_vertical_edge_armv6);
+
+#endif /* HAVE_ARMV6 */
+
+#if HAVE_ARMV7
+extern prototype_loopfilter_block(vp9_loop_filter_mbv_neon);
+extern prototype_loopfilter_block(vp9_loop_filter_bv_neon);
+extern prototype_loopfilter_block(vp9_loop_filter_mbh_neon);
+extern prototype_loopfilter_block(vp9_loop_filter_bh_neon);
+extern prototype_simple_loopfilter(vp9_loop_filter_mbvs_neon);
+extern prototype_simple_loopfilter(vp9_loop_filter_bvs_neon);
+extern prototype_simple_loopfilter(vp9_loop_filter_mbhs_neon);
+extern prototype_simple_loopfilter(vp9_loop_filter_bhs_neon);
+
+#endif /* HAVE_ARMV7 */
+
+#endif /* LOOPFILTER_ARM_H */
--- /dev/null
+++ b/vp9/common/arm/neon/bilinearpredict16x16_neon.asm
@@ -1,0 +1,357 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+ EXPORT |vp8_bilinear_predict16x16_neon|
+ ARM
+ REQUIRE8
+ PRESERVE8
+
+ AREA ||.text||, CODE, READONLY, ALIGN=2
+; r0 unsigned char *src_ptr,
+; r1 int src_pixels_per_line,
+; r2 int xoffset,
+; r3 int yoffset,
+; r4 unsigned char *dst_ptr,
+; stack(r5) int dst_pitch
+
+|vp8_bilinear_predict16x16_neon| PROC
+ push {r4-r5, lr}
+
+ adr r12, bifilter16_coeff
+ ldr r4, [sp, #12] ;load parameters from stack
+ ldr r5, [sp, #16] ;load parameters from stack
+
+ cmp r2, #0 ;skip first_pass filter if xoffset=0
+ beq secondpass_bfilter16x16_only
+
+ add r2, r12, r2, lsl #3 ;calculate filter location
+
+ cmp r3, #0 ;skip second_pass filter if yoffset=0
+
+ vld1.s32 {d31}, [r2] ;load first_pass filter
+
+ beq firstpass_bfilter16x16_only
+
+ sub sp, sp, #272 ;reserve space on stack for temporary storage
+ vld1.u8 {d2, d3, d4}, [r0], r1 ;load src data
+ mov lr, sp
+ vld1.u8 {d5, d6, d7}, [r0], r1
+
+ mov r2, #3 ;loop counter
+ vld1.u8 {d8, d9, d10}, [r0], r1
+
+ vdup.8 d0, d31[0] ;first_pass filter (d0 d1)
+ vld1.u8 {d11, d12, d13}, [r0], r1
+
+ vdup.8 d1, d31[4]
+
+;First Pass: output_height lines x output_width columns (17x16)
+filt_blk2d_fp16x16_loop_neon
+ pld [r0]
+ pld [r0, r1]
+ pld [r0, r1, lsl #1]
+
+ vmull.u8 q7, d2, d0 ;(src_ptr[0] * vp9_filter[0])
+ vmull.u8 q8, d3, d0
+ vmull.u8 q9, d5, d0
+ vmull.u8 q10, d6, d0
+ vmull.u8 q11, d8, d0
+ vmull.u8 q12, d9, d0
+ vmull.u8 q13, d11, d0
+ vmull.u8 q14, d12, d0
+
+ vext.8 d2, d2, d3, #1 ;construct src_ptr[1]
+ vext.8 d5, d5, d6, #1
+ vext.8 d8, d8, d9, #1
+ vext.8 d11, d11, d12, #1
+
+ vmlal.u8 q7, d2, d1 ;(src_ptr[0] * vp9_filter[1])
+ vmlal.u8 q9, d5, d1
+ vmlal.u8 q11, d8, d1
+ vmlal.u8 q13, d11, d1
+
+ vext.8 d3, d3, d4, #1
+ vext.8 d6, d6, d7, #1
+ vext.8 d9, d9, d10, #1
+ vext.8 d12, d12, d13, #1
+
+ vmlal.u8 q8, d3, d1 ;(src_ptr[0] * vp9_filter[1])
+ vmlal.u8 q10, d6, d1
+ vmlal.u8 q12, d9, d1
+ vmlal.u8 q14, d12, d1
+
+ subs r2, r2, #1
+
+ vqrshrn.u16 d14, q7, #7 ;shift/round/saturate to u8
+ vqrshrn.u16 d15, q8, #7
+ vqrshrn.u16 d16, q9, #7
+ vqrshrn.u16 d17, q10, #7
+ vqrshrn.u16 d18, q11, #7
+ vqrshrn.u16 d19, q12, #7
+ vqrshrn.u16 d20, q13, #7
+
+ vld1.u8 {d2, d3, d4}, [r0], r1 ;load src data
+ vqrshrn.u16 d21, q14, #7
+ vld1.u8 {d5, d6, d7}, [r0], r1
+
+ vst1.u8 {d14, d15, d16, d17}, [lr]! ;store result
+ vld1.u8 {d8, d9, d10}, [r0], r1
+ vst1.u8 {d18, d19, d20, d21}, [lr]!
+ vld1.u8 {d11, d12, d13}, [r0], r1
+
+ bne filt_blk2d_fp16x16_loop_neon
+
+;First-pass filtering for the remaining 5 lines
+ vld1.u8 {d14, d15, d16}, [r0], r1
+
+ vmull.u8 q9, d2, d0 ;(src_ptr[0] * vp9_filter[0])
+ vmull.u8 q10, d3, d0
+ vmull.u8 q11, d5, d0
+ vmull.u8 q12, d6, d0
+ vmull.u8 q13, d8, d0
+ vmull.u8 q14, d9, d0
+
+ vext.8 d2, d2, d3, #1 ;construct src_ptr[1]
+ vext.8 d5, d5, d6, #1
+ vext.8 d8, d8, d9, #1
+
+ vmlal.u8 q9, d2, d1 ;(src_ptr[0] * vp9_filter[1])
+ vmlal.u8 q11, d5, d1
+ vmlal.u8 q13, d8, d1
+
+ vext.8 d3, d3, d4, #1
+ vext.8 d6, d6, d7, #1
+ vext.8 d9, d9, d10, #1
+
+ vmlal.u8 q10, d3, d1 ;(src_ptr[0] * vp9_filter[1])
+ vmlal.u8 q12, d6, d1
+ vmlal.u8 q14, d9, d1
+
+ vmull.u8 q1, d11, d0
+ vmull.u8 q2, d12, d0
+ vmull.u8 q3, d14, d0
+ vmull.u8 q4, d15, d0
+
+ vext.8 d11, d11, d12, #1 ;construct src_ptr[1]
+ vext.8 d14, d14, d15, #1
+
+ vmlal.u8 q1, d11, d1 ;(src_ptr[0] * vp9_filter[1])
+ vmlal.u8 q3, d14, d1
+
+ vext.8 d12, d12, d13, #1
+ vext.8 d15, d15, d16, #1
+
+ vmlal.u8 q2, d12, d1 ;(src_ptr[0] * vp9_filter[1])
+ vmlal.u8 q4, d15, d1
+
+ vqrshrn.u16 d10, q9, #7 ;shift/round/saturate to u8
+ vqrshrn.u16 d11, q10, #7
+ vqrshrn.u16 d12, q11, #7
+ vqrshrn.u16 d13, q12, #7
+ vqrshrn.u16 d14, q13, #7
+ vqrshrn.u16 d15, q14, #7
+ vqrshrn.u16 d16, q1, #7
+ vqrshrn.u16 d17, q2, #7
+ vqrshrn.u16 d18, q3, #7
+ vqrshrn.u16 d19, q4, #7
+
+ vst1.u8 {d10, d11, d12, d13}, [lr]! ;store result
+ vst1.u8 {d14, d15, d16, d17}, [lr]!
+ vst1.u8 {d18, d19}, [lr]!
+
+;Second pass: 16x16
+;secondpass_filter
+ add r3, r12, r3, lsl #3
+ sub lr, lr, #272
+
+ vld1.u32 {d31}, [r3] ;load second_pass filter
+
+ vld1.u8 {d22, d23}, [lr]! ;load src data
+
+ vdup.8 d0, d31[0] ;second_pass filter parameters (d0 d1)
+ vdup.8 d1, d31[4]
+ mov r12, #4 ;loop counter
+
+filt_blk2d_sp16x16_loop_neon
+ vld1.u8 {d24, d25}, [lr]!
+ vmull.u8 q1, d22, d0 ;(src_ptr[0] * vp9_filter[0])
+ vld1.u8 {d26, d27}, [lr]!
+ vmull.u8 q2, d23, d0
+ vld1.u8 {d28, d29}, [lr]!
+ vmull.u8 q3, d24, d0
+ vld1.u8 {d30, d31}, [lr]!
+
+ vmull.u8 q4, d25, d0
+ vmull.u8 q5, d26, d0
+ vmull.u8 q6, d27, d0
+ vmull.u8 q7, d28, d0
+ vmull.u8 q8, d29, d0
+
+ vmlal.u8 q1, d24, d1 ;(src_ptr[pixel_step] * vp9_filter[1])
+ vmlal.u8 q2, d25, d1
+ vmlal.u8 q3, d26, d1
+ vmlal.u8 q4, d27, d1
+ vmlal.u8 q5, d28, d1
+ vmlal.u8 q6, d29, d1
+ vmlal.u8 q7, d30, d1
+ vmlal.u8 q8, d31, d1
+
+ subs r12, r12, #1
+
+ vqrshrn.u16 d2, q1, #7 ;shift/round/saturate to u8
+ vqrshrn.u16 d3, q2, #7
+ vqrshrn.u16 d4, q3, #7
+ vqrshrn.u16 d5, q4, #7
+ vqrshrn.u16 d6, q5, #7
+ vqrshrn.u16 d7, q6, #7
+ vqrshrn.u16 d8, q7, #7
+ vqrshrn.u16 d9, q8, #7
+
+ vst1.u8 {d2, d3}, [r4], r5 ;store result
+ vst1.u8 {d4, d5}, [r4], r5
+ vst1.u8 {d6, d7}, [r4], r5
+ vmov q11, q15
+ vst1.u8 {d8, d9}, [r4], r5
+
+ bne filt_blk2d_sp16x16_loop_neon
+
+ add sp, sp, #272
+
+ pop {r4-r5,pc}
+
+;--------------------
+firstpass_bfilter16x16_only
+ mov r2, #4 ;loop counter
+ vdup.8 d0, d31[0] ;first_pass filter (d0 d1)
+ vdup.8 d1, d31[4]
+
+;First Pass: output_height lines x output_width columns (16x16)
+filt_blk2d_fpo16x16_loop_neon
+ vld1.u8 {d2, d3, d4}, [r0], r1 ;load src data
+ vld1.u8 {d5, d6, d7}, [r0], r1
+ vld1.u8 {d8, d9, d10}, [r0], r1
+ vld1.u8 {d11, d12, d13}, [r0], r1
+
+ pld [r0]
+ pld [r0, r1]
+ pld [r0, r1, lsl #1]
+
+ vmull.u8 q7, d2, d0 ;(src_ptr[0] * vp9_filter[0])
+ vmull.u8 q8, d3, d0
+ vmull.u8 q9, d5, d0
+ vmull.u8 q10, d6, d0
+ vmull.u8 q11, d8, d0
+ vmull.u8 q12, d9, d0
+ vmull.u8 q13, d11, d0
+ vmull.u8 q14, d12, d0
+
+ vext.8 d2, d2, d3, #1 ;construct src_ptr[1]
+ vext.8 d5, d5, d6, #1
+ vext.8 d8, d8, d9, #1
+ vext.8 d11, d11, d12, #1
+
+ vmlal.u8 q7, d2, d1 ;(src_ptr[0] * vp9_filter[1])
+ vmlal.u8 q9, d5, d1
+ vmlal.u8 q11, d8, d1
+ vmlal.u8 q13, d11, d1
+
+ vext.8 d3, d3, d4, #1
+ vext.8 d6, d6, d7, #1
+ vext.8 d9, d9, d10, #1
+ vext.8 d12, d12, d13, #1
+
+ vmlal.u8 q8, d3, d1 ;(src_ptr[0] * vp9_filter[1])
+ vmlal.u8 q10, d6, d1
+ vmlal.u8 q12, d9, d1
+ vmlal.u8 q14, d12, d1
+
+ subs r2, r2, #1
+
+ vqrshrn.u16 d14, q7, #7 ;shift/round/saturate to u8
+ vqrshrn.u16 d15, q8, #7
+ vqrshrn.u16 d16, q9, #7
+ vqrshrn.u16 d17, q10, #7
+ vqrshrn.u16 d18, q11, #7
+ vqrshrn.u16 d19, q12, #7
+ vqrshrn.u16 d20, q13, #7
+ vst1.u8 {d14, d15}, [r4], r5 ;store result
+ vqrshrn.u16 d21, q14, #7
+
+ vst1.u8 {d16, d17}, [r4], r5
+ vst1.u8 {d18, d19}, [r4], r5
+ vst1.u8 {d20, d21}, [r4], r5
+
+ bne filt_blk2d_fpo16x16_loop_neon
+ pop {r4-r5,pc}
+
+;---------------------
+secondpass_bfilter16x16_only
+;Second pass: 16x16
+;secondpass_filter
+ add r3, r12, r3, lsl #3
+ mov r12, #4 ;loop counter
+ vld1.u32 {d31}, [r3] ;load second_pass filter
+ vld1.u8 {d22, d23}, [r0], r1 ;load src data
+
+ vdup.8 d0, d31[0] ;second_pass filter parameters (d0 d1)
+ vdup.8 d1, d31[4]
+
+filt_blk2d_spo16x16_loop_neon
+ vld1.u8 {d24, d25}, [r0], r1
+ vmull.u8 q1, d22, d0 ;(src_ptr[0] * vp9_filter[0])
+ vld1.u8 {d26, d27}, [r0], r1
+ vmull.u8 q2, d23, d0
+ vld1.u8 {d28, d29}, [r0], r1
+ vmull.u8 q3, d24, d0
+ vld1.u8 {d30, d31}, [r0], r1
+
+ vmull.u8 q4, d25, d0
+ vmull.u8 q5, d26, d0
+ vmull.u8 q6, d27, d0
+ vmull.u8 q7, d28, d0
+ vmull.u8 q8, d29, d0
+
+ vmlal.u8 q1, d24, d1 ;(src_ptr[pixel_step] * vp9_filter[1])
+ vmlal.u8 q2, d25, d1
+ vmlal.u8 q3, d26, d1
+ vmlal.u8 q4, d27, d1
+ vmlal.u8 q5, d28, d1
+ vmlal.u8 q6, d29, d1
+ vmlal.u8 q7, d30, d1
+ vmlal.u8 q8, d31, d1
+
+ vqrshrn.u16 d2, q1, #7 ;shift/round/saturate to u8
+ vqrshrn.u16 d3, q2, #7
+ vqrshrn.u16 d4, q3, #7
+ vqrshrn.u16 d5, q4, #7
+ vqrshrn.u16 d6, q5, #7
+ vqrshrn.u16 d7, q6, #7
+ vqrshrn.u16 d8, q7, #7
+ vqrshrn.u16 d9, q8, #7
+
+ vst1.u8 {d2, d3}, [r4], r5 ;store result
+ subs r12, r12, #1
+ vst1.u8 {d4, d5}, [r4], r5
+ vmov q11, q15
+ vst1.u8 {d6, d7}, [r4], r5
+ vst1.u8 {d8, d9}, [r4], r5
+
+ bne filt_blk2d_spo16x16_loop_neon
+ pop {r4-r5,pc}
+
+ ENDP
+
+;-----------------
+
+bifilter16_coeff
+ DCD 128, 0, 112, 16, 96, 32, 80, 48, 64, 64, 48, 80, 32, 96, 16, 112
+
+ END
--- /dev/null
+++ b/vp9/common/arm/neon/bilinearpredict4x4_neon.asm
@@ -1,0 +1,130 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+ EXPORT |vp8_bilinear_predict4x4_neon|
+ ARM
+ REQUIRE8
+ PRESERVE8
+
+ AREA ||.text||, CODE, READONLY, ALIGN=2
+; r0 unsigned char *src_ptr,
+; r1 int src_pixels_per_line,
+; r2 int xoffset,
+; r3 int yoffset,
+; r4 unsigned char *dst_ptr,
+; stack(lr) int dst_pitch
+
+|vp8_bilinear_predict4x4_neon| PROC
+ push {r4, lr}
+
+ adr r12, bifilter4_coeff
+ ldr r4, [sp, #8] ;load parameters from stack
+ ldr lr, [sp, #12] ;load parameters from stack
+
+ cmp r2, #0 ;skip first_pass filter if xoffset=0
+ beq skip_firstpass_filter
+
+;First pass: output_height lines x output_width columns (5x4)
+ vld1.u8 {d2}, [r0], r1 ;load src data
+    add             r2, r12, r2, lsl #3     ;calculate Hfilter location (2 coeffs x 4 bytes = 8 bytes)
+
+ vld1.u8 {d3}, [r0], r1
+ vld1.u32 {d31}, [r2] ;first_pass filter
+
+ vld1.u8 {d4}, [r0], r1
+ vdup.8 d0, d31[0] ;first_pass filter (d0-d1)
+ vld1.u8 {d5}, [r0], r1
+ vdup.8 d1, d31[4]
+ vld1.u8 {d6}, [r0], r1
+
+ vshr.u64 q4, q1, #8 ;construct src_ptr[1]
+ vshr.u64 q5, q2, #8
+ vshr.u64 d12, d6, #8
+
+ vzip.32 d2, d3 ;put 2-line data in 1 register (src_ptr[0])
+ vzip.32 d4, d5
+ vzip.32 d8, d9 ;put 2-line data in 1 register (src_ptr[1])
+ vzip.32 d10, d11
+
+ vmull.u8 q7, d2, d0 ;(src_ptr[0] * vp9_filter[0])
+ vmull.u8 q8, d4, d0
+ vmull.u8 q9, d6, d0
+
+ vmlal.u8 q7, d8, d1 ;(src_ptr[1] * vp9_filter[1])
+ vmlal.u8 q8, d10, d1
+ vmlal.u8 q9, d12, d1
+
+ vqrshrn.u16 d28, q7, #7 ;shift/round/saturate to u8
+ vqrshrn.u16 d29, q8, #7
+ vqrshrn.u16 d30, q9, #7
+
+;Second pass: 4x4
+secondpass_filter
+ cmp r3, #0 ;skip second_pass filter if yoffset=0
+ beq skip_secondpass_filter
+
+ add r3, r12, r3, lsl #3 ;calculate Vfilter location
+ vld1.u32 {d31}, [r3] ;load second_pass filter
+
+    vdup.8          d0, d31[0]              ;second_pass filter parameters (d0 d1)
+ vdup.8 d1, d31[4]
+
+ vmull.u8 q1, d28, d0
+ vmull.u8 q2, d29, d0
+
+ vext.8 d26, d28, d29, #4 ;construct src_ptr[pixel_step]
+ vext.8 d27, d29, d30, #4
+
+ vmlal.u8 q1, d26, d1
+ vmlal.u8 q2, d27, d1
+
+ add r0, r4, lr
+ add r1, r0, lr
+ add r2, r1, lr
+
+ vqrshrn.u16 d2, q1, #7 ;shift/round/saturate to u8
+ vqrshrn.u16 d3, q2, #7
+
+ vst1.32 {d2[0]}, [r4] ;store result
+ vst1.32 {d2[1]}, [r0]
+ vst1.32 {d3[0]}, [r1]
+ vst1.32 {d3[1]}, [r2]
+
+ pop {r4, pc}
+
+;--------------------
+skip_firstpass_filter
+
+ vld1.32 {d28[0]}, [r0], r1 ;load src data
+ vld1.32 {d28[1]}, [r0], r1
+ vld1.32 {d29[0]}, [r0], r1
+ vld1.32 {d29[1]}, [r0], r1
+ vld1.32 {d30[0]}, [r0], r1
+
+ b secondpass_filter
+
+;---------------------
+skip_secondpass_filter
+ vst1.32 {d28[0]}, [r4], lr ;store result
+ vst1.32 {d28[1]}, [r4], lr
+ vst1.32 {d29[0]}, [r4], lr
+ vst1.32 {d29[1]}, [r4], lr
+
+ pop {r4, pc}
+
+ ENDP
+
+;-----------------
+
+bifilter4_coeff
+ DCD 128, 0, 112, 16, 96, 32, 80, 48, 64, 64, 48, 80, 32, 96, 16, 112
+
+ END
--- /dev/null
+++ b/vp9/common/arm/neon/bilinearpredict8x4_neon.asm
@@ -1,0 +1,135 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+ EXPORT |vp8_bilinear_predict8x4_neon|
+ ARM
+ REQUIRE8
+ PRESERVE8
+
+ AREA ||.text||, CODE, READONLY, ALIGN=2
+; r0 unsigned char *src_ptr,
+; r1 int src_pixels_per_line,
+; r2 int xoffset,
+; r3 int yoffset,
+; r4 unsigned char *dst_ptr,
+; stack(lr) int dst_pitch
+
+|vp8_bilinear_predict8x4_neon| PROC
+ push {r4, lr}
+
+ adr r12, bifilter8x4_coeff
+ ldr r4, [sp, #8] ;load parameters from stack
+ ldr lr, [sp, #12] ;load parameters from stack
+
+ cmp r2, #0 ;skip first_pass filter if xoffset=0
+ beq skip_firstpass_filter
+
+;First pass: output_height lines x output_width columns (5x8)
+ add r2, r12, r2, lsl #3 ;calculate filter location
+
+ vld1.u8 {q1}, [r0], r1 ;load src data
+ vld1.u32 {d31}, [r2] ;load first_pass filter
+ vld1.u8 {q2}, [r0], r1
+ vdup.8 d0, d31[0] ;first_pass filter (d0 d1)
+ vld1.u8 {q3}, [r0], r1
+ vdup.8 d1, d31[4]
+ vld1.u8 {q4}, [r0], r1
+
+ vmull.u8 q6, d2, d0 ;(src_ptr[0] * vp9_filter[0])
+ vld1.u8 {q5}, [r0], r1
+ vmull.u8 q7, d4, d0
+ vmull.u8 q8, d6, d0
+ vmull.u8 q9, d8, d0
+ vmull.u8 q10, d10, d0
+
+ vext.8 d3, d2, d3, #1 ;construct src_ptr[1]
+ vext.8 d5, d4, d5, #1
+ vext.8 d7, d6, d7, #1
+ vext.8 d9, d8, d9, #1
+ vext.8 d11, d10, d11, #1
+
+ vmlal.u8 q6, d3, d1 ;(src_ptr[1] * vp9_filter[1])
+ vmlal.u8 q7, d5, d1
+ vmlal.u8 q8, d7, d1
+ vmlal.u8 q9, d9, d1
+ vmlal.u8 q10, d11, d1
+
+ vqrshrn.u16 d22, q6, #7 ;shift/round/saturate to u8
+ vqrshrn.u16 d23, q7, #7
+ vqrshrn.u16 d24, q8, #7
+ vqrshrn.u16 d25, q9, #7
+ vqrshrn.u16 d26, q10, #7
+
+;Second pass: 4x8
+secondpass_filter
+ cmp r3, #0 ;skip second_pass filter if yoffset=0
+ beq skip_secondpass_filter
+
+ add r3, r12, r3, lsl #3
+ add r0, r4, lr
+
+ vld1.u32 {d31}, [r3] ;load second_pass filter
+ add r1, r0, lr
+
+ vdup.8 d0, d31[0] ;second_pass filter parameters (d0 d1)
+ vdup.8 d1, d31[4]
+
+ vmull.u8 q1, d22, d0 ;(src_ptr[0] * vp9_filter[0])
+ vmull.u8 q2, d23, d0
+ vmull.u8 q3, d24, d0
+ vmull.u8 q4, d25, d0
+
+ vmlal.u8 q1, d23, d1 ;(src_ptr[pixel_step] * vp9_filter[1])
+ vmlal.u8 q2, d24, d1
+ vmlal.u8 q3, d25, d1
+ vmlal.u8 q4, d26, d1
+
+ add r2, r1, lr
+
+ vqrshrn.u16 d2, q1, #7 ;shift/round/saturate to u8
+ vqrshrn.u16 d3, q2, #7
+ vqrshrn.u16 d4, q3, #7
+ vqrshrn.u16 d5, q4, #7
+
+ vst1.u8 {d2}, [r4] ;store result
+ vst1.u8 {d3}, [r0]
+ vst1.u8 {d4}, [r1]
+ vst1.u8 {d5}, [r2]
+
+ pop {r4, pc}
+
+;--------------------
+skip_firstpass_filter
+ vld1.u8 {d22}, [r0], r1 ;load src data
+ vld1.u8 {d23}, [r0], r1
+ vld1.u8 {d24}, [r0], r1
+ vld1.u8 {d25}, [r0], r1
+ vld1.u8 {d26}, [r0], r1
+
+ b secondpass_filter
+
+;---------------------
+skip_secondpass_filter
+ vst1.u8 {d22}, [r4], lr ;store result
+ vst1.u8 {d23}, [r4], lr
+ vst1.u8 {d24}, [r4], lr
+ vst1.u8 {d25}, [r4], lr
+
+ pop {r4, pc}
+
+ ENDP
+
+;-----------------
+
+bifilter8x4_coeff
+ DCD 128, 0, 112, 16, 96, 32, 80, 48, 64, 64, 48, 80, 32, 96, 16, 112
+
+ END
--- /dev/null
+++ b/vp9/common/arm/neon/bilinearpredict8x8_neon.asm
@@ -1,0 +1,183 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+ EXPORT |vp8_bilinear_predict8x8_neon|
+ ARM
+ REQUIRE8
+ PRESERVE8
+
+ AREA ||.text||, CODE, READONLY, ALIGN=2
+; r0 unsigned char *src_ptr,
+; r1 int src_pixels_per_line,
+; r2 int xoffset,
+; r3 int yoffset,
+; r4 unsigned char *dst_ptr,
+; stack(lr) int dst_pitch
+
+|vp8_bilinear_predict8x8_neon| PROC
+ push {r4, lr}
+
+ adr r12, bifilter8_coeff
+ ldr r4, [sp, #8] ;load parameters from stack
+ ldr lr, [sp, #12] ;load parameters from stack
+
+ cmp r2, #0 ;skip first_pass filter if xoffset=0
+ beq skip_firstpass_filter
+
+;First pass: output_height lines x output_width columns (9x8)
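+; (8 output rows plus one extra row for the vertical 2-tap pass; the
+; horizontal pass is split into a 4-row and a 5-row batch so that the
+; intermediate results fit in d22-d30)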
+ add r2, r12, r2, lsl #3 ;calculate filter location
+
+ vld1.u8 {q1}, [r0], r1 ;load src data
+ vld1.u32 {d31}, [r2] ;load first_pass filter
+ vld1.u8 {q2}, [r0], r1
+ vdup.8 d0, d31[0] ;first_pass filter (d0 d1)
+ vld1.u8 {q3}, [r0], r1
+ vdup.8 d1, d31[4]
+ vld1.u8 {q4}, [r0], r1
+
+ vmull.u8 q6, d2, d0 ;(src_ptr[0] * vp9_filter[0])
+ vmull.u8 q7, d4, d0
+ vmull.u8 q8, d6, d0
+ vmull.u8 q9, d8, d0
+
+ vext.8 d3, d2, d3, #1 ;construct src_ptr[1]
+ vext.8 d5, d4, d5, #1
+ vext.8 d7, d6, d7, #1
+ vext.8 d9, d8, d9, #1
+
+ vmlal.u8 q6, d3, d1 ;(src_ptr[1] * vp9_filter[1])
+ vmlal.u8 q7, d5, d1
+ vmlal.u8 q8, d7, d1
+ vmlal.u8 q9, d9, d1
+
+ vld1.u8 {q1}, [r0], r1 ;load src data
+ vqrshrn.u16 d22, q6, #7 ;shift/round/saturate to u8
+ vld1.u8 {q2}, [r0], r1
+ vqrshrn.u16 d23, q7, #7
+ vld1.u8 {q3}, [r0], r1
+ vqrshrn.u16 d24, q8, #7
+ vld1.u8 {q4}, [r0], r1
+ vqrshrn.u16 d25, q9, #7
+
+ ;first_pass filtering on the remaining 5 lines of data
+ vld1.u8 {q5}, [r0], r1
+
+ vmull.u8 q6, d2, d0 ;(src_ptr[0] * vp9_filter[0])
+ vmull.u8 q7, d4, d0
+ vmull.u8 q8, d6, d0
+ vmull.u8 q9, d8, d0
+ vmull.u8 q10, d10, d0
+
+ vext.8 d3, d2, d3, #1 ;construct src_ptr[1]
+ vext.8 d5, d4, d5, #1
+ vext.8 d7, d6, d7, #1
+ vext.8 d9, d8, d9, #1
+ vext.8 d11, d10, d11, #1
+
+ vmlal.u8 q6, d3, d1 ;(src_ptr[1] * vp9_filter[1])
+ vmlal.u8 q7, d5, d1
+ vmlal.u8 q8, d7, d1
+ vmlal.u8 q9, d9, d1
+ vmlal.u8 q10, d11, d1
+
+ vqrshrn.u16 d26, q6, #7 ;shift/round/saturate to u8
+ vqrshrn.u16 d27, q7, #7
+ vqrshrn.u16 d28, q8, #7
+ vqrshrn.u16 d29, q9, #7
+ vqrshrn.u16 d30, q10, #7
+
+;Second pass: 8x8
+secondpass_filter
+ cmp r3, #0 ;skip second_pass filter if yoffset=0
+ beq skip_secondpass_filter
+
+ add r3, r12, r3, lsl #3
+ add r0, r4, lr
+
+ vld1.u32 {d31}, [r3] ;load second_pass filter
+ add r1, r0, lr
+
+ vdup.8 d0, d31[0] ;second_pass filter parameters (d0 d1)
+ vdup.8 d1, d31[4]
+
+ vmull.u8 q1, d22, d0 ;(src_ptr[0] * vp9_filter[0])
+ vmull.u8 q2, d23, d0
+ vmull.u8 q3, d24, d0
+ vmull.u8 q4, d25, d0
+ vmull.u8 q5, d26, d0
+ vmull.u8 q6, d27, d0
+ vmull.u8 q7, d28, d0
+ vmull.u8 q8, d29, d0
+
+ vmlal.u8 q1, d23, d1 ;(src_ptr[pixel_step] * vp9_filter[1])
+ vmlal.u8 q2, d24, d1
+ vmlal.u8 q3, d25, d1
+ vmlal.u8 q4, d26, d1
+ vmlal.u8 q5, d27, d1
+ vmlal.u8 q6, d28, d1
+ vmlal.u8 q7, d29, d1
+ vmlal.u8 q8, d30, d1
+
+ vqrshrn.u16 d2, q1, #7 ;shift/round/saturate to u8
+ vqrshrn.u16 d3, q2, #7
+ vqrshrn.u16 d4, q3, #7
+ vqrshrn.u16 d5, q4, #7
+ vqrshrn.u16 d6, q5, #7
+ vqrshrn.u16 d7, q6, #7
+ vqrshrn.u16 d8, q7, #7
+ vqrshrn.u16 d9, q8, #7
+
+ vst1.u8 {d2}, [r4] ;store result
+ vst1.u8 {d3}, [r0]
+ vst1.u8 {d4}, [r1], lr
+ vst1.u8 {d5}, [r1], lr
+ vst1.u8 {d6}, [r1], lr
+ vst1.u8 {d7}, [r1], lr
+ vst1.u8 {d8}, [r1], lr
+ vst1.u8 {d9}, [r1], lr
+
+ pop {r4, pc}
+
+;--------------------
+skip_firstpass_filter
+ vld1.u8 {d22}, [r0], r1 ;load src data
+ vld1.u8 {d23}, [r0], r1
+ vld1.u8 {d24}, [r0], r1
+ vld1.u8 {d25}, [r0], r1
+ vld1.u8 {d26}, [r0], r1
+ vld1.u8 {d27}, [r0], r1
+ vld1.u8 {d28}, [r0], r1
+ vld1.u8 {d29}, [r0], r1
+ vld1.u8 {d30}, [r0], r1
+
+ b secondpass_filter
+
+;---------------------
+skip_secondpass_filter
+ vst1.u8 {d22}, [r4], lr ;store result
+ vst1.u8 {d23}, [r4], lr
+ vst1.u8 {d24}, [r4], lr
+ vst1.u8 {d25}, [r4], lr
+ vst1.u8 {d26}, [r4], lr
+ vst1.u8 {d27}, [r4], lr
+ vst1.u8 {d28}, [r4], lr
+ vst1.u8 {d29}, [r4], lr
+
+ pop {r4, pc}
+
+ ENDP
+
+;-----------------
+
+bifilter8_coeff
+ DCD 128, 0, 112, 16, 96, 32, 80, 48, 64, 64, 48, 80, 32, 96, 16, 112
+
+ END
--- /dev/null
+++ b/vp9/common/arm/neon/buildintrapredictorsmby_neon.asm
@@ -1,0 +1,584 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+ EXPORT |vp8_build_intra_predictors_mby_neon_func|
+ EXPORT |vp8_build_intra_predictors_mby_s_neon_func|
+
+ ARM
+ REQUIRE8
+ PRESERVE8
+
+ AREA ||.text||, CODE, READONLY, ALIGN=2
+; r0 unsigned char *y_buffer
+; r1 unsigned char *ypred_ptr
+; r2 int y_stride
+; r3 int mode
+; stack int Up
+; stack int Left
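+;
+; mode: 0 = DC prediction, 1 = vertical, 2 = horizontal, 3 = TrueMotion
+; (dispatched below to case_dc_pred / case_v_pred / case_h_pred / case_tm_pred)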
+
+|vp8_build_intra_predictors_mby_neon_func| PROC
+ push {r4-r8, lr}
+
+ cmp r3, #0
+ beq case_dc_pred
+ cmp r3, #1
+ beq case_v_pred
+ cmp r3, #2
+ beq case_h_pred
+ cmp r3, #3
+ beq case_tm_pred
+
+case_dc_pred
+ ldr r4, [sp, #24] ; Up
+ ldr r5, [sp, #28] ; Left
+
+ ; Default the DC average to 128
+ mov r12, #128
+ vdup.u8 q0, r12
+
+ ; Zero out running sum
+ mov r12, #0
+
+ ; compute shift and jump
+ adds r7, r4, r5
+ beq skip_dc_pred_up_left
+
+ ; Load above row, if it exists
+ cmp r4, #0
+ beq skip_dc_pred_up
+
+ sub r6, r0, r2
+ vld1.8 {q1}, [r6]
+ vpaddl.u8 q2, q1
+ vpaddl.u16 q3, q2
+ vpaddl.u32 q4, q3
+
+ vmov.32 r4, d8[0]
+ vmov.32 r6, d9[0]
+
+ add r12, r4, r6
+
+ ; Move back to integer registers
+
+skip_dc_pred_up
+
+ cmp r5, #0
+ beq skip_dc_pred_left
+
+ sub r0, r0, #1
+
+ ; Load left row, if it exists
+ ldrb r3, [r0], r2
+ ldrb r4, [r0], r2
+ ldrb r5, [r0], r2
+ ldrb r6, [r0], r2
+
+ add r12, r12, r3
+ add r12, r12, r4
+ add r12, r12, r5
+ add r12, r12, r6
+
+ ldrb r3, [r0], r2
+ ldrb r4, [r0], r2
+ ldrb r5, [r0], r2
+ ldrb r6, [r0], r2
+
+ add r12, r12, r3
+ add r12, r12, r4
+ add r12, r12, r5
+ add r12, r12, r6
+
+ ldrb r3, [r0], r2
+ ldrb r4, [r0], r2
+ ldrb r5, [r0], r2
+ ldrb r6, [r0], r2
+
+ add r12, r12, r3
+ add r12, r12, r4
+ add r12, r12, r5
+ add r12, r12, r6
+
+ ldrb r3, [r0], r2
+ ldrb r4, [r0], r2
+ ldrb r5, [r0], r2
+ ldrb r6, [r0]
+
+ add r12, r12, r3
+ add r12, r12, r4
+ add r12, r12, r5
+ add r12, r12, r6
+
+skip_dc_pred_left
+ add r7, r7, #3 ; Shift
+ sub r4, r7, #1
+ mov r5, #1
+ add r12, r12, r5, lsl r4
+ mov r5, r12, lsr r7 ; expected_dc
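+ ; expected_dc = (sum + (1 << (shift - 1))) >> shift, i.e. a rounded
+ ; average of the 16 or 32 available border samples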
+
+ vdup.u8 q0, r5
+
+skip_dc_pred_up_left
+ vst1.u8 {q0}, [r1]!
+ vst1.u8 {q0}, [r1]!
+ vst1.u8 {q0}, [r1]!
+ vst1.u8 {q0}, [r1]!
+ vst1.u8 {q0}, [r1]!
+ vst1.u8 {q0}, [r1]!
+ vst1.u8 {q0}, [r1]!
+ vst1.u8 {q0}, [r1]!
+ vst1.u8 {q0}, [r1]!
+ vst1.u8 {q0}, [r1]!
+ vst1.u8 {q0}, [r1]!
+ vst1.u8 {q0}, [r1]!
+ vst1.u8 {q0}, [r1]!
+ vst1.u8 {q0}, [r1]!
+ vst1.u8 {q0}, [r1]!
+ vst1.u8 {q0}, [r1]!
+
+ pop {r4-r8,pc}
+case_v_pred
+ ; Copy down above row
+ sub r6, r0, r2
+ vld1.8 {q0}, [r6]
+
+ vst1.u8 {q0}, [r1]!
+ vst1.u8 {q0}, [r1]!
+ vst1.u8 {q0}, [r1]!
+ vst1.u8 {q0}, [r1]!
+ vst1.u8 {q0}, [r1]!
+ vst1.u8 {q0}, [r1]!
+ vst1.u8 {q0}, [r1]!
+ vst1.u8 {q0}, [r1]!
+ vst1.u8 {q0}, [r1]!
+ vst1.u8 {q0}, [r1]!
+ vst1.u8 {q0}, [r1]!
+ vst1.u8 {q0}, [r1]!
+ vst1.u8 {q0}, [r1]!
+ vst1.u8 {q0}, [r1]!
+ vst1.u8 {q0}, [r1]!
+ vst1.u8 {q0}, [r1]!
+ pop {r4-r8,pc}
+
+case_h_pred
+ ; Load 4x yleft_col
+ sub r0, r0, #1
+
+ ldrb r3, [r0], r2
+ ldrb r4, [r0], r2
+ ldrb r5, [r0], r2
+ ldrb r6, [r0], r2
+ vdup.u8 q0, r3
+ vdup.u8 q1, r4
+ vdup.u8 q2, r5
+ vdup.u8 q3, r6
+ vst1.u8 {q0}, [r1]!
+ vst1.u8 {q1}, [r1]!
+ vst1.u8 {q2}, [r1]!
+ vst1.u8 {q3}, [r1]!
+
+ ldrb r3, [r0], r2
+ ldrb r4, [r0], r2
+ ldrb r5, [r0], r2
+ ldrb r6, [r0], r2
+ vdup.u8 q0, r3
+ vdup.u8 q1, r4
+ vdup.u8 q2, r5
+ vdup.u8 q3, r6
+ vst1.u8 {q0}, [r1]!
+ vst1.u8 {q1}, [r1]!
+ vst1.u8 {q2}, [r1]!
+ vst1.u8 {q3}, [r1]!
+
+
+ ldrb r3, [r0], r2
+ ldrb r4, [r0], r2
+ ldrb r5, [r0], r2
+ ldrb r6, [r0], r2
+ vdup.u8 q0, r3
+ vdup.u8 q1, r4
+ vdup.u8 q2, r5
+ vdup.u8 q3, r6
+ vst1.u8 {q0}, [r1]!
+ vst1.u8 {q1}, [r1]!
+ vst1.u8 {q2}, [r1]!
+ vst1.u8 {q3}, [r1]!
+
+ ldrb r3, [r0], r2
+ ldrb r4, [r0], r2
+ ldrb r5, [r0], r2
+ ldrb r6, [r0], r2
+ vdup.u8 q0, r3
+ vdup.u8 q1, r4
+ vdup.u8 q2, r5
+ vdup.u8 q3, r6
+ vst1.u8 {q0}, [r1]!
+ vst1.u8 {q1}, [r1]!
+ vst1.u8 {q2}, [r1]!
+ vst1.u8 {q3}, [r1]!
+
+ pop {r4-r8,pc}
+
+case_tm_pred
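+ ; TrueMotion: pred[r][c] = clamp(left[r] + above[c] - top_left), built as
+ ; 16-bit sums and saturated back to 8 bits with vqshrun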
+ ; Load yabove_row
+ sub r3, r0, r2
+ vld1.8 {q8}, [r3]
+
+ ; Load ytop_left
+ sub r3, r3, #1
+ ldrb r7, [r3]
+
+ vdup.u16 q7, r7
+
+ ; Compute yabove_row - ytop_left
+ mov r3, #1
+ vdup.u8 q0, r3
+
+ vmull.u8 q4, d16, d0
+ vmull.u8 q5, d17, d0
+
+ vsub.s16 q4, q4, q7
+ vsub.s16 q5, q5, q7
+
+ ; Load 4x yleft_col
+ sub r0, r0, #1
+ mov r12, #4
+
+case_tm_pred_loop
+ ldrb r3, [r0], r2
+ ldrb r4, [r0], r2
+ ldrb r5, [r0], r2
+ ldrb r6, [r0], r2
+ vdup.u16 q0, r3
+ vdup.u16 q1, r4
+ vdup.u16 q2, r5
+ vdup.u16 q3, r6
+
+ vqadd.s16 q8, q0, q4
+ vqadd.s16 q9, q0, q5
+
+ vqadd.s16 q10, q1, q4
+ vqadd.s16 q11, q1, q5
+
+ vqadd.s16 q12, q2, q4
+ vqadd.s16 q13, q2, q5
+
+ vqadd.s16 q14, q3, q4
+ vqadd.s16 q15, q3, q5
+
+ vqshrun.s16 d0, q8, #0
+ vqshrun.s16 d1, q9, #0
+
+ vqshrun.s16 d2, q10, #0
+ vqshrun.s16 d3, q11, #0
+
+ vqshrun.s16 d4, q12, #0
+ vqshrun.s16 d5, q13, #0
+
+ vqshrun.s16 d6, q14, #0
+ vqshrun.s16 d7, q15, #0
+
+ vst1.u8 {q0}, [r1]!
+ vst1.u8 {q1}, [r1]!
+ vst1.u8 {q2}, [r1]!
+ vst1.u8 {q3}, [r1]!
+
+ subs r12, r12, #1
+ bne case_tm_pred_loop
+
+ pop {r4-r8,pc}
+
+ ENDP
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; r0 unsigned char *y_buffer
+; r1 unsigned char *ypred_ptr
+; r2 int y_stride
+; r3 int mode
+; stack int Up
+; stack int Left
+
+|vp8_build_intra_predictors_mby_s_neon_func| PROC
+ push {r4-r8, lr}
+
+ mov r1, r0 ; unsigned char *ypred_ptr = x->dst.y_buffer; //x->Predictor;
+
+ cmp r3, #0
+ beq case_dc_pred_s
+ cmp r3, #1
+ beq case_v_pred_s
+ cmp r3, #2
+ beq case_h_pred_s
+ cmp r3, #3
+ beq case_tm_pred_s
+
+case_dc_pred_s
+ ldr r4, [sp, #24] ; Up
+ ldr r5, [sp, #28] ; Left
+
+ ; Default the DC average to 128
+ mov r12, #128
+ vdup.u8 q0, r12
+
+ ; Zero out running sum
+ mov r12, #0
+
+ ; compute shift and jump
+ adds r7, r4, r5
+ beq skip_dc_pred_up_left_s
+
+ ; Load above row, if it exists
+ cmp r4, #0
+ beq skip_dc_pred_up_s
+
+ sub r6, r0, r2
+ vld1.8 {q1}, [r6]
+ vpaddl.u8 q2, q1
+ vpaddl.u16 q3, q2
+ vpaddl.u32 q4, q3
+
+ vmov.32 r4, d8[0]
+ vmov.32 r6, d9[0]
+
+ add r12, r4, r6
+
+ ; Move back to integer registers
+
+skip_dc_pred_up_s
+
+ cmp r5, #0
+ beq skip_dc_pred_left_s
+
+ sub r0, r0, #1
+
+ ; Load left row, if it exists
+ ldrb r3, [r0], r2
+ ldrb r4, [r0], r2
+ ldrb r5, [r0], r2
+ ldrb r6, [r0], r2
+
+ add r12, r12, r3
+ add r12, r12, r4
+ add r12, r12, r5
+ add r12, r12, r6
+
+ ldrb r3, [r0], r2
+ ldrb r4, [r0], r2
+ ldrb r5, [r0], r2
+ ldrb r6, [r0], r2
+
+ add r12, r12, r3
+ add r12, r12, r4
+ add r12, r12, r5
+ add r12, r12, r6
+
+ ldrb r3, [r0], r2
+ ldrb r4, [r0], r2
+ ldrb r5, [r0], r2
+ ldrb r6, [r0], r2
+
+ add r12, r12, r3
+ add r12, r12, r4
+ add r12, r12, r5
+ add r12, r12, r6
+
+ ldrb r3, [r0], r2
+ ldrb r4, [r0], r2
+ ldrb r5, [r0], r2
+ ldrb r6, [r0]
+
+ add r12, r12, r3
+ add r12, r12, r4
+ add r12, r12, r5
+ add r12, r12, r6
+
+skip_dc_pred_left_s
+ add r7, r7, #3 ; Shift
+ sub r4, r7, #1
+ mov r5, #1
+ add r12, r12, r5, lsl r4
+ mov r5, r12, lsr r7 ; expected_dc
+
+ vdup.u8 q0, r5
+
+skip_dc_pred_up_left_s
+ vst1.u8 {q0}, [r1], r2
+ vst1.u8 {q0}, [r1], r2
+ vst1.u8 {q0}, [r1], r2
+ vst1.u8 {q0}, [r1], r2
+ vst1.u8 {q0}, [r1], r2
+ vst1.u8 {q0}, [r1], r2
+ vst1.u8 {q0}, [r1], r2
+ vst1.u8 {q0}, [r1], r2
+ vst1.u8 {q0}, [r1], r2
+ vst1.u8 {q0}, [r1], r2
+ vst1.u8 {q0}, [r1], r2
+ vst1.u8 {q0}, [r1], r2
+ vst1.u8 {q0}, [r1], r2
+ vst1.u8 {q0}, [r1], r2
+ vst1.u8 {q0}, [r1], r2
+ vst1.u8 {q0}, [r1], r2
+
+ pop {r4-r8,pc}
+case_v_pred_s
+ ; Copy down above row
+ sub r6, r0, r2
+ vld1.8 {q0}, [r6]
+
+ vst1.u8 {q0}, [r1], r2
+ vst1.u8 {q0}, [r1], r2
+ vst1.u8 {q0}, [r1], r2
+ vst1.u8 {q0}, [r1], r2
+ vst1.u8 {q0}, [r1], r2
+ vst1.u8 {q0}, [r1], r2
+ vst1.u8 {q0}, [r1], r2
+ vst1.u8 {q0}, [r1], r2
+ vst1.u8 {q0}, [r1], r2
+ vst1.u8 {q0}, [r1], r2
+ vst1.u8 {q0}, [r1], r2
+ vst1.u8 {q0}, [r1], r2
+ vst1.u8 {q0}, [r1], r2
+ vst1.u8 {q0}, [r1], r2
+ vst1.u8 {q0}, [r1], r2
+ vst1.u8 {q0}, [r1], r2
+ pop {r4-r8,pc}
+
+case_h_pred_s
+ ; Load 4x yleft_col
+ sub r0, r0, #1
+
+ ldrb r3, [r0], r2
+ ldrb r4, [r0], r2
+ ldrb r5, [r0], r2
+ ldrb r6, [r0], r2
+ vdup.u8 q0, r3
+ vdup.u8 q1, r4
+ vdup.u8 q2, r5
+ vdup.u8 q3, r6
+ vst1.u8 {q0}, [r1], r2
+ vst1.u8 {q1}, [r1], r2
+ vst1.u8 {q2}, [r1], r2
+ vst1.u8 {q3}, [r1], r2
+
+ ldrb r3, [r0], r2
+ ldrb r4, [r0], r2
+ ldrb r5, [r0], r2
+ ldrb r6, [r0], r2
+ vdup.u8 q0, r3
+ vdup.u8 q1, r4
+ vdup.u8 q2, r5
+ vdup.u8 q3, r6
+ vst1.u8 {q0}, [r1], r2
+ vst1.u8 {q1}, [r1], r2
+ vst1.u8 {q2}, [r1], r2
+ vst1.u8 {q3}, [r1], r2
+
+
+ ldrb r3, [r0], r2
+ ldrb r4, [r0], r2
+ ldrb r5, [r0], r2
+ ldrb r6, [r0], r2
+ vdup.u8 q0, r3
+ vdup.u8 q1, r4
+ vdup.u8 q2, r5
+ vdup.u8 q3, r6
+ vst1.u8 {q0}, [r1], r2
+ vst1.u8 {q1}, [r1], r2
+ vst1.u8 {q2}, [r1], r2
+ vst1.u8 {q3}, [r1], r2
+
+ ldrb r3, [r0], r2
+ ldrb r4, [r0], r2
+ ldrb r5, [r0], r2
+ ldrb r6, [r0], r2
+ vdup.u8 q0, r3
+ vdup.u8 q1, r4
+ vdup.u8 q2, r5
+ vdup.u8 q3, r6
+ vst1.u8 {q0}, [r1], r2
+ vst1.u8 {q1}, [r1], r2
+ vst1.u8 {q2}, [r1], r2
+ vst1.u8 {q3}, [r1], r2
+
+ pop {r4-r8,pc}
+
+case_tm_pred_s
+ ; Load yabove_row
+ sub r3, r0, r2
+ vld1.8 {q8}, [r3]
+
+ ; Load ytop_left
+ sub r3, r3, #1
+ ldrb r7, [r3]
+
+ vdup.u16 q7, r7
+
+ ; Compute yabove_row - ytop_left
+ mov r3, #1
+ vdup.u8 q0, r3
+
+ vmull.u8 q4, d16, d0
+ vmull.u8 q5, d17, d0
+
+ vsub.s16 q4, q4, q7
+ vsub.s16 q5, q5, q7
+
+ ; Load 4x yleft_col
+ sub r0, r0, #1
+ mov r12, #4
+
+case_tm_pred_loop_s
+ ldrb r3, [r0], r2
+ ldrb r4, [r0], r2
+ ldrb r5, [r0], r2
+ ldrb r6, [r0], r2
+ vdup.u16 q0, r3
+ vdup.u16 q1, r4
+ vdup.u16 q2, r5
+ vdup.u16 q3, r6
+
+ vqadd.s16 q8, q0, q4
+ vqadd.s16 q9, q0, q5
+
+ vqadd.s16 q10, q1, q4
+ vqadd.s16 q11, q1, q5
+
+ vqadd.s16 q12, q2, q4
+ vqadd.s16 q13, q2, q5
+
+ vqadd.s16 q14, q3, q4
+ vqadd.s16 q15, q3, q5
+
+ vqshrun.s16 d0, q8, #0
+ vqshrun.s16 d1, q9, #0
+
+ vqshrun.s16 d2, q10, #0
+ vqshrun.s16 d3, q11, #0
+
+ vqshrun.s16 d4, q12, #0
+ vqshrun.s16 d5, q13, #0
+
+ vqshrun.s16 d6, q14, #0
+ vqshrun.s16 d7, q15, #0
+
+ vst1.u8 {q0}, [r1], r2
+ vst1.u8 {q1}, [r1], r2
+ vst1.u8 {q2}, [r1], r2
+ vst1.u8 {q3}, [r1], r2
+
+ subs r12, r12, #1
+ bne case_tm_pred_loop_s
+
+ pop {r4-r8,pc}
+
+ ENDP
+
+
+ END
--- /dev/null
+++ b/vp9/common/arm/neon/copymem16x16_neon.asm
@@ -1,0 +1,59 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+ EXPORT |vp9_copy_mem16x16_neon|
+ ; ARM
+ ; REQUIRE8
+ ; PRESERVE8
+
+ AREA Block, CODE, READONLY ; name this block of code
+;void copy_mem16x16_neon( unsigned char *src, int src_stride, unsigned char *dst, int dst_stride)
+;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
+|vp9_copy_mem16x16_neon| PROC
+
+ vld1.u8 {q0}, [r0], r1
+ vld1.u8 {q1}, [r0], r1
+ vld1.u8 {q2}, [r0], r1
+ vst1.u8 {q0}, [r2], r3
+ vld1.u8 {q3}, [r0], r1
+ vst1.u8 {q1}, [r2], r3
+ vld1.u8 {q4}, [r0], r1
+ vst1.u8 {q2}, [r2], r3
+ vld1.u8 {q5}, [r0], r1
+ vst1.u8 {q3}, [r2], r3
+ vld1.u8 {q6}, [r0], r1
+ vst1.u8 {q4}, [r2], r3
+ vld1.u8 {q7}, [r0], r1
+ vst1.u8 {q5}, [r2], r3
+ vld1.u8 {q8}, [r0], r1
+ vst1.u8 {q6}, [r2], r3
+ vld1.u8 {q9}, [r0], r1
+ vst1.u8 {q7}, [r2], r3
+ vld1.u8 {q10}, [r0], r1
+ vst1.u8 {q8}, [r2], r3
+ vld1.u8 {q11}, [r0], r1
+ vst1.u8 {q9}, [r2], r3
+ vld1.u8 {q12}, [r0], r1
+ vst1.u8 {q10}, [r2], r3
+ vld1.u8 {q13}, [r0], r1
+ vst1.u8 {q11}, [r2], r3
+ vld1.u8 {q14}, [r0], r1
+ vst1.u8 {q12}, [r2], r3
+ vld1.u8 {q15}, [r0], r1
+ vst1.u8 {q13}, [r2], r3
+ vst1.u8 {q14}, [r2], r3
+ vst1.u8 {q15}, [r2], r3
+
+ mov pc, lr
+
+ ENDP ; |vp9_copy_mem16x16_neon|
+
+ END
--- /dev/null
+++ b/vp9/common/arm/neon/copymem8x4_neon.asm
@@ -1,0 +1,34 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+ EXPORT |vp9_copy_mem8x4_neon|
+ ; ARM
+ ; REQUIRE8
+ ; PRESERVE8
+
+ AREA Block, CODE, READONLY ; name this block of code
+;void copy_mem8x4_neon( unsigned char *src, int src_stride, unsigned char *dst, int dst_stride)
+;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
+|vp9_copy_mem8x4_neon| PROC
+ vld1.u8 {d0}, [r0], r1
+ vld1.u8 {d1}, [r0], r1
+ vst1.u8 {d0}, [r2], r3
+ vld1.u8 {d2}, [r0], r1
+ vst1.u8 {d1}, [r2], r3
+ vld1.u8 {d3}, [r0], r1
+ vst1.u8 {d2}, [r2], r3
+ vst1.u8 {d3}, [r2], r3
+
+ mov pc, lr
+
+ ENDP ; |vp9_copy_mem8x4_neon|
+
+ END
--- /dev/null
+++ b/vp9/common/arm/neon/copymem8x8_neon.asm
@@ -1,0 +1,43 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+ EXPORT |vp9_copy_mem8x8_neon|
+ ; ARM
+ ; REQUIRE8
+ ; PRESERVE8
+
+ AREA Block, CODE, READONLY ; name this block of code
+;void copy_mem8x8_neon( unsigned char *src, int src_stride, unsigned char *dst, int dst_stride)
+;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
+|vp9_copy_mem8x8_neon| PROC
+
+ vld1.u8 {d0}, [r0], r1
+ vld1.u8 {d1}, [r0], r1
+ vst1.u8 {d0}, [r2], r3
+ vld1.u8 {d2}, [r0], r1
+ vst1.u8 {d1}, [r2], r3
+ vld1.u8 {d3}, [r0], r1
+ vst1.u8 {d2}, [r2], r3
+ vld1.u8 {d4}, [r0], r1
+ vst1.u8 {d3}, [r2], r3
+ vld1.u8 {d5}, [r0], r1
+ vst1.u8 {d4}, [r2], r3
+ vld1.u8 {d6}, [r0], r1
+ vst1.u8 {d5}, [r2], r3
+ vld1.u8 {d7}, [r0], r1
+ vst1.u8 {d6}, [r2], r3
+ vst1.u8 {d7}, [r2], r3
+
+ mov pc, lr
+
+ ENDP ; |vp9_copy_mem8x8_neon|
+
+ END
--- /dev/null
+++ b/vp9/common/arm/neon/dc_only_idct_add_neon.asm
@@ -1,0 +1,49 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license and patent
+; grant that can be found in the LICENSE file in the root of the source
+; tree. All contributing project authors may be found in the AUTHORS
+; file in the root of the source tree.
+;
+
+
+ EXPORT |vp8_dc_only_idct_add_neon|
+ ARM
+ REQUIRE8
+ PRESERVE8
+
+ AREA ||.text||, CODE, READONLY, ALIGN=2
+;void vp8_dc_only_idct_add_neon(short input_dc, unsigned char *pred_ptr,
+; unsigned char *dst_ptr, int pitch, int stride)
+; r0 input_dc
+; r1 pred_ptr
+; r2 dst_ptr
+; r3 pitch
+; sp stride
+|vp8_dc_only_idct_add_neon| PROC
+ add r0, r0, #4
+ asr r0, r0, #3
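+ ; r0 = (input_dc + 4) >> 3: the rounded DC value that is broadcast and
+ ; added to each pixel of the 4x4 prediction below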
+ ldr r12, [sp]
+ vdup.16 q0, r0
+
+ vld1.32 {d2[0]}, [r1], r3
+ vld1.32 {d2[1]}, [r1], r3
+ vld1.32 {d4[0]}, [r1], r3
+ vld1.32 {d4[1]}, [r1]
+
+ vaddw.u8 q1, q0, d2
+ vaddw.u8 q2, q0, d4
+
+ vqmovun.s16 d2, q1
+ vqmovun.s16 d4, q2
+
+ vst1.32 {d2[0]}, [r2], r12
+ vst1.32 {d2[1]}, [r2], r12
+ vst1.32 {d4[0]}, [r2], r12
+ vst1.32 {d4[1]}, [r2]
+
+ bx lr
+
+ ENDP
+ END
--- /dev/null
+++ b/vp9/common/arm/neon/iwalsh_neon.asm
@@ -1,0 +1,80 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+ EXPORT |vp8_short_inv_walsh4x4_neon|
+ EXPORT |vp8_short_inv_walsh4x4_1_neon|
+
+ ARM
+ REQUIRE8
+ PRESERVE8
+
+ AREA |.text|, CODE, READONLY ; name this block of code
+
+;short vp8_short_inv_walsh4x4_neon(short *input, short *output)
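+; 4x4 inverse Walsh-Hadamard transform: a butterfly pass over one dimension,
+; a transpose via vtrn, a second butterfly pass over the other dimension,
+; then each result is rounded as (x + 3) >> 3 and written back interleaved
+; with vst4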
+|vp8_short_inv_walsh4x4_neon| PROC
+
+ ; read in all four lines of values: d0->d3
+ vld1.i16 {q0-q1}, [r0@128]
+
+ ; first for loop
+ vadd.s16 d4, d0, d3 ;a = [0] + [12]
+ vadd.s16 d6, d1, d2 ;b = [4] + [8]
+ vsub.s16 d5, d0, d3 ;d = [0] - [12]
+ vsub.s16 d7, d1, d2 ;c = [4] - [8]
+
+ vadd.s16 q0, q2, q3 ; a+b d+c
+ vsub.s16 q1, q2, q3 ; a-b d-c
+
+ vtrn.32 d0, d2 ;d0: 0 1 8 9
+ ;d2: 2 3 10 11
+ vtrn.32 d1, d3 ;d1: 4 5 12 13
+ ;d3: 6 7 14 15
+
+ vtrn.16 d0, d1 ;d0: 0 4 8 12
+ ;d1: 1 5 9 13
+ vtrn.16 d2, d3 ;d2: 2 6 10 14
+ ;d3: 3 7 11 15
+
+ ; second for loop
+
+ vadd.s16 d4, d0, d3 ;a = [0] + [3]
+ vadd.s16 d6, d1, d2 ;b = [1] + [2]
+ vsub.s16 d5, d0, d3 ;d = [0] - [3]
+ vsub.s16 d7, d1, d2 ;c = [1] - [2]
+
+ vmov.i16 q8, #3
+
+ vadd.s16 q0, q2, q3 ; a+b d+c
+ vsub.s16 q1, q2, q3 ; a-b d-c
+
+ vadd.i16 q0, q0, q8 ;e/f += 3
+ vadd.i16 q1, q1, q8 ;g/h += 3
+
+ vshr.s16 q0, q0, #3 ;e/f >> 3
+ vshr.s16 q1, q1, #3 ;g/h >> 3
+
+ vst4.i16 {d0,d1,d2,d3}, [r1@128]
+
+ bx lr
+ ENDP ; |vp8_short_inv_walsh4x4_neon|
+
+
+;short vp8_short_inv_walsh4x4_1_neon(short *input, short *output)
+|vp8_short_inv_walsh4x4_1_neon| PROC
+ ldrsh r2, [r0] ; load input[0]
+ add r3, r2, #3 ; add 3
+ add r2, r1, #16 ; base for last 8 output
+ asr r0, r3, #3 ; right shift 3
+ vdup.16 q0, r0 ; load and duplicate
+ vst1.16 {q0}, [r1@128] ; write back 8
+ vst1.16 {q0}, [r2@128] ; write back last 8
+ bx lr
+ ENDP ; |vp8_short_inv_walsh4x4_1_neon|
+
+ END
--- /dev/null
+++ b/vp9/common/arm/neon/loopfilter_neon.asm
@@ -1,0 +1,397 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+ EXPORT |vp9_loop_filter_horizontal_edge_y_neon|
+ EXPORT |vp9_loop_filter_horizontal_edge_uv_neon|
+ EXPORT |vp9_loop_filter_vertical_edge_y_neon|
+ EXPORT |vp9_loop_filter_vertical_edge_uv_neon|
+ ARM
+
+ AREA ||.text||, CODE, READONLY, ALIGN=2
+
+; r0 unsigned char *src
+; r1 int pitch
+; r2 unsigned char blimit
+; r3 unsigned char limit
+; sp unsigned char thresh,
+|vp9_loop_filter_horizontal_edge_y_neon| PROC
+ push {lr}
+ vdup.u8 q0, r2 ; duplicate blimit
+ vdup.u8 q1, r3 ; duplicate limit
+ sub r2, r0, r1, lsl #2 ; move src pointer down by 4 lines
+ ldr r3, [sp, #4] ; load thresh
+ add r12, r2, r1
+ add r1, r1, r1
+
+ vdup.u8 q2, r3 ; duplicate thresh
+
+ vld1.u8 {q3}, [r2@128], r1 ; p3
+ vld1.u8 {q4}, [r12@128], r1 ; p2
+ vld1.u8 {q5}, [r2@128], r1 ; p1
+ vld1.u8 {q6}, [r12@128], r1 ; p0
+ vld1.u8 {q7}, [r2@128], r1 ; q0
+ vld1.u8 {q8}, [r12@128], r1 ; q1
+ vld1.u8 {q9}, [r2@128] ; q2
+ vld1.u8 {q10}, [r12@128] ; q3
+
+ sub r2, r2, r1, lsl #1
+ sub r12, r12, r1, lsl #1
+
+ bl vp9_loop_filter_neon
+
+ vst1.u8 {q5}, [r2@128], r1 ; store op1
+ vst1.u8 {q6}, [r12@128], r1 ; store op0
+ vst1.u8 {q7}, [r2@128], r1 ; store oq0
+ vst1.u8 {q8}, [r12@128], r1 ; store oq1
+
+ pop {pc}
+ ENDP ; |vp9_loop_filter_horizontal_edge_y_neon|
+
+
+; r0 unsigned char *u,
+; r1 int pitch,
+; r2 unsigned char blimit
+; r3 unsigned char limit
+; sp unsigned char thresh,
+; sp+4 unsigned char *v
+|vp9_loop_filter_horizontal_edge_uv_neon| PROC
+ push {lr}
+ vdup.u8 q0, r2 ; duplicate blimit
+ vdup.u8 q1, r3 ; duplicate limit
+ ldr r12, [sp, #4] ; load thresh
+ ldr r2, [sp, #8] ; load v ptr
+ vdup.u8 q2, r12 ; duplicate thresh
+
+ sub r3, r0, r1, lsl #2 ; move u pointer down by 4 lines
+ sub r12, r2, r1, lsl #2 ; move v pointer down by 4 lines
+
+ vld1.u8 {d6}, [r3@64], r1 ; p3
+ vld1.u8 {d7}, [r12@64], r1 ; p3
+ vld1.u8 {d8}, [r3@64], r1 ; p2
+ vld1.u8 {d9}, [r12@64], r1 ; p2
+ vld1.u8 {d10}, [r3@64], r1 ; p1
+ vld1.u8 {d11}, [r12@64], r1 ; p1
+ vld1.u8 {d12}, [r3@64], r1 ; p0
+ vld1.u8 {d13}, [r12@64], r1 ; p0
+ vld1.u8 {d14}, [r3@64], r1 ; q0
+ vld1.u8 {d15}, [r12@64], r1 ; q0
+ vld1.u8 {d16}, [r3@64], r1 ; q1
+ vld1.u8 {d17}, [r12@64], r1 ; q1
+ vld1.u8 {d18}, [r3@64], r1 ; q2
+ vld1.u8 {d19}, [r12@64], r1 ; q2
+ vld1.u8 {d20}, [r3@64] ; q3
+ vld1.u8 {d21}, [r12@64] ; q3
+
+ bl vp9_loop_filter_neon
+
+ sub r0, r0, r1, lsl #1
+ sub r2, r2, r1, lsl #1
+
+ vst1.u8 {d10}, [r0@64], r1 ; store u op1
+ vst1.u8 {d11}, [r2@64], r1 ; store v op1
+ vst1.u8 {d12}, [r0@64], r1 ; store u op0
+ vst1.u8 {d13}, [r2@64], r1 ; store v op0
+ vst1.u8 {d14}, [r0@64], r1 ; store u oq0
+ vst1.u8 {d15}, [r2@64], r1 ; store v oq0
+ vst1.u8 {d16}, [r0@64] ; store u oq1
+ vst1.u8 {d17}, [r2@64] ; store v oq1
+
+ pop {pc}
+ ENDP ; |vp9_loop_filter_horizontal_edge_uv_neon|
+
+; void vp9_loop_filter_vertical_edge_y_neon(unsigned char *src, int pitch,
+; const signed char *flimit,
+; const signed char *limit,
+; const signed char *thresh,
+; int count)
+; r0 unsigned char *src
+; r1 int pitch
+; r2 unsigned char blimit
+; r3 unsigned char limit
+; sp unsigned char thresh,
+
+|vp9_loop_filter_vertical_edge_y_neon| PROC
+ push {lr}
+ vdup.u8 q0, r2 ; duplicate blimit
+ vdup.u8 q1, r3 ; duplicate limit
+ sub r2, r0, #4 ; src ptr down by 4 columns
+ add r1, r1, r1
+ ldr r3, [sp, #4] ; load thresh
+ add r12, r2, r1, asr #1
+
+ vld1.u8 {d6}, [r2], r1
+ vld1.u8 {d8}, [r12], r1
+ vld1.u8 {d10}, [r2], r1
+ vld1.u8 {d12}, [r12], r1
+ vld1.u8 {d14}, [r2], r1
+ vld1.u8 {d16}, [r12], r1
+ vld1.u8 {d18}, [r2], r1
+ vld1.u8 {d20}, [r12], r1
+
+ vld1.u8 {d7}, [r2], r1 ; load second 8-line src data
+ vld1.u8 {d9}, [r12], r1
+ vld1.u8 {d11}, [r2], r1
+ vld1.u8 {d13}, [r12], r1
+ vld1.u8 {d15}, [r2], r1
+ vld1.u8 {d17}, [r12], r1
+ vld1.u8 {d19}, [r2]
+ vld1.u8 {d21}, [r12]
+
+ ;transpose to 8x16 matrix
+ vtrn.32 q3, q7
+ vtrn.32 q4, q8
+ vtrn.32 q5, q9
+ vtrn.32 q6, q10
+
+ vdup.u8 q2, r3 ; duplicate thresh
+
+ vtrn.16 q3, q5
+ vtrn.16 q4, q6
+ vtrn.16 q7, q9
+ vtrn.16 q8, q10
+
+ vtrn.8 q3, q4
+ vtrn.8 q5, q6
+ vtrn.8 q7, q8
+ vtrn.8 q9, q10
+
+ bl vp9_loop_filter_neon
+
+ vswp d12, d11
+ vswp d16, d13
+
+ sub r0, r0, #2 ; dst ptr
+
+ vswp d14, d12
+ vswp d16, d15
+
+ add r12, r0, r1, asr #1
+
+ ;store op1, op0, oq0, oq1
+ vst4.8 {d10[0], d11[0], d12[0], d13[0]}, [r0], r1
+ vst4.8 {d10[1], d11[1], d12[1], d13[1]}, [r12], r1
+ vst4.8 {d10[2], d11[2], d12[2], d13[2]}, [r0], r1
+ vst4.8 {d10[3], d11[3], d12[3], d13[3]}, [r12], r1
+ vst4.8 {d10[4], d11[4], d12[4], d13[4]}, [r0], r1
+ vst4.8 {d10[5], d11[5], d12[5], d13[5]}, [r12], r1
+ vst4.8 {d10[6], d11[6], d12[6], d13[6]}, [r0], r1
+ vst4.8 {d10[7], d11[7], d12[7], d13[7]}, [r12], r1
+
+ vst4.8 {d14[0], d15[0], d16[0], d17[0]}, [r0], r1
+ vst4.8 {d14[1], d15[1], d16[1], d17[1]}, [r12], r1
+ vst4.8 {d14[2], d15[2], d16[2], d17[2]}, [r0], r1
+ vst4.8 {d14[3], d15[3], d16[3], d17[3]}, [r12], r1
+ vst4.8 {d14[4], d15[4], d16[4], d17[4]}, [r0], r1
+ vst4.8 {d14[5], d15[5], d16[5], d17[5]}, [r12], r1
+ vst4.8 {d14[6], d15[6], d16[6], d17[6]}, [r0]
+ vst4.8 {d14[7], d15[7], d16[7], d17[7]}, [r12]
+
+ pop {pc}
+ ENDP ; |vp9_loop_filter_vertical_edge_y_neon|
+
+; void vp9_loop_filter_vertical_edge_uv_neon(unsigned char *u, int pitch,
+; const signed char *flimit,
+; const signed char *limit,
+; const signed char *thresh,
+; unsigned char *v)
+; r0 unsigned char *u,
+; r1 int pitch,
+; r2 unsigned char blimit
+; r3 unsigned char limit
+; sp unsigned char thresh,
+; sp+4 unsigned char *v
+|vp9_loop_filter_vertical_edge_uv_neon| PROC
+ push {lr}
+ vdup.u8 q0, r2 ; duplicate blimit
+ sub r12, r0, #4 ; move u pointer down by 4 columns
+ ldr r2, [sp, #8] ; load v ptr
+ vdup.u8 q1, r3 ; duplicate limit
+ sub r3, r2, #4 ; move v pointer down by 4 columns
+
+ vld1.u8 {d6}, [r12], r1 ;load u data
+ vld1.u8 {d7}, [r3], r1 ;load v data
+ vld1.u8 {d8}, [r12], r1
+ vld1.u8 {d9}, [r3], r1
+ vld1.u8 {d10}, [r12], r1
+ vld1.u8 {d11}, [r3], r1
+ vld1.u8 {d12}, [r12], r1
+ vld1.u8 {d13}, [r3], r1
+ vld1.u8 {d14}, [r12], r1
+ vld1.u8 {d15}, [r3], r1
+ vld1.u8 {d16}, [r12], r1
+ vld1.u8 {d17}, [r3], r1
+ vld1.u8 {d18}, [r12], r1
+ vld1.u8 {d19}, [r3], r1
+ vld1.u8 {d20}, [r12]
+ vld1.u8 {d21}, [r3]
+
+ ldr r12, [sp, #4] ; load thresh
+
+ ;transpose to 8x16 matrix
+ vtrn.32 q3, q7
+ vtrn.32 q4, q8
+ vtrn.32 q5, q9
+ vtrn.32 q6, q10
+
+ vdup.u8 q2, r12 ; duplicate thresh
+
+ vtrn.16 q3, q5
+ vtrn.16 q4, q6
+ vtrn.16 q7, q9
+ vtrn.16 q8, q10
+
+ vtrn.8 q3, q4
+ vtrn.8 q5, q6
+ vtrn.8 q7, q8
+ vtrn.8 q9, q10
+
+ bl vp9_loop_filter_neon
+
+ vswp d12, d11
+ vswp d16, d13
+ vswp d14, d12
+ vswp d16, d15
+
+ sub r0, r0, #2
+ sub r2, r2, #2
+
+ ;store op1, op0, oq0, oq1
+ vst4.8 {d10[0], d11[0], d12[0], d13[0]}, [r0], r1
+ vst4.8 {d14[0], d15[0], d16[0], d17[0]}, [r2], r1
+ vst4.8 {d10[1], d11[1], d12[1], d13[1]}, [r0], r1
+ vst4.8 {d14[1], d15[1], d16[1], d17[1]}, [r2], r1
+ vst4.8 {d10[2], d11[2], d12[2], d13[2]}, [r0], r1
+ vst4.8 {d14[2], d15[2], d16[2], d17[2]}, [r2], r1
+ vst4.8 {d10[3], d11[3], d12[3], d13[3]}, [r0], r1
+ vst4.8 {d14[3], d15[3], d16[3], d17[3]}, [r2], r1
+ vst4.8 {d10[4], d11[4], d12[4], d13[4]}, [r0], r1
+ vst4.8 {d14[4], d15[4], d16[4], d17[4]}, [r2], r1
+ vst4.8 {d10[5], d11[5], d12[5], d13[5]}, [r0], r1
+ vst4.8 {d14[5], d15[5], d16[5], d17[5]}, [r2], r1
+ vst4.8 {d10[6], d11[6], d12[6], d13[6]}, [r0], r1
+ vst4.8 {d14[6], d15[6], d16[6], d17[6]}, [r2], r1
+ vst4.8 {d10[7], d11[7], d12[7], d13[7]}, [r0]
+ vst4.8 {d14[7], d15[7], d16[7], d17[7]}, [r2]
+
+ pop {pc}
+ ENDP ; |vp9_loop_filter_vertical_edge_uv_neon|
+
+; void vp9_loop_filter_neon();
+; This is a helper function for the loopfilters. The individual functions do the
+; necessary load, transpose (if necessary) and store.
+
+; r0-r3 PRESERVE
+; q0 flimit
+; q1 limit
+; q2 thresh
+; q3 p3
+; q4 p2
+; q5 p1
+; q6 p0
+; q7 q0
+; q8 q1
+; q9 q2
+; q10 q3
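+;
+; Standard 4-tap loop filter, applied in place to q5-q8:
+; a filter mask is built from the neighbour differences against limit and
+; from 2*|p0-q0| + |p1-q1|/2 against the edge threshold in q0, plus a
+; high-edge-variance (hev) mask against thresh. The clamp(ps1 - qs1) term
+; is kept only where hev is set, 3*(qs0 - ps0) is added, and the clamped
+; sum is gated by the filter mask. p0/q0 then move by Filter2/Filter1 =
+; clamp(f+3)>>3 / clamp(f+4)>>3, and p1/q1 by (Filter1 + 1) >> 1 only
+; where hev is not set.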
+|vp9_loop_filter_neon| PROC
+
+ ; vp9_filter_mask
+ vabd.u8 q11, q3, q4 ; abs(p3 - p2)
+ vabd.u8 q12, q4, q5 ; abs(p2 - p1)
+ vabd.u8 q13, q5, q6 ; abs(p1 - p0)
+ vabd.u8 q14, q8, q7 ; abs(q1 - q0)
+ vabd.u8 q3, q9, q8 ; abs(q2 - q1)
+ vabd.u8 q4, q10, q9 ; abs(q3 - q2)
+
+ vmax.u8 q11, q11, q12
+ vmax.u8 q12, q13, q14
+ vmax.u8 q3, q3, q4
+ vmax.u8 q15, q11, q12
+
+ vabd.u8 q9, q6, q7 ; abs(p0 - q0)
+
+ ; vp8_hevmask
+ vcgt.u8 q13, q13, q2 ; (abs(p1 - p0) > thresh)*-1
+ vcgt.u8 q14, q14, q2 ; (abs(q1 - q0) > thresh)*-1
+ vmax.u8 q15, q15, q3
+
+ vmov.u8 q10, #0x80 ; 0x80
+
+ vabd.u8 q2, q5, q8 ; a = abs(p1 - q1)
+ vqadd.u8 q9, q9, q9 ; b = abs(p0 - q0) * 2
+
+ vcge.u8 q15, q1, q15
+
+ ; vp9_filter() function
+ ; convert to signed
+ veor q7, q7, q10 ; qs0
+ vshr.u8 q2, q2, #1 ; a = a / 2
+ veor q6, q6, q10 ; ps0
+
+ veor q5, q5, q10 ; ps1
+ vqadd.u8 q9, q9, q2 ; a = b + a
+
+ veor q8, q8, q10 ; qs1
+
+ vmov.u8 q10, #3 ; #3
+
+ vsubl.s8 q2, d14, d12 ; ( qs0 - ps0)
+ vsubl.s8 q11, d15, d13
+
+ vcge.u8 q9, q0, q9 ; (a > flimit * 2 + limit) * -1
+
+ vmovl.u8 q4, d20
+
+ vqsub.s8 q1, q5, q8 ; vp9_filter = clamp(ps1-qs1)
+ vorr q14, q13, q14 ; vp8_hevmask
+
+ vmul.i16 q2, q2, q4 ; 3 * ( qs0 - ps0)
+ vmul.i16 q11, q11, q4
+
+ vand q1, q1, q14 ; vp9_filter &= hev
+ vand q15, q15, q9 ; vp9_filter_mask
+
+ vaddw.s8 q2, q2, d2
+ vaddw.s8 q11, q11, d3
+
+ vmov.u8 q9, #4 ; #4
+
+ ; vp9_filter = clamp(vp9_filter + 3 * ( qs0 - ps0))
+ vqmovn.s16 d2, q2
+ vqmovn.s16 d3, q11
+ vand q1, q1, q15 ; vp9_filter &= mask
+
+ vqadd.s8 q2, q1, q10 ; Filter2 = clamp(vp9_filter+3)
+ vqadd.s8 q1, q1, q9 ; Filter1 = clamp(vp9_filter+4)
+ vshr.s8 q2, q2, #3 ; Filter2 >>= 3
+ vshr.s8 q1, q1, #3 ; Filter1 >>= 3
+
+
+ vqadd.s8 q11, q6, q2 ; u = clamp(ps0 + Filter2)
+ vqsub.s8 q10, q7, q1 ; u = clamp(qs0 - Filter1)
+
+ ; outer tap adjustments: ++vp9_filter >> 1
+ vrshr.s8 q1, q1, #1
+ vbic q1, q1, q14 ; vp9_filter &= ~hev
+ vmov.u8 q0, #0x80 ; 0x80
+ vqadd.s8 q13, q5, q1 ; u = clamp(ps1 + vp9_filter)
+ vqsub.s8 q12, q8, q1 ; u = clamp(qs1 - vp9_filter)
+
+ veor q6, q11, q0 ; *op0 = u^0x80
+ veor q7, q10, q0 ; *oq0 = u^0x80
+ veor q5, q13, q0 ; *op1 = u^0x80
+ veor q8, q12, q0 ; *oq1 = u^0x80
+
+ bx lr
+ ENDP ; |vp9_loop_filter_neon|
+
+;-----------------
+
+ END
--- /dev/null
+++ b/vp9/common/arm/neon/loopfiltersimplehorizontaledge_neon.asm
@@ -1,0 +1,117 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+ ;EXPORT |vp9_loop_filter_simple_horizontal_edge_neon|
+ EXPORT |vp9_loop_filter_bhs_neon|
+ EXPORT |vp9_loop_filter_mbhs_neon|
+ ARM
+ PRESERVE8
+
+ AREA ||.text||, CODE, READONLY, ALIGN=2
+
+; r0 unsigned char *s, PRESERVE
+; r1 int p, PRESERVE
+; q1 limit, PRESERVE
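+;
+; Simple (2-tap) filter: only p0/q0 are adjusted. The filter value
+; clamp(clamp(ps1 - qs1) + 3*(qs0 - ps0)) is gated by a single test,
+; 2*|p0-q0| + |p1-q1|/2 against the threshold in q1; there is no
+; hev/thresh stage.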
+
+|vp9_loop_filter_simple_horizontal_edge_neon| PROC
+
+ sub r3, r0, r1, lsl #1 ; move src pointer down by 2 lines
+
+ vld1.u8 {q7}, [r0@128], r1 ; q0
+ vld1.u8 {q5}, [r3@128], r1 ; p0
+ vld1.u8 {q8}, [r0@128] ; q1
+ vld1.u8 {q6}, [r3@128] ; p1
+
+ vabd.u8 q15, q6, q7 ; abs(p0 - q0)
+ vabd.u8 q14, q5, q8 ; abs(p1 - q1)
+
+ vqadd.u8 q15, q15, q15 ; abs(p0 - q0) * 2
+ vshr.u8 q14, q14, #1 ; abs(p1 - q1) / 2
+ vmov.u8 q0, #0x80 ; 0x80
+ vmov.s16 q13, #3
+ vqadd.u8 q15, q15, q14 ; abs(p0 - q0) * 2 + abs(p1 - q1) / 2
+
+ veor q7, q7, q0 ; qs0: q0 offset to convert to a signed value
+ veor q6, q6, q0 ; ps0: p0 offset to convert to a signed value
+ veor q5, q5, q0 ; ps1: p1 offset to convert to a signed value
+ veor q8, q8, q0 ; qs1: q1 offset to convert to a signed value
+
+ vcge.u8 q15, q1, q15 ; (abs(p0 - q0)*2 + abs(p1-q1)/2 > limit)*-1
+
+ vsubl.s8 q2, d14, d12 ; ( qs0 - ps0)
+ vsubl.s8 q3, d15, d13
+
+ vqsub.s8 q4, q5, q8 ; q4: vp9_filter = vp9_signed_char_clamp(ps1-qs1)
+
+ vmul.s16 q2, q2, q13 ; 3 * ( qs0 - ps0)
+ vmul.s16 q3, q3, q13
+
+ vmov.u8 q10, #0x03 ; 0x03
+ vmov.u8 q9, #0x04 ; 0x04
+
+ vaddw.s8 q2, q2, d8 ; vp9_filter + 3 * ( qs0 - ps0)
+ vaddw.s8 q3, q3, d9
+
+ vqmovn.s16 d8, q2 ; vp9_filter = vp9_signed_char_clamp(vp9_filter + 3 * ( qs0 - ps0))
+ vqmovn.s16 d9, q3
+
+ vand q14, q4, q15 ; vp9_filter &= mask
+
+ vqadd.s8 q2, q14, q10 ; Filter2 = vp9_signed_char_clamp(vp9_filter+3)
+ vqadd.s8 q3, q14, q9 ; Filter1 = vp9_signed_char_clamp(vp9_filter+4)
+ vshr.s8 q2, q2, #3 ; Filter2 >>= 3
+ vshr.s8 q4, q3, #3 ; Filter1 >>= 3
+
+ sub r0, r0, r1
+
+ ;calculate output
+ vqadd.s8 q11, q6, q2 ; u = vp9_signed_char_clamp(ps0 + Filter2)
+ vqsub.s8 q10, q7, q4 ; u = vp9_signed_char_clamp(qs0 - Filter1)
+
+ veor q6, q11, q0 ; *op0 = u^0x80
+ veor q7, q10, q0 ; *oq0 = u^0x80
+
+ vst1.u8 {q6}, [r3@128] ; store op0
+ vst1.u8 {q7}, [r0@128] ; store oq0
+
+ bx lr
+ ENDP ; |vp9_loop_filter_simple_horizontal_edge_neon|
+
+; r0 unsigned char *y
+; r1 int ystride
+; r2 const unsigned char *blimit
+
+|vp9_loop_filter_bhs_neon| PROC
+ push {r4, lr}
+ ldrb r3, [r2] ; load blim from mem
+ vdup.s8 q1, r3 ; duplicate blim
+
+ add r0, r0, r1, lsl #2 ; src = y_ptr + 4 * y_stride
+ bl vp9_loop_filter_simple_horizontal_edge_neon
+ ; vp9_loop_filter_simple_horizontal_edge_neon preserves r0, r1 and q1
+ add r0, r0, r1, lsl #2 ; src = y_ptr + 8* y_stride
+ bl vp9_loop_filter_simple_horizontal_edge_neon
+ add r0, r0, r1, lsl #2 ; src = y_ptr + 12 * y_stride
+ pop {r4, lr}
+ b vp9_loop_filter_simple_horizontal_edge_neon
+ ENDP ;|vp9_loop_filter_bhs_neon|
+
+; r0 unsigned char *y
+; r1 int ystride
+; r2 const unsigned char *blimit
+
+|vp9_loop_filter_mbhs_neon| PROC
+ ldrb r3, [r2] ; load blim from mem
+ vdup.s8 q1, r3 ; duplicate mblim
+ b vp9_loop_filter_simple_horizontal_edge_neon
+ ENDP ;|vp9_loop_filter_mbhs_neon|
+
+ END
--- /dev/null
+++ b/vp9/common/arm/neon/loopfiltersimpleverticaledge_neon.asm
@@ -1,0 +1,154 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+ ;EXPORT |vp9_loop_filter_simple_vertical_edge_neon|
+ EXPORT |vp9_loop_filter_bvs_neon|
+ EXPORT |vp9_loop_filter_mbvs_neon|
+ ARM
+ PRESERVE8
+
+ AREA ||.text||, CODE, READONLY, ALIGN=2
+
+; r0 unsigned char *s, PRESERVE
+; r1 int p, PRESERVE
+; q1 limit, PRESERVE
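+;
+; Same 2-tap simple filter as the horizontal version, but the columns
+; around the edge are gathered with vld4 element loads (an implicit
+; transpose) and the filtered p0/q0 pairs are written back with vst2.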
+
+|vp9_loop_filter_simple_vertical_edge_neon| PROC
+ sub r0, r0, #2 ; move src pointer down by 2 columns
+ add r12, r1, r1
+ add r3, r0, r1
+
+ vld4.8 {d6[0], d7[0], d8[0], d9[0]}, [r0], r12
+ vld4.8 {d6[1], d7[1], d8[1], d9[1]}, [r3], r12
+ vld4.8 {d6[2], d7[2], d8[2], d9[2]}, [r0], r12
+ vld4.8 {d6[3], d7[3], d8[3], d9[3]}, [r3], r12
+ vld4.8 {d6[4], d7[4], d8[4], d9[4]}, [r0], r12
+ vld4.8 {d6[5], d7[5], d8[5], d9[5]}, [r3], r12
+ vld4.8 {d6[6], d7[6], d8[6], d9[6]}, [r0], r12
+ vld4.8 {d6[7], d7[7], d8[7], d9[7]}, [r3], r12
+
+ vld4.8 {d10[0], d11[0], d12[0], d13[0]}, [r0], r12
+ vld4.8 {d10[1], d11[1], d12[1], d13[1]}, [r3], r12
+ vld4.8 {d10[2], d11[2], d12[2], d13[2]}, [r0], r12
+ vld4.8 {d10[3], d11[3], d12[3], d13[3]}, [r3], r12
+ vld4.8 {d10[4], d11[4], d12[4], d13[4]}, [r0], r12
+ vld4.8 {d10[5], d11[5], d12[5], d13[5]}, [r3], r12
+ vld4.8 {d10[6], d11[6], d12[6], d13[6]}, [r0], r12
+ vld4.8 {d10[7], d11[7], d12[7], d13[7]}, [r3]
+
+ vswp d7, d10
+ vswp d12, d9
+
+ ;vp9_filter_mask() function
+ ;vp8_hevmask() function
+ sub r0, r0, r1, lsl #4
+ vabd.u8 q15, q5, q4 ; abs(p0 - q0)
+ vabd.u8 q14, q3, q6 ; abs(p1 - q1)
+
+ vqadd.u8 q15, q15, q15 ; abs(p0 - q0) * 2
+ vshr.u8 q14, q14, #1 ; abs(p1 - q1) / 2
+ vmov.u8 q0, #0x80 ; 0x80
+ vmov.s16 q11, #3
+ vqadd.u8 q15, q15, q14 ; abs(p0 - q0) * 2 + abs(p1 - q1) / 2
+
+ veor q4, q4, q0 ; qs0: q0 offset to convert to a signed value
+ veor q5, q5, q0 ; ps0: p0 offset to convert to a signed value
+ veor q3, q3, q0 ; ps1: p1 offset to convert to a signed value
+ veor q6, q6, q0 ; qs1: q1 offset to convert to a signed value
+
+ vcge.u8 q15, q1, q15 ; abs(p0 - q0)*2 + abs(p1-q1)/2 > flimit*2 + limit)*-1
+
+ vsubl.s8 q2, d8, d10 ; ( qs0 - ps0)
+ vsubl.s8 q13, d9, d11
+
+ vqsub.s8 q14, q3, q6 ; vp9_filter = vp9_signed_char_clamp(ps1-qs1)
+
+ vmul.s16 q2, q2, q11 ; 3 * ( qs0 - ps0)
+ vmul.s16 q13, q13, q11
+
+ vmov.u8 q11, #0x03 ; 0x03
+ vmov.u8 q12, #0x04 ; 0x04
+
+ vaddw.s8 q2, q2, d28 ; vp9_filter + 3 * ( qs0 - ps0)
+ vaddw.s8 q13, q13, d29
+
+ vqmovn.s16 d28, q2 ; vp9_filter = vp9_signed_char_clamp(vp9_filter + 3 * ( qs0 - ps0))
+ vqmovn.s16 d29, q13
+
+ add r0, r0, #1
+ add r3, r0, r1
+
+ vand q14, q14, q15 ; vp9_filter &= mask
+
+ vqadd.s8 q2, q14, q11 ; Filter2 = vp9_signed_char_clamp(vp9_filter+3)
+ vqadd.s8 q3, q14, q12 ; Filter1 = vp9_signed_char_clamp(vp9_filter+4)
+ vshr.s8 q2, q2, #3 ; Filter2 >>= 3
+ vshr.s8 q14, q3, #3 ; Filter1 >>= 3
+
+ ;calculate output
+ vqadd.s8 q11, q5, q2 ; u = vp9_signed_char_clamp(ps0 + Filter2)
+ vqsub.s8 q10, q4, q14 ; u = vp9_signed_char_clamp(qs0 - Filter1)
+
+ veor q6, q11, q0 ; *op0 = u^0x80
+ veor q7, q10, q0 ; *oq0 = u^0x80
+ add r12, r1, r1
+ vswp d13, d14
+
+ ;store op1, op0, oq0, oq1
+ vst2.8 {d12[0], d13[0]}, [r0], r12
+ vst2.8 {d12[1], d13[1]}, [r3], r12
+ vst2.8 {d12[2], d13[2]}, [r0], r12
+ vst2.8 {d12[3], d13[3]}, [r3], r12
+ vst2.8 {d12[4], d13[4]}, [r0], r12
+ vst2.8 {d12[5], d13[5]}, [r3], r12
+ vst2.8 {d12[6], d13[6]}, [r0], r12
+ vst2.8 {d12[7], d13[7]}, [r3], r12
+ vst2.8 {d14[0], d15[0]}, [r0], r12
+ vst2.8 {d14[1], d15[1]}, [r3], r12
+ vst2.8 {d14[2], d15[2]}, [r0], r12
+ vst2.8 {d14[3], d15[3]}, [r3], r12
+ vst2.8 {d14[4], d15[4]}, [r0], r12
+ vst2.8 {d14[5], d15[5]}, [r3], r12
+ vst2.8 {d14[6], d15[6]}, [r0], r12
+ vst2.8 {d14[7], d15[7]}, [r3]
+
+ bx lr
+ ENDP ; |vp9_loop_filter_simple_vertical_edge_neon|
+
+; r0 unsigned char *y
+; r1 int ystride
+; r2 const unsigned char *blimit
+
+|vp9_loop_filter_bvs_neon| PROC
+ push {r4, lr}
+ ldrb r3, [r2] ; load blim from mem
+ mov r4, r0
+ add r0, r0, #4
+ vdup.s8 q1, r3 ; duplicate blim
+ bl vp9_loop_filter_simple_vertical_edge_neon
+ ; vp9_loop_filter_simple_vertical_edge_neon preserves r1 and q1
+ add r0, r4, #8
+ bl vp9_loop_filter_simple_vertical_edge_neon
+ add r0, r4, #12
+ pop {r4, lr}
+ b vp9_loop_filter_simple_vertical_edge_neon
+ ENDP ;|vp9_loop_filter_bvs_neon|
+
+; r0 unsigned char *y
+; r1 int ystride
+; r2 const unsigned char *blimit
+
+|vp9_loop_filter_mbvs_neon| PROC
+ ldrb r3, [r2] ; load mblim from mem
+ vdup.s8 q1, r3 ; duplicate mblim
+ b vp9_loop_filter_simple_vertical_edge_neon
+ ENDP ;|vp9_loop_filter_mbvs_neon|
+ END
--- /dev/null
+++ b/vp9/common/arm/neon/mbloopfilter_neon.asm
@@ -1,0 +1,469 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+ EXPORT |vp8_mbloop_filter_horizontal_edge_y_neon|
+ EXPORT |vp8_mbloop_filter_horizontal_edge_uv_neon|
+ EXPORT |vp8_mbloop_filter_vertical_edge_y_neon|
+ EXPORT |vp8_mbloop_filter_vertical_edge_uv_neon|
+ ARM
+
+ AREA ||.text||, CODE, READONLY, ALIGN=2
+
+; void vp8_mbloop_filter_horizontal_edge_y_neon(unsigned char *src, int pitch,
+; const unsigned char *blimit,
+; const unsigned char *limit,
+; const unsigned char *thresh)
+; r0 unsigned char *src,
+; r1 int pitch,
+; r2 unsigned char blimit
+; r3 unsigned char limit
+; sp unsigned char thresh,
+|vp8_mbloop_filter_horizontal_edge_y_neon| PROC
+ push {lr}
+ add r1, r1, r1 ; double stride
+ ldr r12, [sp, #4] ; load thresh
+ sub r0, r0, r1, lsl #1 ; move src pointer down by 4 lines
+ vdup.u8 q2, r12 ; thresh
+ add r12, r0, r1, lsr #1 ; move src pointer up by 1 line
+
+ vld1.u8 {q3}, [r0@128], r1 ; p3
+ vld1.u8 {q4}, [r12@128], r1 ; p2
+ vld1.u8 {q5}, [r0@128], r1 ; p1
+ vld1.u8 {q6}, [r12@128], r1 ; p0
+ vld1.u8 {q7}, [r0@128], r1 ; q0
+ vld1.u8 {q8}, [r12@128], r1 ; q1
+ vld1.u8 {q9}, [r0@128], r1 ; q2
+ vld1.u8 {q10}, [r12@128], r1 ; q3
+
+ bl vp8_mbloop_filter_neon
+
+ sub r12, r12, r1, lsl #2
+ add r0, r12, r1, lsr #1
+
+ vst1.u8 {q4}, [r12@128],r1 ; store op2
+ vst1.u8 {q5}, [r0@128],r1 ; store op1
+ vst1.u8 {q6}, [r12@128], r1 ; store op0
+ vst1.u8 {q7}, [r0@128],r1 ; store oq0
+ vst1.u8 {q8}, [r12@128] ; store oq1
+ vst1.u8 {q9}, [r0@128] ; store oq2
+
+ pop {pc}
+ ENDP ; |vp8_mbloop_filter_horizontal_edge_y_neon|
+
+; void vp8_mbloop_filter_horizontal_edge_uv_neon(unsigned char *u, int pitch,
+; const unsigned char *blimit,
+; const unsigned char *limit,
+; const unsigned char *thresh,
+; unsigned char *v)
+; r0 unsigned char *u,
+; r1 int pitch,
+; r2 unsigned char blimit
+; r3 unsigned char limit
+; sp unsigned char thresh,
+; sp+4 unsigned char *v
+
+|vp8_mbloop_filter_horizontal_edge_uv_neon| PROC
+ push {lr}
+ ldr r12, [sp, #4] ; load thresh
+ sub r0, r0, r1, lsl #2 ; move u pointer down by 4 lines
+ vdup.u8 q2, r12 ; thresh
+ ldr r12, [sp, #8] ; load v ptr
+ sub r12, r12, r1, lsl #2 ; move v pointer down by 4 lines
+
+ vld1.u8 {d6}, [r0@64], r1 ; p3
+ vld1.u8 {d7}, [r12@64], r1 ; p3
+ vld1.u8 {d8}, [r0@64], r1 ; p2
+ vld1.u8 {d9}, [r12@64], r1 ; p2
+ vld1.u8 {d10}, [r0@64], r1 ; p1
+ vld1.u8 {d11}, [r12@64], r1 ; p1
+ vld1.u8 {d12}, [r0@64], r1 ; p0
+ vld1.u8 {d13}, [r12@64], r1 ; p0
+ vld1.u8 {d14}, [r0@64], r1 ; q0
+ vld1.u8 {d15}, [r12@64], r1 ; q0
+ vld1.u8 {d16}, [r0@64], r1 ; q1
+ vld1.u8 {d17}, [r12@64], r1 ; q1
+ vld1.u8 {d18}, [r0@64], r1 ; q2
+ vld1.u8 {d19}, [r12@64], r1 ; q2
+ vld1.u8 {d20}, [r0@64], r1 ; q3
+ vld1.u8 {d21}, [r12@64], r1 ; q3
+
+ bl vp8_mbloop_filter_neon
+
+ sub r0, r0, r1, lsl #3
+ sub r12, r12, r1, lsl #3
+
+ add r0, r0, r1
+ add r12, r12, r1
+
+ vst1.u8 {d8}, [r0@64], r1 ; store u op2
+ vst1.u8 {d9}, [r12@64], r1 ; store v op2
+ vst1.u8 {d10}, [r0@64], r1 ; store u op1
+ vst1.u8 {d11}, [r12@64], r1 ; store v op1
+ vst1.u8 {d12}, [r0@64], r1 ; store u op0
+ vst1.u8 {d13}, [r12@64], r1 ; store v op0
+ vst1.u8 {d14}, [r0@64], r1 ; store u oq0
+ vst1.u8 {d15}, [r12@64], r1 ; store v oq0
+ vst1.u8 {d16}, [r0@64], r1 ; store u oq1
+ vst1.u8 {d17}, [r12@64], r1 ; store v oq1
+ vst1.u8 {d18}, [r0@64], r1 ; store u oq2
+ vst1.u8 {d19}, [r12@64], r1 ; store v oq2
+
+ pop {pc}
+ ENDP ; |vp8_mbloop_filter_horizontal_edge_uv_neon|
+
+; void vp8_mbloop_filter_vertical_edge_y_neon(unsigned char *src, int pitch,
+; const unsigned char *blimit,
+; const unsigned char *limit,
+; const unsigned char *thresh)
+; r0 unsigned char *src,
+; r1 int pitch,
+; r2 unsigned char blimit
+; r3 unsigned char limit
+; sp unsigned char thresh,
+|vp8_mbloop_filter_vertical_edge_y_neon| PROC
+ push {lr}
+ ldr r12, [sp, #4] ; load thresh
+ sub r0, r0, #4 ; move src pointer down by 4 columns
+ vdup.s8 q2, r12 ; thresh
+ add r12, r0, r1, lsl #3 ; move src pointer down by 8 lines
+
+ vld1.u8 {d6}, [r0], r1 ; load first 8-line src data
+ vld1.u8 {d7}, [r12], r1 ; load second 8-line src data
+ vld1.u8 {d8}, [r0], r1
+ vld1.u8 {d9}, [r12], r1
+ vld1.u8 {d10}, [r0], r1
+ vld1.u8 {d11}, [r12], r1
+ vld1.u8 {d12}, [r0], r1
+ vld1.u8 {d13}, [r12], r1
+ vld1.u8 {d14}, [r0], r1
+ vld1.u8 {d15}, [r12], r1
+ vld1.u8 {d16}, [r0], r1
+ vld1.u8 {d17}, [r12], r1
+ vld1.u8 {d18}, [r0], r1
+ vld1.u8 {d19}, [r12], r1
+ vld1.u8 {d20}, [r0], r1
+ vld1.u8 {d21}, [r12], r1
+
+ ;transpose to 8x16 matrix
+ vtrn.32 q3, q7
+ vtrn.32 q4, q8
+ vtrn.32 q5, q9
+ vtrn.32 q6, q10
+
+ vtrn.16 q3, q5
+ vtrn.16 q4, q6
+ vtrn.16 q7, q9
+ vtrn.16 q8, q10
+
+ vtrn.8 q3, q4
+ vtrn.8 q5, q6
+ vtrn.8 q7, q8
+ vtrn.8 q9, q10
+
+ sub r0, r0, r1, lsl #3
+
+ bl vp8_mbloop_filter_neon
+
+ sub r12, r12, r1, lsl #3
+
+ ;transpose to 16x8 matrix
+ vtrn.32 q3, q7
+ vtrn.32 q4, q8
+ vtrn.32 q5, q9
+ vtrn.32 q6, q10
+
+ vtrn.16 q3, q5
+ vtrn.16 q4, q6
+ vtrn.16 q7, q9
+ vtrn.16 q8, q10
+
+ vtrn.8 q3, q4
+ vtrn.8 q5, q6
+ vtrn.8 q7, q8
+ vtrn.8 q9, q10
+
+ ;store op2, op1, op0, oq0, oq1, oq2
+ vst1.8 {d6}, [r0], r1
+ vst1.8 {d7}, [r12], r1
+ vst1.8 {d8}, [r0], r1
+ vst1.8 {d9}, [r12], r1
+ vst1.8 {d10}, [r0], r1
+ vst1.8 {d11}, [r12], r1
+ vst1.8 {d12}, [r0], r1
+ vst1.8 {d13}, [r12], r1
+ vst1.8 {d14}, [r0], r1
+ vst1.8 {d15}, [r12], r1
+ vst1.8 {d16}, [r0], r1
+ vst1.8 {d17}, [r12], r1
+ vst1.8 {d18}, [r0], r1
+ vst1.8 {d19}, [r12], r1
+ vst1.8 {d20}, [r0]
+ vst1.8 {d21}, [r12]
+
+ pop {pc}
+ ENDP ; |vp8_mbloop_filter_vertical_edge_y_neon|
+
+; void vp8_mbloop_filter_vertical_edge_uv_neon(unsigned char *u, int pitch,
+; const unsigned char *blimit,
+; const unsigned char *limit,
+; const unsigned char *thresh,
+; unsigned char *v)
+; r0 unsigned char *u,
+; r1 int pitch,
+; r2 unsigned char blimit
+; r3 unsigned char limit
+; sp unsigned char thresh,
+; sp+4 unsigned char *v
+|vp8_mbloop_filter_vertical_edge_uv_neon| PROC
+ push {lr}
+ ldr r12, [sp, #4] ; load thresh
+ sub r0, r0, #4 ; move u pointer down by 4 columns
+ vdup.u8 q2, r12 ; thresh
+ ldr r12, [sp, #8] ; load v ptr
+ sub r12, r12, #4 ; move v pointer down by 4 columns
+
+ vld1.u8 {d6}, [r0], r1 ;load u data
+ vld1.u8 {d7}, [r12], r1 ;load v data
+ vld1.u8 {d8}, [r0], r1
+ vld1.u8 {d9}, [r12], r1
+ vld1.u8 {d10}, [r0], r1
+ vld1.u8 {d11}, [r12], r1
+ vld1.u8 {d12}, [r0], r1
+ vld1.u8 {d13}, [r12], r1
+ vld1.u8 {d14}, [r0], r1
+ vld1.u8 {d15}, [r12], r1
+ vld1.u8 {d16}, [r0], r1
+ vld1.u8 {d17}, [r12], r1
+ vld1.u8 {d18}, [r0], r1
+ vld1.u8 {d19}, [r12], r1
+ vld1.u8 {d20}, [r0], r1
+ vld1.u8 {d21}, [r12], r1
+
+ ;transpose to 8x16 matrix
+ vtrn.32 q3, q7
+ vtrn.32 q4, q8
+ vtrn.32 q5, q9
+ vtrn.32 q6, q10
+
+ vtrn.16 q3, q5
+ vtrn.16 q4, q6
+ vtrn.16 q7, q9
+ vtrn.16 q8, q10
+
+ vtrn.8 q3, q4
+ vtrn.8 q5, q6
+ vtrn.8 q7, q8
+ vtrn.8 q9, q10
+
+ sub r0, r0, r1, lsl #3
+
+ bl vp8_mbloop_filter_neon
+
+ sub r12, r12, r1, lsl #3
+
+ ;transpose to 16x8 matrix
+ vtrn.32 q3, q7
+ vtrn.32 q4, q8
+ vtrn.32 q5, q9
+ vtrn.32 q6, q10
+
+ vtrn.16 q3, q5
+ vtrn.16 q4, q6
+ vtrn.16 q7, q9
+ vtrn.16 q8, q10
+
+ vtrn.8 q3, q4
+ vtrn.8 q5, q6
+ vtrn.8 q7, q8
+ vtrn.8 q9, q10
+
+ ;store op2, op1, op0, oq0, oq1, oq2
+ vst1.8 {d6}, [r0], r1
+ vst1.8 {d7}, [r12], r1
+ vst1.8 {d8}, [r0], r1
+ vst1.8 {d9}, [r12], r1
+ vst1.8 {d10}, [r0], r1
+ vst1.8 {d11}, [r12], r1
+ vst1.8 {d12}, [r0], r1
+ vst1.8 {d13}, [r12], r1
+ vst1.8 {d14}, [r0], r1
+ vst1.8 {d15}, [r12], r1
+ vst1.8 {d16}, [r0], r1
+ vst1.8 {d17}, [r12], r1
+ vst1.8 {d18}, [r0], r1
+ vst1.8 {d19}, [r12], r1
+ vst1.8 {d20}, [r0]
+ vst1.8 {d21}, [r12]
+
+ pop {pc}
+ ENDP ; |vp8_mbloop_filter_vertical_edge_uv_neon|
+
+; void vp8_mbloop_filter_neon()
+; This is a helper function for the macroblock loopfilters. The individual
+; functions do the necessary load, transpose (if necessary), preserve (if
+; necessary) and store.
+
+; r0,r1 PRESERVE
+; r2 mblimit
+; r3 limit
+
+; q2 thresh
+; q3 p3 PRESERVE
+; q4 p2
+; q5 p1
+; q6 p0
+; q7 q0
+; q8 q1
+; q9 q2
+; q10 q3 PRESERVE
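+;
+; Macroblock (wide) loop filter. Where hev is set, p0/q0 move by
+; Filter2 = clamp(f+3)>>3 and Filter1 = clamp(f+4)>>3; elsewhere the
+; filter value (masked by ~hev) drives the wider taps
+; u = clamp((63 + f*27) >> 7), clamp((63 + f*18) >> 7) and
+; clamp((63 + f*9) >> 7), applied to p0/q0, p1/q1 and p2/q2 respectively
+; (the "roughly 3/7, 2/7, 1/7" adjustments noted below).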
+
+|vp8_mbloop_filter_neon| PROC
+
+ ; vp9_filter_mask
+ vabd.u8 q11, q3, q4 ; abs(p3 - p2)
+ vabd.u8 q12, q4, q5 ; abs(p2 - p1)
+ vabd.u8 q13, q5, q6 ; abs(p1 - p0)
+ vabd.u8 q14, q8, q7 ; abs(q1 - q0)
+ vabd.u8 q1, q9, q8 ; abs(q2 - q1)
+ vabd.u8 q0, q10, q9 ; abs(q3 - q2)
+
+ vmax.u8 q11, q11, q12
+ vmax.u8 q12, q13, q14
+ vmax.u8 q1, q1, q0
+ vmax.u8 q15, q11, q12
+
+ vabd.u8 q12, q6, q7 ; abs(p0 - q0)
+
+ ; vp8_hevmask
+ vcgt.u8 q13, q13, q2 ; (abs(p1 - p0) > thresh) * -1
+ vcgt.u8 q14, q14, q2 ; (abs(q1 - q0) > thresh) * -1
+ vmax.u8 q15, q15, q1
+
+ vdup.u8 q1, r3 ; limit
+ vdup.u8 q2, r2 ; mblimit
+
+ vmov.u8 q0, #0x80 ; 0x80
+
+ vcge.u8 q15, q1, q15
+
+ vabd.u8 q1, q5, q8 ; a = abs(p1 - q1)
+ vqadd.u8 q12, q12, q12 ; b = abs(p0 - q0) * 2
+ vmov.u16 q11, #3 ; #3
+
+ ; vp9_filter
+ ; convert to signed
+ veor q7, q7, q0 ; qs0
+ vshr.u8 q1, q1, #1 ; a = a / 2
+ veor q6, q6, q0 ; ps0
+ veor q5, q5, q0 ; ps1
+
+ vqadd.u8 q12, q12, q1 ; a = b + a
+
+ veor q8, q8, q0 ; qs1
+ veor q4, q4, q0 ; ps2
+ veor q9, q9, q0 ; qs2
+
+ vorr q14, q13, q14 ; vp8_hevmask
+
+ vcge.u8 q12, q2, q12 ; (a > flimit * 2 + limit) * -1
+
+ vsubl.s8 q2, d14, d12 ; qs0 - ps0
+ vsubl.s8 q13, d15, d13
+
+ vqsub.s8 q1, q5, q8 ; vp9_filter = clamp(ps1-qs1)
+
+ vmul.i16 q2, q2, q11 ; 3 * ( qs0 - ps0)
+
+ vand q15, q15, q12 ; vp9_filter_mask
+
+ vmul.i16 q13, q13, q11
+
+ vmov.u8 q12, #3 ; #3
+
+ vaddw.s8 q2, q2, d2 ; vp9_filter + 3 * ( qs0 - ps0)
+ vaddw.s8 q13, q13, d3
+
+ vmov.u8 q11, #4 ; #4
+
+ ; vp9_filter = clamp(vp9_filter + 3 * ( qs0 - ps0))
+ vqmovn.s16 d2, q2
+ vqmovn.s16 d3, q13
+
+ vand q1, q1, q15 ; vp9_filter &= mask
+
+ vmov.u16 q15, #63 ; #63
+
+ vand q13, q1, q14 ; Filter2 &= hev
+
+ vqadd.s8 q2, q13, q11 ; Filter1 = clamp(Filter2+4)
+ vqadd.s8 q13, q13, q12 ; Filter2 = clamp(Filter2+3)
+
+ vmov q0, q15
+
+ vshr.s8 q2, q2, #3 ; Filter1 >>= 3
+ vshr.s8 q13, q13, #3 ; Filter2 >>= 3
+
+ vmov q11, q15
+ vmov q12, q15
+
+ vqsub.s8 q7, q7, q2 ; qs0 = clamp(qs0 - Filter1)
+
+ vqadd.s8 q6, q6, q13 ; ps0 = clamp(ps0 + Filter2)
+
+ vbic q1, q1, q14 ; vp9_filter &= ~hev
+
+ ; roughly 1/7th difference across boundary
+ ; roughly 2/7th difference across boundary
+ ; roughly 3/7th difference across boundary
+
+ vmov.u8 d5, #9 ; #9
+ vmov.u8 d4, #18 ; #18
+
+ vmov q13, q15
+ vmov q14, q15
+
+ vmlal.s8 q0, d2, d5 ; 63 + Filter2 * 9
+ vmlal.s8 q11, d3, d5
+ vmov.u8 d5, #27 ; #27
+ vmlal.s8 q12, d2, d4 ; 63 + Filter2 * 18
+ vmlal.s8 q13, d3, d4
+ vmlal.s8 q14, d2, d5 ; 63 + Filter2 * 27
+ vmlal.s8 q15, d3, d5
+
+ vqshrn.s16 d0, q0, #7 ; u = clamp((63 + Filter2 * 9)>>7)
+ vqshrn.s16 d1, q11, #7
+ vqshrn.s16 d24, q12, #7 ; u = clamp((63 + Filter2 * 18)>>7)
+ vqshrn.s16 d25, q13, #7
+ vqshrn.s16 d28, q14, #7 ; u = clamp((63 + Filter2 * 27)>>7)
+ vqshrn.s16 d29, q15, #7
+
+ vmov.u8 q1, #0x80 ; 0x80
+
+ vqsub.s8 q11, q9, q0 ; s = clamp(qs2 - u)
+ vqadd.s8 q0, q4, q0 ; s = clamp(ps2 + u)
+ vqsub.s8 q13, q8, q12 ; s = clamp(qs1 - u)
+ vqadd.s8 q12, q5, q12 ; s = clamp(ps1 + u)
+ vqsub.s8 q15, q7, q14 ; s = clamp(qs0 - u)
+ vqadd.s8 q14, q6, q14 ; s = clamp(ps0 + u)
+
+ veor q9, q11, q1 ; *oq2 = s^0x80
+ veor q4, q0, q1 ; *op2 = s^0x80
+ veor q8, q13, q1 ; *oq1 = s^0x80
+ veor q5, q12, q1 ; *op1 = s^0x80
+ veor q7, q15, q1 ; *oq0 = s^0x80
+ veor q6, q14, q1 ; *op0 = s^0x80
+
+ bx lr
+ ENDP ; |vp8_mbloop_filter_neon|
+
+;-----------------
+
+ END
--- /dev/null
+++ b/vp9/common/arm/neon/recon16x16mb_neon.asm
@@ -1,0 +1,131 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+ EXPORT |vp8_recon16x16mb_neon|
+ ARM
+ REQUIRE8
+ PRESERVE8
+
+ AREA ||.text||, CODE, READONLY, ALIGN=2
+
+; r0 unsigned char *pred_ptr,
+; r1 short *diff_ptr,
+; r2 unsigned char *dst_ptr,
+; r3 int ystride,
+; stack unsigned char *udst_ptr,
+; stack unsigned char *vdst_ptr
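+;
+; Reconstruction: the 8-bit prediction is widened to 16 bits, the 16-bit
+; residual from diff_ptr is added, and the sum is saturated back to 8 bits
+; with vqmovun. The 16x16 Y plane is handled in four passes of four rows,
+; then the 8x8 U and V planes in one pass each at half the luma stride.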
+
+|vp8_recon16x16mb_neon| PROC
+ mov r12, #4 ;loop counter for Y loop
+
+recon16x16mb_loop_y
+ vld1.u8 {q12, q13}, [r0]! ;load data from pred_ptr
+ vld1.16 {q8, q9}, [r1]! ;load data from diff_ptr
+ vld1.u8 {q14, q15}, [r0]!
+ vld1.16 {q10, q11}, [r1]!
+
+ vmovl.u8 q0, d24 ;modify Pred data from 8 bits to 16 bits
+ vmovl.u8 q1, d25
+ vmovl.u8 q2, d26
+ vmovl.u8 q3, d27
+ vmovl.u8 q4, d28
+ vmovl.u8 q5, d29
+ vmovl.u8 q6, d30
+ vld1.16 {q12, q13}, [r1]!
+ vmovl.u8 q7, d31
+ vld1.16 {q14, q15}, [r1]!
+
+ pld [r0]
+ pld [r1]
+ pld [r1, #64]
+
+ vadd.s16 q0, q0, q8 ;add Diff data and Pred data together
+ vadd.s16 q1, q1, q9
+ vadd.s16 q2, q2, q10
+ vadd.s16 q3, q3, q11
+ vadd.s16 q4, q4, q12
+ vadd.s16 q5, q5, q13
+ vadd.s16 q6, q6, q14
+ vadd.s16 q7, q7, q15
+
+ vqmovun.s16 d0, q0 ;CLAMP() saturation
+ vqmovun.s16 d1, q1
+ vqmovun.s16 d2, q2
+ vqmovun.s16 d3, q3
+ vqmovun.s16 d4, q4
+ vqmovun.s16 d5, q5
+ vst1.u8 {q0}, [r2], r3 ;store result
+ vqmovun.s16 d6, q6
+ vst1.u8 {q1}, [r2], r3
+ vqmovun.s16 d7, q7
+ vst1.u8 {q2}, [r2], r3
+ subs r12, r12, #1
+
+ moveq r12, #2 ;loop counter for UV loop
+
+ vst1.u8 {q3}, [r2], r3
+ bne recon16x16mb_loop_y
+
+ mov r3, r3, lsr #1 ;uv_stride = ystride>>1
+ ldr r2, [sp] ;load upred_ptr
+
+recon16x16mb_loop_uv
+ vld1.u8 {q12, q13}, [r0]! ;load data from pred_ptr
+ vld1.16 {q8, q9}, [r1]! ;load data from diff_ptr
+ vld1.u8 {q14, q15}, [r0]!
+ vld1.16 {q10, q11}, [r1]!
+
+ vmovl.u8 q0, d24 ;modify Pred data from 8 bits to 16 bits
+ vmovl.u8 q1, d25
+ vmovl.u8 q2, d26
+ vmovl.u8 q3, d27
+ vmovl.u8 q4, d28
+ vmovl.u8 q5, d29
+ vmovl.u8 q6, d30
+ vld1.16 {q12, q13}, [r1]!
+ vmovl.u8 q7, d31
+ vld1.16 {q14, q15}, [r1]!
+
+ vadd.s16 q0, q0, q8 ;add Diff data and Pred data together
+ vadd.s16 q1, q1, q9
+ vadd.s16 q2, q2, q10
+ vadd.s16 q3, q3, q11
+ vadd.s16 q4, q4, q12
+ vadd.s16 q5, q5, q13
+ vadd.s16 q6, q6, q14
+
+ vqmovun.s16 d0, q0 ;CLAMP() saturation
+ vadd.s16 q7, q7, q15
+ vqmovun.s16 d1, q1
+ vqmovun.s16 d2, q2
+ vqmovun.s16 d3, q3
+ vst1.u8 {d0}, [r2], r3 ;store result
+ vqmovun.s16 d4, q4
+ vst1.u8 {d1}, [r2], r3
+ vqmovun.s16 d5, q5
+ vst1.u8 {d2}, [r2], r3
+ vqmovun.s16 d6, q6
+ vst1.u8 {d3}, [r2], r3
+ vqmovun.s16 d7, q7
+ vst1.u8 {d4}, [r2], r3
+ subs r12, r12, #1
+
+ vst1.u8 {d5}, [r2], r3
+ vst1.u8 {d6}, [r2], r3
+ vst1.u8 {d7}, [r2], r3
+
+ ldrne r2, [sp, #4] ;load vpred_ptr
+ bne recon16x16mb_loop_uv
+
+ bx lr
+
+ ENDP
+ END
--- /dev/null
+++ b/vp9/common/arm/neon/recon2b_neon.asm
@@ -1,0 +1,54 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+ EXPORT |vp8_recon2b_neon|
+ ARM
+ REQUIRE8
+ PRESERVE8
+
+ AREA ||.text||, CODE, READONLY, ALIGN=2
+
+; r0 unsigned char *pred_ptr,
+; r1 short *diff_ptr,
+; r2 unsigned char *dst_ptr,
+; r3 int stride
+
+|vp8_recon2b_neon| PROC
+ vld1.u8 {q8, q9}, [r0] ;load data from pred_ptr
+ vld1.16 {q4, q5}, [r1]! ;load data from diff_ptr
+
+ vmovl.u8 q0, d16 ;modify Pred data from 8 bits to 16 bits
+ vld1.16 {q6, q7}, [r1]!
+ vmovl.u8 q1, d17
+ vmovl.u8 q2, d18
+ vmovl.u8 q3, d19
+
+ vadd.s16 q0, q0, q4 ;add Diff data and Pred data together
+ vadd.s16 q1, q1, q5
+ vadd.s16 q2, q2, q6
+ vadd.s16 q3, q3, q7
+
+ vqmovun.s16 d0, q0 ;CLAMP() saturation
+ vqmovun.s16 d1, q1
+ vqmovun.s16 d2, q2
+ vqmovun.s16 d3, q3
+ add r0, r2, r3
+
+ vst1.u8 {d0}, [r2] ;store result
+ vst1.u8 {d1}, [r0], r3
+ add r2, r0, r3
+ vst1.u8 {d2}, [r0]
+ vst1.u8 {d3}, [r2], r3
+
+ bx lr
+
+ ENDP
+ END
--- /dev/null
+++ b/vp9/common/arm/neon/recon4b_neon.asm
@@ -1,0 +1,69 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+ EXPORT |vp8_recon4b_neon|
+ ARM
+ REQUIRE8
+ PRESERVE8
+
+ AREA ||.text||, CODE, READONLY, ALIGN=2
+
+; r0 unsigned char *pred_ptr,
+; r1 short *diff_ptr,
+; r2 unsigned char *dst_ptr,
+; r3 int stride
+
+|vp8_recon4b_neon| PROC
+ vld1.u8 {q12, q13}, [r0]! ;load data from pred_ptr
+ vld1.16 {q8, q9}, [r1]! ;load data from diff_ptr
+ vld1.u8 {q14, q15}, [r0]
+ vld1.16 {q10, q11}, [r1]!
+
+ vmovl.u8 q0, d24 ;modify Pred data from 8 bits to 16 bits
+ vmovl.u8 q1, d25
+ vmovl.u8 q2, d26
+ vmovl.u8 q3, d27
+ vmovl.u8 q4, d28
+ vmovl.u8 q5, d29
+ vmovl.u8 q6, d30
+ vld1.16 {q12, q13}, [r1]!
+ vmovl.u8 q7, d31
+ vld1.16 {q14, q15}, [r1]
+
+ vadd.s16 q0, q0, q8 ;add Diff data and Pred data together
+ vadd.s16 q1, q1, q9
+ vadd.s16 q2, q2, q10
+ vadd.s16 q3, q3, q11
+ vadd.s16 q4, q4, q12
+ vadd.s16 q5, q5, q13
+ vadd.s16 q6, q6, q14
+ vadd.s16 q7, q7, q15
+
+ vqmovun.s16 d0, q0 ;CLAMP() saturation
+ vqmovun.s16 d1, q1
+ vqmovun.s16 d2, q2
+ vqmovun.s16 d3, q3
+ vqmovun.s16 d4, q4
+ vqmovun.s16 d5, q5
+ vqmovun.s16 d6, q6
+ vqmovun.s16 d7, q7
+ add r0, r2, r3
+
+ vst1.u8 {q0}, [r2] ;store result
+ vst1.u8 {q1}, [r0], r3
+ add r2, r0, r3
+ vst1.u8 {q2}, [r0]
+ vst1.u8 {q3}, [r2], r3
+
+ bx lr
+
+ ENDP
+ END
--- /dev/null
+++ b/vp9/common/arm/neon/recon_neon.c
@@ -1,0 +1,29 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "vpx_ports/config.h"
+#include "vp9/common/recon.h"
+#include "vp9/common/blockd.h"
+
+extern void vp8_recon16x16mb_neon(unsigned char *pred_ptr, short *diff_ptr, unsigned char *dst_ptr, int ystride, unsigned char *udst_ptr, unsigned char *vdst_ptr);
+
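+/* Comment added for clarity, based on the NEON routine declared above: it
+ * reconstructs the whole macroblock (the 16x16 luma block plus both 8x8
+ * chroma blocks) in a single call, so this wrapper only forwards the buffer
+ * pointers and the luma stride. */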
+void vp8_recon_mb_neon(const vp8_recon_rtcd_vtable_t *rtcd, MACROBLOCKD *xd) {
+ unsigned char *pred_ptr = &xd->predictor[0];
+ short *diff_ptr = &xd->diff[0];
+ unsigned char *dst_ptr = xd->dst.y_buffer;
+ unsigned char *udst_ptr = xd->dst.u_buffer;
+ unsigned char *vdst_ptr = xd->dst.v_buffer;
+ int ystride = xd->dst.y_stride;
+ /*int uv_stride = xd->dst.uv_stride;*/
+
+ vp8_recon16x16mb_neon(pred_ptr, diff_ptr, dst_ptr, ystride,
+ udst_ptr, vdst_ptr);
+}
--- /dev/null
+++ b/vp9/common/arm/neon/reconb_neon.asm
@@ -1,0 +1,61 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+ EXPORT |vp8_recon_b_neon|
+ ARM
+ REQUIRE8
+ PRESERVE8
+
+ AREA ||.text||, CODE, READONLY, ALIGN=2
+
+; r0 unsigned char *pred_ptr,
+; r1 short *diff_ptr,
+; r2 unsigned char *dst_ptr,
+; r3 int stride
+
+|vp8_recon_b_neon| PROC
+ mov r12, #16
+
+ vld1.u8 {d28}, [r0], r12 ;load 4 data/line from pred_ptr
+ vld1.16 {q10, q11}, [r1]! ;load data from diff_ptr
+ vld1.u8 {d29}, [r0], r12
+ vld1.16 {q11, q12}, [r1]!
+ vld1.u8 {d30}, [r0], r12
+ vld1.16 {q12, q13}, [r1]!
+ vld1.u8 {d31}, [r0], r12
+ vld1.16 {q13}, [r1]
+
+ vmovl.u8 q0, d28 ;modify Pred data from 8 bits to 16 bits
+ vmovl.u8 q1, d29 ;Pred data in d0, d2, d4, d6
+ vmovl.u8 q2, d30
+ vmovl.u8 q3, d31
+
+ vadd.s16 d0, d0, d20 ;add Diff data and Pred data together
+ vadd.s16 d2, d2, d22
+ vadd.s16 d4, d4, d24
+ vadd.s16 d6, d6, d26
+
+ vqmovun.s16 d0, q0 ;CLAMP() saturation
+ vqmovun.s16 d1, q1
+ vqmovun.s16 d2, q2
+ vqmovun.s16 d3, q3
+ add r1, r2, r3
+
+ vst1.32 {d0[0]}, [r2] ;store result
+ vst1.32 {d1[0]}, [r1], r3
+ add r2, r1, r3
+ vst1.32 {d2[0]}, [r1]
+ vst1.32 {d3[0]}, [r2], r3
+
+ bx lr
+
+ ENDP
+ END
--- /dev/null
+++ b/vp9/common/arm/neon/save_neon_reg.asm
@@ -1,0 +1,36 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+ EXPORT |vp9_push_neon|
+ EXPORT |vp9_pop_neon|
+
+ ARM
+ REQUIRE8
+ PRESERVE8
+
+ AREA ||.text||, CODE, READONLY, ALIGN=2
+
+|vp9_push_neon| PROC
+ vst1.i64 {d8, d9, d10, d11}, [r0]!
+ vst1.i64 {d12, d13, d14, d15}, [r0]!
+ bx lr
+
+ ENDP
+
+|vp9_pop_neon| PROC
+ vld1.i64 {d8, d9, d10, d11}, [r0]!
+ vld1.i64 {d12, d13, d14, d15}, [r0]!
+ bx lr
+
+ ENDP
+
+ END
+
--- /dev/null
+++ b/vp9/common/arm/neon/shortidct4x4llm_1_neon.asm
@@ -1,0 +1,67 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+ EXPORT |vp8_short_idct4x4llm_1_neon|
+ EXPORT |vp8_dc_only_idct_neon|
+ ARM
+ REQUIRE8
+ PRESERVE8
+
+ AREA ||.text||, CODE, READONLY, ALIGN=2
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void vp8_short_idct4x4llm_1_c(short *input, short *output, int pitch);
+; r0 short *input;
+; r1 short *output;
+; r2 int pitch;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+|vp8_short_idct4x4llm_1_neon| PROC
+ vld1.16 {d0[]}, [r0] ;load input[0]
+
+ add r3, r1, r2
+ add r12, r3, r2
+
+ vrshr.s16 d0, d0, #3
+
+ add r0, r12, r2
+
+ vst1.16 {d0}, [r1]
+ vst1.16 {d0}, [r3]
+ vst1.16 {d0}, [r12]
+ vst1.16 {d0}, [r0]
+
+ bx lr
+ ENDP
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void vp8_dc_only_idct_c(short input_dc, short *output, int pitch);
+; r0 short input_dc;
+; r1 short *output;
+; r2 int pitch;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+|vp8_dc_only_idct_neon| PROC
+ vdup.16 d0, r0
+
+ add r3, r1, r2
+ add r12, r3, r2
+
+ vrshr.s16 d0, d0, #3
+
+ add r0, r12, r2
+
+ vst1.16 {d0}, [r1]
+ vst1.16 {d0}, [r3]
+ vst1.16 {d0}, [r12]
+ vst1.16 {d0}, [r0]
+
+ bx lr
+
+ ENDP
+ END
--- /dev/null
+++ b/vp9/common/arm/neon/shortidct4x4llm_neon.asm
@@ -1,0 +1,122 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+ EXPORT |vp8_short_idct4x4llm_neon|
+ ARM
+ REQUIRE8
+ PRESERVE8
+
+ AREA ||.text||, CODE, READONLY, ALIGN=2
+
+;*************************************************************
+;void vp8_short_idct4x4llm_c(short *input, short *output, int pitch)
+;r0 short * input
+;r1 short * output
+;r2 int pitch
+;*************************************************************
+;static const int cospi8sqrt2minus1=20091;
+;static const int sinpi8sqrt2 =35468;
+;static const int rounding = 0;
+;Optimization note: The data produced by dequantization are signed 13-bit values in the
+;range [-4096, 4095]. This allows the "vqdmulh" (NEON) instruction to be used, since the
+;doubled product cannot overflow (13+16+1=30 bits < 32 bits). The instruction returns the
+;high half of the multiplication, which is exactly what the IDCT needs.
+
+|vp8_short_idct4x4llm_neon| PROC
+ adr r12, idct_coeff
+ vld1.16 {q1, q2}, [r0]
+ vld1.16 {d0}, [r12]
+
+ vswp d3, d4 ;q2(vp[4] vp[12])
+
+ vqdmulh.s16 q3, q2, d0[2]
+ vqdmulh.s16 q4, q2, d0[0]
+
+ vqadd.s16 d12, d2, d3 ;a1
+ vqsub.s16 d13, d2, d3 ;b1
+
+ vshr.s16 q3, q3, #1
+ vshr.s16 q4, q4, #1
+
+    vqadd.s16 q3, q3, q2        ;modify since sinpi8sqrt2 > 65536/2 (negative number)
+ vqadd.s16 q4, q4, q2
+
+ ;d6 - c1:temp1
+ ;d7 - d1:temp2
+ ;d8 - d1:temp1
+ ;d9 - c1:temp2
+
+ vqsub.s16 d10, d6, d9 ;c1
+ vqadd.s16 d11, d7, d8 ;d1
+
+ vqadd.s16 d2, d12, d11
+ vqadd.s16 d3, d13, d10
+ vqsub.s16 d4, d13, d10
+ vqsub.s16 d5, d12, d11
+
+ vtrn.32 d2, d4
+ vtrn.32 d3, d5
+ vtrn.16 d2, d3
+ vtrn.16 d4, d5
+
+ vswp d3, d4
+
+ vqdmulh.s16 q3, q2, d0[2]
+ vqdmulh.s16 q4, q2, d0[0]
+
+ vqadd.s16 d12, d2, d3 ;a1
+ vqsub.s16 d13, d2, d3 ;b1
+
+ vshr.s16 q3, q3, #1
+ vshr.s16 q4, q4, #1
+
+    vqadd.s16 q3, q3, q2        ;modify since sinpi8sqrt2 > 65536/2 (negative number)
+ vqadd.s16 q4, q4, q2
+
+ vqsub.s16 d10, d6, d9 ;c1
+ vqadd.s16 d11, d7, d8 ;d1
+
+ vqadd.s16 d2, d12, d11
+ vqadd.s16 d3, d13, d10
+ vqsub.s16 d4, d13, d10
+ vqsub.s16 d5, d12, d11
+
+ vrshr.s16 d2, d2, #3
+ vrshr.s16 d3, d3, #3
+ vrshr.s16 d4, d4, #3
+ vrshr.s16 d5, d5, #3
+
+ add r3, r1, r2
+ add r12, r3, r2
+ add r0, r12, r2
+
+ vtrn.32 d2, d4
+ vtrn.32 d3, d5
+ vtrn.16 d2, d3
+ vtrn.16 d4, d5
+
+ vst1.16 {d2}, [r1]
+ vst1.16 {d3}, [r3]
+ vst1.16 {d4}, [r12]
+ vst1.16 {d5}, [r0]
+
+ bx lr
+
+ ENDP
+
+;-----------------
+
+idct_coeff
+ DCD 0x4e7b4e7b, 0x8a8c8a8c
+
+;20091, 20091, 35468, 35468
+
+ END
--- /dev/null
+++ b/vp9/common/arm/neon/sixtappredict16x16_neon.asm
@@ -1,0 +1,490 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+ EXPORT |vp8_sixtap_predict16x16_neon|
+ ARM
+ REQUIRE8
+ PRESERVE8
+
+ AREA ||.text||, CODE, READONLY, ALIGN=2
+
+filter16_coeff
+ DCD 0, 0, 128, 0, 0, 0, 0, 0
+ DCD 0, -6, 123, 12, -1, 0, 0, 0
+ DCD 2, -11, 108, 36, -8, 1, 0, 0
+ DCD 0, -9, 93, 50, -6, 0, 0, 0
+ DCD 3, -16, 77, 77, -16, 3, 0, 0
+ DCD 0, -6, 50, 93, -9, 0, 0, 0
+ DCD 1, -8, 36, 108, -11, 2, 0, 0
+ DCD 0, -1, 12, 123, -6, 0, 0, 0
+
+; r0 unsigned char *src_ptr,
+; r1 int src_pixels_per_line,
+; r2 int xoffset,
+; r3 int yoffset,
+; r4 unsigned char *dst_ptr,
+; stack(r5) int dst_pitch
+
+;Note: To take advantage of the 8-bit multiplication instructions in NEON, first apply abs() to
+; the filter coeffs to make them u8, then use vmlsl for the negative coeffs. After multiplication,
+; the result can be negative, so treat it as s16. But it is also possible for the result to be a
+; large positive number (> 2^15-1), which could be mistaken for a negative number. To avoid that
+; error, apply the filter coeffs in the order 0, 1, 4, 5, 2, which keeps the running sum within
+; the s16 range. Finally, add in the result of the 3rd filter coeff with a saturating add. The
+; same applies to the other filter functions.
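+;
+; Illustrative worst case (an editorial addition, not part of the original note): with the
+; filter {2, -11, 108, 36, -8, 1} and all-255 input, summing the taps in natural order reaches
+; 255 * (2 - 11 + 108 + 36) = 34425 > 32767 and would wrap in s16, whereas the order
+; 0, 1, 4, 5, 2 peaks at 23460; the remaining tap (36) is then folded in with a saturating
+; vqadd: 23460 + 255 * 36 = 32640 <= 32767.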
+
+|vp8_sixtap_predict16x16_neon| PROC
+ push {r4-r5, lr}
+
+ adr r12, filter16_coeff
+ ldr r4, [sp, #12] ;load parameters from stack
+ ldr r5, [sp, #16] ;load parameters from stack
+
+ cmp r2, #0 ;skip first_pass filter if xoffset=0
+ beq secondpass_filter16x16_only
+
+ add r2, r12, r2, lsl #5 ;calculate filter location
+
+ cmp r3, #0 ;skip second_pass filter if yoffset=0
+
+ vld1.s32 {q14, q15}, [r2] ;load first_pass filter
+
+ beq firstpass_filter16x16_only
+
+ sub sp, sp, #336 ;reserve space on stack for temporary storage
+ mov lr, sp
+
+ vabs.s32 q12, q14
+ vabs.s32 q13, q15
+
+ mov r2, #7 ;loop counter
+ sub r0, r0, #2 ;move srcptr back to (line-2) and (column-2)
+ sub r0, r0, r1, lsl #1
+
+ vdup.8 d0, d24[0] ;first_pass filter (d0-d5)
+ vdup.8 d1, d24[4]
+ vdup.8 d2, d25[0]
+ vdup.8 d3, d25[4]
+ vdup.8 d4, d26[0]
+ vdup.8 d5, d26[4]
+
+;First Pass: output_height lines x output_width columns (21x16)
+filt_blk2d_fp16x16_loop_neon
+ vld1.u8 {d6, d7, d8}, [r0], r1 ;load src data
+ vld1.u8 {d9, d10, d11}, [r0], r1
+ vld1.u8 {d12, d13, d14}, [r0], r1
+
+ pld [r0]
+ pld [r0, r1]
+ pld [r0, r1, lsl #1]
+
+ vmull.u8 q8, d6, d0 ;(src_ptr[-2] * vp9_filter[0])
+ vmull.u8 q9, d7, d0
+ vmull.u8 q10, d9, d0
+ vmull.u8 q11, d10, d0
+ vmull.u8 q12, d12, d0
+ vmull.u8 q13, d13, d0
+
+ vext.8 d28, d6, d7, #1 ;construct src_ptr[-1]
+ vext.8 d29, d9, d10, #1
+ vext.8 d30, d12, d13, #1
+
+ vmlsl.u8 q8, d28, d1 ;-(src_ptr[-1] * vp9_filter[1])
+ vmlsl.u8 q10, d29, d1
+ vmlsl.u8 q12, d30, d1
+
+ vext.8 d28, d7, d8, #1
+ vext.8 d29, d10, d11, #1
+ vext.8 d30, d13, d14, #1
+
+ vmlsl.u8 q9, d28, d1 ;-(src_ptr[-1] * vp9_filter[1])
+ vmlsl.u8 q11, d29, d1
+ vmlsl.u8 q13, d30, d1
+
+ vext.8 d28, d6, d7, #4 ;construct src_ptr[2]
+ vext.8 d29, d9, d10, #4
+ vext.8 d30, d12, d13, #4
+
+ vmlsl.u8 q8, d28, d4 ;-(src_ptr[2] * vp9_filter[4])
+ vmlsl.u8 q10, d29, d4
+ vmlsl.u8 q12, d30, d4
+
+ vext.8 d28, d7, d8, #4
+ vext.8 d29, d10, d11, #4
+ vext.8 d30, d13, d14, #4
+
+ vmlsl.u8 q9, d28, d4 ;-(src_ptr[2] * vp9_filter[4])
+ vmlsl.u8 q11, d29, d4
+ vmlsl.u8 q13, d30, d4
+
+ vext.8 d28, d6, d7, #5 ;construct src_ptr[3]
+ vext.8 d29, d9, d10, #5
+ vext.8 d30, d12, d13, #5
+
+ vmlal.u8 q8, d28, d5 ;(src_ptr[3] * vp9_filter[5])
+ vmlal.u8 q10, d29, d5
+ vmlal.u8 q12, d30, d5
+
+ vext.8 d28, d7, d8, #5
+ vext.8 d29, d10, d11, #5
+ vext.8 d30, d13, d14, #5
+
+ vmlal.u8 q9, d28, d5 ;(src_ptr[3] * vp9_filter[5])
+ vmlal.u8 q11, d29, d5
+ vmlal.u8 q13, d30, d5
+
+ vext.8 d28, d6, d7, #2 ;construct src_ptr[0]
+ vext.8 d29, d9, d10, #2
+ vext.8 d30, d12, d13, #2
+
+ vmlal.u8 q8, d28, d2 ;(src_ptr[0] * vp9_filter[2])
+ vmlal.u8 q10, d29, d2
+ vmlal.u8 q12, d30, d2
+
+ vext.8 d28, d7, d8, #2
+ vext.8 d29, d10, d11, #2
+ vext.8 d30, d13, d14, #2
+
+ vmlal.u8 q9, d28, d2 ;(src_ptr[0] * vp9_filter[2])
+ vmlal.u8 q11, d29, d2
+ vmlal.u8 q13, d30, d2
+
+ vext.8 d28, d6, d7, #3 ;construct src_ptr[1]
+ vext.8 d29, d9, d10, #3
+ vext.8 d30, d12, d13, #3
+
+ vext.8 d15, d7, d8, #3
+ vext.8 d31, d10, d11, #3
+ vext.8 d6, d13, d14, #3
+
+ vmull.u8 q4, d28, d3 ;(src_ptr[1] * vp9_filter[3])
+ vmull.u8 q5, d29, d3
+ vmull.u8 q6, d30, d3
+
+ vqadd.s16 q8, q4 ;sum of all (src_data*filter_parameters)
+ vqadd.s16 q10, q5
+ vqadd.s16 q12, q6
+
+ vmull.u8 q6, d15, d3 ;(src_ptr[1] * vp9_filter[3])
+ vmull.u8 q7, d31, d3
+ vmull.u8 q3, d6, d3
+
+ subs r2, r2, #1
+
+ vqadd.s16 q9, q6
+ vqadd.s16 q11, q7
+ vqadd.s16 q13, q3
+
+ vqrshrun.s16 d6, q8, #7 ;shift/round/saturate to u8
+ vqrshrun.s16 d7, q9, #7
+ vqrshrun.s16 d8, q10, #7
+ vqrshrun.s16 d9, q11, #7
+ vqrshrun.s16 d10, q12, #7
+ vqrshrun.s16 d11, q13, #7
+
+ vst1.u8 {d6, d7, d8}, [lr]! ;store result
+ vst1.u8 {d9, d10, d11}, [lr]!
+
+ bne filt_blk2d_fp16x16_loop_neon
+
+;Second pass: 16x16
+;secondpass_filter - do first 8-columns and then second 8-columns
+ add r3, r12, r3, lsl #5
+ sub lr, lr, #336
+
+ vld1.s32 {q5, q6}, [r3] ;load second_pass filter
+ mov r3, #2 ;loop counter
+
+ vabs.s32 q7, q5
+ vabs.s32 q8, q6
+
+ mov r2, #16
+
+ vdup.8 d0, d14[0] ;second_pass filter parameters (d0-d5)
+ vdup.8 d1, d14[4]
+ vdup.8 d2, d15[0]
+ vdup.8 d3, d15[4]
+ vdup.8 d4, d16[0]
+ vdup.8 d5, d16[4]
+
+filt_blk2d_sp16x16_outloop_neon
+ vld1.u8 {d18}, [lr], r2 ;load src data
+ vld1.u8 {d19}, [lr], r2
+ vld1.u8 {d20}, [lr], r2
+ vld1.u8 {d21}, [lr], r2
+ mov r12, #4 ;loop counter
+ vld1.u8 {d22}, [lr], r2
+
+secondpass_inner_loop_neon
+ vld1.u8 {d23}, [lr], r2 ;load src data
+ vld1.u8 {d24}, [lr], r2
+ vld1.u8 {d25}, [lr], r2
+ vld1.u8 {d26}, [lr], r2
+
+ vmull.u8 q3, d18, d0 ;(src_ptr[-2] * vp9_filter[0])
+ vmull.u8 q4, d19, d0
+ vmull.u8 q5, d20, d0
+ vmull.u8 q6, d21, d0
+
+ vmlsl.u8 q3, d19, d1 ;-(src_ptr[-1] * vp9_filter[1])
+ vmlsl.u8 q4, d20, d1
+ vmlsl.u8 q5, d21, d1
+ vmlsl.u8 q6, d22, d1
+
+ vmlsl.u8 q3, d22, d4 ;-(src_ptr[2] * vp9_filter[4])
+ vmlsl.u8 q4, d23, d4
+ vmlsl.u8 q5, d24, d4
+ vmlsl.u8 q6, d25, d4
+
+ vmlal.u8 q3, d20, d2 ;(src_ptr[0] * vp9_filter[2])
+ vmlal.u8 q4, d21, d2
+ vmlal.u8 q5, d22, d2
+ vmlal.u8 q6, d23, d2
+
+ vmlal.u8 q3, d23, d5 ;(src_ptr[3] * vp9_filter[5])
+ vmlal.u8 q4, d24, d5
+ vmlal.u8 q5, d25, d5
+ vmlal.u8 q6, d26, d5
+
+ vmull.u8 q7, d21, d3 ;(src_ptr[1] * vp9_filter[3])
+ vmull.u8 q8, d22, d3
+ vmull.u8 q9, d23, d3
+ vmull.u8 q10, d24, d3
+
+ subs r12, r12, #1
+
+ vqadd.s16 q7, q3 ;sum of all (src_data*filter_parameters)
+ vqadd.s16 q8, q4
+ vqadd.s16 q9, q5
+ vqadd.s16 q10, q6
+
+ vqrshrun.s16 d6, q7, #7 ;shift/round/saturate to u8
+ vqrshrun.s16 d7, q8, #7
+ vqrshrun.s16 d8, q9, #7
+ vqrshrun.s16 d9, q10, #7
+
+ vst1.u8 {d6}, [r4], r5 ;store result
+ vmov q9, q11
+ vst1.u8 {d7}, [r4], r5
+ vmov q10, q12
+ vst1.u8 {d8}, [r4], r5
+ vmov d22, d26
+ vst1.u8 {d9}, [r4], r5
+
+ bne secondpass_inner_loop_neon
+
+ subs r3, r3, #1
+ sub lr, lr, #336
+ add lr, lr, #8
+
+ sub r4, r4, r5, lsl #4
+ add r4, r4, #8
+
+ bne filt_blk2d_sp16x16_outloop_neon
+
+ add sp, sp, #336
+ pop {r4-r5,pc}
+
+;--------------------
+firstpass_filter16x16_only
+ vabs.s32 q12, q14
+ vabs.s32 q13, q15
+
+ mov r2, #8 ;loop counter
+ sub r0, r0, #2 ;move srcptr back to (column-2)
+
+ vdup.8 d0, d24[0] ;first_pass filter (d0-d5)
+ vdup.8 d1, d24[4]
+ vdup.8 d2, d25[0]
+ vdup.8 d3, d25[4]
+ vdup.8 d4, d26[0]
+ vdup.8 d5, d26[4]
+
+;First Pass: output_height lines x output_width columns (16x16)
+filt_blk2d_fpo16x16_loop_neon
+ vld1.u8 {d6, d7, d8}, [r0], r1 ;load src data
+ vld1.u8 {d9, d10, d11}, [r0], r1
+
+ pld [r0]
+ pld [r0, r1]
+
+ vmull.u8 q6, d6, d0 ;(src_ptr[-2] * vp9_filter[0])
+ vmull.u8 q7, d7, d0
+ vmull.u8 q8, d9, d0
+ vmull.u8 q9, d10, d0
+
+ vext.8 d20, d6, d7, #1 ;construct src_ptr[-1]
+ vext.8 d21, d9, d10, #1
+ vext.8 d22, d7, d8, #1
+ vext.8 d23, d10, d11, #1
+ vext.8 d24, d6, d7, #4 ;construct src_ptr[2]
+ vext.8 d25, d9, d10, #4
+ vext.8 d26, d7, d8, #4
+ vext.8 d27, d10, d11, #4
+ vext.8 d28, d6, d7, #5 ;construct src_ptr[3]
+ vext.8 d29, d9, d10, #5
+
+ vmlsl.u8 q6, d20, d1 ;-(src_ptr[-1] * vp9_filter[1])
+ vmlsl.u8 q8, d21, d1
+ vmlsl.u8 q7, d22, d1 ;-(src_ptr[-1] * vp9_filter[1])
+ vmlsl.u8 q9, d23, d1
+ vmlsl.u8 q6, d24, d4 ;-(src_ptr[2] * vp9_filter[4])
+ vmlsl.u8 q8, d25, d4
+ vmlsl.u8 q7, d26, d4 ;-(src_ptr[2] * vp9_filter[4])
+ vmlsl.u8 q9, d27, d4
+ vmlal.u8 q6, d28, d5 ;(src_ptr[3] * vp9_filter[5])
+ vmlal.u8 q8, d29, d5
+
+ vext.8 d20, d7, d8, #5
+ vext.8 d21, d10, d11, #5
+ vext.8 d22, d6, d7, #2 ;construct src_ptr[0]
+ vext.8 d23, d9, d10, #2
+ vext.8 d24, d7, d8, #2
+ vext.8 d25, d10, d11, #2
+
+ vext.8 d26, d6, d7, #3 ;construct src_ptr[1]
+ vext.8 d27, d9, d10, #3
+ vext.8 d28, d7, d8, #3
+ vext.8 d29, d10, d11, #3
+
+ vmlal.u8 q7, d20, d5 ;(src_ptr[3] * vp9_filter[5])
+ vmlal.u8 q9, d21, d5
+ vmlal.u8 q6, d22, d2 ;(src_ptr[0] * vp9_filter[2])
+ vmlal.u8 q8, d23, d2
+ vmlal.u8 q7, d24, d2 ;(src_ptr[0] * vp9_filter[2])
+ vmlal.u8 q9, d25, d2
+
+ vmull.u8 q10, d26, d3 ;(src_ptr[1] * vp9_filter[3])
+ vmull.u8 q11, d27, d3
+ vmull.u8 q12, d28, d3 ;(src_ptr[1] * vp9_filter[3])
+ vmull.u8 q15, d29, d3
+
+ vqadd.s16 q6, q10 ;sum of all (src_data*filter_parameters)
+ vqadd.s16 q8, q11
+ vqadd.s16 q7, q12
+ vqadd.s16 q9, q15
+
+ subs r2, r2, #1
+
+ vqrshrun.s16 d6, q6, #7 ;shift/round/saturate to u8
+ vqrshrun.s16 d7, q7, #7
+ vqrshrun.s16 d8, q8, #7
+ vqrshrun.s16 d9, q9, #7
+
+ vst1.u8 {q3}, [r4], r5 ;store result
+ vst1.u8 {q4}, [r4], r5
+
+ bne filt_blk2d_fpo16x16_loop_neon
+
+ pop {r4-r5,pc}
+
+;--------------------
+secondpass_filter16x16_only
+;Second pass: 16x16
+ add r3, r12, r3, lsl #5
+ sub r0, r0, r1, lsl #1
+
+ vld1.s32 {q5, q6}, [r3] ;load second_pass filter
+ mov r3, #2 ;loop counter
+
+ vabs.s32 q7, q5
+ vabs.s32 q8, q6
+
+ vdup.8 d0, d14[0] ;second_pass filter parameters (d0-d5)
+ vdup.8 d1, d14[4]
+ vdup.8 d2, d15[0]
+ vdup.8 d3, d15[4]
+ vdup.8 d4, d16[0]
+ vdup.8 d5, d16[4]
+
+filt_blk2d_spo16x16_outloop_neon
+ vld1.u8 {d18}, [r0], r1 ;load src data
+ vld1.u8 {d19}, [r0], r1
+ vld1.u8 {d20}, [r0], r1
+ vld1.u8 {d21}, [r0], r1
+ mov r12, #4 ;loop counter
+ vld1.u8 {d22}, [r0], r1
+
+secondpass_only_inner_loop_neon
+ vld1.u8 {d23}, [r0], r1 ;load src data
+ vld1.u8 {d24}, [r0], r1
+ vld1.u8 {d25}, [r0], r1
+ vld1.u8 {d26}, [r0], r1
+
+ vmull.u8 q3, d18, d0 ;(src_ptr[-2] * vp9_filter[0])
+ vmull.u8 q4, d19, d0
+ vmull.u8 q5, d20, d0
+ vmull.u8 q6, d21, d0
+
+ vmlsl.u8 q3, d19, d1 ;-(src_ptr[-1] * vp9_filter[1])
+ vmlsl.u8 q4, d20, d1
+ vmlsl.u8 q5, d21, d1
+ vmlsl.u8 q6, d22, d1
+
+ vmlsl.u8 q3, d22, d4 ;-(src_ptr[2] * vp9_filter[4])
+ vmlsl.u8 q4, d23, d4
+ vmlsl.u8 q5, d24, d4
+ vmlsl.u8 q6, d25, d4
+
+ vmlal.u8 q3, d20, d2 ;(src_ptr[0] * vp9_filter[2])
+ vmlal.u8 q4, d21, d2
+ vmlal.u8 q5, d22, d2
+ vmlal.u8 q6, d23, d2
+
+ vmlal.u8 q3, d23, d5 ;(src_ptr[3] * vp9_filter[5])
+ vmlal.u8 q4, d24, d5
+ vmlal.u8 q5, d25, d5
+ vmlal.u8 q6, d26, d5
+
+ vmull.u8 q7, d21, d3 ;(src_ptr[1] * vp9_filter[3])
+ vmull.u8 q8, d22, d3
+ vmull.u8 q9, d23, d3
+ vmull.u8 q10, d24, d3
+
+ subs r12, r12, #1
+
+ vqadd.s16 q7, q3 ;sum of all (src_data*filter_parameters)
+ vqadd.s16 q8, q4
+ vqadd.s16 q9, q5
+ vqadd.s16 q10, q6
+
+ vqrshrun.s16 d6, q7, #7 ;shift/round/saturate to u8
+ vqrshrun.s16 d7, q8, #7
+ vqrshrun.s16 d8, q9, #7
+ vqrshrun.s16 d9, q10, #7
+
+ vst1.u8 {d6}, [r4], r5 ;store result
+ vmov q9, q11
+ vst1.u8 {d7}, [r4], r5
+ vmov q10, q12
+ vst1.u8 {d8}, [r4], r5
+ vmov d22, d26
+ vst1.u8 {d9}, [r4], r5
+
+ bne secondpass_only_inner_loop_neon
+
+ subs r3, r3, #1
+ sub r0, r0, r1, lsl #4
+ sub r0, r0, r1, lsl #2
+ sub r0, r0, r1
+ add r0, r0, #8
+
+ sub r4, r4, r5, lsl #4
+ add r4, r4, #8
+
+ bne filt_blk2d_spo16x16_outloop_neon
+
+ pop {r4-r5,pc}
+
+ ENDP
+
+;-----------------
+ END
--- /dev/null
+++ b/vp9/common/arm/neon/sixtappredict4x4_neon.asm
@@ -1,0 +1,422 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+ EXPORT |vp8_sixtap_predict_neon|
+ ARM
+ REQUIRE8
+ PRESERVE8
+
+ AREA ||.text||, CODE, READONLY, ALIGN=2
+
+filter4_coeff
+ DCD 0, 0, 128, 0, 0, 0, 0, 0
+ DCD 0, -6, 123, 12, -1, 0, 0, 0
+ DCD 2, -11, 108, 36, -8, 1, 0, 0
+ DCD 0, -9, 93, 50, -6, 0, 0, 0
+ DCD 3, -16, 77, 77, -16, 3, 0, 0
+ DCD 0, -6, 50, 93, -9, 0, 0, 0
+ DCD 1, -8, 36, 108, -11, 2, 0, 0
+ DCD 0, -1, 12, 123, -6, 0, 0, 0
+
+; r0 unsigned char *src_ptr,
+; r1 int src_pixels_per_line,
+; r2 int xoffset,
+; r3 int yoffset,
+; stack(r4) unsigned char *dst_ptr,
+; stack(lr) int dst_pitch
+
+|vp8_sixtap_predict_neon| PROC
+ push {r4, lr}
+
+ adr r12, filter4_coeff
+ ldr r4, [sp, #8] ;load parameters from stack
+ ldr lr, [sp, #12] ;load parameters from stack
+
+ cmp r2, #0 ;skip first_pass filter if xoffset=0
+ beq secondpass_filter4x4_only
+
+ add r2, r12, r2, lsl #5 ;calculate filter location
+
+ cmp r3, #0 ;skip second_pass filter if yoffset=0
+ vld1.s32 {q14, q15}, [r2] ;load first_pass filter
+
+ beq firstpass_filter4x4_only
+
+    vabs.s32  q12, q14          ;get abs(filter_parameters)
+ vabs.s32 q13, q15
+
+ sub r0, r0, #2 ;go back 2 columns of src data
+ sub r0, r0, r1, lsl #1 ;go back 2 lines of src data
+
+;First pass: output_height lines x output_width columns (9x4)
+ vld1.u8 {q3}, [r0], r1 ;load first 4-line src data
+ vdup.8 d0, d24[0] ;first_pass filter (d0-d5)
+ vld1.u8 {q4}, [r0], r1
+ vdup.8 d1, d24[4]
+ vld1.u8 {q5}, [r0], r1
+ vdup.8 d2, d25[0]
+ vld1.u8 {q6}, [r0], r1
+ vdup.8 d3, d25[4]
+ vdup.8 d4, d26[0]
+ vdup.8 d5, d26[4]
+
+ pld [r0]
+ pld [r0, r1]
+ pld [r0, r1, lsl #1]
+
+ vext.8 d18, d6, d7, #5 ;construct src_ptr[3]
+ vext.8 d19, d8, d9, #5
+ vext.8 d20, d10, d11, #5
+ vext.8 d21, d12, d13, #5
+
+ vswp d7, d8 ;discard 2nd half data after src_ptr[3] is done
+ vswp d11, d12
+
+ vzip.32 d18, d19 ;put 2-line data in 1 register (src_ptr[3])
+ vzip.32 d20, d21
+ vmull.u8 q7, d18, d5 ;(src_ptr[3] * vp9_filter[5])
+ vmull.u8 q8, d20, d5
+
+ vmov q4, q3 ;keep original src data in q4 q6
+ vmov q6, q5
+
+ vzip.32 d6, d7 ;construct src_ptr[-2], and put 2-line data together
+ vzip.32 d10, d11
+ vshr.u64 q9, q4, #8 ;construct src_ptr[-1]
+ vshr.u64 q10, q6, #8
+ vmlal.u8 q7, d6, d0 ;+(src_ptr[-2] * vp9_filter[0])
+ vmlal.u8 q8, d10, d0
+
+ vzip.32 d18, d19 ;put 2-line data in 1 register (src_ptr[-1])
+ vzip.32 d20, d21
+ vshr.u64 q3, q4, #32 ;construct src_ptr[2]
+ vshr.u64 q5, q6, #32
+ vmlsl.u8 q7, d18, d1 ;-(src_ptr[-1] * vp9_filter[1])
+ vmlsl.u8 q8, d20, d1
+
+ vzip.32 d6, d7 ;put 2-line data in 1 register (src_ptr[2])
+ vzip.32 d10, d11
+ vshr.u64 q9, q4, #16 ;construct src_ptr[0]
+ vshr.u64 q10, q6, #16
+ vmlsl.u8 q7, d6, d4 ;-(src_ptr[2] * vp9_filter[4])
+ vmlsl.u8 q8, d10, d4
+
+ vzip.32 d18, d19 ;put 2-line data in 1 register (src_ptr[0])
+ vzip.32 d20, d21
+ vshr.u64 q3, q4, #24 ;construct src_ptr[1]
+ vshr.u64 q5, q6, #24
+ vmlal.u8 q7, d18, d2 ;(src_ptr[0] * vp9_filter[2])
+ vmlal.u8 q8, d20, d2
+
+ vzip.32 d6, d7 ;put 2-line data in 1 register (src_ptr[1])
+ vzip.32 d10, d11
+ vmull.u8 q9, d6, d3 ;(src_ptr[1] * vp9_filter[3])
+ vmull.u8 q10, d10, d3
+
+    vld1.u8 {q3}, [r0], r1      ;load the remaining 5 lines of src data
+ vld1.u8 {q4}, [r0], r1
+
+ vqadd.s16 q7, q9 ;sum of all (src_data*filter_parameters)
+ vqadd.s16 q8, q10
+
+ vld1.u8 {q5}, [r0], r1
+ vld1.u8 {q6}, [r0], r1
+
+ vqrshrun.s16 d27, q7, #7 ;shift/round/saturate to u8
+ vqrshrun.s16 d28, q8, #7
+
+    ;First pass on the remaining 5 lines of data
+ vld1.u8 {q11}, [r0], r1
+
+ vext.8 d18, d6, d7, #5 ;construct src_ptr[3]
+ vext.8 d19, d8, d9, #5
+ vext.8 d20, d10, d11, #5
+ vext.8 d21, d12, d13, #5
+
+ vswp d7, d8 ;discard 2nd half data after src_ptr[3] is done
+ vswp d11, d12
+
+ vzip.32 d18, d19 ;put 2-line data in 1 register (src_ptr[3])
+ vzip.32 d20, d21
+ vext.8 d31, d22, d23, #5 ;construct src_ptr[3]
+ vmull.u8 q7, d18, d5 ;(src_ptr[3] * vp9_filter[5])
+ vmull.u8 q8, d20, d5
+ vmull.u8 q12, d31, d5 ;(src_ptr[3] * vp9_filter[5])
+
+ vmov q4, q3 ;keep original src data in q4 q6
+ vmov q6, q5
+
+ vzip.32 d6, d7 ;construct src_ptr[-2], and put 2-line data together
+ vzip.32 d10, d11
+ vshr.u64 q9, q4, #8 ;construct src_ptr[-1]
+ vshr.u64 q10, q6, #8
+
+ vmlal.u8 q7, d6, d0 ;+(src_ptr[-2] * vp9_filter[0])
+ vmlal.u8 q8, d10, d0
+ vmlal.u8 q12, d22, d0 ;(src_ptr[-2] * vp9_filter[0])
+
+ vzip.32 d18, d19 ;put 2-line data in 1 register (src_ptr[-1])
+ vzip.32 d20, d21
+ vshr.u64 q3, q4, #32 ;construct src_ptr[2]
+ vshr.u64 q5, q6, #32
+ vext.8 d31, d22, d23, #1 ;construct src_ptr[-1]
+
+ vmlsl.u8 q7, d18, d1 ;-(src_ptr[-1] * vp9_filter[1])
+ vmlsl.u8 q8, d20, d1
+ vmlsl.u8 q12, d31, d1 ;-(src_ptr[-1] * vp9_filter[1])
+
+ vzip.32 d6, d7 ;put 2-line data in 1 register (src_ptr[2])
+ vzip.32 d10, d11
+ vshr.u64 q9, q4, #16 ;construct src_ptr[0]
+ vshr.u64 q10, q6, #16
+ vext.8 d31, d22, d23, #4 ;construct src_ptr[2]
+
+ vmlsl.u8 q7, d6, d4 ;-(src_ptr[2] * vp9_filter[4])
+ vmlsl.u8 q8, d10, d4
+ vmlsl.u8 q12, d31, d4 ;-(src_ptr[2] * vp9_filter[4])
+
+ vzip.32 d18, d19 ;put 2-line data in 1 register (src_ptr[0])
+ vzip.32 d20, d21
+ vshr.u64 q3, q4, #24 ;construct src_ptr[1]
+ vshr.u64 q5, q6, #24
+ vext.8 d31, d22, d23, #2 ;construct src_ptr[0]
+
+ vmlal.u8 q7, d18, d2 ;(src_ptr[0] * vp9_filter[2])
+ vmlal.u8 q8, d20, d2
+ vmlal.u8 q12, d31, d2 ;(src_ptr[0] * vp9_filter[2])
+
+ vzip.32 d6, d7 ;put 2-line data in 1 register (src_ptr[1])
+ vzip.32 d10, d11
+ vext.8 d31, d22, d23, #3 ;construct src_ptr[1]
+ vmull.u8 q9, d6, d3 ;(src_ptr[1] * vp9_filter[3])
+ vmull.u8 q10, d10, d3
+ vmull.u8 q11, d31, d3 ;(src_ptr[1] * vp9_filter[3])
+
+ add r3, r12, r3, lsl #5
+
+ vqadd.s16 q7, q9 ;sum of all (src_data*filter_parameters)
+ vqadd.s16 q8, q10
+ vqadd.s16 q12, q11
+
+ vext.8 d23, d27, d28, #4
+ vld1.s32 {q5, q6}, [r3] ;load second_pass filter
+
+ vqrshrun.s16 d29, q7, #7 ;shift/round/saturate to u8
+ vqrshrun.s16 d30, q8, #7
+ vqrshrun.s16 d31, q12, #7
+
+;Second pass: 4x4
+ vabs.s32 q7, q5
+ vabs.s32 q8, q6
+
+ vext.8 d24, d28, d29, #4
+ vext.8 d25, d29, d30, #4
+ vext.8 d26, d30, d31, #4
+
+ vdup.8 d0, d14[0] ;second_pass filter parameters (d0-d5)
+ vdup.8 d1, d14[4]
+ vdup.8 d2, d15[0]
+ vdup.8 d3, d15[4]
+ vdup.8 d4, d16[0]
+ vdup.8 d5, d16[4]
+
+ vmull.u8 q3, d27, d0 ;(src_ptr[-2] * vp9_filter[0])
+ vmull.u8 q4, d28, d0
+
+ vmull.u8 q5, d25, d5 ;(src_ptr[3] * vp9_filter[5])
+ vmull.u8 q6, d26, d5
+
+ vmlsl.u8 q3, d29, d4 ;-(src_ptr[2] * vp9_filter[4])
+ vmlsl.u8 q4, d30, d4
+
+ vmlsl.u8 q5, d23, d1 ;-(src_ptr[-1] * vp9_filter[1])
+ vmlsl.u8 q6, d24, d1
+
+ vmlal.u8 q3, d28, d2 ;(src_ptr[0] * vp9_filter[2])
+ vmlal.u8 q4, d29, d2
+
+ vmlal.u8 q5, d24, d3 ;(src_ptr[1] * vp9_filter[3])
+ vmlal.u8 q6, d25, d3
+
+ add r0, r4, lr
+ add r1, r0, lr
+ add r2, r1, lr
+
+ vqadd.s16 q5, q3 ;sum of all (src_data*filter_parameters)
+ vqadd.s16 q6, q4
+
+ vqrshrun.s16 d3, q5, #7 ;shift/round/saturate to u8
+ vqrshrun.s16 d4, q6, #7
+
+ vst1.32 {d3[0]}, [r4] ;store result
+ vst1.32 {d3[1]}, [r0]
+ vst1.32 {d4[0]}, [r1]
+ vst1.32 {d4[1]}, [r2]
+
+ pop {r4, pc}
+
+
+;---------------------
+firstpass_filter4x4_only
+    vabs.s32  q12, q14          ;get abs(filter_parameters)
+ vabs.s32 q13, q15
+
+ sub r0, r0, #2 ;go back 2 columns of src data
+
+;First pass: output_height lines x output_width columns (4x4)
+ vld1.u8 {q3}, [r0], r1 ;load first 4-line src data
+ vdup.8 d0, d24[0] ;first_pass filter (d0-d5)
+ vld1.u8 {q4}, [r0], r1
+ vdup.8 d1, d24[4]
+ vld1.u8 {q5}, [r0], r1
+ vdup.8 d2, d25[0]
+ vld1.u8 {q6}, [r0], r1
+
+ vdup.8 d3, d25[4]
+ vdup.8 d4, d26[0]
+ vdup.8 d5, d26[4]
+
+ vext.8 d18, d6, d7, #5 ;construct src_ptr[3]
+ vext.8 d19, d8, d9, #5
+ vext.8 d20, d10, d11, #5
+ vext.8 d21, d12, d13, #5
+
+ vswp d7, d8 ;discard 2nd half data after src_ptr[3] is done
+ vswp d11, d12
+
+ vzip.32 d18, d19 ;put 2-line data in 1 register (src_ptr[3])
+ vzip.32 d20, d21
+ vmull.u8 q7, d18, d5 ;(src_ptr[3] * vp9_filter[5])
+ vmull.u8 q8, d20, d5
+
+ vmov q4, q3 ;keep original src data in q4 q6
+ vmov q6, q5
+
+ vzip.32 d6, d7 ;construct src_ptr[-2], and put 2-line data together
+ vzip.32 d10, d11
+ vshr.u64 q9, q4, #8 ;construct src_ptr[-1]
+ vshr.u64 q10, q6, #8
+ vmlal.u8 q7, d6, d0 ;+(src_ptr[-2] * vp9_filter[0])
+ vmlal.u8 q8, d10, d0
+
+ vzip.32 d18, d19 ;put 2-line data in 1 register (src_ptr[-1])
+ vzip.32 d20, d21
+ vshr.u64 q3, q4, #32 ;construct src_ptr[2]
+ vshr.u64 q5, q6, #32
+ vmlsl.u8 q7, d18, d1 ;-(src_ptr[-1] * vp9_filter[1])
+ vmlsl.u8 q8, d20, d1
+
+ vzip.32 d6, d7 ;put 2-line data in 1 register (src_ptr[2])
+ vzip.32 d10, d11
+ vshr.u64 q9, q4, #16 ;construct src_ptr[0]
+ vshr.u64 q10, q6, #16
+ vmlsl.u8 q7, d6, d4 ;-(src_ptr[2] * vp9_filter[4])
+ vmlsl.u8 q8, d10, d4
+
+ vzip.32 d18, d19 ;put 2-line data in 1 register (src_ptr[0])
+ vzip.32 d20, d21
+ vshr.u64 q3, q4, #24 ;construct src_ptr[1]
+ vshr.u64 q5, q6, #24
+ vmlal.u8 q7, d18, d2 ;(src_ptr[0] * vp9_filter[2])
+ vmlal.u8 q8, d20, d2
+
+ vzip.32 d6, d7 ;put 2-line data in 1 register (src_ptr[1])
+ vzip.32 d10, d11
+ vmull.u8 q9, d6, d3 ;(src_ptr[1] * vp9_filter[3])
+ vmull.u8 q10, d10, d3
+
+ add r0, r4, lr
+ add r1, r0, lr
+ add r2, r1, lr
+
+ vqadd.s16 q7, q9 ;sum of all (src_data*filter_parameters)
+ vqadd.s16 q8, q10
+
+ vqrshrun.s16 d27, q7, #7 ;shift/round/saturate to u8
+ vqrshrun.s16 d28, q8, #7
+
+ vst1.32 {d27[0]}, [r4] ;store result
+ vst1.32 {d27[1]}, [r0]
+ vst1.32 {d28[0]}, [r1]
+ vst1.32 {d28[1]}, [r2]
+
+ pop {r4, pc}
+
+
+;---------------------
+secondpass_filter4x4_only
+ sub r0, r0, r1, lsl #1
+ add r3, r12, r3, lsl #5
+
+ vld1.32 {d27[0]}, [r0], r1 ;load src data
+ vld1.s32 {q5, q6}, [r3] ;load second_pass filter
+ vld1.32 {d27[1]}, [r0], r1
+ vabs.s32 q7, q5
+ vld1.32 {d28[0]}, [r0], r1
+ vabs.s32 q8, q6
+ vld1.32 {d28[1]}, [r0], r1
+ vdup.8 d0, d14[0] ;second_pass filter parameters (d0-d5)
+ vld1.32 {d29[0]}, [r0], r1
+ vdup.8 d1, d14[4]
+ vld1.32 {d29[1]}, [r0], r1
+ vdup.8 d2, d15[0]
+ vld1.32 {d30[0]}, [r0], r1
+ vdup.8 d3, d15[4]
+ vld1.32 {d30[1]}, [r0], r1
+ vdup.8 d4, d16[0]
+ vld1.32 {d31[0]}, [r0], r1
+ vdup.8 d5, d16[4]
+
+ vext.8 d23, d27, d28, #4
+ vext.8 d24, d28, d29, #4
+ vext.8 d25, d29, d30, #4
+ vext.8 d26, d30, d31, #4
+
+ vmull.u8 q3, d27, d0 ;(src_ptr[-2] * vp9_filter[0])
+ vmull.u8 q4, d28, d0
+
+ vmull.u8 q5, d25, d5 ;(src_ptr[3] * vp9_filter[5])
+ vmull.u8 q6, d26, d5
+
+ vmlsl.u8 q3, d29, d4 ;-(src_ptr[2] * vp9_filter[4])
+ vmlsl.u8 q4, d30, d4
+
+ vmlsl.u8 q5, d23, d1 ;-(src_ptr[-1] * vp9_filter[1])
+ vmlsl.u8 q6, d24, d1
+
+ vmlal.u8 q3, d28, d2 ;(src_ptr[0] * vp9_filter[2])
+ vmlal.u8 q4, d29, d2
+
+ vmlal.u8 q5, d24, d3 ;(src_ptr[1] * vp9_filter[3])
+ vmlal.u8 q6, d25, d3
+
+ add r0, r4, lr
+ add r1, r0, lr
+ add r2, r1, lr
+
+ vqadd.s16 q5, q3 ;sum of all (src_data*filter_parameters)
+ vqadd.s16 q6, q4
+
+ vqrshrun.s16 d3, q5, #7 ;shift/round/saturate to u8
+ vqrshrun.s16 d4, q6, #7
+
+ vst1.32 {d3[0]}, [r4] ;store result
+ vst1.32 {d3[1]}, [r0]
+ vst1.32 {d4[0]}, [r1]
+ vst1.32 {d4[1]}, [r2]
+
+ pop {r4, pc}
+
+ ENDP
+
+;-----------------
+
+ END
--- /dev/null
+++ b/vp9/common/arm/neon/sixtappredict8x4_neon.asm
@@ -1,0 +1,473 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+ EXPORT |vp8_sixtap_predict8x4_neon|
+ ARM
+ REQUIRE8
+ PRESERVE8
+
+ AREA ||.text||, CODE, READONLY, ALIGN=2
+
+filter8_coeff
+ DCD 0, 0, 128, 0, 0, 0, 0, 0
+ DCD 0, -6, 123, 12, -1, 0, 0, 0
+ DCD 2, -11, 108, 36, -8, 1, 0, 0
+ DCD 0, -9, 93, 50, -6, 0, 0, 0
+ DCD 3, -16, 77, 77, -16, 3, 0, 0
+ DCD 0, -6, 50, 93, -9, 0, 0, 0
+ DCD 1, -8, 36, 108, -11, 2, 0, 0
+ DCD 0, -1, 12, 123, -6, 0, 0, 0
+
+; r0 unsigned char *src_ptr,
+; r1 int src_pixels_per_line,
+; r2 int xoffset,
+; r3 int yoffset,
+; r4 unsigned char *dst_ptr,
+; stack(r5) int dst_pitch
+
+|vp8_sixtap_predict8x4_neon| PROC
+ push {r4-r5, lr}
+
+ adr r12, filter8_coeff
+ ldr r4, [sp, #12] ;load parameters from stack
+ ldr r5, [sp, #16] ;load parameters from stack
+
+ cmp r2, #0 ;skip first_pass filter if xoffset=0
+ beq secondpass_filter8x4_only
+
+ add r2, r12, r2, lsl #5 ;calculate filter location
+
+ cmp r3, #0 ;skip second_pass filter if yoffset=0
+
+ vld1.s32 {q14, q15}, [r2] ;load first_pass filter
+
+ beq firstpass_filter8x4_only
+
+ sub sp, sp, #32 ;reserve space on stack for temporary storage
+ vabs.s32 q12, q14
+ vabs.s32 q13, q15
+
+ sub r0, r0, #2 ;move srcptr back to (line-2) and (column-2)
+ mov lr, sp
+ sub r0, r0, r1, lsl #1
+
+ vdup.8 d0, d24[0] ;first_pass filter (d0-d5)
+ vdup.8 d1, d24[4]
+ vdup.8 d2, d25[0]
+
+;First pass: output_height lines x output_width columns (9x8)
+ vld1.u8 {q3}, [r0], r1 ;load src data
+ vdup.8 d3, d25[4]
+ vld1.u8 {q4}, [r0], r1
+ vdup.8 d4, d26[0]
+ vld1.u8 {q5}, [r0], r1
+ vdup.8 d5, d26[4]
+ vld1.u8 {q6}, [r0], r1
+
+ pld [r0]
+ pld [r0, r1]
+ pld [r0, r1, lsl #1]
+
+ vmull.u8 q7, d6, d0 ;(src_ptr[-2] * vp9_filter[0])
+ vmull.u8 q8, d8, d0
+ vmull.u8 q9, d10, d0
+ vmull.u8 q10, d12, d0
+
+ vext.8 d28, d6, d7, #1 ;construct src_ptr[-1]
+ vext.8 d29, d8, d9, #1
+ vext.8 d30, d10, d11, #1
+ vext.8 d31, d12, d13, #1
+
+ vmlsl.u8 q7, d28, d1 ;-(src_ptr[-1] * vp9_filter[1])
+ vmlsl.u8 q8, d29, d1
+ vmlsl.u8 q9, d30, d1
+ vmlsl.u8 q10, d31, d1
+
+ vext.8 d28, d6, d7, #4 ;construct src_ptr[2]
+ vext.8 d29, d8, d9, #4
+ vext.8 d30, d10, d11, #4
+ vext.8 d31, d12, d13, #4
+
+ vmlsl.u8 q7, d28, d4 ;-(src_ptr[2] * vp9_filter[4])
+ vmlsl.u8 q8, d29, d4
+ vmlsl.u8 q9, d30, d4
+ vmlsl.u8 q10, d31, d4
+
+ vext.8 d28, d6, d7, #2 ;construct src_ptr[0]
+ vext.8 d29, d8, d9, #2
+ vext.8 d30, d10, d11, #2
+ vext.8 d31, d12, d13, #2
+
+ vmlal.u8 q7, d28, d2 ;(src_ptr[0] * vp9_filter[2])
+ vmlal.u8 q8, d29, d2
+ vmlal.u8 q9, d30, d2
+ vmlal.u8 q10, d31, d2
+
+ vext.8 d28, d6, d7, #5 ;construct src_ptr[3]
+ vext.8 d29, d8, d9, #5
+ vext.8 d30, d10, d11, #5
+ vext.8 d31, d12, d13, #5
+
+ vmlal.u8 q7, d28, d5 ;(src_ptr[3] * vp9_filter[5])
+ vmlal.u8 q8, d29, d5
+ vmlal.u8 q9, d30, d5
+ vmlal.u8 q10, d31, d5
+
+ vext.8 d28, d6, d7, #3 ;construct src_ptr[1]
+ vext.8 d29, d8, d9, #3
+ vext.8 d30, d10, d11, #3
+ vext.8 d31, d12, d13, #3
+
+ vmull.u8 q3, d28, d3 ;(src_ptr[1] * vp9_filter[3])
+ vmull.u8 q4, d29, d3
+ vmull.u8 q5, d30, d3
+ vmull.u8 q6, d31, d3
+
+ vqadd.s16 q7, q3 ;sum of all (src_data*filter_parameters)
+ vqadd.s16 q8, q4
+ vqadd.s16 q9, q5
+ vqadd.s16 q10, q6
+
+ vld1.u8 {q3}, [r0], r1 ;load src data
+
+ vqrshrun.s16 d22, q7, #7 ;shift/round/saturate to u8
+ vqrshrun.s16 d23, q8, #7
+ vqrshrun.s16 d24, q9, #7
+ vqrshrun.s16 d25, q10, #7
+
+ vld1.u8 {q4}, [r0], r1
+ vst1.u8 {d22}, [lr]! ;store result
+ vld1.u8 {q5}, [r0], r1
+ vst1.u8 {d23}, [lr]!
+ vld1.u8 {q6}, [r0], r1
+ vst1.u8 {d24}, [lr]!
+ vld1.u8 {q7}, [r0], r1
+ vst1.u8 {d25}, [lr]!
+
+    ;first_pass filtering on the remaining 5 lines of data
+ vmull.u8 q8, d6, d0 ;(src_ptr[-2] * vp9_filter[0])
+ vmull.u8 q9, d8, d0
+ vmull.u8 q10, d10, d0
+ vmull.u8 q11, d12, d0
+ vmull.u8 q12, d14, d0
+
+ vext.8 d27, d6, d7, #1 ;construct src_ptr[-1]
+ vext.8 d28, d8, d9, #1
+ vext.8 d29, d10, d11, #1
+ vext.8 d30, d12, d13, #1
+ vext.8 d31, d14, d15, #1
+
+ vmlsl.u8 q8, d27, d1 ;-(src_ptr[-1] * vp9_filter[1])
+ vmlsl.u8 q9, d28, d1
+ vmlsl.u8 q10, d29, d1
+ vmlsl.u8 q11, d30, d1
+ vmlsl.u8 q12, d31, d1
+
+ vext.8 d27, d6, d7, #4 ;construct src_ptr[2]
+ vext.8 d28, d8, d9, #4
+ vext.8 d29, d10, d11, #4
+ vext.8 d30, d12, d13, #4
+ vext.8 d31, d14, d15, #4
+
+ vmlsl.u8 q8, d27, d4 ;-(src_ptr[2] * vp9_filter[4])
+ vmlsl.u8 q9, d28, d4
+ vmlsl.u8 q10, d29, d4
+ vmlsl.u8 q11, d30, d4
+ vmlsl.u8 q12, d31, d4
+
+ vext.8 d27, d6, d7, #2 ;construct src_ptr[0]
+ vext.8 d28, d8, d9, #2
+ vext.8 d29, d10, d11, #2
+ vext.8 d30, d12, d13, #2
+ vext.8 d31, d14, d15, #2
+
+ vmlal.u8 q8, d27, d2 ;(src_ptr[0] * vp9_filter[2])
+ vmlal.u8 q9, d28, d2
+ vmlal.u8 q10, d29, d2
+ vmlal.u8 q11, d30, d2
+ vmlal.u8 q12, d31, d2
+
+ vext.8 d27, d6, d7, #5 ;construct src_ptr[3]
+ vext.8 d28, d8, d9, #5
+ vext.8 d29, d10, d11, #5
+ vext.8 d30, d12, d13, #5
+ vext.8 d31, d14, d15, #5
+
+ vmlal.u8 q8, d27, d5 ;(src_ptr[3] * vp9_filter[5])
+ vmlal.u8 q9, d28, d5
+ vmlal.u8 q10, d29, d5
+ vmlal.u8 q11, d30, d5
+ vmlal.u8 q12, d31, d5
+
+ vext.8 d27, d6, d7, #3 ;construct src_ptr[1]
+ vext.8 d28, d8, d9, #3
+ vext.8 d29, d10, d11, #3
+ vext.8 d30, d12, d13, #3
+ vext.8 d31, d14, d15, #3
+
+ vmull.u8 q3, d27, d3 ;(src_ptr[1] * vp9_filter[3])
+ vmull.u8 q4, d28, d3
+ vmull.u8 q5, d29, d3
+ vmull.u8 q6, d30, d3
+ vmull.u8 q7, d31, d3
+
+ vqadd.s16 q8, q3 ;sum of all (src_data*filter_parameters)
+ vqadd.s16 q9, q4
+ vqadd.s16 q10, q5
+ vqadd.s16 q11, q6
+ vqadd.s16 q12, q7
+
+ vqrshrun.s16 d26, q8, #7 ;shift/round/saturate to u8
+ vqrshrun.s16 d27, q9, #7
+ vqrshrun.s16 d28, q10, #7
+    vqrshrun.s16 d29, q11, #7
+ vqrshrun.s16 d30, q12, #7
+
+;Second pass: 8x4
+;secondpass_filter
+ add r3, r12, r3, lsl #5
+ sub lr, lr, #32
+
+ vld1.s32 {q5, q6}, [r3] ;load second_pass filter
+ vld1.u8 {q11}, [lr]!
+
+ vabs.s32 q7, q5
+ vabs.s32 q8, q6
+
+ vld1.u8 {q12}, [lr]!
+
+ vdup.8 d0, d14[0] ;second_pass filter parameters (d0-d5)
+ vdup.8 d1, d14[4]
+ vdup.8 d2, d15[0]
+ vdup.8 d3, d15[4]
+ vdup.8 d4, d16[0]
+ vdup.8 d5, d16[4]
+
+ vmull.u8 q3, d22, d0 ;(src_ptr[-2] * vp9_filter[0])
+ vmull.u8 q4, d23, d0
+ vmull.u8 q5, d24, d0
+ vmull.u8 q6, d25, d0
+
+ vmlsl.u8 q3, d23, d1 ;-(src_ptr[-1] * vp9_filter[1])
+ vmlsl.u8 q4, d24, d1
+ vmlsl.u8 q5, d25, d1
+ vmlsl.u8 q6, d26, d1
+
+ vmlsl.u8 q3, d26, d4 ;-(src_ptr[2] * vp9_filter[4])
+ vmlsl.u8 q4, d27, d4
+ vmlsl.u8 q5, d28, d4
+ vmlsl.u8 q6, d29, d4
+
+ vmlal.u8 q3, d24, d2 ;(src_ptr[0] * vp9_filter[2])
+ vmlal.u8 q4, d25, d2
+ vmlal.u8 q5, d26, d2
+ vmlal.u8 q6, d27, d2
+
+ vmlal.u8 q3, d27, d5 ;(src_ptr[3] * vp9_filter[5])
+ vmlal.u8 q4, d28, d5
+ vmlal.u8 q5, d29, d5
+ vmlal.u8 q6, d30, d5
+
+ vmull.u8 q7, d25, d3 ;(src_ptr[1] * vp9_filter[3])
+ vmull.u8 q8, d26, d3
+ vmull.u8 q9, d27, d3
+ vmull.u8 q10, d28, d3
+
+ vqadd.s16 q7, q3 ;sum of all (src_data*filter_parameters)
+ vqadd.s16 q8, q4
+ vqadd.s16 q9, q5
+ vqadd.s16 q10, q6
+
+ vqrshrun.s16 d6, q7, #7 ;shift/round/saturate to u8
+ vqrshrun.s16 d7, q8, #7
+ vqrshrun.s16 d8, q9, #7
+ vqrshrun.s16 d9, q10, #7
+
+ vst1.u8 {d6}, [r4], r5 ;store result
+ vst1.u8 {d7}, [r4], r5
+ vst1.u8 {d8}, [r4], r5
+ vst1.u8 {d9}, [r4], r5
+
+ add sp, sp, #32
+ pop {r4-r5,pc}
+
+;--------------------
+firstpass_filter8x4_only
+ vabs.s32 q12, q14
+ vabs.s32 q13, q15
+
+ sub r0, r0, #2 ;move srcptr back to (line-2) and (column-2)
+ vld1.u8 {q3}, [r0], r1 ;load src data
+
+ vdup.8 d0, d24[0] ;first_pass filter (d0-d5)
+ vld1.u8 {q4}, [r0], r1
+ vdup.8 d1, d24[4]
+ vld1.u8 {q5}, [r0], r1
+ vdup.8 d2, d25[0]
+ vld1.u8 {q6}, [r0], r1
+ vdup.8 d3, d25[4]
+ vdup.8 d4, d26[0]
+ vdup.8 d5, d26[4]
+
+;First pass: output_height lines x output_width columns (4x8)
+ pld [r0]
+ pld [r0, r1]
+ pld [r0, r1, lsl #1]
+
+ vmull.u8 q7, d6, d0 ;(src_ptr[-2] * vp9_filter[0])
+ vmull.u8 q8, d8, d0
+ vmull.u8 q9, d10, d0
+ vmull.u8 q10, d12, d0
+
+ vext.8 d28, d6, d7, #1 ;construct src_ptr[-1]
+ vext.8 d29, d8, d9, #1
+ vext.8 d30, d10, d11, #1
+ vext.8 d31, d12, d13, #1
+
+ vmlsl.u8 q7, d28, d1 ;-(src_ptr[-1] * vp9_filter[1])
+ vmlsl.u8 q8, d29, d1
+ vmlsl.u8 q9, d30, d1
+ vmlsl.u8 q10, d31, d1
+
+ vext.8 d28, d6, d7, #4 ;construct src_ptr[2]
+ vext.8 d29, d8, d9, #4
+ vext.8 d30, d10, d11, #4
+ vext.8 d31, d12, d13, #4
+
+ vmlsl.u8 q7, d28, d4 ;-(src_ptr[2] * vp9_filter[4])
+ vmlsl.u8 q8, d29, d4
+ vmlsl.u8 q9, d30, d4
+ vmlsl.u8 q10, d31, d4
+
+ vext.8 d28, d6, d7, #2 ;construct src_ptr[0]
+ vext.8 d29, d8, d9, #2
+ vext.8 d30, d10, d11, #2
+ vext.8 d31, d12, d13, #2
+
+ vmlal.u8 q7, d28, d2 ;(src_ptr[0] * vp9_filter[2])
+ vmlal.u8 q8, d29, d2
+ vmlal.u8 q9, d30, d2
+ vmlal.u8 q10, d31, d2
+
+ vext.8 d28, d6, d7, #5 ;construct src_ptr[3]
+ vext.8 d29, d8, d9, #5
+ vext.8 d30, d10, d11, #5
+ vext.8 d31, d12, d13, #5
+
+ vmlal.u8 q7, d28, d5 ;(src_ptr[3] * vp9_filter[5])
+ vmlal.u8 q8, d29, d5
+ vmlal.u8 q9, d30, d5
+ vmlal.u8 q10, d31, d5
+
+ vext.8 d28, d6, d7, #3 ;construct src_ptr[1]
+ vext.8 d29, d8, d9, #3
+ vext.8 d30, d10, d11, #3
+ vext.8 d31, d12, d13, #3
+
+ vmull.u8 q3, d28, d3 ;(src_ptr[1] * vp9_filter[3])
+ vmull.u8 q4, d29, d3
+ vmull.u8 q5, d30, d3
+ vmull.u8 q6, d31, d3
+
+ vqadd.s16 q7, q3 ;sum of all (src_data*filter_parameters)
+ vqadd.s16 q8, q4
+ vqadd.s16 q9, q5
+ vqadd.s16 q10, q6
+
+ vqrshrun.s16 d22, q7, #7 ;shift/round/saturate to u8
+ vqrshrun.s16 d23, q8, #7
+ vqrshrun.s16 d24, q9, #7
+ vqrshrun.s16 d25, q10, #7
+
+ vst1.u8 {d22}, [r4], r5 ;store result
+ vst1.u8 {d23}, [r4], r5
+ vst1.u8 {d24}, [r4], r5
+ vst1.u8 {d25}, [r4], r5
+
+ pop {r4-r5,pc}
+
+;---------------------
+secondpass_filter8x4_only
+;Second pass: 8x4
+ add r3, r12, r3, lsl #5
+ sub r0, r0, r1, lsl #1
+ vld1.s32 {q5, q6}, [r3] ;load second_pass filter
+ vabs.s32 q7, q5
+ vabs.s32 q8, q6
+
+ vld1.u8 {d22}, [r0], r1
+ vld1.u8 {d23}, [r0], r1
+ vld1.u8 {d24}, [r0], r1
+ vdup.8 d0, d14[0] ;second_pass filter parameters (d0-d5)
+ vld1.u8 {d25}, [r0], r1
+ vdup.8 d1, d14[4]
+ vld1.u8 {d26}, [r0], r1
+ vdup.8 d2, d15[0]
+ vld1.u8 {d27}, [r0], r1
+ vdup.8 d3, d15[4]
+ vld1.u8 {d28}, [r0], r1
+ vdup.8 d4, d16[0]
+ vld1.u8 {d29}, [r0], r1
+ vdup.8 d5, d16[4]
+ vld1.u8 {d30}, [r0], r1
+
+ vmull.u8 q3, d22, d0 ;(src_ptr[-2] * vp9_filter[0])
+ vmull.u8 q4, d23, d0
+ vmull.u8 q5, d24, d0
+ vmull.u8 q6, d25, d0
+
+ vmlsl.u8 q3, d23, d1 ;-(src_ptr[-1] * vp9_filter[1])
+ vmlsl.u8 q4, d24, d1
+ vmlsl.u8 q5, d25, d1
+ vmlsl.u8 q6, d26, d1
+
+ vmlsl.u8 q3, d26, d4 ;-(src_ptr[2] * vp9_filter[4])
+ vmlsl.u8 q4, d27, d4
+ vmlsl.u8 q5, d28, d4
+ vmlsl.u8 q6, d29, d4
+
+ vmlal.u8 q3, d24, d2 ;(src_ptr[0] * vp9_filter[2])
+ vmlal.u8 q4, d25, d2
+ vmlal.u8 q5, d26, d2
+ vmlal.u8 q6, d27, d2
+
+ vmlal.u8 q3, d27, d5 ;(src_ptr[3] * vp9_filter[5])
+ vmlal.u8 q4, d28, d5
+ vmlal.u8 q5, d29, d5
+ vmlal.u8 q6, d30, d5
+
+ vmull.u8 q7, d25, d3 ;(src_ptr[1] * vp9_filter[3])
+ vmull.u8 q8, d26, d3
+ vmull.u8 q9, d27, d3
+ vmull.u8 q10, d28, d3
+
+ vqadd.s16 q7, q3 ;sum of all (src_data*filter_parameters)
+ vqadd.s16 q8, q4
+ vqadd.s16 q9, q5
+ vqadd.s16 q10, q6
+
+ vqrshrun.s16 d6, q7, #7 ;shift/round/saturate to u8
+ vqrshrun.s16 d7, q8, #7
+ vqrshrun.s16 d8, q9, #7
+ vqrshrun.s16 d9, q10, #7
+
+ vst1.u8 {d6}, [r4], r5 ;store result
+ vst1.u8 {d7}, [r4], r5
+ vst1.u8 {d8}, [r4], r5
+ vst1.u8 {d9}, [r4], r5
+
+ pop {r4-r5,pc}
+
+ ENDP
+
+;-----------------
+
+ END
--- /dev/null
+++ b/vp9/common/arm/neon/sixtappredict8x8_neon.asm
@@ -1,0 +1,524 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+ EXPORT |vp8_sixtap_predict8x8_neon|
+ ARM
+ REQUIRE8
+ PRESERVE8
+
+ AREA ||.text||, CODE, READONLY, ALIGN=2
+
+filter8_coeff
+ DCD 0, 0, 128, 0, 0, 0, 0, 0
+ DCD 0, -6, 123, 12, -1, 0, 0, 0
+ DCD 2, -11, 108, 36, -8, 1, 0, 0
+ DCD 0, -9, 93, 50, -6, 0, 0, 0
+ DCD 3, -16, 77, 77, -16, 3, 0, 0
+ DCD 0, -6, 50, 93, -9, 0, 0, 0
+ DCD 1, -8, 36, 108, -11, 2, 0, 0
+ DCD 0, -1, 12, 123, -6, 0, 0, 0
+
+; r0 unsigned char *src_ptr,
+; r1 int src_pixels_per_line,
+; r2 int xoffset,
+; r3 int yoffset,
+; stack(r4) unsigned char *dst_ptr,
+; stack(r5) int dst_pitch
+
+|vp8_sixtap_predict8x8_neon| PROC
+ push {r4-r5, lr}
+
+ adr r12, filter8_coeff
+
+ ldr r4, [sp, #12] ;load parameters from stack
+ ldr r5, [sp, #16] ;load parameters from stack
+
+ cmp r2, #0 ;skip first_pass filter if xoffset=0
+ beq secondpass_filter8x8_only
+
+ add r2, r12, r2, lsl #5 ;calculate filter location
+
+ cmp r3, #0 ;skip second_pass filter if yoffset=0
+
+ vld1.s32 {q14, q15}, [r2] ;load first_pass filter
+
+ beq firstpass_filter8x8_only
+
+ sub sp, sp, #64 ;reserve space on stack for temporary storage
+ mov lr, sp
+
+ vabs.s32 q12, q14
+ vabs.s32 q13, q15
+
+ mov r2, #2 ;loop counter
+ sub r0, r0, #2 ;move srcptr back to (line-2) and (column-2)
+ sub r0, r0, r1, lsl #1
+
+ vdup.8 d0, d24[0] ;first_pass filter (d0-d5)
+ vdup.8 d1, d24[4]
+ vdup.8 d2, d25[0]
+
+;First pass: output_height lines x output_width columns (13x8)
+ vld1.u8 {q3}, [r0], r1 ;load src data
+ vdup.8 d3, d25[4]
+ vld1.u8 {q4}, [r0], r1
+ vdup.8 d4, d26[0]
+ vld1.u8 {q5}, [r0], r1
+ vdup.8 d5, d26[4]
+ vld1.u8 {q6}, [r0], r1
+
+filt_blk2d_fp8x8_loop_neon
+ pld [r0]
+ pld [r0, r1]
+ pld [r0, r1, lsl #1]
+
+ vmull.u8 q7, d6, d0 ;(src_ptr[-2] * vp9_filter[0])
+ vmull.u8 q8, d8, d0
+ vmull.u8 q9, d10, d0
+ vmull.u8 q10, d12, d0
+
+ vext.8 d28, d6, d7, #1 ;construct src_ptr[-1]
+ vext.8 d29, d8, d9, #1
+ vext.8 d30, d10, d11, #1
+ vext.8 d31, d12, d13, #1
+
+ vmlsl.u8 q7, d28, d1 ;-(src_ptr[-1] * vp9_filter[1])
+ vmlsl.u8 q8, d29, d1
+ vmlsl.u8 q9, d30, d1
+ vmlsl.u8 q10, d31, d1
+
+ vext.8 d28, d6, d7, #4 ;construct src_ptr[2]
+ vext.8 d29, d8, d9, #4
+ vext.8 d30, d10, d11, #4
+ vext.8 d31, d12, d13, #4
+
+ vmlsl.u8 q7, d28, d4 ;-(src_ptr[2] * vp9_filter[4])
+ vmlsl.u8 q8, d29, d4
+ vmlsl.u8 q9, d30, d4
+ vmlsl.u8 q10, d31, d4
+
+ vext.8 d28, d6, d7, #2 ;construct src_ptr[0]
+ vext.8 d29, d8, d9, #2
+ vext.8 d30, d10, d11, #2
+ vext.8 d31, d12, d13, #2
+
+ vmlal.u8 q7, d28, d2 ;(src_ptr[0] * vp9_filter[2])
+ vmlal.u8 q8, d29, d2
+ vmlal.u8 q9, d30, d2
+ vmlal.u8 q10, d31, d2
+
+ vext.8 d28, d6, d7, #5 ;construct src_ptr[3]
+ vext.8 d29, d8, d9, #5
+ vext.8 d30, d10, d11, #5
+ vext.8 d31, d12, d13, #5
+
+ vmlal.u8 q7, d28, d5 ;(src_ptr[3] * vp9_filter[5])
+ vmlal.u8 q8, d29, d5
+ vmlal.u8 q9, d30, d5
+ vmlal.u8 q10, d31, d5
+
+ vext.8 d28, d6, d7, #3 ;construct src_ptr[1]
+ vext.8 d29, d8, d9, #3
+ vext.8 d30, d10, d11, #3
+ vext.8 d31, d12, d13, #3
+
+ vmull.u8 q3, d28, d3 ;(src_ptr[1] * vp9_filter[3])
+ vmull.u8 q4, d29, d3
+ vmull.u8 q5, d30, d3
+ vmull.u8 q6, d31, d3
+
+ subs r2, r2, #1
+
+ vqadd.s16 q7, q3 ;sum of all (src_data*filter_parameters)
+ vqadd.s16 q8, q4
+ vqadd.s16 q9, q5
+ vqadd.s16 q10, q6
+
+ vld1.u8 {q3}, [r0], r1 ;load src data
+
+ vqrshrun.s16 d22, q7, #7 ;shift/round/saturate to u8
+ vqrshrun.s16 d23, q8, #7
+ vqrshrun.s16 d24, q9, #7
+ vqrshrun.s16 d25, q10, #7
+
+ vst1.u8 {d22}, [lr]! ;store result
+ vld1.u8 {q4}, [r0], r1
+ vst1.u8 {d23}, [lr]!
+ vld1.u8 {q5}, [r0], r1
+ vst1.u8 {d24}, [lr]!
+ vld1.u8 {q6}, [r0], r1
+ vst1.u8 {d25}, [lr]!
+
+ bne filt_blk2d_fp8x8_loop_neon
+
+    ;first_pass filtering on the remaining 5 lines of data
+ ;vld1.u8 {q3}, [r0], r1 ;load src data
+ ;vld1.u8 {q4}, [r0], r1
+ ;vld1.u8 {q5}, [r0], r1
+ ;vld1.u8 {q6}, [r0], r1
+ vld1.u8 {q7}, [r0], r1
+
+ vmull.u8 q8, d6, d0 ;(src_ptr[-2] * vp9_filter[0])
+ vmull.u8 q9, d8, d0
+ vmull.u8 q10, d10, d0
+ vmull.u8 q11, d12, d0
+ vmull.u8 q12, d14, d0
+
+ vext.8 d27, d6, d7, #1 ;construct src_ptr[-1]
+ vext.8 d28, d8, d9, #1
+ vext.8 d29, d10, d11, #1
+ vext.8 d30, d12, d13, #1
+ vext.8 d31, d14, d15, #1
+
+ vmlsl.u8 q8, d27, d1 ;-(src_ptr[-1] * vp9_filter[1])
+ vmlsl.u8 q9, d28, d1
+ vmlsl.u8 q10, d29, d1
+ vmlsl.u8 q11, d30, d1
+ vmlsl.u8 q12, d31, d1
+
+ vext.8 d27, d6, d7, #4 ;construct src_ptr[2]
+ vext.8 d28, d8, d9, #4
+ vext.8 d29, d10, d11, #4
+ vext.8 d30, d12, d13, #4
+ vext.8 d31, d14, d15, #4
+
+ vmlsl.u8 q8, d27, d4 ;-(src_ptr[2] * vp9_filter[4])
+ vmlsl.u8 q9, d28, d4
+ vmlsl.u8 q10, d29, d4
+ vmlsl.u8 q11, d30, d4
+ vmlsl.u8 q12, d31, d4
+
+ vext.8 d27, d6, d7, #2 ;construct src_ptr[0]
+ vext.8 d28, d8, d9, #2
+ vext.8 d29, d10, d11, #2
+ vext.8 d30, d12, d13, #2
+ vext.8 d31, d14, d15, #2
+
+ vmlal.u8 q8, d27, d2 ;(src_ptr[0] * vp9_filter[2])
+ vmlal.u8 q9, d28, d2
+ vmlal.u8 q10, d29, d2
+ vmlal.u8 q11, d30, d2
+ vmlal.u8 q12, d31, d2
+
+ vext.8 d27, d6, d7, #5 ;construct src_ptr[3]
+ vext.8 d28, d8, d9, #5
+ vext.8 d29, d10, d11, #5
+ vext.8 d30, d12, d13, #5
+ vext.8 d31, d14, d15, #5
+
+ vmlal.u8 q8, d27, d5 ;(src_ptr[3] * vp9_filter[5])
+ vmlal.u8 q9, d28, d5
+ vmlal.u8 q10, d29, d5
+ vmlal.u8 q11, d30, d5
+ vmlal.u8 q12, d31, d5
+
+ vext.8 d27, d6, d7, #3 ;construct src_ptr[1]
+ vext.8 d28, d8, d9, #3
+ vext.8 d29, d10, d11, #3
+ vext.8 d30, d12, d13, #3
+ vext.8 d31, d14, d15, #3
+
+ vmull.u8 q3, d27, d3 ;(src_ptr[1] * vp9_filter[3])
+ vmull.u8 q4, d28, d3
+ vmull.u8 q5, d29, d3
+ vmull.u8 q6, d30, d3
+ vmull.u8 q7, d31, d3
+
+ vqadd.s16 q8, q3 ;sum of all (src_data*filter_parameters)
+ vqadd.s16 q9, q4
+ vqadd.s16 q10, q5
+ vqadd.s16 q11, q6
+ vqadd.s16 q12, q7
+
+ add r3, r12, r3, lsl #5
+
+ vqrshrun.s16 d26, q8, #7 ;shift/round/saturate to u8
+ sub lr, lr, #64
+ vqrshrun.s16 d27, q9, #7
+ vld1.u8 {q9}, [lr]! ;load intermediate data from stack
+ vqrshrun.s16 d28, q10, #7
+ vld1.u8 {q10}, [lr]!
+
+ vld1.s32 {q5, q6}, [r3] ;load second_pass filter
+
+ vqrshrun.s16 d29, q11, #7
+ vld1.u8 {q11}, [lr]!
+
+ vabs.s32 q7, q5
+ vabs.s32 q8, q6
+
+ vqrshrun.s16 d30, q12, #7
+ vld1.u8 {q12}, [lr]!
+
+;Second pass: 8x8
+ mov r3, #2 ;loop counter
+
+ vdup.8 d0, d14[0] ;second_pass filter parameters (d0-d5)
+ vdup.8 d1, d14[4]
+ vdup.8 d2, d15[0]
+ vdup.8 d3, d15[4]
+ vdup.8 d4, d16[0]
+ vdup.8 d5, d16[4]
+
+filt_blk2d_sp8x8_loop_neon
+ vmull.u8 q3, d18, d0 ;(src_ptr[-2] * vp9_filter[0])
+ vmull.u8 q4, d19, d0
+ vmull.u8 q5, d20, d0
+ vmull.u8 q6, d21, d0
+
+ vmlsl.u8 q3, d19, d1 ;-(src_ptr[-1] * vp9_filter[1])
+ vmlsl.u8 q4, d20, d1
+ vmlsl.u8 q5, d21, d1
+ vmlsl.u8 q6, d22, d1
+
+ vmlsl.u8 q3, d22, d4 ;-(src_ptr[2] * vp9_filter[4])
+ vmlsl.u8 q4, d23, d4
+ vmlsl.u8 q5, d24, d4
+ vmlsl.u8 q6, d25, d4
+
+ vmlal.u8 q3, d20, d2 ;(src_ptr[0] * vp9_filter[2])
+ vmlal.u8 q4, d21, d2
+ vmlal.u8 q5, d22, d2
+ vmlal.u8 q6, d23, d2
+
+ vmlal.u8 q3, d23, d5 ;(src_ptr[3] * vp9_filter[5])
+ vmlal.u8 q4, d24, d5
+ vmlal.u8 q5, d25, d5
+ vmlal.u8 q6, d26, d5
+
+ vmull.u8 q7, d21, d3 ;(src_ptr[1] * vp9_filter[3])
+ vmull.u8 q8, d22, d3
+ vmull.u8 q9, d23, d3
+ vmull.u8 q10, d24, d3
+
+ subs r3, r3, #1
+
+ vqadd.s16 q7, q3 ;sum of all (src_data*filter_parameters)
+ vqadd.s16 q8, q4
+ vqadd.s16 q9, q5
+ vqadd.s16 q10, q6
+
+ vqrshrun.s16 d6, q7, #7 ;shift/round/saturate to u8
+ vqrshrun.s16 d7, q8, #7
+ vqrshrun.s16 d8, q9, #7
+ vqrshrun.s16 d9, q10, #7
+
+ vmov q9, q11
+ vst1.u8 {d6}, [r4], r5 ;store result
+ vmov q10, q12
+ vst1.u8 {d7}, [r4], r5
+ vmov q11, q13
+ vst1.u8 {d8}, [r4], r5
+ vmov q12, q14
+ vst1.u8 {d9}, [r4], r5
+ vmov d26, d30
+
+ bne filt_blk2d_sp8x8_loop_neon
+
+ add sp, sp, #64
+ pop {r4-r5,pc}
+
+;---------------------
+firstpass_filter8x8_only
+ ;add r2, r12, r2, lsl #5 ;calculate filter location
+ ;vld1.s32 {q14, q15}, [r2] ;load first_pass filter
+ vabs.s32 q12, q14
+ vabs.s32 q13, q15
+
+ mov r2, #2 ;loop counter
+ sub r0, r0, #2 ;move srcptr back to (line-2) and (column-2)
+
+ vdup.8 d0, d24[0] ;first_pass filter (d0-d5)
+ vdup.8 d1, d24[4]
+ vdup.8 d2, d25[0]
+ vdup.8 d3, d25[4]
+ vdup.8 d4, d26[0]
+ vdup.8 d5, d26[4]
+
+;First pass: output_height lines x output_width columns (8x8)
+filt_blk2d_fpo8x8_loop_neon
+ vld1.u8 {q3}, [r0], r1 ;load src data
+ vld1.u8 {q4}, [r0], r1
+ vld1.u8 {q5}, [r0], r1
+ vld1.u8 {q6}, [r0], r1
+
+ pld [r0]
+ pld [r0, r1]
+ pld [r0, r1, lsl #1]
+
+ vmull.u8 q7, d6, d0 ;(src_ptr[-2] * vp9_filter[0])
+ vmull.u8 q8, d8, d0
+ vmull.u8 q9, d10, d0
+ vmull.u8 q10, d12, d0
+
+ vext.8 d28, d6, d7, #1 ;construct src_ptr[-1]
+ vext.8 d29, d8, d9, #1
+ vext.8 d30, d10, d11, #1
+ vext.8 d31, d12, d13, #1
+
+ vmlsl.u8 q7, d28, d1 ;-(src_ptr[-1] * vp9_filter[1])
+ vmlsl.u8 q8, d29, d1
+ vmlsl.u8 q9, d30, d1
+ vmlsl.u8 q10, d31, d1
+
+ vext.8 d28, d6, d7, #4 ;construct src_ptr[2]
+ vext.8 d29, d8, d9, #4
+ vext.8 d30, d10, d11, #4
+ vext.8 d31, d12, d13, #4
+
+ vmlsl.u8 q7, d28, d4 ;-(src_ptr[2] * vp9_filter[4])
+ vmlsl.u8 q8, d29, d4
+ vmlsl.u8 q9, d30, d4
+ vmlsl.u8 q10, d31, d4
+
+ vext.8 d28, d6, d7, #2 ;construct src_ptr[0]
+ vext.8 d29, d8, d9, #2
+ vext.8 d30, d10, d11, #2
+ vext.8 d31, d12, d13, #2
+
+ vmlal.u8 q7, d28, d2 ;(src_ptr[0] * vp9_filter[2])
+ vmlal.u8 q8, d29, d2
+ vmlal.u8 q9, d30, d2
+ vmlal.u8 q10, d31, d2
+
+ vext.8 d28, d6, d7, #5 ;construct src_ptr[3]
+ vext.8 d29, d8, d9, #5
+ vext.8 d30, d10, d11, #5
+ vext.8 d31, d12, d13, #5
+
+ vmlal.u8 q7, d28, d5 ;(src_ptr[3] * vp9_filter[5])
+ vmlal.u8 q8, d29, d5
+ vmlal.u8 q9, d30, d5
+ vmlal.u8 q10, d31, d5
+
+ vext.8 d28, d6, d7, #3 ;construct src_ptr[1]
+ vext.8 d29, d8, d9, #3
+ vext.8 d30, d10, d11, #3
+ vext.8 d31, d12, d13, #3
+
+ vmull.u8 q3, d28, d3 ;(src_ptr[1] * vp9_filter[3])
+ vmull.u8 q4, d29, d3
+ vmull.u8 q5, d30, d3
+ vmull.u8 q6, d31, d3
+ ;
+ vqadd.s16 q7, q3 ;sum of all (src_data*filter_parameters)
+ vqadd.s16 q8, q4
+ vqadd.s16 q9, q5
+ vqadd.s16 q10, q6
+
+ subs r2, r2, #1
+
+ vqrshrun.s16 d22, q7, #7 ;shift/round/saturate to u8
+ vqrshrun.s16 d23, q8, #7
+ vqrshrun.s16 d24, q9, #7
+ vqrshrun.s16 d25, q10, #7
+
+ vst1.u8 {d22}, [r4], r5 ;store result
+ vst1.u8 {d23}, [r4], r5
+ vst1.u8 {d24}, [r4], r5
+ vst1.u8 {d25}, [r4], r5
+
+ bne filt_blk2d_fpo8x8_loop_neon
+
+ pop {r4-r5,pc}
+
+;---------------------
+secondpass_filter8x8_only
+ sub r0, r0, r1, lsl #1
+ add r3, r12, r3, lsl #5
+
+ vld1.u8 {d18}, [r0], r1 ;load src data
+ vld1.s32 {q5, q6}, [r3] ;load second_pass filter
+ vld1.u8 {d19}, [r0], r1
+ vabs.s32 q7, q5
+ vld1.u8 {d20}, [r0], r1
+ vabs.s32 q8, q6
+ vld1.u8 {d21}, [r0], r1
+ mov r3, #2 ;loop counter
+ vld1.u8 {d22}, [r0], r1
+ vdup.8 d0, d14[0] ;second_pass filter parameters (d0-d5)
+ vld1.u8 {d23}, [r0], r1
+ vdup.8 d1, d14[4]
+ vld1.u8 {d24}, [r0], r1
+ vdup.8 d2, d15[0]
+ vld1.u8 {d25}, [r0], r1
+ vdup.8 d3, d15[4]
+ vld1.u8 {d26}, [r0], r1
+ vdup.8 d4, d16[0]
+ vld1.u8 {d27}, [r0], r1
+ vdup.8 d5, d16[4]
+ vld1.u8 {d28}, [r0], r1
+ vld1.u8 {d29}, [r0], r1
+ vld1.u8 {d30}, [r0], r1
+
+;Second pass: 8x8
+filt_blk2d_spo8x8_loop_neon
+ vmull.u8 q3, d18, d0 ;(src_ptr[-2] * vp9_filter[0])
+ vmull.u8 q4, d19, d0
+ vmull.u8 q5, d20, d0
+ vmull.u8 q6, d21, d0
+
+ vmlsl.u8 q3, d19, d1 ;-(src_ptr[-1] * vp9_filter[1])
+ vmlsl.u8 q4, d20, d1
+ vmlsl.u8 q5, d21, d1
+ vmlsl.u8 q6, d22, d1
+
+ vmlsl.u8 q3, d22, d4 ;-(src_ptr[2] * vp9_filter[4])
+ vmlsl.u8 q4, d23, d4
+ vmlsl.u8 q5, d24, d4
+ vmlsl.u8 q6, d25, d4
+
+ vmlal.u8 q3, d20, d2 ;(src_ptr[0] * vp9_filter[2])
+ vmlal.u8 q4, d21, d2
+ vmlal.u8 q5, d22, d2
+ vmlal.u8 q6, d23, d2
+
+ vmlal.u8 q3, d23, d5 ;(src_ptr[3] * vp9_filter[5])
+ vmlal.u8 q4, d24, d5
+ vmlal.u8 q5, d25, d5
+ vmlal.u8 q6, d26, d5
+
+ vmull.u8 q7, d21, d3 ;(src_ptr[1] * vp9_filter[3])
+ vmull.u8 q8, d22, d3
+ vmull.u8 q9, d23, d3
+ vmull.u8 q10, d24, d3
+
+ subs r3, r3, #1
+
+ vqadd.s16 q7, q3 ;sum of all (src_data*filter_parameters)
+ vqadd.s16 q8, q4
+ vqadd.s16 q9, q5
+ vqadd.s16 q10, q6
+
+ vqrshrun.s16 d6, q7, #7 ;shift/round/saturate to u8
+ vqrshrun.s16 d7, q8, #7
+ vqrshrun.s16 d8, q9, #7
+ vqrshrun.s16 d9, q10, #7
+
+ vmov q9, q11
+ vst1.u8 {d6}, [r4], r5 ;store result
+ vmov q10, q12
+ vst1.u8 {d7}, [r4], r5
+ vmov q11, q13
+ vst1.u8 {d8}, [r4], r5
+ vmov q12, q14
+ vst1.u8 {d9}, [r4], r5
+ vmov d26, d30
+
+ bne filt_blk2d_spo8x8_loop_neon
+
+ pop {r4-r5,pc}
+
+ ENDP
+
+;-----------------
+
+ END
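
The NEON routines above implement the 6-tap sub-pixel interpolation filter: each output pixel is a weighted sum of src_ptr[-2..3], taps 1 and 4 are subtracted (vmlsl) while the others are accumulated (vmull/vmlal), the two partial accumulators are combined with a saturating 16-bit add (vqadd.s16), and the result is rounded, shifted right by 7 and saturated to 8 bits (vqrshrun.s16 #7). The filter taps themselves are loaded as signed 32-bit values and passed through vabs, so the assembly works with absolute tap values and encodes the sign in the multiply-subtract instructions. A scalar C sketch of the per-pixel operation, ignoring the intermediate 16-bit saturation and using illustrative names that are not part of the patch:

/* Scalar reference sketch of the per-pixel operation the NEON code above
 * vectorizes; filter[] holds the six absolute tap values (after vabs),
 * and taps 1 and 4 are the negative ones (hence vmlsl in the assembly). */
static unsigned char sixtap_pixel(const unsigned char *src,
                                  const int filter[6]) {
  int sum;
  sum  = src[-2] * filter[0];   /* vmull/vmlal.u8 ..., d0 */
  sum -= src[-1] * filter[1];   /* vmlsl.u8       ..., d1 */
  sum += src[ 0] * filter[2];   /* vmlal.u8       ..., d2 */
  sum += src[ 1] * filter[3];   /* vmull.u8 ..., d3, combined via vqadd.s16 */
  sum -= src[ 2] * filter[4];   /* vmlsl.u8       ..., d4 */
  sum += src[ 3] * filter[5];   /* vmlal.u8       ..., d5 */
  sum = (sum + 64) >> 7;        /* vqrshrun.s16 #7: round, shift, saturate */
  if (sum < 0)   sum = 0;
  if (sum > 255) sum = 255;
  return (unsigned char)sum;
}
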
--- /dev/null
+++ b/vp9/common/arm/recon_arm.h
@@ -1,0 +1,90 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef RECON_ARM_H
+#define RECON_ARM_H
+
+#if HAVE_ARMV6
+extern prototype_recon_block(vp9_recon_b_armv6);
+extern prototype_recon_block(vp9_recon2b_armv6);
+extern prototype_recon_block(vp9_recon4b_armv6);
+
+extern prototype_copy_block(vp9_copy_mem8x8_v6);
+extern prototype_copy_block(vp9_copy_mem8x4_v6);
+extern prototype_copy_block(vp9_copy_mem16x16_v6);
+
+#if !CONFIG_RUNTIME_CPU_DETECT
+#undef vp8_recon_recon
+#define vp8_recon_recon vp9_recon_b_armv6
+
+#undef vp8_recon_recon2
+#define vp8_recon_recon2 vp9_recon2b_armv6
+
+#undef vp8_recon_recon4
+#define vp8_recon_recon4 vp9_recon4b_armv6
+
+#undef vp8_recon_copy8x8
+#define vp8_recon_copy8x8 vp9_copy_mem8x8_v6
+
+#undef vp8_recon_copy8x4
+#define vp8_recon_copy8x4 vp9_copy_mem8x4_v6
+
+#undef vp8_recon_copy16x16
+#define vp8_recon_copy16x16 vp9_copy_mem16x16_v6
+#endif
+#endif
+
+#if HAVE_ARMV7
+extern prototype_recon_block(vp9_recon_b_neon);
+extern prototype_recon_block(vp9_recon2b_neon);
+extern prototype_recon_block(vp9_recon4b_neon);
+
+extern prototype_copy_block(vp9_copy_mem8x8_neon);
+extern prototype_copy_block(vp9_copy_mem8x4_neon);
+extern prototype_copy_block(vp9_copy_mem16x16_neon);
+
+extern prototype_recon_macroblock(vp9_recon_mb_neon);
+
+extern prototype_build_intra_predictors(vp9_build_intra_predictors_mby_neon);
+extern prototype_build_intra_predictors(vp9_build_intra_predictors_mby_s_neon);
+
+#if !CONFIG_RUNTIME_CPU_DETECT
+#undef vp8_recon_recon
+#define vp8_recon_recon vp9_recon_b_neon
+
+#undef vp8_recon_recon2
+#define vp8_recon_recon2 vp9_recon2b_neon
+
+#undef vp8_recon_recon4
+#define vp8_recon_recon4 vp9_recon4b_neon
+
+#undef vp8_recon_copy8x8
+#define vp8_recon_copy8x8 vp9_copy_mem8x8_neon
+
+#undef vp8_recon_copy8x4
+#define vp8_recon_copy8x4 vp9_copy_mem8x4_neon
+
+#undef vp8_recon_copy16x16
+#define vp8_recon_copy16x16 vp9_copy_mem16x16_neon
+
+#undef vp8_recon_recon_mb
+#define vp8_recon_recon_mb vp9_recon_mb_neon
+
+#undef vp9_recon_build_intra_predictors_mby
+#define vp9_recon_build_intra_predictors_mby vp9_build_intra_predictors_mby_neon
+
+#undef vp9_recon_build_intra_predictors_mby_s
+#define vp9_recon_build_intra_predictors_mby_s vp9_build_intra_predictors_mby_s_neon
+
+#endif
+#endif
+
+#endif
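
The #undef/#define blocks above are the static half of the RTCD (runtime CPU detection) scheme: when CONFIG_RUNTIME_CPU_DETECT is disabled, the generic vp8_recon_* hook macros are rebound at compile time to the ARMv6 or NEON implementations, so call sites that use the hook names resolve directly to the optimized routines instead of going through a function-pointer table. A minimal sketch of the pattern, with hypothetical names:

/* Illustrative sketch of the static-dispatch pattern used above;
 * hook_default, hook_neon and call_site are hypothetical names. */
void hook_default(unsigned char *dst);
void hook_neon(unsigned char *dst);

#define my_hook hook_default        /* generic binding */

#if HAVE_ARMV7 && !CONFIG_RUNTIME_CPU_DETECT
#undef  my_hook
#define my_hook hook_neon           /* rebound at compile time */
#endif

void call_site(unsigned char *dst) {
  my_hook(dst);                     /* resolves to hook_neon in a NEON-only build */
}
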
--- /dev/null
+++ b/vp9/common/arm/reconintra_arm.c
@@ -1,0 +1,62 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "vpx_ports/config.h"
+#include "vp9/common/blockd.h"
+#include "vp9/common/reconintra.h"
+#include "vpx_mem/vpx_mem.h"
+#include "vp9/common/recon.h"
+
+#if HAVE_ARMV7
+extern void vp9_build_intra_predictors_mby_neon_func(
+ unsigned char *y_buffer,
+ unsigned char *ypred_ptr,
+ int y_stride,
+ int mode,
+ int Up,
+ int Left);
+
+void vp9_build_intra_predictors_mby_neon(MACROBLOCKD *xd) {
+ unsigned char *y_buffer = xd->dst.y_buffer;
+ unsigned char *ypred_ptr = xd->predictor;
+ int y_stride = xd->dst.y_stride;
+ int mode = xd->mode_info_context->mbmi.mode;
+ int Up = xd->up_available;
+ int Left = xd->left_available;
+
+ vp9_build_intra_predictors_mby_neon_func(y_buffer, ypred_ptr,
+ y_stride, mode, Up, Left);
+}
+#endif
+
+
+#if HAVE_ARMV7
+extern void vp9_build_intra_predictors_mby_s_neon_func(
+ unsigned char *y_buffer,
+ unsigned char *ypred_ptr,
+ int y_stride,
+ int mode,
+ int Up,
+ int Left);
+
+void vp9_build_intra_predictors_mby_s_neon(MACROBLOCKD *xd) {
+ unsigned char *y_buffer = xd->dst.y_buffer;
+ unsigned char *ypred_ptr = xd->predictor;
+ int y_stride = xd->dst.y_stride;
+ int mode = xd->mode_info_context->mbmi.mode;
+ int Up = xd->up_available;
+ int Left = xd->left_available;
+
+ vp9_build_intra_predictors_mby_s_neon_func(y_buffer, ypred_ptr,
+ y_stride, mode, Up, Left);
+}
+
+#endif
--- /dev/null
+++ b/vp9/common/arm/subpixel_arm.h
@@ -1,0 +1,89 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef SUBPIXEL_ARM_H
+#define SUBPIXEL_ARM_H
+
+#if HAVE_ARMV6
+extern prototype_subpixel_predict(vp9_sixtap_predict16x16_armv6);
+extern prototype_subpixel_predict(vp9_sixtap_predict8x8_armv6);
+extern prototype_subpixel_predict(vp9_sixtap_predict8x4_armv6);
+extern prototype_subpixel_predict(vp9_sixtap_predict_armv6);
+extern prototype_subpixel_predict(vp9_bilinear_predict16x16_armv6);
+extern prototype_subpixel_predict(vp9_bilinear_predict8x8_armv6);
+extern prototype_subpixel_predict(vp9_bilinear_predict8x4_armv6);
+extern prototype_subpixel_predict(vp9_bilinear_predict4x4_armv6);
+
+#if !CONFIG_RUNTIME_CPU_DETECT
+#undef vp9_subpix_sixtap16x16
+#define vp9_subpix_sixtap16x16 vp9_sixtap_predict16x16_armv6
+
+#undef vp9_subpix_sixtap8x8
+#define vp9_subpix_sixtap8x8 vp9_sixtap_predict8x8_armv6
+
+#undef vp9_subpix_sixtap8x4
+#define vp9_subpix_sixtap8x4 vp9_sixtap_predict8x4_armv6
+
+#undef vp9_subpix_sixtap4x4
+#define vp9_subpix_sixtap4x4 vp9_sixtap_predict_armv6
+
+#undef vp9_subpix_bilinear16x16
+#define vp9_subpix_bilinear16x16 vp9_bilinear_predict16x16_armv6
+
+#undef vp9_subpix_bilinear8x8
+#define vp9_subpix_bilinear8x8 vp9_bilinear_predict8x8_armv6
+
+#undef vp9_subpix_bilinear8x4
+#define vp9_subpix_bilinear8x4 vp9_bilinear_predict8x4_armv6
+
+#undef vp9_subpix_bilinear4x4
+#define vp9_subpix_bilinear4x4 vp9_bilinear_predict4x4_armv6
+#endif
+#endif
+
+#if HAVE_ARMV7
+extern prototype_subpixel_predict(vp9_sixtap_predict16x16_neon);
+extern prototype_subpixel_predict(vp9_sixtap_predict8x8_neon);
+extern prototype_subpixel_predict(vp9_sixtap_predict8x4_neon);
+extern prototype_subpixel_predict(vp9_sixtap_predict_neon);
+extern prototype_subpixel_predict(vp9_bilinear_predict16x16_neon);
+extern prototype_subpixel_predict(vp9_bilinear_predict8x8_neon);
+extern prototype_subpixel_predict(vp9_bilinear_predict8x4_neon);
+extern prototype_subpixel_predict(vp9_bilinear_predict4x4_neon);
+
+#if !CONFIG_RUNTIME_CPU_DETECT
+#undef vp9_subpix_sixtap16x16
+#define vp9_subpix_sixtap16x16 vp9_sixtap_predict16x16_neon
+
+#undef vp9_subpix_sixtap8x8
+#define vp9_subpix_sixtap8x8 vp9_sixtap_predict8x8_neon
+
+#undef vp9_subpix_sixtap8x4
+#define vp9_subpix_sixtap8x4 vp9_sixtap_predict8x4_neon
+
+#undef vp9_subpix_sixtap4x4
+#define vp9_subpix_sixtap4x4 vp9_sixtap_predict_neon
+
+#undef vp9_subpix_bilinear16x16
+#define vp9_subpix_bilinear16x16 vp9_bilinear_predict16x16_neon
+
+#undef vp9_subpix_bilinear8x8
+#define vp9_subpix_bilinear8x8 vp9_bilinear_predict8x8_neon
+
+#undef vp9_subpix_bilinear8x4
+#define vp9_subpix_bilinear8x4 vp9_bilinear_predict8x4_neon
+
+#undef vp9_subpix_bilinear4x4
+#define vp9_subpix_bilinear4x4 vp9_bilinear_predict4x4_neon
+#endif
+#endif
+
+#endif
--- /dev/null
+++ b/vp9/common/asm_com_offsets.c
@@ -1,0 +1,40 @@
+/*
+ * Copyright (c) 2011 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "vpx_config.h"
+#include "vpx/vpx_codec.h"
+#include "vpx_ports/asm_offsets.h"
+#include "vpx_scale/yv12config.h"
+
+BEGIN
+
+/* vpx_scale */
+DEFINE(yv12_buffer_config_y_width, offsetof(YV12_BUFFER_CONFIG, y_width));
+DEFINE(yv12_buffer_config_y_height, offsetof(YV12_BUFFER_CONFIG, y_height));
+DEFINE(yv12_buffer_config_y_stride, offsetof(YV12_BUFFER_CONFIG, y_stride));
+DEFINE(yv12_buffer_config_uv_width, offsetof(YV12_BUFFER_CONFIG, uv_width));
+DEFINE(yv12_buffer_config_uv_height, offsetof(YV12_BUFFER_CONFIG, uv_height));
+DEFINE(yv12_buffer_config_uv_stride, offsetof(YV12_BUFFER_CONFIG, uv_stride));
+DEFINE(yv12_buffer_config_y_buffer, offsetof(YV12_BUFFER_CONFIG, y_buffer));
+DEFINE(yv12_buffer_config_u_buffer, offsetof(YV12_BUFFER_CONFIG, u_buffer));
+DEFINE(yv12_buffer_config_v_buffer, offsetof(YV12_BUFFER_CONFIG, v_buffer));
+DEFINE(yv12_buffer_config_border, offsetof(YV12_BUFFER_CONFIG, border));
+DEFINE(VP8BORDERINPIXELS_VAL, VP8BORDERINPIXELS);
+
+END
+
+/* add asserts for any offset that is not supported by assembly code */
+/* add asserts for any size that is not supported by assembly code */
+
+#if HAVE_ARMV7
+/* vp8_yv12_extend_frame_borders_neon makes several assumptions based on this */
+ct_assert(VP8BORDERINPIXELS_VAL, VP8BORDERINPIXELS == 32)
+#endif
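
asm_com_offsets.c is never executed; it is compiled only so that the BEGIN/DEFINE/END macros from vpx_ports/asm_offsets.h embed the named structure offsets in the object file, from which a build-time extraction step emits assembler constants that the ARM code can use to index YV12_BUFFER_CONFIG fields by name. A minimal sketch of the underlying idea, assuming a simplified DEFINE that simply prints the values (the real macros embed them in the object file instead):

/* Hedged sketch: the offset-extraction idea with a simplified DEFINE.
 * yv12_like and DEFINE_PRINT are illustrative, not part of the patch. */
#include <stdio.h>
#include <stddef.h>

struct yv12_like { int y_width, y_height, y_stride; unsigned char *y_buffer; };

#define DEFINE_PRINT(sym, val) printf("%s EQU %d\n", #sym, (int)(val))

int main(void) {
  DEFINE_PRINT(y_stride_offset, offsetof(struct yv12_like, y_stride));
  DEFINE_PRINT(y_buffer_offset, offsetof(struct yv12_like, y_buffer));
  return 0;
}
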
--- /dev/null
+++ b/vp9/common/blockd.c
@@ -1,0 +1,29 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "blockd.h"
+#include "vpx_mem/vpx_mem.h"
+
+
+const unsigned char vp9_block2left[25] = {
+ 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8
+};
+const unsigned char vp9_block2above[25] = {
+ 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8
+};
+
+const unsigned char vp9_block2left_8x8[25] = {
+ 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 4, 4, 4, 4, 6, 6, 6, 6, 8
+};
+const unsigned char vp9_block2above_8x8[25] = {
+ 0, 0, 0, 0, 2, 2, 2, 2, 0, 0, 0, 0, 2, 2, 2, 2, 4, 4, 4, 4, 6, 6, 6, 6, 8
+};
+
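
The first two tables map each of the 25 per-macroblock blocks (16 Y, 4 U, 4 V, 1 Y2) to an entropy-context slot: a Y block's above context is indexed by its column and its left context by its row, the U and V blocks index the two-entry u[]/v[] contexts, and block 24 maps to the y2 slot, matching the y1[4]/u[2]/v[2]/y2 layout of ENTROPY_CONTEXT_PLANES declared in blockd.h. The *_8x8 variants are the analogous mappings used when the 8x8 transform is active. A hedged sketch of how the tables are used, assuming blockd.h is included:

/* Hedged sketch; treats ENTROPY_CONTEXT_PLANES as the flat 9-char array
 * (y1[4], u[2], v[2], y2) that its declaration implies. */
static ENTROPY_CONTEXT *above_context_for_block(ENTROPY_CONTEXT_PLANES *above,
                                                int b) {
  /* e.g. b = 5  (Y block at row 1, col 1) -> y1[1]
          b = 18 (U block)                 -> u[0]
          b = 24 (Y2 block)                -> y2        */
  return ((ENTROPY_CONTEXT *)above) + vp9_block2above[b];
}
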
--- /dev/null
+++ b/vp9/common/blockd.h
@@ -1,0 +1,518 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef __INC_BLOCKD_H
+#define __INC_BLOCKD_H
+
+void vpx_log(const char *format, ...);
+
+#include "vpx_ports/config.h"
+#include "vpx_scale/yv12config.h"
+#include "mv.h"
+#include "treecoder.h"
+#include "subpixel.h"
+#include "vpx_ports/mem.h"
+#include "common.h"
+
+#define TRUE 1
+#define FALSE 0
+
+// #define MODE_STATS
+
+/*#define DCPRED 1*/
+#define DCPREDSIMTHRESH 0
+#define DCPREDCNTTHRESH 3
+
+#define MB_FEATURE_TREE_PROBS 3
+#define PREDICTION_PROBS 3
+
+#define MBSKIP_CONTEXTS 3
+
+#define MAX_MB_SEGMENTS 4
+
+#define MAX_REF_LF_DELTAS 4
+#define MAX_MODE_LF_DELTAS 4
+
+/* Segment Feature Masks */
+#define SEGMENT_DELTADATA 0
+#define SEGMENT_ABSDATA 1
+#if CONFIG_NEWBESTREFMV || CONFIG_NEW_MVREF
+#define MAX_MV_REFS 19
+#endif
+
+typedef struct {
+ int r, c;
+} POS;
+
+typedef enum PlaneType {
+ PLANE_TYPE_Y_NO_DC = 0,
+ PLANE_TYPE_Y2,
+ PLANE_TYPE_UV,
+ PLANE_TYPE_Y_WITH_DC,
+} PLANE_TYPE;
+
+typedef char ENTROPY_CONTEXT;
+typedef struct {
+ ENTROPY_CONTEXT y1[4];
+ ENTROPY_CONTEXT u[2];
+ ENTROPY_CONTEXT v[2];
+ ENTROPY_CONTEXT y2;
+} ENTROPY_CONTEXT_PLANES;
+
+extern const unsigned char vp9_block2left[25];
+extern const unsigned char vp9_block2above[25];
+extern const unsigned char vp9_block2left_8x8[25];
+extern const unsigned char vp9_block2above_8x8[25];
+
+#define VP9_COMBINEENTROPYCONTEXTS( Dest, A, B) \
+ Dest = ((A)!=0) + ((B)!=0);
+
+typedef enum {
+ KEY_FRAME = 0,
+ INTER_FRAME = 1
+} FRAME_TYPE;
+
+typedef enum
+{
+ SIXTAP = 0,
+ BILINEAR = 1,
+ EIGHTTAP = 2,
+ EIGHTTAP_SHARP = 3,
+ SWITCHABLE /* should be the last one */
+} INTERPOLATIONFILTERTYPE;
+
+typedef enum
+{
+ DC_PRED, /* average of above and left pixels */
+ V_PRED, /* vertical prediction */
+ H_PRED, /* horizontal prediction */
+ D45_PRED, /* Directional 45 deg prediction [anti-clockwise from 0 deg hor] */
+ D135_PRED, /* Directional 135 deg prediction [anti-clockwise from 0 deg hor] */
+ D117_PRED, /* Directional 112 deg prediction [anti-clockwise from 0 deg hor] */
+ D153_PRED, /* Directional 157 deg prediction [anti-clockwise from 0 deg hor] */
+ D27_PRED, /* Directional 22 deg prediction [anti-clockwise from 0 deg hor] */
+ D63_PRED, /* Directional 67 deg prediction [anti-clockwise from 0 deg hor] */
+ TM_PRED, /* Truemotion prediction */
+ I8X8_PRED, /* 8x8 based prediction, each 8x8 has its own prediction mode */
+ B_PRED, /* block based prediction, each block has its own prediction mode */
+
+ NEARESTMV,
+ NEARMV,
+ ZEROMV,
+ NEWMV,
+ SPLITMV,
+
+ MB_MODE_COUNT
+} MB_PREDICTION_MODE;
+
+// Segment level features.
+typedef enum {
+ SEG_LVL_ALT_Q = 0, // Use alternate Quantizer ....
+ SEG_LVL_ALT_LF = 1, // Use alternate loop filter value...
+ SEG_LVL_REF_FRAME = 2, // Optional Segment reference frame
+ SEG_LVL_MODE = 3, // Optional Segment mode
+ SEG_LVL_EOB = 4, // EOB end stop marker.
+ SEG_LVL_TRANSFORM = 5, // Block transform size.
+ SEG_LVL_MAX = 6 // Number of MB level features supported
+
+} SEG_LVL_FEATURES;
+
+// Block transform sizes.
+typedef enum {
+ TX_4X4, // 4x4 dct transform
+ TX_8X8, // 8x8 dct transform
+ TX_16X16, // 16x16 dct transform
+ TX_SIZE_MAX // Number of different transforms available
+} TX_SIZE;
+
+typedef enum {
+ DCT_DCT = 0, // DCT in both horizontal and vertical
+ ADST_DCT = 1, // ADST in vertical, DCT in horizontal
+ DCT_ADST = 2, // DCT in vertical, ADST in horizontal
+ ADST_ADST = 3 // ADST in both directions
+} TX_TYPE;
+
+#define VP9_YMODES (B_PRED + 1)
+#define VP9_UV_MODES (TM_PRED + 1)
+#define VP9_I8X8_MODES (TM_PRED + 1)
+#define VP9_I32X32_MODES (TM_PRED + 1)
+
+#define VP9_MVREFS (1 + SPLITMV - NEARESTMV)
+
+typedef enum {
+ B_DC_PRED, /* average of above and left pixels */
+ B_TM_PRED,
+
+ B_VE_PRED, /* vertical prediction */
+ B_HE_PRED, /* horizontal prediction */
+
+ B_LD_PRED,
+ B_RD_PRED,
+
+ B_VR_PRED,
+ B_VL_PRED,
+ B_HD_PRED,
+ B_HU_PRED,
+
+ LEFT4X4,
+ ABOVE4X4,
+ ZERO4X4,
+ NEW4X4,
+
+ B_MODE_COUNT
+} B_PREDICTION_MODE;
+
+#define VP9_BINTRAMODES (B_HU_PRED + 1) /* 10 */
+#define VP9_SUBMVREFS (1 + NEW4X4 - LEFT4X4)
+
+typedef enum {
+ PARTITIONING_16X8 = 0,
+ PARTITIONING_8X16,
+ PARTITIONING_8X8,
+ PARTITIONING_4X4,
+ NB_PARTITIONINGS,
+} SPLITMV_PARTITIONING_TYPE;
+
+/* For keyframes, intra block modes are predicted by the (already decoded)
+ modes for the Y blocks to the left and above us; for interframes, there
+ is a single probability table. */
+
+union b_mode_info {
+ struct {
+ B_PREDICTION_MODE first;
+ TX_TYPE tx_type;
+
+#if CONFIG_COMP_INTRA_PRED
+ B_PREDICTION_MODE second;
+#endif
+ } as_mode;
+ struct {
+ int_mv first;
+ int_mv second;
+ } as_mv;
+};
+
+typedef enum {
+ INTRA_FRAME = 0,
+ LAST_FRAME = 1,
+ GOLDEN_FRAME = 2,
+ ALTREF_FRAME = 3,
+ MAX_REF_FRAMES = 4
+} MV_REFERENCE_FRAME;
+
+typedef struct {
+ MB_PREDICTION_MODE mode, uv_mode;
+#if CONFIG_COMP_INTRA_PRED
+ MB_PREDICTION_MODE second_mode, second_uv_mode;
+#endif
+ MV_REFERENCE_FRAME ref_frame, second_ref_frame;
+ TX_SIZE txfm_size;
+ int_mv mv[2]; // for each reference frame used
+#if CONFIG_NEWBESTREFMV || CONFIG_NEW_MVREF
+ int_mv ref_mvs[MAX_REF_FRAMES][MAX_MV_REFS];
+#endif
+
+ SPLITMV_PARTITIONING_TYPE partitioning;
+ unsigned char mb_skip_coeff; /* does this mb have coefficients at all; 1 = no coefficients, 0 = tokens need to be decoded */
+ unsigned char need_to_clamp_mvs;
+ unsigned char need_to_clamp_secondmv;
+ unsigned char segment_id; /* Which set of segmentation parameters should be used for this MB */
+
+ // Flags used for prediction status of various bistream signals
+ unsigned char seg_id_predicted;
+ unsigned char ref_predicted;
+
+ // Indicates if the mb is part of the image (1) vs border (0)
+ // This can be useful in determining whether the MB provides
+ // a valid predictor
+ unsigned char mb_in_image;
+
+#if CONFIG_PRED_FILTER
+ // Flag to turn the prediction signal filter on (1) / off (0) at the MB level
+ unsigned int pred_filter_enabled;
+#endif
+ INTERPOLATIONFILTERTYPE interp_filter;
+
+#if CONFIG_SUPERBLOCKS
+ // FIXME need a SB array of 4 MB_MODE_INFOs that
+ // only needs one encoded_as_sb.
+ unsigned char encoded_as_sb;
+#endif
+} MB_MODE_INFO;
+
+typedef struct {
+ MB_MODE_INFO mbmi;
+ union b_mode_info bmi[16];
+} MODE_INFO;
+
+typedef struct blockd {
+ short *qcoeff;
+ short *dqcoeff;
+ unsigned char *predictor;
+ short *diff;
+ short *dequant;
+
+ /* 16 Y blocks, 4 U blocks, 4 V blocks each with 16 entries */
+ unsigned char **base_pre;
+ unsigned char **base_second_pre;
+ int pre;
+ int pre_stride;
+
+ unsigned char **base_dst;
+ int dst;
+ int dst_stride;
+
+ int eob;
+
+ union b_mode_info bmi;
+} BLOCKD;
+
+typedef struct macroblockd {
+ DECLARE_ALIGNED(16, short, diff[400]); /* from idct diff */
+ DECLARE_ALIGNED(16, unsigned char, predictor[384]);
+ DECLARE_ALIGNED(16, short, qcoeff[400]);
+ DECLARE_ALIGNED(16, short, dqcoeff[400]);
+ DECLARE_ALIGNED(16, char, eobs[25]);
+
+ /* 16 Y blocks, 4 U, 4 V, 1 DC 2nd order block, each with 16 entries. */
+ BLOCKD block[25];
+ int fullpixel_mask;
+
+ YV12_BUFFER_CONFIG pre; /* Filtered copy of previous frame reconstruction */
+ struct {
+ uint8_t *y_buffer, *u_buffer, *v_buffer;
+ } second_pre;
+ YV12_BUFFER_CONFIG dst;
+
+ MODE_INFO *prev_mode_info_context;
+ MODE_INFO *mode_info_context;
+ int mode_info_stride;
+
+ FRAME_TYPE frame_type;
+
+ int up_available;
+ int left_available;
+
+ /* Y,U,V,Y2 */
+ ENTROPY_CONTEXT_PLANES *above_context;
+ ENTROPY_CONTEXT_PLANES *left_context;
+
+ /* 0 indicates segmentation at MB level is not enabled. Otherwise the individual bits indicate which features are active. */
+ unsigned char segmentation_enabled;
+
+ /* 0 (do not update) 1 (update) the macroblock segmentation map. */
+ unsigned char update_mb_segmentation_map;
+
+ /* 0 (do not update) 1 (update) the macroblock segmentation feature data. */
+ unsigned char update_mb_segmentation_data;
+
+ /* 0 (SEGMENT_DELTADATA) segment feature data is coded as deltas, 1 (SEGMENT_ABSDATA) as absolute values */
+ unsigned char mb_segment_abs_delta;
+
+ /* Per frame flags that define which MB level features (such as quantizer or loop filter level) */
+ /* are enabled, and when enabled, the probabilities used to decode the per-MB flags in MB_MODE_INFO */
+
+ // Probability Tree used to code Segment number
+ vp9_prob mb_segment_tree_probs[MB_FEATURE_TREE_PROBS];
+
+#if CONFIG_NEW_MVREF
+ vp9_prob mb_mv_ref_id_probs[MAX_REF_FRAMES][3];
+#endif
+
+ // Segment features
+ signed char segment_feature_data[MAX_MB_SEGMENTS][SEG_LVL_MAX];
+ unsigned int segment_feature_mask[MAX_MB_SEGMENTS];
+
+ /* mode_based Loop filter adjustment */
+ unsigned char mode_ref_lf_delta_enabled;
+ unsigned char mode_ref_lf_delta_update;
+
+ /* Delta values have the range +/- MAX_LOOP_FILTER */
+ signed char last_ref_lf_deltas[MAX_REF_LF_DELTAS]; /* 0 = Intra, Last, GF, ARF */
+ signed char ref_lf_deltas[MAX_REF_LF_DELTAS]; /* 0 = Intra, Last, GF, ARF */
+ signed char last_mode_lf_deltas[MAX_MODE_LF_DELTAS]; /* 0 = BPRED, ZERO_MV, MV, SPLIT */
+ signed char mode_lf_deltas[MAX_MODE_LF_DELTAS]; /* 0 = BPRED, ZERO_MV, MV, SPLIT */
+
+ /* Distance of MB away from frame edges */
+ int mb_to_left_edge;
+ int mb_to_right_edge;
+ int mb_to_top_edge;
+ int mb_to_bottom_edge;
+
+ unsigned int frames_since_golden;
+ unsigned int frames_till_alt_ref_frame;
+ vp9_subpix_fn_t subpixel_predict;
+ vp9_subpix_fn_t subpixel_predict8x4;
+ vp9_subpix_fn_t subpixel_predict8x8;
+ vp9_subpix_fn_t subpixel_predict16x16;
+ vp9_subpix_fn_t subpixel_predict_avg;
+ vp9_subpix_fn_t subpixel_predict_avg8x4;
+ vp9_subpix_fn_t subpixel_predict_avg8x8;
+ vp9_subpix_fn_t subpixel_predict_avg16x16;
+ int allow_high_precision_mv;
+
+ int corrupted;
+
+#if !CONFIG_SUPERBLOCKS && (ARCH_X86 || ARCH_X86_64)
+ /* This is an intermediate buffer currently used in sub-pixel motion search
+ * to keep a copy of the reference area. This buffer can be used for other
+ * purposes.
+ */
+ DECLARE_ALIGNED(32, unsigned char, y_buf[22 * 32]);
+#endif
+
+#if CONFIG_RUNTIME_CPU_DETECT
+ struct VP9_COMMON_RTCD *rtcd;
+#endif
+
+ int mb_index; // Index of the MB in the SB (0..3)
+ int q_index;
+
+} MACROBLOCKD;
+
+#define ACTIVE_HT 110 // quantization stepsize threshold
+
+#define ACTIVE_HT8 300
+
+#define ACTIVE_HT16 300
+
+// convert MB_PREDICTION_MODE to B_PREDICTION_MODE
+static B_PREDICTION_MODE pred_mode_conv(MB_PREDICTION_MODE mode) {
+ B_PREDICTION_MODE b_mode;
+ switch (mode) {
+ case DC_PRED:
+ b_mode = B_DC_PRED;
+ break;
+ case V_PRED:
+ b_mode = B_VE_PRED;
+ break;
+ case H_PRED:
+ b_mode = B_HE_PRED;
+ break;
+ case TM_PRED:
+ b_mode = B_TM_PRED;
+ break;
+ case D45_PRED:
+ b_mode = B_LD_PRED;
+ break;
+ case D135_PRED:
+ b_mode = B_RD_PRED;
+ break;
+ case D117_PRED:
+ b_mode = B_VR_PRED;
+ break;
+ case D153_PRED:
+ b_mode = B_HD_PRED;
+ break;
+ case D27_PRED:
+ b_mode = B_HU_PRED;
+ break;
+ case D63_PRED:
+ b_mode = B_VL_PRED;
+ break;
+ default :
+ // for debugging purposes; to be removed after full testing
+ assert(0);
+ break;
+ }
+ return b_mode;
+}
+
+// transform mapping
+static TX_TYPE txfm_map(B_PREDICTION_MODE bmode) {
+ // map transform type
+ TX_TYPE tx_type;
+ switch (bmode) {
+ case B_TM_PRED :
+ case B_RD_PRED :
+ tx_type = ADST_ADST;
+ break;
+
+ case B_VE_PRED :
+ case B_VR_PRED :
+ tx_type = ADST_DCT;
+ break;
+
+ case B_HE_PRED :
+ case B_HD_PRED :
+ case B_HU_PRED :
+ tx_type = DCT_ADST;
+ break;
+
+ default :
+ tx_type = DCT_DCT;
+ break;
+ }
+ return tx_type;
+}
+
+static TX_TYPE get_tx_type_4x4(const MACROBLOCKD *xd, const BLOCKD *b) {
+ TX_TYPE tx_type = DCT_DCT;
+ if (xd->mode_info_context->mbmi.mode == B_PRED &&
+ xd->q_index < ACTIVE_HT) {
+ tx_type = txfm_map(b->bmi.as_mode.first);
+ }
+ return tx_type;
+}
+
+static TX_TYPE get_tx_type_8x8(const MACROBLOCKD *xd, const BLOCKD *b) {
+ TX_TYPE tx_type = DCT_DCT;
+ if (xd->mode_info_context->mbmi.mode == I8X8_PRED &&
+ xd->q_index < ACTIVE_HT8) {
+ tx_type = txfm_map(pred_mode_conv(b->bmi.as_mode.first));
+ }
+ return tx_type;
+}
+
+static TX_TYPE get_tx_type_16x16(const MACROBLOCKD *xd, const BLOCKD *b) {
+ TX_TYPE tx_type = DCT_DCT;
+ if (xd->mode_info_context->mbmi.mode < I8X8_PRED &&
+ xd->q_index < ACTIVE_HT16) {
+ tx_type = txfm_map(pred_mode_conv(xd->mode_info_context->mbmi.mode));
+ }
+ return tx_type;
+}
+
+static TX_TYPE get_tx_type(const MACROBLOCKD *xd, const BLOCKD *b) {
+ TX_TYPE tx_type = DCT_DCT;
+ int ib = (b - xd->block);
+ if (ib >= 16)
+ return tx_type;
+ if (xd->mode_info_context->mbmi.txfm_size == TX_16X16) {
+ tx_type = get_tx_type_16x16(xd, b);
+ }
+ if (xd->mode_info_context->mbmi.txfm_size == TX_8X8) {
+ ib = (ib & 8) + ((ib & 4) >> 1);
+ tx_type = get_tx_type_8x8(xd, &xd->block[ib]);
+ }
+ if (xd->mode_info_context->mbmi.txfm_size == TX_4X4) {
+ tx_type = get_tx_type_4x4(xd, b);
+ }
+ return tx_type;
+}
+
+extern void vp9_build_block_doffsets(MACROBLOCKD *xd);
+extern void vp9_setup_block_dptrs(MACROBLOCKD *xd);
+
+static void update_blockd_bmi(MACROBLOCKD *xd) {
+ int i;
+ int is_4x4;
+ is_4x4 = (xd->mode_info_context->mbmi.mode == SPLITMV) ||
+ (xd->mode_info_context->mbmi.mode == I8X8_PRED) ||
+ (xd->mode_info_context->mbmi.mode == B_PRED);
+
+ if (is_4x4) {
+ for (i = 0; i < 16; i++) {
+ xd->block[i].bmi = xd->mode_info_context->bmi[i];
+ }
+ }
+}
+#endif /* __INC_BLOCKD_H */
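
get_tx_type() above only ever returns a non-DCT transform for intra-coded Y blocks: the 4x4, 8x8 and 16x16 helpers require B_PRED, I8X8_PRED or a whole-MB intra mode respectively, gate on the ACTIVE_HT* quantizer-index thresholds, and then let txfm_map() pick ADST along the prediction direction (vertical modes get ADST_DCT, horizontal modes DCT_ADST, TM and down-right prediction ADST_ADST); chroma and second-order blocks (ib >= 16) always stay DCT_DCT. A hedged usage sketch showing how a caller might turn the result into per-direction transform choices (the helper name is illustrative):

/* Hedged usage sketch; choose_transforms is a hypothetical caller. */
static void choose_transforms(const MACROBLOCKD *xd, const BLOCKD *b,
                              int *adst_vertical, int *adst_horizontal) {
  const TX_TYPE t = get_tx_type(xd, b);   /* dispatches on mbmi.txfm_size */
  *adst_vertical   = (t == ADST_DCT || t == ADST_ADST);  /* ADST on columns */
  *adst_horizontal = (t == DCT_ADST || t == ADST_ADST);  /* ADST on rows */
}
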
--- /dev/null
+++ b/vp9/common/coefupdateprobs.h
@@ -1,0 +1,16 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+/* Update probabilities for the nodes in the token entropy tree.
+ Generated file included by entropy.c */
+#define COEF_UPDATE_PROB 252
+#define COEF_UPDATE_PROB_8X8 252
+#define COEF_UPDATE_PROB_16X16 252
--- /dev/null
+++ b/vp9/common/common.h
@@ -1,0 +1,41 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef common_h
+#define common_h 1
+
+#include <assert.h>
+#include "vpx_config.h"
+/* Interface header for common constant data structures and lookup tables */
+
+#include "vpx_mem/vpx_mem.h"
+
+#include "common_types.h"
+
+/* Only needed for fixed-size arrays; for structs, just assign. */
+
+#define vp9_copy( Dest, Src) { \
+ assert( sizeof( Dest) == sizeof( Src)); \
+ vpx_memcpy( Dest, Src, sizeof( Src)); \
+ }
+
+/* Use this for variably-sized arrays. */
+
+#define vp9_copy_array( Dest, Src, N) { \
+ assert( sizeof( *Dest) == sizeof( *Src)); \
+ vpx_memcpy( Dest, Src, N * sizeof( *Src)); \
+ }
+
+#define vp9_zero( Dest) vpx_memset( &Dest, 0, sizeof( Dest));
+
+#define vp9_zero_array( Dest, N) vpx_memset( Dest, 0, N * sizeof( *Dest));
+
+#endif /* common_h */
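
vp9_copy and vp9_zero take the destination by name so that sizeof(Dest) covers the whole array (or struct), which is why they only work for true fixed-size arrays, while vp9_copy_array/vp9_zero_array exist for pointer destinations with an explicit element count. A short hedged usage sketch, assuming common.h (and thus vpx_mem) is included and with illustrative names:

/* Hedged usage sketch; counts, src_probs, dst_probs are illustrative. */
static void copy_zero_example(void) {
  unsigned int counts[4][8];
  unsigned char src_probs[16], dst_probs[16];
  unsigned char *heap_probs = vpx_malloc(16 * sizeof(*heap_probs));

  vp9_zero(counts);                           /* sizeof covers the whole 2-D array */
  vp9_zero(src_probs);
  vp9_copy(dst_probs, src_probs);             /* sizes checked by the assert */
  vp9_copy_array(heap_probs, src_probs, 16);  /* pointer destination: pass N explicitly */

  vpx_free(heap_probs);
}
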
--- /dev/null
+++ b/vp9/common/common_types.h
@@ -1,0 +1,18 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef __INC_COMMON_TYPES
+#define __INC_COMMON_TYPES
+
+#define TRUE 1
+#define FALSE 0
+
+#endif
--- /dev/null
+++ b/vp9/common/context.c
@@ -1,0 +1,397 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "entropy.h"
+
+/* *** GENERATED FILE: DO NOT EDIT *** */
+
+#if 0
+int Contexts[vp8_coef_counter_dimen];
+
+const int default_contexts[vp8_coef_counter_dimen] = {
+ {
+ // Block Type ( 0 )
+ {
+ // Coeff Band ( 0 )
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
+ },
+ {
+ // Coeff Band ( 1 )
+ {30190, 26544, 225, 24, 4, 0, 0, 0, 0, 0, 0, 4171593},
+ {26846, 25157, 1241, 130, 26, 6, 1, 0, 0, 0, 0, 149987},
+ {10484, 9538, 1006, 160, 36, 18, 0, 0, 0, 0, 0, 15104},
+ },
+ {
+ // Coeff Band ( 2 )
+ {25842, 40456, 1126, 83, 11, 2, 0, 0, 0, 0, 0, 0},
+ {9338, 8010, 512, 73, 7, 3, 2, 0, 0, 0, 0, 43294},
+ {1047, 751, 149, 31, 13, 6, 1, 0, 0, 0, 0, 879},
+ },
+ {
+ // Coeff Band ( 3 )
+ {26136, 9826, 252, 13, 0, 0, 0, 0, 0, 0, 0, 0},
+ {8134, 5574, 191, 14, 2, 0, 0, 0, 0, 0, 0, 35302},
+ { 605, 677, 116, 9, 1, 0, 0, 0, 0, 0, 0, 611},
+ },
+ {
+ // Coeff Band ( 4 )
+ {10263, 15463, 283, 17, 0, 0, 0, 0, 0, 0, 0, 0},
+ {2773, 2191, 128, 9, 2, 2, 0, 0, 0, 0, 0, 10073},
+ { 134, 125, 32, 4, 0, 2, 0, 0, 0, 0, 0, 50},
+ },
+ {
+ // Coeff Band ( 5 )
+ {10483, 2663, 23, 1, 0, 0, 0, 0, 0, 0, 0, 0},
+ {2137, 1251, 27, 1, 1, 0, 0, 0, 0, 0, 0, 14362},
+ { 116, 156, 14, 2, 1, 0, 0, 0, 0, 0, 0, 190},
+ },
+ {
+ // Coeff Band ( 6 )
+ {40977, 27614, 412, 28, 0, 0, 0, 0, 0, 0, 0, 0},
+ {6113, 5213, 261, 22, 3, 0, 0, 0, 0, 0, 0, 26164},
+ { 382, 312, 50, 14, 2, 0, 0, 0, 0, 0, 0, 345},
+ },
+ {
+ // Coeff Band ( 7 )
+ { 0, 26, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
+ { 0, 13, 0, 0, 0, 0, 0, 0, 0, 0, 0, 319},
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 8},
+ },
+ },
+ {
+ // Block Type ( 1 )
+ {
+ // Coeff Band ( 0 )
+ {3268, 19382, 1043, 250, 93, 82, 49, 26, 17, 8, 25, 82289},
+ {8758, 32110, 5436, 1832, 827, 668, 420, 153, 24, 0, 3, 52914},
+ {9337, 23725, 8487, 3954, 2107, 1836, 1069, 399, 59, 0, 0, 18620},
+ },
+ {
+ // Coeff Band ( 1 )
+ {12419, 8420, 452, 62, 9, 1, 0, 0, 0, 0, 0, 0},
+ {11715, 8705, 693, 92, 15, 7, 2, 0, 0, 0, 0, 53988},
+ {7603, 8585, 2306, 778, 270, 145, 39, 5, 0, 0, 0, 9136},
+ },
+ {
+ // Coeff Band ( 2 )
+ {15938, 14335, 1207, 184, 55, 13, 4, 1, 0, 0, 0, 0},
+ {7415, 6829, 1138, 244, 71, 26, 7, 0, 0, 0, 0, 9980},
+ {1580, 1824, 655, 241, 89, 46, 10, 2, 0, 0, 0, 429},
+ },
+ {
+ // Coeff Band ( 3 )
+ {19453, 5260, 201, 19, 0, 0, 0, 0, 0, 0, 0, 0},
+ {9173, 3758, 213, 22, 1, 1, 0, 0, 0, 0, 0, 9820},
+ {1689, 1277, 276, 51, 17, 4, 0, 0, 0, 0, 0, 679},
+ },
+ {
+ // Coeff Band ( 4 )
+ {12076, 10667, 620, 85, 19, 9, 5, 0, 0, 0, 0, 0},
+ {4665, 3625, 423, 55, 19, 9, 0, 0, 0, 0, 0, 5127},
+ { 415, 440, 143, 34, 20, 7, 2, 0, 0, 0, 0, 101},
+ },
+ {
+ // Coeff Band ( 5 )
+ {12183, 4846, 115, 11, 1, 0, 0, 0, 0, 0, 0, 0},
+ {4226, 3149, 177, 21, 2, 0, 0, 0, 0, 0, 0, 7157},
+ { 375, 621, 189, 51, 11, 4, 1, 0, 0, 0, 0, 198},
+ },
+ {
+ // Coeff Band ( 6 )
+ {61658, 37743, 1203, 94, 10, 3, 0, 0, 0, 0, 0, 0},
+ {15514, 11563, 903, 111, 14, 5, 0, 0, 0, 0, 0, 25195},
+ { 929, 1077, 291, 78, 14, 7, 1, 0, 0, 0, 0, 507},
+ },
+ {
+ // Coeff Band ( 7 )
+ { 0, 990, 15, 3, 0, 0, 0, 0, 0, 0, 0, 0},
+ { 0, 412, 13, 0, 0, 0, 0, 0, 0, 0, 0, 1641},
+ { 0, 18, 7, 1, 0, 0, 0, 0, 0, 0, 0, 30},
+ },
+ },
+ {
+ // Block Type ( 2 )
+ {
+ // Coeff Band ( 0 )
+ { 953, 24519, 628, 120, 28, 12, 4, 0, 0, 0, 0, 2248798},
+ {1525, 25654, 2647, 617, 239, 143, 42, 5, 0, 0, 0, 66837},
+ {1180, 11011, 3001, 1237, 532, 448, 239, 54, 5, 0, 0, 7122},
+ },
+ {
+ // Coeff Band ( 1 )
+ {1356, 2220, 67, 10, 4, 1, 0, 0, 0, 0, 0, 0},
+ {1450, 2544, 102, 18, 4, 3, 0, 0, 0, 0, 0, 57063},
+ {1182, 2110, 470, 130, 41, 21, 0, 0, 0, 0, 0, 6047},
+ },
+ {
+ // Coeff Band ( 2 )
+ { 370, 3378, 200, 30, 5, 4, 1, 0, 0, 0, 0, 0},
+ { 293, 1006, 131, 29, 11, 0, 0, 0, 0, 0, 0, 5404},
+ { 114, 387, 98, 23, 4, 8, 1, 0, 0, 0, 0, 236},
+ },
+ {
+ // Coeff Band ( 3 )
+ { 579, 194, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0},
+ { 395, 213, 5, 1, 0, 0, 0, 0, 0, 0, 0, 4157},
+ { 119, 122, 4, 0, 0, 0, 0, 0, 0, 0, 0, 300},
+ },
+ {
+ // Coeff Band ( 4 )
+ { 38, 557, 19, 0, 0, 0, 0, 0, 0, 0, 0, 0},
+ { 21, 114, 12, 1, 0, 0, 0, 0, 0, 0, 0, 427},
+ { 0, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 7},
+ },
+ {
+ // Coeff Band ( 5 )
+ { 52, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
+ { 18, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 652},
+ { 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 30},
+ },
+ {
+ // Coeff Band ( 6 )
+ { 640, 569, 10, 0, 0, 0, 0, 0, 0, 0, 0, 0},
+ { 25, 77, 2, 0, 0, 0, 0, 0, 0, 0, 0, 517},
+ { 4, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3},
+ },
+ {
+ // Coeff Band ( 7 )
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
+ },
+ },
+ {
+ // Block Type ( 3 )
+ {
+ // Coeff Band ( 0 )
+ {2506, 20161, 2707, 767, 261, 178, 107, 30, 14, 3, 0, 100694},
+ {8806, 36478, 8817, 3268, 1280, 850, 401, 114, 42, 0, 0, 58572},
+ {11003, 27214, 11798, 5716, 2482, 2072, 1048, 175, 32, 0, 0, 19284},
+ },
+ {
+ // Coeff Band ( 1 )
+ {9738, 11313, 959, 205, 70, 18, 11, 1, 0, 0, 0, 0},
+ {12628, 15085, 1507, 273, 52, 19, 9, 0, 0, 0, 0, 54280},
+ {10701, 15846, 5561, 1926, 813, 570, 249, 36, 0, 0, 0, 6460},
+ },
+ {
+ // Coeff Band ( 2 )
+ {6781, 22539, 2784, 634, 182, 123, 20, 4, 0, 0, 0, 0},
+ {6263, 11544, 2649, 790, 259, 168, 27, 5, 0, 0, 0, 20539},
+ {3109, 4075, 2031, 896, 457, 386, 158, 29, 0, 0, 0, 1138},
+ },
+ {
+ // Coeff Band ( 3 )
+ {11515, 4079, 465, 73, 5, 14, 2, 0, 0, 0, 0, 0},
+ {9361, 5834, 650, 96, 24, 8, 4, 0, 0, 0, 0, 22181},
+ {4343, 3974, 1360, 415, 132, 96, 14, 1, 0, 0, 0, 1267},
+ },
+ {
+ // Coeff Band ( 4 )
+ {4787, 9297, 823, 168, 44, 12, 4, 0, 0, 0, 0, 0},
+ {3619, 4472, 719, 198, 60, 31, 3, 0, 0, 0, 0, 8401},
+ {1157, 1175, 483, 182, 88, 31, 8, 0, 0, 0, 0, 268},
+ },
+ {
+ // Coeff Band ( 5 )
+ {8299, 1226, 32, 5, 1, 0, 0, 0, 0, 0, 0, 0},
+ {3502, 1568, 57, 4, 1, 1, 0, 0, 0, 0, 0, 9811},
+ {1055, 1070, 166, 29, 6, 1, 0, 0, 0, 0, 0, 527},
+ },
+ {
+ // Coeff Band ( 6 )
+ {27414, 27927, 1989, 347, 69, 26, 0, 0, 0, 0, 0, 0},
+ {5876, 10074, 1574, 341, 91, 24, 4, 0, 0, 0, 0, 21954},
+ {1571, 2171, 778, 324, 124, 65, 16, 0, 0, 0, 0, 979},
+ },
+ {
+ // Coeff Band ( 7 )
+ { 0, 29, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
+ { 0, 23, 0, 0, 0, 0, 0, 0, 0, 0, 0, 459},
+ { 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 13},
+ },
+ },
+};
+
+// Update probabilities for the nodes in the token entropy tree.
+const vp9_prob tree_update_probs[vp9_coef_tree_dimen] = {
+ {
+ {
+ {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ },
+ {
+ {176, 246, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {223, 241, 252, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {249, 253, 253, 255, 255, 255, 255, 255, 255, 255, 255, },
+ },
+ {
+ {255, 244, 252, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {234, 254, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {253, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ },
+ {
+ {255, 246, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {239, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {254, 255, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
+ },
+ {
+ {255, 248, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {251, 255, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ },
+ {
+ {255, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {251, 254, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {254, 255, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
+ },
+ {
+ {255, 254, 253, 255, 254, 255, 255, 255, 255, 255, 255, },
+ {250, 255, 254, 255, 254, 255, 255, 255, 255, 255, 255, },
+ {254, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ },
+ {
+ {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ },
+ },
+ {
+ {
+ {217, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {225, 252, 241, 253, 255, 255, 254, 255, 255, 255, 255, },
+ {234, 250, 241, 250, 253, 255, 253, 254, 255, 255, 255, },
+ },
+ {
+ {255, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {223, 254, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {238, 253, 254, 254, 255, 255, 255, 255, 255, 255, 255, },
+ },
+ {
+ {255, 248, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {249, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ },
+ {
+ {255, 253, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {247, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ },
+ {
+ {255, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {252, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ },
+ {
+ {255, 254, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {253, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ },
+ {
+ {255, 254, 253, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {250, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {254, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ },
+ {
+ {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ },
+ },
+ {
+ {
+ {186, 251, 250, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {234, 251, 244, 254, 255, 255, 255, 255, 255, 255, 255, },
+ {251, 251, 243, 253, 254, 255, 254, 255, 255, 255, 255, },
+ },
+ {
+ {255, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {236, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {251, 253, 253, 254, 254, 255, 255, 255, 255, 255, 255, },
+ },
+ {
+ {255, 254, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {254, 254, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ },
+ {
+ {255, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {254, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {254, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ },
+ {
+ {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {254, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ },
+ {
+ {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ },
+ {
+ {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ },
+ {
+ {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ },
+ },
+ {
+ {
+ {248, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {250, 254, 252, 254, 255, 255, 255, 255, 255, 255, 255, },
+ {248, 254, 249, 253, 255, 255, 255, 255, 255, 255, 255, },
+ },
+ {
+ {255, 253, 253, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {246, 253, 253, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {252, 254, 251, 254, 254, 255, 255, 255, 255, 255, 255, },
+ },
+ {
+ {255, 254, 252, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {248, 254, 253, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {253, 255, 254, 254, 255, 255, 255, 255, 255, 255, 255, },
+ },
+ {
+ {255, 251, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {245, 251, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {253, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
+ },
+ {
+ {255, 251, 253, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {252, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {255, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ },
+ {
+ {255, 252, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {249, 255, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {255, 255, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
+ },
+ {
+ {255, 255, 253, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {250, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ },
+ {
+ {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {254, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ },
+ },
+};
+#endif
--- /dev/null
+++ b/vp9/common/debugmodes.c
@@ -1,0 +1,146 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <stdio.h>
+#include "blockd.h"
+
+void vp9_print_modes_and_motion_vectors(MODE_INFO *mi, int rows, int cols,
+ int frame) {
+ int mb_row;
+ int mb_col;
+ int mb_index = 0;
+ FILE *mvs = fopen("mvs.stt", "a");
+
+ /* print out the macroblock Y modes */
+ mb_index = 0;
+ fprintf(mvs, "Mb Modes for Frame %d\n", frame);
+
+ for (mb_row = 0; mb_row < rows; mb_row++) {
+ for (mb_col = 0; mb_col < cols; mb_col++) {
+
+ fprintf(mvs, "%2d ", mi[mb_index].mbmi.mode);
+
+ mb_index++;
+ }
+
+ fprintf(mvs, "\n");
+ mb_index++;
+ }
+
+ fprintf(mvs, "\n");
+
+ mb_index = 0;
+ fprintf(mvs, "Mb mv ref for Frame %d\n", frame);
+
+ for (mb_row = 0; mb_row < rows; mb_row++) {
+ for (mb_col = 0; mb_col < cols; mb_col++) {
+
+ fprintf(mvs, "%2d ", mi[mb_index].mbmi.ref_frame);
+
+ mb_index++;
+ }
+
+ fprintf(mvs, "\n");
+ mb_index++;
+ }
+
+ fprintf(mvs, "\n");
+
+ /* print out the macroblock UV modes */
+ mb_index = 0;
+ fprintf(mvs, "UV Modes for Frame %d\n", frame);
+
+ for (mb_row = 0; mb_row < rows; mb_row++) {
+ for (mb_col = 0; mb_col < cols; mb_col++) {
+
+ fprintf(mvs, "%2d ", mi[mb_index].mbmi.uv_mode);
+
+ mb_index++;
+ }
+
+ mb_index++;
+ fprintf(mvs, "\n");
+ }
+
+ fprintf(mvs, "\n");
+
+ /* print out the block modes */
+ mb_index = 0;
+ fprintf(mvs, "Mbs for Frame %d\n", frame);
+ {
+ int b_row;
+
+ for (b_row = 0; b_row < 4 * rows; b_row++) {
+ int b_col;
+ int bindex;
+
+ for (b_col = 0; b_col < 4 * cols; b_col++) {
+ mb_index = (b_row >> 2) * (cols + 1) + (b_col >> 2);
+ bindex = (b_row & 3) * 4 + (b_col & 3);
+
+ if (mi[mb_index].mbmi.mode == B_PRED) {
+ fprintf(mvs, "%2d ", mi[mb_index].bmi[bindex].as_mode.first);
+#if CONFIG_COMP_INTRA_PRED
+ fprintf(mvs, "%2d ", mi[mb_index].bmi[bindex].as_mode.second);
+#endif
+ } else
+ fprintf(mvs, "xx ");
+
+ }
+
+ fprintf(mvs, "\n");
+ }
+ }
+ fprintf(mvs, "\n");
+
+ /* print out the macroblock mvs */
+ mb_index = 0;
+ fprintf(mvs, "MVs for Frame %d\n", frame);
+
+ for (mb_row = 0; mb_row < rows; mb_row++) {
+ for (mb_col = 0; mb_col < cols; mb_col++) {
+ fprintf(mvs, "%5d:%-5d", mi[mb_index].mbmi.mv[0].as_mv.row / 2,
+ mi[mb_index].mbmi.mv[0].as_mv.col / 2);
+
+ mb_index++;
+ }
+
+ mb_index++;
+ fprintf(mvs, "\n");
+ }
+
+ fprintf(mvs, "\n");
+
+ /* print out the block motion vectors */
+ mb_index = 0;
+ fprintf(mvs, "MVs for Frame %d\n", frame);
+ {
+ int b_row;
+
+ for (b_row = 0; b_row < 4 * rows; b_row++) {
+ int b_col;
+ int bindex;
+
+ for (b_col = 0; b_col < 4 * cols; b_col++) {
+ mb_index = (b_row >> 2) * (cols + 1) + (b_col >> 2);
+ bindex = (b_row & 3) * 4 + (b_col & 3);
+ fprintf(mvs, "%3d:%-3d ",
+ mi[mb_index].bmi[bindex].as_mv.first.as_mv.row,
+ mi[mb_index].bmi[bindex].as_mv.first.as_mv.col);
+
+ }
+
+ fprintf(mvs, "\n");
+ }
+ }
+ fprintf(mvs, "\n");
+
+ fclose(mvs);
+}
--- /dev/null
+++ b/vp9/common/default_coef_probs.h
@@ -1,0 +1,1377 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+*/
+
+
+/* Generated file, included by entropy.c */
+
+
+static const vp9_prob default_coef_probs [BLOCK_TYPES]
+ [COEF_BANDS]
+ [PREV_COEF_CONTEXTS]
+ [ENTROPY_NODES] = {
+ {
+ /* Block Type ( 0 ) */
+ {
+ /* Coeff Band ( 0 )*/
+ { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+ { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+ { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+ { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+ },
+ {
+ /* Coeff Band ( 1 )*/
+ { 253, 136, 254, 255, 228, 219, 128, 128, 128, 128, 128 },
+ { 189, 129, 242, 255, 227, 213, 255, 219, 128, 128, 128 },
+ { 106, 126, 227, 252, 214, 209, 255, 255, 128, 128, 128 },
+ { 90, 116, 227, 252, 214, 209, 255, 255, 128, 128, 128 },
+ },
+ {
+ /* Coeff Band ( 2 )*/
+ { 1, 98, 248, 255, 236, 226, 255, 255, 128, 128, 128 },
+ { 181, 133, 238, 254, 221, 234, 255, 154, 128, 128, 128 },
+ { 78, 134, 202, 247, 198, 180, 255, 219, 128, 128, 128 },
+ { 64, 128, 202, 247, 198, 180, 255, 219, 128, 128, 128 },
+ },
+ {
+ /* Coeff Band ( 3 )*/
+ { 1, 185, 249, 255, 243, 255, 128, 128, 128, 128, 128 },
+ { 184, 150, 247, 255, 236, 224, 128, 128, 128, 128, 128 },
+ { 77, 110, 216, 255, 236, 230, 128, 128, 128, 128, 128 },
+ { 64, 100, 216, 255, 236, 230, 128, 128, 128, 128, 128 },
+ },
+ {
+ /* Coeff Band ( 4 )*/
+ { 1, 101, 251, 255, 241, 255, 128, 128, 128, 128, 128 },
+ { 170, 139, 241, 252, 236, 209, 255, 255, 128, 128, 128 },
+ { 37, 116, 196, 243, 228, 255, 255, 255, 128, 128, 128 },
+ { 28, 110, 196, 243, 228, 255, 255, 255, 128, 128, 128 },
+ },
+ {
+ /* Coeff Band ( 5 )*/
+ { 1, 204, 254, 255, 245, 255, 128, 128, 128, 128, 128 },
+ { 207, 160, 250, 255, 238, 128, 128, 128, 128, 128, 128 },
+ { 102, 103, 231, 255, 211, 171, 128, 128, 128, 128, 128 },
+ { 90, 90, 231, 255, 211, 171, 128, 128, 128, 128, 128 },
+ },
+ {
+ /* Coeff Band ( 6 )*/
+ { 1, 152, 252, 255, 240, 255, 128, 128, 128, 128, 128 },
+ { 177, 135, 243, 255, 234, 225, 128, 128, 128, 128, 128 },
+ { 80, 129, 211, 255, 194, 224, 128, 128, 128, 128, 128 },
+ { 64, 120, 211, 255, 194, 224, 128, 128, 128, 128, 128 },
+ },
+ {
+ /* Coeff Band ( 7 )*/
+ { 1, 1, 255, 128, 128, 128, 128, 128, 128, 128, 128 },
+ { 246, 1, 255, 128, 128, 128, 128, 128, 128, 128, 128 },
+ { 255, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+ { 255, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+ }
+ },
+ {
+ /* Block Type ( 1 ) */
+ {
+ /* Coeff Band ( 0 )*/
+ { 198, 35, 237, 223, 193, 187, 162, 160, 145, 155, 62 },
+ { 131, 45, 198, 221, 172, 176, 220, 157, 252, 221, 1 },
+ { 68, 47, 146, 208, 149, 167, 221, 162, 255, 223, 128 },
+ { 48, 32, 146, 208, 149, 167, 221, 162, 255, 223, 128 },
+ },
+ {
+ /* Coeff Band ( 1 )*/
+ { 1, 149, 241, 255, 221, 224, 255, 255, 128, 128, 128 },
+ { 184, 141, 234, 253, 222, 220, 255, 199, 128, 128, 128 },
+ { 81, 99, 181, 242, 176, 190, 249, 202, 255, 255, 128 },
+ { 66, 90, 181, 242, 176, 190, 249, 202, 255, 255, 128 },
+ },
+ {
+ /* Coeff Band ( 2 )*/
+ { 1, 129, 232, 253, 214, 197, 242, 196, 255, 255, 128 },
+ { 99, 121, 210, 250, 201, 198, 255, 202, 128, 128, 128 },
+ { 23, 91, 163, 242, 170, 187, 247, 210, 255, 255, 128 },
+ { 18, 80, 163, 242, 170, 187, 247, 210, 255, 255, 128 },
+ },
+ {
+ /* Coeff Band ( 3 )*/
+ { 1, 200, 246, 255, 234, 255, 128, 128, 128, 128, 128 },
+ { 109, 178, 241, 255, 231, 245, 255, 255, 128, 128, 128 },
+ { 44, 130, 201, 253, 205, 192, 255, 255, 128, 128, 128 },
+ { 36, 120, 201, 253, 205, 192, 255, 255, 128, 128, 128 },
+ },
+ {
+ /* Coeff Band ( 4 )*/
+ { 1, 132, 239, 251, 219, 209, 255, 165, 128, 128, 128 },
+ { 94, 136, 225, 251, 218, 190, 255, 255, 128, 128, 128 },
+ { 22, 100, 174, 245, 186, 161, 255, 199, 128, 128, 128 },
+ { 18, 90, 174, 245, 186, 161, 255, 199, 128, 128, 128 },
+ },
+ {
+ /* Coeff Band ( 5 )*/
+ { 1, 182, 249, 255, 232, 235, 128, 128, 128, 128, 128 },
+ { 124, 143, 241, 255, 227, 234, 128, 128, 128, 128, 128 },
+ { 35, 77, 181, 251, 193, 211, 255, 205, 128, 128, 128 },
+ { 28, 70, 181, 251, 193, 211, 255, 205, 128, 128, 128 },
+ },
+ {
+ /* Coeff Band ( 6 )*/
+ { 1, 157, 247, 255, 236, 231, 255, 255, 128, 128, 128 },
+ { 121, 141, 235, 255, 225, 227, 255, 255, 128, 128, 128 },
+ { 45, 99, 188, 251, 195, 217, 255, 224, 128, 128, 128 },
+ { 40, 90, 188, 251, 195, 217, 255, 224, 128, 128, 128 },
+ },
+ {
+ /* Coeff Band ( 7 )*/
+ { 1, 1, 251, 255, 213, 255, 128, 128, 128, 128, 128 },
+ { 203, 1, 248, 255, 255, 128, 128, 128, 128, 128, 128 },
+ { 137, 1, 177, 255, 224, 255, 128, 128, 128, 128, 128 },
+ { 137, 1, 177, 255, 224, 255, 128, 128, 128, 128, 128 },
+ }
+ },
+ {
+ /* Block Type ( 2 ) */
+ {
+ /* Coeff Band ( 0 )*/
+ { 253, 9, 248, 251, 207, 208, 255, 192, 128, 128, 128 },
+ { 175, 13, 224, 243, 193, 185, 249, 198, 255, 255, 128 },
+ { 73, 17, 171, 221, 161, 179, 236, 167, 255, 234, 128 },
+ { 64, 17, 171, 221, 161, 179, 236, 167, 255, 234, 128 },
+ },
+ {
+ /* Coeff Band ( 1 )*/
+ { 1, 95, 247, 253, 212, 183, 255, 255, 128, 128, 128 },
+ { 239, 90, 244, 250, 211, 209, 255, 255, 128, 128, 128 },
+ { 155, 77, 195, 248, 188, 195, 255, 255, 128, 128, 128 },
+ { 140, 70, 195, 248, 188, 195, 255, 255, 128, 128, 128 },
+ },
+ {
+ /* Coeff Band ( 2 )*/
+ { 1, 24, 239, 251, 218, 219, 255, 205, 128, 128, 128 },
+ { 201, 51, 219, 255, 196, 186, 128, 128, 128, 128, 128 },
+ { 69, 46, 190, 239, 201, 218, 255, 228, 128, 128, 128 },
+ { 60, 40, 190, 239, 201, 218, 255, 228, 128, 128, 128 },
+ },
+ {
+ /* Coeff Band ( 3 )*/
+ { 1, 191, 251, 255, 255, 128, 128, 128, 128, 128, 128 },
+ { 223, 165, 249, 255, 213, 255, 128, 128, 128, 128, 128 },
+ { 141, 124, 248, 255, 255, 128, 128, 128, 128, 128, 128 },
+ { 132, 118, 248, 255, 255, 128, 128, 128, 128, 128, 128 },
+ },
+ {
+ /* Coeff Band ( 4 )*/
+ { 1, 16, 248, 255, 255, 128, 128, 128, 128, 128, 128 },
+ { 190, 36, 230, 255, 236, 255, 128, 128, 128, 128, 128 },
+ { 149, 1, 255, 128, 128, 128, 128, 128, 128, 128, 128 },
+ { 149, 1, 255, 128, 128, 128, 128, 128, 128, 128, 128 },
+ },
+ {
+ /* Coeff Band ( 5 )*/
+ { 1, 226, 255, 128, 128, 128, 128, 128, 128, 128, 128 },
+ { 247, 192, 255, 128, 128, 128, 128, 128, 128, 128, 128 },
+ { 240, 128, 255, 128, 128, 128, 128, 128, 128, 128, 128 },
+ { 240, 128, 255, 128, 128, 128, 128, 128, 128, 128, 128 },
+ },
+ {
+ /* Coeff Band ( 6 )*/
+ { 1, 134, 252, 255, 255, 128, 128, 128, 128, 128, 128 },
+ { 213, 62, 250, 255, 255, 128, 128, 128, 128, 128, 128 },
+ { 55, 93, 255, 128, 128, 128, 128, 128, 128, 128, 128 },
+ { 48, 85, 255, 128, 128, 128, 128, 128, 128, 128, 128 },
+ },
+ {
+ /* Coeff Band ( 7 )*/
+ { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+ { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+ { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+ { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+ }
+ },
+ {
+ /* Block Type ( 3 ) */
+ {
+ /* Coeff Band ( 0 )*/
+ { 202, 24, 213, 235, 186, 191, 220, 160, 240, 175, 255 },
+ { 126, 38, 182, 232, 169, 184, 228, 174, 255, 187, 128 },
+ { 63, 48, 138, 219, 151, 178, 240, 170, 255, 216, 128 },
+ { 54, 40, 138, 219, 151, 178, 240, 170, 255, 216, 128 },
+ },
+ {
+ /* Coeff Band ( 1 )*/
+ { 1, 112, 230, 250, 199, 191, 247, 159, 255, 255, 128 },
+ { 166, 109, 228, 252, 211, 215, 255, 174, 128, 128, 128 },
+ { 44, 84, 162, 232, 172, 180, 245, 178, 255, 255, 128 },
+ { 32, 70, 162, 232, 172, 180, 245, 178, 255, 255, 128 },
+ },
+ {
+ /* Coeff Band ( 2 )*/
+ { 1, 52, 220, 246, 198, 199, 249, 220, 255, 255, 128 },
+ { 124, 74, 191, 243, 183, 193, 250, 221, 255, 255, 128 },
+ { 24, 71, 130, 219, 154, 170, 243, 182, 255, 255, 128 },
+ { 24, 71, 130, 219, 154, 170, 243, 182, 255, 255, 128 },
+ },
+ {
+ /* Coeff Band ( 3 )*/
+ { 1, 182, 225, 249, 219, 240, 255, 224, 128, 128, 128 },
+ { 149, 150, 226, 252, 216, 205, 255, 171, 128, 128, 128 },
+ { 28, 108, 170, 242, 183, 194, 254, 223, 255, 255, 128 },
+ { 26, 104, 170, 242, 183, 194, 254, 223, 255, 255, 128 },
+ },
+ {
+ /* Coeff Band ( 4 )*/
+ { 1, 81, 230, 252, 204, 203, 255, 192, 128, 128, 128 },
+ { 123, 102, 209, 247, 188, 196, 255, 233, 128, 128, 128 },
+ { 20, 95, 153, 243, 164, 173, 255, 203, 128, 128, 128 },
+ { 20, 95, 153, 243, 164, 173, 255, 203, 128, 128, 128 },
+ },
+ {
+ /* Coeff Band ( 5 )*/
+ { 1, 222, 248, 255, 216, 213, 128, 128, 128, 128, 128 },
+ { 168, 175, 246, 252, 235, 205, 255, 255, 128, 128, 128 },
+ { 47, 116, 215, 255, 211, 212, 255, 255, 128, 128, 128 },
+ { 47, 116, 215, 255, 211, 212, 255, 255, 128, 128, 128 },
+ },
+ {
+ /* Coeff Band ( 6 )*/
+ { 1, 121, 236, 253, 212, 214, 255, 255, 128, 128, 128 },
+ { 141, 84, 213, 252, 201, 202, 255, 219, 128, 128, 128 },
+ { 42, 80, 160, 240, 162, 185, 255, 205, 128, 128, 128 },
+ { 42, 80, 160, 240, 162, 185, 255, 205, 128, 128, 128 },
+ },
+ {
+ /* Coeff Band ( 7 )*/
+ { 1, 1, 255, 128, 128, 128, 128, 128, 128, 128, 128 },
+ { 244, 1, 255, 128, 128, 128, 128, 128, 128, 128, 128 },
+ { 238, 1, 255, 128, 128, 128, 128, 128, 128, 128, 128 },
+ { 238, 1, 255, 128, 128, 128, 128, 128, 128, 128, 128 },
+ }
+ }
+};
+
+static const vp9_prob default_hybrid_coef_probs [BLOCK_TYPES]
+ [COEF_BANDS]
+ [PREV_COEF_CONTEXTS]
+ [ENTROPY_NODES] = {
+ {
+ /* Block Type ( 0 ) */
+ {
+ /* Coeff Band ( 0 )*/
+ { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+ { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+ { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+ { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+ },
+ {
+ /* Coeff Band ( 1 )*/
+ { 253, 136, 254, 255, 228, 219, 128, 128, 128, 128, 128 },
+ { 189, 129, 242, 255, 227, 213, 255, 219, 128, 128, 128 },
+ { 106, 126, 227, 252, 214, 209, 255, 255, 128, 128, 128 },
+ { 90, 116, 227, 252, 214, 209, 255, 255, 128, 128, 128 },
+ },
+ {
+ /* Coeff Band ( 2 )*/
+ { 1, 98, 248, 255, 236, 226, 255, 255, 128, 128, 128 },
+ { 181, 133, 238, 254, 221, 234, 255, 154, 128, 128, 128 },
+ { 78, 134, 202, 247, 198, 180, 255, 219, 128, 128, 128 },
+ { 64, 128, 202, 247, 198, 180, 255, 219, 128, 128, 128 },
+ },
+ {
+ /* Coeff Band ( 3 )*/
+ { 1, 185, 249, 255, 243, 255, 128, 128, 128, 128, 128 },
+ { 184, 150, 247, 255, 236, 224, 128, 128, 128, 128, 128 },
+ { 77, 110, 216, 255, 236, 230, 128, 128, 128, 128, 128 },
+ { 64, 100, 216, 255, 236, 230, 128, 128, 128, 128, 128 },
+ },
+ {
+ /* Coeff Band ( 4 )*/
+ { 1, 101, 251, 255, 241, 255, 128, 128, 128, 128, 128 },
+ { 170, 139, 241, 252, 236, 209, 255, 255, 128, 128, 128 },
+ { 37, 116, 196, 243, 228, 255, 255, 255, 128, 128, 128 },
+ { 28, 110, 196, 243, 228, 255, 255, 255, 128, 128, 128 },
+ },
+ {
+ /* Coeff Band ( 5 )*/
+ { 1, 204, 254, 255, 245, 255, 128, 128, 128, 128, 128 },
+ { 207, 160, 250, 255, 238, 128, 128, 128, 128, 128, 128 },
+ { 102, 103, 231, 255, 211, 171, 128, 128, 128, 128, 128 },
+ { 90, 90, 231, 255, 211, 171, 128, 128, 128, 128, 128 },
+ },
+ {
+ /* Coeff Band ( 6 )*/
+ { 1, 152, 252, 255, 240, 255, 128, 128, 128, 128, 128 },
+ { 177, 135, 243, 255, 234, 225, 128, 128, 128, 128, 128 },
+ { 80, 129, 211, 255, 194, 224, 128, 128, 128, 128, 128 },
+ { 64, 120, 211, 255, 194, 224, 128, 128, 128, 128, 128 },
+ },
+ {
+ /* Coeff Band ( 7 )*/
+ { 1, 1, 255, 128, 128, 128, 128, 128, 128, 128, 128 },
+ { 246, 1, 255, 128, 128, 128, 128, 128, 128, 128, 128 },
+ { 255, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+ { 255, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+ }
+ },
+ {
+ /* Block Type ( 1 ) */
+ {
+ /* Coeff Band ( 0 )*/
+ { 198, 35, 237, 223, 193, 187, 162, 160, 145, 155, 62 },
+ { 131, 45, 198, 221, 172, 176, 220, 157, 252, 221, 1 },
+ { 68, 47, 146, 208, 149, 167, 221, 162, 255, 223, 128 },
+ { 48, 32, 146, 208, 149, 167, 221, 162, 255, 223, 128 },
+ },
+ {
+ /* Coeff Band ( 1 )*/
+ { 1, 149, 241, 255, 221, 224, 255, 255, 128, 128, 128 },
+ { 184, 141, 234, 253, 222, 220, 255, 199, 128, 128, 128 },
+ { 81, 99, 181, 242, 176, 190, 249, 202, 255, 255, 128 },
+ { 66, 90, 181, 242, 176, 190, 249, 202, 255, 255, 128 },
+ },
+ {
+ /* Coeff Band ( 2 )*/
+ { 1, 129, 232, 253, 214, 197, 242, 196, 255, 255, 128 },
+ { 99, 121, 210, 250, 201, 198, 255, 202, 128, 128, 128 },
+ { 23, 91, 163, 242, 170, 187, 247, 210, 255, 255, 128 },
+ { 18, 80, 163, 242, 170, 187, 247, 210, 255, 255, 128 },
+ },
+ {
+ /* Coeff Band ( 3 )*/
+ { 1, 200, 246, 255, 234, 255, 128, 128, 128, 128, 128 },
+ { 109, 178, 241, 255, 231, 245, 255, 255, 128, 128, 128 },
+ { 44, 130, 201, 253, 205, 192, 255, 255, 128, 128, 128 },
+ { 36, 120, 201, 253, 205, 192, 255, 255, 128, 128, 128 },
+ },
+ {
+ /* Coeff Band ( 4 )*/
+ { 1, 132, 239, 251, 219, 209, 255, 165, 128, 128, 128 },
+ { 94, 136, 225, 251, 218, 190, 255, 255, 128, 128, 128 },
+ { 22, 100, 174, 245, 186, 161, 255, 199, 128, 128, 128 },
+ { 18, 90, 174, 245, 186, 161, 255, 199, 128, 128, 128 },
+ },
+ {
+ /* Coeff Band ( 5 )*/
+ { 1, 182, 249, 255, 232, 235, 128, 128, 128, 128, 128 },
+ { 124, 143, 241, 255, 227, 234, 128, 128, 128, 128, 128 },
+ { 35, 77, 181, 251, 193, 211, 255, 205, 128, 128, 128 },
+ { 28, 70, 181, 251, 193, 211, 255, 205, 128, 128, 128 },
+ },
+ {
+ /* Coeff Band ( 6 )*/
+ { 1, 157, 247, 255, 236, 231, 255, 255, 128, 128, 128 },
+ { 121, 141, 235, 255, 225, 227, 255, 255, 128, 128, 128 },
+ { 45, 99, 188, 251, 195, 217, 255, 224, 128, 128, 128 },
+ { 40, 90, 188, 251, 195, 217, 255, 224, 128, 128, 128 },
+ },
+ {
+ /* Coeff Band ( 7 )*/
+ { 1, 1, 251, 255, 213, 255, 128, 128, 128, 128, 128 },
+ { 203, 1, 248, 255, 255, 128, 128, 128, 128, 128, 128 },
+ { 137, 1, 177, 255, 224, 255, 128, 128, 128, 128, 128 },
+ { 137, 1, 177, 255, 224, 255, 128, 128, 128, 128, 128 },
+ }
+ },
+ {
+ /* Block Type ( 2 ) */
+ {
+ /* Coeff Band ( 0 )*/
+ { 253, 9, 248, 251, 207, 208, 255, 192, 128, 128, 128 },
+ { 175, 13, 224, 243, 193, 185, 249, 198, 255, 255, 128 },
+ { 73, 17, 171, 221, 161, 179, 236, 167, 255, 234, 128 },
+ { 64, 17, 171, 221, 161, 179, 236, 167, 255, 234, 128 },
+ },
+ {
+ /* Coeff Band ( 1 )*/
+ { 1, 95, 247, 253, 212, 183, 255, 255, 128, 128, 128 },
+ { 239, 90, 244, 250, 211, 209, 255, 255, 128, 128, 128 },
+ { 155, 77, 195, 248, 188, 195, 255, 255, 128, 128, 128 },
+ { 140, 70, 195, 248, 188, 195, 255, 255, 128, 128, 128 },
+ },
+ {
+ /* Coeff Band ( 2 )*/
+ { 1, 24, 239, 251, 218, 219, 255, 205, 128, 128, 128 },
+ { 201, 51, 219, 255, 196, 186, 128, 128, 128, 128, 128 },
+ { 69, 46, 190, 239, 201, 218, 255, 228, 128, 128, 128 },
+ { 60, 40, 190, 239, 201, 218, 255, 228, 128, 128, 128 },
+ },
+ {
+ /* Coeff Band ( 3 )*/
+ { 1, 191, 251, 255, 255, 128, 128, 128, 128, 128, 128 },
+ { 223, 165, 249, 255, 213, 255, 128, 128, 128, 128, 128 },
+ { 141, 124, 248, 255, 255, 128, 128, 128, 128, 128, 128 },
+ { 132, 118, 248, 255, 255, 128, 128, 128, 128, 128, 128 },
+ },
+ {
+ /* Coeff Band ( 4 )*/
+ { 1, 16, 248, 255, 255, 128, 128, 128, 128, 128, 128 },
+ { 190, 36, 230, 255, 236, 255, 128, 128, 128, 128, 128 },
+ { 149, 1, 255, 128, 128, 128, 128, 128, 128, 128, 128 },
+ { 149, 1, 255, 128, 128, 128, 128, 128, 128, 128, 128 },
+ },
+ {
+ /* Coeff Band ( 5 )*/
+ { 1, 226, 255, 128, 128, 128, 128, 128, 128, 128, 128 },
+ { 247, 192, 255, 128, 128, 128, 128, 128, 128, 128, 128 },
+ { 240, 128, 255, 128, 128, 128, 128, 128, 128, 128, 128 },
+ { 240, 128, 255, 128, 128, 128, 128, 128, 128, 128, 128 },
+ },
+ {
+ /* Coeff Band ( 6 )*/
+ { 1, 134, 252, 255, 255, 128, 128, 128, 128, 128, 128 },
+ { 213, 62, 250, 255, 255, 128, 128, 128, 128, 128, 128 },
+ { 55, 93, 255, 128, 128, 128, 128, 128, 128, 128, 128 },
+ { 48, 85, 255, 128, 128, 128, 128, 128, 128, 128, 128 },
+ },
+ {
+ /* Coeff Band ( 7 )*/
+ { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+ { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+ { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+ { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+ }
+ },
+ {
+ /* Block Type ( 3 ) */
+ {
+ /* Coeff Band ( 0 )*/
+ { 202, 24, 213, 235, 186, 191, 220, 160, 240, 175, 255 },
+ { 126, 38, 182, 232, 169, 184, 228, 174, 255, 187, 128 },
+ { 63, 48, 138, 219, 151, 178, 240, 170, 255, 216, 128 },
+ { 54, 40, 138, 219, 151, 178, 240, 170, 255, 216, 128 },
+ },
+ {
+ /* Coeff Band ( 1 )*/
+ { 1, 112, 230, 250, 199, 191, 247, 159, 255, 255, 128 },
+ { 166, 109, 228, 252, 211, 215, 255, 174, 128, 128, 128 },
+ { 44, 84, 162, 232, 172, 180, 245, 178, 255, 255, 128 },
+ { 32, 70, 162, 232, 172, 180, 245, 178, 255, 255, 128 },
+ },
+ {
+ /* Coeff Band ( 2 )*/
+ { 1, 52, 220, 246, 198, 199, 249, 220, 255, 255, 128 },
+ { 124, 74, 191, 243, 183, 193, 250, 221, 255, 255, 128 },
+ { 24, 71, 130, 219, 154, 170, 243, 182, 255, 255, 128 },
+ { 24, 71, 130, 219, 154, 170, 243, 182, 255, 255, 128 },
+ },
+ {
+ /* Coeff Band ( 3 )*/
+ { 1, 182, 225, 249, 219, 240, 255, 224, 128, 128, 128 },
+ { 149, 150, 226, 252, 216, 205, 255, 171, 128, 128, 128 },
+ { 28, 108, 170, 242, 183, 194, 254, 223, 255, 255, 128 },
+ { 26, 104, 170, 242, 183, 194, 254, 223, 255, 255, 128 },
+ },
+ {
+ /* Coeff Band ( 4 )*/
+ { 1, 81, 230, 252, 204, 203, 255, 192, 128, 128, 128 },
+ { 123, 102, 209, 247, 188, 196, 255, 233, 128, 128, 128 },
+ { 20, 95, 153, 243, 164, 173, 255, 203, 128, 128, 128 },
+ { 20, 95, 153, 243, 164, 173, 255, 203, 128, 128, 128 },
+ },
+ {
+ /* Coeff Band ( 5 )*/
+ { 1, 222, 248, 255, 216, 213, 128, 128, 128, 128, 128 },
+ { 168, 175, 246, 252, 235, 205, 255, 255, 128, 128, 128 },
+ { 47, 116, 215, 255, 211, 212, 255, 255, 128, 128, 128 },
+ { 47, 116, 215, 255, 211, 212, 255, 255, 128, 128, 128 },
+ },
+ {
+ /* Coeff Band ( 6 )*/
+ { 1, 121, 236, 253, 212, 214, 255, 255, 128, 128, 128 },
+ { 141, 84, 213, 252, 201, 202, 255, 219, 128, 128, 128 },
+ { 42, 80, 160, 240, 162, 185, 255, 205, 128, 128, 128 },
+ { 42, 80, 160, 240, 162, 185, 255, 205, 128, 128, 128 },
+ },
+ {
+ /* Coeff Band ( 7 )*/
+ { 1, 1, 255, 128, 128, 128, 128, 128, 128, 128, 128 },
+ { 244, 1, 255, 128, 128, 128, 128, 128, 128, 128, 128 },
+ { 238, 1, 255, 128, 128, 128, 128, 128, 128, 128, 128 },
+ { 238, 1, 255, 128, 128, 128, 128, 128, 128, 128, 128 },
+ }
+ }
+};
+
+static const vp9_prob
+default_coef_probs_8x8[BLOCK_TYPES_8X8]
+[COEF_BANDS]
+[PREV_COEF_CONTEXTS]
+[ENTROPY_NODES] = {
+ {
+ /* block Type 0 */
+ {
+ /* Coeff Band 0 */
+ { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128},
+ { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128},
+ { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128},
+ { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128}
+ },
+ {
+ /* Coeff Band 1 */
+ { 60, 140, 195, 255, 212, 214, 128, 128, 128, 128, 128},
+ { 75, 221, 231, 255, 203, 255, 128, 128, 128, 128, 128},
+ { 9, 212, 196, 251, 197, 207, 255, 185, 128, 128, 128},
+ { 9, 212, 196, 251, 197, 207, 255, 185, 128, 128, 128}
+ },
+ {
+ /* Coeff Band 2 */
+ { 1, 227, 226, 255, 215, 215, 128, 128, 128, 128, 128},
+ { 5, 163, 209, 255, 212, 212, 255, 255, 128, 128, 128},
+ { 1, 133, 203, 255, 210, 220, 255, 255, 128, 128, 128},
+ { 1, 133, 203, 255, 210, 220, 255, 255, 128, 128, 128}
+ },
+ {
+ /* Coeff Band 3 */
+ { 1, 226, 225, 255, 228, 236, 128, 128, 128, 128, 128},
+ { 6, 163, 208, 255, 224, 234, 255, 255, 128, 128, 128},
+ { 1, 122, 196, 253, 212, 248, 255, 255, 128, 128, 128},
+ { 1, 122, 196, 253, 212, 248, 255, 255, 128, 128, 128}
+ },
+ {
+ /* Coeff Band 4 */
+ { 1, 222, 197, 254, 193, 216, 255, 236, 128, 128, 128},
+ { 7, 140, 163, 251, 195, 211, 255, 238, 128, 128, 128},
+ { 1, 91, 152, 249, 181, 197, 255, 239, 128, 128, 128},
+ { 1, 91, 152, 249, 181, 197, 255, 239, 128, 128, 128}
+ },
+ {
+ /* Coeff Band 5 */
+ { 1, 226, 218, 255, 216, 241, 255, 255, 128, 128, 128},
+ { 6, 154, 191, 255, 218, 240, 255, 255, 128, 128, 128},
+ { 1, 110, 171, 252, 191, 204, 255, 236, 128, 128, 128},
+ { 1, 110, 171, 252, 191, 204, 255, 236, 128, 128, 128}
+ },
+ {
+ /* Coeff Band 6 */
+ { 1, 221, 217, 255, 208, 217, 255, 232, 128, 128, 128},
+ { 11, 155, 189, 254, 203, 211, 255, 249, 128, 128, 128},
+ { 1, 110, 171, 252, 191, 204, 255, 236, 128, 128, 128},
+ { 1, 110, 171, 252, 191, 204, 255, 236, 128, 128, 128}
+ },
+ {
+ /* Coeff Band 7 */
+ { 1, 207, 235, 255, 232, 240, 128, 128, 128, 128, 128},
+ { 58, 161, 216, 255, 229, 235, 255, 255, 128, 128, 128},
+ { 8, 133, 204, 255, 219, 231, 255, 255, 128, 128, 128},
+ { 8, 133, 204, 255, 219, 231, 255, 255, 128, 128, 128}
+ }
+ },
+ {
+ /* block Type 1 */
+ {
+ /* Coeff Band 0 */
+ { 134, 152, 233, 224, 234, 52, 255, 166, 128, 128, 128},
+ { 97, 132, 185, 234, 186, 189, 197, 171, 255, 212, 128},
+ { 84, 110, 185, 237, 182, 182, 145, 145, 255, 255, 128}
+ },
+ {
+ /* Coeff Band 1 */
+ { 1, 124, 213, 247, 192, 212, 255, 255, 128, 128, 128},
+ { 88, 111, 178, 254, 189, 211, 255, 255, 128, 128, 128},
+ { 12, 59, 129, 236, 150, 179, 239, 195, 255, 255, 128},
+ { 12, 59, 129, 236, 150, 179, 239, 195, 255, 255, 128}
+ },
+ {
+ /* Coeff Band 2 */
+ { 1, 102, 225, 255, 210, 240, 128, 128, 128, 128, 128},
+ { 110, 78, 195, 254, 200, 191, 255, 255, 128, 128, 128},
+ { 37, 63, 177, 255, 194, 195, 128, 128, 128, 128, 128},
+ { 37, 63, 177, 255, 194, 195, 128, 128, 128, 128, 128}
+ },
+ {
+ /* Coeff Band 3 */
+ { 1, 1, 229, 255, 202, 224, 128, 128, 128, 128, 128},
+ { 150, 1, 192, 255, 206, 226, 128, 128, 128, 128, 128},
+ { 75, 1, 138, 255, 172, 228, 128, 128, 128, 128, 128},
+ { 75, 1, 138, 255, 172, 228, 128, 128, 128, 128, 128}
+ },
+ {
+ /* Coeff Band 4 */
+ { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128},
+ { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128},
+ { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128},
+ { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128}
+ },
+ {
+ /* Coeff Band 5 */
+ { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128},
+ { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128},
+ { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128},
+ { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128}
+ },
+ {
+ /* Coeff Band 6 */
+ { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128},
+ { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128},
+ { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128},
+ { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128}
+ },
+ {
+ /* Coeff Band 7 */
+ { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128},
+ { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128},
+ { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128},
+ { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128}
+ }
+ },
+ {
+ /* block Type 2 */
+ {
+ /* Coeff Band 0 */
+ { 11, 181, 226, 199, 183, 255, 255, 255, 128, 128, 128},
+ { 2, 147, 185, 248, 163, 180, 255, 236, 128, 128, 128},
+ { 1, 123, 157, 238, 154, 176, 255, 226, 255, 255, 128},
+ { 1, 123, 157, 238, 154, 176, 255, 226, 255, 255, 128}
+ },
+ {
+ /* Coeff Band 1 */
+ { 1, 150, 191, 246, 174, 188, 255, 235, 128, 128, 128},
+ { 1, 125, 166, 245, 165, 185, 255, 234, 128, 128, 128},
+ { 1, 79, 125, 240, 148, 179, 255, 234, 255, 255, 128},
+ { 1, 79, 125, 240, 148, 179, 255, 234, 255, 255, 128}
+ },
+ {
+ /* Coeff Band 2 */
+ { 1, 146, 184, 242, 167, 183, 255, 230, 255, 255, 128},
+ { 1, 119, 160, 239, 156, 178, 255, 231, 255, 255, 128},
+ { 1, 75, 115, 234, 142, 173, 255, 225, 255, 255, 128},
+ { 1, 75, 115, 234, 142, 173, 255, 225, 255, 255, 128}
+ },
+ {
+ /* Coeff Band 3 */
+ { 1, 150, 188, 244, 169, 183, 255, 233, 255, 255, 128},
+ { 1, 123, 162, 243, 161, 180, 255, 233, 128, 128, 128},
+ { 1, 76, 120, 238, 148, 178, 255, 230, 255, 255, 128},
+ { 1, 76, 120, 238, 148, 178, 255, 230, 255, 255, 128}
+ },
+ {
+ /* Coeff Band 4 */
+ { 1, 163, 202, 252, 188, 204, 255, 248, 128, 128, 128},
+ { 1, 136, 180, 251, 181, 201, 255, 246, 128, 128, 128},
+ { 1, 92, 146, 249, 170, 197, 255, 245, 128, 128, 128},
+ { 1, 92, 146, 249, 170, 197, 255, 245, 128, 128, 128}
+ },
+ {
+ /* Coeff Band 5 */
+ { 1, 156, 195, 249, 179, 193, 255, 241, 255, 255, 128},
+ { 1, 128, 169, 248, 171, 192, 255, 242, 255, 255, 128},
+ { 1, 84, 132, 245, 158, 187, 255, 240, 255, 255, 128},
+ { 1, 84, 132, 245, 158, 187, 255, 240, 255, 255, 128}
+ },
+ {
+ /* Coeff Band 6 */
+ { 1, 36, 71, 251, 192, 201, 255, 243, 255, 255, 128},
+ { 1, 49, 185, 250, 184, 199, 255, 242, 128, 128, 128},
+ { 1, 95, 147, 247, 168, 190, 255, 239, 255, 255, 128},
+ { 1, 95, 147, 247, 168, 190, 255, 239, 255, 255, 128}
+ },
+ {
+ /* Coeff Band 7 */
+ { 1, 19, 98, 255, 218, 222, 255, 255, 128, 128, 128},
+ { 36, 50, 210, 255, 212, 221, 255, 255, 128, 128, 128},
+ { 6, 117, 180, 254, 199, 216, 255, 251, 128, 128, 128},
+ { 6, 117, 180, 254, 199, 216, 255, 251, 128, 128, 128}
+ }
+ },
+ { /* block Type 3 */
+ { /* Coeff Band 0 */
+ { 192, 18, 155, 172, 145, 164, 192, 135, 246, 223, 255},
+ { 94, 29, 97, 131, 131, 153, 171, 121, 250, 190, 255},
+ { 25, 29, 63, 128, 119, 147, 168, 124, 251, 183, 255},
+ { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128}
+ },
+ { /* Coeff Band 1 */
+ { 1, 108, 192, 220, 186, 173, 255, 194, 255, 255, 128},
+ { 123, 104, 188, 221, 165, 171, 247, 180, 255, 255, 128},
+ { 23, 76, 152, 216, 154, 166, 226, 182, 255, 209, 128},
+ { 1, 26, 52, 162, 109, 152, 208, 144, 255, 231, 128}
+ },
+ { /* Coeff Band 2 */
+ { 1, 57, 179, 220, 156, 175, 210, 158, 255, 223, 128},
+ { 48, 57, 134, 212, 151, 170, 219, 185, 255, 248, 128},
+ { 4, 35, 63, 189, 120, 156, 221, 159, 255, 241, 128},
+ { 1, 17, 23, 110, 97, 143, 187, 120, 255, 234, 128}
+ },
+ { /* Coeff Band 3 */
+ { 1, 115, 205, 243, 182, 187, 254, 218, 255, 255, 128},
+ { 80, 101, 186, 241, 183, 186, 249, 182, 255, 255, 128},
+ { 10, 81, 144, 229, 164, 175, 241, 185, 255, 255, 128},
+ { 1, 44, 81, 192, 130, 148, 240, 180, 255, 255, 128}
+ },
+ { /* Coeff Band 4 */
+ { 1, 161, 207, 249, 187, 176, 255, 180, 128, 128, 128},
+ { 79, 148, 196, 240, 186, 182, 253, 171, 255, 255, 128},
+ { 14, 111, 171, 233, 170, 178, 235, 204, 255, 255, 128},
+ { 1, 63, 103, 202, 143, 162, 240, 178, 255, 255, 128}
+ },
+ { /* Coeff Band 5 */
+ { 1, 101, 202, 239, 185, 184, 252, 186, 255, 255, 128},
+ { 43, 67, 166, 237, 178, 190, 246, 194, 255, 255, 128},
+ { 4, 49, 85, 220, 140, 168, 253, 182, 255, 255, 128},
+ { 1, 24, 35, 144, 93, 135, 239, 159, 255, 253, 128}
+ },
+ { /* Coeff Band 6 */
+ { 1, 212, 243, 255, 240, 234, 255, 255, 128, 128, 128},
+ { 98, 168, 234, 255, 229, 234, 255, 255, 128, 128, 128},
+ { 19, 127, 199, 255, 212, 198, 255, 255, 128, 128, 128},
+ { 1, 103, 162, 253, 186, 151, 255, 255, 128, 128, 128}
+ },
+ { /* Coeff Band 7 */
+ { 1, 188, 253, 255, 255, 128, 128, 128, 128, 128, 128},
+ { 191, 68, 242, 255, 255, 128, 128, 128, 128, 128, 128},
+ { 8, 132, 255, 128, 128, 128, 128, 128, 128, 128, 128},
+ { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128}
+ }
+ }
+};
+
+static const vp9_prob
+default_hybrid_coef_probs_8x8[BLOCK_TYPES_8X8]
+ [COEF_BANDS]
+ [PREV_COEF_CONTEXTS]
+ [ENTROPY_NODES] = {
+ {
+ /* block Type 0 */
+ {
+ /* Coeff Band 0 */
+ { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128},
+ { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128},
+ { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128},
+ { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128}
+ },
+ {
+ /* Coeff Band 1 */
+ { 60, 140, 195, 255, 212, 214, 128, 128, 128, 128, 128},
+ { 75, 221, 231, 255, 203, 255, 128, 128, 128, 128, 128},
+ { 9, 212, 196, 251, 197, 207, 255, 185, 128, 128, 128},
+ { 9, 212, 196, 251, 197, 207, 255, 185, 128, 128, 128}
+ },
+ {
+ /* Coeff Band 2 */
+ { 1, 227, 226, 255, 215, 215, 128, 128, 128, 128, 128},
+ { 5, 163, 209, 255, 212, 212, 255, 255, 128, 128, 128},
+ { 1, 133, 203, 255, 210, 220, 255, 255, 128, 128, 128},
+ { 1, 133, 203, 255, 210, 220, 255, 255, 128, 128, 128}
+ },
+ {
+ /* Coeff Band 3 */
+ { 1, 226, 225, 255, 228, 236, 128, 128, 128, 128, 128},
+ { 6, 163, 208, 255, 224, 234, 255, 255, 128, 128, 128},
+ { 1, 122, 196, 253, 212, 248, 255, 255, 128, 128, 128},
+ { 1, 122, 196, 253, 212, 248, 255, 255, 128, 128, 128}
+ },
+ {
+ /* Coeff Band 4 */
+ { 1, 222, 197, 254, 193, 216, 255, 236, 128, 128, 128},
+ { 7, 140, 163, 251, 195, 211, 255, 238, 128, 128, 128},
+ { 1, 91, 152, 249, 181, 197, 255, 239, 128, 128, 128},
+ { 1, 91, 152, 249, 181, 197, 255, 239, 128, 128, 128}
+ },
+ {
+ /* Coeff Band 5 */
+ { 1, 226, 218, 255, 216, 241, 255, 255, 128, 128, 128},
+ { 6, 154, 191, 255, 218, 240, 255, 255, 128, 128, 128},
+ { 1, 110, 171, 252, 191, 204, 255, 236, 128, 128, 128},
+ { 1, 110, 171, 252, 191, 204, 255, 236, 128, 128, 128}
+ },
+ {
+ /* Coeff Band 6 */
+ { 1, 221, 217, 255, 208, 217, 255, 232, 128, 128, 128},
+ { 11, 155, 189, 254, 203, 211, 255, 249, 128, 128, 128},
+ { 1, 110, 171, 252, 191, 204, 255, 236, 128, 128, 128},
+ { 1, 110, 171, 252, 191, 204, 255, 236, 128, 128, 128}
+ },
+ {
+ /* Coeff Band 7 */
+ { 1, 207, 235, 255, 232, 240, 128, 128, 128, 128, 128},
+ { 58, 161, 216, 255, 229, 235, 255, 255, 128, 128, 128},
+ { 8, 133, 204, 255, 219, 231, 255, 255, 128, 128, 128},
+ { 8, 133, 204, 255, 219, 231, 255, 255, 128, 128, 128}
+ }
+ },
+ {
+ /* block Type 1 */
+ {
+ /* Coeff Band 0 */
+ { 134, 152, 233, 224, 234, 52, 255, 166, 128, 128, 128},
+ { 97, 132, 185, 234, 186, 189, 197, 171, 255, 212, 128},
+ { 84, 110, 185, 237, 182, 182, 145, 145, 255, 255, 128}
+ },
+ {
+ /* Coeff Band 1 */
+ { 1, 124, 213, 247, 192, 212, 255, 255, 128, 128, 128},
+ { 88, 111, 178, 254, 189, 211, 255, 255, 128, 128, 128},
+ { 12, 59, 129, 236, 150, 179, 239, 195, 255, 255, 128},
+ { 12, 59, 129, 236, 150, 179, 239, 195, 255, 255, 128}
+ },
+ {
+ /* Coeff Band 2 */
+ { 1, 102, 225, 255, 210, 240, 128, 128, 128, 128, 128},
+ { 110, 78, 195, 254, 200, 191, 255, 255, 128, 128, 128},
+ { 37, 63, 177, 255, 194, 195, 128, 128, 128, 128, 128},
+ { 37, 63, 177, 255, 194, 195, 128, 128, 128, 128, 128}
+ },
+ {
+ /* Coeff Band 3 */
+ { 1, 1, 229, 255, 202, 224, 128, 128, 128, 128, 128},
+ { 150, 1, 192, 255, 206, 226, 128, 128, 128, 128, 128},
+ { 75, 1, 138, 255, 172, 228, 128, 128, 128, 128, 128},
+ { 75, 1, 138, 255, 172, 228, 128, 128, 128, 128, 128}
+ },
+ {
+ /* Coeff Band 4 */
+ { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128},
+ { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128},
+ { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128},
+ { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128}
+ },
+ {
+ /* Coeff Band 5 */
+ { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128},
+ { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128},
+ { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128},
+ { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128}
+ },
+ {
+ /* Coeff Band 6 */
+ { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128},
+ { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128},
+ { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128},
+ { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128}
+ },
+ {
+ /* Coeff Band 7 */
+ { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128},
+ { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128},
+ { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128},
+ { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128}
+ }
+ },
+ {
+ /* block Type 2 */
+ {
+ /* Coeff Band 0 */
+ { 11, 181, 226, 199, 183, 255, 255, 255, 128, 128, 128},
+ { 2, 147, 185, 248, 163, 180, 255, 236, 128, 128, 128},
+ { 1, 123, 157, 238, 154, 176, 255, 226, 255, 255, 128},
+ { 1, 123, 157, 238, 154, 176, 255, 226, 255, 255, 128}
+ },
+ {
+ /* Coeff Band 1 */
+ { 1, 150, 191, 246, 174, 188, 255, 235, 128, 128, 128},
+ { 1, 125, 166, 245, 165, 185, 255, 234, 128, 128, 128},
+ { 1, 79, 125, 240, 148, 179, 255, 234, 255, 255, 128},
+ { 1, 79, 125, 240, 148, 179, 255, 234, 255, 255, 128}
+ },
+ {
+ /* Coeff Band 2 */
+ { 1, 146, 184, 242, 167, 183, 255, 230, 255, 255, 128},
+ { 1, 119, 160, 239, 156, 178, 255, 231, 255, 255, 128},
+ { 1, 75, 115, 234, 142, 173, 255, 225, 255, 255, 128},
+ { 1, 75, 115, 234, 142, 173, 255, 225, 255, 255, 128}
+ },
+ {
+ /* Coeff Band 3 */
+ { 1, 150, 188, 244, 169, 183, 255, 233, 255, 255, 128},
+ { 1, 123, 162, 243, 161, 180, 255, 233, 128, 128, 128},
+ { 1, 76, 120, 238, 148, 178, 255, 230, 255, 255, 128},
+ { 1, 76, 120, 238, 148, 178, 255, 230, 255, 255, 128}
+ },
+ {
+ /* Coeff Band 4 */
+ { 1, 163, 202, 252, 188, 204, 255, 248, 128, 128, 128},
+ { 1, 136, 180, 251, 181, 201, 255, 246, 128, 128, 128},
+ { 1, 92, 146, 249, 170, 197, 255, 245, 128, 128, 128},
+ { 1, 92, 146, 249, 170, 197, 255, 245, 128, 128, 128}
+ },
+ {
+ /* Coeff Band 5 */
+ { 1, 156, 195, 249, 179, 193, 255, 241, 255, 255, 128},
+ { 1, 128, 169, 248, 171, 192, 255, 242, 255, 255, 128},
+ { 1, 84, 132, 245, 158, 187, 255, 240, 255, 255, 128},
+ { 1, 84, 132, 245, 158, 187, 255, 240, 255, 255, 128}
+ },
+ {
+ /* Coeff Band 6 */
+ { 1, 36, 71, 251, 192, 201, 255, 243, 255, 255, 128},
+ { 1, 49, 185, 250, 184, 199, 255, 242, 128, 128, 128},
+ { 1, 95, 147, 247, 168, 190, 255, 239, 255, 255, 128},
+ { 1, 95, 147, 247, 168, 190, 255, 239, 255, 255, 128}
+ },
+ {
+ /* Coeff Band 7 */
+ { 1, 19, 98, 255, 218, 222, 255, 255, 128, 128, 128},
+ { 36, 50, 210, 255, 212, 221, 255, 255, 128, 128, 128},
+ { 6, 117, 180, 254, 199, 216, 255, 251, 128, 128, 128},
+ { 6, 117, 180, 254, 199, 216, 255, 251, 128, 128, 128}
+ }
+ },
+ { /* block Type 3 */
+ { /* Coeff Band 0 */
+ { 192, 18, 155, 172, 145, 164, 192, 135, 246, 223, 255},
+ { 94, 29, 97, 131, 131, 153, 171, 121, 250, 190, 255},
+ { 25, 29, 63, 128, 119, 147, 168, 124, 251, 183, 255},
+ { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128}
+ },
+ { /* Coeff Band 1 */
+ { 1, 108, 192, 220, 186, 173, 255, 194, 255, 255, 128},
+ { 123, 104, 188, 221, 165, 171, 247, 180, 255, 255, 128},
+ { 23, 76, 152, 216, 154, 166, 226, 182, 255, 209, 128},
+ { 1, 26, 52, 162, 109, 152, 208, 144, 255, 231, 128}
+ },
+ { /* Coeff Band 2 */
+ { 1, 57, 179, 220, 156, 175, 210, 158, 255, 223, 128},
+ { 48, 57, 134, 212, 151, 170, 219, 185, 255, 248, 128},
+ { 4, 35, 63, 189, 120, 156, 221, 159, 255, 241, 128},
+ { 1, 17, 23, 110, 97, 143, 187, 120, 255, 234, 128}
+ },
+ { /* Coeff Band 3 */
+ { 1, 115, 205, 243, 182, 187, 254, 218, 255, 255, 128},
+ { 80, 101, 186, 241, 183, 186, 249, 182, 255, 255, 128},
+ { 10, 81, 144, 229, 164, 175, 241, 185, 255, 255, 128},
+ { 1, 44, 81, 192, 130, 148, 240, 180, 255, 255, 128}
+ },
+ { /* Coeff Band 4 */
+ { 1, 161, 207, 249, 187, 176, 255, 180, 128, 128, 128},
+ { 79, 148, 196, 240, 186, 182, 253, 171, 255, 255, 128},
+ { 14, 111, 171, 233, 170, 178, 235, 204, 255, 255, 128},
+ { 1, 63, 103, 202, 143, 162, 240, 178, 255, 255, 128}
+ },
+ { /* Coeff Band 5 */
+ { 1, 101, 202, 239, 185, 184, 252, 186, 255, 255, 128},
+ { 43, 67, 166, 237, 178, 190, 246, 194, 255, 255, 128},
+ { 4, 49, 85, 220, 140, 168, 253, 182, 255, 255, 128},
+ { 1, 24, 35, 144, 93, 135, 239, 159, 255, 253, 128}
+ },
+ { /* Coeff Band 6 */
+ { 1, 212, 243, 255, 240, 234, 255, 255, 128, 128, 128},
+ { 98, 168, 234, 255, 229, 234, 255, 255, 128, 128, 128},
+ { 19, 127, 199, 255, 212, 198, 255, 255, 128, 128, 128},
+ { 1, 103, 162, 253, 186, 151, 255, 255, 128, 128, 128}
+ },
+ { /* Coeff Band 7 */
+ { 1, 188, 253, 255, 255, 128, 128, 128, 128, 128, 128},
+ { 191, 68, 242, 255, 255, 128, 128, 128, 128, 128, 128},
+ { 8, 132, 255, 128, 128, 128, 128, 128, 128, 128, 128},
+ { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128}
+ }
+ }
+};
+
+static const vp9_prob
+ default_coef_probs_16x16[BLOCK_TYPES_16X16]
+ [COEF_BANDS]
+ [PREV_COEF_CONTEXTS]
+ [ENTROPY_NODES] = {
+ { /* block Type 0 */
+ { /* Coeff Band 0 */
+ { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128},
+ { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128},
+ { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128},
+ { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128}
+ },
+ { /* Coeff Band 1 */
+ { 60, 140, 195, 255, 212, 214, 128, 128, 128, 128, 128},
+ { 75, 221, 231, 255, 203, 255, 128, 128, 128, 128, 128},
+ { 9, 212, 196, 251, 197, 207, 255, 185, 128, 128, 128},
+ { 9, 212, 196, 251, 197, 207, 255, 185, 128, 128, 128}
+ },
+ { /* Coeff Band 2 */
+ { 1, 227, 226, 255, 215, 215, 128, 128, 128, 128, 128},
+ { 5, 163, 209, 255, 212, 212, 255, 255, 128, 128, 128},
+ { 1, 133, 203, 255, 210, 220, 255, 255, 128, 128, 128},
+ { 1, 133, 203, 255, 210, 220, 255, 255, 128, 128, 128}
+ },
+ { /* Coeff Band 3 */
+ { 1, 226, 225, 255, 228, 236, 128, 128, 128, 128, 128},
+ { 6, 163, 208, 255, 224, 234, 255, 255, 128, 128, 128},
+ { 1, 122, 196, 253, 212, 248, 255, 255, 128, 128, 128},
+ { 1, 122, 196, 253, 212, 248, 255, 255, 128, 128, 128}
+ },
+ { /* Coeff Band 4 */
+ { 1, 222, 197, 254, 193, 216, 255, 236, 128, 128, 128},
+ { 7, 140, 163, 251, 195, 211, 255, 238, 128, 128, 128},
+ { 1, 91, 152, 249, 181, 197, 255, 239, 128, 128, 128},
+ { 1, 91, 152, 249, 181, 197, 255, 239, 128, 128, 128}
+ },
+ { /* Coeff Band 5 */
+ { 1, 226, 218, 255, 216, 241, 255, 255, 128, 128, 128},
+ { 6, 154, 191, 255, 218, 240, 255, 255, 128, 128, 128},
+ { 1, 110, 171, 252, 191, 204, 255, 236, 128, 128, 128},
+ { 1, 110, 171, 252, 191, 204, 255, 236, 128, 128, 128}
+ },
+ { /* Coeff Band 6 */
+ { 1, 221, 217, 255, 208, 217, 255, 232, 128, 128, 128},
+ { 11, 155, 189, 254, 203, 211, 255, 249, 128, 128, 128},
+ { 1, 110, 171, 252, 191, 204, 255, 236, 128, 128, 128},
+ { 1, 110, 171, 252, 191, 204, 255, 236, 128, 128, 128}
+ },
+ { /* Coeff Band 7 */
+ { 1, 207, 235, 255, 232, 240, 128, 128, 128, 128, 128},
+ { 58, 161, 216, 255, 229, 235, 255, 255, 128, 128, 128},
+ { 8, 133, 204, 255, 219, 231, 255, 255, 128, 128, 128},
+ { 8, 133, 204, 255, 219, 231, 255, 255, 128, 128, 128}
+ }
+ },
+ { /* block Type 1 */
+ { /* Coeff Band 0 */
+ { 1, 30, 103, 204, 142, 168, 235, 161, 255, 228, 128},
+ { 1, 35, 90, 192, 130, 161, 227, 158, 255, 226, 255},
+ { 1, 36, 78, 180, 122, 156, 221, 153, 255, 222, 255},
+ { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128}
+ },
+ { /* Coeff Band 1 */
+ { 1, 163, 228, 253, 212, 194, 255, 205, 128, 128, 128},
+ { 67, 160, 226, 253, 210, 202, 245, 172, 255, 255, 128},
+ { 51, 147, 219, 251, 207, 207, 255, 217, 128, 128, 128},
+ { 25, 107, 175, 245, 183, 190, 254, 209, 255, 255, 128}
+ },
+ { /* Coeff Band 2 */
+ { 1, 66, 170, 240, 177, 186, 252, 203, 255, 245, 128},
+ { 23, 64, 145, 230, 161, 177, 252, 198, 255, 255, 128},
+ { 6, 51, 99, 208, 135, 163, 249, 178, 255, 248, 128},
+ { 1, 33, 59, 161, 104, 151, 238, 164, 255, 237, 128}
+ },
+ { /* Coeff Band 3 */
+ { 1, 76, 216, 250, 198, 199, 255, 226, 255, 255, 128},
+ { 86, 83, 200, 247, 189, 193, 255, 224, 255, 255, 128},
+ { 30, 75, 164, 242, 172, 184, 254, 218, 255, 255, 128},
+ { 3, 54, 103, 227, 140, 172, 253, 201, 255, 255, 128}
+ },
+ { /* Coeff Band 4 */
+ { 1, 241, 247, 255, 233, 223, 255, 255, 128, 128, 128},
+ { 78, 212, 242, 255, 226, 230, 255, 255, 128, 128, 128},
+ { 10, 167, 224, 255, 217, 225, 255, 128, 128, 128, 128},
+ { 1, 104, 176, 250, 166, 219, 255, 255, 128, 128, 128}
+ },
+ { /* Coeff Band 5 */
+ { 1, 194, 241, 254, 228, 214, 248, 237, 255, 255, 128},
+ { 95, 133, 228, 254, 218, 215, 255, 229, 128, 128, 128},
+ { 24, 119, 201, 252, 202, 205, 255, 229, 128, 128, 128},
+ { 1, 88, 155, 246, 183, 193, 255, 205, 128, 128, 128}
+ },
+ { /* Coeff Band 6 */
+ { 1, 204, 236, 255, 222, 220, 255, 239, 128, 128, 128},
+ { 126, 105, 225, 254, 214, 217, 255, 254, 128, 128, 128},
+ { 44, 86, 190, 251, 197, 204, 255, 233, 128, 128, 128},
+ { 6, 71, 130, 240, 164, 188, 255, 246, 128, 128, 128}
+ },
+ { /* Coeff Band 7 */
+ { 1, 195, 250, 255, 239, 197, 128, 128, 128, 128, 128},
+ { 167, 102, 249, 255, 234, 255, 128, 128, 128, 128, 128},
+ { 65, 91, 222, 255, 217, 255, 128, 128, 128, 128, 128},
+ { 1, 59, 128, 255, 154, 255, 128, 128, 128, 128, 128}
+ }
+ },
+ { /* block Type 2 */
+ { /* Coeff Band 0 */
+ { 1, 30, 103, 204, 142, 168, 235, 161, 255, 228, 128},
+ { 1, 35, 90, 192, 130, 161, 227, 158, 255, 226, 255},
+ { 1, 36, 78, 180, 122, 156, 221, 153, 255, 222, 255},
+ { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128}
+ },
+ { /* Coeff Band 1 */
+ { 1, 163, 228, 253, 212, 194, 255, 205, 128, 128, 128},
+ { 67, 160, 226, 253, 210, 202, 245, 172, 255, 255, 128},
+ { 51, 147, 219, 251, 207, 207, 255, 217, 128, 128, 128},
+ { 25, 107, 175, 245, 183, 190, 254, 209, 255, 255, 128}
+ },
+ { /* Coeff Band 2 */
+ { 1, 66, 170, 240, 177, 186, 252, 203, 255, 245, 128},
+ { 23, 64, 145, 230, 161, 177, 252, 198, 255, 255, 128},
+ { 6, 51, 99, 208, 135, 163, 249, 178, 255, 248, 128},
+ { 1, 33, 59, 161, 104, 151, 238, 164, 255, 237, 128}
+ },
+ { /* Coeff Band 3 */
+ { 1, 76, 216, 250, 198, 199, 255, 226, 255, 255, 128},
+ { 86, 83, 200, 247, 189, 193, 255, 224, 255, 255, 128},
+ { 30, 75, 164, 242, 172, 184, 254, 218, 255, 255, 128},
+ { 3, 54, 103, 227, 140, 172, 253, 201, 255, 255, 128}
+ },
+ { /* Coeff Band 4 */
+ { 1, 241, 247, 255, 233, 223, 255, 255, 128, 128, 128},
+ { 78, 212, 242, 255, 226, 230, 255, 255, 128, 128, 128},
+ { 10, 167, 224, 255, 217, 225, 255, 128, 128, 128, 128},
+ { 1, 104, 176, 250, 166, 219, 255, 255, 128, 128, 128}
+ },
+ { /* Coeff Band 5 */
+ { 1, 194, 241, 254, 228, 214, 248, 237, 255, 255, 128},
+ { 95, 133, 228, 254, 218, 215, 255, 229, 128, 128, 128},
+ { 24, 119, 201, 252, 202, 205, 255, 229, 128, 128, 128},
+ { 1, 88, 155, 246, 183, 193, 255, 205, 128, 128, 128}
+ },
+ { /* Coeff Band 6 */
+ { 1, 204, 236, 255, 222, 220, 255, 239, 128, 128, 128},
+ { 126, 105, 225, 254, 214, 217, 255, 254, 128, 128, 128},
+ { 44, 86, 190, 251, 197, 204, 255, 233, 128, 128, 128},
+ { 6, 71, 130, 240, 164, 188, 255, 246, 128, 128, 128}
+ },
+ { /* Coeff Band 7 */
+ { 1, 195, 250, 255, 239, 197, 128, 128, 128, 128, 128},
+ { 167, 102, 249, 255, 234, 255, 128, 128, 128, 128, 128},
+ { 65, 91, 222, 255, 217, 255, 128, 128, 128, 128, 128},
+ { 1, 59, 128, 255, 154, 255, 128, 128, 128, 128, 128}
+ }
+ },
+ { /* block Type 3 */
+ { /* Coeff Band 0 */
+ { 17, 105, 227, 195, 164, 170, 168, 137, 221, 160, 184},
+ { 6, 92, 166, 193, 158, 169, 179, 142, 236, 175, 200},
+ { 2, 68, 118, 193, 147, 168, 187, 149, 241, 178, 247},
+ { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128}
+ },
+ { /* Coeff Band 1 */
+ { 1, 193, 221, 246, 198, 194, 244, 176, 255, 192, 128},
+ { 112, 160, 209, 244, 196, 194, 243, 175, 255, 209, 128},
+ { 45, 123, 175, 240, 184, 195, 239, 178, 255, 218, 255},
+ { 16, 53, 75, 169, 119, 152, 209, 146, 255, 219, 255}
+ },
+ { /* Coeff Band 2 */
+ { 1, 141, 183, 240, 176, 187, 246, 198, 255, 218, 128},
+ { 36, 97, 150, 231, 161, 180, 243, 191, 255, 217, 255},
+ { 8, 65, 111, 210, 143, 166, 230, 167, 255, 224, 255},
+ { 2, 35, 61, 157, 113, 149, 208, 142, 255, 217, 255}
+ },
+ { /* Coeff Band 3 */
+ { 1, 173, 196, 245, 184, 191, 252, 211, 255, 240, 128},
+ { 35, 119, 175, 242, 177, 187, 252, 209, 255, 235, 128},
+ { 4, 88, 141, 234, 161, 180, 249, 200, 255, 228, 128},
+ { 1, 57, 95, 203, 133, 161, 235, 167, 255, 231, 255}
+ },
+ { /* Coeff Band 4 */
+ { 1, 208, 227, 249, 209, 204, 248, 188, 255, 248, 128},
+ { 28, 162, 211, 247, 203, 200, 252, 188, 255, 232, 128},
+ { 5, 114, 174, 238, 182, 189, 245, 184, 255, 238, 128},
+ { 1, 61, 100, 205, 136, 164, 235, 163, 255, 239, 128}
+ },
+ { /* Coeff Band 5 */
+ { 1, 195, 218, 252, 208, 207, 250, 205, 255, 245, 128},
+ { 22, 141, 196, 249, 198, 201, 250, 202, 255, 244, 128},
+ { 2, 105, 163, 240, 178, 189, 246, 191, 255, 246, 128},
+ { 1, 70, 112, 206, 144, 167, 232, 162, 255, 239, 128}
+ },
+ { /* Coeff Band 6 */
+ { 1, 204, 215, 251, 204, 203, 255, 222, 255, 225, 128},
+ { 15, 140, 194, 249, 194, 199, 254, 221, 255, 253, 128},
+ { 1, 95, 153, 243, 172, 188, 254, 213, 255, 248, 128},
+ { 1, 59, 99, 216, 135, 166, 247, 190, 255, 237, 255}
+ },
+ { /* Coeff Band 7 */
+ { 1, 7, 231, 255, 227, 223, 255, 240, 255, 255, 128},
+ { 15, 157, 217, 255, 218, 219, 255, 239, 255, 255, 128},
+ { 1, 114, 182, 252, 198, 207, 255, 235, 255, 255, 128},
+ { 1, 71, 122, 238, 154, 181, 255, 216, 255, 255, 128}
+ }
+ }
+};
+
+static const vp9_prob
+ default_hybrid_coef_probs_16x16[BLOCK_TYPES_16X16]
+ [COEF_BANDS]
+ [PREV_COEF_CONTEXTS]
+ [ENTROPY_NODES] = {
+ { /* block Type 0 */
+ { /* Coeff Band 0 */
+ { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128},
+ { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128},
+ { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128},
+ { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128}
+ },
+ { /* Coeff Band 1 */
+ { 60, 140, 195, 255, 212, 214, 128, 128, 128, 128, 128},
+ { 75, 221, 231, 255, 203, 255, 128, 128, 128, 128, 128},
+ { 9, 212, 196, 251, 197, 207, 255, 185, 128, 128, 128},
+ { 9, 212, 196, 251, 197, 207, 255, 185, 128, 128, 128}
+ },
+ { /* Coeff Band 2 */
+ { 1, 227, 226, 255, 215, 215, 128, 128, 128, 128, 128},
+ { 5, 163, 209, 255, 212, 212, 255, 255, 128, 128, 128},
+ { 1, 133, 203, 255, 210, 220, 255, 255, 128, 128, 128},
+ { 1, 133, 203, 255, 210, 220, 255, 255, 128, 128, 128}
+ },
+ { /* Coeff Band 3 */
+ { 1, 226, 225, 255, 228, 236, 128, 128, 128, 128, 128},
+ { 6, 163, 208, 255, 224, 234, 255, 255, 128, 128, 128},
+ { 1, 122, 196, 253, 212, 248, 255, 255, 128, 128, 128},
+ { 1, 122, 196, 253, 212, 248, 255, 255, 128, 128, 128}
+ },
+ { /* Coeff Band 4 */
+ { 1, 222, 197, 254, 193, 216, 255, 236, 128, 128, 128},
+ { 7, 140, 163, 251, 195, 211, 255, 238, 128, 128, 128},
+ { 1, 91, 152, 249, 181, 197, 255, 239, 128, 128, 128},
+ { 1, 91, 152, 249, 181, 197, 255, 239, 128, 128, 128}
+ },
+ { /* Coeff Band 5 */
+ { 1, 226, 218, 255, 216, 241, 255, 255, 128, 128, 128},
+ { 6, 154, 191, 255, 218, 240, 255, 255, 128, 128, 128},
+ { 1, 110, 171, 252, 191, 204, 255, 236, 128, 128, 128},
+ { 1, 110, 171, 252, 191, 204, 255, 236, 128, 128, 128}
+ },
+ { /* Coeff Band 6 */
+ { 1, 221, 217, 255, 208, 217, 255, 232, 128, 128, 128},
+ { 11, 155, 189, 254, 203, 211, 255, 249, 128, 128, 128},
+ { 1, 110, 171, 252, 191, 204, 255, 236, 128, 128, 128},
+ { 1, 110, 171, 252, 191, 204, 255, 236, 128, 128, 128}
+ },
+ { /* Coeff Band 7 */
+ { 1, 207, 235, 255, 232, 240, 128, 128, 128, 128, 128},
+ { 58, 161, 216, 255, 229, 235, 255, 255, 128, 128, 128},
+ { 8, 133, 204, 255, 219, 231, 255, 255, 128, 128, 128},
+ { 8, 133, 204, 255, 219, 231, 255, 255, 128, 128, 128}
+ }
+ },
+ { /* block Type 1 */
+ { /* Coeff Band 0 */
+ { 1, 30, 103, 204, 142, 168, 235, 161, 255, 228, 128},
+ { 1, 35, 90, 192, 130, 161, 227, 158, 255, 226, 255},
+ { 1, 36, 78, 180, 122, 156, 221, 153, 255, 222, 255},
+ { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128}
+ },
+ { /* Coeff Band 1 */
+ { 1, 163, 228, 253, 212, 194, 255, 205, 128, 128, 128},
+ { 67, 160, 226, 253, 210, 202, 245, 172, 255, 255, 128},
+ { 51, 147, 219, 251, 207, 207, 255, 217, 128, 128, 128},
+ { 25, 107, 175, 245, 183, 190, 254, 209, 255, 255, 128}
+ },
+ { /* Coeff Band 2 */
+ { 1, 66, 170, 240, 177, 186, 252, 203, 255, 245, 128},
+ { 23, 64, 145, 230, 161, 177, 252, 198, 255, 255, 128},
+ { 6, 51, 99, 208, 135, 163, 249, 178, 255, 248, 128},
+ { 1, 33, 59, 161, 104, 151, 238, 164, 255, 237, 128}
+ },
+ { /* Coeff Band 3 */
+ { 1, 76, 216, 250, 198, 199, 255, 226, 255, 255, 128},
+ { 86, 83, 200, 247, 189, 193, 255, 224, 255, 255, 128},
+ { 30, 75, 164, 242, 172, 184, 254, 218, 255, 255, 128},
+ { 3, 54, 103, 227, 140, 172, 253, 201, 255, 255, 128}
+ },
+ { /* Coeff Band 4 */
+ { 1, 241, 247, 255, 233, 223, 255, 255, 128, 128, 128},
+ { 78, 212, 242, 255, 226, 230, 255, 255, 128, 128, 128},
+ { 10, 167, 224, 255, 217, 225, 255, 128, 128, 128, 128},
+ { 1, 104, 176, 250, 166, 219, 255, 255, 128, 128, 128}
+ },
+ { /* Coeff Band 5 */
+ { 1, 194, 241, 254, 228, 214, 248, 237, 255, 255, 128},
+ { 95, 133, 228, 254, 218, 215, 255, 229, 128, 128, 128},
+ { 24, 119, 201, 252, 202, 205, 255, 229, 128, 128, 128},
+ { 1, 88, 155, 246, 183, 193, 255, 205, 128, 128, 128}
+ },
+ { /* Coeff Band 6 */
+ { 1, 204, 236, 255, 222, 220, 255, 239, 128, 128, 128},
+ { 126, 105, 225, 254, 214, 217, 255, 254, 128, 128, 128},
+ { 44, 86, 190, 251, 197, 204, 255, 233, 128, 128, 128},
+ { 6, 71, 130, 240, 164, 188, 255, 246, 128, 128, 128}
+ },
+ { /* Coeff Band 7 */
+ { 1, 195, 250, 255, 239, 197, 128, 128, 128, 128, 128},
+ { 167, 102, 249, 255, 234, 255, 128, 128, 128, 128, 128},
+ { 65, 91, 222, 255, 217, 255, 128, 128, 128, 128, 128},
+ { 1, 59, 128, 255, 154, 255, 128, 128, 128, 128, 128}
+ }
+ },
+ { /* block Type 2 */
+ { /* Coeff Band 0 */
+ { 1, 30, 103, 204, 142, 168, 235, 161, 255, 228, 128},
+ { 1, 35, 90, 192, 130, 161, 227, 158, 255, 226, 255},
+ { 1, 36, 78, 180, 122, 156, 221, 153, 255, 222, 255},
+ { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128}
+ },
+ { /* Coeff Band 1 */
+ { 1, 163, 228, 253, 212, 194, 255, 205, 128, 128, 128},
+ { 67, 160, 226, 253, 210, 202, 245, 172, 255, 255, 128},
+ { 51, 147, 219, 251, 207, 207, 255, 217, 128, 128, 128},
+ { 25, 107, 175, 245, 183, 190, 254, 209, 255, 255, 128}
+ },
+ { /* Coeff Band 2 */
+ { 1, 66, 170, 240, 177, 186, 252, 203, 255, 245, 128},
+ { 23, 64, 145, 230, 161, 177, 252, 198, 255, 255, 128},
+ { 6, 51, 99, 208, 135, 163, 249, 178, 255, 248, 128},
+ { 1, 33, 59, 161, 104, 151, 238, 164, 255, 237, 128}
+ },
+ { /* Coeff Band 3 */
+ { 1, 76, 216, 250, 198, 199, 255, 226, 255, 255, 128},
+ { 86, 83, 200, 247, 189, 193, 255, 224, 255, 255, 128},
+ { 30, 75, 164, 242, 172, 184, 254, 218, 255, 255, 128},
+ { 3, 54, 103, 227, 140, 172, 253, 201, 255, 255, 128}
+ },
+ { /* Coeff Band 4 */
+ { 1, 241, 247, 255, 233, 223, 255, 255, 128, 128, 128},
+ { 78, 212, 242, 255, 226, 230, 255, 255, 128, 128, 128},
+ { 10, 167, 224, 255, 217, 225, 255, 128, 128, 128, 128},
+ { 1, 104, 176, 250, 166, 219, 255, 255, 128, 128, 128}
+ },
+ { /* Coeff Band 5 */
+ { 1, 194, 241, 254, 228, 214, 248, 237, 255, 255, 128},
+ { 95, 133, 228, 254, 218, 215, 255, 229, 128, 128, 128},
+ { 24, 119, 201, 252, 202, 205, 255, 229, 128, 128, 128},
+ { 1, 88, 155, 246, 183, 193, 255, 205, 128, 128, 128}
+ },
+ { /* Coeff Band 6 */
+ { 1, 204, 236, 255, 222, 220, 255, 239, 128, 128, 128},
+ { 126, 105, 225, 254, 214, 217, 255, 254, 128, 128, 128},
+ { 44, 86, 190, 251, 197, 204, 255, 233, 128, 128, 128},
+ { 6, 71, 130, 240, 164, 188, 255, 246, 128, 128, 128}
+ },
+ { /* Coeff Band 7 */
+ { 1, 195, 250, 255, 239, 197, 128, 128, 128, 128, 128},
+ { 167, 102, 249, 255, 234, 255, 128, 128, 128, 128, 128},
+ { 65, 91, 222, 255, 217, 255, 128, 128, 128, 128, 128},
+ { 1, 59, 128, 255, 154, 255, 128, 128, 128, 128, 128}
+ }
+ },
+ { /* block Type 3 */
+ { /* Coeff Band 0 */
+ { 17, 105, 227, 195, 164, 170, 168, 137, 221, 160, 184},
+ { 6, 92, 166, 193, 158, 169, 179, 142, 236, 175, 200},
+ { 2, 68, 118, 193, 147, 168, 187, 149, 241, 178, 247},
+ { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128}
+ },
+ { /* Coeff Band 1 */
+ { 1, 193, 221, 246, 198, 194, 244, 176, 255, 192, 128},
+ { 112, 160, 209, 244, 196, 194, 243, 175, 255, 209, 128},
+ { 45, 123, 175, 240, 184, 195, 239, 178, 255, 218, 255},
+ { 16, 53, 75, 169, 119, 152, 209, 146, 255, 219, 255}
+ },
+ { /* Coeff Band 2 */
+ { 1, 141, 183, 240, 176, 187, 246, 198, 255, 218, 128},
+ { 36, 97, 150, 231, 161, 180, 243, 191, 255, 217, 255},
+ { 8, 65, 111, 210, 143, 166, 230, 167, 255, 224, 255},
+ { 2, 35, 61, 157, 113, 149, 208, 142, 255, 217, 255}
+ },
+ { /* Coeff Band 3 */
+ { 1, 173, 196, 245, 184, 191, 252, 211, 255, 240, 128},
+ { 35, 119, 175, 242, 177, 187, 252, 209, 255, 235, 128},
+ { 4, 88, 141, 234, 161, 180, 249, 200, 255, 228, 128},
+ { 1, 57, 95, 203, 133, 161, 235, 167, 255, 231, 255}
+ },
+ { /* Coeff Band 4 */
+ { 1, 208, 227, 249, 209, 204, 248, 188, 255, 248, 128},
+ { 28, 162, 211, 247, 203, 200, 252, 188, 255, 232, 128},
+ { 5, 114, 174, 238, 182, 189, 245, 184, 255, 238, 128},
+ { 1, 61, 100, 205, 136, 164, 235, 163, 255, 239, 128}
+ },
+ { /* Coeff Band 5 */
+ { 1, 195, 218, 252, 208, 207, 250, 205, 255, 245, 128},
+ { 22, 141, 196, 249, 198, 201, 250, 202, 255, 244, 128},
+ { 2, 105, 163, 240, 178, 189, 246, 191, 255, 246, 128},
+ { 1, 70, 112, 206, 144, 167, 232, 162, 255, 239, 128}
+ },
+ { /* Coeff Band 6 */
+ { 1, 204, 215, 251, 204, 203, 255, 222, 255, 225, 128},
+ { 15, 140, 194, 249, 194, 199, 254, 221, 255, 253, 128},
+ { 1, 95, 153, 243, 172, 188, 254, 213, 255, 248, 128},
+ { 1, 59, 99, 216, 135, 166, 247, 190, 255, 237, 255}
+ },
+ { /* Coeff Band 7 */
+ { 1, 7, 231, 255, 227, 223, 255, 240, 255, 255, 128},
+ { 15, 157, 217, 255, 218, 219, 255, 239, 255, 255, 128},
+ { 1, 114, 182, 252, 198, 207, 255, 235, 255, 255, 128},
+ { 1, 71, 122, 238, 154, 181, 255, 216, 255, 255, 128}
+ }
+ }
+};
--- /dev/null
+++ b/vp9/common/entropy.c
@@ -1,0 +1,447 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include <stdio.h>
+
+#include "entropy.h"
+#include "string.h"
+#include "blockd.h"
+#include "onyxc_int.h"
+#include "entropymode.h"
+#include "vpx_mem/vpx_mem.h"
+
+#define uchar unsigned char /* typedefs can clash */
+#define uint unsigned int
+
+typedef const uchar cuchar;
+typedef const uint cuint;
+
+typedef vp9_prob Prob;
+
+#include "coefupdateprobs.h"
+
+const int vp9_i8x8_block[4] = {0, 2, 8, 10};
+
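+/* vp9_norm[r] is the number of left shifts needed to renormalize the bool
+   coder's range value r (1..255) back up to at least 128; index 0 is unused. */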
+DECLARE_ALIGNED(16, const unsigned char, vp9_norm[256]) = {
+ 0, 7, 6, 6, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4,
+ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+};
+
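+/* Maps a 4x4 zig-zag scan position to one of the COEF_BANDS coefficient
+   bands used to select the probability context. */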
+DECLARE_ALIGNED(16, const int, vp9_coef_bands[16]) = {
+ 0, 1, 2, 3, 6, 4, 5, 6, 6, 6, 6, 6, 6, 7, 7, 7
+};
+
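+/* Context class contributed by the previously coded token: 0 for ZERO/EOB,
+   1 for ONE, 2 for TWO/THREE, 3 for FOUR and the extra-bit categories. */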
+DECLARE_ALIGNED(16, cuchar, vp9_prev_token_class[MAX_ENTROPY_TOKENS]) = {
+ 0, 1, 2, 2, 3, 3, 3, 3, 3, 3, 3, 0
+};
+
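+/* 4x4 scan orders: each entry maps a scan position to the raster-order
+   coefficient index.  The default zig-zag is used for the 2-D DCT;
+   vp9_col_scan visits coefficients column by column and vp9_row_scan in
+   raster order, for use by the hybrid (ADST) transform paths. */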
+DECLARE_ALIGNED(16, const int, vp9_default_zig_zag1d[16]) = {
+ 0, 1, 4, 8,
+ 5, 2, 3, 6,
+ 9, 12, 13, 10,
+ 7, 11, 14, 15,
+};
+
+DECLARE_ALIGNED(16, const int, vp9_col_scan[16]) = {
+ 0, 4, 8, 12,
+ 1, 5, 9, 13,
+ 2, 6, 10, 14,
+ 3, 7, 11, 15
+};
+DECLARE_ALIGNED(16, const int, vp9_row_scan[16]) = {
+ 0, 1, 2, 3,
+ 4, 5, 6, 7,
+ 8, 9, 10, 11,
+ 12, 13, 14, 15
+};
+
+
+DECLARE_ALIGNED(64, const int, vp9_coef_bands_8x8[64]) = {
+  0, 1, 2, 3, 5, 4, 4, 5,
+  5, 3, 6, 3, 5, 4, 6, 6,
+  6, 5, 5, 6, 6, 6, 6, 6,
+  6, 6, 6, 6, 6, 6, 6, 6,
+  6, 6, 6, 6, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7
+};
+DECLARE_ALIGNED(64, const int, vp9_default_zig_zag1d_8x8[64]) = {
+ 0, 1, 8, 16, 9, 2, 3, 10, 17, 24, 32, 25, 18, 11, 4, 5,
+ 12, 19, 26, 33, 40, 48, 41, 34, 27, 20, 13, 6, 7, 14, 21, 28,
+ 35, 42, 49, 56, 57, 50, 43, 36, 29, 22, 15, 23, 30, 37, 44, 51,
+ 58, 59, 52, 45, 38, 31, 39, 46, 53, 60, 61, 54, 47, 55, 62, 63,
+};
+
+// Table can be optimized.
+DECLARE_ALIGNED(16, const int, vp9_coef_bands_16x16[256]) = {
+ 0, 1, 2, 3, 5, 4, 4, 5, 5, 3, 6, 3, 5, 4, 6, 6,
+ 6, 5, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
+ 6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+ 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+ 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+ 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+ 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+ 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+ 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+ 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+ 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+ 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+ 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+ 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+ 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+ 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+};
+DECLARE_ALIGNED(16, const int, vp9_default_zig_zag1d_16x16[256]) = {
+ 0, 1, 16, 32, 17, 2, 3, 18, 33, 48, 64, 49, 34, 19, 4, 5,
+ 20, 35, 50, 65, 80, 96, 81, 66, 51, 36, 21, 6, 7, 22, 37, 52,
+ 67, 82, 97, 112, 128, 113, 98, 83, 68, 53, 38, 23, 8, 9, 24, 39,
+ 54, 69, 84, 99, 114, 129, 144, 160, 145, 130, 115, 100, 85, 70, 55, 40,
+ 25, 10, 11, 26, 41, 56, 71, 86, 101, 116, 131, 146, 161, 176, 192, 177,
+ 162, 147, 132, 117, 102, 87, 72, 57, 42, 27, 12, 13, 28, 43, 58, 73,
+ 88, 103, 118, 133, 148, 163, 178, 193, 208, 224, 209, 194, 179, 164, 149, 134,
+ 119, 104, 89, 74, 59, 44, 29, 14, 15, 30, 45, 60, 75, 90, 105, 120,
+ 135, 150, 165, 180, 195, 210, 225, 240, 241, 226, 211, 196, 181, 166, 151, 136,
+ 121, 106, 91, 76, 61, 46, 31, 47, 62, 77, 92, 107, 122, 137, 152, 167,
+ 182, 197, 212, 227, 242, 243, 228, 213, 198, 183, 168, 153, 138, 123, 108, 93,
+ 78, 63, 79, 94, 109, 124, 139, 154, 169, 184, 199, 214, 229, 244, 245, 230,
+ 215, 200, 185, 170, 155, 140, 125, 110, 95, 111, 126, 141, 156, 171, 186, 201,
+ 216, 231, 246, 247, 232, 217, 202, 187, 172, 157, 142, 127, 143, 158, 173, 188,
+ 203, 218, 233, 248, 249, 234, 219, 204, 189, 174, 159, 175, 190, 205, 220, 235,
+ 250, 251, 236, 221, 206, 191, 207, 222, 237, 252, 253, 238, 223, 239, 254, 255,
+};
+
+
+/* Array indices are identical to previously-existing CONTEXT_NODE indices */
+
+const vp9_tree_index vp9_coef_tree[ 22] = /* corresponding _CONTEXT_NODEs */
+{
+ -DCT_EOB_TOKEN, 2, /* 0 = EOB */
+ -ZERO_TOKEN, 4, /* 1 = ZERO */
+ -ONE_TOKEN, 6, /* 2 = ONE */
+ 8, 12, /* 3 = LOW_VAL */
+ -TWO_TOKEN, 10, /* 4 = TWO */
+ -THREE_TOKEN, -FOUR_TOKEN, /* 5 = THREE */
+ 14, 16, /* 6 = HIGH_LOW */
+ -DCT_VAL_CATEGORY1, -DCT_VAL_CATEGORY2, /* 7 = CAT_ONE */
+ 18, 20, /* 8 = CAT_THREEFOUR */
+ -DCT_VAL_CATEGORY3, -DCT_VAL_CATEGORY4, /* 9 = CAT_THREE */
+ -DCT_VAL_CATEGORY5, -DCT_VAL_CATEGORY6 /* 10 = CAT_FIVE */
+};
+
+struct vp9_token_struct vp9_coef_encodings[MAX_ENTROPY_TOKENS];
+
+/* Trees for extra bits. Probabilities are constant and
+ do not depend on previously encoded bits */
+
+static const Prob Pcat1[] = { 159};
+static const Prob Pcat2[] = { 165, 145};
+static const Prob Pcat3[] = { 173, 148, 140};
+static const Prob Pcat4[] = { 176, 155, 140, 135};
+static const Prob Pcat5[] = { 180, 157, 141, 134, 130};
+static const Prob Pcat6[] =
+{ 254, 254, 252, 249, 243, 230, 196, 177, 153, 140, 133, 130, 129};
+
+static vp9_tree_index cat1[2], cat2[4], cat3[6], cat4[8], cat5[10], cat6[26];
+
+static void init_bit_tree(vp9_tree_index *p, int n) {
+ int i = 0;
+
+ while (++i < n) {
+ p[0] = p[1] = i << 1;
+ p += 2;
+ }
+
+ p[0] = p[1] = 0;
+}
+
+static void init_bit_trees(void) {
+ init_bit_tree(cat1, 1);
+ init_bit_tree(cat2, 2);
+ init_bit_tree(cat3, 3);
+ init_bit_tree(cat4, 4);
+ init_bit_tree(cat5, 5);
+ init_bit_tree(cat6, 13);
+}
+
+vp9_extra_bit_struct vp9_extra_bits[12] = {
+ { 0, 0, 0, 0},
+ { 0, 0, 0, 1},
+ { 0, 0, 0, 2},
+ { 0, 0, 0, 3},
+ { 0, 0, 0, 4},
+ { cat1, Pcat1, 1, 5},
+ { cat2, Pcat2, 2, 7},
+ { cat3, Pcat3, 3, 11},
+ { cat4, Pcat4, 4, 19},
+ { cat5, Pcat5, 5, 35},
+ { cat6, Pcat6, 13, 67},
+ { 0, 0, 0, 0}
+};
+
+#include "default_coef_probs.h"
+
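+/* Reset the frame context's coefficient probabilities (4x4, 8x8, 16x16 and
+   their hybrid variants) to the built-in defaults from default_coef_probs.h. */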
+void vp9_default_coef_probs(VP9_COMMON *pc) {
+ vpx_memcpy(pc->fc.coef_probs, default_coef_probs,
+ sizeof(pc->fc.coef_probs));
+ vpx_memcpy(pc->fc.hybrid_coef_probs, default_hybrid_coef_probs,
+ sizeof(pc->fc.hybrid_coef_probs));
+
+ vpx_memcpy(pc->fc.coef_probs_8x8, default_coef_probs_8x8,
+ sizeof(pc->fc.coef_probs_8x8));
+ vpx_memcpy(pc->fc.hybrid_coef_probs_8x8, default_hybrid_coef_probs_8x8,
+ sizeof(pc->fc.hybrid_coef_probs_8x8));
+
+ vpx_memcpy(pc->fc.coef_probs_16x16, default_coef_probs_16x16,
+ sizeof(pc->fc.coef_probs_16x16));
+ vpx_memcpy(pc->fc.hybrid_coef_probs_16x16,
+ default_hybrid_coef_probs_16x16,
+ sizeof(pc->fc.hybrid_coef_probs_16x16));
+}
+
+void vp9_coef_tree_initialize() {
+ init_bit_trees();
+ vp9_tokens_from_tree(vp9_coef_encodings, vp9_coef_tree);
+}
+
+// #define COEF_COUNT_TESTING
+
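+/* Adaptation constants: per-branch counts are saturated at *_COUNT_SAT, and
+   the blend weight scales linearly with the count up to *_MAX_UPDATE_FACTOR
+   (out of 256).  Separate values are used for key frames and for the frame
+   immediately following a key frame. */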
+#define COEF_COUNT_SAT 24
+#define COEF_MAX_UPDATE_FACTOR 112
+#define COEF_COUNT_SAT_KEY 24
+#define COEF_MAX_UPDATE_FACTOR_KEY 112
+#define COEF_COUNT_SAT_AFTER_KEY 24
+#define COEF_MAX_UPDATE_FACTOR_AFTER_KEY 128
+
+void vp9_adapt_coef_probs(VP9_COMMON *cm) {
+ int t, i, j, k, count;
+ unsigned int branch_ct[ENTROPY_NODES][2];
+ vp9_prob coef_probs[ENTROPY_NODES];
+ int update_factor; /* denominator 256 */
+ int factor;
+ int count_sat;
+
+ // printf("Frame type: %d\n", cm->frame_type);
+ if (cm->frame_type == KEY_FRAME) {
+ update_factor = COEF_MAX_UPDATE_FACTOR_KEY;
+ count_sat = COEF_COUNT_SAT_KEY;
+ } else if (cm->last_frame_type == KEY_FRAME) {
+ update_factor = COEF_MAX_UPDATE_FACTOR_AFTER_KEY; /* adapt quickly */
+ count_sat = COEF_COUNT_SAT_AFTER_KEY;
+ } else {
+ update_factor = COEF_MAX_UPDATE_FACTOR;
+ count_sat = COEF_COUNT_SAT;
+ }
+
+#ifdef COEF_COUNT_TESTING
+ {
+ printf("static const unsigned int\ncoef_counts"
+ "[BLOCK_TYPES] [COEF_BANDS]"
+ "[PREV_COEF_CONTEXTS] [MAX_ENTROPY_TOKENS] = {\n");
+ for (i = 0; i < BLOCK_TYPES; ++i) {
+ printf(" {\n");
+ for (j = 0; j < COEF_BANDS; ++j) {
+ printf(" {\n");
+ for (k = 0; k < PREV_COEF_CONTEXTS; ++k) {
+ printf(" {");
+ for (t = 0; t < MAX_ENTROPY_TOKENS; ++t)
+ printf("%d, ", cm->fc.coef_counts[i][j][k][t]);
+ printf("},\n");
+ }
+ printf(" },\n");
+ }
+ printf(" },\n");
+ }
+ printf("};\n");
+ printf("static const unsigned int\ncoef_counts_8x8"
+ "[BLOCK_TYPES_8X8] [COEF_BANDS]"
+ "[PREV_COEF_CONTEXTS] [MAX_ENTROPY_TOKENS] = {\n");
+ for (i = 0; i < BLOCK_TYPES_8X8; ++i) {
+ printf(" {\n");
+ for (j = 0; j < COEF_BANDS; ++j) {
+ printf(" {\n");
+ for (k = 0; k < PREV_COEF_CONTEXTS; ++k) {
+ printf(" {");
+ for (t = 0; t < MAX_ENTROPY_TOKENS; ++t)
+ printf("%d, ", cm->fc.coef_counts_8x8[i][j][k][t]);
+ printf("},\n");
+ }
+ printf(" },\n");
+ }
+ printf(" },\n");
+ }
+ printf("};\n");
+ printf("static const unsigned int\nhybrid_coef_counts"
+ "[BLOCK_TYPES] [COEF_BANDS]"
+ "[PREV_COEF_CONTEXTS] [MAX_ENTROPY_TOKENS] = {\n");
+ for (i = 0; i < BLOCK_TYPES; ++i) {
+ printf(" {\n");
+ for (j = 0; j < COEF_BANDS; ++j) {
+ printf(" {\n");
+ for (k = 0; k < PREV_COEF_CONTEXTS; ++k) {
+ printf(" {");
+ for (t = 0; t < MAX_ENTROPY_TOKENS; ++t)
+ printf("%d, ", cm->fc.hybrid_coef_counts[i][j][k][t]);
+ printf("},\n");
+ }
+ printf(" },\n");
+ }
+ printf(" },\n");
+ }
+ printf("};\n");
+ }
+#endif
+
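+  /* For each (block type, band, context), re-estimate the branch
+     probabilities from the accumulated token counts and blend them with the
+     pre-frame probabilities:
+       factor = update_factor * min(count, count_sat) / count_sat
+       prob   = (pre_prob * (256 - factor) + new_prob * factor + 128) >> 8
+     clamped to [1, 255].  The same update is repeated for the hybrid, 8x8
+     and 16x16 probability sets below. */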
+ for (i = 0; i < BLOCK_TYPES; ++i)
+ for (j = 0; j < COEF_BANDS; ++j)
+ for (k = 0; k < PREV_COEF_CONTEXTS; ++k) {
+ if (k >= 3 && ((i == 0 && j == 1) || (i > 0 && j == 0)))
+ continue;
+ vp9_tree_probs_from_distribution(
+ MAX_ENTROPY_TOKENS, vp9_coef_encodings, vp9_coef_tree,
+ coef_probs, branch_ct, cm->fc.coef_counts [i][j][k],
+ 256, 1);
+ for (t = 0; t < ENTROPY_NODES; ++t) {
+ int prob;
+ count = branch_ct[t][0] + branch_ct[t][1];
+ count = count > count_sat ? count_sat : count;
+ factor = (update_factor * count / count_sat);
+ prob = ((int)cm->fc.pre_coef_probs[i][j][k][t] * (256 - factor) +
+ (int)coef_probs[t] * factor + 128) >> 8;
+ if (prob <= 0) cm->fc.coef_probs[i][j][k][t] = 1;
+ else if (prob > 255) cm->fc.coef_probs[i][j][k][t] = 255;
+ else cm->fc.coef_probs[i][j][k][t] = prob;
+ }
+ }
+
+ for (i = 0; i < BLOCK_TYPES; ++i)
+ for (j = 0; j < COEF_BANDS; ++j)
+ for (k = 0; k < PREV_COEF_CONTEXTS; ++k) {
+ if (k >= 3 && ((i == 0 && j == 1) || (i > 0 && j == 0)))
+ continue;
+ vp9_tree_probs_from_distribution(
+ MAX_ENTROPY_TOKENS, vp9_coef_encodings, vp9_coef_tree,
+ coef_probs, branch_ct, cm->fc.hybrid_coef_counts [i][j][k],
+ 256, 1);
+ for (t = 0; t < ENTROPY_NODES; ++t) {
+ int prob;
+ count = branch_ct[t][0] + branch_ct[t][1];
+ count = count > count_sat ? count_sat : count;
+ factor = (update_factor * count / count_sat);
+ prob = ((int)cm->fc.pre_hybrid_coef_probs[i][j][k][t] * (256 - factor) +
+ (int)coef_probs[t] * factor + 128) >> 8;
+ if (prob <= 0) cm->fc.hybrid_coef_probs[i][j][k][t] = 1;
+ else if (prob > 255) cm->fc.hybrid_coef_probs[i][j][k][t] = 255;
+ else cm->fc.hybrid_coef_probs[i][j][k][t] = prob;
+ }
+ }
+
+ for (i = 0; i < BLOCK_TYPES_8X8; ++i)
+ for (j = 0; j < COEF_BANDS; ++j)
+ for (k = 0; k < PREV_COEF_CONTEXTS; ++k) {
+ if (k >= 3 && ((i == 0 && j == 1) || (i > 0 && j == 0)))
+ continue;
+ vp9_tree_probs_from_distribution(
+ MAX_ENTROPY_TOKENS, vp9_coef_encodings, vp9_coef_tree,
+ coef_probs, branch_ct, cm->fc.coef_counts_8x8 [i][j][k],
+ 256, 1);
+ for (t = 0; t < ENTROPY_NODES; ++t) {
+ int prob;
+ count = branch_ct[t][0] + branch_ct[t][1];
+ count = count > count_sat ? count_sat : count;
+ factor = (update_factor * count / count_sat);
+ prob = ((int)cm->fc.pre_coef_probs_8x8[i][j][k][t] * (256 - factor) +
+ (int)coef_probs[t] * factor + 128) >> 8;
+ if (prob <= 0) cm->fc.coef_probs_8x8[i][j][k][t] = 1;
+ else if (prob > 255) cm->fc.coef_probs_8x8[i][j][k][t] = 255;
+ else cm->fc.coef_probs_8x8[i][j][k][t] = prob;
+ }
+ }
+
+ for (i = 0; i < BLOCK_TYPES_8X8; ++i)
+ for (j = 0; j < COEF_BANDS; ++j)
+ for (k = 0; k < PREV_COEF_CONTEXTS; ++k) {
+ if (k >= 3 && ((i == 0 && j == 1) || (i > 0 && j == 0)))
+ continue;
+ vp9_tree_probs_from_distribution(
+ MAX_ENTROPY_TOKENS, vp9_coef_encodings, vp9_coef_tree,
+ coef_probs, branch_ct, cm->fc.hybrid_coef_counts_8x8 [i][j][k],
+ 256, 1);
+ for (t = 0; t < ENTROPY_NODES; ++t) {
+ int prob;
+ count = branch_ct[t][0] + branch_ct[t][1];
+ count = count > count_sat ? count_sat : count;
+ factor = (update_factor * count / count_sat);
+ prob = ((int)cm->fc.pre_hybrid_coef_probs_8x8[i][j][k][t] *
+ (256 - factor) +
+ (int)coef_probs[t] * factor + 128) >> 8;
+ if (prob <= 0) cm->fc.hybrid_coef_probs_8x8[i][j][k][t] = 1;
+ else if (prob > 255) cm->fc.hybrid_coef_probs_8x8[i][j][k][t] = 255;
+ else cm->fc.hybrid_coef_probs_8x8[i][j][k][t] = prob;
+ }
+ }
+
+ for (i = 0; i < BLOCK_TYPES_16X16; ++i)
+ for (j = 0; j < COEF_BANDS; ++j)
+ for (k = 0; k < PREV_COEF_CONTEXTS; ++k) {
+ if (k >= 3 && ((i == 0 && j == 1) || (i > 0 && j == 0)))
+ continue;
+ vp9_tree_probs_from_distribution(
+ MAX_ENTROPY_TOKENS, vp9_coef_encodings, vp9_coef_tree,
+ coef_probs, branch_ct, cm->fc.coef_counts_16x16[i][j][k], 256, 1);
+ for (t = 0; t < ENTROPY_NODES; ++t) {
+ int prob;
+ count = branch_ct[t][0] + branch_ct[t][1];
+ count = count > count_sat ? count_sat : count;
+ factor = (update_factor * count / count_sat);
+ prob = ((int)cm->fc.pre_coef_probs_16x16[i][j][k][t] *
+ (256 - factor) +
+ (int)coef_probs[t] * factor + 128) >> 8;
+ if (prob <= 0) cm->fc.coef_probs_16x16[i][j][k][t] = 1;
+ else if (prob > 255) cm->fc.coef_probs_16x16[i][j][k][t] = 255;
+ else cm->fc.coef_probs_16x16[i][j][k][t] = prob;
+ }
+ }
+
+ for (i = 0; i < BLOCK_TYPES_16X16; ++i)
+ for (j = 0; j < COEF_BANDS; ++j)
+ for (k = 0; k < PREV_COEF_CONTEXTS; ++k) {
+ if (k >= 3 && ((i == 0 && j == 1) || (i > 0 && j == 0)))
+ continue;
+ vp9_tree_probs_from_distribution(
+ MAX_ENTROPY_TOKENS, vp9_coef_encodings, vp9_coef_tree,
+ coef_probs, branch_ct, cm->fc.hybrid_coef_counts_16x16[i][j][k], 256, 1);
+ for (t = 0; t < ENTROPY_NODES; ++t) {
+ int prob;
+ count = branch_ct[t][0] + branch_ct[t][1];
+ count = count > count_sat ? count_sat : count;
+ factor = (update_factor * count / count_sat);
+ prob = ((int)cm->fc.pre_hybrid_coef_probs_16x16[i][j][k][t] * (256 - factor) +
+ (int)coef_probs[t] * factor + 128) >> 8;
+ if (prob <= 0) cm->fc.hybrid_coef_probs_16x16[i][j][k][t] = 1;
+ else if (prob > 255) cm->fc.hybrid_coef_probs_16x16[i][j][k][t] = 255;
+ else cm->fc.hybrid_coef_probs_16x16[i][j][k][t] = prob;
+ }
+ }
+}
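
Every loop above applies the same count-saturated blend: the probability carried
over from the previous frame is mixed with a probability freshly derived from the
accumulated branch counts, the new estimate is weighted by how many samples back
it up (capped at count_sat), and the result is clamped to the legal range [1, 255].
A minimal standalone sketch of that update rule (names here are illustrative, not
part of the libvpx API):

/* Blend a previous probability with a newly measured one.  count_sat caps the
 * influence of sparse statistics; max_update_factor (out of 256) limits how far
 * the probability can move in a single frame. */
static unsigned char adapt_prob_sketch(unsigned char pre_prob,
                                       unsigned char new_prob,
                                       int count, int count_sat,
                                       int max_update_factor) {
  int factor, prob;

  if (count > count_sat)
    count = count_sat;
  factor = max_update_factor * count / count_sat;
  prob = ((int)pre_prob * (256 - factor) + (int)new_prob * factor + 128) >> 8;

  /* Probabilities 0 and 256 are not representable; clamp to [1, 255]. */
  if (prob <= 0) return 1;
  if (prob > 255) return 255;
  return (unsigned char)prob;
}
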
--- /dev/null
+++ b/vp9/common/entropy.h
@@ -1,0 +1,112 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef __INC_ENTROPY_H
+#define __INC_ENTROPY_H
+
+#include "treecoder.h"
+#include "blockd.h"
+#include "common.h"
+#include "coefupdateprobs.h"
+
+extern const int vp9_i8x8_block[4];
+
+/* Coefficient token alphabet */
+
+#define ZERO_TOKEN 0 /* 0 Extra Bits 0+0 */
+#define ONE_TOKEN 1 /* 1 Extra Bits 0+1 */
+#define TWO_TOKEN 2 /* 2 Extra Bits 0+1 */
+#define THREE_TOKEN 3 /* 3 Extra Bits 0+1 */
+#define FOUR_TOKEN 4 /* 4 Extra Bits 0+1 */
+#define DCT_VAL_CATEGORY1 5 /* 5-6 Extra Bits 1+1 */
+#define DCT_VAL_CATEGORY2 6 /* 7-10 Extra Bits 2+1 */
+#define DCT_VAL_CATEGORY3 7 /* 11-18 Extra Bits 3+1 */
+#define DCT_VAL_CATEGORY4 8 /* 19-34 Extra Bits 4+1 */
+#define DCT_VAL_CATEGORY5 9 /* 35-66 Extra Bits 5+1 */
+#define DCT_VAL_CATEGORY6 10 /* 67+ Extra Bits 13+1 */
+#define DCT_EOB_TOKEN 11 /* EOB Extra Bits 0+0 */
+#define MAX_ENTROPY_TOKENS 12
+#define ENTROPY_NODES 11
+#define EOSB_TOKEN 127 /* Not signalled, encoder only */
+
+extern const vp9_tree_index vp9_coef_tree[];
+
+extern struct vp9_token_struct vp9_coef_encodings[MAX_ENTROPY_TOKENS];
+
+typedef struct {
+ vp9_tree_p tree;
+ const vp9_prob *prob;
+ int Len;
+ int base_val;
+} vp9_extra_bit_struct;
+
+extern vp9_extra_bit_struct vp9_extra_bits[12]; /* indexed by token value */
+
+#define PROB_UPDATE_BASELINE_COST 7
+
+#define MAX_PROB 255
+#define DCT_MAX_VALUE 8192
+
+/* Coefficients are predicted via a 3-dimensional probability table. */
+
+/* Outside dimension. 0 = Y no DC, 1 = Y2, 2 = UV, 3 = Y with DC */
+#define BLOCK_TYPES 4
+
+#define BLOCK_TYPES_8X8 4
+
+#define BLOCK_TYPES_16X16 4
+
+/* Middle dimension is a coarsening of the coefficient's
+ position within the 4x4 DCT. */
+
+#define COEF_BANDS 8
+extern DECLARE_ALIGNED(16, const int, vp9_coef_bands[16]);
+extern DECLARE_ALIGNED(64, const int, vp9_coef_bands_8x8[64]);
+extern DECLARE_ALIGNED(16, const int, vp9_coef_bands_16x16[256]);
+
+/* Inside dimension is 3-valued measure of nearby complexity, that is,
+ the extent to which nearby coefficients are nonzero. For the first
+ coefficient (DC, unless block type is 0), we look at the (already encoded)
+ blocks above and to the left of the current block. The context index is
+   then the number (0, 1, or 2) of these blocks having nonzero coefficients.
+   After decoding a coefficient, the measure is roughly the size of the
+   most recently decoded coefficient (0 for 0, 1 for 1, 2 for >1).
+   Note that the intuitive meaning of this measure changes as coefficients
+   are decoded: prior to the first token, a zero means that the neighboring
+   blocks are empty while, after the first token, because of the use of
+   end-of-block, a zero means we just decoded a zero and hence guarantees
+   that a non-zero coefficient will appear later in this block.  However,
+   this shift in meaning is perfectly OK because our context also depends on
+   the coefficient band (and zigzag positions 0, 1, and 2 are in distinct
+   bands). */
+
+/*# define DC_TOKEN_CONTEXTS 3*/ /* 00, 0!0, !0!0 */
+#define PREV_COEF_CONTEXTS 4
+
+#define SUBEXP_PARAM 4 /* Subexponential code parameter */
+#define MODULUS_PARAM 13 /* Modulus parameter */
+
+extern DECLARE_ALIGNED(16, const unsigned char, vp9_prev_token_class[MAX_ENTROPY_TOKENS]);
+
+struct VP9Common;
+void vp9_default_coef_probs(struct VP9Common *);
+extern DECLARE_ALIGNED(16, const int, vp9_default_zig_zag1d[16]);
+
+extern DECLARE_ALIGNED(16, const int, vp9_col_scan[16]);
+extern DECLARE_ALIGNED(16, const int, vp9_row_scan[16]);
+
+extern DECLARE_ALIGNED(64, const int, vp9_default_zig_zag1d_8x8[64]);
+void vp9_coef_tree_initialize(void);
+
+extern DECLARE_ALIGNED(16, const int, vp9_default_zig_zag1d_16x16[256]);
+void vp9_adapt_coef_probs(struct VP9Common *);
+
+#endif
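
For orientation, the token alphabet above partitions coefficient magnitudes into
fixed ranges, with the DCT_VAL_CATEGORY tokens covering progressively wider
intervals that are refined by extra bits. An illustrative mapping from an
absolute coefficient value to the covering token, following the ranges documented
in the header (the real tokenizer lives in the encoder and also emits the sign
and the extra bits):

static int coef_token_for_magnitude(int abs_value) {
  if (abs_value <= 4)  return abs_value;  /* ZERO_TOKEN..FOUR_TOKEN (0..4) */
  if (abs_value <= 6)  return 5;          /* DCT_VAL_CATEGORY1: 5-6        */
  if (abs_value <= 10) return 6;          /* DCT_VAL_CATEGORY2: 7-10       */
  if (abs_value <= 18) return 7;          /* DCT_VAL_CATEGORY3: 11-18      */
  if (abs_value <= 34) return 8;          /* DCT_VAL_CATEGORY4: 19-34      */
  if (abs_value <= 66) return 9;          /* DCT_VAL_CATEGORY5: 35-66      */
  return 10;                              /* DCT_VAL_CATEGORY6: 67+        */
}
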
--- /dev/null
+++ b/vp9/common/entropymode.c
@@ -1,0 +1,614 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "onyxc_int.h"
+#include "modecont.h"
+#include "vpx_mem/vpx_mem.h"
+
+
+static const unsigned int kf_y_mode_cts[8][VP9_YMODES] = {
+ /* DC V H D45 135 117 153 D27 D63 TM i8x8 BPRED */
+ {12, 6, 5, 5, 5, 5, 5, 5, 5, 2, 22, 200},
+ {25, 13, 13, 7, 7, 7, 7, 7, 7, 6, 27, 160},
+ {31, 17, 18, 8, 8, 8, 8, 8, 8, 9, 26, 139},
+ {40, 22, 23, 8, 8, 8, 8, 8, 8, 12, 27, 116},
+ {53, 26, 28, 8, 8, 8, 8, 8, 8, 13, 26, 94},
+ {68, 33, 35, 8, 8, 8, 8, 8, 8, 17, 20, 68},
+ {78, 38, 38, 8, 8, 8, 8, 8, 8, 19, 16, 52},
+ {89, 42, 42, 8, 8, 8, 8, 8, 8, 21, 12, 34},
+};
+
+static const unsigned int y_mode_cts [VP9_YMODES] = {
+ /* DC V H D45 135 117 153 D27 D63 TM i8x8 BPRED */
+ 98, 19, 15, 14, 14, 14, 14, 12, 12, 13, 16, 70
+};
+
+static const unsigned int uv_mode_cts [VP9_YMODES] [VP9_UV_MODES] = {
+ /* DC V H D45 135 117 153 D27 D63 TM */
+ { 200, 15, 15, 10, 10, 10, 10, 10, 10, 6}, /* DC */
+ { 130, 75, 10, 10, 10, 10, 10, 10, 10, 6}, /* V */
+ { 130, 10, 75, 10, 10, 10, 10, 10, 10, 6}, /* H */
+ { 130, 15, 10, 75, 10, 10, 10, 10, 10, 6}, /* D45 */
+ { 150, 15, 10, 10, 75, 10, 10, 10, 10, 6}, /* D135 */
+ { 150, 15, 10, 10, 10, 75, 10, 10, 10, 6}, /* D117 */
+ { 150, 15, 10, 10, 10, 10, 75, 10, 10, 6}, /* D153 */
+ { 150, 15, 10, 10, 10, 10, 10, 75, 10, 6}, /* D27 */
+ { 150, 15, 10, 10, 10, 10, 10, 10, 75, 6}, /* D63 */
+ { 160, 30, 30, 10, 10, 10, 10, 10, 10, 16}, /* TM */
+ { 132, 46, 40, 10, 10, 10, 10, 10, 10, 18}, /* i8x8 - never used */
+ { 150, 35, 41, 10, 10, 10, 10, 10, 10, 10}, /* BPRED */
+};
+
+static const unsigned int i8x8_mode_cts [VP9_I8X8_MODES] = {
+ /* DC V H D45 135 117 153 D27 D63 TM */
+ 73, 49, 61, 30, 30, 30, 30, 30, 30, 13
+};
+
+static const unsigned int kf_uv_mode_cts [VP9_YMODES] [VP9_UV_MODES] = {
+ // DC V H D45 135 117 153 D27 D63 TM
+ { 160, 24, 24, 20, 20, 20, 20, 20, 20, 8}, /* DC */
+ { 102, 64, 30, 20, 20, 20, 20, 20, 20, 10}, /* V */
+ { 102, 30, 64, 20, 20, 20, 20, 20, 20, 10}, /* H */
+ { 102, 33, 20, 64, 20, 20, 20, 20, 20, 14}, /* D45 */
+ { 102, 33, 20, 20, 64, 20, 20, 20, 20, 14}, /* D135 */
+ { 122, 33, 20, 20, 20, 64, 20, 20, 20, 14}, /* D117 */
+ { 102, 33, 20, 20, 20, 20, 64, 20, 20, 14}, /* D153 */
+ { 102, 33, 20, 20, 20, 20, 20, 64, 20, 14}, /* D27 */
+ { 102, 33, 20, 20, 20, 20, 20, 20, 64, 14}, /* D63 */
+ { 132, 36, 30, 20, 20, 20, 20, 20, 20, 18}, /* TM */
+ { 122, 41, 35, 20, 20, 20, 20, 20, 20, 18}, /* i8x8 - never used */
+ { 122, 41, 35, 20, 20, 20, 20, 20, 20, 18}, /* BPRED */
+};
+
+static const unsigned int bmode_cts[VP9_BINTRAMODES] = {
+ /* DC TM VE HE LD RD VR VL HD HU */
+ 43891, 17694, 10036, 3920, 3363, 2546, 5119, 3221, 2471, 1723
+};
+
+typedef enum {
+ SUBMVREF_NORMAL,
+ SUBMVREF_LEFT_ZED,
+ SUBMVREF_ABOVE_ZED,
+ SUBMVREF_LEFT_ABOVE_SAME,
+ SUBMVREF_LEFT_ABOVE_ZED
+} submvref_t;
+
+int vp9_mv_cont(const int_mv *l, const int_mv *a) {
+ int lez = (l->as_int == 0);
+ int aez = (a->as_int == 0);
+ int lea = (l->as_int == a->as_int);
+
+ if (lea && lez)
+ return SUBMVREF_LEFT_ABOVE_ZED;
+
+ if (lea)
+ return SUBMVREF_LEFT_ABOVE_SAME;
+
+ if (aez)
+ return SUBMVREF_ABOVE_ZED;
+
+ if (lez)
+ return SUBMVREF_LEFT_ZED;
+
+ return SUBMVREF_NORMAL;
+}
+
+const vp9_prob vp9_sub_mv_ref_prob [VP9_SUBMVREFS - 1] = { 180, 162, 25};
+
+const vp9_prob vp9_sub_mv_ref_prob2 [SUBMVREF_COUNT][VP9_SUBMVREFS - 1] = {
+ { 147, 136, 18 },
+ { 106, 145, 1 },
+ { 179, 121, 1 },
+ { 223, 1, 34 },
+ { 208, 1, 1 }
+};
+
+vp9_mbsplit vp9_mbsplits [VP9_NUMMBSPLITS] = {
+ {
+ 0, 0, 0, 0,
+ 0, 0, 0, 0,
+ 1, 1, 1, 1,
+ 1, 1, 1, 1,
+ }, {
+ 0, 0, 1, 1,
+ 0, 0, 1, 1,
+ 0, 0, 1, 1,
+ 0, 0, 1, 1,
+ }, {
+ 0, 0, 1, 1,
+ 0, 0, 1, 1,
+ 2, 2, 3, 3,
+ 2, 2, 3, 3,
+ }, {
+ 0, 1, 2, 3,
+ 4, 5, 6, 7,
+ 8, 9, 10, 11,
+ 12, 13, 14, 15,
+ },
+};
+
+const int vp9_mbsplit_count [VP9_NUMMBSPLITS] = { 2, 2, 4, 16};
+
+const vp9_prob vp9_mbsplit_probs [VP9_NUMMBSPLITS - 1] = { 110, 111, 150};
+
+/* Array indices are identical to previously-existing INTRAMODECONTEXTNODES. */
+
+const vp9_tree_index vp9_bmode_tree[VP9_BINTRAMODES * 2 - 2] = /* INTRAMODECONTEXTNODE value */
+{
+ -B_DC_PRED, 2, /* 0 = DC_NODE */
+ -B_TM_PRED, 4, /* 1 = TM_NODE */
+ -B_VE_PRED, 6, /* 2 = VE_NODE */
+ 8, 12, /* 3 = COM_NODE */
+ -B_HE_PRED, 10, /* 4 = HE_NODE */
+ -B_RD_PRED, -B_VR_PRED, /* 5 = RD_NODE */
+ -B_LD_PRED, 14, /* 6 = LD_NODE */
+ -B_VL_PRED, 16, /* 7 = VL_NODE */
+ -B_HD_PRED, -B_HU_PRED /* 8 = HD_NODE */
+};
+
+/* Again, these trees use the same probability indices as their
+ explicitly-programmed predecessors. */
+const vp9_tree_index vp9_ymode_tree[VP9_YMODES * 2 - 2] = {
+ 2, 14,
+ -DC_PRED, 4,
+ 6, 8,
+ -D45_PRED, -D135_PRED,
+ 10, 12,
+ -D117_PRED, -D153_PRED,
+ -D27_PRED, -D63_PRED,
+ 16, 18,
+ -V_PRED, -H_PRED,
+ -TM_PRED, 20,
+ -B_PRED, -I8X8_PRED
+};
+
+const vp9_tree_index vp9_kf_ymode_tree[VP9_YMODES * 2 - 2] = {
+ 2, 14,
+ -DC_PRED, 4,
+ 6, 8,
+ -D45_PRED, -D135_PRED,
+ 10, 12,
+ -D117_PRED, -D153_PRED,
+ -D27_PRED, -D63_PRED,
+ 16, 18,
+ -V_PRED, -H_PRED,
+ -TM_PRED, 20,
+ -B_PRED, -I8X8_PRED
+};
+
+const vp9_tree_index vp9_i8x8_mode_tree[VP9_I8X8_MODES * 2 - 2] = {
+ 2, 14,
+ -DC_PRED, 4,
+ 6, 8,
+ -D45_PRED, -D135_PRED,
+ 10, 12,
+ -D117_PRED, -D153_PRED,
+ -D27_PRED, -D63_PRED,
+ -V_PRED, 16,
+ -H_PRED, -TM_PRED
+};
+
+const vp9_tree_index vp9_uv_mode_tree[VP9_UV_MODES * 2 - 2] = {
+ 2, 14,
+ -DC_PRED, 4,
+ 6, 8,
+ -D45_PRED, -D135_PRED,
+ 10, 12,
+ -D117_PRED, -D153_PRED,
+ -D27_PRED, -D63_PRED,
+ -V_PRED, 16,
+ -H_PRED, -TM_PRED
+};
+
+const vp9_tree_index vp9_mbsplit_tree[6] = {
+ -PARTITIONING_4X4, 2,
+ -PARTITIONING_8X8, 4,
+ -PARTITIONING_16X8, -PARTITIONING_8X16,
+};
+
+const vp9_tree_index vp9_mv_ref_tree[8] = {
+ -ZEROMV, 2,
+ -NEARESTMV, 4,
+ -NEARMV, 6,
+ -NEWMV, -SPLITMV
+};
+
+#if CONFIG_SUPERBLOCKS
+const vp9_tree_index vp9_sb_mv_ref_tree[6] = {
+ -ZEROMV, 2,
+ -NEARESTMV, 4,
+ -NEARMV, -NEWMV
+};
+#endif
+
+const vp9_tree_index vp9_sub_mv_ref_tree[6] = {
+ -LEFT4X4, 2,
+ -ABOVE4X4, 4,
+ -ZERO4X4, -NEW4X4
+};
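
All of the *_tree tables above share one compact layout: each even/odd index pair
describes an internal node, a non-negative entry is the index of the next node
pair, and a negative entry is the negated symbol value of a leaf. A rough sketch
of how a decoder would walk such a table; the element type and the probability
indexing (one probability per node, at index node/2) are assumptions based on
these tables, and next_bit() stands in for the arithmetic bool decoder, which is
not part of this file:

typedef signed char tree_index_t;  /* assumed to match vp9_tree_index */

static int read_tree_symbol(const tree_index_t *tree,
                            const unsigned char *probs,
                            int (*next_bit)(void *ctx, int prob), void *ctx) {
  tree_index_t i = 0;
  do {
    i = tree[i + next_bit(ctx, probs[i >> 1])];  /* follow the chosen branch */
  } while (i > 0);                               /* leaves are <= 0 */
  return -i;  /* e.g. ZEROMV..SPLITMV when walking vp9_mv_ref_tree */
}
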
+
+struct vp9_token_struct vp9_bmode_encodings [VP9_BINTRAMODES];
+struct vp9_token_struct vp9_ymode_encodings [VP9_YMODES];
+#if CONFIG_SUPERBLOCKS
+struct vp9_token_struct vp9_sb_kf_ymode_encodings [VP9_I32X32_MODES];
+#endif
+struct vp9_token_struct vp9_kf_ymode_encodings [VP9_YMODES];
+struct vp9_token_struct vp9_uv_mode_encodings [VP9_UV_MODES];
+struct vp9_token_struct vp9_i8x8_mode_encodings [VP9_I8X8_MODES];
+struct vp9_token_struct vp9_mbsplit_encodings [VP9_NUMMBSPLITS];
+
+struct vp9_token_struct vp9_mv_ref_encoding_array [VP9_MVREFS];
+#if CONFIG_SUPERBLOCKS
+struct vp9_token_struct vp9_sb_mv_ref_encoding_array [VP9_MVREFS];
+#endif
+struct vp9_token_struct vp9_sub_mv_ref_encoding_array [VP9_SUBMVREFS];
+
+void vp9_init_mbmode_probs(VP9_COMMON *x) {
+ unsigned int bct [VP9_YMODES] [2]; /* num Ymodes > num UV modes */
+
+ vp9_tree_probs_from_distribution(VP9_YMODES, vp9_ymode_encodings,
+ vp9_ymode_tree, x->fc.ymode_prob,
+ bct, y_mode_cts, 256, 1);
+ {
+ int i;
+ for (i = 0; i < 8; i++) {
+ vp9_tree_probs_from_distribution(VP9_YMODES, vp9_kf_ymode_encodings,
+ vp9_kf_ymode_tree, x->kf_ymode_prob[i],
+ bct, kf_y_mode_cts[i], 256, 1);
+#if CONFIG_SUPERBLOCKS
+ vp9_tree_probs_from_distribution(VP9_I32X32_MODES,
+ vp9_sb_kf_ymode_encodings,
+ vp9_sb_ymode_tree,
+ x->sb_kf_ymode_prob[i], bct,
+ kf_y_mode_cts[i], 256, 1);
+#endif
+ }
+ }
+ {
+ int i;
+ for (i = 0; i < VP9_YMODES; i++) {
+ vp9_tree_probs_from_distribution(VP9_UV_MODES, vp9_uv_mode_encodings,
+ vp9_uv_mode_tree, x->kf_uv_mode_prob[i],
+ bct, kf_uv_mode_cts[i], 256, 1);
+ vp9_tree_probs_from_distribution(VP9_UV_MODES, vp9_uv_mode_encodings,
+ vp9_uv_mode_tree, x->fc.uv_mode_prob[i],
+ bct, uv_mode_cts[i], 256, 1);
+ }
+ }
+
+ vp9_tree_probs_from_distribution(VP9_I8X8_MODES, vp9_i8x8_mode_encodings,
+ vp9_i8x8_mode_tree, x->fc.i8x8_mode_prob,
+ bct, i8x8_mode_cts, 256, 1);
+
+ vpx_memcpy(x->fc.sub_mv_ref_prob, vp9_sub_mv_ref_prob2,
+ sizeof(vp9_sub_mv_ref_prob2));
+ vpx_memcpy(x->fc.mbsplit_prob, vp9_mbsplit_probs, sizeof(vp9_mbsplit_probs));
+ vpx_memcpy(x->fc.switchable_interp_prob, vp9_switchable_interp_prob,
+ sizeof(vp9_switchable_interp_prob));
+}
+
+
+static void intra_bmode_probs_from_distribution(
+ vp9_prob p [VP9_BINTRAMODES - 1],
+ unsigned int branch_ct [VP9_BINTRAMODES - 1] [2],
+ const unsigned int events [VP9_BINTRAMODES]) {
+ vp9_tree_probs_from_distribution(VP9_BINTRAMODES, vp9_bmode_encodings,
+ vp9_bmode_tree, p, branch_ct,
+ events, 256, 1);
+}
+
+void vp9_default_bmode_probs(vp9_prob p [VP9_BINTRAMODES - 1]) {
+ unsigned int branch_ct [VP9_BINTRAMODES - 1] [2];
+ intra_bmode_probs_from_distribution(p, branch_ct, bmode_cts);
+}
+
+void vp9_kf_default_bmode_probs(vp9_prob p[VP9_BINTRAMODES][VP9_BINTRAMODES]
+ [VP9_BINTRAMODES - 1]) {
+ unsigned int branch_ct[VP9_BINTRAMODES - 1][2];
+ int i, j;
+
+ for (i = 0; i < VP9_BINTRAMODES; i++) {
+ for (j = 0; j < VP9_BINTRAMODES; j++) {
+ intra_bmode_probs_from_distribution(
+ p[i][j], branch_ct, vp9_kf_default_bmode_counts[i][j]);
+ }
+ }
+}
+
+#if VP9_SWITCHABLE_FILTERS == 3
+const vp9_tree_index vp9_switchable_interp_tree[VP9_SWITCHABLE_FILTERS*2-2] = {
+ -0, 2,
+ -1, -2
+};
+struct vp9_token_struct vp9_switchable_interp_encodings[VP9_SWITCHABLE_FILTERS];
+const INTERPOLATIONFILTERTYPE vp9_switchable_interp[VP9_SWITCHABLE_FILTERS] = {
+ EIGHTTAP, SIXTAP, EIGHTTAP_SHARP};
+const int vp9_switchable_interp_map[SWITCHABLE+1] = {1, -1, 0, 2, -1};
+const vp9_prob vp9_switchable_interp_prob [VP9_SWITCHABLE_FILTERS+1]
+ [VP9_SWITCHABLE_FILTERS-1] = {
+ {248, 192}, { 32, 248}, { 32, 32}, {192, 160}
+};
+#elif VP9_SWITCHABLE_FILTERS == 2
+const vp9_tree_index vp9_switchable_interp_tree[VP9_SWITCHABLE_FILTERS*2-2] = {
+ -0, -1,
+};
+struct vp9_token_struct vp9_switchable_interp_encodings[VP9_SWITCHABLE_FILTERS];
+const vp9_prob vp9_switchable_interp_prob [VP9_SWITCHABLE_FILTERS+1]
+ [VP9_SWITCHABLE_FILTERS-1] = {
+ {248},
+ { 64},
+ {192},
+};
+const INTERPOLATIONFILTERTYPE vp9_switchable_interp[VP9_SWITCHABLE_FILTERS] = {
+ EIGHTTAP, EIGHTTAP_SHARP};
+const int vp9_switchable_interp_map[SWITCHABLE+1] = {-1, -1, 0, 1, -1}; // EIGHTTAP, EIGHTTAP_SHARP
+#endif
+
+void vp9_entropy_mode_init() {
+ vp9_tokens_from_tree(vp9_bmode_encodings, vp9_bmode_tree);
+ vp9_tokens_from_tree(vp9_ymode_encodings, vp9_ymode_tree);
+ vp9_tokens_from_tree(vp9_kf_ymode_encodings, vp9_kf_ymode_tree);
+#if CONFIG_SUPERBLOCKS
+ vp9_tokens_from_tree(vp9_sb_kf_ymode_encodings, vp9_sb_ymode_tree);
+#endif
+ vp9_tokens_from_tree(vp9_uv_mode_encodings, vp9_uv_mode_tree);
+ vp9_tokens_from_tree(vp9_i8x8_mode_encodings, vp9_i8x8_mode_tree);
+ vp9_tokens_from_tree(vp9_mbsplit_encodings, vp9_mbsplit_tree);
+ vp9_tokens_from_tree(vp9_switchable_interp_encodings,
+ vp9_switchable_interp_tree);
+
+ vp9_tokens_from_tree_offset(vp9_mv_ref_encoding_array,
+ vp9_mv_ref_tree, NEARESTMV);
+#if CONFIG_SUPERBLOCKS
+ vp9_tokens_from_tree_offset(vp9_sb_mv_ref_encoding_array,
+ vp9_sb_mv_ref_tree, NEARESTMV);
+#endif
+ vp9_tokens_from_tree_offset(vp9_sub_mv_ref_encoding_array,
+ vp9_sub_mv_ref_tree, LEFT4X4);
+}
+
+void vp9_init_mode_contexts(VP9_COMMON *pc) {
+ vpx_memset(pc->fc.mv_ref_ct, 0, sizeof(pc->fc.mv_ref_ct));
+ vpx_memset(pc->fc.mv_ref_ct_a, 0, sizeof(pc->fc.mv_ref_ct_a));
+
+ vpx_memcpy(pc->fc.mode_context,
+ vp9_default_mode_contexts,
+ sizeof(pc->fc.mode_context));
+ vpx_memcpy(pc->fc.mode_context_a,
+ vp9_default_mode_contexts_a,
+ sizeof(pc->fc.mode_context_a));
+
+}
+
+void vp9_accum_mv_refs(VP9_COMMON *pc,
+ MB_PREDICTION_MODE m,
+ const int ct[4]) {
+ int (*mv_ref_ct)[4][2];
+
+ if (pc->refresh_alt_ref_frame)
+ mv_ref_ct = pc->fc.mv_ref_ct_a;
+ else
+ mv_ref_ct = pc->fc.mv_ref_ct;
+
+ if (m == ZEROMV) {
+ ++mv_ref_ct [ct[0]] [0] [0];
+ } else {
+ ++mv_ref_ct [ct[0]] [0] [1];
+ if (m == NEARESTMV) {
+ ++mv_ref_ct [ct[1]] [1] [0];
+ } else {
+ ++mv_ref_ct [ct[1]] [1] [1];
+ if (m == NEARMV) {
+ ++mv_ref_ct [ct[2]] [2] [0];
+ } else {
+ ++mv_ref_ct [ct[2]] [2] [1];
+ if (m == NEWMV) {
+ ++mv_ref_ct [ct[3]] [3] [0];
+ } else {
+ ++mv_ref_ct [ct[3]] [3] [1];
+ }
+ }
+ }
+ }
+}
+
+#define MVREF_COUNT_SAT 20
+#define MVREF_MAX_UPDATE_FACTOR 144
+void vp9_update_mode_context(VP9_COMMON *pc) {
+ int i, j;
+ int (*mv_ref_ct)[4][2];
+ int (*mode_context)[4];
+
+ if (pc->refresh_alt_ref_frame) {
+ mv_ref_ct = pc->fc.mv_ref_ct_a;
+ mode_context = pc->fc.mode_context_a;
+ } else {
+ mv_ref_ct = pc->fc.mv_ref_ct;
+ mode_context = pc->fc.mode_context;
+ }
+
+ for (j = 0; j < 6; j++) {
+ for (i = 0; i < 4; i++) {
+ int this_prob;
+ int count = mv_ref_ct[j][i][0] + mv_ref_ct[j][i][1];
+ int factor;
+ {
+ this_prob = count > 0 ? 256 * mv_ref_ct[j][i][0] / count : 128;
+ count = count > MVREF_COUNT_SAT ? MVREF_COUNT_SAT : count;
+ factor = (MVREF_MAX_UPDATE_FACTOR * count / MVREF_COUNT_SAT);
+ this_prob = (pc->fc.vp8_mode_contexts[j][i] * (256 - factor) +
+ this_prob * factor + 128) >> 8;
+ this_prob = this_prob ? (this_prob < 255 ? this_prob : 255) : 1;
+ mode_context[j][i] = this_prob;
+ }
+ }
+ }
+}
+
+#ifdef MODE_STATS
+#include "vp9/common/modecont.h"
+void print_mode_contexts(VP9_COMMON *pc) {
+ int j, i;
+ printf("\n====================\n");
+ for (j = 0; j < 6; j++) {
+ for (i = 0; i < 4; i++) {
+ printf("%4d ", pc->fc.mode_context[j][i]);
+ }
+ printf("\n");
+ }
+ printf("====================\n");
+ for (j = 0; j < 6; j++) {
+ for (i = 0; i < 4; i++) {
+ printf("%4d ", pc->fc.mode_context_a[j][i]);
+ }
+ printf("\n");
+ }
+}
+#endif
+
+// #define MODE_COUNT_TESTING
+#define MODE_COUNT_SAT 20
+#define MODE_MAX_UPDATE_FACTOR 144
+void vp9_adapt_mode_probs(VP9_COMMON *cm) {
+ int i, t, count, factor;
+ unsigned int branch_ct[32][2];
+ vp9_prob ymode_probs[VP9_YMODES - 1];
+ vp9_prob uvmode_probs[VP9_UV_MODES - 1];
+ vp9_prob bmode_probs[VP9_BINTRAMODES - 1];
+ vp9_prob i8x8_mode_probs[VP9_I8X8_MODES - 1];
+ vp9_prob sub_mv_ref_probs[VP9_SUBMVREFS - 1];
+ vp9_prob mbsplit_probs[VP9_NUMMBSPLITS - 1];
+#ifdef MODE_COUNT_TESTING
+ printf("static const unsigned int\nymode_counts"
+ "[VP9_YMODES] = {\n");
+ for (t = 0; t < VP9_YMODES; ++t) printf("%d, ", cm->fc.ymode_counts[t]);
+ printf("};\n");
+ printf("static const unsigned int\nuv_mode_counts"
+ "[VP9_YMODES] [VP9_UV_MODES] = {\n");
+ for (i = 0; i < VP9_YMODES; ++i) {
+ printf(" {");
+ for (t = 0; t < VP9_UV_MODES; ++t) printf("%d, ", cm->fc.uv_mode_counts[i][t]);
+ printf("},\n");
+ }
+ printf("};\n");
+ printf("static const unsigned int\nbmode_counts"
+ "[VP9_BINTRAMODES] = {\n");
+ for (t = 0; t < VP9_BINTRAMODES; ++t) printf("%d, ", cm->fc.bmode_counts[t]);
+ printf("};\n");
+ printf("static const unsigned int\ni8x8_mode_counts"
+ "[VP9_I8X8_MODES] = {\n");
+ for (t = 0; t < VP9_I8X8_MODES; ++t) printf("%d, ", cm->fc.i8x8_mode_counts[t]);
+ printf("};\n");
+ printf("static const unsigned int\nsub_mv_ref_counts"
+ "[SUBMVREF_COUNT] [VP9_SUBMVREFS] = {\n");
+ for (i = 0; i < SUBMVREF_COUNT; ++i) {
+ printf(" {");
+ for (t = 0; t < VP9_SUBMVREFS; ++t) printf("%d, ", cm->fc.sub_mv_ref_counts[i][t]);
+ printf("},\n");
+ }
+ printf("};\n");
+ printf("static const unsigned int\nmbsplit_counts"
+ "[VP9_NUMMBSPLITS] = {\n");
+ for (t = 0; t < VP9_NUMMBSPLITS; ++t) printf("%d, ", cm->fc.mbsplit_counts[t]);
+ printf("};\n");
+#endif
+ vp9_tree_probs_from_distribution(
+ VP9_YMODES, vp9_ymode_encodings, vp9_ymode_tree,
+ ymode_probs, branch_ct, cm->fc.ymode_counts,
+ 256, 1);
+ for (t = 0; t < VP9_YMODES - 1; ++t) {
+ int prob;
+ count = branch_ct[t][0] + branch_ct[t][1];
+ count = count > MODE_COUNT_SAT ? MODE_COUNT_SAT : count;
+ factor = (MODE_MAX_UPDATE_FACTOR * count / MODE_COUNT_SAT);
+ prob = ((int)cm->fc.pre_ymode_prob[t] * (256 - factor) +
+ (int)ymode_probs[t] * factor + 128) >> 8;
+ if (prob <= 0) cm->fc.ymode_prob[t] = 1;
+ else if (prob > 255) cm->fc.ymode_prob[t] = 255;
+ else cm->fc.ymode_prob[t] = prob;
+ }
+ for (i = 0; i < VP9_YMODES; ++i) {
+ vp9_tree_probs_from_distribution(VP9_UV_MODES, vp9_uv_mode_encodings,
+ vp9_uv_mode_tree, uvmode_probs, branch_ct,
+ cm->fc.uv_mode_counts[i], 256, 1);
+ for (t = 0; t < VP9_UV_MODES - 1; ++t) {
+ int prob;
+ count = branch_ct[t][0] + branch_ct[t][1];
+ count = count > MODE_COUNT_SAT ? MODE_COUNT_SAT : count;
+ factor = (MODE_MAX_UPDATE_FACTOR * count / MODE_COUNT_SAT);
+ prob = ((int)cm->fc.pre_uv_mode_prob[i][t] * (256 - factor) +
+ (int)uvmode_probs[t] * factor + 128) >> 8;
+ if (prob <= 0) cm->fc.uv_mode_prob[i][t] = 1;
+ else if (prob > 255) cm->fc.uv_mode_prob[i][t] = 255;
+ else cm->fc.uv_mode_prob[i][t] = prob;
+ }
+ }
+ vp9_tree_probs_from_distribution(VP9_BINTRAMODES, vp9_bmode_encodings,
+ vp9_bmode_tree, bmode_probs, branch_ct,
+ cm->fc.bmode_counts, 256, 1);
+ for (t = 0; t < VP9_BINTRAMODES - 1; ++t) {
+ int prob;
+ count = branch_ct[t][0] + branch_ct[t][1];
+ count = count > MODE_COUNT_SAT ? MODE_COUNT_SAT : count;
+ factor = (MODE_MAX_UPDATE_FACTOR * count / MODE_COUNT_SAT);
+ prob = ((int)cm->fc.pre_bmode_prob[t] * (256 - factor) +
+ (int)bmode_probs[t] * factor + 128) >> 8;
+ if (prob <= 0) cm->fc.bmode_prob[t] = 1;
+ else if (prob > 255) cm->fc.bmode_prob[t] = 255;
+ else cm->fc.bmode_prob[t] = prob;
+ }
+ vp9_tree_probs_from_distribution(VP9_I8X8_MODES, vp9_i8x8_mode_encodings,
+ vp9_i8x8_mode_tree, i8x8_mode_probs,
+ branch_ct, cm->fc.i8x8_mode_counts, 256, 1);
+ for (t = 0; t < VP9_I8X8_MODES - 1; ++t) {
+ int prob;
+ count = branch_ct[t][0] + branch_ct[t][1];
+ count = count > MODE_COUNT_SAT ? MODE_COUNT_SAT : count;
+ factor = (MODE_MAX_UPDATE_FACTOR * count / MODE_COUNT_SAT);
+ prob = ((int)cm->fc.pre_i8x8_mode_prob[t] * (256 - factor) +
+ (int)i8x8_mode_probs[t] * factor + 128) >> 8;
+ if (prob <= 0) cm->fc.i8x8_mode_prob[t] = 1;
+ else if (prob > 255) cm->fc.i8x8_mode_prob[t] = 255;
+ else cm->fc.i8x8_mode_prob[t] = prob;
+ }
+ for (i = 0; i < SUBMVREF_COUNT; ++i) {
+ vp9_tree_probs_from_distribution(VP9_SUBMVREFS,
+ vp9_sub_mv_ref_encoding_array,
+ vp9_sub_mv_ref_tree, sub_mv_ref_probs,
+ branch_ct, cm->fc.sub_mv_ref_counts[i],
+ 256, 1);
+ for (t = 0; t < VP9_SUBMVREFS - 1; ++t) {
+ int prob;
+ count = branch_ct[t][0] + branch_ct[t][1];
+ count = count > MODE_COUNT_SAT ? MODE_COUNT_SAT : count;
+ factor = (MODE_MAX_UPDATE_FACTOR * count / MODE_COUNT_SAT);
+ prob = ((int)cm->fc.pre_sub_mv_ref_prob[i][t] * (256 - factor) +
+ (int)sub_mv_ref_probs[t] * factor + 128) >> 8;
+ if (prob <= 0) cm->fc.sub_mv_ref_prob[i][t] = 1;
+ else if (prob > 255) cm->fc.sub_mv_ref_prob[i][t] = 255;
+ else cm->fc.sub_mv_ref_prob[i][t] = prob;
+ }
+ }
+ vp9_tree_probs_from_distribution(VP9_NUMMBSPLITS, vp9_mbsplit_encodings,
+ vp9_mbsplit_tree, mbsplit_probs, branch_ct,
+ cm->fc.mbsplit_counts, 256, 1);
+ for (t = 0; t < VP9_NUMMBSPLITS - 1; ++t) {
+ int prob;
+ count = branch_ct[t][0] + branch_ct[t][1];
+ count = count > MODE_COUNT_SAT ? MODE_COUNT_SAT : count;
+ factor = (MODE_MAX_UPDATE_FACTOR * count / MODE_COUNT_SAT);
+ prob = ((int)cm->fc.pre_mbsplit_prob[t] * (256 - factor) +
+ (int)mbsplit_probs[t] * factor + 128) >> 8;
+ if (prob <= 0) cm->fc.mbsplit_prob[t] = 1;
+ else if (prob > 255) cm->fc.mbsplit_prob[t] = 255;
+ else cm->fc.mbsplit_prob[t] = prob;
+ }
+}
--- /dev/null
+++ b/vp9/common/entropymode.h
@@ -1,0 +1,102 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef __INC_ENTROPYMODE_H
+#define __INC_ENTROPYMODE_H
+
+#include "blockd.h"
+#include "treecoder.h"
+
+#define SUBMVREF_COUNT 5
+#define VP9_NUMMBSPLITS 4
+
+typedef const int vp9_mbsplit[16];
+
+extern vp9_mbsplit vp9_mbsplits[VP9_NUMMBSPLITS];
+
+extern const int vp9_mbsplit_count[VP9_NUMMBSPLITS]; /* # of subsets */
+
+extern const vp9_prob vp9_mbsplit_probs[VP9_NUMMBSPLITS - 1];
+
+extern int vp9_mv_cont(const int_mv *l, const int_mv *a);
+
+extern const vp9_prob vp9_sub_mv_ref_prob[VP9_SUBMVREFS - 1];
+
+extern const vp9_prob vp9_sub_mv_ref_prob2[SUBMVREF_COUNT][VP9_SUBMVREFS - 1];
+
+extern const unsigned int vp9_kf_default_bmode_counts[VP9_BINTRAMODES]
+ [VP9_BINTRAMODES]
+ [VP9_BINTRAMODES];
+
+extern const vp9_tree_index vp9_bmode_tree[];
+
+extern const vp9_tree_index vp9_ymode_tree[];
+extern const vp9_tree_index vp9_kf_ymode_tree[];
+extern const vp9_tree_index vp9_uv_mode_tree[];
+#define vp9_sb_ymode_tree vp9_uv_mode_tree
+extern const vp9_tree_index vp9_i8x8_mode_tree[];
+extern const vp9_tree_index vp9_mbsplit_tree[];
+extern const vp9_tree_index vp9_mv_ref_tree[];
+extern const vp9_tree_index vp9_sb_mv_ref_tree[];
+extern const vp9_tree_index vp9_sub_mv_ref_tree[];
+
+extern struct vp9_token_struct vp9_bmode_encodings[VP9_BINTRAMODES];
+extern struct vp9_token_struct vp9_ymode_encodings[VP9_YMODES];
+extern struct vp9_token_struct vp9_sb_kf_ymode_encodings[VP9_I32X32_MODES];
+extern struct vp9_token_struct vp9_kf_ymode_encodings[VP9_YMODES];
+extern struct vp9_token_struct vp9_i8x8_mode_encodings[VP9_I8X8_MODES];
+extern struct vp9_token_struct vp9_uv_mode_encodings[VP9_UV_MODES];
+extern struct vp9_token_struct vp9_mbsplit_encodings[VP9_NUMMBSPLITS];
+
+/* Inter mode values do not start at zero */
+
+extern struct vp9_token_struct vp9_mv_ref_encoding_array[VP9_MVREFS];
+extern struct vp9_token_struct vp9_sb_mv_ref_encoding_array[VP9_MVREFS];
+extern struct vp9_token_struct vp9_sub_mv_ref_encoding_array[VP9_SUBMVREFS];
+
+void vp9_entropy_mode_init(void);
+
+struct VP9Common;
+
+void vp9_init_mbmode_probs(struct VP9Common *x);
+
+extern void vp9_init_mode_contexts(struct VP9Common *pc);
+
+extern void vp9_update_mode_context(struct VP9Common *pc);
+
+extern void vp9_accum_mv_refs(struct VP9Common *pc,
+ MB_PREDICTION_MODE m,
+ const int ct[4]);
+
+void vp9_default_bmode_probs(vp9_prob dest[VP9_BINTRAMODES - 1]);
+
+void vp9_kf_default_bmode_probs(vp9_prob dest[VP9_BINTRAMODES][VP9_BINTRAMODES]
+ [VP9_BINTRAMODES - 1]);
+
+void vp9_adapt_mode_probs(struct VP9Common *);
+
+#define VP9_SWITCHABLE_FILTERS 2 /* number of switchable filters */
+
+extern const INTERPOLATIONFILTERTYPE vp9_switchable_interp
+ [VP9_SWITCHABLE_FILTERS];
+
+extern const int vp9_switchable_interp_map[SWITCHABLE + 1];
+
+extern const vp9_tree_index vp9_switchable_interp_tree
+ [2 * (VP9_SWITCHABLE_FILTERS - 1)];
+
+extern struct vp9_token_struct vp9_switchable_interp_encodings
+ [VP9_SWITCHABLE_FILTERS];
+
+extern const vp9_prob vp9_switchable_interp_prob[VP9_SWITCHABLE_FILTERS + 1]
+ [VP9_SWITCHABLE_FILTERS - 1];
+
+#endif
--- /dev/null
+++ b/vp9/common/entropymv.c
@@ -1,0 +1,465 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "onyxc_int.h"
+#include "entropymv.h"
+
+//#define MV_COUNT_TESTING
+
+#define MV_COUNT_SAT 16
+#define MV_MAX_UPDATE_FACTOR 160
+
+#if CONFIG_NEW_MVREF
+/* Integer pel reference mv threshold for use of high-precision 1/8 mv */
+#define COMPANDED_MVREF_THRESH 1000000
+#else
+/* Integer pel reference mv threshold for use of high-precision 1/8 mv */
+#define COMPANDED_MVREF_THRESH 8
+#endif
+
+/* Smooth or bias the mv-counts before prob computation */
+/* #define SMOOTH_MV_COUNTS */
+
+const vp9_tree_index vp9_mv_joint_tree[2 * MV_JOINTS - 2] = {
+ -MV_JOINT_ZERO, 2,
+ -MV_JOINT_HNZVZ, 4,
+ -MV_JOINT_HZVNZ, -MV_JOINT_HNZVNZ
+};
+struct vp9_token_struct vp9_mv_joint_encodings[MV_JOINTS];
+
+const vp9_tree_index vp9_mv_class_tree[2 * MV_CLASSES - 2] = {
+ -MV_CLASS_0, 2,
+ -MV_CLASS_1, 4,
+ 6, 8,
+ -MV_CLASS_2, -MV_CLASS_3,
+ 10, 12,
+ -MV_CLASS_4, -MV_CLASS_5,
+ -MV_CLASS_6, -MV_CLASS_7,
+};
+struct vp9_token_struct vp9_mv_class_encodings[MV_CLASSES];
+
+const vp9_tree_index vp9_mv_class0_tree [2 * CLASS0_SIZE - 2] = {
+ -0, -1,
+};
+struct vp9_token_struct vp9_mv_class0_encodings[CLASS0_SIZE];
+
+const vp9_tree_index vp9_mv_fp_tree [2 * 4 - 2] = {
+ -0, 2,
+ -1, 4,
+ -2, -3
+};
+struct vp9_token_struct vp9_mv_fp_encodings[4];
+
+const nmv_context vp9_default_nmv_context = {
+ {32, 64, 96},
+ {
+ { /* vert component */
+ 128, /* sign */
+ {224, 144, 192, 168, 192, 176, 192}, /* class */
+ {216}, /* class0 */
+ {136, 140, 148, 160, 176, 192, 224}, /* bits */
+ {{128, 128, 64}, {96, 112, 64}}, /* class0_fp */
+ {64, 96, 64}, /* fp */
+ 160, /* class0_hp bit */
+ 128, /* hp */
+ },
+ { /* hor component */
+ 128, /* sign */
+ {216, 128, 176, 160, 176, 176, 192}, /* class */
+ {208}, /* class0 */
+ {136, 140, 148, 160, 176, 192, 224}, /* bits */
+ {{128, 128, 64}, {96, 112, 64}}, /* class0_fp */
+ {64, 96, 64}, /* fp */
+ 160, /* class0_hp bit */
+ 128, /* hp */
+ }
+ },
+};
+
+MV_JOINT_TYPE vp9_get_mv_joint(MV mv) {
+ if (mv.row == 0 && mv.col == 0) return MV_JOINT_ZERO;
+ else if (mv.row == 0 && mv.col != 0) return MV_JOINT_HNZVZ;
+ else if (mv.row != 0 && mv.col == 0) return MV_JOINT_HZVNZ;
+ else return MV_JOINT_HNZVNZ;
+}
+
+#define mv_class_base(c) ((c) ? (CLASS0_SIZE << (c + 2)) : 0)
+
+MV_CLASS_TYPE vp9_get_mv_class(int z, int *offset) {
+ MV_CLASS_TYPE c;
+ if (z < CLASS0_SIZE * 8) c = MV_CLASS_0;
+ else if (z < CLASS0_SIZE * 16) c = MV_CLASS_1;
+ else if (z < CLASS0_SIZE * 32) c = MV_CLASS_2;
+ else if (z < CLASS0_SIZE * 64) c = MV_CLASS_3;
+ else if (z < CLASS0_SIZE * 128) c = MV_CLASS_4;
+ else if (z < CLASS0_SIZE * 256) c = MV_CLASS_5;
+ else if (z < CLASS0_SIZE * 512) c = MV_CLASS_6;
+ else if (z < CLASS0_SIZE * 1024) c = MV_CLASS_7;
+ else assert(0);
+ if (offset)
+ *offset = z - mv_class_base(c);
+ return c;
+}
+
+int vp9_use_nmv_hp(const MV *ref) {
+ if ((abs(ref->row) >> 3) < COMPANDED_MVREF_THRESH &&
+ (abs(ref->col) >> 3) < COMPANDED_MVREF_THRESH)
+ return 1;
+ else
+ return 0;
+}
+
+int vp9_get_mv_mag(MV_CLASS_TYPE c, int offset) {
+ return mv_class_base(c) + offset;
+}
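
A quick numeric check of the class/offset split above, assuming CLASS0_BITS == 1
(so CLASS0_SIZE == 2): the class thresholds fall at 16, 32, 64, 128, ..., so a
magnitude-minus-one value of 100 lands in MV_CLASS_3 with offset 36, and
vp9_get_mv_mag() reverses the split:

#include <assert.h>

static void mv_class_example(void) {
  const int z = 100;                      /* |mv component| - 1, 1/8-pel units */
  const int mv_class = 3;                 /* MV_CLASS_3 covers z in [64, 128)  */
  const int base = 2 << (mv_class + 2);   /* mv_class_base(3) == 64            */
  const int offset = z - base;            /* 36                                */

  assert(offset == 36);
  assert(base + offset == z);             /* what vp9_get_mv_mag() rebuilds    */
}
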
+
+static void increment_nmv_component_count(int v,
+ nmv_component_counts *mvcomp,
+ int incr,
+ int usehp) {
+ assert (v != 0); /* should not be zero */
+ mvcomp->mvcount[MV_MAX + v] += incr;
+}
+
+static void increment_nmv_component(int v,
+ nmv_component_counts *mvcomp,
+ int incr,
+ int usehp) {
+ int s, z, c, o, d, e, f;
+ assert (v != 0); /* should not be zero */
+ s = v < 0;
+ mvcomp->sign[s] += incr;
+ z = (s ? -v : v) - 1; /* magnitude - 1 */
+
+ c = vp9_get_mv_class(z, &o);
+ mvcomp->classes[c] += incr;
+
+ d = (o >> 3); /* int mv data */
+ f = (o >> 1) & 3; /* fractional pel mv data */
+ e = (o & 1); /* high precision mv data */
+ if (c == MV_CLASS_0) {
+ mvcomp->class0[d] += incr;
+ } else {
+ int i, b;
+ b = c + CLASS0_BITS - 1; /* number of bits */
+ for (i = 0; i < b; ++i)
+ mvcomp->bits[i][((d >> i) & 1)] += incr;
+ }
+
+ /* Code the fractional pel bits */
+ if (c == MV_CLASS_0) {
+ mvcomp->class0_fp[d][f] += incr;
+ } else {
+ mvcomp->fp[f] += incr;
+ }
+
+ /* Code the high precision bit */
+ if (usehp) {
+ if (c == MV_CLASS_0) {
+ mvcomp->class0_hp[e] += incr;
+ } else {
+ mvcomp->hp[e] += incr;
+ }
+ }
+}
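
Within a class, the offset packs the sub-pel precision exactly as the bit slicing
above suggests: the low bit is the 1/8-pel (high-precision) flag, the next two
bits are the 1/4-pel fraction, and the remaining bits are the integer-pel part.
A small worked example with a hypothetical offset value:

static void offset_split_example(void) {
  const int o = 29;            /* offset within the class, binary 11101 */
  const int d = o >> 3;        /* integer-pel data           -> 3 */
  const int f = (o >> 1) & 3;  /* fractional (1/4-pel) data  -> 2 */
  const int e = o & 1;         /* high-precision 1/8-pel bit -> 1 */

  /* d, f and e are the indices used to update class0/bits, *_fp and *_hp. */
  (void)d; (void)f; (void)e;
}
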
+
+#ifdef SMOOTH_MV_COUNTS
+static void smooth_counts(nmv_component_counts *mvcomp) {
+ static const int flen = 3; // (filter_length + 1) / 2
+ static const int fval[] = {8, 3, 1};
+ static const int fvalbits = 4;
+ int i;
+ unsigned int smvcount[MV_VALS];
+ vpx_memcpy(smvcount, mvcomp->mvcount, sizeof(smvcount));
+ smvcount[MV_MAX] = (smvcount[MV_MAX - 1] + smvcount[MV_MAX + 1]) >> 1;
+ for (i = flen - 1; i <= MV_VALS - flen; ++i) {
+ int j, s = smvcount[i] * fval[0];
+ for (j = 1; j < flen; ++j)
+ s += (smvcount[i - j] + smvcount[i + j]) * fval[j];
+ mvcomp->mvcount[i] = (s + (1 << (fvalbits - 1))) >> fvalbits;
+ }
+}
+#endif
+
+static void counts_to_context(nmv_component_counts *mvcomp, int usehp) {
+ int v;
+ vpx_memset(mvcomp->sign, 0, sizeof(nmv_component_counts) - sizeof(mvcomp->mvcount));
+ for (v = 1; v <= MV_MAX; v++) {
+ increment_nmv_component(-v, mvcomp, mvcomp->mvcount[MV_MAX - v], usehp);
+ increment_nmv_component( v, mvcomp, mvcomp->mvcount[MV_MAX + v], usehp);
+ }
+}
+
+void vp9_increment_nmv(const MV *mv, const MV *ref, nmv_context_counts *mvctx,
+ int usehp) {
+ MV_JOINT_TYPE j = vp9_get_mv_joint(*mv);
+ mvctx->joints[j]++;
+ usehp = usehp && vp9_use_nmv_hp(ref);
+ if (j == MV_JOINT_HZVNZ || j == MV_JOINT_HNZVNZ) {
+ increment_nmv_component_count(mv->row, &mvctx->comps[0], 1, usehp);
+ }
+ if (j == MV_JOINT_HNZVZ || j == MV_JOINT_HNZVNZ) {
+ increment_nmv_component_count(mv->col, &mvctx->comps[1], 1, usehp);
+ }
+}
+
+static void adapt_prob(vp9_prob *dest, vp9_prob prep, vp9_prob newp,
+ unsigned int ct[2]) {
+ int factor;
+ int prob;
+ int count = ct[0] + ct[1];
+ if (count) {
+ count = count > MV_COUNT_SAT ? MV_COUNT_SAT : count;
+ factor = (MV_MAX_UPDATE_FACTOR * count / MV_COUNT_SAT);
+ prob = ((int)prep * (256 - factor) + (int)(newp) * factor + 128) >> 8;
+ prob += !prob;
+ prob = (prob > 255 ? 255 : prob);
+ *dest = prob;
+ }
+}
+
+void vp9_counts_to_nmv_context(
+ nmv_context_counts *NMVcount,
+ nmv_context *prob,
+ int usehp,
+ unsigned int (*branch_ct_joint)[2],
+ unsigned int (*branch_ct_sign)[2],
+ unsigned int (*branch_ct_classes)[MV_CLASSES - 1][2],
+ unsigned int (*branch_ct_class0)[CLASS0_SIZE - 1][2],
+ unsigned int (*branch_ct_bits)[MV_OFFSET_BITS][2],
+ unsigned int (*branch_ct_class0_fp)[CLASS0_SIZE][4 - 1][2],
+ unsigned int (*branch_ct_fp)[4 - 1][2],
+ unsigned int (*branch_ct_class0_hp)[2],
+ unsigned int (*branch_ct_hp)[2]) {
+ int i, j, k;
+ counts_to_context(&NMVcount->comps[0], usehp);
+ counts_to_context(&NMVcount->comps[1], usehp);
+ vp9_tree_probs_from_distribution(MV_JOINTS,
+ vp9_mv_joint_encodings,
+ vp9_mv_joint_tree,
+ prob->joints,
+ branch_ct_joint,
+ NMVcount->joints,
+ 256, 1);
+ for (i = 0; i < 2; ++i) {
+ prob->comps[i].sign =
+ vp9_bin_prob_from_distribution(NMVcount->comps[i].sign);
+ branch_ct_sign[i][0] = NMVcount->comps[i].sign[0];
+ branch_ct_sign[i][1] = NMVcount->comps[i].sign[1];
+ vp9_tree_probs_from_distribution(MV_CLASSES,
+ vp9_mv_class_encodings,
+ vp9_mv_class_tree,
+ prob->comps[i].classes,
+ branch_ct_classes[i],
+ NMVcount->comps[i].classes,
+ 256, 1);
+ vp9_tree_probs_from_distribution(CLASS0_SIZE,
+ vp9_mv_class0_encodings,
+ vp9_mv_class0_tree,
+ prob->comps[i].class0,
+ branch_ct_class0[i],
+ NMVcount->comps[i].class0,
+ 256, 1);
+ for (j = 0; j < MV_OFFSET_BITS; ++j) {
+ prob->comps[i].bits[j] = vp9_bin_prob_from_distribution(
+ NMVcount->comps[i].bits[j]);
+ branch_ct_bits[i][j][0] = NMVcount->comps[i].bits[j][0];
+ branch_ct_bits[i][j][1] = NMVcount->comps[i].bits[j][1];
+ }
+ }
+ for (i = 0; i < 2; ++i) {
+ for (k = 0; k < CLASS0_SIZE; ++k) {
+ vp9_tree_probs_from_distribution(4,
+ vp9_mv_fp_encodings,
+ vp9_mv_fp_tree,
+ prob->comps[i].class0_fp[k],
+ branch_ct_class0_fp[i][k],
+ NMVcount->comps[i].class0_fp[k],
+ 256, 1);
+ }
+ vp9_tree_probs_from_distribution(4,
+ vp9_mv_fp_encodings,
+ vp9_mv_fp_tree,
+ prob->comps[i].fp,
+ branch_ct_fp[i],
+ NMVcount->comps[i].fp,
+ 256, 1);
+ }
+ if (usehp) {
+ for (i = 0; i < 2; ++i) {
+ prob->comps[i].class0_hp = vp9_bin_prob_from_distribution(
+ NMVcount->comps[i].class0_hp);
+ branch_ct_class0_hp[i][0] = NMVcount->comps[i].class0_hp[0];
+ branch_ct_class0_hp[i][1] = NMVcount->comps[i].class0_hp[1];
+
+ prob->comps[i].hp =
+ vp9_bin_prob_from_distribution(NMVcount->comps[i].hp);
+ branch_ct_hp[i][0] = NMVcount->comps[i].hp[0];
+ branch_ct_hp[i][1] = NMVcount->comps[i].hp[1];
+ }
+ }
+}
+
+void vp9_adapt_nmv_probs(VP9_COMMON *cm, int usehp) {
+ int i, j, k;
+ nmv_context prob;
+ unsigned int branch_ct_joint[MV_JOINTS - 1][2];
+ unsigned int branch_ct_sign[2][2];
+ unsigned int branch_ct_classes[2][MV_CLASSES - 1][2];
+ unsigned int branch_ct_class0[2][CLASS0_SIZE - 1][2];
+ unsigned int branch_ct_bits[2][MV_OFFSET_BITS][2];
+ unsigned int branch_ct_class0_fp[2][CLASS0_SIZE][4 - 1][2];
+ unsigned int branch_ct_fp[2][4 - 1][2];
+ unsigned int branch_ct_class0_hp[2][2];
+ unsigned int branch_ct_hp[2][2];
+#ifdef MV_COUNT_TESTING
+ printf("joints count: ");
+ for (j = 0; j < MV_JOINTS; ++j) printf("%d ", cm->fc.NMVcount.joints[j]);
+ printf("\n"); fflush(stdout);
+ printf("signs count:\n");
+ for (i = 0; i < 2; ++i)
+ printf("%d/%d ", cm->fc.NMVcount.comps[i].sign[0], cm->fc.NMVcount.comps[i].sign[1]);
+ printf("\n"); fflush(stdout);
+ printf("classes count:\n");
+ for (i = 0; i < 2; ++i) {
+ for (j = 0; j < MV_CLASSES; ++j)
+ printf("%d ", cm->fc.NMVcount.comps[i].classes[j]);
+ printf("\n"); fflush(stdout);
+ }
+ printf("class0 count:\n");
+ for (i = 0; i < 2; ++i) {
+ for (j = 0; j < CLASS0_SIZE; ++j)
+ printf("%d ", cm->fc.NMVcount.comps[i].class0[j]);
+ printf("\n"); fflush(stdout);
+ }
+ printf("bits count:\n");
+ for (i = 0; i < 2; ++i) {
+ for (j = 0; j < MV_OFFSET_BITS; ++j)
+ printf("%d/%d ", cm->fc.NMVcount.comps[i].bits[j][0],
+ cm->fc.NMVcount.comps[i].bits[j][1]);
+ printf("\n"); fflush(stdout);
+ }
+ printf("class0_fp count:\n");
+ for (i = 0; i < 2; ++i) {
+ for (j = 0; j < CLASS0_SIZE; ++j) {
+ printf("{");
+ for (k = 0; k < 4; ++k)
+ printf("%d ", cm->fc.NMVcount.comps[i].class0_fp[j][k]);
+ printf("}, ");
+ }
+ printf("\n"); fflush(stdout);
+ }
+ printf("fp count:\n");
+ for (i = 0; i < 2; ++i) {
+ for (j = 0; j < 4; ++j)
+ printf("%d ", cm->fc.NMVcount.comps[i].fp[j]);
+ printf("\n"); fflush(stdout);
+ }
+ if (usehp) {
+ printf("class0_hp count:\n");
+ for (i = 0; i < 2; ++i)
+ printf("%d/%d ", cm->fc.NMVcount.comps[i].class0_hp[0],
+ cm->fc.NMVcount.comps[i].class0_hp[1]);
+ printf("\n"); fflush(stdout);
+ printf("hp count:\n");
+ for (i = 0; i < 2; ++i)
+ printf("%d/%d ", cm->fc.NMVcount.comps[i].hp[0],
+ cm->fc.NMVcount.comps[i].hp[1]);
+ printf("\n"); fflush(stdout);
+ }
+#endif
+#ifdef SMOOTH_MV_COUNTS
+ smooth_counts(&cm->fc.NMVcount.comps[0]);
+ smooth_counts(&cm->fc.NMVcount.comps[1]);
+#endif
+ vp9_counts_to_nmv_context(&cm->fc.NMVcount,
+ &prob,
+ usehp,
+ branch_ct_joint,
+ branch_ct_sign,
+ branch_ct_classes,
+ branch_ct_class0,
+ branch_ct_bits,
+ branch_ct_class0_fp,
+ branch_ct_fp,
+ branch_ct_class0_hp,
+ branch_ct_hp);
+
+ for (j = 0; j < MV_JOINTS - 1; ++j) {
+ adapt_prob(&cm->fc.nmvc.joints[j],
+ cm->fc.pre_nmvc.joints[j],
+ prob.joints[j],
+ branch_ct_joint[j]);
+ }
+ for (i = 0; i < 2; ++i) {
+ adapt_prob(&cm->fc.nmvc.comps[i].sign,
+ cm->fc.pre_nmvc.comps[i].sign,
+ prob.comps[i].sign,
+ branch_ct_sign[i]);
+ for (j = 0; j < MV_CLASSES - 1; ++j) {
+ adapt_prob(&cm->fc.nmvc.comps[i].classes[j],
+ cm->fc.pre_nmvc.comps[i].classes[j],
+ prob.comps[i].classes[j],
+ branch_ct_classes[i][j]);
+ }
+ for (j = 0; j < CLASS0_SIZE - 1; ++j) {
+ adapt_prob(&cm->fc.nmvc.comps[i].class0[j],
+ cm->fc.pre_nmvc.comps[i].class0[j],
+ prob.comps[i].class0[j],
+ branch_ct_class0[i][j]);
+ }
+ for (j = 0; j < MV_OFFSET_BITS; ++j) {
+ adapt_prob(&cm->fc.nmvc.comps[i].bits[j],
+ cm->fc.pre_nmvc.comps[i].bits[j],
+ prob.comps[i].bits[j],
+ branch_ct_bits[i][j]);
+ }
+ }
+ for (i = 0; i < 2; ++i) {
+ for (j = 0; j < CLASS0_SIZE; ++j) {
+ for (k = 0; k < 3; ++k) {
+ adapt_prob(&cm->fc.nmvc.comps[i].class0_fp[j][k],
+ cm->fc.pre_nmvc.comps[i].class0_fp[j][k],
+ prob.comps[i].class0_fp[j][k],
+ branch_ct_class0_fp[i][j][k]);
+ }
+ }
+ for (j = 0; j < 3; ++j) {
+ adapt_prob(&cm->fc.nmvc.comps[i].fp[j],
+ cm->fc.pre_nmvc.comps[i].fp[j],
+ prob.comps[i].fp[j],
+ branch_ct_fp[i][j]);
+ }
+ }
+ if (usehp) {
+ for (i = 0; i < 2; ++i) {
+ adapt_prob(&cm->fc.nmvc.comps[i].class0_hp,
+ cm->fc.pre_nmvc.comps[i].class0_hp,
+ prob.comps[i].class0_hp,
+ branch_ct_class0_hp[i]);
+ adapt_prob(&cm->fc.nmvc.comps[i].hp,
+ cm->fc.pre_nmvc.comps[i].hp,
+ prob.comps[i].hp,
+ branch_ct_hp[i]);
+ }
+ }
+}
+
+void vp9_entropy_mv_init() {
+ vp9_tokens_from_tree(vp9_mv_joint_encodings, vp9_mv_joint_tree);
+ vp9_tokens_from_tree(vp9_mv_class_encodings, vp9_mv_class_tree);
+ vp9_tokens_from_tree(vp9_mv_class0_encodings, vp9_mv_class0_tree);
+ vp9_tokens_from_tree(vp9_mv_fp_encodings, vp9_mv_fp_tree);
+}
+
+void vp9_init_mv_probs(VP9_COMMON *cm) {
+ vpx_memcpy(&cm->fc.nmvc, &vp9_default_nmv_context, sizeof(nmv_context));
+}
--- /dev/null
+++ b/vp9/common/entropymv.h
@@ -1,0 +1,129 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef __INC_ENTROPYMV_H
+#define __INC_ENTROPYMV_H
+
+#include "treecoder.h"
+#include "vpx_config.h"
+#include "blockd.h"
+
+struct VP9Common;
+
+void vp9_entropy_mv_init();
+void vp9_init_mv_probs(struct VP9Common *cm);
+
+void vp9_adapt_nmv_probs(struct VP9Common *cm, int usehp);
+int vp9_use_nmv_hp(const MV *ref);
+
+#define VP9_NMV_UPDATE_PROB 255
+//#define MV_GROUP_UPDATE
+
+#define LOW_PRECISION_MV_UPDATE /* Use 7 bit forward update */
+
+/* Symbols for coding which components are zero jointly */
+#define MV_JOINTS 4
+typedef enum {
+ MV_JOINT_ZERO = 0, /* Zero vector */
+ MV_JOINT_HNZVZ = 1, /* Vert zero, hor nonzero */
+ MV_JOINT_HZVNZ = 2, /* Hor zero, vert nonzero */
+ MV_JOINT_HNZVNZ = 3, /* Both components nonzero */
+} MV_JOINT_TYPE;
+
+extern const vp9_tree_index vp9_mv_joint_tree[2 * MV_JOINTS - 2];
+extern struct vp9_token_struct vp9_mv_joint_encodings [MV_JOINTS];
+
+/* Symbols for coding magnitude class of nonzero components */
+#define MV_CLASSES 8
+typedef enum {
+ MV_CLASS_0 = 0, /* (0, 2] integer pel */
+ MV_CLASS_1 = 1, /* (2, 4] integer pel */
+ MV_CLASS_2 = 2, /* (4, 8] integer pel */
+ MV_CLASS_3 = 3, /* (8, 16] integer pel */
+ MV_CLASS_4 = 4, /* (16, 32] integer pel */
+ MV_CLASS_5 = 5, /* (32, 64] integer pel */
+ MV_CLASS_6 = 6, /* (64, 128] integer pel */
+ MV_CLASS_7 = 7, /* (128, 256] integer pel */
+} MV_CLASS_TYPE;
+
+extern const vp9_tree_index vp9_mv_class_tree[2 * MV_CLASSES - 2];
+extern struct vp9_token_struct vp9_mv_class_encodings [MV_CLASSES];
+
+#define CLASS0_BITS 1 /* bits at integer precision for class 0 */
+#define CLASS0_SIZE (1 << CLASS0_BITS)
+#define MV_OFFSET_BITS (MV_CLASSES + CLASS0_BITS - 2)
+
+#define MV_MAX_BITS (MV_CLASSES + CLASS0_BITS + 2)
+#define MV_MAX ((1 << MV_MAX_BITS) - 1)
+#define MV_VALS ((MV_MAX << 1) + 1)
+
+extern const vp9_tree_index vp9_mv_class0_tree[2 * CLASS0_SIZE - 2];
+extern struct vp9_token_struct vp9_mv_class0_encodings[CLASS0_SIZE];
+
+extern const vp9_tree_index vp9_mv_fp_tree[2 * 4 - 2];
+extern struct vp9_token_struct vp9_mv_fp_encodings[4];
+
+typedef struct {
+ vp9_prob sign;
+ vp9_prob classes[MV_CLASSES - 1];
+ vp9_prob class0[CLASS0_SIZE - 1];
+ vp9_prob bits[MV_OFFSET_BITS];
+ vp9_prob class0_fp[CLASS0_SIZE][4 - 1];
+ vp9_prob fp[4 - 1];
+ vp9_prob class0_hp;
+ vp9_prob hp;
+} nmv_component;
+
+typedef struct {
+ vp9_prob joints[MV_JOINTS - 1];
+ nmv_component comps[2];
+} nmv_context;
+
+MV_JOINT_TYPE vp9_get_mv_joint(MV mv);
+MV_CLASS_TYPE vp9_get_mv_class(int z, int *offset);
+int vp9_get_mv_mag(MV_CLASS_TYPE c, int offset);
+
+
+typedef struct {
+ unsigned int mvcount[MV_VALS];
+ unsigned int sign[2];
+ unsigned int classes[MV_CLASSES];
+ unsigned int class0[CLASS0_SIZE];
+ unsigned int bits[MV_OFFSET_BITS][2];
+ unsigned int class0_fp[CLASS0_SIZE][4];
+ unsigned int fp[4];
+ unsigned int class0_hp[2];
+ unsigned int hp[2];
+} nmv_component_counts;
+
+typedef struct {
+ unsigned int joints[MV_JOINTS];
+ nmv_component_counts comps[2];
+} nmv_context_counts;
+
+void vp9_increment_nmv(const MV *mv, const MV *ref, nmv_context_counts *mvctx,
+ int usehp);
+extern const nmv_context vp9_default_nmv_context;
+void vp9_counts_to_nmv_context(
+ nmv_context_counts *NMVcount,
+ nmv_context *prob,
+ int usehp,
+ unsigned int (*branch_ct_joint)[2],
+ unsigned int (*branch_ct_sign)[2],
+ unsigned int (*branch_ct_classes)[MV_CLASSES - 1][2],
+ unsigned int (*branch_ct_class0)[CLASS0_SIZE - 1][2],
+ unsigned int (*branch_ct_bits)[MV_OFFSET_BITS][2],
+ unsigned int (*branch_ct_class0_fp)[CLASS0_SIZE][4 - 1][2],
+ unsigned int (*branch_ct_fp)[4 - 1][2],
+ unsigned int (*branch_ct_class0_hp)[2],
+ unsigned int (*branch_ct_hp)[2]);
+
+#endif
--- /dev/null
+++ b/vp9/common/extend.c
@@ -1,0 +1,169 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "extend.h"
+#include "vpx_mem/vpx_mem.h"
+
+static void copy_and_extend_plane(unsigned char *s, /* source */
+ int sp, /* source pitch */
+ unsigned char *d, /* destination */
+ int dp, /* destination pitch */
+ int h, /* height */
+ int w, /* width */
+ int et, /* extend top border */
+ int el, /* extend left border */
+ int eb, /* extend bottom border */
+ int er) { /* extend right border */
+ int i;
+ unsigned char *src_ptr1, *src_ptr2;
+ unsigned char *dest_ptr1, *dest_ptr2;
+ int linesize;
+
+ /* copy the left and right most columns out */
+ src_ptr1 = s;
+ src_ptr2 = s + w - 1;
+ dest_ptr1 = d - el;
+ dest_ptr2 = d + w;
+
+ for (i = 0; i < h; i++) {
+ vpx_memset(dest_ptr1, src_ptr1[0], el);
+ vpx_memcpy(dest_ptr1 + el, src_ptr1, w);
+ vpx_memset(dest_ptr2, src_ptr2[0], er);
+ src_ptr1 += sp;
+ src_ptr2 += sp;
+ dest_ptr1 += dp;
+ dest_ptr2 += dp;
+ }
+
+ /* Now copy the top and bottom lines into each line of the respective
+ * borders
+ */
+ src_ptr1 = d - el;
+ src_ptr2 = d + dp * (h - 1) - el;
+ dest_ptr1 = d + dp * (-et) - el;
+ dest_ptr2 = d + dp * (h) - el;
+ linesize = el + er + w;
+
+ for (i = 0; i < et; i++) {
+ vpx_memcpy(dest_ptr1, src_ptr1, linesize);
+ dest_ptr1 += dp;
+ }
+
+ for (i = 0; i < eb; i++) {
+ vpx_memcpy(dest_ptr2, src_ptr2, linesize);
+ dest_ptr2 += dp;
+ }
+}
+
+void vp9_copy_and_extend_frame(YV12_BUFFER_CONFIG *src,
+ YV12_BUFFER_CONFIG *dst) {
+ int et = dst->border;
+ int el = dst->border;
+ int eb = dst->border + dst->y_height - src->y_height;
+ int er = dst->border + dst->y_width - src->y_width;
+
+ copy_and_extend_plane(src->y_buffer, src->y_stride,
+ dst->y_buffer, dst->y_stride,
+ src->y_height, src->y_width,
+ et, el, eb, er);
+
+ et = dst->border >> 1;
+ el = dst->border >> 1;
+ eb = (dst->border >> 1) + dst->uv_height - src->uv_height;
+ er = (dst->border >> 1) + dst->uv_width - src->uv_width;
+
+ copy_and_extend_plane(src->u_buffer, src->uv_stride,
+ dst->u_buffer, dst->uv_stride,
+ src->uv_height, src->uv_width,
+ et, el, eb, er);
+
+ copy_and_extend_plane(src->v_buffer, src->uv_stride,
+ dst->v_buffer, dst->uv_stride,
+ src->uv_height, src->uv_width,
+ et, el, eb, er);
+}
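
The border amounts here account for both the fixed frame border and any
alignment padding between source and destination: the bottom/right extensions
absorb the difference dst->y_height - src->y_height (and likewise for width),
and the chroma planes use half the border. A small numeric sketch under an
assumed configuration (border of 32, a 1920x1080 luma source copied into a
destination whose luma plane is padded up to 1920x1088):

#include <assert.h>

static void extend_sizes_example(void) {
  const int border = 32;
  const int src_h = 1080, src_w = 1920;
  const int dst_h = 1088, dst_w = 1920;

  const int et = border;                  /* top                 */
  const int el = border;                  /* left                */
  const int eb = border + dst_h - src_h;  /* bottom: 32 + 8 = 40 */
  const int er = border + dst_w - src_w;  /* right:  32 + 0 = 32 */

  assert(et == 32 && el == 32 && eb == 40 && er == 32);
  /* Chroma planes use border >> 1, so 16 on the left/top and 20 at the bottom. */
}
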
+
+void vp9_copy_and_extend_frame_with_rect(YV12_BUFFER_CONFIG *src,
+ YV12_BUFFER_CONFIG *dst,
+ int srcy, int srcx,
+ int srch, int srcw) {
+ int et = dst->border;
+ int el = dst->border;
+ int eb = dst->border + dst->y_height - src->y_height;
+ int er = dst->border + dst->y_width - src->y_width;
+ int src_y_offset = srcy * src->y_stride + srcx;
+ int dst_y_offset = srcy * dst->y_stride + srcx;
+ int src_uv_offset = ((srcy * src->uv_stride) >> 1) + (srcx >> 1);
+ int dst_uv_offset = ((srcy * dst->uv_stride) >> 1) + (srcx >> 1);
+
+  // If the side does not touch the frame boundary, don't extend it.
+ if (srcy)
+ et = 0;
+ if (srcx)
+ el = 0;
+ if (srcy + srch != src->y_height)
+ eb = 0;
+ if (srcx + srcw != src->y_width)
+ er = 0;
+
+ copy_and_extend_plane(src->y_buffer + src_y_offset,
+ src->y_stride,
+ dst->y_buffer + dst_y_offset,
+ dst->y_stride,
+ srch, srcw,
+ et, el, eb, er);
+
+ et = (et + 1) >> 1;
+ el = (el + 1) >> 1;
+ eb = (eb + 1) >> 1;
+ er = (er + 1) >> 1;
+ srch = (srch + 1) >> 1;
+ srcw = (srcw + 1) >> 1;
+
+ copy_and_extend_plane(src->u_buffer + src_uv_offset,
+ src->uv_stride,
+ dst->u_buffer + dst_uv_offset,
+ dst->uv_stride,
+ srch, srcw,
+ et, el, eb, er);
+
+ copy_and_extend_plane(src->v_buffer + src_uv_offset,
+ src->uv_stride,
+ dst->v_buffer + dst_uv_offset,
+ dst->uv_stride,
+ srch, srcw,
+ et, el, eb, er);
+}
+
+/* Note: the extension is only for the last row, for intra prediction purposes. */
+void vp9_extend_mb_row(YV12_BUFFER_CONFIG *ybf, unsigned char *YPtr,
+ unsigned char *UPtr, unsigned char *VPtr) {
+ int i;
+
+ YPtr += ybf->y_stride * 14;
+ UPtr += ybf->uv_stride * 6;
+ VPtr += ybf->uv_stride * 6;
+
+ for (i = 0; i < 4; i++) {
+ YPtr[i] = YPtr[-1];
+ UPtr[i] = UPtr[-1];
+ VPtr[i] = VPtr[-1];
+ }
+
+ YPtr += ybf->y_stride;
+ UPtr += ybf->uv_stride;
+ VPtr += ybf->uv_stride;
+
+ for (i = 0; i < 4; i++) {
+ YPtr[i] = YPtr[-1];
+ UPtr[i] = UPtr[-1];
+ VPtr[i] = VPtr[-1];
+ }
+}
--- /dev/null
+++ b/vp9/common/extend.h
@@ -1,0 +1,27 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef __INC_EXTEND_H
+#define __INC_EXTEND_H
+
+#include "vpx_scale/yv12config.h"
+
+void vp9_extend_mb_row(YV12_BUFFER_CONFIG *ybf, unsigned char *YPtr,
+ unsigned char *UPtr, unsigned char *VPtr);
+
+void vp9_copy_and_extend_frame(YV12_BUFFER_CONFIG *src,
+ YV12_BUFFER_CONFIG *dst);
+
+void vp9_copy_and_extend_frame_with_rect(YV12_BUFFER_CONFIG *src,
+ YV12_BUFFER_CONFIG *dst,
+ int srcy, int srcx,
+ int srch, int srcw);
+
+#endif // __INC_EXTEND_H
--- /dev/null
+++ b/vp9/common/filter.c
@@ -1,0 +1,1159 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include <stdlib.h>
+#include "filter.h"
+#include "vpx_ports/mem.h"
+#include "vpx_rtcd.h"
+
+DECLARE_ALIGNED(16, const short, vp9_bilinear_filters[SUBPEL_SHIFTS][2]) = {
+ { 128, 0 },
+ { 120, 8 },
+ { 112, 16 },
+ { 104, 24 },
+ { 96, 32 },
+ { 88, 40 },
+ { 80, 48 },
+ { 72, 56 },
+ { 64, 64 },
+ { 56, 72 },
+ { 48, 80 },
+ { 40, 88 },
+ { 32, 96 },
+ { 24, 104 },
+ { 16, 112 },
+ { 8, 120 }
+};
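
Each row of this table sums to 128, so a row is simply a fixed-point two-tap
weighted average of neighbouring pixels, selected by the sub-pel position.
Assuming the usual libvpx normalization of adding half the filter weight and
shifting by VP9_FILTER_SHIFT (the VP9_FILTER_WEIGHT/VP9_FILTER_SHIFT macros live
in filter.h, which is not shown here), interpolation at sub-pel position 3 of
16, i.e. taps {104, 24}, reduces to:

/* Assumes VP9_FILTER_WEIGHT == 128 and VP9_FILTER_SHIFT == 7, matching the
 * (sum + weight/2) >> shift normalization used by the filter passes below. */
static unsigned char bilinear_sample_3_16(unsigned char a, unsigned char b) {
  return (unsigned char)((104 * a + 24 * b + 64) >> 7);
}
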
+
+#define FILTER_ALPHA 0
+#define FILTER_ALPHA_SHARP 1
+DECLARE_ALIGNED(16, const short, vp9_sub_pel_filters_8[SUBPEL_SHIFTS][8]) = {
+#if FILTER_ALPHA == 0
+ /* Lagrangian interpolation filter */
+ { 0, 0, 0, 128, 0, 0, 0, 0},
+ { 0, 1, -5, 126, 8, -3, 1, 0},
+ { -1, 3, -10, 122, 18, -6, 2, 0},
+ { -1, 4, -13, 118, 27, -9, 3, -1},
+ { -1, 4, -16, 112, 37, -11, 4, -1},
+ { -1, 5, -18, 105, 48, -14, 4, -1},
+ { -1, 5, -19, 97, 58, -16, 5, -1},
+ { -1, 6, -19, 88, 68, -18, 5, -1},
+ { -1, 6, -19, 78, 78, -19, 6, -1},
+ { -1, 5, -18, 68, 88, -19, 6, -1},
+ { -1, 5, -16, 58, 97, -19, 5, -1},
+ { -1, 4, -14, 48, 105, -18, 5, -1},
+ { -1, 4, -11, 37, 112, -16, 4, -1},
+ { -1, 3, -9, 27, 118, -13, 4, -1},
+ { 0, 2, -6, 18, 122, -10, 3, -1},
+ { 0, 1, -3, 8, 126, -5, 1, 0}
+#elif FILTER_ALPHA == 50
+ /* Generated using MATLAB:
+ * alpha = 0.5;
+ * b=intfilt(8,4,alpha);
+ * bi=round(128*b);
+ * ba=flipud(reshape([bi 0], 8, 8));
+ * disp(num2str(ba, '%d,'))
+ */
+ { 0, 0, 0, 128, 0, 0, 0, 0},
+ { 0, 1, -5, 126, 8, -3, 1, 0},
+ { 0, 2, -10, 122, 18, -6, 2, 0},
+ { -1, 3, -13, 118, 27, -9, 3, 0},
+ { -1, 4, -16, 112, 37, -11, 3, 0},
+ { -1, 5, -17, 104, 48, -14, 4, -1},
+ { -1, 5, -18, 96, 58, -16, 5, -1},
+ { -1, 5, -19, 88, 68, -17, 5, -1},
+ { -1, 5, -18, 78, 78, -18, 5, -1},
+ { -1, 5, -17, 68, 88, -19, 5, -1},
+ { -1, 5, -16, 58, 96, -18, 5, -1},
+ { -1, 4, -14, 48, 104, -17, 5, -1},
+ { 0, 3, -11, 37, 112, -16, 4, -1},
+ { 0, 3, -9, 27, 118, -13, 3, -1},
+ { 0, 2, -6, 18, 122, -10, 2, 0},
+ { 0, 1, -3, 8, 126, -5, 1, 0}
+#endif /* FILTER_ALPHA */
+};
+
+DECLARE_ALIGNED(16, const short, vp9_sub_pel_filters_8s[SUBPEL_SHIFTS][8]) = {
+#if FILTER_ALPHA_SHARP == 1
+ /* dct based filter */
+ {0, 0, 0, 128, 0, 0, 0, 0},
+ {-1, 3, -7, 127, 8, -3, 1, 0},
+ {-2, 5, -13, 125, 17, -6, 3, -1},
+ {-3, 7, -17, 121, 27, -10, 5, -2},
+ {-4, 9, -20, 115, 37, -13, 6, -2},
+ {-4, 10, -23, 108, 48, -16, 8, -3},
+ {-4, 10, -24, 100, 59, -19, 9, -3},
+ {-4, 11, -24, 90, 70, -21, 10, -4},
+ {-4, 11, -23, 80, 80, -23, 11, -4},
+ {-4, 10, -21, 70, 90, -24, 11, -4},
+ {-3, 9, -19, 59, 100, -24, 10, -4},
+ {-3, 8, -16, 48, 108, -23, 10, -4},
+ {-2, 6, -13, 37, 115, -20, 9, -4},
+ {-2, 5, -10, 27, 121, -17, 7, -3},
+ {-1, 3, -6, 17, 125, -13, 5, -2},
+ {0, 1, -3, 8, 127, -7, 3, -1}
+#elif FILTER_ALPHA_SHARP == 75
+ /* alpha = 0.75 */
+ {0, 0, 0, 128, 0, 0, 0, 0},
+ {-1, 2, -6, 126, 9, -3, 2, -1},
+ {-1, 4, -11, 123, 18, -7, 3, -1},
+ {-2, 6, -16, 119, 28, -10, 5, -2},
+ {-2, 7, -19, 113, 38, -13, 6, -2},
+ {-3, 8, -21, 106, 49, -16, 7, -2},
+ {-3, 9, -22, 99, 59, -19, 8, -3},
+ {-3, 9, -23, 90, 70, -21, 9, -3},
+ {-3, 9, -22, 80, 80, -22, 9, -3},
+ {-3, 9, -21, 70, 90, -23, 9, -3},
+ {-3, 8, -19, 59, 99, -22, 9, -3},
+ {-2, 7, -16, 49, 106, -21, 8, -3},
+ {-2, 6, -13, 38, 113, -19, 7, -2},
+ {-2, 5, -10, 28, 119, -16, 6, -2},
+ {-1, 3, -7, 18, 123, -11, 4, -1},
+ {-1, 2, -3, 9, 126, -6, 2, -1}
+#endif /* FILTER_ALPHA_SHARP */
+};
+
+DECLARE_ALIGNED(16, const short, vp9_sub_pel_filters_6[SUBPEL_SHIFTS][6]) = {
+ {0, 0, 128, 0, 0, 0},
+ {1, -5, 125, 8, -2, 1},
+ {1, -8, 122, 17, -5, 1},
+ {2, -11, 116, 27, -8, 2},
+ {3, -14, 110, 37, -10, 2},
+ {3, -15, 103, 47, -12, 2},
+ {3, -16, 95, 57, -14, 3},
+ {3, -16, 86, 67, -15, 3},
+ {3, -16, 77, 77, -16, 3},
+ {3, -15, 67, 86, -16, 3},
+ {3, -14, 57, 95, -16, 3},
+ {2, -12, 47, 103, -15, 3},
+ {2, -10, 37, 110, -14, 3},
+ {2, -8, 27, 116, -11, 2},
+ {1, -5, 17, 122, -8, 1},
+ {1, -2, 8, 125, -5, 1}
+};
+
+static void filter_block2d_first_pass_6(unsigned char *src_ptr,
+ int *output_ptr,
+ unsigned int src_pixels_per_line,
+ unsigned int pixel_step,
+ unsigned int output_height,
+ unsigned int output_width,
+ const short *vp9_filter) {
+ unsigned int i, j;
+ int Temp;
+
+ for (i = 0; i < output_height; i++) {
+ for (j = 0; j < output_width; j++) {
+ Temp = ((int)src_ptr[-2 * (int)pixel_step] * vp9_filter[0]) +
+ ((int)src_ptr[-1 * (int)pixel_step] * vp9_filter[1]) +
+ ((int)src_ptr[0] * vp9_filter[2]) +
+ ((int)src_ptr[pixel_step] * vp9_filter[3]) +
+ ((int)src_ptr[2 * pixel_step] * vp9_filter[4]) +
+ ((int)src_ptr[3 * pixel_step] * vp9_filter[5]) +
+ (VP9_FILTER_WEIGHT >> 1); /* Rounding */
+
+ /* Normalize back to 0-255 */
+ Temp = Temp >> VP9_FILTER_SHIFT;
+
+ if (Temp < 0)
+ Temp = 0;
+ else if (Temp > 255)
+ Temp = 255;
+
+ output_ptr[j] = Temp;
+ src_ptr++;
+ }
+
+ /* Next row... */
+ src_ptr += src_pixels_per_line - output_width;
+ output_ptr += output_width;
+ }
+}
+
+static void filter_block2d_second_pass_6(int *src_ptr,
+ unsigned char *output_ptr,
+ int output_pitch,
+ unsigned int src_pixels_per_line,
+ unsigned int pixel_step,
+ unsigned int output_height,
+ unsigned int output_width,
+ const short *vp9_filter) {
+ unsigned int i, j;
+ int Temp;
+
+ for (i = 0; i < output_height; i++) {
+ for (j = 0; j < output_width; j++) {
+ /* Apply filter */
+ Temp = ((int)src_ptr[-2 * (int)pixel_step] * vp9_filter[0]) +
+ ((int)src_ptr[-1 * (int)pixel_step] * vp9_filter[1]) +
+ ((int)src_ptr[0] * vp9_filter[2]) +
+ ((int)src_ptr[pixel_step] * vp9_filter[3]) +
+ ((int)src_ptr[2 * pixel_step] * vp9_filter[4]) +
+ ((int)src_ptr[3 * pixel_step] * vp9_filter[5]) +
+ (VP9_FILTER_WEIGHT >> 1); /* Rounding */
+
+ /* Normalize back to 0-255 */
+ Temp = Temp >> VP9_FILTER_SHIFT;
+
+ if (Temp < 0)
+ Temp = 0;
+ else if (Temp > 255)
+ Temp = 255;
+
+ output_ptr[j] = (unsigned char)Temp;
+ src_ptr++;
+ }
+
+ /* Start next row */
+ src_ptr += src_pixels_per_line - output_width;
+ output_ptr += output_pitch;
+ }
+}
+
+/*
+ * The only functional difference between filter_block2d_second_pass_6()
+ * and this function is that filter_block2d_second_pass_6() does a 6-tap
+ * filter on the input and stores the result in the output. This function
+ * (filter_block2d_second_pass_avg_6()) does a 6-tap filter on the input,
+ * and then averages that with the content already present in the output
+ * ((filter_result + dest + 1) >> 1) and stores that in the output.
+ */
+static void filter_block2d_second_pass_avg_6(int *src_ptr,
+ unsigned char *output_ptr,
+ int output_pitch,
+ unsigned int src_pixels_per_line,
+ unsigned int pixel_step,
+ unsigned int output_height,
+ unsigned int output_width,
+ const short *vp9_filter) {
+ unsigned int i, j;
+ int Temp;
+
+ for (i = 0; i < output_height; i++) {
+ for (j = 0; j < output_width; j++) {
+ /* Apply filter */
+ Temp = ((int)src_ptr[-2 * (int)pixel_step] * vp9_filter[0]) +
+ ((int)src_ptr[-1 * (int)pixel_step] * vp9_filter[1]) +
+ ((int)src_ptr[0] * vp9_filter[2]) +
+ ((int)src_ptr[pixel_step] * vp9_filter[3]) +
+ ((int)src_ptr[2 * pixel_step] * vp9_filter[4]) +
+ ((int)src_ptr[3 * pixel_step] * vp9_filter[5]) +
+ (VP9_FILTER_WEIGHT >> 1); /* Rounding */
+
+ /* Normalize back to 0-255 */
+ Temp = Temp >> VP9_FILTER_SHIFT;
+
+ if (Temp < 0)
+ Temp = 0;
+ else if (Temp > 255)
+ Temp = 255;
+
+ output_ptr[j] = (unsigned char)((output_ptr[j] + Temp + 1) >> 1);
+ src_ptr++;
+ }
+
+ /* Start next row */
+ src_ptr += src_pixels_per_line - output_width;
+ output_ptr += output_pitch;
+ }
+}
+
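+/* Extra rows of source needed by the 6-tap filters: Interp_Extend - 1 rows
+ * above and Interp_Extend rows below the block being filtered. */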
+#define Interp_Extend 3
+static void filter_block2d_6(unsigned char *src_ptr,
+ unsigned char *output_ptr,
+ unsigned int src_pixels_per_line,
+ int output_pitch,
+ const short *HFilter,
+ const short *VFilter) {
+ int FData[(3 + Interp_Extend * 2) * 4]; /* Temp data buffer used in filtering */
+
+ /* First filter 1-D horizontally... */
+ filter_block2d_first_pass_6(src_ptr - ((Interp_Extend - 1) * src_pixels_per_line), FData, src_pixels_per_line, 1,
+ 3 + Interp_Extend * 2, 4, HFilter);
+
+ /* then filter vertically... */
+ filter_block2d_second_pass_6(FData + 4 * (Interp_Extend - 1), output_ptr, output_pitch, 4, 4, 4, 4, VFilter);
+}
+
+
+void vp9_sixtap_predict_c(unsigned char *src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ unsigned char *dst_ptr,
+ int dst_pitch) {
+ const short *HFilter;
+ const short *VFilter;
+
+ HFilter = vp9_sub_pel_filters_6[xoffset]; /* 6 tap */
+ VFilter = vp9_sub_pel_filters_6[yoffset]; /* 6 tap */
+
+ filter_block2d_6(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, VFilter);
+}
+
+/*
+ * The difference between filter_block2d_6() and filter_block2d_avg_6() is
+ * that filter_block2d_6() does a 6-tap filter and stores the result in the
+ * output buffer, whereas filter_block2d_avg_6() does the same 6-tap filter,
+ * and then averages that with the content already present in the output
+ * ((filter_result + dest + 1) >> 1) and stores that in the output.
+ */
+static void filter_block2d_avg_6(unsigned char *src_ptr,
+ unsigned char *output_ptr,
+ unsigned int src_pixels_per_line,
+ int output_pitch,
+ const short *HFilter,
+ const short *VFilter) {
+ int FData[(3 + Interp_Extend * 2) * 4]; /* Temp data buffer used in filtering */
+
+ /* First filter 1-D horizontally... */
+ filter_block2d_first_pass_6(src_ptr - ((Interp_Extend - 1) * src_pixels_per_line),
+ FData, src_pixels_per_line, 1,
+ 3 + Interp_Extend * 2, 4, HFilter);
+
+ /* then filter vertically... */
+ filter_block2d_second_pass_avg_6(FData + 4 * (Interp_Extend - 1), output_ptr,
+ output_pitch, 4, 4, 4, 4, VFilter);
+}
+
+void vp9_sixtap_predict_avg_c
+(
+ unsigned char *src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ unsigned char *dst_ptr,
+ int dst_pitch
+) {
+ const short *HFilter;
+ const short *VFilter;
+
+ HFilter = vp9_sub_pel_filters_6[xoffset]; /* 6 tap */
+ VFilter = vp9_sub_pel_filters_6[yoffset]; /* 6 tap */
+
+ filter_block2d_avg_6(src_ptr, dst_ptr, src_pixels_per_line,
+ dst_pitch, HFilter, VFilter);
+}
+
+void vp9_sixtap_predict8x8_c
+(
+ unsigned char *src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ unsigned char *dst_ptr,
+ int dst_pitch
+) {
+ const short *HFilter;
+ const short *VFilter;
+ // int FData[(7+Interp_Extend*2)*16]; /* Temp data buffer used in filtering */
+ int FData[(7 + Interp_Extend * 2) * 8]; /* Temp data buffer used in filtering */
+
+ HFilter = vp9_sub_pel_filters_6[xoffset]; /* 6 tap */
+ VFilter = vp9_sub_pel_filters_6[yoffset]; /* 6 tap */
+
+ /* First filter 1-D horizontally... */
+ filter_block2d_first_pass_6(src_ptr - ((Interp_Extend - 1) * src_pixels_per_line), FData, src_pixels_per_line, 1,
+ 7 + Interp_Extend * 2, 8, HFilter);
+
+
+ /* then filter vertically... */
+ filter_block2d_second_pass_6(FData + 8 * (Interp_Extend - 1), dst_ptr, dst_pitch, 8, 8, 8, 8, VFilter);
+
+}
+
+void vp9_sixtap_predict_avg8x8_c
+(
+ unsigned char *src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ unsigned char *dst_ptr,
+ int dst_pitch
+) {
+ const short *HFilter;
+ const short *VFilter;
+ // int FData[(7+Interp_Extend*2)*16]; /* Temp data buffer used in filtering */
+ int FData[(7 + Interp_Extend * 2) * 8]; /* Temp data buffer used in filtering */
+
+ HFilter = vp9_sub_pel_filters_6[xoffset]; /* 6 tap */
+ VFilter = vp9_sub_pel_filters_6[yoffset]; /* 6 tap */
+
+ /* First filter 1-D horizontally... */
+ filter_block2d_first_pass_6(src_ptr - ((Interp_Extend - 1) * src_pixels_per_line), FData, src_pixels_per_line, 1,
+ 7 + Interp_Extend * 2, 8, HFilter);
+
+ /* then filter vertically... */
+ filter_block2d_second_pass_avg_6(FData + 8 * (Interp_Extend - 1), dst_ptr, dst_pitch, 8, 8, 8, 8, VFilter);
+}
+
+void vp9_sixtap_predict8x4_c
+(
+ unsigned char *src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ unsigned char *dst_ptr,
+ int dst_pitch
+) {
+ const short *HFilter;
+ const short *VFilter;
+ // int FData[(7+Interp_Extend*2)*16]; /* Temp data buffer used in filtering */
+ int FData[(3 + Interp_Extend * 2) * 8]; /* Temp data buffer used in filtering */
+
+ HFilter = vp9_sub_pel_filters_6[xoffset]; /* 6 tap */
+ VFilter = vp9_sub_pel_filters_6[yoffset]; /* 6 tap */
+
+ /* First filter 1-D horizontally... */
+ filter_block2d_first_pass_6(src_ptr - ((Interp_Extend - 1) * src_pixels_per_line), FData, src_pixels_per_line, 1,
+ 3 + Interp_Extend * 2, 8, HFilter);
+
+
+ /* then filter vertically... */
+ filter_block2d_second_pass_6(FData + 8 * (Interp_Extend - 1), dst_ptr, dst_pitch, 8, 8, 4, 8, VFilter);
+
+}
+
+void vp9_sixtap_predict16x16_c
+(
+ unsigned char *src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ unsigned char *dst_ptr,
+ int dst_pitch
+) {
+ const short *HFilter;
+ const short *VFilter;
+ // int FData[(15+Interp_Extend*2)*24]; /* Temp data buffer used in filtering */
+ int FData[(15 + Interp_Extend * 2) * 16]; /* Temp data buffer used in filtering */
+
+
+ HFilter = vp9_sub_pel_filters_6[xoffset]; /* 6 tap */
+ VFilter = vp9_sub_pel_filters_6[yoffset]; /* 6 tap */
+
+ /* First filter 1-D horizontally... */
+ filter_block2d_first_pass_6(src_ptr - ((Interp_Extend - 1) * src_pixels_per_line), FData, src_pixels_per_line, 1,
+ 15 + Interp_Extend * 2, 16, HFilter);
+
+ /* then filter vertically... */
+ filter_block2d_second_pass_6(FData + 16 * (Interp_Extend - 1), dst_ptr, dst_pitch, 16, 16, 16, 16, VFilter);
+
+}
+
+void vp9_sixtap_predict_avg16x16_c
+(
+ unsigned char *src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ unsigned char *dst_ptr,
+ int dst_pitch
+) {
+ const short *HFilter;
+ const short *VFilter;
+ // int FData[(15+Interp_Extend*2)*24]; /* Temp data buffer used in filtering */
+ int FData[(15 + Interp_Extend * 2) * 16]; /* Temp data buffer used in filtering */
+
+ HFilter = vp9_sub_pel_filters_6[xoffset]; /* 6 tap */
+ VFilter = vp9_sub_pel_filters_6[yoffset]; /* 6 tap */
+
+ /* First filter 1-D horizontally... */
+ filter_block2d_first_pass_6(src_ptr - ((Interp_Extend - 1) * src_pixels_per_line), FData,
+ src_pixels_per_line, 1, 15 + Interp_Extend * 2, 16, HFilter);
+
+ /* then filter vertically... */
+ filter_block2d_second_pass_avg_6(FData + 16 * (Interp_Extend - 1), dst_ptr, dst_pitch,
+ 16, 16, 16, 16, VFilter);
+}
+
+typedef enum {
+ VPX_FILTER_4x4 = 0,
+ VPX_FILTER_8x8 = 1,
+ VPX_FILTER_8x4 = 2,
+ VPX_FILTER_16x16 = 3,
+} filter_size_t;
+
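+// Output {width, height} for each filter_size_t value above.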
+static const unsigned int filter_size_to_wh[][2] = {
+ {4, 4},
+ {8, 8},
+ {8, 4},
+ {16, 16},
+};
+
+static const unsigned int filter_max_height = 16;
+static const unsigned int filter_max_width = 16;
+
+static void filter_block2d_8_c(const unsigned char *src_ptr,
+ const unsigned int src_stride,
+ const short *HFilter,
+ const short *VFilter,
+ const filter_size_t filter_size,
+ unsigned char *dst_ptr,
+ unsigned int dst_stride) {
+ const unsigned int output_width = filter_size_to_wh[filter_size][0];
+ const unsigned int output_height = filter_size_to_wh[filter_size][1];
+
+ // Between passes, we use an intermediate buffer whose height is extended to
+ // have enough horizontally filtered values as input for the vertical pass.
+ // This buffer is allocated to be big enough for the largest block type we
+ // support.
+ const int kInterp_Extend = 4;
+ const unsigned int intermediate_height =
+ (kInterp_Extend - 1) + output_height + kInterp_Extend;
+ const unsigned int max_intermediate_height =
+ (kInterp_Extend - 1) + filter_max_height + kInterp_Extend;
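+ // For example, an 8x8 block needs (4 - 1) + 8 + 4 = 15 intermediate rows;
+ // the largest supported block (16x16) needs 23, which matches the
+ // fixed-size MSVC fallback buffer of 23 * 16 below.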
+#ifdef _MSC_VER
+ // MSVC does not support C99-style variable-length array declarations
+ unsigned char intermediate_buffer[23 * 16];
+#else
+ unsigned char intermediate_buffer[max_intermediate_height * filter_max_width];
+#endif
+ const int intermediate_next_stride = 1 - intermediate_height * output_width;
+
+ // Horizontal pass (src -> transposed intermediate).
+ {
+ unsigned char *output_ptr = intermediate_buffer;
+ const int src_next_row_stride = src_stride - output_width;
+ unsigned int i, j;
+ src_ptr -= (kInterp_Extend - 1) * src_stride + (kInterp_Extend - 1);
+ for (i = 0; i < intermediate_height; i++) {
+ for (j = 0; j < output_width; j++) {
+ // Apply filter...
+ int temp = ((int)src_ptr[0] * HFilter[0]) +
+ ((int)src_ptr[1] * HFilter[1]) +
+ ((int)src_ptr[2] * HFilter[2]) +
+ ((int)src_ptr[3] * HFilter[3]) +
+ ((int)src_ptr[4] * HFilter[4]) +
+ ((int)src_ptr[5] * HFilter[5]) +
+ ((int)src_ptr[6] * HFilter[6]) +
+ ((int)src_ptr[7] * HFilter[7]) +
+ (VP9_FILTER_WEIGHT >> 1); // Rounding
+
+ // Normalize back to 0-255...
+ temp >>= VP9_FILTER_SHIFT;
+ if (temp < 0) {
+ temp = 0;
+ } else if (temp > 255) {
+ temp = 255;
+ }
+ src_ptr++;
+ *output_ptr = temp;
+ output_ptr += intermediate_height;
+ }
+ src_ptr += src_next_row_stride;
+ output_ptr += intermediate_next_stride;
+ }
+ }
+
+ // Vertical pass (transposed intermediate -> dst).
+ {
+ unsigned char *src_ptr = intermediate_buffer;
+ const int dst_next_row_stride = dst_stride - output_width;
+ unsigned int i, j;
+ for (i = 0; i < output_height; i++) {
+ for (j = 0; j < output_width; j++) {
+ // Apply filter...
+ int temp = ((int)src_ptr[0] * VFilter[0]) +
+ ((int)src_ptr[1] * VFilter[1]) +
+ ((int)src_ptr[2] * VFilter[2]) +
+ ((int)src_ptr[3] * VFilter[3]) +
+ ((int)src_ptr[4] * VFilter[4]) +
+ ((int)src_ptr[5] * VFilter[5]) +
+ ((int)src_ptr[6] * VFilter[6]) +
+ ((int)src_ptr[7] * VFilter[7]) +
+ (VP9_FILTER_WEIGHT >> 1); // Rounding
+
+ // Normalize back to 0-255...
+ temp >>= VP9_FILTER_SHIFT;
+ if (temp < 0) {
+ temp = 0;
+ } else if (temp > 255) {
+ temp = 255;
+ }
+
+ src_ptr += intermediate_height;
+ *dst_ptr++ = (unsigned char)temp;
+ }
+ src_ptr += intermediate_next_stride;
+ dst_ptr += dst_next_row_stride;
+ }
+ }
+}
+
+void vp9_filter_block2d_4x4_8_c(const unsigned char *src_ptr,
+ const unsigned int src_stride,
+ const short *HFilter_aligned16,
+ const short *VFilter_aligned16,
+ unsigned char *dst_ptr,
+ unsigned int dst_stride) {
+ filter_block2d_8_c(src_ptr, src_stride,
+ HFilter_aligned16, VFilter_aligned16,
+ VPX_FILTER_4x4, dst_ptr, dst_stride);
+}
+
+void vp9_filter_block2d_8x4_8_c(const unsigned char *src_ptr,
+ const unsigned int src_stride,
+ const short *HFilter_aligned16,
+ const short *VFilter_aligned16,
+ unsigned char *dst_ptr,
+ unsigned int dst_stride) {
+ filter_block2d_8_c(src_ptr, src_stride,
+ HFilter_aligned16, VFilter_aligned16,
+ VPX_FILTER_8x4, dst_ptr, dst_stride);
+}
+
+void vp9_filter_block2d_8x8_8_c(const unsigned char *src_ptr,
+ const unsigned int src_stride,
+ const short *HFilter_aligned16,
+ const short *VFilter_aligned16,
+ unsigned char *dst_ptr,
+ unsigned int dst_stride) {
+ filter_block2d_8_c(src_ptr, src_stride,
+ HFilter_aligned16, VFilter_aligned16,
+ VPX_FILTER_8x8, dst_ptr, dst_stride);
+}
+
+void vp9_filter_block2d_16x16_8_c(const unsigned char *src_ptr,
+ const unsigned int src_stride,
+ const short *HFilter_aligned16,
+ const short *VFilter_aligned16,
+ unsigned char *dst_ptr,
+ unsigned int dst_stride) {
+ filter_block2d_8_c(src_ptr, src_stride,
+ HFilter_aligned16, VFilter_aligned16,
+ VPX_FILTER_16x16, dst_ptr, dst_stride);
+}
+
+static void block2d_average_c(unsigned char *src,
+ unsigned int src_stride,
+ unsigned char *output_ptr,
+ unsigned int output_stride,
+ const filter_size_t filter_size) {
+ const unsigned int output_width = filter_size_to_wh[filter_size][0];
+ const unsigned int output_height = filter_size_to_wh[filter_size][1];
+
+ unsigned int i, j;
+ for (i = 0; i < output_height; i++) {
+ for (j = 0; j < output_width; j++) {
+ output_ptr[j] = (output_ptr[j] + src[i * src_stride + j] + 1) >> 1;
+ }
+ output_ptr += output_stride;
+ }
+}
+
+#define block2d_average block2d_average_c
+
+void vp9_eighttap_predict_c(unsigned char *src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ unsigned char *dst_ptr,
+ int dst_pitch) {
+ const short *HFilter;
+ const short *VFilter;
+
+ HFilter = vp9_sub_pel_filters_8[xoffset];
+ VFilter = vp9_sub_pel_filters_8[yoffset];
+
+ vp9_filter_block2d_4x4_8(src_ptr, src_pixels_per_line,
+ HFilter, VFilter,
+ dst_ptr, dst_pitch);
+}
+
+void vp9_eighttap_predict_avg4x4_c(unsigned char *src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ unsigned char *dst_ptr,
+ int dst_pitch) {
+ const short *HFilter = vp9_sub_pel_filters_8[xoffset];
+ const short *VFilter = vp9_sub_pel_filters_8[yoffset];
+ unsigned char tmp[4 * 4];
+
+ vp9_filter_block2d_4x4_8(src_ptr, src_pixels_per_line,
+ HFilter, VFilter,
+ tmp, 4);
+ block2d_average(tmp, 4, dst_ptr, dst_pitch, VPX_FILTER_4x4);
+}
+
+void vp9_eighttap_predict_sharp_c(unsigned char *src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ unsigned char *dst_ptr,
+ int dst_pitch) {
+ const short *HFilter;
+ const short *VFilter;
+
+ HFilter = vp9_sub_pel_filters_8s[xoffset];
+ VFilter = vp9_sub_pel_filters_8s[yoffset];
+
+ vp9_filter_block2d_4x4_8(src_ptr, src_pixels_per_line,
+ HFilter, VFilter,
+ dst_ptr, dst_pitch);
+}
+
+void vp9_eighttap_predict_avg4x4_sharp_c(unsigned char *src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ unsigned char *dst_ptr,
+ int dst_pitch) {
+ const short *HFilter = vp9_sub_pel_filters_8s[xoffset];
+ const short *VFilter = vp9_sub_pel_filters_8s[yoffset];
+ unsigned char tmp[4 * 4];
+
+ vp9_filter_block2d_4x4_8(src_ptr, src_pixels_per_line,
+ HFilter, VFilter,
+ tmp, 4);
+ block2d_average(tmp, 4, dst_ptr, dst_pitch, VPX_FILTER_4x4);
+}
+
+void vp9_eighttap_predict8x8_c(unsigned char *src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ unsigned char *dst_ptr,
+ int dst_pitch) {
+ const short *HFilter = vp9_sub_pel_filters_8[xoffset];
+ const short *VFilter = vp9_sub_pel_filters_8[yoffset];
+
+ vp9_filter_block2d_8x8_8(src_ptr, src_pixels_per_line,
+ HFilter, VFilter,
+ dst_ptr, dst_pitch);
+}
+
+void vp9_eighttap_predict8x8_sharp_c(unsigned char *src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ unsigned char *dst_ptr,
+ int dst_pitch) {
+ const short *HFilter = vp9_sub_pel_filters_8s[xoffset];
+ const short *VFilter = vp9_sub_pel_filters_8s[yoffset];
+
+ vp9_filter_block2d_8x8_8(src_ptr, src_pixels_per_line,
+ HFilter, VFilter,
+ dst_ptr, dst_pitch);
+}
+
+void vp9_eighttap_predict_avg8x8_c(unsigned char *src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ unsigned char *dst_ptr,
+ int dst_pitch) {
+ unsigned char tmp[8 * 8];
+ const short *HFilter = vp9_sub_pel_filters_8[xoffset];
+ const short *VFilter = vp9_sub_pel_filters_8[yoffset];
+
+ vp9_filter_block2d_8x8_8(src_ptr, src_pixels_per_line,
+ HFilter, VFilter,
+ tmp, 8);
+ block2d_average(tmp, 8, dst_ptr, dst_pitch, VPX_FILTER_8x8);
+}
+
+void vp9_eighttap_predict_avg8x8_sharp_c(unsigned char *src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ unsigned char *dst_ptr,
+ int dst_pitch) {
+ unsigned char tmp[8 * 8];
+ const short *HFilter = vp9_sub_pel_filters_8s[xoffset];
+ const short *VFilter = vp9_sub_pel_filters_8s[yoffset];
+
+ vp9_filter_block2d_8x8_8(src_ptr, src_pixels_per_line,
+ HFilter, VFilter,
+ tmp, 8);
+ block2d_average(tmp, 8, dst_ptr, dst_pitch, VPX_FILTER_8x8);
+}
+
+void vp9_eighttap_predict8x4_c(unsigned char *src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ unsigned char *dst_ptr,
+ int dst_pitch) {
+ const short *HFilter = vp9_sub_pel_filters_8[xoffset];
+ const short *VFilter = vp9_sub_pel_filters_8[yoffset];
+
+ vp9_filter_block2d_8x4_8(src_ptr, src_pixels_per_line,
+ HFilter, VFilter,
+ dst_ptr, dst_pitch);
+}
+
+void vp9_eighttap_predict8x4_sharp_c(unsigned char *src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ unsigned char *dst_ptr,
+ int dst_pitch) {
+ const short *HFilter = vp9_sub_pel_filters_8s[xoffset];
+ const short *VFilter = vp9_sub_pel_filters_8s[yoffset];
+
+ vp9_filter_block2d_8x4_8(src_ptr, src_pixels_per_line,
+ HFilter, VFilter,
+ dst_ptr, dst_pitch);
+}
+
+void vp9_eighttap_predict16x16_c(unsigned char *src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ unsigned char *dst_ptr,
+ int dst_pitch) {
+ const short *HFilter = vp9_sub_pel_filters_8[xoffset];
+ const short *VFilter = vp9_sub_pel_filters_8[yoffset];
+
+ vp9_filter_block2d_16x16_8(src_ptr, src_pixels_per_line,
+ HFilter, VFilter,
+ dst_ptr, dst_pitch);
+}
+
+void vp9_eighttap_predict16x16_sharp_c(unsigned char *src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ unsigned char *dst_ptr,
+ int dst_pitch) {
+ const short *HFilter = vp9_sub_pel_filters_8s[xoffset];
+ const short *VFilter = vp9_sub_pel_filters_8s[yoffset];
+
+ vp9_filter_block2d_16x16_8(src_ptr, src_pixels_per_line,
+ HFilter, VFilter,
+ dst_ptr, dst_pitch);
+}
+
+void vp9_eighttap_predict_avg16x16_c(unsigned char *src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ unsigned char *dst_ptr,
+ int dst_pitch) {
+ DECLARE_ALIGNED_ARRAY(16, unsigned char, tmp, 16 * 16);
+ const short *HFilter = vp9_sub_pel_filters_8[xoffset];
+ const short *VFilter = vp9_sub_pel_filters_8[yoffset];
+
+ vp9_filter_block2d_16x16_8(src_ptr, src_pixels_per_line,
+ HFilter, VFilter,
+ tmp, 16);
+ block2d_average(tmp, 16, dst_ptr, dst_pitch, VPX_FILTER_16x16);
+}
+
+void vp9_eighttap_predict_avg16x16_sharp_c(unsigned char *src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ unsigned char *dst_ptr,
+ int dst_pitch) {
+ DECLARE_ALIGNED_ARRAY(16, unsigned char, tmp, 16 * 16);
+ const short *HFilter = vp9_sub_pel_filters_8s[xoffset];
+ const short *VFilter = vp9_sub_pel_filters_8s[yoffset];
+
+ vp9_filter_block2d_16x16_8(src_ptr, src_pixels_per_line,
+ HFilter, VFilter,
+ tmp, 16);
+ block2d_average(tmp, 16, dst_ptr, dst_pitch, VPX_FILTER_16x16);
+}
+
+/****************************************************************************
+ *
+ * ROUTINE : filter_block2d_bil_first_pass
+ *
+ * INPUTS : UINT8 *src_ptr : Pointer to source block.
+ * UINT32 src_stride : Stride of source block.
+ * UINT32 height : Block height.
+ * UINT32 width : Block width.
+ * INT16 *vp9_filter : Array of 2 bi-linear filter taps.
+ *
+ * OUTPUTS : UINT16 *dst_ptr : Pointer to filtered block.
+ *
+ * RETURNS : void
+ *
+ * FUNCTION : Applies a 1-D 2-tap bi-linear filter to the source block
+ * in the horizontal direction to produce the filtered output
+ * block. Used to implement the first pass of a 2-D separable filter.
+ *
+ * SPECIAL NOTES : Produces UINT16 output to retain precision for the next pass.
+ * Two filter taps should sum to VP9_FILTER_WEIGHT.
+ *
+ ****************************************************************************/
+static void filter_block2d_bil_first_pass(unsigned char *src_ptr,
+ unsigned short *dst_ptr,
+ unsigned int src_stride,
+ unsigned int height,
+ unsigned int width,
+ const short *vp9_filter) {
+ unsigned int i, j;
+
+ for (i = 0; i < height; i++) {
+ for (j = 0; j < width; j++) {
+ /* Apply bilinear filter */
+ dst_ptr[j] = (((int)src_ptr[0] * vp9_filter[0]) +
+ ((int)src_ptr[1] * vp9_filter[1]) +
+ (VP9_FILTER_WEIGHT / 2)) >> VP9_FILTER_SHIFT;
+ src_ptr++;
+ }
+
+ /* Next row... */
+ src_ptr += src_stride - width;
+ dst_ptr += width;
+ }
+}
+
+/****************************************************************************
+ *
+ * ROUTINE : filter_block2d_bil_second_pass
+ *
+ * INPUTS : UINT16 *src_ptr : Pointer to source block.
+ * UINT32 dst_pitch : Destination block pitch.
+ * UINT32 height : Block height.
+ * UINT32 width : Block width.
+ * INT16 *vp9_filter : Array of 2 bi-linear filter taps.
+ *
+ * OUTPUTS : UINT8 *dst_ptr : Pointer to filtered block.
+ *
+ * RETURNS : void
+ *
+ * FUNCTION : Applies a 1-D 2-tap bi-linear filter to the source block
+ * in the vertical direction to produce the filtered output
+ * block. Used to implement the second pass of a 2-D separable filter.
+ *
+ * SPECIAL NOTES : Requires 16-bit input as produced by filter_block2d_bil_first_pass.
+ * Two filter taps should sum to VP9_FILTER_WEIGHT.
+ *
+ ****************************************************************************/
+static void filter_block2d_bil_second_pass(unsigned short *src_ptr,
+ unsigned char *dst_ptr,
+ int dst_pitch,
+ unsigned int height,
+ unsigned int width,
+ const short *vp9_filter) {
+ unsigned int i, j;
+ int Temp;
+
+ for (i = 0; i < height; i++) {
+ for (j = 0; j < width; j++) {
+ /* Apply filter */
+ Temp = ((int)src_ptr[0] * vp9_filter[0]) +
+ ((int)src_ptr[width] * vp9_filter[1]) +
+ (VP9_FILTER_WEIGHT / 2);
+ dst_ptr[j] = (unsigned char)(Temp >> VP9_FILTER_SHIFT);
+ src_ptr++;
+ }
+
+ /* Next row... */
+ dst_ptr += dst_pitch;
+ }
+}
+
+/*
+ * As with filter_block2d_second_pass_avg_6(), the functional difference
+ * between filter_block2d_bil_second_pass() and
+ * filter_block2d_bil_second_pass_avg() is that the former does a bilinear
+ * filter on the input and stores the result in the output, while the latter
+ * does the same bilinear filter, averages the resulting value with the
+ * values already present in the output ((filter_result + dest + 1) >> 1),
+ * and stores that back into the output.
+ */
+static void filter_block2d_bil_second_pass_avg(unsigned short *src_ptr,
+ unsigned char *dst_ptr,
+ int dst_pitch,
+ unsigned int height,
+ unsigned int width,
+ const short *vp9_filter) {
+ unsigned int i, j;
+ int Temp;
+
+ for (i = 0; i < height; i++) {
+ for (j = 0; j < width; j++) {
+ /* Apply filter */
+ Temp = ((int)src_ptr[0] * vp9_filter[0]) +
+ ((int)src_ptr[width] * vp9_filter[1]) +
+ (VP9_FILTER_WEIGHT / 2);
+ dst_ptr[j] = (unsigned char)(((Temp >> VP9_FILTER_SHIFT) + dst_ptr[j] + 1) >> 1);
+ src_ptr++;
+ }
+
+ /* Next row... */
+ dst_ptr += dst_pitch;
+ }
+}
+
+/****************************************************************************
+ *
+ * ROUTINE : filter_block2d_bil
+ *
+ * INPUTS : UINT8 *src_ptr : Pointer to source block.
+ * UINT32 src_pitch : Stride of source block.
+ * UINT32 dst_pitch : Stride of destination block.
+ * INT16 *HFilter : Array of 2 horizontal filter taps.
+ * INT16 *VFilter : Array of 2 vertical filter taps.
+ * INT32 Width : Block width
+ * INT32 Height : Block height
+ *
+ * OUTPUTS : UINT8 *dst_ptr : Pointer to filtered block.
+ *
+ * RETURNS : void
+ *
+ * FUNCTION : 2-D filters an input block by applying a 2-tap
+ * bi-linear filter horizontally followed by a 2-tap
+ * bi-linear filter vertically on the result.
+ *
+ * SPECIAL NOTES : The largest block size that can be handled here is 16x16.
+ *
+ ****************************************************************************/
+static void filter_block2d_bil(unsigned char *src_ptr,
+ unsigned char *dst_ptr,
+ unsigned int src_pitch,
+ unsigned int dst_pitch,
+ const short *HFilter,
+ const short *VFilter,
+ int Width,
+ int Height) {
+
+ unsigned short FData[17 * 16]; /* Temp data buffer used in filtering */
+
+ /* First filter 1-D horizontally... */
+ filter_block2d_bil_first_pass(src_ptr, FData, src_pitch, Height + 1, Width, HFilter);
+
+ /* then 1-D vertically... */
+ filter_block2d_bil_second_pass(FData, dst_ptr, dst_pitch, Height, Width, VFilter);
+}
+
+static void filter_block2d_bil_avg(unsigned char *src_ptr,
+ unsigned char *dst_ptr,
+ unsigned int src_pitch,
+ unsigned int dst_pitch,
+ const short *HFilter,
+ const short *VFilter,
+ int Width,
+ int Height) {
+ unsigned short FData[17 * 16]; /* Temp data buffer used in filtering */
+
+ /* First filter 1-D horizontally... */
+ filter_block2d_bil_first_pass(src_ptr, FData, src_pitch, Height + 1, Width, HFilter);
+
+ /* then 1-D vertically... */
+ filter_block2d_bil_second_pass_avg(FData, dst_ptr, dst_pitch, Height, Width, VFilter);
+}
+
+void vp9_bilinear_predict4x4_c(unsigned char *src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ unsigned char *dst_ptr,
+ int dst_pitch) {
+ const short *HFilter;
+ const short *VFilter;
+
+ HFilter = vp9_bilinear_filters[xoffset];
+ VFilter = vp9_bilinear_filters[yoffset];
+
+ filter_block2d_bil(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, VFilter, 4, 4);
+}
+
+void vp9_bilinear_predict_avg4x4_c(unsigned char *src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ unsigned char *dst_ptr,
+ int dst_pitch) {
+ const short *HFilter;
+ const short *VFilter;
+
+ HFilter = vp9_bilinear_filters[xoffset];
+ VFilter = vp9_bilinear_filters[yoffset];
+
+ filter_block2d_bil_avg(src_ptr, dst_ptr, src_pixels_per_line,
+ dst_pitch, HFilter, VFilter, 4, 4);
+}
+
+void vp9_bilinear_predict8x8_c(unsigned char *src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ unsigned char *dst_ptr,
+ int dst_pitch) {
+ const short *HFilter;
+ const short *VFilter;
+
+ HFilter = vp9_bilinear_filters[xoffset];
+ VFilter = vp9_bilinear_filters[yoffset];
+
+ filter_block2d_bil(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, VFilter, 8, 8);
+
+}
+
+void vp9_bilinear_predict_avg8x8_c(unsigned char *src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ unsigned char *dst_ptr,
+ int dst_pitch) {
+ const short *HFilter;
+ const short *VFilter;
+
+ HFilter = vp9_bilinear_filters[xoffset];
+ VFilter = vp9_bilinear_filters[yoffset];
+
+ filter_block2d_bil_avg(src_ptr, dst_ptr, src_pixels_per_line,
+ dst_pitch, HFilter, VFilter, 8, 8);
+}
+
+void vp9_bilinear_predict8x4_c(unsigned char *src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ unsigned char *dst_ptr,
+ int dst_pitch) {
+ const short *HFilter;
+ const short *VFilter;
+
+ HFilter = vp9_bilinear_filters[xoffset];
+ VFilter = vp9_bilinear_filters[yoffset];
+
+ filter_block2d_bil(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, VFilter, 8, 4);
+
+}
+
+void vp9_bilinear_predict16x16_c(unsigned char *src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ unsigned char *dst_ptr,
+ int dst_pitch) {
+ const short *HFilter;
+ const short *VFilter;
+
+ HFilter = vp9_bilinear_filters[xoffset];
+ VFilter = vp9_bilinear_filters[yoffset];
+
+ filter_block2d_bil(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, VFilter, 16, 16);
+}
+
+void vp9_bilinear_predict_avg16x16_c(unsigned char *src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ unsigned char *dst_ptr,
+ int dst_pitch) {
+ const short *HFilter;
+ const short *VFilter;
+
+ HFilter = vp9_bilinear_filters[xoffset];
+ VFilter = vp9_bilinear_filters[yoffset];
+
+ filter_block2d_bil_avg(src_ptr, dst_ptr, src_pixels_per_line,
+ dst_pitch, HFilter, VFilter, 16, 16);
+}
--- /dev/null
+++ b/vp9/common/filter.h
@@ -1,0 +1,28 @@
+/*
+ * Copyright (c) 2011 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef FILTER_H
+#define FILTER_H
+
+#include "vpx_config.h"
+#include "vpx_scale/yv12config.h"
+
+#define BLOCK_HEIGHT_WIDTH 4
+#define VP9_FILTER_WEIGHT 128
+#define VP9_FILTER_SHIFT 7
+
+#define SUBPEL_SHIFTS 16
+
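+/* The filter tables declared below store taps in Q7: VP9_FILTER_WEIGHT is
+ * 1 << VP9_FILTER_SHIFT, and each row of taps sums to VP9_FILTER_WEIGHT
+ * (128), so the filters are normalized. */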
+extern const short vp9_bilinear_filters[SUBPEL_SHIFTS][2];
+extern const short vp9_sub_pel_filters_6[SUBPEL_SHIFTS][6];
+extern const short vp9_sub_pel_filters_8[SUBPEL_SHIFTS][8];
+extern const short vp9_sub_pel_filters_8s[SUBPEL_SHIFTS][8];
+
+#endif // FILTER_H
--- /dev/null
+++ b/vp9/common/findnearmv.c
@@ -1,0 +1,327 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "findnearmv.h"
+#include "vp9/common/sadmxn.h"
+#include <limits.h>
+
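+/* Index of the first 4x4 block of each partition, for each of the four
+ * macroblock split configurations. */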
+const unsigned char vp9_mbsplit_offset[4][16] = {
+ { 0, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
+ { 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
+ { 0, 2, 8, 10, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
+ { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}
+};
+
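+/* When high-precision motion vectors are not in use, clear the 1/8th-pel bit
+ * of each MV component by rounding odd values toward zero. */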
+static void lower_mv_precision(int_mv *mv, int usehp)
+{
+ if (!usehp || !vp9_use_nmv_hp(&mv->as_mv)) {
+ if (mv->as_mv.row & 1)
+ mv->as_mv.row += (mv->as_mv.row > 0 ? -1 : 1);
+ if (mv->as_mv.col & 1)
+ mv->as_mv.col += (mv->as_mv.col > 0 ? -1 : 1);
+ }
+}
+
+/* Predict motion vectors using those from already-decoded nearby blocks.
+ Note that we only consider one 4x4 subblock from each candidate 16x16
+ macroblock. */
+
+void vp9_find_near_mvs
+(
+ MACROBLOCKD *xd,
+ const MODE_INFO *here,
+ const MODE_INFO *lf_here,
+ int_mv *nearest,
+ int_mv *nearby,
+ int_mv *best_mv,
+ int cnt[4],
+ int refframe,
+ int *ref_frame_sign_bias) {
+ const MODE_INFO *above = here - xd->mode_info_stride;
+ const MODE_INFO *left = here - 1;
+ const MODE_INFO *aboveleft = above - 1;
+ const MODE_INFO *third = NULL;
+ int_mv near_mvs[4];
+ int_mv *mv = near_mvs;
+ int *cntx = cnt;
+ enum {CNT_INTRA, CNT_NEAREST, CNT_NEAR, CNT_SPLITMV};
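+ /* near_mvs[0] is reserved for the (implicit) zero MV; each distinct
+ * neighbouring MV is appended after it. cnt[] weights the candidates:
+ * the above and left neighbours add 2, the above-left or last-frame
+ * candidate adds 1. */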
+
+ /* Zero accumulators */
+ mv[0].as_int = mv[1].as_int = mv[2].as_int = 0;
+ cnt[0] = cnt[1] = cnt[2] = cnt[3] = 0;
+
+ /* Process above */
+ if (above->mbmi.ref_frame != INTRA_FRAME) {
+ if (above->mbmi.mv[0].as_int) {
+ ++ mv;
+ mv->as_int = above->mbmi.mv[0].as_int;
+ mv_bias(ref_frame_sign_bias[above->mbmi.ref_frame],
+ refframe, mv, ref_frame_sign_bias);
+ ++cntx;
+ }
+ *cntx += 2;
+ }
+
+ /* Process left */
+ if (left->mbmi.ref_frame != INTRA_FRAME) {
+ if (left->mbmi.mv[0].as_int) {
+ int_mv this_mv;
+ this_mv.as_int = left->mbmi.mv[0].as_int;
+ mv_bias(ref_frame_sign_bias[left->mbmi.ref_frame],
+ refframe, &this_mv, ref_frame_sign_bias);
+
+ if (this_mv.as_int != mv->as_int) {
+ ++ mv;
+ mv->as_int = this_mv.as_int;
+ ++ cntx;
+ }
+ *cntx += 2;
+ } else
+ cnt[CNT_INTRA] += 2;
+ }
+ /* Process above left or the one from last frame */
+ if (aboveleft->mbmi.ref_frame != INTRA_FRAME ||
+ (lf_here->mbmi.ref_frame == LAST_FRAME && refframe == LAST_FRAME)) {
+ if (aboveleft->mbmi.mv[0].as_int) {
+ third = aboveleft;
+ } else if (lf_here->mbmi.mv[0].as_int) {
+ third = lf_here;
+ }
+ if (third) {
+ int_mv this_mv;
+ this_mv.as_int = third->mbmi.mv[0].as_int;
+ mv_bias(ref_frame_sign_bias[third->mbmi.ref_frame],
+ refframe, &this_mv, ref_frame_sign_bias);
+
+ if (this_mv.as_int != mv->as_int) {
+ ++ mv;
+ mv->as_int = this_mv.as_int;
+ ++ cntx;
+ }
+ *cntx += 1;
+ } else
+ cnt[CNT_INTRA] += 1;
+ }
+
+ /* If we have three distinct MV's ... */
+ if (cnt[CNT_SPLITMV]) {
+ /* See if the third MV can be merged with NEAREST */
+ if (mv->as_int == near_mvs[CNT_NEAREST].as_int)
+ cnt[CNT_NEAREST] += 1;
+ }
+
+ cnt[CNT_SPLITMV] = ((above->mbmi.mode == SPLITMV)
+ + (left->mbmi.mode == SPLITMV)) * 2
+ + (
+ lf_here->mbmi.mode == SPLITMV ||
+ aboveleft->mbmi.mode == SPLITMV);
+
+ /* Swap near and nearest if necessary */
+ if (cnt[CNT_NEAR] > cnt[CNT_NEAREST]) {
+ int tmp;
+ tmp = cnt[CNT_NEAREST];
+ cnt[CNT_NEAREST] = cnt[CNT_NEAR];
+ cnt[CNT_NEAR] = tmp;
+ tmp = near_mvs[CNT_NEAREST].as_int;
+ near_mvs[CNT_NEAREST].as_int = near_mvs[CNT_NEAR].as_int;
+ near_mvs[CNT_NEAR].as_int = tmp;
+ }
+
+ /* Use near_mvs[0] to store the "best" MV */
+ if (cnt[CNT_NEAREST] >= cnt[CNT_INTRA])
+ near_mvs[CNT_INTRA] = near_mvs[CNT_NEAREST];
+
+ /* Set up return values */
+ best_mv->as_int = near_mvs[0].as_int;
+ nearest->as_int = near_mvs[CNT_NEAREST].as_int;
+ nearby->as_int = near_mvs[CNT_NEAR].as_int;
+
+ /* Make sure that the 1/8th-pel bits of the MVs are zero if high precision
+ * is not being used, by truncating the last bit towards 0.
+ */
+ lower_mv_precision(best_mv, xd->allow_high_precision_mv);
+ lower_mv_precision(nearest, xd->allow_high_precision_mv);
+ lower_mv_precision(nearby, xd->allow_high_precision_mv);
+
+ // TODO: move clamp outside findnearmv
+ clamp_mv2(nearest, xd);
+ clamp_mv2(nearby, xd);
+ clamp_mv2(best_mv, xd);
+}
+
+vp9_prob *vp9_mv_ref_probs(VP9_COMMON *pc,
+ vp9_prob p[VP9_MVREFS - 1], const int near_mv_ref_ct[4]
+ ) {
+ p[0] = pc->fc.vp8_mode_contexts [near_mv_ref_ct[0]] [0];
+ p[1] = pc->fc.vp8_mode_contexts [near_mv_ref_ct[1]] [1];
+ p[2] = pc->fc.vp8_mode_contexts [near_mv_ref_ct[2]] [2];
+ p[3] = pc->fc.vp8_mode_contexts [near_mv_ref_ct[3]] [3];
+ return p;
+}
+
+#if CONFIG_NEWBESTREFMV
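+/* SP() extracts the fractional (1/8th-pel) part of an MV component, scaled to
+ * the 1/16th-pel units used by the sub-pixel variance calls below. */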
+#define SP(x) (((x) & 7) << 1)
+unsigned int vp9_sad3x16_c(
+ const unsigned char *src_ptr,
+ int src_stride,
+ const unsigned char *ref_ptr,
+ int ref_stride,
+ int max_sad) {
+ return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, 3, 16);
+}
+unsigned int vp9_sad16x3_c(
+ const unsigned char *src_ptr,
+ int src_stride,
+ const unsigned char *ref_ptr,
+ int ref_stride,
+ int max_sad) {
+ return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, 16, 3);
+}
+
+/* Check a list of candidate motion vectors by SAD score, using a number of
+ * rows of pixels above and a number of columns of pixels to the left of the
+ * block, and select the one with the best score as the reference motion
+ * vector.
+ */
+void vp9_find_best_ref_mvs(MACROBLOCKD *xd,
+ unsigned char *ref_y_buffer,
+ int ref_y_stride,
+ int_mv *mvlist,
+ int_mv *best_mv,
+ int_mv *nearest,
+ int_mv *near) {
+ int i, j;
+ unsigned char *above_src;
+ unsigned char *left_src;
+ unsigned char *above_ref;
+ unsigned char *left_ref;
+ int score;
+ int sse;
+ int ref_scores[MAX_MV_REFS] = {0};
+ int_mv sorted_mvs[MAX_MV_REFS];
+ int zero_seen = FALSE;
+
+ // Default all to 0,0 if nothing else available
+ best_mv->as_int = nearest->as_int = near->as_int = 0;
+ vpx_memset(sorted_mvs, 0, sizeof(sorted_mvs));
+
+#if CONFIG_SUBPELREFMV
+ above_src = xd->dst.y_buffer - xd->dst.y_stride * 2;
+ left_src = xd->dst.y_buffer - 2;
+ above_ref = ref_y_buffer - ref_y_stride * 2;
+ left_ref = ref_y_buffer - 2;
+#else
+ above_src = xd->dst.y_buffer - xd->dst.y_stride * 3;
+ left_src = xd->dst.y_buffer - 3;
+ above_ref = ref_y_buffer - ref_y_stride * 3;
+ left_ref = ref_y_buffer - 3;
+#endif
+
+ //for(i = 0; i < MAX_MV_REFS; ++i) {
+ // Limit search to the predicted best 4
+ for(i = 0; i < 4; ++i) {
+ int_mv this_mv;
+ int offset = 0;
+ int row_offset, col_offset;
+
+ this_mv.as_int = mvlist[i].as_int;
+
+ // If we see a 0,0 vector for a second time we have reached the end of
+ // the list of valid candidate vectors.
+ if (!this_mv.as_int && zero_seen)
+ break;
+
+ zero_seen = zero_seen || !this_mv.as_int;
+
+ clamp_mv(&this_mv,
+ xd->mb_to_left_edge - LEFT_TOP_MARGIN + 24,
+ xd->mb_to_right_edge + RIGHT_BOTTOM_MARGIN,
+ xd->mb_to_top_edge - LEFT_TOP_MARGIN + 24,
+ xd->mb_to_bottom_edge + RIGHT_BOTTOM_MARGIN);
+#if CONFIG_SUBPELREFMV
+ row_offset = this_mv.as_mv.row >> 3;
+ col_offset = this_mv.as_mv.col >> 3;
+ offset = ref_y_stride * row_offset + col_offset;
+ score = 0;
+ if (xd->up_available) {
+ vp9_sub_pixel_variance16x2_c(above_ref + offset, ref_y_stride,
+ SP(this_mv.as_mv.col), SP(this_mv.as_mv.row),
+ above_src, xd->dst.y_stride, &sse);
+ score += sse;
+ }
+ if (xd->left_available) {
+ vp9_sub_pixel_variance2x16_c(left_ref + offset, ref_y_stride,
+ SP(this_mv.as_mv.col), SP(this_mv.as_mv.row),
+ left_src, xd->dst.y_stride, &sse);
+ score += sse;
+ }
+#else
+ row_offset = (this_mv.as_mv.row > 0) ?
+ ((this_mv.as_mv.row + 3) >> 3):((this_mv.as_mv.row + 4) >> 3);
+ col_offset = (this_mv.as_mv.col > 0) ?
+ ((this_mv.as_mv.col + 3) >> 3):((this_mv.as_mv.col + 4) >> 3);
+ offset = ref_y_stride * row_offset + col_offset;
+ score = 0;
+ if (xd->up_available) {
+ score += vp9_sad16x3(above_src, xd->dst.y_stride,
+ above_ref + offset, ref_y_stride, INT_MAX);
+ }
+ if (xd->left_available) {
+ score += vp9_sad3x16(left_src, xd->dst.y_stride,
+ left_ref + offset, ref_y_stride, INT_MAX);
+ }
+#endif
+ // Add the entry to our list and then resort the list on score.
+ ref_scores[i] = score;
+ sorted_mvs[i].as_int = this_mv.as_int;
+ j = i;
+ while (j > 0) {
+ if (ref_scores[j] < ref_scores[j-1]) {
+ ref_scores[j] = ref_scores[j-1];
+ sorted_mvs[j].as_int = sorted_mvs[j-1].as_int;
+ ref_scores[j-1] = score;
+ sorted_mvs[j-1].as_int = this_mv.as_int;
+ j--;
+ } else
+ break;
+ }
+ }
+
+ // Make sure all the candidates are properly clamped etc
+ for (i = 0; i < 4; ++i) {
+ lower_mv_precision(&sorted_mvs[i], xd->allow_high_precision_mv);
+ clamp_mv2(&sorted_mvs[i], xd);
+ }
+
+ // Set the best mv to the first entry in the sorted list
+ best_mv->as_int = sorted_mvs[0].as_int;
+
+ // Provided that there are non-zero vectors available, there will not
+ // be more than one 0,0 entry in the sorted list.
+ // The best ref mv is always set to the first entry (which gave the best
+ // result). The nearest is set to the first non-zero vector if available,
+ // and near to the second non-zero vector if available.
+ // We do not use 0,0 as a nearest or near as 0,0 has its own mode.
+ if ( sorted_mvs[0].as_int ) {
+ nearest->as_int = sorted_mvs[0].as_int;
+ if ( sorted_mvs[1].as_int )
+ near->as_int = sorted_mvs[1].as_int;
+ else
+ near->as_int = sorted_mvs[2].as_int;
+ } else {
+ nearest->as_int = sorted_mvs[1].as_int;
+ near->as_int = sorted_mvs[2].as_int;
+ }
+
+ // Copy back the re-ordered mv list
+ vpx_memcpy(mvlist, sorted_mvs, sizeof(sorted_mvs));
+}
+
+#endif // CONFIG_NEWBESTREFMV
--- /dev/null
+++ b/vp9/common/findnearmv.h
@@ -1,0 +1,188 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef __INC_FINDNEARMV_H
+#define __INC_FINDNEARMV_H
+
+#include "mv.h"
+#include "blockd.h"
+#include "modecont.h"
+#include "treecoder.h"
+#include "onyxc_int.h"
+
+#if CONFIG_NEWBESTREFMV
+/* Check a list of candidate motion vectors by SAD score, using a number of
+ * rows of pixels above and a number of columns of pixels to the left of the
+ * block, and select the one with the best score as the reference motion
+ * vector.
+ */
+void vp9_find_best_ref_mvs(MACROBLOCKD *xd,
+ unsigned char *ref_y_buffer,
+ int ref_y_stride,
+ int_mv *mvlist,
+ int_mv *best_mv,
+ int_mv *nearest,
+ int_mv *near);
+#endif
+
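+/* Negate a candidate MV when the sign bias of the frame it was taken from
+ * differs from that of the reference frame being predicted. */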
+static void mv_bias(int refmb_ref_frame_sign_bias, int refframe, int_mv *mvp, const int *ref_frame_sign_bias) {
+ MV xmv;
+ xmv = mvp->as_mv;
+
+ if (refmb_ref_frame_sign_bias != ref_frame_sign_bias[refframe]) {
+ xmv.row *= -1;
+ xmv.col *= -1;
+ }
+
+ mvp->as_mv = xmv;
+}
+
+#define LEFT_TOP_MARGIN (16 << 3)
+#define RIGHT_BOTTOM_MARGIN (16 << 3)
+
+static void clamp_mv(int_mv *mv,
+ int mb_to_left_edge,
+ int mb_to_right_edge,
+ int mb_to_top_edge,
+ int mb_to_bottom_edge) {
+ mv->as_mv.col = (mv->as_mv.col < mb_to_left_edge) ?
+ mb_to_left_edge : mv->as_mv.col;
+ mv->as_mv.col = (mv->as_mv.col > mb_to_right_edge) ?
+ mb_to_right_edge : mv->as_mv.col;
+ mv->as_mv.row = (mv->as_mv.row < mb_to_top_edge) ?
+ mb_to_top_edge : mv->as_mv.row;
+ mv->as_mv.row = (mv->as_mv.row > mb_to_bottom_edge) ?
+ mb_to_bottom_edge : mv->as_mv.row;
+}
+
+static void clamp_mv2(int_mv *mv, const MACROBLOCKD *xd) {
+ clamp_mv(mv,
+ xd->mb_to_left_edge - LEFT_TOP_MARGIN,
+ xd->mb_to_right_edge + RIGHT_BOTTOM_MARGIN,
+ xd->mb_to_top_edge - LEFT_TOP_MARGIN,
+ xd->mb_to_bottom_edge + RIGHT_BOTTOM_MARGIN);
+}
+
+static unsigned int check_mv_bounds(int_mv *mv,
+ int mb_to_left_edge,
+ int mb_to_right_edge,
+ int mb_to_top_edge,
+ int mb_to_bottom_edge) {
+ return (mv->as_mv.col < mb_to_left_edge) ||
+ (mv->as_mv.col > mb_to_right_edge) ||
+ (mv->as_mv.row < mb_to_top_edge) ||
+ (mv->as_mv.row > mb_to_bottom_edge);
+}
+
+void vp9_find_near_mvs(MACROBLOCKD *xd,
+ const MODE_INFO *here,
+ const MODE_INFO *lfhere,
+ int_mv *nearest, int_mv *nearby, int_mv *best,
+ int near_mv_ref_cts[4],
+ int refframe,
+ int *ref_frame_sign_bias);
+
+vp9_prob *vp9_mv_ref_probs(VP9_COMMON *pc,
+ vp9_prob p[VP9_MVREFS - 1],
+ const int near_mv_ref_ct[4]);
+
+extern const unsigned char vp9_mbsplit_offset[4][16];
+
+static int left_block_mv(const MODE_INFO *cur_mb, int b) {
+ if (!(b & 3)) {
+ /* On L edge, get from MB to left of us */
+ --cur_mb;
+
+ if (cur_mb->mbmi.mode != SPLITMV)
+ return cur_mb->mbmi.mv[0].as_int;
+ b += 4;
+ }
+
+ return (cur_mb->bmi + b - 1)->as_mv.first.as_int;
+}
+
+static int left_block_second_mv(const MODE_INFO *cur_mb, int b) {
+ if (!(b & 3)) {
+ /* On L edge, get from MB to left of us */
+ --cur_mb;
+
+ if (cur_mb->mbmi.mode != SPLITMV)
+ return cur_mb->mbmi.second_ref_frame ? cur_mb->mbmi.mv[1].as_int : cur_mb->mbmi.mv[0].as_int;
+ b += 4;
+ }
+
+ return cur_mb->mbmi.second_ref_frame ? (cur_mb->bmi + b - 1)->as_mv.second.as_int : (cur_mb->bmi + b - 1)->as_mv.first.as_int;
+}
+
+static int above_block_mv(const MODE_INFO *cur_mb, int b, int mi_stride) {
+ if (!(b >> 2)) {
+ /* On top edge, get from MB above us */
+ cur_mb -= mi_stride;
+
+ if (cur_mb->mbmi.mode != SPLITMV)
+ return cur_mb->mbmi.mv[0].as_int;
+ b += 16;
+ }
+
+ return (cur_mb->bmi + b - 4)->as_mv.first.as_int;
+}
+
+static int above_block_second_mv(const MODE_INFO *cur_mb, int b, int mi_stride) {
+ if (!(b >> 2)) {
+ /* On top edge, get from MB above us */
+ cur_mb -= mi_stride;
+
+ if (cur_mb->mbmi.mode != SPLITMV)
+ return cur_mb->mbmi.second_ref_frame ? cur_mb->mbmi.mv[1].as_int : cur_mb->mbmi.mv[0].as_int;
+ b += 16;
+ }
+
+ return cur_mb->mbmi.second_ref_frame ? (cur_mb->bmi + b - 4)->as_mv.second.as_int : (cur_mb->bmi + b - 4)->as_mv.first.as_int;
+}
+
+static B_PREDICTION_MODE left_block_mode(const MODE_INFO *cur_mb, int b) {
+ if (!(b & 3)) {
+ /* On L edge, get from MB to left of us */
+ --cur_mb;
+
+ if (cur_mb->mbmi.mode < I8X8_PRED) {
+ return pred_mode_conv(cur_mb->mbmi.mode);
+ } else if (cur_mb->mbmi.mode == I8X8_PRED) {
+ return pred_mode_conv((cur_mb->bmi + 3 + b)->as_mode.first);
+ } else if (cur_mb->mbmi.mode == B_PRED) {
+ return ((cur_mb->bmi + 3 + b)->as_mode.first);
+ } else {
+ return B_DC_PRED;
+ }
+ }
+ return (cur_mb->bmi + b - 1)->as_mode.first;
+}
+
+static B_PREDICTION_MODE above_block_mode(const MODE_INFO *cur_mb,
+ int b, int mi_stride) {
+ if (!(b >> 2)) {
+ /* On top edge, get from MB above us */
+ cur_mb -= mi_stride;
+
+ if (cur_mb->mbmi.mode < I8X8_PRED) {
+ return pred_mode_conv(cur_mb->mbmi.mode);
+ } else if (cur_mb->mbmi.mode == I8X8_PRED) {
+ return pred_mode_conv((cur_mb->bmi + 12 + b)->as_mode.first);
+ } else if (cur_mb->mbmi.mode == B_PRED) {
+ return ((cur_mb->bmi + 12 + b)->as_mode.first);
+ } else {
+ return B_DC_PRED;
+ }
+ }
+
+ return (cur_mb->bmi + b - 4)->as_mode.first;
+}
+
+#endif
--- /dev/null
+++ b/vp9/common/generic/systemdependent.c
@@ -1,0 +1,87 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "vpx_ports/config.h"
+#include "vpx_rtcd.h"
+#include "vp9/common/subpixel.h"
+#include "vp9/common/loopfilter.h"
+#include "vp9/common/idct.h"
+#include "vp9/common/onyxc_int.h"
+
+extern void vp9_arch_x86_common_init(VP9_COMMON *ctx);
+extern void vp9_arch_arm_common_init(VP9_COMMON *ctx);
+
+void vp9_machine_specific_config(VP9_COMMON *ctx) {
+#if CONFIG_RUNTIME_CPU_DETECT
+ VP9_COMMON_RTCD *rtcd = &ctx->rtcd;
+
+ rtcd->idct.idct1 = vp9_short_idct4x4llm_1_c;
+ rtcd->idct.idct16 = vp9_short_idct4x4llm_c;
+ rtcd->idct.idct1_scalar_add = vp9_dc_only_idct_add_c;
+ rtcd->idct.iwalsh1 = vp9_short_inv_walsh4x4_1_c;
+ rtcd->idct.iwalsh16 = vp9_short_inv_walsh4x4_c;
+ rtcd->idct.idct8 = vp9_short_idct8x8_c;
+ rtcd->idct.idct1_scalar_add_8x8 = vp9_dc_only_idct_add_8x8_c;
+ rtcd->idct.ihaar2 = vp9_short_ihaar2x2_c;
+ rtcd->idct.idct16x16 = vp9_short_idct16x16_c;
+
+ rtcd->subpix.eighttap16x16 = vp9_eighttap_predict16x16_c;
+ rtcd->subpix.eighttap8x8 = vp9_eighttap_predict8x8_c;
+ rtcd->subpix.eighttap_avg16x16 = vp9_eighttap_predict_avg16x16_c;
+ rtcd->subpix.eighttap_avg8x8 = vp9_eighttap_predict_avg8x8_c;
+ rtcd->subpix.eighttap_avg4x4 = vp9_eighttap_predict_avg4x4_c;
+ rtcd->subpix.eighttap8x4 = vp9_eighttap_predict8x4_c;
+ rtcd->subpix.eighttap4x4 = vp9_eighttap_predict_c;
+ rtcd->subpix.eighttap16x16_sharp = vp9_eighttap_predict16x16_sharp_c;
+ rtcd->subpix.eighttap8x8_sharp = vp9_eighttap_predict8x8_sharp_c;
+ rtcd->subpix.eighttap_avg16x16_sharp = vp9_eighttap_predict_avg16x16_sharp_c;
+ rtcd->subpix.eighttap_avg8x8_sharp = vp9_eighttap_predict_avg8x8_sharp_c;
+ rtcd->subpix.eighttap_avg4x4_sharp = vp9_eighttap_predict_avg4x4_sharp_c;
+ rtcd->subpix.eighttap8x4_sharp = vp9_eighttap_predict8x4_sharp_c;
+ rtcd->subpix.eighttap4x4_sharp = vp9_eighttap_predict_sharp_c;
+
+ rtcd->subpix.sixtap16x16 = vp9_sixtap_predict16x16_c;
+ rtcd->subpix.sixtap8x8 = vp9_sixtap_predict8x8_c;
+ rtcd->subpix.sixtap_avg16x16 = vp9_sixtap_predict_avg16x16_c;
+ rtcd->subpix.sixtap_avg8x8 = vp9_sixtap_predict_avg8x8_c;
+ rtcd->subpix.sixtap8x4 = vp9_sixtap_predict8x4_c;
+ rtcd->subpix.sixtap4x4 = vp9_sixtap_predict_c;
+ rtcd->subpix.sixtap_avg4x4 = vp9_sixtap_predict_avg_c;
+ rtcd->subpix.bilinear16x16 = vp9_bilinear_predict16x16_c;
+ rtcd->subpix.bilinear8x8 = vp9_bilinear_predict8x8_c;
+ rtcd->subpix.bilinear_avg16x16 = vp9_bilinear_predict_avg16x16_c;
+ rtcd->subpix.bilinear_avg8x8 = vp9_bilinear_predict_avg8x8_c;
+ rtcd->subpix.bilinear8x4 = vp9_bilinear_predict8x4_c;
+ rtcd->subpix.bilinear4x4 = vp9_bilinear_predict4x4_c;
+ rtcd->subpix.bilinear_avg4x4 = vp9_bilinear_predict_avg4x4_c;
+
+#if CONFIG_POSTPROC || (CONFIG_VP9_ENCODER && CONFIG_INTERNAL_STATS)
+ rtcd->postproc.down = vp9_mbpost_proc_down_c;
+ rtcd->postproc.across = vp9_mbpost_proc_across_ip_c;
+ rtcd->postproc.downacross = vp9_post_proc_down_and_across_c;
+ rtcd->postproc.addnoise = vp9_plane_add_noise_c;
+ rtcd->postproc.blend_mb_inner = vp9_blend_mb_inner_c;
+ rtcd->postproc.blend_mb_outer = vp9_blend_mb_outer_c;
+ rtcd->postproc.blend_b = vp9_blend_b_c;
+#endif
+
+#endif
+
+#if ARCH_X86 || ARCH_X86_64
+ vp9_arch_x86_common_init(ctx);
+#endif
+
+#if ARCH_ARM
+ vp9_arch_arm_common_init(ctx);
+#endif
+
+ vpx_rtcd();
+}
--- /dev/null
+++ b/vp9/common/header.h
@@ -1,0 +1,42 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef __INC_HEADER_H
+#define __INC_HEADER_H
+
+/* 24 bits total */
+typedef struct {
+ unsigned int type: 1;
+ unsigned int version: 3;
+ unsigned int show_frame: 1;
+
+ /* Allow 2^19 bytes (about 4 megabits) for the first partition */
+
+ unsigned int first_partition_length_in_bytes: 19;
+
+#ifdef PACKET_TESTING
+ unsigned int frame_number;
+ unsigned int update_gold: 1;
+ unsigned int uses_gold: 1;
+ unsigned int update_last: 1;
+ unsigned int uses_last: 1;
+#endif
+
+} VP9_HEADER;
+
+#ifdef PACKET_TESTING
+#define VP9_HEADER_SIZE 8
+#else
+#define VP9_HEADER_SIZE 3
+#endif
+
+
+#endif
--- /dev/null
+++ b/vp9/common/idct.h
@@ -1,0 +1,144 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef __INC_IDCT_H
+#define __INC_IDCT_H
+
+#include "vp9/common/blockd.h"
+
+#define prototype_second_order(sym) \
+ void sym(short *input, short *output)
+
+#define prototype_idct(sym) \
+ void sym(short *input, short *output, int pitch)
+
+#define prototype_idct_scalar_add(sym) \
+ void sym(short input, \
+ unsigned char *pred, unsigned char *output, \
+ int pitch, int stride)
+
+#if ARCH_X86 || ARCH_X86_64
+#include "x86/idct_x86.h"
+#endif
+
+#ifdef _MSC_VER
+/* TODO: remove these after integer implementations are done */
+#define M_PI 3.14159265358979323846
+#define round(x) (((x)>0)? floor((x)+0.5): ceil((x)-0.5))
+#endif
+
+
+#if ARCH_ARM
+#include "arm/idct_arm.h"
+#endif
+
+#if CONFIG_LOSSLESS
+#define WHT_UPSCALE_FACTOR 3
+#define Y2_WHT_UPSCALE_FACTOR 2
+#endif
+
+#ifndef vp9_idct_idct16x16
+#define vp9_idct_idct16x16 vp9_short_idct16x16_c
+#endif
+extern prototype_idct(vp9_idct_idct16x16);
+
+#ifndef vp9_idct_idct8
+#define vp9_idct_idct8 vp9_short_idct8x8_c
+#endif
+extern prototype_idct(vp9_idct_idct8);
+
+#ifndef vp9_idct_idct8_1
+#define vp9_idct_idct8_1 vp9_short_idct8x8_1_c
+#endif
+extern prototype_idct(vp9_idct_idct8_1);
+
+#ifndef vp9_idct_ihaar2
+#define vp9_idct_ihaar2 vp9_short_ihaar2x2_c
+#endif
+extern prototype_idct(vp9_idct_ihaar2);
+
+#ifndef vp9_idct_ihaar2_1
+#define vp9_idct_ihaar2_1 vp9_short_ihaar2x2_1_c
+#endif
+extern prototype_idct(vp9_idct_ihaar2_1);
+
+#ifndef vp9_idct_idct1_scalar_add_8x8
+#define vp9_idct_idct1_scalar_add_8x8 vp9_dc_only_idct_add_8x8_c
+#endif
+extern prototype_idct_scalar_add(vp9_idct_idct1_scalar_add_8x8);
+
+
+
+#ifndef vp9_idct_idct1
+#define vp9_idct_idct1 vp9_short_idct4x4llm_1_c
+#endif
+extern prototype_idct(vp9_idct_idct1);
+
+#ifndef vp9_idct_idct16
+#define vp9_idct_idct16 vp9_short_idct4x4llm_c
+#endif
+extern prototype_idct(vp9_idct_idct16);
+
+#ifndef vp9_idct_idct1_scalar_add
+#define vp9_idct_idct1_scalar_add vp9_dc_only_idct_add_c
+#endif
+extern prototype_idct_scalar_add(vp9_idct_idct1_scalar_add);
+
+
+#ifndef vp9_idct_iwalsh1
+#define vp9_idct_iwalsh1 vp9_short_inv_walsh4x4_1_c
+#endif
+extern prototype_second_order(vp9_idct_iwalsh1);
+
+#ifndef vp9_idct_iwalsh16
+#define vp9_idct_iwalsh16 vp9_short_inv_walsh4x4_c
+#endif
+extern prototype_second_order(vp9_idct_iwalsh16);
+
+#if CONFIG_LOSSLESS
+extern prototype_idct(vp9_short_inv_walsh4x4_x8_c);
+extern prototype_idct(vp9_short_inv_walsh4x4_1_x8_c);
+extern prototype_idct_scalar_add(vp9_dc_only_inv_walsh_add_c);
+extern prototype_second_order(vp9_short_inv_walsh4x4_lossless_c);
+extern prototype_second_order(vp9_short_inv_walsh4x4_1_lossless_c);
+#endif
+
+void vp9_ihtllm_c(const int16_t *input, int16_t *output, int pitch,
+ TX_TYPE tx_type, int tx_dim);
+
+typedef prototype_idct((*vp9_idct_fn_t));
+typedef prototype_idct_scalar_add((*vp9_idct_scalar_add_fn_t));
+typedef prototype_second_order((*vp9_second_order_fn_t));
+
+typedef struct {
+ vp9_idct_fn_t idct1;
+ vp9_idct_fn_t idct16;
+ vp9_idct_scalar_add_fn_t idct1_scalar_add;
+
+ vp9_second_order_fn_t iwalsh1;
+ vp9_second_order_fn_t iwalsh16;
+
+ vp9_idct_fn_t idct8;
+ vp9_idct_fn_t idct8_1;
+ vp9_idct_scalar_add_fn_t idct1_scalar_add_8x8;
+ vp9_idct_fn_t ihaar2;
+ vp9_idct_fn_t ihaar2_1;
+
+ vp9_idct_fn_t idct16x16;
+} vp9_idct_rtcd_vtable_t;
+
+#if CONFIG_RUNTIME_CPU_DETECT
+#define IDCT_INVOKE(ctx,fn) (ctx)->fn
+#else
+#define IDCT_INVOKE(ctx,fn) vp9_idct_##fn
+#endif
+
+#endif
--- /dev/null
+++ b/vp9/common/idctllm.c
@@ -1,0 +1,1275 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+/****************************************************************************
+ * Notes:
+ *
+ * This implementation makes use of 16-bit fixed-point versions of two
+ * multiply constants:
+ * 1. sqrt(2) * cos(pi/8)
+ * 2. sqrt(2) * sin(pi/8)
+ * Because the first constant is bigger than 1, to maintain the same 16-bit
+ * fixed-point precision as the second one, we use the trick
+ * x * a = x + x * (a - 1)
+ * so
+ * x * sqrt(2) * cos(pi/8) = x + x * (sqrt(2) * cos(pi/8) - 1).
+ **************************************************************************/
+#include <assert.h>
+#include <math.h>
+#include "vpx_ports/config.h"
+#include "vp9/common/idct.h"
+#include "vp9/common/systemdependent.h"
+
+#include "vp9/common/blockd.h"
+
+static const int cospi8sqrt2minus1 = 20091;
+static const int sinpi8sqrt2 = 35468;
+static const int rounding = 0;
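+/* The constants above are Q16 (scale 1 << 16) versions of the multipliers
+ * described in the notes at the top of this file:
+ *   sqrt(2) * cos(pi/8) - 1 = 0.306563...  ->  round(0.306563 * 65536) = 20091
+ *   sqrt(2) * sin(pi/8)     = 0.541196...  ->  round(0.541196 * 65536) = 35468
+ */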
+
+// TODO: these transforms can be further converted into integer forms
+// for complexity optimization
+static const float idct_4[16] = {
+ 0.500000000000000, 0.653281482438188, 0.500000000000000, 0.270598050073099,
+ 0.500000000000000, 0.270598050073099, -0.500000000000000, -0.653281482438188,
+ 0.500000000000000, -0.270598050073099, -0.500000000000000, 0.653281482438188,
+ 0.500000000000000, -0.653281482438188, 0.500000000000000, -0.270598050073099
+};
+
+static const float iadst_4[16] = {
+ 0.228013428883779, 0.577350269189626, 0.656538502008139, 0.428525073124360,
+ 0.428525073124360, 0.577350269189626, -0.228013428883779, -0.656538502008139,
+ 0.577350269189626, 0, -0.577350269189626, 0.577350269189626,
+ 0.656538502008139, -0.577350269189626, 0.428525073124359, -0.228013428883779
+};
+
+static const float idct_8[64] = {
+ 0.353553390593274, 0.490392640201615, 0.461939766255643, 0.415734806151273,
+ 0.353553390593274, 0.277785116509801, 0.191341716182545, 0.097545161008064,
+ 0.353553390593274, 0.415734806151273, 0.191341716182545, -0.097545161008064,
+ -0.353553390593274, -0.490392640201615, -0.461939766255643, -0.277785116509801,
+ 0.353553390593274, 0.277785116509801, -0.191341716182545, -0.490392640201615,
+ -0.353553390593274, 0.097545161008064, 0.461939766255643, 0.415734806151273,
+ 0.353553390593274, 0.097545161008064, -0.461939766255643, -0.277785116509801,
+ 0.353553390593274, 0.415734806151273, -0.191341716182545, -0.490392640201615,
+ 0.353553390593274, -0.097545161008064, -0.461939766255643, 0.277785116509801,
+ 0.353553390593274, -0.415734806151273, -0.191341716182545, 0.490392640201615,
+ 0.353553390593274, -0.277785116509801, -0.191341716182545, 0.490392640201615,
+ -0.353553390593274, -0.097545161008064, 0.461939766255643, -0.415734806151273,
+ 0.353553390593274, -0.415734806151273, 0.191341716182545, 0.097545161008064,
+ -0.353553390593274, 0.490392640201615, -0.461939766255643, 0.277785116509801,
+ 0.353553390593274, -0.490392640201615, 0.461939766255643, -0.415734806151273,
+ 0.353553390593274, -0.277785116509801, 0.191341716182545, -0.097545161008064
+};
+
+static const float iadst_8[64] = {
+ 0.089131608307533, 0.255357107325376, 0.387095214016349, 0.466553967085785,
+ 0.483002021635509, 0.434217976756762, 0.326790388032145, 0.175227946595735,
+ 0.175227946595735, 0.434217976756762, 0.466553967085785, 0.255357107325376,
+ -0.089131608307533, -0.387095214016348, -0.483002021635509, -0.326790388032145,
+ 0.255357107325376, 0.483002021635509, 0.175227946595735, -0.326790388032145,
+ -0.466553967085785, -0.089131608307533, 0.387095214016349, 0.434217976756762,
+ 0.326790388032145, 0.387095214016349, -0.255357107325376, -0.434217976756762,
+ 0.175227946595735, 0.466553967085786, -0.089131608307534, -0.483002021635509,
+ 0.387095214016349, 0.175227946595735, -0.483002021635509, 0.089131608307533,
+ 0.434217976756762, -0.326790388032145, -0.255357107325377, 0.466553967085785,
+ 0.434217976756762, -0.089131608307533, -0.326790388032145, 0.483002021635509,
+ -0.255357107325376, -0.175227946595735, 0.466553967085785, -0.387095214016348,
+ 0.466553967085785, -0.326790388032145, 0.089131608307533, 0.175227946595735,
+ -0.387095214016348, 0.483002021635509, -0.434217976756762, 0.255357107325376,
+ 0.483002021635509, -0.466553967085785, 0.434217976756762, -0.387095214016348,
+ 0.326790388032145, -0.255357107325375, 0.175227946595736, -0.089131608307532
+};
+
+static const int16_t idct_i4[16] = {
+ 8192, 10703, 8192, 4433,
+ 8192, 4433, -8192, -10703,
+ 8192, -4433, -8192, 10703,
+ 8192, -10703, 8192, -4433
+};
+
+static const int16_t iadst_i4[16] = {
+ 3736, 9459, 10757, 7021,
+ 7021, 9459, -3736, -10757,
+ 9459, 0, -9459, 9459,
+ 10757, -9459, 7021, -3736
+};
+
+static const int16_t idct_i8[64] = {
+ 5793, 8035, 7568, 6811,
+ 5793, 4551, 3135, 1598,
+ 5793, 6811, 3135, -1598,
+ -5793, -8035, -7568, -4551,
+ 5793, 4551, -3135, -8035,
+ -5793, 1598, 7568, 6811,
+ 5793, 1598, -7568, -4551,
+ 5793, 6811, -3135, -8035,
+ 5793, -1598, -7568, 4551,
+ 5793, -6811, -3135, 8035,
+ 5793, -4551, -3135, 8035,
+ -5793, -1598, 7568, -6811,
+ 5793, -6811, 3135, 1598,
+ -5793, 8035, -7568, 4551,
+ 5793, -8035, 7568, -6811,
+ 5793, -4551, 3135, -1598
+};
+
+static const int16_t iadst_i8[64] = {
+ 1460, 4184, 6342, 7644,
+ 7914, 7114, 5354, 2871,
+ 2871, 7114, 7644, 4184,
+ -1460, -6342, -7914, -5354,
+ 4184, 7914, 2871, -5354,
+ -7644, -1460, 6342, 7114,
+ 5354, 6342, -4184, -7114,
+ 2871, 7644, -1460, -7914,
+ 6342, 2871, -7914, 1460,
+ 7114, -5354, -4184, 7644,
+ 7114, -1460, -5354, 7914,
+ -4184, -2871, 7644, -6342,
+ 7644, -5354, 1460, 2871,
+ -6342, 7914, -7114, 4184,
+ 7914, -7644, 7114, -6342,
+ 5354, -4184, 2871, -1460
+};
+
+static float idct_16[256] = {
+ 0.250000, 0.351851, 0.346760, 0.338330, 0.326641, 0.311806, 0.293969, 0.273300,
+ 0.250000, 0.224292, 0.196424, 0.166664, 0.135299, 0.102631, 0.068975, 0.034654,
+ 0.250000, 0.338330, 0.293969, 0.224292, 0.135299, 0.034654, -0.068975, -0.166664,
+ -0.250000, -0.311806, -0.346760, -0.351851, -0.326641, -0.273300, -0.196424, -0.102631,
+ 0.250000, 0.311806, 0.196424, 0.034654, -0.135299, -0.273300, -0.346760, -0.338330,
+ -0.250000, -0.102631, 0.068975, 0.224292, 0.326641, 0.351851, 0.293969, 0.166664,
+ 0.250000, 0.273300, 0.068975, -0.166664, -0.326641, -0.338330, -0.196424, 0.034654,
+ 0.250000, 0.351851, 0.293969, 0.102631, -0.135299, -0.311806, -0.346760, -0.224292,
+ 0.250000, 0.224292, -0.068975, -0.311806, -0.326641, -0.102631, 0.196424, 0.351851,
+ 0.250000, -0.034654, -0.293969, -0.338330, -0.135299, 0.166664, 0.346760, 0.273300,
+ 0.250000, 0.166664, -0.196424, -0.351851, -0.135299, 0.224292, 0.346760, 0.102631,
+ -0.250000, -0.338330, -0.068975, 0.273300, 0.326641, 0.034654, -0.293969, -0.311806,
+ 0.250000, 0.102631, -0.293969, -0.273300, 0.135299, 0.351851, 0.068975, -0.311806,
+ -0.250000, 0.166664, 0.346760, 0.034654, -0.326641, -0.224292, 0.196424, 0.338330,
+ 0.250000, 0.034654, -0.346760, -0.102631, 0.326641, 0.166664, -0.293969, -0.224292,
+ 0.250000, 0.273300, -0.196424, -0.311806, 0.135299, 0.338330, -0.068975, -0.351851,
+ 0.250000, -0.034654, -0.346760, 0.102631, 0.326641, -0.166664, -0.293969, 0.224292,
+ 0.250000, -0.273300, -0.196424, 0.311806, 0.135299, -0.338330, -0.068975, 0.351851,
+ 0.250000, -0.102631, -0.293969, 0.273300, 0.135299, -0.351851, 0.068975, 0.311806,
+ -0.250000, -0.166664, 0.346760, -0.034654, -0.326641, 0.224292, 0.196424, -0.338330,
+ 0.250000, -0.166664, -0.196424, 0.351851, -0.135299, -0.224292, 0.346760, -0.102631,
+ -0.250000, 0.338330, -0.068975, -0.273300, 0.326641, -0.034654, -0.293969, 0.311806,
+ 0.250000, -0.224292, -0.068975, 0.311806, -0.326641, 0.102631, 0.196424, -0.351851,
+ 0.250000, 0.034654, -0.293969, 0.338330, -0.135299, -0.166664, 0.346760, -0.273300,
+ 0.250000, -0.273300, 0.068975, 0.166664, -0.326641, 0.338330, -0.196424, -0.034654,
+ 0.250000, -0.351851, 0.293969, -0.102631, -0.135299, 0.311806, -0.346760, 0.224292,
+ 0.250000, -0.311806, 0.196424, -0.034654, -0.135299, 0.273300, -0.346760, 0.338330,
+ -0.250000, 0.102631, 0.068975, -0.224292, 0.326641, -0.351851, 0.293969, -0.166664,
+ 0.250000, -0.338330, 0.293969, -0.224292, 0.135299, -0.034654, -0.068975, 0.166664,
+ -0.250000, 0.311806, -0.346760, 0.351851, -0.326641, 0.273300, -0.196424, 0.102631,
+ 0.250000, -0.351851, 0.346760, -0.338330, 0.326641, -0.311806, 0.293969, -0.273300,
+ 0.250000, -0.224292, 0.196424, -0.166664, 0.135299, -0.102631, 0.068975, -0.034654
+};
+
+static float iadst_16[256] = {
+ 0.033094, 0.098087, 0.159534, 0.215215, 0.263118, 0.301511, 0.329007, 0.344612,
+ 0.347761, 0.338341, 0.316693, 0.283599, 0.240255, 0.188227, 0.129396, 0.065889,
+ 0.065889, 0.188227, 0.283599, 0.338341, 0.344612, 0.301511, 0.215215, 0.098087,
+ -0.033094, -0.159534, -0.263118, -0.329007, -0.347761, -0.316693, -0.240255, -0.129396,
+ 0.098087, 0.263118, 0.344612, 0.316693, 0.188227, 0.000000, -0.188227, -0.316693,
+ -0.344612, -0.263118, -0.098087, 0.098087, 0.263118, 0.344612, 0.316693, 0.188227,
+ 0.129396, 0.316693, 0.329007, 0.159534, -0.098087, -0.301511, -0.338341, -0.188227,
+ 0.065889, 0.283599, 0.344612, 0.215215, -0.033094, -0.263118, -0.347761, -0.240255,
+ 0.159534, 0.344612, 0.240255, -0.065889, -0.316693, -0.301511, -0.033094, 0.263118,
+ 0.338341, 0.129396, -0.188227, -0.347761, -0.215215, 0.098087, 0.329007, 0.283599,
+ 0.188227, 0.344612, 0.098087, -0.263118, -0.316693, -0.000000, 0.316693, 0.263118,
+ -0.098087, -0.344612, -0.188227, 0.188227, 0.344612, 0.098087, -0.263118, -0.316693,
+ 0.215215, 0.316693, -0.065889, -0.347761, -0.098087, 0.301511, 0.240255, -0.188227,
+ -0.329007, 0.033094, 0.344612, 0.129396, -0.283599, -0.263118, 0.159534, 0.338341,
+ 0.240255, 0.263118, -0.215215, -0.283599, 0.188227, 0.301511, -0.159534, -0.316693,
+ 0.129396, 0.329007, -0.098087, -0.338341, 0.065889, 0.344612, -0.033094, -0.347761,
+ 0.263118, 0.188227, -0.316693, -0.098087, 0.344612, 0.000000, -0.344612, 0.098087,
+ 0.316693, -0.188227, -0.263118, 0.263118, 0.188227, -0.316693, -0.098087, 0.344612,
+ 0.283599, 0.098087, -0.347761, 0.129396, 0.263118, -0.301511, -0.065889, 0.344612,
+ -0.159534, -0.240255, 0.316693, 0.033094, -0.338341, 0.188227, 0.215215, -0.329007,
+ 0.301511, 0.000000, -0.301511, 0.301511, 0.000000, -0.301511, 0.301511, 0.000000,
+ -0.301511, 0.301511, 0.000000, -0.301511, 0.301511, 0.000000, -0.301511, 0.301511,
+ 0.316693, -0.098087, -0.188227, 0.344612, -0.263118, -0.000000, 0.263118, -0.344612,
+ 0.188227, 0.098087, -0.316693, 0.316693, -0.098087, -0.188227, 0.344612, -0.263118,
+ 0.329007, -0.188227, -0.033094, 0.240255, -0.344612, 0.301511, -0.129396, -0.098087,
+ 0.283599, -0.347761, 0.263118, -0.065889, -0.159534, 0.316693, -0.338341, 0.215215,
+ 0.338341, -0.263118, 0.129396, 0.033094, -0.188227, 0.301511, -0.347761, 0.316693,
+ -0.215215, 0.065889, 0.098087, -0.240255, 0.329007, -0.344612, 0.283599, -0.159534,
+ 0.344612, -0.316693, 0.263118, -0.188227, 0.098087, 0.000000, -0.098087, 0.188227,
+ -0.263118, 0.316693, -0.344612, 0.344612, -0.316693, 0.263118, -0.188227, 0.098087,
+ 0.347761, -0.344612, 0.338341, -0.329007, 0.316693, -0.301511, 0.283599, -0.263118,
+ 0.240255, -0.215215, 0.188227, -0.159534, 0.129396, -0.098087, 0.065889, -0.033094
+};
+
+static const int16_t idct_i16[256] = {
+ 4096, 5765, 5681, 5543, 5352, 5109, 4816, 4478,
+ 4096, 3675, 3218, 2731, 2217, 1682, 1130, 568,
+ 4096, 5543, 4816, 3675, 2217, 568, -1130, -2731,
+ -4096, -5109, -5681, -5765, -5352, -4478, -3218, -1682,
+ 4096, 5109, 3218, 568, -2217, -4478, -5681, -5543,
+ -4096, -1682, 1130, 3675, 5352, 5765, 4816, 2731,
+ 4096, 4478, 1130, -2731, -5352, -5543, -3218, 568,
+ 4096, 5765, 4816, 1682, -2217, -5109, -5681, -3675,
+ 4096, 3675, -1130, -5109, -5352, -1682, 3218, 5765,
+ 4096, -568, -4816, -5543, -2217, 2731, 5681, 4478,
+ 4096, 2731, -3218, -5765, -2217, 3675, 5681, 1682,
+ -4096, -5543, -1130, 4478, 5352, 568, -4816, -5109,
+ 4096, 1682, -4816, -4478, 2217, 5765, 1130, -5109,
+ -4096, 2731, 5681, 568, -5352, -3675, 3218, 5543,
+ 4096, 568, -5681, -1682, 5352, 2731, -4816, -3675,
+ 4096, 4478, -3218, -5109, 2217, 5543, -1130, -5765,
+ 4096, -568, -5681, 1682, 5352, -2731, -4816, 3675,
+ 4096, -4478, -3218, 5109, 2217, -5543, -1130, 5765,
+ 4096, -1682, -4816, 4478, 2217, -5765, 1130, 5109,
+ -4096, -2731, 5681, -568, -5352, 3675, 3218, -5543,
+ 4096, -2731, -3218, 5765, -2217, -3675, 5681, -1682,
+ -4096, 5543, -1130, -4478, 5352, -568, -4816, 5109,
+ 4096, -3675, -1130, 5109, -5352, 1682, 3218, -5765,
+ 4096, 568, -4816, 5543, -2217, -2731, 5681, -4478,
+ 4096, -4478, 1130, 2731, -5352, 5543, -3218, -568,
+ 4096, -5765, 4816, -1682, -2217, 5109, -5681, 3675,
+ 4096, -5109, 3218, -568, -2217, 4478, -5681, 5543,
+ -4096, 1682, 1130, -3675, 5352, -5765, 4816, -2731,
+ 4096, -5543, 4816, -3675, 2217, -568, -1130, 2731,
+ -4096, 5109, -5681, 5765, -5352, 4478, -3218, 1682,
+ 4096, -5765, 5681, -5543, 5352, -5109, 4816, -4478,
+ 4096, -3675, 3218, -2731, 2217, -1682, 1130, -568
+};
+
+static const int16_t iadst_i16[256] = {
+ 542, 1607, 2614, 3526, 4311, 4940, 5390, 5646,
+ 5698, 5543, 5189, 4646, 3936, 3084, 2120, 1080,
+ 1080, 3084, 4646, 5543, 5646, 4940, 3526, 1607,
+ -542, -2614, -4311, -5390, -5698, -5189, -3936, -2120,
+ 1607, 4311, 5646, 5189, 3084, 0, -3084, -5189,
+ -5646, -4311, -1607, 1607, 4311, 5646, 5189, 3084,
+ 2120, 5189, 5390, 2614, -1607, -4940, -5543, -3084,
+ 1080, 4646, 5646, 3526, -542, -4311, -5698, -3936,
+ 2614, 5646, 3936, -1080, -5189, -4940, -542, 4311,
+ 5543, 2120, -3084, -5698, -3526, 1607, 5390, 4646,
+ 3084, 5646, 1607, -4311, -5189, 0, 5189, 4311,
+ -1607, -5646, -3084, 3084, 5646, 1607, -4311, -5189,
+ 3526, 5189, -1080, -5698, -1607, 4940, 3936, -3084,
+ -5390, 542, 5646, 2120, -4646, -4311, 2614, 5543,
+ 3936, 4311, -3526, -4646, 3084, 4940, -2614, -5189,
+ 2120, 5390, -1607, -5543, 1080, 5646, -542, -5698,
+ 4311, 3084, -5189, -1607, 5646, 0, -5646, 1607,
+ 5189, -3084, -4311, 4311, 3084, -5189, -1607, 5646,
+ 4646, 1607, -5698, 2120, 4311, -4940, -1080, 5646,
+ -2614, -3936, 5189, 542, -5543, 3084, 3526, -5390,
+ 4940, 0, -4940, 4940, 0, -4940, 4940, 0,
+ -4940, 4940, 0, -4940, 4940, 0, -4940, 4940,
+ 5189, -1607, -3084, 5646, -4311, 0, 4311, -5646,
+ 3084, 1607, -5189, 5189, -1607, -3084, 5646, -4311,
+ 5390, -3084, -542, 3936, -5646, 4940, -2120, -1607,
+ 4646, -5698, 4311, -1080, -2614, 5189, -5543, 3526,
+ 5543, -4311, 2120, 542, -3084, 4940, -5698, 5189,
+ -3526, 1080, 1607, -3936, 5390, -5646, 4646, -2614,
+ 5646, -5189, 4311, -3084, 1607, 0, -1607, 3084,
+ -4311, 5189, -5646, 5646, -5189, 4311, -3084, 1607,
+ 5698, -5646, 5543, -5390, 5189, -4940, 4646, -4311,
+ 3936, -3526, 3084, -2614, 2120, -1607, 1080, -542
+};
+
+/* For test: choose at build time whether the integer or the floating-point
+ * hybrid inverse transform below is exported as vp9_ihtllm_c. */
+#define TEST_INT 1
+#if TEST_INT
+#define vp9_ihtllm_int_c vp9_ihtllm_c
+#else
+#define vp9_ihtllm_float_c vp9_ihtllm_c
+#endif
+
+void vp9_ihtllm_float_c(const int16_t *input, int16_t *output, int pitch,
+ TX_TYPE tx_type, int tx_dim) {
+ vp9_clear_system_state(); // Make it simd safe : __asm emms;
+ {
+ int i, j, k;
+    float bufa[256], bufb[256];  // buffers for the floating-point test path;
+                                 // the implementation could be simplified in
+                                 // conjunction with the integer transform
+ const int16_t *ip = input;
+ int16_t *op = output;
+ int shortpitch = pitch >> 1;
+
+ float *pfa = &bufa[0];
+ float *pfb = &bufb[0];
+
+ // pointers to vertical and horizontal transforms
+ const float *ptv, *pth;
+
+ assert(tx_type != DCT_DCT);
+ // load and convert residual array into floating-point
+ for(j = 0; j < tx_dim; j++) {
+ for(i = 0; i < tx_dim; i++) {
+ pfa[i] = (float)ip[i];
+ }
+ pfa += tx_dim;
+ ip += tx_dim;
+ }
+
+ // vertical transformation
+ pfa = &bufa[0];
+ pfb = &bufb[0];
+
+ switch(tx_type) {
+ case ADST_ADST :
+ case ADST_DCT :
+ ptv = (tx_dim == 4) ? &iadst_4[0] :
+ ((tx_dim == 8) ? &iadst_8[0] : &iadst_16[0]);
+ break;
+
+ default :
+ ptv = (tx_dim == 4) ? &idct_4[0] :
+ ((tx_dim == 8) ? &idct_8[0] : &idct_16[0]);
+ break;
+ }
+
+ for(j = 0; j < tx_dim; j++) {
+ for(i = 0; i < tx_dim; i++) {
+ pfb[i] = 0 ;
+ for(k = 0; k < tx_dim; k++) {
+ pfb[i] += ptv[k] * pfa[(k * tx_dim)];
+ }
+ pfa += 1;
+ }
+
+ pfb += tx_dim;
+ ptv += tx_dim;
+ pfa = &bufa[0];
+ }
+
+ // horizontal transformation
+ pfa = &bufa[0];
+ pfb = &bufb[0];
+
+ switch(tx_type) {
+ case ADST_ADST :
+ case DCT_ADST :
+ pth = (tx_dim == 4) ? &iadst_4[0] :
+ ((tx_dim == 8) ? &iadst_8[0] : &iadst_16[0]);
+ break;
+
+ default :
+ pth = (tx_dim == 4) ? &idct_4[0] :
+ ((tx_dim == 8) ? &idct_8[0] : &idct_16[0]);
+ break;
+ }
+
+ for(j = 0; j < tx_dim; j++) {
+ for(i = 0; i < tx_dim; i++) {
+ pfa[i] = 0;
+ for(k = 0; k < tx_dim; k++) {
+ pfa[i] += pfb[k] * pth[k];
+ }
+ pth += tx_dim;
+ }
+
+ pfa += tx_dim;
+ pfb += tx_dim;
+
+ switch(tx_type) {
+ case ADST_ADST :
+ case DCT_ADST :
+ pth = (tx_dim == 4) ? &iadst_4[0] :
+ ((tx_dim == 8) ? &iadst_8[0] : &iadst_16[0]);
+ break;
+
+ default :
+ pth = (tx_dim == 4) ? &idct_4[0] :
+ ((tx_dim == 8) ? &idct_8[0] : &idct_16[0]);
+ break;
+ }
+ }
+
+ // convert to short integer format and load BLOCKD buffer
+ op = output;
+ pfa = &bufa[0];
+
+ for(j = 0; j < tx_dim; j++) {
+ for(i = 0; i < tx_dim; i++) {
+ op[i] = (pfa[i] > 0 ) ? (int16_t)( pfa[i] / 8 + 0.49) :
+ -(int16_t)( - pfa[i] / 8 + 0.49);
+ }
+
+ op += shortpitch;
+ pfa += tx_dim;
+ }
+ }
+ vp9_clear_system_state(); // Make it simd safe : __asm emms;
+}
+
+/* Converted the transforms to integer form. */
+#define VERTICAL_SHIFT 14 // 16
+#define VERTICAL_ROUNDING ((1 << (VERTICAL_SHIFT - 1)) - 1)
+#define HORIZONTAL_SHIFT 17 // 15
+#define HORIZONTAL_ROUNDING ((1 << (HORIZONTAL_SHIFT - 1)) - 1)
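+/* The integer tables above (idct_i4/i8/i16, iadst_i4/i8/i16) are the float
+ * tables scaled by 2^14 (e.g. 0.5 * 16384 = 8192), so the vertical pass
+ * shifts right by 14 to remove that scale, while the horizontal pass shifts
+ * by 17, i.e. 14 plus the extra /8 the float version applies on output. */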
+void vp9_ihtllm_int_c(const int16_t *input, int16_t *output, int pitch,
+ TX_TYPE tx_type, int tx_dim) {
+ int i, j, k;
+ int16_t imbuf[256];
+
+ const int16_t *ip = input;
+ int16_t *op = output;
+ int16_t *im = &imbuf[0];
+
+ /* pointers to vertical and horizontal transforms. */
+ const int16_t *ptv = NULL, *pth = NULL;
+ int shortpitch = pitch >> 1;
+
+ switch (tx_type) {
+ case ADST_ADST :
+ ptv = pth = (tx_dim == 4) ? &iadst_i4[0]
+ : ((tx_dim == 8) ? &iadst_i8[0]
+ : &iadst_i16[0]);
+ break;
+ case ADST_DCT :
+ ptv = (tx_dim == 4) ? &iadst_i4[0]
+ : ((tx_dim == 8) ? &iadst_i8[0] : &iadst_i16[0]);
+ pth = (tx_dim == 4) ? &idct_i4[0]
+ : ((tx_dim == 8) ? &idct_i8[0] : &idct_i16[0]);
+ break;
+ case DCT_ADST :
+ ptv = (tx_dim == 4) ? &idct_i4[0]
+ : ((tx_dim == 8) ? &idct_i8[0] : &idct_i16[0]);
+ pth = (tx_dim == 4) ? &iadst_i4[0]
+ : ((tx_dim == 8) ? &iadst_i8[0] : &iadst_i16[0]);
+ break;
+ case DCT_DCT :
+ ptv = pth = (tx_dim == 4) ? &idct_i4[0]
+ : ((tx_dim == 8) ? &idct_i8[0]
+ : &idct_i16[0]);
+ break;
+ default:
+ assert(0);
+ break;
+ }
+
+ /* vertical transformation */
+ for (j = 0; j < tx_dim; j++) {
+ for (i = 0; i < tx_dim; i++) {
+ int temp = 0;
+
+ for (k = 0; k < tx_dim; k++) {
+ temp += ptv[k] * ip[(k * tx_dim)];
+ }
+
+ im[i] = (int16_t)((temp + VERTICAL_ROUNDING) >> VERTICAL_SHIFT);
+ ip++;
+ }
+ im += tx_dim; // 16
+ ptv += tx_dim;
+ ip = input;
+ }
+
+ /* horizontal transformation */
+ im = &imbuf[0];
+
+ for (j = 0; j < tx_dim; j++) {
+ const int16_t *pthc = pth;
+
+ for (i = 0; i < tx_dim; i++) {
+ int temp = 0;
+
+ for (k = 0; k < tx_dim; k++) {
+ temp += im[k] * pthc[k];
+ }
+
+ op[i] = (int16_t)((temp + HORIZONTAL_ROUNDING) >> HORIZONTAL_SHIFT);
+ pthc += tx_dim;
+ }
+
+ im += tx_dim; // 16
+ op += shortpitch;
+ }
+}
+
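+/* 4x4 inverse DCT carried over from VP8: a vertical pass over the columns
+ * followed by a horizontal pass over the rows.  The sqrt(2)*cos(pi/8)
+ * multiplies use the x + x*(a-1) trick from the notes at the top of this
+ * file, and the final (x + 16) >> 5 applies the /32 output scaling with
+ * rounding. */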
+void vp9_short_idct4x4llm_c(short *input, short *output, int pitch) {
+ int i;
+ int a1, b1, c1, d1;
+
+ short *ip = input;
+ short *op = output;
+ int temp1, temp2;
+ int shortpitch = pitch >> 1;
+
+ for (i = 0; i < 4; i++) {
+ a1 = ip[0] + ip[8];
+ b1 = ip[0] - ip[8];
+
+ temp1 = (ip[4] * sinpi8sqrt2 + rounding) >> 16;
+ temp2 = ip[12] + ((ip[12] * cospi8sqrt2minus1 + rounding) >> 16);
+ c1 = temp1 - temp2;
+
+ temp1 = ip[4] + ((ip[4] * cospi8sqrt2minus1 + rounding) >> 16);
+ temp2 = (ip[12] * sinpi8sqrt2 + rounding) >> 16;
+ d1 = temp1 + temp2;
+
+ op[shortpitch * 0] = a1 + d1;
+ op[shortpitch * 3] = a1 - d1;
+
+ op[shortpitch * 1] = b1 + c1;
+ op[shortpitch * 2] = b1 - c1;
+
+ ip++;
+ op++;
+ }
+
+ ip = output;
+ op = output;
+
+ for (i = 0; i < 4; i++) {
+ a1 = ip[0] + ip[2];
+ b1 = ip[0] - ip[2];
+
+ temp1 = (ip[1] * sinpi8sqrt2 + rounding) >> 16;
+ temp2 = ip[3] + ((ip[3] * cospi8sqrt2minus1 + rounding) >> 16);
+ c1 = temp1 - temp2;
+
+ temp1 = ip[1] + ((ip[1] * cospi8sqrt2minus1 + rounding) >> 16);
+ temp2 = (ip[3] * sinpi8sqrt2 + rounding) >> 16;
+ d1 = temp1 + temp2;
+
+ op[0] = (a1 + d1 + 16) >> 5;
+ op[3] = (a1 - d1 + 16) >> 5;
+
+ op[1] = (b1 + c1 + 16) >> 5;
+ op[2] = (b1 - c1 + 16) >> 5;
+
+ ip += shortpitch;
+ op += shortpitch;
+ }
+}
+
+void vp9_short_idct4x4llm_1_c(short *input, short *output, int pitch) {
+ int i;
+ int a1;
+ short *op = output;
+ int shortpitch = pitch >> 1;
+ a1 = ((input[0] + 16) >> 5);
+ for (i = 0; i < 4; i++) {
+ op[0] = a1;
+ op[1] = a1;
+ op[2] = a1;
+ op[3] = a1;
+ op += shortpitch;
+ }
+}
+
+void vp9_dc_only_idct_add_c(short input_dc, unsigned char *pred_ptr,
+ unsigned char *dst_ptr, int pitch, int stride) {
+ int a1 = ((input_dc + 16) >> 5);
+ int r, c;
+
+ for (r = 0; r < 4; r++) {
+ for (c = 0; c < 4; c++) {
+ int a = a1 + pred_ptr[c];
+
+ if (a < 0)
+ a = 0;
+
+ if (a > 255)
+ a = 255;
+
+ dst_ptr[c] = (unsigned char) a;
+ }
+
+ dst_ptr += stride;
+ pred_ptr += pitch;
+ }
+}
+
+void vp9_short_inv_walsh4x4_c(short *input, short *output) {
+ int i;
+ int a1, b1, c1, d1;
+ short *ip = input;
+ short *op = output;
+
+ for (i = 0; i < 4; i++) {
+ a1 = ((ip[0] + ip[3]));
+ b1 = ((ip[1] + ip[2]));
+ c1 = ((ip[1] - ip[2]));
+ d1 = ((ip[0] - ip[3]));
+
+ op[0] = (a1 + b1 + 1) >> 1;
+ op[1] = (c1 + d1) >> 1;
+ op[2] = (a1 - b1) >> 1;
+ op[3] = (d1 - c1) >> 1;
+
+ ip += 4;
+ op += 4;
+ }
+
+ ip = output;
+ op = output;
+ for (i = 0; i < 4; i++) {
+ a1 = ip[0] + ip[12];
+ b1 = ip[4] + ip[8];
+ c1 = ip[4] - ip[8];
+ d1 = ip[0] - ip[12];
+ op[0] = (a1 + b1 + 1) >> 1;
+ op[4] = (c1 + d1) >> 1;
+ op[8] = (a1 - b1) >> 1;
+ op[12] = (d1 - c1) >> 1;
+ ip++;
+ op++;
+ }
+}
+
+void vp9_short_inv_walsh4x4_1_c(short *in, short *out) {
+ int i;
+ short tmp[4];
+ short *ip = in;
+ short *op = tmp;
+
+ op[0] = (ip[0] + 1) >> 1;
+ op[1] = op[2] = op[3] = (ip[0] >> 1);
+
+ ip = tmp;
+ op = out;
+ for (i = 0; i < 4; i++) {
+ op[0] = (ip[0] + 1) >> 1;
+ op[4] = op[8] = op[12] = (ip[0] >> 1);
+ ip++;
+ op++;
+ }
+}
+
+#if CONFIG_LOSSLESS
+void vp9_short_inv_walsh4x4_lossless_c(short *input, short *output) {
+ int i;
+ int a1, b1, c1, d1;
+ short *ip = input;
+ short *op = output;
+
+ for (i = 0; i < 4; i++) {
+ a1 = ((ip[0] + ip[3])) >> Y2_WHT_UPSCALE_FACTOR;
+ b1 = ((ip[1] + ip[2])) >> Y2_WHT_UPSCALE_FACTOR;
+ c1 = ((ip[1] - ip[2])) >> Y2_WHT_UPSCALE_FACTOR;
+ d1 = ((ip[0] - ip[3])) >> Y2_WHT_UPSCALE_FACTOR;
+
+ op[0] = (a1 + b1 + 1) >> 1;
+ op[1] = (c1 + d1) >> 1;
+ op[2] = (a1 - b1) >> 1;
+ op[3] = (d1 - c1) >> 1;
+
+ ip += 4;
+ op += 4;
+ }
+
+ ip = output;
+ op = output;
+ for (i = 0; i < 4; i++) {
+ a1 = ip[0] + ip[12];
+ b1 = ip[4] + ip[8];
+ c1 = ip[4] - ip[8];
+ d1 = ip[0] - ip[12];
+
+
+ op[0] = ((a1 + b1 + 1) >> 1) << Y2_WHT_UPSCALE_FACTOR;
+ op[4] = ((c1 + d1) >> 1) << Y2_WHT_UPSCALE_FACTOR;
+ op[8] = ((a1 - b1) >> 1) << Y2_WHT_UPSCALE_FACTOR;
+ op[12] = ((d1 - c1) >> 1) << Y2_WHT_UPSCALE_FACTOR;
+
+ ip++;
+ op++;
+ }
+}
+
+void vp9_short_inv_walsh4x4_1_lossless_c(short *in, short *out) {
+ int i;
+ short tmp[4];
+ short *ip = in;
+ short *op = tmp;
+
+ op[0] = ((ip[0] >> Y2_WHT_UPSCALE_FACTOR) + 1) >> 1;
+ op[1] = op[2] = op[3] = ((ip[0] >> Y2_WHT_UPSCALE_FACTOR) >> 1);
+
+ ip = tmp;
+ op = out;
+ for (i = 0; i < 4; i++) {
+ op[0] = ((ip[0] + 1) >> 1) << Y2_WHT_UPSCALE_FACTOR;
+ op[4] = op[8] = op[12] = ((ip[0] >> 1)) << Y2_WHT_UPSCALE_FACTOR;
+ ip++;
+ op++;
+ }
+}
+
+void vp9_short_inv_walsh4x4_x8_c(short *input, short *output, int pitch) {
+ int i;
+ int a1, b1, c1, d1;
+ short *ip = input;
+ short *op = output;
+ int shortpitch = pitch >> 1;
+
+ for (i = 0; i < 4; i++) {
+ a1 = ((ip[0] + ip[3])) >> WHT_UPSCALE_FACTOR;
+ b1 = ((ip[1] + ip[2])) >> WHT_UPSCALE_FACTOR;
+ c1 = ((ip[1] - ip[2])) >> WHT_UPSCALE_FACTOR;
+ d1 = ((ip[0] - ip[3])) >> WHT_UPSCALE_FACTOR;
+
+ op[0] = (a1 + b1 + 1) >> 1;
+ op[1] = (c1 + d1) >> 1;
+ op[2] = (a1 - b1) >> 1;
+ op[3] = (d1 - c1) >> 1;
+
+ ip += 4;
+ op += shortpitch;
+ }
+
+ ip = output;
+ op = output;
+ for (i = 0; i < 4; i++) {
+ a1 = ip[shortpitch * 0] + ip[shortpitch * 3];
+ b1 = ip[shortpitch * 1] + ip[shortpitch * 2];
+ c1 = ip[shortpitch * 1] - ip[shortpitch * 2];
+ d1 = ip[shortpitch * 0] - ip[shortpitch * 3];
+
+
+ op[shortpitch * 0] = (a1 + b1 + 1) >> 1;
+ op[shortpitch * 1] = (c1 + d1) >> 1;
+ op[shortpitch * 2] = (a1 - b1) >> 1;
+ op[shortpitch * 3] = (d1 - c1) >> 1;
+
+ ip++;
+ op++;
+ }
+}
+
+void vp9_short_inv_walsh4x4_1_x8_c(short *in, short *out, int pitch) {
+ int i;
+ short tmp[4];
+ short *ip = in;
+ short *op = tmp;
+ int shortpitch = pitch >> 1;
+
+ op[0] = ((ip[0] >> WHT_UPSCALE_FACTOR) + 1) >> 1;
+ op[1] = op[2] = op[3] = ((ip[0] >> WHT_UPSCALE_FACTOR) >> 1);
+
+
+ ip = tmp;
+ op = out;
+ for (i = 0; i < 4; i++) {
+ op[shortpitch * 0] = (ip[0] + 1) >> 1;
+ op[shortpitch * 1] = op[shortpitch * 2] = op[shortpitch * 3] = ip[0] >> 1;
+ ip++;
+ op++;
+ }
+}
+
+void vp9_dc_only_inv_walsh_add_c(short input_dc, unsigned char *pred_ptr,
+ unsigned char *dst_ptr,
+ int pitch, int stride) {
+ int r, c;
+ short tmp[16];
+ vp9_short_inv_walsh4x4_1_x8_c(&input_dc, tmp, 4 << 1);
+
+ for (r = 0; r < 4; r++) {
+ for (c = 0; c < 4; c++) {
+ int a = tmp[r * 4 + c] + pred_ptr[c];
+ if (a < 0)
+ a = 0;
+
+ if (a > 255)
+ a = 255;
+
+ dst_ptr[c] = (unsigned char) a;
+ }
+
+ dst_ptr += stride;
+ pred_ptr += pitch;
+ }
+}
+#endif
+
+void vp9_dc_only_idct_add_8x8_c(short input_dc,
+ unsigned char *pred_ptr,
+ unsigned char *dst_ptr,
+ int pitch, int stride) {
+ int a1 = ((input_dc + 16) >> 5);
+ int r, c, b;
+ unsigned char *orig_pred = pred_ptr;
+ unsigned char *orig_dst = dst_ptr;
+ for (b = 0; b < 4; b++) {
+ for (r = 0; r < 4; r++) {
+ for (c = 0; c < 4; c++) {
+ int a = a1 + pred_ptr[c];
+
+ if (a < 0)
+ a = 0;
+
+ if (a > 255)
+ a = 255;
+
+ dst_ptr[c] = (unsigned char) a;
+ }
+
+ dst_ptr += stride;
+ pred_ptr += pitch;
+ }
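+    /* advance to the next 4x4 quadrant of the 8x8 block; b = 0..3 visits
+     * top-left, top-right, bottom-left and bottom-right in that order */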
+ dst_ptr = orig_dst + (b + 1) % 2 * 4 + (b + 1) / 2 * 4 * stride;
+ pred_ptr = orig_pred + (b + 1) % 2 * 4 + (b + 1) / 2 * 4 * pitch;
+ }
+}
+
+#define W1 2841 /* 2048*sqrt(2)*cos(1*pi/16) */
+#define W2 2676 /* 2048*sqrt(2)*cos(2*pi/16) */
+#define W3 2408 /* 2048*sqrt(2)*cos(3*pi/16) */
+#define W5 1609 /* 2048*sqrt(2)*cos(5*pi/16) */
+#define W6 1108 /* 2048*sqrt(2)*cos(6*pi/16) */
+#define W7 565 /* 2048*sqrt(2)*cos(7*pi/16) */
+
+/* row (horizontal) IDCT
+ *
+ *           7                       pi         1
+ * dst[k] = sum c[l] * src[l] * cos( -- * ( k + - ) * l )
+ *          l=0                      8          2
+ *
+ * where: c[0]    = 128
+ *        c[1..7] = 128*sqrt(2)
+ */
+
+static void idctrow(int *blk) {
+ int x0, x1, x2, x3, x4, x5, x6, x7, x8;
+ /* shortcut */
+ if (!((x1 = blk[4] << 11) | (x2 = blk[6]) | (x3 = blk[2]) |
+ (x4 = blk[1]) | (x5 = blk[7]) | (x6 = blk[5]) | (x7 = blk[3]))) {
+ blk[0] = blk[1] = blk[2] = blk[3] = blk[4]
+ = blk[5] = blk[6] = blk[7] = blk[0] << 3;
+ return;
+ }
+
+ x0 = (blk[0] << 11) + 128; /* for proper rounding in the fourth stage */
+ /* first stage */
+ x8 = W7 * (x4 + x5);
+ x4 = x8 + (W1 - W7) * x4;
+ x5 = x8 - (W1 + W7) * x5;
+ x8 = W3 * (x6 + x7);
+ x6 = x8 - (W3 - W5) * x6;
+ x7 = x8 - (W3 + W5) * x7;
+
+ /* second stage */
+ x8 = x0 + x1;
+ x0 -= x1;
+ x1 = W6 * (x3 + x2);
+ x2 = x1 - (W2 + W6) * x2;
+ x3 = x1 + (W2 - W6) * x3;
+ x1 = x4 + x6;
+ x4 -= x6;
+ x6 = x5 + x7;
+ x5 -= x7;
+
+ /* third stage */
+ x7 = x8 + x3;
+ x8 -= x3;
+ x3 = x0 + x2;
+ x0 -= x2;
+ x2 = (181 * (x4 + x5) + 128) >> 8;
+ x4 = (181 * (x4 - x5) + 128) >> 8;
+
+ /* fourth stage */
+ blk[0] = (x7 + x1) >> 8;
+ blk[1] = (x3 + x2) >> 8;
+ blk[2] = (x0 + x4) >> 8;
+ blk[3] = (x8 + x6) >> 8;
+ blk[4] = (x8 - x6) >> 8;
+ blk[5] = (x0 - x4) >> 8;
+ blk[6] = (x3 - x2) >> 8;
+ blk[7] = (x7 - x1) >> 8;
+}
+
+/* column (vertical) IDCT
+ *
+ *             7                         pi         1
+ * dst[8*k] = sum c[l] * src[8*l] * cos( -- * ( k + - ) * l )
+ *            l=0                        8          2
+ *
+ * where: c[0]    = 1/1024
+ *        c[1..7] = (1/1024)*sqrt(2)
+ */
+static void idctcol(int *blk) {
+ int x0, x1, x2, x3, x4, x5, x6, x7, x8;
+
+ /* shortcut */
+ if (!((x1 = (blk[8 * 4] << 8)) | (x2 = blk[8 * 6]) | (x3 = blk[8 * 2]) |
+ (x4 = blk[8 * 1]) | (x5 = blk[8 * 7]) | (x6 = blk[8 * 5]) |
+ (x7 = blk[8 * 3]))) {
+ blk[8 * 0] = blk[8 * 1] = blk[8 * 2] = blk[8 * 3]
+ = blk[8 * 4] = blk[8 * 5] = blk[8 * 6]
+ = blk[8 * 7] = ((blk[8 * 0] + 32) >> 6);
+ return;
+ }
+
+ x0 = (blk[8 * 0] << 8) + 16384;
+
+ /* first stage */
+ x8 = W7 * (x4 + x5) + 4;
+ x4 = (x8 + (W1 - W7) * x4) >> 3;
+ x5 = (x8 - (W1 + W7) * x5) >> 3;
+ x8 = W3 * (x6 + x7) + 4;
+ x6 = (x8 - (W3 - W5) * x6) >> 3;
+ x7 = (x8 - (W3 + W5) * x7) >> 3;
+
+ /* second stage */
+ x8 = x0 + x1;
+ x0 -= x1;
+ x1 = W6 * (x3 + x2) + 4;
+ x2 = (x1 - (W2 + W6) * x2) >> 3;
+ x3 = (x1 + (W2 - W6) * x3) >> 3;
+ x1 = x4 + x6;
+ x4 -= x6;
+ x6 = x5 + x7;
+ x5 -= x7;
+
+ /* third stage */
+ x7 = x8 + x3;
+ x8 -= x3;
+ x3 = x0 + x2;
+ x0 -= x2;
+ x2 = (181 * (x4 + x5) + 128) >> 8;
+ x4 = (181 * (x4 - x5) + 128) >> 8;
+
+ /* fourth stage */
+ blk[8 * 0] = (x7 + x1) >> 14;
+ blk[8 * 1] = (x3 + x2) >> 14;
+ blk[8 * 2] = (x0 + x4) >> 14;
+ blk[8 * 3] = (x8 + x6) >> 14;
+ blk[8 * 4] = (x8 - x6) >> 14;
+ blk[8 * 5] = (x0 - x4) >> 14;
+ blk[8 * 6] = (x3 - x2) >> 14;
+ blk[8 * 7] = (x7 - x1) >> 14;
+}
+
+#define TX_DIM 8
+void vp9_short_idct8x8_c(short *coefs, short *block, int pitch) {
+ int X[TX_DIM * TX_DIM];
+ int i, j;
+ int shortpitch = pitch >> 1;
+
+ for (i = 0; i < TX_DIM; i++) {
+ for (j = 0; j < TX_DIM; j++) {
+ X[i * TX_DIM + j] = (int)(coefs[i * TX_DIM + j] + 1
+ + (coefs[i * TX_DIM + j] < 0)) >> 2;
+ }
+ }
+ for (i = 0; i < 8; i++)
+ idctrow(X + 8 * i);
+
+ for (i = 0; i < 8; i++)
+ idctcol(X + i);
+
+ for (i = 0; i < TX_DIM; i++) {
+ for (j = 0; j < TX_DIM; j++) {
+ block[i * shortpitch + j] = X[i * TX_DIM + j] >> 1;
+ }
+ }
+}
+
+
+void vp9_short_ihaar2x2_c(short *input, short *output, int pitch) {
+ int i;
+ short *ip = input; // 0,1, 4, 8
+ short *op = output;
+ for (i = 0; i < 16; i++) {
+ op[i] = 0;
+ }
+
+ op[0] = (ip[0] + ip[1] + ip[4] + ip[8] + 1) >> 1;
+ op[1] = (ip[0] - ip[1] + ip[4] - ip[8]) >> 1;
+ op[4] = (ip[0] + ip[1] - ip[4] - ip[8]) >> 1;
+ op[8] = (ip[0] - ip[1] - ip[4] + ip[8]) >> 1;
+}
+
+
+#if 0
+// Keep a really bad float version as reference for now.
+void vp9_short_idct16x16_c(short *input, short *output, int pitch) {
+
+ vp9_clear_system_state(); // Make it simd safe : __asm emms;
+ {
+ double x;
+ const int short_pitch = pitch >> 1;
+ int i, j, k, l;
+ for (l = 0; l < 16; ++l) {
+ for (k = 0; k < 16; ++k) {
+ double s = 0;
+ for (i = 0; i < 16; ++i) {
+ for (j = 0; j < 16; ++j) {
+ x=cos(PI*j*(l+0.5)/16.0)*cos(PI*i*(k+0.5)/16.0)*input[i*16+j]/32;
+ if (i != 0)
+ x *= sqrt(2.0);
+ if (j != 0)
+ x *= sqrt(2.0);
+ s += x;
+ }
+ }
+ output[k*short_pitch+l] = (short)round(s);
+ }
+ }
+ }
+ vp9_clear_system_state(); // Make it simd safe : __asm emms;
+}
+#endif
+
+static const double C1 = 0.995184726672197;
+static const double C2 = 0.98078528040323;
+static const double C3 = 0.956940335732209;
+static const double C4 = 0.923879532511287;
+static const double C5 = 0.881921264348355;
+static const double C6 = 0.831469612302545;
+static const double C7 = 0.773010453362737;
+static const double C8 = 0.707106781186548;
+static const double C9 = 0.634393284163646;
+static const double C10 = 0.555570233019602;
+static const double C11 = 0.471396736825998;
+static const double C12 = 0.38268343236509;
+static const double C13 = 0.290284677254462;
+static const double C14 = 0.195090322016128;
+static const double C15 = 0.098017140329561;
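+
+/* Ck above is cos(k * pi / 32); these constants drive the 16-point IDCT
+ * butterflies in butterfly_16x16_idct_1d() below. */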
+
+
+static void butterfly_16x16_idct_1d(double input[16], double output[16]) {
+
+ vp9_clear_system_state(); // Make it simd safe : __asm emms;
+ {
+ double step[16];
+ double intermediate[16];
+ double temp1, temp2;
+
+
+ // step 1 and 2
+ step[ 0] = input[0] + input[8];
+ step[ 1] = input[0] - input[8];
+
+ temp1 = input[4]*C12;
+ temp2 = input[12]*C4;
+
+ temp1 -= temp2;
+ temp1 *= C8;
+
+ step[ 2] = 2*(temp1);
+
+ temp1 = input[4]*C4;
+ temp2 = input[12]*C12;
+ temp1 += temp2;
+ temp1 = (temp1);
+ temp1 *= C8;
+ step[ 3] = 2*(temp1);
+
+ temp1 = input[2]*C8;
+ temp1 = 2*(temp1);
+ temp2 = input[6] + input[10];
+
+ step[ 4] = temp1 + temp2;
+ step[ 5] = temp1 - temp2;
+
+ temp1 = input[14]*C8;
+ temp1 = 2*(temp1);
+ temp2 = input[6] - input[10];
+
+ step[ 6] = temp2 - temp1;
+ step[ 7] = temp2 + temp1;
+
+ // for odd input
+ temp1 = input[3]*C12;
+ temp2 = input[13]*C4;
+ temp1 += temp2;
+ temp1 = (temp1);
+ temp1 *= C8;
+ intermediate[ 8] = 2*(temp1);
+
+ temp1 = input[3]*C4;
+ temp2 = input[13]*C12;
+ temp2 -= temp1;
+ temp2 = (temp2);
+ temp2 *= C8;
+ intermediate[ 9] = 2*(temp2);
+
+ intermediate[10] = 2*(input[9]*C8);
+ intermediate[11] = input[15] - input[1];
+ intermediate[12] = input[15] + input[1];
+ intermediate[13] = 2*((input[7]*C8));
+
+ temp1 = input[11]*C12;
+ temp2 = input[5]*C4;
+ temp2 -= temp1;
+ temp2 = (temp2);
+ temp2 *= C8;
+ intermediate[14] = 2*(temp2);
+
+ temp1 = input[11]*C4;
+ temp2 = input[5]*C12;
+ temp1 += temp2;
+ temp1 = (temp1);
+ temp1 *= C8;
+ intermediate[15] = 2*(temp1);
+
+ step[ 8] = intermediate[ 8] + intermediate[14];
+ step[ 9] = intermediate[ 9] + intermediate[15];
+ step[10] = intermediate[10] + intermediate[11];
+ step[11] = intermediate[10] - intermediate[11];
+ step[12] = intermediate[12] + intermediate[13];
+ step[13] = intermediate[12] - intermediate[13];
+ step[14] = intermediate[ 8] - intermediate[14];
+ step[15] = intermediate[ 9] - intermediate[15];
+
+ // step 3
+ output[0] = step[ 0] + step[ 3];
+ output[1] = step[ 1] + step[ 2];
+ output[2] = step[ 1] - step[ 2];
+ output[3] = step[ 0] - step[ 3];
+
+ temp1 = step[ 4]*C14;
+ temp2 = step[ 7]*C2;
+ temp1 -= temp2;
+ output[4] = (temp1);
+
+ temp1 = step[ 4]*C2;
+ temp2 = step[ 7]*C14;
+ temp1 += temp2;
+ output[7] = (temp1);
+
+ temp1 = step[ 5]*C10;
+ temp2 = step[ 6]*C6;
+ temp1 -= temp2;
+ output[5] = (temp1);
+
+ temp1 = step[ 5]*C6;
+ temp2 = step[ 6]*C10;
+ temp1 += temp2;
+ output[6] = (temp1);
+
+ output[8] = step[ 8] + step[11];
+ output[9] = step[ 9] + step[10];
+ output[10] = step[ 9] - step[10];
+ output[11] = step[ 8] - step[11];
+ output[12] = step[12] + step[15];
+ output[13] = step[13] + step[14];
+ output[14] = step[13] - step[14];
+ output[15] = step[12] - step[15];
+
+ // output 4
+ step[ 0] = output[0] + output[7];
+ step[ 1] = output[1] + output[6];
+ step[ 2] = output[2] + output[5];
+ step[ 3] = output[3] + output[4];
+ step[ 4] = output[3] - output[4];
+ step[ 5] = output[2] - output[5];
+ step[ 6] = output[1] - output[6];
+ step[ 7] = output[0] - output[7];
+
+ temp1 = output[8]*C7;
+ temp2 = output[15]*C9;
+ temp1 -= temp2;
+ step[ 8] = (temp1);
+
+ temp1 = output[9]*C11;
+ temp2 = output[14]*C5;
+ temp1 += temp2;
+ step[ 9] = (temp1);
+
+ temp1 = output[10]*C3;
+ temp2 = output[13]*C13;
+ temp1 -= temp2;
+ step[10] = (temp1);
+
+ temp1 = output[11]*C15;
+ temp2 = output[12]*C1;
+ temp1 += temp2;
+ step[11] = (temp1);
+
+ temp1 = output[11]*C1;
+ temp2 = output[12]*C15;
+ temp2 -= temp1;
+ step[12] = (temp2);
+
+ temp1 = output[10]*C13;
+ temp2 = output[13]*C3;
+ temp1 += temp2;
+ step[13] = (temp1);
+
+ temp1 = output[9]*C5;
+ temp2 = output[14]*C11;
+ temp2 -= temp1;
+ step[14] = (temp2);
+
+ temp1 = output[8]*C9;
+ temp2 = output[15]*C7;
+ temp1 += temp2;
+ step[15] = (temp1);
+
+ // step 5
+ output[0] = (step[0] + step[15]);
+ output[1] = (step[1] + step[14]);
+ output[2] = (step[2] + step[13]);
+ output[3] = (step[3] + step[12]);
+ output[4] = (step[4] + step[11]);
+ output[5] = (step[5] + step[10]);
+ output[6] = (step[6] + step[ 9]);
+ output[7] = (step[7] + step[ 8]);
+
+ output[15] = (step[0] - step[15]);
+ output[14] = (step[1] - step[14]);
+ output[13] = (step[2] - step[13]);
+ output[12] = (step[3] - step[12]);
+ output[11] = (step[4] - step[11]);
+ output[10] = (step[5] - step[10]);
+ output[9] = (step[6] - step[ 9]);
+ output[8] = (step[7] - step[ 8]);
+ }
+ vp9_clear_system_state(); // Make it simd safe : __asm emms;
+}
+
+// Remove once an int version of iDCT is written
+#if 0
+void reference_16x16_idct_1d(double input[16], double output[16]) {
+
+ vp9_clear_system_state(); // Make it simd safe : __asm emms;
+ {
+ const double kPi = 3.141592653589793238462643383279502884;
+ const double kSqrt2 = 1.414213562373095048801688724209698;
+ for (int k = 0; k < 16; k++) {
+ output[k] = 0.0;
+ for (int n = 0; n < 16; n++) {
+ output[k] += input[n]*cos(kPi*(2*k+1)*n/32.0);
+ if (n == 0)
+ output[k] = output[k]/kSqrt2;
+ }
+ }
+ }
+ vp9_clear_system_state(); // Make it simd safe : __asm emms;
+}
+#endif
+
+void vp9_short_idct16x16_c(short *input, short *output, int pitch) {
+
+ vp9_clear_system_state(); // Make it simd safe : __asm emms;
+ {
+ double out[16*16], out2[16*16];
+ const int short_pitch = pitch >> 1;
+ int i, j;
+ // First transform rows
+ for (i = 0; i < 16; ++i) {
+ double temp_in[16], temp_out[16];
+ for (j = 0; j < 16; ++j)
+ temp_in[j] = input[j + i*short_pitch];
+ butterfly_16x16_idct_1d(temp_in, temp_out);
+ for (j = 0; j < 16; ++j)
+ out[j + i*16] = temp_out[j];
+ }
+ // Then transform columns
+ for (i = 0; i < 16; ++i) {
+ double temp_in[16], temp_out[16];
+ for (j = 0; j < 16; ++j)
+ temp_in[j] = out[j*16 + i];
+ butterfly_16x16_idct_1d(temp_in, temp_out);
+ for (j = 0; j < 16; ++j)
+ out2[j*16 + i] = temp_out[j];
+ }
+ for (i = 0; i < 16*16; ++i)
+ output[i] = round(out2[i]/128);
+ }
+ vp9_clear_system_state(); // Make it simd safe : __asm emms;
+}
--- /dev/null
+++ b/vp9/common/implicit_segmentation.c
@@ -1,0 +1,255 @@
+/*
+ * Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "vp9/common/onyxc_int.h"
+
+#define MAX_REGIONS 24000
+#ifndef NULL
+#define NULL 0
+#endif
+
+#define min_mbs_in_region 3
+
+// this linked list structure holds equivalences for connected
+// component labeling
+struct list_el {
+ int label;
+ int seg_value;
+ int count;
+ struct list_el *next;
+};
+typedef struct list_el item;
+
+// connected colorsegments
+typedef struct {
+ int min_x;
+ int min_y;
+ int max_x;
+ int max_y;
+ long long sum_x;
+ long long sum_y;
+ int pixels;
+ int seg_value;
+ int label;
+} segment_info;
+
+
+typedef enum {
+ SEGMENT_MODE,
+ SEGMENT_MV,
+ SEGMENT_REFFRAME,
+ SEGMENT_SKIPPED
+} SEGMENT_TYPE;
+
+
+// this merges the two equivalence lists and
+// then makes sure that every label points to the same
+// equivalence list
+void merge(item *labels, int u, int v) {
+ item *a = labels[u].next;
+ item *b = labels[v].next;
+ item c;
+ item *it = &c;
+ int count;
+
+ // check if they are already merged
+ if (u == v || a == b)
+ return;
+
+ count = a->count + b->count;
+
+ // merge 2 sorted linked lists.
+ while (a != NULL && b != NULL) {
+ if (a->label < b->label) {
+ it->next = a;
+ a = a->next;
+ } else {
+ it->next = b;
+ b = b->next;
+ }
+
+ it = it->next;
+ }
+
+ if (a == NULL)
+ it->next = b;
+ else
+ it->next = a;
+
+ it = c.next;
+
+ // make sure every equivalence in the linked list points to this new ll
+ while (it != NULL) {
+ labels[it->label].next = c.next;
+ it = it->next;
+ }
+ c.next->count = count;
+
+}
+
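+// Two-pass connected-component labelling over the macroblock grid: the first
+// pass assigns provisional labels and records label equivalences via merge();
+// the second pass keeps regions larger than min_mbs_in_region macroblocks,
+// relabels them and gathers per-region statistics.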
+void segment_via_mode_info(VP9_COMMON *oci, int how) {
+ MODE_INFO *mi = oci->mi;
+ int i, j;
+ int mb_index = 0;
+
+ int label = 1;
+ int pitch = oci->mb_cols;
+
+ // holds linked list equivalences
+ // the max should probably be allocated at a higher level in oci
+ item equivalences[MAX_REGIONS];
+ int eq_ptr = 0;
+ item labels[MAX_REGIONS];
+ segment_info segments[MAX_REGIONS];
+ int label_count = 1;
+ int labeling[400 * 300];
+ int *lp = labeling;
+
+ label_count = 1;
+ memset(labels, 0, sizeof(labels));
+ memset(segments, 0, sizeof(segments));
+
+ /* Go through each macroblock first pass labelling */
+ for (i = 0; i < oci->mb_rows; i++, lp += pitch) {
+ for (j = 0; j < oci->mb_cols; j++) {
+ // int above seg_value, left seg_value, this seg_value...
+ int a = -1, l = -1, n = -1;
+
+ // above label, left label
+ int al = -1, ll = -1;
+ if (i) {
+ al = lp[j - pitch];
+ a = labels[al].next->seg_value;
+ }
+ if (j) {
+ ll = lp[j - 1];
+ l = labels[ll].next->seg_value;
+ }
+
+ // what setting are we going to do the implicit segmentation on
+ switch (how) {
+ case SEGMENT_MODE:
+ n = mi[mb_index].mbmi.mode;
+ break;
+ case SEGMENT_MV:
+ n = mi[mb_index].mbmi.mv[0].as_int;
+ if (mi[mb_index].mbmi.ref_frame == INTRA_FRAME)
+ n = -9999999;
+ break;
+ case SEGMENT_REFFRAME:
+ n = mi[mb_index].mbmi.ref_frame;
+ break;
+ case SEGMENT_SKIPPED:
+ n = mi[mb_index].mbmi.mb_skip_coeff;
+ break;
+ }
+
+ // above and left both have the same seg_value
+ if (n == a && n == l) {
+ // pick the lowest label
+ lp[j] = (al < ll ? al : ll);
+ labels[lp[j]].next->count++;
+
+ // merge the above and left equivalencies
+ merge(labels, al, ll);
+ }
+ // this matches above seg_value
+ else if (n == a) {
+ // give it the same label as above
+ lp[j] = al;
+ labels[al].next->count++;
+ }
+ // this matches left seg_value
+ else if (n == l) {
+        // give it the same label as left
+ lp[j] = ll;
+ labels[ll].next->count++;
+ } else {
+ // new label doesn't match either
+ item *e = &labels[label];
+ item *nl = &equivalences[eq_ptr++];
+ lp[j] = label;
+ nl->label = label;
+ nl->next = 0;
+ nl->seg_value = n;
+ nl->count = 1;
+ e->next = nl;
+ label++;
+ }
+ mb_index++;
+ }
+ mb_index++;
+ }
+ lp = labeling;
+
+ // give new labels to regions
+ for (i = 1; i < label; i++)
+ if (labels[i].next->count > min_mbs_in_region && labels[labels[i].next->label].label == 0) {
+ segment_info *cs = &segments[label_count];
+ cs->label = label_count;
+ labels[labels[i].next->label].label = label_count++;
+ labels[labels[i].next->label].seg_value = labels[i].next->seg_value;
+ cs->seg_value = labels[labels[i].next->label].seg_value;
+ cs->min_x = oci->mb_cols;
+ cs->min_y = oci->mb_rows;
+ cs->max_x = 0;
+ cs->max_y = 0;
+ cs->sum_x = 0;
+ cs->sum_y = 0;
+ cs->pixels = 0;
+
+ }
+ lp = labeling;
+
+ // this is just to gather stats...
+ for (i = 0; i < oci->mb_rows; i++, lp += pitch) {
+ for (j = 0; j < oci->mb_cols; j++) {
+ segment_info *cs;
+ int oldlab = labels[lp[j]].next->label;
+ int lab = labels[oldlab].label;
+ lp[j] = lab;
+
+ cs = &segments[lab];
+
+ cs->min_x = (j < cs->min_x ? j : cs->min_x);
+ cs->max_x = (j > cs->max_x ? j : cs->max_x);
+ cs->min_y = (i < cs->min_y ? i : cs->min_y);
+ cs->max_y = (i > cs->max_y ? i : cs->max_y);
+ cs->sum_x += j;
+ cs->sum_y += i;
+ cs->pixels++;
+
+ lp[j] = lab;
+ mb_index++;
+ }
+ mb_index++;
+ }
+
+ {
+ lp = labeling;
+ printf("labelling \n");
+ mb_index = 0;
+ for (i = 0; i < oci->mb_rows; i++, lp += pitch) {
+ for (j = 0; j < oci->mb_cols; j++) {
+ printf("%4d", lp[j]);
+ }
+ printf(" ");
+ for (j = 0; j < oci->mb_cols; j++, mb_index++) {
+ // printf("%3d",mi[mb_index].mbmi.mode );
+ printf("%4d:%4d", mi[mb_index].mbmi.mv[0].as_mv.row,
+ mi[mb_index].mbmi.mv[0].as_mv.col);
+ }
+ printf("\n");
+ ++mb_index;
+ }
+ printf("\n");
+ }
+}
+
--- /dev/null
+++ b/vp9/common/invtrans.c
@@ -1,0 +1,135 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "invtrans.h"
+
+static void recon_dcblock(MACROBLOCKD *xd) {
+ BLOCKD *b = &xd->block[24];
+ int i;
+
+ for (i = 0; i < 16; i++) {
+ xd->block[i].dqcoeff[0] = b->diff[i];
+ }
+}
+
+static void recon_dcblock_8x8(MACROBLOCKD *xd) {
+ BLOCKD *b = &xd->block[24]; // for coeff 0, 2, 8, 10
+
+ xd->block[0].dqcoeff[0] = b->diff[0];
+ xd->block[4].dqcoeff[0] = b->diff[1];
+ xd->block[8].dqcoeff[0] = b->diff[4];
+ xd->block[12].dqcoeff[0] = b->diff[8];
+}
+
+void vp9_inverse_transform_b_4x4(const vp9_idct_rtcd_vtable_t *rtcd,
+ BLOCKD *b, int pitch) {
+ if (b->eob <= 1)
+ IDCT_INVOKE(rtcd, idct1)(b->dqcoeff, b->diff, pitch);
+ else
+ IDCT_INVOKE(rtcd, idct16)(b->dqcoeff, b->diff, pitch);
+}
+
+void vp9_inverse_transform_mby_4x4(const vp9_idct_rtcd_vtable_t *rtcd,
+ MACROBLOCKD *xd) {
+ int i;
+ BLOCKD *blockd = xd->block;
+
+ if (xd->mode_info_context->mbmi.mode != SPLITMV) {
+ /* do 2nd order transform on the dc block */
+ IDCT_INVOKE(rtcd, iwalsh16)(blockd[24].dqcoeff, blockd[24].diff);
+ recon_dcblock(xd);
+ }
+
+ for (i = 0; i < 16; i++) {
+ vp9_inverse_transform_b_4x4(rtcd, &blockd[i], 32);
+ }
+}
+
+void vp9_inverse_transform_mbuv_4x4(const vp9_idct_rtcd_vtable_t *rtcd,
+ MACROBLOCKD *xd) {
+ int i;
+ BLOCKD *blockd = xd->block;
+
+ for (i = 16; i < 24; i++) {
+ vp9_inverse_transform_b_4x4(rtcd, &blockd[i], 16);
+ }
+}
+
+void vp9_inverse_transform_mb_4x4(const vp9_idct_rtcd_vtable_t *rtcd,
+ MACROBLOCKD *xd) {
+ vp9_inverse_transform_mby_4x4(rtcd, xd);
+ vp9_inverse_transform_mbuv_4x4(rtcd, xd);
+}
+
+void vp9_inverse_transform_b_8x8(const vp9_idct_rtcd_vtable_t *rtcd,
+ short *input_dqcoeff, short *output_coeff,
+ int pitch) {
+ // int b,i;
+ // if (b->eob > 1)
+ IDCT_INVOKE(rtcd, idct8)(input_dqcoeff, output_coeff, pitch);
+ // else
+ // IDCT_INVOKE(rtcd, idct8_1)(b->dqcoeff, b->diff, pitch);//pitch
+}
+
+void vp9_inverse_transform_mby_8x8(const vp9_idct_rtcd_vtable_t *rtcd,
+ MACROBLOCKD *xd) {
+ int i;
+ BLOCKD *blockd = xd->block;
+
+ if (xd->mode_info_context->mbmi.mode != SPLITMV) {
+ // do 2nd order transform on the dc block
+ IDCT_INVOKE(rtcd, ihaar2)(blockd[24].dqcoeff, blockd[24].diff, 8);
+ recon_dcblock_8x8(xd); // need to change for 8x8
+ }
+
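+  /* The four 8x8 luma transforms: dequantized coefficients are stored in
+   * blockd[0/4/8/12].dqcoeff while the corresponding diff destinations are
+   * blockd[0/2/8/10].diff, hence the i + 2 offset in the second loop. */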
+ for (i = 0; i < 9; i += 8) {
+ vp9_inverse_transform_b_8x8(rtcd, &blockd[i].dqcoeff[0],
+ &blockd[i].diff[0], 32);
+ }
+ for (i = 2; i < 11; i += 8) {
+ vp9_inverse_transform_b_8x8(rtcd, &blockd[i + 2].dqcoeff[0],
+ &blockd[i].diff[0], 32);
+ }
+}
+
+void vp9_inverse_transform_mbuv_8x8(const vp9_idct_rtcd_vtable_t *rtcd,
+ MACROBLOCKD *xd) {
+ int i;
+ BLOCKD *blockd = xd->block;
+
+ for (i = 16; i < 24; i += 4) {
+ vp9_inverse_transform_b_8x8(rtcd, &blockd[i].dqcoeff[0],
+ &blockd[i].diff[0], 16);
+ }
+}
+
+void vp9_inverse_transform_mb_8x8(const vp9_idct_rtcd_vtable_t *rtcd,
+ MACROBLOCKD *xd) {
+ vp9_inverse_transform_mby_8x8(rtcd, xd);
+ vp9_inverse_transform_mbuv_8x8(rtcd, xd);
+}
+
+void vp9_inverse_transform_b_16x16(const vp9_idct_rtcd_vtable_t *rtcd,
+ short *input_dqcoeff,
+ short *output_coeff, int pitch) {
+ IDCT_INVOKE(rtcd, idct16x16)(input_dqcoeff, output_coeff, pitch);
+}
+
+void vp9_inverse_transform_mby_16x16(const vp9_idct_rtcd_vtable_t *rtcd,
+ MACROBLOCKD *xd) {
+ vp9_inverse_transform_b_16x16(rtcd, &xd->block[0].dqcoeff[0],
+ &xd->block[0].diff[0], 32);
+}
+
+void vp9_inverse_transform_mb_16x16(const vp9_idct_rtcd_vtable_t *rtcd,
+ MACROBLOCKD *xd) {
+ vp9_inverse_transform_mby_16x16(rtcd, xd);
+ vp9_inverse_transform_mbuv_8x8(rtcd, xd);
+}
--- /dev/null
+++ b/vp9/common/invtrans.h
@@ -1,0 +1,53 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef __INC_INVTRANS_H
+#define __INC_INVTRANS_H
+
+#include "vpx_ports/config.h"
+#include "idct.h"
+#include "blockd.h"
+
+extern void vp9_inverse_transform_b_4x4(const vp9_idct_rtcd_vtable_t *rtcd,
+ BLOCKD *b, int pitch);
+
+extern void vp9_inverse_transform_mb_4x4(const vp9_idct_rtcd_vtable_t *rtcd,
+ MACROBLOCKD *xd);
+
+extern void vp9_inverse_transform_mby_4x4(const vp9_idct_rtcd_vtable_t *rtcd,
+ MACROBLOCKD *xd);
+
+extern void vp9_inverse_transform_mbuv_4x4(const vp9_idct_rtcd_vtable_t *rtcd,
+ MACROBLOCKD *xd);
+
+extern void vp9_inverse_transform_b_8x8(const vp9_idct_rtcd_vtable_t *rtcd,
+ short *input_dqcoeff,
+ short *output_coeff, int pitch);
+
+extern void vp9_inverse_transform_mb_8x8(const vp9_idct_rtcd_vtable_t *rtcd,
+ MACROBLOCKD *xd);
+
+extern void vp9_inverse_transform_mby_8x8(const vp9_idct_rtcd_vtable_t *rtcd,
+ MACROBLOCKD *xd);
+
+extern void vp9_inverse_transform_mbuv_8x8(const vp9_idct_rtcd_vtable_t *rtcd,
+ MACROBLOCKD *xd);
+
+extern void vp9_inverse_transform_b_16x16(const vp9_idct_rtcd_vtable_t *rtcd,
+ short *input_dqcoeff,
+ short *output_coeff, int pitch);
+
+extern void vp9_inverse_transform_mb_16x16(const vp9_idct_rtcd_vtable_t *rtcd,
+ MACROBLOCKD *xd);
+
+extern void vp9_inverse_transform_mby_16x16(const vp9_idct_rtcd_vtable_t *rtcd,
+ MACROBLOCKD *xd);
+
+#endif // __INC_INVTRANS_H
--- /dev/null
+++ b/vp9/common/loopfilter.c
@@ -1,0 +1,524 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "vpx_config.h"
+#include "loopfilter.h"
+#include "onyxc_int.h"
+#include "vpx_mem/vpx_mem.h"
+
+#include "vp9/common/seg_common.h"
+
+static void lf_init_lut(loop_filter_info_n *lfi) {
+ int filt_lvl;
+
+ for (filt_lvl = 0; filt_lvl <= MAX_LOOP_FILTER; filt_lvl++) {
+ if (filt_lvl >= 40) {
+ lfi->hev_thr_lut[KEY_FRAME][filt_lvl] = 2;
+ lfi->hev_thr_lut[INTER_FRAME][filt_lvl] = 3;
+ } else if (filt_lvl >= 20) {
+ lfi->hev_thr_lut[KEY_FRAME][filt_lvl] = 1;
+ lfi->hev_thr_lut[INTER_FRAME][filt_lvl] = 2;
+ } else if (filt_lvl >= 15) {
+ lfi->hev_thr_lut[KEY_FRAME][filt_lvl] = 1;
+ lfi->hev_thr_lut[INTER_FRAME][filt_lvl] = 1;
+ } else {
+ lfi->hev_thr_lut[KEY_FRAME][filt_lvl] = 0;
+ lfi->hev_thr_lut[INTER_FRAME][filt_lvl] = 0;
+ }
+ }
+
+ lfi->mode_lf_lut[DC_PRED] = 1;
+ lfi->mode_lf_lut[D45_PRED] = 1;
+ lfi->mode_lf_lut[D135_PRED] = 1;
+ lfi->mode_lf_lut[D117_PRED] = 1;
+ lfi->mode_lf_lut[D153_PRED] = 1;
+ lfi->mode_lf_lut[D27_PRED] = 1;
+ lfi->mode_lf_lut[D63_PRED] = 1;
+ lfi->mode_lf_lut[V_PRED] = 1;
+ lfi->mode_lf_lut[H_PRED] = 1;
+ lfi->mode_lf_lut[TM_PRED] = 1;
+ lfi->mode_lf_lut[B_PRED] = 0;
+ lfi->mode_lf_lut[I8X8_PRED] = 0;
+ lfi->mode_lf_lut[ZEROMV] = 1;
+ lfi->mode_lf_lut[NEARESTMV] = 2;
+ lfi->mode_lf_lut[NEARMV] = 2;
+ lfi->mode_lf_lut[NEWMV] = 2;
+ lfi->mode_lf_lut[SPLITMV] = 3;
+}
+
+void vp9_loop_filter_update_sharpness(loop_filter_info_n *lfi,
+ int sharpness_lvl) {
+ int i;
+
+ /* For each possible value for the loop filter fill out limits */
+ for (i = 0; i <= MAX_LOOP_FILTER; i++) {
+ int filt_lvl = i;
+ int block_inside_limit = 0;
+
+    /* Set loop filter parameters that control sharpness. */
+ block_inside_limit = filt_lvl >> (sharpness_lvl > 0);
+ block_inside_limit = block_inside_limit >> (sharpness_lvl > 4);
+
+ if (sharpness_lvl > 0) {
+ if (block_inside_limit > (9 - sharpness_lvl))
+ block_inside_limit = (9 - sharpness_lvl);
+ }
+
+ if (block_inside_limit < 1)
+ block_inside_limit = 1;
+
+ vpx_memset(lfi->lim[i], block_inside_limit, SIMD_WIDTH);
+ vpx_memset(lfi->blim[i], (2 * filt_lvl + block_inside_limit),
+ SIMD_WIDTH);
+ vpx_memset(lfi->mblim[i], (2 * (filt_lvl + 2) + block_inside_limit),
+ SIMD_WIDTH);
+ }
+}
+
+void vp9_loop_filter_init(VP9_COMMON *cm) {
+ loop_filter_info_n *lfi = &cm->lf_info;
+ int i;
+
+ /* init limits for given sharpness*/
+ vp9_loop_filter_update_sharpness(lfi, cm->sharpness_level);
+ cm->last_sharpness_level = cm->sharpness_level;
+
+ /* init LUT for lvl and hev thr picking */
+ lf_init_lut(lfi);
+
+ /* init hev threshold const vectors */
+ for (i = 0; i < 4; i++) {
+ vpx_memset(lfi->hev_thr[i], i, SIMD_WIDTH);
+ }
+}
+
+void vp9_loop_filter_frame_init(VP9_COMMON *cm,
+ MACROBLOCKD *xd,
+ int default_filt_lvl) {
+ int seg, /* segment number */
+ ref, /* index in ref_lf_deltas */
+ mode; /* index in mode_lf_deltas */
+
+ loop_filter_info_n *lfi = &cm->lf_info;
+
+ /* update limits if sharpness has changed */
+ if (cm->last_sharpness_level != cm->sharpness_level) {
+ vp9_loop_filter_update_sharpness(lfi, cm->sharpness_level);
+ cm->last_sharpness_level = cm->sharpness_level;
+ }
+
+ for (seg = 0; seg < MAX_MB_SEGMENTS; seg++) {
+ int lvl_seg = default_filt_lvl;
+ int lvl_ref, lvl_mode;
+
+
+ // Set the baseline filter values for each segment
+ if (vp9_segfeature_active(xd, seg, SEG_LVL_ALT_LF)) {
+ /* Abs value */
+ if (xd->mb_segment_abs_delta == SEGMENT_ABSDATA) {
+ lvl_seg = vp9_get_segdata(xd, seg, SEG_LVL_ALT_LF);
+ } else { /* Delta Value */
+ lvl_seg += vp9_get_segdata(xd, seg, SEG_LVL_ALT_LF);
+ lvl_seg = (lvl_seg > 0) ? ((lvl_seg > 63) ? 63 : lvl_seg) : 0;
+ }
+ }
+
+ if (!xd->mode_ref_lf_delta_enabled) {
+ /* we could get rid of this if we assume that deltas are set to
+ * zero when not in use; encoder always uses deltas
+ */
+ vpx_memset(lfi->lvl[seg][0], lvl_seg, 4 * 4);
+ continue;
+ }
+
+ lvl_ref = lvl_seg;
+
+ /* INTRA_FRAME */
+ ref = INTRA_FRAME;
+
+ /* Apply delta for reference frame */
+ lvl_ref += xd->ref_lf_deltas[ref];
+
+ /* Apply delta for Intra modes */
+ mode = 0; /* B_PRED */
+ /* Only the split mode BPRED has a further special case */
+ lvl_mode = lvl_ref + xd->mode_lf_deltas[mode];
+ lvl_mode = (lvl_mode > 0) ? (lvl_mode > 63 ? 63 : lvl_mode) : 0; /* clamp */
+
+ lfi->lvl[seg][ref][mode] = lvl_mode;
+
+ mode = 1; /* all the rest of Intra modes */
+ lvl_mode = (lvl_ref > 0) ? (lvl_ref > 63 ? 63 : lvl_ref) : 0; /* clamp */
+ lfi->lvl[seg][ref][mode] = lvl_mode;
+
+ /* LAST, GOLDEN, ALT */
+ for (ref = 1; ref < MAX_REF_FRAMES; ref++) {
+ int lvl_ref = lvl_seg;
+
+ /* Apply delta for reference frame */
+ lvl_ref += xd->ref_lf_deltas[ref];
+
+ /* Apply delta for Inter modes */
+ for (mode = 1; mode < 4; mode++) {
+ lvl_mode = lvl_ref + xd->mode_lf_deltas[mode];
+ lvl_mode = (lvl_mode > 0) ? (lvl_mode > 63 ? 63 : lvl_mode) : 0; /* clamp */
+
+ lfi->lvl[seg][ref][mode] = lvl_mode;
+ }
+ }
+ }
+}
+
+void vp9_loop_filter_frame(VP9_COMMON *cm, MACROBLOCKD *xd) {
+ YV12_BUFFER_CONFIG *post = cm->frame_to_show;
+ loop_filter_info_n *lfi_n = &cm->lf_info;
+ struct loop_filter_info lfi;
+
+ FRAME_TYPE frame_type = cm->frame_type;
+
+ int mb_row;
+ int mb_col;
+
+ int filter_level;
+
+ unsigned char *y_ptr, *u_ptr, *v_ptr;
+
+ /* Point at base of Mb MODE_INFO list */
+ const MODE_INFO *mode_info_context = cm->mi;
+
+ /* Initialize the loop filter for this frame. */
+ vp9_loop_filter_frame_init(cm, xd, cm->filter_level);
+
+ /* Set up the buffer pointers */
+ y_ptr = post->y_buffer;
+ u_ptr = post->u_buffer;
+ v_ptr = post->v_buffer;
+
+ /* vp9_filter each macro block */
+ for (mb_row = 0; mb_row < cm->mb_rows; mb_row++) {
+ for (mb_col = 0; mb_col < cm->mb_cols; mb_col++) {
+ int skip_lf = (mode_info_context->mbmi.mode != B_PRED &&
+ mode_info_context->mbmi.mode != I8X8_PRED &&
+ mode_info_context->mbmi.mode != SPLITMV &&
+ mode_info_context->mbmi.mb_skip_coeff);
+
+ const int mode_index = lfi_n->mode_lf_lut[mode_info_context->mbmi.mode];
+ const int seg = mode_info_context->mbmi.segment_id;
+ const int ref_frame = mode_info_context->mbmi.ref_frame;
+ int tx_type = mode_info_context->mbmi.txfm_size;
+ filter_level = lfi_n->lvl[seg][ref_frame][mode_index];
+
+ if (filter_level) {
+ if (cm->filter_type == NORMAL_LOOPFILTER) {
+ const int hev_index = lfi_n->hev_thr_lut[frame_type][filter_level];
+ lfi.mblim = lfi_n->mblim[filter_level];
+ lfi.blim = lfi_n->blim[filter_level];
+ lfi.lim = lfi_n->lim[filter_level];
+ lfi.hev_thr = lfi_n->hev_thr[hev_index];
+
+ if (mb_col > 0
+#if CONFIG_SUPERBLOCKS
+ && !((mb_col & 1) && mode_info_context->mbmi.encoded_as_sb &&
+ mode_info_context[0].mbmi.mb_skip_coeff &&
+ mode_info_context[-1].mbmi.mb_skip_coeff)
+#endif
+ )
+ vp9_loop_filter_mbv(y_ptr, u_ptr, v_ptr, post->y_stride,
+ post->uv_stride, &lfi);
+
+ if (!skip_lf && tx_type != TX_16X16) {
+ if (tx_type == TX_8X8)
+ vp9_loop_filter_bv8x8(y_ptr, u_ptr, v_ptr, post->y_stride,
+ post->uv_stride, &lfi);
+ else
+ vp9_loop_filter_bv(y_ptr, u_ptr, v_ptr, post->y_stride,
+ post->uv_stride, &lfi);
+
+ }
+
+ /* don't apply across umv border */
+ if (mb_row > 0
+#if CONFIG_SUPERBLOCKS
+ && !((mb_row & 1) && mode_info_context->mbmi.encoded_as_sb &&
+ mode_info_context[0].mbmi.mb_skip_coeff &&
+ mode_info_context[-cm->mode_info_stride].mbmi.mb_skip_coeff)
+#endif
+ )
+ vp9_loop_filter_mbh(y_ptr, u_ptr, v_ptr, post->y_stride,
+ post->uv_stride, &lfi);
+
+ if (!skip_lf && tx_type != TX_16X16) {
+ if (tx_type == TX_8X8)
+ vp9_loop_filter_bh8x8(y_ptr, u_ptr, v_ptr, post->y_stride,
+ post->uv_stride, &lfi);
+ else
+ vp9_loop_filter_bh(y_ptr, u_ptr, v_ptr, post->y_stride,
+ post->uv_stride, &lfi);
+ }
+ } else {
+ // FIXME: Not 8x8 aware
+ if (mb_col > 0
+#if CONFIG_SUPERBLOCKS
+ && !((mb_col & 1) && mode_info_context->mbmi.encoded_as_sb &&
+ mode_info_context[0].mbmi.mb_skip_coeff &&
+ mode_info_context[-1].mbmi.mb_skip_coeff)
+#endif
+ )
+ vp9_loop_filter_simple_mbv(y_ptr, post->y_stride,
+ lfi_n->mblim[filter_level]);
+
+ if (!skip_lf)
+ vp9_loop_filter_simple_bv(y_ptr, post->y_stride,
+ lfi_n->blim[filter_level]);
+
+ /* don't apply across umv border */
+ if (mb_row > 0
+#if CONFIG_SUPERBLOCKS
+ && !((mb_row & 1) && mode_info_context->mbmi.encoded_as_sb &&
+ mode_info_context[0].mbmi.mb_skip_coeff &&
+ mode_info_context[-cm->mode_info_stride].mbmi.mb_skip_coeff)
+#endif
+ )
+ vp9_loop_filter_simple_mbh(y_ptr, post->y_stride,
+ lfi_n->mblim[filter_level]);
+
+ if (!skip_lf)
+ vp9_loop_filter_simple_bh(y_ptr, post->y_stride,
+ lfi_n->blim[filter_level]);
+ }
+ }
+
+ y_ptr += 16;
+ u_ptr += 8;
+ v_ptr += 8;
+
+ mode_info_context++; /* step to next MB */
+ }
+
+ y_ptr += post->y_stride * 16 - post->y_width;
+ u_ptr += post->uv_stride * 8 - post->uv_width;
+ v_ptr += post->uv_stride * 8 - post->uv_width;
+
+ mode_info_context++; /* Skip border mb */
+ }
+}
+
+void vp9_loop_filter_frame_yonly(VP9_COMMON *cm, MACROBLOCKD *xd,
+ int default_filt_lvl) {
+ YV12_BUFFER_CONFIG *post = cm->frame_to_show;
+
+ unsigned char *y_ptr;
+ int mb_row;
+ int mb_col;
+
+ loop_filter_info_n *lfi_n = &cm->lf_info;
+ struct loop_filter_info lfi;
+
+ int filter_level;
+ FRAME_TYPE frame_type = cm->frame_type;
+
+ /* Point at base of Mb MODE_INFO list */
+ const MODE_INFO *mode_info_context = cm->mi;
+
+#if 0
+ if (default_filt_lvl == 0) /* no filter applied */
+ return;
+#endif
+
+ /* Initialize the loop filter for this frame. */
+ vp9_loop_filter_frame_init(cm, xd, default_filt_lvl);
+
+ /* Set up the buffer pointers */
+ y_ptr = post->y_buffer;
+
+ /* vp9_filter each macro block */
+ for (mb_row = 0; mb_row < cm->mb_rows; mb_row++) {
+ for (mb_col = 0; mb_col < cm->mb_cols; mb_col++) {
+ int skip_lf = (mode_info_context->mbmi.mode != B_PRED &&
+ mode_info_context->mbmi.mode != I8X8_PRED &&
+ mode_info_context->mbmi.mode != SPLITMV &&
+ mode_info_context->mbmi.mb_skip_coeff);
+
+ const int mode_index = lfi_n->mode_lf_lut[mode_info_context->mbmi.mode];
+ const int seg = mode_info_context->mbmi.segment_id;
+ const int ref_frame = mode_info_context->mbmi.ref_frame;
+ int tx_type = mode_info_context->mbmi.txfm_size;
+ filter_level = lfi_n->lvl[seg][ref_frame][mode_index];
+
+ if (filter_level) {
+ if (cm->filter_type == NORMAL_LOOPFILTER) {
+ const int hev_index = lfi_n->hev_thr_lut[frame_type][filter_level];
+ lfi.mblim = lfi_n->mblim[filter_level];
+ lfi.blim = lfi_n->blim[filter_level];
+ lfi.lim = lfi_n->lim[filter_level];
+ lfi.hev_thr = lfi_n->hev_thr[hev_index];
+
+ if (mb_col > 0)
+ vp9_loop_filter_mbv(y_ptr, 0, 0, post->y_stride, 0, &lfi);
+
+ if (!skip_lf && tx_type != TX_16X16) {
+ if (tx_type == TX_8X8)
+ vp9_loop_filter_bv8x8(y_ptr, 0, 0, post->y_stride, 0, &lfi);
+ else
+ vp9_loop_filter_bv(y_ptr, 0, 0, post->y_stride, 0, &lfi);
+ }
+
+ /* don't apply across umv border */
+ if (mb_row > 0)
+ vp9_loop_filter_mbh(y_ptr, 0, 0, post->y_stride, 0, &lfi);
+
+ if (!skip_lf && tx_type != TX_16X16) {
+ if (tx_type == TX_8X8)
+ vp9_loop_filter_bh8x8(y_ptr, 0, 0, post->y_stride, 0, &lfi);
+ else
+ vp9_loop_filter_bh(y_ptr, 0, 0, post->y_stride, 0, &lfi);
+ }
+ } else {
+ // FIXME: Not 8x8 aware
+ if (mb_col > 0)
+ vp9_loop_filter_simple_mbv(y_ptr, post->y_stride,
+ lfi_n->mblim[filter_level]);
+
+ if (!skip_lf)
+ vp9_loop_filter_simple_bv(y_ptr, post->y_stride,
+ lfi_n->blim[filter_level]);
+
+ /* don't apply across umv border */
+ if (mb_row > 0)
+ vp9_loop_filter_simple_mbh(y_ptr, post->y_stride,
+ lfi_n->mblim[filter_level]);
+
+ if (!skip_lf)
+ vp9_loop_filter_simple_bh(y_ptr, post->y_stride,
+ lfi_n->blim[filter_level]);
+ }
+ }
+
+ y_ptr += 16;
+ mode_info_context++; /* step to next MB */
+ }
+
+ y_ptr += post->y_stride * 16 - post->y_width;
+ mode_info_context++; /* Skip border mb */
+ }
+}
+
+void vp9_loop_filter_partial_frame(VP9_COMMON *cm, MACROBLOCKD *xd,
+ int default_filt_lvl) {
+ YV12_BUFFER_CONFIG *post = cm->frame_to_show;
+
+ unsigned char *y_ptr;
+ int mb_row;
+ int mb_col;
+ int mb_cols = post->y_width >> 4;
+
+ int linestocopy, i;
+
+ loop_filter_info_n *lfi_n = &cm->lf_info;
+ struct loop_filter_info lfi;
+
+ int filter_level;
+ int alt_flt_enabled = xd->segmentation_enabled;
+ FRAME_TYPE frame_type = cm->frame_type;
+
+ const MODE_INFO *mode_info_context;
+
+ int lvl_seg[MAX_MB_SEGMENTS];
+
+ mode_info_context = cm->mi + (post->y_height >> 5) * (mb_cols + 1);
+
+ /* 3 is a magic number. 4 is probably magic too */
+ linestocopy = (post->y_height >> (4 + 3));
+
+ if (linestocopy < 1)
+ linestocopy = 1;
+
+ linestocopy <<= 4;
+
+ /* Note the baseline filter values for each segment */
+ /* See vp9_loop_filter_frame_init. Rather than call that for each change
+ * to default_filt_lvl, copy the relevant calculation here.
+ */
+ if (alt_flt_enabled) {
+ for (i = 0; i < MAX_MB_SEGMENTS; i++) {
+ /* Abs value */
+ if (xd->mb_segment_abs_delta == SEGMENT_ABSDATA) {
+ lvl_seg[i] = vp9_get_segdata(xd, i, SEG_LVL_ALT_LF);
+ }
+ /* Delta Value */
+ else {
+ lvl_seg[i] = default_filt_lvl +
+ vp9_get_segdata(xd, i, SEG_LVL_ALT_LF);
+ lvl_seg[i] = (lvl_seg[i] > 0) ?
+ ((lvl_seg[i] > 63) ? 63 : lvl_seg[i]) : 0;
+ }
+ }
+ }
+
+ /* Set up the buffer pointers */
+ y_ptr = post->y_buffer + (post->y_height >> 5) * 16 * post->y_stride;
+
+ /* vp9_filter each macro block */
+ for (mb_row = 0; mb_row < (linestocopy >> 4); mb_row++) {
+ for (mb_col = 0; mb_col < mb_cols; mb_col++) {
+ int skip_lf = (mode_info_context->mbmi.mode != B_PRED &&
+ mode_info_context->mbmi.mode != I8X8_PRED &&
+ mode_info_context->mbmi.mode != SPLITMV &&
+ mode_info_context->mbmi.mb_skip_coeff);
+
+ if (alt_flt_enabled)
+ filter_level = lvl_seg[mode_info_context->mbmi.segment_id];
+ else
+ filter_level = default_filt_lvl;
+
+ if (filter_level) {
+ if (cm->filter_type == NORMAL_LOOPFILTER) {
+ const int hev_index = lfi_n->hev_thr_lut[frame_type][filter_level];
+ lfi.mblim = lfi_n->mblim[filter_level];
+ lfi.blim = lfi_n->blim[filter_level];
+ lfi.lim = lfi_n->lim[filter_level];
+ lfi.hev_thr = lfi_n->hev_thr[hev_index];
+
+ if (mb_col > 0)
+ vp9_loop_filter_mbv(y_ptr, 0, 0, post->y_stride, 0, &lfi);
+
+ if (!skip_lf)
+ vp9_loop_filter_bv(y_ptr, 0, 0, post->y_stride, 0, &lfi);
+
+ vp9_loop_filter_mbh(y_ptr, 0, 0, post->y_stride, 0, &lfi);
+
+ if (!skip_lf)
+ vp9_loop_filter_bh(y_ptr, 0, 0, post->y_stride, 0, &lfi);
+ } else {
+ if (mb_col > 0)
+ vp9_loop_filter_simple_mbv (y_ptr, post->y_stride,
+ lfi_n->mblim[filter_level]);
+
+ if (!skip_lf)
+ vp9_loop_filter_simple_bv(y_ptr, post->y_stride,
+ lfi_n->blim[filter_level]);
+
+ vp9_loop_filter_simple_mbh(y_ptr, post->y_stride,
+ lfi_n->mblim[filter_level]);
+
+ if (!skip_lf)
+ vp9_loop_filter_simple_bh(y_ptr, post->y_stride,
+ lfi_n->blim[filter_level]);
+ }
+ }
+
+ y_ptr += 16;
+ mode_info_context += 1; /* step to next MB */
+ }
+
+ y_ptr += post->y_stride * 16 - post->y_width;
+ mode_info_context += 1; /* Skip border mb */
+ }
+}
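
For reference, the per-segment filter level used by vp9_loop_filter_partial_frame above (copied, as its comment notes, from the calculation in vp9_loop_filter_frame_init) is either the segment's absolute value or the default level plus the segment delta, clamped to [0, MAX_LOOP_FILTER]. A minimal standalone sketch of that arithmetic, with illustrative names only:

#include <stdio.h>

#define MAX_LOOP_FILTER 63

/* Illustrative helper mirroring the lvl_seg[] setup above. */
static int seg_filter_level(int base_lvl, int seg_data, int abs_data) {
  int lvl;
  if (abs_data)
    return seg_data;                 /* absolute data overrides the base */
  lvl = base_lvl + seg_data;         /* delta data is added to the base  */
  if (lvl < 0) lvl = 0;              /* ...then clamped to [0, 63]       */
  if (lvl > MAX_LOOP_FILTER) lvl = MAX_LOOP_FILTER;
  return lvl;
}

int main(void) {
  printf("%d\n", seg_filter_level(40, -50, 0));  /* delta, clamps to 0  */
  printf("%d\n", seg_filter_level(40,  30, 0));  /* delta, clamps to 63 */
  printf("%d\n", seg_filter_level(40,  10, 1));  /* absolute value: 10  */
  return 0;
}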
--- /dev/null
+++ b/vp9/common/loopfilter.h
@@ -1,0 +1,104 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef loopfilter_h
+#define loopfilter_h
+
+#include "vpx_ports/mem.h"
+#include "vpx_config.h"
+#include "blockd.h"
+
+#define MAX_LOOP_FILTER 63
+
+typedef enum {
+ NORMAL_LOOPFILTER = 0,
+ SIMPLE_LOOPFILTER = 1
+} LOOPFILTERTYPE;
+
+#if ARCH_ARM
+#define SIMD_WIDTH 1
+#else
+#define SIMD_WIDTH 16
+#endif
+
+/* Need to align this structure so when it is declared and
+ * passed it can be loaded into vector registers.
+ */
+typedef struct {
+ DECLARE_ALIGNED(SIMD_WIDTH, unsigned char,
+ mblim[MAX_LOOP_FILTER + 1][SIMD_WIDTH]);
+ DECLARE_ALIGNED(SIMD_WIDTH, unsigned char,
+ blim[MAX_LOOP_FILTER + 1][SIMD_WIDTH]);
+ DECLARE_ALIGNED(SIMD_WIDTH, unsigned char,
+ lim[MAX_LOOP_FILTER + 1][SIMD_WIDTH]);
+ DECLARE_ALIGNED(SIMD_WIDTH, unsigned char,
+ hev_thr[4][SIMD_WIDTH]);
+ unsigned char lvl[4][4][4];
+ unsigned char hev_thr_lut[2][MAX_LOOP_FILTER + 1];
+ unsigned char mode_lf_lut[MB_MODE_COUNT];
+} loop_filter_info_n;
+
+struct loop_filter_info {
+ const unsigned char *mblim;
+ const unsigned char *blim;
+ const unsigned char *lim;
+ const unsigned char *hev_thr;
+};
+
+#define prototype_loopfilter(sym) \
+ void sym(unsigned char *src, int pitch, const unsigned char *blimit,\
+ const unsigned char *limit, const unsigned char *thresh, int count)
+
+#define prototype_loopfilter_block(sym) \
+ void sym(unsigned char *y, unsigned char *u, unsigned char *v, \
+ int ystride, int uv_stride, struct loop_filter_info *lfi)
+
+#define prototype_simple_loopfilter(sym) \
+ void sym(unsigned char *y, int ystride, const unsigned char *blimit)
+
+#if ARCH_X86 || ARCH_X86_64
+#include "x86/loopfilter_x86.h"
+#endif
+
+#if ARCH_ARM
+#include "arm/loopfilter_arm.h"
+#endif
+
+typedef void loop_filter_uvfunction(unsigned char *u, /* source pointer */
+ int p, /* pitch */
+ const unsigned char *blimit,
+ const unsigned char *limit,
+ const unsigned char *thresh,
+ unsigned char *v);
+
+/* assorted loopfilter functions which get used elsewhere */
+struct VP9Common;
+struct macroblockd;
+
+void vp9_loop_filter_init(struct VP9Common *cm);
+
+void vp9_loop_filter_frame_init(struct VP9Common *cm,
+ struct macroblockd *mbd,
+ int default_filt_lvl);
+
+void vp9_loop_filter_frame(struct VP9Common *cm, struct macroblockd *mbd);
+
+void vp9_loop_filter_partial_frame(struct VP9Common *cm,
+ struct macroblockd *mbd,
+ int default_filt_lvl);
+
+void vp9_loop_filter_frame_yonly(struct VP9Common *cm,
+ struct macroblockd *mbd,
+ int default_filt_lvl);
+
+void vp9_loop_filter_update_sharpness(loop_filter_info_n *lfi,
+ int sharpness_lvl);
+
+#endif // loopfilter_h
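
Each limit in loop_filter_info_n above is stored as a full SIMD_WIDTH-byte row so that a single aligned load gives every vector lane the same threshold; the per-macroblock struct loop_filter_info then simply points into the rows selected by the current filter level. A rough sketch of that selection, assuming the declarations above are in scope (it mirrors the per-MB setup in the frame loops, but is not itself part of the tree):

/* Illustrative only: pick the threshold rows for one macroblock. */
static void pick_lfi(const loop_filter_info_n *lfi_n,
                     struct loop_filter_info *lfi,
                     int frame_type, int filter_level) {
  const int hev_index = lfi_n->hev_thr_lut[frame_type][filter_level];
  lfi->mblim   = lfi_n->mblim[filter_level];  /* macroblock-edge limit  */
  lfi->blim    = lfi_n->blim[filter_level];   /* block-edge limit       */
  lfi->lim     = lfi_n->lim[filter_level];    /* interior limit         */
  lfi->hev_thr = lfi_n->hev_thr[hev_index];   /* high-edge-variance thr */
}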
--- /dev/null
+++ b/vp9/common/loopfilter_filters.c
@@ -1,0 +1,480 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <stdlib.h>
+#include "vpx_config.h"
+#include "loopfilter.h"
+#include "onyxc_int.h"
+
+typedef unsigned char uc;
+
+static __inline signed char signed_char_clamp(int t) {
+ t = (t < -128 ? -128 : t);
+ t = (t > 127 ? 127 : t);
+ return (signed char) t;
+}
+
+
+/* should we apply any filter at all ( 11111111 yes, 00000000 no) */
+static __inline signed char filter_mask(uc limit, uc blimit,
+ uc p3, uc p2, uc p1, uc p0,
+ uc q0, uc q1, uc q2, uc q3) {
+ signed char mask = 0;
+ mask |= (abs(p3 - p2) > limit) * -1;
+ mask |= (abs(p2 - p1) > limit) * -1;
+ mask |= (abs(p1 - p0) > limit) * -1;
+ mask |= (abs(q1 - q0) > limit) * -1;
+ mask |= (abs(q2 - q1) > limit) * -1;
+ mask |= (abs(q3 - q2) > limit) * -1;
+ mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1;
+ mask = ~mask;
+ return mask;
+}
+
+/* is there a high-variance internal edge ( 11111111 yes, 00000000 no) */
+static __inline signed char hevmask(uc thresh, uc p1, uc p0, uc q0, uc q1) {
+ signed char hev = 0;
+ hev |= (abs(p1 - p0) > thresh) * -1;
+ hev |= (abs(q1 - q0) > thresh) * -1;
+ return hev;
+}
+
+static __inline void filter(signed char mask, uc hev, uc *op1,
+                            uc *op0, uc *oq0, uc *oq1) {
+ signed char ps0, qs0;
+ signed char ps1, qs1;
+ signed char filter, Filter1, Filter2;
+ signed char u;
+
+ ps1 = (signed char) * op1 ^ 0x80;
+ ps0 = (signed char) * op0 ^ 0x80;
+ qs0 = (signed char) * oq0 ^ 0x80;
+ qs1 = (signed char) * oq1 ^ 0x80;
+
+ /* add outer taps if we have high edge variance */
+ filter = signed_char_clamp(ps1 - qs1);
+ filter &= hev;
+
+ /* inner taps */
+ filter = signed_char_clamp(filter + 3 * (qs0 - ps0));
+ filter &= mask;
+
+  /* save bottom 3 bits so that we round one side +4 and the other +3;
+   * if it equals 4 we'll set it to adjust by -1 to account for the fact
+   * that we'd round 3 the other way
+   */
+ Filter1 = signed_char_clamp(filter + 4);
+ Filter2 = signed_char_clamp(filter + 3);
+ Filter1 >>= 3;
+ Filter2 >>= 3;
+ u = signed_char_clamp(qs0 - Filter1);
+ *oq0 = u ^ 0x80;
+ u = signed_char_clamp(ps0 + Filter2);
+ *op0 = u ^ 0x80;
+ filter = Filter1;
+
+ /* outer tap adjustments */
+ filter += 1;
+ filter >>= 1;
+ filter &= ~hev;
+
+ u = signed_char_clamp(qs1 - filter);
+ *oq1 = u ^ 0x80;
+ u = signed_char_clamp(ps1 + filter);
+ *op1 = u ^ 0x80;
+
+}
+
+void vp9_loop_filter_horizontal_edge_c
+(
+ unsigned char *s,
+ int p, /* pitch */
+ const unsigned char *blimit,
+ const unsigned char *limit,
+ const unsigned char *thresh,
+ int count
+) {
+ int hev = 0; /* high edge variance */
+ signed char mask = 0;
+ int i = 0;
+
+ /* loop filter designed to work using chars so that we can make maximum use
+ * of 8 bit simd instructions.
+ */
+ do {
+ mask = filter_mask(limit[0], blimit[0],
+ s[-4 * p], s[-3 * p], s[-2 * p], s[-1 * p],
+ s[0 * p], s[1 * p], s[2 * p], s[3 * p]);
+
+ hev = hevmask(thresh[0], s[-2 * p], s[-1 * p], s[0 * p], s[1 * p]);
+
+ filter(mask, hev, s - 2 * p, s - 1 * p, s, s + 1 * p);
+
+ ++s;
+ } while (++i < count * 8);
+}
+
+void vp9_loop_filter_vertical_edge_c(unsigned char *s,
+ int p,
+ const unsigned char *blimit,
+ const unsigned char *limit,
+ const unsigned char *thresh,
+ int count) {
+ int hev = 0; /* high edge variance */
+ signed char mask = 0;
+ int i = 0;
+
+ /* loop filter designed to work using chars so that we can make maximum use
+ * of 8 bit simd instructions.
+ */
+ do {
+ mask = filter_mask(limit[0], blimit[0],
+ s[-4], s[-3], s[-2], s[-1],
+ s[0], s[1], s[2], s[3]);
+
+ hev = hevmask(thresh[0], s[-2], s[-1], s[0], s[1]);
+
+ filter(mask, hev, s - 2, s - 1, s, s + 1);
+
+ s += p;
+ } while (++i < count * 8);
+}
+static __inline signed char flatmask(uc thresh,
+ uc p4, uc p3, uc p2, uc p1, uc p0,
+ uc q0, uc q1, uc q2, uc q3, uc q4) {
+ signed char flat = 0;
+ flat |= (abs(p1 - p0) > 1) * -1;
+ flat |= (abs(q1 - q0) > 1) * -1;
+ flat |= (abs(p0 - p2) > 1) * -1;
+ flat |= (abs(q0 - q2) > 1) * -1;
+ flat |= (abs(p3 - p0) > 1) * -1;
+ flat |= (abs(q3 - q0) > 1) * -1;
+ flat |= (abs(p4 - p0) > 1) * -1;
+ flat |= (abs(q4 - q0) > 1) * -1;
+ flat = ~flat;
+ return flat;
+}
+
+static __inline void mbfilter(signed char mask, uc hev, uc flat,
+ uc *op4, uc *op3, uc *op2, uc *op1, uc *op0,
+ uc *oq0, uc *oq1, uc *oq2, uc *oq3, uc *oq4) {
+ /* use a 7 tap filter [1, 1, 1, 2, 1, 1, 1] for flat line */
+ if (flat && mask) {
+ unsigned char p0, q0;
+ unsigned char p1, q1;
+ unsigned char p2, q2;
+ unsigned char p3, q3;
+ unsigned char p4, q4;
+
+ p4 = *op4;
+ p3 = *op3;
+ p2 = *op2;
+ p1 = *op1;
+ p0 = *op0;
+ q0 = *oq0;
+ q1 = *oq1;
+ q2 = *oq2;
+ q3 = *oq3;
+ q4 = *oq4;
+
+ *op2 = (p4 + p4 + p3 + p2 + p2 + p1 + p0 + q0 + 4) >> 3;
+ *op1 = (p4 + p3 + p2 + p1 + p1 + p0 + q0 + q1 + 4) >> 3;
+ *op0 = (p3 + p2 + p1 + p0 + p0 + q0 + q1 + q2 + 4) >> 3;
+ *oq0 = (p2 + p1 + p0 + q0 + q0 + q1 + q2 + q3 + 4) >> 3;
+ *oq1 = (p1 + p0 + q0 + q1 + q1 + q2 + q3 + q4 + 4) >> 3;
+ *oq2 = (p0 + q0 + q1 + q2 + q2 + q3 + q4 + q4 + 4) >> 3;
+ } else {
+ signed char ps0, qs0;
+ signed char ps1, qs1;
+ signed char filter, Filter1, Filter2;
+ signed char u;
+
+ ps1 = (signed char) * op1 ^ 0x80;
+ ps0 = (signed char) * op0 ^ 0x80;
+ qs0 = (signed char) * oq0 ^ 0x80;
+ qs1 = (signed char) * oq1 ^ 0x80;
+
+ /* add outer taps if we have high edge variance */
+ filter = signed_char_clamp(ps1 - qs1);
+ filter &= hev;
+
+ /* inner taps */
+ filter = signed_char_clamp(filter + 3 * (qs0 - ps0));
+ filter &= mask;
+
+ Filter1 = signed_char_clamp(filter + 4);
+ Filter2 = signed_char_clamp(filter + 3);
+ Filter1 >>= 3;
+ Filter2 >>= 3;
+
+ u = signed_char_clamp(qs0 - Filter1);
+ *oq0 = u ^ 0x80;
+ u = signed_char_clamp(ps0 + Filter2);
+ *op0 = u ^ 0x80;
+ filter = Filter1;
+
+ /* outer tap adjustments */
+ filter += 1;
+ filter >>= 1;
+ filter &= ~hev;
+
+ u = signed_char_clamp(qs1 - filter);
+ *oq1 = u ^ 0x80;
+ u = signed_char_clamp(ps1 + filter);
+ *op1 = u ^ 0x80;
+ }
+}
+void vp9_mbloop_filter_horizontal_edge_c
+(
+ unsigned char *s,
+ int p,
+ const unsigned char *blimit,
+ const unsigned char *limit,
+ const unsigned char *thresh,
+ int count
+) {
+ signed char hev = 0; /* high edge variance */
+ signed char mask = 0;
+ signed char flat = 0;
+ int i = 0;
+
+ /* loop filter designed to work using chars so that we can make maximum use
+ * of 8 bit simd instructions.
+ */
+ do {
+
+ mask = filter_mask(limit[0], blimit[0],
+ s[-4 * p], s[-3 * p], s[-2 * p], s[-1 * p],
+ s[ 0 * p], s[ 1 * p], s[ 2 * p], s[ 3 * p]);
+
+ hev = hevmask(thresh[0], s[-2 * p], s[-1 * p], s[0 * p], s[1 * p]);
+
+ flat = flatmask(thresh[0],
+ s[-5 * p], s[-4 * p], s[-3 * p], s[-2 * p], s[-1 * p],
+ s[ 0 * p], s[ 1 * p], s[ 2 * p], s[ 3 * p], s[ 4 * p]);
+ mbfilter(mask, hev, flat,
+ s - 5 * p, s - 4 * p, s - 3 * p, s - 2 * p, s - 1 * p,
+ s, s + 1 * p, s + 2 * p, s + 3 * p, s + 4 * p);
+
+ ++s;
+ } while (++i < count * 8);
+
+}
+void vp9_mbloop_filter_vertical_edge_c
+(
+ unsigned char *s,
+ int p,
+ const unsigned char *blimit,
+ const unsigned char *limit,
+ const unsigned char *thresh,
+ int count
+) {
+ signed char hev = 0; /* high edge variance */
+ signed char mask = 0;
+ signed char flat = 0;
+ int i = 0;
+
+ do {
+
+ mask = filter_mask(limit[0], blimit[0],
+ s[-4], s[-3], s[-2], s[-1],
+ s[0], s[1], s[2], s[3]);
+
+ hev = hevmask(thresh[0], s[-2], s[-1], s[0], s[1]);
+ flat = flatmask(thresh[0],
+ s[-5], s[-4], s[-3], s[-2], s[-1],
+ s[ 0], s[ 1], s[ 2], s[ 3], s[ 4]);
+ mbfilter(mask, hev, flat,
+ s - 5, s - 4, s - 3, s - 2, s - 1,
+ s, s + 1, s + 2, s + 3, s + 4);
+ s += p;
+ } while (++i < count * 8);
+
+}
+
+/* should we apply any filter at all ( 11111111 yes, 00000000 no) */
+static __inline signed char simple_filter_mask(uc blimit,
+ uc p1, uc p0,
+ uc q0, uc q1) {
+ /* Why does this cause problems for win32?
+ * error C2143: syntax error : missing ';' before 'type'
+ * (void) limit;
+ */
+ signed char mask = (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 <= blimit) * -1;
+ return mask;
+}
+
+static __inline void simple_filter(signed char mask,
+ uc *op1, uc *op0,
+ uc *oq0, uc *oq1) {
+ signed char filter, Filter1, Filter2;
+ signed char p1 = (signed char) * op1 ^ 0x80;
+ signed char p0 = (signed char) * op0 ^ 0x80;
+ signed char q0 = (signed char) * oq0 ^ 0x80;
+ signed char q1 = (signed char) * oq1 ^ 0x80;
+ signed char u;
+
+ filter = signed_char_clamp(p1 - q1);
+ filter = signed_char_clamp(filter + 3 * (q0 - p0));
+ filter &= mask;
+
+ /* save bottom 3 bits so that we round one side +4 and the other +3 */
+ Filter1 = signed_char_clamp(filter + 4);
+ Filter1 >>= 3;
+ u = signed_char_clamp(q0 - Filter1);
+ *oq0 = u ^ 0x80;
+
+ Filter2 = signed_char_clamp(filter + 3);
+ Filter2 >>= 3;
+ u = signed_char_clamp(p0 + Filter2);
+ *op0 = u ^ 0x80;
+}
+
+void vp9_loop_filter_simple_horizontal_edge_c
+(
+ unsigned char *s,
+ int p,
+ const unsigned char *blimit
+) {
+ signed char mask = 0;
+ int i = 0;
+
+ do {
+ mask = simple_filter_mask(blimit[0],
+ s[-2 * p], s[-1 * p],
+ s[0 * p], s[1 * p]);
+ simple_filter(mask,
+ s - 2 * p, s - 1 * p,
+ s, s + 1 * p);
+ ++s;
+ } while (++i < 16);
+}
+
+void vp9_loop_filter_simple_vertical_edge_c
+(
+ unsigned char *s,
+ int p,
+ const unsigned char *blimit
+) {
+ signed char mask = 0;
+ int i = 0;
+
+ do {
+ mask = simple_filter_mask(blimit[0], s[-2], s[-1], s[0], s[1]);
+ simple_filter(mask, s - 2, s - 1, s, s + 1);
+ s += p;
+ } while (++i < 16);
+
+}
+
+/* Vertical MB Filtering */
+void vp9_loop_filter_mbv_c(unsigned char *y_ptr, unsigned char *u_ptr,
+ unsigned char *v_ptr, int y_stride, int uv_stride,
+ struct loop_filter_info *lfi) {
+ vp9_mbloop_filter_vertical_edge_c(y_ptr, y_stride,
+ lfi->mblim, lfi->lim, lfi->hev_thr, 2);
+
+ if (u_ptr)
+ vp9_mbloop_filter_vertical_edge_c(u_ptr, uv_stride,
+ lfi->mblim, lfi->lim, lfi->hev_thr, 1);
+
+ if (v_ptr)
+ vp9_mbloop_filter_vertical_edge_c(v_ptr, uv_stride,
+ lfi->mblim, lfi->lim, lfi->hev_thr, 1);
+}
+
+/* Vertical B Filtering */
+void vp9_loop_filter_bv_c(unsigned char *y_ptr, unsigned char *u_ptr,
+ unsigned char *v_ptr, int y_stride, int uv_stride,
+ struct loop_filter_info *lfi) {
+ vp9_loop_filter_vertical_edge_c(y_ptr + 4, y_stride,
+ lfi->blim, lfi->lim, lfi->hev_thr, 2);
+ vp9_loop_filter_vertical_edge_c(y_ptr + 8, y_stride,
+ lfi->blim, lfi->lim, lfi->hev_thr, 2);
+ vp9_loop_filter_vertical_edge_c(y_ptr + 12, y_stride,
+ lfi->blim, lfi->lim, lfi->hev_thr, 2);
+
+ if (u_ptr)
+ vp9_loop_filter_vertical_edge_c(u_ptr + 4, uv_stride,
+ lfi->blim, lfi->lim, lfi->hev_thr, 1);
+
+ if (v_ptr)
+ vp9_loop_filter_vertical_edge_c(v_ptr + 4, uv_stride,
+ lfi->blim, lfi->lim, lfi->hev_thr, 1);
+}
+
+/* Horizontal MB filtering */
+void vp9_loop_filter_mbh_c(unsigned char *y_ptr, unsigned char *u_ptr,
+ unsigned char *v_ptr, int y_stride, int uv_stride,
+ struct loop_filter_info *lfi) {
+ vp9_mbloop_filter_horizontal_edge_c(y_ptr, y_stride,
+ lfi->mblim, lfi->lim, lfi->hev_thr, 2);
+
+ if (u_ptr)
+ vp9_mbloop_filter_horizontal_edge_c(u_ptr, uv_stride,
+ lfi->mblim, lfi->lim, lfi->hev_thr, 1);
+
+ if (v_ptr)
+ vp9_mbloop_filter_horizontal_edge_c(v_ptr, uv_stride,
+ lfi->mblim, lfi->lim, lfi->hev_thr, 1);
+}
+
+/* Horizontal B Filtering */
+void vp9_loop_filter_bh_c(unsigned char *y_ptr, unsigned char *u_ptr,
+ unsigned char *v_ptr, int y_stride, int uv_stride,
+ struct loop_filter_info *lfi) {
+ vp9_loop_filter_horizontal_edge_c(y_ptr + 4 * y_stride, y_stride,
+ lfi->blim, lfi->lim, lfi->hev_thr, 2);
+ vp9_loop_filter_horizontal_edge_c(y_ptr + 8 * y_stride, y_stride,
+ lfi->blim, lfi->lim, lfi->hev_thr, 2);
+ vp9_loop_filter_horizontal_edge_c(y_ptr + 12 * y_stride, y_stride,
+ lfi->blim, lfi->lim, lfi->hev_thr, 2);
+
+ if (u_ptr)
+ vp9_loop_filter_horizontal_edge_c(u_ptr + 4 * uv_stride, uv_stride,
+ lfi->blim, lfi->lim, lfi->hev_thr, 1);
+
+ if (v_ptr)
+ vp9_loop_filter_horizontal_edge_c(v_ptr + 4 * uv_stride, uv_stride,
+ lfi->blim, lfi->lim, lfi->hev_thr, 1);
+}
+
+void vp9_loop_filter_bh8x8_c(unsigned char *y_ptr, unsigned char *u_ptr,
+ unsigned char *v_ptr, int y_stride, int uv_stride,
+ struct loop_filter_info *lfi) {
+ vp9_mbloop_filter_horizontal_edge_c(
+ y_ptr + 8 * y_stride, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
+}
+
+void vp9_loop_filter_bhs_c(unsigned char *y_ptr, int y_stride,
+ const unsigned char *blimit) {
+ vp9_loop_filter_simple_horizontal_edge_c(y_ptr + 4 * y_stride,
+ y_stride, blimit);
+ vp9_loop_filter_simple_horizontal_edge_c(y_ptr + 8 * y_stride,
+ y_stride, blimit);
+ vp9_loop_filter_simple_horizontal_edge_c(y_ptr + 12 * y_stride,
+ y_stride, blimit);
+}
+
+void vp9_loop_filter_bv8x8_c(unsigned char *y_ptr, unsigned char *u_ptr,
+ unsigned char *v_ptr, int y_stride, int uv_stride,
+ struct loop_filter_info *lfi) {
+ vp9_mbloop_filter_vertical_edge_c(
+ y_ptr + 8, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
+}
+
+void vp9_loop_filter_bvs_c(unsigned char *y_ptr, int y_stride,
+ const unsigned char *blimit) {
+ vp9_loop_filter_simple_vertical_edge_c(y_ptr + 4, y_stride, blimit);
+ vp9_loop_filter_simple_vertical_edge_c(y_ptr + 8, y_stride, blimit);
+ vp9_loop_filter_simple_vertical_edge_c(y_ptr + 12, y_stride, blimit);
+}
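
The Filter1/Filter2 split in filter() and simple_filter() above rounds the shared adjustment filter/8 with +4 on the q0 side and +3 on the p0 side, so that when the remainder is exactly half (4/8) one side rounds up while the other rounds down instead of both moving the same way. A small standalone check of that arithmetic (not part of the tree):

#include <stdio.h>

/* Same saturating clamp as signed_char_clamp() above. */
static signed char clamp_sc(int t) {
  if (t < -128) t = -128;
  if (t > 127) t = 127;
  return (signed char)t;
}

int main(void) {
  int f;
  for (f = 0; f <= 8; f++) {
    int q_step = clamp_sc(f + 4) >> 3;   /* subtracted from q0 */
    int p_step = clamp_sc(f + 3) >> 3;   /* added to p0        */
    printf("filter=%d  q0 -= %d  p0 += %d\n", f, q_step, p_step);
  }
  return 0;
}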
--- /dev/null
+++ b/vp9/common/maskingmv.c
@@ -1,0 +1,806 @@
+/*
+ ============================================================================
+ Name : maskingmv.c
+ Author : jimbankoski
+ Version :
+ Copyright : Your copyright notice
+ Description : Masked motion search experiment over raw YUV frames
+ ============================================================================
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+extern unsigned int vp9_sad16x16_sse3(
+ unsigned char *src_ptr,
+ int src_stride,
+ unsigned char *ref_ptr,
+ int ref_stride,
+ int max_err);
+
+extern void vp9_sad16x16x3_sse3(
+ unsigned char *src_ptr,
+ int src_stride,
+ unsigned char *ref_ptr,
+ int ref_stride,
+ int *results);
+
+extern int vp8_growmaskmb_sse3(
+ unsigned char *om,
+ unsigned char *nm);
+
+extern void vp8_makemask_sse3(
+ unsigned char *y,
+ unsigned char *u,
+ unsigned char *v,
+ unsigned char *ym,
+ int yp,
+ int uvp,
+ int ys,
+ int us,
+ int vs,
+ int yt,
+ int ut,
+ int vt);
+
+unsigned int vp9_sad16x16_unmasked_wmt(
+ unsigned char *src_ptr,
+ int src_stride,
+ unsigned char *ref_ptr,
+ int ref_stride,
+ unsigned char *mask);
+
+unsigned int vp9_sad16x16_masked_wmt(
+ unsigned char *src_ptr,
+ int src_stride,
+ unsigned char *ref_ptr,
+ int ref_stride,
+ unsigned char *mask);
+
+unsigned int vp8_masked_predictor_wmt(
+ unsigned char *masked,
+ unsigned char *unmasked,
+ int src_stride,
+ unsigned char *dst_ptr,
+ int dst_stride,
+ unsigned char *mask);
+unsigned int vp8_masked_predictor_uv_wmt(
+ unsigned char *masked,
+ unsigned char *unmasked,
+ int src_stride,
+ unsigned char *dst_ptr,
+ int dst_stride,
+ unsigned char *mask);
+unsigned int vp8_uv_from_y_mask(
+ unsigned char *ymask,
+ unsigned char *uvmask);
+int yp = 16;
+unsigned char sxy[] = {
+ 40, 40, 40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 80, 120, 120, 90, 90, 90, 90, 90, 80, 120, 120, 90, 90, 90, 90, 90,
+ 40, 40, 40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 80, 120, 120, 90, 90, 90, 90, 90, 80, 120, 120, 90, 90, 90, 90, 90,
+ 40, 40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 40, 80, 120, 120, 90, 90, 90, 90, 90, 80, 120, 120, 90, 90, 90, 90, 90,
+ 40, 40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 40, 80, 120, 120, 90, 90, 90, 90, 90, 80, 120, 120, 90, 90, 90, 90, 90,
+ 40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 40, 40, 80, 120, 120, 90, 90, 90, 90, 90, 80, 120, 120, 90, 90, 90, 90, 90,
+ 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 40, 40, 40, 80, 120, 120, 90, 90, 90, 90, 90, 80, 120, 120, 90, 90, 90, 90, 90,
+ 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 40, 40, 40, 80, 120, 120, 90, 90, 90, 90, 90, 80, 120, 120, 90, 90, 90, 90, 90,
+ 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 40, 40, 40, 80, 120, 120, 90, 90, 90, 90, 90, 80, 120, 120, 90, 90, 90, 90, 90,
+ 40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 40, 40, 80, 120, 120, 90, 90, 90, 90, 90, 80, 120, 120, 90, 90, 90, 90, 90,
+ 40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 40, 40, 80, 120, 120, 90, 90, 90, 90, 90, 80, 120, 120, 90, 90, 90, 90, 90,
+ 40, 40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 40, 80, 120, 120, 90, 90, 90, 90, 90, 80, 120, 120, 90, 90, 90, 90, 90,
+ 40, 40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 40, 80, 120, 120, 90, 90, 90, 90, 90, 80, 120, 120, 90, 90, 90, 90, 90,
+ 40, 40, 40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 80, 120, 120, 90, 90, 90, 90, 90, 80, 120, 120, 90, 90, 90, 90, 90,
+ 40, 40, 40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 80, 120, 120, 90, 90, 90, 90, 90, 80, 120, 120, 90, 90, 90, 90, 90,
+ 40, 40, 40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 80, 120, 120, 90, 90, 90, 90, 90, 80, 120, 120, 90, 90, 90, 90, 90,
+ 40, 40, 40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 80, 120, 120, 90, 90, 90, 90, 90, 80, 120, 120, 90, 90, 90, 90, 90
+};
+
+unsigned char sts[] = {
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+};
+unsigned char str[] = {
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
+};
+
+unsigned char y[] = {
+ 40, 40, 40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40,
+ 40, 40, 40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40,
+ 40, 40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 40,
+ 40, 40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 40,
+ 40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 40, 40,
+ 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 40, 40, 40,
+ 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 40, 40, 40,
+ 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 40, 40, 40,
+ 40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 40, 40,
+ 40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 40, 40,
+ 40, 40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 40,
+ 40, 40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 40,
+ 40, 40, 40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40,
+ 40, 40, 40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40,
+ 40, 40, 40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40,
+ 40, 40, 40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40
+};
+int uvp = 8;
+unsigned char u[] = {
+ 90, 80, 70, 70, 90, 90, 90, 17,
+ 90, 80, 70, 70, 90, 90, 90, 17,
+ 84, 70, 70, 90, 90, 90, 17, 17,
+ 84, 70, 70, 90, 90, 90, 17, 17,
+ 80, 70, 70, 90, 90, 90, 17, 17,
+ 90, 80, 70, 70, 90, 90, 90, 17,
+ 90, 80, 70, 70, 90, 90, 90, 17,
+ 90, 80, 70, 70, 90, 90, 90, 17
+};
+
+unsigned char v[] = {
+ 80, 80, 80, 80, 80, 80, 80, 80,
+ 80, 80, 80, 80, 80, 80, 80, 80,
+ 80, 80, 80, 80, 80, 80, 80, 80,
+ 80, 80, 80, 80, 80, 80, 80, 80,
+ 80, 80, 80, 80, 80, 80, 80, 80,
+ 80, 80, 80, 80, 80, 80, 80, 80,
+ 80, 80, 80, 80, 80, 80, 80, 80,
+ 80, 80, 80, 80, 80, 80, 80, 80
+};
+
+unsigned char ym[256];
+unsigned char uvm[64];
+typedef struct {
+ unsigned char y;
+ unsigned char yt;
+ unsigned char u;
+ unsigned char ut;
+ unsigned char v;
+ unsigned char vt;
+ unsigned char use;
+} COLOR_SEG_ELEMENT;
+
+/*
+COLOR_SEG_ELEMENT segmentation[]=
+{
+ { 60,4,80,17,80,10, 1},
+ { 40,4,15,10,80,10, 1},
+};
+*/
+
+COLOR_SEG_ELEMENT segmentation[] = {
+ { 79, 44, 92, 44, 237, 60, 1},
+};
+
+unsigned char pixel_mask(unsigned char y, unsigned char u, unsigned char v,
+ COLOR_SEG_ELEMENT sgm[],
+ int c) {
+ COLOR_SEG_ELEMENT *s = sgm;
+ unsigned char m = 0;
+ int i;
+ for (i = 0; i < c; i++, s++)
+ m |= (abs(y - s->y) < s->yt &&
+ abs(u - s->u) < s->ut &&
+ abs(v - s->v) < s->vt ? 255 : 0);
+
+ return m;
+}
+int neighbors[256][8];
+int makeneighbors(void) {
+ int i, j;
+ for (i = 0; i < 256; i++) {
+ int r = (i >> 4), c = (i & 15);
+ int ni = 0;
+ for (j = 0; j < 8; j++)
+ neighbors[i][j] = i;
+ for (j = 0; j < 256; j++) {
+ int nr = (j >> 4), nc = (j & 15);
+ if (abs(nr - r) < 2 && abs(nc - c) < 2)
+ neighbors[i][ni++] = j;
+ }
+ }
+ return 0;
+}
+void grow_ymask(unsigned char *ym) {
+ unsigned char nym[256];
+ int i, j;
+
+ for (i = 0; i < 256; i++) {
+ nym[i] = ym[i];
+ for (j = 0; j < 8; j++) {
+ nym[i] |= ym[neighbors[i][j]];
+ }
+ }
+ for (i = 0; i < 256; i++)
+ ym[i] = nym[i];
+}
+void make_mb_mask(unsigned char *y, unsigned char *u, unsigned char *v,
+ unsigned char *ym, unsigned char *uvm,
+ int yp, int uvp,
+ COLOR_SEG_ELEMENT sgm[],
+ int count) {
+ int r, c;
+ unsigned char *oym = ym;
+
+ memset(ym, 20, 256);
+ for (r = 0; r < 8; r++, uvm += 8, u += uvp, v += uvp, y += (yp + yp), ym += 32)
+ for (c = 0; c < 8; c++) {
+ int y1 = y[c << 1];
+ int u1 = u[c];
+ int v1 = v[c];
+ int m = pixel_mask(y1, u1, v1, sgm, count);
+ uvm[c] = m;
+ ym[c << 1] = uvm[c]; // = pixel_mask(y[c<<1],u[c],v[c],sgm,count);
+ ym[(c << 1) + 1] = pixel_mask(y[1 + (c << 1)], u[c], v[c], sgm, count);
+ ym[(c << 1) + 16] = pixel_mask(y[yp + (c << 1)], u[c], v[c], sgm, count);
+ ym[(c << 1) + 17] = pixel_mask(y[1 + yp + (c << 1)], u[c], v[c], sgm, count);
+ }
+ grow_ymask(oym);
+}
+
+int masked_sad(unsigned char *src, int p, unsigned char *dst, int dp,
+ unsigned char *ym) {
+ int i, j;
+ unsigned sad = 0;
+ for (i = 0; i < 16; i++, src += p, dst += dp, ym += 16)
+ for (j = 0; j < 16; j++)
+ if (ym[j])
+ sad += abs(src[j] - dst[j]);
+
+ return sad;
+}
+
+int compare_masks(unsigned char *sym, unsigned char *ym) {
+ int i, j;
+ unsigned sad = 0;
+ for (i = 0; i < 16; i++, sym += 16, ym += 16)
+ for (j = 0; j < 16; j++)
+ sad += (sym[j] != ym[j] ? 1 : 0);
+
+ return sad;
+}
+int unmasked_sad(unsigned char *src, int p, unsigned char *dst, int dp,
+ unsigned char *ym) {
+ int i, j;
+ unsigned sad = 0;
+ for (i = 0; i < 16; i++, src += p, dst += dp, ym += 16)
+ for (j = 0; j < 16; j++)
+ if (!ym[j])
+ sad += abs(src[j] - dst[j]);
+
+ return sad;
+}
+int masked_motion_search(unsigned char *y, unsigned char *u, unsigned char *v,
+ int yp, int uvp,
+ unsigned char *dy, unsigned char *du, unsigned char *dv,
+ int dyp, int duvp,
+ COLOR_SEG_ELEMENT sgm[],
+ int count,
+ int *mi,
+ int *mj,
+ int *ui,
+ int *uj,
+ int *wm) {
+ int i, j;
+
+ unsigned char ym[256];
+ unsigned char uvm[64];
+ unsigned char dym[256];
+ unsigned char duvm[64];
+ unsigned int e = 0;
+ int beste = 256;
+ int bmi = -32, bmj = -32;
+ int bui = -32, buj = -32;
+ int beste1 = 256;
+ int bmi1 = -32, bmj1 = -32;
+ int bui1 = -32, buj1 = -32;
+ int obeste;
+
+ // first try finding best mask and then unmasked
+ beste = 0xffffffff;
+
+ // find best unmasked mv
+ for (i = -32; i < 32; i++) {
+ unsigned char *dyz = i * dyp + dy;
+ unsigned char *duz = i / 2 * duvp + du;
+ unsigned char *dvz = i / 2 * duvp + dv;
+ for (j = -32; j < 32; j++) {
+ // 0,0 masked destination
+ make_mb_mask(dyz + j, duz + j / 2, dvz + j / 2, dym, duvm, dyp, duvp, sgm, count);
+
+ e = unmasked_sad(y, yp, dyz + j, dyp, dym);
+
+ if (e < beste) {
+ bui = i;
+ buj = j;
+ beste = e;
+ }
+ }
+ }
+ // bui=0;buj=0;
+ // best mv masked destination
+ make_mb_mask(dy + bui * dyp + buj, du + bui / 2 * duvp + buj / 2, dv + bui / 2 * duvp + buj / 2,
+ dym, duvm, dyp, duvp, sgm, count);
+
+ obeste = beste;
+ beste = 0xffffffff;
+
+ // find best masked
+ for (i = -32; i < 32; i++) {
+ unsigned char *dyz = i * dyp + dy;
+ for (j = -32; j < 32; j++) {
+ e = masked_sad(y, yp, dyz + j, dyp, dym);
+
+ if (e < beste) {
+ bmi = i;
+ bmj = j;
+ beste = e;
+ }
+ }
+ }
+ beste1 = beste + obeste;
+ bmi1 = bmi;
+ bmj1 = bmj;
+ bui1 = bui;
+ buj1 = buj;
+
+ beste = 0xffffffff;
+ // source mask
+ make_mb_mask(y, u, v, ym, uvm, yp, uvp, sgm, count);
+
+ // find best mask
+ for (i = -32; i < 32; i++) {
+ unsigned char *dyz = i * dyp + dy;
+ unsigned char *duz = i / 2 * duvp + du;
+ unsigned char *dvz = i / 2 * duvp + dv;
+ for (j = -32; j < 32; j++) {
+ // 0,0 masked destination
+ make_mb_mask(dyz + j, duz + j / 2, dvz + j / 2, dym, duvm, dyp, duvp, sgm, count);
+
+ e = compare_masks(ym, dym);
+
+ if (e < beste) {
+ bmi = i;
+ bmj = j;
+ beste = e;
+ }
+ }
+ }
+
+
+ // best mv masked destination
+ make_mb_mask(dy + bmi * dyp + bmj, du + bmi / 2 * duvp + bmj / 2, dv + bmi / 2 * duvp + bmj / 2,
+ dym, duvm, dyp, duvp, sgm, count);
+
+ obeste = masked_sad(y, yp, dy + bmi * dyp + bmj, dyp, dym);
+
+ beste = 0xffffffff;
+
+ // find best unmasked mv
+ for (i = -32; i < 32; i++) {
+ unsigned char *dyz = i * dyp + dy;
+ for (j = -32; j < 32; j++) {
+ e = unmasked_sad(y, yp, dyz + j, dyp, dym);
+
+ if (e < beste) {
+ bui = i;
+ buj = j;
+ beste = e;
+ }
+ }
+ }
+ beste += obeste;
+
+
+ if (beste < beste1) {
+ *mi = bmi;
+ *mj = bmj;
+ *ui = bui;
+ *uj = buj;
+ *wm = 1;
+ } else {
+ *mi = bmi1;
+ *mj = bmj1;
+ *ui = bui1;
+ *uj = buj1;
+ *wm = 0;
+
+ }
+ return 0;
+}
+
+int predict(unsigned char *src, int p, unsigned char *dst, int dp,
+ unsigned char *ym, unsigned char *prd) {
+ int i, j;
+ for (i = 0; i < 16; i++, src += p, dst += dp, ym += 16, prd += 16)
+ for (j = 0; j < 16; j++)
+ prd[j] = (ym[j] ? src[j] : dst[j]);
+ return 0;
+}
+
+int fast_masked_motion_search(unsigned char *y, unsigned char *u, unsigned char *v,
+ int yp, int uvp,
+ unsigned char *dy, unsigned char *du, unsigned char *dv,
+ int dyp, int duvp,
+ COLOR_SEG_ELEMENT sgm[],
+ int count,
+ int *mi,
+ int *mj,
+ int *ui,
+ int *uj,
+ int *wm) {
+ int i, j;
+
+ unsigned char ym[256];
+ unsigned char ym2[256];
+ unsigned char uvm[64];
+ unsigned char dym2[256];
+ unsigned char dym[256];
+ unsigned char duvm[64];
+ unsigned int e = 0;
+ int beste = 256;
+ int bmi = -32, bmj = -32;
+ int bui = -32, buj = -32;
+ int beste1 = 256;
+ int bmi1 = -32, bmj1 = -32;
+ int bui1 = -32, buj1 = -32;
+ int obeste;
+
+ // first try finding best mask and then unmasked
+ beste = 0xffffffff;
+
+#if 0
+ for (i = 0; i < 16; i++) {
+ unsigned char *dy = i * yp + y;
+ for (j = 0; j < 16; j++)
+ printf("%2x", dy[j]);
+ printf("\n");
+ }
+ printf("\n");
+
+ for (i = -32; i < 48; i++) {
+ unsigned char *dyz = i * dyp + dy;
+ for (j = -32; j < 48; j++)
+ printf("%2x", dyz[j]);
+ printf("\n");
+ }
+#endif
+
+ // find best unmasked mv
+ for (i = -32; i < 32; i++) {
+ unsigned char *dyz = i * dyp + dy;
+ unsigned char *duz = i / 2 * duvp + du;
+ unsigned char *dvz = i / 2 * duvp + dv;
+ for (j = -32; j < 32; j++) {
+ // 0,0 masked destination
+ vp8_makemask_sse3(dyz + j, duz + j / 2, dvz + j / 2, dym, dyp, duvp,
+ sgm[0].y, sgm[0].u, sgm[0].v,
+ sgm[0].yt, sgm[0].ut, sgm[0].vt);
+
+ vp8_growmaskmb_sse3(dym, dym2);
+
+ e = vp9_sad16x16_unmasked_wmt(y, yp, dyz + j, dyp, dym2);
+
+ if (e < beste) {
+ bui = i;
+ buj = j;
+ beste = e;
+ }
+ }
+ }
+ // bui=0;buj=0;
+ // best mv masked destination
+
+ vp8_makemask_sse3(dy + bui * dyp + buj, du + bui / 2 * duvp + buj / 2, dv + bui / 2 * duvp + buj / 2,
+ dym, dyp, duvp,
+ sgm[0].y, sgm[0].u, sgm[0].v,
+ sgm[0].yt, sgm[0].ut, sgm[0].vt);
+
+ vp8_growmaskmb_sse3(dym, dym2);
+
+ obeste = beste;
+ beste = 0xffffffff;
+
+ // find best masked
+ for (i = -32; i < 32; i++) {
+ unsigned char *dyz = i * dyp + dy;
+ for (j = -32; j < 32; j++) {
+ e = vp9_sad16x16_masked_wmt(y, yp, dyz + j, dyp, dym2);
+ if (e < beste) {
+ bmi = i;
+ bmj = j;
+ beste = e;
+ }
+ }
+ }
+ beste1 = beste + obeste;
+ bmi1 = bmi;
+ bmj1 = bmj;
+ bui1 = bui;
+ buj1 = buj;
+
+ // source mask
+ vp8_makemask_sse3(y, u, v,
+ ym, yp, uvp,
+ sgm[0].y, sgm[0].u, sgm[0].v,
+ sgm[0].yt, sgm[0].ut, sgm[0].vt);
+
+ vp8_growmaskmb_sse3(ym, ym2);
+
+ // find best mask
+ for (i = -32; i < 32; i++) {
+ unsigned char *dyz = i * dyp + dy;
+ unsigned char *duz = i / 2 * duvp + du;
+ unsigned char *dvz = i / 2 * duvp + dv;
+ for (j = -32; j < 32; j++) {
+ // 0,0 masked destination
+ vp8_makemask_sse3(dyz + j, duz + j / 2, dvz + j / 2, dym, dyp, duvp,
+ sgm[0].y, sgm[0].u, sgm[0].v,
+ sgm[0].yt, sgm[0].ut, sgm[0].vt);
+
+ vp8_growmaskmb_sse3(dym, dym2);
+
+ e = compare_masks(ym2, dym2);
+
+ if (e < beste) {
+ bmi = i;
+ bmj = j;
+ beste = e;
+ }
+ }
+ }
+
+ vp8_makemask_sse3(dy + bmi * dyp + bmj, du + bmi / 2 * duvp + bmj / 2, dv + bmi / 2 * duvp + bmj / 2,
+ dym, dyp, duvp,
+ sgm[0].y, sgm[0].u, sgm[0].v,
+ sgm[0].yt, sgm[0].ut, sgm[0].vt);
+
+ vp8_growmaskmb_sse3(dym, dym2);
+
+ obeste = vp9_sad16x16_masked_wmt(y, yp, dy + bmi * dyp + bmj, dyp, dym2);
+
+ beste = 0xffffffff;
+
+ // find best unmasked mv
+ for (i = -32; i < 32; i++) {
+ unsigned char *dyz = i * dyp + dy;
+ for (j = -32; j < 32; j++) {
+ e = vp9_sad16x16_unmasked_wmt(y, yp, dyz + j, dyp, dym2);
+
+ if (e < beste) {
+ bui = i;
+ buj = j;
+ beste = e;
+ }
+ }
+ }
+ beste += obeste;
+
+ if (beste < beste1) {
+ *mi = bmi;
+ *mj = bmj;
+ *ui = bui;
+ *uj = buj;
+ *wm = 1;
+ } else {
+ *mi = bmi1;
+ *mj = bmj1;
+ *ui = bui1;
+ *uj = buj1;
+ *wm = 0;
+ beste = beste1;
+
+ }
+ return beste;
+}
+
+int predict_all(unsigned char *ym, unsigned char *um, unsigned char *vm,
+ int ymp, int uvmp,
+ unsigned char *yp, unsigned char *up, unsigned char *vp,
+ int ypp, int uvpp,
+ COLOR_SEG_ELEMENT sgm[],
+ int count,
+ int mi,
+ int mj,
+ int ui,
+ int uj,
+ int wm) {
+ int i, j;
+ unsigned char dym[256];
+ unsigned char dym2[256];
+ unsigned char duvm[64];
+ unsigned char *yu = ym, *uu = um, *vu = vm;
+
+ unsigned char *dym3 = dym2;
+
+ ym += mi * ymp + mj;
+ um += mi / 2 * uvmp + mj / 2;
+ vm += mi / 2 * uvmp + mj / 2;
+
+ yu += ui * ymp + uj;
+ uu += ui / 2 * uvmp + uj / 2;
+ vu += ui / 2 * uvmp + uj / 2;
+
+ // best mv masked destination
+ if (wm)
+ vp8_makemask_sse3(ym, um, vm, dym, ymp, uvmp,
+ sgm[0].y, sgm[0].u, sgm[0].v,
+ sgm[0].yt, sgm[0].ut, sgm[0].vt);
+ else
+ vp8_makemask_sse3(yu, uu, vu, dym, ymp, uvmp,
+ sgm[0].y, sgm[0].u, sgm[0].v,
+ sgm[0].yt, sgm[0].ut, sgm[0].vt);
+
+ vp8_growmaskmb_sse3(dym, dym2);
+ vp8_masked_predictor_wmt(ym, yu, ymp, yp, ypp, dym3);
+ vp8_uv_from_y_mask(dym3, duvm);
+ vp8_masked_predictor_uv_wmt(um, uu, uvmp, up, uvpp, duvm);
+ vp8_masked_predictor_uv_wmt(vm, vu, uvmp, vp, uvpp, duvm);
+
+ return 0;
+}
+
+unsigned char f0p[1280 * 720 * 3 / 2];
+unsigned char f1p[1280 * 720 * 3 / 2];
+unsigned char prd[1280 * 720 * 3 / 2];
+unsigned char msk[1280 * 720 * 3 / 2];
+
+
+int mainz(int argc, char *argv[]) {
+
+ FILE *f = fopen(argv[1], "rb");
+ FILE *g = fopen(argv[2], "wb");
+ int w = atoi(argv[3]), h = atoi(argv[4]);
+ int y_stride = w, uv_stride = w / 2;
+ int r, c;
+ unsigned char *f0 = f0p, *f1 = f1p, *t;
+ unsigned char ym[256], uvm[64];
+ unsigned char ym2[256], uvm2[64];
+ unsigned char ym3[256], uvm3[64];
+ int a, b;
+
+ COLOR_SEG_ELEMENT last = { 20, 20, 20, 20, 230, 20, 1}, best;
+#if 0
+ makeneighbors();
+ COLOR_SEG_ELEMENT segmentation[] = {
+ { 60, 4, 80, 17, 80, 10, 1},
+ { 40, 4, 15, 10, 80, 10, 1},
+ };
+ make_mb_mask(y, u, v, ym2, uvm2, 16, 8, segmentation, 1);
+
+ vp8_makemask_sse3(y, u, v, ym, (int) 16, (int) 8,
+ (int) segmentation[0].y, (int) segmentation[0].u, (int) segmentation[0].v,
+ segmentation[0].yt, segmentation[0].ut, segmentation[0].vt);
+
+ vp8_growmaskmb_sse3(ym, ym3);
+
+ a = vp9_sad16x16_masked_wmt(str, 16, sts, 16, ym3);
+ b = vp9_sad16x16_unmasked_wmt(str, 16, sts, 16, ym3);
+
+ vp8_masked_predictor_wmt(str, sts, 16, ym, 16, ym3);
+
+ vp8_uv_from_y_mask(ym3, uvm3);
+
+ return 4;
+#endif
+ makeneighbors();
+
+
+ memset(prd, 128, w * h * 3 / 2);
+
+ fread(f0, w * h * 3 / 2, 1, f);
+
+ while (!feof(f)) {
+ unsigned char *ys = f1, *yd = f0, *yp = prd;
+ unsigned char *us = f1 + w * h, *ud = f0 + w * h, *up = prd + w * h;
+ unsigned char *vs = f1 + w * h * 5 / 4, *vd = f0 + w * h * 5 / 4, *vp = prd + w * h * 5 / 4;
+ fread(f1, w * h * 3 / 2, 1, f);
+
+ ys += 32 * y_stride;
+ yd += 32 * y_stride;
+ yp += 32 * y_stride;
+ us += 16 * uv_stride;
+ ud += 16 * uv_stride;
+ up += 16 * uv_stride;
+ vs += 16 * uv_stride;
+ vd += 16 * uv_stride;
+ vp += 16 * uv_stride;
+ for (r = 32; r < h - 32; r += 16,
+ ys += 16 * w, yd += 16 * w, yp += 16 * w,
+ us += 8 * uv_stride, ud += 8 * uv_stride, up += 8 * uv_stride,
+ vs += 8 * uv_stride, vd += 8 * uv_stride, vp += 8 * uv_stride) {
+ for (c = 32; c < w - 32; c += 16) {
+ int mi, mj, ui, uj, wm;
+ int bmi, bmj, bui, buj, bwm;
+ unsigned char ym[256];
+
+ if (vp9_sad16x16_sse3(ys + c, y_stride, yd + c, y_stride, 0xffff) == 0)
+ bmi = bmj = bui = buj = bwm = 0;
+ else {
+ COLOR_SEG_ELEMENT cs[5];
+ int j;
+ unsigned int beste = 0xfffffff;
+ unsigned int bestj = 0;
+
+ // try color from last mb segmentation
+ cs[0] = last;
+
+ // try color segs from 4 pixels in mb recon as segmentation
+ cs[1].y = yd[c + y_stride + 1];
+ cs[1].u = ud[c / 2 + uv_stride];
+ cs[1].v = vd[c / 2 + uv_stride];
+ cs[1].yt = cs[1].ut = cs[1].vt = 20;
+ cs[2].y = yd[c + w + 14];
+ cs[2].u = ud[c / 2 + uv_stride + 7];
+ cs[2].v = vd[c / 2 + uv_stride + 7];
+ cs[2].yt = cs[2].ut = cs[2].vt = 20;
+ cs[3].y = yd[c + w * 14 + 1];
+ cs[3].u = ud[c / 2 + uv_stride * 7];
+ cs[3].v = vd[c / 2 + uv_stride * 7];
+ cs[3].yt = cs[3].ut = cs[3].vt = 20;
+ cs[4].y = yd[c + w * 14 + 14];
+ cs[4].u = ud[c / 2 + uv_stride * 7 + 7];
+ cs[4].v = vd[c / 2 + uv_stride * 7 + 7];
+ cs[4].yt = cs[4].ut = cs[4].vt = 20;
+
+ for (j = 0; j < 5; j++) {
+ int e;
+
+ e = fast_masked_motion_search(
+ ys + c, us + c / 2, vs + c / 2, y_stride, uv_stride,
+ yd + c, ud + c / 2, vd + c / 2, y_stride, uv_stride,
+ &cs[j], 1, &mi, &mj, &ui, &uj, &wm);
+
+ if (e < beste) {
+ bmi = mi;
+ bmj = mj;
+ bui = ui;
+ buj = uj, bwm = wm;
+ bestj = j;
+ beste = e;
+ }
+ }
+ best = cs[bestj];
+ // best = segmentation[0];
+ last = best;
+ }
+ predict_all(yd + c, ud + c / 2, vd + c / 2, w, uv_stride,
+ yp + c, up + c / 2, vp + c / 2, w, uv_stride,
+ &best, 1, bmi, bmj, bui, buj, bwm);
+
+ }
+ }
+ fwrite(prd, w * h * 3 / 2, 1, g);
+ t = f0;
+ f0 = f1;
+ f1 = t;
+
+ }
+ fclose(f);
+ fclose(g);
+  return 0;
+}
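
pixel_mask() above classifies a pixel as belonging to a colour segment when its Y, U and V values all fall within that segment's thresholds; make_mb_mask() applies this per 2x2 luma quad sharing one chroma sample, and grow_ymask() then dilates the result using each pixel's neighbours. A self-contained illustration of the per-pixel test, using the segment values from the commented-out example above:

#include <stdio.h>
#include <stdlib.h>

/* Local mirror of the COLOR_SEG_ELEMENT fields used by pixel_mask(). */
typedef struct { unsigned char y, yt, u, ut, v, vt; } seg_t;

static unsigned char in_segment(unsigned char y, unsigned char u,
                                unsigned char v, const seg_t *s) {
  return (abs(y - s->y) < s->yt &&
          abs(u - s->u) < s->ut &&
          abs(v - s->v) < s->vt) ? 255 : 0;
}

int main(void) {
  seg_t s = { 60, 4, 80, 17, 80, 10 };          /* from the commented example */
  printf("%d\n", in_segment(61, 75, 82, &s));   /* 255: all within thresholds */
  printf("%d\n", in_segment(70, 75, 82, &s));   /* 0: luma outside threshold  */
  return 0;
}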
--- /dev/null
+++ b/vp9/common/mbpitch.c
@@ -1,0 +1,124 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "blockd.h"
+
+typedef enum {
+ PRED = 0,
+ DEST = 1
+} BLOCKSET;
+
+static void setup_block
+(
+ BLOCKD *b,
+ int mv_stride,
+ unsigned char **base,
+ unsigned char **base2,
+ int Stride,
+ int offset,
+ BLOCKSET bs
+) {
+
+ if (bs == DEST) {
+ b->dst_stride = Stride;
+ b->dst = offset;
+ b->base_dst = base;
+ } else {
+ b->pre_stride = Stride;
+ b->pre = offset;
+ b->base_pre = base;
+ b->base_second_pre = base2;
+ }
+
+}
+
+
+static void setup_macroblock(MACROBLOCKD *xd, BLOCKSET bs) {
+ int block;
+
+ unsigned char **y, **u, **v;
+ unsigned char **y2, **u2, **v2;
+ BLOCKD *blockd = xd->block;
+ int stride;
+
+ if (bs == DEST) {
+ y = &xd->dst.y_buffer;
+ u = &xd->dst.u_buffer;
+ v = &xd->dst.v_buffer;
+ } else {
+ y = &xd->pre.y_buffer;
+ u = &xd->pre.u_buffer;
+ v = &xd->pre.v_buffer;
+
+ y2 = &xd->second_pre.y_buffer;
+ u2 = &xd->second_pre.u_buffer;
+ v2 = &xd->second_pre.v_buffer;
+ }
+
+ stride = xd->dst.y_stride;
+ for (block = 0; block < 16; block++) { /* y blocks */
+ setup_block(&blockd[block], stride, y, y2, stride,
+ (block >> 2) * 4 * stride + (block & 3) * 4, bs);
+ }
+
+ stride = xd->dst.uv_stride;
+ for (block = 16; block < 20; block++) { /* U and V blocks */
+ setup_block(&blockd[block], stride, u, u2, stride,
+ ((block - 16) >> 1) * 4 * stride + (block & 1) * 4, bs);
+
+ setup_block(&blockd[block + 4], stride, v, v2, stride,
+ ((block - 16) >> 1) * 4 * stride + (block & 1) * 4, bs);
+ }
+}
+
+void vp9_setup_block_dptrs(MACROBLOCKD *xd) {
+ int r, c;
+ BLOCKD *blockd = xd->block;
+
+ for (r = 0; r < 4; r++) {
+ for (c = 0; c < 4; c++) {
+ blockd[r * 4 + c].diff = &xd->diff[r * 4 * 16 + c * 4];
+ blockd[r * 4 + c].predictor = xd->predictor + r * 4 * 16 + c * 4;
+ }
+ }
+
+ for (r = 0; r < 2; r++) {
+ for (c = 0; c < 2; c++) {
+ blockd[16 + r * 2 + c].diff = &xd->diff[256 + r * 4 * 8 + c * 4];
+ blockd[16 + r * 2 + c].predictor =
+ xd->predictor + 256 + r * 4 * 8 + c * 4;
+
+ }
+ }
+
+ for (r = 0; r < 2; r++) {
+ for (c = 0; c < 2; c++) {
+ blockd[20 + r * 2 + c].diff = &xd->diff[320 + r * 4 * 8 + c * 4];
+ blockd[20 + r * 2 + c].predictor =
+ xd->predictor + 320 + r * 4 * 8 + c * 4;
+
+ }
+ }
+
+ blockd[24].diff = &xd->diff[384];
+
+ for (r = 0; r < 25; r++) {
+ blockd[r].qcoeff = xd->qcoeff + r * 16;
+ blockd[r].dqcoeff = xd->dqcoeff + r * 16;
+ }
+}
+
+void vp9_build_block_doffsets(MACROBLOCKD *xd) {
+
+ /* handle the destination pitch features */
+ setup_macroblock(xd, DEST);
+ setup_macroblock(xd, PRED);
+}
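
In setup_macroblock() above the sixteen luma blocks are laid out in raster order, four 4x4 blocks per row, so block n sits at offset (n >> 2) * 4 * stride + (n & 3) * 4. A short standalone check of that mapping (the stride value is an arbitrary example):

#include <stdio.h>

int main(void) {
  const int stride = 32;                      /* arbitrary example stride */
  int block;
  for (block = 0; block < 16; block++) {
    int row = (block >> 2) * 4;
    int col = (block & 3) * 4;
    printf("y block %2d -> row %2d, col %2d, offset %4d\n",
           block, row, col, row * stride + col);
  }
  return 0;
}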
--- /dev/null
+++ b/vp9/common/modecont.c
@@ -1,0 +1,64 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "entropy.h"
+const int vp9_default_mode_contexts[6][4] = {
+ {
+ /* 0 */
+ 7, 1, 1, 183
+ },
+ {
+ /* 1 */
+ 14, 18, 14, 147
+ },
+ {
+ /* 2 */
+ 135, 64, 57, 68
+ },
+ {
+ /* 3 */
+ 60, 56, 128, 65
+ },
+ {
+ /* 4 */
+ 159, 134, 128, 34
+ },
+ {
+ /* 5 */
+ 234, 188, 128, 28
+ },
+};
+const int vp9_default_mode_contexts_a[6][4] = {
+ {
+ /* 0 */
+ 4, 1, 1, 143
+ },
+ {
+ /* 1 */
+ 7, 9, 7, 107
+ },
+ {
+ /* 2 */
+ 95, 34, 57, 68
+ },
+ {
+ /* 3 */
+ 95, 56, 128, 65
+ },
+ {
+ /* 4 */
+ 159, 67, 128, 34
+ },
+ {
+ /* 5 */
+ 234, 94, 128, 28
+ },
+};
--- /dev/null
+++ b/vp9/common/modecont.h
@@ -1,0 +1,17 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef __INC_MODECONT_H
+#define __INC_MODECONT_H
+
+extern const int vp9_default_mode_contexts[6][4];
+extern const int vp9_default_mode_contexts_a[6][4];
+#endif
--- /dev/null
+++ b/vp9/common/modecontext.c
@@ -1,0 +1,145 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "entropymode.h"
+
+const unsigned int vp9_kf_default_bmode_counts[VP9_BINTRAMODES][VP9_BINTRAMODES][VP9_BINTRAMODES] = {
+ {
+ /*Above Mode : 0*/
+ { 43438, 2195, 470, 316, 615, 171, 217, 412, 124, 160, }, /* left_mode 0 */
+ { 5722, 2751, 296, 291, 81, 68, 80, 101, 100, 170, }, /* left_mode 1 */
+ { 1629, 201, 307, 25, 47, 16, 34, 72, 19, 28, }, /* left_mode 2 */
+ { 332, 266, 36, 500, 20, 65, 23, 14, 154, 106, }, /* left_mode 3 */
+ { 450, 97, 10, 24, 117, 10, 2, 12, 8, 71, }, /* left_mode 4 */
+ { 384, 49, 29, 44, 12, 162, 51, 5, 87, 42, }, /* left_mode 5 */
+ { 495, 53, 157, 27, 14, 57, 180, 17, 17, 34, }, /* left_mode 6 */
+ { 695, 64, 62, 9, 27, 5, 3, 147, 10, 26, }, /* left_mode 7 */
+ { 230, 54, 20, 124, 16, 125, 29, 12, 283, 37, }, /* left_mode 8 */
+ { 260, 87, 21, 120, 32, 16, 33, 16, 33, 203, }, /* left_mode 9 */
+ },
+ {
+ /*Above Mode : 1*/
+ { 3934, 2573, 355, 137, 128, 87, 133, 117, 37, 27, }, /* left_mode 0 */
+ { 1036, 1929, 278, 135, 27, 37, 48, 55, 41, 91, }, /* left_mode 1 */
+ { 223, 256, 253, 15, 13, 9, 28, 64, 3, 3, }, /* left_mode 2 */
+ { 120, 129, 17, 316, 15, 11, 9, 4, 53, 74, }, /* left_mode 3 */
+ { 129, 58, 6, 11, 38, 2, 0, 5, 2, 67, }, /* left_mode 4 */
+ { 53, 22, 11, 16, 8, 26, 14, 3, 19, 12, }, /* left_mode 5 */
+ { 59, 26, 61, 11, 4, 9, 35, 13, 8, 8, }, /* left_mode 6 */
+ { 101, 52, 40, 8, 5, 2, 8, 59, 2, 20, }, /* left_mode 7 */
+ { 48, 34, 10, 52, 8, 15, 6, 6, 63, 20, }, /* left_mode 8 */
+ { 96, 48, 22, 63, 11, 14, 5, 8, 9, 96, }, /* left_mode 9 */
+ },
+ {
+ /*Above Mode : 2*/
+ { 709, 461, 506, 36, 27, 33, 151, 98, 24, 6, }, /* left_mode 0 */
+ { 201, 375, 442, 27, 13, 8, 46, 58, 6, 19, }, /* left_mode 1 */
+ { 122, 140, 417, 4, 13, 3, 33, 59, 4, 2, }, /* left_mode 2 */
+ { 36, 17, 22, 16, 6, 8, 12, 17, 9, 21, }, /* left_mode 3 */
+ { 51, 15, 7, 1, 14, 0, 4, 5, 3, 22, }, /* left_mode 4 */
+ { 18, 11, 30, 9, 7, 20, 11, 5, 2, 6, }, /* left_mode 5 */
+ { 38, 21, 103, 9, 4, 12, 79, 13, 2, 5, }, /* left_mode 6 */
+ { 64, 17, 66, 2, 12, 4, 2, 65, 4, 5, }, /* left_mode 7 */
+ { 14, 7, 7, 16, 3, 11, 4, 13, 15, 16, }, /* left_mode 8 */
+ { 36, 8, 32, 9, 9, 4, 14, 7, 6, 24, }, /* left_mode 9 */
+ },
+ {
+ /*Above Mode : 3*/
+ { 1340, 173, 36, 119, 30, 10, 13, 10, 20, 26, }, /* left_mode 0 */
+ { 156, 293, 26, 108, 5, 16, 2, 4, 23, 30, }, /* left_mode 1 */
+ { 60, 34, 13, 7, 3, 3, 0, 8, 4, 5, }, /* left_mode 2 */
+ { 72, 64, 1, 235, 3, 9, 2, 7, 28, 38, }, /* left_mode 3 */
+ { 29, 14, 1, 3, 5, 0, 2, 2, 5, 13, }, /* left_mode 4 */
+ { 22, 7, 4, 11, 2, 5, 1, 2, 6, 4, }, /* left_mode 5 */
+ { 18, 14, 5, 6, 4, 3, 14, 0, 9, 2, }, /* left_mode 6 */
+ { 41, 10, 7, 1, 2, 0, 0, 10, 2, 1, }, /* left_mode 7 */
+ { 23, 19, 2, 33, 1, 5, 2, 0, 51, 8, }, /* left_mode 8 */
+ { 33, 26, 7, 53, 3, 9, 3, 3, 9, 19, }, /* left_mode 9 */
+ },
+ {
+ /*Above Mode : 4*/
+ { 410, 165, 43, 31, 66, 15, 30, 54, 8, 17, }, /* left_mode 0 */
+ { 115, 64, 27, 18, 30, 7, 11, 15, 4, 19, }, /* left_mode 1 */
+ { 31, 23, 25, 1, 7, 2, 2, 10, 0, 5, }, /* left_mode 2 */
+ { 17, 4, 1, 6, 8, 2, 7, 5, 5, 21, }, /* left_mode 3 */
+ { 120, 12, 1, 2, 83, 3, 0, 4, 1, 40, }, /* left_mode 4 */
+ { 4, 3, 1, 2, 1, 2, 5, 0, 3, 6, }, /* left_mode 5 */
+ { 10, 2, 13, 6, 6, 6, 8, 2, 4, 5, }, /* left_mode 6 */
+ { 58, 10, 5, 1, 28, 1, 1, 33, 1, 9, }, /* left_mode 7 */
+ { 8, 2, 1, 4, 2, 5, 1, 1, 2, 10, }, /* left_mode 8 */
+ { 76, 7, 5, 7, 18, 2, 2, 0, 5, 45, }, /* left_mode 9 */
+ },
+ {
+ /*Above Mode : 5*/
+ { 444, 46, 47, 20, 14, 110, 60, 14, 60, 7, }, /* left_mode 0 */
+ { 59, 57, 25, 18, 3, 17, 21, 6, 14, 6, }, /* left_mode 1 */
+ { 24, 17, 20, 6, 4, 13, 7, 2, 3, 2, }, /* left_mode 2 */
+ { 13, 11, 5, 14, 4, 9, 2, 4, 15, 7, }, /* left_mode 3 */
+ { 8, 5, 2, 1, 4, 0, 1, 1, 2, 12, }, /* left_mode 4 */
+ { 19, 5, 5, 7, 4, 40, 6, 3, 10, 4, }, /* left_mode 5 */
+ { 16, 5, 9, 1, 1, 16, 26, 2, 10, 4, }, /* left_mode 6 */
+ { 11, 4, 8, 1, 1, 4, 4, 5, 4, 1, }, /* left_mode 7 */
+ { 15, 1, 3, 7, 3, 21, 7, 1, 34, 5, }, /* left_mode 8 */
+ { 18, 5, 1, 3, 4, 3, 7, 1, 2, 9, }, /* left_mode 9 */
+ },
+ {
+ /*Above Mode : 6*/
+ { 476, 149, 94, 13, 14, 77, 291, 27, 23, 3, }, /* left_mode 0 */
+ { 79, 83, 42, 14, 2, 12, 63, 2, 4, 14, }, /* left_mode 1 */
+ { 43, 36, 55, 1, 3, 8, 42, 11, 5, 1, }, /* left_mode 2 */
+ { 9, 9, 6, 16, 1, 5, 6, 3, 11, 10, }, /* left_mode 3 */
+ { 10, 3, 1, 3, 10, 1, 0, 1, 1, 4, }, /* left_mode 4 */
+ { 14, 6, 15, 5, 1, 20, 25, 2, 5, 0, }, /* left_mode 5 */
+ { 28, 7, 51, 1, 0, 8, 127, 6, 2, 5, }, /* left_mode 6 */
+ { 13, 3, 3, 2, 3, 1, 2, 8, 1, 2, }, /* left_mode 7 */
+ { 10, 3, 3, 3, 3, 8, 2, 2, 9, 3, }, /* left_mode 8 */
+ { 13, 7, 11, 4, 0, 4, 6, 2, 5, 8, }, /* left_mode 9 */
+ },
+ {
+ /*Above Mode : 7*/
+ { 376, 135, 119, 6, 32, 8, 31, 224, 9, 3, }, /* left_mode 0 */
+ { 93, 60, 54, 6, 13, 7, 8, 92, 2, 12, }, /* left_mode 1 */
+ { 74, 36, 84, 0, 3, 2, 9, 67, 2, 1, }, /* left_mode 2 */
+ { 19, 4, 4, 8, 8, 2, 4, 7, 6, 16, }, /* left_mode 3 */
+ { 51, 7, 4, 1, 77, 3, 0, 14, 1, 15, }, /* left_mode 4 */
+ { 7, 7, 5, 7, 4, 7, 4, 5, 0, 3, }, /* left_mode 5 */
+ { 18, 2, 19, 2, 2, 4, 12, 11, 1, 2, }, /* left_mode 6 */
+ { 129, 6, 27, 1, 21, 3, 0, 189, 0, 6, }, /* left_mode 7 */
+ { 9, 1, 2, 8, 3, 7, 0, 5, 3, 3, }, /* left_mode 8 */
+ { 20, 4, 5, 10, 4, 2, 7, 17, 3, 16, }, /* left_mode 9 */
+ },
+ {
+ /*Above Mode : 8*/
+ { 617, 68, 34, 79, 11, 27, 25, 14, 75, 13, }, /* left_mode 0 */
+ { 51, 82, 21, 26, 6, 12, 13, 1, 26, 16, }, /* left_mode 1 */
+ { 29, 9, 12, 11, 3, 7, 1, 10, 2, 2, }, /* left_mode 2 */
+ { 17, 19, 11, 74, 4, 3, 2, 0, 58, 13, }, /* left_mode 3 */
+ { 10, 1, 1, 3, 4, 1, 0, 2, 1, 8, }, /* left_mode 4 */
+ { 14, 4, 5, 5, 1, 13, 2, 0, 27, 8, }, /* left_mode 5 */
+ { 10, 3, 5, 4, 1, 7, 6, 4, 5, 1, }, /* left_mode 6 */
+ { 10, 2, 6, 2, 1, 1, 1, 4, 2, 1, }, /* left_mode 7 */
+ { 14, 8, 5, 23, 2, 12, 6, 2, 117, 5, }, /* left_mode 8 */
+ { 9, 6, 2, 19, 1, 6, 3, 2, 9, 9, }, /* left_mode 9 */
+ },
+ {
+ /*Above Mode : 9*/
+ { 680, 73, 22, 38, 42, 5, 11, 9, 6, 28, }, /* left_mode 0 */
+ { 113, 112, 21, 22, 10, 2, 8, 4, 6, 42, }, /* left_mode 1 */
+ { 44, 20, 24, 6, 5, 4, 3, 3, 1, 2, }, /* left_mode 2 */
+ { 40, 23, 7, 71, 5, 2, 4, 1, 7, 22, }, /* left_mode 3 */
+ { 85, 9, 4, 4, 17, 2, 0, 3, 2, 23, }, /* left_mode 4 */
+ { 13, 4, 2, 6, 1, 7, 0, 1, 7, 6, }, /* left_mode 5 */
+ { 26, 6, 8, 3, 2, 3, 8, 1, 5, 4, }, /* left_mode 6 */
+ { 54, 8, 9, 6, 7, 0, 1, 11, 1, 3, }, /* left_mode 7 */
+ { 9, 10, 4, 13, 2, 5, 4, 2, 14, 8, }, /* left_mode 8 */
+ { 92, 9, 5, 19, 15, 3, 3, 1, 6, 58, }, /* left_mode 9 */
+ },
+};
--- /dev/null
+++ b/vp9/common/mv.h
@@ -1,0 +1,26 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef __INC_MV_H
+#define __INC_MV_H
+#include "vpx/vpx_integer.h"
+
+typedef struct {
+ short row;
+ short col;
+} MV;
+
+typedef union {
+ uint32_t as_int;
+ MV as_mv;
+} int_mv; /* facilitates faster equality tests and copies */
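+
+/* Illustrative sketch of why the union is useful: both 16-bit components
+ * share storage with one 32-bit word, so a copy or an equality test is a
+ * single integer operation, e.g.
+ *
+ *   int_mv a, b;
+ *   a.as_mv.row = 4;
+ *   a.as_mv.col = -8;
+ *   b.as_int = a.as_int;              // copies row and col together
+ *   if (a.as_int == b.as_int) { ... } // one compare instead of two
+ */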
+
+#endif
--- /dev/null
+++ b/vp9/common/mvref_common.c
@@ -1,0 +1,342 @@
+/*
+ * Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "mvref_common.h"
+
+#if CONFIG_NEWBESTREFMV
+
+#define MVREF_NEIGHBOURS 8
+static int mv_ref_search[MVREF_NEIGHBOURS][2] =
+ { {0,-1},{-1,0},{-1,-1},{0,-2},{-2,0},{-1,-2},{-2,-1},{-2,-2} };
+static int ref_distance_weight[MVREF_NEIGHBOURS] =
+ { 3,3,2,1,1,1,1,1 };
+
+// clamp_mv
+#define MV_BORDER (16 << 3) // Allow 16 pels in 1/8th pel units
+static void clamp_mv(const MACROBLOCKD *xd, int_mv *mv) {
+
+ if (mv->as_mv.col < (xd->mb_to_left_edge - MV_BORDER))
+ mv->as_mv.col = xd->mb_to_left_edge - MV_BORDER;
+ else if (mv->as_mv.col > xd->mb_to_right_edge + MV_BORDER)
+ mv->as_mv.col = xd->mb_to_right_edge + MV_BORDER;
+
+ if (mv->as_mv.row < (xd->mb_to_top_edge - MV_BORDER))
+ mv->as_mv.row = xd->mb_to_top_edge - MV_BORDER;
+ else if (mv->as_mv.row > xd->mb_to_bottom_edge + MV_BORDER)
+ mv->as_mv.row = xd->mb_to_bottom_edge + MV_BORDER;
+}
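+
+/* Note: the mb_to_*_edge values and MV_BORDER are all in 1/8th pel units,
+ * so the 16 pel border above is 16 << 3 = 128.  Rough usage sketch
+ * (values illustrative only):
+ *
+ *   int_mv mv;
+ *   mv.as_mv.row = -4000;   // far above the visible frame
+ *   mv.as_mv.col = 24;
+ *   clamp_mv(xd, &mv);      // row pulled back to mb_to_top_edge - MV_BORDER
+ */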
+
+
+// Gets the best matching candidate reference motion vector
+// from the given mode info structure (if available)
+static int get_candidate_mvref(
+ const MODE_INFO *candidate_mi,
+ MV_REFERENCE_FRAME ref_frame,
+ MV_REFERENCE_FRAME *c_ref_frame,
+ int_mv *c_mv,
+ MV_REFERENCE_FRAME *c2_ref_frame,
+ int_mv *c2_mv
+) {
+
+ int ret_val = FALSE;
+ c2_mv->as_int = 0;
+ *c2_ref_frame = INTRA_FRAME;
+
+ // Target ref frame matches candidate first ref frame
+ if (ref_frame == candidate_mi->mbmi.ref_frame) {
+ c_mv->as_int = candidate_mi->mbmi.mv[0].as_int;
+ *c_ref_frame = ref_frame;
+ ret_val = TRUE;
+
+    // Is there a second non-zero vector we can use?
+ if ((candidate_mi->mbmi.second_ref_frame != INTRA_FRAME) &&
+ (candidate_mi->mbmi.mv[1].as_int != 0) &&
+ (candidate_mi->mbmi.mv[1].as_int != c_mv->as_int)) {
+ c2_mv->as_int = candidate_mi->mbmi.mv[1].as_int;
+ *c2_ref_frame = candidate_mi->mbmi.second_ref_frame;
+ }
+
+ // Target ref frame matches candidate second ref frame
+ } else if (ref_frame == candidate_mi->mbmi.second_ref_frame) {
+ c_mv->as_int = candidate_mi->mbmi.mv[1].as_int;
+ *c_ref_frame = ref_frame;
+ ret_val = TRUE;
+
+    // Is there a second non-zero vector we can use?
+ if ((candidate_mi->mbmi.ref_frame != INTRA_FRAME) &&
+ (candidate_mi->mbmi.mv[0].as_int != 0) &&
+ (candidate_mi->mbmi.mv[0].as_int != c_mv->as_int)) {
+ c2_mv->as_int = candidate_mi->mbmi.mv[0].as_int;
+ *c2_ref_frame = candidate_mi->mbmi.ref_frame;
+ }
+
+    // No ref frame matches, so use the first ref mv as the first choice
+ } else if (candidate_mi->mbmi.ref_frame != INTRA_FRAME) {
+ c_mv->as_int = candidate_mi->mbmi.mv[0].as_int;
+ *c_ref_frame = candidate_mi->mbmi.ref_frame;
+ ret_val = TRUE;
+
+    // Is there a second non-zero vector we can use?
+ if ((candidate_mi->mbmi.second_ref_frame != INTRA_FRAME) &&
+ (candidate_mi->mbmi.mv[1].as_int != 0) &&
+ (candidate_mi->mbmi.mv[1].as_int != c_mv->as_int)) {
+ c2_mv->as_int = candidate_mi->mbmi.mv[1].as_int;
+ *c2_ref_frame = candidate_mi->mbmi.second_ref_frame;
+ }
+
+    // If only the second ref mv is valid (this should not trigger in the
+    // current code base given the possible compound prediction options).
+ } else if (candidate_mi->mbmi.second_ref_frame != INTRA_FRAME) {
+ c_mv->as_int = candidate_mi->mbmi.mv[1].as_int;
+ *c_ref_frame = candidate_mi->mbmi.second_ref_frame;
+ ret_val = TRUE;
+ }
+
+ return ret_val;
+}
+
+// Performs mv adjustment based on reference frame and clamps the MV
+// if it goes off the edge of the buffer.
+static void scale_mv(
+ MACROBLOCKD *xd,
+ MV_REFERENCE_FRAME this_ref_frame,
+ MV_REFERENCE_FRAME candidate_ref_frame,
+ int_mv *candidate_mv,
+ int *ref_sign_bias
+) {
+
+ if (candidate_ref_frame != this_ref_frame) {
+
+ //int frame_distances[MAX_REF_FRAMES];
+ //int last_distance = 1;
+ //int gf_distance = xd->frames_since_golden;
+ //int arf_distance = xd->frames_till_alt_ref_frame;
+
+ // Sign inversion where appropriate.
+ if (ref_sign_bias[candidate_ref_frame] != ref_sign_bias[this_ref_frame]) {
+ candidate_mv->as_mv.row = -candidate_mv->as_mv.row;
+ candidate_mv->as_mv.col = -candidate_mv->as_mv.col;
+ }
+
+    // Scale based on frame distance if the reference frames are not the same.
+ /*frame_distances[INTRA_FRAME] = 1; // should never be used
+ frame_distances[LAST_FRAME] = 1;
+ frame_distances[GOLDEN_FRAME] =
+ (xd->frames_since_golden) ? xd->frames_since_golden : 1;
+ frame_distances[ALTREF_FRAME] =
+ (xd->frames_till_alt_ref_frame) ? xd->frames_till_alt_ref_frame : 1;
+
+ if (frame_distances[this_ref_frame] &&
+ frame_distances[candidate_ref_frame]) {
+ candidate_mv->as_mv.row =
+ (short)(((int)(candidate_mv->as_mv.row) *
+ frame_distances[this_ref_frame]) /
+ frame_distances[candidate_ref_frame]);
+
+ candidate_mv->as_mv.col =
+ (short)(((int)(candidate_mv->as_mv.col) *
+ frame_distances[this_ref_frame]) /
+ frame_distances[candidate_ref_frame]);
+ }
+ */
+ }
+
+ // Clamp the MV so it does not point out of the frame buffer
+ clamp_mv(xd, candidate_mv);
+}
+
+// Adds a new candidate reference vector to the list if indeed it is new.
+// If it is not new then the score of the existing candidate that it matches
+// is increased and the list is resorted.
+static void addmv_and_shuffle(
+ int_mv *mv_list,
+ int *mv_scores,
+ int *index,
+ int_mv candidate_mv,
+ int weight
+) {
+
+ int i = *index;
+ int duplicate_found = FALSE;
+
+  // Check for duplicates. If there is one, increment its score.
+  // A duplicate is an exact match on the packed (as_int) vector.
+ while (i > 0) {
+ i--;
+
+ if (candidate_mv.as_int == mv_list[i].as_int) {
+ duplicate_found = TRUE;
+ mv_scores[i] += weight;
+ break;
+ }
+ }
+
+ // If no duplicate was found add the new vector and give it a weight
+ if (!duplicate_found) {
+ mv_list[*index].as_int = candidate_mv.as_int;
+ mv_scores[*index] = weight;
+ i = *index;
+ (*index)++;
+ }
+
+  // Reshuffle the list so that the highest scoring mvs are at the top.
+ while (i > 0) {
+ if (mv_scores[i] > mv_scores[i-1]) {
+ int tmp_score = mv_scores[i-1];
+ int_mv tmp_mv = mv_list[i-1];
+
+ mv_scores[i-1] = mv_scores[i];
+ mv_list[i-1] = mv_list[i];
+ mv_scores[i] = tmp_score;
+ mv_list[i] = tmp_mv;
+ i--;
+ } else
+ break;
+ }
+}
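+
+/* Worked example (illustrative): with mv_list = {A, B}, mv_scores = {5, 3}
+ * and *index == 2, adding B again with weight 4 bumps B's score to 7 and the
+ * reshuffle loop swaps it ahead of A, leaving mv_list = {B, A} and
+ * mv_scores = {7, 5}.  Adding a new vector C instead appends it with its
+ * weight and bubbles it up only as far as its score allows. */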
+
+// This function searches the neighbourhood of a given MB/SB and populates a
+// list of candidate reference vectors.
+//
+void vp9_find_mv_refs(
+ MACROBLOCKD *xd,
+ MODE_INFO *here,
+ MODE_INFO *lf_here,
+ MV_REFERENCE_FRAME ref_frame,
+ int_mv *mv_ref_list,
+ int *ref_sign_bias
+) {
+
+ int i;
+ MODE_INFO *candidate_mi;
+ int_mv candidate_mvs[MAX_MV_REFS];
+ int_mv c_refmv;
+ MV_REFERENCE_FRAME c_ref_frame;
+ int_mv c2_refmv;
+ MV_REFERENCE_FRAME c2_ref_frame;
+ int candidate_scores[MAX_MV_REFS];
+ int index = 0;
+ int ref_weight = 0;
+ int valid_mv_ref;
+
+ // Blank the reference vector lists and other local structures.
+ vpx_memset(mv_ref_list, 0, sizeof(int_mv) * MAX_MV_REFS);
+ vpx_memset(candidate_mvs, 0, sizeof(int_mv) * MAX_MV_REFS);
+ vpx_memset(candidate_scores, 0, sizeof(candidate_scores));
+
+ // Populate a list with candidate reference vectors from the
+ // spatial neighbours.
+ for (i = 0; i < 2; ++i) {
+ if (((mv_ref_search[i][0] << 7) >= xd->mb_to_left_edge) &&
+ ((mv_ref_search[i][1] << 7) >= xd->mb_to_top_edge)) {
+
+ candidate_mi = here + mv_ref_search[i][0] +
+ (mv_ref_search[i][1] * xd->mode_info_stride);
+
+ valid_mv_ref = get_candidate_mvref(candidate_mi, ref_frame,
+ &c_ref_frame, &c_refmv,
+ &c2_ref_frame, &c2_refmv);
+
+ // If there is a valid MV candidate then add it to the list
+ if (valid_mv_ref) {
+ scale_mv(xd, ref_frame, c_ref_frame, &c_refmv, ref_sign_bias );
+ ref_weight = ref_distance_weight[i] +
+ ((c_ref_frame == ref_frame) << 4);
+
+ addmv_and_shuffle(candidate_mvs, candidate_scores,
+ &index, c_refmv, ref_weight);
+
+ // If there is a second valid mv then add it as well.
+ if (c2_ref_frame != INTRA_FRAME) {
+ scale_mv(xd, ref_frame, c2_ref_frame, &c2_refmv, ref_sign_bias );
+ ref_weight = ref_distance_weight[i] +
+ ((c2_ref_frame == ref_frame) << 4);
+
+ addmv_and_shuffle(candidate_mvs, candidate_scores,
+ &index, c2_refmv, ref_weight);
+ }
+ }
+ }
+ }
+
+ // Look at the corresponding vector in the last frame
+ candidate_mi = lf_here;
+ valid_mv_ref = get_candidate_mvref(candidate_mi, ref_frame,
+ &c_ref_frame, &c_refmv,
+ &c2_ref_frame, &c2_refmv);
+
+ // If there is a valid MV candidate then add it to the list
+ if (valid_mv_ref) {
+ scale_mv(xd, ref_frame, c_ref_frame, &c_refmv, ref_sign_bias );
+ ref_weight = 2 + ((c_ref_frame == ref_frame) << 4);
+ addmv_and_shuffle(candidate_mvs, candidate_scores,
+ &index, c_refmv, ref_weight);
+
+ // If there is a second valid mv then add it as well.
+ if (c2_ref_frame != INTRA_FRAME) {
+ scale_mv(xd, ref_frame, c2_ref_frame, &c2_refmv, ref_sign_bias );
+      ref_weight = 2 + ((c2_ref_frame == ref_frame) << 4);
+
+ addmv_and_shuffle(candidate_mvs, candidate_scores,
+ &index, c2_refmv, ref_weight);
+ }
+ }
+
+  // Populate the list with candidate reference vectors from the remaining
+  // spatial neighbours.
+ for (i = 2; i < MVREF_NEIGHBOURS; ++i) {
+ if (((mv_ref_search[i][0] << 7) >= xd->mb_to_left_edge) &&
+ ((mv_ref_search[i][1] << 7) >= xd->mb_to_top_edge)) {
+
+ candidate_mi = here + mv_ref_search[i][0] +
+ (mv_ref_search[i][1] * xd->mode_info_stride);
+
+ valid_mv_ref = get_candidate_mvref(candidate_mi, ref_frame,
+ &c_ref_frame, &c_refmv,
+ &c2_ref_frame, &c2_refmv);
+
+ // If there is a valid MV candidate then add it to the list
+ if (valid_mv_ref) {
+ scale_mv(xd, ref_frame, c_ref_frame, &c_refmv, ref_sign_bias );
+ ref_weight = ref_distance_weight[i] +
+ ((c_ref_frame == ref_frame) << 4);
+
+ addmv_and_shuffle(candidate_mvs, candidate_scores,
+ &index, c_refmv, ref_weight);
+
+ // If there is a second valid mv then add it as well.
+ if (c2_ref_frame != INTRA_FRAME) {
+ scale_mv(xd, ref_frame, c2_ref_frame, &c2_refmv, ref_sign_bias );
+ ref_weight = ref_distance_weight[i] +
+ ((c2_ref_frame == ref_frame) << 4);
+
+ addmv_and_shuffle(candidate_mvs, candidate_scores,
+ &index, c2_refmv, ref_weight);
+ }
+ }
+ }
+ }
+
+ // 0,0 is always a valid reference.
+ for (i = 0; i < index; ++i)
+ if (candidate_mvs[i].as_int == 0)
+ break;
+ if (i == index) {
+ c_refmv.as_int = 0;
+ addmv_and_shuffle(candidate_mvs, candidate_scores,
+ &index, c_refmv, candidate_scores[3]+1 );
+ }
+
+ // Copy over the candidate list.
+ vpx_memcpy(mv_ref_list, candidate_mvs, sizeof(candidate_mvs));
+}
+
+#endif
--- /dev/null
+++ b/vp9/common/mvref_common.h
@@ -1,0 +1,31 @@
+/*
+ * Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "onyxc_int.h"
+#include "blockd.h"
+
+// MV reference common header file.
+#if CONFIG_NEWBESTREFMV
+
+#ifndef __INC_MVREF_COMMON_H
+#define __INC_MVREF_COMMON_H
+
+void vp9_find_mv_refs(
+ MACROBLOCKD *xd,
+ MODE_INFO *here,
+ MODE_INFO *lf_here,
+ MV_REFERENCE_FRAME ref_frame,
+ int_mv * mv_ref_list,
+ int *ref_sign_bias
+);
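+
+/* Rough caller sketch (assumes the usual decode loop locals xd, mi, prev_mi
+ * and cm are in scope; illustrative only):
+ *
+ *   int_mv mv_ref_list[MAX_MV_REFS];
+ *   vp9_find_mv_refs(xd, mi, prev_mi, mbmi->ref_frame,
+ *                    mv_ref_list, cm->ref_frame_sign_bias);
+ *   // mv_ref_list[0] now holds the highest scoring candidate.
+ */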
+
+#endif
+
+#endif
--- /dev/null
+++ b/vp9/common/onyx.h
@@ -1,0 +1,225 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef __INC_ONYX_H
+#define __INC_ONYX_H
+
+#ifdef __cplusplus
+extern "C"
+{
+#endif
+
+#include "vpx/internal/vpx_codec_internal.h"
+#include "vpx/vp8cx.h"
+#include "vpx_scale/yv12config.h"
+#include "type_aliases.h"
+#include "ppflags.h"
+ typedef int *VP9_PTR;
+
+ /* Create/destroy static data structures. */
+
+ typedef enum {
+ NORMAL = 0,
+ FOURFIVE = 1,
+ THREEFIVE = 2,
+ ONETWO = 3
+
+ } VPX_SCALING;
+
+ typedef enum {
+ VP9_LAST_FLAG = 1,
+ VP9_GOLD_FLAG = 2,
+ VP9_ALT_FLAG = 4
+ } VP9_REFFRAME;
+
+
+ typedef enum {
+ USAGE_STREAM_FROM_SERVER = 0x0,
+ USAGE_LOCAL_FILE_PLAYBACK = 0x1,
+ USAGE_CONSTRAINED_QUALITY = 0x2
+ } END_USAGE;
+
+
+ typedef enum {
+ MODE_GOODQUALITY = 0x1,
+ MODE_BESTQUALITY = 0x2,
+ MODE_FIRSTPASS = 0x3,
+ MODE_SECONDPASS = 0x4,
+ MODE_SECONDPASS_BEST = 0x5,
+ } MODE;
+
+ typedef enum {
+ FRAMEFLAGS_KEY = 1,
+ FRAMEFLAGS_GOLDEN = 2,
+ FRAMEFLAGS_ALTREF = 4,
+ } FRAMETYPE_FLAGS;
+
+
+#include <assert.h>
+ static __inline void Scale2Ratio(int mode, int *hr, int *hs) {
+ switch (mode) {
+ case NORMAL:
+ *hr = 1;
+ *hs = 1;
+ break;
+ case FOURFIVE:
+ *hr = 4;
+ *hs = 5;
+ break;
+ case THREEFIVE:
+ *hr = 3;
+ *hs = 5;
+ break;
+ case ONETWO:
+ *hr = 1;
+ *hs = 2;
+ break;
+ default:
+ *hr = 1;
+ *hs = 1;
+ assert(0);
+ break;
+ }
+ }
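+
+  /* Example: Scale2Ratio(FOURFIVE, &hr, &hs) gives hr = 4, hs = 5, i.e. the
+   * frame is scaled to 4/5 of its original size in that dimension. */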
+
+ typedef struct {
+    int Version; // 4 versions of bitstream defined: 0 = best quality/slowest decode, 3 = lowest quality/fastest decode
+ int Width; // width of data passed to the compressor
+ int Height; // height of data passed to the compressor
+ double frame_rate; // set to passed in framerate
+ int target_bandwidth; // bandwidth to be used in kilobits per second
+
+    int noise_sensitivity; // parameter used for applying pre-processing blur; recommended value 0
+    int Sharpness;         // parameter used for sharpening output; recommended value 0
+ int cpu_used;
+ unsigned int rc_max_intra_bitrate_pct;
+
+ // mode ->
+    // (0)=Realtime/Live Encoding. This mode is optimized for realtime encoding (for example, capturing
+ // a television signal or feed from a live camera). ( speed setting controls how fast )
+ // (1)=Good Quality Fast Encoding. The encoder balances quality with the amount of time it takes to
+ // encode the output. ( speed setting controls how fast )
+ // (2)=One Pass - Best Quality. The encoder places priority on the quality of the output over encoding
+ // speed. The output is compressed at the highest possible quality. This option takes the longest
+ // amount of time to encode. ( speed setting ignored )
+ // (3)=Two Pass - First Pass. The encoder generates a file of statistics for use in the second encoding
+ // pass. ( speed setting controls how fast )
+ // (4)=Two Pass - Second Pass. The encoder uses the statistics that were generated in the first encoding
+ // pass to create the compressed output. ( speed setting controls how fast )
+ // (5)=Two Pass - Second Pass Best. The encoder uses the statistics that were generated in the first
+ // encoding pass to create the compressed output using the highest possible quality, and taking a
+    //                          longer amount of time to encode. ( speed setting ignored )
+ int Mode; //
+
+ // Key Framing Operations
+ int auto_key; // automatically detect cut scenes and set the keyframes
+ int key_freq; // maximum distance to key frame.
+
+    int allow_lag; // allow lagged compression (if 0, lag_in_frames is ignored)
+ int lag_in_frames; // how many frames lag before we start encoding
+
+ // ----------------------------------------------------------------
+ // DATARATE CONTROL OPTIONS
+
+ int end_usage; // vbr or cbr
+
+ // buffer targeting aggressiveness
+ int under_shoot_pct;
+ int over_shoot_pct;
+
+ // buffering parameters
+ int starting_buffer_level; // in seconds
+ int optimal_buffer_level;
+ int maximum_buffer_size;
+
+ // controlling quality
+ int fixed_q;
+ int worst_allowed_q;
+ int best_allowed_q;
+ int cq_level;
+ int lossless;
+
+ // two pass datarate control
+ int two_pass_vbrbias; // two pass datarate control tweaks
+ int two_pass_vbrmin_section;
+ int two_pass_vbrmax_section;
+ // END DATARATE CONTROL OPTIONS
+ // ----------------------------------------------------------------
+
+
+    // these parameters aren't to be used in the final build; don't use!
+ int play_alternate;
+ int alt_freq;
+
+ int encode_breakout; // early breakout encode threshold : for video conf recommend 800
+
+ int arnr_max_frames;
+ int arnr_strength;
+ int arnr_type;
+
+ struct vpx_fixed_buf two_pass_stats_in;
+ struct vpx_codec_pkt_list *output_pkt_list;
+
+ vp8e_tuning tuning;
+ } VP9_CONFIG;
+
+
+ void vp9_initialize_enc();
+
+ VP9_PTR vp9_create_compressor(VP9_CONFIG *oxcf);
+ void vp9_remove_compressor(VP9_PTR *comp);
+
+ void vp9_change_config(VP9_PTR onyx, VP9_CONFIG *oxcf);
+
+// Receive a frame's worth of data. The caller can assume that a copy of this
+// frame is made and not just a copy of the pointer.
+ int vp9_receive_raw_frame(VP9_PTR comp, unsigned int frame_flags,
+ YV12_BUFFER_CONFIG *sd, int64_t time_stamp,
+ int64_t end_time_stamp);
+
+ int vp9_get_compressed_data(VP9_PTR comp, unsigned int *frame_flags,
+ unsigned long *size, unsigned char *dest,
+ int64_t *time_stamp, int64_t *time_end,
+ int flush);
+
+ int vp9_get_preview_raw_frame(VP9_PTR comp, YV12_BUFFER_CONFIG *dest,
+ vp9_ppflags_t *flags);
+
+ int vp9_use_as_reference(VP9_PTR comp, int ref_frame_flags);
+
+ int vp9_update_reference(VP9_PTR comp, int ref_frame_flags);
+
+ int vp9_get_reference_enc(VP9_PTR comp, VP9_REFFRAME ref_frame_flag,
+ YV12_BUFFER_CONFIG *sd);
+
+ int vp9_set_reference_enc(VP9_PTR comp, VP9_REFFRAME ref_frame_flag,
+ YV12_BUFFER_CONFIG *sd);
+
+ int vp9_update_entropy(VP9_PTR comp, int update);
+
+ int vp9_set_roimap(VP9_PTR comp, unsigned char *map,
+ unsigned int rows, unsigned int cols,
+ int delta_q[4], int delta_lf[4],
+ unsigned int threshold[4]);
+
+ int vp9_set_active_map(VP9_PTR comp, unsigned char *map,
+ unsigned int rows, unsigned int cols);
+
+ int vp9_set_internal_size(VP9_PTR comp,
+ VPX_SCALING horiz_mode, VPX_SCALING vert_mode);
+
+ int vp9_get_quantizer(VP9_PTR c);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif // __INC_ONYX_H
--- /dev/null
+++ b/vp9/common/onyxc_int.h
@@ -1,0 +1,314 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef __INC_ONYXC_INT_H
+#define __INC_ONYXC_INT_H
+
+#include "vpx_config.h"
+#include "vpx/internal/vpx_codec_internal.h"
+#include "vpx_rtcd.h"
+#include "loopfilter.h"
+#include "entropymv.h"
+#include "entropy.h"
+#include "entropymode.h"
+#include "idct.h"
+#if CONFIG_POSTPROC
+#include "postproc.h"
+#endif
+
+/*#ifdef PACKET_TESTING*/
+#include "header.h"
+/*#endif*/
+
+/* Create/destroy static data structures. */
+
+void vp9_initialize_common(void);
+
+#define MINQ 0
+
+#define MAXQ 255
+#define QINDEX_BITS 8
+
+#define QINDEX_RANGE (MAXQ + 1)
+
+#define NUM_YV12_BUFFERS 4
+
+#define COMP_PRED_CONTEXTS 2
+
+typedef struct frame_contexts {
+ vp9_prob bmode_prob [VP9_BINTRAMODES - 1];
+ vp9_prob ymode_prob [VP9_YMODES - 1]; /* interframe intra mode probs */
+ vp9_prob uv_mode_prob [VP9_YMODES][VP9_UV_MODES - 1];
+ vp9_prob i8x8_mode_prob [VP9_I8X8_MODES - 1];
+ vp9_prob sub_mv_ref_prob [SUBMVREF_COUNT][VP9_SUBMVREFS - 1];
+ vp9_prob mbsplit_prob [VP9_NUMMBSPLITS - 1];
+ vp9_prob coef_probs [BLOCK_TYPES] [COEF_BANDS] [PREV_COEF_CONTEXTS] [ENTROPY_NODES];
+ vp9_prob hybrid_coef_probs [BLOCK_TYPES] [COEF_BANDS] [PREV_COEF_CONTEXTS] [ENTROPY_NODES];
+ vp9_prob coef_probs_8x8 [BLOCK_TYPES_8X8] [COEF_BANDS] [PREV_COEF_CONTEXTS] [ENTROPY_NODES];
+ vp9_prob hybrid_coef_probs_8x8 [BLOCK_TYPES_8X8] [COEF_BANDS] [PREV_COEF_CONTEXTS] [ENTROPY_NODES];
+ vp9_prob coef_probs_16x16 [BLOCK_TYPES_16X16] [COEF_BANDS] [PREV_COEF_CONTEXTS] [ENTROPY_NODES];
+ vp9_prob hybrid_coef_probs_16x16 [BLOCK_TYPES_16X16] [COEF_BANDS] [PREV_COEF_CONTEXTS] [ENTROPY_NODES];
+
+ nmv_context nmvc;
+ nmv_context pre_nmvc;
+ vp9_prob pre_bmode_prob [VP9_BINTRAMODES - 1];
+ vp9_prob pre_ymode_prob [VP9_YMODES - 1]; /* interframe intra mode probs */
+ vp9_prob pre_uv_mode_prob [VP9_YMODES][VP9_UV_MODES - 1];
+ vp9_prob pre_i8x8_mode_prob [VP9_I8X8_MODES - 1];
+ vp9_prob pre_sub_mv_ref_prob [SUBMVREF_COUNT][VP9_SUBMVREFS - 1];
+ vp9_prob pre_mbsplit_prob [VP9_NUMMBSPLITS - 1];
+ unsigned int bmode_counts [VP9_BINTRAMODES];
+ unsigned int ymode_counts [VP9_YMODES]; /* interframe intra mode probs */
+ unsigned int uv_mode_counts [VP9_YMODES][VP9_UV_MODES];
+ unsigned int i8x8_mode_counts [VP9_I8X8_MODES]; /* interframe intra mode probs */
+ unsigned int sub_mv_ref_counts [SUBMVREF_COUNT][VP9_SUBMVREFS];
+ unsigned int mbsplit_counts [VP9_NUMMBSPLITS];
+
+ vp9_prob pre_coef_probs [BLOCK_TYPES] [COEF_BANDS]
+ [PREV_COEF_CONTEXTS] [ENTROPY_NODES];
+ vp9_prob pre_hybrid_coef_probs [BLOCK_TYPES] [COEF_BANDS]
+ [PREV_COEF_CONTEXTS] [ENTROPY_NODES];
+
+ vp9_prob pre_coef_probs_8x8 [BLOCK_TYPES_8X8] [COEF_BANDS]
+ [PREV_COEF_CONTEXTS] [ENTROPY_NODES];
+ vp9_prob pre_hybrid_coef_probs_8x8 [BLOCK_TYPES_8X8] [COEF_BANDS]
+ [PREV_COEF_CONTEXTS] [ENTROPY_NODES];
+
+ vp9_prob pre_coef_probs_16x16 [BLOCK_TYPES_16X16] [COEF_BANDS]
+ [PREV_COEF_CONTEXTS] [ENTROPY_NODES];
+ vp9_prob pre_hybrid_coef_probs_16x16 [BLOCK_TYPES_16X16] [COEF_BANDS]
+ [PREV_COEF_CONTEXTS] [ENTROPY_NODES];
+
+ unsigned int coef_counts [BLOCK_TYPES] [COEF_BANDS]
+ [PREV_COEF_CONTEXTS] [MAX_ENTROPY_TOKENS];
+ unsigned int hybrid_coef_counts [BLOCK_TYPES] [COEF_BANDS]
+ [PREV_COEF_CONTEXTS] [MAX_ENTROPY_TOKENS];
+
+ unsigned int coef_counts_8x8 [BLOCK_TYPES_8X8] [COEF_BANDS]
+ [PREV_COEF_CONTEXTS] [MAX_ENTROPY_TOKENS];
+ unsigned int hybrid_coef_counts_8x8 [BLOCK_TYPES_8X8] [COEF_BANDS]
+ [PREV_COEF_CONTEXTS] [MAX_ENTROPY_TOKENS];
+
+ unsigned int coef_counts_16x16 [BLOCK_TYPES_16X16] [COEF_BANDS]
+ [PREV_COEF_CONTEXTS] [MAX_ENTROPY_TOKENS];
+ unsigned int hybrid_coef_counts_16x16 [BLOCK_TYPES_16X16] [COEF_BANDS]
+ [PREV_COEF_CONTEXTS] [MAX_ENTROPY_TOKENS];
+
+ nmv_context_counts NMVcount;
+ vp9_prob switchable_interp_prob[VP9_SWITCHABLE_FILTERS + 1]
+ [VP9_SWITCHABLE_FILTERS - 1];
+
+ int mode_context[6][4];
+ int mode_context_a[6][4];
+ int vp8_mode_contexts[6][4];
+ int mv_ref_ct[6][4][2];
+ int mv_ref_ct_a[6][4][2];
+} FRAME_CONTEXT;
+
+typedef enum {
+ RECON_CLAMP_REQUIRED = 0,
+ RECON_CLAMP_NOTREQUIRED = 1
+} CLAMP_TYPE;
+
+typedef enum {
+ SINGLE_PREDICTION_ONLY = 0,
+ COMP_PREDICTION_ONLY = 1,
+ HYBRID_PREDICTION = 2,
+ NB_PREDICTION_TYPES = 3,
+} COMPPREDMODE_TYPE;
+
+typedef enum {
+ ONLY_4X4 = 0,
+ ALLOW_8X8 = 1,
+ ALLOW_16X16 = 2,
+ TX_MODE_SELECT = 3,
+ NB_TXFM_MODES = 4,
+} TXFM_MODE;
+
+typedef struct VP9_COMMON_RTCD {
+#if CONFIG_RUNTIME_CPU_DETECT
+ vp9_idct_rtcd_vtable_t idct;
+ vp9_subpix_rtcd_vtable_t subpix;
+#if CONFIG_POSTPROC
+ vp9_postproc_rtcd_vtable_t postproc;
+#endif
+ int flags;
+#else
+ int unused;
+#endif
+} VP9_COMMON_RTCD;
+
+typedef struct VP9Common {
+ struct vpx_internal_error_info error;
+
+ DECLARE_ALIGNED(16, short, Y1dequant[QINDEX_RANGE][16]);
+ DECLARE_ALIGNED(16, short, Y2dequant[QINDEX_RANGE][16]);
+ DECLARE_ALIGNED(16, short, UVdequant[QINDEX_RANGE][16]);
+
+ int Width;
+ int Height;
+ int horiz_scale;
+ int vert_scale;
+
+ YUV_TYPE clr_type;
+ CLAMP_TYPE clamp_type;
+
+ YV12_BUFFER_CONFIG *frame_to_show;
+
+ YV12_BUFFER_CONFIG yv12_fb[NUM_YV12_BUFFERS];
+ int fb_idx_ref_cnt[NUM_YV12_BUFFERS];
+ int new_fb_idx, lst_fb_idx, gld_fb_idx, alt_fb_idx;
+
+ YV12_BUFFER_CONFIG post_proc_buffer;
+ YV12_BUFFER_CONFIG temp_scale_frame;
+
+
+ FRAME_TYPE last_frame_type; /* Save last frame's frame type for motion search. */
+ FRAME_TYPE frame_type;
+
+ int show_frame;
+
+ int frame_flags;
+ int MBs;
+ int mb_rows;
+ int mb_cols;
+ int mode_info_stride;
+
+ /* profile settings */
+ int experimental;
+ int mb_no_coeff_skip;
+ TXFM_MODE txfm_mode;
+ COMPPREDMODE_TYPE comp_pred_mode;
+ int no_lpf;
+ int use_bilinear_mc_filter;
+ int full_pixel;
+
+ int base_qindex;
+ int last_kf_gf_q; /* Q used on the last GF or KF */
+
+ int y1dc_delta_q;
+ int y2dc_delta_q;
+ int y2ac_delta_q;
+ int uvdc_delta_q;
+ int uvac_delta_q;
+
+ unsigned int frames_since_golden;
+ unsigned int frames_till_alt_ref_frame;
+
+ /* We allocate a MODE_INFO struct for each macroblock, together with
+ an extra row on top and column on the left to simplify prediction. */
+
+ MODE_INFO *mip; /* Base of allocated array */
+ MODE_INFO *mi; /* Corresponds to upper left visible macroblock */
+ MODE_INFO *prev_mip; /* MODE_INFO array 'mip' from last decoded frame */
+ MODE_INFO *prev_mi; /* 'mi' from last frame (points into prev_mip) */
+
+
+ // Persistent mb segment id map used in prediction.
+ unsigned char *last_frame_seg_map;
+
+ INTERPOLATIONFILTERTYPE mcomp_filter_type;
+ LOOPFILTERTYPE filter_type;
+
+ loop_filter_info_n lf_info;
+
+ int filter_level;
+ int last_sharpness_level;
+ int sharpness_level;
+
+ int refresh_last_frame; /* Two state 0 = NO, 1 = YES */
+ int refresh_golden_frame; /* Two state 0 = NO, 1 = YES */
+ int refresh_alt_ref_frame; /* Two state 0 = NO, 1 = YES */
+
+ int copy_buffer_to_gf; /* 0 none, 1 Last to GF, 2 ARF to GF */
+ int copy_buffer_to_arf; /* 0 none, 1 Last to ARF, 2 GF to ARF */
+
+ int refresh_entropy_probs; /* Two state 0 = NO, 1 = YES */
+
+ int ref_frame_sign_bias[MAX_REF_FRAMES]; /* Two state 0, 1 */
+
+ /* Y,U,V,Y2 */
+ ENTROPY_CONTEXT_PLANES *above_context; /* row of context for each plane */
+ ENTROPY_CONTEXT_PLANES left_context[2]; /* (up to) 4 contexts "" */
+
+ /* keyframe block modes are predicted by their above, left neighbors */
+
+ vp9_prob kf_bmode_prob [VP9_BINTRAMODES] [VP9_BINTRAMODES] [VP9_BINTRAMODES - 1];
+ vp9_prob kf_ymode_prob[8][VP9_YMODES - 1]; /* keyframe "" */
+#if CONFIG_SUPERBLOCKS
+ vp9_prob sb_kf_ymode_prob[8][VP9_I32X32_MODES - 1];
+#endif
+ int kf_ymode_probs_index;
+ int kf_ymode_probs_update;
+ vp9_prob kf_uv_mode_prob[VP9_YMODES] [VP9_UV_MODES - 1];
+
+ vp9_prob prob_intra_coded;
+ vp9_prob prob_last_coded;
+ vp9_prob prob_gf_coded;
+#if CONFIG_SUPERBLOCKS
+ vp9_prob sb_coded;
+#endif
+
+ // Context probabilities when using predictive coding of segment id
+ vp9_prob segment_pred_probs[PREDICTION_PROBS];
+ unsigned char temporal_update;
+
+ // Context probabilities for reference frame prediction
+ unsigned char ref_scores[MAX_REF_FRAMES];
+ vp9_prob ref_pred_probs[PREDICTION_PROBS];
+ vp9_prob mod_refprobs[MAX_REF_FRAMES][PREDICTION_PROBS];
+
+ vp9_prob prob_comppred[COMP_PRED_CONTEXTS];
+
+ // FIXME contextualize
+ vp9_prob prob_tx[TX_SIZE_MAX - 1];
+
+ vp9_prob mbskip_pred_probs[MBSKIP_CONTEXTS];
+
+ FRAME_CONTEXT lfc_a; /* last alt ref entropy */
+ FRAME_CONTEXT lfc; /* last frame entropy */
+ FRAME_CONTEXT fc; /* this frame entropy */
+
+ // int mv_ref_ct[6][4][2];
+ // int mv_ref_ct_a[6][4][2];
+ // int mode_context[6][4];
+ // int mode_context_a[6][4];
+ // int vp8_mode_contexts[6][4];
+
+ unsigned int current_video_frame;
+ int near_boffset[3];
+ int version;
+
+#ifdef PACKET_TESTING
+ VP9_HEADER oh;
+#endif
+ double bitrate;
+ double framerate;
+
+#if CONFIG_RUNTIME_CPU_DETECT
+ VP9_COMMON_RTCD rtcd;
+#endif
+
+#if CONFIG_POSTPROC
+ struct postproc_state postproc_state;
+#endif
+
+#if CONFIG_PRED_FILTER
+ /* Prediction filter variables */
+ int pred_filter_mode; // 0=disabled at the frame level (no MB filtered)
+ // 1=enabled at the frame level (all MB filtered)
+ // 2=specified per MB (1=filtered, 0=non-filtered)
+ vp9_prob prob_pred_filter_off;
+#endif
+
+} VP9_COMMON;
+
+#endif // __INC_ONYXC_INT_H
--- /dev/null
+++ b/vp9/common/onyxd.h
@@ -1,0 +1,68 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef __INC_ONYXD_H
+#define __INC_ONYXD_H
+
+
+/* Create/destroy static data structures. */
+#ifdef __cplusplus
+extern "C"
+{
+#endif
+#include "type_aliases.h"
+#include "vpx_scale/yv12config.h"
+#include "ppflags.h"
+#include "vpx_ports/mem.h"
+#include "vpx/vpx_codec.h"
+
+ typedef void *VP9D_PTR;
+ typedef struct {
+ int Width;
+ int Height;
+ int Version;
+ int postprocess;
+ int max_threads;
+ int input_partition;
+ } VP9D_CONFIG;
+ typedef enum {
+ VP9_LAST_FLAG = 1,
+ VP9_GOLD_FLAG = 2,
+ VP9_ALT_FLAG = 4
+ } VP9_REFFRAME;
+
+ void vp9_initialize_dec(void);
+
+ int vp9_receive_compressed_data(VP9D_PTR comp, unsigned long size,
+ const unsigned char *dest,
+ int64_t time_stamp);
+
+ int vp9_get_raw_frame(VP9D_PTR comp, YV12_BUFFER_CONFIG *sd,
+ int64_t *time_stamp, int64_t *time_end_stamp,
+ vp9_ppflags_t *flags);
+
+ vpx_codec_err_t vp9_get_reference_dec(VP9D_PTR comp,
+ VP9_REFFRAME ref_frame_flag,
+ YV12_BUFFER_CONFIG *sd);
+
+ vpx_codec_err_t vp9_set_reference_dec(VP9D_PTR comp,
+ VP9_REFFRAME ref_frame_flag,
+ YV12_BUFFER_CONFIG *sd);
+
+ VP9D_PTR vp9_create_decompressor(VP9D_CONFIG *oxcf);
+
+ void vp9_remove_decompressor(VP9D_PTR comp);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif // __INC_ONYXD_H
--- /dev/null
+++ b/vp9/common/postproc.c
@@ -1,0 +1,1035 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "vpx_ports/config.h"
+#include "vpx_scale/yv12config.h"
+#include "postproc.h"
+#include "vpx_scale/yv12extend.h"
+#include "vpx_scale/vpxscale.h"
+#include "systemdependent.h"
+
+#include <math.h>
+#include <stdlib.h>
+#include <stdio.h>
+
+#define RGB_TO_YUV(t) \
+ ( (0.257*(float)(t >> 16)) + (0.504*(float)(t >> 8 & 0xff)) + \
+ (0.098*(float)(t & 0xff)) + 16), \
+ (-(0.148*(float)(t >> 16)) - (0.291*(float)(t >> 8 & 0xff)) + \
+ (0.439*(float)(t & 0xff)) + 128), \
+ ( (0.439*(float)(t >> 16)) - (0.368*(float)(t >> 8 & 0xff)) - \
+ (0.071*(float)(t & 0xff)) + 128)
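+
+/* Worked example: RGB_TO_YUV(0x98FB98) -- PaleGreen, R=152 G=251 B=152 --
+ * expands to approximately (196.5, 99.2, 91.6), which the initializers
+ * below truncate to unsigned char. */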
+
+/* global constants */
+#if CONFIG_POSTPROC_VISUALIZER
+static const unsigned char MB_PREDICTION_MODE_colors[MB_MODE_COUNT][3] = {
+ { RGB_TO_YUV(0x98FB98) }, /* PaleGreen */
+ { RGB_TO_YUV(0x00FF00) }, /* Green */
+ { RGB_TO_YUV(0xADFF2F) }, /* GreenYellow */
+ { RGB_TO_YUV(0x8F0000) }, /* Dark Red */
+ { RGB_TO_YUV(0x008F8F) }, /* Dark Cyan */
+ { RGB_TO_YUV(0x008F8F) }, /* Dark Cyan */
+ { RGB_TO_YUV(0x008F8F) }, /* Dark Cyan */
+ { RGB_TO_YUV(0x8F0000) }, /* Dark Red */
+ { RGB_TO_YUV(0x8F0000) }, /* Dark Red */
+ { RGB_TO_YUV(0x228B22) }, /* ForestGreen */
+ { RGB_TO_YUV(0x006400) }, /* DarkGreen */
+ { RGB_TO_YUV(0x98F5FF) }, /* Cadet Blue */
+ { RGB_TO_YUV(0x6CA6CD) }, /* Sky Blue */
+ { RGB_TO_YUV(0x00008B) }, /* Dark blue */
+ { RGB_TO_YUV(0x551A8B) }, /* Purple */
+  { RGB_TO_YUV(0xFF0000) }, /* Red */
+  { RGB_TO_YUV(0xCC33FF) }, /* Magenta */
+};
+
+static const unsigned char B_PREDICTION_MODE_colors[B_MODE_COUNT][3] = {
+ { RGB_TO_YUV(0x6633ff) }, /* Purple */
+ { RGB_TO_YUV(0xcc33ff) }, /* Magenta */
+ { RGB_TO_YUV(0xff33cc) }, /* Pink */
+ { RGB_TO_YUV(0xff3366) }, /* Coral */
+ { RGB_TO_YUV(0x3366ff) }, /* Blue */
+ { RGB_TO_YUV(0xed00f5) }, /* Dark Blue */
+ { RGB_TO_YUV(0x2e00b8) }, /* Dark Purple */
+ { RGB_TO_YUV(0xff6633) }, /* Orange */
+ { RGB_TO_YUV(0x33ccff) }, /* Light Blue */
+ { RGB_TO_YUV(0x8ab800) }, /* Green */
+ { RGB_TO_YUV(0xffcc33) }, /* Light Orange */
+ { RGB_TO_YUV(0x33ffcc) }, /* Aqua */
+ { RGB_TO_YUV(0x66ff33) }, /* Light Green */
+ { RGB_TO_YUV(0xccff33) }, /* Yellow */
+};
+
+static const unsigned char MV_REFERENCE_FRAME_colors[MAX_REF_FRAMES][3] = {
+  { RGB_TO_YUV(0x00ff00) }, /* Green */
+  { RGB_TO_YUV(0x0000ff) }, /* Blue */
+ { RGB_TO_YUV(0xffff00) }, /* Yellow */
+ { RGB_TO_YUV(0xff0000) }, /* Red */
+};
+#endif
+
+static const short kernel5[] = {
+ 1, 1, 4, 1, 1
+};
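+
+/* kernel5 drives the down/across filter below: a 5-tap low-pass with weights
+ * 1, 1, 4, 1, 1 (sum 8, hence the final >> 3 with a rounding constant of 4).
+ * If any of the five pixels differs from the centre by more than flimit the
+ * convolution is skipped and the original pixel is kept, preserving strong
+ * edges. */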
+
+const short vp9_rv[] = {
+ 8, 5, 2, 2, 8, 12, 4, 9, 8, 3,
+ 0, 3, 9, 0, 0, 0, 8, 3, 14, 4,
+ 10, 1, 11, 14, 1, 14, 9, 6, 12, 11,
+ 8, 6, 10, 0, 0, 8, 9, 0, 3, 14,
+ 8, 11, 13, 4, 2, 9, 0, 3, 9, 6,
+ 1, 2, 3, 14, 13, 1, 8, 2, 9, 7,
+ 3, 3, 1, 13, 13, 6, 6, 5, 2, 7,
+ 11, 9, 11, 8, 7, 3, 2, 0, 13, 13,
+ 14, 4, 12, 5, 12, 10, 8, 10, 13, 10,
+ 4, 14, 4, 10, 0, 8, 11, 1, 13, 7,
+ 7, 14, 6, 14, 13, 2, 13, 5, 4, 4,
+ 0, 10, 0, 5, 13, 2, 12, 7, 11, 13,
+ 8, 0, 4, 10, 7, 2, 7, 2, 2, 5,
+ 3, 4, 7, 3, 3, 14, 14, 5, 9, 13,
+ 3, 14, 3, 6, 3, 0, 11, 8, 13, 1,
+ 13, 1, 12, 0, 10, 9, 7, 6, 2, 8,
+ 5, 2, 13, 7, 1, 13, 14, 7, 6, 7,
+ 9, 6, 10, 11, 7, 8, 7, 5, 14, 8,
+ 4, 4, 0, 8, 7, 10, 0, 8, 14, 11,
+ 3, 12, 5, 7, 14, 3, 14, 5, 2, 6,
+ 11, 12, 12, 8, 0, 11, 13, 1, 2, 0,
+ 5, 10, 14, 7, 8, 0, 4, 11, 0, 8,
+ 0, 3, 10, 5, 8, 0, 11, 6, 7, 8,
+ 10, 7, 13, 9, 2, 5, 1, 5, 10, 2,
+ 4, 3, 5, 6, 10, 8, 9, 4, 11, 14,
+ 0, 10, 0, 5, 13, 2, 12, 7, 11, 13,
+ 8, 0, 4, 10, 7, 2, 7, 2, 2, 5,
+ 3, 4, 7, 3, 3, 14, 14, 5, 9, 13,
+ 3, 14, 3, 6, 3, 0, 11, 8, 13, 1,
+ 13, 1, 12, 0, 10, 9, 7, 6, 2, 8,
+ 5, 2, 13, 7, 1, 13, 14, 7, 6, 7,
+ 9, 6, 10, 11, 7, 8, 7, 5, 14, 8,
+ 4, 4, 0, 8, 7, 10, 0, 8, 14, 11,
+ 3, 12, 5, 7, 14, 3, 14, 5, 2, 6,
+ 11, 12, 12, 8, 0, 11, 13, 1, 2, 0,
+ 5, 10, 14, 7, 8, 0, 4, 11, 0, 8,
+ 0, 3, 10, 5, 8, 0, 11, 6, 7, 8,
+ 10, 7, 13, 9, 2, 5, 1, 5, 10, 2,
+ 4, 3, 5, 6, 10, 8, 9, 4, 11, 14,
+ 3, 8, 3, 7, 8, 5, 11, 4, 12, 3,
+ 11, 9, 14, 8, 14, 13, 4, 3, 1, 2,
+ 14, 6, 5, 4, 4, 11, 4, 6, 2, 1,
+ 5, 8, 8, 12, 13, 5, 14, 10, 12, 13,
+ 0, 9, 5, 5, 11, 10, 13, 9, 10, 13,
+};
+
+
+extern void vp9_blit_text(const char *msg, unsigned char *address,
+ const int pitch);
+extern void vp9_blit_line(int x0, int x1, int y0, int y1,
+ unsigned char *image, const int pitch);
+/****************************************************************************
+ */
+void vp9_post_proc_down_and_across_c(unsigned char *src_ptr,
+ unsigned char *dst_ptr,
+ int src_pixels_per_line,
+ int dst_pixels_per_line,
+ int rows,
+ int cols,
+ int flimit) {
+ unsigned char *p_src, *p_dst;
+ int row;
+ int col;
+ int i;
+ int v;
+ int pitch = src_pixels_per_line;
+ unsigned char d[8];
+ (void)dst_pixels_per_line;
+
+ for (row = 0; row < rows; row++) {
+ /* post_proc_down for one row */
+ p_src = src_ptr;
+ p_dst = dst_ptr;
+
+ for (col = 0; col < cols; col++) {
+
+ int kernel = 4;
+ int v = p_src[col];
+
+ for (i = -2; i <= 2; i++) {
+ if (abs(v - p_src[col + i * pitch]) > flimit)
+ goto down_skip_convolve;
+
+ kernel += kernel5[2 + i] * p_src[col + i * pitch];
+ }
+
+ v = (kernel >> 3);
+ down_skip_convolve:
+ p_dst[col] = v;
+ }
+
+ /* now post_proc_across */
+ p_src = dst_ptr;
+ p_dst = dst_ptr;
+
+ for (i = 0; i < 8; i++)
+ d[i] = p_src[i];
+
+ for (col = 0; col < cols; col++) {
+ int kernel = 4;
+ v = p_src[col];
+
+ d[col & 7] = v;
+
+ for (i = -2; i <= 2; i++) {
+ if (abs(v - p_src[col + i]) > flimit)
+ goto across_skip_convolve;
+
+ kernel += kernel5[2 + i] * p_src[col + i];
+ }
+
+ d[col & 7] = (kernel >> 3);
+ across_skip_convolve:
+
+ if (col >= 2)
+ p_dst[col - 2] = d[(col - 2) & 7];
+ }
+
+ /* handle the last two pixels */
+ p_dst[col - 2] = d[(col - 2) & 7];
+ p_dst[col - 1] = d[(col - 1) & 7];
+
+
+ /* next row */
+ src_ptr += pitch;
+ dst_ptr += pitch;
+ }
+}
+
+static int q2mbl(int x) {
+ if (x < 20) x = 20;
+
+ x = 50 + (x - 50) * 10 / 8;
+ return x * x / 3;
+}
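+
+/* Example: q2mbl(63) -> x = 50 + (63 - 50) * 10 / 8 = 66, so the returned
+ * limit is 66 * 66 / 3 = 1452; smaller q gives a tighter limit. */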
+
+void vp9_mbpost_proc_across_ip_c(unsigned char *src, int pitch,
+ int rows, int cols, int flimit) {
+ int r, c, i;
+
+ unsigned char *s = src;
+ unsigned char d[16];
+
+
+ for (r = 0; r < rows; r++) {
+ int sumsq = 0;
+ int sum = 0;
+
+ for (i = -8; i <= 6; i++) {
+ sumsq += s[i] * s[i];
+ sum += s[i];
+ d[i + 8] = 0;
+ }
+
+ for (c = 0; c < cols + 8; c++) {
+ int x = s[c + 7] - s[c - 8];
+ int y = s[c + 7] + s[c - 8];
+
+ sum += x;
+ sumsq += x * y;
+
+ d[c & 15] = s[c];
+
+ if (sumsq * 15 - sum * sum < flimit) {
+ d[c & 15] = (8 + sum + s[c]) >> 4;
+ }
+
+ s[c - 8] = d[(c - 8) & 15];
+ }
+
+ s += pitch;
+ }
+}
+
+void vp9_mbpost_proc_down_c(unsigned char *dst, int pitch,
+ int rows, int cols, int flimit) {
+ int r, c, i;
+ const short *rv3 = &vp9_rv[63 & rand()];
+
+ for (c = 0; c < cols; c++) {
+ unsigned char *s = &dst[c];
+ int sumsq = 0;
+ int sum = 0;
+ unsigned char d[16];
+ const short *rv2 = rv3 + ((c * 17) & 127);
+
+ for (i = -8; i <= 6; i++) {
+ sumsq += s[i * pitch] * s[i * pitch];
+ sum += s[i * pitch];
+ }
+
+ for (r = 0; r < rows + 8; r++) {
+ sumsq += s[7 * pitch] * s[ 7 * pitch] - s[-8 * pitch] * s[-8 * pitch];
+ sum += s[7 * pitch] - s[-8 * pitch];
+ d[r & 15] = s[0];
+
+ if (sumsq * 15 - sum * sum < flimit) {
+ d[r & 15] = (rv2[r & 127] + sum + s[0]) >> 4;
+ }
+
+ s[-8 * pitch] = d[(r - 8) & 15];
+ s += pitch;
+ }
+ }
+}
+
+static void deblock_and_de_macro_block(YV12_BUFFER_CONFIG *source,
+ YV12_BUFFER_CONFIG *post,
+ int q,
+ int low_var_thresh,
+ int flag,
+ vp9_postproc_rtcd_vtable_t *rtcd) {
+ double level = 6.0e-05 * q * q * q - .0067 * q * q + .306 * q + .0065;
+ int ppl = (int)(level + .5);
+ (void) low_var_thresh;
+ (void) flag;
+
+ POSTPROC_INVOKE(rtcd, downacross)(source->y_buffer, post->y_buffer,
+ source->y_stride, post->y_stride,
+ source->y_height, source->y_width, ppl);
+ POSTPROC_INVOKE(rtcd, across)(post->y_buffer, post->y_stride,
+ post->y_height, post->y_width, q2mbl(q));
+ POSTPROC_INVOKE(rtcd, down)(post->y_buffer, post->y_stride,
+ post->y_height, post->y_width, q2mbl(q));
+
+ POSTPROC_INVOKE(rtcd, downacross)(source->u_buffer, post->u_buffer,
+ source->uv_stride, post->uv_stride,
+ source->uv_height, source->uv_width, ppl);
+ POSTPROC_INVOKE(rtcd, downacross)(source->v_buffer, post->v_buffer,
+ source->uv_stride, post->uv_stride,
+ source->uv_height, source->uv_width, ppl);
+}
+
+void vp9_deblock(YV12_BUFFER_CONFIG *source,
+ YV12_BUFFER_CONFIG *post,
+ int q,
+ int low_var_thresh,
+ int flag,
+ vp9_postproc_rtcd_vtable_t *rtcd) {
+ double level = 6.0e-05 * q * q * q - .0067 * q * q + .306 * q + .0065;
+ int ppl = (int)(level + .5);
+ (void) low_var_thresh;
+ (void) flag;
+
+ POSTPROC_INVOKE(rtcd, downacross)(source->y_buffer, post->y_buffer,
+ source->y_stride, post->y_stride,
+ source->y_height, source->y_width, ppl);
+ POSTPROC_INVOKE(rtcd, downacross)(source->u_buffer, post->u_buffer,
+ source->uv_stride, post->uv_stride,
+ source->uv_height, source->uv_width, ppl);
+ POSTPROC_INVOKE(rtcd, downacross)(source->v_buffer, post->v_buffer,
+ source->uv_stride, post->uv_stride,
+ source->uv_height, source->uv_width, ppl);
+}
+
+void vp9_de_noise(YV12_BUFFER_CONFIG *src,
+ YV12_BUFFER_CONFIG *post,
+ int q,
+ int low_var_thresh,
+ int flag,
+ vp9_postproc_rtcd_vtable_t *rtcd) {
+ double level = 6.0e-05 * q * q * q - .0067 * q * q + .306 * q + .0065;
+ int ppl = (int)(level + .5);
+ (void) post;
+ (void) low_var_thresh;
+ (void) flag;
+
+ POSTPROC_INVOKE(rtcd, downacross)(src->y_buffer + 2 * src->y_stride + 2,
+ src->y_buffer + 2 * src->y_stride + 2,
+ src->y_stride,
+ src->y_stride,
+ src->y_height - 4,
+ src->y_width - 4,
+ ppl);
+ POSTPROC_INVOKE(rtcd, downacross)(src->u_buffer + 2 * src->uv_stride + 2,
+ src->u_buffer + 2 * src->uv_stride + 2,
+ src->uv_stride,
+ src->uv_stride,
+ src->uv_height - 4,
+ src->uv_width - 4, ppl);
+ POSTPROC_INVOKE(rtcd, downacross)(src->v_buffer + 2 * src->uv_stride + 2,
+ src->v_buffer + 2 * src->uv_stride + 2,
+ src->uv_stride,
+ src->uv_stride,
+ src->uv_height - 4,
+ src->uv_width - 4, ppl);
+}
+
+double vp9_gaussian(double sigma, double mu, double x) {
+ return 1 / (sigma * sqrt(2.0 * 3.14159265)) *
+ (exp(-(x - mu) * (x - mu) / (2 * sigma * sigma)));
+}
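+
+/* Example: vp9_gaussian(sigma, 0, 0) is the peak of the normal density,
+ * 1 / (sigma * sqrt(2 * pi)), roughly 0.399 / sigma; fillrd() below scales
+ * this by 256 to build its noise lookup table. */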
+
+static void fillrd(struct postproc_state *state, int q, int a) {
+ char char_dist[300];
+
+ double sigma;
+ int ai = a, qi = q, i;
+
+ vp9_clear_system_state();
+
+ sigma = ai + .5 + .6 * (63 - qi) / 63.0;
+
+ /* set up a lookup table of 256 entries that matches
+ * a gaussian distribution with sigma determined by q.
+ */
+ {
+ double i;
+ int next, j;
+
+ next = 0;
+
+ for (i = -32; i < 32; i++) {
+ int a = (int)(.5 + 256 * vp9_gaussian(sigma, 0, i));
+
+ if (a) {
+ for (j = 0; j < a; j++) {
+ char_dist[next + j] = (char) i;
+ }
+
+ next = next + j;
+ }
+
+ }
+
+    for (; next < 256; next++)
+ char_dist[next] = 0;
+ }
+
+ for (i = 0; i < 3072; i++) {
+ state->noise[i] = char_dist[rand() & 0xff];
+ }
+
+ for (i = 0; i < 16; i++) {
+ state->blackclamp[i] = -char_dist[0];
+ state->whiteclamp[i] = -char_dist[0];
+ state->bothclamp[i] = -2 * char_dist[0];
+ }
+
+ state->last_q = q;
+ state->last_noise = a;
+}
+
+/****************************************************************************
+ *
+ * ROUTINE : plane_add_noise_c
+ *
+ * INPUTS : unsigned char *Start starting address of buffer to
+ * add gaussian noise to
+ * unsigned int Width width of plane
+ * unsigned int Height height of plane
+ * int Pitch distance between subsequent lines of frame
+ * int q quantizer used to determine amount of noise
+ * to add
+ *
+ * OUTPUTS : None.
+ *
+ * RETURNS : void.
+ *
+ * FUNCTION : adds gaussian noise to a plane of pixels
+ *
+ * SPECIAL NOTES : None.
+ *
+ ****************************************************************************/
+void vp9_plane_add_noise_c(unsigned char *Start, char *noise,
+ char blackclamp[16],
+ char whiteclamp[16],
+ char bothclamp[16],
+ unsigned int Width, unsigned int Height, int Pitch) {
+ unsigned int i, j;
+
+ for (i = 0; i < Height; i++) {
+ unsigned char *Pos = Start + i * Pitch;
+ char *Ref = (char *)(noise + (rand() & 0xff));
+
+ for (j = 0; j < Width; j++) {
+ if (Pos[j] < blackclamp[0])
+ Pos[j] = blackclamp[0];
+
+ if (Pos[j] > 255 + whiteclamp[0])
+ Pos[j] = 255 + whiteclamp[0];
+
+ Pos[j] += Ref[j];
+ }
+ }
+}
+
+/* Blend the macro block with a solid colored square. Leave the
+ * edges unblended to give distinction to macro blocks in areas
+ * filled with the same color block.
+ */
+void vp9_blend_mb_inner_c(unsigned char *y, unsigned char *u, unsigned char *v,
+ int y1, int u1, int v1, int alpha, int stride) {
+ int i, j;
+ int y1_const = y1 * ((1 << 16) - alpha);
+ int u1_const = u1 * ((1 << 16) - alpha);
+ int v1_const = v1 * ((1 << 16) - alpha);
+
+ y += 2 * stride + 2;
+ for (i = 0; i < 12; i++) {
+ for (j = 0; j < 12; j++) {
+ y[j] = (y[j] * alpha + y1_const) >> 16;
+ }
+ y += stride;
+ }
+
+ stride >>= 1;
+
+ u += stride + 1;
+ v += stride + 1;
+
+ for (i = 0; i < 6; i++) {
+ for (j = 0; j < 6; j++) {
+ u[j] = (u[j] * alpha + u1_const) >> 16;
+ v[j] = (v[j] * alpha + v1_const) >> 16;
+ }
+ u += stride;
+ v += stride;
+ }
+}
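+
+/* The blend above is fixed point alpha compositing:
+ *   out = (pixel * alpha + color * (65536 - alpha)) >> 16
+ * With the alpha of 0xc000 used by the visualizer calls below, each output
+ * pixel is roughly 75% original and 25% overlay color. */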
+
+/* Blend only the edge of the macro block. Leave center
+ * unblended to allow for other visualizations to be layered.
+ */
+void vp9_blend_mb_outer_c(unsigned char *y, unsigned char *u, unsigned char *v,
+ int y1, int u1, int v1, int alpha, int stride) {
+ int i, j;
+ int y1_const = y1 * ((1 << 16) - alpha);
+ int u1_const = u1 * ((1 << 16) - alpha);
+ int v1_const = v1 * ((1 << 16) - alpha);
+
+ for (i = 0; i < 2; i++) {
+ for (j = 0; j < 16; j++) {
+ y[j] = (y[j] * alpha + y1_const) >> 16;
+ }
+ y += stride;
+ }
+
+ for (i = 0; i < 12; i++) {
+ y[0] = (y[0] * alpha + y1_const) >> 16;
+ y[1] = (y[1] * alpha + y1_const) >> 16;
+ y[14] = (y[14] * alpha + y1_const) >> 16;
+ y[15] = (y[15] * alpha + y1_const) >> 16;
+ y += stride;
+ }
+
+ for (i = 0; i < 2; i++) {
+ for (j = 0; j < 16; j++) {
+ y[j] = (y[j] * alpha + y1_const) >> 16;
+ }
+ y += stride;
+ }
+
+ stride >>= 1;
+
+ for (j = 0; j < 8; j++) {
+ u[j] = (u[j] * alpha + u1_const) >> 16;
+ v[j] = (v[j] * alpha + v1_const) >> 16;
+ }
+ u += stride;
+ v += stride;
+
+ for (i = 0; i < 6; i++) {
+ u[0] = (u[0] * alpha + u1_const) >> 16;
+ v[0] = (v[0] * alpha + v1_const) >> 16;
+
+ u[7] = (u[7] * alpha + u1_const) >> 16;
+ v[7] = (v[7] * alpha + v1_const) >> 16;
+
+ u += stride;
+ v += stride;
+ }
+
+ for (j = 0; j < 8; j++) {
+ u[j] = (u[j] * alpha + u1_const) >> 16;
+ v[j] = (v[j] * alpha + v1_const) >> 16;
+ }
+}
+
+void vp9_blend_b_c(unsigned char *y, unsigned char *u, unsigned char *v,
+ int y1, int u1, int v1, int alpha, int stride) {
+ int i, j;
+ int y1_const = y1 * ((1 << 16) - alpha);
+ int u1_const = u1 * ((1 << 16) - alpha);
+ int v1_const = v1 * ((1 << 16) - alpha);
+
+ for (i = 0; i < 4; i++) {
+ for (j = 0; j < 4; j++) {
+ y[j] = (y[j] * alpha + y1_const) >> 16;
+ }
+ y += stride;
+ }
+
+ stride >>= 1;
+
+ for (i = 0; i < 2; i++) {
+ for (j = 0; j < 2; j++) {
+ u[j] = (u[j] * alpha + u1_const) >> 16;
+ v[j] = (v[j] * alpha + v1_const) >> 16;
+ }
+ u += stride;
+ v += stride;
+ }
+}
+
+static void constrain_line(int x0, int *x1, int y0, int *y1,
+ int width, int height) {
+ int dx;
+ int dy;
+
+ if (*x1 > width) {
+ dx = *x1 - x0;
+ dy = *y1 - y0;
+
+ *x1 = width;
+ if (dx)
+ *y1 = ((width - x0) * dy) / dx + y0;
+ }
+ if (*x1 < 0) {
+ dx = *x1 - x0;
+ dy = *y1 - y0;
+
+ *x1 = 0;
+ if (dx)
+ *y1 = ((0 - x0) * dy) / dx + y0;
+ }
+ if (*y1 > height) {
+ dx = *x1 - x0;
+ dy = *y1 - y0;
+
+ *y1 = height;
+ if (dy)
+ *x1 = ((height - y0) * dx) / dy + x0;
+ }
+ if (*y1 < 0) {
+ dx = *x1 - x0;
+ dy = *y1 - y0;
+
+ *y1 = 0;
+ if (dy)
+ *x1 = ((0 - y0) * dx) / dy + x0;
+ }
+}
+
+
+#if CONFIG_RUNTIME_CPU_DETECT
+#define RTCD_VTABLE(oci) (&(oci)->rtcd.postproc)
+#else
+#define RTCD_VTABLE(oci) NULL
+#endif
+
+int vp9_post_proc_frame(VP9_COMMON *oci, YV12_BUFFER_CONFIG *dest,
+ vp9_ppflags_t *ppflags) {
+ int q = oci->filter_level * 10 / 6;
+ int flags = ppflags->post_proc_flag;
+ int deblock_level = ppflags->deblocking_level;
+ int noise_level = ppflags->noise_level;
+
+ if (!oci->frame_to_show)
+ return -1;
+
+ if (q > 63)
+ q = 63;
+
+ if (!flags) {
+ *dest = *oci->frame_to_show;
+
+ /* handle problem with extending borders */
+ dest->y_width = oci->Width;
+ dest->y_height = oci->Height;
+ dest->uv_height = dest->y_height / 2;
+ return 0;
+
+ }
+
+#if ARCH_X86||ARCH_X86_64
+ vpx_reset_mmx_state();
+#endif
+
+ if (flags & VP9D_DEMACROBLOCK) {
+ deblock_and_de_macro_block(oci->frame_to_show, &oci->post_proc_buffer,
+ q + (deblock_level - 5) * 10, 1, 0,
+ RTCD_VTABLE(oci));
+ } else if (flags & VP9D_DEBLOCK) {
+ vp9_deblock(oci->frame_to_show, &oci->post_proc_buffer,
+ q, 1, 0, RTCD_VTABLE(oci));
+ } else {
+ vp8_yv12_copy_frame_ptr(oci->frame_to_show, &oci->post_proc_buffer);
+ }
+
+ if (flags & VP9D_ADDNOISE) {
+ if (oci->postproc_state.last_q != q
+ || oci->postproc_state.last_noise != noise_level) {
+ fillrd(&oci->postproc_state, 63 - q, noise_level);
+ }
+
+ POSTPROC_INVOKE(RTCD_VTABLE(oci), addnoise)(oci->post_proc_buffer.y_buffer,
+ oci->postproc_state.noise,
+ oci->postproc_state.blackclamp,
+ oci->postproc_state.whiteclamp,
+ oci->postproc_state.bothclamp,
+ oci->post_proc_buffer.y_width,
+ oci->post_proc_buffer.y_height,
+ oci->post_proc_buffer.y_stride);
+ }
+
+#if CONFIG_POSTPROC_VISUALIZER
+ if (flags & VP9D_DEBUG_TXT_FRAME_INFO) {
+ char message[512];
+ sprintf(message, "F%1dG%1dQ%3dF%3dP%d_s%dx%d",
+ (oci->frame_type == KEY_FRAME),
+ oci->refresh_golden_frame,
+ oci->base_qindex,
+ oci->filter_level,
+ flags,
+ oci->mb_cols, oci->mb_rows);
+ vp9_blit_text(message, oci->post_proc_buffer.y_buffer,
+ oci->post_proc_buffer.y_stride);
+ }
+
+ if (flags & VP9D_DEBUG_TXT_MBLK_MODES) {
+ int i, j;
+ unsigned char *y_ptr;
+ YV12_BUFFER_CONFIG *post = &oci->post_proc_buffer;
+ int mb_rows = post->y_height >> 4;
+ int mb_cols = post->y_width >> 4;
+ int mb_index = 0;
+ MODE_INFO *mi = oci->mi;
+
+ y_ptr = post->y_buffer + 4 * post->y_stride + 4;
+
+ /* vp9_filter each macro block */
+ for (i = 0; i < mb_rows; i++) {
+ for (j = 0; j < mb_cols; j++) {
+ char zz[4];
+
+ sprintf(zz, "%c", mi[mb_index].mbmi.mode + 'a');
+
+ vp9_blit_text(zz, y_ptr, post->y_stride);
+ mb_index++;
+ y_ptr += 16;
+ }
+
+ mb_index++; /* border */
+ y_ptr += post->y_stride * 16 - post->y_width;
+
+ }
+ }
+
+ if (flags & VP9D_DEBUG_TXT_DC_DIFF) {
+ int i, j;
+ unsigned char *y_ptr;
+ YV12_BUFFER_CONFIG *post = &oci->post_proc_buffer;
+ int mb_rows = post->y_height >> 4;
+ int mb_cols = post->y_width >> 4;
+ int mb_index = 0;
+ MODE_INFO *mi = oci->mi;
+
+ y_ptr = post->y_buffer + 4 * post->y_stride + 4;
+
+ /* vp9_filter each macro block */
+ for (i = 0; i < mb_rows; i++) {
+ for (j = 0; j < mb_cols; j++) {
+ char zz[4];
+ int dc_diff = !(mi[mb_index].mbmi.mode != B_PRED &&
+ mi[mb_index].mbmi.mode != SPLITMV &&
+ mi[mb_index].mbmi.mb_skip_coeff);
+
+ if (oci->frame_type == KEY_FRAME)
+ sprintf(zz, "a");
+ else
+ sprintf(zz, "%c", dc_diff + '0');
+
+ vp9_blit_text(zz, y_ptr, post->y_stride);
+ mb_index++;
+ y_ptr += 16;
+ }
+
+ mb_index++; /* border */
+ y_ptr += post->y_stride * 16 - post->y_width;
+
+ }
+ }
+
+ if (flags & VP9D_DEBUG_TXT_RATE_INFO) {
+ char message[512];
+ snprintf(message, sizeof(message),
+ "Bitrate: %10.2f frame_rate: %10.2f ",
+ oci->bitrate, oci->framerate);
+ vp9_blit_text(message, oci->post_proc_buffer.y_buffer,
+ oci->post_proc_buffer.y_stride);
+ }
+
+ /* Draw motion vectors */
+ if ((flags & VP9D_DEBUG_DRAW_MV) && ppflags->display_mv_flag) {
+ YV12_BUFFER_CONFIG *post = &oci->post_proc_buffer;
+ int width = post->y_width;
+ int height = post->y_height;
+ unsigned char *y_buffer = oci->post_proc_buffer.y_buffer;
+ int y_stride = oci->post_proc_buffer.y_stride;
+ MODE_INFO *mi = oci->mi;
+ int x0, y0;
+
+ for (y0 = 0; y0 < height; y0 += 16) {
+ for (x0 = 0; x0 < width; x0 += 16) {
+ int x1, y1;
+
+ if (!(ppflags->display_mv_flag & (1 << mi->mbmi.mode))) {
+ mi++;
+ continue;
+ }
+
+ if (mi->mbmi.mode == SPLITMV) {
+ switch (mi->mbmi.partitioning) {
+ case PARTITIONING_16X8 : { /* mv_top_bottom */
+ union b_mode_info *bmi = &mi->bmi[0];
+ MV *mv = &bmi->mv.as_mv;
+
+ x1 = x0 + 8 + (mv->col >> 3);
+ y1 = y0 + 4 + (mv->row >> 3);
+
+ constrain_line(x0 + 8, &x1, y0 + 4, &y1, width, height);
+ vp9_blit_line(x0 + 8, x1, y0 + 4, y1, y_buffer, y_stride);
+
+ bmi = &mi->bmi[8];
+
+ x1 = x0 + 8 + (mv->col >> 3);
+ y1 = y0 + 12 + (mv->row >> 3);
+
+ constrain_line(x0 + 8, &x1, y0 + 12, &y1, width, height);
+ vp9_blit_line(x0 + 8, x1, y0 + 12, y1, y_buffer, y_stride);
+
+ break;
+ }
+ case PARTITIONING_8X16 : { /* mv_left_right */
+ union b_mode_info *bmi = &mi->bmi[0];
+ MV *mv = &bmi->mv.as_mv;
+
+ x1 = x0 + 4 + (mv->col >> 3);
+ y1 = y0 + 8 + (mv->row >> 3);
+
+ constrain_line(x0 + 4, &x1, y0 + 8, &y1, width, height);
+ vp9_blit_line(x0 + 4, x1, y0 + 8, y1, y_buffer, y_stride);
+
+ bmi = &mi->bmi[2];
+
+ x1 = x0 + 12 + (mv->col >> 3);
+ y1 = y0 + 8 + (mv->row >> 3);
+
+ constrain_line(x0 + 12, &x1, y0 + 8, &y1, width, height);
+ vp9_blit_line(x0 + 12, x1, y0 + 8, y1, y_buffer, y_stride);
+
+ break;
+ }
+ case PARTITIONING_8X8 : { /* mv_quarters */
+ union b_mode_info *bmi = &mi->bmi[0];
+ MV *mv = &bmi->mv.as_mv;
+
+ x1 = x0 + 4 + (mv->col >> 3);
+ y1 = y0 + 4 + (mv->row >> 3);
+
+ constrain_line(x0 + 4, &x1, y0 + 4, &y1, width, height);
+ vp9_blit_line(x0 + 4, x1, y0 + 4, y1, y_buffer, y_stride);
+
+ bmi = &mi->bmi[2];
+
+ x1 = x0 + 12 + (mv->col >> 3);
+ y1 = y0 + 4 + (mv->row >> 3);
+
+ constrain_line(x0 + 12, &x1, y0 + 4, &y1, width, height);
+ vp9_blit_line(x0 + 12, x1, y0 + 4, y1, y_buffer, y_stride);
+
+ bmi = &mi->bmi[8];
+
+ x1 = x0 + 4 + (mv->col >> 3);
+ y1 = y0 + 12 + (mv->row >> 3);
+
+ constrain_line(x0 + 4, &x1, y0 + 12, &y1, width, height);
+ vp9_blit_line(x0 + 4, x1, y0 + 12, y1, y_buffer, y_stride);
+
+ bmi = &mi->bmi[10];
+
+ x1 = x0 + 12 + (mv->col >> 3);
+ y1 = y0 + 12 + (mv->row >> 3);
+
+ constrain_line(x0 + 12, &x1, y0 + 12, &y1, width, height);
+ vp9_blit_line(x0 + 12, x1, y0 + 12, y1, y_buffer, y_stride);
+ break;
+ }
+ case PARTITIONING_4X4:
+ default : {
+ union b_mode_info *bmi = mi->bmi;
+ int bx0, by0;
+
+ for (by0 = y0; by0 < (y0 + 16); by0 += 4) {
+ for (bx0 = x0; bx0 < (x0 + 16); bx0 += 4) {
+ MV *mv = &bmi->mv.as_mv;
+
+ x1 = bx0 + 2 + (mv->col >> 3);
+ y1 = by0 + 2 + (mv->row >> 3);
+
+ constrain_line(bx0 + 2, &x1, by0 + 2, &y1, width, height);
+ vp9_blit_line(bx0 + 2, x1, by0 + 2, y1, y_buffer, y_stride);
+
+ bmi++;
+ }
+ }
+ }
+ }
+ } else if (mi->mbmi.mode >= NEARESTMV) {
+ MV *mv = &mi->mbmi.mv.as_mv;
+ const int lx0 = x0 + 8;
+ const int ly0 = y0 + 8;
+
+ x1 = lx0 + (mv->col >> 3);
+ y1 = ly0 + (mv->row >> 3);
+
+ if (x1 != lx0 && y1 != ly0) {
+ constrain_line(lx0, &x1, ly0 - 1, &y1, width, height);
+ vp9_blit_line(lx0, x1, ly0 - 1, y1, y_buffer, y_stride);
+
+ constrain_line(lx0, &x1, ly0 + 1, &y1, width, height);
+ vp9_blit_line(lx0, x1, ly0 + 1, y1, y_buffer, y_stride);
+ } else
+ vp9_blit_line(lx0, x1, ly0, y1, y_buffer, y_stride);
+ }
+
+ mi++;
+ }
+ mi++;
+ }
+ }
+
+ /* Color in block modes */
+ if ((flags & VP9D_DEBUG_CLR_BLK_MODES)
+ && (ppflags->display_mb_modes_flag || ppflags->display_b_modes_flag)) {
+ int y, x;
+ YV12_BUFFER_CONFIG *post = &oci->post_proc_buffer;
+ int width = post->y_width;
+ int height = post->y_height;
+ unsigned char *y_ptr = oci->post_proc_buffer.y_buffer;
+ unsigned char *u_ptr = oci->post_proc_buffer.u_buffer;
+ unsigned char *v_ptr = oci->post_proc_buffer.v_buffer;
+ int y_stride = oci->post_proc_buffer.y_stride;
+ MODE_INFO *mi = oci->mi;
+
+ for (y = 0; y < height; y += 16) {
+ for (x = 0; x < width; x += 16) {
+ int Y = 0, U = 0, V = 0;
+
+ if (mi->mbmi.mode == B_PRED &&
+ ((ppflags->display_mb_modes_flag & B_PRED) ||
+ ppflags->display_b_modes_flag)) {
+ int by, bx;
+ unsigned char *yl, *ul, *vl;
+ union b_mode_info *bmi = mi->bmi;
+
+ yl = y_ptr + x;
+ ul = u_ptr + (x >> 1);
+ vl = v_ptr + (x >> 1);
+
+ for (by = 0; by < 16; by += 4) {
+ for (bx = 0; bx < 16; bx += 4) {
+ if ((ppflags->display_b_modes_flag & (1 << mi->mbmi.mode))
+ || (ppflags->display_mb_modes_flag & B_PRED)) {
+ Y = B_PREDICTION_MODE_colors[bmi->as_mode.first][0];
+ U = B_PREDICTION_MODE_colors[bmi->as_mode.first][1];
+ V = B_PREDICTION_MODE_colors[bmi->as_mode.first][2];
+
+ POSTPROC_INVOKE(RTCD_VTABLE(oci), blend_b)(yl + bx,
+ ul + (bx >> 1),
+ vl + (bx >> 1),
+ Y, U, V,
+ 0xc000, y_stride);
+ }
+ bmi++;
+ }
+
+ yl += y_stride * 4;
+ ul += y_stride * 1;
+ vl += y_stride * 1;
+ }
+ } else if (ppflags->display_mb_modes_flag & (1 << mi->mbmi.mode)) {
+ Y = MB_PREDICTION_MODE_colors[mi->mbmi.mode][0];
+ U = MB_PREDICTION_MODE_colors[mi->mbmi.mode][1];
+ V = MB_PREDICTION_MODE_colors[mi->mbmi.mode][2];
+
+ POSTPROC_INVOKE(RTCD_VTABLE(oci), blend_mb_inner)(y_ptr + x,
+ u_ptr + (x >> 1),
+ v_ptr + (x >> 1),
+ Y, U, V,
+ 0xc000, y_stride);
+ }
+
+ mi++;
+ }
+ y_ptr += y_stride * 16;
+ u_ptr += y_stride * 4;
+ v_ptr += y_stride * 4;
+
+ mi++;
+ }
+ }
+
+ /* Color in frame reference blocks */
+ if ((flags & VP9D_DEBUG_CLR_FRM_REF_BLKS) &&
+ ppflags->display_ref_frame_flag) {
+ int y, x;
+ YV12_BUFFER_CONFIG *post = &oci->post_proc_buffer;
+ int width = post->y_width;
+ int height = post->y_height;
+ unsigned char *y_ptr = oci->post_proc_buffer.y_buffer;
+ unsigned char *u_ptr = oci->post_proc_buffer.u_buffer;
+ unsigned char *v_ptr = oci->post_proc_buffer.v_buffer;
+ int y_stride = oci->post_proc_buffer.y_stride;
+ MODE_INFO *mi = oci->mi;
+
+ for (y = 0; y < height; y += 16) {
+ for (x = 0; x < width; x += 16) {
+ int Y = 0, U = 0, V = 0;
+
+ if (ppflags->display_ref_frame_flag & (1 << mi->mbmi.ref_frame)) {
+ Y = MV_REFERENCE_FRAME_colors[mi->mbmi.ref_frame][0];
+ U = MV_REFERENCE_FRAME_colors[mi->mbmi.ref_frame][1];
+ V = MV_REFERENCE_FRAME_colors[mi->mbmi.ref_frame][2];
+
+ POSTPROC_INVOKE(RTCD_VTABLE(oci), blend_mb_outer)(y_ptr + x,
+ u_ptr + (x >> 1),
+ v_ptr + (x >> 1),
+ Y, U, V,
+ 0xc000, y_stride);
+ }
+
+ mi++;
+ }
+ y_ptr += y_stride * 16;
+ u_ptr += y_stride * 4;
+ v_ptr += y_stride * 4;
+
+ mi++;
+ }
+ }
+#endif
+
+ *dest = oci->post_proc_buffer;
+
+ /* handle problem with extending borders */
+ dest->y_width = oci->Width;
+ dest->y_height = oci->Height;
+ dest->uv_height = dest->y_height / 2;
+
+ return 0;
+}
--- /dev/null
+++ b/vp9/common/postproc.h
@@ -1,0 +1,128 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef POSTPROC_H
+#define POSTPROC_H
+
+#define prototype_postproc_inplace(sym)\
+ void sym(unsigned char *dst, int pitch, int rows, int cols, int flimit)
+
+#define prototype_postproc(sym)\
+ void sym(unsigned char *src, unsigned char *dst, int src_pitch, \
+ int dst_pitch, int rows, int cols, int flimit)
+
+#define prototype_postproc_addnoise(sym) \
+ void sym(unsigned char *s, char *noise, char blackclamp[16], \
+ char whiteclamp[16], char bothclamp[16], \
+ unsigned int w, unsigned int h, int pitch)
+
+#define prototype_postproc_blend_mb_inner(sym)\
+ void sym(unsigned char *y, unsigned char *u, unsigned char *v, \
+ int y1, int u1, int v1, int alpha, int stride)
+
+#define prototype_postproc_blend_mb_outer(sym)\
+ void sym(unsigned char *y, unsigned char *u, unsigned char *v, \
+ int y1, int u1, int v1, int alpha, int stride)
+
+#define prototype_postproc_blend_b(sym)\
+ void sym(unsigned char *y, unsigned char *u, unsigned char *v, \
+ int y1, int u1, int v1, int alpha, int stride)
+
+#if ARCH_X86 || ARCH_X86_64
+#include "x86/postproc_x86.h"
+#endif
+
+#ifndef vp9_postproc_down
+#define vp9_postproc_down vp9_mbpost_proc_down_c
+#endif
+extern prototype_postproc_inplace(vp9_postproc_down);
+
+#ifndef vp9_postproc_across
+#define vp9_postproc_across vp9_mbpost_proc_across_ip_c
+#endif
+extern prototype_postproc_inplace(vp9_postproc_across);
+
+#ifndef vp9_postproc_downacross
+#define vp9_postproc_downacross vp9_post_proc_down_and_across_c
+#endif
+extern prototype_postproc(vp9_postproc_downacross);
+
+#ifndef vp9_postproc_addnoise
+#define vp9_postproc_addnoise vp9_plane_add_noise_c
+#endif
+extern prototype_postproc_addnoise(vp9_postproc_addnoise);
+
+#ifndef vp9_postproc_blend_mb_inner
+#define vp9_postproc_blend_mb_inner vp9_blend_mb_inner_c
+#endif
+extern prototype_postproc_blend_mb_inner(vp9_postproc_blend_mb_inner);
+
+#ifndef vp9_postproc_blend_mb_outer
+#define vp9_postproc_blend_mb_outer vp9_blend_mb_outer_c
+#endif
+extern prototype_postproc_blend_mb_outer(vp9_postproc_blend_mb_outer);
+
+#ifndef vp9_postproc_blend_b
+#define vp9_postproc_blend_b vp9_blend_b_c
+#endif
+extern prototype_postproc_blend_b(vp9_postproc_blend_b);
+
+typedef prototype_postproc((*vp9_postproc_fn_t));
+typedef prototype_postproc_inplace((*vp9_postproc_inplace_fn_t));
+typedef prototype_postproc_addnoise((*vp9_postproc_addnoise_fn_t));
+typedef prototype_postproc_blend_mb_inner((*vp9_postproc_blend_mb_inner_fn_t));
+typedef prototype_postproc_blend_mb_outer((*vp9_postproc_blend_mb_outer_fn_t));
+typedef prototype_postproc_blend_b((*vp9_postproc_blend_b_fn_t));
+typedef struct {
+ vp9_postproc_inplace_fn_t down;
+ vp9_postproc_inplace_fn_t across;
+ vp9_postproc_fn_t downacross;
+ vp9_postproc_addnoise_fn_t addnoise;
+ vp9_postproc_blend_mb_inner_fn_t blend_mb_inner;
+ vp9_postproc_blend_mb_outer_fn_t blend_mb_outer;
+ vp9_postproc_blend_b_fn_t blend_b;
+} vp9_postproc_rtcd_vtable_t;
+
+#if CONFIG_RUNTIME_CPU_DETECT
+#define POSTPROC_INVOKE(ctx,fn) (ctx)->fn
+#else
+#define POSTPROC_INVOKE(ctx,fn) vp9_postproc_##fn
+#endif
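+
+/* Usage sketch (illustrative only; "rtcd" below is a hypothetical
+ * vp9_postproc_rtcd_vtable_t instance, not part of this header):
+ *   POSTPROC_INVOKE(&rtcd, blend_b)(y, u, v, Y, U, V, 0xc000, stride);
+ * With CONFIG_RUNTIME_CPU_DETECT this dispatches through rtcd.blend_b;
+ * otherwise it expands to vp9_postproc_blend_b(...), i.e. vp9_blend_b_c
+ * unless a platform header above overrides that macro. */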
+
+#include "vpx_ports/mem.h"
+struct postproc_state {
+ int last_q;
+ int last_noise;
+ char noise[3072];
+ DECLARE_ALIGNED(16, char, blackclamp[16]);
+ DECLARE_ALIGNED(16, char, whiteclamp[16]);
+ DECLARE_ALIGNED(16, char, bothclamp[16]);
+};
+#include "onyxc_int.h"
+#include "ppflags.h"
+int vp9_post_proc_frame(struct VP9Common *oci, YV12_BUFFER_CONFIG *dest,
+ vp9_ppflags_t *flags);
+
+
+void vp9_de_noise(YV12_BUFFER_CONFIG *source,
+ YV12_BUFFER_CONFIG *post,
+ int q,
+ int low_var_thresh,
+ int flag,
+ vp9_postproc_rtcd_vtable_t *rtcd);
+
+void vp9_deblock(YV12_BUFFER_CONFIG *source,
+ YV12_BUFFER_CONFIG *post,
+ int q,
+ int low_var_thresh,
+ int flag,
+ vp9_postproc_rtcd_vtable_t *rtcd);
+#endif
--- /dev/null
+++ b/vp9/common/ppc/copy_altivec.asm
@@ -1,0 +1,47 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+ .globl copy_mem16x16_ppc
+
+;# r3 unsigned char *src
+;# r4 int src_stride
+;# r5 unsigned char *dst
+;# r6 int dst_stride
+
+;# Make the assumption that input will not be aligned,
+;# but the output will be. So two reads and a perm
+;# for the input, but only one store for the output.
+copy_mem16x16_ppc:
+ mfspr r11, 256 ;# get old VRSAVE
+ oris r12, r11, 0xe000
+ mtspr 256, r12 ;# set VRSAVE
+
+ li r10, 16
+ mtctr r10
+
+cp_16x16_loop:
+ lvsl v0, 0, r3 ;# permutate value for alignment
+
+ lvx v1, 0, r3
+ lvx v2, r10, r3
+
+ vperm v1, v1, v2, v0
+
+ stvx v1, 0, r5
+
+ add r3, r3, r4 ;# increment source pointer
+ add r5, r5, r6 ;# increment destination pointer
+
+ bdnz cp_16x16_loop
+
+ mtspr 256, r11 ;# reset old VRSAVE
+
+ blr
--- /dev/null
+++ b/vp9/common/ppc/filter_altivec.asm
@@ -1,0 +1,1013 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+ .globl sixtap_predict_ppc
+ .globl sixtap_predict8x4_ppc
+ .globl sixtap_predict8x8_ppc
+ .globl sixtap_predict16x16_ppc
+
+.macro load_c V, LABEL, OFF, R0, R1
+ lis \R0, \LABEL@ha
+ la \R1, \LABEL@l(\R0)
+ lvx \V, \OFF, \R1
+.endm
+
+.macro load_hfilter V0, V1
+ load_c \V0, HFilter, r5, r9, r10
+
+ addi r5, r5, 16
+ lvx \V1, r5, r10
+.endm
+
+;# Vertical filtering
+.macro Vprolog
+ load_c v0, VFilter, r6, r3, r10
+
+ vspltish v5, 8
+ vspltish v6, 3
+ vslh v6, v5, v6 ;# 0x0040 0040 0040 0040 0040 0040 0040 0040
+
+ vspltb v1, v0, 1
+ vspltb v2, v0, 2
+ vspltb v3, v0, 3
+ vspltb v4, v0, 4
+ vspltb v5, v0, 5
+ vspltb v0, v0, 0
+.endm
+
+.macro vpre_load
+ Vprolog
+ li r10, 16
+ lvx v10, 0, r9 ;# v10..v14 = first 5 rows
+ lvx v11, r10, r9
+ addi r9, r9, 32
+ lvx v12, 0, r9
+ lvx v13, r10, r9
+ addi r9, r9, 32
+ lvx v14, 0, r9
+.endm
+
+.macro Msum Re, Ro, V, T, TMP
+ ;# (Re,Ro) += (V*T)
+ vmuleub \TMP, \V, \T ;# trashes v8
+ vadduhm \Re, \Re, \TMP ;# Re = evens, saturation unnecessary
+ vmuloub \TMP, \V, \T
+ vadduhm \Ro, \Ro, \TMP ;# Ro = odds
+.endm
+
+.macro vinterp_no_store P0 P1 P2 P3 P4 P5
+ vmuleub v8, \P0, v0 ;# 64 + 4 positive taps
+ vadduhm v16, v6, v8
+ vmuloub v8, \P0, v0
+ vadduhm v17, v6, v8
+ Msum v16, v17, \P2, v2, v8
+ Msum v16, v17, \P3, v3, v8
+ Msum v16, v17, \P5, v5, v8
+
+ vmuleub v18, \P1, v1 ;# 2 negative taps
+ vmuloub v19, \P1, v1
+ Msum v18, v19, \P4, v4, v8
+
+ vsubuhs v16, v16, v18 ;# subtract neg from pos
+ vsubuhs v17, v17, v19
+ vsrh v16, v16, v7 ;# divide by 128
+ vsrh v17, v17, v7 ;# v16 v17 = evens, odds
+ vmrghh v18, v16, v17 ;# v18 v19 = 16-bit result in order
+ vmrglh v19, v16, v17
+ vpkuhus \P0, v18, v19 ;# P0 = 8-bit result
+.endm
+
+.macro vinterp_no_store_8x8 P0 P1 P2 P3 P4 P5
+ vmuleub v24, \P0, v13 ;# 64 + 4 positive taps
+ vadduhm v21, v20, v24
+ vmuloub v24, \P0, v13
+ vadduhm v22, v20, v24
+ Msum v21, v22, \P2, v15, v25
+ Msum v21, v22, \P3, v16, v25
+ Msum v21, v22, \P5, v18, v25
+
+ vmuleub v23, \P1, v14 ;# 2 negative taps
+ vmuloub v24, \P1, v14
+ Msum v23, v24, \P4, v17, v25
+
+ vsubuhs v21, v21, v23 ;# subtract neg from pos
+ vsubuhs v22, v22, v24
+ vsrh v21, v21, v19 ;# divide by 128
+ vsrh v22, v22, v19 ;# v21 v22 = evens, odds
+ vmrghh v23, v21, v22 ;# v23 v24 = 16-bit result in order
+ vmrglh v24, v21, v22
+ vpkuhus \P0, v23, v24 ;# P0 = 8-bit result
+.endm
+
+
+.macro Vinterp P0 P1 P2 P3 P4 P5
+ vinterp_no_store \P0, \P1, \P2, \P3, \P4, \P5
+ stvx \P0, 0, r7
+ add r7, r7, r8 ;# 33 ops per 16 pels
+.endm
+
+
+.macro luma_v P0, P1, P2, P3, P4, P5
+ addi r9, r9, 16 ;# P5 = newest input row
+ lvx \P5, 0, r9
+ Vinterp \P0, \P1, \P2, \P3, \P4, \P5
+.endm
+
+.macro luma_vtwo
+ luma_v v10, v11, v12, v13, v14, v15
+ luma_v v11, v12, v13, v14, v15, v10
+.endm
+
+.macro luma_vfour
+ luma_vtwo
+ luma_v v12, v13, v14, v15, v10, v11
+ luma_v v13, v14, v15, v10, v11, v12
+.endm
+
+.macro luma_vsix
+ luma_vfour
+ luma_v v14, v15, v10, v11, v12, v13
+ luma_v v15, v10, v11, v12, v13, v14
+.endm
+
+.macro Interp4 R I I4
+ vmsummbm \R, v13, \I, v15
+ vmsummbm \R, v14, \I4, \R
+.endm
+
+.macro Read8x8 VD, RS, RP, increment_counter
+ lvsl v21, 0, \RS ;# permutate value for alignment
+
+ ;# input to filter is 21 bytes wide, output is 16 bytes.
+ ;# input can span three vectors if not aligned correctly.
+ lvx \VD, 0, \RS
+ lvx v20, r10, \RS
+
+.if \increment_counter
+ add \RS, \RS, \RP
+.endif
+
+ vperm \VD, \VD, v20, v21
+.endm
+
+.macro interp_8x8 R
+ vperm v20, \R, \R, v16 ;# v20 = 0123 1234 2345 3456
+ vperm v21, \R, \R, v17 ;# v21 = 4567 5678 6789 789A
+ Interp4 v20, v20, v21 ;# v20 = result 0 1 2 3
+ vperm \R, \R, \R, v18 ;# R = 89AB 9ABC ABCx BCxx
+ Interp4 v21, v21, \R ;# v21 = result 4 5 6 7
+
+ vpkswus \R, v20, v21 ;# R = 0 1 2 3 4 5 6 7
+ vsrh \R, \R, v19
+
+ vpkuhus \R, \R, \R ;# saturate and pack
+
+.endm
+
+.macro Read4x4 VD, RS, RP, increment_counter
+ lvsl v21, 0, \RS ;# permutate value for alignment
+
+ ;# input to filter is 21 bytes wide, output is 16 bytes.
+ ;# input can span three vectors if not aligned correctly.
+ lvx v20, 0, \RS
+
+.if \increment_counter
+ add \RS, \RS, \RP
+.endif
+
+ vperm \VD, v20, v20, v21
+.endm
+ .text
+
+ .align 2
+;# r3 unsigned char * src
+;# r4 int src_pitch
+;# r5 int x_offset
+;# r6 int y_offset
+;# r7 unsigned char * dst
+;# r8 int dst_pitch
+sixtap_predict_ppc:
+ mfspr r11, 256 ;# get old VRSAVE
+ oris r12, r11, 0xff87
+ ori r12, r12, 0xffc0
+ mtspr 256, r12 ;# set VRSAVE
+
+ stwu r1,-32(r1) ;# create space on the stack
+
+ slwi. r5, r5, 5 ;# index into horizontal filter array
+
+ vspltish v19, 7
+
+ ;# If there isn't any filtering to be done for the horizontal, then
+ ;# just skip to the second pass.
+ beq- vertical_only_4x4
+
+ ;# load up horizontal filter
+ load_hfilter v13, v14
+
+ ;# rounding added in on the multiply
+ vspltisw v16, 8
+ vspltisw v15, 3
+ vslw v15, v16, v15 ;# 0x00000040000000400000004000000040
+
+ ;# Load up permutation constants
+ load_c v16, B_0123, 0, r9, r10
+ load_c v17, B_4567, 0, r9, r10
+ load_c v18, B_89AB, 0, r9, r10
+
+ ;# Back off input buffer by 2 bytes. Need 2 before and 3 after
+ addi r3, r3, -2
+
+ addi r9, r3, 0
+ li r10, 16
+ Read8x8 v2, r3, r4, 1
+ Read8x8 v3, r3, r4, 1
+ Read8x8 v4, r3, r4, 1
+ Read8x8 v5, r3, r4, 1
+
+ slwi. r6, r6, 4 ;# index into vertical filter array
+
+ ;# filter a line
+ interp_8x8 v2
+ interp_8x8 v3
+ interp_8x8 v4
+ interp_8x8 v5
+
+ ;# Finished filtering main horizontal block. If there is no
+ ;# vertical filtering, jump to storing the data. Otherwise
+ ;# load up and filter the additional 5 lines that are needed
+ ;# for the vertical filter.
+ beq- store_4x4
+
+ ;# only needed if there is a vertical filter present
+ ;# if the second filter is not null then need to back off by 2*pitch
+ sub r9, r9, r4
+ sub r9, r9, r4
+
+ Read8x8 v0, r9, r4, 1
+ Read8x8 v1, r9, r4, 0
+ Read8x8 v6, r3, r4, 1
+ Read8x8 v7, r3, r4, 1
+ Read8x8 v8, r3, r4, 0
+
+ interp_8x8 v0
+ interp_8x8 v1
+ interp_8x8 v6
+ interp_8x8 v7
+ interp_8x8 v8
+
+ b second_pass_4x4
+
+vertical_only_4x4:
+ ;# only needed if there is a vertical filter present
+ ;# if the second filter is not null then need to back off by 2*pitch
+ sub r3, r3, r4
+ sub r3, r3, r4
+ li r10, 16
+
+ Read8x8 v0, r3, r4, 1
+ Read8x8 v1, r3, r4, 1
+ Read8x8 v2, r3, r4, 1
+ Read8x8 v3, r3, r4, 1
+ Read8x8 v4, r3, r4, 1
+ Read8x8 v5, r3, r4, 1
+ Read8x8 v6, r3, r4, 1
+ Read8x8 v7, r3, r4, 1
+ Read8x8 v8, r3, r4, 0
+
+ slwi r6, r6, 4 ;# index into vertical filter array
+
+second_pass_4x4:
+ load_c v20, b_hilo_4x4, 0, r9, r10
+ load_c v21, b_hilo, 0, r9, r10
+
+ ;# reposition input so that it can go through the
+ ;# filtering phase with one pass.
+ vperm v0, v0, v1, v20 ;# 0 1 x x
+ vperm v2, v2, v3, v20 ;# 2 3 x x
+ vperm v4, v4, v5, v20 ;# 4 5 x x
+ vperm v6, v6, v7, v20 ;# 6 7 x x
+
+ vperm v0, v0, v2, v21 ;# 0 1 2 3
+ vperm v4, v4, v6, v21 ;# 4 5 6 7
+
+ vsldoi v1, v0, v4, 4
+ vsldoi v2, v0, v4, 8
+ vsldoi v3, v0, v4, 12
+
+ vsldoi v5, v4, v8, 4
+
+ load_c v13, VFilter, r6, r9, r10
+
+ vspltish v15, 8
+ vspltish v20, 3
+ vslh v20, v15, v20 ;# 0x0040 0040 0040 0040 0040 0040 0040 0040
+
+ vspltb v14, v13, 1
+ vspltb v15, v13, 2
+ vspltb v16, v13, 3
+ vspltb v17, v13, 4
+ vspltb v18, v13, 5
+ vspltb v13, v13, 0
+
+ vinterp_no_store_8x8 v0, v1, v2, v3, v4, v5
+
+ stvx v0, 0, r1
+
+ lwz r0, 0(r1)
+ stw r0, 0(r7)
+ add r7, r7, r8
+
+ lwz r0, 4(r1)
+ stw r0, 0(r7)
+ add r7, r7, r8
+
+ lwz r0, 8(r1)
+ stw r0, 0(r7)
+ add r7, r7, r8
+
+ lwz r0, 12(r1)
+ stw r0, 0(r7)
+
+ b exit_4x4
+
+store_4x4:
+
+ stvx v2, 0, r1
+ lwz r0, 0(r1)
+ stw r0, 0(r7)
+ add r7, r7, r8
+
+ stvx v3, 0, r1
+ lwz r0, 0(r1)
+ stw r0, 0(r7)
+ add r7, r7, r8
+
+ stvx v4, 0, r1
+ lwz r0, 0(r1)
+ stw r0, 0(r7)
+ add r7, r7, r8
+
+ stvx v5, 0, r1
+ lwz r0, 0(r1)
+ stw r0, 0(r7)
+
+exit_4x4:
+
+ addi r1, r1, 32 ;# recover stack
+
+ mtspr 256, r11 ;# reset old VRSAVE
+
+ blr
+
+.macro w_8x8 V, D, R, P
+ stvx \V, 0, r1
+ lwz \R, 0(r1)
+ stw \R, 0(r7)
+ lwz \R, 4(r1)
+ stw \R, 4(r7)
+ add \D, \D, \P
+.endm
+
+ .align 2
+;# r3 unsigned char * src
+;# r4 int src_pitch
+;# r5 int x_offset
+;# r6 int y_offset
+;# r7 unsigned char * dst
+;# r8 int dst_pitch
+
+sixtap_predict8x4_ppc:
+ mfspr r11, 256 ;# get old VRSAVE
+ oris r12, r11, 0xffff
+ ori r12, r12, 0xffc0
+ mtspr 256, r12 ;# set VRSAVE
+
+ stwu r1,-32(r1) ;# create space on the stack
+
+ slwi. r5, r5, 5 ;# index into horizontal filter array
+
+ vspltish v19, 7
+
+ ;# If there isn't any filtering to be done for the horizontal, then
+ ;# just skip to the second pass.
+ beq- second_pass_pre_copy_8x4
+
+ load_hfilter v13, v14
+
+ ;# rounding added in on the multiply
+ vspltisw v16, 8
+ vspltisw v15, 3
+ vslw v15, v16, v15 ;# 0x00000040000000400000004000000040
+
+ ;# Load up permutation constants
+ load_c v16, B_0123, 0, r9, r10
+ load_c v17, B_4567, 0, r9, r10
+ load_c v18, B_89AB, 0, r9, r10
+
+ ;# Back off input buffer by 2 bytes. Need 2 before and 3 after
+ addi r3, r3, -2
+
+ addi r9, r3, 0
+ li r10, 16
+ Read8x8 v2, r3, r4, 1
+ Read8x8 v3, r3, r4, 1
+ Read8x8 v4, r3, r4, 1
+ Read8x8 v5, r3, r4, 1
+
+ slwi. r6, r6, 4 ;# index into vertical filter array
+
+ ;# filter a line
+ interp_8x8 v2
+ interp_8x8 v3
+ interp_8x8 v4
+ interp_8x8 v5
+
+ ;# Finished filtering main horizontal block. If there is no
+ ;# vertical filtering, jump to storing the data. Otherwise
+ ;# load up and filter the additional 5 lines that are needed
+ ;# for the vertical filter.
+ beq- store_8x4
+
+ ;# only needed if there is a vertical filter present
+ ;# if the second filter is not null then need to back off by 2*pitch
+ sub r9, r9, r4
+ sub r9, r9, r4
+
+ Read8x8 v0, r9, r4, 1
+ Read8x8 v1, r9, r4, 0
+ Read8x8 v6, r3, r4, 1
+ Read8x8 v7, r3, r4, 1
+ Read8x8 v8, r3, r4, 0
+
+ interp_8x8 v0
+ interp_8x8 v1
+ interp_8x8 v6
+ interp_8x8 v7
+ interp_8x8 v8
+
+ b second_pass_8x4
+
+second_pass_pre_copy_8x4:
+ ;# only needed if there is a vertical filter present
+ ;# if the second filter is not null then need to back off by 2*pitch
+ sub r3, r3, r4
+ sub r3, r3, r4
+ li r10, 16
+
+ Read8x8 v0, r3, r4, 1
+ Read8x8 v1, r3, r4, 1
+ Read8x8 v2, r3, r4, 1
+ Read8x8 v3, r3, r4, 1
+ Read8x8 v4, r3, r4, 1
+ Read8x8 v5, r3, r4, 1
+ Read8x8 v6, r3, r4, 1
+ Read8x8 v7, r3, r4, 1
+ Read8x8 v8, r3, r4, 1
+
+ slwi r6, r6, 4 ;# index into vertical filter array
+
+second_pass_8x4:
+ load_c v13, VFilter, r6, r9, r10
+
+ vspltish v15, 8
+ vspltish v20, 3
+ vslh v20, v15, v20 ;# 0x0040 0040 0040 0040 0040 0040 0040 0040
+
+ vspltb v14, v13, 1
+ vspltb v15, v13, 2
+ vspltb v16, v13, 3
+ vspltb v17, v13, 4
+ vspltb v18, v13, 5
+ vspltb v13, v13, 0
+
+ vinterp_no_store_8x8 v0, v1, v2, v3, v4, v5
+ vinterp_no_store_8x8 v1, v2, v3, v4, v5, v6
+ vinterp_no_store_8x8 v2, v3, v4, v5, v6, v7
+ vinterp_no_store_8x8 v3, v4, v5, v6, v7, v8
+
+ cmpi cr0, r8, 8
+ beq cr0, store_aligned_8x4
+
+ w_8x8 v0, r7, r0, r8
+ w_8x8 v1, r7, r0, r8
+ w_8x8 v2, r7, r0, r8
+ w_8x8 v3, r7, r0, r8
+
+ b exit_8x4
+
+store_aligned_8x4:
+
+ load_c v10, b_hilo, 0, r9, r10
+
+ vperm v0, v0, v1, v10
+ vperm v2, v2, v3, v10
+
+ stvx v0, 0, r7
+ addi r7, r7, 16
+ stvx v2, 0, r7
+
+ b exit_8x4
+
+store_8x4:
+ cmpi cr0, r8, 8
+ beq cr0, store_aligned2_8x4
+
+ w_8x8 v2, r7, r0, r8
+ w_8x8 v3, r7, r0, r8
+ w_8x8 v4, r7, r0, r8
+ w_8x8 v5, r7, r0, r8
+
+ b exit_8x4
+
+store_aligned2_8x4:
+ load_c v10, b_hilo, 0, r9, r10
+
+ vperm v2, v2, v3, v10
+ vperm v4, v4, v5, v10
+
+ stvx v2, 0, r7
+ addi r7, r7, 16
+ stvx v4, 0, r7
+
+exit_8x4:
+
+ addi r1, r1, 32 ;# recover stack
+
+ mtspr 256, r11 ;# reset old VRSAVE
+
+
+ blr
+
+ .align 2
+;# r3 unsigned char * src
+;# r4 int src_pitch
+;# r5 int x_offset
+;# r6 int y_offset
+;# r7 unsigned char * dst
+;# r8 int dst_pitch
+
+;# Because the width that needs to be filtered will fit in a single altivec
+;# register there is no need to loop. Everything can stay in registers.
+sixtap_predict8x8_ppc:
+ mfspr r11, 256 ;# get old VRSAVE
+ oris r12, r11, 0xffff
+ ori r12, r12, 0xffc0
+ mtspr 256, r12 ;# set VRSAVE
+
+ stwu r1,-32(r1) ;# create space on the stack
+
+ slwi. r5, r5, 5 ;# index into horizontal filter array
+
+ vspltish v19, 7
+
+ ;# If there isn't any filtering to be done for the horizontal, then
+ ;# just skip to the second pass.
+ beq- second_pass_pre_copy_8x8
+
+ load_hfilter v13, v14
+
+ ;# rounding added in on the multiply
+ vspltisw v16, 8
+ vspltisw v15, 3
+ vslw v15, v16, v15 ;# 0x00000040000000400000004000000040
+
+ ;# Load up permutation constants
+ load_c v16, B_0123, 0, r9, r10
+ load_c v17, B_4567, 0, r9, r10
+ load_c v18, B_89AB, 0, r9, r10
+
+ ;# Back off input buffer by 2 bytes. Need 2 before and 3 after
+ addi r3, r3, -2
+
+ addi r9, r3, 0
+ li r10, 16
+ Read8x8 v2, r3, r4, 1
+ Read8x8 v3, r3, r4, 1
+ Read8x8 v4, r3, r4, 1
+ Read8x8 v5, r3, r4, 1
+ Read8x8 v6, r3, r4, 1
+ Read8x8 v7, r3, r4, 1
+ Read8x8 v8, r3, r4, 1
+ Read8x8 v9, r3, r4, 1
+
+ slwi. r6, r6, 4 ;# index into vertical filter array
+
+ ;# filter a line
+ interp_8x8 v2
+ interp_8x8 v3
+ interp_8x8 v4
+ interp_8x8 v5
+ interp_8x8 v6
+ interp_8x8 v7
+ interp_8x8 v8
+ interp_8x8 v9
+
+ ;# Finished filtering main horizontal block. If there is no
+ ;# vertical filtering, jump to storing the data. Otherwise
+ ;# load up and filter the additional 5 lines that are needed
+ ;# for the vertical filter.
+ beq- store_8x8
+
+ ;# only needed if there is a vertical filter present
+ ;# if the second filter is not null then need to back off by 2*pitch
+ sub r9, r9, r4
+ sub r9, r9, r4
+
+ Read8x8 v0, r9, r4, 1
+ Read8x8 v1, r9, r4, 0
+ Read8x8 v10, r3, r4, 1
+ Read8x8 v11, r3, r4, 1
+ Read8x8 v12, r3, r4, 0
+
+ interp_8x8 v0
+ interp_8x8 v1
+ interp_8x8 v10
+ interp_8x8 v11
+ interp_8x8 v12
+
+ b second_pass_8x8
+
+second_pass_pre_copy_8x8:
+ ;# only needed if there is a vertical filter present
+ ;# if the second filter is not null then need to back off by 2*pitch
+ sub r3, r3, r4
+ sub r3, r3, r4
+ li r10, 16
+
+ Read8x8 v0, r3, r4, 1
+ Read8x8 v1, r3, r4, 1
+ Read8x8 v2, r3, r4, 1
+ Read8x8 v3, r3, r4, 1
+ Read8x8 v4, r3, r4, 1
+ Read8x8 v5, r3, r4, 1
+ Read8x8 v6, r3, r4, 1
+ Read8x8 v7, r3, r4, 1
+ Read8x8 v8, r3, r4, 1
+ Read8x8 v9, r3, r4, 1
+ Read8x8 v10, r3, r4, 1
+ Read8x8 v11, r3, r4, 1
+ Read8x8 v12, r3, r4, 0
+
+ slwi r6, r6, 4 ;# index into vertical filter array
+
+second_pass_8x8:
+ load_c v13, VFilter, r6, r9, r10
+
+ vspltish v15, 8
+ vspltish v20, 3
+ vslh v20, v15, v20 ;# 0x0040 0040 0040 0040 0040 0040 0040 0040
+
+ vspltb v14, v13, 1
+ vspltb v15, v13, 2
+ vspltb v16, v13, 3
+ vspltb v17, v13, 4
+ vspltb v18, v13, 5
+ vspltb v13, v13, 0
+
+ vinterp_no_store_8x8 v0, v1, v2, v3, v4, v5
+ vinterp_no_store_8x8 v1, v2, v3, v4, v5, v6
+ vinterp_no_store_8x8 v2, v3, v4, v5, v6, v7
+ vinterp_no_store_8x8 v3, v4, v5, v6, v7, v8
+ vinterp_no_store_8x8 v4, v5, v6, v7, v8, v9
+ vinterp_no_store_8x8 v5, v6, v7, v8, v9, v10
+ vinterp_no_store_8x8 v6, v7, v8, v9, v10, v11
+ vinterp_no_store_8x8 v7, v8, v9, v10, v11, v12
+
+ cmpi cr0, r8, 8
+ beq cr0, store_aligned_8x8
+
+ w_8x8 v0, r7, r0, r8
+ w_8x8 v1, r7, r0, r8
+ w_8x8 v2, r7, r0, r8
+ w_8x8 v3, r7, r0, r8
+ w_8x8 v4, r7, r0, r8
+ w_8x8 v5, r7, r0, r8
+ w_8x8 v6, r7, r0, r8
+ w_8x8 v7, r7, r0, r8
+
+ b exit_8x8
+
+store_aligned_8x8:
+
+ load_c v10, b_hilo, 0, r9, r10
+
+ vperm v0, v0, v1, v10
+ vperm v2, v2, v3, v10
+ vperm v4, v4, v5, v10
+ vperm v6, v6, v7, v10
+
+ stvx v0, 0, r7
+ addi r7, r7, 16
+ stvx v2, 0, r7
+ addi r7, r7, 16
+ stvx v4, 0, r7
+ addi r7, r7, 16
+ stvx v6, 0, r7
+
+ b exit_8x8
+
+store_8x8:
+ cmpi cr0, r8, 8
+ beq cr0, store_aligned2_8x8
+
+ w_8x8 v2, r7, r0, r8
+ w_8x8 v3, r7, r0, r8
+ w_8x8 v4, r7, r0, r8
+ w_8x8 v5, r7, r0, r8
+ w_8x8 v6, r7, r0, r8
+ w_8x8 v7, r7, r0, r8
+ w_8x8 v8, r7, r0, r8
+ w_8x8 v9, r7, r0, r8
+
+ b exit_8x8
+
+store_aligned2_8x8:
+ load_c v10, b_hilo, 0, r9, r10
+
+ vperm v2, v2, v3, v10
+ vperm v4, v4, v5, v10
+ vperm v6, v6, v7, v10
+ vperm v8, v8, v9, v10
+
+ stvx v2, 0, r7
+ addi r7, r7, 16
+ stvx v4, 0, r7
+ addi r7, r7, 16
+ stvx v6, 0, r7
+ addi r7, r7, 16
+ stvx v8, 0, r7
+
+exit_8x8:
+
+ addi r1, r1, 32 ;# recover stack
+
+ mtspr 256, r11 ;# reset old VRSAVE
+
+ blr
+
+ .align 2
+;# r3 unsigned char * src
+;# r4 int src_pitch
+;# r5 int x_offset
+;# r6 int y_offset
+;# r7 unsigned char * dst
+;# r8 int dst_pitch
+
+;# Two-pass filtering. First pass is horizontal edges, second pass is vertical
+;# edges. One of the filters can be null, but both won't be. Needs to use a
+;# temporary buffer because the source buffer can't be modified and the buffer
+;# for the destination is not large enough to hold the temporary data.
+sixtap_predict16x16_ppc:
+ mfspr r11, 256 ;# get old VRSAVE
+ oris r12, r11, 0xffff
+ ori r12, r12, 0xf000
+ mtspr 256, r12 ;# set VRSAVE
+
+ stwu r1,-416(r1) ;# create space on the stack
+
+ ;# Three possibilities
+ ;# 1. First filter is null. Don't use a temp buffer.
+ ;# 2. Second filter is null. Don't use a temp buffer.
+ ;# 3. Neither are null, use temp buffer.
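+ ;# (In the code below, case (1) corresponds to x_offset == 0 and case (2)
+ ;# to y_offset == 0; when both offsets are non-zero, case (3) applies and
+ ;# the intermediate rows are staged in the stack buffer at 48(r1).)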
+
+ ;# First Pass (horizontal edge)
+ ;# setup pointers for src
+ ;# if possibility (1), then set up the src pointer to be the original and jump
+ ;# to the second pass; this is based on whether x_offset is 0.
+
+ ;# load up horizontal filter
+ slwi. r5, r5, 5 ;# index into horizontal filter array
+
+ load_hfilter v4, v5
+
+ beq- copy_horizontal_16x21
+
+ ;# Back off input buffer by 2 bytes. Need 2 before and 3 after
+ addi r3, r3, -2
+
+ slwi. r6, r6, 4 ;# index into vertical filter array
+
+ ;# setup constants
+ ;# v14 permutation value for alignment
+ load_c v14, b_hperm, 0, r9, r10
+
+ ;# These statements assume there will not be a second pass;
+ ;# if there is one, they are overwritten below before the bypass label.
+ li r0, 16 ;# prepare for no vertical filter
+
+ ;# Change the output pointer and pitch to be the actual
+ ;# destination instead of a temporary buffer.
+ addi r9, r7, 0
+ addi r5, r8, 0
+
+ ;# no vertical filter, so write the output from the first pass
+ ;# directly into the output buffer.
+ beq- no_vertical_filter_bypass
+
+ ;# if the second filter is not null then need to back off by 2*pitch
+ sub r3, r3, r4
+ sub r3, r3, r4
+
+ ;# setup counter for the number of lines that are going to be filtered
+ li r0, 21
+
+ ;# use the stack as temporary storage
+ la r9, 48(r1)
+ li r5, 16
+
+no_vertical_filter_bypass:
+
+ mtctr r0
+
+ ;# rounding added in on the multiply
+ vspltisw v10, 8
+ vspltisw v12, 3
+ vslw v12, v10, v12 ;# 0x00000040000000400000004000000040
+
+ ;# downshift by 7 ( divide by 128 ) at the end
+ vspltish v13, 7
+
+ ;# index to the next set of vectors in the row.
+ li r10, 16
+ li r12, 32
+
+horizontal_loop_16x16:
+
+ lvsl v15, 0, r3 ;# permutate value for alignment
+
+ ;# input to filter is 21 bytes wide, output is 16 bytes.
+ ;# input can span three vectors if not aligned correctly.
+ lvx v1, 0, r3
+ lvx v2, r10, r3
+ lvx v3, r12, r3
+
+ vperm v8, v1, v2, v15
+ vperm v9, v2, v3, v15 ;# v8 v9 = 21 input pixels left-justified
+
+ vsldoi v11, v8, v9, 4
+
+ ;# set 0
+ vmsummbm v6, v4, v8, v12 ;# taps times elements
+ vmsummbm v0, v5, v11, v6
+
+ ;# set 1
+ vsldoi v10, v8, v9, 1
+ vsldoi v11, v8, v9, 5
+
+ vmsummbm v6, v4, v10, v12
+ vmsummbm v1, v5, v11, v6
+
+ ;# set 2
+ vsldoi v10, v8, v9, 2
+ vsldoi v11, v8, v9, 6
+
+ vmsummbm v6, v4, v10, v12
+ vmsummbm v2, v5, v11, v6
+
+ ;# set 3
+ vsldoi v10, v8, v9, 3
+ vsldoi v11, v8, v9, 7
+
+ vmsummbm v6, v4, v10, v12
+ vmsummbm v3, v5, v11, v6
+
+ vpkswus v0, v0, v1 ;# v0 = 0 4 8 C 1 5 9 D (16-bit)
+ vpkswus v1, v2, v3 ;# v1 = 2 6 A E 3 7 B F
+
+ vsrh v0, v0, v13 ;# divide v0, v1 by 128
+ vsrh v1, v1, v13
+
+ vpkuhus v0, v0, v1 ;# v0 = scrambled 8-bit result
+ vperm v0, v0, v0, v14 ;# v0 = correctly-ordered result
+
+ stvx v0, 0, r9
+ add r9, r9, r5
+
+ add r3, r3, r4
+
+ bdnz horizontal_loop_16x16
+
+ ;# check again to see if vertical filter needs to be done.
+ cmpi cr0, r6, 0
+ beq cr0, end_16x16
+
+ ;# yes there is, so go to the second pass
+ b second_pass_16x16
+
+copy_horizontal_16x21:
+ li r10, 21
+ mtctr r10
+
+ li r10, 16
+
+ sub r3, r3, r4
+ sub r3, r3, r4
+
+ ;# this is done above if there is a horizontal filter,
+ ;# if not it needs to be done down here.
+ slwi r6, r6, 4 ;# index into vertical filter array
+
+ ;# always write to the stack when doing a horizontal copy
+ la r9, 48(r1)
+
+copy_horizontal_loop_16x21:
+ lvsl v15, 0, r3 ;# permutate value for alignment
+
+ lvx v1, 0, r3
+ lvx v2, r10, r3
+
+ vperm v8, v1, v2, v15
+
+ stvx v8, 0, r9
+ addi r9, r9, 16
+
+ add r3, r3, r4
+
+ bdnz copy_horizontal_loop_16x21
+
+second_pass_16x16:
+
+ ;# always read from the stack when doing a vertical filter
+ la r9, 48(r1)
+
+ ;# downshift by 7 ( divide by 128 ) at the end
+ vspltish v7, 7
+
+ vpre_load
+
+ luma_vsix
+ luma_vsix
+ luma_vfour
+
+end_16x16:
+
+ addi r1, r1, 416 ;# recover stack
+
+ mtspr 256, r11 ;# reset old VRSAVE
+
+ blr
+
+ .data
+
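+;# Six-tap subpel coefficient tables. Each set of six taps sums to 128 and
+;# the results are divided by 128 after filtering. HFilter keeps the taps
+;# signed, split 4+2 across two 16-byte rows (one 32-byte entry per x_offset);
+;# VFilter keeps one 16-byte entry per y_offset with magnitudes only, taps 1
+;# and 4 being the "negative taps" subtracted in vinterp_no_store*.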
+ .align 4
+HFilter:
+ .byte 0, 0,128, 0, 0, 0,128, 0, 0, 0,128, 0, 0, 0,128, 0
+ .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+ .byte 0, -6,123, 12, 0, -6,123, 12, 0, -6,123, 12, 0, -6,123, 12
+ .byte -1, 0, 0, 0, -1, 0, 0, 0, -1, 0, 0, 0, -1, 0, 0, 0
+ .byte 2,-11,108, 36, 2,-11,108, 36, 2,-11,108, 36, 2,-11,108, 36
+ .byte -8, 1, 0, 0, -8, 1, 0, 0, -8, 1, 0, 0, -8, 1, 0, 0
+ .byte 0, -9, 93, 50, 0, -9, 93, 50, 0, -9, 93, 50, 0, -9, 93, 50
+ .byte -6, 0, 0, 0, -6, 0, 0, 0, -6, 0, 0, 0, -6, 0, 0, 0
+ .byte 3,-16, 77, 77, 3,-16, 77, 77, 3,-16, 77, 77, 3,-16, 77, 77
+ .byte -16, 3, 0, 0,-16, 3, 0, 0,-16, 3, 0, 0,-16, 3, 0, 0
+ .byte 0, -6, 50, 93, 0, -6, 50, 93, 0, -6, 50, 93, 0, -6, 50, 93
+ .byte -9, 0, 0, 0, -9, 0, 0, 0, -9, 0, 0, 0, -9, 0, 0, 0
+ .byte 1, -8, 36,108, 1, -8, 36,108, 1, -8, 36,108, 1, -8, 36,108
+ .byte -11, 2, 0, 0,-11, 2, 0, 0,-11, 2, 0, 0,-11, 2, 0, 0
+ .byte 0, -1, 12,123, 0, -1, 12,123, 0, -1, 12,123, 0, -1, 12,123
+ .byte -6, 0, 0, 0, -6, 0, 0, 0, -6, 0, 0, 0, -6, 0, 0, 0
+
+ .align 4
+VFilter:
+ .byte 0, 0,128, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+ .byte 0, 6,123, 12, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+ .byte 2, 11,108, 36, 8, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+ .byte 0, 9, 93, 50, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+ .byte 3, 16, 77, 77, 16, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+ .byte 0, 6, 50, 93, 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+ .byte 1, 8, 36,108, 11, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+ .byte 0, 1, 12,123, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+
+ .align 4
+b_hperm:
+ .byte 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15
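+;# (reorders the packed 0 4 8 C 1 5 9 D 2 6 A E 3 7 B F byte order produced
+;# by vpkuhus back to 0 1 2 3 ... F)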
+
+ .align 4
+B_0123:
+ .byte 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6
+
+ .align 4
+B_4567:
+ .byte 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10
+
+ .align 4
+B_89AB:
+ .byte 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14
+
+ .align 4
+b_hilo:
+ .byte 0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23
+
+ .align 4
+b_hilo_4x4:
+ .byte 0, 1, 2, 3, 16, 17, 18, 19, 0, 0, 0, 0, 0, 0, 0, 0
--- /dev/null
+++ b/vp9/common/ppc/filter_bilinear_altivec.asm
@@ -1,0 +1,677 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+ .globl bilinear_predict4x4_ppc
+ .globl bilinear_predict8x4_ppc
+ .globl bilinear_predict8x8_ppc
+ .globl bilinear_predict16x16_ppc
+
+.macro load_c V, LABEL, OFF, R0, R1
+ lis \R0, \LABEL@ha
+ la \R1, \LABEL@l(\R0)
+ lvx \V, \OFF, \R1
+.endm
+
+.macro load_vfilter V0, V1
+ load_c \V0, vfilter_b, r6, r9, r10
+
+ addi r6, r6, 16
+ lvx \V1, r6, r10
+.endm
+
+.macro HProlog jump_label
+ ;# load up horizontal filter
+ slwi. r5, r5, 4 ;# index into horizontal filter array
+
+ ;# index to the next set of vectors in the row.
+ li r10, 16
+ li r12, 32
+
+ ;# downshift by 7 ( divide by 128 ) at the end
+ vspltish v19, 7
+
+ ;# If there isn't any filtering to be done for the horizontal, then
+ ;# just skip to the second pass.
+ beq \jump_label
+
+ load_c v20, hfilter_b, r5, r9, r0
+
+ ;# setup constants
+ ;# v14 permutation value for alignment
+ load_c v28, b_hperm_b, 0, r9, r0
+
+ ;# rounding added in on the multiply
+ vspltisw v21, 8
+ vspltisw v18, 3
+ vslw v18, v21, v18 ;# 0x00000040000000400000004000000040
+
+ slwi. r6, r6, 5 ;# index into vertical filter array
+.endm
+
+;# Filters a horizontal line
+;# expects:
+;# r3 src_ptr
+;# r4 pitch
+;# r10 16
+;# r12 32
+;# v17 perm input
+;# v18 rounding
+;# v19 shift
+;# v20 filter taps
+;# v21 tmp
+;# v22 tmp
+;# v23 tmp
+;# v24 tmp
+;# v25 tmp
+;# v26 tmp
+;# v27 tmp
+;# v28 perm output
+;#
+.macro HFilter V
+ vperm v24, v21, v21, v10 ;# v24 = 0123 1234 2345 3456
+ vperm v25, v21, v21, v11 ;# v25 = 4567 5678 6789 789A
+
+ vmsummbm v24, v20, v24, v18
+ vmsummbm v25, v20, v25, v18
+
+ vpkswus v24, v24, v25 ;# v24 = 0 4 8 C 1 5 9 D (16-bit)
+
+ vsrh v24, v24, v19 ;# divide v0, v1 by 128
+
+ vpkuhus \V, v24, v24 ;# \V = scrambled 8-bit result
+.endm
+
+.macro hfilter_8 V, increment_counter
+ lvsl v17, 0, r3 ;# permutate value for alignment
+
+ ;# input to filter is 9 bytes wide, output is 8 bytes.
+ lvx v21, 0, r3
+ lvx v22, r10, r3
+
+.if \increment_counter
+ add r3, r3, r4
+.endif
+ vperm v21, v21, v22, v17
+
+ HFilter \V
+.endm
+
+
+.macro load_and_align_8 V, increment_counter
+ lvsl v17, 0, r3 ;# permutate value for alignment
+
+ ;# input to filter is 21 bytes wide, output is 16 bytes.
+ ;# input can span three vectors if not aligned correctly.
+ lvx v21, 0, r3
+ lvx v22, r10, r3
+
+.if \increment_counter
+ add r3, r3, r4
+.endif
+
+ vperm \V, v21, v22, v17
+.endm
+
+.macro write_aligned_8 V, increment_counter
+ stvx \V, 0, r7
+
+.if \increment_counter
+ add r7, r7, r8
+.endif
+.endm
+
+.macro vfilter_16 P0 P1
+ vmuleub v22, \P0, v20 ;# 64 + 4 positive taps
+ vadduhm v22, v18, v22
+ vmuloub v23, \P0, v20
+ vadduhm v23, v18, v23
+
+ vmuleub v24, \P1, v21
+ vadduhm v22, v22, v24 ;# Re = evens, saturation unnecessary
+ vmuloub v25, \P1, v21
+ vadduhm v23, v23, v25 ;# Ro = odds
+
+ vsrh v22, v22, v19 ;# divide by 128
+ vsrh v23, v23, v19 ;# v22 v23 = evens, odds
+ vmrghh \P0, v22, v23 ;# \P0, v23 = 16-bit result in order
+ vmrglh v23, v22, v23
+ vpkuhus \P0, \P0, v23 ;# P0 = 8-bit result
+.endm
+
+
+.macro w_8x8 V, D, R, P
+ stvx \V, 0, r1
+ lwz \R, 0(r1)
+ stw \R, 0(r7)
+ lwz \R, 4(r1)
+ stw \R, 4(r7)
+ add \D, \D, \P
+.endm
+
+
+ .align 2
+;# r3 unsigned char * src
+;# r4 int src_pitch
+;# r5 int x_offset
+;# r6 int y_offset
+;# r7 unsigned char * dst
+;# r8 int dst_pitch
+bilinear_predict4x4_ppc:
+ mfspr r11, 256 ;# get old VRSAVE
+ oris r12, r11, 0xf830
+ ori r12, r12, 0xfff8
+ mtspr 256, r12 ;# set VRSAVE
+
+ stwu r1,-32(r1) ;# create space on the stack
+
+ HProlog second_pass_4x4_pre_copy_b
+
+ ;# Load up permutation constants
+ load_c v10, b_0123_b, 0, r9, r12
+ load_c v11, b_4567_b, 0, r9, r12
+
+ hfilter_8 v0, 1
+ hfilter_8 v1, 1
+ hfilter_8 v2, 1
+ hfilter_8 v3, 1
+
+ ;# Finished filtering main horizontal block. If there is no
+ ;# vertical filtering, jump to storing the data. Otherwise
+ ;# load up and filter the additional line that is needed
+ ;# for the vertical filter.
+ beq store_out_4x4_b
+
+ hfilter_8 v4, 0
+
+ b second_pass_4x4_b
+
+second_pass_4x4_pre_copy_b:
+ slwi r6, r6, 5 ;# index into vertical filter array
+
+ load_and_align_8 v0, 1
+ load_and_align_8 v1, 1
+ load_and_align_8 v2, 1
+ load_and_align_8 v3, 1
+ load_and_align_8 v4, 1
+
+second_pass_4x4_b:
+ vspltish v20, 8
+ vspltish v18, 3
+ vslh v18, v20, v18 ;# 0x0040 0040 0040 0040 0040 0040 0040 0040
+
+ load_vfilter v20, v21
+
+ vfilter_16 v0, v1
+ vfilter_16 v1, v2
+ vfilter_16 v2, v3
+ vfilter_16 v3, v4
+
+store_out_4x4_b:
+
+ stvx v0, 0, r1
+ lwz r0, 0(r1)
+ stw r0, 0(r7)
+ add r7, r7, r8
+
+ stvx v1, 0, r1
+ lwz r0, 0(r1)
+ stw r0, 0(r7)
+ add r7, r7, r8
+
+ stvx v2, 0, r1
+ lwz r0, 0(r1)
+ stw r0, 0(r7)
+ add r7, r7, r8
+
+ stvx v3, 0, r1
+ lwz r0, 0(r1)
+ stw r0, 0(r7)
+
+exit_4x4:
+
+ addi r1, r1, 32 ;# recover stack
+ mtspr 256, r11 ;# reset old VRSAVE
+
+ blr
+
+ .align 2
+;# r3 unsigned char * src
+;# r4 int src_pitch
+;# r5 int x_offset
+;# r6 int y_offset
+;# r7 unsigned char * dst
+;# r8 int dst_pitch
+bilinear_predict8x4_ppc:
+ mfspr r11, 256 ;# get old VRSAVE
+ oris r12, r11, 0xf830
+ ori r12, r12, 0xfff8
+ mtspr 256, r12 ;# set VRSAVE
+
+ stwu r1,-32(r1) ;# create space on the stack
+
+ HProlog second_pass_8x4_pre_copy_b
+
+ ;# Load up permutation constants
+ load_c v10, b_0123_b, 0, r9, r12
+ load_c v11, b_4567_b, 0, r9, r12
+
+ hfilter_8 v0, 1
+ hfilter_8 v1, 1
+ hfilter_8 v2, 1
+ hfilter_8 v3, 1
+
+ ;# Finished filtering main horizontal block. If there is no
+ ;# vertical filtering, jump to storing the data. Otherwise
+ ;# load up and filter the additional line that is needed
+ ;# for the vertical filter.
+ beq store_out_8x4_b
+
+ hfilter_8 v4, 0
+
+ b second_pass_8x4_b
+
+second_pass_8x4_pre_copy_b:
+ slwi r6, r6, 5 ;# index into vertical filter array
+
+ load_and_align_8 v0, 1
+ load_and_align_8 v1, 1
+ load_and_align_8 v2, 1
+ load_and_align_8 v3, 1
+ load_and_align_8 v4, 1
+
+second_pass_8x4_b:
+ vspltish v20, 8
+ vspltish v18, 3
+ vslh v18, v20, v18 ;# 0x0040 0040 0040 0040 0040 0040 0040 0040
+
+ load_vfilter v20, v21
+
+ vfilter_16 v0, v1
+ vfilter_16 v1, v2
+ vfilter_16 v2, v3
+ vfilter_16 v3, v4
+
+store_out_8x4_b:
+
+ cmpi cr0, r8, 8
+ beq cr0, store_aligned_8x4_b
+
+ w_8x8 v0, r7, r0, r8
+ w_8x8 v1, r7, r0, r8
+ w_8x8 v2, r7, r0, r8
+ w_8x8 v3, r7, r0, r8
+
+ b exit_8x4
+
+store_aligned_8x4_b:
+ load_c v10, b_hilo_b, 0, r9, r10
+
+ vperm v0, v0, v1, v10
+ vperm v2, v2, v3, v10
+
+ stvx v0, 0, r7
+ addi r7, r7, 16
+ stvx v2, 0, r7
+
+exit_8x4:
+
+ addi r1, r1, 32 ;# recover stack
+ mtspr 256, r11 ;# reset old VRSAVE
+
+ blr
+
+ .align 2
+;# r3 unsigned char * src
+;# r4 int src_pitch
+;# r5 int x_offset
+;# r6 int y_offset
+;# r7 unsigned char * dst
+;# r8 int dst_pitch
+bilinear_predict8x8_ppc:
+ mfspr r11, 256 ;# get old VRSAVE
+ oris r12, r11, 0xfff0
+ ori r12, r12, 0xffff
+ mtspr 256, r12 ;# set VRSAVE
+
+ stwu r1,-32(r1) ;# create space on the stack
+
+ HProlog second_pass_8x8_pre_copy_b
+
+ ;# Load up permutation constants
+ load_c v10, b_0123_b, 0, r9, r12
+ load_c v11, b_4567_b, 0, r9, r12
+
+ hfilter_8 v0, 1
+ hfilter_8 v1, 1
+ hfilter_8 v2, 1
+ hfilter_8 v3, 1
+ hfilter_8 v4, 1
+ hfilter_8 v5, 1
+ hfilter_8 v6, 1
+ hfilter_8 v7, 1
+
+ ;# Finished filtering main horizontal block. If there is no
+ ;# vertical filtering, jump to storing the data. Otherwise
+ ;# load up and filter the additional line that is needed
+ ;# for the vertical filter.
+ beq store_out_8x8_b
+
+ hfilter_8 v8, 0
+
+ b second_pass_8x8_b
+
+second_pass_8x8_pre_copy_b:
+ slwi r6, r6, 5 ;# index into vertical filter array
+
+ load_and_align_8 v0, 1
+ load_and_align_8 v1, 1
+ load_and_align_8 v2, 1
+ load_and_align_8 v3, 1
+ load_and_align_8 v4, 1
+ load_and_align_8 v5, 1
+ load_and_align_8 v6, 1
+ load_and_align_8 v7, 1
+ load_and_align_8 v8, 0
+
+second_pass_8x8_b:
+ vspltish v20, 8
+ vspltish v18, 3
+ vslh v18, v20, v18 ;# 0x0040 0040 0040 0040 0040 0040 0040 0040
+
+ load_vfilter v20, v21
+
+ vfilter_16 v0, v1
+ vfilter_16 v1, v2
+ vfilter_16 v2, v3
+ vfilter_16 v3, v4
+ vfilter_16 v4, v5
+ vfilter_16 v5, v6
+ vfilter_16 v6, v7
+ vfilter_16 v7, v8
+
+store_out_8x8_b:
+
+ cmpi cr0, r8, 8
+ beq cr0, store_aligned_8x8_b
+
+ w_8x8 v0, r7, r0, r8
+ w_8x8 v1, r7, r0, r8
+ w_8x8 v2, r7, r0, r8
+ w_8x8 v3, r7, r0, r8
+ w_8x8 v4, r7, r0, r8
+ w_8x8 v5, r7, r0, r8
+ w_8x8 v6, r7, r0, r8
+ w_8x8 v7, r7, r0, r8
+
+ b exit_8x8
+
+store_aligned_8x8_b:
+ load_c v10, b_hilo_b, 0, r9, r10
+
+ vperm v0, v0, v1, v10
+ vperm v2, v2, v3, v10
+ vperm v4, v4, v5, v10
+ vperm v6, v6, v7, v10
+
+ stvx v0, 0, r7
+ addi r7, r7, 16
+ stvx v2, 0, r7
+ addi r7, r7, 16
+ stvx v4, 0, r7
+ addi r7, r7, 16
+ stvx v6, 0, r7
+
+exit_8x8:
+
+ addi r1, r1, 32 ;# recover stack
+ mtspr 256, r11 ;# reset old VRSAVE
+
+ blr
+
+;# Filters a horizontal line
+;# expects:
+;# r3 src_ptr
+;# r4 pitch
+;# r10 16
+;# r12 32
+;# v17 perm input
+;# v18 rounding
+;# v19 shift
+;# v20 filter taps
+;# v21 tmp
+;# v22 tmp
+;# v23 tmp
+;# v24 tmp
+;# v25 tmp
+;# v26 tmp
+;# v27 tmp
+;# v28 perm output
+;#
+.macro hfilter_16 V, increment_counter
+
+ lvsl v17, 0, r3 ;# permutate value for alignment
+
+ ;# input to filter is 21 bytes wide, output is 16 bytes.
+ ;# input can span three vectors if not aligned correctly.
+ lvx v21, 0, r3
+ lvx v22, r10, r3
+ lvx v23, r12, r3
+
+.if \increment_counter
+ add r3, r3, r4
+.endif
+ vperm v21, v21, v22, v17
+ vperm v22, v22, v23, v17 ;# v21 v22 = 21 input pixels left-justified
+
+ ;# set 0
+ vmsummbm v24, v20, v21, v18 ;# taps times elements
+
+ ;# set 1
+ vsldoi v23, v21, v22, 1
+ vmsummbm v25, v20, v23, v18
+
+ ;# set 2
+ vsldoi v23, v21, v22, 2
+ vmsummbm v26, v20, v23, v18
+
+ ;# set 3
+ vsldoi v23, v21, v22, 3
+ vmsummbm v27, v20, v23, v18
+
+ vpkswus v24, v24, v25 ;# v24 = 0 4 8 C 1 5 9 D (16-bit)
+ vpkswus v25, v26, v27 ;# v25 = 2 6 A E 3 7 B F
+
+ vsrh v24, v24, v19 ;# divide v0, v1 by 128
+ vsrh v25, v25, v19
+
+ vpkuhus \V, v24, v25 ;# \V = scrambled 8-bit result
+ vperm \V, \V, v0, v28 ;# \V = correctly-ordered result
+.endm
+
+.macro load_and_align_16 V, increment_counter
+ lvsl v17, 0, r3 ;# permutate value for alignment
+
+ ;# input to filter is 21 bytes wide, output is 16 bytes.
+ ;# input can span three vectors if not aligned correctly.
+ lvx v21, 0, r3
+ lvx v22, r10, r3
+
+.if \increment_counter
+ add r3, r3, r4
+.endif
+
+ vperm \V, v21, v22, v17
+.endm
+
+.macro write_16 V, increment_counter
+ stvx \V, 0, r7
+
+.if \increment_counter
+ add r7, r7, r8
+.endif
+.endm
+
+ .align 2
+;# r3 unsigned char * src
+;# r4 int src_pitch
+;# r5 int x_offset
+;# r6 int y_offset
+;# r7 unsigned char * dst
+;# r8 int dst_pitch
+bilinear_predict16x16_ppc:
+ mfspr r11, 256 ;# get old VRSAVE
+ oris r12, r11, 0xffff
+ ori r12, r12, 0xfff8
+ mtspr 256, r12 ;# set VRSAVE
+
+ HProlog second_pass_16x16_pre_copy_b
+
+ hfilter_16 v0, 1
+ hfilter_16 v1, 1
+ hfilter_16 v2, 1
+ hfilter_16 v3, 1
+ hfilter_16 v4, 1
+ hfilter_16 v5, 1
+ hfilter_16 v6, 1
+ hfilter_16 v7, 1
+ hfilter_16 v8, 1
+ hfilter_16 v9, 1
+ hfilter_16 v10, 1
+ hfilter_16 v11, 1
+ hfilter_16 v12, 1
+ hfilter_16 v13, 1
+ hfilter_16 v14, 1
+ hfilter_16 v15, 1
+
+ ;# Finished filtering main horizontal block. If there is no
+ ;# vertical filtering, jump to storing the data. Otherwise
+ ;# load up and filter the additional line that is needed
+ ;# for the vertical filter.
+ beq store_out_16x16_b
+
+ hfilter_16 v16, 0
+
+ b second_pass_16x16_b
+
+second_pass_16x16_pre_copy_b:
+ slwi r6, r6, 5 ;# index into vertical filter array
+
+ load_and_align_16 v0, 1
+ load_and_align_16 v1, 1
+ load_and_align_16 v2, 1
+ load_and_align_16 v3, 1
+ load_and_align_16 v4, 1
+ load_and_align_16 v5, 1
+ load_and_align_16 v6, 1
+ load_and_align_16 v7, 1
+ load_and_align_16 v8, 1
+ load_and_align_16 v9, 1
+ load_and_align_16 v10, 1
+ load_and_align_16 v11, 1
+ load_and_align_16 v12, 1
+ load_and_align_16 v13, 1
+ load_and_align_16 v14, 1
+ load_and_align_16 v15, 1
+ load_and_align_16 v16, 0
+
+second_pass_16x16_b:
+ vspltish v20, 8
+ vspltish v18, 3
+ vslh v18, v20, v18 ;# 0x0040 0040 0040 0040 0040 0040 0040 0040
+
+ load_vfilter v20, v21
+
+ vfilter_16 v0, v1
+ vfilter_16 v1, v2
+ vfilter_16 v2, v3
+ vfilter_16 v3, v4
+ vfilter_16 v4, v5
+ vfilter_16 v5, v6
+ vfilter_16 v6, v7
+ vfilter_16 v7, v8
+ vfilter_16 v8, v9
+ vfilter_16 v9, v10
+ vfilter_16 v10, v11
+ vfilter_16 v11, v12
+ vfilter_16 v12, v13
+ vfilter_16 v13, v14
+ vfilter_16 v14, v15
+ vfilter_16 v15, v16
+
+store_out_16x16_b:
+
+ write_16 v0, 1
+ write_16 v1, 1
+ write_16 v2, 1
+ write_16 v3, 1
+ write_16 v4, 1
+ write_16 v5, 1
+ write_16 v6, 1
+ write_16 v7, 1
+ write_16 v8, 1
+ write_16 v9, 1
+ write_16 v10, 1
+ write_16 v11, 1
+ write_16 v12, 1
+ write_16 v13, 1
+ write_16 v14, 1
+ write_16 v15, 0
+
+ mtspr 256, r11 ;# reset old VRSAVE
+
+ blr
+
+ .data
+
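+;# Two-tap bilinear coefficients: entry i holds taps (128 - 16*i, 16*i), so
+;# each pair sums to 128 and the >>7 in HFilter / vfilter_16 renormalises.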
+ .align 4
+hfilter_b:
+ .byte 128, 0, 0, 0,128, 0, 0, 0,128, 0, 0, 0,128, 0, 0, 0
+ .byte 112, 16, 0, 0,112, 16, 0, 0,112, 16, 0, 0,112, 16, 0, 0
+ .byte 96, 32, 0, 0, 96, 32, 0, 0, 96, 32, 0, 0, 96, 32, 0, 0
+ .byte 80, 48, 0, 0, 80, 48, 0, 0, 80, 48, 0, 0, 80, 48, 0, 0
+ .byte 64, 64, 0, 0, 64, 64, 0, 0, 64, 64, 0, 0, 64, 64, 0, 0
+ .byte 48, 80, 0, 0, 48, 80, 0, 0, 48, 80, 0, 0, 48, 80, 0, 0
+ .byte 32, 96, 0, 0, 32, 96, 0, 0, 32, 96, 0, 0, 32, 96, 0, 0
+ .byte 16,112, 0, 0, 16,112, 0, 0, 16,112, 0, 0, 16,112, 0, 0
+
+ .align 4
+vfilter_b:
+ .byte 128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128
+ .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+ .byte 112,112,112,112,112,112,112,112,112,112,112,112,112,112,112,112
+ .byte 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
+ .byte 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96
+ .byte 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32
+ .byte 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80
+ .byte 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48
+ .byte 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64
+ .byte 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64
+ .byte 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48
+ .byte 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80
+ .byte 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32
+ .byte 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96
+ .byte 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
+ .byte 112,112,112,112,112,112,112,112,112,112,112,112,112,112,112,112
+
+ .align 4
+b_hperm_b:
+ .byte 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15
+
+ .align 4
+b_0123_b:
+ .byte 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6
+
+ .align 4
+b_4567_b:
+ .byte 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10
+
+b_hilo_b:
+ .byte 0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23
--- /dev/null
+++ b/vp9/common/ppc/idctllm_altivec.asm
@@ -1,0 +1,189 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+ .globl short_idct4x4llm_ppc
+
+.macro load_c V, LABEL, OFF, R0, R1
+ lis \R0, \LABEL@ha
+ la \R1, \LABEL@l(\R0)
+ lvx \V, \OFF, \R1
+.endm
+
+;# r3 short *input
+;# r4 short *output
+;# r5 int pitch
+ .align 2
+short_idct4x4llm_ppc:
+ mfspr r11, 256 ;# get old VRSAVE
+ oris r12, r11, 0xfff8
+ mtspr 256, r12 ;# set VRSAVE
+
+ load_c v8, sinpi8sqrt2, 0, r9, r10
+ load_c v9, cospi8sqrt2minus1, 0, r9, r10
+ load_c v10, hi_hi, 0, r9, r10
+ load_c v11, lo_lo, 0, r9, r10
+ load_c v12, shift_16, 0, r9, r10
+
+ li r10, 16
+ lvx v0, 0, r3 ;# input ip[0], ip[ 4]
+ lvx v1, r10, r3 ;# input ip[8], ip[12]
+
+ ;# first pass
+ vupkhsh v2, v0
+ vupkhsh v3, v1
+ vaddsws v6, v2, v3 ;# a1 = ip[0]+ip[8]
+ vsubsws v7, v2, v3 ;# b1 = ip[0]-ip[8]
+
+ vupklsh v0, v0
+ vmulosh v4, v0, v8
+ vsraw v4, v4, v12
+ vaddsws v4, v4, v0 ;# ip[ 4] * sin(pi/8) * sqrt(2)
+
+ vupklsh v1, v1
+ vmulosh v5, v1, v9
+ vsraw v5, v5, v12 ;# ip[12] * cos(pi/8) * sqrt(2)
+ vaddsws v5, v5, v1
+
+ vsubsws v4, v4, v5 ;# c1
+
+ vmulosh v3, v1, v8
+ vsraw v3, v3, v12
+ vaddsws v3, v3, v1 ;# ip[12] * sin(pi/8) * sqrt(2)
+
+ vmulosh v5, v0, v9
+ vsraw v5, v5, v12 ;# ip[ 4] * cos(pi/8) * sqrt(2)
+ vaddsws v5, v5, v0
+
+ vaddsws v3, v3, v5 ;# d1
+
+ vaddsws v0, v6, v3 ;# a1 + d1
+ vsubsws v3, v6, v3 ;# a1 - d1
+
+ vaddsws v1, v7, v4 ;# b1 + c1
+ vsubsws v2, v7, v4 ;# b1 - c1
+
+ ;# transpose input
+ vmrghw v4, v0, v1 ;# a0 b0 a1 b1
+ vmrghw v5, v2, v3 ;# c0 d0 c1 d1
+
+ vmrglw v6, v0, v1 ;# a2 b2 a3 b3
+ vmrglw v7, v2, v3 ;# c2 d2 c3 d3
+
+ vperm v0, v4, v5, v10 ;# a0 b0 c0 d0
+ vperm v1, v4, v5, v11 ;# a1 b1 c1 d1
+
+ vperm v2, v6, v7, v10 ;# a2 b2 c2 d2
+ vperm v3, v6, v7, v11 ;# a3 b3 c3 d3
+
+ ;# second pass
+ vaddsws v6, v0, v2 ;# a1 = ip[0]+ip[8]
+ vsubsws v7, v0, v2 ;# b1 = ip[0]-ip[8]
+
+ vmulosh v4, v1, v8
+ vsraw v4, v4, v12
+ vaddsws v4, v4, v1 ;# ip[ 4] * sin(pi/8) * sqrt(2)
+
+ vmulosh v5, v3, v9
+ vsraw v5, v5, v12 ;# ip[12] * cos(pi/8) * sqrt(2)
+ vaddsws v5, v5, v3
+
+ vsubsws v4, v4, v5 ;# c1
+
+ vmulosh v2, v3, v8
+ vsraw v2, v2, v12
+ vaddsws v2, v2, v3 ;# ip[12] * sin(pi/8) * sqrt(2)
+
+ vmulosh v5, v1, v9
+ vsraw v5, v5, v12 ;# ip[ 4] * cos(pi/8) * sqrt(2)
+ vaddsws v5, v5, v1
+
+ vaddsws v3, v2, v5 ;# d1
+
+ vaddsws v0, v6, v3 ;# a1 + d1
+ vsubsws v3, v6, v3 ;# a1 - d1
+
+ vaddsws v1, v7, v4 ;# b1 + c1
+ vsubsws v2, v7, v4 ;# b1 - c1
+
+ vspltish v6, 4
+ vspltish v7, 3
+
+ vpkswss v0, v0, v1
+ vpkswss v1, v2, v3
+
+ vaddshs v0, v0, v6
+ vaddshs v1, v1, v6
+
+ vsrah v0, v0, v7
+ vsrah v1, v1, v7
+
+ ;# transpose output
+ vmrghh v2, v0, v1 ;# a0 c0 a1 c1 a2 c2 a3 c3
+ vmrglh v3, v0, v1 ;# b0 d0 b1 d1 b2 d2 b3 d3
+
+ vmrghh v0, v2, v3 ;# a0 b0 c0 d0 a1 b1 c1 d1
+ vmrglh v1, v2, v3 ;# a2 b2 c2 d2 a3 b3 c3 d3
+
+ stwu r1,-416(r1) ;# create space on the stack
+
+ stvx v0, 0, r1
+ lwz r6, 0(r1)
+ stw r6, 0(r4)
+ lwz r6, 4(r1)
+ stw r6, 4(r4)
+
+ add r4, r4, r5
+
+ lwz r6, 8(r1)
+ stw r6, 0(r4)
+ lwz r6, 12(r1)
+ stw r6, 4(r4)
+
+ add r4, r4, r5
+
+ stvx v1, 0, r1
+ lwz r6, 0(r1)
+ stw r6, 0(r4)
+ lwz r6, 4(r1)
+ stw r6, 4(r4)
+
+ add r4, r4, r5
+
+ lwz r6, 8(r1)
+ stw r6, 0(r4)
+ lwz r6, 12(r1)
+ stw r6, 4(r4)
+
+ addi r1, r1, 416 ;# recover stack
+
+ mtspr 256, r11 ;# reset old VRSAVE
+
+ blr
+
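+;# 35468/65536 ~= sin(pi/8)*sqrt(2) and 20091/65536 ~= cos(pi/8)*sqrt(2) - 1;
+;# shift_16 supplies the >>16 applied after each vmulosh above.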
+ .align 4
+sinpi8sqrt2:
+ .short 35468, 35468, 35468, 35468, 35468, 35468, 35468, 35468
+
+ .align 4
+cospi8sqrt2minus1:
+ .short 20091, 20091, 20091, 20091, 20091, 20091, 20091, 20091
+
+ .align 4
+shift_16:
+ .long 16, 16, 16, 16
+
+ .align 4
+hi_hi:
+ .byte 0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23
+
+ .align 4
+lo_lo:
+ .byte 8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31
--- /dev/null
+++ b/vp9/common/ppc/loopfilter_altivec.c
@@ -1,0 +1,127 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "loopfilter.h"
+#include "onyxc_int.h"
+
+typedef void loop_filter_function_y_ppc
+(
+ unsigned char *s, // source pointer
+ int p, // pitch
+ const signed char *flimit,
+ const signed char *limit,
+ const signed char *thresh
+);
+
+typedef void loop_filter_function_uv_ppc
+(
+ unsigned char *u, // source pointer
+ unsigned char *v, // source pointer
+ int p, // pitch
+ const signed char *flimit,
+ const signed char *limit,
+ const signed char *thresh
+);
+
+typedef void loop_filter_function_s_ppc
+(
+ unsigned char *s, // source pointer
+ int p, // pitch
+ const signed char *flimit
+);
+
+loop_filter_function_y_ppc mbloop_filter_horizontal_edge_y_ppc;
+loop_filter_function_y_ppc mbloop_filter_vertical_edge_y_ppc;
+loop_filter_function_y_ppc loop_filter_horizontal_edge_y_ppc;
+loop_filter_function_y_ppc loop_filter_vertical_edge_y_ppc;
+
+loop_filter_function_uv_ppc mbloop_filter_horizontal_edge_uv_ppc;
+loop_filter_function_uv_ppc mbloop_filter_vertical_edge_uv_ppc;
+loop_filter_function_uv_ppc loop_filter_horizontal_edge_uv_ppc;
+loop_filter_function_uv_ppc loop_filter_vertical_edge_uv_ppc;
+
+loop_filter_function_s_ppc loop_filter_simple_horizontal_edge_ppc;
+loop_filter_function_s_ppc loop_filter_simple_vertical_edge_ppc;
+
+// Horizontal MB filtering
+void loop_filter_mbh_ppc(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
+ int y_stride, int uv_stride, loop_filter_info *lfi) {
+ mbloop_filter_horizontal_edge_y_ppc(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->thr);
+
+ if (u_ptr)
+ mbloop_filter_horizontal_edge_uv_ppc(u_ptr, v_ptr, uv_stride, lfi->mbflim, lfi->lim, lfi->thr);
+}
+
+void loop_filter_mbhs_ppc(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
+ int y_stride, int uv_stride, loop_filter_info *lfi) {
+ (void)u_ptr;
+ (void)v_ptr;
+ (void)uv_stride;
+ loop_filter_simple_horizontal_edge_ppc(y_ptr, y_stride, lfi->mbflim);
+}
+
+// Vertical MB Filtering
+void loop_filter_mbv_ppc(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
+ int y_stride, int uv_stride, loop_filter_info *lfi) {
+ mbloop_filter_vertical_edge_y_ppc(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->thr);
+
+ if (u_ptr)
+ mbloop_filter_vertical_edge_uv_ppc(u_ptr, v_ptr, uv_stride, lfi->mbflim, lfi->lim, lfi->thr);
+}
+
+void loop_filter_mbvs_ppc(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
+ int y_stride, int uv_stride, loop_filter_info *lfi) {
+ (void)u_ptr;
+ (void)v_ptr;
+ (void)uv_stride;
+ loop_filter_simple_vertical_edge_ppc(y_ptr, y_stride, lfi->mbflim);
+}
+
+// Horizontal B Filtering
+void loop_filter_bh_ppc(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
+ int y_stride, int uv_stride, loop_filter_info *lfi) {
+ // These should all be done at once with one call, instead of 3
+ loop_filter_horizontal_edge_y_ppc(y_ptr + 4 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr);
+ loop_filter_horizontal_edge_y_ppc(y_ptr + 8 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr);
+ loop_filter_horizontal_edge_y_ppc(y_ptr + 12 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr);
+
+ if (u_ptr)
+ loop_filter_horizontal_edge_uv_ppc(u_ptr + 4 * uv_stride, v_ptr + 4 * uv_stride, uv_stride, lfi->flim, lfi->lim, lfi->thr);
+}
+
+void loop_filter_bhs_ppc(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
+ int y_stride, int uv_stride, loop_filter_info *lfi) {
+ (void)u_ptr;
+ (void)v_ptr;
+ (void)uv_stride;
+ loop_filter_simple_horizontal_edge_ppc(y_ptr + 4 * y_stride, y_stride, lfi->flim);
+ loop_filter_simple_horizontal_edge_ppc(y_ptr + 8 * y_stride, y_stride, lfi->flim);
+ loop_filter_simple_horizontal_edge_ppc(y_ptr + 12 * y_stride, y_stride, lfi->flim);
+}
+
+// Vertical B Filtering
+void loop_filter_bv_ppc(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
+ int y_stride, int uv_stride, loop_filter_info *lfi) {
+ loop_filter_vertical_edge_y_ppc(y_ptr, y_stride, lfi->flim, lfi->lim, lfi->thr);
+
+ if (u_ptr)
+ loop_filter_vertical_edge_uv_ppc(u_ptr + 4, v_ptr + 4, uv_stride, lfi->flim, lfi->lim, lfi->thr);
+}
+
+void loop_filter_bvs_ppc(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
+ int y_stride, int uv_stride, loop_filter_info *lfi) {
+ (void)u_ptr;
+ (void)v_ptr;
+ (void)uv_stride;
+ loop_filter_simple_vertical_edge_ppc(y_ptr + 4, y_stride, lfi->flim);
+ loop_filter_simple_vertical_edge_ppc(y_ptr + 8, y_stride, lfi->flim);
+ loop_filter_simple_vertical_edge_ppc(y_ptr + 12, y_stride, lfi->flim);
+}
--- /dev/null
+++ b/vp9/common/ppc/loopfilter_filters_altivec.asm
@@ -1,0 +1,1253 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+ .globl mbloop_filter_horizontal_edge_y_ppc
+ .globl loop_filter_horizontal_edge_y_ppc
+ .globl mbloop_filter_vertical_edge_y_ppc
+ .globl loop_filter_vertical_edge_y_ppc
+
+ .globl mbloop_filter_horizontal_edge_uv_ppc
+ .globl loop_filter_horizontal_edge_uv_ppc
+ .globl mbloop_filter_vertical_edge_uv_ppc
+ .globl loop_filter_vertical_edge_uv_ppc
+
+ .globl loop_filter_simple_horizontal_edge_ppc
+ .globl loop_filter_simple_vertical_edge_ppc
+
+ .text
+;# We often need to perform transposes (and other transpose-like operations)
+;# on matrices of data. This is simplified by the fact that we usually
+;# operate on hunks of data whose dimensions are powers of 2, or at least
+;# divisible by highish powers of 2.
+;#
+;# These operations can be very confusing. They become more straightforward
+;# when we think of them as permutations of address bits: Concatenate a
+;# group of vector registers and think of it as occupying a block of
+;# memory beginning at address zero. The low four bits 0...3 of the
+;# address then correspond to position within a register, the higher-order
+;# address bits select the register.
+;#
+;# Although register selection, at the code level, is arbitrary, things
+;# are simpler if we use contiguous ranges of register numbers, simpler
+;# still if the low-order bits of the register number correspond to
+;# conceptual address bits. We do this whenever reasonable.
+;#
+;# A 16x16 transpose can then be thought of as an operation on
+;# a 256-element block of memory. It takes 8 bits 0...7 to address this
+;# memory and the effect of a transpose is to interchange address bit
+;# 0 with 4, 1 with 5, 2 with 6, and 3 with 7. Bits 0...3 index the
+;# column, which is interchanged with the row addressed by bits 4..7.
+;#
+;# The altivec merge instructions provide a rapid means of effecting
+;# many of these transforms. They operate at three widths (8,16,32).
+;# Writing V(x) for vector register #x, paired merges permute address
+;# indices as follows.
+;#
+;# 0->1 1->2 2->3 3->(4+d) (4+s)->0:
+;#
+;# vmrghb V( x), V( y), V( y + (1<<s))
+;# vmrglb V( x + (1<<d)), V( y), V( y + (1<<s))
+;#
+;#
+;# =0= 1->2 2->3 3->(4+d) (4+s)->1:
+;#
+;# vmrghh V( x), V( y), V( y + (1<<s))
+;# vmrglh V( x + (1<<d)), V( y), V( y + (1<<s))
+;#
+;#
+;# =0= =1= 2->3 3->(4+d) (4+s)->2:
+;#
+;# vmrghw V( x), V( y), V( y + (1<<s))
+;# vmrglw V( x + (1<<d)), V( y), V( y + (1<<s))
+;#
+;#
+;# Unfortunately, there is no doubleword merge instruction.
+;# The following sequence uses "vperm" as a substitute.
+;# Assuming that the selection masks b_hihi and b_lolo (defined in LFppc.c)
+;# are in registers Vhihi and Vlolo, we can also effect the permutation
+;#
+;# =0= =1= =2= 3->(4+d) (4+s)->3 by the sequence:
+;#
+;# vperm V( x), V( y), V( y + (1<<s)), Vhihi
+;# vperm V( x + (1<<d)), V( y), V( y + (1<<s)), Vlolo
+;#
+;#
+;# Except for bits s and d, the other relationships between register
+;# number (= high-order part of address) bits are at the disposal of
+;# the programmer.
+;#
+
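+;# As a concrete instance of the byte-wide merges above: if V(0) holds bytes
+;# a0..a15 and V(8) holds b0..b15, then
+;#
+;# vmrghb V(16), V(0), V(8) ;# -> a0 b0 a1 b1 ... a7 b7
+;# vmrglb V(17), V(0), V(8) ;# -> a8 b8 a9 b9 ... a15 b15
+;#
+;# so, with s=3 and d=0, the old register-select bit becomes bit 0 of the
+;# result address, which is the 0->1 1->2 2->3 3->(4+d) (4+s)->0 relabelling.
+;#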
+;# To avoid excess transposes, we filter all 3 vertical luma subblock
+;# edges together. This requires a single 16x16 transpose, which, in
+;# the above language, amounts to the following permutation of address
+;# indices: 0<->4 1<->5 2<->6 3<->7, which we accomplish by
+;# 4 iterations of the cyclic transform 0->1->2->3->4->5->6->7->0.
+;#
+;# Except for the fact that the destination registers get written
+;# before we are done referencing the old contents, the cyclic transform
+;# is effected by
+;#
+;# x = 0; do {
+;# vmrghb V(2x), V(x), V(x+8);
+;# vmrghb V(2x+1), V(x), V(x+8);
+;# } while( ++x < 8);
+;#
+;# For clarity, and because we can afford it, we do this transpose
+;# using all 32 registers, alternating the banks 0..15 and 16 .. 31,
+;# leaving the final result in 16 .. 31, as the lower registers are
+;# used in the filtering itself.
+;#
+.macro Tpair A, B, X, Y
+ vmrghb \A, \X, \Y
+ vmrglb \B, \X, \Y
+.endm
+
+;# Each step takes 8*2 = 16 instructions
+
+.macro t16_even
+ Tpair v16,v17, v0,v8
+ Tpair v18,v19, v1,v9
+ Tpair v20,v21, v2,v10
+ Tpair v22,v23, v3,v11
+ Tpair v24,v25, v4,v12
+ Tpair v26,v27, v5,v13
+ Tpair v28,v29, v6,v14
+ Tpair v30,v31, v7,v15
+.endm
+
+.macro t16_odd
+ Tpair v0,v1, v16,v24
+ Tpair v2,v3, v17,v25
+ Tpair v4,v5, v18,v26
+ Tpair v6,v7, v19,v27
+ Tpair v8,v9, v20,v28
+ Tpair v10,v11, v21,v29
+ Tpair v12,v13, v22,v30
+ Tpair v14,v15, v23,v31
+.endm
+
+;# Whole transpose takes 4*16 = 64 instructions
+
+.macro t16_full
+ t16_odd
+ t16_even
+ t16_odd
+ t16_even
+.endm
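
A minimal scalar C model (illustrative only; it assumes nothing beyond the macros above) makes the address-bit argument easy to check: it mimics one vmrghb/vmrglb pair per Tpair, applies the four even/odd rounds of t16_full to a 16x16 byte matrix held in "registers" v16..v31, and verifies that the result is the transpose.

#include <stdio.h>
#include <string.h>

static unsigned char v[32][16];                 /* models vector registers v0..v31 */

/* One Tpair step: vmrghb into register a, vmrglb into register b. */
static void tpair(int a, int b, int x, int y) {
  unsigned char hi[16], lo[16];
  for (int i = 0; i < 8; i++) {
    hi[2 * i]     = v[x][i];                    /* vmrghb interleaves the high halves */
    hi[2 * i + 1] = v[y][i];
    lo[2 * i]     = v[x][i + 8];                /* vmrglb interleaves the low halves  */
    lo[2 * i + 1] = v[y][i + 8];
  }
  memcpy(v[a], hi, 16);
  memcpy(v[b], lo, 16);
}

static void t16_even(void) { for (int k = 0; k < 8; k++) tpair(16 + 2 * k, 17 + 2 * k, k, k + 8); }
static void t16_odd(void)  { for (int k = 0; k < 8; k++) tpair(2 * k, 2 * k + 1, 16 + k, 24 + k); }

int main(void) {
  /* Load element (r,c) = 16*r + c of a 16x16 matrix into v16..v31. */
  for (int r = 0; r < 16; r++)
    for (int c = 0; c < 16; c++)
      v[16 + r][c] = (unsigned char)(16 * r + c);

  t16_odd(); t16_even(); t16_odd(); t16_even(); /* t16_full */

  /* After four rounds v16..v31 should hold the transpose: (r,c) -> 16*c + r. */
  int ok = 1;
  for (int r = 0; r < 16; r++)
    for (int c = 0; c < 16; c++)
      ok &= (v[16 + r][c] == (unsigned char)(16 * c + r));
  printf("16x16 transpose %s\n", ok ? "verified" : "NOT a transpose");
  return 0;
}
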
+
+;# Vertical edge filtering requires transposes. For the simple filter,
+;# we need to convert 16 rows of 4 pels each into 4 registers of 16 pels
+;# each. Writing 0 ... 63 for the pixel indices, the desired result is:
+;#
+;# v0 = 0 1 ... 14 15
+;# v1 = 16 17 ... 30 31
+;#  v2 = 32 33 ... 46 47
+;#  v3 = 48 49 ... 62 63
+;#
+;# In frame-buffer memory, the layout is:
+;#
+;# 0 16 32 48
+;# 1 17 33 49
+;# ...
+;# 15 31 47 63.
+;#
+;# We begin by reading the data 32 bits at a time (using scalar operations)
+;# into a temporary array, reading the rows of the array into vector registers,
+;# with the following layout:
+;#
+;# v0 = 0 16 32 48 4 20 36 52 8 24 40 56 12 28 44 60
+;# v1 = 1 17 33 49 5 21 ... 45 61
+;# v2 = 2 18 ... 46 62
+;# v3 = 3 19 ... 47 63
+;#
+;# From the "address-bit" perspective discussed above, we simply need to
+;# interchange bits 0 <-> 4 and 1 <-> 5, leaving bits 2 and 3 alone.
+;# In other words, we transpose each of the four 4x4 submatrices.
+;#
+;# This transformation is its own inverse, and we need to perform it
+;# again before writing the pixels back into the frame buffer.
+;#
+;# It acts in place on registers v0...v3, uses v4...v7 as temporaries,
+;# and assumes that the b_hihi/b_lolo selectors defined above are passed
+;# in as the Vlo/Vhi arguments.  We think of both groups of 4 registers as having
+;# "addresses" {0,1,2,3} * 16.
+;#
+.macro Transpose4times4x4 Vlo, Vhi
+
+ ;# d=s=0 0->1 1->2 2->3 3->4 4->0 =5=
+
+ vmrghb v4, v0, v1
+ vmrglb v5, v0, v1
+ vmrghb v6, v2, v3
+ vmrglb v7, v2, v3
+
+ ;# d=0 s=1 =0= 1->2 2->3 3->4 4->5 5->1
+
+ vmrghh v0, v4, v6
+ vmrglh v1, v4, v6
+ vmrghh v2, v5, v7
+ vmrglh v3, v5, v7
+
+ ;# d=s=0 =0= =1= 2->3 3->4 4->2 =5=
+
+ vmrghw v4, v0, v1
+ vmrglw v5, v0, v1
+ vmrghw v6, v2, v3
+ vmrglw v7, v2, v3
+
+ ;# d=0 s=1 =0= =1= =2= 3->4 4->5 5->3
+
+ vperm v0, v4, v6, \Vlo
+ vperm v1, v4, v6, \Vhi
+ vperm v2, v5, v7, \Vlo
+ vperm v3, v5, v7, \Vhi
+.endm
+;# end Transpose4times4x4
+
+
+;# Normal mb vertical edge filter transpose.
+;#
+;# We read 8 columns of data, initially in the following pattern:
+;#
+;# (0,0) (1,0) ... (7,0) (0,1) (1,1) ... (7,1)
+;# (0,2) (1,2) ... (7,2) (0,3) (1,3) ... (7,3)
+;# ...
+;# (0,14) (1,14) .. (7,14) (0,15) (1,15) .. (7,15)
+;#
+;# and wish to convert to:
+;#
+;# (0,0) ... (0,15)
+;# (1,0) ... (1,15)
+;# ...
+;# (7,0) ... (7,15).
+;#
+;# In "address bit" language, we wish to map
+;#
+;# 0->4 1->5 2->6 3->0 4->1 5->2 6->3, i.e., I -> (I+4) mod 7.
+;#
+;# This can be accomplished by 4 iterations of the cyclic transform
+;#
+;# I -> (I+1) mod 7;
+;#
+;# each iteration can be realized by (d=0, s=2):
+;#
+;# x = 0; do Tpair( V(2x),V(2x+1), V(x),V(x+4)) while( ++x < 4);
+;#
+;# The input/output is in registers v0...v7. We use v10...v17 as mirrors;
+;# preserving v8 = sign converter.
+;#
+;# Inverse transpose is similar, except here I -> (I+3) mod 7 and the
+;# result lands in the "mirror" registers v10...v17
+;#
+.macro t8x16_odd
+ Tpair v10, v11, v0, v4
+ Tpair v12, v13, v1, v5
+ Tpair v14, v15, v2, v6
+ Tpair v16, v17, v3, v7
+.endm
+
+.macro t8x16_even
+ Tpair v0, v1, v10, v14
+ Tpair v2, v3, v11, v15
+ Tpair v4, v5, v12, v16
+ Tpair v6, v7, v13, v17
+.endm
+
+.macro transpose8x16_fwd
+ t8x16_odd
+ t8x16_even
+ t8x16_odd
+ t8x16_even
+.endm
+
+.macro transpose8x16_inv
+ t8x16_odd
+ t8x16_even
+ t8x16_odd
+.endm
+
+.macro Transpose16x16
+ vmrghb v0, v16, v24
+ vmrglb v1, v16, v24
+ vmrghb v2, v17, v25
+ vmrglb v3, v17, v25
+ vmrghb v4, v18, v26
+ vmrglb v5, v18, v26
+ vmrghb v6, v19, v27
+ vmrglb v7, v19, v27
+ vmrghb v8, v20, v28
+ vmrglb v9, v20, v28
+ vmrghb v10, v21, v29
+ vmrglb v11, v21, v29
+ vmrghb v12, v22, v30
+ vmrglb v13, v22, v30
+ vmrghb v14, v23, v31
+ vmrglb v15, v23, v31
+ vmrghb v16, v0, v8
+ vmrglb v17, v0, v8
+ vmrghb v18, v1, v9
+ vmrglb v19, v1, v9
+ vmrghb v20, v2, v10
+ vmrglb v21, v2, v10
+ vmrghb v22, v3, v11
+ vmrglb v23, v3, v11
+ vmrghb v24, v4, v12
+ vmrglb v25, v4, v12
+ vmrghb v26, v5, v13
+ vmrglb v27, v5, v13
+ vmrghb v28, v6, v14
+ vmrglb v29, v6, v14
+ vmrghb v30, v7, v15
+ vmrglb v31, v7, v15
+ vmrghb v0, v16, v24
+ vmrglb v1, v16, v24
+ vmrghb v2, v17, v25
+ vmrglb v3, v17, v25
+ vmrghb v4, v18, v26
+ vmrglb v5, v18, v26
+ vmrghb v6, v19, v27
+ vmrglb v7, v19, v27
+ vmrghb v8, v20, v28
+ vmrglb v9, v20, v28
+ vmrghb v10, v21, v29
+ vmrglb v11, v21, v29
+ vmrghb v12, v22, v30
+ vmrglb v13, v22, v30
+ vmrghb v14, v23, v31
+ vmrglb v15, v23, v31
+ vmrghb v16, v0, v8
+ vmrglb v17, v0, v8
+ vmrghb v18, v1, v9
+ vmrglb v19, v1, v9
+ vmrghb v20, v2, v10
+ vmrglb v21, v2, v10
+ vmrghb v22, v3, v11
+ vmrglb v23, v3, v11
+ vmrghb v24, v4, v12
+ vmrglb v25, v4, v12
+ vmrghb v26, v5, v13
+ vmrglb v27, v5, v13
+ vmrghb v28, v6, v14
+ vmrglb v29, v6, v14
+ vmrghb v30, v7, v15
+ vmrglb v31, v7, v15
+.endm
+
+;# load_g loads a global vector (whose address is in the local variable Gptr)
+;# into vector register Vreg. Trashes r0
+.macro load_g Vreg, Gptr
+ lwz r0, \Gptr
+ lvx \Vreg, 0, r0
+.endm
+
+;# Exploit saturation here: if a difference is negative it is clamped
+;# to 0, and ORing 0 with the positive difference taken in the other
+;# order yields the absolute value (abs).
+;# RES = abs( A-B), trashes TMP
+.macro Abs RES, TMP, A, B
+ vsububs \RES, \A, \B
+ vsububs \TMP, \B, \A
+ vor \RES, \RES, \TMP
+.endm
+
+;# RES = Max( RES, abs( A-B)), trashes TMP
+.macro max_abs RES, TMP, A, B
+ vsububs \TMP, \A, \B
+ vmaxub \RES, \RES, \TMP
+ vsububs \TMP, \B, \A
+ vmaxub \RES, \RES, \TMP
+.endm
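
The saturation trick is easy to state in scalar form. A small C sketch (hypothetical helper names) models vsububs per byte and shows why ORing the two saturated differences yields the absolute value; max_abs takes a running maximum of the two saturated differences directly, which is equivalent.

#include <stdio.h>

static unsigned char sat_sub(unsigned char a, unsigned char b) {
  return (unsigned char)(a > b ? a - b : 0);    /* models vsububs for one byte */
}

/* Abs: |a - b| with no compare or branch in the vector code. */
static unsigned char abs_diff(unsigned char a, unsigned char b) {
  return (unsigned char)(sat_sub(a, b) | sat_sub(b, a));
}

/* max_abs: running maximum of absolute differences (models vmaxub). */
static unsigned char max_abs(unsigned char res, unsigned char a, unsigned char b) {
  unsigned char d = abs_diff(a, b);
  return res > d ? res : d;
}

int main(void) {
  unsigned char m = abs_diff(200, 13);          /* 187 */
  m = max_abs(m, 40, 45);                       /* still 187 */
  printf("%u\n", m);
  return 0;
}
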
+
+.macro Masks
+ ;# build masks
+ ;# input is all 8 bit unsigned (0-255). need to
+ ;# do abs(vala-valb) > limit. but no need to compare each
+ ;# value to the limit. find the max of the absolute differences
+ ;# and compare that to the limit.
+ ;# First hev
+ Abs v14, v13, v2, v3 ;# |P1 - P0|
+ max_abs v14, v13, v5, v4 ;# |Q1 - Q0|
+
+ vcmpgtub v10, v14, v10 ;# HEV = true if thresh exceeded
+
+ ;# Next limit
+ max_abs v14, v13, v0, v1 ;# |P3 - P2|
+ max_abs v14, v13, v1, v2 ;# |P2 - P1|
+ max_abs v14, v13, v6, v5 ;# |Q2 - Q1|
+ max_abs v14, v13, v7, v6 ;# |Q3 - Q2|
+
+ vcmpgtub v9, v14, v9 ;# R = true if limit exceeded
+
+ ;# flimit
+ Abs v14, v13, v3, v4 ;# |P0 - Q0|
+
+ vcmpgtub v8, v14, v8 ;# X = true if flimit exceeded
+
+ vor v8, v8, v9 ;# R = true if flimit or limit exceeded
+ ;# done building masks
+.endm
+
+.macro build_constants RFL, RLI, RTH, FL, LI, TH
+ ;# build constants
+ lvx \FL, 0, \RFL ;# flimit
+ lvx \LI, 0, \RLI ;# limit
+ lvx \TH, 0, \RTH ;# thresh
+
+ vspltisb v11, 8
+ vspltisb v12, 4
+ vslb v11, v11, v12 ;# 0x80808080808080808080808080808080
+.endm
+
+.macro load_data_y
+ ;# setup strides/pointers to be able to access
+ ;# all of the data
+ add r5, r4, r4 ;# r5 = 2 * stride
+ sub r6, r3, r5 ;# r6 -> 2 rows back
+ neg r7, r4 ;# r7 = -stride
+
+ ;# load 16 pixels worth of data to work on
+ sub r0, r6, r5 ;# r0 -> 4 rows back (temp)
+ lvx v0, 0, r0 ;# P3 (read only)
+ lvx v1, r7, r6 ;# P2
+ lvx v2, 0, r6 ;# P1
+ lvx v3, r7, r3 ;# P0
+ lvx v4, 0, r3 ;# Q0
+ lvx v5, r4, r3 ;# Q1
+ lvx v6, r5, r3 ;# Q2
+ add r0, r3, r5 ;# r0 -> 2 rows fwd (temp)
+ lvx v7, r4, r0 ;# Q3 (read only)
+.endm
+
+;# Expects
+;# v10 == HEV
+;# v13 == tmp
+;# v14 == tmp
+.macro common_adjust P0, Q0, P1, Q1, HEV_PRESENT
+ vxor \P1, \P1, v11 ;# SP1
+ vxor \P0, \P0, v11 ;# SP0
+ vxor \Q0, \Q0, v11 ;# SQ0
+ vxor \Q1, \Q1, v11 ;# SQ1
+
+ vsubsbs v13, \P1, \Q1 ;# f = c (P1 - Q1)
+.if \HEV_PRESENT
+ vand v13, v13, v10 ;# f &= hev
+.endif
+ vsubsbs v14, \Q0, \P0 ;# -126 <= X = Q0-P0 <= +126
+ vaddsbs v13, v13, v14
+ vaddsbs v13, v13, v14
+ vaddsbs v13, v13, v14 ;# A = c( c(P1-Q1) + 3*(Q0-P0))
+
+ vandc v13, v13, v8 ;# f &= mask
+
+ vspltisb v8, 3
+ vspltisb v9, 4
+
+ vaddsbs v14, v13, v9 ;# f1 = c (f+4)
+ vaddsbs v15, v13, v8 ;# f2 = c (f+3)
+
+ vsrab v13, v14, v8 ;# f1 >>= 3
+ vsrab v15, v15, v8 ;# f2 >>= 3
+
+ vsubsbs \Q0, \Q0, v13 ;# u1 = c (SQ0 - f1)
+ vaddsbs \P0, \P0, v15 ;# u2 = c (SP0 + f2)
+.endm
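
A scalar C sketch of what common_adjust computes per pixel position (the names are illustrative, and the final unbias back to unsigned is folded in here for brevity; in the assembly the callers XOR back with v11):

#include <stdint.h>

static int8_t sat8(int v) {                     /* signed-byte saturation */
  return (int8_t)(v > 127 ? 127 : (v < -128 ? -128 : v));
}

/* apply_filter: 1 where the flimit/limit mask allows filtering.
 * hev: 1 where high edge variance was detected (only used if use_hev). */
static void common_adjust(uint8_t *p1, uint8_t *p0, uint8_t *q0, uint8_t *q1,
                          int use_hev, int hev, int apply_filter) {
  int8_t sp1 = (int8_t)(*p1 ^ 0x80), sp0 = (int8_t)(*p0 ^ 0x80);
  int8_t sq0 = (int8_t)(*q0 ^ 0x80), sq1 = (int8_t)(*q1 ^ 0x80);
  int8_t d = sat8(sq0 - sp0);

  int f = sat8(sp1 - sq1);                      /* f = c(P1 - Q1)              */
  if (use_hev && !hev)
    f = 0;                                      /* f &= hev                    */
  f = sat8(f + d);
  f = sat8(f + d);
  f = sat8(f + d);                              /* f = c( c(P1-Q1) + 3*(Q0-P0)) */
  if (!apply_filter)
    f = 0;                                      /* f &= mask                   */

  /* assumes arithmetic right shift of negative values (vsrab behaviour) */
  int f1 = sat8(f + 4) >> 3;
  int f2 = sat8(f + 3) >> 3;

  *q0 = (uint8_t)(sat8(sq0 - f1) ^ 0x80);       /* u1 = c(SQ0 - f1) */
  *p0 = (uint8_t)(sat8(sp0 + f2) ^ 0x80);       /* u2 = c(SP0 + f2) */
}
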
+
+.macro vp8_mbfilter
+ Masks
+
+    ;# start the filtering here
+ vxor v1, v1, v11 ;# SP2
+ vxor v2, v2, v11 ;# SP1
+ vxor v3, v3, v11 ;# SP0
+ vxor v4, v4, v11 ;# SQ0
+ vxor v5, v5, v11 ;# SQ1
+ vxor v6, v6, v11 ;# SQ2
+
+ ;# add outer taps if we have high edge variance
+ vsubsbs v13, v2, v5 ;# f = c (SP1-SQ1)
+
+ vsubsbs v14, v4, v3 ;# SQ0-SP0
+ vaddsbs v13, v13, v14
+ vaddsbs v13, v13, v14
+ vaddsbs v13, v13, v14 ;# f = c( c(SP1-SQ1) + 3*(SQ0-SP0))
+
+ vandc v13, v13, v8 ;# f &= mask
+ vand v15, v13, v10 ;# f2 = f & hev
+
+ ;# save bottom 3 bits so that we round one side +4 and the other +3
+ vspltisb v8, 3
+ vspltisb v9, 4
+
+ vaddsbs v14, v15, v9 ;# f1 = c (f+4)
+ vaddsbs v15, v15, v8 ;# f2 = c (f+3)
+
+ vsrab v14, v14, v8 ;# f1 >>= 3
+ vsrab v15, v15, v8 ;# f2 >>= 3
+
+ vsubsbs v4, v4, v14 ;# u1 = c (SQ0 - f1)
+ vaddsbs v3, v3, v15 ;# u2 = c (SP0 + f2)
+
+ ;# only apply wider filter if not high edge variance
+ vandc v13, v13, v10 ;# f &= ~hev
+
+ vspltisb v9, 2
+ vnor v8, v8, v8
+ vsrb v9, v8, v9 ;# 0x3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f
+ vupkhsb v9, v9 ;# 0x003f003f003f003f003f003f003f003f
+ vspltisb v8, 9
+
+ ;# roughly 1/7th difference across boundary
+ vspltish v10, 7
+ vmulosb v14, v8, v13 ;# A = c( c(P1-Q1) + 3*(Q0-P0))
+ vmulesb v15, v8, v13
+ vaddshs v14, v14, v9 ;# += 63
+ vaddshs v15, v15, v9
+ vsrah v14, v14, v10 ;# >>= 7
+ vsrah v15, v15, v10
+ vmrglh v10, v15, v14
+ vmrghh v15, v15, v14
+
+ vpkshss v10, v15, v10 ;# X = saturated down to bytes
+
+ vsubsbs v6, v6, v10 ;# subtract from Q and add to P
+ vaddsbs v1, v1, v10
+
+ vxor v6, v6, v11
+ vxor v1, v1, v11
+
+ ;# roughly 2/7th difference across boundary
+ vspltish v10, 7
+ vaddubm v12, v8, v8
+ vmulosb v14, v12, v13 ;# A = c( c(P1-Q1) + 3*(Q0-P0))
+ vmulesb v15, v12, v13
+ vaddshs v14, v14, v9
+ vaddshs v15, v15, v9
+ vsrah v14, v14, v10 ;# >>= 7
+ vsrah v15, v15, v10
+ vmrglh v10, v15, v14
+ vmrghh v15, v15, v14
+
+ vpkshss v10, v15, v10 ;# X = saturated down to bytes
+
+ vsubsbs v5, v5, v10 ;# subtract from Q and add to P
+ vaddsbs v2, v2, v10
+
+ vxor v5, v5, v11
+ vxor v2, v2, v11
+
+ ;# roughly 3/7th difference across boundary
+ vspltish v10, 7
+ vaddubm v12, v12, v8
+ vmulosb v14, v12, v13 ;# A = c( c(P1-Q1) + 3*(Q0-P0))
+ vmulesb v15, v12, v13
+ vaddshs v14, v14, v9
+ vaddshs v15, v15, v9
+ vsrah v14, v14, v10 ;# >>= 7
+ vsrah v15, v15, v10
+ vmrglh v10, v15, v14
+ vmrghh v15, v15, v14
+
+ vpkshss v10, v15, v10 ;# X = saturated down to bytes
+
+ vsubsbs v4, v4, v10 ;# subtract from Q and add to P
+ vaddsbs v3, v3, v10
+
+ vxor v4, v4, v11
+ vxor v3, v3, v11
+.endm
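
A worked example may help here (the filter value is hypothetical): with f = 60, the three wide-filter taps come out as (9*60 + 63) >> 7 = 4 applied at P2/Q2, (18*60 + 63) >> 7 = 8 at P1/Q1, and (27*60 + 63) >> 7 = 13 at P0/Q0, so the correction is strongest at the edge and tapers off two pixels away, in the 1:2:3 ratio behind the comments' rough 1/7th, 2/7th, 3/7th description.
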
+
+.macro SBFilter
+ Masks
+
+ common_adjust v3, v4, v2, v5, 1
+
+ ;# outer tap adjustments
+ vspltisb v8, 1
+
+ vaddubm v13, v13, v8 ;# f += 1
+ vsrab v13, v13, v8 ;# f >>= 1
+
+ vandc v13, v13, v10 ;# f &= ~hev
+
+ vsubsbs v5, v5, v13 ;# u1 = c (SQ1 - f)
+ vaddsbs v2, v2, v13 ;# u2 = c (SP1 + f)
+
+ vxor v2, v2, v11
+ vxor v3, v3, v11
+ vxor v4, v4, v11
+ vxor v5, v5, v11
+.endm
+
+ .align 2
+mbloop_filter_horizontal_edge_y_ppc:
+ mfspr r11, 256 ;# get old VRSAVE
+ oris r12, r11, 0xffff
+ mtspr 256, r12 ;# set VRSAVE
+
+ build_constants r5, r6, r7, v8, v9, v10
+
+ load_data_y
+
+ vp8_mbfilter
+
+ stvx v1, r7, r6 ;# P2
+ stvx v2, 0, r6 ;# P1
+ stvx v3, r7, r3 ;# P0
+ stvx v4, 0, r3 ;# Q0
+ stvx v5, r4, r3 ;# Q1
+ stvx v6, r5, r3 ;# Q2
+
+ mtspr 256, r11 ;# reset old VRSAVE
+
+ blr
+
+ .align 2
+;# r3 unsigned char *s
+;# r4 int p
+;# r5 const signed char *flimit
+;# r6 const signed char *limit
+;# r7 const signed char *thresh
+loop_filter_horizontal_edge_y_ppc:
+ mfspr r11, 256 ;# get old VRSAVE
+ oris r12, r11, 0xffff
+ mtspr 256, r12 ;# set VRSAVE
+
+ build_constants r5, r6, r7, v8, v9, v10
+
+ load_data_y
+
+ SBFilter
+
+ stvx v2, 0, r6 ;# P1
+ stvx v3, r7, r3 ;# P0
+ stvx v4, 0, r3 ;# Q0
+ stvx v5, r4, r3 ;# Q1
+
+ mtspr 256, r11 ;# reset old VRSAVE
+
+ blr
+
+;# Filtering a vertical mb. Each mb is aligned on a 16 byte boundary.
+;# So we can read in an entire mb aligned. However if we want to filter the mb
+;# edge we run into problems. For the loopfilter we require 4 bytes before the mb
+;#  and 4 after for a total of 8 bytes.  Reading 16 bytes in order to get 4 is a bit
+;# of a waste. So this is an even uglier way to get around that.
+;# Using the regular register file words are read in and then saved back out to
+;# memory to align and order them up. Then they are read in using the
+;# vector register file.
+.macro RLVmb V, R
+ lwzux r0, r3, r4
+ stw r0, 4(\R)
+ lwz r0,-4(r3)
+ stw r0, 0(\R)
+ lwzux r0, r3, r4
+ stw r0,12(\R)
+ lwz r0,-4(r3)
+ stw r0, 8(\R)
+ lvx \V, 0, \R
+.endm
+
+.macro WLVmb V, R
+ stvx \V, 0, \R
+ lwz r0,12(\R)
+ stwux r0, r3, r4
+ lwz r0, 8(\R)
+ stw r0,-4(r3)
+ lwz r0, 4(\R)
+ stwux r0, r3, r4
+ lwz r0, 0(\R)
+ stw r0,-4(r3)
+.endm
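
A hypothetical C sketch of the staging idea behind RLVmb/WLVmb (illustrative names only): the eight bytes straddling the edge in each of two rows are gathered with scalar loads into a small aligned buffer, which a single aligned lvx can then pick up; WLVmb reverses the process on the way out.

#include <stdint.h>
#include <string.h>

/* Gather two rows of 4 pels before and 4 pels after the vertical edge
 * into a 16-byte staging buffer that one aligned vector load can read.
 * 'edge' points at the first pel after the edge in the current row.    */
static void stage_two_rows(const uint8_t *edge, int stride,
                           uint8_t staging[16]) {
  memcpy(staging + 0, edge - 4,          8);    /* row n:   P3..P0 Q0..Q3 */
  memcpy(staging + 8, edge - 4 + stride, 8);    /* row n+1: P3..P0 Q0..Q3 */
}
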
+
+ .align 2
+;# r3 unsigned char *s
+;# r4 int p
+;# r5 const signed char *flimit
+;# r6 const signed char *limit
+;# r7 const signed char *thresh
+mbloop_filter_vertical_edge_y_ppc:
+ mfspr r11, 256 ;# get old VRSAVE
+ oris r12, r11, 0xffff
+ ori r12, r12, 0xc000
+ mtspr 256, r12 ;# set VRSAVE
+
+ la r9, -48(r1) ;# temporary space for reading in vectors
+ sub r3, r3, r4
+
+ RLVmb v0, r9
+ RLVmb v1, r9
+ RLVmb v2, r9
+ RLVmb v3, r9
+ RLVmb v4, r9
+ RLVmb v5, r9
+ RLVmb v6, r9
+ RLVmb v7, r9
+
+ transpose8x16_fwd
+
+ build_constants r5, r6, r7, v8, v9, v10
+
+ vp8_mbfilter
+
+ transpose8x16_inv
+
+ add r3, r3, r4
+ neg r4, r4
+
+ WLVmb v17, r9
+ WLVmb v16, r9
+ WLVmb v15, r9
+ WLVmb v14, r9
+ WLVmb v13, r9
+ WLVmb v12, r9
+ WLVmb v11, r9
+ WLVmb v10, r9
+
+ mtspr 256, r11 ;# reset old VRSAVE
+
+ blr
+
+.macro RL V, R, P
+ lvx \V, 0, \R
+ add \R, \R, \P
+.endm
+
+.macro WL V, R, P
+ stvx \V, 0, \R
+ add \R, \R, \P
+.endm
+
+.macro Fil P3, P2, P1, P0, Q0, Q1, Q2, Q3
+ ;# K = |P0-P1| already
+ Abs v14, v13, \Q0, \Q1 ;# M = |Q0-Q1|
+ vmaxub v14, v14, v4 ;# M = max( |P0-P1|, |Q0-Q1|)
+ vcmpgtub v10, v14, v0
+
+    Abs     v4, v5, \Q2, \Q3              ;# K = |Q2-Q3| = next |P0-P1|
+
+ max_abs v14, v13, \Q1, \Q2 ;# M = max( M, |Q1-Q2|)
+ max_abs v14, v13, \P1, \P2 ;# M = max( M, |P1-P2|)
+ max_abs v14, v13, \P2, \P3 ;# M = max( M, |P2-P3|)
+
+ vmaxub v14, v14, v4 ;# M = max interior abs diff
+ vcmpgtub v9, v14, v2 ;# M = true if int_l exceeded
+
+ Abs v14, v13, \P0, \Q0 ;# X = Abs( P0-Q0)
+ vcmpgtub v8, v14, v3 ;# X = true if edge_l exceeded
+ vor v8, v8, v9 ;# M = true if edge_l or int_l exceeded
+
+ ;# replace P1,Q1 w/signed versions
+ common_adjust \P0, \Q0, \P1, \Q1, 1
+
+ vaddubm v13, v13, v1 ;# -16 <= M <= 15, saturation irrelevant
+ vsrab v13, v13, v1
+ vandc v13, v13, v10 ;# adjust P1,Q1 by (M+1)>>1 if ! hev
+ vsubsbs \Q1, \Q1, v13
+ vaddsbs \P1, \P1, v13
+
+ vxor \P1, \P1, v11 ;# P1
+ vxor \P0, \P0, v11 ;# P0
+ vxor \Q0, \Q0, v11 ;# Q0
+ vxor \Q1, \Q1, v11 ;# Q1
+.endm
+
+
+ .align 2
+;# r3 unsigned char *s
+;# r4 int p
+;# r5 const signed char *flimit
+;# r6 const signed char *limit
+;# r7 const signed char *thresh
+loop_filter_vertical_edge_y_ppc:
+ mfspr r11, 256 ;# get old VRSAVE
+ oris r12, r11, 0xffff
+ ori r12, r12, 0xffff
+ mtspr 256, r12 ;# set VRSAVE
+
+ addi r9, r3, 0
+ RL v16, r9, r4
+ RL v17, r9, r4
+ RL v18, r9, r4
+ RL v19, r9, r4
+ RL v20, r9, r4
+ RL v21, r9, r4
+ RL v22, r9, r4
+ RL v23, r9, r4
+ RL v24, r9, r4
+ RL v25, r9, r4
+ RL v26, r9, r4
+ RL v27, r9, r4
+ RL v28, r9, r4
+ RL v29, r9, r4
+ RL v30, r9, r4
+ lvx v31, 0, r9
+
+ Transpose16x16
+
+ vspltisb v1, 1
+
+ build_constants r5, r6, r7, v3, v2, v0
+
+    Abs v4, v5, v19, v18                  ;# K(v4) = first |P0-P1|
+
+ Fil v16, v17, v18, v19, v20, v21, v22, v23
+ Fil v20, v21, v22, v23, v24, v25, v26, v27
+ Fil v24, v25, v26, v27, v28, v29, v30, v31
+
+ Transpose16x16
+
+ addi r9, r3, 0
+ WL v16, r9, r4
+ WL v17, r9, r4
+ WL v18, r9, r4
+ WL v19, r9, r4
+ WL v20, r9, r4
+ WL v21, r9, r4
+ WL v22, r9, r4
+ WL v23, r9, r4
+ WL v24, r9, r4
+ WL v25, r9, r4
+ WL v26, r9, r4
+ WL v27, r9, r4
+ WL v28, r9, r4
+ WL v29, r9, r4
+ WL v30, r9, r4
+ stvx v31, 0, r9
+
+ mtspr 256, r11 ;# reset old VRSAVE
+
+ blr
+
+;# -=-=-=-=-=-=-=-=-=-=-=-=-=-=-=- UV FILTERING -=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
+.macro active_chroma_sel V
+ andi. r7, r3, 8 ;# row origin modulo 16
+ add r7, r7, r7 ;# selects selectors
+ lis r12, _chromaSelectors@ha
+ la r0, _chromaSelectors@l(r12)
+ lwzux r0, r7, r0 ;# leave selector addr in r7
+
+ lvx \V, 0, r0 ;# mask to concatenate active U,V pels
+.endm
+
+.macro hread_uv Dest, U, V, Offs, VMask
+ lvx \U, \Offs, r3
+ lvx \V, \Offs, r4
+ vperm \Dest, \U, \V, \VMask ;# Dest = active part of U then V
+.endm
+
+.macro hwrite_uv New, U, V, Offs, Umask, Vmask
+ vperm \U, \New, \U, \Umask ;# Combine new pels with siblings
+ vperm \V, \New, \V, \Vmask
+ stvx \U, \Offs, r3 ;# Write to frame buffer
+ stvx \V, \Offs, r4
+.endm
+
+;# Process U,V in parallel.
+.macro load_chroma_h
+ neg r9, r5 ;# r9 = -1 * stride
+ add r8, r9, r9 ;# r8 = -2 * stride
+ add r10, r5, r5 ;# r10 = 2 * stride
+
+ active_chroma_sel v12
+
+ ;# P3, Q3 are read-only; need not save addresses or sibling pels
+ add r6, r8, r8 ;# r6 = -4 * stride
+ hread_uv v0, v14, v15, r6, v12
+ add r6, r10, r5 ;# r6 = 3 * stride
+ hread_uv v7, v14, v15, r6, v12
+
+ ;# Others are read/write; save addresses and sibling pels
+
+ add r6, r8, r9 ;# r6 = -3 * stride
+ hread_uv v1, v16, v17, r6, v12
+ hread_uv v2, v18, v19, r8, v12
+ hread_uv v3, v20, v21, r9, v12
+ hread_uv v4, v22, v23, 0, v12
+ hread_uv v5, v24, v25, r5, v12
+ hread_uv v6, v26, v27, r10, v12
+.endm
+
+.macro uresult_sel V
+ load_g \V, 4(r7)
+.endm
+
+.macro vresult_sel V
+ load_g \V, 8(r7)
+.endm
+
+;# always write P1,P0,Q0,Q1
+.macro store_chroma_h
+ uresult_sel v11
+ vresult_sel v12
+ hwrite_uv v2, v18, v19, r8, v11, v12
+ hwrite_uv v3, v20, v21, r9, v11, v12
+ hwrite_uv v4, v22, v23, 0, v11, v12
+ hwrite_uv v5, v24, v25, r5, v11, v12
+.endm
+
+ .align 2
+;# r3 unsigned char *u
+;# r4 unsigned char *v
+;# r5 int p
+;# r6 const signed char *flimit
+;# r7 const signed char *limit
+;# r8 const signed char *thresh
+mbloop_filter_horizontal_edge_uv_ppc:
+ mfspr r11, 256 ;# get old VRSAVE
+ oris r12, r11, 0xffff
+ ori r12, r12, 0xffff
+ mtspr 256, r12 ;# set VRSAVE
+
+ build_constants r6, r7, r8, v8, v9, v10
+
+ load_chroma_h
+
+ vp8_mbfilter
+
+ store_chroma_h
+
+ hwrite_uv v1, v16, v17, r6, v11, v12 ;# v1 == P2
+ hwrite_uv v6, v26, v27, r10, v11, v12 ;# v6 == Q2
+
+ mtspr 256, r11 ;# reset old VRSAVE
+
+ blr
+
+ .align 2
+;# r3 unsigned char *u
+;# r4 unsigned char *v
+;# r5 int p
+;# r6 const signed char *flimit
+;# r7 const signed char *limit
+;# r8 const signed char *thresh
+loop_filter_horizontal_edge_uv_ppc:
+ mfspr r11, 256 ;# get old VRSAVE
+ oris r12, r11, 0xffff
+ ori r12, r12, 0xffff
+ mtspr 256, r12 ;# set VRSAVE
+
+ build_constants r6, r7, r8, v8, v9, v10
+
+ load_chroma_h
+
+ SBFilter
+
+ store_chroma_h
+
+ mtspr 256, r11 ;# reset old VRSAVE
+
+ blr
+
+.macro R V, R
+ lwzux r0, r3, r5
+ stw r0, 4(\R)
+ lwz r0,-4(r3)
+ stw r0, 0(\R)
+ lwzux r0, r4, r5
+ stw r0,12(\R)
+ lwz r0,-4(r4)
+ stw r0, 8(\R)
+ lvx \V, 0, \R
+.endm
+
+
+.macro W V, R
+ stvx \V, 0, \R
+ lwz r0,12(\R)
+ stwux r0, r4, r5
+ lwz r0, 8(\R)
+ stw r0,-4(r4)
+ lwz r0, 4(\R)
+ stwux r0, r3, r5
+ lwz r0, 0(\R)
+ stw r0,-4(r3)
+.endm
+
+.macro chroma_vread R
+ sub r3, r3, r5 ;# back up one line for simplicity
+ sub r4, r4, r5
+
+ R v0, \R
+ R v1, \R
+ R v2, \R
+ R v3, \R
+ R v4, \R
+ R v5, \R
+ R v6, \R
+ R v7, \R
+
+ transpose8x16_fwd
+.endm
+
+.macro chroma_vwrite R
+
+ transpose8x16_inv
+
+ add r3, r3, r5
+ add r4, r4, r5
+ neg r5, r5 ;# Write rows back in reverse order
+
+ W v17, \R
+ W v16, \R
+ W v15, \R
+ W v14, \R
+ W v13, \R
+ W v12, \R
+ W v11, \R
+ W v10, \R
+.endm
+
+ .align 2
+;# r3 unsigned char *u
+;# r4 unsigned char *v
+;# r5 int p
+;# r6 const signed char *flimit
+;# r7 const signed char *limit
+;# r8 const signed char *thresh
+mbloop_filter_vertical_edge_uv_ppc:
+ mfspr r11, 256 ;# get old VRSAVE
+ oris r12, r11, 0xffff
+ ori r12, r12, 0xc000
+ mtspr 256, r12 ;# set VRSAVE
+
+ la r9, -48(r1) ;# temporary space for reading in vectors
+
+ chroma_vread r9
+
+ build_constants r6, r7, r8, v8, v9, v10
+
+ vp8_mbfilter
+
+ chroma_vwrite r9
+
+ mtspr 256, r11 ;# reset old VRSAVE
+
+ blr
+
+ .align 2
+;# r3 unsigned char *u
+;# r4 unsigned char *v
+;# r5 int p
+;# r6 const signed char *flimit
+;# r7 const signed char *limit
+;# r8 const signed char *thresh
+loop_filter_vertical_edge_uv_ppc:
+ mfspr r11, 256 ;# get old VRSAVE
+ oris r12, r11, 0xffff
+ ori r12, r12, 0xc000
+ mtspr 256, r12 ;# set VRSAVE
+
+ la r9, -48(r1) ;# temporary space for reading in vectors
+
+ chroma_vread r9
+
+ build_constants r6, r7, r8, v8, v9, v10
+
+ SBFilter
+
+ chroma_vwrite r9
+
+ mtspr 256, r11 ;# reset old VRSAVE
+
+ blr
+
+;# -=-=-=-=-=-=-=-=-=-=-=-=-=-= SIMPLE LOOP FILTER =-=-=-=-=-=-=-=-=-=-=-=-=-=-
+
+.macro vp8_simple_filter
+ Abs v14, v13, v1, v2 ;# M = abs( P0 - Q0)
+    vcmpgtub v8, v14, v8       ;# v8 = true if _over_ limit
+
+ ;# preserve unsigned v0 and v3
+ common_adjust v1, v2, v0, v3, 0
+
+ vxor v1, v1, v11
+ vxor v2, v2, v11 ;# cvt Q0, P0 back to pels
+.endm
+
+.macro simple_vertical
+ addi r8, 0, 16
+ addi r7, r5, 32
+
+ lvx v0, 0, r5
+ lvx v1, r8, r5
+ lvx v2, 0, r7
+ lvx v3, r8, r7
+
+ lis r12, _B_hihi@ha
+ la r0, _B_hihi@l(r12)
+ lvx v16, 0, r0
+
+ lis r12, _B_lolo@ha
+ la r0, _B_lolo@l(r12)
+ lvx v17, 0, r0
+
+ Transpose4times4x4 v16, v17
+ vp8_simple_filter
+
+ vxor v0, v0, v11
+ vxor v3, v3, v11 ;# cvt Q0, P0 back to pels
+
+ Transpose4times4x4 v16, v17
+
+ stvx v0, 0, r5
+ stvx v1, r8, r5
+ stvx v2, 0, r7
+ stvx v3, r8, r7
+.endm
+
+ .align 2
+;# r3 unsigned char *s
+;# r4 int p
+;# r5 const signed char *flimit
+loop_filter_simple_horizontal_edge_ppc:
+ mfspr r11, 256 ;# get old VRSAVE
+ oris r12, r11, 0xffff
+ mtspr 256, r12 ;# set VRSAVE
+
+ ;# build constants
+ lvx v8, 0, r5 ;# flimit
+
+ vspltisb v11, 8
+ vspltisb v12, 4
+ vslb v11, v11, v12 ;# 0x80808080808080808080808080808080
+
+ neg r5, r4 ;# r5 = -1 * stride
+ add r6, r5, r5 ;# r6 = -2 * stride
+
+ lvx v0, r6, r3 ;# v0 = P1 = 16 pels two rows above edge
+ lvx v1, r5, r3 ;# v1 = P0 = 16 pels one row above edge
+ lvx v2, 0, r3 ;# v2 = Q0 = 16 pels one row below edge
+ lvx v3, r4, r3 ;# v3 = Q1 = 16 pels two rows below edge
+
+ vp8_simple_filter
+
+ stvx v1, r5, r3 ;# store P0
+ stvx v2, 0, r3 ;# store Q0
+
+ mtspr 256, r11 ;# reset old VRSAVE
+
+ blr
+
+.macro RLV Offs
+ stw r0, (\Offs*4)(r5)
+ lwzux r0, r7, r4
+.endm
+
+.macro WLV Offs
+ lwz r0, (\Offs*4)(r5)
+ stwux r0, r7, r4
+.endm
+
+ .align 2
+;# r3 unsigned char *s
+;# r4 int p
+;# r5 const signed char *flimit
+loop_filter_simple_vertical_edge_ppc:
+ mfspr r11, 256 ;# get old VRSAVE
+ oris r12, r11, 0xffff
+ ori r12, r12, 0xc000
+ mtspr 256, r12 ;# set VRSAVE
+
+ ;# build constants
+ lvx v8, 0, r5 ;# flimit
+
+ vspltisb v11, 8
+ vspltisb v12, 4
+ vslb v11, v11, v12 ;# 0x80808080808080808080808080808080
+
+ la r5, -96(r1) ;# temporary space for reading in vectors
+
+ ;# Store 4 pels at word "Offs" in temp array, then advance r7
+ ;# to next row and read another 4 pels from the frame buffer.
+
+ subi r7, r3, 2 ;# r7 -> 2 pels before start
+ lwzx r0, 0, r7 ;# read first 4 pels
+
+ ;# 16 unaligned word accesses
+ RLV 0
+ RLV 4
+ RLV 8
+ RLV 12
+ RLV 1
+ RLV 5
+ RLV 9
+ RLV 13
+ RLV 2
+ RLV 6
+ RLV 10
+ RLV 14
+ RLV 3
+ RLV 7
+ RLV 11
+
+ stw r0, (15*4)(r5) ;# write last 4 pels
+
+ simple_vertical
+
+ ;# Read temp array, write frame buffer.
+ subi r7, r3, 2 ;# r7 -> 2 pels before start
+ lwzx r0, 0, r5 ;# read/write first 4 pels
+ stwx r0, 0, r7
+
+ WLV 4
+ WLV 8
+ WLV 12
+ WLV 1
+ WLV 5
+ WLV 9
+ WLV 13
+ WLV 2
+ WLV 6
+ WLV 10
+ WLV 14
+ WLV 3
+ WLV 7
+ WLV 11
+ WLV 15
+
+ mtspr 256, r11 ;# reset old VRSAVE
+
+ blr
+
+ .data
+
+_chromaSelectors:
+ .long _B_hihi
+ .long _B_Ures0
+ .long _B_Vres0
+ .long 0
+ .long _B_lolo
+ .long _B_Ures8
+ .long _B_Vres8
+ .long 0
+
+ .align 4
+_B_Vres8:
+ .byte 16, 17, 18, 19, 20, 21, 22, 23, 8, 9, 10, 11, 12, 13, 14, 15
+
+ .align 4
+_B_Ures8:
+ .byte 16, 17, 18, 19, 20, 21, 22, 23, 0, 1, 2, 3, 4, 5, 6, 7
+
+ .align 4
+_B_lolo:
+ .byte 8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31
+
+ .align 4
+_B_Vres0:
+ .byte 8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31
+ .align 4
+_B_Ures0:
+ .byte 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
+
+ .align 4
+_B_hihi:
+ .byte 0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23
--- /dev/null
+++ b/vp9/common/ppc/platform_altivec.asm
@@ -1,0 +1,59 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+ .globl save_platform_context
+ .globl restore_platform_context
+
+.macro W V P
+ stvx \V, 0, \P
+ addi \P, \P, 16
+.endm
+
+.macro R V P
+ lvx \V, 0, \P
+ addi \P, \P, 16
+.endm
+
+;# r3 context_ptr
+ .align 2
+save_platform_context:
+ W v20, r3
+ W v21, r3
+ W v22, r3
+ W v23, r3
+ W v24, r3
+ W v25, r3
+ W v26, r3
+ W v27, r3
+ W v28, r3
+ W v29, r3
+ W v30, r3
+ W v31, r3
+
+ blr
+
+;# r3 context_ptr
+ .align 2
+restore_platform_context:
+ R v20, r3
+ R v21, r3
+ R v22, r3
+ R v23, r3
+ R v24, r3
+ R v25, r3
+ R v26, r3
+ R v27, r3
+ R v28, r3
+ R v29, r3
+ R v30, r3
+ R v31, r3
+
+ blr
--- /dev/null
+++ b/vp9/common/ppc/recon_altivec.asm
@@ -1,0 +1,175 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+ .globl recon4b_ppc
+ .globl recon2b_ppc
+ .globl recon_b_ppc
+
+.macro row_of16 Diff Pred Dst Stride
+ lvx v1, 0, \Pred ;# v1 = pred = p0..p15
+ addi \Pred, \Pred, 16 ;# next pred
+ vmrghb v2, v0, v1 ;# v2 = 16-bit p0..p7
+ lvx v3, 0, \Diff ;# v3 = d0..d7
+ vaddshs v2, v2, v3 ;# v2 = r0..r7
+ vmrglb v1, v0, v1 ;# v1 = 16-bit p8..p15
+ lvx v3, r8, \Diff ;# v3 = d8..d15
+ addi \Diff, \Diff, 32 ;# next diff
+ vaddshs v3, v3, v1 ;# v3 = r8..r15
+ vpkshus v2, v2, v3 ;# v2 = 8-bit r0..r15
+ stvx v2, 0, \Dst ;# to dst
+ add \Dst, \Dst, \Stride ;# next dst
+.endm
+
+ .text
+ .align 2
+;# r3 = short *diff_ptr,
+;# r4 = unsigned char *pred_ptr,
+;# r5 = unsigned char *dst_ptr,
+;# r6 = int stride
+recon4b_ppc:
+ mfspr r0, 256 ;# get old VRSAVE
+ stw r0, -8(r1) ;# save old VRSAVE to stack
+ oris r0, r0, 0xf000
+ mtspr 256,r0 ;# set VRSAVE
+
+ vxor v0, v0, v0
+ li r8, 16
+
+ row_of16 r3, r4, r5, r6
+ row_of16 r3, r4, r5, r6
+ row_of16 r3, r4, r5, r6
+ row_of16 r3, r4, r5, r6
+
+ lwz r12, -8(r1) ;# restore old VRSAVE from stack
+ mtspr 256, r12 ;# reset old VRSAVE
+
+ blr
+
+.macro two_rows_of8 Diff Pred Dst Stride write_first_four_pels
+ lvx v1, 0, \Pred ;# v1 = pred = p0..p15
+ vmrghb v2, v0, v1 ;# v2 = 16-bit p0..p7
+ lvx v3, 0, \Diff ;# v3 = d0..d7
+ vaddshs v2, v2, v3 ;# v2 = r0..r7
+ vmrglb v1, v0, v1 ;# v1 = 16-bit p8..p15
+    lvx     v3, r8, \Diff       ;# v3 = d8..d15
+ vaddshs v3, v3, v1 ;# v3 = r8..r15
+    vpkshus v2, v2, v3          ;# v2 = 8-bit r0..r15
+ stvx v2, 0, r10 ;# 2 rows to dst from buf
+ lwz r0, 0(r10)
+.if \write_first_four_pels
+ stw r0, 0(\Dst)
+.else
+ stwux r0, \Dst, \Stride
+.endif
+ lwz r0, 4(r10)
+ stw r0, 4(\Dst)
+ lwz r0, 8(r10)
+ stwux r0, \Dst, \Stride ;# advance dst to next row
+ lwz r0, 12(r10)
+ stw r0, 4(\Dst)
+.endm
+
+ .align 2
+;# r3 = short *diff_ptr,
+;# r4 = unsigned char *pred_ptr,
+;# r5 = unsigned char *dst_ptr,
+;# r6 = int stride
+
+recon2b_ppc:
+ mfspr r0, 256 ;# get old VRSAVE
+ stw r0, -8(r1) ;# save old VRSAVE to stack
+ oris r0, r0, 0xf000
+ mtspr 256,r0 ;# set VRSAVE
+
+ vxor v0, v0, v0
+ li r8, 16
+
+ la r10, -48(r1) ;# buf
+
+ two_rows_of8 r3, r4, r5, r6, 1
+
+ addi r4, r4, 16; ;# next pred
+ addi r3, r3, 32; ;# next diff
+
+ two_rows_of8 r3, r4, r5, r6, 0
+
+ lwz r12, -8(r1) ;# restore old VRSAVE from stack
+ mtspr 256, r12 ;# reset old VRSAVE
+
+ blr
+
+.macro get_two_diff_rows
+ stw r0, 0(r10)
+ lwz r0, 4(r3)
+ stw r0, 4(r10)
+ lwzu r0, 32(r3)
+ stw r0, 8(r10)
+ lwz r0, 4(r3)
+ stw r0, 12(r10)
+ lvx v3, 0, r10
+.endm
+
+ .align 2
+;# r3 = short *diff_ptr,
+;# r4 = unsigned char *pred_ptr,
+;# r5 = unsigned char *dst_ptr,
+;# r6 = int stride
+recon_b_ppc:
+ mfspr r0, 256 ;# get old VRSAVE
+ stw r0, -8(r1) ;# save old VRSAVE to stack
+ oris r0, r0, 0xf000
+ mtspr 256,r0 ;# set VRSAVE
+
+ vxor v0, v0, v0
+
+ la r10, -48(r1) ;# buf
+
+ lwz r0, 0(r4)
+ stw r0, 0(r10)
+ lwz r0, 16(r4)
+ stw r0, 4(r10)
+ lwz r0, 32(r4)
+ stw r0, 8(r10)
+ lwz r0, 48(r4)
+ stw r0, 12(r10)
+
+ lvx v1, 0, r10; ;# v1 = pred = p0..p15
+
+ lwz r0, 0(r3) ;# v3 = d0..d7
+
+ get_two_diff_rows
+
+ vmrghb v2, v0, v1; ;# v2 = 16-bit p0..p7
+ vaddshs v2, v2, v3; ;# v2 = r0..r7
+
+ lwzu r0, 32(r3) ;# v3 = d8..d15
+
+ get_two_diff_rows
+
+ vmrglb v1, v0, v1; ;# v1 = 16-bit p8..p15
+ vaddshs v3, v3, v1; ;# v3 = r8..r15
+
+ vpkshus v2, v2, v3; ;# v2 = 8-bit r0..r15
+ stvx v2, 0, r10; ;# 16 pels to dst from buf
+
+ lwz r0, 0(r10)
+ stw r0, 0(r5)
+ lwz r0, 4(r10)
+ stwux r0, r5, r6
+ lwz r0, 8(r10)
+ stwux r0, r5, r6
+ lwz r0, 12(r10)
+ stwx r0, r5, r6
+
+ lwz r12, -8(r1) ;# restore old VRSAVE from stack
+ mtspr 256, r12 ;# reset old VRSAVE
+
+ blr
--- /dev/null
+++ b/vp9/common/ppc/systemdependent.c
@@ -1,0 +1,167 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "subpixel.h"
+#include "loopfilter.h"
+#include "recon.h"
+#include "idct.h"
+#include "onyxc_int.h"
+
+void (*vp8_short_idct4x4)(short *input, short *output, int pitch);
+void (*vp8_short_idct4x4_1)(short *input, short *output, int pitch);
+void (*vp8_dc_only_idct)(short input_dc, short *output, int pitch);
+
+extern void (*vp9_post_proc_down_and_across)(
+ unsigned char *src_ptr,
+ unsigned char *dst_ptr,
+ int src_pixels_per_line,
+ int dst_pixels_per_line,
+ int rows,
+ int cols,
+ int flimit
+);
+
+extern void (*vp9_mbpost_proc_down)(unsigned char *dst, int pitch, int rows, int cols, int flimit);
+extern void vp9_mbpost_proc_down_c(unsigned char *dst, int pitch, int rows, int cols, int flimit);
+extern void (*vp9_mbpost_proc_across_ip)(unsigned char *src, int pitch, int rows, int cols, int flimit);
+extern void vp9_mbpost_proc_across_ip_c(unsigned char *src, int pitch, int rows, int cols, int flimit);
+
+extern void vp9_post_proc_down_and_across_c
+(
+ unsigned char *src_ptr,
+ unsigned char *dst_ptr,
+ int src_pixels_per_line,
+ int dst_pixels_per_line,
+ int rows,
+ int cols,
+ int flimit
+);
+void vp9_plane_add_noise_c(unsigned char *Start, unsigned int Width, unsigned int Height, int Pitch, int q, int a);
+
+extern copy_mem_block_function *vp9_copy_mem16x16;
+extern copy_mem_block_function *vp9_copy_mem8x8;
+extern copy_mem_block_function *vp9_copy_mem8x4;
+
+// PPC
+extern subpixel_predict_function sixtap_predict_ppc;
+extern subpixel_predict_function sixtap_predict8x4_ppc;
+extern subpixel_predict_function sixtap_predict8x8_ppc;
+extern subpixel_predict_function sixtap_predict16x16_ppc;
+extern subpixel_predict_function bilinear_predict4x4_ppc;
+extern subpixel_predict_function bilinear_predict8x4_ppc;
+extern subpixel_predict_function bilinear_predict8x8_ppc;
+extern subpixel_predict_function bilinear_predict16x16_ppc;
+
+extern copy_mem_block_function copy_mem16x16_ppc;
+
+void recon_b_ppc(short *diff_ptr, unsigned char *pred_ptr, unsigned char *dst_ptr, int stride);
+void recon2b_ppc(short *diff_ptr, unsigned char *pred_ptr, unsigned char *dst_ptr, int stride);
+void recon4b_ppc(short *diff_ptr, unsigned char *pred_ptr, unsigned char *dst_ptr, int stride);
+
+extern void short_idct4x4llm_ppc(short *input, short *output, int pitch);
+
+// Generic C
+extern subpixel_predict_function vp9_sixtap_predict_c;
+extern subpixel_predict_function vp9_sixtap_predict8x4_c;
+extern subpixel_predict_function vp9_sixtap_predict8x8_c;
+extern subpixel_predict_function vp9_sixtap_predict16x16_c;
+extern subpixel_predict_function vp9_bilinear_predict4x4_c;
+extern subpixel_predict_function vp9_bilinear_predict8x4_c;
+extern subpixel_predict_function vp9_bilinear_predict8x8_c;
+extern subpixel_predict_function vp9_bilinear_predict16x16_c;
+
+extern copy_mem_block_function vp9_copy_mem16x16_c;
+extern copy_mem_block_function vp9_copy_mem8x8_c;
+extern copy_mem_block_function vp9_copy_mem8x4_c;
+
+void vp9_recon_b_c(short *diff_ptr, unsigned char *pred_ptr, unsigned char *dst_ptr, int stride);
+void vp9_recon2b_c(short *diff_ptr, unsigned char *pred_ptr, unsigned char *dst_ptr, int stride);
+void vp9_recon4b_c(short *diff_ptr, unsigned char *pred_ptr, unsigned char *dst_ptr, int stride);
+
+extern void vp9_short_idct4x4llm_1_c(short *input, short *output, int pitch);
+extern void vp9_short_idct4x4llm_c(short *input, short *output, int pitch);
+extern void vp8_dc_only_idct_c(short input_dc, short *output, int pitch);
+
+// PPC
+extern loop_filter_block_function loop_filter_mbv_ppc;
+extern loop_filter_block_function loop_filter_bv_ppc;
+extern loop_filter_block_function loop_filter_mbh_ppc;
+extern loop_filter_block_function loop_filter_bh_ppc;
+
+extern loop_filter_block_function loop_filter_mbvs_ppc;
+extern loop_filter_block_function loop_filter_bvs_ppc;
+extern loop_filter_block_function loop_filter_mbhs_ppc;
+extern loop_filter_block_function loop_filter_bhs_ppc;
+
+// Generic C
+extern loop_filter_block_function vp9_loop_filter_mbv_c;
+extern loop_filter_block_function vp9_loop_filter_bv_c;
+extern loop_filter_block_function vp9_loop_filter_mbh_c;
+extern loop_filter_block_function vp9_loop_filter_bh_c;
+
+extern loop_filter_block_function vp9_loop_filter_mbvs_c;
+extern loop_filter_block_function vp9_loop_filter_bvs_c;
+extern loop_filter_block_function vp9_loop_filter_mbhs_c;
+extern loop_filter_block_function vp9_loop_filter_bhs_c;
+
+extern loop_filter_block_function *vp8_lf_mbvfull;
+extern loop_filter_block_function *vp8_lf_mbhfull;
+extern loop_filter_block_function *vp8_lf_bvfull;
+extern loop_filter_block_function *vp8_lf_bhfull;
+
+extern loop_filter_block_function *vp8_lf_mbvsimple;
+extern loop_filter_block_function *vp8_lf_mbhsimple;
+extern loop_filter_block_function *vp8_lf_bvsimple;
+extern loop_filter_block_function *vp8_lf_bhsimple;
+
+void vp9_clear_c(void) {
+}
+
+void vp9_machine_specific_config(void) {
+ // Pure C:
+ vp9_clear_system_state = vp9_clear_c;
+ vp9_recon_b = vp9_recon_b_c;
+ vp9_recon4b = vp9_recon4b_c;
+ vp9_recon2b = vp9_recon2b_c;
+
+ vp9_bilinear_predict16x16 = bilinear_predict16x16_ppc;
+ vp9_bilinear_predict8x8 = bilinear_predict8x8_ppc;
+ vp9_bilinear_predict8x4 = bilinear_predict8x4_ppc;
+ vp8_bilinear_predict = bilinear_predict4x4_ppc;
+
+ vp9_sixtap_predict16x16 = sixtap_predict16x16_ppc;
+ vp9_sixtap_predict8x8 = sixtap_predict8x8_ppc;
+ vp9_sixtap_predict8x4 = sixtap_predict8x4_ppc;
+ vp9_sixtap_predict = sixtap_predict_ppc;
+
+ vp8_short_idct4x4_1 = vp9_short_idct4x4llm_1_c;
+ vp8_short_idct4x4 = short_idct4x4llm_ppc;
+ vp8_dc_only_idct = vp8_dc_only_idct_c;
+
+ vp8_lf_mbvfull = loop_filter_mbv_ppc;
+ vp8_lf_bvfull = loop_filter_bv_ppc;
+ vp8_lf_mbhfull = loop_filter_mbh_ppc;
+ vp8_lf_bhfull = loop_filter_bh_ppc;
+
+ vp8_lf_mbvsimple = loop_filter_mbvs_ppc;
+ vp8_lf_bvsimple = loop_filter_bvs_ppc;
+ vp8_lf_mbhsimple = loop_filter_mbhs_ppc;
+ vp8_lf_bhsimple = loop_filter_bhs_ppc;
+
+ vp9_post_proc_down_and_across = vp9_post_proc_down_and_across_c;
+ vp9_mbpost_proc_down = vp9_mbpost_proc_down_c;
+ vp9_mbpost_proc_across_ip = vp9_mbpost_proc_across_ip_c;
+ vp9_plane_add_noise = vp9_plane_add_noise_c;
+
+ vp9_copy_mem16x16 = copy_mem16x16_ppc;
+ vp9_copy_mem8x8 = vp9_copy_mem8x8_c;
+ vp9_copy_mem8x4 = vp9_copy_mem8x4_c;
+
+}
--- /dev/null
+++ b/vp9/common/ppflags.h
@@ -1,0 +1,38 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef __INC_PPFLAGS_H
+#define __INC_PPFLAGS_H
+enum {
+ VP9D_NOFILTERING = 0,
+ VP9D_DEBLOCK = 1 << 0,
+ VP9D_DEMACROBLOCK = 1 << 1,
+ VP9D_ADDNOISE = 1 << 2,
+ VP9D_DEBUG_TXT_FRAME_INFO = 1 << 3,
+ VP9D_DEBUG_TXT_MBLK_MODES = 1 << 4,
+ VP9D_DEBUG_TXT_DC_DIFF = 1 << 5,
+ VP9D_DEBUG_TXT_RATE_INFO = 1 << 6,
+ VP9D_DEBUG_DRAW_MV = 1 << 7,
+ VP9D_DEBUG_CLR_BLK_MODES = 1 << 8,
+ VP9D_DEBUG_CLR_FRM_REF_BLKS = 1 << 9
+};
+
+typedef struct {
+ int post_proc_flag;
+ int deblocking_level;
+ int noise_level;
+ int display_ref_frame_flag;
+ int display_mb_modes_flag;
+ int display_b_modes_flag;
+ int display_mv_flag;
+} vp9_ppflags_t;
+
+#endif
--- /dev/null
+++ b/vp9/common/pragmas.h
@@ -1,0 +1,19 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+
+
+#ifdef __INTEL_COMPILER
+#pragma warning(disable:997 1011 170)
+#endif
+#ifdef _MSC_VER
+#pragma warning(disable:4799)
+#endif
--- /dev/null
+++ b/vp9/common/pred_common.c
@@ -1,0 +1,463 @@
+
+/*
+ * Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "vp9/common/pred_common.h"
+#include "vp9/common/seg_common.h"
+
+// TBD prediction functions for various bitstream signals
+
+// Returns a context number for the given MB prediction signal
+unsigned char vp9_get_pred_context(const VP9_COMMON *const cm,
+ const MACROBLOCKD *const xd,
+ PRED_ID pred_id) {
+ int pred_context;
+ MODE_INFO *m = xd->mode_info_context;
+
+ // Note:
+ // The mode info data structure has a one element border above and to the
+  // left of the entries corresponding to real macroblocks.
+ // The prediction flags in these dummy entries are initialised to 0.
+ switch (pred_id) {
+ case PRED_SEG_ID:
+ pred_context = (m - 1)->mbmi.seg_id_predicted +
+ (m - cm->mode_info_stride)->mbmi.seg_id_predicted;
+ break;
+
+
+ case PRED_REF:
+ pred_context = (m - 1)->mbmi.ref_predicted +
+ (m - cm->mode_info_stride)->mbmi.ref_predicted;
+ break;
+
+ case PRED_COMP:
+ // Context based on use of comp pred flag by neighbours
+ // pred_context =
+ // ((m - 1)->mbmi.second_ref_frame != INTRA_FRAME) +
+ // ((m - cm->mode_info_stride)->mbmi.second_ref_frame != INTRA_FRAME);
+
+ // Context based on mode and reference frame
+ // if ( m->mbmi.ref_frame == LAST_FRAME )
+ // pred_context = 0 + (m->mbmi.mode != ZEROMV);
+ // else if ( m->mbmi.ref_frame == GOLDEN_FRAME )
+ // pred_context = 2 + (m->mbmi.mode != ZEROMV);
+ // else
+ // pred_context = 4 + (m->mbmi.mode != ZEROMV);
+
+ if (m->mbmi.ref_frame == LAST_FRAME)
+ pred_context = 0;
+ else
+ pred_context = 1;
+
+ break;
+
+ case PRED_MBSKIP:
+ pred_context = (m - 1)->mbmi.mb_skip_coeff +
+ (m - cm->mode_info_stride)->mbmi.mb_skip_coeff;
+ break;
+
+ case PRED_SWITCHABLE_INTERP:
+ {
+ int left_in_image = (m - 1)->mbmi.mb_in_image;
+ int above_in_image = (m - cm->mode_info_stride)->mbmi.mb_in_image;
+ int left_mode = (m - 1)->mbmi.mode;
+ int above_mode = (m - cm->mode_info_stride)->mbmi.mode;
+ int left_interp, above_interp;
+ if (left_in_image && left_mode >= NEARESTMV && left_mode <= SPLITMV)
+ left_interp = vp9_switchable_interp_map[(m - 1)->mbmi.interp_filter];
+ else
+ left_interp = VP9_SWITCHABLE_FILTERS;
+ if (above_in_image && above_mode >= NEARESTMV && above_mode <= SPLITMV)
+ above_interp = vp9_switchable_interp_map[
+ (m - cm->mode_info_stride)->mbmi.interp_filter];
+ else
+ above_interp = VP9_SWITCHABLE_FILTERS;
+
+ if (left_interp == above_interp)
+ pred_context = left_interp;
+ else if (left_interp == VP9_SWITCHABLE_FILTERS &&
+ above_interp != VP9_SWITCHABLE_FILTERS)
+ pred_context = above_interp;
+ else if (left_interp != VP9_SWITCHABLE_FILTERS &&
+ above_interp == VP9_SWITCHABLE_FILTERS)
+ pred_context = left_interp;
+ else
+ pred_context = VP9_SWITCHABLE_FILTERS;
+ }
+ break;
+
+ default:
+ // TODO *** add error trap code.
+ pred_context = 0;
+ break;
+ }
+
+ return pred_context;
+}
+
+// This function returns a context probability for coding a given
+// prediction signal
+vp9_prob vp9_get_pred_prob(const VP9_COMMON *const cm,
+ const MACROBLOCKD *const xd,
+ PRED_ID pred_id) {
+ vp9_prob pred_probability;
+ int pred_context;
+
+ // Get the appropriate prediction context
+ pred_context = vp9_get_pred_context(cm, xd, pred_id);
+
+ switch (pred_id) {
+ case PRED_SEG_ID:
+ pred_probability = cm->segment_pred_probs[pred_context];
+ break;
+
+ case PRED_REF:
+ pred_probability = cm->ref_pred_probs[pred_context];
+ break;
+
+ case PRED_COMP:
+      // In keeping with convention elsewhere, the probability returned is
+      // the probability of a "0" outcome, which in this case means the
+      // probability of comp pred off.
+ pred_probability = cm->prob_comppred[pred_context];
+ break;
+
+ case PRED_MBSKIP:
+ pred_probability = cm->mbskip_pred_probs[pred_context];
+ break;
+
+ default:
+ // TODO *** add error trap code.
+ pred_probability = 128;
+ break;
+ }
+
+ return pred_probability;
+}
+
+// This function returns a context probability ptr for coding a given
+// prediction signal
+const vp9_prob *vp9_get_pred_probs(const VP9_COMMON *const cm,
+ const MACROBLOCKD *const xd,
+ PRED_ID pred_id) {
+ const vp9_prob *pred_probability;
+ int pred_context;
+
+ // Get the appropriate prediction context
+ pred_context = vp9_get_pred_context(cm, xd, pred_id);
+
+ switch (pred_id) {
+ case PRED_SEG_ID:
+ pred_probability = &cm->segment_pred_probs[pred_context];
+ break;
+
+ case PRED_REF:
+ pred_probability = &cm->ref_pred_probs[pred_context];
+ break;
+
+ case PRED_COMP:
+      // In keeping with convention elsewhere, the probability returned is
+      // the probability of a "0" outcome, which in this case means the
+      // probability of comp pred off.
+ pred_probability = &cm->prob_comppred[pred_context];
+ break;
+
+ case PRED_MBSKIP:
+ pred_probability = &cm->mbskip_pred_probs[pred_context];
+ break;
+
+ case PRED_SWITCHABLE_INTERP:
+ pred_probability = &cm->fc.switchable_interp_prob[pred_context][0];
+ break;
+
+ default:
+ // TODO *** add error trap code.
+ pred_probability = NULL;
+ break;
+ }
+
+ return pred_probability;
+}
+
+// This function returns the status of the given prediction signal.
+// I.e. is the predicted value for the given signal correct.
+unsigned char vp9_get_pred_flag(const MACROBLOCKD *const xd,
+ PRED_ID pred_id) {
+ unsigned char pred_flag = 0;
+
+ switch (pred_id) {
+ case PRED_SEG_ID:
+ pred_flag = xd->mode_info_context->mbmi.seg_id_predicted;
+ break;
+
+ case PRED_REF:
+ pred_flag = xd->mode_info_context->mbmi.ref_predicted;
+ break;
+
+ case PRED_MBSKIP:
+ pred_flag = xd->mode_info_context->mbmi.mb_skip_coeff;
+ break;
+
+ default:
+ // TODO *** add error trap code.
+ pred_flag = 0;
+ break;
+ }
+
+ return pred_flag;
+}
+
+// This function sets the status of the given prediction signal.
+// I.e. is the predicted value for the given signal correct.
+void vp9_set_pred_flag(MACROBLOCKD *const xd,
+ PRED_ID pred_id,
+ unsigned char pred_flag) {
+#if CONFIG_SUPERBLOCKS
+ const int mis = xd->mode_info_stride;
+#endif
+
+ switch (pred_id) {
+ case PRED_SEG_ID:
+ xd->mode_info_context->mbmi.seg_id_predicted = pred_flag;
+#if CONFIG_SUPERBLOCKS
+ if (xd->mode_info_context->mbmi.encoded_as_sb) {
+ if (xd->mb_to_right_edge > 0)
+ xd->mode_info_context[1].mbmi.seg_id_predicted = pred_flag;
+ if (xd->mb_to_bottom_edge > 0) {
+ xd->mode_info_context[mis].mbmi.seg_id_predicted = pred_flag;
+ if (xd->mb_to_right_edge > 0)
+ xd->mode_info_context[mis + 1].mbmi.seg_id_predicted = pred_flag;
+ }
+ }
+#endif
+ break;
+
+ case PRED_REF:
+ xd->mode_info_context->mbmi.ref_predicted = pred_flag;
+#if CONFIG_SUPERBLOCKS
+ if (xd->mode_info_context->mbmi.encoded_as_sb) {
+ if (xd->mb_to_right_edge > 0)
+ xd->mode_info_context[1].mbmi.ref_predicted = pred_flag;
+ if (xd->mb_to_bottom_edge > 0) {
+ xd->mode_info_context[mis].mbmi.ref_predicted = pred_flag;
+ if (xd->mb_to_right_edge > 0)
+ xd->mode_info_context[mis + 1].mbmi.ref_predicted = pred_flag;
+ }
+ }
+#endif
+ break;
+
+ case PRED_MBSKIP:
+ xd->mode_info_context->mbmi.mb_skip_coeff = pred_flag;
+#if CONFIG_SUPERBLOCKS
+ if (xd->mode_info_context->mbmi.encoded_as_sb) {
+ if (xd->mb_to_right_edge > 0)
+ xd->mode_info_context[1].mbmi.mb_skip_coeff = pred_flag;
+ if (xd->mb_to_bottom_edge > 0) {
+ xd->mode_info_context[mis].mbmi.mb_skip_coeff = pred_flag;
+ if (xd->mb_to_right_edge > 0)
+ xd->mode_info_context[mis + 1].mbmi.mb_skip_coeff = pred_flag;
+ }
+ }
+#endif
+ break;
+
+ default:
+ // TODO *** add error trap code.
+ break;
+ }
+}
+
+
+// The following functions contain the guts of the prediction code used to
+// predict various bitstream signals.
+
+// Macroblock segment id prediction function
+unsigned char vp9_get_pred_mb_segid(const VP9_COMMON *const cm,
+ const MACROBLOCKD *const xd, int MbIndex) {
+ // Currently the prediction for the macroblock segment ID is
+ // the value stored for this macroblock in the previous frame.
+#if CONFIG_SUPERBLOCKS
+ if (!xd->mode_info_context->mbmi.encoded_as_sb) {
+#endif
+ return cm->last_frame_seg_map[MbIndex];
+#if CONFIG_SUPERBLOCKS
+ } else {
+ int seg_id = cm->last_frame_seg_map[MbIndex];
+ int mb_col = MbIndex % cm->mb_cols;
+ int mb_row = MbIndex / cm->mb_cols;
+ if (mb_col + 1 < cm->mb_cols)
+ seg_id = seg_id && cm->last_frame_seg_map[MbIndex + 1];
+ if (mb_row + 1 < cm->mb_rows) {
+ seg_id = seg_id && cm->last_frame_seg_map[MbIndex + cm->mb_cols];
+ if (mb_col + 1 < cm->mb_cols)
+ seg_id = seg_id && cm->last_frame_seg_map[MbIndex + cm->mb_cols + 1];
+ }
+ return seg_id;
+ }
+#endif
+}
+
+MV_REFERENCE_FRAME vp9_get_pred_ref(const VP9_COMMON *const cm,
+ const MACROBLOCKD *const xd) {
+ MODE_INFO *m = xd->mode_info_context;
+
+ MV_REFERENCE_FRAME left;
+ MV_REFERENCE_FRAME above;
+ MV_REFERENCE_FRAME above_left;
+ MV_REFERENCE_FRAME pred_ref = LAST_FRAME;
+
+ int segment_id = xd->mode_info_context->mbmi.segment_id;
+ int seg_ref_active;
+ int i;
+
+ unsigned char frame_allowed[MAX_REF_FRAMES] = {1, 1, 1, 1};
+ unsigned char ref_score[MAX_REF_FRAMES];
+ unsigned char best_score = 0;
+ unsigned char left_in_image;
+ unsigned char above_in_image;
+ unsigned char above_left_in_image;
+
+  // Is segment coding enabled?
+ seg_ref_active = vp9_segfeature_active(xd, segment_id, SEG_LVL_REF_FRAME);
+
+ // Special case treatment if segment coding is enabled.
+ // Dont allow prediction of a reference frame that the segment
+ // does not allow
+ if (seg_ref_active) {
+ for (i = 0; i < MAX_REF_FRAMES; i++) {
+ frame_allowed[i] =
+ vp9_check_segref(xd, segment_id, i);
+
+ // Score set to 0 if ref frame not allowed
+ ref_score[i] = cm->ref_scores[i] * frame_allowed[i];
+ }
+ } else
+ vpx_memcpy(ref_score, cm->ref_scores, sizeof(ref_score));
+
+ // Reference frames used by neighbours
+ left = (m - 1)->mbmi.ref_frame;
+ above = (m - cm->mode_info_stride)->mbmi.ref_frame;
+ above_left = (m - 1 - cm->mode_info_stride)->mbmi.ref_frame;
+
+ // Are neighbours in image
+ left_in_image = (m - 1)->mbmi.mb_in_image;
+ above_in_image = (m - cm->mode_info_stride)->mbmi.mb_in_image;
+ above_left_in_image = (m - 1 - cm->mode_info_stride)->mbmi.mb_in_image;
+
+  // Adjust scores for candidate reference frames based on neighbours
+ if (frame_allowed[left] && left_in_image) {
+ ref_score[left] += 16;
+ if (above_left_in_image && (left == above_left))
+ ref_score[left] += 4;
+ }
+ if (frame_allowed[above] && above_in_image) {
+ ref_score[above] += 16;
+ if (above_left_in_image && (above == above_left))
+ ref_score[above] += 4;
+ }
+
+ // Now choose the candidate with the highest score
+ for (i = 0; i < MAX_REF_FRAMES; i++) {
+ if (ref_score[i] > best_score) {
+ pred_ref = i;
+ best_score = ref_score[i];
+ }
+ }
+
+ return pred_ref;
+}
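
To make the scoring concrete (all numbers hypothetical, with no segment-based reference restrictions): suppose the frequency-derived base scores are INTRA 2, LAST 10, GOLDEN 5, ALTREF 2, the left neighbour (in image) used LAST, the above neighbour (in image) used GOLDEN, and above-left also used LAST. Then LAST scores 10 + 16 + 4 = 30, GOLDEN scores 5 + 16 = 21, and LAST is returned as the predicted reference frame.
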
+
+// Function to compute a set of modified reference frame probabilities
+// to use when the prediction of the reference frame value fails
+void vp9_calc_ref_probs(int *count, vp9_prob *probs) {
+ int tot_count;
+
+ tot_count = count[0] + count[1] + count[2] + count[3];
+ if (tot_count) {
+ probs[0] = (vp9_prob)((count[0] * 255 + (tot_count >> 1)) / tot_count);
+ probs[0] += !probs[0];
+ } else
+ probs[0] = 128;
+
+ tot_count -= count[0];
+ if (tot_count) {
+ probs[1] = (vp9_prob)((count[1] * 255 + (tot_count >> 1)) / tot_count);
+ probs[1] += !probs[1];
+ } else
+ probs[1] = 128;
+
+ tot_count -= count[1];
+ if (tot_count) {
+ probs[2] = (vp9_prob)((count[2] * 255 + (tot_count >> 1)) / tot_count);
+ probs[2] += !probs[2];
+ } else
+ probs[2] = 128;
+
+}
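
A short worked example (the counts are hypothetical): for count = {50, 100, 75, 30} the total is 255, so probs[0] = (50*255 + 127) / 255 = 50; the remaining total is 205, giving probs[1] = (100*255 + 102) / 205 = 124; the total then drops to 105, giving probs[2] = (75*255 + 52) / 105 = 182. Each entry is the probability (scaled to 255) of its option at that node of the reference-frame tree, conditioned on the earlier options not having been chosen.
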
+
+// Computes a set of modified conditional probabilities for the reference frame.
+// Values will be set to 0 for reference frame options that are not possible,
+// either because they were predicted and the prediction failed or because
+// they are not allowed for a given segment.
+void vp9_compute_mod_refprobs(VP9_COMMON *const cm) {
+ int norm_cnt[MAX_REF_FRAMES];
+ int intra_count;
+ int inter_count;
+ int last_count;
+ int gfarf_count;
+ int gf_count;
+ int arf_count;
+
+ intra_count = cm->prob_intra_coded;
+ inter_count = (255 - intra_count);
+ last_count = (inter_count * cm->prob_last_coded) / 255;
+ gfarf_count = inter_count - last_count;
+ gf_count = (gfarf_count * cm->prob_gf_coded) / 255;
+ arf_count = gfarf_count - gf_count;
+
+ // Work out modified reference frame probabilities to use where prediction
+ // of the reference frame fails
+ norm_cnt[0] = 0;
+ norm_cnt[1] = last_count;
+ norm_cnt[2] = gf_count;
+ norm_cnt[3] = arf_count;
+ vp9_calc_ref_probs(norm_cnt, cm->mod_refprobs[INTRA_FRAME]);
+ cm->mod_refprobs[INTRA_FRAME][0] = 0; // This branch implicit
+
+ norm_cnt[0] = intra_count;
+ norm_cnt[1] = 0;
+ norm_cnt[2] = gf_count;
+ norm_cnt[3] = arf_count;
+ vp9_calc_ref_probs(norm_cnt, cm->mod_refprobs[LAST_FRAME]);
+ cm->mod_refprobs[LAST_FRAME][1] = 0; // This branch implicit
+
+ norm_cnt[0] = intra_count;
+ norm_cnt[1] = last_count;
+ norm_cnt[2] = 0;
+ norm_cnt[3] = arf_count;
+ vp9_calc_ref_probs(norm_cnt, cm->mod_refprobs[GOLDEN_FRAME]);
+ cm->mod_refprobs[GOLDEN_FRAME][2] = 0; // This branch implicit
+
+ norm_cnt[0] = intra_count;
+ norm_cnt[1] = last_count;
+ norm_cnt[2] = gf_count;
+ norm_cnt[3] = 0;
+ vp9_calc_ref_probs(norm_cnt, cm->mod_refprobs[ALTREF_FRAME]);
+ cm->mod_refprobs[ALTREF_FRAME][2] = 0; // This branch implicit
+
+  // Score the reference frames based on overall frequency.
+ // These scores contribute to the prediction choices.
+ // Max score 17 min 1
+ cm->ref_scores[INTRA_FRAME] = 1 + (intra_count * 16 / 255);
+ cm->ref_scores[LAST_FRAME] = 1 + (last_count * 16 / 255);
+ cm->ref_scores[GOLDEN_FRAME] = 1 + (gf_count * 16 / 255);
+ cm->ref_scores[ALTREF_FRAME] = 1 + (arf_count * 16 / 255);
+}
--- /dev/null
+++ b/vp9/common/pred_common.h
@@ -1,0 +1,56 @@
+/*
+ * Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "type_aliases.h"
+#include "onyxc_int.h"
+#include "vp9/common/blockd.h"
+
+#ifndef __INC_PRED_COMMON_H__
+#define __INC_PRED_COMMON_H__ 1
+
+
+// Predicted items
+typedef enum {
+ PRED_SEG_ID = 0, // Segment identifier
+ PRED_REF = 1,
+ PRED_COMP = 2,
+ PRED_MBSKIP = 3,
+ PRED_SWITCHABLE_INTERP = 4
+} PRED_ID;
+
+extern unsigned char vp9_get_pred_context(const VP9_COMMON *const cm,
+ const MACROBLOCKD *const xd,
+ PRED_ID pred_id);
+
+extern vp9_prob vp9_get_pred_prob(const VP9_COMMON *const cm,
+ const MACROBLOCKD *const xd,
+ PRED_ID pred_id);
+
+extern const vp9_prob *vp9_get_pred_probs(const VP9_COMMON *const cm,
+ const MACROBLOCKD *const xd,
+ PRED_ID pred_id);
+
+extern unsigned char vp9_get_pred_flag(const MACROBLOCKD *const xd,
+ PRED_ID pred_id);
+
+extern void vp9_set_pred_flag(MACROBLOCKD *const xd,
+ PRED_ID pred_id,
+ unsigned char pred_flag);
+
+
+extern unsigned char vp9_get_pred_mb_segid(const VP9_COMMON *const cm,
+ const MACROBLOCKD *const xd,
+ int MbIndex);
+
+extern MV_REFERENCE_FRAME vp9_get_pred_ref(const VP9_COMMON *const cm,
+ const MACROBLOCKD *const xd);
+extern void vp9_compute_mod_refprobs(VP9_COMMON *const cm);
+
+#endif /* __INC_PRED_COMMON_H__ */
--- /dev/null
+++ b/vp9/common/quant_common.c
@@ -1,0 +1,125 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "quant_common.h"
+
+static int dc_qlookup[QINDEX_RANGE];
+static int ac_qlookup[QINDEX_RANGE];
+
+#define ACDC_MIN 4
+
+void vp9_init_quant_tables() {
+ int i;
+ int current_val = 4;
+ int last_val = 4;
+ int ac_val;
+
+ for (i = 0; i < QINDEX_RANGE; i++) {
+ ac_qlookup[i] = current_val;
+ current_val = (int)((double)current_val * 1.02);
+ if (current_val == last_val)
+ current_val++;
+ last_val = current_val;
+
+ ac_val = ac_qlookup[i];
+ dc_qlookup[i] = (0.000000305 * ac_val * ac_val * ac_val) +
+ (-0.00065 * ac_val * ac_val) +
+ (0.9 * ac_val) + 0.5;
+ if (dc_qlookup[i] < ACDC_MIN)
+ dc_qlookup[i] = ACDC_MIN;
+ }
+}
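
For orientation, the first few entries as the code above produces them: the AC lookup starts at 4 and, because a 2% step rounds back down to the same integer for small values, the explicit +1 bump makes the table advance by one per index (4, 5, 6, 7, ...) until the values are large enough (around 50) for the 2% growth to gain a whole unit on its own. The DC entry is the cubic fit of the AC value, truncated to an integer and clamped at ACDC_MIN; for ac = 4 the fit gives roughly 0.9*4 + 0.5 minus small correction terms, i.e. dc = 4.
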
+
+int vp9_dc_quant(int QIndex, int Delta) {
+ int retval;
+
+ QIndex = QIndex + Delta;
+
+ if (QIndex > MAXQ)
+ QIndex = MAXQ;
+ else if (QIndex < 0)
+ QIndex = 0;
+
+ retval = dc_qlookup[ QIndex ];
+ return retval;
+}
+
+int vp9_dc2quant(int QIndex, int Delta) {
+ int retval;
+
+ QIndex = QIndex + Delta;
+
+ if (QIndex > MAXQ)
+ QIndex = MAXQ;
+ else if (QIndex < 0)
+ QIndex = 0;
+
+ retval = dc_qlookup[ QIndex ];
+
+ return retval;
+
+}
+int vp9_dc_uv_quant(int QIndex, int Delta) {
+ int retval;
+
+ QIndex = QIndex + Delta;
+
+ if (QIndex > MAXQ)
+ QIndex = MAXQ;
+ else if (QIndex < 0)
+ QIndex = 0;
+
+ retval = dc_qlookup[ QIndex ];
+
+ return retval;
+}
+
+int vp9_ac_yquant(int QIndex) {
+ int retval;
+
+ if (QIndex > MAXQ)
+ QIndex = MAXQ;
+ else if (QIndex < 0)
+ QIndex = 0;
+
+ retval = ac_qlookup[ QIndex ];
+ return retval;
+}
+
+int vp9_ac2quant(int QIndex, int Delta) {
+ int retval;
+
+ QIndex = QIndex + Delta;
+
+ if (QIndex > MAXQ)
+ QIndex = MAXQ;
+ else if (QIndex < 0)
+ QIndex = 0;
+
+ retval = (ac_qlookup[ QIndex ] * 775) / 1000;
+ if (retval < 4)
+ retval = 4;
+
+ return retval;
+}
+int vp9_ac_uv_quant(int QIndex, int Delta) {
+ int retval;
+
+ QIndex = QIndex + Delta;
+
+ if (QIndex > MAXQ)
+ QIndex = MAXQ;
+ else if (QIndex < 0)
+ QIndex = 0;
+
+ retval = ac_qlookup[ QIndex ];
+ return retval;
+}
--- /dev/null
+++ b/vp9/common/quant_common.h
@@ -1,0 +1,22 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef __INC_QUANT_COMMON_H__
+#define __INC_QUANT_COMMON_H__
+
+#include <string.h>
+#include "blockd.h"
+#include "onyxc_int.h"
+
+extern void vp9_init_quant_tables(void);
+extern int vp9_ac_yquant(int QIndex);
+extern int vp9_dc_quant(int QIndex, int Delta);
+extern int vp9_dc2quant(int QIndex, int Delta);
+extern int vp9_ac2quant(int QIndex, int Delta);
+extern int vp9_dc_uv_quant(int QIndex, int Delta);
+extern int vp9_ac_uv_quant(int QIndex, int Delta);
+
+#endif  // __INC_QUANT_COMMON_H__
--- /dev/null
+++ b/vp9/common/recon.c
@@ -1,0 +1,197 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "vpx_ports/config.h"
+#include "vpx_rtcd.h"
+#include "blockd.h"
+
+void vp9_recon_b_c(unsigned char *pred_ptr,
+                   short *diff_ptr,
+                   unsigned char *dst_ptr,
+                   int stride) {
+ int r, c;
+
+ for (r = 0; r < 4; r++) {
+ for (c = 0; c < 4; c++) {
+ int a = diff_ptr[c] + pred_ptr[c];
+
+ if (a < 0)
+ a = 0;
+
+ if (a > 255)
+ a = 255;
+
+ dst_ptr[c] = (unsigned char) a;
+ }
+
+ dst_ptr += stride;
+ diff_ptr += 16;
+ pred_ptr += 16;
+ }
+}
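+
+/* Note: pred_ptr and diff_ptr index the per-macroblock predictor/diff
+ * buffers, whose luma portion is laid out 16 samples per row (hence the
+ * fixed += 16 above) and whose chroma portions are 8 samples per row
+ * (+= 8 in the UV variant below), while dst_ptr advances by the real
+ * frame stride.
+ */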
+
+void vp9_recon_uv_b_c(unsigned char *pred_ptr,
+                      short *diff_ptr,
+                      unsigned char *dst_ptr,
+                      int stride) {
+ int r, c;
+
+ for (r = 0; r < 4; r++) {
+ for (c = 0; c < 4; c++) {
+ int a = diff_ptr[c] + pred_ptr[c];
+
+ if (a < 0)
+ a = 0;
+
+ if (a > 255)
+ a = 255;
+
+ dst_ptr[c] = (unsigned char) a;
+ }
+
+ dst_ptr += stride;
+ diff_ptr += 8;
+ pred_ptr += 8;
+ }
+}
+
+void vp9_recon4b_c(unsigned char *pred_ptr,
+                   short *diff_ptr,
+                   unsigned char *dst_ptr,
+                   int stride) {
+ int r, c;
+
+ for (r = 0; r < 4; r++) {
+ for (c = 0; c < 16; c++) {
+ int a = diff_ptr[c] + pred_ptr[c];
+
+ if (a < 0)
+ a = 0;
+
+ if (a > 255)
+ a = 255;
+
+ dst_ptr[c] = (unsigned char) a;
+ }
+
+ dst_ptr += stride;
+ diff_ptr += 16;
+ pred_ptr += 16;
+ }
+}
+
+void vp9_recon2b_c(unsigned char *pred_ptr,
+                   short *diff_ptr,
+                   unsigned char *dst_ptr,
+                   int stride) {
+ int r, c;
+
+ for (r = 0; r < 4; r++) {
+ for (c = 0; c < 8; c++) {
+ int a = diff_ptr[c] + pred_ptr[c];
+
+ if (a < 0)
+ a = 0;
+
+ if (a > 255)
+ a = 255;
+
+ dst_ptr[c] = (unsigned char) a;
+ }
+
+ dst_ptr += stride;
+ diff_ptr += 8;
+ pred_ptr += 8;
+ }
+}
+
+#if CONFIG_SUPERBLOCKS
+void vp9_recon_mby_s_c(MACROBLOCKD *xd, uint8_t *dst) {
+ int x, y;
+ BLOCKD *b = &xd->block[0];
+ int stride = b->dst_stride;
+ short *diff = b->diff;
+
+ for (y = 0; y < 16; y++) {
+ for (x = 0; x < 16; x++) {
+ int a = dst[x] + diff[x];
+ if (a < 0)
+ a = 0;
+ else if (a > 255)
+ a = 255;
+ dst[x] = a;
+ }
+ dst += stride;
+ diff += 16;
+ }
+}
+
+void vp9_recon_mbuv_s_c(MACROBLOCKD *xd, uint8_t *udst, uint8_t *vdst) {
+ int x, y, i;
+ uint8_t *dst = udst;
+
+ for (i = 0; i < 2; i++, dst = vdst) {
+ BLOCKD *b = &xd->block[16 + 4 * i];
+ int stride = b->dst_stride;
+ short *diff = b->diff;
+
+ for (y = 0; y < 8; y++) {
+ for (x = 0; x < 8; x++) {
+ int a = dst[x] + diff[x];
+ if (a < 0)
+ a = 0;
+ else if (a > 255)
+ a = 255;
+ dst[x] = a;
+ }
+ dst += stride;
+ diff += 8;
+ }
+ }
+}
+#endif
+
+void vp9_recon_mby_c(MACROBLOCKD *xd) {
+ int i;
+
+ for (i = 0; i < 16; i += 4) {
+ BLOCKD *b = &xd->block[i];
+
+ vp9_recon4b(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride);
+ }
+}
+
+void vp9_recon_mb_c(MACROBLOCKD *xd) {
+ int i;
+
+ for (i = 0; i < 16; i += 4) {
+ BLOCKD *b = &xd->block[i];
+
+ vp9_recon4b(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride);
+ }
+
+ for (i = 16; i < 24; i += 2) {
+ BLOCKD *b = &xd->block[i];
+
+ vp9_recon2b(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride);
+ }
+}
--- /dev/null
+++ b/vp9/common/reconinter.c
@@ -1,0 +1,1145 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "vpx_ports/config.h"
+#include "vpx/vpx_integer.h"
+#include "subpixel.h"
+#include "blockd.h"
+#include "reconinter.h"
+#if CONFIG_RUNTIME_CPU_DETECT
+#include "onyxc_int.h"
+#endif
+
+void vp9_setup_interp_filters(MACROBLOCKD *xd,
+ INTERPOLATIONFILTERTYPE mcomp_filter_type,
+ VP9_COMMON *cm) {
+ if (mcomp_filter_type == SIXTAP) {
+ xd->subpixel_predict = SUBPIX_INVOKE(
+ &cm->rtcd.subpix, sixtap4x4);
+ xd->subpixel_predict8x4 = SUBPIX_INVOKE(
+ &cm->rtcd.subpix, sixtap8x4);
+ xd->subpixel_predict8x8 = SUBPIX_INVOKE(
+ &cm->rtcd.subpix, sixtap8x8);
+ xd->subpixel_predict16x16 = SUBPIX_INVOKE(
+ &cm->rtcd.subpix, sixtap16x16);
+ xd->subpixel_predict_avg = SUBPIX_INVOKE(
+ &cm->rtcd.subpix, sixtap_avg4x4);
+ xd->subpixel_predict_avg8x8 = SUBPIX_INVOKE(
+ &cm->rtcd.subpix, sixtap_avg8x8);
+ xd->subpixel_predict_avg16x16 = SUBPIX_INVOKE(
+ &cm->rtcd.subpix, sixtap_avg16x16);
+ } else if (mcomp_filter_type == EIGHTTAP || mcomp_filter_type == SWITCHABLE) {
+ xd->subpixel_predict = SUBPIX_INVOKE(
+ &cm->rtcd.subpix, eighttap4x4);
+ xd->subpixel_predict8x4 = SUBPIX_INVOKE(
+ &cm->rtcd.subpix, eighttap8x4);
+ xd->subpixel_predict8x8 = SUBPIX_INVOKE(
+ &cm->rtcd.subpix, eighttap8x8);
+ xd->subpixel_predict16x16 = SUBPIX_INVOKE(
+ &cm->rtcd.subpix, eighttap16x16);
+ xd->subpixel_predict_avg = SUBPIX_INVOKE(
+ &cm->rtcd.subpix, eighttap_avg4x4);
+ xd->subpixel_predict_avg8x8 = SUBPIX_INVOKE(
+ &cm->rtcd.subpix, eighttap_avg8x8);
+ xd->subpixel_predict_avg16x16 = SUBPIX_INVOKE(
+ &cm->rtcd.subpix, eighttap_avg16x16);
+ } else if (mcomp_filter_type == EIGHTTAP_SHARP) {
+ xd->subpixel_predict = SUBPIX_INVOKE(
+ &cm->rtcd.subpix, eighttap4x4_sharp);
+ xd->subpixel_predict8x4 = SUBPIX_INVOKE(
+ &cm->rtcd.subpix, eighttap8x4_sharp);
+ xd->subpixel_predict8x8 = SUBPIX_INVOKE(
+ &cm->rtcd.subpix, eighttap8x8_sharp);
+ xd->subpixel_predict16x16 = SUBPIX_INVOKE(
+ &cm->rtcd.subpix, eighttap16x16_sharp);
+ xd->subpixel_predict_avg = SUBPIX_INVOKE(
+ &cm->rtcd.subpix, eighttap_avg4x4_sharp);
+ xd->subpixel_predict_avg8x8 = SUBPIX_INVOKE(
+ &cm->rtcd.subpix, eighttap_avg8x8_sharp);
+ xd->subpixel_predict_avg16x16 = SUBPIX_INVOKE(
+ &cm->rtcd.subpix, eighttap_avg16x16_sharp);
+  } else {
+ xd->subpixel_predict = SUBPIX_INVOKE(
+ &cm->rtcd.subpix, bilinear4x4);
+ xd->subpixel_predict8x4 = SUBPIX_INVOKE(
+ &cm->rtcd.subpix, bilinear8x4);
+ xd->subpixel_predict8x8 = SUBPIX_INVOKE(
+ &cm->rtcd.subpix, bilinear8x8);
+ xd->subpixel_predict16x16 = SUBPIX_INVOKE(
+ &cm->rtcd.subpix, bilinear16x16);
+ xd->subpixel_predict_avg = SUBPIX_INVOKE(
+ &cm->rtcd.subpix, bilinear_avg4x4);
+ xd->subpixel_predict_avg8x8 = SUBPIX_INVOKE(
+ &cm->rtcd.subpix, bilinear_avg8x8);
+ xd->subpixel_predict_avg16x16 = SUBPIX_INVOKE(
+ &cm->rtcd.subpix, bilinear_avg16x16);
+ }
+}
+
+void vp9_copy_mem16x16_c(unsigned char *src,
+ int src_stride,
+ unsigned char *dst,
+ int dst_stride) {
+ int r;
+
+ for (r = 0; r < 16; r++) {
+#if !(CONFIG_FAST_UNALIGNED)
+ dst[0] = src[0];
+ dst[1] = src[1];
+ dst[2] = src[2];
+ dst[3] = src[3];
+ dst[4] = src[4];
+ dst[5] = src[5];
+ dst[6] = src[6];
+ dst[7] = src[7];
+ dst[8] = src[8];
+ dst[9] = src[9];
+ dst[10] = src[10];
+ dst[11] = src[11];
+ dst[12] = src[12];
+ dst[13] = src[13];
+ dst[14] = src[14];
+ dst[15] = src[15];
+
+#else
+ ((uint32_t *)dst)[0] = ((uint32_t *)src)[0];
+ ((uint32_t *)dst)[1] = ((uint32_t *)src)[1];
+ ((uint32_t *)dst)[2] = ((uint32_t *)src)[2];
+ ((uint32_t *)dst)[3] = ((uint32_t *)src)[3];
+
+#endif
+ src += src_stride;
+ dst += dst_stride;
+ }
+}
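+
+/* With CONFIG_FAST_UNALIGNED the row copy is done as four 32-bit loads and
+ * stores instead of 16 byte copies; this presumes the target tolerates
+ * unaligned 32-bit accesses, which is what the config flag gates.
+ */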
+
+void vp9_avg_mem16x16_c(unsigned char *src,
+ int src_stride,
+ unsigned char *dst,
+ int dst_stride) {
+ int r;
+
+ for (r = 0; r < 16; r++) {
+ int n;
+
+ for (n = 0; n < 16; n++) {
+ dst[n] = (dst[n] + src[n] + 1) >> 1;
+ }
+
+ src += src_stride;
+ dst += dst_stride;
+ }
+}
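+
+/* (dst + src + 1) >> 1 is a rounded average: the _avg_ variants blend the
+ * block fetched from a second reference with the prediction already written
+ * by the first-reference pass (see the compound-prediction comments further
+ * down in this file) instead of overwriting it.
+ */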
+
+void vp9_copy_mem8x8_c(unsigned char *src,
+ int src_stride,
+ unsigned char *dst,
+ int dst_stride) {
+ int r;
+
+ for (r = 0; r < 8; r++) {
+#if !(CONFIG_FAST_UNALIGNED)
+ dst[0] = src[0];
+ dst[1] = src[1];
+ dst[2] = src[2];
+ dst[3] = src[3];
+ dst[4] = src[4];
+ dst[5] = src[5];
+ dst[6] = src[6];
+ dst[7] = src[7];
+#else
+ ((uint32_t *)dst)[0] = ((uint32_t *)src)[0];
+ ((uint32_t *)dst)[1] = ((uint32_t *)src)[1];
+#endif
+ src += src_stride;
+ dst += dst_stride;
+ }
+}
+
+void vp9_avg_mem8x8_c(unsigned char *src,
+ int src_stride,
+ unsigned char *dst,
+ int dst_stride) {
+ int r;
+
+ for (r = 0; r < 8; r++) {
+ int n;
+
+ for (n = 0; n < 8; n++) {
+ dst[n] = (dst[n] + src[n] + 1) >> 1;
+ }
+
+ src += src_stride;
+ dst += dst_stride;
+ }
+}
+
+void vp9_copy_mem8x4_c(unsigned char *src,
+ int src_stride,
+ unsigned char *dst,
+ int dst_stride) {
+ int r;
+
+ for (r = 0; r < 4; r++) {
+#if !(CONFIG_FAST_UNALIGNED)
+ dst[0] = src[0];
+ dst[1] = src[1];
+ dst[2] = src[2];
+ dst[3] = src[3];
+ dst[4] = src[4];
+ dst[5] = src[5];
+ dst[6] = src[6];
+ dst[7] = src[7];
+#else
+ ((uint32_t *)dst)[0] = ((uint32_t *)src)[0];
+ ((uint32_t *)dst)[1] = ((uint32_t *)src)[1];
+#endif
+ src += src_stride;
+ dst += dst_stride;
+ }
+}
+
+void vp9_build_inter_predictors_b(BLOCKD *d, int pitch, vp9_subpix_fn_t sppf) {
+ int r;
+ unsigned char *ptr_base;
+ unsigned char *ptr;
+ unsigned char *pred_ptr = d->predictor;
+ int_mv mv;
+
+ ptr_base = *(d->base_pre);
+ mv.as_int = d->bmi.as_mv.first.as_int;
+
+ if (mv.as_mv.row & 7 || mv.as_mv.col & 7) {
+ ptr = ptr_base + d->pre + (mv.as_mv.row >> 3) * d->pre_stride +
+ (mv.as_mv.col >> 3);
+ sppf(ptr, d->pre_stride, (mv.as_mv.col & 7) << 1, (mv.as_mv.row & 7) << 1,
+ pred_ptr, pitch);
+ } else {
+ ptr_base += d->pre + (mv.as_mv.row >> 3) * d->pre_stride +
+ (mv.as_mv.col >> 3);
+ ptr = ptr_base;
+
+ for (r = 0; r < 4; r++) {
+#if !(CONFIG_FAST_UNALIGNED)
+ pred_ptr[0] = ptr[0];
+ pred_ptr[1] = ptr[1];
+ pred_ptr[2] = ptr[2];
+ pred_ptr[3] = ptr[3];
+#else
+ *(uint32_t *)pred_ptr = *(uint32_t *)ptr;
+#endif
+ pred_ptr += pitch;
+ ptr += d->pre_stride;
+ }
+ }
+}
+
+/*
+ * Similar to vp9_build_inter_predictors_b(), but instead of storing the
+ * results in d->predictor, we average the contents of d->predictor (which
+ * come from an earlier call to vp9_build_inter_predictors_b()) with the
+ * predictor of the second reference frame / motion vector.
+ */
+void vp9_build_2nd_inter_predictors_b(BLOCKD *d, int pitch,
+ vp9_subpix_fn_t sppf) {
+ int r;
+ unsigned char *ptr_base;
+ unsigned char *ptr;
+ unsigned char *pred_ptr = d->predictor;
+ int_mv mv;
+
+ ptr_base = *(d->base_second_pre);
+ mv.as_int = d->bmi.as_mv.second.as_int;
+
+ if (mv.as_mv.row & 7 || mv.as_mv.col & 7) {
+ ptr = ptr_base + d->pre + (mv.as_mv.row >> 3) * d->pre_stride +
+ (mv.as_mv.col >> 3);
+ sppf(ptr, d->pre_stride, (mv.as_mv.col & 7) << 1, (mv.as_mv.row & 7) << 1,
+ pred_ptr, pitch);
+ } else {
+ ptr_base += d->pre + (mv.as_mv.row >> 3) * d->pre_stride +
+ (mv.as_mv.col >> 3);
+ ptr = ptr_base;
+
+ for (r = 0; r < 4; r++) {
+ pred_ptr[0] = (pred_ptr[0] + ptr[0] + 1) >> 1;
+ pred_ptr[1] = (pred_ptr[1] + ptr[1] + 1) >> 1;
+ pred_ptr[2] = (pred_ptr[2] + ptr[2] + 1) >> 1;
+ pred_ptr[3] = (pred_ptr[3] + ptr[3] + 1) >> 1;
+ pred_ptr += pitch;
+ ptr += d->pre_stride;
+ }
+ }
+}
+
+void vp9_build_inter_predictors4b(MACROBLOCKD *xd, BLOCKD *d, int pitch) {
+ unsigned char *ptr_base;
+ unsigned char *ptr;
+ unsigned char *pred_ptr = d->predictor;
+ int_mv mv;
+
+ ptr_base = *(d->base_pre);
+ mv.as_int = d->bmi.as_mv.first.as_int;
+ ptr = ptr_base + d->pre + (mv.as_mv.row >> 3) * d->pre_stride +
+ (mv.as_mv.col >> 3);
+
+ if (mv.as_mv.row & 7 || mv.as_mv.col & 7) {
+ xd->subpixel_predict8x8(ptr, d->pre_stride, (mv.as_mv.col & 7) << 1,
+ (mv.as_mv.row & 7) << 1, pred_ptr, pitch);
+ } else {
+ vp9_copy_mem8x8(ptr, d->pre_stride, pred_ptr, pitch);
+ }
+}
+
+/*
+ * Similar to build_inter_predictors_4b(), but instead of storing the
+ * results in d->predictor, we average the contents of d->predictor (which
+ * come from an earlier call to build_inter_predictors_4b()) with the
+ * predictor of the second reference frame / motion vector.
+ */
+void vp9_build_2nd_inter_predictors4b(MACROBLOCKD *xd,
+ BLOCKD *d, int pitch) {
+ unsigned char *ptr_base;
+ unsigned char *ptr;
+ unsigned char *pred_ptr = d->predictor;
+ int_mv mv;
+
+ ptr_base = *(d->base_second_pre);
+ mv.as_int = d->bmi.as_mv.second.as_int;
+ ptr = ptr_base + d->pre + (mv.as_mv.row >> 3) * d->pre_stride +
+ (mv.as_mv.col >> 3);
+
+ if (mv.as_mv.row & 7 || mv.as_mv.col & 7) {
+ xd->subpixel_predict_avg8x8(ptr, d->pre_stride, (mv.as_mv.col & 7) << 1,
+ (mv.as_mv.row & 7) << 1, pred_ptr, pitch);
+ } else {
+ vp9_avg_mem8x8(ptr, d->pre_stride, pred_ptr, pitch);
+ }
+}
+
+static void build_inter_predictors2b(MACROBLOCKD *xd, BLOCKD *d, int pitch) {
+ unsigned char *ptr_base;
+ unsigned char *ptr;
+ unsigned char *pred_ptr = d->predictor;
+ int_mv mv;
+
+ ptr_base = *(d->base_pre);
+ mv.as_int = d->bmi.as_mv.first.as_int;
+ ptr = ptr_base + d->pre + (mv.as_mv.row >> 3) * d->pre_stride +
+ (mv.as_mv.col >> 3);
+
+ if (mv.as_mv.row & 7 || mv.as_mv.col & 7) {
+ xd->subpixel_predict8x4(ptr, d->pre_stride, (mv.as_mv.col & 7) << 1,
+ (mv.as_mv.row & 7) << 1, pred_ptr, pitch);
+ } else {
+ vp9_copy_mem8x4(ptr, d->pre_stride, pred_ptr, pitch);
+ }
+}
+
+
+/*encoder only*/
+#if CONFIG_PRED_FILTER
+
+// Select the thresholded or non-thresholded filter
+#define USE_THRESH_FILTER 0
+
+#define PRED_FILT_LEN 5
+
+static const int filt_shift = 4;
+static const int pred_filter[PRED_FILT_LEN] = {1, 2, 10, 2, 1};
+// Alternative filter {1, 1, 4, 1, 1}
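+// The active kernel sums to 16 == 1 << filt_shift, so a single filtering
+// pass is normalized by >> filt_shift and the separable horizontal+vertical
+// pass below by >> (2 * filt_shift), with half of the divisor added first
+// for rounding.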
+
+#if !USE_THRESH_FILTER
+void filter_mb(unsigned char *src, int src_stride,
+ unsigned char *dst, int dst_stride,
+ int width, int height) {
+ int i, j, k;
+ unsigned int Temp[32 * 32];
+ unsigned int *pTmp = Temp;
+ unsigned char *pSrc = src - (1 + src_stride) * (PRED_FILT_LEN / 2);
+
+ // Horizontal
+ for (i = 0; i < height + PRED_FILT_LEN - 1; i++) {
+ for (j = 0; j < width; j++) {
+ int sum = 0;
+ for (k = 0; k < PRED_FILT_LEN; k++)
+ sum += pSrc[j + k] * pred_filter[k];
+ pTmp[j] = sum;
+ }
+
+ pSrc += src_stride;
+ pTmp += width;
+ }
+
+ // Vertical
+ pTmp = Temp;
+ for (i = 0; i < width; i++) {
+ unsigned char *pDst = dst + i;
+ for (j = 0; j < height; j++) {
+ int sum = 0;
+ for (k = 0; k < PRED_FILT_LEN; k++)
+ sum += pTmp[(j + k) * width] * pred_filter[k];
+ // Round
+ sum = (sum + ((1 << (filt_shift << 1)) >> 1)) >> (filt_shift << 1);
+ pDst[j * dst_stride] = (sum < 0 ? 0 : sum > 255 ? 255 : sum);
+ }
+ ++pTmp;
+ }
+}
+#else
+// Based on vp9_post_proc_down_and_across_c (postproc.c)
+void filter_mb(unsigned char *src, int src_stride,
+ unsigned char *dst, int dst_stride,
+ int width, int height) {
+ unsigned char *pSrc, *pDst;
+ int row;
+ int col;
+ int i;
+ int v;
+ unsigned char d[8];
+
+ /* TODO flimit should be linked to the quantizer value */
+ int flimit = 7;
+
+ for (row = 0; row < height; row++) {
+ /* post_proc_down for one row */
+ pSrc = src;
+ pDst = dst;
+
+ for (col = 0; col < width; col++) {
+ int kernel = (1 << (filt_shift - 1));
+ int v = pSrc[col];
+
+ for (i = -2; i <= 2; i++) {
+ if (abs(v - pSrc[col + i * src_stride]) > flimit)
+ goto down_skip_convolve;
+
+ kernel += pred_filter[2 + i] * pSrc[col + i * src_stride];
+ }
+
+ v = (kernel >> filt_shift);
+ down_skip_convolve:
+ pDst[col] = v;
+ }
+
+ /* now post_proc_across */
+ pSrc = dst;
+ pDst = dst;
+
+ for (i = 0; i < 8; i++)
+ d[i] = pSrc[i];
+
+ for (col = 0; col < width; col++) {
+ int kernel = (1 << (filt_shift - 1));
+ v = pSrc[col];
+
+ d[col & 7] = v;
+
+ for (i = -2; i <= 2; i++) {
+ if (abs(v - pSrc[col + i]) > flimit)
+ goto across_skip_convolve;
+
+ kernel += pred_filter[2 + i] * pSrc[col + i];
+ }
+
+ d[col & 7] = (kernel >> filt_shift);
+ across_skip_convolve:
+
+ if (col >= 2)
+ pDst[col - 2] = d[(col - 2) & 7];
+ }
+
+ /* handle the last two pixels */
+ pDst[col - 2] = d[(col - 2) & 7];
+ pDst[col - 1] = d[(col - 1) & 7];
+
+ /* next row */
+ src += src_stride;
+ dst += dst_stride;
+ }
+}
+#endif // !USE_THRESH_FILTER
+
+#endif // CONFIG_PRED_FILTER
+
+/*encoder only*/
+void vp9_build_inter4x4_predictors_mbuv(MACROBLOCKD *xd) {
+ int i, j;
+ BLOCKD *blockd = xd->block;
+
+ /* build uv mvs */
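+  /* Each chroma 4x4 takes the average of the four co-located luma 4x4 MVs,
+   * converted to chroma resolution: sum the four components, add or subtract
+   * 4 to round, then divide by 8 (/4 for the mean, /2 for the 2x chroma
+   * subsampling). E.g. luma rows {3, 3, 4, 4} sum to 14 and give
+   * (14 + 4) / 8 = 2.
+   */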
+ for (i = 0; i < 2; i++) {
+ for (j = 0; j < 2; j++) {
+ int yoffset = i * 8 + j * 2;
+ int uoffset = 16 + i * 2 + j;
+ int voffset = 20 + i * 2 + j;
+ int temp;
+
+ temp = blockd[yoffset ].bmi.as_mv.first.as_mv.row
+ + blockd[yoffset + 1].bmi.as_mv.first.as_mv.row
+ + blockd[yoffset + 4].bmi.as_mv.first.as_mv.row
+ + blockd[yoffset + 5].bmi.as_mv.first.as_mv.row;
+
+ if (temp < 0) temp -= 4;
+ else temp += 4;
+
+ xd->block[uoffset].bmi.as_mv.first.as_mv.row = (temp / 8) &
+ xd->fullpixel_mask;
+
+ temp = blockd[yoffset ].bmi.as_mv.first.as_mv.col
+ + blockd[yoffset + 1].bmi.as_mv.first.as_mv.col
+ + blockd[yoffset + 4].bmi.as_mv.first.as_mv.col
+ + blockd[yoffset + 5].bmi.as_mv.first.as_mv.col;
+
+ if (temp < 0) temp -= 4;
+ else temp += 4;
+
+ blockd[uoffset].bmi.as_mv.first.as_mv.col = (temp / 8) &
+ xd->fullpixel_mask;
+
+ blockd[voffset].bmi.as_mv.first.as_mv.row =
+ blockd[uoffset].bmi.as_mv.first.as_mv.row;
+ blockd[voffset].bmi.as_mv.first.as_mv.col =
+ blockd[uoffset].bmi.as_mv.first.as_mv.col;
+
+ if (xd->mode_info_context->mbmi.second_ref_frame) {
+ temp = blockd[yoffset ].bmi.as_mv.second.as_mv.row
+ + blockd[yoffset + 1].bmi.as_mv.second.as_mv.row
+ + blockd[yoffset + 4].bmi.as_mv.second.as_mv.row
+ + blockd[yoffset + 5].bmi.as_mv.second.as_mv.row;
+
+ if (temp < 0) {
+ temp -= 4;
+ } else {
+ temp += 4;
+ }
+
+ blockd[uoffset].bmi.as_mv.second.as_mv.row = (temp / 8) &
+ xd->fullpixel_mask;
+
+ temp = blockd[yoffset ].bmi.as_mv.second.as_mv.col
+ + blockd[yoffset + 1].bmi.as_mv.second.as_mv.col
+ + blockd[yoffset + 4].bmi.as_mv.second.as_mv.col
+ + blockd[yoffset + 5].bmi.as_mv.second.as_mv.col;
+
+ if (temp < 0) {
+ temp -= 4;
+ } else {
+ temp += 4;
+ }
+
+ blockd[uoffset].bmi.as_mv.second.as_mv.col = (temp / 8) &
+ xd->fullpixel_mask;
+
+ blockd[voffset].bmi.as_mv.second.as_mv.row =
+ blockd[uoffset].bmi.as_mv.second.as_mv.row;
+ blockd[voffset].bmi.as_mv.second.as_mv.col =
+ blockd[uoffset].bmi.as_mv.second.as_mv.col;
+ }
+ }
+ }
+
+ for (i = 16; i < 24; i += 2) {
+ BLOCKD *d0 = &blockd[i];
+ BLOCKD *d1 = &blockd[i + 1];
+
+ if (d0->bmi.as_mv.first.as_int == d1->bmi.as_mv.first.as_int)
+ build_inter_predictors2b(xd, d0, 8);
+ else {
+ vp9_build_inter_predictors_b(d0, 8, xd->subpixel_predict);
+ vp9_build_inter_predictors_b(d1, 8, xd->subpixel_predict);
+ }
+
+ if (xd->mode_info_context->mbmi.second_ref_frame) {
+ vp9_build_2nd_inter_predictors_b(d0, 8, xd->subpixel_predict_avg);
+ vp9_build_2nd_inter_predictors_b(d1, 8, xd->subpixel_predict_avg);
+ }
+ }
+}
+
+static void clamp_mv_to_umv_border(MV *mv, const MACROBLOCKD *xd) {
+ /* If the MV points so far into the UMV border that no visible pixels
+ * are used for reconstruction, the subpel part of the MV can be
+ * discarded and the MV limited to 16 pixels with equivalent results.
+ *
+ * This limit kicks in at 19 pixels for the top and left edges, for
+ * the 16 pixels plus 3 taps right of the central pixel when subpel
+ * filtering. The bottom and right edges use 16 pixels plus 2 pixels
+ * left of the central pixel when filtering.
+ */
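+  /* In eighth-pel units the left/top threshold below is
+   * (16 + INTERP_EXTEND) << 3 beyond mb_to_left_edge / mb_to_top_edge;
+   * once an MV exceeds it, it is snapped back to exactly 16 pixels
+   * (16 << 3 = 128 eighth-pel units) past the edge, which per the note
+   * above produces an equivalent prediction from the extended border.
+   */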
+ if (mv->col < (xd->mb_to_left_edge - ((16 + INTERP_EXTEND) << 3)))
+ mv->col = xd->mb_to_left_edge - (16 << 3);
+ else if (mv->col > xd->mb_to_right_edge + ((15 + INTERP_EXTEND) << 3))
+ mv->col = xd->mb_to_right_edge + (16 << 3);
+
+ if (mv->row < (xd->mb_to_top_edge - ((16 + INTERP_EXTEND) << 3)))
+ mv->row = xd->mb_to_top_edge - (16 << 3);
+ else if (mv->row > xd->mb_to_bottom_edge + ((15 + INTERP_EXTEND) << 3))
+ mv->row = xd->mb_to_bottom_edge + (16 << 3);
+}
+
+/* A version of the above function for chroma block MVs.*/
+static void clamp_uvmv_to_umv_border(MV *mv, const MACROBLOCKD *xd) {
+ mv->col = (2 * mv->col < (xd->mb_to_left_edge - ((16 + INTERP_EXTEND) << 3))) ?
+ (xd->mb_to_left_edge - (16 << 3)) >> 1 : mv->col;
+ mv->col = (2 * mv->col > xd->mb_to_right_edge + ((15 + INTERP_EXTEND) << 3)) ?
+ (xd->mb_to_right_edge + (16 << 3)) >> 1 : mv->col;
+
+ mv->row = (2 * mv->row < (xd->mb_to_top_edge - ((16 + INTERP_EXTEND) << 3))) ?
+ (xd->mb_to_top_edge - (16 << 3)) >> 1 : mv->row;
+ mv->row = (2 * mv->row > xd->mb_to_bottom_edge + ((15 + INTERP_EXTEND) << 3)) ?
+ (xd->mb_to_bottom_edge + (16 << 3)) >> 1 : mv->row;
+}
+
+/*encoder only*/
+void vp9_build_1st_inter16x16_predictors_mby(MACROBLOCKD *xd,
+ unsigned char *dst_y,
+ int dst_ystride,
+ int clamp_mvs) {
+ unsigned char *ptr_base = xd->pre.y_buffer;
+ unsigned char *ptr;
+ int pre_stride = xd->block[0].pre_stride;
+ int_mv ymv;
+
+ ymv.as_int = xd->mode_info_context->mbmi.mv[0].as_int;
+
+ if (clamp_mvs)
+ clamp_mv_to_umv_border(&ymv.as_mv, xd);
+
+ ptr = ptr_base + (ymv.as_mv.row >> 3) * pre_stride + (ymv.as_mv.col >> 3);
+
+#if CONFIG_PRED_FILTER
+ if (xd->mode_info_context->mbmi.pred_filter_enabled) {
+ if ((ymv.as_mv.row | ymv.as_mv.col) & 7) {
+ // Sub-pel filter needs extended input
+ int len = 15 + (INTERP_EXTEND << 1);
+ unsigned char Temp[32 * 32]; // Data required by sub-pel filter
+ unsigned char *pTemp = Temp + (INTERP_EXTEND - 1) * (len + 1);
+
+ // Copy extended MB into Temp array, applying the spatial filter
+ filter_mb(ptr - (INTERP_EXTEND - 1) * (pre_stride + 1), pre_stride,
+ Temp, len, len, len);
+
+ // Sub-pel interpolation
+ xd->subpixel_predict16x16(pTemp, len,
+ (ymv.as_mv.col & 7) << 1,
+ (ymv.as_mv.row & 7) << 1,
+ dst_y, dst_ystride);
+ } else {
+ // Apply spatial filter to create the prediction directly
+ filter_mb(ptr, pre_stride, dst_y, dst_ystride, 16, 16);
+ }
+ } else
+#endif
+ if ((ymv.as_mv.row | ymv.as_mv.col) & 7) {
+ xd->subpixel_predict16x16(ptr, pre_stride,
+ (ymv.as_mv.col & 7) << 1,
+ (ymv.as_mv.row & 7) << 1,
+ dst_y, dst_ystride);
+ } else {
+ vp9_copy_mem16x16(ptr, pre_stride, dst_y, dst_ystride);
+ }
+}
+
+void vp9_build_1st_inter16x16_predictors_mbuv(MACROBLOCKD *xd,
+ unsigned char *dst_u,
+ unsigned char *dst_v,
+ int dst_uvstride) {
+ int offset;
+ unsigned char *uptr, *vptr;
+ int pre_stride = xd->block[0].pre_stride;
+ int_mv _o16x16mv;
+ int_mv _16x16mv;
+
+ _16x16mv.as_int = xd->mode_info_context->mbmi.mv[0].as_int;
+
+ if (xd->mode_info_context->mbmi.need_to_clamp_mvs)
+ clamp_mv_to_umv_border(&_16x16mv.as_mv, xd);
+
+ _o16x16mv = _16x16mv;
+ /* calc uv motion vectors */
+ if (_16x16mv.as_mv.row < 0)
+ _16x16mv.as_mv.row -= 1;
+ else
+ _16x16mv.as_mv.row += 1;
+
+ if (_16x16mv.as_mv.col < 0)
+ _16x16mv.as_mv.col -= 1;
+ else
+ _16x16mv.as_mv.col += 1;
+
+ _16x16mv.as_mv.row /= 2;
+ _16x16mv.as_mv.col /= 2;
+
+ _16x16mv.as_mv.row &= xd->fullpixel_mask;
+ _16x16mv.as_mv.col &= xd->fullpixel_mask;
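+  /* Example: a luma component of +5 eighth-pels becomes (5 + 1) / 2 = 3 and
+   * -5 becomes (-5 - 1) / 2 = -3, i.e. the luma MV is halved with rounding
+   * away from zero to get the chroma MV for the 2x-subsampled planes; the
+   * mask then drops the sub-pel bits when only full-pel MVs are in use.
+   */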
+
+ pre_stride >>= 1;
+ offset = (_16x16mv.as_mv.row >> 3) * pre_stride + (_16x16mv.as_mv.col >> 3);
+ uptr = xd->pre.u_buffer + offset;
+ vptr = xd->pre.v_buffer + offset;
+
+#if CONFIG_PRED_FILTER
+ if (xd->mode_info_context->mbmi.pred_filter_enabled) {
+ int i;
+ unsigned char *pSrc = uptr;
+ unsigned char *pDst = dst_u;
+ int len = 7 + (INTERP_EXTEND << 1);
+ unsigned char Temp[32 * 32]; // Data required by the sub-pel filter
+ unsigned char *pTemp = Temp + (INTERP_EXTEND - 1) * (len + 1);
+
+ // U & V
+ for (i = 0; i < 2; i++) {
+ if (_o16x16mv.as_int & 0x000f000f) {
+ // Copy extended MB into Temp array, applying the spatial filter
+ filter_mb(pSrc - (INTERP_EXTEND - 1) * (pre_stride + 1), pre_stride,
+ Temp, len, len, len);
+
+ // Sub-pel filter
+ xd->subpixel_predict8x8(pTemp, len,
+ _o16x16mv.as_mv.col & 15,
+ _o16x16mv.as_mv.row & 15,
+ pDst, dst_uvstride);
+ } else {
+ filter_mb(pSrc, pre_stride, pDst, dst_uvstride, 8, 8);
+ }
+
+ // V
+ pSrc = vptr;
+ pDst = dst_v;
+ }
+ } else
+#endif
+ if (_o16x16mv.as_int & 0x000f000f) {
+ xd->subpixel_predict8x8(uptr, pre_stride, _o16x16mv.as_mv.col & 15,
+ _o16x16mv.as_mv.row & 15, dst_u, dst_uvstride);
+ xd->subpixel_predict8x8(vptr, pre_stride, _o16x16mv.as_mv.col & 15,
+ _o16x16mv.as_mv.row & 15, dst_v, dst_uvstride);
+ } else {
+ vp9_copy_mem8x8(uptr, pre_stride, dst_u, dst_uvstride);
+ vp9_copy_mem8x8(vptr, pre_stride, dst_v, dst_uvstride);
+ }
+}
+
+
+void vp9_build_1st_inter16x16_predictors_mb(MACROBLOCKD *xd,
+ unsigned char *dst_y,
+ unsigned char *dst_u,
+ unsigned char *dst_v,
+ int dst_ystride, int dst_uvstride) {
+ vp9_build_1st_inter16x16_predictors_mby(xd, dst_y, dst_ystride,
+ xd->mode_info_context->mbmi.need_to_clamp_mvs);
+ vp9_build_1st_inter16x16_predictors_mbuv(xd, dst_u, dst_v, dst_uvstride);
+}
+
+#if CONFIG_SUPERBLOCKS
+void vp9_build_inter32x32_predictors_sb(MACROBLOCKD *x,
+ unsigned char *dst_y,
+ unsigned char *dst_u,
+ unsigned char *dst_v,
+ int dst_ystride,
+ int dst_uvstride) {
+ uint8_t *y1 = x->pre.y_buffer, *u1 = x->pre.u_buffer, *v1 = x->pre.v_buffer;
+ uint8_t *y2 = x->second_pre.y_buffer, *u2 = x->second_pre.u_buffer,
+ *v2 = x->second_pre.v_buffer;
+ int n;
+
+  for (n = 0; n < 4; n++) {
+ const int x_idx = n & 1, y_idx = n >> 1;
+
+ x->pre.y_buffer = y1 + y_idx * 16 * x->pre.y_stride + x_idx * 16;
+ x->pre.u_buffer = u1 + y_idx * 8 * x->pre.uv_stride + x_idx * 8;
+ x->pre.v_buffer = v1 + y_idx * 8 * x->pre.uv_stride + x_idx * 8;
+
+ vp9_build_1st_inter16x16_predictors_mb(x,
+ dst_y + y_idx * 16 * dst_ystride + x_idx * 16,
+ dst_u + y_idx * 8 * dst_uvstride + x_idx * 8,
+ dst_v + y_idx * 8 * dst_uvstride + x_idx * 8,
+ dst_ystride, dst_uvstride);
+ if (x->mode_info_context->mbmi.second_ref_frame) {
+ x->second_pre.y_buffer = y2 + y_idx * 16 * x->pre.y_stride + x_idx * 16;
+ x->second_pre.u_buffer = u2 + y_idx * 8 * x->pre.uv_stride + x_idx * 8;
+ x->second_pre.v_buffer = v2 + y_idx * 8 * x->pre.uv_stride + x_idx * 8;
+
+ vp9_build_2nd_inter16x16_predictors_mb(x,
+ dst_y + y_idx * 16 * dst_ystride + x_idx * 16,
+ dst_u + y_idx * 8 * dst_uvstride + x_idx * 8,
+ dst_v + y_idx * 8 * dst_uvstride + x_idx * 8,
+ dst_ystride, dst_uvstride);
+ }
+ }
+
+ x->pre.y_buffer = y1;
+ x->pre.u_buffer = u1;
+ x->pre.v_buffer = v1;
+
+ if (x->mode_info_context->mbmi.second_ref_frame) {
+ x->second_pre.y_buffer = y2;
+ x->second_pre.u_buffer = u2;
+ x->second_pre.v_buffer = v2;
+ }
+}
+#endif
+
+/*
+ * The following functions should be called after an initial
+ * call to vp9_build_1st_inter16x16_predictors_mb() or _mby()/_mbuv().
+ * They run a second sixtap filter on a (different) ref
+ * frame and average the result with the output of the
+ * first sixtap filter. The second reference frame is stored
+ * in x->second_pre (the reference frame index is in
+ * x->mode_info_context->mbmi.second_ref_frame). The second
+ * motion vector is x->mode_info_context->mbmi.second_mv.
+ *
+ * This allows blending prediction from two reference frames,
+ * which sometimes leads to better prediction than from a
+ * single reference frame.
+ */
+void vp9_build_2nd_inter16x16_predictors_mby(MACROBLOCKD *xd,
+ unsigned char *dst_y,
+ int dst_ystride) {
+ unsigned char *ptr;
+
+ int_mv _16x16mv;
+ int mv_row;
+ int mv_col;
+
+ unsigned char *ptr_base = xd->second_pre.y_buffer;
+ int pre_stride = xd->block[0].pre_stride;
+
+ _16x16mv.as_int = xd->mode_info_context->mbmi.mv[1].as_int;
+
+ if (xd->mode_info_context->mbmi.need_to_clamp_secondmv)
+ clamp_mv_to_umv_border(&_16x16mv.as_mv, xd);
+
+ mv_row = _16x16mv.as_mv.row;
+ mv_col = _16x16mv.as_mv.col;
+
+ ptr = ptr_base + (mv_row >> 3) * pre_stride + (mv_col >> 3);
+
+#if CONFIG_PRED_FILTER
+ if (xd->mode_info_context->mbmi.pred_filter_enabled) {
+ if ((mv_row | mv_col) & 7) {
+ // Sub-pel filter needs extended input
+ int len = 15 + (INTERP_EXTEND << 1);
+ unsigned char Temp[32 * 32]; // Data required by sub-pel filter
+ unsigned char *pTemp = Temp + (INTERP_EXTEND - 1) * (len + 1);
+
+ // Copy extended MB into Temp array, applying the spatial filter
+ filter_mb(ptr - (INTERP_EXTEND - 1) * (pre_stride + 1), pre_stride,
+ Temp, len, len, len);
+
+ // Sub-pel filter
+ xd->subpixel_predict_avg16x16(pTemp, len, (mv_col & 7) << 1,
+ (mv_row & 7) << 1, dst_y, dst_ystride);
+ } else {
+ // TODO Needs to AVERAGE with the dst_y
+ // For now, do not apply the prediction filter in these cases!
+ vp9_avg_mem16x16(ptr, pre_stride, dst_y, dst_ystride);
+ }
+ } else
+#endif // CONFIG_PRED_FILTER
+ {
+ if ((mv_row | mv_col) & 7) {
+ xd->subpixel_predict_avg16x16(ptr, pre_stride, (mv_col & 7) << 1,
+ (mv_row & 7) << 1, dst_y, dst_ystride);
+ } else {
+ vp9_avg_mem16x16(ptr, pre_stride, dst_y, dst_ystride);
+ }
+ }
+}
+
+void vp9_build_2nd_inter16x16_predictors_mbuv(MACROBLOCKD *xd,
+ unsigned char *dst_u,
+ unsigned char *dst_v,
+ int dst_uvstride) {
+ int offset;
+ unsigned char *uptr, *vptr;
+
+ int_mv _16x16mv;
+ int mv_row;
+ int mv_col;
+ int omv_row, omv_col;
+
+ int pre_stride = xd->block[0].pre_stride;
+
+ _16x16mv.as_int = xd->mode_info_context->mbmi.mv[1].as_int;
+
+ if (xd->mode_info_context->mbmi.need_to_clamp_secondmv)
+ clamp_mv_to_umv_border(&_16x16mv.as_mv, xd);
+
+ mv_row = _16x16mv.as_mv.row;
+ mv_col = _16x16mv.as_mv.col;
+
+ /* calc uv motion vectors */
+ omv_row = mv_row;
+ omv_col = mv_col;
+ mv_row = (mv_row + (mv_row > 0)) >> 1;
+ mv_col = (mv_col + (mv_col > 0)) >> 1;
+
+ mv_row &= xd->fullpixel_mask;
+ mv_col &= xd->fullpixel_mask;
+
+ pre_stride >>= 1;
+ offset = (mv_row >> 3) * pre_stride + (mv_col >> 3);
+ uptr = xd->second_pre.u_buffer + offset;
+ vptr = xd->second_pre.v_buffer + offset;
+
+#if CONFIG_PRED_FILTER
+ if (xd->mode_info_context->mbmi.pred_filter_enabled) {
+ int i;
+ int len = 7 + (INTERP_EXTEND << 1);
+ unsigned char Temp[32 * 32]; // Data required by sub-pel filter
+ unsigned char *pTemp = Temp + (INTERP_EXTEND - 1) * (len + 1);
+ unsigned char *pSrc = uptr;
+ unsigned char *pDst = dst_u;
+
+ // U & V
+ for (i = 0; i < 2; i++) {
+ if ((omv_row | omv_col) & 15) {
+ // Copy extended MB into Temp array, applying the spatial filter
+ filter_mb(pSrc - (INTERP_EXTEND - 1) * (pre_stride + 1), pre_stride,
+ Temp, len, len, len);
+
+ // Sub-pel filter
+ xd->subpixel_predict_avg8x8(pTemp, len, omv_col & 15,
+ omv_row & 15, pDst, dst_uvstride);
+ } else {
+ // TODO Needs to AVERAGE with the dst_[u|v]
+ // For now, do not apply the prediction filter here!
+ vp9_avg_mem8x8(pSrc, pre_stride, pDst, dst_uvstride);
+ }
+
+ // V
+ pSrc = vptr;
+ pDst = dst_v;
+ }
+ } else
+#endif // CONFIG_PRED_FILTER
+ if ((omv_row | omv_col) & 15) {
+ xd->subpixel_predict_avg8x8(uptr, pre_stride, omv_col & 15,
+ omv_row & 15, dst_u, dst_uvstride);
+ xd->subpixel_predict_avg8x8(vptr, pre_stride, omv_col & 15,
+ omv_row & 15, dst_v, dst_uvstride);
+ } else {
+ vp9_avg_mem8x8(uptr, pre_stride, dst_u, dst_uvstride);
+ vp9_avg_mem8x8(vptr, pre_stride, dst_v, dst_uvstride);
+ }
+}
+
+void vp9_build_2nd_inter16x16_predictors_mb(MACROBLOCKD *xd,
+ unsigned char *dst_y,
+ unsigned char *dst_u,
+ unsigned char *dst_v,
+ int dst_ystride,
+ int dst_uvstride) {
+ vp9_build_2nd_inter16x16_predictors_mby(xd, dst_y, dst_ystride);
+ vp9_build_2nd_inter16x16_predictors_mbuv(xd, dst_u, dst_v, dst_uvstride);
+}
+
+static void build_inter4x4_predictors_mb(MACROBLOCKD *xd) {
+ int i;
+ MB_MODE_INFO * mbmi = &xd->mode_info_context->mbmi;
+ BLOCKD *blockd = xd->block;
+
+ if (xd->mode_info_context->mbmi.partitioning != PARTITIONING_4X4) {
+ blockd[ 0].bmi = xd->mode_info_context->bmi[ 0];
+ blockd[ 2].bmi = xd->mode_info_context->bmi[ 2];
+ blockd[ 8].bmi = xd->mode_info_context->bmi[ 8];
+ blockd[10].bmi = xd->mode_info_context->bmi[10];
+
+ if (mbmi->need_to_clamp_mvs) {
+ clamp_mv_to_umv_border(&blockd[ 0].bmi.as_mv.first.as_mv, xd);
+ clamp_mv_to_umv_border(&blockd[ 2].bmi.as_mv.first.as_mv, xd);
+ clamp_mv_to_umv_border(&blockd[ 8].bmi.as_mv.first.as_mv, xd);
+ clamp_mv_to_umv_border(&blockd[10].bmi.as_mv.first.as_mv, xd);
+ if (mbmi->second_ref_frame) {
+ clamp_mv_to_umv_border(&blockd[ 0].bmi.as_mv.second.as_mv, xd);
+ clamp_mv_to_umv_border(&blockd[ 2].bmi.as_mv.second.as_mv, xd);
+ clamp_mv_to_umv_border(&blockd[ 8].bmi.as_mv.second.as_mv, xd);
+ clamp_mv_to_umv_border(&blockd[10].bmi.as_mv.second.as_mv, xd);
+ }
+ }
+
+
+ vp9_build_inter_predictors4b(xd, &blockd[ 0], 16);
+ vp9_build_inter_predictors4b(xd, &blockd[ 2], 16);
+ vp9_build_inter_predictors4b(xd, &blockd[ 8], 16);
+ vp9_build_inter_predictors4b(xd, &blockd[10], 16);
+
+ if (mbmi->second_ref_frame) {
+ vp9_build_2nd_inter_predictors4b(xd, &blockd[ 0], 16);
+ vp9_build_2nd_inter_predictors4b(xd, &blockd[ 2], 16);
+ vp9_build_2nd_inter_predictors4b(xd, &blockd[ 8], 16);
+ vp9_build_2nd_inter_predictors4b(xd, &blockd[10], 16);
+ }
+ } else {
+ for (i = 0; i < 16; i += 2) {
+ BLOCKD *d0 = &blockd[i];
+ BLOCKD *d1 = &blockd[i + 1];
+
+ blockd[i + 0].bmi = xd->mode_info_context->bmi[i + 0];
+ blockd[i + 1].bmi = xd->mode_info_context->bmi[i + 1];
+
+ if (mbmi->need_to_clamp_mvs) {
+ clamp_mv_to_umv_border(&blockd[i + 0].bmi.as_mv.first.as_mv, xd);
+ clamp_mv_to_umv_border(&blockd[i + 1].bmi.as_mv.first.as_mv, xd);
+ if (mbmi->second_ref_frame) {
+ clamp_mv_to_umv_border(&blockd[i + 0].bmi.as_mv.second.as_mv, xd);
+ clamp_mv_to_umv_border(&blockd[i + 1].bmi.as_mv.second.as_mv, xd);
+ }
+ }
+
+ if (d0->bmi.as_mv.first.as_int == d1->bmi.as_mv.first.as_int)
+ build_inter_predictors2b(xd, d0, 16);
+ else {
+ vp9_build_inter_predictors_b(d0, 16, xd->subpixel_predict);
+ vp9_build_inter_predictors_b(d1, 16, xd->subpixel_predict);
+ }
+
+ if (mbmi->second_ref_frame) {
+ vp9_build_2nd_inter_predictors_b(d0, 16, xd->subpixel_predict_avg);
+ vp9_build_2nd_inter_predictors_b(d1, 16, xd->subpixel_predict_avg);
+ }
+ }
+ }
+
+ for (i = 16; i < 24; i += 2) {
+ BLOCKD *d0 = &blockd[i];
+ BLOCKD *d1 = &blockd[i + 1];
+
+ if (d0->bmi.as_mv.first.as_int == d1->bmi.as_mv.first.as_int)
+ build_inter_predictors2b(xd, d0, 8);
+ else {
+ vp9_build_inter_predictors_b(d0, 8, xd->subpixel_predict);
+ vp9_build_inter_predictors_b(d1, 8, xd->subpixel_predict);
+ }
+
+ if (mbmi->second_ref_frame) {
+ vp9_build_2nd_inter_predictors_b(d0, 8, xd->subpixel_predict_avg);
+ vp9_build_2nd_inter_predictors_b(d1, 8, xd->subpixel_predict_avg);
+ }
+ }
+}
+
+static void build_4x4uvmvs(MACROBLOCKD *xd) {
+ int i, j;
+ BLOCKD *blockd = xd->block;
+
+ for (i = 0; i < 2; i++) {
+ for (j = 0; j < 2; j++) {
+ int yoffset = i * 8 + j * 2;
+ int uoffset = 16 + i * 2 + j;
+ int voffset = 20 + i * 2 + j;
+
+ int temp;
+
+ temp = xd->mode_info_context->bmi[yoffset + 0].as_mv.first.as_mv.row
+ + xd->mode_info_context->bmi[yoffset + 1].as_mv.first.as_mv.row
+ + xd->mode_info_context->bmi[yoffset + 4].as_mv.first.as_mv.row
+ + xd->mode_info_context->bmi[yoffset + 5].as_mv.first.as_mv.row;
+
+ if (temp < 0) temp -= 4;
+ else temp += 4;
+
+ blockd[uoffset].bmi.as_mv.first.as_mv.row = (temp / 8) &
+ xd->fullpixel_mask;
+
+ temp = xd->mode_info_context->bmi[yoffset + 0].as_mv.first.as_mv.col
+ + xd->mode_info_context->bmi[yoffset + 1].as_mv.first.as_mv.col
+ + xd->mode_info_context->bmi[yoffset + 4].as_mv.first.as_mv.col
+ + xd->mode_info_context->bmi[yoffset + 5].as_mv.first.as_mv.col;
+
+ if (temp < 0) temp -= 4;
+ else temp += 4;
+
+ blockd[uoffset].bmi.as_mv.first.as_mv.col = (temp / 8) &
+ xd->fullpixel_mask;
+
+ // if (x->mode_info_context->mbmi.need_to_clamp_mvs)
+ clamp_uvmv_to_umv_border(&blockd[uoffset].bmi.as_mv.first.as_mv, xd);
+
+ // if (x->mode_info_context->mbmi.need_to_clamp_mvs)
+ clamp_uvmv_to_umv_border(&blockd[uoffset].bmi.as_mv.first.as_mv, xd);
+
+ blockd[voffset].bmi.as_mv.first.as_mv.row =
+ blockd[uoffset].bmi.as_mv.first.as_mv.row;
+ blockd[voffset].bmi.as_mv.first.as_mv.col =
+ blockd[uoffset].bmi.as_mv.first.as_mv.col;
+
+ if (xd->mode_info_context->mbmi.second_ref_frame) {
+ temp = xd->mode_info_context->bmi[yoffset + 0].as_mv.second.as_mv.row
+ + xd->mode_info_context->bmi[yoffset + 1].as_mv.second.as_mv.row
+ + xd->mode_info_context->bmi[yoffset + 4].as_mv.second.as_mv.row
+ + xd->mode_info_context->bmi[yoffset + 5].as_mv.second.as_mv.row;
+
+ if (temp < 0) {
+ temp -= 4;
+ } else {
+ temp += 4;
+ }
+
+ blockd[uoffset].bmi.as_mv.second.as_mv.row = (temp / 8) &
+ xd->fullpixel_mask;
+
+ temp = xd->mode_info_context->bmi[yoffset + 0].as_mv.second.as_mv.col
+ + xd->mode_info_context->bmi[yoffset + 1].as_mv.second.as_mv.col
+ + xd->mode_info_context->bmi[yoffset + 4].as_mv.second.as_mv.col
+ + xd->mode_info_context->bmi[yoffset + 5].as_mv.second.as_mv.col;
+
+ if (temp < 0) {
+ temp -= 4;
+ } else {
+ temp += 4;
+ }
+
+ blockd[uoffset].bmi.as_mv.second.as_mv.col = (temp / 8) &
+ xd->fullpixel_mask;
+
+ // if (mbmi->need_to_clamp_mvs)
+ clamp_uvmv_to_umv_border(
+ &blockd[uoffset].bmi.as_mv.second.as_mv, xd);
+
+ // if (mbmi->need_to_clamp_mvs)
+ clamp_uvmv_to_umv_border(
+ &blockd[uoffset].bmi.as_mv.second.as_mv, xd);
+
+ blockd[voffset].bmi.as_mv.second.as_mv.row =
+ blockd[uoffset].bmi.as_mv.second.as_mv.row;
+ blockd[voffset].bmi.as_mv.second.as_mv.col =
+ blockd[uoffset].bmi.as_mv.second.as_mv.col;
+ }
+ }
+ }
+}
+
+void vp9_build_inter_predictors_mb(MACROBLOCKD *xd) {
+ if (xd->mode_info_context->mbmi.mode != SPLITMV) {
+ vp9_build_1st_inter16x16_predictors_mb(xd, xd->predictor,
+ &xd->predictor[256],
+ &xd->predictor[320], 16, 8);
+
+ if (xd->mode_info_context->mbmi.second_ref_frame) {
+ /* 256 = offset of U plane in Y+U+V buffer;
+ * 320 = offset of V plane in Y+U+V buffer.
+ * (256=16x16, 320=16x16+8x8). */
+ vp9_build_2nd_inter16x16_predictors_mb(xd, xd->predictor,
+ &xd->predictor[256],
+ &xd->predictor[320], 16, 8);
+ }
+ } else {
+ build_4x4uvmvs(xd);
+ build_inter4x4_predictors_mb(xd);
+ }
+}
--- /dev/null
+++ b/vp9/common/reconinter.h
@@ -1,0 +1,78 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef __INC_RECONINTER_H
+#define __INC_RECONINTER_H
+
+#include "onyxc_int.h"
+
+extern void vp9_build_1st_inter16x16_predictors_mby(MACROBLOCKD *xd,
+ unsigned char *dst_y,
+ int dst_ystride,
+ int clamp_mvs);
+
+extern void vp9_build_1st_inter16x16_predictors_mbuv(MACROBLOCKD *xd,
+ unsigned char *dst_u,
+ unsigned char *dst_v,
+ int dst_uvstride);
+
+extern void vp9_build_1st_inter16x16_predictors_mb(MACROBLOCKD *xd,
+ unsigned char *dst_y,
+ unsigned char *dst_u,
+ unsigned char *dst_v,
+ int dst_ystride,
+ int dst_uvstride);
+
+extern void vp9_build_2nd_inter16x16_predictors_mby(MACROBLOCKD *xd,
+ unsigned char *dst_y,
+ int dst_ystride);
+
+extern void vp9_build_2nd_inter16x16_predictors_mbuv(MACROBLOCKD *xd,
+ unsigned char *dst_u,
+ unsigned char *dst_v,
+ int dst_uvstride);
+
+extern void vp9_build_2nd_inter16x16_predictors_mb(MACROBLOCKD *xd,
+ unsigned char *dst_y,
+ unsigned char *dst_u,
+ unsigned char *dst_v,
+ int dst_ystride,
+ int dst_uvstride);
+
+#if CONFIG_SUPERBLOCKS
+extern void vp9_build_inter32x32_predictors_sb(MACROBLOCKD *x,
+ unsigned char *dst_y,
+ unsigned char *dst_u,
+ unsigned char *dst_v,
+ int dst_ystride,
+ int dst_uvstride);
+#endif
+
+extern void vp9_build_inter_predictors_mb(MACROBLOCKD *xd);
+
+extern void vp9_build_inter_predictors_b(BLOCKD *d, int pitch,
+ vp9_subpix_fn_t sppf);
+
+extern void vp9_build_2nd_inter_predictors_b(BLOCKD *d, int pitch,
+ vp9_subpix_fn_t sppf);
+
+extern void vp9_build_inter_predictors4b(MACROBLOCKD *xd, BLOCKD *d,
+ int pitch);
+
+extern void vp9_build_2nd_inter_predictors4b(MACROBLOCKD *xd,
+ BLOCKD *d, int pitch);
+
+extern void vp9_build_inter4x4_predictors_mbuv(MACROBLOCKD *xd);
+
+extern void vp9_setup_interp_filters(MACROBLOCKD *xd,
+ INTERPOLATIONFILTERTYPE filter,
+ VP9_COMMON *cm);
+
+#endif // __INC_RECONINTER_H
--- /dev/null
+++ b/vp9/common/reconintra.c
@@ -1,0 +1,490 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include <stdio.h>
+#include <string.h>
+#include "vpx_ports/config.h"
+#include "vpx_rtcd.h"
+#include "reconintra.h"
+#include "vpx_mem/vpx_mem.h"
+
+/* For skip_recon_mb(), add vp9_build_intra_predictors_mby_s(MACROBLOCKD *xd)
+ * and vp9_build_intra_predictors_mbuv_s(MACROBLOCKD *xd).
+ */
+
+static void d27_predictor(uint8_t *ypred_ptr, int y_stride, int n,
+ uint8_t *yabove_row, uint8_t *yleft_col) {
+ int r, c, h, w, v;
+ int a, b;
+ r = 0;
+ for (c = 0; c < n - 2; c++) {
+ if (c & 1)
+ a = yleft_col[r + 1];
+ else
+ a = (yleft_col[r] + yleft_col[r + 1] + 1) >> 1;
+ b = yabove_row[c + 2];
+ ypred_ptr[c] = (2 * a + (c + 1) * b + (c + 3) / 2) / (c + 3);
+ }
+ for (r = 1; r < n / 2 - 1; r++) {
+ for (c = 0; c < n - 2 - 2 * r; c++) {
+ if (c & 1)
+ a = yleft_col[r + 1];
+ else
+ a = (yleft_col[r] + yleft_col[r + 1] + 1) >> 1;
+ b = ypred_ptr[(r - 1) * y_stride + c + 2];
+ ypred_ptr[r * y_stride + c] = (2 * a + (c + 1) * b + (c + 3) / 2) / (c + 3);
+ }
+ }
+ for (; r < n - 1; ++r) {
+ for (c = 0; c < n; c++) {
+ v = (c & 1 ? yleft_col[r + 1] : (yleft_col[r] + yleft_col[r + 1] + 1) >> 1);
+ h = r - c / 2;
+ ypred_ptr[h * y_stride + c] = v;
+ }
+ }
+ c = 0;
+ r = n - 1;
+ ypred_ptr[r * y_stride] = (ypred_ptr[(r - 1) * y_stride] +
+ yleft_col[r] + 1) >> 1;
+ for (r = n - 2; r >= n / 2; --r) {
+ w = c + (n - 1 - r) * 2;
+ ypred_ptr[r * y_stride + w] = (ypred_ptr[(r - 1) * y_stride + w] +
+ ypred_ptr[r * y_stride + w - 1] + 1) >> 1;
+ }
+ for (c = 1; c < n; c++) {
+ for (r = n - 1; r >= n / 2 + c / 2; --r) {
+ w = c + (n - 1 - r) * 2;
+ ypred_ptr[r * y_stride + w] = (ypred_ptr[(r - 1) * y_stride + w] +
+ ypred_ptr[r * y_stride + w - 1] + 1) >> 1;
+ }
+ }
+}
+
+static void d63_predictor(uint8_t *ypred_ptr, int y_stride, int n,
+ uint8_t *yabove_row, uint8_t *yleft_col) {
+ int r, c, h, w, v;
+ int a, b;
+ c = 0;
+ for (r = 0; r < n - 2; r++) {
+ if (r & 1)
+ a = yabove_row[c + 1];
+ else
+ a = (yabove_row[c] + yabove_row[c + 1] + 1) >> 1;
+ b = yleft_col[r + 2];
+ ypred_ptr[r * y_stride] = (2 * a + (r + 1) * b + (r + 3) / 2) / (r + 3);
+ }
+ for (c = 1; c < n / 2 - 1; c++) {
+ for (r = 0; r < n - 2 - 2 * c; r++) {
+ if (r & 1)
+ a = yabove_row[c + 1];
+ else
+ a = (yabove_row[c] + yabove_row[c + 1] + 1) >> 1;
+ b = ypred_ptr[(r + 2) * y_stride + c - 1];
+ ypred_ptr[r * y_stride + c] = (2 * a + (c + 1) * b + (c + 3) / 2) / (c + 3);
+ }
+ }
+ for (; c < n - 1; ++c) {
+ for (r = 0; r < n; r++) {
+ v = (r & 1 ? yabove_row[c + 1] : (yabove_row[c] + yabove_row[c + 1] + 1) >> 1);
+ w = c - r / 2;
+ ypred_ptr[r * y_stride + w] = v;
+ }
+ }
+ r = 0;
+ c = n - 1;
+ ypred_ptr[c] = (ypred_ptr[(c - 1)] + yabove_row[c] + 1) >> 1;
+ for (c = n - 2; c >= n / 2; --c) {
+ h = r + (n - 1 - c) * 2;
+ ypred_ptr[h * y_stride + c] = (ypred_ptr[h * y_stride + c - 1] +
+ ypred_ptr[(h - 1) * y_stride + c] + 1) >> 1;
+ }
+ for (r = 1; r < n; r++) {
+ for (c = n - 1; c >= n / 2 + r / 2; --c) {
+ h = r + (n - 1 - c) * 2;
+ ypred_ptr[h * y_stride + c] = (ypred_ptr[h * y_stride + c - 1] +
+ ypred_ptr[(h - 1) * y_stride + c] + 1) >> 1;
+ }
+ }
+}
+
+static void d45_predictor(uint8_t *ypred_ptr, int y_stride, int n,
+ uint8_t *yabove_row, uint8_t *yleft_col) {
+ int r, c;
+ for (r = 0; r < n - 1; ++r) {
+ for (c = 0; c <= r; ++c) {
+ ypred_ptr[(r - c) * y_stride + c] =
+ (yabove_row[r + 1] * (c + 1) +
+ yleft_col[r + 1] * (r - c + 1) + r / 2 + 1) / (r + 2);
+ }
+ }
+ for (c = 0; c <= r; ++c) {
+ int yabove_ext = yabove_row[r]; // 2*yabove_row[r] - yabove_row[r-1];
+ int yleft_ext = yleft_col[r]; // 2*yleft_col[r] - yleft_col[r-1];
+ yabove_ext = (yabove_ext > 255 ? 255 : (yabove_ext < 0 ? 0 : yabove_ext));
+ yleft_ext = (yleft_ext > 255 ? 255 : (yleft_ext < 0 ? 0 : yleft_ext));
+ ypred_ptr[(r - c) * y_stride + c] =
+ (yabove_ext * (c + 1) +
+ yleft_ext * (r - c + 1) + r / 2 + 1) / (r + 2);
+ }
+ for (r = 1; r < n; ++r) {
+ for (c = n - r; c < n; ++c)
+ ypred_ptr[r * y_stride + c] = (ypred_ptr[(r - 1) * y_stride + c] +
+ ypred_ptr[r * y_stride + c - 1] + 1) >> 1;
+ }
+}
+
+static void d117_predictor(uint8_t *ypred_ptr, int y_stride, int n,
+ uint8_t *yabove_row, uint8_t *yleft_col) {
+ int r, c;
+ for (c = 0; c < n; c++)
+ ypred_ptr[c] = (yabove_row[c - 1] + yabove_row[c] + 1) >> 1;
+ ypred_ptr += y_stride;
+ for (c = 0; c < n; c++)
+ ypred_ptr[c] = yabove_row[c - 1];
+ ypred_ptr += y_stride;
+ for (r = 2; r < n; ++r) {
+ ypred_ptr[0] = yleft_col[r - 2];
+ for (c = 1; c < n; c++)
+ ypred_ptr[c] = ypred_ptr[-2 * y_stride + c - 1];
+ ypred_ptr += y_stride;
+ }
+}
+
+static void d135_predictor(uint8_t *ypred_ptr, int y_stride, int n,
+ uint8_t *yabove_row, uint8_t *yleft_col) {
+ int r, c;
+ ypred_ptr[0] = yabove_row[-1];
+ for (c = 1; c < n; c++)
+ ypred_ptr[c] = yabove_row[c - 1];
+ for (r = 1; r < n; ++r)
+ ypred_ptr[r * y_stride] = yleft_col[r - 1];
+
+ ypred_ptr += y_stride;
+ for (r = 1; r < n; ++r) {
+ for (c = 1; c < n; c++) {
+ ypred_ptr[c] = ypred_ptr[-y_stride + c - 1];
+ }
+ ypred_ptr += y_stride;
+ }
+}
+
+static void d153_predictor(uint8_t *ypred_ptr, int y_stride, int n,
+ uint8_t *yabove_row, uint8_t *yleft_col) {
+ int r, c;
+ ypred_ptr[0] = (yabove_row[-1] + yleft_col[0] + 1) >> 1;
+ for (r = 1; r < n; r++)
+ ypred_ptr[r * y_stride] = (yleft_col[r - 1] + yleft_col[r] + 1) >> 1;
+ ypred_ptr++;
+ ypred_ptr[0] = yabove_row[-1];
+ for (r = 1; r < n; r++)
+ ypred_ptr[r * y_stride] = yleft_col[r - 1];
+ ypred_ptr++;
+
+ for (c = 0; c < n - 2; c++)
+ ypred_ptr[c] = yabove_row[c];
+ ypred_ptr += y_stride;
+ for (r = 1; r < n; ++r) {
+ for (c = 0; c < n - 2; c++)
+ ypred_ptr[c] = ypred_ptr[-y_stride + c - 2];
+ ypred_ptr += y_stride;
+ }
+}
+
+void vp9_recon_intra_mbuv(MACROBLOCKD *xd) {
+ int i;
+
+ for (i = 16; i < 24; i += 2) {
+ BLOCKD *b = &xd->block[i];
+ vp9_recon2b(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride);
+ }
+}
+
+void vp9_build_intra_predictors_internal(unsigned char *src, int src_stride,
+ unsigned char *ypred_ptr,
+ int y_stride, int mode, int bsize,
+ int up_available, int left_available) {
+
+ unsigned char *yabove_row = src - src_stride;
+ unsigned char yleft_col[32];
+ unsigned char ytop_left = yabove_row[-1];
+ int r, c, i;
+
+ for (i = 0; i < bsize; i++) {
+ yleft_col[i] = src[i * src_stride - 1];
+ }
+
+ /* for Y */
+ switch (mode) {
+ case DC_PRED: {
+ int expected_dc;
+ int i;
+ int shift;
+ int average = 0;
+ int log2_bsize_minus_1;
+
+ assert(bsize == 4 || bsize == 8 || bsize == 16 || bsize == 32);
+ if (bsize == 4) {
+ log2_bsize_minus_1 = 1;
+ } else if (bsize == 8) {
+ log2_bsize_minus_1 = 2;
+ } else if (bsize == 16) {
+ log2_bsize_minus_1 = 3;
+ } else /* bsize == 32 */ {
+ log2_bsize_minus_1 = 4;
+ }
+
+ if (up_available || left_available) {
+ if (up_available) {
+ for (i = 0; i < bsize; i++) {
+ average += yabove_row[i];
+ }
+ }
+
+ if (left_available) {
+ for (i = 0; i < bsize; i++) {
+ average += yleft_col[i];
+ }
+ }
+ shift = log2_bsize_minus_1 + up_available + left_available;
+ expected_dc = (average + (1 << (shift - 1))) >> shift;
+ } else {
+ expected_dc = 128;
+ }
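+      /* shift equals log2 of the number of border samples averaged: e.g.
+       * for a 16x16 block with both edges available there are 32 samples,
+       * shift = 3 + 1 + 1 = 5, and expected_dc = (sum + 16) >> 5.
+       */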
+
+ for (r = 0; r < bsize; r++) {
+ vpx_memset(ypred_ptr, expected_dc, bsize);
+ ypred_ptr += y_stride;
+ }
+ }
+ break;
+ case V_PRED: {
+ for (r = 0; r < bsize; r++) {
+ memcpy(ypred_ptr, yabove_row, bsize);
+ ypred_ptr += y_stride;
+ }
+ }
+ break;
+ case H_PRED: {
+ for (r = 0; r < bsize; r++) {
+ vpx_memset(ypred_ptr, yleft_col[r], bsize);
+ ypred_ptr += y_stride;
+ }
+ }
+ break;
+ case TM_PRED: {
+ for (r = 0; r < bsize; r++) {
+ for (c = 0; c < bsize; c++) {
+        int pred = yleft_col[r] + yabove_row[c] - ytop_left;
+
+ if (pred < 0)
+ pred = 0;
+
+ if (pred > 255)
+ pred = 255;
+
+ ypred_ptr[c] = pred;
+ }
+
+ ypred_ptr += y_stride;
+ }
+ }
+ break;
+ case D45_PRED: {
+ d45_predictor(ypred_ptr, y_stride, bsize, yabove_row, yleft_col);
+ }
+ break;
+ case D135_PRED: {
+ d135_predictor(ypred_ptr, y_stride, bsize, yabove_row, yleft_col);
+ }
+ break;
+ case D117_PRED: {
+ d117_predictor(ypred_ptr, y_stride, bsize, yabove_row, yleft_col);
+ }
+ break;
+ case D153_PRED: {
+ d153_predictor(ypred_ptr, y_stride, bsize, yabove_row, yleft_col);
+ }
+ break;
+ case D27_PRED: {
+ d27_predictor(ypred_ptr, y_stride, bsize, yabove_row, yleft_col);
+ }
+ break;
+ case D63_PRED: {
+ d63_predictor(ypred_ptr, y_stride, bsize, yabove_row, yleft_col);
+ }
+ break;
+ case I8X8_PRED:
+ case B_PRED:
+ case NEARESTMV:
+ case NEARMV:
+ case ZEROMV:
+ case NEWMV:
+ case SPLITMV:
+ case MB_MODE_COUNT:
+ break;
+ }
+}
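+
+/* bsize is 4 for the chroma 4x4 helper, 8 for 8x8 intra prediction, 16 for
+ * a full macroblock and 32 for a superblock; see the callers below.
+ */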
+
+void vp9_build_intra_predictors_mby(MACROBLOCKD *xd) {
+ vp9_build_intra_predictors_internal(xd->dst.y_buffer, xd->dst.y_stride,
+ xd->predictor, 16,
+ xd->mode_info_context->mbmi.mode, 16,
+ xd->up_available, xd->left_available);
+}
+
+void vp9_build_intra_predictors_mby_s(MACROBLOCKD *xd) {
+ vp9_build_intra_predictors_internal(xd->dst.y_buffer, xd->dst.y_stride,
+ xd->dst.y_buffer, xd->dst.y_stride,
+ xd->mode_info_context->mbmi.mode, 16,
+ xd->up_available, xd->left_available);
+}
+
+#if CONFIG_SUPERBLOCKS
+void vp9_build_intra_predictors_sby_s(MACROBLOCKD *xd) {
+ vp9_build_intra_predictors_internal(xd->dst.y_buffer, xd->dst.y_stride,
+ xd->dst.y_buffer, xd->dst.y_stride,
+ xd->mode_info_context->mbmi.mode, 32,
+ xd->up_available, xd->left_available);
+}
+#endif
+
+#if CONFIG_COMP_INTRA_PRED
+void vp9_build_comp_intra_predictors_mby(MACROBLOCKD *xd) {
+ unsigned char predictor[2][256];
+ int i;
+
+ vp9_build_intra_predictors_internal(xd->dst.y_buffer, xd->dst.y_stride,
+ predictor[0], 16,
+ xd->mode_info_context->mbmi.mode,
+ 16, xd->up_available,
+ xd->left_available);
+ vp9_build_intra_predictors_internal(xd->dst.y_buffer, xd->dst.y_stride,
+ predictor[1], 16,
+ xd->mode_info_context->mbmi.second_mode,
+ 16, xd->up_available,
+ xd->left_available);
+
+ for (i = 0; i < 256; i++) {
+ xd->predictor[i] = (predictor[0][i] + predictor[1][i] + 1) >> 1;
+ }
+}
+#endif
+
+void vp9_build_intra_predictors_mbuv_internal(MACROBLOCKD *xd,
+ unsigned char *upred_ptr,
+ unsigned char *vpred_ptr,
+ int uv_stride,
+ int mode, int bsize) {
+ vp9_build_intra_predictors_internal(xd->dst.u_buffer, xd->dst.uv_stride,
+ upred_ptr, uv_stride, mode, bsize,
+ xd->up_available, xd->left_available);
+ vp9_build_intra_predictors_internal(xd->dst.v_buffer, xd->dst.uv_stride,
+ vpred_ptr, uv_stride, mode, bsize,
+ xd->up_available, xd->left_available);
+}
+
+void vp9_build_intra_predictors_mbuv(MACROBLOCKD *xd) {
+ vp9_build_intra_predictors_mbuv_internal(xd, &xd->predictor[256],
+ &xd->predictor[320], 8,
+ xd->mode_info_context->mbmi.uv_mode,
+ 8);
+}
+
+void vp9_build_intra_predictors_mbuv_s(MACROBLOCKD *xd) {
+ vp9_build_intra_predictors_mbuv_internal(xd, xd->dst.u_buffer,
+ xd->dst.v_buffer,
+ xd->dst.uv_stride,
+ xd->mode_info_context->mbmi.uv_mode,
+ 8);
+}
+
+#if CONFIG_SUPERBLOCKS
+void vp9_build_intra_predictors_sbuv_s(MACROBLOCKD *xd) {
+ vp9_build_intra_predictors_mbuv_internal(xd, xd->dst.u_buffer,
+ xd->dst.v_buffer, xd->dst.uv_stride,
+ xd->mode_info_context->mbmi.uv_mode,
+ 16);
+}
+#endif
+
+#if CONFIG_COMP_INTRA_PRED
+void vp9_build_comp_intra_predictors_mbuv(MACROBLOCKD *xd) {
+ unsigned char predictor[2][2][64];
+ int i;
+
+ vp9_build_intra_predictors_mbuv_internal(
+ xd, predictor[0][0], predictor[1][0], 8,
+ xd->mode_info_context->mbmi.uv_mode, 8);
+ vp9_build_intra_predictors_mbuv_internal(
+ xd, predictor[0][1], predictor[1][1], 8,
+ xd->mode_info_context->mbmi.second_uv_mode, 8);
+ for (i = 0; i < 64; i++) {
+ xd->predictor[256 + i] = (predictor[0][0][i] + predictor[0][1][i] + 1) >> 1;
+ xd->predictor[256 + 64 + i] = (predictor[1][0][i] +
+ predictor[1][1][i] + 1) >> 1;
+ }
+}
+#endif
+
+void vp9_intra8x8_predict(BLOCKD *xd,
+ int mode,
+ unsigned char *predictor) {
+ vp9_build_intra_predictors_internal(*(xd->base_dst) + xd->dst,
+ xd->dst_stride, predictor, 16,
+ mode, 8, 1, 1);
+}
+
+#if CONFIG_COMP_INTRA_PRED
+void vp9_comp_intra8x8_predict(BLOCKD *xd,
+ int mode, int second_mode,
+ unsigned char *out_predictor) {
+ unsigned char predictor[2][8 * 16];
+ int i, j;
+
+ vp9_intra8x8_predict(xd, mode, predictor[0]);
+ vp9_intra8x8_predict(xd, second_mode, predictor[1]);
+
+ for (i = 0; i < 8 * 16; i += 16) {
+ for (j = i; j < i + 8; j++) {
+ out_predictor[j] = (predictor[0][j] + predictor[1][j] + 1) >> 1;
+ }
+ }
+}
+#endif
+
+void vp9_intra_uv4x4_predict(BLOCKD *xd,
+ int mode,
+ unsigned char *predictor) {
+ vp9_build_intra_predictors_internal(*(xd->base_dst) + xd->dst,
+ xd->dst_stride, predictor, 8,
+ mode, 4, 1, 1);
+}
+
+#if CONFIG_COMP_INTRA_PRED
+void vp9_comp_intra_uv4x4_predict(BLOCKD *xd,
+ int mode, int mode2,
+ unsigned char *out_predictor) {
+ unsigned char predictor[2][8 * 4];
+ int i, j;
+
+ vp9_intra_uv4x4_predict(xd, mode, predictor[0]);
+ vp9_intra_uv4x4_predict(xd, mode2, predictor[1]);
+
+ for (i = 0; i < 4 * 8; i += 8) {
+ for (j = i; j < i + 4; j++) {
+ out_predictor[j] = (predictor[0][j] + predictor[1][j] + 1) >> 1;
+ }
+ }
+}
+#endif
+
+/* TODO: try different ways of using Y-UV mode correlation.
+   The current code assumes that a UV 4x4 block uses the same mode
+   as the corresponding Y 8x8 area.
+ */
--- /dev/null
+++ b/vp9/common/reconintra.h
@@ -1,0 +1,18 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef __INC_RECONINTRA_H
+#define __INC_RECONINTRA_H
+
+#include "blockd.h"
+
+extern void init_intra_left_above_pixels(MACROBLOCKD *xd);
+
+#endif // __INC_RECONINTRA_H
--- /dev/null
+++ b/vp9/common/reconintra4x4.c
@@ -1,0 +1,321 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "vpx_ports/config.h"
+#include "vpx_mem/vpx_mem.h"
+#include "reconintra.h"
+#include "vpx_rtcd.h"
+
+void vp9_intra4x4_predict_c(BLOCKD *x, int b_mode,
+ unsigned char *predictor) {
+ int i, r, c;
+
+ unsigned char *Above = *(x->base_dst) + x->dst - x->dst_stride;
+ unsigned char Left[4];
+ unsigned char top_left = Above[-1];
+
+ Left[0] = (*(x->base_dst))[x->dst - 1];
+ Left[1] = (*(x->base_dst))[x->dst - 1 + x->dst_stride];
+ Left[2] = (*(x->base_dst))[x->dst - 1 + 2 * x->dst_stride];
+ Left[3] = (*(x->base_dst))[x->dst - 1 + 3 * x->dst_stride];
+
+ switch (b_mode) {
+ case B_DC_PRED: {
+ int expected_dc = 0;
+
+ for (i = 0; i < 4; i++) {
+ expected_dc += Above[i];
+ expected_dc += Left[i];
+ }
+
+ expected_dc = (expected_dc + 4) >> 3;
+
+ for (r = 0; r < 4; r++) {
+ for (c = 0; c < 4; c++) {
+ predictor[c] = expected_dc;
+ }
+
+ predictor += 16;
+ }
+ }
+ break;
+ case B_TM_PRED: {
+ /* prediction similar to true_motion prediction */
+ for (r = 0; r < 4; r++) {
+ for (c = 0; c < 4; c++) {
+ int pred = Above[c] - top_left + Left[r];
+
+ if (pred < 0)
+ pred = 0;
+
+ if (pred > 255)
+ pred = 255;
+
+ predictor[c] = pred;
+ }
+
+ predictor += 16;
+ }
+ }
+ break;
+
+ case B_VE_PRED: {
+
+ unsigned int ap[4];
+ ap[0] = Above[0];
+ ap[1] = Above[1];
+ ap[2] = Above[2];
+ ap[3] = Above[3];
+
+ for (r = 0; r < 4; r++) {
+ for (c = 0; c < 4; c++) {
+
+ predictor[c] = ap[c];
+ }
+
+ predictor += 16;
+ }
+
+ }
+ break;
+
+
+ case B_HE_PRED: {
+
+ unsigned int lp[4];
+ lp[0] = Left[0];
+ lp[1] = Left[1];
+ lp[2] = Left[2];
+ lp[3] = Left[3];
+
+ for (r = 0; r < 4; r++) {
+ for (c = 0; c < 4; c++) {
+ predictor[c] = lp[r];
+ }
+
+ predictor += 16;
+ }
+ }
+ break;
+ case B_LD_PRED: {
+ unsigned char *ptr = Above;
+ predictor[0 * 16 + 0] = (ptr[0] + ptr[1] * 2 + ptr[2] + 2) >> 2;
+ predictor[0 * 16 + 1] =
+ predictor[1 * 16 + 0] = (ptr[1] + ptr[2] * 2 + ptr[3] + 2) >> 2;
+ predictor[0 * 16 + 2] =
+ predictor[1 * 16 + 1] =
+ predictor[2 * 16 + 0] = (ptr[2] + ptr[3] * 2 + ptr[4] + 2) >> 2;
+ predictor[0 * 16 + 3] =
+ predictor[1 * 16 + 2] =
+ predictor[2 * 16 + 1] =
+ predictor[3 * 16 + 0] = (ptr[3] + ptr[4] * 2 + ptr[5] + 2) >> 2;
+ predictor[1 * 16 + 3] =
+ predictor[2 * 16 + 2] =
+ predictor[3 * 16 + 1] = (ptr[4] + ptr[5] * 2 + ptr[6] + 2) >> 2;
+ predictor[2 * 16 + 3] =
+ predictor[3 * 16 + 2] = (ptr[5] + ptr[6] * 2 + ptr[7] + 2) >> 2;
+ predictor[3 * 16 + 3] = (ptr[6] + ptr[7] * 2 + ptr[7] + 2) >> 2;
+
+ }
+ break;
+ case B_RD_PRED: {
+
+ unsigned char pp[9];
+
+ pp[0] = Left[3];
+ pp[1] = Left[2];
+ pp[2] = Left[1];
+ pp[3] = Left[0];
+ pp[4] = top_left;
+ pp[5] = Above[0];
+ pp[6] = Above[1];
+ pp[7] = Above[2];
+ pp[8] = Above[3];
+
+ predictor[3 * 16 + 0] = (pp[0] + pp[1] * 2 + pp[2] + 2) >> 2;
+ predictor[3 * 16 + 1] =
+ predictor[2 * 16 + 0] = (pp[1] + pp[2] * 2 + pp[3] + 2) >> 2;
+ predictor[3 * 16 + 2] =
+ predictor[2 * 16 + 1] =
+ predictor[1 * 16 + 0] = (pp[2] + pp[3] * 2 + pp[4] + 2) >> 2;
+ predictor[3 * 16 + 3] =
+ predictor[2 * 16 + 2] =
+ predictor[1 * 16 + 1] =
+ predictor[0 * 16 + 0] = (pp[3] + pp[4] * 2 + pp[5] + 2) >> 2;
+ predictor[2 * 16 + 3] =
+ predictor[1 * 16 + 2] =
+ predictor[0 * 16 + 1] = (pp[4] + pp[5] * 2 + pp[6] + 2) >> 2;
+ predictor[1 * 16 + 3] =
+ predictor[0 * 16 + 2] = (pp[5] + pp[6] * 2 + pp[7] + 2) >> 2;
+ predictor[0 * 16 + 3] = (pp[6] + pp[7] * 2 + pp[8] + 2) >> 2;
+
+ }
+ break;
+ case B_VR_PRED: {
+
+ unsigned char pp[9];
+
+ pp[0] = Left[3];
+ pp[1] = Left[2];
+ pp[2] = Left[1];
+ pp[3] = Left[0];
+ pp[4] = top_left;
+ pp[5] = Above[0];
+ pp[6] = Above[1];
+ pp[7] = Above[2];
+ pp[8] = Above[3];
+
+
+ predictor[3 * 16 + 0] = (pp[1] + pp[2] * 2 + pp[3] + 2) >> 2;
+ predictor[2 * 16 + 0] = (pp[2] + pp[3] * 2 + pp[4] + 2) >> 2;
+ predictor[3 * 16 + 1] =
+ predictor[1 * 16 + 0] = (pp[3] + pp[4] * 2 + pp[5] + 2) >> 2;
+ predictor[2 * 16 + 1] =
+ predictor[0 * 16 + 0] = (pp[4] + pp[5] + 1) >> 1;
+ predictor[3 * 16 + 2] =
+ predictor[1 * 16 + 1] = (pp[4] + pp[5] * 2 + pp[6] + 2) >> 2;
+ predictor[2 * 16 + 2] =
+ predictor[0 * 16 + 1] = (pp[5] + pp[6] + 1) >> 1;
+ predictor[3 * 16 + 3] =
+ predictor[1 * 16 + 2] = (pp[5] + pp[6] * 2 + pp[7] + 2) >> 2;
+ predictor[2 * 16 + 3] =
+ predictor[0 * 16 + 2] = (pp[6] + pp[7] + 1) >> 1;
+ predictor[1 * 16 + 3] = (pp[6] + pp[7] * 2 + pp[8] + 2) >> 2;
+ predictor[0 * 16 + 3] = (pp[7] + pp[8] + 1) >> 1;
+
+ }
+ break;
+ case B_VL_PRED: {
+
+ unsigned char *pp = Above;
+
+ predictor[0 * 16 + 0] = (pp[0] + pp[1] + 1) >> 1;
+ predictor[1 * 16 + 0] = (pp[0] + pp[1] * 2 + pp[2] + 2) >> 2;
+ predictor[2 * 16 + 0] =
+ predictor[0 * 16 + 1] = (pp[1] + pp[2] + 1) >> 1;
+ predictor[1 * 16 + 1] =
+ predictor[3 * 16 + 0] = (pp[1] + pp[2] * 2 + pp[3] + 2) >> 2;
+ predictor[2 * 16 + 1] =
+ predictor[0 * 16 + 2] = (pp[2] + pp[3] + 1) >> 1;
+ predictor[3 * 16 + 1] =
+ predictor[1 * 16 + 2] = (pp[2] + pp[3] * 2 + pp[4] + 2) >> 2;
+ predictor[0 * 16 + 3] =
+ predictor[2 * 16 + 2] = (pp[3] + pp[4] + 1) >> 1;
+ predictor[1 * 16 + 3] =
+ predictor[3 * 16 + 2] = (pp[3] + pp[4] * 2 + pp[5] + 2) >> 2;
+ predictor[2 * 16 + 3] = (pp[4] + pp[5] * 2 + pp[6] + 2) >> 2;
+ predictor[3 * 16 + 3] = (pp[5] + pp[6] * 2 + pp[7] + 2) >> 2;
+ }
+ break;
+
+ case B_HD_PRED: {
+ unsigned char pp[9];
+ pp[0] = Left[3];
+ pp[1] = Left[2];
+ pp[2] = Left[1];
+ pp[3] = Left[0];
+ pp[4] = top_left;
+ pp[5] = Above[0];
+ pp[6] = Above[1];
+ pp[7] = Above[2];
+ pp[8] = Above[3];
+
+
+ predictor[3 * 16 + 0] = (pp[0] + pp[1] + 1) >> 1;
+ predictor[3 * 16 + 1] = (pp[0] + pp[1] * 2 + pp[2] + 2) >> 2;
+ predictor[2 * 16 + 0] =
+ predictor[3 * 16 + 2] = (pp[1] + pp[2] + 1) >> 1;
+ predictor[2 * 16 + 1] =
+ predictor[3 * 16 + 3] = (pp[1] + pp[2] * 2 + pp[3] + 2) >> 2;
+ predictor[2 * 16 + 2] =
+ predictor[1 * 16 + 0] = (pp[2] + pp[3] + 1) >> 1;
+ predictor[2 * 16 + 3] =
+ predictor[1 * 16 + 1] = (pp[2] + pp[3] * 2 + pp[4] + 2) >> 2;
+ predictor[1 * 16 + 2] =
+ predictor[0 * 16 + 0] = (pp[3] + pp[4] + 1) >> 1;
+ predictor[1 * 16 + 3] =
+ predictor[0 * 16 + 1] = (pp[3] + pp[4] * 2 + pp[5] + 2) >> 2;
+ predictor[0 * 16 + 2] = (pp[4] + pp[5] * 2 + pp[6] + 2) >> 2;
+ predictor[0 * 16 + 3] = (pp[5] + pp[6] * 2 + pp[7] + 2) >> 2;
+ }
+ break;
+
+
+ case B_HU_PRED: {
+ unsigned char *pp = Left;
+ predictor[0 * 16 + 0] = (pp[0] + pp[1] + 1) >> 1;
+ predictor[0 * 16 + 1] = (pp[0] + pp[1] * 2 + pp[2] + 2) >> 2;
+ predictor[0 * 16 + 2] =
+ predictor[1 * 16 + 0] = (pp[1] + pp[2] + 1) >> 1;
+ predictor[0 * 16 + 3] =
+ predictor[1 * 16 + 1] = (pp[1] + pp[2] * 2 + pp[3] + 2) >> 2;
+ predictor[1 * 16 + 2] =
+ predictor[2 * 16 + 0] = (pp[2] + pp[3] + 1) >> 1;
+ predictor[1 * 16 + 3] =
+ predictor[2 * 16 + 1] = (pp[2] + pp[3] * 2 + pp[3] + 2) >> 2;
+ predictor[2 * 16 + 2] =
+ predictor[2 * 16 + 3] =
+ predictor[3 * 16 + 0] =
+ predictor[3 * 16 + 1] =
+ predictor[3 * 16 + 2] =
+ predictor[3 * 16 + 3] = pp[3];
+ }
+ break;
+
+
+ }
+}
+
+#if CONFIG_COMP_INTRA_PRED
+void vp9_comp_intra4x4_predict_c(BLOCKD *x,
+ int b_mode, int b_mode2,
+ unsigned char *out_predictor) {
+ unsigned char predictor[2][4 * 16];
+ int i, j;
+
+ vp9_intra4x4_predict(x, b_mode, predictor[0]);
+ vp9_intra4x4_predict(x, b_mode2, predictor[1]);
+
+ for (i = 0; i < 16 * 4; i += 16) {
+ for (j = i; j < i + 4; j++) {
+ out_predictor[j] = (predictor[0][j] + predictor[1][j] + 1) >> 1;
+ }
+ }
+}
+#endif
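+
+/* A minimal standalone sketch (not part of the codec) of two arithmetic
+ * details used above: the predictors write into a buffer with a fixed stride
+ * of 16, the DC predictor rounds the sum of the eight neighbouring pixels,
+ * and the compound-intra variants blend two predictions with a rounded
+ * average.  Assuming nothing beyond the C standard library:
+ *
+ *   #include <stdio.h>
+ *
+ *   int main(void) {
+ *     unsigned char above[4] = {10, 10, 10, 10};
+ *     unsigned char left[4]  = {20, 20, 20, 20};
+ *     int sum = 0, i;
+ *     for (i = 0; i < 4; i++)
+ *       sum += above[i] + left[i];
+ *     printf("DC predictor: %d\n", (sum + 4) >> 3);       // (120 + 4) >> 3 = 15
+ *     printf("blend(15, 20): %d\n", (15 + 20 + 1) >> 1);  // rounded average = 18
+ *     return 0;
+ *   }
+ */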
+
+/* Copy 4 bytes from the above-right block downwards so that the 4x4
+ * prediction modes that use pixels above and to the right have filled-in
+ * pixels to work with.
+ */
+void vp9_intra_prediction_down_copy(MACROBLOCKD *xd) {
+ int extend_edge = (xd->mb_to_right_edge == 0 && xd->mb_index < 2);
+ unsigned char *above_right = *(xd->block[0].base_dst) + xd->block[0].dst -
+ xd->block[0].dst_stride + 16;
+ unsigned int *src_ptr = (unsigned int *)
+ (above_right - (xd->mb_index == 3 ? 16 * xd->block[0].dst_stride : 0));
+
+ unsigned int *dst_ptr0 = (unsigned int *)above_right;
+ unsigned int *dst_ptr1 =
+ (unsigned int *)(above_right + 4 * xd->block[0].dst_stride);
+ unsigned int *dst_ptr2 =
+ (unsigned int *)(above_right + 8 * xd->block[0].dst_stride);
+ unsigned int *dst_ptr3 =
+ (unsigned int *)(above_right + 12 * xd->block[0].dst_stride);
+
+ if (extend_edge) {
+ *src_ptr = ((uint8_t *) src_ptr)[-1] * 0x01010101U;
+ }
+
+ *dst_ptr0 = *src_ptr;
+ *dst_ptr1 = *src_ptr;
+ *dst_ptr2 = *src_ptr;
+ *dst_ptr3 = *src_ptr;
+}
--- /dev/null
+++ b/vp9/common/reconintra4x4.h
@@ -1,0 +1,17 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef __INC_RECONINTRA4x4_H
+#define __INC_RECONINTRA4x4_H
+
+extern void vp9_intra_prediction_down_copy(MACROBLOCKD *xd);
+
+#endif
--- /dev/null
+++ b/vp9/common/rtcd.c
@@ -1,0 +1,105 @@
+/*
+ * Copyright (c) 2011 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+#include "vpx_config.h"
+#define RTCD_C
+#include "vpx_rtcd.h"
+
+#if CONFIG_MULTITHREAD && defined(_WIN32)
+#include <windows.h>
+#include <stdlib.h>
+static void once(void (*func)(void))
+{
+ static CRITICAL_SECTION *lock;
+ static LONG waiters;
+ static int done;
+ void *lock_ptr = &lock;
+
+ /* If the initialization is complete, return early. This isn't just an
+ * optimization, it prevents races on the destruction of the global
+ * lock.
+ */
+ if(done)
+ return;
+
+ InterlockedIncrement(&waiters);
+
+ /* Get a lock. We create one and try to make it the one-true-lock,
+ * throwing it away if we lost the race.
+ */
+
+ {
+ /* Scope to protect access to new_lock */
+ CRITICAL_SECTION *new_lock = malloc(sizeof(CRITICAL_SECTION));
+ InitializeCriticalSection(new_lock);
+ if (InterlockedCompareExchangePointer(lock_ptr, new_lock, NULL) != NULL)
+ {
+ DeleteCriticalSection(new_lock);
+ free(new_lock);
+ }
+ }
+
+ /* At this point, we have a lock that can be synchronized on. We don't
+ * care which thread actually performed the allocation.
+ */
+
+ EnterCriticalSection(lock);
+
+ if (!done)
+ {
+ func();
+ done = 1;
+ }
+
+ LeaveCriticalSection(lock);
+
+ /* Last one out should free resources. The destructed objects are
+ * protected by checking if(done) above.
+ */
+ if(!InterlockedDecrement(&waiters))
+ {
+ DeleteCriticalSection(lock);
+ free(lock);
+ lock = NULL;
+ }
+}
+
+
+#elif CONFIG_MULTITHREAD && HAVE_PTHREAD_H
+#include <pthread.h>
+static void once(void (*func)(void))
+{
+ static pthread_once_t lock = PTHREAD_ONCE_INIT;
+ pthread_once(&lock, func);
+}
+
+
+#else
+/* No-op version that performs no synchronization. vpx_rtcd() is idempotent,
+ * so as long as your platform provides atomic loads/stores of pointers,
+ * no synchronization is strictly necessary.
+ */
+
+static void once(void (*func)(void))
+{
+ static int done;
+
+ if(!done)
+ {
+ func();
+ done = 1;
+ }
+}
+#endif
+
+
+void vpx_rtcd()
+{
+ once(setup_rtcd_internal);
+}
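+
+/* A hypothetical caller, sketched here for illustration only: vpx_rtcd() is
+ * expected to run once before any RTCD-dispatched function is used, and
+ * because once() makes it idempotent, repeated calls are harmless.
+ *
+ *   void my_codec_global_init(void) {   // hypothetical helper name
+ *     vpx_rtcd();   // fills in the dispatch tables via setup_rtcd_internal()
+ *   }
+ */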
--- /dev/null
+++ b/vp9/common/rtcd_defs.sh
@@ -1,0 +1,482 @@
+common_forward_decls() {
+cat <<EOF
+
+struct loop_filter_info;
+struct blockd;
+struct macroblockd;
+struct loop_filter_info;
+
+/* Encoder forward decls */
+struct block;
+struct macroblock;
+struct variance_vtable;
+
+/* Encoder forward decls */
+struct variance_vtable;
+union int_mv;
+struct yv12_buffer_config;
+EOF
+}
+forward_decls common_forward_decls
+
+prototype void vp9_filter_block2d_4x4_8 "const unsigned char *src_ptr, const unsigned int src_stride, const short *HFilter_aligned16, const short *VFilter_aligned16, unsigned char *dst_ptr, unsigned int dst_stride"
+prototype void vp9_filter_block2d_8x4_8 "const unsigned char *src_ptr, const unsigned int src_stride, const short *HFilter_aligned16, const short *VFilter_aligned16, unsigned char *dst_ptr, unsigned int dst_stride"
+prototype void vp9_filter_block2d_8x8_8 "const unsigned char *src_ptr, const unsigned int src_stride, const short *HFilter_aligned16, const short *VFilter_aligned16, unsigned char *dst_ptr, unsigned int dst_stride"
+prototype void vp9_filter_block2d_16x16_8 "const unsigned char *src_ptr, const unsigned int src_stride, const short *HFilter_aligned16, const short *VFilter_aligned16, unsigned char *dst_ptr, unsigned int dst_stride"
+
+# At the very least, MSVC 2008 has a compiler bug exhibited by this code; the
+# code compiles warning-free, but a disassembly of the generated code shows
+# bugs. To be on the safe side, this is only enabled when compiling with 'gcc'.
+if [ "$CONFIG_GCC" = "yes" ]; then
+ specialize vp9_filter_block2d_4x4_8 sse4_1 sse2
+fi
+ specialize vp9_filter_block2d_8x4_8 ssse3 #sse4_1 sse2
+ specialize vp9_filter_block2d_8x8_8 ssse3 #sse4_1 sse2
+ specialize vp9_filter_block2d_16x16_8 ssse3 #sse4_1 sse2
+
+#
+# Dequant
+#
+prototype void vp9_dequantize_b "struct blockd *x"
+specialize vp9_dequantize_b mmx
+
+prototype void vp9_dequantize_b_2x2 "struct blockd *x"
+specialize vp9_dequantize_b_2x2
+
+prototype void vp9_dequant_dc_idct_add_y_block_8x8 "short *q, short *dq, unsigned char *pre, unsigned char *dst, int stride, char *eobs, short *dc, struct macroblockd *xd"
+specialize vp9_dequant_dc_idct_add_y_block_8x8
+
+prototype void vp9_dequant_idct_add_y_block_8x8 "short *q, short *dq, unsigned char *pre, unsigned char *dst, int stride, char *eobs, struct macroblockd *xd"
+specialize vp9_dequant_idct_add_y_block_8x8
+
+prototype void vp9_dequant_idct_add_uv_block_8x8 "short *q, short *dq, unsigned char *pre, unsigned char *dstu, unsigned char *dstv, int stride, char *eobs, struct macroblockd *xd"
+specialize vp9_dequant_idct_add_uv_block_8x8
+
+prototype void vp9_dequant_idct_add_16x16 "short *input, short *dq, unsigned char *pred, unsigned char *dest, int pitch, int stride"
+specialize vp9_dequant_idct_add_16x16
+
+prototype void vp9_dequant_idct_add "short *input, short *dq, unsigned char *pred, unsigned char *dest, int pitch, int stride"
+specialize vp9_dequant_idct_add
+
+prototype void vp9_dequant_dc_idct_add "short *input, short *dq, unsigned char *pred, unsigned char *dest, int pitch, int stride, int Dc"
+specialize vp9_dequant_dc_idct_add
+
+prototype void vp9_dequant_dc_idct_add_y_block "short *q, short *dq, unsigned char *pre, unsigned char *dst, int stride, char *eobs, short *dc"
+specialize vp9_dequant_dc_idct_add_y_block mmx
+
+prototype void vp9_dequant_idct_add_y_block "short *q, short *dq, unsigned char *pre, unsigned char *dst, int stride, char *eobs"
+specialize vp9_dequant_idct_add_y_block mmx
+
+prototype void vp9_dequant_idct_add_uv_block "short *q, short *dq, unsigned char *pre, unsigned char *dstu, unsigned char *dstv, int stride, char *eobs"
+specialize vp9_dequant_idct_add_uv_block mmx
+
+#
+# RECON
+#
+prototype void vp9_copy_mem16x16 "unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch"
+specialize vp9_copy_mem16x16 mmx sse2 media neon dspr2
+vp9_copy_mem16x16_media=vp9_copy_mem16x16_v6
+vp9_copy_mem16x16_dspr2=vp9_copy_mem16x16_dspr2
+
+prototype void vp9_copy_mem8x8 "unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch"
+specialize vp9_copy_mem8x8 mmx media neon dspr2
+vp9_copy_mem8x8_media=vp9_copy_mem8x8_v6
+vp9_copy_mem8x8_dspr2=vp9_copy_mem8x8_dspr2
+
+prototype void vp9_copy_mem8x4 "unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch"
+specialize vp9_copy_mem8x4 mmx
+
+prototype void vp9_intra4x4_predict "unsigned char *Above, unsigned char *yleft, int left_stride, B_PREDICTION_MODE b_mode, unsigned char *dst, int dst_stride, unsigned char top_left"
+specialize vp9_intra4x4_predict
+
+prototype void vp9_avg_mem16x16 "unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch"
+specialize vp9_avg_mem16x16
+
+prototype void vp9_avg_mem8x8 "unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch"
+specialize vp9_avg_mem8x8
+
+prototype void vp9_copy_mem8x4 "unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch"
+specialize vp9_copy_mem8x4 mmx media neon dspr2
+vp9_copy_mem8x4_media=vp9_copy_mem8x4_v6
+vp9_copy_mem8x4_dspr2=vp9_copy_mem8x4_dspr2
+
+prototype void vp9_recon_b "unsigned char *pred_ptr, short *diff_ptr, unsigned char *dst_ptr, int stride"
+specialize vp9_recon_b
+
+prototype void vp9_recon_uv_b "unsigned char *pred_ptr, short *diff_ptr, unsigned char *dst_ptr, int stride"
+specialize vp9_recon_uv_b
+
+prototype void vp9_recon2b "unsigned char *pred_ptr, short *diff_ptr, unsigned char *dst_ptr, int stride"
+specialize vp9_recon2b sse2
+
+prototype void vp9_recon4b "unsigned char *pred_ptr, short *diff_ptr, unsigned char *dst_ptr, int stride"
+specialize vp9_recon4b sse2
+
+prototype void vp9_recon_mb "struct macroblockd *x"
+specialize vp9_recon_mb
+
+prototype void vp9_recon_mby "struct macroblockd *x"
+specialize vp9_recon_mby
+
+prototype void vp9_build_intra_predictors_mby_s "struct macroblockd *x"
+specialize vp9_build_intra_predictors_mby_s
+
+prototype void vp9_build_intra_predictors_sby_s "struct macroblockd *x"
+specialize vp9_build_intra_predictors_sby_s;
+
+prototype void vp9_build_intra_predictors_sbuv_s "struct macroblockd *x"
+specialize vp9_build_intra_predictors_sbuv_s;
+
+prototype void vp9_build_intra_predictors_mby "struct macroblockd *x"
+specialize vp9_build_intra_predictors_mby;
+
+prototype void vp9_build_comp_intra_predictors_mby "struct macroblockd *x"
+specialize vp9_build_comp_intra_predictors_mby;
+
+prototype void vp9_build_intra_predictors_mby_s "struct macroblockd *x"
+specialize vp9_build_intra_predictors_mby_s;
+
+prototype void vp9_build_intra_predictors_mbuv "struct macroblockd *x"
+specialize vp9_build_intra_predictors_mbuv;
+
+prototype void vp9_build_intra_predictors_mbuv_s "struct macroblockd *x"
+specialize vp9_build_intra_predictors_mbuv_s;
+
+prototype void vp9_build_comp_intra_predictors_mbuv "struct macroblockd *x"
+specialize vp9_build_comp_intra_predictors_mbuv;
+
+prototype void vp9_intra4x4_predict "struct blockd *x, int b_mode, unsigned char *predictor"
+specialize vp9_intra4x4_predict;
+
+prototype void vp9_comp_intra4x4_predict "struct blockd *x, int b_mode, int second_mode, unsigned char *predictor"
+specialize vp9_comp_intra4x4_predict;
+
+prototype void vp9_intra8x8_predict "struct blockd *x, int b_mode, unsigned char *predictor"
+specialize vp9_intra8x8_predict;
+
+prototype void vp9_comp_intra8x8_predict "struct blockd *x, int b_mode, int second_mode, unsigned char *predictor"
+specialize vp9_comp_intra8x8_predict;
+
+prototype void vp9_intra_uv4x4_predict "struct blockd *x, int b_mode, unsigned char *predictor"
+specialize vp9_intra_uv4x4_predict;
+
+prototype void vp9_comp_intra_uv4x4_predict "struct blockd *x, int b_mode, int second_mode, unsigned char *predictor"
+specialize vp9_comp_intra_uv4x4_predict;
+
+#
+# Loopfilter
+#
+prototype void vp9_loop_filter_mbv "unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi"
+specialize vp9_loop_filter_mbv sse2
+
+prototype void vp9_loop_filter_bv "unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi"
+specialize vp9_loop_filter_bv sse2
+
+prototype void vp9_loop_filter_bv8x8 "unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi"
+specialize vp9_loop_filter_bv8x8 sse2
+
+prototype void vp9_loop_filter_mbh "unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi"
+specialize vp9_loop_filter_mbh sse2
+
+prototype void vp9_loop_filter_bh "unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi"
+specialize vp9_loop_filter_bh sse2
+
+prototype void vp9_loop_filter_bh8x8 "unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi"
+specialize vp9_loop_filter_bh8x8 sse2
+
+prototype void vp9_loop_filter_simple_mbv "unsigned char *y, int ystride, const unsigned char *blimit"
+specialize vp9_loop_filter_simple_mbv mmx sse2 media neon
+vp9_loop_filter_simple_mbv_c=vp9_loop_filter_simple_vertical_edge_c
+vp9_loop_filter_simple_mbv_mmx=vp9_loop_filter_simple_vertical_edge_mmx
+vp9_loop_filter_simple_mbv_sse2=vp9_loop_filter_simple_vertical_edge_sse2
+vp9_loop_filter_simple_mbv_media=vp9_loop_filter_simple_vertical_edge_armv6
+vp9_loop_filter_simple_mbv_neon=vp9_loop_filter_mbvs_neon
+
+prototype void vp9_loop_filter_simple_mbh "unsigned char *y, int ystride, const unsigned char *blimit"
+specialize vp9_loop_filter_simple_mbh mmx sse2 media neon
+vp9_loop_filter_simple_mbh_c=vp9_loop_filter_simple_horizontal_edge_c
+vp9_loop_filter_simple_mbh_mmx=vp9_loop_filter_simple_horizontal_edge_mmx
+vp9_loop_filter_simple_mbh_sse2=vp9_loop_filter_simple_horizontal_edge_sse2
+vp9_loop_filter_simple_mbh_media=vp9_loop_filter_simple_horizontal_edge_armv6
+vp9_loop_filter_simple_mbh_neon=vp9_loop_filter_mbhs_neon
+
+prototype void vp9_loop_filter_simple_bv "unsigned char *y, int ystride, const unsigned char *blimit"
+specialize vp9_loop_filter_simple_bv mmx sse2 media neon
+vp9_loop_filter_simple_bv_c=vp9_loop_filter_bvs_c
+vp9_loop_filter_simple_bv_mmx=vp9_loop_filter_bvs_mmx
+vp9_loop_filter_simple_bv_sse2=vp9_loop_filter_bvs_sse2
+vp9_loop_filter_simple_bv_media=vp9_loop_filter_bvs_armv6
+vp9_loop_filter_simple_bv_neon=vp9_loop_filter_bvs_neon
+
+prototype void vp9_loop_filter_simple_bh "unsigned char *y, int ystride, const unsigned char *blimit"
+specialize vp9_loop_filter_simple_bh mmx sse2 media neon
+vp9_loop_filter_simple_bh_c=vp9_loop_filter_bhs_c
+vp9_loop_filter_simple_bh_mmx=vp9_loop_filter_bhs_mmx
+vp9_loop_filter_simple_bh_sse2=vp9_loop_filter_bhs_sse2
+vp9_loop_filter_simple_bh_media=vp9_loop_filter_bhs_armv6
+vp9_loop_filter_simple_bh_neon=vp9_loop_filter_bhs_neon
+
+#
+# sad 16x3, 3x16
+#
+if [ "$CONFIG_NEWBESTREFMV" = "yes" ]; then
+prototype unsigned int vp9_sad16x3 "const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, int max_sad"
+specialize vp9_sad16x3 sse2
+
+prototype unsigned int vp9_sad3x16 "const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, int max_sad"
+specialize vp9_sad3x16 sse2
+fi
+
+#
+# Encoder functions below this point.
+#
+if [ "$CONFIG_VP9_ENCODER" = "yes" ]; then
+
+
+# variance
+[ $arch = "x86_64" ] && mmx_x86_64=mmx && sse2_x86_64=sse2
+
+prototype unsigned int vp9_variance32x32 "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse"
+specialize vp9_variance32x32
+
+prototype unsigned int vp9_variance16x16 "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse"
+specialize vp9_variance16x16 mmx sse2
+vp9_variance16x16_sse2=vp9_variance16x16_wmt
+vp9_variance16x16_mmx=vp9_variance16x16_mmx
+
+prototype unsigned int vp9_variance16x8 "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse"
+specialize vp9_variance16x8 mmx sse2
+vp9_variance16x8_sse2=vp9_variance16x8_wmt
+vp9_variance16x8_mmx=vp9_variance16x8_mmx
+
+prototype unsigned int vp9_variance8x16 "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse"
+specialize vp9_variance8x16 mmx sse2
+vp9_variance8x16_sse2=vp9_variance8x16_wmt
+vp9_variance8x16_mmx=vp9_variance8x16_mmx
+
+prototype unsigned int vp9_variance8x8 "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse"
+specialize vp9_variance8x8 mmx sse2
+vp9_variance8x8_sse2=vp9_variance8x8_wmt
+vp9_variance8x8_mmx=vp9_variance8x8_mmx
+
+prototype unsigned int vp9_variance4x4 "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse"
+specialize vp9_variance4x4 mmx sse2
+vp9_variance4x4_sse2=vp9_variance4x4_wmt
+vp9_variance4x4_mmx=vp9_variance4x4_mmx
+
+prototype unsigned int vp9_sub_pixel_variance32x32 "const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse"
+specialize vp9_sub_pixel_variance32x32
+
+prototype unsigned int vp9_sub_pixel_variance16x16 "const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse"
+specialize vp9_sub_pixel_variance16x16 sse2 mmx ssse3
+vp9_sub_pixel_variance16x16_sse2=vp9_sub_pixel_variance16x16_wmt
+
+prototype unsigned int vp9_sub_pixel_variance8x16 "const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse"
+specialize vp9_sub_pixel_variance8x16 sse2 mmx
+vp9_sub_pixel_variance8x16_sse2=vp9_sub_pixel_variance8x16_wmt
+
+prototype unsigned int vp9_sub_pixel_variance16x8 "const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse"
+specialize vp9_sub_pixel_variance16x8 sse2 mmx ssse3
+vp9_sub_pixel_variance16x8_sse2=vp9_sub_pixel_variance16x8_ssse3;
+vp9_sub_pixel_variance16x8_sse2=vp9_sub_pixel_variance16x8_wmt
+
+prototype unsigned int vp9_sub_pixel_variance8x8 "const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse"
+specialize vp9_sub_pixel_variance8x8 sse2 mmx
+vp9_sub_pixel_variance8x8_sse2=vp9_sub_pixel_variance8x8_wmt
+
+prototype unsigned int vp9_sub_pixel_variance4x4 "const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse"
+specialize vp9_sub_pixel_variance4x4 sse2 mmx
+vp9_sub_pixel_variance4x4_sse2=vp9_sub_pixel_variance4x4_wmt
+
+prototype unsigned int vp9_sad32x32 "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad"
+specialize vp9_sad32x32
+
+prototype unsigned int vp9_sad16x16 "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad"
+specialize vp9_sad16x16 mmx sse2 sse3
+vp9_sad16x16_sse2=vp9_sad16x16_wmt
+
+prototype unsigned int vp9_sad16x8 "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad"
+specialize vp9_sad16x8 mmx sse2
+vp9_sad16x8_sse2=vp9_sad16x8_wmt
+
+prototype unsigned int vp9_sad8x16 "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad"
+specialize vp9_sad8x16 mmx sse2
+vp9_sad8x16_sse2=vp9_sad8x16_wmt
+
+prototype unsigned int vp9_sad8x8 "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad"
+specialize vp9_sad8x8 mmx sse2
+vp9_sad8x8_sse2=vp9_sad8x8_wmt
+
+prototype unsigned int vp9_sad4x4 "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad"
+specialize vp9_sad4x4 mmx sse2
+vp9_sad4x4_sse2=vp9_sad4x4_wmt
+
+prototype unsigned int vp9_variance_halfpixvar16x16_h "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse"
+specialize vp9_variance_halfpixvar16x16_h mmx sse2
+vp9_variance_halfpixvar16x16_h_sse2=vp9_variance_halfpixvar16x16_h_wmt
+
+prototype unsigned int vp9_variance_halfpixvar16x16_v "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse"
+specialize vp9_variance_halfpixvar16x16_v mmx sse2
+vp9_variance_halfpixvar16x16_v_sse2=vp9_variance_halfpixvar16x16_v_wmt
+
+prototype unsigned int vp9_variance_halfpixvar16x16_hv "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse"
+specialize vp9_variance_halfpixvar16x16_hv mmx sse2
+vp9_variance_halfpixvar16x16_hv_sse2=vp9_variance_halfpixvar16x16_hv_wmt
+
+prototype unsigned int vp9_variance_halfpixvar32x32_h "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse"
+specialize vp9_variance_halfpixvar32x32_h
+
+prototype unsigned int vp9_variance_halfpixvar32x32_v "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse"
+specialize vp9_variance_halfpixvar32x32_v
+
+prototype unsigned int vp9_variance_halfpixvar32x32_hv "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse"
+specialize vp9_variance_halfpixvar32x32_hv
+
+prototype void vp9_sad32x32x3 "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sad_array"
+specialize vp9_sad32x32x3
+
+prototype void vp9_sad16x16x3 "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sad_array"
+specialize vp9_sad16x16x3 sse3 ssse3
+
+prototype void vp9_sad16x8x3 "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sad_array"
+specialize vp9_sad16x8x3 sse3 ssse3
+
+prototype void vp9_sad8x16x3 "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sad_array"
+specialize vp9_sad8x16x3 sse3
+
+prototype void vp9_sad8x8x3 "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sad_array"
+specialize vp9_sad8x8x3 sse3
+
+prototype void vp9_sad4x4x3 "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sad_array"
+specialize vp9_sad4x4x3 sse3
+
+prototype void vp9_sad32x32x8 "const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned short *sad_array"
+specialize vp9_sad32x32x8
+
+prototype void vp9_sad16x16x8 "const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned short *sad_array"
+specialize vp9_sad16x16x8 sse4
+
+prototype void vp9_sad16x8x8 "const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned short *sad_array"
+specialize vp9_sad16x8x8 sse4
+
+prototype void vp9_sad8x16x8 "const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned short *sad_array"
+specialize vp9_sad8x16x8 sse4
+
+prototype void vp9_sad8x8x8 "const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned short *sad_array"
+specialize vp9_sad8x8x8 sse4
+
+prototype void vp9_sad4x4x8 "const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned short *sad_array"
+specialize vp9_sad4x4x8 sse4
+
+prototype void vp9_sad32x32x4d "const unsigned char *src_ptr, int src_stride, unsigned char *ref_ptr[], int ref_stride, unsigned int *sad_array"
+specialize vp9_sad32x32x4d
+
+prototype void vp9_sad16x16x4d "const unsigned char *src_ptr, int src_stride, unsigned char *ref_ptr[], int ref_stride, unsigned int *sad_array"
+specialize vp9_sad16x16x4d sse3
+
+prototype void vp9_sad16x8x4d "const unsigned char *src_ptr, int src_stride, unsigned char *ref_ptr[], int ref_stride, unsigned int *sad_array"
+specialize vp9_sad16x8x4d sse3
+
+prototype void vp9_sad8x16x4d "const unsigned char *src_ptr, int src_stride, unsigned char *ref_ptr[], int ref_stride, unsigned int *sad_array"
+specialize vp9_sad8x16x4d sse3
+
+prototype void vp9_sad8x8x4d "const unsigned char *src_ptr, int src_stride, unsigned char *ref_ptr[], int ref_stride, unsigned int *sad_array"
+specialize vp9_sad8x8x4d sse3
+
+prototype void vp9_sad4x4x4d "const unsigned char *src_ptr, int src_stride, unsigned char *ref_ptr[], int ref_stride, unsigned int *sad_array"
+specialize vp9_sad4x4x4d sse3
+
+#
+# Block copy
+#
+case $arch in
+ x86*)
+ prototype void vp9_copy32xn "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, int n"
+ specialize vp9_copy32xn sse2 sse3
+ ;;
+esac
+
+prototype unsigned int vp9_sub_pixel_mse16x16 "const unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, const unsigned char *dst_ptr, int dst_pixels_per_line, unsigned int *sse"
+specialize vp9_sub_pixel_mse16x16 sse2 mmx
+vp9_sub_pixel_mse16x16_sse2=vp9_sub_pixel_mse16x16_wmt
+
+prototype unsigned int vp9_mse16x16 "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int recon_stride, unsigned int *sse"
+specialize vp9_mse16x16 mmx sse2
+vp9_mse16x16_sse2=vp9_mse16x16_wmt
+
+prototype unsigned int vp9_sub_pixel_mse32x32 "const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse"
+specialize vp9_sub_pixel_mse32x32
+
+prototype unsigned int vp9_get_mb_ss "const short *"
+specialize vp9_get_mb_ss mmx sse2
+# ENCODEMB INVOKE
+prototype int vp9_mbblock_error "struct macroblock *mb, int dc"
+specialize vp9_mbblock_error mmx sse2
+vp9_mbblock_error_sse2=vp9_mbblock_error_xmm
+
+prototype int vp9_block_error "short *coeff, short *dqcoeff, int block_size"
+specialize vp9_block_error mmx sse2
+vp9_block_error_sse2=vp9_block_error_xmm
+
+prototype void vp9_subtract_b "struct block *be, struct blockd *bd, int pitch"
+specialize vp9_subtract_b mmx sse2
+
+prototype int vp9_mbuverror "struct macroblock *mb"
+specialize vp9_mbuverror mmx sse2
+vp9_mbuverror_sse2=vp9_mbuverror_xmm
+
+prototype void vp9_subtract_b "struct block *be, struct blockd *bd, int pitch"
+specialize vp9_subtract_b mmx sse2
+
+prototype void vp9_subtract_mby "short *diff, unsigned char *src, unsigned char *pred, int stride"
+specialize vp9_subtract_mby mmx sse2
+
+prototype void vp9_subtract_mbuv "short *diff, unsigned char *usrc, unsigned char *vsrc, unsigned char *pred, int stride"
+specialize vp9_subtract_mbuv mmx sse2
+
+#
+# Structured Similarity (SSIM)
+#
+if [ "$CONFIG_INTERNAL_STATS" = "yes" ]; then
+ [ $arch = "x86_64" ] && sse2_on_x86_64=sse2
+
+ prototype void vp9_ssim_parms_8x8 "unsigned char *s, int sp, unsigned char *r, int rp, unsigned long *sum_s, unsigned long *sum_r, unsigned long *sum_sq_s, unsigned long *sum_sq_r, unsigned long *sum_sxr"
+ specialize vp9_ssim_parms_8x8 $sse2_on_x86_64
+
+ prototype void vp9_ssim_parms_16x16 "unsigned char *s, int sp, unsigned char *r, int rp, unsigned long *sum_s, unsigned long *sum_r, unsigned long *sum_sq_s, unsigned long *sum_sq_r, unsigned long *sum_sxr"
+ specialize vp9_ssim_parms_16x16 $sse2_on_x86_64
+fi
+
+# fdct functions
+prototype void vp9_fht "const short *input, int pitch, short *output, int tx_type, int tx_dim"
+specialize vp9_fht
+
+prototype void vp9_short_fdct8x8 "short *InputData, short *OutputData, int pitch"
+specialize vp9_short_fdct8x8
+
+prototype void vp9_short_fhaar2x2 "short *InputData, short *OutputData, int pitch"
+specialize vp9_short_fhaar2x2
+
+prototype void vp9_short_fdct4x4 "short *InputData, short *OutputData, int pitch"
+specialize vp9_short_fdct4x4
+
+prototype void vp9_short_fdct8x4 "short *InputData, short *OutputData, int pitch"
+specialize vp9_short_fdct8x4
+
+prototype void vp9_short_walsh4x4 "short *InputData, short *OutputData, int pitch"
+specialize vp9_short_walsh4x4
+
+prototype void vp9_short_fdct16x16 "short *InputData, short *OutputData, int pitch"
+specialize vp9_short_fdct16x16
+
+prototype void vp9_short_walsh4x4_lossless "short *InputData, short *OutputData, int pitch"
+specialize vp9_short_walsh4x4_lossless
+
+prototype void vp9_short_walsh4x4_x8 "short *InputData, short *OutputData, int pitch"
+specialize vp9_short_walsh4x4_x8
+
+prototype void vp9_short_walsh8x4_x8 "short *InputData, short *OutputData, int pitch"
+specialize vp9_short_walsh8x4_x8
+
+fi
+# end encoder functions
--- /dev/null
+++ b/vp9/common/sadmxn.h
@@ -1,0 +1,37 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef __INC_SAD_H
+#define __INC_SAD_H
+
+static __inline
+unsigned int sad_mx_n_c(
+ const unsigned char *src_ptr,
+ int src_stride,
+ const unsigned char *ref_ptr,
+ int ref_stride,
+ int m,
+ int n) {
+ int r, c;
+ unsigned int sad = 0;
+
+ for (r = 0; r < n; r++) {
+ for (c = 0; c < m; c++) {
+ sad += abs(src_ptr[c] - ref_ptr[c]);
+ }
+
+ src_ptr += src_stride;
+ ref_ptr += ref_stride;
+ }
+
+ return sad;
+}
+
+#endif
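+
+/* A minimal standalone use, sketched for illustration only.  sad_mx_n_c()
+ * relies on abs(), so the including translation unit is expected to provide
+ * <stdlib.h>.  Assuming this header is included and two 4x4 blocks are stored
+ * with a stride of 4:
+ *
+ *   #include <stdlib.h>
+ *   #include <stdio.h>
+ *
+ *   int main(void) {
+ *     const unsigned char src[16] = {0};
+ *     unsigned char ref[16];
+ *     int i;
+ *     for (i = 0; i < 16; i++) ref[i] = 3;
+ *     printf("SAD = %u\n", sad_mx_n_c(src, 4, ref, 4, 4, 4));  // 16 * |0 - 3| = 48
+ *     return 0;
+ *   }
+ */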
--- /dev/null
+++ b/vp9/common/seg_common.c
@@ -1,0 +1,103 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "vp9/common/seg_common.h"
+
+static const int segfeaturedata_signed[SEG_LVL_MAX] = { 1, 1, 0, 0, 0, 0 };
+static const int seg_feature_data_bits[SEG_LVL_MAX] = { QINDEX_BITS, 6, 4, 4, 6, 2 };
+
+// These functions provide access to new segment-level features.
+// Eventually these functions may be "optimized out", but for the moment
+// the coding mechanism is still subject to change, so they provide a
+// convenient single point of change.
+
+int vp9_segfeature_active(const MACROBLOCKD *xd,
+ int segment_id,
+ SEG_LVL_FEATURES feature_id) {
+ // Return true if mask bit set and segmentation enabled.
+ return (xd->segmentation_enabled &&
+ (xd->segment_feature_mask[segment_id] &
+ (0x01 << feature_id)));
+}
+
+void vp9_clearall_segfeatures(MACROBLOCKD *xd) {
+ vpx_memset(xd->segment_feature_data, 0, sizeof(xd->segment_feature_data));
+ vpx_memset(xd->segment_feature_mask, 0, sizeof(xd->segment_feature_mask));
+}
+
+void vp9_enable_segfeature(MACROBLOCKD *xd,
+ int segment_id,
+ SEG_LVL_FEATURES feature_id) {
+ xd->segment_feature_mask[segment_id] |= (0x01 << feature_id);
+}
+
+void vp9_disable_segfeature(MACROBLOCKD *xd,
+ int segment_id,
+ SEG_LVL_FEATURES feature_id) {
+ xd->segment_feature_mask[segment_id] &= ~(1 << feature_id);
+}
+
+int vp9_seg_feature_data_bits(SEG_LVL_FEATURES feature_id) {
+ return seg_feature_data_bits[feature_id];
+}
+
+int vp9_is_segfeature_signed(SEG_LVL_FEATURES feature_id) {
+ return (segfeaturedata_signed[feature_id]);
+}
+
+void vp9_clear_segdata(MACROBLOCKD *xd,
+ int segment_id,
+ SEG_LVL_FEATURES feature_id) {
+ xd->segment_feature_data[segment_id][feature_id] = 0;
+}
+
+void vp9_set_segdata(MACROBLOCKD *xd,
+ int segment_id,
+ SEG_LVL_FEATURES feature_id,
+ int seg_data) {
+ xd->segment_feature_data[segment_id][feature_id] = seg_data;
+}
+
+int vp9_get_segdata(const MACROBLOCKD *xd,
+ int segment_id,
+ SEG_LVL_FEATURES feature_id) {
+ return xd->segment_feature_data[segment_id][feature_id];
+}
+
+void vp9_clear_segref(MACROBLOCKD *xd, int segment_id) {
+ xd->segment_feature_data[segment_id][SEG_LVL_REF_FRAME] = 0;
+}
+
+void vp9_set_segref(MACROBLOCKD *xd,
+ int segment_id,
+ MV_REFERENCE_FRAME ref_frame) {
+ xd->segment_feature_data[segment_id][SEG_LVL_REF_FRAME] |=
+ (1 << ref_frame);
+}
+
+int vp9_check_segref(const MACROBLOCKD *xd,
+ int segment_id,
+ MV_REFERENCE_FRAME ref_frame) {
+ return (xd->segment_feature_data[segment_id][SEG_LVL_REF_FRAME] &
+ (1 << ref_frame)) ? 1 : 0;
+}
+
+int vp9_check_segref_inter(MACROBLOCKD *xd, int segment_id) {
+ return (xd->segment_feature_data[segment_id][SEG_LVL_REF_FRAME] &
+ ~(1 << INTRA_FRAME)) ? 1 : 0;
+}
+
+int vp9_get_seg_tx_type(MACROBLOCKD *xd, int segment_id) {
+ if (vp9_segfeature_active(xd, segment_id, SEG_LVL_TRANSFORM))
+ return vp9_get_segdata(xd, segment_id, SEG_LVL_TRANSFORM);
+ else
+ return TX_4X4;
+}
+// TBD? Functions to read and write segment data with range / validity checking
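+
+/* A hypothetical configuration sequence, sketched for illustration only,
+ * assuming xd points at an initialized MACROBLOCKD with segmentation
+ * enabled (the segment id of 1 is a placeholder):
+ *
+ *   vp9_clearall_segfeatures(xd);
+ *   vp9_enable_segfeature(xd, 1, SEG_LVL_TRANSFORM);
+ *   vp9_set_segdata(xd, 1, SEG_LVL_TRANSFORM, TX_4X4);
+ *   tx_size = vp9_get_seg_tx_type(xd, 1);   // returns the value set above
+ */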
--- /dev/null
+++ b/vp9/common/seg_common.h
@@ -1,0 +1,64 @@
+/*
+ * Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "type_aliases.h"
+#include "onyxc_int.h"
+#include "vp9/common/blockd.h"
+
+#ifndef __INC_SEG_COMMON_H__
+#define __INC_SEG_COMMON_H__ 1
+
+int vp9_segfeature_active(const MACROBLOCKD *xd,
+ int segment_id,
+ SEG_LVL_FEATURES feature_id);
+
+void vp9_clearall_segfeatures(MACROBLOCKD *xd);
+
+void vp9_enable_segfeature(MACROBLOCKD *xd,
+ int segment_id,
+ SEG_LVL_FEATURES feature_id);
+
+void vp9_disable_segfeature(MACROBLOCKD *xd,
+ int segment_id,
+ SEG_LVL_FEATURES feature_id);
+
+int vp9_seg_feature_data_bits(SEG_LVL_FEATURES feature_id);
+
+int vp9_is_segfeature_signed(SEG_LVL_FEATURES feature_id);
+
+void vp9_clear_segdata(MACROBLOCKD *xd,
+ int segment_id,
+ SEG_LVL_FEATURES feature_id);
+
+void vp9_set_segdata(MACROBLOCKD *xd,
+ int segment_id,
+ SEG_LVL_FEATURES feature_id,
+ int seg_data);
+
+int vp9_get_segdata(const MACROBLOCKD *xd,
+ int segment_id,
+ SEG_LVL_FEATURES feature_id);
+
+void vp9_clear_segref(MACROBLOCKD *xd, int segment_id);
+
+void vp9_set_segref(MACROBLOCKD *xd,
+ int segment_id,
+ MV_REFERENCE_FRAME ref_frame);
+
+int vp9_check_segref(const MACROBLOCKD *xd,
+ int segment_id,
+ MV_REFERENCE_FRAME ref_frame);
+
+int vp9_check_segref_inter(MACROBLOCKD *xd, int segment_id);
+
+int vp9_get_seg_tx_type(MACROBLOCKD *xd, int segment_id);
+
+#endif /* __INC_SEG_COMMON_H__ */
+
--- /dev/null
+++ b/vp9/common/setupintrarecon.c
@@ -1,0 +1,31 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "setupintrarecon.h"
+#include "vpx_mem/vpx_mem.h"
+
+void vp9_setup_intra_recon(YV12_BUFFER_CONFIG *ybf) {
+ int i;
+
+  /* set up the new frame for intra-coded blocks */
+ vpx_memset(ybf->y_buffer - 1 - ybf->y_stride, 127, ybf->y_width + 5);
+ for (i = 0; i < ybf->y_height; i++)
+ ybf->y_buffer[ybf->y_stride * i - 1] = (unsigned char) 129;
+
+ vpx_memset(ybf->u_buffer - 1 - ybf->uv_stride, 127, ybf->uv_width + 5);
+ for (i = 0; i < ybf->uv_height; i++)
+ ybf->u_buffer[ybf->uv_stride * i - 1] = (unsigned char) 129;
+
+ vpx_memset(ybf->v_buffer - 1 - ybf->uv_stride, 127, ybf->uv_width + 5);
+ for (i = 0; i < ybf->uv_height; i++)
+ ybf->v_buffer[ybf->uv_stride * i - 1] = (unsigned char) 129;
+
+}
--- /dev/null
+++ b/vp9/common/setupintrarecon.h
@@ -1,0 +1,13 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "vpx_scale/yv12config.h"
+extern void vp9_setup_intra_recon(YV12_BUFFER_CONFIG *ybf);
--- /dev/null
+++ b/vp9/common/subpixel.h
@@ -1,0 +1,204 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef SUBPIXEL_H
+#define SUBPIXEL_H
+
+#define prototype_subpixel_predict(sym) \
+ void sym(unsigned char *src, int src_pitch, int xofst, int yofst, \
+ unsigned char *dst, int dst_pitch)
+
+#if ARCH_X86 || ARCH_X86_64
+#include "x86/subpixel_x86.h"
+#endif
+
+#if ARCH_ARM
+#include "arm/subpixel_arm.h"
+#endif
+
+#ifndef vp9_subpix_sixtap16x16
+#define vp9_subpix_sixtap16x16 vp9_sixtap_predict16x16_c
+#endif
+extern prototype_subpixel_predict(vp9_subpix_sixtap16x16);
+
+#ifndef vp9_subpix_sixtap8x8
+#define vp9_subpix_sixtap8x8 vp9_sixtap_predict8x8_c
+#endif
+extern prototype_subpixel_predict(vp9_subpix_sixtap8x8);
+
+#ifndef vp9_subpix_sixtap_avg16x16
+#define vp9_subpix_sixtap_avg16x16 vp9_sixtap_predict_avg16x16_c
+#endif
+extern prototype_subpixel_predict(vp9_subpix_sixtap_avg16x16);
+
+#ifndef vp9_subpix_sixtap_avg8x8
+#define vp9_subpix_sixtap_avg8x8 vp9_sixtap_predict_avg8x8_c
+#endif
+extern prototype_subpixel_predict(vp9_subpix_sixtap_avg8x8);
+#ifndef vp9_subpix_sixtap8x4
+#define vp9_subpix_sixtap8x4 vp9_sixtap_predict8x4_c
+#endif
+extern prototype_subpixel_predict(vp9_subpix_sixtap8x4);
+
+#ifndef vp9_subpix_sixtap4x4
+#define vp9_subpix_sixtap4x4 vp9_sixtap_predict_c
+#endif
+extern prototype_subpixel_predict(vp9_subpix_sixtap4x4);
+
+#ifndef vp9_subpix_sixtap_avg4x4
+#define vp9_subpix_sixtap_avg4x4 vp9_sixtap_predict_avg_c
+#endif
+extern prototype_subpixel_predict(vp9_subpix_sixtap_avg4x4);
+
+#ifndef vp9_subpix_eighttap16x16
+#define vp9_subpix_eighttap16x16 vp9_eighttap_predict16x16_c
+#endif
+extern prototype_subpixel_predict(vp9_subpix_eighttap16x16);
+
+#ifndef vp9_subpix_eighttap8x8
+#define vp9_subpix_eighttap8x8 vp9_eighttap_predict8x8_c
+#endif
+extern prototype_subpixel_predict(vp9_subpix_eighttap8x8);
+
+#ifndef vp9_subpix_eighttap_avg16x16
+#define vp9_subpix_eighttap_avg16x16 vp9_eighttap_predict_avg16x16_c
+#endif
+extern prototype_subpixel_predict(vp9_subpix_eighttap_avg16x16);
+
+#ifndef vp9_subpix_eighttap_avg8x8
+#define vp9_subpix_eighttap_avg8x8 vp9_eighttap_predict_avg8x8_c
+#endif
+extern prototype_subpixel_predict(vp9_subpix_eighttap_avg8x8);
+
+#ifndef vp9_subpix_eighttap8x4
+#define vp9_subpix_eighttap8x4 vp9_eighttap_predict8x4_c
+#endif
+extern prototype_subpixel_predict(vp9_subpix_eighttap8x4);
+
+#ifndef vp9_subpix_eighttap4x4
+#define vp9_subpix_eighttap4x4 vp9_eighttap_predict_c
+#endif
+extern prototype_subpixel_predict(vp9_subpix_eighttap4x4);
+
+#ifndef vp9_subpix_eighttap_avg4x4
+#define vp9_subpix_eighttap_avg4x4 vp9_eighttap_predict_avg4x4_c
+#endif
+extern prototype_subpixel_predict(vp9_subpix_eighttap_avg4x4);
+
+#ifndef vp9_subpix_eighttap16x16_sharp
+#define vp9_subpix_eighttap16x16_sharp vp9_eighttap_predict16x16_sharp_c
+#endif
+extern prototype_subpixel_predict(vp9_subpix_eighttap16x16_sharp);
+
+#ifndef vp9_subpix_eighttap8x8_sharp
+#define vp9_subpix_eighttap8x8_sharp vp9_eighttap_predict8x8_sharp_c
+#endif
+extern prototype_subpixel_predict(vp9_subpix_eighttap8x8_sharp);
+
+#ifndef vp9_subpix_eighttap_avg16x16_sharp
+#define vp9_subpix_eighttap_avg16x16_sharp vp9_eighttap_predict_avg16x16_sharp_c
+#endif
+extern prototype_subpixel_predict(vp9_subpix_eighttap_avg16x16_sharp);
+
+#ifndef vp9_subpix_eighttap_avg8x8_sharp
+#define vp9_subpix_eighttap_avg8x8_sharp vp9_eighttap_predict_avg8x8_sharp_c
+#endif
+extern prototype_subpixel_predict(vp9_subpix_eighttap_avg8x8_sharp);
+
+#ifndef vp9_subpix_eighttap8x4_sharp
+#define vp9_subpix_eighttap8x4_sharp vp9_eighttap_predict8x4_sharp_c
+#endif
+extern prototype_subpixel_predict(vp9_subpix_eighttap8x4_sharp);
+
+#ifndef vp9_subpix_eighttap4x4_sharp
+#define vp9_subpix_eighttap4x4_sharp vp9_eighttap_predict_sharp_c
+#endif
+extern prototype_subpixel_predict(vp9_subpix_eighttap4x4_sharp);
+
+#ifndef vp9_subpix_eighttap_avg4x4_sharp
+#define vp9_subpix_eighttap_avg4x4_sharp vp9_eighttap_predict_avg4x4_sharp_c
+#endif
+extern prototype_subpixel_predict(vp9_subpix_eighttap_avg4x4_sharp);
+
+#ifndef vp9_subpix_bilinear16x16
+#define vp9_subpix_bilinear16x16 vp9_bilinear_predict16x16_c
+#endif
+extern prototype_subpixel_predict(vp9_subpix_bilinear16x16);
+
+#ifndef vp9_subpix_bilinear8x8
+#define vp9_subpix_bilinear8x8 vp9_bilinear_predict8x8_c
+#endif
+extern prototype_subpixel_predict(vp9_subpix_bilinear8x8);
+
+#ifndef vp9_subpix_bilinear_avg16x16
+#define vp9_subpix_bilinear_avg16x16 vp9_bilinear_predict_avg16x16_c
+#endif
+extern prototype_subpixel_predict(vp9_subpix_bilinear_avg16x16);
+
+#ifndef vp9_subpix_bilinear_avg8x8
+#define vp9_subpix_bilinear_avg8x8 vp9_bilinear_predict_avg8x8_c
+#endif
+extern prototype_subpixel_predict(vp9_subpix_bilinear_avg8x8);
+
+#ifndef vp9_subpix_bilinear8x4
+#define vp9_subpix_bilinear8x4 vp9_bilinear_predict8x4_c
+#endif
+extern prototype_subpixel_predict(vp9_subpix_bilinear8x4);
+
+#ifndef vp9_subpix_bilinear4x4
+#define vp9_subpix_bilinear4x4 vp9_bilinear_predict4x4_c
+#endif
+extern prototype_subpixel_predict(vp9_subpix_bilinear4x4);
+
+#ifndef vp9_subpix_bilinear_avg4x4
+#define vp9_subpix_bilinear_avg4x4 vp9_bilinear_predict_avg4x4_c
+#endif
+extern prototype_subpixel_predict(vp9_subpix_bilinear_avg4x4);
+
+typedef prototype_subpixel_predict((*vp9_subpix_fn_t));
+typedef struct {
+ vp9_subpix_fn_t eighttap16x16;
+ vp9_subpix_fn_t eighttap8x8;
+ vp9_subpix_fn_t eighttap_avg16x16;
+ vp9_subpix_fn_t eighttap_avg8x8;
+ vp9_subpix_fn_t eighttap_avg4x4;
+ vp9_subpix_fn_t eighttap8x4;
+ vp9_subpix_fn_t eighttap4x4;
+ vp9_subpix_fn_t eighttap16x16_sharp;
+ vp9_subpix_fn_t eighttap8x8_sharp;
+ vp9_subpix_fn_t eighttap_avg16x16_sharp;
+ vp9_subpix_fn_t eighttap_avg8x8_sharp;
+ vp9_subpix_fn_t eighttap_avg4x4_sharp;
+ vp9_subpix_fn_t eighttap8x4_sharp;
+ vp9_subpix_fn_t eighttap4x4_sharp;
+ vp9_subpix_fn_t sixtap16x16;
+ vp9_subpix_fn_t sixtap8x8;
+ vp9_subpix_fn_t sixtap_avg16x16;
+ vp9_subpix_fn_t sixtap_avg8x8;
+ vp9_subpix_fn_t sixtap8x4;
+ vp9_subpix_fn_t sixtap4x4;
+ vp9_subpix_fn_t sixtap_avg4x4;
+ vp9_subpix_fn_t bilinear16x16;
+ vp9_subpix_fn_t bilinear8x8;
+ vp9_subpix_fn_t bilinear_avg16x16;
+ vp9_subpix_fn_t bilinear_avg8x8;
+ vp9_subpix_fn_t bilinear8x4;
+ vp9_subpix_fn_t bilinear4x4;
+ vp9_subpix_fn_t bilinear_avg4x4;
+} vp9_subpix_rtcd_vtable_t;
+
+#if CONFIG_RUNTIME_CPU_DETECT
+#define SUBPIX_INVOKE(ctx,fn) (ctx)->fn
+#else
+#define SUBPIX_INVOKE(ctx,fn) vp9_subpix_##fn
+#endif
+
+#endif
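+
+/* A hypothetical call through the vtable, sketched for illustration only and
+ * assuming rtcd points at a populated vp9_subpix_rtcd_vtable_t:
+ *
+ *   SUBPIX_INVOKE(rtcd, eighttap8x8)(src, src_pitch, xofst, yofst,
+ *                                    dst, dst_pitch);
+ */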
--- /dev/null
+++ b/vp9/common/swapyv12buffer.c
@@ -1,0 +1,32 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "swapyv12buffer.h"
+
+void vp9_swap_yv12_buffer(YV12_BUFFER_CONFIG *new_frame,
+ YV12_BUFFER_CONFIG *last_frame) {
+ unsigned char *temp;
+
+ temp = last_frame->buffer_alloc;
+ last_frame->buffer_alloc = new_frame->buffer_alloc;
+ new_frame->buffer_alloc = temp;
+
+ temp = last_frame->y_buffer;
+ last_frame->y_buffer = new_frame->y_buffer;
+ new_frame->y_buffer = temp;
+
+ temp = last_frame->u_buffer;
+ last_frame->u_buffer = new_frame->u_buffer;
+ new_frame->u_buffer = temp;
+
+ temp = last_frame->v_buffer;
+ last_frame->v_buffer = new_frame->v_buffer;
+ new_frame->v_buffer = temp;
+}
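+
+/* The swap exchanges buffer pointers only; no pixel data is copied, so the
+ * cost is constant regardless of frame size.  A hypothetical caller flipping
+ * a freshly reconstructed frame into the previous-frame slot (the frame
+ * variable names are illustrative):
+ *
+ *   vp9_swap_yv12_buffer(&new_frame, &last_frame);
+ */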
--- /dev/null
+++ b/vp9/common/swapyv12buffer.h
@@ -1,0 +1,19 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef __SWAPYV12_BUFFER_H
+#define __SWAPYV12_BUFFER_H
+
+#include "vpx_scale/yv12config.h"
+
+void vp9_swap_yv12_buffer(YV12_BUFFER_CONFIG *new_frame,
+ YV12_BUFFER_CONFIG *last_frame);
+
+#endif // __SWAPYV12_BUFFER_H
--- /dev/null
+++ b/vp9/common/systemdependent.h
@@ -1,0 +1,21 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "vpx_ports/config.h"
+#if ARCH_X86 || ARCH_X86_64
+void vpx_reset_mmx_state(void);
+#define vp9_clear_system_state() vpx_reset_mmx_state()
+#else
+#define vp9_clear_system_state()
+#endif
+
+struct VP9Common;
+void vp9_machine_specific_config(struct VP9Common *);
--- /dev/null
+++ b/vp9/common/tapify.py
@@ -1,0 +1,106 @@
+"""
+ * Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+"""
+#!/usr/bin/env python
+import sys,string,os,re,math,numpy
+scale = 2**16
+def dist(p1,p2):
+ x1,y1 = p1
+ x2,y2 = p2
+ if x1==x2 and y1==y2 :
+ return 1.0
+ return 1/ math.sqrt((x1-x2)*(x1-x2)+(y1-y2)*(y1-y2))
+
+def gettaps(p):
+ def l(b):
+ return int(math.floor(b))
+ def h(b):
+ return int(math.ceil(b))
+ def t(b,p,s):
+ return int((scale*dist(b,p)+s/2)/s)
+ r,c = p
+ ul=[l(r),l(c)]
+ ur=[l(r),h(c)]
+ ll=[h(r),l(c)]
+ lr=[h(r),h(c)]
+ sum = dist(ul,p)+dist(ur,p)+dist(ll,p)+dist(lr,p)
+ t4 = scale - t(ul,p,sum) - t(ur,p,sum) - t(ll,p,sum);
+ return [[ul,t(ul,p,sum)],[ur,t(ur,p,sum)],
+ [ll,t(ll,p,sum)],[lr,t4]]
+
+def print_mb_taps(angle,blocksize):
+ theta = angle / 57.2957795;
+ affine = [[math.cos(theta),-math.sin(theta)],
+ [math.sin(theta),math.cos(theta)]]
+ radius = (float(blocksize)-1)/2
+ print " // angle of",angle,"degrees"
+ for y in range(blocksize) :
+ for x in range(blocksize) :
+ r,c = numpy.dot(affine,[y-radius, x-radius])
+ tps = gettaps([r+radius,c+radius])
+ for t in tps :
+ p,t = t
+ tr,tc = p
+ print " %2d, %2d, %5d, " % (tr,tc,t,),
+ print " // %2d,%2d " % (y,x)
+
+i=float(sys.argv[1])
+while i <= float(sys.argv[2]) :
+ print_mb_taps(i,float(sys.argv[4]))
+ i=i+float(sys.argv[3])
+"""
+
+taps = []
+pt=dict()
+ptr=dict()
+for y in range(16) :
+ for x in range(16) :
+ r,c = numpy.dot(affine,[y-7.5, x-7.5])
+ tps = gettaps([r+7.5,c+7.5])
+ j=0
+ for tp in tps :
+ p,i = tp
+ r,c = p
+ pt[y,x,j]= [p,i]
+ try:
+ ptr[r,j,c].append([y,x])
+ except:
+ ptr[r,j,c]=[[y,x]]
+ j = j+1
+
+for key in sorted(pt.keys()) :
+ print key,pt[key]
+
+lr = -99
+lj = -99
+lc = 0
+
+shuf=""
+mask=""
+for r,j,c in sorted(ptr.keys()) :
+ for y,x in ptr[r,j,c] :
+ if lr != r or lj != j :
+ print "shuf_"+str(lr)+"_"+str(lj)+"_"+shuf.ljust(16,"0"), lc
+ shuf=""
+ lc = 0
+ for i in range(lc,c-1) :
+ shuf = shuf +"0"
+ shuf = shuf + hex(x)[2]
+ lc =c
+ break
+ lr = r
+ lj = j
+# print r,j,c,ptr[r,j,c]
+
+for r,j,c in sorted(ptr.keys()) :
+ for y,x in ptr[r,j,c] :
+ print r,j,c,y,x
+ break
+"""
--- /dev/null
+++ b/vp9/common/textblit.c
@@ -1,0 +1,116 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <stdlib.h>
+
+
+void vp9_blit_text(const char *msg, unsigned char *address, const int pitch) {
+ int letter_bitmap;
+ unsigned char *output_pos = address;
+ int colpos;
+ const int font[] = {
+ 0x0, 0x5C00, 0x8020, 0xAFABEA, 0xD7EC0, 0x1111111, 0x1855740, 0x18000,
+ 0x45C0, 0x74400, 0x51140, 0x23880, 0xC4000, 0x21080, 0x80000, 0x111110,
+ 0xE9D72E, 0x87E40, 0x12AD732, 0xAAD62A, 0x4F94C4, 0x4D6B7, 0x456AA,
+ 0x3E8423, 0xAAD6AA, 0xAAD6A2, 0x2800, 0x2A00, 0x8A880, 0x52940, 0x22A20,
+ 0x15422, 0x6AD62E, 0x1E4A53E, 0xAAD6BF, 0x8C62E, 0xE8C63F, 0x118D6BF,
+ 0x1094BF, 0xCAC62E, 0x1F2109F, 0x118FE31, 0xF8C628, 0x8A89F, 0x108421F,
+ 0x1F1105F, 0x1F4105F, 0xE8C62E, 0x2294BF, 0x164C62E, 0x12694BF, 0x8AD6A2,
+ 0x10FC21, 0x1F8421F, 0x744107, 0xF8220F, 0x1151151, 0x117041, 0x119D731,
+ 0x47E0, 0x1041041, 0xFC400, 0x10440, 0x1084210, 0x820
+ };
+ colpos = 0;
+
+ while (msg[colpos] != 0) {
+ char letter = msg[colpos];
+ int fontcol, fontrow;
+
+ if (letter <= 'Z' && letter >= ' ')
+ letter_bitmap = font[letter - ' '];
+ else if (letter <= 'z' && letter >= 'a')
+ letter_bitmap = font[letter - 'a' + 'A' - ' '];
+ else
+ letter_bitmap = font[0];
+
+ for (fontcol = 6; fontcol >= 0; fontcol--)
+ for (fontrow = 0; fontrow < 5; fontrow++)
+ output_pos[fontrow * pitch + fontcol] =
+ ((letter_bitmap >> (fontcol * 5)) & (1 << fontrow) ? 255 : 0);
+
+ output_pos += 7;
+ colpos++;
+ }
+}
+
+static void plot(const int x, const int y, unsigned char *image, const int pitch) {
+ image [x + y * pitch] ^= 255;
+}
+
+/* Bresenham line algorithm */
+void vp9_blit_line(int x0, int x1, int y0, int y1, unsigned char *image, const int pitch) {
+ int steep = abs(y1 - y0) > abs(x1 - x0);
+ int deltax, deltay;
+ int error, ystep, y, x;
+
+ if (steep) {
+ int t;
+ t = x0;
+ x0 = y0;
+ y0 = t;
+
+ t = x1;
+ x1 = y1;
+ y1 = t;
+ }
+
+ if (x0 > x1) {
+ int t;
+ t = x0;
+ x0 = x1;
+ x1 = t;
+
+ t = y0;
+ y0 = y1;
+ y1 = t;
+ }
+
+ deltax = x1 - x0;
+ deltay = abs(y1 - y0);
+ error = deltax / 2;
+
+ y = y0;
+
+ if (y0 < y1)
+ ystep = 1;
+ else
+ ystep = -1;
+
+ if (steep) {
+ for (x = x0; x <= x1; x++) {
+ plot(y, x, image, pitch);
+
+ error = error - deltay;
+ if (error < 0) {
+ y = y + ystep;
+ error = error + deltax;
+ }
+ }
+ } else {
+ for (x = x0; x <= x1; x++) {
+ plot(x, y, image, pitch);
+
+ error = error - deltay;
+ if (error < 0) {
+ y = y + ystep;
+ error = error + deltax;
+ }
+ }
+ }
+}
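+
+/* Illustrative call (hypothetical buffer and stride names): to draw a line
+ * from (8, 8) to (24, 16) on a luma plane one could use
+ *
+ *   vp9_blit_line(8, 24, 8, 16, frame->y_buffer, frame->y_stride);
+ *
+ * Note that both x coordinates precede both y coordinates, and that plot()
+ * XORs pixels, so drawing the same line twice erases it. */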
--- /dev/null
+++ b/vp9/common/treecoder.c
@@ -1,0 +1,138 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "vpx_config.h"
+
+#if defined(CONFIG_DEBUG) && CONFIG_DEBUG
+#include <assert.h>
+#endif
+#include <stdio.h>
+
+#include "treecoder.h"
+
+static void tree2tok(
+ struct vp9_token_struct *const p,
+ vp9_tree t,
+ int i,
+ int v,
+ int L
+) {
+ v += v;
+ ++L;
+
+ do {
+ const vp9_tree_index j = t[i++];
+
+ if (j <= 0) {
+ p[-j].value = v;
+ p[-j].Len = L;
+ } else
+ tree2tok(p, t, j, v, L);
+ } while (++v & 1);
+}
+
+void vp9_tokens_from_tree(struct vp9_token_struct *p, vp9_tree t) {
+ tree2tok(p, t, 0, 0, 0);
+}
+
+void vp9_tokens_from_tree_offset(struct vp9_token_struct *p, vp9_tree t,
+ int offset) {
+ tree2tok(p - offset, t, 0, 0, 0);
+}
+
+static void branch_counts(
+ int n, /* n = size of alphabet */
+ vp9_token tok [ /* n */ ],
+ vp9_tree tree,
+ unsigned int branch_ct [ /* n-1 */ ] [2],
+ const unsigned int num_events[ /* n */ ]
+) {
+ const int tree_len = n - 1;
+ int t = 0;
+
+#if CONFIG_DEBUG
+ assert(tree_len);
+#endif
+
+ do {
+ branch_ct[t][0] = branch_ct[t][1] = 0;
+ } while (++t < tree_len);
+
+ t = 0;
+
+ do {
+ int L = tok[t].Len;
+ const int enc = tok[t].value;
+ const unsigned int ct = num_events[t];
+
+ vp9_tree_index i = 0;
+
+ do {
+ const int b = (enc >> --L) & 1;
+ const int j = i >> 1;
+#if CONFIG_DEBUG
+ assert(j < tree_len && 0 <= L);
+#endif
+
+ branch_ct [j] [b] += ct;
+ i = tree[ i + b];
+ } while (i > 0);
+
+#if CONFIG_DEBUG
+ assert(!L);
+#endif
+ } while (++t < n);
+
+}
+
+
+void vp9_tree_probs_from_distribution(
+ int n, /* n = size of alphabet */
+ vp9_token tok [ /* n */ ],
+ vp9_tree tree,
+ vp9_prob probs [ /* n-1 */ ],
+ unsigned int branch_ct [ /* n-1 */ ] [2],
+ const unsigned int num_events[ /* n */ ],
+ unsigned int Pfac,
+ int rd
+) {
+ const int tree_len = n - 1;
+ int t = 0;
+
+ branch_counts(n, tok, tree, branch_ct, num_events);
+
+ do {
+ const unsigned int *const c = branch_ct[t];
+ const unsigned int tot = c[0] + c[1];
+
+#if CONFIG_DEBUG
+ assert(tot < (1 << 24)); /* no overflow below */
+#endif
+
+ if (tot) {
+ const unsigned int p = ((c[0] * Pfac) + (rd ? tot >> 1 : 0)) / tot;
+ probs[t] = p < 256 ? (p ? p : 1) : 255; /* agree w/old version for now */
+ } else
+ probs[t] = vp9_prob_half;
+ } while (++t < tree_len);
+}
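+/* Illustrative numbers only: with branch counts c = {30, 10}, Pfac = 256 and
+   rd = 1, tot = 40 and p = (30 * 256 + 20) / 40 = 192, so probs[t] becomes
+   192, i.e. the probability of the 0-branch scaled into the range [1, 255]. */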
+
+vp9_prob vp9_bin_prob_from_distribution(const unsigned int counts[2]) {
+ int tot_count = counts[0] + counts[1];
+ vp9_prob prob;
+ if (tot_count) {
+ prob = (counts[0] * 255 + (tot_count >> 1)) / tot_count;
+ prob += !prob;
+ } else {
+ prob = 128;
+ }
+ return prob;
+}
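+
+/* Illustrative numbers only: counts = {6, 2} gives (6 * 255 + 4) / 8 = 191;
+   an all-zero count falls back to the flat probability 128, and a zero
+   quotient is bumped to 1 so the returned probability is never 0. */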
--- /dev/null
+++ b/vp9/common/treecoder.h
@@ -1,0 +1,75 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef __INC_TREECODER_H
+#define __INC_TREECODER_H
+
+typedef unsigned char vp9_prob;
+
+#define vp9_prob_half ( (vp9_prob) 128)
+
+typedef signed char vp9_tree_index;
+struct bool_coder_spec;
+
+typedef struct bool_coder_spec bool_coder_spec;
+typedef struct bool_writer bool_writer;
+typedef struct bool_reader bool_reader;
+
+typedef const bool_coder_spec c_bool_coder_spec;
+typedef const bool_writer c_bool_writer;
+typedef const bool_reader c_bool_reader;
+
+
+
+# define vp9_complement( x) (255 - x)
+
+
+/* We build coding trees compactly in arrays.
+ Each node of the tree is a pair of vp9_tree_indices.
+ Array index often references a corresponding probability table.
+ Index <= 0 means done encoding/decoding and value = -Index,
+ Index > 0 means need another bit, specification at index.
+ Nonnegative indices are always even; processing begins at node 0. */
+
+typedef const vp9_tree_index vp9_tree[], *vp9_tree_p;
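+
+/* Illustrative only (A, B and C are hypothetical symbol values): a 3-symbol
+   alphabet coding A as "0", B as "10" and C as "11" could be laid out as
+
+     const vp9_tree_index example_tree[4] = { -A, 2, -B, -C };
+
+   a 0 bit at node 0 terminates with value A, a 1 bit moves to the pair at
+   (even) index 2, where the next bit selects B or C. */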
+
+
+typedef const struct vp9_token_struct {
+ int value;
+ int Len;
+} vp9_token;
+
+/* Construct encoding array from tree. */
+
+void vp9_tokens_from_tree(struct vp9_token_struct *, vp9_tree);
+void vp9_tokens_from_tree_offset(struct vp9_token_struct *, vp9_tree,
+ int offset);
+
+
+/* Convert array of token occurrence counts into a table of probabilities
+ for the associated binary encoding tree. Also writes count of branches
+   taken for each node on the tree; this facilitates decisions as to
+ probability updates. */
+
+void vp9_tree_probs_from_distribution(
+ int n, /* n = size of alphabet */
+ vp9_token tok [ /* n */ ],
+ vp9_tree tree,
+ vp9_prob probs [ /* n-1 */ ],
+ unsigned int branch_ct [ /* n-1 */ ] [2],
+ const unsigned int num_events[ /* n */ ],
+ unsigned int Pfactor,
+ int Round
+);
+
+vp9_prob vp9_bin_prob_from_distribution(const unsigned int counts[2]);
+
+#endif
--- /dev/null
+++ b/vp9/common/type_aliases.h
@@ -1,0 +1,120 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+/****************************************************************************
+*
+* Module Title : type_aliases.h
+*
+* Description : Standard type aliases
+*
+****************************************************************************/
+#ifndef __INC_TYPE_ALIASES_H
+#define __INC_TYPE_ALIASES_H
+
+/****************************************************************************
+* Macros
+****************************************************************************/
+#define EXPORT
+#define IMPORT extern /* Used to declare imported data & routines */
+#define PRIVATE static /* Used to declare & define module-local data */
+#define LOCAL static /* Used to define all persistent routine-local data */
+#define STD_IN_PATH 0 /* Standard input path */
+#define STD_OUT_PATH 1 /* Standard output path */
+#define STD_ERR_PATH 2 /* Standard error path */
+#define STD_IN_FILE stdin /* Standard input file pointer */
+#define STD_OUT_FILE stdout /* Standard output file pointer */
+#define STD_ERR_FILE stderr /* Standard error file pointer */
+#define max_int 0x7FFFFFFF
+
+#define __export
+#define _export
+
+#define CCONV
+
+#ifndef NULL
+#ifdef __cplusplus
+#define NULL 0
+#else
+#define NULL ((void *)0)
+#endif
+#endif
+
+#ifndef FALSE
+#define FALSE 0
+#endif
+
+#ifndef TRUE
+#define TRUE 1
+#endif
+
+/****************************************************************************
+* Typedefs
+****************************************************************************/
+#ifndef TYPE_INT8
+#define TYPE_INT8
+typedef signed char INT8;
+#endif
+
+#ifndef TYPE_INT16
+/*#define TYPE_INT16*/
+typedef signed short INT16;
+#endif
+
+#ifndef TYPE_INT32
+/*#define TYPE_INT32*/
+typedef signed int INT32;
+#endif
+
+#ifndef TYPE_UINT8
+/*#define TYPE_UINT8*/
+typedef unsigned char UINT8;
+#endif
+
+#ifndef TYPE_UINT32
+/*#define TYPE_UINT32*/
+typedef unsigned int UINT32;
+#endif
+
+#ifndef TYPE_UINT16
+/*#define TYPE_UINT16*/
+typedef unsigned short UINT16;
+#endif
+
+#ifndef TYPE_BOOL
+/*#define TYPE_BOOL*/
+typedef int BOOL;
+#endif
+
+typedef unsigned char BOOLEAN;
+
+#ifdef _MSC_VER
+typedef __int64 INT64;
+#ifndef INT64_MAX
+#define INT64_MAX LLONG_MAX
+#endif
+#else
+
+#ifndef TYPE_INT64
+#ifdef _TMS320C6X
+/* for now we only have 40bits */
+typedef long INT64;
+#else
+typedef long long INT64;
+#endif
+#endif
+
+#endif
+
+/* Floating point */
+typedef double FLOAT64;
+typedef float FLOAT32;
+
+#endif
--- /dev/null
+++ b/vp9/common/x86/filter_sse2.c
@@ -1,0 +1,289 @@
+/*
+ * Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h> // for alignment checks
+#include <emmintrin.h> // SSE2
+#include "vp9/common/filter.h"
+#include "vpx_ports/mem.h" // for DECLARE_ALIGNED
+#include "vpx_rtcd.h"
+
+// TODO(cd): After cleanup, commit faster versions for non 4x4 size. This is
+//           just a quick partial snapshot so that others can already use
+//           some of the speedup.
+// TODO(cd): Use vectorized 8 tap filtering code as speedup to pure C 6 tap
+// filtering.
+// TODO(cd): Add some comments, better variable naming.
+// TODO(cd): Maybe use _mm_maddubs_epi16 if smaller filter coefficients (no
+//           sum of positives above 128), or have higher precision filter
+// coefficients.
+
+DECLARE_ALIGNED(16, static const unsigned int, rounding_c[4]) = {
+ VP9_FILTER_WEIGHT >> 1,
+ VP9_FILTER_WEIGHT >> 1,
+ VP9_FILTER_WEIGHT >> 1,
+ VP9_FILTER_WEIGHT >> 1,
+};
+
+// Creating a macro to do more than four pixels at once to hide instruction
+// latency is actually slower :-(
+#define DO_FOUR_PIXELS(result, src_ptr, offset) \
+ { \
+    /* Do shifted loads to achieve the required shuffles through unpacking */ \
+ const __m128i src0 = _mm_loadu_si128((const __m128i *)(src_ptr + offset + 0)); \
+ const __m128i src1 = _mm_loadu_si128((const __m128i *)(src_ptr + offset + 1)); \
+ const __m128i src2 = _mm_loadu_si128((const __m128i *)(src_ptr + offset + 2)); \
+ const __m128i src3 = _mm_loadu_si128((const __m128i *)(src_ptr + offset + 3)); \
+ const __m128i src01 = _mm_unpacklo_epi8(src0, src1); \
+ const __m128i src01_16 = _mm_unpacklo_epi8(src01, zero); \
+ const __m128i src23 = _mm_unpacklo_epi8(src2, src3); \
+ const __m128i src23_16 = _mm_unpacklo_epi8(src23, zero); \
+    /* Shift by 4 bytes through shuffle to get additional shifted loads */ \
+ const __m128i src4 = _mm_shuffle_epi32(src0, _MM_SHUFFLE(3, 3, 2, 1)); \
+ const __m128i src5 = _mm_shuffle_epi32(src1, _MM_SHUFFLE(3, 3, 2, 1)); \
+ const __m128i src6 = _mm_shuffle_epi32(src2, _MM_SHUFFLE(3, 3, 2, 1)); \
+ const __m128i src7 = _mm_shuffle_epi32(src3, _MM_SHUFFLE(3, 3, 2, 1)); \
+ const __m128i src45 = _mm_unpacklo_epi8(src4, src5); \
+ const __m128i src45_16 = _mm_unpacklo_epi8(src45, zero); \
+ const __m128i src67 = _mm_unpacklo_epi8(src6, src7); \
+ const __m128i src67_16 = _mm_unpacklo_epi8(src67, zero); \
+ /* multiply accumulate them */ \
+ const __m128i mad01 = _mm_madd_epi16(src01_16, fil01); \
+ const __m128i mad23 = _mm_madd_epi16(src23_16, fil23); \
+ const __m128i mad45 = _mm_madd_epi16(src45_16, fil45); \
+ const __m128i mad67 = _mm_madd_epi16(src67_16, fil67); \
+ const __m128i mad0123 = _mm_add_epi32(mad01, mad23); \
+ const __m128i mad4567 = _mm_add_epi32(mad45, mad67); \
+ __m128i mad_all = _mm_add_epi32(mad0123, mad4567); \
+ mad_all = _mm_add_epi32(mad_all, rounding); \
+ result = _mm_srai_epi32(mad_all, VP9_FILTER_SHIFT); \
+ }
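+// Each DO_FOUR_PIXELS invocation computes four adjacent 8-tap filter outputs:
+// for every output position the eight neighbouring source pixels are
+// multiplied by the eight filter taps (paired into fil01..fil67), summed,
+// rounded by VP9_FILTER_WEIGHT / 2 and shifted right by VP9_FILTER_SHIFT,
+// leaving one 32-bit result per lane of `result`.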
+
+void vp9_filter_block2d_4x4_8_sse2
+(
+ const unsigned char *src_ptr, const unsigned int src_stride,
+ const short *HFilter_aligned16, const short *VFilter_aligned16,
+ unsigned char *dst_ptr, unsigned int dst_stride
+) {
+ __m128i intermediateA, intermediateB, intermediateC;
+
+ const int kInterp_Extend = 4;
+
+ const __m128i zero = _mm_set1_epi16(0);
+ const __m128i rounding = _mm_load_si128((const __m128i *)rounding_c);
+
+ // check alignment
+ assert(0 == ((long)HFilter_aligned16)%16);
+ assert(0 == ((long)VFilter_aligned16)%16);
+
+ {
+ __m128i transpose3_0;
+ __m128i transpose3_1;
+ __m128i transpose3_2;
+ __m128i transpose3_3;
+
+ // Horizontal pass (src -> intermediate).
+ {
+ const __m128i HFilter = _mm_load_si128((const __m128i *)HFilter_aligned16);
+ // get first two columns filter coefficients
+ __m128i fil01 = _mm_shuffle_epi32(HFilter, _MM_SHUFFLE(0, 0, 0, 0));
+ __m128i fil23 = _mm_shuffle_epi32(HFilter, _MM_SHUFFLE(1, 1, 1, 1));
+ __m128i fil45 = _mm_shuffle_epi32(HFilter, _MM_SHUFFLE(2, 2, 2, 2));
+ __m128i fil67 = _mm_shuffle_epi32(HFilter, _MM_SHUFFLE(3, 3, 3, 3));
+ src_ptr -= (kInterp_Extend - 1) * src_stride + (kInterp_Extend - 1);
+
+ {
+ __m128i mad_all0;
+ __m128i mad_all1;
+ __m128i mad_all2;
+ __m128i mad_all3;
+ DO_FOUR_PIXELS(mad_all0, src_ptr, 0*src_stride)
+ DO_FOUR_PIXELS(mad_all1, src_ptr, 1*src_stride)
+ DO_FOUR_PIXELS(mad_all2, src_ptr, 2*src_stride)
+ DO_FOUR_PIXELS(mad_all3, src_ptr, 3*src_stride)
+ mad_all0 = _mm_packs_epi32(mad_all0, mad_all1);
+ mad_all2 = _mm_packs_epi32(mad_all2, mad_all3);
+ intermediateA = _mm_packus_epi16(mad_all0, mad_all2);
+ // --
+ src_ptr += src_stride*4;
+ // --
+ DO_FOUR_PIXELS(mad_all0, src_ptr, 0*src_stride)
+ DO_FOUR_PIXELS(mad_all1, src_ptr, 1*src_stride)
+ DO_FOUR_PIXELS(mad_all2, src_ptr, 2*src_stride)
+ DO_FOUR_PIXELS(mad_all3, src_ptr, 3*src_stride)
+ mad_all0 = _mm_packs_epi32(mad_all0, mad_all1);
+ mad_all2 = _mm_packs_epi32(mad_all2, mad_all3);
+ intermediateB = _mm_packus_epi16(mad_all0, mad_all2);
+ // --
+ src_ptr += src_stride*4;
+ // --
+ DO_FOUR_PIXELS(mad_all0, src_ptr, 0*src_stride)
+ DO_FOUR_PIXELS(mad_all1, src_ptr, 1*src_stride)
+ DO_FOUR_PIXELS(mad_all2, src_ptr, 2*src_stride)
+ mad_all0 = _mm_packs_epi32(mad_all0, mad_all1);
+ mad_all2 = _mm_packs_epi32(mad_all2, mad_all2);
+ intermediateC = _mm_packus_epi16(mad_all0, mad_all2);
+ }
+ }
+
+ // Transpose result (intermediate -> transpose3_x)
+ {
+ // 00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33
+ // 40 41 42 43 50 51 52 53 60 61 62 63 70 71 72 73
+ // 80 81 82 83 90 91 92 93 A0 A1 A2 A3 xx xx xx xx
+ const __m128i transpose0_0 = _mm_unpacklo_epi8(intermediateA, intermediateB);
+ const __m128i transpose0_1 = _mm_unpackhi_epi8(intermediateA, intermediateB);
+ const __m128i transpose0_2 = _mm_unpacklo_epi8(intermediateC, intermediateC);
+ const __m128i transpose0_3 = _mm_unpackhi_epi8(intermediateC, intermediateC);
+ // 00 40 01 41 02 42 03 43 10 50 11 51 12 52 13 53
+ // 20 60 21 61 22 62 23 63 30 70 31 71 32 72 33 73
+ // 80 xx 81 xx 82 xx 83 xx 90 xx 91 xx 92 xx 93 xx
+ // A0 xx A1 xx A2 xx A3 xx xx xx xx xx xx xx xx xx
+ const __m128i transpose1_0 = _mm_unpacklo_epi8(transpose0_0, transpose0_1);
+ const __m128i transpose1_1 = _mm_unpackhi_epi8(transpose0_0, transpose0_1);
+ const __m128i transpose1_2 = _mm_unpacklo_epi8(transpose0_2, transpose0_3);
+ const __m128i transpose1_3 = _mm_unpackhi_epi8(transpose0_2, transpose0_3);
+ // 00 20 40 60 01 21 41 61 02 22 42 62 03 23 43 63
+ // 10 30 50 70 11 31 51 71 12 32 52 72 13 33 53 73
+ // 80 A0 xx xx 81 A1 xx xx 82 A2 xx xx 83 A3 xx xx
+ // 90 xx xx xx 91 xx xx xx 92 xx xx xx 93 xx xx xx
+ const __m128i transpose2_0 = _mm_unpacklo_epi8(transpose1_0, transpose1_1);
+ const __m128i transpose2_1 = _mm_unpackhi_epi8(transpose1_0, transpose1_1);
+ const __m128i transpose2_2 = _mm_unpacklo_epi8(transpose1_2, transpose1_3);
+ const __m128i transpose2_3 = _mm_unpackhi_epi8(transpose1_2, transpose1_3);
+ // 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71
+ // 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73
+ // 80 90 A0 xx xx xx xx xx 81 91 A1 xx xx xx xx xx
+ // 82 92 A2 xx xx xx xx xx 83 93 A3 xx xx xx xx xx
+ transpose3_0 = _mm_castps_si128(
+ _mm_shuffle_ps(_mm_castsi128_ps(transpose2_0),
+ _mm_castsi128_ps(transpose2_2),
+ _MM_SHUFFLE(1, 0, 1, 0)));
+ transpose3_1 = _mm_castps_si128(
+ _mm_shuffle_ps(_mm_castsi128_ps(transpose2_0),
+ _mm_castsi128_ps(transpose2_2),
+ _MM_SHUFFLE(3, 2, 3, 2)));
+ transpose3_2 = _mm_castps_si128(
+ _mm_shuffle_ps(_mm_castsi128_ps(transpose2_1),
+ _mm_castsi128_ps(transpose2_3),
+ _MM_SHUFFLE(1, 0, 1, 0)));
+ transpose3_3 = _mm_castps_si128(
+ _mm_shuffle_ps(_mm_castsi128_ps(transpose2_1),
+ _mm_castsi128_ps(transpose2_3),
+ _MM_SHUFFLE(3, 2, 3, 2)));
+ // 00 10 20 30 40 50 60 70 80 90 A0 xx xx xx xx xx
+ // 01 11 21 31 41 51 61 71 81 91 A1 xx xx xx xx xx
+ // 02 12 22 32 42 52 62 72 82 92 A2 xx xx xx xx xx
+ // 03 13 23 33 43 53 63 73 83 93 A3 xx xx xx xx xx
+ }
+
+ // Vertical pass (transpose3_x -> dst).
+ {
+ const __m128i VFilter = _mm_load_si128((const __m128i *)VFilter_aligned16);
+ // get first two columns filter coefficients
+ __m128i fil01 = _mm_shuffle_epi32(VFilter, _MM_SHUFFLE(0, 0, 0, 0));
+ __m128i fil23 = _mm_shuffle_epi32(VFilter, _MM_SHUFFLE(1, 1, 1, 1));
+ __m128i fil45 = _mm_shuffle_epi32(VFilter, _MM_SHUFFLE(2, 2, 2, 2));
+ __m128i fil67 = _mm_shuffle_epi32(VFilter, _MM_SHUFFLE(3, 3, 3, 3));
+ __m128i col0, col1, col2, col3;
+ DECLARE_ALIGNED(16, unsigned char, temp[32]);
+ {
+ _mm_store_si128((__m128i *)temp, transpose3_0);
+ DO_FOUR_PIXELS(col0, temp, 0);
+ }
+ {
+ _mm_store_si128((__m128i *)temp, transpose3_1);
+ DO_FOUR_PIXELS(col1, temp, 0);
+ }
+ {
+ _mm_store_si128((__m128i *)temp, transpose3_2);
+ DO_FOUR_PIXELS(col2, temp, 0);
+ }
+ {
+ _mm_store_si128((__m128i *)temp, transpose3_3);
+ DO_FOUR_PIXELS(col3, temp, 0);
+ }
+ // transpose
+ {
+ __m128i T0 = _mm_unpacklo_epi32(col0, col1);
+ __m128i T1 = _mm_unpacklo_epi32(col2, col3);
+ __m128i T2 = _mm_unpackhi_epi32(col0, col1);
+ __m128i T3 = _mm_unpackhi_epi32(col2, col3);
+ col0 = _mm_unpacklo_epi64(T0, T1);
+ col1 = _mm_unpackhi_epi64(T0, T1);
+ col2 = _mm_unpacklo_epi64(T2, T3);
+ col3 = _mm_unpackhi_epi64(T2, T3);
+ }
+ // saturate to 8 bit
+ {
+ col0 = _mm_packs_epi32(col0, col0);
+ col0 = _mm_packus_epi16(col0, col0);
+ col1 = _mm_packs_epi32(col1, col1);
+ col1 = _mm_packus_epi16(col1, col1);
+ col2 = _mm_packs_epi32 (col2, col2);
+ col2 = _mm_packus_epi16(col2, col2);
+ col3 = _mm_packs_epi32 (col3, col3);
+ col3 = _mm_packus_epi16(col3, col3);
+ }
+ // store
+ {
+ *((unsigned int *)&dst_ptr[dst_stride * 0]) = _mm_cvtsi128_si32(col0);
+ *((unsigned int *)&dst_ptr[dst_stride * 1]) = _mm_cvtsi128_si32(col1);
+ *((unsigned int *)&dst_ptr[dst_stride * 2]) = _mm_cvtsi128_si32(col2);
+ *((unsigned int *)&dst_ptr[dst_stride * 3]) = _mm_cvtsi128_si32(col3);
+ }
+ }
+ }
+}
+
+void vp9_filter_block2d_8x4_8_sse2
+(
+ const unsigned char *src_ptr, const unsigned int src_stride,
+ const short *HFilter_aligned16, const short *VFilter_aligned16,
+ unsigned char *dst_ptr, unsigned int dst_stride
+) {
+ int j;
+ for (j=0; j<8; j+=4) {
+ vp9_filter_block2d_4x4_8_sse2(src_ptr + j, src_stride,
+ HFilter_aligned16, VFilter_aligned16,
+ dst_ptr + j, dst_stride);
+ }
+}
+
+void vp9_filter_block2d_8x8_8_sse2
+(
+ const unsigned char *src_ptr, const unsigned int src_stride,
+ const short *HFilter_aligned16, const short *VFilter_aligned16,
+ unsigned char *dst_ptr, unsigned int dst_stride
+) {
+ int i, j;
+ for (i=0; i<8; i+=4) {
+ for (j=0; j<8; j+=4) {
+ vp9_filter_block2d_4x4_8_sse2(src_ptr + j + i*src_stride, src_stride,
+ HFilter_aligned16, VFilter_aligned16,
+ dst_ptr + j + i*dst_stride, dst_stride);
+ }
+ }
+}
+
+void vp9_filter_block2d_16x16_8_sse2
+(
+ const unsigned char *src_ptr, const unsigned int src_stride,
+ const short *HFilter_aligned16, const short *VFilter_aligned16,
+ unsigned char *dst_ptr, unsigned int dst_stride
+) {
+ int i, j;
+ for (i=0; i<16; i+=4) {
+ for (j=0; j<16; j+=4) {
+ vp9_filter_block2d_4x4_8_sse2(src_ptr + j + i*src_stride, src_stride,
+ HFilter_aligned16, VFilter_aligned16,
+ dst_ptr + j + i*dst_stride, dst_stride);
+ }
+ }
+}
--- /dev/null
+++ b/vp9/common/x86/filter_sse4.c
@@ -1,0 +1,362 @@
+/*
+ * Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h> // for alignment checks
+#include <smmintrin.h> // SSE4.1
+#include "vp9/common/filter.h"
+#include "vpx_ports/mem.h" // for DECLARE_ALIGNED
+#include "vpx_rtcd.h"
+
+// TODO(cd): After cleanup, commit faster versions for non 4x4 size. This is
+//           just a quick partial snapshot so that others can already use
+//           some of the speedup.
+// TODO(cd): Use vectorized 8 tap filtering code as speedup to pure C 6 tap
+// filtering.
+// TODO(cd): Reduce source size by using macros instead of current code
+// duplication.
+// TODO(cd): Add some comments, better variable naming.
+// TODO(cd): Maybe use _mm_maddubs_epi16 if smaller filter coefficients (no
+//           sum of positives above 128), or have higher precision filter
+// coefficients.
+
+DECLARE_ALIGNED(16, static const unsigned char, mask0123_c[16]) = {
+ 0x00, 0x01,
+ 0x01, 0x02,
+ 0x02, 0x03,
+ 0x03, 0x04,
+ 0x02, 0x03,
+ 0x03, 0x04,
+ 0x04, 0x05,
+ 0x05, 0x06,
+};
+DECLARE_ALIGNED(16, static const unsigned char, mask4567_c[16]) = {
+ 0x04, 0x05,
+ 0x05, 0x06,
+ 0x06, 0x07,
+ 0x07, 0x08,
+ 0x06, 0x07,
+ 0x07, 0x08,
+ 0x08, 0x09,
+ 0x09, 0x0A,
+};
+DECLARE_ALIGNED(16, static const unsigned int, rounding_c[4]) = {
+ VP9_FILTER_WEIGHT >> 1,
+ VP9_FILTER_WEIGHT >> 1,
+ VP9_FILTER_WEIGHT >> 1,
+ VP9_FILTER_WEIGHT >> 1,
+};
+DECLARE_ALIGNED(16, static const unsigned char, transpose_c[16]) = {
+ 0, 4, 8, 12,
+ 1, 5, 9, 13,
+ 2, 6, 10, 14,
+ 3, 7, 11, 15
+};
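+// transpose_c is a pshufb control mask: applied via _mm_shuffle_epi8() to a
+// register holding a 4x4 block of bytes in row-major order, it reorders the
+// bytes into column-major order, i.e. it transposes the block in one shuffle.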
+
+// Creating a macro to do more than four pixels at once to hide instruction
+// latency is actually slower :-(
+#define DO_FOUR_PIXELS(result, offset) \
+ { \
+ /*load pixels*/ \
+ __m128i src = _mm_loadu_si128((const __m128i *)(src_ptr + offset)); \
+ /* extract the ones used for first column */ \
+ __m128i src0123 = _mm_shuffle_epi8(src, mask0123); \
+ __m128i src4567 = _mm_shuffle_epi8(src, mask4567); \
+ __m128i src01_16 = _mm_unpacklo_epi8(src0123, zero); \
+ __m128i src23_16 = _mm_unpackhi_epi8(src0123, zero); \
+ __m128i src45_16 = _mm_unpacklo_epi8(src4567, zero); \
+ __m128i src67_16 = _mm_unpackhi_epi8(src4567, zero); \
+ /* multiply accumulate them */ \
+ __m128i mad01 = _mm_madd_epi16(src01_16, fil01); \
+ __m128i mad23 = _mm_madd_epi16(src23_16, fil23); \
+ __m128i mad45 = _mm_madd_epi16(src45_16, fil45); \
+ __m128i mad67 = _mm_madd_epi16(src67_16, fil67); \
+ __m128i mad0123 = _mm_add_epi32(mad01, mad23); \
+ __m128i mad4567 = _mm_add_epi32(mad45, mad67); \
+ __m128i mad_all = _mm_add_epi32(mad0123, mad4567); \
+ mad_all = _mm_add_epi32(mad_all, rounding); \
+ result = _mm_srai_epi32(mad_all, VP9_FILTER_SHIFT); \
+ }
+
+void vp9_filter_block2d_4x4_8_sse4_1
+(
+ const unsigned char *src_ptr, const unsigned int src_stride,
+ const short *HFilter_aligned16, const short *VFilter_aligned16,
+ unsigned char *dst_ptr, unsigned int dst_stride
+) {
+ __m128i intermediateA, intermediateB, intermediateC;
+
+ const int kInterp_Extend = 4;
+
+ const __m128i zero = _mm_set1_epi16(0);
+ const __m128i mask0123 = _mm_load_si128((const __m128i *)mask0123_c);
+ const __m128i mask4567 = _mm_load_si128((const __m128i *)mask4567_c);
+ const __m128i rounding = _mm_load_si128((const __m128i *)rounding_c);
+ const __m128i transpose = _mm_load_si128((const __m128i *)transpose_c);
+
+ // check alignment
+ assert(0 == ((long)HFilter_aligned16)%16);
+ assert(0 == ((long)VFilter_aligned16)%16);
+
+ {
+ __m128i transpose3_0;
+ __m128i transpose3_1;
+ __m128i transpose3_2;
+ __m128i transpose3_3;
+
+ // Horizontal pass (src -> intermediate).
+ {
+ const __m128i HFilter = _mm_load_si128((const __m128i *)HFilter_aligned16);
+ // get first two columns filter coefficients
+ __m128i fil01 = _mm_shuffle_epi32(HFilter, _MM_SHUFFLE(0, 0, 0, 0));
+ __m128i fil23 = _mm_shuffle_epi32(HFilter, _MM_SHUFFLE(1, 1, 1, 1));
+ __m128i fil45 = _mm_shuffle_epi32(HFilter, _MM_SHUFFLE(2, 2, 2, 2));
+ __m128i fil67 = _mm_shuffle_epi32(HFilter, _MM_SHUFFLE(3, 3, 3, 3));
+ src_ptr -= (kInterp_Extend - 1) * src_stride + (kInterp_Extend - 1);
+
+ {
+ __m128i mad_all0;
+ __m128i mad_all1;
+ __m128i mad_all2;
+ __m128i mad_all3;
+ DO_FOUR_PIXELS(mad_all0, 0*src_stride)
+ DO_FOUR_PIXELS(mad_all1, 1*src_stride)
+ DO_FOUR_PIXELS(mad_all2, 2*src_stride)
+ DO_FOUR_PIXELS(mad_all3, 3*src_stride)
+ mad_all0 = _mm_packs_epi32(mad_all0, mad_all1);
+ mad_all2 = _mm_packs_epi32(mad_all2, mad_all3);
+ intermediateA = _mm_packus_epi16(mad_all0, mad_all2);
+ // --
+ src_ptr += src_stride*4;
+ // --
+ DO_FOUR_PIXELS(mad_all0, 0*src_stride)
+ DO_FOUR_PIXELS(mad_all1, 1*src_stride)
+ DO_FOUR_PIXELS(mad_all2, 2*src_stride)
+ DO_FOUR_PIXELS(mad_all3, 3*src_stride)
+ mad_all0 = _mm_packs_epi32(mad_all0, mad_all1);
+ mad_all2 = _mm_packs_epi32(mad_all2, mad_all3);
+ intermediateB = _mm_packus_epi16(mad_all0, mad_all2);
+ // --
+ src_ptr += src_stride*4;
+ // --
+ DO_FOUR_PIXELS(mad_all0, 0*src_stride)
+ DO_FOUR_PIXELS(mad_all1, 1*src_stride)
+ DO_FOUR_PIXELS(mad_all2, 2*src_stride)
+ mad_all0 = _mm_packs_epi32(mad_all0, mad_all1);
+ mad_all2 = _mm_packs_epi32(mad_all2, mad_all2);
+ intermediateC = _mm_packus_epi16(mad_all0, mad_all2);
+ }
+ }
+
+ // Transpose result (intermediate -> transpose3_x)
+ {
+ // 00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33
+ // 40 41 42 43 50 51 52 53 60 61 62 63 70 71 72 73
+ // 80 81 82 83 90 91 92 93 A0 A1 A2 A3 xx xx xx xx
+ const __m128i transpose1_0 = _mm_shuffle_epi8(intermediateA, transpose);
+ const __m128i transpose1_1 = _mm_shuffle_epi8(intermediateB, transpose);
+ const __m128i transpose1_2 = _mm_shuffle_epi8(intermediateC, transpose);
+ // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
+ // 40 50 60 70 41 51 61 71 42 52 62 72 43 53 63 73
+ // 80 90 A0 xx 81 91 A1 xx 82 92 A2 xx 83 93 A3 xx
+ const __m128i transpose2_0 = _mm_unpacklo_epi32(transpose1_0, transpose1_1);
+ const __m128i transpose2_1 = _mm_unpackhi_epi32(transpose1_0, transpose1_1);
+ // 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71
+ // 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73
+ transpose3_0 = _mm_castps_si128(
+ _mm_shuffle_ps(_mm_castsi128_ps(transpose2_0),
+ _mm_castsi128_ps(transpose1_2),
+ _MM_SHUFFLE(0, 0, 1, 0)));
+ transpose3_1 = _mm_castps_si128(
+ _mm_shuffle_ps(_mm_castsi128_ps(transpose2_0),
+ _mm_castsi128_ps(transpose1_2),
+ _MM_SHUFFLE(1, 1, 3, 2)));
+ transpose3_2 = _mm_castps_si128(
+ _mm_shuffle_ps(_mm_castsi128_ps(transpose2_1),
+ _mm_castsi128_ps(transpose1_2),
+ _MM_SHUFFLE(2, 2, 1, 0)));
+ transpose3_3 = _mm_castps_si128(
+ _mm_shuffle_ps(_mm_castsi128_ps(transpose2_1),
+ _mm_castsi128_ps(transpose1_2),
+ _MM_SHUFFLE(3, 3, 3, 2)));
+ // 00 10 20 30 40 50 60 70 80 90 A0 xx xx xx xx xx
+ // 01 11 21 31 41 51 61 71 81 91 A1 xx xx xx xx xx
+ // 02 12 22 32 42 52 62 72 82 92 A2 xx xx xx xx xx
+ // 03 13 23 33 43 53 63 73 83 93 A3 xx xx xx xx xx
+ }
+
+ // Vertical pass (transpose3_x -> dst).
+ {
+ const __m128i VFilter = _mm_load_si128((const __m128i *)VFilter_aligned16);
+ // get first two columns filter coefficients
+ __m128i fil01 = _mm_shuffle_epi32(VFilter, _MM_SHUFFLE(0, 0, 0, 0));
+ __m128i fil23 = _mm_shuffle_epi32(VFilter, _MM_SHUFFLE(1, 1, 1, 1));
+ __m128i fil45 = _mm_shuffle_epi32(VFilter, _MM_SHUFFLE(2, 2, 2, 2));
+ __m128i fil67 = _mm_shuffle_epi32(VFilter, _MM_SHUFFLE(3, 3, 3, 3));
+ __m128i col0, col1, col2, col3;
+ {
+ //load pixels
+ __m128i src = transpose3_0;
+ // extract the ones used for first column
+ __m128i src0123 = _mm_shuffle_epi8(src, mask0123);
+ __m128i src4567 = _mm_shuffle_epi8(src, mask4567);
+ __m128i src01_16 = _mm_unpacklo_epi8(src0123, zero);
+ __m128i src23_16 = _mm_unpackhi_epi8(src0123, zero);
+ __m128i src45_16 = _mm_unpacklo_epi8(src4567, zero);
+ __m128i src67_16 = _mm_unpackhi_epi8(src4567, zero);
+ // multiply accumulate them
+ __m128i mad01 = _mm_madd_epi16(src01_16, fil01);
+ __m128i mad23 = _mm_madd_epi16(src23_16, fil23);
+ __m128i mad45 = _mm_madd_epi16(src45_16, fil45);
+ __m128i mad67 = _mm_madd_epi16(src67_16, fil67);
+ __m128i mad0123 = _mm_add_epi32(mad01, mad23);
+ __m128i mad4567 = _mm_add_epi32(mad45, mad67);
+ __m128i mad_all = _mm_add_epi32(mad0123, mad4567);
+ mad_all = _mm_add_epi32(mad_all, rounding);
+ mad_all = _mm_srai_epi32(mad_all, VP9_FILTER_SHIFT);
+ mad_all = _mm_packs_epi32(mad_all, mad_all);
+ col0 = _mm_packus_epi16(mad_all, mad_all);
+ }
+ {
+ //load pixels
+ __m128i src = transpose3_1;
+ // extract the ones used for first column
+ __m128i src0123 = _mm_shuffle_epi8(src, mask0123);
+ __m128i src4567 = _mm_shuffle_epi8(src, mask4567);
+ __m128i src01_16 = _mm_unpacklo_epi8(src0123, zero);
+ __m128i src23_16 = _mm_unpackhi_epi8(src0123, zero);
+ __m128i src45_16 = _mm_unpacklo_epi8(src4567, zero);
+ __m128i src67_16 = _mm_unpackhi_epi8(src4567, zero);
+ // multiply accumulate them
+ __m128i mad01 = _mm_madd_epi16(src01_16, fil01);
+ __m128i mad23 = _mm_madd_epi16(src23_16, fil23);
+ __m128i mad45 = _mm_madd_epi16(src45_16, fil45);
+ __m128i mad67 = _mm_madd_epi16(src67_16, fil67);
+ __m128i mad0123 = _mm_add_epi32(mad01, mad23);
+ __m128i mad4567 = _mm_add_epi32(mad45, mad67);
+ __m128i mad_all = _mm_add_epi32(mad0123, mad4567);
+ mad_all = _mm_add_epi32(mad_all, rounding);
+ mad_all = _mm_srai_epi32(mad_all, VP9_FILTER_SHIFT);
+ mad_all = _mm_packs_epi32(mad_all, mad_all);
+ col1 = _mm_packus_epi16(mad_all, mad_all);
+ }
+ {
+ //load pixels
+ __m128i src = transpose3_2;
+ // extract the ones used for first column
+ __m128i src0123 = _mm_shuffle_epi8(src, mask0123);
+ __m128i src4567 = _mm_shuffle_epi8(src, mask4567);
+ __m128i src01_16 = _mm_unpacklo_epi8(src0123, zero);
+ __m128i src23_16 = _mm_unpackhi_epi8(src0123, zero);
+ __m128i src45_16 = _mm_unpacklo_epi8(src4567, zero);
+ __m128i src67_16 = _mm_unpackhi_epi8(src4567, zero);
+ // multiply accumulate them
+ __m128i mad01 = _mm_madd_epi16(src01_16, fil01);
+ __m128i mad23 = _mm_madd_epi16(src23_16, fil23);
+ __m128i mad45 = _mm_madd_epi16(src45_16, fil45);
+ __m128i mad67 = _mm_madd_epi16(src67_16, fil67);
+ __m128i mad0123 = _mm_add_epi32(mad01, mad23);
+ __m128i mad4567 = _mm_add_epi32(mad45, mad67);
+ __m128i mad_all = _mm_add_epi32(mad0123, mad4567);
+ mad_all = _mm_add_epi32(mad_all, rounding);
+ mad_all = _mm_srai_epi32(mad_all, VP9_FILTER_SHIFT);
+ mad_all = _mm_packs_epi32(mad_all, mad_all);
+ col2 = _mm_packus_epi16(mad_all, mad_all);
+ }
+ {
+ //load pixels
+ __m128i src = transpose3_3;
+ // extract the ones used for first column
+ __m128i src0123 = _mm_shuffle_epi8(src, mask0123);
+ __m128i src4567 = _mm_shuffle_epi8(src, mask4567);
+ __m128i src01_16 = _mm_unpacklo_epi8(src0123, zero);
+ __m128i src23_16 = _mm_unpackhi_epi8(src0123, zero);
+ __m128i src45_16 = _mm_unpacklo_epi8(src4567, zero);
+ __m128i src67_16 = _mm_unpackhi_epi8(src4567, zero);
+ // multiply accumulate them
+ __m128i mad01 = _mm_madd_epi16(src01_16, fil01);
+ __m128i mad23 = _mm_madd_epi16(src23_16, fil23);
+ __m128i mad45 = _mm_madd_epi16(src45_16, fil45);
+ __m128i mad67 = _mm_madd_epi16(src67_16, fil67);
+ __m128i mad0123 = _mm_add_epi32(mad01, mad23);
+ __m128i mad4567 = _mm_add_epi32(mad45, mad67);
+ __m128i mad_all = _mm_add_epi32(mad0123, mad4567);
+ mad_all = _mm_add_epi32(mad_all, rounding);
+ mad_all = _mm_srai_epi32(mad_all, VP9_FILTER_SHIFT);
+ mad_all = _mm_packs_epi32(mad_all, mad_all);
+ col3 = _mm_packus_epi16(mad_all, mad_all);
+ }
+ {
+ __m128i col01 = _mm_unpacklo_epi8(col0, col1);
+ __m128i col23 = _mm_unpacklo_epi8(col2, col3);
+ __m128i col0123 = _mm_unpacklo_epi16(col01, col23);
+ //TODO(cd): look into Ronald's comment:
+ // Future suggestion: I believe here, too, you can merge the
+        // packs_epi32() and packus_epi16() for the 4 cols above, so that
+ // you get the data in a single register, and then use pshufb
+ // (shuffle_epi8()) instead of the unpacks here. Should be
+ // 2+3+2 instructions faster.
+ *((unsigned int *)&dst_ptr[dst_stride * 0]) =
+ _mm_extract_epi32(col0123, 0);
+ *((unsigned int *)&dst_ptr[dst_stride * 1]) =
+ _mm_extract_epi32(col0123, 1);
+ *((unsigned int *)&dst_ptr[dst_stride * 2]) =
+ _mm_extract_epi32(col0123, 2);
+ *((unsigned int *)&dst_ptr[dst_stride * 3]) =
+ _mm_extract_epi32(col0123, 3);
+ }
+ }
+ }
+}
+
+void vp9_filter_block2d_8x4_8_sse4_1
+(
+ const unsigned char *src_ptr, const unsigned int src_stride,
+ const short *HFilter_aligned16, const short *VFilter_aligned16,
+ unsigned char *dst_ptr, unsigned int dst_stride
+) {
+ int j;
+ for (j=0; j<8; j+=4) {
+ vp9_filter_block2d_4x4_8_sse4_1(src_ptr + j, src_stride,
+ HFilter_aligned16, VFilter_aligned16,
+ dst_ptr + j, dst_stride);
+ }
+}
+
+void vp9_filter_block2d_8x8_8_sse4_1
+(
+ const unsigned char *src_ptr, const unsigned int src_stride,
+ const short *HFilter_aligned16, const short *VFilter_aligned16,
+ unsigned char *dst_ptr, unsigned int dst_stride
+) {
+ int i, j;
+ for (i=0; i<8; i+=4) {
+ for (j=0; j<8; j+=4) {
+ vp9_filter_block2d_4x4_8_sse4_1(src_ptr + j + i*src_stride, src_stride,
+ HFilter_aligned16, VFilter_aligned16,
+ dst_ptr + j + i*dst_stride, dst_stride);
+ }
+ }
+}
+
+void vp9_filter_block2d_16x16_8_sse4_1
+(
+ const unsigned char *src_ptr, const unsigned int src_stride,
+ const short *HFilter_aligned16, const short *VFilter_aligned16,
+ unsigned char *dst_ptr, unsigned int dst_stride
+) {
+ int i, j;
+ for (i=0; i<16; i+=4) {
+ for (j=0; j<16; j+=4) {
+ vp9_filter_block2d_4x4_8_sse4_1(src_ptr + j + i*src_stride, src_stride,
+ HFilter_aligned16, VFilter_aligned16,
+ dst_ptr + j + i*dst_stride, dst_stride);
+ }
+ }
+}
--- /dev/null
+++ b/vp9/common/x86/idct_x86.h
@@ -1,0 +1,64 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef IDCT_X86_H
+#define IDCT_X86_H
+
+/* Note:
+ *
+ * This platform is commonly built for runtime CPU detection. If you modify
+ * any of the function mappings present in this file, be sure to also update
+ * them in the function pointer initialization code
+ */
+
+#if HAVE_MMX
+extern prototype_idct(vp9_short_idct4x4llm_1_mmx);
+extern prototype_idct(vp9_short_idct4x4llm_mmx);
+extern prototype_idct_scalar_add(vp9_dc_only_idct_add_mmx);
+
+extern prototype_second_order(vp9_short_inv_walsh4x4_mmx);
+extern prototype_second_order(vp9_short_inv_walsh4x4_1_mmx);
+
+#if !CONFIG_RUNTIME_CPU_DETECT
+#undef vp9_idct_idct1
+#define vp9_idct_idct1 vp9_short_idct4x4llm_1_mmx
+
+#undef vp9_idct_idct16
+#define vp9_idct_idct16 vp9_short_idct4x4llm_mmx
+
+#undef vp9_idct_idct1_scalar_add
+#define vp9_idct_idct1_scalar_add vp9_dc_only_idct_add_mmx
+
+#undef vp9_idct_iwalsh16
+#define vp9_idct_iwalsh16 vp9_short_inv_walsh4x4_mmx
+
+#undef vp9_idct_iwalsh1
+#define vp9_idct_iwalsh1 vp9_short_inv_walsh4x4_1_mmx
+
+#endif
+#endif
+
+#if HAVE_SSE2
+
+extern prototype_second_order(vp9_short_inv_walsh4x4_sse2);
+
+#if !CONFIG_RUNTIME_CPU_DETECT
+
+#undef vp9_idct_iwalsh16
+#define vp9_idct_iwalsh16 vp9_short_inv_walsh4x4_sse2
+
+#endif
+
+#endif
+
+
+
+#endif
--- /dev/null
+++ b/vp9/common/x86/idctllm_mmx.asm
@@ -1,0 +1,241 @@
+;
+; Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+%include "third_party/x86inc/x86inc.asm"
+
+SECTION_RODATA
+align 16
+x_s1sqr2: times 4 dw 0x8A8C
+align 16
+x_c1sqr2less1: times 4 dw 0x4E7B
+align 16
+pw_16: times 4 dw 16
+
+SECTION .text
+
+
+; /****************************************************************************
+; * Notes:
+; *
+; * This implementation makes use of 16 bit fixed point version of two multiply
+; * constants:
+; * 1. sqrt(2) * cos (pi/8)
+; * 2. sqrt(2) * sin (pi/8)
+; * Because the first constant is bigger than 1, to maintain the same 16 bit
+; * fixed point precision as the second one, we use a trick of
+; * x * a = x + x*(a-1)
+; * so
+; * x * sqrt(2) * cos (pi/8) = x + x * (sqrt(2) *cos(pi/8)-1).
+; *
+; * For the second constant, the 16 bit fixed point version is 35468, which
+; * is bigger than 32768, so in a signed 16 bit multiply it behaves as a
+; * negative number; this is compensated for by the identity
+; * (x * (unsigned)35468 >> 16) = x * (signed)35468 >> 16 + x
+; *
+; **************************************************************************/
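+;
+; For reference, the fixed point values used below are
+;   x_s1sqr2      = 0x8A8C = 35468 ~= 65536 * sqrt(2) * sin(pi/8)
+;   x_c1sqr2less1 = 0x4E7B = 20091 ~= 65536 * (sqrt(2) * cos(pi/8) - 1)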
+
+INIT_MMX
+
+;void short_idct4x4llm_mmx(short *input, short *output, int pitch)
+cglobal short_idct4x4llm_mmx, 3,3,0, inp, out, pit
+ mova m0, [inpq +0]
+ mova m1, [inpq +8]
+
+ mova m2, [inpq+16]
+ mova m3, [inpq+24]
+
+ psubw m0, m2 ; b1= 0-2
+ paddw m2, m2 ;
+
+ mova m5, m1
+ paddw m2, m0 ; a1 =0+2
+
+ pmulhw m5, [x_s1sqr2] ;
+ paddw m5, m1 ; ip1 * sin(pi/8) * sqrt(2)
+
+ mova m7, m3 ;
+ pmulhw m7, [x_c1sqr2less1] ;
+
+ paddw m7, m3 ; ip3 * cos(pi/8) * sqrt(2)
+ psubw m7, m5 ; c1
+
+ mova m5, m1
+ mova m4, m3
+
+ pmulhw m5, [x_c1sqr2less1]
+ paddw m5, m1
+
+ pmulhw m3, [x_s1sqr2]
+ paddw m3, m4
+
+ paddw m3, m5 ; d1
+ mova m6, m2 ; a1
+
+ mova m4, m0 ; b1
+ paddw m2, m3 ;0
+
+ paddw m4, m7 ;1
+ psubw m0, m7 ;2
+
+ psubw m6, m3 ;3
+
+ mova m1, m2 ; 03 02 01 00
+ mova m3, m4 ; 23 22 21 20
+
+ punpcklwd m1, m0 ; 11 01 10 00
+ punpckhwd m2, m0 ; 13 03 12 02
+
+ punpcklwd m3, m6 ; 31 21 30 20
+ punpckhwd m4, m6 ; 33 23 32 22
+
+ mova m0, m1 ; 11 01 10 00
+ mova m5, m2 ; 13 03 12 02
+
+ punpckldq m0, m3 ; 30 20 10 00
+ punpckhdq m1, m3 ; 31 21 11 01
+
+ punpckldq m2, m4 ; 32 22 12 02
+ punpckhdq m5, m4 ; 33 23 13 03
+
+ mova m3, m5 ; 33 23 13 03
+
+ psubw m0, m2 ; b1= 0-2
+ paddw m2, m2 ;
+
+ mova m5, m1
+ paddw m2, m0 ; a1 =0+2
+
+ pmulhw m5, [x_s1sqr2] ;
+ paddw m5, m1 ; ip1 * sin(pi/8) * sqrt(2)
+
+ mova m7, m3 ;
+ pmulhw m7, [x_c1sqr2less1] ;
+
+ paddw m7, m3 ; ip3 * cos(pi/8) * sqrt(2)
+ psubw m7, m5 ; c1
+
+ mova m5, m1
+ mova m4, m3
+
+ pmulhw m5, [x_c1sqr2less1]
+ paddw m5, m1
+
+ pmulhw m3, [x_s1sqr2]
+ paddw m3, m4
+
+ paddw m3, m5 ; d1
+ paddw m0, [pw_16]
+
+ paddw m2, [pw_16]
+ mova m6, m2 ; a1
+
+ mova m4, m0 ; b1
+ paddw m2, m3 ;0
+
+ paddw m4, m7 ;1
+ psubw m0, m7 ;2
+
+ psubw m6, m3 ;3
+ psraw m2, 5
+
+ psraw m0, 5
+ psraw m4, 5
+
+ psraw m6, 5
+
+ mova m1, m2 ; 03 02 01 00
+ mova m3, m4 ; 23 22 21 20
+
+ punpcklwd m1, m0 ; 11 01 10 00
+ punpckhwd m2, m0 ; 13 03 12 02
+
+ punpcklwd m3, m6 ; 31 21 30 20
+ punpckhwd m4, m6 ; 33 23 32 22
+
+ mova m0, m1 ; 11 01 10 00
+ mova m5, m2 ; 13 03 12 02
+
+ punpckldq m0, m3 ; 30 20 10 00
+ punpckhdq m1, m3 ; 31 21 11 01
+
+ punpckldq m2, m4 ; 32 22 12 02
+ punpckhdq m5, m4 ; 33 23 13 03
+
+ mova [outq], m0
+
+    mova            [outq+pitq], m1
+ mova [outq+pitq*2], m2
+
+ add outq, pitq
+ mova [outq+pitq*2], m5
+ RET
+
+;void short_idct4x4llm_1_mmx(short *input, short *output, int pitch)
+cglobal short_idct4x4llm_1_mmx,3,3,0,inp,out,pit
+ movh m0, [inpq]
+ paddw m0, [pw_16]
+ psraw m0, 5
+ punpcklwd m0, m0
+ punpckldq m0, m0
+
+ mova [outq], m0
+ mova [outq+pitq], m0
+
+ mova [outq+pitq*2], m0
+    add             outq, pitq
+
+ mova [outq+pitq*2], m0
+ RET
+
+
+;void dc_only_idct_add_mmx(short input_dc, unsigned char *pred_ptr, unsigned char *dst_ptr, int pitch, int stride)
+cglobal dc_only_idct_add_mmx, 4,5,0,in_dc,pred,dst,pit,stride
+%if ARCH_X86_64
+ movsxd strideq, dword stridem
+%else
+ mov strideq, stridem
+%endif
+ pxor m0, m0
+
+ movh m5, in_dcq ; dc
+ paddw m5, [pw_16]
+
+ psraw m5, 5
+
+ punpcklwd m5, m5
+ punpckldq m5, m5
+
+ movh m1, [predq]
+ punpcklbw m1, m0
+ paddsw m1, m5
+ packuswb m1, m0 ; pack and unpack to saturate
+ movh [dstq], m1
+
+ movh m2, [predq+pitq]
+ punpcklbw m2, m0
+ paddsw m2, m5
+ packuswb m2, m0 ; pack and unpack to saturate
+ movh [dstq+strideq], m2
+
+ movh m3, [predq+2*pitq]
+ punpcklbw m3, m0
+ paddsw m3, m5
+ packuswb m3, m0 ; pack and unpack to saturate
+ movh [dstq+2*strideq], m3
+
+ add dstq, strideq
+ add predq, pitq
+ movh m4, [predq+2*pitq]
+ punpcklbw m4, m0
+ paddsw m4, m5
+ packuswb m4, m0 ; pack and unpack to saturate
+ movh [dstq+2*strideq], m4
+ RET
+
--- /dev/null
+++ b/vp9/common/x86/idctllm_sse2.asm
@@ -1,0 +1,712 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+;void vp9_idct_dequant_0_2x_sse2
+; (
+; short *qcoeff - 0
+; short *dequant - 1
+; unsigned char *pre - 2
+; unsigned char *dst - 3
+; int dst_stride - 4
+; int blk_stride - 5
+; )
+
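+; Fast path used when each of the two 4x4 blocks carries at most a DC
+; coefficient: the inverse transform of such a block is the constant
+; (qcoeff[0] * dequant[0] + 4) >> 3, which is added to every pixel of the
+; 4x4 prediction before repacking to bytes.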
+global sym(vp9_idct_dequant_0_2x_sse2)
+sym(vp9_idct_dequant_0_2x_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ GET_GOT rbx
+ ; end prolog
+
+ mov rdx, arg(1) ; dequant
+ mov rax, arg(0) ; qcoeff
+
+ movd xmm4, [rax]
+ movd xmm5, [rdx]
+
+ pinsrw xmm4, [rax+32], 4
+ pinsrw xmm5, [rdx], 4
+
+ pmullw xmm4, xmm5
+
+ ; Zero out xmm5, for use unpacking
+ pxor xmm5, xmm5
+
+ ; clear coeffs
+ movd [rax], xmm5
+ movd [rax+32], xmm5
+;pshufb
+ pshuflw xmm4, xmm4, 00000000b
+ pshufhw xmm4, xmm4, 00000000b
+
+ mov rax, arg(2) ; pre
+ paddw xmm4, [GLOBAL(fours)]
+
+ movsxd rcx, dword ptr arg(5) ; blk_stride
+ psraw xmm4, 3
+
+ movq xmm0, [rax]
+ movq xmm1, [rax+rcx]
+ movq xmm2, [rax+2*rcx]
+ lea rcx, [3*rcx]
+ movq xmm3, [rax+rcx]
+
+ punpcklbw xmm0, xmm5
+ punpcklbw xmm1, xmm5
+ punpcklbw xmm2, xmm5
+ punpcklbw xmm3, xmm5
+
+ mov rax, arg(3) ; dst
+ movsxd rdx, dword ptr arg(4) ; dst_stride
+
+ ; Add to predict buffer
+ paddw xmm0, xmm4
+ paddw xmm1, xmm4
+ paddw xmm2, xmm4
+ paddw xmm3, xmm4
+
+ ; pack up before storing
+ packuswb xmm0, xmm5
+ packuswb xmm1, xmm5
+ packuswb xmm2, xmm5
+ packuswb xmm3, xmm5
+
+ ; store blocks back out
+ movq [rax], xmm0
+ movq [rax + rdx], xmm1
+
+ lea rax, [rax + 2*rdx]
+
+ movq [rax], xmm2
+ movq [rax + rdx], xmm3
+
+ ; begin epilog
+ RESTORE_GOT
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+global sym(vp9_idct_dequant_full_2x_sse2)
+sym(vp9_idct_dequant_full_2x_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 7
+ SAVE_XMM 7
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+    ; full path: both 4x4 blocks may carry all 16 coefficients, so the
+    ; quantized coefficients are loaded, dequantized and transformed below
+ mov rax, arg(0) ; qcoeff
+ mov rsi, arg(2) ; pre
+ mov rdi, arg(3) ; dst
+ movsxd rcx, dword ptr arg(5) ; blk_stride
+
+ ; Zero out xmm7, for use unpacking
+ pxor xmm7, xmm7
+
+ mov rdx, arg(1) ; dequant
+
+ ; note the transpose of xmm1 and xmm2, necessary for shuffle
+    ; to spit out sensible data
+ movdqa xmm0, [rax]
+ movdqa xmm2, [rax+16]
+ movdqa xmm1, [rax+32]
+ movdqa xmm3, [rax+48]
+
+ ; Clear out coeffs
+ movdqa [rax], xmm7
+ movdqa [rax+16], xmm7
+ movdqa [rax+32], xmm7
+ movdqa [rax+48], xmm7
+
+ ; dequantize qcoeff buffer
+ pmullw xmm0, [rdx]
+ pmullw xmm2, [rdx+16]
+ pmullw xmm1, [rdx]
+ pmullw xmm3, [rdx+16]
+
+ ; repack so block 0 row x and block 1 row x are together
+ movdqa xmm4, xmm0
+ punpckldq xmm0, xmm1
+ punpckhdq xmm4, xmm1
+
+ pshufd xmm0, xmm0, 11011000b
+ pshufd xmm1, xmm4, 11011000b
+
+ movdqa xmm4, xmm2
+ punpckldq xmm2, xmm3
+ punpckhdq xmm4, xmm3
+
+ pshufd xmm2, xmm2, 11011000b
+ pshufd xmm3, xmm4, 11011000b
+
+ ; first pass
+ psubw xmm0, xmm2 ; b1 = 0-2
+ paddw xmm2, xmm2 ;
+
+ movdqa xmm5, xmm1
+ paddw xmm2, xmm0 ; a1 = 0+2
+
+ pmulhw xmm5, [GLOBAL(x_s1sqr2)]
+ paddw xmm5, xmm1 ; ip1 * sin(pi/8) * sqrt(2)
+
+ movdqa xmm7, xmm3
+ pmulhw xmm7, [GLOBAL(x_c1sqr2less1)]
+
+ paddw xmm7, xmm3 ; ip3 * cos(pi/8) * sqrt(2)
+ psubw xmm7, xmm5 ; c1
+
+ movdqa xmm5, xmm1
+ movdqa xmm4, xmm3
+
+ pmulhw xmm5, [GLOBAL(x_c1sqr2less1)]
+ paddw xmm5, xmm1
+
+ pmulhw xmm3, [GLOBAL(x_s1sqr2)]
+ paddw xmm3, xmm4
+
+ paddw xmm3, xmm5 ; d1
+ movdqa xmm6, xmm2 ; a1
+
+ movdqa xmm4, xmm0 ; b1
+ paddw xmm2, xmm3 ;0
+
+ paddw xmm4, xmm7 ;1
+ psubw xmm0, xmm7 ;2
+
+ psubw xmm6, xmm3 ;3
+
+ ; transpose for the second pass
+ movdqa xmm7, xmm2 ; 103 102 101 100 003 002 001 000
+ punpcklwd xmm2, xmm0 ; 007 003 006 002 005 001 004 000
+ punpckhwd xmm7, xmm0 ; 107 103 106 102 105 101 104 100
+
+ movdqa xmm5, xmm4 ; 111 110 109 108 011 010 009 008
+ punpcklwd xmm4, xmm6 ; 015 011 014 010 013 009 012 008
+ punpckhwd xmm5, xmm6 ; 115 111 114 110 113 109 112 108
+
+
+ movdqa xmm1, xmm2 ; 007 003 006 002 005 001 004 000
+ punpckldq xmm2, xmm4 ; 013 009 005 001 012 008 004 000
+ punpckhdq xmm1, xmm4 ; 015 011 007 003 014 010 006 002
+
+ movdqa xmm6, xmm7 ; 107 103 106 102 105 101 104 100
+ punpckldq xmm7, xmm5 ; 113 109 105 101 112 108 104 100
+ punpckhdq xmm6, xmm5 ; 115 111 107 103 114 110 106 102
+
+
+ movdqa xmm5, xmm2 ; 013 009 005 001 012 008 004 000
+ punpckldq xmm2, xmm7 ; 112 108 012 008 104 100 004 000
+ punpckhdq xmm5, xmm7 ; 113 109 013 009 105 101 005 001
+
+ movdqa xmm7, xmm1 ; 015 011 007 003 014 010 006 002
+ punpckldq xmm1, xmm6 ; 114 110 014 010 106 102 006 002
+ punpckhdq xmm7, xmm6 ; 115 111 015 011 107 103 007 003
+
+ pshufd xmm0, xmm2, 11011000b
+ pshufd xmm2, xmm1, 11011000b
+
+ pshufd xmm1, xmm5, 11011000b
+ pshufd xmm3, xmm7, 11011000b
+
+ ; second pass
+ psubw xmm0, xmm2 ; b1 = 0-2
+ paddw xmm2, xmm2
+
+ movdqa xmm5, xmm1
+ paddw xmm2, xmm0 ; a1 = 0+2
+
+ pmulhw xmm5, [GLOBAL(x_s1sqr2)]
+ paddw xmm5, xmm1 ; ip1 * sin(pi/8) * sqrt(2)
+
+ movdqa xmm7, xmm3
+ pmulhw xmm7, [GLOBAL(x_c1sqr2less1)]
+
+ paddw xmm7, xmm3 ; ip3 * cos(pi/8) * sqrt(2)
+ psubw xmm7, xmm5 ; c1
+
+ movdqa xmm5, xmm1
+ movdqa xmm4, xmm3
+
+ pmulhw xmm5, [GLOBAL(x_c1sqr2less1)]
+ paddw xmm5, xmm1
+
+ pmulhw xmm3, [GLOBAL(x_s1sqr2)]
+ paddw xmm3, xmm4
+
+ paddw xmm3, xmm5 ; d1
+ paddw xmm0, [GLOBAL(fours)]
+
+ paddw xmm2, [GLOBAL(fours)]
+ movdqa xmm6, xmm2 ; a1
+
+ movdqa xmm4, xmm0 ; b1
+ paddw xmm2, xmm3 ;0
+
+ paddw xmm4, xmm7 ;1
+ psubw xmm0, xmm7 ;2
+
+ psubw xmm6, xmm3 ;3
+ psraw xmm2, 3
+
+ psraw xmm0, 3
+ psraw xmm4, 3
+
+ psraw xmm6, 3
+
+ ; transpose to save
+ movdqa xmm7, xmm2 ; 103 102 101 100 003 002 001 000
+ punpcklwd xmm2, xmm0 ; 007 003 006 002 005 001 004 000
+ punpckhwd xmm7, xmm0 ; 107 103 106 102 105 101 104 100
+
+ movdqa xmm5, xmm4 ; 111 110 109 108 011 010 009 008
+ punpcklwd xmm4, xmm6 ; 015 011 014 010 013 009 012 008
+ punpckhwd xmm5, xmm6 ; 115 111 114 110 113 109 112 108
+
+
+ movdqa xmm1, xmm2 ; 007 003 006 002 005 001 004 000
+ punpckldq xmm2, xmm4 ; 013 009 005 001 012 008 004 000
+ punpckhdq xmm1, xmm4 ; 015 011 007 003 014 010 006 002
+
+ movdqa xmm6, xmm7 ; 107 103 106 102 105 101 104 100
+ punpckldq xmm7, xmm5 ; 113 109 105 101 112 108 104 100
+ punpckhdq xmm6, xmm5 ; 115 111 107 103 114 110 106 102
+
+
+ movdqa xmm5, xmm2 ; 013 009 005 001 012 008 004 000
+ punpckldq xmm2, xmm7 ; 112 108 012 008 104 100 004 000
+ punpckhdq xmm5, xmm7 ; 113 109 013 009 105 101 005 001
+
+ movdqa xmm7, xmm1 ; 015 011 007 003 014 010 006 002
+ punpckldq xmm1, xmm6 ; 114 110 014 010 106 102 006 002
+ punpckhdq xmm7, xmm6 ; 115 111 015 011 107 103 007 003
+
+ pshufd xmm0, xmm2, 11011000b
+ pshufd xmm2, xmm1, 11011000b
+
+ pshufd xmm1, xmm5, 11011000b
+ pshufd xmm3, xmm7, 11011000b
+
+ pxor xmm7, xmm7
+
+ ; Load up predict blocks
+ movq xmm4, [rsi]
+ movq xmm5, [rsi+rcx]
+
+ punpcklbw xmm4, xmm7
+ punpcklbw xmm5, xmm7
+
+ paddw xmm0, xmm4
+ paddw xmm1, xmm5
+
+ movq xmm4, [rsi+2*rcx]
+ lea rcx, [3*rcx]
+ movq xmm5, [rsi+rcx]
+
+ punpcklbw xmm4, xmm7
+ punpcklbw xmm5, xmm7
+
+ paddw xmm2, xmm4
+ paddw xmm3, xmm5
+
+.finish:
+
+ ; pack up before storing
+ packuswb xmm0, xmm7
+ packuswb xmm1, xmm7
+ packuswb xmm2, xmm7
+ packuswb xmm3, xmm7
+
+ ; Load destination stride before writing out,
+ ; doesn't need to persist
+ movsxd rdx, dword ptr arg(4) ; dst_stride
+
+ ; store blocks back out
+ movq [rdi], xmm0
+ movq [rdi + rdx], xmm1
+
+ lea rdi, [rdi + 2*rdx]
+
+ movq [rdi], xmm2
+ movq [rdi + rdx], xmm3
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+;void vp9_idct_dequant_dc_0_2x_sse2
+; (
+; short *qcoeff - 0
+; short *dequant - 1
+; unsigned char *pre - 2
+; unsigned char *dst - 3
+; int dst_stride - 4
+; short *dc - 5
+; )
+global sym(vp9_idct_dequant_dc_0_2x_sse2)
+sym(vp9_idct_dequant_dc_0_2x_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 7
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ ; special case when 2 blocks have 0 or 1 coeffs
+ ; dc is set as first coeff, so no need to load qcoeff
+ mov rax, arg(0) ; qcoeff
+ mov rsi, arg(2) ; pre
+ mov rdi, arg(3) ; dst
+ mov rdx, arg(5) ; dc
+
+ ; Zero out xmm5, for use unpacking
+ pxor xmm5, xmm5
+
+ ; load up 2 dc words here == 2*16 = doubleword
+ movd xmm4, [rdx]
+
+ ; Load up predict blocks
+ movq xmm0, [rsi]
+ movq xmm1, [rsi+16]
+ movq xmm2, [rsi+32]
+ movq xmm3, [rsi+48]
+
+ ; Duplicate and expand dc across
+ punpcklwd xmm4, xmm4
+ punpckldq xmm4, xmm4
+
+ ; Rounding to dequant and downshift
+ paddw xmm4, [GLOBAL(fours)]
+ psraw xmm4, 3
+
+ ; Predict buffer needs to be expanded from bytes to words
+ punpcklbw xmm0, xmm5
+ punpcklbw xmm1, xmm5
+ punpcklbw xmm2, xmm5
+ punpcklbw xmm3, xmm5
+
+ ; Add to predict buffer
+ paddw xmm0, xmm4
+ paddw xmm1, xmm4
+ paddw xmm2, xmm4
+ paddw xmm3, xmm4
+
+ ; pack up before storing
+ packuswb xmm0, xmm5
+ packuswb xmm1, xmm5
+ packuswb xmm2, xmm5
+ packuswb xmm3, xmm5
+
+ ; Load destination stride before writing out,
+ ; doesn't need to persist
+ movsxd rdx, dword ptr arg(4) ; dst_stride
+
+ ; store blocks back out
+ movq [rdi], xmm0
+ movq [rdi + rdx], xmm1
+
+ lea rdi, [rdi + 2*rdx]
+
+ movq [rdi], xmm2
+ movq [rdi + rdx], xmm3
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+global sym(vp9_idct_dequant_dc_full_2x_sse2)
+sym(vp9_idct_dequant_dc_full_2x_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 7
+ SAVE_XMM 7
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+    ; full path: both 4x4 blocks may carry all 16 coefficients; the DC terms
+    ; are re-inserted from the separate dc buffer (arg 5) further below
+ mov rax, arg(0) ; qcoeff
+ mov rsi, arg(2) ; pre
+ mov rdi, arg(3) ; dst
+
+ ; Zero out xmm7, for use unpacking
+ pxor xmm7, xmm7
+
+ mov rdx, arg(1) ; dequant
+
+ ; note the transpose of xmm1 and xmm2, necessary for shuffle
+    ; to spit out sensible data
+ movdqa xmm0, [rax]
+ movdqa xmm2, [rax+16]
+ movdqa xmm1, [rax+32]
+ movdqa xmm3, [rax+48]
+
+ ; Clear out coeffs
+ movdqa [rax], xmm7
+ movdqa [rax+16], xmm7
+ movdqa [rax+32], xmm7
+ movdqa [rax+48], xmm7
+
+ ; dequantize qcoeff buffer
+ pmullw xmm0, [rdx]
+ pmullw xmm2, [rdx+16]
+ pmullw xmm1, [rdx]
+ pmullw xmm3, [rdx+16]
+
+ ; DC component
+ mov rdx, arg(5)
+
+ ; repack so block 0 row x and block 1 row x are together
+ movdqa xmm4, xmm0
+ punpckldq xmm0, xmm1
+ punpckhdq xmm4, xmm1
+
+ pshufd xmm0, xmm0, 11011000b
+ pshufd xmm1, xmm4, 11011000b
+
+ movdqa xmm4, xmm2
+ punpckldq xmm2, xmm3
+ punpckhdq xmm4, xmm3
+
+ pshufd xmm2, xmm2, 11011000b
+ pshufd xmm3, xmm4, 11011000b
+
+ ; insert DC component
+ pinsrw xmm0, [rdx], 0
+ pinsrw xmm0, [rdx+2], 4
+
+ ; first pass
+ psubw xmm0, xmm2 ; b1 = 0-2
+ paddw xmm2, xmm2 ;
+
+ movdqa xmm5, xmm1
+ paddw xmm2, xmm0 ; a1 = 0+2
+
+ pmulhw xmm5, [GLOBAL(x_s1sqr2)]
+ paddw xmm5, xmm1 ; ip1 * sin(pi/8) * sqrt(2)
+
+ movdqa xmm7, xmm3
+ pmulhw xmm7, [GLOBAL(x_c1sqr2less1)]
+
+ paddw xmm7, xmm3 ; ip3 * cos(pi/8) * sqrt(2)
+ psubw xmm7, xmm5 ; c1
+
+ movdqa xmm5, xmm1
+ movdqa xmm4, xmm3
+
+ pmulhw xmm5, [GLOBAL(x_c1sqr2less1)]
+ paddw xmm5, xmm1
+
+ pmulhw xmm3, [GLOBAL(x_s1sqr2)]
+ paddw xmm3, xmm4
+
+ paddw xmm3, xmm5 ; d1
+ movdqa xmm6, xmm2 ; a1
+
+ movdqa xmm4, xmm0 ; b1
+ paddw xmm2, xmm3 ;0
+
+ paddw xmm4, xmm7 ;1
+ psubw xmm0, xmm7 ;2
+
+ psubw xmm6, xmm3 ;3
+
+ ; transpose for the second pass
+ movdqa xmm7, xmm2 ; 103 102 101 100 003 002 001 000
+ punpcklwd xmm2, xmm0 ; 007 003 006 002 005 001 004 000
+ punpckhwd xmm7, xmm0 ; 107 103 106 102 105 101 104 100
+
+ movdqa xmm5, xmm4 ; 111 110 109 108 011 010 009 008
+ punpcklwd xmm4, xmm6 ; 015 011 014 010 013 009 012 008
+ punpckhwd xmm5, xmm6 ; 115 111 114 110 113 109 112 108
+
+
+ movdqa xmm1, xmm2 ; 007 003 006 002 005 001 004 000
+ punpckldq xmm2, xmm4 ; 013 009 005 001 012 008 004 000
+ punpckhdq xmm1, xmm4 ; 015 011 007 003 014 010 006 002
+
+ movdqa xmm6, xmm7 ; 107 103 106 102 105 101 104 100
+ punpckldq xmm7, xmm5 ; 113 109 105 101 112 108 104 100
+ punpckhdq xmm6, xmm5 ; 115 111 107 103 114 110 106 102
+
+
+ movdqa xmm5, xmm2 ; 013 009 005 001 012 008 004 000
+ punpckldq xmm2, xmm7 ; 112 108 012 008 104 100 004 000
+ punpckhdq xmm5, xmm7 ; 113 109 013 009 105 101 005 001
+
+ movdqa xmm7, xmm1 ; 015 011 007 003 014 010 006 002
+ punpckldq xmm1, xmm6 ; 114 110 014 010 106 102 006 002
+ punpckhdq xmm7, xmm6 ; 115 111 015 011 107 103 007 003
+
+ pshufd xmm0, xmm2, 11011000b
+ pshufd xmm2, xmm1, 11011000b
+
+ pshufd xmm1, xmm5, 11011000b
+ pshufd xmm3, xmm7, 11011000b
+
+ ; second pass
+ psubw xmm0, xmm2 ; b1 = 0-2
+ paddw xmm2, xmm2
+
+ movdqa xmm5, xmm1
+ paddw xmm2, xmm0 ; a1 = 0+2
+
+ pmulhw xmm5, [GLOBAL(x_s1sqr2)]
+ paddw xmm5, xmm1 ; ip1 * sin(pi/8) * sqrt(2)
+
+ movdqa xmm7, xmm3
+ pmulhw xmm7, [GLOBAL(x_c1sqr2less1)]
+
+ paddw xmm7, xmm3 ; ip3 * cos(pi/8) * sqrt(2)
+ psubw xmm7, xmm5 ; c1
+
+ movdqa xmm5, xmm1
+ movdqa xmm4, xmm3
+
+ pmulhw xmm5, [GLOBAL(x_c1sqr2less1)]
+ paddw xmm5, xmm1
+
+ pmulhw xmm3, [GLOBAL(x_s1sqr2)]
+ paddw xmm3, xmm4
+
+ paddw xmm3, xmm5 ; d1
+ paddw xmm0, [GLOBAL(fours)]
+
+ paddw xmm2, [GLOBAL(fours)]
+ movdqa xmm6, xmm2 ; a1
+
+ movdqa xmm4, xmm0 ; b1
+ paddw xmm2, xmm3 ;0
+
+ paddw xmm4, xmm7 ;1
+ psubw xmm0, xmm7 ;2
+
+ psubw xmm6, xmm3 ;3
+ psraw xmm2, 3
+
+ psraw xmm0, 3
+ psraw xmm4, 3
+
+ psraw xmm6, 3
+
+ ; transpose to save
+ movdqa xmm7, xmm2 ; 103 102 101 100 003 002 001 000
+ punpcklwd xmm2, xmm0 ; 007 003 006 002 005 001 004 000
+ punpckhwd xmm7, xmm0 ; 107 103 106 102 105 101 104 100
+
+ movdqa xmm5, xmm4 ; 111 110 109 108 011 010 009 008
+ punpcklwd xmm4, xmm6 ; 015 011 014 010 013 009 012 008
+ punpckhwd xmm5, xmm6 ; 115 111 114 110 113 109 112 108
+
+
+ movdqa xmm1, xmm2 ; 007 003 006 002 005 001 004 000
+ punpckldq xmm2, xmm4 ; 013 009 005 001 012 008 004 000
+ punpckhdq xmm1, xmm4 ; 015 011 007 003 014 010 006 002
+
+ movdqa xmm6, xmm7 ; 107 103 106 102 105 101 104 100
+ punpckldq xmm7, xmm5 ; 113 109 105 101 112 108 104 100
+ punpckhdq xmm6, xmm5 ; 115 111 107 103 114 110 106 102
+
+
+ movdqa xmm5, xmm2 ; 013 009 005 001 012 008 004 000
+ punpckldq xmm2, xmm7 ; 112 108 012 008 104 100 004 000
+ punpckhdq xmm5, xmm7 ; 113 109 013 009 105 101 005 001
+
+ movdqa xmm7, xmm1 ; 015 011 007 003 014 010 006 002
+ punpckldq xmm1, xmm6 ; 114 110 014 010 106 102 006 002
+ punpckhdq xmm7, xmm6 ; 115 111 015 011 107 103 007 003
+
+ pshufd xmm0, xmm2, 11011000b
+ pshufd xmm2, xmm1, 11011000b
+
+ pshufd xmm1, xmm5, 11011000b
+ pshufd xmm3, xmm7, 11011000b
+
+ pxor xmm7, xmm7
+
+ ; Load up predict blocks
+ movq xmm4, [rsi]
+ movq xmm5, [rsi+16]
+
+ punpcklbw xmm4, xmm7
+ punpcklbw xmm5, xmm7
+
+ paddw xmm0, xmm4
+ paddw xmm1, xmm5
+
+ movq xmm4, [rsi+32]
+ movq xmm5, [rsi+48]
+
+ punpcklbw xmm4, xmm7
+ punpcklbw xmm5, xmm7
+
+ paddw xmm2, xmm4
+ paddw xmm3, xmm5
+
+.finish:
+
+ ; pack up before storing
+ packuswb xmm0, xmm7
+ packuswb xmm1, xmm7
+ packuswb xmm2, xmm7
+ packuswb xmm3, xmm7
+
+ ; Load destination stride before writing out,
+ ; doesn't need to persist
+ movsxd rdx, dword ptr arg(4) ; dst_stride
+
+ ; store blocks back out
+ movq [rdi], xmm0
+ movq [rdi + rdx], xmm1
+
+ lea rdi, [rdi + 2*rdx]
+
+ movq [rdi], xmm2
+ movq [rdi + rdx], xmm3
+
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+SECTION_RODATA
+align 16
+fours:
+ times 8 dw 0x0004
+align 16
+x_s1sqr2:
+ times 8 dw 0x8A8C
+align 16
+x_c1sqr2less1:
+ times 8 dw 0x4E7B
--- /dev/null
+++ b/vp9/common/x86/iwalsh_mmx.asm
@@ -1,0 +1,173 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+;void vp9_short_inv_walsh4x4_1_mmx(short *input, short *output)
+global sym(vp9_short_inv_walsh4x4_1_mmx)
+sym(vp9_short_inv_walsh4x4_1_mmx):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 2
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rsi, arg(0)
+ mov rax, 3
+
+ mov rdi, arg(1)
+ add rax, [rsi] ;input[0] + 3
+
+ movd mm0, eax
+
+ punpcklwd mm0, mm0 ;x x val val
+
+ punpckldq mm0, mm0 ;val val val val
+
+ psraw mm0, 3 ;(input[0] + 3) >> 3
+
+ movq [rdi + 0], mm0
+ movq [rdi + 8], mm0
+ movq [rdi + 16], mm0
+ movq [rdi + 24], mm0
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+;void vp9_short_inv_walsh4x4_mmx(short *input, short *output)
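+;
+; Illustrative note, not part of the original patch: the code below is the
+; usual two-pass inverse Walsh-Hadamard transform, first over columns, then
+; over rows, with (x + 3) >> 3 rounding at the end (the 00030003h constant
+; built in rax/mm7). A rough scalar C sketch of the same arithmetic
+; (function name only illustrative):
+;
+;   void short_inv_walsh4x4_ref(short *input, short *output) {
+;     int i, a1, b1, c1, d1;
+;     short *ip = input, *op = output;
+;     for (i = 0; i < 4; i++) {                    /* column pass */
+;       a1 = ip[0] + ip[12];  b1 = ip[4] + ip[8];
+;       c1 = ip[4] - ip[8];   d1 = ip[0] - ip[12];
+;       op[0] = a1 + b1;   op[4]  = c1 + d1;
+;       op[8] = a1 - b1;   op[12] = d1 - c1;
+;       ip++; op++;
+;     }
+;     ip = op = output;
+;     for (i = 0; i < 4; i++) {                    /* row pass + rounding */
+;       a1 = ip[0] + ip[3];  b1 = ip[1] + ip[2];
+;       c1 = ip[1] - ip[2];  d1 = ip[0] - ip[3];
+;       op[0] = (short)((a1 + b1 + 3) >> 3);  op[1] = (short)((c1 + d1 + 3) >> 3);
+;       op[2] = (short)((a1 - b1 + 3) >> 3);  op[3] = (short)((d1 - c1 + 3) >> 3);
+;       ip += 4; op += 4;
+;     }
+;   }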
+global sym(vp9_short_inv_walsh4x4_mmx)
+sym(vp9_short_inv_walsh4x4_mmx):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 2
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rax, 3
+ mov rsi, arg(0)
+ mov rdi, arg(1)
+ shl rax, 16
+
+ movq mm0, [rsi + 0] ;ip[0]
+ movq mm1, [rsi + 8] ;ip[4]
+ or rax, 3 ;00030003h
+
+ movq mm2, [rsi + 16] ;ip[8]
+ movq mm3, [rsi + 24] ;ip[12]
+
+ movq mm7, rax
+ movq mm4, mm0
+
+ punpcklwd mm7, mm7 ;0003000300030003h
+ movq mm5, mm1
+
+ paddw mm4, mm3 ;ip[0] + ip[12] aka al
+ paddw mm5, mm2 ;ip[4] + ip[8] aka bl
+
+ movq mm6, mm4 ;temp al
+
+ paddw mm4, mm5 ;al + bl
+ psubw mm6, mm5 ;al - bl
+
+ psubw mm0, mm3 ;ip[0] - ip[12] aka d1
+ psubw mm1, mm2 ;ip[4] - ip[8] aka c1
+
+ movq mm5, mm0 ;temp dl
+
+ paddw mm0, mm1 ;dl + cl
+ psubw mm5, mm1 ;dl - cl
+
+ ; 03 02 01 00
+ ; 13 12 11 10
+ ; 23 22 21 20
+ ; 33 32 31 30
+
+ movq mm3, mm4 ; 03 02 01 00
+ punpcklwd mm4, mm0 ; 11 01 10 00
+ punpckhwd mm3, mm0 ; 13 03 12 02
+
+ movq mm1, mm6 ; 23 22 21 20
+ punpcklwd mm6, mm5 ; 31 21 30 20
+ punpckhwd mm1, mm5 ; 33 23 32 22
+
+ movq mm0, mm4 ; 11 01 10 00
+ movq mm2, mm3 ; 13 03 12 02
+
+ punpckldq mm0, mm6 ; 30 20 10 00 aka ip[0]
+ punpckhdq mm4, mm6 ; 31 21 11 01 aka ip[4]
+
+ punpckldq mm2, mm1 ; 32 22 12 02 aka ip[8]
+ punpckhdq mm3, mm1 ; 33 23 13 03 aka ip[12]
+;~~~~~~~~~~~~~~~~~~~~~
+ movq mm1, mm0
+ movq mm5, mm4
+
+ paddw mm1, mm3 ;ip[0] + ip[12] aka al
+ paddw mm5, mm2 ;ip[4] + ip[8] aka bl
+
+ movq mm6, mm1 ;temp al
+
+ paddw mm1, mm5 ;al + bl
+ psubw mm6, mm5 ;al - bl
+
+ psubw mm0, mm3 ;ip[0] - ip[12] aka d1
+ psubw mm4, mm2 ;ip[4] - ip[8] aka c1
+
+ movq mm5, mm0 ;temp dl
+
+ paddw mm0, mm4 ;dl + cl
+ psubw mm5, mm4 ;dl - cl
+;~~~~~~~~~~~~~~~~~~~~~
+ movq mm3, mm1 ; 03 02 01 00
+ punpcklwd mm1, mm0 ; 11 01 10 00
+ punpckhwd mm3, mm0 ; 13 03 12 02
+
+ movq mm4, mm6 ; 23 22 21 20
+ punpcklwd mm6, mm5 ; 31 21 30 20
+ punpckhwd mm4, mm5 ; 33 23 32 22
+
+ movq mm0, mm1 ; 11 01 10 00
+ movq mm2, mm3 ; 13 03 12 02
+
+ punpckldq mm0, mm6 ; 30 20 10 00 aka ip[0]
+ punpckhdq mm1, mm6 ; 31 21 11 01 aka ip[4]
+
+ punpckldq mm2, mm4 ; 32 22 12 02 aka ip[8]
+ punpckhdq mm3, mm4 ; 33 23 13 03 aka ip[12]
+
+ paddw mm0, mm7
+ paddw mm1, mm7
+ paddw mm2, mm7
+ paddw mm3, mm7
+
+ psraw mm0, 3
+ psraw mm1, 3
+ psraw mm2, 3
+ psraw mm3, 3
+
+ movq [rdi + 0], mm0
+ movq [rdi + 8], mm1
+ movq [rdi + 16], mm2
+ movq [rdi + 24], mm3
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
--- /dev/null
+++ b/vp9/common/x86/iwalsh_sse2.asm
@@ -1,0 +1,119 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+;void vp9_short_inv_walsh4x4_sse2(short *input, short *output)
+global sym(vp9_short_inv_walsh4x4_sse2)
+sym(vp9_short_inv_walsh4x4_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 2
+ SAVE_XMM 6
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rsi, arg(0)
+ mov rdi, arg(1)
+ mov rax, 3
+
+ movdqa xmm0, [rsi + 0] ;ip[4] ip[0]
+ movdqa xmm1, [rsi + 16] ;ip[12] ip[8]
+
+ shl rax, 16
+ or rax, 3 ;00030003h
+
+ pshufd xmm2, xmm1, 4eh ;ip[8] ip[12]
+ movdqa xmm3, xmm0 ;ip[4] ip[0]
+
+ paddw xmm0, xmm2 ;ip[4]+ip[8] ip[0]+ip[12] aka b1 a1
+ psubw xmm3, xmm2 ;ip[4]-ip[8] ip[0]-ip[12] aka c1 d1
+
+ movdqa xmm4, xmm0
+ punpcklqdq xmm0, xmm3 ;d1 a1
+ punpckhqdq xmm4, xmm3 ;c1 b1
+ movd xmm6, eax
+
+ movdqa xmm1, xmm4 ;c1 b1
+ paddw xmm4, xmm0 ;dl+cl a1+b1 aka op[4] op[0]
+ psubw xmm0, xmm1 ;d1-c1 a1-b1 aka op[12] op[8]
+
+;;;temp output
+;; movdqu [rdi + 0], xmm4
+;; movdqu [rdi + 16], xmm3
+
+;~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ ; 13 12 11 10 03 02 01 00
+ ;
+ ; 33 32 31 30 23 22 21 20
+ ;
+ movdqa xmm3, xmm4 ; 13 12 11 10 03 02 01 00
+ punpcklwd xmm4, xmm0 ; 23 03 22 02 21 01 20 00
+ punpckhwd xmm3, xmm0 ; 33 13 32 12 31 11 30 10
+ movdqa xmm1, xmm4 ; 23 03 22 02 21 01 20 00
+ punpcklwd xmm4, xmm3 ; 31 21 11 01 30 20 10 00
+ punpckhwd xmm1, xmm3 ; 33 23 13 03 32 22 12 02
+ ;~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ pshufd xmm2, xmm1, 4eh ;ip[8] ip[12]
+ movdqa xmm3, xmm4 ;ip[4] ip[0]
+
+ pshufd xmm6, xmm6, 0 ;03 03 03 03 03 03 03 03
+
+ paddw xmm4, xmm2 ;ip[4]+ip[8] ip[0]+ip[12] aka b1 a1
+ psubw xmm3, xmm2 ;ip[4]-ip[8] ip[0]-ip[12] aka c1 d1
+
+ movdqa xmm5, xmm4
+ punpcklqdq xmm4, xmm3 ;d1 a1
+ punpckhqdq xmm5, xmm3 ;c1 b1
+
+ movdqa xmm1, xmm5 ;c1 b1
+ paddw xmm5, xmm4 ;dl+cl a1+b1 aka op[4] op[0]
+ psubw xmm4, xmm1 ;d1-c1 a1-b1 aka op[12] op[8]
+;~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ ; 13 12 11 10 03 02 01 00
+ ;
+ ; 33 32 31 30 23 22 21 20
+ ;
+ movdqa xmm0, xmm5 ; 13 12 11 10 03 02 01 00
+ punpcklwd xmm5, xmm4 ; 23 03 22 02 21 01 20 00
+ punpckhwd xmm0, xmm4 ; 33 13 32 12 31 11 30 10
+ movdqa xmm1, xmm5 ; 23 03 22 02 21 01 20 00
+ punpcklwd xmm5, xmm0 ; 31 21 11 01 30 20 10 00
+ punpckhwd xmm1, xmm0 ; 33 23 13 03 32 22 12 02
+;~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ paddw xmm5, xmm6
+ paddw xmm1, xmm6
+
+ psraw xmm5, 3
+ psraw xmm1, 3
+
+ movdqa [rdi + 0], xmm5
+ movdqa [rdi + 16], xmm1
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+SECTION_RODATA
+align 16
+x_s1sqr2:
+ times 4 dw 0x8A8C
+align 16
+x_c1sqr2less1:
+ times 4 dw 0x4E7B
+align 16
+fours:
+ times 4 dw 0x0004
--- /dev/null
+++ b/vp9/common/x86/loopfilter_mmx.asm
@@ -1,0 +1,969 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+
+;void vp9_loop_filter_horizontal_edge_mmx
+;(
+; unsigned char *src_ptr,
+; int src_pixel_step,
+; const char *blimit,
+; const char *limit,
+; const char *thresh,
+; int count
+;)
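+;
+; Illustrative note, not part of the original patch: per 8-pixel segment the
+; code builds a flatness mask from "limit"/"blimit", a high-edge-variance
+; (hev) mask from "thresh", and then applies the normal 4-tap filter to
+; p1, p0, q0, q1. A rough scalar sketch of the filter step, assuming a
+; hypothetical signed-byte clamp helper clamp_s8() and values already
+; offset into the signed domain (XORed with 0x80, as in the code below):
+;
+;   signed char a = clamp_s8(p1 - q1) & hev;
+;   a = clamp_s8(a + 3 * (q0 - p0)) & mask;
+;   signed char f1 = clamp_s8(a + 4) >> 3;          /* applied to q0 */
+;   signed char f2 = clamp_s8(a + 3) >> 3;          /* applied to p0 */
+;   q0 = clamp_s8(q0 - f1);
+;   p0 = clamp_s8(p0 + f2);
+;   a  = (signed char)((f1 + 1) >> 1) & ~hev;       /* outer taps, non-hev only */
+;   q1 = clamp_s8(q1 - a);
+;   p1 = clamp_s8(p1 + a);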
+global sym(vp9_loop_filter_horizontal_edge_mmx)
+sym(vp9_loop_filter_horizontal_edge_mmx):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ sub rsp, 32 ; reserve 32 bytes
+ %define t0 [rsp + 0] ;__declspec(align(16)) char t0[8];
+ %define t1 [rsp + 16] ;__declspec(align(16)) char t1[8];
+
+ mov rsi, arg(0) ;src_ptr
+ movsxd rax, dword ptr arg(1) ;src_pixel_step ; destination pitch?
+
+ movsxd rcx, dword ptr arg(5) ;count
+.next8_h:
+ mov rdx, arg(3) ;limit
+ movq mm7, [rdx]
+ mov rdi, rsi ; rdi points to row +1 for indirect addressing
+ add rdi, rax
+
+ ; calculate breakout conditions
+ movq mm2, [rdi+2*rax] ; q3
+ movq mm1, [rsi+2*rax] ; q2
+ movq mm6, mm1 ; q2
+ psubusb mm1, mm2 ; q2-=q3
+ psubusb mm2, mm6 ; q3-=q2
+ por mm1, mm2 ; abs(q3-q2)
+ psubusb mm1, mm7 ;
+
+
+ movq mm4, [rsi+rax] ; q1
+ movq mm3, mm4 ; q1
+ psubusb mm4, mm6 ; q1-=q2
+ psubusb mm6, mm3 ; q2-=q1
+ por mm4, mm6 ; abs(q2-q1)
+
+ psubusb mm4, mm7
+ por mm1, mm4
+
+ movq mm4, [rsi] ; q0
+ movq mm0, mm4 ; q0
+ psubusb mm4, mm3 ; q0-=q1
+ psubusb mm3, mm0 ; q1-=q0
+ por mm4, mm3 ; abs(q0-q1)
+ movq t0, mm4 ; save to t0
+ psubusb mm4, mm7
+ por mm1, mm4
+
+
+ neg rax ; negate pitch to deal with above border
+
+ movq mm2, [rsi+4*rax] ; p3
+ movq mm4, [rdi+4*rax] ; p2
+ movq mm5, mm4 ; p2
+ psubusb mm4, mm2 ; p2-=p3
+ psubusb mm2, mm5 ; p3-=p2
+ por mm4, mm2 ; abs(p3 - p2)
+ psubusb mm4, mm7
+ por mm1, mm4
+
+
+ movq mm4, [rsi+2*rax] ; p1
+ movq mm3, mm4 ; p1
+ psubusb mm4, mm5 ; p1-=p2
+ psubusb mm5, mm3 ; p2-=p1
+ por mm4, mm5 ; abs(p2 - p1)
+ psubusb mm4, mm7
+ por mm1, mm4
+
+ movq mm2, mm3 ; p1
+
+ movq mm4, [rsi+rax] ; p0
+ movq mm5, mm4 ; p0
+ psubusb mm4, mm3 ; p0-=p1
+ psubusb mm3, mm5 ; p1-=p0
+ por mm4, mm3 ; abs(p1 - p0)
+ movq t1, mm4 ; save to t1
+ psubusb mm4, mm7
+ por mm1, mm4
+
+ movq mm3, [rdi] ; q1
+ movq mm4, mm3 ; q1
+ psubusb mm3, mm2 ; q1-=p1
+ psubusb mm2, mm4 ; p1-=q1
+ por mm2, mm3 ; abs(p1-q1)
+ pand mm2, [GLOBAL(tfe)] ; set lsb of each byte to zero
+ psrlw mm2, 1 ; abs(p1-q1)/2
+
+ movq mm6, mm5 ; p0
+ movq mm3, [rsi] ; q0
+ psubusb mm5, mm3 ; p0-=q0
+ psubusb mm3, mm6 ; q0-=p0
+ por mm5, mm3 ; abs(p0 - q0)
+ paddusb mm5, mm5 ; abs(p0-q0)*2
+ paddusb mm5, mm2 ; abs (p0 - q0) *2 + abs(p1-q1)/2
+
+ mov rdx, arg(2) ;blimit ; get blimit
+ movq mm7, [rdx] ; blimit
+
+ psubusb mm5, mm7 ; abs (p0 - q0) *2 + abs(p1-q1)/2 > blimit
+ por mm1, mm5
+ pxor mm5, mm5
+ pcmpeqb mm1, mm5 ; mask mm1
+
+ ; calculate high edge variance
+ mov rdx, arg(4) ;thresh ; get thresh
+ movq mm7, [rdx] ;
+ movq mm4, t0 ; get abs (q1 - q0)
+ psubusb mm4, mm7
+ movq mm3, t1 ; get abs (p1 - p0)
+ psubusb mm3, mm7
+ paddb mm4, mm3 ; abs(q1 - q0) > thresh || abs(p1 - p0) > thresh
+
+ pcmpeqb mm4, mm5
+
+ pcmpeqb mm5, mm5
+ pxor mm4, mm5
+
+
+ ; start work on filters
+ movq mm2, [rsi+2*rax] ; p1
+ movq mm7, [rdi] ; q1
+ pxor mm2, [GLOBAL(t80)] ; p1 offset to convert to signed values
+ pxor mm7, [GLOBAL(t80)] ; q1 offset to convert to signed values
+ psubsb mm2, mm7 ; p1 - q1
+ pand mm2, mm4 ; high var mask (hvm)(p1 - q1)
+ pxor mm6, [GLOBAL(t80)] ; offset to convert to signed values
+ pxor mm0, [GLOBAL(t80)] ; offset to convert to signed values
+ movq mm3, mm0 ; q0
+ psubsb mm0, mm6 ; q0 - p0
+ paddsb mm2, mm0 ; 1 * (q0 - p0) + hvm(p1 - q1)
+ paddsb mm2, mm0 ; 2 * (q0 - p0) + hvm(p1 - q1)
+ paddsb mm2, mm0 ; 3 * (q0 - p0) + hvm(p1 - q1)
+ pand mm1, mm2 ; mask filter values we don't care about
+ movq mm2, mm1
+ paddsb mm1, [GLOBAL(t4)] ; 3* (q0 - p0) + hvm(p1 - q1) + 4
+ paddsb mm2, [GLOBAL(t3)] ; 3* (q0 - p0) + hvm(p1 - q1) + 3
+
+ pxor mm0, mm0 ;
+ pxor mm5, mm5
+ punpcklbw mm0, mm2 ;
+ punpckhbw mm5, mm2 ;
+ psraw mm0, 11 ;
+ psraw mm5, 11
+ packsswb mm0, mm5
+ movq mm2, mm0 ; (3* (q0 - p0) + hvm(p1 - q1) + 3) >> 3;
+
+ pxor mm0, mm0 ; 0
+ movq mm5, mm1 ; abcdefgh
+ punpcklbw mm0, mm1 ; e0f0g0h0
+ psraw mm0, 11 ; sign extended shift right by 3
+ pxor mm1, mm1 ; 0
+ punpckhbw mm1, mm5 ; a0b0c0d0
+ psraw mm1, 11 ; sign extended shift right by 3
+ movq mm5, mm0 ; save results
+
+ packsswb mm0, mm1 ; (3* (q0 - p0) + hvm(p1 - q1) + 4) >>3
+ paddsw mm5, [GLOBAL(ones)]
+ paddsw mm1, [GLOBAL(ones)]
+ psraw mm5, 1 ; partial shifted one more time for 2nd tap
+ psraw mm1, 1 ; partial shifted one more time for 2nd tap
+ packsswb mm5, mm1 ; (3* (q0 - p0) + hvm(p1 - q1) + 4) >>4
+ pandn mm4, mm5 ; high edge variance additive
+
+ paddsb mm6, mm2 ; p0+= p0 add
+ pxor mm6, [GLOBAL(t80)] ; unoffset
+ movq [rsi+rax], mm6 ; write back
+
+ movq mm6, [rsi+2*rax] ; p1
+ pxor mm6, [GLOBAL(t80)] ; reoffset
+ paddsb mm6, mm4 ; p1+= p1 add
+ pxor mm6, [GLOBAL(t80)] ; unoffset
+ movq [rsi+2*rax], mm6 ; write back
+
+ psubsb mm3, mm0 ; q0-= q0 add
+ pxor mm3, [GLOBAL(t80)] ; unoffset
+ movq [rsi], mm3 ; write back
+
+ psubsb mm7, mm4 ; q1-= q1 add
+ pxor mm7, [GLOBAL(t80)] ; unoffset
+ movq [rdi], mm7 ; write back
+
+ add rsi,8
+ neg rax
+ dec rcx
+ jnz .next8_h
+
+ add rsp, 32
+ pop rsp
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+;void vp9_loop_filter_vertical_edge_mmx
+;(
+; unsigned char *src_ptr,
+; int src_pixel_step,
+; const char *blimit,
+; const char *limit,
+; const char *thresh,
+; int count
+;)
+global sym(vp9_loop_filter_vertical_edge_mmx)
+sym(vp9_loop_filter_vertical_edge_mmx):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ sub rsp, 64 ; reserve 64 bytes
+ %define t0 [rsp + 0] ;__declspec(align(16)) char t0[8];
+ %define t1 [rsp + 16] ;__declspec(align(16)) char t1[8];
+ %define srct [rsp + 32] ;__declspec(align(16)) char srct[32];
+
+ mov rsi, arg(0) ;src_ptr
+ movsxd rax, dword ptr arg(1) ;src_pixel_step ; destination pitch?
+
+ lea rsi, [rsi + rax*4 - 4]
+
+ movsxd rcx, dword ptr arg(5) ;count
+.next8_v:
+ mov rdi, rsi ; rdi points to row +1 for indirect addressing
+ add rdi, rax
+
+
+ ;transpose
+ movq mm6, [rsi+2*rax] ; 67 66 65 64 63 62 61 60
+ movq mm7, mm6 ; 77 76 75 74 73 72 71 70
+
+ punpckhbw mm7, [rdi+2*rax] ; 77 67 76 66 75 65 74 64
+ punpcklbw mm6, [rdi+2*rax] ; 73 63 72 62 71 61 70 60
+
+ movq mm4, [rsi] ; 47 46 45 44 43 42 41 40
+ movq mm5, mm4 ; 47 46 45 44 43 42 41 40
+
+ punpckhbw mm5, [rsi+rax] ; 57 47 56 46 55 45 54 44
+ punpcklbw mm4, [rsi+rax] ; 53 43 52 42 51 41 50 40
+
+ movq mm3, mm5 ; 57 47 56 46 55 45 54 44
+ punpckhwd mm5, mm7 ; 77 67 57 47 76 66 56 46
+
+ punpcklwd mm3, mm7 ; 75 65 55 45 74 64 54 44
+ movq mm2, mm4 ; 53 43 52 42 51 41 50 40
+
+ punpckhwd mm4, mm6 ; 73 63 53 43 72 62 52 42
+ punpcklwd mm2, mm6 ; 71 61 51 41 70 60 50 40
+
+ neg rax
+ movq mm6, [rsi+rax*2] ; 27 26 25 24 23 22 21 20
+
+ movq mm1, mm6 ; 27 26 25 24 23 22 21 20
+        punpckhbw   mm6,        [rsi+rax]       ; 37 27 36 26 35 25 34 24
+
+ punpcklbw mm1, [rsi+rax] ; 33 23 32 22 31 21 30 20
+ movq mm7, [rsi+rax*4]; ; 07 06 05 04 03 02 01 00
+
+ punpckhbw mm7, [rdi+rax*4] ; 17 07 16 06 15 05 14 04
+ movq mm0, mm7 ; 17 07 16 06 15 05 14 04
+
+ punpckhwd mm7, mm6 ; 37 27 17 07 36 26 16 06
+ punpcklwd mm0, mm6 ; 35 25 15 05 34 24 14 04
+
+ movq mm6, mm7 ; 37 27 17 07 36 26 16 06
+ punpckhdq mm7, mm5 ; 77 67 57 47 37 27 17 07 = q3
+
+ punpckldq mm6, mm5 ; 76 66 56 46 36 26 16 06 = q2
+
+ movq mm5, mm6 ; 76 66 56 46 36 26 16 06
+ psubusb mm5, mm7 ; q2-q3
+
+ psubusb mm7, mm6 ; q3-q2
+ por mm7, mm5; ; mm7=abs (q3-q2)
+
+ movq mm5, mm0 ; 35 25 15 05 34 24 14 04
+ punpckhdq mm5, mm3 ; 75 65 55 45 35 25 15 05 = q1
+
+        punpckldq   mm0,        mm3             ; 74 64 54 44 34 24 14 04 = q0
+ movq mm3, mm5 ; 75 65 55 45 35 25 15 05 = q1
+
+ psubusb mm3, mm6 ; q1-q2
+ psubusb mm6, mm5 ; q2-q1
+
+ por mm6, mm3 ; mm6=abs(q2-q1)
+ lea rdx, srct
+
+ movq [rdx+24], mm5 ; save q1
+ movq [rdx+16], mm0 ; save q0
+
+ movq mm3, [rsi+rax*4] ; 07 06 05 04 03 02 01 00
+ punpcklbw mm3, [rdi+rax*4] ; 13 03 12 02 11 01 10 00
+
+ movq mm0, mm3 ; 13 03 12 02 11 01 10 00
+ punpcklwd mm0, mm1 ; 31 21 11 01 30 20 10 00
+
+ punpckhwd mm3, mm1 ; 33 23 13 03 32 22 12 02
+ movq mm1, mm0 ; 31 21 11 01 30 20 10 00
+
+ punpckldq mm0, mm2 ; 70 60 50 40 30 20 10 00 =p3
+ punpckhdq mm1, mm2 ; 71 61 51 41 31 21 11 01 =p2
+
+ movq mm2, mm1 ; 71 61 51 41 31 21 11 01 =p2
+ psubusb mm2, mm0 ; p2-p3
+
+ psubusb mm0, mm1 ; p3-p2
+ por mm0, mm2 ; mm0=abs(p3-p2)
+
+ movq mm2, mm3 ; 33 23 13 03 32 22 12 02
+ punpckldq mm2, mm4 ; 72 62 52 42 32 22 12 02 = p1
+
+ punpckhdq mm3, mm4 ; 73 63 53 43 33 23 13 03 = p0
+ movq [rdx+8], mm3 ; save p0
+
+ movq [rdx], mm2 ; save p1
+ movq mm5, mm2 ; mm5 = p1
+
+ psubusb mm2, mm1 ; p1-p2
+ psubusb mm1, mm5 ; p2-p1
+
+ por mm1, mm2 ; mm1=abs(p2-p1)
+ mov rdx, arg(3) ;limit
+
+ movq mm4, [rdx] ; mm4 = limit
+ psubusb mm7, mm4
+
+ psubusb mm0, mm4
+ psubusb mm1, mm4
+
+ psubusb mm6, mm4
+ por mm7, mm6
+
+ por mm0, mm1
+ por mm0, mm7 ; abs(q3-q2) > limit || abs(p3-p2) > limit ||abs(p2-p1) > limit || abs(q2-q1) > limit
+
+ movq mm1, mm5 ; p1
+
+ movq mm7, mm3 ; mm3=mm7=p0
+ psubusb mm7, mm5 ; p0 - p1
+
+ psubusb mm5, mm3 ; p1 - p0
+ por mm5, mm7 ; abs(p1-p0)
+
+ movq t0, mm5 ; save abs(p1-p0)
+ lea rdx, srct
+
+ psubusb mm5, mm4
+ por mm0, mm5 ; mm0=mask
+
+ movq mm5, [rdx+16] ; mm5=q0
+ movq mm7, [rdx+24] ; mm7=q1
+
+ movq mm6, mm5 ; mm6=q0
+ movq mm2, mm7 ; q1
+ psubusb mm5, mm7 ; q0-q1
+
+ psubusb mm7, mm6 ; q1-q0
+ por mm7, mm5 ; abs(q1-q0)
+
+ movq t1, mm7 ; save abs(q1-q0)
+ psubusb mm7, mm4
+
+ por mm0, mm7 ; mask
+
+ movq mm5, mm2 ; q1
+ psubusb mm5, mm1 ; q1-=p1
+ psubusb mm1, mm2 ; p1-=q1
+ por mm5, mm1 ; abs(p1-q1)
+ pand mm5, [GLOBAL(tfe)] ; set lsb of each byte to zero
+ psrlw mm5, 1 ; abs(p1-q1)/2
+
+ mov rdx, arg(2) ;blimit ;
+
+ movq mm4, [rdx] ;blimit
+ movq mm1, mm3 ; mm1=mm3=p0
+
+ movq mm7, mm6 ; mm7=mm6=q0
+ psubusb mm1, mm7 ; p0-q0
+
+ psubusb mm7, mm3 ; q0-p0
+ por mm1, mm7 ; abs(q0-p0)
+ paddusb mm1, mm1 ; abs(q0-p0)*2
+ paddusb mm1, mm5 ; abs (p0 - q0) *2 + abs(p1-q1)/2
+
+ psubusb mm1, mm4 ; abs (p0 - q0) *2 + abs(p1-q1)/2 > blimit
+ por mm1, mm0; ; mask
+
+ pxor mm0, mm0
+ pcmpeqb mm1, mm0
+
+ ; calculate high edge variance
+ mov rdx, arg(4) ;thresh ; get thresh
+ movq mm7, [rdx]
+ ;
+ movq mm4, t0 ; get abs (q1 - q0)
+ psubusb mm4, mm7
+
+ movq mm3, t1 ; get abs (p1 - p0)
+ psubusb mm3, mm7
+
+ por mm4, mm3 ; abs(q1 - q0) > thresh || abs(p1 - p0) > thresh
+ pcmpeqb mm4, mm0
+
+ pcmpeqb mm0, mm0
+ pxor mm4, mm0
+
+
+
+ ; start work on filters
+ lea rdx, srct
+
+ movq mm2, [rdx] ; p1
+ movq mm7, [rdx+24] ; q1
+
+ movq mm6, [rdx+8] ; p0
+ movq mm0, [rdx+16] ; q0
+
+ pxor mm2, [GLOBAL(t80)] ; p1 offset to convert to signed values
+ pxor mm7, [GLOBAL(t80)] ; q1 offset to convert to signed values
+
+ psubsb mm2, mm7 ; p1 - q1
+ pand mm2, mm4 ; high var mask (hvm)(p1 - q1)
+
+ pxor mm6, [GLOBAL(t80)] ; offset to convert to signed values
+ pxor mm0, [GLOBAL(t80)] ; offset to convert to signed values
+
+ movq mm3, mm0 ; q0
+ psubsb mm0, mm6 ; q0 - p0
+
+ paddsb mm2, mm0 ; 1 * (q0 - p0) + hvm(p1 - q1)
+ paddsb mm2, mm0 ; 2 * (q0 - p0) + hvm(p1 - q1)
+
+ paddsb mm2, mm0 ; 3 * (q0 - p0) + hvm(p1 - q1)
+ pand mm1, mm2 ; mask filter values we don't care about
+
+ movq mm2, mm1
+ paddsb mm1, [GLOBAL(t4)] ; 3* (q0 - p0) + hvm(p1 - q1) + 4
+
+ paddsb mm2, [GLOBAL(t3)] ; 3* (q0 - p0) + hvm(p1 - q1) + 3
+ pxor mm0, mm0 ;
+
+ pxor mm5, mm5
+ punpcklbw mm0, mm2 ;
+
+ punpckhbw mm5, mm2 ;
+ psraw mm0, 11 ;
+
+ psraw mm5, 11
+ packsswb mm0, mm5
+
+ movq mm2, mm0 ; (3* (q0 - p0) + hvm(p1 - q1) + 3) >> 3;
+
+ pxor mm0, mm0 ; 0
+ movq mm5, mm1 ; abcdefgh
+
+ punpcklbw mm0, mm1 ; e0f0g0h0
+ psraw mm0, 11 ; sign extended shift right by 3
+
+ pxor mm1, mm1 ; 0
+ punpckhbw mm1, mm5 ; a0b0c0d0
+
+ psraw mm1, 11 ; sign extended shift right by 3
+ movq mm5, mm0 ; save results
+
+ packsswb mm0, mm1 ; (3* (q0 - p0) + hvm(p1 - q1) + 4) >>3
+ paddsw mm5, [GLOBAL(ones)]
+
+ paddsw mm1, [GLOBAL(ones)]
+ psraw mm5, 1 ; partial shifted one more time for 2nd tap
+
+ psraw mm1, 1 ; partial shifted one more time for 2nd tap
+ packsswb mm5, mm1 ; (3* (q0 - p0) + hvm(p1 - q1) + 4) >>4
+
+ pandn mm4, mm5 ; high edge variance additive
+
+ paddsb mm6, mm2 ; p0+= p0 add
+ pxor mm6, [GLOBAL(t80)] ; unoffset
+
+ ; mm6=p0 ;
+ movq mm1, [rdx] ; p1
+ pxor mm1, [GLOBAL(t80)] ; reoffset
+
+ paddsb mm1, mm4 ; p1+= p1 add
+ pxor mm1, [GLOBAL(t80)] ; unoffset
+ ; mm6 = p0 mm1 = p1
+
+ psubsb mm3, mm0 ; q0-= q0 add
+ pxor mm3, [GLOBAL(t80)] ; unoffset
+
+ ; mm3 = q0
+ psubsb mm7, mm4 ; q1-= q1 add
+ pxor mm7, [GLOBAL(t80)] ; unoffset
+ ; mm7 = q1
+
+        ; transpose and write back
+ ; mm1 = 72 62 52 42 32 22 12 02
+ ; mm6 = 73 63 53 43 33 23 13 03
+ ; mm3 = 74 64 54 44 34 24 14 04
+ ; mm7 = 75 65 55 45 35 25 15 05
+
+ movq mm2, mm1 ; 72 62 52 42 32 22 12 02
+ punpcklbw mm2, mm6 ; 33 32 23 22 13 12 03 02
+
+ movq mm4, mm3 ; 74 64 54 44 34 24 14 04
+ punpckhbw mm1, mm6 ; 73 72 63 62 53 52 43 42
+
+ punpcklbw mm4, mm7 ; 35 34 25 24 15 14 05 04
+ punpckhbw mm3, mm7 ; 75 74 65 64 55 54 45 44
+
+ movq mm6, mm2 ; 33 32 23 22 13 12 03 02
+ punpcklwd mm2, mm4 ; 15 14 13 12 05 04 03 02
+
+ punpckhwd mm6, mm4 ; 35 34 33 32 25 24 23 22
+ movq mm5, mm1 ; 73 72 63 62 53 52 43 42
+
+ punpcklwd mm1, mm3 ; 55 54 53 52 45 44 43 42
+ punpckhwd mm5, mm3 ; 75 74 73 72 65 64 63 62
+
+
+ ; mm2 = 15 14 13 12 05 04 03 02
+ ; mm6 = 35 34 33 32 25 24 23 22
+ ; mm5 = 55 54 53 52 45 44 43 42
+ ; mm1 = 75 74 73 72 65 64 63 62
+
+
+
+ movd [rsi+rax*4+2], mm2
+ psrlq mm2, 32
+
+ movd [rdi+rax*4+2], mm2
+ movd [rsi+rax*2+2], mm6
+
+ psrlq mm6, 32
+ movd [rsi+rax+2],mm6
+
+ movd [rsi+2], mm1
+ psrlq mm1, 32
+
+ movd [rdi+2], mm1
+ neg rax
+
+ movd [rdi+rax+2],mm5
+ psrlq mm5, 32
+
+ movd [rdi+rax*2+2], mm5
+
+ lea rsi, [rsi+rax*8]
+ dec rcx
+ jnz .next8_v
+
+ add rsp, 64
+ pop rsp
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+;void vp9_loop_filter_simple_horizontal_edge_mmx
+;(
+; unsigned char *src_ptr,
+; int src_pixel_step,
+; const char *blimit
+;)
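+;
+; Illustrative note, not part of the original patch: the "simple" filter only
+; adjusts p0/q0 and uses a single blimit test. A rough scalar sketch, again
+; assuming a hypothetical clamp_s8() helper and sign-offset (^0x80) values:
+;
+;   mask = (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 <= blimit) ? 0xff : 0x00;
+;   signed char a = clamp_s8(p1 - q1);
+;   a = clamp_s8(a + 3 * (q0 - p0)) & mask;
+;   q0 = clamp_s8(q0 - (clamp_s8(a + 4) >> 3));
+;   p0 = clamp_s8(p0 + (clamp_s8(a + 3) >> 3));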
+global sym(vp9_loop_filter_simple_horizontal_edge_mmx)
+sym(vp9_loop_filter_simple_horizontal_edge_mmx):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 3
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rsi, arg(0) ;src_ptr
+ movsxd rax, dword ptr arg(1) ;src_pixel_step ; destination pitch?
+
+ mov rcx, 2 ; count
+.nexts8_h:
+ mov rdx, arg(2) ;blimit ; get blimit
+ movq mm3, [rdx] ;
+
+ mov rdi, rsi ; rdi points to row +1 for indirect addressing
+ add rdi, rax
+ neg rax
+
+ ; calculate mask
+ movq mm1, [rsi+2*rax] ; p1
+ movq mm0, [rdi] ; q1
+ movq mm2, mm1
+ movq mm7, mm0
+ movq mm4, mm0
+ psubusb mm0, mm1 ; q1-=p1
+ psubusb mm1, mm4 ; p1-=q1
+ por mm1, mm0 ; abs(p1-q1)
+ pand mm1, [GLOBAL(tfe)] ; set lsb of each byte to zero
+ psrlw mm1, 1 ; abs(p1-q1)/2
+
+ movq mm5, [rsi+rax] ; p0
+ movq mm4, [rsi] ; q0
+ movq mm0, mm4 ; q0
+ movq mm6, mm5 ; p0
+ psubusb mm5, mm4 ; p0-=q0
+ psubusb mm4, mm6 ; q0-=p0
+ por mm5, mm4 ; abs(p0 - q0)
+ paddusb mm5, mm5 ; abs(p0-q0)*2
+ paddusb mm5, mm1 ; abs (p0 - q0) *2 + abs(p1-q1)/2
+
+ psubusb mm5, mm3 ; abs(p0 - q0) *2 + abs(p1-q1)/2 > blimit
+ pxor mm3, mm3
+ pcmpeqb mm5, mm3
+
+ ; start work on filters
+ pxor mm2, [GLOBAL(t80)] ; p1 offset to convert to signed values
+ pxor mm7, [GLOBAL(t80)] ; q1 offset to convert to signed values
+ psubsb mm2, mm7 ; p1 - q1
+
+ pxor mm6, [GLOBAL(t80)] ; offset to convert to signed values
+ pxor mm0, [GLOBAL(t80)] ; offset to convert to signed values
+ movq mm3, mm0 ; q0
+ psubsb mm0, mm6 ; q0 - p0
+ paddsb mm2, mm0 ; p1 - q1 + 1 * (q0 - p0)
+ paddsb mm2, mm0 ; p1 - q1 + 2 * (q0 - p0)
+ paddsb mm2, mm0 ; p1 - q1 + 3 * (q0 - p0)
+ pand mm5, mm2 ; mask filter values we don't care about
+
+ ; do + 4 side
+ paddsb mm5, [GLOBAL(t4)] ; 3* (q0 - p0) + (p1 - q1) + 4
+
+ movq mm0, mm5 ; get a copy of filters
+ psllw mm0, 8 ; shift left 8
+ psraw mm0, 3 ; arithmetic shift right 11
+ psrlw mm0, 8
+ movq mm1, mm5 ; get a copy of filters
+ psraw mm1, 11 ; arithmetic shift right 11
+ psllw mm1, 8 ; shift left 8 to put it back
+
+ por mm0, mm1 ; put the two together to get result
+
+ psubsb mm3, mm0 ; q0-= q0 add
+ pxor mm3, [GLOBAL(t80)] ; unoffset
+ movq [rsi], mm3 ; write back
+
+
+ ; now do +3 side
+ psubsb mm5, [GLOBAL(t1s)] ; +3 instead of +4
+
+ movq mm0, mm5 ; get a copy of filters
+ psllw mm0, 8 ; shift left 8
+ psraw mm0, 3 ; arithmetic shift right 11
+ psrlw mm0, 8
+ psraw mm5, 11 ; arithmetic shift right 11
+ psllw mm5, 8 ; shift left 8 to put it back
+ por mm0, mm5 ; put the two together to get result
+
+
+ paddsb mm6, mm0 ; p0+= p0 add
+ pxor mm6, [GLOBAL(t80)] ; unoffset
+ movq [rsi+rax], mm6 ; write back
+
+ add rsi,8
+ neg rax
+ dec rcx
+ jnz .nexts8_h
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+;void vp9_loop_filter_simple_vertical_edge_mmx
+;(
+; unsigned char *src_ptr,
+; int src_pixel_step,
+; const char *blimit
+;)
+global sym(vp9_loop_filter_simple_vertical_edge_mmx)
+sym(vp9_loop_filter_simple_vertical_edge_mmx):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 3
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ sub rsp, 32 ; reserve 32 bytes
+ %define t0 [rsp + 0] ;__declspec(align(16)) char t0[8];
+ %define t1 [rsp + 16] ;__declspec(align(16)) char t1[8];
+
+ mov rsi, arg(0) ;src_ptr
+ movsxd rax, dword ptr arg(1) ;src_pixel_step ; destination pitch?
+
+ lea rsi, [rsi + rax*4- 2]; ;
+ mov rcx, 2 ; count
+.nexts8_v:
+
+ lea rdi, [rsi + rax];
+ movd mm0, [rdi + rax * 2] ; xx xx xx xx 73 72 71 70
+
+ movd mm6, [rsi + rax * 2] ; xx xx xx xx 63 62 61 60
+ punpcklbw mm6, mm0 ; 73 63 72 62 71 61 70 60
+
+ movd mm0, [rsi + rax] ; xx xx xx xx 53 52 51 50
+ movd mm4, [rsi] ; xx xx xx xx 43 42 41 40
+
+ punpcklbw mm4, mm0 ; 53 43 52 42 51 41 50 40
+ movq mm5, mm4 ; 53 43 52 42 51 41 50 40
+
+ punpcklwd mm4, mm6 ; 71 61 51 41 70 60 50 40
+ punpckhwd mm5, mm6 ; 73 63 53 43 72 62 52 42
+
+ neg rax
+
+ movd mm7, [rsi + rax] ; xx xx xx xx 33 32 31 30
+ movd mm6, [rsi + rax * 2] ; xx xx xx xx 23 22 21 20
+
+ punpcklbw mm6, mm7 ; 33 23 32 22 31 21 30 20
+ movd mm1, [rdi + rax * 4] ; xx xx xx xx 13 12 11 10
+
+ movd mm0, [rsi + rax * 4] ; xx xx xx xx 03 02 01 00
+ punpcklbw mm0, mm1 ; 13 03 12 02 11 01 10 00
+
+ movq mm2, mm0 ; 13 03 12 02 11 01 10 00
+ punpcklwd mm0, mm6 ; 31 21 11 01 30 20 10 00
+
+ punpckhwd mm2, mm6 ; 33 23 13 03 32 22 12 02
+ movq mm1, mm0 ; 13 03 12 02 11 01 10 00
+
+ punpckldq mm0, mm4 ; 70 60 50 40 30 20 10 00 = p1
+ movq mm3, mm2 ; 33 23 13 03 32 22 12 02
+
+ punpckhdq mm1, mm4 ; 71 61 51 41 31 21 11 01 = p0
+ punpckldq mm2, mm5 ; 72 62 52 42 32 22 12 02 = q0
+
+ punpckhdq mm3, mm5 ; 73 63 53 43 33 23 13 03 = q1
+
+
+ ; calculate mask
+ movq mm6, mm0 ; p1
+ movq mm7, mm3 ; q1
+ psubusb mm7, mm6 ; q1-=p1
+ psubusb mm6, mm3 ; p1-=q1
+ por mm6, mm7 ; abs(p1-q1)
+ pand mm6, [GLOBAL(tfe)] ; set lsb of each byte to zero
+ psrlw mm6, 1 ; abs(p1-q1)/2
+
+ movq mm5, mm1 ; p0
+ movq mm4, mm2 ; q0
+
+ psubusb mm5, mm2 ; p0-=q0
+ psubusb mm4, mm1 ; q0-=p0
+
+ por mm5, mm4 ; abs(p0 - q0)
+ paddusb mm5, mm5 ; abs(p0-q0)*2
+ paddusb mm5, mm6 ; abs (p0 - q0) *2 + abs(p1-q1)/2
+
+ mov rdx, arg(2) ;blimit ; get blimit
+ movq mm7, [rdx]
+
+ psubusb mm5, mm7 ; abs(p0 - q0) *2 + abs(p1-q1)/2 > blimit
+ pxor mm7, mm7
+ pcmpeqb mm5, mm7 ; mm5 = mask
+
+ ; start work on filters
+ movq t0, mm0
+ movq t1, mm3
+
+ pxor mm0, [GLOBAL(t80)] ; p1 offset to convert to signed values
+ pxor mm3, [GLOBAL(t80)] ; q1 offset to convert to signed values
+
+ psubsb mm0, mm3 ; p1 - q1
+ movq mm6, mm1 ; p0
+
+ movq mm7, mm2 ; q0
+ pxor mm6, [GLOBAL(t80)] ; offset to convert to signed values
+
+ pxor mm7, [GLOBAL(t80)] ; offset to convert to signed values
+        movq        mm3, mm7        ; offset ; q0
+
+ psubsb mm7, mm6 ; q0 - p0
+ paddsb mm0, mm7 ; p1 - q1 + 1 * (q0 - p0)
+
+ paddsb mm0, mm7 ; p1 - q1 + 2 * (q0 - p0)
+ paddsb mm0, mm7 ; p1 - q1 + 3 * (q0 - p0)
+
+ pand mm5, mm0 ; mask filter values we don't care about
+
+ paddsb mm5, [GLOBAL(t4)] ; 3* (q0 - p0) + (p1 - q1) + 4
+
+ movq mm0, mm5 ; get a copy of filters
+ psllw mm0, 8 ; shift left 8
+ psraw mm0, 3 ; arithmetic shift right 11
+ psrlw mm0, 8
+
+ movq mm7, mm5 ; get a copy of filters
+ psraw mm7, 11 ; arithmetic shift right 11
+ psllw mm7, 8 ; shift left 8 to put it back
+
+ por mm0, mm7 ; put the two together to get result
+
+ psubsb mm3, mm0 ; q0-= q0sz add
+ pxor mm3, [GLOBAL(t80)] ; unoffset
+
+ ; now do +3 side
+ psubsb mm5, [GLOBAL(t1s)] ; +3 instead of +4
+
+ movq mm0, mm5 ; get a copy of filters
+ psllw mm0, 8 ; shift left 8
+ psraw mm0, 3 ; arithmetic shift right 11
+ psrlw mm0, 8
+
+ psraw mm5, 11 ; arithmetic shift right 11
+ psllw mm5, 8 ; shift left 8 to put it back
+ por mm0, mm5 ; put the two together to get result
+
+ paddsb mm6, mm0 ; p0+= p0 add
+ pxor mm6, [GLOBAL(t80)] ; unoffset
+
+
+ movq mm0, t0
+ movq mm4, t1
+
+ ; mm0 = 70 60 50 40 30 20 10 00
+ ; mm6 = 71 61 51 41 31 21 11 01
+ ; mm3 = 72 62 52 42 32 22 12 02
+ ; mm4 = 73 63 53 43 33 23 13 03
+ ; transpose back to write out
+
+ movq mm1, mm0 ;
+ punpcklbw mm0, mm6 ; 31 30 21 20 11 10 01 00
+
+ punpckhbw mm1, mm6 ; 71 70 61 60 51 50 41 40
+ movq mm2, mm3 ;
+
+ punpcklbw mm2, mm4 ; 33 32 23 22 13 12 03 02
+ movq mm5, mm1 ; 71 70 61 60 51 50 41 40
+
+ punpckhbw mm3, mm4 ; 73 72 63 62 53 52 43 42
+ movq mm6, mm0 ; 31 30 21 20 11 10 01 00
+
+ punpcklwd mm0, mm2 ; 13 12 11 10 03 02 01 00
+ punpckhwd mm6, mm2 ; 33 32 31 30 23 22 21 20
+
+ movd [rsi+rax*4], mm0 ; write 03 02 01 00
+ punpcklwd mm1, mm3 ; 53 52 51 50 43 42 41 40
+
+ psrlq mm0, 32 ; xx xx xx xx 13 12 11 10
+ punpckhwd mm5, mm3 ; 73 72 71 70 63 62 61 60
+
+ movd [rdi+rax*4], mm0 ; write 13 12 11 10
+ movd [rsi+rax*2], mm6 ; write 23 22 21 20
+
+ psrlq mm6, 32 ; 33 32 31 30
+ movd [rsi], mm1 ; write 43 42 41 40
+
+ movd [rsi + rax], mm6 ; write 33 32 31 30
+ neg rax
+
+ movd [rsi + rax*2], mm5 ; write 63 62 61 60
+ psrlq mm1, 32 ; 53 52 51 50
+
+ movd [rdi], mm1 ; write out 53 52 51 50
+ psrlq mm5, 32 ; 73 72 71 70
+
+ movd [rdi + rax*2], mm5 ; write 73 72 71 70
+
+ lea rsi, [rsi+rax*8] ; next 8
+
+ dec rcx
+ jnz .nexts8_v
+
+ add rsp, 32
+ pop rsp
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+
+;void fast_loop_filter_vertical_edges_mmx(unsigned char *y_ptr,
+; int y_stride,
+; loop_filter_info *lfi)
+;{
+;
+;
+; vp9_loop_filter_simple_vertical_edge_mmx(y_ptr+4, y_stride, lfi->flim,lfi->lim,lfi->thr,2);
+; vp9_loop_filter_simple_vertical_edge_mmx(y_ptr+8, y_stride, lfi->flim,lfi->lim,lfi->thr,2);
+; vp9_loop_filter_simple_vertical_edge_mmx(y_ptr+12, y_stride, lfi->flim,lfi->lim,lfi->thr,2);
+;}
+
+SECTION_RODATA
+align 16
+tfe:
+ times 8 db 0xfe
+align 16
+t80:
+ times 8 db 0x80
+align 16
+t1s:
+ times 8 db 0x01
+align 16
+t3:
+ times 8 db 0x03
+align 16
+t4:
+ times 8 db 0x04
+align 16
+ones:
+ times 4 dw 0x0001
+align 16
+s27:
+ times 4 dw 0x1b00
+align 16
+s18:
+ times 4 dw 0x1200
+align 16
+s9:
+ times 4 dw 0x0900
+align 16
+s63:
+ times 4 dw 0x003f
--- /dev/null
+++ b/vp9/common/x86/loopfilter_sse2.asm
@@ -1,0 +1,1238 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+; Use of pmaxub instead of psubusb to compute filter mask was seen
+; in ffvp8
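+;
+; Illustrative note, not part of the original patch: the MMX version ORs
+; together a series of psubusb-against-limit results, while the SSE2 code
+; here first folds all absolute differences into one running maximum with
+; pmaxub and does a single threshold test at the end, conceptually:
+;
+;   m    = max(|q3-q2|, |q2-q1|, |q1-q0|, |p3-p2|, |p2-p1|, |p1-p0|);
+;   mask = (m <= limit) && (|p0-q0|*2 + |p1-q1|/2 <= blimit);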
+
+%macro LFH_FILTER_AND_HEV_MASK 1
+%if %1
+ movdqa xmm2, [rdi+2*rax] ; q3
+ movdqa xmm1, [rsi+2*rax] ; q2
+ movdqa xmm4, [rsi+rax] ; q1
+ movdqa xmm5, [rsi] ; q0
+ neg rax ; negate pitch to deal with above border
+%else
+ movlps xmm2, [rsi + rcx*2] ; q3
+ movlps xmm1, [rsi + rcx] ; q2
+ movlps xmm4, [rsi] ; q1
+ movlps xmm5, [rsi + rax] ; q0
+
+ movhps xmm2, [rdi + rcx*2]
+ movhps xmm1, [rdi + rcx]
+ movhps xmm4, [rdi]
+ movhps xmm5, [rdi + rax]
+
+ lea rsi, [rsi + rax*4]
+ lea rdi, [rdi + rax*4]
+
+ movdqa XMMWORD PTR [rsp], xmm1 ; store q2
+ movdqa XMMWORD PTR [rsp + 16], xmm4 ; store q1
+%endif
+
+ movdqa xmm6, xmm1 ; q2
+ movdqa xmm3, xmm4 ; q1
+
+ psubusb xmm1, xmm2 ; q2-=q3
+ psubusb xmm2, xmm6 ; q3-=q2
+
+ psubusb xmm4, xmm6 ; q1-=q2
+ psubusb xmm6, xmm3 ; q2-=q1
+
+ por xmm4, xmm6 ; abs(q2-q1)
+ por xmm1, xmm2 ; abs(q3-q2)
+
+ movdqa xmm0, xmm5 ; q0
+ pmaxub xmm1, xmm4
+
+ psubusb xmm5, xmm3 ; q0-=q1
+ psubusb xmm3, xmm0 ; q1-=q0
+
+ por xmm5, xmm3 ; abs(q0-q1)
+ movdqa t0, xmm5 ; save to t0
+
+ pmaxub xmm1, xmm5
+
+%if %1
+ movdqa xmm2, [rsi+4*rax] ; p3
+ movdqa xmm4, [rdi+4*rax] ; p2
+ movdqa xmm6, [rsi+2*rax] ; p1
+%else
+ movlps xmm2, [rsi + rax] ; p3
+ movlps xmm4, [rsi] ; p2
+ movlps xmm6, [rsi + rcx] ; p1
+
+ movhps xmm2, [rdi + rax]
+ movhps xmm4, [rdi]
+ movhps xmm6, [rdi + rcx]
+
+ movdqa XMMWORD PTR [rsp + 32], xmm4 ; store p2
+ movdqa XMMWORD PTR [rsp + 48], xmm6 ; store p1
+%endif
+
+ movdqa xmm5, xmm4 ; p2
+ movdqa xmm3, xmm6 ; p1
+
+ psubusb xmm4, xmm2 ; p2-=p3
+ psubusb xmm2, xmm5 ; p3-=p2
+
+ psubusb xmm3, xmm5 ; p1-=p2
+ pmaxub xmm1, xmm4 ; abs(p3 - p2)
+
+ psubusb xmm5, xmm6 ; p2-=p1
+ pmaxub xmm1, xmm2 ; abs(p3 - p2)
+
+ pmaxub xmm1, xmm5 ; abs(p2 - p1)
+ movdqa xmm2, xmm6 ; p1
+
+ pmaxub xmm1, xmm3 ; abs(p2 - p1)
+%if %1
+ movdqa xmm4, [rsi+rax] ; p0
+ movdqa xmm3, [rdi] ; q1
+%else
+ movlps xmm4, [rsi + rcx*2] ; p0
+ movhps xmm4, [rdi + rcx*2]
+ movdqa xmm3, q1 ; q1
+%endif
+
+ movdqa xmm5, xmm4 ; p0
+ psubusb xmm4, xmm6 ; p0-=p1
+
+ psubusb xmm6, xmm5 ; p1-=p0
+
+ por xmm6, xmm4 ; abs(p1 - p0)
+ mov rdx, arg(2) ; get blimit
+
+ movdqa t1, xmm6 ; save to t1
+
+ movdqa xmm4, xmm3 ; q1
+ pmaxub xmm1, xmm6
+
+ psubusb xmm3, xmm2 ; q1-=p1
+ psubusb xmm2, xmm4 ; p1-=q1
+
+ psubusb xmm1, xmm7
+ por xmm2, xmm3 ; abs(p1-q1)
+
+ movdqa xmm7, XMMWORD PTR [rdx] ; blimit
+
+ movdqa xmm3, xmm0 ; q0
+ pand xmm2, [GLOBAL(tfe)] ; set lsb of each byte to zero
+
+ mov rdx, arg(4) ; hev get thresh
+
+ movdqa xmm6, xmm5 ; p0
+ psrlw xmm2, 1 ; abs(p1-q1)/2
+
+ psubusb xmm5, xmm3 ; p0-=q0
+
+ psubusb xmm3, xmm6 ; q0-=p0
+ por xmm5, xmm3 ; abs(p0 - q0)
+
+ paddusb xmm5, xmm5 ; abs(p0-q0)*2
+
+ movdqa xmm4, t0 ; hev get abs (q1 - q0)
+
+ movdqa xmm3, t1 ; get abs (p1 - p0)
+
+ paddusb xmm5, xmm2 ; abs (p0 - q0) *2 + abs(p1-q1)/2
+
+ movdqa xmm2, XMMWORD PTR [rdx] ; hev
+
+ psubusb xmm5, xmm7 ; abs (p0 - q0) *2 + abs(p1-q1)/2 > blimit
+ psubusb xmm4, xmm2 ; hev
+
+ psubusb xmm3, xmm2 ; hev
+ por xmm1, xmm5
+
+ pxor xmm7, xmm7
+ paddb xmm4, xmm3 ; hev abs(q1 - q0) > thresh || abs(p1 - p0) > thresh
+
+ pcmpeqb xmm4, xmm5 ; hev
+ pcmpeqb xmm3, xmm3 ; hev
+
+ pcmpeqb xmm1, xmm7 ; mask xmm1
+ pxor xmm4, xmm3 ; hev
+%endmacro
+
+%macro B_FILTER 1
+%if %1 == 0
+ movdqa xmm2, p1 ; p1
+ movdqa xmm7, q1 ; q1
+%elif %1 == 1
+ movdqa xmm2, [rsi+2*rax] ; p1
+ movdqa xmm7, [rdi] ; q1
+%elif %1 == 2
+ lea rdx, srct
+
+ movdqa xmm2, [rdx] ; p1
+ movdqa xmm7, [rdx+48] ; q1
+ movdqa xmm6, [rdx+16] ; p0
+ movdqa xmm0, [rdx+32] ; q0
+%endif
+
+ pxor xmm2, [GLOBAL(t80)] ; p1 offset to convert to signed values
+ pxor xmm7, [GLOBAL(t80)] ; q1 offset to convert to signed values
+
+ psubsb xmm2, xmm7 ; p1 - q1
+ pxor xmm6, [GLOBAL(t80)] ; offset to convert to signed values
+
+ pand xmm2, xmm4 ; high var mask (hvm)(p1 - q1)
+ pxor xmm0, [GLOBAL(t80)] ; offset to convert to signed values
+
+ movdqa xmm3, xmm0 ; q0
+ psubsb xmm0, xmm6 ; q0 - p0
+
+ paddsb xmm2, xmm0 ; 1 * (q0 - p0) + hvm(p1 - q1)
+
+ paddsb xmm2, xmm0 ; 2 * (q0 - p0) + hvm(p1 - q1)
+
+ paddsb xmm2, xmm0 ; 3 * (q0 - p0) + hvm(p1 - q1)
+
+ pand xmm1, xmm2 ; mask filter values we don't care about
+
+ movdqa xmm2, xmm1
+
+ paddsb xmm1, [GLOBAL(t4)] ; 3* (q0 - p0) + hvm(p1 - q1) + 4
+ paddsb xmm2, [GLOBAL(t3)] ; 3* (q0 - p0) + hvm(p1 - q1) + 3
+
+ punpckhbw xmm5, xmm2 ; axbxcxdx
+ punpcklbw xmm2, xmm2 ; exfxgxhx
+
+ punpcklbw xmm0, xmm1 ; exfxgxhx
+ psraw xmm5, 11 ; sign extended shift right by 3
+
+ punpckhbw xmm1, xmm1 ; axbxcxdx
+ psraw xmm2, 11 ; sign extended shift right by 3
+
+ packsswb xmm2, xmm5 ; (3* (q0 - p0) + hvm(p1 - q1) + 3) >> 3;
+ psraw xmm0, 11 ; sign extended shift right by 3
+
+ psraw xmm1, 11 ; sign extended shift right by 3
+ movdqa xmm5, xmm0 ; save results
+
+ packsswb xmm0, xmm1 ; (3* (q0 - p0) + hvm(p1 - q1) + 4) >>3
+ paddsw xmm5, [GLOBAL(ones)]
+
+ paddsw xmm1, [GLOBAL(ones)]
+ psraw xmm5, 1 ; partial shifted one more time for 2nd tap
+
+ psraw xmm1, 1 ; partial shifted one more time for 2nd tap
+
+ paddsb xmm6, xmm2 ; p0+= p0 add
+ packsswb xmm5, xmm1 ; (3* (q0 - p0) + hvm(p1 - q1) + 4) >>4
+
+%if %1 == 0
+ movdqa xmm1, p1 ; p1
+%elif %1 == 1
+ movdqa xmm1, [rsi+2*rax] ; p1
+%elif %1 == 2
+ movdqa xmm1, [rdx] ; p1
+%endif
+ pandn xmm4, xmm5 ; high edge variance additive
+ pxor xmm6, [GLOBAL(t80)] ; unoffset
+
+ pxor xmm1, [GLOBAL(t80)] ; reoffset
+ psubsb xmm3, xmm0 ; q0-= q0 add
+
+ paddsb xmm1, xmm4 ; p1+= p1 add
+ pxor xmm3, [GLOBAL(t80)] ; unoffset
+
+ pxor xmm1, [GLOBAL(t80)] ; unoffset
+ psubsb xmm7, xmm4 ; q1-= q1 add
+
+ pxor xmm7, [GLOBAL(t80)] ; unoffset
+%if %1 == 0
+ lea rsi, [rsi + rcx*2]
+ lea rdi, [rdi + rcx*2]
+ movq MMWORD PTR [rsi], xmm6 ; p0
+ movhps MMWORD PTR [rdi], xmm6
+ movq MMWORD PTR [rsi + rax], xmm1 ; p1
+ movhps MMWORD PTR [rdi + rax], xmm1
+ movq MMWORD PTR [rsi + rcx], xmm3 ; q0
+ movhps MMWORD PTR [rdi + rcx], xmm3
+ movq MMWORD PTR [rsi + rcx*2],xmm7 ; q1
+ movhps MMWORD PTR [rdi + rcx*2],xmm7
+%elif %1 == 1
+ movdqa [rsi+rax], xmm6 ; write back
+ movdqa [rsi+2*rax], xmm1 ; write back
+ movdqa [rsi], xmm3 ; write back
+ movdqa [rdi], xmm7 ; write back
+%endif
+
+%endmacro
+
+
+;void vp9_loop_filter_horizontal_edge_sse2
+;(
+; unsigned char *src_ptr,
+; int src_pixel_step,
+; const char *blimit,
+; const char *limit,
+; const char *thresh,
+; int count
+;)
+global sym(vp9_loop_filter_horizontal_edge_sse2)
+sym(vp9_loop_filter_horizontal_edge_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ sub rsp, 32 ; reserve 32 bytes
+ %define t0 [rsp + 0] ;__declspec(align(16)) char t0[16];
+ %define t1 [rsp + 16] ;__declspec(align(16)) char t1[16];
+
+ mov rsi, arg(0) ;src_ptr
+ movsxd rax, dword ptr arg(1) ;src_pixel_step
+
+ mov rdx, arg(3) ;limit
+ movdqa xmm7, XMMWORD PTR [rdx]
+
+ lea rdi, [rsi+rax] ; rdi points to row +1 for indirect addressing
+
+ ; calculate breakout conditions and high edge variance
+ LFH_FILTER_AND_HEV_MASK 1
+ ; filter and write back the result
+ B_FILTER 1
+
+ add rsp, 32
+ pop rsp
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+;void vp9_loop_filter_horizontal_edge_uv_sse2
+;(
+; unsigned char *src_ptr,
+; int src_pixel_step,
+; const char *blimit,
+; const char *limit,
+; const char *thresh,
+;    unsigned char *v
+;)
+global sym(vp9_loop_filter_horizontal_edge_uv_sse2)
+sym(vp9_loop_filter_horizontal_edge_uv_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ sub rsp, 96 ; reserve 96 bytes
+ %define q2 [rsp + 0] ;__declspec(align(16)) char q2[16];
+ %define q1 [rsp + 16] ;__declspec(align(16)) char q1[16];
+ %define p2 [rsp + 32] ;__declspec(align(16)) char p2[16];
+ %define p1 [rsp + 48] ;__declspec(align(16)) char p1[16];
+ %define t0 [rsp + 64] ;__declspec(align(16)) char t0[16];
+ %define t1 [rsp + 80] ;__declspec(align(16)) char t1[16];
+
+ mov rsi, arg(0) ; u
+ mov rdi, arg(5) ; v
+ movsxd rax, dword ptr arg(1) ; src_pixel_step
+ mov rcx, rax
+ neg rax ; negate pitch to deal with above border
+
+ mov rdx, arg(3) ;limit
+ movdqa xmm7, XMMWORD PTR [rdx]
+
+ lea rsi, [rsi + rcx]
+ lea rdi, [rdi + rcx]
+
+ ; calculate breakout conditions and high edge variance
+ LFH_FILTER_AND_HEV_MASK 0
+ ; filter and write back the result
+ B_FILTER 0
+
+ add rsp, 96
+ pop rsp
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+%macro TRANSPOSE_16X8 2
+ movq xmm4, QWORD PTR [rsi] ; xx xx xx xx xx xx xx xx 07 06 05 04 03 02 01 00
+ movq xmm1, QWORD PTR [rdi] ; xx xx xx xx xx xx xx xx 17 16 15 14 13 12 11 10
+ movq xmm0, QWORD PTR [rsi+2*rax] ; xx xx xx xx xx xx xx xx 27 26 25 24 23 22 21 20
+ movq xmm7, QWORD PTR [rdi+2*rax] ; xx xx xx xx xx xx xx xx 37 36 35 34 33 32 31 30
+ movq xmm5, QWORD PTR [rsi+4*rax] ; xx xx xx xx xx xx xx xx 47 46 45 44 43 42 41 40
+ movq xmm2, QWORD PTR [rdi+4*rax] ; xx xx xx xx xx xx xx xx 57 56 55 54 53 52 51 50
+
+ punpcklbw xmm4, xmm1 ; 17 07 16 06 15 05 14 04 13 03 12 02 11 01 10 00
+
+ movq xmm1, QWORD PTR [rdi+2*rcx] ; xx xx xx xx xx xx xx xx 77 76 75 74 73 72 71 70
+
+ movdqa xmm3, xmm4 ; 17 07 16 06 15 05 14 04 13 03 12 02 11 01 10 00
+        punpcklbw   xmm0,       xmm7                        ; 37 27 36 26 35 25 34 24 33 23 32 22 31 21 30 20
+
+ movq xmm7, QWORD PTR [rsi+2*rcx] ; xx xx xx xx xx xx xx xx 67 66 65 64 63 62 61 60
+
+ punpcklbw xmm5, xmm2 ; 57 47 56 46 55 45 54 44 53 43 52 42 51 41 50 40
+%if %1
+ lea rsi, [rsi+rax*8]
+%else
+ mov rsi, arg(5) ; v_ptr
+%endif
+
+ movdqa xmm6, xmm5 ; 57 47 56 46 55 45 54 44 53 43 52 42 51 41 50 40
+ punpcklbw xmm7, xmm1 ; 77 67 76 66 75 65 74 64 73 63 72 62 71 61 70 60
+
+ punpcklwd xmm5, xmm7 ; 73 63 53 43 72 62 52 42 71 61 51 41 70 60 50 40
+
+ punpckhwd xmm6, xmm7 ; 77 67 57 47 76 66 56 46 75 65 55 45 74 64 54 44
+%if %1
+ lea rdi, [rdi+rax*8]
+%else
+ lea rsi, [rsi - 4]
+%endif
+
+ punpcklwd xmm3, xmm0 ; 33 23 13 03 32 22 12 02 31 21 11 01 30 20 10 00
+%if %1
+ lea rdx, srct
+%else
+ lea rdi, [rsi + rax] ; rdi points to row +1 for indirect addressing
+%endif
+
+ movdqa xmm2, xmm3 ; 33 23 13 03 32 22 12 02 31 21 11 01 30 20 10 00
+ punpckhwd xmm4, xmm0 ; 37 27 17 07 36 26 16 06 35 25 15 05 34 24 14 04
+
+ movdqa xmm7, xmm4 ; 37 27 17 07 36 26 16 06 35 25 15 05 34 24 14 04
+ punpckhdq xmm3, xmm5 ; 73 63 53 43 33 23 13 03 72 62 52 42 32 22 12 02
+
+ punpckhdq xmm7, xmm6 ; 77 67 57 47 37 27 17 07 76 66 56 46 36 26 16 06
+
+ punpckldq xmm4, xmm6 ; 75 65 55 45 35 25 15 05 74 64 54 44 34 24 14 04
+
+ punpckldq xmm2, xmm5 ; 71 61 51 41 31 21 11 01 70 60 50 40 30 20 10 00
+
+ movdqa t0, xmm2 ; save to free XMM2
+ movq xmm2, QWORD PTR [rsi] ; xx xx xx xx xx xx xx xx 87 86 85 84 83 82 81 80
+ movq xmm6, QWORD PTR [rdi] ; xx xx xx xx xx xx xx xx 97 96 95 94 93 92 91 90
+ movq xmm0, QWORD PTR [rsi+2*rax] ; xx xx xx xx xx xx xx xx a7 a6 a5 a4 a3 a2 a1 a0
+ movq xmm5, QWORD PTR [rdi+2*rax] ; xx xx xx xx xx xx xx xx b7 b6 b5 b4 b3 b2 b1 b0
+ movq xmm1, QWORD PTR [rsi+4*rax] ; xx xx xx xx xx xx xx xx c7 c6 c5 c4 c3 c2 c1 c0
+
+ punpcklbw xmm2, xmm6 ; 97 87 96 86 95 85 94 84 93 83 92 82 91 81 90 80
+
+ movq xmm6, QWORD PTR [rdi+4*rax] ; xx xx xx xx xx xx xx xx d7 d6 d5 d4 d3 d2 d1 d0
+
+ punpcklbw xmm0, xmm5 ; b7 a7 b6 a6 b5 a5 b4 a4 b3 a3 b2 a2 b1 a1 b0 a0
+
+ movq xmm5, QWORD PTR [rsi+2*rcx] ; xx xx xx xx xx xx xx xx e7 e6 e5 e4 e3 e2 e1 e0
+
+        punpcklbw   xmm1,       xmm6                        ; d7 c7 d6 c6 d5 c5 d4 c4 d3 c3 d2 c2 d1 c1 d0 c0
+
+ movq xmm6, QWORD PTR [rdi+2*rcx] ; xx xx xx xx xx xx xx xx f7 f6 f5 f4 f3 f2 f1 f0
+
+ punpcklbw xmm5, xmm6 ; f7 e7 f6 e6 f5 e5 f4 e4 f3 e3 f2 e2 f1 e1 f0 e0
+
+ movdqa xmm6, xmm1 ;
+ punpckhwd xmm6, xmm5 ; f7 e7 d7 c7 f6 e6 d6 c6 f5 e5 d5 c5 f4 e4 d4 c4
+
+ punpcklwd xmm1, xmm5 ; f3 e3 d3 c3 f2 e2 d2 c2 f1 e1 d1 c1 f0 e0 d0 c0
+ movdqa xmm5, xmm2 ; 97 87 96 86 95 85 94 84 93 83 92 82 91 81 90 80
+
+ punpcklwd xmm5, xmm0 ; b3 a3 93 83 b2 a2 92 82 b1 a1 91 81 b0 a0 90 80
+
+ punpckhwd xmm2, xmm0 ; b7 a7 97 87 b6 a6 96 86 b5 a5 95 85 b4 a4 94 84
+
+ movdqa xmm0, xmm5
+ punpckldq xmm0, xmm1 ; f1 e1 d1 c1 b1 a1 91 81 f0 e0 d0 c0 b0 a0 90 80
+
+ punpckhdq xmm5, xmm1 ; f3 e3 d3 c3 b3 a3 93 83 f2 e2 d2 c2 b2 a2 92 82
+ movdqa xmm1, xmm2 ; b7 a7 97 87 b6 a6 96 86 b5 a5 95 85 b4 a4 94 84
+
+ punpckldq xmm1, xmm6 ; f5 e5 d5 c5 b5 a5 95 85 f4 e4 d4 c4 b4 a4 94 84
+
+ punpckhdq xmm2, xmm6 ; f7 e7 d7 c7 b7 a7 97 87 f6 e6 d6 c6 b6 a6 96 86
+ movdqa xmm6, xmm7 ; 77 67 57 47 37 27 17 07 76 66 56 46 36 26 16 06
+
+ punpcklqdq xmm6, xmm2 ; f6 e6 d6 c6 b6 a6 96 86 76 66 56 46 36 26 16 06
+
+ punpckhqdq xmm7, xmm2 ; f7 e7 d7 c7 b7 a7 97 87 77 67 57 47 37 27 17 07
+%if %2
+ movdqa xmm2, xmm3 ; 73 63 53 43 33 23 13 03 72 62 52 42 32 22 12 02
+ punpcklqdq xmm2, xmm5 ; f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02
+
+ punpckhqdq xmm3, xmm5 ; f3 e3 d3 c3 b3 a3 93 83 73 63 53 43 33 23 13 03
+
+ movdqa [rdx], xmm2 ; save 2
+
+ movdqa xmm5, xmm4 ; 75 65 55 45 35 25 15 05 74 64 54 44 34 24 14 04
+ punpcklqdq xmm4, xmm1 ; f4 e4 d4 c4 b4 a4 94 84 74 64 54 44 34 24 14 04
+
+ movdqa [rdx+16], xmm3 ; save 3
+
+ punpckhqdq xmm5, xmm1 ; f5 e5 d5 c5 b5 a5 95 85 75 65 55 45 35 25 15 05
+
+ movdqa [rdx+32], xmm4 ; save 4
+ movdqa [rdx+48], xmm5 ; save 5
+ movdqa xmm1, t0 ; get
+
+ movdqa xmm2, xmm1 ;
+ punpckhqdq xmm1, xmm0 ; f1 e1 d1 c1 b1 a1 91 81 71 61 51 41 31 21 11 01
+
+ punpcklqdq xmm2, xmm0 ; f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00
+%else
+ movdqa [rdx+112], xmm7 ; save 7
+
+ movdqa [rdx+96], xmm6 ; save 6
+
+ movdqa xmm2, xmm3 ; 73 63 53 43 33 23 13 03 72 62 52 42 32 22 12 02
+ punpckhqdq xmm3, xmm5 ; f3 e3 d3 c3 b3 a3 93 83 73 63 53 43 33 23 13 03
+
+ punpcklqdq xmm2, xmm5 ; f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02
+
+ movdqa [rdx+32], xmm2 ; save 2
+
+ movdqa xmm5, xmm4 ; 75 65 55 45 35 25 15 05 74 64 54 44 34 24 14 04
+ punpcklqdq xmm4, xmm1 ; f4 e4 d4 c4 b4 a4 94 84 74 64 54 44 34 24 14 04
+
+ movdqa [rdx+48], xmm3 ; save 3
+
+ punpckhqdq xmm5, xmm1 ; f5 e5 d5 c5 b5 a5 95 85 75 65 55 45 35 25 15 05
+
+ movdqa [rdx+64], xmm4 ; save 4
+ movdqa [rdx+80], xmm5 ; save 5
+ movdqa xmm1, t0 ; get
+
+ movdqa xmm2, xmm1
+ punpckhqdq xmm1, xmm0 ; f1 e1 d1 c1 b1 a1 91 81 71 61 51 41 31 21 11 01
+
+ punpcklqdq xmm2, xmm0 ; f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00
+
+ movdqa [rdx+16], xmm1
+
+ movdqa [rdx], xmm2
+%endif
+%endmacro
+
+%macro LFV_FILTER_MASK_HEV_MASK 1
+ movdqa xmm0, xmm6 ; q2
+ psubusb xmm0, xmm7 ; q2-q3
+
+ psubusb xmm7, xmm6 ; q3-q2
+ movdqa xmm4, xmm5 ; q1
+
+ por xmm7, xmm0 ; abs (q3-q2)
+ psubusb xmm4, xmm6 ; q1-q2
+
+ movdqa xmm0, xmm1
+ psubusb xmm6, xmm5 ; q2-q1
+
+ por xmm6, xmm4 ; abs (q2-q1)
+ psubusb xmm0, xmm2 ; p2 - p3;
+
+ psubusb xmm2, xmm1 ; p3 - p2;
+ por xmm0, xmm2 ; abs(p2-p3)
+%if %1
+ movdqa xmm2, [rdx] ; p1
+%else
+ movdqa xmm2, [rdx+32] ; p1
+%endif
+ movdqa xmm5, xmm2 ; p1
+ pmaxub xmm0, xmm7
+
+ psubusb xmm5, xmm1 ; p1-p2
+ psubusb xmm1, xmm2 ; p2-p1
+
+ movdqa xmm7, xmm3 ; p0
+ psubusb xmm7, xmm2 ; p0-p1
+
+ por xmm1, xmm5 ; abs(p2-p1)
+ pmaxub xmm0, xmm6
+
+ pmaxub xmm0, xmm1
+ movdqa xmm1, xmm2 ; p1
+
+ psubusb xmm2, xmm3 ; p1-p0
+ lea rdx, srct
+
+ por xmm2, xmm7 ; abs(p1-p0)
+
+ movdqa t0, xmm2 ; save abs(p1-p0)
+
+ pmaxub xmm0, xmm2
+
+%if %1
+ movdqa xmm5, [rdx+32] ; q0
+ movdqa xmm7, [rdx+48] ; q1
+%else
+ movdqa xmm5, [rdx+64] ; q0
+ movdqa xmm7, [rdx+80] ; q1
+%endif
+ mov rdx, arg(3) ; limit
+
+ movdqa xmm6, xmm5 ; q0
+ movdqa xmm2, xmm7 ; q1
+
+ psubusb xmm5, xmm7 ; q0-q1
+ psubusb xmm7, xmm6 ; q1-q0
+
+ por xmm7, xmm5 ; abs(q1-q0)
+
+ movdqa t1, xmm7 ; save abs(q1-q0)
+
+ movdqa xmm4, XMMWORD PTR [rdx]; limit
+
+ pmaxub xmm0, xmm7
+ mov rdx, arg(2) ; blimit
+
+ psubusb xmm0, xmm4
+ movdqa xmm5, xmm2 ; q1
+
+ psubusb xmm5, xmm1 ; q1-=p1
+ psubusb xmm1, xmm2 ; p1-=q1
+
+ por xmm5, xmm1 ; abs(p1-q1)
+ movdqa xmm1, xmm3 ; p0
+
+ pand xmm5, [GLOBAL(tfe)] ; set lsb of each byte to zero
+ psubusb xmm1, xmm6 ; p0-q0
+
+ psrlw xmm5, 1 ; abs(p1-q1)/2
+ psubusb xmm6, xmm3 ; q0-p0
+
+ movdqa xmm4, XMMWORD PTR [rdx]; blimit
+
+ mov rdx, arg(4) ; get thresh
+
+ por xmm1, xmm6 ; abs(q0-p0)
+
+ movdqa xmm6, t0 ; get abs (q1 - q0)
+
+ paddusb xmm1, xmm1 ; abs(q0-p0)*2
+
+ movdqa xmm3, t1 ; get abs (p1 - p0)
+
+ movdqa xmm7, XMMWORD PTR [rdx]
+
+ paddusb xmm1, xmm5 ; abs (p0 - q0) *2 + abs(p1-q1)/2
+ psubusb xmm6, xmm7 ; abs(q1 - q0) > thresh
+
+ psubusb xmm3, xmm7 ; abs(p1 - p0)> thresh
+
+ psubusb xmm1, xmm4 ; abs (p0 - q0) *2 + abs(p1-q1)/2 > blimit
+ por xmm6, xmm3 ; abs(q1 - q0) > thresh || abs(p1 - p0) > thresh
+
+ por xmm1, xmm0 ; mask
+ pcmpeqb xmm6, xmm0
+
+ pxor xmm0, xmm0
+ pcmpeqb xmm4, xmm4
+
+ pcmpeqb xmm1, xmm0
+ pxor xmm4, xmm6
+%endmacro
+
+%macro BV_TRANSPOSE 0
+ ; xmm1 = f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02
+ ; xmm6 = f3 e3 d3 c3 b3 a3 93 83 73 63 53 43 33 23 13 03
+ ; xmm3 = f4 e4 d4 c4 b4 a4 94 84 74 64 54 44 34 24 14 04
+ ; xmm7 = f5 e5 d5 c5 b5 a5 95 85 75 65 55 45 35 25 15 05
+ movdqa xmm2, xmm1 ; f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02
+ punpcklbw xmm2, xmm6 ; 73 72 63 62 53 52 43 42 33 32 23 22 13 12 03 02
+
+ movdqa xmm4, xmm3 ; f4 e4 d4 c4 b4 a4 94 84 74 64 54 44 34 24 14 04
+ punpckhbw xmm1, xmm6 ; f3 f2 e3 e2 d3 d2 c3 c2 b3 b2 a3 a2 93 92 83 82
+
+ punpcklbw xmm4, xmm7 ; 75 74 65 64 55 54 45 44 35 34 25 24 15 14 05 04
+
+ punpckhbw xmm3, xmm7 ; f5 f4 e5 e4 d5 d4 c5 c4 b5 b4 a5 a4 95 94 85 84
+
+ movdqa xmm6, xmm2 ; 73 72 63 62 53 52 43 42 33 32 23 22 13 12 03 02
+ punpcklwd xmm2, xmm4 ; 35 34 33 32 25 24 23 22 15 14 13 12 05 04 03 02
+
+ punpckhwd xmm6, xmm4 ; 75 74 73 72 65 64 63 62 55 54 53 52 45 44 43 42
+ movdqa xmm5, xmm1 ; f3 f2 e3 e2 d3 d2 c3 c2 b3 b2 a3 a2 93 92 83 82
+
+ punpcklwd xmm1, xmm3 ; b5 b4 b3 b2 a5 a4 a3 a2 95 94 93 92 85 84 83 82
+
+ punpckhwd xmm5, xmm3 ; f5 f4 f3 f2 e5 e4 e3 e2 d5 d4 d3 d2 c5 c4 c3 c2
+ ; xmm2 = 35 34 33 32 25 24 23 22 15 14 13 12 05 04 03 02
+ ; xmm6 = 75 74 73 72 65 64 63 62 55 54 53 52 45 44 43 42
+ ; xmm1 = b5 b4 b3 b2 a5 a4 a3 a2 95 94 93 92 85 84 83 82
+ ; xmm5 = f5 f4 f3 f2 e5 e4 e3 e2 d5 d4 d3 d2 c5 c4 c3 c2
+%endmacro
+
+%macro BV_WRITEBACK 2
+ movd [rsi+2], %1
+ psrldq %1, 4
+
+ movd [rdi+2], %1
+ psrldq %1, 4
+
+ movd [rsi+2*rax+2], %1
+ psrldq %1, 4
+
+ movd [rdi+2*rax+2], %1
+
+ movd [rsi+4*rax+2], %2
+ psrldq %2, 4
+
+ movd [rdi+4*rax+2], %2
+ psrldq %2, 4
+
+ movd [rsi+2*rcx+2], %2
+ psrldq %2, 4
+
+ movd [rdi+2*rcx+2], %2
+%endmacro
+
+
+;void vp9_loop_filter_vertical_edge_sse2
+;(
+; unsigned char *src_ptr,
+; int src_pixel_step,
+; const char *blimit,
+; const char *limit,
+; const char *thresh,
+; int count
+;)
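+;
+; Illustrative note, not part of the original patch: vertical edges are
+; filtered by transposing a 16x8 block so the edge becomes horizontal,
+; reusing the same mask/hev/B_FILTER steps, and transposing back only the
+; four modified columns (p1, p0, q0, q1). Conceptually (helper names are
+; hypothetical, not actual functions in this file):
+;
+;   transpose_16x8(src_ptr - 4, stride, tmp);       /* rows -> columns */
+;   b_filter(tmp, blimit, limit, thresh);           /* same as horizontal case */
+;   transpose_back_4(tmp + 2, src_ptr - 2, stride); /* p1 p0 q0 q1 only */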
+global sym(vp9_loop_filter_vertical_edge_sse2)
+sym(vp9_loop_filter_vertical_edge_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ sub rsp, 96 ; reserve 96 bytes
+ %define t0 [rsp + 0] ;__declspec(align(16)) char t0[16];
+ %define t1 [rsp + 16] ;__declspec(align(16)) char t1[16];
+ %define srct [rsp + 32] ;__declspec(align(16)) char srct[64];
+
+ mov rsi, arg(0) ; src_ptr
+ movsxd rax, dword ptr arg(1) ; src_pixel_step
+
+ lea rsi, [rsi - 4]
+ lea rdi, [rsi + rax] ; rdi points to row +1 for indirect addressing
+ lea rcx, [rax*2+rax]
+
+ ;transpose 16x8 to 8x16, and store the 8-line result on stack.
+ TRANSPOSE_16X8 1, 1
+
+ ; calculate filter mask and high edge variance
+ LFV_FILTER_MASK_HEV_MASK 1
+
+ ; start work on filters
+ B_FILTER 2
+
+        ; transpose and write back - only works on q1, q0, p0, p1
+ BV_TRANSPOSE
+ ; store 16-line result
+
+ lea rdx, [rax]
+ neg rdx
+
+ BV_WRITEBACK xmm1, xmm5
+
+ lea rsi, [rsi+rdx*8]
+ lea rdi, [rdi+rdx*8]
+ BV_WRITEBACK xmm2, xmm6
+
+ add rsp, 96
+ pop rsp
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+;void vp9_loop_filter_vertical_edge_uv_sse2
+;(
+; unsigned char *u,
+; int src_pixel_step,
+; const char *blimit,
+; const char *limit,
+; const char *thresh,
+; unsigned char *v
+;)
+global sym(vp9_loop_filter_vertical_edge_uv_sse2)
+sym(vp9_loop_filter_vertical_edge_uv_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ sub rsp, 96 ; reserve 96 bytes
+ %define t0 [rsp + 0] ;__declspec(align(16)) char t0[16];
+ %define t1 [rsp + 16] ;__declspec(align(16)) char t1[16];
+ %define srct [rsp + 32] ;__declspec(align(16)) char srct[64];
+
+ mov rsi, arg(0) ; u_ptr
+ movsxd rax, dword ptr arg(1) ; src_pixel_step
+
+ lea rsi, [rsi - 4]
+ lea rdi, [rsi + rax] ; rdi points to row +1 for indirect addressing
+ lea rcx, [rax+2*rax]
+
+ lea rdx, srct
+
+ ;transpose 16x8 to 8x16, and store the 8-line result on stack.
+ TRANSPOSE_16X8 0, 1
+
+ ; calculate filter mask and high edge variance
+ LFV_FILTER_MASK_HEV_MASK 1
+
+ ; start work on filters
+ B_FILTER 2
+
+        ; transpose and write back - only works on q1, q0, p0, p1
+ BV_TRANSPOSE
+
+ lea rdi, [rsi + rax] ; rdi points to row +1 for indirect addressing
+
+ ; store 16-line result
+ BV_WRITEBACK xmm1, xmm5
+
+ mov rsi, arg(0) ; u_ptr
+ lea rsi, [rsi - 4]
+ lea rdi, [rsi + rax] ; rdi points to row +1 for indirect addressing
+ BV_WRITEBACK xmm2, xmm6
+
+ add rsp, 96
+ pop rsp
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+;void vp9_loop_filter_simple_horizontal_edge_sse2
+;(
+; unsigned char *src_ptr,
+; int src_pixel_step,
+;    const char *blimit
+;)
+global sym(vp9_loop_filter_simple_horizontal_edge_sse2)
+sym(vp9_loop_filter_simple_horizontal_edge_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 3
+ SAVE_XMM 7
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rsi, arg(0) ;src_ptr
+ movsxd rax, dword ptr arg(1) ;src_pixel_step ; destination pitch?
+ mov rdx, arg(2) ;blimit
+ movdqa xmm3, XMMWORD PTR [rdx]
+
+ mov rdi, rsi ; rdi points to row +1 for indirect addressing
+ add rdi, rax
+ neg rax
+
+ ; calculate mask
+ movdqa xmm1, [rsi+2*rax] ; p1
+ movdqa xmm0, [rdi] ; q1
+ movdqa xmm2, xmm1
+ movdqa xmm7, xmm0
+ movdqa xmm4, xmm0
+ psubusb xmm0, xmm1 ; q1-=p1
+ psubusb xmm1, xmm4 ; p1-=q1
+ por xmm1, xmm0 ; abs(p1-q1)
+ pand xmm1, [GLOBAL(tfe)] ; set lsb of each byte to zero
+ psrlw xmm1, 1 ; abs(p1-q1)/2
+
+ movdqa xmm5, [rsi+rax] ; p0
+ movdqa xmm4, [rsi] ; q0
+ movdqa xmm0, xmm4 ; q0
+ movdqa xmm6, xmm5 ; p0
+ psubusb xmm5, xmm4 ; p0-=q0
+ psubusb xmm4, xmm6 ; q0-=p0
+ por xmm5, xmm4 ; abs(p0 - q0)
+ paddusb xmm5, xmm5 ; abs(p0-q0)*2
+ paddusb xmm5, xmm1 ; abs (p0 - q0) *2 + abs(p1-q1)/2
+
+ psubusb xmm5, xmm3 ; abs(p0 - q0) *2 + abs(p1-q1)/2 > blimit
+ pxor xmm3, xmm3
+ pcmpeqb xmm5, xmm3
+
+ ; start work on filters
+ pxor xmm2, [GLOBAL(t80)] ; p1 offset to convert to signed values
+ pxor xmm7, [GLOBAL(t80)] ; q1 offset to convert to signed values
+ psubsb xmm2, xmm7 ; p1 - q1
+
+ pxor xmm6, [GLOBAL(t80)] ; offset to convert to signed values
+ pxor xmm0, [GLOBAL(t80)] ; offset to convert to signed values
+ movdqa xmm3, xmm0 ; q0
+ psubsb xmm0, xmm6 ; q0 - p0
+ paddsb xmm2, xmm0 ; p1 - q1 + 1 * (q0 - p0)
+ paddsb xmm2, xmm0 ; p1 - q1 + 2 * (q0 - p0)
+ paddsb xmm2, xmm0 ; p1 - q1 + 3 * (q0 - p0)
+ pand xmm5, xmm2 ; mask filter values we don't care about
+
+ ; do + 4 side
+ paddsb xmm5, [GLOBAL(t4)] ; 3* (q0 - p0) + (p1 - q1) + 4
+
+ movdqa xmm0, xmm5 ; get a copy of filters
+ psllw xmm0, 8 ; shift left 8
+ psraw xmm0, 3 ; arithmetic shift right 11
+ psrlw xmm0, 8
+ movdqa xmm1, xmm5 ; get a copy of filters
+ psraw xmm1, 11 ; arithmetic shift right 11
+ psllw xmm1, 8 ; shift left 8 to put it back
+
+ por xmm0, xmm1 ; put the two together to get result
+
+ psubsb xmm3, xmm0 ; q0-= q0 add
+ pxor xmm3, [GLOBAL(t80)] ; unoffset
+ movdqa [rsi], xmm3 ; write back
+
+ ; now do +3 side
+ psubsb xmm5, [GLOBAL(t1s)] ; +3 instead of +4
+
+ movdqa xmm0, xmm5 ; get a copy of filters
+ psllw xmm0, 8 ; shift left 8
+ psraw xmm0, 3 ; arithmetic shift right 11
+ psrlw xmm0, 8
+ psraw xmm5, 11 ; arithmetic shift right 11
+ psllw xmm5, 8 ; shift left 8 to put it back
+ por xmm0, xmm5 ; put the two together to get result
+
+
+ paddsb xmm6, xmm0 ; p0+= p0 add
+ pxor xmm6, [GLOBAL(t80)] ; unoffset
+ movdqa [rsi+rax], xmm6 ; write back
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+;void vp9_loop_filter_simple_vertical_edge_sse2
+;(
+; unsigned char *src_ptr,
+; int src_pixel_step,
+;    const char *blimit
+;)
+global sym(vp9_loop_filter_simple_vertical_edge_sse2)
+sym(vp9_loop_filter_simple_vertical_edge_sse2):
+ push rbp ; save old base pointer value.
+ mov rbp, rsp ; set new base pointer value.
+ SHADOW_ARGS_TO_STACK 3
+ SAVE_XMM 7
+ GET_GOT rbx ; save callee-saved reg
+ push rsi
+ push rdi
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ sub rsp, 32 ; reserve 32 bytes
+ %define t0 [rsp + 0] ;__declspec(align(16)) char t0[16];
+ %define t1 [rsp + 16] ;__declspec(align(16)) char t1[16];
+
+ mov rsi, arg(0) ;src_ptr
+ movsxd rax, dword ptr arg(1) ;src_pixel_step ; destination pitch?
+
+ lea rsi, [rsi - 2 ]
+ lea rdi, [rsi + rax]
+ lea rdx, [rsi + rax*4]
+ lea rcx, [rdx + rax]
+
+ movd xmm0, [rsi] ; (high 96 bits unused) 03 02 01 00
+ movd xmm1, [rdx] ; (high 96 bits unused) 43 42 41 40
+ movd xmm2, [rdi] ; 13 12 11 10
+ movd xmm3, [rcx] ; 53 52 51 50
+ punpckldq xmm0, xmm1 ; (high 64 bits unused) 43 42 41 40 03 02 01 00
+ punpckldq xmm2, xmm3 ; 53 52 51 50 13 12 11 10
+
+ movd xmm4, [rsi + rax*2] ; 23 22 21 20
+ movd xmm5, [rdx + rax*2] ; 63 62 61 60
+ movd xmm6, [rdi + rax*2] ; 33 32 31 30
+ movd xmm7, [rcx + rax*2] ; 73 72 71 70
+ punpckldq xmm4, xmm5 ; 63 62 61 60 23 22 21 20
+ punpckldq xmm6, xmm7 ; 73 72 71 70 33 32 31 30
+
+ punpcklbw xmm0, xmm2 ; 53 43 52 42 51 41 50 40 13 03 12 02 11 01 10 00
+ punpcklbw xmm4, xmm6 ; 73 63 72 62 71 61 70 60 33 23 32 22 31 21 30 20
+
+ movdqa xmm1, xmm0
+ punpcklwd xmm0, xmm4 ; 33 23 13 03 32 22 12 02 31 21 11 01 30 20 10 00
+ punpckhwd xmm1, xmm4 ; 73 63 53 43 72 62 52 42 71 61 51 41 70 60 50 40
+
+ movdqa xmm2, xmm0
+ punpckldq xmm0, xmm1 ; 71 61 51 41 31 21 11 01 70 60 50 40 30 20 10 00
+ punpckhdq xmm2, xmm1 ; 73 63 53 43 33 23 13 03 72 62 52 42 32 22 12 02
+
+ movdqa t0, xmm0 ; save to t0
+ movdqa t1, xmm2 ; save to t1
+
+ lea rsi, [rsi + rax*8]
+ lea rdi, [rsi + rax]
+ lea rdx, [rsi + rax*4]
+ lea rcx, [rdx + rax]
+
+ movd xmm4, [rsi] ; 83 82 81 80
+ movd xmm1, [rdx] ; c3 c2 c1 c0
+ movd xmm6, [rdi] ; 93 92 91 90
+ movd xmm3, [rcx] ; d3 d2 d1 d0
+ punpckldq xmm4, xmm1 ; c3 c2 c1 c0 83 82 81 80
+ punpckldq xmm6, xmm3 ; d3 d2 d1 d0 93 92 91 90
+
+ movd xmm0, [rsi + rax*2] ; a3 a2 a1 a0
+ movd xmm5, [rdx + rax*2] ; e3 e2 e1 e0
+ movd xmm2, [rdi + rax*2] ; b3 b2 b1 b0
+ movd xmm7, [rcx + rax*2] ; f3 f2 f1 f0
+ punpckldq xmm0, xmm5 ; e3 e2 e1 e0 a3 a2 a1 a0
+ punpckldq xmm2, xmm7 ; f3 f2 f1 f0 b3 b2 b1 b0
+
+ punpcklbw xmm4, xmm6 ; d3 c3 d2 c2 d1 c1 d0 c0 93 83 92 82 91 81 90 80
+ punpcklbw xmm0, xmm2 ; f3 e3 f2 e2 f1 e1 f0 e0 b3 a3 b2 a2 b1 a1 b0 a0
+
+ movdqa xmm1, xmm4
+ punpcklwd xmm4, xmm0 ; b3 a3 93 83 b2 a2 92 82 b1 a1 91 81 b0 a0 90 80
+ punpckhwd xmm1, xmm0 ; f3 e3 d3 c3 f2 e2 d2 c2 f1 e1 d1 c1 f0 e0 d0 c0
+
+ movdqa xmm6, xmm4
+ punpckldq xmm4, xmm1 ; f1 e1 d1 c1 b1 a1 91 81 f0 e0 d0 c0 b0 a0 90 80
+ punpckhdq xmm6, xmm1 ; f3 e3 d3 c3 b3 a3 93 83 f2 e2 d2 c2 b2 a2 92 82
+
+ movdqa xmm0, t0 ; 71 61 51 41 31 21 11 01 70 60 50 40 30 20 10 00
+ movdqa xmm2, t1 ; 73 63 53 43 33 23 13 03 72 62 52 42 32 22 12 02
+ movdqa xmm1, xmm0
+ movdqa xmm3, xmm2
+
+ punpcklqdq xmm0, xmm4 ; p1 f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00
+ punpckhqdq xmm1, xmm4 ; p0 f1 e1 d1 c1 b1 a1 91 81 71 61 51 41 31 21 11 01
+ punpcklqdq xmm2, xmm6 ; q0 f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02
+ punpckhqdq xmm3, xmm6 ; q1 f3 e3 d3 c3 b3 a3 93 83 73 63 53 43 33 23 13 03
+
+ ; calculate mask
+ movdqa xmm6, xmm0 ; p1
+ movdqa xmm7, xmm3 ; q1
+ psubusb xmm7, xmm0 ; q1-=p1
+ psubusb xmm6, xmm3 ; p1-=q1
+ por xmm6, xmm7 ; abs(p1-q1)
+ pand xmm6, [GLOBAL(tfe)] ; set lsb of each byte to zero
+ psrlw xmm6, 1 ; abs(p1-q1)/2
+
+ movdqa xmm5, xmm1 ; p0
+ movdqa xmm4, xmm2 ; q0
+ psubusb xmm5, xmm2 ; p0-=q0
+ psubusb xmm4, xmm1 ; q0-=p0
+ por xmm5, xmm4 ; abs(p0 - q0)
+ paddusb xmm5, xmm5 ; abs(p0-q0)*2
+ paddusb xmm5, xmm6 ; abs (p0 - q0) *2 + abs(p1-q1)/2
+
+ mov rdx, arg(2) ;blimit
+ movdqa xmm7, XMMWORD PTR [rdx]
+
+ psubusb xmm5, xmm7 ; abs(p0 - q0) *2 + abs(p1-q1)/2 > blimit
+ pxor xmm7, xmm7
+ pcmpeqb xmm5, xmm7 ; mm5 = mask
+
+ ; start work on filters
+ movdqa t0, xmm0
+ movdqa t1, xmm3
+
+ pxor xmm0, [GLOBAL(t80)] ; p1 offset to convert to signed values
+ pxor xmm3, [GLOBAL(t80)] ; q1 offset to convert to signed values
+
+ psubsb xmm0, xmm3 ; p1 - q1
+ movdqa xmm6, xmm1 ; p0
+
+ movdqa xmm7, xmm2 ; q0
+ pxor xmm6, [GLOBAL(t80)] ; offset to convert to signed values
+
+ pxor xmm7, [GLOBAL(t80)] ; offset to convert to signed values
+ movdqa xmm3, xmm7 ; offset q0
+
+ psubsb xmm7, xmm6 ; q0 - p0
+ paddsb xmm0, xmm7 ; p1 - q1 + 1 * (q0 - p0)
+
+ paddsb xmm0, xmm7 ; p1 - q1 + 2 * (q0 - p0)
+ paddsb xmm0, xmm7 ; p1 - q1 + 3 * (q0 - p0)
+
+ pand xmm5, xmm0 ; mask filter values we don't care about
+
+
+ paddsb xmm5, [GLOBAL(t4)] ; 3* (q0 - p0) + (p1 - q1) + 4
+
+ movdqa xmm0, xmm5 ; get a copy of filters
+ psllw xmm0, 8 ; shift left 8
+
+ psraw xmm0, 3 ; arithmetic shift right 3
+ psrlw xmm0, 8
+
+ movdqa xmm7, xmm5 ; get a copy of filters
+ psraw xmm7, 11 ; arithmetic shift right 11
+
+ psllw xmm7, 8 ; shift left 8 to put it back
+ por xmm0, xmm7 ; put the two together to get result
+
+ psubsb xmm3, xmm0 ; q0-= q0sz add
+ pxor xmm3, [GLOBAL(t80)] ; unoffset q0
+
+ ; now do +3 side
+ psubsb xmm5, [GLOBAL(t1s)] ; +3 instead of +4
+ movdqa xmm0, xmm5 ; get a copy of filters
+
+ psllw xmm0, 8 ; shift left 8
+ psraw xmm0, 3 ; arithmetic shift right 3
+
+ psrlw xmm0, 8
+ psraw xmm5, 11 ; arithmetic shift right 11
+
+ psllw xmm5, 8 ; shift left 8 to put it back
+ por xmm0, xmm5 ; put the two together to get result
+
+ paddsb xmm6, xmm0 ; p0+= p0 add
+ pxor xmm6, [GLOBAL(t80)] ; unoffset p0
+
+ movdqa xmm0, t0 ; p1
+ movdqa xmm4, t1 ; q1
+
+ ; transpose back to write out
+ ; p1 f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00
+ ; p0 f1 e1 d1 c1 b1 a1 91 81 71 61 51 41 31 21 11 01
+ ; q0 f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02
+ ; q1 f3 e3 d3 c3 b3 a3 93 83 73 63 53 43 33 23 13 03
+ movdqa xmm1, xmm0
+ punpcklbw xmm0, xmm6 ; 71 70 61 60 51 50 41 40 31 30 21 20 11 10 01 00
+ punpckhbw xmm1, xmm6 ; f1 f0 e1 e0 d1 d0 c1 c0 b1 b0 a1 a0 91 90 81 80
+
+ movdqa xmm5, xmm3
+ punpcklbw xmm3, xmm4 ; 73 72 63 62 53 52 43 42 33 32 23 22 13 12 03 02
+ punpckhbw xmm5, xmm4 ; f3 f2 e3 e2 d3 d2 c3 c2 b3 b2 a3 a2 93 92 83 82
+
+ movdqa xmm2, xmm0
+ punpcklwd xmm0, xmm3 ; 33 32 31 30 23 22 21 20 13 12 11 10 03 02 01 00
+ punpckhwd xmm2, xmm3 ; 73 72 71 70 63 62 61 60 53 52 51 50 43 42 41 40
+
+ movdqa xmm3, xmm1
+ punpcklwd xmm1, xmm5 ; b3 b2 b1 b0 a3 a2 a1 a0 93 92 91 90 83 82 81 80
+ punpckhwd xmm3, xmm5 ; f3 f2 f1 f0 e3 e2 e1 e0 d3 d2 d1 d0 c3 c2 c1 c0
+
+ ; write out order: xmm0 xmm2 xmm1 xmm3
+ lea rdx, [rsi + rax*4]
+
+ movd [rsi], xmm1 ; write the second 8-line result
+ psrldq xmm1, 4
+ movd [rdi], xmm1
+ psrldq xmm1, 4
+ movd [rsi + rax*2], xmm1
+ psrldq xmm1, 4
+ movd [rdi + rax*2], xmm1
+
+ movd [rdx], xmm3
+ psrldq xmm3, 4
+ movd [rcx], xmm3
+ psrldq xmm3, 4
+ movd [rdx + rax*2], xmm3
+ psrldq xmm3, 4
+ movd [rcx + rax*2], xmm3
+
+ neg rax
+ lea rsi, [rsi + rax*8]
+ neg rax
+ lea rdi, [rsi + rax]
+ lea rdx, [rsi + rax*4]
+ lea rcx, [rdx + rax]
+
+ movd [rsi], xmm0 ; write the first 8-line result
+ psrldq xmm0, 4
+ movd [rdi], xmm0
+ psrldq xmm0, 4
+ movd [rsi + rax*2], xmm0
+ psrldq xmm0, 4
+ movd [rdi + rax*2], xmm0
+
+ movd [rdx], xmm2
+ psrldq xmm2, 4
+ movd [rcx], xmm2
+ psrldq xmm2, 4
+ movd [rdx + rax*2], xmm2
+ psrldq xmm2, 4
+ movd [rcx + rax*2], xmm2
+
+ add rsp, 32
+ pop rsp
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+SECTION_RODATA
+align 16
+tfe:
+ times 16 db 0xfe
+align 16
+t80:
+ times 16 db 0x80
+align 16
+t1s:
+ times 16 db 0x01
+align 16
+t3:
+ times 16 db 0x03
+align 16
+t4:
+ times 16 db 0x04
+align 16
+ones:
+ times 8 dw 0x0001
+align 16
+s9:
+ times 8 dw 0x0900
+align 16
+s63:
+ times 8 dw 0x003f
--- /dev/null
+++ b/vp9/common/x86/loopfilter_x86.c
@@ -1,0 +1,543 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <emmintrin.h> // SSE2
+#include "vpx_config.h"
+#include "vp9/common/loopfilter.h"
+
+prototype_loopfilter(vp9_loop_filter_vertical_edge_mmx);
+prototype_loopfilter(vp9_loop_filter_horizontal_edge_mmx);
+
+prototype_loopfilter(vp9_loop_filter_vertical_edge_sse2);
+prototype_loopfilter(vp9_loop_filter_horizontal_edge_sse2);
+
+extern loop_filter_uvfunction vp9_loop_filter_horizontal_edge_uv_sse2;
+extern loop_filter_uvfunction vp9_loop_filter_vertical_edge_uv_sse2;
+
+#if HAVE_MMX
+/* Horizontal MB filtering */
+void vp9_loop_filter_mbh_mmx(unsigned char *y_ptr,
+ unsigned char *u_ptr, unsigned char *v_ptr,
+ int y_stride, int uv_stride,
+ struct loop_filter_info *lfi) {
+}
+
+/* Vertical MB Filtering */
+void vp9_loop_filter_mbv_mmx(unsigned char *y_ptr,
+ unsigned char *u_ptr, unsigned char *v_ptr,
+ int y_stride, int uv_stride,
+ struct loop_filter_info *lfi) {
+}
+
+/* Horizontal B Filtering */
+void vp9_loop_filter_bh_mmx(unsigned char *y_ptr,
+ unsigned char *u_ptr, unsigned char *v_ptr,
+ int y_stride, int uv_stride,
+ struct loop_filter_info *lfi) {
+
+}
+
+void vp9_loop_filter_bhs_mmx(unsigned char *y_ptr, int y_stride,
+ const unsigned char *blimit) {
+ vp9_loop_filter_simple_horizontal_edge_mmx(y_ptr + 4 * y_stride,
+ y_stride, blimit);
+ vp9_loop_filter_simple_horizontal_edge_mmx(y_ptr + 8 * y_stride,
+ y_stride, blimit);
+ vp9_loop_filter_simple_horizontal_edge_mmx(y_ptr + 12 * y_stride,
+ y_stride, blimit);
+}
+
+/* Vertical B Filtering */
+void vp9_loop_filter_bv_mmx(unsigned char *y_ptr,
+ unsigned char *u_ptr, unsigned char *v_ptr,
+ int y_stride, int uv_stride,
+ struct loop_filter_info *lfi) {
+ vp9_loop_filter_vertical_edge_mmx(y_ptr + 4, y_stride,
+ lfi->blim, lfi->lim, lfi->hev_thr, 2);
+ vp9_loop_filter_vertical_edge_mmx(y_ptr + 8, y_stride,
+ lfi->blim, lfi->lim, lfi->hev_thr, 2);
+ vp9_loop_filter_vertical_edge_mmx(y_ptr + 12, y_stride,
+ lfi->blim, lfi->lim, lfi->hev_thr, 2);
+
+ if (u_ptr)
+ vp9_loop_filter_vertical_edge_mmx(u_ptr + 4, uv_stride,
+ lfi->blim, lfi->lim, lfi->hev_thr, 1);
+
+ if (v_ptr)
+ vp9_loop_filter_vertical_edge_mmx(v_ptr + 4, uv_stride,
+ lfi->blim, lfi->lim, lfi->hev_thr, 1);
+}
+
+void vp9_loop_filter_bvs_mmx(unsigned char *y_ptr, int y_stride,
+ const unsigned char *blimit) {
+ vp9_loop_filter_simple_vertical_edge_mmx(y_ptr + 4, y_stride, blimit);
+ vp9_loop_filter_simple_vertical_edge_mmx(y_ptr + 8, y_stride, blimit);
+ vp9_loop_filter_simple_vertical_edge_mmx(y_ptr + 12, y_stride, blimit);
+}
+#endif
+
+#if HAVE_SSE2
+void vp9_mbloop_filter_horizontal_edge_c_sse2(unsigned char *s,
+ int p,
+ const unsigned char *_blimit,
+ const unsigned char *_limit,
+ const unsigned char *_thresh,
+ int count) {
+ DECLARE_ALIGNED(16, unsigned char, flat_op2[16]);
+ DECLARE_ALIGNED(16, unsigned char, flat_op1[16]);
+ DECLARE_ALIGNED(16, unsigned char, flat_op0[16]);
+ DECLARE_ALIGNED(16, unsigned char, flat_oq2[16]);
+ DECLARE_ALIGNED(16, unsigned char, flat_oq1[16]);
+ DECLARE_ALIGNED(16, unsigned char, flat_oq0[16]);
+ __m128i mask, hev, flat;
+ __m128i thresh, limit, blimit;
+ const __m128i zero = _mm_set1_epi16(0);
+ __m128i p4, p3, p2, p1, p0, q0, q1, q2, q3, q4;
+
+ thresh = _mm_shuffle_epi32(_mm_cvtsi32_si128(_thresh[0] * 0x01010101), 0);
+ limit = _mm_shuffle_epi32(_mm_cvtsi32_si128(_limit[0] * 0x01010101), 0);
+ blimit = _mm_shuffle_epi32(_mm_cvtsi32_si128(_blimit[0] * 0x01010101), 0);
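+ // Multiplying the byte by 0x01010101 replicates it into all four bytes of
+ // the 32-bit value, and the shuffle broadcasts that dword to every lane,
+ // leaving thresh/limit/blimit splatted across all 16 byte positions.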
+
+ p4 = _mm_loadu_si128((__m128i *)(s - 5 * p));
+ p3 = _mm_loadu_si128((__m128i *)(s - 4 * p));
+ p2 = _mm_loadu_si128((__m128i *)(s - 3 * p));
+ p1 = _mm_loadu_si128((__m128i *)(s - 2 * p));
+ p0 = _mm_loadu_si128((__m128i *)(s - 1 * p));
+ q0 = _mm_loadu_si128((__m128i *)(s - 0 * p));
+ q1 = _mm_loadu_si128((__m128i *)(s + 1 * p));
+ q2 = _mm_loadu_si128((__m128i *)(s + 2 * p));
+ q3 = _mm_loadu_si128((__m128i *)(s + 3 * p));
+ q4 = _mm_loadu_si128((__m128i *)(s + 4 * p));
+ {
+ const __m128i abs_p1p0 = _mm_or_si128(_mm_subs_epu8(p1, p0),
+ _mm_subs_epu8(p0, p1));
+ const __m128i abs_q1q0 = _mm_or_si128(_mm_subs_epu8(q1, q0),
+ _mm_subs_epu8(q0, q1));
+ const __m128i one = _mm_set1_epi8(1);
+ const __m128i fe = _mm_set1_epi8(0xfe);
+ const __m128i ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0);
+ __m128i abs_p0q0 = _mm_or_si128(_mm_subs_epu8(p0, q0),
+ _mm_subs_epu8(q0, p0));
+ __m128i abs_p1q1 = _mm_or_si128(_mm_subs_epu8(p1, q1),
+ _mm_subs_epu8(q1, p1));
+ __m128i work;
+ flat = _mm_max_epu8(abs_p1p0, abs_q1q0);
+ hev = _mm_subs_epu8(flat, thresh);
+ hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);
+
+ abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);
+ abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
+ mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit);
+ mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
+ // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1;
+ mask = _mm_max_epu8(flat, mask);
+ // mask |= (abs(p1 - p0) > limit) * -1;
+ // mask |= (abs(q1 - q0) > limit) * -1;
+ work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p2, p1),
+ _mm_subs_epu8(p1, p2)),
+ _mm_or_si128(_mm_subs_epu8(p3, p2),
+ _mm_subs_epu8(p2, p3)));
+ mask = _mm_max_epu8(work, mask);
+ work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(q2, q1),
+ _mm_subs_epu8(q1, q2)),
+ _mm_or_si128(_mm_subs_epu8(q3, q2),
+ _mm_subs_epu8(q2, q3)));
+ mask = _mm_max_epu8(work, mask);
+ mask = _mm_subs_epu8(mask, limit);
+ mask = _mm_cmpeq_epi8(mask, zero);
+
+ work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p2, p0),
+ _mm_subs_epu8(p0, p2)),
+ _mm_or_si128(_mm_subs_epu8(q2, q0),
+ _mm_subs_epu8(q0, q2)));
+ flat = _mm_max_epu8(work, flat);
+ work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p3, p0),
+ _mm_subs_epu8(p0, p3)),
+ _mm_or_si128(_mm_subs_epu8(q3, q0),
+ _mm_subs_epu8(q0, q3)));
+ flat = _mm_max_epu8(work, flat);
+ work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p4, p0),
+ _mm_subs_epu8(p0, p4)),
+ _mm_or_si128(_mm_subs_epu8(q4, q0),
+ _mm_subs_epu8(q0, q4)));
+ flat = _mm_max_epu8(work, flat);
+ flat = _mm_subs_epu8(flat, one);
+ flat = _mm_cmpeq_epi8(flat, zero);
+ flat = _mm_and_si128(flat, mask);
+ }
+ {
+ const __m128i four = _mm_set1_epi16(4);
+ unsigned char *src = s;
+ int i = 0;
+ do {
+ __m128i workp_a, workp_b, workp_shft;
+ p4 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 5 * p)), zero);
+ p3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 4 * p)), zero);
+ p2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 3 * p)), zero);
+ p1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 2 * p)), zero);
+ p0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 1 * p)), zero);
+ q0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 0 * p)), zero);
+ q1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 1 * p)), zero);
+ q2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 2 * p)), zero);
+ q3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 3 * p)), zero);
+ q4 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 4 * p)), zero);
+
+ workp_a = _mm_add_epi16(_mm_add_epi16(p4, p3), _mm_add_epi16(p2, p1));
+ workp_a = _mm_add_epi16(_mm_add_epi16(workp_a, four), p0);
+ workp_b = _mm_add_epi16(_mm_add_epi16(q0, p2), p4);
+ workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
+ _mm_storel_epi64((__m128i *)&flat_op2[i*8],
+ _mm_packus_epi16(workp_shft, workp_shft));
+
+ workp_b = _mm_add_epi16(_mm_add_epi16(q0, q1), p1);
+ workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
+ _mm_storel_epi64((__m128i *)&flat_op1[i*8],
+ _mm_packus_epi16(workp_shft, workp_shft));
+
+ workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p4), q2);
+ workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p1), p0);
+ workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
+ _mm_storel_epi64((__m128i *)&flat_op0[i*8],
+ _mm_packus_epi16(workp_shft, workp_shft));
+
+ workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3), q3);
+ workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p0), q0);
+ workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
+ _mm_storel_epi64((__m128i *)&flat_oq0[i*8],
+ _mm_packus_epi16(workp_shft, workp_shft));
+
+ workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p2), q4);
+ workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q0), q1);
+ workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
+ _mm_storel_epi64((__m128i *)&flat_oq1[i*8],
+ _mm_packus_epi16(workp_shft, workp_shft));
+
+ workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p1), q4);
+ workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q1), q2);
+ workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
+ _mm_storel_epi64((__m128i *)&flat_oq2[i*8],
+ _mm_packus_epi16(workp_shft, workp_shft));
+
+ src += 8;
+ } while (++i < count);
+ }
+ // lp filter
+ {
+ const __m128i t4 = _mm_set1_epi8(4);
+ const __m128i t3 = _mm_set1_epi8(3);
+ const __m128i t80 = _mm_set1_epi8(0x80);
+ const __m128i te0 = _mm_set1_epi8(0xe0);
+ const __m128i t1f = _mm_set1_epi8(0x1f);
+ const __m128i t1 = _mm_set1_epi8(0x1);
+ const __m128i t7f = _mm_set1_epi8(0x7f);
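+ // The te0/t1f and t80/t7f pairs patch up sign bits below: SSE2 has no
+ // per-byte arithmetic shift, so Filter1 >> 3, Filter2 >> 3 and filt >> 1
+ // are done with 16-bit logical shifts followed by restoring the shifted
+ // bytes' sign bits.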
+
+ const __m128i ps1 = _mm_xor_si128(_mm_loadu_si128((__m128i *)(s - 2 * p)),
+ t80);
+ const __m128i ps0 = _mm_xor_si128(_mm_loadu_si128((__m128i *)(s - 1 * p)),
+ t80);
+ const __m128i qs0 = _mm_xor_si128(_mm_loadu_si128((__m128i *)(s + 0 * p)),
+ t80);
+ const __m128i qs1 = _mm_xor_si128(_mm_loadu_si128((__m128i *)(s + 1 * p)),
+ t80);
+ __m128i filt;
+ __m128i work_a;
+ __m128i filter1, filter2;
+
+ filt = _mm_and_si128(_mm_subs_epi8(ps1, qs1), hev);
+ work_a = _mm_subs_epi8(qs0, ps0);
+ filt = _mm_adds_epi8(filt, work_a);
+ filt = _mm_adds_epi8(filt, work_a);
+ filt = _mm_adds_epi8(filt, work_a);
+ /* (vp9_filter + 3 * (qs0 - ps0)) & mask */
+ filt = _mm_and_si128(filt, mask);
+
+ filter1 = _mm_adds_epi8(filt, t4);
+ filter2 = _mm_adds_epi8(filt, t3);
+
+ /* Filter1 >> 3 */
+ work_a = _mm_cmpgt_epi8(zero, filter1);
+ filter1 = _mm_srli_epi16(filter1, 3);
+ work_a = _mm_and_si128(work_a, te0);
+ filter1 = _mm_and_si128(filter1, t1f);
+ filter1 = _mm_or_si128(filter1, work_a);
+
+ /* Filter2 >> 3 */
+ work_a = _mm_cmpgt_epi8(zero, filter2);
+ filter2 = _mm_srli_epi16(filter2, 3);
+ work_a = _mm_and_si128(work_a, te0);
+ filter2 = _mm_and_si128(filter2, t1f);
+ filter2 = _mm_or_si128(filter2, work_a);
+
+ /* filt >> 1 */
+ filt = _mm_adds_epi8(filter1, t1);
+ work_a = _mm_cmpgt_epi8(zero, filt);
+ filt = _mm_srli_epi16(filt, 1);
+ work_a = _mm_and_si128(work_a, t80);
+ filt = _mm_and_si128(filt, t7f);
+ filt = _mm_or_si128(filt, work_a);
+
+ filt = _mm_andnot_si128(hev, filt);
+
+ work_a = _mm_xor_si128(_mm_subs_epi8(qs0, filter1), t80);
+ q0 = _mm_load_si128((__m128i *)flat_oq0);
+ work_a = _mm_andnot_si128(flat, work_a);
+ q0 = _mm_and_si128(flat, q0);
+ q0 = _mm_or_si128(work_a, q0);
+
+ work_a = _mm_xor_si128(_mm_subs_epi8(qs1, filt), t80);
+ q1 = _mm_load_si128((__m128i *)flat_oq1);
+ work_a = _mm_andnot_si128(flat, work_a);
+ q1 = _mm_and_si128(flat, q1);
+ q1 = _mm_or_si128(work_a, q1);
+
+ work_a = _mm_loadu_si128((__m128i *)(s + 2 * p));
+ q2 = _mm_load_si128((__m128i *)flat_oq2);
+ work_a = _mm_andnot_si128(flat, work_a);
+ q2 = _mm_and_si128(flat, q2);
+ q2 = _mm_or_si128(work_a, q2);
+
+ work_a = _mm_xor_si128(_mm_adds_epi8(ps0, filter2), t80);
+ p0 = _mm_load_si128((__m128i *)flat_op0);
+ work_a = _mm_andnot_si128(flat, work_a);
+ p0 = _mm_and_si128(flat, p0);
+ p0 = _mm_or_si128(work_a, p0);
+
+ work_a = _mm_xor_si128(_mm_adds_epi8(ps1, filt), t80);
+ p1 = _mm_load_si128((__m128i *)flat_op1);
+ work_a = _mm_andnot_si128(flat, work_a);
+ p1 = _mm_and_si128(flat, p1);
+ p1 = _mm_or_si128(work_a, p1);
+
+ work_a = _mm_loadu_si128((__m128i *)(s - 3 * p));
+ p2 = _mm_load_si128((__m128i *)flat_op2);
+ work_a = _mm_andnot_si128(flat, work_a);
+ p2 = _mm_and_si128(flat, p2);
+ p2 = _mm_or_si128(work_a, p2);
+
+ if (count == 1) {
+ _mm_storel_epi64((__m128i *)(s - 3 * p), p2);
+ _mm_storel_epi64((__m128i *)(s - 2 * p), p1);
+ _mm_storel_epi64((__m128i *)(s - 1 * p), p0);
+ _mm_storel_epi64((__m128i *)(s + 0 * p), q0);
+ _mm_storel_epi64((__m128i *)(s + 1 * p), q1);
+ _mm_storel_epi64((__m128i *)(s + 2 * p), q2);
+ } else {
+ _mm_storeu_si128((__m128i *)(s - 3 * p), p2);
+ _mm_storeu_si128((__m128i *)(s - 2 * p), p1);
+ _mm_storeu_si128((__m128i *)(s - 1 * p), p0);
+ _mm_storeu_si128((__m128i *)(s + 0 * p), q0);
+ _mm_storeu_si128((__m128i *)(s + 1 * p), q1);
+ _mm_storeu_si128((__m128i *)(s + 2 * p), q2);
+ }
+ }
+}
+
+static __inline void transpose(unsigned char *src[], int in_p,
+ unsigned char *dst[], int out_p,
+ int num_8x8_to_transpose) {
+ int idx8x8 = 0;
+ __m128i x0, x1, x2, x3, x4, x5, x6, x7;
+ do {
+ unsigned char *in = src[idx8x8];
+ unsigned char *out = dst[idx8x8];
+
+ x0 = _mm_loadl_epi64((__m128i *)(in + 0*in_p)); // 00 01 02 03 04 05 06 07
+ x1 = _mm_loadl_epi64((__m128i *)(in + 1*in_p)); // 10 11 12 13 14 15 16 17
+ x2 = _mm_loadl_epi64((__m128i *)(in + 2*in_p)); // 20 21 22 23 24 25 26 27
+ x3 = _mm_loadl_epi64((__m128i *)(in + 3*in_p)); // 30 31 32 33 34 35 36 37
+ x4 = _mm_loadl_epi64((__m128i *)(in + 4*in_p)); // 40 41 42 43 44 45 46 47
+ x5 = _mm_loadl_epi64((__m128i *)(in + 5*in_p)); // 50 51 52 53 54 55 56 57
+ x6 = _mm_loadl_epi64((__m128i *)(in + 6*in_p)); // 60 61 62 63 64 65 66 67
+ x7 = _mm_loadl_epi64((__m128i *)(in + 7*in_p)); // 70 71 72 73 74 75 76 77
+ // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17
+ x0 = _mm_unpacklo_epi8(x0, x1);
+ // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37
+ x1 = _mm_unpacklo_epi8(x2, x3);
+ // 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57
+ x2 = _mm_unpacklo_epi8(x4, x5);
+ // 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77
+ x3 = _mm_unpacklo_epi8(x6, x7);
+ // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
+ x4 = _mm_unpacklo_epi16(x0, x1);
+ // 40 50 60 70 41 51 61 71 42 52 62 72 43 53 63 73
+ x5 = _mm_unpacklo_epi16(x2, x3);
+ // 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71
+ x6 = _mm_unpacklo_epi32(x4, x5);
+ // 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73
+ x7 = _mm_unpackhi_epi32(x4, x5);
+
+ _mm_storel_pd((double *)(out + 0*out_p),
+ _mm_castsi128_pd(x6)); // 00 10 20 30 40 50 60 70
+ _mm_storeh_pd((double *)(out + 1*out_p),
+ _mm_castsi128_pd(x6)); // 01 11 21 31 41 51 61 71
+ _mm_storel_pd((double *)(out + 2*out_p),
+ _mm_castsi128_pd(x7)); // 02 12 22 32 42 52 62 72
+ _mm_storeh_pd((double *)(out + 3*out_p),
+ _mm_castsi128_pd(x7)); // 03 13 23 33 43 53 63 73
+
+ // 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37
+ x4 = _mm_unpackhi_epi16(x0, x1);
+ // 44 54 64 74 45 55 65 75 46 56 66 76 47 57 67 77
+ x5 = _mm_unpackhi_epi16(x2, x3);
+ // 04 14 24 34 44 54 64 74 05 15 25 35 45 55 65 75
+ x6 = _mm_unpacklo_epi32(x4, x5);
+ // 06 16 26 36 46 56 66 76 07 17 27 37 47 57 67 77
+ x7 = _mm_unpackhi_epi32(x4, x5);
+
+ _mm_storel_pd((double *)(out + 4*out_p),
+ _mm_castsi128_pd(x6)); // 04 14 24 34 44 54 64 74
+ _mm_storeh_pd((double *)(out + 5*out_p),
+ _mm_castsi128_pd(x6)); // 05 15 25 35 45 55 65 75
+ _mm_storel_pd((double *)(out + 6*out_p),
+ _mm_castsi128_pd(x7)); // 06 16 26 36 46 56 66 76
+ _mm_storeh_pd((double *)(out + 7*out_p),
+ _mm_castsi128_pd(x7)); // 07 17 27 37 47 57 67 77
+ } while (++idx8x8 < num_8x8_to_transpose);
+}
+
+void vp9_mbloop_filter_vertical_edge_c_sse2(unsigned char *s,
+ int p,
+ const unsigned char *blimit,
+ const unsigned char *limit,
+ const unsigned char *thresh,
+ int count) {
+ DECLARE_ALIGNED(16, unsigned char, t_dst[16 * 16]);
+ unsigned char *src[4];
+ unsigned char *dst[4];
+
+ src[0] = s - 5;
+ src[1] = s - 5 + 8;
+ src[2] = s - 5 + p*8;
+ src[3] = s - 5 + p*8 + 8;
+
+ dst[0] = t_dst;
+ dst[1] = t_dst + 16*8;
+ dst[2] = t_dst + 8;
+ dst[3] = t_dst + 16*8 + 8;
+
+ // 16x16->16x16 or 16x8->8x16
+ transpose(src, p, dst, 16, (1 << count));
+
+ vp9_mbloop_filter_horizontal_edge_c_sse2(t_dst + 5*16, 16, blimit, limit,
+ thresh, count);
+
+ dst[0] = s - 5;
+ dst[1] = s - 5 + p*8;
+
+ src[0] = t_dst;
+ src[1] = t_dst + 8;
+
+ // 16x8->8x16 or 8x8->8x8
+ transpose(src, 16, dst, p, (1 << (count - 1)));
+}
+
+/* Horizontal MB filtering */
+void vp9_loop_filter_mbh_sse2(unsigned char *y_ptr,
+ unsigned char *u_ptr, unsigned char *v_ptr,
+ int y_stride, int uv_stride,
+ struct loop_filter_info *lfi) {
+ vp9_mbloop_filter_horizontal_edge_c_sse2(y_ptr, y_stride, lfi->mblim,
+ lfi->lim, lfi->hev_thr, 2);
+
+ /* TODO: write sse2 version with u,v interleaved */
+ if (u_ptr)
+ vp9_mbloop_filter_horizontal_edge_c_sse2(u_ptr, uv_stride, lfi->mblim,
+ lfi->lim, lfi->hev_thr, 1);
+
+ if (v_ptr)
+ vp9_mbloop_filter_horizontal_edge_c_sse2(v_ptr, uv_stride, lfi->mblim,
+ lfi->lim, lfi->hev_thr, 1);
+}
+
+void vp9_loop_filter_bh8x8_sse2(unsigned char *y_ptr, unsigned char *u_ptr,
+ unsigned char *v_ptr, int y_stride, int uv_stride,
+ struct loop_filter_info *lfi) {
+ vp9_mbloop_filter_horizontal_edge_c_sse2(
+ y_ptr + 8 * y_stride, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
+}
+
+/* Vertical MB Filtering */
+void vp9_loop_filter_mbv_sse2(unsigned char *y_ptr, unsigned char *u_ptr,
+ unsigned char *v_ptr, int y_stride, int uv_stride,
+ struct loop_filter_info *lfi) {
+ vp9_mbloop_filter_vertical_edge_c_sse2(y_ptr, y_stride, lfi->mblim, lfi->lim,
+ lfi->hev_thr, 2);
+
+ /* TODO: write sse2 version with u,v interleaved */
+ if (u_ptr)
+ vp9_mbloop_filter_vertical_edge_c_sse2(u_ptr, uv_stride, lfi->mblim,
+ lfi->lim, lfi->hev_thr, 1);
+
+ if (v_ptr)
+ vp9_mbloop_filter_vertical_edge_c_sse2(v_ptr, uv_stride, lfi->mblim,
+ lfi->lim, lfi->hev_thr, 1);
+}
+
+void vp9_loop_filter_bv8x8_sse2(unsigned char *y_ptr, unsigned char *u_ptr,
+ unsigned char *v_ptr, int y_stride, int uv_stride,
+ struct loop_filter_info *lfi) {
+ vp9_mbloop_filter_vertical_edge_c_sse2(
+ y_ptr + 8, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
+}
+
+/* Horizontal B Filtering */
+void vp9_loop_filter_bh_sse2(unsigned char *y_ptr,
+ unsigned char *u_ptr, unsigned char *v_ptr,
+ int y_stride, int uv_stride,
+ struct loop_filter_info *lfi) {
+ vp9_loop_filter_horizontal_edge_sse2(y_ptr + 4 * y_stride, y_stride,
+ lfi->blim, lfi->lim, lfi->hev_thr, 2);
+ vp9_loop_filter_horizontal_edge_sse2(y_ptr + 8 * y_stride, y_stride,
+ lfi->blim, lfi->lim, lfi->hev_thr, 2);
+ vp9_loop_filter_horizontal_edge_sse2(y_ptr + 12 * y_stride, y_stride,
+ lfi->blim, lfi->lim, lfi->hev_thr, 2);
+
+ if (u_ptr)
+ vp9_loop_filter_horizontal_edge_uv_sse2(u_ptr + 4 * uv_stride, uv_stride,
+ lfi->blim, lfi->lim, lfi->hev_thr,
+ v_ptr + 4 * uv_stride);
+}
+
+void vp9_loop_filter_bhs_sse2(unsigned char *y_ptr, int y_stride,
+ const unsigned char *blimit) {
+ vp9_loop_filter_simple_horizontal_edge_sse2(y_ptr + 4 * y_stride,
+ y_stride, blimit);
+ vp9_loop_filter_simple_horizontal_edge_sse2(y_ptr + 8 * y_stride,
+ y_stride, blimit);
+ vp9_loop_filter_simple_horizontal_edge_sse2(y_ptr + 12 * y_stride,
+ y_stride, blimit);
+}
+
+/* Vertical B Filtering */
+void vp9_loop_filter_bv_sse2(unsigned char *y_ptr,
+ unsigned char *u_ptr, unsigned char *v_ptr,
+ int y_stride, int uv_stride,
+ struct loop_filter_info *lfi) {
+ vp9_loop_filter_vertical_edge_sse2(y_ptr + 4, y_stride,
+ lfi->blim, lfi->lim, lfi->hev_thr, 2);
+ vp9_loop_filter_vertical_edge_sse2(y_ptr + 8, y_stride,
+ lfi->blim, lfi->lim, lfi->hev_thr, 2);
+ vp9_loop_filter_vertical_edge_sse2(y_ptr + 12, y_stride,
+ lfi->blim, lfi->lim, lfi->hev_thr, 2);
+
+ if (u_ptr)
+ vp9_loop_filter_vertical_edge_uv_sse2(u_ptr + 4, uv_stride,
+ lfi->blim, lfi->lim, lfi->hev_thr,
+ v_ptr + 4);
+}
+
+void vp9_loop_filter_bvs_sse2(unsigned char *y_ptr, int y_stride,
+ const unsigned char *blimit) {
+ vp9_loop_filter_simple_vertical_edge_sse2(y_ptr + 4, y_stride, blimit);
+ vp9_loop_filter_simple_vertical_edge_sse2(y_ptr + 8, y_stride, blimit);
+ vp9_loop_filter_simple_vertical_edge_sse2(y_ptr + 12, y_stride, blimit);
+}
+
+#endif
--- /dev/null
+++ b/vp9/common/x86/loopfilter_x86.h
@@ -1,0 +1,43 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef LOOPFILTER_X86_H
+#define LOOPFILTER_X86_H
+
+/* Note:
+ *
+ * This platform is commonly built for runtime CPU detection. If you modify
+ * any of the function mappings present in this file, be sure to also update
+ * them in the function pointer initialization code
+ */
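+
+/* Illustration only; the vp9_lf_* macro names below are hypothetical
+ * placeholders. With runtime CPU detection disabled, a static build would
+ * typically remap the generic names onto these specializations roughly as:
+ *
+ *   #if !CONFIG_RUNTIME_CPU_DETECT && HAVE_SSE2
+ *   #undef  vp9_lf_normal_mb_v
+ *   #define vp9_lf_normal_mb_v vp9_loop_filter_mbv_sse2
+ *   #endif
+ */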
+
+#if HAVE_MMX
+extern prototype_loopfilter_block(vp9_loop_filter_mbv_mmx);
+extern prototype_loopfilter_block(vp9_loop_filter_bv_mmx);
+extern prototype_loopfilter_block(vp9_loop_filter_mbh_mmx);
+extern prototype_loopfilter_block(vp9_loop_filter_bh_mmx);
+extern prototype_simple_loopfilter(vp9_loop_filter_simple_vertical_edge_mmx);
+extern prototype_simple_loopfilter(vp9_loop_filter_bvs_mmx);
+extern prototype_simple_loopfilter(vp9_loop_filter_simple_horizontal_edge_mmx);
+extern prototype_simple_loopfilter(vp9_loop_filter_bhs_mmx);
+#endif
+
+#if HAVE_SSE2
+extern prototype_loopfilter_block(vp9_loop_filter_mbv_sse2);
+extern prototype_loopfilter_block(vp9_loop_filter_bv_sse2);
+extern prototype_loopfilter_block(vp9_loop_filter_mbh_sse2);
+extern prototype_loopfilter_block(vp9_loop_filter_bh_sse2);
+extern prototype_simple_loopfilter(vp9_loop_filter_simple_vertical_edge_sse2);
+extern prototype_simple_loopfilter(vp9_loop_filter_bvs_sse2);
+extern prototype_simple_loopfilter(vp9_loop_filter_simple_horizontal_edge_sse2);
+extern prototype_simple_loopfilter(vp9_loop_filter_bhs_sse2);
+#endif
+
+#endif // LOOPFILTER_X86_H
--- /dev/null
+++ b/vp9/common/x86/mask_sse3.asm
@@ -1,0 +1,484 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+;void vp8_makemask_sse3(
+; unsigned char *y,
+; unsigned char *u,
+; unsigned char *v,
+; unsigned char *ym,
+; unsigned char *uvm,
+; int yp,
+; int uvp,
+; int ys,
+; int us,
+; int vs,
+; int yt,
+; int ut,
+; int vt)
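+;
+; Builds a 16x16 byte mask: an output byte is set to 0xff when the luma
+; sample lies within the y tolerance of the y centre value and its
+; (half-resolution) chroma samples lie within the u and v tolerances of the
+; u and v centre values; otherwise it is 0x00.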
+global sym(vp8_makemask_sse3)
+sym(vp8_makemask_sse3):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 14
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rsi, arg(0) ;y
+ mov rdi, arg(1) ;u
+ mov rcx, arg(2) ;v
+ mov rax, arg(3) ;ym
+ movsxd rbx, dword arg(4) ;yp
+ movsxd rdx, dword arg(5) ;uvp
+
+ pxor xmm0,xmm0
+
+ ;make 16 copies of the center y value
+ movd xmm1, arg(6)
+ pshufb xmm1, xmm0
+
+ ; make 16 copies of the center u value
+ movd xmm2, arg(7)
+ pshufb xmm2, xmm0
+
+ ; make 16 copies of the center v value
+ movd xmm3, arg(8)
+ pshufb xmm3, xmm0
+ unpcklpd xmm2, xmm3
+
+ ;make 16 copies of the y tolerance
+ movd xmm3, arg(9)
+ pshufb xmm3, xmm0
+
+ ;make 16 copies of the u tolerance
+ movd xmm4, arg(10)
+ pshufb xmm4, xmm0
+
+ ;make 16 copies of the v tolerance
+ movd xmm5, arg(11)
+ pshufb xmm5, xmm0
+ unpckhpd xmm4, xmm5
+
+ mov r8,8
+
+NextPairOfRows:
+
+ ;grab the y source values
+ movdqu xmm0, [rsi]
+
+ ;compute abs difference between source and y target
+ movdqa xmm6, xmm1
+ movdqa xmm7, xmm0
+ psubusb xmm0, xmm1
+ psubusb xmm6, xmm7
+ por xmm0, xmm6
+
+ ;check whether the abs difference is < the y tolerance
+ movdqa xmm6, xmm3
+ pcmpgtb xmm6, xmm0
+
+ ;grab the y source values
+ add rsi, rbx
+ movdqu xmm0, [rsi]
+
+ ;compute abs difference between source and y target
+ movdqa xmm11, xmm1
+ movdqa xmm7, xmm0
+ psubusb xmm0, xmm1
+ psubusb xmm11, xmm7
+ por xmm0, xmm11
+
+ ;check whether the abs difference is < the y tolerance
+ movdqa xmm11, xmm3
+ pcmpgtb xmm11, xmm0
+
+
+ ;grab the u and v source values
+ movdqu xmm7, [rdi]
+ movdqu xmm8, [rcx]
+ unpcklpd xmm7, xmm8
+
+ ;compute abs difference between source and uv targets
+ movdqa xmm9, xmm2
+ movdqa xmm10, xmm7
+ psubusb xmm7, xmm2
+ psubusb xmm9, xmm10
+ por xmm7, xmm9
+
+ ;check whether the number is < tolerance
+ movdqa xmm0, xmm4
+ pcmpgtb xmm0, xmm7
+
+ ;double u and v masks
+ movdqa xmm8, xmm0
+ punpckhbw xmm0, xmm0
+ punpcklbw xmm8, xmm8
+
+ ;mask row 0 and output
+ pand xmm6, xmm8
+ pand xmm6, xmm0
+ movdqa [rax],xmm6
+
+ ;mask row 1 and output
+ pand xmm11, xmm8
+ pand xmm11, xmm0
+ movdqa [rax+16],xmm11
+
+
+ ; to the next row or set of rows
+ add rsi, rbx
+ add rdi, rdx
+ add rcx, rdx
+ add rax,32
+ dec r8
+ jnz NextPairOfRows
+
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+;GROW_HORIZ (register for result, source register or mem local)
+; takes source and shifts left and ors with source
+; then shifts right and ors with source
+%macro GROW_HORIZ 2
+ movdqa %1, %2
+ movdqa xmm14, %1
+ movdqa xmm15, %1
+ pslldq xmm14, 1
+ psrldq xmm15, 1
+ por %1,xmm14
+ por %1,xmm15
+%endmacro
+;GROW_VERT (result, center row, above row, below row)
+%macro GROW_VERT 4
+ movdqa %1,%2
+ por %1,%3
+ por %1,%4
+%endmacro
+
+;GROW_NEXTLINE (new line to grow, new source, line to write)
+%macro GROW_NEXTLINE 3
+ GROW_HORIZ %1, %2
+ GROW_VERT xmm3, xmm0, xmm1, xmm2
+ movdqa %3,xmm3
+%endmacro
+
+
+;void vp8_growmaskmb_sse3(
+; unsigned char *om,
+; unsigned char *nm)
+global sym(vp8_growmaskmb_sse3)
+sym(vp8_growmaskmb_sse3):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 2
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rsi, arg(0) ;src
+ mov rdi, arg(1) ;rst
+
+ GROW_HORIZ xmm0, [rsi]
+ GROW_HORIZ xmm1, [rsi+16]
+ GROW_HORIZ xmm2, [rsi+32]
+
+ GROW_VERT xmm3, xmm0, xmm1, xmm2
+ por xmm0,xmm1
+ movdqa [rdi], xmm0
+ movdqa [rdi+16],xmm3
+
+ GROW_NEXTLINE xmm0,[rsi+48],[rdi+32]
+ GROW_NEXTLINE xmm1,[rsi+64],[rdi+48]
+ GROW_NEXTLINE xmm2,[rsi+80],[rdi+64]
+ GROW_NEXTLINE xmm0,[rsi+96],[rdi+80]
+ GROW_NEXTLINE xmm1,[rsi+112],[rdi+96]
+ GROW_NEXTLINE xmm2,[rsi+128],[rdi+112]
+ GROW_NEXTLINE xmm0,[rsi+144],[rdi+128]
+ GROW_NEXTLINE xmm1,[rsi+160],[rdi+144]
+ GROW_NEXTLINE xmm2,[rsi+176],[rdi+160]
+ GROW_NEXTLINE xmm0,[rsi+192],[rdi+176]
+ GROW_NEXTLINE xmm1,[rsi+208],[rdi+192]
+ GROW_NEXTLINE xmm2,[rsi+224],[rdi+208]
+ GROW_NEXTLINE xmm0,[rsi+240],[rdi+224]
+
+ por xmm0,xmm2
+ movdqa [rdi+240], xmm0
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+
+;unsigned int vp8_sad16x16_masked_wmt(
+; unsigned char *src_ptr,
+; int src_stride,
+; unsigned char *ref_ptr,
+; int ref_stride,
+; unsigned char *mask)
+global sym(vp8_sad16x16_masked_wmt)
+sym(vp8_sad16x16_masked_wmt):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 5
+ push rsi
+ push rdi
+ ; end prolog
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(2) ;ref_ptr
+
+ mov rbx, arg(4) ;mask
+ movsxd rax, dword ptr arg(1) ;src_stride
+ movsxd rdx, dword ptr arg(3) ;ref_stride
+
+ mov rcx, 16
+
+ pxor xmm3, xmm3
+
+NextSadRow:
+ movdqu xmm0, [rsi]
+ movdqu xmm1, [rdi]
+ movdqu xmm2, [rbx]
+ pand xmm0, xmm2
+ pand xmm1, xmm2
+
+ psadbw xmm0, xmm1
+ paddw xmm3, xmm0
+
+ add rsi, rax
+ add rdi, rdx
+ add rbx, 16
+
+ dec rcx
+ jnz NextSadRow
+
+ movdqa xmm4 , xmm3
+ psrldq xmm4, 8
+ paddw xmm3, xmm4
+ movq rax, xmm3
+ ; begin epilog
+ pop rdi
+ pop rsi
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+;unsigned int vp8_sad16x16_unmasked_wmt(
+; unsigned char *src_ptr,
+; int src_stride,
+; unsigned char *ref_ptr,
+; int ref_stride,
+; unsigned char *mask)
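+;
+; ORing both blocks with the mask forces the pixels covered by the mask to
+; 0xff in source and reference alike, so they cancel in the SAD; only the
+; pixels outside the mask contribute.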
+global sym(vp8_sad16x16_unmasked_wmt)
+sym(vp8_sad16x16_unmasked_wmt):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 5
+ push rsi
+ push rdi
+ ; end prolog
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(2) ;ref_ptr
+
+ mov rbx, arg(4) ;mask
+ movsxd rax, dword ptr arg(1) ;src_stride
+ movsxd rdx, dword ptr arg(3) ;ref_stride
+
+ mov rcx, 16
+
+ pxor xmm3, xmm3
+
+next_vp8_sad16x16_unmasked_wmt:
+ movdqu xmm0, [rsi]
+ movdqu xmm1, [rdi]
+ movdqu xmm2, [rbx]
+ por xmm0, xmm2
+ por xmm1, xmm2
+
+ psadbw xmm0, xmm1
+ paddw xmm3, xmm0
+
+ add rsi, rax
+ add rdi, rdx
+ add rbx, 16
+
+ dec rcx
+ jnz next_vp8_sad16x16_unmasked_wmt
+
+ movdqa xmm4 , xmm3
+ psrldq xmm4, 8
+ paddw xmm3, xmm4
+ movq rax, xmm3
+ ; begin epilog
+ pop rdi
+ pop rsi
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+;unsigned int vp8_masked_predictor_wmt(
+; unsigned char *masked,
+; unsigned char *unmasked,
+; int src_stride,
+; unsigned char *dst_ptr,
+; int dst_stride,
+; unsigned char *mask)
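+;
+; Byte-wise blend of the two predictors:
+;   output = (masked & mask) | (unmasked & ~mask)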
+global sym(vp8_masked_predictor_wmt)
+sym(vp8_masked_predictor_wmt):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ push rsi
+ push rdi
+ ; end prolog
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(1) ;ref_ptr
+
+ mov rbx, arg(5) ;mask
+ movsxd rax, dword ptr arg(2) ;src_stride
+ mov r11, arg(3) ; destination
+ movsxd rdx, dword ptr arg(4) ;dst_stride
+
+ mov rcx, 16
+
+ pxor xmm3, xmm3
+
+next_vp8_masked_predictor_wmt:
+ movdqu xmm0, [rsi]
+ movdqu xmm1, [rdi]
+ movdqu xmm2, [rbx]
+
+ pand xmm0, xmm2
+ pandn xmm2, xmm1
+ por xmm0, xmm2
+ movdqu [r11], xmm0
+
+ add r11, rdx
+ add rsi, rax
+ add rdi, rdx
+ add rbx, 16
+
+ dec rcx
+ jnz next_vp8_masked_predictor_wmt
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+;unsigned int vp8_masked_predictor_uv_wmt(
+; unsigned char *masked,
+; unsigned char *unmasked,
+; int src_stride,
+; unsigned char *dst_ptr,
+; int dst_stride,
+; unsigned char *mask)
+global sym(vp8_masked_predictor_uv_wmt)
+sym(vp8_masked_predictor_uv_wmt):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ push rsi
+ push rdi
+ ; end prolog
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(1) ;ref_ptr
+
+ mov rbx, arg(5) ;mask
+ movsxd rax, dword ptr arg(2) ;src_stride
+ mov r11, arg(3) ; destination
+ movsxd rdx, dword ptr arg(4) ;dst_stride
+
+ mov rcx, 8
+
+ pxor xmm3, xmm3
+
+next_vp8_masked_predictor_uv_wmt:
+ movq xmm0, [rsi]
+ movq xmm1, [rdi]
+ movq xmm2, [rbx]
+
+ pand xmm0, xmm2
+ pandn xmm2, xmm1
+ por xmm0, xmm2
+ movq [r11], xmm0
+
+ add r11, rdx
+ add rsi, rax
+ add rdi, rax
+ add rbx, 8
+
+ dec rcx
+ jnz next_vp8_masked_predictor_uv_wmt
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+;unsigned int vp8_uv_from_y_mask(
+; unsigned char *ymask,
+; unsigned char *uvmask)
+global sym(vp8_uv_from_y_mask)
+sym(vp8_uv_from_y_mask):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ push rsi
+ push rdi
+ ; end prolog
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(1) ;dst_ptr
+
+
+ mov rcx, 8
+
+ pxor xmm3, xmm3
+
+next_p8_uv_from_y_mask:
+ movdqu xmm0, [rsi]
+ pshufb xmm0, [shuf1b] ;[GLOBAL(shuf1b)]
+ movq [rdi],xmm0
+ add rdi, 8
+ add rsi,32
+
+ dec rcx
+ jnz next_p8_uv_from_y_mask
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+SECTION_RODATA
+align 16
+shuf1b:
+ db 0, 2, 4, 6, 8, 10, 12, 14, 0, 0, 0, 0, 0, 0, 0, 0
+
--- /dev/null
+++ b/vp9/common/x86/postproc_mmx.asm
@@ -1,0 +1,534 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+%define VP9_FILTER_WEIGHT 128
+%define VP9_FILTER_SHIFT 7
+
+;void vp9_post_proc_down_and_across_mmx
+;(
+; unsigned char *src_ptr,
+; unsigned char *dst_ptr,
+; int src_pixels_per_line,
+; int dst_pixels_per_line,
+; int rows,
+; int cols,
+; int flimit
+;)
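+;
+; Applies a 5-tap [1 1 4 1 1]/8 blur down each column and then across each
+; row in place, keeping the original pixel wherever any of the four
+; neighbouring taps differs from it by more than flimit.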
+global sym(vp9_post_proc_down_and_across_mmx)
+sym(vp9_post_proc_down_and_across_mmx):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 7
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+%if ABI_IS_32BIT=1 && CONFIG_PIC=1
+ ; move the global rd onto the stack, since we don't have enough registers
+ ; to do PIC addressing
+ movq mm0, [GLOBAL(rd)]
+ sub rsp, 8
+ movq [rsp], mm0
+%define RD [rsp]
+%else
+%define RD [GLOBAL(rd)]
+%endif
+
+ push rbx
+ lea rbx, [GLOBAL(Blur)]
+ movd mm2, dword ptr arg(6) ;flimit
+ punpcklwd mm2, mm2
+ punpckldq mm2, mm2
+
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(1) ;dst_ptr
+
+ movsxd rcx, DWORD PTR arg(4) ;rows
+ movsxd rax, DWORD PTR arg(2) ;src_pixels_per_line ; destination pitch?
+ pxor mm0, mm0 ; mm0 = 00000000
+
+.nextrow:
+
+ xor rdx, rdx ; clear out rdx for use as loop counter
+.nextcol:
+
+ pxor mm7, mm7 ; mm7 = 00000000
+ movq mm6, [rbx + 32 ] ; mm6 = kernel 2 taps
+ movq mm3, [rsi] ; mm3 = r0 p0..p7
+ punpcklbw mm3, mm0 ; mm3 = p0..p3
+ movq mm1, mm3 ; mm1 = p0..p3
+ pmullw mm3, mm6 ; mm3 *= kernel 2 modifiers
+
+ movq mm6, [rbx + 48] ; mm6 = kernel 3 taps
+ movq mm5, [rsi + rax] ; mm5 = r1 p0..p7
+ punpcklbw mm5, mm0 ; mm5 = r1 p0..p3
+ pmullw mm6, mm5 ; mm6 *= p0..p3 * kernel 3 modifiers
+ paddusw mm3, mm6 ; mm3 += mm6
+
+ ; thresholding
+ movq mm7, mm1 ; mm7 = r0 p0..p3
+ psubusw mm7, mm5 ; mm7 = r0 p0..p3 - r1 p0..p3
+ psubusw mm5, mm1 ; mm5 = r1 p0..p3 - r0 p0..p3
+ paddusw mm7, mm5 ; mm7 = abs(r0 p0..p3 - r1 p0..p3)
+ pcmpgtw mm7, mm2
+
+ movq mm6, [rbx + 64 ] ; mm6 = kernel 4 modifiers
+ movq mm5, [rsi + 2*rax] ; mm5 = r2 p0..p7
+ punpcklbw mm5, mm0 ; mm5 = r2 p0..p3
+ pmullw mm6, mm5 ; mm5 *= kernel 4 modifiers
+ paddusw mm3, mm6 ; mm3 += mm5
+
+ ; thresholding
+ movq mm6, mm1 ; mm6 = r0 p0..p3
+ psubusw mm6, mm5 ; mm6 = r0 p0..p3 - r2 p0..p3
+ psubusw mm5, mm1 ; mm5 = r2 p0..p3 - r0 p0..p3
+ paddusw mm6, mm5 ; mm6 = abs(r0 p0..p3 - r2 p0..p3)
+ pcmpgtw mm6, mm2
+ por mm7, mm6 ; accumulate thresholds
+
+
+ neg rax
+ movq mm6, [rbx ] ; kernel 0 taps
+ movq mm5, [rsi+2*rax] ; mm5 = r-2 p0..p7
+ punpcklbw mm5, mm0 ; mm5 = r-2 p0..p3
+ pmullw mm6, mm5 ; mm5 *= kernel 0 modifiers
+ paddusw mm3, mm6 ; mm3 += mm5
+
+ ; thresholding
+ movq mm6, mm1 ; mm6 = r0 p0..p3
+ psubusw mm6, mm5 ; mm6 = p0..p3 - r-2 p0..p3
+ psubusw mm5, mm1 ; mm5 = r-2 p0..p3 - p0..p3
+ paddusw mm6, mm5 ; mm6 = abs(r0 p0..p3 - r-2 p0..p3)
+ pcmpgtw mm6, mm2
+ por mm7, mm6 ; accumulate thresholds
+
+ movq mm6, [rbx + 16] ; kernel 1 taps
+ movq mm4, [rsi+rax] ; mm4 = r-1 p0..p7
+ punpcklbw mm4, mm0 ; mm4 = r-1 p0..p3
+ pmullw mm6, mm4 ; mm4 *= kernel 1 modifiers.
+ paddusw mm3, mm6 ; mm3 += mm5
+
+ ; thresholding
+ movq mm6, mm1 ; mm6 = r0 p0..p3
+ psubusw mm6, mm4 ; mm6 = p0..p3 - r-2 p0..p3
+ psubusw mm4, mm1 ; mm5 = r-1 p0..p3 - p0..p3
+ paddusw mm6, mm4 ; mm6 = abs(r0 p0..p3 - r-1 p0..p3)
+ pcmpgtw mm6, mm2
+ por mm7, mm6 ; accumulate thresholds
+
+
+ paddusw mm3, RD ; mm3 += round value
+ psraw mm3, VP9_FILTER_SHIFT ; mm3 /= 128
+
+ pand mm1, mm7 ; mm1 select vals > thresh from source
+ pandn mm7, mm3 ; mm7 select vals < thresh from blurred result
+ paddusw mm1, mm7 ; combination
+
+ packuswb mm1, mm0 ; pack to bytes
+
+ movd [rdi], mm1 ;
+ neg rax ; pitch is positive
+
+
+ add rsi, 4
+ add rdi, 4
+ add rdx, 4
+
+ cmp edx, dword ptr arg(5) ;cols
+ jl .nextcol
+ ; done with the all cols, start the across filtering in place
+ sub rsi, rdx
+ sub rdi, rdx
+
+
+ push rax
+ xor rdx, rdx
+ mov rax, [rdi-4];
+
+.acrossnextcol:
+ pxor mm7, mm7 ; mm7 = 00000000
+ movq mm6, [rbx + 32 ] ;
+ movq mm4, [rdi+rdx] ; mm4 = p0..p7
+ movq mm3, mm4 ; mm3 = p0..p7
+ punpcklbw mm3, mm0 ; mm3 = p0..p3
+ movq mm1, mm3 ; mm1 = p0..p3
+ pmullw mm3, mm6 ; mm3 *= kernel 2 modifiers
+
+ movq mm6, [rbx + 48]
+ psrlq mm4, 8 ; mm4 = p1..p7
+ movq mm5, mm4 ; mm5 = p1..p7
+ punpcklbw mm5, mm0 ; mm5 = p1..p4
+ pmullw mm6, mm5 ; mm6 *= p1..p4 * kernel 3 modifiers
+ paddusw mm3, mm6 ; mm3 += mm6
+
+ ; thresholding
+ movq mm7, mm1 ; mm7 = p0..p3
+ psubusw mm7, mm5 ; mm7 = p0..p3 - p1..p4
+ psubusw mm5, mm1 ; mm5 = p1..p4 - p0..p3
+ paddusw mm7, mm5 ; mm7 = abs(p0..p3 - p1..p4)
+ pcmpgtw mm7, mm2
+
+ movq mm6, [rbx + 64 ]
+ psrlq mm4, 8 ; mm4 = p2..p7
+ movq mm5, mm4 ; mm5 = p2..p7
+ punpcklbw mm5, mm0 ; mm5 = p2..p5
+ pmullw mm6, mm5 ; mm5 *= kernel 4 modifiers
+ paddusw mm3, mm6 ; mm3 += mm5
+
+ ; thresholding
+ movq mm6, mm1 ; mm6 = p0..p3
+ psubusw mm6, mm5 ; mm6 = p0..p3 - p1..p4
+ psubusw mm5, mm1 ; mm5 = p1..p4 - p0..p3
+ paddusw mm6, mm5 ; mm6 = abs(p0..p3 - p1..p4)
+ pcmpgtw mm6, mm2
+ por mm7, mm6 ; accumulate thresholds
+
+
+ movq mm6, [rbx ]
+ movq mm4, [rdi+rdx-2] ; mm4 = p-2..p5
+ movq mm5, mm4 ; mm5 = p-2..p5
+ punpcklbw mm5, mm0 ; mm5 = p-2..p1
+ pmullw mm6, mm5 ; mm5 *= kernel 0 modifiers
+ paddusw mm3, mm6 ; mm3 += mm5
+
+ ; thresholding
+ movq mm6, mm1 ; mm6 = p0..p3
+ psubusw mm6, mm5 ; mm6 = p0..p3 - p1..p4
+ psubusw mm5, mm1 ; mm5 = p1..p4 - p0..p3
+ paddusw mm6, mm5 ; mm6 = abs(p0..p3 - p1..p4)
+ pcmpgtw mm6, mm2
+ por mm7, mm6 ; accumulate thresholds
+
+ movq mm6, [rbx + 16]
+ psrlq mm4, 8 ; mm4 = p-1..p5
+ punpcklbw mm4, mm0 ; mm4 = p-1..p2
+ pmullw mm6, mm4 ; mm4 *= kernel 1 modifiers.
+ paddusw mm3, mm6 ; mm3 += mm5
+
+ ; thresholding
+ movq mm6, mm1 ; mm6 = p0..p3
+ psubusw mm6, mm4 ; mm6 = p0..p3 - p1..p4
+ psubusw mm4, mm1 ; mm5 = p1..p4 - p0..p3
+ paddusw mm6, mm4 ; mm6 = abs(p0..p3 - p1..p4)
+ pcmpgtw mm6, mm2
+ por mm7, mm6 ; accumulate thresholds
+
+ paddusw mm3, RD ; mm3 += round value
+ psraw mm3, VP9_FILTER_SHIFT ; mm3 /= 128
+
+ pand mm1, mm7 ; mm1 select vals > thresh from source
+ pandn mm7, mm3 ; mm7 select vals < thresh from blurred result
+ paddusw mm1, mm7 ; combination
+
+ packuswb mm1, mm0 ; pack to bytes
+ mov DWORD PTR [rdi+rdx-4], eax ; store previous four bytes
+ movd eax, mm1
+
+ add rdx, 4
+ cmp edx, dword ptr arg(5) ;cols
+ jl .acrossnextcol;
+
+ mov DWORD PTR [rdi+rdx-4], eax
+ pop rax
+
+ ; done with this row
+ add rsi,rax ; next line
+ movsxd rax, dword ptr arg(3) ;dst_pixels_per_line ; destination pitch?
+ add rdi,rax ; next destination
+ movsxd rax, dword ptr arg(2) ;src_pixels_per_line ; destination pitch?
+
+ dec rcx ; decrement count
+ jnz .nextrow ; next row
+ pop rbx
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+%undef RD
+
+
+;void vp9_mbpost_proc_down_mmx(unsigned char *dst,
+; int pitch, int rows, int cols, int flimit)
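+;
+; For each column this maintains a running sum and sum of squares over a
+; sliding 16-row window; wherever 15*sumsq - sum*sum falls below flimit the
+; pixel is replaced by (sum + pixel + vp9_rv dither) >> 4, otherwise it is
+; left unchanged.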
+extern sym(vp9_rv)
+global sym(vp9_mbpost_proc_down_mmx)
+sym(vp9_mbpost_proc_down_mmx):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 5
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ sub rsp, 136
+
+ ; unsigned char d[16][8] at [rsp]
+ ; create flimit2 at [rsp+128]
+ mov eax, dword ptr arg(4) ;flimit
+ mov [rsp+128], eax
+ mov [rsp+128+4], eax
+%define flimit2 [rsp+128]
+
+%if ABI_IS_32BIT=0
+ lea r8, [GLOBAL(sym(vp9_rv))]
+%endif
+
+ ;rows +=8;
+ add dword ptr arg(2), 8
+
+ ;for(c=0; c<cols; c+=4)
+.loop_col:
+ mov rsi, arg(0) ;s
+ pxor mm0, mm0 ;
+
+ movsxd rax, dword ptr arg(1) ;pitch ;
+ neg rax ; rax = -pitch
+
+ lea rsi, [rsi + rax*8]; ; rdi = s[-pitch*8]
+ neg rax
+
+
+ pxor mm5, mm5
+ pxor mm6, mm6 ;
+
+ pxor mm7, mm7 ;
+ mov rdi, rsi
+
+ mov rcx, 15 ;
+
+.loop_initvar:
+ movd mm1, DWORD PTR [rdi];
+ punpcklbw mm1, mm0 ;
+
+ paddw mm5, mm1 ;
+ pmullw mm1, mm1 ;
+
+ movq mm2, mm1 ;
+ punpcklwd mm1, mm0 ;
+
+ punpckhwd mm2, mm0 ;
+ paddd mm6, mm1 ;
+
+ paddd mm7, mm2 ;
+ lea rdi, [rdi+rax] ;
+
+ dec rcx
+ jne .loop_initvar
+ ;save the var and sum
+ xor rdx, rdx
+.loop_row:
+ movd mm1, DWORD PTR [rsi] ; [s-pitch*8]
+ movd mm2, DWORD PTR [rdi] ; [s+pitch*7]
+
+ punpcklbw mm1, mm0
+ punpcklbw mm2, mm0
+
+ paddw mm5, mm2
+ psubw mm5, mm1
+
+ pmullw mm2, mm2
+ movq mm4, mm2
+
+ punpcklwd mm2, mm0
+ punpckhwd mm4, mm0
+
+ paddd mm6, mm2
+ paddd mm7, mm4
+
+ pmullw mm1, mm1
+ movq mm2, mm1
+
+ punpcklwd mm1, mm0
+ psubd mm6, mm1
+
+ punpckhwd mm2, mm0
+ psubd mm7, mm2
+
+
+ movq mm3, mm6
+ pslld mm3, 4
+
+ psubd mm3, mm6
+ movq mm1, mm5
+
+ movq mm4, mm5
+ pmullw mm1, mm1
+
+ pmulhw mm4, mm4
+ movq mm2, mm1
+
+ punpcklwd mm1, mm4
+ punpckhwd mm2, mm4
+
+ movq mm4, mm7
+ pslld mm4, 4
+
+ psubd mm4, mm7
+
+ psubd mm3, mm1
+ psubd mm4, mm2
+
+ psubd mm3, flimit2
+ psubd mm4, flimit2
+
+ psrad mm3, 31
+ psrad mm4, 31
+
+ packssdw mm3, mm4
+ packsswb mm3, mm0
+
+ movd mm1, DWORD PTR [rsi+rax*8]
+
+ movq mm2, mm1
+ punpcklbw mm1, mm0
+
+ paddw mm1, mm5
+ mov rcx, rdx
+
+ and rcx, 127
+%if ABI_IS_32BIT=1 && CONFIG_PIC=1
+ push rax
+ lea rax, [GLOBAL(sym(vp9_rv))]
+ movq mm4, [rax + rcx*2] ;vp9_rv[rcx*2]
+ pop rax
+%elif ABI_IS_32BIT=0
+ movq mm4, [r8 + rcx*2] ;vp9_rv[rcx*2]
+%else
+ movq mm4, [sym(vp9_rv) + rcx*2]
+%endif
+ paddw mm1, mm4
+ ;paddw xmm1, eight8s
+ psraw mm1, 4
+
+ packuswb mm1, mm0
+ pand mm1, mm3
+
+ pandn mm3, mm2
+ por mm1, mm3
+
+ and rcx, 15
+ movd DWORD PTR [rsp+rcx*4], mm1 ;d[rcx*4]
+
+ mov rcx, rdx
+ sub rcx, 8
+
+ and rcx, 15
+ movd mm1, DWORD PTR [rsp+rcx*4] ;d[rcx*4]
+
+ movd [rsi], mm1
+ lea rsi, [rsi+rax]
+
+ lea rdi, [rdi+rax]
+ add rdx, 1
+
+ cmp edx, dword arg(2) ;rows
+ jl .loop_row
+
+
+ add dword arg(0), 4 ; s += 4
+ sub dword arg(3), 4 ; cols -= 4
+ cmp dword arg(3), 0
+ jg .loop_col
+
+ add rsp, 136
+ pop rsp
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+%undef flimit2
+
+
+;void vp9_plane_add_noise_mmx (unsigned char *Start, unsigned char *noise,
+; unsigned char blackclamp[16],
+; unsigned char whiteclamp[16],
+; unsigned char bothclamp[16],
+; unsigned int Width, unsigned int Height, int Pitch)
+extern sym(rand)
+global sym(vp9_plane_add_noise_mmx)
+sym(vp9_plane_add_noise_mmx):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 8
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+.addnoise_loop:
+ call sym(rand) WRT_PLT
+ mov rcx, arg(1) ;noise
+ and rax, 0xff
+ add rcx, rax
+
+ ; we rely on the fact that the clamping vectors are stored contiguously
+ ; in black/white/both order. Note that we have to reload this here because
+ ; rdx could be trashed by rand()
+ mov rdx, arg(2) ; blackclamp
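+ ; layout: [rdx] = blackclamp, [rdx+16] = whiteclamp, [rdx+32] = bothclamp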
+
+
+ mov rdi, rcx
+ movsxd rcx, dword arg(5) ;[Width]
+ mov rsi, arg(0) ;Pos
+ xor rax,rax
+
+.addnoise_nextset:
+ movq mm1,[rsi+rax] ; get the source
+
+ psubusb mm1, [rdx] ;blackclamp ; clamp both sides so we don't outrange adding noise
+ paddusb mm1, [rdx+32] ;bothclamp
+ psubusb mm1, [rdx+16] ;whiteclamp
+
+ movq mm2,[rdi+rax] ; get the noise for this line
+ paddb mm1,mm2 ; add it in
+ movq [rsi+rax],mm1 ; store the result
+
+ add rax,8 ; move to the next line
+
+ cmp rax, rcx
+ jl .addnoise_nextset
+
+ movsxd rax, dword arg(7) ; Pitch
+ add arg(0), rax ; Start += Pitch
+ sub dword arg(6), 1 ; Height -= 1
+ jg .addnoise_loop
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+SECTION_RODATA
+align 16
+Blur:
+ times 16 dw 16
+ times 8 dw 64
+ times 16 dw 16
+ times 8 dw 0
+
+rd:
+ times 4 dw 0x40
--- /dev/null
+++ b/vp9/common/x86/postproc_sse2.asm
@@ -1,0 +1,695 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+;void vp9_post_proc_down_and_across_xmm
+;(
+; unsigned char *src_ptr,
+; unsigned char *dst_ptr,
+; int src_pixels_per_line,
+; int dst_pixels_per_line,
+; int rows,
+; int cols,
+; int flimit
+;)
+global sym(vp9_post_proc_down_and_across_xmm)
+sym(vp9_post_proc_down_and_across_xmm):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 7
+ SAVE_XMM 7
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+%if ABI_IS_32BIT=1 && CONFIG_PIC=1
+ ALIGN_STACK 16, rax
+ ; move the global rd onto the stack, since we don't have enough registers
+ ; to do PIC addressing
+ movdqa xmm0, [GLOBAL(rd42)]
+ sub rsp, 16
+ movdqa [rsp], xmm0
+%define RD42 [rsp]
+%else
+%define RD42 [GLOBAL(rd42)]
+%endif
+
+
+ movd xmm2, dword ptr arg(6) ;flimit
+ punpcklwd xmm2, xmm2
+ punpckldq xmm2, xmm2
+ punpcklqdq xmm2, xmm2
+
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(1) ;dst_ptr
+
+ movsxd rcx, DWORD PTR arg(4) ;rows
+ movsxd rax, DWORD PTR arg(2) ;src_pixels_per_line ; destination pitch?
+ pxor xmm0, xmm0 ; mm0 = 00000000
+
+.nextrow:
+
+ xor rdx, rdx ; clear out rdx for use as loop counter
+.nextcol:
+ movq xmm3, QWORD PTR [rsi] ; mm4 = r0 p0..p7
+ punpcklbw xmm3, xmm0 ; mm3 = p0..p3
+ movdqa xmm1, xmm3 ; mm1 = p0..p3
+ psllw xmm3, 2 ;
+
+ movq xmm5, QWORD PTR [rsi + rax] ; mm4 = r1 p0..p7
+ punpcklbw xmm5, xmm0 ; mm5 = r1 p0..p3
+ paddusw xmm3, xmm5 ; mm3 += mm6
+
+ ; thresholding
+ movdqa xmm7, xmm1 ; mm7 = r0 p0..p3
+ psubusw xmm7, xmm5 ; mm7 = r0 p0..p3 - r1 p0..p3
+ psubusw xmm5, xmm1 ; mm5 = r1 p0..p3 - r0 p0..p3
+ paddusw xmm7, xmm5 ; mm7 = abs(r0 p0..p3 - r1 p0..p3)
+ pcmpgtw xmm7, xmm2
+
+ movq xmm5, QWORD PTR [rsi + 2*rax] ; mm4 = r2 p0..p7
+ punpcklbw xmm5, xmm0 ; mm5 = r2 p0..p3
+ paddusw xmm3, xmm5 ; mm3 += mm5
+
+ ; thresholding
+ movdqa xmm6, xmm1 ; mm6 = r0 p0..p3
+ psubusw xmm6, xmm5 ; mm6 = r0 p0..p3 - r2 p0..p3
+ psubusw xmm5, xmm1 ; mm5 = r2 p0..p3 - r0 p0..p3
+ paddusw xmm6, xmm5 ; mm6 = abs(r0 p0..p3 - r2 p0..p3)
+ pcmpgtw xmm6, xmm2
+ por xmm7, xmm6 ; accumulate thresholds
+
+
+ neg rax
+ movq xmm5, QWORD PTR [rsi+2*rax] ; mm4 = r-2 p0..p7
+ punpcklbw xmm5, xmm0 ; mm5 = r-2 p0..p3
+ paddusw xmm3, xmm5 ; mm3 += mm5
+
+ ; thresholding
+ movdqa xmm6, xmm1 ; mm6 = r0 p0..p3
+ psubusw xmm6, xmm5 ; mm6 = p0..p3 - r-2 p0..p3
+ psubusw xmm5, xmm1 ; mm5 = r-2 p0..p3 - p0..p3
+ paddusw xmm6, xmm5 ; mm6 = abs(r0 p0..p3 - r-2 p0..p3)
+ pcmpgtw xmm6, xmm2
+ por xmm7, xmm6 ; accumulate thresholds
+
+ movq xmm4, QWORD PTR [rsi+rax] ; mm4 = r-1 p0..p7
+ punpcklbw xmm4, xmm0 ; mm4 = r-1 p0..p3
+ paddusw xmm3, xmm4 ; mm3 += mm5
+
+ ; thresholding
+ movdqa xmm6, xmm1 ; mm6 = r0 p0..p3
+ psubusw xmm6, xmm4 ; mm6 = p0..p3 - r-2 p0..p3
+ psubusw xmm4, xmm1 ; mm5 = r-1 p0..p3 - p0..p3
+ paddusw xmm6, xmm4 ; mm6 = abs(r0 p0..p3 - r-1 p0..p3)
+ pcmpgtw xmm6, xmm2
+ por xmm7, xmm6 ; accumulate thresholds
+
+
+ paddusw xmm3, RD42 ; mm3 += round value
+ psraw xmm3, 3 ; mm3 /= 8
+
+ pand xmm1, xmm7 ; mm1 select vals > thresh from source
+ pandn xmm7, xmm3 ; mm7 select vals < thresh from blurred result
+ paddusw xmm1, xmm7 ; combination
+
+ packuswb xmm1, xmm0 ; pack to bytes
+ movq QWORD PTR [rdi], xmm1 ;
+
+ neg rax ; pitch is positive
+ add rsi, 8
+ add rdi, 8
+
+ add rdx, 8
+ cmp edx, dword arg(5) ;cols
+
+ jl .nextcol
+
+ ; done with the all cols, start the across filtering in place
+ sub rsi, rdx
+ sub rdi, rdx
+
+ xor rdx, rdx
+ movq mm0, QWORD PTR [rdi-8];
+
+.acrossnextcol:
+ movq xmm7, QWORD PTR [rdi +rdx -2]
+ movd xmm4, DWORD PTR [rdi +rdx +6]
+
+ pslldq xmm4, 8
+ por xmm4, xmm7
+
+ movdqa xmm3, xmm4
+ psrldq xmm3, 2
+ punpcklbw xmm3, xmm0 ; mm3 = p0..p3
+ movdqa xmm1, xmm3 ; mm1 = p0..p3
+ psllw xmm3, 2
+
+
+ movdqa xmm5, xmm4
+ psrldq xmm5, 3
+ punpcklbw xmm5, xmm0 ; mm5 = p1..p4
+ paddusw xmm3, xmm5 ; mm3 += mm6
+
+ ; thresholding
+ movdqa xmm7, xmm1 ; mm7 = p0..p3
+ psubusw xmm7, xmm5 ; mm7 = p0..p3 - p1..p4
+ psubusw xmm5, xmm1 ; mm5 = p1..p4 - p0..p3
+ paddusw xmm7, xmm5 ; mm7 = abs(p0..p3 - p1..p4)
+ pcmpgtw xmm7, xmm2
+
+ movdqa xmm5, xmm4
+ psrldq xmm5, 4
+ punpcklbw xmm5, xmm0 ; mm5 = p2..p5
+ paddusw xmm3, xmm5 ; mm3 += mm5
+
+ ; thresholding
+ movdqa xmm6, xmm1 ; mm6 = p0..p3
+ psubusw xmm6, xmm5 ; mm6 = p0..p3 - p1..p4
+ psubusw xmm5, xmm1 ; mm5 = p1..p4 - p0..p3
+ paddusw xmm6, xmm5 ; mm6 = abs(p0..p3 - p1..p4)
+ pcmpgtw xmm6, xmm2
+ por xmm7, xmm6 ; accumulate thresholds
+
+
+ movdqa xmm5, xmm4 ; mm5 = p-2..p5
+ punpcklbw xmm5, xmm0 ; mm5 = p-2..p1
+ paddusw xmm3, xmm5 ; mm3 += mm5
+
+ ; thresholding
+ movdqa xmm6, xmm1 ; mm6 = p0..p3
+ psubusw xmm6, xmm5 ; mm6 = p0..p3 - p1..p4
+ psubusw xmm5, xmm1 ; mm5 = p1..p4 - p0..p3
+ paddusw xmm6, xmm5 ; mm6 = abs(p0..p3 - p1..p4)
+ pcmpgtw xmm6, xmm2
+ por xmm7, xmm6 ; accumulate thresholds
+
+ psrldq xmm4, 1 ; mm4 = p-1..p5
+ punpcklbw xmm4, xmm0 ; mm4 = p-1..p2
+ paddusw xmm3, xmm4 ; mm3 += mm5
+
+ ; thresholding
+ movdqa xmm6, xmm1 ; mm6 = p0..p3
+ psubusw xmm6, xmm4 ; mm6 = p0..p3 - p1..p4
+ psubusw xmm4, xmm1 ; mm5 = p1..p4 - p0..p3
+ paddusw xmm6, xmm4 ; mm6 = abs(p0..p3 - p1..p4)
+ pcmpgtw xmm6, xmm2
+ por xmm7, xmm6 ; accumulate thresholds
+
+ paddusw xmm3, RD42 ; mm3 += round value
+ psraw xmm3, 3 ; mm3 /= 8
+
+ pand xmm1, xmm7 ; mm1 select vals > thresh from source
+ pandn xmm7, xmm3 ; mm7 select vals < thresh from blurred result
+ paddusw xmm1, xmm7 ; combination
+
+ packuswb xmm1, xmm0 ; pack to bytes
+ movq QWORD PTR [rdi+rdx-8], mm0 ; store previous four bytes
+ movdq2q mm0, xmm1
+
+ add rdx, 8
+ cmp edx, dword arg(5) ;cols
+ jl .acrossnextcol;
+
+ ; last 8 pixels
+ movq QWORD PTR [rdi+rdx-8], mm0
+
+ ; done with this row
+ add rsi,rax ; next line
+ mov eax, dword arg(3) ;dst_pixels_per_line ; destination pitch?
+ add rdi,rax ; next destination
+ mov eax, dword arg(2) ;src_pixels_per_line ; destination pitch?
+
+ dec rcx ; decrement count
+ jnz .nextrow ; next row
+
+%if ABI_IS_32BIT=1 && CONFIG_PIC=1
+ add rsp,16
+ pop rsp
+%endif
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+%undef RD42
+
+
+;void vp9_mbpost_proc_down_xmm(unsigned char *dst,
+; int pitch, int rows, int cols, int flimit)
+extern sym(vp9_rv)
+global sym(vp9_mbpost_proc_down_xmm)
+sym(vp9_mbpost_proc_down_xmm):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 5
+ SAVE_XMM 7
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ sub rsp, 128+16
+
+ ; unsigned char d[16][8] at [rsp]
+ ; create flimit2 at [rsp+128]
+ mov eax, dword ptr arg(4) ;flimit
+ mov [rsp+128], eax
+ mov [rsp+128+4], eax
+ mov [rsp+128+8], eax
+ mov [rsp+128+12], eax
+%define flimit4 [rsp+128]
+
+%if ABI_IS_32BIT=0
+ lea r8, [GLOBAL(sym(vp9_rv))]
+%endif
+
+ ;rows +=8;
+ add dword arg(2), 8
+
+ ;for(c=0; c<cols; c+=8)
+.loop_col:
+ mov rsi, arg(0) ; s
+ pxor xmm0, xmm0 ;
+
+ movsxd rax, dword ptr arg(1) ;pitch ;
+ neg rax ; rax = -pitch
+
+ lea rsi, [rsi + rax*8]; ; rdi = s[-pitch*8]
+ neg rax
+
+
+ pxor xmm5, xmm5
+ pxor xmm6, xmm6 ;
+
+ pxor xmm7, xmm7 ;
+ mov rdi, rsi
+
+ mov rcx, 15 ;
+
+.loop_initvar:
+ movq xmm1, QWORD PTR [rdi];
+ punpcklbw xmm1, xmm0 ;
+
+ paddw xmm5, xmm1 ;
+ pmullw xmm1, xmm1 ;
+
+ movdqa xmm2, xmm1 ;
+ punpcklwd xmm1, xmm0 ;
+
+ punpckhwd xmm2, xmm0 ;
+ paddd xmm6, xmm1 ;
+
+ paddd xmm7, xmm2 ;
+ lea rdi, [rdi+rax] ;
+
+ dec rcx
+ jne .loop_initvar
+ ;save the var and sum
+ xor rdx, rdx
+.loop_row:
+ movq xmm1, QWORD PTR [rsi] ; [s-pitch*8]
+ movq xmm2, QWORD PTR [rdi] ; [s+pitch*7]
+
+ punpcklbw xmm1, xmm0
+ punpcklbw xmm2, xmm0
+
+ paddw xmm5, xmm2
+ psubw xmm5, xmm1
+
+ pmullw xmm2, xmm2
+ movdqa xmm4, xmm2
+
+ punpcklwd xmm2, xmm0
+ punpckhwd xmm4, xmm0
+
+ paddd xmm6, xmm2
+ paddd xmm7, xmm4
+
+ pmullw xmm1, xmm1
+ movdqa xmm2, xmm1
+
+ punpcklwd xmm1, xmm0
+ psubd xmm6, xmm1
+
+ punpckhwd xmm2, xmm0
+ psubd xmm7, xmm2
+
+
+ movdqa xmm3, xmm6
+ pslld xmm3, 4
+
+ psubd xmm3, xmm6
+ movdqa xmm1, xmm5
+
+ movdqa xmm4, xmm5
+ pmullw xmm1, xmm1
+
+ pmulhw xmm4, xmm4
+ movdqa xmm2, xmm1
+
+ punpcklwd xmm1, xmm4
+ punpckhwd xmm2, xmm4
+
+ movdqa xmm4, xmm7
+ pslld xmm4, 4
+
+ psubd xmm4, xmm7
+
+ psubd xmm3, xmm1
+ psubd xmm4, xmm2
+
+ psubd xmm3, flimit4
+ psubd xmm4, flimit4
+
+ psrad xmm3, 31
+ psrad xmm4, 31
+
+ packssdw xmm3, xmm4
+ packsswb xmm3, xmm0
+
+ movq xmm1, QWORD PTR [rsi+rax*8]
+
+ movq xmm2, xmm1
+ punpcklbw xmm1, xmm0
+
+ paddw xmm1, xmm5
+ mov rcx, rdx
+
+ and rcx, 127
+%if ABI_IS_32BIT=1 && CONFIG_PIC=1
+ push rax
+ lea rax, [GLOBAL(sym(vp9_rv))]
+ movdqu xmm4, [rax + rcx*2] ;vp9_rv[rcx*2]
+ pop rax
+%elif ABI_IS_32BIT=0
+ movdqu xmm4, [r8 + rcx*2] ;vp9_rv[rcx*2]
+%else
+ movdqu xmm4, [sym(vp9_rv) + rcx*2]
+%endif
+
+ paddw xmm1, xmm4
+ ;paddw xmm1, eight8s
+ psraw xmm1, 4
+
+ packuswb xmm1, xmm0
+ pand xmm1, xmm3
+
+ pandn xmm3, xmm2
+ por xmm1, xmm3
+
+ and rcx, 15
+ movq QWORD PTR [rsp + rcx*8], xmm1 ;d[rcx*8]
+
+ mov rcx, rdx
+ sub rcx, 8
+
+ and rcx, 15
+ movq mm0, [rsp + rcx*8] ;d[rcx*8]
+
+ movq [rsi], mm0
+ lea rsi, [rsi+rax]
+
+ lea rdi, [rdi+rax]
+ add rdx, 1
+
+ cmp edx, dword arg(2) ;rows
+ jl .loop_row
+
+ add dword arg(0), 8 ; s += 8
+ sub dword arg(3), 8 ; cols -= 8
+ cmp dword arg(3), 0
+ jg .loop_col
+
+ add rsp, 128+16
+ pop rsp
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+%undef flimit4
+
+
+;void vp9_mbpost_proc_across_ip_xmm(unsigned char *src,
+; int pitch, int rows, int cols,int flimit)
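+;
+; In-place horizontal counterpart of the routine above (rough sketch): per row,
+; a running sum and sum-of-squares over the pixels around s[c] is maintained,
+; and s[c] is replaced by the rounded neighbourhood average
+;   s[c] = (sum + s[c] + 8) >> 4;
+; only where sumsq * 15 - sum * sum < flimit; the store lags the running
+; position so that unfiltered pixels remain available for later columns.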
+global sym(vp9_mbpost_proc_across_ip_xmm)
+sym(vp9_mbpost_proc_across_ip_xmm):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 5
+ SAVE_XMM 7
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ sub rsp, 16
+
+ ; create flimit4 at [rsp]
+ mov eax, dword ptr arg(4) ;flimit
+ mov [rsp], eax
+ mov [rsp+4], eax
+ mov [rsp+8], eax
+ mov [rsp+12], eax
+%define flimit4 [rsp]
+
+
+ ;for(r=0;r<rows;r++)
+.ip_row_loop:
+
+ xor rdx, rdx ;sumsq=0;
+ xor rcx, rcx ;sum=0;
+ mov rsi, arg(0); s
+ mov rdi, -8
+.ip_var_loop:
+ ;for(i=-8;i<=6;i++)
+ ;{
+ ; sumsq += s[i]*s[i];
+ ; sum += s[i];
+ ;}
+ movzx eax, byte [rsi+rdi]
+ add ecx, eax
+ mul al
+ add edx, eax
+ add rdi, 1
+ cmp rdi, 6
+ jle .ip_var_loop
+
+
+ ;mov rax, sumsq
+ ;movd xmm7, rax
+ movd xmm7, edx
+
+ ;mov rax, sum
+ ;movd xmm6, rax
+ movd xmm6, ecx
+
+ mov rsi, arg(0) ;s
+ xor rcx, rcx
+
+ movsxd rdx, dword arg(3) ;cols
+ add rdx, 8
+ pxor mm0, mm0
+ pxor mm1, mm1
+
+ pxor xmm0, xmm0
+.nextcol4:
+
+ movd xmm1, DWORD PTR [rsi+rcx-8] ; -8 -7 -6 -5
+ movd xmm2, DWORD PTR [rsi+rcx+7] ; +7 +8 +9 +10
+
+ punpcklbw xmm1, xmm0 ; expanding
+ punpcklbw xmm2, xmm0 ; expanding
+
+ punpcklwd xmm1, xmm0 ; expanding to dwords
+ punpcklwd xmm2, xmm0 ; expanding to dwords
+
+ psubd xmm2, xmm1 ; 7--8 8--7 9--6 10--5
+ paddd xmm1, xmm1 ; -8*2 -7*2 -6*2 -5*2
+
+ paddd xmm1, xmm2 ; 7+-8 8+-7 9+-6 10+-5
+ pmaddwd xmm1, xmm2 ; squared of 7+-8 8+-7 9+-6 10+-5
+
+ paddd xmm6, xmm2
+ paddd xmm7, xmm1
+
+ pshufd xmm6, xmm6, 0 ; duplicate the last ones
+ pshufd xmm7, xmm7, 0 ; duplicate the last ones
+
+ psrldq xmm1, 4 ; 8--7 9--6 10--5 0000
+ psrldq xmm2, 4 ; 8--7 9--6 10--5 0000
+
+ pshufd xmm3, xmm1, 3 ; 0000 8--7 8--7 8--7 squared
+ pshufd xmm4, xmm2, 3 ; 0000 8--7 8--7 8--7 squared
+
+ paddd xmm6, xmm4
+ paddd xmm7, xmm3
+
+ pshufd xmm3, xmm1, 01011111b ; 0000 0000 9--6 9--6 squared
+ pshufd xmm4, xmm2, 01011111b ; 0000 0000 9--6 9--6 squared
+
+ paddd xmm7, xmm3
+ paddd xmm6, xmm4
+
+ pshufd xmm3, xmm1, 10111111b ; 0000 0000 8--7 8--7 squared
+ pshufd xmm4, xmm2, 10111111b ; 0000 0000 8--7 8--7 squared
+
+ paddd xmm7, xmm3
+ paddd xmm6, xmm4
+
+ movdqa xmm3, xmm6
+ pmaddwd xmm3, xmm3
+
+ movdqa xmm5, xmm7
+ pslld xmm5, 4
+
+ psubd xmm5, xmm7
+ psubd xmm5, xmm3
+
+ psubd xmm5, flimit4
+ psrad xmm5, 31
+
+ packssdw xmm5, xmm0
+ packsswb xmm5, xmm0
+
+ movd xmm1, DWORD PTR [rsi+rcx]
+ movq xmm2, xmm1
+
+ punpcklbw xmm1, xmm0
+ punpcklwd xmm1, xmm0
+
+ paddd xmm1, xmm6
+ paddd xmm1, [GLOBAL(four8s)]
+
+ psrad xmm1, 4
+ packssdw xmm1, xmm0
+
+ packuswb xmm1, xmm0
+ pand xmm1, xmm5
+
+ pandn xmm5, xmm2
+ por xmm5, xmm1
+
+ movd [rsi+rcx-8], mm0
+ movq mm0, mm1
+
+ movdq2q mm1, xmm5
+ psrldq xmm7, 12
+
+ psrldq xmm6, 12
+ add rcx, 4
+
+ cmp rcx, rdx
+ jl .nextcol4
+
+ ;s+=pitch;
+ movsxd rax, dword arg(1)
+ add arg(0), rax
+
+ sub dword arg(2), 1 ;rows-=1
+ cmp dword arg(2), 0
+ jg .ip_row_loop
+
+ add rsp, 16
+ pop rsp
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+%undef flimit4
+
+
+;void vp9_plane_add_noise_wmt (unsigned char *Start, unsigned char *noise,
+; unsigned char blackclamp[16],
+; unsigned char whiteclamp[16],
+; unsigned char bothclamp[16],
+; unsigned int Width, unsigned int Height, int Pitch)
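+;
+; Roughly equivalent C for the loop below (illustrative only; clamp() stands in
+; for the psubusb/paddusb clamping done with the black/white/both vectors):
+;   for (i = 0; i < Height; i++) {
+;     char *ref = (char *)noise + (rand() & 0xff);
+;     for (j = 0; j < Width; j++) {
+;       int p = clamp(Start[j], blackclamp[0], 255 - whiteclamp[0]);
+;       Start[j] = p + ref[j];
+;     }
+;     Start += Pitch;
+;   }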
+extern sym(rand)
+global sym(vp9_plane_add_noise_wmt)
+sym(vp9_plane_add_noise_wmt):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 8
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+.addnoise_loop:
+ call sym(rand) WRT_PLT
+ mov rcx, arg(1) ;noise
+ and rax, 0xff
+ add rcx, rax
+
+ ; we rely on the fact that the clamping vectors are stored contiguously
+ ; in black/white/both order. Note that we have to reload this here because
+ ; rdx could be trashed by rand()
+ mov rdx, arg(2) ; blackclamp
+
+
+ mov rdi, rcx
+ movsxd rcx, dword arg(5) ;[Width]
+ mov rsi, arg(0) ;Pos
+ xor rax,rax
+
+.addnoise_nextset:
+ movdqu xmm1,[rsi+rax] ; get the source
+
+ psubusb xmm1, [rdx] ;blackclamp ; clamp both sides so we don't outrange adding noise
+ paddusb xmm1, [rdx+32] ;bothclamp
+ psubusb xmm1, [rdx+16] ;whiteclamp
+
+ movdqu xmm2,[rdi+rax] ; get the noise for this line
+ paddb xmm1,xmm2 ; add it in
+ movdqu [rsi+rax],xmm1 ; store the result
+
+ add rax,16 ; move to the next line
+
+ cmp rax, rcx
+ jl .addnoise_nextset
+
+ movsxd rax, dword arg(7) ; Pitch
+ add arg(0), rax ; Start += Pitch
+ sub dword arg(6), 1 ; Height -= 1
+ jg .addnoise_loop
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+SECTION_RODATA
+align 16
+rd42:
+ times 8 dw 0x04
+four8s:
+ times 4 dd 8
--- /dev/null
+++ b/vp9/common/x86/postproc_x86.h
@@ -1,0 +1,64 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef POSTPROC_X86_H
+#define POSTPROC_X86_H
+
+/* Note:
+ *
+ * This platform is commonly built for runtime CPU detection. If you modify
+ * any of the function mappings present in this file, be sure to also update
+ * them in the function pointer initialization code
+ */
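+
+/* Illustration only (the exact structure and names here are hypothetical):
+ * with CONFIG_RUNTIME_CPU_DETECT enabled the #defines below are skipped and
+ * the dispatch table is instead filled at start-up, along the lines of
+ *
+ *   if (x86_simd_caps() & HAS_SSE2) {
+ *     rtcd->postproc.down       = vp9_mbpost_proc_down_xmm;
+ *     rtcd->postproc.downacross = vp9_post_proc_down_and_across_xmm;
+ *     rtcd->postproc.addnoise   = vp9_plane_add_noise_wmt;
+ *   }
+ */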
+
+#if HAVE_MMX
+extern prototype_postproc_inplace(vp9_mbpost_proc_down_mmx);
+extern prototype_postproc(vp9_post_proc_down_and_across_mmx);
+extern prototype_postproc_addnoise(vp9_plane_add_noise_mmx);
+
+#if !CONFIG_RUNTIME_CPU_DETECT
+#undef vp9_postproc_down
+#define vp9_postproc_down vp9_mbpost_proc_down_mmx
+
+#undef vp9_postproc_downacross
+#define vp9_postproc_downacross vp9_post_proc_down_and_across_mmx
+
+#undef vp9_postproc_addnoise
+#define vp9_postproc_addnoise vp9_plane_add_noise_mmx
+
+#endif
+#endif
+
+
+#if HAVE_SSE2
+extern prototype_postproc_inplace(vp9_mbpost_proc_down_xmm);
+extern prototype_postproc_inplace(vp9_mbpost_proc_across_ip_xmm);
+extern prototype_postproc(vp9_post_proc_down_and_across_xmm);
+extern prototype_postproc_addnoise(vp9_plane_add_noise_wmt);
+
+#if !CONFIG_RUNTIME_CPU_DETECT
+#undef vp9_postproc_down
+#define vp9_postproc_down vp9_mbpost_proc_down_xmm
+
+#undef vp9_postproc_across
+#define vp9_postproc_across vp9_mbpost_proc_across_ip_xmm
+
+#undef vp9_postproc_downacross
+#define vp9_postproc_downacross vp9_post_proc_down_and_across_xmm
+
+#undef vp9_postproc_addnoise
+#define vp9_postproc_addnoise vp9_plane_add_noise_wmt
+
+
+#endif
+#endif
+
+#endif
--- /dev/null
+++ b/vp9/common/x86/recon_mmx.asm
@@ -1,0 +1,321 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+;void vp9_recon_b_mmx(unsigned char *s, short *q, unsigned char *d, int stride)
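+;
+; Reconstruction sketch (illustrative only): each destination pixel is the 4x4
+; predictor plus its dequantized residual, saturated to 8 bits:
+;   for (r = 0; r < 4; r++)
+;     for (c = 0; c < 4; c++)
+;       d[r * stride + c] = clamp(s[r * 16 + c] + q[r * 16 + c], 0, 255);
+; (s is laid out with a 16-byte pitch and q with a 16-short pitch, as the
+; offsets used below imply; clamp() is shorthand for the packuswb saturation.)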
+global sym(vp9_recon_b_mmx)
+sym(vp9_recon_b_mmx):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 4
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rsi, arg(0) ;s
+ mov rdi, arg(2) ;d
+ mov rdx, arg(1) ;q
+ movsxd rax, dword ptr arg(3) ;stride
+ pxor mm0, mm0
+
+ movd mm1, [rsi]
+ punpcklbw mm1, mm0
+ paddsw mm1, [rdx]
+ packuswb mm1, mm0 ; pack and unpack to saturate
+ movd [rdi], mm1
+
+ movd mm2, [rsi+16]
+ punpcklbw mm2, mm0
+ paddsw mm2, [rdx+32]
+ packuswb mm2, mm0 ; pack and unpack to saturate
+ movd [rdi+rax], mm2
+
+ movd mm3, [rsi+32]
+ punpcklbw mm3, mm0
+ paddsw mm3, [rdx+64]
+ packuswb mm3, mm0 ; pack and unpack to saturate
+ movd [rdi+2*rax], mm3
+
+ add rdi, rax
+ movd mm4, [rsi+48]
+ punpcklbw mm4, mm0
+ paddsw mm4, [rdx+96]
+ packuswb mm4, mm0 ; pack and unpack to saturate
+ movd [rdi+2*rax], mm4
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+;void vp9_copy_mem8x8_mmx(
+; unsigned char *src,
+; int src_stride,
+; unsigned char *dst,
+; int dst_stride
+; )
+global sym(vp9_copy_mem8x8_mmx)
+sym(vp9_copy_mem8x8_mmx):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 4
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rsi, arg(0) ;src;
+ movq mm0, [rsi]
+
+ movsxd rax, dword ptr arg(1) ;src_stride;
+ mov rdi, arg(2) ;dst;
+
+ movq mm1, [rsi+rax]
+ movq mm2, [rsi+rax*2]
+
+ movsxd rcx, dword ptr arg(3) ;dst_stride
+ lea rsi, [rsi+rax*2]
+
+ movq [rdi], mm0
+ add rsi, rax
+
+ movq [rdi+rcx], mm1
+ movq [rdi+rcx*2], mm2
+
+
+ lea rdi, [rdi+rcx*2]
+ movq mm3, [rsi]
+
+ add rdi, rcx
+ movq mm4, [rsi+rax]
+
+ movq mm5, [rsi+rax*2]
+ movq [rdi], mm3
+
+ lea rsi, [rsi+rax*2]
+ movq [rdi+rcx], mm4
+
+ movq [rdi+rcx*2], mm5
+ lea rdi, [rdi+rcx*2]
+
+ movq mm0, [rsi+rax]
+ movq mm1, [rsi+rax*2]
+
+ movq [rdi+rcx], mm0
+ movq [rdi+rcx*2],mm1
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+;void vp9_copy_mem8x4_mmx(
+; unsigned char *src,
+; int src_stride,
+; unsigned char *dst,
+; int dst_stride
+; )
+global sym(vp9_copy_mem8x4_mmx)
+sym(vp9_copy_mem8x4_mmx):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 4
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rsi, arg(0) ;src;
+ movq mm0, [rsi]
+
+ movsxd rax, dword ptr arg(1) ;src_stride;
+ mov rdi, arg(2) ;dst;
+
+ movq mm1, [rsi+rax]
+ movq mm2, [rsi+rax*2]
+
+ movsxd rcx, dword ptr arg(3) ;dst_stride
+ lea rsi, [rsi+rax*2]
+
+ movq [rdi], mm0
+ movq [rdi+rcx], mm1
+
+ movq [rdi+rcx*2], mm2
+ lea rdi, [rdi+rcx*2]
+
+ movq mm3, [rsi+rax]
+ movq [rdi+rcx], mm3
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+;void vp9_copy_mem16x16_mmx(
+; unsigned char *src,
+; int src_stride,
+; unsigned char *dst,
+; int dst_stride
+; )
+global sym(vp9_copy_mem16x16_mmx)
+sym(vp9_copy_mem16x16_mmx):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 4
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rsi, arg(0) ;src;
+ movsxd rax, dword ptr arg(1) ;src_stride;
+
+ mov rdi, arg(2) ;dst;
+ movsxd rcx, dword ptr arg(3) ;dst_stride
+
+ movq mm0, [rsi]
+ movq mm3, [rsi+8];
+
+ movq mm1, [rsi+rax]
+ movq mm4, [rsi+rax+8]
+
+ movq mm2, [rsi+rax*2]
+ movq mm5, [rsi+rax*2+8]
+
+ lea rsi, [rsi+rax*2]
+ add rsi, rax
+
+ movq [rdi], mm0
+ movq [rdi+8], mm3
+
+ movq [rdi+rcx], mm1
+ movq [rdi+rcx+8], mm4
+
+ movq [rdi+rcx*2], mm2
+ movq [rdi+rcx*2+8], mm5
+
+ lea rdi, [rdi+rcx*2]
+ add rdi, rcx
+
+ movq mm0, [rsi]
+ movq mm3, [rsi+8];
+
+ movq mm1, [rsi+rax]
+ movq mm4, [rsi+rax+8]
+
+ movq mm2, [rsi+rax*2]
+ movq mm5, [rsi+rax*2+8]
+
+ lea rsi, [rsi+rax*2]
+ add rsi, rax
+
+ movq [rdi], mm0
+ movq [rdi+8], mm3
+
+ movq [rdi+rcx], mm1
+ movq [rdi+rcx+8], mm4
+
+ movq [rdi+rcx*2], mm2
+ movq [rdi+rcx*2+8], mm5
+
+ lea rdi, [rdi+rcx*2]
+ add rdi, rcx
+
+ movq mm0, [rsi]
+ movq mm3, [rsi+8];
+
+ movq mm1, [rsi+rax]
+ movq mm4, [rsi+rax+8]
+
+ movq mm2, [rsi+rax*2]
+ movq mm5, [rsi+rax*2+8]
+
+ lea rsi, [rsi+rax*2]
+ add rsi, rax
+
+ movq [rdi], mm0
+ movq [rdi+8], mm3
+
+ movq [rdi+rcx], mm1
+ movq [rdi+rcx+8], mm4
+
+ movq [rdi+rcx*2], mm2
+ movq [rdi+rcx*2+8], mm5
+
+ lea rdi, [rdi+rcx*2]
+ add rdi, rcx
+
+ movq mm0, [rsi]
+ movq mm3, [rsi+8];
+
+ movq mm1, [rsi+rax]
+ movq mm4, [rsi+rax+8]
+
+ movq mm2, [rsi+rax*2]
+ movq mm5, [rsi+rax*2+8]
+
+ lea rsi, [rsi+rax*2]
+ add rsi, rax
+
+ movq [rdi], mm0
+ movq [rdi+8], mm3
+
+ movq [rdi+rcx], mm1
+ movq [rdi+rcx+8], mm4
+
+ movq [rdi+rcx*2], mm2
+ movq [rdi+rcx*2+8], mm5
+
+ lea rdi, [rdi+rcx*2]
+ add rdi, rcx
+
+ movq mm0, [rsi]
+ movq mm3, [rsi+8];
+
+ movq mm1, [rsi+rax]
+ movq mm4, [rsi+rax+8]
+
+ movq mm2, [rsi+rax*2]
+ movq mm5, [rsi+rax*2+8]
+
+ lea rsi, [rsi+rax*2]
+ add rsi, rax
+
+ movq [rdi], mm0
+ movq [rdi+8], mm3
+
+ movq [rdi+rcx], mm1
+ movq [rdi+rcx+8], mm4
+
+ movq [rdi+rcx*2], mm2
+ movq [rdi+rcx*2+8], mm5
+
+ lea rdi, [rdi+rcx*2]
+ add rdi, rcx
+
+ movq mm0, [rsi]
+ movq mm3, [rsi+8];
+
+ movq [rdi], mm0
+ movq [rdi+8], mm3
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ UNSHADOW_ARGS
+ pop rbp
+ ret
--- /dev/null
+++ b/vp9/common/x86/recon_sse2.asm
@@ -1,0 +1,688 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+;void vp9_recon2b_sse2(unsigned char *s, short *q, unsigned char *d, int stride)
+global sym(vp9_recon2b_sse2)
+sym(vp9_recon2b_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 4
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rsi, arg(0) ;s
+ mov rdi, arg(2) ;d
+ mov rdx, arg(1) ;q
+ movsxd rax, dword ptr arg(3) ;stride
+ pxor xmm0, xmm0
+
+ movq xmm1, MMWORD PTR [rsi]
+ punpcklbw xmm1, xmm0
+ paddsw xmm1, XMMWORD PTR [rdx]
+ packuswb xmm1, xmm0 ; pack and unpack to saturate
+ movq MMWORD PTR [rdi], xmm1
+
+
+ movq xmm2, MMWORD PTR [rsi+8]
+ punpcklbw xmm2, xmm0
+ paddsw xmm2, XMMWORD PTR [rdx+16]
+ packuswb xmm2, xmm0 ; pack and unpack to saturate
+ movq MMWORD PTR [rdi+rax], xmm2
+
+
+ movq xmm3, MMWORD PTR [rsi+16]
+ punpcklbw xmm3, xmm0
+ paddsw xmm3, XMMWORD PTR [rdx+32]
+ packuswb xmm3, xmm0 ; pack and unpack to saturate
+ movq MMWORD PTR [rdi+rax*2], xmm3
+
+ add rdi, rax
+ movq xmm4, MMWORD PTR [rsi+24]
+ punpcklbw xmm4, xmm0
+ paddsw xmm4, XMMWORD PTR [rdx+48]
+ packuswb xmm4, xmm0 ; pack and unpack to saturate
+ movq MMWORD PTR [rdi+rax*2], xmm4
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+;void vp9_recon4b_sse2(unsigned char *s, short *q, unsigned char *d, int stride)
+global sym(vp9_recon4b_sse2)
+sym(vp9_recon4b_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 4
+ SAVE_XMM 7
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rsi, arg(0) ;s
+ mov rdi, arg(2) ;d
+ mov rdx, arg(1) ;q
+ movsxd rax, dword ptr arg(3) ;stride
+ pxor xmm0, xmm0
+
+ movdqa xmm1, XMMWORD PTR [rsi]
+ movdqa xmm5, xmm1
+ punpcklbw xmm1, xmm0
+ punpckhbw xmm5, xmm0
+ paddsw xmm1, XMMWORD PTR [rdx]
+ paddsw xmm5, XMMWORD PTR [rdx+16]
+ packuswb xmm1, xmm5 ; pack and unpack to saturate
+ movdqa XMMWORD PTR [rdi], xmm1
+
+
+ movdqa xmm2, XMMWORD PTR [rsi+16]
+ movdqa xmm6, xmm2
+ punpcklbw xmm2, xmm0
+ punpckhbw xmm6, xmm0
+ paddsw xmm2, XMMWORD PTR [rdx+32]
+ paddsw xmm6, XMMWORD PTR [rdx+48]
+ packuswb xmm2, xmm6 ; pack and unpack to saturate
+ movdqa XMMWORD PTR [rdi+rax], xmm2
+
+
+ movdqa xmm3, XMMWORD PTR [rsi+32]
+ movdqa xmm7, xmm3
+ punpcklbw xmm3, xmm0
+ punpckhbw xmm7, xmm0
+ paddsw xmm3, XMMWORD PTR [rdx+64]
+ paddsw xmm7, XMMWORD PTR [rdx+80]
+ packuswb xmm3, xmm7 ; pack and unpack to saturate
+ movdqa XMMWORD PTR [rdi+rax*2], xmm3
+
+ add rdi, rax
+ movdqa xmm4, XMMWORD PTR [rsi+48]
+ movdqa xmm5, xmm4
+ punpcklbw xmm4, xmm0
+ punpckhbw xmm5, xmm0
+ paddsw xmm4, XMMWORD PTR [rdx+96]
+ paddsw xmm5, XMMWORD PTR [rdx+112]
+ packuswb xmm4, xmm5 ; pack and unpack to saturate
+ movdqa XMMWORD PTR [rdi+rax*2], xmm4
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+;void vp9_copy_mem16x16_sse2(
+; unsigned char *src,
+; int src_stride,
+; unsigned char *dst,
+; int dst_stride
+; )
+global sym(vp9_copy_mem16x16_sse2)
+sym(vp9_copy_mem16x16_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 4
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rsi, arg(0) ;src;
+ movdqu xmm0, [rsi]
+
+ movsxd rax, dword ptr arg(1) ;src_stride;
+ mov rdi, arg(2) ;dst;
+
+ movdqu xmm1, [rsi+rax]
+ movdqu xmm2, [rsi+rax*2]
+
+ movsxd rcx, dword ptr arg(3) ;dst_stride
+ lea rsi, [rsi+rax*2]
+
+ movdqa [rdi], xmm0
+ add rsi, rax
+
+ movdqa [rdi+rcx], xmm1
+ movdqa [rdi+rcx*2],xmm2
+
+ lea rdi, [rdi+rcx*2]
+ movdqu xmm3, [rsi]
+
+ add rdi, rcx
+ movdqu xmm4, [rsi+rax]
+
+ movdqu xmm5, [rsi+rax*2]
+ lea rsi, [rsi+rax*2]
+
+ movdqa [rdi], xmm3
+ add rsi, rax
+
+ movdqa [rdi+rcx], xmm4
+ movdqa [rdi+rcx*2],xmm5
+
+ lea rdi, [rdi+rcx*2]
+ movdqu xmm0, [rsi]
+
+ add rdi, rcx
+ movdqu xmm1, [rsi+rax]
+
+ movdqu xmm2, [rsi+rax*2]
+ lea rsi, [rsi+rax*2]
+
+ movdqa [rdi], xmm0
+ add rsi, rax
+
+ movdqa [rdi+rcx], xmm1
+
+ movdqa [rdi+rcx*2], xmm2
+ movdqu xmm3, [rsi]
+
+ movdqu xmm4, [rsi+rax]
+ lea rdi, [rdi+rcx*2]
+
+ add rdi, rcx
+ movdqu xmm5, [rsi+rax*2]
+
+ lea rsi, [rsi+rax*2]
+ movdqa [rdi], xmm3
+
+ add rsi, rax
+ movdqa [rdi+rcx], xmm4
+
+ movdqa [rdi+rcx*2],xmm5
+ movdqu xmm0, [rsi]
+
+ lea rdi, [rdi+rcx*2]
+ movdqu xmm1, [rsi+rax]
+
+ add rdi, rcx
+ movdqu xmm2, [rsi+rax*2]
+
+ lea rsi, [rsi+rax*2]
+ movdqa [rdi], xmm0
+
+ movdqa [rdi+rcx], xmm1
+ movdqa [rdi+rcx*2],xmm2
+
+ movdqu xmm3, [rsi+rax]
+ lea rdi, [rdi+rcx*2]
+
+ movdqa [rdi+rcx], xmm3
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+;void vp9_intra_pred_uv_dc_mmx2(
+; unsigned char *dst,
+;    int dst_stride,
+;    unsigned char *src,
+;    int src_stride
+; )
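+;
+; Sketch: the DC predictor fills the 8x8 chroma block with the rounded average
+; of the 8 pixels above and the 8 pixels to the left,
+;   dc = (sum(top[0..7]) + sum(left[0..7]) + 8) >> 4;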
+global sym(vp9_intra_pred_uv_dc_mmx2)
+sym(vp9_intra_pred_uv_dc_mmx2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 4
+ push rsi
+ push rdi
+ ; end prolog
+
+ ; from top
+ mov rsi, arg(2) ;src;
+ movsxd rax, dword ptr arg(3) ;src_stride;
+ sub rsi, rax
+ pxor mm0, mm0
+ movq mm1, [rsi]
+ psadbw mm1, mm0
+
+ ; from left
+ dec rsi
+ lea rdi, [rax*3]
+ movzx ecx, byte [rsi+rax]
+ movzx edx, byte [rsi+rax*2]
+ add ecx, edx
+ movzx edx, byte [rsi+rdi]
+ add ecx, edx
+ lea rsi, [rsi+rax*4]
+ movzx edx, byte [rsi]
+ add ecx, edx
+ movzx edx, byte [rsi+rax]
+ add ecx, edx
+ movzx edx, byte [rsi+rax*2]
+ add ecx, edx
+ movzx edx, byte [rsi+rdi]
+ add ecx, edx
+ movzx edx, byte [rsi+rax*4]
+ add ecx, edx
+
+ ; add up
+ pextrw edx, mm1, 0x0
+ lea edx, [edx+ecx+8]
+ sar edx, 4
+ movd mm1, edx
+ pshufw mm1, mm1, 0x0
+ packuswb mm1, mm1
+
+ ; write out
+ mov rdi, arg(0) ;dst;
+ movsxd rcx, dword ptr arg(1) ;dst_stride
+ lea rax, [rcx*3]
+
+ movq [rdi ], mm1
+ movq [rdi+rcx ], mm1
+ movq [rdi+rcx*2], mm1
+ movq [rdi+rax ], mm1
+ lea rdi, [rdi+rcx*4]
+ movq [rdi ], mm1
+ movq [rdi+rcx ], mm1
+ movq [rdi+rcx*2], mm1
+ movq [rdi+rax ], mm1
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+;void vp9_intra_pred_uv_dctop_mmx2(
+; unsigned char *dst,
+;    int dst_stride,
+;    unsigned char *src,
+;    int src_stride
+; )
+global sym(vp9_intra_pred_uv_dctop_mmx2)
+sym(vp9_intra_pred_uv_dctop_mmx2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 4
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ ; from top
+ mov rsi, arg(2) ;src;
+ movsxd rax, dword ptr arg(3) ;src_stride;
+ sub rsi, rax
+ pxor mm0, mm0
+ movq mm1, [rsi]
+ psadbw mm1, mm0
+
+ ; add up
+ paddw mm1, [GLOBAL(dc_4)]
+ psraw mm1, 3
+ pshufw mm1, mm1, 0x0
+ packuswb mm1, mm1
+
+ ; write out
+ mov rdi, arg(0) ;dst;
+ movsxd rcx, dword ptr arg(1) ;dst_stride
+ lea rax, [rcx*3]
+
+ movq [rdi ], mm1
+ movq [rdi+rcx ], mm1
+ movq [rdi+rcx*2], mm1
+ movq [rdi+rax ], mm1
+ lea rdi, [rdi+rcx*4]
+ movq [rdi ], mm1
+ movq [rdi+rcx ], mm1
+ movq [rdi+rcx*2], mm1
+ movq [rdi+rax ], mm1
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+;void vp9_intra_pred_uv_dcleft_mmx2(
+; unsigned char *dst,
+;    int dst_stride,
+;    unsigned char *src,
+;    int src_stride
+; )
+global sym(vp9_intra_pred_uv_dcleft_mmx2)
+sym(vp9_intra_pred_uv_dcleft_mmx2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 4
+ push rsi
+ push rdi
+ ; end prolog
+
+ ; from left
+ mov rsi, arg(2) ;src;
+ movsxd rax, dword ptr arg(3) ;src_stride;
+ dec rsi
+ lea rdi, [rax*3]
+ movzx ecx, byte [rsi]
+ movzx edx, byte [rsi+rax]
+ add ecx, edx
+ movzx edx, byte [rsi+rax*2]
+ add ecx, edx
+ movzx edx, byte [rsi+rdi]
+ add ecx, edx
+ lea rsi, [rsi+rax*4]
+ movzx edx, byte [rsi]
+ add ecx, edx
+ movzx edx, byte [rsi+rax]
+ add ecx, edx
+ movzx edx, byte [rsi+rax*2]
+ add ecx, edx
+ movzx edx, byte [rsi+rdi]
+ lea edx, [ecx+edx+4]
+
+ ; add up
+ shr edx, 3
+ movd mm1, edx
+ pshufw mm1, mm1, 0x0
+ packuswb mm1, mm1
+
+ ; write out
+ mov rdi, arg(0) ;dst;
+ movsxd rcx, dword ptr arg(1) ;dst_stride
+ lea rax, [rcx*3]
+
+ movq [rdi ], mm1
+ movq [rdi+rcx ], mm1
+ movq [rdi+rcx*2], mm1
+ movq [rdi+rax ], mm1
+ lea rdi, [rdi+rcx*4]
+ movq [rdi ], mm1
+ movq [rdi+rcx ], mm1
+ movq [rdi+rcx*2], mm1
+ movq [rdi+rax ], mm1
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+;void vp9_intra_pred_uv_dc128_mmx(
+; unsigned char *dst,
+;    int dst_stride,
+;    unsigned char *src,
+;    int src_stride
+; )
+global sym(vp9_intra_pred_uv_dc128_mmx)
+sym(vp9_intra_pred_uv_dc128_mmx):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 4
+ GET_GOT rbx
+ ; end prolog
+
+ ; write out
+ movq mm1, [GLOBAL(dc_128)]
+ mov rax, arg(0) ;dst;
+ movsxd rdx, dword ptr arg(1) ;dst_stride
+ lea rcx, [rdx*3]
+
+ movq [rax ], mm1
+ movq [rax+rdx ], mm1
+ movq [rax+rdx*2], mm1
+ movq [rax+rcx ], mm1
+ lea rax, [rax+rdx*4]
+ movq [rax ], mm1
+ movq [rax+rdx ], mm1
+ movq [rax+rdx*2], mm1
+ movq [rax+rcx ], mm1
+
+ ; begin epilog
+ RESTORE_GOT
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+;void vp9_intra_pred_uv_tm_sse2(
+; unsigned char *dst,
+;    int dst_stride,
+;    unsigned char *src,
+;    int src_stride
+; )
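+;
+; TM ("TrueMotion") prediction, as generated by the macro below (sketch):
+;   dst[r][c] = clamp(left[r] + top[c] - top_left, 0, 255);
+; the sse2 and ssse3 variants differ only in how the per-row left pixel is
+; broadcast across the vector (shuffle/unpack vs. a single pshufb).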
+%macro vp9_intra_pred_uv_tm 1
+global sym(vp9_intra_pred_uv_tm_%1)
+sym(vp9_intra_pred_uv_tm_%1):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 4
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ ; read top row
+ mov edx, 4
+ mov rsi, arg(2) ;src;
+ movsxd rax, dword ptr arg(3) ;src_stride;
+ sub rsi, rax
+ pxor xmm0, xmm0
+%ifidn %1, ssse3
+ movdqa xmm2, [GLOBAL(dc_1024)]
+%endif
+ movq xmm1, [rsi]
+ punpcklbw xmm1, xmm0
+
+        ; set up left ptrs and subtract topleft
+ movd xmm3, [rsi-1]
+ lea rsi, [rsi+rax-1]
+%ifidn %1, sse2
+ punpcklbw xmm3, xmm0
+ pshuflw xmm3, xmm3, 0x0
+ punpcklqdq xmm3, xmm3
+%else
+ pshufb xmm3, xmm2
+%endif
+ psubw xmm1, xmm3
+
+ ; set up dest ptrs
+ mov rdi, arg(0) ;dst;
+ movsxd rcx, dword ptr arg(1) ;dst_stride
+
+.vp9_intra_pred_uv_tm_%1_loop:
+ movd xmm3, [rsi]
+ movd xmm5, [rsi+rax]
+%ifidn %1, sse2
+ punpcklbw xmm3, xmm0
+ punpcklbw xmm5, xmm0
+ pshuflw xmm3, xmm3, 0x0
+ pshuflw xmm5, xmm5, 0x0
+ punpcklqdq xmm3, xmm3
+ punpcklqdq xmm5, xmm5
+%else
+ pshufb xmm3, xmm2
+ pshufb xmm5, xmm2
+%endif
+ paddw xmm3, xmm1
+ paddw xmm5, xmm1
+ packuswb xmm3, xmm5
+ movq [rdi ], xmm3
+ movhps[rdi+rcx], xmm3
+ lea rsi, [rsi+rax*2]
+ lea rdi, [rdi+rcx*2]
+ dec edx
+ jnz .vp9_intra_pred_uv_tm_%1_loop
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+%endmacro
+
+vp9_intra_pred_uv_tm sse2
+vp9_intra_pred_uv_tm ssse3
+
+;void vp9_intra_pred_uv_ve_mmx(
+; unsigned char *dst,
+;    int dst_stride,
+;    unsigned char *src,
+;    int src_stride
+; )
+global sym(vp9_intra_pred_uv_ve_mmx)
+sym(vp9_intra_pred_uv_ve_mmx):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 4
+ ; end prolog
+
+ ; read from top
+ mov rax, arg(2) ;src;
+ movsxd rdx, dword ptr arg(3) ;src_stride;
+ sub rax, rdx
+ movq mm1, [rax]
+
+ ; write out
+ mov rax, arg(0) ;dst;
+ movsxd rdx, dword ptr arg(1) ;dst_stride
+ lea rcx, [rdx*3]
+
+ movq [rax ], mm1
+ movq [rax+rdx ], mm1
+ movq [rax+rdx*2], mm1
+ movq [rax+rcx ], mm1
+ lea rax, [rax+rdx*4]
+ movq [rax ], mm1
+ movq [rax+rdx ], mm1
+ movq [rax+rdx*2], mm1
+ movq [rax+rcx ], mm1
+
+ ; begin epilog
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+;void vp9_intra_pred_uv_ho_mmx2(
+; unsigned char *dst,
+;    int dst_stride,
+;    unsigned char *src,
+;    int src_stride
+; )
+%macro vp9_intra_pred_uv_ho 1
+global sym(vp9_intra_pred_uv_ho_%1)
+sym(vp9_intra_pred_uv_ho_%1):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 4
+ push rsi
+ push rdi
+%ifidn %1, ssse3
+%ifndef GET_GOT_SAVE_ARG
+ push rbx
+%endif
+ GET_GOT rbx
+%endif
+ ; end prolog
+
+ ; read from left and write out
+%ifidn %1, mmx2
+ mov edx, 4
+%endif
+ mov rsi, arg(2) ;src;
+ movsxd rax, dword ptr arg(3) ;src_stride;
+ mov rdi, arg(0) ;dst;
+ movsxd rcx, dword ptr arg(1) ;dst_stride
+%ifidn %1, ssse3
+ lea rdx, [rcx*3]
+ movdqa xmm2, [GLOBAL(dc_00001111)]
+ lea rbx, [rax*3]
+%endif
+ dec rsi
+%ifidn %1, mmx2
+.vp9_intra_pred_uv_ho_%1_loop:
+ movd mm0, [rsi]
+ movd mm1, [rsi+rax]
+ punpcklbw mm0, mm0
+ punpcklbw mm1, mm1
+ pshufw mm0, mm0, 0x0
+ pshufw mm1, mm1, 0x0
+ movq [rdi ], mm0
+ movq [rdi+rcx], mm1
+ lea rsi, [rsi+rax*2]
+ lea rdi, [rdi+rcx*2]
+ dec edx
+ jnz .vp9_intra_pred_uv_ho_%1_loop
+%else
+ movd xmm0, [rsi]
+ movd xmm3, [rsi+rax]
+ movd xmm1, [rsi+rax*2]
+ movd xmm4, [rsi+rbx]
+ punpcklbw xmm0, xmm3
+ punpcklbw xmm1, xmm4
+ pshufb xmm0, xmm2
+ pshufb xmm1, xmm2
+ movq [rdi ], xmm0
+ movhps [rdi+rcx], xmm0
+ movq [rdi+rcx*2], xmm1
+ movhps [rdi+rdx], xmm1
+ lea rsi, [rsi+rax*4]
+ lea rdi, [rdi+rcx*4]
+ movd xmm0, [rsi]
+ movd xmm3, [rsi+rax]
+ movd xmm1, [rsi+rax*2]
+ movd xmm4, [rsi+rbx]
+ punpcklbw xmm0, xmm3
+ punpcklbw xmm1, xmm4
+ pshufb xmm0, xmm2
+ pshufb xmm1, xmm2
+ movq [rdi ], xmm0
+ movhps [rdi+rcx], xmm0
+ movq [rdi+rcx*2], xmm1
+ movhps [rdi+rdx], xmm1
+%endif
+
+ ; begin epilog
+%ifidn %1, ssse3
+ RESTORE_GOT
+%ifndef GET_GOT_SAVE_ARG
+ pop rbx
+%endif
+%endif
+ pop rdi
+ pop rsi
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+%endmacro
+
+vp9_intra_pred_uv_ho mmx2
+vp9_intra_pred_uv_ho ssse3
+
+SECTION_RODATA
+dc_128:
+ times 8 db 128
+dc_4:
+ times 4 dw 4
+align 16
+dc_1024:
+ times 8 dw 0x400
+align 16
+dc_00001111:
+ times 8 db 0
+ times 8 db 1
--- /dev/null
+++ b/vp9/common/x86/recon_wrapper_sse2.c
@@ -1,0 +1,101 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "vpx_ports/config.h"
+#include "vpx_mem/vpx_mem.h"
+#include "vp9/common/blockd.h"
+
+#define build_intra_predictors_mbuv_prototype(sym) \
+ void sym(unsigned char *dst, int dst_stride, \
+ const unsigned char *src, int src_stride)
+typedef build_intra_predictors_mbuv_prototype((*build_intra_pred_mbuv_fn_t));
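+
+/* For reference, the macro/typedef pair above expands to roughly:
+ *   typedef void (*build_intra_pred_mbuv_fn_t)(unsigned char *dst,
+ *                                              int dst_stride,
+ *                                              const unsigned char *src,
+ *                                              int src_stride);
+ */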
+
+extern build_intra_predictors_mbuv_prototype(vp9_intra_pred_uv_dc_mmx2);
+extern build_intra_predictors_mbuv_prototype(vp9_intra_pred_uv_dctop_mmx2);
+extern build_intra_predictors_mbuv_prototype(vp9_intra_pred_uv_dcleft_mmx2);
+extern build_intra_predictors_mbuv_prototype(vp9_intra_pred_uv_dc128_mmx);
+extern build_intra_predictors_mbuv_prototype(vp9_intra_pred_uv_ho_mmx2);
+extern build_intra_predictors_mbuv_prototype(vp9_intra_pred_uv_ho_ssse3);
+extern build_intra_predictors_mbuv_prototype(vp9_intra_pred_uv_ve_mmx);
+extern build_intra_predictors_mbuv_prototype(vp9_intra_pred_uv_tm_sse2);
+extern build_intra_predictors_mbuv_prototype(vp9_intra_pred_uv_tm_ssse3);
+
+static void build_intra_predictors_mbuv_x86(MACROBLOCKD *xd,
+ unsigned char *dst_u,
+ unsigned char *dst_v,
+ int dst_stride,
+ build_intra_pred_mbuv_fn_t tm_fn,
+ build_intra_pred_mbuv_fn_t ho_fn) {
+ int mode = xd->mode_info_context->mbmi.uv_mode;
+ build_intra_pred_mbuv_fn_t fn;
+ int src_stride = xd->dst.uv_stride;
+
+ switch (mode) {
+ case V_PRED:
+ fn = vp9_intra_pred_uv_ve_mmx;
+ break;
+ case H_PRED:
+ fn = ho_fn;
+ break;
+ case TM_PRED:
+ fn = tm_fn;
+ break;
+ case DC_PRED:
+ if (xd->up_available) {
+ if (xd->left_available) {
+ fn = vp9_intra_pred_uv_dc_mmx2;
+ break;
+ } else {
+ fn = vp9_intra_pred_uv_dctop_mmx2;
+ break;
+ }
+ } else if (xd->left_available) {
+ fn = vp9_intra_pred_uv_dcleft_mmx2;
+ break;
+ } else {
+ fn = vp9_intra_pred_uv_dc128_mmx;
+ break;
+ }
+ break;
+ default:
+ return;
+ }
+
+ fn(dst_u, dst_stride, xd->dst.u_buffer, src_stride);
+ fn(dst_v, dst_stride, xd->dst.v_buffer, src_stride);
+}
+
+void vp9_build_intra_predictors_mbuv_sse2(MACROBLOCKD *xd) {
+ build_intra_predictors_mbuv_x86(xd, &xd->predictor[256],
+ &xd->predictor[320], 8,
+ vp9_intra_pred_uv_tm_sse2,
+ vp9_intra_pred_uv_ho_mmx2);
+}
+
+void vp9_build_intra_predictors_mbuv_ssse3(MACROBLOCKD *xd) {
+ build_intra_predictors_mbuv_x86(xd, &xd->predictor[256],
+ &xd->predictor[320], 8,
+ vp9_intra_pred_uv_tm_ssse3,
+ vp9_intra_pred_uv_ho_ssse3);
+}
+
+void vp9_build_intra_predictors_mbuv_s_sse2(MACROBLOCKD *xd) {
+ build_intra_predictors_mbuv_x86(xd, xd->dst.u_buffer,
+ xd->dst.v_buffer, xd->dst.uv_stride,
+ vp9_intra_pred_uv_tm_sse2,
+ vp9_intra_pred_uv_ho_mmx2);
+}
+
+void vp9_build_intra_predictors_mbuv_s_ssse3(MACROBLOCKD *xd) {
+ build_intra_predictors_mbuv_x86(xd, xd->dst.u_buffer,
+ xd->dst.v_buffer, xd->dst.uv_stride,
+ vp9_intra_pred_uv_tm_ssse3,
+ vp9_intra_pred_uv_ho_ssse3);
+}
--- /dev/null
+++ b/vp9/common/x86/sadmxn_x86.c
@@ -1,0 +1,92 @@
+/*
+ * Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <emmintrin.h> // SSE2
+#include "./vpx_config.h"
+#include "./vpx_rtcd.h"
+
+
+#if CONFIG_NEWBESTREFMV
+
+
+#if HAVE_SSE2
+unsigned int vp9_sad16x3_sse2(
+ const unsigned char *src_ptr,
+ int src_stride,
+ const unsigned char *ref_ptr,
+ int ref_stride,
+ int max_sad) {
+ __m128i s0, s1, s2;
+ __m128i r0, r1, r2;
+ __m128i sad;
+
+ (void)max_sad;
+
+ s0 = _mm_loadu_si128((const __m128i *)(src_ptr + 0 * src_stride));
+ s1 = _mm_loadu_si128((const __m128i *)(src_ptr + 1 * src_stride));
+ s2 = _mm_loadu_si128((const __m128i *)(src_ptr + 2 * src_stride));
+
+ r0 = _mm_loadu_si128((const __m128i *)(ref_ptr + 0 * src_stride));
+ r1 = _mm_loadu_si128((const __m128i *)(ref_ptr + 1 * src_stride));
+ r2 = _mm_loadu_si128((const __m128i *)(ref_ptr + 2 * src_stride));
+
+ sad = _mm_sad_epu8(s0, r0);
+ sad = _mm_add_epi16(sad, _mm_sad_epu8(s1, r1));
+ sad = _mm_add_epi16(sad, _mm_sad_epu8(s2, r2));
+ sad = _mm_add_epi16(sad, _mm_srli_si128(sad, 8));
+
+ return _mm_cvtsi128_si32(sad);
+}
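+
+/* Scalar reference for the routine above (illustrative only). Note that the
+ * SSE2 code indexes the reference rows with src_stride, so it assumes the two
+ * strides are equal for this border case:
+ *
+ *   unsigned int sad16x3_c(const unsigned char *src, int src_stride,
+ *                          const unsigned char *ref, int ref_stride) {
+ *     unsigned int sad = 0;
+ *     int r, c;
+ *     for (r = 0; r < 3; r++)
+ *       for (c = 0; c < 16; c++)
+ *         sad += abs(src[r * src_stride + c] - ref[r * ref_stride + c]);
+ *     return sad;
+ *   }
+ */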
+
+unsigned int vp9_sad3x16_sse2(
+ const unsigned char *src_ptr,
+ int src_stride,
+ const unsigned char *ref_ptr,
+ int ref_stride,
+ int max_sad) {
+ int r;
+ __m128i s0, s1, s2, s3;
+ __m128i r0, r1, r2, r3;
+ __m128i sad = _mm_set1_epi16(0);
+ for (r = 0; r < 16; r += 4) {
+ s0 = _mm_cvtsi32_si128 (*(const int *)(src_ptr + 0 * src_stride));
+ s1 = _mm_cvtsi32_si128 (*(const int *)(src_ptr + 1 * src_stride));
+ s2 = _mm_cvtsi32_si128 (*(const int *)(src_ptr + 2 * src_stride));
+ s3 = _mm_cvtsi32_si128 (*(const int *)(src_ptr + 3 * src_stride));
+ r0 = _mm_cvtsi32_si128 (*(const int *)(ref_ptr + 0 * src_stride));
+ r1 = _mm_cvtsi32_si128 (*(const int *)(ref_ptr + 1 * src_stride));
+ r2 = _mm_cvtsi32_si128 (*(const int *)(ref_ptr + 2 * src_stride));
+ r3 = _mm_cvtsi32_si128 (*(const int *)(ref_ptr + 3 * src_stride));
+
+ s0 = _mm_unpacklo_epi8(s0, s1);
+ r0 = _mm_unpacklo_epi8(r0, r1);
+ s2 = _mm_unpacklo_epi8(s2, s3);
+ r2 = _mm_unpacklo_epi8(r2, r3);
+ s0 = _mm_unpacklo_epi64(s0, s2);
+ r0 = _mm_unpacklo_epi64(r0, r2);
+
+ // throw out byte 3
+ s0 = _mm_slli_epi64(s0, 16);
+ r0 = _mm_slli_epi64(r0, 16);
+
+ sad = _mm_add_epi16(sad, _mm_sad_epu8(s0, r0));
+
+ src_ptr += src_stride*4;
+ ref_ptr += ref_stride*4;
+ }
+
+ sad = _mm_add_epi16(sad, _mm_srli_si128(sad, 8));
+ return _mm_cvtsi128_si32(sad);
+}
+
+#endif
+
+
+#endif // CONFIG_NEWBESTREFMV
--- /dev/null
+++ b/vp9/common/x86/subpixel_8t_ssse3.asm
@@ -1,0 +1,550 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+;/************************************************************************************
+; Notes: the filter_block1d*_v8/h8 routines below apply an 8 tap filter to the input
+; pixels. The input pixel array has output_height rows. Each routine handles 8 or 16
+; pixels in the horizontal direction, calculating one row per iteration to take
+; advantage of the 128 bit operations.
+;
+; This is an implementation of some of the SSE optimizations first seen in ffvp8
+;
+;*************************************************************************************/
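+
+; Filter arithmetic shared by the routines in this file (sketch): the eight
+; signed taps sum to 128, products are accumulated with saturating word adds,
+; the rounding constant krd (words of 0x0040 = 64) is added, and the result is
+; shifted right by 7 before being packed back to bytes:
+;   out = clamp((p[0]*k0 + p[1]*k1 + ... + p[7]*k7 + 64) >> 7, 0, 255)
+; where p[0..7] are eight consecutive pixels along the filtering direction.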
+
+;void vp9_filter_block1d8_v8_ssse3
+;(
+; unsigned char *src_ptr,
+; unsigned int src_pitch,
+; unsigned char *output_ptr,
+; unsigned int out_pitch,
+; unsigned int output_height,
+; short *filter
+;)
+global sym(vp9_filter_block1d8_v8_ssse3)
+sym(vp9_filter_block1d8_v8_ssse3):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ push rsi
+ push rdi
+ push rbx
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ sub rsp, 16*5
+ %define k0k1 [rsp + 16*0]
+ %define k2k3 [rsp + 16*1]
+ %define k4k5 [rsp + 16*2]
+ %define k6k7 [rsp + 16*3]
+ %define krd [rsp + 16*4]
+
+ mov rdx, arg(5) ;filter ptr
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(2) ;output_ptr
+ mov rcx, 0x0400040
+
+ movdqa xmm4, [rdx] ;load filters
+ movd xmm5, rcx
+ packsswb xmm4, xmm4
+ pshuflw xmm0, xmm4, 0b ;k0_k1
+ pshuflw xmm1, xmm4, 01010101b ;k2_k3
+ pshuflw xmm2, xmm4, 10101010b ;k4_k5
+ pshuflw xmm3, xmm4, 11111111b ;k6_k7
+
+ punpcklqdq xmm0, xmm0
+ punpcklqdq xmm1, xmm1
+ punpcklqdq xmm2, xmm2
+ punpcklqdq xmm3, xmm3
+
+ movdqa k0k1, xmm0
+ movdqa k2k3, xmm1
+ pshufd xmm5, xmm5, 0
+ movdqa k4k5, xmm2
+ movdqa k6k7, xmm3
+ movdqa krd, xmm5
+
+ movsxd rdx, DWORD PTR arg(1) ;pixels_per_line
+
+%if ABI_IS_32BIT=0
+ movsxd r8, DWORD PTR arg(3) ;out_pitch
+%endif
+ mov rax, rsi
+ movsxd rcx, DWORD PTR arg(4) ;output_height
+ add rax, rdx
+
+ lea rbx, [rdx + rdx*4]
+ add rbx, rdx ;pitch * 6
+
+.vp9_filter_block1d8_v8_ssse3_loop:
+ movq xmm0, [rsi] ;A
+ movq xmm1, [rsi + rdx] ;B
+ movq xmm2, [rsi + rdx * 2] ;C
+ movq xmm3, [rax + rdx * 2] ;D
+ movq xmm4, [rsi + rdx * 4] ;E
+ movq xmm5, [rax + rdx * 4] ;F
+
+ punpcklbw xmm0, xmm1 ;A B
+ punpcklbw xmm2, xmm3 ;C D
+ punpcklbw xmm4, xmm5 ;E F
+
+ movq xmm6, [rsi + rbx] ;G
+ movq xmm7, [rax + rbx] ;H
+
+ pmaddubsw xmm0, k0k1
+ pmaddubsw xmm2, k2k3
+ punpcklbw xmm6, xmm7 ;G H
+ pmaddubsw xmm4, k4k5
+ pmaddubsw xmm6, k6k7
+
+ paddsw xmm0, xmm2
+ paddsw xmm0, krd
+ paddsw xmm4, xmm6
+ paddsw xmm0, xmm4
+
+ psraw xmm0, 7
+ packuswb xmm0, xmm0
+
+ add rsi, rdx
+ add rax, rdx
+
+ movq [rdi], xmm0
+
+%if ABI_IS_32BIT
+ add rdi, DWORD PTR arg(3) ;out_pitch
+%else
+ add rdi, r8
+%endif
+ dec rcx
+ jnz .vp9_filter_block1d8_v8_ssse3_loop
+
+ add rsp, 16*5
+ pop rsp
+ pop rbx
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+;void vp9_filter_block1d16_v8_ssse3
+;(
+; unsigned char *src_ptr,
+; unsigned int src_pitch,
+; unsigned char *output_ptr,
+; unsigned int out_pitch,
+; unsigned int output_height,
+; short *filter
+;)
+global sym(vp9_filter_block1d16_v8_ssse3)
+sym(vp9_filter_block1d16_v8_ssse3):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ push rsi
+ push rdi
+ push rbx
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ sub rsp, 16*5
+ %define k0k1 [rsp + 16*0]
+ %define k2k3 [rsp + 16*1]
+ %define k4k5 [rsp + 16*2]
+ %define k6k7 [rsp + 16*3]
+ %define krd [rsp + 16*4]
+
+ mov rdx, arg(5) ;filter ptr
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(2) ;output_ptr
+ mov rcx, 0x0400040
+
+ movdqa xmm4, [rdx] ;load filters
+ movd xmm5, rcx
+ packsswb xmm4, xmm4
+ pshuflw xmm0, xmm4, 0b ;k0_k1
+ pshuflw xmm1, xmm4, 01010101b ;k2_k3
+ pshuflw xmm2, xmm4, 10101010b ;k4_k5
+ pshuflw xmm3, xmm4, 11111111b ;k6_k7
+
+ punpcklqdq xmm0, xmm0
+ punpcklqdq xmm1, xmm1
+ punpcklqdq xmm2, xmm2
+ punpcklqdq xmm3, xmm3
+
+ movdqa k0k1, xmm0
+ movdqa k2k3, xmm1
+ pshufd xmm5, xmm5, 0
+ movdqa k4k5, xmm2
+ movdqa k6k7, xmm3
+ movdqa krd, xmm5
+
+ movsxd rdx, DWORD PTR arg(1) ;pixels_per_line
+
+%if ABI_IS_32BIT=0
+ movsxd r8, DWORD PTR arg(3) ;out_pitch
+%endif
+ mov rax, rsi
+ movsxd rcx, DWORD PTR arg(4) ;output_height
+ add rax, rdx
+
+ lea rbx, [rdx + rdx*4]
+ add rbx, rdx ;pitch * 6
+
+.vp9_filter_block1d16_v8_ssse3_loop:
+ movq xmm0, [rsi] ;A
+ movq xmm1, [rsi + rdx] ;B
+ movq xmm2, [rsi + rdx * 2] ;C
+ movq xmm3, [rax + rdx * 2] ;D
+ movq xmm4, [rsi + rdx * 4] ;E
+ movq xmm5, [rax + rdx * 4] ;F
+
+ punpcklbw xmm0, xmm1 ;A B
+ punpcklbw xmm2, xmm3 ;C D
+ punpcklbw xmm4, xmm5 ;E F
+
+ movq xmm6, [rsi + rbx] ;G
+ movq xmm7, [rax + rbx] ;H
+
+ pmaddubsw xmm0, k0k1
+ pmaddubsw xmm2, k2k3
+ punpcklbw xmm6, xmm7 ;G H
+ pmaddubsw xmm4, k4k5
+ pmaddubsw xmm6, k6k7
+
+ paddsw xmm0, xmm2
+ paddsw xmm0, krd
+ paddsw xmm4, xmm6
+ paddsw xmm0, xmm4
+
+ psraw xmm0, 7
+ packuswb xmm0, xmm0
+
+ movq [rdi], xmm0
+
+ movq xmm0, [rsi + 8] ;A
+ movq xmm1, [rsi + rdx + 8] ;B
+ movq xmm2, [rsi + rdx * 2 + 8] ;C
+ movq xmm3, [rax + rdx * 2 + 8] ;D
+ movq xmm4, [rsi + rdx * 4 + 8] ;E
+ movq xmm5, [rax + rdx * 4 + 8] ;F
+
+ punpcklbw xmm0, xmm1 ;A B
+ punpcklbw xmm2, xmm3 ;C D
+ punpcklbw xmm4, xmm5 ;E F
+
+
+ movq xmm6, [rsi + rbx + 8] ;G
+ movq xmm7, [rax + rbx + 8] ;H
+ punpcklbw xmm6, xmm7 ;G H
+
+
+ pmaddubsw xmm0, k0k1
+ pmaddubsw xmm2, k2k3
+ pmaddubsw xmm4, k4k5
+ pmaddubsw xmm6, k6k7
+
+ paddsw xmm0, xmm2
+ paddsw xmm4, xmm6
+ paddsw xmm0, krd
+ paddsw xmm0, xmm4
+
+ psraw xmm0, 7
+ packuswb xmm0, xmm0
+
+ add rsi, rdx
+ add rax, rdx
+
+ movq [rdi+8], xmm0
+
+%if ABI_IS_32BIT
+ add rdi, DWORD PTR arg(3) ;out_pitch
+%else
+ add rdi, r8
+%endif
+ dec rcx
+ jnz .vp9_filter_block1d16_v8_ssse3_loop
+
+ add rsp, 16*5
+ pop rsp
+ pop rbx
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+;void vp9_filter_block1d8_h8_ssse3
+;(
+; unsigned char *src_ptr,
+; unsigned int src_pixels_per_line,
+; unsigned char *output_ptr,
+; unsigned int output_pitch,
+; unsigned int output_height,
+; short *filter
+;)
+global sym(vp9_filter_block1d8_h8_ssse3)
+sym(vp9_filter_block1d8_h8_ssse3):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ sub rsp, 16*5
+ %define k0k1 [rsp + 16*0]
+ %define k2k3 [rsp + 16*1]
+ %define k4k5 [rsp + 16*2]
+ %define k6k7 [rsp + 16*3]
+ %define krd [rsp + 16*4]
+
+ mov rdx, arg(5) ;filter ptr
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(2) ;output_ptr
+ mov rcx, 0x0400040
+
+ movdqa xmm4, [rdx] ;load filters
+ movd xmm5, rcx
+ packsswb xmm4, xmm4
+ pshuflw xmm0, xmm4, 0b ;k0_k1
+ pshuflw xmm1, xmm4, 01010101b ;k2_k3
+ pshuflw xmm2, xmm4, 10101010b ;k4_k5
+ pshuflw xmm3, xmm4, 11111111b ;k6_k7
+
+ punpcklqdq xmm0, xmm0
+ punpcklqdq xmm1, xmm1
+ punpcklqdq xmm2, xmm2
+ punpcklqdq xmm3, xmm3
+
+ movdqa k0k1, xmm0
+ movdqa k2k3, xmm1
+ pshufd xmm5, xmm5, 0
+ movdqa k4k5, xmm2
+ movdqa k6k7, xmm3
+; movdqa krd, xmm5
+
+ movsxd rax, dword ptr arg(1) ;src_pixels_per_line
+ movsxd rdx, dword ptr arg(3) ;output_pitch
+ movsxd rcx, dword ptr arg(4) ;output_height
+
+.filter_block1d8_h8_rowloop_ssse3:
+ movq xmm0, [rsi - 3] ; -3 -2 -1 0 1 2 3 4
+
+; movq xmm3, [rsi + 4] ; 4 5 6 7 8 9 10 11
+ movq xmm3, [rsi + 5] ; 5 6 7 8 9 10 11 12
+;note: if we create a k0_k7 filter, we can save a pshufb
+; punpcklbw xmm0, xmm3 ; -3 4 -2 5 -1 6 0 7 1 8 2 9 3 10 4 11
+ punpcklqdq xmm0, xmm3
+
+ movdqa xmm1, xmm0
+ pshufb xmm0, [GLOBAL(shuf_t0t1)]
+ pmaddubsw xmm0, k0k1
+
+ movdqa xmm2, xmm1
+ pshufb xmm1, [GLOBAL(shuf_t2t3)]
+ pmaddubsw xmm1, k2k3
+
+ movdqa xmm4, xmm2
+ pshufb xmm2, [GLOBAL(shuf_t4t5)]
+ pmaddubsw xmm2, k4k5
+
+ pshufb xmm4, [GLOBAL(shuf_t6t7)]
+ pmaddubsw xmm4, k6k7
+
+ paddsw xmm0, xmm1
+ paddsw xmm0, xmm2
+ paddsw xmm0, xmm5
+ paddsw xmm0, xmm4
+ psraw xmm0, 7
+ packuswb xmm0, xmm0
+
+ lea rsi, [rsi + rax]
+ movq [rdi], xmm0
+
+ lea rdi, [rdi + rdx]
+ dec rcx
+ jnz .filter_block1d8_h8_rowloop_ssse3
+
+ add rsp, 16*5
+ pop rsp
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+;void vp9_filter_block1d16_h8_ssse3
+;(
+; unsigned char *src_ptr,
+; unsigned int src_pixels_per_line,
+; unsigned char *output_ptr,
+; unsigned int output_pitch,
+; unsigned int output_height,
+; short *filter
+;)
+global sym(vp9_filter_block1d16_h8_ssse3)
+sym(vp9_filter_block1d16_h8_ssse3):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ sub rsp, 16*5
+ %define k0k1 [rsp + 16*0]
+ %define k2k3 [rsp + 16*1]
+ %define k4k5 [rsp + 16*2]
+ %define k6k7 [rsp + 16*3]
+ %define krd [rsp + 16*4]
+
+ mov rdx, arg(5) ;filter ptr
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(2) ;output_ptr
+ mov rcx, 0x0400040
+
+ movdqa xmm4, [rdx] ;load filters
+ movd xmm5, rcx
+ packsswb xmm4, xmm4
+ pshuflw xmm0, xmm4, 0b ;k0_k1
+ pshuflw xmm1, xmm4, 01010101b ;k2_k3
+ pshuflw xmm2, xmm4, 10101010b ;k4_k5
+ pshuflw xmm3, xmm4, 11111111b ;k6_k7
+
+ punpcklqdq xmm0, xmm0
+ punpcklqdq xmm1, xmm1
+ punpcklqdq xmm2, xmm2
+ punpcklqdq xmm3, xmm3
+
+ movdqa k0k1, xmm0
+ movdqa k2k3, xmm1
+ pshufd xmm5, xmm5, 0
+ movdqa k4k5, xmm2
+ movdqa k6k7, xmm3
+ movdqa krd, xmm5
+
+ movsxd rax, dword ptr arg(1) ;src_pixels_per_line
+ movsxd rdx, dword ptr arg(3) ;output_pitch
+ movsxd rcx, dword ptr arg(4) ;output_height
+
+.filter_block1d16_h8_rowloop_ssse3:
+ movq xmm0, [rsi - 3] ; -3 -2 -1 0 1 2 3 4
+
+; movq xmm3, [rsi + 4] ; 4 5 6 7 8 9 10 11
+ movq xmm3, [rsi + 5] ; 5 6 7 8 9 10 11 12
+;note: if we create a k0_k7 filter, we can save a pshufb
+; punpcklbw xmm0, xmm3 ; -3 4 -2 5 -1 6 0 7 1 8 2 9 3 10 4 11
+ punpcklqdq xmm0, xmm3
+
+ movdqa xmm1, xmm0
+ pshufb xmm0, [GLOBAL(shuf_t0t1)]
+ pmaddubsw xmm0, k0k1
+
+ movdqa xmm2, xmm1
+ pshufb xmm1, [GLOBAL(shuf_t2t3)]
+ pmaddubsw xmm1, k2k3
+
+ movdqa xmm4, xmm2
+ pshufb xmm2, [GLOBAL(shuf_t4t5)]
+ pmaddubsw xmm2, k4k5
+
+ pshufb xmm4, [GLOBAL(shuf_t6t7)]
+ pmaddubsw xmm4, k6k7
+
+ paddsw xmm0, xmm1
+ paddsw xmm0, xmm4
+ paddsw xmm0, xmm2
+ paddsw xmm0, krd
+ psraw xmm0, 7
+ packuswb xmm0, xmm0
+
+
+ movq xmm3, [rsi + 5]
+; movq xmm7, [rsi + 12]
+ movq xmm7, [rsi + 13]
+;note: same as above
+; punpcklbw xmm3, xmm7
+ punpcklqdq xmm3, xmm7
+
+ movdqa xmm1, xmm3
+ pshufb xmm3, [GLOBAL(shuf_t0t1)]
+ pmaddubsw xmm3, k0k1
+
+ movdqa xmm2, xmm1
+ pshufb xmm1, [GLOBAL(shuf_t2t3)]
+ pmaddubsw xmm1, k2k3
+
+ movdqa xmm4, xmm2
+ pshufb xmm2, [GLOBAL(shuf_t4t5)]
+ pmaddubsw xmm2, k4k5
+
+ pshufb xmm4, [GLOBAL(shuf_t6t7)]
+ pmaddubsw xmm4, k6k7
+
+ paddsw xmm3, xmm1
+ paddsw xmm3, xmm2
+ paddsw xmm3, krd
+ paddsw xmm3, xmm4
+ psraw xmm3, 7
+ packuswb xmm3, xmm3
+ punpcklqdq xmm0, xmm3
+
+ lea rsi, [rsi + rax]
+ movdqa [rdi], xmm0
+
+ lea rdi, [rdi + rdx]
+ dec rcx
+ jnz .filter_block1d16_h8_rowloop_ssse3
+
+ add rsp, 16*5
+ pop rsp
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+SECTION_RODATA
+align 16
+shuf_t0t1:
+ db 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8
+align 16
+shuf_t2t3:
+ db 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10
+align 16
+shuf_t4t5:
+ db 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12
+align 16
+shuf_t6t7:
+ db 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14
--- /dev/null
+++ b/vp9/common/x86/subpixel_mmx.asm
@@ -1,0 +1,727 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+
+%define BLOCK_HEIGHT_WIDTH 4
+%define vp9_filter_weight 128
+%define VP9_FILTER_SHIFT 7
+
+
+;void vp9_filter_block1d_h6_mmx
+;(
+; unsigned char *src_ptr,
+; unsigned short *output_ptr,
+; unsigned int src_pixels_per_line,
+; unsigned int pixel_step,
+; unsigned int output_height,
+; unsigned int output_width,
+; short * vp9_filter
+;)
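+;
+; Per-output-pixel computation in the loop below (sketch): a 6 tap filter over
+; src[x-2..x+3], rounded by the rd constant (64) and scaled down by
+; VP9_FILTER_SHIFT:
+;   out[x] = clamp((sum over k of src[x + k - 2] * vp9_filter[k] + 64) >> 7, 0, 255)
+; the clamped result is widened back to 16-bit words in output_ptr so it can
+; feed the vertical second pass.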
+global sym(vp9_filter_block1d_h6_mmx)
+sym(vp9_filter_block1d_h6_mmx):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 7
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rdx, arg(6) ;vp9_filter
+
+ movq mm1, [rdx + 16] ; do both the negative taps first!!!
+ movq mm2, [rdx + 32] ;
+ movq mm6, [rdx + 48] ;
+ movq mm7, [rdx + 64] ;
+
+ mov rdi, arg(1) ;output_ptr
+ mov rsi, arg(0) ;src_ptr
+ movsxd rcx, dword ptr arg(4) ;output_height
+ movsxd rax, dword ptr arg(5) ;output_width ; destination pitch?
+ pxor mm0, mm0 ; mm0 = 00000000
+
+.nextrow:
+ movq mm3, [rsi-2] ; mm3 = p-2..p5
+ movq mm4, mm3 ; mm4 = p-2..p5
+ psrlq mm3, 8 ; mm3 = p-1..p5
+ punpcklbw mm3, mm0 ; mm3 = p-1..p2
+ pmullw mm3, mm1 ; mm3 *= kernel 1 modifiers.
+
+ movq mm5, mm4 ; mm5 = p-2..p5
+ punpckhbw mm4, mm0 ; mm5 = p2..p5
+ pmullw mm4, mm7 ; mm5 *= kernel 4 modifiers
+ paddsw mm3, mm4 ; mm3 += mm5
+
+ movq mm4, mm5 ; mm4 = p-2..p5;
+ psrlq mm5, 16 ; mm5 = p0..p5;
+ punpcklbw mm5, mm0 ; mm5 = p0..p3
+ pmullw mm5, mm2 ; mm5 *= kernel 2 modifiers
+ paddsw mm3, mm5 ; mm3 += mm5
+
+ movq mm5, mm4 ; mm5 = p-2..p5
+ psrlq mm4, 24 ; mm4 = p1..p5
+ punpcklbw mm4, mm0 ; mm4 = p1..p4
+ pmullw mm4, mm6 ; mm5 *= kernel 3 modifiers
+ paddsw mm3, mm4 ; mm3 += mm5
+
+ ; do outer positive taps
+ movd mm4, [rsi+3]
+ punpcklbw mm4, mm0 ; mm5 = p3..p6
+        pmullw      mm4, [rdx+80]       ; mm5 *= kernel 5 modifiers
+ paddsw mm3, mm4 ; mm3 += mm5
+
+ punpcklbw mm5, mm0 ; mm5 = p-2..p1
+        pmullw      mm5, [rdx]          ; mm5 *= kernel 0 modifiers
+ paddsw mm3, mm5 ; mm3 += mm5
+
+ paddsw mm3, [GLOBAL(rd)] ; mm3 += round value
+ psraw mm3, VP9_FILTER_SHIFT ; mm3 /= 128
+ packuswb mm3, mm0 ; pack and unpack to saturate
+ punpcklbw mm3, mm0 ;
+
+ movq [rdi], mm3 ; store the results in the destination
+
+%if ABI_IS_32BIT
+ add rsi, dword ptr arg(2) ;src_pixels_per_line ; next line
+ add rdi, rax;
+%else
+ movsxd r8, dword ptr arg(2) ;src_pixels_per_line
+ add rdi, rax;
+
+ add rsi, r8 ; next line
+%endif
+
+ dec rcx ; decrement count
+ jnz .nextrow ; next row
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+;void vp9_filter_block1dc_v6_mmx
+;(
+; short *src_ptr,
+; unsigned char *output_ptr,
+; int output_pitch,
+; unsigned int pixels_per_line,
+; unsigned int pixel_step,
+; unsigned int output_height,
+; unsigned int output_width,
+; short * vp9_filter
+;)
+global sym(vp9_filter_block1dc_v6_mmx)
+sym(vp9_filter_block1dc_v6_mmx):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 8
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ movq mm5, [GLOBAL(rd)]
+ push rbx
+ mov rbx, arg(7) ;vp9_filter
+ movq mm1, [rbx + 16] ; do both the negative taps first!!!
+ movq mm2, [rbx + 32] ;
+ movq mm6, [rbx + 48] ;
+ movq mm7, [rbx + 64] ;
+
+ movsxd rdx, dword ptr arg(3) ;pixels_per_line
+ mov rdi, arg(1) ;output_ptr
+ mov rsi, arg(0) ;src_ptr
+ sub rsi, rdx
+ sub rsi, rdx
+ movsxd rcx, DWORD PTR arg(5) ;output_height
+ movsxd rax, DWORD PTR arg(2) ;output_pitch ; destination pitch?
+ pxor mm0, mm0 ; mm0 = 00000000
+
+
+.nextrow_cv:
+ movq mm3, [rsi+rdx] ; mm3 = p0..p8 = row -1
+ pmullw mm3, mm1 ; mm3 *= kernel 1 modifiers.
+
+
+ movq mm4, [rsi + 4*rdx] ; mm4 = p0..p3 = row 2
+ pmullw mm4, mm7 ; mm4 *= kernel 4 modifiers.
+ paddsw mm3, mm4 ; mm3 += mm4
+
+ movq mm4, [rsi + 2*rdx] ; mm4 = p0..p3 = row 0
+ pmullw mm4, mm2 ; mm4 *= kernel 2 modifiers.
+ paddsw mm3, mm4 ; mm3 += mm4
+
+ movq mm4, [rsi] ; mm4 = p0..p3 = row -2
+ pmullw mm4, [rbx] ; mm4 *= kernel 0 modifiers.
+ paddsw mm3, mm4 ; mm3 += mm4
+
+
+ add rsi, rdx ; move source forward 1 line to avoid 3 * pitch
+ movq mm4, [rsi + 2*rdx] ; mm4 = p0..p3 = row 1
+ pmullw mm4, mm6 ; mm4 *= kernel 3 modifiers.
+ paddsw mm3, mm4 ; mm3 += mm4
+
+ movq mm4, [rsi + 4*rdx] ; mm4 = p0..p3 = row 3
+        pmullw      mm4, [rbx +80]      ; mm4 *= kernel 5 modifiers.
+ paddsw mm3, mm4 ; mm3 += mm4
+
+
+ paddsw mm3, mm5 ; mm3 += round value
+ psraw mm3, VP9_FILTER_SHIFT ; mm3 /= 128
+ packuswb mm3, mm0 ; pack and saturate
+
+ movd [rdi],mm3 ; store the results in the destination
+        ; the subsequent iterations repeat 3 out of 4 of these reads. Since the
+        ; recon block should be in cache this shouldn't cost much, but it is
+        ; obviously avoidable.
+ lea rdi, [rdi+rax] ;
+ dec rcx ; decrement count
+ jnz .nextrow_cv ; next row
+
+ pop rbx
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+;void vp9_bilinear_predict8x8_mmx
+;(
+; unsigned char *src_ptr,
+; int src_pixels_per_line,
+; int xoffset,
+; int yoffset,
+; unsigned char *dst_ptr,
+; int dst_pitch
+;)
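+;
+; Bilinear prediction in C terms (illustrative only): a first-order horizontal
+; filter with weights (128 - 16*xoffset, 16*xoffset) followed by the same
+; vertical filter using the yoffset weights, each stage rounding by 64 and
+; shifting right by VP9_FILTER_SHIFT:
+;   h[r][c]   = (src[r][c] * hf0 + src[r][c+1] * hf1 + 64) >> 7
+;   dst[r][c] = (h[r][c]   * vf0 + h[r+1][c]   * vf1 + 64) >> 7
+; where the weight pairs come from vp9_bilinear_filters_8x_mmx (one 32-byte
+; entry per 1/8-pel offset).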
+global sym(vp9_bilinear_predict8x8_mmx)
+sym(vp9_bilinear_predict8x8_mmx):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ ;const short *HFilter = bilinear_filters_mmx[xoffset];
+ ;const short *VFilter = bilinear_filters_mmx[yoffset];
+
+ movsxd rax, dword ptr arg(2) ;xoffset
+ mov rdi, arg(4) ;dst_ptr ;
+
+ shl rax, 5 ; offset * 32
+ lea rcx, [GLOBAL(sym(vp9_bilinear_filters_8x_mmx))]
+
+ add rax, rcx ; HFilter
+ mov rsi, arg(0) ;src_ptr ;
+
+ movsxd rdx, dword ptr arg(5) ;dst_pitch
+ movq mm1, [rax] ;
+
+ movq mm2, [rax+16] ;
+ movsxd rax, dword ptr arg(3) ;yoffset
+
+ pxor mm0, mm0 ;
+
+ shl rax, 5 ; offset*32
+ add rax, rcx ; VFilter
+
+ lea rcx, [rdi+rdx*8] ;
+ movsxd rdx, dword ptr arg(1) ;src_pixels_per_line ;
+
+
+
+ ; get the first horizontal line done ;
+ movq mm3, [rsi] ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14
+ movq mm4, mm3 ; make a copy of current line
+
+ punpcklbw mm3, mm0 ; xx 00 01 02 03 04 05 06
+ punpckhbw mm4, mm0 ;
+
+ pmullw mm3, mm1 ;
+ pmullw mm4, mm1 ;
+
+ movq mm5, [rsi+1] ;
+ movq mm6, mm5 ;
+
+ punpcklbw mm5, mm0 ;
+ punpckhbw mm6, mm0 ;
+
+ pmullw mm5, mm2 ;
+ pmullw mm6, mm2 ;
+
+ paddw mm3, mm5 ;
+ paddw mm4, mm6 ;
+
+ paddw mm3, [GLOBAL(rd)] ; xmm3 += round value
+ psraw mm3, VP9_FILTER_SHIFT ; xmm3 /= 128
+
+ paddw mm4, [GLOBAL(rd)] ;
+ psraw mm4, VP9_FILTER_SHIFT ;
+
+ movq mm7, mm3 ;
+ packuswb mm7, mm4 ;
+
+ add rsi, rdx ; next line
+.next_row_8x8:
+ movq mm3, [rsi] ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14
+ movq mm4, mm3 ; make a copy of current line
+
+ punpcklbw mm3, mm0 ; xx 00 01 02 03 04 05 06
+ punpckhbw mm4, mm0 ;
+
+ pmullw mm3, mm1 ;
+ pmullw mm4, mm1 ;
+
+ movq mm5, [rsi+1] ;
+ movq mm6, mm5 ;
+
+ punpcklbw mm5, mm0 ;
+ punpckhbw mm6, mm0 ;
+
+ pmullw mm5, mm2 ;
+ pmullw mm6, mm2 ;
+
+ paddw mm3, mm5 ;
+ paddw mm4, mm6 ;
+
+ movq mm5, mm7 ;
+ movq mm6, mm7 ;
+
+ punpcklbw mm5, mm0 ;
+ punpckhbw mm6, mm0
+
+ pmullw mm5, [rax] ;
+ pmullw mm6, [rax] ;
+
+ paddw mm3, [GLOBAL(rd)] ; xmm3 += round value
+ psraw mm3, VP9_FILTER_SHIFT ; xmm3 /= 128
+
+ paddw mm4, [GLOBAL(rd)] ;
+ psraw mm4, VP9_FILTER_SHIFT ;
+
+ movq mm7, mm3 ;
+ packuswb mm7, mm4 ;
+
+
+ pmullw mm3, [rax+16] ;
+ pmullw mm4, [rax+16] ;
+
+ paddw mm3, mm5 ;
+ paddw mm4, mm6 ;
+
+
+ paddw mm3, [GLOBAL(rd)] ; xmm3 += round value
+ psraw mm3, VP9_FILTER_SHIFT ; xmm3 /= 128
+
+ paddw mm4, [GLOBAL(rd)] ;
+ psraw mm4, VP9_FILTER_SHIFT ;
+
+ packuswb mm3, mm4
+
+ movq [rdi], mm3 ; store the results in the destination
+
+%if ABI_IS_32BIT
+ add rsi, rdx ; next line
+ add rdi, dword ptr arg(5) ;dst_pitch ;
+%else
+ movsxd r8, dword ptr arg(5) ;dst_pitch
+ add rsi, rdx ; next line
+ add rdi, r8 ;dst_pitch
+%endif
+ cmp rdi, rcx ;
+ jne .next_row_8x8
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+;void vp9_bilinear_predict8x4_mmx
+;(
+; unsigned char *src_ptr,
+; int src_pixels_per_line,
+; int xoffset,
+; int yoffset,
+; unsigned char *dst_ptr,
+; int dst_pitch
+;)
+global sym(vp9_bilinear_predict8x4_mmx)
+sym(vp9_bilinear_predict8x4_mmx):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ ;const short *HFilter = bilinear_filters_mmx[xoffset];
+ ;const short *VFilter = bilinear_filters_mmx[yoffset];
+
+ movsxd rax, dword ptr arg(2) ;xoffset
+ mov rdi, arg(4) ;dst_ptr ;
+
+ lea rcx, [GLOBAL(sym(vp9_bilinear_filters_8x_mmx))]
+ shl rax, 5
+
+ mov rsi, arg(0) ;src_ptr ;
+ add rax, rcx
+
+ movsxd rdx, dword ptr arg(5) ;dst_pitch
+ movq mm1, [rax] ;
+
+ movq mm2, [rax+16] ;
+ movsxd rax, dword ptr arg(3) ;yoffset
+
+ pxor mm0, mm0 ;
+ shl rax, 5
+
+ add rax, rcx
+ lea rcx, [rdi+rdx*4] ;
+
+ movsxd rdx, dword ptr arg(1) ;src_pixels_per_line ;
+
+ ; get the first horizontal line done ;
+ movq mm3, [rsi] ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14
+ movq mm4, mm3 ; make a copy of current line
+
+ punpcklbw mm3, mm0 ; xx 00 01 02 03 04 05 06
+ punpckhbw mm4, mm0 ;
+
+ pmullw mm3, mm1 ;
+ pmullw mm4, mm1 ;
+
+ movq mm5, [rsi+1] ;
+ movq mm6, mm5 ;
+
+ punpcklbw mm5, mm0 ;
+ punpckhbw mm6, mm0 ;
+
+ pmullw mm5, mm2 ;
+ pmullw mm6, mm2 ;
+
+ paddw mm3, mm5 ;
+ paddw mm4, mm6 ;
+
+ paddw mm3, [GLOBAL(rd)] ; xmm3 += round value
+ psraw mm3, VP9_FILTER_SHIFT ; xmm3 /= 128
+
+ paddw mm4, [GLOBAL(rd)] ;
+ psraw mm4, VP9_FILTER_SHIFT ;
+
+ movq mm7, mm3 ;
+ packuswb mm7, mm4 ;
+
+ add rsi, rdx ; next line
+.next_row_8x4:
+ movq mm3, [rsi] ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14
+ movq mm4, mm3 ; make a copy of current line
+
+ punpcklbw mm3, mm0 ; xx 00 01 02 03 04 05 06
+ punpckhbw mm4, mm0 ;
+
+ pmullw mm3, mm1 ;
+ pmullw mm4, mm1 ;
+
+ movq mm5, [rsi+1] ;
+ movq mm6, mm5 ;
+
+ punpcklbw mm5, mm0 ;
+ punpckhbw mm6, mm0 ;
+
+ pmullw mm5, mm2 ;
+ pmullw mm6, mm2 ;
+
+ paddw mm3, mm5 ;
+ paddw mm4, mm6 ;
+
+ movq mm5, mm7 ;
+ movq mm6, mm7 ;
+
+ punpcklbw mm5, mm0 ;
+ punpckhbw mm6, mm0
+
+ pmullw mm5, [rax] ;
+ pmullw mm6, [rax] ;
+
+ paddw mm3, [GLOBAL(rd)] ; xmm3 += round value
+ psraw mm3, VP9_FILTER_SHIFT ; xmm3 /= 128
+
+ paddw mm4, [GLOBAL(rd)] ;
+ psraw mm4, VP9_FILTER_SHIFT ;
+
+ movq mm7, mm3 ;
+ packuswb mm7, mm4 ;
+
+
+ pmullw mm3, [rax+16] ;
+ pmullw mm4, [rax+16] ;
+
+ paddw mm3, mm5 ;
+ paddw mm4, mm6 ;
+
+
+ paddw mm3, [GLOBAL(rd)] ; xmm3 += round value
+ psraw mm3, VP9_FILTER_SHIFT ; xmm3 /= 128
+
+ paddw mm4, [GLOBAL(rd)] ;
+ psraw mm4, VP9_FILTER_SHIFT ;
+
+ packuswb mm3, mm4
+
+ movq [rdi], mm3 ; store the results in the destination
+
+%if ABI_IS_32BIT
+ add rsi, rdx ; next line
+ add rdi, dword ptr arg(5) ;dst_pitch ;
+%else
+ movsxd r8, dword ptr arg(5) ;dst_pitch
+ add rsi, rdx ; next line
+ add rdi, r8
+%endif
+ cmp rdi, rcx ;
+ jne .next_row_8x4
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+;void vp9_bilinear_predict4x4_mmx
+;(
+; unsigned char *src_ptr,
+; int src_pixels_per_line,
+; int xoffset,
+; int yoffset,
+; unsigned char *dst_ptr,
+; int dst_pitch
+;)
+global sym(vp9_bilinear_predict4x4_mmx)
+sym(vp9_bilinear_predict4x4_mmx):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ ;const short *HFilter = bilinear_filters_mmx[xoffset];
+ ;const short *VFilter = bilinear_filters_mmx[yoffset];
+
+ movsxd rax, dword ptr arg(2) ;xoffset
+ mov rdi, arg(4) ;dst_ptr ;
+
+ lea rcx, [GLOBAL(sym(vp9_bilinear_filters_8x_mmx))]
+ shl rax, 5
+
+ add rax, rcx ; HFilter
+ mov rsi, arg(0) ;src_ptr ;
+
+        movsxd      rdx, dword ptr arg(5)   ;dst_pitch
+ movq mm1, [rax] ;
+
+ movq mm2, [rax+16] ;
+ movsxd rax, dword ptr arg(3) ;yoffset
+
+ pxor mm0, mm0 ;
+ shl rax, 5
+
+ add rax, rcx
+ lea rcx, [rdi+rdx*4] ;
+
+ movsxd rdx, dword ptr arg(1) ;src_pixels_per_line ;
+
+ ; get the first horizontal line done ;
+ movd mm3, [rsi] ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14
+ punpcklbw mm3, mm0 ; xx 00 01 02 03 04 05 06
+
+ pmullw mm3, mm1 ;
+ movd mm5, [rsi+1] ;
+
+ punpcklbw mm5, mm0 ;
+ pmullw mm5, mm2 ;
+
+ paddw mm3, mm5 ;
+ paddw mm3, [GLOBAL(rd)] ; xmm3 += round value
+
+ psraw mm3, VP9_FILTER_SHIFT ; xmm3 /= 128
+
+ movq mm7, mm3 ;
+ packuswb mm7, mm0 ;
+
+ add rsi, rdx ; next line
+.next_row_4x4:
+ movd mm3, [rsi] ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14
+ punpcklbw mm3, mm0 ; xx 00 01 02 03 04 05 06
+
+ pmullw mm3, mm1 ;
+ movd mm5, [rsi+1] ;
+
+ punpcklbw mm5, mm0 ;
+ pmullw mm5, mm2 ;
+
+ paddw mm3, mm5 ;
+
+ movq mm5, mm7 ;
+ punpcklbw mm5, mm0 ;
+
+ pmullw mm5, [rax] ;
+ paddw mm3, [GLOBAL(rd)] ; xmm3 += round value
+
+ psraw mm3, VP9_FILTER_SHIFT ; xmm3 /= 128
+ movq mm7, mm3 ;
+
+ packuswb mm7, mm0 ;
+
+ pmullw mm3, [rax+16] ;
+ paddw mm3, mm5 ;
+
+
+ paddw mm3, [GLOBAL(rd)] ; xmm3 += round value
+ psraw mm3, VP9_FILTER_SHIFT ; xmm3 /= 128
+
+ packuswb mm3, mm0
+ movd [rdi], mm3 ; store the results in the destination
+
+%if ABI_IS_32BIT
+ add rsi, rdx ; next line
+ add rdi, dword ptr arg(5) ;dst_pitch ;
+%else
+ movsxd r8, dword ptr arg(5) ;dst_pitch ;
+ add rsi, rdx ; next line
+ add rdi, r8
+%endif
+
+ cmp rdi, rcx ;
+ jne .next_row_4x4
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+
+SECTION_RODATA
+align 16
+rd:
+ times 4 dw 0x40
+
+align 16
+global HIDDEN_DATA(sym(vp9_six_tap_mmx))
+sym(vp9_six_tap_mmx):
+ times 8 dw 0
+ times 8 dw 0
+ times 8 dw 128
+ times 8 dw 0
+ times 8 dw 0
+ times 8 dw 0
+
+ times 8 dw 0
+ times 8 dw -6
+ times 8 dw 123
+ times 8 dw 12
+ times 8 dw -1
+ times 8 dw 0
+
+ times 8 dw 2
+ times 8 dw -11
+ times 8 dw 108
+ times 8 dw 36
+ times 8 dw -8
+ times 8 dw 1
+
+ times 8 dw 0
+ times 8 dw -9
+ times 8 dw 93
+ times 8 dw 50
+ times 8 dw -6
+ times 8 dw 0
+
+ times 8 dw 3
+ times 8 dw -16
+ times 8 dw 77
+ times 8 dw 77
+ times 8 dw -16
+ times 8 dw 3
+
+ times 8 dw 0
+ times 8 dw -6
+ times 8 dw 50
+ times 8 dw 93
+ times 8 dw -9
+ times 8 dw 0
+
+ times 8 dw 1
+ times 8 dw -8
+ times 8 dw 36
+ times 8 dw 108
+ times 8 dw -11
+ times 8 dw 2
+
+ times 8 dw 0
+ times 8 dw -1
+ times 8 dw 12
+ times 8 dw 123
+ times 8 dw -6
+ times 8 dw 0
+
+
+align 16
+global HIDDEN_DATA(sym(vp9_bilinear_filters_8x_mmx))
+sym(vp9_bilinear_filters_8x_mmx):
+ times 8 dw 128
+ times 8 dw 0
+
+ times 8 dw 112
+ times 8 dw 16
+
+ times 8 dw 96
+ times 8 dw 32
+
+ times 8 dw 80
+ times 8 dw 48
+
+ times 8 dw 64
+ times 8 dw 64
+
+ times 8 dw 48
+ times 8 dw 80
+
+ times 8 dw 32
+ times 8 dw 96
+
+ times 8 dw 16
+ times 8 dw 112
--- /dev/null
+++ b/vp9/common/x86/subpixel_sse2.asm
@@ -1,0 +1,1372 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+%define BLOCK_HEIGHT_WIDTH 4
+%define VP9_FILTER_WEIGHT 128
+%define VP9_FILTER_SHIFT 7
+
+
+;/************************************************************************************
+; Notes: filter_block1d_h6 applies a 6-tap filter horizontally to the input pixels. The
+; input pixel array has output_height rows. This routine assumes that output_height is an
+; even number. This function handles 8 pixels in the horizontal direction, calculating one
+; row per iteration to take advantage of the 128-bit operations.
+;*************************************************************************************/
+;void vp9_filter_block1d8_h6_sse2
+;(
+; unsigned char *src_ptr,
+; unsigned short *output_ptr,
+; unsigned int src_pixels_per_line,
+; unsigned int pixel_step,
+; unsigned int output_height,
+; unsigned int output_width,
+; short *vp9_filter
+;)
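+; A scalar sketch of what one iteration of the row loop below computes (the
+; clamp8() helper and the plain six-element tap array are illustrative; the
+; real table replicates each tap eight times as 16-bit words):
+;
+;   /* output_ptr receives 16-bit, already-saturated first-pass samples */
+;   for (int c = 0; c < 8; c++) {
+;     int sum = 0;
+;     for (int t = 0; t < 6; t++)          /* taps cover src_ptr[c-2] .. src_ptr[c+3] */
+;       sum += src_ptr[c + t - 2] * vp9_filter[t];
+;     output_ptr[c] = clamp8((sum + 64) >> 7);   /* round, shift, clamp to 0..255 */
+;   }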
+global sym(vp9_filter_block1d8_h6_sse2)
+sym(vp9_filter_block1d8_h6_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 7
+ SAVE_XMM 7
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rdx, arg(6) ;vp9_filter
+ mov rsi, arg(0) ;src_ptr
+
+ mov rdi, arg(1) ;output_ptr
+
+ movsxd rcx, dword ptr arg(4) ;output_height
+ movsxd rax, dword ptr arg(2) ;src_pixels_per_line ; Pitch for Source
+%if ABI_IS_32BIT=0
+ movsxd r8, dword ptr arg(5) ;output_width
+%endif
+ pxor xmm0, xmm0 ; clear xmm0 for unpack
+
+.filter_block1d8_h6_rowloop:
+ movq xmm3, MMWORD PTR [rsi - 2]
+ movq xmm1, MMWORD PTR [rsi + 6]
+
+ prefetcht2 [rsi+rax-2]
+
+ pslldq xmm1, 8
+ por xmm1, xmm3
+
+ movdqa xmm4, xmm1
+ movdqa xmm5, xmm1
+
+ movdqa xmm6, xmm1
+ movdqa xmm7, xmm1
+
+ punpcklbw xmm3, xmm0 ; xx05 xx04 xx03 xx02 xx01 xx01 xx-1 xx-2
+ psrldq xmm4, 1 ; xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 -1
+
+ pmullw xmm3, XMMWORD PTR [rdx] ; x[-2] * H[-2]; Tap 1
+ punpcklbw xmm4, xmm0 ; xx06 xx05 xx04 xx03 xx02 xx01 xx00 xx-1
+
+ psrldq xmm5, 2 ; xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00
+ pmullw xmm4, XMMWORD PTR [rdx+16] ; x[-1] * H[-1]; Tap 2
+
+
+ punpcklbw xmm5, xmm0 ; xx07 xx06 xx05 xx04 xx03 xx02 xx01 xx00
+ psrldq xmm6, 3 ; xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01
+
+ pmullw xmm5, [rdx+32] ; x[ 0] * H[ 0]; Tap 3
+
+ punpcklbw xmm6, xmm0 ; xx08 xx07 xx06 xx05 xx04 xx03 xx02 xx01
+ psrldq xmm7, 4 ; xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02
+
+ pmullw xmm6, [rdx+48] ; x[ 1] * h[ 1] ; Tap 4
+
+ punpcklbw xmm7, xmm0 ; xx09 xx08 xx07 xx06 xx05 xx04 xx03 xx02
+ psrldq xmm1, 5 ; xx xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03
+
+
+ pmullw xmm7, [rdx+64] ; x[ 2] * h[ 2] ; Tap 5
+
+ punpcklbw xmm1, xmm0 ; xx0a xx09 xx08 xx07 xx06 xx05 xx04 xx03
+ pmullw xmm1, [rdx+80] ; x[ 3] * h[ 3] ; Tap 6
+
+
+ paddsw xmm4, xmm7
+ paddsw xmm4, xmm5
+
+ paddsw xmm4, xmm3
+ paddsw xmm4, xmm6
+
+ paddsw xmm4, xmm1
+ paddsw xmm4, [GLOBAL(rd)]
+
+ psraw xmm4, 7
+
+ packuswb xmm4, xmm0
+ punpcklbw xmm4, xmm0
+
+ movdqa XMMWORD Ptr [rdi], xmm4
+ lea rsi, [rsi + rax]
+
+%if ABI_IS_32BIT
+ add rdi, DWORD Ptr arg(5) ;[output_width]
+%else
+ add rdi, r8
+%endif
+ dec rcx
+
+ jnz .filter_block1d8_h6_rowloop ; next row
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+;void vp9_filter_block1d16_h6_sse2
+;(
+; unsigned char *src_ptr,
+; unsigned short *output_ptr,
+; unsigned int src_pixels_per_line,
+; unsigned int pixel_step,
+; unsigned int output_height,
+; unsigned int output_width,
+; short *vp9_filter
+;)
+;/************************************************************************************
+; Notes: filter_block1d_h6 applies a 6-tap filter horizontally to the input pixels. The
+; input pixel array has output_height rows. This routine assumes that output_height is an
+; even number. This function handles 16 pixels in the horizontal direction, calculating one
+; row per iteration to take advantage of the 128-bit operations.
+;*************************************************************************************/
+global sym(vp9_filter_block1d16_h6_sse2)
+sym(vp9_filter_block1d16_h6_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 7
+ SAVE_XMM 7
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rdx, arg(6) ;vp9_filter
+ mov rsi, arg(0) ;src_ptr
+
+ mov rdi, arg(1) ;output_ptr
+
+ movsxd rcx, dword ptr arg(4) ;output_height
+ movsxd rax, dword ptr arg(2) ;src_pixels_per_line ; Pitch for Source
+%if ABI_IS_32BIT=0
+ movsxd r8, dword ptr arg(5) ;output_width
+%endif
+
+ pxor xmm0, xmm0 ; clear xmm0 for unpack
+
+.filter_block1d16_h6_sse2_rowloop:
+ movq xmm3, MMWORD PTR [rsi - 2]
+ movq xmm1, MMWORD PTR [rsi + 6]
+
+ movq xmm2, MMWORD PTR [rsi +14]
+ pslldq xmm2, 8
+
+ por xmm2, xmm1
+ prefetcht2 [rsi+rax-2]
+
+ pslldq xmm1, 8
+ por xmm1, xmm3
+
+ movdqa xmm4, xmm1
+ movdqa xmm5, xmm1
+
+ movdqa xmm6, xmm1
+ movdqa xmm7, xmm1
+
+ punpcklbw xmm3, xmm0 ; xx05 xx04 xx03 xx02 xx01 xx01 xx-1 xx-2
+ psrldq xmm4, 1 ; xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 -1
+
+ pmullw xmm3, XMMWORD PTR [rdx] ; x[-2] * H[-2]; Tap 1
+ punpcklbw xmm4, xmm0 ; xx06 xx05 xx04 xx03 xx02 xx01 xx00 xx-1
+
+ psrldq xmm5, 2 ; xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00
+ pmullw xmm4, XMMWORD PTR [rdx+16] ; x[-1] * H[-1]; Tap 2
+
+
+ punpcklbw xmm5, xmm0 ; xx07 xx06 xx05 xx04 xx03 xx02 xx01 xx00
+ psrldq xmm6, 3 ; xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01
+
+ pmullw xmm5, [rdx+32] ; x[ 0] * H[ 0]; Tap 3
+
+ punpcklbw xmm6, xmm0 ; xx08 xx07 xx06 xx05 xx04 xx03 xx02 xx01
+ psrldq xmm7, 4 ; xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02
+
+ pmullw xmm6, [rdx+48] ; x[ 1] * h[ 1] ; Tap 4
+
+ punpcklbw xmm7, xmm0 ; xx09 xx08 xx07 xx06 xx05 xx04 xx03 xx02
+ psrldq xmm1, 5 ; xx xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03
+
+
+ pmullw xmm7, [rdx+64] ; x[ 2] * h[ 2] ; Tap 5
+
+ punpcklbw xmm1, xmm0 ; xx0a xx09 xx08 xx07 xx06 xx05 xx04 xx03
+ pmullw xmm1, [rdx+80] ; x[ 3] * h[ 3] ; Tap 6
+
+ paddsw xmm4, xmm7
+ paddsw xmm4, xmm5
+
+ paddsw xmm4, xmm3
+ paddsw xmm4, xmm6
+
+ paddsw xmm4, xmm1
+ paddsw xmm4, [GLOBAL(rd)]
+
+ psraw xmm4, 7
+
+ packuswb xmm4, xmm0
+ punpcklbw xmm4, xmm0
+
+ movdqa XMMWORD Ptr [rdi], xmm4
+
+ movdqa xmm3, xmm2
+ movdqa xmm4, xmm2
+
+ movdqa xmm5, xmm2
+ movdqa xmm6, xmm2
+
+ movdqa xmm7, xmm2
+
+ punpcklbw xmm3, xmm0 ; xx05 xx04 xx03 xx02 xx01 xx01 xx-1 xx-2
+ psrldq xmm4, 1 ; xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 -1
+
+ pmullw xmm3, XMMWORD PTR [rdx] ; x[-2] * H[-2]; Tap 1
+ punpcklbw xmm4, xmm0 ; xx06 xx05 xx04 xx03 xx02 xx01 xx00 xx-1
+
+ psrldq xmm5, 2 ; xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00
+ pmullw xmm4, XMMWORD PTR [rdx+16] ; x[-1] * H[-1]; Tap 2
+
+
+ punpcklbw xmm5, xmm0 ; xx07 xx06 xx05 xx04 xx03 xx02 xx01 xx00
+ psrldq xmm6, 3 ; xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01
+
+ pmullw xmm5, [rdx+32] ; x[ 0] * H[ 0]; Tap 3
+
+ punpcklbw xmm6, xmm0 ; xx08 xx07 xx06 xx05 xx04 xx03 xx02 xx01
+ psrldq xmm7, 4 ; xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02
+
+ pmullw xmm6, [rdx+48] ; x[ 1] * h[ 1] ; Tap 4
+
+ punpcklbw xmm7, xmm0 ; xx09 xx08 xx07 xx06 xx05 xx04 xx03 xx02
+ psrldq xmm2, 5 ; xx xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03
+
+ pmullw xmm7, [rdx+64] ; x[ 2] * h[ 2] ; Tap 5
+
+ punpcklbw xmm2, xmm0 ; xx0a xx09 xx08 xx07 xx06 xx05 xx04 xx03
+ pmullw xmm2, [rdx+80] ; x[ 3] * h[ 3] ; Tap 6
+
+
+ paddsw xmm4, xmm7
+ paddsw xmm4, xmm5
+
+ paddsw xmm4, xmm3
+ paddsw xmm4, xmm6
+
+ paddsw xmm4, xmm2
+ paddsw xmm4, [GLOBAL(rd)]
+
+ psraw xmm4, 7
+
+ packuswb xmm4, xmm0
+ punpcklbw xmm4, xmm0
+
+ movdqa XMMWORD Ptr [rdi+16], xmm4
+
+ lea rsi, [rsi + rax]
+%if ABI_IS_32BIT
+ add rdi, DWORD Ptr arg(5) ;[output_width]
+%else
+ add rdi, r8
+%endif
+
+ dec rcx
+ jnz .filter_block1d16_h6_sse2_rowloop ; next row
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+;void vp9_filter_block1d8_v6_sse2
+;(
+; short *src_ptr,
+; unsigned char *output_ptr,
+; int dst_pitch,
+; unsigned int pixels_per_line,
+; unsigned int pixel_step,
+; unsigned int output_height,
+; unsigned int output_width,
+; short * vp9_filter
+;)
+;/************************************************************************************
+; Notes: filter_block1d8_v6 applies a 6-tap filter vertically to the input pixels. The
+; input pixel array has output_height rows.
+;*************************************************************************************/
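+; Scalar sketch of one output row of this second pass (row(t) stands for the
+; t-th of the six 16-bit first-pass rows that feed it; clamp8() and the plain
+; tap array are illustrative):
+;
+;   for (int c = 0; c < 8; c++) {
+;     int sum = 0;
+;     for (int t = 0; t < 6; t++)
+;       sum += row(t)[c] * vp9_filter[t];
+;     output_ptr[c] = clamp8((sum + 64) >> 7);   /* bytes written to the destination */
+;   }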
+global sym(vp9_filter_block1d8_v6_sse2)
+sym(vp9_filter_block1d8_v6_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 8
+ SAVE_XMM 7
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rax, arg(7) ;vp9_filter
+ movsxd rdx, dword ptr arg(3) ;pixels_per_line
+
+ mov rdi, arg(1) ;output_ptr
+ mov rsi, arg(0) ;src_ptr
+
+ sub rsi, rdx
+ sub rsi, rdx
+
+ movsxd rcx, DWORD PTR arg(5) ;[output_height]
+ pxor xmm0, xmm0 ; clear xmm0
+
+ movdqa xmm7, XMMWORD PTR [GLOBAL(rd)]
+%if ABI_IS_32BIT=0
+    movsxd      r8, dword ptr arg(2) ; dst_pitch
+%endif
+
+.vp9_filter_block1d8_v6_sse2_loop:
+ movdqa xmm1, XMMWORD PTR [rsi]
+ pmullw xmm1, [rax]
+
+ movdqa xmm2, XMMWORD PTR [rsi + rdx]
+ pmullw xmm2, [rax + 16]
+
+ movdqa xmm3, XMMWORD PTR [rsi + rdx * 2]
+ pmullw xmm3, [rax + 32]
+
+ movdqa xmm5, XMMWORD PTR [rsi + rdx * 4]
+ pmullw xmm5, [rax + 64]
+
+ add rsi, rdx
+ movdqa xmm4, XMMWORD PTR [rsi + rdx * 2]
+
+ pmullw xmm4, [rax + 48]
+ movdqa xmm6, XMMWORD PTR [rsi + rdx * 4]
+
+ pmullw xmm6, [rax + 80]
+
+ paddsw xmm2, xmm5
+ paddsw xmm2, xmm3
+
+ paddsw xmm2, xmm1
+ paddsw xmm2, xmm4
+
+ paddsw xmm2, xmm6
+ paddsw xmm2, xmm7
+
+ psraw xmm2, 7
+ packuswb xmm2, xmm0 ; pack and saturate
+
+ movq QWORD PTR [rdi], xmm2 ; store the results in the destination
+%if ABI_IS_32BIT
+        add     rdi,        DWORD PTR arg(2) ;[dst_pitch]
+%else
+ add rdi, r8
+%endif
+ dec rcx ; decrement count
+ jnz .vp9_filter_block1d8_v6_sse2_loop ; next row
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+;void vp9_filter_block1d16_v6_sse2
+;(
+; unsigned short *src_ptr,
+; unsigned char *output_ptr,
+; int dst_pitch,
+; unsigned int pixels_per_line,
+; unsigned int pixel_step,
+; unsigned int output_height,
+; unsigned int output_width,
+; const short *vp9_filter
+;)
+;/************************************************************************************
+; Notes: filter_block1d16_v6 applies a 6-tap filter vertically to the input pixels. The
+; input pixel array has output_height rows.
+;*************************************************************************************/
+global sym(vp9_filter_block1d16_v6_sse2)
+sym(vp9_filter_block1d16_v6_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 8
+ SAVE_XMM 7
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rax, arg(7) ;vp9_filter
+ movsxd rdx, dword ptr arg(3) ;pixels_per_line
+
+ mov rdi, arg(1) ;output_ptr
+ mov rsi, arg(0) ;src_ptr
+
+ sub rsi, rdx
+ sub rsi, rdx
+
+ movsxd rcx, DWORD PTR arg(5) ;[output_height]
+%if ABI_IS_32BIT=0
+    movsxd      r8, dword ptr arg(2) ; dst_pitch
+%endif
+
+.vp9_filter_block1d16_v6_sse2_loop:
+; The six taps are added in the order 2 5 3 1 4 6, so the rows are read in that order.
+ movdqa xmm1, XMMWORD PTR [rsi + rdx] ; line 2
+ movdqa xmm2, XMMWORD PTR [rsi + rdx + 16]
+ pmullw xmm1, [rax + 16]
+ pmullw xmm2, [rax + 16]
+
+ movdqa xmm3, XMMWORD PTR [rsi + rdx * 4] ; line 5
+ movdqa xmm4, XMMWORD PTR [rsi + rdx * 4 + 16]
+ pmullw xmm3, [rax + 64]
+ pmullw xmm4, [rax + 64]
+
+ movdqa xmm5, XMMWORD PTR [rsi + rdx * 2] ; line 3
+ movdqa xmm6, XMMWORD PTR [rsi + rdx * 2 + 16]
+ pmullw xmm5, [rax + 32]
+ pmullw xmm6, [rax + 32]
+
+ movdqa xmm7, XMMWORD PTR [rsi] ; line 1
+ movdqa xmm0, XMMWORD PTR [rsi + 16]
+ pmullw xmm7, [rax]
+ pmullw xmm0, [rax]
+
+ paddsw xmm1, xmm3
+ paddsw xmm2, xmm4
+ paddsw xmm1, xmm5
+ paddsw xmm2, xmm6
+ paddsw xmm1, xmm7
+ paddsw xmm2, xmm0
+
+ add rsi, rdx
+
+ movdqa xmm3, XMMWORD PTR [rsi + rdx * 2] ; line 4
+ movdqa xmm4, XMMWORD PTR [rsi + rdx * 2 + 16]
+ pmullw xmm3, [rax + 48]
+ pmullw xmm4, [rax + 48]
+
+ movdqa xmm5, XMMWORD PTR [rsi + rdx * 4] ; line 6
+ movdqa xmm6, XMMWORD PTR [rsi + rdx * 4 + 16]
+ pmullw xmm5, [rax + 80]
+ pmullw xmm6, [rax + 80]
+
+ movdqa xmm7, XMMWORD PTR [GLOBAL(rd)]
+ pxor xmm0, xmm0 ; clear xmm0
+
+ paddsw xmm1, xmm3
+ paddsw xmm2, xmm4
+ paddsw xmm1, xmm5
+ paddsw xmm2, xmm6
+
+ paddsw xmm1, xmm7
+ paddsw xmm2, xmm7
+
+ psraw xmm1, 7
+ psraw xmm2, 7
+
+ packuswb xmm1, xmm2 ; pack and saturate
+ movdqa XMMWORD PTR [rdi], xmm1 ; store the results in the destination
+%if ABI_IS_32BIT
+        add     rdi,        DWORD PTR arg(2) ;[dst_pitch]
+%else
+ add rdi, r8
+%endif
+ dec rcx ; decrement count
+ jnz .vp9_filter_block1d16_v6_sse2_loop ; next row
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+;void vp9_filter_block1d8_h6_only_sse2
+;(
+; unsigned char *src_ptr,
+; unsigned int src_pixels_per_line,
+; unsigned char *output_ptr,
+; int dst_pitch,
+; unsigned int output_height,
+; const short *vp9_filter
+;)
+; First-pass filter only when yoffset==0
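+; With no vertical tap to apply, this single-pass variant writes the rounded
+; horizontal result straight to the destination as bytes instead of keeping a
+; 16-bit intermediate. Scalar sketch of one row (clamp8() and the plain tap
+; array are illustrative):
+;
+;   for (int c = 0; c < 8; c++) {
+;     int sum = 0;
+;     for (int t = 0; t < 6; t++)
+;       sum += src_ptr[c + t - 2] * vp9_filter[t];
+;     output_ptr[c] = clamp8((sum + 64) >> 7);
+;   }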
+global sym(vp9_filter_block1d8_h6_only_sse2)
+sym(vp9_filter_block1d8_h6_only_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rdx, arg(5) ;vp9_filter
+ mov rsi, arg(0) ;src_ptr
+
+ mov rdi, arg(2) ;output_ptr
+
+ movsxd rcx, dword ptr arg(4) ;output_height
+ movsxd rax, dword ptr arg(1) ;src_pixels_per_line ; Pitch for Source
+%if ABI_IS_32BIT=0
+        movsxd      r8, dword ptr arg(3) ;dst_pitch
+%endif
+ pxor xmm0, xmm0 ; clear xmm0 for unpack
+
+.filter_block1d8_h6_only_rowloop:
+ movq xmm3, MMWORD PTR [rsi - 2]
+ movq xmm1, MMWORD PTR [rsi + 6]
+
+ prefetcht2 [rsi+rax-2]
+
+ pslldq xmm1, 8
+ por xmm1, xmm3
+
+ movdqa xmm4, xmm1
+ movdqa xmm5, xmm1
+
+ movdqa xmm6, xmm1
+ movdqa xmm7, xmm1
+
+ punpcklbw xmm3, xmm0 ; xx05 xx04 xx03 xx02 xx01 xx01 xx-1 xx-2
+ psrldq xmm4, 1 ; xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 -1
+
+ pmullw xmm3, XMMWORD PTR [rdx] ; x[-2] * H[-2]; Tap 1
+ punpcklbw xmm4, xmm0 ; xx06 xx05 xx04 xx03 xx02 xx01 xx00 xx-1
+
+ psrldq xmm5, 2 ; xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00
+ pmullw xmm4, XMMWORD PTR [rdx+16] ; x[-1] * H[-1]; Tap 2
+
+
+ punpcklbw xmm5, xmm0 ; xx07 xx06 xx05 xx04 xx03 xx02 xx01 xx00
+ psrldq xmm6, 3 ; xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01
+
+ pmullw xmm5, [rdx+32] ; x[ 0] * H[ 0]; Tap 3
+
+ punpcklbw xmm6, xmm0 ; xx08 xx07 xx06 xx05 xx04 xx03 xx02 xx01
+ psrldq xmm7, 4 ; xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02
+
+ pmullw xmm6, [rdx+48] ; x[ 1] * h[ 1] ; Tap 4
+
+ punpcklbw xmm7, xmm0 ; xx09 xx08 xx07 xx06 xx05 xx04 xx03 xx02
+ psrldq xmm1, 5 ; xx xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03
+
+
+ pmullw xmm7, [rdx+64] ; x[ 2] * h[ 2] ; Tap 5
+
+ punpcklbw xmm1, xmm0 ; xx0a xx09 xx08 xx07 xx06 xx05 xx04 xx03
+ pmullw xmm1, [rdx+80] ; x[ 3] * h[ 3] ; Tap 6
+
+
+ paddsw xmm4, xmm7
+ paddsw xmm4, xmm5
+
+ paddsw xmm4, xmm3
+ paddsw xmm4, xmm6
+
+ paddsw xmm4, xmm1
+ paddsw xmm4, [GLOBAL(rd)]
+
+ psraw xmm4, 7
+
+ packuswb xmm4, xmm0
+
+ movq QWORD PTR [rdi], xmm4 ; store the results in the destination
+ lea rsi, [rsi + rax]
+
+%if ABI_IS_32BIT
+        add     rdi,        DWORD Ptr arg(3) ;dst_pitch
+%else
+ add rdi, r8
+%endif
+ dec rcx
+
+ jnz .filter_block1d8_h6_only_rowloop ; next row
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+;void vp9_filter_block1d16_h6_only_sse2
+;(
+; unsigned char *src_ptr,
+; unsigned int src_pixels_per_line,
+; unsigned char *output_ptr,
+; int dst_pitch,
+; unsigned int output_height,
+; const short *vp9_filter
+;)
+; First-pass filter only when yoffset==0
+global sym(vp9_filter_block1d16_h6_only_sse2)
+sym(vp9_filter_block1d16_h6_only_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rdx, arg(5) ;vp9_filter
+ mov rsi, arg(0) ;src_ptr
+
+ mov rdi, arg(2) ;output_ptr
+
+ movsxd rcx, dword ptr arg(4) ;output_height
+ movsxd rax, dword ptr arg(1) ;src_pixels_per_line ; Pitch for Source
+%if ABI_IS_32BIT=0
+        movsxd      r8, dword ptr arg(3) ;dst_pitch
+%endif
+
+ pxor xmm0, xmm0 ; clear xmm0 for unpack
+
+.filter_block1d16_h6_only_sse2_rowloop:
+ movq xmm3, MMWORD PTR [rsi - 2]
+ movq xmm1, MMWORD PTR [rsi + 6]
+
+ movq xmm2, MMWORD PTR [rsi +14]
+ pslldq xmm2, 8
+
+ por xmm2, xmm1
+ prefetcht2 [rsi+rax-2]
+
+ pslldq xmm1, 8
+ por xmm1, xmm3
+
+ movdqa xmm4, xmm1
+ movdqa xmm5, xmm1
+
+ movdqa xmm6, xmm1
+ movdqa xmm7, xmm1
+
+ punpcklbw xmm3, xmm0 ; xx05 xx04 xx03 xx02 xx01 xx01 xx-1 xx-2
+ psrldq xmm4, 1 ; xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 -1
+
+ pmullw xmm3, XMMWORD PTR [rdx] ; x[-2] * H[-2]; Tap 1
+ punpcklbw xmm4, xmm0 ; xx06 xx05 xx04 xx03 xx02 xx01 xx00 xx-1
+
+ psrldq xmm5, 2 ; xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00
+ pmullw xmm4, XMMWORD PTR [rdx+16] ; x[-1] * H[-1]; Tap 2
+
+ punpcklbw xmm5, xmm0 ; xx07 xx06 xx05 xx04 xx03 xx02 xx01 xx00
+ psrldq xmm6, 3 ; xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01
+
+ pmullw xmm5, [rdx+32] ; x[ 0] * H[ 0]; Tap 3
+
+ punpcklbw xmm6, xmm0 ; xx08 xx07 xx06 xx05 xx04 xx03 xx02 xx01
+ psrldq xmm7, 4 ; xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02
+
+ pmullw xmm6, [rdx+48] ; x[ 1] * h[ 1] ; Tap 4
+
+ punpcklbw xmm7, xmm0 ; xx09 xx08 xx07 xx06 xx05 xx04 xx03 xx02
+ psrldq xmm1, 5 ; xx xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03
+
+ pmullw xmm7, [rdx+64] ; x[ 2] * h[ 2] ; Tap 5
+
+ punpcklbw xmm1, xmm0 ; xx0a xx09 xx08 xx07 xx06 xx05 xx04 xx03
+ pmullw xmm1, [rdx+80] ; x[ 3] * h[ 3] ; Tap 6
+
+ paddsw xmm4, xmm7
+ paddsw xmm4, xmm5
+
+ paddsw xmm4, xmm3
+ paddsw xmm4, xmm6
+
+ paddsw xmm4, xmm1
+ paddsw xmm4, [GLOBAL(rd)]
+
+ psraw xmm4, 7
+
+ packuswb xmm4, xmm0 ; lower 8 bytes
+
+ movq QWORD Ptr [rdi], xmm4 ; store the results in the destination
+
+ movdqa xmm3, xmm2
+ movdqa xmm4, xmm2
+
+ movdqa xmm5, xmm2
+ movdqa xmm6, xmm2
+
+ movdqa xmm7, xmm2
+
+ punpcklbw xmm3, xmm0 ; xx05 xx04 xx03 xx02 xx01 xx01 xx-1 xx-2
+ psrldq xmm4, 1 ; xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 -1
+
+ pmullw xmm3, XMMWORD PTR [rdx] ; x[-2] * H[-2]; Tap 1
+ punpcklbw xmm4, xmm0 ; xx06 xx05 xx04 xx03 xx02 xx01 xx00 xx-1
+
+ psrldq xmm5, 2 ; xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00
+ pmullw xmm4, XMMWORD PTR [rdx+16] ; x[-1] * H[-1]; Tap 2
+
+ punpcklbw xmm5, xmm0 ; xx07 xx06 xx05 xx04 xx03 xx02 xx01 xx00
+ psrldq xmm6, 3 ; xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01
+
+ pmullw xmm5, [rdx+32] ; x[ 0] * H[ 0]; Tap 3
+
+ punpcklbw xmm6, xmm0 ; xx08 xx07 xx06 xx05 xx04 xx03 xx02 xx01
+ psrldq xmm7, 4 ; xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02
+
+ pmullw xmm6, [rdx+48] ; x[ 1] * h[ 1] ; Tap 4
+
+ punpcklbw xmm7, xmm0 ; xx09 xx08 xx07 xx06 xx05 xx04 xx03 xx02
+ psrldq xmm2, 5 ; xx xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03
+
+ pmullw xmm7, [rdx+64] ; x[ 2] * h[ 2] ; Tap 5
+
+ punpcklbw xmm2, xmm0 ; xx0a xx09 xx08 xx07 xx06 xx05 xx04 xx03
+ pmullw xmm2, [rdx+80] ; x[ 3] * h[ 3] ; Tap 6
+
+ paddsw xmm4, xmm7
+ paddsw xmm4, xmm5
+
+ paddsw xmm4, xmm3
+ paddsw xmm4, xmm6
+
+ paddsw xmm4, xmm2
+ paddsw xmm4, [GLOBAL(rd)]
+
+ psraw xmm4, 7
+
+ packuswb xmm4, xmm0 ; higher 8 bytes
+
+ movq QWORD Ptr [rdi+8], xmm4 ; store the results in the destination
+
+ lea rsi, [rsi + rax]
+%if ABI_IS_32BIT
+        add     rdi,        DWORD Ptr arg(3) ;dst_pitch
+%else
+ add rdi, r8
+%endif
+
+ dec rcx
+ jnz .filter_block1d16_h6_only_sse2_rowloop ; next row
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+;void vp9_filter_block1d8_v6_only_sse2
+;(
+; unsigned char *src_ptr,
+; unsigned int src_pixels_per_line,
+; unsigned char *output_ptr,
+; int dst_pitch,
+; unsigned int output_height,
+; const short *vp9_filter
+;)
+; Second-pass filter only when xoffset==0
+global sym(vp9_filter_block1d8_v6_only_sse2)
+sym(vp9_filter_block1d8_v6_only_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(2) ;output_ptr
+
+ movsxd rcx, dword ptr arg(4) ;output_height
+ movsxd rdx, dword ptr arg(1) ;src_pixels_per_line
+
+ mov rax, arg(5) ;vp9_filter
+
+ pxor xmm0, xmm0 ; clear xmm0
+
+ movdqa xmm7, XMMWORD PTR [GLOBAL(rd)]
+%if ABI_IS_32BIT=0
+        movsxd      r8, dword ptr arg(3) ; dst_pitch
+%endif
+
+.vp9_filter_block1d8_v6_only_sse2_loop:
+ movq xmm1, MMWORD PTR [rsi]
+ movq xmm2, MMWORD PTR [rsi + rdx]
+ movq xmm3, MMWORD PTR [rsi + rdx * 2]
+ movq xmm5, MMWORD PTR [rsi + rdx * 4]
+ add rsi, rdx
+ movq xmm4, MMWORD PTR [rsi + rdx * 2]
+ movq xmm6, MMWORD PTR [rsi + rdx * 4]
+
+ punpcklbw xmm1, xmm0
+ pmullw xmm1, [rax]
+
+ punpcklbw xmm2, xmm0
+ pmullw xmm2, [rax + 16]
+
+ punpcklbw xmm3, xmm0
+ pmullw xmm3, [rax + 32]
+
+ punpcklbw xmm5, xmm0
+ pmullw xmm5, [rax + 64]
+
+ punpcklbw xmm4, xmm0
+ pmullw xmm4, [rax + 48]
+
+ punpcklbw xmm6, xmm0
+ pmullw xmm6, [rax + 80]
+
+ paddsw xmm2, xmm5
+ paddsw xmm2, xmm3
+
+ paddsw xmm2, xmm1
+ paddsw xmm2, xmm4
+
+ paddsw xmm2, xmm6
+ paddsw xmm2, xmm7
+
+ psraw xmm2, 7
+ packuswb xmm2, xmm0 ; pack and saturate
+
+ movq QWORD PTR [rdi], xmm2 ; store the results in the destination
+%if ABI_IS_32BIT
+        add     rdi,        DWORD PTR arg(3) ;[dst_pitch]
+%else
+ add rdi, r8
+%endif
+ dec rcx ; decrement count
+ jnz .vp9_filter_block1d8_v6_only_sse2_loop ; next row
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+;void vp9_unpack_block1d16_h6_sse2
+;(
+; unsigned char *src_ptr,
+; unsigned short *output_ptr,
+; unsigned int src_pixels_per_line,
+; unsigned int output_height,
+; unsigned int output_width
+;)
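+; This routine does no filtering; it only widens 16 bytes per row to 16-bit
+; words so the vertical second pass can consume them. Roughly (treating
+; output_width as a byte pitch, which is how the loop below advances rdi):
+;
+;   for (int r = 0; r < output_height; r++) {
+;     unsigned short *out = (unsigned short *)((unsigned char *)output_ptr +
+;                                              r * output_width);
+;     for (int c = 0; c < 16; c++)
+;       out[c] = src_ptr[r * src_pixels_per_line + c];
+;   }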
+global sym(vp9_unpack_block1d16_h6_sse2)
+sym(vp9_unpack_block1d16_h6_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 5
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(1) ;output_ptr
+
+ movsxd rcx, dword ptr arg(3) ;output_height
+ movsxd rax, dword ptr arg(2) ;src_pixels_per_line ; Pitch for Source
+
+ pxor xmm0, xmm0 ; clear xmm0 for unpack
+%if ABI_IS_32BIT=0
+ movsxd r8, dword ptr arg(4) ;output_width ; Pitch for Source
+%endif
+
+.unpack_block1d16_h6_sse2_rowloop:
+        movq        xmm1,       MMWORD PTR [rsi]            ; 07 06 05 04 03 02 01 00
+        movq        xmm3,       MMWORD PTR [rsi+8]          ; 0f 0e 0d 0c 0b 0a 09 08
+
+        punpcklbw   xmm3,       xmm0                        ; xx0f xx0e xx0d xx0c xx0b xx0a xx09 xx08
+        punpcklbw   xmm1,       xmm0                        ; xx07 xx06 xx05 xx04 xx03 xx02 xx01 xx00
+
+ movdqa XMMWORD Ptr [rdi], xmm1
+ movdqa XMMWORD Ptr [rdi + 16], xmm3
+
+ lea rsi, [rsi + rax]
+%if ABI_IS_32BIT
+ add rdi, DWORD Ptr arg(4) ;[output_width]
+%else
+ add rdi, r8
+%endif
+ dec rcx
+ jnz .unpack_block1d16_h6_sse2_rowloop ; next row
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+;void vp9_bilinear_predict16x16_sse2
+;(
+; unsigned char *src_ptr,
+; int src_pixels_per_line,
+; int xoffset,
+; int yoffset,
+; unsigned char *dst_ptr,
+; int dst_pitch
+;)
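+; The routine below dispatches to one of three paths; the helper names in
+; this sketch are purely illustrative:
+;
+;   if (xoffset == 0)            /* ".b16x16_sp_only": vertical blend only   */
+;     blend_rows_only(...);
+;   else if (yoffset == 0)       /* ".b16x16_fp_only": horizontal blend only */
+;     blend_columns_only(...);
+;   else                         /* fused two-pass loop at ".next_row"       */
+;     blend_two_pass(...);
+;
+; In every path a pixel pair is blended as (f0*a + f1*b + 64) >> 7, where the
+; (f0, f1) pair is assumed to be (128 - 16*offset, 16*offset), the same values
+; as the vp9_bilinear_filters_8x_mmx table in subpixel_mmx.asm.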
+extern sym(vp9_bilinear_filters_mmx)
+global sym(vp9_bilinear_predict16x16_sse2)
+sym(vp9_bilinear_predict16x16_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ ;const short *HFilter = bilinear_filters_mmx[xoffset]
+ ;const short *VFilter = bilinear_filters_mmx[yoffset]
+
+ lea rcx, [GLOBAL(sym(vp9_bilinear_filters_mmx))]
+ movsxd rax, dword ptr arg(2) ;xoffset
+
+ cmp rax, 0 ;skip first_pass filter if xoffset=0
+ je .b16x16_sp_only
+
+ shl rax, 5
+ add rax, rcx ;HFilter
+
+ mov rdi, arg(4) ;dst_ptr
+ mov rsi, arg(0) ;src_ptr
+ movsxd rdx, dword ptr arg(5) ;dst_pitch
+
+ movdqa xmm1, [rax]
+ movdqa xmm2, [rax+16]
+
+ movsxd rax, dword ptr arg(3) ;yoffset
+
+ cmp rax, 0 ;skip second_pass filter if yoffset=0
+ je .b16x16_fp_only
+
+ shl rax, 5
+ add rax, rcx ;VFilter
+
+ lea rcx, [rdi+rdx*8]
+ lea rcx, [rcx+rdx*8]
+ movsxd rdx, dword ptr arg(1) ;src_pixels_per_line
+
+ pxor xmm0, xmm0
+
+%if ABI_IS_32BIT=0
+ movsxd r8, dword ptr arg(5) ;dst_pitch
+%endif
+ ; get the first horizontal line done
+ movdqu xmm3, [rsi] ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14
+ movdqa xmm4, xmm3 ; make a copy of current line
+
+ punpcklbw xmm3, xmm0 ; xx 00 01 02 03 04 05 06
+ punpckhbw xmm4, xmm0
+
+ pmullw xmm3, xmm1
+ pmullw xmm4, xmm1
+
+ movdqu xmm5, [rsi+1]
+ movdqa xmm6, xmm5
+
+ punpcklbw xmm5, xmm0
+ punpckhbw xmm6, xmm0
+
+ pmullw xmm5, xmm2
+ pmullw xmm6, xmm2
+
+ paddw xmm3, xmm5
+ paddw xmm4, xmm6
+
+ paddw xmm3, [GLOBAL(rd)] ; xmm3 += round value
+ psraw xmm3, VP9_FILTER_SHIFT ; xmm3 /= 128
+
+ paddw xmm4, [GLOBAL(rd)]
+ psraw xmm4, VP9_FILTER_SHIFT
+
+ movdqa xmm7, xmm3
+ packuswb xmm7, xmm4
+
+ add rsi, rdx ; next line
+.next_row:
+ movdqu xmm3, [rsi] ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14
+ movdqa xmm4, xmm3 ; make a copy of current line
+
+ punpcklbw xmm3, xmm0 ; xx 00 01 02 03 04 05 06
+ punpckhbw xmm4, xmm0
+
+ pmullw xmm3, xmm1
+ pmullw xmm4, xmm1
+
+ movdqu xmm5, [rsi+1]
+ movdqa xmm6, xmm5
+
+ punpcklbw xmm5, xmm0
+ punpckhbw xmm6, xmm0
+
+ pmullw xmm5, xmm2
+ pmullw xmm6, xmm2
+
+ paddw xmm3, xmm5
+ paddw xmm4, xmm6
+
+ movdqa xmm5, xmm7
+ movdqa xmm6, xmm7
+
+ punpcklbw xmm5, xmm0
+ punpckhbw xmm6, xmm0
+
+ pmullw xmm5, [rax]
+ pmullw xmm6, [rax]
+
+ paddw xmm3, [GLOBAL(rd)] ; xmm3 += round value
+ psraw xmm3, VP9_FILTER_SHIFT ; xmm3 /= 128
+
+ paddw xmm4, [GLOBAL(rd)]
+ psraw xmm4, VP9_FILTER_SHIFT
+
+ movdqa xmm7, xmm3
+ packuswb xmm7, xmm4
+
+ pmullw xmm3, [rax+16]
+ pmullw xmm4, [rax+16]
+
+ paddw xmm3, xmm5
+ paddw xmm4, xmm6
+
+ paddw xmm3, [GLOBAL(rd)] ; xmm3 += round value
+ psraw xmm3, VP9_FILTER_SHIFT ; xmm3 /= 128
+
+ paddw xmm4, [GLOBAL(rd)]
+ psraw xmm4, VP9_FILTER_SHIFT
+
+ packuswb xmm3, xmm4
+ movdqa [rdi], xmm3 ; store the results in the destination
+
+ add rsi, rdx ; next line
+%if ABI_IS_32BIT
+ add rdi, DWORD PTR arg(5) ;dst_pitch
+%else
+ add rdi, r8
+%endif
+
+ cmp rdi, rcx
+ jne .next_row
+
+ jmp .done
+
+.b16x16_sp_only:
+ movsxd rax, dword ptr arg(3) ;yoffset
+ shl rax, 5
+ add rax, rcx ;VFilter
+
+ mov rdi, arg(4) ;dst_ptr
+ mov rsi, arg(0) ;src_ptr
+ movsxd rdx, dword ptr arg(5) ;dst_pitch
+
+ movdqa xmm1, [rax]
+ movdqa xmm2, [rax+16]
+
+ lea rcx, [rdi+rdx*8]
+ lea rcx, [rcx+rdx*8]
+ movsxd rax, dword ptr arg(1) ;src_pixels_per_line
+
+ pxor xmm0, xmm0
+
+ ; get the first horizontal line done
+ movdqu xmm7, [rsi] ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14
+
+ add rsi, rax ; next line
+.next_row_spo:
+ movdqu xmm3, [rsi] ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14
+
+ movdqa xmm5, xmm7
+ movdqa xmm6, xmm7
+
+ movdqa xmm4, xmm3 ; make a copy of current line
+ movdqa xmm7, xmm3
+
+ punpcklbw xmm5, xmm0
+ punpckhbw xmm6, xmm0
+ punpcklbw xmm3, xmm0 ; xx 00 01 02 03 04 05 06
+ punpckhbw xmm4, xmm0
+
+ pmullw xmm5, xmm1
+ pmullw xmm6, xmm1
+ pmullw xmm3, xmm2
+ pmullw xmm4, xmm2
+
+ paddw xmm3, xmm5
+ paddw xmm4, xmm6
+
+ paddw xmm3, [GLOBAL(rd)] ; xmm3 += round value
+ psraw xmm3, VP9_FILTER_SHIFT ; xmm3 /= 128
+
+ paddw xmm4, [GLOBAL(rd)]
+ psraw xmm4, VP9_FILTER_SHIFT
+
+ packuswb xmm3, xmm4
+ movdqa [rdi], xmm3 ; store the results in the destination
+
+ add rsi, rax ; next line
+ add rdi, rdx ;dst_pitch
+ cmp rdi, rcx
+ jne .next_row_spo
+
+ jmp .done
+
+.b16x16_fp_only:
+ lea rcx, [rdi+rdx*8]
+ lea rcx, [rcx+rdx*8]
+ movsxd rax, dword ptr arg(1) ;src_pixels_per_line
+ pxor xmm0, xmm0
+
+.next_row_fpo:
+ movdqu xmm3, [rsi] ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14
+ movdqa xmm4, xmm3 ; make a copy of current line
+
+ punpcklbw xmm3, xmm0 ; xx 00 01 02 03 04 05 06
+ punpckhbw xmm4, xmm0
+
+ pmullw xmm3, xmm1
+ pmullw xmm4, xmm1
+
+ movdqu xmm5, [rsi+1]
+ movdqa xmm6, xmm5
+
+ punpcklbw xmm5, xmm0
+ punpckhbw xmm6, xmm0
+
+ pmullw xmm5, xmm2
+ pmullw xmm6, xmm2
+
+ paddw xmm3, xmm5
+ paddw xmm4, xmm6
+
+ paddw xmm3, [GLOBAL(rd)] ; xmm3 += round value
+ psraw xmm3, VP9_FILTER_SHIFT ; xmm3 /= 128
+
+ paddw xmm4, [GLOBAL(rd)]
+ psraw xmm4, VP9_FILTER_SHIFT
+
+ packuswb xmm3, xmm4
+ movdqa [rdi], xmm3 ; store the results in the destination
+
+ add rsi, rax ; next line
+ add rdi, rdx ; dst_pitch
+ cmp rdi, rcx
+ jne .next_row_fpo
+
+.done:
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+;void vp9_bilinear_predict8x8_sse2
+;(
+; unsigned char *src_ptr,
+; int src_pixels_per_line,
+; int xoffset,
+; int yoffset,
+; unsigned char *dst_ptr,
+; int dst_pitch
+;)
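+; Before filtering, the 9 source rows (8 output rows plus one extra for the
+; vertical blend) are copied into an aligned scratch area on the stack so the
+; main loop can use aligned loads; the code below reserves 144 = 9*16 bytes.
+; Sketch of that staging step (the buffer name is illustrative):
+;
+;   unsigned char scratch[9 * 16];
+;   for (int r = 0; r < 9; r++)
+;     for (int c = 0; c < 16; c++)
+;       scratch[r * 16 + c] = src_ptr[r * src_pixels_per_line + c];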
+extern sym(vp9_bilinear_filters_mmx)
+global sym(vp9_bilinear_predict8x8_sse2)
+sym(vp9_bilinear_predict8x8_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ sub rsp, 144 ; reserve 144 bytes
+
+ ;const short *HFilter = bilinear_filters_mmx[xoffset]
+ ;const short *VFilter = bilinear_filters_mmx[yoffset]
+ lea rcx, [GLOBAL(sym(vp9_bilinear_filters_mmx))]
+
+ mov rsi, arg(0) ;src_ptr
+ movsxd rdx, dword ptr arg(1) ;src_pixels_per_line
+
+    ;Read in 9 lines of unaligned data and put them on the stack. This gives a big
+    ;performance boost.
+ movdqu xmm0, [rsi]
+ lea rax, [rdx + rdx*2]
+ movdqu xmm1, [rsi+rdx]
+ movdqu xmm2, [rsi+rdx*2]
+ add rsi, rax
+ movdqu xmm3, [rsi]
+ movdqu xmm4, [rsi+rdx]
+ movdqu xmm5, [rsi+rdx*2]
+ add rsi, rax
+ movdqu xmm6, [rsi]
+ movdqu xmm7, [rsi+rdx]
+
+ movdqa XMMWORD PTR [rsp], xmm0
+
+ movdqu xmm0, [rsi+rdx*2]
+
+ movdqa XMMWORD PTR [rsp+16], xmm1
+ movdqa XMMWORD PTR [rsp+32], xmm2
+ movdqa XMMWORD PTR [rsp+48], xmm3
+ movdqa XMMWORD PTR [rsp+64], xmm4
+ movdqa XMMWORD PTR [rsp+80], xmm5
+ movdqa XMMWORD PTR [rsp+96], xmm6
+ movdqa XMMWORD PTR [rsp+112], xmm7
+ movdqa XMMWORD PTR [rsp+128], xmm0
+
+ movsxd rax, dword ptr arg(2) ;xoffset
+ shl rax, 5
+ add rax, rcx ;HFilter
+
+ mov rdi, arg(4) ;dst_ptr
+ movsxd rdx, dword ptr arg(5) ;dst_pitch
+
+ movdqa xmm1, [rax]
+ movdqa xmm2, [rax+16]
+
+ movsxd rax, dword ptr arg(3) ;yoffset
+ shl rax, 5
+ add rax, rcx ;VFilter
+
+ lea rcx, [rdi+rdx*8]
+
+ movdqa xmm5, [rax]
+ movdqa xmm6, [rax+16]
+
+ pxor xmm0, xmm0
+
+ ; get the first horizontal line done
+ movdqa xmm3, XMMWORD PTR [rsp]
+ movdqa xmm4, xmm3 ; make a copy of current line
+ psrldq xmm4, 1
+
+ punpcklbw xmm3, xmm0 ; 00 01 02 03 04 05 06 07
+ punpcklbw xmm4, xmm0 ; 01 02 03 04 05 06 07 08
+
+ pmullw xmm3, xmm1
+ pmullw xmm4, xmm2
+
+ paddw xmm3, xmm4
+
+ paddw xmm3, [GLOBAL(rd)] ; xmm3 += round value
+ psraw xmm3, VP9_FILTER_SHIFT ; xmm3 /= 128
+
+ movdqa xmm7, xmm3
+ add rsp, 16 ; next line
+.next_row8x8:
+ movdqa xmm3, XMMWORD PTR [rsp] ; 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15
+ movdqa xmm4, xmm3 ; make a copy of current line
+ psrldq xmm4, 1
+
+ punpcklbw xmm3, xmm0 ; 00 01 02 03 04 05 06 07
+ punpcklbw xmm4, xmm0 ; 01 02 03 04 05 06 07 08
+
+ pmullw xmm3, xmm1
+ pmullw xmm4, xmm2
+
+ paddw xmm3, xmm4
+ pmullw xmm7, xmm5
+
+ paddw xmm3, [GLOBAL(rd)] ; xmm3 += round value
+ psraw xmm3, VP9_FILTER_SHIFT ; xmm3 /= 128
+
+ movdqa xmm4, xmm3
+
+ pmullw xmm3, xmm6
+ paddw xmm3, xmm7
+
+ movdqa xmm7, xmm4
+
+ paddw xmm3, [GLOBAL(rd)] ; xmm3 += round value
+ psraw xmm3, VP9_FILTER_SHIFT ; xmm3 /= 128
+
+ packuswb xmm3, xmm0
+ movq [rdi], xmm3 ; store the results in the destination
+
+ add rsp, 16 ; next line
+ add rdi, rdx
+
+ cmp rdi, rcx
+ jne .next_row8x8
+
+ ;add rsp, 144
+ pop rsp
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+SECTION_RODATA
+align 16
+rd:
+ times 8 dw 0x40
--- /dev/null
+++ b/vp9/common/x86/subpixel_ssse3.asm
@@ -1,0 +1,1515 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+%define BLOCK_HEIGHT_WIDTH 4
+%define VP9_FILTER_WEIGHT 128
+%define VP9_FILTER_SHIFT 7
+
+
+;/************************************************************************************
+; Notes: filter_block1d_h6 applies a 6-tap filter horizontally to the input pixels. The
+; input pixel array has output_height rows. This routine assumes that output_height is an
+; even number. This function handles 8 pixels in the horizontal direction, calculating one
+; row per iteration to take advantage of the 128-bit operations.
+;
+; This is an implementation of some of the SSE optimizations first seen in ffvp8
+;
+;*************************************************************************************/
+;void vp9_filter_block1d8_h6_ssse3
+;(
+; unsigned char *src_ptr,
+; unsigned int src_pixels_per_line,
+; unsigned char *output_ptr,
+; unsigned int output_pitch,
+; unsigned int output_height,
+; unsigned int vp9_filter_index
+;)
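+; The SSSE3 path relies on pmaddubsw, so the six taps k0..k5 (acting on
+; src[c-2] .. src[c+3]) are grouped into the k0_k5 / k1_k3 / k2_k4 tables,
+; one unsigned/signed byte pair per product. Scalar sketch of one output
+; pixel (clamp8() is illustrative):
+;
+;   int p05 = src[c - 2] * k0 + src[c + 3] * k5;   /* k0_k5 table */
+;   int p13 = src[c - 1] * k1 + src[c + 1] * k3;   /* k1_k3 table */
+;   int p24 = src[c    ] * k2 + src[c + 2] * k4;   /* k2_k4 table */
+;   dst[c]  = clamp8((p05 + p13 + p24 + 64) >> 7);
+;
+; When the k0_k5 entry is zero (a 4-tap filter) the code jumps to the _h4_
+; variant below and skips that product entirely.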
+global sym(vp9_filter_block1d8_h6_ssse3)
+sym(vp9_filter_block1d8_h6_ssse3):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ movsxd rdx, DWORD PTR arg(5) ;table index
+ xor rsi, rsi
+ shl rdx, 4
+
+ movdqa xmm7, [GLOBAL(rd)]
+
+ lea rax, [GLOBAL(k0_k5)]
+ add rax, rdx
+ mov rdi, arg(2) ;output_ptr
+
+ cmp esi, DWORD PTR [rax]
+ je vp9_filter_block1d8_h4_ssse3
+
+ movdqa xmm4, XMMWORD PTR [rax] ;k0_k5
+ movdqa xmm5, XMMWORD PTR [rax+256] ;k2_k4
+ movdqa xmm6, XMMWORD PTR [rax+128] ;k1_k3
+
+ mov rsi, arg(0) ;src_ptr
+ movsxd rax, dword ptr arg(1) ;src_pixels_per_line
+ movsxd rcx, dword ptr arg(4) ;output_height
+
+ movsxd rdx, dword ptr arg(3) ;output_pitch
+
+ sub rdi, rdx
+;xmm3 free
+.filter_block1d8_h6_rowloop_ssse3:
+ movq xmm0, MMWORD PTR [rsi - 2] ; -2 -1 0 1 2 3 4 5
+
+ movq xmm2, MMWORD PTR [rsi + 3] ; 3 4 5 6 7 8 9 10
+
+ punpcklbw xmm0, xmm2 ; -2 3 -1 4 0 5 1 6 2 7 3 8 4 9 5 10
+
+ movdqa xmm1, xmm0
+ pmaddubsw xmm0, xmm4
+
+ movdqa xmm2, xmm1
+ pshufb xmm1, [GLOBAL(shuf2bfrom1)]
+
+ pshufb xmm2, [GLOBAL(shuf3bfrom1)]
+ pmaddubsw xmm1, xmm5
+
+ lea rdi, [rdi + rdx]
+ pmaddubsw xmm2, xmm6
+
+ lea rsi, [rsi + rax]
+ dec rcx
+
+ paddsw xmm0, xmm1
+ paddsw xmm2, xmm7
+
+ paddsw xmm0, xmm2
+
+ psraw xmm0, 7
+
+ packuswb xmm0, xmm0
+
+ movq MMWORD Ptr [rdi], xmm0
+ jnz .filter_block1d8_h6_rowloop_ssse3
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+vp9_filter_block1d8_h4_ssse3:
+ movdqa xmm5, XMMWORD PTR [rax+256] ;k2_k4
+ movdqa xmm6, XMMWORD PTR [rax+128] ;k1_k3
+
+ movdqa xmm3, XMMWORD PTR [GLOBAL(shuf2bfrom1)]
+ movdqa xmm4, XMMWORD PTR [GLOBAL(shuf3bfrom1)]
+
+ mov rsi, arg(0) ;src_ptr
+
+ movsxd rax, dword ptr arg(1) ;src_pixels_per_line
+ movsxd rcx, dword ptr arg(4) ;output_height
+
+ movsxd rdx, dword ptr arg(3) ;output_pitch
+
+ sub rdi, rdx
+
+.filter_block1d8_h4_rowloop_ssse3:
+ movq xmm0, MMWORD PTR [rsi - 2] ; -2 -1 0 1 2 3 4 5
+
+ movq xmm1, MMWORD PTR [rsi + 3] ; 3 4 5 6 7 8 9 10
+
+ punpcklbw xmm0, xmm1 ; -2 3 -1 4 0 5 1 6 2 7 3 8 4 9 5 10
+
+ movdqa xmm2, xmm0
+ pshufb xmm0, xmm3
+
+ pshufb xmm2, xmm4
+ pmaddubsw xmm0, xmm5
+
+ lea rdi, [rdi + rdx]
+ pmaddubsw xmm2, xmm6
+
+ lea rsi, [rsi + rax]
+ dec rcx
+
+ paddsw xmm0, xmm7
+
+ paddsw xmm0, xmm2
+
+ psraw xmm0, 7
+
+ packuswb xmm0, xmm0
+
+ movq MMWORD Ptr [rdi], xmm0
+
+ jnz .filter_block1d8_h4_rowloop_ssse3
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+;void vp9_filter_block1d16_h6_ssse3
+;(
+; unsigned char *src_ptr,
+; unsigned int src_pixels_per_line,
+; unsigned char *output_ptr,
+; unsigned int output_pitch,
+; unsigned int output_height,
+; unsigned int vp9_filter_index
+;)
+global sym(vp9_filter_block1d16_h6_ssse3)
+sym(vp9_filter_block1d16_h6_ssse3):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ movsxd rdx, DWORD PTR arg(5) ;table index
+ xor rsi, rsi
+ shl rdx, 4 ;
+
+ lea rax, [GLOBAL(k0_k5)]
+ add rax, rdx
+
+ mov rdi, arg(2) ;output_ptr
+
+ mov rsi, arg(0) ;src_ptr
+
+ movdqa xmm4, XMMWORD PTR [rax] ;k0_k5
+ movdqa xmm5, XMMWORD PTR [rax+256] ;k2_k4
+ movdqa xmm6, XMMWORD PTR [rax+128] ;k1_k3
+
+ movsxd rax, dword ptr arg(1) ;src_pixels_per_line
+ movsxd rcx, dword ptr arg(4) ;output_height
+ movsxd rdx, dword ptr arg(3) ;output_pitch
+
+.filter_block1d16_h6_rowloop_ssse3:
+ movq xmm0, MMWORD PTR [rsi - 2] ; -2 -1 0 1 2 3 4 5
+
+ movq xmm3, MMWORD PTR [rsi + 3] ; 3 4 5 6 7 8 9 10
+
+ punpcklbw xmm0, xmm3 ; -2 3 -1 4 0 5 1 6 2 7 3 8 4 9 5 10
+
+ movdqa xmm1, xmm0
+ pmaddubsw xmm0, xmm4
+
+ movdqa xmm2, xmm1
+ pshufb xmm1, [GLOBAL(shuf2bfrom1)]
+
+ pshufb xmm2, [GLOBAL(shuf3bfrom1)]
+ movq xmm3, MMWORD PTR [rsi + 6]
+
+ pmaddubsw xmm1, xmm5
+ movq xmm7, MMWORD PTR [rsi + 11]
+
+ pmaddubsw xmm2, xmm6
+ punpcklbw xmm3, xmm7
+
+ paddsw xmm0, xmm1
+ movdqa xmm1, xmm3
+
+ pmaddubsw xmm3, xmm4
+ paddsw xmm0, xmm2
+
+ movdqa xmm2, xmm1
+ paddsw xmm0, [GLOBAL(rd)]
+
+ pshufb xmm1, [GLOBAL(shuf2bfrom1)]
+ pshufb xmm2, [GLOBAL(shuf3bfrom1)]
+
+ psraw xmm0, 7
+ pmaddubsw xmm1, xmm5
+
+ pmaddubsw xmm2, xmm6
+ packuswb xmm0, xmm0
+
+ lea rsi, [rsi + rax]
+ paddsw xmm3, xmm1
+
+ paddsw xmm3, xmm2
+
+ paddsw xmm3, [GLOBAL(rd)]
+
+ psraw xmm3, 7
+
+ packuswb xmm3, xmm3
+
+ punpcklqdq xmm0, xmm3
+
+ movdqa XMMWORD Ptr [rdi], xmm0
+
+ lea rdi, [rdi + rdx]
+ dec rcx
+ jnz .filter_block1d16_h6_rowloop_ssse3
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+;void vp9_filter_block1d4_h6_ssse3
+;(
+; unsigned char *src_ptr,
+; unsigned int src_pixels_per_line,
+; unsigned char *output_ptr,
+; unsigned int output_pitch,
+; unsigned int output_height,
+; unsigned int vp9_filter_index
+;)
+global sym(vp9_filter_block1d4_h6_ssse3)
+sym(vp9_filter_block1d4_h6_ssse3):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ movsxd rdx, DWORD PTR arg(5) ;table index
+ xor rsi, rsi
+ shl rdx, 4 ;
+
+ lea rax, [GLOBAL(k0_k5)]
+ add rax, rdx
+ movdqa xmm7, [GLOBAL(rd)]
+
+ cmp esi, DWORD PTR [rax]
+ je .vp9_filter_block1d4_h4_ssse3
+
+ movdqa xmm4, XMMWORD PTR [rax] ;k0_k5
+ movdqa xmm5, XMMWORD PTR [rax+256] ;k2_k4
+ movdqa xmm6, XMMWORD PTR [rax+128] ;k1_k3
+
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(2) ;output_ptr
+ movsxd rax, dword ptr arg(1) ;src_pixels_per_line
+ movsxd rcx, dword ptr arg(4) ;output_height
+
+ movsxd rdx, dword ptr arg(3) ;output_pitch
+
+;xmm3 free
+.filter_block1d4_h6_rowloop_ssse3:
+ movdqu xmm0, XMMWORD PTR [rsi - 2]
+
+ movdqa xmm1, xmm0
+ pshufb xmm0, [GLOBAL(shuf1b)]
+
+ movdqa xmm2, xmm1
+ pshufb xmm1, [GLOBAL(shuf2b)]
+ pmaddubsw xmm0, xmm4
+ pshufb xmm2, [GLOBAL(shuf3b)]
+ pmaddubsw xmm1, xmm5
+
+;--
+ pmaddubsw xmm2, xmm6
+
+ lea rsi, [rsi + rax]
+;--
+ paddsw xmm0, xmm1
+ paddsw xmm0, xmm7
+ pxor xmm1, xmm1
+ paddsw xmm0, xmm2
+ psraw xmm0, 7
+ packuswb xmm0, xmm0
+
+ movd DWORD PTR [rdi], xmm0
+
+ add rdi, rdx
+ dec rcx
+ jnz .filter_block1d4_h6_rowloop_ssse3
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+.vp9_filter_block1d4_h4_ssse3:
+ movdqa xmm5, XMMWORD PTR [rax+256] ;k2_k4
+ movdqa xmm6, XMMWORD PTR [rax+128] ;k1_k3
+ movdqa xmm0, XMMWORD PTR [GLOBAL(shuf2b)]
+ movdqa xmm3, XMMWORD PTR [GLOBAL(shuf3b)]
+
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(2) ;output_ptr
+ movsxd rax, dword ptr arg(1) ;src_pixels_per_line
+ movsxd rcx, dword ptr arg(4) ;output_height
+
+ movsxd rdx, dword ptr arg(3) ;output_pitch
+
+.filter_block1d4_h4_rowloop_ssse3:
+ movdqu xmm1, XMMWORD PTR [rsi - 2]
+
+ movdqa xmm2, xmm1
+ pshufb xmm1, xmm0 ;;[GLOBAL(shuf2b)]
+ pshufb xmm2, xmm3 ;;[GLOBAL(shuf3b)]
+ pmaddubsw xmm1, xmm5
+
+;--
+ pmaddubsw xmm2, xmm6
+
+ lea rsi, [rsi + rax]
+;--
+ paddsw xmm1, xmm7
+ paddsw xmm1, xmm2
+ psraw xmm1, 7
+ packuswb xmm1, xmm1
+
+ movd DWORD PTR [rdi], xmm1
+
+ add rdi, rdx
+ dec rcx
+ jnz .filter_block1d4_h4_rowloop_ssse3
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+
+;void vp9_filter_block1d16_v6_ssse3
+;(
+; unsigned char *src_ptr,
+; unsigned int src_pitch,
+; unsigned char *output_ptr,
+; unsigned int out_pitch,
+; unsigned int output_height,
+; unsigned int vp9_filter_index
+;)
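+; The vertical filter uses the same pairing idea: of the six source rows A..F
+; feeding one output row, A and F share the k0_k5 table, B and D share k1_k3,
+; and C and E share k2_k4 (one pmaddubsw per pair). Scalar sketch of one
+; pixel, with clamp8() illustrative:
+;
+;   int sum = A[c]*k0 + B[c]*k1 + C[c]*k2 + D[c]*k3 + E[c]*k4 + F[c]*k5;
+;   dst[c]  = clamp8((sum + 64) >> 7);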
+global sym(vp9_filter_block1d16_v6_ssse3)
+sym(vp9_filter_block1d16_v6_ssse3):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ movsxd rdx, DWORD PTR arg(5) ;table index
+ xor rsi, rsi
+ shl rdx, 4 ;
+
+ lea rax, [GLOBAL(k0_k5)]
+ add rax, rdx
+
+ cmp esi, DWORD PTR [rax]
+ je .vp9_filter_block1d16_v4_ssse3
+
+ movdqa xmm5, XMMWORD PTR [rax] ;k0_k5
+ movdqa xmm6, XMMWORD PTR [rax+256] ;k2_k4
+ movdqa xmm7, XMMWORD PTR [rax+128] ;k1_k3
+
+ mov rsi, arg(0) ;src_ptr
+ movsxd rdx, DWORD PTR arg(1) ;pixels_per_line
+ mov rdi, arg(2) ;output_ptr
+
+%if ABI_IS_32BIT=0
+ movsxd r8, DWORD PTR arg(3) ;out_pitch
+%endif
+ mov rax, rsi
+ movsxd rcx, DWORD PTR arg(4) ;output_height
+ add rax, rdx
+
+
+.vp9_filter_block1d16_v6_ssse3_loop:
+ movq xmm1, MMWORD PTR [rsi] ;A
+ movq xmm2, MMWORD PTR [rsi + rdx] ;B
+ movq xmm3, MMWORD PTR [rsi + rdx * 2] ;C
+ movq xmm4, MMWORD PTR [rax + rdx * 2] ;D
+ movq xmm0, MMWORD PTR [rsi + rdx * 4] ;E
+
+ punpcklbw xmm2, xmm4 ;B D
+ punpcklbw xmm3, xmm0 ;C E
+
+ movq xmm0, MMWORD PTR [rax + rdx * 4] ;F
+
+ pmaddubsw xmm3, xmm6
+ punpcklbw xmm1, xmm0 ;A F
+ pmaddubsw xmm2, xmm7
+ pmaddubsw xmm1, xmm5
+
+ paddsw xmm2, xmm3
+ paddsw xmm2, xmm1
+ paddsw xmm2, [GLOBAL(rd)]
+ psraw xmm2, 7
+ packuswb xmm2, xmm2
+
+ movq MMWORD PTR [rdi], xmm2 ;store the results
+
+ movq xmm1, MMWORD PTR [rsi + 8] ;A
+ movq xmm2, MMWORD PTR [rsi + rdx + 8] ;B
+ movq xmm3, MMWORD PTR [rsi + rdx * 2 + 8] ;C
+ movq xmm4, MMWORD PTR [rax + rdx * 2 + 8] ;D
+ movq xmm0, MMWORD PTR [rsi + rdx * 4 + 8] ;E
+
+ punpcklbw xmm2, xmm4 ;B D
+ punpcklbw xmm3, xmm0 ;C E
+
+ movq xmm0, MMWORD PTR [rax + rdx * 4 + 8] ;F
+ pmaddubsw xmm3, xmm6
+ punpcklbw xmm1, xmm0 ;A F
+ pmaddubsw xmm2, xmm7
+ pmaddubsw xmm1, xmm5
+
+ add rsi, rdx
+ add rax, rdx
+;--
+;--
+ paddsw xmm2, xmm3
+ paddsw xmm2, xmm1
+ paddsw xmm2, [GLOBAL(rd)]
+ psraw xmm2, 7
+ packuswb xmm2, xmm2
+
+ movq MMWORD PTR [rdi+8], xmm2
+
+%if ABI_IS_32BIT
+ add rdi, DWORD PTR arg(3) ;out_pitch
+%else
+ add rdi, r8
+%endif
+ dec rcx
+ jnz .vp9_filter_block1d16_v6_ssse3_loop
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+.vp9_filter_block1d16_v4_ssse3:
+ movdqa xmm6, XMMWORD PTR [rax+256] ;k2_k4
+ movdqa xmm7, XMMWORD PTR [rax+128] ;k1_k3
+
+ mov rsi, arg(0) ;src_ptr
+ movsxd rdx, DWORD PTR arg(1) ;pixels_per_line
+ mov rdi, arg(2) ;output_ptr
+
+%if ABI_IS_32BIT=0
+ movsxd r8, DWORD PTR arg(3) ;out_pitch
+%endif
+ mov rax, rsi
+ movsxd rcx, DWORD PTR arg(4) ;output_height
+ add rax, rdx
+
+.vp9_filter_block1d16_v4_ssse3_loop:
+ movq xmm2, MMWORD PTR [rsi + rdx] ;B
+ movq xmm3, MMWORD PTR [rsi + rdx * 2] ;C
+ movq xmm4, MMWORD PTR [rax + rdx * 2] ;D
+ movq xmm0, MMWORD PTR [rsi + rdx * 4] ;E
+
+ punpcklbw xmm2, xmm4 ;B D
+ punpcklbw xmm3, xmm0 ;C E
+
+ pmaddubsw xmm3, xmm6
+ pmaddubsw xmm2, xmm7
+ movq xmm5, MMWORD PTR [rsi + rdx + 8] ;B
+ movq xmm1, MMWORD PTR [rsi + rdx * 2 + 8] ;C
+ movq xmm4, MMWORD PTR [rax + rdx * 2 + 8] ;D
+ movq xmm0, MMWORD PTR [rsi + rdx * 4 + 8] ;E
+
+ paddsw xmm2, [GLOBAL(rd)]
+ paddsw xmm2, xmm3
+ psraw xmm2, 7
+ packuswb xmm2, xmm2
+
+ punpcklbw xmm5, xmm4 ;B D
+ punpcklbw xmm1, xmm0 ;C E
+
+ pmaddubsw xmm1, xmm6
+ pmaddubsw xmm5, xmm7
+
+ movdqa xmm4, [GLOBAL(rd)]
+ add rsi, rdx
+ add rax, rdx
+;--
+;--
+ paddsw xmm5, xmm1
+ paddsw xmm5, xmm4
+ psraw xmm5, 7
+ packuswb xmm5, xmm5
+
+ punpcklqdq xmm2, xmm5
+
+ movdqa XMMWORD PTR [rdi], xmm2
+
+%if ABI_IS_32BIT
+ add rdi, DWORD PTR arg(3) ;out_pitch
+%else
+ add rdi, r8
+%endif
+ dec rcx
+ jnz .vp9_filter_block1d16_v4_ssse3_loop
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+;void vp9_filter_block1d8_v6_ssse3
+;(
+; unsigned char *src_ptr,
+; unsigned int src_pitch,
+; unsigned char *output_ptr,
+; unsigned int out_pitch,
+; unsigned int output_height,
+; unsigned int vp9_filter_index
+;)
+global sym(vp9_filter_block1d8_v6_ssse3)
+sym(vp9_filter_block1d8_v6_ssse3):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ movsxd rdx, DWORD PTR arg(5) ;table index
+ xor rsi, rsi
+ shl rdx, 4 ;
+
+ lea rax, [GLOBAL(k0_k5)]
+ add rax, rdx
+
+ movsxd rdx, DWORD PTR arg(1) ;pixels_per_line
+ mov rdi, arg(2) ;output_ptr
+%if ABI_IS_32BIT=0
+ movsxd r8, DWORD PTR arg(3) ; out_pitch
+%endif
+ movsxd rcx, DWORD PTR arg(4) ;[output_height]
+
+ cmp esi, DWORD PTR [rax]
+ je .vp9_filter_block1d8_v4_ssse3
+
+ movdqa xmm5, XMMWORD PTR [rax] ;k0_k5
+ movdqa xmm6, XMMWORD PTR [rax+256] ;k2_k4
+ movdqa xmm7, XMMWORD PTR [rax+128] ;k1_k3
+
+ mov rsi, arg(0) ;src_ptr
+
+ mov rax, rsi
+ add rax, rdx
+
+.vp9_filter_block1d8_v6_ssse3_loop:
+ movq xmm1, MMWORD PTR [rsi] ;A
+ movq xmm2, MMWORD PTR [rsi + rdx] ;B
+ movq xmm3, MMWORD PTR [rsi + rdx * 2] ;C
+ movq xmm4, MMWORD PTR [rax + rdx * 2] ;D
+ movq xmm0, MMWORD PTR [rsi + rdx * 4] ;E
+
+ punpcklbw xmm2, xmm4 ;B D
+ punpcklbw xmm3, xmm0 ;C E
+
+ movq xmm0, MMWORD PTR [rax + rdx * 4] ;F
+ movdqa xmm4, [GLOBAL(rd)]
+
+ pmaddubsw xmm3, xmm6
+ punpcklbw xmm1, xmm0 ;A F
+ pmaddubsw xmm2, xmm7
+ pmaddubsw xmm1, xmm5
+ add rsi, rdx
+ add rax, rdx
+;--
+;--
+ paddsw xmm2, xmm3
+ paddsw xmm2, xmm1
+ paddsw xmm2, xmm4
+ psraw xmm2, 7
+ packuswb xmm2, xmm2
+
+ movq MMWORD PTR [rdi], xmm2
+
+%if ABI_IS_32BIT
+ add rdi, DWORD PTR arg(3) ;[out_pitch]
+%else
+ add rdi, r8
+%endif
+ dec rcx
+ jnz .vp9_filter_block1d8_v6_ssse3_loop
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+.vp9_filter_block1d8_v4_ssse3:
+ movdqa xmm6, XMMWORD PTR [rax+256] ;k2_k4
+ movdqa xmm7, XMMWORD PTR [rax+128] ;k1_k3
+ movdqa xmm5, [GLOBAL(rd)]
+
+ mov rsi, arg(0) ;src_ptr
+
+ mov rax, rsi
+ add rax, rdx
+
+.vp9_filter_block1d8_v4_ssse3_loop:
+ movq xmm2, MMWORD PTR [rsi + rdx] ;B
+ movq xmm3, MMWORD PTR [rsi + rdx * 2] ;C
+ movq xmm4, MMWORD PTR [rax + rdx * 2] ;D
+ movq xmm0, MMWORD PTR [rsi + rdx * 4] ;E
+
+ punpcklbw xmm2, xmm4 ;B D
+ punpcklbw xmm3, xmm0 ;C E
+
+ pmaddubsw xmm3, xmm6
+ pmaddubsw xmm2, xmm7
+ add rsi, rdx
+ add rax, rdx
+;--
+;--
+ paddsw xmm2, xmm3
+ paddsw xmm2, xmm5
+ psraw xmm2, 7
+ packuswb xmm2, xmm2
+
+ movq MMWORD PTR [rdi], xmm2
+
+%if ABI_IS_32BIT
+ add rdi, DWORD PTR arg(3) ;[out_pitch]
+%else
+ add rdi, r8
+%endif
+ dec rcx
+ jnz .vp9_filter_block1d8_v4_ssse3_loop
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+;void vp9_filter_block1d4_v6_ssse3
+;(
+; unsigned char *src_ptr,
+; unsigned int src_pitch,
+; unsigned char *output_ptr,
+; unsigned int out_pitch,
+; unsigned int output_height,
+; unsigned int vp9_filter_index
+;)
+global sym(vp9_filter_block1d4_v6_ssse3)
+sym(vp9_filter_block1d4_v6_ssse3):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ movsxd rdx, DWORD PTR arg(5) ;table index
+ xor rsi, rsi
+ shl rdx, 4 ;
+
+ lea rax, [GLOBAL(k0_k5)]
+ add rax, rdx
+
+ movsxd rdx, DWORD PTR arg(1) ;pixels_per_line
+ mov rdi, arg(2) ;output_ptr
+%if ABI_IS_32BIT=0
+ movsxd r8, DWORD PTR arg(3) ; out_pitch
+%endif
+ movsxd rcx, DWORD PTR arg(4) ;[output_height]
+
+ cmp esi, DWORD PTR [rax]
+ je .vp9_filter_block1d4_v4_ssse3
+
+ movq mm5, MMWORD PTR [rax] ;k0_k5
+ movq mm6, MMWORD PTR [rax+256] ;k2_k4
+ movq mm7, MMWORD PTR [rax+128] ;k1_k3
+
+ mov rsi, arg(0) ;src_ptr
+
+ mov rax, rsi
+ add rax, rdx
+
+.vp9_filter_block1d4_v6_ssse3_loop:
+ movd mm1, DWORD PTR [rsi] ;A
+ movd mm2, DWORD PTR [rsi + rdx] ;B
+ movd mm3, DWORD PTR [rsi + rdx * 2] ;C
+ movd mm4, DWORD PTR [rax + rdx * 2] ;D
+ movd mm0, DWORD PTR [rsi + rdx * 4] ;E
+
+ punpcklbw mm2, mm4 ;B D
+ punpcklbw mm3, mm0 ;C E
+
+ movd mm0, DWORD PTR [rax + rdx * 4] ;F
+
+ movq mm4, [GLOBAL(rd)]
+
+ pmaddubsw mm3, mm6
+ punpcklbw mm1, mm0 ;A F
+ pmaddubsw mm2, mm7
+ pmaddubsw mm1, mm5
+ add rsi, rdx
+ add rax, rdx
+;--
+;--
+ paddsw mm2, mm3
+ paddsw mm2, mm1
+ paddsw mm2, mm4
+ psraw mm2, 7
+ packuswb mm2, mm2
+
+ movd DWORD PTR [rdi], mm2
+
+%if ABI_IS_32BIT
+ add rdi, DWORD PTR arg(3) ;[out_pitch]
+%else
+ add rdi, r8
+%endif
+ dec rcx
+ jnz .vp9_filter_block1d4_v6_ssse3_loop
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+.vp9_filter_block1d4_v4_ssse3:
+ movq mm6, MMWORD PTR [rax+256] ;k2_k4
+ movq mm7, MMWORD PTR [rax+128] ;k1_k3
+ movq mm5, MMWORD PTR [GLOBAL(rd)]
+
+ mov rsi, arg(0) ;src_ptr
+
+ mov rax, rsi
+ add rax, rdx
+
+.vp9_filter_block1d4_v4_ssse3_loop:
+ movd mm2, DWORD PTR [rsi + rdx] ;B
+ movd mm3, DWORD PTR [rsi + rdx * 2] ;C
+ movd mm4, DWORD PTR [rax + rdx * 2] ;D
+ movd mm0, DWORD PTR [rsi + rdx * 4] ;E
+
+ punpcklbw mm2, mm4 ;B D
+ punpcklbw mm3, mm0 ;C E
+
+ pmaddubsw mm3, mm6
+ pmaddubsw mm2, mm7
+ add rsi, rdx
+ add rax, rdx
+;--
+;--
+ paddsw mm2, mm3
+ paddsw mm2, mm5
+ psraw mm2, 7
+ packuswb mm2, mm2
+
+ movd DWORD PTR [rdi], mm2
+
+%if ABI_IS_32BIT
+ add rdi, DWORD PTR arg(3) ;[out_pitch]
+%else
+ add rdi, r8
+%endif
+ dec rcx
+ jnz .vp9_filter_block1d4_v4_ssse3_loop
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+;void vp9_bilinear_predict16x16_ssse3
+;(
+; unsigned char *src_ptr,
+; int src_pixels_per_line,
+; int xoffset,
+; int yoffset,
+; unsigned char *dst_ptr,
+; int dst_pitch
+;)
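+; Here both taps of a bilinear pair are packed as bytes so a single pmaddubsw
+; blends two neighbours at once; bilinear_filters_ssse3 is assumed to hold
+; the same (128 - 16*offset, 16*offset) pairs used by the SSE2 version.
+; Per output pixel this is still (clamp8() illustrative):
+;
+;   tmp[c] = (src[c] * h0 + src[c + 1] * h1 + 64) >> 7;        /* first pass  */
+;   dst[c] = clamp8((prev[c] * v0 + tmp[c] * v1 + 64) >> 7);   /* second pass */
+;
+; where prev[] is the previous row's packed first-pass result (kept in xmm7
+; across iterations of the loop below).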
+global sym(vp9_bilinear_predict16x16_ssse3)
+sym(vp9_bilinear_predict16x16_ssse3):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ lea rcx, [GLOBAL(bilinear_filters_ssse3)]
+ movsxd rax, dword ptr arg(2) ; xoffset
+
+ cmp rax, 0 ; skip first_pass filter if xoffset=0
+ je .b16x16_sp_only
+
+ shl rax, 4
+ lea rax, [rax + rcx] ; HFilter
+
+ mov rdi, arg(4) ; dst_ptr
+ mov rsi, arg(0) ; src_ptr
+ movsxd rdx, dword ptr arg(5) ; dst_pitch
+
+ movdqa xmm1, [rax]
+
+ movsxd rax, dword ptr arg(3) ; yoffset
+
+ cmp rax, 0 ; skip second_pass filter if yoffset=0
+ je .b16x16_fp_only
+
+ shl rax, 4
+ lea rax, [rax + rcx] ; VFilter
+
+ lea rcx, [rdi+rdx*8]
+ lea rcx, [rcx+rdx*8]
+ movsxd rdx, dword ptr arg(1) ; src_pixels_per_line
+
+ movdqa xmm2, [rax]
+
+%if ABI_IS_32BIT=0
+ movsxd r8, dword ptr arg(5) ; dst_pitch
+%endif
+ movq xmm3, [rsi] ; 00 01 02 03 04 05 06 07
+ movq xmm5, [rsi+1] ; 01 02 03 04 05 06 07 08
+
+ punpcklbw xmm3, xmm5 ; 00 01 01 02 02 03 03 04 04 05 05 06 06 07 07 08
+ movq xmm4, [rsi+8] ; 08 09 10 11 12 13 14 15
+
+ movq xmm5, [rsi+9] ; 09 10 11 12 13 14 15 16
+
+ lea rsi, [rsi + rdx] ; next line
+
+ pmaddubsw xmm3, xmm1 ; 00 02 04 06 08 10 12 14
+
+ punpcklbw xmm4, xmm5 ; 08 09 09 10 10 11 11 12 12 13 13 14 14 15 15 16
+ pmaddubsw xmm4, xmm1 ; 01 03 05 07 09 11 13 15
+
+ paddw xmm3, [GLOBAL(rd)] ; xmm3 += round value
+ psraw xmm3, VP9_FILTER_SHIFT ; xmm3 /= 128
+
+ paddw xmm4, [GLOBAL(rd)] ; xmm4 += round value
+ psraw xmm4, VP9_FILTER_SHIFT ; xmm4 /= 128
+
+ movdqa xmm7, xmm3
+ packuswb xmm7, xmm4 ; 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15
+
+.next_row:
+ movq xmm6, [rsi] ; 00 01 02 03 04 05 06 07
+ movq xmm5, [rsi+1] ; 01 02 03 04 05 06 07 08
+
+ punpcklbw xmm6, xmm5
+ movq xmm4, [rsi+8] ; 08 09 10 11 12 13 14 15
+
+ movq xmm5, [rsi+9] ; 09 10 11 12 13 14 15 16
+ lea rsi, [rsi + rdx] ; next line
+
+ pmaddubsw xmm6, xmm1
+
+ punpcklbw xmm4, xmm5
+ pmaddubsw xmm4, xmm1
+
+ paddw xmm6, [GLOBAL(rd)] ; xmm6 += round value
+ psraw xmm6, VP9_FILTER_SHIFT ; xmm6 /= 128
+
+ paddw xmm4, [GLOBAL(rd)] ; xmm4 += round value
+ psraw xmm4, VP9_FILTER_SHIFT ; xmm4 /= 128
+
+ packuswb xmm6, xmm4
+ movdqa xmm5, xmm7
+
+ punpcklbw xmm5, xmm6
+ pmaddubsw xmm5, xmm2
+
+ punpckhbw xmm7, xmm6
+ pmaddubsw xmm7, xmm2
+
+ paddw xmm5, [GLOBAL(rd)] ; xmm5 += round value
+ psraw xmm5, VP9_FILTER_SHIFT ; xmm5 /= 128
+
+ paddw xmm7, [GLOBAL(rd)] ; xmm7 += round value
+ psraw xmm7, VP9_FILTER_SHIFT ; xmm7 /= 128
+
+ packuswb xmm5, xmm7
+ movdqa xmm7, xmm6
+
+ movdqa [rdi], xmm5 ; store the results in the destination
+%if ABI_IS_32BIT
+ add rdi, DWORD PTR arg(5) ; dst_pitch
+%else
+ add rdi, r8
+%endif
+
+ cmp rdi, rcx
+ jne .next_row
+
+ jmp .done
+
+.b16x16_sp_only:
+ movsxd rax, dword ptr arg(3) ; yoffset
+ shl rax, 4
+ lea rax, [rax + rcx] ; VFilter
+
+ mov rdi, arg(4) ; dst_ptr
+ mov rsi, arg(0) ; src_ptr
+ movsxd rdx, dword ptr arg(5) ; dst_pitch
+
+ movdqa xmm1, [rax] ; VFilter
+
+ lea rcx, [rdi+rdx*8]
+ lea rcx, [rcx+rdx*8]
+ movsxd rax, dword ptr arg(1) ; src_pixels_per_line
+
+ ; get the first horizontal line done
+ movq xmm4, [rsi] ; load row 0
+ movq xmm2, [rsi + 8] ; load row 0
+
+ lea rsi, [rsi + rax] ; next line
+.next_row_sp:
+ movq xmm3, [rsi] ; load row + 1
+ movq xmm5, [rsi + 8] ; load row + 1
+
+ punpcklbw xmm4, xmm3
+ punpcklbw xmm2, xmm5
+
+ pmaddubsw xmm4, xmm1
+ movq xmm7, [rsi + rax] ; load row + 2
+
+ pmaddubsw xmm2, xmm1
+ movq xmm6, [rsi + rax + 8] ; load row + 2
+
+ punpcklbw xmm3, xmm7
+ punpcklbw xmm5, xmm6
+
+ pmaddubsw xmm3, xmm1
+ paddw xmm4, [GLOBAL(rd)]
+
+ pmaddubsw xmm5, xmm1
+ paddw xmm2, [GLOBAL(rd)]
+
+ psraw xmm4, VP9_FILTER_SHIFT
+ psraw xmm2, VP9_FILTER_SHIFT
+
+ packuswb xmm4, xmm2
+ paddw xmm3, [GLOBAL(rd)]
+
+ movdqa [rdi], xmm4 ; store row 0
+ paddw xmm5, [GLOBAL(rd)]
+
+ psraw xmm3, VP9_FILTER_SHIFT
+ psraw xmm5, VP9_FILTER_SHIFT
+
+ packuswb xmm3, xmm5
+ movdqa xmm4, xmm7
+
+ movdqa [rdi + rdx],xmm3 ; store row 1
+ lea rsi, [rsi + 2*rax]
+
+ movdqa xmm2, xmm6
+ lea rdi, [rdi + 2*rdx]
+
+ cmp rdi, rcx
+ jne .next_row_sp
+
+ jmp .done
+
+.b16x16_fp_only:
+ lea rcx, [rdi+rdx*8]
+ lea rcx, [rcx+rdx*8]
+ movsxd rax, dword ptr arg(1) ; src_pixels_per_line
+
+.next_row_fp:
+ movq xmm2, [rsi] ; 00 01 02 03 04 05 06 07
+ movq xmm4, [rsi+1] ; 01 02 03 04 05 06 07 08
+
+ punpcklbw xmm2, xmm4
+ movq xmm3, [rsi+8] ; 08 09 10 11 12 13 14 15
+
+ pmaddubsw xmm2, xmm1
+ movq xmm4, [rsi+9] ; 09 10 11 12 13 14 15 16
+
+ lea rsi, [rsi + rax] ; next line
+ punpcklbw xmm3, xmm4
+
+ pmaddubsw xmm3, xmm1
+ movq xmm5, [rsi]
+
+ paddw xmm2, [GLOBAL(rd)]
+ movq xmm7, [rsi+1]
+
+ movq xmm6, [rsi+8]
+ psraw xmm2, VP9_FILTER_SHIFT
+
+ punpcklbw xmm5, xmm7
+ movq xmm7, [rsi+9]
+
+ paddw xmm3, [GLOBAL(rd)]
+ pmaddubsw xmm5, xmm1
+
+ psraw xmm3, VP9_FILTER_SHIFT
+ punpcklbw xmm6, xmm7
+
+ packuswb xmm2, xmm3
+ pmaddubsw xmm6, xmm1
+
+ movdqa [rdi], xmm2 ; store the results in the destination
+ paddw xmm5, [GLOBAL(rd)]
+
+ lea rdi, [rdi + rdx] ; dst_pitch
+ psraw xmm5, VP9_FILTER_SHIFT
+
+ paddw xmm6, [GLOBAL(rd)]
+ psraw xmm6, VP9_FILTER_SHIFT
+
+ packuswb xmm5, xmm6
+ lea rsi, [rsi + rax] ; next line
+
+ movdqa [rdi], xmm5 ; store the results in the destination
+ lea rdi, [rdi + rdx] ; dst_pitch
+
+ cmp rdi, rcx
+
+ jne .next_row_fp
+
+.done:
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+;void vp9_bilinear_predict8x8_ssse3
+;(
+; unsigned char *src_ptr,
+; int src_pixels_per_line,
+; int xoffset,
+; int yoffset,
+; unsigned char *dst_ptr,
+; int dst_pitch
+;)
+global sym(vp9_bilinear_predict8x8_ssse3)
+sym(vp9_bilinear_predict8x8_ssse3):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ sub rsp, 144 ; reserve 144 bytes
+
+ lea rcx, [GLOBAL(bilinear_filters_ssse3)]
+
+ mov rsi, arg(0) ;src_ptr
+ movsxd rdx, dword ptr arg(1) ;src_pixels_per_line
+
+    ;Read in 9 lines of unaligned source data and stage them on the stack.
+    ;This gives a big performance boost.
+ movdqu xmm0, [rsi]
+ lea rax, [rdx + rdx*2]
+ movdqu xmm1, [rsi+rdx]
+ movdqu xmm2, [rsi+rdx*2]
+ add rsi, rax
+ movdqu xmm3, [rsi]
+ movdqu xmm4, [rsi+rdx]
+ movdqu xmm5, [rsi+rdx*2]
+ add rsi, rax
+ movdqu xmm6, [rsi]
+ movdqu xmm7, [rsi+rdx]
+
+ movdqa XMMWORD PTR [rsp], xmm0
+
+ movdqu xmm0, [rsi+rdx*2]
+
+ movdqa XMMWORD PTR [rsp+16], xmm1
+ movdqa XMMWORD PTR [rsp+32], xmm2
+ movdqa XMMWORD PTR [rsp+48], xmm3
+ movdqa XMMWORD PTR [rsp+64], xmm4
+ movdqa XMMWORD PTR [rsp+80], xmm5
+ movdqa XMMWORD PTR [rsp+96], xmm6
+ movdqa XMMWORD PTR [rsp+112], xmm7
+ movdqa XMMWORD PTR [rsp+128], xmm0
+
+ movsxd rax, dword ptr arg(2) ; xoffset
+ cmp rax, 0 ; skip first_pass filter if xoffset=0
+ je .b8x8_sp_only
+
+ shl rax, 4
+ add rax, rcx ; HFilter
+
+ mov rdi, arg(4) ; dst_ptr
+ movsxd rdx, dword ptr arg(5) ; dst_pitch
+
+ movdqa xmm0, [rax]
+
+ movsxd rax, dword ptr arg(3) ; yoffset
+ cmp rax, 0 ; skip second_pass filter if yoffset=0
+ je .b8x8_fp_only
+
+ shl rax, 4
+ lea rax, [rax + rcx] ; VFilter
+
+ lea rcx, [rdi+rdx*8]
+
+ movdqa xmm1, [rax]
+
+ ; get the first horizontal line done
+ movdqa xmm3, [rsp] ; 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15
+ movdqa xmm5, xmm3 ; 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15 xx
+
+ psrldq xmm5, 1
+ lea rsp, [rsp + 16] ; next line
+
+ punpcklbw xmm3, xmm5 ; 00 01 01 02 02 03 03 04 04 05 05 06 06 07 07 08
+ pmaddubsw xmm3, xmm0 ; 00 02 04 06 08 10 12 14
+
+ paddw xmm3, [GLOBAL(rd)] ; xmm3 += round value
+ psraw xmm3, VP9_FILTER_SHIFT ; xmm3 /= 128
+
+ movdqa xmm7, xmm3
+ packuswb xmm7, xmm7 ; 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15
+
+.next_row:
+ movdqa xmm6, [rsp] ; 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15
+ lea rsp, [rsp + 16] ; next line
+
+ movdqa xmm5, xmm6
+
+ psrldq xmm5, 1
+
+ punpcklbw xmm6, xmm5
+ pmaddubsw xmm6, xmm0
+
+ paddw xmm6, [GLOBAL(rd)] ; xmm6 += round value
+ psraw xmm6, VP9_FILTER_SHIFT ; xmm6 /= 128
+
+ packuswb xmm6, xmm6
+
+ punpcklbw xmm7, xmm6
+ pmaddubsw xmm7, xmm1
+
+ paddw xmm7, [GLOBAL(rd)] ; xmm7 += round value
+ psraw xmm7, VP9_FILTER_SHIFT ; xmm7 /= 128
+
+ packuswb xmm7, xmm7
+
+ movq [rdi], xmm7 ; store the results in the destination
+ lea rdi, [rdi + rdx]
+
+ movdqa xmm7, xmm6
+
+ cmp rdi, rcx
+ jne .next_row
+
+ jmp .done8x8
+
+.b8x8_sp_only:
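+    ; xoffset == 0: apply only the second-pass (vertical) bilinear filter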
+ movsxd rax, dword ptr arg(3) ; yoffset
+ shl rax, 4
+ lea rax, [rax + rcx] ; VFilter
+
+ mov rdi, arg(4) ;dst_ptr
+ movsxd rdx, dword ptr arg(5) ; dst_pitch
+
+ movdqa xmm0, [rax] ; VFilter
+
+ movq xmm1, XMMWORD PTR [rsp]
+ movq xmm2, XMMWORD PTR [rsp+16]
+
+ movq xmm3, XMMWORD PTR [rsp+32]
+ punpcklbw xmm1, xmm2
+
+ movq xmm4, XMMWORD PTR [rsp+48]
+ punpcklbw xmm2, xmm3
+
+ movq xmm5, XMMWORD PTR [rsp+64]
+ punpcklbw xmm3, xmm4
+
+ movq xmm6, XMMWORD PTR [rsp+80]
+ punpcklbw xmm4, xmm5
+
+ movq xmm7, XMMWORD PTR [rsp+96]
+ punpcklbw xmm5, xmm6
+
+ pmaddubsw xmm1, xmm0
+ pmaddubsw xmm2, xmm0
+
+ pmaddubsw xmm3, xmm0
+ pmaddubsw xmm4, xmm0
+
+ pmaddubsw xmm5, xmm0
+ punpcklbw xmm6, xmm7
+
+ pmaddubsw xmm6, xmm0
+ paddw xmm1, [GLOBAL(rd)]
+
+ paddw xmm2, [GLOBAL(rd)]
+ psraw xmm1, VP9_FILTER_SHIFT
+
+ paddw xmm3, [GLOBAL(rd)]
+ psraw xmm2, VP9_FILTER_SHIFT
+
+ paddw xmm4, [GLOBAL(rd)]
+ psraw xmm3, VP9_FILTER_SHIFT
+
+ paddw xmm5, [GLOBAL(rd)]
+ psraw xmm4, VP9_FILTER_SHIFT
+
+ paddw xmm6, [GLOBAL(rd)]
+ psraw xmm5, VP9_FILTER_SHIFT
+
+ psraw xmm6, VP9_FILTER_SHIFT
+ packuswb xmm1, xmm1
+
+ packuswb xmm2, xmm2
+ movq [rdi], xmm1
+
+ packuswb xmm3, xmm3
+ movq [rdi+rdx], xmm2
+
+ packuswb xmm4, xmm4
+ movq xmm1, XMMWORD PTR [rsp+112]
+
+ lea rdi, [rdi + 2*rdx]
+ movq xmm2, XMMWORD PTR [rsp+128]
+
+ packuswb xmm5, xmm5
+ movq [rdi], xmm3
+
+ packuswb xmm6, xmm6
+ movq [rdi+rdx], xmm4
+
+ lea rdi, [rdi + 2*rdx]
+ punpcklbw xmm7, xmm1
+
+ movq [rdi], xmm5
+ pmaddubsw xmm7, xmm0
+
+ movq [rdi+rdx], xmm6
+ punpcklbw xmm1, xmm2
+
+ pmaddubsw xmm1, xmm0
+ paddw xmm7, [GLOBAL(rd)]
+
+ psraw xmm7, VP9_FILTER_SHIFT
+ paddw xmm1, [GLOBAL(rd)]
+
+ psraw xmm1, VP9_FILTER_SHIFT
+ packuswb xmm7, xmm7
+
+ packuswb xmm1, xmm1
+ lea rdi, [rdi + 2*rdx]
+
+ movq [rdi], xmm7
+
+ movq [rdi+rdx], xmm1
+ lea rsp, [rsp + 144]
+
+ jmp .done8x8
+
+.b8x8_fp_only:
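+    ; yoffset == 0: apply only the first-pass (horizontal) bilinear filter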
+ lea rcx, [rdi+rdx*8]
+
+.next_row_fp:
+ movdqa xmm1, XMMWORD PTR [rsp]
+ movdqa xmm3, XMMWORD PTR [rsp+16]
+
+ movdqa xmm2, xmm1
+ movdqa xmm5, XMMWORD PTR [rsp+32]
+
+ psrldq xmm2, 1
+ movdqa xmm7, XMMWORD PTR [rsp+48]
+
+ movdqa xmm4, xmm3
+ psrldq xmm4, 1
+
+ movdqa xmm6, xmm5
+ psrldq xmm6, 1
+
+ punpcklbw xmm1, xmm2
+ pmaddubsw xmm1, xmm0
+
+ punpcklbw xmm3, xmm4
+ pmaddubsw xmm3, xmm0
+
+ punpcklbw xmm5, xmm6
+ pmaddubsw xmm5, xmm0
+
+ movdqa xmm2, xmm7
+ psrldq xmm2, 1
+
+ punpcklbw xmm7, xmm2
+ pmaddubsw xmm7, xmm0
+
+ paddw xmm1, [GLOBAL(rd)]
+ psraw xmm1, VP9_FILTER_SHIFT
+
+ paddw xmm3, [GLOBAL(rd)]
+ psraw xmm3, VP9_FILTER_SHIFT
+
+ paddw xmm5, [GLOBAL(rd)]
+ psraw xmm5, VP9_FILTER_SHIFT
+
+ paddw xmm7, [GLOBAL(rd)]
+ psraw xmm7, VP9_FILTER_SHIFT
+
+ packuswb xmm1, xmm1
+ packuswb xmm3, xmm3
+
+ packuswb xmm5, xmm5
+ movq [rdi], xmm1
+
+ packuswb xmm7, xmm7
+ movq [rdi+rdx], xmm3
+
+ lea rdi, [rdi + 2*rdx]
+ movq [rdi], xmm5
+
+ lea rsp, [rsp + 4*16]
+ movq [rdi+rdx], xmm7
+
+ lea rdi, [rdi + 2*rdx]
+ cmp rdi, rcx
+
+ jne .next_row_fp
+
+ lea rsp, [rsp + 16]
+
+.done8x8:
+ ;add rsp, 144
+ pop rsp
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+SECTION_RODATA
+align 16
+shuf1b:
+ db 0, 5, 1, 6, 2, 7, 3, 8, 4, 9, 5, 10, 6, 11, 7, 12
+shuf2b:
+ db 2, 4, 3, 5, 4, 6, 5, 7, 6, 8, 7, 9, 8, 10, 9, 11
+shuf3b:
+ db 1, 3, 2, 4, 3, 5, 4, 6, 5, 7, 6, 8, 7, 9, 8, 10
+
+align 16
+shuf2bfrom1:
+ db 4, 8, 6, 1, 8, 3, 1, 5, 3, 7, 5, 9, 7,11, 9,13
+align 16
+shuf3bfrom1:
+ db 2, 6, 4, 8, 6, 1, 8, 3, 1, 5, 3, 7, 5, 9, 7,11
+
+align 16
+rd:
+ times 8 dw 0x40
+
+align 16
+k0_k5:
+ times 8 db 0, 0 ;placeholder
+ times 8 db 0, 0
+ times 8 db 2, 1
+ times 8 db 0, 0
+ times 8 db 3, 3
+ times 8 db 0, 0
+ times 8 db 1, 2
+ times 8 db 0, 0
+k1_k3:
+ times 8 db 0, 0 ;placeholder
+ times 8 db -6, 12
+ times 8 db -11, 36
+ times 8 db -9, 50
+ times 8 db -16, 77
+ times 8 db -6, 93
+ times 8 db -8, 108
+ times 8 db -1, 123
+k2_k4:
+ times 8 db 128, 0 ;placeholder
+ times 8 db 123, -1
+ times 8 db 108, -8
+ times 8 db 93, -6
+ times 8 db 77, -16
+ times 8 db 50, -9
+ times 8 db 36, -11
+ times 8 db 12, -6
+align 16
+bilinear_filters_ssse3:
+ times 8 db 128, 0
+ times 8 db 120, 8
+ times 8 db 112, 16
+ times 8 db 104, 24
+ times 8 db 96, 32
+ times 8 db 88, 40
+ times 8 db 80, 48
+ times 8 db 72, 56
+ times 8 db 64, 64
+ times 8 db 56, 72
+ times 8 db 48, 80
+ times 8 db 40, 88
+ times 8 db 32, 96
+ times 8 db 24, 104
+ times 8 db 16, 112
+ times 8 db 8, 120
+
--- /dev/null
+++ b/vp9/common/x86/subpixel_x86.h
@@ -1,0 +1,122 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef SUBPIXEL_X86_H
+#define SUBPIXEL_X86_H
+
+/* Note:
+ *
+ * This platform is commonly built for runtime CPU detection. If you modify
+ * any of the function mappings present in this file, be sure to also update
+ * them in the function pointer initialization code
+ */
+
+#if HAVE_MMX
+extern prototype_subpixel_predict(vp9_sixtap_predict16x16_mmx);
+extern prototype_subpixel_predict(vp9_sixtap_predict8x8_mmx);
+extern prototype_subpixel_predict(vp9_sixtap_predict8x4_mmx);
+extern prototype_subpixel_predict(vp9_sixtap_predict4x4_mmx);
+extern prototype_subpixel_predict(vp9_bilinear_predict16x16_mmx);
+extern prototype_subpixel_predict(vp9_bilinear_predict8x8_mmx);
+extern prototype_subpixel_predict(vp9_bilinear_predict8x4_mmx);
+extern prototype_subpixel_predict(vp9_bilinear_predict4x4_mmx);
+
+
+#if !CONFIG_RUNTIME_CPU_DETECT
+#undef vp9_subpix_sixtap16x16
+#define vp9_subpix_sixtap16x16 vp9_sixtap_predict16x16_mmx
+
+#undef vp9_subpix_sixtap8x8
+#define vp9_subpix_sixtap8x8 vp9_sixtap_predict8x8_mmx
+
+#undef vp9_subpix_sixtap8x4
+#define vp9_subpix_sixtap8x4 vp9_sixtap_predict8x4_mmx
+
+#undef vp9_subpix_sixtap4x4
+#define vp9_subpix_sixtap4x4 vp9_sixtap_predict4x4_mmx
+
+#undef vp9_subpix_bilinear16x16
+#define vp9_subpix_bilinear16x16 vp9_bilinear_predict16x16_mmx
+
+#undef vp9_subpix_bilinear8x8
+#define vp9_subpix_bilinear8x8 vp9_bilinear_predict8x8_mmx
+
+#undef vp9_subpix_bilinear8x4
+#define vp9_subpix_bilinear8x4 vp9_bilinear_predict8x4_mmx
+
+#undef vp9_subpix_bilinear4x4
+#define vp9_subpix_bilinear4x4 vp9_bilinear_predict4x4_mmx
+
+#endif
+#endif
+
+
+#if HAVE_SSE2
+extern prototype_subpixel_predict(vp9_sixtap_predict16x16_sse2);
+extern prototype_subpixel_predict(vp9_sixtap_predict8x8_sse2);
+extern prototype_subpixel_predict(vp9_sixtap_predict8x4_sse2);
+extern prototype_subpixel_predict(vp9_bilinear_predict16x16_sse2);
+extern prototype_subpixel_predict(vp9_bilinear_predict8x8_sse2);
+
+
+#if !CONFIG_RUNTIME_CPU_DETECT
+#undef vp9_subpix_sixtap16x16
+#define vp9_subpix_sixtap16x16 vp9_sixtap_predict16x16_sse2
+
+#undef vp9_subpix_sixtap8x8
+#define vp9_subpix_sixtap8x8 vp9_sixtap_predict8x8_sse2
+
+#undef vp9_subpix_sixtap8x4
+#define vp9_subpix_sixtap8x4 vp9_sixtap_predict8x4_sse2
+
+#undef vp9_subpix_bilinear16x16
+#define vp9_subpix_bilinear16x16 vp9_bilinear_predict16x16_sse2
+
+#undef vp9_subpix_bilinear8x8
+#define vp9_subpix_bilinear8x8 vp9_bilinear_predict8x8_sse2
+
+#endif
+#endif
+
+#if HAVE_SSSE3
+extern prototype_subpixel_predict(vp9_sixtap_predict16x16_ssse3);
+extern prototype_subpixel_predict(vp9_sixtap_predict8x8_ssse3);
+extern prototype_subpixel_predict(vp9_sixtap_predict8x4_ssse3);
+extern prototype_subpixel_predict(vp9_sixtap_predict4x4_ssse3);
+extern prototype_subpixel_predict(vp9_bilinear_predict16x16_ssse3);
+extern prototype_subpixel_predict(vp9_bilinear_predict8x8_ssse3);
+
+#if !CONFIG_RUNTIME_CPU_DETECT
+#undef vp9_subpix_sixtap16x16
+#define vp9_subpix_sixtap16x16 vp9_sixtap_predict16x16_ssse3
+
+#undef vp9_subpix_sixtap8x8
+#define vp9_subpix_sixtap8x8 vp9_sixtap_predict8x8_ssse3
+
+#undef vp9_subpix_sixtap8x4
+#define vp9_subpix_sixtap8x4 vp9_sixtap_predict8x4_ssse3
+
+#undef vp9_subpix_sixtap4x4
+#define vp9_subpix_sixtap4x4 vp9_sixtap_predict4x4_ssse3
+
+
+#undef vp9_subpix_bilinear16x16
+#define vp9_subpix_bilinear16x16 vp9_bilinear_predict16x16_ssse3
+
+#undef vp9_subpix_bilinear8x8
+#define vp9_subpix_bilinear8x8 vp9_bilinear_predict8x8_ssse3
+
+#endif
+#endif
+
+
+
+#endif
--- /dev/null
+++ b/vp9/common/x86/vp8_asm_stubs.c
@@ -1,0 +1,602 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "vpx_ports/config.h"
+#include "vpx_ports/mem.h"
+#include "vp9/common/subpixel.h"
+
+extern const short vp9_six_tap_mmx[16][6 * 8];
+
+extern const short vp9_bilinear_filters_8x_mmx[16][2 * 8];
+
+extern void vp9_filter_block1d_h6_mmx(unsigned char *src_ptr,
+ unsigned short *output_ptr,
+ unsigned int src_pixels_per_line,
+ unsigned int pixel_step,
+ unsigned int output_height,
+ unsigned int output_width,
+ const short *vp9_filter);
+
+extern void vp9_filter_block1dc_v6_mmx(unsigned short *src_ptr,
+ unsigned char *output_ptr,
+ int output_pitch,
+ unsigned int pixels_per_line,
+ unsigned int pixel_step,
+ unsigned int output_height,
+ unsigned int output_width,
+ const short *vp9_filter);
+
+extern void vp9_filter_block1d8_h6_sse2(unsigned char *src_ptr,
+ unsigned short *output_ptr,
+ unsigned int src_pixels_per_line,
+ unsigned int pixel_step,
+ unsigned int output_height,
+ unsigned int output_width,
+ const short *vp9_filter);
+
+extern void vp9_filter_block1d16_h6_sse2(unsigned char *src_ptr,
+ unsigned short *output_ptr,
+ unsigned int src_pixels_per_line,
+ unsigned int pixel_step,
+ unsigned int output_height,
+ unsigned int output_width,
+ const short *vp9_filter);
+
+extern void vp9_filter_block1d8_v6_sse2(unsigned short *src_ptr,
+ unsigned char *output_ptr,
+                                        int dst_pitch,
+ unsigned int pixels_per_line,
+ unsigned int pixel_step,
+ unsigned int output_height,
+ unsigned int output_width,
+ const short *vp9_filter);
+
+extern void vp9_filter_block1d16_v6_sse2(unsigned short *src_ptr,
+ unsigned char *output_ptr,
+                                         int dst_pitch,
+ unsigned int pixels_per_line,
+ unsigned int pixel_step,
+ unsigned int output_height,
+ unsigned int output_width,
+ const short *vp9_filter);
+
+extern void vp9_unpack_block1d16_h6_sse2(unsigned char *src_ptr,
+ unsigned short *output_ptr,
+ unsigned int src_pixels_per_line,
+ unsigned int output_height,
+ unsigned int output_width);
+
+extern void vp9_filter_block1d8_h6_only_sse2(unsigned char *src_ptr,
+ unsigned int src_pixels_per_line,
+ unsigned char *output_ptr,
+ int dst_pitch,
+ unsigned int output_height,
+ const short *vp9_filter);
+
+extern void vp9_filter_block1d16_h6_only_sse2(unsigned char *src_ptr,
+                                              unsigned int src_pixels_per_line,
+ unsigned char *output_ptr,
+ int dst_pitch,
+ unsigned int output_height,
+ const short *vp9_filter);
+
+extern void vp9_filter_block1d8_v6_only_sse2(unsigned char *src_ptr,
+ unsigned int src_pixels_per_line,
+ unsigned char *output_ptr,
+ int dst_pitch,
+ unsigned int output_height,
+ const short *vp9_filter);
+
+extern prototype_subpixel_predict(vp9_bilinear_predict8x8_mmx);
+
+#if HAVE_MMX
+void vp9_sixtap_predict4x4_mmx(unsigned char *src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ unsigned char *dst_ptr,
+ int dst_pitch) {
+#ifdef ANNOUNCE_FUNCTION
+ printf("vp9_sixtap_predict4x4_mmx\n");
+#endif
+  /* Temp data buffer used in filtering */
+ DECLARE_ALIGNED_ARRAY(16, unsigned short, fdata2, 16 * 16);
+ const short *hfilter, *vfilter;
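+  /* Two-pass 6-tap filter: horizontal pass into the 16-bit temp buffer,
+   * then vertical pass from the temp buffer into the destination. */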
+ hfilter = vp9_six_tap_mmx[xoffset];
+ vp9_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line), fdata2,
+ src_pixels_per_line, 1, 9, 8, hfilter);
+ vfilter = vp9_six_tap_mmx[yoffset];
+ vp9_filter_block1dc_v6_mmx(fdata2 + 8, dst_ptr, dst_pitch,
+ 8, 4, 4, 4, vfilter);
+}
+
+void vp9_sixtap_predict16x16_mmx(unsigned char *src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ unsigned char *dst_ptr,
+ int dst_pitch) {
+#ifdef ANNOUNCE_FUNCTION
+ printf("vp9_sixtap_predict16x16_mmx\n");
+#endif
+  /* Temp data buffer used in filtering */
+ DECLARE_ALIGNED_ARRAY(16, unsigned short, fdata2, 24 * 24);
+ const short *hfilter, *vfilter;
+
+ hfilter = vp9_six_tap_mmx[xoffset];
+ vp9_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line),
+ fdata2, src_pixels_per_line, 1, 21, 32,
+ hfilter);
+ vp9_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line) + 4,
+ fdata2 + 4, src_pixels_per_line, 1, 21, 32,
+ hfilter);
+ vp9_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line) + 8,
+ fdata2 + 8, src_pixels_per_line, 1, 21, 32,
+ hfilter);
+ vp9_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line) + 12,
+ fdata2 + 12, src_pixels_per_line, 1, 21, 32,
+ hfilter);
+
+ vfilter = vp9_six_tap_mmx[yoffset];
+ vp9_filter_block1dc_v6_mmx(fdata2 + 32, dst_ptr, dst_pitch,
+ 32, 16, 16, 16, vfilter);
+ vp9_filter_block1dc_v6_mmx(fdata2 + 36, dst_ptr + 4, dst_pitch,
+ 32, 16, 16, 16, vfilter);
+ vp9_filter_block1dc_v6_mmx(fdata2 + 40, dst_ptr + 8, dst_pitch,
+ 32, 16, 16, 16, vfilter);
+ vp9_filter_block1dc_v6_mmx(fdata2 + 44, dst_ptr + 12, dst_pitch,
+ 32, 16, 16, 16, vfilter);
+}
+
+void vp9_sixtap_predict8x8_mmx(unsigned char *src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ unsigned char *dst_ptr,
+ int dst_pitch) {
+#ifdef ANNOUNCE_FUNCTION
+ printf("vp9_sixtap_predict8x8_mmx\n");
+#endif
+  /* Temp data buffer used in filtering */
+ DECLARE_ALIGNED_ARRAY(16, unsigned short, fdata2, 256);
+ const short *hfilter, *vfilter;
+
+ hfilter = vp9_six_tap_mmx[xoffset];
+ vp9_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line),
+ fdata2, src_pixels_per_line, 1, 13, 16,
+ hfilter);
+ vp9_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line) + 4,
+ fdata2 + 4, src_pixels_per_line, 1, 13, 16,
+ hfilter);
+
+ vfilter = vp9_six_tap_mmx[yoffset];
+ vp9_filter_block1dc_v6_mmx(fdata2 + 16, dst_ptr, dst_pitch,
+ 16, 8, 8, 8, vfilter);
+ vp9_filter_block1dc_v6_mmx(fdata2 + 20, dst_ptr + 4, dst_pitch,
+ 16, 8, 8, 8, vfilter);
+}
+
+void vp9_sixtap_predict8x4_mmx(unsigned char *src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ unsigned char *dst_ptr,
+ int dst_pitch) {
+#ifdef ANNOUNCE_FUNCTION
+ printf("vp9_sixtap_predict8x4_mmx\n");
+#endif
+  /* Temp data buffer used in filtering */
+ DECLARE_ALIGNED_ARRAY(16, unsigned short, fdata2, 256);
+ const short *hfilter, *vfilter;
+
+ hfilter = vp9_six_tap_mmx[xoffset];
+ vp9_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line),
+ fdata2, src_pixels_per_line, 1, 9, 16, hfilter);
+ vp9_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line) + 4,
+ fdata2 + 4, src_pixels_per_line, 1, 9, 16, hfilter);
+
+ vfilter = vp9_six_tap_mmx[yoffset];
+ vp9_filter_block1dc_v6_mmx(fdata2 + 16, dst_ptr, dst_pitch,
+ 16, 8, 4, 8, vfilter);
+ vp9_filter_block1dc_v6_mmx(fdata2 + 20, dst_ptr + 4, dst_pitch,
+ 16, 8, 4, 8, vfilter);
+}
+
+void vp9_bilinear_predict16x16_mmx(unsigned char *src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ unsigned char *dst_ptr,
+ int dst_pitch) {
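+  /* Build the 16x16 prediction from four 8x8 bilinear calls:
+   * top-left, top-right, bottom-left, bottom-right. */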
+ vp9_bilinear_predict8x8_mmx(src_ptr,
+ src_pixels_per_line, xoffset, yoffset,
+ dst_ptr, dst_pitch);
+ vp9_bilinear_predict8x8_mmx(src_ptr + 8,
+ src_pixels_per_line, xoffset, yoffset,
+ dst_ptr + 8, dst_pitch);
+ vp9_bilinear_predict8x8_mmx(src_ptr + 8 * src_pixels_per_line,
+ src_pixels_per_line, xoffset, yoffset,
+ dst_ptr + dst_pitch * 8, dst_pitch);
+ vp9_bilinear_predict8x8_mmx(src_ptr + 8 * src_pixels_per_line + 8,
+ src_pixels_per_line, xoffset, yoffset,
+ dst_ptr + dst_pitch * 8 + 8, dst_pitch);
+}
+#endif
+
+#if HAVE_SSE2
+void vp9_sixtap_predict16x16_sse2(unsigned char *src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ unsigned char *dst_ptr,
+ int dst_pitch) {
+  /* Temp data buffer used in filtering */
+ DECLARE_ALIGNED_ARRAY(16, unsigned short, fdata2, 24 * 24);
+ const short *hfilter, *vfilter;
+#ifdef ANNOUNCE_FUNCTION
+ printf("vp9_sixtap_predict16x16_sse2\n");
+#endif
+
+ if (xoffset) {
+ if (yoffset) {
+ hfilter = vp9_six_tap_mmx[xoffset];
+ vp9_filter_block1d16_h6_sse2(src_ptr - (2 * src_pixels_per_line), fdata2,
+ src_pixels_per_line, 1, 21, 32, hfilter);
+ vfilter = vp9_six_tap_mmx[yoffset];
+ vp9_filter_block1d16_v6_sse2(fdata2 + 32, dst_ptr, dst_pitch,
+ 32, 16, 16, dst_pitch, vfilter);
+ } else {
+ /* First-pass only */
+ hfilter = vp9_six_tap_mmx[xoffset];
+ vp9_filter_block1d16_h6_only_sse2(src_ptr, src_pixels_per_line,
+ dst_ptr, dst_pitch, 16, hfilter);
+ }
+ } else {
+ /* Second-pass only */
+ vfilter = vp9_six_tap_mmx[yoffset];
+ vp9_unpack_block1d16_h6_sse2(src_ptr - (2 * src_pixels_per_line), fdata2,
+ src_pixels_per_line, 21, 32);
+ vp9_filter_block1d16_v6_sse2(fdata2 + 32, dst_ptr, dst_pitch,
+ 32, 16, 16, dst_pitch, vfilter);
+ }
+}
+
+void vp9_sixtap_predict8x8_sse2(unsigned char *src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ unsigned char *dst_ptr,
+ int dst_pitch) {
+  /* Temp data buffer used in filtering */
+ DECLARE_ALIGNED_ARRAY(16, unsigned short, fdata2, 256);
+ const short *hfilter, *vfilter;
+#ifdef ANNOUNCE_FUNCTION
+ printf("vp9_sixtap_predict8x8_sse2\n");
+#endif
+
+ if (xoffset) {
+ if (yoffset) {
+ hfilter = vp9_six_tap_mmx[xoffset];
+ vp9_filter_block1d8_h6_sse2(src_ptr - (2 * src_pixels_per_line), fdata2,
+ src_pixels_per_line, 1, 13, 16, hfilter);
+ vfilter = vp9_six_tap_mmx[yoffset];
+ vp9_filter_block1d8_v6_sse2(fdata2 + 16, dst_ptr, dst_pitch,
+ 16, 8, 8, dst_pitch, vfilter);
+ } else {
+ /* First-pass only */
+ hfilter = vp9_six_tap_mmx[xoffset];
+ vp9_filter_block1d8_h6_only_sse2(src_ptr, src_pixels_per_line,
+ dst_ptr, dst_pitch, 8, hfilter);
+ }
+ } else {
+ /* Second-pass only */
+ vfilter = vp9_six_tap_mmx[yoffset];
+ vp9_filter_block1d8_v6_only_sse2(src_ptr - (2 * src_pixels_per_line),
+ src_pixels_per_line,
+ dst_ptr, dst_pitch, 8, vfilter);
+ }
+}
+
+void vp9_sixtap_predict8x4_sse2(unsigned char *src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ unsigned char *dst_ptr,
+ int dst_pitch) {
+  /* Temp data buffer used in filtering */
+ DECLARE_ALIGNED_ARRAY(16, unsigned short, fdata2, 256);
+ const short *hfilter, *vfilter;
+#ifdef ANNOUNCE_FUNCTION
+ printf("vp9_sixtap_predict8x4_sse2\n");
+#endif
+
+ if (xoffset) {
+ if (yoffset) {
+ hfilter = vp9_six_tap_mmx[xoffset];
+ vp9_filter_block1d8_h6_sse2(src_ptr - (2 * src_pixels_per_line), fdata2,
+ src_pixels_per_line, 1, 9, 16, hfilter);
+ vfilter = vp9_six_tap_mmx[yoffset];
+ vp9_filter_block1d8_v6_sse2(fdata2 + 16, dst_ptr, dst_pitch,
+ 16, 8, 4, dst_pitch, vfilter);
+ } else {
+ /* First-pass only */
+ hfilter = vp9_six_tap_mmx[xoffset];
+ vp9_filter_block1d8_h6_only_sse2(src_ptr, src_pixels_per_line,
+ dst_ptr, dst_pitch, 4, hfilter);
+ }
+ } else {
+ /* Second-pass only */
+ vfilter = vp9_six_tap_mmx[yoffset];
+ vp9_filter_block1d8_v6_only_sse2(src_ptr - (2 * src_pixels_per_line),
+ src_pixels_per_line,
+ dst_ptr, dst_pitch, 4, vfilter);
+ }
+}
+#endif
+
+#if HAVE_SSSE3
+extern void vp9_filter_block1d8_h6_ssse3(unsigned char *src_ptr,
+ unsigned int src_pixels_per_line,
+ unsigned char *output_ptr,
+ unsigned int output_pitch,
+ unsigned int output_height,
+ unsigned int vp9_filter_index);
+
+extern void vp9_filter_block1d16_h6_ssse3(unsigned char *src_ptr,
+ unsigned int src_pixels_per_line,
+ unsigned char *output_ptr,
+ unsigned int output_pitch,
+ unsigned int output_height,
+ unsigned int vp9_filter_index);
+
+extern void vp9_filter_block1d16_v6_ssse3(unsigned char *src_ptr,
+ unsigned int src_pitch,
+ unsigned char *output_ptr,
+ unsigned int out_pitch,
+ unsigned int output_height,
+ unsigned int vp9_filter_index);
+
+extern void vp9_filter_block1d8_v6_ssse3(unsigned char *src_ptr,
+ unsigned int src_pitch,
+ unsigned char *output_ptr,
+ unsigned int out_pitch,
+ unsigned int output_height,
+ unsigned int vp9_filter_index);
+
+extern void vp9_filter_block1d4_h6_ssse3(unsigned char *src_ptr,
+ unsigned int src_pixels_per_line,
+ unsigned char *output_ptr,
+ unsigned int output_pitch,
+ unsigned int output_height,
+ unsigned int vp9_filter_index);
+
+extern void vp9_filter_block1d4_v6_ssse3(unsigned char *src_ptr,
+ unsigned int src_pitch,
+ unsigned char *output_ptr,
+ unsigned int out_pitch,
+ unsigned int output_height,
+ unsigned int vp9_filter_index);
+
+void vp9_sixtap_predict16x16_ssse3(unsigned char *src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ unsigned char *dst_ptr,
+ int dst_pitch) {
+ DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 24 * 24);
+#ifdef ANNOUNCE_FUNCTION
+ printf("vp9_sixtap_predict16x16_ssse3\n");
+#endif
+
+ if (xoffset) {
+ if (yoffset) {
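+      /* Both offsets set: horizontal 6-tap pass over 21 rows into the
+       * temp buffer, then vertical 6-tap pass into the destination. */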
+ vp9_filter_block1d16_h6_ssse3(src_ptr - (2 * src_pixels_per_line),
+ src_pixels_per_line,
+ fdata2, 16, 21, xoffset);
+ vp9_filter_block1d16_v6_ssse3(fdata2, 16, dst_ptr, dst_pitch,
+ 16, yoffset);
+ } else {
+ /* First-pass only */
+ vp9_filter_block1d16_h6_ssse3(src_ptr, src_pixels_per_line,
+ dst_ptr, dst_pitch, 16, xoffset);
+ }
+ } else {
+ /* Second-pass only */
+ vp9_filter_block1d16_v6_ssse3(src_ptr - (2 * src_pixels_per_line),
+ src_pixels_per_line,
+ dst_ptr, dst_pitch, 16, yoffset);
+ }
+}
+
+void vp9_sixtap_predict8x8_ssse3(unsigned char *src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ unsigned char *dst_ptr,
+ int dst_pitch) {
+ DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 256);
+#ifdef ANNOUNCE_FUNCTION
+ printf("vp9_sixtap_predict8x8_ssse3\n");
+#endif
+
+ if (xoffset) {
+ if (yoffset) {
+ vp9_filter_block1d8_h6_ssse3(src_ptr - (2 * src_pixels_per_line),
+ src_pixels_per_line, fdata2, 8, 13, xoffset);
+ vp9_filter_block1d8_v6_ssse3(fdata2, 8, dst_ptr, dst_pitch, 8, yoffset);
+ } else {
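+      /* First-pass only */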
+ vp9_filter_block1d8_h6_ssse3(src_ptr, src_pixels_per_line,
+ dst_ptr, dst_pitch, 8, xoffset);
+ }
+ } else {
+ /* Second-pass only */
+ vp9_filter_block1d8_v6_ssse3(src_ptr - (2 * src_pixels_per_line),
+ src_pixels_per_line,
+ dst_ptr, dst_pitch, 8, yoffset);
+ }
+}
+
+void vp9_sixtap_predict8x4_ssse3(unsigned char *src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ unsigned char *dst_ptr,
+ int dst_pitch) {
+ DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 256);
+#ifdef ANNOUNCE_FUNCTION
+ printf("vp9_sixtap_predict8x4_ssse3\n");
+#endif
+
+ if (xoffset) {
+ if (yoffset) {
+ vp9_filter_block1d8_h6_ssse3(src_ptr - (2 * src_pixels_per_line),
+ src_pixels_per_line, fdata2, 8, 9, xoffset);
+ vp9_filter_block1d8_v6_ssse3(fdata2, 8, dst_ptr, dst_pitch, 4, yoffset);
+ } else {
+ /* First-pass only */
+ vp9_filter_block1d8_h6_ssse3(src_ptr, src_pixels_per_line,
+ dst_ptr, dst_pitch, 4, xoffset);
+ }
+ } else {
+ /* Second-pass only */
+ vp9_filter_block1d8_v6_ssse3(src_ptr - (2 * src_pixels_per_line),
+ src_pixels_per_line,
+ dst_ptr, dst_pitch, 4, yoffset);
+ }
+}
+
+void vp9_sixtap_predict4x4_ssse3(unsigned char *src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ unsigned char *dst_ptr,
+ int dst_pitch) {
+ DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 4 * 9);
+#ifdef ANNOUNCE_FUNCTION
+ printf("vp9_sixtap_predict4x4_ssse3\n");
+#endif
+
+ if (xoffset) {
+ if (yoffset) {
+ vp9_filter_block1d4_h6_ssse3(src_ptr - (2 * src_pixels_per_line),
+ src_pixels_per_line, fdata2, 4, 9, xoffset);
+ vp9_filter_block1d4_v6_ssse3(fdata2, 4, dst_ptr, dst_pitch, 4, yoffset);
+ } else {
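+      /* First-pass only */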
+ vp9_filter_block1d4_h6_ssse3(src_ptr, src_pixels_per_line,
+ dst_ptr, dst_pitch, 4, xoffset);
+ }
+ } else {
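+    /* Second-pass only */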
+ vp9_filter_block1d4_v6_ssse3(src_ptr - (2 * src_pixels_per_line),
+ src_pixels_per_line,
+ dst_ptr, dst_pitch, 4, yoffset);
+ }
+}
+
+void vp9_filter_block1d16_v8_ssse3(const unsigned char *src_ptr,
+ const unsigned int src_pitch,
+ unsigned char *output_ptr,
+ unsigned int out_pitch,
+ unsigned int output_height,
+ const short *filter);
+
+void vp9_filter_block1d16_h8_ssse3(const unsigned char *src_ptr,
+ const unsigned int src_pitch,
+ unsigned char *output_ptr,
+ unsigned int out_pitch,
+ unsigned int output_height,
+ const short *filter);
+
+void vp9_filter_block2d_16x16_8_ssse3(const unsigned char *src_ptr,
+ const unsigned int src_stride,
+ const short *hfilter_aligned16,
+ const short *vfilter_aligned16,
+ unsigned char *dst_ptr,
+ unsigned int dst_stride) {
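+  /* A center tap of 128 marks an identity filter for that direction,
+   * so the corresponding pass can be skipped. */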
+ if (hfilter_aligned16[3] != 128 && vfilter_aligned16[3] != 128) {
+ DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 23 * 16);
+
+ vp9_filter_block1d16_h8_ssse3(src_ptr - (3 * src_stride), src_stride,
+ fdata2, 16, 23, hfilter_aligned16);
+ vp9_filter_block1d16_v8_ssse3(fdata2, 16, dst_ptr, dst_stride, 16,
+ vfilter_aligned16);
+ } else {
+ if (hfilter_aligned16[3] != 128) {
+ vp9_filter_block1d16_h8_ssse3(src_ptr, src_stride, dst_ptr, dst_stride,
+ 16, hfilter_aligned16);
+ } else {
+ vp9_filter_block1d16_v8_ssse3(src_ptr - (3 * src_stride), src_stride,
+ dst_ptr, dst_stride, 16, vfilter_aligned16);
+ }
+ }
+}
+
+void vp9_filter_block1d8_v8_ssse3(const unsigned char *src_ptr,
+ const unsigned int src_pitch,
+ unsigned char *output_ptr,
+ unsigned int out_pitch,
+ unsigned int output_height,
+ const short *filter);
+
+void vp9_filter_block1d8_h8_ssse3(const unsigned char *src_ptr,
+ const unsigned int src_pitch,
+ unsigned char *output_ptr,
+ unsigned int out_pitch,
+ unsigned int output_height,
+ const short *filter);
+
+void vp9_filter_block2d_8x8_8_ssse3(const unsigned char *src_ptr,
+ const unsigned int src_stride,
+ const short *hfilter_aligned16,
+ const short *vfilter_aligned16,
+ unsigned char *dst_ptr,
+ unsigned int dst_stride) {
+ if (hfilter_aligned16[3] != 128 && vfilter_aligned16[3] != 128) {
+ DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 23 * 16);
+
+ vp9_filter_block1d8_h8_ssse3(src_ptr - (3 * src_stride), src_stride,
+ fdata2, 16, 15, hfilter_aligned16);
+ vp9_filter_block1d8_v8_ssse3(fdata2, 16, dst_ptr, dst_stride, 8,
+ vfilter_aligned16);
+ } else {
+ if (hfilter_aligned16[3] != 128) {
+ vp9_filter_block1d8_h8_ssse3(src_ptr, src_stride, dst_ptr, dst_stride, 8,
+ hfilter_aligned16);
+ } else {
+ vp9_filter_block1d8_v8_ssse3(src_ptr - (3 * src_stride), src_stride,
+ dst_ptr, dst_stride, 8, vfilter_aligned16);
+ }
+ }
+}
+
+void vp9_filter_block2d_8x4_8_ssse3(const unsigned char *src_ptr,
+ const unsigned int src_stride,
+ const short *hfilter_aligned16,
+ const short *vfilter_aligned16,
+ unsigned char *dst_ptr,
+ unsigned int dst_stride) {
+  if (hfilter_aligned16[3] != 128 && vfilter_aligned16[3] != 128) {
+ DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 23 * 16);
+
+ vp9_filter_block1d8_h8_ssse3(src_ptr - (3 * src_stride), src_stride,
+ fdata2, 16, 11, hfilter_aligned16);
+ vp9_filter_block1d8_v8_ssse3(fdata2, 16, dst_ptr, dst_stride, 4,
+ vfilter_aligned16);
+ } else {
+ if (hfilter_aligned16[3] != 128) {
+ vp9_filter_block1d8_h8_ssse3(src_ptr, src_stride, dst_ptr, dst_stride, 4,
+ hfilter_aligned16);
+ } else {
+ vp9_filter_block1d8_v8_ssse3(src_ptr - (3 * src_stride), src_stride,
+ dst_ptr, dst_stride, 4, vfilter_aligned16);
+ }
+ }
+}
+#endif
--- /dev/null
+++ b/vp9/common/x86/x86_systemdependent.c
@@ -1,0 +1,108 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "vpx_config.h"
+#include "vpx_ports/x86.h"
+#include "vp9/common/subpixel.h"
+#include "vp9/common/loopfilter.h"
+#include "vp9/common/idct.h"
+#include "vp9/common/pragmas.h"
+#include "vp9/common/onyxc_int.h"
+
+void vp9_arch_x86_common_init(VP9_COMMON *ctx) {
+#if CONFIG_RUNTIME_CPU_DETECT
+ VP9_COMMON_RTCD *rtcd = &ctx->rtcd;
+ int flags = x86_simd_caps();
+
+ /* Note:
+ *
+ * This platform can be built without runtime CPU detection as well. If
+ * you modify any of the function mappings present in this file, be sure
+   * to also update them in the static mappings (<arch>/filename_<arch>.h)
+ */
+
+ /* Override default functions with fastest ones for this CPU. */
+#if HAVE_MMX
+// The commented-out functions below need to be rewritten for vpx.
+ if (flags & HAS_MMX) {
+ rtcd->idct.idct1 = vp9_short_idct4x4llm_1_mmx;
+ rtcd->idct.idct16 = vp9_short_idct4x4llm_mmx;
+ rtcd->idct.idct1_scalar_add = vp9_dc_only_idct_add_mmx;
+ // rtcd->idct.iwalsh16 = vp9_short_inv_walsh4x4_mmx;
+ // rtcd->idct.iwalsh1 = vp9_short_inv_walsh4x4_1_mmx;
+
+ /* Disabled due to unsupported enhanced interpolation/high_prec mv
+ rtcd->subpix.sixtap16x16 = vp9_sixtap_predict16x16_mmx;
+ rtcd->subpix.sixtap8x8 = vp9_sixtap_predict8x8_mmx;
+ rtcd->subpix.sixtap8x4 = vp9_sixtap_predict8x4_mmx;
+ rtcd->subpix.sixtap4x4 = vp9_sixtap_predict4x4_mmx;
+ */
+ rtcd->subpix.bilinear16x16 = vp9_bilinear_predict16x16_mmx;
+ rtcd->subpix.bilinear8x8 = vp9_bilinear_predict8x8_mmx;
+ rtcd->subpix.bilinear8x4 = vp9_bilinear_predict8x4_mmx;
+ rtcd->subpix.bilinear4x4 = vp9_bilinear_predict4x4_mmx;
+
+#if CONFIG_POSTPROC
+ rtcd->postproc.down = vp9_mbpost_proc_down_mmx;
+ /*rtcd->postproc.across = vp9_mbpost_proc_across_ip_c;*/
+ rtcd->postproc.downacross = vp9_post_proc_down_and_across_mmx;
+ rtcd->postproc.addnoise = vp9_plane_add_noise_mmx;
+#endif
+ }
+
+#endif
+#if HAVE_SSE2
+
+ if (flags & HAS_SSE2) {
+
+ // rtcd->idct.iwalsh16 = vp9_short_inv_walsh4x4_sse2;
+
+ /* Disabled due to unsupported enhanced interpolation/high_prec mv
+ rtcd->subpix.sixtap16x16 = vp9_sixtap_predict16x16_sse2;
+ rtcd->subpix.sixtap8x8 = vp9_sixtap_predict8x8_sse2;
+ rtcd->subpix.sixtap8x4 = vp9_sixtap_predict8x4_sse2;
+ */
+ rtcd->subpix.bilinear16x16 = vp9_bilinear_predict16x16_sse2;
+ rtcd->subpix.bilinear8x8 = vp9_bilinear_predict8x8_sse2;
+
+#if CONFIG_POSTPROC
+ rtcd->postproc.down = vp9_mbpost_proc_down_xmm;
+ rtcd->postproc.across = vp9_mbpost_proc_across_ip_xmm;
+ rtcd->postproc.downacross = vp9_post_proc_down_and_across_xmm;
+ rtcd->postproc.addnoise = vp9_plane_add_noise_wmt;
+#endif
+ }
+
+#endif
+
+#if HAVE_SSSE3
+
+ if (flags & HAS_SSSE3) {
+ /* Disabled due to unsupported enhanced interpolation/high_prec mv
+ rtcd->subpix.sixtap16x16 = vp9_sixtap_predict16x16_ssse3;
+ rtcd->subpix.sixtap8x8 = vp9_sixtap_predict8x8_ssse3;
+ rtcd->subpix.sixtap8x4 = vp9_sixtap_predict8x4_ssse3;
+ rtcd->subpix.sixtap4x4 = vp9_sixtap_predict4x4_ssse3;
+ rtcd->subpix.bilinear16x16 = vp9_bilinear_predict16x16_ssse3;
+ rtcd->subpix.bilinear8x8 = vp9_bilinear_predict8x8_ssse3;
+ */
+
+    /* these are disabled because of unsupported diagonal pred modes
+ rtcd->recon.build_intra_predictors_mbuv =
+ vp9_build_intra_predictors_mbuv_ssse3;
+ rtcd->recon.build_intra_predictors_mbuv_s =
+ vp9_build_intra_predictors_mbuv_s_ssse3;
+ */
+ }
+#endif
+
+#endif
+}
--- /dev/null
+++ b/vp9/decoder/arm/armv6/dequant_dc_idct_v6.asm
@@ -1,0 +1,218 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license and patent
+; grant that can be found in the LICENSE file in the root of the source
+; tree. All contributing project authors may be found in the AUTHORS
+; file in the root of the source tree.
+;
+
+
+ EXPORT |vp8_dequant_dc_idct_add_v6|
+
+ AREA |.text|, CODE, READONLY
+
+;void vp8_dequant_dc_idct_add_v6(short *input, short *dq, unsigned char *pred,
+; unsigned char *dest, int pitch, int stride, int Dc)
+; r0 = input
+; r1 = dq
+; r2 = pred
+; r3 = dest
+; sp + 36 = pitch ; +4 = 40
+; sp + 40 = stride ; +4 = 44
+; sp + 44 = Dc ; +4 = 48
+
+
+|vp8_dequant_dc_idct_add_v6| PROC
+ stmdb sp!, {r4-r11, lr}
+
+ ldr r6, [sp, #44]
+
+ ldr r4, [r0] ;input
+ ldr r5, [r1], #4 ;dq
+
+ sub sp, sp, #4
+ str r3, [sp]
+
+ smultt r7, r4, r5
+
+ ldr r4, [r0, #4] ;input
+ ldr r5, [r1], #4 ;dq
+
+ strh r6, [r0], #2
+ strh r7, [r0], #2
+
+ smulbb r6, r4, r5
+ smultt r7, r4, r5
+
+ ldr r4, [r0, #4] ;input
+ ldr r5, [r1], #4 ;dq
+
+ strh r6, [r0], #2
+ strh r7, [r0], #2
+
+ mov r12, #3
+
+vp8_dequant_dc_add_loop
+ smulbb r6, r4, r5
+ smultt r7, r4, r5
+
+ ldr r4, [r0, #4] ;input
+ ldr r5, [r1], #4 ;dq
+
+ strh r6, [r0], #2
+ strh r7, [r0], #2
+
+ smulbb r6, r4, r5
+ smultt r7, r4, r5
+
+ subs r12, r12, #1
+
+ ldrne r4, [r0, #4]
+ ldrne r5, [r1], #4
+
+ strh r6, [r0], #2
+ strh r7, [r0], #2
+
+ bne vp8_dequant_dc_add_loop
+
+ sub r0, r0, #32
+ mov r1, r0
+
+; short_idct4x4llm_v6_dual
+ ldr r3, cospi8sqrt2minus1
+ ldr r4, sinpi8sqrt2
+ ldr r6, [r0, #8]
+ mov r5, #2
+vp8_dequant_dc_idct_loop1_v6
+ ldr r12, [r0, #24]
+ ldr r14, [r0, #16]
+ smulwt r9, r3, r6
+ smulwb r7, r3, r6
+ smulwt r10, r4, r6
+ smulwb r8, r4, r6
+ pkhbt r7, r7, r9, lsl #16
+ smulwt r11, r3, r12
+ pkhbt r8, r8, r10, lsl #16
+ uadd16 r6, r6, r7
+ smulwt r7, r4, r12
+ smulwb r9, r3, r12
+ smulwb r10, r4, r12
+ subs r5, r5, #1
+ pkhbt r9, r9, r11, lsl #16
+ ldr r11, [r0], #4
+ pkhbt r10, r10, r7, lsl #16
+ uadd16 r7, r12, r9
+ usub16 r7, r8, r7
+ uadd16 r6, r6, r10
+ uadd16 r10, r11, r14
+ usub16 r8, r11, r14
+ uadd16 r9, r10, r6
+ usub16 r10, r10, r6
+ uadd16 r6, r8, r7
+ usub16 r7, r8, r7
+ str r6, [r1, #8]
+ ldrne r6, [r0, #8]
+ str r7, [r1, #16]
+ str r10, [r1, #24]
+ str r9, [r1], #4
+ bne vp8_dequant_dc_idct_loop1_v6
+
+ mov r5, #2
+ sub r0, r1, #8
+vp8_dequant_dc_idct_loop2_v6
+ ldr r6, [r0], #4
+ ldr r7, [r0], #4
+ ldr r8, [r0], #4
+ ldr r9, [r0], #4
+ smulwt r1, r3, r6
+ smulwt r12, r4, r6
+ smulwt lr, r3, r8
+ smulwt r10, r4, r8
+ pkhbt r11, r8, r6, lsl #16
+ pkhbt r1, lr, r1, lsl #16
+ pkhbt r12, r10, r12, lsl #16
+ pkhtb r6, r6, r8, asr #16
+ uadd16 r6, r1, r6
+ pkhbt lr, r9, r7, lsl #16
+ uadd16 r10, r11, lr
+ usub16 lr, r11, lr
+ pkhtb r8, r7, r9, asr #16
+ subs r5, r5, #1
+ smulwt r1, r3, r8
+ smulwb r7, r3, r8
+ smulwt r11, r4, r8
+ smulwb r9, r4, r8
+ pkhbt r1, r7, r1, lsl #16
+ uadd16 r8, r1, r8
+ pkhbt r11, r9, r11, lsl #16
+ usub16 r1, r12, r8
+ uadd16 r8, r11, r6
+ ldr r9, c0x00040004
+ ldr r12, [sp, #40]
+ uadd16 r6, r10, r8
+ usub16 r7, r10, r8
+ uadd16 r7, r7, r9
+ uadd16 r6, r6, r9
+ uadd16 r10, r14, r1
+ usub16 r1, r14, r1
+ uadd16 r10, r10, r9
+ uadd16 r1, r1, r9
+ ldr r11, [r2], r12
+ mov r8, r7, asr #3
+ pkhtb r9, r8, r10, asr #19
+ mov r8, r1, asr #3
+ pkhtb r8, r8, r6, asr #19
+ uxtb16 lr, r11, ror #8
+ qadd16 r9, r9, lr
+ uxtb16 lr, r11
+ qadd16 r8, r8, lr
+ usat16 r9, #8, r9
+ usat16 r8, #8, r8
+ orr r9, r8, r9, lsl #8
+ ldr r11, [r2], r12
+ ldr lr, [sp]
+ ldr r12, [sp, #44]
+ mov r7, r7, lsl #16
+ mov r1, r1, lsl #16
+ mov r10, r10, lsl #16
+ mov r6, r6, lsl #16
+ mov r7, r7, asr #3
+ pkhtb r7, r7, r10, asr #19
+ mov r1, r1, asr #3
+ pkhtb r1, r1, r6, asr #19
+ uxtb16 r8, r11, ror #8
+ qadd16 r7, r7, r8
+ uxtb16 r8, r11
+ qadd16 r1, r1, r8
+ usat16 r7, #8, r7
+ usat16 r1, #8, r1
+ orr r1, r1, r7, lsl #8
+ str r9, [lr], r12
+ str r1, [lr], r12
+ str lr, [sp]
+ bne vp8_dequant_dc_idct_loop2_v6
+
+; vpx_memset
+ sub r0, r0, #32
+ add sp, sp, #4
+
+ mov r12, #0
+ str r12, [r0]
+ str r12, [r0, #4]
+ str r12, [r0, #8]
+ str r12, [r0, #12]
+ str r12, [r0, #16]
+ str r12, [r0, #20]
+ str r12, [r0, #24]
+ str r12, [r0, #28]
+
+ ldmia sp!, {r4 - r11, pc}
+ ENDP ; |vp8_dequant_dc_idct_add_v6|
+
+; Constant Pool
+cospi8sqrt2minus1 DCD 0x00004E7B
+sinpi8sqrt2 DCD 0x00008A8C
+c0x00040004 DCD 0x00040004
+
+ END
--- /dev/null
+++ b/vp9/decoder/arm/armv6/dequant_idct_v6.asm
@@ -1,0 +1,196 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license and patent
+; grant that can be found in the LICENSE file in the root of the source
+; tree. All contributing project authors may be found in the AUTHORS
+; file in the root of the source tree.
+;
+
+ EXPORT |vp8_dequant_idct_add_v6|
+
+ AREA |.text|, CODE, READONLY
+;void vp8_dequant_idct_add_v6(short *input, short *dq, unsigned char *pred,
+; unsigned char *dest, int pitch, int stride)
+; r0 = input
+; r1 = dq
+; r2 = pred
+; r3 = dest
+; sp + 36 = pitch ; +4 = 40
+; sp + 40 = stride ; +4 = 44
+
+
+|vp8_dequant_idct_add_v6| PROC
+ stmdb sp!, {r4-r11, lr}
+
+ ldr r4, [r0] ;input
+ ldr r5, [r1], #4 ;dq
+
+ sub sp, sp, #4
+ str r3, [sp]
+
+ mov r12, #4
+
+vp8_dequant_add_loop
+ smulbb r6, r4, r5
+ smultt r7, r4, r5
+
+ ldr r4, [r0, #4] ;input
+ ldr r5, [r1], #4 ;dq
+
+ strh r6, [r0], #2
+ strh r7, [r0], #2
+
+ smulbb r6, r4, r5
+ smultt r7, r4, r5
+
+ subs r12, r12, #1
+
+ ldrne r4, [r0, #4]
+ ldrne r5, [r1], #4
+
+ strh r6, [r0], #2
+ strh r7, [r0], #2
+
+ bne vp8_dequant_add_loop
+
+ sub r0, r0, #32
+ mov r1, r0
+
+; short_idct4x4llm_v6_dual
+ ldr r3, cospi8sqrt2minus1
+ ldr r4, sinpi8sqrt2
+ ldr r6, [r0, #8]
+ mov r5, #2
+vp8_dequant_idct_loop1_v6
+ ldr r12, [r0, #24]
+ ldr r14, [r0, #16]
+ smulwt r9, r3, r6
+ smulwb r7, r3, r6
+ smulwt r10, r4, r6
+ smulwb r8, r4, r6
+ pkhbt r7, r7, r9, lsl #16
+ smulwt r11, r3, r12
+ pkhbt r8, r8, r10, lsl #16
+ uadd16 r6, r6, r7
+ smulwt r7, r4, r12
+ smulwb r9, r3, r12
+ smulwb r10, r4, r12
+ subs r5, r5, #1
+ pkhbt r9, r9, r11, lsl #16
+ ldr r11, [r0], #4
+ pkhbt r10, r10, r7, lsl #16
+ uadd16 r7, r12, r9
+ usub16 r7, r8, r7
+ uadd16 r6, r6, r10
+ uadd16 r10, r11, r14
+ usub16 r8, r11, r14
+ uadd16 r9, r10, r6
+ usub16 r10, r10, r6
+ uadd16 r6, r8, r7
+ usub16 r7, r8, r7
+ str r6, [r1, #8]
+ ldrne r6, [r0, #8]
+ str r7, [r1, #16]
+ str r10, [r1, #24]
+ str r9, [r1], #4
+ bne vp8_dequant_idct_loop1_v6
+
+ mov r5, #2
+ sub r0, r1, #8
+vp8_dequant_idct_loop2_v6
+ ldr r6, [r0], #4
+ ldr r7, [r0], #4
+ ldr r8, [r0], #4
+ ldr r9, [r0], #4
+ smulwt r1, r3, r6
+ smulwt r12, r4, r6
+ smulwt lr, r3, r8
+ smulwt r10, r4, r8
+ pkhbt r11, r8, r6, lsl #16
+ pkhbt r1, lr, r1, lsl #16
+ pkhbt r12, r10, r12, lsl #16
+ pkhtb r6, r6, r8, asr #16
+ uadd16 r6, r1, r6
+ pkhbt lr, r9, r7, lsl #16
+ uadd16 r10, r11, lr
+ usub16 lr, r11, lr
+ pkhtb r8, r7, r9, asr #16
+ subs r5, r5, #1
+ smulwt r1, r3, r8
+ smulwb r7, r3, r8
+ smulwt r11, r4, r8
+ smulwb r9, r4, r8
+ pkhbt r1, r7, r1, lsl #16
+ uadd16 r8, r1, r8
+ pkhbt r11, r9, r11, lsl #16
+ usub16 r1, r12, r8
+ uadd16 r8, r11, r6
+ ldr r9, c0x00040004
+ ldr r12, [sp, #40]
+ uadd16 r6, r10, r8
+ usub16 r7, r10, r8
+ uadd16 r7, r7, r9
+ uadd16 r6, r6, r9
+ uadd16 r10, r14, r1
+ usub16 r1, r14, r1
+ uadd16 r10, r10, r9
+ uadd16 r1, r1, r9
+ ldr r11, [r2], r12
+ mov r8, r7, asr #3
+ pkhtb r9, r8, r10, asr #19
+ mov r8, r1, asr #3
+ pkhtb r8, r8, r6, asr #19
+ uxtb16 lr, r11, ror #8
+ qadd16 r9, r9, lr
+ uxtb16 lr, r11
+ qadd16 r8, r8, lr
+ usat16 r9, #8, r9
+ usat16 r8, #8, r8
+ orr r9, r8, r9, lsl #8
+ ldr r11, [r2], r12
+ ldr lr, [sp]
+ ldr r12, [sp, #44]
+ mov r7, r7, lsl #16
+ mov r1, r1, lsl #16
+ mov r10, r10, lsl #16
+ mov r6, r6, lsl #16
+ mov r7, r7, asr #3
+ pkhtb r7, r7, r10, asr #19
+ mov r1, r1, asr #3
+ pkhtb r1, r1, r6, asr #19
+ uxtb16 r8, r11, ror #8
+ qadd16 r7, r7, r8
+ uxtb16 r8, r11
+ qadd16 r1, r1, r8
+ usat16 r7, #8, r7
+ usat16 r1, #8, r1
+ orr r1, r1, r7, lsl #8
+ str r9, [lr], r12
+ str r1, [lr], r12
+ str lr, [sp]
+ bne vp8_dequant_idct_loop2_v6
+
+; vpx_memset
+ sub r0, r0, #32
+ add sp, sp, #4
+
+ mov r12, #0
+ str r12, [r0]
+ str r12, [r0, #4]
+ str r12, [r0, #8]
+ str r12, [r0, #12]
+ str r12, [r0, #16]
+ str r12, [r0, #20]
+ str r12, [r0, #24]
+ str r12, [r0, #28]
+
+ ldmia sp!, {r4 - r11, pc}
+ ENDP ; |vp8_dequant_idct_add_v6|
+
+; Constant Pool
+cospi8sqrt2minus1 DCD 0x00004E7B
+sinpi8sqrt2 DCD 0x00008A8C
+c0x00040004 DCD 0x00040004
+
+ END
--- /dev/null
+++ b/vp9/decoder/arm/armv6/dequantize_v6.asm
@@ -1,0 +1,69 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+ EXPORT |vp8_dequantize_b_loop_v6|
+
+ AREA |.text|, CODE, READONLY ; name this block of code
+;-------------------------------
+;void vp8_dequantize_b_loop_v6(short *Q, short *DQC, short *DQ);
+; r0 short *Q,
+; r1 short *DQC
+; r2 short *DQ
+|vp8_dequantize_b_loop_v6| PROC
+ stmdb sp!, {r4-r9, lr}
+
+ ldr r3, [r0] ;load Q
+ ldr r4, [r1] ;load DQC
+ ldr r5, [r0, #4]
+ ldr r6, [r1, #4]
+
+ mov r12, #2 ;loop counter
+
+dequant_loop
+ smulbb r7, r3, r4 ;multiply
+ smultt r8, r3, r4
+ smulbb r9, r5, r6
+ smultt lr, r5, r6
+
+ ldr r3, [r0, #8]
+ ldr r4, [r1, #8]
+ ldr r5, [r0, #12]
+ ldr r6, [r1, #12]
+
+ strh r7, [r2], #2 ;store result
+ smulbb r7, r3, r4 ;multiply
+ strh r8, [r2], #2
+ smultt r8, r3, r4
+ strh r9, [r2], #2
+ smulbb r9, r5, r6
+ strh lr, [r2], #2
+ smultt lr, r5, r6
+
+ subs r12, r12, #1
+
+ add r0, r0, #16
+ add r1, r1, #16
+
+ ldrne r3, [r0]
+ strh r7, [r2], #2 ;store result
+ ldrne r4, [r1]
+ strh r8, [r2], #2
+ ldrne r5, [r0, #4]
+ strh r9, [r2], #2
+ ldrne r6, [r1, #4]
+ strh lr, [r2], #2
+
+ bne dequant_loop
+
+ ldmia sp!, {r4-r9, pc}
+ ENDP ;|vp8_dequantize_b_loop_v6|
+
+ END
--- /dev/null
+++ b/vp9/decoder/arm/armv6/idct_blk_v6.c
@@ -1,0 +1,136 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "vpx_ports/config.h"
+#include "vp9/common/idct.h"
+#include "vp9/decoder/dequantize.h"
+
+void vp8_dequant_dc_idct_add_y_block_v6
+(short *q, short *dq, unsigned char *pre,
+ unsigned char *dst, int stride, char *eobs, short *dc) {
+ int i;
+
+ for (i = 0; i < 4; i++) {
+ if (eobs[0] > 1)
+ vp8_dequant_dc_idct_add_v6(q, dq, pre, dst, 16, stride, dc[0]);
+ else
+ vp8_dc_only_idct_add_v6(dc[0], pre, dst, 16, stride);
+
+ if (eobs[1] > 1)
+ vp8_dequant_dc_idct_add_v6(q + 16, dq, pre + 4, dst + 4, 16, stride, dc[1]);
+ else
+ vp8_dc_only_idct_add_v6(dc[1], pre + 4, dst + 4, 16, stride);
+
+ if (eobs[2] > 1)
+ vp8_dequant_dc_idct_add_v6(q + 32, dq, pre + 8, dst + 8, 16, stride, dc[2]);
+ else
+ vp8_dc_only_idct_add_v6(dc[2], pre + 8, dst + 8, 16, stride);
+
+ if (eobs[3] > 1)
+ vp8_dequant_dc_idct_add_v6(q + 48, dq, pre + 12, dst + 12, 16, stride, dc[3]);
+ else
+ vp8_dc_only_idct_add_v6(dc[3], pre + 12, dst + 12, 16, stride);
+
+ q += 64;
+ dc += 4;
+ pre += 64;
+ dst += 4 * stride;
+ eobs += 4;
+ }
+}
+
+void vp8_dequant_idct_add_y_block_v6
+(short *q, short *dq, unsigned char *pre,
+ unsigned char *dst, int stride, char *eobs) {
+ int i;
+
+ for (i = 0; i < 4; i++) {
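+    /* eob <= 1 means only the DC coefficient is present: apply the
+     * DC-only IDCT and clear the stored coefficient. */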
+ if (eobs[0] > 1)
+ vp8_dequant_idct_add_v6(q, dq, pre, dst, 16, stride);
+ else {
+ vp8_dc_only_idct_add_v6(q[0]*dq[0], pre, dst, 16, stride);
+ ((int *)q)[0] = 0;
+ }
+
+ if (eobs[1] > 1)
+ vp8_dequant_idct_add_v6(q + 16, dq, pre + 4, dst + 4, 16, stride);
+ else {
+ vp8_dc_only_idct_add_v6(q[16]*dq[0], pre + 4, dst + 4, 16, stride);
+ ((int *)(q + 16))[0] = 0;
+ }
+
+ if (eobs[2] > 1)
+ vp8_dequant_idct_add_v6(q + 32, dq, pre + 8, dst + 8, 16, stride);
+ else {
+ vp8_dc_only_idct_add_v6(q[32]*dq[0], pre + 8, dst + 8, 16, stride);
+ ((int *)(q + 32))[0] = 0;
+ }
+
+ if (eobs[3] > 1)
+ vp8_dequant_idct_add_v6(q + 48, dq, pre + 12, dst + 12, 16, stride);
+ else {
+ vp8_dc_only_idct_add_v6(q[48]*dq[0], pre + 12, dst + 12, 16, stride);
+ ((int *)(q + 48))[0] = 0;
+ }
+
+ q += 64;
+ pre += 64;
+ dst += 4 * stride;
+ eobs += 4;
+ }
+}
+
+void vp8_dequant_idct_add_uv_block_v6
+(short *q, short *dq, unsigned char *pre,
+ unsigned char *dstu, unsigned char *dstv, int stride, char *eobs) {
+ int i;
+
+ for (i = 0; i < 2; i++) {
+ if (eobs[0] > 1)
+ vp8_dequant_idct_add_v6(q, dq, pre, dstu, 8, stride);
+ else {
+ vp8_dc_only_idct_add_v6(q[0]*dq[0], pre, dstu, 8, stride);
+ ((int *)q)[0] = 0;
+ }
+
+ if (eobs[1] > 1)
+ vp8_dequant_idct_add_v6(q + 16, dq, pre + 4, dstu + 4, 8, stride);
+ else {
+ vp8_dc_only_idct_add_v6(q[16]*dq[0], pre + 4, dstu + 4, 8, stride);
+ ((int *)(q + 16))[0] = 0;
+ }
+
+ q += 32;
+ pre += 32;
+ dstu += 4 * stride;
+ eobs += 2;
+ }
+
+ for (i = 0; i < 2; i++) {
+ if (eobs[0] > 1)
+ vp8_dequant_idct_add_v6(q, dq, pre, dstv, 8, stride);
+ else {
+ vp8_dc_only_idct_add_v6(q[0]*dq[0], pre, dstv, 8, stride);
+ ((int *)q)[0] = 0;
+ }
+
+ if (eobs[1] > 1)
+ vp8_dequant_idct_add_v6(q + 16, dq, pre + 4, dstv + 4, 8, stride);
+ else {
+ vp8_dc_only_idct_add_v6(q[16]*dq[0], pre + 4, dstv + 4, 8, stride);
+ ((int *)(q + 16))[0] = 0;
+ }
+
+ q += 32;
+ pre += 32;
+ dstv += 4 * stride;
+ eobs += 2;
+ }
+}
--- /dev/null
+++ b/vp9/decoder/arm/dequantize_arm.c
@@ -1,0 +1,44 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "vpx_ports/config.h"
+#include "vp9/decoder/dequantize.h"
+#include "vp9/common/idct.h"
+#include "vpx_mem/vpx_mem.h"
+
+#if HAVE_ARMV7
+extern void vp9_dequantize_b_loop_neon(short *Q, short *DQC, short *DQ);
+#endif
+
+#if HAVE_ARMV6
+extern void vp9_dequantize_b_loop_v6(short *Q, short *DQC, short *DQ);
+#endif
+
+#if HAVE_ARMV7
+
+void vp9_dequantize_b_neon(BLOCKD *d) {
+ short *DQ = d->dqcoeff;
+ short *Q = d->qcoeff;
+ short *DQC = d->dequant;
+
+ vp9_dequantize_b_loop_neon(Q, DQC, DQ);
+}
+#endif
+
+#if HAVE_ARMV6
+void vp9_dequantize_b_v6(BLOCKD *d) {
+ short *DQ = d->dqcoeff;
+ short *Q = d->qcoeff;
+ short *DQC = d->dequant;
+
+ vp9_dequantize_b_loop_v6(Q, DQC, DQ);
+}
+#endif
--- /dev/null
+++ b/vp9/decoder/arm/neon/dequant_idct_neon.asm
@@ -1,0 +1,129 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+ EXPORT |vp8_dequant_idct_add_neon|
+ ARM
+ REQUIRE8
+ PRESERVE8
+
+ AREA ||.text||, CODE, READONLY, ALIGN=2
+;void vp8_dequant_idct_add_neon(short *input, short *dq, unsigned char *pred,
+; unsigned char *dest, int pitch, int stride)
+; r0 short *input,
+; r1 short *dq,
+; r2 unsigned char *pred
+; r3 unsigned char *dest
+; sp int pitch
+; sp+4 int stride
+
+|vp8_dequant_idct_add_neon| PROC
+ vld1.16 {q3, q4}, [r0]
+ vld1.16 {q5, q6}, [r1]
+ ldr r1, [sp] ; pitch
+ vld1.32 {d14[0]}, [r2], r1
+ vld1.32 {d14[1]}, [r2], r1
+ vld1.32 {d15[0]}, [r2], r1
+ vld1.32 {d15[1]}, [r2]
+
+ ldr r1, [sp, #4] ; stride
+
+ adr r12, cospi8sqrt2minus1 ; pointer to the first constant
+
+ vmul.i16 q1, q3, q5 ;input for short_idct4x4llm_neon
+ vmul.i16 q2, q4, q6
+
+;|short_idct4x4llm_neon| PROC
+ vld1.16 {d0}, [r12]
+ vswp d3, d4 ;q2(vp[4] vp[12])
+
+ vqdmulh.s16 q3, q2, d0[2]
+ vqdmulh.s16 q4, q2, d0[0]
+
+ vqadd.s16 d12, d2, d3 ;a1
+ vqsub.s16 d13, d2, d3 ;b1
+
+ vshr.s16 q3, q3, #1
+ vshr.s16 q4, q4, #1
+
+ vqadd.s16 q3, q3, q2
+ vqadd.s16 q4, q4, q2
+
+ vqsub.s16 d10, d6, d9 ;c1
+ vqadd.s16 d11, d7, d8 ;d1
+
+ vqadd.s16 d2, d12, d11
+ vqadd.s16 d3, d13, d10
+ vqsub.s16 d4, d13, d10
+ vqsub.s16 d5, d12, d11
+
+ vtrn.32 d2, d4
+ vtrn.32 d3, d5
+ vtrn.16 d2, d3
+ vtrn.16 d4, d5
+
+; memset(input, 0, 32) -- 32bytes
+ vmov.i16 q14, #0
+
+ vswp d3, d4
+ vqdmulh.s16 q3, q2, d0[2]
+ vqdmulh.s16 q4, q2, d0[0]
+
+ vqadd.s16 d12, d2, d3 ;a1
+ vqsub.s16 d13, d2, d3 ;b1
+
+ vmov q15, q14
+
+ vshr.s16 q3, q3, #1
+ vshr.s16 q4, q4, #1
+
+ vqadd.s16 q3, q3, q2
+ vqadd.s16 q4, q4, q2
+
+ vqsub.s16 d10, d6, d9 ;c1
+ vqadd.s16 d11, d7, d8 ;d1
+
+ vqadd.s16 d2, d12, d11
+ vqadd.s16 d3, d13, d10
+ vqsub.s16 d4, d13, d10
+ vqsub.s16 d5, d12, d11
+
+ vst1.16 {q14, q15}, [r0]
+
+ vrshr.s16 d2, d2, #3
+ vrshr.s16 d3, d3, #3
+ vrshr.s16 d4, d4, #3
+ vrshr.s16 d5, d5, #3
+
+ vtrn.32 d2, d4
+ vtrn.32 d3, d5
+ vtrn.16 d2, d3
+ vtrn.16 d4, d5
+
+ vaddw.u8 q1, q1, d14
+ vaddw.u8 q2, q2, d15
+
+ vqmovun.s16 d0, q1
+ vqmovun.s16 d1, q2
+
+ vst1.32 {d0[0]}, [r3], r1
+ vst1.32 {d0[1]}, [r3], r1
+ vst1.32 {d1[0]}, [r3], r1
+ vst1.32 {d1[1]}, [r3]
+
+ bx lr
+
+ ENDP ; |vp8_dequant_idct_add_neon|
+
+; Constant Pool
+cospi8sqrt2minus1 DCD 0x4e7b4e7b
+sinpi8sqrt2 DCD 0x8a8c8a8c
+
+ END
--- /dev/null
+++ b/vp9/decoder/arm/neon/dequantizeb_neon.asm
@@ -1,0 +1,34 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+ EXPORT |vp8_dequantize_b_loop_neon|
+ ARM
+ REQUIRE8
+ PRESERVE8
+
+ AREA ||.text||, CODE, READONLY, ALIGN=2
+; r0 short *Q,
+; r1 short *DQC
+; r2 short *DQ
+|vp8_dequantize_b_loop_neon| PROC
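+    ; DQ[i] = Q[i] * DQC[i] for all 16 coefficients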
+ vld1.16 {q0, q1}, [r0]
+ vld1.16 {q2, q3}, [r1]
+
+ vmul.i16 q4, q0, q2
+ vmul.i16 q5, q1, q3
+
+ vst1.16 {q4, q5}, [r2]
+
+ bx lr
+
+ ENDP
+
+ END
--- /dev/null
+++ b/vp9/decoder/arm/neon/idct_blk_neon.c
@@ -1,0 +1,110 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "vpx_ports/config.h"
+#include "vp9/common/idct.h"
+#include "vp9/decoder/dequantize.h"
+
+/* place these declarations here because we don't want to maintain them
+ * outside of this scope
+ */
+void idct_dequant_dc_full_2x_neon
+(short *input, short *dq, unsigned char *pre, unsigned char *dst,
+ int stride, short *dc);
+void idct_dequant_dc_0_2x_neon
+(short *dc, unsigned char *pre, unsigned char *dst, int stride);
+void idct_dequant_full_2x_neon
+(short *q, short *dq, unsigned char *pre, unsigned char *dst,
+ int pitch, int stride);
+void idct_dequant_0_2x_neon
+(short *q, short dq, unsigned char *pre, int pitch,
+ unsigned char *dst, int stride);
+
+void vp8_dequant_dc_idct_add_y_block_neon
+(short *q, short *dq, unsigned char *pre,
+ unsigned char *dst, int stride, char *eobs, short *dc) {
+ int i;
+
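+  /* eobs holds one byte per 4x4 block; reading two bytes as a short and
+   * masking with 0xfefe tests whether either block's eob is greater than 1. */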
+ for (i = 0; i < 4; i++) {
+ if (((short *)eobs)[0] & 0xfefe)
+ idct_dequant_dc_full_2x_neon(q, dq, pre, dst, stride, dc);
+ else
+ idct_dequant_dc_0_2x_neon(dc, pre, dst, stride);
+
+ if (((short *)eobs)[1] & 0xfefe)
+ idct_dequant_dc_full_2x_neon(q + 32, dq, pre + 8, dst + 8, stride, dc + 2);
+ else
+ idct_dequant_dc_0_2x_neon(dc + 2, pre + 8, dst + 8, stride);
+
+ q += 64;
+ dc += 4;
+ pre += 64;
+ dst += 4 * stride;
+ eobs += 4;
+ }
+}
+
+void vp8_dequant_idct_add_y_block_neon
+(short *q, short *dq, unsigned char *pre,
+ unsigned char *dst, int stride, char *eobs) {
+ int i;
+
+ for (i = 0; i < 4; i++) {
+ if (((short *)eobs)[0] & 0xfefe)
+ idct_dequant_full_2x_neon(q, dq, pre, dst, 16, stride);
+ else
+ idct_dequant_0_2x_neon(q, dq[0], pre, 16, dst, stride);
+
+ if (((short *)eobs)[1] & 0xfefe)
+ idct_dequant_full_2x_neon(q + 32, dq, pre + 8, dst + 8, 16, stride);
+ else
+ idct_dequant_0_2x_neon(q + 32, dq[0], pre + 8, 16, dst + 8, stride);
+
+ q += 64;
+ pre += 64;
+ dst += 4 * stride;
+ eobs += 4;
+ }
+}
+
+void vp8_dequant_idct_add_uv_block_neon
+(short *q, short *dq, unsigned char *pre,
+ unsigned char *dstu, unsigned char *dstv, int stride, char *eobs) {
+ if (((short *)eobs)[0] & 0xfefe)
+ idct_dequant_full_2x_neon(q, dq, pre, dstu, 8, stride);
+ else
+ idct_dequant_0_2x_neon(q, dq[0], pre, 8, dstu, stride);
+
+ q += 32;
+ pre += 32;
+ dstu += 4 * stride;
+
+ if (((short *)eobs)[1] & 0xfefe)
+ idct_dequant_full_2x_neon(q, dq, pre, dstu, 8, stride);
+ else
+ idct_dequant_0_2x_neon(q, dq[0], pre, 8, dstu, stride);
+
+ q += 32;
+ pre += 32;
+
+ if (((short *)eobs)[2] & 0xfefe)
+ idct_dequant_full_2x_neon(q, dq, pre, dstv, 8, stride);
+ else
+ idct_dequant_0_2x_neon(q, dq[0], pre, 8, dstv, stride);
+
+ q += 32;
+ pre += 32;
+ dstv += 4 * stride;
+
+ if (((short *)eobs)[3] & 0xfefe)
+ idct_dequant_full_2x_neon(q, dq, pre, dstv, 8, stride);
+ else
+ idct_dequant_0_2x_neon(q, dq[0], pre, 8, dstv, stride);
+}
--- /dev/null
+++ b/vp9/decoder/arm/neon/idct_dequant_0_2x_neon.asm
@@ -1,0 +1,79 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license and patent
+; grant that can be found in the LICENSE file in the root of the source
+; tree. All contributing project authors may be found in the AUTHORS
+; file in the root of the source tree.
+;
+
+
+ EXPORT |idct_dequant_0_2x_neon|
+ ARM
+ REQUIRE8
+ PRESERVE8
+
+ AREA ||.text||, CODE, READONLY, ALIGN=2
+;void idct_dequant_0_2x_neon(short *q, short dq, unsigned char *pre,
+; int pitch, unsigned char *dst, int stride);
+; r0 *q
+; r1 dq
+; r2 *pre
+; r3 pitch
+; sp *dst
+; sp+4 stride
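+;
+; Handles two horizontally adjacent 4x4 blocks whose only non-zero
+; coefficient is the DC term: the DC is dequantized (q * dq), rounded with
+; (x + 4) >> 3 as in the full IDCT, broadcast over the block and added to
+; the 4x4 predictor, and the DC coefficient in q is cleared to zero.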
+|idct_dequant_0_2x_neon| PROC
+ add r12, r2, #4
+ vld1.32 {d2[0]}, [r2], r3
+ vld1.32 {d2[1]}, [r2], r3
+ vld1.32 {d4[0]}, [r2], r3
+ vld1.32 {d4[1]}, [r2]
+ vld1.32 {d8[0]}, [r12], r3
+ vld1.32 {d8[1]}, [r12], r3
+ vld1.32 {d10[0]}, [r12], r3
+ vld1.32 {d10[1]}, [r12]
+
+ ldrh r12, [r0] ; lo q
+ ldrh r2, [r0, #32] ; hi q
+ mov r3, #0
+ strh r3, [r0]
+ strh r3, [r0, #32]
+
+ sxth r12, r12 ; lo
+ mul r0, r12, r1
+ add r0, r0, #4
+ asr r0, r0, #3
+ vdup.16 q0, r0
+ sxth r2, r2 ; hi
+ mul r0, r2, r1
+ add r0, r0, #4
+ asr r0, r0, #3
+ vdup.16 q3, r0
+
+ vaddw.u8 q1, q0, d2 ; lo
+ vaddw.u8 q2, q0, d4
+ vaddw.u8 q4, q3, d8 ; hi
+ vaddw.u8 q5, q3, d10
+
+ ldr r2, [sp] ; dst
+ ldr r3, [sp, #4] ; stride
+
+ vqmovun.s16 d2, q1 ; lo
+ vqmovun.s16 d4, q2
+ vqmovun.s16 d8, q4 ; hi
+ vqmovun.s16 d10, q5
+
+ add r0, r2, #4
+ vst1.32 {d2[0]}, [r2], r3 ; lo
+ vst1.32 {d2[1]}, [r2], r3
+ vst1.32 {d4[0]}, [r2], r3
+ vst1.32 {d4[1]}, [r2]
+ vst1.32 {d8[0]}, [r0], r3 ; hi
+ vst1.32 {d8[1]}, [r0], r3
+ vst1.32 {d10[0]}, [r0], r3
+ vst1.32 {d10[1]}, [r0]
+
+ bx lr
+
+ ENDP ; |idct_dequant_0_2x_neon|
+ END
--- /dev/null
+++ b/vp9/decoder/arm/neon/idct_dequant_dc_0_2x_neon.asm
@@ -1,0 +1,69 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license and patent
+; grant that can be found in the LICENSE file in the root of the source
+; tree. All contributing project authors may be found in the AUTHORS
+; file in the root of the source tree.
+;
+
+
+ EXPORT |idct_dequant_dc_0_2x_neon|
+ ARM
+ REQUIRE8
+ PRESERVE8
+
+ AREA ||.text||, CODE, READONLY, ALIGN=2
+;void idct_dequant_dc_0_2x_neon(short *dc, unsigned char *pre,
+; unsigned char *dst, int stride);
+; r0 *dc
+; r1 *pre
+; r2 *dst
+; r3 stride
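+;
+; Same idea as idct_dequant_0_2x_neon, but the two DC values are taken from
+; the separate *dc array rather than from the coefficient blocks (so no dq
+; multiply is needed here); they are rounded with (x + 4) >> 3, broadcast
+; and added to the two adjacent 4x4 predictors.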
+|idct_dequant_dc_0_2x_neon| PROC
+ ldr r0, [r0] ; *dc
+ mov r12, #16
+
+ vld1.32 {d2[0]}, [r1], r12 ; lo
+ vld1.32 {d2[1]}, [r1], r12
+ vld1.32 {d4[0]}, [r1], r12
+ vld1.32 {d4[1]}, [r1]
+ sub r1, r1, #44
+ vld1.32 {d8[0]}, [r1], r12 ; hi
+ vld1.32 {d8[1]}, [r1], r12
+ vld1.32 {d10[0]}, [r1], r12
+ vld1.32 {d10[1]}, [r1]
+
+ sxth r1, r0 ; lo *dc
+ add r1, r1, #4
+ asr r1, r1, #3
+ vdup.16 q0, r1
+ sxth r0, r0, ror #16 ; hi *dc
+ add r0, r0, #4
+ asr r0, r0, #3
+ vdup.16 q3, r0
+
+ vaddw.u8 q1, q0, d2 ; lo
+ vaddw.u8 q2, q0, d4
+ vaddw.u8 q4, q3, d8 ; hi
+ vaddw.u8 q5, q3, d10
+
+ vqmovun.s16 d2, q1 ; lo
+ vqmovun.s16 d4, q2
+ vqmovun.s16 d8, q4 ; hi
+ vqmovun.s16 d10, q5
+
+ add r0, r2, #4
+ vst1.32 {d2[0]}, [r2], r3 ; lo
+ vst1.32 {d2[1]}, [r2], r3
+ vst1.32 {d4[0]}, [r2], r3
+ vst1.32 {d4[1]}, [r2]
+ vst1.32 {d8[0]}, [r0], r3 ; hi
+ vst1.32 {d8[1]}, [r0], r3
+ vst1.32 {d10[0]}, [r0], r3
+ vst1.32 {d10[1]}, [r0]
+
+ bx lr
+
+ ENDP ;|idct_dequant_dc_0_2x_neon|
+ END
--- /dev/null
+++ b/vp9/decoder/arm/neon/idct_dequant_dc_full_2x_neon.asm
@@ -1,0 +1,205 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+ EXPORT |idct_dequant_dc_full_2x_neon|
+ ARM
+ REQUIRE8
+ PRESERVE8
+
+ AREA ||.text||, CODE, READONLY, ALIGN=2
+;void idct_dequant_dc_full_2x_neon(short *q, short *dq, unsigned char *pre,
+; unsigned char *dst, int stride, short *dc);
+; r0 *q,
+; r1 *dq,
+; r2 *pre
+; r3 *dst
+; sp stride
+; sp+4 *dc
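+;
+; Dequantizes and inverse transforms two horizontally adjacent 4x4 blocks at
+; once. The left and right blocks are interleaved across the q registers, a
+; 1-D IDCT is applied, the results are transposed, the 1-D IDCT is applied
+; again, and the rounded output is added to the interleaved predictors. The
+; DC terms of both blocks are overwritten with the values passed in *dc.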
+|idct_dequant_dc_full_2x_neon| PROC
+ vld1.16 {q0, q1}, [r1] ; dq (same l/r)
+ vld1.16 {q2, q3}, [r0] ; l q
+ mov r1, #16 ; pitch
+ add r0, r0, #32
+ vld1.16 {q4, q5}, [r0] ; r q
+ add r12, r2, #4
+ ; interleave the predictors
+ vld1.32 {d28[0]}, [r2], r1 ; l pre
+ vld1.32 {d28[1]}, [r12], r1 ; r pre
+ vld1.32 {d29[0]}, [r2], r1
+ vld1.32 {d29[1]}, [r12], r1
+ vld1.32 {d30[0]}, [r2], r1
+ vld1.32 {d30[1]}, [r12], r1
+ vld1.32 {d31[0]}, [r2]
+ ldr r1, [sp, #4]
+ vld1.32 {d31[1]}, [r12]
+
+ adr r2, cospi8sqrt2minus1 ; pointer to the first constant
+
+ ldrh r12, [r1], #2 ; lo *dc
+ ldrh r1, [r1] ; hi *dc
+
+ ; dequant: q[i] = q[i] * dq[i]
+ vmul.i16 q2, q2, q0
+ vmul.i16 q3, q3, q1
+ vmul.i16 q4, q4, q0
+ vmul.i16 q5, q5, q1
+
+ ; move dc up to neon and overwrite first element
+ vmov.16 d4[0], r12
+ vmov.16 d8[0], r1
+
+ vld1.16 {d0}, [r2]
+
+ ; q2: l0r0 q3: l8r8
+ ; q4: l4r4 q5: l12r12
+ vswp d5, d8
+ vswp d7, d10
+
+ ; _CONSTANTS_ * 4,12 >> 16
+ ; q6: 4 * sinpi : c1/temp1
+ ; q7: 12 * sinpi : d1/temp2
+ ; q8: 4 * cospi
+ ; q9: 12 * cospi
+ vqdmulh.s16 q6, q4, d0[2] ; sinpi8sqrt2
+ vqdmulh.s16 q7, q5, d0[2]
+ vqdmulh.s16 q8, q4, d0[0] ; cospi8sqrt2minus1
+ vqdmulh.s16 q9, q5, d0[0]
+
+ vqadd.s16 q10, q2, q3 ; a1 = 0 + 8
+ vqsub.s16 q11, q2, q3 ; b1 = 0 - 8
+
+ ; vqdmulh only accepts signed values. this was a problem because
+ ; our constant had the high bit set, and was treated as a negative value.
+ ; vqdmulh also doubles the value before it shifts by 16. we need to
+ ; compensate for this. in the case of sinpi8sqrt2, the lowest bit is 0,
+ ; so we can shift the constant without losing precision. this avoids
+; shifting again afterward, and also avoids the sign issue. win win!
+ ; for cospi8sqrt2minus1 the lowest bit is 1, so we lose precision if we
+ ; pre-shift it
+ vshr.s16 q8, q8, #1
+ vshr.s16 q9, q9, #1
+
+ ; q4: 4 + 4 * cospi : d1/temp1
+ ; q5: 12 + 12 * cospi : c1/temp2
+ vqadd.s16 q4, q4, q8
+ vqadd.s16 q5, q5, q9
+
+ ; c1 = temp1 - temp2
+ ; d1 = temp1 + temp2
+ vqsub.s16 q2, q6, q5
+ vqadd.s16 q3, q4, q7
+
+ ; [0]: a1+d1
+ ; [1]: b1+c1
+ ; [2]: b1-c1
+ ; [3]: a1-d1
+ vqadd.s16 q4, q10, q3
+ vqadd.s16 q5, q11, q2
+ vqsub.s16 q6, q11, q2
+ vqsub.s16 q7, q10, q3
+
+ ; rotate
+ vtrn.32 q4, q6
+ vtrn.32 q5, q7
+ vtrn.16 q4, q5
+ vtrn.16 q6, q7
+ ; idct loop 2
+ ; q4: l 0, 4, 8,12 r 0, 4, 8,12
+ ; q5: l 1, 5, 9,13 r 1, 5, 9,13
+ ; q6: l 2, 6,10,14 r 2, 6,10,14
+ ; q7: l 3, 7,11,15 r 3, 7,11,15
+
+ ; q8: 1 * sinpi : c1/temp1
+ ; q9: 3 * sinpi : d1/temp2
+ ; q10: 1 * cospi
+ ; q11: 3 * cospi
+ vqdmulh.s16 q8, q5, d0[2] ; sinpi8sqrt2
+ vqdmulh.s16 q9, q7, d0[2]
+ vqdmulh.s16 q10, q5, d0[0] ; cospi8sqrt2minus1
+ vqdmulh.s16 q11, q7, d0[0]
+
+ vqadd.s16 q2, q4, q6 ; a1 = 0 + 2
+ vqsub.s16 q3, q4, q6 ; b1 = 0 - 2
+
+ ; see note on shifting above
+ vshr.s16 q10, q10, #1
+ vshr.s16 q11, q11, #1
+
+ ; q10: 1 + 1 * cospi : d1/temp1
+ ; q11: 3 + 3 * cospi : c1/temp2
+ vqadd.s16 q10, q5, q10
+ vqadd.s16 q11, q7, q11
+
+ ; q8: c1 = temp1 - temp2
+ ; q9: d1 = temp1 + temp2
+ vqsub.s16 q8, q8, q11
+ vqadd.s16 q9, q10, q9
+
+ ; a1+d1
+ ; b1+c1
+ ; b1-c1
+ ; a1-d1
+ vqadd.s16 q4, q2, q9
+ vqadd.s16 q5, q3, q8
+ vqsub.s16 q6, q3, q8
+ vqsub.s16 q7, q2, q9
+
+ ; +4 >> 3 (rounding)
+ vrshr.s16 q4, q4, #3 ; lo
+ vrshr.s16 q5, q5, #3
+ vrshr.s16 q6, q6, #3 ; hi
+ vrshr.s16 q7, q7, #3
+
+ vtrn.32 q4, q6
+ vtrn.32 q5, q7
+ vtrn.16 q4, q5
+ vtrn.16 q6, q7
+
+ ; adding pre
+ ; input is still packed. pre was read interleaved
+ vaddw.u8 q4, q4, d28
+ vaddw.u8 q5, q5, d29
+ vaddw.u8 q6, q6, d30
+ vaddw.u8 q7, q7, d31
+
+ vmov.i16 q14, #0
+ vmov q15, q14
+ vst1.16 {q14, q15}, [r0] ; write over high input
+ sub r0, r0, #32
+ vst1.16 {q14, q15}, [r0] ; write over low input
+
+ ;saturate and narrow
+ vqmovun.s16 d0, q4 ; lo
+ vqmovun.s16 d1, q5
+ vqmovun.s16 d2, q6 ; hi
+ vqmovun.s16 d3, q7
+
+ ldr r1, [sp] ; stride
+ add r2, r3, #4 ; hi
+ vst1.32 {d0[0]}, [r3], r1 ; lo
+ vst1.32 {d0[1]}, [r2], r1 ; hi
+ vst1.32 {d1[0]}, [r3], r1
+ vst1.32 {d1[1]}, [r2], r1
+ vst1.32 {d2[0]}, [r3], r1
+ vst1.32 {d2[1]}, [r2], r1
+ vst1.32 {d3[0]}, [r3]
+ vst1.32 {d3[1]}, [r2]
+
+ bx lr
+
+ ENDP ; |idct_dequant_dc_full_2x_neon|
+
+; Constant Pool
+cospi8sqrt2minus1 DCD 0x4e7b
+; because the lowest bit in 0x8a8c is 0, we can pre-shift this
+sinpi8sqrt2 DCD 0x4546
+
+ END
--- /dev/null
+++ b/vp9/decoder/arm/neon/idct_dequant_full_2x_neon.asm
@@ -1,0 +1,197 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+ EXPORT |idct_dequant_full_2x_neon|
+ ARM
+ REQUIRE8
+ PRESERVE8
+
+ AREA ||.text||, CODE, READONLY, ALIGN=2
+;void idct_dequant_full_2x_neon(short *q, short *dq, unsigned char *pre,
+; unsigned char *dst, int pitch, int stride);
+; r0 *q,
+; r1 *dq,
+; r2 *pre
+; r3 *dst
+; sp pitch
+; sp+4 stride
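+;
+; Identical to idct_dequant_dc_full_2x_neon except that the DC terms come
+; from the dequantized coefficient blocks themselves rather than from a
+; separate *dc array.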
+|idct_dequant_full_2x_neon| PROC
+ vld1.16 {q0, q1}, [r1] ; dq (same l/r)
+ vld1.16 {q2, q3}, [r0] ; l q
+ ldr r1, [sp] ; pitch
+ add r0, r0, #32
+ vld1.16 {q4, q5}, [r0] ; r q
+ add r12, r2, #4
+ ; interleave the predictors
+ vld1.32 {d28[0]}, [r2], r1 ; l pre
+ vld1.32 {d28[1]}, [r12], r1 ; r pre
+ vld1.32 {d29[0]}, [r2], r1
+ vld1.32 {d29[1]}, [r12], r1
+ vld1.32 {d30[0]}, [r2], r1
+ vld1.32 {d30[1]}, [r12], r1
+ vld1.32 {d31[0]}, [r2]
+ vld1.32 {d31[1]}, [r12]
+
+ adr r2, cospi8sqrt2minus1 ; pointer to the first constant
+
+ ; dequant: q[i] = q[i] * dq[i]
+ vmul.i16 q2, q2, q0
+ vmul.i16 q3, q3, q1
+ vmul.i16 q4, q4, q0
+ vmul.i16 q5, q5, q1
+
+ vld1.16 {d0}, [r2]
+
+ ; q2: l0r0 q3: l8r8
+ ; q4: l4r4 q5: l12r12
+ vswp d5, d8
+ vswp d7, d10
+
+ ; _CONSTANTS_ * 4,12 >> 16
+ ; q6: 4 * sinpi : c1/temp1
+ ; q7: 12 * sinpi : d1/temp2
+ ; q8: 4 * cospi
+ ; q9: 12 * cospi
+ vqdmulh.s16 q6, q4, d0[2] ; sinpi8sqrt2
+ vqdmulh.s16 q7, q5, d0[2]
+ vqdmulh.s16 q8, q4, d0[0] ; cospi8sqrt2minus1
+ vqdmulh.s16 q9, q5, d0[0]
+
+ vqadd.s16 q10, q2, q3 ; a1 = 0 + 8
+ vqsub.s16 q11, q2, q3 ; b1 = 0 - 8
+
+ ; vqdmulh only accepts signed values. this was a problem because
+ ; our constant had the high bit set, and was treated as a negative value.
+ ; vqdmulh also doubles the value before it shifts by 16. we need to
+ ; compensate for this. in the case of sinpi8sqrt2, the lowest bit is 0,
+ ; so we can shift the constant without losing precision. this avoids
+; shifting again afterward, and also avoids the sign issue. win win!
+ ; for cospi8sqrt2minus1 the lowest bit is 1, so we lose precision if we
+ ; pre-shift it
+ vshr.s16 q8, q8, #1
+ vshr.s16 q9, q9, #1
+
+ ; q4: 4 + 4 * cospi : d1/temp1
+ ; q5: 12 + 12 * cospi : c1/temp2
+ vqadd.s16 q4, q4, q8
+ vqadd.s16 q5, q5, q9
+
+ ; c1 = temp1 - temp2
+ ; d1 = temp1 + temp2
+ vqsub.s16 q2, q6, q5
+ vqadd.s16 q3, q4, q7
+
+ ; [0]: a1+d1
+ ; [1]: b1+c1
+ ; [2]: b1-c1
+ ; [3]: a1-d1
+ vqadd.s16 q4, q10, q3
+ vqadd.s16 q5, q11, q2
+ vqsub.s16 q6, q11, q2
+ vqsub.s16 q7, q10, q3
+
+ ; rotate
+ vtrn.32 q4, q6
+ vtrn.32 q5, q7
+ vtrn.16 q4, q5
+ vtrn.16 q6, q7
+ ; idct loop 2
+ ; q4: l 0, 4, 8,12 r 0, 4, 8,12
+ ; q5: l 1, 5, 9,13 r 1, 5, 9,13
+ ; q6: l 2, 6,10,14 r 2, 6,10,14
+ ; q7: l 3, 7,11,15 r 3, 7,11,15
+
+ ; q8: 1 * sinpi : c1/temp1
+ ; q9: 3 * sinpi : d1/temp2
+ ; q10: 1 * cospi
+ ; q11: 3 * cospi
+ vqdmulh.s16 q8, q5, d0[2] ; sinpi8sqrt2
+ vqdmulh.s16 q9, q7, d0[2]
+ vqdmulh.s16 q10, q5, d0[0] ; cospi8sqrt2minus1
+ vqdmulh.s16 q11, q7, d0[0]
+
+ vqadd.s16 q2, q4, q6 ; a1 = 0 + 2
+ vqsub.s16 q3, q4, q6 ; b1 = 0 - 2
+
+ ; see note on shifting above
+ vshr.s16 q10, q10, #1
+ vshr.s16 q11, q11, #1
+
+ ; q10: 1 + 1 * cospi : d1/temp1
+ ; q11: 3 + 3 * cospi : c1/temp2
+ vqadd.s16 q10, q5, q10
+ vqadd.s16 q11, q7, q11
+
+ ; q8: c1 = temp1 - temp2
+ ; q9: d1 = temp1 + temp2
+ vqsub.s16 q8, q8, q11
+ vqadd.s16 q9, q10, q9
+
+ ; a1+d1
+ ; b1+c1
+ ; b1-c1
+ ; a1-d1
+ vqadd.s16 q4, q2, q9
+ vqadd.s16 q5, q3, q8
+ vqsub.s16 q6, q3, q8
+ vqsub.s16 q7, q2, q9
+
+ ; +4 >> 3 (rounding)
+ vrshr.s16 q4, q4, #3 ; lo
+ vrshr.s16 q5, q5, #3
+ vrshr.s16 q6, q6, #3 ; hi
+ vrshr.s16 q7, q7, #3
+
+ vtrn.32 q4, q6
+ vtrn.32 q5, q7
+ vtrn.16 q4, q5
+ vtrn.16 q6, q7
+
+ ; adding pre
+ ; input is still packed. pre was read interleaved
+ vaddw.u8 q4, q4, d28
+ vaddw.u8 q5, q5, d29
+ vaddw.u8 q6, q6, d30
+ vaddw.u8 q7, q7, d31
+
+ vmov.i16 q14, #0
+ vmov q15, q14
+ vst1.16 {q14, q15}, [r0] ; write over high input
+ sub r0, r0, #32
+ vst1.16 {q14, q15}, [r0] ; write over low input
+
+ ;saturate and narrow
+ vqmovun.s16 d0, q4 ; lo
+ vqmovun.s16 d1, q5
+ vqmovun.s16 d2, q6 ; hi
+ vqmovun.s16 d3, q7
+
+ ldr r1, [sp, #4] ; stride
+ add r2, r3, #4 ; hi
+ vst1.32 {d0[0]}, [r3], r1 ; lo
+ vst1.32 {d0[1]}, [r2], r1 ; hi
+ vst1.32 {d1[0]}, [r3], r1
+ vst1.32 {d1[1]}, [r2], r1
+ vst1.32 {d2[0]}, [r3], r1
+ vst1.32 {d2[1]}, [r2], r1
+ vst1.32 {d3[0]}, [r3]
+ vst1.32 {d3[1]}, [r2]
+
+ bx lr
+
+ ENDP ; |idct_dequant_full_2x_neon|
+
+; Constant Pool
+cospi8sqrt2minus1 DCD 0x4e7b
+; because the lowest bit in 0x8a8c is 0, we can pre-shift this
+sinpi8sqrt2 DCD 0x4546
+
+ END
--- /dev/null
+++ b/vp9/decoder/asm_dec_offsets.c
@@ -1,0 +1,39 @@
+/*
+ * Copyright (c) 2011 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "vpx_ports/asm_offsets.h"
+#include "onyxd_int.h"
+
+BEGIN
+
+DEFINE(detok_scan, offsetof(DETOK, scan));
+DEFINE(detok_ptr_block2leftabove, offsetof(DETOK, ptr_block2leftabove));
+DEFINE(detok_coef_tree_ptr, offsetof(DETOK, vp9_coef_tree_ptr));
+DEFINE(detok_norm_ptr, offsetof(DETOK, norm_ptr));
+DEFINE(detok_ptr_coef_bands_x, offsetof(DETOK, ptr_coef_bands_x));
+
+DEFINE(detok_A, offsetof(DETOK, A));
+DEFINE(detok_L, offsetof(DETOK, L));
+
+DEFINE(detok_qcoeff_start_ptr, offsetof(DETOK, qcoeff_start_ptr));
+DEFINE(detok_coef_probs, offsetof(DETOK, coef_probs));
+DEFINE(detok_eob, offsetof(DETOK, eob));
+
+DEFINE(bool_decoder_user_buffer_end, offsetof(BOOL_DECODER, user_buffer_end));
+DEFINE(bool_decoder_user_buffer, offsetof(BOOL_DECODER, user_buffer));
+DEFINE(bool_decoder_value, offsetof(BOOL_DECODER, value));
+DEFINE(bool_decoder_count, offsetof(BOOL_DECODER, count));
+DEFINE(bool_decoder_range, offsetof(BOOL_DECODER, range));
+
+END
+
+/* add asserts for any offset that is not supported by assembly code */
+/* add asserts for any size that is not supported by assembly code */
--- /dev/null
+++ b/vp9/decoder/dboolhuff.c
@@ -1,0 +1,100 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "dboolhuff.h"
+#include "vpx_ports/mem.h"
+#include "vpx_mem/vpx_mem.h"
+
+int vp9_start_decode(BOOL_DECODER *br,
+ const unsigned char *source,
+ unsigned int source_sz) {
+ br->user_buffer_end = source + source_sz;
+ br->user_buffer = source;
+ br->value = 0;
+ br->count = -8;
+ br->range = 255;
+
+ if (source_sz && !source)
+ return 1;
+
+ /* Populate the buffer */
+ vp9_bool_decoder_fill(br);
+
+ return 0;
+}
+
+
+void vp9_bool_decoder_fill(BOOL_DECODER *br) {
+ const unsigned char *bufptr;
+ const unsigned char *bufend;
+ VP9_BD_VALUE value;
+ int count;
+ bufend = br->user_buffer_end;
+ bufptr = br->user_buffer;
+ value = br->value;
+ count = br->count;
+
+ VP9DX_BOOL_DECODER_FILL(count, value, bufptr, bufend);
+
+ br->user_buffer = bufptr;
+ br->value = value;
+ br->count = count;
+}
+
+
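+/* Returns the number of bits needed to represent values in [0, num_values),
+ * i.e. the bit width of num_values - 1 (0 for a single value). For example,
+ * 2 -> 1 bit, 5 -> 3 bits, 8 -> 3 bits, 9 -> 4 bits. */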
+static int get_unsigned_bits(unsigned num_values) {
+ int cat = 0;
+ if ((num_values--) <= 1) return 0;
+ while (num_values > 0) {
+ cat++;
+ num_values >>= 1;
+ }
+ return cat;
+}
+
+int vp9_inv_recenter_nonneg(int v, int m) {
+ if (v > (m << 1)) return v;
+ else if ((v & 1) == 0) return (v >> 1) + m;
+ else return m - ((v + 1) >> 1);
+}
+
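+/* Reads a value in [0, n) with a quasi-uniform code: with l bits needed to
+ * cover n values and m = (1 << l) - n, the first m values are sent in l - 1
+ * bits and the remaining n - m values in l bits. */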
+int vp9_decode_uniform(BOOL_DECODER *br, int n) {
+ int v;
+ int l = get_unsigned_bits(n);
+ int m = (1 << l) - n;
+ if (!l) return 0;
+ v = decode_value(br, l - 1);
+ if (v < m)
+ return v;
+ else
+ return (v << 1) - m + decode_value(br, 1);
+}
+
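+/* Decodes a symbol coded with a terminated sub-exponential code of order k:
+ * the alphabet is split into intervals of size 2^k, 2^k, 2^(k+1), ...; each
+ * '1' bit moves on to the next interval and a '0' selects the current one,
+ * whose offset is then read as raw bits. Once the remaining alphabet is
+ * small enough, the tail is coded with the quasi-uniform code above. */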
+int vp9_decode_term_subexp(BOOL_DECODER *br, int k, int num_syms) {
+ int i = 0, mk = 0, word;
+ while (1) {
+ int b = (i ? k + i - 1 : k);
+ int a = (1 << b);
+ if (num_syms <= mk + 3 * a) {
+ word = vp9_decode_uniform(br, num_syms - mk) + mk;
+ break;
+ } else {
+ if (decode_value(br, 1)) {
+ i++;
+ mk += a;
+ } else {
+ word = decode_value(br, b) + mk;
+ break;
+ }
+ }
+ }
+ return word;
+}
--- /dev/null
+++ b/vp9/decoder/dboolhuff.h
@@ -1,0 +1,153 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef DBOOLHUFF_H
+#define DBOOLHUFF_H
+#include <stddef.h>
+#include <limits.h>
+#include "vpx_ports/config.h"
+#include "vpx_ports/mem.h"
+#include "vpx/vpx_integer.h"
+
+typedef size_t VP9_BD_VALUE;
+
+# define VP9_BD_VALUE_SIZE ((int)sizeof(VP9_BD_VALUE)*CHAR_BIT)
+/*This is meant to be a large, positive constant that can still be efficiently
+ loaded as an immediate (on platforms like ARM, for example).
+ Even relatively modest values like 100 would work fine.*/
+# define VP9_LOTS_OF_BITS (0x40000000)
+
+typedef struct {
+ const unsigned char *user_buffer_end;
+ const unsigned char *user_buffer;
+ VP9_BD_VALUE value;
+ int count;
+ unsigned int range;
+} BOOL_DECODER;
+
+DECLARE_ALIGNED(16, extern const unsigned char, vp9_norm[256]);
+
+int vp9_start_decode(BOOL_DECODER *br,
+ const unsigned char *source,
+ unsigned int source_sz);
+
+void vp9_bool_decoder_fill(BOOL_DECODER *br);
+
+int vp9_decode_uniform(BOOL_DECODER *br, int n);
+int vp9_decode_term_subexp(BOOL_DECODER *br, int k, int num_syms);
+int vp9_inv_recenter_nonneg(int v, int m);
+
+/*The refill loop is used in several places, so define it in a macro to make
+ sure they're all consistent.
+ An inline function would be cleaner, but has a significant penalty, because
+ multiple BOOL_DECODER fields must be modified, and the compiler is not smart
+ enough to eliminate the stores to those fields and the subsequent reloads
+ from them when inlining the function.*/
+#define VP9DX_BOOL_DECODER_FILL(_count,_value,_bufptr,_bufend) \
+ do \
+ { \
+ int shift = VP9_BD_VALUE_SIZE - 8 - ((_count) + 8); \
+ int loop_end, x; \
+ size_t bits_left = ((_bufend)-(_bufptr))*CHAR_BIT; \
+ \
+ x = shift + CHAR_BIT - bits_left; \
+ loop_end = 0; \
+ if(x >= 0) \
+ { \
+ (_count) += VP9_LOTS_OF_BITS; \
+ loop_end = x; \
+ if(!bits_left) break; \
+ } \
+ while(shift >= loop_end) \
+ { \
+ (_count) += CHAR_BIT; \
+ (_value) |= (VP9_BD_VALUE)*(_bufptr)++ << shift; \
+ shift -= CHAR_BIT; \
+ } \
+ } \
+ while(0) \
+
+
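+/* Decodes a single boolean. 'probability' is the probability of a zero,
+ * expressed as an 8-bit value (128 means one half). The current range is
+ * split in proportion to that probability, the decoded bit selects one of
+ * the two sub-ranges, and range/value are then renormalized (vp9_norm gives
+ * the shift that brings the range back above 128). */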
+static int decode_bool(BOOL_DECODER *br, int probability) {
+ unsigned int bit = 0;
+ VP9_BD_VALUE value;
+ unsigned int split;
+ VP9_BD_VALUE bigsplit;
+ int count;
+ unsigned int range;
+
+ split = 1 + (((br->range - 1) * probability) >> 8);
+
+ if (br->count < 0)
+ vp9_bool_decoder_fill(br);
+
+ value = br->value;
+ count = br->count;
+
+ bigsplit = (VP9_BD_VALUE)split << (VP9_BD_VALUE_SIZE - 8);
+
+ range = split;
+
+ if (value >= bigsplit) {
+ range = br->range - split;
+ value = value - bigsplit;
+ bit = 1;
+ }
+
+ {
+ register unsigned int shift = vp9_norm[range];
+ range <<= shift;
+ value <<= shift;
+ count -= shift;
+ }
+ br->value = value;
+ br->count = count;
+ br->range = range;
+
+ return bit;
+}
+
+static int decode_value(BOOL_DECODER *br, int bits) {
+ int z = 0;
+ int bit;
+
+ for (bit = bits - 1; bit >= 0; bit--) {
+ z |= (decode_bool(br, 0x80) << bit);
+ }
+
+ return z;
+}
+
+static int bool_error(BOOL_DECODER *br) {
+ /* Check if we have reached the end of the buffer.
+ *
+ * Variable 'count' stores the number of bits in the 'value' buffer, minus
+ * 8. The top byte is part of the algorithm, and the remainder is buffered
+ * to be shifted into it. So if count == 8, the top 16 bits of 'value' are
+ * occupied, 8 for the algorithm and 8 in the buffer.
+ *
+ * When reading a byte from the user's buffer, count is filled with 8 and
+ * one byte is filled into the value buffer. When we reach the end of the
+ * data, count is additionally filled with VP9_LOTS_OF_BITS. So when
+ * count == VP9_LOTS_OF_BITS - 1, the user's data has been exhausted.
+ */
+ if ((br->count > VP9_BD_VALUE_SIZE) && (br->count < VP9_LOTS_OF_BITS)) {
+ /* We have tried to decode bits after the end of
+ * stream was encountered.
+ */
+ return 1;
+ }
+
+ /* No error. */
+ return 0;
+}
+
+#endif
--- /dev/null
+++ b/vp9/decoder/decodemv.c
@@ -1,0 +1,1199 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "treereader.h"
+#include "vp9/common/entropymv.h"
+#include "vp9/common/entropymode.h"
+#include "onyxd_int.h"
+#include "vp9/common/findnearmv.h"
+
+#include "vp9/common/seg_common.h"
+#include "vp9/common/pred_common.h"
+#include "vp9/common/entropy.h"
+#include "vp9/decoder/decodemv.h"
+#if CONFIG_DEBUG
+#include <assert.h>
+#endif
+
+// #define DEBUG_DEC_MV
+#ifdef DEBUG_DEC_MV
+int dec_mvcount = 0;
+#endif
+
+static int read_bmode(vp9_reader *bc, const vp9_prob *p) {
+ return treed_read(bc, vp9_bmode_tree, p);
+}
+
+static int read_ymode(vp9_reader *bc, const vp9_prob *p) {
+ return treed_read(bc, vp9_ymode_tree, p);
+}
+
+#if CONFIG_SUPERBLOCKS
+static int read_kf_sb_ymode(vp9_reader *bc, const vp9_prob *p) {
+ return treed_read(bc, vp9_uv_mode_tree, p);
+}
+#endif
+
+static int read_kf_mb_ymode(vp9_reader *bc, const vp9_prob *p) {
+ return treed_read(bc, vp9_kf_ymode_tree, p);
+}
+
+static int read_i8x8_mode(vp9_reader *bc, const vp9_prob *p) {
+ return treed_read(bc, vp9_i8x8_mode_tree, p);
+}
+
+static int read_uv_mode(vp9_reader *bc, const vp9_prob *p) {
+ return treed_read(bc, vp9_uv_mode_tree, p);
+}
+
+// This function reads the current macroblock's segment id from the bitstream.
+// It should only be called if a segment map update is indicated.
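+// The segment id is coded with a small binary tree: the first probability
+// splits {0, 1} from {2, 3}, and the second or third probability then picks
+// the leaf within the chosen pair.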
+static void read_mb_segid(vp9_reader *r, MB_MODE_INFO *mi,
+ MACROBLOCKD *xd) {
+ /* Is segmentation enabled */
+ if (xd->segmentation_enabled && xd->update_mb_segmentation_map) {
+ /* If so then read the segment id. */
+ if (vp9_read(r, xd->mb_segment_tree_probs[0]))
+ mi->segment_id =
+ (unsigned char)(2 + vp9_read(r, xd->mb_segment_tree_probs[2]));
+ else
+ mi->segment_id =
+ (unsigned char)(vp9_read(r, xd->mb_segment_tree_probs[1]));
+ }
+}
+
+#if CONFIG_NEW_MVREF
+int vp9_read_mv_ref_id(vp9_reader *r,
+ vp9_prob * ref_id_probs) {
+ int ref_index = 0;
+
+ if (vp9_read(r, ref_id_probs[0])) {
+ ref_index++;
+ if (vp9_read(r, ref_id_probs[1])) {
+ ref_index++;
+ if (vp9_read(r, ref_id_probs[2]))
+ ref_index++;
+ }
+ }
+ return ref_index;
+}
+#endif
+
+extern const int vp9_i8x8_block[4];
+static void kfread_modes(VP9D_COMP *pbi,
+ MODE_INFO *m,
+ int mb_row,
+ int mb_col,
+ BOOL_DECODER* const bc) {
+ VP9_COMMON *const cm = &pbi->common;
+ const int mis = pbi->common.mode_info_stride;
+ int map_index = mb_row * pbi->common.mb_cols + mb_col;
+ MB_PREDICTION_MODE y_mode;
+
+ // Read the Macroblock segmentation map if it is being updated explicitly
+ // this frame (reset to 0 by default).
+ m->mbmi.segment_id = 0;
+ if (pbi->mb.update_mb_segmentation_map) {
+ read_mb_segid(bc, &m->mbmi, &pbi->mb);
+ pbi->common.last_frame_seg_map[map_index] = m->mbmi.segment_id;
+ }
+
+ m->mbmi.mb_skip_coeff = 0;
+ if (pbi->common.mb_no_coeff_skip &&
+ (!vp9_segfeature_active(&pbi->mb,
+ m->mbmi.segment_id, SEG_LVL_EOB) ||
+ (vp9_get_segdata(&pbi->mb,
+ m->mbmi.segment_id, SEG_LVL_EOB) != 0))) {
+ MACROBLOCKD *const xd = &pbi->mb;
+ m->mbmi.mb_skip_coeff =
+ vp9_read(bc, vp9_get_pred_prob(cm, xd, PRED_MBSKIP));
+ } else {
+ if (vp9_segfeature_active(&pbi->mb,
+ m->mbmi.segment_id, SEG_LVL_EOB) &&
+ (vp9_get_segdata(&pbi->mb,
+ m->mbmi.segment_id, SEG_LVL_EOB) == 0)) {
+ m->mbmi.mb_skip_coeff = 1;
+ } else
+ m->mbmi.mb_skip_coeff = 0;
+ }
+
+#if CONFIG_SUPERBLOCKS
+ if (m->mbmi.encoded_as_sb) {
+ y_mode = (MB_PREDICTION_MODE) read_kf_sb_ymode(bc,
+ pbi->common.sb_kf_ymode_prob[pbi->common.kf_ymode_probs_index]);
+ } else
+#endif
+ y_mode = (MB_PREDICTION_MODE) read_kf_mb_ymode(bc,
+ pbi->common.kf_ymode_prob[pbi->common.kf_ymode_probs_index]);
+#if CONFIG_COMP_INTRA_PRED
+ m->mbmi.second_mode = (MB_PREDICTION_MODE)(DC_PRED - 1);
+#endif
+
+ m->mbmi.ref_frame = INTRA_FRAME;
+
+ if ((m->mbmi.mode = y_mode) == B_PRED) {
+ int i = 0;
+#if CONFIG_COMP_INTRA_PRED
+ int use_comp_pred = vp9_read(bc, 128);
+#endif
+ do {
+ const B_PREDICTION_MODE A = above_block_mode(m, i, mis);
+ const B_PREDICTION_MODE L = left_block_mode(m, i);
+
+ m->bmi[i].as_mode.first =
+ (B_PREDICTION_MODE) read_bmode(
+ bc, pbi->common.kf_bmode_prob [A] [L]);
+#if CONFIG_COMP_INTRA_PRED
+ if (use_comp_pred) {
+ m->bmi[i].as_mode.second =
+ (B_PREDICTION_MODE) read_bmode(
+ bc, pbi->common.kf_bmode_prob [A] [L]);
+ } else {
+ m->bmi[i].as_mode.second = (B_PREDICTION_MODE)(B_DC_PRED - 1);
+ }
+#endif
+ } while (++i < 16);
+ }
+ if ((m->mbmi.mode = y_mode) == I8X8_PRED) {
+ int i;
+ int mode8x8;
+ for (i = 0; i < 4; i++) {
+ int ib = vp9_i8x8_block[i];
+ mode8x8 = read_i8x8_mode(bc, pbi->common.fc.i8x8_mode_prob);
+ m->bmi[ib + 0].as_mode.first = mode8x8;
+ m->bmi[ib + 1].as_mode.first = mode8x8;
+ m->bmi[ib + 4].as_mode.first = mode8x8;
+ m->bmi[ib + 5].as_mode.first = mode8x8;
+#if CONFIG_COMP_INTRA_PRED
+ m->bmi[ib + 0].as_mode.second = (MB_PREDICTION_MODE)(DC_PRED - 1);
+ m->bmi[ib + 1].as_mode.second = (MB_PREDICTION_MODE)(DC_PRED - 1);
+ m->bmi[ib + 4].as_mode.second = (MB_PREDICTION_MODE)(DC_PRED - 1);
+ m->bmi[ib + 5].as_mode.second = (MB_PREDICTION_MODE)(DC_PRED - 1);
+#endif
+ }
+ } else
+ m->mbmi.uv_mode = (MB_PREDICTION_MODE)read_uv_mode(bc,
+ pbi->common.kf_uv_mode_prob[m->mbmi.mode]);
+#if CONFIG_COMP_INTRA_PRED
+ m->mbmi.second_uv_mode = (MB_PREDICTION_MODE)(DC_PRED - 1);
+#endif
+
+#if CONFIG_SUPERBLOCKS
+ if (m->mbmi.encoded_as_sb)
+ m->mbmi.txfm_size = TX_8X8;
+ else
+#endif
+ if (cm->txfm_mode == TX_MODE_SELECT && m->mbmi.mb_skip_coeff == 0 &&
+ m->mbmi.mode <= I8X8_PRED) {
+ // FIXME(rbultje) code ternary symbol once all experiments are merged
+ m->mbmi.txfm_size = vp9_read(bc, cm->prob_tx[0]);
+ if (m->mbmi.txfm_size != TX_4X4 && m->mbmi.mode != I8X8_PRED)
+ m->mbmi.txfm_size += vp9_read(bc, cm->prob_tx[1]);
+ } else if (cm->txfm_mode >= ALLOW_16X16 && m->mbmi.mode <= TM_PRED) {
+ m->mbmi.txfm_size = TX_16X16;
+ } else if (cm->txfm_mode >= ALLOW_8X8 && m->mbmi.mode != B_PRED) {
+ m->mbmi.txfm_size = TX_8X8;
+ } else {
+ m->mbmi.txfm_size = TX_4X4;
+ }
+}
+
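+// A motion vector component is coded as a sign bit, a magnitude class and
+// an integer offset within that class. Only the integer-pel part of the
+// magnitude is read here; the fractional (quarter-pel) and high-precision
+// (eighth-pel) bits are added later by read_nmv_component_fp().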
+static int read_nmv_component(vp9_reader *r,
+ int rv,
+ const nmv_component *mvcomp) {
+ int v, s, z, c, o, d;
+ s = vp9_read(r, mvcomp->sign);
+ c = treed_read(r, vp9_mv_class_tree, mvcomp->classes);
+ if (c == MV_CLASS_0) {
+ d = treed_read(r, vp9_mv_class0_tree, mvcomp->class0);
+ } else {
+ int i, b;
+ d = 0;
+ b = c + CLASS0_BITS - 1; /* number of bits */
+ for (i = 0; i < b; ++i)
+ d |= (vp9_read(r, mvcomp->bits[i]) << i);
+ }
+ o = d << 3;
+
+ z = vp9_get_mv_mag(c, o);
+ v = (s ? -(z + 8) : (z + 8));
+ return v;
+}
+
+static int read_nmv_component_fp(vp9_reader *r,
+ int v,
+ int rv,
+ const nmv_component *mvcomp,
+ int usehp) {
+ int s, z, c, o, d, e, f;
+ s = v < 0;
+ z = (s ? -v : v) - 1; /* magnitude - 1 */
+ z &= ~7;
+
+ c = vp9_get_mv_class(z, &o);
+ d = o >> 3;
+
+ if (c == MV_CLASS_0) {
+ f = treed_read(r, vp9_mv_fp_tree, mvcomp->class0_fp[d]);
+ } else {
+ f = treed_read(r, vp9_mv_fp_tree, mvcomp->fp);
+ }
+ o += (f << 1);
+
+ if (usehp) {
+ if (c == MV_CLASS_0) {
+ e = vp9_read(r, mvcomp->class0_hp);
+ } else {
+ e = vp9_read(r, mvcomp->hp);
+ }
+ o += e;
+ } else {
+ ++o; /* Note if hp is not used, the default value of the hp bit is 1 */
+ }
+ z = vp9_get_mv_mag(c, o);
+ v = (s ? -(z + 1) : (z + 1));
+ return v;
+}
+
+static void read_nmv(vp9_reader *r, MV *mv, const MV *ref,
+ const nmv_context *mvctx) {
+ MV_JOINT_TYPE j = treed_read(r, vp9_mv_joint_tree, mvctx->joints);
+ mv->row = mv-> col = 0;
+ if (j == MV_JOINT_HZVNZ || j == MV_JOINT_HNZVNZ) {
+ mv->row = read_nmv_component(r, ref->row, &mvctx->comps[0]);
+ }
+ if (j == MV_JOINT_HNZVZ || j == MV_JOINT_HNZVNZ) {
+ mv->col = read_nmv_component(r, ref->col, &mvctx->comps[1]);
+ }
+}
+
+static void read_nmv_fp(vp9_reader *r, MV *mv, const MV *ref,
+ const nmv_context *mvctx, int usehp) {
+ MV_JOINT_TYPE j = vp9_get_mv_joint(*mv);
+ usehp = usehp && vp9_use_nmv_hp(ref);
+ if (j == MV_JOINT_HZVNZ || j == MV_JOINT_HNZVNZ) {
+ mv->row = read_nmv_component_fp(r, mv->row, ref->row, &mvctx->comps[0],
+ usehp);
+ }
+ if (j == MV_JOINT_HNZVZ || j == MV_JOINT_HNZVNZ) {
+ mv->col = read_nmv_component_fp(r, mv->col, ref->col, &mvctx->comps[1],
+ usehp);
+ }
+ //printf(" %d: %d %d ref: %d %d\n", usehp, mv->row, mv-> col, ref->row, ref->col);
+}
+
+static void update_nmv(vp9_reader *bc, vp9_prob *const p,
+ const vp9_prob upd_p) {
+ if (vp9_read(bc, upd_p)) {
+#ifdef LOW_PRECISION_MV_UPDATE
+ *p = (vp9_read_literal(bc, 7) << 1) | 1;
+#else
+ *p = (vp9_read_literal(bc, 8));
+#endif
+ }
+}
+
+static void read_nmvprobs(vp9_reader *bc, nmv_context *mvctx,
+ int usehp) {
+ int i, j, k;
+#ifdef MV_GROUP_UPDATE
+ if (!vp9_read_bit(bc)) return;
+#endif
+ for (j = 0; j < MV_JOINTS - 1; ++j) {
+ update_nmv(bc, &mvctx->joints[j],
+ VP9_NMV_UPDATE_PROB);
+ }
+ for (i = 0; i < 2; ++i) {
+ update_nmv(bc, &mvctx->comps[i].sign,
+ VP9_NMV_UPDATE_PROB);
+ for (j = 0; j < MV_CLASSES - 1; ++j) {
+ update_nmv(bc, &mvctx->comps[i].classes[j],
+ VP9_NMV_UPDATE_PROB);
+ }
+ for (j = 0; j < CLASS0_SIZE - 1; ++j) {
+ update_nmv(bc, &mvctx->comps[i].class0[j],
+ VP9_NMV_UPDATE_PROB);
+ }
+ for (j = 0; j < MV_OFFSET_BITS; ++j) {
+ update_nmv(bc, &mvctx->comps[i].bits[j],
+ VP9_NMV_UPDATE_PROB);
+ }
+ }
+
+ for (i = 0; i < 2; ++i) {
+ for (j = 0; j < CLASS0_SIZE; ++j) {
+ for (k = 0; k < 3; ++k)
+ update_nmv(bc, &mvctx->comps[i].class0_fp[j][k],
+ VP9_NMV_UPDATE_PROB);
+ }
+ for (j = 0; j < 3; ++j) {
+ update_nmv(bc, &mvctx->comps[i].fp[j],
+ VP9_NMV_UPDATE_PROB);
+ }
+ }
+
+ if (usehp) {
+ for (i = 0; i < 2; ++i) {
+ update_nmv(bc, &mvctx->comps[i].class0_hp,
+ VP9_NMV_UPDATE_PROB);
+ update_nmv(bc, &mvctx->comps[i].hp,
+ VP9_NMV_UPDATE_PROB);
+ }
+ }
+}
+
+// Read the reference frame
+static MV_REFERENCE_FRAME read_ref_frame(VP9D_COMP *pbi,
+ vp9_reader *const bc,
+ unsigned char segment_id) {
+ MV_REFERENCE_FRAME ref_frame;
+ int seg_ref_active;
+ int seg_ref_count = 0;
+
+ VP9_COMMON *const cm = &pbi->common;
+ MACROBLOCKD *const xd = &pbi->mb;
+
+ seg_ref_active = vp9_segfeature_active(xd,
+ segment_id,
+ SEG_LVL_REF_FRAME);
+
+  // If segment coding is enabled, does the segment allow for more than one
+  // possible reference frame?
+ if (seg_ref_active) {
+ seg_ref_count = vp9_check_segref(xd, segment_id, INTRA_FRAME) +
+ vp9_check_segref(xd, segment_id, LAST_FRAME) +
+ vp9_check_segref(xd, segment_id, GOLDEN_FRAME) +
+ vp9_check_segref(xd, segment_id, ALTREF_FRAME);
+ }
+
+  // Segment reference frame features are not available, or they allow
+  // multiple reference frame options
+ if (!seg_ref_active || (seg_ref_count > 1)) {
+ // Values used in prediction model coding
+ unsigned char prediction_flag;
+ vp9_prob pred_prob;
+ MV_REFERENCE_FRAME pred_ref;
+
+    // Get the context probability for the prediction flag
+ pred_prob = vp9_get_pred_prob(cm, xd, PRED_REF);
+
+ // Read the prediction status flag
+ prediction_flag = (unsigned char)vp9_read(bc, pred_prob);
+
+ // Store the prediction flag.
+ vp9_set_pred_flag(xd, PRED_REF, prediction_flag);
+
+ // Get the predicted reference frame.
+ pred_ref = vp9_get_pred_ref(cm, xd);
+
+ // If correctly predicted then use the predicted value
+ if (prediction_flag) {
+ ref_frame = pred_ref;
+ }
+ // else decode the explicitly coded value
+ else {
+ vp9_prob mod_refprobs[PREDICTION_PROBS];
+ vpx_memcpy(mod_refprobs,
+ cm->mod_refprobs[pred_ref], sizeof(mod_refprobs));
+
+      // If segment coding is enabled, blank out options that can't occur by
+ // setting the branch probability to 0.
+ if (seg_ref_active) {
+ mod_refprobs[INTRA_FRAME] *=
+ vp9_check_segref(xd, segment_id, INTRA_FRAME);
+ mod_refprobs[LAST_FRAME] *=
+ vp9_check_segref(xd, segment_id, LAST_FRAME);
+ mod_refprobs[GOLDEN_FRAME] *=
+ (vp9_check_segref(xd, segment_id, GOLDEN_FRAME) *
+ vp9_check_segref(xd, segment_id, ALTREF_FRAME));
+ }
+
+ // Default to INTRA_FRAME (value 0)
+ ref_frame = INTRA_FRAME;
+
+ // Do we need to decode the Intra/Inter branch
+ if (mod_refprobs[0])
+ ref_frame = (MV_REFERENCE_FRAME) vp9_read(bc, mod_refprobs[0]);
+ else
+ ref_frame++;
+
+ if (ref_frame) {
+ // Do we need to decode the Last/Gf_Arf branch
+ if (mod_refprobs[1])
+ ref_frame += vp9_read(bc, mod_refprobs[1]);
+ else
+ ref_frame++;
+
+ if (ref_frame > 1) {
+ // Do we need to decode the GF/Arf branch
+ if (mod_refprobs[2])
+ ref_frame += vp9_read(bc, mod_refprobs[2]);
+ else {
+ if (seg_ref_active) {
+ if ((pred_ref == GOLDEN_FRAME) ||
+ !vp9_check_segref(xd, segment_id, GOLDEN_FRAME)) {
+ ref_frame = ALTREF_FRAME;
+ } else
+ ref_frame = GOLDEN_FRAME;
+ } else
+ ref_frame = (pred_ref == GOLDEN_FRAME)
+ ? ALTREF_FRAME : GOLDEN_FRAME;
+ }
+ }
+ }
+ }
+ }
+
+ // Segment reference frame features are enabled
+ else {
+    // The reference frame for the mb is considered as correctly predicted
+ // if it is signaled at the segment level for the purposes of the
+ // common prediction model
+ vp9_set_pred_flag(xd, PRED_REF, 1);
+ ref_frame = vp9_get_pred_ref(cm, xd);
+ }
+
+ return (MV_REFERENCE_FRAME)ref_frame;
+}
+
+#if CONFIG_SUPERBLOCKS
+static MB_PREDICTION_MODE read_sb_mv_ref(vp9_reader *bc, const vp9_prob *p) {
+ return (MB_PREDICTION_MODE) treed_read(bc, vp9_sb_mv_ref_tree, p);
+}
+#endif
+
+static MB_PREDICTION_MODE read_mv_ref(vp9_reader *bc, const vp9_prob *p) {
+ return (MB_PREDICTION_MODE) treed_read(bc, vp9_mv_ref_tree, p);
+}
+
+static B_PREDICTION_MODE sub_mv_ref(vp9_reader *bc, const vp9_prob *p) {
+ return (B_PREDICTION_MODE) treed_read(bc, vp9_sub_mv_ref_tree, p);
+}
+
+#ifdef VPX_MODE_COUNT
+unsigned int vp9_mv_cont_count[5][4] = {
+ { 0, 0, 0, 0 },
+ { 0, 0, 0, 0 },
+ { 0, 0, 0, 0 },
+ { 0, 0, 0, 0 },
+ { 0, 0, 0, 0 }
+};
+#endif
+
+static const unsigned char mbsplit_fill_count[4] = {8, 8, 4, 1};
+static const unsigned char mbsplit_fill_offset[4][16] = {
+ { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15},
+ { 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15},
+ { 0, 1, 4, 5, 2, 3, 6, 7, 8, 9, 12, 13, 10, 11, 14, 15},
+ { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}
+};
+
+static void read_switchable_interp_probs(VP9D_COMP* const pbi,
+ BOOL_DECODER* const bc) {
+ VP9_COMMON *const cm = &pbi->common;
+ int i, j;
+ for (j = 0; j <= VP9_SWITCHABLE_FILTERS; ++j) {
+ for (i = 0; i < VP9_SWITCHABLE_FILTERS - 1; ++i) {
+ cm->fc.switchable_interp_prob[j][i] = vp9_read_literal(bc, 8);
+ }
+ }
+ //printf("DECODER: %d %d\n", cm->fc.switchable_interp_prob[0],
+ //cm->fc.switchable_interp_prob[1]);
+}
+
+static void mb_mode_mv_init(VP9D_COMP *pbi, vp9_reader *bc) {
+ VP9_COMMON *const cm = &pbi->common;
+ nmv_context *const nmvc = &pbi->common.fc.nmvc;
+ MACROBLOCKD *const xd = &pbi->mb;
+
+ if (cm->frame_type == KEY_FRAME) {
+ if (!cm->kf_ymode_probs_update)
+ cm->kf_ymode_probs_index = vp9_read_literal(bc, 3);
+ } else {
+#if CONFIG_PRED_FILTER
+ cm->pred_filter_mode = (vp9_prob)vp9_read_literal(bc, 2);
+
+ if (cm->pred_filter_mode == 2)
+ cm->prob_pred_filter_off = (vp9_prob)vp9_read_literal(bc, 8);
+#endif
+ if (cm->mcomp_filter_type == SWITCHABLE)
+ read_switchable_interp_probs(pbi, bc);
+ // Decode the baseline probabilities for decoding reference frame
+ cm->prob_intra_coded = (vp9_prob)vp9_read_literal(bc, 8);
+ cm->prob_last_coded = (vp9_prob)vp9_read_literal(bc, 8);
+ cm->prob_gf_coded = (vp9_prob)vp9_read_literal(bc, 8);
+
+ // Computes a modified set of probabilities for use when reference
+ // frame prediction fails.
+ vp9_compute_mod_refprobs(cm);
+
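+    // comp_pred_mode is built from one or two even-probability bits,
+    // giving 0 (single prediction only), 1 (compound prediction only) or
+    // 2 (hybrid, with a per-macroblock compound flag read later).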
+ pbi->common.comp_pred_mode = vp9_read(bc, 128);
+ if (cm->comp_pred_mode)
+ cm->comp_pred_mode += vp9_read(bc, 128);
+ if (cm->comp_pred_mode == HYBRID_PREDICTION) {
+ int i;
+ for (i = 0; i < COMP_PRED_CONTEXTS; i++)
+ cm->prob_comppred[i] = (vp9_prob)vp9_read_literal(bc, 8);
+ }
+
+ if (vp9_read_bit(bc)) {
+ int i = 0;
+
+ do {
+ cm->fc.ymode_prob[i] = (vp9_prob) vp9_read_literal(bc, 8);
+ } while (++i < VP9_YMODES - 1);
+ }
+
+#if CONFIG_NEW_MVREF
+    // Temporary default probabilities for encoding the MV ref id signal
+ vpx_memset(xd->mb_mv_ref_id_probs, 192, sizeof(xd->mb_mv_ref_id_probs));
+#endif
+
+ read_nmvprobs(bc, nmvc, xd->allow_high_precision_mv);
+ }
+}
+
+// This function either reads the segment id for the current macroblock from
+// the bitstream or, if the value is temporally predicted, applies the
+// predicted value.
+static void read_mb_segment_id(VP9D_COMP *pbi,
+ int mb_row, int mb_col,
+ BOOL_DECODER* const bc) {
+ VP9_COMMON *const cm = &pbi->common;
+ MACROBLOCKD *const xd = &pbi->mb;
+ MODE_INFO *mi = xd->mode_info_context;
+ MB_MODE_INFO *mbmi = &mi->mbmi;
+ int index = mb_row * pbi->common.mb_cols + mb_col;
+
+ if (xd->segmentation_enabled) {
+ if (xd->update_mb_segmentation_map) {
+ // Is temporal coding of the segment id for this mb enabled.
+ if (cm->temporal_update) {
+ // Get the context based probability for reading the
+ // prediction status flag
+ vp9_prob pred_prob =
+ vp9_get_pred_prob(cm, xd, PRED_SEG_ID);
+
+ // Read the prediction status flag
+ unsigned char seg_pred_flag =
+ (unsigned char)vp9_read(bc, pred_prob);
+
+ // Store the prediction flag.
+ vp9_set_pred_flag(xd, PRED_SEG_ID, seg_pred_flag);
+
+ // If the value is flagged as correctly predicted
+ // then use the predicted value
+ if (seg_pred_flag) {
+ mbmi->segment_id = vp9_get_pred_mb_segid(cm, xd, index);
+ }
+ // Else .... decode it explicitly
+ else {
+ read_mb_segid(bc, mbmi, xd);
+ }
+ }
+ // Normal unpredicted coding mode
+ else {
+ read_mb_segid(bc, mbmi, xd);
+ }
+#if CONFIG_SUPERBLOCKS
+ if (mbmi->encoded_as_sb) {
+ cm->last_frame_seg_map[index] = mbmi->segment_id;
+ if (mb_col + 1 < cm->mb_cols)
+ cm->last_frame_seg_map[index + 1] = mbmi->segment_id;
+ if (mb_row + 1 < cm->mb_rows) {
+ cm->last_frame_seg_map[index + cm->mb_cols] = mbmi->segment_id;
+ if (mb_col + 1 < cm->mb_cols)
+ cm->last_frame_seg_map[index + cm->mb_cols + 1] = mbmi->segment_id;
+ }
+ } else
+#endif
+ {
+ cm->last_frame_seg_map[index] = mbmi->segment_id;
+ }
+ } else {
+#if CONFIG_SUPERBLOCKS
+ if (mbmi->encoded_as_sb) {
+ mbmi->segment_id = cm->last_frame_seg_map[index];
+ if (mb_col < cm->mb_cols - 1)
+ mbmi->segment_id = mbmi->segment_id &&
+ cm->last_frame_seg_map[index + 1];
+ if (mb_row < cm->mb_rows - 1) {
+ mbmi->segment_id = mbmi->segment_id &&
+ cm->last_frame_seg_map[index + cm->mb_cols];
+ if (mb_col < cm->mb_cols - 1)
+ mbmi->segment_id = mbmi->segment_id &&
+ cm->last_frame_seg_map[index + cm->mb_cols + 1];
+ }
+ } else
+#endif
+ {
+ mbmi->segment_id = cm->last_frame_seg_map[index];
+ }
+ }
+ } else {
+ // The encoder explicitly sets the segment_id to 0
+ // when segmentation is disabled
+ mbmi->segment_id = 0;
+ }
+}
+
+static void read_mb_modes_mv(VP9D_COMP *pbi, MODE_INFO *mi, MB_MODE_INFO *mbmi,
+ MODE_INFO *prev_mi,
+ int mb_row, int mb_col,
+ BOOL_DECODER* const bc) {
+ VP9_COMMON *const cm = &pbi->common;
+ nmv_context *const nmvc = &pbi->common.fc.nmvc;
+ const int mis = pbi->common.mode_info_stride;
+ MACROBLOCKD *const xd = &pbi->mb;
+
+ int_mv *const mv = &mbmi->mv;
+ int mb_to_left_edge;
+ int mb_to_right_edge;
+ int mb_to_top_edge;
+ int mb_to_bottom_edge;
+
+ mb_to_top_edge = xd->mb_to_top_edge;
+ mb_to_bottom_edge = xd->mb_to_bottom_edge;
+ mb_to_top_edge -= LEFT_TOP_MARGIN;
+ mb_to_bottom_edge += RIGHT_BOTTOM_MARGIN;
+ mbmi->need_to_clamp_mvs = 0;
+ mbmi->need_to_clamp_secondmv = 0;
+ mbmi->second_ref_frame = 0;
+ /* Distance of Mb to the various image edges.
+   * These are specified to 1/8th pel as they are always compared to MV values that are in 1/8th pel units
+ */
+ xd->mb_to_left_edge =
+ mb_to_left_edge = -((mb_col * 16) << 3);
+ mb_to_left_edge -= LEFT_TOP_MARGIN;
+
+ xd->mb_to_right_edge =
+ mb_to_right_edge = ((pbi->common.mb_cols - 1 - mb_col) * 16) << 3;
+ mb_to_right_edge += RIGHT_BOTTOM_MARGIN;
+
+ // Make sure the MACROBLOCKD mode info pointer is pointed at the
+ // correct entry for the current macroblock.
+ xd->mode_info_context = mi;
+ xd->prev_mode_info_context = prev_mi;
+
+ // Read the macroblock segment id.
+ read_mb_segment_id(pbi, mb_row, mb_col, bc);
+
+ if (pbi->common.mb_no_coeff_skip &&
+ (!vp9_segfeature_active(xd,
+ mbmi->segment_id, SEG_LVL_EOB) ||
+ (vp9_get_segdata(xd, mbmi->segment_id, SEG_LVL_EOB) != 0))) {
+ // Read the macroblock coeff skip flag if this feature is in use,
+ // else default to 0
+ mbmi->mb_skip_coeff = vp9_read(bc, vp9_get_pred_prob(cm, xd, PRED_MBSKIP));
+ } else {
+ if (vp9_segfeature_active(xd,
+ mbmi->segment_id, SEG_LVL_EOB) &&
+ (vp9_get_segdata(xd, mbmi->segment_id, SEG_LVL_EOB) == 0)) {
+ mbmi->mb_skip_coeff = 1;
+ } else
+ mbmi->mb_skip_coeff = 0;
+ }
+
+ // Read the reference frame
+ mbmi->ref_frame = read_ref_frame(pbi, bc, mbmi->segment_id);
+
+ // If reference frame is an Inter frame
+ if (mbmi->ref_frame) {
+ int rct[4];
+ int_mv nearest, nearby, best_mv;
+ int_mv nearest_second, nearby_second, best_mv_second;
+ vp9_prob mv_ref_p [VP9_MVREFS - 1];
+
+#if CONFIG_NEWBESTREFMV
+ int recon_y_stride, recon_yoffset;
+ int recon_uv_stride, recon_uvoffset;
+#endif
+
+ vp9_find_near_mvs(xd, mi,
+ prev_mi,
+ &nearest, &nearby, &best_mv, rct,
+ mbmi->ref_frame, cm->ref_frame_sign_bias);
+
+#if CONFIG_NEWBESTREFMV
+ {
+ int ref_fb_idx;
+ MV_REFERENCE_FRAME ref_frame = mbmi->ref_frame;
+
+ /* Select the appropriate reference frame for this MB */
+ if (ref_frame == LAST_FRAME)
+ ref_fb_idx = cm->lst_fb_idx;
+ else if (ref_frame == GOLDEN_FRAME)
+ ref_fb_idx = cm->gld_fb_idx;
+ else
+ ref_fb_idx = cm->alt_fb_idx;
+
+ recon_y_stride = cm->yv12_fb[ref_fb_idx].y_stride ;
+ recon_uv_stride = cm->yv12_fb[ref_fb_idx].uv_stride;
+
+ recon_yoffset = (mb_row * recon_y_stride * 16) + (mb_col * 16);
+ recon_uvoffset = (mb_row * recon_uv_stride * 8) + (mb_col * 8);
+
+ xd->pre.y_buffer = cm->yv12_fb[ref_fb_idx].y_buffer + recon_yoffset;
+ xd->pre.u_buffer = cm->yv12_fb[ref_fb_idx].u_buffer + recon_uvoffset;
+ xd->pre.v_buffer = cm->yv12_fb[ref_fb_idx].v_buffer + recon_uvoffset;
+
+ vp9_find_mv_refs(xd, mi, prev_mi,
+ ref_frame, mbmi->ref_mvs[ref_frame],
+ cm->ref_frame_sign_bias);
+
+ vp9_find_best_ref_mvs(xd,
+ xd->pre.y_buffer,
+ recon_y_stride,
+ mbmi->ref_mvs[ref_frame],
+ &best_mv, &nearest, &nearby);
+ }
+#endif
+
+ vp9_mv_ref_probs(&pbi->common, mv_ref_p, rct);
+
+ // Is the segment level mode feature enabled for this segment
+ if (vp9_segfeature_active(xd, mbmi->segment_id, SEG_LVL_MODE)) {
+ mbmi->mode =
+ vp9_get_segdata(xd, mbmi->segment_id, SEG_LVL_MODE);
+ } else {
+#if CONFIG_SUPERBLOCKS
+ if (mbmi->encoded_as_sb) {
+ mbmi->mode = read_sb_mv_ref(bc, mv_ref_p);
+ } else
+#endif
+ mbmi->mode = read_mv_ref(bc, mv_ref_p);
+
+ vp9_accum_mv_refs(&pbi->common, mbmi->mode, rct);
+ }
+
+#if CONFIG_PRED_FILTER
+ if (mbmi->mode >= NEARESTMV && mbmi->mode < SPLITMV) {
+ // Is the prediction filter enabled
+ if (cm->pred_filter_mode == 2)
+ mbmi->pred_filter_enabled =
+ vp9_read(bc, cm->prob_pred_filter_off);
+ else
+ mbmi->pred_filter_enabled = cm->pred_filter_mode;
+ }
+#endif
+ if (mbmi->mode >= NEARESTMV && mbmi->mode <= SPLITMV)
+ {
+ if (cm->mcomp_filter_type == SWITCHABLE) {
+ mbmi->interp_filter = vp9_switchable_interp[
+ treed_read(bc, vp9_switchable_interp_tree,
+ vp9_get_pred_probs(cm, xd, PRED_SWITCHABLE_INTERP))];
+ } else {
+ mbmi->interp_filter = cm->mcomp_filter_type;
+ }
+ }
+
+ if (cm->comp_pred_mode == COMP_PREDICTION_ONLY ||
+ (cm->comp_pred_mode == HYBRID_PREDICTION &&
+ vp9_read(bc, vp9_get_pred_prob(cm, xd, PRED_COMP)))) {
+      /* Since we have 3 reference frames, we can only have 3 unique
+       * combinations of 2 different reference frames
+ * (A-G, G-L or A-L). In the bitstream, we use this to simply
+ * derive the second reference frame from the first reference
+ * frame, by saying it's the next one in the enumerator, and
+ * if that's > n_refs, then the second reference frame is the
+ * first one in the enumerator. */
+ mbmi->second_ref_frame = mbmi->ref_frame + 1;
+ if (mbmi->second_ref_frame == 4)
+ mbmi->second_ref_frame = 1;
+#if CONFIG_NEWBESTREFMV
+ if (mbmi->second_ref_frame) {
+ int second_ref_fb_idx;
+ /* Select the appropriate reference frame for this MB */
+ if (mbmi->second_ref_frame == LAST_FRAME)
+ second_ref_fb_idx = cm->lst_fb_idx;
+ else if (mbmi->second_ref_frame ==
+ GOLDEN_FRAME)
+ second_ref_fb_idx = cm->gld_fb_idx;
+ else
+ second_ref_fb_idx = cm->alt_fb_idx;
+
+ xd->second_pre.y_buffer =
+ cm->yv12_fb[second_ref_fb_idx].y_buffer + recon_yoffset;
+ xd->second_pre.u_buffer =
+ cm->yv12_fb[second_ref_fb_idx].u_buffer + recon_uvoffset;
+ xd->second_pre.v_buffer =
+ cm->yv12_fb[second_ref_fb_idx].v_buffer + recon_uvoffset;
+ vp9_find_near_mvs(xd, mi, prev_mi,
+ &nearest_second, &nearby_second, &best_mv_second,
+ rct,
+ mbmi->second_ref_frame,
+ cm->ref_frame_sign_bias);
+
+ vp9_find_mv_refs(xd, mi, prev_mi,
+ mbmi->second_ref_frame,
+ mbmi->ref_mvs[mbmi->second_ref_frame],
+ cm->ref_frame_sign_bias);
+
+ vp9_find_best_ref_mvs(xd,
+ xd->second_pre.y_buffer,
+ recon_y_stride,
+ mbmi->ref_mvs[mbmi->second_ref_frame],
+ &best_mv_second,
+ &nearest_second,
+ &nearby_second);
+ }
+#else
+ vp9_find_near_mvs(xd, mi, prev_mi,
+ &nearest_second, &nearby_second, &best_mv_second,
+ rct,
+ mbmi->second_ref_frame,
+ pbi->common.ref_frame_sign_bias);
+#endif
+ } else {
+ mbmi->second_ref_frame = 0;
+ }
+
+ mbmi->uv_mode = DC_PRED;
+ switch (mbmi->mode) {
+ case SPLITMV: {
+ const int s = mbmi->partitioning =
+ treed_read(bc, vp9_mbsplit_tree, cm->fc.mbsplit_prob);
+ const int num_p = vp9_mbsplit_count [s];
+ int j = 0;
+ cm->fc.mbsplit_counts[s]++;
+
+ mbmi->need_to_clamp_mvs = 0;
+ do { /* for each subset j */
+ int_mv leftmv, abovemv, second_leftmv, second_abovemv;
+ int_mv blockmv, secondmv;
+ int k; /* first block in subset j */
+ int mv_contz;
+ int blockmode;
+
+ k = vp9_mbsplit_offset[s][j];
+
+ leftmv.as_int = left_block_mv(mi, k);
+ abovemv.as_int = above_block_mv(mi, k, mis);
+ if (mbmi->second_ref_frame) {
+ second_leftmv.as_int = left_block_second_mv(mi, k);
+ second_abovemv.as_int = above_block_second_mv(mi, k, mis);
+ }
+ mv_contz = vp9_mv_cont(&leftmv, &abovemv);
+ blockmode = sub_mv_ref(bc, cm->fc.sub_mv_ref_prob [mv_contz]);
+ cm->fc.sub_mv_ref_counts[mv_contz][blockmode - LEFT4X4]++;
+
+ switch (blockmode) {
+ case NEW4X4:
+ read_nmv(bc, &blockmv.as_mv, &best_mv.as_mv, nmvc);
+ read_nmv_fp(bc, &blockmv.as_mv, &best_mv.as_mv, nmvc,
+ xd->allow_high_precision_mv);
+ vp9_increment_nmv(&blockmv.as_mv, &best_mv.as_mv,
+ &cm->fc.NMVcount, xd->allow_high_precision_mv);
+ blockmv.as_mv.row += best_mv.as_mv.row;
+ blockmv.as_mv.col += best_mv.as_mv.col;
+
+ if (mbmi->second_ref_frame) {
+ read_nmv(bc, &secondmv.as_mv, &best_mv_second.as_mv, nmvc);
+ read_nmv_fp(bc, &secondmv.as_mv, &best_mv_second.as_mv, nmvc,
+ xd->allow_high_precision_mv);
+ vp9_increment_nmv(&secondmv.as_mv, &best_mv_second.as_mv,
+ &cm->fc.NMVcount, xd->allow_high_precision_mv);
+ secondmv.as_mv.row += best_mv_second.as_mv.row;
+ secondmv.as_mv.col += best_mv_second.as_mv.col;
+ }
+#ifdef VPX_MODE_COUNT
+ vp9_mv_cont_count[mv_contz][3]++;
+#endif
+ break;
+ case LEFT4X4:
+ blockmv.as_int = leftmv.as_int;
+ if (mbmi->second_ref_frame)
+ secondmv.as_int = second_leftmv.as_int;
+#ifdef VPX_MODE_COUNT
+ vp9_mv_cont_count[mv_contz][0]++;
+#endif
+ break;
+ case ABOVE4X4:
+ blockmv.as_int = abovemv.as_int;
+ if (mbmi->second_ref_frame)
+ secondmv.as_int = second_abovemv.as_int;
+#ifdef VPX_MODE_COUNT
+ vp9_mv_cont_count[mv_contz][1]++;
+#endif
+ break;
+ case ZERO4X4:
+ blockmv.as_int = 0;
+ if (mbmi->second_ref_frame)
+ secondmv.as_int = 0;
+#ifdef VPX_MODE_COUNT
+ vp9_mv_cont_count[mv_contz][2]++;
+#endif
+ break;
+ default:
+ break;
+ }
+
+ mbmi->need_to_clamp_mvs |= check_mv_bounds(&blockmv,
+ mb_to_left_edge,
+ mb_to_right_edge,
+ mb_to_top_edge,
+ mb_to_bottom_edge);
+ if (mbmi->second_ref_frame) {
+ mbmi->need_to_clamp_mvs |= check_mv_bounds(&secondmv,
+ mb_to_left_edge,
+ mb_to_right_edge,
+ mb_to_top_edge,
+ mb_to_bottom_edge);
+ }
+
+ {
+ /* Fill (uniform) modes, mvs of jth subset.
+ Must do it here because ensuing subsets can
+ refer back to us via "left" or "above". */
+ const unsigned char *fill_offset;
+ unsigned int fill_count = mbsplit_fill_count[s];
+
+ fill_offset = &mbsplit_fill_offset[s][(unsigned char)j * mbsplit_fill_count[s]];
+
+ do {
+ mi->bmi[ *fill_offset].as_mv.first.as_int = blockmv.as_int;
+ if (mbmi->second_ref_frame)
+ mi->bmi[ *fill_offset].as_mv.second.as_int = secondmv.as_int;
+ fill_offset++;
+ } while (--fill_count);
+ }
+
+ } while (++j < num_p);
+ }
+
+ mv->as_int = mi->bmi[15].as_mv.first.as_int;
+ mbmi->mv[1].as_int = mi->bmi[15].as_mv.second.as_int;
+
+ break; /* done with SPLITMV */
+
+ case NEARMV:
+ mv->as_int = nearby.as_int;
+      /* Clip "next_nearest" so that it does not extend too far out of the image */
+ clamp_mv(mv, mb_to_left_edge, mb_to_right_edge,
+ mb_to_top_edge, mb_to_bottom_edge);
+ if (mbmi->second_ref_frame) {
+ mbmi->mv[1].as_int = nearby_second.as_int;
+ clamp_mv(&mbmi->mv[1], mb_to_left_edge, mb_to_right_edge,
+ mb_to_top_edge, mb_to_bottom_edge);
+ }
+ break;
+
+ case NEARESTMV:
+ mv->as_int = nearest.as_int;
+      /* Clip "next_nearest" so that it does not extend too far out of the image */
+ clamp_mv(mv, mb_to_left_edge, mb_to_right_edge,
+ mb_to_top_edge, mb_to_bottom_edge);
+ if (mbmi->second_ref_frame) {
+ mbmi->mv[1].as_int = nearest_second.as_int;
+ clamp_mv(&mbmi->mv[1], mb_to_left_edge, mb_to_right_edge,
+ mb_to_top_edge, mb_to_bottom_edge);
+ }
+ break;
+
+ case ZEROMV:
+ mv->as_int = 0;
+ if (mbmi->second_ref_frame)
+ mbmi->mv[1].as_int = 0;
+ break;
+
+ case NEWMV:
+
+#if CONFIG_NEW_MVREF
+ {
+ int best_index;
+ MV_REFERENCE_FRAME ref_frame = mbmi->ref_frame;
+
+        // Decode the index of the choice.
+ best_index =
+ vp9_read_mv_ref_id(bc, xd->mb_mv_ref_id_probs[ref_frame]);
+
+ best_mv.as_int = mbmi->ref_mvs[ref_frame][best_index].as_int;
+ }
+#endif
+
+ read_nmv(bc, &mv->as_mv, &best_mv.as_mv, nmvc);
+ read_nmv_fp(bc, &mv->as_mv, &best_mv.as_mv, nmvc,
+ xd->allow_high_precision_mv);
+ vp9_increment_nmv(&mv->as_mv, &best_mv.as_mv, &cm->fc.NMVcount,
+ xd->allow_high_precision_mv);
+
+ mv->as_mv.row += best_mv.as_mv.row;
+ mv->as_mv.col += best_mv.as_mv.col;
+
+ /* Don't need to check this on NEARMV and NEARESTMV modes
+ * since those modes clamp the MV. The NEWMV mode does not,
+ * so signal to the prediction stage whether special
+ * handling may be required.
+ */
+ mbmi->need_to_clamp_mvs = check_mv_bounds(mv,
+ mb_to_left_edge,
+ mb_to_right_edge,
+ mb_to_top_edge,
+ mb_to_bottom_edge);
+
+ if (mbmi->second_ref_frame) {
+#if CONFIG_NEW_MVREF
+ {
+ int best_index;
+ MV_REFERENCE_FRAME ref_frame = mbmi->second_ref_frame;
+
+          // Decode the index of the choice.
+ best_index =
+ vp9_read_mv_ref_id(bc, xd->mb_mv_ref_id_probs[ref_frame]);
+ best_mv_second.as_int = mbmi->ref_mvs[ref_frame][best_index].as_int;
+ }
+#endif
+
+ read_nmv(bc, &mbmi->mv[1].as_mv, &best_mv_second.as_mv, nmvc);
+ read_nmv_fp(bc, &mbmi->mv[1].as_mv, &best_mv_second.as_mv, nmvc,
+ xd->allow_high_precision_mv);
+ vp9_increment_nmv(&mbmi->mv[1].as_mv, &best_mv_second.as_mv,
+ &cm->fc.NMVcount, xd->allow_high_precision_mv);
+ mbmi->mv[1].as_mv.row += best_mv_second.as_mv.row;
+ mbmi->mv[1].as_mv.col += best_mv_second.as_mv.col;
+ mbmi->need_to_clamp_secondmv |=
+ check_mv_bounds(&mbmi->mv[1],
+ mb_to_left_edge, mb_to_right_edge,
+ mb_to_top_edge, mb_to_bottom_edge);
+ }
+ break;
+ default:
+;
+#if CONFIG_DEBUG
+ assert(0);
+#endif
+ }
+ } else {
+ /* required for left and above block mv */
+ mbmi->mv[0].as_int = 0;
+
+ if (vp9_segfeature_active(xd, mbmi->segment_id, SEG_LVL_MODE))
+ mbmi->mode = (MB_PREDICTION_MODE)
+ vp9_get_segdata(xd, mbmi->segment_id, SEG_LVL_MODE);
+ else {
+ // FIXME write using SB mode tree
+ mbmi->mode = (MB_PREDICTION_MODE)
+ read_ymode(bc, pbi->common.fc.ymode_prob);
+ pbi->common.fc.ymode_counts[mbmi->mode]++;
+ }
+#if CONFIG_COMP_INTRA_PRED
+ mbmi->second_mode = (MB_PREDICTION_MODE)(DC_PRED - 1);
+#endif
+
+ // If MB mode is BPRED read the block modes
+ if (mbmi->mode == B_PRED) {
+ int j = 0;
+#if CONFIG_COMP_INTRA_PRED
+ int use_comp_pred = vp9_read(bc, 128);
+#endif
+ do {
+ mi->bmi[j].as_mode.first = (B_PREDICTION_MODE)read_bmode(bc, pbi->common.fc.bmode_prob);
+ /*
+ {
+ int p;
+ for (p = 0; p < VP9_BINTRAMODES - 1; ++p)
+ printf(" %d", pbi->common.fc.bmode_prob[p]);
+ printf("\nbmode[%d][%d]: %d\n", pbi->common.current_video_frame, j, mi->bmi[j].as_mode.first);
+ }
+ */
+ pbi->common.fc.bmode_counts[mi->bmi[j].as_mode.first]++;
+#if CONFIG_COMP_INTRA_PRED
+ if (use_comp_pred) {
+ mi->bmi[j].as_mode.second = (B_PREDICTION_MODE)read_bmode(bc, pbi->common.fc.bmode_prob);
+ } else {
+ mi->bmi[j].as_mode.second = (B_PREDICTION_MODE)(B_DC_PRED - 1);
+ }
+#endif
+ } while (++j < 16);
+ }
+
+ if (mbmi->mode == I8X8_PRED) {
+ int i;
+ int mode8x8;
+ for (i = 0; i < 4; i++) {
+ int ib = vp9_i8x8_block[i];
+ mode8x8 = read_i8x8_mode(bc, pbi->common.fc.i8x8_mode_prob);
+ mi->bmi[ib + 0].as_mode.first = mode8x8;
+ mi->bmi[ib + 1].as_mode.first = mode8x8;
+ mi->bmi[ib + 4].as_mode.first = mode8x8;
+ mi->bmi[ib + 5].as_mode.first = mode8x8;
+ pbi->common.fc.i8x8_mode_counts[mode8x8]++;
+#if CONFIG_COMP_INTRA_PRED
+ mi->bmi[ib + 0].as_mode.second = (MB_PREDICTION_MODE)(DC_PRED - 1);
+ mi->bmi[ib + 1].as_mode.second = (MB_PREDICTION_MODE)(DC_PRED - 1);
+ mi->bmi[ib + 4].as_mode.second = (MB_PREDICTION_MODE)(DC_PRED - 1);
+ mi->bmi[ib + 5].as_mode.second = (MB_PREDICTION_MODE)(DC_PRED - 1);
+#endif
+ }
+ } else {
+ mbmi->uv_mode = (MB_PREDICTION_MODE)read_uv_mode(
+ bc, pbi->common.fc.uv_mode_prob[mbmi->mode]);
+ pbi->common.fc.uv_mode_counts[mbmi->mode][mbmi->uv_mode]++;
+ }
+
+#if CONFIG_COMP_INTRA_PRED
+ mbmi->second_uv_mode = (MB_PREDICTION_MODE)(DC_PRED - 1);
+#endif
+ }
+
+#if CONFIG_SUPERBLOCKS
+ if (mbmi->encoded_as_sb)
+ mbmi->txfm_size = TX_8X8;
+ else
+#endif
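+  // With TX_MODE_SELECT the transform size is coded per macroblock: one bit
+  // chooses between 4x4 and a larger size, and, where the mode allows it, a
+  // second bit chooses between 8x8 and 16x16. Otherwise the size is implied
+  // by the frame-level txfm_mode and the prediction mode.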
+ if (cm->txfm_mode == TX_MODE_SELECT && mbmi->mb_skip_coeff == 0 &&
+ ((mbmi->ref_frame == INTRA_FRAME && mbmi->mode <= I8X8_PRED) ||
+ (mbmi->ref_frame != INTRA_FRAME && !(mbmi->mode == SPLITMV &&
+ mbmi->partitioning == PARTITIONING_4X4)))) {
+ // FIXME(rbultje) code ternary symbol once all experiments are merged
+ mbmi->txfm_size = vp9_read(bc, cm->prob_tx[0]);
+ if (mbmi->txfm_size != TX_4X4 && mbmi->mode != I8X8_PRED &&
+ mbmi->mode != SPLITMV)
+ mbmi->txfm_size += vp9_read(bc, cm->prob_tx[1]);
+ } else if (cm->txfm_mode >= ALLOW_16X16 &&
+ ((mbmi->ref_frame == INTRA_FRAME && mbmi->mode <= TM_PRED) ||
+ (mbmi->ref_frame != INTRA_FRAME && mbmi->mode != SPLITMV))) {
+ mbmi->txfm_size = TX_16X16;
+ } else if (cm->txfm_mode >= ALLOW_8X8 &&
+ (!(mbmi->ref_frame == INTRA_FRAME && mbmi->mode == B_PRED) &&
+ !(mbmi->ref_frame != INTRA_FRAME && mbmi->mode == SPLITMV &&
+ mbmi->partitioning == PARTITIONING_4X4))) {
+ mbmi->txfm_size = TX_8X8;
+ } else {
+ mbmi->txfm_size = TX_4X4;
+ }
+}
+
+void vp9_decode_mode_mvs_init(VP9D_COMP *pbi, BOOL_DECODER* const bc) {
+ VP9_COMMON *cm = &pbi->common;
+
+ vpx_memset(cm->mbskip_pred_probs, 0, sizeof(cm->mbskip_pred_probs));
+ if (pbi->common.mb_no_coeff_skip) {
+ int k;
+ for (k = 0; k < MBSKIP_CONTEXTS; ++k)
+ cm->mbskip_pred_probs[k] = (vp9_prob)vp9_read_literal(bc, 8);
+ }
+
+ mb_mode_mv_init(pbi, bc);
+}
+
+void vp9_decode_mb_mode_mv(VP9D_COMP *pbi,
+ MACROBLOCKD *xd,
+ int mb_row,
+ int mb_col,
+ BOOL_DECODER* const bc) {
+ MODE_INFO *mi = xd->mode_info_context;
+ MODE_INFO *prev_mi = xd->prev_mode_info_context;
+
+ if (pbi->common.frame_type == KEY_FRAME)
+ kfread_modes(pbi, mi, mb_row, mb_col, bc);
+ else
+ read_mb_modes_mv(pbi, mi, &mi->mbmi, prev_mi, mb_row, mb_col, bc);
+}
--- /dev/null
+++ b/vp9/decoder/decodemv.h
@@ -1,0 +1,19 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "onyxd_int.h"
+
+void vp9_decode_mb_mode_mv(VP9D_COMP* const pbi,
+ MACROBLOCKD* const xd,
+ int mb_row,
+ int mb_col,
+ BOOL_DECODER* const bc);
+void vp9_decode_mode_mvs_init(VP9D_COMP* const pbi, BOOL_DECODER* const bc);
--- /dev/null
+++ b/vp9/decoder/decodframe.c
@@ -1,0 +1,1337 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "onyxd_int.h"
+#include "vp9/common/header.h"
+#include "vp9/common/reconintra.h"
+#include "vp9/common/reconintra4x4.h"
+#include "vp9/common/reconinter.h"
+#include "detokenize.h"
+#include "vp9/common/invtrans.h"
+#include "vp9/common/alloccommon.h"
+#include "vp9/common/entropymode.h"
+#include "vp9/common/quant_common.h"
+#include "vpx_scale/vpxscale.h"
+#include "vpx_scale/yv12extend.h"
+#include "vp9/common/setupintrarecon.h"
+
+#include "decodemv.h"
+#include "vp9/common/extend.h"
+#include "vp9/common/modecont.h"
+#include "vpx_mem/vpx_mem.h"
+#include "vp9/common/idct.h"
+#include "dboolhuff.h"
+
+#include "vp9/common/seg_common.h"
+#include "vp9/common/entropy.h"
+#include "vpx_rtcd.h"
+
+#include <assert.h>
+#include <stdio.h>
+
+
+#define COEFCOUNT_TESTING
+
+static int merge_index(int v, int n, int modulus) {
+ int max1 = (n - 1 - modulus / 2) / modulus + 1;
+ if (v < max1) v = v * modulus + modulus / 2;
+ else {
+ int w;
+ v -= max1;
+ w = v;
+ v += (v + modulus - modulus / 2) / modulus;
+ while (v % modulus == modulus / 2 ||
+ w != v - (v + modulus - modulus / 2) / modulus) v++;
+ }
+ return v;
+}
+
+static int inv_remap_prob(int v, int m) {
+ const int n = 256;
+ const int modulus = MODULUS_PARAM;
+ int i;
+ v = merge_index(v, n - 1, modulus);
+ if ((m << 1) <= n) {
+ i = vp9_inv_recenter_nonneg(v + 1, m);
+ } else {
+ i = n - 1 - vp9_inv_recenter_nonneg(v + 1, n - 1 - m);
+ }
+ return i;
+}
+
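+// Decode a subexponentially-coded probability delta and map it back onto the
+// previous probability value (the inverse of the encoder-side remapping).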
+static vp9_prob read_prob_diff_update(vp9_reader *const bc, int oldp) {
+ int delp = vp9_decode_term_subexp(bc, SUBEXP_PARAM, 255);
+ return (vp9_prob)inv_remap_prob(delp, oldp);
+}
+
+void vp9_init_de_quantizer(VP9D_COMP *pbi) {
+ int i;
+ int Q;
+ VP9_COMMON *const pc = &pbi->common;
+
+ for (Q = 0; Q < QINDEX_RANGE; Q++) {
+ pc->Y1dequant[Q][0] = (short)vp9_dc_quant(Q, pc->y1dc_delta_q);
+ pc->Y2dequant[Q][0] = (short)vp9_dc2quant(Q, pc->y2dc_delta_q);
+ pc->UVdequant[Q][0] = (short)vp9_dc_uv_quant(Q, pc->uvdc_delta_q);
+
+    /* all the AC values share the same quantizer */
+ for (i = 1; i < 16; i++) {
+ int rc = vp9_default_zig_zag1d[i];
+
+ pc->Y1dequant[Q][rc] = (short)vp9_ac_yquant(Q);
+ pc->Y2dequant[Q][rc] = (short)vp9_ac2quant(Q, pc->y2ac_delta_q);
+ pc->UVdequant[Q][rc] = (short)vp9_ac_uv_quant(Q, pc->uvac_delta_q);
+ }
+ }
+}
+
+static void mb_init_dequantizer(VP9D_COMP *pbi, MACROBLOCKD *xd) {
+ int i;
+ int QIndex;
+ VP9_COMMON *const pc = &pbi->common;
+ int segment_id = xd->mode_info_context->mbmi.segment_id;
+
+ // Set the Q baseline allowing for any segment level adjustment
+ if (vp9_segfeature_active(xd, segment_id, SEG_LVL_ALT_Q)) {
+ /* Abs Value */
+ if (xd->mb_segment_abs_delta == SEGMENT_ABSDATA)
+ QIndex = vp9_get_segdata(xd, segment_id, SEG_LVL_ALT_Q);
+
+ /* Delta Value */
+ else {
+ QIndex = pc->base_qindex +
+ vp9_get_segdata(xd, segment_id, SEG_LVL_ALT_Q);
+ QIndex = (QIndex >= 0) ? ((QIndex <= MAXQ) ? QIndex : MAXQ) : 0; /* Clamp to valid range */
+ }
+ } else
+ QIndex = pc->base_qindex;
+ xd->q_index = QIndex;
+
+ /* Set up the block level dequant pointers */
+ for (i = 0; i < 16; i++) {
+ xd->block[i].dequant = pc->Y1dequant[QIndex];
+ }
+
+#if CONFIG_LOSSLESS
+ if (!QIndex) {
+ pbi->common.rtcd.idct.idct1 = vp9_short_inv_walsh4x4_1_x8_c;
+ pbi->common.rtcd.idct.idct16 = vp9_short_inv_walsh4x4_x8_c;
+ pbi->common.rtcd.idct.idct1_scalar_add = vp9_dc_only_inv_walsh_add_c;
+ pbi->common.rtcd.idct.iwalsh1 = vp9_short_inv_walsh4x4_1_lossless_c;
+ pbi->common.rtcd.idct.iwalsh16 = vp9_short_inv_walsh4x4_lossless_c;
+ pbi->idct_add = vp9_dequant_idct_add_lossless_c;
+ pbi->dc_idct_add = vp9_dequant_dc_idct_add_lossless_c;
+ pbi->dc_idct_add_y_block = vp9_dequant_dc_idct_add_y_block_lossless_c;
+ pbi->idct_add_y_block = vp9_dequant_idct_add_y_block_lossless_c;
+ pbi->idct_add_uv_block = vp9_dequant_idct_add_uv_block_lossless_c;
+ } else {
+ pbi->common.rtcd.idct.idct1 = vp9_short_idct4x4llm_1_c;
+ pbi->common.rtcd.idct.idct16 = vp9_short_idct4x4llm_c;
+ pbi->common.rtcd.idct.idct1_scalar_add = vp9_dc_only_idct_add_c;
+ pbi->common.rtcd.idct.iwalsh1 = vp9_short_inv_walsh4x4_1_c;
+ pbi->common.rtcd.idct.iwalsh16 = vp9_short_inv_walsh4x4_c;
+ pbi->idct_add = vp9_dequant_idct_add;
+ pbi->dc_idct_add = vp9_dequant_dc_idct_add;
+ pbi->dc_idct_add_y_block = vp9_dequant_dc_idct_add_y_block;
+ pbi->idct_add_y_block = vp9_dequant_idct_add_y_block;
+ pbi->idct_add_uv_block = vp9_dequant_idct_add_uv_block;
+ }
+#else
+ pbi->idct_add = vp9_dequant_idct_add;
+ pbi->dc_idct_add = vp9_dequant_dc_idct_add;
+ pbi->dc_idct_add_y_block = vp9_dequant_dc_idct_add_y_block;
+ pbi->idct_add_y_block = vp9_dequant_idct_add_y_block;
+ pbi->idct_add_uv_block = vp9_dequant_idct_add_uv_block;
+#endif
+
+ for (i = 16; i < 24; i++) {
+ xd->block[i].dequant = pc->UVdequant[QIndex];
+ }
+
+ xd->block[24].dequant = pc->Y2dequant[QIndex];
+
+}
+
+#if CONFIG_RUNTIME_CPU_DETECT
+#define RTCD_VTABLE(x) (&(pbi)->common.rtcd.x)
+#else
+#define RTCD_VTABLE(x) NULL
+#endif
+
+/* skip_recon_mb() writes the prediction directly to the dst buffer instead of
+ * writing it to the predictor buffer and then copying, which eliminates an
+ * unnecessary copy.
+ */
+static void skip_recon_mb(VP9D_COMP *pbi, MACROBLOCKD *xd) {
+ if (xd->mode_info_context->mbmi.ref_frame == INTRA_FRAME) {
+#if CONFIG_SUPERBLOCKS
+ if (xd->mode_info_context->mbmi.encoded_as_sb) {
+ vp9_build_intra_predictors_sbuv_s(xd);
+ vp9_build_intra_predictors_sby_s(xd);
+ } else {
+#endif
+ vp9_build_intra_predictors_mbuv_s(xd);
+ vp9_build_intra_predictors_mby_s(xd);
+#if CONFIG_SUPERBLOCKS
+ }
+#endif
+ } else {
+#if CONFIG_SUPERBLOCKS
+ if (xd->mode_info_context->mbmi.encoded_as_sb) {
+ vp9_build_inter32x32_predictors_sb(xd, xd->dst.y_buffer,
+ xd->dst.u_buffer, xd->dst.v_buffer,
+ xd->dst.y_stride, xd->dst.uv_stride);
+ } else {
+#endif
+ vp9_build_1st_inter16x16_predictors_mb(xd, xd->dst.y_buffer,
+ xd->dst.u_buffer, xd->dst.v_buffer,
+ xd->dst.y_stride, xd->dst.uv_stride);
+
+ if (xd->mode_info_context->mbmi.second_ref_frame) {
+ vp9_build_2nd_inter16x16_predictors_mb(xd, xd->dst.y_buffer,
+ xd->dst.u_buffer, xd->dst.v_buffer,
+ xd->dst.y_stride, xd->dst.uv_stride);
+ }
+#if CONFIG_SUPERBLOCKS
+ }
+#endif
+ }
+}
+
+static void decode_macroblock(VP9D_COMP *pbi, MACROBLOCKD *xd,
+ int mb_row, unsigned int mb_col,
+ BOOL_DECODER* const bc) {
+ int eobtotal = 0;
+ MB_PREDICTION_MODE mode;
+ int i;
+ int tx_size;
+ TX_TYPE tx_type;
+ VP9_COMMON *pc = &pbi->common;
+#if CONFIG_SUPERBLOCKS
+ int orig_skip_flag = xd->mode_info_context->mbmi.mb_skip_coeff;
+#endif
+
+ // re-initialize macroblock dequantizer before detokenization
+ if (xd->segmentation_enabled)
+ mb_init_dequantizer(pbi, xd);
+
+ tx_size = xd->mode_info_context->mbmi.txfm_size;
+ mode = xd->mode_info_context->mbmi.mode;
+
+ if (xd->mode_info_context->mbmi.mb_skip_coeff) {
+ vp9_reset_mb_tokens_context(xd);
+#if CONFIG_SUPERBLOCKS
+ if (xd->mode_info_context->mbmi.encoded_as_sb &&
+ (mb_col < pc->mb_cols - 1 || mb_row < pc->mb_rows - 1)) {
+ if (mb_col < pc->mb_cols - 1)
+ xd->above_context++;
+ if (mb_row < pc->mb_rows - 1)
+ xd->left_context++;
+ vp9_reset_mb_tokens_context(xd);
+ if (mb_col < pc->mb_cols - 1)
+ xd->above_context--;
+ if (mb_row < pc->mb_rows - 1)
+ xd->left_context--;
+ }
+#endif
+ } else if (!bool_error(bc)) {
+ for (i = 0; i < 25; i++) {
+ xd->block[i].eob = 0;
+ xd->eobs[i] = 0;
+ }
+ if (tx_size == TX_16X16) {
+ eobtotal = vp9_decode_mb_tokens_16x16(pbi, xd, bc);
+ } else if (tx_size == TX_8X8) {
+ eobtotal = vp9_decode_mb_tokens_8x8(pbi, xd, bc);
+ } else {
+ eobtotal = vp9_decode_mb_tokens(pbi, xd, bc);
+ }
+ }
+
+ //mode = xd->mode_info_context->mbmi.mode;
+ if (pbi->common.frame_type != KEY_FRAME)
+ vp9_setup_interp_filters(xd, xd->mode_info_context->mbmi.interp_filter,
+ &pbi->common);
+
+ if (eobtotal == 0 && mode != B_PRED && mode != SPLITMV
+ && mode != I8X8_PRED
+ && !bool_error(bc)) {
+    /* Special case: force the loopfilter to skip when both eobtotal and
+     * mb_skip_coeff are zero.
+     */
+ xd->mode_info_context->mbmi.mb_skip_coeff = 1;
+
+#if CONFIG_SUPERBLOCKS
+ if (!xd->mode_info_context->mbmi.encoded_as_sb || orig_skip_flag)
+#endif
+ {
+ skip_recon_mb(pbi, xd);
+ return;
+ }
+ }
+
+ // moved to be performed before detokenization
+// if (xd->segmentation_enabled)
+// mb_init_dequantizer(pbi, xd);
+
+ /* do prediction */
+ if (xd->mode_info_context->mbmi.ref_frame == INTRA_FRAME) {
+#if CONFIG_SUPERBLOCKS
+ if (xd->mode_info_context->mbmi.encoded_as_sb) {
+ vp9_build_intra_predictors_sby_s(xd);
+ vp9_build_intra_predictors_sbuv_s(xd);
+ } else
+#endif
+ if (mode != I8X8_PRED) {
+ vp9_build_intra_predictors_mbuv(xd);
+ if (mode != B_PRED) {
+ vp9_build_intra_predictors_mby(xd);
+ }
+ }
+ } else {
+#if CONFIG_SUPERBLOCKS
+ if (xd->mode_info_context->mbmi.encoded_as_sb) {
+ vp9_build_inter32x32_predictors_sb(xd, xd->dst.y_buffer,
+ xd->dst.u_buffer, xd->dst.v_buffer,
+ xd->dst.y_stride, xd->dst.uv_stride);
+ } else
+#endif
+ vp9_build_inter_predictors_mb(xd);
+ }
+
+ /* dequantization and idct */
+ if (mode == I8X8_PRED) {
+ for (i = 0; i < 4; i++) {
+ int ib = vp9_i8x8_block[i];
+ const int iblock[4] = {0, 1, 4, 5};
+ int j;
+ int i8x8mode;
+ BLOCKD *b;
+
+ int idx = (ib & 0x02) ? (ib + 2) : ib;
+
+ short *q = xd->block[idx].qcoeff;
+ short *dq = xd->block[0].dequant;
+ unsigned char *pre = xd->block[ib].predictor;
+ unsigned char *dst = *(xd->block[ib].base_dst) + xd->block[ib].dst;
+ int stride = xd->dst.y_stride;
+
+ b = &xd->block[ib];
+ i8x8mode = b->bmi.as_mode.first;
+ vp9_intra8x8_predict(b, i8x8mode, b->predictor);
+
+ if (xd->mode_info_context->mbmi.txfm_size == TX_8X8) {
+ tx_type = get_tx_type(xd, &xd->block[idx]);
+ if (tx_type != DCT_DCT) {
+ vp9_ht_dequant_idct_add_8x8_c(tx_type,
+ q, dq, pre, dst, 16, stride);
+ } else {
+ vp9_dequant_idct_add_8x8_c(q, dq, pre, dst, 16, stride);
+ }
+ q += 64;
+ } else {
+ for (j = 0; j < 4; j++) {
+ b = &xd->block[ib + iblock[j]];
+ vp9_dequant_idct_add(b->qcoeff, b->dequant, b->predictor,
+ *(b->base_dst) + b->dst, 16, b->dst_stride);
+ }
+ }
+ b = &xd->block[16 + i];
+ vp9_intra_uv4x4_predict(b, i8x8mode, b->predictor);
+ pbi->idct_add(b->qcoeff, b->dequant, b->predictor,
+ *(b->base_dst) + b->dst, 8, b->dst_stride);
+ b = &xd->block[20 + i];
+ vp9_intra_uv4x4_predict(b, i8x8mode, b->predictor);
+ pbi->idct_add(b->qcoeff, b->dequant, b->predictor,
+ *(b->base_dst) + b->dst, 8, b->dst_stride);
+ }
+ } else if (mode == B_PRED) {
+ for (i = 0; i < 16; i++) {
+ BLOCKD *b = &xd->block[i];
+ int b_mode = xd->mode_info_context->bmi[i].as_mode.first;
+#if CONFIG_COMP_INTRA_PRED
+ int b_mode2 = xd->mode_info_context->bmi[i].as_mode.second;
+
+ if (b_mode2 == (B_PREDICTION_MODE)(B_DC_PRED - 1)) {
+#endif
+ vp9_intra4x4_predict(b, b_mode, b->predictor);
+#if CONFIG_COMP_INTRA_PRED
+ } else {
+ vp9_comp_intra4x4_predict(b, b_mode, b_mode2, b->predictor);
+ }
+#endif
+
+ tx_type = get_tx_type(xd, b);
+ if (tx_type != DCT_DCT) {
+ vp9_ht_dequant_idct_add_c(tx_type, b->qcoeff,
+ b->dequant, b->predictor,
+ *(b->base_dst) + b->dst, 16, b->dst_stride);
+ } else {
+ vp9_dequant_idct_add(b->qcoeff, b->dequant, b->predictor,
+ *(b->base_dst) + b->dst, 16, b->dst_stride);
+ }
+ }
+ } else if (mode == SPLITMV) {
+ if (tx_size == TX_8X8) {
+ vp9_dequant_idct_add_y_block_8x8(xd->qcoeff, xd->block[0].dequant,
+ xd->predictor, xd->dst.y_buffer,
+ xd->dst.y_stride, xd->eobs, xd);
+ } else {
+ pbi->idct_add_y_block(xd->qcoeff, xd->block[0].dequant,
+ xd->predictor, xd->dst.y_buffer,
+ xd->dst.y_stride, xd->eobs);
+ }
+ } else {
+ BLOCKD *b = &xd->block[24];
+
+ if (tx_size == TX_16X16) {
+ BLOCKD *bd = &xd->block[0];
+ tx_type = get_tx_type(xd, bd);
+ if (tx_type != DCT_DCT) {
+ vp9_ht_dequant_idct_add_16x16_c(tx_type, xd->qcoeff,
+ xd->block[0].dequant, xd->predictor,
+ xd->dst.y_buffer, 16, xd->dst.y_stride);
+ } else {
+ vp9_dequant_idct_add_16x16(xd->qcoeff, xd->block[0].dequant,
+ xd->predictor, xd->dst.y_buffer,
+ 16, xd->dst.y_stride);
+ }
+ } else if (tx_size == TX_8X8) {
+#if CONFIG_SUPERBLOCKS
+ void *orig = xd->mode_info_context;
+ int n, num = xd->mode_info_context->mbmi.encoded_as_sb ? 4 : 1;
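+      // For a superblock the 8x8 reconstruction below is repeated for each of
+      // the four constituent MBs (num == 4); otherwise only the current MB is
+      // processed (num == 1).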
+ for (n = 0; n < num; n++) {
+ int x_idx = n & 1, y_idx = n >> 1;
+ if (num == 4 && (mb_col + x_idx >= pc->mb_cols ||
+ mb_row + y_idx >= pc->mb_rows))
+ continue;
+
+ if (n != 0) {
+ for (i = 0; i < 25; i++) {
+ xd->block[i].eob = 0;
+ xd->eobs[i] = 0;
+ }
+ xd->above_context = pc->above_context + mb_col + (n & 1);
+ xd->left_context = pc->left_context + (n >> 1);
+ xd->mode_info_context = orig;
+ xd->mode_info_context += (n & 1);
+ xd->mode_info_context += (n >> 1) * pc->mode_info_stride;
+ if (!orig_skip_flag) {
+ eobtotal = vp9_decode_mb_tokens_8x8(pbi, xd, bc);
+ if (eobtotal == 0) // skip loopfilter
+ xd->mode_info_context->mbmi.mb_skip_coeff = 1;
+ } else {
+ vp9_reset_mb_tokens_context(xd);
+ }
+ }
+
+ if (xd->mode_info_context->mbmi.mb_skip_coeff)
+ continue; // only happens for SBs, which are already in dest buffer
+#endif
+ vp9_dequantize_b_2x2(b);
+ IDCT_INVOKE(RTCD_VTABLE(idct), ihaar2)(&b->dqcoeff[0], b->diff, 8);
+      ((int *)b->qcoeff)[0] = 0;  // 2nd order block coefficients are set to 0 after the inverse transform
+ ((int *)b->qcoeff)[1] = 0;
+ ((int *)b->qcoeff)[2] = 0;
+ ((int *)b->qcoeff)[3] = 0;
+ ((int *)b->qcoeff)[4] = 0;
+ ((int *)b->qcoeff)[5] = 0;
+ ((int *)b->qcoeff)[6] = 0;
+ ((int *)b->qcoeff)[7] = 0;
+#if CONFIG_SUPERBLOCKS
+ if (xd->mode_info_context->mbmi.encoded_as_sb) {
+ vp9_dequant_dc_idct_add_y_block_8x8_inplace_c(xd->qcoeff,
+ xd->block[0].dequant,
+ xd->dst.y_buffer + (n >> 1) * 16 * xd->dst.y_stride + (n & 1) * 16,
+ xd->dst.y_stride, xd->eobs, xd->block[24].diff, xd);
+          // also reconstruct the UV planes in place
+ vp9_dequant_idct_add_uv_block_8x8_inplace_c(xd->qcoeff + 16 * 16,
+ xd->block[16].dequant,
+ xd->dst.u_buffer + (n >> 1) * 8 * xd->dst.uv_stride + (n & 1) * 8,
+ xd->dst.v_buffer + (n >> 1) * 8 * xd->dst.uv_stride + (n & 1) * 8,
+ xd->dst.uv_stride, xd->eobs + 16, xd);
+ } else
+#endif
+ vp9_dequant_dc_idct_add_y_block_8x8(xd->qcoeff,
+ xd->block[0].dequant, xd->predictor, xd->dst.y_buffer,
+ xd->dst.y_stride, xd->eobs, xd->block[24].diff, xd);
+#if CONFIG_SUPERBLOCKS
+ }
+ xd->mode_info_context = orig;
+#endif
+ } else {
+ vp9_dequantize_b(b);
+ if (xd->eobs[24] > 1) {
+ IDCT_INVOKE(RTCD_VTABLE(idct), iwalsh16)(&b->dqcoeff[0], b->diff);
+ ((int *)b->qcoeff)[0] = 0;
+ ((int *)b->qcoeff)[1] = 0;
+ ((int *)b->qcoeff)[2] = 0;
+ ((int *)b->qcoeff)[3] = 0;
+ ((int *)b->qcoeff)[4] = 0;
+ ((int *)b->qcoeff)[5] = 0;
+ ((int *)b->qcoeff)[6] = 0;
+ ((int *)b->qcoeff)[7] = 0;
+ } else {
+ IDCT_INVOKE(RTCD_VTABLE(idct), iwalsh1)(&b->dqcoeff[0], b->diff);
+ ((int *)b->qcoeff)[0] = 0;
+ }
+
+ pbi->dc_idct_add_y_block(xd->qcoeff, xd->block[0].dequant, xd->predictor,
+ xd->dst.y_buffer, xd->dst.y_stride, xd->eobs,
+ xd->block[24].diff);
+ }
+ }
+
+#if CONFIG_SUPERBLOCKS
+ if (!xd->mode_info_context->mbmi.encoded_as_sb) {
+#endif
+ if ((tx_size == TX_8X8 &&
+ xd->mode_info_context->mbmi.mode != I8X8_PRED &&
+ xd->mode_info_context->mbmi.mode != SPLITMV)
+ || tx_size == TX_16X16
+ )
+ vp9_dequant_idct_add_uv_block_8x8
+ (xd->qcoeff + 16 * 16, xd->block[16].dequant,
+ xd->predictor + 16 * 16, xd->dst.u_buffer, xd->dst.v_buffer,
+         xd->dst.uv_stride, xd->eobs + 16, xd);
+ else if (xd->mode_info_context->mbmi.mode != I8X8_PRED)
+ pbi->idct_add_uv_block(xd->qcoeff + 16 * 16, xd->block[16].dequant,
+ xd->predictor + 16 * 16, xd->dst.u_buffer, xd->dst.v_buffer,
+ xd->dst.uv_stride, xd->eobs + 16);
+#if CONFIG_SUPERBLOCKS
+ }
+#endif
+}
+
+
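+// Delta-Q values are coded as a presence bit followed by a 4-bit magnitude
+// and a sign bit.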
+static int get_delta_q(vp9_reader *bc, int prev, int *q_update) {
+ int ret_val = 0;
+
+ if (vp9_read_bit(bc)) {
+ ret_val = vp9_read_literal(bc, 4);
+
+ if (vp9_read_bit(bc))
+ ret_val = -ret_val;
+ }
+
+ /* Trigger a quantizer update if the delta-q value has changed */
+ if (ret_val != prev)
+ *q_update = 1;
+
+ return ret_val;
+}
+
+#ifdef PACKET_TESTING
+#include <stdio.h>
+FILE *vpxlog = 0;
+#endif
+
+/* Decode a row of Superblocks (2x2 region of MBs) */
+static void
+decode_sb_row(VP9D_COMP *pbi, VP9_COMMON *pc, int mbrow, MACROBLOCKD *xd,
+ BOOL_DECODER* const bc) {
+ int i;
+ int sb_col;
+ int mb_row, mb_col;
+ int recon_yoffset, recon_uvoffset;
+ int ref_fb_idx = pc->lst_fb_idx;
+ int dst_fb_idx = pc->new_fb_idx;
+ int recon_y_stride = pc->yv12_fb[ref_fb_idx].y_stride;
+ int recon_uv_stride = pc->yv12_fb[ref_fb_idx].uv_stride;
+ int row_delta[4] = { 0, +1, 0, -1};
+ int col_delta[4] = { +1, -1, +1, +1};
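+  // Applied after each MB, these deltas walk the four MBs of a superblock in
+  // the order top-left, top-right, bottom-left, bottom-right, and then step
+  // into the next superblock.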
+ int sb_cols = (pc->mb_cols + 1) >> 1;
+
+  // For a SB there are 2 left contexts, one for each MB row within the SB
+ vpx_memset(pc->left_context, 0, sizeof(pc->left_context));
+
+ mb_row = mbrow;
+ mb_col = 0;
+
+ for (sb_col = 0; sb_col < sb_cols; sb_col++) {
+ MODE_INFO *mi = xd->mode_info_context;
+
+#if CONFIG_SUPERBLOCKS
+ mi->mbmi.encoded_as_sb = vp9_read(bc, pc->sb_coded);
+#endif
+
+ // Process the 4 MBs within the SB in the order:
+ // top-left, top-right, bottom-left, bottom-right
+ for (i = 0; i < 4; i++) {
+ int dy = row_delta[i];
+ int dx = col_delta[i];
+ int offset_extended = dy * xd->mode_info_stride + dx;
+
+ xd->mb_index = i;
+
+ mi = xd->mode_info_context;
+ if ((mb_row >= pc->mb_rows) || (mb_col >= pc->mb_cols)) {
+ // MB lies outside frame, skip on to next
+ mb_row += dy;
+ mb_col += dx;
+ xd->mode_info_context += offset_extended;
+ xd->prev_mode_info_context += offset_extended;
+ continue;
+ }
+
+ // Set above context pointer
+ xd->above_context = pc->above_context + mb_col;
+ xd->left_context = pc->left_context + (i >> 1);
+
+      /* Distance of MB to the various image edges.
+       * These are specified in 1/8th pel units as they are always compared
+       * to values that are in 1/8th pel units.
+       */
+ xd->mb_to_top_edge = -((mb_row * 16)) << 3;
+ xd->mb_to_bottom_edge = ((pc->mb_rows - 1 - mb_row) * 16) << 3;
+
+ xd->mb_to_left_edge = -((mb_col * 16) << 3);
+ xd->mb_to_right_edge = ((pc->mb_cols - 1 - mb_col) * 16) << 3;
+
+ xd->up_available = (mb_row != 0);
+ xd->left_available = (mb_col != 0);
+
+
+ recon_yoffset = (mb_row * recon_y_stride * 16) + (mb_col * 16);
+ recon_uvoffset = (mb_row * recon_uv_stride * 8) + (mb_col * 8);
+
+ xd->dst.y_buffer = pc->yv12_fb[dst_fb_idx].y_buffer + recon_yoffset;
+ xd->dst.u_buffer = pc->yv12_fb[dst_fb_idx].u_buffer + recon_uvoffset;
+ xd->dst.v_buffer = pc->yv12_fb[dst_fb_idx].v_buffer + recon_uvoffset;
+
+#if CONFIG_SUPERBLOCKS
+ if (i)
+ mi->mbmi.encoded_as_sb = 0;
+#endif
+ vp9_decode_mb_mode_mv(pbi, xd, mb_row, mb_col, bc);
+
+ update_blockd_bmi(xd);
+
+ /* Select the appropriate reference frame for this MB */
+ if (xd->mode_info_context->mbmi.ref_frame == LAST_FRAME)
+ ref_fb_idx = pc->lst_fb_idx;
+ else if (xd->mode_info_context->mbmi.ref_frame == GOLDEN_FRAME)
+ ref_fb_idx = pc->gld_fb_idx;
+ else
+ ref_fb_idx = pc->alt_fb_idx;
+
+ xd->pre.y_buffer = pc->yv12_fb[ref_fb_idx].y_buffer + recon_yoffset;
+ xd->pre.u_buffer = pc->yv12_fb[ref_fb_idx].u_buffer + recon_uvoffset;
+ xd->pre.v_buffer = pc->yv12_fb[ref_fb_idx].v_buffer + recon_uvoffset;
+
+ if (xd->mode_info_context->mbmi.second_ref_frame) {
+ int second_ref_fb_idx;
+
+ /* Select the appropriate reference frame for this MB */
+ if (xd->mode_info_context->mbmi.second_ref_frame == LAST_FRAME)
+ second_ref_fb_idx = pc->lst_fb_idx;
+ else if (xd->mode_info_context->mbmi.second_ref_frame ==
+ GOLDEN_FRAME)
+ second_ref_fb_idx = pc->gld_fb_idx;
+ else
+ second_ref_fb_idx = pc->alt_fb_idx;
+
+ xd->second_pre.y_buffer =
+ pc->yv12_fb[second_ref_fb_idx].y_buffer + recon_yoffset;
+ xd->second_pre.u_buffer =
+ pc->yv12_fb[second_ref_fb_idx].u_buffer + recon_uvoffset;
+ xd->second_pre.v_buffer =
+ pc->yv12_fb[second_ref_fb_idx].v_buffer + recon_uvoffset;
+ }
+
+ if (xd->mode_info_context->mbmi.ref_frame != INTRA_FRAME) {
+ /* propagate errors from reference frames */
+ xd->corrupted |= pc->yv12_fb[ref_fb_idx].corrupted;
+ }
+
+#if CONFIG_SUPERBLOCKS
+ if (xd->mode_info_context->mbmi.encoded_as_sb) {
+ if (mb_col < pc->mb_cols - 1)
+ mi[1] = mi[0];
+ if (mb_row < pc->mb_rows - 1) {
+ mi[pc->mode_info_stride] = mi[0];
+ if (mb_col < pc->mb_cols - 1)
+ mi[pc->mode_info_stride + 1] = mi[0];
+ }
+ }
+#endif
+ vp9_intra_prediction_down_copy(xd);
+ decode_macroblock(pbi, xd, mb_row, mb_col, bc);
+
+ /* check if the boolean decoder has suffered an error */
+ xd->corrupted |= bool_error(bc);
+
+#if CONFIG_SUPERBLOCKS
+ if (mi->mbmi.encoded_as_sb) {
+ assert(!i);
+ mb_col += 2;
+ xd->mode_info_context += 2;
+ xd->prev_mode_info_context += 2;
+ break;
+ }
+#endif
+
+ // skip to next MB
+ xd->mode_info_context += offset_extended;
+ xd->prev_mode_info_context += offset_extended;
+ mb_row += dy;
+ mb_col += dx;
+ }
+ }
+
+ /* skip prediction column */
+ xd->mode_info_context += 1 - (pc->mb_cols & 0x1) + xd->mode_info_stride;
+ xd->prev_mode_info_context += 1 - (pc->mb_cols & 0x1) + xd->mode_info_stride;
+}
+
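+// Partition sizes are stored as 24-bit little-endian values.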
+static unsigned int read_partition_size(const unsigned char *cx_size) {
+ const unsigned int size =
+ cx_size[0] + (cx_size[1] << 8) + (cx_size[2] << 16);
+ return size;
+}
+
+static int read_is_valid(const unsigned char *start,
+ size_t len,
+ const unsigned char *end) {
+ return (start + len > start && start + len <= end);
+}
+
+
+static void setup_token_decoder(VP9D_COMP *pbi,
+ const unsigned char *cx_data,
+ BOOL_DECODER* const bool_decoder) {
+ VP9_COMMON *pc = &pbi->common;
+ const unsigned char *user_data_end = pbi->Source + pbi->source_sz;
+ const unsigned char *partition;
+
+ ptrdiff_t partition_size;
+ ptrdiff_t bytes_left;
+
+ // Set up pointers to token partition
+ partition = cx_data;
+ bytes_left = user_data_end - partition;
+ partition_size = bytes_left;
+
+ /* Validate the calculated partition length. If the buffer
+ * described by the partition can't be fully read, then restrict
+ * it to the portion that can be (for EC mode) or throw an error.
+ */
+ if (!read_is_valid(partition, partition_size, user_data_end)) {
+ vpx_internal_error(&pc->error, VPX_CODEC_CORRUPT_FRAME,
+ "Truncated packet or corrupt partition "
+ "%d length", 1);
+ }
+
+ if (vp9_start_decode(bool_decoder, partition, partition_size))
+ vpx_internal_error(&pc->error, VPX_CODEC_MEM_ERROR,
+ "Failed to allocate bool decoder %d", 1);
+}
+
+static void init_frame(VP9D_COMP *pbi) {
+ VP9_COMMON *const pc = &pbi->common;
+ MACROBLOCKD *const xd = &pbi->mb;
+
+ if (pc->frame_type == KEY_FRAME) {
+ /* Various keyframe initializations */
+ vp9_init_mv_probs(pc);
+
+ vp9_init_mbmode_probs(pc);
+ vp9_default_bmode_probs(pc->fc.bmode_prob);
+
+ vp9_default_coef_probs(pc);
+ vp9_kf_default_bmode_probs(pc->kf_bmode_prob);
+
+ // Reset the segment feature data to the default stats:
+ // Features disabled, 0, with delta coding (Default state).
+ vp9_clearall_segfeatures(xd);
+
+ xd->mb_segment_abs_delta = SEGMENT_DELTADATA;
+
+    /* reset the mode and ref deltas for the loop filter */
+ vpx_memset(xd->ref_lf_deltas, 0, sizeof(xd->ref_lf_deltas));
+ vpx_memset(xd->mode_lf_deltas, 0, sizeof(xd->mode_lf_deltas));
+
+ /* All buffers are implicitly updated on key frames. */
+ pc->refresh_golden_frame = 1;
+ pc->refresh_alt_ref_frame = 1;
+ pc->copy_buffer_to_gf = 0;
+ pc->copy_buffer_to_arf = 0;
+
+ /* Note that Golden and Altref modes cannot be used on a key frame so
+ * ref_frame_sign_bias[] is undefined and meaningless
+ */
+ pc->ref_frame_sign_bias[GOLDEN_FRAME] = 0;
+ pc->ref_frame_sign_bias[ALTREF_FRAME] = 0;
+
+ vp9_init_mode_contexts(&pbi->common);
+ vpx_memcpy(&pc->lfc, &pc->fc, sizeof(pc->fc));
+ vpx_memcpy(&pc->lfc_a, &pc->fc, sizeof(pc->fc));
+
+ vpx_memcpy(pbi->common.fc.vp8_mode_contexts,
+ pbi->common.fc.mode_context,
+ sizeof(pbi->common.fc.mode_context));
+ vpx_memset(pc->prev_mip, 0,
+ (pc->mb_cols + 1) * (pc->mb_rows + 1)* sizeof(MODE_INFO));
+ vpx_memset(pc->mip, 0,
+ (pc->mb_cols + 1) * (pc->mb_rows + 1)* sizeof(MODE_INFO));
+
+ vp9_update_mode_info_border(pc, pc->mip);
+ vp9_update_mode_info_in_image(pc, pc->mi);
+
+ } else {
+
+ if (!pc->use_bilinear_mc_filter)
+ pc->mcomp_filter_type = EIGHTTAP;
+ else
+ pc->mcomp_filter_type = BILINEAR;
+
+ /* To enable choice of different interpolation filters */
+ vp9_setup_interp_filters(xd, pc->mcomp_filter_type, pc);
+ }
+
+ xd->mode_info_context = pc->mi;
+ xd->prev_mode_info_context = pc->prev_mi;
+ xd->frame_type = pc->frame_type;
+ xd->mode_info_context->mbmi.mode = DC_PRED;
+ xd->mode_info_stride = pc->mode_info_stride;
+ xd->corrupted = 0; /* init without corruption */
+
+ xd->fullpixel_mask = 0xffffffff;
+ if (pc->full_pixel)
+ xd->fullpixel_mask = 0xfffffff8;
+
+}
+
+#if 0
+static void read_coef_probs2(VP9D_COMP *pbi) {
+ const vp9_prob grpupd = 192;
+ int i, j, k, l;
+ vp9_reader *const bc = &pbi->bc;
+ VP9_COMMON *const pc = &pbi->common;
+ for (l = 0; l < ENTROPY_NODES; l++) {
+ if (vp9_read(bc, grpupd)) {
+ // printf("Decoding %d\n", l);
+ for (i = 0; i < BLOCK_TYPES; i++)
+ for (j = !i; j < COEF_BANDS; j++)
+ for (k = 0; k < PREV_COEF_CONTEXTS; k++) {
+ if (k >= 3 && ((i == 0 && j == 1) ||
+ (i > 0 && j == 0)))
+ continue;
+ {
+ vp9_prob *const p = pc->fc.coef_probs [i][j][k] + l;
+ int u = vp9_read(bc, COEF_UPDATE_PROB);
+ if (u) *p = read_prob_diff_update(bc, *p);
+ }
+ }
+ }
+ }
+ if (pbi->common.txfm_mode == ALLOW_8X8) {
+ for (l = 0; l < ENTROPY_NODES; l++) {
+ if (vp9_read(bc, grpupd)) {
+ for (i = 0; i < BLOCK_TYPES_8X8; i++)
+ for (j = !i; j < COEF_BANDS; j++)
+ for (k = 0; k < PREV_COEF_CONTEXTS; k++) {
+ if (k >= 3 && ((i == 0 && j == 1) ||
+ (i > 0 && j == 0)))
+ continue;
+ {
+ vp9_prob *const p = pc->fc.coef_probs_8x8 [i][j][k] + l;
+
+ int u = vp9_read(bc, COEF_UPDATE_PROB_8X8);
+ if (u) *p = read_prob_diff_update(bc, *p);
+ }
+ }
+ }
+ }
+ }
+}
+#endif
+
+static void read_coef_probs_common(
+ BOOL_DECODER* const bc,
+ vp9_prob coef_probs[BLOCK_TYPES][COEF_BANDS]
+ [PREV_COEF_CONTEXTS][ENTROPY_NODES]) {
+ int i, j, k, l;
+
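+  // A single bit gates the whole update; when set, each node probability may
+  // be individually replaced via a differentially-coded update.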
+ if (vp9_read_bit(bc)) {
+ for (i = 0; i < BLOCK_TYPES; i++) {
+ for (j = !i; j < COEF_BANDS; j++) {
+ /* NB: This j loop starts from 1 on block type i == 0 */
+ for (k = 0; k < PREV_COEF_CONTEXTS; k++) {
+ if (k >= 3 && ((i == 0 && j == 1) ||
+ (i > 0 && j == 0)))
+ continue;
+ for (l = 0; l < ENTROPY_NODES; l++) {
+ vp9_prob *const p = coef_probs[i][j][k] + l;
+
+ if (vp9_read(bc, COEF_UPDATE_PROB)) {
+ *p = read_prob_diff_update(bc, *p);
+ }
+ }
+ }
+ }
+ }
+ }
+}
+
+static void read_coef_probs(VP9D_COMP *pbi, BOOL_DECODER* const bc) {
+ VP9_COMMON *const pc = &pbi->common;
+
+ read_coef_probs_common(bc, pc->fc.coef_probs);
+ read_coef_probs_common(bc, pc->fc.hybrid_coef_probs);
+
+ if (pbi->common.txfm_mode != ONLY_4X4) {
+ read_coef_probs_common(bc, pc->fc.coef_probs_8x8);
+ read_coef_probs_common(bc, pc->fc.hybrid_coef_probs_8x8);
+ }
+ if (pbi->common.txfm_mode > ALLOW_8X8) {
+ read_coef_probs_common(bc, pc->fc.coef_probs_16x16);
+ read_coef_probs_common(bc, pc->fc.hybrid_coef_probs_16x16);
+ }
+}
+
+int vp9_decode_frame(VP9D_COMP *pbi) {
+ BOOL_DECODER header_bc, residual_bc;
+ VP9_COMMON *const pc = &pbi->common;
+ MACROBLOCKD *const xd = &pbi->mb;
+ const unsigned char *data = (const unsigned char *)pbi->Source;
+ const unsigned char *data_end = data + pbi->source_sz;
+ ptrdiff_t first_partition_length_in_bytes = 0;
+
+ int mb_row;
+ int i, j;
+ int corrupt_tokens = 0;
+
+ /* start with no corruption of current frame */
+ xd->corrupted = 0;
+ pc->yv12_fb[pc->new_fb_idx].corrupted = 0;
+
+ if (data_end - data < 3) {
+ vpx_internal_error(&pc->error, VPX_CODEC_CORRUPT_FRAME,
+ "Truncated packet");
+ } else {
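+    // The first three bytes form the uncompressed frame header: bit 0 is the
+    // frame type, bits 1-3 the version, bit 4 the show_frame flag, and the
+    // remaining 19 bits the size of the first (compressed header) partition.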
+ pc->last_frame_type = pc->frame_type;
+ pc->frame_type = (FRAME_TYPE)(data[0] & 1);
+ pc->version = (data[0] >> 1) & 7;
+ pc->show_frame = (data[0] >> 4) & 1;
+ first_partition_length_in_bytes =
+ (data[0] | (data[1] << 8) | (data[2] << 16)) >> 5;
+
+ if ((data + first_partition_length_in_bytes > data_end
+ || data + first_partition_length_in_bytes < data))
+ vpx_internal_error(&pc->error, VPX_CODEC_CORRUPT_FRAME,
+ "Truncated packet or corrupt partition 0 length");
+
+ data += 3;
+
+ vp9_setup_version(pc);
+
+ if (pc->frame_type == KEY_FRAME) {
+ const int Width = pc->Width;
+ const int Height = pc->Height;
+
+ /* vet via sync code */
+ /* When error concealment is enabled we should only check the sync
+ * code if we have enough bits available
+ */
+ if (data + 3 < data_end) {
+ if (data[0] != 0x9d || data[1] != 0x01 || data[2] != 0x2a)
+ vpx_internal_error(&pc->error, VPX_CODEC_UNSUP_BITSTREAM,
+ "Invalid frame sync code");
+ }
+
+ /* If error concealment is enabled we should only parse the new size
+ * if we have enough data. Otherwise we will end up with the wrong
+ * size.
+ */
+ if (data + 6 < data_end) {
+ pc->Width = (data[3] | (data[4] << 8)) & 0x3fff;
+ pc->horiz_scale = data[4] >> 6;
+ pc->Height = (data[5] | (data[6] << 8)) & 0x3fff;
+ pc->vert_scale = data[6] >> 6;
+ }
+ data += 7;
+
+ if (Width != pc->Width || Height != pc->Height) {
+ if (pc->Width <= 0) {
+ pc->Width = Width;
+ vpx_internal_error(&pc->error, VPX_CODEC_CORRUPT_FRAME,
+ "Invalid frame width");
+ }
+
+ if (pc->Height <= 0) {
+ pc->Height = Height;
+ vpx_internal_error(&pc->error, VPX_CODEC_CORRUPT_FRAME,
+ "Invalid frame height");
+ }
+
+ if (vp9_alloc_frame_buffers(pc, pc->Width, pc->Height))
+ vpx_internal_error(&pc->error, VPX_CODEC_MEM_ERROR,
+ "Failed to allocate frame buffers");
+ }
+ }
+ }
+
+ if ((!pbi->decoded_key_frame && pc->frame_type != KEY_FRAME) ||
+ pc->Width == 0 || pc->Height == 0) {
+ return -1;
+ }
+
+ init_frame(pbi);
+
+ if (vp9_start_decode(&header_bc, data, first_partition_length_in_bytes))
+ vpx_internal_error(&pc->error, VPX_CODEC_MEM_ERROR,
+ "Failed to allocate bool decoder 0");
+ if (pc->frame_type == KEY_FRAME) {
+ pc->clr_type = (YUV_TYPE)vp9_read_bit(&header_bc);
+ pc->clamp_type = (CLAMP_TYPE)vp9_read_bit(&header_bc);
+ }
+
+ /* Is segmentation enabled */
+ xd->segmentation_enabled = (unsigned char)vp9_read_bit(&header_bc);
+
+ if (xd->segmentation_enabled) {
+ // Read whether or not the segmentation map is being explicitly
+ // updated this frame.
+ xd->update_mb_segmentation_map = (unsigned char)vp9_read_bit(&header_bc);
+
+ // If so what method will be used.
+ if (xd->update_mb_segmentation_map) {
+ // Which macro block level features are enabled
+
+ // Read the probs used to decode the segment id for each macro
+ // block.
+ for (i = 0; i < MB_FEATURE_TREE_PROBS; i++) {
+ xd->mb_segment_tree_probs[i] = vp9_read_bit(&header_bc) ?
+ (vp9_prob)vp9_read_literal(&header_bc, 8) : 255;
+ }
+
+ // Read the prediction probs needed to decode the segment id
+ pc->temporal_update = (unsigned char)vp9_read_bit(&header_bc);
+ for (i = 0; i < PREDICTION_PROBS; i++) {
+ if (pc->temporal_update) {
+ pc->segment_pred_probs[i] = vp9_read_bit(&header_bc) ?
+ (vp9_prob)vp9_read_literal(&header_bc, 8) : 255;
+ } else {
+ pc->segment_pred_probs[i] = 255;
+ }
+ }
+ }
+ // Is the segment data being updated
+ xd->update_mb_segmentation_data = (unsigned char)vp9_read_bit(&header_bc);
+
+ if (xd->update_mb_segmentation_data) {
+ int data;
+
+ xd->mb_segment_abs_delta = (unsigned char)vp9_read_bit(&header_bc);
+
+ vp9_clearall_segfeatures(xd);
+
+      // For each segment...
+      for (i = 0; i < MAX_MB_SEGMENTS; i++) {
+        // For each of the segment's features...
+ for (j = 0; j < SEG_LVL_MAX; j++) {
+ // Is the feature enabled
+ if (vp9_read_bit(&header_bc)) {
+ // Update the feature data and mask
+ vp9_enable_segfeature(xd, i, j);
+
+ data = (signed char)vp9_read_literal(
+ &header_bc, vp9_seg_feature_data_bits(j));
+
+ // Is the segment data signed..
+ if (vp9_is_segfeature_signed(j)) {
+ if (vp9_read_bit(&header_bc))
+ data = - data;
+ }
+ } else
+ data = 0;
+
+ vp9_set_segdata(xd, i, j, data);
+ }
+ }
+ }
+ }
+
+ // Read common prediction model status flag probability updates for the
+ // reference frame
+ if (pc->frame_type == KEY_FRAME) {
+ // Set the prediction probabilities to defaults
+ pc->ref_pred_probs[0] = 120;
+ pc->ref_pred_probs[1] = 80;
+ pc->ref_pred_probs[2] = 40;
+ } else {
+ for (i = 0; i < PREDICTION_PROBS; i++) {
+ if (vp9_read_bit(&header_bc))
+ pc->ref_pred_probs[i] = (vp9_prob)vp9_read_literal(&header_bc, 8);
+ }
+ }
+
+#if CONFIG_SUPERBLOCKS
+ pc->sb_coded = vp9_read_literal(&header_bc, 8);
+#endif
+
+  /* Read the transform coding mode */
+  pc->txfm_mode = vp9_read_literal(&header_bc, 2);
+  if (pc->txfm_mode == TX_MODE_SELECT) {
+    pc->prob_tx[0] = vp9_read_literal(&header_bc, 8);
+    pc->prob_tx[1] = vp9_read_literal(&header_bc, 8);
+  }
+
+  /* Read the loop filter level and type */
+  pc->filter_type = (LOOPFILTERTYPE) vp9_read_bit(&header_bc);
+  pc->filter_level = vp9_read_literal(&header_bc, 6);
+  pc->sharpness_level = vp9_read_literal(&header_bc, 3);
+
+ /* Read in loop filter deltas applied at the MB level based on mode or ref frame. */
+ xd->mode_ref_lf_delta_update = 0;
+ xd->mode_ref_lf_delta_enabled = (unsigned char)vp9_read_bit(&header_bc);
+
+ if (xd->mode_ref_lf_delta_enabled) {
+ /* Do the deltas need to be updated */
+ xd->mode_ref_lf_delta_update = (unsigned char)vp9_read_bit(&header_bc);
+
+ if (xd->mode_ref_lf_delta_update) {
+ /* Send update */
+ for (i = 0; i < MAX_REF_LF_DELTAS; i++) {
+ if (vp9_read_bit(&header_bc)) {
+ /*sign = vp9_read_bit( &header_bc );*/
+ xd->ref_lf_deltas[i] = (signed char)vp9_read_literal(&header_bc, 6);
+
+ if (vp9_read_bit(&header_bc)) /* Apply sign */
+ xd->ref_lf_deltas[i] = xd->ref_lf_deltas[i] * -1;
+ }
+ }
+
+ /* Send update */
+ for (i = 0; i < MAX_MODE_LF_DELTAS; i++) {
+ if (vp9_read_bit(&header_bc)) {
+ /*sign = vp9_read_bit( &header_bc );*/
+ xd->mode_lf_deltas[i] = (signed char)vp9_read_literal(&header_bc, 6);
+
+ if (vp9_read_bit(&header_bc)) /* Apply sign */
+ xd->mode_lf_deltas[i] = xd->mode_lf_deltas[i] * -1;
+ }
+ }
+ }
+ }
+
+ // Dummy read for now
+ vp9_read_literal(&header_bc, 2);
+
+ setup_token_decoder(pbi, data + first_partition_length_in_bytes,
+ &residual_bc);
+
+ /* Read the default quantizers. */
+ {
+ int Q, q_update;
+
+ Q = vp9_read_literal(&header_bc, QINDEX_BITS);
+ pc->base_qindex = Q;
+ q_update = 0;
+ /* AC 1st order Q = default */
+ pc->y1dc_delta_q = get_delta_q(&header_bc, pc->y1dc_delta_q, &q_update);
+ pc->y2dc_delta_q = get_delta_q(&header_bc, pc->y2dc_delta_q, &q_update);
+ pc->y2ac_delta_q = get_delta_q(&header_bc, pc->y2ac_delta_q, &q_update);
+ pc->uvdc_delta_q = get_delta_q(&header_bc, pc->uvdc_delta_q, &q_update);
+ pc->uvac_delta_q = get_delta_q(&header_bc, pc->uvac_delta_q, &q_update);
+
+ if (q_update)
+ vp9_init_de_quantizer(pbi);
+
+ /* MB level dequantizer setup */
+ mb_init_dequantizer(pbi, &pbi->mb);
+ }
+
+ /* Determine if the golden frame or ARF buffer should be updated and how.
+ * For all non key frames the GF and ARF refresh flags and sign bias
+ * flags must be set explicitly.
+ */
+ if (pc->frame_type != KEY_FRAME) {
+ /* Should the GF or ARF be updated from the current frame */
+ pc->refresh_golden_frame = vp9_read_bit(&header_bc);
+ pc->refresh_alt_ref_frame = vp9_read_bit(&header_bc);
+
+ if (pc->refresh_alt_ref_frame) {
+ vpx_memcpy(&pc->fc, &pc->lfc_a, sizeof(pc->fc));
+ vpx_memcpy(pc->fc.vp8_mode_contexts,
+ pc->fc.mode_context_a,
+ sizeof(pc->fc.vp8_mode_contexts));
+ } else {
+ vpx_memcpy(&pc->fc, &pc->lfc, sizeof(pc->fc));
+ vpx_memcpy(pc->fc.vp8_mode_contexts,
+ pc->fc.mode_context,
+ sizeof(pc->fc.vp8_mode_contexts));
+ }
+
+ /* Buffer to buffer copy flags. */
+ pc->copy_buffer_to_gf = 0;
+
+ if (!pc->refresh_golden_frame)
+ pc->copy_buffer_to_gf = vp9_read_literal(&header_bc, 2);
+
+ pc->copy_buffer_to_arf = 0;
+
+ if (!pc->refresh_alt_ref_frame)
+ pc->copy_buffer_to_arf = vp9_read_literal(&header_bc, 2);
+
+ pc->ref_frame_sign_bias[GOLDEN_FRAME] = vp9_read_bit(&header_bc);
+ pc->ref_frame_sign_bias[ALTREF_FRAME] = vp9_read_bit(&header_bc);
+
+ /* Is high precision mv allowed */
+ xd->allow_high_precision_mv = (unsigned char)vp9_read_bit(&header_bc);
+ // Read the type of subpel filter to use
+ if (vp9_read_bit(&header_bc)) {
+ pc->mcomp_filter_type = SWITCHABLE;
+ } else {
+ pc->mcomp_filter_type = vp9_read_literal(&header_bc, 2);
+ }
+    /* To enable choice of different interpolation filters */
+ vp9_setup_interp_filters(xd, pc->mcomp_filter_type, pc);
+ }
+
+ pc->refresh_entropy_probs = vp9_read_bit(&header_bc);
+ if (pc->refresh_entropy_probs == 0) {
+ vpx_memcpy(&pc->lfc, &pc->fc, sizeof(pc->fc));
+ }
+
+ pc->refresh_last_frame = (pc->frame_type == KEY_FRAME)
+ || vp9_read_bit(&header_bc);
+
+ if (0) {
+ FILE *z = fopen("decodestats.stt", "a");
+ fprintf(z, "%6d F:%d,G:%d,A:%d,L:%d,Q:%d\n",
+ pc->current_video_frame,
+ pc->frame_type,
+ pc->refresh_golden_frame,
+ pc->refresh_alt_ref_frame,
+ pc->refresh_last_frame,
+ pc->base_qindex);
+ fclose(z);
+ }
+
+ vp9_copy(pbi->common.fc.pre_coef_probs,
+ pbi->common.fc.coef_probs);
+ vp9_copy(pbi->common.fc.pre_hybrid_coef_probs,
+ pbi->common.fc.hybrid_coef_probs);
+ vp9_copy(pbi->common.fc.pre_coef_probs_8x8,
+ pbi->common.fc.coef_probs_8x8);
+ vp9_copy(pbi->common.fc.pre_hybrid_coef_probs_8x8,
+ pbi->common.fc.hybrid_coef_probs_8x8);
+ vp9_copy(pbi->common.fc.pre_coef_probs_16x16,
+ pbi->common.fc.coef_probs_16x16);
+ vp9_copy(pbi->common.fc.pre_hybrid_coef_probs_16x16,
+ pbi->common.fc.hybrid_coef_probs_16x16);
+ vp9_copy(pbi->common.fc.pre_ymode_prob, pbi->common.fc.ymode_prob);
+ vp9_copy(pbi->common.fc.pre_uv_mode_prob, pbi->common.fc.uv_mode_prob);
+ vp9_copy(pbi->common.fc.pre_bmode_prob, pbi->common.fc.bmode_prob);
+ vp9_copy(pbi->common.fc.pre_i8x8_mode_prob, pbi->common.fc.i8x8_mode_prob);
+ vp9_copy(pbi->common.fc.pre_sub_mv_ref_prob, pbi->common.fc.sub_mv_ref_prob);
+ vp9_copy(pbi->common.fc.pre_mbsplit_prob, pbi->common.fc.mbsplit_prob);
+ pbi->common.fc.pre_nmvc = pbi->common.fc.nmvc;
+ vp9_zero(pbi->common.fc.coef_counts);
+ vp9_zero(pbi->common.fc.hybrid_coef_counts);
+ vp9_zero(pbi->common.fc.coef_counts_8x8);
+ vp9_zero(pbi->common.fc.hybrid_coef_counts_8x8);
+ vp9_zero(pbi->common.fc.coef_counts_16x16);
+ vp9_zero(pbi->common.fc.hybrid_coef_counts_16x16);
+ vp9_zero(pbi->common.fc.ymode_counts);
+ vp9_zero(pbi->common.fc.uv_mode_counts);
+ vp9_zero(pbi->common.fc.bmode_counts);
+ vp9_zero(pbi->common.fc.i8x8_mode_counts);
+ vp9_zero(pbi->common.fc.sub_mv_ref_counts);
+ vp9_zero(pbi->common.fc.mbsplit_counts);
+ vp9_zero(pbi->common.fc.NMVcount);
+ vp9_zero(pbi->common.fc.mv_ref_ct);
+ vp9_zero(pbi->common.fc.mv_ref_ct_a);
+
+ read_coef_probs(pbi, &header_bc);
+
+ vpx_memcpy(&xd->pre, &pc->yv12_fb[pc->lst_fb_idx], sizeof(YV12_BUFFER_CONFIG));
+ vpx_memcpy(&xd->dst, &pc->yv12_fb[pc->new_fb_idx], sizeof(YV12_BUFFER_CONFIG));
+
+ // Create the segmentation map structure and set to 0
+ if (!pc->last_frame_seg_map)
+ CHECK_MEM_ERROR(pc->last_frame_seg_map,
+ vpx_calloc((pc->mb_rows * pc->mb_cols), 1));
+
+  /* set up the new frame for intra coded blocks */
+ vp9_setup_intra_recon(&pc->yv12_fb[pc->new_fb_idx]);
+
+ vp9_setup_block_dptrs(xd);
+
+ vp9_build_block_doffsets(xd);
+
+ /* clear out the coeff buffer */
+ vpx_memset(xd->qcoeff, 0, sizeof(xd->qcoeff));
+
+ /* Read the mb_no_coeff_skip flag */
+ pc->mb_no_coeff_skip = (int)vp9_read_bit(&header_bc);
+
+ vp9_decode_mode_mvs_init(pbi, &header_bc);
+
+ vpx_memset(pc->above_context, 0, sizeof(ENTROPY_CONTEXT_PLANES) * pc->mb_cols);
+
+  // Reset the macroblock mode info context to the start of the list
+ xd->mode_info_context = pc->mi;
+ xd->prev_mode_info_context = pc->prev_mi;
+
+ /* Decode a row of superblocks */
+ for (mb_row = 0; mb_row < pc->mb_rows; mb_row += 2) {
+ decode_sb_row(pbi, pc, mb_row, xd, &residual_bc);
+ }
+ corrupt_tokens |= xd->corrupted;
+
+ /* Collect information about decoder corruption. */
+ /* 1. Check first boolean decoder for errors. */
+ pc->yv12_fb[pc->new_fb_idx].corrupted = bool_error(&header_bc);
+ /* 2. Check the macroblock information */
+ pc->yv12_fb[pc->new_fb_idx].corrupted |= corrupt_tokens;
+
+ if (!pbi->decoded_key_frame) {
+ if (pc->frame_type == KEY_FRAME &&
+ !pc->yv12_fb[pc->new_fb_idx].corrupted)
+ pbi->decoded_key_frame = 1;
+ else
+ vpx_internal_error(&pbi->common.error, VPX_CODEC_CORRUPT_FRAME,
+ "A stream must start with a complete key frame");
+ }
+
+ vp9_adapt_coef_probs(pc);
+ if (pc->frame_type != KEY_FRAME) {
+ vp9_adapt_mode_probs(pc);
+ vp9_adapt_nmv_probs(pc, xd->allow_high_precision_mv);
+ vp9_update_mode_context(&pbi->common);
+ }
+
+ /* If this was a kf or Gf note the Q used */
+ if ((pc->frame_type == KEY_FRAME) ||
+ pc->refresh_golden_frame || pc->refresh_alt_ref_frame) {
+ pc->last_kf_gf_q = pc->base_qindex;
+ }
+ if (pc->refresh_entropy_probs) {
+ if (pc->refresh_alt_ref_frame)
+ vpx_memcpy(&pc->lfc_a, &pc->fc, sizeof(pc->fc));
+ else
+ vpx_memcpy(&pc->lfc, &pc->fc, sizeof(pc->fc));
+ }
+
+#ifdef PACKET_TESTING
+ {
+ FILE *f = fopen("decompressor.VP8", "ab");
+ unsigned int size = residual_bc.pos + header_bc.pos + 8;
+ fwrite((void *) &size, 4, 1, f);
+ fwrite((void *) pbi->Source, size, 1, f);
+ fclose(f);
+ }
+#endif
+ // printf("Frame %d Done\n", frame_count++);
+
+ return 0;
+}
--- /dev/null
+++ b/vp9/decoder/dequantize.c
@@ -1,0 +1,543 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "vpx_ports/config.h"
+#include "dequantize.h"
+#include "vp9/common/idct.h"
+#include "vpx_mem/vpx_mem.h"
+#include "onyxd_int.h"
+
+extern void vp9_short_idct4x4llm_c(short *input, short *output, int pitch);
+extern void vp9_short_idct4x4llm_1_c(short *input, short *output, int pitch);
+extern void vp9_short_idct8x8_c(short *input, short *output, int pitch);
+extern void vp9_short_idct8x8_1_c(short *input, short *output, int pitch);
+
+#if CONFIG_LOSSLESS
+extern void vp9_short_inv_walsh4x4_x8_c(short *input, short *output,
+ int pitch);
+extern void vp9_short_inv_walsh4x4_1_x8_c(short *input, short *output,
+ int pitch);
+#endif
+
+#ifdef DEC_DEBUG
+extern int dec_debug;
+#endif
+
+void vp9_dequantize_b_c(BLOCKD *d) {
+
+ int i;
+ short *DQ = d->dqcoeff;
+ short *Q = d->qcoeff;
+ short *DQC = d->dequant;
+
+ for (i = 0; i < 16; i++) {
+ DQ[i] = Q[i] * DQC[i];
+ }
+}
+
+
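+/* The *_dequant_idct_add_* functions below share a common pattern: dequantize
+ * the coefficients in place, run the inverse transform, add the residual to
+ * the prediction with clamping to the 8-bit range, and finally clear the
+ * coefficient buffer for the next block.
+ */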
+void vp9_ht_dequant_idct_add_c(TX_TYPE tx_type, short *input, short *dq,
+ unsigned char *pred, unsigned char *dest,
+ int pitch, int stride) {
+ short output[16];
+ short *diff_ptr = output;
+ int r, c;
+ int i;
+
+ for (i = 0; i < 16; i++) {
+ input[i] = dq[i] * input[i];
+ }
+
+ vp9_ihtllm_c(input, output, 4 << 1, tx_type, 4);
+
+ vpx_memset(input, 0, 32);
+
+ for (r = 0; r < 4; r++) {
+ for (c = 0; c < 4; c++) {
+ int a = diff_ptr[c] + pred[c];
+
+ if (a < 0)
+ a = 0;
+
+ if (a > 255)
+ a = 255;
+
+ dest[c] = (unsigned char) a;
+ }
+
+ dest += stride;
+ diff_ptr += 4;
+ pred += pitch;
+ }
+}
+
+void vp9_ht_dequant_idct_add_8x8_c(TX_TYPE tx_type, short *input, short *dq,
+ unsigned char *pred, unsigned char *dest,
+ int pitch, int stride) {
+ short output[64];
+ short *diff_ptr = output;
+ int b, r, c;
+ int i;
+ unsigned char *origdest = dest;
+ unsigned char *origpred = pred;
+
+ input[0] = dq[0] * input[0];
+ for (i = 1; i < 64; i++) {
+ input[i] = dq[1] * input[i];
+ }
+
+ vp9_ihtllm_c(input, output, 16, tx_type, 8);
+
+ vpx_memset(input, 0, 128);
+
+ for (b = 0; b < 4; b++) {
+ for (r = 0; r < 4; r++) {
+ for (c = 0; c < 4; c++) {
+ int a = diff_ptr[c] + pred[c];
+
+ if (a < 0)
+ a = 0;
+
+ if (a > 255)
+ a = 255;
+
+ dest[c] = (unsigned char) a;
+ }
+
+ dest += stride;
+ diff_ptr += 8;
+ pred += pitch;
+ }
+ // shift buffer pointers to next 4x4 block in the submacroblock
+ diff_ptr = output + (b + 1) / 2 * 4 * 8 + ((b + 1) % 2) * 4;
+ dest = origdest + (b + 1) / 2 * 4 * stride + ((b + 1) % 2) * 4;
+ pred = origpred + (b + 1) / 2 * 4 * pitch + ((b + 1) % 2) * 4;
+ }
+}
+
+void vp9_dequant_idct_add_c(short *input, short *dq, unsigned char *pred,
+ unsigned char *dest, int pitch, int stride) {
+ short output[16];
+ short *diff_ptr = output;
+ int r, c;
+ int i;
+
+ for (i = 0; i < 16; i++) {
+ input[i] = dq[i] * input[i];
+ }
+
+ /* the idct halves ( >> 1) the pitch */
+ vp9_short_idct4x4llm_c(input, output, 4 << 1);
+
+ vpx_memset(input, 0, 32);
+
+ for (r = 0; r < 4; r++) {
+ for (c = 0; c < 4; c++) {
+ int a = diff_ptr[c] + pred[c];
+
+ if (a < 0)
+ a = 0;
+
+ if (a > 255)
+ a = 255;
+
+ dest[c] = (unsigned char) a;
+ }
+
+ dest += stride;
+ diff_ptr += 4;
+ pred += pitch;
+ }
+}
+
+void vp9_dequant_dc_idct_add_c(short *input, short *dq, unsigned char *pred,
+ unsigned char *dest, int pitch, int stride,
+ int Dc) {
+ int i;
+ short output[16];
+ short *diff_ptr = output;
+ int r, c;
+
+ input[0] = (short)Dc;
+
+ for (i = 1; i < 16; i++) {
+ input[i] = dq[i] * input[i];
+ }
+
+ /* the idct halves ( >> 1) the pitch */
+ vp9_short_idct4x4llm_c(input, output, 4 << 1);
+
+ vpx_memset(input, 0, 32);
+
+ for (r = 0; r < 4; r++) {
+ for (c = 0; c < 4; c++) {
+ int a = diff_ptr[c] + pred[c];
+
+ if (a < 0)
+ a = 0;
+
+ if (a > 255)
+ a = 255;
+
+ dest[c] = (unsigned char) a;
+ }
+
+ dest += stride;
+ diff_ptr += 4;
+ pred += pitch;
+ }
+}
+
+#if CONFIG_LOSSLESS
+void vp9_dequant_idct_add_lossless_c(short *input, short *dq,
+ unsigned char *pred, unsigned char *dest,
+ int pitch, int stride) {
+ short output[16];
+ short *diff_ptr = output;
+ int r, c;
+ int i;
+
+ for (i = 0; i < 16; i++) {
+ input[i] = dq[i] * input[i];
+ }
+
+ vp9_short_inv_walsh4x4_x8_c(input, output, 4 << 1);
+
+ vpx_memset(input, 0, 32);
+
+ for (r = 0; r < 4; r++) {
+ for (c = 0; c < 4; c++) {
+ int a = diff_ptr[c] + pred[c];
+
+ if (a < 0)
+ a = 0;
+
+ if (a > 255)
+ a = 255;
+
+ dest[c] = (unsigned char) a;
+ }
+
+ dest += stride;
+ diff_ptr += 4;
+ pred += pitch;
+ }
+}
+
+void vp9_dequant_dc_idct_add_lossless_c(short *input, short *dq,
+ unsigned char *pred,
+ unsigned char *dest,
+ int pitch, int stride, int dc) {
+ int i;
+ short output[16];
+ short *diff_ptr = output;
+ int r, c;
+
+ input[0] = (short)dc;
+
+ for (i = 1; i < 16; i++) {
+ input[i] = dq[i] * input[i];
+ }
+
+ vp9_short_inv_walsh4x4_x8_c(input, output, 4 << 1);
+ vpx_memset(input, 0, 32);
+
+ for (r = 0; r < 4; r++) {
+ for (c = 0; c < 4; c++) {
+ int a = diff_ptr[c] + pred[c];
+
+ if (a < 0)
+ a = 0;
+
+ if (a > 255)
+ a = 255;
+
+ dest[c] = (unsigned char) a;
+ }
+
+ dest += stride;
+ diff_ptr += 4;
+ pred += pitch;
+ }
+}
+#endif
+
+void vp9_dequantize_b_2x2_c(BLOCKD *d) {
+ int i;
+ short *DQ = d->dqcoeff;
+ short *Q = d->qcoeff;
+ short *DQC = d->dequant;
+
+ for (i = 0; i < 16; i++) {
+ DQ[i] = (short)((Q[i] * DQC[i]));
+ }
+#ifdef DEC_DEBUG
+ if (dec_debug) {
+ int j;
+ printf("Dequantize 2x2\n");
+ for (j = 0; j < 16; j++) printf("%d ", Q[j]);
+ printf("\n");
+ for (j = 0; j < 16; j++) printf("%d ", DQ[j]);
+ printf("\n");
+ }
+#endif
+}
+
+void vp9_dequant_idct_add_8x8_c(short *input, short *dq, unsigned char *pred,
+ unsigned char *dest, int pitch, int stride) {
+ short output[64];
+ short *diff_ptr = output;
+ int r, c, b;
+ int i;
+ unsigned char *origdest = dest;
+ unsigned char *origpred = pred;
+
+#ifdef DEC_DEBUG
+ if (dec_debug) {
+ int j;
+ printf("Input 8x8\n");
+ for (j = 0; j < 64; j++) {
+ printf("%d ", input[j]);
+ if (j % 8 == 7) printf("\n");
+ }
+ }
+#endif
+
+ input[0] = input[0] * dq[0];
+
+  // dequantize the AC coefficients, which all share the same quantizer
+ for (i = 1; i < 64; i++) {
+ input[i] = input[i] * dq[1];
+ }
+#ifdef DEC_DEBUG
+ if (dec_debug) {
+ int j;
+ printf("Input DQ 8x8\n");
+ for (j = 0; j < 64; j++) {
+ printf("%d ", input[j]);
+ if (j % 8 == 7) printf("\n");
+ }
+ }
+#endif
+
+ // the idct halves ( >> 1) the pitch
+ vp9_short_idct8x8_c(input, output, 16);
+#ifdef DEC_DEBUG
+ if (dec_debug) {
+ int j;
+ printf("Output 8x8\n");
+ for (j = 0; j < 64; j++) {
+ printf("%d ", output[j]);
+ if (j % 8 == 7) printf("\n");
+ }
+ }
+#endif
+
+  vpx_memset(input, 0, 128);  // clear the 64 dequantized coefficients (128 bytes)
+
+ for (b = 0; b < 4; b++) {
+ for (r = 0; r < 4; r++) {
+ for (c = 0; c < 4; c++) {
+ int a = diff_ptr[c] + pred[c];
+
+ if (a < 0)
+ a = 0;
+
+ if (a > 255)
+ a = 255;
+
+ dest[c] = (unsigned char) a;
+ }
+
+ dest += stride;
+ diff_ptr += 8;
+ pred += pitch;
+ }
+ diff_ptr = output + (b + 1) / 2 * 4 * 8 + (b + 1) % 2 * 4;
+ dest = origdest + (b + 1) / 2 * 4 * stride + (b + 1) % 2 * 4;
+ pred = origpred + (b + 1) / 2 * 4 * pitch + (b + 1) % 2 * 4;
+ }
+#ifdef DEC_DEBUG
+ if (dec_debug) {
+ int k, j;
+ printf("Final 8x8\n");
+ for (j = 0; j < 8; j++) {
+ for (k = 0; k < 8; k++) {
+ printf("%d ", origdest[k]);
+ }
+ printf("\n");
+ origdest += stride;
+ }
+ }
+#endif
+}
+
+void vp9_dequant_dc_idct_add_8x8_c(short *input, short *dq, unsigned char *pred,
+ unsigned char *dest, int pitch, int stride,
+                                   int Dc) {  // Dc: DC value for the first-order transform, used in some rare cases
+ short output[64];
+ short *diff_ptr = output;
+ int r, c, b;
+ int i;
+ unsigned char *origdest = dest;
+ unsigned char *origpred = pred;
+
+  input[0] = (short)Dc;  // Dc is already the reconstructed DC value, so it needs no dequantization
+#ifdef DEC_DEBUG
+ if (dec_debug) {
+ int j;
+ printf("Input 8x8\n");
+ for (j = 0; j < 64; j++) {
+ printf("%d ", input[j]);
+ if (j % 8 == 7) printf("\n");
+ }
+ }
+#endif
+ for (i = 1; i < 64; i++) {
+ input[i] = input[i] * dq[1];
+ }
+
+#ifdef DEC_DEBUG
+ if (dec_debug) {
+ int j;
+ printf("Input DQ 8x8\n");
+ for (j = 0; j < 64; j++) {
+ printf("%d ", input[j]);
+ if (j % 8 == 7) printf("\n");
+ }
+ }
+#endif
+
+ // the idct halves ( >> 1) the pitch
+ vp9_short_idct8x8_c(input, output, 16);
+#ifdef DEC_DEBUG
+ if (dec_debug) {
+ int j;
+ printf("Output 8x8\n");
+ for (j = 0; j < 64; j++) {
+ printf("%d ", output[j]);
+ if (j % 8 == 7) printf("\n");
+ }
+ }
+#endif
+ vpx_memset(input, 0, 128);
+
+ for (b = 0; b < 4; b++) {
+ for (r = 0; r < 4; r++) {
+ for (c = 0; c < 4; c++) {
+ int a = diff_ptr[c] + pred[c];
+
+ if (a < 0)
+ a = 0;
+
+ if (a > 255)
+ a = 255;
+
+ dest[c] = (unsigned char) a;
+ }
+
+ dest += stride;
+ diff_ptr += 8;
+ pred += pitch;
+ }
+ diff_ptr = output + (b + 1) / 2 * 4 * 8 + (b + 1) % 2 * 4;
+ dest = origdest + (b + 1) / 2 * 4 * stride + (b + 1) % 2 * 4;
+ pred = origpred + (b + 1) / 2 * 4 * pitch + (b + 1) % 2 * 4;
+ }
+#ifdef DEC_DEBUG
+ if (dec_debug) {
+ int k, j;
+ printf("Final 8x8\n");
+ for (j = 0; j < 8; j++) {
+ for (k = 0; k < 8; k++) {
+ printf("%d ", origdest[k]);
+ }
+ printf("\n");
+ origdest += stride;
+ }
+ }
+#endif
+}
+
+void vp9_ht_dequant_idct_add_16x16_c(TX_TYPE tx_type, short *input, short *dq,
+ unsigned char *pred, unsigned char *dest,
+ int pitch, int stride) {
+ short output[256];
+ short *diff_ptr = output;
+ int r, c, i;
+
+  input[0] = input[0] * dq[0];
+
+  // dequantize the AC coefficients, which all share the same quantizer
+  for (i = 1; i < 256; i++)
+    input[i] = input[i] * dq[1];
+
+ // inverse hybrid transform
+ vp9_ihtllm_c(input, output, 32, tx_type, 16);
+
+ // the idct halves ( >> 1) the pitch
+ // vp9_short_idct16x16_c(input, output, 32);
+
+ vpx_memset(input, 0, 512);
+
+ for (r = 0; r < 16; r++) {
+ for (c = 0; c < 16; c++) {
+ int a = diff_ptr[c] + pred[c];
+
+ if (a < 0)
+ a = 0;
+ else if (a > 255)
+ a = 255;
+
+ dest[c] = (unsigned char) a;
+ }
+
+ dest += stride;
+ diff_ptr += 16;
+ pred += pitch;
+ }
+}
+
+void vp9_dequant_idct_add_16x16_c(short *input, short *dq, unsigned char *pred,
+ unsigned char *dest, int pitch, int stride) {
+ short output[256];
+ short *diff_ptr = output;
+ int r, c, i;
+
+  input[0] = input[0] * dq[0];
+
+  // dequantize the AC coefficients, which all share the same quantizer
+  for (i = 1; i < 256; i++)
+    input[i] = input[i] * dq[1];
+
+ // the idct halves ( >> 1) the pitch
+ vp9_short_idct16x16_c(input, output, 32);
+
+ vpx_memset(input, 0, 512);
+
+ for (r = 0; r < 16; r++) {
+ for (c = 0; c < 16; c++) {
+ int a = diff_ptr[c] + pred[c];
+
+ if (a < 0)
+ a = 0;
+ else if (a > 255)
+ a = 255;
+
+ dest[c] = (unsigned char) a;
+ }
+
+ dest += stride;
+ diff_ptr += 16;
+ pred += pitch;
+ }
+}
--- /dev/null
+++ b/vp9/decoder/dequantize.h
@@ -1,0 +1,78 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef DEQUANTIZE_H
+#define DEQUANTIZE_H
+#include "vp9/common/blockd.h"
+
+#if CONFIG_LOSSLESS
+extern void vp9_dequant_idct_add_lossless_c(short *input, short *dq,
+ unsigned char *pred,
+ unsigned char *output,
+ int pitch, int stride);
+extern void vp9_dequant_dc_idct_add_lossless_c(short *input, short *dq,
+ unsigned char *pred,
+ unsigned char *output,
+ int pitch, int stride, int dc);
+extern void vp9_dequant_dc_idct_add_y_block_lossless_c(short *q, short *dq,
+ unsigned char *pre,
+ unsigned char *dst,
+ int stride, char *eobs,
+ short *dc);
+extern void vp9_dequant_idct_add_y_block_lossless_c(short *q, short *dq,
+ unsigned char *pre,
+ unsigned char *dst,
+ int stride, char *eobs);
+extern void vp9_dequant_idct_add_uv_block_lossless_c(short *q, short *dq,
+ unsigned char *pre,
+ unsigned char *dst_u,
+ unsigned char *dst_v,
+ int stride, char *eobs);
+#endif
+
+typedef void (*vp9_dequant_idct_add_fn_t)(short *input, short *dq,
+ unsigned char *pred, unsigned char *output, int pitch, int stride);
+typedef void(*vp9_dequant_dc_idct_add_fn_t)(short *input, short *dq,
+ unsigned char *pred, unsigned char *output, int pitch, int stride, int dc);
+
+typedef void(*vp9_dequant_dc_idct_add_y_block_fn_t)(short *q, short *dq,
+ unsigned char *pre, unsigned char *dst, int stride, char *eobs, short *dc);
+typedef void(*vp9_dequant_idct_add_y_block_fn_t)(short *q, short *dq,
+ unsigned char *pre, unsigned char *dst, int stride, char *eobs);
+typedef void(*vp9_dequant_idct_add_uv_block_fn_t)(short *q, short *dq,
+ unsigned char *pre, unsigned char *dst_u, unsigned char *dst_v, int stride,
+ char *eobs);
+
+void vp9_ht_dequant_idct_add_c(TX_TYPE tx_type, short *input, short *dq,
+ unsigned char *pred, unsigned char *dest,
+ int pitch, int stride);
+
+void vp9_ht_dequant_idct_add_8x8_c(TX_TYPE tx_type, short *input, short *dq,
+ unsigned char *pred, unsigned char *dest,
+ int pitch, int stride);
+
+void vp9_ht_dequant_idct_add_16x16_c(TX_TYPE tx_type, short *input, short *dq,
+ unsigned char *pred, unsigned char *dest,
+ int pitch, int stride);
+
+#if CONFIG_SUPERBLOCKS
+void vp9_dequant_dc_idct_add_y_block_8x8_inplace_c(short *q, short *dq,
+ unsigned char *dst,
+ int stride, char *eobs,
+ short *dc, MACROBLOCKD *xd);
+void vp9_dequant_idct_add_uv_block_8x8_inplace_c(short *q, short *dq,
+ unsigned char *dstu,
+ unsigned char *dstv,
+ int stride, char *eobs,
+ MACROBLOCKD *xd);
+#endif
+
+#endif
--- /dev/null
+++ b/vp9/decoder/detokenize.c
@@ -1,0 +1,640 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "vp9/common/type_aliases.h"
+#include "vp9/common/blockd.h"
+#include "onyxd_int.h"
+#include "vpx_mem/vpx_mem.h"
+#include "vpx_ports/mem.h"
+#include "detokenize.h"
+
+#include "vp9/common/seg_common.h"
+
+#define BOOL_DATA UINT8
+
+#define OCB_X PREV_COEF_CONTEXTS * ENTROPY_NODES
+
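+/* Coefficient band lookup tables for the 4x4, 8x8 and 16x16 transforms.
+ * Each entry is the band index pre-multiplied by OCB_X, so it can be added
+ * directly to a pointer into the coefficient probability tables. */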
+DECLARE_ALIGNED(16, static const int, coef_bands_x[16]) = {
+ 0 * OCB_X, 1 * OCB_X, 2 * OCB_X, 3 * OCB_X,
+ 6 * OCB_X, 4 * OCB_X, 5 * OCB_X, 6 * OCB_X,
+ 6 * OCB_X, 6 * OCB_X, 6 * OCB_X, 6 * OCB_X,
+ 6 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X
+};
+DECLARE_ALIGNED(16, static const int, coef_bands_x_8x8[64]) = {
+ 0 * OCB_X, 1 * OCB_X, 2 * OCB_X, 3 * OCB_X, 5 * OCB_X, 4 * OCB_X, 4 * OCB_X, 5 * OCB_X,
+ 5 * OCB_X, 3 * OCB_X, 6 * OCB_X, 3 * OCB_X, 5 * OCB_X, 4 * OCB_X, 6 * OCB_X, 6 * OCB_X,
+ 6 * OCB_X, 5 * OCB_X, 5 * OCB_X, 6 * OCB_X, 6 * OCB_X, 6 * OCB_X, 6 * OCB_X, 6 * OCB_X,
+ 6 * OCB_X, 6 * OCB_X, 6 * OCB_X, 6 * OCB_X, 6 * OCB_X, 6 * OCB_X, 6 * OCB_X, 6 * OCB_X,
+ 6 * OCB_X, 6 * OCB_X, 6 * OCB_X, 6 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X,
+ 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X,
+ 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X,
+ 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X,
+};
+
+DECLARE_ALIGNED(16, static const int, coef_bands_x_16x16[256]) = {
+ 0 * OCB_X, 1 * OCB_X, 2 * OCB_X, 3 * OCB_X, 5 * OCB_X, 4 * OCB_X, 4 * OCB_X, 5 * OCB_X, 5 * OCB_X, 3 * OCB_X, 6 * OCB_X, 3 * OCB_X, 5 * OCB_X, 4 * OCB_X, 6 * OCB_X, 6 * OCB_X,
+ 6 * OCB_X, 5 * OCB_X, 5 * OCB_X, 6 * OCB_X, 6 * OCB_X, 6 * OCB_X, 6 * OCB_X, 6 * OCB_X, 6 * OCB_X, 6 * OCB_X, 6 * OCB_X, 6 * OCB_X, 6 * OCB_X, 6 * OCB_X, 6 * OCB_X, 6 * OCB_X,
+ 6 * OCB_X, 6 * OCB_X, 6 * OCB_X, 6 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X,
+ 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X,
+ 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X,
+ 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X,
+ 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X,
+ 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X,
+ 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X,
+ 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X,
+ 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X,
+ 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X,
+ 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X,
+ 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X,
+ 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X,
+ 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X
+};
+
+#define EOB_CONTEXT_NODE 0
+#define ZERO_CONTEXT_NODE 1
+#define ONE_CONTEXT_NODE 2
+#define LOW_VAL_CONTEXT_NODE 3
+#define TWO_CONTEXT_NODE 4
+#define THREE_CONTEXT_NODE 5
+#define HIGH_LOW_CONTEXT_NODE 6
+#define CAT_ONE_CONTEXT_NODE 7
+#define CAT_THREEFOUR_CONTEXT_NODE 8
+#define CAT_THREE_CONTEXT_NODE 9
+#define CAT_FIVE_CONTEXT_NODE 10
+
+#define CAT1_MIN_VAL 5
+#define CAT2_MIN_VAL 7
+#define CAT3_MIN_VAL 11
+#define CAT4_MIN_VAL 19
+#define CAT5_MIN_VAL 35
+#define CAT6_MIN_VAL 67
+#define CAT1_PROB0 159
+#define CAT2_PROB0 145
+#define CAT2_PROB1 165
+
+#define CAT3_PROB0 140
+#define CAT3_PROB1 148
+#define CAT3_PROB2 173
+
+#define CAT4_PROB0 135
+#define CAT4_PROB1 140
+#define CAT4_PROB2 155
+#define CAT4_PROB3 176
+
+#define CAT5_PROB0 130
+#define CAT5_PROB1 134
+#define CAT5_PROB2 141
+#define CAT5_PROB3 157
+#define CAT5_PROB4 180
+
+static const unsigned char cat6_prob[14] = {
+  254, 254, 252, 249, 243, 230, 196, 177, 153, 140, 133, 130, 129, 0
+};
+
+void vp9_reset_mb_tokens_context(MACROBLOCKD *xd) {
+ /* Clear entropy contexts for Y2 blocks */
+ if ((xd->mode_info_context->mbmi.mode != B_PRED &&
+ xd->mode_info_context->mbmi.mode != I8X8_PRED &&
+ xd->mode_info_context->mbmi.mode != SPLITMV)
+ || xd->mode_info_context->mbmi.txfm_size == TX_16X16
+ ) {
+ vpx_memset(xd->above_context, 0, sizeof(ENTROPY_CONTEXT_PLANES));
+ vpx_memset(xd->left_context, 0, sizeof(ENTROPY_CONTEXT_PLANES));
+ } else {
+ vpx_memset(xd->above_context, 0, sizeof(ENTROPY_CONTEXT_PLANES) - 1);
+ vpx_memset(xd->left_context, 0, sizeof(ENTROPY_CONTEXT_PLANES) - 1);
+ }
+}
+
+DECLARE_ALIGNED(16, extern const unsigned char, vp9_norm[256]);
+
+// #define PREV_CONTEXT_INC(val) (2+((val)>2))
+// #define PREV_CONTEXT_INC(val) (vp9_prev_token_class[(val)])
+#define PREV_CONTEXT_INC(val) (vp9_prev_token_class[(val)>10?10:(val)])
+
+static int get_token(int v) {
+ if (v < 0) v = -v;
+ if (v == 0) return ZERO_TOKEN;
+ else if (v == 1) return ONE_TOKEN;
+ else if (v == 2) return TWO_TOKEN;
+ else if (v == 3) return THREE_TOKEN;
+ else if (v == 4) return FOUR_TOKEN;
+ else if (v <= 6) return DCT_VAL_CATEGORY1;
+ else if (v <= 10) return DCT_VAL_CATEGORY2;
+ else if (v <= 18) return DCT_VAL_CATEGORY3;
+ else if (v <= 34) return DCT_VAL_CATEGORY4;
+ else if (v <= 66) return DCT_VAL_CATEGORY5;
+ else return DCT_VAL_CATEGORY6;
+}
+
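+/* Accumulate per-band coefficient token counts for this block into the
+ * frame context; the counts are used when adapting the coefficient
+ * probabilities. */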
+static void count_tokens_adaptive_scan(const MACROBLOCKD *xd, INT16 *qcoeff_ptr,
+ int block, PLANE_TYPE type,
+ TX_TYPE tx_type,
+ ENTROPY_CONTEXT *a, ENTROPY_CONTEXT *l,
+ int eob, int seg_eob,
+ FRAME_CONTEXT *fc) {
+ int c, pt, token, band;
+ const int *scan;
+
+ switch(tx_type) {
+ case ADST_DCT :
+ scan = vp9_row_scan;
+ break;
+
+ case DCT_ADST :
+ scan = vp9_col_scan;
+ break;
+
+ default :
+ scan = vp9_default_zig_zag1d;
+ break;
+ }
+
+ VP9_COMBINEENTROPYCONTEXTS(pt, *a, *l);
+ for (c = !type; c < eob; ++c) {
+ int rc = scan[c];
+ int v = qcoeff_ptr[rc];
+ band = vp9_coef_bands[c];
+ token = get_token(v);
+ if (tx_type != DCT_DCT)
+ fc->hybrid_coef_counts[type][band][pt][token]++;
+ else
+ fc->coef_counts[type][band][pt][token]++;
+ pt = vp9_prev_token_class[token];
+ }
+
+ if (eob < seg_eob) {
+ band = vp9_coef_bands[c];
+ if (tx_type != DCT_DCT)
+ fc->hybrid_coef_counts[type][band][pt][DCT_EOB_TOKEN]++;
+ else
+ fc->coef_counts[type][band][pt][DCT_EOB_TOKEN]++;
+ }
+}
+
+static void count_tokens(INT16 *qcoeff_ptr, int block, PLANE_TYPE type,
+ ENTROPY_CONTEXT *a, ENTROPY_CONTEXT *l,
+ int eob, int seg_eob, FRAME_CONTEXT *const fc) {
+ int c, pt, token, band;
+ VP9_COMBINEENTROPYCONTEXTS(pt, *a, *l);
+ for (c = !type; c < eob; ++c) {
+ int rc = vp9_default_zig_zag1d[c];
+ int v = qcoeff_ptr[rc];
+ band = vp9_coef_bands[c];
+ token = get_token(v);
+ fc->coef_counts[type][band][pt][token]++;
+ pt = vp9_prev_token_class[token];
+ }
+ if (eob < seg_eob) {
+ band = vp9_coef_bands[c];
+ fc->coef_counts[type][band][pt][DCT_EOB_TOKEN]++;
+ }
+}
+
+static void count_tokens_8x8(INT16 *qcoeff_ptr, int block, PLANE_TYPE type,
+ TX_TYPE tx_type,
+ ENTROPY_CONTEXT *a, ENTROPY_CONTEXT *l,
+ int eob, int seg_eob, FRAME_CONTEXT *fc) {
+ int c, pt, token, band;
+ VP9_COMBINEENTROPYCONTEXTS(pt, *a, *l);
+ for (c = !type; c < eob; ++c) {
+    int rc = (type == 1 ? vp9_default_zig_zag1d[c]
+                        : vp9_default_zig_zag1d_8x8[c]);
+ int v = qcoeff_ptr[rc];
+ band = (type == 1 ? vp9_coef_bands[c] : vp9_coef_bands_8x8[c]);
+ token = get_token(v);
+ if (tx_type != DCT_DCT)
+ fc->hybrid_coef_counts_8x8[type][band][pt][token]++;
+ else
+ fc->coef_counts_8x8[type][band][pt][token]++;
+ pt = vp9_prev_token_class[token];
+ }
+ if (eob < seg_eob) {
+ band = (type == 1 ? vp9_coef_bands[c] : vp9_coef_bands_8x8[c]);
+ if (tx_type != DCT_DCT)
+ fc->hybrid_coef_counts_8x8[type][band][pt][DCT_EOB_TOKEN]++;
+ else
+ fc->coef_counts_8x8[type][band][pt][DCT_EOB_TOKEN]++;
+ }
+}
+
+static void count_tokens_16x16(INT16 *qcoeff_ptr, int block, PLANE_TYPE type,
+ TX_TYPE tx_type,
+ ENTROPY_CONTEXT *a, ENTROPY_CONTEXT *l,
+ int eob, int seg_eob, FRAME_CONTEXT *fc) {
+ int c, pt, token;
+ VP9_COMBINEENTROPYCONTEXTS(pt, *a, *l);
+ for (c = !type; c < eob; ++c) {
+ int rc = vp9_default_zig_zag1d_16x16[c];
+ int v = qcoeff_ptr[rc];
+ int band = vp9_coef_bands_16x16[c];
+ token = get_token(v);
+ if (tx_type != DCT_DCT)
+ fc->hybrid_coef_counts_16x16[type][band][pt][token]++;
+ else
+ fc->coef_counts_16x16[type][band][pt][token]++;
+ pt = vp9_prev_token_class[token];
+ }
+ if (eob < seg_eob) {
+ int band = vp9_coef_bands_16x16[c];
+ if (tx_type != DCT_DCT)
+ fc->hybrid_coef_counts_16x16[type][band][pt][DCT_EOB_TOKEN]++;
+ else
+ fc->coef_counts_16x16[type][band][pt][DCT_EOB_TOKEN]++;
+ }
+}
+
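+/* Read a sign bit with probability one half directly from the boolean
+ * decoder and return value_to_sign with the decoded sign applied. */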
+static int get_signed(BOOL_DECODER *br, int value_to_sign) {
+ const int split = (br->range + 1) >> 1;
+ const VP9_BD_VALUE bigsplit = (VP9_BD_VALUE)split << (VP9_BD_VALUE_SIZE - 8);
+ int v;
+
+ if (br->count < 0)
+ vp9_bool_decoder_fill(br);
+
+ if (br->value < bigsplit) {
+ br->range = split;
+ v = value_to_sign;
+ } else {
+ br->range = br->range - split;
+ br->value = br->value - bigsplit;
+ v = -value_to_sign;
+ }
+ br->range += br->range;
+ br->value += br->value;
+ --br->count;
+
+ return v;
+}
+
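+/* Store the signed coefficient at the current scan position, re-base the
+ * probability pointer on the context implied by the decoded magnitude,
+ * and continue with the next coefficient. */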
+#define WRITE_COEF_CONTINUE(val) \
+ { \
+ prob = coef_probs + (ENTROPY_NODES*PREV_CONTEXT_INC(val));\
+ qcoeff_ptr[scan[c]] = (INT16) get_signed(br, val); \
+ c++; \
+ continue; \
+ }
+
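+/* Decode one extra bit of a coefficient category and, if set, add its
+ * contribution (1 << bits_count) to val. */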
+#define ADJUST_COEF(prob, bits_count) \
+ do { \
+ if (vp9_read(br, prob)) \
+ val += (UINT16)(1 << bits_count);\
+  } while (0)
+
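+/* Decode the coefficients of one block: walk the coefficient token tree at
+ * each scan position until an end-of-block token or the segment eob limit
+ * is reached, then update the token counts. Returns the number of decoded
+ * coefficients (the end-of-block position). */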
+static int decode_coefs(VP9D_COMP *dx, const MACROBLOCKD *xd,
+ BOOL_DECODER* const br,
+ ENTROPY_CONTEXT *a, ENTROPY_CONTEXT *l,
+ PLANE_TYPE type,
+ TX_TYPE tx_type,
+ int seg_eob, INT16 *qcoeff_ptr, int i,
+ const int *const scan, int block_type,
+ const int *coef_bands) {
+ FRAME_CONTEXT *const fc = &dx->common.fc;
+ int tmp, c = (type == PLANE_TYPE_Y_NO_DC);
+ const vp9_prob *prob, *coef_probs;
+
+ switch (block_type) {
+ default:
+ case TX_4X4:
+ coef_probs =
+ tx_type != DCT_DCT ? fc->hybrid_coef_probs[type][0][0] :
+ fc->coef_probs[type][0][0];
+ break;
+ case TX_8X8:
+ coef_probs =
+ tx_type != DCT_DCT ? fc->hybrid_coef_probs_8x8[type][0][0] :
+ fc->coef_probs_8x8[type][0][0];
+ break;
+ case TX_16X16:
+ coef_probs =
+ tx_type != DCT_DCT ? fc->hybrid_coef_probs_16x16[type][0][0] :
+ fc->coef_probs_16x16[type][0][0];
+ break;
+ }
+
+ VP9_COMBINEENTROPYCONTEXTS(tmp, *a, *l);
+ prob = coef_probs + tmp * ENTROPY_NODES;
+
+ while (1) {
+ int val;
+ const uint8_t *cat6 = cat6_prob;
+ if (c == seg_eob) break;
+ prob += coef_bands[c];
+ if (!vp9_read(br, prob[EOB_CONTEXT_NODE]))
+ break;
+SKIP_START:
+ if (c == seg_eob) break;
+ if (!vp9_read(br, prob[ZERO_CONTEXT_NODE])) {
+ ++c;
+ prob = coef_probs + coef_bands[c];
+ goto SKIP_START;
+ }
+ // ONE_CONTEXT_NODE_0_
+ if (!vp9_read(br, prob[ONE_CONTEXT_NODE])) {
+ prob = coef_probs + ENTROPY_NODES;
+ qcoeff_ptr[scan[c]] = (INT16) get_signed(br, 1);
+ ++c;
+ continue;
+ }
+ // LOW_VAL_CONTEXT_NODE_0_
+ if (!vp9_read(br, prob[LOW_VAL_CONTEXT_NODE])) {
+ if (!vp9_read(br, prob[TWO_CONTEXT_NODE])) {
+ WRITE_COEF_CONTINUE(2);
+ }
+ if (!vp9_read(br, prob[THREE_CONTEXT_NODE])) {
+ WRITE_COEF_CONTINUE(3);
+ }
+ WRITE_COEF_CONTINUE(4);
+ }
+ // HIGH_LOW_CONTEXT_NODE_0_
+ if (!vp9_read(br, prob[HIGH_LOW_CONTEXT_NODE])) {
+ if (!vp9_read(br, prob[CAT_ONE_CONTEXT_NODE])) {
+ val = CAT1_MIN_VAL;
+ ADJUST_COEF(CAT1_PROB0, 0);
+ WRITE_COEF_CONTINUE(val);
+ }
+ val = CAT2_MIN_VAL;
+ ADJUST_COEF(CAT2_PROB1, 1);
+ ADJUST_COEF(CAT2_PROB0, 0);
+ WRITE_COEF_CONTINUE(val);
+ }
+ // CAT_THREEFOUR_CONTEXT_NODE_0_
+ if (!vp9_read(br, prob[CAT_THREEFOUR_CONTEXT_NODE])) {
+ if (!vp9_read(br, prob[CAT_THREE_CONTEXT_NODE])) {
+ val = CAT3_MIN_VAL;
+ ADJUST_COEF(CAT3_PROB2, 2);
+ ADJUST_COEF(CAT3_PROB1, 1);
+ ADJUST_COEF(CAT3_PROB0, 0);
+ WRITE_COEF_CONTINUE(val);
+ }
+ val = CAT4_MIN_VAL;
+ ADJUST_COEF(CAT4_PROB3, 3);
+ ADJUST_COEF(CAT4_PROB2, 2);
+ ADJUST_COEF(CAT4_PROB1, 1);
+ ADJUST_COEF(CAT4_PROB0, 0);
+ WRITE_COEF_CONTINUE(val);
+ }
+ // CAT_FIVE_CONTEXT_NODE_0_:
+ if (!vp9_read(br, prob[CAT_FIVE_CONTEXT_NODE])) {
+ val = CAT5_MIN_VAL;
+ ADJUST_COEF(CAT5_PROB4, 4);
+ ADJUST_COEF(CAT5_PROB3, 3);
+ ADJUST_COEF(CAT5_PROB2, 2);
+ ADJUST_COEF(CAT5_PROB1, 1);
+ ADJUST_COEF(CAT5_PROB0, 0);
+ WRITE_COEF_CONTINUE(val);
+ }
+ val = 0;
+ while (*cat6) {
+ val = (val << 1) | vp9_read(br, *cat6++);
+ }
+ val += CAT6_MIN_VAL;
+ WRITE_COEF_CONTINUE(val);
+ }
+
+ if (block_type == TX_4X4) {
+ count_tokens_adaptive_scan(xd, qcoeff_ptr, i, type,
+ tx_type,
+ a, l, c, seg_eob, fc);
+ }
+ else if (block_type == TX_8X8)
+ count_tokens_8x8(qcoeff_ptr, i, type,
+ tx_type,
+ a, l, c, seg_eob, fc);
+ else
+ count_tokens_16x16(qcoeff_ptr, i, type,
+ tx_type,
+ a, l, c, seg_eob, fc);
+ return c;
+}
+
+int vp9_decode_mb_tokens_16x16(VP9D_COMP *pbi, MACROBLOCKD *xd,
+ BOOL_DECODER* const bc) {
+ ENTROPY_CONTEXT* const A = (ENTROPY_CONTEXT *)xd->above_context;
+ ENTROPY_CONTEXT* const L = (ENTROPY_CONTEXT *)xd->left_context;
+
+ char* const eobs = xd->eobs;
+ PLANE_TYPE type;
+ int c, i, eobtotal = 0, seg_eob;
+ const int segment_id = xd->mode_info_context->mbmi.segment_id;
+ const int seg_active = vp9_segfeature_active(xd, segment_id, SEG_LVL_EOB);
+ INT16 *qcoeff_ptr = &xd->qcoeff[0];
+ TX_TYPE tx_type = get_tx_type(xd, &xd->block[0]);
+
+ type = PLANE_TYPE_Y_WITH_DC;
+
+ if (seg_active)
+ seg_eob = vp9_get_segdata(xd, segment_id, SEG_LVL_EOB);
+ else
+ seg_eob = 256;
+
+ // Luma block
+ {
+ const int* const scan = vp9_default_zig_zag1d_16x16;
+ c = decode_coefs(pbi, xd, bc, A, L, type,
+ tx_type,
+ seg_eob, qcoeff_ptr,
+ 0, scan, TX_16X16, coef_bands_x_16x16);
+ eobs[0] = c;
+ A[0] = L[0] = (c != !type);
+ A[1] = A[2] = A[3] = A[0];
+ L[1] = L[2] = L[3] = L[0];
+ eobtotal += c;
+ }
+
+ // 8x8 chroma blocks
+ qcoeff_ptr += 256;
+ type = PLANE_TYPE_UV;
+ tx_type = DCT_DCT;
+ if (seg_active)
+ seg_eob = vp9_get_segdata(xd, segment_id, SEG_LVL_EOB);
+ else
+ seg_eob = 64;
+ for (i = 16; i < 24; i += 4) {
+ ENTROPY_CONTEXT* const a = A + vp9_block2above_8x8[i];
+ ENTROPY_CONTEXT* const l = L + vp9_block2left_8x8[i];
+ const int* const scan = vp9_default_zig_zag1d_8x8;
+
+ c = decode_coefs(pbi, xd, bc, a, l, type,
+ tx_type,
+ seg_eob, qcoeff_ptr,
+ i, scan, TX_8X8, coef_bands_x_8x8);
+ a[0] = l[0] = ((eobs[i] = c) != !type);
+ a[1] = a[0];
+ l[1] = l[0];
+
+ eobtotal += c;
+ qcoeff_ptr += 64;
+ }
+ vpx_memset(&A[8], 0, sizeof(A[8]));
+ vpx_memset(&L[8], 0, sizeof(L[8]));
+ return eobtotal;
+}
+
+int vp9_decode_mb_tokens_8x8(VP9D_COMP *pbi, MACROBLOCKD *xd,
+ BOOL_DECODER* const bc) {
+ ENTROPY_CONTEXT *const A = (ENTROPY_CONTEXT *)xd->above_context;
+ ENTROPY_CONTEXT *const L = (ENTROPY_CONTEXT *)xd->left_context;
+
+ char *const eobs = xd->eobs;
+ PLANE_TYPE type;
+ int c, i, eobtotal = 0, seg_eob;
+ const int segment_id = xd->mode_info_context->mbmi.segment_id;
+ const int seg_active = vp9_segfeature_active(xd, segment_id, SEG_LVL_EOB);
+ INT16 *qcoeff_ptr = &xd->qcoeff[0];
+ TX_TYPE tx_type = DCT_DCT;
+
+ int bufthred = (xd->mode_info_context->mbmi.mode == I8X8_PRED ||
+ xd->mode_info_context->mbmi.mode == SPLITMV) ? 16 : 24;
+ if (xd->mode_info_context->mbmi.mode != B_PRED &&
+ xd->mode_info_context->mbmi.mode != SPLITMV &&
+ xd->mode_info_context->mbmi.mode != I8X8_PRED) {
+ ENTROPY_CONTEXT *const a = A + vp9_block2above_8x8[24];
+ ENTROPY_CONTEXT *const l = L + vp9_block2left_8x8[24];
+ const int *const scan = vp9_default_zig_zag1d;
+ type = PLANE_TYPE_Y2;
+
+ if (seg_active)
+ seg_eob = vp9_get_segdata(xd, segment_id, SEG_LVL_EOB);
+ else
+ seg_eob = 4;
+ c = decode_coefs(pbi, xd, bc, a, l, type,
+ tx_type,
+ seg_eob, qcoeff_ptr + 24 * 16,
+ 24, scan, TX_8X8, coef_bands_x);
+ a[0] = l[0] = ((eobs[24] = c) != !type);
+
+ eobtotal += c - 4;
+
+ type = PLANE_TYPE_Y_NO_DC;
+ } else
+ type = PLANE_TYPE_Y_WITH_DC;
+
+ if (seg_active)
+ seg_eob = vp9_get_segdata(xd, segment_id, SEG_LVL_EOB);
+ else
+ seg_eob = 64;
+
+ for (i = 0; i < bufthred ; i += 4) {
+ ENTROPY_CONTEXT *const a = A + vp9_block2above_8x8[i];
+ ENTROPY_CONTEXT *const l = L + vp9_block2left_8x8[i];
+ const int *const scan = vp9_default_zig_zag1d_8x8;
+ tx_type = DCT_DCT;
+
+ if (i == 16)
+ type = PLANE_TYPE_UV;
+ if (type == PLANE_TYPE_Y_WITH_DC) {
+ tx_type = get_tx_type(xd, xd->block + i);
+ }
+
+ c = decode_coefs(pbi, xd, bc, a, l, type,
+ tx_type,
+ seg_eob, qcoeff_ptr,
+ i, scan, TX_8X8, coef_bands_x_8x8);
+ a[0] = l[0] = ((eobs[i] = c) != !type);
+ a[1] = a[0];
+ l[1] = l[0];
+
+ eobtotal += c;
+ qcoeff_ptr += 64;
+ }
+
+ if (bufthred == 16) {
+ type = PLANE_TYPE_UV;
+ tx_type = DCT_DCT;
+ seg_eob = 16;
+
+ // use 4x4 transform for U, V components in I8X8 prediction mode
+ for (i = 16; i < 24; i++) {
+ ENTROPY_CONTEXT *const a = A + vp9_block2above[i];
+ ENTROPY_CONTEXT *const l = L + vp9_block2left[i];
+ const int *scan = vp9_default_zig_zag1d;
+
+ c = decode_coefs(pbi, xd, bc, a, l, type,
+ tx_type,
+ seg_eob, qcoeff_ptr,
+ i, scan, TX_4X4, coef_bands_x);
+ a[0] = l[0] = ((eobs[i] = c) != !type);
+
+ eobtotal += c;
+ qcoeff_ptr += 16;
+ }
+ }
+
+ return eobtotal;
+}
+
+
+int vp9_decode_mb_tokens(VP9D_COMP *dx, MACROBLOCKD *xd,
+ BOOL_DECODER* const bc) {
+ ENTROPY_CONTEXT *const A = (ENTROPY_CONTEXT *)xd->above_context;
+ ENTROPY_CONTEXT *const L = (ENTROPY_CONTEXT *)xd->left_context;
+
+ char *const eobs = xd->eobs;
+ const int *scan = vp9_default_zig_zag1d;
+ PLANE_TYPE type;
+ int c, i, eobtotal = 0, seg_eob = 16;
+ INT16 *qcoeff_ptr = &xd->qcoeff[0];
+
+ int segment_id = xd->mode_info_context->mbmi.segment_id;
+ if (vp9_segfeature_active(xd, segment_id, SEG_LVL_EOB))
+ seg_eob = vp9_get_segdata(xd, segment_id, SEG_LVL_EOB);
+
+ if (xd->mode_info_context->mbmi.mode != B_PRED &&
+ xd->mode_info_context->mbmi.mode != I8X8_PRED &&
+ xd->mode_info_context->mbmi.mode != SPLITMV) {
+ ENTROPY_CONTEXT *const a = A + vp9_block2above[24];
+ ENTROPY_CONTEXT *const l = L + vp9_block2left[24];
+ type = PLANE_TYPE_Y2;
+
+ c = decode_coefs(dx, xd, bc, a, l, type,
+ DCT_DCT,
+ seg_eob, qcoeff_ptr + 24 * 16, 24,
+ scan, TX_4X4, coef_bands_x);
+ a[0] = l[0] = ((eobs[24] = c) != !type);
+ eobtotal += c - 16;
+
+ type = PLANE_TYPE_Y_NO_DC;
+ } else {
+ type = PLANE_TYPE_Y_WITH_DC;
+ }
+
+ for (i = 0; i < 24; ++i) {
+ ENTROPY_CONTEXT *const a = A + vp9_block2above[i];
+ ENTROPY_CONTEXT *const l = L + vp9_block2left[i];
+ TX_TYPE tx_type = DCT_DCT;
+ if (i == 16)
+ type = PLANE_TYPE_UV;
+
+ tx_type = get_tx_type(xd, &xd->block[i]);
+ switch(tx_type) {
+ case ADST_DCT :
+ scan = vp9_row_scan;
+ break;
+
+ case DCT_ADST :
+ scan = vp9_col_scan;
+ break;
+
+ default :
+ scan = vp9_default_zig_zag1d;
+ break;
+ }
+
+ c = decode_coefs(dx, xd, bc, a, l, type, tx_type,
+ seg_eob, qcoeff_ptr,
+ i, scan, TX_4X4, coef_bands_x);
+ a[0] = l[0] = ((eobs[i] = c) != !type);
+
+ eobtotal += c;
+ qcoeff_ptr += 16;
+ }
+
+ return eobtotal;
+}
--- /dev/null
+++ b/vp9/decoder/detokenize.h
@@ -1,0 +1,25 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef DETOKENIZE_H
+#define DETOKENIZE_H
+
+#include "onyxd_int.h"
+
+void vp9_reset_mb_tokens_context(MACROBLOCKD* const);
+int vp9_decode_mb_tokens(VP9D_COMP* const, MACROBLOCKD* const,
+ BOOL_DECODER* const);
+int vp9_decode_mb_tokens_8x8(VP9D_COMP* const, MACROBLOCKD* const,
+ BOOL_DECODER* const);
+int vp9_decode_mb_tokens_16x16(VP9D_COMP* const, MACROBLOCKD* const,
+ BOOL_DECODER* const);
+
+#endif /* DETOKENIZE_H */
--- /dev/null
+++ b/vp9/decoder/idct_blk.c
@@ -1,0 +1,292 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "vpx_ports/config.h"
+#include "vp9/common/idct.h"
+#include "dequantize.h"
+
+void vp9_dequant_dc_idct_add_c(short *input, short *dq, unsigned char *pred,
+ unsigned char *dest, int pitch, int stride,
+ int Dc);
+void vp9_dequant_idct_add_c(short *input, short *dq, unsigned char *pred,
+ unsigned char *dest, int pitch, int stride);
+void vp9_dc_only_idct_add_c(short input_dc, unsigned char *pred_ptr,
+ unsigned char *dst_ptr, int pitch, int stride);
+#if CONFIG_LOSSLESS
+void vp9_dequant_idct_add_lossless_c(short *input, short *dq,
+ unsigned char *pred, unsigned char *dest,
+ int pitch, int stride);
+void vp9_dc_only_idct_add_lossless_c(short input_dc, unsigned char *pred_ptr,
+ unsigned char *dst_ptr,
+ int pitch, int stride);
+#endif
+
+void vp9_dequant_dc_idct_add_y_block_c(short *q, short *dq,
+ unsigned char *pre,
+ unsigned char *dst,
+ int stride, char *eobs,
+ short *dc) {
+ int i, j;
+
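+  /* If a block has no coefficients beyond the first (eob <= 1), the
+   * cheaper DC-only reconstruction path is sufficient. */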
+ for (i = 0; i < 4; i++) {
+ for (j = 0; j < 4; j++) {
+ if (*eobs++ > 1)
+ vp9_dequant_dc_idct_add_c(q, dq, pre, dst, 16, stride, dc[0]);
+ else
+ vp9_dc_only_idct_add_c(dc[0], pre, dst, 16, stride);
+
+ q += 16;
+ pre += 4;
+ dst += 4;
+ dc++;
+ }
+
+ pre += 64 - 16;
+ dst += 4 * stride - 16;
+ }
+}
+
+void vp9_dequant_idct_add_y_block_c(short *q, short *dq,
+ unsigned char *pre,
+ unsigned char *dst,
+ int stride, char *eobs) {
+ int i, j;
+
+ for (i = 0; i < 4; i++) {
+ for (j = 0; j < 4; j++) {
+ if (*eobs++ > 1)
+ vp9_dequant_idct_add_c(q, dq, pre, dst, 16, stride);
+ else {
+ vp9_dc_only_idct_add_c(q[0]*dq[0], pre, dst, 16, stride);
+ ((int *)q)[0] = 0;
+ }
+
+ q += 16;
+ pre += 4;
+ dst += 4;
+ }
+
+ pre += 64 - 16;
+ dst += 4 * stride - 16;
+ }
+}
+
+void vp9_dequant_idct_add_uv_block_c(short *q, short *dq, unsigned char *pre,
+ unsigned char *dstu, unsigned char *dstv,
+ int stride, char *eobs) {
+ int i, j;
+
+ for (i = 0; i < 2; i++) {
+ for (j = 0; j < 2; j++) {
+ if (*eobs++ > 1)
+ vp9_dequant_idct_add_c(q, dq, pre, dstu, 8, stride);
+ else {
+ vp9_dc_only_idct_add_c(q[0]*dq[0], pre, dstu, 8, stride);
+ ((int *)q)[0] = 0;
+ }
+
+ q += 16;
+ pre += 4;
+ dstu += 4;
+ }
+
+ pre += 32 - 8;
+ dstu += 4 * stride - 8;
+ }
+
+ for (i = 0; i < 2; i++) {
+ for (j = 0; j < 2; j++) {
+ if (*eobs++ > 1)
+ vp9_dequant_idct_add_c(q, dq, pre, dstv, 8, stride);
+ else {
+ vp9_dc_only_idct_add_c(q[0]*dq[0], pre, dstv, 8, stride);
+ ((int *)q)[0] = 0;
+ }
+
+ q += 16;
+ pre += 4;
+ dstv += 4;
+ }
+
+ pre += 32 - 8;
+ dstv += 4 * stride - 8;
+ }
+}
+
+
+void vp9_dequant_dc_idct_add_y_block_8x8_c(short *q, short *dq,
+ unsigned char *pre,
+ unsigned char *dst,
+ int stride, char *eobs, short *dc,
+ MACROBLOCKD *xd) {
+ vp9_dequant_dc_idct_add_8x8_c(q, dq, pre, dst, 16, stride, dc[0]);
+  vp9_dequant_dc_idct_add_8x8_c(&q[64], dq, pre + 8,
+                                dst + 8, 16, stride, dc[1]);
+ vp9_dequant_dc_idct_add_8x8_c(&q[128], dq, pre + 8 * 16,
+ dst + 8 * stride, 16, stride, dc[4]);
+ vp9_dequant_dc_idct_add_8x8_c(&q[192], dq, pre + 8 * 16 + 8,
+ dst + 8 * stride + 8, 16, stride, dc[8]);
+}
+
+#if CONFIG_SUPERBLOCKS
+void vp9_dequant_dc_idct_add_y_block_8x8_inplace_c(short *q, short *dq,
+ unsigned char *dst,
+ int stride, char *eobs,
+ short *dc, MACROBLOCKD *xd) {
+ vp9_dequant_dc_idct_add_8x8_c(q, dq, dst, dst, stride, stride, dc[0]);
+ vp9_dequant_dc_idct_add_8x8_c(&q[64], dq, dst + 8,
+ dst + 8, stride, stride, dc[1]);
+ vp9_dequant_dc_idct_add_8x8_c(&q[128], dq, dst + 8 * stride,
+ dst + 8 * stride, stride, stride, dc[4]);
+ vp9_dequant_dc_idct_add_8x8_c(&q[192], dq, dst + 8 * stride + 8,
+ dst + 8 * stride + 8, stride, stride, dc[8]);
+}
+#endif
+
+void vp9_dequant_idct_add_y_block_8x8_c(short *q, short *dq,
+ unsigned char *pre,
+ unsigned char *dst,
+ int stride, char *eobs,
+ MACROBLOCKD *xd) {
+ unsigned char *origdest = dst;
+ unsigned char *origpred = pre;
+
+ vp9_dequant_idct_add_8x8_c(q, dq, pre, dst, 16, stride);
+ vp9_dequant_idct_add_8x8_c(&q[64], dq, origpred + 8,
+ origdest + 8, 16, stride);
+ vp9_dequant_idct_add_8x8_c(&q[128], dq, origpred + 8 * 16,
+ origdest + 8 * stride, 16, stride);
+ vp9_dequant_idct_add_8x8_c(&q[192], dq, origpred + 8 * 16 + 8,
+ origdest + 8 * stride + 8, 16, stride);
+}
+
+void vp9_dequant_idct_add_uv_block_8x8_c(short *q, short *dq,
+ unsigned char *pre,
+ unsigned char *dstu,
+ unsigned char *dstv,
+ int stride, char *eobs,
+ MACROBLOCKD *xd) {
+ vp9_dequant_idct_add_8x8_c(q, dq, pre, dstu, 8, stride);
+
+ q += 64;
+ pre += 64;
+
+ vp9_dequant_idct_add_8x8_c(q, dq, pre, dstv, 8, stride);
+}
+
+#if CONFIG_SUPERBLOCKS
+void vp9_dequant_idct_add_uv_block_8x8_inplace_c(short *q, short *dq,
+ unsigned char *dstu,
+ unsigned char *dstv,
+ int stride, char *eobs,
+ MACROBLOCKD *xd) {
+ vp9_dequant_idct_add_8x8_c(q, dq, dstu, dstu, stride, stride);
+
+ q += 64;
+
+ vp9_dequant_idct_add_8x8_c(q, dq, dstv, dstv, stride, stride);
+}
+#endif
+
+#if CONFIG_LOSSLESS
+void vp9_dequant_dc_idct_add_y_block_lossless_c(short *q, short *dq,
+ unsigned char *pre,
+ unsigned char *dst,
+ int stride, char *eobs,
+ short *dc) {
+ int i, j;
+
+ for (i = 0; i < 4; i++) {
+ for (j = 0; j < 4; j++) {
+ if (*eobs++ > 1)
+ vp9_dequant_dc_idct_add_lossless_c(q, dq, pre, dst, 16, stride, dc[0]);
+ else
+ vp9_dc_only_inv_walsh_add_c(dc[0], pre, dst, 16, stride);
+
+ q += 16;
+ pre += 4;
+ dst += 4;
+ dc++;
+ }
+
+ pre += 64 - 16;
+ dst += 4 * stride - 16;
+ }
+}
+
+void vp9_dequant_idct_add_y_block_lossless_c(short *q, short *dq,
+ unsigned char *pre,
+ unsigned char *dst,
+ int stride, char *eobs) {
+ int i, j;
+
+ for (i = 0; i < 4; i++) {
+ for (j = 0; j < 4; j++) {
+ if (*eobs++ > 1)
+ vp9_dequant_idct_add_lossless_c(q, dq, pre, dst, 16, stride);
+ else {
+ vp9_dc_only_inv_walsh_add_c(q[0]*dq[0], pre, dst, 16, stride);
+ ((int *)q)[0] = 0;
+ }
+
+ q += 16;
+ pre += 4;
+ dst += 4;
+ }
+
+ pre += 64 - 16;
+ dst += 4 * stride - 16;
+ }
+}
+
+void vp9_dequant_idct_add_uv_block_lossless_c(short *q, short *dq,
+ unsigned char *pre,
+ unsigned char *dstu,
+ unsigned char *dstv,
+ int stride, char *eobs) {
+ int i, j;
+
+ for (i = 0; i < 2; i++) {
+ for (j = 0; j < 2; j++) {
+ if (*eobs++ > 1)
+ vp9_dequant_idct_add_lossless_c(q, dq, pre, dstu, 8, stride);
+ else {
+ vp9_dc_only_inv_walsh_add_c(q[0]*dq[0], pre, dstu, 8, stride);
+ ((int *)q)[0] = 0;
+ }
+
+ q += 16;
+ pre += 4;
+ dstu += 4;
+ }
+
+ pre += 32 - 8;
+ dstu += 4 * stride - 8;
+ }
+
+ for (i = 0; i < 2; i++) {
+ for (j = 0; j < 2; j++) {
+ if (*eobs++ > 1)
+ vp9_dequant_idct_add_lossless_c(q, dq, pre, dstv, 8, stride);
+ else {
+ vp9_dc_only_inv_walsh_add_c(q[0]*dq[0], pre, dstv, 8, stride);
+ ((int *)q)[0] = 0;
+ }
+
+ q += 16;
+ pre += 4;
+ dstv += 4;
+ }
+
+ pre += 32 - 8;
+ dstv += 4 * stride - 8;
+ }
+}
+#endif
+
--- /dev/null
+++ b/vp9/decoder/onyxd_if.c
@@ -1,0 +1,506 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "vp9/common/onyxc_int.h"
+#if CONFIG_POSTPROC
+#include "vp9/common/postproc.h"
+#endif
+#include "vp9/common/onyxd.h"
+#include "onyxd_int.h"
+#include "vpx_mem/vpx_mem.h"
+#include "vp9/common/alloccommon.h"
+#include "vpx_scale/yv12extend.h"
+#include "vp9/common/loopfilter.h"
+#include "vp9/common/swapyv12buffer.h"
+#include <stdio.h>
+#include <assert.h>
+
+#include "vp9/common/quant_common.h"
+#include "vpx_scale/vpxscale.h"
+#include "vp9/common/systemdependent.h"
+#include "vpx_ports/vpx_timer.h"
+#include "detokenize.h"
+#if ARCH_ARM
+#include "vpx_ports/arm.h"
+#endif
+
+extern void vp9_init_de_quantizer(VP9D_COMP *pbi);
+static int get_free_fb(VP9_COMMON *cm);
+static void ref_cnt_fb(int *buf, int *idx, int new_idx);
+
+#if CONFIG_DEBUG
+static void recon_write_yuv_frame(char *name, YV12_BUFFER_CONFIG *s) {
+ FILE *yuv_file = fopen((char *)name, "ab");
+ unsigned char *src = s->y_buffer;
+ int h = s->y_height;
+
+ do {
+ fwrite(src, s->y_width, 1, yuv_file);
+ src += s->y_stride;
+ } while (--h);
+
+ src = s->u_buffer;
+ h = s->uv_height;
+
+ do {
+ fwrite(src, s->uv_width, 1, yuv_file);
+ src += s->uv_stride;
+ } while (--h);
+
+ src = s->v_buffer;
+ h = s->uv_height;
+
+ do {
+ fwrite(src, s->uv_width, 1, yuv_file);
+ src += s->uv_stride;
+ } while (--h);
+
+ fclose(yuv_file);
+}
+#endif
+#define WRITE_RECON_BUFFER 0
+#if WRITE_RECON_BUFFER
+void write_dx_frame_to_file(YV12_BUFFER_CONFIG *frame, int this_frame) {
+
+ // write the frame
+ FILE *yframe;
+ int i;
+ char filename[255];
+
+ sprintf(filename, "dx\\y%04d.raw", this_frame);
+ yframe = fopen(filename, "wb");
+
+ for (i = 0; i < frame->y_height; i++)
+ fwrite(frame->y_buffer + i * frame->y_stride,
+ frame->y_width, 1, yframe);
+
+ fclose(yframe);
+ sprintf(filename, "dx\\u%04d.raw", this_frame);
+ yframe = fopen(filename, "wb");
+
+ for (i = 0; i < frame->uv_height; i++)
+ fwrite(frame->u_buffer + i * frame->uv_stride,
+ frame->uv_width, 1, yframe);
+
+ fclose(yframe);
+ sprintf(filename, "dx\\v%04d.raw", this_frame);
+ yframe = fopen(filename, "wb");
+
+ for (i = 0; i < frame->uv_height; i++)
+ fwrite(frame->v_buffer + i * frame->uv_stride,
+ frame->uv_width, 1, yframe);
+
+ fclose(yframe);
+}
+#endif
+
+void vp9_initialize_dec(void) {
+ static int init_done = 0;
+
+ if (!init_done) {
+ vp9_initialize_common();
+ vp9_init_quant_tables();
+ vp8_scale_machine_specific_config();
+ init_done = 1;
+ }
+}
+
+VP9D_PTR vp9_create_decompressor(VP9D_CONFIG *oxcf) {
+ VP9D_COMP *pbi = vpx_memalign(32, sizeof(VP9D_COMP));
+
+ if (!pbi)
+ return NULL;
+
+ vpx_memset(pbi, 0, sizeof(VP9D_COMP));
+
+ if (setjmp(pbi->common.error.jmp)) {
+ pbi->common.error.setjmp = 0;
+ vp9_remove_decompressor(pbi);
+ return 0;
+ }
+
+ pbi->common.error.setjmp = 1;
+ vp9_initialize_dec();
+
+ vp9_create_common(&pbi->common);
+
+ pbi->common.current_video_frame = 0;
+ pbi->ready_for_new_data = 1;
+
+ /* vp9_init_de_quantizer() is first called here. Add check in
+ * frame_init_dequantizer() to avoid unnecessary calling of
+ * vp9_init_de_quantizer() for every frame.
+ */
+ vp9_init_de_quantizer(pbi);
+
+ vp9_loop_filter_init(&pbi->common);
+
+ pbi->common.error.setjmp = 0;
+
+ pbi->decoded_key_frame = 0;
+
+ return (VP9D_PTR) pbi;
+}
+
+void vp9_remove_decompressor(VP9D_PTR ptr) {
+ VP9D_COMP *pbi = (VP9D_COMP *) ptr;
+
+ if (!pbi)
+ return;
+
+  // Delete segmentation map
+ if (pbi->common.last_frame_seg_map != 0)
+ vpx_free(pbi->common.last_frame_seg_map);
+
+ vp9_remove_common(&pbi->common);
+ vpx_free(pbi->mbc);
+ vpx_free(pbi);
+}
+
+
+vpx_codec_err_t vp9_get_reference_dec(VP9D_PTR ptr, VP9_REFFRAME ref_frame_flag,
+ YV12_BUFFER_CONFIG *sd) {
+ VP9D_COMP *pbi = (VP9D_COMP *) ptr;
+ VP9_COMMON *cm = &pbi->common;
+ int ref_fb_idx;
+
+ if (ref_frame_flag == VP9_LAST_FLAG)
+ ref_fb_idx = cm->lst_fb_idx;
+ else if (ref_frame_flag == VP9_GOLD_FLAG)
+ ref_fb_idx = cm->gld_fb_idx;
+ else if (ref_frame_flag == VP9_ALT_FLAG)
+ ref_fb_idx = cm->alt_fb_idx;
+ else {
+ vpx_internal_error(&pbi->common.error, VPX_CODEC_ERROR,
+ "Invalid reference frame");
+ return pbi->common.error.error_code;
+ }
+
+ if (cm->yv12_fb[ref_fb_idx].y_height != sd->y_height ||
+ cm->yv12_fb[ref_fb_idx].y_width != sd->y_width ||
+ cm->yv12_fb[ref_fb_idx].uv_height != sd->uv_height ||
+ cm->yv12_fb[ref_fb_idx].uv_width != sd->uv_width) {
+ vpx_internal_error(&pbi->common.error, VPX_CODEC_ERROR,
+ "Incorrect buffer dimensions");
+ } else
+ vp8_yv12_copy_frame_ptr(&cm->yv12_fb[ref_fb_idx], sd);
+
+ return pbi->common.error.error_code;
+}
+
+
+vpx_codec_err_t vp9_set_reference_dec(VP9D_PTR ptr, VP9_REFFRAME ref_frame_flag,
+ YV12_BUFFER_CONFIG *sd) {
+ VP9D_COMP *pbi = (VP9D_COMP *) ptr;
+ VP9_COMMON *cm = &pbi->common;
+ int *ref_fb_ptr = NULL;
+ int free_fb;
+
+ if (ref_frame_flag == VP9_LAST_FLAG)
+ ref_fb_ptr = &cm->lst_fb_idx;
+ else if (ref_frame_flag == VP9_GOLD_FLAG)
+ ref_fb_ptr = &cm->gld_fb_idx;
+ else if (ref_frame_flag == VP9_ALT_FLAG)
+ ref_fb_ptr = &cm->alt_fb_idx;
+ else {
+ vpx_internal_error(&pbi->common.error, VPX_CODEC_ERROR,
+ "Invalid reference frame");
+ return pbi->common.error.error_code;
+ }
+
+ if (cm->yv12_fb[*ref_fb_ptr].y_height != sd->y_height ||
+ cm->yv12_fb[*ref_fb_ptr].y_width != sd->y_width ||
+ cm->yv12_fb[*ref_fb_ptr].uv_height != sd->uv_height ||
+ cm->yv12_fb[*ref_fb_ptr].uv_width != sd->uv_width) {
+ vpx_internal_error(&pbi->common.error, VPX_CODEC_ERROR,
+ "Incorrect buffer dimensions");
+ } else {
+ /* Find an empty frame buffer. */
+ free_fb = get_free_fb(cm);
+ /* Decrease fb_idx_ref_cnt since it will be increased again in
+ * ref_cnt_fb() below. */
+ cm->fb_idx_ref_cnt[free_fb]--;
+
+ /* Manage the reference counters and copy image. */
+ ref_cnt_fb(cm->fb_idx_ref_cnt, ref_fb_ptr, free_fb);
+ vp8_yv12_copy_frame_ptr(sd, &cm->yv12_fb[*ref_fb_ptr]);
+ }
+
+ return pbi->common.error.error_code;
+}
+
+/* For ARM NEON, d8-d15 are callee-saved registers and need to be saved by us. */
+#if HAVE_ARMV7
+extern void vp9_push_neon(int64_t *store);
+extern void vp9_pop_neon(int64_t *store);
+#endif
+
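+/* Find an unreferenced frame buffer, take a reference on it and return
+ * its index. */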
+static int get_free_fb(VP9_COMMON *cm) {
+ int i;
+ for (i = 0; i < NUM_YV12_BUFFERS; i++)
+ if (cm->fb_idx_ref_cnt[i] == 0)
+ break;
+
+ assert(i < NUM_YV12_BUFFERS);
+ cm->fb_idx_ref_cnt[i] = 1;
+ return i;
+}
+
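+/* Move a reference: drop the count on the buffer *idx currently points at,
+ * point *idx at new_idx and increment that buffer's count. */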
+static void ref_cnt_fb(int *buf, int *idx, int new_idx) {
+ if (buf[*idx] > 0)
+ buf[*idx]--;
+
+ *idx = new_idx;
+
+ buf[new_idx]++;
+}
+
+/* If any buffer copy / swapping is signalled it should be done here. */
+static int swap_frame_buffers(VP9_COMMON *cm) {
+ int err = 0;
+
+ /* The alternate reference frame or golden frame can be updated
+ * using the new, last, or golden/alt ref frame. If it
+ * is updated using the newly decoded frame it is a refresh.
+ * An update using the last or golden/alt ref frame is a copy.
+ */
+ if (cm->copy_buffer_to_arf) {
+ int new_fb = 0;
+
+ if (cm->copy_buffer_to_arf == 1)
+ new_fb = cm->lst_fb_idx;
+ else if (cm->copy_buffer_to_arf == 2)
+ new_fb = cm->gld_fb_idx;
+ else
+ err = -1;
+
+ ref_cnt_fb(cm->fb_idx_ref_cnt, &cm->alt_fb_idx, new_fb);
+ }
+
+ if (cm->copy_buffer_to_gf) {
+ int new_fb = 0;
+
+ if (cm->copy_buffer_to_gf == 1)
+ new_fb = cm->lst_fb_idx;
+ else if (cm->copy_buffer_to_gf == 2)
+ new_fb = cm->alt_fb_idx;
+ else
+ err = -1;
+
+ ref_cnt_fb(cm->fb_idx_ref_cnt, &cm->gld_fb_idx, new_fb);
+ }
+
+ if (cm->refresh_golden_frame)
+ ref_cnt_fb(cm->fb_idx_ref_cnt, &cm->gld_fb_idx, cm->new_fb_idx);
+
+ if (cm->refresh_alt_ref_frame)
+ ref_cnt_fb(cm->fb_idx_ref_cnt, &cm->alt_fb_idx, cm->new_fb_idx);
+
+ if (cm->refresh_last_frame) {
+ ref_cnt_fb(cm->fb_idx_ref_cnt, &cm->lst_fb_idx, cm->new_fb_idx);
+
+ cm->frame_to_show = &cm->yv12_fb[cm->lst_fb_idx];
+ } else
+ cm->frame_to_show = &cm->yv12_fb[cm->new_fb_idx];
+
+ cm->fb_idx_ref_cnt[cm->new_fb_idx]--;
+
+ return err;
+}
+
+int vp9_receive_compressed_data(VP9D_PTR ptr, unsigned long size,
+ const unsigned char *source,
+ int64_t time_stamp) {
+#if HAVE_ARMV7
+ int64_t dx_store_reg[8];
+#endif
+ VP9D_COMP *pbi = (VP9D_COMP *) ptr;
+ VP9_COMMON *cm = &pbi->common;
+ int retcode = 0;
+
+ /*if(pbi->ready_for_new_data == 0)
+ return -1;*/
+
+ if (ptr == 0) {
+ return -1;
+ }
+
+ pbi->common.error.error_code = VPX_CODEC_OK;
+
+ pbi->Source = source;
+ pbi->source_sz = size;
+
+ if (pbi->source_sz == 0) {
+ /* This is used to signal that we are missing frames.
+     * We do not know whether the missing frame(s) were supposed to update
+     * any of the reference buffers, but we act conservatively and
+ * mark only the last buffer as corrupted.
+ */
+ cm->yv12_fb[cm->lst_fb_idx].corrupted = 1;
+ }
+
+#if HAVE_ARMV7
+#if CONFIG_RUNTIME_CPU_DETECT
+ if (cm->rtcd.flags & HAS_NEON)
+#endif
+ {
+ vp9_push_neon(dx_store_reg);
+ }
+#endif
+
+ cm->new_fb_idx = get_free_fb(cm);
+
+ if (setjmp(pbi->common.error.jmp)) {
+#if HAVE_ARMV7
+#if CONFIG_RUNTIME_CPU_DETECT
+ if (cm->rtcd.flags & HAS_NEON)
+#endif
+ {
+ vp9_pop_neon(dx_store_reg);
+ }
+#endif
+ pbi->common.error.setjmp = 0;
+
+    /* We do not know whether the missing frame(s) were supposed to update
+     * any of the reference buffers, but we act conservatively and
+ * mark only the last buffer as corrupted.
+ */
+ cm->yv12_fb[cm->lst_fb_idx].corrupted = 1;
+
+ if (cm->fb_idx_ref_cnt[cm->new_fb_idx] > 0)
+ cm->fb_idx_ref_cnt[cm->new_fb_idx]--;
+ return -1;
+ }
+
+ pbi->common.error.setjmp = 1;
+
+ retcode = vp9_decode_frame(pbi);
+
+ if (retcode < 0) {
+#if HAVE_ARMV7
+#if CONFIG_RUNTIME_CPU_DETECT
+ if (cm->rtcd.flags & HAS_NEON)
+#endif
+ {
+ vp9_pop_neon(dx_store_reg);
+ }
+#endif
+ pbi->common.error.error_code = VPX_CODEC_ERROR;
+ pbi->common.error.setjmp = 0;
+ if (cm->fb_idx_ref_cnt[cm->new_fb_idx] > 0)
+ cm->fb_idx_ref_cnt[cm->new_fb_idx]--;
+ return retcode;
+ }
+
+ {
+ if (swap_frame_buffers(cm)) {
+#if HAVE_ARMV7
+#if CONFIG_RUNTIME_CPU_DETECT
+ if (cm->rtcd.flags & HAS_NEON)
+#endif
+ {
+ vp9_pop_neon(dx_store_reg);
+ }
+#endif
+ pbi->common.error.error_code = VPX_CODEC_ERROR;
+ pbi->common.error.setjmp = 0;
+ return -1;
+ }
+
+#if WRITE_RECON_BUFFER
+ if (cm->show_frame)
+ write_dx_frame_to_file(cm->frame_to_show,
+ cm->current_video_frame);
+ else
+ write_dx_frame_to_file(cm->frame_to_show,
+ cm->current_video_frame + 1000);
+#endif
+
+ if (cm->filter_level) {
+ /* Apply the loop filter if appropriate. */
+ vp9_loop_filter_frame(cm, &pbi->mb);
+ }
+ vp8_yv12_extend_frame_borders_ptr(cm->frame_to_show);
+ }
+
+#if CONFIG_DEBUG
+ if (cm->show_frame)
+ recon_write_yuv_frame("recon.yuv", cm->frame_to_show);
+#endif
+
+ vp9_clear_system_state();
+
+ if (cm->show_frame) {
+ vpx_memcpy(cm->prev_mip, cm->mip,
+ (cm->mb_cols + 1) * (cm->mb_rows + 1)* sizeof(MODE_INFO));
+ } else {
+ vpx_memset(cm->prev_mip, 0,
+ (cm->mb_cols + 1) * (cm->mb_rows + 1)* sizeof(MODE_INFO));
+ }
+
+ /*vp9_print_modes_and_motion_vectors(cm->mi, cm->mb_rows,cm->mb_cols,
+ cm->current_video_frame);*/
+
+ if (cm->show_frame)
+ cm->current_video_frame++;
+
+ pbi->ready_for_new_data = 0;
+ pbi->last_time_stamp = time_stamp;
+ pbi->source_sz = 0;
+
+#if HAVE_ARMV7
+#if CONFIG_RUNTIME_CPU_DETECT
+ if (cm->rtcd.flags & HAS_NEON)
+#endif
+ {
+ vp9_pop_neon(dx_store_reg);
+ }
+#endif
+ pbi->common.error.setjmp = 0;
+ return retcode;
+}
+
+int vp9_get_raw_frame(VP9D_PTR ptr, YV12_BUFFER_CONFIG *sd,
+ int64_t *time_stamp, int64_t *time_end_stamp,
+ vp9_ppflags_t *flags) {
+ int ret = -1;
+ VP9D_COMP *pbi = (VP9D_COMP *) ptr;
+
+ if (pbi->ready_for_new_data == 1)
+ return ret;
+
+  /* i.e., no raw frame to show */
+ if (pbi->common.show_frame == 0)
+ return ret;
+
+ pbi->ready_for_new_data = 1;
+ *time_stamp = pbi->last_time_stamp;
+ *time_end_stamp = 0;
+
+ sd->clrtype = pbi->common.clr_type;
+#if CONFIG_POSTPROC
+ ret = vp9_post_proc_frame(&pbi->common, sd, flags);
+#else
+
+ if (pbi->common.frame_to_show) {
+ *sd = *pbi->common.frame_to_show;
+ sd->y_width = pbi->common.Width;
+ sd->y_height = pbi->common.Height;
+ sd->uv_height = pbi->common.Height / 2;
+ ret = 0;
+ } else {
+ ret = -1;
+ }
+
+#endif /*!CONFIG_POSTPROC*/
+ vp9_clear_system_state();
+ return ret;
+}
--- /dev/null
+++ b/vp9/decoder/onyxd_int.h
@@ -1,0 +1,106 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef __INC_ONYXD_INT_H
+#define __INC_ONYXD_INT_H
+#include "vpx_ports/config.h"
+#include "vp9/common/onyxd.h"
+#include "treereader.h"
+#include "vp9/common/onyxc_int.h"
+#include "dequantize.h"
+
+// #define DEC_DEBUG
+
+typedef struct {
+ int ithread;
+ void *ptr1;
+ void *ptr2;
+} DECODETHREAD_DATA;
+
+typedef struct {
+ MACROBLOCKD mbd;
+ int mb_row;
+ int current_mb_col;
+ short *coef_ptr;
+} MB_ROW_DEC;
+
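+/* Working state for the detokenizer: scan orders, coefficient probability
+ * tables, entropy contexts and per-block end-of-block values. */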
+typedef struct {
+ int const *scan;
+ int const *scan_8x8;
+ UINT8 const *ptr_block2leftabove;
+ vp9_tree_index const *vp9_coef_tree_ptr;
+ unsigned char *norm_ptr;
+ UINT8 *ptr_coef_bands_x;
+ UINT8 *ptr_coef_bands_x_8x8;
+
+ ENTROPY_CONTEXT_PLANES *A;
+ ENTROPY_CONTEXT_PLANES *L;
+
+ INT16 *qcoeff_start_ptr;
+
+ vp9_prob const *coef_probs[BLOCK_TYPES];
+ vp9_prob const *coef_probs_8x8[BLOCK_TYPES_8X8];
+ vp9_prob const *coef_probs_16X16[BLOCK_TYPES_16X16];
+
+ UINT8 eob[25];
+
+} DETOK;
+
+typedef struct VP9Decompressor {
+ DECLARE_ALIGNED(16, MACROBLOCKD, mb);
+
+ DECLARE_ALIGNED(16, VP9_COMMON, common);
+
+ VP9D_CONFIG oxcf;
+
+
+ const unsigned char *Source;
+ unsigned int source_sz;
+
+ vp9_reader *mbc;
+ int64_t last_time_stamp;
+ int ready_for_new_data;
+
+ DETOK detoken;
+
+ vp9_dequant_idct_add_fn_t idct_add;
+ vp9_dequant_dc_idct_add_fn_t dc_idct_add;
+ vp9_dequant_dc_idct_add_y_block_fn_t dc_idct_add_y_block;
+ vp9_dequant_idct_add_y_block_fn_t idct_add_y_block;
+ vp9_dequant_idct_add_uv_block_fn_t idct_add_uv_block;
+
+ vp9_prob prob_skip_false;
+
+ int decoded_key_frame;
+
+} VP9D_COMP;
+
+int vp9_decode_frame(VP9D_COMP *cpi);
+
+
+#if CONFIG_DEBUG
+#define CHECK_MEM_ERROR(lval,expr) do {\
+ lval = (expr); \
+ if(!lval) \
+ vpx_internal_error(&pbi->common.error, VPX_CODEC_MEM_ERROR,\
+ "Failed to allocate "#lval" at %s:%d", \
+ __FILE__,__LINE__);\
+ } while(0)
+#else
+#define CHECK_MEM_ERROR(lval,expr) do {\
+ lval = (expr); \
+ if(!lval) \
+ vpx_internal_error(&pbi->common.error, VPX_CODEC_MEM_ERROR,\
+ "Failed to allocate "#lval);\
+ } while(0)
+#endif
+
+#endif // __INC_ONYXD_INT_H
--- /dev/null
+++ b/vp9/decoder/reconintra_mt.h
@@ -1,0 +1,15 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef __INC_RECONINTRA_MT_H
+#define __INC_RECONINTRA_MT_H
+
+#endif
--- /dev/null
+++ b/vp9/decoder/treereader.h
@@ -1,0 +1,37 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef tree_reader_h
+#define tree_reader_h 1
+
+#include "vp9/common/treecoder.h"
+
+#include "dboolhuff.h"
+
+typedef BOOL_DECODER vp9_reader;
+
+#define vp9_read decode_bool
+#define vp9_read_literal decode_value
+#define vp9_read_bit(R) vp9_read(R, vp9_prob_half)
+
+/* Intent of tree data structure is to make decoding trivial. */
+
+static int treed_read(vp9_reader *const r, /* !!! must return a 0 or 1 !!! */
+ vp9_tree t,
+ const vp9_prob *const p) {
+ register vp9_tree_index i = 0;
+
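+  /* Internal tree nodes have positive indices; leaves store the negated
+   * token value, so the loop ends on a non-positive index. */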
+ while ((i = t[ i + vp9_read(r, p[i >> 1])]) > 0);
+
+ return -i;
+}
+
+#endif /* tree_reader_h */
--- /dev/null
+++ b/vp9/decoder/x86/dequantize_mmx.asm
@@ -1,0 +1,406 @@
+;
+; Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+%include "third_party/x86inc/x86inc.asm"
+
+SECTION_RODATA
+align 16
+x_s1sqr2: times 4 dw 0x8A8C
+align 16
+x_c1sqr2less1: times 4 dw 0x4E7B
+align 16
+pw_16: times 4 dw 16
+
+SECTION .text
+
+INIT_MMX
+
+
+;void dequantize_b_impl_mmx(short *sq, short *dq, short *q)
+cglobal dequantize_b_impl_mmx, 3,3,0,sq,dq,arg3
+ mova m1, [sqq]
+ pmullw m1, [arg3q+0] ; mm4 *= kernel 0 modifiers.
+ mova [dqq+ 0], m1
+
+ mova m1, [sqq+8]
+ pmullw m1, [arg3q+8] ; mm4 *= kernel 0 modifiers.
+ mova [dqq+ 8], m1
+
+ mova m1, [sqq+16]
+ pmullw m1, [arg3q+16] ; mm4 *= kernel 0 modifiers.
+ mova [dqq+16], m1
+
+ mova m1, [sqq+24]
+ pmullw m1, [arg3q+24] ; mm4 *= kernel 0 modifiers.
+ mova [dqq+24], m1
+ RET
+
+
+;void dequant_idct_add_mmx(short *input, short *dq, unsigned char *pred, unsigned char *dest, int pitch, int stride)
+cglobal dequant_idct_add_mmx, 4,6,0,inp,dq,pred,dest,pit,stride
+
+%if ARCH_X86_64
+ movsxd strideq, dword stridem
+ movsxd pitq, dword pitm
+%else
+ mov strideq, stridem
+ mov pitq, pitm
+%endif
+
+ mova m0, [inpq+ 0]
+ pmullw m0, [dqq]
+
+ mova m1, [inpq+ 8]
+ pmullw m1, [dqq+ 8]
+
+ mova m2, [inpq+16]
+ pmullw m2, [dqq+16]
+
+ mova m3, [inpq+24]
+ pmullw m3, [dqq+24]
+
+ pxor m7, m7
+ mova [inpq], m7
+ mova [inpq+8], m7
+ mova [inpq+16], m7
+ mova [inpq+24], m7
+
+
+ psubw m0, m2 ; b1= 0-2
+ paddw m2, m2 ;
+
+ mova m5, m1
+ paddw m2, m0 ; a1 =0+2
+
+ pmulhw m5, [x_s1sqr2];
+ paddw m5, m1 ; ip1 * sin(pi/8) * sqrt(2)
+
+ mova m7, m3 ;
+ pmulhw m7, [x_c1sqr2less1];
+
+ paddw m7, m3 ; ip3 * cos(pi/8) * sqrt(2)
+ psubw m7, m5 ; c1
+
+ mova m5, m1
+ mova m4, m3
+
+ pmulhw m5, [x_c1sqr2less1]
+ paddw m5, m1
+
+ pmulhw m3, [x_s1sqr2]
+ paddw m3, m4
+
+ paddw m3, m5 ; d1
+ mova m6, m2 ; a1
+
+ mova m4, m0 ; b1
+ paddw m2, m3 ;0
+
+ paddw m4, m7 ;1
+ psubw m0, m7 ;2
+
+ psubw m6, m3 ;3
+
+ mova m1, m2 ; 03 02 01 00
+ mova m3, m4 ; 23 22 21 20
+
+ punpcklwd m1, m0 ; 11 01 10 00
+ punpckhwd m2, m0 ; 13 03 12 02
+
+ punpcklwd m3, m6 ; 31 21 30 20
+ punpckhwd m4, m6 ; 33 23 32 22
+
+ mova m0, m1 ; 11 01 10 00
+ mova m5, m2 ; 13 03 12 02
+
+ punpckldq m0, m3 ; 30 20 10 00
+ punpckhdq m1, m3 ; 31 21 11 01
+
+ punpckldq m2, m4 ; 32 22 12 02
+ punpckhdq m5, m4 ; 33 23 13 03
+
+ mova m3, m5 ; 33 23 13 03
+
+ psubw m0, m2 ; b1= 0-2
+ paddw m2, m2 ;
+
+ mova m5, m1
+ paddw m2, m0 ; a1 =0+2
+
+ pmulhw m5, [x_s1sqr2];
+ paddw m5, m1 ; ip1 * sin(pi/8) * sqrt(2)
+
+ mova m7, m3 ;
+ pmulhw m7, [x_c1sqr2less1];
+
+ paddw m7, m3 ; ip3 * cos(pi/8) * sqrt(2)
+ psubw m7, m5 ; c1
+
+ mova m5, m1
+ mova m4, m3
+
+ pmulhw m5, [x_c1sqr2less1]
+ paddw m5, m1
+
+ pmulhw m3, [x_s1sqr2]
+ paddw m3, m4
+
+ paddw m3, m5 ; d1
+ paddw m0, [pw_16]
+
+ paddw m2, [pw_16]
+ mova m6, m2 ; a1
+
+ mova m4, m0 ; b1
+ paddw m2, m3 ;0
+
+ paddw m4, m7 ;1
+ psubw m0, m7 ;2
+
+ psubw m6, m3 ;3
+ psraw m2, 5
+
+ psraw m0, 5
+ psraw m4, 5
+
+ psraw m6, 5
+
+ mova m1, m2 ; 03 02 01 00
+ mova m3, m4 ; 23 22 21 20
+
+ punpcklwd m1, m0 ; 11 01 10 00
+ punpckhwd m2, m0 ; 13 03 12 02
+
+ punpcklwd m3, m6 ; 31 21 30 20
+ punpckhwd m4, m6 ; 33 23 32 22
+
+ mova m0, m1 ; 11 01 10 00
+ mova m5, m2 ; 13 03 12 02
+
+ punpckldq m0, m3 ; 30 20 10 00
+ punpckhdq m1, m3 ; 31 21 11 01
+
+ punpckldq m2, m4 ; 32 22 12 02
+ punpckhdq m5, m4 ; 33 23 13 03
+
+ pxor m7, m7
+
+ movh m4, [predq]
+ punpcklbw m4, m7
+ paddsw m0, m4
+ packuswb m0, m7
+ movh [destq], m0
+
+ movh m4, [predq+pitq]
+ punpcklbw m4, m7
+ paddsw m1, m4
+ packuswb m1, m7
+ movh [destq+strideq], m1
+
+ movh m4, [predq+2*pitq]
+ punpcklbw m4, m7
+ paddsw m2, m4
+ packuswb m2, m7
+ movh [destq+strideq*2], m2
+
+ add destq, strideq
+ add predq, pitq
+
+ movh m4, [predq+2*pitq]
+ punpcklbw m4, m7
+ paddsw m5, m4
+ packuswb m5, m7
+ movh [destq+strideq*2], m5
+ RET
+
+
+;void dequant_dc_idct_add_mmx(short *input, short *dq, unsigned char *pred, unsigned char *dest, int pitch, int stride, int Dc)
+cglobal dequant_dc_idct_add_mmx, 4,7,0,inp,dq,pred,dest,pit,stride,Dc
+
+%if ARCH_X86_64
+ movsxd strideq, dword stridem
+ movsxd pitq, dword pitm
+%else
+ mov strideq, stridem
+ mov pitq, pitm
+%endif
+
+ mov Dcq, Dcm
+ mova m0, [inpq+ 0]
+ pmullw m0, [dqq+ 0]
+
+ mova m1, [inpq+ 8]
+ pmullw m1, [dqq+ 8]
+
+ mova m2, [inpq+16]
+ pmullw m2, [dqq+16]
+
+ mova m3, [inpq+24]
+ pmullw m3, [dqq+24]
+
+ pxor m7, m7
+ mova [inpq+ 0], m7
+ mova [inpq+ 8], m7
+ mova [inpq+16], m7
+ mova [inpq+24], m7
+
+ ; move lower word of Dc to lower word of m0
+ psrlq m0, 16
+ psllq m0, 16
+ and Dcq, 0xFFFF ; If Dc < 0, we don't want the full dword precision.
+ movh m7, Dcq
+ por m0, m7
+ psubw m0, m2 ; b1= 0-2
+ paddw m2, m2 ;
+
+ mova m5, m1
+ paddw m2, m0 ; a1 =0+2
+
+ pmulhw m5, [x_s1sqr2];
+ paddw m5, m1 ; ip1 * sin(pi/8) * sqrt(2)
+
+ mova m7, m3 ;
+ pmulhw m7, [x_c1sqr2less1];
+
+ paddw m7, m3 ; ip3 * cos(pi/8) * sqrt(2)
+ psubw m7, m5 ; c1
+
+ mova m5, m1
+ mova m4, m3
+
+ pmulhw m5, [x_c1sqr2less1]
+ paddw m5, m1
+
+ pmulhw m3, [x_s1sqr2]
+ paddw m3, m4
+
+ paddw m3, m5 ; d1
+ mova m6, m2 ; a1
+
+ mova m4, m0 ; b1
+ paddw m2, m3 ;0
+
+ paddw m4, m7 ;1
+ psubw m0, m7 ;2
+
+ psubw m6, m3 ;3
+
+ mova m1, m2 ; 03 02 01 00
+ mova m3, m4 ; 23 22 21 20
+
+ punpcklwd m1, m0 ; 11 01 10 00
+ punpckhwd m2, m0 ; 13 03 12 02
+
+ punpcklwd m3, m6 ; 31 21 30 20
+ punpckhwd m4, m6 ; 33 23 32 22
+
+ mova m0, m1 ; 11 01 10 00
+ mova m5, m2 ; 13 03 12 02
+
+ punpckldq m0, m3 ; 30 20 10 00
+ punpckhdq m1, m3 ; 31 21 11 01
+
+ punpckldq m2, m4 ; 32 22 12 02
+ punpckhdq m5, m4 ; 33 23 13 03
+
+ mova m3, m5 ; 33 23 13 03
+
+ psubw m0, m2 ; b1= 0-2
+ paddw m2, m2 ;
+
+ mova m5, m1
+ paddw m2, m0 ; a1 =0+2
+
+ pmulhw m5, [x_s1sqr2];
+ paddw m5, m1 ; ip1 * sin(pi/8) * sqrt(2)
+
+ mova m7, m3 ;
+ pmulhw m7, [x_c1sqr2less1];
+
+ paddw m7, m3 ; ip3 * cos(pi/8) * sqrt(2)
+ psubw m7, m5 ; c1
+
+ mova m5, m1
+ mova m4, m3
+
+ pmulhw m5, [x_c1sqr2less1]
+ paddw m5, m1
+
+ pmulhw m3, [x_s1sqr2]
+ paddw m3, m4
+
+ paddw m3, m5 ; d1
+ paddw m0, [pw_16]
+
+ paddw m2, [pw_16]
+ mova m6, m2 ; a1
+
+ mova m4, m0 ; b1
+ paddw m2, m3 ;0
+
+ paddw m4, m7 ;1
+ psubw m0, m7 ;2
+
+ psubw m6, m3 ;3
+ psraw m2, 5
+
+ psraw m0, 5
+ psraw m4, 5
+
+ psraw m6, 5
+
+ mova m1, m2 ; 03 02 01 00
+ mova m3, m4 ; 23 22 21 20
+
+ punpcklwd m1, m0 ; 11 01 10 00
+ punpckhwd m2, m0 ; 13 03 12 02
+
+ punpcklwd m3, m6 ; 31 21 30 20
+ punpckhwd m4, m6 ; 33 23 32 22
+
+ mova m0, m1 ; 11 01 10 00
+ mova m5, m2 ; 13 03 12 02
+
+ punpckldq m0, m3 ; 30 20 10 00
+ punpckhdq m1, m3 ; 31 21 11 01
+
+ punpckldq m2, m4 ; 32 22 12 02
+ punpckhdq m5, m4 ; 33 23 13 03
+
+ pxor m7, m7
+
+ movh m4, [predq]
+ punpcklbw m4, m7
+ paddsw m0, m4
+ packuswb m0, m7
+ movh [destq], m0
+
+ movh m4, [predq+pitq]
+ punpcklbw m4, m7
+ paddsw m1, m4
+ packuswb m1, m7
+ movh [destq+strideq], m1
+
+ movh m4, [predq+2*pitq]
+ punpcklbw m4, m7
+ paddsw m2, m4
+ packuswb m2, m7
+ movh [destq+strideq*2], m2
+
+ add destq, strideq
+ add predq, pitq
+
+ movh m4, [predq+2*pitq]
+ punpcklbw m4, m7
+ paddsw m5, m4
+ packuswb m5, m7
+ movh [destq+strideq*2], m5
+ RET
+
--- /dev/null
+++ b/vp9/decoder/x86/idct_blk_mmx.c
@@ -1,0 +1,143 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "vpx_ports/config.h"
+#include "vp9/common/idct.h"
+#include "vp9/decoder/dequantize.h"
+
+void vp9_dequant_dc_idct_add_y_block_mmx(short *q, short *dq,
+ unsigned char *pre,
+ unsigned char *dst,
+ int stride, char *eobs, short *dc) {
+ int i;
+
+ for (i = 0; i < 4; i++) {
+ if (eobs[0] > 1)
+ vp9_dequant_dc_idct_add_mmx(q, dq, pre, dst, 16, stride, dc[0]);
+ else
+ vp9_dc_only_idct_add_mmx(dc[0], pre, dst, 16, stride);
+
+ if (eobs[1] > 1)
+ vp9_dequant_dc_idct_add_mmx(q + 16, dq, pre + 4,
+ dst + 4, 16, stride, dc[1]);
+ else
+ vp9_dc_only_idct_add_mmx(dc[1], pre + 4, dst + 4, 16, stride);
+
+ if (eobs[2] > 1)
+ vp9_dequant_dc_idct_add_mmx(q + 32, dq, pre + 8,
+ dst + 8, 16, stride, dc[2]);
+ else
+ vp9_dc_only_idct_add_mmx(dc[2], pre + 8, dst + 8, 16, stride);
+
+ if (eobs[3] > 1)
+ vp9_dequant_dc_idct_add_mmx(q + 48, dq, pre + 12,
+ dst + 12, 16, stride, dc[3]);
+ else
+ vp9_dc_only_idct_add_mmx(dc[3], pre + 12, dst + 12, 16, stride);
+
+ q += 64;
+ dc += 4;
+ pre += 64;
+ dst += 4 * stride;
+ eobs += 4;
+ }
+}
+
+void vp9_dequant_idct_add_y_block_mmx(short *q, short *dq,
+ unsigned char *pre,
+ unsigned char *dst,
+ int stride, char *eobs) {
+ int i;
+
+ for (i = 0; i < 4; i++) {
+ if (eobs[0] > 1)
+ vp9_dequant_idct_add_mmx(q, dq, pre, dst, 16, stride);
+ else {
+ vp9_dc_only_idct_add_mmx(q[0]*dq[0], pre, dst, 16, stride);
+ ((int *)q)[0] = 0;
+ }
+
+ if (eobs[1] > 1)
+ vp9_dequant_idct_add_mmx(q + 16, dq, pre + 4, dst + 4, 16, stride);
+ else {
+ vp9_dc_only_idct_add_mmx(q[16]*dq[0], pre + 4, dst + 4, 16, stride);
+ ((int *)(q + 16))[0] = 0;
+ }
+
+ if (eobs[2] > 1)
+ vp9_dequant_idct_add_mmx(q + 32, dq, pre + 8, dst + 8, 16, stride);
+ else {
+ vp9_dc_only_idct_add_mmx(q[32]*dq[0], pre + 8, dst + 8, 16, stride);
+ ((int *)(q + 32))[0] = 0;
+ }
+
+ if (eobs[3] > 1)
+ vp9_dequant_idct_add_mmx(q + 48, dq, pre + 12, dst + 12, 16, stride);
+ else {
+ vp9_dc_only_idct_add_mmx(q[48]*dq[0], pre + 12, dst + 12, 16, stride);
+ ((int *)(q + 48))[0] = 0;
+ }
+
+ q += 64;
+ pre += 64;
+ dst += 4 * stride;
+ eobs += 4;
+ }
+}
+
+void vp9_dequant_idct_add_uv_block_mmx(short *q, short *dq,
+ unsigned char *pre,
+ unsigned char *dstu,
+ unsigned char *dstv,
+ int stride, char *eobs) {
+ int i;
+
+ for (i = 0; i < 2; i++) {
+ if (eobs[0] > 1)
+ vp9_dequant_idct_add_mmx(q, dq, pre, dstu, 8, stride);
+ else {
+ vp9_dc_only_idct_add_mmx(q[0]*dq[0], pre, dstu, 8, stride);
+ ((int *)q)[0] = 0;
+ }
+
+ if (eobs[1] > 1)
+ vp9_dequant_idct_add_mmx(q + 16, dq, pre + 4, dstu + 4, 8, stride);
+ else {
+ vp9_dc_only_idct_add_mmx(q[16]*dq[0], pre + 4, dstu + 4, 8, stride);
+ ((int *)(q + 16))[0] = 0;
+ }
+
+ q += 32;
+ pre += 32;
+ dstu += 4 * stride;
+ eobs += 2;
+ }
+
+ for (i = 0; i < 2; i++) {
+ if (eobs[0] > 1)
+ vp9_dequant_idct_add_mmx(q, dq, pre, dstv, 8, stride);
+ else {
+ vp9_dc_only_idct_add_mmx(q[0]*dq[0], pre, dstv, 8, stride);
+ ((int *)q)[0] = 0;
+ }
+
+ if (eobs[1] > 1)
+ vp9_dequant_idct_add_mmx(q + 16, dq, pre + 4, dstv + 4, 8, stride);
+ else {
+ vp9_dc_only_idct_add_mmx(q[16]*dq[0], pre + 4, dstv + 4, 8, stride);
+ ((int *)(q + 16))[0] = 0;
+ }
+
+ q += 32;
+ pre += 32;
+ dstv += 4 * stride;
+ eobs += 2;
+ }
+}
--- /dev/null
+++ b/vp9/decoder/x86/idct_blk_sse2.c
@@ -1,0 +1,116 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "vpx_ports/config.h"
+#include "vp9/common/idct.h"
+#include "vp9/decoder/dequantize.h"
+
+void vp9_idct_dequant_dc_0_2x_sse2(short *q, short *dq,
+ unsigned char *pre, unsigned char *dst,
+ int dst_stride, short *dc);
+
+void vp9_idct_dequant_dc_full_2x_sse2(short *q, short *dq,
+ unsigned char *pre, unsigned char *dst,
+ int dst_stride, short *dc);
+
+void vp9_idct_dequant_0_2x_sse2(short *q, short *dq,
+ unsigned char *pre, unsigned char *dst,
+ int dst_stride, int blk_stride);
+
+void vp9_idct_dequant_full_2x_sse2(short *q, short *dq,
+ unsigned char *pre, unsigned char *dst,
+ int dst_stride, int blk_stride);
+
+void vp9_dequant_dc_idct_add_y_block_sse2(short *q, short *dq,
+ unsigned char *pre,
+ unsigned char *dst,
+ int stride, char *eobs, short *dc) {
+ int i;
+
+ for (i = 0; i < 4; i++) {
+ if (((short *)(eobs))[0] & 0xfefe)
+ vp9_idct_dequant_dc_full_2x_sse2(q, dq, pre, dst, stride, dc);
+ else
+ vp9_idct_dequant_dc_0_2x_sse2(q, dq, pre, dst, stride, dc);
+
+ if (((short *)(eobs))[1] & 0xfefe)
+ vp9_idct_dequant_dc_full_2x_sse2(q + 32, dq, pre + 8, dst + 8,
+ stride, dc + 2);
+ else
+ vp9_idct_dequant_dc_0_2x_sse2(q + 32, dq, pre + 8, dst + 8,
+ stride, dc + 2);
+
+ q += 64;
+ dc += 4;
+ pre += 64;
+ dst += stride * 4;
+ eobs += 4;
+ }
+}
+
+void vp9_dequant_idct_add_y_block_sse2(short *q, short *dq,
+ unsigned char *pre, unsigned char *dst,
+ int stride, char *eobs) {
+ int i;
+
+ for (i = 0; i < 4; i++) {
+ if (((short *)(eobs))[0] & 0xfefe)
+ vp9_idct_dequant_full_2x_sse2(q, dq, pre, dst, stride, 16);
+ else
+ vp9_idct_dequant_0_2x_sse2(q, dq, pre, dst, stride, 16);
+
+ if (((short *)(eobs))[1] & 0xfefe)
+ vp9_idct_dequant_full_2x_sse2(q + 32, dq, pre + 8, dst + 8, stride, 16);
+ else
+ vp9_idct_dequant_0_2x_sse2(q + 32, dq, pre + 8, dst + 8, stride, 16);
+
+ q += 64;
+ pre += 64;
+ dst += stride * 4;
+ eobs += 4;
+ }
+}
+
+void vp9_dequant_idct_add_uv_block_sse2(short *q, short *dq,
+ unsigned char *pre,
+ unsigned char *dstu,
+ unsigned char *dstv,
+ int stride, char *eobs) {
+ if (((short *)(eobs))[0] & 0xfefe)
+ vp9_idct_dequant_full_2x_sse2(q, dq, pre, dstu, stride, 8);
+ else
+ vp9_idct_dequant_0_2x_sse2(q, dq, pre, dstu, stride, 8);
+
+ q += 32;
+ pre += 32;
+ dstu += stride * 4;
+
+ if (((short *)(eobs))[1] & 0xfefe)
+ vp9_idct_dequant_full_2x_sse2(q, dq, pre, dstu, stride, 8);
+ else
+ vp9_idct_dequant_0_2x_sse2(q, dq, pre, dstu, stride, 8);
+
+ q += 32;
+ pre += 32;
+
+ if (((short *)(eobs))[2] & 0xfefe)
+ vp9_idct_dequant_full_2x_sse2(q, dq, pre, dstv, stride, 8);
+ else
+ vp9_idct_dequant_0_2x_sse2(q, dq, pre, dstv, stride, 8);
+
+ q += 32;
+ pre += 32;
+ dstv += stride * 4;
+
+ if (((short *)(eobs))[3] & 0xfefe)
+ vp9_idct_dequant_full_2x_sse2(q, dq, pre, dstv, stride, 8);
+ else
+ vp9_idct_dequant_0_2x_sse2(q, dq, pre, dstv, stride, 8);
+}
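
The SSE2 helpers work on two horizontally adjacent 4x4 blocks per call, so the dispatch inspects a pair of eob bytes at once: reading two adjacent entries as a 16-bit value and masking with 0xfefe is non-zero exactly when at least one of the two blocks has an eob above 1. A small, hedged C illustration of that test (not part of this patch):

#include <string.h>

/* Sketch: non-zero when either of two adjacent 4x4 blocks has AC
 * coefficients (eob > 1), mirroring the 0xfefe mask test above. */
static int pair_has_ac(const char *eobs) {
  short pair;
  memcpy(&pair, eobs, sizeof(pair));  /* two consecutive eob bytes */
  return pair & 0xfefe;               /* drop bit 0 of each byte so eob == 1 reads as 0 */
}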
--- /dev/null
+++ b/vp9/decoder/x86/x86_dsystemdependent.c
@@ -1,0 +1,26 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "vpx_ports/config.h"
+#include "vpx_ports/x86.h"
+#include "vp9/decoder/onyxd_int.h"
+
+#if HAVE_MMX
+void vp9_dequantize_b_impl_mmx(short *sq, short *dq, short *q);
+
+void vp9_dequantize_b_mmx(BLOCKD *d) {
+ short *sq = (short *) d->qcoeff;
+ short *dq = (short *) d->dqcoeff;
+ short *q = (short *) d->dequant;
+ vp9_dequantize_b_impl_mmx(sq, dq, q);
+}
+#endif
+
+
--- /dev/null
+++ b/vp9/encoder/arm/arm_csystemdependent.c
@@ -1,0 +1,129 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "vpx_ports/config.h"
+#include "vpx_ports/arm.h"
+#include "vp9/encoder/variance.h"
+#include "vp9/encoder/onyx_int.h"
+
+extern void (*vp9_yv12_copy_partial_frame_ptr)(YV12_BUFFER_CONFIG *src_ybc, YV12_BUFFER_CONFIG *dst_ybc, int Fraction);
+extern void vp9_yv12_copy_partial_frame(YV12_BUFFER_CONFIG *src_ybc, YV12_BUFFER_CONFIG *dst_ybc, int Fraction);
+extern void vpxyv12_copy_partial_frame_neon(YV12_BUFFER_CONFIG *src_ybc, YV12_BUFFER_CONFIG *dst_ybc, int Fraction);
+
+void vp9_arch_arm_encoder_init(VP9_COMP *cpi) {
+#if CONFIG_RUNTIME_CPU_DETECT
+ int flags = cpi->common.rtcd.flags;
+
+#if HAVE_ARMV5TE
+ if (flags & HAS_EDSP) {
+ }
+#endif
+
+#if HAVE_ARMV6
+ if (flags & HAS_MEDIA) {
+ cpi->rtcd.variance.sad16x16 = vp9_sad16x16_armv6;
+ /*cpi->rtcd.variance.sad16x8 = vp9_sad16x8_c;
+ cpi->rtcd.variance.sad8x16 = vp9_sad8x16_c;
+ cpi->rtcd.variance.sad8x8 = vp9_sad8x8_c;
+ cpi->rtcd.variance.sad4x4 = vp9_sad4x4_c;*/
+
+ /*cpi->rtcd.variance.var4x4 = vp9_variance4x4_c;*/
+ cpi->rtcd.variance.var8x8 = vp9_variance8x8_armv6;
+ /*cpi->rtcd.variance.var8x16 = vp9_variance8x16_c;
+ cpi->rtcd.variance.var16x8 = vp9_variance16x8_c;*/
+ cpi->rtcd.variance.var16x16 = vp9_variance16x16_armv6;
+
+ /*cpi->rtcd.variance.subpixvar4x4 = vp9_sub_pixel_variance4x4_c;*/
+ cpi->rtcd.variance.subpixvar8x8 = vp9_sub_pixel_variance8x8_armv6;
+ /*cpi->rtcd.variance.subpixvar8x16 = vp9_sub_pixel_variance8x16_c;
+ cpi->rtcd.variance.subpixvar16x8 = vp9_sub_pixel_variance16x8_c;*/
+ cpi->rtcd.variance.subpixvar16x16 = vp9_sub_pixel_variance16x16_armv6;
+ cpi->rtcd.variance.halfpixvar16x16_h = vp9_variance_halfpixvar16x16_h_armv6;
+ cpi->rtcd.variance.halfpixvar16x16_v = vp9_variance_halfpixvar16x16_v_armv6;
+ cpi->rtcd.variance.halfpixvar16x16_hv = vp9_variance_halfpixvar16x16_hv_armv6;
+
+ cpi->rtcd.variance.mse16x16 = vp9_mse16x16_armv6;
+ /*cpi->rtcd.variance.getmbss = vp9_get_mb_ss_c;*/
+
+ cpi->rtcd.fdct.short4x4 = vp9_short_fdct4x4_armv6;
+ cpi->rtcd.fdct.short8x4 = vp9_short_fdct8x4_armv6;
+ cpi->rtcd.fdct.fast4x4 = vp9_short_fdct4x4_armv6;
+ cpi->rtcd.fdct.fast8x4 = vp9_short_fdct8x4_armv6;
+ cpi->rtcd.fdct.walsh_short4x4 = vp9_short_walsh4x4_armv6;
+
+ /*cpi->rtcd.encodemb.berr = vp9_block_error_c;
+ cpi->rtcd.encodemb.mberr = vp9_mbblock_error_c;
+ cpi->rtcd.encodemb.mbuverr = vp9_mbuverror_c;*/
+ cpi->rtcd.encodemb.subb = vp9_subtract_b_armv6;
+ cpi->rtcd.encodemb.submby = vp9_subtract_mby_armv6;
+ cpi->rtcd.encodemb.submbuv = vp9_subtract_mbuv_armv6;
+
+ /*cpi->rtcd.quantize.quantb = vp8_regular_quantize_b;*/
+ cpi->rtcd.quantize.fastquantb = vp8_fast_quantize_b_armv6;
+ }
+#endif
+
+#if HAVE_ARMV7
+ if (flags & HAS_NEON) {
+ cpi->rtcd.variance.sad16x16 = vp9_sad16x16_neon;
+ cpi->rtcd.variance.sad16x8 = vp9_sad16x8_neon;
+ cpi->rtcd.variance.sad8x16 = vp9_sad8x16_neon;
+ cpi->rtcd.variance.sad8x8 = vp9_sad8x8_neon;
+ cpi->rtcd.variance.sad4x4 = vp9_sad4x4_neon;
+
+ /*cpi->rtcd.variance.var4x4 = vp9_variance4x4_c;*/
+ cpi->rtcd.variance.var8x8 = vp9_variance8x8_neon;
+ cpi->rtcd.variance.var8x16 = vp9_variance8x16_neon;
+ cpi->rtcd.variance.var16x8 = vp9_variance16x8_neon;
+ cpi->rtcd.variance.var16x16 = vp9_variance16x16_neon;
+
+ /*cpi->rtcd.variance.subpixvar4x4 = vp9_sub_pixel_variance4x4_c;*/
+ cpi->rtcd.variance.subpixvar8x8 = vp9_sub_pixel_variance8x8_neon;
+ /*cpi->rtcd.variance.subpixvar8x16 = vp9_sub_pixel_variance8x16_c;
+ cpi->rtcd.variance.subpixvar16x8 = vp9_sub_pixel_variance16x8_c;*/
+ cpi->rtcd.variance.subpixvar16x16 = vp9_sub_pixel_variance16x16_neon;
+ cpi->rtcd.variance.halfpixvar16x16_h = vp9_variance_halfpixvar16x16_h_neon;
+ cpi->rtcd.variance.halfpixvar16x16_v = vp9_variance_halfpixvar16x16_v_neon;
+ cpi->rtcd.variance.halfpixvar16x16_hv = vp9_variance_halfpixvar16x16_hv_neon;
+
+ cpi->rtcd.variance.mse16x16 = vp9_mse16x16_neon;
+ /*cpi->rtcd.variance.getmbss = vp9_get_mb_ss_c;*/
+
+ cpi->rtcd.fdct.short4x4 = vp9_short_fdct4x4_neon;
+ cpi->rtcd.fdct.short8x4 = vp9_short_fdct8x4_neon;
+ cpi->rtcd.fdct.fast4x4 = vp9_short_fdct4x4_neon;
+ cpi->rtcd.fdct.fast8x4 = vp9_short_fdct8x4_neon;
+ cpi->rtcd.fdct.walsh_short4x4 = vp9_short_walsh4x4_neon;
+
+ /*cpi->rtcd.encodemb.berr = vp9_block_error_c;
+ cpi->rtcd.encodemb.mberr = vp9_mbblock_error_c;
+ cpi->rtcd.encodemb.mbuverr = vp9_mbuverror_c;*/
+ cpi->rtcd.encodemb.subb = vp9_subtract_b_neon;
+ cpi->rtcd.encodemb.submby = vp9_subtract_mby_neon;
+ cpi->rtcd.encodemb.submbuv = vp9_subtract_mbuv_neon;
+
+ /*cpi->rtcd.quantize.quantb = vp8_regular_quantize_b;
+ cpi->rtcd.quantize.quantb_pair = vp8_regular_quantize_b_pair;*/
+ cpi->rtcd.quantize.fastquantb = vp8_fast_quantize_b_neon;
+ cpi->rtcd.quantize.fastquantb_pair = vp8_fast_quantize_b_pair_neon;
+ }
+#endif
+
+#if HAVE_ARMV7
+#if CONFIG_RUNTIME_CPU_DETECT
+ if (flags & HAS_NEON)
+#endif
+ {
+ vp9_yv12_copy_partial_frame_ptr = vpxyv12_copy_partial_frame_neon;
+ }
+#endif
+#endif
+}
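
vp9_arch_arm_encoder_init follows the usual run-time CPU detection pattern: the rtcd table starts out pointing at the portable C kernels and individual entries are overwritten only when the detected feature flags (HAS_MEDIA, HAS_NEON) permit it. A stripped-down sketch of the same pattern, using made-up names rather than the real rtcd layout:

/* Sketch of the runtime-dispatch pattern used above; the table layout,
 * flag value and function names here are illustrative only. */
typedef unsigned int (*sad_fn)(const unsigned char *src, int src_stride,
                               const unsigned char *ref, int ref_stride);

struct rtcd_sketch { sad_fn sad16x16; };

#define SKETCH_HAS_NEON 0x01   /* stand-in for the real HAS_NEON bit */

unsigned int sad16x16_c(const unsigned char *s, int ss,
                        const unsigned char *r, int rs);
unsigned int sad16x16_neon(const unsigned char *s, int ss,
                           const unsigned char *r, int rs);

static void init_rtcd_sketch(struct rtcd_sketch *rtcd, int cpu_flags) {
  rtcd->sad16x16 = sad16x16_c;          /* portable default */
  if (cpu_flags & SKETCH_HAS_NEON)
    rtcd->sad16x16 = sad16x16_neon;     /* override when NEON is available */
}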
--- /dev/null
+++ b/vp9/encoder/arm/armv5te/boolhuff_armv5te.asm
@@ -1,0 +1,286 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+ EXPORT |vp8_start_encode|
+ EXPORT |vp9_encode_bool|
+ EXPORT |vp8_stop_encode|
+ EXPORT |vp8_encode_value|
+
+ INCLUDE asm_enc_offsets.asm
+
+ ARM
+ REQUIRE8
+ PRESERVE8
+
+ AREA |.text|, CODE, READONLY
+
+; r0 BOOL_CODER *br
+; r1 unsigned char *source
+
+|vp8_start_encode| PROC
+ mov r12, #0
+ mov r3, #255
+ mvn r2, #23
+ str r12, [r0, #vp9_writer_lowvalue]
+ str r3, [r0, #vp9_writer_range]
+ str r12, [r0, #vp9_writer_value]
+ str r2, [r0, #vp9_writer_count]
+ str r12, [r0, #vp9_writer_pos]
+ str r1, [r0, #vp9_writer_buffer]
+ bx lr
+ ENDP
+
+; r0 BOOL_CODER *br
+; r1 int bit
+; r2 int probability
+|vp9_encode_bool| PROC
+ push {r4-r9, lr}
+
+ mov r4, r2
+
+ ldr r2, [r0, #vp9_writer_lowvalue]
+ ldr r5, [r0, #vp9_writer_range]
+ ldr r3, [r0, #vp9_writer_count]
+
+ sub r7, r5, #1 ; range-1
+
+ cmp r1, #0
+ mul r6, r4, r7 ; ((range-1) * probability)
+
+ mov r7, #1
+ add r4, r7, r6, lsr #8 ; 1 + (((range-1) * probability) >> 8)
+
+ addne r2, r2, r4 ; if (bit) lowvalue += split
+ subne r4, r5, r4 ; if (bit) range = range-split
+
+ ; Counting the leading zeros is used to normalize range.
+ clz r6, r4
+ sub r6, r6, #24 ; shift
+
+ ; Flag is set on the sum of count. This flag is used later
+ ; to determine if count >= 0
+ adds r3, r3, r6 ; count += shift
+ lsl r5, r4, r6 ; range <<= shift
+ bmi token_count_lt_zero ; if(count >= 0)
+
+ sub r6, r6, r3 ; offset = shift - count
+ sub r4, r6, #1 ; offset-1
+ lsls r4, r2, r4 ; if((lowvalue<<(offset-1)) & 0x80000000 )
+ bpl token_high_bit_not_set
+
+ ldr r4, [r0, #vp9_writer_pos] ; x
+ sub r4, r4, #1 ; x = w->pos-1
+ b token_zero_while_start
+token_zero_while_loop
+ mov r9, #0
+ strb r9, [r7, r4] ; w->buffer[x] =(unsigned char)0
+ sub r4, r4, #1 ; x--
+token_zero_while_start
+ cmp r4, #0
+ ldrge r7, [r0, #vp9_writer_buffer]
+ ldrb r1, [r7, r4]
+ cmpge r1, #0xff
+ beq token_zero_while_loop
+
+ ldr r7, [r0, #vp9_writer_buffer]
+ ldrb r9, [r7, r4] ; w->buffer[x]
+ add r9, r9, #1
+ strb r9, [r7, r4] ; w->buffer[x] + 1
+token_high_bit_not_set
+ rsb r4, r6, #24 ; 24-offset
+ ldr r9, [r0, #vp9_writer_buffer]
+ lsr r7, r2, r4 ; lowvalue >> (24-offset)
+ ldr r4, [r0, #vp9_writer_pos] ; w->pos
+ lsl r2, r2, r6 ; lowvalue <<= offset
+ mov r6, r3 ; shift = count
+ add r1, r4, #1 ; w->pos++
+ bic r2, r2, #0xff000000 ; lowvalue &= 0xffffff
+ str r1, [r0, #vp9_writer_pos]
+ sub r3, r3, #8 ; count -= 8
+ strb r7, [r9, r4] ; w->buffer[w->pos++]
+
+token_count_lt_zero
+ lsl r2, r2, r6 ; lowvalue <<= shift
+
+ str r2, [r0, #vp9_writer_lowvalue]
+ str r5, [r0, #vp9_writer_range]
+ str r3, [r0, #vp9_writer_count]
+ pop {r4-r9, pc}
+ ENDP
+
+; r0 BOOL_CODER *br
+|vp8_stop_encode| PROC
+ push {r4-r10, lr}
+
+ ldr r2, [r0, #vp9_writer_lowvalue]
+ ldr r5, [r0, #vp9_writer_range]
+ ldr r3, [r0, #vp9_writer_count]
+
+ mov r10, #32
+
+stop_encode_loop
+ sub r7, r5, #1 ; range-1
+
+ mov r4, r7, lsl #7 ; ((range-1) * 128)
+
+ mov r7, #1
+ add r4, r7, r4, lsr #8 ; 1 + (((range-1) * 128) >> 8)
+
+ ; Counting the leading zeros is used to normalize range.
+ clz r6, r4
+ sub r6, r6, #24 ; shift
+
+ ; Flag is set on the sum of count. This flag is used later
+ ; to determine if count >= 0
+ adds r3, r3, r6 ; count += shift
+ lsl r5, r4, r6 ; range <<= shift
+ bmi token_count_lt_zero_se ; if(count >= 0)
+
+ sub r6, r6, r3 ; offset = shift - count
+ sub r4, r6, #1 ; offset-1
+ lsls r4, r2, r4 ; if((lowvalue<<(offset-1)) & 0x80000000 )
+ bpl token_high_bit_not_set_se
+
+ ldr r4, [r0, #vp9_writer_pos] ; x
+ sub r4, r4, #1 ; x = w->pos-1
+ b token_zero_while_start_se
+token_zero_while_loop_se
+ mov r9, #0
+ strb r9, [r7, r4] ; w->buffer[x] =(unsigned char)0
+ sub r4, r4, #1 ; x--
+token_zero_while_start_se
+ cmp r4, #0
+ ldrge r7, [r0, #vp9_writer_buffer]
+ ldrb r1, [r7, r4]
+ cmpge r1, #0xff
+ beq token_zero_while_loop_se
+
+ ldr r7, [r0, #vp9_writer_buffer]
+ ldrb r9, [r7, r4] ; w->buffer[x]
+ add r9, r9, #1
+ strb r9, [r7, r4] ; w->buffer[x] + 1
+token_high_bit_not_set_se
+ rsb r4, r6, #24 ; 24-offset
+ ldr r9, [r0, #vp9_writer_buffer]
+ lsr r7, r2, r4 ; lowvalue >> (24-offset)
+ ldr r4, [r0, #vp9_writer_pos] ; w->pos
+ lsl r2, r2, r6 ; lowvalue <<= offset
+ mov r6, r3 ; shift = count
+ add r1, r4, #1 ; w->pos++
+ bic r2, r2, #0xff000000 ; lowvalue &= 0xffffff
+ str r1, [r0, #vp9_writer_pos]
+ sub r3, r3, #8 ; count -= 8
+ strb r7, [r9, r4] ; w->buffer[w->pos++]
+
+token_count_lt_zero_se
+ lsl r2, r2, r6 ; lowvalue <<= shift
+
+ subs r10, r10, #1
+ bne stop_encode_loop
+
+ str r2, [r0, #vp9_writer_lowvalue]
+ str r5, [r0, #vp9_writer_range]
+ str r3, [r0, #vp9_writer_count]
+ pop {r4-r10, pc}
+
+ ENDP
+
+; r0 BOOL_CODER *br
+; r1 int data
+; r2 int bits
+|vp8_encode_value| PROC
+ push {r4-r11, lr}
+
+ mov r10, r2
+
+ ldr r2, [r0, #vp9_writer_lowvalue]
+ ldr r5, [r0, #vp9_writer_range]
+ ldr r3, [r0, #vp9_writer_count]
+
+ rsb r4, r10, #32 ; 32-n
+
+ ; v is kept in r1 during the token pack loop
+ lsl r1, r1, r4 ; r1 = v << 32 - n
+
+encode_value_loop
+ sub r7, r5, #1 ; range-1
+
+ ; Decisions are made based on the bit value shifted
+ ; off of v, so set a flag here based on this.
+    ; This value is referred to as "bb"
+ lsls r1, r1, #1 ; bit = v >> n
+ mov r4, r7, lsl #7 ; ((range-1) * 128)
+
+ mov r7, #1
+ add r4, r7, r4, lsr #8 ; 1 + (((range-1) * 128) >> 8)
+
+ addcs r2, r2, r4 ; if (bit) lowvalue += split
+ subcs r4, r5, r4 ; if (bit) range = range-split
+
+ ; Counting the leading zeros is used to normalize range.
+ clz r6, r4
+ sub r6, r6, #24 ; shift
+
+ ; Flag is set on the sum of count. This flag is used later
+ ; to determine if count >= 0
+ adds r3, r3, r6 ; count += shift
+ lsl r5, r4, r6 ; range <<= shift
+ bmi token_count_lt_zero_ev ; if(count >= 0)
+
+ sub r6, r6, r3 ; offset = shift - count
+ sub r4, r6, #1 ; offset-1
+ lsls r4, r2, r4 ; if((lowvalue<<(offset-1)) & 0x80000000 )
+ bpl token_high_bit_not_set_ev
+
+ ldr r4, [r0, #vp9_writer_pos] ; x
+ sub r4, r4, #1 ; x = w->pos-1
+ b token_zero_while_start_ev
+token_zero_while_loop_ev
+ mov r9, #0
+ strb r9, [r7, r4] ; w->buffer[x] =(unsigned char)0
+ sub r4, r4, #1 ; x--
+token_zero_while_start_ev
+ cmp r4, #0
+ ldrge r7, [r0, #vp9_writer_buffer]
+ ldrb r11, [r7, r4]
+ cmpge r11, #0xff
+ beq token_zero_while_loop_ev
+
+ ldr r7, [r0, #vp9_writer_buffer]
+ ldrb r9, [r7, r4] ; w->buffer[x]
+ add r9, r9, #1
+ strb r9, [r7, r4] ; w->buffer[x] + 1
+token_high_bit_not_set_ev
+ rsb r4, r6, #24 ; 24-offset
+ ldr r9, [r0, #vp9_writer_buffer]
+ lsr r7, r2, r4 ; lowvalue >> (24-offset)
+ ldr r4, [r0, #vp9_writer_pos] ; w->pos
+ lsl r2, r2, r6 ; lowvalue <<= offset
+ mov r6, r3 ; shift = count
+ add r11, r4, #1 ; w->pos++
+ bic r2, r2, #0xff000000 ; lowvalue &= 0xffffff
+ str r11, [r0, #vp9_writer_pos]
+ sub r3, r3, #8 ; count -= 8
+ strb r7, [r9, r4] ; w->buffer[w->pos++]
+
+token_count_lt_zero_ev
+ lsl r2, r2, r6 ; lowvalue <<= shift
+
+ subs r10, r10, #1
+ bne encode_value_loop
+
+ str r2, [r0, #vp9_writer_lowvalue]
+ str r5, [r0, #vp9_writer_range]
+ str r3, [r0, #vp9_writer_count]
+ pop {r4-r11, pc}
+ ENDP
+
+ END
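
The routine above is a register-level translation of the C boolean encoder: each bit splits the current range in proportion to its probability, the low value and range move to the chosen side of the split, and the clz-based block renormalises, propagating a carry back through any 0xff bytes before a byte is emitted. A C sketch that follows the assembly's own comments; the writer fields are simplified relative to the real vp9_writer.

struct bool_writer_sketch {
  unsigned int lowvalue;
  unsigned int range;     /* kept in 128..255 after renormalisation */
  int count;              /* starts at -24; a byte is flushed once it becomes non-negative */
  unsigned int pos;
  unsigned char *buffer;
};

/* shift = clz(range) - 24, i.e. how far an 8-bit range must move left. */
static int norm_shift(unsigned int range) {
  int shift = 0;
  while (range < 128) { range <<= 1; shift++; }
  return shift;
}

static void encode_bool_sketch(struct bool_writer_sketch *w, int bit, int prob) {
  unsigned int lowvalue = w->lowvalue;
  unsigned int range = w->range;
  int count = w->count;
  unsigned int split = 1 + (((range - 1) * (unsigned int)prob) >> 8);
  int shift;

  if (bit) {
    lowvalue += split;          /* if (bit) lowvalue += split  */
    range -= split;             /* if (bit) range = range-split */
  } else {
    range = split;
  }

  shift = norm_shift(range);
  range <<= shift;              /* range <<= shift */
  count += shift;               /* count += shift  */

  if (count >= 0) {             /* eight bits accumulated: write one byte */
    const int offset = shift - count;

    if ((lowvalue << (offset - 1)) & 0x80000000) {  /* carry out of the top bit */
      int x = (int)w->pos - 1;
      while (x >= 0 && w->buffer[x] == 0xff)
        w->buffer[x--] = 0;     /* w->buffer[x] = 0, x-- */
      w->buffer[x] += 1;        /* w->buffer[x] + 1 */
    }

    w->buffer[w->pos++] = (unsigned char)(lowvalue >> (24 - offset));
    lowvalue <<= offset;
    shift = count;
    lowvalue &= 0xffffff;       /* lowvalue &= 0xffffff */
    count -= 8;                 /* count -= 8 */
  }

  lowvalue <<= shift;           /* lowvalue <<= shift */

  w->lowvalue = lowvalue;
  w->range = range;
  w->count = count;
}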
--- /dev/null
+++ b/vp9/encoder/arm/armv5te/vp8_packtokens_armv5.asm
@@ -1,0 +1,291 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+ EXPORT |vp8cx_pack_tokens_armv5|
+
+ INCLUDE asm_enc_offsets.asm
+
+ ARM
+ REQUIRE8
+ PRESERVE8
+
+ AREA |.text|, CODE, READONLY
+
+; r0 vp9_writer *w
+; r1 const TOKENEXTRA *p
+; r2 int xcount
+; r3 vp8_coef_encodings
+; s0 vp8_extra_bits
+; s1 vp8_coef_tree
+|vp8cx_pack_tokens_armv5| PROC
+ push {r4-r11, lr}
+
+ ; Add size of xcount * sizeof (TOKENEXTRA) to get stop
+ ; sizeof (TOKENEXTRA) is 8
+ sub sp, sp, #12
+ add r2, r1, r2, lsl #3 ; stop = p + xcount*sizeof(TOKENEXTRA)
+ str r2, [sp, #0]
+ str r3, [sp, #8] ; save vp8_coef_encodings
+ ldr r2, [r0, #vp9_writer_lowvalue]
+ ldr r5, [r0, #vp9_writer_range]
+ ldr r3, [r0, #vp9_writer_count]
+ b check_p_lt_stop
+
+while_p_lt_stop
+ ldrb r6, [r1, #tokenextra_token] ; t
+ ldr r4, [sp, #8] ; vp8_coef_encodings
+ mov lr, #0
+ add r4, r4, r6, lsl #3 ; a = vp8_coef_encodings + t
+ ldr r9, [r1, #tokenextra_context_tree] ; pp
+
+ ldrb r7, [r1, #tokenextra_skip_eob_node]
+
+ ldr r6, [r4, #vp9_token_value] ; v
+ ldr r8, [r4, #vp9_token_len] ; n
+
+ ; vp8 specific skip_eob_node
+ cmp r7, #0
+ movne lr, #2 ; i = 2
+ subne r8, r8, #1 ; --n
+
+ rsb r4, r8, #32 ; 32-n
+ ldr r10, [sp, #52] ; vp8_coef_tree
+
+ ; v is kept in r12 during the token pack loop
+ lsl r12, r6, r4 ; r12 = v << 32 - n
+
+; loop start
+token_loop
+ ldrb r4, [r9, lr, asr #1] ; pp [i>>1]
+ sub r7, r5, #1 ; range-1
+
+ ; Decisions are made based on the bit value shifted
+ ; off of v, so set a flag here based on this.
+    ; This value is referred to as "bb"
+ lsls r12, r12, #1 ; bb = v >> n
+ mul r6, r4, r7 ; ((range-1) * pp[i>>1]))
+
+ ; bb can only be 0 or 1. So only execute this statement
+ ; if bb == 1, otherwise it will act like i + 0
+ addcs lr, lr, #1 ; i + bb
+
+ mov r7, #1
+ ldrsb lr, [r10, lr] ; i = vp8_coef_tree[i+bb]
+ add r4, r7, r6, lsr #8 ; 1 + (((range-1) * pp[i>>1]) >> 8)
+
+ addcs r2, r2, r4 ; if (bb) lowvalue += split
+ subcs r4, r5, r4 ; if (bb) range = range-split
+
+ ; Counting the leading zeros is used to normalize range.
+ clz r6, r4
+ sub r6, r6, #24 ; shift
+
+ ; Flag is set on the sum of count. This flag is used later
+ ; to determine if count >= 0
+ adds r3, r3, r6 ; count += shift
+ lsl r5, r4, r6 ; range <<= shift
+ bmi token_count_lt_zero ; if(count >= 0)
+
+ sub r6, r6, r3 ; offset = shift - count
+ sub r4, r6, #1 ; offset-1
+ lsls r4, r2, r4 ; if((lowvalue<<(offset-1)) & 0x80000000 )
+ bpl token_high_bit_not_set
+
+ ldr r4, [r0, #vp9_writer_pos] ; x
+ sub r4, r4, #1 ; x = w->pos-1
+ b token_zero_while_start
+token_zero_while_loop
+ mov r10, #0
+ strb r10, [r7, r4] ; w->buffer[x] =(unsigned char)0
+ sub r4, r4, #1 ; x--
+token_zero_while_start
+ cmp r4, #0
+ ldrge r7, [r0, #vp9_writer_buffer]
+ ldrb r11, [r7, r4]
+ cmpge r11, #0xff
+ beq token_zero_while_loop
+
+ ldr r7, [r0, #vp9_writer_buffer]
+ ldrb r10, [r7, r4] ; w->buffer[x]
+ add r10, r10, #1
+ strb r10, [r7, r4] ; w->buffer[x] + 1
+token_high_bit_not_set
+ rsb r4, r6, #24 ; 24-offset
+ ldr r10, [r0, #vp9_writer_buffer]
+ lsr r7, r2, r4 ; lowvalue >> (24-offset)
+ ldr r4, [r0, #vp9_writer_pos] ; w->pos
+ lsl r2, r2, r6 ; lowvalue <<= offset
+ mov r6, r3 ; shift = count
+ add r11, r4, #1 ; w->pos++
+ bic r2, r2, #0xff000000 ; lowvalue &= 0xffffff
+ str r11, [r0, #vp9_writer_pos]
+ sub r3, r3, #8 ; count -= 8
+ strb r7, [r10, r4] ; w->buffer[w->pos++]
+
+ ; r10 is used earlier in the loop, but r10 is used as
+ ; temp variable here. So after r10 is used, reload
+ ; vp8_coef_tree_dcd into r10
+ ldr r10, [sp, #52] ; vp8_coef_tree
+
+token_count_lt_zero
+ lsl r2, r2, r6 ; lowvalue <<= shift
+
+ subs r8, r8, #1 ; --n
+ bne token_loop
+
+ ldrb r6, [r1, #tokenextra_token] ; t
+ ldr r7, [sp, #48] ; vp8_extra_bits
+ ; Add t * sizeof (vp9_extra_bit_struct) to get the desired
+    ; element. Here sizeof (vp9_extra_bit_struct) is 16
+ add r12, r7, r6, lsl #4 ; b = vp8_extra_bits + t
+
+ ldr r4, [r12, #vp9_extra_bit_struct_base_val]
+ cmp r4, #0
+ beq skip_extra_bits
+
+; if( b->base_val)
+ ldr r8, [r12, #vp9_extra_bit_struct_len] ; L
+ ldrsh lr, [r1, #tokenextra_extra] ; e = p->Extra
+ cmp r8, #0 ; if( L)
+ beq no_extra_bits
+
+ ldr r9, [r12, #vp9_extra_bit_struct_prob]
+ asr r7, lr, #1 ; v=e>>1
+
+ ldr r10, [r12, #vp9_extra_bit_struct_tree]
+ str r10, [sp, #4] ; b->tree
+
+ rsb r4, r8, #32
+ lsl r12, r7, r4
+
+ mov lr, #0 ; i = 0
+
+extra_bits_loop
+ ldrb r4, [r9, lr, asr #1] ; pp[i>>1]
+ sub r7, r5, #1 ; range-1
+ lsls r12, r12, #1 ; v >> n
+ mul r6, r4, r7 ; (range-1) * pp[i>>1]
+ addcs lr, lr, #1 ; i + bb
+
+ mov r7, #1
+ ldrsb lr, [r10, lr] ; i = b->tree[i+bb]
+ add r4, r7, r6, lsr #8 ; split = 1 + (((range-1) * pp[i>>1]) >> 8)
+
+ addcs r2, r2, r4 ; if (bb) lowvalue += split
+ subcs r4, r5, r4 ; if (bb) range = range-split
+
+ clz r6, r4
+ sub r6, r6, #24
+
+ adds r3, r3, r6 ; count += shift
+ lsl r5, r4, r6 ; range <<= shift
+ bmi extra_count_lt_zero ; if(count >= 0)
+
+ sub r6, r6, r3 ; offset= shift - count
+ sub r4, r6, #1 ; offset-1
+ lsls r4, r2, r4 ; if((lowvalue<<(offset-1)) & 0x80000000 )
+ bpl extra_high_bit_not_set
+
+ ldr r4, [r0, #vp9_writer_pos] ; x
+ sub r4, r4, #1 ; x = w->pos - 1
+ b extra_zero_while_start
+extra_zero_while_loop
+ mov r10, #0
+ strb r10, [r7, r4] ; w->buffer[x] =(unsigned char)0
+ sub r4, r4, #1 ; x--
+extra_zero_while_start
+ cmp r4, #0
+ ldrge r7, [r0, #vp9_writer_buffer]
+ ldrb r11, [r7, r4]
+ cmpge r11, #0xff
+ beq extra_zero_while_loop
+
+ ldr r7, [r0, #vp9_writer_buffer]
+ ldrb r10, [r7, r4]
+ add r10, r10, #1
+ strb r10, [r7, r4]
+extra_high_bit_not_set
+ rsb r4, r6, #24 ; 24-offset
+ ldr r10, [r0, #vp9_writer_buffer]
+ lsr r7, r2, r4 ; lowvalue >> (24-offset)
+ ldr r4, [r0, #vp9_writer_pos]
+ lsl r2, r2, r6 ; lowvalue <<= offset
+ mov r6, r3 ; shift = count
+ add r11, r4, #1 ; w->pos++
+ bic r2, r2, #0xff000000 ; lowvalue &= 0xffffff
+ str r11, [r0, #vp9_writer_pos]
+ sub r3, r3, #8 ; count -= 8
+ strb r7, [r10, r4] ; w->buffer[w->pos++]=(lowvalue >> (24-offset))
+ ldr r10, [sp, #4] ; b->tree
+extra_count_lt_zero
+ lsl r2, r2, r6
+
+ subs r8, r8, #1 ; --n
+ bne extra_bits_loop ; while (n)
+
+no_extra_bits
+ ldr lr, [r1, #4] ; e = p->Extra
+ add r4, r5, #1 ; range + 1
+ tst lr, #1
+ lsr r4, r4, #1 ; split = (range + 1) >> 1
+ addne r2, r2, r4 ; lowvalue += split
+ subne r4, r5, r4 ; range = range-split
+ tst r2, #0x80000000 ; lowvalue & 0x80000000
+ lsl r5, r4, #1 ; range <<= 1
+ beq end_high_bit_not_set
+
+ ldr r4, [r0, #vp9_writer_pos]
+ mov r7, #0
+ sub r4, r4, #1
+ b end_zero_while_start
+end_zero_while_loop
+ strb r7, [r6, r4]
+ sub r4, r4, #1 ; x--
+end_zero_while_start
+ cmp r4, #0
+ ldrge r6, [r0, #vp9_writer_buffer]
+ ldrb r12, [r6, r4]
+ cmpge r12, #0xff
+ beq end_zero_while_loop
+
+ ldr r6, [r0, #vp9_writer_buffer]
+ ldrb r7, [r6, r4]
+ add r7, r7, #1
+ strb r7, [r6, r4]
+end_high_bit_not_set
+ adds r3, r3, #1 ; ++count
+ lsl r2, r2, #1 ; lowvalue <<= 1
+ bne end_count_zero
+
+ ldr r4, [r0, #vp9_writer_pos]
+ mvn r3, #7
+ ldr r7, [r0, #vp9_writer_buffer]
+ lsr r6, r2, #24 ; lowvalue >> 24
+ add r12, r4, #1 ; w->pos++
+ bic r2, r2, #0xff000000 ; lowvalue &= 0xffffff
+ str r12, [r0, #0x10]
+ strb r6, [r7, r4]
+end_count_zero
+skip_extra_bits
+ add r1, r1, #TOKENEXTRA_SZ ; ++p
+check_p_lt_stop
+ ldr r4, [sp, #0] ; stop
+ cmp r1, r4 ; while( p < stop)
+ bcc while_p_lt_stop
+
+ str r2, [r0, #vp9_writer_lowvalue]
+ str r5, [r0, #vp9_writer_range]
+ str r3, [r0, #vp9_writer_count]
+ add sp, sp, #12
+ pop {r4-r11, pc}
+ ENDP
+
+ END
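
vp8cx_pack_tokens_armv5 writes each token by walking the coefficient tree: at every node one bool is encoded with probability pp[i >> 1], the next node index comes from the tree entry selected by the bit just shifted out of the token value, and the walk runs for the token's bit length (starting at node 2 with one fewer bit when skip_eob_node is set). A compact C sketch of that inner loop; emit_bool stands in for the boolean encoder and is not a real libvpx entry point.

typedef signed char tree_index;

/* Sketch of the token-pack inner loop above: write the n-bit value v by
 * descending the coefficient tree, one bool per level. */
static void pack_value_sketch(void (*emit_bool)(int bit, int prob),
                              const tree_index *tree,
                              const unsigned char *pp,
                              unsigned int v, int n, int i) {
  do {
    const int bit = (v >> --n) & 1;   /* bits are consumed MSB first    */
    emit_bool(bit, pp[i >> 1]);       /* node probability: pp[i >> 1]   */
    i = tree[i + bit];                /* i = vp8_coef_tree[i + bb]      */
  } while (n);
}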
--- /dev/null
+++ b/vp9/encoder/arm/armv5te/vp8_packtokens_mbrow_armv5.asm
@@ -1,0 +1,327 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+ EXPORT |vp8cx_pack_mb_row_tokens_armv5|
+
+ INCLUDE asm_enc_offsets.asm
+
+ ARM
+ REQUIRE8
+ PRESERVE8
+
+ AREA |.text|, CODE, READONLY
+
+; r0 VP8_COMP *cpi
+; r1 vp9_writer *w
+; r2 vp8_coef_encodings
+; r3 vp8_extra_bits
+; s0 vp8_coef_tree
+
+|vp8cx_pack_mb_row_tokens_armv5| PROC
+ push {r4-r11, lr}
+ sub sp, sp, #24
+
+ ; Compute address of cpi->common.mb_rows
+ ldr r4, _VP8_COMP_common_
+ ldr r6, _VP8_COMMON_MBrows_
+ add r4, r0, r4
+
+ ldr r5, [r4, r6] ; load up mb_rows
+
+ str r2, [sp, #20] ; save vp8_coef_encodings
+ str r5, [sp, #12] ; save mb_rows
+ str r3, [sp, #8] ; save vp8_extra_bits
+
+ ldr r4, _VP8_COMP_tplist_
+ add r4, r0, r4
+ ldr r7, [r4, #0] ; dereference cpi->tp_list
+
+ mov r0, r1 ; keep same as other loops
+
+ ldr r2, [r0, #vp9_writer_lowvalue]
+ ldr r5, [r0, #vp9_writer_range]
+ ldr r3, [r0, #vp9_writer_count]
+
+mb_row_loop
+
+ ldr r1, [r7, #tokenlist_start]
+ ldr r9, [r7, #tokenlist_stop]
+ str r9, [sp, #0] ; save stop for later comparison
+ str r7, [sp, #16] ; tokenlist address for next time
+
+ b check_p_lt_stop
+
+    ; actual work gets done here!
+
+while_p_lt_stop
+ ldrb r6, [r1, #tokenextra_token] ; t
+ ldr r4, [sp, #20] ; vp8_coef_encodings
+ mov lr, #0
+ add r4, r4, r6, lsl #3 ; a = vp8_coef_encodings + t
+ ldr r9, [r1, #tokenextra_context_tree] ; pp
+
+ ldrb r7, [r1, #tokenextra_skip_eob_node]
+
+ ldr r6, [r4, #vp9_token_value] ; v
+ ldr r8, [r4, #vp9_token_len] ; n
+
+ ; vp8 specific skip_eob_node
+ cmp r7, #0
+ movne lr, #2 ; i = 2
+ subne r8, r8, #1 ; --n
+
+ rsb r4, r8, #32 ; 32-n
+ ldr r10, [sp, #60] ; vp8_coef_tree
+
+ ; v is kept in r12 during the token pack loop
+ lsl r12, r6, r4 ; r12 = v << 32 - n
+
+; loop start
+token_loop
+ ldrb r4, [r9, lr, asr #1] ; pp [i>>1]
+ sub r7, r5, #1 ; range-1
+
+ ; Decisions are made based on the bit value shifted
+ ; off of v, so set a flag here based on this.
+    ; This value is referred to as "bb"
+ lsls r12, r12, #1 ; bb = v >> n
+ mul r6, r4, r7 ; ((range-1) * pp[i>>1]))
+
+ ; bb can only be 0 or 1. So only execute this statement
+ ; if bb == 1, otherwise it will act like i + 0
+ addcs lr, lr, #1 ; i + bb
+
+ mov r7, #1
+ ldrsb lr, [r10, lr] ; i = vp8_coef_tree[i+bb]
+ add r4, r7, r6, lsr #8 ; 1 + (((range-1) * pp[i>>1]) >> 8)
+
+ addcs r2, r2, r4 ; if (bb) lowvalue += split
+ subcs r4, r5, r4 ; if (bb) range = range-split
+
+ ; Counting the leading zeros is used to normalize range.
+ clz r6, r4
+ sub r6, r6, #24 ; shift
+
+ ; Flag is set on the sum of count. This flag is used later
+ ; to determine if count >= 0
+ adds r3, r3, r6 ; count += shift
+ lsl r5, r4, r6 ; range <<= shift
+ bmi token_count_lt_zero ; if(count >= 0)
+
+ sub r6, r6, r3 ; offset = shift - count
+ sub r4, r6, #1 ; offset-1
+ lsls r4, r2, r4 ; if((lowvalue<<(offset-1)) & 0x80000000 )
+ bpl token_high_bit_not_set
+
+ ldr r4, [r0, #vp9_writer_pos] ; x
+ sub r4, r4, #1 ; x = w->pos-1
+ b token_zero_while_start
+token_zero_while_loop
+ mov r10, #0
+ strb r10, [r7, r4] ; w->buffer[x] =(unsigned char)0
+ sub r4, r4, #1 ; x--
+token_zero_while_start
+ cmp r4, #0
+ ldrge r7, [r0, #vp9_writer_buffer]
+ ldrb r11, [r7, r4]
+ cmpge r11, #0xff
+ beq token_zero_while_loop
+
+ ldr r7, [r0, #vp9_writer_buffer]
+ ldrb r10, [r7, r4] ; w->buffer[x]
+ add r10, r10, #1
+ strb r10, [r7, r4] ; w->buffer[x] + 1
+token_high_bit_not_set
+ rsb r4, r6, #24 ; 24-offset
+ ldr r10, [r0, #vp9_writer_buffer]
+ lsr r7, r2, r4 ; lowvalue >> (24-offset)
+ ldr r4, [r0, #vp9_writer_pos] ; w->pos
+ lsl r2, r2, r6 ; lowvalue <<= offset
+ mov r6, r3 ; shift = count
+ add r11, r4, #1 ; w->pos++
+ bic r2, r2, #0xff000000 ; lowvalue &= 0xffffff
+ str r11, [r0, #vp9_writer_pos]
+ sub r3, r3, #8 ; count -= 8
+ strb r7, [r10, r4] ; w->buffer[w->pos++]
+
+ ; r10 is used earlier in the loop, but r10 is used as
+ ; temp variable here. So after r10 is used, reload
+ ; vp8_coef_tree_dcd into r10
+ ldr r10, [sp, #60] ; vp8_coef_tree
+
+token_count_lt_zero
+ lsl r2, r2, r6 ; lowvalue <<= shift
+
+ subs r8, r8, #1 ; --n
+ bne token_loop
+
+ ldrb r6, [r1, #tokenextra_token] ; t
+ ldr r7, [sp, #8] ; vp8_extra_bits
+ ; Add t * sizeof (vp9_extra_bit_struct) to get the desired
+    ; element. Here sizeof (vp9_extra_bit_struct) is 16
+ add r12, r7, r6, lsl #4 ; b = vp8_extra_bits + t
+
+ ldr r4, [r12, #vp9_extra_bit_struct_base_val]
+ cmp r4, #0
+ beq skip_extra_bits
+
+; if( b->base_val)
+ ldr r8, [r12, #vp9_extra_bit_struct_len] ; L
+ ldrsh lr, [r1, #tokenextra_extra] ; e = p->Extra
+ cmp r8, #0 ; if( L)
+ beq no_extra_bits
+
+ ldr r9, [r12, #vp9_extra_bit_struct_prob]
+ asr r7, lr, #1 ; v=e>>1
+
+ ldr r10, [r12, #vp9_extra_bit_struct_tree]
+ str r10, [sp, #4] ; b->tree
+
+ rsb r4, r8, #32
+ lsl r12, r7, r4
+
+ mov lr, #0 ; i = 0
+
+extra_bits_loop
+ ldrb r4, [r9, lr, asr #1] ; pp[i>>1]
+ sub r7, r5, #1 ; range-1
+ lsls r12, r12, #1 ; v >> n
+ mul r6, r4, r7 ; (range-1) * pp[i>>1]
+ addcs lr, lr, #1 ; i + bb
+
+ mov r7, #1
+ ldrsb lr, [r10, lr] ; i = b->tree[i+bb]
+ add r4, r7, r6, lsr #8 ; split = 1 + (((range-1) * pp[i>>1]) >> 8)
+
+ addcs r2, r2, r4 ; if (bb) lowvalue += split
+ subcs r4, r5, r4 ; if (bb) range = range-split
+
+ clz r6, r4
+ sub r6, r6, #24
+
+ adds r3, r3, r6 ; count += shift
+ lsl r5, r4, r6 ; range <<= shift
+ bmi extra_count_lt_zero ; if(count >= 0)
+
+ sub r6, r6, r3 ; offset= shift - count
+ sub r4, r6, #1 ; offset-1
+ lsls r4, r2, r4 ; if((lowvalue<<(offset-1)) & 0x80000000 )
+ bpl extra_high_bit_not_set
+
+ ldr r4, [r0, #vp9_writer_pos] ; x
+ sub r4, r4, #1 ; x = w->pos - 1
+ b extra_zero_while_start
+extra_zero_while_loop
+ mov r10, #0
+ strb r10, [r7, r4] ; w->buffer[x] =(unsigned char)0
+ sub r4, r4, #1 ; x--
+extra_zero_while_start
+ cmp r4, #0
+ ldrge r7, [r0, #vp9_writer_buffer]
+ ldrb r11, [r7, r4]
+ cmpge r11, #0xff
+ beq extra_zero_while_loop
+
+ ldr r7, [r0, #vp9_writer_buffer]
+ ldrb r10, [r7, r4]
+ add r10, r10, #1
+ strb r10, [r7, r4]
+extra_high_bit_not_set
+ rsb r4, r6, #24 ; 24-offset
+ ldr r10, [r0, #vp9_writer_buffer]
+ lsr r7, r2, r4 ; lowvalue >> (24-offset)
+ ldr r4, [r0, #vp9_writer_pos]
+ lsl r2, r2, r6 ; lowvalue <<= offset
+ mov r6, r3 ; shift = count
+ add r11, r4, #1 ; w->pos++
+ bic r2, r2, #0xff000000 ; lowvalue &= 0xffffff
+ str r11, [r0, #vp9_writer_pos]
+ sub r3, r3, #8 ; count -= 8
+ strb r7, [r10, r4] ; w->buffer[w->pos++]=(lowvalue >> (24-offset))
+ ldr r10, [sp, #4] ; b->tree
+extra_count_lt_zero
+ lsl r2, r2, r6
+
+ subs r8, r8, #1 ; --n
+ bne extra_bits_loop ; while (n)
+
+no_extra_bits
+ ldr lr, [r1, #4] ; e = p->Extra
+ add r4, r5, #1 ; range + 1
+ tst lr, #1
+ lsr r4, r4, #1 ; split = (range + 1) >> 1
+ addne r2, r2, r4 ; lowvalue += split
+ subne r4, r5, r4 ; range = range-split
+ tst r2, #0x80000000 ; lowvalue & 0x80000000
+ lsl r5, r4, #1 ; range <<= 1
+ beq end_high_bit_not_set
+
+ ldr r4, [r0, #vp9_writer_pos]
+ mov r7, #0
+ sub r4, r4, #1
+ b end_zero_while_start
+end_zero_while_loop
+ strb r7, [r6, r4]
+ sub r4, r4, #1 ; x--
+end_zero_while_start
+ cmp r4, #0
+ ldrge r6, [r0, #vp9_writer_buffer]
+ ldrb r12, [r6, r4]
+ cmpge r12, #0xff
+ beq end_zero_while_loop
+
+ ldr r6, [r0, #vp9_writer_buffer]
+ ldrb r7, [r6, r4]
+ add r7, r7, #1
+ strb r7, [r6, r4]
+end_high_bit_not_set
+ adds r3, r3, #1 ; ++count
+ lsl r2, r2, #1 ; lowvalue <<= 1
+ bne end_count_zero
+
+ ldr r4, [r0, #vp9_writer_pos]
+ mvn r3, #7
+ ldr r7, [r0, #vp9_writer_buffer]
+ lsr r6, r2, #24 ; lowvalue >> 24
+ add r12, r4, #1 ; w->pos++
+ bic r2, r2, #0xff000000 ; lowvalue &= 0xffffff
+ str r12, [r0, #0x10]
+ strb r6, [r7, r4]
+end_count_zero
+skip_extra_bits
+ add r1, r1, #TOKENEXTRA_SZ ; ++p
+check_p_lt_stop
+ ldr r4, [sp, #0] ; stop
+ cmp r1, r4 ; while( p < stop)
+ bcc while_p_lt_stop
+
+ ldr r6, [sp, #12] ; mb_rows
+ ldr r7, [sp, #16] ; tokenlist address
+ subs r6, r6, #1
+ add r7, r7, #TOKENLIST_SZ ; next element in the array
+ str r6, [sp, #12]
+ bne mb_row_loop
+
+ str r2, [r0, #vp9_writer_lowvalue]
+ str r5, [r0, #vp9_writer_range]
+ str r3, [r0, #vp9_writer_count]
+ add sp, sp, #24
+ pop {r4-r11, pc}
+ ENDP
+
+_VP8_COMP_common_
+ DCD vp8_comp_common
+_VP8_COMMON_MBrows_
+ DCD vp8_common_mb_rows
+_VP8_COMP_tplist_
+ DCD vp8_comp_tplist
+
+ END
--- /dev/null
+++ b/vp9/encoder/arm/armv5te/vp8_packtokens_partitions_armv5.asm
@@ -1,0 +1,465 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+ EXPORT |vp8cx_pack_tokens_into_partitions_armv5|
+
+ INCLUDE asm_enc_offsets.asm
+
+ ARM
+ REQUIRE8
+ PRESERVE8
+
+ AREA |.text|, CODE, READONLY
+
+; r0 VP8_COMP *cpi
+; r1 unsigned char *cx_data
+; r2 int num_part
+; r3 *size
+; s0 vp8_coef_encodings
+; s1 vp8_extra_bits,
+; s2 const vp9_tree_index *,
+
+|vp8cx_pack_tokens_into_partitions_armv5| PROC
+ push {r4-r11, lr}
+ sub sp, sp, #44
+
+ ; Compute address of cpi->common.mb_rows
+ ldr r4, _VP8_COMP_common_
+ ldr r6, _VP8_COMMON_MBrows_
+ add r4, r0, r4
+
+ ldr r5, [r4, r6] ; load up mb_rows
+
+ str r5, [sp, #36] ; save mb_rows
+ str r1, [sp, #24] ; save cx_data
+ str r2, [sp, #20] ; save num_part
+ str r3, [sp, #8] ; save *size
+
+ ; *size = 3*(num_part -1 );
+ sub r2, r2, #1 ; num_part - 1
+ add r2, r2, r2, lsl #1 ; 3*(num_part - 1)
+ str r2, [r3]
+
+ add r2, r2, r1 ; cx_data + *size
+ str r2, [sp, #40] ; ptr
+
+ ldr r4, _VP8_COMP_tplist_
+ add r4, r0, r4
+ ldr r7, [r4, #0] ; dereference cpi->tp_list
+ str r7, [sp, #32] ; store start of cpi->tp_list
+
+ ldr r11, _VP8_COMP_bc2_ ; load up vp9_writer out of cpi
+ add r0, r0, r11
+
+ mov r11, #0
+ str r11, [sp, #28] ; i
+
+numparts_loop
+ ldr r10, [sp, #40] ; ptr
+ ldr r5, [sp, #36] ; move mb_rows to the counting section
+ sub r5, r5, r11 ; move start point with each partition
+ ; mb_rows starts at i
+ str r5, [sp, #12]
+
+ ; Reset all of the VP8 Writer data for each partition that
+ ; is processed.
+ ; start_encode
+ mov r2, #0 ; vp9_writer_lowvalue
+ mov r5, #255 ; vp9_writer_range
+ mvn r3, #23 ; vp9_writer_count
+
+ str r2, [r0, #vp9_writer_value]
+ str r2, [r0, #vp9_writer_pos]
+ str r10, [r0, #vp9_writer_buffer]
+
+mb_row_loop
+
+ ldr r1, [r7, #tokenlist_start]
+ ldr r9, [r7, #tokenlist_stop]
+ str r9, [sp, #0] ; save stop for later comparison
+ str r7, [sp, #16] ; tokenlist address for next time
+
+ b check_p_lt_stop
+
+ ; actual work gets done here!
+
+while_p_lt_stop
+ ldrb r6, [r1, #tokenextra_token] ; t
+ ldr r4, [sp, #80] ; vp8_coef_encodings
+ mov lr, #0
+ add r4, r4, r6, lsl #3 ; a = vp8_coef_encodings + t
+ ldr r9, [r1, #tokenextra_context_tree] ; pp
+
+ ldrb r7, [r1, #tokenextra_skip_eob_node]
+
+ ldr r6, [r4, #vp9_token_value] ; v
+ ldr r8, [r4, #vp9_token_len] ; n
+
+ ; vp8 specific skip_eob_node
+ cmp r7, #0
+ movne lr, #2 ; i = 2
+ subne r8, r8, #1 ; --n
+
+ rsb r4, r8, #32 ; 32-n
+ ldr r10, [sp, #88] ; vp8_coef_tree
+
+ ; v is kept in r12 during the token pack loop
+ lsl r12, r6, r4 ; r12 = v << 32 - n
+
+; loop start
+token_loop
+ ldrb r4, [r9, lr, asr #1] ; pp [i>>1]
+ sub r7, r5, #1 ; range-1
+
+ ; Decisions are made based on the bit value shifted
+ ; off of v, so set a flag here based on this.
+    ; This value is referred to as "bb"
+ lsls r12, r12, #1 ; bb = v >> n
+ mul r6, r4, r7 ; ((range-1) * pp[i>>1]))
+
+ ; bb can only be 0 or 1. So only execute this statement
+ ; if bb == 1, otherwise it will act like i + 0
+ addcs lr, lr, #1 ; i + bb
+
+ mov r7, #1
+ ldrsb lr, [r10, lr] ; i = vp8_coef_tree[i+bb]
+ add r4, r7, r6, lsr #8 ; 1 + (((range-1) * pp[i>>1]) >> 8)
+
+ addcs r2, r2, r4 ; if (bb) lowvalue += split
+ subcs r4, r5, r4 ; if (bb) range = range-split
+
+ ; Counting the leading zeros is used to normalize range.
+ clz r6, r4
+ sub r6, r6, #24 ; shift
+
+ ; Flag is set on the sum of count. This flag is used later
+ ; to determine if count >= 0
+ adds r3, r3, r6 ; count += shift
+ lsl r5, r4, r6 ; range <<= shift
+ bmi token_count_lt_zero ; if(count >= 0)
+
+ sub r6, r6, r3 ; offset = shift - count
+ sub r4, r6, #1 ; offset-1
+ lsls r4, r2, r4 ; if((lowvalue<<(offset-1)) & 0x80000000 )
+ bpl token_high_bit_not_set
+
+ ldr r4, [r0, #vp9_writer_pos] ; x
+ sub r4, r4, #1 ; x = w->pos-1
+ b token_zero_while_start
+token_zero_while_loop
+ mov r10, #0
+ strb r10, [r7, r4] ; w->buffer[x] =(unsigned char)0
+ sub r4, r4, #1 ; x--
+token_zero_while_start
+ cmp r4, #0
+ ldrge r7, [r0, #vp9_writer_buffer]
+ ldrb r11, [r7, r4]
+ cmpge r11, #0xff
+ beq token_zero_while_loop
+
+ ldr r7, [r0, #vp9_writer_buffer]
+ ldrb r10, [r7, r4] ; w->buffer[x]
+ add r10, r10, #1
+ strb r10, [r7, r4] ; w->buffer[x] + 1
+token_high_bit_not_set
+ rsb r4, r6, #24 ; 24-offset
+ ldr r10, [r0, #vp9_writer_buffer]
+ lsr r7, r2, r4 ; lowvalue >> (24-offset)
+ ldr r4, [r0, #vp9_writer_pos] ; w->pos
+ lsl r2, r2, r6 ; lowvalue <<= offset
+ mov r6, r3 ; shift = count
+ add r11, r4, #1 ; w->pos++
+ bic r2, r2, #0xff000000 ; lowvalue &= 0xffffff
+ str r11, [r0, #vp9_writer_pos]
+ sub r3, r3, #8 ; count -= 8
+ strb r7, [r10, r4] ; w->buffer[w->pos++]
+
+ ; r10 is used earlier in the loop, but r10 is used as
+ ; temp variable here. So after r10 is used, reload
+ ; vp8_coef_tree_dcd into r10
+ ldr r10, [sp, #88] ; vp8_coef_tree
+
+token_count_lt_zero
+ lsl r2, r2, r6 ; lowvalue <<= shift
+
+ subs r8, r8, #1 ; --n
+ bne token_loop
+
+ ldrb r6, [r1, #tokenextra_token] ; t
+ ldr r7, [sp, #84] ; vp8_extra_bits
+ ; Add t * sizeof (vp9_extra_bit_struct) to get the desired
+    ; element. Here sizeof (vp9_extra_bit_struct) is 16
+ add r12, r7, r6, lsl #4 ; b = vp8_extra_bits + t
+
+ ldr r4, [r12, #vp9_extra_bit_struct_base_val]
+ cmp r4, #0
+ beq skip_extra_bits
+
+; if( b->base_val)
+ ldr r8, [r12, #vp9_extra_bit_struct_len] ; L
+ ldrsh lr, [r1, #tokenextra_extra] ; e = p->Extra
+ cmp r8, #0 ; if( L)
+ beq no_extra_bits
+
+ ldr r9, [r12, #vp9_extra_bit_struct_prob]
+ asr r7, lr, #1 ; v=e>>1
+
+ ldr r10, [r12, #vp9_extra_bit_struct_tree]
+ str r10, [sp, #4] ; b->tree
+
+ rsb r4, r8, #32
+ lsl r12, r7, r4
+
+ mov lr, #0 ; i = 0
+
+extra_bits_loop
+ ldrb r4, [r9, lr, asr #1] ; pp[i>>1]
+ sub r7, r5, #1 ; range-1
+ lsls r12, r12, #1 ; v >> n
+ mul r6, r4, r7 ; (range-1) * pp[i>>1]
+ addcs lr, lr, #1 ; i + bb
+
+ mov r7, #1
+ ldrsb lr, [r10, lr] ; i = b->tree[i+bb]
+ add r4, r7, r6, lsr #8 ; split = 1 + (((range-1) * pp[i>>1]) >> 8)
+
+ addcs r2, r2, r4 ; if (bb) lowvalue += split
+ subcs r4, r5, r4 ; if (bb) range = range-split
+
+ clz r6, r4
+ sub r6, r6, #24
+
+ adds r3, r3, r6 ; count += shift
+ lsl r5, r4, r6 ; range <<= shift
+ bmi extra_count_lt_zero ; if(count >= 0)
+
+ sub r6, r6, r3 ; offset= shift - count
+ sub r4, r6, #1 ; offset-1
+ lsls r4, r2, r4 ; if((lowvalue<<(offset-1)) & 0x80000000 )
+ bpl extra_high_bit_not_set
+
+ ldr r4, [r0, #vp9_writer_pos] ; x
+ sub r4, r4, #1 ; x = w->pos - 1
+ b extra_zero_while_start
+extra_zero_while_loop
+ mov r10, #0
+ strb r10, [r7, r4] ; w->buffer[x] =(unsigned char)0
+ sub r4, r4, #1 ; x--
+extra_zero_while_start
+ cmp r4, #0
+ ldrge r7, [r0, #vp9_writer_buffer]
+ ldrb r11, [r7, r4]
+ cmpge r11, #0xff
+ beq extra_zero_while_loop
+
+ ldr r7, [r0, #vp9_writer_buffer]
+ ldrb r10, [r7, r4]
+ add r10, r10, #1
+ strb r10, [r7, r4]
+extra_high_bit_not_set
+ rsb r4, r6, #24 ; 24-offset
+ ldr r10, [r0, #vp9_writer_buffer]
+ lsr r7, r2, r4 ; lowvalue >> (24-offset)
+ ldr r4, [r0, #vp9_writer_pos]
+ lsl r2, r2, r6 ; lowvalue <<= offset
+ mov r6, r3 ; shift = count
+ add r11, r4, #1 ; w->pos++
+ bic r2, r2, #0xff000000 ; lowvalue &= 0xffffff
+ str r11, [r0, #vp9_writer_pos]
+ sub r3, r3, #8 ; count -= 8
+ strb r7, [r10, r4] ; w->buffer[w->pos++]=(lowvalue >> (24-offset))
+ ldr r10, [sp, #4] ; b->tree
+extra_count_lt_zero
+ lsl r2, r2, r6
+
+ subs r8, r8, #1 ; --n
+ bne extra_bits_loop ; while (n)
+
+no_extra_bits
+ ldr lr, [r1, #4] ; e = p->Extra
+ add r4, r5, #1 ; range + 1
+ tst lr, #1
+ lsr r4, r4, #1 ; split = (range + 1) >> 1
+ addne r2, r2, r4 ; lowvalue += split
+ subne r4, r5, r4 ; range = range-split
+ tst r2, #0x80000000 ; lowvalue & 0x80000000
+ lsl r5, r4, #1 ; range <<= 1
+ beq end_high_bit_not_set
+
+ ldr r4, [r0, #vp9_writer_pos]
+ mov r7, #0
+ sub r4, r4, #1
+ b end_zero_while_start
+end_zero_while_loop
+ strb r7, [r6, r4]
+ sub r4, r4, #1 ; x--
+end_zero_while_start
+ cmp r4, #0
+ ldrge r6, [r0, #vp9_writer_buffer]
+ ldrb r12, [r6, r4]
+ cmpge r12, #0xff
+ beq end_zero_while_loop
+
+ ldr r6, [r0, #vp9_writer_buffer]
+ ldrb r7, [r6, r4]
+ add r7, r7, #1
+ strb r7, [r6, r4]
+end_high_bit_not_set
+ adds r3, r3, #1 ; ++count
+ lsl r2, r2, #1 ; lowvalue <<= 1
+ bne end_count_zero
+
+ ldr r4, [r0, #vp9_writer_pos]
+ mvn r3, #7
+ ldr r7, [r0, #vp9_writer_buffer]
+ lsr r6, r2, #24 ; lowvalue >> 24
+ add r12, r4, #1 ; w->pos++
+ bic r2, r2, #0xff000000 ; lowvalue &= 0xffffff
+ str r12, [r0, #0x10]
+ strb r6, [r7, r4]
+end_count_zero
+skip_extra_bits
+ add r1, r1, #TOKENEXTRA_SZ ; ++p
+check_p_lt_stop
+ ldr r4, [sp, #0] ; stop
+ cmp r1, r4 ; while( p < stop)
+ bcc while_p_lt_stop
+
+ ldr r10, [sp, #20] ; num_parts
+ mov r1, #TOKENLIST_SZ
+ mul r1, r10, r1
+
+ ldr r6, [sp, #12] ; mb_rows
+ ldr r7, [sp, #16] ; tokenlist address
+ subs r6, r6, r10
+ add r7, r7, r1 ; next element in the array
+ str r6, [sp, #12]
+ bgt mb_row_loop
+
+ mov r12, #32
+
+stop_encode_loop
+ sub r7, r5, #1 ; range-1
+
+ mov r4, r7, lsl #7 ; ((range-1) * 128)
+
+ mov r7, #1
+ add r4, r7, r4, lsr #8 ; 1 + (((range-1) * 128) >> 8)
+
+ ; Counting the leading zeros is used to normalize range.
+ clz r6, r4
+ sub r6, r6, #24 ; shift
+
+ ; Flag is set on the sum of count. This flag is used later
+ ; to determine if count >= 0
+ adds r3, r3, r6 ; count += shift
+ lsl r5, r4, r6 ; range <<= shift
+ bmi token_count_lt_zero_se ; if(count >= 0)
+
+ sub r6, r6, r3 ; offset = shift - count
+ sub r4, r6, #1 ; offset-1
+ lsls r4, r2, r4 ; if((lowvalue<<(offset-1)) & 0x80000000 )
+ bpl token_high_bit_not_set_se
+
+ ldr r4, [r0, #vp9_writer_pos] ; x
+ sub r4, r4, #1 ; x = w->pos-1
+ b token_zero_while_start_se
+token_zero_while_loop_se
+ mov r10, #0
+ strb r10, [r7, r4] ; w->buffer[x] =(unsigned char)0
+ sub r4, r4, #1 ; x--
+token_zero_while_start_se
+ cmp r4, #0
+ ldrge r7, [r0, #vp9_writer_buffer]
+ ldrb r11, [r7, r4]
+ cmpge r11, #0xff
+ beq token_zero_while_loop_se
+
+ ldr r7, [r0, #vp9_writer_buffer]
+ ldrb r10, [r7, r4] ; w->buffer[x]
+ add r10, r10, #1
+ strb r10, [r7, r4] ; w->buffer[x] + 1
+token_high_bit_not_set_se
+ rsb r4, r6, #24 ; 24-offset
+ ldr r10, [r0, #vp9_writer_buffer]
+ lsr r7, r2, r4 ; lowvalue >> (24-offset)
+ ldr r4, [r0, #vp9_writer_pos] ; w->pos
+ lsl r2, r2, r6 ; lowvalue <<= offset
+ mov r6, r3 ; shift = count
+ add r11, r4, #1 ; w->pos++
+ bic r2, r2, #0xff000000 ; lowvalue &= 0xffffff
+ str r11, [r0, #vp9_writer_pos]
+ sub r3, r3, #8 ; count -= 8
+ strb r7, [r10, r4] ; w->buffer[w->pos++]
+
+token_count_lt_zero_se
+ lsl r2, r2, r6 ; lowvalue <<= shift
+
+ subs r12, r12, #1
+ bne stop_encode_loop
+
+ ldr r10, [sp, #8] ; *size
+ ldr r11, [r10]
+ ldr r4, [r0, #vp9_writer_pos] ; w->pos
+ add r11, r11, r4 ; *size += w->pos
+ str r11, [r10]
+
+ ldr r9, [sp, #20] ; num_parts
+ sub r9, r9, #1
+ ldr r10, [sp, #28] ; i
+ cmp r10, r9 ; if(i<(num_part - 1))
+ bge skip_write_partition
+
+ ldr r12, [sp, #40] ; ptr
+ add r12, r12, r4 ; ptr += w->pos
+ str r12, [sp, #40]
+
+ ldr r9, [sp, #24] ; cx_data
+ mov r8, r4, asr #8
+ strb r4, [r9, #0]
+ strb r8, [r9, #1]
+ mov r4, r4, asr #16
+ strb r4, [r9, #2]
+
+ add r9, r9, #3 ; cx_data += 3
+ str r9, [sp, #24]
+
+skip_write_partition
+
+ ldr r11, [sp, #28] ; i
+ ldr r10, [sp, #20] ; num_parts
+
+ add r11, r11, #1 ; i++
+ str r11, [sp, #28]
+
+ ldr r7, [sp, #32] ; cpi->tp_list[i]
+ mov r1, #TOKENLIST_SZ
+ add r7, r7, r1 ; next element in cpi->tp_list
+ str r7, [sp, #32] ; cpi->tp_list[i+1]
+
+ cmp r10, r11
+ bgt numparts_loop
+
+
+ add sp, sp, #44
+ pop {r4-r11, pc}
+ ENDP
+
+_VP8_COMP_common_
+ DCD vp8_comp_common
+_VP8_COMMON_MBrows_
+ DCD vp8_common_mb_rows
+_VP8_COMP_tplist_
+ DCD vp8_comp_tplist
+_VP8_COMP_bc2_
+ DCD vp8_comp_bc2
+
+ END
--- /dev/null
+++ b/vp9/encoder/arm/armv6/vp8_fast_quantize_b_armv6.asm
@@ -1,0 +1,224 @@
+;
+; Copyright (c) 2011 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+ EXPORT |vp8_fast_quantize_b_armv6|
+
+ INCLUDE asm_enc_offsets.asm
+
+ ARM
+ REQUIRE8
+ PRESERVE8
+
+ AREA ||.text||, CODE, READONLY, ALIGN=2
+
+; r0 BLOCK *b
+; r1 BLOCKD *d
+|vp8_fast_quantize_b_armv6| PROC
+ stmfd sp!, {r1, r4-r11, lr}
+
+ ldr r3, [r0, #vp8_block_coeff] ; coeff
+ ldr r4, [r0, #vp8_block_quant_fast] ; quant_fast
+ ldr r5, [r0, #vp8_block_round] ; round
+ ldr r6, [r1, #vp8_blockd_qcoeff] ; qcoeff
+ ldr r7, [r1, #vp8_blockd_dqcoeff] ; dqcoeff
+ ldr r8, [r1, #vp8_blockd_dequant] ; dequant
+
+ ldr r2, loop_count ; loop_count=0x1000000. 'lsls' instruction
+ ; is used to update the counter so that
+ ; it can be used to mark nonzero
+ ; quantized coefficient pairs.
+
+ mov r1, #0 ; flags for quantized coeffs
+
+ ; PART 1: quantization and dequantization loop
+loop
+ ldr r9, [r3], #4 ; [z1 | z0]
+ ldr r10, [r5], #4 ; [r1 | r0]
+ ldr r11, [r4], #4 ; [q1 | q0]
+
+ ssat16 lr, #1, r9 ; [sz1 | sz0]
+ eor r9, r9, lr ; [z1 ^ sz1 | z0 ^ sz0]
+ ssub16 r9, r9, lr ; x = (z ^ sz) - sz
+ sadd16 r9, r9, r10 ; [x1+r1 | x0+r0]
+
+ ldr r12, [r3], #4 ; [z3 | z2]
+
+ smulbb r0, r9, r11 ; [(x0+r0)*q0]
+ smultt r9, r9, r11 ; [(x1+r1)*q1]
+
+ ldr r10, [r5], #4 ; [r3 | r2]
+
+ ssat16 r11, #1, r12 ; [sz3 | sz2]
+ eor r12, r12, r11 ; [z3 ^ sz3 | z2 ^ sz2]
+ pkhtb r0, r9, r0, asr #16 ; [y1 | y0]
+ ldr r9, [r4], #4 ; [q3 | q2]
+ ssub16 r12, r12, r11 ; x = (z ^ sz) - sz
+
+ sadd16 r12, r12, r10 ; [x3+r3 | x2+r2]
+
+ eor r0, r0, lr ; [(y1 ^ sz1) | (y0 ^ sz0)]
+
+ smulbb r10, r12, r9 ; [(x2+r2)*q2]
+ smultt r12, r12, r9 ; [(x3+r3)*q3]
+
+ ssub16 r0, r0, lr ; x = (y ^ sz) - sz
+
+ cmp r0, #0 ; check if zero
+ orrne r1, r1, r2, lsr #24 ; add flag for nonzero coeffs
+
+ str r0, [r6], #4 ; *qcoeff++ = x
+ ldr r9, [r8], #4 ; [dq1 | dq0]
+
+ pkhtb r10, r12, r10, asr #16 ; [y3 | y2]
+ eor r10, r10, r11 ; [(y3 ^ sz3) | (y2 ^ sz2)]
+ ssub16 r10, r10, r11 ; x = (y ^ sz) - sz
+
+ cmp r10, #0 ; check if zero
+ orrne r1, r1, r2, lsr #23 ; add flag for nonzero coeffs
+
+ str r10, [r6], #4 ; *qcoeff++ = x
+ ldr r11, [r8], #4 ; [dq3 | dq2]
+
+ smulbb r12, r0, r9 ; [x0*dq0]
+ smultt r0, r0, r9 ; [x1*dq1]
+
+ smulbb r9, r10, r11 ; [x2*dq2]
+ smultt r10, r10, r11 ; [x3*dq3]
+
+ lsls r2, r2, #2 ; update loop counter
+ strh r12, [r7, #0] ; dqcoeff[0] = [x0*dq0]
+ strh r0, [r7, #2] ; dqcoeff[1] = [x1*dq1]
+ strh r9, [r7, #4] ; dqcoeff[2] = [x2*dq2]
+ strh r10, [r7, #6] ; dqcoeff[3] = [x3*dq3]
+ add r7, r7, #8 ; dqcoeff += 8
+ bne loop
+
+ ; PART 2: check position for eob...
+ mov lr, #0 ; init eob
+ cmp r1, #0 ; coeffs after quantization?
+ ldr r11, [sp, #0] ; restore BLOCKD pointer
+ beq end ; skip eob calculations if all zero
+
+ ldr r0, [r11, #vp8_blockd_qcoeff]
+
+ ; check shortcut for nonzero qcoeffs
+ tst r1, #0x80
+ bne quant_coeff_15_14
+ tst r1, #0x20
+ bne quant_coeff_13_11
+ tst r1, #0x8
+ bne quant_coeff_12_7
+ tst r1, #0x40
+ bne quant_coeff_10_9
+ tst r1, #0x10
+ bne quant_coeff_8_3
+ tst r1, #0x2
+ bne quant_coeff_6_5
+ tst r1, #0x4
+ bne quant_coeff_4_2
+ b quant_coeff_1_0
+
+quant_coeff_15_14
+ ldrh r2, [r0, #30] ; rc=15, i=15
+ mov lr, #16
+ cmp r2, #0
+ bne end
+
+ ldrh r3, [r0, #28] ; rc=14, i=14
+ mov lr, #15
+ cmp r3, #0
+ bne end
+
+quant_coeff_13_11
+ ldrh r2, [r0, #22] ; rc=11, i=13
+ mov lr, #14
+ cmp r2, #0
+ bne end
+
+quant_coeff_12_7
+ ldrh r3, [r0, #14] ; rc=7, i=12
+ mov lr, #13
+ cmp r3, #0
+ bne end
+
+ ldrh r2, [r0, #20] ; rc=10, i=11
+ mov lr, #12
+ cmp r2, #0
+ bne end
+
+quant_coeff_10_9
+ ldrh r3, [r0, #26] ; rc=13, i=10
+ mov lr, #11
+ cmp r3, #0
+ bne end
+
+ ldrh r2, [r0, #24] ; rc=12, i=9
+ mov lr, #10
+ cmp r2, #0
+ bne end
+
+quant_coeff_8_3
+ ldrh r3, [r0, #18] ; rc=9, i=8
+ mov lr, #9
+ cmp r3, #0
+ bne end
+
+ ldrh r2, [r0, #12] ; rc=6, i=7
+ mov lr, #8
+ cmp r2, #0
+ bne end
+
+quant_coeff_6_5
+ ldrh r3, [r0, #6] ; rc=3, i=6
+ mov lr, #7
+ cmp r3, #0
+ bne end
+
+ ldrh r2, [r0, #4] ; rc=2, i=5
+ mov lr, #6
+ cmp r2, #0
+ bne end
+
+quant_coeff_4_2
+ ldrh r3, [r0, #10] ; rc=5, i=4
+ mov lr, #5
+ cmp r3, #0
+ bne end
+
+ ldrh r2, [r0, #16] ; rc=8, i=3
+ mov lr, #4
+ cmp r2, #0
+ bne end
+
+ ldrh r3, [r0, #8] ; rc=4, i=2
+ mov lr, #3
+ cmp r3, #0
+ bne end
+
+quant_coeff_1_0
+ ldrh r2, [r0, #2] ; rc=1, i=1
+ mov lr, #2
+ cmp r2, #0
+ bne end
+
+ mov lr, #1 ; rc=0, i=0
+
+end
+ str lr, [r11, #vp8_blockd_eob]
+ ldmfd sp!, {r1, r4-r11, pc}
+
+ ENDP
+
+loop_count
+ DCD 0x1000000
+
+ END
+
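
The SSAT16/EOR/SSUB16 sequence in the quantisation loop computes x = (z ^ sz) - sz two coefficients at a time, which is |z| expressed through the sign mask; the rounded magnitude is then scaled by the fast quantiser, the sign is restored the same way, and the dequantised coefficient is written for reconstruction. A scalar C sketch of the per-coefficient arithmetic; parameter names are illustrative, not the real BLOCK/BLOCKD fields.

/* Sketch of the math the SIMD loop above applies to coefficient pairs. */
static void fast_quantize_coeff_sketch(short z, short round, short quant_fast,
                                       short dequant, short *qcoeff,
                                       short *dqcoeff) {
  const short sz = (short)(z >> 15);                     /* sign mask: 0 or -1 (ssat16 #1) */
  const short x  = (short)((z ^ sz) - sz);               /* |z| via xor/subtract           */
  short y = (short)(((x + round) * quant_fast) >> 16);   /* quantize                       */

  y = (short)((y ^ sz) - sz);                            /* restore the original sign      */
  *qcoeff  = y;
  *dqcoeff = (short)(y * dequant);                       /* value used for reconstruction  */
}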
--- /dev/null
+++ b/vp9/encoder/arm/armv6/vp8_mse16x16_armv6.asm
@@ -1,0 +1,138 @@
+;
+; Copyright (c) 2011 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+ EXPORT |vp8_mse16x16_armv6|
+
+ ARM
+
+ AREA ||.text||, CODE, READONLY, ALIGN=2
+
+; r0 unsigned char *src_ptr
+; r1 int source_stride
+; r2 unsigned char *ref_ptr
+; r3 int recon_stride
+; stack unsigned int *sse
+;
+;note: Based on vp9_variance16x16_armv6. In this function, sum is never used,
+;      so that part of the calculation has been removed.
+
+|vp8_mse16x16_armv6| PROC
+
+ push {r4-r9, lr}
+
+ pld [r0, r1, lsl #0]
+ pld [r2, r3, lsl #0]
+
+ mov r12, #16 ; set loop counter to 16 (=block height)
+ mov r4, #0 ; initialize sse = 0
+
+loop
+ ; 1st 4 pixels
+ ldr r5, [r0, #0x0] ; load 4 src pixels
+ ldr r6, [r2, #0x0] ; load 4 ref pixels
+
+ mov lr, #0 ; constant zero
+
+ usub8 r8, r5, r6 ; calculate difference
+ pld [r0, r1, lsl #1]
+ sel r7, r8, lr ; select bytes with positive difference
+ usub8 r9, r6, r5 ; calculate difference with reversed operands
+ pld [r2, r3, lsl #1]
+ sel r8, r9, lr ; select bytes with negative difference
+
+ ; calculate partial sums
+ usad8 r5, r7, lr ; calculate sum of positive differences
+ usad8 r6, r8, lr ; calculate sum of negative differences
+ orr r8, r8, r7 ; differences of all 4 pixels
+
+ ldr r5, [r0, #0x4] ; load 4 src pixels
+
+ ; calculate sse
+ uxtb16 r6, r8 ; byte (two pixels) to halfwords
+ uxtb16 r7, r8, ror #8 ; another two pixels to halfwords
+ smlad r4, r6, r6, r4 ; dual signed multiply, add and accumulate (1)
+
+ ; 2nd 4 pixels
+ ldr r6, [r2, #0x4] ; load 4 ref pixels
+ smlad r4, r7, r7, r4 ; dual signed multiply, add and accumulate (2)
+
+ usub8 r8, r5, r6 ; calculate difference
+ sel r7, r8, lr ; select bytes with positive difference
+ usub8 r9, r6, r5 ; calculate difference with reversed operands
+ sel r8, r9, lr ; select bytes with negative difference
+
+ ; calculate partial sums
+ usad8 r5, r7, lr ; calculate sum of positive differences
+ usad8 r6, r8, lr ; calculate sum of negative differences
+ orr r8, r8, r7 ; differences of all 4 pixels
+ ldr r5, [r0, #0x8] ; load 4 src pixels
+ ; calculate sse
+ uxtb16 r6, r8 ; byte (two pixels) to halfwords
+ uxtb16 r7, r8, ror #8 ; another two pixels to halfwords
+ smlad r4, r6, r6, r4 ; dual signed multiply, add and accumulate (1)
+
+ ; 3rd 4 pixels
+ ldr r6, [r2, #0x8] ; load 4 ref pixels
+ smlad r4, r7, r7, r4 ; dual signed multiply, add and accumulate (2)
+
+ usub8 r8, r5, r6 ; calculate difference
+ sel r7, r8, lr ; select bytes with positive difference
+ usub8 r9, r6, r5 ; calculate difference with reversed operands
+ sel r8, r9, lr ; select bytes with negative difference
+
+ ; calculate partial sums
+ usad8 r5, r7, lr ; calculate sum of positive differences
+ usad8 r6, r8, lr ; calculate sum of negative differences
+ orr r8, r8, r7 ; differences of all 4 pixels
+
+ ldr r5, [r0, #0xc] ; load 4 src pixels
+
+ ; calculate sse
+ uxtb16 r6, r8 ; byte (two pixels) to halfwords
+ uxtb16 r7, r8, ror #8 ; another two pixels to halfwords
+ smlad r4, r6, r6, r4 ; dual signed multiply, add and accumulate (1)
+
+ ; 4th 4 pixels
+ ldr r6, [r2, #0xc] ; load 4 ref pixels
+ smlad r4, r7, r7, r4 ; dual signed multiply, add and accumulate (2)
+
+ usub8 r8, r5, r6 ; calculate difference
+ add r0, r0, r1 ; set src_ptr to next row
+ sel r7, r8, lr ; select bytes with positive difference
+ usub8 r9, r6, r5 ; calculate difference with reversed operands
+ add r2, r2, r3 ; set dst_ptr to next row
+ sel r8, r9, lr ; select bytes with negative difference
+
+ ; calculate partial sums
+ usad8 r5, r7, lr ; calculate sum of positive differences
+ usad8 r6, r8, lr ; calculate sum of negative differences
+ orr r8, r8, r7 ; differences of all 4 pixels
+
+ subs r12, r12, #1 ; next row
+
+ ; calculate sse
+ uxtb16 r6, r8 ; byte (two pixels) to halfwords
+ uxtb16 r7, r8, ror #8 ; another two pixels to halfwords
+ smlad r4, r6, r6, r4 ; dual signed multiply, add and accumulate (1)
+ smlad r4, r7, r7, r4 ; dual signed multiply, add and accumulate (2)
+
+ bne loop
+
+ ; return stuff
+ ldr r1, [sp, #28] ; get address of sse
+ mov r0, r4 ; return sse
+ str r4, [r1] ; store sse
+
+ pop {r4-r9, pc}
+
+ ENDP
+
+ END
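
Each group of four pixels above is differenced with USUB8/SEL, the absolute byte differences are widened to halfwords with UXTB16, and SMLAD squares and accumulates them; since only sse is needed, the sum term of the full variance kernel is dropped. The scalar equivalent, for reference (a sketch, not code from the tree):

/* Scalar sketch of the 16x16 sum of squared errors computed above. */
static unsigned int mse16x16_sketch(const unsigned char *src, int src_stride,
                                    const unsigned char *ref, int ref_stride,
                                    unsigned int *sse) {
  unsigned int acc = 0;
  int r, c;

  for (r = 0; r < 16; r++) {
    for (c = 0; c < 16; c++) {
      const int d = src[c] - ref[c];   /* per-pixel difference      */
      acc += (unsigned int)(d * d);    /* accumulate squared error  */
    }
    src += src_stride;
    ref += ref_stride;
  }

  *sse = acc;
  return acc;   /* the armv6 routine both stores and returns sse */
}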
--- /dev/null
+++ b/vp9/encoder/arm/armv6/vp8_sad16x16_armv6.asm
@@ -1,0 +1,96 @@
+;
+; Copyright (c) 2011 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+ EXPORT |vp8_sad16x16_armv6|
+
+ ARM
+ REQUIRE8
+ PRESERVE8
+
+ AREA ||.text||, CODE, READONLY, ALIGN=2
+
+; r0 const unsigned char *src_ptr
+; r1 int src_stride
+; r2 const unsigned char *ref_ptr
+; r3 int ref_stride
+; stack max_sad (not used)
+|vp8_sad16x16_armv6| PROC
+ stmfd sp!, {r4-r12, lr}
+
+ pld [r0, r1, lsl #0]
+ pld [r2, r3, lsl #0]
+ pld [r0, r1, lsl #1]
+ pld [r2, r3, lsl #1]
+
+ mov r4, #0 ; sad = 0;
+ mov r5, #8 ; loop count
+
+loop
+ ; 1st row
+ ldr r6, [r0, #0x0] ; load 4 src pixels (1A)
+ ldr r8, [r2, #0x0] ; load 4 ref pixels (1A)
+ ldr r7, [r0, #0x4] ; load 4 src pixels (1A)
+ ldr r9, [r2, #0x4] ; load 4 ref pixels (1A)
+ ldr r10, [r0, #0x8] ; load 4 src pixels (1B)
+ ldr r11, [r0, #0xC] ; load 4 src pixels (1B)
+
+ usada8 r4, r8, r6, r4 ; calculate sad for 4 pixels
+ usad8 r8, r7, r9 ; calculate sad for 4 pixels
+
+ ldr r12, [r2, #0x8] ; load 4 ref pixels (1B)
+ ldr lr, [r2, #0xC] ; load 4 ref pixels (1B)
+
+ add r0, r0, r1 ; set src pointer to next row
+ add r2, r2, r3 ; set dst pointer to next row
+
+ pld [r0, r1, lsl #1]
+ pld [r2, r3, lsl #1]
+
+ usada8 r4, r10, r12, r4 ; calculate sad for 4 pixels
+ usada8 r8, r11, lr, r8 ; calculate sad for 4 pixels
+
+ ldr r6, [r0, #0x0] ; load 4 src pixels (2A)
+ ldr r7, [r0, #0x4] ; load 4 src pixels (2A)
+ add r4, r4, r8 ; add partial sad values
+
+ ; 2nd row
+ ldr r8, [r2, #0x0] ; load 4 ref pixels (2A)
+ ldr r9, [r2, #0x4] ; load 4 ref pixels (2A)
+ ldr r10, [r0, #0x8] ; load 4 src pixels (2B)
+ ldr r11, [r0, #0xC] ; load 4 src pixels (2B)
+
+ usada8 r4, r6, r8, r4 ; calculate sad for 4 pixels
+ usad8 r8, r7, r9 ; calculate sad for 4 pixels
+
+ ldr r12, [r2, #0x8] ; load 4 ref pixels (2B)
+ ldr lr, [r2, #0xC] ; load 4 ref pixels (2B)
+
+ add r0, r0, r1 ; set src pointer to next row
+ add r2, r2, r3 ; set dst pointer to next row
+
+ usada8 r4, r10, r12, r4 ; calculate sad for 4 pixels
+ usada8 r8, r11, lr, r8 ; calculate sad for 4 pixels
+
+ pld [r0, r1, lsl #1]
+ pld [r2, r3, lsl #1]
+
+ subs r5, r5, #1 ; decrement loop counter
+ add r4, r4, r8 ; add partial sad values
+
+ bne loop
+
+ mov r0, r4 ; return sad
+ ldmfd sp!, {r4-r12, pc}
+
+ ENDP
+
+ END
+
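
USADA8 accumulates the absolute differences of four byte pairs per instruction, and each loop iteration covers two 16-pixel rows, so eight iterations finish the 16x16 block. The scalar equivalent (sketch only):

#include <stdlib.h>

/* Scalar sketch of the 16x16 sum of absolute differences computed above. */
static unsigned int sad16x16_sketch(const unsigned char *src, int src_stride,
                                    const unsigned char *ref, int ref_stride) {
  unsigned int sad = 0;
  int r, c;

  for (r = 0; r < 16; r++) {
    for (c = 0; c < 16; c++)
      sad += (unsigned int)abs(src[c] - ref[c]);
    src += src_stride;
    ref += ref_stride;
  }
  return sad;
}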
--- /dev/null
+++ b/vp9/encoder/arm/armv6/vp8_short_fdct4x4_armv6.asm
@@ -1,0 +1,262 @@
+;
+; Copyright (c) 2011 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+ EXPORT |vp8_short_fdct4x4_armv6|
+
+ ARM
+ REQUIRE8
+ PRESERVE8
+
+ AREA |.text|, CODE, READONLY
+; void vp8_short_fdct4x4_c(short *input, short *output, int pitch)
+|vp8_short_fdct4x4_armv6| PROC
+
+ stmfd sp!, {r4 - r12, lr}
+
+ ; PART 1
+
+ ; coeffs 0-3
+ ldrd r4, r5, [r0] ; [i1 | i0] [i3 | i2]
+
+ ldr r10, c7500
+ ldr r11, c14500
+ ldr r12, c0x22a453a0 ; [2217*4 | 5352*4]
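+ ; (2217/4096 and 5352/4096 approximate sqrt(2)*sin(pi/8) and sqrt(2)*cos(pi/8),
+ ;  the rotation constants of the 4x4 forward DCT; here they are pre-multiplied by 4)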
+ ldr lr, c0x00080008
+ ror r5, r5, #16 ; [i2 | i3]
+
+ qadd16 r6, r4, r5 ; [i1+i2 | i0+i3] = [b1 | a1] without shift
+ qsub16 r7, r4, r5 ; [i1-i2 | i0-i3] = [c1 | d1] without shift
+
+ add r0, r0, r2 ; update input pointer
+
+ qadd16 r7, r7, r7 ; 2*[c1|d1] --> we can use smlad and smlsd
+ ; with 2217*4 and 5352*4 without losing the
+ ; sign bit (overflow)
+
+ smuad r4, r6, lr ; o0 = (i1+i2)*8 + (i0+i3)*8
+ smusd r5, r6, lr ; o2 = (i1+i2)*8 - (i0+i3)*8
+
+ smlad r6, r7, r12, r11 ; o1 = (c1 * 2217 + d1 * 5352 + 14500)
+ smlsdx r7, r7, r12, r10 ; o3 = (d1 * 2217 - c1 * 5352 + 7500)
+
+ ldrd r8, r9, [r0] ; [i5 | i4] [i7 | i6]
+
+ pkhbt r3, r4, r6, lsl #4 ; [o1 | o0], keep in register for PART 2
+ pkhbt r6, r5, r7, lsl #4 ; [o3 | o2]
+
+ str r6, [r1, #4]
+
+ ; coeffs 4-7
+ ror r9, r9, #16 ; [i6 | i7]
+
+ qadd16 r6, r8, r9 ; [i5+i6 | i4+i7] = [b1 | a1] without shift
+ qsub16 r7, r8, r9 ; [i5-i6 | i4-i7] = [c1 | d1] without shift
+
+ add r0, r0, r2 ; update input pointer
+
+ qadd16 r7, r7, r7 ; 2x[c1|d1] --> we can use smlad and smlsd
+ ; with 2217*4 and 5352*4 without losing the
+ ; sign bit (overflow)
+
+ smuad r9, r6, lr ; o4 = (i5+i6)*8 + (i4+i7)*8
+ smusd r8, r6, lr ; o6 = (i5+i6)*8 - (i4+i7)*8
+
+ smlad r6, r7, r12, r11 ; o5 = (c1 * 2217 + d1 * 5352 + 14500)
+ smlsdx r7, r7, r12, r10 ; o7 = (d1 * 2217 - c1 * 5352 + 7500)
+
+ ldrd r4, r5, [r0] ; [i9 | i8] [i11 | i10]
+
+ pkhbt r9, r9, r6, lsl #4 ; [o5 | o4], keep in register for PART 2
+ pkhbt r6, r8, r7, lsl #4 ; [o7 | o6]
+
+ str r6, [r1, #12]
+
+ ; coeffs 8-11
+ ror r5, r5, #16 ; [i10 | i11]
+
+ qadd16 r6, r4, r5 ; [i9+i10 | i8+i11]=[b1 | a1] without shift
+ qsub16 r7, r4, r5 ; [i9-i10 | i8-i11]=[c1 | d1] without shift
+
+ add r0, r0, r2 ; update input pointer
+
+ qadd16 r7, r7, r7 ; 2x[c1|d1] --> we can use smlad and smlsd
+ ; with 2217*4 and 5352*4 without losing the
+ ; sign bit (overflow)
+
+ smuad r2, r6, lr ; o8 = (i9+i10)*8 + (i8+i11)*8
+ smusd r8, r6, lr ; o10 = (i9+i10)*8 - (i8+i11)*8
+
+ smlad r6, r7, r12, r11 ; o9 = (c1 * 2217 + d1 * 5352 + 14500)
+ smlsdx r7, r7, r12, r10 ; o11 = (d1 * 2217 - c1 * 5352 + 7500)
+
+ ldrd r4, r5, [r0] ; [i13 | i12] [i15 | i14]
+
+ pkhbt r2, r2, r6, lsl #4 ; [o9 | o8], keep in register for PART 2
+ pkhbt r6, r8, r7, lsl #4 ; [o11 | o10]
+
+ str r6, [r1, #20]
+
+ ; coeffs 12-15
+ ror r5, r5, #16 ; [i14 | i15]
+
+ qadd16 r6, r4, r5 ; [i13+i14 | i12+i15]=[b1|a1] without shift
+ qsub16 r7, r4, r5 ; [i13-i14 | i12-i15]=[c1|d1] without shift
+
+ qadd16 r7, r7, r7 ; 2x[c1|d1] --> we can use smlad and smlsd
+ ; with 2217*4 and 5352*4 without losing the
+ ; sign bit (overflow)
+
+ smuad r4, r6, lr ; o12 = (i13+i14)*8 + (i12+i15)*8
+ smusd r5, r6, lr ; o14 = (i13+i14)*8 - (i12+i15)*8
+
+ smlad r6, r7, r12, r11 ; o13 = (c1 * 2217 + d1 * 5352 + 14500)
+ smlsdx r7, r7, r12, r10 ; o15 = (d1 * 2217 - c1 * 5352 + 7500)
+
+ pkhbt r0, r4, r6, lsl #4 ; [o13 | o12], keep in register for PART 2
+ pkhbt r6, r5, r7, lsl #4 ; [o15 | o14]
+
+ str r6, [r1, #28]
+
+
+ ; PART 2 -------------------------------------------------
+ ldr r11, c12000
+ ldr r10, c51000
+ ldr lr, c0x00070007
+
+ qadd16 r4, r3, r0 ; a1 = [i1+i13 | i0+i12]
+ qadd16 r5, r9, r2 ; b1 = [i5+i9 | i4+i8]
+ qsub16 r6, r9, r2 ; c1 = [i5-i9 | i4-i8]
+ qsub16 r7, r3, r0 ; d1 = [i1-i13 | i0-i12]
+
+ qadd16 r4, r4, lr ; a1 + 7
+
+ add r0, r11, #0x10000 ; r0 = 12000 + (1<<16): the extra 1<<16 becomes '+1' after the later >>16, used when d1 != 0
+
+ qadd16 r2, r4, r5 ; a1 + b1 + 7
+ qsub16 r3, r4, r5 ; a1 - b1 + 7
+
+ ldr r12, c0x08a914e8 ; [2217 | 5352]
+
+ lsl r8, r2, #16 ; prepare bottom halfword for scaling
+ asr r2, r2, #4 ; scale top halfword
+ lsl r9, r3, #16 ; prepare bottom halfword for scaling
+ asr r3, r3, #4 ; scale top halfword
+ pkhtb r4, r2, r8, asr #20 ; pack and scale bottom halfword
+ pkhtb r5, r3, r9, asr #20 ; pack and scale bottom halfword
+
+ smulbt r2, r6, r12 ; [ ------ | c1*2217]
+ str r4, [r1, #0] ; [ o1 | o0]
+ smultt r3, r6, r12 ; [c1*2217 | ------ ]
+ str r5, [r1, #16] ; [ o9 | o8]
+
+ smlabb r8, r7, r12, r2 ; [ ------ | d1*5352]
+ smlatb r9, r7, r12, r3 ; [d1*5352 | ------ ]
+
+ smulbb r2, r6, r12 ; [ ------ | c1*5352]
+ smultb r3, r6, r12 ; [c1*5352 | ------ ]
+
+ lsls r6, r7, #16 ; d1 != 0 ?
+ addeq r8, r8, r11 ; c1_b*2217+d1_b*5352+12000 + (d==0)
+ addne r8, r8, r0 ; c1_b*2217+d1_b*5352+12000 + (d!=0)
+ asrs r6, r7, #16
+ addeq r9, r9, r11 ; c1_t*2217+d1_t*5352+12000 + (d==0)
+ addne r9, r9, r0 ; c1_t*2217+d1_t*5352+12000 + (d!=0)
+
+ smlabt r4, r7, r12, r10 ; [ ------ | d1*2217] + 51000
+ smlatt r5, r7, r12, r10 ; [d1*2217 | ------ ] + 51000
+
+ pkhtb r9, r9, r8, asr #16
+
+ sub r4, r4, r2
+ sub r5, r5, r3
+
+ ldr r3, [r1, #4] ; [i3 | i2]
+
+ pkhtb r5, r5, r4, asr #16 ; [o13|o12]
+
+ str r9, [r1, #8] ; [o5 | o4]
+
+ ldr r9, [r1, #12] ; [i7 | i6]
+ ldr r8, [r1, #28] ; [i15|i14]
+ ldr r2, [r1, #20] ; [i11|i10]
+ str r5, [r1, #24] ; [o13|o12]
+
+ qadd16 r4, r3, r8 ; a1 = [i3+i15 | i2+i14]
+ qadd16 r5, r9, r2 ; b1 = [i7+i11 | i6+i10]
+
+ qadd16 r4, r4, lr ; a1 + 7
+
+ qsub16 r6, r9, r2 ; c1 = [i7-i11 | i6-i10]
+ qadd16 r2, r4, r5 ; a1 + b1 + 7
+ qsub16 r7, r3, r8 ; d1 = [i3-i15 | i2-i14]
+ qsub16 r3, r4, r5 ; a1 - b1 + 7
+
+ lsl r8, r2, #16 ; prepare bottom halfword for scaling
+ asr r2, r2, #4 ; scale top halfword
+ lsl r9, r3, #16 ; prepare bottom halfword for scaling
+ asr r3, r3, #4 ; scale top halfword
+ pkhtb r4, r2, r8, asr #20 ; pack and scale bottom halfword
+ pkhtb r5, r3, r9, asr #20 ; pack and scale bottom halfword
+
+ smulbt r2, r6, r12 ; [ ------ | c1*2217]
+ str r4, [r1, #4] ; [ o3 | o2]
+ smultt r3, r6, r12 ; [c1*2217 | ------ ]
+ str r5, [r1, #20] ; [ o11 | o10]
+
+ smlabb r8, r7, r12, r2 ; [ ------ | d1*5352]
+ smlatb r9, r7, r12, r3 ; [d1*5352 | ------ ]
+
+ smulbb r2, r6, r12 ; [ ------ | c1*5352]
+ smultb r3, r6, r12 ; [c1*5352 | ------ ]
+
+ lsls r6, r7, #16 ; d1 != 0 ?
+ addeq r8, r8, r11 ; c1_b*2217+d1_b*5352+12000 + (d==0)
+ addne r8, r8, r0 ; c1_b*2217+d1_b*5352+12000 + (d!=0)
+
+ asrs r6, r7, #16
+ addeq r9, r9, r11 ; c1_t*2217+d1_t*5352+12000 + (d==0)
+ addne r9, r9, r0 ; c1_t*2217+d1_t*5352+12000 + (d!=0)
+
+ smlabt r4, r7, r12, r10 ; [ ------ | d1*2217] + 51000
+ smlatt r5, r7, r12, r10 ; [d1*2217 | ------ ] + 51000
+
+ pkhtb r9, r9, r8, asr #16
+
+ sub r4, r4, r2
+ sub r5, r5, r3
+
+ str r9, [r1, #12] ; [o7 | o6]
+ pkhtb r5, r5, r4, asr #16 ; [o15|o14]
+
+ str r5, [r1, #28] ; [o15|o14]
+
+ ldmfd sp!, {r4 - r12, pc}
+
+ ENDP
+
+; Used constants
+c7500
+ DCD 7500
+c14500
+ DCD 14500
+c0x22a453a0
+ DCD 0x22a453a0
+c0x00080008
+ DCD 0x00080008
+c12000
+ DCD 12000
+c51000
+ DCD 51000
+c0x00070007
+ DCD 0x00070007
+c0x08a914e8
+ DCD 0x08a914e8
+
+ END
--- /dev/null
+++ b/vp9/encoder/arm/armv6/vp8_subtract_armv6.asm
@@ -1,0 +1,265 @@
+;
+; Copyright (c) 2011 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+ EXPORT |vp8_subtract_mby_armv6|
+ EXPORT |vp8_subtract_mbuv_armv6|
+ EXPORT |vp8_subtract_b_armv6|
+
+ INCLUDE asm_enc_offsets.asm
+
+ ARM
+ REQUIRE8
+ PRESERVE8
+
+ AREA ||.text||, CODE, READONLY, ALIGN=2
+
+; r0 BLOCK *be
+; r1 BLOCKD *bd
+; r2 int pitch
+|vp8_subtract_b_armv6| PROC
+
+ stmfd sp!, {r4-r9}
+
+ ldr r4, [r0, #vp8_block_base_src]
+ ldr r5, [r0, #vp8_block_src]
+ ldr r6, [r0, #vp8_block_src_diff]
+
+ ldr r3, [r4]
+ ldr r7, [r0, #vp8_block_src_stride]
+ add r3, r3, r5 ; src = *base_src + src
+ ldr r8, [r1, #vp8_blockd_predictor]
+
+ mov r9, #4 ; loop count
+
+loop_block
+
+ ldr r0, [r3], r7 ; src
+ ldr r1, [r8], r2 ; pred
+
+ uxtb16 r4, r0 ; [s2 | s0]
+ uxtb16 r5, r1 ; [p2 | p0]
+ uxtb16 r0, r0, ror #8 ; [s3 | s1]
+ uxtb16 r1, r1, ror #8 ; [p3 | p1]
+
+ usub16 r4, r4, r5 ; [d2 | d0]
+ usub16 r5, r0, r1 ; [d3 | d1]
+
+ subs r9, r9, #1 ; decrement loop counter
+
+ pkhbt r0, r4, r5, lsl #16 ; [d1 | d0]
+ pkhtb r1, r5, r4, asr #16 ; [d3 | d2]
+
+ str r0, [r6, #0] ; diff
+ str r1, [r6, #4] ; diff
+
+ add r6, r6, r2, lsl #1 ; update diff pointer
+ bne loop_block
+
+ ldmfd sp!, {r4-r9}
+ mov pc, lr
+
+ ENDP
+
+
+; r0 short *diff
+; r1 unsigned char *usrc
+; r2 unsigned char *vsrc
+; r3 unsigned char *pred
+; stack int stride
+|vp8_subtract_mbuv_armv6| PROC
+
+ stmfd sp!, {r4-r12, lr}
+
+ add r0, r0, #512 ; set *diff pointer to Cb
+ add r3, r3, #256 ; set *pred pointer to Cb
+
+ mov r4, #8 ; loop count
+ ldr r5, [sp, #40] ; stride
+
+ ; Subtract U block
+loop_u
+ ldr r6, [r1] ; src (A)
+ ldr r7, [r3], #4 ; pred (A)
+
+ uxtb16 r8, r6 ; [s2 | s0] (A)
+ uxtb16 r9, r7 ; [p2 | p0] (A)
+ uxtb16 r10, r6, ror #8 ; [s3 | s1] (A)
+ uxtb16 r11, r7, ror #8 ; [p3 | p1] (A)
+
+ usub16 r6, r8, r9 ; [d2 | d0] (A)
+ usub16 r7, r10, r11 ; [d3 | d1] (A)
+
+ ldr r10, [r1, #4] ; src (B)
+ ldr r11, [r3], #4 ; pred (B)
+
+ pkhbt r8, r6, r7, lsl #16 ; [d1 | d0] (A)
+ pkhtb r9, r7, r6, asr #16 ; [d3 | d2] (A)
+
+ str r8, [r0], #4 ; diff (A)
+ uxtb16 r8, r10 ; [s2 | s0] (B)
+ str r9, [r0], #4 ; diff (A)
+
+ uxtb16 r9, r11 ; [p2 | p0] (B)
+ uxtb16 r10, r10, ror #8 ; [s3 | s1] (B)
+ uxtb16 r11, r11, ror #8 ; [p3 | p1] (B)
+
+ usub16 r6, r8, r9 ; [d2 | d0] (B)
+ usub16 r7, r10, r11 ; [d3 | d1] (B)
+
+ add r1, r1, r5 ; update usrc pointer
+
+ pkhbt r8, r6, r7, lsl #16 ; [d1 | d0] (B)
+ pkhtb r9, r7, r6, asr #16 ; [d3 | d2] (B)
+
+ str r8, [r0], #4 ; diff (B)
+ subs r4, r4, #1 ; update loop counter
+ str r9, [r0], #4 ; diff (B)
+
+ bne loop_u
+
+ mov r4, #8 ; loop count
+
+ ; Subtract V block
+loop_v
+ ldr r6, [r2] ; src (A)
+ ldr r7, [r3], #4 ; pred (A)
+
+ uxtb16 r8, r6 ; [s2 | s0] (A)
+ uxtb16 r9, r7 ; [p2 | p0] (A)
+ uxtb16 r10, r6, ror #8 ; [s3 | s1] (A)
+ uxtb16 r11, r7, ror #8 ; [p3 | p1] (A)
+
+ usub16 r6, r8, r9 ; [d2 | d0] (A)
+ usub16 r7, r10, r11 ; [d3 | d1] (A)
+
+ ldr r10, [r2, #4] ; src (B)
+ ldr r11, [r3], #4 ; pred (B)
+
+ pkhbt r8, r6, r7, lsl #16 ; [d1 | d0] (A)
+ pkhtb r9, r7, r6, asr #16 ; [d3 | d2] (A)
+
+ str r8, [r0], #4 ; diff (A)
+ uxtb16 r8, r10 ; [s2 | s0] (B)
+ str r9, [r0], #4 ; diff (A)
+
+ uxtb16 r9, r11 ; [p2 | p0] (B)
+ uxtb16 r10, r10, ror #8 ; [s3 | s1] (B)
+ uxtb16 r11, r11, ror #8 ; [p3 | p1] (B)
+
+ usub16 r6, r8, r9 ; [d2 | d0] (B)
+ usub16 r7, r10, r11 ; [d3 | d1] (B)
+
+ add r2, r2, r5 ; update vsrc pointer
+
+ pkhbt r8, r6, r7, lsl #16 ; [d1 | d0] (B)
+ pkhtb r9, r7, r6, asr #16 ; [d3 | d2] (B)
+
+ str r8, [r0], #4 ; diff (B)
+ subs r4, r4, #1 ; update loop counter
+ str r9, [r0], #4 ; diff (B)
+
+ bne loop_v
+
+ ldmfd sp!, {r4-r12, pc}
+
+ ENDP
+
+
+; r0 short *diff
+; r1 unsigned char *src
+; r2 unsigned char *pred
+; r3 int stride
+|vp8_subtract_mby_armv6| PROC
+
+ stmfd sp!, {r4-r11}
+
+ mov r4, #16
+loop
+ ldr r6, [r1] ; src (A)
+ ldr r7, [r2], #4 ; pred (A)
+
+ uxtb16 r8, r6 ; [s2 | s0] (A)
+ uxtb16 r9, r7 ; [p2 | p0] (A)
+ uxtb16 r10, r6, ror #8 ; [s3 | s1] (A)
+ uxtb16 r11, r7, ror #8 ; [p3 | p1] (A)
+
+ usub16 r6, r8, r9 ; [d2 | d0] (A)
+ usub16 r7, r10, r11 ; [d3 | d1] (A)
+
+ ldr r10, [r1, #4] ; src (B)
+ ldr r11, [r2], #4 ; pred (B)
+
+ pkhbt r8, r6, r7, lsl #16 ; [d1 | d0] (A)
+ pkhtb r9, r7, r6, asr #16 ; [d3 | d2] (A)
+
+ str r8, [r0], #4 ; diff (A)
+ uxtb16 r8, r10 ; [s2 | s0] (B)
+ str r9, [r0], #4 ; diff (A)
+
+ uxtb16 r9, r11 ; [p2 | p0] (B)
+ uxtb16 r10, r10, ror #8 ; [s3 | s1] (B)
+ uxtb16 r11, r11, ror #8 ; [p3 | p1] (B)
+
+ usub16 r6, r8, r9 ; [d2 | d0] (B)
+ usub16 r7, r10, r11 ; [d3 | d1] (B)
+
+ ldr r10, [r1, #8] ; src (C)
+ ldr r11, [r2], #4 ; pred (C)
+
+ pkhbt r8, r6, r7, lsl #16 ; [d1 | d0] (B)
+ pkhtb r9, r7, r6, asr #16 ; [d3 | d2] (B)
+
+ str r8, [r0], #4 ; diff (B)
+ uxtb16 r8, r10 ; [s2 | s0] (C)
+ str r9, [r0], #4 ; diff (B)
+
+ uxtb16 r9, r11 ; [p2 | p0] (C)
+ uxtb16 r10, r10, ror #8 ; [s3 | s1] (C)
+ uxtb16 r11, r11, ror #8 ; [p3 | p1] (C)
+
+ usub16 r6, r8, r9 ; [d2 | d0] (C)
+ usub16 r7, r10, r11 ; [d3 | d1] (C)
+
+ ldr r10, [r1, #12] ; src (D)
+ ldr r11, [r2], #4 ; pred (D)
+
+ pkhbt r8, r6, r7, lsl #16 ; [d1 | d0] (C)
+ pkhtb r9, r7, r6, asr #16 ; [d3 | d2] (C)
+
+ str r8, [r0], #4 ; diff (C)
+ uxtb16 r8, r10 ; [s2 | s0] (D)
+ str r9, [r0], #4 ; diff (C)
+
+ uxtb16 r9, r11 ; [p2 | p0] (D)
+ uxtb16 r10, r10, ror #8 ; [s3 | s1] (D)
+ uxtb16 r11, r11, ror #8 ; [p3 | p1] (D)
+
+ usub16 r6, r8, r9 ; [d2 | d0] (D)
+ usub16 r7, r10, r11 ; [d3 | d1] (D)
+
+ add r1, r1, r3 ; update src pointer
+
+ pkhbt r8, r6, r7, lsl #16 ; [d1 | d0] (D)
+ pkhtb r9, r7, r6, asr #16 ; [d3 | d2] (D)
+
+ str r8, [r0], #4 ; diff (D)
+ subs r4, r4, #1 ; update loop counter
+ str r9, [r0], #4 ; diff (D)
+
+ bne loop
+
+ ldmfd sp!, {r4-r11}
+ mov pc, lr
+
+ ENDP
+
+ END
+
--- /dev/null
+++ b/vp9/encoder/arm/armv6/vp8_variance16x16_armv6.asm
@@ -1,0 +1,154 @@
+;
+; Copyright (c) 2011 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+ EXPORT |vp9_variance16x16_armv6|
+
+ ARM
+ REQUIRE8
+ PRESERVE8
+
+ AREA ||.text||, CODE, READONLY, ALIGN=2
+
+; r0 unsigned char *src_ptr
+; r1 int source_stride
+; r2 unsigned char *ref_ptr
+; r3 int recon_stride
+; stack unsigned int *sse
+|vp9_variance16x16_armv6| PROC
+
+ stmfd sp!, {r4-r12, lr}
+
+ pld [r0, r1, lsl #0]
+ pld [r2, r3, lsl #0]
+
+ mov r8, #0 ; initialize sum = 0
+ mov r11, #0 ; initialize sse = 0
+ mov r12, #16 ; set loop counter to 16 (=block height)
+
+loop
+ ; 1st 4 pixels
+ ldr r4, [r0, #0] ; load 4 src pixels
+ ldr r5, [r2, #0] ; load 4 ref pixels
+
+ mov lr, #0 ; constant zero
+
+ usub8 r6, r4, r5 ; calculate difference
+ pld [r0, r1, lsl #1]
+ sel r7, r6, lr ; select bytes with positive difference
+ usub8 r9, r5, r4 ; calculate difference with reversed operands
+ pld [r2, r3, lsl #1]
+ sel r6, r9, lr ; select bytes with negative difference
+
+ ; calculate partial sums
+ usad8 r4, r7, lr ; calculate sum of positive differences
+ usad8 r5, r6, lr ; calculate sum of negative differences
+ orr r6, r6, r7 ; differences of all 4 pixels
+ ; calculate total sum
+ adds r8, r8, r4 ; add positive differences to sum
+ subs r8, r8, r5 ; subtract negative differences from sum
+
+ ; calculate sse
+ uxtb16 r5, r6 ; byte (two pixels) to halfwords
+ uxtb16 r10, r6, ror #8 ; another two pixels to halfwords
+ smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1)
+
+ ; 2nd 4 pixels
+ ldr r4, [r0, #4] ; load 4 src pixels
+ ldr r5, [r2, #4] ; load 4 ref pixels
+ smlad r11, r10, r10, r11 ; dual signed multiply, add and accumulate (2)
+
+ usub8 r6, r4, r5 ; calculate difference
+ sel r7, r6, lr ; select bytes with positive difference
+ usub8 r9, r5, r4 ; calculate difference with reversed operands
+ sel r6, r9, lr ; select bytes with negative difference
+
+ ; calculate partial sums
+ usad8 r4, r7, lr ; calculate sum of positive differences
+ usad8 r5, r6, lr ; calculate sum of negative differences
+ orr r6, r6, r7 ; differences of all 4 pixels
+
+ ; calculate total sum
+ add r8, r8, r4 ; add positive differences to sum
+ sub r8, r8, r5 ; subtract negative differences from sum
+
+ ; calculate sse
+ uxtb16 r5, r6 ; byte (two pixels) to halfwords
+ uxtb16 r10, r6, ror #8 ; another two pixels to halfwords
+ smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1)
+
+ ; 3rd 4 pixels
+ ldr r4, [r0, #8] ; load 4 src pixels
+ ldr r5, [r2, #8] ; load 4 ref pixels
+ smlad r11, r10, r10, r11 ; dual signed multiply, add and accumulate (2)
+
+ usub8 r6, r4, r5 ; calculate difference
+ sel r7, r6, lr ; select bytes with positive difference
+ usub8 r9, r5, r4 ; calculate difference with reversed operands
+ sel r6, r9, lr ; select bytes with negative difference
+
+ ; calculate partial sums
+ usad8 r4, r7, lr ; calculate sum of positive differences
+ usad8 r5, r6, lr ; calculate sum of negative differences
+ orr r6, r6, r7 ; differences of all 4 pixels
+
+ ; calculate total sum
+ add r8, r8, r4 ; add positive differences to sum
+ sub r8, r8, r5 ; subtract negative differences from sum
+
+ ; calculate sse
+ uxtb16 r5, r6 ; byte (two pixels) to halfwords
+ uxtb16 r10, r6, ror #8 ; another two pixels to halfwords
+ smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1)
+
+ ; 4th 4 pixels
+ ldr r4, [r0, #12] ; load 4 src pixels
+ ldr r5, [r2, #12] ; load 4 ref pixels
+ smlad r11, r10, r10, r11 ; dual signed multiply, add and accumulate (2)
+
+ usub8 r6, r4, r5 ; calculate difference
+ add r0, r0, r1 ; set src_ptr to next row
+ sel r7, r6, lr ; select bytes with positive difference
+ usub8 r9, r5, r4 ; calculate difference with reversed operands
+ add r2, r2, r3 ; set dst_ptr to next row
+ sel r6, r9, lr ; select bytes with negative difference
+
+ ; calculate partial sums
+ usad8 r4, r7, lr ; calculate sum of positive differences
+ usad8 r5, r6, lr ; calculate sum of negative differences
+ orr r6, r6, r7 ; differences of all 4 pixels
+
+ ; calculate total sum
+ add r8, r8, r4 ; add positive differences to sum
+ sub r8, r8, r5 ; subtract negative differences from sum
+
+ ; calculate sse
+ uxtb16 r5, r6 ; byte (two pixels) to halfwords
+ uxtb16 r10, r6, ror #8 ; another two pixels to halfwords
+ smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1)
+ smlad r11, r10, r10, r11 ; dual signed multiply, add and accumulate (2)
+
+
+ subs r12, r12, #1
+
+ bne loop
+
+ ; return stuff
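+ ; the returned value is sse - sum*sum/256, the sum of squared deviations
+ ; over the 256-pixel block (hence the shift by 8 below)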
+ ldr r6, [sp, #40] ; get address of sse
+ mul r0, r8, r8 ; sum * sum
+ str r11, [r6] ; store sse
+ sub r0, r11, r0, asr #8 ; return (sse - ((sum * sum) >> 8))
+
+ ldmfd sp!, {r4-r12, pc}
+
+ ENDP
+
+ END
+
--- /dev/null
+++ b/vp9/encoder/arm/armv6/vp8_variance8x8_armv6.asm
@@ -1,0 +1,101 @@
+;
+; Copyright (c) 2011 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+ EXPORT |vp9_variance8x8_armv6|
+
+ ARM
+
+ AREA ||.text||, CODE, READONLY, ALIGN=2
+
+; r0 unsigned char *src_ptr
+; r1 int source_stride
+; r2 unsigned char *ref_ptr
+; r3 int recon_stride
+; stack unsigned int *sse
+|vp9_variance8x8_armv6| PROC
+
+ push {r4-r10, lr}
+
+ pld [r0, r1, lsl #0]
+ pld [r2, r3, lsl #0]
+
+ mov r12, #8 ; set loop counter to 8 (=block height)
+ mov r4, #0 ; initialize sum = 0
+ mov r5, #0 ; initialize sse = 0
+
+loop
+ ; 1st 4 pixels
+ ldr r6, [r0, #0x0] ; load 4 src pixels
+ ldr r7, [r2, #0x0] ; load 4 ref pixels
+
+ mov lr, #0 ; constant zero
+
+ usub8 r8, r6, r7 ; calculate difference
+ pld [r0, r1, lsl #1]
+ sel r10, r8, lr ; select bytes with positive difference
+ usub8 r9, r7, r6 ; calculate difference with reversed operands
+ pld [r2, r3, lsl #1]
+ sel r8, r9, lr ; select bytes with negative difference
+
+ ; calculate partial sums
+ usad8 r6, r10, lr ; calculate sum of positive differences
+ usad8 r7, r8, lr ; calculate sum of negative differences
+ orr r8, r8, r10 ; differences of all 4 pixels
+ ; calculate total sum
+ add r4, r4, r6 ; add positive differences to sum
+ sub r4, r4, r7 ; subtract negative differences from sum
+
+ ; calculate sse
+ uxtb16 r7, r8 ; byte (two pixels) to halfwords
+ uxtb16 r10, r8, ror #8 ; another two pixels to halfwords
+ smlad r5, r7, r7, r5 ; dual signed multiply, add and accumulate (1)
+
+ ; 2nd 4 pixels
+ ldr r6, [r0, #0x4] ; load 4 src pixels
+ ldr r7, [r2, #0x4] ; load 4 ref pixels
+ smlad r5, r10, r10, r5 ; dual signed multiply, add and accumulate (2)
+
+ usub8 r8, r6, r7 ; calculate difference
+ add r0, r0, r1 ; set src_ptr to next row
+ sel r10, r8, lr ; select bytes with positive difference
+ usub8 r9, r7, r6 ; calculate difference with reversed operands
+ add r2, r2, r3 ; set dst_ptr to next row
+ sel r8, r9, lr ; select bytes with negative difference
+
+ ; calculate partial sums
+ usad8 r6, r10, lr ; calculate sum of positive differences
+ usad8 r7, r8, lr ; calculate sum of negative differences
+ orr r8, r8, r10 ; differences of all 4 pixels
+
+ ; calculate total sum
+ add r4, r4, r6 ; add positive differences to sum
+ sub r4, r4, r7 ; subtract negative differences from sum
+
+ ; calculate sse
+ uxtb16 r7, r8 ; byte (two pixels) to halfwords
+ uxtb16 r10, r8, ror #8 ; another two pixels to halfwords
+ smlad r5, r7, r7, r5 ; dual signed multiply, add and accumulate (1)
+ subs r12, r12, #1 ; next row
+ smlad r5, r10, r10, r5 ; dual signed multiply, add and accumulate (2)
+
+ bne loop
+
+ ; return stuff
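+ ; the returned value is sse - sum*sum/64, the sum of squared deviations
+ ; over the 64-pixel block (hence the shift by 6 below)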
+ ldr r8, [sp, #32] ; get address of sse
+ mul r1, r4, r4 ; sum * sum
+ str r5, [r8] ; store sse
+ sub r0, r5, r1, ASR #6 ; return (sse - ((sum * sum) >> 6))
+
+ pop {r4-r10, pc}
+
+ ENDP
+
+ END
--- /dev/null
+++ b/vp9/encoder/arm/armv6/vp8_variance_halfpixvar16x16_h_armv6.asm
@@ -1,0 +1,182 @@
+;
+; Copyright (c) 2011 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+ EXPORT |vp9_variance_halfpixvar16x16_h_armv6|
+
+ ARM
+ REQUIRE8
+ PRESERVE8
+
+ AREA ||.text||, CODE, READONLY, ALIGN=2
+
+; r0 unsigned char *src_ptr
+; r1 int source_stride
+; r2 unsigned char *ref_ptr
+; r3 int recon_stride
+; stack unsigned int *sse
+|vp9_variance_halfpixvar16x16_h_armv6| PROC
+
+ stmfd sp!, {r4-r12, lr}
+
+ pld [r0, r1, lsl #0]
+ pld [r2, r3, lsl #0]
+
+ mov r8, #0 ; initialize sum = 0
+ ldr r10, c80808080
+ mov r11, #0 ; initialize sse = 0
+ mov r12, #16 ; set loop counter to 16 (=block height)
+ mov lr, #0 ; constant zero
+loop
+ ; 1st 4 pixels
+ ldr r4, [r0, #0] ; load 4 src pixels
+ ldr r6, [r0, #1] ; load 4 src pixels with 1 byte offset
+ ldr r5, [r2, #0] ; load 4 ref pixels
+
+ ; bilinear interpolation
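+ ; rounded average: (a + b + 1) >> 1 == ((a - ~b) >> 1) ^ 0x80 per byte,
+ ; computed below with mvn/uhsub8 and an eor with 0x80808080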
+ mvn r6, r6
+ uhsub8 r4, r4, r6
+ eor r4, r4, r10
+
+ usub8 r6, r4, r5 ; calculate difference
+ pld [r0, r1, lsl #1]
+ sel r7, r6, lr ; select bytes with positive difference
+ usub8 r6, r5, r4 ; calculate difference with reversed operands
+ pld [r2, r3, lsl #1]
+ sel r6, r6, lr ; select bytes with negative difference
+
+ ; calculate partial sums
+ usad8 r4, r7, lr ; calculate sum of positive differences
+ usad8 r5, r6, lr ; calculate sum of negative differences
+ orr r6, r6, r7 ; differences of all 4 pixels
+ ; calculate total sum
+ adds r8, r8, r4 ; add positive differences to sum
+ subs r8, r8, r5 ; subtract negative differences from sum
+
+ ; calculate sse
+ uxtb16 r5, r6 ; byte (two pixels) to halfwords
+ uxtb16 r7, r6, ror #8 ; another two pixels to halfwords
+ smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1)
+
+ ; 2nd 4 pixels
+ ldr r4, [r0, #4] ; load 4 src pixels
+ ldr r6, [r0, #5] ; load 4 src pixels with 1 byte offset
+ ldr r5, [r2, #4] ; load 4 ref pixels
+
+ ; bilinear interpolation
+ mvn r6, r6
+ uhsub8 r4, r4, r6
+ eor r4, r4, r10
+
+ smlad r11, r7, r7, r11 ; dual signed multiply, add and accumulate (2)
+
+ usub8 r6, r4, r5 ; calculate difference
+ sel r7, r6, lr ; select bytes with positive difference
+ usub8 r6, r5, r4 ; calculate difference with reversed operands
+ sel r6, r6, lr ; select bytes with negative difference
+
+ ; calculate partial sums
+ usad8 r4, r7, lr ; calculate sum of positive differences
+ usad8 r5, r6, lr ; calculate sum of negative differences
+ orr r6, r6, r7 ; differences of all 4 pixels
+
+ ; calculate total sum
+ add r8, r8, r4 ; add positive differences to sum
+ sub r8, r8, r5 ; subtract negative differences from sum
+
+ ; calculate sse
+ uxtb16 r5, r6 ; byte (two pixels) to halfwords
+ uxtb16 r7, r6, ror #8 ; another two pixels to halfwords
+ smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1)
+
+ ; 3rd 4 pixels
+ ldr r4, [r0, #8] ; load 4 src pixels
+ ldr r6, [r0, #9] ; load 4 src pixels with 1 byte offset
+ ldr r5, [r2, #8] ; load 4 ref pixels
+
+ ; bilinear interpolation
+ mvn r6, r6
+ uhsub8 r4, r4, r6
+ eor r4, r4, r10
+
+ smlad r11, r7, r7, r11 ; dual signed multiply, add and accumulate (2)
+
+ usub8 r6, r4, r5 ; calculate difference
+ sel r7, r6, lr ; select bytes with positive difference
+ usub8 r6, r5, r4 ; calculate difference with reversed operands
+ sel r6, r6, lr ; select bytes with negative difference
+
+ ; calculate partial sums
+ usad8 r4, r7, lr ; calculate sum of positive differences
+ usad8 r5, r6, lr ; calculate sum of negative differences
+ orr r6, r6, r7 ; differences of all 4 pixels
+
+ ; calculate total sum
+ add r8, r8, r4 ; add positive differences to sum
+ sub r8, r8, r5 ; subtract negative differences from sum
+
+ ; calculate sse
+ uxtb16 r5, r6 ; byte (two pixels) to halfwords
+ uxtb16 r7, r6, ror #8 ; another two pixels to halfwords
+ smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1)
+
+ ; 4th 4 pixels
+ ldr r4, [r0, #12] ; load 4 src pixels
+ ldr r6, [r0, #13] ; load 4 src pixels with 1 byte offset
+ ldr r5, [r2, #12] ; load 4 ref pixels
+
+ ; bilinear interpolation
+ mvn r6, r6
+ uhsub8 r4, r4, r6
+ eor r4, r4, r10
+
+ smlad r11, r7, r7, r11 ; dual signed multiply, add and accumulate (2)
+
+ usub8 r6, r4, r5 ; calculate difference
+ add r0, r0, r1 ; set src_ptr to next row
+ sel r7, r6, lr ; select bytes with positive difference
+ usub8 r6, r5, r4 ; calculate difference with reversed operands
+ add r2, r2, r3 ; set dst_ptr to next row
+ sel r6, r6, lr ; select bytes with negative difference
+
+ ; calculate partial sums
+ usad8 r4, r7, lr ; calculate sum of positive differences
+ usad8 r5, r6, lr ; calculate sum of negative differences
+ orr r6, r6, r7 ; differences of all 4 pixels
+
+ ; calculate total sum
+ add r8, r8, r4 ; add positive differences to sum
+ sub r8, r8, r5 ; subtract negative differences from sum
+
+ ; calculate sse
+ uxtb16 r5, r6 ; byte (two pixels) to halfwords
+ uxtb16 r7, r6, ror #8 ; another two pixels to halfwords
+ smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1)
+ smlad r11, r7, r7, r11 ; dual signed multiply, add and accumulate (2)
+
+ subs r12, r12, #1
+
+ bne loop
+
+ ; return stuff
+ ldr r6, [sp, #40] ; get address of sse
+ mul r0, r8, r8 ; sum * sum
+ str r11, [r6] ; store sse
+ sub r0, r11, r0, asr #8 ; return (sse - ((sum * sum) >> 8))
+
+ ldmfd sp!, {r4-r12, pc}
+
+ ENDP
+
+c80808080
+ DCD 0x80808080
+
+ END
+
--- /dev/null
+++ b/vp9/encoder/arm/armv6/vp8_variance_halfpixvar16x16_hv_armv6.asm
@@ -1,0 +1,222 @@
+;
+; Copyright (c) 2011 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+ EXPORT |vp9_variance_halfpixvar16x16_hv_armv6|
+
+ ARM
+ REQUIRE8
+ PRESERVE8
+
+ AREA ||.text||, CODE, READONLY, ALIGN=2
+
+; r0 unsigned char *src_ptr
+; r1 int source_stride
+; r2 unsigned char *ref_ptr
+; r3 int recon_stride
+; stack unsigned int *sse
+|vp9_variance_halfpixvar16x16_hv_armv6| PROC
+
+ stmfd sp!, {r4-r12, lr}
+
+ pld [r0, r1, lsl #0]
+ pld [r2, r3, lsl #0]
+
+ mov r8, #0 ; initialize sum = 0
+ ldr r10, c80808080
+ mov r11, #0 ; initialize sse = 0
+ mov r12, #16 ; set loop counter to 16 (=block height)
+ mov lr, #0 ; constant zero
+loop
+ add r9, r0, r1 ; pointer to pixels on the next row
+ ; 1st 4 pixels
+ ldr r4, [r0, #0] ; load source pixels a, row N
+ ldr r6, [r0, #1] ; load source pixels b, row N
+ ldr r5, [r9, #0] ; load source pixels c, row N+1
+ ldr r7, [r9, #1] ; load source pixels d, row N+1
+
+ ; x = (a + b + 1) >> 1, interpolate pixels horizontally on row N
+ mvn r6, r6
+ uhsub8 r4, r4, r6
+ eor r4, r4, r10
+ ; y = (c + d + 1) >> 1, interpolate pixels horizontally on row N+1
+ mvn r7, r7
+ uhsub8 r5, r5, r7
+ eor r5, r5, r10
+ ; z = (x + y + 1) >> 1, interpolate half pixel values vertically
+ mvn r5, r5
+ uhsub8 r4, r4, r5
+ ldr r5, [r2, #0] ; load 4 ref pixels
+ eor r4, r4, r10
+
+ usub8 r6, r4, r5 ; calculate difference
+ pld [r0, r1, lsl #1]
+ sel r7, r6, lr ; select bytes with positive difference
+ usub8 r6, r5, r4 ; calculate difference with reversed operands
+ pld [r2, r3, lsl #1]
+ sel r6, r6, lr ; select bytes with negative difference
+
+ ; calculate partial sums
+ usad8 r4, r7, lr ; calculate sum of positive differences
+ usad8 r5, r6, lr ; calculate sum of negative differences
+ orr r6, r6, r7 ; differences of all 4 pixels
+ ; calculate total sum
+ adds r8, r8, r4 ; add positive differences to sum
+ subs r8, r8, r5 ; subtract negative differences from sum
+
+ ; calculate sse
+ uxtb16 r5, r6 ; byte (two pixels) to halfwords
+ uxtb16 r7, r6, ror #8 ; another two pixels to halfwords
+ smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1)
+
+ ; 2nd 4 pixels
+ ldr r4, [r0, #4] ; load source pixels a, row N
+ ldr r6, [r0, #5] ; load source pixels b, row N
+ ldr r5, [r9, #4] ; load source pixels c, row N+1
+
+ smlad r11, r7, r7, r11 ; dual signed multiply, add and accumulate (2)
+
+ ldr r7, [r9, #5] ; load source pixels d, row N+1
+
+ ; x = (a + b + 1) >> 1, interpolate pixels horizontally on row N
+ mvn r6, r6
+ uhsub8 r4, r4, r6
+ eor r4, r4, r10
+ ; y = (c + d + 1) >> 1, interpolate pixels horizontally on row N+1
+ mvn r7, r7
+ uhsub8 r5, r5, r7
+ eor r5, r5, r10
+ ; z = (x + y + 1) >> 1, interpolate half pixel values vertically
+ mvn r5, r5
+ uhsub8 r4, r4, r5
+ ldr r5, [r2, #4] ; load 4 ref pixels
+ eor r4, r4, r10
+
+ usub8 r6, r4, r5 ; calculate difference
+ sel r7, r6, lr ; select bytes with positive difference
+ usub8 r6, r5, r4 ; calculate difference with reversed operands
+ sel r6, r6, lr ; select bytes with negative difference
+
+ ; calculate partial sums
+ usad8 r4, r7, lr ; calculate sum of positive differences
+ usad8 r5, r6, lr ; calculate sum of negative differences
+ orr r6, r6, r7 ; differences of all 4 pixels
+
+ ; calculate total sum
+ add r8, r8, r4 ; add positive differences to sum
+ sub r8, r8, r5 ; subtract negative differences from sum
+
+ ; calculate sse
+ uxtb16 r5, r6 ; byte (two pixels) to halfwords
+ uxtb16 r7, r6, ror #8 ; another two pixels to halfwords
+ smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1)
+
+ ; 3rd 4 pixels
+ ldr r4, [r0, #8] ; load source pixels a, row N
+ ldr r6, [r0, #9] ; load source pixels b, row N
+ ldr r5, [r9, #8] ; load source pixels c, row N+1
+
+ smlad r11, r7, r7, r11 ; dual signed multiply, add and accumulate (2)
+
+ ldr r7, [r9, #9] ; load source pixels d, row N+1
+
+ ; x = (a + b + 1) >> 1, interpolate pixels horizontally on row N
+ mvn r6, r6
+ uhsub8 r4, r4, r6
+ eor r4, r4, r10
+ ; y = (c + d + 1) >> 1, interpolate pixels horizontally on row N+1
+ mvn r7, r7
+ uhsub8 r5, r5, r7
+ eor r5, r5, r10
+ ; z = (x + y + 1) >> 1, interpolate half pixel values vertically
+ mvn r5, r5
+ uhsub8 r4, r4, r5
+ ldr r5, [r2, #8] ; load 4 ref pixels
+ eor r4, r4, r10
+
+ usub8 r6, r4, r5 ; calculate difference
+ sel r7, r6, lr ; select bytes with positive difference
+ usub8 r6, r5, r4 ; calculate difference with reversed operands
+ sel r6, r6, lr ; select bytes with negative difference
+
+ ; calculate partial sums
+ usad8 r4, r7, lr ; calculate sum of positive differences
+ usad8 r5, r6, lr ; calculate sum of negative differences
+ orr r6, r6, r7 ; differences of all 4 pixels
+
+ ; calculate total sum
+ add r8, r8, r4 ; add positive differences to sum
+ sub r8, r8, r5 ; subtract negative differences from sum
+
+ ; calculate sse
+ uxtb16 r5, r6 ; byte (two pixels) to halfwords
+ uxtb16 r7, r6, ror #8 ; another two pixels to halfwords
+ smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1)
+
+ ; 4th 4 pixels
+ ldr r4, [r0, #12] ; load source pixels a, row N
+ ldr r6, [r0, #13] ; load source pixels b, row N
+ ldr r5, [r9, #12] ; load source pixels c, row N+1
+ smlad r11, r7, r7, r11 ; dual signed multiply, add and accumulate (2)
+ ldr r7, [r9, #13] ; load source pixels d, row N+1
+
+ ; x = (a + b + 1) >> 1, interpolate pixels horizontally on row N
+ mvn r6, r6
+ uhsub8 r4, r4, r6
+ eor r4, r4, r10
+ ; y = (c + d + 1) >> 1, interpolate pixels horizontally on row N+1
+ mvn r7, r7
+ uhsub8 r5, r5, r7
+ eor r5, r5, r10
+ ; z = (x + y + 1) >> 1, interpolate half pixel values vertically
+ mvn r5, r5
+ uhsub8 r4, r4, r5
+ ldr r5, [r2, #12] ; load 4 ref pixels
+ eor r4, r4, r10
+
+ usub8 r6, r4, r5 ; calculate difference
+ add r0, r0, r1 ; set src_ptr to next row
+ sel r7, r6, lr ; select bytes with positive difference
+ usub8 r6, r5, r4 ; calculate difference with reversed operands
+ add r2, r2, r3 ; set dst_ptr to next row
+ sel r6, r6, lr ; select bytes with negative difference
+
+ ; calculate partial sums
+ usad8 r4, r7, lr ; calculate sum of positive differences
+ usad8 r5, r6, lr ; calculate sum of negative differences
+ orr r6, r6, r7 ; differences of all 4 pixels
+
+ ; calculate total sum
+ add r8, r8, r4 ; add positive differences to sum
+ sub r8, r8, r5 ; subtract negative differences from sum
+
+ ; calculate sse
+ uxtb16 r5, r6 ; byte (two pixels) to halfwords
+ uxtb16 r7, r6, ror #8 ; another two pixels to halfwords
+ smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1)
+ subs r12, r12, #1
+ smlad r11, r7, r7, r11 ; dual signed multiply, add and accumulate (2)
+
+ bne loop
+
+ ; return stuff
+ ldr r6, [sp, #40] ; get address of sse
+ mul r0, r8, r8 ; sum * sum
+ str r11, [r6] ; store sse
+ sub r0, r11, r0, asr #8 ; return (sse - ((sum * sum) >> 8))
+
+ ldmfd sp!, {r4-r12, pc}
+
+ ENDP
+
+c80808080
+ DCD 0x80808080
+
+ END
--- /dev/null
+++ b/vp9/encoder/arm/armv6/vp8_variance_halfpixvar16x16_v_armv6.asm
@@ -1,0 +1,184 @@
+;
+; Copyright (c) 2011 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+ EXPORT |vp9_variance_halfpixvar16x16_v_armv6|
+
+ ARM
+ REQUIRE8
+ PRESERVE8
+
+ AREA ||.text||, CODE, READONLY, ALIGN=2
+
+; r0 unsigned char *src_ptr
+; r1 int source_stride
+; r2 unsigned char *ref_ptr
+; r3 int recon_stride
+; stack unsigned int *sse
+|vp9_variance_halfpixvar16x16_v_armv6| PROC
+
+ stmfd sp!, {r4-r12, lr}
+
+ pld [r0, r1, lsl #0]
+ pld [r2, r3, lsl #0]
+
+ mov r8, #0 ; initialize sum = 0
+ ldr r10, c80808080
+ mov r11, #0 ; initialize sse = 0
+ mov r12, #16 ; set loop counter to 16 (=block height)
+ mov lr, #0 ; constant zero
+loop
+ add r9, r0, r1 ; set src pointer to next row
+ ; 1st 4 pixels
+ ldr r4, [r0, #0] ; load 4 src pixels
+ ldr r6, [r9, #0] ; load 4 src pixels from next row
+ ldr r5, [r2, #0] ; load 4 ref pixels
+
+ ; bilinear interpolation
+ mvn r6, r6
+ uhsub8 r4, r4, r6
+ eor r4, r4, r10
+
+ usub8 r6, r4, r5 ; calculate difference
+ pld [r0, r1, lsl #1]
+ sel r7, r6, lr ; select bytes with positive difference
+ usub8 r6, r5, r4 ; calculate difference with reversed operands
+ pld [r2, r3, lsl #1]
+ sel r6, r6, lr ; select bytes with negative difference
+
+ ; calculate partial sums
+ usad8 r4, r7, lr ; calculate sum of positive differences
+ usad8 r5, r6, lr ; calculate sum of negative differences
+ orr r6, r6, r7 ; differences of all 4 pixels
+ ; calculate total sum
+ adds r8, r8, r4 ; add positive differences to sum
+ subs r8, r8, r5 ; subtract negative differences from sum
+
+ ; calculate sse
+ uxtb16 r5, r6 ; byte (two pixels) to halfwords
+ uxtb16 r7, r6, ror #8 ; another two pixels to halfwords
+ smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1)
+
+ ; 2nd 4 pixels
+ ldr r4, [r0, #4] ; load 4 src pixels
+ ldr r6, [r9, #4] ; load 4 src pixels from next row
+ ldr r5, [r2, #4] ; load 4 ref pixels
+
+ ; bilinear interpolation
+ mvn r6, r6
+ uhsub8 r4, r4, r6
+ eor r4, r4, r10
+
+ smlad r11, r7, r7, r11 ; dual signed multiply, add and accumulate (2)
+
+ usub8 r6, r4, r5 ; calculate difference
+ sel r7, r6, lr ; select bytes with positive difference
+ usub8 r6, r5, r4 ; calculate difference with reversed operands
+ sel r6, r6, lr ; select bytes with negative difference
+
+ ; calculate partial sums
+ usad8 r4, r7, lr ; calculate sum of positive differences
+ usad8 r5, r6, lr ; calculate sum of negative differences
+ orr r6, r6, r7 ; differences of all 4 pixels
+
+ ; calculate total sum
+ add r8, r8, r4 ; add positive differences to sum
+ sub r8, r8, r5 ; subtract negative differences from sum
+
+ ; calculate sse
+ uxtb16 r5, r6 ; byte (two pixels) to halfwords
+ uxtb16 r7, r6, ror #8 ; another two pixels to halfwords
+ smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1)
+
+ ; 3rd 4 pixels
+ ldr r4, [r0, #8] ; load 4 src pixels
+ ldr r6, [r9, #8] ; load 4 src pixels from next row
+ ldr r5, [r2, #8] ; load 4 ref pixels
+
+ ; bilinear interpolation
+ mvn r6, r6
+ uhsub8 r4, r4, r6
+ eor r4, r4, r10
+
+ smlad r11, r7, r7, r11 ; dual signed multiply, add and accumulate (2)
+
+ usub8 r6, r4, r5 ; calculate difference
+ sel r7, r6, lr ; select bytes with positive difference
+ usub8 r6, r5, r4 ; calculate difference with reversed operands
+ sel r6, r6, lr ; select bytes with negative difference
+
+ ; calculate partial sums
+ usad8 r4, r7, lr ; calculate sum of positive differences
+ usad8 r5, r6, lr ; calculate sum of negative differences
+ orr r6, r6, r7 ; differences of all 4 pixels
+
+ ; calculate total sum
+ add r8, r8, r4 ; add positive differences to sum
+ sub r8, r8, r5 ; subtract negative differences from sum
+
+ ; calculate sse
+ uxtb16 r5, r6 ; byte (two pixels) to halfwords
+ uxtb16 r7, r6, ror #8 ; another two pixels to halfwords
+ smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1)
+
+ ; 4th 4 pixels
+ ldr r4, [r0, #12] ; load 4 src pixels
+ ldr r6, [r9, #12] ; load 4 src pixels from next row
+ ldr r5, [r2, #12] ; load 4 ref pixels
+
+ ; bilinear interpolation
+ mvn r6, r6
+ uhsub8 r4, r4, r6
+ eor r4, r4, r10
+
+ smlad r11, r7, r7, r11 ; dual signed multiply, add and accumulate (2)
+
+ usub8 r6, r4, r5 ; calculate difference
+ add r0, r0, r1 ; set src_ptr to next row
+ sel r7, r6, lr ; select bytes with positive difference
+ usub8 r6, r5, r4 ; calculate difference with reversed operands
+ add r2, r2, r3 ; set dst_ptr to next row
+ sel r6, r6, lr ; select bytes with negative difference
+
+ ; calculate partial sums
+ usad8 r4, r7, lr ; calculate sum of positive differences
+ usad8 r5, r6, lr ; calculate sum of negative differences
+ orr r6, r6, r7 ; differences of all 4 pixels
+
+ ; calculate total sum
+ add r8, r8, r4 ; add positive differences to sum
+ sub r8, r8, r5 ; subtract negative differences from sum
+
+ ; calculate sse
+ uxtb16 r5, r6 ; byte (two pixels) to halfwords
+ uxtb16 r7, r6, ror #8 ; another two pixels to halfwords
+ smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1)
+ smlad r11, r7, r7, r11 ; dual signed multiply, add and accumulate (2)
+
+
+ subs r12, r12, #1
+
+ bne loop
+
+ ; return stuff
+ ldr r6, [sp, #40] ; get address of sse
+ mul r0, r8, r8 ; sum * sum
+ str r11, [r6] ; store sse
+ sub r0, r11, r0, asr #8 ; return (sse - ((sum * sum) >> 8))
+
+ ldmfd sp!, {r4-r12, pc}
+
+ ENDP
+
+c80808080
+ DCD 0x80808080
+
+ END
+
--- /dev/null
+++ b/vp9/encoder/arm/armv6/walsh_v6.asm
@@ -1,0 +1,212 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+ EXPORT |vp8_short_walsh4x4_armv6|
+
+ ARM
+ REQUIRE8
+ PRESERVE8
+
+ AREA |.text|, CODE, READONLY ; name this block of code
+
+;short vp8_short_walsh4x4_armv6(short *input, short *output, int pitch)
+; r0 short *input,
+; r1 short *output,
+; r2 int pitch
+|vp8_short_walsh4x4_armv6| PROC
+
+ stmdb sp!, {r4 - r11, lr}
+
+ ldrd r4, r5, [r0], r2
+ ldr lr, c00040004
+ ldrd r6, r7, [r0], r2
+
+ ; 0-3
+ qadd16 r3, r4, r5 ; [d1|a1] [1+3 | 0+2]
+ qsub16 r4, r4, r5 ; [c1|b1] [1-3 | 0-2]
+
+ ldrd r8, r9, [r0], r2
+ ; 4-7
+ qadd16 r5, r6, r7 ; [d1|a1] [5+7 | 4+6]
+ qsub16 r6, r6, r7 ; [c1|b1] [5-7 | 4-6]
+
+ ldrd r10, r11, [r0]
+ ; 8-11
+ qadd16 r7, r8, r9 ; [d1|a1] [9+11 | 8+10]
+ qsub16 r8, r8, r9 ; [c1|b1] [9-11 | 8-10]
+
+ ; 12-15
+ qadd16 r9, r10, r11 ; [d1|a1] [13+15 | 12+14]
+ qsub16 r10, r10, r11 ; [c1|b1] [13-15 | 12-14]
+
+
+ lsls r2, r3, #16
+ smuad r11, r3, lr ; A0 = a1<<2 + d1<<2
+ addne r11, r11, #1 ; A0 += (a1!=0)
+
+ lsls r2, r7, #16
+ smuad r12, r7, lr ; C0 = a1<<2 + d1<<2
+ addne r12, r12, #1 ; C0 += (a1!=0)
+
+ add r0, r11, r12 ; a1_0 = A0 + C0
+ sub r11, r11, r12 ; b1_0 = A0 - C0
+
+ lsls r2, r5, #16
+ smuad r12, r5, lr ; B0 = a1<<2 + d1<<2
+ addne r12, r12, #1 ; B0 += (a1!=0)
+
+ lsls r2, r9, #16
+ smuad r2, r9, lr ; D0 = a1<<2 + d1<<2
+ addne r2, r2, #1 ; D0 += (a1!=0)
+
+ add lr, r12, r2 ; d1_0 = B0 + D0
+ sub r12, r12, r2 ; c1_0 = B0 - D0
+
+ ; op[0,4,8,12]
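+ ; each output below is rounded as (x + 3 + (x < 0)) >> 3, implemented by the addmi/add/asr sequences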
+ adds r2, r0, lr ; a2 = a1_0 + d1_0
+ addmi r2, r2, #1 ; += a2 < 0
+ add r2, r2, #3 ; += 3
+ subs r0, r0, lr ; d2 = a1_0 - d1_0
+ mov r2, r2, asr #3 ; >> 3
+ strh r2, [r1] ; op[0]
+
+ addmi r0, r0, #1 ; += a2 < 0
+ add r0, r0, #3 ; += 3
+ ldr lr, c00040004
+ mov r0, r0, asr #3 ; >> 3
+ strh r0, [r1, #24] ; op[12]
+
+ adds r2, r11, r12 ; b2 = b1_0 + c1_0
+ addmi r2, r2, #1 ; += a2 < 0
+ add r2, r2, #3 ; += 3
+ subs r0, r11, r12 ; c2 = b1_0 - c1_0
+ mov r2, r2, asr #3 ; >> 3
+ strh r2, [r1, #8] ; op[4]
+
+ addmi r0, r0, #1 ; += a2 < 0
+ add r0, r0, #3 ; += 3
+ smusd r3, r3, lr ; A3 = a1<<2 - d1<<2
+ smusd r7, r7, lr ; C3 = a1<<2 - d1<<2
+ mov r0, r0, asr #3 ; >> 3
+ strh r0, [r1, #16] ; op[8]
+
+
+ ; op[3,7,11,15]
+ add r0, r3, r7 ; a1_3 = A3 + C3
+ sub r3, r3, r7 ; b1_3 = A3 - C3
+
+ smusd r5, r5, lr ; B3 = a1<<2 - d1<<2
+ smusd r9, r9, lr ; D3 = a1<<2 - d1<<2
+ add r7, r5, r9 ; d1_3 = B3 + D3
+ sub r5, r5, r9 ; c1_3 = B3 - D3
+
+ adds r2, r0, r7 ; a2 = a1_3 + d1_3
+ addmi r2, r2, #1 ; += a2 < 0
+ add r2, r2, #3 ; += 3
+ adds r9, r3, r5 ; b2 = b1_3 + c1_3
+ mov r2, r2, asr #3 ; >> 3
+ strh r2, [r1, #6] ; op[3]
+
+ addmi r9, r9, #1 ; += a2 < 0
+ add r9, r9, #3 ; += 3
+ subs r2, r3, r5 ; c2 = b1_3 - c1_3
+ mov r9, r9, asr #3 ; >> 3
+ strh r9, [r1, #14] ; op[7]
+
+ addmi r2, r2, #1 ; += a2 < 0
+ add r2, r2, #3 ; += 3
+ subs r9, r0, r7 ; d2 = a1_3 - d1_3
+ mov r2, r2, asr #3 ; >> 3
+ strh r2, [r1, #22] ; op[11]
+
+ addmi r9, r9, #1 ; += a2 < 0
+ add r9, r9, #3 ; += 3
+ smuad r3, r4, lr ; A1 = b1<<2 + c1<<2
+ smuad r5, r8, lr ; C1 = b1<<2 + c1<<2
+ mov r9, r9, asr #3 ; >> 3
+ strh r9, [r1, #30] ; op[15]
+
+ ; op[1,5,9,13]
+ add r0, r3, r5 ; a1_1 = A1 + C1
+ sub r3, r3, r5 ; b1_1 = A1 - C1
+
+ smuad r7, r6, lr ; B1 = b1<<2 + c1<<2
+ smuad r9, r10, lr ; D1 = b1<<2 + c1<<2
+ add r5, r7, r9 ; d1_1 = B1 + D1
+ sub r7, r7, r9 ; c1_1 = B1 - D1
+
+ adds r2, r0, r5 ; a2 = a1_1 + d1_1
+ addmi r2, r2, #1 ; += a2 < 0
+ add r2, r2, #3 ; += 3
+ adds r9, r3, r7 ; b2 = b1_1 + c1_1
+ mov r2, r2, asr #3 ; >> 3
+ strh r2, [r1, #2] ; op[1]
+
+ addmi r9, r9, #1 ; += a2 < 0
+ add r9, r9, #3 ; += 3
+ subs r2, r3, r7 ; c2 = b1_1 - c1_1
+ mov r9, r9, asr #3 ; >> 3
+ strh r9, [r1, #10] ; op[5]
+
+ addmi r2, r2, #1 ; += a2 < 0
+ add r2, r2, #3 ; += 3
+ subs r9, r0, r5 ; d2 = a1_1 - d1_1
+ mov r2, r2, asr #3 ; >> 3
+ strh r2, [r1, #18] ; op[9]
+
+ addmi r9, r9, #1 ; += a2 < 0
+ add r9, r9, #3 ; += 3
+ smusd r4, r4, lr ; A2 = b1<<2 - c1<<2
+ smusd r8, r8, lr ; C2 = b1<<2 - c1<<2
+ mov r9, r9, asr #3 ; >> 3
+ strh r9, [r1, #26] ; op[13]
+
+
+ ; op[2,6,10,14]
+ add r11, r4, r8 ; a1_2 = A2 + C2
+ sub r12, r4, r8 ; b1_2 = A2 - C2
+
+ smusd r6, r6, lr ; B2 = b1<<2 - c1<<2
+ smusd r10, r10, lr ; D2 = b1<<2 - c1<<2
+ add r4, r6, r10 ; d1_2 = B2 + D2
+ sub r8, r6, r10 ; c1_2 = B2 - D2
+
+ adds r2, r11, r4 ; a2 = a1_2 + d1_2
+ addmi r2, r2, #1 ; += a2 < 0
+ add r2, r2, #3 ; += 3
+ adds r9, r12, r8 ; b2 = b1_2 + c1_2
+ mov r2, r2, asr #3 ; >> 3
+ strh r2, [r1, #4] ; op[2]
+
+ addmi r9, r9, #1 ; += a2 < 0
+ add r9, r9, #3 ; += 3
+ subs r2, r12, r8 ; c2 = b1_2 - c1_2
+ mov r9, r9, asr #3 ; >> 3
+ strh r9, [r1, #12] ; op[6]
+
+ addmi r2, r2, #1 ; += a2 < 0
+ add r2, r2, #3 ; += 3
+ subs r9, r11, r4 ; d2 = a1_2 - d1_2
+ mov r2, r2, asr #3 ; >> 3
+ strh r2, [r1, #20] ; op[10]
+
+ addmi r9, r9, #1 ; += a2 < 0
+ add r9, r9, #3 ; += 3
+ mov r9, r9, asr #3 ; >> 3
+ strh r9, [r1, #28] ; op[14]
+
+
+ ldmia sp!, {r4 - r11, pc}
+ ENDP ; |vp8_short_walsh4x4_armv6|
+
+c00040004
+ DCD 0x00040004
+
+ END
--- /dev/null
+++ b/vp9/encoder/arm/boolhuff_arm.c
@@ -1,0 +1,33 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "vp9/encoder/boolhuff.h"
+#include "vp9/common/blockd.h"
+
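+/* Bit-cost table indexed by probability p (out of 256): each entry is
+ * approximately 256 * log2(256 / p), i.e. the cost in 1/256th-bit units
+ * of coding a symbol whose probability is p/256. */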
+const unsigned int vp9_prob_cost[256] = {
+ 2047, 2047, 1791, 1641, 1535, 1452, 1385, 1328, 1279, 1235, 1196, 1161, 1129, 1099, 1072, 1046,
+ 1023, 1000, 979, 959, 940, 922, 905, 889, 873, 858, 843, 829, 816, 803, 790, 778,
+ 767, 755, 744, 733, 723, 713, 703, 693, 684, 675, 666, 657, 649, 641, 633, 625,
+ 617, 609, 602, 594, 587, 580, 573, 567, 560, 553, 547, 541, 534, 528, 522, 516,
+ 511, 505, 499, 494, 488, 483, 477, 472, 467, 462, 457, 452, 447, 442, 437, 433,
+ 428, 424, 419, 415, 410, 406, 401, 397, 393, 389, 385, 381, 377, 373, 369, 365,
+ 361, 357, 353, 349, 346, 342, 338, 335, 331, 328, 324, 321, 317, 314, 311, 307,
+ 304, 301, 297, 294, 291, 288, 285, 281, 278, 275, 272, 269, 266, 263, 260, 257,
+ 255, 252, 249, 246, 243, 240, 238, 235, 232, 229, 227, 224, 221, 219, 216, 214,
+ 211, 208, 206, 203, 201, 198, 196, 194, 191, 189, 186, 184, 181, 179, 177, 174,
+ 172, 170, 168, 165, 163, 161, 159, 156, 154, 152, 150, 148, 145, 143, 141, 139,
+ 137, 135, 133, 131, 129, 127, 125, 123, 121, 119, 117, 115, 113, 111, 109, 107,
+ 105, 103, 101, 99, 97, 95, 93, 92, 90, 88, 86, 84, 82, 81, 79, 77,
+ 75, 73, 72, 70, 68, 66, 65, 63, 61, 60, 58, 56, 55, 53, 51, 50,
+ 48, 46, 45, 43, 41, 40, 38, 37, 35, 33, 32, 30, 29, 27, 25, 24,
+ 22, 21, 19, 18, 16, 15, 13, 12, 10, 9, 7, 6, 4, 3, 1, 1
+};
+
--- /dev/null
+++ b/vp9/encoder/arm/dct_arm.c
@@ -1,0 +1,21 @@
+/*
+ * Copyright (c) 2011 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "vpx_config.h"
+#include "./vpx_rtcd.h"
+
+#if HAVE_ARMV6
+
+void vp9_short_fdct8x4_armv6(short *input, short *output, int pitch) {
+ vp9_short_fdct4x4_armv6(input, output, pitch);
+ vp9_short_fdct4x4_armv6(input + 4, output + 16, pitch);
+}
+
+#endif /* HAVE_ARMV6 */
--- /dev/null
+++ b/vp9/encoder/arm/dct_arm.h
@@ -1,0 +1,65 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef DCT_ARM_H
+#define DCT_ARM_H
+
+#if HAVE_ARMV6
+extern prototype_fdct(vp9_short_walsh4x4_armv6);
+extern prototype_fdct(vp9_short_fdct4x4_armv6);
+extern prototype_fdct(vp9_short_fdct8x4_armv6);
+
+#if !CONFIG_RUNTIME_CPU_DETECT
+#undef vp8_fdct_walsh_short4x4
+#define vp8_fdct_walsh_short4x4 vp9_short_walsh4x4_armv6
+
+#undef vp8_fdct_short4x4
+#define vp8_fdct_short4x4 vp9_short_fdct4x4_armv6
+
+#undef vp8_fdct_short8x4
+#define vp8_fdct_short8x4 vp9_short_fdct8x4_armv6
+
+#undef vp8_fdct_fast4x4
+#define vp8_fdct_fast4x4 vp9_short_fdct4x4_armv6
+
+#undef vp8_fdct_fast8x4
+#define vp8_fdct_fast8x4 vp9_short_fdct8x4_armv6
+#endif
+
+#endif /* HAVE_ARMV6 */
+
+#if HAVE_ARMV7
+extern prototype_fdct(vp9_short_fdct4x4_neon);
+extern prototype_fdct(vp9_short_fdct8x4_neon);
+extern prototype_fdct(vp8_fast_fdct4x4_neon);
+extern prototype_fdct(vp8_fast_fdct8x4_neon);
+extern prototype_fdct(vp9_short_walsh4x4_neon);
+
+#if !CONFIG_RUNTIME_CPU_DETECT
+#undef vp8_fdct_short4x4
+#define vp8_fdct_short4x4 vp9_short_fdct4x4_neon
+
+#undef vp8_fdct_short8x4
+#define vp8_fdct_short8x4 vp9_short_fdct8x4_neon
+
+#undef vp8_fdct_fast4x4
+#define vp8_fdct_fast4x4 vp9_short_fdct4x4_neon
+
+#undef vp8_fdct_fast8x4
+#define vp8_fdct_fast8x4 vp9_short_fdct8x4_neon
+
+#undef vp8_fdct_walsh_short4x4
+#define vp8_fdct_walsh_short4x4 vp9_short_walsh4x4_neon
+#endif
+
+#endif
+
+#endif
--- /dev/null
+++ b/vp9/encoder/arm/encodemb_arm.h
@@ -1,0 +1,64 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef ENCODEMB_ARM_H
+#define ENCODEMB_ARM_H
+
+#if HAVE_ARMV6
+extern prototype_subb(vp9_subtract_b_armv6);
+extern prototype_submby(vp9_subtract_mby_armv6);
+extern prototype_submbuv(vp9_subtract_mbuv_armv6);
+
+#if !CONFIG_RUNTIME_CPU_DETECT
+#undef vp8_encodemb_subb
+#define vp8_encodemb_subb vp9_subtract_b_armv6
+
+#undef vp8_encodemb_submby
+#define vp8_encodemb_submby vp9_subtract_mby_armv6
+
+#undef vp8_encodemb_submbuv
+#define vp8_encodemb_submbuv vp9_subtract_mbuv_armv6
+#endif
+
+#endif /* HAVE_ARMV6 */
+
+#if HAVE_ARMV7
+// extern prototype_berr(vp9_block_error_c);
+// extern prototype_mberr(vp9_mbblock_error_c);
+// extern prototype_mbuverr(vp9_mbuverror_c);
+
+extern prototype_subb(vp9_subtract_b_neon);
+extern prototype_submby(vp9_subtract_mby_neon);
+extern prototype_submbuv(vp9_subtract_mbuv_neon);
+
+// #undef vp8_encodemb_berr
+// #define vp8_encodemb_berr vp9_block_error_c
+
+// #undef vp8_encodemb_mberr
+// #define vp8_encodemb_mberr vp9_mbblock_error_c
+
+// #undef vp8_encodemb_mbuverr
+// #define vp8_encodemb_mbuverr vp9_mbuverror_c
+
+#if !CONFIG_RUNTIME_CPU_DETECT
+#undef vp8_encodemb_subb
+#define vp8_encodemb_subb vp9_subtract_b_neon
+
+#undef vp8_encodemb_submby
+#define vp8_encodemb_submby vp9_subtract_mby_neon
+
+#undef vp8_encodemb_submbuv
+#define vp8_encodemb_submbuv vp9_subtract_mbuv_neon
+#endif
+
+#endif
+
+#endif
--- /dev/null
+++ b/vp9/encoder/arm/neon/fastquantizeb_neon.asm
@@ -1,0 +1,261 @@
+;
+; Copyright (c) 2011 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+ EXPORT |vp8_fast_quantize_b_neon|
+ EXPORT |vp8_fast_quantize_b_pair_neon|
+
+ INCLUDE asm_enc_offsets.asm
+
+ ARM
+ REQUIRE8
+ PRESERVE8
+
+ AREA ||.text||, CODE, READONLY, ALIGN=4
+
+;vp8_fast_quantize_b_pair_neon(BLOCK *b1, BLOCK *b2, BLOCKD *d1, BLOCKD *d2);
+|vp8_fast_quantize_b_pair_neon| PROC
+
+ stmfd sp!, {r4-r9}
+ vstmdb sp!, {q4-q7}
+
+ ldr r4, [r0, #vp8_block_coeff]
+ ldr r5, [r0, #vp8_block_quant_fast]
+ ldr r6, [r0, #vp8_block_round]
+
+ vld1.16 {q0, q1}, [r4@128] ; load z
+
+ ldr r7, [r2, #vp8_blockd_qcoeff]
+
+ vabs.s16 q4, q0 ; calculate x = abs(z)
+ vabs.s16 q5, q1
+
+ ;right shift 15 to get sign, all 0 if it is positive, all 1 if it is negative
+ vshr.s16 q2, q0, #15 ; sz
+ vshr.s16 q3, q1, #15
+
+ vld1.s16 {q6, q7}, [r6@128] ; load round_ptr [0-15]
+ vld1.s16 {q8, q9}, [r5@128] ; load quant_ptr [0-15]
+
+ ldr r4, [r1, #vp8_block_coeff]
+
+ vadd.s16 q4, q6 ; x + Round
+ vadd.s16 q5, q7
+
+ vld1.16 {q0, q1}, [r4@128] ; load z2
+
+ vqdmulh.s16 q4, q8 ; y = ((Round+abs(z)) * Quant) >> 16
+ vqdmulh.s16 q5, q9
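+ ; (vqdmulh computes (2*a*b) >> 16, so the extra >> 1 further down restores (a*b) >> 16)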
+
+ vabs.s16 q10, q0 ; calculate x2 = abs(z_2)
+ vabs.s16 q11, q1
+ vshr.s16 q12, q0, #15 ; sz2
+ vshr.s16 q13, q1, #15
+
+ ;modify data to have its original sign
+ veor.s16 q4, q2 ; y^sz
+ veor.s16 q5, q3
+
+ vadd.s16 q10, q6 ; x2 + Round
+ vadd.s16 q11, q7
+
+ ldr r8, [r2, #vp8_blockd_dequant]
+
+ vqdmulh.s16 q10, q8 ; y2 = ((Round+abs(z)) * Quant) >> 16
+ vqdmulh.s16 q11, q9
+
+ vshr.s16 q4, #1 ; right shift 1 after vqdmulh
+ vshr.s16 q5, #1
+
+ vld1.s16 {q6, q7}, [r8@128] ;load dequant_ptr[i]
+
+ vsub.s16 q4, q2 ; x1=(y^sz)-sz = (y^sz)-(-1) (2's complement)
+ vsub.s16 q5, q3
+
+ vshr.s16 q10, #1 ; right shift 1 after vqdmulh
+ vshr.s16 q11, #1
+
+ ldr r9, [r2, #vp8_blockd_dqcoeff]
+
+ veor.s16 q10, q12 ; y2^sz2
+ veor.s16 q11, q13
+
+ vst1.s16 {q4, q5}, [r7] ; store: qcoeff = x1
+
+
+ vsub.s16 q10, q12 ; x2=(y^sz)-sz = (y^sz)-(-1) (2's complement)
+ vsub.s16 q11, q13
+
+ ldr r6, [r3, #vp8_blockd_qcoeff]
+
+ vmul.s16 q2, q6, q4 ; x * Dequant
+ vmul.s16 q3, q7, q5
+
+ ldr r0, _inv_zig_zag_ ; load ptr of inverse zigzag table
+
+ vceq.s16 q8, q8 ; set q8 to all 1
+
+ vst1.s16 {q10, q11}, [r6] ; store: qcoeff = x2
+
+ vmul.s16 q12, q6, q10 ; x2 * Dequant
+ vmul.s16 q13, q7, q11
+
+ vld1.16 {q6, q7}, [r0@128] ; load inverse scan order
+
+ vtst.16 q14, q4, q8 ; now find eob
+ vtst.16 q15, q5, q8 ; non-zero element is set to all 1
+
+ vst1.s16 {q2, q3}, [r9] ; store dqcoeff = x * Dequant
+
+ ldr r7, [r3, #vp8_blockd_dqcoeff]
+
+ vand q0, q6, q14 ; get all valid numbers from scan array
+ vand q1, q7, q15
+
+ vst1.s16 {q12, q13}, [r7] ; store dqcoeff = x * Dequant
+
+ vtst.16 q2, q10, q8 ; now find eob
+ vtst.16 q3, q11, q8 ; non-zero element is set to all 1
+
+ vmax.u16 q0, q0, q1 ; find maximum value in q0, q1
+
+ vand q10, q6, q2 ; get all valid numbers from scan array
+ vand q11, q7, q3
+ vmax.u16 q10, q10, q11 ; find maximum value in q10, q11
+
+ vmax.u16 d0, d0, d1
+ vmax.u16 d20, d20, d21
+ vmovl.u16 q0, d0
+ vmovl.u16 q10, d20
+
+
+ vmax.u32 d0, d0, d1
+ vmax.u32 d20, d20, d21
+ vpmax.u32 d0, d0, d0
+ vpmax.u32 d20, d20, d20
+
+ add r4, r2, #vp8_blockd_eob
+ add r5, r3, #vp8_blockd_eob
+
+ vst1.32 {d0[0]}, [r4@32]
+ vst1.32 {d20[0]}, [r5@32]
+
+ vldmia sp!, {q4-q7}
+ ldmfd sp!, {r4-r9}
+ bx lr
+
+ ENDP
+
+;void vp8_fast_quantize_b_c(BLOCK *b, BLOCKD *d)
+|vp8_fast_quantize_b_neon| PROC
+
+ stmfd sp!, {r4-r7}
+
+ ldr r3, [r0, #vp8_block_coeff]
+ ldr r4, [r0, #vp8_block_quant_fast]
+ ldr r5, [r0, #vp8_block_round]
+
+ vld1.16 {q0, q1}, [r3@128] ; load z
+ vorr.s16 q14, q0, q1 ; check if all zero (step 1)
+ ldr r6, [r1, #vp8_blockd_qcoeff]
+ ldr r7, [r1, #vp8_blockd_dqcoeff]
+ vorr.s16 d28, d28, d29 ; check if all zero (step 2)
+
+ vabs.s16 q12, q0 ; calculate x = abs(z)
+ vabs.s16 q13, q1
+
+ ;right shift 15 to get sign, all 0 if it is positive, all 1 if it is negative
+ vshr.s16 q2, q0, #15 ; sz
+ vmov r2, r3, d28 ; check if all zero (step 3)
+ vshr.s16 q3, q1, #15
+
+ vld1.s16 {q14, q15}, [r5@128]; load round_ptr [0-15]
+ vld1.s16 {q8, q9}, [r4@128] ; load quant_ptr [0-15]
+
+ vadd.s16 q12, q14 ; x + Round
+ vadd.s16 q13, q15
+
+ ldr r0, _inv_zig_zag_ ; load ptr of inverse zigzag table
+
+ vqdmulh.s16 q12, q8 ; y = ((Round+abs(z)) * Quant) >> 16
+ vqdmulh.s16 q13, q9
+
+ vld1.16 {q10, q11}, [r0@128]; load inverse scan order
+
+ vceq.s16 q8, q8 ; set q8 to all 1
+
+ ldr r4, [r1, #vp8_blockd_dequant]
+
+ vshr.s16 q12, #1 ; right shift 1 after vqdmulh
+ vshr.s16 q13, #1
+
+ orr r2, r2, r3 ; check if all zero (step 4)
+ cmp r2, #0 ; check if all zero (step 5)
+ beq zero_output ; check if all zero (step 6)
+
+ ;modify data to have its original sign
+ veor.s16 q12, q2 ; y^sz
+ veor.s16 q13, q3
+
+ vsub.s16 q12, q2 ; x1=(y^sz)-sz = (y^sz)-(-1) (2's complement)
+ vsub.s16 q13, q3
+
+ vld1.s16 {q2, q3}, [r4@128] ; load dequant_ptr[i]
+
+ vtst.16 q14, q12, q8 ; now find eob
+ vtst.16 q15, q13, q8 ; non-zero element is set to all 1
+
+ vst1.s16 {q12, q13}, [r6@128]; store: qcoeff = x1
+
+ vand q10, q10, q14 ; get all valid numbers from scan array
+ vand q11, q11, q15
+
+
+ vmax.u16 q0, q10, q11 ; find maximum value in q0, q1
+ vmax.u16 d0, d0, d1
+ vmovl.u16 q0, d0
+
+ vmul.s16 q2, q12 ; x * Dequant
+ vmul.s16 q3, q13
+
+ vmax.u32 d0, d0, d1
+ vpmax.u32 d0, d0, d0
+
+ vst1.s16 {q2, q3}, [r7@128] ; store dqcoeff = x * Dequant
+
+ add r4, r1, #vp8_blockd_eob
+ vst1.32 {d0[0]}, [r4@32]
+
+ ldmfd sp!, {r4-r7}
+ bx lr
+
+zero_output
+ str r2, [r1, #vp8_blockd_eob]
+ vst1.s16 {q0, q1}, [r6@128] ; qcoeff = 0
+ vst1.s16 {q0, q1}, [r7@128] ; dqcoeff = 0
+
+ ldmfd sp!, {r4-r7}
+ bx lr
+
+ ENDP
+
+; default inverse zigzag table is defined in vp9/common/entropy.c
+_inv_zig_zag_
+ DCD inv_zig_zag
+
+ ALIGN 16 ; enable use of @128 bit aligned loads
+inv_zig_zag
+ DCW 0x0001, 0x0002, 0x0006, 0x0007
+ DCW 0x0003, 0x0005, 0x0008, 0x000d
+ DCW 0x0004, 0x0009, 0x000c, 0x000e
+ DCW 0x000a, 0x000b, 0x000f, 0x0010
+
+ END
+
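For reference, a plain-C sketch of the quantization both routines above implement, one
16-coefficient block at a time. The function and parameter names are illustrative (not the
encoder's BLOCK/BLOCKD layout); inv_zig_zag holds the 1-based scan positions from the table above.

static void fast_quantize_b_sketch(const short *z, const short *round,
                                   const short *quant, const short *dequant,
                                   const short *inv_zig_zag,
                                   short *qcoeff, short *dqcoeff, int *eob) {
  int i, last = 0;
  for (i = 0; i < 16; i++) {
    int sz = z[i] >> 15;                        /* 0 if positive, -1 if negative */
    int x = z[i] < 0 ? -z[i] : z[i];            /* abs(z) */
    int y = ((x + round[i]) * quant[i]) >> 16;  /* NEON: vqdmulh then vshr #1 */
    x = (y ^ sz) - sz;                          /* restore the original sign */
    qcoeff[i] = (short)x;
    dqcoeff[i] = (short)(x * dequant[i]);
    if (x && inv_zig_zag[i] > last)
      last = inv_zig_zag[i];                    /* eob = highest nonzero scan position */
  }
  *eob = last;
}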
--- /dev/null
+++ b/vp9/encoder/arm/neon/picklpf_arm.c
@@ -1,0 +1,49 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "vp9/common/onyxc_int.h"
+#include "vp9/encoder/onyx_int.h"
+#include "vp9/encoder/quantize.h"
+#include "vpx_mem/vpx_mem.h"
+#include "vpx_scale/yv12extend.h"
+#include "vpx_scale/vpxscale.h"
+#include "vp9/common/alloccommon.h"
+
+extern void vp8_memcpy_neon(unsigned char *dst_ptr, unsigned char *src_ptr, int sz);
+
+
+void
+vpxyv12_copy_partial_frame_neon(YV12_BUFFER_CONFIG *src_ybc, YV12_BUFFER_CONFIG *dst_ybc, int Fraction) {
+ unsigned char *src_y, *dst_y;
+ int yheight;
+ int ystride;
+ int border;
+ int yoffset;
+ int linestocopy;
+
+ border = src_ybc->border;
+ yheight = src_ybc->y_height;
+ ystride = src_ybc->y_stride;
+
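+  /* linestocopy: roughly yheight >> Fraction, rounded down to a multiple of 16
+   * (minimum 16). The copy below starts 8 rows above the 16-aligned vertical
+   * midpoint of the frame and covers linestocopy + 16 rows. */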
+ linestocopy = (yheight >> (Fraction + 4));
+
+ if (linestocopy < 1)
+ linestocopy = 1;
+
+ linestocopy <<= 4;
+
+ yoffset = ystride * ((yheight >> 5) * 16 - 8);
+ src_y = src_ybc->y_buffer + yoffset;
+ dst_y = dst_ybc->y_buffer + yoffset;
+
+ // vpx_memcpy (dst_y, src_y, ystride * (linestocopy +16));
+ vp8_memcpy_neon((unsigned char *)dst_y, (unsigned char *)src_y, (int)(ystride * (linestocopy + 16)));
+}
--- /dev/null
+++ b/vp9/encoder/arm/neon/sad16_neon.asm
@@ -1,0 +1,207 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+ EXPORT |vp8_sad16x16_neon|
+ EXPORT |vp8_sad16x8_neon|
+
+ ARM
+ REQUIRE8
+ PRESERVE8
+
+ AREA ||.text||, CODE, READONLY, ALIGN=2
+
+; r0 unsigned char *src_ptr
+; r1 int src_stride
+; r2 unsigned char *ref_ptr
+; r3 int ref_stride
+|vp8_sad16x16_neon| PROC
+;;
+ vld1.8 {q0}, [r0], r1
+ vld1.8 {q4}, [r2], r3
+
+ vld1.8 {q1}, [r0], r1
+ vld1.8 {q5}, [r2], r3
+
+ vabdl.u8 q12, d0, d8
+ vabdl.u8 q13, d1, d9
+
+ vld1.8 {q2}, [r0], r1
+ vld1.8 {q6}, [r2], r3
+
+ vabal.u8 q12, d2, d10
+ vabal.u8 q13, d3, d11
+
+ vld1.8 {q3}, [r0], r1
+ vld1.8 {q7}, [r2], r3
+
+ vabal.u8 q12, d4, d12
+ vabal.u8 q13, d5, d13
+
+;;
+ vld1.8 {q0}, [r0], r1
+ vld1.8 {q4}, [r2], r3
+
+ vabal.u8 q12, d6, d14
+ vabal.u8 q13, d7, d15
+
+ vld1.8 {q1}, [r0], r1
+ vld1.8 {q5}, [r2], r3
+
+ vabal.u8 q12, d0, d8
+ vabal.u8 q13, d1, d9
+
+ vld1.8 {q2}, [r0], r1
+ vld1.8 {q6}, [r2], r3
+
+ vabal.u8 q12, d2, d10
+ vabal.u8 q13, d3, d11
+
+ vld1.8 {q3}, [r0], r1
+ vld1.8 {q7}, [r2], r3
+
+ vabal.u8 q12, d4, d12
+ vabal.u8 q13, d5, d13
+
+;;
+ vld1.8 {q0}, [r0], r1
+ vld1.8 {q4}, [r2], r3
+
+ vabal.u8 q12, d6, d14
+ vabal.u8 q13, d7, d15
+
+ vld1.8 {q1}, [r0], r1
+ vld1.8 {q5}, [r2], r3
+
+ vabal.u8 q12, d0, d8
+ vabal.u8 q13, d1, d9
+
+ vld1.8 {q2}, [r0], r1
+ vld1.8 {q6}, [r2], r3
+
+ vabal.u8 q12, d2, d10
+ vabal.u8 q13, d3, d11
+
+ vld1.8 {q3}, [r0], r1
+ vld1.8 {q7}, [r2], r3
+
+ vabal.u8 q12, d4, d12
+ vabal.u8 q13, d5, d13
+
+;;
+ vld1.8 {q0}, [r0], r1
+ vld1.8 {q4}, [r2], r3
+
+ vabal.u8 q12, d6, d14
+ vabal.u8 q13, d7, d15
+
+ vld1.8 {q1}, [r0], r1
+ vld1.8 {q5}, [r2], r3
+
+ vabal.u8 q12, d0, d8
+ vabal.u8 q13, d1, d9
+
+ vld1.8 {q2}, [r0], r1
+ vld1.8 {q6}, [r2], r3
+
+ vabal.u8 q12, d2, d10
+ vabal.u8 q13, d3, d11
+
+ vld1.8 {q3}, [r0]
+ vld1.8 {q7}, [r2]
+
+ vabal.u8 q12, d4, d12
+ vabal.u8 q13, d5, d13
+
+ vabal.u8 q12, d6, d14
+ vabal.u8 q13, d7, d15
+
+ vadd.u16 q0, q12, q13
+
+ vpaddl.u16 q1, q0
+ vpaddl.u32 q0, q1
+
+ vadd.u32 d0, d0, d1
+
+ vmov.32 r0, d0[0]
+
+ bx lr
+
+ ENDP
+
+;==============================
+;unsigned int vp8_sad16x8_c(
+; unsigned char *src_ptr,
+; int src_stride,
+; unsigned char *ref_ptr,
+; int ref_stride)
+|vp8_sad16x8_neon| PROC
+ vld1.8 {q0}, [r0], r1
+ vld1.8 {q4}, [r2], r3
+
+ vld1.8 {q1}, [r0], r1
+ vld1.8 {q5}, [r2], r3
+
+ vabdl.u8 q12, d0, d8
+ vabdl.u8 q13, d1, d9
+
+ vld1.8 {q2}, [r0], r1
+ vld1.8 {q6}, [r2], r3
+
+ vabal.u8 q12, d2, d10
+ vabal.u8 q13, d3, d11
+
+ vld1.8 {q3}, [r0], r1
+ vld1.8 {q7}, [r2], r3
+
+ vabal.u8 q12, d4, d12
+ vabal.u8 q13, d5, d13
+
+ vld1.8 {q0}, [r0], r1
+ vld1.8 {q4}, [r2], r3
+
+ vabal.u8 q12, d6, d14
+ vabal.u8 q13, d7, d15
+
+ vld1.8 {q1}, [r0], r1
+ vld1.8 {q5}, [r2], r3
+
+ vabal.u8 q12, d0, d8
+ vabal.u8 q13, d1, d9
+
+ vld1.8 {q2}, [r0], r1
+ vld1.8 {q6}, [r2], r3
+
+ vabal.u8 q12, d2, d10
+ vabal.u8 q13, d3, d11
+
+ vld1.8 {q3}, [r0], r1
+ vld1.8 {q7}, [r2], r3
+
+ vabal.u8 q12, d4, d12
+ vabal.u8 q13, d5, d13
+
+ vabal.u8 q12, d6, d14
+ vabal.u8 q13, d7, d15
+
+ vadd.u16 q0, q12, q13
+
+ vpaddl.u16 q1, q0
+ vpaddl.u32 q0, q1
+
+ vadd.u32 d0, d0, d1
+
+ vmov.32 r0, d0[0]
+
+ bx lr
+
+ ENDP
+
+ END
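A plain-C reference for the two SAD kernels above (16x16 and 16x8 differ only in the row
count); the name and parameters are illustrative.

static unsigned int sad_mxn_sketch(const unsigned char *src, int src_stride,
                                   const unsigned char *ref, int ref_stride,
                                   int width, int height) {
  unsigned int sad = 0;
  int r, c;
  for (r = 0; r < height; r++) {
    for (c = 0; c < width; c++) {
      int diff = src[c] - ref[c];
      sad += (unsigned int)(diff < 0 ? -diff : diff);  /* NEON: vabdl/vabal accumulate */
    }
    src += src_stride;
    ref += ref_stride;
  }
  return sad;
}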
--- /dev/null
+++ b/vp9/encoder/arm/neon/sad8_neon.asm
@@ -1,0 +1,209 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+ EXPORT |vp8_sad8x8_neon|
+ EXPORT |vp8_sad8x16_neon|
+ EXPORT |vp8_sad4x4_neon|
+
+ ARM
+ REQUIRE8
+ PRESERVE8
+
+ AREA ||.text||, CODE, READONLY, ALIGN=2
+; unsigned int vp8_sad8x8_c(
+; unsigned char *src_ptr,
+; int src_stride,
+; unsigned char *ref_ptr,
+; int ref_stride)
+
+|vp8_sad8x8_neon| PROC
+ vld1.8 {d0}, [r0], r1
+ vld1.8 {d8}, [r2], r3
+
+ vld1.8 {d2}, [r0], r1
+ vld1.8 {d10}, [r2], r3
+
+ vabdl.u8 q12, d0, d8
+
+ vld1.8 {d4}, [r0], r1
+ vld1.8 {d12}, [r2], r3
+
+ vabal.u8 q12, d2, d10
+
+ vld1.8 {d6}, [r0], r1
+ vld1.8 {d14}, [r2], r3
+
+ vabal.u8 q12, d4, d12
+
+ vld1.8 {d0}, [r0], r1
+ vld1.8 {d8}, [r2], r3
+
+ vabal.u8 q12, d6, d14
+
+ vld1.8 {d2}, [r0], r1
+ vld1.8 {d10}, [r2], r3
+
+ vabal.u8 q12, d0, d8
+
+ vld1.8 {d4}, [r0], r1
+ vld1.8 {d12}, [r2], r3
+
+ vabal.u8 q12, d2, d10
+
+ vld1.8 {d6}, [r0], r1
+ vld1.8 {d14}, [r2], r3
+
+ vabal.u8 q12, d4, d12
+ vabal.u8 q12, d6, d14
+
+ vpaddl.u16 q1, q12
+ vpaddl.u32 q0, q1
+ vadd.u32 d0, d0, d1
+
+ vmov.32 r0, d0[0]
+
+ bx lr
+
+ ENDP
+
+;============================
+;unsigned int vp8_sad8x16_c(
+; unsigned char *src_ptr,
+; int src_stride,
+; unsigned char *ref_ptr,
+; int ref_stride)
+
+|vp8_sad8x16_neon| PROC
+ vld1.8 {d0}, [r0], r1
+ vld1.8 {d8}, [r2], r3
+
+ vld1.8 {d2}, [r0], r1
+ vld1.8 {d10}, [r2], r3
+
+ vabdl.u8 q12, d0, d8
+
+ vld1.8 {d4}, [r0], r1
+ vld1.8 {d12}, [r2], r3
+
+ vabal.u8 q12, d2, d10
+
+ vld1.8 {d6}, [r0], r1
+ vld1.8 {d14}, [r2], r3
+
+ vabal.u8 q12, d4, d12
+
+ vld1.8 {d0}, [r0], r1
+ vld1.8 {d8}, [r2], r3
+
+ vabal.u8 q12, d6, d14
+
+ vld1.8 {d2}, [r0], r1
+ vld1.8 {d10}, [r2], r3
+
+ vabal.u8 q12, d0, d8
+
+ vld1.8 {d4}, [r0], r1
+ vld1.8 {d12}, [r2], r3
+
+ vabal.u8 q12, d2, d10
+
+ vld1.8 {d6}, [r0], r1
+ vld1.8 {d14}, [r2], r3
+
+ vabal.u8 q12, d4, d12
+
+ vld1.8 {d0}, [r0], r1
+ vld1.8 {d8}, [r2], r3
+
+ vabal.u8 q12, d6, d14
+
+ vld1.8 {d2}, [r0], r1
+ vld1.8 {d10}, [r2], r3
+
+ vabal.u8 q12, d0, d8
+
+ vld1.8 {d4}, [r0], r1
+ vld1.8 {d12}, [r2], r3
+
+ vabal.u8 q12, d2, d10
+
+ vld1.8 {d6}, [r0], r1
+ vld1.8 {d14}, [r2], r3
+
+ vabal.u8 q12, d4, d12
+
+ vld1.8 {d0}, [r0], r1
+ vld1.8 {d8}, [r2], r3
+
+ vabal.u8 q12, d6, d14
+
+ vld1.8 {d2}, [r0], r1
+ vld1.8 {d10}, [r2], r3
+
+ vabal.u8 q12, d0, d8
+
+ vld1.8 {d4}, [r0], r1
+ vld1.8 {d12}, [r2], r3
+
+ vabal.u8 q12, d2, d10
+
+ vld1.8 {d6}, [r0], r1
+ vld1.8 {d14}, [r2], r3
+
+ vabal.u8 q12, d4, d12
+ vabal.u8 q12, d6, d14
+
+ vpaddl.u16 q1, q12
+ vpaddl.u32 q0, q1
+ vadd.u32 d0, d0, d1
+
+ vmov.32 r0, d0[0]
+
+ bx lr
+
+ ENDP
+
+;===========================
+;unsigned int vp8_sad4x4_c(
+; unsigned char *src_ptr,
+; int src_stride,
+; unsigned char *ref_ptr,
+; int ref_stride)
+
+|vp8_sad4x4_neon| PROC
+ vld1.8 {d0}, [r0], r1
+ vld1.8 {d8}, [r2], r3
+
+ vld1.8 {d2}, [r0], r1
+ vld1.8 {d10}, [r2], r3
+
+ vabdl.u8 q12, d0, d8
+
+ vld1.8 {d4}, [r0], r1
+ vld1.8 {d12}, [r2], r3
+
+ vabal.u8 q12, d2, d10
+
+ vld1.8 {d6}, [r0], r1
+ vld1.8 {d14}, [r2], r3
+
+ vabal.u8 q12, d4, d12
+ vabal.u8 q12, d6, d14
+
+ vpaddl.u16 d1, d24
+ vpaddl.u32 d0, d1
+ vmov.32 r0, d0[0]
+
+ bx lr
+
+ ENDP
+
+ END
--- /dev/null
+++ b/vp9/encoder/arm/neon/shortfdct_neon.asm
@@ -1,0 +1,221 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+ EXPORT |vp8_short_fdct4x4_neon|
+ EXPORT |vp8_short_fdct8x4_neon|
+
+ ARM
+ REQUIRE8
+ PRESERVE8
+
+ AREA ||.text||, CODE, READONLY, ALIGN=4
+
+
+ ALIGN 16 ; enable use of @128 bit aligned loads
+coeff
+ DCW 5352, 5352, 5352, 5352
+ DCW 2217, 2217, 2217, 2217
+ DCD 14500, 14500, 14500, 14500
+ DCD 7500, 7500, 7500, 7500
+ DCD 12000, 12000, 12000, 12000
+ DCD 51000, 51000, 51000, 51000
+
+;void vp8_short_fdct4x4_c(short *input, short *output, int pitch)
+|vp8_short_fdct4x4_neon| PROC
+
+ ; Part one
+ vld1.16 {d0}, [r0@64], r2
+ adr r12, coeff
+ vld1.16 {d1}, [r0@64], r2
+ vld1.16 {q8}, [r12@128]! ; d16=5352, d17=2217
+ vld1.16 {d2}, [r0@64], r2
+ vld1.32 {q9, q10}, [r12@128]! ; q9=14500, q10=7500
+ vld1.16 {d3}, [r0@64], r2
+
+ ; transpose d0=ip[0], d1=ip[1], d2=ip[2], d3=ip[3]
+ vtrn.32 d0, d2
+ vtrn.32 d1, d3
+ vld1.32 {q11,q12}, [r12@128] ; q11=12000, q12=51000
+ vtrn.16 d0, d1
+ vtrn.16 d2, d3
+
+ vadd.s16 d4, d0, d3 ; a1 = ip[0] + ip[3]
+ vadd.s16 d5, d1, d2 ; b1 = ip[1] + ip[2]
+ vsub.s16 d6, d1, d2 ; c1 = ip[1] - ip[2]
+ vsub.s16 d7, d0, d3 ; d1 = ip[0] - ip[3]
+
+ vshl.s16 q2, q2, #3 ; (a1, b1) << 3
+ vshl.s16 q3, q3, #3 ; (c1, d1) << 3
+
+ vadd.s16 d0, d4, d5 ; op[0] = a1 + b1
+ vsub.s16 d2, d4, d5 ; op[2] = a1 - b1
+
+ vmlal.s16 q9, d7, d16 ; d1*5352 + 14500
+ vmlal.s16 q10, d7, d17 ; d1*2217 + 7500
+ vmlal.s16 q9, d6, d17 ; c1*2217 + d1*5352 + 14500
+ vmlsl.s16 q10, d6, d16 ; d1*2217 - c1*5352 + 7500
+
+ vshrn.s32 d1, q9, #12 ; op[1] = (c1*2217 + d1*5352 + 14500)>>12
+ vshrn.s32 d3, q10, #12 ; op[3] = (d1*2217 - c1*5352 + 7500)>>12
+
+
+ ; Part two
+
+ ; transpose d0=ip[0], d1=ip[4], d2=ip[8], d3=ip[12]
+ vtrn.32 d0, d2
+ vtrn.32 d1, d3
+ vtrn.16 d0, d1
+ vtrn.16 d2, d3
+
+ vmov.s16 d26, #7
+
+ vadd.s16 d4, d0, d3 ; a1 = ip[0] + ip[12]
+ vadd.s16 d5, d1, d2 ; b1 = ip[4] + ip[8]
+ vsub.s16 d6, d1, d2 ; c1 = ip[4] - ip[8]
+ vadd.s16 d4, d4, d26 ; a1 + 7
+ vsub.s16 d7, d0, d3 ; d1 = ip[0] - ip[12]
+
+ vadd.s16 d0, d4, d5 ; op[0] = a1 + b1 + 7
+ vsub.s16 d2, d4, d5 ; op[8] = a1 - b1 + 7
+
+ vmlal.s16 q11, d7, d16 ; d1*5352 + 12000
+ vmlal.s16 q12, d7, d17 ; d1*2217 + 51000
+
+ vceq.s16 d4, d7, #0
+
+ vshr.s16 d0, d0, #4
+ vshr.s16 d2, d2, #4
+
+ vmlal.s16 q11, d6, d17 ; c1*2217 + d1*5352 + 12000
+ vmlsl.s16 q12, d6, d16 ; d1*2217 - c1*5352 + 51000
+
+ vmvn.s16 d4, d4
+ vshrn.s32 d1, q11, #16 ; op[4] = (c1*2217 + d1*5352 + 12000)>>16
+ vsub.s16 d1, d1, d4 ; op[4] += (d1!=0)
+ vshrn.s32 d3, q12, #16 ; op[12]= (d1*2217 - c1*5352 + 51000)>>16
+
+ vst1.16 {q0, q1}, [r1@128]
+
+ bx lr
+
+ ENDP
+
+;void vp8_short_fdct8x4_c(short *input, short *output, int pitch)
+|vp8_short_fdct8x4_neon| PROC
+
+ ; Part one
+
+ vld1.16 {q0}, [r0@128], r2
+ adr r12, coeff
+ vld1.16 {q1}, [r0@128], r2
+ vld1.16 {q8}, [r12@128]! ; d16=5352, d17=2217
+ vld1.16 {q2}, [r0@128], r2
+ vld1.32 {q9, q10}, [r12@128]! ; q9=14500, q10=7500
+ vld1.16 {q3}, [r0@128], r2
+
+ ; transpose q0=ip[0], q1=ip[1], q2=ip[2], q3=ip[3]
+ vtrn.32 q0, q2 ; [A0|B0]
+ vtrn.32 q1, q3 ; [A1|B1]
+ vtrn.16 q0, q1 ; [A2|B2]
+ vtrn.16 q2, q3 ; [A3|B3]
+
+ vadd.s16 q11, q0, q3 ; a1 = ip[0] + ip[3]
+ vadd.s16 q12, q1, q2 ; b1 = ip[1] + ip[2]
+ vsub.s16 q13, q1, q2 ; c1 = ip[1] - ip[2]
+ vsub.s16 q14, q0, q3 ; d1 = ip[0] - ip[3]
+
+ vshl.s16 q11, q11, #3 ; a1 << 3
+ vshl.s16 q12, q12, #3 ; b1 << 3
+ vshl.s16 q13, q13, #3 ; c1 << 3
+ vshl.s16 q14, q14, #3 ; d1 << 3
+
+ vadd.s16 q0, q11, q12 ; [A0 | B0] = a1 + b1
+ vsub.s16 q2, q11, q12 ; [A2 | B2] = a1 - b1
+
+ vmov.s16 q11, q9 ; 14500
+ vmov.s16 q12, q10 ; 7500
+
+ vmlal.s16 q9, d28, d16 ; A[1] = d1*5352 + 14500
+ vmlal.s16 q10, d28, d17 ; A[3] = d1*2217 + 7500
+ vmlal.s16 q11, d29, d16 ; B[1] = d1*5352 + 14500
+ vmlal.s16 q12, d29, d17 ; B[3] = d1*2217 + 7500
+
+ vmlal.s16 q9, d26, d17 ; A[1] = c1*2217 + d1*5352 + 14500
+ vmlsl.s16 q10, d26, d16 ; A[3] = d1*2217 - c1*5352 + 7500
+ vmlal.s16 q11, d27, d17 ; B[1] = c1*2217 + d1*5352 + 14500
+ vmlsl.s16 q12, d27, d16 ; B[3] = d1*2217 - c1*5352 + 7500
+
+ vshrn.s32 d2, q9, #12 ; A[1] = (c1*2217 + d1*5352 + 14500)>>12
+ vshrn.s32 d6, q10, #12 ; A[3] = (d1*2217 - c1*5352 + 7500)>>12
+ vshrn.s32 d3, q11, #12 ; B[1] = (c1*2217 + d1*5352 + 14500)>>12
+ vshrn.s32 d7, q12, #12 ; B[3] = (d1*2217 - c1*5352 + 7500)>>12
+
+
+ ; Part two
+ vld1.32 {q9,q10}, [r12@128] ; q9=12000, q10=51000
+
+ ; transpose q0=ip[0], q1=ip[4], q2=ip[8], q3=ip[12]
+ vtrn.32 q0, q2 ; q0=[A0 | B0]
+ vtrn.32 q1, q3 ; q1=[A4 | B4]
+ vtrn.16 q0, q1 ; q2=[A8 | B8]
+ vtrn.16 q2, q3 ; q3=[A12|B12]
+
+ vmov.s16 q15, #7
+
+ vadd.s16 q11, q0, q3 ; a1 = ip[0] + ip[12]
+ vadd.s16 q12, q1, q2 ; b1 = ip[4] + ip[8]
+ vadd.s16 q11, q11, q15 ; a1 + 7
+ vsub.s16 q13, q1, q2 ; c1 = ip[4] - ip[8]
+ vsub.s16 q14, q0, q3 ; d1 = ip[0] - ip[12]
+
+ vadd.s16 q0, q11, q12 ; a1 + b1 + 7
+ vsub.s16 q1, q11, q12 ; a1 - b1 + 7
+
+ vmov.s16 q11, q9 ; 12000
+ vmov.s16 q12, q10 ; 51000
+
+ vshr.s16 d0, d0, #4 ; A[0] = (a1 + b1 + 7)>>4
+ vshr.s16 d4, d1, #4 ; B[0] = (a1 + b1 + 7)>>4
+ vshr.s16 d2, d2, #4 ; A[8] = (a1 + b1 + 7)>>4
+ vshr.s16 d6, d3, #4 ; B[8] = (a1 + b1 + 7)>>4
+
+
+ vmlal.s16 q9, d28, d16 ; A[4] = d1*5352 + 12000
+ vmlal.s16 q10, d28, d17 ; A[12] = d1*2217 + 51000
+ vmlal.s16 q11, d29, d16 ; B[4] = d1*5352 + 12000
+ vmlal.s16 q12, d29, d17 ; B[12] = d1*2217 + 51000
+
+ vceq.s16 q14, q14, #0
+
+ vmlal.s16 q9, d26, d17 ; A[4] = c1*2217 + d1*5352 + 12000
+ vmlsl.s16 q10, d26, d16 ; A[12] = d1*2217 - c1*5352 + 51000
+ vmlal.s16 q11, d27, d17 ; B[4] = c1*2217 + d1*5352 + 12000
+ vmlsl.s16 q12, d27, d16 ; B[12] = d1*2217 - c1*5352 + 51000
+
+ vmvn.s16 q14, q14
+
+ vshrn.s32 d1, q9, #16 ; A[4] = (c1*2217 + d1*5352 + 12000)>>16
+ vshrn.s32 d3, q10, #16 ; A[12]= (d1*2217 - c1*5352 + 51000)>>16
+ vsub.s16 d1, d1, d28 ; A[4] += (d1!=0)
+
+ vshrn.s32 d5, q11, #16 ; B[4] = (c1*2217 + d1*5352 + 12000)>>16
+ vshrn.s32 d7, q12, #16 ; B[12]= (d1*2217 - c1*5352 + 51000)>>16
+ vsub.s16 d5, d5, d29 ; B[4] += (d1!=0)
+
+ vst1.16 {q0, q1}, [r1@128]! ; block A
+ vst1.16 {q2, q3}, [r1@128]! ; block B
+
+ bx lr
+
+ ENDP
+
+ END
+
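A plain-C sketch of the 4x4 transform the per-instruction comments above describe (the 8x4
variant runs the same arithmetic on two 4x4 blocks side by side); names are illustrative.

static void short_fdct4x4_sketch(const short *input, short *output, int pitch) {
  int i;
  short tmp[16];
  const short *ip = input;
  short *op = tmp;

  /* Part one: rows (pitch is in bytes, so step by pitch/2 shorts) */
  for (i = 0; i < 4; i++) {
    int a1 = (ip[0] + ip[3]) << 3;
    int b1 = (ip[1] + ip[2]) << 3;
    int c1 = (ip[1] - ip[2]) << 3;
    int d1 = (ip[0] - ip[3]) << 3;
    op[0] = (short)(a1 + b1);
    op[2] = (short)(a1 - b1);
    op[1] = (short)((c1 * 2217 + d1 * 5352 + 14500) >> 12);
    op[3] = (short)((d1 * 2217 - c1 * 5352 + 7500) >> 12);
    ip += pitch / 2;
    op += 4;
  }

  /* Part two: columns */
  ip = tmp;
  op = output;
  for (i = 0; i < 4; i++) {
    int a1 = ip[0] + ip[12];
    int b1 = ip[4] + ip[8];
    int c1 = ip[4] - ip[8];
    int d1 = ip[0] - ip[12];
    op[0]  = (short)((a1 + b1 + 7) >> 4);
    op[8]  = (short)((a1 - b1 + 7) >> 4);
    op[4]  = (short)(((c1 * 2217 + d1 * 5352 + 12000) >> 16) + (d1 != 0));
    op[12] = (short)((d1 * 2217 - c1 * 5352 + 51000) >> 16);
    ip++;
    op++;
  }
}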
--- /dev/null
+++ b/vp9/encoder/arm/neon/subtract_neon.asm
@@ -1,0 +1,185 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+ EXPORT |vp8_subtract_b_neon|
+ EXPORT |vp8_subtract_mby_neon|
+ EXPORT |vp8_subtract_mbuv_neon|
+
+ INCLUDE asm_enc_offsets.asm
+
+ ARM
+ REQUIRE8
+ PRESERVE8
+
+ AREA ||.text||, CODE, READONLY, ALIGN=2
+
+;void vp8_subtract_b_neon(BLOCK *be, BLOCKD *bd, int pitch)
+|vp8_subtract_b_neon| PROC
+
+ stmfd sp!, {r4-r7}
+
+ ldr r3, [r0, #vp8_block_base_src]
+ ldr r4, [r0, #vp8_block_src]
+ ldr r5, [r0, #vp8_block_src_diff]
+ ldr r3, [r3]
+ ldr r6, [r0, #vp8_block_src_stride]
+ add r3, r3, r4 ; src = *base_src + src
+ ldr r7, [r1, #vp8_blockd_predictor]
+
+ vld1.8 {d0}, [r3], r6 ;load src
+ vld1.8 {d1}, [r7], r2 ;load pred
+ vld1.8 {d2}, [r3], r6
+ vld1.8 {d3}, [r7], r2
+ vld1.8 {d4}, [r3], r6
+ vld1.8 {d5}, [r7], r2
+ vld1.8 {d6}, [r3], r6
+ vld1.8 {d7}, [r7], r2
+
+ vsubl.u8 q10, d0, d1
+ vsubl.u8 q11, d2, d3
+ vsubl.u8 q12, d4, d5
+ vsubl.u8 q13, d6, d7
+
+ mov r2, r2, lsl #1
+
+ vst1.16 {d20}, [r5], r2 ;store diff
+ vst1.16 {d22}, [r5], r2
+ vst1.16 {d24}, [r5], r2
+ vst1.16 {d26}, [r5], r2
+
+ ldmfd sp!, {r4-r7}
+ bx lr
+
+ ENDP
+
+
+;==========================================
+;void vp8_subtract_mby_neon(short *diff, unsigned char *src, unsigned char *pred, int stride)
+|vp8_subtract_mby_neon| PROC
+ mov r12, #4
+
+subtract_mby_loop
+ vld1.8 {q0}, [r1], r3 ;load src
+ vld1.8 {q1}, [r2]! ;load pred
+ vld1.8 {q2}, [r1], r3
+ vld1.8 {q3}, [r2]!
+ vld1.8 {q4}, [r1], r3
+ vld1.8 {q5}, [r2]!
+ vld1.8 {q6}, [r1], r3
+ vld1.8 {q7}, [r2]!
+
+ vsubl.u8 q8, d0, d2
+ vsubl.u8 q9, d1, d3
+ vsubl.u8 q10, d4, d6
+ vsubl.u8 q11, d5, d7
+ vsubl.u8 q12, d8, d10
+ vsubl.u8 q13, d9, d11
+ vsubl.u8 q14, d12, d14
+ vsubl.u8 q15, d13, d15
+
+ vst1.16 {q8}, [r0]! ;store diff
+ vst1.16 {q9}, [r0]!
+ vst1.16 {q10}, [r0]!
+ vst1.16 {q11}, [r0]!
+ vst1.16 {q12}, [r0]!
+ vst1.16 {q13}, [r0]!
+ vst1.16 {q14}, [r0]!
+ vst1.16 {q15}, [r0]!
+
+ subs r12, r12, #1
+ bne subtract_mby_loop
+
+ bx lr
+ ENDP
+
+;=================================
+;void vp8_subtract_mbuv_neon(short *diff, unsigned char *usrc, unsigned char *vsrc, unsigned char *pred, int stride)
+|vp8_subtract_mbuv_neon| PROC
+ ldr r12, [sp]
+
+;u
+ add r0, r0, #512 ; short *udiff = diff + 256;
+ add r3, r3, #256 ; unsigned char *upred = pred + 256;
+
+ vld1.8 {d0}, [r1], r12 ;load src
+ vld1.8 {d1}, [r3]! ;load pred
+ vld1.8 {d2}, [r1], r12
+ vld1.8 {d3}, [r3]!
+ vld1.8 {d4}, [r1], r12
+ vld1.8 {d5}, [r3]!
+ vld1.8 {d6}, [r1], r12
+ vld1.8 {d7}, [r3]!
+ vld1.8 {d8}, [r1], r12
+ vld1.8 {d9}, [r3]!
+ vld1.8 {d10}, [r1], r12
+ vld1.8 {d11}, [r3]!
+ vld1.8 {d12}, [r1], r12
+ vld1.8 {d13}, [r3]!
+ vld1.8 {d14}, [r1], r12
+ vld1.8 {d15}, [r3]!
+
+ vsubl.u8 q8, d0, d1
+ vsubl.u8 q9, d2, d3
+ vsubl.u8 q10, d4, d5
+ vsubl.u8 q11, d6, d7
+ vsubl.u8 q12, d8, d9
+ vsubl.u8 q13, d10, d11
+ vsubl.u8 q14, d12, d13
+ vsubl.u8 q15, d14, d15
+
+ vst1.16 {q8}, [r0]! ;store diff
+ vst1.16 {q9}, [r0]!
+ vst1.16 {q10}, [r0]!
+ vst1.16 {q11}, [r0]!
+ vst1.16 {q12}, [r0]!
+ vst1.16 {q13}, [r0]!
+ vst1.16 {q14}, [r0]!
+ vst1.16 {q15}, [r0]!
+
+;v
+ vld1.8 {d0}, [r2], r12 ;load src
+ vld1.8 {d1}, [r3]! ;load pred
+ vld1.8 {d2}, [r2], r12
+ vld1.8 {d3}, [r3]!
+ vld1.8 {d4}, [r2], r12
+ vld1.8 {d5}, [r3]!
+ vld1.8 {d6}, [r2], r12
+ vld1.8 {d7}, [r3]!
+ vld1.8 {d8}, [r2], r12
+ vld1.8 {d9}, [r3]!
+ vld1.8 {d10}, [r2], r12
+ vld1.8 {d11}, [r3]!
+ vld1.8 {d12}, [r2], r12
+ vld1.8 {d13}, [r3]!
+ vld1.8 {d14}, [r2], r12
+ vld1.8 {d15}, [r3]!
+
+ vsubl.u8 q8, d0, d1
+ vsubl.u8 q9, d2, d3
+ vsubl.u8 q10, d4, d5
+ vsubl.u8 q11, d6, d7
+ vsubl.u8 q12, d8, d9
+ vsubl.u8 q13, d10, d11
+ vsubl.u8 q14, d12, d13
+ vsubl.u8 q15, d14, d15
+
+ vst1.16 {q8}, [r0]! ;store diff
+ vst1.16 {q9}, [r0]!
+ vst1.16 {q10}, [r0]!
+ vst1.16 {q11}, [r0]!
+ vst1.16 {q12}, [r0]!
+ vst1.16 {q13}, [r0]!
+ vst1.16 {q14}, [r0]!
+ vst1.16 {q15}, [r0]!
+
+ bx lr
+ ENDP
+
+ END
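A plain-C sketch of the per-block subtraction above; the mby/mbuv routines apply the same
widening subtract over the 16x16 luma plane and the two 8x8 chroma planes. Names are
illustrative.

static void subtract_b_sketch(short *diff, const unsigned char *src, int src_stride,
                              const unsigned char *pred, int pitch) {
  int r, c;
  for (r = 0; r < 4; r++) {
    for (c = 0; c < 4; c++)
      diff[c] = (short)(src[c] - pred[c]);  /* NEON: vsubl.u8 widening subtract */
    diff += pitch;                          /* diff and pred share the same pitch */
    src += src_stride;
    pred += pitch;
  }
}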
--- /dev/null
+++ b/vp9/encoder/arm/neon/variance_neon.asm
@@ -1,0 +1,276 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+ EXPORT |vp9_variance16x16_neon|
+ EXPORT |vp9_variance16x8_neon|
+ EXPORT |vp9_variance8x16_neon|
+ EXPORT |vp9_variance8x8_neon|
+
+ ARM
+ REQUIRE8
+ PRESERVE8
+
+ AREA ||.text||, CODE, READONLY, ALIGN=2
+
+; r0 unsigned char *src_ptr
+; r1 int source_stride
+; r2 unsigned char *ref_ptr
+; r3 int recon_stride
+; stack unsigned int *sse
+|vp9_variance16x16_neon| PROC
+ vmov.i8 q8, #0 ;q8 - sum
+ vmov.i8 q9, #0 ;q9, q10 - sse
+ vmov.i8 q10, #0
+
+ mov r12, #8
+
+variance16x16_neon_loop
+ vld1.8 {q0}, [r0], r1 ;Load up source and reference
+ vld1.8 {q2}, [r2], r3
+ vld1.8 {q1}, [r0], r1
+ vld1.8 {q3}, [r2], r3
+
+ vsubl.u8 q11, d0, d4 ;calculate diff
+ vsubl.u8 q12, d1, d5
+ vsubl.u8 q13, d2, d6
+ vsubl.u8 q14, d3, d7
+
+ ;VPADAL adds adjacent pairs of elements of a vector and accumulates
+ ;the results into the elements of the destination vector. The explanation
+ ;in the ARM guide is wrong.
+ vpadal.s16 q8, q11 ;calculate sum
+ vmlal.s16 q9, d22, d22 ;calculate sse
+ vmlal.s16 q10, d23, d23
+
+ subs r12, r12, #1
+
+ vpadal.s16 q8, q12
+ vmlal.s16 q9, d24, d24
+ vmlal.s16 q10, d25, d25
+ vpadal.s16 q8, q13
+ vmlal.s16 q9, d26, d26
+ vmlal.s16 q10, d27, d27
+ vpadal.s16 q8, q14
+ vmlal.s16 q9, d28, d28
+ vmlal.s16 q10, d29, d29
+
+ bne variance16x16_neon_loop
+
+ vadd.u32 q10, q9, q10 ;accumulate sse
+ vpaddl.s32 q0, q8 ;accumulate sum
+
+ ldr r12, [sp] ;load *sse from stack
+
+ vpaddl.u32 q1, q10
+ vadd.s64 d0, d0, d1
+ vadd.u64 d1, d2, d3
+
+ ;vmov.32 r0, d0[0] ;this instruction costs a lot
+ ;vmov.32 r1, d1[0]
+ ;mul r0, r0, r0
+ ;str r1, [r12]
+ ;sub r0, r1, r0, asr #8
+
+ ;sum is in [-255x256, 255x256], so sum*sum needs 32 bits. The right shift must
+ ;be sign-extending, which is vshr.s, so s32 is used here.
+ ;(A plain-C reference for these variance kernels appears after this file.)
+ vmull.s32 q5, d0, d0
+ vst1.32 {d1[0]}, [r12] ;store sse
+ vshr.s32 d10, d10, #8
+ vsub.s32 d0, d1, d10
+
+ vmov.32 r0, d0[0] ;return
+ bx lr
+
+ ENDP
+
+;================================
+;unsigned int vp9_variance16x8_c(
+; unsigned char *src_ptr,
+; int source_stride,
+; unsigned char *ref_ptr,
+; int recon_stride,
+; unsigned int *sse)
+|vp9_variance16x8_neon| PROC
+ vmov.i8 q8, #0 ;q8 - sum
+ vmov.i8 q9, #0 ;q9, q10 - sse
+ vmov.i8 q10, #0
+
+ mov r12, #4
+
+variance16x8_neon_loop
+ vld1.8 {q0}, [r0], r1 ;Load up source and reference
+ vld1.8 {q2}, [r2], r3
+ vld1.8 {q1}, [r0], r1
+ vld1.8 {q3}, [r2], r3
+
+ vsubl.u8 q11, d0, d4 ;calculate diff
+ vsubl.u8 q12, d1, d5
+ vsubl.u8 q13, d2, d6
+ vsubl.u8 q14, d3, d7
+
+ vpadal.s16 q8, q11 ;calculate sum
+ vmlal.s16 q9, d22, d22 ;calculate sse
+ vmlal.s16 q10, d23, d23
+
+ subs r12, r12, #1
+
+ vpadal.s16 q8, q12
+ vmlal.s16 q9, d24, d24
+ vmlal.s16 q10, d25, d25
+ vpadal.s16 q8, q13
+ vmlal.s16 q9, d26, d26
+ vmlal.s16 q10, d27, d27
+ vpadal.s16 q8, q14
+ vmlal.s16 q9, d28, d28
+ vmlal.s16 q10, d29, d29
+
+ bne variance16x8_neon_loop
+
+ vadd.u32 q10, q9, q10 ;accumulate sse
+ vpaddl.s32 q0, q8 ;accumulate sum
+
+ ldr r12, [sp] ;load *sse from stack
+
+ vpaddl.u32 q1, q10
+ vadd.s64 d0, d0, d1
+ vadd.u64 d1, d2, d3
+
+ vmull.s32 q5, d0, d0
+ vst1.32 {d1[0]}, [r12] ;store sse
+ vshr.s32 d10, d10, #7
+ vsub.s32 d0, d1, d10
+
+ vmov.32 r0, d0[0] ;return
+ bx lr
+
+ ENDP
+
+;=================================
+;unsigned int vp9_variance8x16_c(
+; unsigned char *src_ptr,
+; int source_stride,
+; unsigned char *ref_ptr,
+; int recon_stride,
+; unsigned int *sse)
+
+|vp9_variance8x16_neon| PROC
+ vmov.i8 q8, #0 ;q8 - sum
+ vmov.i8 q9, #0 ;q9, q10 - sse
+ vmov.i8 q10, #0
+
+ mov r12, #8
+
+variance8x16_neon_loop
+ vld1.8 {d0}, [r0], r1 ;Load up source and reference
+ vld1.8 {d4}, [r2], r3
+ vld1.8 {d2}, [r0], r1
+ vld1.8 {d6}, [r2], r3
+
+ vsubl.u8 q11, d0, d4 ;calculate diff
+ vsubl.u8 q12, d2, d6
+
+ vpadal.s16 q8, q11 ;calculate sum
+ vmlal.s16 q9, d22, d22 ;calculate sse
+ vmlal.s16 q10, d23, d23
+
+ subs r12, r12, #1
+
+ vpadal.s16 q8, q12
+ vmlal.s16 q9, d24, d24
+ vmlal.s16 q10, d25, d25
+
+ bne variance8x16_neon_loop
+
+ vadd.u32 q10, q9, q10 ;accumulate sse
+ vpaddl.s32 q0, q8 ;accumulate sum
+
+ ldr r12, [sp] ;load *sse from stack
+
+ vpaddl.u32 q1, q10
+ vadd.s64 d0, d0, d1
+ vadd.u64 d1, d2, d3
+
+ vmull.s32 q5, d0, d0
+ vst1.32 {d1[0]}, [r12] ;store sse
+ vshr.s32 d10, d10, #7
+ vsub.s32 d0, d1, d10
+
+ vmov.32 r0, d0[0] ;return
+ bx lr
+
+ ENDP
+
+;==================================
+; r0 unsigned char *src_ptr
+; r1 int source_stride
+; r2 unsigned char *ref_ptr
+; r3 int recon_stride
+; stack unsigned int *sse
+|vp9_variance8x8_neon| PROC
+ vmov.i8 q8, #0 ;q8 - sum
+ vmov.i8 q9, #0 ;q9, q10 - sse
+ vmov.i8 q10, #0
+
+ mov r12, #2
+
+variance8x8_neon_loop
+ vld1.8 {d0}, [r0], r1 ;Load up source and reference
+ vld1.8 {d4}, [r2], r3
+ vld1.8 {d1}, [r0], r1
+ vld1.8 {d5}, [r2], r3
+ vld1.8 {d2}, [r0], r1
+ vld1.8 {d6}, [r2], r3
+ vld1.8 {d3}, [r0], r1
+ vld1.8 {d7}, [r2], r3
+
+ vsubl.u8 q11, d0, d4 ;calculate diff
+ vsubl.u8 q12, d1, d5
+ vsubl.u8 q13, d2, d6
+ vsubl.u8 q14, d3, d7
+
+ vpadal.s16 q8, q11 ;calculate sum
+ vmlal.s16 q9, d22, d22 ;calculate sse
+ vmlal.s16 q10, d23, d23
+
+ subs r12, r12, #1
+
+ vpadal.s16 q8, q12
+ vmlal.s16 q9, d24, d24
+ vmlal.s16 q10, d25, d25
+ vpadal.s16 q8, q13
+ vmlal.s16 q9, d26, d26
+ vmlal.s16 q10, d27, d27
+ vpadal.s16 q8, q14
+ vmlal.s16 q9, d28, d28
+ vmlal.s16 q10, d29, d29
+
+ bne variance8x8_neon_loop
+
+ vadd.u32 q10, q9, q10 ;accumulate sse
+ vpaddl.s32 q0, q8 ;accumulate sum
+
+ ldr r12, [sp] ;load *sse from stack
+
+ vpaddl.u32 q1, q10
+ vadd.s64 d0, d0, d1
+ vadd.u64 d1, d2, d3
+
+ vmull.s32 q5, d0, d0
+ vst1.32 {d1[0]}, [r12] ;store sse
+ vshr.s32 d10, d10, #6
+ vsub.s32 d0, d1, d10
+
+ vmov.32 r0, d0[0] ;return
+ bx lr
+
+ ENDP
+
+ END
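A plain-C reference for the variance kernels above. The final right shift in the assembly
matches the pixel count: #8 for 16x16 (256 pixels), #7 for 16x8 and 8x16 (128), #6 for 8x8
(64); here it is written as a division. Names are illustrative.

static unsigned int variance_sketch(const unsigned char *src, int src_stride,
                                    const unsigned char *ref, int ref_stride,
                                    int w, int h, unsigned int *sse) {
  int sum = 0;
  unsigned int sq = 0;
  int r, c;
  for (r = 0; r < h; r++) {
    for (c = 0; c < w; c++) {
      int d = src[c] - ref[c];      /* NEON: vsubl.u8 */
      sum += d;                     /* NEON: vpadal.s16 */
      sq += (unsigned int)(d * d);  /* NEON: vmlal.s16 */
    }
    src += src_stride;
    ref += ref_stride;
  }
  *sse = sq;
  return sq - (unsigned int)(((long long)sum * sum) / (w * h));
}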
--- /dev/null
+++ b/vp9/encoder/arm/neon/vp8_memcpy_neon.asm
@@ -1,0 +1,68 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+ EXPORT |vp8_memcpy_neon|
+
+ ARM
+ REQUIRE8
+ PRESERVE8
+
+ AREA ||.text||, CODE, READONLY, ALIGN=2
+;=========================================
+;void vp8_memcpy_neon(unsigned char *dst_ptr, unsigned char *src_ptr, int sz);
+|vp8_memcpy_neon| PROC
+ ;pld [r1] ;preload pred data
+ ;pld [r1, #128]
+ ;pld [r1, #256]
+ ;pld [r1, #384]
+
+ mov r12, r2, lsr #8 ;copy 256 bytes data at one time
+
+memcpy_neon_loop
+ vld1.8 {q0, q1}, [r1]! ;load src data
+ subs r12, r12, #1
+ vld1.8 {q2, q3}, [r1]!
+ vst1.8 {q0, q1}, [r0]! ;copy to dst_ptr
+ vld1.8 {q4, q5}, [r1]!
+ vst1.8 {q2, q3}, [r0]!
+ vld1.8 {q6, q7}, [r1]!
+ vst1.8 {q4, q5}, [r0]!
+ vld1.8 {q8, q9}, [r1]!
+ vst1.8 {q6, q7}, [r0]!
+ vld1.8 {q10, q11}, [r1]!
+ vst1.8 {q8, q9}, [r0]!
+ vld1.8 {q12, q13}, [r1]!
+ vst1.8 {q10, q11}, [r0]!
+ vld1.8 {q14, q15}, [r1]!
+ vst1.8 {q12, q13}, [r0]!
+ vst1.8 {q14, q15}, [r0]!
+
+ ;pld [r1] ;preload pred data -- need to adjust for real device
+ ;pld [r1, #128]
+ ;pld [r1, #256]
+ ;pld [r1, #384]
+
+ bne memcpy_neon_loop
+
+ ands r3, r2, #0xff ;extra copy
+ beq done_copy_neon_loop
+
+extra_copy_neon_loop
+ vld1.8 {q0}, [r1]! ;load src data
+ subs r3, r3, #16
+ vst1.8 {q0}, [r0]!
+ bne extra_copy_neon_loop
+
+done_copy_neon_loop
+ bx lr
+ ENDP
+
+ END
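In plain C the copy above amounts to the following. Note that the NEON main loop is a
do/while (it always copies at least 256 bytes) and the tail loop assumes the remainder
(sz & 0xff) is a multiple of 16; the sketch below keeps the same chunking.

static void memcpy_chunks_sketch(unsigned char *dst, const unsigned char *src, int sz) {
  int i, j;
  for (i = 0; i < (sz >> 8); i++) {        /* 256 bytes per iteration */
    for (j = 0; j < 256; j++)
      dst[j] = src[j];
    dst += 256;
    src += 256;
  }
  for (i = 0; i < (sz & 0xff); i += 16) {  /* leftover, 16 bytes at a time */
    for (j = 0; j < 16; j++)
      dst[j] = src[j];
    dst += 16;
    src += 16;
  }
}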
--- /dev/null
+++ b/vp9/encoder/arm/neon/vp8_mse16x16_neon.asm
@@ -1,0 +1,116 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+ EXPORT |vp8_mse16x16_neon|
+ EXPORT |vp8_get4x4sse_cs_neon|
+
+ ARM
+ REQUIRE8
+ PRESERVE8
+
+ AREA ||.text||, CODE, READONLY, ALIGN=2
+;============================
+; r0 unsigned char *src_ptr
+; r1 int source_stride
+; r2 unsigned char *ref_ptr
+; r3 int recon_stride
+; stack unsigned int *sse
+;note: in this function, sum is never used, so the sum calculation carried out
+;in vp9_variance() is dropped here.
+
+|vp8_mse16x16_neon| PROC
+ vmov.i8 q7, #0 ;q7, q8, q9, q10 - sse
+ vmov.i8 q8, #0
+ vmov.i8 q9, #0
+ vmov.i8 q10, #0
+
+ mov r12, #8
+
+mse16x16_neon_loop
+ vld1.8 {q0}, [r0], r1 ;Load up source and reference
+ vld1.8 {q2}, [r2], r3
+ vld1.8 {q1}, [r0], r1
+ vld1.8 {q3}, [r2], r3
+
+ vsubl.u8 q11, d0, d4
+ vsubl.u8 q12, d1, d5
+ vsubl.u8 q13, d2, d6
+ vsubl.u8 q14, d3, d7
+
+ vmlal.s16 q7, d22, d22
+ vmlal.s16 q8, d23, d23
+
+ subs r12, r12, #1
+
+ vmlal.s16 q9, d24, d24
+ vmlal.s16 q10, d25, d25
+ vmlal.s16 q7, d26, d26
+ vmlal.s16 q8, d27, d27
+ vmlal.s16 q9, d28, d28
+ vmlal.s16 q10, d29, d29
+
+ bne mse16x16_neon_loop
+
+ vadd.u32 q7, q7, q8
+ vadd.u32 q9, q9, q10
+
+ ldr r12, [sp] ;load *sse from stack
+
+ vadd.u32 q10, q7, q9
+ vpaddl.u32 q1, q10
+ vadd.u64 d0, d2, d3
+
+ vst1.32 {d0[0]}, [r12]
+ vmov.32 r0, d0[0]
+
+ bx lr
+
+ ENDP
+
+
+;=============================
+; r0 unsigned char *src_ptr,
+; r1 int source_stride,
+; r2 unsigned char *ref_ptr,
+; r3 int recon_stride
+|vp8_get4x4sse_cs_neon| PROC
+ vld1.8 {d0}, [r0], r1 ;Load up source and reference
+ vld1.8 {d4}, [r2], r3
+ vld1.8 {d1}, [r0], r1
+ vld1.8 {d5}, [r2], r3
+ vld1.8 {d2}, [r0], r1
+ vld1.8 {d6}, [r2], r3
+ vld1.8 {d3}, [r0], r1
+ vld1.8 {d7}, [r2], r3
+
+ vsubl.u8 q11, d0, d4
+ vsubl.u8 q12, d1, d5
+ vsubl.u8 q13, d2, d6
+ vsubl.u8 q14, d3, d7
+
+ vmull.s16 q7, d22, d22
+ vmull.s16 q8, d24, d24
+ vmull.s16 q9, d26, d26
+ vmull.s16 q10, d28, d28
+
+ vadd.u32 q7, q7, q8
+ vadd.u32 q9, q9, q10
+ vadd.u32 q9, q7, q9
+
+ vpaddl.u32 q1, q9
+ vadd.u64 d0, d2, d3
+
+ vmov.32 r0, d0[0]
+ bx lr
+
+ ENDP
+
+ END
--- /dev/null
+++ b/vp9/encoder/arm/neon/vp8_shortwalsh4x4_neon.asm
@@ -1,0 +1,103 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+ EXPORT |vp8_short_walsh4x4_neon|
+
+ ARM
+ REQUIRE8
+ PRESERVE8
+
+ AREA ||.text||, CODE, READONLY, ALIGN=2
+;void vp8_short_walsh4x4_neon(short *input, short *output, int pitch)
+; r0 short *input,
+; r1 short *output,
+; r2 int pitch
+|vp8_short_walsh4x4_neon| PROC
+
+ vld1.16 {d0}, [r0@64], r2 ; load input
+ vld1.16 {d1}, [r0@64], r2
+ vld1.16 {d2}, [r0@64], r2
+ vld1.16 {d3}, [r0@64]
+
+ ;First for-loop
+ ;transpose d0, d1, d2, d3. Then, d0=ip[0], d1=ip[1], d2=ip[2], d3=ip[3]
+ vtrn.32 d0, d2
+ vtrn.32 d1, d3
+
+ vmov.s32 q15, #3 ; add 3 to all values
+
+ vtrn.16 d0, d1
+ vtrn.16 d2, d3
+
+ vadd.s16 d4, d0, d2 ; ip[0] + ip[2]
+ vadd.s16 d5, d1, d3 ; ip[1] + ip[3]
+ vsub.s16 d6, d1, d3 ; ip[1] - ip[3]
+ vsub.s16 d7, d0, d2 ; ip[0] - ip[2]
+
+ vshl.s16 d4, d4, #2 ; a1 = (ip[0] + ip[2]) << 2
+ vshl.s16 d5, d5, #2 ; d1 = (ip[1] + ip[3]) << 2
+ vshl.s16 d6, d6, #2 ; c1 = (ip[1] - ip[3]) << 2
+ vceq.s16 d16, d4, #0 ; a1 == 0
+ vshl.s16 d7, d7, #2 ; b1 = (ip[0] - ip[2]) << 2
+
+ vadd.s16 d0, d4, d5 ; a1 + d1
+ vmvn d16, d16 ; a1 != 0
+ vsub.s16 d3, d4, d5 ; op[3] = a1 - d1
+ vadd.s16 d1, d7, d6 ; op[1] = b1 + c1
+ vsub.s16 d2, d7, d6 ; op[2] = b1 - c1
+ vsub.s16 d0, d0, d16 ; op[0] = a1 + d1 + (a1 != 0)
+
+ ;Second for-loop
+ ;transpose d0, d1, d2, d3, Then, d0=ip[0], d1=ip[4], d2=ip[8], d3=ip[12]
+ vtrn.32 d1, d3
+ vtrn.32 d0, d2
+ vtrn.16 d2, d3
+ vtrn.16 d0, d1
+
+ vaddl.s16 q8, d0, d2 ; a1 = ip[0]+ip[8]
+ vaddl.s16 q9, d1, d3 ; d1 = ip[4]+ip[12]
+ vsubl.s16 q10, d1, d3 ; c1 = ip[4]-ip[12]
+ vsubl.s16 q11, d0, d2 ; b1 = ip[0]-ip[8]
+
+ vadd.s32 q0, q8, q9 ; a2 = a1 + d1
+ vadd.s32 q1, q11, q10 ; b2 = b1 + c1
+ vsub.s32 q2, q11, q10 ; c2 = b1 - c1
+ vsub.s32 q3, q8, q9 ; d2 = a1 - d1
+
+ vclt.s32 q8, q0, #0
+ vclt.s32 q9, q1, #0
+ vclt.s32 q10, q2, #0
+ vclt.s32 q11, q3, #0
+
+ ; subtract -1 (or 0)
+ vsub.s32 q0, q0, q8 ; a2 += a2 < 0
+ vsub.s32 q1, q1, q9 ; b2 += b2 < 0
+ vsub.s32 q2, q2, q10 ; c2 += c2 < 0
+ vsub.s32 q3, q3, q11 ; d2 += d2 < 0
+
+ vadd.s32 q8, q0, q15 ; a2 + 3
+ vadd.s32 q9, q1, q15 ; b2 + 3
+ vadd.s32 q10, q2, q15 ; c2 + 3
+ vadd.s32 q11, q3, q15 ; d2 + 3
+
+ ; vrshrn is not used here: it would round by adding 1 << (3-1) = 4, but the reference adds 3 before the >> 3
+ vshrn.s32 d0, q8, #3
+ vshrn.s32 d1, q9, #3
+ vshrn.s32 d2, q10, #3
+ vshrn.s32 d3, q11, #3
+
+ vst1.16 {q0, q1}, [r1@128]
+
+ bx lr
+
+ ENDP
+
+ END
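A plain-C sketch of the Walsh-Hadamard transform the comments above describe; names are
illustrative.

static void short_walsh4x4_sketch(const short *input, short *output, int pitch) {
  int i;
  int tmp[16];
  const short *ip = input;
  int *op = tmp;

  /* First pass: rows (pitch is in bytes, so step by pitch/2 shorts) */
  for (i = 0; i < 4; i++) {
    int a1 = (ip[0] + ip[2]) << 2;
    int d1 = (ip[1] + ip[3]) << 2;
    int c1 = (ip[1] - ip[3]) << 2;
    int b1 = (ip[0] - ip[2]) << 2;
    op[0] = a1 + d1 + (a1 != 0);   /* the vceq/vmvn/vsub trick above */
    op[1] = b1 + c1;
    op[2] = b1 - c1;
    op[3] = a1 - d1;
    ip += pitch / 2;
    op += 4;
  }

  /* Second pass: columns */
  for (i = 0; i < 4; i++) {
    int a1 = tmp[i] + tmp[i + 8];
    int d1 = tmp[i + 4] + tmp[i + 12];
    int c1 = tmp[i + 4] - tmp[i + 12];
    int b1 = tmp[i] - tmp[i + 8];
    int a2 = a1 + d1, b2 = b1 + c1, c2 = b1 - c1, d2 = a1 - d1;
    a2 += a2 < 0; b2 += b2 < 0; c2 += c2 < 0; d2 += d2 < 0;  /* round negatives up */
    output[i]      = (short)((a2 + 3) >> 3);
    output[i + 4]  = (short)((b2 + 3) >> 3);
    output[i + 8]  = (short)((c2 + 3) >> 3);
    output[i + 12] = (short)((d2 + 3) >> 3);
  }
}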
--- /dev/null
+++ b/vp9/encoder/arm/neon/vp8_subpixelvariance16x16_neon.asm
@@ -1,0 +1,425 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+ EXPORT |vp9_sub_pixel_variance16x16_neon_func|
+ ARM
+ REQUIRE8
+ PRESERVE8
+
+ AREA ||.text||, CODE, READONLY, ALIGN=2
+; r0 unsigned char *src_ptr,
+; r1 int src_pixels_per_line,
+; r2 int xoffset,
+; r3 int yoffset,
+; stack(r4) unsigned char *dst_ptr,
+; stack(r5) int dst_pixels_per_line,
+; stack(r6) unsigned int *sse
+;note: most of the code is copied from bilinear_predict16x16_neon and vp9_variance16x16_neon.
+
+|vp9_sub_pixel_variance16x16_neon_func| PROC
+ push {r4-r6, lr}
+
+ ldr r12, _BilinearTaps_coeff_
+ ldr r4, [sp, #16] ;load *dst_ptr from stack
+ ldr r5, [sp, #20] ;load dst_pixels_per_line from stack
+ ldr r6, [sp, #24] ;load *sse from stack
+
+ cmp r2, #0 ;skip first_pass filter if xoffset=0
+ beq secondpass_bfilter16x16_only
+
+ add r2, r12, r2, lsl #3 ;calculate filter location
+
+ cmp r3, #0 ;skip second_pass filter if yoffset=0
+
+ vld1.s32 {d31}, [r2] ;load first_pass filter
+
+ beq firstpass_bfilter16x16_only
+
+ sub sp, sp, #272 ;reserve space on stack for temporary storage
+ vld1.u8 {d2, d3, d4}, [r0], r1 ;load src data
+ mov lr, sp
+ vld1.u8 {d5, d6, d7}, [r0], r1
+
+ mov r2, #3 ;loop counter
+ vld1.u8 {d8, d9, d10}, [r0], r1
+
+ vdup.8 d0, d31[0] ;first_pass filter (d0 d1)
+ vld1.u8 {d11, d12, d13}, [r0], r1
+
+ vdup.8 d1, d31[4]
+
+;First Pass: output_height lines x output_width columns (17x16)
+vp8e_filt_blk2d_fp16x16_loop_neon
+ pld [r0]
+ pld [r0, r1]
+ pld [r0, r1, lsl #1]
+
+ vmull.u8 q7, d2, d0 ;(src_ptr[0] * Filter[0])
+ vmull.u8 q8, d3, d0
+ vmull.u8 q9, d5, d0
+ vmull.u8 q10, d6, d0
+ vmull.u8 q11, d8, d0
+ vmull.u8 q12, d9, d0
+ vmull.u8 q13, d11, d0
+ vmull.u8 q14, d12, d0
+
+ vext.8 d2, d2, d3, #1 ;construct src_ptr[1]
+ vext.8 d5, d5, d6, #1
+ vext.8 d8, d8, d9, #1
+ vext.8 d11, d11, d12, #1
+
+ vmlal.u8 q7, d2, d1 ;(src_ptr[1] * Filter[1])
+ vmlal.u8 q9, d5, d1
+ vmlal.u8 q11, d8, d1
+ vmlal.u8 q13, d11, d1
+
+ vext.8 d3, d3, d4, #1
+ vext.8 d6, d6, d7, #1
+ vext.8 d9, d9, d10, #1
+ vext.8 d12, d12, d13, #1
+
+ vmlal.u8 q8, d3, d1 ;(src_ptr[1] * Filter[1])
+ vmlal.u8 q10, d6, d1
+ vmlal.u8 q12, d9, d1
+ vmlal.u8 q14, d12, d1
+
+ subs r2, r2, #1
+
+ vqrshrn.u16 d14, q7, #7 ;shift/round/saturate to u8
+ vqrshrn.u16 d15, q8, #7
+ vqrshrn.u16 d16, q9, #7
+ vqrshrn.u16 d17, q10, #7
+ vqrshrn.u16 d18, q11, #7
+ vqrshrn.u16 d19, q12, #7
+ vqrshrn.u16 d20, q13, #7
+
+ vld1.u8 {d2, d3, d4}, [r0], r1 ;load src data
+ vqrshrn.u16 d21, q14, #7
+ vld1.u8 {d5, d6, d7}, [r0], r1
+
+ vst1.u8 {d14, d15, d16, d17}, [lr]! ;store result
+ vld1.u8 {d8, d9, d10}, [r0], r1
+ vst1.u8 {d18, d19, d20, d21}, [lr]!
+ vld1.u8 {d11, d12, d13}, [r0], r1
+
+ bne vp8e_filt_blk2d_fp16x16_loop_neon
+
+;First-pass filtering for the remaining 5 lines
+ vld1.u8 {d14, d15, d16}, [r0], r1
+
+ vmull.u8 q9, d2, d0 ;(src_ptr[0] * Filter[0])
+ vmull.u8 q10, d3, d0
+ vmull.u8 q11, d5, d0
+ vmull.u8 q12, d6, d0
+ vmull.u8 q13, d8, d0
+ vmull.u8 q14, d9, d0
+
+ vext.8 d2, d2, d3, #1 ;construct src_ptr[1]
+ vext.8 d5, d5, d6, #1
+ vext.8 d8, d8, d9, #1
+
+ vmlal.u8 q9, d2, d1 ;(src_ptr[1] * Filter[1])
+ vmlal.u8 q11, d5, d1
+ vmlal.u8 q13, d8, d1
+
+ vext.8 d3, d3, d4, #1
+ vext.8 d6, d6, d7, #1
+ vext.8 d9, d9, d10, #1
+
+ vmlal.u8 q10, d3, d1 ;(src_ptr[1] * Filter[1])
+ vmlal.u8 q12, d6, d1
+ vmlal.u8 q14, d9, d1
+
+ vmull.u8 q1, d11, d0
+ vmull.u8 q2, d12, d0
+ vmull.u8 q3, d14, d0
+ vmull.u8 q4, d15, d0
+
+ vext.8 d11, d11, d12, #1 ;construct src_ptr[1]
+ vext.8 d14, d14, d15, #1
+
+ vmlal.u8 q1, d11, d1 ;(src_ptr[1] * Filter[1])
+ vmlal.u8 q3, d14, d1
+
+ vext.8 d12, d12, d13, #1
+ vext.8 d15, d15, d16, #1
+
+ vmlal.u8 q2, d12, d1 ;(src_ptr[1] * Filter[1])
+ vmlal.u8 q4, d15, d1
+
+ vqrshrn.u16 d10, q9, #7 ;shift/round/saturate to u8
+ vqrshrn.u16 d11, q10, #7
+ vqrshrn.u16 d12, q11, #7
+ vqrshrn.u16 d13, q12, #7
+ vqrshrn.u16 d14, q13, #7
+ vqrshrn.u16 d15, q14, #7
+ vqrshrn.u16 d16, q1, #7
+ vqrshrn.u16 d17, q2, #7
+ vqrshrn.u16 d18, q3, #7
+ vqrshrn.u16 d19, q4, #7
+
+ vst1.u8 {d10, d11, d12, d13}, [lr]! ;store result
+ vst1.u8 {d14, d15, d16, d17}, [lr]!
+ vst1.u8 {d18, d19}, [lr]!
+
+;Second pass: 16x16
+;secondpass_filter
+ add r3, r12, r3, lsl #3
+ sub lr, lr, #272
+
+ vld1.u32 {d31}, [r3] ;load second_pass filter
+
+ sub sp, sp, #256
+ mov r3, sp
+
+ vld1.u8 {d22, d23}, [lr]! ;load src data
+
+ vdup.8 d0, d31[0] ;second_pass filter parameters (d0 d1)
+ vdup.8 d1, d31[4]
+ mov r12, #4 ;loop counter
+
+vp8e_filt_blk2d_sp16x16_loop_neon
+ vld1.u8 {d24, d25}, [lr]!
+ vmull.u8 q1, d22, d0 ;(src_ptr[0] * Filter[0])
+ vld1.u8 {d26, d27}, [lr]!
+ vmull.u8 q2, d23, d0
+ vld1.u8 {d28, d29}, [lr]!
+ vmull.u8 q3, d24, d0
+ vld1.u8 {d30, d31}, [lr]!
+
+ vmull.u8 q4, d25, d0
+ vmull.u8 q5, d26, d0
+ vmull.u8 q6, d27, d0
+ vmull.u8 q7, d28, d0
+ vmull.u8 q8, d29, d0
+
+ vmlal.u8 q1, d24, d1 ;(src_ptr[pixel_step] * Filter[1])
+ vmlal.u8 q2, d25, d1
+ vmlal.u8 q3, d26, d1
+ vmlal.u8 q4, d27, d1
+ vmlal.u8 q5, d28, d1
+ vmlal.u8 q6, d29, d1
+ vmlal.u8 q7, d30, d1
+ vmlal.u8 q8, d31, d1
+
+ subs r12, r12, #1
+
+ vqrshrn.u16 d2, q1, #7 ;shift/round/saturate to u8
+ vqrshrn.u16 d3, q2, #7
+ vqrshrn.u16 d4, q3, #7
+ vqrshrn.u16 d5, q4, #7
+ vqrshrn.u16 d6, q5, #7
+ vqrshrn.u16 d7, q6, #7
+ vqrshrn.u16 d8, q7, #7
+ vqrshrn.u16 d9, q8, #7
+
+ vst1.u8 {d2, d3}, [r3]! ;store result
+ vst1.u8 {d4, d5}, [r3]!
+ vst1.u8 {d6, d7}, [r3]!
+ vmov q11, q15
+ vst1.u8 {d8, d9}, [r3]!
+
+ bne vp8e_filt_blk2d_sp16x16_loop_neon
+
+ b sub_pixel_variance16x16_neon
+
+;--------------------
+firstpass_bfilter16x16_only
+ mov r2, #4 ;loop counter
+ sub sp, sp, #528 ;reserve space on stack for temporary storage
+ vdup.8 d0, d31[0] ;first_pass filter (d0 d1)
+ vdup.8 d1, d31[4]
+ mov r3, sp
+
+;First Pass: output_height lines x output_width columns (16x16)
+vp8e_filt_blk2d_fpo16x16_loop_neon
+ vld1.u8 {d2, d3, d4}, [r0], r1 ;load src data
+ vld1.u8 {d5, d6, d7}, [r0], r1
+ vld1.u8 {d8, d9, d10}, [r0], r1
+ vld1.u8 {d11, d12, d13}, [r0], r1
+
+ pld [r0]
+ pld [r0, r1]
+ pld [r0, r1, lsl #1]
+
+ vmull.u8 q7, d2, d0 ;(src_ptr[0] * Filter[0])
+ vmull.u8 q8, d3, d0
+ vmull.u8 q9, d5, d0
+ vmull.u8 q10, d6, d0
+ vmull.u8 q11, d8, d0
+ vmull.u8 q12, d9, d0
+ vmull.u8 q13, d11, d0
+ vmull.u8 q14, d12, d0
+
+ vext.8 d2, d2, d3, #1 ;construct src_ptr[1]
+ vext.8 d5, d5, d6, #1
+ vext.8 d8, d8, d9, #1
+ vext.8 d11, d11, d12, #1
+
+ vmlal.u8 q7, d2, d1 ;(src_ptr[1] * Filter[1])
+ vmlal.u8 q9, d5, d1
+ vmlal.u8 q11, d8, d1
+ vmlal.u8 q13, d11, d1
+
+ vext.8 d3, d3, d4, #1
+ vext.8 d6, d6, d7, #1
+ vext.8 d9, d9, d10, #1
+ vext.8 d12, d12, d13, #1
+
+ vmlal.u8 q8, d3, d1 ;(src_ptr[1] * Filter[1])
+ vmlal.u8 q10, d6, d1
+ vmlal.u8 q12, d9, d1
+ vmlal.u8 q14, d12, d1
+
+ subs r2, r2, #1
+
+ vqrshrn.u16 d14, q7, #7 ;shift/round/saturate to u8
+ vqrshrn.u16 d15, q8, #7
+ vqrshrn.u16 d16, q9, #7
+ vqrshrn.u16 d17, q10, #7
+ vqrshrn.u16 d18, q11, #7
+ vqrshrn.u16 d19, q12, #7
+ vqrshrn.u16 d20, q13, #7
+ vst1.u8 {d14, d15}, [r3]! ;store result
+ vqrshrn.u16 d21, q14, #7
+
+ vst1.u8 {d16, d17}, [r3]!
+ vst1.u8 {d18, d19}, [r3]!
+ vst1.u8 {d20, d21}, [r3]!
+
+ bne vp8e_filt_blk2d_fpo16x16_loop_neon
+
+ b sub_pixel_variance16x16_neon
+
+;---------------------
+secondpass_bfilter16x16_only
+;Second pass: 16x16
+;secondpass_filter
+ sub sp, sp, #528 ;reserve space on stack for temporary storage
+ add r3, r12, r3, lsl #3
+ mov r12, #4 ;loop counter
+ vld1.u32 {d31}, [r3] ;load second_pass filter
+ vld1.u8 {d22, d23}, [r0], r1 ;load src data
+ mov r3, sp
+
+ vdup.8 d0, d31[0] ;second_pass filter parameters (d0 d1)
+ vdup.8 d1, d31[4]
+
+vp8e_filt_blk2d_spo16x16_loop_neon
+ vld1.u8 {d24, d25}, [r0], r1
+ vmull.u8 q1, d22, d0 ;(src_ptr[0] * Filter[0])
+ vld1.u8 {d26, d27}, [r0], r1
+ vmull.u8 q2, d23, d0
+ vld1.u8 {d28, d29}, [r0], r1
+ vmull.u8 q3, d24, d0
+ vld1.u8 {d30, d31}, [r0], r1
+
+ vmull.u8 q4, d25, d0
+ vmull.u8 q5, d26, d0
+ vmull.u8 q6, d27, d0
+ vmull.u8 q7, d28, d0
+ vmull.u8 q8, d29, d0
+
+ vmlal.u8 q1, d24, d1 ;(src_ptr[pixel_step] * Filter[1])
+ vmlal.u8 q2, d25, d1
+ vmlal.u8 q3, d26, d1
+ vmlal.u8 q4, d27, d1
+ vmlal.u8 q5, d28, d1
+ vmlal.u8 q6, d29, d1
+ vmlal.u8 q7, d30, d1
+ vmlal.u8 q8, d31, d1
+
+ vqrshrn.u16 d2, q1, #7 ;shift/round/saturate to u8
+ vqrshrn.u16 d3, q2, #7
+ vqrshrn.u16 d4, q3, #7
+ vqrshrn.u16 d5, q4, #7
+ vqrshrn.u16 d6, q5, #7
+ vqrshrn.u16 d7, q6, #7
+ vqrshrn.u16 d8, q7, #7
+ vqrshrn.u16 d9, q8, #7
+
+ vst1.u8 {d2, d3}, [r3]! ;store result
+ subs r12, r12, #1
+ vst1.u8 {d4, d5}, [r3]!
+ vmov q11, q15
+ vst1.u8 {d6, d7}, [r3]!
+ vst1.u8 {d8, d9}, [r3]!
+
+ bne vp8e_filt_blk2d_spo16x16_loop_neon
+
+ b sub_pixel_variance16x16_neon
+
+;----------------------------
+;variance16x16
+sub_pixel_variance16x16_neon
+ vmov.i8 q8, #0 ;q8 - sum
+ vmov.i8 q9, #0 ;q9, q10 - sse
+ vmov.i8 q10, #0
+
+ sub r3, r3, #256
+ mov r12, #8
+
+sub_pixel_variance16x16_neon_loop
+ vld1.8 {q0}, [r3]! ;Load up source and reference
+ vld1.8 {q2}, [r4], r5
+ vld1.8 {q1}, [r3]!
+ vld1.8 {q3}, [r4], r5
+
+ vsubl.u8 q11, d0, d4 ;diff
+ vsubl.u8 q12, d1, d5
+ vsubl.u8 q13, d2, d6
+ vsubl.u8 q14, d3, d7
+
+ vpadal.s16 q8, q11 ;sum
+ vmlal.s16 q9, d22, d22 ;sse
+ vmlal.s16 q10, d23, d23
+
+ subs r12, r12, #1
+
+ vpadal.s16 q8, q12
+ vmlal.s16 q9, d24, d24
+ vmlal.s16 q10, d25, d25
+ vpadal.s16 q8, q13
+ vmlal.s16 q9, d26, d26
+ vmlal.s16 q10, d27, d27
+ vpadal.s16 q8, q14
+ vmlal.s16 q9, d28, d28
+ vmlal.s16 q10, d29, d29
+
+ bne sub_pixel_variance16x16_neon_loop
+
+ vadd.u32 q10, q9, q10 ;accumulate sse
+ vpaddl.s32 q0, q8 ;accumulate sum
+
+ vpaddl.u32 q1, q10
+ vadd.s64 d0, d0, d1
+ vadd.u64 d1, d2, d3
+
+ vmull.s32 q5, d0, d0
+ vst1.32 {d1[0]}, [r6] ;store sse
+ vshr.s32 d10, d10, #8
+ vsub.s32 d0, d1, d10
+
+ add sp, sp, #528
+ vmov.32 r0, d0[0] ;return
+
+ pop {r4-r6,pc}
+
+ ENDP
+
+;-----------------
+
+_BilinearTaps_coeff_
+ DCD bilinear_taps_coeff
+bilinear_taps_coeff
+ DCD 128, 0, 112, 16, 96, 32, 80, 48, 64, 64, 48, 80, 32, 96, 16, 112
+
+ END
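A plain-C sketch of one horizontal first-pass row from the kernel above, using the
bilinear_taps_coeff pairs; the vertical pass applies the same filter down a column, and the
final step is the same variance calculation as in variance_neon.asm. Names are illustrative.

static void bilinear_first_pass_row_sketch(const unsigned char *src, unsigned char *dst,
                                           int xoffset) {
  /* the bilinear_taps_coeff table above, as {Filter[0], Filter[1]} pairs */
  static const int taps[8][2] = {
    {128, 0}, {112, 16}, {96, 32}, {80, 48}, {64, 64}, {48, 80}, {32, 96}, {16, 112}
  };
  const int f0 = taps[xoffset][0], f1 = taps[xoffset][1];
  int c;
  for (c = 0; c < 16; c++)  /* 16-wide block; NEON: vmull/vmlal then vqrshrn #7 */
    dst[c] = (unsigned char)((src[c] * f0 + src[c + 1] * f1 + 64) >> 7);
}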
--- /dev/null
+++ b/vp9/encoder/arm/neon/vp8_subpixelvariance16x16s_neon.asm
@@ -1,0 +1,572 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+ EXPORT |vp9_variance_halfpixvar16x16_h_neon|
+ EXPORT |vp9_variance_halfpixvar16x16_v_neon|
+ EXPORT |vp9_variance_halfpixvar16x16_hv_neon|
+ EXPORT |vp9_sub_pixel_variance16x16s_neon|
+ ARM
+ REQUIRE8
+ PRESERVE8
+
+ AREA ||.text||, CODE, READONLY, ALIGN=2
+
+;================================================
+;unsigned int vp9_variance_halfpixvar16x16_h_neon
+;(
+; unsigned char *src_ptr, r0
+; int src_pixels_per_line, r1
+; unsigned char *dst_ptr, r2
+; int dst_pixels_per_line, r3
+; unsigned int *sse
+;);
+;================================================
+|vp9_variance_halfpixvar16x16_h_neon| PROC
+ push {lr}
+
+ mov r12, #4 ;loop counter
+ ldr lr, [sp, #4] ;load *sse from stack
+ vmov.i8 q8, #0 ;q8 - sum
+ vmov.i8 q9, #0 ;q9, q10 - sse
+ vmov.i8 q10, #0
+
+;First Pass: output_height lines x output_width columns (16x16)
+vp8_filt_fpo16x16s_4_0_loop_neon
+ vld1.u8 {d0, d1, d2, d3}, [r0], r1 ;load src data
+ vld1.8 {q11}, [r2], r3
+ vld1.u8 {d4, d5, d6, d7}, [r0], r1
+ vld1.8 {q12}, [r2], r3
+ vld1.u8 {d8, d9, d10, d11}, [r0], r1
+ vld1.8 {q13}, [r2], r3
+ vld1.u8 {d12, d13, d14, d15}, [r0], r1
+
+ ;pld [r0]
+ ;pld [r0, r1]
+ ;pld [r0, r1, lsl #1]
+
+ vext.8 q1, q0, q1, #1 ;construct src_ptr[1]
+ vext.8 q3, q2, q3, #1
+ vext.8 q5, q4, q5, #1
+ vext.8 q7, q6, q7, #1
+
+ vrhadd.u8 q0, q0, q1 ;(src_ptr[0]+src_ptr[1])/round/shift right 1
+ vld1.8 {q14}, [r2], r3
+ vrhadd.u8 q1, q2, q3
+ vrhadd.u8 q2, q4, q5
+ vrhadd.u8 q3, q6, q7
+
+ vsubl.u8 q4, d0, d22 ;diff
+ vsubl.u8 q5, d1, d23
+ vsubl.u8 q6, d2, d24
+ vsubl.u8 q7, d3, d25
+ vsubl.u8 q0, d4, d26
+ vsubl.u8 q1, d5, d27
+ vsubl.u8 q2, d6, d28
+ vsubl.u8 q3, d7, d29
+
+ vpadal.s16 q8, q4 ;sum
+ vmlal.s16 q9, d8, d8 ;sse
+ vmlal.s16 q10, d9, d9
+
+ subs r12, r12, #1
+
+ vpadal.s16 q8, q5
+ vmlal.s16 q9, d10, d10
+ vmlal.s16 q10, d11, d11
+ vpadal.s16 q8, q6
+ vmlal.s16 q9, d12, d12
+ vmlal.s16 q10, d13, d13
+ vpadal.s16 q8, q7
+ vmlal.s16 q9, d14, d14
+ vmlal.s16 q10, d15, d15
+
+ vpadal.s16 q8, q0 ;sum
+ vmlal.s16 q9, d0, d0 ;sse
+ vmlal.s16 q10, d1, d1
+ vpadal.s16 q8, q1
+ vmlal.s16 q9, d2, d2
+ vmlal.s16 q10, d3, d3
+ vpadal.s16 q8, q2
+ vmlal.s16 q9, d4, d4
+ vmlal.s16 q10, d5, d5
+ vpadal.s16 q8, q3
+ vmlal.s16 q9, d6, d6
+ vmlal.s16 q10, d7, d7
+
+ bne vp8_filt_fpo16x16s_4_0_loop_neon
+
+ vadd.u32 q10, q9, q10 ;accumulate sse
+ vpaddl.s32 q0, q8 ;accumulate sum
+
+ vpaddl.u32 q1, q10
+ vadd.s64 d0, d0, d1
+ vadd.u64 d1, d2, d3
+
+ vmull.s32 q5, d0, d0
+ vst1.32 {d1[0]}, [lr] ;store sse
+ vshr.s32 d10, d10, #8
+ vsub.s32 d0, d1, d10
+
+ vmov.32 r0, d0[0] ;return
+ pop {pc}
+ ENDP
+
+;================================================
+;unsigned int vp9_variance_halfpixvar16x16_v_neon
+;(
+; unsigned char *src_ptr, r0
+; int src_pixels_per_line, r1
+; unsigned char *dst_ptr, r2
+; int dst_pixels_per_line, r3
+; unsigned int *sse
+;);
+;================================================
+|vp9_variance_halfpixvar16x16_v_neon| PROC
+ push {lr}
+
+ mov r12, #4 ;loop counter
+
+ vld1.u8 {q0}, [r0], r1 ;load src data
+ ldr lr, [sp, #4] ;load *sse from stack
+
+ vmov.i8 q8, #0 ;q8 - sum
+ vmov.i8 q9, #0 ;q9, q10 - sse
+ vmov.i8 q10, #0
+
+vp8_filt_spo16x16s_0_4_loop_neon
+ vld1.u8 {q2}, [r0], r1
+ vld1.8 {q1}, [r2], r3
+ vld1.u8 {q4}, [r0], r1
+ vld1.8 {q3}, [r2], r3
+ vld1.u8 {q6}, [r0], r1
+ vld1.8 {q5}, [r2], r3
+ vld1.u8 {q15}, [r0], r1
+
+ vrhadd.u8 q0, q0, q2
+ vld1.8 {q7}, [r2], r3
+ vrhadd.u8 q2, q2, q4
+ vrhadd.u8 q4, q4, q6
+ vrhadd.u8 q6, q6, q15
+
+ vsubl.u8 q11, d0, d2 ;diff
+ vsubl.u8 q12, d1, d3
+ vsubl.u8 q13, d4, d6
+ vsubl.u8 q14, d5, d7
+ vsubl.u8 q0, d8, d10
+ vsubl.u8 q1, d9, d11
+ vsubl.u8 q2, d12, d14
+ vsubl.u8 q3, d13, d15
+
+ vpadal.s16 q8, q11 ;sum
+ vmlal.s16 q9, d22, d22 ;sse
+ vmlal.s16 q10, d23, d23
+
+ subs r12, r12, #1
+
+ vpadal.s16 q8, q12
+ vmlal.s16 q9, d24, d24
+ vmlal.s16 q10, d25, d25
+ vpadal.s16 q8, q13
+ vmlal.s16 q9, d26, d26
+ vmlal.s16 q10, d27, d27
+ vpadal.s16 q8, q14
+ vmlal.s16 q9, d28, d28
+ vmlal.s16 q10, d29, d29
+
+ vpadal.s16 q8, q0 ;sum
+ vmlal.s16 q9, d0, d0 ;sse
+ vmlal.s16 q10, d1, d1
+ vpadal.s16 q8, q1
+ vmlal.s16 q9, d2, d2
+ vmlal.s16 q10, d3, d3
+ vpadal.s16 q8, q2
+ vmlal.s16 q9, d4, d4
+ vmlal.s16 q10, d5, d5
+
+ vmov q0, q15
+
+ vpadal.s16 q8, q3
+ vmlal.s16 q9, d6, d6
+ vmlal.s16 q10, d7, d7
+
+ bne vp8_filt_spo16x16s_0_4_loop_neon
+
+ vadd.u32 q10, q9, q10 ;accumulate sse
+ vpaddl.s32 q0, q8 ;accumulate sum
+
+ vpaddl.u32 q1, q10
+ vadd.s64 d0, d0, d1
+ vadd.u64 d1, d2, d3
+
+ vmull.s32 q5, d0, d0
+ vst1.32 {d1[0]}, [lr] ;store sse
+ vshr.s32 d10, d10, #8
+ vsub.s32 d0, d1, d10
+
+ vmov.32 r0, d0[0] ;return
+ pop {pc}
+ ENDP
+
+;================================================
+;unsigned int vp9_variance_halfpixvar16x16_hv_neon
+;(
+; unsigned char *src_ptr, r0
+; int src_pixels_per_line, r1
+; unsigned char *dst_ptr, r2
+; int dst_pixels_per_line, r3
+; unsigned int *sse
+;);
+;================================================
+|vp9_variance_halfpixvar16x16_hv_neon| PROC
+ push {lr}
+
+ vld1.u8 {d0, d1, d2, d3}, [r0], r1 ;load src data
+
+ ldr lr, [sp, #4] ;load *sse from stack
+ vmov.i8 q13, #0 ;q13 - sum
+ vext.8 q1, q0, q1, #1 ;construct src_ptr[1]
+
+ vmov.i8 q14, #0 ;q14, q15 - sse
+ vmov.i8 q15, #0
+
+ mov r12, #4 ;loop counter
+ vrhadd.u8 q0, q0, q1 ;(src_ptr[0]+src_ptr[1])/round/shift right 1
+
+;First Pass: output_height lines x output_width columns (17x16)
+vp8_filt16x16s_4_4_loop_neon
+ vld1.u8 {d4, d5, d6, d7}, [r0], r1
+ vld1.u8 {d8, d9, d10, d11}, [r0], r1
+ vld1.u8 {d12, d13, d14, d15}, [r0], r1
+ vld1.u8 {d16, d17, d18, d19}, [r0], r1
+
+ ;pld [r0]
+ ;pld [r0, r1]
+ ;pld [r0, r1, lsl #1]
+
+ vext.8 q3, q2, q3, #1 ;construct src_ptr[1]
+ vext.8 q5, q4, q5, #1
+ vext.8 q7, q6, q7, #1
+ vext.8 q9, q8, q9, #1
+
+ vrhadd.u8 q1, q2, q3 ;(src_ptr[0]+src_ptr[1])/round/shift right 1
+ vrhadd.u8 q2, q4, q5
+ vrhadd.u8 q3, q6, q7
+ vrhadd.u8 q4, q8, q9
+
+ vld1.8 {q5}, [r2], r3
+ vrhadd.u8 q0, q0, q1
+ vld1.8 {q6}, [r2], r3
+ vrhadd.u8 q1, q1, q2
+ vld1.8 {q7}, [r2], r3
+ vrhadd.u8 q2, q2, q3
+ vld1.8 {q8}, [r2], r3
+ vrhadd.u8 q3, q3, q4
+
+ vsubl.u8 q9, d0, d10 ;diff
+ vsubl.u8 q10, d1, d11
+ vsubl.u8 q11, d2, d12
+ vsubl.u8 q12, d3, d13
+
+ vsubl.u8 q0, d4, d14 ;diff
+ vsubl.u8 q1, d5, d15
+ vsubl.u8 q5, d6, d16
+ vsubl.u8 q6, d7, d17
+
+ vpadal.s16 q13, q9 ;sum
+ vmlal.s16 q14, d18, d18 ;sse
+ vmlal.s16 q15, d19, d19
+
+ vpadal.s16 q13, q10 ;sum
+ vmlal.s16 q14, d20, d20 ;sse
+ vmlal.s16 q15, d21, d21
+
+ vpadal.s16 q13, q11 ;sum
+ vmlal.s16 q14, d22, d22 ;sse
+ vmlal.s16 q15, d23, d23
+
+ vpadal.s16 q13, q12 ;sum
+ vmlal.s16 q14, d24, d24 ;sse
+ vmlal.s16 q15, d25, d25
+
+ subs r12, r12, #1
+
+ vpadal.s16 q13, q0 ;sum
+ vmlal.s16 q14, d0, d0 ;sse
+ vmlal.s16 q15, d1, d1
+
+ vpadal.s16 q13, q1 ;sum
+ vmlal.s16 q14, d2, d2 ;sse
+ vmlal.s16 q15, d3, d3
+
+ vpadal.s16 q13, q5 ;sum
+ vmlal.s16 q14, d10, d10 ;sse
+ vmlal.s16 q15, d11, d11
+
+ vmov q0, q4
+
+ vpadal.s16 q13, q6 ;sum
+ vmlal.s16 q14, d12, d12 ;sse
+ vmlal.s16 q15, d13, d13
+
+ bne vp8_filt16x16s_4_4_loop_neon
+
+ vadd.u32 q15, q14, q15 ;accumulate sse
+ vpaddl.s32 q0, q13 ;accumulate sum
+
+ vpaddl.u32 q1, q15
+ vadd.s64 d0, d0, d1
+ vadd.u64 d1, d2, d3
+
+ vmull.s32 q5, d0, d0
+ vst1.32 {d1[0]}, [lr] ;store sse
+ vshr.s32 d10, d10, #8
+ vsub.s32 d0, d1, d10
+
+ vmov.32 r0, d0[0] ;return
+ pop {pc}
+ ENDP
+
+;==============================
+; r0 unsigned char *src_ptr,
+; r1 int src_pixels_per_line,
+; r2 int xoffset,
+; r3 int yoffset,
+; stack unsigned char *dst_ptr,
+; stack int dst_pixels_per_line,
+; stack unsigned int *sse
+;note: in vp8_find_best_half_pixel_step() (called when 8 < Speed < 15) and in the first call of
+;vp8_find_best_sub_pixel_step() (called when Speed <= 8), xoffset/yoffset can only be 4 or 0,
+;i.e. the filter is either bypassed or its coefficients are {64, 64}. This simplified routine
+;only works in that situation.
+;note: the case where both xoffset and yoffset are zero can occur; it can be handled in C code later.
+
+|vp9_sub_pixel_variance16x16s_neon| PROC
+ push {r4, lr}
+
+ ldr r4, [sp, #8] ;load *dst_ptr from stack
+ ldr r12, [sp, #12] ;load dst_pixels_per_line from stack
+ ldr lr, [sp, #16] ;load *sse from stack
+
+ cmp r2, #0 ;skip first_pass filter if xoffset=0
+ beq secondpass_bfilter16x16s_only
+
+ cmp r3, #0 ;skip second_pass filter if yoffset=0
+ beq firstpass_bfilter16x16s_only
+
+ vld1.u8 {d0, d1, d2, d3}, [r0], r1 ;load src data
+ sub sp, sp, #256 ;reserve space on stack for temporary storage
+ vext.8 q1, q0, q1, #1 ;construct src_ptr[1]
+ mov r3, sp
+ mov r2, #4 ;loop counter
+ vrhadd.u8 q0, q0, q1 ;(src_ptr[0]+src_ptr[1])/round/shift right 1
+
+;First Pass: output_height lines x output_width columns (17x16)
+vp8e_filt_blk2d_fp16x16s_loop_neon
+ vld1.u8 {d4, d5, d6, d7}, [r0], r1
+ vld1.u8 {d8, d9, d10, d11}, [r0], r1
+ vld1.u8 {d12, d13, d14, d15}, [r0], r1
+ vld1.u8 {d16, d17, d18, d19}, [r0], r1
+
+ ;pld [r0]
+ ;pld [r0, r1]
+ ;pld [r0, r1, lsl #1]
+
+ vext.8 q3, q2, q3, #1 ;construct src_ptr[1]
+ vext.8 q5, q4, q5, #1
+ vext.8 q7, q6, q7, #1
+ vext.8 q9, q8, q9, #1
+
+ vrhadd.u8 q1, q2, q3 ;(src_ptr[0]+src_ptr[1])/round/shift right 1
+ vrhadd.u8 q2, q4, q5
+ vrhadd.u8 q3, q6, q7
+ vrhadd.u8 q4, q8, q9
+
+ vrhadd.u8 q0, q0, q1
+ vrhadd.u8 q1, q1, q2
+ vrhadd.u8 q2, q2, q3
+ vrhadd.u8 q3, q3, q4
+
+ subs r2, r2, #1
+ vst1.u8 {d0, d1 ,d2, d3}, [r3]! ;store result
+ vmov q0, q4
+ vst1.u8 {d4, d5, d6, d7}, [r3]!
+
+ bne vp8e_filt_blk2d_fp16x16s_loop_neon
+
+ b sub_pixel_variance16x16s_neon
+
+;--------------------
+firstpass_bfilter16x16s_only
+ mov r2, #2 ;loop counter
+ sub sp, sp, #256 ;reserve space on stack for temporary storage
+ mov r3, sp
+
+;First Pass: output_height lines x output_width columns (16x16)
+vp8e_filt_blk2d_fpo16x16s_loop_neon
+ vld1.u8 {d0, d1, d2, d3}, [r0], r1 ;load src data
+ vld1.u8 {d4, d5, d6, d7}, [r0], r1
+ vld1.u8 {d8, d9, d10, d11}, [r0], r1
+ vld1.u8 {d12, d13, d14, d15}, [r0], r1
+
+ ;pld [r0]
+ ;pld [r0, r1]
+ ;pld [r0, r1, lsl #1]
+
+ vext.8 q1, q0, q1, #1 ;construct src_ptr[1]
+ vld1.u8 {d16, d17, d18, d19}, [r0], r1
+ vext.8 q3, q2, q3, #1
+ vld1.u8 {d20, d21, d22, d23}, [r0], r1
+ vext.8 q5, q4, q5, #1
+ vld1.u8 {d24, d25, d26, d27}, [r0], r1
+ vext.8 q7, q6, q7, #1
+ vld1.u8 {d28, d29, d30, d31}, [r0], r1
+ vext.8 q9, q8, q9, #1
+ vext.8 q11, q10, q11, #1
+ vext.8 q13, q12, q13, #1
+ vext.8 q15, q14, q15, #1
+
+ vrhadd.u8 q0, q0, q1 ;(src_ptr[0]+src_ptr[1])/round/shift right 1
+ vrhadd.u8 q1, q2, q3
+ vrhadd.u8 q2, q4, q5
+ vrhadd.u8 q3, q6, q7
+ vrhadd.u8 q4, q8, q9
+ vrhadd.u8 q5, q10, q11
+ vrhadd.u8 q6, q12, q13
+ vrhadd.u8 q7, q14, q15
+
+ subs r2, r2, #1
+
+ vst1.u8 {d0, d1, d2, d3}, [r3]! ;store result
+ vst1.u8 {d4, d5, d6, d7}, [r3]!
+ vst1.u8 {d8, d9, d10, d11}, [r3]!
+ vst1.u8 {d12, d13, d14, d15}, [r3]!
+
+ bne vp8e_filt_blk2d_fpo16x16s_loop_neon
+
+ b sub_pixel_variance16x16s_neon
+
+;---------------------
+secondpass_bfilter16x16s_only
+ sub sp, sp, #256 ;reserve space on stack for temporary storage
+
+ mov r2, #2 ;loop counter
+ vld1.u8 {d0, d1}, [r0], r1 ;load src data
+ mov r3, sp
+
+vp8e_filt_blk2d_spo16x16s_loop_neon
+ vld1.u8 {d2, d3}, [r0], r1
+ vld1.u8 {d4, d5}, [r0], r1
+ vld1.u8 {d6, d7}, [r0], r1
+ vld1.u8 {d8, d9}, [r0], r1
+
+ vrhadd.u8 q0, q0, q1
+ vld1.u8 {d10, d11}, [r0], r1
+ vrhadd.u8 q1, q1, q2
+ vld1.u8 {d12, d13}, [r0], r1
+ vrhadd.u8 q2, q2, q3
+ vld1.u8 {d14, d15}, [r0], r1
+ vrhadd.u8 q3, q3, q4
+ vld1.u8 {d16, d17}, [r0], r1
+ vrhadd.u8 q4, q4, q5
+ vrhadd.u8 q5, q5, q6
+ vrhadd.u8 q6, q6, q7
+ vrhadd.u8 q7, q7, q8
+
+ subs r2, r2, #1
+
+ vst1.u8 {d0, d1, d2, d3}, [r3]! ;store result
+ vmov q0, q8
+ vst1.u8 {d4, d5, d6, d7}, [r3]!
+ vst1.u8 {d8, d9, d10, d11}, [r3]! ;store result
+ vst1.u8 {d12, d13, d14, d15}, [r3]!
+
+ bne vp8e_filt_blk2d_spo16x16s_loop_neon
+
+ b sub_pixel_variance16x16s_neon
+
+;----------------------------
+;variance16x16
+sub_pixel_variance16x16s_neon
+ vmov.i8 q8, #0 ;q8 - sum
+ vmov.i8 q9, #0 ;q9, q10 - sse
+ vmov.i8 q10, #0
+
+ sub r3, r3, #256
+ mov r2, #4
+
+sub_pixel_variance16x16s_neon_loop
+ vld1.8 {q0}, [r3]! ;Load up source and reference
+ vld1.8 {q1}, [r4], r12
+ vld1.8 {q2}, [r3]!
+ vld1.8 {q3}, [r4], r12
+ vld1.8 {q4}, [r3]!
+ vld1.8 {q5}, [r4], r12
+ vld1.8 {q6}, [r3]!
+ vld1.8 {q7}, [r4], r12
+
+ vsubl.u8 q11, d0, d2 ;diff
+ vsubl.u8 q12, d1, d3
+ vsubl.u8 q13, d4, d6
+ vsubl.u8 q14, d5, d7
+ vsubl.u8 q0, d8, d10
+ vsubl.u8 q1, d9, d11
+ vsubl.u8 q2, d12, d14
+ vsubl.u8 q3, d13, d15
+
+ vpadal.s16 q8, q11 ;sum
+ vmlal.s16 q9, d22, d22 ;sse
+ vmlal.s16 q10, d23, d23
+
+ subs r2, r2, #1
+
+ vpadal.s16 q8, q12
+ vmlal.s16 q9, d24, d24
+ vmlal.s16 q10, d25, d25
+ vpadal.s16 q8, q13
+ vmlal.s16 q9, d26, d26
+ vmlal.s16 q10, d27, d27
+ vpadal.s16 q8, q14
+ vmlal.s16 q9, d28, d28
+ vmlal.s16 q10, d29, d29
+
+ vpadal.s16 q8, q0 ;sum
+ vmlal.s16 q9, d0, d0 ;sse
+ vmlal.s16 q10, d1, d1
+ vpadal.s16 q8, q1
+ vmlal.s16 q9, d2, d2
+ vmlal.s16 q10, d3, d3
+ vpadal.s16 q8, q2
+ vmlal.s16 q9, d4, d4
+ vmlal.s16 q10, d5, d5
+ vpadal.s16 q8, q3
+ vmlal.s16 q9, d6, d6
+ vmlal.s16 q10, d7, d7
+
+ bne sub_pixel_variance16x16s_neon_loop
+
+ vadd.u32 q10, q9, q10 ;accumulate sse
+ vpaddl.s32 q0, q8 ;accumulate sum
+
+ vpaddl.u32 q1, q10
+ vadd.s64 d0, d0, d1
+ vadd.u64 d1, d2, d3
+
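+ ;variance = sse - ((sum * sum) >> 8); a 16x16 block has 256 pixels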
+ vmull.s32 q5, d0, d0
+ vst1.32 {d1[0]}, [lr] ;store sse
+ vshr.s32 d10, d10, #8
+ vsub.s32 d0, d1, d10
+
+ add sp, sp, #256
+ vmov.32 r0, d0[0] ;return
+
+ pop {r4, pc}
+ ENDP
+
+ END
--- /dev/null
+++ b/vp9/encoder/arm/neon/vp8_subpixelvariance8x8_neon.asm
@@ -1,0 +1,224 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+ EXPORT |vp9_sub_pixel_variance8x8_neon|
+ ARM
+ REQUIRE8
+ PRESERVE8
+
+ AREA ||.text||, CODE, READONLY, ALIGN=2
+; r0 unsigned char *src_ptr,
+; r1 int src_pixels_per_line,
+; r2 int xoffset,
+; r3 int yoffset,
+; stack(r4) unsigned char *dst_ptr,
+; stack(r5) int dst_pixels_per_line,
+; stack(r6) unsigned int *sse
+;note: most of the code is copied from bilinear_predict8x8_neon and vp9_variance8x8_neon.
+
+|vp9_sub_pixel_variance8x8_neon| PROC
+ push {r4-r5, lr}
+
+ ldr r12, _BilinearTaps_coeff_
+ ldr r4, [sp, #12] ;load *dst_ptr from stack
+ ldr r5, [sp, #16] ;load dst_pixels_per_line from stack
+ ldr lr, [sp, #20] ;load *sse from stack
+
+ cmp r2, #0 ;skip first_pass filter if xoffset=0
+ beq skip_firstpass_filter
+
+;First pass: output_height lines x output_width columns (9x8)
+ add r2, r12, r2, lsl #3 ;calculate filter location
+
+ vld1.u8 {q1}, [r0], r1 ;load src data
+ vld1.u32 {d31}, [r2] ;load first_pass filter
+ vld1.u8 {q2}, [r0], r1
+ vdup.8 d0, d31[0] ;first_pass filter (d0 d1)
+ vld1.u8 {q3}, [r0], r1
+ vdup.8 d1, d31[4]
+ vld1.u8 {q4}, [r0], r1
+
+ vmull.u8 q6, d2, d0 ;(src_ptr[0] * Filter[0])
+ vmull.u8 q7, d4, d0
+ vmull.u8 q8, d6, d0
+ vmull.u8 q9, d8, d0
+
+ vext.8 d3, d2, d3, #1 ;construct src_ptr[1]
+ vext.8 d5, d4, d5, #1
+ vext.8 d7, d6, d7, #1
+ vext.8 d9, d8, d9, #1
+
+ vmlal.u8 q6, d3, d1 ;(src_ptr[1] * Filter[1])
+ vmlal.u8 q7, d5, d1
+ vmlal.u8 q8, d7, d1
+ vmlal.u8 q9, d9, d1
+
+ vld1.u8 {q1}, [r0], r1 ;load src data
+ vqrshrn.u16 d22, q6, #7 ;shift/round/saturate to u8
+ vld1.u8 {q2}, [r0], r1
+ vqrshrn.u16 d23, q7, #7
+ vld1.u8 {q3}, [r0], r1
+ vqrshrn.u16 d24, q8, #7
+ vld1.u8 {q4}, [r0], r1
+ vqrshrn.u16 d25, q9, #7
+
+ ;first_pass filtering on the remaining 5 lines of data
+ vld1.u8 {q5}, [r0], r1
+
+ vmull.u8 q6, d2, d0 ;(src_ptr[0] * Filter[0])
+ vmull.u8 q7, d4, d0
+ vmull.u8 q8, d6, d0
+ vmull.u8 q9, d8, d0
+ vmull.u8 q10, d10, d0
+
+ vext.8 d3, d2, d3, #1 ;construct src_ptr[1]
+ vext.8 d5, d4, d5, #1
+ vext.8 d7, d6, d7, #1
+ vext.8 d9, d8, d9, #1
+ vext.8 d11, d10, d11, #1
+
+ vmlal.u8 q6, d3, d1 ;(src_ptr[1] * Filter[1])
+ vmlal.u8 q7, d5, d1
+ vmlal.u8 q8, d7, d1
+ vmlal.u8 q9, d9, d1
+ vmlal.u8 q10, d11, d1
+
+ vqrshrn.u16 d26, q6, #7 ;shift/round/saturate to u8
+ vqrshrn.u16 d27, q7, #7
+ vqrshrn.u16 d28, q8, #7
+ vqrshrn.u16 d29, q9, #7
+ vqrshrn.u16 d30, q10, #7
+
+;Second pass: 8x8
+secondpass_filter
+ cmp r3, #0 ;skip second_pass filter if yoffset=0
+ ;skip_secondpass_filter
+ beq sub_pixel_variance8x8_neon
+
+ add r3, r12, r3, lsl #3
+
+ vld1.u32 {d31}, [r3] ;load second_pass filter
+
+ vdup.8 d0, d31[0] ;second_pass filter parameters (d0 d1)
+ vdup.8 d1, d31[4]
+
+ vmull.u8 q1, d22, d0 ;(src_ptr[0] * Filter[0])
+ vmull.u8 q2, d23, d0
+ vmull.u8 q3, d24, d0
+ vmull.u8 q4, d25, d0
+ vmull.u8 q5, d26, d0
+ vmull.u8 q6, d27, d0
+ vmull.u8 q7, d28, d0
+ vmull.u8 q8, d29, d0
+
+ vmlal.u8 q1, d23, d1 ;(src_ptr[pixel_step] * Filter[1])
+ vmlal.u8 q2, d24, d1
+ vmlal.u8 q3, d25, d1
+ vmlal.u8 q4, d26, d1
+ vmlal.u8 q5, d27, d1
+ vmlal.u8 q6, d28, d1
+ vmlal.u8 q7, d29, d1
+ vmlal.u8 q8, d30, d1
+
+ vqrshrn.u16 d22, q1, #7 ;shift/round/saturate to u8
+ vqrshrn.u16 d23, q2, #7
+ vqrshrn.u16 d24, q3, #7
+ vqrshrn.u16 d25, q4, #7
+ vqrshrn.u16 d26, q5, #7
+ vqrshrn.u16 d27, q6, #7
+ vqrshrn.u16 d28, q7, #7
+ vqrshrn.u16 d29, q8, #7
+
+ b sub_pixel_variance8x8_neon
+
+;--------------------
+skip_firstpass_filter
+ vld1.u8 {d22}, [r0], r1 ;load src data
+ vld1.u8 {d23}, [r0], r1
+ vld1.u8 {d24}, [r0], r1
+ vld1.u8 {d25}, [r0], r1
+ vld1.u8 {d26}, [r0], r1
+ vld1.u8 {d27}, [r0], r1
+ vld1.u8 {d28}, [r0], r1
+ vld1.u8 {d29}, [r0], r1
+ vld1.u8 {d30}, [r0], r1
+
+ b secondpass_filter
+
+;----------------------
+;vp9_variance8x8_neon
+sub_pixel_variance8x8_neon
+ vmov.i8 q8, #0 ;q8 - sum
+ vmov.i8 q9, #0 ;q9, q10 - sse
+ vmov.i8 q10, #0
+
+ mov r12, #2
+
+sub_pixel_variance8x8_neon_loop
+ vld1.8 {d0}, [r4], r5 ;load dst data
+ subs r12, r12, #1
+ vld1.8 {d1}, [r4], r5
+ vld1.8 {d2}, [r4], r5
+ vsubl.u8 q4, d22, d0 ;calculate diff
+ vld1.8 {d3}, [r4], r5
+
+ vsubl.u8 q5, d23, d1
+ vsubl.u8 q6, d24, d2
+
+ vpadal.s16 q8, q4 ;sum
+ vmlal.s16 q9, d8, d8 ;sse
+ vmlal.s16 q10, d9, d9
+
+ vsubl.u8 q7, d25, d3
+
+ vpadal.s16 q8, q5
+ vmlal.s16 q9, d10, d10
+ vmlal.s16 q10, d11, d11
+
+ vmov q11, q13
+
+ vpadal.s16 q8, q6
+ vmlal.s16 q9, d12, d12
+ vmlal.s16 q10, d13, d13
+
+ vmov q12, q14
+
+ vpadal.s16 q8, q7
+ vmlal.s16 q9, d14, d14
+ vmlal.s16 q10, d15, d15
+
+ bne sub_pixel_variance8x8_neon_loop
+
+ vadd.u32 q10, q9, q10 ;accumulate sse
+ vpaddl.s32 q0, q8 ;accumulate sum
+
+ vpaddl.u32 q1, q10
+ vadd.s64 d0, d0, d1
+ vadd.u64 d1, d2, d3
+
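+ ;variance = sse - ((sum * sum) >> 6); an 8x8 block has 64 pixels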
+ vmull.s32 q5, d0, d0
+ vst1.32 {d1[0]}, [lr] ;store sse
+ vshr.s32 d10, d10, #6
+ vsub.s32 d0, d1, d10
+
+ vmov.32 r0, d0[0] ;return
+ pop {r4-r5, pc}
+
+ ENDP
+
+;-----------------
+
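+;eight pairs of bilinear taps, one pair per subpel offset; each pair sums to 128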
+_BilinearTaps_coeff_
+ DCD bilinear_taps_coeff
+bilinear_taps_coeff
+ DCD 128, 0, 112, 16, 96, 32, 80, 48, 64, 64, 48, 80, 32, 96, 16, 112
+
+ END
--- /dev/null
+++ b/vp9/encoder/arm/quantize_arm.c
@@ -1,0 +1,59 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include <math.h>
+#include "vpx_mem/vpx_mem.h"
+
+#include "vp9/encoder/quantize.h"
+#include "vp9/common/entropy.h"
+
+
+#if HAVE_ARMV7
+
+/* The vp8_quantize_mbX functions here differ from the corresponding ones in
+ * quantize.c only by using the quantize_b_pair function pointer instead of
+ * the regular quantize_b function pointer. */
+void vp8_quantize_mby_neon(MACROBLOCK *x) {
+ int i;
+ int has_2nd_order = (x->e_mbd.mode_info_context->mbmi.mode != B_PRED
+ && x->e_mbd.mode_info_context->mbmi.mode != SPLITMV);
+
+ for (i = 0; i < 16; i += 2)
+ x->quantize_b_pair(&x->block[i], &x->block[i + 1],
+ &x->e_mbd.block[i], &x->e_mbd.block[i + 1]);
+
+ if (has_2nd_order)
+ x->quantize_b(&x->block[24], &x->e_mbd.block[24]);
+}
+
+void vp8_quantize_mb_neon(MACROBLOCK *x) {
+ int i;
+ int has_2nd_order = (x->e_mbd.mode_info_context->mbmi.mode != B_PRED
+ && x->e_mbd.mode_info_context->mbmi.mode != SPLITMV);
+
+ for (i = 0; i < 24; i += 2)
+ x->quantize_b_pair(&x->block[i], &x->block[i + 1],
+ &x->e_mbd.block[i], &x->e_mbd.block[i + 1]);
+
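+ /* i is 24 here, so this quantizes the second order (Y2) block */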
+ if (has_2nd_order)
+ x->quantize_b(&x->block[i], &x->e_mbd.block[i]);
+}
+
+
+void vp8_quantize_mbuv_neon(MACROBLOCK *x) {
+ int i;
+
+ for (i = 16; i < 24; i += 2)
+ x->quantize_b_pair(&x->block[i], &x->block[i + 1],
+ &x->e_mbd.block[i], &x->e_mbd.block[i + 1]);
+}
+
+#endif /* HAVE_ARMV7 */
--- /dev/null
+++ b/vp9/encoder/arm/quantize_arm.h
@@ -1,0 +1,52 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef QUANTIZE_ARM_H
+#define QUANTIZE_ARM_H
+
+#if HAVE_ARMV6
+
+extern prototype_quantize_block(vp8_fast_quantize_b_armv6);
+
+#if !CONFIG_RUNTIME_CPU_DETECT
+#undef vp8_quantize_fastquantb
+#define vp8_quantize_fastquantb vp8_fast_quantize_b_armv6
+#endif
+
+#endif /* HAVE_ARMV6 */
+
+
+#if HAVE_ARMV7
+
+extern prototype_quantize_block(vp8_fast_quantize_b_neon);
+extern prototype_quantize_block_pair(vp8_fast_quantize_b_pair_neon);
+
+#if !CONFIG_RUNTIME_CPU_DETECT
+#undef vp8_quantize_fastquantb
+#define vp8_quantize_fastquantb vp8_fast_quantize_b_neon
+
+#undef vp8_quantize_fastquantb_pair
+#define vp8_quantize_fastquantb_pair vp8_fast_quantize_b_pair_neon
+
+#undef vp8_quantize_mb
+#define vp8_quantize_mb vp8_quantize_mb_neon
+
+#undef vp8_quantize_mbuv
+#define vp8_quantize_mbuv vp8_quantize_mbuv_neon
+
+#undef vp8_quantize_mby
+#define vp8_quantize_mby vp8_quantize_mby_neon
+#endif
+
+#endif /* HAVE_ARMV7 */
+
+#endif
+
--- /dev/null
+++ b/vp9/encoder/arm/variance_arm.c
@@ -1,0 +1,112 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "vpx_config.h"
+#include "vp9/encoder/variance.h"
+#include "vp9/common/filter.h"
+#include "vp9/common/arm/bilinearfilter_arm.h"
+
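+/* HALFNDX is the bilinear filter index for the half-pixel offset; the
+ * half-pixel cases below are routed to the dedicated halfpixvar kernels. */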
+#define HALFNDX 8
+
+#if HAVE_ARMV6
+
+unsigned int vp9_sub_pixel_variance8x8_armv6
+(
+ const unsigned char *src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ const unsigned char *dst_ptr,
+ int dst_pixels_per_line,
+ unsigned int *sse
+) {
+ unsigned short first_pass[10 * 8];
+ unsigned char second_pass[8 * 8];
+ const short *HFilter, *VFilter;
+
+ HFilter = vp8_bilinear_filters[xoffset];
+ VFilter = vp8_bilinear_filters[yoffset];
+
+ vp9_filter_block2d_bil_first_pass_armv6(src_ptr, first_pass,
+ src_pixels_per_line,
+ 9, 8, HFilter);
+ vp9_filter_block2d_bil_second_pass_armv6(first_pass, second_pass,
+ 8, 8, 8, VFilter);
+
+ return vp9_variance8x8_armv6(second_pass, 8, dst_ptr,
+ dst_pixels_per_line, sse);
+}
+
+unsigned int vp9_sub_pixel_variance16x16_armv6
+(
+ const unsigned char *src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ const unsigned char *dst_ptr,
+ int dst_pixels_per_line,
+ unsigned int *sse
+) {
+ unsigned short first_pass[36 * 16];
+ unsigned char second_pass[20 * 16];
+ const short *HFilter, *VFilter;
+ unsigned int var;
+
+ if (xoffset == HALFNDX && yoffset == 0) {
+ var = vp9_variance_halfpixvar16x16_h_armv6(src_ptr, src_pixels_per_line,
+ dst_ptr, dst_pixels_per_line, sse);
+ } else if (xoffset == 0 && yoffset == HALFNDX) {
+ var = vp9_variance_halfpixvar16x16_v_armv6(src_ptr, src_pixels_per_line,
+ dst_ptr, dst_pixels_per_line, sse);
+ } else if (xoffset == HALFNDX && yoffset == HALFNDX) {
+ var = vp9_variance_halfpixvar16x16_hv_armv6(src_ptr, src_pixels_per_line,
+ dst_ptr, dst_pixels_per_line, sse);
+ } else {
+ HFilter = vp8_bilinear_filters[xoffset];
+ VFilter = vp8_bilinear_filters[yoffset];
+
+ vp9_filter_block2d_bil_first_pass_armv6(src_ptr, first_pass,
+ src_pixels_per_line,
+ 17, 16, HFilter);
+ vp9_filter_block2d_bil_second_pass_armv6(first_pass, second_pass,
+ 16, 16, 16, VFilter);
+
+ var = vp9_variance16x16_armv6(second_pass, 16, dst_ptr,
+ dst_pixels_per_line, sse);
+ }
+ return var;
+}
+
+#endif /* HAVE_ARMV6 */
+
+
+#if HAVE_ARMV7
+
+unsigned int vp9_sub_pixel_variance16x16_neon
+(
+ const unsigned char *src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ const unsigned char *dst_ptr,
+ int dst_pixels_per_line,
+ unsigned int *sse
+) {
+ if (xoffset == HALFNDX && yoffset == 0)
+ return vp9_variance_halfpixvar16x16_h_neon(src_ptr, src_pixels_per_line, dst_ptr, dst_pixels_per_line, sse);
+ else if (xoffset == 0 && yoffset == HALFNDX)
+ return vp9_variance_halfpixvar16x16_v_neon(src_ptr, src_pixels_per_line, dst_ptr, dst_pixels_per_line, sse);
+ else if (xoffset == HALFNDX && yoffset == HALFNDX)
+ return vp9_variance_halfpixvar16x16_hv_neon(src_ptr, src_pixels_per_line, dst_ptr, dst_pixels_per_line, sse);
+ else
+ return vp9_sub_pixel_variance16x16_neon_func(src_ptr, src_pixels_per_line, xoffset, yoffset, dst_ptr, dst_pixels_per_line, sse);
+}
+
+#endif
--- /dev/null
+++ b/vp9/encoder/arm/variance_arm.h
@@ -1,0 +1,132 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef VARIANCE_ARM_H
+#define VARIANCE_ARM_H
+
+#if HAVE_ARMV6
+
+extern prototype_sad(vp9_sad16x16_armv6);
+extern prototype_variance(vp9_variance16x16_armv6);
+extern prototype_variance(vp9_variance8x8_armv6);
+extern prototype_subpixvariance(vp9_sub_pixel_variance16x16_armv6);
+extern prototype_subpixvariance(vp9_sub_pixel_variance8x8_armv6);
+extern prototype_variance(vp9_variance_halfpixvar16x16_h_armv6);
+extern prototype_variance(vp9_variance_halfpixvar16x16_v_armv6);
+extern prototype_variance(vp9_variance_halfpixvar16x16_hv_armv6);
+extern prototype_variance(vp9_mse16x16_armv6);
+
+#if !CONFIG_RUNTIME_CPU_DETECT
+
+#undef vp9_variance_sad16x16
+#define vp9_variance_sad16x16 vp9_sad16x16_armv6
+
+#undef vp9_variance_subpixvar16x16
+#define vp9_variance_subpixvar16x16 vp9_sub_pixel_variance16x16_armv6
+
+#undef vp9_variance_subpixvar8x8
+#define vp9_variance_subpixvar8x8 vp9_sub_pixel_variance8x8_armv6
+
+#undef vp9_variance_var16x16
+#define vp9_variance_var16x16 vp9_variance16x16_armv6
+
+#undef vp9_variance_mse16x16
+#define vp9_variance_mse16x16 vp9_mse16x16_armv6
+
+#undef vp9_variance_var8x8
+#define vp9_variance_var8x8 vp9_variance8x8_armv6
+
+#undef vp9_variance_halfpixvar16x16_h
+#define vp9_variance_halfpixvar16x16_h vp9_variance_halfpixvar16x16_h_armv6
+
+#undef vp9_variance_halfpixvar16x16_v
+#define vp9_variance_halfpixvar16x16_v vp9_variance_halfpixvar16x16_v_armv6
+
+#undef vp9_variance_halfpixvar16x16_hv
+#define vp9_variance_halfpixvar16x16_hv vp9_variance_halfpixvar16x16_hv_armv6
+
+#endif /* !CONFIG_RUNTIME_CPU_DETECT */
+
+#endif /* HAVE_ARMV6 */
+
+
+#if HAVE_ARMV7
+extern prototype_sad(vp9_sad4x4_neon);
+extern prototype_sad(vp9_sad8x8_neon);
+extern prototype_sad(vp9_sad8x16_neon);
+extern prototype_sad(vp9_sad16x8_neon);
+extern prototype_sad(vp9_sad16x16_neon);
+
+extern prototype_variance(vp9_variance8x8_neon);
+extern prototype_variance(vp9_variance8x16_neon);
+extern prototype_variance(vp9_variance16x8_neon);
+extern prototype_variance(vp9_variance16x16_neon);
+
+extern prototype_subpixvariance(vp9_sub_pixel_variance8x8_neon);
+extern prototype_subpixvariance(vp9_sub_pixel_variance16x16_neon);
+extern prototype_subpixvariance(vp9_sub_pixel_variance16x16_neon_func);
+extern prototype_variance(vp9_variance_halfpixvar16x16_h_neon);
+extern prototype_variance(vp9_variance_halfpixvar16x16_v_neon);
+extern prototype_variance(vp9_variance_halfpixvar16x16_hv_neon);
+
+extern prototype_variance(vp9_mse16x16_neon);
+
+#if !CONFIG_RUNTIME_CPU_DETECT
+#undef vp9_variance_sad4x4
+#define vp9_variance_sad4x4 vp9_sad4x4_neon
+
+#undef vp9_variance_sad8x8
+#define vp9_variance_sad8x8 vp9_sad8x8_neon
+
+#undef vp9_variance_sad8x16
+#define vp9_variance_sad8x16 vp9_sad8x16_neon
+
+#undef vp9_variance_sad16x8
+#define vp9_variance_sad16x8 vp9_sad16x8_neon
+
+#undef vp9_variance_sad16x16
+#define vp9_variance_sad16x16 vp9_sad16x16_neon
+
+#undef vp9_variance_var8x8
+#define vp9_variance_var8x8 vp9_variance8x8_neon
+
+#undef vp9_variance_var8x16
+#define vp9_variance_var8x16 vp9_variance8x16_neon
+
+#undef vp9_variance_var16x8
+#define vp9_variance_var16x8 vp9_variance16x8_neon
+
+#undef vp9_variance_var16x16
+#define vp9_variance_var16x16 vp9_variance16x16_neon
+
+#undef vp9_variance_subpixvar8x8
+#define vp9_variance_subpixvar8x8 vp9_sub_pixel_variance8x8_neon
+
+#undef vp9_variance_subpixvar16x16
+#define vp9_variance_subpixvar16x16 vp9_sub_pixel_variance16x16_neon
+
+#undef vp9_variance_halfpixvar16x16_h
+#define vp9_variance_halfpixvar16x16_h vp9_variance_halfpixvar16x16_h_neon
+
+#undef vp9_variance_halfpixvar16x16_v
+#define vp9_variance_halfpixvar16x16_v vp9_variance_halfpixvar16x16_v_neon
+
+#undef vp9_variance_halfpixvar16x16_hv
+#define vp9_variance_halfpixvar16x16_hv vp9_variance_halfpixvar16x16_hv_neon
+
+#undef vp9_variance_mse16x16
+#define vp9_variance_mse16x16 vp9_mse16x16_neon
+
+#endif
+
+#endif
+
+#endif
--- /dev/null
+++ b/vp9/encoder/asm_enc_offsets.c
@@ -1,0 +1,90 @@
+/*
+ * Copyright (c) 2011 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "vpx_ports/asm_offsets.h"
+#include "vpx_config.h"
+#include "block.h"
+#include "vp9/common/blockd.h"
+#include "onyx_int.h"
+#include "treewriter.h"
+#include "tokenize.h"
+
+BEGIN
+
+/* regular quantize */
+DEFINE(vp9_block_coeff, offsetof(BLOCK, coeff));
+DEFINE(vp9_block_zbin, offsetof(BLOCK, zbin));
+DEFINE(vp9_block_round, offsetof(BLOCK, round));
+DEFINE(vp9_block_quant, offsetof(BLOCK, quant));
+DEFINE(vp9_block_quant_fast, offsetof(BLOCK, quant_fast));
+DEFINE(vp9_block_zbin_extra, offsetof(BLOCK, zbin_extra));
+DEFINE(vp9_block_zrun_zbin_boost, offsetof(BLOCK, zrun_zbin_boost));
+DEFINE(vp9_block_quant_shift, offsetof(BLOCK, quant_shift));
+
+DEFINE(vp9_blockd_qcoeff, offsetof(BLOCKD, qcoeff));
+DEFINE(vp9_blockd_dequant, offsetof(BLOCKD, dequant));
+DEFINE(vp9_blockd_dqcoeff, offsetof(BLOCKD, dqcoeff));
+DEFINE(vp9_blockd_eob, offsetof(BLOCKD, eob));
+
+/* subtract */
+DEFINE(vp9_block_base_src, offsetof(BLOCK, base_src));
+DEFINE(vp9_block_src, offsetof(BLOCK, src));
+DEFINE(vp9_block_src_diff, offsetof(BLOCK, src_diff));
+DEFINE(vp9_block_src_stride, offsetof(BLOCK, src_stride));
+
+DEFINE(vp9_blockd_predictor, offsetof(BLOCKD, predictor));
+
+/* pack tokens */
+DEFINE(vp9_writer_lowvalue, offsetof(vp9_writer, lowvalue));
+DEFINE(vp9_writer_range, offsetof(vp9_writer, range));
+DEFINE(vp9_writer_value, offsetof(vp9_writer, value));
+DEFINE(vp9_writer_count, offsetof(vp9_writer, count));
+DEFINE(vp9_writer_pos, offsetof(vp9_writer, pos));
+DEFINE(vp9_writer_buffer, offsetof(vp9_writer, buffer));
+
+DEFINE(tokenextra_token, offsetof(TOKENEXTRA, Token));
+DEFINE(tokenextra_extra, offsetof(TOKENEXTRA, Extra));
+DEFINE(tokenextra_context_tree, offsetof(TOKENEXTRA, context_tree));
+DEFINE(tokenextra_skip_eob_node, offsetof(TOKENEXTRA, skip_eob_node));
+DEFINE(TOKENEXTRA_SZ, sizeof(TOKENEXTRA));
+
+DEFINE(vp9_extra_bit_struct_sz, sizeof(vp9_extra_bit_struct));
+
+DEFINE(vp9_token_value, offsetof(vp9_token, value));
+DEFINE(vp9_token_len, offsetof(vp9_token, Len));
+
+DEFINE(vp9_extra_bit_struct_tree, offsetof(vp9_extra_bit_struct, tree));
+DEFINE(vp9_extra_bit_struct_prob, offsetof(vp9_extra_bit_struct, prob));
+DEFINE(vp9_extra_bit_struct_len, offsetof(vp9_extra_bit_struct, Len));
+DEFINE(vp9_extra_bit_struct_base_val, offsetof(vp9_extra_bit_struct, base_val));
+
+DEFINE(vp9_comp_tplist, offsetof(VP9_COMP, tplist));
+DEFINE(vp9_comp_common, offsetof(VP9_COMP, common));
+
+DEFINE(tokenlist_start, offsetof(TOKENLIST, start));
+DEFINE(tokenlist_stop, offsetof(TOKENLIST, stop));
+DEFINE(TOKENLIST_SZ, sizeof(TOKENLIST));
+
+DEFINE(vp9_common_mb_rows, offsetof(VP9_COMMON, mb_rows));
+
+END
+
+/* Add asserts for any offset that is not supported by assembly code.
+ * Add asserts for any size that is not supported by assembly code.
+ *
+ * These are used in vp8cx_pack_tokens. They are hard coded so if their sizes
+ * change they will have to be adjusted.
+ */
+
+#if HAVE_ARMV5TE
+ct_assert(TOKENEXTRA_SZ, sizeof(TOKENEXTRA) == 8)
+ct_assert(vp9_extra_bit_struct_sz, sizeof(vp9_extra_bit_struct) == 16)
+#endif
--- /dev/null
+++ b/vp9/encoder/bitstream.c
@@ -1,0 +1,2394 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "vp9/common/header.h"
+#include "encodemv.h"
+#include "vp9/common/entropymode.h"
+#include "vp9/common/findnearmv.h"
+#include "mcomp.h"
+#include "vp9/common/systemdependent.h"
+#include <assert.h>
+#include <stdio.h>
+#include <limits.h>
+#include "vp9/common/pragmas.h"
+#include "vpx/vpx_encoder.h"
+#include "vpx_mem/vpx_mem.h"
+#include "bitstream.h"
+#include "segmentation.h"
+
+#include "vp9/common/seg_common.h"
+#include "vp9/common/pred_common.h"
+#include "vp9/common/entropy.h"
+#include "vp9/encoder/encodemv.h"
+#include "vp9/common/entropymv.h"
+
+#if CONFIG_NEWBESTREFMV
+#include "vp9/common/mvref_common.h"
+#endif
+
+#if defined(SECTIONBITS_OUTPUT)
+unsigned __int64 Sectionbits[500];
+#endif
+
+#ifdef ENTROPY_STATS
+int intra_mode_stats [VP9_BINTRAMODES] [VP9_BINTRAMODES] [VP9_BINTRAMODES];
+unsigned int tree_update_hist [BLOCK_TYPES]
+ [COEF_BANDS]
+ [PREV_COEF_CONTEXTS]
+ [ENTROPY_NODES][2];
+unsigned int hybrid_tree_update_hist [BLOCK_TYPES]
+ [COEF_BANDS]
+ [PREV_COEF_CONTEXTS]
+ [ENTROPY_NODES][2];
+unsigned int tree_update_hist_8x8 [BLOCK_TYPES_8X8]
+ [COEF_BANDS]
+ [PREV_COEF_CONTEXTS]
+ [ENTROPY_NODES] [2];
+unsigned int hybrid_tree_update_hist_8x8 [BLOCK_TYPES_8X8]
+ [COEF_BANDS]
+ [PREV_COEF_CONTEXTS]
+ [ENTROPY_NODES] [2];
+unsigned int tree_update_hist_16x16 [BLOCK_TYPES_16X16]
+ [COEF_BANDS]
+ [PREV_COEF_CONTEXTS]
+ [ENTROPY_NODES] [2];
+unsigned int hybrid_tree_update_hist_16x16 [BLOCK_TYPES_16X16]
+ [COEF_BANDS]
+ [PREV_COEF_CONTEXTS]
+ [ENTROPY_NODES] [2];
+
+extern unsigned int active_section;
+#endif
+
+#ifdef MODE_STATS
+int count_mb_seg[4] = { 0, 0, 0, 0 };
+#endif
+
+#define vp9_cost_upd ((int)(vp9_cost_one(upd) - vp9_cost_zero(upd)) >> 8)
+#define vp9_cost_upd256 ((int)(vp9_cost_one(upd) - vp9_cost_zero(upd)))
+
+#define SEARCH_NEWP
+static int update_bits[255];
+
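+/* Precompute the number of bits needed to signal each possible
+ * probability-update delta with the terminated subexponential code. */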
+static void compute_update_table() {
+ int i;
+ for (i = 0; i < 255; i++)
+ update_bits[i] = vp9_count_term_subexp(i, SUBEXP_PARAM, 255);
+}
+
+static int split_index(int i, int n, int modulus) {
+ int max1 = (n - 1 - modulus / 2) / modulus + 1;
+ if (i % modulus == modulus / 2) i = i / modulus;
+ else i = max1 + i - (i + modulus - modulus / 2) / modulus;
+ return i;
+}
+
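+/* Map a new probability to an index relative to the old one: recenter it
+ * around m, then fold with split_index so small deltas get small indices
+ * and therefore short subexponential codes. */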
+static int remap_prob(int v, int m) {
+ const int n = 256;
+ const int modulus = MODULUS_PARAM;
+ int i;
+ if ((m << 1) <= n)
+ i = vp9_recenter_nonneg(v, m) - 1;
+ else
+ i = vp9_recenter_nonneg(n - 1 - v, n - 1 - m) - 1;
+
+ i = split_index(i, n - 1, modulus);
+ return i;
+}
+
+static void write_prob_diff_update(vp9_writer *const bc,
+ vp9_prob newp, vp9_prob oldp) {
+ int delp = remap_prob(newp, oldp);
+ vp9_encode_term_subexp(bc, delp, SUBEXP_PARAM, 255);
+}
+
+static int prob_diff_update_cost(vp9_prob newp, vp9_prob oldp) {
+ int delp = remap_prob(newp, oldp);
+ return update_bits[delp] * 256;
+}
+
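+/* Re-estimate the tree probabilities from the observed counts and transmit
+ * them only if the coding gain exceeds the cost of sending the new values. */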
+static void update_mode(
+ vp9_writer *const bc,
+ int n,
+ vp9_token tok [/* n */],
+ vp9_tree tree,
+ vp9_prob Pnew [/* n-1 */],
+ vp9_prob Pcur [/* n-1 */],
+ unsigned int bct [/* n-1 */] [2],
+ const unsigned int num_events[/* n */]
+) {
+ unsigned int new_b = 0, old_b = 0;
+ int i = 0;
+
+ vp9_tree_probs_from_distribution(
+ n--, tok, tree,
+ Pnew, bct, num_events,
+ 256, 1
+ );
+
+ do {
+ new_b += cost_branch(bct[i], Pnew[i]);
+ old_b += cost_branch(bct[i], Pcur[i]);
+ } while (++i < n);
+
+ if (new_b + (n << 8) < old_b) {
+ int i = 0;
+
+ vp9_write_bit(bc, 1);
+
+ do {
+ const vp9_prob p = Pnew[i];
+
+ vp9_write_literal(bc, Pcur[i] = p ? p : 1, 8);
+ } while (++i < n);
+ } else
+ vp9_write_bit(bc, 0);
+}
+
+static void update_mbintra_mode_probs(VP9_COMP* const cpi,
+ vp9_writer* const bc) {
+ VP9_COMMON *const cm = &cpi->common;
+
+ {
+ vp9_prob Pnew [VP9_YMODES - 1];
+ unsigned int bct [VP9_YMODES - 1] [2];
+
+ update_mode(
+ bc, VP9_YMODES, vp9_ymode_encodings, vp9_ymode_tree,
+ Pnew, cm->fc.ymode_prob, bct, (unsigned int *)cpi->ymode_count
+ );
+ }
+}
+
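+/* Convert the count ratio num / den into a probability clamped to [1, 255];
+ * returns 128 if den is not positive. */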
+static int get_prob(int num, int den) {
+ int p;
+ if (den <= 0)
+ return 128;
+ p = (num * 255 + (den >> 1)) / den;
+ if (p > 255)
+ return 255;
+ else if (p < 1)
+ return 1;
+ return p;
+}
+
+static int get_binary_prob(int n0, int n1) {
+ return get_prob(n0, n0 + n1);
+}
+
+void vp9_update_skip_probs(VP9_COMP *cpi) {
+ VP9_COMMON *const pc = &cpi->common;
+ int prob_skip_false[3] = {0, 0, 0};
+ int k;
+
+ for (k = 0; k < MBSKIP_CONTEXTS; ++k) {
+ pc->mbskip_pred_probs[k] = get_binary_prob(cpi->skip_false_count[k],
+ cpi->skip_true_count[k]);
+ }
+}
+
+static void update_switchable_interp_probs(VP9_COMP *cpi,
+ vp9_writer* const bc) {
+ VP9_COMMON *const pc = &cpi->common;
+ unsigned int branch_ct[32][2];
+ int i, j;
+ for (j = 0; j <= VP9_SWITCHABLE_FILTERS; ++j) {
+ vp9_tree_probs_from_distribution(
+ VP9_SWITCHABLE_FILTERS,
+ vp9_switchable_interp_encodings, vp9_switchable_interp_tree,
+ pc->fc.switchable_interp_prob[j], branch_ct,
+ cpi->switchable_interp_count[j], 256, 1);
+ for (i = 0; i < VP9_SWITCHABLE_FILTERS - 1; ++i) {
+ if (pc->fc.switchable_interp_prob[j][i] < 1)
+ pc->fc.switchable_interp_prob[j][i] = 1;
+ vp9_write_literal(bc, pc->fc.switchable_interp_prob[j][i], 8);
+ }
+ }
+}
+
+// This function updates the reference frame prediction stats
+static void update_refpred_stats(VP9_COMP *cpi) {
+ VP9_COMMON *const cm = &cpi->common;
+ int i;
+ int tot_count;
+ vp9_prob new_pred_probs[PREDICTION_PROBS];
+ int old_cost, new_cost;
+
+ // Set the prediction probability structures to defaults
+ if (cm->frame_type == KEY_FRAME) {
+ // Set the prediction probabilities to defaults
+ cm->ref_pred_probs[0] = 120;
+ cm->ref_pred_probs[1] = 80;
+ cm->ref_pred_probs[2] = 40;
+
+ vpx_memset(cpi->ref_pred_probs_update, 0,
+ sizeof(cpi->ref_pred_probs_update));
+ } else {
+ // From the prediction counts set the probabilities for each context
+ for (i = 0; i < PREDICTION_PROBS; i++) {
+ new_pred_probs[i] = get_binary_prob(cpi->ref_pred_count[i][0],
+ cpi->ref_pred_count[i][1]);
+
+ // Decide whether or not to update the reference frame probs.
+ // Returned costs are in 1/256 bit units.
+ old_cost =
+ (cpi->ref_pred_count[i][0] * vp9_cost_zero(cm->ref_pred_probs[i])) +
+ (cpi->ref_pred_count[i][1] * vp9_cost_one(cm->ref_pred_probs[i]));
+
+ new_cost =
+ (cpi->ref_pred_count[i][0] * vp9_cost_zero(new_pred_probs[i])) +
+ (cpi->ref_pred_count[i][1] * vp9_cost_one(new_pred_probs[i]));
+
+ // Cost saving must be >= 8 bits (2048 in these units)
+ if ((old_cost - new_cost) >= 2048) {
+ cpi->ref_pred_probs_update[i] = 1;
+ cm->ref_pred_probs[i] = new_pred_probs[i];
+ } else
+ cpi->ref_pred_probs_update[i] = 0;
+
+ }
+ }
+}
+
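+/* Accumulate NMV statistics for the motion vector(s) coded in this
+ * macroblock, measured relative to the chosen reference MVs. */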
+static void update_mvcount(VP9_COMP *cpi, MACROBLOCK *x,
+ int_mv *best_ref_mv, int_mv *second_best_ref_mv) {
+ MB_MODE_INFO * mbmi = &x->e_mbd.mode_info_context->mbmi;
+ MV mv;
+
+ if (mbmi->mode == SPLITMV) {
+ int i;
+
+ for (i = 0; i < x->partition_info->count; i++) {
+ if (x->partition_info->bmi[i].mode == NEW4X4) {
+ if (x->e_mbd.allow_high_precision_mv) {
+ mv.row = (x->partition_info->bmi[i].mv.as_mv.row
+ - best_ref_mv->as_mv.row);
+ mv.col = (x->partition_info->bmi[i].mv.as_mv.col
+ - best_ref_mv->as_mv.col);
+ vp9_increment_nmv(&mv, &best_ref_mv->as_mv, &cpi->NMVcount, 1);
+ if (x->e_mbd.mode_info_context->mbmi.second_ref_frame) {
+ mv.row = (x->partition_info->bmi[i].second_mv.as_mv.row
+ - second_best_ref_mv->as_mv.row);
+ mv.col = (x->partition_info->bmi[i].second_mv.as_mv.col
+ - second_best_ref_mv->as_mv.col);
+ vp9_increment_nmv(&mv, &second_best_ref_mv->as_mv,
+ &cpi->NMVcount, 1);
+ }
+ } else {
+ mv.row = (x->partition_info->bmi[i].mv.as_mv.row
+ - best_ref_mv->as_mv.row);
+ mv.col = (x->partition_info->bmi[i].mv.as_mv.col
+ - best_ref_mv->as_mv.col);
+ vp9_increment_nmv(&mv, &best_ref_mv->as_mv, &cpi->NMVcount, 0);
+ if (x->e_mbd.mode_info_context->mbmi.second_ref_frame) {
+ mv.row = (x->partition_info->bmi[i].second_mv.as_mv.row
+ - second_best_ref_mv->as_mv.row);
+ mv.col = (x->partition_info->bmi[i].second_mv.as_mv.col
+ - second_best_ref_mv->as_mv.col);
+ vp9_increment_nmv(&mv, &second_best_ref_mv->as_mv,
+ &cpi->NMVcount, 0);
+ }
+ }
+ }
+ }
+ } else if (mbmi->mode == NEWMV) {
+ if (x->e_mbd.allow_high_precision_mv) {
+ mv.row = (mbmi->mv[0].as_mv.row - best_ref_mv->as_mv.row);
+ mv.col = (mbmi->mv[0].as_mv.col - best_ref_mv->as_mv.col);
+ vp9_increment_nmv(&mv, &best_ref_mv->as_mv, &cpi->NMVcount, 1);
+ if (mbmi->second_ref_frame) {
+ mv.row = (mbmi->mv[1].as_mv.row - second_best_ref_mv->as_mv.row);
+ mv.col = (mbmi->mv[1].as_mv.col - second_best_ref_mv->as_mv.col);
+ vp9_increment_nmv(&mv, &second_best_ref_mv->as_mv, &cpi->NMVcount, 1);
+ }
+ } else {
+ mv.row = (mbmi->mv[0].as_mv.row - best_ref_mv->as_mv.row);
+ mv.col = (mbmi->mv[0].as_mv.col - best_ref_mv->as_mv.col);
+ vp9_increment_nmv(&mv, &best_ref_mv->as_mv, &cpi->NMVcount, 0);
+ if (mbmi->second_ref_frame) {
+ mv.row = (mbmi->mv[1].as_mv.row - second_best_ref_mv->as_mv.row);
+ mv.col = (mbmi->mv[1].as_mv.col - second_best_ref_mv->as_mv.col);
+ vp9_increment_nmv(&mv, &second_best_ref_mv->as_mv, &cpi->NMVcount, 0);
+ }
+ }
+ }
+}
+
+static void write_ymode(vp9_writer *bc, int m, const vp9_prob *p) {
+ write_token(bc, vp9_ymode_tree, p, vp9_ymode_encodings + m);
+}
+
+static void kfwrite_ymode(vp9_writer *bc, int m, const vp9_prob *p) {
+ write_token(bc, vp9_kf_ymode_tree, p, vp9_kf_ymode_encodings + m);
+}
+
+#if CONFIG_SUPERBLOCKS
+static void sb_kfwrite_ymode(vp9_writer *bc, int m, const vp9_prob *p) {
+ write_token(bc, vp9_uv_mode_tree, p, vp9_sb_kf_ymode_encodings + m);
+}
+#endif
+
+static void write_i8x8_mode(vp9_writer *bc, int m, const vp9_prob *p) {
+ write_token(bc, vp9_i8x8_mode_tree, p, vp9_i8x8_mode_encodings + m);
+}
+
+static void write_uv_mode(vp9_writer *bc, int m, const vp9_prob *p) {
+ write_token(bc, vp9_uv_mode_tree, p, vp9_uv_mode_encodings + m);
+}
+
+
+static void write_bmode(vp9_writer *bc, int m, const vp9_prob *p) {
+ write_token(bc, vp9_bmode_tree, p, vp9_bmode_encodings + m);
+}
+
+static void write_split(vp9_writer *bc, int x, const vp9_prob *p) {
+ write_token(bc, vp9_mbsplit_tree, p, vp9_mbsplit_encodings + x);
+}
+
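+/* Savings, in 1/256 bit units, from replacing oldp with newp after
+ * subtracting the cost of signalling a full 8-bit update. */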
+static int prob_update_savings(const unsigned int *ct,
+ const vp9_prob oldp, const vp9_prob newp,
+ const vp9_prob upd) {
+ const int old_b = cost_branch256(ct, oldp);
+ const int new_b = cost_branch256(ct, newp);
+ const int update_b = 2048 + vp9_cost_upd256;
+ return (old_b - new_b - update_b);
+}
+
+static int prob_diff_update_savings(const unsigned int *ct,
+ const vp9_prob oldp, const vp9_prob newp,
+ const vp9_prob upd) {
+ const int old_b = cost_branch256(ct, oldp);
+ const int new_b = cost_branch256(ct, newp);
+ const int update_b = (newp == oldp ? 0 :
+ prob_diff_update_cost(newp, oldp) + vp9_cost_upd256);
+ return (old_b - new_b - update_b);
+}
+
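+/* Step from *bestp towards oldp and keep the candidate with the largest net
+ * savings (coding gain minus update cost); the winner is written back to
+ * *bestp. */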
+static int prob_diff_update_savings_search(const unsigned int *ct,
+ const vp9_prob oldp, vp9_prob *bestp,
+ const vp9_prob upd) {
+ const int old_b = cost_branch256(ct, oldp);
+ int new_b, update_b, savings, bestsavings, step;
+ vp9_prob newp, bestnewp;
+
+ bestsavings = 0;
+ bestnewp = oldp;
+
+ step = (*bestp > oldp ? -1 : 1);
+ for (newp = *bestp; newp != oldp; newp += step) {
+ new_b = cost_branch256(ct, newp);
+ update_b = prob_diff_update_cost(newp, oldp) + vp9_cost_upd256;
+ savings = old_b - new_b - update_b;
+ if (savings > bestsavings) {
+ bestsavings = savings;
+ bestnewp = newp;
+ }
+ }
+ *bestp = bestnewp;
+ return bestsavings;
+}
+
+static void pack_mb_tokens(vp9_writer* const bc,
+ TOKENEXTRA **tp,
+ const TOKENEXTRA *const stop) {
+ unsigned int split;
+ unsigned int shift;
+ int count = bc->count;
+ unsigned int range = bc->range;
+ unsigned int lowvalue = bc->lowvalue;
+ TOKENEXTRA *p = *tp;
+
+ while (p < stop) {
+ const int t = p->Token;
+ vp9_token *const a = vp9_coef_encodings + t;
+ const vp9_extra_bit_struct *const b = vp9_extra_bits + t;
+ int i = 0;
+ const unsigned char *pp = p->context_tree;
+ int v = a->value;
+ int n = a->Len;
+
+ if (t == EOSB_TOKEN)
+ {
+ ++p;
+ break;
+ }
+
+ /* skip one or two nodes */
+ if (p->skip_eob_node) {
+ n -= p->skip_eob_node;
+ i = 2 * p->skip_eob_node;
+ }
+
+ do {
+ const int bb = (v >> --n) & 1;
+ split = 1 + (((range - 1) * pp[i >> 1]) >> 8);
+ i = vp9_coef_tree[i + bb];
+
+ if (bb) {
+ lowvalue += split;
+ range = range - split;
+ } else {
+ range = split;
+ }
+
+ shift = vp9_norm[range];
+ range <<= shift;
+ count += shift;
+
+ if (count >= 0) {
+ int offset = shift - count;
+
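+ /* carry: propagate through any 0xff bytes already written to the buffer */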
+ if ((lowvalue << (offset - 1)) & 0x80000000) {
+ int x = bc->pos - 1;
+
+ while (x >= 0 && bc->buffer[x] == 0xff) {
+ bc->buffer[x] = (unsigned char)0;
+ x--;
+ }
+
+ bc->buffer[x] += 1;
+ }
+
+ bc->buffer[bc->pos++] = (lowvalue >> (24 - offset));
+ lowvalue <<= offset;
+ shift = count;
+ lowvalue &= 0xffffff;
+ count -= 8;
+ }
+
+ lowvalue <<= shift;
+ } while (n);
+
+
+ if (b->base_val) {
+ const int e = p->Extra, L = b->Len;
+
+ if (L) {
+ const unsigned char *pp = b->prob;
+ int v = e >> 1;
+ int n = L; /* number of bits in v, assumed nonzero */
+ int i = 0;
+
+ do {
+ const int bb = (v >> --n) & 1;
+ split = 1 + (((range - 1) * pp[i >> 1]) >> 8);
+ i = b->tree[i + bb];
+
+ if (bb) {
+ lowvalue += split;
+ range = range - split;
+ } else {
+ range = split;
+ }
+
+ shift = vp9_norm[range];
+ range <<= shift;
+ count += shift;
+
+ if (count >= 0) {
+ int offset = shift - count;
+
+ if ((lowvalue << (offset - 1)) & 0x80000000) {
+ int x = bc->pos - 1;
+
+ while (x >= 0 && bc->buffer[x] == 0xff) {
+ bc->buffer[x] = (unsigned char)0;
+ x--;
+ }
+
+ bc->buffer[x] += 1;
+ }
+
+ bc->buffer[bc->pos++] = (lowvalue >> (24 - offset));
+ lowvalue <<= offset;
+ shift = count;
+ lowvalue &= 0xffffff;
+ count -= 8;
+ }
+
+ lowvalue <<= shift;
+ } while (n);
+ }
+
+
+ {
+
+ split = (range + 1) >> 1;
+
+ if (e & 1) {
+ lowvalue += split;
+ range = range - split;
+ } else {
+ range = split;
+ }
+
+ range <<= 1;
+
+ if ((lowvalue & 0x80000000)) {
+ int x = bc->pos - 1;
+
+ while (x >= 0 && bc->buffer[x] == 0xff) {
+ bc->buffer[x] = (unsigned char)0;
+ x--;
+ }
+
+ bc->buffer[x] += 1;
+
+ }
+
+ lowvalue <<= 1;
+
+ if (!++count) {
+ count = -8;
+ bc->buffer[bc->pos++] = (lowvalue >> 24);
+ lowvalue &= 0xffffff;
+ }
+ }
+
+ }
+ ++p;
+ }
+
+ bc->count = count;
+ bc->lowvalue = lowvalue;
+ bc->range = range;
+ *tp = p;
+}
+
+static void write_partition_size(unsigned char *cx_data, int size) {
+ signed char csize;
+
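+ /* write the partition size as three bytes, little endian */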
+ csize = size & 0xff;
+ *cx_data = csize;
+ csize = (size >> 8) & 0xff;
+ *(cx_data + 1) = csize;
+ csize = (size >> 16) & 0xff;
+ *(cx_data + 2) = csize;
+
+}
+
+static void write_mv_ref
+(
+ vp9_writer *bc, MB_PREDICTION_MODE m, const vp9_prob *p
+) {
+#if CONFIG_DEBUG
+ assert(NEARESTMV <= m && m <= SPLITMV);
+#endif
+ write_token(bc, vp9_mv_ref_tree, p,
+ vp9_mv_ref_encoding_array - NEARESTMV + m);
+}
+
+#if CONFIG_SUPERBLOCKS
+static void write_sb_mv_ref(vp9_writer *bc, MB_PREDICTION_MODE m,
+ const vp9_prob *p) {
+#if CONFIG_DEBUG
+ assert(NEARESTMV <= m && m < SPLITMV);
+#endif
+ write_token(bc, vp9_sb_mv_ref_tree, p,
+ vp9_sb_mv_ref_encoding_array - NEARESTMV + m);
+}
+#endif
+
+static void write_sub_mv_ref
+(
+ vp9_writer *bc, B_PREDICTION_MODE m, const vp9_prob *p
+) {
+#if CONFIG_DEBUG
+ assert(LEFT4X4 <= m && m <= NEW4X4);
+#endif
+ write_token(bc, vp9_sub_mv_ref_tree, p,
+ vp9_sub_mv_ref_encoding_array - LEFT4X4 + m);
+}
+
+static void write_nmv(vp9_writer *bc, const MV *mv, const int_mv *ref,
+ const nmv_context *nmvc, int usehp) {
+ MV e;
+ e.row = mv->row - ref->as_mv.row;
+ e.col = mv->col - ref->as_mv.col;
+
+ vp9_encode_nmv(bc, &e, &ref->as_mv, nmvc);
+ vp9_encode_nmv_fp(bc, &e, &ref->as_mv, nmvc, usehp);
+}
+
+#if CONFIG_NEW_MVREF
+static int vp9_cost_mv_ref_id(vp9_prob * ref_id_probs, int mv_ref_id) {
+ int cost;
+
+ // Encode the index for the MV reference.
+ switch (mv_ref_id) {
+ case 0:
+ cost = vp9_cost_zero(ref_id_probs[0]);
+ break;
+ case 1:
+ cost = vp9_cost_one(ref_id_probs[0]);
+ cost += vp9_cost_zero(ref_id_probs[1]);
+ break;
+ case 2:
+ cost = vp9_cost_one(ref_id_probs[0]);
+ cost += vp9_cost_one(ref_id_probs[1]);
+ cost += vp9_cost_zero(ref_id_probs[2]);
+ break;
+ case 3:
+ cost = vp9_cost_one(ref_id_probs[0]);
+ cost += vp9_cost_one(ref_id_probs[1]);
+ cost += vp9_cost_one(ref_id_probs[2]);
+ break;
+
+ // TRAP.. This should not happen
+ default:
+ assert(0);
+ break;
+ }
+
+ return cost;
+}
+
+static void vp9_write_mv_ref_id(vp9_writer *w,
+ vp9_prob * ref_id_probs,
+ int mv_ref_id) {
+ // Encode the index for the MV reference.
+ switch (mv_ref_id) {
+ case 0:
+ vp9_write(w, 0, ref_id_probs[0]);
+ break;
+ case 1:
+ vp9_write(w, 1, ref_id_probs[0]);
+ vp9_write(w, 0, ref_id_probs[1]);
+ break;
+ case 2:
+ vp9_write(w, 1, ref_id_probs[0]);
+ vp9_write(w, 1, ref_id_probs[1]);
+ vp9_write(w, 0, ref_id_probs[2]);
+ break;
+ case 3:
+ vp9_write(w, 1, ref_id_probs[0]);
+ vp9_write(w, 1, ref_id_probs[1]);
+ vp9_write(w, 1, ref_id_probs[2]);
+ break;
+
+ // TRAP.. This should not happen
+ default:
+ assert(0);
+ break;
+ }
+}
+
+// Estimate the cost of coding the vector using each reference candidate
+static unsigned int pick_best_mv_ref(MACROBLOCK *x,
+ MV_REFERENCE_FRAME ref_frame,
+ int_mv target_mv,
+ int_mv * mv_ref_list,
+ int_mv * best_ref) {
+ int i;
+ int best_index = 0;
+ int cost, cost2;
+ int zero_seen = (mv_ref_list[0].as_int) ? FALSE : TRUE;
+ MACROBLOCKD *xd = &x->e_mbd;
+ int max_mv = MV_MAX;
+
+ cost = vp9_cost_mv_ref_id(xd->mb_mv_ref_id_probs[ref_frame], 0) +
+ vp9_mv_bit_cost(&target_mv,
+ &mv_ref_list[0],
+ XMVCOST, 96,
+ xd->allow_high_precision_mv);
+
+
+ // Use 4 for now : for (i = 1; i < MAX_MV_REFS; ++i ) {
+ for (i = 1; i < 4; ++i) {
+ // If we see a 0,0 reference vector for a second time we have reached
+ // the end of the list of valid candidate vectors.
+ if (!mv_ref_list[i].as_int)
+ if (zero_seen)
+ break;
+ else
+ zero_seen = TRUE;
+
+ // Check for cases where the reference choice would give rise to an
+ // uncodable/out of range residual for row or col.
+ if ((abs(target_mv.as_mv.row - mv_ref_list[i].as_mv.row) > max_mv) ||
+ (abs(target_mv.as_mv.col - mv_ref_list[i].as_mv.col) > max_mv)) {
+ continue;
+ }
+
+ cost2 = vp9_cost_mv_ref_id(xd->mb_mv_ref_id_probs[ref_frame], i) +
+ vp9_mv_bit_cost(&target_mv,
+ &mv_ref_list[i],
+ XMVCOST, 96,
+ xd->allow_high_precision_mv);
+
+ if (cost2 < cost) {
+ cost = cost2;
+ best_index = i;
+ }
+ }
+
+ (*best_ref).as_int = mv_ref_list[best_index].as_int;
+
+ return best_index;
+}
+#endif
+
+// This function writes the current macroblock's segment id to the bitstream
+// It should only be called if a segment map update is indicated.
+static void write_mb_segid(vp9_writer *bc,
+ const MB_MODE_INFO *mi, const MACROBLOCKD *xd) {
+ // Encode the MB segment id.
+ int seg_id = mi->segment_id;
+#if CONFIG_SUPERBLOCKS
+ if (mi->encoded_as_sb) {
+ if (xd->mb_to_right_edge > 0)
+ seg_id = seg_id && xd->mode_info_context[1].mbmi.segment_id;
+ if (xd->mb_to_bottom_edge > 0) {
+ seg_id = seg_id &&
+ xd->mode_info_context[xd->mode_info_stride].mbmi.segment_id;
+ if (xd->mb_to_right_edge > 0)
+ seg_id = seg_id &&
+ xd->mode_info_context[xd->mode_info_stride + 1].mbmi.segment_id;
+ }
+ }
+#endif
+ if (xd->segmentation_enabled && xd->update_mb_segmentation_map) {
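+ /* Two-bit tree: probs[0] splits {0,1} from {2,3}; probs[1] and probs[2] pick within each pair. */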
+ switch (seg_id) {
+ case 0:
+ vp9_write(bc, 0, xd->mb_segment_tree_probs[0]);
+ vp9_write(bc, 0, xd->mb_segment_tree_probs[1]);
+ break;
+ case 1:
+ vp9_write(bc, 0, xd->mb_segment_tree_probs[0]);
+ vp9_write(bc, 1, xd->mb_segment_tree_probs[1]);
+ break;
+ case 2:
+ vp9_write(bc, 1, xd->mb_segment_tree_probs[0]);
+ vp9_write(bc, 0, xd->mb_segment_tree_probs[2]);
+ break;
+ case 3:
+ vp9_write(bc, 1, xd->mb_segment_tree_probs[0]);
+ vp9_write(bc, 1, xd->mb_segment_tree_probs[2]);
+ break;
+
+ // TRAP.. This should not happen
+ default:
+ vp9_write(bc, 0, xd->mb_segment_tree_probs[0]);
+ vp9_write(bc, 0, xd->mb_segment_tree_probs[1]);
+ break;
+ }
+ }
+}
+
+// This function encodes the reference frame
+static void encode_ref_frame(vp9_writer *const bc,
+ VP9_COMMON *const cm,
+ MACROBLOCKD *xd,
+ int segment_id,
+ MV_REFERENCE_FRAME rf) {
+ int seg_ref_active;
+ int seg_ref_count = 0;
+ seg_ref_active = vp9_segfeature_active(xd,
+ segment_id,
+ SEG_LVL_REF_FRAME);
+
+ if (seg_ref_active) {
+ seg_ref_count = vp9_check_segref(xd, segment_id, INTRA_FRAME) +
+ vp9_check_segref(xd, segment_id, LAST_FRAME) +
+ vp9_check_segref(xd, segment_id, GOLDEN_FRAME) +
+ vp9_check_segref(xd, segment_id, ALTREF_FRAME);
+ }
+
+ // If segment level coding of this signal is disabled...
+ // or the segment allows multiple reference frame options
+ if (!seg_ref_active || (seg_ref_count > 1)) {
+ // Values used in prediction model coding
+ unsigned char prediction_flag;
+ vp9_prob pred_prob;
+ MV_REFERENCE_FRAME pred_rf;
+
+ // Get the context probability for the prediction flag
+ pred_prob = vp9_get_pred_prob(cm, xd, PRED_REF);
+
+ // Get the predicted value.
+ pred_rf = vp9_get_pred_ref(cm, xd);
+
+ // Did the chosen reference frame match its predicted value.
+ prediction_flag =
+ (xd->mode_info_context->mbmi.ref_frame == pred_rf);
+
+ vp9_set_pred_flag(xd, PRED_REF, prediction_flag);
+ vp9_write(bc, prediction_flag, pred_prob);
+
+ // If not predicted correctly then code value explicitly
+ if (!prediction_flag) {
+ vp9_prob mod_refprobs[PREDICTION_PROBS];
+
+ vpx_memcpy(mod_refprobs,
+ cm->mod_refprobs[pred_rf], sizeof(mod_refprobs));
+
+ // If segment coding is enabled, blank out options that can't occur by
+ // setting the branch probability to 0.
+ if (seg_ref_active) {
+ mod_refprobs[INTRA_FRAME] *=
+ vp9_check_segref(xd, segment_id, INTRA_FRAME);
+ mod_refprobs[LAST_FRAME] *=
+ vp9_check_segref(xd, segment_id, LAST_FRAME);
+ mod_refprobs[GOLDEN_FRAME] *=
+ (vp9_check_segref(xd, segment_id, GOLDEN_FRAME) *
+ vp9_check_segref(xd, segment_id, ALTREF_FRAME));
+ }
+
+ if (mod_refprobs[0]) {
+ vp9_write(bc, (rf != INTRA_FRAME), mod_refprobs[0]);
+ }
+
+ // Inter coded
+ if (rf != INTRA_FRAME) {
+ if (mod_refprobs[1]) {
+ vp9_write(bc, (rf != LAST_FRAME), mod_refprobs[1]);
+ }
+
+ if (rf != LAST_FRAME) {
+ if (mod_refprobs[2]) {
+ vp9_write(bc, (rf != GOLDEN_FRAME), mod_refprobs[2]);
+ }
+ }
+ }
+ }
+ }
+
+ // if using the prediction model we have nothing further to do because
+ // the reference frame is fully coded by the segment
+}
+
+// Update the probabilities used to encode reference frame data
+static void update_ref_probs(VP9_COMP *const cpi) {
+ VP9_COMMON *const cm = &cpi->common;
+
+ const int *const rfct = cpi->count_mb_ref_frame_usage;
+ const int rf_intra = rfct[INTRA_FRAME];
+ const int rf_inter = rfct[LAST_FRAME] +
+ rfct[GOLDEN_FRAME] + rfct[ALTREF_FRAME];
+
+ cm->prob_intra_coded = get_binary_prob(rf_intra, rf_inter);
+ cm->prob_last_coded = get_prob(rfct[LAST_FRAME], rf_inter);
+ cm->prob_gf_coded = get_binary_prob(rfct[GOLDEN_FRAME], rfct[ALTREF_FRAME]);
+
+ // Compute a modified set of probabilities to use when prediction of the
+ // reference frame fails
+ vp9_compute_mod_refprobs(cm);
+}
+
+static void pack_inter_mode_mvs(VP9_COMP *const cpi, vp9_writer *const bc) {
+ int i;
+ VP9_COMMON *const pc = &cpi->common;
+ const nmv_context *nmvc = &pc->fc.nmvc;
+ MACROBLOCK *x = &cpi->mb;
+ MACROBLOCKD *xd = &cpi->mb.e_mbd;
+ MODE_INFO *m;
+ MODE_INFO *prev_m;
+ TOKENEXTRA *tok = cpi->tok;
+ TOKENEXTRA *tok_end = tok + cpi->tok_count;
+
+ const int mis = pc->mode_info_stride;
+ int mb_row, mb_col;
+ int row, col;
+
+ // Values used in prediction model coding
+ vp9_prob pred_prob;
+ unsigned char prediction_flag;
+
+ int row_delta[4] = { 0, +1, 0, -1};
+ int col_delta[4] = { +1, -1, +1, +1};
+
+ cpi->mb.partition_info = cpi->mb.pi;
+
+ mb_row = 0;
+ for (row = 0; row < pc->mb_rows; row += 2) {
+ m = pc->mi + row * mis;
+ prev_m = pc->prev_mi + row * mis;
+
+ mb_col = 0;
+ for (col = 0; col < pc->mb_cols; col += 2) {
+ int i;
+
+ // Process the 4 MBs in the order:
+ // top-left, top-right, bottom-left, bottom-right
+#if CONFIG_SUPERBLOCKS
+ vp9_write(bc, m->mbmi.encoded_as_sb, pc->sb_coded);
+#endif
+ for (i = 0; i < 4; i++) {
+ MB_MODE_INFO *mi;
+ MV_REFERENCE_FRAME rf;
+ MB_PREDICTION_MODE mode;
+ int segment_id;
+
+ int dy = row_delta[i];
+ int dx = col_delta[i];
+ int offset_extended = dy * mis + dx;
+
+ if ((mb_row >= pc->mb_rows) || (mb_col >= pc->mb_cols)) {
+ // MB lies outside frame, move on
+ mb_row += dy;
+ mb_col += dx;
+ m += offset_extended;
+ prev_m += offset_extended;
+ cpi->mb.partition_info += offset_extended;
+ continue;
+ }
+
+ mi = &m->mbmi;
+ rf = mi->ref_frame;
+ mode = mi->mode;
+ segment_id = mi->segment_id;
+
+ // Distance of Mb to the various image edges.
+ // These are specified to 1/8th pel as they are always compared to MV
+ // values that are in 1/8th pel units
+ xd->mb_to_left_edge = -((mb_col * 16) << 3);
+ xd->mb_to_right_edge = ((pc->mb_cols - 1 - mb_col) * 16) << 3;
+ xd->mb_to_top_edge = -((mb_row * 16)) << 3;
+ xd->mb_to_bottom_edge = ((pc->mb_rows - 1 - mb_row) * 16) << 3;
+
+ // Make sure the MacroBlockD mode info pointer is set correctly
+ xd->mode_info_context = m;
+ xd->prev_mode_info_context = prev_m;
+
+#ifdef ENTROPY_STATS
+ active_section = 9;
+#endif
+ if (cpi->mb.e_mbd.update_mb_segmentation_map) {
+ // Is temporal coding of the segment map enabled
+ if (pc->temporal_update) {
+ prediction_flag = vp9_get_pred_flag(xd, PRED_SEG_ID);
+ pred_prob = vp9_get_pred_prob(pc, xd, PRED_SEG_ID);
+
+ // Code the segment id prediction flag for this mb
+ vp9_write(bc, prediction_flag, pred_prob);
+
+ // If the mb segment id wasn't predicted, code it explicitly
+ if (!prediction_flag)
+ write_mb_segid(bc, mi, &cpi->mb.e_mbd);
+ } else {
+ // Normal unpredicted coding
+ write_mb_segid(bc, mi, &cpi->mb.e_mbd);
+ }
+ }
+
+ if (pc->mb_no_coeff_skip &&
+ (!vp9_segfeature_active(xd, segment_id, SEG_LVL_EOB) ||
+ (vp9_get_segdata(xd, segment_id, SEG_LVL_EOB) != 0))) {
+ int skip_coeff = mi->mb_skip_coeff;
+#if CONFIG_SUPERBLOCKS
+ if (mi->encoded_as_sb) {
+ skip_coeff &= m[1].mbmi.mb_skip_coeff;
+ skip_coeff &= m[mis].mbmi.mb_skip_coeff;
+ skip_coeff &= m[mis + 1].mbmi.mb_skip_coeff;
+ }
+#endif
+ vp9_write(bc, skip_coeff,
+ vp9_get_pred_prob(pc, xd, PRED_MBSKIP));
+ }
+
+ // Encode the reference frame.
+ encode_ref_frame(bc, pc, xd, segment_id, rf);
+
+ if (rf == INTRA_FRAME) {
+#ifdef ENTROPY_STATS
+ active_section = 6;
+#endif
+
+ // TODO(rbultje) write using SB tree structure
+
+ if (!vp9_segfeature_active(xd, segment_id, SEG_LVL_MODE)) {
+ write_ymode(bc, mode, pc->fc.ymode_prob);
+ }
+
+ if (mode == B_PRED) {
+ int j = 0;
+#if CONFIG_COMP_INTRA_PRED
+ int uses_second =
+ m->bmi[0].as_mode.second !=
+ (B_PREDICTION_MODE)(B_DC_PRED - 1);
+ vp9_write(bc, uses_second, 128);
+#endif
+ do {
+#if CONFIG_COMP_INTRA_PRED
+ B_PREDICTION_MODE mode2 = m->bmi[j].as_mode.second;
+#endif
+ write_bmode(bc, m->bmi[j].as_mode.first,
+ pc->fc.bmode_prob);
+ /*
+ if (!cpi->dummy_packing) {
+ int p;
+ for (p = 0; p < VP9_BINTRAMODES - 1; ++p)
+ printf(" %d", pc->fc.bmode_prob[p]);
+ printf("\nbmode[%d][%d]: %d\n", pc->current_video_frame, j, m->bmi[j].as_mode.first);
+ }
+ */
+#if CONFIG_COMP_INTRA_PRED
+ if (uses_second) {
+ write_bmode(bc, mode2, pc->fc.bmode_prob);
+ }
+#endif
+ } while (++j < 16);
+ }
+ if (mode == I8X8_PRED) {
+ write_i8x8_mode(bc, m->bmi[0].as_mode.first,
+ pc->fc.i8x8_mode_prob);
+ write_i8x8_mode(bc, m->bmi[2].as_mode.first,
+ pc->fc.i8x8_mode_prob);
+ write_i8x8_mode(bc, m->bmi[8].as_mode.first,
+ pc->fc.i8x8_mode_prob);
+ write_i8x8_mode(bc, m->bmi[10].as_mode.first,
+ pc->fc.i8x8_mode_prob);
+ } else {
+ write_uv_mode(bc, mi->uv_mode,
+ pc->fc.uv_mode_prob[mode]);
+ }
+ } else {
+ int_mv best_mv, best_second_mv;
+ int ct[4];
+
+ vp9_prob mv_ref_p [VP9_MVREFS - 1];
+
+ {
+ int_mv n1, n2;
+
+ // Only used for context just now and soon to be deprecated.
+ vp9_find_near_mvs(xd, m, prev_m, &n1, &n2, &best_mv, ct,
+ rf, cpi->common.ref_frame_sign_bias);
+#if CONFIG_NEWBESTREFMV
+ best_mv.as_int = mi->ref_mvs[rf][0].as_int;
+#endif
+
+ vp9_mv_ref_probs(&cpi->common, mv_ref_p, ct);
+
+#ifdef ENTROPY_STATS
+ accum_mv_refs(mode, ct);
+#endif
+ }
+
+#ifdef ENTROPY_STATS
+ active_section = 3;
+#endif
+
+ // Is the segment coding of mode enabled
+ if (!vp9_segfeature_active(xd, segment_id, SEG_LVL_MODE)) {
+#if CONFIG_SUPERBLOCKS
+ if (mi->encoded_as_sb) {
+ write_sb_mv_ref(bc, mode, mv_ref_p);
+ } else
+#endif
+ {
+ write_mv_ref(bc, mode, mv_ref_p);
+ }
+ vp9_accum_mv_refs(&cpi->common, mode, ct);
+ }
+
+#if CONFIG_PRED_FILTER
+ // Is the prediction filter enabled
+ if (mode >= NEARESTMV && mode < SPLITMV) {
+ if (cpi->common.pred_filter_mode == 2)
+ vp9_write(bc, mi->pred_filter_enabled,
+ pc->prob_pred_filter_off);
+ else
+ assert(mi->pred_filter_enabled ==
+ cpi->common.pred_filter_mode);
+ }
+#endif
+ if (mode >= NEARESTMV && mode <= SPLITMV)
+ {
+ if (cpi->common.mcomp_filter_type == SWITCHABLE) {
+ write_token(bc, vp9_switchable_interp_tree,
+ vp9_get_pred_probs(&cpi->common, xd,
+ PRED_SWITCHABLE_INTERP),
+ vp9_switchable_interp_encodings +
+ vp9_switchable_interp_map[mi->interp_filter]);
+ } else {
+ assert (mi->interp_filter ==
+ cpi->common.mcomp_filter_type);
+ }
+ }
+ if (mi->second_ref_frame &&
+ (mode == NEWMV || mode == SPLITMV)) {
+ int_mv n1, n2;
+
+ // Only used for context just now and soon to be deprecated.
+ vp9_find_near_mvs(xd, m, prev_m,
+ &n1, &n2, &best_second_mv, ct,
+ mi->second_ref_frame,
+ cpi->common.ref_frame_sign_bias);
+
+#if CONFIG_NEWBESTREFMV
+ best_second_mv.as_int =
+ mi->ref_mvs[mi->second_ref_frame][0].as_int;
+#endif
+ }
+
+ // does the feature use compound prediction or not
+ // (if not specified at the frame/segment level)
+ if (cpi->common.comp_pred_mode == HYBRID_PREDICTION) {
+ vp9_write(bc, mi->second_ref_frame != INTRA_FRAME,
+ vp9_get_pred_prob(pc, xd, PRED_COMP));
+ }
+
+ {
+ switch (mode) { /* new, split require MVs */
+ case NEWMV:
+#ifdef ENTROPY_STATS
+ active_section = 5;
+#endif
+
+#if CONFIG_NEW_MVREF
+ {
+ unsigned int best_index;
+
+ // Choose the best mv reference
+ best_index = pick_best_mv_ref(x, rf, mi->mv[0],
+ mi->ref_mvs[rf], &best_mv);
+
+ // Encode the index of the choice.
+ vp9_write_mv_ref_id(bc,
+ xd->mb_mv_ref_id_probs[rf], best_index);
+
+ cpi->best_ref_index_counts[rf][best_index]++;
+
+ }
+#endif
+
+ write_nmv(bc, &mi->mv[0].as_mv, &best_mv,
+ (const nmv_context*) nmvc,
+ xd->allow_high_precision_mv);
+
+ if (mi->second_ref_frame) {
+#if CONFIG_NEW_MVREF
+ unsigned int best_index;
+ MV_REFERENCE_FRAME sec_ref_frame = mi->second_ref_frame;
+
+ best_index =
+ pick_best_mv_ref(x, sec_ref_frame, mi->mv[1],
+ mi->ref_mvs[sec_ref_frame],
+ &best_second_mv);
+
+ // Encode the index of the choice.
+ vp9_write_mv_ref_id(bc,
+ xd->mb_mv_ref_id_probs[sec_ref_frame],
+ best_index);
+
+ cpi->best_ref_index_counts[sec_ref_frame][best_index]++;
+#endif
+ write_nmv(bc, &mi->mv[1].as_mv, &best_second_mv,
+ (const nmv_context*) nmvc,
+ xd->allow_high_precision_mv);
+ }
+ break;
+ case SPLITMV: {
+ int j = 0;
+
+#ifdef MODE_STATS
+ ++count_mb_seg [mi->partitioning];
+#endif
+
+ write_split(bc, mi->partitioning, cpi->common.fc.mbsplit_prob);
+ cpi->mbsplit_count[mi->partitioning]++;
+
+ do {
+ B_PREDICTION_MODE blockmode;
+ int_mv blockmv;
+ const int *const L =
+ vp9_mbsplits [mi->partitioning];
+ int k = -1; /* first block in subset j */
+ int mv_contz;
+ int_mv leftmv, abovemv;
+
+ blockmode = cpi->mb.partition_info->bmi[j].mode;
+ blockmv = cpi->mb.partition_info->bmi[j].mv;
+#if CONFIG_DEBUG
+ while (j != L[++k])
+ if (k >= 16)
+ assert(0);
+#else
+ while (j != L[++k]);
+#endif
+ leftmv.as_int = left_block_mv(m, k);
+ abovemv.as_int = above_block_mv(m, k, mis);
+ mv_contz = vp9_mv_cont(&leftmv, &abovemv);
+
+ write_sub_mv_ref(bc, blockmode,
+ cpi->common.fc.sub_mv_ref_prob [mv_contz]);
+ cpi->sub_mv_ref_count[mv_contz][blockmode - LEFT4X4]++;
+ if (blockmode == NEW4X4) {
+#ifdef ENTROPY_STATS
+ active_section = 11;
+#endif
+ write_nmv(bc, &blockmv.as_mv, &best_mv,
+ (const nmv_context*) nmvc,
+ xd->allow_high_precision_mv);
+
+ if (mi->second_ref_frame) {
+ write_nmv(bc,
+ &cpi->mb.partition_info->bmi[j].second_mv.as_mv,
+ &best_second_mv,
+ (const nmv_context*) nmvc,
+ xd->allow_high_precision_mv);
+ }
+ }
+ } while (++j < cpi->mb.partition_info->count);
+ }
+ break;
+ default:
+ break;
+ }
+ }
+
+ // Update the mvcounts used to tune mv probs but only if this is
+ // the real pack run.
+ if ( !cpi->dummy_packing ) {
+ update_mvcount(cpi, x, &best_mv, &best_second_mv);
+ }
+ }
+
+ if (
+#if CONFIG_SUPERBLOCKS
+ !mi->encoded_as_sb &&
+#endif
+ ((rf == INTRA_FRAME && mode <= I8X8_PRED) ||
+ (rf != INTRA_FRAME && !(mode == SPLITMV &&
+ mi->partitioning == PARTITIONING_4X4))) &&
+ pc->txfm_mode == TX_MODE_SELECT &&
+ !((pc->mb_no_coeff_skip && mi->mb_skip_coeff) ||
+ (vp9_segfeature_active(xd, segment_id, SEG_LVL_EOB) &&
+ vp9_get_segdata(xd, segment_id, SEG_LVL_EOB) == 0))) {
+ TX_SIZE sz = mi->txfm_size;
+ // FIXME(rbultje) code ternary symbol once all experiments are merged
+ vp9_write(bc, sz != TX_4X4, pc->prob_tx[0]);
+ if (sz != TX_4X4 && mode != I8X8_PRED && mode != SPLITMV)
+ vp9_write(bc, sz != TX_8X8, pc->prob_tx[1]);
+ }
+
+#ifdef ENTROPY_STATS
+ active_section = 1;
+#endif
+ assert(tok < tok_end);
+ pack_mb_tokens(bc, &tok, tok_end);
+
+#if CONFIG_SUPERBLOCKS
+ if (m->mbmi.encoded_as_sb) {
+ assert(!i);
+ mb_col += 2;
+ m += 2;
+ cpi->mb.partition_info += 2;
+ prev_m += 2;
+ break;
+ }
+#endif
+
+ // Next MB
+ mb_row += dy;
+ mb_col += dx;
+ m += offset_extended;
+ prev_m += offset_extended;
+ cpi->mb.partition_info += offset_extended;
+#if CONFIG_DEBUG
+ assert((prev_m - cpi->common.prev_mip) == (m - cpi->common.mip));
+ assert((prev_m - cpi->common.prev_mi) == (m - cpi->common.mi));
+#endif
+ }
+ }
+
+ // Next SB
+ mb_row += 2;
+ m += mis + (1 - (pc->mb_cols & 0x1));
+ prev_m += mis + (1 - (pc->mb_cols & 0x1));
+ cpi->mb.partition_info += mis + (1 - (pc->mb_cols & 0x1));
+ }
+}
+
+
+static void write_mb_modes_kf(const VP9_COMMON *c,
+ const MACROBLOCKD *xd,
+ const MODE_INFO *m,
+ int mode_info_stride,
+ vp9_writer *const bc) {
+ const int mis = mode_info_stride;
+ int ym;
+ int segment_id;
+
+ ym = m->mbmi.mode;
+ segment_id = m->mbmi.segment_id;
+
+ if (xd->update_mb_segmentation_map) {
+ write_mb_segid(bc, &m->mbmi, xd);
+ }
+
+ if (c->mb_no_coeff_skip &&
+ (!vp9_segfeature_active(xd, segment_id, SEG_LVL_EOB) ||
+ (vp9_get_segdata(xd, segment_id, SEG_LVL_EOB) != 0))) {
+ int skip_coeff = m->mbmi.mb_skip_coeff;
+#if CONFIG_SUPERBLOCKS
+ if (m->mbmi.encoded_as_sb) {
+ skip_coeff &= m[1].mbmi.mb_skip_coeff;
+ skip_coeff &= m[mis].mbmi.mb_skip_coeff;
+ skip_coeff &= m[mis + 1].mbmi.mb_skip_coeff;
+ }
+#endif
+ vp9_write(bc, skip_coeff,
+ vp9_get_pred_prob(c, xd, PRED_MBSKIP));
+ }
+
+#if CONFIG_SUPERBLOCKS
+ if (m->mbmi.encoded_as_sb) {
+ sb_kfwrite_ymode(bc, ym,
+ c->sb_kf_ymode_prob[c->kf_ymode_probs_index]);
+ } else
+#endif
+ {
+ kfwrite_ymode(bc, ym,
+ c->kf_ymode_prob[c->kf_ymode_probs_index]);
+ }
+
+ if (ym == B_PRED) {
+ const int mis = c->mode_info_stride;
+ int i = 0;
+#if CONFIG_COMP_INTRA_PRED
+ int uses_second =
+ m->bmi[0].as_mode.second !=
+ (B_PREDICTION_MODE)(B_DC_PRED - 1);
+ vp9_write(bc, uses_second, 128);
+#endif
+ do {
+ const B_PREDICTION_MODE A = above_block_mode(m, i, mis);
+ const B_PREDICTION_MODE L = left_block_mode(m, i);
+ const int bm = m->bmi[i].as_mode.first;
+#if CONFIG_COMP_INTRA_PRED
+ const int bm2 = m->bmi[i].as_mode.second;
+#endif
+
+#ifdef ENTROPY_STATS
+ ++intra_mode_stats [A] [L] [bm];
+#endif
+
+ write_bmode(bc, bm, c->kf_bmode_prob [A] [L]);
+ // printf(" mode: %d\n", bm);
+#if CONFIG_COMP_INTRA_PRED
+ if (uses_second) {
+ write_bmode(bc, bm2, c->kf_bmode_prob [A] [L]);
+ }
+#endif
+ } while (++i < 16);
+ }
+ if (ym == I8X8_PRED) {
+ write_i8x8_mode(bc, m->bmi[0].as_mode.first,
+ c->fc.i8x8_mode_prob);
+ // printf(" mode: %d\n", m->bmi[0].as_mode.first); fflush(stdout);
+ write_i8x8_mode(bc, m->bmi[2].as_mode.first,
+ c->fc.i8x8_mode_prob);
+ // printf(" mode: %d\n", m->bmi[2].as_mode.first); fflush(stdout);
+ write_i8x8_mode(bc, m->bmi[8].as_mode.first,
+ c->fc.i8x8_mode_prob);
+ // printf(" mode: %d\n", m->bmi[8].as_mode.first); fflush(stdout);
+ write_i8x8_mode(bc, m->bmi[10].as_mode.first,
+ c->fc.i8x8_mode_prob);
+ // printf(" mode: %d\n", m->bmi[10].as_mode.first); fflush(stdout);
+ } else
+ write_uv_mode(bc, m->mbmi.uv_mode, c->kf_uv_mode_prob[ym]);
+
+ if (
+#if CONFIG_SUPERBLOCKS
+ !m->mbmi.encoded_as_sb &&
+#endif
+ ym <= I8X8_PRED && c->txfm_mode == TX_MODE_SELECT &&
+ !((c->mb_no_coeff_skip && m->mbmi.mb_skip_coeff) ||
+ (vp9_segfeature_active(xd, segment_id, SEG_LVL_EOB) &&
+ vp9_get_segdata(xd, segment_id, SEG_LVL_EOB) == 0))) {
+ TX_SIZE sz = m->mbmi.txfm_size;
+ // FIXME(rbultje) code ternary symbol once all experiments are merged
+ vp9_write(bc, sz != TX_4X4, c->prob_tx[0]);
+ if (sz != TX_4X4 && ym <= TM_PRED)
+ vp9_write(bc, sz != TX_8X8, c->prob_tx[1]);
+ }
+}
+
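+// Walks the key frame in superblock (2x2 MB) order, writing the superblock
+// flag where enabled, then the per-macroblock mode info followed by its
+// packed residual tokens.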
+static void write_kfmodes(VP9_COMP* const cpi, vp9_writer* const bc) {
+ VP9_COMMON *const c = &cpi->common;
+ const int mis = c->mode_info_stride;
+ MACROBLOCKD *xd = &cpi->mb.e_mbd;
+ MODE_INFO *m;
+ int i;
+ int row, col;
+ int mb_row, mb_col;
+ int row_delta[4] = { 0, +1, 0, -1};
+ int col_delta[4] = { +1, -1, +1, +1};
+ TOKENEXTRA *tok = cpi->tok;
+ TOKENEXTRA *tok_end = tok + cpi->tok_count;
+
+ mb_row = 0;
+ for (row = 0; row < c->mb_rows; row += 2) {
+ m = c->mi + row * mis;
+
+ mb_col = 0;
+ for (col = 0; col < c->mb_cols; col += 2) {
+#if CONFIG_SUPERBLOCKS
+ vp9_write(bc, m->mbmi.encoded_as_sb, c->sb_coded);
+#endif
+ // Process the 4 MBs in the order:
+ // top-left, top-right, bottom-left, bottom-right
+ for (i = 0; i < 4; i++) {
+ int dy = row_delta[i];
+ int dx = col_delta[i];
+ int offset_extended = dy * mis + dx;
+
+ if ((mb_row >= c->mb_rows) || (mb_col >= c->mb_cols)) {
+ // MB lies outside frame, move on
+ mb_row += dy;
+ mb_col += dx;
+ m += offset_extended;
+ continue;
+ }
+
+ // Make sure the MacroBlockD mode info pointer is set correctly
+ xd->mode_info_context = m;
+
+ write_mb_modes_kf(c, xd, m, mis, bc);
+#ifdef ENTROPY_STATS
+ active_section = 8;
+#endif
+ assert(tok < tok_end);
+ pack_mb_tokens(bc, &tok, tok_end);
+
+#if CONFIG_SUPERBLOCKS
+ if (m->mbmi.encoded_as_sb) {
+ assert(!i);
+ mb_col += 2;
+ m += 2;
+ break;
+ }
+#endif
+ // Next MB
+ mb_row += dy;
+ mb_col += dx;
+ m += offset_extended;
+ }
+ }
+ mb_row += 2;
+ }
+}
+
+
+/* This function is used for debugging probability trees. */
+static void print_prob_tree(vp9_prob
+ coef_probs[BLOCK_TYPES][COEF_BANDS][PREV_COEF_CONTEXTS][ENTROPY_NODES]) {
+ /* print coef probability tree */
+ int i, j, k, l;
+ FILE *f = fopen("enc_tree_probs.txt", "a");
+ fprintf(f, "{\n");
+ for (i = 0; i < BLOCK_TYPES; i++) {
+ fprintf(f, " {\n");
+ for (j = 0; j < COEF_BANDS; j++) {
+ fprintf(f, " {\n");
+ for (k = 0; k < PREV_COEF_CONTEXTS; k++) {
+ fprintf(f, " {");
+ for (l = 0; l < ENTROPY_NODES; l++) {
+ fprintf(f, "%3u, ",
+ (unsigned int)(coef_probs [i][j][k][l]));
+ }
+ fprintf(f, " }\n");
+ }
+ fprintf(f, " }\n");
+ }
+ fprintf(f, " }\n");
+ }
+ fprintf(f, "}\n");
+ fclose(f);
+}
+
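+// Converts the coefficient token counts gathered during the encode loop into
+// per-frame probability estimates and branch counts (via
+// vp9_tree_probs_from_distribution) for the regular and hybrid transforms at
+// 4x4, 8x8 and 16x16 sizes.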
+static void build_coeff_contexts(VP9_COMP *cpi) {
+ int i = 0, j, k;
+#ifdef ENTROPY_STATS
+ int t = 0;
+#endif
+ for (i = 0; i < BLOCK_TYPES; ++i) {
+ for (j = 0; j < COEF_BANDS; ++j) {
+ for (k = 0; k < PREV_COEF_CONTEXTS; ++k) {
+ if (k >= 3 && ((i == 0 && j == 1) || (i > 0 && j == 0)))
+ continue;
+ vp9_tree_probs_from_distribution(
+ MAX_ENTROPY_TOKENS, vp9_coef_encodings, vp9_coef_tree,
+ cpi->frame_coef_probs [i][j][k],
+ cpi->frame_branch_ct [i][j][k],
+ cpi->coef_counts [i][j][k],
+ 256, 1
+ );
+#ifdef ENTROPY_STATS
+ if (!cpi->dummy_packing)
+ for (t = 0; t < MAX_ENTROPY_TOKENS; ++t)
+ context_counters[i][j][k][t] += cpi->coef_counts[i][j][k][t];
+#endif
+ }
+ }
+ }
+ for (i = 0; i < BLOCK_TYPES; ++i) {
+ for (j = 0; j < COEF_BANDS; ++j) {
+ for (k = 0; k < PREV_COEF_CONTEXTS; ++k) {
+ if (k >= 3 && ((i == 0 && j == 1) || (i > 0 && j == 0)))
+ continue;
+ vp9_tree_probs_from_distribution(
+ MAX_ENTROPY_TOKENS, vp9_coef_encodings, vp9_coef_tree,
+ cpi->frame_hybrid_coef_probs [i][j][k],
+ cpi->frame_hybrid_branch_ct [i][j][k],
+ cpi->hybrid_coef_counts [i][j][k],
+ 256, 1
+ );
+#ifdef ENTROPY_STATS
+ if (!cpi->dummy_packing)
+ for (t = 0; t < MAX_ENTROPY_TOKENS; ++t)
+ hybrid_context_counters[i][j][k][t] += cpi->hybrid_coef_counts[i][j][k][t];
+#endif
+ }
+ }
+ }
+
+ if (cpi->common.txfm_mode != ONLY_4X4) {
+ for (i = 0; i < BLOCK_TYPES_8X8; ++i) {
+ for (j = 0; j < COEF_BANDS; ++j) {
+ for (k = 0; k < PREV_COEF_CONTEXTS; ++k) {
+ /* at every context */
+ /* calc probs and branch cts for this frame only */
+ // vp9_prob new_p [ENTROPY_NODES];
+ // unsigned int branch_ct [ENTROPY_NODES] [2];
+ if (k >= 3 && ((i == 0 && j == 1) || (i > 0 && j == 0)))
+ continue;
+ vp9_tree_probs_from_distribution(
+ MAX_ENTROPY_TOKENS, vp9_coef_encodings, vp9_coef_tree,
+ cpi->frame_coef_probs_8x8 [i][j][k],
+ cpi->frame_branch_ct_8x8 [i][j][k],
+ cpi->coef_counts_8x8 [i][j][k],
+ 256, 1
+ );
+#ifdef ENTROPY_STATS
+ if (!cpi->dummy_packing)
+ for (t = 0; t < MAX_ENTROPY_TOKENS; ++t)
+ context_counters_8x8[i][j][k][t] += cpi->coef_counts_8x8[i][j][k][t];
+#endif
+ }
+ }
+ }
+ for (i = 0; i < BLOCK_TYPES_8X8; ++i) {
+ for (j = 0; j < COEF_BANDS; ++j) {
+ for (k = 0; k < PREV_COEF_CONTEXTS; ++k) {
+ /* at every context */
+ /* calc probs and branch cts for this frame only */
+ // vp9_prob new_p [ENTROPY_NODES];
+ // unsigned int branch_ct [ENTROPY_NODES] [2];
+ if (k >= 3 && ((i == 0 && j == 1) || (i > 0 && j == 0)))
+ continue;
+ vp9_tree_probs_from_distribution(
+ MAX_ENTROPY_TOKENS, vp9_coef_encodings, vp9_coef_tree,
+ cpi->frame_hybrid_coef_probs_8x8 [i][j][k],
+ cpi->frame_hybrid_branch_ct_8x8 [i][j][k],
+ cpi->hybrid_coef_counts_8x8 [i][j][k],
+ 256, 1
+ );
+#ifdef ENTROPY_STATS
+ if (!cpi->dummy_packing)
+ for (t = 0; t < MAX_ENTROPY_TOKENS; ++t)
+ hybrid_context_counters_8x8[i][j][k][t] += cpi->hybrid_coef_counts_8x8[i][j][k][t];
+#endif
+ }
+ }
+ }
+ }
+
+ if (cpi->common.txfm_mode > ALLOW_8X8) {
+ for (i = 0; i < BLOCK_TYPES_16X16; ++i) {
+ for (j = 0; j < COEF_BANDS; ++j) {
+ for (k = 0; k < PREV_COEF_CONTEXTS; ++k) {
+ if (k >= 3 && ((i == 0 && j == 1) || (i > 0 && j == 0)))
+ continue;
+ vp9_tree_probs_from_distribution(
+ MAX_ENTROPY_TOKENS, vp9_coef_encodings, vp9_coef_tree,
+ cpi->frame_coef_probs_16x16[i][j][k],
+ cpi->frame_branch_ct_16x16[i][j][k],
+ cpi->coef_counts_16x16[i][j][k], 256, 1);
+#ifdef ENTROPY_STATS
+ if (!cpi->dummy_packing)
+ for (t = 0; t < MAX_ENTROPY_TOKENS; ++t)
+ context_counters_16x16[i][j][k][t] += cpi->coef_counts_16x16[i][j][k][t];
+#endif
+ }
+ }
+ }
+ }
+ for (i = 0; i < BLOCK_TYPES_16X16; ++i) {
+ for (j = 0; j < COEF_BANDS; ++j) {
+ for (k = 0; k < PREV_COEF_CONTEXTS; ++k) {
+ if (k >= 3 && ((i == 0 && j == 1) || (i > 0 && j == 0)))
+ continue;
+ vp9_tree_probs_from_distribution(
+ MAX_ENTROPY_TOKENS, vp9_coef_encodings, vp9_coef_tree,
+ cpi->frame_hybrid_coef_probs_16x16[i][j][k],
+ cpi->frame_hybrid_branch_ct_16x16[i][j][k],
+ cpi->hybrid_coef_counts_16x16[i][j][k], 256, 1);
+#ifdef ENTROPY_STATS
+ if (!cpi->dummy_packing)
+ for (t = 0; t < MAX_ENTROPY_TOKENS; ++t)
+ hybrid_context_counters_16x16[i][j][k][t] += cpi->hybrid_coef_counts_16x16[i][j][k][t];
+#endif
+ }
+ }
+ }
+}
+
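+// Makes two passes over one coefficient probability table: a dry run that
+// totals the bit savings available from updating each tree node, then, if an
+// update is worthwhile overall, a second pass that writes a per-node update
+// flag and the differentially coded new probability.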
+static void update_coef_probs_common(
+ vp9_writer* const bc,
+ vp9_prob new_frame_coef_probs[BLOCK_TYPES][COEF_BANDS]
+ [PREV_COEF_CONTEXTS][ENTROPY_NODES],
+ vp9_prob old_frame_coef_probs[BLOCK_TYPES][COEF_BANDS]
+ [PREV_COEF_CONTEXTS][ENTROPY_NODES],
+ unsigned int frame_branch_ct[BLOCK_TYPES][COEF_BANDS]
+ [PREV_COEF_CONTEXTS][ENTROPY_NODES][2]) {
+ int i, j, k, t;
+ int update[2] = {0, 0};
+ int savings;
+ // vp9_prob bestupd = find_coef_update_prob(cpi);
+
+  /* dry run to see if any update is needed at all */
+ savings = 0;
+ for (i = 0; i < BLOCK_TYPES; ++i) {
+ for (j = !i; j < COEF_BANDS; ++j) {
+ int prev_coef_savings[ENTROPY_NODES] = {0};
+ for (k = 0; k < PREV_COEF_CONTEXTS; ++k) {
+ for (t = 0; t < ENTROPY_NODES; ++t) {
+ vp9_prob newp = new_frame_coef_probs[i][j][k][t];
+ const vp9_prob oldp = old_frame_coef_probs[i][j][k][t];
+ const vp9_prob upd = COEF_UPDATE_PROB;
+ int s = prev_coef_savings[t];
+ int u = 0;
+ if (k >= 3 && ((i == 0 && j == 1) || (i > 0 && j == 0)))
+ continue;
+#if defined(SEARCH_NEWP)
+ s = prob_diff_update_savings_search(
+ frame_branch_ct[i][j][k][t],
+ oldp, &newp, upd);
+ if (s > 0 && newp != oldp)
+ u = 1;
+ if (u)
+ savings += s - (int)(vp9_cost_zero(upd));
+ else
+ savings -= (int)(vp9_cost_zero(upd));
+#else
+ s = prob_update_savings(
+ frame_branch_ct[i][j][k][t],
+ oldp, newp, upd);
+ if (s > 0)
+ u = 1;
+ if (u)
+ savings += s;
+#endif
+
+ update[u]++;
+ }
+ }
+ }
+ }
+
+ // printf("Update %d %d, savings %d\n", update[0], update[1], savings);
+ /* Is coef updated at all */
+ if (update[1] == 0 || savings < 0) {
+ vp9_write_bit(bc, 0);
+ } else {
+ vp9_write_bit(bc, 1);
+ for (i = 0; i < BLOCK_TYPES; ++i) {
+ for (j = !i; j < COEF_BANDS; ++j) {
+ int prev_coef_savings[ENTROPY_NODES] = {0};
+ for (k = 0; k < PREV_COEF_CONTEXTS; ++k) {
+ // calc probs and branch cts for this frame only
+ for (t = 0; t < ENTROPY_NODES; ++t) {
+ vp9_prob newp = new_frame_coef_probs[i][j][k][t];
+ vp9_prob *oldp = old_frame_coef_probs[i][j][k] + t;
+ const vp9_prob upd = COEF_UPDATE_PROB;
+ int s = prev_coef_savings[t];
+ int u = 0;
+ if (k >= 3 && ((i == 0 && j == 1) || (i > 0 && j == 0)))
+ continue;
+
+#if defined(SEARCH_NEWP)
+ s = prob_diff_update_savings_search(
+ frame_branch_ct[i][j][k][t],
+ *oldp, &newp, upd);
+ if (s > 0 && newp != *oldp)
+ u = 1;
+#else
+ s = prob_update_savings(
+ frame_branch_ct[i][j][k][t],
+ *oldp, newp, upd);
+ if (s > 0)
+ u = 1;
+#endif
+ vp9_write(bc, u, upd);
+#ifdef ENTROPY_STATS
+ if (!cpi->dummy_packing)
+ ++ tree_update_hist [i][j][k][t] [u];
+#endif
+ if (u) {
+ /* send/use new probability */
+ write_prob_diff_update(bc, newp, *oldp);
+ *oldp = newp;
+ }
+ }
+ }
+ }
+ }
+ }
+}
+
+static void update_coef_probs(VP9_COMP* const cpi, vp9_writer* const bc) {
+ vp9_clear_system_state();
+
+  // Build the coefficient contexts based on counts collected in the encode loop
+ build_coeff_contexts(cpi);
+
+ update_coef_probs_common(bc,
+ cpi->frame_coef_probs,
+ cpi->common.fc.coef_probs,
+ cpi->frame_branch_ct);
+
+ update_coef_probs_common(bc,
+ cpi->frame_hybrid_coef_probs,
+ cpi->common.fc.hybrid_coef_probs,
+ cpi->frame_hybrid_branch_ct);
+
+ /* do not do this if not even allowed */
+ if (cpi->common.txfm_mode != ONLY_4X4) {
+ update_coef_probs_common(bc,
+ cpi->frame_coef_probs_8x8,
+ cpi->common.fc.coef_probs_8x8,
+ cpi->frame_branch_ct_8x8);
+
+ update_coef_probs_common(bc,
+ cpi->frame_hybrid_coef_probs_8x8,
+ cpi->common.fc.hybrid_coef_probs_8x8,
+ cpi->frame_hybrid_branch_ct_8x8);
+ }
+
+ if (cpi->common.txfm_mode > ALLOW_8X8) {
+ update_coef_probs_common(bc,
+ cpi->frame_coef_probs_16x16,
+ cpi->common.fc.coef_probs_16x16,
+ cpi->frame_branch_ct_16x16);
+ update_coef_probs_common(bc,
+ cpi->frame_hybrid_coef_probs_16x16,
+ cpi->common.fc.hybrid_coef_probs_16x16,
+ cpi->frame_hybrid_branch_ct_16x16);
+ }
+}
+
+#ifdef PACKET_TESTING
+FILE *vpxlogc = 0;
+#endif
+
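+// Writes a quantizer delta as a presence flag; when nonzero, a 4-bit
+// magnitude followed by a sign bit.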
+static void put_delta_q(vp9_writer *bc, int delta_q) {
+ if (delta_q != 0) {
+ vp9_write_bit(bc, 1);
+ vp9_write_literal(bc, abs(delta_q), 4);
+
+ if (delta_q < 0)
+ vp9_write_bit(bc, 1);
+ else
+ vp9_write_bit(bc, 0);
+ } else
+ vp9_write_bit(bc, 0);
+}
+
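+// Evaluates each of the eight candidate key-frame Y-mode probability sets
+// against the mode counts collected for this frame and records the index of
+// the cheapest one in kf_ymode_probs_index.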
+static void decide_kf_ymode_entropy(VP9_COMP *cpi) {
+
+ int mode_cost[MB_MODE_COUNT];
+ int cost;
+ int bestcost = INT_MAX;
+ int bestindex = 0;
+ int i, j;
+
+ for (i = 0; i < 8; i++) {
+ vp9_cost_tokens(mode_cost, cpi->common.kf_ymode_prob[i], vp9_kf_ymode_tree);
+ cost = 0;
+ for (j = 0; j < VP9_YMODES; j++) {
+ cost += mode_cost[j] * cpi->ymode_count[j];
+ }
+#if CONFIG_SUPERBLOCKS
+ vp9_cost_tokens(mode_cost, cpi->common.sb_kf_ymode_prob[i],
+ vp9_sb_ymode_tree);
+ for (j = 0; j < VP9_I32X32_MODES; j++) {
+ cost += mode_cost[j] * cpi->sb_ymode_count[j];
+ }
+#endif
+ if (cost < bestcost) {
+ bestindex = i;
+ bestcost = cost;
+ }
+ }
+ cpi->common.kf_ymode_probs_index = bestindex;
+}
+
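+// Builds, for each segment, a bitmask of the reference frames used by its
+// macroblocks and enables the SEG_LVL_REF_FRAME feature with that mask.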
+static void segment_reference_frames(VP9_COMP *cpi) {
+ VP9_COMMON *oci = &cpi->common;
+ MODE_INFO *mi = oci->mi;
+ int ref[MAX_MB_SEGMENTS] = {0};
+ int i, j;
+ int mb_index = 0;
+ MACROBLOCKD *const xd = &cpi->mb.e_mbd;
+
+ for (i = 0; i < oci->mb_rows; i++) {
+ for (j = 0; j < oci->mb_cols; j++, mb_index++) {
+ ref[mi[mb_index].mbmi.segment_id] |= (1 << mi[mb_index].mbmi.ref_frame);
+ }
+ mb_index++;
+ }
+ for (i = 0; i < MAX_MB_SEGMENTS; i++) {
+ vp9_enable_segfeature(xd, i, SEG_LVL_REF_FRAME);
+ vp9_set_segdata(xd, i, SEG_LVL_REF_FRAME, ref[i]);
+ }
+}
+
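+// Top-level bitstream packing: writes the uncompressed frame tag (plus the
+// start code and size/scale fields for key frames), the compressed header
+// partition (segmentation, loop filter, quantizer, reference and probability
+// updates), and finally the mode/MV/residual partition.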
+void vp9_pack_bitstream(VP9_COMP *cpi, unsigned char *dest,
+ unsigned long *size) {
+ int i, j;
+ VP9_HEADER oh;
+ VP9_COMMON *const pc = &cpi->common;
+ vp9_writer header_bc, residual_bc;
+ MACROBLOCKD *const xd = &cpi->mb.e_mbd;
+ int extra_bytes_packed = 0;
+
+ unsigned char *cx_data = dest;
+
+ oh.show_frame = (int) pc->show_frame;
+ oh.type = (int)pc->frame_type;
+ oh.version = pc->version;
+ oh.first_partition_length_in_bytes = 0;
+
+ cx_data += 3;
+
+#if defined(SECTIONBITS_OUTPUT)
+ Sectionbits[active_section = 1] += sizeof(VP9_HEADER) * 8 * 256;
+#endif
+
+ compute_update_table();
+
+  /* vp9_kf_default_bmode_probs() is called in vp9_setup_key_frame() once
+   * for each key frame before the frame is encoded. pc->kf_bmode_prob is not
+   * changed anywhere else, so there is no need to call it again here. --yw
+   * vp9_kf_default_bmode_probs( pc->kf_bmode_prob);
+   */
+
+  /* Every keyframe sends the start code, width, height, scale factor,
+   * clamp and color type.
+   */
+ if (oh.type == KEY_FRAME) {
+ int v;
+
+    // Start / sync code
+ cx_data[0] = 0x9D;
+ cx_data[1] = 0x01;
+ cx_data[2] = 0x2a;
+
+ v = (pc->horiz_scale << 14) | pc->Width;
+ cx_data[3] = v;
+ cx_data[4] = v >> 8;
+
+ v = (pc->vert_scale << 14) | pc->Height;
+ cx_data[5] = v;
+ cx_data[6] = v >> 8;
+
+ extra_bytes_packed = 7;
+ cx_data += extra_bytes_packed;
+
+ vp9_start_encode(&header_bc, cx_data);
+
+ // signal clr type
+ vp9_write_bit(&header_bc, pc->clr_type);
+ vp9_write_bit(&header_bc, pc->clamp_type);
+
+ } else {
+ vp9_start_encode(&header_bc, cx_data);
+ }
+
+ // Signal whether or not Segmentation is enabled
+ vp9_write_bit(&header_bc, (xd->segmentation_enabled) ? 1 : 0);
+
+ // Indicate which features are enabled
+ if (xd->segmentation_enabled) {
+ // Indicate whether or not the segmentation map is being updated.
+ vp9_write_bit(&header_bc, (xd->update_mb_segmentation_map) ? 1 : 0);
+
+ // If it is, then indicate the method that will be used.
+ if (xd->update_mb_segmentation_map) {
+ // Select the coding strategy (temporal or spatial)
+ vp9_choose_segmap_coding_method(cpi);
+ // Send the tree probabilities used to decode unpredicted
+ // macro-block segments
+ for (i = 0; i < MB_FEATURE_TREE_PROBS; i++) {
+ int data = xd->mb_segment_tree_probs[i];
+
+ if (data != 255) {
+ vp9_write_bit(&header_bc, 1);
+ vp9_write_literal(&header_bc, data, 8);
+ } else {
+ vp9_write_bit(&header_bc, 0);
+ }
+ }
+
+ // Write out the chosen coding method.
+ vp9_write_bit(&header_bc, (pc->temporal_update) ? 1 : 0);
+ if (pc->temporal_update) {
+ for (i = 0; i < PREDICTION_PROBS; i++) {
+ int data = pc->segment_pred_probs[i];
+
+ if (data != 255) {
+ vp9_write_bit(&header_bc, 1);
+ vp9_write_literal(&header_bc, data, 8);
+ } else {
+ vp9_write_bit(&header_bc, 0);
+ }
+ }
+ }
+ }
+
+ vp9_write_bit(&header_bc, (xd->update_mb_segmentation_data) ? 1 : 0);
+
+ // segment_reference_frames(cpi);
+
+ if (xd->update_mb_segmentation_data) {
+ signed char Data;
+
+ vp9_write_bit(&header_bc, (xd->mb_segment_abs_delta) ? 1 : 0);
+
+      // For each segment id...
+ for (i = 0; i < MAX_MB_SEGMENTS; i++) {
+ // For each segmentation codable feature...
+ for (j = 0; j < SEG_LVL_MAX; j++) {
+ Data = vp9_get_segdata(xd, i, j);
+
+ // If the feature is enabled...
+ if (vp9_segfeature_active(xd, i, j)) {
+ vp9_write_bit(&header_bc, 1);
+
+ // Is the segment data signed..
+ if (vp9_is_segfeature_signed(j)) {
+ // Encode the relevant feature data
+ if (Data < 0) {
+ Data = - Data;
+ vp9_write_literal(&header_bc, Data,
+ vp9_seg_feature_data_bits(j));
+ vp9_write_bit(&header_bc, 1);
+ } else {
+ vp9_write_literal(&header_bc, Data,
+ vp9_seg_feature_data_bits(j));
+ vp9_write_bit(&header_bc, 0);
+ }
+ }
+ // Unsigned data element so no sign bit needed
+ else
+ vp9_write_literal(&header_bc, Data,
+ vp9_seg_feature_data_bits(j));
+ } else
+ vp9_write_bit(&header_bc, 0);
+ }
+ }
+ }
+ }
+
+ // Encode the common prediction model status flag probability updates for
+ // the reference frame
+ update_refpred_stats(cpi);
+ if (pc->frame_type != KEY_FRAME) {
+ for (i = 0; i < PREDICTION_PROBS; i++) {
+ if (cpi->ref_pred_probs_update[i]) {
+ vp9_write_bit(&header_bc, 1);
+ vp9_write_literal(&header_bc, pc->ref_pred_probs[i], 8);
+ } else {
+ vp9_write_bit(&header_bc, 0);
+ }
+ }
+ }
+
+#if CONFIG_SUPERBLOCKS
+ {
+ /* sb mode probability */
+ const int sb_max = (((pc->mb_rows + 1) >> 1) * ((pc->mb_cols + 1) >> 1));
+
+ pc->sb_coded = get_prob(sb_max - cpi->sb_count, sb_max);
+ vp9_write_literal(&header_bc, pc->sb_coded, 8);
+ }
+#endif
+
+ {
+ if (pc->txfm_mode == TX_MODE_SELECT) {
+ pc->prob_tx[0] = get_prob(cpi->txfm_count[0] + cpi->txfm_count_8x8p[0],
+ cpi->txfm_count[0] + cpi->txfm_count[1] + cpi->txfm_count[2] +
+ cpi->txfm_count_8x8p[0] + cpi->txfm_count_8x8p[1]);
+ pc->prob_tx[1] = get_prob(cpi->txfm_count[1], cpi->txfm_count[1] + cpi->txfm_count[2]);
+ } else {
+ pc->prob_tx[0] = 128;
+ pc->prob_tx[1] = 128;
+ }
+ vp9_write_literal(&header_bc, pc->txfm_mode, 2);
+ if (pc->txfm_mode == TX_MODE_SELECT) {
+ vp9_write_literal(&header_bc, pc->prob_tx[0], 8);
+ vp9_write_literal(&header_bc, pc->prob_tx[1], 8);
+ }
+ }
+
+ // Encode the loop filter level and type
+ vp9_write_bit(&header_bc, pc->filter_type);
+ vp9_write_literal(&header_bc, pc->filter_level, 6);
+ vp9_write_literal(&header_bc, pc->sharpness_level, 3);
+
+ // Write out loop filter deltas applied at the MB level based on mode or ref frame (if they are enabled).
+ vp9_write_bit(&header_bc, (xd->mode_ref_lf_delta_enabled) ? 1 : 0);
+
+ if (xd->mode_ref_lf_delta_enabled) {
+ // Do the deltas need to be updated
+ int send_update = xd->mode_ref_lf_delta_update;
+
+ vp9_write_bit(&header_bc, send_update);
+ if (send_update) {
+ int Data;
+
+ // Send update
+ for (i = 0; i < MAX_REF_LF_DELTAS; i++) {
+ Data = xd->ref_lf_deltas[i];
+
+ // Frame level data
+ if (xd->ref_lf_deltas[i] != xd->last_ref_lf_deltas[i]) {
+ xd->last_ref_lf_deltas[i] = xd->ref_lf_deltas[i];
+ vp9_write_bit(&header_bc, 1);
+
+ if (Data > 0) {
+ vp9_write_literal(&header_bc, (Data & 0x3F), 6);
+ vp9_write_bit(&header_bc, 0); // sign
+ } else {
+ Data = -Data;
+ vp9_write_literal(&header_bc, (Data & 0x3F), 6);
+ vp9_write_bit(&header_bc, 1); // sign
+ }
+ } else {
+ vp9_write_bit(&header_bc, 0);
+ }
+ }
+
+ // Send update
+ for (i = 0; i < MAX_MODE_LF_DELTAS; i++) {
+ Data = xd->mode_lf_deltas[i];
+
+ if (xd->mode_lf_deltas[i] != xd->last_mode_lf_deltas[i]) {
+ xd->last_mode_lf_deltas[i] = xd->mode_lf_deltas[i];
+ vp9_write_bit(&header_bc, 1);
+
+ if (Data > 0) {
+ vp9_write_literal(&header_bc, (Data & 0x3F), 6);
+ vp9_write_bit(&header_bc, 0); // sign
+ } else {
+ Data = -Data;
+ vp9_write_literal(&header_bc, (Data & 0x3F), 6);
+ vp9_write_bit(&header_bc, 1); // sign
+ }
+ } else {
+ vp9_write_bit(&header_bc, 0);
+ }
+ }
+ }
+ }
+
+  // Signal here if multi-token partition is enabled
+ // vp9_write_literal(&header_bc, pc->multi_token_partition, 2);
+ vp9_write_literal(&header_bc, 0, 2);
+
+ // Frame Q baseline quantizer index
+ vp9_write_literal(&header_bc, pc->base_qindex, QINDEX_BITS);
+
+  // Transmit DC, second-order and UV quantizer delta information
+ put_delta_q(&header_bc, pc->y1dc_delta_q);
+ put_delta_q(&header_bc, pc->y2dc_delta_q);
+ put_delta_q(&header_bc, pc->y2ac_delta_q);
+ put_delta_q(&header_bc, pc->uvdc_delta_q);
+ put_delta_q(&header_bc, pc->uvac_delta_q);
+
+ // When there is a key frame all reference buffers are updated using the new key frame
+ if (pc->frame_type != KEY_FRAME) {
+ // Should the GF or ARF be updated using the transmitted frame or buffer
+ vp9_write_bit(&header_bc, pc->refresh_golden_frame);
+ vp9_write_bit(&header_bc, pc->refresh_alt_ref_frame);
+
+ // For inter frames the current default behavior is that when
+ // cm->refresh_golden_frame is set we copy the old GF over to
+ // the ARF buffer. This is purely an encoder decision at present.
+ if (pc->refresh_golden_frame)
+ pc->copy_buffer_to_arf = 2;
+
+    // If not being updated from the current frame, should either GF or ARF be updated from another buffer?
+ if (!pc->refresh_golden_frame)
+ vp9_write_literal(&header_bc, pc->copy_buffer_to_gf, 2);
+
+ if (!pc->refresh_alt_ref_frame)
+ vp9_write_literal(&header_bc, pc->copy_buffer_to_arf, 2);
+
+ // Indicate reference frame sign bias for Golden and ARF frames (always 0 for last frame buffer)
+ vp9_write_bit(&header_bc, pc->ref_frame_sign_bias[GOLDEN_FRAME]);
+ vp9_write_bit(&header_bc, pc->ref_frame_sign_bias[ALTREF_FRAME]);
+
+ // Signal whether to allow high MV precision
+ vp9_write_bit(&header_bc, (xd->allow_high_precision_mv) ? 1 : 0);
+ if (pc->mcomp_filter_type == SWITCHABLE) {
+ /* Check to see if only one of the filters is actually used */
+ int count[VP9_SWITCHABLE_FILTERS];
+ int i, j, c = 0;
+ for (i = 0; i < VP9_SWITCHABLE_FILTERS; ++i) {
+ count[i] = 0;
+ for (j = 0; j <= VP9_SWITCHABLE_FILTERS; ++j) {
+ count[i] += cpi->switchable_interp_count[j][i];
+ }
+ c += (count[i] > 0);
+ }
+ if (c == 1) {
+ /* Only one filter is used. So set the filter at frame level */
+ for (i = 0; i < VP9_SWITCHABLE_FILTERS; ++i) {
+ if (count[i]) {
+ pc->mcomp_filter_type = vp9_switchable_interp[i];
+ break;
+ }
+ }
+ }
+ }
+ // Signal the type of subpel filter to use
+ vp9_write_bit(&header_bc, (pc->mcomp_filter_type == SWITCHABLE));
+ if (pc->mcomp_filter_type != SWITCHABLE)
+ vp9_write_literal(&header_bc, (pc->mcomp_filter_type), 2);
+ }
+
+ vp9_write_bit(&header_bc, pc->refresh_entropy_probs);
+
+ if (pc->frame_type != KEY_FRAME)
+ vp9_write_bit(&header_bc, pc->refresh_last_frame);
+
+#ifdef ENTROPY_STATS
+ if (pc->frame_type == INTER_FRAME)
+ active_section = 0;
+ else
+ active_section = 7;
+#endif
+
+ vp9_clear_system_state(); // __asm emms;
+
+ vp9_copy(cpi->common.fc.pre_coef_probs, cpi->common.fc.coef_probs);
+ vp9_copy(cpi->common.fc.pre_hybrid_coef_probs, cpi->common.fc.hybrid_coef_probs);
+ vp9_copy(cpi->common.fc.pre_coef_probs_8x8, cpi->common.fc.coef_probs_8x8);
+ vp9_copy(cpi->common.fc.pre_hybrid_coef_probs_8x8, cpi->common.fc.hybrid_coef_probs_8x8);
+ vp9_copy(cpi->common.fc.pre_coef_probs_16x16, cpi->common.fc.coef_probs_16x16);
+ vp9_copy(cpi->common.fc.pre_hybrid_coef_probs_16x16, cpi->common.fc.hybrid_coef_probs_16x16);
+ vp9_copy(cpi->common.fc.pre_ymode_prob, cpi->common.fc.ymode_prob);
+ vp9_copy(cpi->common.fc.pre_uv_mode_prob, cpi->common.fc.uv_mode_prob);
+ vp9_copy(cpi->common.fc.pre_bmode_prob, cpi->common.fc.bmode_prob);
+ vp9_copy(cpi->common.fc.pre_sub_mv_ref_prob, cpi->common.fc.sub_mv_ref_prob);
+ vp9_copy(cpi->common.fc.pre_mbsplit_prob, cpi->common.fc.mbsplit_prob);
+ vp9_copy(cpi->common.fc.pre_i8x8_mode_prob, cpi->common.fc.i8x8_mode_prob);
+ cpi->common.fc.pre_nmvc = cpi->common.fc.nmvc;
+ vp9_zero(cpi->sub_mv_ref_count);
+ vp9_zero(cpi->mbsplit_count);
+ vp9_zero(cpi->common.fc.mv_ref_ct)
+ vp9_zero(cpi->common.fc.mv_ref_ct_a)
+
+ update_coef_probs(cpi, &header_bc);
+
+#ifdef ENTROPY_STATS
+ active_section = 2;
+#endif
+
+ // Write out the mb_no_coeff_skip flag
+ vp9_write_bit(&header_bc, pc->mb_no_coeff_skip);
+ if (pc->mb_no_coeff_skip) {
+ int k;
+
+ vp9_update_skip_probs(cpi);
+ for (k = 0; k < MBSKIP_CONTEXTS; ++k)
+ vp9_write_literal(&header_bc, pc->mbskip_pred_probs[k], 8);
+ }
+
+ if (pc->frame_type == KEY_FRAME) {
+ if (!pc->kf_ymode_probs_update) {
+ vp9_write_literal(&header_bc, pc->kf_ymode_probs_index, 3);
+ }
+ } else {
+ // Update the probabilities used to encode reference frame data
+ update_ref_probs(cpi);
+
+#ifdef ENTROPY_STATS
+ active_section = 1;
+#endif
+
+#if CONFIG_PRED_FILTER
+ // Write the prediction filter mode used for this frame
+ vp9_write_literal(&header_bc, pc->pred_filter_mode, 2);
+
+ // Write prediction filter on/off probability if signaling at MB level
+ if (pc->pred_filter_mode == 2)
+ vp9_write_literal(&header_bc, pc->prob_pred_filter_off, 8);
+
+#endif
+ if (pc->mcomp_filter_type == SWITCHABLE)
+ update_switchable_interp_probs(cpi, &header_bc);
+
+ vp9_write_literal(&header_bc, pc->prob_intra_coded, 8);
+ vp9_write_literal(&header_bc, pc->prob_last_coded, 8);
+ vp9_write_literal(&header_bc, pc->prob_gf_coded, 8);
+
+ {
+ const int comp_pred_mode = cpi->common.comp_pred_mode;
+ const int use_compound_pred = (comp_pred_mode != SINGLE_PREDICTION_ONLY);
+ const int use_hybrid_pred = (comp_pred_mode == HYBRID_PREDICTION);
+
+ vp9_write(&header_bc, use_compound_pred, 128);
+ if (use_compound_pred) {
+ vp9_write(&header_bc, use_hybrid_pred, 128);
+ if (use_hybrid_pred) {
+ for (i = 0; i < COMP_PRED_CONTEXTS; i++) {
+ pc->prob_comppred[i] = get_binary_prob(cpi->single_pred_count[i],
+ cpi->comp_pred_count[i]);
+ vp9_write_literal(&header_bc, pc->prob_comppred[i], 8);
+ }
+ }
+ }
+ }
+
+ update_mbintra_mode_probs(cpi, &header_bc);
+
+#if CONFIG_NEW_MVREF
+    // Temporary default probabilities for encoding the MV ref id signal
+ vpx_memset(xd->mb_mv_ref_id_probs, 192, sizeof(xd->mb_mv_ref_id_probs));
+#endif
+
+ vp9_write_nmvprobs(cpi, xd->allow_high_precision_mv, &header_bc);
+ }
+
+ vp9_stop_encode(&header_bc);
+
+ oh.first_partition_length_in_bytes = header_bc.pos;
+
+ /* update frame tag */
+ {
+ int v = (oh.first_partition_length_in_bytes << 5) |
+ (oh.show_frame << 4) |
+ (oh.version << 1) |
+ oh.type;
+
+ dest[0] = v;
+ dest[1] = v >> 8;
+ dest[2] = v >> 16;
+ }
+
+ *size = VP9_HEADER_SIZE + extra_bytes_packed + header_bc.pos;
+ vp9_start_encode(&residual_bc, cx_data + header_bc.pos);
+
+ if (pc->frame_type == KEY_FRAME) {
+ decide_kf_ymode_entropy(cpi);
+ write_kfmodes(cpi, &residual_bc);
+ } else {
+ pack_inter_mode_mvs(cpi, &residual_bc);
+ vp9_update_mode_context(&cpi->common);
+ }
+
+
+ vp9_stop_encode(&residual_bc);
+
+ *size += residual_bc.pos;
+
+}
+
+#ifdef ENTROPY_STATS
+void print_tree_update_probs() {
+ int i, j, k, l;
+ FILE *f = fopen("coefupdprob.h", "w");
+ int Sum;
+ fprintf(f, "\n/* Update probabilities for token entropy tree. */\n\n");
+
+ fprintf(f, "const vp9_prob\n"
+ "vp9_coef_update_probs[BLOCK_TYPES]\n"
+ " [COEF_BANDS]\n"
+ " [PREV_COEF_CONTEXTS]\n"
+ " [ENTROPY_NODES] = {\n");
+ for (i = 0; i < BLOCK_TYPES; i++) {
+ fprintf(f, " { \n");
+ for (j = 0; j < COEF_BANDS; j++) {
+ fprintf(f, " {\n");
+ for (k = 0; k < PREV_COEF_CONTEXTS; k++) {
+ fprintf(f, " {");
+ for (l = 0; l < ENTROPY_NODES; l++) {
+ fprintf(f, "%3ld, ",
+ get_binary_prob(tree_update_hist[i][j][k][l][0],
+ tree_update_hist[i][j][k][l][1]));
+ }
+ fprintf(f, "},\n");
+ }
+ fprintf(f, " },\n");
+ }
+ fprintf(f, " },\n");
+ }
+ fprintf(f, "};\n");
+
+ fprintf(f, "const vp9_prob\n"
+ "vp9_coef_update_probs_8x8[BLOCK_TYPES_8X8]\n"
+ " [COEF_BANDS]\n"
+ " [PREV_COEF_CONTEXTS]\n"
+ " [ENTROPY_NODES] = {\n");
+ for (i = 0; i < BLOCK_TYPES_8X8; i++) {
+ fprintf(f, " { \n");
+ for (j = 0; j < COEF_BANDS; j++) {
+ fprintf(f, " {\n");
+ for (k = 0; k < PREV_COEF_CONTEXTS; k++) {
+ fprintf(f, " {");
+ for (l = 0; l < MAX_ENTROPY_TOKENS - 1; l++) {
+ fprintf(f, "%3ld, ",
+ get_binary_prob(tree_update_hist_8x8[i][j][k][l][0],
+ tree_update_hist_8x8[i][j][k][l][1]));
+ }
+ fprintf(f, "},\n");
+ }
+ fprintf(f, " },\n");
+ }
+ fprintf(f, " },\n");
+  }
+  fprintf(f, "};\n");
+
+ fprintf(f, "const vp9_prob\n"
+ "vp9_coef_update_probs_16x16[BLOCK_TYPES_16X16]\n"
+ " [COEF_BANDS]\n"
+ " [PREV_COEF_CONTEXTS]\n"
+ " [ENTROPY_NODES] = {\n");
+ for (i = 0; i < BLOCK_TYPES_16X16; i++) {
+ fprintf(f, " { \n");
+ for (j = 0; j < COEF_BANDS; j++) {
+ fprintf(f, " {\n");
+ for (k = 0; k < PREV_COEF_CONTEXTS; k++) {
+ fprintf(f, " {");
+ for (l = 0; l < MAX_ENTROPY_TOKENS - 1; l++) {
+ fprintf(f, "%3ld, ",
+ get_binary_prob(tree_update_hist_16x16[i][j][k][l][0],
+ tree_update_hist_16x16[i][j][k][l][1]));
+ }
+ fprintf(f, "},\n");
+ }
+ fprintf(f, " },\n");
+ }
+ fprintf(f, " },\n");
+  }
+  fprintf(f, "};\n");
+
+ fclose(f);
+ f = fopen("treeupdate.bin", "wb");
+ fwrite(tree_update_hist, sizeof(tree_update_hist), 1, f);
+ fwrite(tree_update_hist_8x8, sizeof(tree_update_hist_8x8), 1, f);
+ fwrite(tree_update_hist_16x16, sizeof(tree_update_hist_16x16), 1, f);
+ fclose(f);
+}
+#endif
--- /dev/null
+++ b/vp9/encoder/bitstream.h
@@ -1,0 +1,17 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef __INC_BITSTREAM_H
+#define __INC_BITSTREAM_H
+
+void vp9_update_skip_probs(VP9_COMP *cpi);
+
+#endif
--- /dev/null
+++ b/vp9/encoder/block.h
@@ -1,0 +1,184 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef __INC_BLOCK_H
+#define __INC_BLOCK_H
+
+#include "vp9/common/onyx.h"
+#include "vp9/common/entropymv.h"
+#include "vp9/common/entropy.h"
+#include "vpx_ports/mem.h"
+#include "vp9/common/onyxc_int.h"
+
+// motion search site
+typedef struct {
+ MV mv;
+ int offset;
+} search_site;
+
+typedef struct block {
+ // 16 Y blocks, 4 U blocks, 4 V blocks each with 16 entries
+ short *src_diff;
+ short *coeff;
+
+ // 16 Y blocks, 4 U blocks, 4 V blocks each with 16 entries
+ short *quant;
+ short *quant_fast; // fast quant deprecated for now
+ unsigned char *quant_shift;
+ short *zbin;
+ short *zbin_8x8;
+ short *zbin_16x16;
+ short *zrun_zbin_boost;
+ short *zrun_zbin_boost_8x8;
+ short *zrun_zbin_boost_16x16;
+ short *round;
+
+ // Zbin Over Quant value
+ short zbin_extra;
+
+ unsigned char **base_src;
+ unsigned char **base_second_src;
+ int src;
+ int src_stride;
+
+ int eob_max_offset;
+ int eob_max_offset_8x8;
+ int eob_max_offset_16x16;
+} BLOCK;
+
+typedef struct {
+ int count;
+ struct {
+ B_PREDICTION_MODE mode;
+ int_mv mv;
+ int_mv second_mv;
+ } bmi[16];
+} PARTITION_INFO;
+
+// Structure to hold snapshot of coding context during the mode picking process
+// TODO Do we need all of these?
+typedef struct {
+ MODE_INFO mic;
+ PARTITION_INFO partition_info;
+ int_mv best_ref_mv;
+ int_mv second_best_ref_mv;
+#if CONFIG_NEWBESTREFMV || CONFIG_NEW_MVREF
+ int_mv ref_mvs[MAX_REF_FRAMES][MAX_MV_REFS];
+#endif
+ int rate;
+ int distortion;
+ int64_t intra_error;
+ int best_mode_index;
+ int rddiv;
+ int rdmult;
+ int hybrid_pred_diff;
+ int comp_pred_diff;
+ int single_pred_diff;
+ int64_t txfm_rd_diff[NB_TXFM_MODES];
+} PICK_MODE_CONTEXT;
+
+typedef struct macroblock {
+ DECLARE_ALIGNED(16, short, src_diff[400]); // 16x16 Y 8x8 U 8x8 V 4x4 2nd Y
+ DECLARE_ALIGNED(16, short, coeff[400]); // 16x16 Y 8x8 U 8x8 V 4x4 2nd Y
+ DECLARE_ALIGNED(16, unsigned char, thismb[256]); // 16x16 Y
+
+ unsigned char *thismb_ptr;
+ // 16 Y blocks, 4 U blocks, 4 V blocks,
+ // 1 DC 2nd order block each with 16 entries
+ BLOCK block[25];
+
+ YV12_BUFFER_CONFIG src;
+
+ MACROBLOCKD e_mbd;
+ PARTITION_INFO *partition_info; /* work pointer */
+ PARTITION_INFO *pi; /* Corresponds to upper left visible macroblock */
+ PARTITION_INFO *pip; /* Base of allocated array */
+
+ search_site *ss;
+ int ss_count;
+ int searches_per_step;
+
+ int errorperbit;
+ int sadperbit16;
+ int sadperbit4;
+ int rddiv;
+ int rdmult;
+ unsigned int *mb_activity_ptr;
+ int *mb_norm_activity_ptr;
+ signed int act_zbin_adj;
+
+ int nmvjointcost[MV_JOINTS];
+ int nmvcosts[2][MV_VALS];
+ int *nmvcost[2];
+ int nmvcosts_hp[2][MV_VALS];
+ int *nmvcost_hp[2];
+
+ int nmvjointsadcost[MV_JOINTS];
+ int nmvsadcosts[2][MV_VALS];
+ int *nmvsadcost[2];
+ int nmvsadcosts_hp[2][MV_VALS];
+ int *nmvsadcost_hp[2];
+
+ int mbmode_cost[2][MB_MODE_COUNT];
+ int intra_uv_mode_cost[2][MB_MODE_COUNT];
+ int bmode_costs[VP9_BINTRAMODES][VP9_BINTRAMODES][VP9_BINTRAMODES];
+ int i8x8_mode_costs[MB_MODE_COUNT];
+ int inter_bmode_costs[B_MODE_COUNT];
+ int switchable_interp_costs[VP9_SWITCHABLE_FILTERS + 1]
+ [VP9_SWITCHABLE_FILTERS];
+
+ // These define limits to motion vector components to prevent them
+ // from extending outside the UMV borders
+ int mv_col_min;
+ int mv_col_max;
+ int mv_row_min;
+ int mv_row_max;
+
+ int skip;
+
+ int encode_breakout;
+
+ // char * gf_active_ptr;
+ signed char *gf_active_ptr;
+
+ unsigned char *active_ptr;
+
+ unsigned int token_costs[TX_SIZE_MAX][BLOCK_TYPES][COEF_BANDS]
+ [PREV_COEF_CONTEXTS][MAX_ENTROPY_TOKENS];
+ unsigned int hybrid_token_costs[TX_SIZE_MAX][BLOCK_TYPES][COEF_BANDS]
+ [PREV_COEF_CONTEXTS][MAX_ENTROPY_TOKENS];
+
+ int optimize;
+
+ // Structure to hold context for each of the 4 MBs within a SB:
+ // when encoded as 4 independent MBs:
+ PICK_MODE_CONTEXT mb_context[4];
+#if CONFIG_SUPERBLOCKS
+ // when 4 MBs share coding parameters:
+ PICK_MODE_CONTEXT sb_context[4];
+#endif
+
+ void (*vp9_short_fdct4x4)(short *input, short *output, int pitch);
+ void (*vp9_short_fdct8x4)(short *input, short *output, int pitch);
+ void (*short_walsh4x4)(short *input, short *output, int pitch);
+ void (*quantize_b_4x4)(BLOCK *b, BLOCKD *d);
+ void (*quantize_b_4x4_pair)(BLOCK *b1, BLOCK *b2, BLOCKD *d0, BLOCKD *d1);
+ void (*vp9_short_fdct8x8)(short *input, short *output, int pitch);
+ void (*vp9_short_fdct16x16)(short *input, short *output, int pitch);
+ void (*short_fhaar2x2)(short *input, short *output, int pitch);
+ void (*quantize_b_16x16)(BLOCK *b, BLOCKD *d);
+ void (*quantize_b_8x8)(BLOCK *b, BLOCKD *d);
+ void (*quantize_b_2x2)(BLOCK *b, BLOCKD *d);
+
+} MACROBLOCK;
+
+
+#endif
--- /dev/null
+++ b/vp9/encoder/boolhuff.c
@@ -1,0 +1,153 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "boolhuff.h"
+
+#if defined(SECTIONBITS_OUTPUT)
+unsigned __int64 Sectionbits[500];
+
+#endif
+
+#ifdef ENTROPY_STATS
+unsigned int active_section = 0;
+#endif
+
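+// Bit cost table: vp9_prob_cost[p] is approximately -256 * log2(p / 256),
+// i.e. the cost in 1/256-bit units of coding a symbol of probability p/256.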
+const unsigned int vp9_prob_cost[256] = {
+ 2047, 2047, 1791, 1641, 1535, 1452, 1385, 1328, 1279, 1235, 1196, 1161, 1129, 1099, 1072, 1046,
+ 1023, 1000, 979, 959, 940, 922, 905, 889, 873, 858, 843, 829, 816, 803, 790, 778,
+ 767, 755, 744, 733, 723, 713, 703, 693, 684, 675, 666, 657, 649, 641, 633, 625,
+ 617, 609, 602, 594, 587, 580, 573, 567, 560, 553, 547, 541, 534, 528, 522, 516,
+ 511, 505, 499, 494, 488, 483, 477, 472, 467, 462, 457, 452, 447, 442, 437, 433,
+ 428, 424, 419, 415, 410, 406, 401, 397, 393, 389, 385, 381, 377, 373, 369, 365,
+ 361, 357, 353, 349, 346, 342, 338, 335, 331, 328, 324, 321, 317, 314, 311, 307,
+ 304, 301, 297, 294, 291, 288, 285, 281, 278, 275, 272, 269, 266, 263, 260, 257,
+ 255, 252, 249, 246, 243, 240, 238, 235, 232, 229, 227, 224, 221, 219, 216, 214,
+ 211, 208, 206, 203, 201, 198, 196, 194, 191, 189, 186, 184, 181, 179, 177, 174,
+ 172, 170, 168, 165, 163, 161, 159, 156, 154, 152, 150, 148, 145, 143, 141, 139,
+ 137, 135, 133, 131, 129, 127, 125, 123, 121, 119, 117, 115, 113, 111, 109, 107,
+ 105, 103, 101, 99, 97, 95, 93, 92, 90, 88, 86, 84, 82, 81, 79, 77,
+ 75, 73, 72, 70, 68, 66, 65, 63, 61, 60, 58, 56, 55, 53, 51, 50,
+ 48, 46, 45, 43, 41, 40, 38, 37, 35, 33, 32, 30, 29, 27, 25, 24,
+ 22, 21, 19, 18, 16, 15, 13, 12, 10, 9, 7, 6, 4, 3, 1, 1
+};
+
+void vp9_start_encode(BOOL_CODER *br, unsigned char *source) {
+
+ br->lowvalue = 0;
+ br->range = 255;
+ br->value = 0;
+ br->count = -24;
+ br->buffer = source;
+ br->pos = 0;
+}
+
+void vp9_stop_encode(BOOL_CODER *br) {
+ int i;
+
+ for (i = 0; i < 32; i++)
+ encode_bool(br, 0, 128);
+}
+
+
+void vp9_encode_value(BOOL_CODER *br, int data, int bits) {
+ int bit;
+
+ for (bit = bits - 1; bit >= 0; bit--)
+ encode_bool(br, (1 & (data >> bit)), 0x80);
+}
+
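+// Re-centres a non-negative value v around a reference m so that values
+// closer to m map to smaller codes.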
+int vp9_recenter_nonneg(int v, int m) {
+ if (v > (m << 1)) return v;
+ else if (v >= m) return ((v - m) << 1);
+ else return ((m - v) << 1) - 1;
+}
+
+static int get_unsigned_bits(unsigned num_values) {
+ int cat = 0;
+ if ((num_values--) <= 1) return 0;
+ while (num_values > 0) {
+ cat++;
+ num_values >>= 1;
+ }
+ return cat;
+}
+
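+// Writes v (0 <= v < n) with a near-uniform (truncated binary) code: the
+// first (1 << l) - n values take l - 1 bits, the remaining values take l bits.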
+void vp9_encode_uniform(BOOL_CODER *br, int v, int n) {
+ int l = get_unsigned_bits(n);
+ int m;
+ if (l == 0) return;
+ m = (1 << l) - n;
+ if (v < m)
+ vp9_encode_value(br, v, l - 1);
+ else {
+ vp9_encode_value(br, m + ((v - m) >> 1), l - 1);
+ vp9_encode_value(br, (v - m) & 1, 1);
+ }
+}
+
+int vp9_count_uniform(int v, int n) {
+ int l = get_unsigned_bits(n);
+ int m;
+ if (l == 0) return 0;
+ m = (1 << l) - n;
+ if (v < m)
+ return l - 1;
+ else
+ return l;
+}
+
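+// Writes 'word' with a terminated sub-exponential code of parameter k:
+// buckets of geometrically growing size are skipped with one bit each until
+// the value's bucket is reached; the final bucket falls back to the
+// near-uniform code above.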
+void vp9_encode_term_subexp(BOOL_CODER *br, int word, int k, int num_syms) {
+ int i = 0;
+ int mk = 0;
+ while (1) {
+ int b = (i ? k + i - 1 : k);
+ int a = (1 << b);
+ if (num_syms <= mk + 3 * a) {
+ vp9_encode_uniform(br, word - mk, num_syms - mk);
+ break;
+ } else {
+ int t = (word >= mk + a);
+ vp9_encode_value(br, t, 1);
+ if (t) {
+ i = i + 1;
+ mk += a;
+ } else {
+ vp9_encode_value(br, word - mk, b);
+ break;
+ }
+ }
+ }
+}
+
+int vp9_count_term_subexp(int word, int k, int num_syms) {
+ int count = 0;
+ int i = 0;
+ int mk = 0;
+ while (1) {
+ int b = (i ? k + i - 1 : k);
+ int a = (1 << b);
+ if (num_syms <= mk + 3 * a) {
+ count += vp9_count_uniform(word - mk, num_syms - mk);
+ break;
+ } else {
+ int t = (word >= mk + a);
+ count++;
+ if (t) {
+ i = i + 1;
+ mk += a;
+ } else {
+ count += b;
+ break;
+ }
+ }
+ }
+ return count;
+}
--- /dev/null
+++ b/vp9/encoder/boolhuff.h
@@ -1,0 +1,111 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+/****************************************************************************
+*
+* Module Title : boolhuff.h
+*
+* Description : Bool Coder header file.
+*
+****************************************************************************/
+#ifndef __INC_BOOLHUFF_H
+#define __INC_BOOLHUFF_H
+
+#include "vpx_ports/mem.h"
+
+typedef struct {
+ unsigned int lowvalue;
+ unsigned int range;
+ unsigned int value;
+ int count;
+ unsigned int pos;
+ unsigned char *buffer;
+
+  // Variables used to track bit costs without outputting to the bitstream
+ unsigned int measure_cost;
+ unsigned long bit_counter;
+} BOOL_CODER;
+
+extern void vp9_start_encode(BOOL_CODER *bc, unsigned char *buffer);
+
+extern void vp9_encode_value(BOOL_CODER *br, int data, int bits);
+extern void vp9_stop_encode(BOOL_CODER *bc);
+extern const unsigned int vp9_prob_cost[256];
+
+extern void vp9_encode_uniform(BOOL_CODER *bc, int v, int n);
+extern void vp9_encode_term_subexp(BOOL_CODER *bc, int v, int k, int n);
+extern int vp9_count_uniform(int v, int n);
+extern int vp9_count_term_subexp(int v, int k, int n);
+extern int vp9_recenter_nonneg(int v, int m);
+
+DECLARE_ALIGNED(16, extern const unsigned char, vp9_norm[256]);
+
+
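+// Core boolean arithmetic-coder step: splits the current range according to
+// 'probability', renormalizes, and propagates any carry back through the
+// already-written 0xff bytes in the output buffer.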
+static void encode_bool(BOOL_CODER *br, int bit, int probability) {
+ unsigned int split;
+ int count = br->count;
+ unsigned int range = br->range;
+ unsigned int lowvalue = br->lowvalue;
+ register unsigned int shift;
+
+#ifdef ENTROPY_STATS
+#if defined(SECTIONBITS_OUTPUT)
+
+ if (bit)
+ Sectionbits[active_section] += vp9_prob_cost[255 - probability];
+ else
+ Sectionbits[active_section] += vp9_prob_cost[probability];
+
+#endif
+#endif
+
+ split = 1 + (((range - 1) * probability) >> 8);
+
+ range = split;
+
+ if (bit) {
+ lowvalue += split;
+ range = br->range - split;
+ }
+
+ shift = vp9_norm[range];
+
+ range <<= shift;
+ count += shift;
+
+ if (count >= 0) {
+ int offset = shift - count;
+
+ if ((lowvalue << (offset - 1)) & 0x80000000) {
+ int x = br->pos - 1;
+
+ while (x >= 0 && br->buffer[x] == 0xff) {
+ br->buffer[x] = (unsigned char)0;
+ x--;
+ }
+
+ br->buffer[x] += 1;
+ }
+
+ br->buffer[br->pos++] = (lowvalue >> (24 - offset));
+ lowvalue <<= offset;
+ shift = count;
+ lowvalue &= 0xffffff;
+ count -= 8;
+ }
+
+ lowvalue <<= shift;
+ br->count = count;
+ br->lowvalue = lowvalue;
+ br->range = range;
+}
+
+#endif
--- /dev/null
+++ b/vp9/encoder/dct.c
@@ -1,0 +1,1109 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include <assert.h>
+#include <math.h>
+#include "vpx_ports/config.h"
+#include "vp9/common/idct.h"
+#include "vp9/common/systemdependent.h"
+
+#include "vp9/common/blockd.h"
+
+// TODO: these transforms can be converted into integer forms to reduce
+// the complexity
+static const float dct_4[16] = {
+ 0.500000000000000, 0.500000000000000, 0.500000000000000, 0.500000000000000,
+ 0.653281482438188, 0.270598050073099, -0.270598050073099, -0.653281482438188,
+ 0.500000000000000, -0.500000000000000, -0.500000000000000, 0.500000000000000,
+ 0.270598050073099, -0.653281482438188, 0.653281482438188, -0.270598050073099
+};
+
+static const float adst_4[16] = {
+ 0.228013428883779, 0.428525073124360, 0.577350269189626, 0.656538502008139,
+ 0.577350269189626, 0.577350269189626, 0.000000000000000, -0.577350269189626,
+ 0.656538502008139, -0.228013428883779, -0.577350269189626, 0.428525073124359,
+ 0.428525073124360, -0.656538502008139, 0.577350269189626, -0.228013428883779
+};
+
+static const float dct_8[64] = {
+ 0.353553390593274, 0.353553390593274, 0.353553390593274, 0.353553390593274,
+ 0.353553390593274, 0.353553390593274, 0.353553390593274, 0.353553390593274,
+ 0.490392640201615, 0.415734806151273, 0.277785116509801, 0.097545161008064,
+ -0.097545161008064, -0.277785116509801, -0.415734806151273, -0.490392640201615,
+ 0.461939766255643, 0.191341716182545, -0.191341716182545, -0.461939766255643,
+ -0.461939766255643, -0.191341716182545, 0.191341716182545, 0.461939766255643,
+ 0.415734806151273, -0.097545161008064, -0.490392640201615, -0.277785116509801,
+ 0.277785116509801, 0.490392640201615, 0.097545161008064, -0.415734806151273,
+ 0.353553390593274, -0.353553390593274, -0.353553390593274, 0.353553390593274,
+ 0.353553390593274, -0.353553390593274, -0.353553390593274, 0.353553390593274,
+ 0.277785116509801, -0.490392640201615, 0.097545161008064, 0.415734806151273,
+ -0.415734806151273, -0.097545161008064, 0.490392640201615, -0.277785116509801,
+ 0.191341716182545, -0.461939766255643, 0.461939766255643, -0.191341716182545,
+ -0.191341716182545, 0.461939766255643, -0.461939766255643, 0.191341716182545,
+ 0.097545161008064, -0.277785116509801, 0.415734806151273, -0.490392640201615,
+ 0.490392640201615, -0.415734806151273, 0.277785116509801, -0.097545161008064
+};
+
+static const float adst_8[64] = {
+ 0.089131608307533, 0.175227946595735, 0.255357107325376, 0.326790388032145,
+ 0.387095214016349, 0.434217976756762, 0.466553967085785, 0.483002021635509,
+ 0.255357107325376, 0.434217976756762, 0.483002021635509, 0.387095214016349,
+ 0.175227946595735, -0.089131608307533, -0.326790388032145, -0.466553967085785,
+ 0.387095214016349, 0.466553967085785, 0.175227946595735, -0.255357107325376,
+ -0.483002021635509, -0.326790388032145, 0.089131608307533, 0.434217976756762,
+ 0.466553967085785, 0.255357107325376, -0.326790388032145, -0.434217976756762,
+ 0.089131608307533, 0.483002021635509, 0.175227946595735, -0.387095214016348,
+ 0.483002021635509, -0.089131608307533, -0.466553967085785, 0.175227946595735,
+ 0.434217976756762, -0.255357107325376, -0.387095214016348, 0.326790388032145,
+ 0.434217976756762, -0.387095214016348, -0.089131608307533, 0.466553967085786,
+ -0.326790388032145, -0.175227946595735, 0.483002021635509, -0.255357107325375,
+ 0.326790388032145, -0.483002021635509, 0.387095214016349, -0.089131608307534,
+ -0.255357107325377, 0.466553967085785, -0.434217976756762, 0.175227946595736,
+ 0.175227946595735, -0.326790388032145, 0.434217976756762, -0.483002021635509,
+ 0.466553967085785, -0.387095214016348, 0.255357107325376, -0.089131608307532
+};
+
+/* Converted the transforms to integers. */
+static const int16_t dct_i4[16] = {
+ 16384, 16384, 16384, 16384,
+ 21407, 8867, -8867, -21407,
+ 16384, -16384, -16384, 16384,
+ 8867, -21407, 21407, -8867
+};
+
+static const int16_t adst_i4[16] = {
+ 7472, 14042, 18919, 21513,
+ 18919, 18919, 0, -18919,
+ 21513, -7472, -18919, 14042,
+ 14042, -21513, 18919, -7472
+};
+
+static const int16_t dct_i8[64] = {
+ 11585, 11585, 11585, 11585,
+ 11585, 11585, 11585, 11585,
+ 16069, 13623, 9102, 3196,
+ -3196, -9102, -13623, -16069,
+ 15137, 6270, -6270, -15137,
+ -15137, -6270, 6270, 15137,
+ 13623, -3196, -16069, -9102,
+ 9102, 16069, 3196, -13623,
+ 11585, -11585, -11585, 11585,
+ 11585, -11585, -11585, 11585,
+ 9102, -16069, 3196, 13623,
+ -13623, -3196, 16069, -9102,
+ 6270, -15137, 15137, -6270,
+ -6270, 15137, -15137, 6270,
+ 3196, -9102, 13623, -16069,
+ 16069, -13623, 9102, -3196
+};
+
+static const int16_t adst_i8[64] = {
+ 2921, 5742, 8368, 10708,
+ 12684, 14228, 15288, 15827,
+ 8368, 14228, 15827, 12684,
+ 5742, -2921, -10708, -15288,
+ 12684, 15288, 5742, -8368,
+ -15827, -10708, 2921, 14228,
+ 15288, 8368, -10708, -14228,
+ 2921, 15827, 5742, -12684,
+ 15827, -2921, -15288, 5742,
+ 14228, -8368, -12684, 10708,
+ 14228, -12684, -2921, 15288,
+ -10708, -5742, 15827, -8368,
+ 10708, -15827, 12684, -2921,
+ -8368, 15288, -14228, 5742,
+ 5742, -10708, 14228, -15827,
+ 15288, -12684, 8368, -2921
+};
+
+static const float dct_16[256] = {
+ 0.250000, 0.250000, 0.250000, 0.250000, 0.250000, 0.250000, 0.250000, 0.250000,
+ 0.250000, 0.250000, 0.250000, 0.250000, 0.250000, 0.250000, 0.250000, 0.250000,
+ 0.351851, 0.338330, 0.311806, 0.273300, 0.224292, 0.166664, 0.102631, 0.034654,
+ -0.034654, -0.102631, -0.166664, -0.224292, -0.273300, -0.311806, -0.338330, -0.351851,
+ 0.346760, 0.293969, 0.196424, 0.068975, -0.068975, -0.196424, -0.293969, -0.346760,
+ -0.346760, -0.293969, -0.196424, -0.068975, 0.068975, 0.196424, 0.293969, 0.346760,
+ 0.338330, 0.224292, 0.034654, -0.166664, -0.311806, -0.351851, -0.273300, -0.102631,
+ 0.102631, 0.273300, 0.351851, 0.311806, 0.166664, -0.034654, -0.224292, -0.338330,
+ 0.326641, 0.135299, -0.135299, -0.326641, -0.326641, -0.135299, 0.135299, 0.326641,
+ 0.326641, 0.135299, -0.135299, -0.326641, -0.326641, -0.135299, 0.135299, 0.326641,
+ 0.311806, 0.034654, -0.273300, -0.338330, -0.102631, 0.224292, 0.351851, 0.166664,
+ -0.166664, -0.351851, -0.224292, 0.102631, 0.338330, 0.273300, -0.034654, -0.311806,
+ 0.293969, -0.068975, -0.346760, -0.196424, 0.196424, 0.346760, 0.068975, -0.293969,
+ -0.293969, 0.068975, 0.346760, 0.196424, -0.196424, -0.346760, -0.068975, 0.293969,
+ 0.273300, -0.166664, -0.338330, 0.034654, 0.351851, 0.102631, -0.311806, -0.224292,
+ 0.224292, 0.311806, -0.102631, -0.351851, -0.034654, 0.338330, 0.166664, -0.273300,
+ 0.250000, -0.250000, -0.250000, 0.250000, 0.250000, -0.250000, -0.250000, 0.250000,
+ 0.250000, -0.250000, -0.250000, 0.250000, 0.250000, -0.250000, -0.250000, 0.250000,
+ 0.224292, -0.311806, -0.102631, 0.351851, -0.034654, -0.338330, 0.166664, 0.273300,
+ -0.273300, -0.166664, 0.338330, 0.034654, -0.351851, 0.102631, 0.311806, -0.224292,
+ 0.196424, -0.346760, 0.068975, 0.293969, -0.293969, -0.068975, 0.346760, -0.196424,
+ -0.196424, 0.346760, -0.068975, -0.293969, 0.293969, 0.068975, -0.346760, 0.196424,
+ 0.166664, -0.351851, 0.224292, 0.102631, -0.338330, 0.273300, 0.034654, -0.311806,
+ 0.311806, -0.034654, -0.273300, 0.338330, -0.102631, -0.224292, 0.351851, -0.166664,
+ 0.135299, -0.326641, 0.326641, -0.135299, -0.135299, 0.326641, -0.326641, 0.135299,
+ 0.135299, -0.326641, 0.326641, -0.135299, -0.135299, 0.326641, -0.326641, 0.135299,
+ 0.102631, -0.273300, 0.351851, -0.311806, 0.166664, 0.034654, -0.224292, 0.338330,
+ -0.338330, 0.224292, -0.034654, -0.166664, 0.311806, -0.351851, 0.273300, -0.102631,
+ 0.068975, -0.196424, 0.293969, -0.346760, 0.346760, -0.293969, 0.196424, -0.068975,
+ -0.068975, 0.196424, -0.293969, 0.346760, -0.346760, 0.293969, -0.196424, 0.068975,
+ 0.034654, -0.102631, 0.166664, -0.224292, 0.273300, -0.311806, 0.338330, -0.351851,
+ 0.351851, -0.338330, 0.311806, -0.273300, 0.224292, -0.166664, 0.102631, -0.034654
+};
+
+static const float adst_16[256] = {
+ 0.033094, 0.065889, 0.098087, 0.129396, 0.159534, 0.188227, 0.215215, 0.240255,
+ 0.263118, 0.283599, 0.301511, 0.316693, 0.329007, 0.338341, 0.344612, 0.347761,
+ 0.098087, 0.188227, 0.263118, 0.316693, 0.344612, 0.344612, 0.316693, 0.263118,
+ 0.188227, 0.098087, 0.000000, -0.098087, -0.188227, -0.263118, -0.316693, -0.344612,
+ 0.159534, 0.283599, 0.344612, 0.329007, 0.240255, 0.098087, -0.065889, -0.215215,
+ -0.316693, -0.347761, -0.301511, -0.188227, -0.033094, 0.129396, 0.263118, 0.338341,
+ 0.215215, 0.338341, 0.316693, 0.159534, -0.065889, -0.263118, -0.347761, -0.283599,
+ -0.098087, 0.129396, 0.301511, 0.344612, 0.240255, 0.033094, -0.188227, -0.329007,
+ 0.263118, 0.344612, 0.188227, -0.098087, -0.316693, -0.316693, -0.098087, 0.188227,
+ 0.344612, 0.263118, 0.000000, -0.263118, -0.344612, -0.188227, 0.098087, 0.316693,
+ 0.301511, 0.301511, 0.000000, -0.301511, -0.301511, -0.000000, 0.301511, 0.301511,
+ 0.000000, -0.301511, -0.301511, -0.000000, 0.301511, 0.301511, 0.000000, -0.301511,
+ 0.329007, 0.215215, -0.188227, -0.338341, -0.033094, 0.316693, 0.240255, -0.159534,
+ -0.344612, -0.065889, 0.301511, 0.263118, -0.129396, -0.347761, -0.098087, 0.283599,
+ 0.344612, 0.098087, -0.316693, -0.188227, 0.263118, 0.263118, -0.188227, -0.316693,
+ 0.098087, 0.344612, 0.000000, -0.344612, -0.098087, 0.316693, 0.188227, -0.263118,
+ 0.347761, -0.033094, -0.344612, 0.065889, 0.338341, -0.098087, -0.329007, 0.129396,
+ 0.316693, -0.159534, -0.301511, 0.188227, 0.283599, -0.215215, -0.263118, 0.240255,
+ 0.338341, -0.159534, -0.263118, 0.283599, 0.129396, -0.344612, 0.033094, 0.329007,
+ -0.188227, -0.240255, 0.301511, 0.098087, -0.347761, 0.065889, 0.316693, -0.215215,
+ 0.316693, -0.263118, -0.098087, 0.344612, -0.188227, -0.188227, 0.344612, -0.098087,
+ -0.263118, 0.316693, 0.000000, -0.316693, 0.263118, 0.098087, -0.344612, 0.188227,
+ 0.283599, -0.329007, 0.098087, 0.215215, -0.347761, 0.188227, 0.129396, -0.338341,
+ 0.263118, 0.033094, -0.301511, 0.316693, -0.065889, -0.240255, 0.344612, -0.159534,
+ 0.240255, -0.347761, 0.263118, -0.033094, -0.215215, 0.344612, -0.283599, 0.065889,
+ 0.188227, -0.338341, 0.301511, -0.098087, -0.159534, 0.329007, -0.316693, 0.129396,
+ 0.188227, -0.316693, 0.344612, -0.263118, 0.098087, 0.098087, -0.263118, 0.344612,
+ -0.316693, 0.188227, 0.000000, -0.188227, 0.316693, -0.344612, 0.263118, -0.098087,
+ 0.129396, -0.240255, 0.316693, -0.347761, 0.329007, -0.263118, 0.159534, -0.033094,
+ -0.098087, 0.215215, -0.301511, 0.344612, -0.338341, 0.283599, -0.188227, 0.065889,
+ 0.065889, -0.129396, 0.188227, -0.240255, 0.283599, -0.316693, 0.338341, -0.347761,
+ 0.344612, -0.329007, 0.301511, -0.263118, 0.215215, -0.159534, 0.098087, -0.033094
+};
+
+/* Converted the transforms to integers. */
+static const int16_t dct_i16[256] = {
+ 8192, 8192, 8192, 8192, 8192, 8192, 8192, 8192,
+ 8192, 8192, 8192, 8192, 8192, 8192, 8192, 8192,
+ 11529, 11086, 10217, 8955, 7350, 5461, 3363, 1136,
+ -1136, -3363, -5461, -7350, -8955, -10217, -11086, -11529,
+ 11363, 9633, 6436, 2260, -2260, -6436, -9633, -11363,
+ -11363, -9633, -6436, -2260, 2260, 6436, 9633, 11363,
+ 11086, 7350, 1136, -5461, -10217, -11529, -8955, -3363,
+ 3363, 8955, 11529, 10217, 5461, -1136, -7350, -11086,
+ 10703, 4433, -4433, -10703, -10703, -4433, 4433, 10703,
+ 10703, 4433, -4433, -10703, -10703, -4433, 4433, 10703,
+ 10217, 1136, -8955, -11086, -3363, 7350, 11529, 5461,
+ -5461, -11529, -7350, 3363, 11086, 8955, -1136, -10217,
+ 9633, -2260, -11363, -6436, 6436, 11363, 2260, -9633,
+ -9633, 2260, 11363, 6436, -6436, -11363, -2260, 9633,
+ 8955, -5461, -11086, 1136, 11529, 3363, -10217, -7350,
+ 7350, 10217, -3363, -11529, -1136, 11086, 5461, -8955,
+ 8192, -8192, -8192, 8192, 8192, -8192, -8192, 8192,
+ 8192, -8192, -8192, 8192, 8192, -8192, -8192, 8192,
+ 7350, -10217, -3363, 11529, -1136, -11086, 5461, 8955,
+ -8955, -5461, 11086, 1136, -11529, 3363, 10217, -7350,
+ 6436, -11363, 2260, 9633, -9633, -2260, 11363, -6436,
+ -6436, 11363, -2260, -9633, 9633, 2260, -11363, 6436,
+ 5461, -11529, 7350, 3363, -11086, 8955, 1136, -10217,
+ 10217, -1136, -8955, 11086, -3363, -7350, 11529, -5461,
+ 4433, -10703, 10703, -4433, -4433, 10703, -10703, 4433,
+ 4433, -10703, 10703, -4433, -4433, 10703, -10703, 4433,
+ 3363, -8955, 11529, -10217, 5461, 1136, -7350, 11086,
+ -11086, 7350, -1136, -5461, 10217, -11529, 8955, -3363,
+ 2260, -6436, 9633, -11363, 11363, -9633, 6436, -2260,
+ -2260, 6436, -9633, 11363, -11363, 9633, -6436, 2260,
+ 1136, -3363, 5461, -7350, 8955, -10217, 11086, -11529,
+ 11529, -11086, 10217, -8955, 7350, -5461, 3363, -1136
+};
+
+static const int16_t adst_i16[256] = {
+ 1084, 2159, 3214, 4240, 5228, 6168, 7052, 7873,
+ 8622, 9293, 9880, 10377, 10781, 11087, 11292, 11395,
+ 3214, 6168, 8622, 10377, 11292, 11292, 10377, 8622,
+ 6168, 3214, 0, -3214, -6168, -8622, -10377, -11292,
+ 5228, 9293, 11292, 10781, 7873, 3214, -2159, -7052,
+ -10377, -11395, -9880, -6168, -1084, 4240, 8622, 11087,
+ 7052, 11087, 10377, 5228, -2159, -8622, -11395, -9293,
+ -3214, 4240, 9880, 11292, 7873, 1084, -6168, -10781,
+ 8622, 11292, 6168, -3214, -10377, -10377, -3214, 6168,
+ 11292, 8622, 0, -8622, -11292, -6168, 3214, 10377,
+ 9880, 9880, 0, -9880, -9880, 0, 9880, 9880,
+ 0, -9880, -9880, 0, 9880, 9880, 0, -9880,
+ 10781, 7052, -6168, -11087, -1084, 10377, 7873, -5228,
+ -11292, -2159, 9880, 8622, -4240, -11395, -3214, 9293,
+ 11292, 3214, -10377, -6168, 8622, 8622, -6168, -10377,
+ 3214, 11292, 0, -11292, -3214, 10377, 6168, -8622,
+ 11395, -1084, -11292, 2159, 11087, -3214, -10781, 4240,
+ 10377, -5228, -9880, 6168, 9293, -7052, -8622, 7873,
+ 11087, -5228, -8622, 9293, 4240, -11292, 1084, 10781,
+ -6168, -7873, 9880, 3214, -11395, 2159, 10377, -7052,
+ 10377, -8622, -3214, 11292, -6168, -6168, 11292, -3214,
+ -8622, 10377, 0, -10377, 8622, 3214, -11292, 6168,
+ 9293, -10781, 3214, 7052, -11395, 6168, 4240, -11087,
+ 8622, 1084, -9880, 10377, -2159, -7873, 11292, -5228,
+ 7873, -11395, 8622, -1084, -7052, 11292, -9293, 2159,
+ 6168, -11087, 9880, -3214, -5228, 10781, -10377, 4240,
+ 6168, -10377, 11292, -8622, 3214, 3214, -8622, 11292,
+ -10377, 6168, 0, -6168, 10377, -11292, 8622, -3214,
+ 4240, -7873, 10377, -11395, 10781, -8622, 5228, -1084,
+ -3214, 7052, -9880, 11292, -11087, 9293, -6168, 2159,
+ 2159, -4240, 6168, -7873, 9293, -10377, 11087, -11395,
+ 11292, -10781, 9880, -8622, 7052, -5228, 3214, -1084
+};
+
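+/* Butterfly constants for the 8x8 DCT below: cos(k*pi/16) for k = 1..7,
+ * represented in Q14 fixed point (scaled by 2^14). The name xCkS(8-k)
+ * reflects that cos(k*pi/16) == sin((8-k)*pi/16). */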
+static const int xC1S7 = 16069;
+static const int xC2S6 = 15137;
+static const int xC3S5 = 13623;
+static const int xC4S4 = 11585;
+static const int xC5S3 = 9102;
+static const int xC6S2 = 6270;
+static const int xC7S1 = 3196;
+
+#define SHIFT_BITS 14
+#define DOROUND(X) X += (1<<(SHIFT_BITS-1));
+
+#define FINAL_SHIFT 3
+#define FINAL_ROUNDING (1<<(FINAL_SHIFT -1))
+#define IN_SHIFT (FINAL_SHIFT+1)
+
+
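+// 8x8 forward DCT done as two 1-D passes: rows first (inputs scaled up by
+// IN_SHIFT for extra working precision, results kept in InterData), then
+// columns, with each rotation rounded via DOROUND and the column-pass
+// results rounded and shifted down by FINAL_SHIFT. 'pitch' is the input
+// stride in bytes, hence the >> 1 to get a stride in shorts.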
+void vp9_short_fdct8x8_c(short *InputData, short *OutputData, int pitch) {
+ int loop;
+ int short_pitch = pitch >> 1;
+ int is07, is12, is34, is56;
+ int is0734, is1256;
+ int id07, id12, id34, id56;
+ int irot_input_x, irot_input_y;
+ int icommon_product1; // Re-used product (c4s4 * (s12 - s56))
+ int icommon_product2; // Re-used product (c4s4 * (d12 + d56))
+ int temp1, temp2; // intermediate variable for computation
+
+ int InterData[64];
+ int *ip = InterData;
+ short *op = OutputData;
+
+ for (loop = 0; loop < 8; loop++) {
+ // Pre calculate some common sums and differences.
+ is07 = (InputData[0] + InputData[7]) << IN_SHIFT;
+ is12 = (InputData[1] + InputData[2]) << IN_SHIFT;
+ is34 = (InputData[3] + InputData[4]) << IN_SHIFT;
+ is56 = (InputData[5] + InputData[6]) << IN_SHIFT;
+ id07 = (InputData[0] - InputData[7]) << IN_SHIFT;
+ id12 = (InputData[1] - InputData[2]) << IN_SHIFT;
+ id34 = (InputData[3] - InputData[4]) << IN_SHIFT;
+ id56 = (InputData[5] - InputData[6]) << IN_SHIFT;
+
+ is0734 = is07 + is34;
+ is1256 = is12 + is56;
+
+ // Pre-Calculate some common product terms.
+ icommon_product1 = xC4S4 * (is12 - is56);
+ DOROUND(icommon_product1)
+ icommon_product1 >>= SHIFT_BITS;
+
+ icommon_product2 = xC4S4 * (id12 + id56);
+ DOROUND(icommon_product2)
+ icommon_product2 >>= SHIFT_BITS;
+
+
+ ip[0] = (xC4S4 * (is0734 + is1256));
+ DOROUND(ip[0]);
+ ip[0] >>= SHIFT_BITS;
+
+ ip[4] = (xC4S4 * (is0734 - is1256));
+ DOROUND(ip[4]);
+ ip[4] >>= SHIFT_BITS;
+
+ // Define inputs to rotation for outputs 2 and 6
+ irot_input_x = id12 - id56;
+ irot_input_y = is07 - is34;
+
+ // Apply rotation for outputs 2 and 6.
+ temp1 = xC6S2 * irot_input_x;
+ DOROUND(temp1);
+ temp1 >>= SHIFT_BITS;
+ temp2 = xC2S6 * irot_input_y;
+ DOROUND(temp2);
+ temp2 >>= SHIFT_BITS;
+ ip[2] = temp1 + temp2;
+
+ temp1 = xC6S2 * irot_input_y;
+ DOROUND(temp1);
+ temp1 >>= SHIFT_BITS;
+ temp2 = xC2S6 * irot_input_x;
+ DOROUND(temp2);
+ temp2 >>= SHIFT_BITS;
+ ip[6] = temp1 - temp2;
+
+ // Define inputs to rotation for outputs 1 and 7
+ irot_input_x = icommon_product1 + id07;
+ irot_input_y = -(id34 + icommon_product2);
+
+ // Apply rotation for outputs 1 and 7.
+ temp1 = xC1S7 * irot_input_x;
+ DOROUND(temp1);
+ temp1 >>= SHIFT_BITS;
+ temp2 = xC7S1 * irot_input_y;
+ DOROUND(temp2);
+ temp2 >>= SHIFT_BITS;
+ ip[1] = temp1 - temp2;
+
+ temp1 = xC7S1 * irot_input_x;
+ DOROUND(temp1);
+ temp1 >>= SHIFT_BITS;
+ temp2 = xC1S7 * irot_input_y;
+ DOROUND(temp2);
+ temp2 >>= SHIFT_BITS;
+ ip[7] = temp1 + temp2;
+
+ // Define inputs to rotation for outputs 3 and 5
+ irot_input_x = id07 - icommon_product1;
+ irot_input_y = id34 - icommon_product2;
+
+ // Apply rotation for outputs 3 and 5.
+ temp1 = xC3S5 * irot_input_x;
+ DOROUND(temp1);
+ temp1 >>= SHIFT_BITS;
+ temp2 = xC5S3 * irot_input_y;
+ DOROUND(temp2);
+ temp2 >>= SHIFT_BITS;
+ ip[3] = temp1 - temp2;
+
+
+ temp1 = xC5S3 * irot_input_x;
+ DOROUND(temp1);
+ temp1 >>= SHIFT_BITS;
+ temp2 = xC3S5 * irot_input_y;
+ DOROUND(temp2);
+ temp2 >>= SHIFT_BITS;
+ ip[5] = temp1 + temp2;
+
+ // Increment data pointer for next row
+ InputData += short_pitch;
+ ip += 8;
+ }
+
+  // The row DCT is done; now transform the columns.
+ ip = InterData;
+ for (loop = 0; loop < 8; loop++) {
+ // Pre calculate some common sums and differences.
+ is07 = ip[0 * 8] + ip[7 * 8];
+ is12 = ip[1 * 8] + ip[2 * 8];
+ is34 = ip[3 * 8] + ip[4 * 8];
+ is56 = ip[5 * 8] + ip[6 * 8];
+
+ id07 = ip[0 * 8] - ip[7 * 8];
+ id12 = ip[1 * 8] - ip[2 * 8];
+ id34 = ip[3 * 8] - ip[4 * 8];
+ id56 = ip[5 * 8] - ip[6 * 8];
+
+ is0734 = is07 + is34;
+ is1256 = is12 + is56;
+
+ // Pre-Calculate some common product terms
+ icommon_product1 = xC4S4 * (is12 - is56);
+ icommon_product2 = xC4S4 * (id12 + id56);
+ DOROUND(icommon_product1)
+ DOROUND(icommon_product2)
+ icommon_product1 >>= SHIFT_BITS;
+ icommon_product2 >>= SHIFT_BITS;
+
+
+ temp1 = xC4S4 * (is0734 + is1256);
+ temp2 = xC4S4 * (is0734 - is1256);
+ DOROUND(temp1);
+ DOROUND(temp2);
+ temp1 >>= SHIFT_BITS;
+
+ temp2 >>= SHIFT_BITS;
+ op[0 * 8] = (temp1 + FINAL_ROUNDING) >> FINAL_SHIFT;
+ op[4 * 8] = (temp2 + FINAL_ROUNDING) >> FINAL_SHIFT;
+
+ // Define inputs to rotation for outputs 2 and 6
+ irot_input_x = id12 - id56;
+ irot_input_y = is07 - is34;
+
+ // Apply rotation for outputs 2 and 6.
+ temp1 = xC6S2 * irot_input_x;
+ DOROUND(temp1);
+ temp1 >>= SHIFT_BITS;
+ temp2 = xC2S6 * irot_input_y;
+ DOROUND(temp2);
+ temp2 >>= SHIFT_BITS;
+ op[2 * 8] = (temp1 + temp2 + FINAL_ROUNDING) >> FINAL_SHIFT;
+
+ temp1 = xC6S2 * irot_input_y;
+ DOROUND(temp1);
+ temp1 >>= SHIFT_BITS;
+ temp2 = xC2S6 * irot_input_x;
+ DOROUND(temp2);
+ temp2 >>= SHIFT_BITS;
+ op[6 * 8] = (temp1 - temp2 + FINAL_ROUNDING) >> FINAL_SHIFT;
+
+ // Define inputs to rotation for outputs 1 and 7
+ irot_input_x = icommon_product1 + id07;
+ irot_input_y = -(id34 + icommon_product2);
+
+ // Apply rotation for outputs 1 and 7.
+ temp1 = xC1S7 * irot_input_x;
+ DOROUND(temp1);
+ temp1 >>= SHIFT_BITS;
+ temp2 = xC7S1 * irot_input_y;
+ DOROUND(temp2);
+ temp2 >>= SHIFT_BITS;
+ op[1 * 8] = (temp1 - temp2 + FINAL_ROUNDING) >> FINAL_SHIFT;
+
+ temp1 = xC7S1 * irot_input_x;
+ DOROUND(temp1);
+ temp1 >>= SHIFT_BITS;
+ temp2 = xC1S7 * irot_input_y;
+ DOROUND(temp2);
+ temp2 >>= SHIFT_BITS;
+ op[7 * 8] = (temp1 + temp2 + FINAL_ROUNDING) >> FINAL_SHIFT;
+
+ // Define inputs to rotation for outputs 3 and 5
+ irot_input_x = id07 - icommon_product1;
+ irot_input_y = id34 - icommon_product2;
+
+ // Apply rotation for outputs 3 and 5.
+ temp1 = xC3S5 * irot_input_x;
+ DOROUND(temp1);
+ temp1 >>= SHIFT_BITS;
+ temp2 = xC5S3 * irot_input_y;
+ DOROUND(temp2);
+ temp2 >>= SHIFT_BITS;
+ op[3 * 8] = (temp1 - temp2 + FINAL_ROUNDING) >> FINAL_SHIFT;
+
+
+ temp1 = xC5S3 * irot_input_x;
+ DOROUND(temp1);
+ temp1 >>= SHIFT_BITS;
+ temp2 = xC3S5 * irot_input_y;
+ DOROUND(temp2);
+ temp2 >>= SHIFT_BITS;
+ op[5 * 8] = (temp1 + temp2 + FINAL_ROUNDING) >> FINAL_SHIFT;
+
+ // Increment data pointer for next column.
+ ip++;
+ op++;
+ }
+}
+
+void vp9_short_fhaar2x2_c(short *input, short *output, int pitch) {
+ /* [1 1; 1 -1] orthogonal transform */
+ /* use position: 0,1, 4, 8 */
+ int i;
+ short *ip1 = input;
+ short *op1 = output;
+ for (i = 0; i < 16; i++) {
+ op1[i] = 0;
+ }
+
+ op1[0] = (ip1[0] + ip1[1] + ip1[4] + ip1[8] + 1) >> 1;
+ op1[1] = (ip1[0] - ip1[1] + ip1[4] - ip1[8]) >> 1;
+ op1[4] = (ip1[0] + ip1[1] - ip1[4] - ip1[8]) >> 1;
+ op1[8] = (ip1[0] - ip1[1] - ip1[4] + ip1[8]) >> 1;
+}
+
+/* For testing: select either the integer or the floating-point hybrid
+ * transform implementation as vp9_fht_c. */
+#define TEST_INT 1
+#if TEST_INT
+#define vp9_fht_int_c vp9_fht_c
+#else
+#define vp9_fht_float_c vp9_fht_c
+#endif
+
+void vp9_fht_float_c(const int16_t *input, int pitch, int16_t *output,
+ TX_TYPE tx_type, int tx_dim) {
+ vp9_clear_system_state(); // Make it simd safe : __asm emms;
+ {
+ int i, j, k;
+    float bufa[256], bufb[256]; // buffers are for floating-point test purposes
+    // the implementation could be simplified in
+    // conjunction with the integer transform
+ const int16_t *ip = input;
+ int16_t *op = output;
+
+ float *pfa = &bufa[0];
+ float *pfb = &bufb[0];
+
+ // pointers to vertical and horizontal transforms
+ const float *ptv, *pth;
+
+ assert(tx_type != DCT_DCT);
+ // load and convert residual array into floating-point
+ for (j = 0; j < tx_dim; j++) {
+ for (i = 0; i < tx_dim; i++) {
+ pfa[i] = (float)ip[i];
+ }
+ pfa += tx_dim;
+ ip += pitch / 2;
+ }
+
+ // vertical transformation
+ pfa = &bufa[0];
+ pfb = &bufb[0];
+
+ switch (tx_type) {
+ case ADST_ADST :
+ case ADST_DCT :
+ ptv = (tx_dim == 4) ? &adst_4[0] :
+ ((tx_dim == 8) ? &adst_8[0] : &adst_16[0]);
+ break;
+
+ default :
+ ptv = (tx_dim == 4) ? &dct_4[0] :
+ ((tx_dim == 8) ? &dct_8[0] : &dct_16[0]);
+ break;
+ }
+
+ for (j = 0; j < tx_dim; j++) {
+ for (i = 0; i < tx_dim; i++) {
+ pfb[i] = 0;
+ for (k = 0; k < tx_dim; k++) {
+ pfb[i] += ptv[k] * pfa[(k * tx_dim)];
+ }
+ pfa += 1;
+ }
+ pfb += tx_dim;
+ ptv += tx_dim;
+ pfa = &bufa[0];
+ }
+
+ // horizontal transformation
+ pfa = &bufa[0];
+ pfb = &bufb[0];
+
+ switch (tx_type) {
+ case ADST_ADST :
+ case DCT_ADST :
+ pth = (tx_dim == 4) ? &adst_4[0] :
+ ((tx_dim == 8) ? &adst_8[0] : &adst_16[0]);
+ break;
+
+ default :
+ pth = (tx_dim == 4) ? &dct_4[0] :
+ ((tx_dim == 8) ? &dct_8[0] : &dct_16[0]);
+ break;
+ }
+
+ for (j = 0; j < tx_dim; j++) {
+ for (i = 0; i < tx_dim; i++) {
+ pfa[i] = 0;
+ for (k = 0; k < tx_dim; k++) {
+ pfa[i] += pfb[k] * pth[k];
+ }
+ pth += tx_dim;
+ }
+
+ pfa += tx_dim;
+ pfb += tx_dim;
+ // pth -= tx_dim * tx_dim;
+
+ switch (tx_type) {
+ case ADST_ADST :
+ case DCT_ADST :
+ pth = (tx_dim == 4) ? &adst_4[0] :
+ ((tx_dim == 8) ? &adst_8[0] : &adst_16[0]);
+ break;
+
+ default :
+ pth = (tx_dim == 4) ? &dct_4[0] :
+ ((tx_dim == 8) ? &dct_8[0] : &dct_16[0]);
+ break;
+ }
+ }
+
+ // convert to short integer format and load BLOCKD buffer
+ op = output;
+ pfa = &bufa[0];
+
+ for (j = 0; j < tx_dim; j++) {
+ for (i = 0; i < tx_dim; i++) {
+ op[i] = (pfa[i] > 0 ) ? (int16_t)( 8 * pfa[i] + 0.49) :
+ -(int16_t)(- 8 * pfa[i] + 0.49);
+ }
+ op += tx_dim;
+ pfa += tx_dim;
+ }
+ }
+ vp9_clear_system_state(); // Make it simd safe : __asm emms;
+}
+
+/* Converted the transforms to integer form. */
+#define VERTICAL_SHIFT 11
+#define VERTICAL_ROUNDING ((1 << (VERTICAL_SHIFT - 1)) - 1)
+#define HORIZONTAL_SHIFT 16
+#define HORIZONTAL_ROUNDING ((1 << (HORIZONTAL_SHIFT - 1)) - 1)
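+/* The integer basis tables above are the float tables scaled by 2^15 (Q15).
+ * With Q15 coefficients, the vertical pass (>> VERTICAL_SHIFT) and the
+ * horizontal pass (>> HORIZONTAL_SHIFT) leave an overall output scale of
+ * 2^(15 + 15 - 11 - 16) = 8, matching the "8 * pfa[i]" scaling used when
+ * the floating-point version rounds its result to integers. */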
+void vp9_fht_int_c(const int16_t *input, int pitch, int16_t *output,
+ TX_TYPE tx_type, int tx_dim) {
+ int i, j, k;
+ int16_t imbuf[256];
+
+ const int16_t *ip = input;
+ int16_t *op = output;
+ int16_t *im = &imbuf[0];
+
+ /* pointers to vertical and horizontal transforms. */
+ const int16_t *ptv = NULL, *pth = NULL;
+
+ switch (tx_type) {
+ case ADST_ADST :
+ ptv = pth = (tx_dim == 4) ? &adst_i4[0]
+ : ((tx_dim == 8) ? &adst_i8[0]
+ : &adst_i16[0]);
+ break;
+ case ADST_DCT :
+ ptv = (tx_dim == 4) ? &adst_i4[0]
+ : ((tx_dim == 8) ? &adst_i8[0] : &adst_i16[0]);
+ pth = (tx_dim == 4) ? &dct_i4[0]
+ : ((tx_dim == 8) ? &dct_i8[0] : &dct_i16[0]);
+ break;
+ case DCT_ADST :
+ ptv = (tx_dim == 4) ? &dct_i4[0]
+ : ((tx_dim == 8) ? &dct_i8[0] : &dct_i16[0]);
+ pth = (tx_dim == 4) ? &adst_i4[0]
+ : ((tx_dim == 8) ? &adst_i8[0] : &adst_i16[0]);
+ break;
+ case DCT_DCT :
+ ptv = pth = (tx_dim == 4) ? &dct_i4[0]
+ : ((tx_dim == 8) ? &dct_i8[0] : &dct_i16[0]);
+ break;
+ default:
+ assert(0);
+ break;
+ }
+
+ /* vertical transformation */
+ for (j = 0; j < tx_dim; j++) {
+ for (i = 0; i < tx_dim; i++) {
+ int temp = 0;
+
+ for (k = 0; k < tx_dim; k++) {
+ temp += ptv[k] * ip[(k * (pitch >> 1))];
+ }
+
+ im[i] = (int16_t)((temp + VERTICAL_ROUNDING) >> VERTICAL_SHIFT);
+ ip++;
+ }
+ im += tx_dim; // 16
+ ptv += tx_dim;
+ ip = input;
+ }
+
+ /* horizontal transformation */
+ im = &imbuf[0];
+
+ for (j = 0; j < tx_dim; j++) {
+ const int16_t *pthc = pth;
+
+ for (i = 0; i < tx_dim; i++) {
+ int temp = 0;
+
+ for (k = 0; k < tx_dim; k++) {
+ temp += im[k] * pthc[k];
+ }
+
+ op[i] = (int16_t)((temp + HORIZONTAL_ROUNDING) >> HORIZONTAL_SHIFT);
+ pthc += tx_dim;
+ }
+
+ im += tx_dim; // 16
+ op += tx_dim;
+ }
+}
+
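+// 4x4 forward DCT (as in VP8): a row pass with inputs scaled up by 32
+// (<< 5), followed by a column pass with a final >> 4. The constants 5352
+// and 2217 are sqrt(2)*cos(pi/8) and sqrt(2)*sin(pi/8) in Q12.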
+void vp9_short_fdct4x4_c(short *input, short *output, int pitch) {
+ int i;
+ int a1, b1, c1, d1;
+ short *ip = input;
+ short *op = output;
+
+ for (i = 0; i < 4; i++) {
+ a1 = ((ip[0] + ip[3]) << 5);
+ b1 = ((ip[1] + ip[2]) << 5);
+ c1 = ((ip[1] - ip[2]) << 5);
+ d1 = ((ip[0] - ip[3]) << 5);
+
+ op[0] = a1 + b1;
+ op[2] = a1 - b1;
+
+ op[1] = (c1 * 2217 + d1 * 5352 + 14500) >> 12;
+ op[3] = (d1 * 2217 - c1 * 5352 + 7500) >> 12;
+
+ ip += pitch / 2;
+ op += 4;
+
+ }
+ ip = output;
+ op = output;
+ for (i = 0; i < 4; i++) {
+ a1 = ip[0] + ip[12];
+ b1 = ip[4] + ip[8];
+ c1 = ip[4] - ip[8];
+ d1 = ip[0] - ip[12];
+
+ op[0] = (a1 + b1 + 7) >> 4;
+ op[8] = (a1 - b1 + 7) >> 4;
+
+ op[4] = ((c1 * 2217 + d1 * 5352 + 12000) >> 16) + (d1 != 0);
+ op[12] = (d1 * 2217 - c1 * 5352 + 51000) >> 16;
+
+ ip++;
+ op++;
+ }
+}
+
+void vp9_short_fdct8x4_c(short *input, short *output, int pitch)
+{
+ vp9_short_fdct4x4_c(input, output, pitch);
+ vp9_short_fdct4x4_c(input + 4, output + 16, pitch);
+}
+
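+// 4x4 Walsh-Hadamard transform: a column pass followed by a row pass, each
+// pass halving its results (the +1 rounds the first output of each pass).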
+void vp9_short_walsh4x4_c(short *input, short *output, int pitch) {
+ int i;
+ int a1, b1, c1, d1;
+ short *ip = input;
+ short *op = output;
+ int pitch_short = pitch >> 1;
+
+ for (i = 0; i < 4; i++) {
+ a1 = ip[0 * pitch_short] + ip[3 * pitch_short];
+ b1 = ip[1 * pitch_short] + ip[2 * pitch_short];
+ c1 = ip[1 * pitch_short] - ip[2 * pitch_short];
+ d1 = ip[0 * pitch_short] - ip[3 * pitch_short];
+
+ op[0] = (a1 + b1 + 1) >> 1;
+ op[4] = (c1 + d1) >> 1;
+ op[8] = (a1 - b1) >> 1;
+ op[12] = (d1 - c1) >> 1;
+
+ ip++;
+ op++;
+ }
+ ip = output;
+ op = output;
+
+ for (i = 0; i < 4; i++) {
+ a1 = ip[0] + ip[3];
+ b1 = ip[1] + ip[2];
+ c1 = ip[1] - ip[2];
+ d1 = ip[0] - ip[3];
+
+ op[0] = (a1 + b1 + 1) >> 1;
+ op[1] = (c1 + d1) >> 1;
+ op[2] = (a1 - b1) >> 1;
+ op[3] = (d1 - c1) >> 1;
+
+ ip += 4;
+ op += 4;
+ }
+}
+
+#if CONFIG_LOSSLESS
+void vp9_short_walsh4x4_lossless_c(short *input, short *output, int pitch) {
+ int i;
+ int a1, b1, c1, d1;
+ short *ip = input;
+ short *op = output;
+ int pitch_short = pitch >> 1;
+
+ for (i = 0; i < 4; i++) {
+ a1 = (ip[0 * pitch_short] + ip[3 * pitch_short]) >> Y2_WHT_UPSCALE_FACTOR;
+ b1 = (ip[1 * pitch_short] + ip[2 * pitch_short]) >> Y2_WHT_UPSCALE_FACTOR;
+ c1 = (ip[1 * pitch_short] - ip[2 * pitch_short]) >> Y2_WHT_UPSCALE_FACTOR;
+ d1 = (ip[0 * pitch_short] - ip[3 * pitch_short]) >> Y2_WHT_UPSCALE_FACTOR;
+
+ op[0] = (a1 + b1 + 1) >> 1;
+ op[4] = (c1 + d1) >> 1;
+ op[8] = (a1 - b1) >> 1;
+ op[12] = (d1 - c1) >> 1;
+
+ ip++;
+ op++;
+ }
+ ip = output;
+ op = output;
+
+ for (i = 0; i < 4; i++) {
+ a1 = ip[0] + ip[3];
+ b1 = ip[1] + ip[2];
+ c1 = ip[1] - ip[2];
+ d1 = ip[0] - ip[3];
+
+ op[0] = ((a1 + b1 + 1) >> 1) << Y2_WHT_UPSCALE_FACTOR;
+ op[1] = ((c1 + d1) >> 1) << Y2_WHT_UPSCALE_FACTOR;
+ op[2] = ((a1 - b1) >> 1) << Y2_WHT_UPSCALE_FACTOR;
+ op[3] = ((d1 - c1) >> 1) << Y2_WHT_UPSCALE_FACTOR;
+
+ ip += 4;
+ op += 4;
+ }
+}
+
+void vp9_short_walsh4x4_x8_c(short *input, short *output, int pitch) {
+ int i;
+ int a1, b1, c1, d1;
+ short *ip = input;
+ short *op = output;
+ int pitch_short = pitch >> 1;
+
+ for (i = 0; i < 4; i++) {
+ a1 = ip[0 * pitch_short] + ip[3 * pitch_short];
+ b1 = ip[1 * pitch_short] + ip[2 * pitch_short];
+ c1 = ip[1 * pitch_short] - ip[2 * pitch_short];
+ d1 = ip[0 * pitch_short] - ip[3 * pitch_short];
+
+ op[0] = (a1 + b1 + 1) >> 1;
+ op[4] = (c1 + d1) >> 1;
+ op[8] = (a1 - b1) >> 1;
+ op[12] = (d1 - c1) >> 1;
+
+ ip++;
+ op++;
+ }
+ ip = output;
+ op = output;
+
+ for (i = 0; i < 4; i++) {
+ a1 = ip[0] + ip[3];
+ b1 = ip[1] + ip[2];
+ c1 = ip[1] - ip[2];
+ d1 = ip[0] - ip[3];
+
+ op[0] = ((a1 + b1 + 1) >> 1) << WHT_UPSCALE_FACTOR;
+ op[1] = ((c1 + d1) >> 1) << WHT_UPSCALE_FACTOR;
+ op[2] = ((a1 - b1) >> 1) << WHT_UPSCALE_FACTOR;
+ op[3] = ((d1 - c1) >> 1) << WHT_UPSCALE_FACTOR;
+
+ ip += 4;
+ op += 4;
+ }
+}
+
+void vp9_short_walsh8x4_x8_c(short *input, short *output, int pitch) {
+ vp9_short_walsh4x4_x8_c(input, output, pitch);
+ vp9_short_walsh4x4_x8_c(input + 4, output + 16, pitch);
+}
+#endif
+
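+// Cosine constants for the floating-point 16x16 DCT below:
+// Ck = cos(k * pi / 32), k = 1..15.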
+static const double C1 = 0.995184726672197;
+static const double C2 = 0.98078528040323;
+static const double C3 = 0.956940335732209;
+static const double C4 = 0.923879532511287;
+static const double C5 = 0.881921264348355;
+static const double C6 = 0.831469612302545;
+static const double C7 = 0.773010453362737;
+static const double C8 = 0.707106781186548;
+static const double C9 = 0.634393284163646;
+static const double C10 = 0.555570233019602;
+static const double C11 = 0.471396736825998;
+static const double C12 = 0.38268343236509;
+static const double C13 = 0.290284677254462;
+static const double C14 = 0.195090322016128;
+static const double C15 = 0.098017140329561;
+
+static void dct16x16_1d(double input[16], double output[16]) {
+ vp9_clear_system_state(); // Make it simd safe : __asm emms;
+ {
+ double step[16];
+ double intermediate[16];
+ double temp1, temp2;
+
+ // step 1
+ step[ 0] = input[0] + input[15];
+ step[ 1] = input[1] + input[14];
+ step[ 2] = input[2] + input[13];
+ step[ 3] = input[3] + input[12];
+ step[ 4] = input[4] + input[11];
+ step[ 5] = input[5] + input[10];
+ step[ 6] = input[6] + input[ 9];
+ step[ 7] = input[7] + input[ 8];
+ step[ 8] = input[7] - input[ 8];
+ step[ 9] = input[6] - input[ 9];
+ step[10] = input[5] - input[10];
+ step[11] = input[4] - input[11];
+ step[12] = input[3] - input[12];
+ step[13] = input[2] - input[13];
+ step[14] = input[1] - input[14];
+ step[15] = input[0] - input[15];
+
+ // step 2
+ output[0] = step[0] + step[7];
+ output[1] = step[1] + step[6];
+ output[2] = step[2] + step[5];
+ output[3] = step[3] + step[4];
+ output[4] = step[3] - step[4];
+ output[5] = step[2] - step[5];
+ output[6] = step[1] - step[6];
+ output[7] = step[0] - step[7];
+
+ temp1 = step[ 8]*C7;
+ temp2 = step[15]*C9;
+ output[ 8] = temp1 + temp2;
+
+ temp1 = step[ 9]*C11;
+ temp2 = step[14]*C5;
+ output[ 9] = temp1 - temp2;
+
+ temp1 = step[10]*C3;
+ temp2 = step[13]*C13;
+ output[10] = temp1 + temp2;
+
+ temp1 = step[11]*C15;
+ temp2 = step[12]*C1;
+ output[11] = temp1 - temp2;
+
+ temp1 = step[11]*C1;
+ temp2 = step[12]*C15;
+ output[12] = temp2 + temp1;
+
+ temp1 = step[10]*C13;
+ temp2 = step[13]*C3;
+ output[13] = temp2 - temp1;
+
+ temp1 = step[ 9]*C5;
+ temp2 = step[14]*C11;
+ output[14] = temp2 + temp1;
+
+ temp1 = step[ 8]*C9;
+ temp2 = step[15]*C7;
+ output[15] = temp2 - temp1;
+
+ // step 3
+ step[ 0] = output[0] + output[3];
+ step[ 1] = output[1] + output[2];
+ step[ 2] = output[1] - output[2];
+ step[ 3] = output[0] - output[3];
+
+ temp1 = output[4]*C14;
+ temp2 = output[7]*C2;
+ step[ 4] = temp1 + temp2;
+
+ temp1 = output[5]*C10;
+ temp2 = output[6]*C6;
+ step[ 5] = temp1 + temp2;
+
+ temp1 = output[5]*C6;
+ temp2 = output[6]*C10;
+ step[ 6] = temp2 - temp1;
+
+ temp1 = output[4]*C2;
+ temp2 = output[7]*C14;
+ step[ 7] = temp2 - temp1;
+
+ step[ 8] = output[ 8] + output[11];
+ step[ 9] = output[ 9] + output[10];
+ step[10] = output[ 9] - output[10];
+ step[11] = output[ 8] - output[11];
+
+ step[12] = output[12] + output[15];
+ step[13] = output[13] + output[14];
+ step[14] = output[13] - output[14];
+ step[15] = output[12] - output[15];
+
+ // step 4
+ output[ 0] = (step[ 0] + step[ 1]);
+ output[ 8] = (step[ 0] - step[ 1]);
+
+ temp1 = step[2]*C12;
+ temp2 = step[3]*C4;
+ temp1 = temp1 + temp2;
+ output[ 4] = 2*(temp1*C8);
+
+ temp1 = step[2]*C4;
+ temp2 = step[3]*C12;
+ temp1 = temp2 - temp1;
+ output[12] = 2*(temp1*C8);
+
+ output[ 2] = 2*((step[4] + step[ 5])*C8);
+ output[14] = 2*((step[7] - step[ 6])*C8);
+
+ temp1 = step[4] - step[5];
+ temp2 = step[6] + step[7];
+ output[ 6] = (temp1 + temp2);
+ output[10] = (temp1 - temp2);
+
+ intermediate[8] = step[8] + step[14];
+ intermediate[9] = step[9] + step[15];
+
+ temp1 = intermediate[8]*C12;
+ temp2 = intermediate[9]*C4;
+ temp1 = temp1 - temp2;
+ output[3] = 2*(temp1*C8);
+
+ temp1 = intermediate[8]*C4;
+ temp2 = intermediate[9]*C12;
+ temp1 = temp2 + temp1;
+ output[13] = 2*(temp1*C8);
+
+ output[ 9] = 2*((step[10] + step[11])*C8);
+
+ intermediate[11] = step[10] - step[11];
+ intermediate[12] = step[12] + step[13];
+ intermediate[13] = step[12] - step[13];
+ intermediate[14] = step[ 8] - step[14];
+ intermediate[15] = step[ 9] - step[15];
+
+ output[15] = (intermediate[11] + intermediate[12]);
+ output[ 1] = -(intermediate[11] - intermediate[12]);
+
+ output[ 7] = 2*(intermediate[13]*C8);
+
+ temp1 = intermediate[14]*C12;
+ temp2 = intermediate[15]*C4;
+ temp1 = temp1 - temp2;
+ output[11] = -2*(temp1*C8);
+
+ temp1 = intermediate[14]*C4;
+ temp2 = intermediate[15]*C12;
+ temp1 = temp2 + temp1;
+ output[ 5] = 2*(temp1*C8);
+ }
+ vp9_clear_system_state(); // Make it simd safe : __asm emms;
+}
+
+void vp9_short_fdct16x16_c(short *input, short *out, int pitch) {
+ vp9_clear_system_state(); // Make it simd safe : __asm emms;
+ {
+ int shortpitch = pitch >> 1;
+ int i, j;
+ double output[256];
+ // First transform columns
+ for (i = 0; i < 16; i++) {
+ double temp_in[16], temp_out[16];
+ for (j = 0; j < 16; j++)
+ temp_in[j] = input[j*shortpitch + i];
+ dct16x16_1d(temp_in, temp_out);
+ for (j = 0; j < 16; j++)
+ output[j*16 + i] = temp_out[j];
+ }
+ // Then transform rows
+ for (i = 0; i < 16; ++i) {
+ double temp_in[16], temp_out[16];
+ for (j = 0; j < 16; ++j)
+ temp_in[j] = output[j + i*16];
+ dct16x16_1d(temp_in, temp_out);
+ for (j = 0; j < 16; ++j)
+ output[j + i*16] = temp_out[j];
+ }
+ // Scale by some magic number
+ for (i = 0; i < 256; i++)
+ out[i] = (short)round(output[i]/2);
+ }
+ vp9_clear_system_state(); // Make it simd safe : __asm emms;
+}
--- /dev/null
+++ b/vp9/encoder/encodeframe.c
@@ -1,0 +1,2342 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "vpx_ports/config.h"
+#include "encodemb.h"
+#include "encodemv.h"
+#include "vp9/common/common.h"
+#include "onyx_int.h"
+#include "vp9/common/extend.h"
+#include "vp9/common/entropymode.h"
+#include "vp9/common/quant_common.h"
+#include "segmentation.h"
+#include "vp9/common/setupintrarecon.h"
+#include "vp9/common/reconintra4x4.h"
+#include "encodeintra.h"
+#include "vp9/common/reconinter.h"
+#include "vp9/common/invtrans.h"
+#include "rdopt.h"
+#include "vp9/common/findnearmv.h"
+#include "vp9/common/reconintra.h"
+#include "vp9/common/seg_common.h"
+#include "vpx_rtcd.h"
+#include <stdio.h>
+#include <math.h>
+#include <limits.h>
+#include "vp9/common/subpixel.h"
+#include "vpx_ports/vpx_timer.h"
+#include "vp9/common/pred_common.h"
+
+#define DBG_PRNT_SEGMAP 0
+#if CONFIG_NEWBESTREFMV
+#include "vp9/common/mvref_common.h"
+#endif
+
+
+#if CONFIG_RUNTIME_CPU_DETECT
+#define RTCD(x) &cpi->common.rtcd.x
+#define IF_RTCD(x) (x)
+#else
+#define RTCD(x) NULL
+#define IF_RTCD(x) NULL
+#endif
+
+#ifdef ENC_DEBUG
+int enc_debug = 0;
+int mb_row_debug, mb_col_debug;
+#endif
+
+extern void vp9_initialize_me_consts(VP9_COMP *cpi, int QIndex);
+
+extern void vp9_auto_select_speed(VP9_COMP *cpi);
+
+int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
+ int recon_yoffset, int recon_uvoffset,
+ int *returnrate, int *returndistortion);
+
+extern void vp9_pick_mode_inter_macroblock(VP9_COMP *cpi, MACROBLOCK *x,
+ int recon_yoffset,
+ int recon_uvoffset, int *r, int *d);
+
+void vp9_build_block_offsets(MACROBLOCK *x);
+
+void vp9_setup_block_ptrs(MACROBLOCK *x);
+
+void vp9_encode_inter_macroblock(VP9_COMP *cpi, MACROBLOCK *x, TOKENEXTRA **t,
+ int recon_yoffset, int recon_uvoffset,
+ int output_enabled);
+
+void vp9_encode_inter_superblock(VP9_COMP *cpi, MACROBLOCK *x, TOKENEXTRA **t,
+ int recon_yoffset, int recon_uvoffset,
+ int mb_col, int mb_row);
+
+void vp9_encode_intra_macro_block(VP9_COMP *cpi, MACROBLOCK *x,
+ TOKENEXTRA **t, int output_enabled);
+
+void vp9_encode_intra_super_block(VP9_COMP *cpi, MACROBLOCK *x,
+ TOKENEXTRA **t, int mb_col);
+
+static void adjust_act_zbin(VP9_COMP *cpi, MACROBLOCK *x);
+
+#ifdef MODE_STATS
+unsigned int inter_y_modes[MB_MODE_COUNT];
+unsigned int inter_uv_modes[VP9_UV_MODES];
+unsigned int inter_b_modes[B_MODE_COUNT];
+unsigned int y_modes[VP9_YMODES];
+unsigned int i8x8_modes[VP9_I8X8_MODES];
+unsigned int uv_modes[VP9_UV_MODES];
+unsigned int uv_modes_y[VP9_YMODES][VP9_UV_MODES];
+unsigned int b_modes[B_MODE_COUNT];
+#endif
+
+
+/* activity_avg must be positive, or flat regions could get a zero weight
+ * (infinite lambda), which confounds analysis.
+ * This also avoids the need for divide by zero checks in
+ * vp9_activity_masking().
+ */
+#define VP9_ACTIVITY_AVG_MIN (64)
+
+/* This is used as a reference when computing the source variance for the
+ * purposes of activity masking.
+ * Eventually this should be replaced by custom no-reference routines,
+ * which will be faster.
+ */
+static const unsigned char VP9_VAR_OFFS[16] = {
+ 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128
+};
+
+
+// Original activity measure from Tim T's code.
+static unsigned int tt_activity_measure(VP9_COMP *cpi, MACROBLOCK *x) {
+ unsigned int act;
+ unsigned int sse;
+ /* TODO: This could also be done over smaller areas (8x8), but that would
+ * require extensive changes elsewhere, as lambda is assumed to be fixed
+ * over an entire MB in most of the code.
+ * Another option is to compute four 8x8 variances, and pick a single
+ * lambda using a non-linear combination (e.g., the smallest, or second
+ * smallest, etc.).
+ */
+ act = vp9_variance16x16(x->src.y_buffer, x->src.y_stride, VP9_VAR_OFFS, 0,
+ &sse);
+ act = act << 4;
+
+ /* If the region is flat, lower the activity some more. */
+ if (act < 8 << 12)
+ act = act < 5 << 12 ? act : 5 << 12;
+
+ return act;
+}
+
+// Stub for alternative experimental activity measures.
+static unsigned int alt_activity_measure(VP9_COMP *cpi,
+ MACROBLOCK *x, int use_dc_pred) {
+ return vp9_encode_intra(cpi, x, use_dc_pred);
+}
+
+
+// Measure the activity of the current macroblock
+// What we measure here is TBD, so it is abstracted into this function.
+#define ALT_ACT_MEASURE 1
+static unsigned int mb_activity_measure(VP9_COMP *cpi, MACROBLOCK *x,
+ int mb_row, int mb_col) {
+ unsigned int mb_activity;
+
+ if (ALT_ACT_MEASURE) {
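+    // Use DC prediction for the activity measure only on the first row or
+    // first column of the frame (excluding the top-left MB), i.e. when
+    // exactly one of mb_row and mb_col is zero.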
+ int use_dc_pred = (mb_col || mb_row) && (!mb_col || !mb_row);
+
+    // Or use an alternative measure.
+ mb_activity = alt_activity_measure(cpi, x, use_dc_pred);
+ } else {
+ // Original activity measure from Tim T's code.
+ mb_activity = tt_activity_measure(cpi, x);
+ }
+
+ if (mb_activity < VP9_ACTIVITY_AVG_MIN)
+ mb_activity = VP9_ACTIVITY_AVG_MIN;
+
+ return mb_activity;
+}
+
+// Calculate an "average" mb activity value for the frame
+#define ACT_MEDIAN 0
+static void calc_av_activity(VP9_COMP *cpi, int64_t activity_sum) {
+#if ACT_MEDIAN
+ // Find median: Simple n^2 algorithm for experimentation
+ {
+ unsigned int median;
+ unsigned int i, j;
+ unsigned int *sortlist;
+ unsigned int tmp;
+
+    // Create a list to sort into
+ CHECK_MEM_ERROR(sortlist,
+ vpx_calloc(sizeof(unsigned int),
+ cpi->common.MBs));
+
+ // Copy map to sort list
+ vpx_memcpy(sortlist, cpi->mb_activity_map,
+ sizeof(unsigned int) * cpi->common.MBs);
+
+
+ // Ripple each value down to its correct position
+ for (i = 1; i < cpi->common.MBs; i ++) {
+ for (j = i; j > 0; j --) {
+ if (sortlist[j] < sortlist[j - 1]) {
+ // Swap values
+ tmp = sortlist[j - 1];
+ sortlist[j - 1] = sortlist[j];
+ sortlist[j] = tmp;
+ } else
+ break;
+ }
+ }
+
+    // Even number of MBs, so estimate the median as the mean of the two
+    // values either side.
+ median = (1 + sortlist[cpi->common.MBs >> 1] +
+ sortlist[(cpi->common.MBs >> 1) + 1]) >> 1;
+
+ cpi->activity_avg = median;
+
+ vpx_free(sortlist);
+ }
+#else
+ // Simple mean for now
+ cpi->activity_avg = (unsigned int)(activity_sum / cpi->common.MBs);
+#endif
+
+ if (cpi->activity_avg < VP9_ACTIVITY_AVG_MIN)
+ cpi->activity_avg = VP9_ACTIVITY_AVG_MIN;
+
+ // Experimental code: return fixed value normalized for several clips
+ if (ALT_ACT_MEASURE)
+ cpi->activity_avg = 100000;
+}
+
+#define USE_ACT_INDEX 0
+#define OUTPUT_NORM_ACT_STATS 0
+
+#if USE_ACT_INDEX
+// Calculate an activity index for each mb
+static void calc_activity_index(VP9_COMP *cpi, MACROBLOCK *x) {
+ VP9_COMMON *const cm = &cpi->common;
+ int mb_row, mb_col;
+
+ int64_t act;
+ int64_t a;
+ int64_t b;
+
+#if OUTPUT_NORM_ACT_STATS
+ FILE *f = fopen("norm_act.stt", "a");
+ fprintf(f, "\n%12d\n", cpi->activity_avg);
+#endif
+
+ // Reset pointers to start of activity map
+ x->mb_activity_ptr = cpi->mb_activity_map;
+
+ // Calculate normalized mb activity number.
+ for (mb_row = 0; mb_row < cm->mb_rows; mb_row++) {
+ // for each macroblock col in image
+ for (mb_col = 0; mb_col < cm->mb_cols; mb_col++) {
+ // Read activity from the map
+ act = *(x->mb_activity_ptr);
+
+ // Calculate a normalized activity number
+ a = act + 4 * cpi->activity_avg;
+ b = 4 * act + cpi->activity_avg;
+
+ if (b >= a)
+ *(x->activity_ptr) = (int)((b + (a >> 1)) / a) - 1;
+ else
+ *(x->activity_ptr) = 1 - (int)((a + (b >> 1)) / b);
+
+#if OUTPUT_NORM_ACT_STATS
+ fprintf(f, " %6d", *(x->mb_activity_ptr));
+#endif
+ // Increment activity map pointers
+ x->mb_activity_ptr++;
+ }
+
+#if OUTPUT_NORM_ACT_STATS
+ fprintf(f, "\n");
+#endif
+
+ }
+
+#if OUTPUT_NORM_ACT_STATS
+ fclose(f);
+#endif
+
+}
+#endif
+
+// Loop through all MBs. Note the activity of each, compute the average
+// activity for the frame, and calculate a normalized activity index for each.
+static void build_activity_map(VP9_COMP *cpi) {
+ MACROBLOCK *const x = &cpi->mb;
+ MACROBLOCKD *xd = &x->e_mbd;
+ VP9_COMMON *const cm = &cpi->common;
+
+#if ALT_ACT_MEASURE
+ YV12_BUFFER_CONFIG *new_yv12 = &cm->yv12_fb[cm->new_fb_idx];
+ int recon_yoffset;
+ int recon_y_stride = new_yv12->y_stride;
+#endif
+
+ int mb_row, mb_col;
+ unsigned int mb_activity;
+ int64_t activity_sum = 0;
+
+ // for each macroblock row in image
+ for (mb_row = 0; mb_row < cm->mb_rows; mb_row++) {
+#if ALT_ACT_MEASURE
+ // reset above block coeffs
+ xd->up_available = (mb_row != 0);
+ recon_yoffset = (mb_row * recon_y_stride * 16);
+#endif
+ // for each macroblock col in image
+ for (mb_col = 0; mb_col < cm->mb_cols; mb_col++) {
+#if ALT_ACT_MEASURE
+ xd->dst.y_buffer = new_yv12->y_buffer + recon_yoffset;
+ xd->left_available = (mb_col != 0);
+ recon_yoffset += 16;
+#endif
+ // Copy current mb to a buffer
+ vp9_copy_mem16x16(x->src.y_buffer, x->src.y_stride, x->thismb, 16);
+
+ // measure activity
+ mb_activity = mb_activity_measure(cpi, x, mb_row, mb_col);
+
+ // Keep frame sum
+ activity_sum += mb_activity;
+
+ // Store MB level activity details.
+ *x->mb_activity_ptr = mb_activity;
+
+ // Increment activity map pointer
+ x->mb_activity_ptr++;
+
+ // adjust to the next column of source macroblocks
+ x->src.y_buffer += 16;
+ }
+
+
+ // adjust to the next row of mbs
+ x->src.y_buffer += 16 * x->src.y_stride - 16 * cm->mb_cols;
+
+#if ALT_ACT_MEASURE
+ // extend the recon for intra prediction
+ vp9_extend_mb_row(new_yv12, xd->dst.y_buffer + 16,
+ xd->dst.u_buffer + 8, xd->dst.v_buffer + 8);
+#endif
+
+ }
+
+ // Calculate an "average" MB activity
+ calc_av_activity(cpi, activity_sum);
+
+#if USE_ACT_INDEX
+  // Calculate an activity index number for each mb
+ calc_activity_index(cpi, x);
+#endif
+
+}
+
+// Macroblock activity masking
+void vp9_activity_masking(VP9_COMP *cpi, MACROBLOCK *x) {
+#if USE_ACT_INDEX
+ x->rdmult += *(x->mb_activity_ptr) * (x->rdmult >> 2);
+ x->errorperbit = x->rdmult * 100 / (110 * x->rddiv);
+ x->errorperbit += (x->errorperbit == 0);
+#else
+ int64_t a;
+ int64_t b;
+ int64_t act = *(x->mb_activity_ptr);
+
+ // Apply the masking to the RD multiplier.
+ a = act + (2 * cpi->activity_avg);
+ b = (2 * act) + cpi->activity_avg;
+
+ x->rdmult = (unsigned int)(((int64_t)x->rdmult * b + (a >> 1)) / a);
+ x->errorperbit = x->rdmult * 100 / (110 * x->rddiv);
+ x->errorperbit += (x->errorperbit == 0);
+#endif
+
+ // Activity based Zbin adjustment
+ adjust_act_zbin(cpi, x);
+}
+
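+// Copy the mode decisions saved in the PICK_MODE_CONTEXT back into the
+// macroblock structures so the MB can be encoded with the mode chosen during
+// the mode-selection pass, and accumulate the associated rate-distortion and
+// mode-usage statistics.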
+static void update_state(VP9_COMP *cpi, MACROBLOCK *x, PICK_MODE_CONTEXT *ctx) {
+ int i;
+ MACROBLOCKD *xd = &x->e_mbd;
+ MODE_INFO *mi = &ctx->mic;
+ MB_MODE_INFO * mbmi = &xd->mode_info_context->mbmi;
+ int mb_mode = mi->mbmi.mode;
+ int mb_mode_index = ctx->best_mode_index;
+
+#if CONFIG_DEBUG
+ assert(mb_mode < MB_MODE_COUNT);
+ assert(mb_mode_index < MAX_MODES);
+ assert(mi->mbmi.ref_frame < MAX_REF_FRAMES);
+#endif
+
+  // Restore the coding context of the MB to that which was in place
+  // when the mode was picked for it.
+ vpx_memcpy(xd->mode_info_context, mi, sizeof(MODE_INFO));
+#if CONFIG_SUPERBLOCKS
+ if (mi->mbmi.encoded_as_sb) {
+ const int mis = cpi->common.mode_info_stride;
+ if (xd->mb_to_right_edge > 0)
+ vpx_memcpy(xd->mode_info_context + 1, mi, sizeof(MODE_INFO));
+ if (xd->mb_to_bottom_edge > 0) {
+ vpx_memcpy(xd->mode_info_context + mis, mi, sizeof(MODE_INFO));
+ if (xd->mb_to_right_edge > 0)
+ vpx_memcpy(xd->mode_info_context + mis + 1, mi, sizeof(MODE_INFO));
+ }
+ }
+#endif
+
+ if (mb_mode == B_PRED) {
+ for (i = 0; i < 16; i++) {
+ xd->block[i].bmi.as_mode = xd->mode_info_context->bmi[i].as_mode;
+ assert(xd->block[i].bmi.as_mode.first < MB_MODE_COUNT);
+ }
+ } else if (mb_mode == I8X8_PRED) {
+ for (i = 0; i < 16; i++) {
+ xd->block[i].bmi = xd->mode_info_context->bmi[i];
+ }
+ } else if (mb_mode == SPLITMV) {
+ vpx_memcpy(x->partition_info, &ctx->partition_info,
+ sizeof(PARTITION_INFO));
+
+ mbmi->mv[0].as_int = x->partition_info->bmi[15].mv.as_int;
+ mbmi->mv[1].as_int = x->partition_info->bmi[15].second_mv.as_int;
+ }
+
+ {
+ int segment_id = mbmi->segment_id;
+ if (!vp9_segfeature_active(xd, segment_id, SEG_LVL_EOB) ||
+ vp9_get_segdata(xd, segment_id, SEG_LVL_EOB)) {
+ for (i = 0; i < NB_TXFM_MODES; i++) {
+ cpi->rd_tx_select_diff[i] += ctx->txfm_rd_diff[i];
+ }
+ }
+ }
+
+ if (cpi->common.frame_type == KEY_FRAME) {
+ // Restore the coding modes to that held in the coding context
+ // if (mb_mode == B_PRED)
+ // for (i = 0; i < 16; i++)
+ // {
+ // xd->block[i].bmi.as_mode =
+ // xd->mode_info_context->bmi[i].as_mode;
+ // assert(xd->mode_info_context->bmi[i].as_mode < MB_MODE_COUNT);
+ // }
+#if CONFIG_INTERNAL_STATS
+ static const int kf_mode_index[] = {
+ THR_DC /*DC_PRED*/,
+ THR_V_PRED /*V_PRED*/,
+ THR_H_PRED /*H_PRED*/,
+ THR_D45_PRED /*D45_PRED*/,
+ THR_D135_PRED /*D135_PRED*/,
+ THR_D117_PRED /*D117_PRED*/,
+ THR_D153_PRED /*D153_PRED*/,
+ THR_D27_PRED /*D27_PRED*/,
+ THR_D63_PRED /*D63_PRED*/,
+ THR_TM /*TM_PRED*/,
+ THR_I8X8_PRED /*I8X8_PRED*/,
+ THR_B_PRED /*B_PRED*/,
+ };
+ cpi->mode_chosen_counts[kf_mode_index[mb_mode]]++;
+#endif
+ } else {
+ /*
+ // Reduce the activation RD thresholds for the best choice mode
+ if ((cpi->rd_baseline_thresh[mb_mode_index] > 0) &&
+ (cpi->rd_baseline_thresh[mb_mode_index] < (INT_MAX >> 2)))
+ {
+ int best_adjustment = (cpi->rd_thresh_mult[mb_mode_index] >> 2);
+
+ cpi->rd_thresh_mult[mb_mode_index] =
+ (cpi->rd_thresh_mult[mb_mode_index]
+ >= (MIN_THRESHMULT + best_adjustment)) ?
+ cpi->rd_thresh_mult[mb_mode_index] - best_adjustment :
+ MIN_THRESHMULT;
+ cpi->rd_threshes[mb_mode_index] =
+ (cpi->rd_baseline_thresh[mb_mode_index] >> 7)
+ * cpi->rd_thresh_mult[mb_mode_index];
+
+ }
+ */
+    // Note how often each mode is chosen as best
+ cpi->mode_chosen_counts[mb_mode_index]++;
+
+ cpi->prediction_error += ctx->distortion;
+ cpi->intra_error += ctx->intra_error;
+
+ cpi->rd_comp_pred_diff[0] += ctx->single_pred_diff;
+ cpi->rd_comp_pred_diff[1] += ctx->comp_pred_diff;
+ cpi->rd_comp_pred_diff[2] += ctx->hybrid_pred_diff;
+ }
+}
+
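+// Run mode selection for each of the four MBs making up a superblock,
+// accumulating the total rate and distortion. The MBs are encoded here only
+// as a side effect of the dummy encode (no tokens are emitted), so that their
+// reconstructions are available as predictors for MBs that follow in the SB.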
+static void pick_mb_modes(VP9_COMP *cpi,
+ VP9_COMMON *cm,
+ int mb_row,
+ int mb_col,
+ MACROBLOCK *x,
+ MACROBLOCKD *xd,
+ TOKENEXTRA **tp,
+ int *totalrate,
+ int *totaldist) {
+ int i;
+ int map_index;
+ int recon_yoffset, recon_uvoffset;
+ int ref_fb_idx = cm->lst_fb_idx;
+ int dst_fb_idx = cm->new_fb_idx;
+ int recon_y_stride = cm->yv12_fb[ref_fb_idx].y_stride;
+ int recon_uv_stride = cm->yv12_fb[ref_fb_idx].uv_stride;
+ ENTROPY_CONTEXT_PLANES left_context[2];
+ ENTROPY_CONTEXT_PLANES above_context[2];
+ ENTROPY_CONTEXT_PLANES *initial_above_context_ptr = cm->above_context
+ + mb_col;
+
+ // Offsets to move pointers from MB to MB within a SB in raster order
+ int row_delta[4] = { 0, +1, 0, -1};
+ int col_delta[4] = { +1, -1, +1, +1};
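+  // The deltas are applied after each MB: right, then down-left, then right,
+  // then up-right, visiting the 2x2 group of MBs in raster order and leaving
+  // the pointers at the first MB of the next SB when the loop exits.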
+
+ /* Function should not modify L & A contexts; save and restore on exit */
+ vpx_memcpy(left_context,
+ cm->left_context,
+ sizeof(left_context));
+ vpx_memcpy(above_context,
+ initial_above_context_ptr,
+ sizeof(above_context));
+
+ /* Encode MBs in raster order within the SB */
+ for (i = 0; i < 4; i++) {
+ int dy = row_delta[i];
+ int dx = col_delta[i];
+ int offset_unextended = dy * cm->mb_cols + dx;
+ int offset_extended = dy * xd->mode_info_stride + dx;
+ MB_MODE_INFO * mbmi = &xd->mode_info_context->mbmi;
+
+ // TODO Many of the index items here can be computed more efficiently!
+
+ if ((mb_row >= cm->mb_rows) || (mb_col >= cm->mb_cols)) {
+ // MB lies outside frame, move on
+ mb_row += dy;
+ mb_col += dx;
+
+ // Update pointers
+ x->src.y_buffer += 16 * (dx + dy * x->src.y_stride);
+ x->src.u_buffer += 8 * (dx + dy * x->src.uv_stride);
+ x->src.v_buffer += 8 * (dx + dy * x->src.uv_stride);
+
+ x->gf_active_ptr += offset_unextended;
+ x->partition_info += offset_extended;
+ xd->mode_info_context += offset_extended;
+ xd->prev_mode_info_context += offset_extended;
+#if CONFIG_DEBUG
+ assert((xd->prev_mode_info_context - cpi->common.prev_mip) ==
+ (xd->mode_info_context - cpi->common.mip));
+#endif
+ continue;
+ }
+
+ // Index of the MB in the SB 0..3
+ xd->mb_index = i;
+
+ map_index = (mb_row * cpi->common.mb_cols) + mb_col;
+ x->mb_activity_ptr = &cpi->mb_activity_map[map_index];
+
+ // set above context pointer
+ xd->above_context = cm->above_context + mb_col;
+
+ // Restore the appropriate left context depending on which
+    // row of the SB the MB is situated in
+ xd->left_context = cm->left_context + (i >> 1);
+
+ // Set up distance of MB to edge of frame in 1/8th pel units
+ xd->mb_to_top_edge = -((mb_row * 16) << 3);
+ xd->mb_to_left_edge = -((mb_col * 16) << 3);
+ xd->mb_to_bottom_edge = ((cm->mb_rows - 1 - mb_row) * 16) << 3;
+ xd->mb_to_right_edge = ((cm->mb_cols - 1 - mb_col) * 16) << 3;
+
+ // Set up limit values for MV components to prevent them from
+ // extending beyond the UMV borders assuming 16x16 block size
+ x->mv_row_min = -((mb_row * 16) + VP8BORDERINPIXELS - INTERP_EXTEND);
+ x->mv_col_min = -((mb_col * 16) + VP8BORDERINPIXELS - INTERP_EXTEND);
+ x->mv_row_max = ((cm->mb_rows - mb_row) * 16 +
+ (VP8BORDERINPIXELS - 16 - INTERP_EXTEND));
+ x->mv_col_max = ((cm->mb_cols - mb_col) * 16 +
+ (VP8BORDERINPIXELS - 16 - INTERP_EXTEND));
+
+ xd->up_available = (mb_row != 0);
+ xd->left_available = (mb_col != 0);
+
+ recon_yoffset = (mb_row * recon_y_stride * 16) + (mb_col * 16);
+ recon_uvoffset = (mb_row * recon_uv_stride * 8) + (mb_col * 8);
+
+ xd->dst.y_buffer = cm->yv12_fb[dst_fb_idx].y_buffer + recon_yoffset;
+ xd->dst.u_buffer = cm->yv12_fb[dst_fb_idx].u_buffer + recon_uvoffset;
+ xd->dst.v_buffer = cm->yv12_fb[dst_fb_idx].v_buffer + recon_uvoffset;
+
+ // Copy current MB to a work buffer
+ vp9_copy_mem16x16(x->src.y_buffer, x->src.y_stride, x->thismb, 16);
+
+ x->rddiv = cpi->RDDIV;
+ x->rdmult = cpi->RDMULT;
+
+ if (cpi->oxcf.tuning == VP8_TUNE_SSIM)
+ vp9_activity_masking(cpi, x);
+
+ // Is segmentation enabled
+ if (xd->segmentation_enabled) {
+ // Code to set segment id in xd->mbmi.segment_id
+ if (xd->update_mb_segmentation_map)
+ mbmi->segment_id = cpi->segmentation_map[map_index];
+ else
+ mbmi->segment_id = cm->last_frame_seg_map[map_index];
+ if (mbmi->segment_id > 3)
+ mbmi->segment_id = 0;
+
+ vp9_mb_init_quantizer(cpi, x);
+ } else
+ // Set to Segment 0 by default
+ mbmi->segment_id = 0;
+
+ x->active_ptr = cpi->active_map + map_index;
+
+#if CONFIG_SUPERBLOCKS
+ xd->mode_info_context->mbmi.encoded_as_sb = 0;
+#endif
+
+ cpi->update_context = 0; // TODO Do we need this now??
+
+ vp9_intra_prediction_down_copy(xd);
+
+ // Find best coding mode & reconstruct the MB so it is available
+ // as a predictor for MBs that follow in the SB
+ if (cm->frame_type == KEY_FRAME) {
+ int r, d;
+ vp9_rd_pick_intra_mode(cpi, x, &r, &d);
+ *totalrate += r;
+ *totaldist += d;
+
+ // Dummy encode, do not do the tokenization
+ vp9_encode_intra_macro_block(cpi, x, tp, 0);
+ // Note the encoder may have changed the segment_id
+
+ // Save the coding context
+ vpx_memcpy(&x->mb_context[i].mic, xd->mode_info_context,
+ sizeof(MODE_INFO));
+ } else {
+ int seg_id, r, d;
+
+ if (xd->segmentation_enabled && cpi->seg0_cnt > 0 &&
+ !vp9_segfeature_active(xd, 0, SEG_LVL_REF_FRAME) &&
+ vp9_segfeature_active(xd, 1, SEG_LVL_REF_FRAME) &&
+ vp9_check_segref(xd, 1, INTRA_FRAME) +
+ vp9_check_segref(xd, 1, LAST_FRAME) +
+ vp9_check_segref(xd, 1, GOLDEN_FRAME) +
+ vp9_check_segref(xd, 1, ALTREF_FRAME) == 1) {
+ cpi->seg0_progress = (cpi->seg0_idx << 16) / cpi->seg0_cnt;
+ } else {
+        cpi->seg0_progress =
+            (((mb_col & ~1) * 2 + (mb_row & ~1) * cm->mb_cols + i) << 16) /
+            cm->MBs;
+ }
+
+ vp9_pick_mode_inter_macroblock(cpi, x, recon_yoffset,
+ recon_uvoffset, &r, &d);
+ *totalrate += r;
+ *totaldist += d;
+
+ // Dummy encode, do not do the tokenization
+ vp9_encode_inter_macroblock(cpi, x, tp,
+ recon_yoffset, recon_uvoffset, 0);
+
+ seg_id = mbmi->segment_id;
+ if (cpi->mb.e_mbd.segmentation_enabled && seg_id == 0) {
+ cpi->seg0_idx++;
+ }
+ if (!xd->segmentation_enabled ||
+ !vp9_segfeature_active(xd, seg_id, SEG_LVL_REF_FRAME) ||
+ vp9_check_segref(xd, seg_id, INTRA_FRAME) +
+ vp9_check_segref(xd, seg_id, LAST_FRAME) +
+ vp9_check_segref(xd, seg_id, GOLDEN_FRAME) +
+ vp9_check_segref(xd, seg_id, ALTREF_FRAME) > 1) {
+ // Get the prediction context and status
+ int pred_flag = vp9_get_pred_flag(xd, PRED_REF);
+ int pred_context = vp9_get_pred_context(cm, xd, PRED_REF);
+
+ // Count prediction success
+ cpi->ref_pred_count[pred_context][pred_flag]++;
+ }
+ }
+
+ // Next MB
+ mb_row += dy;
+ mb_col += dx;
+
+ x->src.y_buffer += 16 * (dx + dy * x->src.y_stride);
+ x->src.u_buffer += 8 * (dx + dy * x->src.uv_stride);
+ x->src.v_buffer += 8 * (dx + dy * x->src.uv_stride);
+
+ x->gf_active_ptr += offset_unextended;
+ x->partition_info += offset_extended;
+ xd->mode_info_context += offset_extended;
+ xd->prev_mode_info_context += offset_extended;
+
+#if CONFIG_DEBUG
+ assert((xd->prev_mode_info_context - cpi->common.prev_mip) ==
+ (xd->mode_info_context - cpi->common.mip));
+#endif
+ }
+
+ /* Restore L & A coding context to those in place on entry */
+ vpx_memcpy(cm->left_context,
+ left_context,
+ sizeof(left_context));
+ vpx_memcpy(initial_above_context_ptr,
+ above_context,
+ sizeof(above_context));
+}
+
+#if CONFIG_SUPERBLOCKS
+static void pick_sb_modes(VP9_COMP *cpi,
+                          VP9_COMMON *cm,
+                          int mb_row,
+                          int mb_col,
+                          MACROBLOCK *x,
+                          MACROBLOCKD *xd,
+                          TOKENEXTRA **tp,
+                          int *totalrate,
+                          int *totaldist) {
+ int map_index;
+ int recon_yoffset, recon_uvoffset;
+ int ref_fb_idx = cm->lst_fb_idx;
+ int dst_fb_idx = cm->new_fb_idx;
+ int recon_y_stride = cm->yv12_fb[ref_fb_idx].y_stride;
+ int recon_uv_stride = cm->yv12_fb[ref_fb_idx].uv_stride;
+ ENTROPY_CONTEXT_PLANES left_context[2];
+ ENTROPY_CONTEXT_PLANES above_context[2];
+ ENTROPY_CONTEXT_PLANES *initial_above_context_ptr = cm->above_context
+ + mb_col;
+
+ /* Function should not modify L & A contexts; save and restore on exit */
+  vpx_memcpy(left_context,
+             cm->left_context,
+             sizeof(left_context));
+  vpx_memcpy(above_context,
+             initial_above_context_ptr,
+             sizeof(above_context));
+
+ map_index = (mb_row * cpi->common.mb_cols) + mb_col;
+ x->mb_activity_ptr = &cpi->mb_activity_map[map_index];
+
+ /* set above context pointer */
+ xd->above_context = cm->above_context + mb_col;
+
+ /* Restore the appropriate left context depending on which
+   * row of the SB the MB is situated in */
+ xd->left_context = cm->left_context;
+
+ // Set up distance of MB to edge of frame in 1/8th pel units
+ xd->mb_to_top_edge = -((mb_row * 16) << 3);
+ xd->mb_to_left_edge = -((mb_col * 16) << 3);
+ xd->mb_to_bottom_edge = ((cm->mb_rows - 1 - mb_row) * 16) << 3;
+ xd->mb_to_right_edge = ((cm->mb_cols - 1 - mb_col) * 16) << 3;
+
+ /* Set up limit values for MV components to prevent them from
+ * extending beyond the UMV borders assuming 16x16 block size */
+ x->mv_row_min = -((mb_row * 16) + VP8BORDERINPIXELS - INTERP_EXTEND);
+ x->mv_col_min = -((mb_col * 16) + VP8BORDERINPIXELS - INTERP_EXTEND);
+ x->mv_row_max = ((cm->mb_rows - mb_row) * 16 +
+ (VP8BORDERINPIXELS - 32 - INTERP_EXTEND));
+ x->mv_col_max = ((cm->mb_cols - mb_col) * 16 +
+ (VP8BORDERINPIXELS - 32 - INTERP_EXTEND));
+
+ xd->up_available = (mb_row != 0);
+ xd->left_available = (mb_col != 0);
+
+ recon_yoffset = (mb_row * recon_y_stride * 16) + (mb_col * 16);
+ recon_uvoffset = (mb_row * recon_uv_stride * 8) + (mb_col * 8);
+
+ xd->dst.y_buffer = cm->yv12_fb[dst_fb_idx].y_buffer + recon_yoffset;
+ xd->dst.u_buffer = cm->yv12_fb[dst_fb_idx].u_buffer + recon_uvoffset;
+ xd->dst.v_buffer = cm->yv12_fb[dst_fb_idx].v_buffer + recon_uvoffset;
+#if 0 // FIXME
+ /* Copy current MB to a work buffer */
+ vp9_copy_mem16x16(x->src.y_buffer, x->src.y_stride, x->thismb, 16);
+#endif
+ x->rddiv = cpi->RDDIV;
+ x->rdmult = cpi->RDMULT;
+  if (cpi->oxcf.tuning == VP8_TUNE_SSIM)
+ vp9_activity_masking(cpi, x);
+ /* Is segmentation enabled */
+  if (xd->segmentation_enabled) {
+ /* Code to set segment id in xd->mbmi.segment_id */
+ if (xd->update_mb_segmentation_map)
+ xd->mode_info_context->mbmi.segment_id =
+ cpi->segmentation_map[map_index] &&
+ cpi->segmentation_map[map_index + 1] &&
+ cpi->segmentation_map[map_index + cm->mb_cols] &&
+ cpi->segmentation_map[map_index + cm->mb_cols + 1];
+ else
+ xd->mode_info_context->mbmi.segment_id =
+ cm->last_frame_seg_map[map_index] &&
+ cm->last_frame_seg_map[map_index + 1] &&
+ cm->last_frame_seg_map[map_index + cm->mb_cols] &&
+ cm->last_frame_seg_map[map_index + cm->mb_cols + 1];
+ if (xd->mode_info_context->mbmi.segment_id > 3)
+ xd->mode_info_context->mbmi.segment_id = 0;
+
+ vp9_mb_init_quantizer(cpi, x);
+  } else
+ /* Set to Segment 0 by default */
+ xd->mode_info_context->mbmi.segment_id = 0;
+
+ x->active_ptr = cpi->active_map + map_index;
+
+ cpi->update_context = 0; // TODO Do we need this now??
+
+ /* Find best coding mode & reconstruct the MB so it is available
+ * as a predictor for MBs that follow in the SB */
+  if (cm->frame_type == KEY_FRAME) {
+ vp9_rd_pick_intra_mode_sb(cpi, x,
+ totalrate,
+ totaldist);
+
+ /* Save the coding context */
+ vpx_memcpy(&x->sb_context[0].mic, xd->mode_info_context,
+ sizeof(MODE_INFO));
+ } else {
+ if (xd->segmentation_enabled && cpi->seg0_cnt > 0 &&
+ !vp9_segfeature_active(xd, 0, SEG_LVL_REF_FRAME) &&
+ vp9_segfeature_active(xd, 1, SEG_LVL_REF_FRAME) &&
+ vp9_check_segref(xd, 1, INTRA_FRAME) +
+ vp9_check_segref(xd, 1, LAST_FRAME) +
+ vp9_check_segref(xd, 1, GOLDEN_FRAME) +
+ vp9_check_segref(xd, 1, ALTREF_FRAME) == 1) {
+ cpi->seg0_progress = (cpi->seg0_idx << 16) / cpi->seg0_cnt;
+ } else {
+ cpi->seg0_progress =
+ (((mb_col & ~1) * 2 + (mb_row & ~1) * cm->mb_cols) << 16) / cm->MBs;
+ }
+
+ vp9_rd_pick_inter_mode_sb(cpi, x,
+ recon_yoffset,
+ recon_uvoffset,
+ totalrate,
+ totaldist);
+ }
+
+ /* Restore L & A coding context to those in place on entry */
+  vpx_memcpy(cm->left_context,
+             left_context,
+             sizeof(left_context));
+  vpx_memcpy(initial_above_context_ptr,
+             above_context,
+             sizeof(above_context));
+}
+#endif
+
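+// Second pass over a superblock: restore the state saved by the mode
+// selection pass (pick_mb_modes / pick_sb_modes) and do the real encode,
+// this time emitting tokens. Depending on the per-MB encoded_as_sb flag
+// (CONFIG_SUPERBLOCKS), the SB is coded either as a single 32x32 unit or as
+// four independent 16x16 MBs.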
+static void encode_sb(VP9_COMP *cpi,
+ VP9_COMMON *cm,
+ int mbrow,
+ int mbcol,
+ MACROBLOCK *x,
+ MACROBLOCKD *xd,
+ TOKENEXTRA **tp) {
+ int i;
+ int map_index;
+ int mb_row, mb_col;
+ int recon_yoffset, recon_uvoffset;
+ int ref_fb_idx = cm->lst_fb_idx;
+ int dst_fb_idx = cm->new_fb_idx;
+ int recon_y_stride = cm->yv12_fb[ref_fb_idx].y_stride;
+ int recon_uv_stride = cm->yv12_fb[ref_fb_idx].uv_stride;
+ int row_delta[4] = { 0, +1, 0, -1};
+ int col_delta[4] = { +1, -1, +1, +1};
+
+ mb_row = mbrow;
+ mb_col = mbcol;
+
+ /* Encode MBs in raster order within the SB */
+ for (i = 0; i < 4; i++) {
+ int dy = row_delta[i];
+ int dx = col_delta[i];
+ int offset_extended = dy * xd->mode_info_stride + dx;
+ int offset_unextended = dy * cm->mb_cols + dx;
+ MB_MODE_INFO * mbmi = &xd->mode_info_context->mbmi;
+
+ if ((mb_row >= cm->mb_rows) || (mb_col >= cm->mb_cols)) {
+ // MB lies outside frame, move on
+ mb_row += dy;
+ mb_col += dx;
+
+ x->src.y_buffer += 16 * (dx + dy * x->src.y_stride);
+ x->src.u_buffer += 8 * (dx + dy * x->src.uv_stride);
+ x->src.v_buffer += 8 * (dx + dy * x->src.uv_stride);
+
+ x->gf_active_ptr += offset_unextended;
+ x->partition_info += offset_extended;
+ xd->mode_info_context += offset_extended;
+ xd->prev_mode_info_context += offset_extended;
+
+#if CONFIG_DEBUG
+ assert((xd->prev_mode_info_context - cpi->common.prev_mip) ==
+ (xd->mode_info_context - cpi->common.mip));
+#endif
+ continue;
+ }
+
+ xd->mb_index = i;
+
+#ifdef ENC_DEBUG
+ enc_debug = (cpi->common.current_video_frame == 0 &&
+ mb_row == 0 && mb_col == 0);
+ mb_col_debug = mb_col;
+ mb_row_debug = mb_row;
+#endif
+
+ // Restore MB state to that when it was picked
+#if CONFIG_SUPERBLOCKS
+ if (xd->mode_info_context->mbmi.encoded_as_sb) {
+ update_state(cpi, x, &x->sb_context[i]);
+ cpi->sb_count++;
+ } else
+#endif
+ update_state(cpi, x, &x->mb_context[i]);
+
+ map_index = (mb_row * cpi->common.mb_cols) + mb_col;
+ x->mb_activity_ptr = &cpi->mb_activity_map[map_index];
+
+ // reset above block coeffs
+ xd->above_context = cm->above_context + mb_col;
+ xd->left_context = cm->left_context + (i >> 1);
+
+ // Set up distance of MB to edge of the frame in 1/8th pel units
+ xd->mb_to_top_edge = -((mb_row * 16) << 3);
+ xd->mb_to_left_edge = -((mb_col * 16) << 3);
+ xd->mb_to_bottom_edge = ((cm->mb_rows - 1 - mb_row) * 16) << 3;
+ xd->mb_to_right_edge = ((cm->mb_cols - 1 - mb_col) * 16) << 3;
+
+#if CONFIG_SUPERBLOCKS
+ if (xd->mode_info_context->mbmi.encoded_as_sb) {
+ // Set up limit values for MV components to prevent them from
+ // extending beyond the UMV borders assuming 32x32 block size
+ x->mv_row_min = -((mb_row * 16) + VP8BORDERINPIXELS - INTERP_EXTEND);
+ x->mv_col_min = -((mb_col * 16) + VP8BORDERINPIXELS - INTERP_EXTEND);
+ x->mv_row_max = ((cm->mb_rows - mb_row) * 16 +
+ (VP8BORDERINPIXELS - 32 - INTERP_EXTEND));
+ x->mv_col_max = ((cm->mb_cols - mb_col) * 16 +
+ (VP8BORDERINPIXELS - 32 - INTERP_EXTEND));
+ } else {
+#endif
+ // Set up limit values for MV components to prevent them from
+ // extending beyond the UMV borders assuming 16x16 block size
+ x->mv_row_min = -((mb_row * 16) + VP8BORDERINPIXELS - INTERP_EXTEND);
+ x->mv_col_min = -((mb_col * 16) + VP8BORDERINPIXELS - INTERP_EXTEND);
+ x->mv_row_max = ((cm->mb_rows - mb_row) * 16 +
+ (VP8BORDERINPIXELS - 16 - INTERP_EXTEND));
+ x->mv_col_max = ((cm->mb_cols - mb_col) * 16 +
+ (VP8BORDERINPIXELS - 16 - INTERP_EXTEND));
+#if CONFIG_SUPERBLOCKS
+ }
+#endif
+
+ xd->up_available = (mb_row != 0);
+ xd->left_available = (mb_col != 0);
+
+ recon_yoffset = (mb_row * recon_y_stride * 16) + (mb_col * 16);
+ recon_uvoffset = (mb_row * recon_uv_stride * 8) + (mb_col * 8);
+
+ xd->dst.y_buffer = cm->yv12_fb[dst_fb_idx].y_buffer + recon_yoffset;
+ xd->dst.u_buffer = cm->yv12_fb[dst_fb_idx].u_buffer + recon_uvoffset;
+ xd->dst.v_buffer = cm->yv12_fb[dst_fb_idx].v_buffer + recon_uvoffset;
+
+ // Copy current MB to a work buffer
+ vp9_copy_mem16x16(x->src.y_buffer, x->src.y_stride, x->thismb, 16);
+
+ if (cpi->oxcf.tuning == VP8_TUNE_SSIM)
+ vp9_activity_masking(cpi, x);
+
+ // Is segmentation enabled
+ if (xd->segmentation_enabled) {
+ vp9_mb_init_quantizer(cpi, x);
+ }
+
+ x->active_ptr = cpi->active_map + map_index;
+
+ cpi->update_context = 0;
+
+#if CONFIG_SUPERBLOCKS
+ if (!xd->mode_info_context->mbmi.encoded_as_sb)
+#endif
+ vp9_intra_prediction_down_copy(xd);
+
+ if (cm->frame_type == KEY_FRAME) {
+#if CONFIG_SUPERBLOCKS
+ if (xd->mode_info_context->mbmi.encoded_as_sb)
+ vp9_encode_intra_super_block(cpi, x, tp, mb_col);
+ else
+#endif
+ vp9_encode_intra_macro_block(cpi, x, tp, 1);
+ // Note the encoder may have changed the segment_id
+
+#ifdef MODE_STATS
+ y_modes[mbmi->mode]++;
+#endif
+ } else {
+ unsigned char *segment_id;
+ int seg_ref_active;
+
+ if (xd->mode_info_context->mbmi.ref_frame) {
+ unsigned char pred_context;
+
+ pred_context = vp9_get_pred_context(cm, xd, PRED_COMP);
+
+ if (xd->mode_info_context->mbmi.second_ref_frame == INTRA_FRAME)
+ cpi->single_pred_count[pred_context]++;
+ else
+ cpi->comp_pred_count[pred_context]++;
+ }
+
+#if CONFIG_SUPERBLOCKS
+ if (xd->mode_info_context->mbmi.encoded_as_sb)
+ vp9_encode_inter_superblock(cpi, x, tp, recon_yoffset, recon_uvoffset,
+ mb_col, mb_row);
+ else
+#endif
+ vp9_encode_inter_macroblock(cpi, x, tp,
+ recon_yoffset, recon_uvoffset, 1);
+ // Note the encoder may have changed the segment_id
+
+#ifdef MODE_STATS
+ inter_y_modes[mbmi->mode]++;
+
+ if (mbmi->mode == SPLITMV) {
+ int b;
+
+ for (b = 0; b < x->partition_info->count; b++) {
+ inter_b_modes[x->partition_info->bmi[b].mode]++;
+ }
+ }
+
+#endif
+
+ // If we have just a single reference frame coded for a segment then
+      // exclude it from the reference frame counts used to work out
+      // probabilities. NOTE: At the moment we don't support custom trees
+ // for the reference frame coding for each segment but this is a
+ // possible future action.
+ segment_id = &mbmi->segment_id;
+ seg_ref_active = vp9_segfeature_active(xd, *segment_id,
+ SEG_LVL_REF_FRAME);
+ if (!seg_ref_active ||
+ ((vp9_check_segref(xd, *segment_id, INTRA_FRAME) +
+ vp9_check_segref(xd, *segment_id, LAST_FRAME) +
+ vp9_check_segref(xd, *segment_id, GOLDEN_FRAME) +
+ vp9_check_segref(xd, *segment_id, ALTREF_FRAME)) > 1)) {
+        cpi->count_mb_ref_frame_usage[mbmi->ref_frame]++;
+ }
+
+ // Count of last ref frame 0,0 usage
+ if ((mbmi->mode == ZEROMV) && (mbmi->ref_frame == LAST_FRAME))
+ cpi->inter_zz_count++;
+ }
+
+#if CONFIG_SUPERBLOCKS
+ if (xd->mode_info_context->mbmi.encoded_as_sb) {
+ x->src.y_buffer += 32;
+ x->src.u_buffer += 16;
+ x->src.v_buffer += 16;
+
+ x->gf_active_ptr += 2;
+ x->partition_info += 2;
+ xd->mode_info_context += 2;
+ xd->prev_mode_info_context += 2;
+
+ (*tp)->Token = EOSB_TOKEN;
+ (*tp)++;
+ if (mb_row < cm->mb_rows) cpi->tplist[mb_row].stop = *tp;
+ break;
+ }
+#endif
+
+ // Next MB
+ mb_row += dy;
+ mb_col += dx;
+
+ x->src.y_buffer += 16 * (dx + dy * x->src.y_stride);
+ x->src.u_buffer += 8 * (dx + dy * x->src.uv_stride);
+ x->src.v_buffer += 8 * (dx + dy * x->src.uv_stride);
+
+ x->gf_active_ptr += offset_unextended;
+ x->partition_info += offset_extended;
+ xd->mode_info_context += offset_extended;
+ xd->prev_mode_info_context += offset_extended;
+
+#if CONFIG_DEBUG
+ assert((xd->prev_mode_info_context - cpi->common.prev_mip) ==
+ (xd->mode_info_context - cpi->common.mip));
+#endif
+ (*tp)->Token = EOSB_TOKEN;
+ (*tp)++;
+ if (mb_row < cm->mb_rows) cpi->tplist[mb_row].stop = *tp;
+ }
+
+ // debug output
+#if DBG_PRNT_SEGMAP
+ {
+ FILE *statsfile;
+ statsfile = fopen("segmap2.stt", "a");
+ fprintf(statsfile, "\n");
+ fclose(statsfile);
+ }
+#endif
+}
+
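+// Encode one row of superblocks: for each SB, first pick per-MB modes, then
+// (when CONFIG_SUPERBLOCKS is enabled and the SB lies fully inside the frame)
+// pick a single 32x32 mode, choose whichever alternative has the lower RD
+// cost, and finally encode the SB with encode_sb().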
+static
+void encode_sb_row(VP9_COMP *cpi,
+ VP9_COMMON *cm,
+ int mb_row,
+ MACROBLOCK *x,
+ MACROBLOCKD *xd,
+ TOKENEXTRA **tp,
+ int *totalrate) {
+ int mb_col;
+ int mb_cols = cm->mb_cols;
+
+ // Initialize the left context for the new SB row
+ vpx_memset(cm->left_context, 0, sizeof(cm->left_context));
+
+ // Code each SB in the row
+ for (mb_col = 0; mb_col < mb_cols; mb_col += 2) {
+ int mb_rate = 0, mb_dist = 0;
+#if CONFIG_SUPERBLOCKS
+ int sb_rate = INT_MAX, sb_dist;
+#endif
+
+#if CONFIG_DEBUG
+ MODE_INFO *mic = xd->mode_info_context;
+ PARTITION_INFO *pi = x->partition_info;
+ signed char *gfa = x->gf_active_ptr;
+ unsigned char *yb = x->src.y_buffer;
+ unsigned char *ub = x->src.u_buffer;
+ unsigned char *vb = x->src.v_buffer;
+#endif
+
+#if CONFIG_SUPERBLOCKS
+ // Pick modes assuming the SB is coded as 4 independent MBs
+ xd->mode_info_context->mbmi.encoded_as_sb = 0;
+#endif
+ pick_mb_modes(cpi, cm, mb_row, mb_col, x, xd, tp, &mb_rate, &mb_dist);
+#if CONFIG_SUPERBLOCKS
+ mb_rate += vp9_cost_bit(cm->sb_coded, 0);
+#endif
+
+ x->src.y_buffer -= 32;
+ x->src.u_buffer -= 16;
+ x->src.v_buffer -= 16;
+
+ x->gf_active_ptr -= 2;
+ x->partition_info -= 2;
+ xd->mode_info_context -= 2;
+ xd->prev_mode_info_context -= 2;
+
+#if CONFIG_DEBUG
+ assert(x->gf_active_ptr == gfa);
+ assert(x->partition_info == pi);
+ assert(xd->mode_info_context == mic);
+ assert(x->src.y_buffer == yb);
+ assert(x->src.u_buffer == ub);
+ assert(x->src.v_buffer == vb);
+#endif
+
+#if CONFIG_SUPERBLOCKS
+ if (!((( mb_cols & 1) && mb_col == mb_cols - 1) ||
+ ((cm->mb_rows & 1) && mb_row == cm->mb_rows - 1))) {
+ /* Pick a mode assuming that it applies to all 4 of the MBs in the SB */
+ xd->mode_info_context->mbmi.encoded_as_sb = 1;
+ pick_sb_modes(cpi, cm, mb_row, mb_col, x, xd, tp, &sb_rate, &sb_dist);
+ sb_rate += vp9_cost_bit(cm->sb_coded, 1);
+ }
+
+ /* Decide whether to encode as a SB or 4xMBs */
+ if (sb_rate < INT_MAX &&
+ RDCOST(x->rdmult, x->rddiv, sb_rate, sb_dist) <
+ RDCOST(x->rdmult, x->rddiv, mb_rate, mb_dist)) {
+ xd->mode_info_context->mbmi.encoded_as_sb = 1;
+ xd->mode_info_context[1].mbmi.encoded_as_sb = 1;
+ xd->mode_info_context[cm->mode_info_stride].mbmi.encoded_as_sb = 1;
+ xd->mode_info_context[1 + cm->mode_info_stride].mbmi.encoded_as_sb = 1;
+ *totalrate += sb_rate;
+ } else
+#endif
+ {
+#if CONFIG_SUPERBLOCKS
+ xd->mode_info_context->mbmi.encoded_as_sb = 0;
+ if (cm->mb_cols - 1 > mb_col)
+ xd->mode_info_context[1].mbmi.encoded_as_sb = 0;
+ if (cm->mb_rows - 1 > mb_row) {
+ xd->mode_info_context[cm->mode_info_stride].mbmi.encoded_as_sb = 0;
+ if (cm->mb_cols - 1 > mb_col)
+ xd->mode_info_context[1 + cm->mode_info_stride].mbmi.encoded_as_sb = 0;
+ }
+#endif
+ *totalrate += mb_rate;
+ }
+
+ /* Encode SB using best computed mode(s) */
+ encode_sb(cpi, cm, mb_row, mb_col, x, xd, tp);
+
+#if CONFIG_DEBUG
+ assert(x->gf_active_ptr == gfa + 2);
+ assert(x->partition_info == pi + 2);
+ assert(xd->mode_info_context == mic + 2);
+ assert(x->src.y_buffer == yb + 32);
+ assert(x->src.u_buffer == ub + 16);
+ assert(x->src.v_buffer == vb + 16);
+#endif
+ }
+
+ // this is to account for the border
+ x->gf_active_ptr += mb_cols - (mb_cols & 0x1);
+ x->partition_info += xd->mode_info_stride + 1 - (mb_cols & 0x1);
+ xd->mode_info_context += xd->mode_info_stride + 1 - (mb_cols & 0x1);
+ xd->prev_mode_info_context += xd->mode_info_stride + 1 - (mb_cols & 0x1);
+
+#if CONFIG_DEBUG
+ assert((xd->prev_mode_info_context - cpi->common.prev_mip) ==
+ (xd->mode_info_context - cpi->common.mip));
+#endif
+}
+
+static void init_encode_frame_mb_context(VP9_COMP *cpi) {
+ MACROBLOCK *const x = &cpi->mb;
+ VP9_COMMON *const cm = &cpi->common;
+ MACROBLOCKD *const xd = &x->e_mbd;
+
+ // GF active flags data structure
+ x->gf_active_ptr = (signed char *)cpi->gf_active_flags;
+
+ // Activity map pointer
+ x->mb_activity_ptr = cpi->mb_activity_map;
+
+ x->act_zbin_adj = 0;
+ cpi->seg0_idx = 0;
+ vpx_memset(cpi->ref_pred_count, 0, sizeof(cpi->ref_pred_count));
+
+ x->partition_info = x->pi;
+
+ xd->mode_info_context = cm->mi;
+ xd->mode_info_stride = cm->mode_info_stride;
+ xd->prev_mode_info_context = cm->prev_mi;
+
+ xd->frame_type = cm->frame_type;
+
+ xd->frames_since_golden = cm->frames_since_golden;
+ xd->frames_till_alt_ref_frame = cm->frames_till_alt_ref_frame;
+
+ // reset intra mode contexts
+ if (cm->frame_type == KEY_FRAME)
+ vp9_init_mbmode_probs(cm);
+
+ // Copy data over into macro block data structures.
+  x->src = *cpi->Source;
+ xd->pre = cm->yv12_fb[cm->lst_fb_idx];
+ xd->dst = cm->yv12_fb[cm->new_fb_idx];
+
+ // set up frame for intra coded blocks
+ vp9_setup_intra_recon(&cm->yv12_fb[cm->new_fb_idx]);
+
+ vp9_build_block_offsets(x);
+
+ vp9_setup_block_dptrs(&x->e_mbd);
+
+ vp9_setup_block_ptrs(x);
+
+ xd->mode_info_context->mbmi.mode = DC_PRED;
+ xd->mode_info_context->mbmi.uv_mode = DC_PRED;
+
+ vp9_zero(cpi->count_mb_ref_frame_usage)
+ vp9_zero(cpi->bmode_count)
+ vp9_zero(cpi->ymode_count)
+ vp9_zero(cpi->i8x8_mode_count)
+ vp9_zero(cpi->y_uv_mode_count)
+ vp9_zero(cpi->sub_mv_ref_count)
+ vp9_zero(cpi->mbsplit_count)
+ vp9_zero(cpi->common.fc.mv_ref_ct)
+ vp9_zero(cpi->common.fc.mv_ref_ct_a)
+#if CONFIG_SUPERBLOCKS
+ vp9_zero(cpi->sb_ymode_count)
+ cpi->sb_count = 0;
+#endif
+
+ vpx_memset(cm->above_context, 0,
+ sizeof(ENTROPY_CONTEXT_PLANES) * cm->mb_cols);
+
+ xd->fullpixel_mask = 0xffffffff;
+ if (cm->full_pixel)
+ xd->fullpixel_mask = 0xfffffff8;
+}
+
+static void encode_frame_internal(VP9_COMP *cpi) {
+ int mb_row;
+ MACROBLOCK *const x = &cpi->mb;
+ VP9_COMMON *const cm = &cpi->common;
+ MACROBLOCKD *const xd = &x->e_mbd;
+
+ TOKENEXTRA *tp = cpi->tok;
+ int totalrate;
+
+ //printf("encode_frame_internal\n");
+
+ // Compute a modified set of reference frame probabilities to use when
+ // prediction fails. These are based on the current general estimates for
+ // this frame which may be updated with each iteration of the recode loop.
+ vp9_compute_mod_refprobs(cm);
+
+#if CONFIG_NEW_MVREF
+ // temp stats reset
+ vp9_zero( cpi->best_ref_index_counts );
+#endif
+
+// debug output
+#if DBG_PRNT_SEGMAP
+ {
+ FILE *statsfile;
+ statsfile = fopen("segmap2.stt", "a");
+ fprintf(statsfile, "\n");
+ fclose(statsfile);
+ }
+#endif
+
+ totalrate = 0;
+
+  // Set up functions for all frame types so we can use MC in AltRef
+ vp9_setup_interp_filters(xd, cm->mcomp_filter_type, cm);
+
+ // Reset frame count of inter 0,0 motion vector usage.
+ cpi->inter_zz_count = 0;
+
+ cpi->prediction_error = 0;
+ cpi->intra_error = 0;
+ cpi->skip_true_count[0] = cpi->skip_true_count[1] = cpi->skip_true_count[2] = 0;
+ cpi->skip_false_count[0] = cpi->skip_false_count[1] = cpi->skip_false_count[2] = 0;
+
+#if CONFIG_PRED_FILTER
+ if (cm->current_video_frame == 0) {
+ // Initially assume that we'll signal the prediction filter
+ // state at the frame level and that it is off.
+ cpi->common.pred_filter_mode = 0;
+ cpi->common.prob_pred_filter_off = 128;
+ }
+ cpi->pred_filter_on_count = 0;
+ cpi->pred_filter_off_count = 0;
+#endif
+ vp9_zero(cpi->switchable_interp_count);
+
+ xd->mode_info_context = cm->mi;
+ xd->prev_mode_info_context = cm->prev_mi;
+
+ vp9_zero(cpi->NMVcount);
+ vp9_zero(cpi->coef_counts);
+ vp9_zero(cpi->hybrid_coef_counts);
+ vp9_zero(cpi->coef_counts_8x8);
+ vp9_zero(cpi->hybrid_coef_counts_8x8);
+ vp9_zero(cpi->coef_counts_16x16);
+ vp9_zero(cpi->hybrid_coef_counts_16x16);
+
+ vp9_frame_init_quantizer(cpi);
+
+ vp9_initialize_rd_consts(cpi, cm->base_qindex + cm->y1dc_delta_q);
+ vp9_initialize_me_consts(cpi, cm->base_qindex);
+
+ if (cpi->oxcf.tuning == VP8_TUNE_SSIM) {
+ // Initialize encode frame context.
+ init_encode_frame_mb_context(cpi);
+
+ // Build a frame level activity map
+ build_activity_map(cpi);
+ }
+
+  // Re-initialize the encode frame context.
+ init_encode_frame_mb_context(cpi);
+
+ vpx_memset(cpi->rd_comp_pred_diff, 0, sizeof(cpi->rd_comp_pred_diff));
+ vpx_memset(cpi->single_pred_count, 0, sizeof(cpi->single_pred_count));
+ vpx_memset(cpi->comp_pred_count, 0, sizeof(cpi->comp_pred_count));
+ vpx_memset(cpi->txfm_count, 0, sizeof(cpi->txfm_count));
+ vpx_memset(cpi->txfm_count_8x8p, 0, sizeof(cpi->txfm_count_8x8p));
+ vpx_memset(cpi->rd_tx_select_diff, 0, sizeof(cpi->rd_tx_select_diff));
+ {
+ struct vpx_usec_timer emr_timer;
+ vpx_usec_timer_start(&emr_timer);
+
+ {
+ // For each row of SBs in the frame
+ for (mb_row = 0; mb_row < cm->mb_rows; mb_row += 2) {
+ int offset = (cm->mb_cols + 1) & ~0x1;
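+        // offset is mb_cols rounded up to an even number of columns,
+        // matching the horizontal distance covered by a row of two-MB-wide
+        // superblocks; the buffer adjustments below rewind by this width.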
+
+ encode_sb_row(cpi, cm, mb_row, x, xd, &tp, &totalrate);
+
+ // adjust to the next row of SBs
+ x->src.y_buffer += 32 * x->src.y_stride - 16 * offset;
+ x->src.u_buffer += 16 * x->src.uv_stride - 8 * offset;
+ x->src.v_buffer += 16 * x->src.uv_stride - 8 * offset;
+ }
+
+ cpi->tok_count = tp - cpi->tok;
+ }
+
+ vpx_usec_timer_mark(&emr_timer);
+ cpi->time_encode_mb_row += vpx_usec_timer_elapsed(&emr_timer);
+
+ }
+
+ // 256 rate units to the bit,
+ // projected_frame_size in units of BYTES
+ cpi->projected_frame_size = totalrate >> 8;
+
+
+#if 0
+ // Keep record of the total distortion this time around for future use
+ cpi->last_frame_distortion = cpi->frame_distortion;
+#endif
+
+}
+
+static int check_dual_ref_flags(VP9_COMP *cpi) {
+ MACROBLOCKD *xd = &cpi->mb.e_mbd;
+ int ref_flags = cpi->ref_frame_flags;
+
+ if (vp9_segfeature_active(xd, 1, SEG_LVL_REF_FRAME)) {
+ if ((ref_flags & (VP9_LAST_FLAG | VP9_GOLD_FLAG)) == (VP9_LAST_FLAG | VP9_GOLD_FLAG) &&
+ vp9_check_segref(xd, 1, LAST_FRAME))
+ return 1;
+ if ((ref_flags & (VP9_GOLD_FLAG | VP9_ALT_FLAG)) == (VP9_GOLD_FLAG | VP9_ALT_FLAG) &&
+ vp9_check_segref(xd, 1, GOLDEN_FRAME))
+ return 1;
+ if ((ref_flags & (VP9_ALT_FLAG | VP9_LAST_FLAG)) == (VP9_ALT_FLAG | VP9_LAST_FLAG) &&
+ vp9_check_segref(xd, 1, ALTREF_FRAME))
+ return 1;
+ return 0;
+ } else {
+ return (!!(ref_flags & VP9_GOLD_FLAG) +
+ !!(ref_flags & VP9_LAST_FLAG) +
+ !!(ref_flags & VP9_ALT_FLAG)) >= 2;
+ }
+}
+
+static void reset_skip_txfm_size(VP9_COMP *cpi, TX_SIZE txfm_max) {
+ VP9_COMMON *cm = &cpi->common;
+ int mb_row, mb_col, mis = cm->mode_info_stride, segment_id;
+ MODE_INFO *mi, *mi_ptr = cm->mi;
+#if CONFIG_SUPERBLOCKS
+ MODE_INFO *sb_mi_ptr = cm->mi, *sb_mi;
+ MB_MODE_INFO *sb_mbmi;
+#endif
+ MB_MODE_INFO *mbmi;
+ MACROBLOCK *x = &cpi->mb;
+ MACROBLOCKD *xd = &x->e_mbd;
+
+ for (mb_row = 0; mb_row < cm->mb_rows; mb_row++, mi_ptr += mis) {
+ mi = mi_ptr;
+#if CONFIG_SUPERBLOCKS
+ sb_mi = sb_mi_ptr;
+#endif
+ for (mb_col = 0; mb_col < cm->mb_cols; mb_col++, mi++) {
+ mbmi = &mi->mbmi;
+#if CONFIG_SUPERBLOCKS
+ sb_mbmi = &sb_mi->mbmi;
+#endif
+ if (
+#if CONFIG_SUPERBLOCKS
+ !sb_mbmi->encoded_as_sb &&
+#endif
+ mbmi->txfm_size > txfm_max) {
+ segment_id = mbmi->segment_id;
+ xd->mode_info_context = mi;
+ assert((vp9_segfeature_active(xd, segment_id, SEG_LVL_EOB) &&
+ vp9_get_segdata(xd, segment_id, SEG_LVL_EOB) == 0) ||
+ (cm->mb_no_coeff_skip && mbmi->mb_skip_coeff));
+ mbmi->txfm_size = txfm_max;
+ }
+#if CONFIG_SUPERBLOCKS
+ if (mb_col & 1)
+ sb_mi += 2;
+#endif
+ }
+#if CONFIG_SUPERBLOCKS
+ if (mb_row & 1)
+ sb_mi_ptr += 2 * mis;
+#endif
+ }
+}
+
+void vp9_encode_frame(VP9_COMP *cpi) {
+ if (cpi->sf.RD) {
+ int i, frame_type, pred_type;
+ TXFM_MODE txfm_type;
+
+ /*
+ * This code does a single RD pass over the whole frame assuming
+ * either compound, single or hybrid prediction as per whatever has
+ * worked best for that type of frame in the past.
+ * It also predicts whether another coding mode would have worked
+     * better than this coding mode. If that is the case, it remembers
+     * that for subsequent frames.
+     * It does the same analysis for transform size selection as well.
+ */
+ if (cpi->common.frame_type == KEY_FRAME)
+ frame_type = 0;
+ else if (cpi->is_src_frame_alt_ref && cpi->common.refresh_golden_frame)
+ frame_type = 3;
+ else if (cpi->common.refresh_golden_frame || cpi->common.refresh_alt_ref_frame)
+ frame_type = 1;
+ else
+ frame_type = 2;
+
+ /* prediction (compound, single or hybrid) mode selection */
+ if (frame_type == 3)
+ pred_type = SINGLE_PREDICTION_ONLY;
+ else if (cpi->rd_prediction_type_threshes[frame_type][1] >
+ cpi->rd_prediction_type_threshes[frame_type][0] &&
+ cpi->rd_prediction_type_threshes[frame_type][1] >
+ cpi->rd_prediction_type_threshes[frame_type][2] &&
+ check_dual_ref_flags(cpi) && cpi->static_mb_pct == 100)
+ pred_type = COMP_PREDICTION_ONLY;
+ else if (cpi->rd_prediction_type_threshes[frame_type][0] >
+ cpi->rd_prediction_type_threshes[frame_type][2])
+ pred_type = SINGLE_PREDICTION_ONLY;
+ else
+ pred_type = HYBRID_PREDICTION;
+
+ /* transform size (4x4, 8x8, 16x16 or select-per-mb) selection */
+#if CONFIG_LOSSLESS
+ if (cpi->oxcf.lossless) {
+ txfm_type = ONLY_4X4;
+ } else
+#endif
+ /* FIXME (rbultje)
+ * this is a hack (no really), basically to work around the complete
+ * nonsense coefficient cost prediction for keyframes. The probabilities
+ * are reset to defaults, and thus we basically have no idea how expensive
+ * a 4x4 vs. 8x8 will really be. The result is that any estimate at which
+ * of the two is better is utterly bogus.
+ * I'd like to eventually remove this hack, but in order to do that, we
+ * need to move the frame reset code from the frame encode init to the
+ * bitstream write code, or alternatively keep a backup of the previous
+ * keyframe's probabilities as an estimate of what the current keyframe's
+ * coefficient cost distributions may look like. */
+ if (frame_type == 0) {
+ txfm_type = ALLOW_16X16;
+ } else
+#if 0
+ /* FIXME (rbultje)
+ * this code is disabled for a similar reason as the code above; the
+ * problem is that each time we "revert" to 4x4 only (or even 8x8 only),
+ * the coefficient probabilities for 16x16 (and 8x8) start lagging behind,
+ * thus leading to them lagging further behind and not being chosen for
+ * subsequent frames either. This is essentially a local minimum problem
+ * that we can probably fix by estimating real costs more closely within
+ * a frame, perhaps by re-calculating costs on-the-fly as frame encoding
+ * progresses. */
+ if (cpi->rd_tx_select_threshes[frame_type][TX_MODE_SELECT] >
+ cpi->rd_tx_select_threshes[frame_type][ONLY_4X4] &&
+ cpi->rd_tx_select_threshes[frame_type][TX_MODE_SELECT] >
+ cpi->rd_tx_select_threshes[frame_type][ALLOW_16X16] &&
+ cpi->rd_tx_select_threshes[frame_type][TX_MODE_SELECT] >
+ cpi->rd_tx_select_threshes[frame_type][ALLOW_8X8]) {
+ txfm_type = TX_MODE_SELECT;
+ } else if (cpi->rd_tx_select_threshes[frame_type][ONLY_4X4] >
+ cpi->rd_tx_select_threshes[frame_type][ALLOW_8X8]
+ && cpi->rd_tx_select_threshes[frame_type][ONLY_4X4] >
+ cpi->rd_tx_select_threshes[frame_type][ALLOW_16X16]
+ ) {
+ txfm_type = ONLY_4X4;
+ } else if (cpi->rd_tx_select_threshes[frame_type][ALLOW_16X16] >=
+ cpi->rd_tx_select_threshes[frame_type][ALLOW_8X8]) {
+ txfm_type = ALLOW_16X16;
+ } else
+ txfm_type = ALLOW_8X8;
+#else
+ txfm_type = cpi->rd_tx_select_threshes[frame_type][ALLOW_16X16] >=
+ cpi->rd_tx_select_threshes[frame_type][TX_MODE_SELECT] ?
+ ALLOW_16X16 : TX_MODE_SELECT;
+#endif
+ cpi->common.txfm_mode = txfm_type;
+ if (txfm_type != TX_MODE_SELECT) {
+ cpi->common.prob_tx[0] = 128;
+ cpi->common.prob_tx[1] = 128;
+ }
+ cpi->common.comp_pred_mode = pred_type;
+ encode_frame_internal(cpi);
+
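+    // Each threshold is updated as a running average:
+    // new = (old threshold + this frame's per-MB RD difference) / 2.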
+ for (i = 0; i < NB_PREDICTION_TYPES; ++i) {
+ const int diff = cpi->rd_comp_pred_diff[i] / cpi->common.MBs;
+ cpi->rd_prediction_type_threshes[frame_type][i] += diff;
+ cpi->rd_prediction_type_threshes[frame_type][i] >>= 1;
+ }
+
+ for (i = 0; i < NB_TXFM_MODES; ++i) {
+ int64_t pd = cpi->rd_tx_select_diff[i];
+ int diff;
+ if (i == TX_MODE_SELECT)
+ pd -= RDCOST(cpi->mb.rdmult, cpi->mb.rddiv, 2048 * (TX_SIZE_MAX - 1), 0);
+ diff = pd / cpi->common.MBs;
+ cpi->rd_tx_select_threshes[frame_type][i] += diff;
+ cpi->rd_tx_select_threshes[frame_type][i] /= 2;
+ }
+
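+    // If hybrid prediction was selected but the encode pass never actually
+    // used one of the two prediction types, signal only the type that was
+    // used for this frame.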
+ if (cpi->common.comp_pred_mode == HYBRID_PREDICTION) {
+ int single_count_zero = 0;
+ int comp_count_zero = 0;
+
+ for (i = 0; i < COMP_PRED_CONTEXTS; i++) {
+ single_count_zero += cpi->single_pred_count[i];
+ comp_count_zero += cpi->comp_pred_count[i];
+ }
+
+ if (comp_count_zero == 0) {
+ cpi->common.comp_pred_mode = SINGLE_PREDICTION_ONLY;
+ } else if (single_count_zero == 0) {
+ cpi->common.comp_pred_mode = COMP_PREDICTION_ONLY;
+ }
+ }
+
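+    // Similarly, if per-MB transform size selection was used but some sizes
+    // were never chosen, fall back to a fixed transform mode and, where
+    // needed, cap the size recorded for skipped MBs (reset_skip_txfm_size).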
+ if (cpi->common.txfm_mode == TX_MODE_SELECT) {
+ const int count4x4 = cpi->txfm_count[TX_4X4] + cpi->txfm_count_8x8p[TX_4X4];
+ const int count8x8 = cpi->txfm_count[TX_8X8];
+ const int count8x8_8x8p = cpi->txfm_count_8x8p[TX_8X8];
+ const int count16x16 = cpi->txfm_count[TX_16X16];
+
+ if (count4x4 == 0 && count16x16 == 0) {
+ cpi->common.txfm_mode = ALLOW_8X8;
+ reset_skip_txfm_size(cpi, TX_8X8);
+ } else if (count8x8 == 0 && count16x16 == 0 && count8x8_8x8p == 0) {
+ cpi->common.txfm_mode = ONLY_4X4;
+ reset_skip_txfm_size(cpi, TX_4X4);
+ } else if (count8x8 == 0 && count4x4 == 0) {
+ cpi->common.txfm_mode = ALLOW_16X16;
+ }
+ }
+ } else {
+ encode_frame_internal(cpi);
+ }
+
+}
+
+void vp9_setup_block_ptrs(MACROBLOCK *x) {
+ int r, c;
+ int i;
+
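+  // Per-block src_diff layout: blocks 0-15 are the 16x16 luma residual
+  // (offsets 0..255), blocks 16-19 are U (256..319), blocks 20-23 are V
+  // (320..383) and block 24 is the second-order (Y2) block at offset 384.
+  // Each block also gets its own 16-entry slice of x->coeff.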
+ for (r = 0; r < 4; r++) {
+ for (c = 0; c < 4; c++) {
+ x->block[r * 4 + c].src_diff = x->src_diff + r * 4 * 16 + c * 4;
+ }
+ }
+
+ for (r = 0; r < 2; r++) {
+ for (c = 0; c < 2; c++) {
+ x->block[16 + r * 2 + c].src_diff = x->src_diff + 256 + r * 4 * 8 + c * 4;
+ }
+ }
+
+
+ for (r = 0; r < 2; r++) {
+ for (c = 0; c < 2; c++) {
+ x->block[20 + r * 2 + c].src_diff = x->src_diff + 320 + r * 4 * 8 + c * 4;
+ }
+ }
+
+ x->block[24].src_diff = x->src_diff + 384;
+
+
+ for (i = 0; i < 25; i++) {
+ x->block[i].coeff = x->coeff + i * 16;
+ }
+}
+
+void vp9_build_block_offsets(MACROBLOCK *x) {
+ int block = 0;
+ int br, bc;
+
+ vp9_build_block_doffsets(&x->e_mbd);
+
+ // y blocks
+ x->thismb_ptr = &x->thismb[0];
+ for (br = 0; br < 4; br++) {
+ for (bc = 0; bc < 4; bc++) {
+ BLOCK *this_block = &x->block[block];
+ // this_block->base_src = &x->src.y_buffer;
+ // this_block->src_stride = x->src.y_stride;
+ // this_block->src = 4 * br * this_block->src_stride + 4 * bc;
+ this_block->base_src = &x->thismb_ptr;
+ this_block->src_stride = 16;
+ this_block->src = 4 * br * 16 + 4 * bc;
+ ++block;
+ }
+ }
+
+ // u blocks
+ for (br = 0; br < 2; br++) {
+ for (bc = 0; bc < 2; bc++) {
+ BLOCK *this_block = &x->block[block];
+ this_block->base_src = &x->src.u_buffer;
+ this_block->src_stride = x->src.uv_stride;
+ this_block->src = 4 * br * this_block->src_stride + 4 * bc;
+ ++block;
+ }
+ }
+
+ // v blocks
+ for (br = 0; br < 2; br++) {
+ for (bc = 0; bc < 2; bc++) {
+ BLOCK *this_block = &x->block[block];
+ this_block->base_src = &x->src.v_buffer;
+ this_block->src_stride = x->src.uv_stride;
+ this_block->src = 4 * br * this_block->src_stride + 4 * bc;
+ ++block;
+ }
+ }
+}
+
+static void sum_intra_stats(VP9_COMP *cpi, MACROBLOCK *x) {
+ const MACROBLOCKD *xd = &x->e_mbd;
+ const MB_PREDICTION_MODE m = xd->mode_info_context->mbmi.mode;
+ const MB_PREDICTION_MODE uvm = xd->mode_info_context->mbmi.uv_mode;
+
+#ifdef MODE_STATS
+ const int is_key = cpi->common.frame_type == KEY_FRAME;
+
+ ++ (is_key ? uv_modes : inter_uv_modes)[uvm];
+ ++ uv_modes_y[m][uvm];
+
+ if (m == B_PRED) {
+ unsigned int *const bct = is_key ? b_modes : inter_b_modes;
+
+ int b = 0;
+
+ do {
+ ++ bct[xd->block[b].bmi.as_mode.first];
+ } while (++b < 16);
+ }
+
+ if (m == I8X8_PRED) {
+ i8x8_modes[xd->block[0].bmi.as_mode.first]++;
+ i8x8_modes[xd->block[2].bmi.as_mode.first]++;
+ i8x8_modes[xd->block[8].bmi.as_mode.first]++;
+ i8x8_modes[xd->block[10].bmi.as_mode.first]++;
+ }
+#endif
+
+#if CONFIG_SUPERBLOCKS
+ if (xd->mode_info_context->mbmi.encoded_as_sb) {
+ ++cpi->sb_ymode_count[m];
+ } else
+#endif
+ ++cpi->ymode_count[m];
+ if (m != I8X8_PRED)
+ ++cpi->y_uv_mode_count[m][uvm];
+ else {
+ cpi->i8x8_mode_count[xd->block[0].bmi.as_mode.first]++;
+ cpi->i8x8_mode_count[xd->block[2].bmi.as_mode.first]++;
+ cpi->i8x8_mode_count[xd->block[8].bmi.as_mode.first]++;
+ cpi->i8x8_mode_count[xd->block[10].bmi.as_mode.first]++;
+ }
+ if (m == B_PRED) {
+ int b = 0;
+ do {
+ ++ cpi->bmode_count[xd->block[b].bmi.as_mode.first];
+ } while (++b < 16);
+ }
+}
+
+// Experimental stub function to create a per MB zbin adjustment based on
+// some previously calculated measure of MB activity.
+static void adjust_act_zbin(VP9_COMP *cpi, MACROBLOCK *x) {
+#if USE_ACT_INDEX
+ x->act_zbin_adj = *(x->mb_activity_ptr);
+#else
+ int64_t a;
+ int64_t b;
+ int64_t act = *(x->mb_activity_ptr);
+
+ // Apply the masking to the RD multiplier.
+ a = act + 4 * cpi->activity_avg;
+ b = 4 * act + cpi->activity_avg;
+
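+  // With this weighting the adjustment is 0 when act equals activity_avg
+  // and saturates at roughly +/-3 for blocks far more or far less active
+  // than the frame average.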
+ if (act > cpi->activity_avg)
+ x->act_zbin_adj = (int)(((int64_t)b + (a >> 1)) / a) - 1;
+ else
+ x->act_zbin_adj = 1 - (int)(((int64_t)a + (b >> 1)) / b);
+#endif
+}
+
+#if CONFIG_SUPERBLOCKS
+static void update_sb_skip_coeff_state(VP9_COMP *cpi,
+ MACROBLOCK *x,
+ ENTROPY_CONTEXT_PLANES ta[4],
+ ENTROPY_CONTEXT_PLANES tl[4],
+ TOKENEXTRA *t[4],
+ TOKENEXTRA **tp,
+ int skip[4])
+{
+ TOKENEXTRA tokens[4][16 * 24];
+ int n_tokens[4], n;
+
+ // if there were no skips, we don't need to do anything
+ if (!skip[0] && !skip[1] && !skip[2] && !skip[3])
+ return;
+
+ // if we don't do coeff skipping for this frame, we don't
+ // need to do anything here
+ if (!cpi->common.mb_no_coeff_skip)
+ return;
+
+ // if all 4 MBs skipped coeff coding, nothing to be done
+ if (skip[0] && skip[1] && skip[2] && skip[3])
+ return;
+
+  // So the situation now is that we want to skip coeffs
+  // for some MBs, but not all, and we didn't code EOB
+  // tokens for them. However, the skip flag for this
+  // SB will be 0 overall, so we need to insert EOBs in the
+  // middle of the token stream. Do so here.
+ n_tokens[0] = t[1] - t[0];
+ n_tokens[1] = t[2] - t[1];
+ n_tokens[2] = t[3] - t[2];
+ n_tokens[3] = *tp - t[3];
+ if (n_tokens[0])
+ memcpy(tokens[0], t[0], n_tokens[0] * sizeof(*t[0]));
+ if (n_tokens[1])
+ memcpy(tokens[1], t[1], n_tokens[1] * sizeof(*t[0]));
+ if (n_tokens[2])
+ memcpy(tokens[2], t[2], n_tokens[2] * sizeof(*t[0]));
+ if (n_tokens[3])
+ memcpy(tokens[3], t[3], n_tokens[3] * sizeof(*t[0]));
+
+ // reset pointer, stuff EOBs where necessary
+ *tp = t[0];
+ for (n = 0; n < 4; n++) {
+ if (skip[n]) {
+ x->e_mbd.above_context = &ta[n];
+ x->e_mbd.left_context = &tl[n];
+ vp9_stuff_mb(cpi, &x->e_mbd, tp, 0);
+ } else {
+ if (n_tokens[n]) {
+ memcpy(*tp, tokens[n], sizeof(*t[0]) * n_tokens[n]);
+ }
+ (*tp) += n_tokens[n];
+ }
+ }
+}
+
+void vp9_encode_intra_super_block(VP9_COMP *cpi,
+ MACROBLOCK *x,
+ TOKENEXTRA **t,
+ int mb_col) {
+ const int output_enabled = 1;
+ int n;
+ MACROBLOCKD *xd = &x->e_mbd;
+ VP9_COMMON *cm = &cpi->common;
+ const uint8_t *src = x->src.y_buffer;
+ uint8_t *dst = xd->dst.y_buffer;
+ const uint8_t *usrc = x->src.u_buffer;
+ uint8_t *udst = xd->dst.u_buffer;
+ const uint8_t *vsrc = x->src.v_buffer;
+ uint8_t *vdst = xd->dst.v_buffer;
+ int src_y_stride = x->src.y_stride, dst_y_stride = xd->dst.y_stride;
+ int src_uv_stride = x->src.uv_stride, dst_uv_stride = xd->dst.uv_stride;
+ const VP9_ENCODER_RTCD *rtcd = IF_RTCD(&cpi->rtcd);
+ TOKENEXTRA *tp[4];
+ int skip[4];
+ MODE_INFO *mi = x->e_mbd.mode_info_context;
+ ENTROPY_CONTEXT_PLANES ta[4], tl[4];
+
+ if ((cpi->oxcf.tuning == VP8_TUNE_SSIM) && output_enabled) {
+ adjust_act_zbin(cpi, x);
+ vp9_update_zbin_extra(cpi, x);
+ }
+
+ vp9_build_intra_predictors_sby_s(&x->e_mbd);
+ vp9_build_intra_predictors_sbuv_s(&x->e_mbd);
+
+ assert(x->e_mbd.mode_info_context->mbmi.txfm_size == TX_8X8);
+ for (n = 0; n < 4; n++) {
+ int x_idx = n & 1, y_idx = n >> 1;
+
+ xd->above_context = cm->above_context + mb_col + (n & 1);
+ xd->left_context = cm->left_context + (n >> 1);
+
+ vp9_subtract_mby_s_c(x->src_diff,
+ src + x_idx * 16 + y_idx * 16 * src_y_stride,
+ src_y_stride,
+ dst + x_idx * 16 + y_idx * 16 * dst_y_stride,
+ dst_y_stride);
+ vp9_subtract_mbuv_s_c(x->src_diff,
+ usrc + x_idx * 8 + y_idx * 8 * src_uv_stride,
+ vsrc + x_idx * 8 + y_idx * 8 * src_uv_stride,
+ src_uv_stride,
+ udst + x_idx * 8 + y_idx * 8 * dst_uv_stride,
+ vdst + x_idx * 8 + y_idx * 8 * dst_uv_stride,
+ dst_uv_stride);
+ vp9_transform_mb_8x8(x);
+ vp9_quantize_mb_8x8(x);
+ if (x->optimize) {
+ vp9_optimize_mby_8x8(x, rtcd);
+ vp9_optimize_mbuv_8x8(x, rtcd);
+ }
+ vp9_inverse_transform_mb_8x8(IF_RTCD(&rtcd->common->idct), &x->e_mbd);
+ vp9_recon_mby_s_c(&x->e_mbd, dst + x_idx * 16 + y_idx * 16 * dst_y_stride);
+ vp9_recon_mbuv_s_c(&x->e_mbd,
+ udst + x_idx * 8 + y_idx * 8 * dst_uv_stride,
+ vdst + x_idx * 8 + y_idx * 8 * dst_uv_stride);
+
+ if (output_enabled) {
+ memcpy(&ta[n], xd->above_context, sizeof(ta[n]));
+ memcpy(&tl[n], xd->left_context, sizeof(tl[n]));
+ tp[n] = *t;
+ xd->mode_info_context = mi + x_idx + y_idx * cm->mode_info_stride;
+ vp9_tokenize_mb(cpi, &x->e_mbd, t, 0);
+ skip[n] = xd->mode_info_context->mbmi.mb_skip_coeff;
+ }
+ }
+
+ if (output_enabled) {
+ // Tokenize
+ xd->mode_info_context = mi;
+ sum_intra_stats(cpi, x);
+ update_sb_skip_coeff_state(cpi, x, ta, tl, tp, t, skip);
+ }
+}
+#endif /* CONFIG_SUPERBLOCKS */
+
+void vp9_encode_intra_macro_block(VP9_COMP *cpi,
+ MACROBLOCK *x,
+ TOKENEXTRA **t,
+ int output_enabled) {
+ MB_MODE_INFO * mbmi = &x->e_mbd.mode_info_context->mbmi;
+ if ((cpi->oxcf.tuning == VP8_TUNE_SSIM) && output_enabled) {
+ adjust_act_zbin(cpi, x);
+ vp9_update_zbin_extra(cpi, x);
+ }
+ if (mbmi->mode == I8X8_PRED) {
+ vp9_encode_intra8x8mby(IF_RTCD(&cpi->rtcd), x);
+ vp9_encode_intra8x8mbuv(IF_RTCD(&cpi->rtcd), x);
+ } else if (mbmi->mode == B_PRED) {
+ vp9_encode_intra4x4mby(IF_RTCD(&cpi->rtcd), x);
+ } else {
+ vp9_encode_intra16x16mby(IF_RTCD(&cpi->rtcd), x);
+ }
+
+ if (mbmi->mode != I8X8_PRED) {
+ vp9_encode_intra16x16mbuv(IF_RTCD(&cpi->rtcd), x);
+ }
+
+ if (output_enabled) {
+ int segment_id = mbmi->segment_id;
+
+ // Tokenize
+ sum_intra_stats(cpi, x);
+ vp9_tokenize_mb(cpi, &x->e_mbd, t, 0);
+
+ if (cpi->common.txfm_mode == TX_MODE_SELECT &&
+ !((cpi->common.mb_no_coeff_skip && mbmi->mb_skip_coeff) ||
+ (vp9_segfeature_active(&x->e_mbd, segment_id, SEG_LVL_EOB) &&
+ vp9_get_segdata(&x->e_mbd, segment_id, SEG_LVL_EOB) == 0))) {
+ if (mbmi->mode != B_PRED && mbmi->mode != I8X8_PRED) {
+ cpi->txfm_count[mbmi->txfm_size]++;
+ } else if (mbmi->mode == I8X8_PRED) {
+ cpi->txfm_count_8x8p[mbmi->txfm_size]++;
+ }
+ } else if (cpi->common.txfm_mode >= ALLOW_16X16 && mbmi->mode <= TM_PRED) {
+ mbmi->txfm_size = TX_16X16;
+    } else if (cpi->common.txfm_mode >= ALLOW_8X8 && mbmi->mode != B_PRED) {
+ mbmi->txfm_size = TX_8X8;
+ } else {
+ mbmi->txfm_size = TX_4X4;
+ }
+ }
+#if CONFIG_NEWBESTREFMV
+ else
+ vp9_tokenize_mb(cpi, &x->e_mbd, t, 1);
+#endif
+}
+
+extern void vp9_fix_contexts(MACROBLOCKD *xd);
+
+void vp9_encode_inter_macroblock(VP9_COMP *cpi, MACROBLOCK *x,
+ TOKENEXTRA **t, int recon_yoffset,
+ int recon_uvoffset, int output_enabled) {
+ VP9_COMMON *cm = &cpi->common;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO * mbmi = &xd->mode_info_context->mbmi;
+ unsigned char *segment_id = &mbmi->segment_id;
+ int seg_ref_active;
+ unsigned char ref_pred_flag;
+
+ x->skip = 0;
+#if CONFIG_SUPERBLOCKS
+ assert(!xd->mode_info_context->mbmi.encoded_as_sb);
+#endif
+
+ vp9_setup_interp_filters(xd, mbmi->interp_filter, cm);
+ if (cpi->oxcf.tuning == VP8_TUNE_SSIM) {
+ // Adjust the zbin based on this MB rate.
+ adjust_act_zbin(cpi, x);
+ }
+
+ {
+ // Experimental code. Special case for gf and arf zeromv modes.
+ // Increase zbin size to suppress noise
+ cpi->zbin_mode_boost = 0;
+ if (cpi->zbin_mode_boost_enabled) {
+ if (mbmi->ref_frame != INTRA_FRAME) {
+ if (mbmi->mode == ZEROMV) {
+ if (mbmi->ref_frame != LAST_FRAME)
+ cpi->zbin_mode_boost = GF_ZEROMV_ZBIN_BOOST;
+ else
+ cpi->zbin_mode_boost = LF_ZEROMV_ZBIN_BOOST;
+ } else if (mbmi->mode == SPLITMV)
+ cpi->zbin_mode_boost = 0;
+ else
+ cpi->zbin_mode_boost = MV_ZBIN_BOOST;
+ }
+ }
+
+ vp9_update_zbin_extra(cpi, x);
+ }
+
+ seg_ref_active = vp9_segfeature_active(xd, *segment_id, SEG_LVL_REF_FRAME);
+
+ // SET VARIOUS PREDICTION FLAGS
+
+ // Did the chosen reference frame match its predicted value.
+ ref_pred_flag = ((mbmi->ref_frame == vp9_get_pred_ref(cm, xd)));
+ vp9_set_pred_flag(xd, PRED_REF, ref_pred_flag);
+
+ if (mbmi->ref_frame == INTRA_FRAME) {
+ if (mbmi->mode == B_PRED) {
+ vp9_encode_intra16x16mbuv(IF_RTCD(&cpi->rtcd), x);
+ vp9_encode_intra4x4mby(IF_RTCD(&cpi->rtcd), x);
+ } else if (mbmi->mode == I8X8_PRED) {
+ vp9_encode_intra8x8mby(IF_RTCD(&cpi->rtcd), x);
+ vp9_encode_intra8x8mbuv(IF_RTCD(&cpi->rtcd), x);
+ } else {
+ vp9_encode_intra16x16mbuv(IF_RTCD(&cpi->rtcd), x);
+ vp9_encode_intra16x16mby(IF_RTCD(&cpi->rtcd), x);
+ }
+
+ if (output_enabled)
+ sum_intra_stats(cpi, x);
+ } else {
+ int ref_fb_idx;
+
+ if (mbmi->ref_frame == LAST_FRAME)
+ ref_fb_idx = cpi->common.lst_fb_idx;
+ else if (mbmi->ref_frame == GOLDEN_FRAME)
+ ref_fb_idx = cpi->common.gld_fb_idx;
+ else
+ ref_fb_idx = cpi->common.alt_fb_idx;
+
+ xd->pre.y_buffer = cpi->common.yv12_fb[ref_fb_idx].y_buffer + recon_yoffset;
+ xd->pre.u_buffer = cpi->common.yv12_fb[ref_fb_idx].u_buffer + recon_uvoffset;
+ xd->pre.v_buffer = cpi->common.yv12_fb[ref_fb_idx].v_buffer + recon_uvoffset;
+
+ if (mbmi->second_ref_frame) {
+ int second_ref_fb_idx;
+
+ if (mbmi->second_ref_frame == LAST_FRAME)
+ second_ref_fb_idx = cpi->common.lst_fb_idx;
+ else if (mbmi->second_ref_frame == GOLDEN_FRAME)
+ second_ref_fb_idx = cpi->common.gld_fb_idx;
+ else
+ second_ref_fb_idx = cpi->common.alt_fb_idx;
+
+ xd->second_pre.y_buffer = cpi->common.yv12_fb[second_ref_fb_idx].y_buffer +
+ recon_yoffset;
+ xd->second_pre.u_buffer = cpi->common.yv12_fb[second_ref_fb_idx].u_buffer +
+ recon_uvoffset;
+ xd->second_pre.v_buffer = cpi->common.yv12_fb[second_ref_fb_idx].v_buffer +
+ recon_uvoffset;
+ }
+
+ if (!x->skip) {
+ vp9_encode_inter16x16(IF_RTCD(&cpi->rtcd), x);
+
+ // Clear mb_skip_coeff if mb_no_coeff_skip is not set
+ if (!cpi->common.mb_no_coeff_skip)
+ mbmi->mb_skip_coeff = 0;
+
+ } else {
+ vp9_build_1st_inter16x16_predictors_mb(xd, xd->dst.y_buffer,
+ xd->dst.u_buffer, xd->dst.v_buffer,
+ xd->dst.y_stride,
+ xd->dst.uv_stride);
+ }
+ }
+
+ if (!x->skip) {
+#ifdef ENC_DEBUG
+ if (enc_debug) {
+ int i;
+ printf("Segment=%d [%d, %d]: %d %d:\n", mbmi->segment_id, mb_col_debug,
+ mb_row_debug, xd->mb_to_left_edge, xd->mb_to_top_edge);
+ for (i = 0; i < 400; i++) {
+ printf("%3d ", xd->qcoeff[i]);
+ if (i % 16 == 15) printf("\n");
+ }
+ printf("\n");
+ printf("eobs = ");
+ for (i = 0; i < 25; i++)
+ printf("%d:%d ", i, xd->block[i].eob);
+ printf("\n");
+ fflush(stdout);
+ }
+#endif
+
+ vp9_tokenize_mb(cpi, xd, t, !output_enabled);
+
+#ifdef ENC_DEBUG
+ if (enc_debug) {
+ printf("Tokenized\n");
+ fflush(stdout);
+ }
+#endif
+ } else {
+ int mb_skip_context =
+ cpi->common.mb_no_coeff_skip ?
+ (x->e_mbd.mode_info_context - 1)->mbmi.mb_skip_coeff +
+ (x->e_mbd.mode_info_context - cpi->common.mode_info_stride)->mbmi.mb_skip_coeff :
+ 0;
+ if (cpi->common.mb_no_coeff_skip) {
+ mbmi->mb_skip_coeff = 1;
+ if (output_enabled)
+ cpi->skip_true_count[mb_skip_context]++;
+ vp9_fix_contexts(xd);
+ } else {
+ vp9_stuff_mb(cpi, xd, t, !output_enabled);
+ mbmi->mb_skip_coeff = 0;
+ if (output_enabled)
+ cpi->skip_false_count[mb_skip_context]++;
+ }
+ }
+
+ if (output_enabled) {
+ int segment_id = mbmi->segment_id;
+ if (cpi->common.txfm_mode == TX_MODE_SELECT &&
+ !((cpi->common.mb_no_coeff_skip && mbmi->mb_skip_coeff) ||
+ (vp9_segfeature_active(&x->e_mbd, segment_id, SEG_LVL_EOB) &&
+ vp9_get_segdata(&x->e_mbd, segment_id, SEG_LVL_EOB) == 0))) {
+ if (mbmi->mode != B_PRED && mbmi->mode != I8X8_PRED &&
+ mbmi->mode != SPLITMV) {
+ cpi->txfm_count[mbmi->txfm_size]++;
+ } else if (mbmi->mode == I8X8_PRED ||
+ (mbmi->mode == SPLITMV &&
+ mbmi->partitioning != PARTITIONING_4X4)) {
+ cpi->txfm_count_8x8p[mbmi->txfm_size]++;
+ }
+ } else if (mbmi->mode != B_PRED && mbmi->mode != I8X8_PRED &&
+ mbmi->mode != SPLITMV && cpi->common.txfm_mode >= ALLOW_16X16) {
+ mbmi->txfm_size = TX_16X16;
+ } else if (mbmi->mode != B_PRED &&
+ !(mbmi->mode == SPLITMV &&
+ mbmi->partitioning == PARTITIONING_4X4) &&
+ cpi->common.txfm_mode >= ALLOW_8X8) {
+ mbmi->txfm_size = TX_8X8;
+ } else {
+ mbmi->txfm_size = TX_4X4;
+ }
+ }
+}
+
+#if CONFIG_SUPERBLOCKS
+void vp9_encode_inter_superblock(VP9_COMP *cpi, MACROBLOCK *x, TOKENEXTRA **t,
+ int recon_yoffset, int recon_uvoffset,
+ int mb_col, int mb_row) {
+ const int output_enabled = 1;
+ VP9_COMMON *cm = &cpi->common;
+ MACROBLOCKD *xd = &x->e_mbd;
+ const uint8_t *src = x->src.y_buffer;
+ uint8_t *dst = xd->dst.y_buffer;
+ const uint8_t *usrc = x->src.u_buffer;
+ uint8_t *udst = xd->dst.u_buffer;
+ const uint8_t *vsrc = x->src.v_buffer;
+ uint8_t *vdst = xd->dst.v_buffer;
+ int src_y_stride = x->src.y_stride, dst_y_stride = xd->dst.y_stride;
+ int src_uv_stride = x->src.uv_stride, dst_uv_stride = xd->dst.uv_stride;
+ const VP9_ENCODER_RTCD *rtcd = IF_RTCD(&cpi->rtcd);
+ unsigned int segment_id = xd->mode_info_context->mbmi.segment_id;
+ int seg_ref_active;
+ unsigned char ref_pred_flag;
+ int n;
+ TOKENEXTRA *tp[4];
+ int skip[4];
+ MODE_INFO *mi = x->e_mbd.mode_info_context;
+ ENTROPY_CONTEXT_PLANES ta[4], tl[4];
+
+ x->skip = 0;
+
+ if (cpi->oxcf.tuning == VP8_TUNE_SSIM) {
+ // Adjust the zbin based on this MB rate.
+ adjust_act_zbin(cpi, x);
+ }
+
+ {
+ // Experimental code. Special case for gf and arf zeromv modes.
+ // Increase zbin size to suppress noise
+ cpi->zbin_mode_boost = 0;
+ if (cpi->zbin_mode_boost_enabled) {
+ if (xd->mode_info_context->mbmi.ref_frame != INTRA_FRAME) {
+ if (xd->mode_info_context->mbmi.mode == ZEROMV) {
+ if (xd->mode_info_context->mbmi.ref_frame != LAST_FRAME)
+ cpi->zbin_mode_boost = GF_ZEROMV_ZBIN_BOOST;
+ else
+ cpi->zbin_mode_boost = LF_ZEROMV_ZBIN_BOOST;
+ } else if (xd->mode_info_context->mbmi.mode == SPLITMV)
+ cpi->zbin_mode_boost = 0;
+ else
+ cpi->zbin_mode_boost = MV_ZBIN_BOOST;
+ }
+ }
+
+ vp9_update_zbin_extra(cpi, x);
+ }
+
+ seg_ref_active = vp9_segfeature_active(xd, segment_id, SEG_LVL_REF_FRAME);
+
+ // SET VARIOUS PREDICTION FLAGS
+
+ // Did the chosen reference frame match its predicted value.
+ ref_pred_flag = ((xd->mode_info_context->mbmi.ref_frame ==
+ vp9_get_pred_ref(cm, xd)));
+ vp9_set_pred_flag(xd, PRED_REF, ref_pred_flag);
+
+ if (xd->mode_info_context->mbmi.ref_frame == INTRA_FRAME) {
+ vp9_build_intra_predictors_sby_s(&x->e_mbd);
+ vp9_build_intra_predictors_sbuv_s(&x->e_mbd);
+ } else {
+ int ref_fb_idx;
+
+ if (xd->mode_info_context->mbmi.ref_frame == LAST_FRAME)
+ ref_fb_idx = cpi->common.lst_fb_idx;
+ else if (xd->mode_info_context->mbmi.ref_frame == GOLDEN_FRAME)
+ ref_fb_idx = cpi->common.gld_fb_idx;
+ else
+ ref_fb_idx = cpi->common.alt_fb_idx;
+
+ xd->pre.y_buffer = cpi->common.yv12_fb[ref_fb_idx].y_buffer + recon_yoffset;
+ xd->pre.u_buffer = cpi->common.yv12_fb[ref_fb_idx].u_buffer + recon_uvoffset;
+ xd->pre.v_buffer = cpi->common.yv12_fb[ref_fb_idx].v_buffer + recon_uvoffset;
+
+ if (xd->mode_info_context->mbmi.second_ref_frame) {
+ int second_ref_fb_idx;
+
+ if (xd->mode_info_context->mbmi.second_ref_frame == LAST_FRAME)
+ second_ref_fb_idx = cpi->common.lst_fb_idx;
+ else if (xd->mode_info_context->mbmi.second_ref_frame == GOLDEN_FRAME)
+ second_ref_fb_idx = cpi->common.gld_fb_idx;
+ else
+ second_ref_fb_idx = cpi->common.alt_fb_idx;
+
+ xd->second_pre.y_buffer = cpi->common.yv12_fb[second_ref_fb_idx].y_buffer +
+ recon_yoffset;
+ xd->second_pre.u_buffer = cpi->common.yv12_fb[second_ref_fb_idx].u_buffer +
+ recon_uvoffset;
+ xd->second_pre.v_buffer = cpi->common.yv12_fb[second_ref_fb_idx].v_buffer +
+ recon_uvoffset;
+ }
+
+ vp9_build_inter32x32_predictors_sb(xd, xd->dst.y_buffer,
+ xd->dst.u_buffer, xd->dst.v_buffer,
+ xd->dst.y_stride, xd->dst.uv_stride);
+ }
+
+ assert(x->e_mbd.mode_info_context->mbmi.txfm_size == TX_8X8);
+ for (n = 0; n < 4; n++) {
+ int x_idx = n & 1, y_idx = n >> 1;
+
+ vp9_subtract_mby_s_c(x->src_diff,
+ src + x_idx * 16 + y_idx * 16 * src_y_stride,
+ src_y_stride,
+ dst + x_idx * 16 + y_idx * 16 * dst_y_stride,
+ dst_y_stride);
+ vp9_subtract_mbuv_s_c(x->src_diff,
+ usrc + x_idx * 8 + y_idx * 8 * src_uv_stride,
+ vsrc + x_idx * 8 + y_idx * 8 * src_uv_stride,
+ src_uv_stride,
+ udst + x_idx * 8 + y_idx * 8 * dst_uv_stride,
+ vdst + x_idx * 8 + y_idx * 8 * dst_uv_stride,
+ dst_uv_stride);
+ vp9_transform_mb_8x8(x);
+ vp9_quantize_mb_8x8(x);
+ if (x->optimize) {
+ vp9_optimize_mby_8x8(x, rtcd);
+ vp9_optimize_mbuv_8x8(x, rtcd);
+ }
+ vp9_inverse_transform_mb_8x8(IF_RTCD(&rtcd->common->idct), &x->e_mbd);
+ vp9_recon_mby_s_c(&x->e_mbd,
+ dst + x_idx * 16 + y_idx * 16 * dst_y_stride);
+ vp9_recon_mbuv_s_c(&x->e_mbd,
+ udst + x_idx * 8 + y_idx * 8 * dst_uv_stride,
+ vdst + x_idx * 8 + y_idx * 8 * dst_uv_stride);
+
+ if (!x->skip) {
+ if (output_enabled) {
+ xd->left_context = cm->left_context + (n >> 1);
+ xd->above_context = cm->above_context + mb_col + (n & 1);
+ memcpy(&ta[n], xd->above_context, sizeof(ta[n]));
+ memcpy(&tl[n], xd->left_context, sizeof(tl[n]));
+ tp[n] = *t;
+ xd->mode_info_context = mi + x_idx + y_idx * cm->mode_info_stride;
+ vp9_tokenize_mb(cpi, &x->e_mbd, t, 0);
+ skip[n] = xd->mode_info_context->mbmi.mb_skip_coeff;
+ }
+ } else {
+ int mb_skip_context =
+ cpi->common.mb_no_coeff_skip ?
+ (x->e_mbd.mode_info_context - 1)->mbmi.mb_skip_coeff +
+ (x->e_mbd.mode_info_context - cpi->common.mode_info_stride)->mbmi.mb_skip_coeff :
+ 0;
+ if (cpi->common.mb_no_coeff_skip) {
+ skip[n] = xd->mode_info_context->mbmi.mb_skip_coeff = 1;
+ xd->left_context = cm->left_context + (n >> 1);
+ xd->above_context = cm->above_context + mb_col + (n & 1);
+ memcpy(&ta[n], xd->above_context, sizeof(ta[n]));
+ memcpy(&tl[n], xd->left_context, sizeof(tl[n]));
+ tp[n] = *t;
+ cpi->skip_true_count[mb_skip_context]++;
+ vp9_fix_contexts(xd);
+ } else {
+ vp9_stuff_mb(cpi, xd, t, 0);
+ xd->mode_info_context->mbmi.mb_skip_coeff = 0;
+ cpi->skip_false_count[mb_skip_context]++;
+ }
+ }
+ }
+
+ xd->mode_info_context = mi;
+ update_sb_skip_coeff_state(cpi, x, ta, tl, tp, t, skip);
+}
+#endif
--- /dev/null
+++ b/vp9/encoder/encodeintra.c
@@ -1,0 +1,289 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "vpx_ports/config.h"
+#include "vpx_rtcd.h"
+#include "vp9/common/idct.h"
+#include "quantize.h"
+#include "vp9/common/reconintra.h"
+#include "vp9/common/reconintra4x4.h"
+#include "encodemb.h"
+#include "vp9/common/invtrans.h"
+#include "encodeintra.h"
+
+#if CONFIG_RUNTIME_CPU_DETECT
+#define IF_RTCD(x) (x)
+#else
+#define IF_RTCD(x) NULL
+#endif
+
+int vp9_encode_intra(VP9_COMP *cpi, MACROBLOCK *x, int use_16x16_pred) {
+ int i;
+ int intra_pred_var = 0;
+ MB_MODE_INFO * mbmi = &x->e_mbd.mode_info_context->mbmi;
+ (void) cpi;
+
+ if (use_16x16_pred) {
+ mbmi->mode = DC_PRED;
+#if CONFIG_COMP_INTRA_PRED
+ mbmi->second_mode = (MB_PREDICTION_MODE)(DC_PRED - 1);
+#endif
+ mbmi->uv_mode = DC_PRED;
+ mbmi->ref_frame = INTRA_FRAME;
+
+ vp9_encode_intra16x16mby(IF_RTCD(&cpi->rtcd), x);
+ } else {
+ for (i = 0; i < 16; i++) {
+ x->e_mbd.block[i].bmi.as_mode.first = B_DC_PRED;
+ vp9_encode_intra4x4block(IF_RTCD(&cpi->rtcd), x, i);
+ }
+ }
+
+ intra_pred_var = vp9_get_mb_ss(x->src_diff);
+
+ return intra_pred_var;
+}
+
+void vp9_encode_intra4x4block(const VP9_ENCODER_RTCD *rtcd,
+ MACROBLOCK *x, int ib) {
+ BLOCKD *b = &x->e_mbd.block[ib];
+ BLOCK *be = &x->block[ib];
+ TX_TYPE tx_type;
+
+#if CONFIG_COMP_INTRA_PRED
+ if (b->bmi.as_mode.second == (B_PREDICTION_MODE)(B_DC_PRED - 1)) {
+#endif
+ vp9_intra4x4_predict(b, b->bmi.as_mode.first, b->predictor);
+#if CONFIG_COMP_INTRA_PRED
+ } else {
+ vp9_comp_intra4x4_predict(b, b->bmi.as_mode.first, b->bmi.as_mode.second,
+ b->predictor);
+ }
+#endif
+
+ vp9_subtract_b(be, b, 16);
+
+ tx_type = get_tx_type(&x->e_mbd, b);
+ if (tx_type != DCT_DCT) {
+ vp9_fht(be->src_diff, 32, be->coeff, tx_type, 4);
+ vp9_ht_quantize_b_4x4(be, b, tx_type);
+ vp9_ihtllm_c(b->dqcoeff, b->diff, 32, tx_type, 4);
+ } else {
+ x->vp9_short_fdct4x4(be->src_diff, be->coeff, 32);
+    x->quantize_b_4x4(be, b);
+ vp9_inverse_transform_b_4x4(IF_RTCD(&rtcd->common->idct), b, 32);
+ }
+
+ vp9_recon_b(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride);
+}
+
+void vp9_encode_intra4x4mby(const VP9_ENCODER_RTCD *rtcd, MACROBLOCK *mb) {
+ int i;
+
+ for (i = 0; i < 16; i++)
+ vp9_encode_intra4x4block(rtcd, mb, i);
+ return;
+}
+
+void vp9_encode_intra16x16mby(const VP9_ENCODER_RTCD *rtcd, MACROBLOCK *x) {
+ MACROBLOCKD *xd = &x->e_mbd;
+ BLOCK *b = &x->block[0];
+ TX_SIZE tx_size = xd->mode_info_context->mbmi.txfm_size;
+ TX_TYPE tx_type;
+
+#if CONFIG_COMP_INTRA_PRED
+ if (xd->mode_info_context->mbmi.second_mode == (MB_PREDICTION_MODE)(DC_PRED - 1))
+#endif
+ vp9_build_intra_predictors_mby(xd);
+#if CONFIG_COMP_INTRA_PRED
+ else
+ vp9_build_comp_intra_predictors_mby(xd);
+#endif
+
+ vp9_subtract_mby(x->src_diff, *(b->base_src), xd->predictor, b->src_stride);
+
+ if (tx_size == TX_16X16) {
+ BLOCKD *bd = &xd->block[0];
+ tx_type = get_tx_type(xd, bd);
+ if (tx_type != DCT_DCT) {
+ vp9_fht(b->src_diff, 32, b->coeff, tx_type, 16);
+ vp9_quantize_mby_16x16(x);
+ if (x->optimize)
+ vp9_optimize_mby_16x16(x, rtcd);
+ vp9_ihtllm_c(bd->dqcoeff, bd->diff, 32, tx_type, 16);
+ } else {
+ vp9_transform_mby_16x16(x);
+ vp9_quantize_mby_16x16(x);
+ if (x->optimize)
+ vp9_optimize_mby_16x16(x, rtcd);
+ vp9_inverse_transform_mby_16x16(IF_RTCD(&rtcd->common->idct), xd);
+ }
+ } else if (tx_size == TX_8X8) {
+ vp9_transform_mby_8x8(x);
+ vp9_quantize_mby_8x8(x);
+ if (x->optimize)
+ vp9_optimize_mby_8x8(x, rtcd);
+ vp9_inverse_transform_mby_8x8(IF_RTCD(&rtcd->common->idct), xd);
+ } else {
+ vp9_transform_mby_4x4(x);
+ vp9_quantize_mby_4x4(x);
+ if (x->optimize)
+ vp9_optimize_mby_4x4(x, rtcd);
+ vp9_inverse_transform_mby_4x4(IF_RTCD(&rtcd->common->idct), xd);
+ }
+
+ vp9_recon_mby(xd);
+}
+
+void vp9_encode_intra16x16mbuv(const VP9_ENCODER_RTCD *rtcd, MACROBLOCK *x) {
+ MACROBLOCKD *xd = &x->e_mbd;
+ TX_SIZE tx_size = xd->mode_info_context->mbmi.txfm_size;
+
+#if CONFIG_COMP_INTRA_PRED
+ if (xd->mode_info_context->mbmi.second_uv_mode == (MB_PREDICTION_MODE)(DC_PRED - 1)) {
+#endif
+ vp9_build_intra_predictors_mbuv(xd);
+#if CONFIG_COMP_INTRA_PRED
+ } else {
+ vp9_build_comp_intra_predictors_mbuv(xd);
+ }
+#endif
+
+ vp9_subtract_mbuv(x->src_diff, x->src.u_buffer, x->src.v_buffer,
+ xd->predictor, x->src.uv_stride);
+
+ if (tx_size == TX_4X4) {
+ vp9_transform_mbuv_4x4(x);
+ vp9_quantize_mbuv_4x4(x);
+ if (x->optimize)
+ vp9_optimize_mbuv_4x4(x, rtcd);
+ vp9_inverse_transform_mbuv_4x4(IF_RTCD(&rtcd->common->idct), xd);
+ } else /* 16x16 or 8x8 */ {
+ vp9_transform_mbuv_8x8(x);
+ vp9_quantize_mbuv_8x8(x);
+ if (x->optimize)
+ vp9_optimize_mbuv_8x8(x, rtcd);
+ vp9_inverse_transform_mbuv_8x8(IF_RTCD(&rtcd->common->idct), xd);
+ }
+
+ vp9_recon_intra_mbuv(xd);
+}
+
+void vp9_encode_intra8x8(const VP9_ENCODER_RTCD *rtcd,
+ MACROBLOCK *x, int ib) {
+ MACROBLOCKD *xd = &x->e_mbd;
+ BLOCKD *b = &xd->block[ib];
+ BLOCK *be = &x->block[ib];
+ const int iblock[4] = {0, 1, 4, 5};
+ int i;
+ TX_TYPE tx_type;
+
+#if CONFIG_COMP_INTRA_PRED
+ if (b->bmi.as_mode.second == (MB_PREDICTION_MODE)(DC_PRED - 1)) {
+#endif
+ vp9_intra8x8_predict(b, b->bmi.as_mode.first, b->predictor);
+#if CONFIG_COMP_INTRA_PRED
+ } else {
+ vp9_comp_intra8x8_predict(b, b->bmi.as_mode.first, b->bmi.as_mode.second,
+ b->predictor);
+ }
+#endif
+
+ if (xd->mode_info_context->mbmi.txfm_size == TX_8X8) {
+ int idx = (ib & 0x02) ? (ib + 2) : ib;
+
+ // generate residual blocks
+ vp9_subtract_4b_c(be, b, 16);
+
+ tx_type = get_tx_type(xd, xd->block + idx);
+ if (tx_type != DCT_DCT) {
+ vp9_fht(be->src_diff, 32, (x->block + idx)->coeff,
+ tx_type, 8);
+ x->quantize_b_8x8(x->block + idx, xd->block + idx);
+ vp9_ihtllm_c(xd->block[idx].dqcoeff, xd->block[ib].diff, 32,
+ tx_type, 8);
+ } else {
+ x->vp9_short_fdct8x8(be->src_diff, (x->block + idx)->coeff, 32);
+ x->quantize_b_8x8(x->block + idx, xd->block + idx);
+ vp9_idct_idct8(xd->block[idx].dqcoeff, xd->block[ib].diff, 32);
+ }
+ } else {
+ for (i = 0; i < 4; i++) {
+ b = &xd->block[ib + iblock[i]];
+ be = &x->block[ib + iblock[i]];
+ vp9_subtract_b(be, b, 16);
+ x->vp9_short_fdct4x4(be->src_diff, be->coeff, 32);
+ x->quantize_b_4x4(be, b);
+ vp9_inverse_transform_b_4x4(IF_RTCD(&rtcd->common->idct), b, 32);
+ }
+ }
+
+ // reconstruct submacroblock
+ for (i = 0; i < 4; i++) {
+ b = &xd->block[ib + iblock[i]];
+ vp9_recon_b_c(b->predictor, b->diff, *(b->base_dst) + b->dst,
+ b->dst_stride);
+ }
+}
+
+void vp9_encode_intra8x8mby(const VP9_ENCODER_RTCD *rtcd, MACROBLOCK *x) {
+ int i, ib;
+
+ for (i = 0; i < 4; i++) {
+ ib = vp9_i8x8_block[i];
+ vp9_encode_intra8x8(rtcd, x, ib);
+ }
+}
+
+void vp9_encode_intra_uv4x4(const VP9_ENCODER_RTCD *rtcd,
+ MACROBLOCK *x, int ib,
+ int mode, int second) {
+ BLOCKD *b = &x->e_mbd.block[ib];
+ BLOCK *be = &x->block[ib];
+
+#if CONFIG_COMP_INTRA_PRED
+ if (second == -1) {
+#endif
+ vp9_intra_uv4x4_predict(b, mode, b->predictor);
+#if CONFIG_COMP_INTRA_PRED
+ } else {
+ vp9_comp_intra_uv4x4_predict(b, mode, second, b->predictor);
+ }
+#endif
+
+ vp9_subtract_b(be, b, 8);
+
+ x->vp9_short_fdct4x4(be->src_diff, be->coeff, 16);
+ x->quantize_b_4x4(be, b);
+ vp9_inverse_transform_b_4x4(IF_RTCD(&rtcd->common->idct), b, 16);
+
+ vp9_recon_uv_b_c(b->predictor, b->diff, *(b->base_dst) + b->dst,
+ b->dst_stride);
+}
+
+void vp9_encode_intra8x8mbuv(const VP9_ENCODER_RTCD *rtcd, MACROBLOCK *x) {
+ int i, ib, mode, second;
+ BLOCKD *b;
+
+ for (i = 0; i < 4; i++) {
+ ib = vp9_i8x8_block[i];
+ b = &x->e_mbd.block[ib];
+ mode = b->bmi.as_mode.first;
+#if CONFIG_COMP_INTRA_PRED
+ second = b->bmi.as_mode.second;
+#else
+ second = -1;
+#endif
+    /* u */
+    vp9_encode_intra_uv4x4(rtcd, x, i + 16, mode, second);
+    /* v */
+ vp9_encode_intra_uv4x4(rtcd, x, i + 20, mode, second);
+ }
+}
--- /dev/null
+++ b/vp9/encoder/encodeintra.h
@@ -1,0 +1,27 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef __ENCODEINTRA_H_
+#define __ENCODEINTRA_H_
+
+#include "onyx_int.h"
+
+int vp9_encode_intra(VP9_COMP *cpi, MACROBLOCK *x, int use_16x16_pred);
+void vp9_encode_intra16x16mby(const VP9_ENCODER_RTCD *, MACROBLOCK *x);
+void vp9_encode_intra16x16mbuv(const VP9_ENCODER_RTCD *, MACROBLOCK *x);
+void vp9_encode_intra4x4mby(const VP9_ENCODER_RTCD *, MACROBLOCK *mb);
+void vp9_encode_intra4x4block(const VP9_ENCODER_RTCD *rtcd,
+ MACROBLOCK *x, int ib);
+void vp9_encode_intra8x8mby(const VP9_ENCODER_RTCD *rtcd, MACROBLOCK *x);
+void vp9_encode_intra8x8mbuv(const VP9_ENCODER_RTCD *rtcd, MACROBLOCK *x);
+void vp9_encode_intra8x8(const VP9_ENCODER_RTCD *rtcd,
+ MACROBLOCK *x, int ib);
+
+#endif // __ENCODEINTRA_H_
--- /dev/null
+++ b/vp9/encoder/encodemb.c
@@ -1,0 +1,950 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "vpx_ports/config.h"
+#include "encodemb.h"
+#include "vp9/common/reconinter.h"
+#include "quantize.h"
+#include "tokenize.h"
+#include "vp9/common/invtrans.h"
+#include "vp9/common/reconintra.h"
+#include "vpx_mem/vpx_mem.h"
+#include "rdopt.h"
+#include "vp9/common/systemdependent.h"
+#include "vpx_rtcd.h"
+
+#if CONFIG_RUNTIME_CPU_DETECT
+#define IF_RTCD(x) (x)
+#else
+#define IF_RTCD(x) NULL
+#endif
+
+void vp9_subtract_b_c(BLOCK *be, BLOCKD *bd, int pitch) {
+ unsigned char *src_ptr = (*(be->base_src) + be->src);
+ short *diff_ptr = be->src_diff;
+ unsigned char *pred_ptr = bd->predictor;
+ int src_stride = be->src_stride;
+
+ int r, c;
+
+ for (r = 0; r < 4; r++) {
+ for (c = 0; c < 4; c++) {
+ diff_ptr[c] = src_ptr[c] - pred_ptr[c];
+ }
+
+ diff_ptr += pitch;
+ pred_ptr += pitch;
+ src_ptr += src_stride;
+ }
+}
+
+void vp9_subtract_4b_c(BLOCK *be, BLOCKD *bd, int pitch) {
+ unsigned char *src_ptr = (*(be->base_src) + be->src);
+ short *diff_ptr = be->src_diff;
+ unsigned char *pred_ptr = bd->predictor;
+ int src_stride = be->src_stride;
+ int r, c;
+
+ for (r = 0; r < 8; r++) {
+ for (c = 0; c < 8; c++) {
+ diff_ptr[c] = src_ptr[c] - pred_ptr[c];
+ }
+ diff_ptr += pitch;
+ pred_ptr += pitch;
+ src_ptr += src_stride;
+ }
+}
+
+void vp9_subtract_mbuv_s_c(short *diff, const unsigned char *usrc,
+ const unsigned char *vsrc, int src_stride,
+ const unsigned char *upred,
+ const unsigned char *vpred, int dst_stride) {
+ short *udiff = diff + 256;
+ short *vdiff = diff + 320;
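+  // The per-MB diff buffer puts 256 luma entries first, so the U residual
+  // starts at offset 256 and V at offset 320 (see vp9_setup_block_ptrs).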
+ int r, c;
+
+ for (r = 0; r < 8; r++) {
+ for (c = 0; c < 8; c++) {
+ udiff[c] = usrc[c] - upred[c];
+ }
+
+ udiff += 8;
+ upred += dst_stride;
+ usrc += src_stride;
+ }
+
+ for (r = 0; r < 8; r++) {
+ for (c = 0; c < 8; c++) {
+ vdiff[c] = vsrc[c] - vpred[c];
+ }
+
+ vdiff += 8;
+ vpred += dst_stride;
+ vsrc += src_stride;
+ }
+}
+
+void vp9_subtract_mbuv_c(short *diff, unsigned char *usrc,
+ unsigned char *vsrc, unsigned char *pred, int stride) {
+ unsigned char *upred = pred + 256;
+ unsigned char *vpred = pred + 320;
+
+ vp9_subtract_mbuv_s_c(diff, usrc, vsrc, stride, upred, vpred, 8);
+}
+
+void vp9_subtract_mby_s_c(short *diff, const unsigned char *src, int src_stride,
+ const unsigned char *pred, int dst_stride) {
+ int r, c;
+
+ for (r = 0; r < 16; r++) {
+ for (c = 0; c < 16; c++) {
+ diff[c] = src[c] - pred[c];
+ }
+
+ diff += 16;
+ pred += dst_stride;
+ src += src_stride;
+ }
+}
+
+void vp9_subtract_mby_c(short *diff, unsigned char *src,
+ unsigned char *pred, int stride) {
+ vp9_subtract_mby_s_c(diff, src, stride, pred, 16);
+}
+
+static void subtract_mb(const VP9_ENCODER_RTCD *rtcd, MACROBLOCK *x) {
+ BLOCK *b = &x->block[0];
+
+ vp9_subtract_mby(x->src_diff, *(b->base_src), x->e_mbd.predictor,
+ b->src_stride);
+ vp9_subtract_mbuv(x->src_diff, x->src.u_buffer, x->src.v_buffer,
+ x->e_mbd.predictor, x->src.uv_stride);
+}
+
+static void build_dcblock_4x4(MACROBLOCK *x) {
+ short *src_diff_ptr = &x->src_diff[384];
+ int i;
+
+ for (i = 0; i < 16; i++) {
+ src_diff_ptr[i] = x->coeff[i * 16];
+ }
+}
+
+void vp9_transform_mby_4x4(MACROBLOCK *x) {
+ int i;
+
+ for (i = 0; i < 16; i += 2) {
+ x->vp9_short_fdct8x4(&x->block[i].src_diff[0],
+ &x->block[i].coeff[0], 32);
+ }
+
+ if (x->e_mbd.mode_info_context->mbmi.mode != SPLITMV) {
+ // build dc block from 16 y dc values
+ build_dcblock_4x4(x);
+
+ // do 2nd order transform on the dc block
+ x->short_walsh4x4(&x->block[24].src_diff[0],
+ &x->block[24].coeff[0], 8);
+ }
+}
+
+void vp9_transform_mbuv_4x4(MACROBLOCK *x) {
+ int i;
+
+ for (i = 16; i < 24; i += 2) {
+ x->vp9_short_fdct8x4(&x->block[i].src_diff[0],
+ &x->block[i].coeff[0], 16);
+ }
+}
+
+static void transform_mb_4x4(MACROBLOCK *x) {
+ vp9_transform_mby_4x4(x);
+ vp9_transform_mbuv_4x4(x);
+}
+
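+// Gather the Y DC coefficients of the four 8x8 blocks (x->coeff[0], [64],
+// [128] and [192]) into positions 0, 1, 4 and 8 of block 24, ready for the
+// 2x2 second-order Haar transform.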
+static void build_dcblock_8x8(MACROBLOCK *x) {
+ int16_t *src_diff_ptr = x->block[24].src_diff;
+ int i;
+
+ for (i = 0; i < 16; i++) {
+ src_diff_ptr[i] = 0;
+ }
+ src_diff_ptr[0] = x->coeff[0 * 16];
+ src_diff_ptr[1] = x->coeff[4 * 16];
+ src_diff_ptr[4] = x->coeff[8 * 16];
+ src_diff_ptr[8] = x->coeff[12 * 16];
+}
+
+void vp9_transform_mby_8x8(MACROBLOCK *x) {
+ int i;
+
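+  // The four 8x8 luma sub-blocks start at 4x4 block indices 0, 2, 8 and 10;
+  // their coefficients are written to consecutive 64-coefficient chunks
+  // (block indices 0, 4, 8 and 12 in x->coeff).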
+ for (i = 0; i < 9; i += 8) {
+ x->vp9_short_fdct8x8(&x->block[i].src_diff[0],
+ &x->block[i].coeff[0], 32);
+ }
+ for (i = 2; i < 11; i += 8) {
+ x->vp9_short_fdct8x8(&x->block[i].src_diff[0],
+ &x->block[i + 2].coeff[0], 32);
+ }
+
+ if (x->e_mbd.mode_info_context->mbmi.mode != SPLITMV) {
+ // build dc block from 2x2 y dc values
+ build_dcblock_8x8(x);
+
+ // do 2nd order transform on the dc block
+ x->short_fhaar2x2(&x->block[24].src_diff[0],
+ &x->block[24].coeff[0], 8);
+ }
+}
+
+void vp9_transform_mbuv_8x8(MACROBLOCK *x) {
+ int i;
+
+ for (i = 16; i < 24; i += 4) {
+ x->vp9_short_fdct8x8(&x->block[i].src_diff[0],
+ &x->block[i].coeff[0], 16);
+ }
+}
+
+void vp9_transform_mb_8x8(MACROBLOCK *x) {
+ vp9_transform_mby_8x8(x);
+ vp9_transform_mbuv_8x8(x);
+}
+
+void vp9_transform_mby_16x16(MACROBLOCK *x) {
+ vp9_clear_system_state();
+ x->vp9_short_fdct16x16(&x->block[0].src_diff[0],
+ &x->block[0].coeff[0], 32);
+}
+
+void vp9_transform_mb_16x16(MACROBLOCK *x) {
+ vp9_transform_mby_16x16(x);
+ vp9_transform_mbuv_8x8(x);
+}
+
+#define RDTRUNC(RM,DM,R,D) ( (128+(R)*(RM)) & 0xFF )
+#define RDTRUNC_8x8(RM,DM,R,D) ( (128+(R)*(RM)) & 0xFF )
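+// RDTRUNC keeps only the low eight bits of the scaled rate term (the DM and D
+// arguments are unused); UPDATE_RD_COST below uses it as a tie-breaker when
+// the two rounded RD costs come out equal.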
+typedef struct vp9_token_state vp9_token_state;
+
+struct vp9_token_state {
+ int rate;
+ int error;
+ int next;
+ signed char token;
+ short qc;
+};
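+
+// optimize_b() below runs a small trellis over the coefficients of one block:
+// one node per coefficient position (up to 64 for an 8x8 block) plus a
+// sentinel EOB node, with two states per node. State 0 keeps the value the
+// quantizer produced; state 1 tries the same position with its magnitude
+// reduced by one (possibly to zero) when that still reconstructs within one
+// quantizer step of the original coefficient. The cheapest path through the
+// trellis determines the final coefficient values and EOB position.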
+
+// TODO: experiment to find optimal multiplier values
+#define Y1_RD_MULT 4
+#define UV_RD_MULT 2
+#define Y2_RD_MULT 4
+
+static const int plane_rd_mult[4] = {
+ Y1_RD_MULT,
+ Y2_RD_MULT,
+ UV_RD_MULT,
+ Y1_RD_MULT
+};
+
+#define UPDATE_RD_COST()\
+{\
+ rd_cost0 = RDCOST(rdmult, rddiv, rate0, error0);\
+ rd_cost1 = RDCOST(rdmult, rddiv, rate1, error1);\
+ if (rd_cost0 == rd_cost1) {\
+ rd_cost0 = RDTRUNC(rdmult, rddiv, rate0, error0);\
+ rd_cost1 = RDTRUNC(rdmult, rddiv, rate1, error1);\
+ }\
+}
+
+static void optimize_b(MACROBLOCK *mb, int i, PLANE_TYPE type,
+ ENTROPY_CONTEXT *a, ENTROPY_CONTEXT *l,
+ const VP9_ENCODER_RTCD *rtcd, int tx_size) {
+ BLOCK *b;
+ BLOCKD *d;
+ vp9_token_state tokens[65][2];
+ uint64_t best_mask[2];
+ const short *dequant_ptr;
+ const short *coeff_ptr;
+ short *qcoeff_ptr;
+ short *dqcoeff_ptr;
+ int eob;
+ int i0;
+ int rc;
+ int x;
+ int sz = 0;
+ int next;
+ int rdmult;
+ int rddiv;
+ int final_eob;
+ int64_t rd_cost0, rd_cost1;
+ int rate0, rate1;
+ int error0, error1;
+ int t0, t1;
+ int best;
+ int band;
+ int pt;
+ int err_mult = plane_rd_mult[type];
+ int default_eob;
+ int const *scan, *bands;
+
+ b = &mb->block[i];
+ d = &mb->e_mbd.block[i];
+ switch (tx_size) {
+ default:
+ case TX_4X4:
+ scan = vp9_default_zig_zag1d;
+ bands = vp9_coef_bands;
+ default_eob = 16;
+ // TODO: this isn't called (for intra4x4 modes), but will be left in
+ // since it could be used later
+ {
+ TX_TYPE tx_type = get_tx_type(&mb->e_mbd, d);
+ if (tx_type != DCT_DCT) {
+ switch (tx_type) {
+ case ADST_DCT:
+ scan = vp9_row_scan;
+ break;
+
+ case DCT_ADST:
+ scan = vp9_col_scan;
+ break;
+
+ default:
+ scan = vp9_default_zig_zag1d;
+ break;
+ }
+ } else {
+ scan = vp9_default_zig_zag1d;
+ }
+ }
+ break;
+ case TX_8X8:
+ scan = vp9_default_zig_zag1d_8x8;
+ bands = vp9_coef_bands_8x8;
+ default_eob = 64;
+ break;
+ }
+
+ dequant_ptr = d->dequant;
+ coeff_ptr = b->coeff;
+ qcoeff_ptr = d->qcoeff;
+ dqcoeff_ptr = d->dqcoeff;
+ i0 = (type == PLANE_TYPE_Y_NO_DC);
+ eob = d->eob;
+
+ /* Now set up a Viterbi trellis to evaluate alternative roundings. */
+ rdmult = mb->rdmult * err_mult;
+ if (mb->e_mbd.mode_info_context->mbmi.ref_frame == INTRA_FRAME)
+ rdmult = (rdmult * 9) >> 4;
+ rddiv = mb->rddiv;
+ best_mask[0] = best_mask[1] = 0;
+ /* Initialize the sentinel node of the trellis. */
+ tokens[eob][0].rate = 0;
+ tokens[eob][0].error = 0;
+ tokens[eob][0].next = default_eob;
+ tokens[eob][0].token = DCT_EOB_TOKEN;
+ tokens[eob][0].qc = 0;
+ *(tokens[eob] + 1) = *(tokens[eob] + 0);
+ next = eob;
+ for (i = eob; i-- > i0;) {
+ int base_bits;
+ int d2;
+ int dx;
+
+ rc = scan[i];
+ x = qcoeff_ptr[rc];
+ /* Only add a trellis state for non-zero coefficients. */
+ if (x) {
+ int shortcut = 0;
+ error0 = tokens[next][0].error;
+ error1 = tokens[next][1].error;
+ /* Evaluate the first possibility for this state. */
+ rate0 = tokens[next][0].rate;
+ rate1 = tokens[next][1].rate;
+ t0 = (vp9_dct_value_tokens_ptr + x)->Token;
+ /* Consider both possible successor states. */
+ if (next < default_eob) {
+ band = bands[i + 1];
+ pt = vp9_prev_token_class[t0];
+ rate0 +=
+ mb->token_costs[tx_size][type][band][pt][tokens[next][0].token];
+ rate1 +=
+ mb->token_costs[tx_size][type][band][pt][tokens[next][1].token];
+ }
+ UPDATE_RD_COST();
+ /* And pick the best. */
+ best = rd_cost1 < rd_cost0;
+ base_bits = *(vp9_dct_value_cost_ptr + x);
+ dx = dqcoeff_ptr[rc] - coeff_ptr[rc];
+ d2 = dx * dx;
+ tokens[i][0].rate = base_bits + (best ? rate1 : rate0);
+ tokens[i][0].error = d2 + (best ? error1 : error0);
+ tokens[i][0].next = next;
+ tokens[i][0].token = t0;
+ tokens[i][0].qc = x;
+ best_mask[0] |= best << i;
+ /* Evaluate the second possibility for this state. */
+ rate0 = tokens[next][0].rate;
+ rate1 = tokens[next][1].rate;
+
+ if ((abs(x)*dequant_ptr[rc != 0] > abs(coeff_ptr[rc])) &&
+ (abs(x)*dequant_ptr[rc != 0] < abs(coeff_ptr[rc]) + dequant_ptr[rc != 0]))
+ shortcut = 1;
+ else
+ shortcut = 0;
+
+ if (shortcut) {
+ sz = -(x < 0);
+ x -= 2 * sz + 1;
+ }
+
+ /* Consider both possible successor states. */
+ if (!x) {
+ /* If we reduced this coefficient to zero, check to see if
+ * we need to move the EOB back here.
+ */
+ t0 = tokens[next][0].token == DCT_EOB_TOKEN ?
+ DCT_EOB_TOKEN : ZERO_TOKEN;
+ t1 = tokens[next][1].token == DCT_EOB_TOKEN ?
+ DCT_EOB_TOKEN : ZERO_TOKEN;
+ } else {
+ t0 = t1 = (vp9_dct_value_tokens_ptr + x)->Token;
+ }
+ if (next < default_eob) {
+ band = bands[i + 1];
+ if (t0 != DCT_EOB_TOKEN) {
+ pt = vp9_prev_token_class[t0];
+ rate0 += mb->token_costs[tx_size][type][band][pt][
+ tokens[next][0].token];
+ }
+ if (t1 != DCT_EOB_TOKEN) {
+ pt = vp9_prev_token_class[t1];
+ rate1 += mb->token_costs[tx_size][type][band][pt][
+ tokens[next][1].token];
+ }
+ }
+
+ UPDATE_RD_COST();
+ /* And pick the best. */
+ best = rd_cost1 < rd_cost0;
+ base_bits = *(vp9_dct_value_cost_ptr + x);
+
+ if (shortcut) {
+ dx -= (dequant_ptr[rc != 0] + sz) ^ sz;
+ d2 = dx * dx;
+ }
+ tokens[i][1].rate = base_bits + (best ? rate1 : rate0);
+ tokens[i][1].error = d2 + (best ? error1 : error0);
+ tokens[i][1].next = next;
+ tokens[i][1].token = best ? t1 : t0;
+ tokens[i][1].qc = x;
+ best_mask[1] |= best << i;
+ /* Finally, make this the new head of the trellis. */
+ next = i;
+ }
+ /* There's no choice to make for a zero coefficient, so we don't
+ * add a new trellis node, but we do need to update the costs.
+ */
+ else {
+ band = bands[i + 1];
+ t0 = tokens[next][0].token;
+ t1 = tokens[next][1].token;
+ /* Update the cost of each path if we're past the EOB token. */
+ if (t0 != DCT_EOB_TOKEN) {
+ tokens[next][0].rate += mb->token_costs[tx_size][type][band][0][t0];
+ tokens[next][0].token = ZERO_TOKEN;
+ }
+ if (t1 != DCT_EOB_TOKEN) {
+ tokens[next][1].rate += mb->token_costs[tx_size][type][band][0][t1];
+ tokens[next][1].token = ZERO_TOKEN;
+ }
+ /* Don't update next, because we didn't add a new node. */
+ }
+ }
+
+ /* Now pick the best path through the whole trellis. */
+ band = bands[i + 1];
+ VP9_COMBINEENTROPYCONTEXTS(pt, *a, *l);
+ rate0 = tokens[next][0].rate;
+ rate1 = tokens[next][1].rate;
+ error0 = tokens[next][0].error;
+ error1 = tokens[next][1].error;
+ t0 = tokens[next][0].token;
+ t1 = tokens[next][1].token;
+ rate0 += mb->token_costs[tx_size][type][band][pt][t0];
+ rate1 += mb->token_costs[tx_size][type][band][pt][t1];
+ UPDATE_RD_COST();
+ best = rd_cost1 < rd_cost0;
+ final_eob = i0 - 1;
+ for (i = next; i < eob; i = next) {
+ x = tokens[i][best].qc;
+ if (x)
+ final_eob = i;
+ rc = scan[i];
+ qcoeff_ptr[rc] = x;
+ dqcoeff_ptr[rc] = (x * dequant_ptr[rc != 0]);
+
+ next = tokens[i][best].next;
+ best = (best_mask[best] >> i) & 1;
+ }
+ final_eob++;
+
+ d->eob = final_eob;
+ *a = *l = (d->eob != !type);
+}
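+
+/* A sketch of the trellis built by optimize_b() above (derived from the code
+ * itself, not from an external spec): every non-zero coefficient gets two
+ * candidate states. State 0 keeps the quantized value x as-is; state 1
+ * considers the magnitude reduced by one whenever the "shortcut" test shows
+ * the rounding could have gone the other way. Working backwards from the
+ * current eob, the rate (token costs) and distortion (dx * dx) of both
+ * successor paths are accumulated, with UPDATE_RD_COST() presumably folding
+ * them into the usual rdmult/rddiv rate-distortion metric. The final loop
+ * traces the cheaper path forward, rewriting qcoeff/dqcoeff and trimming the
+ * eob where coefficients were zeroed out.
+ */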
+
+/**************************************************************************
+Our inverse Hadamard transform is effectively a weighted sum of all 16
+inputs, with each weight either 1 or -1, and a last stage scaling of
+(sum + 1) >> 2. The DC-only idct is (dc + 16) >> 5. So if all the sums are
+between -65 and 63, the output after the inverse WHT and idct will be all
+zero. A sum of absolute values smaller than 65 guarantees that all 16
+different (+1/-1) weighted sums in the WHT fall between -65 and +65.
+**************************************************************************/
+#define SUM_2ND_COEFF_THRESH 65
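+/* A rough worked check of the threshold above, using the arithmetic from the
+ * comment (values chosen purely for illustration):
+ *   sum = -65: (((-65 + 1) >> 2) + 16) >> 5 = (-16 + 16) >> 5 = 0  (zeroed)
+ *   sum =  66: ((( 66 + 1) >> 2) + 16) >> 5 = ( 16 + 16) >> 5 = 1  (kept)
+ * so a block whose weighted sums all stay within the stated range produces an
+ * all-zero reconstruction and can safely be reset.
+ */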
+
+static void check_reset_2nd_coeffs(MACROBLOCKD *xd,
+ ENTROPY_CONTEXT *a, ENTROPY_CONTEXT *l) {
+ int sum = 0;
+ int i;
+ BLOCKD *bd = &xd->block[24];
+ if (bd->dequant[0] >= SUM_2ND_COEFF_THRESH
+ && bd->dequant[1] >= SUM_2ND_COEFF_THRESH)
+ return;
+
+ for (i = 0; i < bd->eob; i++) {
+ int coef = bd->dqcoeff[vp9_default_zig_zag1d[i]];
+ sum += (coef >= 0) ? coef : -coef;
+ if (sum >= SUM_2ND_COEFF_THRESH)
+ return;
+ }
+
+ if (sum < SUM_2ND_COEFF_THRESH) {
+ for (i = 0; i < bd->eob; i++) {
+ int rc = vp9_default_zig_zag1d[i];
+ bd->qcoeff[rc] = 0;
+ bd->dqcoeff[rc] = 0;
+ }
+ bd->eob = 0;
+ *a = *l = (bd->eob != 0);
+ }
+}
+
+#define SUM_2ND_COEFF_THRESH_8X8 32
+static void check_reset_8x8_2nd_coeffs(MACROBLOCKD *xd,
+ ENTROPY_CONTEXT *a, ENTROPY_CONTEXT *l) {
+ int sum = 0;
+ BLOCKD *bd = &xd->block[24];
+ int coef;
+
+ coef = bd->dqcoeff[0];
+ sum += (coef >= 0) ? coef : -coef;
+ coef = bd->dqcoeff[1];
+ sum += (coef >= 0) ? coef : -coef;
+ coef = bd->dqcoeff[4];
+ sum += (coef >= 0) ? coef : -coef;
+ coef = bd->dqcoeff[8];
+ sum += (coef >= 0) ? coef : -coef;
+
+ if (sum < SUM_2ND_COEFF_THRESH_8X8) {
+ bd->qcoeff[0] = 0;
+ bd->dqcoeff[0] = 0;
+ bd->qcoeff[1] = 0;
+ bd->dqcoeff[1] = 0;
+ bd->qcoeff[4] = 0;
+ bd->dqcoeff[4] = 0;
+ bd->qcoeff[8] = 0;
+ bd->dqcoeff[8] = 0;
+ bd->eob = 0;
+ *a = *l = (bd->eob != 0);
+ }
+}
+
+void vp9_optimize_mby_4x4(MACROBLOCK *x, const VP9_ENCODER_RTCD *rtcd) {
+ int b;
+ PLANE_TYPE type;
+ int has_2nd_order;
+ ENTROPY_CONTEXT_PLANES t_above, t_left;
+ ENTROPY_CONTEXT *ta;
+ ENTROPY_CONTEXT *tl;
+ MB_PREDICTION_MODE mode = x->e_mbd.mode_info_context->mbmi.mode;
+
+ if (!x->e_mbd.above_context || !x->e_mbd.left_context)
+ return;
+
+ vpx_memcpy(&t_above, x->e_mbd.above_context, sizeof(ENTROPY_CONTEXT_PLANES));
+ vpx_memcpy(&t_left, x->e_mbd.left_context, sizeof(ENTROPY_CONTEXT_PLANES));
+
+ ta = (ENTROPY_CONTEXT *)&t_above;
+ tl = (ENTROPY_CONTEXT *)&t_left;
+
+ has_2nd_order = (mode != B_PRED && mode != I8X8_PRED && mode != SPLITMV);
+ type = has_2nd_order ? PLANE_TYPE_Y_NO_DC : PLANE_TYPE_Y_WITH_DC;
+
+ for (b = 0; b < 16; b++) {
+ optimize_b(x, b, type,
+ ta + vp9_block2above[b], tl + vp9_block2left[b], rtcd, TX_4X4);
+ }
+
+ if (has_2nd_order) {
+ b = 24;
+ optimize_b(x, b, PLANE_TYPE_Y2,
+ ta + vp9_block2above[b], tl + vp9_block2left[b], rtcd, TX_4X4);
+ check_reset_2nd_coeffs(&x->e_mbd,
+ ta + vp9_block2above[b], tl + vp9_block2left[b]);
+ }
+}
+
+void vp9_optimize_mbuv_4x4(MACROBLOCK *x, const VP9_ENCODER_RTCD *rtcd) {
+ int b;
+ ENTROPY_CONTEXT_PLANES t_above, t_left;
+ ENTROPY_CONTEXT *ta;
+ ENTROPY_CONTEXT *tl;
+
+ if (!x->e_mbd.above_context || !x->e_mbd.left_context)
+ return;
+
+ vpx_memcpy(&t_above, x->e_mbd.above_context, sizeof(ENTROPY_CONTEXT_PLANES));
+ vpx_memcpy(&t_left, x->e_mbd.left_context, sizeof(ENTROPY_CONTEXT_PLANES));
+
+ ta = (ENTROPY_CONTEXT *)&t_above;
+ tl = (ENTROPY_CONTEXT *)&t_left;
+
+ for (b = 16; b < 24; b++) {
+ optimize_b(x, b, PLANE_TYPE_UV,
+ ta + vp9_block2above[b], tl + vp9_block2left[b], rtcd, TX_4X4);
+ }
+}
+
+static void optimize_mb_4x4(MACROBLOCK *x, const VP9_ENCODER_RTCD *rtcd) {
+ vp9_optimize_mby_4x4(x, rtcd);
+ vp9_optimize_mbuv_4x4(x, rtcd);
+}
+
+void vp9_optimize_mby_8x8(MACROBLOCK *x, const VP9_ENCODER_RTCD *rtcd) {
+ int b;
+ PLANE_TYPE type;
+ ENTROPY_CONTEXT_PLANES t_above, t_left;
+ ENTROPY_CONTEXT *ta;
+ ENTROPY_CONTEXT *tl;
+ int has_2nd_order = x->e_mbd.mode_info_context->mbmi.mode != SPLITMV;
+
+ if (!x->e_mbd.above_context || !x->e_mbd.left_context)
+ return;
+
+ vpx_memcpy(&t_above, x->e_mbd.above_context, sizeof(ENTROPY_CONTEXT_PLANES));
+ vpx_memcpy(&t_left, x->e_mbd.left_context, sizeof(ENTROPY_CONTEXT_PLANES));
+
+ ta = (ENTROPY_CONTEXT *)&t_above;
+ tl = (ENTROPY_CONTEXT *)&t_left;
+ type = has_2nd_order ? PLANE_TYPE_Y_NO_DC : PLANE_TYPE_Y_WITH_DC;
+ for (b = 0; b < 16; b += 4) {
+ optimize_b(x, b, type,
+ ta + vp9_block2above_8x8[b], tl + vp9_block2left_8x8[b],
+ rtcd, TX_8X8);
+ ta[vp9_block2above_8x8[b] + 1] = ta[vp9_block2above_8x8[b]];
+ tl[vp9_block2left_8x8[b] + 1] = tl[vp9_block2left_8x8[b]];
+ }
+
+  // The 8x8 transform always has a second order Haar block unless the
+  // mode is SPLITMV.
+ if (has_2nd_order) {
+ check_reset_8x8_2nd_coeffs(&x->e_mbd,
+ ta + vp9_block2above_8x8[24],
+ tl + vp9_block2left_8x8[24]);
+ }
+}
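+
+/* Note on the ta[...+1] / tl[...+1] copies above: an 8x8 transform block
+ * spans two of the 4x4 entropy-context positions in each direction, so the
+ * context computed for the block is duplicated into the neighbouring slot.
+ * This appears to keep the above/left context arrays consistent for whichever
+ * block is coded next; the same pattern is repeated for the chroma planes
+ * below.
+ */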
+
+void vp9_optimize_mbuv_8x8(MACROBLOCK *x, const VP9_ENCODER_RTCD *rtcd) {
+ int b;
+ ENTROPY_CONTEXT_PLANES t_above, t_left;
+ ENTROPY_CONTEXT *ta;
+ ENTROPY_CONTEXT *tl;
+
+ if (!x->e_mbd.above_context || !x->e_mbd.left_context)
+ return;
+
+ vpx_memcpy(&t_above, x->e_mbd.above_context, sizeof(ENTROPY_CONTEXT_PLANES));
+ vpx_memcpy(&t_left, x->e_mbd.left_context, sizeof(ENTROPY_CONTEXT_PLANES));
+
+ ta = (ENTROPY_CONTEXT *)&t_above;
+ tl = (ENTROPY_CONTEXT *)&t_left;
+
+ for (b = 16; b < 24; b += 4) {
+ optimize_b(x, b, PLANE_TYPE_UV,
+ ta + vp9_block2above_8x8[b], tl + vp9_block2left_8x8[b],
+ rtcd, TX_8X8);
+ ta[vp9_block2above_8x8[b] + 1] = ta[vp9_block2above_8x8[b]];
+ tl[vp9_block2left_8x8[b] + 1] = tl[vp9_block2left_8x8[b]];
+ }
+}
+
+static void optimize_mb_8x8(MACROBLOCK *x, const VP9_ENCODER_RTCD *rtcd) {
+ vp9_optimize_mby_8x8(x, rtcd);
+ vp9_optimize_mbuv_8x8(x, rtcd);
+}
+
+static void optimize_b_16x16(MACROBLOCK *mb, int i, PLANE_TYPE type,
+ ENTROPY_CONTEXT *a, ENTROPY_CONTEXT *l,
+ const VP9_ENCODER_RTCD *rtcd) {
+ BLOCK *b = &mb->block[i];
+ BLOCKD *d = &mb->e_mbd.block[i];
+ vp9_token_state tokens[257][2];
+ unsigned best_index[257][2];
+ const short *dequant_ptr = d->dequant, *coeff_ptr = b->coeff;
+  short *qcoeff_ptr = d->qcoeff;
+  short *dqcoeff_ptr = d->dqcoeff;
+ int eob = d->eob, final_eob, sz = 0;
+ int rc, x, next;
+ int64_t rdmult, rddiv, rd_cost0, rd_cost1;
+ int rate0, rate1, error0, error1, t0, t1;
+ int best, band, pt;
+ int err_mult = plane_rd_mult[type];
+
+ /* Now set up a Viterbi trellis to evaluate alternative roundings. */
+ rdmult = mb->rdmult * err_mult;
+ if (mb->e_mbd.mode_info_context->mbmi.ref_frame == INTRA_FRAME)
+    rdmult = (rdmult * 9) >> 4;
+ rddiv = mb->rddiv;
+ memset(best_index, 0, sizeof(best_index));
+ /* Initialize the sentinel node of the trellis. */
+ tokens[eob][0].rate = 0;
+ tokens[eob][0].error = 0;
+ tokens[eob][0].next = 256;
+ tokens[eob][0].token = DCT_EOB_TOKEN;
+ tokens[eob][0].qc = 0;
+ *(tokens[eob] + 1) = *(tokens[eob] + 0);
+ next = eob;
+ for (i = eob; i-- > 0;) {
+ int base_bits, d2, dx;
+
+ rc = vp9_default_zig_zag1d_16x16[i];
+ x = qcoeff_ptr[rc];
+ /* Only add a trellis state for non-zero coefficients. */
+ if (x) {
+ int shortcut = 0;
+ error0 = tokens[next][0].error;
+ error1 = tokens[next][1].error;
+ /* Evaluate the first possibility for this state. */
+ rate0 = tokens[next][0].rate;
+ rate1 = tokens[next][1].rate;
+ t0 = (vp9_dct_value_tokens_ptr + x)->Token;
+ /* Consider both possible successor states. */
+ if (next < 256) {
+ band = vp9_coef_bands_16x16[i + 1];
+ pt = vp9_prev_token_class[t0];
+ rate0 += mb->token_costs[TX_16X16][type][band][pt][tokens[next][0].token];
+ rate1 += mb->token_costs[TX_16X16][type][band][pt][tokens[next][1].token];
+ }
+ UPDATE_RD_COST();
+ /* And pick the best. */
+ best = rd_cost1 < rd_cost0;
+ base_bits = *(vp9_dct_value_cost_ptr + x);
+ dx = dqcoeff_ptr[rc] - coeff_ptr[rc];
+      d2 = dx * dx;
+ tokens[i][0].rate = base_bits + (best ? rate1 : rate0);
+ tokens[i][0].error = d2 + (best ? error1 : error0);
+ tokens[i][0].next = next;
+ tokens[i][0].token = t0;
+ tokens[i][0].qc = x;
+ best_index[i][0] = best;
+ /* Evaluate the second possibility for this state. */
+ rate0 = tokens[next][0].rate;
+ rate1 = tokens[next][1].rate;
+
+      if ((abs(x)*dequant_ptr[rc != 0] > abs(coeff_ptr[rc])) &&
+          (abs(x)*dequant_ptr[rc != 0] < abs(coeff_ptr[rc]) + dequant_ptr[rc != 0]))
+ shortcut = 1;
+ else
+ shortcut = 0;
+
+ if (shortcut) {
+ sz = -(x < 0);
+        x -= 2 * sz + 1;
+ }
+
+ /* Consider both possible successor states. */
+ if (!x) {
+ /* If we reduced this coefficient to zero, check to see if
+ * we need to move the EOB back here.
+ */
+ t0 = tokens[next][0].token == DCT_EOB_TOKEN ?
+ DCT_EOB_TOKEN : ZERO_TOKEN;
+ t1 = tokens[next][1].token == DCT_EOB_TOKEN ?
+ DCT_EOB_TOKEN : ZERO_TOKEN;
+      } else {
+        t0 = t1 = (vp9_dct_value_tokens_ptr + x)->Token;
+      }
+ if (next < 256) {
+ band = vp9_coef_bands_16x16[i + 1];
+ if (t0 != DCT_EOB_TOKEN) {
+ pt = vp9_prev_token_class[t0];
+ rate0 += mb->token_costs[TX_16X16][type][band][pt]
+ [tokens[next][0].token];
+ }
+        if (t1 != DCT_EOB_TOKEN) {
+ pt = vp9_prev_token_class[t1];
+ rate1 += mb->token_costs[TX_16X16][type][band][pt]
+ [tokens[next][1].token];
+ }
+ }
+ UPDATE_RD_COST();
+ /* And pick the best. */
+ best = rd_cost1 < rd_cost0;
+ base_bits = *(vp9_dct_value_cost_ptr + x);
+
+      if (shortcut) {
+        dx -= (dequant_ptr[rc != 0] + sz) ^ sz;
+        d2 = dx * dx;
+ }
+ tokens[i][1].rate = base_bits + (best ? rate1 : rate0);
+ tokens[i][1].error = d2 + (best ? error1 : error0);
+ tokens[i][1].next = next;
+ tokens[i][1].token = best ? t1 : t0;
+ tokens[i][1].qc = x;
+ best_index[i][1] = best;
+ /* Finally, make this the new head of the trellis. */
+ next = i;
+ }
+ /* There's no choice to make for a zero coefficient, so we don't
+ * add a new trellis node, but we do need to update the costs.
+ */
+ else {
+ band = vp9_coef_bands_16x16[i + 1];
+ t0 = tokens[next][0].token;
+ t1 = tokens[next][1].token;
+ /* Update the cost of each path if we're past the EOB token. */
+ if (t0 != DCT_EOB_TOKEN) {
+ tokens[next][0].rate += mb->token_costs[TX_16X16][type][band][0][t0];
+ tokens[next][0].token = ZERO_TOKEN;
+ }
+ if (t1 != DCT_EOB_TOKEN) {
+ tokens[next][1].rate += mb->token_costs[TX_16X16][type][band][0][t1];
+ tokens[next][1].token = ZERO_TOKEN;
+ }
+ /* Don't update next, because we didn't add a new node. */
+ }
+ }
+
+ /* Now pick the best path through the whole trellis. */
+ band = vp9_coef_bands_16x16[i + 1];
+ VP9_COMBINEENTROPYCONTEXTS(pt, *a, *l);
+ rate0 = tokens[next][0].rate;
+ rate1 = tokens[next][1].rate;
+ error0 = tokens[next][0].error;
+ error1 = tokens[next][1].error;
+ t0 = tokens[next][0].token;
+ t1 = tokens[next][1].token;
+ rate0 += mb->token_costs[TX_16X16][type][band][pt][t0];
+ rate1 += mb->token_costs[TX_16X16][type][band][pt][t1];
+ UPDATE_RD_COST();
+ best = rd_cost1 < rd_cost0;
+ final_eob = -1;
+
+ for (i = next; i < eob; i = next) {
+ x = tokens[i][best].qc;
+ if (x)
+ final_eob = i;
+ rc = vp9_default_zig_zag1d_16x16[i];
+ qcoeff_ptr[rc] = x;
+    dqcoeff_ptr[rc] = (x * dequant_ptr[rc != 0]);
+
+ next = tokens[i][best].next;
+ best = best_index[i][best];
+ }
+ final_eob++;
+
+ d->eob = final_eob;
+ *a = *l = (d->eob != !type);
+}
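+
+/* optimize_b_16x16() mirrors optimize_b() above, but with a fixed 16x16
+ * zig-zag scan, 256 coefficients and a sentinel trellis node at index 256.
+ * One visible difference is the bookkeeping of the winning branch: the
+ * 4x4/8x8 path packs it into the best_mask bit fields, while this path
+ * records it per coefficient in best_index[257][2], presumably because 256
+ * positions no longer fit in the bit mask used by the smaller transforms.
+ */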
+
+void vp9_optimize_mby_16x16(MACROBLOCK *x, const VP9_ENCODER_RTCD *rtcd) {
+ ENTROPY_CONTEXT_PLANES t_above, t_left;
+ ENTROPY_CONTEXT *ta, *tl;
+
+ if (!x->e_mbd.above_context || !x->e_mbd.left_context)
+ return;
+
+ vpx_memcpy(&t_above, x->e_mbd.above_context, sizeof(ENTROPY_CONTEXT_PLANES));
+ vpx_memcpy(&t_left, x->e_mbd.left_context, sizeof(ENTROPY_CONTEXT_PLANES));
+
+ ta = (ENTROPY_CONTEXT *)&t_above;
+ tl = (ENTROPY_CONTEXT *)&t_left;
+ optimize_b_16x16(x, 0, PLANE_TYPE_Y_WITH_DC, ta, tl, rtcd);
+}
+
+static void optimize_mb_16x16(MACROBLOCK *x, const VP9_ENCODER_RTCD *rtcd) {
+ vp9_optimize_mby_16x16(x, rtcd);
+ vp9_optimize_mbuv_8x8(x, rtcd);
+}
+
+void vp9_encode_inter16x16(const VP9_ENCODER_RTCD *rtcd, MACROBLOCK *x) {
+ MACROBLOCKD *xd = &x->e_mbd;
+ TX_SIZE tx_size = xd->mode_info_context->mbmi.txfm_size;
+
+ vp9_build_inter_predictors_mb(xd);
+ subtract_mb(rtcd, x);
+
+ if (tx_size == TX_16X16) {
+ vp9_transform_mb_16x16(x);
+ vp9_quantize_mb_16x16(x);
+ if (x->optimize)
+ optimize_mb_16x16(x, rtcd);
+ vp9_inverse_transform_mb_16x16(IF_RTCD(&rtcd->common->idct), xd);
+ } else if (tx_size == TX_8X8) {
+ if (xd->mode_info_context->mbmi.mode == SPLITMV) {
+ assert(xd->mode_info_context->mbmi.partitioning != PARTITIONING_4X4);
+ vp9_transform_mby_8x8(x);
+ vp9_transform_mbuv_4x4(x);
+ vp9_quantize_mby_8x8(x);
+ vp9_quantize_mbuv_4x4(x);
+ if (x->optimize) {
+ vp9_optimize_mby_8x8(x, rtcd);
+ vp9_optimize_mbuv_4x4(x, rtcd);
+ }
+ vp9_inverse_transform_mby_8x8(IF_RTCD(&rtcd->common->idct), xd);
+ vp9_inverse_transform_mbuv_4x4(IF_RTCD(&rtcd->common->idct), xd);
+ } else {
+ vp9_transform_mb_8x8(x);
+ vp9_quantize_mb_8x8(x);
+ if (x->optimize)
+ optimize_mb_8x8(x, rtcd);
+ vp9_inverse_transform_mb_8x8(IF_RTCD(&rtcd->common->idct), xd);
+ }
+ } else {
+ transform_mb_4x4(x);
+ vp9_quantize_mb_4x4(x);
+ if (x->optimize)
+ optimize_mb_4x4(x, rtcd);
+ vp9_inverse_transform_mb_4x4(IF_RTCD(&rtcd->common->idct), xd);
+ }
+
+ vp9_recon_mb(xd);
+}
+
+/* this function is used by first pass only */
+void vp9_encode_inter16x16y(const VP9_ENCODER_RTCD *rtcd, MACROBLOCK *x) {
+ MACROBLOCKD *xd = &x->e_mbd;
+ BLOCK *b = &x->block[0];
+
+#if CONFIG_PRED_FILTER
+ // Disable the prediction filter for firstpass
+ xd->mode_info_context->mbmi.pred_filter_enabled = 0;
+#endif
+
+ vp9_build_1st_inter16x16_predictors_mby(xd, xd->predictor, 16, 0);
+
+ vp9_subtract_mby(x->src_diff, *(b->base_src), xd->predictor, b->src_stride);
+
+ vp9_transform_mby_4x4(x);
+ vp9_quantize_mby_4x4(x);
+ vp9_inverse_transform_mby_4x4(IF_RTCD(&rtcd->common->idct), xd);
+
+ vp9_recon_mby(xd);
+}
--- /dev/null
+++ b/vp9/encoder/encodemb.h
@@ -1,0 +1,70 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef __INC_ENCODEMB_H
+#define __INC_ENCODEMB_H
+
+#include "vpx_ports/config.h"
+#include "block.h"
+
+typedef struct {
+ MB_PREDICTION_MODE mode;
+ MV_REFERENCE_FRAME ref_frame;
+ MV_REFERENCE_FRAME second_ref_frame;
+#if CONFIG_PRED_FILTER
+ int pred_filter_flag;
+#endif
+} MODE_DEFINITION;
+
+
+#if CONFIG_RUNTIME_CPU_DETECT
+#define ENCODEMB_INVOKE(ctx,fn) (ctx)->fn
+#else
+#define ENCODEMB_INVOKE(ctx,fn) vp9_encodemb_##fn
+#endif
+
+
+
+#include "onyx_int.h"
+struct VP9_ENCODER_RTCD;
+void vp9_encode_inter16x16(const struct VP9_ENCODER_RTCD *rtcd, MACROBLOCK *x);
+
+void vp9_transform_mbuv_4x4(MACROBLOCK *x);
+void vp9_transform_mby_4x4(MACROBLOCK *x);
+
+void vp9_optimize_mby_4x4(MACROBLOCK *x, const struct VP9_ENCODER_RTCD *rtcd);
+void vp9_optimize_mbuv_4x4(MACROBLOCK *x, const struct VP9_ENCODER_RTCD *rtcd);
+void vp9_encode_inter16x16y(const struct VP9_ENCODER_RTCD *rtcd, MACROBLOCK *x);
+
+void vp9_transform_mb_8x8(MACROBLOCK *mb);
+void vp9_transform_mby_8x8(MACROBLOCK *x);
+void vp9_transform_mbuv_8x8(MACROBLOCK *x);
+void vp9_build_dcblock_8x8(MACROBLOCK *b);
+void vp9_optimize_mby_8x8(MACROBLOCK *x, const struct VP9_ENCODER_RTCD *rtcd);
+void vp9_optimize_mbuv_8x8(MACROBLOCK *x, const struct VP9_ENCODER_RTCD *rtcd);
+
+void vp9_transform_mb_16x16(MACROBLOCK *mb);
+void vp9_transform_mby_16x16(MACROBLOCK *x);
+void vp9_optimize_mby_16x16(MACROBLOCK *x, const struct VP9_ENCODER_RTCD *rtcd);
+
+void vp9_subtract_4b_c(BLOCK *be, BLOCKD *bd, int pitch);
+
+#if CONFIG_SUPERBLOCKS
+void vp9_subtract_mbuv_s_c(short *diff, const unsigned char *usrc,
+ const unsigned char *vsrc, int src_stride,
+ const unsigned char *upred,
+ const unsigned char *vpred, int dst_stride);
+void vp9_subtract_mby_s_c(short *diff, const unsigned char *src,
+ int src_stride, const unsigned char *pred,
+ int dst_stride);
+#endif
+
+#endif
--- /dev/null
+++ b/vp9/encoder/encodemv.c
@@ -1,0 +1,547 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "vp9/common/common.h"
+#include "encodemv.h"
+#include "vp9/common/entropymode.h"
+#include "vp9/common/systemdependent.h"
+
+#include <math.h>
+
+#ifdef ENTROPY_STATS
+extern unsigned int active_section;
+#endif
+
+#ifdef NMV_STATS
+nmv_context_counts tnmvcounts;
+#endif
+
+static void encode_nmv_component(vp9_writer* const bc,
+ int v,
+ int r,
+ const nmv_component* const mvcomp) {
+ int s, z, c, o, d;
+ assert (v != 0); /* should not be zero */
+ s = v < 0;
+ vp9_write(bc, s, mvcomp->sign);
+ z = (s ? -v : v) - 1; /* magnitude - 1 */
+
+ c = vp9_get_mv_class(z, &o);
+
+ write_token(bc, vp9_mv_class_tree, mvcomp->classes,
+ vp9_mv_class_encodings + c);
+
+ d = (o >> 3); /* int mv data */
+
+ if (c == MV_CLASS_0) {
+ write_token(bc, vp9_mv_class0_tree, mvcomp->class0,
+ vp9_mv_class0_encodings + d);
+ } else {
+ int i, b;
+ b = c + CLASS0_BITS - 1; /* number of bits */
+ for (i = 0; i < b; ++i)
+ vp9_write(bc, ((d >> i) & 1), mvcomp->bits[i]);
+ }
+}
+
+static void encode_nmv_component_fp(vp9_writer *bc,
+ int v,
+ int r,
+ const nmv_component* const mvcomp,
+ int usehp) {
+ int s, z, c, o, d, f, e;
+ assert (v != 0); /* should not be zero */
+ s = v < 0;
+ z = (s ? -v : v) - 1; /* magnitude - 1 */
+
+ c = vp9_get_mv_class(z, &o);
+
+ d = (o >> 3); /* int mv data */
+ f = (o >> 1) & 3; /* fractional pel mv data */
+ e = (o & 1); /* high precision mv data */
+
+ /* Code the fractional pel bits */
+ if (c == MV_CLASS_0) {
+ write_token(bc, vp9_mv_fp_tree, mvcomp->class0_fp[d],
+ vp9_mv_fp_encodings + f);
+ } else {
+ write_token(bc, vp9_mv_fp_tree, mvcomp->fp,
+ vp9_mv_fp_encodings + f);
+ }
+ /* Code the high precision bit */
+ if (usehp) {
+ if (c == MV_CLASS_0) {
+ vp9_write(bc, e, mvcomp->class0_hp);
+ } else {
+ vp9_write(bc, e, mvcomp->hp);
+ }
+ }
+}
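+
+/* For illustration, the offset o returned by vp9_get_mv_class() is split by
+ * the two functions above as d = o >> 3 (integer-pel part), f = (o >> 1) & 3
+ * (fractional-pel part) and e = o & 1 (high-precision bit). As a worked
+ * example with an arbitrarily chosen value: o = 13 = 0b1101 gives d = 1,
+ * f = 2 and e = 1.
+ */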
+
+static void build_nmv_component_cost_table(int *mvcost,
+ const nmv_component* const mvcomp,
+ int usehp) {
+ int i, v;
+ int sign_cost[2], class_cost[MV_CLASSES], class0_cost[CLASS0_SIZE];
+ int bits_cost[MV_OFFSET_BITS][2];
+ int class0_fp_cost[CLASS0_SIZE][4], fp_cost[4];
+ int class0_hp_cost[2], hp_cost[2];
+
+ sign_cost[0] = vp9_cost_zero(mvcomp->sign);
+ sign_cost[1] = vp9_cost_one(mvcomp->sign);
+ vp9_cost_tokens(class_cost, mvcomp->classes, vp9_mv_class_tree);
+ vp9_cost_tokens(class0_cost, mvcomp->class0, vp9_mv_class0_tree);
+ for (i = 0; i < MV_OFFSET_BITS; ++i) {
+ bits_cost[i][0] = vp9_cost_zero(mvcomp->bits[i]);
+ bits_cost[i][1] = vp9_cost_one(mvcomp->bits[i]);
+ }
+
+ for (i = 0; i < CLASS0_SIZE; ++i)
+ vp9_cost_tokens(class0_fp_cost[i], mvcomp->class0_fp[i], vp9_mv_fp_tree);
+ vp9_cost_tokens(fp_cost, mvcomp->fp, vp9_mv_fp_tree);
+
+ if (usehp) {
+ class0_hp_cost[0] = vp9_cost_zero(mvcomp->class0_hp);
+ class0_hp_cost[1] = vp9_cost_one(mvcomp->class0_hp);
+ hp_cost[0] = vp9_cost_zero(mvcomp->hp);
+ hp_cost[1] = vp9_cost_one(mvcomp->hp);
+ }
+ mvcost[0] = 0;
+ for (v = 1; v <= MV_MAX; ++v) {
+ int z, c, o, d, e, f, cost = 0;
+ z = v - 1;
+ c = vp9_get_mv_class(z, &o);
+ cost += class_cost[c];
+ d = (o >> 3); /* int mv data */
+ f = (o >> 1) & 3; /* fractional pel mv data */
+ e = (o & 1); /* high precision mv data */
+ if (c == MV_CLASS_0) {
+ cost += class0_cost[d];
+ } else {
+ int i, b;
+ b = c + CLASS0_BITS - 1; /* number of bits */
+ for (i = 0; i < b; ++i)
+ cost += bits_cost[i][((d >> i) & 1)];
+ }
+ if (c == MV_CLASS_0) {
+ cost += class0_fp_cost[d][f];
+ } else {
+ cost += fp_cost[f];
+ }
+ if (usehp) {
+ if (c == MV_CLASS_0) {
+ cost += class0_hp_cost[e];
+ } else {
+ cost += hp_cost[e];
+ }
+ }
+ mvcost[v] = cost + sign_cost[0];
+ mvcost[-v] = cost + sign_cost[1];
+ }
+}
+
+static int update_nmv_savings(const unsigned int ct[2],
+ const vp9_prob cur_p,
+ const vp9_prob new_p,
+ const vp9_prob upd_p) {
+
+#ifdef LOW_PRECISION_MV_UPDATE
+ vp9_prob mod_p = new_p | 1;
+#else
+ vp9_prob mod_p = new_p;
+#endif
+ const int cur_b = cost_branch256(ct, cur_p);
+ const int mod_b = cost_branch256(ct, mod_p);
+ const int cost = 7 * 256 +
+#ifndef LOW_PRECISION_MV_UPDATE
+ 256 +
+#endif
+ (vp9_cost_one(upd_p) - vp9_cost_zero(upd_p));
+ if (cur_b - mod_b - cost > 0) {
+ return cur_b - mod_b - cost;
+ } else {
+ return -vp9_cost_zero(upd_p);
+ }
+}
+
+static int update_nmv(
+ vp9_writer *const bc,
+ const unsigned int ct[2],
+ vp9_prob *const cur_p,
+ const vp9_prob new_p,
+ const vp9_prob upd_p) {
+
+#ifdef LOW_PRECISION_MV_UPDATE
+ vp9_prob mod_p = new_p | 1;
+#else
+ vp9_prob mod_p = new_p;
+#endif
+
+ const int cur_b = cost_branch256(ct, *cur_p);
+ const int mod_b = cost_branch256(ct, mod_p);
+ const int cost = 7 * 256 +
+#ifndef LOW_PRECISION_MV_UPDATE
+ 256 +
+#endif
+ (vp9_cost_one(upd_p) - vp9_cost_zero(upd_p));
+
+ if (cur_b - mod_b > cost) {
+ *cur_p = mod_p;
+ vp9_write(bc, 1, upd_p);
+#ifdef LOW_PRECISION_MV_UPDATE
+ vp9_write_literal(bc, mod_p >> 1, 7);
+#else
+ vp9_write_literal(bc, mod_p, 8);
+#endif
+ return 1;
+ } else {
+ vp9_write(bc, 0, upd_p);
+ return 0;
+ }
+}
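+
+/* Both update helpers above weigh the rate saved by switching a branch
+ * probability against the cost of signalling the update. The costs appear to
+ * be in the 1/256-bit fixed point used by vp9_cost_*, so the 7 * 256 term
+ * corresponds to the 7-bit literal written by vp9_write_literal() (plus an
+ * extra 256 when LOW_PRECISION_MV_UPDATE is not defined), and the
+ * vp9_cost_one/vp9_cost_zero difference accounts for signalling the update
+ * flag itself. Only updates whose measured savings exceed this overhead are
+ * written to the bitstream.
+ */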
+
+#ifdef NMV_STATS
+void init_nmvstats() {
+ vp9_zero(tnmvcounts);
+}
+
+void print_nmvstats() {
+ nmv_context prob;
+ unsigned int branch_ct_joint[MV_JOINTS - 1][2];
+ unsigned int branch_ct_sign[2][2];
+ unsigned int branch_ct_classes[2][MV_CLASSES - 1][2];
+ unsigned int branch_ct_class0[2][CLASS0_SIZE - 1][2];
+ unsigned int branch_ct_bits[2][MV_OFFSET_BITS][2];
+ unsigned int branch_ct_class0_fp[2][CLASS0_SIZE][4 - 1][2];
+ unsigned int branch_ct_fp[2][4 - 1][2];
+ unsigned int branch_ct_class0_hp[2][2];
+ unsigned int branch_ct_hp[2][2];
+ int i, j, k;
+ vp9_counts_to_nmv_context(&tnmvcounts, &prob, 1,
+ branch_ct_joint, branch_ct_sign, branch_ct_classes,
+ branch_ct_class0, branch_ct_bits,
+ branch_ct_class0_fp, branch_ct_fp,
+ branch_ct_class0_hp, branch_ct_hp);
+
+ printf("\nCounts =\n { ");
+ for (j = 0; j < MV_JOINTS; ++j)
+ printf("%d, ", tnmvcounts.joints[j]);
+ printf("},\n");
+  for (i = 0; i < 2; ++i) {
+ printf(" {\n");
+ printf(" %d/%d,\n", tnmvcounts.comps[i].sign[0],
+ tnmvcounts.comps[i].sign[1]);
+ printf(" { ");
+ for (j = 0; j < MV_CLASSES; ++j)
+ printf("%d, ", tnmvcounts.comps[i].classes[j]);
+ printf("},\n");
+ printf(" { ");
+ for (j = 0; j < CLASS0_SIZE; ++j)
+ printf("%d, ", tnmvcounts.comps[i].class0[j]);
+ printf("},\n");
+ printf(" { ");
+ for (j = 0; j < MV_OFFSET_BITS; ++j)
+ printf("%d/%d, ", tnmvcounts.comps[i].bits[j][0],
+ tnmvcounts.comps[i].bits[j][1]);
+ printf("},\n");
+
+ printf(" {");
+ for (j = 0; j < CLASS0_SIZE; ++j) {
+ printf("{");
+ for (k = 0; k < 4; ++k)
+ printf("%d, ", tnmvcounts.comps[i].class0_fp[j][k]);
+ printf("}, ");
+ }
+ printf("},\n");
+
+ printf(" { ");
+ for (j = 0; j < 4; ++j)
+ printf("%d, ", tnmvcounts.comps[i].fp[j]);
+ printf("},\n");
+
+ printf(" %d/%d,\n",
+ tnmvcounts.comps[i].class0_hp[0],
+ tnmvcounts.comps[i].class0_hp[1]);
+ printf(" %d/%d,\n",
+ tnmvcounts.comps[i].hp[0],
+ tnmvcounts.comps[i].hp[1]);
+ printf(" },\n");
+ }
+
+ printf("\nProbs =\n { ");
+ for (j = 0; j < MV_JOINTS - 1; ++j)
+ printf("%d, ", prob.joints[j]);
+ printf("},\n");
+  for (i = 0; i < 2; ++i) {
+ printf(" {\n");
+ printf(" %d,\n", prob.comps[i].sign);
+ printf(" { ");
+ for (j = 0; j < MV_CLASSES - 1; ++j)
+ printf("%d, ", prob.comps[i].classes[j]);
+ printf("},\n");
+ printf(" { ");
+ for (j = 0; j < CLASS0_SIZE - 1; ++j)
+ printf("%d, ", prob.comps[i].class0[j]);
+ printf("},\n");
+ printf(" { ");
+ for (j = 0; j < MV_OFFSET_BITS; ++j)
+ printf("%d, ", prob.comps[i].bits[j]);
+ printf("},\n");
+ printf(" { ");
+ for (j = 0; j < CLASS0_SIZE; ++j) {
+ printf("{");
+ for (k = 0; k < 3; ++k)
+ printf("%d, ", prob.comps[i].class0_fp[j][k]);
+ printf("}, ");
+ }
+ printf("},\n");
+ printf(" { ");
+ for (j = 0; j < 3; ++j)
+ printf("%d, ", prob.comps[i].fp[j]);
+ printf("},\n");
+
+ printf(" %d,\n", prob.comps[i].class0_hp);
+ printf(" %d,\n", prob.comps[i].hp);
+ printf(" },\n");
+ }
+}
+
+static void add_nmvcount(nmv_context_counts* const dst,
+ const nmv_context_counts* const src) {
+ int i, j, k;
+ for (j = 0; j < MV_JOINTS; ++j) {
+ dst->joints[j] += src->joints[j];
+ }
+ for (i = 0; i < 2; ++i) {
+ for (j = 0; j < MV_VALS; ++j) {
+ dst->comps[i].mvcount[j] += src->comps[i].mvcount[j];
+ }
+ dst->comps[i].sign[0] += src->comps[i].sign[0];
+ dst->comps[i].sign[1] += src->comps[i].sign[1];
+ for (j = 0; j < MV_CLASSES; ++j) {
+ dst->comps[i].classes[j] += src->comps[i].classes[j];
+ }
+ for (j = 0; j < CLASS0_SIZE; ++j) {
+ dst->comps[i].class0[j] += src->comps[i].class0[j];
+ }
+ for (j = 0; j < MV_OFFSET_BITS; ++j) {
+ dst->comps[i].bits[j][0] += src->comps[i].bits[j][0];
+ dst->comps[i].bits[j][1] += src->comps[i].bits[j][1];
+ }
+ }
+ for (i = 0; i < 2; ++i) {
+ for (j = 0; j < CLASS0_SIZE; ++j) {
+ for (k = 0; k < 4; ++k) {
+ dst->comps[i].class0_fp[j][k] += src->comps[i].class0_fp[j][k];
+ }
+ }
+ for (j = 0; j < 4; ++j) {
+ dst->comps[i].fp[j] += src->comps[i].fp[j];
+ }
+ dst->comps[i].class0_hp[0] += src->comps[i].class0_hp[0];
+ dst->comps[i].class0_hp[1] += src->comps[i].class0_hp[1];
+ dst->comps[i].hp[0] += src->comps[i].hp[0];
+ dst->comps[i].hp[1] += src->comps[i].hp[1];
+ }
+}
+#endif
+
+void vp9_write_nmvprobs(VP9_COMP* const cpi, int usehp, vp9_writer* const bc) {
+ int i, j;
+ nmv_context prob;
+ unsigned int branch_ct_joint[MV_JOINTS - 1][2];
+ unsigned int branch_ct_sign[2][2];
+ unsigned int branch_ct_classes[2][MV_CLASSES - 1][2];
+ unsigned int branch_ct_class0[2][CLASS0_SIZE - 1][2];
+ unsigned int branch_ct_bits[2][MV_OFFSET_BITS][2];
+ unsigned int branch_ct_class0_fp[2][CLASS0_SIZE][4 - 1][2];
+ unsigned int branch_ct_fp[2][4 - 1][2];
+ unsigned int branch_ct_class0_hp[2][2];
+ unsigned int branch_ct_hp[2][2];
+ int savings = 0;
+
+#ifdef NMV_STATS
+ if (!cpi->dummy_packing)
+ add_nmvcount(&tnmvcounts, &cpi->NMVcount);
+#endif
+ vp9_counts_to_nmv_context(&cpi->NMVcount, &prob, usehp,
+ branch_ct_joint, branch_ct_sign, branch_ct_classes,
+ branch_ct_class0, branch_ct_bits,
+ branch_ct_class0_fp, branch_ct_fp,
+ branch_ct_class0_hp, branch_ct_hp);
+ /* write updates if they help */
+#ifdef MV_GROUP_UPDATE
+ for (j = 0; j < MV_JOINTS - 1; ++j) {
+ savings += update_nmv_savings(branch_ct_joint[j],
+ cpi->common.fc.nmvc.joints[j],
+ prob.joints[j],
+ VP9_NMV_UPDATE_PROB);
+ }
+ for (i = 0; i < 2; ++i) {
+ savings += update_nmv_savings(branch_ct_sign[i],
+ cpi->common.fc.nmvc.comps[i].sign,
+ prob.comps[i].sign,
+ VP9_NMV_UPDATE_PROB);
+ for (j = 0; j < MV_CLASSES - 1; ++j) {
+ savings += update_nmv_savings(branch_ct_classes[i][j],
+ cpi->common.fc.nmvc.comps[i].classes[j],
+ prob.comps[i].classes[j],
+ VP9_NMV_UPDATE_PROB);
+ }
+ for (j = 0; j < CLASS0_SIZE - 1; ++j) {
+ savings += update_nmv_savings(branch_ct_class0[i][j],
+ cpi->common.fc.nmvc.comps[i].class0[j],
+ prob.comps[i].class0[j],
+ VP9_NMV_UPDATE_PROB);
+ }
+ for (j = 0; j < MV_OFFSET_BITS; ++j) {
+ savings += update_nmv_savings(branch_ct_bits[i][j],
+ cpi->common.fc.nmvc.comps[i].bits[j],
+ prob.comps[i].bits[j],
+ VP9_NMV_UPDATE_PROB);
+ }
+ }
+ for (i = 0; i < 2; ++i) {
+ for (j = 0; j < CLASS0_SIZE; ++j) {
+ int k;
+ for (k = 0; k < 3; ++k) {
+ savings += update_nmv_savings(branch_ct_class0_fp[i][j][k],
+ cpi->common.fc.nmvc.comps[i].class0_fp[j][k],
+ prob.comps[i].class0_fp[j][k],
+ VP9_NMV_UPDATE_PROB);
+ }
+ }
+ for (j = 0; j < 3; ++j) {
+ savings += update_nmv_savings(branch_ct_fp[i][j],
+ cpi->common.fc.nmvc.comps[i].fp[j],
+ prob.comps[i].fp[j],
+ VP9_NMV_UPDATE_PROB);
+ }
+ }
+ if (usehp) {
+ for (i = 0; i < 2; ++i) {
+ savings += update_nmv_savings(branch_ct_class0_hp[i],
+ cpi->common.fc.nmvc.comps[i].class0_hp,
+ prob.comps[i].class0_hp,
+ VP9_NMV_UPDATE_PROB);
+ savings += update_nmv_savings(branch_ct_hp[i],
+ cpi->common.fc.nmvc.comps[i].hp,
+ prob.comps[i].hp,
+ VP9_NMV_UPDATE_PROB);
+ }
+ }
+ if (savings <= 0) {
+ vp9_write_bit(bc, 0);
+ return;
+ }
+ vp9_write_bit(bc, 1);
+#endif
+
+ for (j = 0; j < MV_JOINTS - 1; ++j) {
+ update_nmv(bc, branch_ct_joint[j],
+ &cpi->common.fc.nmvc.joints[j],
+ prob.joints[j],
+ VP9_NMV_UPDATE_PROB);
+ }
+ for (i = 0; i < 2; ++i) {
+ update_nmv(bc, branch_ct_sign[i],
+ &cpi->common.fc.nmvc.comps[i].sign,
+ prob.comps[i].sign,
+ VP9_NMV_UPDATE_PROB);
+ for (j = 0; j < MV_CLASSES - 1; ++j) {
+ update_nmv(bc, branch_ct_classes[i][j],
+ &cpi->common.fc.nmvc.comps[i].classes[j],
+ prob.comps[i].classes[j],
+ VP9_NMV_UPDATE_PROB);
+ }
+ for (j = 0; j < CLASS0_SIZE - 1; ++j) {
+ update_nmv(bc, branch_ct_class0[i][j],
+ &cpi->common.fc.nmvc.comps[i].class0[j],
+ prob.comps[i].class0[j],
+ VP9_NMV_UPDATE_PROB);
+ }
+ for (j = 0; j < MV_OFFSET_BITS; ++j) {
+ update_nmv(bc, branch_ct_bits[i][j],
+ &cpi->common.fc.nmvc.comps[i].bits[j],
+ prob.comps[i].bits[j],
+ VP9_NMV_UPDATE_PROB);
+ }
+ }
+ for (i = 0; i < 2; ++i) {
+ for (j = 0; j < CLASS0_SIZE; ++j) {
+ int k;
+ for (k = 0; k < 3; ++k) {
+ update_nmv(bc, branch_ct_class0_fp[i][j][k],
+ &cpi->common.fc.nmvc.comps[i].class0_fp[j][k],
+ prob.comps[i].class0_fp[j][k],
+ VP9_NMV_UPDATE_PROB);
+ }
+ }
+ for (j = 0; j < 3; ++j) {
+ update_nmv(bc, branch_ct_fp[i][j],
+ &cpi->common.fc.nmvc.comps[i].fp[j],
+ prob.comps[i].fp[j],
+ VP9_NMV_UPDATE_PROB);
+ }
+ }
+ if (usehp) {
+ for (i = 0; i < 2; ++i) {
+ update_nmv(bc, branch_ct_class0_hp[i],
+ &cpi->common.fc.nmvc.comps[i].class0_hp,
+ prob.comps[i].class0_hp,
+ VP9_NMV_UPDATE_PROB);
+ update_nmv(bc, branch_ct_hp[i],
+ &cpi->common.fc.nmvc.comps[i].hp,
+ prob.comps[i].hp,
+ VP9_NMV_UPDATE_PROB);
+ }
+ }
+}
+
+void vp9_encode_nmv(vp9_writer* const bc, const MV* const mv,
+ const MV* const ref, const nmv_context* const mvctx) {
+ MV_JOINT_TYPE j = vp9_get_mv_joint(*mv);
+ write_token(bc, vp9_mv_joint_tree, mvctx->joints,
+ vp9_mv_joint_encodings + j);
+ if (j == MV_JOINT_HZVNZ || j == MV_JOINT_HNZVNZ) {
+    encode_nmv_component(bc, mv->row, ref->row, &mvctx->comps[0]);
+ }
+ if (j == MV_JOINT_HNZVZ || j == MV_JOINT_HNZVNZ) {
+ encode_nmv_component(bc, mv->col, ref->col, &mvctx->comps[1]);
+ }
+}
+
+void vp9_encode_nmv_fp(vp9_writer* const bc, const MV* const mv,
+ const MV* const ref, const nmv_context* const mvctx,
+ int usehp) {
+ MV_JOINT_TYPE j = vp9_get_mv_joint(*mv);
+ usehp = usehp && vp9_use_nmv_hp(ref);
+ if (j == MV_JOINT_HZVNZ || j == MV_JOINT_HNZVNZ) {
+ encode_nmv_component_fp(bc, mv->row, ref->row, &mvctx->comps[0], usehp);
+ }
+ if (j == MV_JOINT_HNZVZ || j == MV_JOINT_HNZVNZ) {
+ encode_nmv_component_fp(bc, mv->col, ref->col, &mvctx->comps[1], usehp);
+ }
+}
+
+void vp9_build_nmv_cost_table(int *mvjoint,
+ int *mvcost[2],
+ const nmv_context* const mvctx,
+ int usehp,
+ int mvc_flag_v,
+ int mvc_flag_h) {
+ vp9_clear_system_state();
+ vp9_cost_tokens(mvjoint, mvctx->joints, vp9_mv_joint_tree);
+ if (mvc_flag_v)
+ build_nmv_component_cost_table(mvcost[0], &mvctx->comps[0], usehp);
+ if (mvc_flag_h)
+ build_nmv_component_cost_table(mvcost[1], &mvctx->comps[1], usehp);
+}
--- /dev/null
+++ b/vp9/encoder/encodemv.h
@@ -1,0 +1,30 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef __INC_ENCODEMV_H
+#define __INC_ENCODEMV_H
+
+#include "onyx_int.h"
+
+void vp9_write_nmvprobs(VP9_COMP* const, int usehp, vp9_writer* const);
+void vp9_encode_nmv(vp9_writer* const w, const MV* const mv,
+ const MV* const ref, const nmv_context* const mvctx);
+void vp9_encode_nmv_fp(vp9_writer* const w, const MV* const mv,
+ const MV* const ref, const nmv_context *mvctx,
+ int usehp);
+void vp9_build_nmv_cost_table(int *mvjoint,
+ int *mvcost[2],
+ const nmv_context *mvctx,
+ int usehp,
+ int mvc_flag_v,
+ int mvc_flag_h);
+
+#endif
--- /dev/null
+++ b/vp9/encoder/firstpass.c
@@ -1,0 +1,2533 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "math.h"
+#include "limits.h"
+#include "block.h"
+#include "onyx_int.h"
+#include "variance.h"
+#include "encodeintra.h"
+#include "vp9/common/setupintrarecon.h"
+#include "mcomp.h"
+#include "firstpass.h"
+#include "vpx_scale/vpxscale.h"
+#include "encodemb.h"
+#include "vp9/common/extend.h"
+#include "vp9/common/systemdependent.h"
+#include "vpx_scale/yv12extend.h"
+#include "vpx_mem/vpx_mem.h"
+#include "vp9/common/swapyv12buffer.h"
+#include <stdio.h>
+#include "rdopt.h"
+#include "ratectrl.h"
+#include "vp9/common/quant_common.h"
+#include "vp9/common/entropymv.h"
+#include "encodemv.h"
+
+#define OUTPUT_FPF 0
+
+#if CONFIG_RUNTIME_CPU_DETECT
+#define IF_RTCD(x) (x)
+#else
+#define IF_RTCD(x) NULL
+#endif
+
+extern void vp9_build_block_offsets(MACROBLOCK *x);
+
+extern void vp9_setup_block_ptrs(MACROBLOCK *x);
+
+extern void vp9_frame_init_quantizer(VP9_COMP *cpi);
+
+extern void vp9_set_mbmode_and_mvs(MACROBLOCK *x, MB_PREDICTION_MODE mb,
+ int_mv *mv);
+
+extern void vp9_alloc_compressor_data(VP9_COMP *cpi);
+
+#define IIFACTOR 12.5
+#define IIKFACTOR1 12.5
+#define IIKFACTOR2 15.0
+#define RMAX 128.0
+#define GF_RMAX 96.0
+#define ERR_DIVISOR 150.0
+
+#define KF_MB_INTRA_MIN 300
+#define GF_MB_INTRA_MIN 200
+
+#define DOUBLE_DIVIDE_CHECK(X) ((X)<0?(X)-.000001:(X)+.000001)
+
+#define POW1 (double)cpi->oxcf.two_pass_vbrbias/100.0
+#define POW2 (double)cpi->oxcf.two_pass_vbrbias/100.0
+
+static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame);
+
+static int select_cq_level(int qindex) {
+ int ret_val = QINDEX_RANGE - 1;
+ int i;
+
+ double target_q = (vp9_convert_qindex_to_q(qindex) * 0.5847) + 1.0;
+
+ for (i = 0; i < QINDEX_RANGE; i++) {
+ if (target_q <= vp9_convert_qindex_to_q(i)) {
+ ret_val = i;
+ break;
+ }
+ }
+
+ return ret_val;
+}
+
+
+// Resets the first pass stats read position to the given point in the file.
+static void reset_fpf_position(VP9_COMP *cpi, FIRSTPASS_STATS *Position) {
+ cpi->twopass.stats_in = Position;
+}
+
+static int lookup_next_frame_stats(VP9_COMP *cpi, FIRSTPASS_STATS *next_frame) {
+ if (cpi->twopass.stats_in >= cpi->twopass.stats_in_end)
+ return EOF;
+
+ *next_frame = *cpi->twopass.stats_in;
+ return 1;
+}
+
+// Read frame stats at an offset from the current position
+static int read_frame_stats(VP9_COMP *cpi,
+ FIRSTPASS_STATS *frame_stats,
+ int offset) {
+ FIRSTPASS_STATS *fps_ptr = cpi->twopass.stats_in;
+
+ // Check legality of offset
+ if (offset >= 0) {
+ if (&fps_ptr[offset] >= cpi->twopass.stats_in_end)
+ return EOF;
+ } else if (offset < 0) {
+ if (&fps_ptr[offset] < cpi->twopass.stats_in_start)
+ return EOF;
+ }
+
+ *frame_stats = fps_ptr[offset];
+ return 1;
+}
+
+static int input_stats(VP9_COMP *cpi, FIRSTPASS_STATS *fps) {
+ if (cpi->twopass.stats_in >= cpi->twopass.stats_in_end)
+ return EOF;
+
+ *fps = *cpi->twopass.stats_in;
+ cpi->twopass.stats_in =
+ (void *)((char *)cpi->twopass.stats_in + sizeof(FIRSTPASS_STATS));
+ return 1;
+}
+
+static void output_stats(const VP9_COMP *cpi,
+ struct vpx_codec_pkt_list *pktlist,
+ FIRSTPASS_STATS *stats) {
+ struct vpx_codec_cx_pkt pkt;
+ pkt.kind = VPX_CODEC_STATS_PKT;
+ pkt.data.twopass_stats.buf = stats;
+ pkt.data.twopass_stats.sz = sizeof(FIRSTPASS_STATS);
+ vpx_codec_pkt_list_add(pktlist, &pkt);
+
+// TEMP debug code
+#if OUTPUT_FPF
+
+ {
+ FILE *fpfile;
+ fpfile = fopen("firstpass.stt", "a");
+
+ fprintf(fpfile, "%12.0f %12.0f %12.0f %12.0f %12.0f %12.4f %12.4f"
+ "%12.4f %12.4f %12.4f %12.4f %12.4f %12.4f %12.4f"
+ "%12.0f %12.0f %12.4f %12.0f %12.0f %12.4f\n",
+ stats->frame,
+ stats->intra_error,
+ stats->coded_error,
+ stats->sr_coded_error,
+ stats->ssim_weighted_pred_err,
+ stats->pcnt_inter,
+ stats->pcnt_motion,
+ stats->pcnt_second_ref,
+ stats->pcnt_neutral,
+ stats->MVr,
+ stats->mvr_abs,
+ stats->MVc,
+ stats->mvc_abs,
+ stats->MVrv,
+ stats->MVcv,
+ stats->mv_in_out_count,
+ stats->new_mv_count,
+ stats->count,
+ stats->duration);
+ fclose(fpfile);
+ }
+#endif
+}
+
+static void zero_stats(FIRSTPASS_STATS *section) {
+ section->frame = 0.0;
+ section->intra_error = 0.0;
+ section->coded_error = 0.0;
+ section->sr_coded_error = 0.0;
+ section->ssim_weighted_pred_err = 0.0;
+ section->pcnt_inter = 0.0;
+ section->pcnt_motion = 0.0;
+ section->pcnt_second_ref = 0.0;
+ section->pcnt_neutral = 0.0;
+ section->MVr = 0.0;
+ section->mvr_abs = 0.0;
+ section->MVc = 0.0;
+ section->mvc_abs = 0.0;
+ section->MVrv = 0.0;
+ section->MVcv = 0.0;
+ section->mv_in_out_count = 0.0;
+ section->new_mv_count = 0.0;
+ section->count = 0.0;
+ section->duration = 1.0;
+}
+
+static void accumulate_stats(FIRSTPASS_STATS *section, FIRSTPASS_STATS *frame) {
+ section->frame += frame->frame;
+ section->intra_error += frame->intra_error;
+ section->coded_error += frame->coded_error;
+ section->sr_coded_error += frame->sr_coded_error;
+ section->ssim_weighted_pred_err += frame->ssim_weighted_pred_err;
+ section->pcnt_inter += frame->pcnt_inter;
+ section->pcnt_motion += frame->pcnt_motion;
+ section->pcnt_second_ref += frame->pcnt_second_ref;
+ section->pcnt_neutral += frame->pcnt_neutral;
+ section->MVr += frame->MVr;
+ section->mvr_abs += frame->mvr_abs;
+ section->MVc += frame->MVc;
+ section->mvc_abs += frame->mvc_abs;
+ section->MVrv += frame->MVrv;
+ section->MVcv += frame->MVcv;
+ section->mv_in_out_count += frame->mv_in_out_count;
+ section->new_mv_count += frame->new_mv_count;
+ section->count += frame->count;
+ section->duration += frame->duration;
+}
+
+static void subtract_stats(FIRSTPASS_STATS *section, FIRSTPASS_STATS *frame) {
+ section->frame -= frame->frame;
+ section->intra_error -= frame->intra_error;
+ section->coded_error -= frame->coded_error;
+ section->sr_coded_error -= frame->sr_coded_error;
+ section->ssim_weighted_pred_err -= frame->ssim_weighted_pred_err;
+ section->pcnt_inter -= frame->pcnt_inter;
+ section->pcnt_motion -= frame->pcnt_motion;
+ section->pcnt_second_ref -= frame->pcnt_second_ref;
+ section->pcnt_neutral -= frame->pcnt_neutral;
+ section->MVr -= frame->MVr;
+ section->mvr_abs -= frame->mvr_abs;
+ section->MVc -= frame->MVc;
+ section->mvc_abs -= frame->mvc_abs;
+ section->MVrv -= frame->MVrv;
+ section->MVcv -= frame->MVcv;
+ section->mv_in_out_count -= frame->mv_in_out_count;
+ section->new_mv_count -= frame->new_mv_count;
+ section->count -= frame->count;
+ section->duration -= frame->duration;
+}
+
+static void avg_stats(FIRSTPASS_STATS *section) {
+ if (section->count < 1.0)
+ return;
+
+ section->intra_error /= section->count;
+ section->coded_error /= section->count;
+ section->sr_coded_error /= section->count;
+ section->ssim_weighted_pred_err /= section->count;
+ section->pcnt_inter /= section->count;
+ section->pcnt_second_ref /= section->count;
+ section->pcnt_neutral /= section->count;
+ section->pcnt_motion /= section->count;
+ section->MVr /= section->count;
+ section->mvr_abs /= section->count;
+ section->MVc /= section->count;
+ section->mvc_abs /= section->count;
+ section->MVrv /= section->count;
+ section->MVcv /= section->count;
+ section->mv_in_out_count /= section->count;
+ section->duration /= section->count;
+}
+
+// Calculate a modified error, used when distributing bits between easier
+// and harder frames.
+static double calculate_modified_err(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
+ double av_err = (cpi->twopass.total_stats->ssim_weighted_pred_err /
+ cpi->twopass.total_stats->count);
+ double this_err = this_frame->ssim_weighted_pred_err;
+ double modified_err;
+
+ if (this_err > av_err)
+ modified_err = av_err * pow((this_err / DOUBLE_DIVIDE_CHECK(av_err)), POW1);
+ else
+ modified_err = av_err * pow((this_err / DOUBLE_DIVIDE_CHECK(av_err)), POW2);
+
+ return modified_err;
+}
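+
+// Rough worked example of the mapping above (illustrative numbers only,
+// assuming two_pass_vbrbias = 50 so POW1 = POW2 = 0.5): with an average
+// ssim-weighted error of 100 and a frame error of 400, the modified error is
+// 100 * (400 / 100) ^ 0.5 = 200, i.e. hard frames are pulled back towards
+// the average before bits are distributed.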
+
+static const double weight_table[256] = {
+ 0.020000, 0.020000, 0.020000, 0.020000, 0.020000, 0.020000, 0.020000, 0.020000,
+ 0.020000, 0.020000, 0.020000, 0.020000, 0.020000, 0.020000, 0.020000, 0.020000,
+ 0.020000, 0.020000, 0.020000, 0.020000, 0.020000, 0.020000, 0.020000, 0.020000,
+ 0.020000, 0.020000, 0.020000, 0.020000, 0.020000, 0.020000, 0.020000, 0.020000,
+ 0.020000, 0.031250, 0.062500, 0.093750, 0.125000, 0.156250, 0.187500, 0.218750,
+ 0.250000, 0.281250, 0.312500, 0.343750, 0.375000, 0.406250, 0.437500, 0.468750,
+ 0.500000, 0.531250, 0.562500, 0.593750, 0.625000, 0.656250, 0.687500, 0.718750,
+ 0.750000, 0.781250, 0.812500, 0.843750, 0.875000, 0.906250, 0.937500, 0.968750,
+ 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
+ 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
+ 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
+ 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
+ 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
+ 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
+ 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
+ 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
+ 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
+ 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
+ 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
+ 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
+ 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
+ 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
+ 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
+ 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
+ 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
+ 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
+ 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
+ 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
+ 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
+ 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
+ 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
+ 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000
+};
+
+static double simple_weight(YV12_BUFFER_CONFIG *source) {
+ int i, j;
+
+ unsigned char *src = source->y_buffer;
+ double sum_weights = 0.0;
+
+  // Loop through the raw Y plane, examining pixel levels and creating a
+  // weight for the image.
+ i = source->y_height;
+ do {
+ j = source->y_width;
+ do {
+ sum_weights += weight_table[ *src];
+ src++;
+ } while (--j);
+ src -= source->y_width;
+ src += source->y_stride;
+ } while (--i);
+
+ sum_weights /= (source->y_height * source->y_width);
+
+ return sum_weights;
+}
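+
+// Reading of the weight_table above: luma values of 32 and below contribute
+// only 0.02, values from 33 to 63 ramp up towards 1.0 in steps of 1/32, and
+// 64 and above contribute 1.0. simple_weight() therefore returns a small
+// value for predominantly dark frames, which vp9_first_pass() below uses to
+// scale down their coded error when forming ssim_weighted_pred_err.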
+
+
+// This function returns the current per frame maximum bitrate target
+static int frame_max_bits(VP9_COMP *cpi) {
+  // Max allocation for a single frame, based on the max section guidelines
+  // passed in and how many bits are left.
+  int max_bits;
+
+  // For VBR, base this on the bits and frames left plus the
+  // two_pass_vbrmax_section rate passed in by the user.
+  max_bits = (int)(((double)cpi->twopass.bits_left /
+                    (cpi->twopass.total_stats->count -
+                     (double)cpi->common.current_video_frame)) *
+                   ((double)cpi->oxcf.two_pass_vbrmax_section / 100.0));
+
+ // Trap case where we are out of bits
+ if (max_bits < 0)
+ max_bits = 0;
+
+ return max_bits;
+}
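+
+// Worked example of the cap above (illustrative numbers only): with
+// 8,000,000 bits left, 200 frames still to code and two_pass_vbrmax_section
+// set to 400, max_bits = (8,000,000 / 200) * (400 / 100) = 160,000 bits for
+// a single frame.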
+
+void vp9_init_first_pass(VP9_COMP *cpi) {
+ zero_stats(cpi->twopass.total_stats);
+}
+
+void vp9_end_first_pass(VP9_COMP *cpi) {
+ output_stats(cpi, cpi->output_pkt_list, cpi->twopass.total_stats);
+}
+
+static void zz_motion_search(VP9_COMP *cpi, MACROBLOCK *x,
+                             YV12_BUFFER_CONFIG *recon_buffer,
+                             int *best_motion_err, int recon_yoffset) {
+ MACROBLOCKD *const xd = &x->e_mbd;
+ BLOCK *b = &x->block[0];
+ BLOCKD *d = &x->e_mbd.block[0];
+
+ unsigned char *src_ptr = (*(b->base_src) + b->src);
+ int src_stride = b->src_stride;
+ unsigned char *ref_ptr;
+ int ref_stride = d->pre_stride;
+
+ // Set up pointers for this macro block recon buffer
+ xd->pre.y_buffer = recon_buffer->y_buffer + recon_yoffset;
+
+ ref_ptr = (unsigned char *)(*(d->base_pre) + d->pre);
+
+ vp9_mse16x16(src_ptr, src_stride, ref_ptr, ref_stride,
+ (unsigned int *)(best_motion_err));
+}
+
+static void first_pass_motion_search(VP9_COMP *cpi, MACROBLOCK *x,
+ int_mv *ref_mv, MV *best_mv,
+ YV12_BUFFER_CONFIG *recon_buffer,
+ int *best_motion_err, int recon_yoffset) {
+ MACROBLOCKD *const xd = &x->e_mbd;
+ BLOCK *b = &x->block[0];
+ BLOCKD *d = &x->e_mbd.block[0];
+ int num00;
+
+ int_mv tmp_mv;
+ int_mv ref_mv_full;
+
+ int tmp_err;
+ int step_param = 3;
+ int further_steps = (MAX_MVSEARCH_STEPS - 1) - step_param;
+ int n;
+ vp9_variance_fn_ptr_t v_fn_ptr = cpi->fn_ptr[BLOCK_16X16];
+ int new_mv_mode_penalty = 256;
+
+ // override the default variance function to use MSE
+ v_fn_ptr.vf = vp9_mse16x16;
+
+ // Set up pointers for this macro block recon buffer
+ xd->pre.y_buffer = recon_buffer->y_buffer + recon_yoffset;
+
+ // Initial step/diamond search centred on best mv
+ tmp_mv.as_int = 0;
+ ref_mv_full.as_mv.col = ref_mv->as_mv.col >> 3;
+ ref_mv_full.as_mv.row = ref_mv->as_mv.row >> 3;
+ tmp_err = cpi->diamond_search_sad(x, b, d, &ref_mv_full, &tmp_mv, step_param,
+ x->sadperbit16, &num00, &v_fn_ptr,
+ XMVCOST, ref_mv);
+ if (tmp_err < INT_MAX - new_mv_mode_penalty)
+ tmp_err += new_mv_mode_penalty;
+
+ if (tmp_err < *best_motion_err) {
+ *best_motion_err = tmp_err;
+ best_mv->row = tmp_mv.as_mv.row;
+ best_mv->col = tmp_mv.as_mv.col;
+ }
+
+ // Further step/diamond searches as necessary
+ n = num00;
+ num00 = 0;
+
+ while (n < further_steps) {
+ n++;
+
+ if (num00)
+ num00--;
+ else {
+ tmp_err = cpi->diamond_search_sad(x, b, d, &ref_mv_full, &tmp_mv,
+ step_param + n, x->sadperbit16,
+ &num00, &v_fn_ptr,
+ XMVCOST, ref_mv);
+ if (tmp_err < INT_MAX - new_mv_mode_penalty)
+ tmp_err += new_mv_mode_penalty;
+
+ if (tmp_err < *best_motion_err) {
+ *best_motion_err = tmp_err;
+ best_mv->row = tmp_mv.as_mv.row;
+ best_mv->col = tmp_mv.as_mv.col;
+ }
+ }
+ }
+}
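+
+// Sketch of the two search helpers above: zz_motion_search() measures the
+// 0,0 (no motion) MSE as a baseline, and first_pass_motion_search() runs an
+// initial diamond search (step_param 3) around the supplied reference mv,
+// followed by additional diamond steps, skipping some based on the num00
+// feedback from the previous search. The fixed new_mv_mode_penalty of 256
+// mirrors the intrapenalty used in vp9_first_pass() below and discourages
+// new vectors when the raw errors are already very small.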
+
+void vp9_first_pass(VP9_COMP *cpi) {
+ int mb_row, mb_col;
+ MACROBLOCK *const x = &cpi->mb;
+ VP9_COMMON *const cm = &cpi->common;
+ MACROBLOCKD *const xd = &x->e_mbd;
+
+ int recon_yoffset, recon_uvoffset;
+ YV12_BUFFER_CONFIG *lst_yv12 = &cm->yv12_fb[cm->lst_fb_idx];
+ YV12_BUFFER_CONFIG *new_yv12 = &cm->yv12_fb[cm->new_fb_idx];
+ YV12_BUFFER_CONFIG *gld_yv12 = &cm->yv12_fb[cm->gld_fb_idx];
+ int recon_y_stride = lst_yv12->y_stride;
+ int recon_uv_stride = lst_yv12->uv_stride;
+ int64_t intra_error = 0;
+ int64_t coded_error = 0;
+ int64_t sr_coded_error = 0;
+
+ int sum_mvr = 0, sum_mvc = 0;
+ int sum_mvr_abs = 0, sum_mvc_abs = 0;
+ int sum_mvrs = 0, sum_mvcs = 0;
+ int mvcount = 0;
+ int intercount = 0;
+ int second_ref_count = 0;
+ int intrapenalty = 256;
+ int neutral_count = 0;
+ int new_mv_count = 0;
+ int sum_in_vectors = 0;
+ uint32_t lastmv_as_int = 0;
+
+ int_mv zero_ref_mv;
+
+ zero_ref_mv.as_int = 0;
+
+ vp9_clear_system_state(); // __asm emms;
+
+ x->src = * cpi->Source;
+ xd->pre = *lst_yv12;
+ xd->dst = *new_yv12;
+
+ x->partition_info = x->pi;
+
+ xd->mode_info_context = cm->mi;
+
+ vp9_build_block_offsets(x);
+
+ vp9_setup_block_dptrs(&x->e_mbd);
+
+ vp9_setup_block_ptrs(x);
+
+  // set up the new frame for intra coded blocks
+ vp9_setup_intra_recon(new_yv12);
+ vp9_frame_init_quantizer(cpi);
+
+ // Initialise the MV cost table to the defaults
+ // if( cm->current_video_frame == 0)
+ // if ( 0 )
+ {
+ int flag[2] = {1, 1};
+ vp9_init_mv_probs(cm);
+ vp9_initialize_rd_consts(cpi, cm->base_qindex + cm->y1dc_delta_q);
+ }
+
+ // for each macroblock row in image
+ for (mb_row = 0; mb_row < cm->mb_rows; mb_row++) {
+ int_mv best_ref_mv;
+
+ best_ref_mv.as_int = 0;
+
+ // reset above block coeffs
+ xd->up_available = (mb_row != 0);
+ recon_yoffset = (mb_row * recon_y_stride * 16);
+ recon_uvoffset = (mb_row * recon_uv_stride * 8);
+
+ // Set up limit values for motion vectors to prevent them extending outside the UMV borders
+ x->mv_row_min = -((mb_row * 16) + (VP8BORDERINPIXELS - 16));
+ x->mv_row_max = ((cm->mb_rows - 1 - mb_row) * 16) + (VP8BORDERINPIXELS - 16);
+
+
+ // for each macroblock col in image
+ for (mb_col = 0; mb_col < cm->mb_cols; mb_col++) {
+ int this_error;
+ int gf_motion_error = INT_MAX;
+ int use_dc_pred = (mb_col || mb_row) && (!mb_col || !mb_row);
+
+ xd->dst.y_buffer = new_yv12->y_buffer + recon_yoffset;
+ xd->dst.u_buffer = new_yv12->u_buffer + recon_uvoffset;
+ xd->dst.v_buffer = new_yv12->v_buffer + recon_uvoffset;
+ xd->left_available = (mb_col != 0);
+
+ // Copy current mb to a buffer
+ vp9_copy_mem16x16(x->src.y_buffer, x->src.y_stride, x->thismb, 16);
+
+ // do intra 16x16 prediction
+ this_error = vp9_encode_intra(cpi, x, use_dc_pred);
+
+ // "intrapenalty" below deals with situations where the intra and inter error scores are very low (eg a plain black frame)
+ // We do not have special cases in first pass for 0,0 and nearest etc so all inter modes carry an overhead cost estimate fot the mv.
+ // When the error score is very low this causes us to pick all or lots of INTRA modes and throw lots of key frames.
+ // This penalty adds a cost matching that of a 0,0 mv to the intra case.
+ this_error += intrapenalty;
+
+ // Cumulative intra error total
+ intra_error += (int64_t)this_error;
+
+ // Set up limit values for motion vectors to prevent them extending outside the UMV borders
+ x->mv_col_min = -((mb_col * 16) + (VP8BORDERINPIXELS - 16));
+ x->mv_col_max = ((cm->mb_cols - 1 - mb_col) * 16) + (VP8BORDERINPIXELS - 16);
+
+ // Other than for the first frame do a motion search
+ if (cm->current_video_frame > 0) {
+ int tmp_err;
+ int motion_error = INT_MAX;
+ int_mv mv, tmp_mv;
+
+ // Simple 0,0 motion with no mv overhead
+ zz_motion_search(cpi, x, lst_yv12, &motion_error, recon_yoffset);
+ mv.as_int = tmp_mv.as_int = 0;
+
+ // Test last reference frame using the previous best mv as the
+ // starting point (best reference) for the search
+ first_pass_motion_search(cpi, x, &best_ref_mv,
+ &mv.as_mv, lst_yv12,
+ &motion_error, recon_yoffset);
+
+ // If the current best reference mv is not centred on 0,0 then do a 0,0 based search as well
+ if (best_ref_mv.as_int) {
+ tmp_err = INT_MAX;
+ first_pass_motion_search(cpi, x, &zero_ref_mv, &tmp_mv.as_mv,
+ lst_yv12, &tmp_err, recon_yoffset);
+
+ if (tmp_err < motion_error) {
+ motion_error = tmp_err;
+ mv.as_int = tmp_mv.as_int;
+ }
+ }
+
+ // Experimental search in an older reference frame
+ if (cm->current_video_frame > 1) {
+ // Simple 0,0 motion with no mv overhead
+ zz_motion_search(cpi, x, gld_yv12,
+ &gf_motion_error, recon_yoffset);
+
+ first_pass_motion_search(cpi, x, &zero_ref_mv,
+ &tmp_mv.as_mv, gld_yv12,
+ &gf_motion_error, recon_yoffset);
+
+ if ((gf_motion_error < motion_error) &&
+ (gf_motion_error < this_error)) {
+ second_ref_count++;
+ }
+
+ // Reset to last frame as reference buffer
+ xd->pre.y_buffer = lst_yv12->y_buffer + recon_yoffset;
+ xd->pre.u_buffer = lst_yv12->u_buffer + recon_uvoffset;
+ xd->pre.v_buffer = lst_yv12->v_buffer + recon_uvoffset;
+
+          // In accumulating a score for the older reference frame take the
+          // best of the motion predicted score and the intra coded error
+          // (just as will be done for accumulation of "coded_error" for the
+          // last frame).
+ if (gf_motion_error < this_error)
+ sr_coded_error += gf_motion_error;
+ else
+ sr_coded_error += this_error;
+ } else
+ sr_coded_error += motion_error;
+
+ /* Intra assumed best */
+ best_ref_mv.as_int = 0;
+
+ if (motion_error <= this_error) {
+ // Keep a count of cases where the inter and intra were
+ // very close and very low. This helps with scene cut
+ // detection for example in cropped clips with black bars
+ // at the sides or top and bottom.
+ if ((((this_error - intrapenalty) * 9) <=
+ (motion_error * 10)) &&
+ (this_error < (2 * intrapenalty))) {
+ neutral_count++;
+ }
+
+ mv.as_mv.row <<= 3;
+ mv.as_mv.col <<= 3;
+ this_error = motion_error;
+ vp9_set_mbmode_and_mvs(x, NEWMV, &mv);
+ xd->mode_info_context->mbmi.txfm_size = TX_4X4;
+ vp9_encode_inter16x16y(IF_RTCD(&cpi->rtcd), x);
+ sum_mvr += mv.as_mv.row;
+ sum_mvr_abs += abs(mv.as_mv.row);
+ sum_mvc += mv.as_mv.col;
+ sum_mvc_abs += abs(mv.as_mv.col);
+ sum_mvrs += mv.as_mv.row * mv.as_mv.row;
+ sum_mvcs += mv.as_mv.col * mv.as_mv.col;
+ intercount++;
+
+ best_ref_mv.as_int = mv.as_int;
+
+ // Was the vector non-zero
+ if (mv.as_int) {
+ mvcount++;
+
+ // Was it different from the last non zero vector
+ if (mv.as_int != lastmv_as_int)
+ new_mv_count++;
+ lastmv_as_int = mv.as_int;
+
+ // Does the Row vector point inwards or outwards
+ if (mb_row < cm->mb_rows / 2) {
+ if (mv.as_mv.row > 0)
+ sum_in_vectors--;
+ else if (mv.as_mv.row < 0)
+ sum_in_vectors++;
+ } else if (mb_row > cm->mb_rows / 2) {
+ if (mv.as_mv.row > 0)
+ sum_in_vectors++;
+ else if (mv.as_mv.row < 0)
+ sum_in_vectors--;
+ }
+
+          // Does the Column vector point inwards or outwards
+ if (mb_col < cm->mb_cols / 2) {
+ if (mv.as_mv.col > 0)
+ sum_in_vectors--;
+ else if (mv.as_mv.col < 0)
+ sum_in_vectors++;
+ } else if (mb_col > cm->mb_cols / 2) {
+ if (mv.as_mv.col > 0)
+ sum_in_vectors++;
+ else if (mv.as_mv.col < 0)
+ sum_in_vectors--;
+ }
+ }
+ }
+ } else
+ sr_coded_error += (int64_t)this_error;
+
+ coded_error += (int64_t)this_error;
+
+ // adjust to the next column of macroblocks
+ x->src.y_buffer += 16;
+ x->src.u_buffer += 8;
+ x->src.v_buffer += 8;
+
+ recon_yoffset += 16;
+ recon_uvoffset += 8;
+ }
+
+ // adjust to the next row of mbs
+ x->src.y_buffer += 16 * x->src.y_stride - 16 * cm->mb_cols;
+ x->src.u_buffer += 8 * x->src.uv_stride - 8 * cm->mb_cols;
+ x->src.v_buffer += 8 * x->src.uv_stride - 8 * cm->mb_cols;
+
+ // extend the recon for intra prediction
+ vp9_extend_mb_row(new_yv12, xd->dst.y_buffer + 16,
+ xd->dst.u_buffer + 8, xd->dst.v_buffer + 8);
+ vp9_clear_system_state(); // __asm emms;
+ }
+
+ vp9_clear_system_state(); // __asm emms;
+ {
+ double weight = 0.0;
+
+ FIRSTPASS_STATS fps;
+
+ fps.frame = cm->current_video_frame;
+ fps.intra_error = intra_error >> 8;
+ fps.coded_error = coded_error >> 8;
+ fps.sr_coded_error = sr_coded_error >> 8;
+ weight = simple_weight(cpi->Source);
+
+
+ if (weight < 0.1)
+ weight = 0.1;
+
+ fps.ssim_weighted_pred_err = fps.coded_error * weight;
+
+ fps.pcnt_inter = 0.0;
+ fps.pcnt_motion = 0.0;
+ fps.MVr = 0.0;
+ fps.mvr_abs = 0.0;
+ fps.MVc = 0.0;
+ fps.mvc_abs = 0.0;
+ fps.MVrv = 0.0;
+ fps.MVcv = 0.0;
+ fps.mv_in_out_count = 0.0;
+ fps.new_mv_count = 0.0;
+ fps.count = 1.0;
+
+ fps.pcnt_inter = 1.0 * (double)intercount / cm->MBs;
+ fps.pcnt_second_ref = 1.0 * (double)second_ref_count / cm->MBs;
+ fps.pcnt_neutral = 1.0 * (double)neutral_count / cm->MBs;
+
+ if (mvcount > 0) {
+ fps.MVr = (double)sum_mvr / (double)mvcount;
+ fps.mvr_abs = (double)sum_mvr_abs / (double)mvcount;
+ fps.MVc = (double)sum_mvc / (double)mvcount;
+ fps.mvc_abs = (double)sum_mvc_abs / (double)mvcount;
+ fps.MVrv = ((double)sum_mvrs - (fps.MVr * fps.MVr / (double)mvcount)) / (double)mvcount;
+ fps.MVcv = ((double)sum_mvcs - (fps.MVc * fps.MVc / (double)mvcount)) / (double)mvcount;
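+      // Each non-zero motion vector can adjust sum_in_vectors once for its
+      // row and once for its column, hence the division by (mvcount * 2).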
+ fps.mv_in_out_count = (double)sum_in_vectors / (double)(mvcount * 2);
+ fps.new_mv_count = new_mv_count;
+
+ fps.pcnt_motion = 1.0 * (double)mvcount / cpi->common.MBs;
+ }
+
+ // TODO: handle the case when duration is set to 0, or something less
+    // than the full time between subsequent values of cpi->source_time_stamp.
+ fps.duration = cpi->source->ts_end
+ - cpi->source->ts_start;
+
+ // don't want to do output stats with a stack variable!
+ memcpy(cpi->twopass.this_frame_stats,
+ &fps,
+ sizeof(FIRSTPASS_STATS));
+ output_stats(cpi, cpi->output_pkt_list, cpi->twopass.this_frame_stats);
+ accumulate_stats(cpi->twopass.total_stats, &fps);
+ }
+
+  // Copy the previous Last Frame back into gf and arf buffers if
+  // the prediction is good enough... but also don't allow it to lag too far.
+ if ((cpi->twopass.sr_update_lag > 3) ||
+ ((cm->current_video_frame > 0) &&
+ (cpi->twopass.this_frame_stats->pcnt_inter > 0.20) &&
+ ((cpi->twopass.this_frame_stats->intra_error /
+ cpi->twopass.this_frame_stats->coded_error) > 2.0))) {
+ vp8_yv12_copy_frame_ptr(lst_yv12, gld_yv12);
+ cpi->twopass.sr_update_lag = 1;
+ } else
+ cpi->twopass.sr_update_lag++;
+
+ // swap frame pointers so last frame refers to the frame we just compressed
+ vp9_swap_yv12_buffer(lst_yv12, new_yv12);
+ vp8_yv12_extend_frame_borders(lst_yv12);
+
+ // Special case for the first frame. Copy into the GF buffer as a second reference.
+ if (cm->current_video_frame == 0) {
+ vp8_yv12_copy_frame_ptr(lst_yv12, gld_yv12);
+ }
+
+
+ // use this to see what the first pass reconstruction looks like
+ if (0) {
+ char filename[512];
+ FILE *recon_file;
+ sprintf(filename, "enc%04d.yuv", (int) cm->current_video_frame);
+
+ if (cm->current_video_frame == 0)
+ recon_file = fopen(filename, "wb");
+ else
+ recon_file = fopen(filename, "ab");
+
+    (void)fwrite(lst_yv12->buffer_alloc, lst_yv12->frame_size, 1, recon_file);
+ fclose(recon_file);
+ }
+
+ cm->current_video_frame++;
+
+}
+
+// Estimate a cost per mb attributable to overheads such as the coding of
+// modes and motion vectors.
+// Currently simplistic in its assumptions for testing.
+//
+
+
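+// Expected cost in bits of signalling an event with the given probability,
+// i.e. -log2(prob). For example bitcost(0.5) == 1.0 and bitcost(0.25) == 2.0.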
+static double bitcost(double prob) {
+ return -(log(prob) / log(2.0));
+}
+
+static long long estimate_modemvcost(VP9_COMP *cpi,
+ FIRSTPASS_STATS *fpstats) {
+ int mv_cost;
+ int mode_cost;
+
+ double av_pct_inter = fpstats->pcnt_inter / fpstats->count;
+ double av_pct_motion = fpstats->pcnt_motion / fpstats->count;
+ double av_intra = (1.0 - av_pct_inter);
+
+ double zz_cost;
+ double motion_cost;
+ double intra_cost;
+
+ zz_cost = bitcost(av_pct_inter - av_pct_motion);
+ motion_cost = bitcost(av_pct_motion);
+ intra_cost = bitcost(av_intra);
+
+ // Estimate of extra bits per mv overhead for mbs
+ // << 9 is the normalization to the (bits * 512) used in vp9_bits_per_mb
+ mv_cost = ((int)(fpstats->new_mv_count / fpstats->count) * 8) << 9;
+
+ // Crude estimate of overhead cost from modes
+ // << 9 is the normalization to (bits * 512) used in vp9_bits_per_mb
+ mode_cost =
+ (int)((((av_pct_inter - av_pct_motion) * zz_cost) +
+ (av_pct_motion * motion_cost) +
+ (av_intra * intra_cost)) * cpi->common.MBs) << 9;
+
+ // return mv_cost + mode_cost;
+ // TODO PGW Fix overhead costs for extended Q range
+ return 0;
+}
+
+static double calc_correction_factor(double err_per_mb,
+ double err_divisor,
+ double pt_low,
+ double pt_high,
+ int Q) {
+ double power_term;
+ double error_term = err_per_mb / err_divisor;
+ double correction_factor;
+
+ // Adjustment based on actual quantizer to power term.
+ power_term = (vp9_convert_qindex_to_q(Q) * 0.01) + pt_low;
+ power_term = (power_term > pt_high) ? pt_high : power_term;
+
+ // Adjustments to error term
+ // TBD
+
+ // Calculate correction factor
+ correction_factor = pow(error_term, power_term);
+
+ // Clip range
+ correction_factor =
+ (correction_factor < 0.05)
+ ? 0.05 : (correction_factor > 2.0) ? 2.0 : correction_factor;
+
+ return correction_factor;
+}
+
+// Given a current maxQ value sets a range for future values.
+// PGW TODO..
+// This code removes direct dependency on QIndex to determine the range
+// (now uses the actual quantizer) but has not been tuned.
+static void adjust_maxq_qrange(VP9_COMP *cpi) {
+ int i;
+ double q;
+
+ // Set the max corresponding to cpi->avg_q * 2.0
+ q = cpi->avg_q * 2.0;
+ cpi->twopass.maxq_max_limit = cpi->worst_quality;
+ for (i = cpi->best_quality; i <= cpi->worst_quality; i++) {
+ cpi->twopass.maxq_max_limit = i;
+ if (vp9_convert_qindex_to_q(i) >= q)
+ break;
+ }
+
+ // Set the min corresponding to cpi->avg_q * 0.5
+ q = cpi->avg_q * 0.5;
+ cpi->twopass.maxq_min_limit = cpi->best_quality;
+ for (i = cpi->worst_quality; i >= cpi->best_quality; i--) {
+ cpi->twopass.maxq_min_limit = i;
+ if (vp9_convert_qindex_to_q(i) <= q)
+ break;
+ }
+}
+
+static int estimate_max_q(VP9_COMP *cpi,
+ FIRSTPASS_STATS *fpstats,
+ int section_target_bandwitdh,
+ int overhead_bits) {
+ int Q;
+ int num_mbs = cpi->common.MBs;
+ int target_norm_bits_per_mb;
+
+ double section_err = (fpstats->coded_error / fpstats->count);
+ double sr_err_diff;
+ double sr_correction;
+ double err_per_mb = section_err / num_mbs;
+ double err_correction_factor;
+ double speed_correction = 1.0;
+ int overhead_bits_per_mb;
+
+ if (section_target_bandwitdh <= 0)
+ return cpi->twopass.maxq_max_limit; // Highest value allowed
+
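+  // Target bits per MB, scaled by 512 to match the (bits * 512) units
+  // returned by vp9_bits_per_mb(). The (1 << 20) test only changes the
+  // order of operations, presumably to keep the intermediate product
+  // within integer range.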
+ target_norm_bits_per_mb =
+ (section_target_bandwitdh < (1 << 20))
+ ? (512 * section_target_bandwitdh) / num_mbs
+ : 512 * (section_target_bandwitdh / num_mbs);
+
+ // Look at the drop in prediction quality between the last frame
+ // and the GF buffer (which contained an older frame).
+ sr_err_diff =
+ (fpstats->sr_coded_error - fpstats->coded_error) /
+ (fpstats->count * cpi->common.MBs);
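+  // Map the per-MB second reference error difference onto a mild
+  // correction factor, clamped to the range [0.75, 1.25] below.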
+ sr_correction = (sr_err_diff / 32.0);
+ sr_correction = pow(sr_correction, 0.25);
+ if (sr_correction < 0.75)
+ sr_correction = 0.75;
+ else if (sr_correction > 1.25)
+ sr_correction = 1.25;
+
+ // Calculate a corrective factor based on a rolling ratio of bits spent
+ // vs target bits
+ if ((cpi->rolling_target_bits > 0) &&
+ (cpi->active_worst_quality < cpi->worst_quality)) {
+ double rolling_ratio;
+
+ rolling_ratio = (double)cpi->rolling_actual_bits /
+ (double)cpi->rolling_target_bits;
+
+ if (rolling_ratio < 0.95)
+ cpi->twopass.est_max_qcorrection_factor -= 0.005;
+ else if (rolling_ratio > 1.05)
+ cpi->twopass.est_max_qcorrection_factor += 0.005;
+
+ cpi->twopass.est_max_qcorrection_factor =
+ (cpi->twopass.est_max_qcorrection_factor < 0.1)
+ ? 0.1
+ : (cpi->twopass.est_max_qcorrection_factor > 10.0)
+ ? 10.0 : cpi->twopass.est_max_qcorrection_factor;
+ }
+
+ // Corrections for higher compression speed settings
+ // (reduced compression expected)
+ if (cpi->compressor_speed == 1) {
+ if (cpi->oxcf.cpu_used <= 5)
+ speed_correction = 1.04 + (cpi->oxcf.cpu_used * 0.04);
+ else
+ speed_correction = 1.25;
+ }
+
+ // Estimate of overhead bits per mb
+ // Correction to overhead bits for min allowed Q.
+ // PGW TODO.. This code is broken for the extended Q range
+ // for now overhead set to 0.
+ overhead_bits_per_mb = overhead_bits / num_mbs;
+ overhead_bits_per_mb *= pow(0.98, (double)cpi->twopass.maxq_min_limit);
+
+ // Try and pick a max Q that will be high enough to encode the
+ // content at the given rate.
+ for (Q = cpi->twopass.maxq_min_limit; Q < cpi->twopass.maxq_max_limit; Q++) {
+ int bits_per_mb_at_this_q;
+
+ err_correction_factor =
+ calc_correction_factor(err_per_mb, ERR_DIVISOR, 0.4, 0.90, Q) *
+ sr_correction * speed_correction *
+ cpi->twopass.est_max_qcorrection_factor;
+
+ if (err_correction_factor < 0.05)
+ err_correction_factor = 0.05;
+ else if (err_correction_factor > 5.0)
+ err_correction_factor = 5.0;
+
+ bits_per_mb_at_this_q =
+ vp9_bits_per_mb(INTER_FRAME, Q) + overhead_bits_per_mb;
+
+ bits_per_mb_at_this_q = (int)(.5 + err_correction_factor *
+ (double)bits_per_mb_at_this_q);
+
+ // Mode and motion overhead
+ // As Q rises in real encode loop rd code will force overhead down
+ // We make a crude adjustment for this here as *.98 per Q step.
+ // PGW TODO.. This code is broken for the extended Q range
+ // for now overhead set to 0.
+ // overhead_bits_per_mb = (int)((double)overhead_bits_per_mb * 0.98);
+
+ if (bits_per_mb_at_this_q <= target_norm_bits_per_mb)
+ break;
+ }
+
+ // Restriction on active max q for constrained quality mode.
+ if ((cpi->oxcf.end_usage == USAGE_CONSTRAINED_QUALITY) &&
+ (Q < cpi->cq_target_quality)) {
+ Q = cpi->cq_target_quality;
+ }
+
+  // Adjust the maxq_min_limit and maxq_max_limit based on the
+  // average q observed in the clip for non kf/gf/arf frames.
+ // Give average a chance to settle though.
+ // PGW TODO.. This code is broken for the extended Q range
+ if ((cpi->ni_frames >
+ ((unsigned int)cpi->twopass.total_stats->count >> 8)) &&
+ (cpi->ni_frames > 150)) {
+ adjust_maxq_qrange(cpi);
+ }
+
+ return Q;
+}
+
+// For cq mode estimate a cq level that matches the observed
+// complexity and data rate.
+static int estimate_cq(VP9_COMP *cpi,
+ FIRSTPASS_STATS *fpstats,
+ int section_target_bandwitdh,
+ int overhead_bits) {
+ int Q;
+ int num_mbs = cpi->common.MBs;
+ int target_norm_bits_per_mb;
+
+ double section_err = (fpstats->coded_error / fpstats->count);
+ double err_per_mb = section_err / num_mbs;
+ double err_correction_factor;
+ double sr_err_diff;
+ double sr_correction;
+ double speed_correction = 1.0;
+ double clip_iiratio;
+ double clip_iifactor;
+ int overhead_bits_per_mb;
+
+
+ target_norm_bits_per_mb = (section_target_bandwitdh < (1 << 20))
+ ? (512 * section_target_bandwitdh) / num_mbs
+ : 512 * (section_target_bandwitdh / num_mbs);
+
+ // Estimate of overhead bits per mb
+ overhead_bits_per_mb = overhead_bits / num_mbs;
+
+ // Corrections for higher compression speed settings
+ // (reduced compression expected)
+ if (cpi->compressor_speed == 1) {
+ if (cpi->oxcf.cpu_used <= 5)
+ speed_correction = 1.04 + (cpi->oxcf.cpu_used * 0.04);
+ else
+ speed_correction = 1.25;
+ }
+
+ // Look at the drop in prediction quality between the last frame
+ // and the GF buffer (which contained an older frame).
+ sr_err_diff =
+ (fpstats->sr_coded_error - fpstats->coded_error) /
+ (fpstats->count * cpi->common.MBs);
+ sr_correction = (sr_err_diff / 32.0);
+ sr_correction = pow(sr_correction, 0.25);
+ if (sr_correction < 0.75)
+ sr_correction = 0.75;
+ else if (sr_correction > 1.25)
+ sr_correction = 1.25;
+
+ // II ratio correction factor for clip as a whole
+ clip_iiratio = cpi->twopass.total_stats->intra_error /
+ DOUBLE_DIVIDE_CHECK(cpi->twopass.total_stats->coded_error);
+ clip_iifactor = 1.0 - ((clip_iiratio - 10.0) * 0.025);
+ if (clip_iifactor < 0.80)
+ clip_iifactor = 0.80;
+
+ // Try and pick a Q that can encode the content at the given rate.
+ for (Q = 0; Q < MAXQ; Q++) {
+ int bits_per_mb_at_this_q;
+
+ // Error per MB based correction factor
+ err_correction_factor =
+ calc_correction_factor(err_per_mb, 100.0, 0.4, 0.90, Q) *
+ sr_correction * speed_correction * clip_iifactor;
+
+ if (err_correction_factor < 0.05)
+ err_correction_factor = 0.05;
+ else if (err_correction_factor > 5.0)
+ err_correction_factor = 5.0;
+
+ bits_per_mb_at_this_q =
+ vp9_bits_per_mb(INTER_FRAME, Q) + overhead_bits_per_mb;
+
+ bits_per_mb_at_this_q = (int)(.5 + err_correction_factor *
+ (double)bits_per_mb_at_this_q);
+
+ // Mode and motion overhead
+ // As Q rises in real encode loop rd code will force overhead down
+ // We make a crude adjustment for this here as *.98 per Q step.
+ // PGW TODO.. This code is broken for the extended Q range
+ // for now overhead set to 0.
+ overhead_bits_per_mb = (int)((double)overhead_bits_per_mb * 0.98);
+
+ if (bits_per_mb_at_this_q <= target_norm_bits_per_mb)
+ break;
+ }
+
+ // Clip value to range "best allowed to (worst allowed - 1)"
+ Q = select_cq_level(Q);
+ if (Q >= cpi->worst_quality)
+ Q = cpi->worst_quality - 1;
+ if (Q < cpi->best_quality)
+ Q = cpi->best_quality;
+
+ return Q;
+}
+
+
+extern void vp9_new_frame_rate(VP9_COMP *cpi, double framerate);
+
+void vp9_init_second_pass(VP9_COMP *cpi) {
+ FIRSTPASS_STATS this_frame;
+ FIRSTPASS_STATS *start_pos;
+
+ double lower_bounds_min_rate = FRAME_OVERHEAD_BITS * cpi->oxcf.frame_rate;
+ double two_pass_min_rate = (double)(cpi->oxcf.target_bandwidth
+ * cpi->oxcf.two_pass_vbrmin_section / 100);
+
+ if (two_pass_min_rate < lower_bounds_min_rate)
+ two_pass_min_rate = lower_bounds_min_rate;
+
+ zero_stats(cpi->twopass.total_stats);
+ zero_stats(cpi->twopass.total_left_stats);
+
+ if (!cpi->twopass.stats_in_end)
+ return;
+
+ *cpi->twopass.total_stats = *cpi->twopass.stats_in_end;
+ *cpi->twopass.total_left_stats = *cpi->twopass.total_stats;
+
+  // Each frame can have a different duration, as the frame rate in the source
+  // isn't guaranteed to be constant. The frame rate prior to the first frame
+  // encoded in the second pass is a guess. However the sum duration is not.
+  // It's calculated based on the actual durations of all frames from the
+  // first pass.
+ vp9_new_frame_rate(cpi,
+ 10000000.0 * cpi->twopass.total_stats->count /
+ cpi->twopass.total_stats->duration);
+
+ cpi->output_frame_rate = cpi->oxcf.frame_rate;
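+  // Total bit budget for the clip, less the bits implied by the two pass
+  // minimum rate. Durations here appear to be in units of 1/10,000,000
+  // of a second, matching the frame rate calculation above.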
+ cpi->twopass.bits_left = (int64_t)(cpi->twopass.total_stats->duration *
+ cpi->oxcf.target_bandwidth / 10000000.0);
+ cpi->twopass.bits_left -= (int64_t)(cpi->twopass.total_stats->duration *
+ two_pass_min_rate / 10000000.0);
+
+ // Calculate a minimum intra value to be used in determining the IIratio
+ // scores used in the second pass. We have this minimum to make sure
+ // that clips that are static but "low complexity" in the intra domain
+ // are still boosted appropriately for KF/GF/ARF
+ cpi->twopass.kf_intra_err_min = KF_MB_INTRA_MIN * cpi->common.MBs;
+ cpi->twopass.gf_intra_err_min = GF_MB_INTRA_MIN * cpi->common.MBs;
+
+ // This variable monitors how far behind the second ref update is lagging
+ cpi->twopass.sr_update_lag = 1;
+
+  // Scan the first pass file and calculate an average Intra / Inter error
+  // score ratio for the sequence.
+ {
+ double sum_iiratio = 0.0;
+ double IIRatio;
+
+ start_pos = cpi->twopass.stats_in; // Note starting "file" position
+
+ while (input_stats(cpi, &this_frame) != EOF) {
+ IIRatio = this_frame.intra_error / DOUBLE_DIVIDE_CHECK(this_frame.coded_error);
+ IIRatio = (IIRatio < 1.0) ? 1.0 : (IIRatio > 20.0) ? 20.0 : IIRatio;
+ sum_iiratio += IIRatio;
+ }
+
+ cpi->twopass.avg_iiratio = sum_iiratio / DOUBLE_DIVIDE_CHECK((double)cpi->twopass.total_stats->count);
+
+ // Reset file position
+ reset_fpf_position(cpi, start_pos);
+ }
+
+  // Scan the first pass file and calculate a modified total error based
+  // upon the bias/power function used to allocate bits.
+ {
+ start_pos = cpi->twopass.stats_in; // Note starting "file" position
+
+ cpi->twopass.modified_error_total = 0.0;
+ cpi->twopass.modified_error_used = 0.0;
+
+ while (input_stats(cpi, &this_frame) != EOF) {
+ cpi->twopass.modified_error_total += calculate_modified_err(cpi, &this_frame);
+ }
+ cpi->twopass.modified_error_left = cpi->twopass.modified_error_total;
+
+ reset_fpf_position(cpi, start_pos); // Reset file position
+
+ }
+}
+
+void vp9_end_second_pass(VP9_COMP *cpi) {
+}
+
+// This function gives an estimate of how badly we believe
+// the prediction quality is decaying from frame to frame.
+static double get_prediction_decay_rate(VP9_COMP *cpi,
+ FIRSTPASS_STATS *next_frame) {
+ double prediction_decay_rate;
+ double second_ref_decay;
+ double mb_sr_err_diff;
+
+ // Initial basis is the % mbs inter coded
+ prediction_decay_rate = next_frame->pcnt_inter;
+
+ // Look at the observed drop in prediction quality between the last frame
+ // and the GF buffer (which contains an older frame).
+ mb_sr_err_diff =
+ (next_frame->sr_coded_error - next_frame->coded_error) /
+ (cpi->common.MBs);
+ second_ref_decay = 1.0 - (mb_sr_err_diff / 512.0);
+ second_ref_decay = pow(second_ref_decay, 0.5);
+ if (second_ref_decay < 0.85)
+ second_ref_decay = 0.85;
+ else if (second_ref_decay > 1.0)
+ second_ref_decay = 1.0;
+
+ if (second_ref_decay < prediction_decay_rate)
+ prediction_decay_rate = second_ref_decay;
+
+ return prediction_decay_rate;
+}
+
+// Function to test for a condition where a complex transition is followed
+// by a static section. For example in slide shows where there is a fade
+// between slides. This is to help with more optimal kf and gf positioning.
+static int detect_transition_to_still(
+ VP9_COMP *cpi,
+ int frame_interval,
+ int still_interval,
+ double loop_decay_rate,
+ double last_decay_rate) {
+ BOOL trans_to_still = FALSE;
+
+ // Break clause to detect very still sections after motion
+ // For example a static image after a fade or other transition
+ // instead of a clean scene cut.
+ if ((frame_interval > MIN_GF_INTERVAL) &&
+ (loop_decay_rate >= 0.999) &&
+ (last_decay_rate < 0.9)) {
+ int j;
+ FIRSTPASS_STATS *position = cpi->twopass.stats_in;
+ FIRSTPASS_STATS tmp_next_frame;
+ double zz_inter;
+
+ // Look ahead a few frames to see if static condition
+ // persists...
+ for (j = 0; j < still_interval; j++) {
+ if (EOF == input_stats(cpi, &tmp_next_frame))
+ break;
+
+ zz_inter =
+ (tmp_next_frame.pcnt_inter - tmp_next_frame.pcnt_motion);
+ if (zz_inter < 0.999)
+ break;
+ }
+ // Reset file position
+ reset_fpf_position(cpi, position);
+
+ // Only if it does do we signal a transition to still
+ if (j == still_interval)
+ trans_to_still = TRUE;
+ }
+
+ return trans_to_still;
+}
+
+// This function detects a flash through the high relative pcnt_second_ref
+// score in the frame following a flash frame. The offset passed in should
+// reflect this
+static BOOL detect_flash(VP9_COMP *cpi, int offset) {
+ FIRSTPASS_STATS next_frame;
+
+ BOOL flash_detected = FALSE;
+
+ // Read the frame data.
+ // The return is FALSE (no flash detected) if not a valid frame
+ if (read_frame_stats(cpi, &next_frame, offset) != EOF) {
+ // What we are looking for here is a situation where there is a
+ // brief break in prediction (such as a flash) but subsequent frames
+ // are reasonably well predicted by an earlier (pre flash) frame.
+ // The recovery after a flash is indicated by a high pcnt_second_ref
+    // compared to pcnt_inter.
+ if ((next_frame.pcnt_second_ref > next_frame.pcnt_inter) &&
+ (next_frame.pcnt_second_ref >= 0.5)) {
+ flash_detected = TRUE;
+ }
+ }
+
+ return flash_detected;
+}
+
+// Update the motion related elements to the GF arf boost calculation
+static void accumulate_frame_motion_stats(
+ VP9_COMP *cpi,
+ FIRSTPASS_STATS *this_frame,
+ double *this_frame_mv_in_out,
+ double *mv_in_out_accumulator,
+ double *abs_mv_in_out_accumulator,
+ double *mv_ratio_accumulator) {
+ // double this_frame_mv_in_out;
+ double this_frame_mvr_ratio;
+ double this_frame_mvc_ratio;
+ double motion_pct;
+
+ // Accumulate motion stats.
+ motion_pct = this_frame->pcnt_motion;
+
+ // Accumulate Motion In/Out of frame stats
+ *this_frame_mv_in_out = this_frame->mv_in_out_count * motion_pct;
+ *mv_in_out_accumulator += this_frame->mv_in_out_count * motion_pct;
+ *abs_mv_in_out_accumulator +=
+ fabs(this_frame->mv_in_out_count * motion_pct);
+
+ // Accumulate a measure of how uniform (or conversely how random)
+ // the motion field is. (A ratio of absmv / mv)
+ if (motion_pct > 0.05) {
+ this_frame_mvr_ratio = fabs(this_frame->mvr_abs) /
+ DOUBLE_DIVIDE_CHECK(fabs(this_frame->MVr));
+
+ this_frame_mvc_ratio = fabs(this_frame->mvc_abs) /
+ DOUBLE_DIVIDE_CHECK(fabs(this_frame->MVc));
+
+ *mv_ratio_accumulator +=
+ (this_frame_mvr_ratio < this_frame->mvr_abs)
+ ? (this_frame_mvr_ratio * motion_pct)
+ : this_frame->mvr_abs * motion_pct;
+
+ *mv_ratio_accumulator +=
+ (this_frame_mvc_ratio < this_frame->mvc_abs)
+ ? (this_frame_mvc_ratio * motion_pct)
+ : this_frame->mvc_abs * motion_pct;
+
+ }
+}
+
+// Calculate a baseline boost number for the current frame.
+static double calc_frame_boost(
+ VP9_COMP *cpi,
+ FIRSTPASS_STATS *this_frame,
+ double this_frame_mv_in_out) {
+ double frame_boost;
+
+ // Underlying boost factor is based on inter intra error ratio
+ if (this_frame->intra_error > cpi->twopass.gf_intra_err_min)
+ frame_boost = (IIFACTOR * this_frame->intra_error /
+ DOUBLE_DIVIDE_CHECK(this_frame->coded_error));
+ else
+ frame_boost = (IIFACTOR * cpi->twopass.gf_intra_err_min /
+ DOUBLE_DIVIDE_CHECK(this_frame->coded_error));
+
+ // Increase boost for frames where new data coming into frame
+ // (eg zoom out). Slightly reduce boost if there is a net balance
+ // of motion out of the frame (zoom in).
+ // The range for this_frame_mv_in_out is -1.0 to +1.0
+ if (this_frame_mv_in_out > 0.0)
+ frame_boost += frame_boost * (this_frame_mv_in_out * 2.0);
+ // In extreme case boost is halved
+ else
+ frame_boost += frame_boost * (this_frame_mv_in_out / 2.0);
+
+ // Clip to maximum
+ if (frame_boost > GF_RMAX)
+ frame_boost = GF_RMAX;
+
+ return frame_boost;
+}
+
+static int calc_arf_boost(
+ VP9_COMP *cpi,
+ int offset,
+ int f_frames,
+ int b_frames,
+ int *f_boost,
+ int *b_boost) {
+ FIRSTPASS_STATS this_frame;
+
+ int i;
+ double boost_score = 0.0;
+ double mv_ratio_accumulator = 0.0;
+ double decay_accumulator = 1.0;
+ double this_frame_mv_in_out = 0.0;
+ double mv_in_out_accumulator = 0.0;
+ double abs_mv_in_out_accumulator = 0.0;
+ int arf_boost;
+ BOOL flash_detected = FALSE;
+
+ // Search forward from the proposed arf/next gf position
+ for (i = 0; i < f_frames; i++) {
+ if (read_frame_stats(cpi, &this_frame, (i + offset)) == EOF)
+ break;
+
+ // Update the motion related elements to the boost calculation
+ accumulate_frame_motion_stats(cpi, &this_frame,
+ &this_frame_mv_in_out, &mv_in_out_accumulator,
+ &abs_mv_in_out_accumulator, &mv_ratio_accumulator);
+
+    // We want to discount the flash frame itself and the recovery
+ // frame that follows as both will have poor scores.
+ flash_detected = detect_flash(cpi, (i + offset)) ||
+ detect_flash(cpi, (i + offset + 1));
+
+ // Cumulative effect of prediction quality decay
+ if (!flash_detected) {
+ decay_accumulator =
+ decay_accumulator *
+ get_prediction_decay_rate(cpi, &this_frame);
+ decay_accumulator =
+ decay_accumulator < 0.1 ? 0.1 : decay_accumulator;
+ }
+
+ boost_score += (decay_accumulator *
+ calc_frame_boost(cpi, &this_frame, this_frame_mv_in_out));
+ }
+
+  *f_boost = (int)boost_score;
+
+ // Reset for backward looking loop
+ boost_score = 0.0;
+ mv_ratio_accumulator = 0.0;
+ decay_accumulator = 1.0;
+ this_frame_mv_in_out = 0.0;
+ mv_in_out_accumulator = 0.0;
+ abs_mv_in_out_accumulator = 0.0;
+
+ // Search backward towards last gf position
+ for (i = -1; i >= -b_frames; i--) {
+ if (read_frame_stats(cpi, &this_frame, (i + offset)) == EOF)
+ break;
+
+ // Update the motion related elements to the boost calculation
+ accumulate_frame_motion_stats(cpi, &this_frame,
+ &this_frame_mv_in_out, &mv_in_out_accumulator,
+ &abs_mv_in_out_accumulator, &mv_ratio_accumulator);
+
+    // We want to discount the flash frame itself and the recovery
+ // frame that follows as both will have poor scores.
+ flash_detected = detect_flash(cpi, (i + offset)) ||
+ detect_flash(cpi, (i + offset + 1));
+
+ // Cumulative effect of prediction quality decay
+ if (!flash_detected) {
+ decay_accumulator =
+ decay_accumulator *
+ get_prediction_decay_rate(cpi, &this_frame);
+ decay_accumulator =
+ decay_accumulator < 0.1 ? 0.1 : decay_accumulator;
+ }
+
+ boost_score += (decay_accumulator *
+ calc_frame_boost(cpi, &this_frame, this_frame_mv_in_out));
+
+ }
+  *b_boost = (int)boost_score;
+
+ arf_boost = (*f_boost + *b_boost);
+ if (arf_boost < ((b_frames + f_frames) * 20))
+ arf_boost = ((b_frames + f_frames) * 20);
+
+ return arf_boost;
+}
+
+static void configure_arnr_filter(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
+ int half_gf_int;
+ int frames_after_arf;
+ int frames_bwd = cpi->oxcf.arnr_max_frames - 1;
+ int frames_fwd = cpi->oxcf.arnr_max_frames - 1;
+
+ // Define the arnr filter width for this group of frames:
+ // We only filter frames that lie within a distance of half
+ // the GF interval from the ARF frame. We also have to trap
+ // cases where the filter extends beyond the end of clip.
+ // Note: this_frame->frame has been updated in the loop
+ // so it now points at the ARF frame.
+ half_gf_int = cpi->baseline_gf_interval >> 1;
+ frames_after_arf = cpi->twopass.total_stats->count -
+ this_frame->frame - 1;
+
+ switch (cpi->oxcf.arnr_type) {
+ case 1: // Backward filter
+ frames_fwd = 0;
+ if (frames_bwd > half_gf_int)
+ frames_bwd = half_gf_int;
+ break;
+
+ case 2: // Forward filter
+ if (frames_fwd > half_gf_int)
+ frames_fwd = half_gf_int;
+ if (frames_fwd > frames_after_arf)
+ frames_fwd = frames_after_arf;
+ frames_bwd = 0;
+ break;
+
+ case 3: // Centered filter
+ default:
+ frames_fwd >>= 1;
+ if (frames_fwd > frames_after_arf)
+ frames_fwd = frames_after_arf;
+ if (frames_fwd > half_gf_int)
+ frames_fwd = half_gf_int;
+
+ frames_bwd = frames_fwd;
+
+ // For even length filter there is one more frame backward
+ // than forward: e.g. len=6 ==> bbbAff, len=7 ==> bbbAfff.
+ if (frames_bwd < half_gf_int)
+ frames_bwd += (cpi->oxcf.arnr_max_frames + 1) & 0x1;
+ break;
+ }
+
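+  // Total filter length: backward frames, plus the arf frame itself,
+  // plus forward frames.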
+ cpi->active_arnr_frames = frames_bwd + 1 + frames_fwd;
+}
+
+// Analyse and define a gf/arf group.
+static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
+ FIRSTPASS_STATS next_frame;
+ FIRSTPASS_STATS *start_pos;
+ int i;
+ double boost_score = 0.0;
+ double old_boost_score = 0.0;
+ double gf_group_err = 0.0;
+ double gf_first_frame_err = 0.0;
+ double mod_frame_err = 0.0;
+
+ double mv_ratio_accumulator = 0.0;
+ double decay_accumulator = 1.0;
+ double zero_motion_accumulator = 1.0;
+
+ double loop_decay_rate = 1.00; // Starting decay rate
+ double last_loop_decay_rate = 1.00;
+
+ double this_frame_mv_in_out = 0.0;
+ double mv_in_out_accumulator = 0.0;
+ double abs_mv_in_out_accumulator = 0.0;
+
+ int max_bits = frame_max_bits(cpi); // Max for a single frame
+
+ unsigned int allow_alt_ref =
+ cpi->oxcf.play_alternate && cpi->oxcf.lag_in_frames;
+
+ int f_boost = 0;
+ int b_boost = 0;
+ BOOL flash_detected;
+
+ cpi->twopass.gf_group_bits = 0;
+
+ vp9_clear_system_state(); // __asm emms;
+
+ start_pos = cpi->twopass.stats_in;
+
+ vpx_memset(&next_frame, 0, sizeof(next_frame)); // assure clean
+
+ // Load stats for the current frame.
+ mod_frame_err = calculate_modified_err(cpi, this_frame);
+
+  // Note the error of the frame at the start of the group (this will be
+  // the GF frame error if we code a normal gf).
+ gf_first_frame_err = mod_frame_err;
+
+ // Special treatment if the current frame is a key frame (which is also
+ // a gf). If it is then its error score (and hence bit allocation) need
+ // to be subtracted out from the calculation for the GF group
+ if (cpi->common.frame_type == KEY_FRAME)
+ gf_group_err -= gf_first_frame_err;
+
+ // Scan forward to try and work out how many frames the next gf group
+ // should contain and what level of boost is appropriate for the GF
+ // or ARF that will be coded with the group
+ i = 0;
+
+ while (((i < cpi->twopass.static_scene_max_gf_interval) ||
+ ((cpi->twopass.frames_to_key - i) < MIN_GF_INTERVAL)) &&
+ (i < cpi->twopass.frames_to_key)) {
+ i++; // Increment the loop counter
+
+ // Accumulate error score of frames in this gf group
+ mod_frame_err = calculate_modified_err(cpi, this_frame);
+ gf_group_err += mod_frame_err;
+
+ if (EOF == input_stats(cpi, &next_frame))
+ break;
+
+ // Test for the case where there is a brief flash but the prediction
+ // quality back to an earlier frame is then restored.
+ flash_detected = detect_flash(cpi, 0);
+
+ // Update the motion related elements to the boost calculation
+ accumulate_frame_motion_stats(cpi, &next_frame,
+ &this_frame_mv_in_out, &mv_in_out_accumulator,
+ &abs_mv_in_out_accumulator, &mv_ratio_accumulator);
+
+ // Cumulative effect of prediction quality decay
+ if (!flash_detected) {
+ last_loop_decay_rate = loop_decay_rate;
+ loop_decay_rate = get_prediction_decay_rate(cpi, &next_frame);
+ decay_accumulator = decay_accumulator * loop_decay_rate;
+
+ // Monitor for static sections.
+ if ((next_frame.pcnt_inter - next_frame.pcnt_motion) <
+ zero_motion_accumulator) {
+ zero_motion_accumulator =
+ (next_frame.pcnt_inter - next_frame.pcnt_motion);
+ }
+
+ // Break clause to detect very still sections after motion
+      // (for example a static image after a fade or other transition).
+ if (detect_transition_to_still(cpi, i, 5, loop_decay_rate,
+ last_loop_decay_rate)) {
+ allow_alt_ref = FALSE;
+ break;
+ }
+ }
+
+ // Calculate a boost number for this frame
+ boost_score +=
+ (decay_accumulator *
+ calc_frame_boost(cpi, &next_frame, this_frame_mv_in_out));
+
+ // Break out conditions.
+ if (
+ // Break at cpi->max_gf_interval unless almost totally static
+ (i >= cpi->max_gf_interval && (zero_motion_accumulator < 0.995)) ||
+ (
+ // Dont break out with a very short interval
+ (i > MIN_GF_INTERVAL) &&
+ // Dont break out very close to a key frame
+ ((cpi->twopass.frames_to_key - i) >= MIN_GF_INTERVAL) &&
+ ((boost_score > 125.0) || (next_frame.pcnt_inter < 0.75)) &&
+ (!flash_detected) &&
+ ((mv_ratio_accumulator > 100.0) ||
+ (abs_mv_in_out_accumulator > 3.0) ||
+ (mv_in_out_accumulator < -2.0) ||
+ ((boost_score - old_boost_score) < 12.5))
+ )) {
+ boost_score = old_boost_score;
+ break;
+ }
+
+ vpx_memcpy(this_frame, &next_frame, sizeof(*this_frame));
+
+ old_boost_score = boost_score;
+ }
+
+  // Don't allow a gf too near the next kf
+ if ((cpi->twopass.frames_to_key - i) < MIN_GF_INTERVAL) {
+ while (i < cpi->twopass.frames_to_key) {
+ i++;
+
+ if (EOF == input_stats(cpi, this_frame))
+ break;
+
+ if (i < cpi->twopass.frames_to_key) {
+ mod_frame_err = calculate_modified_err(cpi, this_frame);
+ gf_group_err += mod_frame_err;
+ }
+ }
+ }
+
+ // Set the interval till the next gf or arf.
+ cpi->baseline_gf_interval = i;
+
+  // Should we use the alternate reference frame?
+ if (allow_alt_ref &&
+ (i < cpi->oxcf.lag_in_frames) &&
+ (i >= MIN_GF_INTERVAL) &&
+      // don't use ARF very near the next kf
+ (i <= (cpi->twopass.frames_to_key - MIN_GF_INTERVAL)) &&
+ ((next_frame.pcnt_inter > 0.75) ||
+ (next_frame.pcnt_second_ref > 0.5)) &&
+ ((mv_in_out_accumulator / (double)i > -0.2) ||
+ (mv_in_out_accumulator > -2.0)) &&
+ (boost_score > 100)) {
+    // Alternative boost calculation for alt ref
+ cpi->gfu_boost = calc_arf_boost(cpi, 0, (i - 1), (i - 1), &f_boost, &b_boost);
+ cpi->source_alt_ref_pending = TRUE;
+
+ configure_arnr_filter(cpi, this_frame);
+ } else {
+ cpi->gfu_boost = (int)boost_score;
+ cpi->source_alt_ref_pending = FALSE;
+ }
+
+ // Now decide how many bits should be allocated to the GF group as a
+ // proportion of those remaining in the kf group.
+ // The final key frame group in the clip is treated as a special case
+ // where cpi->twopass.kf_group_bits is tied to cpi->twopass.bits_left.
+ // This is also important for short clips where there may only be one
+ // key frame.
+ if (cpi->twopass.frames_to_key >= (int)(cpi->twopass.total_stats->count -
+ cpi->common.current_video_frame)) {
+ cpi->twopass.kf_group_bits =
+ (cpi->twopass.bits_left > 0) ? cpi->twopass.bits_left : 0;
+ }
+
+ // Calculate the bits to be allocated to the group as a whole
+ if ((cpi->twopass.kf_group_bits > 0) &&
+ (cpi->twopass.kf_group_error_left > 0)) {
+ cpi->twopass.gf_group_bits =
+ (int)((double)cpi->twopass.kf_group_bits *
+ (gf_group_err / (double)cpi->twopass.kf_group_error_left));
+ } else
+ cpi->twopass.gf_group_bits = 0;
+
+ cpi->twopass.gf_group_bits =
+ (cpi->twopass.gf_group_bits < 0)
+ ? 0
+ : (cpi->twopass.gf_group_bits > cpi->twopass.kf_group_bits)
+ ? cpi->twopass.kf_group_bits : cpi->twopass.gf_group_bits;
+
+ // Clip cpi->twopass.gf_group_bits based on user supplied data rate
+ // variability limit (cpi->oxcf.two_pass_vbrmax_section)
+ if (cpi->twopass.gf_group_bits > max_bits * cpi->baseline_gf_interval)
+ cpi->twopass.gf_group_bits = max_bits * cpi->baseline_gf_interval;
+
+ // Reset the file position
+ reset_fpf_position(cpi, start_pos);
+
+ // Update the record of error used so far (only done once per gf group)
+ cpi->twopass.modified_error_used += gf_group_err;
+
+ // Assign bits to the arf or gf.
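+  // The loop below runs once for a normal gf, or twice (first for the arf,
+  // then for the gf) when an alt ref frame is pending.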
+ for (i = 0; i <= (cpi->source_alt_ref_pending && cpi->common.frame_type != KEY_FRAME); i++) {
+ int boost;
+ int allocation_chunks;
+ int Q = (cpi->oxcf.fixed_q < 0) ? cpi->last_q[INTER_FRAME] : cpi->oxcf.fixed_q;
+ int gf_bits;
+
+ boost = (cpi->gfu_boost * vp9_gfboost_qadjust(Q)) / 100;
+
+ // Set max and minimum boost and hence minimum allocation
+ if (boost > ((cpi->baseline_gf_interval + 1) * 200))
+ boost = ((cpi->baseline_gf_interval + 1) * 200);
+ else if (boost < 125)
+ boost = 125;
+
+ if (cpi->source_alt_ref_pending && i == 0)
+ allocation_chunks =
+ ((cpi->baseline_gf_interval + 1) * 100) + boost;
+ else
+ allocation_chunks =
+ (cpi->baseline_gf_interval * 100) + (boost - 100);
+
+ // Prevent overflow
+ if (boost > 1028) {
+ int divisor = boost >> 10;
+ boost /= divisor;
+ allocation_chunks /= divisor;
+ }
+
+ // Calculate the number of bits to be spent on the gf or arf based on
+ // the boost number
+ gf_bits = (int)((double)boost *
+ (cpi->twopass.gf_group_bits /
+ (double)allocation_chunks));
+
+ // If the frame that is to be boosted is simpler than the average for
+ // the gf/arf group then use an alternative calculation
+ // based on the error score of the frame itself
+ if (mod_frame_err < gf_group_err / (double)cpi->baseline_gf_interval) {
+ double alt_gf_grp_bits;
+ int alt_gf_bits;
+
+ alt_gf_grp_bits =
+ (double)cpi->twopass.kf_group_bits *
+ (mod_frame_err * (double)cpi->baseline_gf_interval) /
+ DOUBLE_DIVIDE_CHECK((double)cpi->twopass.kf_group_error_left);
+
+ alt_gf_bits = (int)((double)boost * (alt_gf_grp_bits /
+ (double)allocation_chunks));
+
+ if (gf_bits > alt_gf_bits) {
+ gf_bits = alt_gf_bits;
+ }
+ }
+ // Else if it is harder than other frames in the group make sure it at
+ // least receives an allocation in keeping with its relative error
+ // score, otherwise it may be worse off than an "un-boosted" frame
+ else {
+ int alt_gf_bits =
+ (int)((double)cpi->twopass.kf_group_bits *
+ mod_frame_err /
+ DOUBLE_DIVIDE_CHECK((double)cpi->twopass.kf_group_error_left));
+
+ if (alt_gf_bits > gf_bits) {
+ gf_bits = alt_gf_bits;
+ }
+ }
+
+ // Dont allow a negative value for gf_bits
+ if (gf_bits < 0)
+ gf_bits = 0;
+
+ gf_bits += cpi->min_frame_bandwidth; // Add in minimum for a frame
+
+ if (i == 0) {
+ cpi->twopass.gf_bits = gf_bits;
+ }
+ if (i == 1 || (!cpi->source_alt_ref_pending && (cpi->common.frame_type != KEY_FRAME))) {
+ cpi->per_frame_bandwidth = gf_bits; // Per frame bit target for this frame
+ }
+ }
+
+ {
+    // Adjust KF group bits and error remaining
+ cpi->twopass.kf_group_error_left -= gf_group_err;
+ cpi->twopass.kf_group_bits -= cpi->twopass.gf_group_bits;
+
+ if (cpi->twopass.kf_group_bits < 0)
+ cpi->twopass.kf_group_bits = 0;
+
+ // Note the error score left in the remaining frames of the group.
+ // For normal GFs we want to remove the error score for the first frame
+ // of the group (except in Key frame case where this has already
+ // happened)
+ if (!cpi->source_alt_ref_pending && cpi->common.frame_type != KEY_FRAME)
+ cpi->twopass.gf_group_error_left = gf_group_err - gf_first_frame_err;
+ else
+ cpi->twopass.gf_group_error_left = gf_group_err;
+
+ cpi->twopass.gf_group_bits -= cpi->twopass.gf_bits - cpi->min_frame_bandwidth;
+
+ if (cpi->twopass.gf_group_bits < 0)
+ cpi->twopass.gf_group_bits = 0;
+
+ // This condition could fail if there are two kfs very close together
+    // despite (MIN_GF_INTERVAL) and would cause a divide by 0 in the
+ // calculation of cpi->twopass.alt_extra_bits.
+ if (cpi->baseline_gf_interval >= 3) {
+ int boost = (cpi->source_alt_ref_pending)
+ ? b_boost : cpi->gfu_boost;
+
+ if (boost >= 150) {
+ int pct_extra;
+
+ pct_extra = (boost - 100) / 50;
+ pct_extra = (pct_extra > 20) ? 20 : pct_extra;
+
+ cpi->twopass.alt_extra_bits =
+ (cpi->twopass.gf_group_bits * pct_extra) / 100;
+ cpi->twopass.gf_group_bits -= cpi->twopass.alt_extra_bits;
+ cpi->twopass.alt_extra_bits /=
+ ((cpi->baseline_gf_interval - 1) >> 1);
+ } else
+ cpi->twopass.alt_extra_bits = 0;
+ } else
+ cpi->twopass.alt_extra_bits = 0;
+ }
+
+ if (cpi->common.frame_type != KEY_FRAME) {
+ FIRSTPASS_STATS sectionstats;
+
+    zero_stats(&sectionstats);
+ reset_fpf_position(cpi, start_pos);
+
+ for (i = 0; i < cpi->baseline_gf_interval; i++) {
+ input_stats(cpi, &next_frame);
+      accumulate_stats(&sectionstats, &next_frame);
+ }
+
+    avg_stats(&sectionstats);
+
+ cpi->twopass.section_intra_rating =
+ sectionstats.intra_error /
+ DOUBLE_DIVIDE_CHECK(sectionstats.coded_error);
+
+ reset_fpf_position(cpi, start_pos);
+ }
+}
+
+// Allocate bits to a normal frame that is neither a gf, an arf, nor a key frame.
+static void assign_std_frame_bits(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
+ int target_frame_size; // gf_group_error_left
+
+ double modified_err;
+ double err_fraction; // What portion of the remaining GF group error is used by this frame
+
+ int max_bits = frame_max_bits(cpi); // Max for a single frame
+
+ // Calculate modified prediction error used in bit allocation
+ modified_err = calculate_modified_err(cpi, this_frame);
+
+ if (cpi->twopass.gf_group_error_left > 0)
+ err_fraction = modified_err / cpi->twopass.gf_group_error_left; // What portion of the remaining GF group error is used by this frame
+ else
+ err_fraction = 0.0;
+
+ target_frame_size = (int)((double)cpi->twopass.gf_group_bits * err_fraction); // How many of those bits available for allocation should we give it?
+
+  // Clip the target size to the range 0 - max_bits
+  // (or cpi->twopass.gf_group_bits at the top end).
+ if (target_frame_size < 0)
+ target_frame_size = 0;
+ else {
+ if (target_frame_size > max_bits)
+ target_frame_size = max_bits;
+
+ if (target_frame_size > cpi->twopass.gf_group_bits)
+ target_frame_size = cpi->twopass.gf_group_bits;
+ }
+
+ cpi->twopass.gf_group_error_left -= modified_err; // Adjust error remaining
+ cpi->twopass.gf_group_bits -= target_frame_size; // Adjust bits remaining
+
+ if (cpi->twopass.gf_group_bits < 0)
+ cpi->twopass.gf_group_bits = 0;
+
+ target_frame_size += cpi->min_frame_bandwidth; // Add in the minimum number of bits that is set aside for every frame.
+
+
+ cpi->per_frame_bandwidth = target_frame_size; // Per frame bit target for this frame
+}
+
+// Make a damped adjustment to the active max q.
+static int adjust_active_maxq(int old_maxqi, int new_maxqi) {
+ int i;
+ int ret_val = new_maxqi;
+ double old_q;
+ double new_q;
+ double target_q;
+
+ old_q = vp9_convert_qindex_to_q(old_maxqi);
+ new_q = vp9_convert_qindex_to_q(new_maxqi);
+
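+  // Move only 1/8 of the way from the old value towards the new one,
+  // working in real Q terms rather than q index terms.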
+ target_q = ((old_q * 7.0) + new_q) / 8.0;
+
+ if (target_q > old_q) {
+ for (i = old_maxqi; i <= new_maxqi; i++) {
+ if (vp9_convert_qindex_to_q(i) >= target_q) {
+ ret_val = i;
+ break;
+ }
+ }
+ } else {
+ for (i = old_maxqi; i >= new_maxqi; i--) {
+ if (vp9_convert_qindex_to_q(i) <= target_q) {
+ ret_val = i;
+ break;
+ }
+ }
+ }
+
+ return ret_val;
+}
+
+void vp9_second_pass(VP9_COMP *cpi) {
+ int tmp_q;
+ int frames_left = (int)(cpi->twopass.total_stats->count - cpi->common.current_video_frame);
+
+ FIRSTPASS_STATS this_frame;
+ FIRSTPASS_STATS this_frame_copy;
+
+ double this_frame_error;
+ double this_frame_intra_error;
+ double this_frame_coded_error;
+
+ FIRSTPASS_STATS *start_pos;
+
+ int overhead_bits;
+
+ if (!cpi->twopass.stats_in) {
+ return;
+ }
+
+ vp9_clear_system_state();
+
+ vpx_memset(&this_frame, 0, sizeof(FIRSTPASS_STATS));
+
+ if (EOF == input_stats(cpi, &this_frame))
+ return;
+
+ this_frame_error = this_frame.ssim_weighted_pred_err;
+ this_frame_intra_error = this_frame.intra_error;
+ this_frame_coded_error = this_frame.coded_error;
+
+ start_pos = cpi->twopass.stats_in;
+
+ // keyframe and section processing !
+ if (cpi->twopass.frames_to_key == 0) {
+ // Define next KF group and assign bits to it
+ vpx_memcpy(&this_frame_copy, &this_frame, sizeof(this_frame));
+ find_next_key_frame(cpi, &this_frame_copy);
+ }
+
+ // Is this a GF / ARF (Note that a KF is always also a GF)
+ if (cpi->frames_till_gf_update_due == 0) {
+ // Define next gf group and assign bits to it
+ vpx_memcpy(&this_frame_copy, &this_frame, sizeof(this_frame));
+ define_gf_group(cpi, &this_frame_copy);
+
+    // If we are going to code an altref frame at the end of the group
+    // and the current frame is not a key frame...
+    // If the previous group used an arf this frame has already benefited
+    // from that arf boost and it should not be given extra bits.
+    // If the previous group was NOT coded using arf we may want to apply
+    // some boost to this GF as well.
+ if (cpi->source_alt_ref_pending && (cpi->common.frame_type != KEY_FRAME)) {
+      // Assign a standard frame's worth of bits from those allocated
+      // to the GF group.
+ int bak = cpi->per_frame_bandwidth;
+ vpx_memcpy(&this_frame_copy, &this_frame, sizeof(this_frame));
+ assign_std_frame_bits(cpi, &this_frame_copy);
+ cpi->per_frame_bandwidth = bak;
+ }
+ }
+
+ // Otherwise this is an ordinary frame
+ else {
+ // Assign bits from those allocated to the GF group
+ vpx_memcpy(&this_frame_copy, &this_frame, sizeof(this_frame));
+ assign_std_frame_bits(cpi, &this_frame_copy);
+ }
+
+ // Keep a globally available copy of this and the next frame's iiratio.
+ cpi->twopass.this_iiratio = this_frame_intra_error /
+ DOUBLE_DIVIDE_CHECK(this_frame_coded_error);
+ {
+ FIRSTPASS_STATS next_frame;
+ if (lookup_next_frame_stats(cpi, &next_frame) != EOF) {
+ cpi->twopass.next_iiratio = next_frame.intra_error /
+ DOUBLE_DIVIDE_CHECK(next_frame.coded_error);
+ }
+ }
+
+ // Set nominal per second bandwidth for this frame
+ cpi->target_bandwidth = cpi->per_frame_bandwidth * cpi->output_frame_rate;
+ if (cpi->target_bandwidth < 0)
+ cpi->target_bandwidth = 0;
+
+
+ // Account for mv, mode and other overheads.
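+  // Note: estimate_modemvcost() currently returns 0 (see the TODO there),
+  // so overhead_bits does not yet influence the estimates below.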
+ overhead_bits = estimate_modemvcost(
+ cpi, cpi->twopass.total_left_stats);
+
+ // Special case code for first frame.
+ if (cpi->common.current_video_frame == 0) {
+ cpi->twopass.est_max_qcorrection_factor = 1.0;
+
+ // Set a cq_level in constrained quality mode.
+ if (cpi->oxcf.end_usage == USAGE_CONSTRAINED_QUALITY) {
+ int est_cq;
+
+ est_cq =
+ estimate_cq(cpi,
+ cpi->twopass.total_left_stats,
+ (int)(cpi->twopass.bits_left / frames_left),
+ overhead_bits);
+
+ cpi->cq_target_quality = cpi->oxcf.cq_level;
+ if (est_cq > cpi->cq_target_quality)
+ cpi->cq_target_quality = est_cq;
+ }
+
+ // guess at maxq needed in 2nd pass
+ cpi->twopass.maxq_max_limit = cpi->worst_quality;
+ cpi->twopass.maxq_min_limit = cpi->best_quality;
+
+ tmp_q = estimate_max_q(
+ cpi,
+ cpi->twopass.total_left_stats,
+ (int)(cpi->twopass.bits_left / frames_left),
+ overhead_bits);
+
+ cpi->active_worst_quality = tmp_q;
+ cpi->ni_av_qi = tmp_q;
+ cpi->avg_q = vp9_convert_qindex_to_q(tmp_q);
+
+ // Limit the maxq value returned subsequently.
+ // This increases the risk of overspend or underspend if the initial
+ // estimate for the clip is bad, but helps prevent excessive
+ // variation in Q, especially near the end of a clip
+ // where for example a small overspend may cause Q to crash
+ adjust_maxq_qrange(cpi);
+ }
+
+  // The last few frames of a clip almost always have too few or too many
+  // bits and for the sake of overly exact rate control we don't want to make
+ // radical adjustments to the allowed quantizer range just to use up a
+ // few surplus bits or get beneath the target rate.
+ else if ((cpi->common.current_video_frame <
+ (((unsigned int)cpi->twopass.total_stats->count * 255) >> 8)) &&
+ ((cpi->common.current_video_frame + cpi->baseline_gf_interval) <
+ (unsigned int)cpi->twopass.total_stats->count)) {
+ if (frames_left < 1)
+ frames_left = 1;
+
+ tmp_q = estimate_max_q(
+ cpi,
+ cpi->twopass.total_left_stats,
+ (int)(cpi->twopass.bits_left / frames_left),
+ overhead_bits);
+
+ // Make a damped adjustment to active max Q
+ cpi->active_worst_quality =
+ adjust_active_maxq(cpi->active_worst_quality, tmp_q);
+ }
+
+ cpi->twopass.frames_to_key--;
+
+  // Update the total stats remaining structure
+ subtract_stats(cpi->twopass.total_left_stats, &this_frame);
+}
+
+
+static BOOL test_candidate_kf(VP9_COMP *cpi,
+                              FIRSTPASS_STATS *last_frame,
+                              FIRSTPASS_STATS *this_frame,
+                              FIRSTPASS_STATS *next_frame) {
+ BOOL is_viable_kf = FALSE;
+
+ // Does the frame satisfy the primary criteria of a key frame
+ // If so, then examine how well it predicts subsequent frames
+ if ((this_frame->pcnt_second_ref < 0.10) &&
+ (next_frame->pcnt_second_ref < 0.10) &&
+ ((this_frame->pcnt_inter < 0.05) ||
+ (
+ ((this_frame->pcnt_inter - this_frame->pcnt_neutral) < .35) &&
+ ((this_frame->intra_error / DOUBLE_DIVIDE_CHECK(this_frame->coded_error)) < 2.5) &&
+ ((fabs(last_frame->coded_error - this_frame->coded_error) / DOUBLE_DIVIDE_CHECK(this_frame->coded_error) > .40) ||
+ (fabs(last_frame->intra_error - this_frame->intra_error) / DOUBLE_DIVIDE_CHECK(this_frame->intra_error) > .40) ||
+ ((next_frame->intra_error / DOUBLE_DIVIDE_CHECK(next_frame->coded_error)) > 3.5)
+ )
+ )
+ )
+ ) {
+ int i;
+ FIRSTPASS_STATS *start_pos;
+
+ FIRSTPASS_STATS local_next_frame;
+
+ double boost_score = 0.0;
+ double old_boost_score = 0.0;
+ double decay_accumulator = 1.0;
+ double next_iiratio;
+
+ vpx_memcpy(&local_next_frame, next_frame, sizeof(*next_frame));
+
+ // Note the starting file position so we can reset to it
+ start_pos = cpi->twopass.stats_in;
+
+ // Examine how well the key frame predicts subsequent frames
+ for (i = 0; i < 16; i++) {
+ next_iiratio = (IIKFACTOR1 * local_next_frame.intra_error / DOUBLE_DIVIDE_CHECK(local_next_frame.coded_error));
+
+ if (next_iiratio > RMAX)
+ next_iiratio = RMAX;
+
+ // Cumulative effect of decay in prediction quality
+ if (local_next_frame.pcnt_inter > 0.85)
+ decay_accumulator = decay_accumulator * local_next_frame.pcnt_inter;
+ else
+ decay_accumulator = decay_accumulator * ((0.85 + local_next_frame.pcnt_inter) / 2.0);
+
+ // decay_accumulator = decay_accumulator * local_next_frame.pcnt_inter;
+
+ // Keep a running total
+ boost_score += (decay_accumulator * next_iiratio);
+
+ // Test various breakout clauses
+ if ((local_next_frame.pcnt_inter < 0.05) ||
+ (next_iiratio < 1.5) ||
+ (((local_next_frame.pcnt_inter -
+ local_next_frame.pcnt_neutral) < 0.20) &&
+ (next_iiratio < 3.0)) ||
+ ((boost_score - old_boost_score) < 3.0) ||
+ (local_next_frame.intra_error < 200)
+ ) {
+ break;
+ }
+
+ old_boost_score = boost_score;
+
+ // Get the next frame details
+ if (EOF == input_stats(cpi, &local_next_frame))
+ break;
+ }
+
+    // If there is tolerable prediction for at least the next 3 frames
+    // then break out, else discard this potential key frame and move on.
+ if (boost_score > 30.0 && (i > 3))
+ is_viable_kf = TRUE;
+ else {
+ // Reset the file position
+ reset_fpf_position(cpi, start_pos);
+
+ is_viable_kf = FALSE;
+ }
+ }
+
+ return is_viable_kf;
+}
+
+static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
+ int i, j;
+ FIRSTPASS_STATS last_frame;
+ FIRSTPASS_STATS first_frame;
+ FIRSTPASS_STATS next_frame;
+ FIRSTPASS_STATS *start_position;
+
+ double decay_accumulator = 1.0;
+ double zero_motion_accumulator = 1.0;
+ double boost_score = 0;
+ double old_boost_score = 0.0;
+ double loop_decay_rate;
+
+ double kf_mod_err = 0.0;
+ double kf_group_err = 0.0;
+ double kf_group_intra_err = 0.0;
+ double kf_group_coded_err = 0.0;
+ double recent_loop_decay[8] = {1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0};
+
+ vpx_memset(&next_frame, 0, sizeof(next_frame)); // assure clean
+
+ vp9_clear_system_state(); // __asm emms;
+ start_position = cpi->twopass.stats_in;
+
+ cpi->common.frame_type = KEY_FRAME;
+
+ // is this a forced key frame by interval
+ cpi->this_key_frame_forced = cpi->next_key_frame_forced;
+
+ // Clear the alt ref active flag as this can never be active on a key frame
+ cpi->source_alt_ref_active = FALSE;
+
+ // Kf is always a gf so clear frames till next gf counter
+ cpi->frames_till_gf_update_due = 0;
+
+ cpi->twopass.frames_to_key = 1;
+
+ // Take a copy of the initial frame details
+ vpx_memcpy(&first_frame, this_frame, sizeof(*this_frame));
+
+  cpi->twopass.kf_group_bits = 0;        // Total bits available to kf group
+ cpi->twopass.kf_group_error_left = 0; // Group modified error score.
+
+ kf_mod_err = calculate_modified_err(cpi, this_frame);
+
+ // find the next keyframe
+ i = 0;
+ while (cpi->twopass.stats_in < cpi->twopass.stats_in_end) {
+ // Accumulate kf group error
+ kf_group_err += calculate_modified_err(cpi, this_frame);
+
+    // These figures keep intra and coded error counts for all frames
+    // including key frames in the group. The effect of the key frame itself
+    // can be subtracted out using the first_frame data collected above.
+ kf_group_intra_err += this_frame->intra_error;
+ kf_group_coded_err += this_frame->coded_error;
+
+    // Load the next frame's stats
+ vpx_memcpy(&last_frame, this_frame, sizeof(*this_frame));
+ input_stats(cpi, this_frame);
+
+ // Provided that we are not at the end of the file...
+ if (cpi->oxcf.auto_key
+ && lookup_next_frame_stats(cpi, &next_frame) != EOF) {
+ // Normal scene cut check
+ if (test_candidate_kf(cpi, &last_frame, this_frame, &next_frame)) {
+ break;
+ }
+
+ // How fast is prediction quality decaying
+ loop_decay_rate = get_prediction_decay_rate(cpi, &next_frame);
+
+ // We want to know something about the recent past... rather than
+      // as used elsewhere where we are concerned with decay in prediction
+ // quality since the last GF or KF.
+ recent_loop_decay[i % 8] = loop_decay_rate;
+ decay_accumulator = 1.0;
+ for (j = 0; j < 8; j++) {
+ decay_accumulator = decay_accumulator * recent_loop_decay[j];
+ }
+
+      // Special check for a transition or high motion followed by a
+      // static scene.
+ if (detect_transition_to_still(cpi, i,
+ (cpi->key_frame_frequency - i),
+ loop_decay_rate,
+ decay_accumulator)) {
+ break;
+ }
+
+
+ // Step on to the next frame
+ cpi->twopass.frames_to_key++;
+
+      // If we don't have a real key frame within the next two
+      // key frame frequency intervals then break out of the loop.
+ if (cpi->twopass.frames_to_key >= 2 * (int)cpi->key_frame_frequency)
+ break;
+ } else
+ cpi->twopass.frames_to_key++;
+
+ i++;
+ }
+
+ // If there is a max kf interval set by the user we must obey it.
+ // We already breakout of the loop above at 2x max.
+ // This code centers the extra kf if the actual natural
+ // interval is between 1x and 2x
+ if (cpi->oxcf.auto_key
+ && cpi->twopass.frames_to_key > (int)cpi->key_frame_frequency) {
+ FIRSTPASS_STATS *current_pos = cpi->twopass.stats_in;
+ FIRSTPASS_STATS tmp_frame;
+
+ cpi->twopass.frames_to_key /= 2;
+
+ // Copy first frame details
+ vpx_memcpy(&tmp_frame, &first_frame, sizeof(first_frame));
+
+ // Reset to the start of the group
+ reset_fpf_position(cpi, start_position);
+
+ kf_group_err = 0;
+ kf_group_intra_err = 0;
+ kf_group_coded_err = 0;
+
+ // Rescan to get the correct error data for the forced kf group
+ for (i = 0; i < cpi->twopass.frames_to_key; i++) {
+ // Accumulate kf group errors
+ kf_group_err += calculate_modified_err(cpi, &tmp_frame);
+ kf_group_intra_err += tmp_frame.intra_error;
+ kf_group_coded_err += tmp_frame.coded_error;
+
+      // Load the next frame's stats
+ input_stats(cpi, &tmp_frame);
+ }
+
+ // Reset to the start of the group
+ reset_fpf_position(cpi, current_pos);
+
+ cpi->next_key_frame_forced = TRUE;
+ } else
+ cpi->next_key_frame_forced = FALSE;
+
+ // Special case for the last frame of the file
+ if (cpi->twopass.stats_in >= cpi->twopass.stats_in_end) {
+ // Accumulate kf group error
+ kf_group_err += calculate_modified_err(cpi, this_frame);
+
+    // These figures keep intra and coded error counts for all frames
+    // including key frames in the group. The effect of the key frame itself
+    // can be subtracted out using the first_frame data collected above.
+ kf_group_intra_err += this_frame->intra_error;
+ kf_group_coded_err += this_frame->coded_error;
+ }
+
+ // Calculate the number of bits that should be assigned to the kf group.
+ if ((cpi->twopass.bits_left > 0) && (cpi->twopass.modified_error_left > 0.0)) {
+ // Max for a single normal frame (not key frame)
+ int max_bits = frame_max_bits(cpi);
+
+ // Maximum bits for the kf group
+ int64_t max_grp_bits;
+
+ // Default allocation based on bits left and relative
+ // complexity of the section
+ cpi->twopass.kf_group_bits = (int64_t)(cpi->twopass.bits_left *
+ (kf_group_err /
+ cpi->twopass.modified_error_left));
+
+ // Clip based on maximum per frame rate defined by the user.
+ max_grp_bits = (int64_t)max_bits * (int64_t)cpi->twopass.frames_to_key;
+ if (cpi->twopass.kf_group_bits > max_grp_bits)
+ cpi->twopass.kf_group_bits = max_grp_bits;
+ } else
+ cpi->twopass.kf_group_bits = 0;
+
+ // Reset the first pass file position
+ reset_fpf_position(cpi, start_position);
+
+  // Determine how big to make this keyframe based on how well the
+  // subsequent frames use inter blocks.
+ decay_accumulator = 1.0;
+ boost_score = 0.0;
+ loop_decay_rate = 1.00; // Starting decay rate
+
+ for (i = 0; i < cpi->twopass.frames_to_key; i++) {
+ double r;
+
+ if (EOF == input_stats(cpi, &next_frame))
+ break;
+
+ if (next_frame.intra_error > cpi->twopass.kf_intra_err_min)
+ r = (IIKFACTOR2 * next_frame.intra_error /
+ DOUBLE_DIVIDE_CHECK(next_frame.coded_error));
+ else
+ r = (IIKFACTOR2 * cpi->twopass.kf_intra_err_min /
+ DOUBLE_DIVIDE_CHECK(next_frame.coded_error));
+
+ if (r > RMAX)
+ r = RMAX;
+
+ // Monitor for static sections.
+ if ((next_frame.pcnt_inter - next_frame.pcnt_motion) <
+ zero_motion_accumulator) {
+ zero_motion_accumulator =
+ (next_frame.pcnt_inter - next_frame.pcnt_motion);
+ }
+
+ // How fast is prediction quality decaying
+ if (!detect_flash(cpi, 0)) {
+ loop_decay_rate = get_prediction_decay_rate(cpi, &next_frame);
+ decay_accumulator = decay_accumulator * loop_decay_rate;
+ decay_accumulator = decay_accumulator < 0.1 ? 0.1 : decay_accumulator;
+ }
+
+ boost_score += (decay_accumulator * r);
+
+ if ((i > MIN_GF_INTERVAL) &&
+ ((boost_score - old_boost_score) < 6.25)) {
+ break;
+ }
+
+ old_boost_score = boost_score;
+ }
+
+ {
+ FIRSTPASS_STATS sectionstats;
+
+    zero_stats(&sectionstats);
+ reset_fpf_position(cpi, start_position);
+
+ for (i = 0; i < cpi->twopass.frames_to_key; i++) {
+ input_stats(cpi, &next_frame);
+      accumulate_stats(&sectionstats, &next_frame);
+ }
+
+    avg_stats(&sectionstats);
+
+ cpi->twopass.section_intra_rating =
+ sectionstats.intra_error
+ / DOUBLE_DIVIDE_CHECK(sectionstats.coded_error);
+ }
+
+ // Reset the first pass file position
+ reset_fpf_position(cpi, start_position);
+
+ // Work out how many bits to allocate for the key frame itself
+ if (1) {
+    int kf_boost = (int)boost_score;
+ int allocation_chunks;
+ int alt_kf_bits;
+
+ if (kf_boost < 300) {
+ kf_boost += (cpi->twopass.frames_to_key * 3);
+ if (kf_boost > 300)
+ kf_boost = 300;
+ }
+
+ if (kf_boost < 250) // Min KF boost
+ kf_boost = 250;
+
+ // Make a note of baseline boost and the zero motion
+ // accumulator value for use elsewhere.
+ cpi->kf_boost = kf_boost;
+ cpi->kf_zeromotion_pct = (int)(zero_motion_accumulator * 100.0);
+
+ // We do three calculations for kf size.
+ // The first is based on the error score for the whole kf group.
+    // The second (optionally) on the key frame's own error if this is
+    // smaller than the average for the group.
+    // The final one ensures that the frame receives at least the
+    // allocation it would have received based on its own error score vs
+    // the error score remaining.
+    // Special case if the sequence appears almost totally static.
+ // In this case we want to spend almost all of the bits on the
+ // key frame.
+ // cpi->twopass.frames_to_key-1 because key frame itself is taken
+ // care of by kf_boost.
+ if (zero_motion_accumulator >= 0.99) {
+ allocation_chunks =
+ ((cpi->twopass.frames_to_key - 1) * 10) + kf_boost;
+ } else {
+ allocation_chunks =
+ ((cpi->twopass.frames_to_key - 1) * 100) + kf_boost;
+ }
+
+ // Prevent overflow
+ if (kf_boost > 1028) {
+ int divisor = kf_boost >> 10;
+ kf_boost /= divisor;
+ allocation_chunks /= divisor;
+ }
+
+ cpi->twopass.kf_group_bits = (cpi->twopass.kf_group_bits < 0) ? 0 : cpi->twopass.kf_group_bits;
+
+ // Calculate the number of bits to be spent on the key frame
+ cpi->twopass.kf_bits = (int)((double)kf_boost * ((double)cpi->twopass.kf_group_bits / (double)allocation_chunks));
+
+ // If the key frame is actually easier than the average for the
+ // kf group (which does sometimes happen, e.g. a blank intro frame)
+ // then use an alternate calculation based on the kf error score
+ // which should give a smaller key frame.
+ if (kf_mod_err < kf_group_err / cpi->twopass.frames_to_key) {
+ double alt_kf_grp_bits =
+ ((double)cpi->twopass.bits_left *
+ (kf_mod_err * (double)cpi->twopass.frames_to_key) /
+ DOUBLE_DIVIDE_CHECK(cpi->twopass.modified_error_left));
+
+ alt_kf_bits = (int)((double)kf_boost *
+ (alt_kf_grp_bits / (double)allocation_chunks));
+
+ if (cpi->twopass.kf_bits > alt_kf_bits) {
+ cpi->twopass.kf_bits = alt_kf_bits;
+ }
+ }
+ // Else if it is much harder than other frames in the group make sure
+ // it at least receives an allocation in keeping with its relative
+ // error score
+ else {
+ alt_kf_bits =
+ (int)((double)cpi->twopass.bits_left *
+ (kf_mod_err /
+ DOUBLE_DIVIDE_CHECK(cpi->twopass.modified_error_left)));
+
+ if (alt_kf_bits > cpi->twopass.kf_bits) {
+ cpi->twopass.kf_bits = alt_kf_bits;
+ }
+ }
+
+ cpi->twopass.kf_group_bits -= cpi->twopass.kf_bits;
+ cpi->twopass.kf_bits += cpi->min_frame_bandwidth; // Add in the minimum frame allowance
+
+ cpi->per_frame_bandwidth = cpi->twopass.kf_bits; // Per frame bit target for this frame
+ cpi->target_bandwidth = cpi->twopass.kf_bits * cpi->output_frame_rate; // Convert to a per second bitrate
+ }
+
+ // Note the total error score of the kf group minus the key frame itself
+ cpi->twopass.kf_group_error_left = (int)(kf_group_err - kf_mod_err);
+
+ // Adjust the count of total modified error left.
+ // The count of bits left is adjusted elsewhere based on real coded frame sizes
+ cpi->twopass.modified_error_left -= kf_group_err;
+}
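+
+#if 0
+/* Illustrative sketch only; not part of this change. It restates the
+ * key-frame bit allocation arithmetic above with made-up numbers so the
+ * proportions are easy to see. frames_to_key, kf_boost and kf_group_bits
+ * below are hypothetical values, not defaults.
+ */
+static int example_kf_bit_allocation(void) {
+  const int frames_to_key = 50;            /* frames in the kf group */
+  const int kf_boost = 300;                /* boost after clamping to >= 250 */
+  const int64_t kf_group_bits = 2000000;   /* bits budgeted for the group */
+
+  /* Non-static case: each ordinary frame counts as 100 chunks and the key
+   * frame counts as kf_boost chunks: (50 - 1) * 100 + 300 = 5200. */
+  const int allocation_chunks = (frames_to_key - 1) * 100 + kf_boost;
+
+  /* The key frame receives kf_boost of those chunks:
+   * 300 * (2000000 / 5200) ~= 115384 bits, i.e. roughly 5.8% of the group
+   * budget, before the minimum frame allowance is added. */
+  return (int)((double)kf_boost *
+               ((double)kf_group_bits / (double)allocation_chunks));
+}
+#endif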
--- /dev/null
+++ b/vp9/encoder/firstpass.h
@@ -1,0 +1,23 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#if !defined __INC_FIRSTPASS_H
+#define __INC_FIRSTPASS_H
+
+extern void vp9_init_first_pass(VP9_COMP *cpi);
+extern void vp9_first_pass(VP9_COMP *cpi);
+extern void vp9_end_first_pass(VP9_COMP *cpi);
+
+extern void vp9_init_second_pass(VP9_COMP *cpi);
+extern void vp9_second_pass(VP9_COMP *cpi);
+extern void vp9_end_second_pass(VP9_COMP *cpi);
+
+#endif
--- /dev/null
+++ b/vp9/encoder/generic/csystemdependent.c
@@ -1,0 +1,48 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "vpx_ports/config.h"
+#include "vp9/encoder/variance.h"
+#include "vp9/encoder/onyx_int.h"
+
+
+void vp9_arch_x86_encoder_init(VP9_COMP *cpi);
+void vp9_arch_arm_encoder_init(VP9_COMP *cpi);
+
+void (*vp9_yv12_copy_partial_frame_ptr)(YV12_BUFFER_CONFIG *src_ybc,
+ YV12_BUFFER_CONFIG *dst_ybc,
+ int fraction);
+extern void vp9_yv12_copy_partial_frame(YV12_BUFFER_CONFIG *src_ybc,
+ YV12_BUFFER_CONFIG *dst_ybc,
+ int fraction);
+
+void vp9_cmachine_specific_config(VP9_COMP *cpi) {
+#if CONFIG_RUNTIME_CPU_DETECT
+ cpi->rtcd.common = &cpi->common.rtcd;
+
+ cpi->rtcd.search.full_search = vp9_full_search_sad;
+ cpi->rtcd.search.refining_search = vp9_refining_search_sad;
+ cpi->rtcd.search.diamond_search = vp9_diamond_search_sad;
+ cpi->rtcd.temporal.apply = vp9_temporal_filter_apply_c;
+#endif
+
+ vp9_yv12_copy_partial_frame_ptr = vp9_yv12_copy_partial_frame;
+
+#if ARCH_X86 || ARCH_X86_64
+ vp9_arch_x86_encoder_init(cpi);
+#endif
+
+#if ARCH_ARM
+ vp9_arch_arm_encoder_init(cpi);
+#endif
+
+
+}
--- /dev/null
+++ b/vp9/encoder/lookahead.c
@@ -1,0 +1,191 @@
+/*
+ * Copyright (c) 2011 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+#include <assert.h>
+#include <stdlib.h>
+#include "vpx_config.h"
+#include "lookahead.h"
+#include "vp9/common/extend.h"
+
+#define MAX_LAG_BUFFERS 25
+
+struct lookahead_ctx {
+ unsigned int max_sz; /* Absolute size of the queue */
+ unsigned int sz; /* Number of buffers currently in the queue */
+ unsigned int read_idx; /* Read index */
+ unsigned int write_idx; /* Write index */
+ struct lookahead_entry *buf; /* Buffer list */
+};
+
+
+/* Return the buffer at the given absolute index and increment the index */
+static struct lookahead_entry *
+pop(struct lookahead_ctx *ctx,
+ unsigned int *idx) {
+ unsigned int index = *idx;
+ struct lookahead_entry *buf = ctx->buf + index;
+
+ assert(index < ctx->max_sz);
+ if (++index >= ctx->max_sz)
+ index -= ctx->max_sz;
+ *idx = index;
+ return buf;
+}
+
+
+void
+vp9_lookahead_destroy(struct lookahead_ctx *ctx) {
+ if (ctx) {
+ if (ctx->buf) {
+ int i;
+
+ for (i = 0; i < ctx->max_sz; i++)
+ vp8_yv12_de_alloc_frame_buffer(&ctx->buf[i].img);
+ free(ctx->buf);
+ }
+ free(ctx);
+ }
+}
+
+
+struct lookahead_ctx *
+vp9_lookahead_init(unsigned int width,
+ unsigned int height,
+ unsigned int depth) {
+ struct lookahead_ctx *ctx = NULL;
+ int i;
+
+ /* Clamp the lookahead queue depth */
+ if (depth < 1)
+ depth = 1;
+ else if (depth > MAX_LAG_BUFFERS)
+ depth = MAX_LAG_BUFFERS;
+
+ /* Align the buffer dimensions */
+ width = (width + 15) &~15;
+ height = (height + 15) &~15;
+
+ /* Allocate the lookahead structures */
+ ctx = calloc(1, sizeof(*ctx));
+ if (ctx) {
+ ctx->max_sz = depth;
+ ctx->buf = calloc(depth, sizeof(*ctx->buf));
+ if (!ctx->buf)
+ goto bail;
+ for (i = 0; i < depth; i++)
+ if (vp8_yv12_alloc_frame_buffer(&ctx->buf[i].img,
+ width, height, VP8BORDERINPIXELS))
+ goto bail;
+ }
+ return ctx;
+bail:
+ vp9_lookahead_destroy(ctx);
+ return NULL;
+}
+
+
+int
+vp9_lookahead_push(struct lookahead_ctx *ctx,
+ YV12_BUFFER_CONFIG *src,
+ int64_t ts_start,
+ int64_t ts_end,
+ unsigned int flags,
+ unsigned char *active_map) {
+ struct lookahead_entry *buf;
+ int row, col, active_end;
+ int mb_rows = (src->y_height + 15) >> 4;
+ int mb_cols = (src->y_width + 15) >> 4;
+
+ if (ctx->sz + 1 > ctx->max_sz)
+ return 1;
+ ctx->sz++;
+ buf = pop(ctx, &ctx->write_idx);
+
+ // Only do this partial copy if the following conditions are all met:
+ // 1. Lookahead queue has a size of 1.
+ // 2. Active map is provided.
+ // 3. This is not a key frame, golden nor altref frame.
+ if (ctx->max_sz == 1 && active_map && !flags) {
+ for (row = 0; row < mb_rows; ++row) {
+ col = 0;
+
+ while (1) {
+ // Find the first active macroblock in this row.
+ for (; col < mb_cols; ++col) {
+ if (active_map[col])
+ break;
+ }
+
+ // No more active macroblocks in this row.
+ if (col == mb_cols)
+ break;
+
+ // Find the end of active region in this row.
+ active_end = col;
+
+ for (; active_end < mb_cols; ++active_end) {
+ if (!active_map[active_end])
+ break;
+ }
+
+ // Only copy this active region.
+ vp9_copy_and_extend_frame_with_rect(src, &buf->img,
+ row << 4,
+ col << 4, 16,
+ (active_end - col) << 4);
+
+ // Start again from the end of this active region.
+ col = active_end;
+ }
+
+ active_map += mb_cols;
+ }
+ } else {
+ vp9_copy_and_extend_frame(src, &buf->img);
+ }
+ buf->ts_start = ts_start;
+ buf->ts_end = ts_end;
+ buf->flags = flags;
+ return 0;
+}
+
+
+struct lookahead_entry *
+vp9_lookahead_pop(struct lookahead_ctx *ctx,
+ int drain) {
+ struct lookahead_entry *buf = NULL;
+
+ if (ctx->sz && (drain || ctx->sz == ctx->max_sz)) {
+ buf = pop(ctx, &ctx->read_idx);
+ ctx->sz--;
+ }
+ return buf;
+}
+
+
+struct lookahead_entry *
+vp9_lookahead_peek(struct lookahead_ctx *ctx,
+ int index) {
+ struct lookahead_entry *buf = NULL;
+
+ assert(index < ctx->max_sz);
+ if (index < ctx->sz) {
+ index += ctx->read_idx;
+ if (index >= ctx->max_sz)
+ index -= ctx->max_sz;
+ buf = ctx->buf + index;
+ }
+ return buf;
+}
+
+
+unsigned int
+vp9_lookahead_depth(struct lookahead_ctx *ctx) {
+ return ctx->sz;
+}
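+
+#if 0
+/* Illustrative sketch only; not part of this change. Minimal usage of the
+ * queue above: frames are pushed as they arrive and only come back out once
+ * the queue has filled to its configured depth, or when draining at the end
+ * of the stream. read_raw_frame() and encode_frame() are hypothetical
+ * caller-side helpers, not part of the library.
+ */
+static void example_lookahead_usage(void) {
+  struct lookahead_ctx *q = vp9_lookahead_init(640, 480, 8);
+  YV12_BUFFER_CONFIG raw;
+  struct lookahead_entry *e;
+  int64_t ts = 0;
+  int eos = 0;
+
+  if (!q)
+    return;
+
+  while (!eos) {
+    eos = read_raw_frame(&raw);                          /* hypothetical */
+    if (!eos)
+      vp9_lookahead_push(q, &raw, ts, ts + 1, 0, NULL);  /* deep copies raw */
+
+    /* Pop returns NULL until the queue is full; at end of stream the drain
+     * flag forces the remaining frames out. */
+    while ((e = vp9_lookahead_pop(q, eos)) != NULL)
+      encode_frame(&e->img, e->ts_start, e->ts_end);     /* hypothetical */
+    ts++;
+  }
+  vp9_lookahead_destroy(q);
+}
+#endif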
--- /dev/null
+++ b/vp9/encoder/lookahead.h
@@ -1,0 +1,105 @@
+/*
+ * Copyright (c) 2011 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+#ifndef LOOKAHEAD_H
+#define LOOKAHEAD_H
+#include "vpx_scale/yv12config.h"
+#include "vpx/vpx_integer.h"
+
+struct lookahead_entry {
+ YV12_BUFFER_CONFIG img;
+ int64_t ts_start;
+ int64_t ts_end;
+ unsigned int flags;
+};
+
+
+struct lookahead_ctx;
+
+/**\brief Initializes the lookahead stage
+ *
+ * The lookahead stage is a queue of frame buffers on which some analysis
+ * may be done when buffers are enqueued.
+ *
+ *
+ */
+struct lookahead_ctx *vp9_lookahead_init(unsigned int width,
+ unsigned int height,
+ unsigned int depth
+ );
+
+
+/**\brief Destroys the lookahead stage
+ *
+ */
+void vp9_lookahead_destroy(struct lookahead_ctx *ctx);
+
+
+/**\brief Enqueue a source buffer
+ *
+ * This function will copy the source image into a new framebuffer with
+ * the expected stride/border.
+ *
+ * If active_map is non-NULL and there is only one frame in the queue, then copy
+ * only active macroblocks.
+ *
+ * \param[in] ctx Pointer to the lookahead context
+ * \param[in] src Pointer to the image to enqueue
+ * \param[in] ts_start Timestamp for the start of this frame
+ * \param[in] ts_end Timestamp for the end of this frame
+ * \param[in] flags Flags set on this frame
+ * \param[in] active_map Map that specifies which macroblock is active
+ */
+int
+vp9_lookahead_push(struct lookahead_ctx *ctx,
+ YV12_BUFFER_CONFIG *src,
+ int64_t ts_start,
+ int64_t ts_end,
+ unsigned int flags,
+ unsigned char *active_map);
+
+
+/**\brief Get the next source buffer to encode
+ *
+ *
+ * \param[in] ctx Pointer to the lookahead context
+ * \param[in] drain Flag indicating the buffer should be drained
+ * (return a buffer regardless of the current queue depth)
+ *
+ * \retval NULL, if drain set and queue is empty
+ * \retval NULL, if drain not set and queue not of the configured depth
+ *
+ */
+struct lookahead_entry *
+vp9_lookahead_pop(struct lookahead_ctx *ctx,
+ int drain);
+
+
+/**\brief Get a future source buffer to encode
+ *
+ * \param[in] ctx Pointer to the lookahead context
+ * \param[in] index Index of the frame to be returned, 0 == next frame
+ *
+ * \retval NULL, if no buffer exists at the specified index
+ *
+ */
+struct lookahead_entry *
+vp9_lookahead_peek(struct lookahead_ctx *ctx,
+ int index);
+
+
+/**\brief Get the number of frames currently in the lookahead queue
+ *
+ * \param[in] ctx Pointer to the lookahead context
+ */
+unsigned int
+vp9_lookahead_depth(struct lookahead_ctx *ctx);
+
+
+#endif
--- /dev/null
+++ b/vp9/encoder/mbgraph.c
@@ -1,0 +1,480 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <limits.h>
+#include <vp9/encoder/encodeintra.h>
+#include <vp9/encoder/rdopt.h>
+#include <vp9/common/setupintrarecon.h>
+#include <vp9/common/blockd.h>
+#include <vp9/common/reconinter.h>
+#include <vp9/common/systemdependent.h>
+#include <vpx_mem/vpx_mem.h>
+#include <vp9/encoder/segmentation.h>
+
+static unsigned int do_16x16_motion_iteration(VP9_COMP *cpi,
+ int_mv *ref_mv,
+ int_mv *dst_mv) {
+ MACROBLOCK *const x = &cpi->mb;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ BLOCK *b = &x->block[0];
+ BLOCKD *d = &xd->block[0];
+ vp9_variance_fn_ptr_t v_fn_ptr = cpi->fn_ptr[BLOCK_16X16];
+ unsigned int best_err;
+ int step_param, further_steps;
+
+ int tmp_col_min = x->mv_col_min;
+ int tmp_col_max = x->mv_col_max;
+ int tmp_row_min = x->mv_row_min;
+ int tmp_row_max = x->mv_row_max;
+ int_mv ref_full;
+
+ // Further step/diamond searches as necessary
+ if (cpi->Speed < 8) {
+ step_param = cpi->sf.first_step + ((cpi->Speed > 5) ? 1 : 0);
+ further_steps = (cpi->sf.max_step_search_steps - 1) - step_param;
+ } else {
+ step_param = cpi->sf.first_step + 2;
+ further_steps = 0;
+ }
+
+ vp9_clamp_mv_min_max(x, ref_mv);
+
+ ref_full.as_mv.col = ref_mv->as_mv.col >> 3;
+ ref_full.as_mv.row = ref_mv->as_mv.row >> 3;
+
+ /*cpi->sf.search_method == HEX*/
+ best_err = vp9_hex_search(
+ x, b, d,
+ &ref_full, dst_mv,
+ step_param,
+ x->errorperbit,
+ &v_fn_ptr,
+ NULLMVCOST,
+ NULLMVCOST,
+ ref_mv);
+
+ // Try sub-pixel MC
+ // if (bestsme > error_thresh && bestsme < INT_MAX)
+ {
+ int distortion;
+ unsigned int sse;
+ best_err = cpi->find_fractional_mv_step(
+ x, b, d,
+ dst_mv, ref_mv,
+ x->errorperbit, &v_fn_ptr,
+ NULLMVCOST,
+ &distortion, &sse);
+ }
+
+#if CONFIG_PRED_FILTER
+ // Disable the prediction filter
+ xd->mode_info_context->mbmi.pred_filter_enabled = 0;
+#endif
+
+ vp9_set_mbmode_and_mvs(x, NEWMV, dst_mv);
+ vp9_build_1st_inter16x16_predictors_mby(xd, xd->predictor, 16, 0);
+ best_err = vp9_sad16x16(xd->dst.y_buffer, xd->dst.y_stride,
+ xd->predictor, 16, INT_MAX);
+
+ /* restore UMV window */
+ x->mv_col_min = tmp_col_min;
+ x->mv_col_max = tmp_col_max;
+ x->mv_row_min = tmp_row_min;
+ x->mv_row_max = tmp_row_max;
+
+ return best_err;
+}
+
+static int do_16x16_motion_search
+(
+ VP9_COMP *cpi,
+ int_mv *ref_mv,
+ int_mv *dst_mv,
+ YV12_BUFFER_CONFIG *buf,
+ int buf_mb_y_offset,
+ YV12_BUFFER_CONFIG *ref,
+ int mb_y_offset
+) {
+ MACROBLOCK *const x = &cpi->mb;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ unsigned int err, tmp_err;
+ int_mv tmp_mv;
+ int n;
+
+ for (n = 0; n < 16; n++) {
+ BLOCKD *d = &xd->block[n];
+ BLOCK *b = &x->block[n];
+
+ b->base_src = &buf->y_buffer;
+ b->src_stride = buf->y_stride;
+ b->src = buf->y_stride * (n & 12) + (n & 3) * 4 + buf_mb_y_offset;
+
+ d->base_pre = &ref->y_buffer;
+ d->pre_stride = ref->y_stride;
+ d->pre = ref->y_stride * (n & 12) + (n & 3) * 4 + mb_y_offset;
+ }
+
+ // Try zero MV first
+ // FIXME should really use something like near/nearest MV and/or MV prediction
+ xd->pre.y_buffer = ref->y_buffer + mb_y_offset;
+ xd->pre.y_stride = ref->y_stride;
+ err = vp9_sad16x16(ref->y_buffer + mb_y_offset, ref->y_stride,
+ xd->dst.y_buffer, xd->dst.y_stride, INT_MAX);
+ dst_mv->as_int = 0;
+
+ // Test last reference frame using the previous best mv as the
+ // starting point (best reference) for the search
+ tmp_err = do_16x16_motion_iteration(cpi, ref_mv, &tmp_mv);
+ if (tmp_err < err) {
+ err = tmp_err;
+ dst_mv->as_int = tmp_mv.as_int;
+ }
+
+ // If the current best reference mv is not centred on 0,0 then do a 0,0 based search as well
+ if (ref_mv->as_int) {
+ int tmp_err;
+ int_mv zero_ref_mv, tmp_mv;
+
+ zero_ref_mv.as_int = 0;
+ tmp_err = do_16x16_motion_iteration(cpi, &zero_ref_mv, &tmp_mv);
+ if (tmp_err < err) {
+ dst_mv->as_int = tmp_mv.as_int;
+ err = tmp_err;
+ }
+ }
+
+ return err;
+}
+
+static int do_16x16_zerozero_search
+(
+ VP9_COMP *cpi,
+ int_mv *dst_mv,
+ YV12_BUFFER_CONFIG *buf,
+ int buf_mb_y_offset,
+ YV12_BUFFER_CONFIG *ref,
+ int mb_y_offset
+) {
+ MACROBLOCK *const x = &cpi->mb;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ unsigned int err;
+ int n;
+
+ for (n = 0; n < 16; n++) {
+ BLOCKD *d = &xd->block[n];
+ BLOCK *b = &x->block[n];
+
+ b->base_src = &buf->y_buffer;
+ b->src_stride = buf->y_stride;
+ b->src = buf->y_stride * (n & 12) + (n & 3) * 4 + buf_mb_y_offset;
+
+ d->base_pre = &ref->y_buffer;
+ d->pre_stride = ref->y_stride;
+ d->pre = ref->y_stride * (n & 12) + (n & 3) * 4 + mb_y_offset;
+ }
+
+ // Try zero MV first
+ // FIXME should really use something like near/nearest MV and/or MV prediction
+ xd->pre.y_buffer = ref->y_buffer + mb_y_offset;
+ xd->pre.y_stride = ref->y_stride;
+ // VARIANCE_INVOKE(&cpi->rtcd.variance, satd16x16)
+ err = vp9_sad16x16(ref->y_buffer + mb_y_offset, ref->y_stride,
+ xd->dst.y_buffer, xd->dst.y_stride, INT_MAX);
+
+ dst_mv->as_int = 0;
+
+ return err;
+}
+static int find_best_16x16_intra
+(
+ VP9_COMP *cpi,
+ YV12_BUFFER_CONFIG *buf,
+ int mb_y_offset,
+ MB_PREDICTION_MODE *pbest_mode
+) {
+ MACROBLOCK *const x = &cpi->mb;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_PREDICTION_MODE best_mode = -1, mode;
+ int best_err = INT_MAX;
+
+ // calculate SATD for each intra prediction mode;
+ // we're intentionally not doing 4x4, we just want a rough estimate
+ for (mode = DC_PRED; mode <= TM_PRED; mode++) {
+ unsigned int err;
+
+ xd->mode_info_context->mbmi.mode = mode;
+ vp9_build_intra_predictors_mby(xd);
+ err = vp9_sad16x16(xd->predictor, 16, buf->y_buffer + mb_y_offset,
+ buf->y_stride, best_err);
+ // find best
+ if (err < best_err) {
+ best_err = err;
+ best_mode = mode;
+ }
+ }
+
+ if (pbest_mode)
+ *pbest_mode = best_mode;
+
+ return best_err;
+}
+
+static void update_mbgraph_mb_stats
+(
+ VP9_COMP *cpi,
+ MBGRAPH_MB_STATS *stats,
+ YV12_BUFFER_CONFIG *buf,
+ int mb_y_offset,
+ YV12_BUFFER_CONFIG *golden_ref,
+ int_mv *prev_golden_ref_mv,
+ int gld_y_offset,
+ YV12_BUFFER_CONFIG *alt_ref,
+ int_mv *prev_alt_ref_mv,
+ int arf_y_offset
+) {
+ MACROBLOCK *const x = &cpi->mb;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ int intra_error;
+
+ // FIXME in practice we're completely ignoring chroma here
+ xd->dst.y_buffer = buf->y_buffer + mb_y_offset;
+
+ // do intra 16x16 prediction
+ intra_error = find_best_16x16_intra(cpi, buf, mb_y_offset, &stats->ref[INTRA_FRAME].m.mode);
+ if (intra_error <= 0)
+ intra_error = 1;
+ stats->ref[INTRA_FRAME].err = intra_error;
+
+ // Golden frame MV search, if it exists and is different than last frame
+ if (golden_ref) {
+ int g_motion_error = do_16x16_motion_search(cpi, prev_golden_ref_mv,
+ &stats->ref[GOLDEN_FRAME].m.mv,
+ buf, mb_y_offset,
+ golden_ref, gld_y_offset);
+ stats->ref[GOLDEN_FRAME].err = g_motion_error;
+ } else {
+ stats->ref[GOLDEN_FRAME].err = INT_MAX;
+ stats->ref[GOLDEN_FRAME].m.mv.as_int = 0;
+ }
+
+ // Alt-ref frame MV search, if it exists and is different than last/golden frame
+ if (alt_ref) {
+ // int a_motion_error = do_16x16_motion_search(cpi, prev_alt_ref_mv,
+ // &stats->ref[ALTREF_FRAME].m.mv,
+ // buf, mb_y_offset,
+ // alt_ref, arf_y_offset);
+
+ int a_motion_error =
+ do_16x16_zerozero_search(cpi,
+ &stats->ref[ALTREF_FRAME].m.mv,
+ buf, mb_y_offset,
+ alt_ref, arf_y_offset);
+
+ stats->ref[ALTREF_FRAME].err = a_motion_error;
+ } else {
+ stats->ref[ALTREF_FRAME].err = INT_MAX;
+ stats->ref[ALTREF_FRAME].m.mv.as_int = 0;
+ }
+}
+
+static void update_mbgraph_frame_stats
+(
+ VP9_COMP *cpi,
+ MBGRAPH_FRAME_STATS *stats,
+ YV12_BUFFER_CONFIG *buf,
+ YV12_BUFFER_CONFIG *golden_ref,
+ YV12_BUFFER_CONFIG *alt_ref
+) {
+ MACROBLOCK *const x = &cpi->mb;
+ VP9_COMMON *const cm = &cpi->common;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ int mb_col, mb_row, offset = 0;
+ int mb_y_offset = 0, arf_y_offset = 0, gld_y_offset = 0;
+ int_mv arf_top_mv, gld_top_mv;
+ MODE_INFO mi_local;
+
+ // Set up limit values for motion vectors to prevent them extending outside the UMV borders
+ arf_top_mv.as_int = 0;
+ gld_top_mv.as_int = 0;
+ x->mv_row_min = -(VP8BORDERINPIXELS - 16 - INTERP_EXTEND);
+ x->mv_row_max = (cm->mb_rows - 1) * 16 + VP8BORDERINPIXELS - 16 - INTERP_EXTEND;
+ xd->up_available = 0;
+ xd->dst.y_stride = buf->y_stride;
+ xd->pre.y_stride = buf->y_stride;
+ xd->dst.uv_stride = buf->uv_stride;
+ xd->mode_info_context = &mi_local;
+
+ for (mb_row = 0; mb_row < cm->mb_rows; mb_row++) {
+ int_mv arf_left_mv, gld_left_mv;
+ int mb_y_in_offset = mb_y_offset;
+ int arf_y_in_offset = arf_y_offset;
+ int gld_y_in_offset = gld_y_offset;
+
+ // Set up limit values for motion vectors to prevent them extending outside the UMV borders
+ arf_left_mv.as_int = arf_top_mv.as_int;
+ gld_left_mv.as_int = gld_top_mv.as_int;
+ x->mv_col_min = -(VP8BORDERINPIXELS - 16 - INTERP_EXTEND);
+ x->mv_col_max = (cm->mb_cols - 1) * 16 + VP8BORDERINPIXELS - 16 - INTERP_EXTEND;
+ xd->left_available = 0;
+
+ for (mb_col = 0; mb_col < cm->mb_cols; mb_col++) {
+ MBGRAPH_MB_STATS *mb_stats = &stats->mb_stats[offset + mb_col];
+
+ update_mbgraph_mb_stats(cpi, mb_stats, buf, mb_y_in_offset,
+ golden_ref, &gld_left_mv, gld_y_in_offset,
+ alt_ref, &arf_left_mv, arf_y_in_offset);
+ arf_left_mv.as_int = mb_stats->ref[ALTREF_FRAME].m.mv.as_int;
+ gld_left_mv.as_int = mb_stats->ref[GOLDEN_FRAME].m.mv.as_int;
+ if (mb_col == 0) {
+ arf_top_mv.as_int = arf_left_mv.as_int;
+ gld_top_mv.as_int = gld_left_mv.as_int;
+ }
+ xd->left_available = 1;
+ mb_y_in_offset += 16;
+ gld_y_in_offset += 16;
+ arf_y_in_offset += 16;
+ x->mv_col_min -= 16;
+ x->mv_col_max -= 16;
+ }
+ xd->up_available = 1;
+ mb_y_offset += buf->y_stride * 16;
+ gld_y_offset += golden_ref->y_stride * 16;
+ if (alt_ref)
+ arf_y_offset += alt_ref->y_stride * 16;
+ x->mv_row_min -= 16;
+ x->mv_row_max -= 16;
+ offset += cm->mb_cols;
+ }
+}
+
+// void separate_arf_mbs_byzz
+static void separate_arf_mbs(VP9_COMP *cpi) {
+ VP9_COMMON *const cm = &cpi->common;
+ int mb_col, mb_row, offset, i;
+ int ncnt[4];
+ int n_frames = cpi->mbgraph_n_frames;
+
+ int *arf_not_zz;
+
+ CHECK_MEM_ERROR(arf_not_zz,
+ vpx_calloc(cm->mb_rows * cm->mb_cols * sizeof(*arf_not_zz), 1));
+
+ vpx_memset(arf_not_zz, 0, cm->mb_rows * cm->mb_cols * sizeof(*arf_not_zz));
+
+ // We are not interested in results beyond the alt ref itself.
+ if (n_frames > cpi->frames_till_gf_update_due)
+ n_frames = cpi->frames_till_gf_update_due;
+
+ // defer cost to reference frames
+ for (i = n_frames - 1; i >= 0; i--) {
+ MBGRAPH_FRAME_STATS *frame_stats = &cpi->mbgraph_stats[i];
+
+ for (offset = 0, mb_row = 0; mb_row < cm->mb_rows;
+ offset += cm->mb_cols, mb_row++) {
+ for (mb_col = 0; mb_col < cm->mb_cols; mb_col++) {
+ MBGRAPH_MB_STATS *mb_stats =
+ &frame_stats->mb_stats[offset + mb_col];
+
+ int altref_err = mb_stats->ref[ALTREF_FRAME].err;
+ int intra_err = mb_stats->ref[INTRA_FRAME ].err;
+ int golden_err = mb_stats->ref[GOLDEN_FRAME].err;
+
+ // Test for altref vs intra and gf and that its mv was 0,0.
+ if ((altref_err > 1000) ||
+ (altref_err > intra_err) ||
+ (altref_err > golden_err)) {
+ arf_not_zz[offset + mb_col]++;
+ }
+ }
+ }
+ }
+
+ vpx_memset(ncnt, 0, sizeof(ncnt));
+ for (offset = 0, mb_row = 0; mb_row < cm->mb_rows;
+ offset += cm->mb_cols, mb_row++) {
+ for (mb_col = 0; mb_col < cm->mb_cols; mb_col++) {
+ // If any of the blocks in the sequence failed then the MB
+ // goes in segment 0
+ if (arf_not_zz[offset + mb_col]) {
+ ncnt[0]++;
+ cpi->segmentation_map[offset + mb_col] = 0;
+ } else {
+ ncnt[1]++;
+ cpi->segmentation_map[offset + mb_col] = 1;
+ }
+ }
+ }
+
+ // Only bother with segmentation if over 10% of the MBs in static segment
+ // if ( ncnt[1] && (ncnt[0] / ncnt[1] < 10) )
+ if (1) {
+ // Note % of blocks that are marked as static
+ if (cm->MBs)
+ cpi->static_mb_pct = (ncnt[1] * 100) / cm->MBs;
+
+ // This error case should not be reachable as this function should
+ // never be called with the common data structure uninitialized.
+ else
+ cpi->static_mb_pct = 0;
+
+ cpi->seg0_cnt = ncnt[0];
+ vp9_enable_segmentation((VP9_PTR) cpi);
+ } else {
+ cpi->static_mb_pct = 0;
+ vp9_disable_segmentation((VP9_PTR) cpi);
+ }
+
+ // Free locally allocated storage
+ vpx_free(arf_not_zz);
+}
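+
+#if 0
+/* Illustrative sketch only; not part of this change. Worked example of the
+ * bookkeeping above for a hypothetical 400-macroblock frame in which 120
+ * macroblocks failed the zero-zero ARF test in at least one analysed frame
+ * and the remaining 280 never did:
+ *
+ *   ncnt[0] = 120  -> segment 0 (moving blocks), seg0_cnt = 120
+ *   ncnt[1] = 280  -> segment 1 (static blocks)
+ *   static_mb_pct  = 280 * 100 / 400 = 70
+ *
+ * so 70% of the frame is marked static and segmentation is enabled.
+ */
+#endif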
+
+void vp9_update_mbgraph_stats
+(
+ VP9_COMP *cpi
+) {
+ VP9_COMMON *const cm = &cpi->common;
+ int i, n_frames = vp9_lookahead_depth(cpi->lookahead);
+ YV12_BUFFER_CONFIG *golden_ref = &cm->yv12_fb[cm->gld_fb_idx];
+
+ // we need to look ahead beyond where the ARF transitions into
+ // being a GF - so exit if we don't look ahead beyond that
+ if (n_frames <= cpi->frames_till_gf_update_due)
+ return;
+ if (n_frames > cpi->common.frames_till_alt_ref_frame)
+ n_frames = cpi->common.frames_till_alt_ref_frame;
+ if (n_frames > MAX_LAG_BUFFERS)
+ n_frames = MAX_LAG_BUFFERS;
+
+ cpi->mbgraph_n_frames = n_frames;
+ for (i = 0; i < n_frames; i++) {
+ MBGRAPH_FRAME_STATS *frame_stats = &cpi->mbgraph_stats[i];
+ vpx_memset(frame_stats->mb_stats, 0,
+ cm->mb_rows * cm->mb_cols * sizeof(*cpi->mbgraph_stats[i].mb_stats));
+ }
+
+ // do motion search to find contribution of each reference to data
+ // later on in this GF group
+ // FIXME really, the GF/last MC search should be done forward, and
+ // the ARF MC search backwards, to get optimal results for MV caching
+ for (i = 0; i < n_frames; i++) {
+ MBGRAPH_FRAME_STATS *frame_stats = &cpi->mbgraph_stats[i];
+ struct lookahead_entry *q_cur =
+ vp9_lookahead_peek(cpi->lookahead, i);
+
+ assert(q_cur != NULL);
+
+ update_mbgraph_frame_stats(cpi, frame_stats, &q_cur->img,
+ golden_ref, cpi->Source);
+ }
+
+ vp9_clear_system_state(); // __asm emms;
+
+ separate_arf_mbs(cpi);
+}
--- /dev/null
+++ b/vp9/encoder/mbgraph.h
@@ -1,0 +1,16 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef __INC_MBGRAPH_H__
+#define __INC_MBGRAPH_H__ 1
+
+extern void vp9_update_mbgraph_stats(VP9_COMP *cpi);
+
+#endif /* __INC_MBGRAPH_H__ */
--- /dev/null
+++ b/vp9/encoder/mcomp.c
@@ -1,0 +1,2203 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "vp9/encoder/onyx_int.h"
+#include "mcomp.h"
+#include "vpx_mem/vpx_mem.h"
+#include "vpx_ports/config.h"
+#include <stdio.h>
+#include <limits.h>
+#include <math.h>
+#include "vp9/common/findnearmv.h"
+
+#ifdef ENTROPY_STATS
+static int mv_ref_ct [31] [4] [2];
+static int mv_mode_cts [4] [2];
+#endif
+
+void vp9_clamp_mv_min_max(MACROBLOCK *x, int_mv *ref_mv) {
+ int col_min = (ref_mv->as_mv.col >> 3) - MAX_FULL_PEL_VAL +
+ ((ref_mv->as_mv.col & 7) ? 1 : 0);
+ int row_min = (ref_mv->as_mv.row >> 3) - MAX_FULL_PEL_VAL +
+ ((ref_mv->as_mv.row & 7) ? 1 : 0);
+ int col_max = (ref_mv->as_mv.col >> 3) + MAX_FULL_PEL_VAL;
+ int row_max = (ref_mv->as_mv.row >> 3) + MAX_FULL_PEL_VAL;
+
+ /* Get intersection of UMV window and valid MV window to reduce # of checks in diamond search. */
+ if (x->mv_col_min < col_min)
+ x->mv_col_min = col_min;
+ if (x->mv_col_max > col_max)
+ x->mv_col_max = col_max;
+ if (x->mv_row_min < row_min)
+ x->mv_row_min = row_min;
+ if (x->mv_row_max > row_max)
+ x->mv_row_max = row_max;
+}
+
+int vp9_mv_bit_cost(int_mv *mv, int_mv *ref, DEC_MVCOSTS,
+ int Weight, int ishp) {
+ MV v;
+ v.row = (mv->as_mv.row - ref->as_mv.row);
+ v.col = (mv->as_mv.col - ref->as_mv.col);
+ return ((mvjcost[vp9_get_mv_joint(v)] +
+ mvcost[0][v.row] + mvcost[1][v.col]) *
+ Weight) >> 7;
+}
+
+static int mv_err_cost(int_mv *mv, int_mv *ref, DEC_MVCOSTS,
+ int error_per_bit, int ishp) {
+ if (mvcost) {
+ MV v;
+ v.row = (mv->as_mv.row - ref->as_mv.row);
+ v.col = (mv->as_mv.col - ref->as_mv.col);
+ return ((mvjcost[vp9_get_mv_joint(v)] +
+ mvcost[0][v.row] + mvcost[1][v.col]) *
+ error_per_bit + 128) >> 8;
+ }
+ return 0;
+}
+
+static int mvsad_err_cost(int_mv *mv, int_mv *ref, DEC_MVSADCOSTS,
+ int error_per_bit) {
+
+ if (mvsadcost) {
+ MV v;
+ v.row = (mv->as_mv.row - ref->as_mv.row);
+ v.col = (mv->as_mv.col - ref->as_mv.col);
+ return ((mvjsadcost[vp9_get_mv_joint(v)] +
+ mvsadcost[0][v.row] + mvsadcost[1][v.col]) *
+ error_per_bit + 128) >> 8;
+ }
+ return 0;
+}
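+
+#if 0
+/* Illustrative sketch only; not part of this change. The two cost helpers
+ * above share the same fixed-point form: a table-derived rate is scaled by
+ * the rate-distortion multiplier and then divided by 256 with rounding
+ * (the "+ 128" before ">> 8"). The values below are made up.
+ */
+static int example_mv_err_cost(void) {
+  const int rate = 40;            /* hypothetical mvjcost + mvcost[0] + mvcost[1] */
+  const int error_per_bit = 300;  /* hypothetical multiplier */
+
+  /* 40 * 300 = 12000; (12000 + 128) >> 8 == 47 */
+  return (rate * error_per_bit + 128) >> 8;
+}
+#endif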
+
+void vp9_init_dsmotion_compensation(MACROBLOCK *x, int stride) {
+ int Len;
+ int search_site_count = 0;
+
+
+ // Generate offsets for 4 search sites per step.
+ Len = MAX_FIRST_STEP;
+ x->ss[search_site_count].mv.col = 0;
+ x->ss[search_site_count].mv.row = 0;
+ x->ss[search_site_count].offset = 0;
+ search_site_count++;
+
+ while (Len > 0) {
+
+ // Compute offsets for search sites.
+ x->ss[search_site_count].mv.col = 0;
+ x->ss[search_site_count].mv.row = -Len;
+ x->ss[search_site_count].offset = -Len * stride;
+ search_site_count++;
+
+ // Compute offsets for search sites.
+ x->ss[search_site_count].mv.col = 0;
+ x->ss[search_site_count].mv.row = Len;
+ x->ss[search_site_count].offset = Len * stride;
+ search_site_count++;
+
+ // Compute offsets for search sites.
+ x->ss[search_site_count].mv.col = -Len;
+ x->ss[search_site_count].mv.row = 0;
+ x->ss[search_site_count].offset = -Len;
+ search_site_count++;
+
+ // Compute offsets for search sites.
+ x->ss[search_site_count].mv.col = Len;
+ x->ss[search_site_count].mv.row = 0;
+ x->ss[search_site_count].offset = Len;
+ search_site_count++;
+
+ // Contract.
+ Len /= 2;
+ }
+
+ x->ss_count = search_site_count;
+ x->searches_per_step = 4;
+}
+
+void vp9_init3smotion_compensation(MACROBLOCK *x, int stride) {
+ int Len;
+ int search_site_count = 0;
+
+ // Generate offsets for 8 search sites per step.
+ Len = MAX_FIRST_STEP;
+ x->ss[search_site_count].mv.col = 0;
+ x->ss[search_site_count].mv.row = 0;
+ x->ss[search_site_count].offset = 0;
+ search_site_count++;
+
+ while (Len > 0) {
+
+ // Compute offsets for search sites.
+ x->ss[search_site_count].mv.col = 0;
+ x->ss[search_site_count].mv.row = -Len;
+ x->ss[search_site_count].offset = -Len * stride;
+ search_site_count++;
+
+ // Compute offsets for search sites.
+ x->ss[search_site_count].mv.col = 0;
+ x->ss[search_site_count].mv.row = Len;
+ x->ss[search_site_count].offset = Len * stride;
+ search_site_count++;
+
+ // Compute offsets for search sites.
+ x->ss[search_site_count].mv.col = -Len;
+ x->ss[search_site_count].mv.row = 0;
+ x->ss[search_site_count].offset = -Len;
+ search_site_count++;
+
+ // Compute offsets for search sites.
+ x->ss[search_site_count].mv.col = Len;
+ x->ss[search_site_count].mv.row = 0;
+ x->ss[search_site_count].offset = Len;
+ search_site_count++;
+
+ // Compute offsets for search sites.
+ x->ss[search_site_count].mv.col = -Len;
+ x->ss[search_site_count].mv.row = -Len;
+ x->ss[search_site_count].offset = -Len * stride - Len;
+ search_site_count++;
+
+ // Compute offsets for search sites.
+ x->ss[search_site_count].mv.col = Len;
+ x->ss[search_site_count].mv.row = -Len;
+ x->ss[search_site_count].offset = -Len * stride + Len;
+ search_site_count++;
+
+ // Compute offsets for search sites.
+ x->ss[search_site_count].mv.col = -Len;
+ x->ss[search_site_count].mv.row = Len;
+ x->ss[search_site_count].offset = Len * stride - Len;
+ search_site_count++;
+
+ // Compute offsets for search sites.
+ x->ss[search_site_count].mv.col = Len;
+ x->ss[search_site_count].mv.row = Len;
+ x->ss[search_site_count].offset = Len * stride + Len;
+ search_site_count++;
+
+ // Contract.
+ Len /= 2;
+ }
+
+ x->ss_count = search_site_count;
+ x->searches_per_step = 8;
+}
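+
+#if 0
+/* Illustrative sketch only; not part of this change. For a hypothetical
+ * starting step of MAX_FIRST_STEP == 4, the 8-point loop above fills the
+ * site table with the centre entry followed by three shrinking rings of
+ * (col, row) offsets:
+ *
+ *   Len 4: (0,-4) (0,4) (-4,0) (4,0) (-4,-4) (4,-4) (-4,4) (4,4)
+ *   Len 2: (0,-2) (0,2) (-2,0) (2,0) (-2,-2) (2,-2) (-2,2) (2,2)
+ *   Len 1: (0,-1) (0,1) (-1,0) (1,0) (-1,-1) (1,-1) (-1,1) (1,1)
+ *
+ * Each entry's 'offset' is the same displacement in pels of the reference
+ * buffer (row * stride + col), so the diamond search can move between
+ * candidate positions with pointer arithmetic alone.
+ */
+#endif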
+
+/*
+ * To avoid the penalty of reads that cross a cache line, preload the
+ * reference area into a small buffer that is aligned so that reads from it
+ * never cross a cache line. This reduces the CPU cycles spent reading ref
+ * data in the sub-pixel filter functions.
+ * TODO: Currently, since the sub-pixel search range here is -3 ~ 3, copy a
+ * 22 row x 32 col area, which is enough for a 16x16 macroblock. Later, for
+ * SPLITMV, we could reduce the area.
+ */
+
+/* estimated cost of a motion vector (r,c) */
+#define MVC(r, c) \
+ (mvcost ? \
+ ((mvjcost[((r) != rr) * 2 + ((c) != rc)] + \
+ mvcost[0][((r) - rr)] + mvcost[1][((c) - rc)]) * \
+ error_per_bit + 128) >> 8 : 0)
+
+#define SP(x) (((x) & 7) << 1) // convert motion vector component to offset
+ // for svf calc
+
+#define IFMVCV(r, c, s, e) \
+ if (c >= minc && c <= maxc && r >= minr && r <= maxr) \
+ s \
+ else \
+ e;
+
+/* pointer to predictor base of a motionvector */
+#define PRE(r, c) (y + (((r) >> 3) * y_stride + ((c) >> 3) -(offset)))
+
+/* returns subpixel variance error function */
+#define DIST(r, c) \
+ vfp->svf(PRE(r, c), y_stride, SP(c), SP(r), z, b->src_stride, &sse)
+
+/* checks if (r, c) has better score than previous best */
+#define CHECK_BETTER(v, r, c) \
+ IFMVCV(r, c, { \
+ thismse = (DIST(r, c)); \
+ if ((v = MVC(r, c) + thismse) < besterr) { \
+ besterr = v; \
+ br = r; \
+ bc = c; \
+ *distortion = thismse; \
+ *sse1 = sse; \
+ } \
+ }, \
+ v = INT_MAX;)
+
+#define MIN(x,y) (((x)<(y))?(x):(y))
+#define MAX(x,y) (((x)>(y))?(x):(y))
+
+int vp9_find_best_sub_pixel_step_iteratively(MACROBLOCK *x, BLOCK *b, BLOCKD *d,
+ int_mv *bestmv, int_mv *ref_mv,
+ int error_per_bit,
+ const vp9_variance_fn_ptr_t *vfp,
+ DEC_MVCOSTS,
+ int *distortion,
+ unsigned int *sse1) {
+ unsigned char *z = (*(b->base_src) + b->src);
+ MACROBLOCKD *xd = &x->e_mbd;
+
+ int rr, rc, br, bc, hstep;
+ int tr, tc;
+ unsigned int besterr = INT_MAX;
+ unsigned int left, right, up, down, diag;
+ unsigned int sse;
+ unsigned int whichdir;
+ unsigned int halfiters = 4;
+ unsigned int quarteriters = 4;
+ unsigned int eighthiters = 4;
+ int thismse;
+ int maxc, minc, maxr, minr;
+ int y_stride;
+ int offset;
+ int usehp = xd->allow_high_precision_mv;
+
+#if !CONFIG_SUPERBLOCKS && (ARCH_X86 || ARCH_X86_64)
+ unsigned char *y0 = *(d->base_pre) + d->pre + (bestmv->as_mv.row) * d->pre_stride + bestmv->as_mv.col;
+ unsigned char *y;
+ int buf_r1, buf_r2, buf_c1, buf_c2;
+
+ // Clamping to avoid out-of-range data access
+ buf_r1 = ((bestmv->as_mv.row - INTERP_EXTEND) < x->mv_row_min) ?
+ (bestmv->as_mv.row - x->mv_row_min) : INTERP_EXTEND - 1;
+ buf_r2 = ((bestmv->as_mv.row + INTERP_EXTEND) > x->mv_row_max) ?
+ (x->mv_row_max - bestmv->as_mv.row) : INTERP_EXTEND - 1;
+ buf_c1 = ((bestmv->as_mv.col - INTERP_EXTEND) < x->mv_col_min) ?
+ (bestmv->as_mv.col - x->mv_col_min) : INTERP_EXTEND - 1;
+ buf_c2 = ((bestmv->as_mv.col + INTERP_EXTEND) > x->mv_col_max) ?
+ (x->mv_col_max - bestmv->as_mv.col) : INTERP_EXTEND - 1;
+ y_stride = 32;
+
+ /* Copy to intermediate buffer before searching. */
+ vfp->copymem(y0 - buf_c1 - d->pre_stride * buf_r1, d->pre_stride, xd->y_buf, y_stride, 16 + buf_r1 + buf_r2);
+ y = xd->y_buf + y_stride * buf_r1 + buf_c1;
+#else
+ unsigned char *y = *(d->base_pre) + d->pre + (bestmv->as_mv.row) * d->pre_stride + bestmv->as_mv.col;
+ y_stride = d->pre_stride;
+#endif
+
+ rr = ref_mv->as_mv.row;
+ rc = ref_mv->as_mv.col;
+ br = bestmv->as_mv.row << 3;
+ bc = bestmv->as_mv.col << 3;
+ hstep = 4;
+ minc = MAX(x->mv_col_min << 3, (ref_mv->as_mv.col) - ((1 << MV_MAX_BITS) - 1));
+ maxc = MIN(x->mv_col_max << 3, (ref_mv->as_mv.col) + ((1 << MV_MAX_BITS) - 1));
+ minr = MAX(x->mv_row_min << 3, (ref_mv->as_mv.row) - ((1 << MV_MAX_BITS) - 1));
+ maxr = MIN(x->mv_row_max << 3, (ref_mv->as_mv.row) + ((1 << MV_MAX_BITS) - 1));
+
+ tr = br;
+ tc = bc;
+
+
+ offset = (bestmv->as_mv.row) * y_stride + bestmv->as_mv.col;
+
+ // central mv
+ bestmv->as_mv.row <<= 3;
+ bestmv->as_mv.col <<= 3;
+
+ // calculate central point error
+ besterr = vfp->vf(y, y_stride, z, b->src_stride, sse1);
+ *distortion = besterr;
+ besterr += mv_err_cost(bestmv, ref_mv, MVCOSTS,
+ error_per_bit, xd->allow_high_precision_mv);
+
+ // TODO: Each subsequent iteration checks at least one point in common
+ // with the last iteration; this could be 2 (if diag selected).
+ while (--halfiters) {
+ // 1/2 pel
+ CHECK_BETTER(left, tr, tc - hstep);
+ CHECK_BETTER(right, tr, tc + hstep);
+ CHECK_BETTER(up, tr - hstep, tc);
+ CHECK_BETTER(down, tr + hstep, tc);
+
+ whichdir = (left < right ? 0 : 1) + (up < down ? 0 : 2);
+
+ switch (whichdir) {
+ case 0:
+ CHECK_BETTER(diag, tr - hstep, tc - hstep);
+ break;
+ case 1:
+ CHECK_BETTER(diag, tr - hstep, tc + hstep);
+ break;
+ case 2:
+ CHECK_BETTER(diag, tr + hstep, tc - hstep);
+ break;
+ case 3:
+ CHECK_BETTER(diag, tr + hstep, tc + hstep);
+ break;
+ }
+
+ // no reason to check the same one again.
+ if (tr == br && tc == bc)
+ break;
+
+ tr = br;
+ tc = bc;
+ }
+
+ // TODO: Each subsequent iteration checks at least one point in common with
+ // the last iteration; this could be 2 (if diag selected). 1/4 pel
+ hstep >>= 1;
+ while (--quarteriters) {
+ CHECK_BETTER(left, tr, tc - hstep);
+ CHECK_BETTER(right, tr, tc + hstep);
+ CHECK_BETTER(up, tr - hstep, tc);
+ CHECK_BETTER(down, tr + hstep, tc);
+
+ whichdir = (left < right ? 0 : 1) + (up < down ? 0 : 2);
+
+ switch (whichdir) {
+ case 0:
+ CHECK_BETTER(diag, tr - hstep, tc - hstep);
+ break;
+ case 1:
+ CHECK_BETTER(diag, tr - hstep, tc + hstep);
+ break;
+ case 2:
+ CHECK_BETTER(diag, tr + hstep, tc - hstep);
+ break;
+ case 3:
+ CHECK_BETTER(diag, tr + hstep, tc + hstep);
+ break;
+ }
+
+ // no reason to check the same one again.
+ if (tr == br && tc == bc)
+ break;
+
+ tr = br;
+ tc = bc;
+ }
+
+ if (xd->allow_high_precision_mv) {
+ usehp = vp9_use_nmv_hp(&ref_mv->as_mv);
+ } else {
+ usehp = 0;
+ }
+
+ if (usehp) {
+ hstep >>= 1;
+ while (--eighthiters) {
+ CHECK_BETTER(left, tr, tc - hstep);
+ CHECK_BETTER(right, tr, tc + hstep);
+ CHECK_BETTER(up, tr - hstep, tc);
+ CHECK_BETTER(down, tr + hstep, tc);
+
+ whichdir = (left < right ? 0 : 1) + (up < down ? 0 : 2);
+
+ switch (whichdir) {
+ case 0:
+ CHECK_BETTER(diag, tr - hstep, tc - hstep);
+ break;
+ case 1:
+ CHECK_BETTER(diag, tr - hstep, tc + hstep);
+ break;
+ case 2:
+ CHECK_BETTER(diag, tr + hstep, tc - hstep);
+ break;
+ case 3:
+ CHECK_BETTER(diag, tr + hstep, tc + hstep);
+ break;
+ }
+
+ // no reason to check the same one again.
+ if (tr == br && tc == bc)
+ break;
+
+ tr = br;
+ tc = bc;
+ }
+ }
+ bestmv->as_mv.row = br;
+ bestmv->as_mv.col = bc;
+
+ if ((abs(bestmv->as_mv.col - ref_mv->as_mv.col) > (MAX_FULL_PEL_VAL << 3)) ||
+ (abs(bestmv->as_mv.row - ref_mv->as_mv.row) > (MAX_FULL_PEL_VAL << 3)))
+ return INT_MAX;
+
+ return besterr;
+}
+#undef MVC
+#undef PRE
+#undef DIST
+#undef IFMVCV
+#undef CHECK_BETTER
+#undef MIN
+#undef MAX
+
+int vp9_find_best_sub_pixel_step(MACROBLOCK *x, BLOCK *b, BLOCKD *d,
+ int_mv *bestmv, int_mv *ref_mv,
+ int error_per_bit,
+ const vp9_variance_fn_ptr_t *vfp,
+ DEC_MVCOSTS, int *distortion,
+ unsigned int *sse1) {
+ int bestmse = INT_MAX;
+ int_mv startmv;
+ int_mv this_mv;
+ int_mv orig_mv;
+ int yrow_movedback = 0, ycol_movedback = 0;
+ unsigned char *z = (*(b->base_src) + b->src);
+ int left, right, up, down, diag;
+ unsigned int sse;
+ int whichdir;
+ int thismse;
+ int y_stride;
+ MACROBLOCKD *xd = &x->e_mbd;
+ int usehp = xd->allow_high_precision_mv;
+
+#if !CONFIG_SUPERBLOCKS && (ARCH_X86 || ARCH_X86_64)
+ unsigned char *y0 = *(d->base_pre) + d->pre + (bestmv->as_mv.row) * d->pre_stride + bestmv->as_mv.col;
+ unsigned char *y;
+
+ y_stride = 32;
+ /* Copy 18 rows x 32 cols area to intermediate buffer before searching. */
+ vfp->copymem(y0 - 1 - d->pre_stride, d->pre_stride, xd->y_buf, y_stride, 18);
+ y = xd->y_buf + y_stride + 1;
+#else
+ unsigned char *y = *(d->base_pre) + d->pre + (bestmv->as_mv.row) * d->pre_stride + bestmv->as_mv.col;
+ y_stride = d->pre_stride;
+#endif
+
+ // central mv
+ bestmv->as_mv.row <<= 3;
+ bestmv->as_mv.col <<= 3;
+ startmv = *bestmv;
+ orig_mv = *bestmv;
+
+ // calculate central point error
+ bestmse = vfp->vf(y, y_stride, z, b->src_stride, sse1);
+ *distortion = bestmse;
+ bestmse += mv_err_cost(bestmv, ref_mv, MVCOSTS, error_per_bit,
+ xd->allow_high_precision_mv);
+
+ // go left then right and check error
+ this_mv.as_mv.row = startmv.as_mv.row;
+ this_mv.as_mv.col = ((startmv.as_mv.col - 8) | 4);
+ thismse = vfp->svf_halfpix_h(y - 1, y_stride, z, b->src_stride, &sse);
+ left = thismse + mv_err_cost(&this_mv, ref_mv, MVCOSTS, error_per_bit,
+ xd->allow_high_precision_mv);
+
+ if (left < bestmse) {
+ *bestmv = this_mv;
+ bestmse = left;
+ *distortion = thismse;
+ *sse1 = sse;
+ }
+
+ this_mv.as_mv.col += 8;
+ thismse = vfp->svf_halfpix_h(y, y_stride, z, b->src_stride, &sse);
+ right = thismse + mv_err_cost(&this_mv, ref_mv, MVCOSTS, error_per_bit,
+ xd->allow_high_precision_mv);
+
+ if (right < bestmse) {
+ *bestmv = this_mv;
+ bestmse = right;
+ *distortion = thismse;
+ *sse1 = sse;
+ }
+
+ // go up then down and check error
+ this_mv.as_mv.col = startmv.as_mv.col;
+ this_mv.as_mv.row = ((startmv.as_mv.row - 8) | 4);
+ thismse = vfp->svf_halfpix_v(y - y_stride, y_stride, z, b->src_stride, &sse);
+ up = thismse + mv_err_cost(&this_mv, ref_mv, MVCOSTS, error_per_bit,
+ xd->allow_high_precision_mv);
+
+ if (up < bestmse) {
+ *bestmv = this_mv;
+ bestmse = up;
+ *distortion = thismse;
+ *sse1 = sse;
+ }
+
+ this_mv.as_mv.row += 8;
+ thismse = vfp->svf_halfpix_v(y, y_stride, z, b->src_stride, &sse);
+ down = thismse + mv_err_cost(&this_mv, ref_mv, MVCOSTS, error_per_bit,
+ xd->allow_high_precision_mv);
+
+ if (down < bestmse) {
+ *bestmv = this_mv;
+ bestmse = down;
+ *distortion = thismse;
+ *sse1 = sse;
+ }
+
+
+ // now check 1 more diagonal
+ whichdir = (left < right ? 0 : 1) + (up < down ? 0 : 2);
+ // for(whichdir =0;whichdir<4;whichdir++)
+ // {
+ this_mv = startmv;
+
+ switch (whichdir) {
+ case 0:
+ this_mv.as_mv.col = (this_mv.as_mv.col - 8) | 4;
+ this_mv.as_mv.row = (this_mv.as_mv.row - 8) | 4;
+ thismse = vfp->svf_halfpix_hv(y - 1 - y_stride, y_stride, z, b->src_stride, &sse);
+ break;
+ case 1:
+ this_mv.as_mv.col += 4;
+ this_mv.as_mv.row = (this_mv.as_mv.row - 8) | 4;
+ thismse = vfp->svf_halfpix_hv(y - y_stride, y_stride, z, b->src_stride, &sse);
+ break;
+ case 2:
+ this_mv.as_mv.col = (this_mv.as_mv.col - 8) | 4;
+ this_mv.as_mv.row += 4;
+ thismse = vfp->svf_halfpix_hv(y - 1, y_stride, z, b->src_stride, &sse);
+ break;
+ case 3:
+ default:
+ this_mv.as_mv.col += 4;
+ this_mv.as_mv.row += 4;
+ thismse = vfp->svf_halfpix_hv(y, y_stride, z, b->src_stride, &sse);
+ break;
+ }
+
+ diag = thismse + mv_err_cost(&this_mv, ref_mv, MVCOSTS, error_per_bit,
+ xd->allow_high_precision_mv);
+
+ if (diag < bestmse) {
+ *bestmv = this_mv;
+ bestmse = diag;
+ *distortion = thismse;
+ *sse1 = sse;
+ }
+
+// }
+
+
+ // time to check quarter pels.
+ if (bestmv->as_mv.row < startmv.as_mv.row) {
+ y -= y_stride;
+ yrow_movedback = 1;
+ }
+
+ if (bestmv->as_mv.col < startmv.as_mv.col) {
+ y--;
+ ycol_movedback = 1;
+ }
+
+ startmv = *bestmv;
+
+
+
+ // go left then right and check error
+ this_mv.as_mv.row = startmv.as_mv.row;
+
+ if (startmv.as_mv.col & 7) {
+ this_mv.as_mv.col = startmv.as_mv.col - 2;
+ thismse = vfp->svf(y, y_stride,
+ SP(this_mv.as_mv.col), SP(this_mv.as_mv.row),
+ z, b->src_stride, &sse);
+ } else {
+ this_mv.as_mv.col = (startmv.as_mv.col - 8) | 6;
+ thismse = vfp->svf(y - 1, y_stride, SP(6), SP(this_mv.as_mv.row), z,
+ b->src_stride, &sse);
+ }
+
+ left = thismse + mv_err_cost(&this_mv, ref_mv, MVCOSTS, error_per_bit,
+ xd->allow_high_precision_mv);
+
+ if (left < bestmse) {
+ *bestmv = this_mv;
+ bestmse = left;
+ *distortion = thismse;
+ *sse1 = sse;
+ }
+
+ this_mv.as_mv.col += 4;
+ thismse = vfp->svf(y, y_stride,
+ SP(this_mv.as_mv.col), SP(this_mv.as_mv.row),
+ z, b->src_stride, &sse);
+ right = thismse + mv_err_cost(&this_mv, ref_mv, MVCOSTS, error_per_bit,
+ xd->allow_high_precision_mv);
+
+ if (right < bestmse) {
+ *bestmv = this_mv;
+ bestmse = right;
+ *distortion = thismse;
+ *sse1 = sse;
+ }
+
+ // go up then down and check error
+ this_mv.as_mv.col = startmv.as_mv.col;
+
+ if (startmv.as_mv.row & 7) {
+ this_mv.as_mv.row = startmv.as_mv.row - 2;
+ thismse = vfp->svf(y, y_stride,
+ SP(this_mv.as_mv.col), SP(this_mv.as_mv.row),
+ z, b->src_stride, &sse);
+ } else {
+ this_mv.as_mv.row = (startmv.as_mv.row - 8) | 6;
+ thismse = vfp->svf(y - y_stride, y_stride, SP(this_mv.as_mv.col), SP(6),
+ z, b->src_stride, &sse);
+ }
+
+ up = thismse + mv_err_cost(&this_mv, ref_mv, MVCOSTS, error_per_bit,
+ xd->allow_high_precision_mv);
+
+ if (up < bestmse) {
+ *bestmv = this_mv;
+ bestmse = up;
+ *distortion = thismse;
+ *sse1 = sse;
+ }
+
+ this_mv.as_mv.row += 4;
+ thismse = vfp->svf(y, y_stride, SP(this_mv.as_mv.col), SP(this_mv.as_mv.row),
+ z, b->src_stride, &sse);
+ down = thismse + mv_err_cost(&this_mv, ref_mv, MVCOSTS, error_per_bit,
+ xd->allow_high_precision_mv);
+
+ if (down < bestmse) {
+ *bestmv = this_mv;
+ bestmse = down;
+ *distortion = thismse;
+ *sse1 = sse;
+ }
+
+
+ // now check 1 more diagonal
+ whichdir = (left < right ? 0 : 1) + (up < down ? 0 : 2);
+
+// for(whichdir=0;whichdir<4;whichdir++)
+// {
+ this_mv = startmv;
+
+ switch (whichdir) {
+ case 0:
+
+ if (startmv.as_mv.row & 7) {
+ this_mv.as_mv.row -= 2;
+
+ if (startmv.as_mv.col & 7) {
+ this_mv.as_mv.col -= 2;
+ thismse = vfp->svf(y, y_stride, SP(this_mv.as_mv.col), SP(this_mv.as_mv.row), z, b->src_stride, &sse);
+ } else {
+ this_mv.as_mv.col = (startmv.as_mv.col - 8) | 6;
+ thismse = vfp->svf(y - 1, y_stride, SP(6), SP(this_mv.as_mv.row), z, b->src_stride, &sse);
+ }
+ } else {
+ this_mv.as_mv.row = (startmv.as_mv.row - 8) | 6;
+
+ if (startmv.as_mv.col & 7) {
+ this_mv.as_mv.col -= 2;
+ thismse = vfp->svf(y - y_stride, y_stride, SP(this_mv.as_mv.col), SP(6), z, b->src_stride, &sse);
+ } else {
+ this_mv.as_mv.col = (startmv.as_mv.col - 8) | 6;
+ thismse = vfp->svf(y - y_stride - 1, y_stride, SP(6), SP(6), z, b->src_stride, &sse);
+ }
+ }
+
+ break;
+ case 1:
+ this_mv.as_mv.col += 2;
+
+ if (startmv.as_mv.row & 7) {
+ this_mv.as_mv.row -= 2;
+ thismse = vfp->svf(y, y_stride, SP(this_mv.as_mv.col), SP(this_mv.as_mv.row), z, b->src_stride, &sse);
+ } else {
+ this_mv.as_mv.row = (startmv.as_mv.row - 8) | 6;
+ thismse = vfp->svf(y - y_stride, y_stride, SP(this_mv.as_mv.col), SP(6), z, b->src_stride, &sse);
+ }
+
+ break;
+ case 2:
+ this_mv.as_mv.row += 2;
+
+ if (startmv.as_mv.col & 7) {
+ this_mv.as_mv.col -= 2;
+ thismse = vfp->svf(y, y_stride, SP(this_mv.as_mv.col), SP(this_mv.as_mv.row),
+ z, b->src_stride, &sse);
+ } else {
+ this_mv.as_mv.col = (startmv.as_mv.col - 8) | 6;
+ thismse = vfp->svf(y - 1, y_stride, SP(6), SP(this_mv.as_mv.row), z,
+ b->src_stride, &sse);
+ }
+
+ break;
+ case 3:
+ this_mv.as_mv.col += 2;
+ this_mv.as_mv.row += 2;
+ thismse = vfp->svf(y, y_stride,
+ SP(this_mv.as_mv.col), SP(this_mv.as_mv.row),
+ z, b->src_stride, &sse);
+ break;
+ }
+
+ diag = thismse + mv_err_cost(&this_mv, ref_mv, MVCOSTS, error_per_bit,
+ xd->allow_high_precision_mv);
+
+ if (diag < bestmse) {
+ *bestmv = this_mv;
+ bestmse = diag;
+ *distortion = thismse;
+ *sse1 = sse;
+ }
+
+ if (x->e_mbd.allow_high_precision_mv) {
+ usehp = vp9_use_nmv_hp(&ref_mv->as_mv);
+ } else {
+ usehp = 0;
+ }
+ if (!usehp)
+ return bestmse;
+
+ /* Now do 1/8th pixel */
+ if (bestmv->as_mv.row < orig_mv.as_mv.row && !yrow_movedback) {
+ y -= y_stride;
+ yrow_movedback = 1;
+ }
+
+ if (bestmv->as_mv.col < orig_mv.as_mv.col && !ycol_movedback) {
+ y--;
+ ycol_movedback = 1;
+ }
+
+ startmv = *bestmv;
+
+ // go left then right and check error
+ this_mv.as_mv.row = startmv.as_mv.row;
+
+ if (startmv.as_mv.col & 7) {
+ this_mv.as_mv.col = startmv.as_mv.col - 1;
+ thismse = vfp->svf(y, y_stride,
+ SP(this_mv.as_mv.col), SP(this_mv.as_mv.row),
+ z, b->src_stride, &sse);
+ } else {
+ this_mv.as_mv.col = (startmv.as_mv.col - 8) | 7;
+ thismse = vfp->svf(y - 1, y_stride, SP(7), SP(this_mv.as_mv.row),
+ z, b->src_stride, &sse);
+ }
+
+ left = thismse + mv_err_cost(&this_mv, ref_mv, MVCOSTS, error_per_bit,
+ xd->allow_high_precision_mv);
+
+ if (left < bestmse) {
+ *bestmv = this_mv;
+ bestmse = left;
+ *distortion = thismse;
+ *sse1 = sse;
+ }
+
+ this_mv.as_mv.col += 2;
+ thismse = vfp->svf(y, y_stride, SP(this_mv.as_mv.col), SP(this_mv.as_mv.row),
+ z, b->src_stride, &sse);
+ right = thismse + mv_err_cost(&this_mv, ref_mv, MVCOSTS, error_per_bit,
+ xd->allow_high_precision_mv);
+
+ if (right < bestmse) {
+ *bestmv = this_mv;
+ bestmse = right;
+ *distortion = thismse;
+ *sse1 = sse;
+ }
+
+ // go up then down and check error
+ this_mv.as_mv.col = startmv.as_mv.col;
+
+ if (startmv.as_mv.row & 7) {
+ this_mv.as_mv.row = startmv.as_mv.row - 1;
+ thismse = vfp->svf(y, y_stride, SP(this_mv.as_mv.col), SP(this_mv.as_mv.row), z, b->src_stride, &sse);
+ } else {
+ this_mv.as_mv.row = (startmv.as_mv.row - 8) | 7;
+ thismse = vfp->svf(y - y_stride, y_stride, SP(this_mv.as_mv.col), SP(7), z, b->src_stride, &sse);
+ }
+
+ up = thismse + mv_err_cost(&this_mv, ref_mv, MVCOSTS, error_per_bit,
+ xd->allow_high_precision_mv);
+
+ if (up < bestmse) {
+ *bestmv = this_mv;
+ bestmse = up;
+ *distortion = thismse;
+ *sse1 = sse;
+ }
+
+ this_mv.as_mv.row += 2;
+ thismse = vfp->svf(y, y_stride, SP(this_mv.as_mv.col), SP(this_mv.as_mv.row), z, b->src_stride, &sse);
+ down = thismse + mv_err_cost(&this_mv, ref_mv, MVCOSTS, error_per_bit,
+ xd->allow_high_precision_mv);
+
+ if (down < bestmse) {
+ *bestmv = this_mv;
+ bestmse = down;
+ *distortion = thismse;
+ *sse1 = sse;
+ }
+
+ // now check 1 more diagonal
+ whichdir = (left < right ? 0 : 1) + (up < down ? 0 : 2);
+
+// for(whichdir=0;whichdir<4;whichdir++)
+// {
+ this_mv = startmv;
+
+ switch (whichdir) {
+ case 0:
+
+ if (startmv.as_mv.row & 7) {
+ this_mv.as_mv.row -= 1;
+
+ if (startmv.as_mv.col & 7) {
+ this_mv.as_mv.col -= 1;
+ thismse = vfp->svf(y, y_stride, SP(this_mv.as_mv.col), SP(this_mv.as_mv.row), z, b->src_stride, &sse);
+ } else {
+ this_mv.as_mv.col = (startmv.as_mv.col - 8) | 7;
+ thismse = vfp->svf(y - 1, y_stride, SP(7), SP(this_mv.as_mv.row), z, b->src_stride, &sse);
+ }
+ } else {
+ this_mv.as_mv.row = (startmv.as_mv.row - 8) | 7;
+
+ if (startmv.as_mv.col & 7) {
+ this_mv.as_mv.col -= 1;
+ thismse = vfp->svf(y - y_stride, y_stride, SP(this_mv.as_mv.col), SP(7), z, b->src_stride, &sse);
+ } else {
+ this_mv.as_mv.col = (startmv.as_mv.col - 8) | 7;
+ thismse = vfp->svf(y - y_stride - 1, y_stride, SP(7), SP(7), z, b->src_stride, &sse);
+ }
+ }
+
+ break;
+ case 1:
+ this_mv.as_mv.col += 1;
+
+ if (startmv.as_mv.row & 7) {
+ this_mv.as_mv.row -= 1;
+ thismse = vfp->svf(y, y_stride, SP(this_mv.as_mv.col), SP(this_mv.as_mv.row), z, b->src_stride, &sse);
+ } else {
+ this_mv.as_mv.row = (startmv.as_mv.row - 8) | 7;
+ thismse = vfp->svf(y - y_stride, y_stride, SP(this_mv.as_mv.col), SP(7), z, b->src_stride, &sse);
+ }
+
+ break;
+ case 2:
+ this_mv.as_mv.row += 1;
+
+ if (startmv.as_mv.col & 7) {
+ this_mv.as_mv.col -= 1;
+ thismse = vfp->svf(y, y_stride, SP(this_mv.as_mv.col), SP(this_mv.as_mv.row), z, b->src_stride, &sse);
+ } else {
+ this_mv.as_mv.col = (startmv.as_mv.col - 8) | 7;
+ thismse = vfp->svf(y - 1, y_stride, SP(7), SP(this_mv.as_mv.row), z, b->src_stride, &sse);
+ }
+
+ break;
+ case 3:
+ this_mv.as_mv.col += 1;
+ this_mv.as_mv.row += 1;
+ thismse = vfp->svf(y, y_stride, SP(this_mv.as_mv.col), SP(this_mv.as_mv.row), z, b->src_stride, &sse);
+ break;
+ }
+
+ diag = thismse + mv_err_cost(&this_mv, ref_mv, MVCOSTS, error_per_bit,
+ xd->allow_high_precision_mv);
+
+ if (diag < bestmse) {
+ *bestmv = this_mv;
+ bestmse = diag;
+ *distortion = thismse;
+ *sse1 = sse;
+ }
+
+ return bestmse;
+}
+
+#undef SP
+
+int vp9_find_best_half_pixel_step(MACROBLOCK *x, BLOCK *b, BLOCKD *d,
+ int_mv *bestmv, int_mv *ref_mv,
+ int error_per_bit,
+ const vp9_variance_fn_ptr_t *vfp,
+ DEC_MVCOSTS,
+ int *distortion,
+ unsigned int *sse1) {
+ int bestmse = INT_MAX;
+ int_mv startmv;
+ int_mv this_mv;
+ unsigned char *z = (*(b->base_src) + b->src);
+ int left, right, up, down, diag;
+ unsigned int sse;
+ int whichdir;
+ int thismse;
+ int y_stride;
+ MACROBLOCKD *xd = &x->e_mbd;
+
+#if !CONFIG_SUPERBLOCKS && (ARCH_X86 || ARCH_X86_64)
+ unsigned char *y0 = *(d->base_pre) + d->pre +
+ (bestmv->as_mv.row) * d->pre_stride + bestmv->as_mv.col;
+ unsigned char *y;
+
+ y_stride = 32;
+ /* Copy 18 rows x 32 cols area to intermediate buffer before searching. */
+ vfp->copymem(y0 - 1 - d->pre_stride, d->pre_stride, xd->y_buf, y_stride, 18);
+ y = xd->y_buf + y_stride + 1;
+#else
+ unsigned char *y = *(d->base_pre) + d->pre +
+ (bestmv->as_mv.row) * d->pre_stride + bestmv->as_mv.col;
+ y_stride = d->pre_stride;
+#endif
+
+ // central mv
+ bestmv->as_mv.row <<= 3;
+ bestmv->as_mv.col <<= 3;
+ startmv = *bestmv;
+
+ // calculate central point error
+ bestmse = vfp->vf(y, y_stride, z, b->src_stride, sse1);
+ *distortion = bestmse;
+ bestmse += mv_err_cost(bestmv, ref_mv, MVCOSTS, error_per_bit,
+ xd->allow_high_precision_mv);
+
+ // go left then right and check error
+ this_mv.as_mv.row = startmv.as_mv.row;
+ this_mv.as_mv.col = ((startmv.as_mv.col - 8) | 4);
+ thismse = vfp->svf_halfpix_h(y - 1, y_stride, z, b->src_stride, &sse);
+ left = thismse + mv_err_cost(&this_mv, ref_mv, MVCOSTS, error_per_bit,
+ xd->allow_high_precision_mv);
+
+ if (left < bestmse) {
+ *bestmv = this_mv;
+ bestmse = left;
+ *distortion = thismse;
+ *sse1 = sse;
+ }
+
+ this_mv.as_mv.col += 8;
+ thismse = vfp->svf_halfpix_h(y, y_stride, z, b->src_stride, &sse);
+ right = thismse + mv_err_cost(&this_mv, ref_mv, MVCOSTS, error_per_bit,
+ xd->allow_high_precision_mv);
+
+ if (right < bestmse) {
+ *bestmv = this_mv;
+ bestmse = right;
+ *distortion = thismse;
+ *sse1 = sse;
+ }
+
+ // go up then down and check error
+ this_mv.as_mv.col = startmv.as_mv.col;
+ this_mv.as_mv.row = ((startmv.as_mv.row - 8) | 4);
+ thismse = vfp->svf_halfpix_v(y - y_stride, y_stride, z, b->src_stride, &sse);
+ up = thismse + mv_err_cost(&this_mv, ref_mv, MVCOSTS, error_per_bit,
+ xd->allow_high_precision_mv);
+
+ if (up < bestmse) {
+ *bestmv = this_mv;
+ bestmse = up;
+ *distortion = thismse;
+ *sse1 = sse;
+ }
+
+ this_mv.as_mv.row += 8;
+ thismse = vfp->svf_halfpix_v(y, y_stride, z, b->src_stride, &sse);
+ down = thismse + mv_err_cost(&this_mv, ref_mv, MVCOSTS, error_per_bit,
+ xd->allow_high_precision_mv);
+
+ if (down < bestmse) {
+ *bestmv = this_mv;
+ bestmse = down;
+ *distortion = thismse;
+ *sse1 = sse;
+ }
+
+ // now check 1 more diagonal -
+ whichdir = (left < right ? 0 : 1) + (up < down ? 0 : 2);
+ this_mv = startmv;
+
+ switch (whichdir) {
+ case 0:
+ this_mv.as_mv.col = (this_mv.as_mv.col - 8) | 4;
+ this_mv.as_mv.row = (this_mv.as_mv.row - 8) | 4;
+ thismse = vfp->svf_halfpix_hv(y - 1 - y_stride, y_stride, z, b->src_stride, &sse);
+ break;
+ case 1:
+ this_mv.as_mv.col += 4;
+ this_mv.as_mv.row = (this_mv.as_mv.row - 8) | 4;
+ thismse = vfp->svf_halfpix_hv(y - y_stride, y_stride, z, b->src_stride, &sse);
+ break;
+ case 2:
+ this_mv.as_mv.col = (this_mv.as_mv.col - 8) | 4;
+ this_mv.as_mv.row += 4;
+ thismse = vfp->svf_halfpix_hv(y - 1, y_stride, z, b->src_stride, &sse);
+ break;
+ case 3:
+ default:
+ this_mv.as_mv.col += 4;
+ this_mv.as_mv.row += 4;
+ thismse = vfp->svf_halfpix_hv(y, y_stride, z, b->src_stride, &sse);
+ break;
+ }
+
+ diag = thismse + mv_err_cost(&this_mv, ref_mv, MVCOSTS, error_per_bit,
+ xd->allow_high_precision_mv);
+
+ if (diag < bestmse) {
+ *bestmv = this_mv;
+ bestmse = diag;
+ *distortion = thismse;
+ *sse1 = sse;
+ }
+
+ return bestmse;
+}
+
+#define CHECK_BOUNDS(range) \
+ {\
+ all_in = 1;\
+ all_in &= ((br-range) >= x->mv_row_min);\
+ all_in &= ((br+range) <= x->mv_row_max);\
+ all_in &= ((bc-range) >= x->mv_col_min);\
+ all_in &= ((bc+range) <= x->mv_col_max);\
+ }
+
+#define CHECK_POINT \
+ {\
+ if (this_mv.as_mv.col < x->mv_col_min) continue;\
+ if (this_mv.as_mv.col > x->mv_col_max) continue;\
+ if (this_mv.as_mv.row < x->mv_row_min) continue;\
+ if (this_mv.as_mv.row > x->mv_row_max) continue;\
+ }
+
+#define CHECK_BETTER \
+ {\
+ if (thissad < bestsad)\
+ {\
+ thissad += mvsad_err_cost(&this_mv, &fcenter_mv, MVSADCOSTS, sad_per_bit);\
+ if (thissad < bestsad)\
+ {\
+ bestsad = thissad;\
+ best_site = i;\
+ }\
+ }\
+ }
+
+static const MV next_chkpts[6][3] = {
+ {{ -2, 0}, { -1, -2}, {1, -2}},
+ {{ -1, -2}, {1, -2}, {2, 0}},
+ {{1, -2}, {2, 0}, {1, 2}},
+ {{2, 0}, {1, 2}, { -1, 2}},
+ {{1, 2}, { -1, 2}, { -2, 0}},
+ {{ -1, 2}, { -2, 0}, { -1, -2}}
+};
+
+int vp9_hex_search
+(
+ MACROBLOCK *x,
+ BLOCK *b,
+ BLOCKD *d,
+ int_mv *ref_mv,
+ int_mv *best_mv,
+ int search_param,
+ int sad_per_bit,
+ const vp9_variance_fn_ptr_t *vfp,
+ DEC_MVSADCOSTS,
+ DEC_MVCOSTS,
+ int_mv *center_mv
+) {
+ MV hex[6] = { { -1, -2}, {1, -2}, {2, 0}, {1, 2}, { -1, 2}, { -2, 0} };
+ MV neighbors[4] = {{0, -1}, { -1, 0}, {1, 0}, {0, 1}};
+ int i, j;
+
+ unsigned char *what = (*(b->base_src) + b->src);
+ int what_stride = b->src_stride;
+ int in_what_stride = d->pre_stride;
+ int br, bc;
+ int_mv this_mv;
+ unsigned int bestsad = 0x7fffffff;
+ unsigned int thissad;
+ unsigned char *base_offset;
+ unsigned char *this_offset;
+ int k = -1;
+ int all_in;
+ int best_site = -1;
+
+ int_mv fcenter_mv;
+ fcenter_mv.as_mv.row = center_mv->as_mv.row >> 3;
+ fcenter_mv.as_mv.col = center_mv->as_mv.col >> 3;
+
+ // adjust ref_mv to make sure it is within MV range
+ clamp_mv(ref_mv, x->mv_col_min, x->mv_col_max, x->mv_row_min, x->mv_row_max);
+ br = ref_mv->as_mv.row;
+ bc = ref_mv->as_mv.col;
+
+ // Work out the start point for the search
+ base_offset = (unsigned char *)(*(d->base_pre) + d->pre);
+ this_offset = base_offset + (br * (d->pre_stride)) + bc;
+ this_mv.as_mv.row = br;
+ this_mv.as_mv.col = bc;
+ bestsad = vfp->sdf(what, what_stride, this_offset,
+ in_what_stride, 0x7fffffff)
+ + mvsad_err_cost(&this_mv, &fcenter_mv, MVSADCOSTS, sad_per_bit);
+
+ // hex search
+ // j=0
+ CHECK_BOUNDS(2)
+
+ if (all_in) {
+ for (i = 0; i < 6; i++) {
+ this_mv.as_mv.row = br + hex[i].row;
+ this_mv.as_mv.col = bc + hex[i].col;
+ this_offset = base_offset + (this_mv.as_mv.row * in_what_stride) + this_mv.as_mv.col;
+ thissad = vfp->sdf(what, what_stride, this_offset, in_what_stride, bestsad);
+ CHECK_BETTER
+ }
+ } else {
+ for (i = 0; i < 6; i++) {
+ this_mv.as_mv.row = br + hex[i].row;
+ this_mv.as_mv.col = bc + hex[i].col;
+ CHECK_POINT
+ this_offset = base_offset + (this_mv.as_mv.row * in_what_stride) + this_mv.as_mv.col;
+ thissad = vfp->sdf(what, what_stride, this_offset, in_what_stride, bestsad);
+ CHECK_BETTER
+ }
+ }
+
+ if (best_site == -1)
+ goto cal_neighbors;
+ else {
+ br += hex[best_site].row;
+ bc += hex[best_site].col;
+ k = best_site;
+ }
+
+ for (j = 1; j < 127; j++) {
+ best_site = -1;
+ CHECK_BOUNDS(2)
+
+ if (all_in) {
+ for (i = 0; i < 3; i++) {
+ this_mv.as_mv.row = br + next_chkpts[k][i].row;
+ this_mv.as_mv.col = bc + next_chkpts[k][i].col;
+ this_offset = base_offset + (this_mv.as_mv.row * (in_what_stride)) + this_mv.as_mv.col;
+ thissad = vfp->sdf(what, what_stride, this_offset, in_what_stride, bestsad);
+ CHECK_BETTER
+ }
+ } else {
+ for (i = 0; i < 3; i++) {
+ this_mv.as_mv.row = br + next_chkpts[k][i].row;
+ this_mv.as_mv.col = bc + next_chkpts[k][i].col;
+ CHECK_POINT
+ this_offset = base_offset + (this_mv.as_mv.row * (in_what_stride)) + this_mv.as_mv.col;
+ thissad = vfp->sdf(what, what_stride, this_offset, in_what_stride, bestsad);
+ CHECK_BETTER
+ }
+ }
+
+ if (best_site == -1)
+ break;
+ else {
+ br += next_chkpts[k][best_site].row;
+ bc += next_chkpts[k][best_site].col;
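+      // advance k so it indexes the hex direction just taken:
+      // k = (k + 5 + best_site) mod 6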
+ k += 5 + best_site;
+ if (k >= 12) k -= 12;
+ else if (k >= 6) k -= 6;
+ }
+ }
+
+ // check 4 1-away neighbors
+cal_neighbors:
+ for (j = 0; j < 32; j++) {
+ best_site = -1;
+ CHECK_BOUNDS(1)
+
+ if (all_in) {
+ for (i = 0; i < 4; i++) {
+ this_mv.as_mv.row = br + neighbors[i].row;
+ this_mv.as_mv.col = bc + neighbors[i].col;
+ this_offset = base_offset + (this_mv.as_mv.row * (in_what_stride)) + this_mv.as_mv.col;
+ thissad = vfp->sdf(what, what_stride, this_offset, in_what_stride, bestsad);
+ CHECK_BETTER
+ }
+ } else {
+ for (i = 0; i < 4; i++) {
+ this_mv.as_mv.row = br + neighbors[i].row;
+ this_mv.as_mv.col = bc + neighbors[i].col;
+ CHECK_POINT
+ this_offset = base_offset + (this_mv.as_mv.row * (in_what_stride)) + this_mv.as_mv.col;
+ thissad = vfp->sdf(what, what_stride, this_offset, in_what_stride, bestsad);
+ CHECK_BETTER
+ }
+ }
+
+ if (best_site == -1)
+ break;
+ else {
+ br += neighbors[best_site].row;
+ bc += neighbors[best_site].col;
+ }
+ }
+
+ best_mv->as_mv.row = br;
+ best_mv->as_mv.col = bc;
+
+ return bestsad;
+}
+#undef CHECK_BOUNDS
+#undef CHECK_POINT
+#undef CHECK_BETTER
+
+int vp9_diamond_search_sad(MACROBLOCK *x, BLOCK *b, BLOCKD *d,
+ int_mv *ref_mv, int_mv *best_mv,
+ int search_param, int sad_per_bit, int *num00,
+ vp9_variance_fn_ptr_t *fn_ptr, DEC_MVCOSTS,
+ int_mv *center_mv) {
+ int i, j, step;
+
+ unsigned char *what = (*(b->base_src) + b->src);
+ int what_stride = b->src_stride;
+ unsigned char *in_what;
+ int in_what_stride = d->pre_stride;
+ unsigned char *best_address;
+
+ int tot_steps;
+ int_mv this_mv;
+
+ int bestsad = INT_MAX;
+ int best_site = 0;
+ int last_site = 0;
+
+ int ref_row, ref_col;
+ int this_row_offset, this_col_offset;
+ search_site *ss;
+
+ unsigned char *check_here;
+ int thissad;
+ MACROBLOCKD *xd = &x->e_mbd;
+ int_mv fcenter_mv;
+
+ int *mvjsadcost = x->nmvjointsadcost;
+ int *mvsadcost[2] = {x->nmvsadcost[0], x->nmvsadcost[1]};
+
+ fcenter_mv.as_mv.row = center_mv->as_mv.row >> 3;
+ fcenter_mv.as_mv.col = center_mv->as_mv.col >> 3;
+
+ clamp_mv(ref_mv, x->mv_col_min, x->mv_col_max, x->mv_row_min, x->mv_row_max);
+ ref_row = ref_mv->as_mv.row;
+ ref_col = ref_mv->as_mv.col;
+ *num00 = 0;
+ best_mv->as_mv.row = ref_row;
+ best_mv->as_mv.col = ref_col;
+
+ // Work out the start point for the search
+ in_what = (unsigned char *)(*(d->base_pre) + d->pre + (ref_row * (d->pre_stride)) + ref_col);
+ best_address = in_what;
+
+ // Check the starting position
+ bestsad = fn_ptr->sdf(what, what_stride, in_what,
+ in_what_stride, 0x7fffffff)
+ + mvsad_err_cost(best_mv, &fcenter_mv, MVSADCOSTS, sad_per_bit);
+
+  // search_param determines the length of the initial step and hence the number of iterations:
+  // 0 = initial step (MAX_FIRST_STEP) pel, 1 = (MAX_FIRST_STEP/2) pel, 2 = (MAX_FIRST_STEP/4) pel, etc.
+ ss = &x->ss[search_param * x->searches_per_step];
+ tot_steps = (x->ss_count / x->searches_per_step) - search_param;
+
+ i = 1;
+
+ for (step = 0; step < tot_steps; step++) {
+ for (j = 0; j < x->searches_per_step; j++) {
+ // Trap illegal vectors
+ this_row_offset = best_mv->as_mv.row + ss[i].mv.row;
+ this_col_offset = best_mv->as_mv.col + ss[i].mv.col;
+
+      if ((this_col_offset > x->mv_col_min) && (this_col_offset < x->mv_col_max) &&
+          (this_row_offset > x->mv_row_min) && (this_row_offset < x->mv_row_max)) {
+ check_here = ss[i].offset + best_address;
+ thissad = fn_ptr->sdf(what, what_stride, check_here, in_what_stride, bestsad);
+
+ if (thissad < bestsad) {
+ this_mv.as_mv.row = this_row_offset;
+ this_mv.as_mv.col = this_col_offset;
+ thissad += mvsad_err_cost(&this_mv, &fcenter_mv,
+ MVSADCOSTS, sad_per_bit);
+
+ if (thissad < bestsad) {
+ bestsad = thissad;
+ best_site = i;
+ }
+ }
+ }
+
+ i++;
+ }
+
+ if (best_site != last_site) {
+ best_mv->as_mv.row += ss[best_site].mv.row;
+ best_mv->as_mv.col += ss[best_site].mv.col;
+ best_address += ss[best_site].offset;
+ last_site = best_site;
+ } else if (best_address == in_what)
+ (*num00)++;
+ }
+
+ this_mv.as_mv.row = best_mv->as_mv.row << 3;
+ this_mv.as_mv.col = best_mv->as_mv.col << 3;
+
+ if (bestsad == INT_MAX)
+ return INT_MAX;
+
+ return
+ fn_ptr->vf(what, what_stride, best_address, in_what_stride,
+ (unsigned int *)(&thissad)) +
+ mv_err_cost(&this_mv, center_mv, MVCOSTS, x->errorperbit,
+ xd->allow_high_precision_mv);
+}
+
+int vp9_diamond_search_sadx4(MACROBLOCK *x, BLOCK *b, BLOCKD *d,
+ int_mv *ref_mv, int_mv *best_mv, int search_param,
+ int sad_per_bit, int *num00,
+ vp9_variance_fn_ptr_t *fn_ptr,
+ DEC_MVCOSTS, int_mv *center_mv) {
+ int i, j, step;
+
+ unsigned char *what = (*(b->base_src) + b->src);
+ int what_stride = b->src_stride;
+ unsigned char *in_what;
+ int in_what_stride = d->pre_stride;
+ unsigned char *best_address;
+
+ int tot_steps;
+ int_mv this_mv;
+
+ int bestsad = INT_MAX;
+ int best_site = 0;
+ int last_site = 0;
+
+ int ref_row;
+ int ref_col;
+ int this_row_offset;
+ int this_col_offset;
+ search_site *ss;
+
+ unsigned char *check_here;
+ unsigned int thissad;
+ MACROBLOCKD *xd = &x->e_mbd;
+ int_mv fcenter_mv;
+
+ int *mvjsadcost = x->nmvjointsadcost;
+ int *mvsadcost[2] = {x->nmvsadcost[0], x->nmvsadcost[1]};
+
+ fcenter_mv.as_mv.row = center_mv->as_mv.row >> 3;
+ fcenter_mv.as_mv.col = center_mv->as_mv.col >> 3;
+
+ clamp_mv(ref_mv, x->mv_col_min, x->mv_col_max, x->mv_row_min, x->mv_row_max);
+ ref_row = ref_mv->as_mv.row;
+ ref_col = ref_mv->as_mv.col;
+ *num00 = 0;
+ best_mv->as_mv.row = ref_row;
+ best_mv->as_mv.col = ref_col;
+
+ // Work out the start point for the search
+ in_what = (unsigned char *)(*(d->base_pre) + d->pre + (ref_row * (d->pre_stride)) + ref_col);
+ best_address = in_what;
+
+ // Check the starting position
+ bestsad = fn_ptr->sdf(what, what_stride,
+ in_what, in_what_stride, 0x7fffffff)
+ + mvsad_err_cost(best_mv, &fcenter_mv, MVSADCOSTS, sad_per_bit);
+
+  // search_param determines the length of the initial step and hence the number of iterations:
+  // 0 = initial step (MAX_FIRST_STEP) pel, 1 = (MAX_FIRST_STEP/2) pel, 2 = (MAX_FIRST_STEP/4) pel, etc.
+ ss = &x->ss[search_param * x->searches_per_step];
+ tot_steps = (x->ss_count / x->searches_per_step) - search_param;
+
+ i = 1;
+
+ for (step = 0; step < tot_steps; step++) {
+ int all_in = 1, t;
+
+    // To know whether all candidate points are within the bounds, four bounds
+    // checks are enough instead of checking all four bounds for each point.
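+    // (This assumes the first four sites of each step are the extreme
+    // row/col offsets.)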
+ all_in &= ((best_mv->as_mv.row + ss[i].mv.row) > x->mv_row_min);
+ all_in &= ((best_mv->as_mv.row + ss[i + 1].mv.row) < x->mv_row_max);
+ all_in &= ((best_mv->as_mv.col + ss[i + 2].mv.col) > x->mv_col_min);
+ all_in &= ((best_mv->as_mv.col + ss[i + 3].mv.col) < x->mv_col_max);
+
+ if (all_in) {
+ unsigned int sad_array[4];
+
+ for (j = 0; j < x->searches_per_step; j += 4) {
+ unsigned char *block_offset[4];
+
+ for (t = 0; t < 4; t++)
+ block_offset[t] = ss[i + t].offset + best_address;
+
+ fn_ptr->sdx4df(what, what_stride, block_offset, in_what_stride,
+ sad_array);
+
+ for (t = 0; t < 4; t++, i++) {
+ if (sad_array[t] < bestsad) {
+ this_mv.as_mv.row = best_mv->as_mv.row + ss[i].mv.row;
+ this_mv.as_mv.col = best_mv->as_mv.col + ss[i].mv.col;
+ sad_array[t] += mvsad_err_cost(&this_mv, &fcenter_mv,
+ MVSADCOSTS, sad_per_bit);
+
+ if (sad_array[t] < bestsad) {
+ bestsad = sad_array[t];
+ best_site = i;
+ }
+ }
+ }
+ }
+ } else {
+ for (j = 0; j < x->searches_per_step; j++) {
+ // Trap illegal vectors
+ this_row_offset = best_mv->as_mv.row + ss[i].mv.row;
+ this_col_offset = best_mv->as_mv.col + ss[i].mv.col;
+
+ if ((this_col_offset > x->mv_col_min) && (this_col_offset < x->mv_col_max) &&
+ (this_row_offset > x->mv_row_min) && (this_row_offset < x->mv_row_max)) {
+ check_here = ss[i].offset + best_address;
+ thissad = fn_ptr->sdf(what, what_stride, check_here, in_what_stride, bestsad);
+
+ if (thissad < bestsad) {
+ this_mv.as_mv.row = this_row_offset;
+ this_mv.as_mv.col = this_col_offset;
+ thissad += mvsad_err_cost(&this_mv, &fcenter_mv,
+ MVSADCOSTS, sad_per_bit);
+
+ if (thissad < bestsad) {
+ bestsad = thissad;
+ best_site = i;
+ }
+ }
+ }
+ i++;
+ }
+ }
+
+ if (best_site != last_site) {
+ best_mv->as_mv.row += ss[best_site].mv.row;
+ best_mv->as_mv.col += ss[best_site].mv.col;
+ best_address += ss[best_site].offset;
+ last_site = best_site;
+ } else if (best_address == in_what)
+ (*num00)++;
+ }
+
+ this_mv.as_mv.row = best_mv->as_mv.row << 3;
+ this_mv.as_mv.col = best_mv->as_mv.col << 3;
+
+ if (bestsad == INT_MAX)
+ return INT_MAX;
+
+ return
+ fn_ptr->vf(what, what_stride, best_address, in_what_stride,
+ (unsigned int *)(&thissad)) +
+ mv_err_cost(&this_mv, center_mv, MVCOSTS, x->errorperbit,
+ xd->allow_high_precision_mv);
+}
+
+/* do_refine: If last step (1-away) of n-step search doesn't pick the center
+ point as the best match, we will do a final 1-away diamond
+ refining search */
+int vp9_full_pixel_diamond(VP9_COMP *cpi, MACROBLOCK *x, BLOCK *b,
+ BLOCKD *d, int_mv *mvp_full, int step_param,
+ int sadpb, int further_steps,
+ int do_refine, vp9_variance_fn_ptr_t *fn_ptr,
+ int_mv *ref_mv, int_mv *dst_mv) {
+ int_mv temp_mv;
+ int thissme, n, num00;
+ int bestsme = cpi->diamond_search_sad(x, b, d, mvp_full, &temp_mv,
+ step_param, sadpb, &num00,
+ fn_ptr, XMVCOST, ref_mv);
+ dst_mv->as_int = temp_mv.as_int;
+
+ n = num00;
+ num00 = 0;
+
+ /* If there won't be more n-step search, check to see if refining search is needed. */
+ if (n > further_steps)
+ do_refine = 0;
+
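+  /* num00 counts search stages in which the centre never moved; the loop below
+     skips that many restarts at smaller initial step sizes. */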
+ while (n < further_steps) {
+ n++;
+
+ if (num00)
+ num00--;
+ else {
+ thissme = cpi->diamond_search_sad(x, b, d, mvp_full, &temp_mv,
+ step_param + n, sadpb, &num00,
+ fn_ptr, XMVCOST, ref_mv);
+
+ /* check to see if refining search is needed. */
+ if (num00 > (further_steps - n))
+ do_refine = 0;
+
+ if (thissme < bestsme) {
+ bestsme = thissme;
+ dst_mv->as_int = temp_mv.as_int;
+ }
+ }
+ }
+
+ /* final 1-away diamond refining search */
+ if (do_refine == 1) {
+ int search_range = 8;
+ int_mv best_mv;
+ best_mv.as_int = dst_mv->as_int;
+ thissme = cpi->refining_search_sad(x, b, d, &best_mv, sadpb, search_range,
+ fn_ptr, XMVCOST, ref_mv);
+
+ if (thissme < bestsme) {
+ bestsme = thissme;
+ dst_mv->as_int = best_mv.as_int;
+ }
+ }
+ return bestsme;
+}
+
+int vp9_full_search_sad(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *ref_mv,
+ int sad_per_bit, int distance,
+ vp9_variance_fn_ptr_t *fn_ptr, DEC_MVCOSTS,
+ int_mv *center_mv) {
+ unsigned char *what = (*(b->base_src) + b->src);
+ int what_stride = b->src_stride;
+ unsigned char *in_what;
+ int in_what_stride = d->pre_stride;
+ int mv_stride = d->pre_stride;
+ unsigned char *bestaddress;
+ int_mv *best_mv = &d->bmi.as_mv.first;
+ int_mv this_mv;
+ int bestsad = INT_MAX;
+ int r, c;
+
+ unsigned char *check_here;
+ int thissad;
+ MACROBLOCKD *xd = &x->e_mbd;
+
+ int ref_row = ref_mv->as_mv.row;
+ int ref_col = ref_mv->as_mv.col;
+
+ int row_min = ref_row - distance;
+ int row_max = ref_row + distance;
+ int col_min = ref_col - distance;
+ int col_max = ref_col + distance;
+ int_mv fcenter_mv;
+
+ int *mvjsadcost = x->nmvjointsadcost;
+ int *mvsadcost[2] = {x->nmvsadcost[0], x->nmvsadcost[1]};
+
+ fcenter_mv.as_mv.row = center_mv->as_mv.row >> 3;
+ fcenter_mv.as_mv.col = center_mv->as_mv.col >> 3;
+
+ // Work out the mid point for the search
+ in_what = *(d->base_pre) + d->pre;
+ bestaddress = in_what + (ref_row * d->pre_stride) + ref_col;
+
+ best_mv->as_mv.row = ref_row;
+ best_mv->as_mv.col = ref_col;
+
+ // Baseline value at the centre
+ bestsad = fn_ptr->sdf(what, what_stride, bestaddress,
+ in_what_stride, 0x7fffffff)
+ + mvsad_err_cost(best_mv, &fcenter_mv, MVSADCOSTS, sad_per_bit);
+
+  // Apply further limits to prevent us searching with vectors that stretch beyond the UMV border
+ if (col_min < x->mv_col_min)
+ col_min = x->mv_col_min;
+
+ if (col_max > x->mv_col_max)
+ col_max = x->mv_col_max;
+
+ if (row_min < x->mv_row_min)
+ row_min = x->mv_row_min;
+
+ if (row_max > x->mv_row_max)
+ row_max = x->mv_row_max;
+
+ for (r = row_min; r < row_max; r++) {
+ this_mv.as_mv.row = r;
+ check_here = r * mv_stride + in_what + col_min;
+
+ for (c = col_min; c < col_max; c++) {
+ thissad = fn_ptr->sdf(what, what_stride, check_here, in_what_stride, bestsad);
+
+ this_mv.as_mv.col = c;
+ thissad += mvsad_err_cost(&this_mv, &fcenter_mv,
+ MVSADCOSTS, sad_per_bit);
+
+ if (thissad < bestsad) {
+ bestsad = thissad;
+ best_mv->as_mv.row = r;
+ best_mv->as_mv.col = c;
+ bestaddress = check_here;
+ }
+
+ check_here++;
+ }
+ }
+
+ this_mv.as_mv.row = best_mv->as_mv.row << 3;
+ this_mv.as_mv.col = best_mv->as_mv.col << 3;
+
+ if (bestsad < INT_MAX)
+ return
+ fn_ptr->vf(what, what_stride, bestaddress, in_what_stride,
+ (unsigned int *)(&thissad)) +
+ mv_err_cost(&this_mv, center_mv, MVCOSTS, x->errorperbit,
+ xd->allow_high_precision_mv);
+ else
+ return INT_MAX;
+}
+
+int vp9_full_search_sadx3(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *ref_mv,
+ int sad_per_bit, int distance,
+ vp9_variance_fn_ptr_t *fn_ptr, DEC_MVCOSTS,
+ int_mv *center_mv) {
+ unsigned char *what = (*(b->base_src) + b->src);
+ int what_stride = b->src_stride;
+ unsigned char *in_what;
+ int in_what_stride = d->pre_stride;
+ int mv_stride = d->pre_stride;
+ unsigned char *bestaddress;
+ int_mv *best_mv = &d->bmi.as_mv.first;
+ int_mv this_mv;
+ int bestsad = INT_MAX;
+ int r, c;
+
+ unsigned char *check_here;
+ unsigned int thissad;
+ MACROBLOCKD *xd = &x->e_mbd;
+
+ int ref_row = ref_mv->as_mv.row;
+ int ref_col = ref_mv->as_mv.col;
+
+ int row_min = ref_row - distance;
+ int row_max = ref_row + distance;
+ int col_min = ref_col - distance;
+ int col_max = ref_col + distance;
+
+ unsigned int sad_array[3];
+ int_mv fcenter_mv;
+
+ int *mvjsadcost = x->nmvjointsadcost;
+ int *mvsadcost[2] = {x->nmvsadcost[0], x->nmvsadcost[1]};
+
+ fcenter_mv.as_mv.row = center_mv->as_mv.row >> 3;
+ fcenter_mv.as_mv.col = center_mv->as_mv.col >> 3;
+
+ // Work out the mid point for the search
+ in_what = *(d->base_pre) + d->pre;
+ bestaddress = in_what + (ref_row * d->pre_stride) + ref_col;
+
+ best_mv->as_mv.row = ref_row;
+ best_mv->as_mv.col = ref_col;
+
+ // Baseline value at the centre
+ bestsad = fn_ptr->sdf(what, what_stride,
+ bestaddress, in_what_stride, 0x7fffffff)
+ + mvsad_err_cost(best_mv, &fcenter_mv, MVSADCOSTS, sad_per_bit);
+
+  // Apply further limits to prevent us searching with vectors that stretch beyond the UMV border
+ if (col_min < x->mv_col_min)
+ col_min = x->mv_col_min;
+
+ if (col_max > x->mv_col_max)
+ col_max = x->mv_col_max;
+
+ if (row_min < x->mv_row_min)
+ row_min = x->mv_row_min;
+
+ if (row_max > x->mv_row_max)
+ row_max = x->mv_row_max;
+
+ for (r = row_min; r < row_max; r++) {
+ this_mv.as_mv.row = r;
+ check_here = r * mv_stride + in_what + col_min;
+ c = col_min;
+
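+    // Evaluate three consecutive columns at a time with the 3-wide SAD kernel,
+    // then fall back to single-point SAD for any remaining columns in the row.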
+ while ((c + 2) < col_max) {
+ int i;
+
+ fn_ptr->sdx3f(what, what_stride, check_here, in_what_stride, sad_array);
+
+ for (i = 0; i < 3; i++) {
+ thissad = sad_array[i];
+
+ if (thissad < bestsad) {
+ this_mv.as_mv.col = c;
+ thissad += mvsad_err_cost(&this_mv, &fcenter_mv,
+ MVSADCOSTS, sad_per_bit);
+
+ if (thissad < bestsad) {
+ bestsad = thissad;
+ best_mv->as_mv.row = r;
+ best_mv->as_mv.col = c;
+ bestaddress = check_here;
+ }
+ }
+
+ check_here++;
+ c++;
+ }
+ }
+
+ while (c < col_max) {
+ thissad = fn_ptr->sdf(what, what_stride, check_here, in_what_stride, bestsad);
+
+ if (thissad < bestsad) {
+ this_mv.as_mv.col = c;
+ thissad += mvsad_err_cost(&this_mv, &fcenter_mv,
+ MVSADCOSTS, sad_per_bit);
+
+ if (thissad < bestsad) {
+ bestsad = thissad;
+ best_mv->as_mv.row = r;
+ best_mv->as_mv.col = c;
+ bestaddress = check_here;
+ }
+ }
+
+ check_here++;
+ c++;
+ }
+
+ }
+
+ this_mv.as_mv.row = best_mv->as_mv.row << 3;
+ this_mv.as_mv.col = best_mv->as_mv.col << 3;
+
+ if (bestsad < INT_MAX)
+ return
+ fn_ptr->vf(what, what_stride, bestaddress, in_what_stride,
+ (unsigned int *)(&thissad)) +
+ mv_err_cost(&this_mv, center_mv, MVCOSTS, x->errorperbit,
+ xd->allow_high_precision_mv);
+ else
+ return INT_MAX;
+}
+
+int vp9_full_search_sadx8(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *ref_mv,
+ int sad_per_bit, int distance,
+ vp9_variance_fn_ptr_t *fn_ptr,
+ DEC_MVCOSTS,
+ int_mv *center_mv) {
+ unsigned char *what = (*(b->base_src) + b->src);
+ int what_stride = b->src_stride;
+ unsigned char *in_what;
+ int in_what_stride = d->pre_stride;
+ int mv_stride = d->pre_stride;
+ unsigned char *bestaddress;
+ int_mv *best_mv = &d->bmi.as_mv.first;
+ int_mv this_mv;
+ int bestsad = INT_MAX;
+ int r, c;
+
+ unsigned char *check_here;
+ unsigned int thissad;
+ MACROBLOCKD *xd = &x->e_mbd;
+
+ int ref_row = ref_mv->as_mv.row;
+ int ref_col = ref_mv->as_mv.col;
+
+ int row_min = ref_row - distance;
+ int row_max = ref_row + distance;
+ int col_min = ref_col - distance;
+ int col_max = ref_col + distance;
+
+ DECLARE_ALIGNED_ARRAY(16, unsigned short, sad_array8, 8);
+ unsigned int sad_array[3];
+ int_mv fcenter_mv;
+
+ int *mvjsadcost = x->nmvjointsadcost;
+ int *mvsadcost[2] = {x->nmvsadcost[0], x->nmvsadcost[1]};
+
+ fcenter_mv.as_mv.row = center_mv->as_mv.row >> 3;
+ fcenter_mv.as_mv.col = center_mv->as_mv.col >> 3;
+
+ // Work out the mid point for the search
+ in_what = *(d->base_pre) + d->pre;
+ bestaddress = in_what + (ref_row * d->pre_stride) + ref_col;
+
+ best_mv->as_mv.row = ref_row;
+ best_mv->as_mv.col = ref_col;
+
+ // Baseline value at the centre
+ bestsad = fn_ptr->sdf(what, what_stride,
+ bestaddress, in_what_stride, 0x7fffffff)
+ + mvsad_err_cost(best_mv, &fcenter_mv, MVSADCOSTS, sad_per_bit);
+
+  // Apply further limits to prevent us searching with vectors that stretch beyond the UMV border
+ if (col_min < x->mv_col_min)
+ col_min = x->mv_col_min;
+
+ if (col_max > x->mv_col_max)
+ col_max = x->mv_col_max;
+
+ if (row_min < x->mv_row_min)
+ row_min = x->mv_row_min;
+
+ if (row_max > x->mv_row_max)
+ row_max = x->mv_row_max;
+
+ for (r = row_min; r < row_max; r++) {
+ this_mv.as_mv.row = r;
+ check_here = r * mv_stride + in_what + col_min;
+ c = col_min;
+
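+    // Process the row with the 8-wide SAD kernel first, then the 3-wide kernel,
+    // and finally single-point SAD for any leftover columns.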
+ while ((c + 7) < col_max) {
+ int i;
+
+ fn_ptr->sdx8f(what, what_stride, check_here, in_what_stride, sad_array8);
+
+ for (i = 0; i < 8; i++) {
+ thissad = (unsigned int)sad_array8[i];
+
+ if (thissad < bestsad) {
+ this_mv.as_mv.col = c;
+ thissad += mvsad_err_cost(&this_mv, &fcenter_mv,
+ MVSADCOSTS, sad_per_bit);
+
+ if (thissad < bestsad) {
+ bestsad = thissad;
+ best_mv->as_mv.row = r;
+ best_mv->as_mv.col = c;
+ bestaddress = check_here;
+ }
+ }
+
+ check_here++;
+ c++;
+ }
+ }
+
+ while ((c + 2) < col_max) {
+ int i;
+
+ fn_ptr->sdx3f(what, what_stride, check_here, in_what_stride, sad_array);
+
+ for (i = 0; i < 3; i++) {
+ thissad = sad_array[i];
+
+ if (thissad < bestsad) {
+ this_mv.as_mv.col = c;
+ thissad += mvsad_err_cost(&this_mv, &fcenter_mv,
+ MVSADCOSTS, sad_per_bit);
+
+ if (thissad < bestsad) {
+ bestsad = thissad;
+ best_mv->as_mv.row = r;
+ best_mv->as_mv.col = c;
+ bestaddress = check_here;
+ }
+ }
+
+ check_here++;
+ c++;
+ }
+ }
+
+ while (c < col_max) {
+ thissad = fn_ptr->sdf(what, what_stride, check_here, in_what_stride, bestsad);
+
+ if (thissad < bestsad) {
+ this_mv.as_mv.col = c;
+ thissad += mvsad_err_cost(&this_mv, &fcenter_mv,
+ MVSADCOSTS, sad_per_bit);
+
+ if (thissad < bestsad) {
+ bestsad = thissad;
+ best_mv->as_mv.row = r;
+ best_mv->as_mv.col = c;
+ bestaddress = check_here;
+ }
+ }
+
+ check_here++;
+ c++;
+ }
+ }
+
+ this_mv.as_mv.row = best_mv->as_mv.row << 3;
+ this_mv.as_mv.col = best_mv->as_mv.col << 3;
+
+ if (bestsad < INT_MAX)
+ return
+ fn_ptr->vf(what, what_stride, bestaddress, in_what_stride,
+ (unsigned int *)(&thissad)) +
+ mv_err_cost(&this_mv, center_mv, MVCOSTS, x->errorperbit,
+ xd->allow_high_precision_mv);
+ else
+ return INT_MAX;
+}
+
+int vp9_refining_search_sad(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *ref_mv,
+ int error_per_bit, int search_range,
+ vp9_variance_fn_ptr_t *fn_ptr, DEC_MVCOSTS,
+ int_mv *center_mv) {
+ MV neighbors[4] = {{ -1, 0}, {0, -1}, {0, 1}, {1, 0}};
+ int i, j;
+ short this_row_offset, this_col_offset;
+
+ int what_stride = b->src_stride;
+ int in_what_stride = d->pre_stride;
+ unsigned char *what = (*(b->base_src) + b->src);
+ unsigned char *best_address = (unsigned char *)(*(d->base_pre) + d->pre +
+ (ref_mv->as_mv.row * (d->pre_stride)) + ref_mv->as_mv.col);
+ unsigned char *check_here;
+ unsigned int thissad;
+ int_mv this_mv;
+ unsigned int bestsad = INT_MAX;
+ MACROBLOCKD *xd = &x->e_mbd;
+ int_mv fcenter_mv;
+
+ int *mvjsadcost = x->nmvjointsadcost;
+ int *mvsadcost[2] = {x->nmvsadcost[0], x->nmvsadcost[1]};
+
+ fcenter_mv.as_mv.row = center_mv->as_mv.row >> 3;
+ fcenter_mv.as_mv.col = center_mv->as_mv.col >> 3;
+
+ bestsad = fn_ptr->sdf(what, what_stride, best_address, in_what_stride, 0x7fffffff) +
+ mvsad_err_cost(ref_mv, &fcenter_mv, MVSADCOSTS, error_per_bit);
+
+ for (i = 0; i < search_range; i++) {
+ int best_site = -1;
+
+ for (j = 0; j < 4; j++) {
+ this_row_offset = ref_mv->as_mv.row + neighbors[j].row;
+ this_col_offset = ref_mv->as_mv.col + neighbors[j].col;
+
+ if ((this_col_offset > x->mv_col_min) && (this_col_offset < x->mv_col_max) &&
+ (this_row_offset > x->mv_row_min) && (this_row_offset < x->mv_row_max)) {
+ check_here = (neighbors[j].row) * in_what_stride + neighbors[j].col + best_address;
+ thissad = fn_ptr->sdf(what, what_stride, check_here, in_what_stride, bestsad);
+
+ if (thissad < bestsad) {
+ this_mv.as_mv.row = this_row_offset;
+ this_mv.as_mv.col = this_col_offset;
+ thissad += mvsad_err_cost(&this_mv, &fcenter_mv, MVSADCOSTS, error_per_bit);
+
+ if (thissad < bestsad) {
+ bestsad = thissad;
+ best_site = j;
+ }
+ }
+ }
+ }
+
+ if (best_site == -1)
+ break;
+ else {
+ ref_mv->as_mv.row += neighbors[best_site].row;
+ ref_mv->as_mv.col += neighbors[best_site].col;
+ best_address += (neighbors[best_site].row) * in_what_stride + neighbors[best_site].col;
+ }
+ }
+
+ this_mv.as_mv.row = ref_mv->as_mv.row << 3;
+ this_mv.as_mv.col = ref_mv->as_mv.col << 3;
+
+ if (bestsad < INT_MAX)
+ return
+ fn_ptr->vf(what, what_stride, best_address, in_what_stride,
+ (unsigned int *)(&thissad)) +
+ mv_err_cost(&this_mv, center_mv, MVCOSTS, x->errorperbit,
+ xd->allow_high_precision_mv);
+ else
+ return INT_MAX;
+}
+
+int vp9_refining_search_sadx4(MACROBLOCK *x, BLOCK *b, BLOCKD *d,
+ int_mv *ref_mv, int error_per_bit,
+ int search_range, vp9_variance_fn_ptr_t *fn_ptr,
+ DEC_MVCOSTS, int_mv *center_mv) {
+ MV neighbors[4] = {{ -1, 0}, {0, -1}, {0, 1}, {1, 0}};
+ int i, j;
+ short this_row_offset, this_col_offset;
+
+ int what_stride = b->src_stride;
+ int in_what_stride = d->pre_stride;
+ unsigned char *what = (*(b->base_src) + b->src);
+ unsigned char *best_address = (unsigned char *)(*(d->base_pre) + d->pre +
+ (ref_mv->as_mv.row * (d->pre_stride)) + ref_mv->as_mv.col);
+ unsigned char *check_here;
+ unsigned int thissad;
+ int_mv this_mv;
+ unsigned int bestsad = INT_MAX;
+ MACROBLOCKD *xd = &x->e_mbd;
+ int_mv fcenter_mv;
+
+ int *mvjsadcost = x->nmvjointsadcost;
+ int *mvsadcost[2] = {x->nmvsadcost[0], x->nmvsadcost[1]};
+
+ fcenter_mv.as_mv.row = center_mv->as_mv.row >> 3;
+ fcenter_mv.as_mv.col = center_mv->as_mv.col >> 3;
+
+ bestsad = fn_ptr->sdf(what, what_stride, best_address, in_what_stride, 0x7fffffff) +
+ mvsad_err_cost(ref_mv, &fcenter_mv, MVSADCOSTS, error_per_bit);
+
+ for (i = 0; i < search_range; i++) {
+ int best_site = -1;
+ int all_in = 1;
+
+ all_in &= ((ref_mv->as_mv.row - 1) > x->mv_row_min);
+ all_in &= ((ref_mv->as_mv.row + 1) < x->mv_row_max);
+ all_in &= ((ref_mv->as_mv.col - 1) > x->mv_col_min);
+ all_in &= ((ref_mv->as_mv.col + 1) < x->mv_col_max);
+
+ if (all_in) {
+ unsigned int sad_array[4];
+ unsigned char *block_offset[4];
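+      // Offsets of the four candidates in the same order as neighbors[]:
+      // above, left, right, below.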
+ block_offset[0] = best_address - in_what_stride;
+ block_offset[1] = best_address - 1;
+ block_offset[2] = best_address + 1;
+ block_offset[3] = best_address + in_what_stride;
+
+ fn_ptr->sdx4df(what, what_stride, block_offset, in_what_stride, sad_array);
+
+ for (j = 0; j < 4; j++) {
+ if (sad_array[j] < bestsad) {
+ this_mv.as_mv.row = ref_mv->as_mv.row + neighbors[j].row;
+ this_mv.as_mv.col = ref_mv->as_mv.col + neighbors[j].col;
+ sad_array[j] += mvsad_err_cost(&this_mv, &fcenter_mv, MVSADCOSTS, error_per_bit);
+
+ if (sad_array[j] < bestsad) {
+ bestsad = sad_array[j];
+ best_site = j;
+ }
+ }
+ }
+ } else {
+ for (j = 0; j < 4; j++) {
+ this_row_offset = ref_mv->as_mv.row + neighbors[j].row;
+ this_col_offset = ref_mv->as_mv.col + neighbors[j].col;
+
+ if ((this_col_offset > x->mv_col_min) && (this_col_offset < x->mv_col_max) &&
+ (this_row_offset > x->mv_row_min) && (this_row_offset < x->mv_row_max)) {
+ check_here = (neighbors[j].row) * in_what_stride + neighbors[j].col + best_address;
+ thissad = fn_ptr->sdf(what, what_stride, check_here, in_what_stride, bestsad);
+
+ if (thissad < bestsad) {
+ this_mv.as_mv.row = this_row_offset;
+ this_mv.as_mv.col = this_col_offset;
+ thissad += mvsad_err_cost(&this_mv, &fcenter_mv, MVSADCOSTS, error_per_bit);
+
+ if (thissad < bestsad) {
+ bestsad = thissad;
+ best_site = j;
+ }
+ }
+ }
+ }
+ }
+
+ if (best_site == -1)
+ break;
+ else {
+ ref_mv->as_mv.row += neighbors[best_site].row;
+ ref_mv->as_mv.col += neighbors[best_site].col;
+ best_address += (neighbors[best_site].row) * in_what_stride + neighbors[best_site].col;
+ }
+ }
+
+ this_mv.as_mv.row = ref_mv->as_mv.row << 3;
+ this_mv.as_mv.col = ref_mv->as_mv.col << 3;
+
+ if (bestsad < INT_MAX)
+ return
+ fn_ptr->vf(what, what_stride, best_address, in_what_stride,
+ (unsigned int *)(&thissad)) +
+ mv_err_cost(&this_mv, center_mv, MVCOSTS, x->errorperbit,
+ xd->allow_high_precision_mv);
+ else
+ return INT_MAX;
+}
+
+#ifdef ENTROPY_STATS
+void print_mode_context(void) {
+ FILE *f = fopen("modecont.c", "a");
+ int i, j;
+
+ fprintf(f, "#include \"entropy.h\"\n");
+ fprintf(f, "const int vp9_mode_contexts[6][4] =");
+ fprintf(f, "{\n");
+ for (j = 0; j < 6; j++) {
+ fprintf(f, " {/* %d */ ", j);
+ fprintf(f, " ");
+ for (i = 0; i < 4; i++) {
+ int this_prob;
+ int count;
+
+ // context probs
+ count = mv_ref_ct[j][i][0] + mv_ref_ct[j][i][1];
+ if (count)
+ this_prob = 256 * mv_ref_ct[j][i][0] / count;
+ else
+ this_prob = 128;
+
+ if (this_prob == 0)
+ this_prob = 1;
+ fprintf(f, "%5d, ", this_prob);
+ }
+ fprintf(f, " },\n");
+ }
+
+ fprintf(f, "};\n");
+ fclose(f);
+}
+
+/* MV ref count ENTROPY_STATS stats code */
+void init_mv_ref_counts() {
+ vpx_memset(mv_ref_ct, 0, sizeof(mv_ref_ct));
+ vpx_memset(mv_mode_cts, 0, sizeof(mv_mode_cts));
+}
+
+void accum_mv_refs(MB_PREDICTION_MODE m, const int ct[4]) {
+ if (m == ZEROMV) {
+ ++mv_ref_ct [ct[0]] [0] [0];
+ ++mv_mode_cts[0][0];
+ } else {
+ ++mv_ref_ct [ct[0]] [0] [1];
+ ++mv_mode_cts[0][1];
+
+ if (m == NEARESTMV) {
+ ++mv_ref_ct [ct[1]] [1] [0];
+ ++mv_mode_cts[1][0];
+ } else {
+ ++mv_ref_ct [ct[1]] [1] [1];
+ ++mv_mode_cts[1][1];
+
+ if (m == NEARMV) {
+ ++mv_ref_ct [ct[2]] [2] [0];
+ ++mv_mode_cts[2][0];
+ } else {
+ ++mv_ref_ct [ct[2]] [2] [1];
+ ++mv_mode_cts[2][1];
+
+ if (m == NEWMV) {
+ ++mv_ref_ct [ct[3]] [3] [0];
+ ++mv_mode_cts[3][0];
+ } else {
+ ++mv_ref_ct [ct[3]] [3] [1];
+ ++mv_mode_cts[3][1];
+ }
+ }
+ }
+ }
+}
+
+#endif  /* END MV ref count ENTROPY_STATS stats code */
--- /dev/null
+++ b/vp9/encoder/mcomp.h
@@ -1,0 +1,159 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef __INC_MCOMP_H
+#define __INC_MCOMP_H
+
+#include "block.h"
+#include "variance.h"
+
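+// Shorthand macros for declaring and passing the motion vector cost tables
+// (joint cost plus per-component cost) through the search functions.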
+#define MVCOSTS mvjcost, mvcost
+#define MVSADCOSTS mvjsadcost, mvsadcost
+#define DEC_MVCOSTS int *mvjcost, int *mvcost[2]
+#define DEC_MVSADCOSTS int *mvjsadcost, int *mvsadcost[2]
+#define NULLMVCOST NULL, NULL
+#define XMVCOST x->nmvjointcost, (x->e_mbd.allow_high_precision_mv?x->nmvcost_hp:x->nmvcost)
+
+#ifdef ENTROPY_STATS
+extern void init_mv_ref_counts();
+extern void accum_mv_refs(MB_PREDICTION_MODE, const int near_mv_ref_cts[4]);
+#endif
+
+
+#define MAX_MVSEARCH_STEPS 8 // The maximum number of steps in a step search given the largest allowed initial step
+#define MAX_FULL_PEL_VAL ((1 << (MAX_MVSEARCH_STEPS)) - 1) // Max full pel mv specified in 1 pel units
+#define MAX_FIRST_STEP (1 << (MAX_MVSEARCH_STEPS-1)) // Maximum size of the first step in full pel units
+
+extern void vp9_clamp_mv_min_max(MACROBLOCK *x, int_mv *ref_mv);
+extern int vp9_mv_bit_cost(int_mv *mv, int_mv *ref, DEC_MVCOSTS,
+ int Weight, int ishp);
+extern void vp9_init_dsmotion_compensation(MACROBLOCK *x, int stride);
+extern void vp9_init3smotion_compensation(MACROBLOCK *x, int stride);
+// Runs a sequence of diamond searches with progressively smaller step sizes for RD
+struct VP9_COMP;
+int vp9_full_pixel_diamond(struct VP9_COMP *cpi, MACROBLOCK *x, BLOCK *b,
+ BLOCKD *d, int_mv *mvp_full, int step_param,
+ int sadpb, int further_steps, int do_refine,
+ vp9_variance_fn_ptr_t *fn_ptr,
+ int_mv *ref_mv, int_mv *dst_mv);
+
+extern int vp9_hex_search
+(
+ MACROBLOCK *x,
+ BLOCK *b,
+ BLOCKD *d,
+ int_mv *ref_mv,
+ int_mv *best_mv,
+ int search_param,
+ int error_per_bit,
+ const vp9_variance_fn_ptr_t *vf,
+ DEC_MVSADCOSTS,
+ DEC_MVCOSTS,
+ int_mv *center_mv
+);
+
+typedef int (fractional_mv_step_fp)
+(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *bestmv, int_mv *ref_mv,
+ int error_per_bit, const vp9_variance_fn_ptr_t *vfp, DEC_MVCOSTS,
+ int *distortion, unsigned int *sse);
+extern fractional_mv_step_fp vp9_find_best_sub_pixel_step_iteratively;
+extern fractional_mv_step_fp vp9_find_best_sub_pixel_step;
+extern fractional_mv_step_fp vp9_find_best_half_pixel_step;
+
+#define prototype_full_search_sad(sym)\
+ int (sym)\
+ (\
+ MACROBLOCK *x, \
+ BLOCK *b, \
+ BLOCKD *d, \
+ int_mv *ref_mv, \
+ int sad_per_bit, \
+ int distance, \
+ vp9_variance_fn_ptr_t *fn_ptr, \
+ DEC_MVSADCOSTS, \
+ int_mv *center_mv \
+ )
+
+#define prototype_refining_search_sad(sym)\
+ int (sym)\
+ (\
+ MACROBLOCK *x, \
+ BLOCK *b, \
+ BLOCKD *d, \
+ int_mv *ref_mv, \
+ int sad_per_bit, \
+ int distance, \
+ vp9_variance_fn_ptr_t *fn_ptr, \
+ DEC_MVSADCOSTS, \
+ int_mv *center_mv \
+ )
+
+#define prototype_diamond_search_sad(sym)\
+ int (sym)\
+ (\
+ MACROBLOCK *x, \
+ BLOCK *b, \
+ BLOCKD *d, \
+ int_mv *ref_mv, \
+ int_mv *best_mv, \
+ int search_param, \
+ int sad_per_bit, \
+ int *num00, \
+ vp9_variance_fn_ptr_t *fn_ptr, \
+ DEC_MVSADCOSTS, \
+ int_mv *center_mv \
+ )
+
+#if ARCH_X86 || ARCH_X86_64
+#include "x86/mcomp_x86.h"
+#endif
+
+typedef prototype_full_search_sad(*vp9_full_search_fn_t);
+extern prototype_full_search_sad(vp9_full_search_sad);
+extern prototype_full_search_sad(vp9_full_search_sadx3);
+extern prototype_full_search_sad(vp9_full_search_sadx8);
+
+typedef prototype_refining_search_sad(*vp9_refining_search_fn_t);
+extern prototype_refining_search_sad(vp9_refining_search_sad);
+extern prototype_refining_search_sad(vp9_refining_search_sadx4);
+
+typedef prototype_diamond_search_sad(*vp9_diamond_search_fn_t);
+extern prototype_diamond_search_sad(vp9_diamond_search_sad);
+extern prototype_diamond_search_sad(vp9_diamond_search_sadx4);
+
+#ifndef vp9_search_full_search
+#define vp9_search_full_search vp9_full_search_sad
+#endif
+extern prototype_full_search_sad(vp9_search_full_search);
+
+#ifndef vp9_search_refining_search
+#define vp9_search_refining_search vp9_refining_search_sad
+#endif
+extern prototype_refining_search_sad(vp9_search_refining_search);
+
+#ifndef vp9_search_diamond_search
+#define vp9_search_diamond_search vp9_diamond_search_sad
+#endif
+extern prototype_diamond_search_sad(vp9_search_diamond_search);
+
+typedef struct {
+ prototype_full_search_sad(*full_search);
+ prototype_refining_search_sad(*refining_search);
+ prototype_diamond_search_sad(*diamond_search);
+} vp9_search_rtcd_vtable_t;
+
+#if CONFIG_RUNTIME_CPU_DETECT
+#define SEARCH_INVOKE(ctx,fn) (ctx)->fn
+#else
+#define SEARCH_INVOKE(ctx,fn) vp9_search_##fn
+#endif
+
+#endif
--- /dev/null
+++ b/vp9/encoder/modecosts.c
@@ -1,0 +1,49 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "vp9/common/blockd.h"
+#include "onyx_int.h"
+#include "treewriter.h"
+#include "vp9/common/entropymode.h"
+
+
+void vp9_init_mode_costs(VP9_COMP *c) {
+ VP9_COMMON *x = &c->common;
+ const vp9_tree_p T = vp9_bmode_tree;
+ int i, j;
+
+ for (i = 0; i < VP9_BINTRAMODES; i++) {
+ for (j = 0; j < VP9_BINTRAMODES; j++) {
+ vp9_cost_tokens((int *)c->mb.bmode_costs[i][j],
+ x->kf_bmode_prob[i][j], T);
+ }
+ }
+
+ vp9_cost_tokens((int *)c->mb.inter_bmode_costs, x->fc.bmode_prob, T);
+ vp9_cost_tokens((int *)c->mb.inter_bmode_costs,
+ x->fc.sub_mv_ref_prob[0], vp9_sub_mv_ref_tree);
+
+ vp9_cost_tokens(c->mb.mbmode_cost[1], x->fc.ymode_prob, vp9_ymode_tree);
+ vp9_cost_tokens(c->mb.mbmode_cost[0],
+ x->kf_ymode_prob[c->common.kf_ymode_probs_index],
+ vp9_kf_ymode_tree);
+ vp9_cost_tokens(c->mb.intra_uv_mode_cost[1],
+ x->fc.uv_mode_prob[VP9_YMODES - 1], vp9_uv_mode_tree);
+ vp9_cost_tokens(c->mb.intra_uv_mode_cost[0],
+ x->kf_uv_mode_prob[VP9_YMODES - 1], vp9_uv_mode_tree);
+ vp9_cost_tokens(c->mb.i8x8_mode_costs,
+ x->fc.i8x8_mode_prob, vp9_i8x8_mode_tree);
+
+ for (i = 0; i <= VP9_SWITCHABLE_FILTERS; ++i)
+ vp9_cost_tokens((int *)c->mb.switchable_interp_costs[i],
+ x->fc.switchable_interp_prob[i],
+ vp9_switchable_interp_tree);
+}
--- /dev/null
+++ b/vp9/encoder/modecosts.h
@@ -1,0 +1,17 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef __INC_MODECOSTS_H
+#define __INC_MODECOSTS_H
+
+void vp9_init_mode_costs(VP9_COMP *x);
+
+#endif
--- /dev/null
+++ b/vp9/encoder/onyx_if.c
@@ -1,0 +1,4486 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "vpx_config.h"
+#include "vp9/common/onyxc_int.h"
+#include "onyx_int.h"
+#include "vp9/common/systemdependent.h"
+#include "quantize.h"
+#include "vp9/common/alloccommon.h"
+#include "mcomp.h"
+#include "firstpass.h"
+#include "psnr.h"
+#include "vpx_scale/vpxscale.h"
+#include "vp9/common/extend.h"
+#include "ratectrl.h"
+#include "vp9/common/quant_common.h"
+#include "segmentation.h"
+#include "vpx_scale/yv12extend.h"
+#if CONFIG_POSTPROC
+#include "vp9/common/postproc.h"
+#endif
+#include "vpx_mem/vpx_mem.h"
+#include "vp9/common/swapyv12buffer.h"
+#include "vpx_ports/vpx_timer.h"
+#include "temporal_filter.h"
+
+#include "vp9/common/seg_common.h"
+#include "mbgraph.h"
+#include "vp9/common/pred_common.h"
+#include "vp9/encoder/rdopt.h"
+#include "bitstream.h"
+#include "ratectrl.h"
+
+#if CONFIG_NEWBESTREFMV
+#include "vp9/common/mvref_common.h"
+#endif
+
+#if ARCH_ARM
+#include "vpx_ports/arm.h"
+#endif
+
+#include <math.h>
+#include <stdio.h>
+#include <limits.h>
+
+#if CONFIG_RUNTIME_CPU_DETECT
+#define IF_RTCD(x) (x)
+#define RTCD(x) &cpi->common.rtcd.x
+#else
+#define IF_RTCD(x) NULL
+#define RTCD(x) NULL
+#endif
+
+extern void vp9_pick_filter_level_fast(YV12_BUFFER_CONFIG *sd, VP9_COMP *cpi);
+
+extern void vp9_set_alt_lf_level(VP9_COMP *cpi, int filt_val);
+
+extern void vp9_pick_filter_level(YV12_BUFFER_CONFIG *sd, VP9_COMP *cpi);
+
+extern void vp9_cmachine_specific_config(VP9_COMP *cpi);
+
+extern void vp9_deblock_frame(YV12_BUFFER_CONFIG *source,
+ YV12_BUFFER_CONFIG *post,
+ int filt_lvl, int low_var_thresh, int flag);
+
+extern void print_tree_update_probs();
+
+#if HAVE_ARMV7
+extern void vp8_yv12_copy_frame_func_neon(YV12_BUFFER_CONFIG *src_ybc,
+ YV12_BUFFER_CONFIG *dst_ybc);
+
+extern void vp8_yv12_copy_src_frame_func_neon(YV12_BUFFER_CONFIG *src_ybc,
+ YV12_BUFFER_CONFIG *dst_ybc);
+#endif
+
+int vp9_calc_ss_err(YV12_BUFFER_CONFIG *source, YV12_BUFFER_CONFIG *dest);
+
+extern void vp9_temporal_filter_prepare_c(VP9_COMP *cpi, int distance);
+
+static void set_default_lf_deltas(VP9_COMP *cpi);
+
+#define DEFAULT_INTERP_FILTER EIGHTTAP /* SWITCHABLE for better performance */
+#define SEARCH_BEST_FILTER 0 /* to search exhaustively for
+ best filter */
+#define RESET_FOREACH_FILTER 0 /* whether to reset the encoder state
+ before trying each new filter */
+#define SHARP_FILTER_QTHRESH 0 /* Q threshold for 8-tap sharp filter */
+
+#define ALTREF_HIGH_PRECISION_MV 1 /* whether to use high precision mv
+ for altref computation */
+#define HIGH_PRECISION_MV_QTHRESH 200 /* Q threshold for use of high precision
+ mv. Choose a very high value for
+ now so that HIGH_PRECISION is always
+ chosen */
+
+#if CONFIG_INTERNAL_STATS
+#include "math.h"
+
+extern double vp9_calc_ssim(YV12_BUFFER_CONFIG *source,
+ YV12_BUFFER_CONFIG *dest, int lumamask,
+ double *weight);
+
+
+extern double vp9_calc_ssimg(YV12_BUFFER_CONFIG *source,
+ YV12_BUFFER_CONFIG *dest, double *ssim_y,
+ double *ssim_u, double *ssim_v);
+
+
+#endif
+
+// #define OUTPUT_YUV_REC
+
+#ifdef OUTPUT_YUV_SRC
+FILE *yuv_file;
+#endif
+#ifdef OUTPUT_YUV_REC
+FILE *yuv_rec_file;
+#endif
+
+#if 0
+FILE *framepsnr;
+FILE *kf_list;
+FILE *keyfile;
+#endif
+
+#if 0
+extern int skip_true_count;
+extern int skip_false_count;
+#endif
+
+
+#ifdef ENTROPY_STATS
+extern int intra_mode_stats[VP9_BINTRAMODES][VP9_BINTRAMODES][VP9_BINTRAMODES];
+#endif
+
+#ifdef NMV_STATS
+extern void init_nmvstats();
+extern void print_nmvstats();
+#endif
+
+#ifdef SPEEDSTATS
+unsigned int frames_at_speed[16] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
+#endif
+
+#if defined(SECTIONBITS_OUTPUT)
+extern unsigned __int64 Sectionbits[500];
+#endif
+#ifdef MODE_STATS
+extern INT64 Sectionbits[500];
+extern unsigned int y_modes[VP9_YMODES];
+extern unsigned int i8x8_modes[VP9_I8X8_MODES];
+extern unsigned int uv_modes[VP9_UV_MODES];
+extern unsigned int uv_modes_y[VP9_YMODES][VP9_UV_MODES];
+extern unsigned int b_modes[B_MODE_COUNT];
+extern unsigned int inter_y_modes[MB_MODE_COUNT];
+extern unsigned int inter_uv_modes[VP9_UV_MODES];
+extern unsigned int inter_b_modes[B_MODE_COUNT];
+#endif
+
+extern void vp9_init_quantizer(VP9_COMP *cpi);
+
+static int base_skip_false_prob[QINDEX_RANGE][3];
+
+// Tables relating active max Q to active min Q
+static int kf_low_motion_minq[QINDEX_RANGE];
+static int kf_high_motion_minq[QINDEX_RANGE];
+static int gf_low_motion_minq[QINDEX_RANGE];
+static int gf_high_motion_minq[QINDEX_RANGE];
+static int inter_minq[QINDEX_RANGE];
+
+// Functions to compute the active minq lookup table entries based on a
+// formulaic approach to facilitate easier adjustment of the Q tables.
+// The formulae were derived from computing a 3rd order polynomial best
+// fit to the original data (after plotting real maxq vs minq (not q index))
+static int calculate_minq_index(double maxq,
+ double x3, double x2, double x, double c) {
+ int i;
+ double minqtarget;
+ double thisq;
+
+ minqtarget = ((x3 * maxq * maxq * maxq) +
+ (x2 * maxq * maxq) +
+ (x * maxq) +
+ c);
+
+ if (minqtarget > maxq)
+ minqtarget = maxq;
+
+ for (i = 0; i < QINDEX_RANGE; i++) {
+ thisq = vp9_convert_qindex_to_q(i);
+ if (minqtarget <= vp9_convert_qindex_to_q(i))
+ return i;
+ }
+ return QINDEX_RANGE - 1;
+}
+
+static void init_minq_luts(void) {
+ int i;
+ double maxq;
+
+ for (i = 0; i < QINDEX_RANGE; i++) {
+ maxq = vp9_convert_qindex_to_q(i);
+
+
+ kf_low_motion_minq[i] = calculate_minq_index(maxq,
+ 0.0000003,
+ -0.000015,
+ 0.074,
+ 0.0);
+ kf_high_motion_minq[i] = calculate_minq_index(maxq,
+ 0.0000004,
+ -0.000125,
+ 0.14,
+ 0.0);
+ gf_low_motion_minq[i] = calculate_minq_index(maxq,
+ 0.0000015,
+ -0.0009,
+ 0.33,
+ 0.0);
+ gf_high_motion_minq[i] = calculate_minq_index(maxq,
+ 0.0000021,
+ -0.00125,
+ 0.45,
+ 0.0);
+ inter_minq[i] = calculate_minq_index(maxq,
+ 0.00000271,
+ -0.00113,
+ 0.697,
+ 0.0);
+
+ }
+}
+
+static void init_base_skip_probs(void) {
+ int i;
+ double q;
+ int skip_prob, t;
+
+ for (i = 0; i < QINDEX_RANGE; i++) {
+ q = vp9_convert_qindex_to_q(i);
+
+    // Exponential decay calculation of baseline skip prob with clamping
+ // Based on crude best fit of old table.
+ t = (int)(564.25 * pow(2.71828, (-0.012 * q)));
+
+ skip_prob = t;
+ if (skip_prob < 1)
+ skip_prob = 1;
+ else if (skip_prob > 255)
+ skip_prob = 255;
+ base_skip_false_prob[i][1] = skip_prob;
+
+ skip_prob = t * 0.75;
+ if (skip_prob < 1)
+ skip_prob = 1;
+ else if (skip_prob > 255)
+ skip_prob = 255;
+ base_skip_false_prob[i][2] = skip_prob;
+
+ skip_prob = t * 1.25;
+ if (skip_prob < 1)
+ skip_prob = 1;
+ else if (skip_prob > 255)
+ skip_prob = 255;
+ base_skip_false_prob[i][0] = skip_prob;
+ }
+}
+
+static void update_base_skip_probs(VP9_COMP *cpi) {
+ VP9_COMMON *cm = &cpi->common;
+
+ if (cm->frame_type != KEY_FRAME) {
+ vp9_update_skip_probs(cpi);
+
+ if (cm->refresh_alt_ref_frame) {
+ int k;
+ for (k = 0; k < MBSKIP_CONTEXTS; ++k)
+ cpi->last_skip_false_probs[2][k] = cm->mbskip_pred_probs[k];
+ cpi->last_skip_probs_q[2] = cm->base_qindex;
+ } else if (cpi->common.refresh_golden_frame) {
+ int k;
+ for (k = 0; k < MBSKIP_CONTEXTS; ++k)
+ cpi->last_skip_false_probs[1][k] = cm->mbskip_pred_probs[k];
+ cpi->last_skip_probs_q[1] = cm->base_qindex;
+ } else {
+ int k;
+ for (k = 0; k < MBSKIP_CONTEXTS; ++k)
+ cpi->last_skip_false_probs[0][k] = cm->mbskip_pred_probs[k];
+ cpi->last_skip_probs_q[0] = cm->base_qindex;
+
+ // update the baseline table for the current q
+ for (k = 0; k < MBSKIP_CONTEXTS; ++k)
+ cpi->base_skip_false_prob[cm->base_qindex][k] =
+ cm->mbskip_pred_probs[k];
+ }
+ }
+
+}
+
+void vp9_initialize_enc() {
+ static int init_done = 0;
+
+ if (!init_done) {
+ vp8_scale_machine_specific_config();
+ vp9_initialize_common();
+ vp9_tokenize_initialize();
+ vp9_init_quant_tables();
+ vp9_init_me_luts();
+ init_minq_luts();
+ init_base_skip_probs();
+ init_done = 1;
+ }
+}
+#ifdef PACKET_TESTING
+extern FILE *vpxlogc;
+#endif
+
+static void setup_features(VP9_COMP *cpi) {
+ MACROBLOCKD *xd = &cpi->mb.e_mbd;
+
+ // Set up default state for MB feature flags
+
+ xd->segmentation_enabled = 0; // Default segmentation disabled
+
+ xd->update_mb_segmentation_map = 0;
+ xd->update_mb_segmentation_data = 0;
+ vpx_memset(xd->mb_segment_tree_probs, 255, sizeof(xd->mb_segment_tree_probs));
+
+ vp9_clearall_segfeatures(xd);
+
+ xd->mode_ref_lf_delta_enabled = 0;
+ xd->mode_ref_lf_delta_update = 0;
+ vpx_memset(xd->ref_lf_deltas, 0, sizeof(xd->ref_lf_deltas));
+ vpx_memset(xd->mode_lf_deltas, 0, sizeof(xd->mode_lf_deltas));
+ vpx_memset(xd->last_ref_lf_deltas, 0, sizeof(xd->ref_lf_deltas));
+ vpx_memset(xd->last_mode_lf_deltas, 0, sizeof(xd->mode_lf_deltas));
+
+ set_default_lf_deltas(cpi);
+
+}
+
+
+static void dealloc_compressor_data(VP9_COMP *cpi) {
+ vpx_free(cpi->tplist);
+ cpi->tplist = NULL;
+
+ // Delete last frame MV storage buffers
+ vpx_free(cpi->lfmv);
+ cpi->lfmv = 0;
+
+ vpx_free(cpi->lf_ref_frame_sign_bias);
+ cpi->lf_ref_frame_sign_bias = 0;
+
+ vpx_free(cpi->lf_ref_frame);
+ cpi->lf_ref_frame = 0;
+
+  // Delete segmentation map
+ vpx_free(cpi->segmentation_map);
+ cpi->segmentation_map = 0;
+ vpx_free(cpi->common.last_frame_seg_map);
+ cpi->common.last_frame_seg_map = 0;
+ vpx_free(cpi->coding_context.last_frame_seg_map_copy);
+ cpi->coding_context.last_frame_seg_map_copy = 0;
+
+ vpx_free(cpi->active_map);
+ cpi->active_map = 0;
+
+ vp9_de_alloc_frame_buffers(&cpi->common);
+
+ vp8_yv12_de_alloc_frame_buffer(&cpi->last_frame_uf);
+ vp8_yv12_de_alloc_frame_buffer(&cpi->scaled_source);
+#if VP9_TEMPORAL_ALT_REF
+ vp8_yv12_de_alloc_frame_buffer(&cpi->alt_ref_buffer);
+#endif
+ vp9_lookahead_destroy(cpi->lookahead);
+
+ vpx_free(cpi->tok);
+ cpi->tok = 0;
+
+ // Structure used to monitor GF usage
+ vpx_free(cpi->gf_active_flags);
+ cpi->gf_active_flags = 0;
+
+ // Activity mask based per mb zbin adjustments
+ vpx_free(cpi->mb_activity_map);
+ cpi->mb_activity_map = 0;
+ vpx_free(cpi->mb_norm_activity_map);
+ cpi->mb_norm_activity_map = 0;
+
+ vpx_free(cpi->mb.pip);
+ cpi->mb.pip = 0;
+
+ vpx_free(cpi->twopass.total_stats);
+ cpi->twopass.total_stats = 0;
+
+ vpx_free(cpi->twopass.total_left_stats);
+ cpi->twopass.total_left_stats = 0;
+
+ vpx_free(cpi->twopass.this_frame_stats);
+ cpi->twopass.this_frame_stats = 0;
+}
+
+// Computes a q delta (in "q index" terms) to get from a starting q value
+// to a target q value
+static int compute_qdelta(VP9_COMP *cpi, double qstart, double qtarget) {
+ int i;
+ int start_index = cpi->worst_quality;
+ int target_index = cpi->worst_quality;
+
+ // Convert the average q value to an index.
+ for (i = cpi->best_quality; i < cpi->worst_quality; i++) {
+ start_index = i;
+ if (vp9_convert_qindex_to_q(i) >= qstart)
+ break;
+ }
+
+ // Convert the q target to an index
+ for (i = cpi->best_quality; i < cpi->worst_quality; i++) {
+ target_index = i;
+ if (vp9_convert_qindex_to_q(i) >= qtarget)
+ break;
+ }
+
+ return target_index - start_index;
+}
+
+static void init_seg_features(VP9_COMP *cpi) {
+ VP9_COMMON *cm = &cpi->common;
+ MACROBLOCKD *xd = &cpi->mb.e_mbd;
+
+ int high_q = (int)(cpi->avg_q > 48.0);
+ int qi_delta;
+
+ // Disable and clear down for KF
+ if (cm->frame_type == KEY_FRAME) {
+ // Clear down the global segmentation map
+ vpx_memset(cpi->segmentation_map, 0, (cm->mb_rows * cm->mb_cols));
+ xd->update_mb_segmentation_map = 0;
+ xd->update_mb_segmentation_data = 0;
+ cpi->static_mb_pct = 0;
+
+ // Disable segmentation
+ vp9_disable_segmentation((VP9_PTR)cpi);
+
+ // Clear down the segment features.
+ vp9_clearall_segfeatures(xd);
+ }
+
+ // If this is an alt ref frame
+ else if (cm->refresh_alt_ref_frame) {
+ // Clear down the global segmentation map
+ vpx_memset(cpi->segmentation_map, 0, (cm->mb_rows * cm->mb_cols));
+ xd->update_mb_segmentation_map = 0;
+ xd->update_mb_segmentation_data = 0;
+ cpi->static_mb_pct = 0;
+
+ // Disable segmentation and individual segment features by default
+ vp9_disable_segmentation((VP9_PTR)cpi);
+ vp9_clearall_segfeatures(xd);
+
+ // Scan frames from current to arf frame.
+ // This function re-enables segmentation if appropriate.
+ vp9_update_mbgraph_stats(cpi);
+
+ // If segmentation was enabled set those features needed for the
+ // arf itself.
+ if (xd->segmentation_enabled) {
+ xd->update_mb_segmentation_map = 1;
+ xd->update_mb_segmentation_data = 1;
+
+ qi_delta = compute_qdelta(cpi, cpi->avg_q, (cpi->avg_q * 0.875));
+ vp9_set_segdata(xd, 1, SEG_LVL_ALT_Q, (qi_delta - 2));
+ vp9_set_segdata(xd, 1, SEG_LVL_ALT_LF, -2);
+
+ vp9_enable_segfeature(xd, 1, SEG_LVL_ALT_Q);
+ vp9_enable_segfeature(xd, 1, SEG_LVL_ALT_LF);
+
+ // Where relevant assume segment data is delta data
+ xd->mb_segment_abs_delta = SEGMENT_DELTADATA;
+
+ }
+ }
+ // All other frames if segmentation has been enabled
+ else if (xd->segmentation_enabled) {
+ // First normal frame in a valid gf or alt ref group
+ if (cpi->common.frames_since_golden == 0) {
+ // Set up segment features for normal frames in an af group
+ if (cpi->source_alt_ref_active) {
+ xd->update_mb_segmentation_map = 0;
+ xd->update_mb_segmentation_data = 1;
+ xd->mb_segment_abs_delta = SEGMENT_DELTADATA;
+
+ qi_delta = compute_qdelta(cpi, cpi->avg_q,
+ (cpi->avg_q * 1.125));
+ vp9_set_segdata(xd, 1, SEG_LVL_ALT_Q, (qi_delta + 2));
+ vp9_set_segdata(xd, 1, SEG_LVL_ALT_Q, 0);
+ vp9_enable_segfeature(xd, 1, SEG_LVL_ALT_Q);
+
+ vp9_set_segdata(xd, 1, SEG_LVL_ALT_LF, -2);
+ vp9_enable_segfeature(xd, 1, SEG_LVL_ALT_LF);
+
+ // Segment coding disabled for compred testing
+ if (high_q || (cpi->static_mb_pct == 100)) {
+ // set_segref(xd, 1, LAST_FRAME);
+ vp9_set_segref(xd, 1, ALTREF_FRAME);
+ vp9_enable_segfeature(xd, 1, SEG_LVL_REF_FRAME);
+
+ vp9_set_segdata(xd, 1, SEG_LVL_MODE, ZEROMV);
+ vp9_enable_segfeature(xd, 1, SEG_LVL_MODE);
+
+ // EOB segment coding not fixed for 8x8 yet
+ vp9_set_segdata(xd, 1, SEG_LVL_EOB, 0);
+ vp9_enable_segfeature(xd, 1, SEG_LVL_EOB);
+ }
+ }
+ // Disable segmentation and clear down features if alt ref
+ // is not active for this group
+ else {
+ vp9_disable_segmentation((VP9_PTR)cpi);
+
+ vpx_memset(cpi->segmentation_map, 0,
+ (cm->mb_rows * cm->mb_cols));
+
+ xd->update_mb_segmentation_map = 0;
+ xd->update_mb_segmentation_data = 0;
+
+ vp9_clearall_segfeatures(xd);
+ }
+ }
+
+ // Special case where we are coding over the top of a previous
+ // alt ref frame
+ // Segment coding disabled for compred testing
+ else if (cpi->is_src_frame_alt_ref) {
+ // Enable mode and ref frame features for segment 0 as well
+ vp9_enable_segfeature(xd, 0, SEG_LVL_REF_FRAME);
+ vp9_enable_segfeature(xd, 0, SEG_LVL_MODE);
+ vp9_enable_segfeature(xd, 1, SEG_LVL_REF_FRAME);
+ vp9_enable_segfeature(xd, 1, SEG_LVL_MODE);
+
+ // All mbs should use ALTREF_FRAME, ZEROMV exclusively
+ vp9_clear_segref(xd, 0);
+ vp9_set_segref(xd, 0, ALTREF_FRAME);
+ vp9_clear_segref(xd, 1);
+ vp9_set_segref(xd, 1, ALTREF_FRAME);
+ vp9_set_segdata(xd, 0, SEG_LVL_MODE, ZEROMV);
+ vp9_set_segdata(xd, 1, SEG_LVL_MODE, ZEROMV);
+
+ // Skip all MBs if high Q
+ if (high_q) {
+ vp9_enable_segfeature(xd, 0, SEG_LVL_EOB);
+ vp9_set_segdata(xd, 0, SEG_LVL_EOB, 0);
+ vp9_enable_segfeature(xd, 1, SEG_LVL_EOB);
+ vp9_set_segdata(xd, 1, SEG_LVL_EOB, 0);
+ }
+      // Enable data update
+ xd->update_mb_segmentation_data = 1;
+ }
+ // All other frames.
+ else {
+ // No updates.. leave things as they are.
+ xd->update_mb_segmentation_map = 0;
+ xd->update_mb_segmentation_data = 0;
+ }
+ }
+}
+
+// DEBUG: Print out the segment id of each MB in the current frame.
+static void print_seg_map(VP9_COMP *cpi) {
+ VP9_COMMON *cm = &cpi->common;
+ int row, col;
+ int map_index = 0;
+ FILE *statsfile;
+
+ statsfile = fopen("segmap.stt", "a");
+
+ fprintf(statsfile, "%10d\n",
+ cm->current_video_frame);
+
+ for (row = 0; row < cpi->common.mb_rows; row++) {
+ for (col = 0; col < cpi->common.mb_cols; col++) {
+ fprintf(statsfile, "%10d",
+ cpi->segmentation_map[map_index]);
+ map_index++;
+ }
+ fprintf(statsfile, "\n");
+ }
+ fprintf(statsfile, "\n");
+
+ fclose(statsfile);
+}
+
+static void update_reference_segmentation_map(VP9_COMP *cpi) {
+ VP9_COMMON *cm = &cpi->common;
+ int row, col, sb_rows = (cm->mb_rows + 1) >> 1, sb_cols = (cm->mb_cols + 1) >> 1;
+ MODE_INFO *mi = cm->mi;
+ uint8_t *segmap = cpi->segmentation_map;
+ uint8_t *segcache = cm->last_frame_seg_map;
+
+ for (row = 0; row < sb_rows; row++) {
+ for (col = 0; col < sb_cols; col++) {
+ MODE_INFO *miptr = mi + col * 2;
+ uint8_t *cache = segcache + col * 2;
+#if CONFIG_SUPERBLOCKS
+ if (miptr->mbmi.encoded_as_sb) {
+ cache[0] = miptr->mbmi.segment_id;
+ if (!(cm->mb_cols & 1) || col < sb_cols - 1)
+ cache[1] = miptr->mbmi.segment_id;
+ if (!(cm->mb_rows & 1) || row < sb_rows - 1) {
+ cache[cm->mb_cols] = miptr->mbmi.segment_id;
+ if (!(cm->mb_cols & 1) || col < sb_cols - 1)
+ cache[cm->mb_cols + 1] = miptr->mbmi.segment_id;
+ }
+ } else
+#endif
+ {
+ cache[0] = miptr[0].mbmi.segment_id;
+ if (!(cm->mb_cols & 1) || col < sb_cols - 1)
+ cache[1] = miptr[1].mbmi.segment_id;
+ if (!(cm->mb_rows & 1) || row < sb_rows - 1) {
+ cache[cm->mb_cols] = miptr[cm->mode_info_stride].mbmi.segment_id;
+ if (!(cm->mb_cols & 1) || col < sb_cols - 1)
+ cache[1] = miptr[1].mbmi.segment_id;
+ cache[cm->mb_cols + 1] = miptr[cm->mode_info_stride + 1].mbmi.segment_id;
+ }
+ }
+ }
+ segmap += 2 * cm->mb_cols;
+ segcache += 2 * cm->mb_cols;
+ mi += 2 * cm->mode_info_stride;
+ }
+}
+
+static void set_default_lf_deltas(VP9_COMP *cpi) {
+ cpi->mb.e_mbd.mode_ref_lf_delta_enabled = 1;
+ cpi->mb.e_mbd.mode_ref_lf_delta_update = 1;
+
+ vpx_memset(cpi->mb.e_mbd.ref_lf_deltas, 0, sizeof(cpi->mb.e_mbd.ref_lf_deltas));
+ vpx_memset(cpi->mb.e_mbd.mode_lf_deltas, 0, sizeof(cpi->mb.e_mbd.mode_lf_deltas));
+
+ // Test of ref frame deltas
+ cpi->mb.e_mbd.ref_lf_deltas[INTRA_FRAME] = 2;
+ cpi->mb.e_mbd.ref_lf_deltas[LAST_FRAME] = 0;
+ cpi->mb.e_mbd.ref_lf_deltas[GOLDEN_FRAME] = -2;
+ cpi->mb.e_mbd.ref_lf_deltas[ALTREF_FRAME] = -2;
+
+ cpi->mb.e_mbd.mode_lf_deltas[0] = 4; // BPRED
+ cpi->mb.e_mbd.mode_lf_deltas[1] = -2; // Zero
+ cpi->mb.e_mbd.mode_lf_deltas[2] = 2; // New mv
+ cpi->mb.e_mbd.mode_lf_deltas[3] = 4; // Split mv
+}
+
+void vp9_set_speed_features(VP9_COMP *cpi) {
+ SPEED_FEATURES *sf = &cpi->sf;
+ int Mode = cpi->compressor_speed;
+ int Speed = cpi->Speed;
+ int i;
+ VP9_COMMON *cm = &cpi->common;
+
+  // Only modes 0 and 1 supported for now in experimental code base
+ if (Mode > 1)
+ Mode = 1;
+
+ // Initialise default mode frequency sampling variables
+ for (i = 0; i < MAX_MODES; i ++) {
+ cpi->mode_check_freq[i] = 0;
+ cpi->mode_test_hit_counts[i] = 0;
+ cpi->mode_chosen_counts[i] = 0;
+ }
+
+ // best quality defaults
+ sf->RD = 1;
+ sf->search_method = NSTEP;
+ sf->improved_dct = 1;
+ sf->auto_filter = 1;
+ sf->recode_loop = 1;
+ sf->quarter_pixel_search = 1;
+ sf->half_pixel_search = 1;
+ sf->iterative_sub_pixel = 1;
+#if CONFIG_LOSSLESS
+ sf->optimize_coefficients = 0;
+#else
+ sf->optimize_coefficients = 1;
+#endif
+ sf->no_skip_block4x4_search = 1;
+
+ sf->first_step = 0;
+ sf->max_step_search_steps = MAX_MVSEARCH_STEPS;
+ sf->improved_mv_pred = 1;
+
+ // default thresholds to 0
+ for (i = 0; i < MAX_MODES; i++)
+ sf->thresh_mult[i] = 0;
+
+ switch (Mode) {
+ case 0: // best quality mode
+#if CONFIG_PRED_FILTER
+ sf->thresh_mult[THR_ZEROMV ] = 0;
+ sf->thresh_mult[THR_ZEROMV_FILT ] = 0;
+ sf->thresh_mult[THR_ZEROG ] = 0;
+ sf->thresh_mult[THR_ZEROG_FILT ] = 0;
+ sf->thresh_mult[THR_ZEROA ] = 0;
+ sf->thresh_mult[THR_ZEROA_FILT ] = 0;
+ sf->thresh_mult[THR_NEARESTMV ] = 0;
+ sf->thresh_mult[THR_NEARESTMV_FILT] = 0;
+ sf->thresh_mult[THR_NEARESTG ] = 0;
+ sf->thresh_mult[THR_NEARESTG_FILT ] = 0;
+ sf->thresh_mult[THR_NEARESTA ] = 0;
+ sf->thresh_mult[THR_NEARESTA_FILT ] = 0;
+ sf->thresh_mult[THR_NEARMV ] = 0;
+ sf->thresh_mult[THR_NEARMV_FILT ] = 0;
+ sf->thresh_mult[THR_NEARG ] = 0;
+ sf->thresh_mult[THR_NEARG_FILT ] = 0;
+ sf->thresh_mult[THR_NEARA ] = 0;
+ sf->thresh_mult[THR_NEARA_FILT ] = 0;
+
+ sf->thresh_mult[THR_DC ] = 0;
+
+ sf->thresh_mult[THR_V_PRED ] = 1000;
+ sf->thresh_mult[THR_H_PRED ] = 1000;
+ sf->thresh_mult[THR_D45_PRED ] = 1000;
+ sf->thresh_mult[THR_D135_PRED] = 1000;
+ sf->thresh_mult[THR_D117_PRED] = 1000;
+ sf->thresh_mult[THR_D153_PRED] = 1000;
+ sf->thresh_mult[THR_D27_PRED ] = 1000;
+ sf->thresh_mult[THR_D63_PRED ] = 1000;
+ sf->thresh_mult[THR_B_PRED ] = 2000;
+ sf->thresh_mult[THR_I8X8_PRED] = 2000;
+ sf->thresh_mult[THR_TM ] = 1000;
+
+ sf->thresh_mult[THR_NEWMV ] = 1000;
+ sf->thresh_mult[THR_NEWG ] = 1000;
+ sf->thresh_mult[THR_NEWA ] = 1000;
+ sf->thresh_mult[THR_NEWMV_FILT ] = 1000;
+ sf->thresh_mult[THR_NEWG_FILT ] = 1000;
+ sf->thresh_mult[THR_NEWA_FILT ] = 1000;
+#else
+ sf->thresh_mult[THR_ZEROMV ] = 0;
+ sf->thresh_mult[THR_ZEROG ] = 0;
+ sf->thresh_mult[THR_ZEROA ] = 0;
+ sf->thresh_mult[THR_NEARESTMV] = 0;
+ sf->thresh_mult[THR_NEARESTG ] = 0;
+ sf->thresh_mult[THR_NEARESTA ] = 0;
+ sf->thresh_mult[THR_NEARMV ] = 0;
+ sf->thresh_mult[THR_NEARG ] = 0;
+ sf->thresh_mult[THR_NEARA ] = 0;
+
+ sf->thresh_mult[THR_DC ] = 0;
+
+ sf->thresh_mult[THR_V_PRED ] = 1000;
+ sf->thresh_mult[THR_H_PRED ] = 1000;
+ sf->thresh_mult[THR_D45_PRED ] = 1000;
+ sf->thresh_mult[THR_D135_PRED] = 1000;
+ sf->thresh_mult[THR_D117_PRED] = 1000;
+ sf->thresh_mult[THR_D153_PRED] = 1000;
+ sf->thresh_mult[THR_D27_PRED ] = 1000;
+ sf->thresh_mult[THR_D63_PRED ] = 1000;
+ sf->thresh_mult[THR_B_PRED ] = 2000;
+ sf->thresh_mult[THR_I8X8_PRED] = 2000;
+ sf->thresh_mult[THR_TM ] = 1000;
+
+ sf->thresh_mult[THR_NEWMV ] = 1000;
+ sf->thresh_mult[THR_NEWG ] = 1000;
+ sf->thresh_mult[THR_NEWA ] = 1000;
+#endif
+ sf->thresh_mult[THR_SPLITMV ] = 2500;
+ sf->thresh_mult[THR_SPLITG ] = 5000;
+ sf->thresh_mult[THR_SPLITA ] = 5000;
+
+ sf->thresh_mult[THR_COMP_ZEROLG ] = 0;
+ sf->thresh_mult[THR_COMP_NEARESTLG] = 0;
+ sf->thresh_mult[THR_COMP_NEARLG ] = 0;
+ sf->thresh_mult[THR_COMP_ZEROLA ] = 0;
+ sf->thresh_mult[THR_COMP_NEARESTLA] = 0;
+ sf->thresh_mult[THR_COMP_NEARLA ] = 0;
+ sf->thresh_mult[THR_COMP_ZEROGA ] = 0;
+ sf->thresh_mult[THR_COMP_NEARESTGA] = 0;
+ sf->thresh_mult[THR_COMP_NEARGA ] = 0;
+
+ sf->thresh_mult[THR_COMP_NEWLG ] = 1000;
+ sf->thresh_mult[THR_COMP_NEWLA ] = 1000;
+ sf->thresh_mult[THR_COMP_NEWGA ] = 1000;
+
+ sf->thresh_mult[THR_COMP_SPLITLA ] = 2500;
+ sf->thresh_mult[THR_COMP_SPLITGA ] = 5000;
+ sf->thresh_mult[THR_COMP_SPLITLG ] = 5000;
+
+ sf->first_step = 0;
+ sf->max_step_search_steps = MAX_MVSEARCH_STEPS;
+ sf->search_best_filter = SEARCH_BEST_FILTER;
+ break;
+ case 1:
+#if CONFIG_PRED_FILTER
+ sf->thresh_mult[THR_NEARESTMV] = 0;
+ sf->thresh_mult[THR_NEARESTMV_FILT] = 0;
+ sf->thresh_mult[THR_ZEROMV ] = 0;
+ sf->thresh_mult[THR_ZEROMV_FILT ] = 0;
+ sf->thresh_mult[THR_DC ] = 0;
+ sf->thresh_mult[THR_NEARMV ] = 0;
+ sf->thresh_mult[THR_NEARMV_FILT ] = 0;
+ sf->thresh_mult[THR_V_PRED ] = 1000;
+ sf->thresh_mult[THR_H_PRED ] = 1000;
+ sf->thresh_mult[THR_D45_PRED ] = 1000;
+ sf->thresh_mult[THR_D135_PRED] = 1000;
+ sf->thresh_mult[THR_D117_PRED] = 1000;
+ sf->thresh_mult[THR_D153_PRED] = 1000;
+ sf->thresh_mult[THR_D27_PRED ] = 1000;
+ sf->thresh_mult[THR_D63_PRED ] = 1000;
+ sf->thresh_mult[THR_B_PRED ] = 2500;
+ sf->thresh_mult[THR_I8X8_PRED] = 2500;
+ sf->thresh_mult[THR_TM ] = 1000;
+
+ sf->thresh_mult[THR_NEARESTG ] = 1000;
+ sf->thresh_mult[THR_NEARESTG_FILT ] = 1000;
+ sf->thresh_mult[THR_NEARESTA ] = 1000;
+ sf->thresh_mult[THR_NEARESTA_FILT ] = 1000;
+
+ sf->thresh_mult[THR_ZEROG ] = 1000;
+ sf->thresh_mult[THR_ZEROA ] = 1000;
+ sf->thresh_mult[THR_NEARG ] = 1000;
+ sf->thresh_mult[THR_NEARA ] = 1000;
+ sf->thresh_mult[THR_ZEROG_FILT ] = 1000;
+ sf->thresh_mult[THR_ZEROA_FILT ] = 1000;
+ sf->thresh_mult[THR_NEARG_FILT ] = 1000;
+ sf->thresh_mult[THR_NEARA_FILT ] = 1000;
+
+ sf->thresh_mult[THR_ZEROMV ] = 0;
+ sf->thresh_mult[THR_ZEROG ] = 0;
+ sf->thresh_mult[THR_ZEROA ] = 0;
+ sf->thresh_mult[THR_NEARESTMV] = 0;
+ sf->thresh_mult[THR_NEARESTG ] = 0;
+ sf->thresh_mult[THR_NEARESTA ] = 0;
+ sf->thresh_mult[THR_NEARMV ] = 0;
+ sf->thresh_mult[THR_NEARG ] = 0;
+ sf->thresh_mult[THR_NEARA ] = 0;
+ sf->thresh_mult[THR_ZEROMV_FILT ] = 0;
+ sf->thresh_mult[THR_ZEROG_FILT ] = 0;
+ sf->thresh_mult[THR_ZEROA_FILT ] = 0;
+ sf->thresh_mult[THR_NEARESTMV_FILT] = 0;
+ sf->thresh_mult[THR_NEARESTG_FILT ] = 0;
+ sf->thresh_mult[THR_NEARESTA_FILT ] = 0;
+ sf->thresh_mult[THR_NEARMV_FILT ] = 0;
+ sf->thresh_mult[THR_NEARG_FILT ] = 0;
+ sf->thresh_mult[THR_NEARA_FILT ] = 0;
+
+ sf->thresh_mult[THR_NEWMV ] = 1000;
+ sf->thresh_mult[THR_NEWG ] = 1000;
+ sf->thresh_mult[THR_NEWA ] = 1000;
+ sf->thresh_mult[THR_NEWMV_FILT ] = 1000;
+ sf->thresh_mult[THR_NEWG_FILT ] = 1000;
+ sf->thresh_mult[THR_NEWA_FILT ] = 1000;
+#else
+ sf->thresh_mult[THR_NEARESTMV] = 0;
+ sf->thresh_mult[THR_ZEROMV ] = 0;
+ sf->thresh_mult[THR_DC ] = 0;
+ sf->thresh_mult[THR_NEARMV ] = 0;
+ sf->thresh_mult[THR_V_PRED ] = 1000;
+ sf->thresh_mult[THR_H_PRED ] = 1000;
+ sf->thresh_mult[THR_D45_PRED ] = 1000;
+ sf->thresh_mult[THR_D135_PRED] = 1000;
+ sf->thresh_mult[THR_D117_PRED] = 1000;
+ sf->thresh_mult[THR_D153_PRED] = 1000;
+ sf->thresh_mult[THR_D27_PRED ] = 1000;
+ sf->thresh_mult[THR_D63_PRED ] = 1000;
+ sf->thresh_mult[THR_B_PRED ] = 2500;
+ sf->thresh_mult[THR_I8X8_PRED] = 2500;
+ sf->thresh_mult[THR_TM ] = 1000;
+
+ sf->thresh_mult[THR_NEARESTG ] = 1000;
+ sf->thresh_mult[THR_NEARESTA ] = 1000;
+
+ sf->thresh_mult[THR_ZEROG ] = 1000;
+ sf->thresh_mult[THR_ZEROA ] = 1000;
+ sf->thresh_mult[THR_NEARG ] = 1000;
+ sf->thresh_mult[THR_NEARA ] = 1000;
+
+ sf->thresh_mult[THR_ZEROMV ] = 0;
+ sf->thresh_mult[THR_ZEROG ] = 0;
+ sf->thresh_mult[THR_ZEROA ] = 0;
+ sf->thresh_mult[THR_NEARESTMV] = 0;
+ sf->thresh_mult[THR_NEARESTG ] = 0;
+ sf->thresh_mult[THR_NEARESTA ] = 0;
+ sf->thresh_mult[THR_NEARMV ] = 0;
+ sf->thresh_mult[THR_NEARG ] = 0;
+ sf->thresh_mult[THR_NEARA ] = 0;
+
+ sf->thresh_mult[THR_NEWMV ] = 1000;
+ sf->thresh_mult[THR_NEWG ] = 1000;
+ sf->thresh_mult[THR_NEWA ] = 1000;
+#endif
+ sf->thresh_mult[THR_SPLITMV ] = 1700;
+ sf->thresh_mult[THR_SPLITG ] = 4500;
+ sf->thresh_mult[THR_SPLITA ] = 4500;
+
+ sf->thresh_mult[THR_COMP_ZEROLG ] = 0;
+ sf->thresh_mult[THR_COMP_NEARESTLG] = 0;
+ sf->thresh_mult[THR_COMP_NEARLG ] = 0;
+ sf->thresh_mult[THR_COMP_ZEROLA ] = 0;
+ sf->thresh_mult[THR_COMP_NEARESTLA] = 0;
+ sf->thresh_mult[THR_COMP_NEARLA ] = 0;
+ sf->thresh_mult[THR_COMP_ZEROGA ] = 0;
+ sf->thresh_mult[THR_COMP_NEARESTGA] = 0;
+ sf->thresh_mult[THR_COMP_NEARGA ] = 0;
+
+ sf->thresh_mult[THR_COMP_NEWLG ] = 1000;
+ sf->thresh_mult[THR_COMP_NEWLA ] = 1000;
+ sf->thresh_mult[THR_COMP_NEWGA ] = 1000;
+
+ sf->thresh_mult[THR_COMP_SPLITLA ] = 1700;
+ sf->thresh_mult[THR_COMP_SPLITGA ] = 4500;
+ sf->thresh_mult[THR_COMP_SPLITLG ] = 4500;
+
+ if (Speed > 0) {
+ /* Disable coefficient optimization above speed 0 */
+ sf->optimize_coefficients = 0;
+ sf->no_skip_block4x4_search = 0;
+
+ sf->first_step = 1;
+
+ cpi->mode_check_freq[THR_SPLITG] = 2;
+ cpi->mode_check_freq[THR_SPLITA] = 2;
+ cpi->mode_check_freq[THR_SPLITMV] = 0;
+
+ cpi->mode_check_freq[THR_COMP_SPLITGA] = 2;
+ cpi->mode_check_freq[THR_COMP_SPLITLG] = 2;
+ cpi->mode_check_freq[THR_COMP_SPLITLA] = 0;
+ }
+
+ if (Speed > 1) {
+ cpi->mode_check_freq[THR_SPLITG] = 4;
+ cpi->mode_check_freq[THR_SPLITA] = 4;
+ cpi->mode_check_freq[THR_SPLITMV] = 2;
+
+ cpi->mode_check_freq[THR_COMP_SPLITGA] = 4;
+ cpi->mode_check_freq[THR_COMP_SPLITLG] = 4;
+ cpi->mode_check_freq[THR_COMP_SPLITLA] = 2;
+
+ sf->thresh_mult[THR_TM ] = 1500;
+ sf->thresh_mult[THR_V_PRED ] = 1500;
+ sf->thresh_mult[THR_H_PRED ] = 1500;
+ sf->thresh_mult[THR_D45_PRED ] = 1500;
+ sf->thresh_mult[THR_D135_PRED] = 1500;
+ sf->thresh_mult[THR_D117_PRED] = 1500;
+ sf->thresh_mult[THR_D153_PRED] = 1500;
+ sf->thresh_mult[THR_D27_PRED ] = 1500;
+ sf->thresh_mult[THR_D63_PRED ] = 1500;
+ sf->thresh_mult[THR_B_PRED ] = 5000;
+ sf->thresh_mult[THR_I8X8_PRED] = 5000;
+
+ if (cpi->ref_frame_flags & VP9_LAST_FLAG) {
+ sf->thresh_mult[THR_NEWMV ] = 2000;
+#if CONFIG_PRED_FILTER
+ sf->thresh_mult[THR_NEWMV_FILT ] = 2000;
+#endif
+ sf->thresh_mult[THR_SPLITMV ] = 10000;
+ sf->thresh_mult[THR_COMP_SPLITLG ] = 20000;
+ }
+
+ if (cpi->ref_frame_flags & VP9_GOLD_FLAG) {
+ sf->thresh_mult[THR_NEARESTG ] = 1500;
+ sf->thresh_mult[THR_ZEROG ] = 1500;
+ sf->thresh_mult[THR_NEARG ] = 1500;
+ sf->thresh_mult[THR_NEWG ] = 2000;
+#if CONFIG_PRED_FILTER
+ sf->thresh_mult[THR_NEARESTG_FILT ] = 1500;
+ sf->thresh_mult[THR_ZEROG_FILT ] = 1500;
+ sf->thresh_mult[THR_NEARG_FILT ] = 1500;
+ sf->thresh_mult[THR_NEWG_FILT ] = 2000;
+#endif
+ sf->thresh_mult[THR_SPLITG ] = 20000;
+ sf->thresh_mult[THR_COMP_SPLITGA ] = 20000;
+ }
+
+ if (cpi->ref_frame_flags & VP9_ALT_FLAG) {
+ sf->thresh_mult[THR_NEARESTA ] = 1500;
+ sf->thresh_mult[THR_ZEROA ] = 1500;
+ sf->thresh_mult[THR_NEARA ] = 1500;
+ sf->thresh_mult[THR_NEWA ] = 2000;
+#if CONFIG_PRED_FILTER
+ sf->thresh_mult[THR_NEARESTA_FILT ] = 1500;
+ sf->thresh_mult[THR_ZEROA_FILT ] = 1500;
+ sf->thresh_mult[THR_NEARA_FILT ] = 1500;
+ sf->thresh_mult[THR_NEWA_FILT ] = 2000;
+#endif
+ sf->thresh_mult[THR_SPLITA ] = 20000;
+ sf->thresh_mult[THR_COMP_SPLITLA ] = 10000;
+ }
+
+ sf->thresh_mult[THR_COMP_ZEROLG ] = 1500;
+ sf->thresh_mult[THR_COMP_NEARESTLG] = 1500;
+ sf->thresh_mult[THR_COMP_NEARLG ] = 1500;
+ sf->thresh_mult[THR_COMP_ZEROLA ] = 1500;
+ sf->thresh_mult[THR_COMP_NEARESTLA] = 1500;
+ sf->thresh_mult[THR_COMP_NEARLA ] = 1500;
+ sf->thresh_mult[THR_COMP_ZEROGA ] = 1500;
+ sf->thresh_mult[THR_COMP_NEARESTGA] = 1500;
+ sf->thresh_mult[THR_COMP_NEARGA ] = 1500;
+
+ sf->thresh_mult[THR_COMP_NEWLG ] = 2000;
+ sf->thresh_mult[THR_COMP_NEWLA ] = 2000;
+ sf->thresh_mult[THR_COMP_NEWGA ] = 2000;
+ }
+
+ if (Speed > 2) {
+ cpi->mode_check_freq[THR_SPLITG] = 15;
+ cpi->mode_check_freq[THR_SPLITA] = 15;
+ cpi->mode_check_freq[THR_SPLITMV] = 7;
+
+ cpi->mode_check_freq[THR_COMP_SPLITGA] = 15;
+ cpi->mode_check_freq[THR_COMP_SPLITLG] = 15;
+ cpi->mode_check_freq[THR_COMP_SPLITLA] = 7;
+
+ sf->thresh_mult[THR_TM ] = 2000;
+ sf->thresh_mult[THR_V_PRED ] = 2000;
+ sf->thresh_mult[THR_H_PRED ] = 2000;
+ sf->thresh_mult[THR_D45_PRED ] = 2000;
+ sf->thresh_mult[THR_D135_PRED] = 2000;
+ sf->thresh_mult[THR_D117_PRED] = 2000;
+ sf->thresh_mult[THR_D153_PRED] = 2000;
+ sf->thresh_mult[THR_D27_PRED ] = 2000;
+ sf->thresh_mult[THR_D63_PRED ] = 2000;
+ sf->thresh_mult[THR_B_PRED ] = 7500;
+ sf->thresh_mult[THR_I8X8_PRED] = 7500;
+
+ if (cpi->ref_frame_flags & VP9_LAST_FLAG) {
+ sf->thresh_mult[THR_NEWMV ] = 2000;
+#if CONFIG_PRED_FILTER
+ sf->thresh_mult[THR_NEWMV_FILT ] = 2000;
+#endif
+ sf->thresh_mult[THR_SPLITMV ] = 25000;
+ sf->thresh_mult[THR_COMP_SPLITLG ] = 50000;
+ }
+
+ if (cpi->ref_frame_flags & VP9_GOLD_FLAG) {
+ sf->thresh_mult[THR_NEARESTG ] = 2000;
+ sf->thresh_mult[THR_ZEROG ] = 2000;
+ sf->thresh_mult[THR_NEARG ] = 2000;
+ sf->thresh_mult[THR_NEWG ] = 2500;
+#if CONFIG_PRED_FILTER
+ sf->thresh_mult[THR_NEARESTG_FILT ] = 2000;
+ sf->thresh_mult[THR_ZEROG_FILT ] = 2000;
+ sf->thresh_mult[THR_NEARG_FILT ] = 2000;
+ sf->thresh_mult[THR_NEWG_FILT ] = 2500;
+#endif
+ sf->thresh_mult[THR_SPLITG ] = 50000;
+ sf->thresh_mult[THR_COMP_SPLITGA ] = 50000;
+ }
+
+ if (cpi->ref_frame_flags & VP9_ALT_FLAG) {
+ sf->thresh_mult[THR_NEARESTA ] = 2000;
+ sf->thresh_mult[THR_ZEROA ] = 2000;
+ sf->thresh_mult[THR_NEARA ] = 2000;
+ sf->thresh_mult[THR_NEWA ] = 2500;
+#if CONFIG_PRED_FILTER
+ sf->thresh_mult[THR_NEARESTA_FILT ] = 2000;
+ sf->thresh_mult[THR_ZEROA_FILT ] = 2000;
+ sf->thresh_mult[THR_NEARA_FILT ] = 2000;
+ sf->thresh_mult[THR_NEWA_FILT ] = 2500;
+#endif
+ sf->thresh_mult[THR_SPLITA ] = 50000;
+ sf->thresh_mult[THR_COMP_SPLITLA ] = 25000;
+ }
+
+ sf->thresh_mult[THR_COMP_ZEROLG ] = 2000;
+ sf->thresh_mult[THR_COMP_NEARESTLG] = 2000;
+ sf->thresh_mult[THR_COMP_NEARLG ] = 2000;
+ sf->thresh_mult[THR_COMP_ZEROLA ] = 2000;
+ sf->thresh_mult[THR_COMP_NEARESTLA] = 2000;
+ sf->thresh_mult[THR_COMP_NEARLA ] = 2000;
+ sf->thresh_mult[THR_COMP_ZEROGA ] = 2000;
+ sf->thresh_mult[THR_COMP_NEARESTGA] = 2000;
+ sf->thresh_mult[THR_COMP_NEARGA ] = 2000;
+
+ sf->thresh_mult[THR_COMP_NEWLG ] = 2500;
+ sf->thresh_mult[THR_COMP_NEWLA ] = 2500;
+ sf->thresh_mult[THR_COMP_NEWGA ] = 2500;
+
+ sf->improved_dct = 0;
+
+ // Only do recode loop on key frames, golden frames and
+ // alt ref frames
+ sf->recode_loop = 2;
+
+ }
+
+ break;
+
+  } /* switch */
+
+ /* disable frame modes if flags not set */
+ if (!(cpi->ref_frame_flags & VP9_LAST_FLAG)) {
+ sf->thresh_mult[THR_NEWMV ] = INT_MAX;
+ sf->thresh_mult[THR_NEARESTMV] = INT_MAX;
+ sf->thresh_mult[THR_ZEROMV ] = INT_MAX;
+ sf->thresh_mult[THR_NEARMV ] = INT_MAX;
+#if CONFIG_PRED_FILTER
+ sf->thresh_mult[THR_NEWMV_FILT ] = INT_MAX;
+ sf->thresh_mult[THR_NEARESTMV_FILT] = INT_MAX;
+ sf->thresh_mult[THR_ZEROMV_FILT ] = INT_MAX;
+ sf->thresh_mult[THR_NEARMV_FILT ] = INT_MAX;
+#endif
+ sf->thresh_mult[THR_SPLITMV ] = INT_MAX;
+ }
+
+ if (!(cpi->ref_frame_flags & VP9_GOLD_FLAG)) {
+ sf->thresh_mult[THR_NEARESTG ] = INT_MAX;
+ sf->thresh_mult[THR_ZEROG ] = INT_MAX;
+ sf->thresh_mult[THR_NEARG ] = INT_MAX;
+ sf->thresh_mult[THR_NEWG ] = INT_MAX;
+#if CONFIG_PRED_FILTER
+ sf->thresh_mult[THR_NEARESTG_FILT ] = INT_MAX;
+ sf->thresh_mult[THR_ZEROG_FILT ] = INT_MAX;
+ sf->thresh_mult[THR_NEARG_FILT ] = INT_MAX;
+ sf->thresh_mult[THR_NEWG_FILT ] = INT_MAX;
+#endif
+ sf->thresh_mult[THR_SPLITG ] = INT_MAX;
+ }
+
+ if (!(cpi->ref_frame_flags & VP9_ALT_FLAG)) {
+ sf->thresh_mult[THR_NEARESTA ] = INT_MAX;
+ sf->thresh_mult[THR_ZEROA ] = INT_MAX;
+ sf->thresh_mult[THR_NEARA ] = INT_MAX;
+ sf->thresh_mult[THR_NEWA ] = INT_MAX;
+#if CONFIG_PRED_FILTER
+ sf->thresh_mult[THR_NEARESTA_FILT ] = INT_MAX;
+ sf->thresh_mult[THR_ZEROA_FILT ] = INT_MAX;
+ sf->thresh_mult[THR_NEARA_FILT ] = INT_MAX;
+ sf->thresh_mult[THR_NEWA_FILT ] = INT_MAX;
+#endif
+ sf->thresh_mult[THR_SPLITA ] = INT_MAX;
+ }
+
+ if ((cpi->ref_frame_flags & (VP9_LAST_FLAG | VP9_GOLD_FLAG)) != (VP9_LAST_FLAG | VP9_GOLD_FLAG)) {
+ sf->thresh_mult[THR_COMP_ZEROLG ] = INT_MAX;
+ sf->thresh_mult[THR_COMP_NEARESTLG] = INT_MAX;
+ sf->thresh_mult[THR_COMP_NEARLG ] = INT_MAX;
+ sf->thresh_mult[THR_COMP_NEWLG ] = INT_MAX;
+ sf->thresh_mult[THR_COMP_SPLITLG ] = INT_MAX;
+ }
+
+ if ((cpi->ref_frame_flags & (VP9_LAST_FLAG | VP9_ALT_FLAG)) != (VP9_LAST_FLAG | VP9_ALT_FLAG)) {
+ sf->thresh_mult[THR_COMP_ZEROLA ] = INT_MAX;
+ sf->thresh_mult[THR_COMP_NEARESTLA] = INT_MAX;
+ sf->thresh_mult[THR_COMP_NEARLA ] = INT_MAX;
+ sf->thresh_mult[THR_COMP_NEWLA ] = INT_MAX;
+ sf->thresh_mult[THR_COMP_SPLITLA ] = INT_MAX;
+ }
+
+ if ((cpi->ref_frame_flags & (VP9_GOLD_FLAG | VP9_ALT_FLAG)) != (VP9_GOLD_FLAG | VP9_ALT_FLAG)) {
+ sf->thresh_mult[THR_COMP_ZEROGA ] = INT_MAX;
+ sf->thresh_mult[THR_COMP_NEARESTGA] = INT_MAX;
+ sf->thresh_mult[THR_COMP_NEARGA ] = INT_MAX;
+ sf->thresh_mult[THR_COMP_NEWGA ] = INT_MAX;
+ sf->thresh_mult[THR_COMP_SPLITGA ] = INT_MAX;
+ }
+
+ // Slow quant, dct and trellis not worthwhile for first pass
+ // so make sure they are always turned off.
+ if (cpi->pass == 1) {
+ sf->optimize_coefficients = 0;
+ sf->improved_dct = 0;
+ }
+
+ if (cpi->sf.search_method == NSTEP) {
+ vp9_init3smotion_compensation(&cpi->mb,
+ cm->yv12_fb[cm->lst_fb_idx].y_stride);
+ } else if (cpi->sf.search_method == DIAMOND) {
+ vp9_init_dsmotion_compensation(&cpi->mb,
+ cm->yv12_fb[cm->lst_fb_idx].y_stride);
+ }
+
+ cpi->mb.vp9_short_fdct16x16 = vp9_short_fdct16x16;
+ cpi->mb.vp9_short_fdct8x8 = vp9_short_fdct8x8;
+ cpi->mb.vp9_short_fdct8x4 = vp9_short_fdct8x4;
+ cpi->mb.vp9_short_fdct4x4 = vp9_short_fdct4x4;
+ cpi->mb.short_walsh4x4 = vp9_short_walsh4x4;
+ cpi->mb.short_fhaar2x2 = vp9_short_fhaar2x2;
+
+#if CONFIG_LOSSLESS
+ if (cpi->oxcf.lossless) {
+ cpi->mb.vp9_short_fdct8x4 = vp9_short_walsh8x4_x8;
+ cpi->mb.vp9_short_fdct4x4 = vp9_short_walsh4x4_x8;
+ cpi->mb.short_walsh4x4 = vp9_short_walsh4x4;
+ cpi->mb.short_fhaar2x2 = vp9_short_fhaar2x2;
+ cpi->mb.short_walsh4x4 = vp9_short_walsh4x4_lossless;
+ }
+#endif
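+
+// Note (illustrative): in lossless mode the DCT-based forward transforms
+// above are swapped for Walsh-Hadamard variants so the transform is
+// exactly invertible in integer arithmetic; the matching inverse transform
+// substitutions are made in vp9_change_config() under the same
+// CONFIG_LOSSLESS guard.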
+
+ cpi->mb.quantize_b_4x4 = vp9_regular_quantize_b_4x4;
+ cpi->mb.quantize_b_4x4_pair = vp9_regular_quantize_b_4x4_pair;
+ cpi->mb.quantize_b_8x8 = vp9_regular_quantize_b_8x8;
+ cpi->mb.quantize_b_16x16 = vp9_regular_quantize_b_16x16;
+ cpi->mb.quantize_b_2x2 = vp9_regular_quantize_b_2x2;
+
+ vp9_init_quantizer(cpi);
+
+#if CONFIG_RUNTIME_CPU_DETECT
+ cpi->mb.e_mbd.rtcd = &cpi->common.rtcd;
+#endif
+
+ if (cpi->sf.iterative_sub_pixel == 1) {
+ cpi->find_fractional_mv_step = vp9_find_best_sub_pixel_step_iteratively;
+ } else if (cpi->sf.quarter_pixel_search) {
+ cpi->find_fractional_mv_step = vp9_find_best_sub_pixel_step;
+ } else if (cpi->sf.half_pixel_search) {
+ cpi->find_fractional_mv_step = vp9_find_best_half_pixel_step;
+ }
+
+ if (cpi->sf.optimize_coefficients == 1 && cpi->pass != 1)
+ cpi->mb.optimize = 1;
+ else
+ cpi->mb.optimize = 0;
+
+#ifdef SPEEDSTATS
+ frames_at_speed[cpi->Speed]++;
+#endif
+}
+static void alloc_raw_frame_buffers(VP9_COMP *cpi) {
+ int width = (cpi->oxcf.Width + 15) & ~15;
+ int height = (cpi->oxcf.Height + 15) & ~15;
+
+ cpi->lookahead = vp9_lookahead_init(cpi->oxcf.Width, cpi->oxcf.Height,
+ cpi->oxcf.lag_in_frames);
+ if (!cpi->lookahead)
+ vpx_internal_error(&cpi->common.error, VPX_CODEC_MEM_ERROR,
+ "Failed to allocate lag buffers");
+
+#if VP9_TEMPORAL_ALT_REF
+
+ if (vp8_yv12_alloc_frame_buffer(&cpi->alt_ref_buffer,
+ width, height, VP8BORDERINPIXELS))
+ vpx_internal_error(&cpi->common.error, VPX_CODEC_MEM_ERROR,
+ "Failed to allocate altref buffer");
+
+#endif
+}
+
+static int alloc_partition_data(VP9_COMP *cpi) {
+ vpx_free(cpi->mb.pip);
+
+ cpi->mb.pip = vpx_calloc((cpi->common.mb_cols + 1) *
+ (cpi->common.mb_rows + 1),
+ sizeof(PARTITION_INFO));
+ if (!cpi->mb.pip)
+ return 1;
+
+ cpi->mb.pi = cpi->mb.pip + cpi->common.mode_info_stride + 1;
+
+ return 0;
+}
+
+void vp9_alloc_compressor_data(VP9_COMP *cpi) {
+ VP9_COMMON *cm = &cpi->common;
+
+ int width = cm->Width;
+ int height = cm->Height;
+
+ if (vp9_alloc_frame_buffers(cm, width, height))
+ vpx_internal_error(&cpi->common.error, VPX_CODEC_MEM_ERROR,
+ "Failed to allocate frame buffers");
+
+ if (alloc_partition_data(cpi))
+ vpx_internal_error(&cpi->common.error, VPX_CODEC_MEM_ERROR,
+ "Failed to allocate partition data");
+
+
+ if ((width & 0xf) != 0)
+ width += 16 - (width & 0xf);
+
+ if ((height & 0xf) != 0)
+ height += 16 - (height & 0xf);
+
+
+ if (vp8_yv12_alloc_frame_buffer(&cpi->last_frame_uf,
+ width, height, VP8BORDERINPIXELS))
+ vpx_internal_error(&cpi->common.error, VPX_CODEC_MEM_ERROR,
+ "Failed to allocate last frame buffer");
+
+ if (vp8_yv12_alloc_frame_buffer(&cpi->scaled_source,
+ width, height, VP8BORDERINPIXELS))
+ vpx_internal_error(&cpi->common.error, VPX_CODEC_MEM_ERROR,
+ "Failed to allocate scaled source buffer");
+
+
+ vpx_free(cpi->tok);
+
+ {
+ unsigned int tokens = cm->mb_rows * cm->mb_cols * 24 * 16;
+
+ CHECK_MEM_ERROR(cpi->tok, vpx_calloc(tokens, sizeof(*cpi->tok)));
+ }
+
+ // Data used for real time vc mode to see if gf needs refreshing
+ cpi->inter_zz_count = 0;
+ cpi->gf_bad_count = 0;
+ cpi->gf_update_recommended = 0;
+
+
+  // Structures used to monitor GF usage
+ vpx_free(cpi->gf_active_flags);
+ CHECK_MEM_ERROR(cpi->gf_active_flags,
+ vpx_calloc(1, cm->mb_rows * cm->mb_cols));
+ cpi->gf_active_count = cm->mb_rows * cm->mb_cols;
+
+ vpx_free(cpi->mb_activity_map);
+ CHECK_MEM_ERROR(cpi->mb_activity_map,
+ vpx_calloc(sizeof(unsigned int),
+ cm->mb_rows * cm->mb_cols));
+
+ vpx_free(cpi->mb_norm_activity_map);
+ CHECK_MEM_ERROR(cpi->mb_norm_activity_map,
+ vpx_calloc(sizeof(unsigned int),
+ cm->mb_rows * cm->mb_cols));
+
+ vpx_free(cpi->twopass.total_stats);
+
+ cpi->twopass.total_stats = vpx_calloc(1, sizeof(FIRSTPASS_STATS));
+
+ vpx_free(cpi->twopass.total_left_stats);
+ cpi->twopass.total_left_stats = vpx_calloc(1, sizeof(FIRSTPASS_STATS));
+
+ vpx_free(cpi->twopass.this_frame_stats);
+
+ cpi->twopass.this_frame_stats = vpx_calloc(1, sizeof(FIRSTPASS_STATS));
+
+ if (!cpi->twopass.total_stats ||
+ !cpi->twopass.total_left_stats ||
+ !cpi->twopass.this_frame_stats)
+ vpx_internal_error(&cpi->common.error, VPX_CODEC_MEM_ERROR,
+ "Failed to allocate firstpass stats");
+
+ vpx_free(cpi->tplist);
+
+ CHECK_MEM_ERROR(cpi->tplist,
+ vpx_malloc(sizeof(TOKENLIST) * (cpi->common.mb_rows)));
+}
+
+
+// TODO: perhaps change the number of steps exposed to the outside world when
+// setting max and min limits. Also, this will likely want refining for the
+// extended Q range.
+//
+// Table that converts 0-63 Q range values passed in outside to the Qindex
+// range used internally.
+static const int q_trans[] = {
+ 0, 4, 8, 12, 16, 20, 24, 28,
+ 32, 36, 40, 44, 48, 52, 56, 60,
+ 64, 68, 72, 76, 80, 84, 88, 92,
+ 96, 100, 104, 108, 112, 116, 120, 124,
+ 128, 132, 136, 140, 144, 148, 152, 156,
+ 160, 164, 168, 172, 176, 180, 184, 188,
+ 192, 196, 200, 204, 208, 212, 216, 220,
+ 224, 228, 232, 236, 240, 244, 249, 255,
+};
+
+int vp9_reverse_trans(int x) {
+ int i;
+
+ for (i = 0; i < 64; i++)
+ if (q_trans[i] >= x)
+ return i;
+
+ return 63;
+}
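+
+// Worked example (illustrative): an application-level quality of 10 maps
+// to q_trans[10] == 40 internally, and vp9_reverse_trans(40) returns 10,
+// so the two directions round-trip for values that fall exactly on the
+// table; other values reverse to the smallest index whose entry is >= x.
+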
+void vp9_new_frame_rate(VP9_COMP *cpi, double framerate) {
+ if (framerate < .1)
+ framerate = 30;
+
+ cpi->oxcf.frame_rate = framerate;
+ cpi->output_frame_rate = cpi->oxcf.frame_rate;
+ cpi->per_frame_bandwidth = (int)(cpi->oxcf.target_bandwidth / cpi->output_frame_rate);
+ cpi->av_per_frame_bandwidth = (int)(cpi->oxcf.target_bandwidth / cpi->output_frame_rate);
+ cpi->min_frame_bandwidth = (int)(cpi->av_per_frame_bandwidth * cpi->oxcf.two_pass_vbrmin_section / 100);
+
+ if (cpi->min_frame_bandwidth < FRAME_OVERHEAD_BITS)
+ cpi->min_frame_bandwidth = FRAME_OVERHEAD_BITS;
+
+ // Set Maximum gf/arf interval
+ cpi->max_gf_interval = ((int)(cpi->output_frame_rate / 2.0) + 2);
+
+ if (cpi->max_gf_interval < 12)
+ cpi->max_gf_interval = 12;
+
+ // Extended interval for genuinely static scenes
+ cpi->twopass.static_scene_max_gf_interval = cpi->key_frame_frequency >> 1;
+
+  // Special conditions when alt ref frame enabled in lagged compress mode
+ if (cpi->oxcf.play_alternate && cpi->oxcf.lag_in_frames) {
+ if (cpi->max_gf_interval > cpi->oxcf.lag_in_frames - 1)
+ cpi->max_gf_interval = cpi->oxcf.lag_in_frames - 1;
+
+ if (cpi->twopass.static_scene_max_gf_interval > cpi->oxcf.lag_in_frames - 1)
+ cpi->twopass.static_scene_max_gf_interval = cpi->oxcf.lag_in_frames - 1;
+ }
+
+ if (cpi->max_gf_interval > cpi->twopass.static_scene_max_gf_interval)
+ cpi->max_gf_interval = cpi->twopass.static_scene_max_gf_interval;
+}
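+
+// Worked example (illustrative, assuming a 2 Mbit/s target at 25 fps):
+// per_frame_bandwidth = 2000000 / 25 = 80000 bits; with
+// two_pass_vbrmin_section == 5 the minimum frame allocation is
+// 80000 * 5 / 100 = 4000 bits (or FRAME_OVERHEAD_BITS if that is larger),
+// and max_gf_interval starts at (int)(25 / 2.0) + 2 = 14 frames before the
+// lag and static-scene clamps are applied.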
+
+
+static int
+rescale(int val, int num, int denom) {
+ int64_t llnum = num;
+ int64_t llden = denom;
+ int64_t llval = val;
+
+  return (int)(llval * llnum / llden);
+}
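+
+// Illustrative use of rescale(): buffer levels are given in milliseconds
+// and converted to bits in vp9_change_config(), e.g. a 4000 ms starting
+// buffer at 256000 bit/s becomes rescale(4000, 256000, 1000)
+// = 4000 * 256000 / 1000 = 1024000 bits; the 64-bit intermediates keep the
+// product from overflowing a 32-bit int.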
+
+
+static void init_config(VP9_PTR ptr, VP9_CONFIG *oxcf) {
+ VP9_COMP *cpi = (VP9_COMP *)(ptr);
+ VP9_COMMON *cm = &cpi->common;
+
+ cpi->oxcf = *oxcf;
+
+ cpi->goldfreq = 7;
+
+ cm->version = oxcf->Version;
+ vp9_setup_version(cm);
+
+ // change includes all joint functionality
+ vp9_change_config(ptr, oxcf);
+
+ // Initialize active best and worst q and average q values.
+ cpi->active_worst_quality = cpi->oxcf.worst_allowed_q;
+ cpi->active_best_quality = cpi->oxcf.best_allowed_q;
+ cpi->avg_frame_qindex = cpi->oxcf.worst_allowed_q;
+
+ // Initialise the starting buffer levels
+ cpi->buffer_level = cpi->oxcf.starting_buffer_level;
+ cpi->bits_off_target = cpi->oxcf.starting_buffer_level;
+
+ cpi->rolling_target_bits = cpi->av_per_frame_bandwidth;
+ cpi->rolling_actual_bits = cpi->av_per_frame_bandwidth;
+ cpi->long_rolling_target_bits = cpi->av_per_frame_bandwidth;
+ cpi->long_rolling_actual_bits = cpi->av_per_frame_bandwidth;
+
+ cpi->total_actual_bits = 0;
+ cpi->total_target_vs_actual = 0;
+
+ cpi->static_mb_pct = 0;
+
+#if VP9_TEMPORAL_ALT_REF
+ {
+ int i;
+
+ cpi->fixed_divide[0] = 0;
+
+ for (i = 1; i < 512; i++)
+ cpi->fixed_divide[i] = 0x80000 / i;
+ }
+#endif
+}
+
+
+void vp9_change_config(VP9_PTR ptr, VP9_CONFIG *oxcf) {
+ VP9_COMP *cpi = (VP9_COMP *)(ptr);
+ VP9_COMMON *cm = &cpi->common;
+
+ if (!cpi)
+ return;
+
+ if (!oxcf)
+ return;
+
+ if (cm->version != oxcf->Version) {
+ cm->version = oxcf->Version;
+ vp9_setup_version(cm);
+ }
+
+ cpi->oxcf = *oxcf;
+
+ switch (cpi->oxcf.Mode) {
+ // Real time and one pass deprecated in test code base
+ case MODE_FIRSTPASS:
+ cpi->pass = 1;
+ cpi->compressor_speed = 1;
+ break;
+
+ case MODE_SECONDPASS:
+ cpi->pass = 2;
+ cpi->compressor_speed = 1;
+
+ if (cpi->oxcf.cpu_used < -5) {
+ cpi->oxcf.cpu_used = -5;
+ }
+
+ if (cpi->oxcf.cpu_used > 5)
+ cpi->oxcf.cpu_used = 5;
+
+ break;
+
+ case MODE_SECONDPASS_BEST:
+ cpi->pass = 2;
+ cpi->compressor_speed = 0;
+ break;
+ }
+
+ cpi->oxcf.worst_allowed_q = q_trans[oxcf->worst_allowed_q];
+ cpi->oxcf.best_allowed_q = q_trans[oxcf->best_allowed_q];
+ cpi->oxcf.cq_level = q_trans[cpi->oxcf.cq_level];
+
+#if CONFIG_LOSSLESS
+ cpi->oxcf.lossless = oxcf->lossless;
+ if (cpi->oxcf.lossless) {
+ cpi->common.rtcd.idct.idct1 = vp9_short_inv_walsh4x4_1_x8_c;
+ cpi->common.rtcd.idct.idct16 = vp9_short_inv_walsh4x4_x8_c;
+ cpi->common.rtcd.idct.idct1_scalar_add = vp9_dc_only_inv_walsh_add_c;
+ cpi->common.rtcd.idct.iwalsh1 = vp9_short_inv_walsh4x4_1_c;
+ cpi->common.rtcd.idct.iwalsh16 = vp9_short_inv_walsh4x4_lossless_c;
+ }
+#endif
+
+ cpi->baseline_gf_interval = DEFAULT_GF_INTERVAL;
+
+ cpi->ref_frame_flags = VP9_ALT_FLAG | VP9_GOLD_FLAG | VP9_LAST_FLAG;
+
+ // cpi->use_golden_frame_only = 0;
+ // cpi->use_last_frame_only = 0;
+ cm->refresh_golden_frame = 0;
+ cm->refresh_last_frame = 1;
+ cm->refresh_entropy_probs = 1;
+
+ setup_features(cpi);
+ cpi->mb.e_mbd.allow_high_precision_mv = 0; // Default mv precision adaptation
+
+ {
+ int i;
+
+ for (i = 0; i < MAX_MB_SEGMENTS; i++)
+ cpi->segment_encode_breakout[i] = cpi->oxcf.encode_breakout;
+ }
+
+ // At the moment the first order values may not be > MAXQ
+ if (cpi->oxcf.fixed_q > MAXQ)
+ cpi->oxcf.fixed_q = MAXQ;
+
+ // local file playback mode == really big buffer
+ if (cpi->oxcf.end_usage == USAGE_LOCAL_FILE_PLAYBACK) {
+ cpi->oxcf.starting_buffer_level = 60000;
+ cpi->oxcf.optimal_buffer_level = 60000;
+ cpi->oxcf.maximum_buffer_size = 240000;
+ }
+
+ // Convert target bandwidth from Kbit/s to Bit/s
+ cpi->oxcf.target_bandwidth *= 1000;
+
+ cpi->oxcf.starting_buffer_level =
+ rescale(cpi->oxcf.starting_buffer_level,
+ cpi->oxcf.target_bandwidth, 1000);
+
+ // Set or reset optimal and maximum buffer levels.
+ if (cpi->oxcf.optimal_buffer_level == 0)
+ cpi->oxcf.optimal_buffer_level = cpi->oxcf.target_bandwidth / 8;
+ else
+ cpi->oxcf.optimal_buffer_level =
+ rescale(cpi->oxcf.optimal_buffer_level,
+ cpi->oxcf.target_bandwidth, 1000);
+
+ if (cpi->oxcf.maximum_buffer_size == 0)
+ cpi->oxcf.maximum_buffer_size = cpi->oxcf.target_bandwidth / 8;
+ else
+ cpi->oxcf.maximum_buffer_size =
+ rescale(cpi->oxcf.maximum_buffer_size,
+ cpi->oxcf.target_bandwidth, 1000);
+
+  // Set up frame rate and related rate control parameters.
+ vp9_new_frame_rate(cpi, cpi->oxcf.frame_rate);
+
+ // Set absolute upper and lower quality limits
+ cpi->worst_quality = cpi->oxcf.worst_allowed_q;
+ cpi->best_quality = cpi->oxcf.best_allowed_q;
+
+ // active values should only be modified if out of new range
+ if (cpi->active_worst_quality > cpi->oxcf.worst_allowed_q) {
+ cpi->active_worst_quality = cpi->oxcf.worst_allowed_q;
+ }
+ // less likely
+ else if (cpi->active_worst_quality < cpi->oxcf.best_allowed_q) {
+ cpi->active_worst_quality = cpi->oxcf.best_allowed_q;
+ }
+ if (cpi->active_best_quality < cpi->oxcf.best_allowed_q) {
+ cpi->active_best_quality = cpi->oxcf.best_allowed_q;
+ }
+ // less likely
+ else if (cpi->active_best_quality > cpi->oxcf.worst_allowed_q) {
+ cpi->active_best_quality = cpi->oxcf.worst_allowed_q;
+ }
+
+ cpi->buffered_mode = (cpi->oxcf.optimal_buffer_level > 0) ? TRUE : FALSE;
+
+ cpi->cq_target_quality = cpi->oxcf.cq_level;
+
+ if (!cm->use_bilinear_mc_filter)
+ cm->mcomp_filter_type = DEFAULT_INTERP_FILTER;
+ else
+ cm->mcomp_filter_type = BILINEAR;
+
+ cpi->target_bandwidth = cpi->oxcf.target_bandwidth;
+
+ cm->Width = cpi->oxcf.Width;
+ cm->Height = cpi->oxcf.Height;
+
+ cm->horiz_scale = cpi->horiz_scale;
+ cm->vert_scale = cpi->vert_scale;
+
+ // VP8 sharpness level mapping 0-7 (vs 0-10 in general VPx dialogs)
+ if (cpi->oxcf.Sharpness > 7)
+ cpi->oxcf.Sharpness = 7;
+
+ cm->sharpness_level = cpi->oxcf.Sharpness;
+
+ if (cm->horiz_scale != NORMAL || cm->vert_scale != NORMAL) {
+ int UNINITIALIZED_IS_SAFE(hr), UNINITIALIZED_IS_SAFE(hs);
+ int UNINITIALIZED_IS_SAFE(vr), UNINITIALIZED_IS_SAFE(vs);
+
+ Scale2Ratio(cm->horiz_scale, &hr, &hs);
+ Scale2Ratio(cm->vert_scale, &vr, &vs);
+
+ // always go to the next whole number
+ cm->Width = (hs - 1 + cpi->oxcf.Width * hr) / hs;
+ cm->Height = (vs - 1 + cpi->oxcf.Height * vr) / vs;
+ }
+
+ if (((cm->Width + 15) & 0xfffffff0) !=
+ cm->yv12_fb[cm->lst_fb_idx].y_width ||
+ ((cm->Height + 15) & 0xfffffff0) !=
+ cm->yv12_fb[cm->lst_fb_idx].y_height ||
+ cm->yv12_fb[cm->lst_fb_idx].y_width == 0) {
+ alloc_raw_frame_buffers(cpi);
+ vp9_alloc_compressor_data(cpi);
+ }
+
+ if (cpi->oxcf.fixed_q >= 0) {
+ cpi->last_q[0] = cpi->oxcf.fixed_q;
+ cpi->last_q[1] = cpi->oxcf.fixed_q;
+ cpi->last_boosted_qindex = cpi->oxcf.fixed_q;
+ }
+
+ cpi->Speed = cpi->oxcf.cpu_used;
+
+  // Force allow_lag to 0 if lag_in_frames is 0
+ if (cpi->oxcf.lag_in_frames == 0) {
+ cpi->oxcf.allow_lag = 0;
+ }
+ // Limit on lag buffers as these are not currently dynamically allocated
+ else if (cpi->oxcf.lag_in_frames > MAX_LAG_BUFFERS)
+ cpi->oxcf.lag_in_frames = MAX_LAG_BUFFERS;
+
+ // YX Temp
+ cpi->alt_ref_source = NULL;
+ cpi->is_src_frame_alt_ref = 0;
+
+#if 0
+ // Experimental RD Code
+ cpi->frame_distortion = 0;
+ cpi->last_frame_distortion = 0;
+#endif
+
+}
+
+#define M_LOG2_E 0.693147180559945309417  /* natural log of 2, so log(x) / M_LOG2_E == log2(x) */
+#define log2f(x) (log (x) / (float) M_LOG2_E)
+
+static void cal_nmvjointsadcost(int *mvjointsadcost) {
+ mvjointsadcost[0] = 600;
+ mvjointsadcost[1] = 300;
+ mvjointsadcost[2] = 300;
+  mvjointsadcost[3] = 300;
+}
+
+static void cal_nmvsadcosts(int *mvsadcost[2]) {
+ int i = 1;
+
+ mvsadcost [0] [0] = 0;
+ mvsadcost [1] [0] = 0;
+
+ do {
+ double z = 256 * (2 * (log2f(8 * i) + .6));
+ mvsadcost [0][i] = (int) z;
+ mvsadcost [1][i] = (int) z;
+ mvsadcost [0][-i] = (int) z;
+ mvsadcost [1][-i] = (int) z;
+ } while (++i <= MV_MAX);
+}
+
+static void cal_nmvsadcosts_hp(int *mvsadcost[2]) {
+ int i = 1;
+
+ mvsadcost [0] [0] = 0;
+ mvsadcost [1] [0] = 0;
+
+ do {
+ double z = 256 * (2 * (log2f(8 * i) + .6));
+ mvsadcost [0][i] = (int) z;
+ mvsadcost [1][i] = (int) z;
+ mvsadcost [0][-i] = (int) z;
+ mvsadcost [1][-i] = (int) z;
+ } while (++i <= MV_MAX);
+}
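+
+// Worked example (illustrative): for a motion vector component of one
+// unit, z = 256 * (2 * (log2f(8 * 1) + .6)) = 256 * 7.2, roughly 1843
+// after truncation, and the tables are symmetric in sign
+// (mvsadcost[c][i] == mvsadcost[c][-i]). The _hp variant fills the same
+// shape of table for the high-precision (1/8 pel) motion vector case.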
+
+VP9_PTR vp9_create_compressor(VP9_CONFIG *oxcf) {
+ int i;
+ volatile union {
+ VP9_COMP *cpi;
+ VP9_PTR ptr;
+ } ctx;
+
+ VP9_COMP *cpi;
+ VP9_COMMON *cm;
+
+ cpi = ctx.cpi = vpx_memalign(32, sizeof(VP9_COMP));
+ // Check that the CPI instance is valid
+ if (!cpi)
+ return 0;
+
+ cm = &cpi->common;
+
+ vpx_memset(cpi, 0, sizeof(VP9_COMP));
+
+ if (setjmp(cm->error.jmp)) {
+ VP9_PTR ptr = ctx.ptr;
+
+ ctx.cpi->common.error.setjmp = 0;
+ vp9_remove_compressor(&ptr);
+ return 0;
+ }
+
+ cpi->common.error.setjmp = 1;
+
+ CHECK_MEM_ERROR(cpi->mb.ss, vpx_calloc(sizeof(search_site), (MAX_MVSEARCH_STEPS * 8) + 1));
+
+ vp9_create_common(&cpi->common);
+ vp9_cmachine_specific_config(cpi);
+
+ init_config((VP9_PTR)cpi, oxcf);
+
+ memcpy(cpi->base_skip_false_prob, base_skip_false_prob, sizeof(base_skip_false_prob));
+ cpi->common.current_video_frame = 0;
+ cpi->kf_overspend_bits = 0;
+ cpi->kf_bitrate_adjustment = 0;
+ cpi->frames_till_gf_update_due = 0;
+ cpi->gf_overspend_bits = 0;
+ cpi->non_gf_bitrate_adjustment = 0;
+ cm->prob_last_coded = 128;
+ cm->prob_gf_coded = 128;
+ cm->prob_intra_coded = 63;
+#if CONFIG_SUPERBLOCKS
+ cm->sb_coded = 200;
+#endif
+ for (i = 0; i < COMP_PRED_CONTEXTS; i++)
+ cm->prob_comppred[i] = 128;
+ for (i = 0; i < TX_SIZE_MAX - 1; i++)
+ cm->prob_tx[i] = 128;
+
+  // Prime the recent reference frame usage counters.
+ // Hereafter they will be maintained as a sort of moving average
+ cpi->recent_ref_frame_usage[INTRA_FRAME] = 1;
+ cpi->recent_ref_frame_usage[LAST_FRAME] = 1;
+ cpi->recent_ref_frame_usage[GOLDEN_FRAME] = 1;
+ cpi->recent_ref_frame_usage[ALTREF_FRAME] = 1;
+
+ // Set reference frame sign bias for ALTREF frame to 1 (for now)
+ cpi->common.ref_frame_sign_bias[ALTREF_FRAME] = 1;
+
+ cpi->baseline_gf_interval = DEFAULT_GF_INTERVAL;
+
+ cpi->gold_is_last = 0;
+ cpi->alt_is_last = 0;
+ cpi->gold_is_alt = 0;
+
+ // allocate memory for storing last frame's MVs for MV prediction.
+ CHECK_MEM_ERROR(cpi->lfmv, vpx_calloc((cpi->common.mb_rows + 2) * (cpi->common.mb_cols + 2), sizeof(int_mv)));
+ CHECK_MEM_ERROR(cpi->lf_ref_frame_sign_bias, vpx_calloc((cpi->common.mb_rows + 2) * (cpi->common.mb_cols + 2), sizeof(int)));
+ CHECK_MEM_ERROR(cpi->lf_ref_frame, vpx_calloc((cpi->common.mb_rows + 2) * (cpi->common.mb_cols + 2), sizeof(int)));
+
+ // Create the encoder segmentation map and set all entries to 0
+ CHECK_MEM_ERROR(cpi->segmentation_map, vpx_calloc((cpi->common.mb_rows * cpi->common.mb_cols), 1));
+
+ // And a copy in common for temporal coding
+ CHECK_MEM_ERROR(cm->last_frame_seg_map,
+ vpx_calloc((cpi->common.mb_rows * cpi->common.mb_cols), 1));
+
+  // And a placeholder structure in the coding context
+  // for use if we want to save and restore it
+ CHECK_MEM_ERROR(cpi->coding_context.last_frame_seg_map_copy,
+ vpx_calloc((cpi->common.mb_rows * cpi->common.mb_cols), 1));
+
+ CHECK_MEM_ERROR(cpi->active_map, vpx_calloc(cpi->common.mb_rows * cpi->common.mb_cols, 1));
+ vpx_memset(cpi->active_map, 1, (cpi->common.mb_rows * cpi->common.mb_cols));
+ cpi->active_map_enabled = 0;
+
+ for (i = 0; i < (sizeof(cpi->mbgraph_stats) /
+ sizeof(cpi->mbgraph_stats[0])); i++) {
+ CHECK_MEM_ERROR(cpi->mbgraph_stats[i].mb_stats,
+ vpx_calloc(cpi->common.mb_rows * cpi->common.mb_cols *
+ sizeof(*cpi->mbgraph_stats[i].mb_stats),
+ 1));
+ }
+
+#ifdef ENTROPY_STATS
+ if (cpi->pass != 1)
+ init_context_counters();
+#endif
+#ifdef MODE_STATS
+ vp9_zero(y_modes);
+ vp9_zero(i8x8_modes);
+ vp9_zero(uv_modes);
+ vp9_zero(uv_modes_y);
+ vp9_zero(b_modes);
+ vp9_zero(inter_y_modes);
+ vp9_zero(inter_uv_modes);
+ vp9_zero(inter_b_modes);
+#endif
+#ifdef NMV_STATS
+ init_nmvstats();
+#endif
+
+  /* Initialize the feed-forward activity masking. */
+ cpi->activity_avg = 90 << 12;
+
+ cpi->frames_since_key = 8; // Give a sensible default for the first frame.
+ cpi->key_frame_frequency = cpi->oxcf.key_freq;
+ cpi->this_key_frame_forced = FALSE;
+ cpi->next_key_frame_forced = FALSE;
+
+ cpi->source_alt_ref_pending = FALSE;
+ cpi->source_alt_ref_active = FALSE;
+ cpi->common.refresh_alt_ref_frame = 0;
+
+ cpi->b_calculate_psnr = CONFIG_INTERNAL_STATS;
+#if CONFIG_INTERNAL_STATS
+ cpi->b_calculate_ssimg = 0;
+
+ cpi->count = 0;
+ cpi->bytes = 0;
+
+ if (cpi->b_calculate_psnr) {
+ cpi->total_sq_error = 0.0;
+ cpi->total_sq_error2 = 0.0;
+ cpi->total_y = 0.0;
+ cpi->total_u = 0.0;
+ cpi->total_v = 0.0;
+ cpi->total = 0.0;
+ cpi->totalp_y = 0.0;
+ cpi->totalp_u = 0.0;
+ cpi->totalp_v = 0.0;
+ cpi->totalp = 0.0;
+ cpi->tot_recode_hits = 0;
+ cpi->summed_quality = 0;
+ cpi->summed_weights = 0;
+ }
+
+ if (cpi->b_calculate_ssimg) {
+ cpi->total_ssimg_y = 0;
+ cpi->total_ssimg_u = 0;
+ cpi->total_ssimg_v = 0;
+ cpi->total_ssimg_all = 0;
+ }
+
+#endif
+
+#ifndef LLONG_MAX
+#define LLONG_MAX 9223372036854775807LL
+#endif
+ cpi->first_time_stamp_ever = LLONG_MAX;
+
+ cpi->frames_till_gf_update_due = 0;
+ cpi->key_frame_count = 1;
+
+ cpi->ni_av_qi = cpi->oxcf.worst_allowed_q;
+ cpi->ni_tot_qi = 0;
+ cpi->ni_frames = 0;
+ cpi->tot_q = 0.0;
+ cpi->avg_q = vp9_convert_qindex_to_q(cpi->oxcf.worst_allowed_q);
+ cpi->total_byte_count = 0;
+
+ cpi->rate_correction_factor = 1.0;
+ cpi->key_frame_rate_correction_factor = 1.0;
+ cpi->gf_rate_correction_factor = 1.0;
+ cpi->twopass.est_max_qcorrection_factor = 1.0;
+
+ cal_nmvjointsadcost(cpi->mb.nmvjointsadcost);
+ cpi->mb.nmvcost[0] = &cpi->mb.nmvcosts[0][MV_MAX];
+ cpi->mb.nmvcost[1] = &cpi->mb.nmvcosts[1][MV_MAX];
+ cpi->mb.nmvsadcost[0] = &cpi->mb.nmvsadcosts[0][MV_MAX];
+ cpi->mb.nmvsadcost[1] = &cpi->mb.nmvsadcosts[1][MV_MAX];
+ cal_nmvsadcosts(cpi->mb.nmvsadcost);
+
+ cpi->mb.nmvcost_hp[0] = &cpi->mb.nmvcosts_hp[0][MV_MAX];
+ cpi->mb.nmvcost_hp[1] = &cpi->mb.nmvcosts_hp[1][MV_MAX];
+ cpi->mb.nmvsadcost_hp[0] = &cpi->mb.nmvsadcosts_hp[0][MV_MAX];
+ cpi->mb.nmvsadcost_hp[1] = &cpi->mb.nmvsadcosts_hp[1][MV_MAX];
+ cal_nmvsadcosts_hp(cpi->mb.nmvsadcost_hp);
+
+ for (i = 0; i < KEY_FRAME_CONTEXT; i++) {
+ cpi->prior_key_frame_distance[i] = (int)cpi->output_frame_rate;
+ }
+
+#ifdef OUTPUT_YUV_SRC
+ yuv_file = fopen("bd.yuv", "ab");
+#endif
+#ifdef OUTPUT_YUV_REC
+ yuv_rec_file = fopen("rec.yuv", "wb");
+#endif
+
+#if 0
+ framepsnr = fopen("framepsnr.stt", "a");
+ kf_list = fopen("kf_list.stt", "w");
+#endif
+
+ cpi->output_pkt_list = oxcf->output_pkt_list;
+
+ if (cpi->pass == 1) {
+ vp9_init_first_pass(cpi);
+ } else if (cpi->pass == 2) {
+ size_t packet_sz = sizeof(FIRSTPASS_STATS);
+ int packets = oxcf->two_pass_stats_in.sz / packet_sz;
+
+ cpi->twopass.stats_in_start = oxcf->two_pass_stats_in.buf;
+ cpi->twopass.stats_in = cpi->twopass.stats_in_start;
+ cpi->twopass.stats_in_end = (void *)((char *)cpi->twopass.stats_in
+ + (packets - 1) * packet_sz);
+ vp9_init_second_pass(cpi);
+ }
+
+ vp9_set_speed_features(cpi);
+
+ // Set starting values of RD threshold multipliers (128 = *1)
+ for (i = 0; i < MAX_MODES; i++) {
+ cpi->rd_thresh_mult[i] = 128;
+ }
+
+#ifdef ENTROPY_STATS
+ init_mv_ref_counts();
+#endif
+
+#define BFP(BT, SDF, VF, SVF, SVFHH, SVFHV, SVFHHV, SDX3F, SDX8F, SDX4DF) \
+ cpi->fn_ptr[BT].sdf = SDF; \
+ cpi->fn_ptr[BT].vf = VF; \
+ cpi->fn_ptr[BT].svf = SVF; \
+ cpi->fn_ptr[BT].svf_halfpix_h = SVFHH; \
+ cpi->fn_ptr[BT].svf_halfpix_v = SVFHV; \
+ cpi->fn_ptr[BT].svf_halfpix_hv = SVFHHV; \
+ cpi->fn_ptr[BT].sdx3f = SDX3F; \
+ cpi->fn_ptr[BT].sdx8f = SDX8F; \
+ cpi->fn_ptr[BT].sdx4df = SDX4DF;
+
+
+#if CONFIG_SUPERBLOCKS
+ BFP(BLOCK_32X32, vp9_sad32x32, vp9_variance32x32, vp9_sub_pixel_variance32x32,
+ vp9_variance_halfpixvar32x32_h, vp9_variance_halfpixvar32x32_v,
+ vp9_variance_halfpixvar32x32_hv, vp9_sad32x32x3, vp9_sad32x32x8,
+ vp9_sad32x32x4d)
+#endif
+
+ BFP(BLOCK_16X16, vp9_sad16x16, vp9_variance16x16, vp9_sub_pixel_variance16x16,
+ vp9_variance_halfpixvar16x16_h, vp9_variance_halfpixvar16x16_v,
+ vp9_variance_halfpixvar16x16_hv, vp9_sad16x16x3, vp9_sad16x16x8,
+ vp9_sad16x16x4d)
+
+ BFP(BLOCK_16X8, vp9_sad16x8, vp9_variance16x8, vp9_sub_pixel_variance16x8,
+ NULL, NULL, NULL, vp9_sad16x8x3, vp9_sad16x8x8, vp9_sad16x8x4d)
+
+ BFP(BLOCK_8X16, vp9_sad8x16, vp9_variance8x16, vp9_sub_pixel_variance8x16,
+ NULL, NULL, NULL, vp9_sad8x16x3, vp9_sad8x16x8, vp9_sad8x16x4d)
+
+ BFP(BLOCK_8X8, vp9_sad8x8, vp9_variance8x8, vp9_sub_pixel_variance8x8,
+ NULL, NULL, NULL, vp9_sad8x8x3, vp9_sad8x8x8, vp9_sad8x8x4d)
+
+ BFP(BLOCK_4X4, vp9_sad4x4, vp9_variance4x4, vp9_sub_pixel_variance4x4,
+ NULL, NULL, NULL, vp9_sad4x4x3, vp9_sad4x4x8, vp9_sad4x4x4d)
+
+#if ARCH_X86 || ARCH_X86_64
+ cpi->fn_ptr[BLOCK_16X16].copymem = vp9_copy32xn;
+ cpi->fn_ptr[BLOCK_16X8].copymem = vp9_copy32xn;
+ cpi->fn_ptr[BLOCK_8X16].copymem = vp9_copy32xn;
+ cpi->fn_ptr[BLOCK_8X8].copymem = vp9_copy32xn;
+ cpi->fn_ptr[BLOCK_4X4].copymem = vp9_copy32xn;
+#endif
+
+ cpi->full_search_sad = SEARCH_INVOKE(&cpi->rtcd.search, full_search);
+ cpi->diamond_search_sad = SEARCH_INVOKE(&cpi->rtcd.search, diamond_search);
+ cpi->refining_search_sad = SEARCH_INVOKE(&cpi->rtcd.search, refining_search);
+
+ // make sure frame 1 is okay
+ cpi->error_bins[0] = cpi->common.MBs;
+
+ /* vp9_init_quantizer() is first called here. Add check in
+ * vp9_frame_init_quantizer() so that vp9_init_quantizer is only
+ * called later when needed. This will avoid unnecessary calls of
+ * vp9_init_quantizer() for every frame.
+ */
+ vp9_init_quantizer(cpi);
+
+ vp9_loop_filter_init(cm);
+
+ cpi->common.error.setjmp = 0;
+
+ vp9_zero(cpi->y_uv_mode_count)
+
+ return (VP9_PTR) cpi;
+}
+
+void vp9_remove_compressor(VP9_PTR *ptr) {
+ VP9_COMP *cpi = (VP9_COMP *)(*ptr);
+ int i;
+
+ if (!cpi)
+ return;
+
+ if (cpi && (cpi->common.current_video_frame > 0)) {
+ if (cpi->pass == 2) {
+ vp9_end_second_pass(cpi);
+ }
+
+#ifdef ENTROPY_STATS
+ if (cpi->pass != 1) {
+ print_context_counters();
+ print_tree_update_probs();
+ print_mode_context();
+ }
+#endif
+#ifdef NMV_STATS
+ if (cpi->pass != 1)
+ print_nmvstats();
+#endif
+
+#if CONFIG_INTERNAL_STATS
+
+ vp9_clear_system_state();
+
+ // printf("\n8x8-4x4:%d-%d\n", cpi->t8x8_count, cpi->t4x4_count);
+ if (cpi->pass != 1) {
+ FILE *f = fopen("opsnr.stt", "a");
+ double time_encoded = (cpi->last_end_time_stamp_seen
+ - cpi->first_time_stamp_ever) / 10000000.000;
+ double total_encode_time = (cpi->time_receive_data + cpi->time_compress_data) / 1000.000;
+ double dr = (double)cpi->bytes * (double) 8 / (double)1000 / time_encoded;
+#if defined(MODE_STATS)
+ print_mode_contexts(&cpi->common);
+#endif
+ if (cpi->b_calculate_psnr) {
+ YV12_BUFFER_CONFIG *lst_yv12 = &cpi->common.yv12_fb[cpi->common.lst_fb_idx];
+ double samples = 3.0 / 2 * cpi->count * lst_yv12->y_width * lst_yv12->y_height;
+ double total_psnr = vp9_mse2psnr(samples, 255.0, cpi->total_sq_error);
+ double total_psnr2 = vp9_mse2psnr(samples, 255.0, cpi->total_sq_error2);
+ double total_ssim = 100 * pow(cpi->summed_quality / cpi->summed_weights, 8.0);
+
+ fprintf(f, "Bitrate\tAVGPsnr\tGLBPsnr\tAVPsnrP\tGLPsnrP\tVPXSSIM\t Time(ms)\n");
+ fprintf(f, "%7.2f\t%7.3f\t%7.3f\t%7.3f\t%7.3f\t%7.3f\t%8.0f\n",
+ dr, cpi->total / cpi->count, total_psnr, cpi->totalp / cpi->count, total_psnr2, total_ssim,
+ total_encode_time);
+// fprintf(f, "%7.3f\t%7.3f\t%7.3f\t%7.3f\t%7.3f\t%7.3f\t%8.0f %10ld\n",
+// dr, cpi->total / cpi->count, total_psnr, cpi->totalp / cpi->count, total_psnr2, total_ssim,
+// total_encode_time, cpi->tot_recode_hits);
+ }
+
+ if (cpi->b_calculate_ssimg) {
+ fprintf(f, "BitRate\tSSIM_Y\tSSIM_U\tSSIM_V\tSSIM_A\t Time(ms)\n");
+ fprintf(f, "%7.2f\t%6.4f\t%6.4f\t%6.4f\t%6.4f\t%8.0f\n", dr,
+ cpi->total_ssimg_y / cpi->count, cpi->total_ssimg_u / cpi->count,
+ cpi->total_ssimg_v / cpi->count, cpi->total_ssimg_all / cpi->count, total_encode_time);
+// fprintf(f, "%7.3f\t%6.4f\t%6.4f\t%6.4f\t%6.4f\t%8.0f %10ld\n", dr,
+// cpi->total_ssimg_y / cpi->count, cpi->total_ssimg_u / cpi->count,
+// cpi->total_ssimg_v / cpi->count, cpi->total_ssimg_all / cpi->count, total_encode_time, cpi->tot_recode_hits);
+ }
+
+ fclose(f);
+ }
+
+#endif
+
+
+#ifdef MODE_STATS
+ {
+ extern int count_mb_seg[4];
+ char modes_stats_file[250];
+ FILE *f;
+ double dr = (double)cpi->oxcf.frame_rate * (double)cpi->bytes * (double)8 / (double)cpi->count / (double)1000;
+ sprintf(modes_stats_file, "modes_q%03d.stt", cpi->common.base_qindex);
+ f = fopen(modes_stats_file, "w");
+ fprintf(f, "intra_mode in Intra Frames:\n");
+ {
+ int i;
+ fprintf(f, "Y: ");
+ for (i = 0; i < VP9_YMODES; i++) fprintf(f, " %8d,", y_modes[i]);
+ fprintf(f, "\n");
+ }
+ {
+ int i;
+ fprintf(f, "I8: ");
+ for (i = 0; i < VP9_I8X8_MODES; i++) fprintf(f, " %8d,", i8x8_modes[i]);
+ fprintf(f, "\n");
+ }
+ {
+ int i;
+ fprintf(f, "UV: ");
+ for (i = 0; i < VP9_UV_MODES; i++) fprintf(f, " %8d,", uv_modes[i]);
+ fprintf(f, "\n");
+ }
+ {
+ int i, j;
+ fprintf(f, "KeyFrame Y-UV:\n");
+ for (i = 0; i < VP9_YMODES; i++) {
+ fprintf(f, "%2d:", i);
+ for (j = 0; j < VP9_UV_MODES; j++) fprintf(f, "%8d, ", uv_modes_y[i][j]);
+ fprintf(f, "\n");
+ }
+ }
+ {
+ int i, j;
+ fprintf(f, "Inter Y-UV:\n");
+ for (i = 0; i < VP9_YMODES; i++) {
+ fprintf(f, "%2d:", i);
+ for (j = 0; j < VP9_UV_MODES; j++) fprintf(f, "%8d, ", cpi->y_uv_mode_count[i][j]);
+ fprintf(f, "\n");
+ }
+ }
+ {
+ int i;
+
+ fprintf(f, "B: ");
+ for (i = 0; i < VP9_BINTRAMODES; i++)
+ fprintf(f, "%8d, ", b_modes[i]);
+
+ fprintf(f, "\n");
+
+ }
+
+ fprintf(f, "Modes in Inter Frames:\n");
+ {
+ int i;
+ fprintf(f, "Y: ");
+ for (i = 0; i < MB_MODE_COUNT; i++) fprintf(f, " %8d,", inter_y_modes[i]);
+ fprintf(f, "\n");
+ }
+ {
+ int i;
+ fprintf(f, "UV: ");
+ for (i = 0; i < VP9_UV_MODES; i++) fprintf(f, " %8d,", inter_uv_modes[i]);
+ fprintf(f, "\n");
+ }
+ {
+ int i;
+ fprintf(f, "B: ");
+ for (i = 0; i < B_MODE_COUNT; i++) fprintf(f, "%8d, ", inter_b_modes[i]);
+ fprintf(f, "\n");
+ }
+ fprintf(f, "P:%8d, %8d, %8d, %8d\n", count_mb_seg[0], count_mb_seg[1], count_mb_seg[2], count_mb_seg[3]);
+ fprintf(f, "PB:%8d, %8d, %8d, %8d\n", inter_b_modes[LEFT4X4], inter_b_modes[ABOVE4X4], inter_b_modes[ZERO4X4], inter_b_modes[NEW4X4]);
+ fclose(f);
+ }
+#endif
+
+#ifdef ENTROPY_STATS
+ {
+ int i, j, k;
+ FILE *fmode = fopen("modecontext.c", "w");
+
+ fprintf(fmode, "\n#include \"entropymode.h\"\n\n");
+ fprintf(fmode, "const unsigned int vp9_kf_default_bmode_counts ");
+ fprintf(fmode, "[VP9_BINTRAMODES] [VP9_BINTRAMODES] [VP9_BINTRAMODES] =\n{\n");
+
+ for (i = 0; i < 10; i++) {
+
+ fprintf(fmode, " { // Above Mode : %d\n", i);
+
+ for (j = 0; j < 10; j++) {
+
+ fprintf(fmode, " {");
+
+ for (k = 0; k < VP9_BINTRAMODES; k++) {
+ if (!intra_mode_stats[i][j][k])
+ fprintf(fmode, " %5d, ", 1);
+ else
+ fprintf(fmode, " %5d, ", intra_mode_stats[i][j][k]);
+ }
+
+ fprintf(fmode, "}, // left_mode %d\n", j);
+
+ }
+
+ fprintf(fmode, " },\n");
+
+ }
+
+ fprintf(fmode, "};\n");
+ fclose(fmode);
+ }
+#endif
+
+
+#if defined(SECTIONBITS_OUTPUT)
+
+ if (0) {
+ int i;
+ FILE *f = fopen("tokenbits.stt", "a");
+
+ for (i = 0; i < 28; i++)
+ fprintf(f, "%8d", (int)(Sectionbits[i] / 256));
+
+ fprintf(f, "\n");
+ fclose(f);
+ }
+
+#endif
+
+#if 0
+ {
+ printf("\n_pick_loop_filter_level:%d\n", cpi->time_pick_lpf / 1000);
+ printf("\n_frames recive_data encod_mb_row compress_frame Total\n");
+ printf("%6d %10ld %10ld %10ld %10ld\n", cpi->common.current_video_frame, cpi->time_receive_data / 1000, cpi->time_encode_mb_row / 1000, cpi->time_compress_data / 1000, (cpi->time_receive_data + cpi->time_compress_data) / 1000);
+ }
+#endif
+
+ }
+
+ dealloc_compressor_data(cpi);
+ vpx_free(cpi->mb.ss);
+ vpx_free(cpi->tok);
+
+ for (i = 0; i < sizeof(cpi->mbgraph_stats) / sizeof(cpi->mbgraph_stats[0]); i++) {
+ vpx_free(cpi->mbgraph_stats[i].mb_stats);
+ }
+
+ vp9_remove_common(&cpi->common);
+ vpx_free(cpi);
+ *ptr = 0;
+
+#ifdef OUTPUT_YUV_SRC
+ fclose(yuv_file);
+#endif
+#ifdef OUTPUT_YUV_REC
+ fclose(yuv_rec_file);
+#endif
+
+#if 0
+
+ if (keyfile)
+ fclose(keyfile);
+
+ if (framepsnr)
+ fclose(framepsnr);
+
+ if (kf_list)
+ fclose(kf_list);
+
+#endif
+
+}
+
+
+static uint64_t calc_plane_error(unsigned char *orig, int orig_stride,
+ unsigned char *recon, int recon_stride,
+ unsigned int cols, unsigned int rows) {
+ unsigned int row, col;
+ uint64_t total_sse = 0;
+ int diff;
+
+ for (row = 0; row + 16 <= rows; row += 16) {
+ for (col = 0; col + 16 <= cols; col += 16) {
+ unsigned int sse;
+
+ vp9_mse16x16(orig + col, orig_stride, recon + col, recon_stride, &sse);
+ total_sse += sse;
+ }
+
+ /* Handle odd-sized width */
+ if (col < cols) {
+ unsigned int border_row, border_col;
+ unsigned char *border_orig = orig;
+ unsigned char *border_recon = recon;
+
+ for (border_row = 0; border_row < 16; border_row++) {
+ for (border_col = col; border_col < cols; border_col++) {
+ diff = border_orig[border_col] - border_recon[border_col];
+ total_sse += diff * diff;
+ }
+
+ border_orig += orig_stride;
+ border_recon += recon_stride;
+ }
+ }
+
+ orig += orig_stride * 16;
+ recon += recon_stride * 16;
+ }
+
+ /* Handle odd-sized height */
+ for (; row < rows; row++) {
+ for (col = 0; col < cols; col++) {
+ diff = orig[col] - recon[col];
+ total_sse += diff * diff;
+ }
+
+ orig += orig_stride;
+ recon += recon_stride;
+ }
+
+ return total_sse;
+}
+
+
+static void generate_psnr_packet(VP9_COMP *cpi) {
+ YV12_BUFFER_CONFIG *orig = cpi->Source;
+ YV12_BUFFER_CONFIG *recon = cpi->common.frame_to_show;
+ struct vpx_codec_cx_pkt pkt;
+ uint64_t sse;
+ int i;
+ unsigned int width = cpi->common.Width;
+ unsigned int height = cpi->common.Height;
+
+ pkt.kind = VPX_CODEC_PSNR_PKT;
+ sse = calc_plane_error(orig->y_buffer, orig->y_stride,
+ recon->y_buffer, recon->y_stride,
+ width, height);
+ pkt.data.psnr.sse[0] = sse;
+ pkt.data.psnr.sse[1] = sse;
+ pkt.data.psnr.samples[0] = width * height;
+ pkt.data.psnr.samples[1] = width * height;
+
+ width = (width + 1) / 2;
+ height = (height + 1) / 2;
+
+ sse = calc_plane_error(orig->u_buffer, orig->uv_stride,
+ recon->u_buffer, recon->uv_stride,
+ width, height);
+ pkt.data.psnr.sse[0] += sse;
+ pkt.data.psnr.sse[2] = sse;
+ pkt.data.psnr.samples[0] += width * height;
+ pkt.data.psnr.samples[2] = width * height;
+
+ sse = calc_plane_error(orig->v_buffer, orig->uv_stride,
+ recon->v_buffer, recon->uv_stride,
+ width, height);
+ pkt.data.psnr.sse[0] += sse;
+ pkt.data.psnr.sse[3] = sse;
+ pkt.data.psnr.samples[0] += width * height;
+ pkt.data.psnr.samples[3] = width * height;
+
+ for (i = 0; i < 4; i++)
+ pkt.data.psnr.psnr[i] = vp9_mse2psnr(pkt.data.psnr.samples[i], 255.0,
+ pkt.data.psnr.sse[i]);
+
+ vpx_codec_pkt_list_add(cpi->output_pkt_list, &pkt);
+}
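+
+// Note (assumption about vp9_mse2psnr(), for reference only): the PSNR
+// values packed above are expected to follow
+//   psnr = 10 * log10(samples * 255.0 * 255.0 / sse)
+// with index 0 covering all three planes combined and indices 1..3 the
+// Y, U and V planes respectively.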
+
+
+int vp9_use_as_reference(VP9_PTR ptr, int ref_frame_flags) {
+ VP9_COMP *cpi = (VP9_COMP *)(ptr);
+
+ if (ref_frame_flags > 7)
+ return -1;
+
+ cpi->ref_frame_flags = ref_frame_flags;
+ return 0;
+}
+int vp9_update_reference(VP9_PTR ptr, int ref_frame_flags) {
+ VP9_COMP *cpi = (VP9_COMP *)(ptr);
+
+ if (ref_frame_flags > 7)
+ return -1;
+
+ cpi->common.refresh_golden_frame = 0;
+ cpi->common.refresh_alt_ref_frame = 0;
+ cpi->common.refresh_last_frame = 0;
+
+ if (ref_frame_flags & VP9_LAST_FLAG)
+ cpi->common.refresh_last_frame = 1;
+
+ if (ref_frame_flags & VP9_GOLD_FLAG)
+ cpi->common.refresh_golden_frame = 1;
+
+ if (ref_frame_flags & VP9_ALT_FLAG)
+ cpi->common.refresh_alt_ref_frame = 1;
+
+ return 0;
+}
+
+int vp9_get_reference_enc(VP9_PTR ptr, VP9_REFFRAME ref_frame_flag,
+ YV12_BUFFER_CONFIG *sd) {
+ VP9_COMP *cpi = (VP9_COMP *)(ptr);
+ VP9_COMMON *cm = &cpi->common;
+ int ref_fb_idx;
+
+ if (ref_frame_flag == VP9_LAST_FLAG)
+ ref_fb_idx = cm->lst_fb_idx;
+ else if (ref_frame_flag == VP9_GOLD_FLAG)
+ ref_fb_idx = cm->gld_fb_idx;
+ else if (ref_frame_flag == VP9_ALT_FLAG)
+ ref_fb_idx = cm->alt_fb_idx;
+ else
+ return -1;
+
+ vp8_yv12_copy_frame_ptr(&cm->yv12_fb[ref_fb_idx], sd);
+
+ return 0;
+}
+
+int vp9_set_reference_enc(VP9_PTR ptr, VP9_REFFRAME ref_frame_flag,
+ YV12_BUFFER_CONFIG *sd) {
+ VP9_COMP *cpi = (VP9_COMP *)(ptr);
+ VP9_COMMON *cm = &cpi->common;
+
+ int ref_fb_idx;
+
+ if (ref_frame_flag == VP9_LAST_FLAG)
+ ref_fb_idx = cm->lst_fb_idx;
+ else if (ref_frame_flag == VP9_GOLD_FLAG)
+ ref_fb_idx = cm->gld_fb_idx;
+ else if (ref_frame_flag == VP9_ALT_FLAG)
+ ref_fb_idx = cm->alt_fb_idx;
+ else
+ return -1;
+
+ vp8_yv12_copy_frame_ptr(sd, &cm->yv12_fb[ref_fb_idx]);
+
+ return 0;
+}
+int vp9_update_entropy(VP9_PTR comp, int update) {
+ VP9_COMP *cpi = (VP9_COMP *) comp;
+ VP9_COMMON *cm = &cpi->common;
+ cm->refresh_entropy_probs = update;
+
+ return 0;
+}
+
+
+#ifdef OUTPUT_YUV_SRC
+void vp9_write_yuv_frame(YV12_BUFFER_CONFIG *s) {
+ unsigned char *src = s->y_buffer;
+ int h = s->y_height;
+
+ do {
+ fwrite(src, s->y_width, 1, yuv_file);
+ src += s->y_stride;
+ } while (--h);
+
+ src = s->u_buffer;
+ h = s->uv_height;
+
+ do {
+ fwrite(src, s->uv_width, 1, yuv_file);
+ src += s->uv_stride;
+ } while (--h);
+
+ src = s->v_buffer;
+ h = s->uv_height;
+
+ do {
+ fwrite(src, s->uv_width, 1, yuv_file);
+ src += s->uv_stride;
+ } while (--h);
+}
+#endif
+
+#ifdef OUTPUT_YUV_REC
+void vp9_write_yuv_rec_frame(VP9_COMMON *cm) {
+ YV12_BUFFER_CONFIG *s = cm->frame_to_show;
+ unsigned char *src = s->y_buffer;
+ int h = cm->Height;
+
+ do {
+ fwrite(src, s->y_width, 1, yuv_rec_file);
+ src += s->y_stride;
+ } while (--h);
+
+ src = s->u_buffer;
+ h = (cm->Height + 1) / 2;
+
+ do {
+ fwrite(src, s->uv_width, 1, yuv_rec_file);
+ src += s->uv_stride;
+ } while (--h);
+
+ src = s->v_buffer;
+ h = (cm->Height + 1) / 2;
+
+ do {
+ fwrite(src, s->uv_width, 1, yuv_rec_file);
+ src += s->uv_stride;
+ } while (--h);
+}
+#endif
+
+static void update_alt_ref_frame_stats(VP9_COMP *cpi) {
+ VP9_COMMON *cm = &cpi->common;
+
+ // Update data structure that monitors level of reference to last GF
+ vpx_memset(cpi->gf_active_flags, 1, (cm->mb_rows * cm->mb_cols));
+ cpi->gf_active_count = cm->mb_rows * cm->mb_cols;
+
+  // This frame's refresh means subsequent frames don't refresh unless the user specifies otherwise
+ cpi->common.frames_since_golden = 0;
+
+ // Clear the alternate reference update pending flag.
+ cpi->source_alt_ref_pending = FALSE;
+
+  // Set the alternate reference frame active flag
+ cpi->source_alt_ref_active = TRUE;
+
+
+}
+static void update_golden_frame_stats(VP9_COMP *cpi) {
+ VP9_COMMON *cm = &cpi->common;
+
+ // Update the Golden frame usage counts.
+ if (cm->refresh_golden_frame) {
+ // Update data structure that monitors level of reference to last GF
+ vpx_memset(cpi->gf_active_flags, 1, (cm->mb_rows * cm->mb_cols));
+ cpi->gf_active_count = cm->mb_rows * cm->mb_cols;
+
+    // This frame's refresh means subsequent frames don't refresh unless the user specifies otherwise
+ cm->refresh_golden_frame = 0;
+ cpi->common.frames_since_golden = 0;
+
+ // if ( cm->frame_type == KEY_FRAME )
+ // {
+ cpi->recent_ref_frame_usage[INTRA_FRAME] = 1;
+ cpi->recent_ref_frame_usage[LAST_FRAME] = 1;
+ cpi->recent_ref_frame_usage[GOLDEN_FRAME] = 1;
+ cpi->recent_ref_frame_usage[ALTREF_FRAME] = 1;
+ // }
+ // else
+ // {
+    //   // Carry a portion of count over to beginning of next gf sequence
+ // cpi->recent_ref_frame_usage[INTRA_FRAME] >>= 5;
+ // cpi->recent_ref_frame_usage[LAST_FRAME] >>= 5;
+ // cpi->recent_ref_frame_usage[GOLDEN_FRAME] >>= 5;
+ // cpi->recent_ref_frame_usage[ALTREF_FRAME] >>= 5;
+ // }
+
+ // ******** Fixed Q test code only ************
+ // If we are going to use the ALT reference for the next group of frames set a flag to say so.
+ if (cpi->oxcf.fixed_q >= 0 &&
+ cpi->oxcf.play_alternate && !cpi->common.refresh_alt_ref_frame) {
+ cpi->source_alt_ref_pending = TRUE;
+ cpi->frames_till_gf_update_due = cpi->baseline_gf_interval;
+ }
+
+ if (!cpi->source_alt_ref_pending)
+ cpi->source_alt_ref_active = FALSE;
+
+ // Decrement count down till next gf
+ if (cpi->frames_till_gf_update_due > 0)
+ cpi->frames_till_gf_update_due--;
+
+ } else if (!cpi->common.refresh_alt_ref_frame) {
+ // Decrement count down till next gf
+ if (cpi->frames_till_gf_update_due > 0)
+ cpi->frames_till_gf_update_due--;
+
+ if (cpi->common.frames_till_alt_ref_frame)
+ cpi->common.frames_till_alt_ref_frame--;
+
+ cpi->common.frames_since_golden++;
+
+ if (cpi->common.frames_since_golden > 1) {
+ cpi->recent_ref_frame_usage[INTRA_FRAME] += cpi->count_mb_ref_frame_usage[INTRA_FRAME];
+ cpi->recent_ref_frame_usage[LAST_FRAME] += cpi->count_mb_ref_frame_usage[LAST_FRAME];
+ cpi->recent_ref_frame_usage[GOLDEN_FRAME] += cpi->count_mb_ref_frame_usage[GOLDEN_FRAME];
+ cpi->recent_ref_frame_usage[ALTREF_FRAME] += cpi->count_mb_ref_frame_usage[ALTREF_FRAME];
+ }
+ }
+}
+
+static int find_fp_qindex() {
+ int i;
+
+ for (i = 0; i < QINDEX_RANGE; i++) {
+ if (vp9_convert_qindex_to_q(i) >= 30.0) {
+ break;
+ }
+ }
+
+ if (i == QINDEX_RANGE)
+ i--;
+
+ return i;
+}
+
+static void Pass1Encode(VP9_COMP *cpi, unsigned long *size, unsigned char *dest, unsigned int *frame_flags) {
+ (void) size;
+ (void) dest;
+ (void) frame_flags;
+
+
+ vp9_set_quantizer(cpi, find_fp_qindex());
+ vp9_first_pass(cpi);
+}
+
+#define WRITE_RECON_BUFFER 0
+#if WRITE_RECON_BUFFER
+void write_cx_frame_to_file(YV12_BUFFER_CONFIG *frame, int this_frame) {
+
+ // write the frame
+ FILE *yframe;
+ int i;
+ char filename[255];
+
+ sprintf(filename, "cx\\y%04d.raw", this_frame);
+ yframe = fopen(filename, "wb");
+
+ for (i = 0; i < frame->y_height; i++)
+ fwrite(frame->y_buffer + i * frame->y_stride,
+ frame->y_width, 1, yframe);
+
+ fclose(yframe);
+ sprintf(filename, "cx\\u%04d.raw", this_frame);
+ yframe = fopen(filename, "wb");
+
+ for (i = 0; i < frame->uv_height; i++)
+ fwrite(frame->u_buffer + i * frame->uv_stride,
+ frame->uv_width, 1, yframe);
+
+ fclose(yframe);
+ sprintf(filename, "cx\\v%04d.raw", this_frame);
+ yframe = fopen(filename, "wb");
+
+ for (i = 0; i < frame->uv_height; i++)
+ fwrite(frame->v_buffer + i * frame->uv_stride,
+ frame->uv_width, 1, yframe);
+
+ fclose(yframe);
+}
+#endif
+
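+// Return the proportion of interior luma pixels whose horizontal or
+// vertical Sobel gradient magnitude exceeds EDGE_THRESH.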
+static double compute_edge_pixel_proportion(YV12_BUFFER_CONFIG *frame) {
+#define EDGE_THRESH 128
+ int i, j;
+ int num_edge_pels = 0;
+ int num_pels = (frame->y_height - 2) * (frame->y_width - 2);
+ unsigned char *prev = frame->y_buffer + 1;
+ unsigned char *curr = frame->y_buffer + 1 + frame->y_stride;
+ unsigned char *next = frame->y_buffer + 1 + 2 * frame->y_stride;
+ for (i = 1; i < frame->y_height - 1; i++) {
+ for (j = 1; j < frame->y_width - 1; j++) {
+ /* Sobel hor and ver gradients */
+ int v = 2 * (curr[1] - curr[-1]) + (prev[1] - prev[-1]) + (next[1] - next[-1]);
+ int h = 2 * (prev[0] - next[0]) + (prev[1] - next[1]) + (prev[-1] - next[-1]);
+ h = (h < 0 ? -h : h);
+ v = (v < 0 ? -v : v);
+ if (h > EDGE_THRESH || v > EDGE_THRESH) num_edge_pels++;
+ curr++;
+ prev++;
+ next++;
+ }
+ curr += frame->y_stride - frame->y_width + 2;
+ prev += frame->y_stride - frame->y_width + 2;
+ next += frame->y_stride - frame->y_width + 2;
+ }
+ return (double)num_edge_pels / (double)num_pels;
+}
+
+// Function to test for conditions that indicate we should loop
+// back and recode a frame.
+static BOOL recode_loop_test(VP9_COMP *cpi,
+ int high_limit, int low_limit,
+ int q, int maxq, int minq) {
+ BOOL force_recode = FALSE;
+ VP9_COMMON *cm = &cpi->common;
+
+ // Is frame recode allowed at all?
+ // Yes if either recode mode 1 is selected, or mode 2 is selected
+ // and the frame is a key frame, golden frame or alt_ref_frame
+ if ((cpi->sf.recode_loop == 1) ||
+ ((cpi->sf.recode_loop == 2) &&
+ ((cm->frame_type == KEY_FRAME) ||
+ cm->refresh_golden_frame ||
+ cm->refresh_alt_ref_frame))) {
+ // General over and under shoot tests
+ if (((cpi->projected_frame_size > high_limit) && (q < maxq)) ||
+ ((cpi->projected_frame_size < low_limit) && (q > minq))) {
+ force_recode = TRUE;
+ }
+ // Special Constrained quality tests
+ else if (cpi->oxcf.end_usage == USAGE_CONSTRAINED_QUALITY) {
+ // Undershoot and below auto cq level
+ if ((q > cpi->cq_target_quality) &&
+ (cpi->projected_frame_size <
+ ((cpi->this_frame_target * 7) >> 3))) {
+ force_recode = TRUE;
+ }
+ // Severe undershoot and between auto and user cq level
+ else if ((q > cpi->oxcf.cq_level) &&
+ (cpi->projected_frame_size < cpi->min_frame_bandwidth) &&
+ (cpi->active_best_quality > cpi->oxcf.cq_level)) {
+ force_recode = TRUE;
+ cpi->active_best_quality = cpi->oxcf.cq_level;
+ }
+ }
+ }
+
+ return force_recode;
+}
+
+static void update_reference_frames(VP9_COMMON *cm) {
+ YV12_BUFFER_CONFIG *yv12_fb = cm->yv12_fb;
+
+ // At this point the new frame has been encoded.
+ // If any buffer copy / swapping is signaled it should be done here.
+
+ if (cm->frame_type == KEY_FRAME) {
+ yv12_fb[cm->new_fb_idx].flags |= VP9_GOLD_FLAG | VP9_ALT_FLAG;
+
+ yv12_fb[cm->gld_fb_idx].flags &= ~VP9_GOLD_FLAG;
+ yv12_fb[cm->alt_fb_idx].flags &= ~VP9_ALT_FLAG;
+
+ cm->alt_fb_idx = cm->gld_fb_idx = cm->new_fb_idx;
+ } else { /* For non key frames */
+ if (cm->refresh_alt_ref_frame) {
+ assert(!cm->copy_buffer_to_arf);
+
+ cm->yv12_fb[cm->new_fb_idx].flags |= VP9_ALT_FLAG;
+ cm->yv12_fb[cm->alt_fb_idx].flags &= ~VP9_ALT_FLAG;
+ cm->alt_fb_idx = cm->new_fb_idx;
+ } else if (cm->copy_buffer_to_arf) {
+ assert(!(cm->copy_buffer_to_arf & ~0x3));
+
+ if (cm->copy_buffer_to_arf == 1) {
+ if (cm->alt_fb_idx != cm->lst_fb_idx) {
+ yv12_fb[cm->lst_fb_idx].flags |= VP9_ALT_FLAG;
+ yv12_fb[cm->alt_fb_idx].flags &= ~VP9_ALT_FLAG;
+ cm->alt_fb_idx = cm->lst_fb_idx;
+ }
+ } else { /* if (cm->copy_buffer_to_arf == 2) */
+ if (cm->alt_fb_idx != cm->gld_fb_idx) {
+ yv12_fb[cm->gld_fb_idx].flags |= VP9_ALT_FLAG;
+ yv12_fb[cm->alt_fb_idx].flags &= ~VP9_ALT_FLAG;
+ cm->alt_fb_idx = cm->gld_fb_idx;
+ }
+ }
+ }
+
+ if (cm->refresh_golden_frame) {
+ assert(!cm->copy_buffer_to_gf);
+
+ cm->yv12_fb[cm->new_fb_idx].flags |= VP9_GOLD_FLAG;
+ cm->yv12_fb[cm->gld_fb_idx].flags &= ~VP9_GOLD_FLAG;
+ cm->gld_fb_idx = cm->new_fb_idx;
+ } else if (cm->copy_buffer_to_gf) {
+ assert(!(cm->copy_buffer_to_gf & ~0x3));
+
+ if (cm->copy_buffer_to_gf == 1) {
+ if (cm->gld_fb_idx != cm->lst_fb_idx) {
+ yv12_fb[cm->lst_fb_idx].flags |= VP9_GOLD_FLAG;
+ yv12_fb[cm->gld_fb_idx].flags &= ~VP9_GOLD_FLAG;
+ cm->gld_fb_idx = cm->lst_fb_idx;
+ }
+ } else { /* if (cm->copy_buffer_to_gf == 2) */
+ if (cm->alt_fb_idx != cm->gld_fb_idx) {
+ yv12_fb[cm->alt_fb_idx].flags |= VP9_GOLD_FLAG;
+ yv12_fb[cm->gld_fb_idx].flags &= ~VP9_GOLD_FLAG;
+ cm->gld_fb_idx = cm->alt_fb_idx;
+ }
+ }
+ }
+ }
+
+ if (cm->refresh_last_frame) {
+ cm->yv12_fb[cm->new_fb_idx].flags |= VP9_LAST_FLAG;
+ cm->yv12_fb[cm->lst_fb_idx].flags &= ~VP9_LAST_FLAG;
+ cm->lst_fb_idx = cm->new_fb_idx;
+ }
+}
+
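+// Choose the loop filter strength for the frame (zero if filtering is
+// disabled or lossless coding is active), apply the filter and extend the
+// borders of the reconstructed frame.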
+static void loopfilter_frame(VP9_COMP *cpi, VP9_COMMON *cm) {
+ if (cm->no_lpf) {
+ cm->filter_level = 0;
+ }
+#if CONFIG_LOSSLESS
+ else if (cpi->oxcf.lossless) {
+ cm->filter_level = 0;
+ }
+#endif
+ else {
+ struct vpx_usec_timer timer;
+
+ vp9_clear_system_state();
+
+ vpx_usec_timer_start(&timer);
+ if (cpi->sf.auto_filter == 0)
+ vp9_pick_filter_level_fast(cpi->Source, cpi);
+ else
+ vp9_pick_filter_level(cpi->Source, cpi);
+
+ vpx_usec_timer_mark(&timer);
+ cpi->time_pick_lpf += vpx_usec_timer_elapsed(&timer);
+ }
+
+ if (cm->filter_level > 0) {
+ vp9_set_alt_lf_level(cpi, cm->filter_level);
+ vp9_loop_filter_frame(cm, &cpi->mb.e_mbd);
+ }
+
+ vp8_yv12_extend_frame_borders_ptr(cm->frame_to_show);
+
+}
+
+#if CONFIG_PRED_FILTER
+void select_pred_filter_mode(VP9_COMP *cpi) {
+ VP9_COMMON *cm = &cpi->common;
+
+ int prob_pred_filter_off = cm->prob_pred_filter_off;
+
+ // Force filter on/off if probability is extreme
+ if (prob_pred_filter_off >= 255 * 0.95)
+ cm->pred_filter_mode = 0; // Off at the frame level
+ else if (prob_pred_filter_off <= 255 * 0.05)
+ cm->pred_filter_mode = 1; // On at the frame level
+ else
+ cm->pred_filter_mode = 2; // Selectable at the MB level
+}
+
+void update_pred_filt_prob(VP9_COMP *cpi) {
+ VP9_COMMON *cm = &cpi->common;
+ int prob_pred_filter_off;
+
+ // Based on the selection in the previous frame determine what mode
+ // to use for the current frame and work out the signaling probability
+ if (cpi->pred_filter_on_count + cpi->pred_filter_off_count) {
+ prob_pred_filter_off = cpi->pred_filter_off_count * 256 /
+ (cpi->pred_filter_on_count + cpi->pred_filter_off_count);
+
+ if (prob_pred_filter_off < 1)
+ prob_pred_filter_off = 1;
+
+ if (prob_pred_filter_off > 255)
+ prob_pred_filter_off = 255;
+
+ cm->prob_pred_filter_off = prob_pred_filter_off;
+ } else
+ cm->prob_pred_filter_off = 128;
+ /*
+ {
+ FILE *fp = fopen("filt_use.txt", "a");
+ fprintf (fp, "%d %d prob=%d\n", cpi->pred_filter_off_count,
+ cpi->pred_filter_on_count, cm->prob_pred_filter_off);
+ fclose(fp);
+ }
+ */
+}
+#endif
+
+static void encode_frame_to_data_rate
+(
+ VP9_COMP *cpi,
+ unsigned long *size,
+ unsigned char *dest,
+ unsigned int *frame_flags
+) {
+ VP9_COMMON *cm = &cpi->common;
+ MACROBLOCKD *xd = &cpi->mb.e_mbd;
+
+ int Q;
+ int frame_over_shoot_limit;
+ int frame_under_shoot_limit;
+
+ int Loop = FALSE;
+ int loop_count;
+ int this_q;
+ int last_zbin_oq;
+
+ int q_low;
+ int q_high;
+ int zbin_oq_high;
+ int zbin_oq_low = 0;
+
+ int top_index;
+ int bottom_index;
+ int active_worst_qchanged = FALSE;
+
+ int overshoot_seen = FALSE;
+ int undershoot_seen = FALSE;
+
+ int loop_size_estimate = 0;
+
+ SPEED_FEATURES *sf = &cpi->sf;
+#if RESET_FOREACH_FILTER
+ int q_low0;
+ int q_high0;
+ int zbin_oq_high0;
+ int zbin_oq_low0 = 0;
+ int Q0;
+ int last_zbin_oq0;
+ int active_best_quality0;
+ int active_worst_quality0;
+ double rate_correction_factor0;
+ double gf_rate_correction_factor0;
+#endif
+
+ /* list of filters to search over */
+ int mcomp_filters_to_search[] = {
+ EIGHTTAP, EIGHTTAP_SHARP, SIXTAP, SWITCHABLE
+ };
+ int mcomp_filters = sizeof(mcomp_filters_to_search) /
+ sizeof(*mcomp_filters_to_search);
+ int mcomp_filter_index = 0;
+ INT64 mcomp_filter_cost[4];
+
+ // Clear down mmx registers to allow floating point in what follows
+ vp9_clear_system_state();
+
+
+ // For an alt ref frame in 2 pass we skip the call to the second
+ // pass function that sets the target bandwidth so must set it here
+ if (cpi->common.refresh_alt_ref_frame) {
+ cpi->per_frame_bandwidth = cpi->twopass.gf_bits; // Per frame bit target for the alt ref frame
+ cpi->target_bandwidth = cpi->twopass.gf_bits * cpi->output_frame_rate; // per second target bitrate
+ }
+
+ // By default, turn off buffer-to-buffer copying
+ cm->copy_buffer_to_gf = 0;
+ cm->copy_buffer_to_arf = 0;
+
+ // Clear zbin over-quant value and mode boost values.
+ cpi->zbin_over_quant = 0;
+ cpi->zbin_mode_boost = 0;
+
+ // Enable or disable mode based tweaking of the zbin.
+ // For two pass it is only used where GF/ARF prediction quality
+ // is above a threshold.
+ cpi->zbin_mode_boost = 0;
+#if CONFIG_LOSSLESS
+ cpi->zbin_mode_boost_enabled = FALSE;
+#else
+ cpi->zbin_mode_boost_enabled = TRUE;
+#endif
+ if (cpi->gfu_boost <= 400) {
+ cpi->zbin_mode_boost_enabled = FALSE;
+ }
+
+ // Current default encoder behaviour for the altref sign bias
+ if (cpi->source_alt_ref_active)
+ cpi->common.ref_frame_sign_bias[ALTREF_FRAME] = 1;
+ else
+ cpi->common.ref_frame_sign_bias[ALTREF_FRAME] = 0;
+
+ // Check to see if a key frame is signalled
+ // For two pass with auto key frame enabled cm->frame_type may already be set, but not for one pass.
+ if ((cm->current_video_frame == 0) ||
+ (cm->frame_flags & FRAMEFLAGS_KEY) ||
+ (cpi->oxcf.auto_key && (cpi->frames_since_key % cpi->key_frame_frequency == 0))) {
+ // Key frame from VFW/auto-keyframe/first frame
+ cm->frame_type = KEY_FRAME;
+ }
+
+ // Set default state for segment based loop filter update flags
+ xd->mode_ref_lf_delta_update = 0;
+
+ // Set various flags etc to special state if it is a key frame
+ if (cm->frame_type == KEY_FRAME) {
+ int i;
+
+ // Reset the loop filter deltas and segmentation map
+ setup_features(cpi);
+
+ // If segmentation is enabled force a map update for key frames
+ if (xd->segmentation_enabled) {
+ xd->update_mb_segmentation_map = 1;
+ xd->update_mb_segmentation_data = 1;
+ }
+
+ // The alternate reference frame cannot be active for a key frame
+ cpi->source_alt_ref_active = FALSE;
+
+ // Reset the RD threshold multipliers to default of * 1 (128)
+ for (i = 0; i < MAX_MODES; i++) {
+ cpi->rd_thresh_mult[i] = 128;
+ }
+ }
+
+ // Test code for new segment features
+ init_seg_features(cpi);
+
+ // Decide how big to make the frame
+ vp9_pick_frame_size(cpi);
+
+ vp9_clear_system_state();
+
+ // Set an active best quality and if necessary active worst quality
+ Q = cpi->active_worst_quality;
+
+ if (cm->frame_type == KEY_FRAME) {
+ int high = 2000;
+ int low = 400;
+
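+ // Derive active_best_quality from the key frame boost: a high boost uses
+ // the low motion minimum Q table, a low boost the high motion table, and
+ // intermediate values are interpolated linearly between the two.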
+ if (cpi->kf_boost > high)
+ cpi->active_best_quality = kf_low_motion_minq[Q];
+ else if (cpi->kf_boost < low)
+ cpi->active_best_quality = kf_high_motion_minq[Q];
+ else {
+ int gap = high - low;
+ int offset = high - cpi->kf_boost;
+ int qdiff = kf_high_motion_minq[Q] - kf_low_motion_minq[Q];
+ int adjustment = ((offset * qdiff) + (gap >> 1)) / gap;
+
+ cpi->active_best_quality = kf_low_motion_minq[Q] + adjustment;
+ }
+
+ // Make an adjustment based on the % of the frame that is static.
+ // The main impact of this is at lower Q to prevent overly large key
+ // frames unless a lot of the image is static.
+ if (cpi->kf_zeromotion_pct < 64)
+ cpi->active_best_quality += 4 - (cpi->kf_zeromotion_pct >> 4);
+
+ // Special case for key frames forced because we have reached
+ // the maximum key frame interval. Here force the Q to a range
+ // based on the ambient Q to reduce the risk of popping
+ if (cpi->this_key_frame_forced) {
+ int delta_qindex;
+ int qindex = cpi->last_boosted_qindex;
+
+ delta_qindex = compute_qdelta(cpi, qindex,
+ (qindex * 0.75));
+
+ cpi->active_best_quality = qindex + delta_qindex;
+ if (cpi->active_best_quality < cpi->best_quality)
+ cpi->active_best_quality = cpi->best_quality;
+ }
+ }
+
+ else if (cm->refresh_golden_frame || cpi->common.refresh_alt_ref_frame) {
+ int high = 2000;
+ int low = 400;
+
+ // Use the lower of cpi->active_worst_quality and recent
+ // average Q as basis for GF/ARF Q limit unless last frame was
+ // a key frame.
+ if ((cpi->frames_since_key > 1) &&
+ (cpi->avg_frame_qindex < cpi->active_worst_quality)) {
+ Q = cpi->avg_frame_qindex;
+ }
+
+ // For constrained quality don't allow Q less than the cq level
+ if ((cpi->oxcf.end_usage == USAGE_CONSTRAINED_QUALITY) &&
+ (Q < cpi->cq_target_quality)) {
+ Q = cpi->cq_target_quality;
+ }
+
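+ // As for key frames, interpolate between the low and high motion minimum Q
+ // tables according to the GF/ARF boost (gfu_boost).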
+ if (cpi->gfu_boost > high)
+ cpi->active_best_quality = gf_low_motion_minq[Q];
+ else if (cpi->gfu_boost < low)
+ cpi->active_best_quality = gf_high_motion_minq[Q];
+ else {
+ int gap = high - low;
+ int offset = high - cpi->gfu_boost;
+ int qdiff = gf_high_motion_minq[Q] - gf_low_motion_minq[Q];
+ int adjustment = ((offset * qdiff) + (gap >> 1)) / gap;
+
+ cpi->active_best_quality = gf_low_motion_minq[Q] + adjustment;
+ }
+
+ // Constrained quality use slightly lower active best.
+ if (cpi->oxcf.end_usage == USAGE_CONSTRAINED_QUALITY) {
+ cpi->active_best_quality =
+ cpi->active_best_quality * 15 / 16;
+ }
+ } else {
+ cpi->active_best_quality = inter_minq[Q];
+
+ // For the constant/constrained quality mode we don't want
+ // q to fall below the cq level.
+ if ((cpi->oxcf.end_usage == USAGE_CONSTRAINED_QUALITY) &&
+ (cpi->active_best_quality < cpi->cq_target_quality)) {
+ // If we are strongly undershooting the target rate in the last
+ // frames then use the user passed in cq value not the auto
+ // cq value.
+ if (cpi->rolling_actual_bits < cpi->min_frame_bandwidth)
+ cpi->active_best_quality = cpi->oxcf.cq_level;
+ else
+ cpi->active_best_quality = cpi->cq_target_quality;
+ }
+ }
+
+ // Clip the active best and worst quality values to limits
+ if (cpi->active_worst_quality > cpi->worst_quality)
+ cpi->active_worst_quality = cpi->worst_quality;
+
+ if (cpi->active_best_quality < cpi->best_quality)
+ cpi->active_best_quality = cpi->best_quality;
+
+ if (cpi->active_best_quality > cpi->worst_quality)
+ cpi->active_best_quality = cpi->worst_quality;
+
+ if (cpi->active_worst_quality < cpi->active_best_quality)
+ cpi->active_worst_quality = cpi->active_best_quality;
+
+ // Special case code to try and match quality with forced key frames
+ if ((cm->frame_type == KEY_FRAME) && cpi->this_key_frame_forced) {
+ Q = cpi->last_boosted_qindex;
+ } else {
+ // Determine initial Q to try
+ Q = vp9_regulate_q(cpi, cpi->this_frame_target);
+ }
+ last_zbin_oq = cpi->zbin_over_quant;
+
+ // Set highest allowed value for Zbin over quant
+ if (cm->frame_type == KEY_FRAME)
+ zbin_oq_high = 0; // ZBIN_OQ_MAX/16
+ else if (cm->refresh_alt_ref_frame || (cm->refresh_golden_frame && !cpi->source_alt_ref_active))
+ zbin_oq_high = 16;
+ else
+ zbin_oq_high = ZBIN_OQ_MAX;
+
+ vp9_compute_frame_size_bounds(cpi, &frame_under_shoot_limit,
+ &frame_over_shoot_limit);
+
+ // Limit Q range for the adaptive loop.
+ bottom_index = cpi->active_best_quality;
+ top_index = cpi->active_worst_quality;
+ q_low = cpi->active_best_quality;
+ q_high = cpi->active_worst_quality;
+
+ loop_count = 0;
+
+ if (cm->frame_type != KEY_FRAME) {
+ /* TODO: Decide this more intelligently */
+ if (sf->search_best_filter) {
+ cm->mcomp_filter_type = mcomp_filters_to_search[0];
+ mcomp_filter_index = 0;
+ } else {
+ cm->mcomp_filter_type = DEFAULT_INTERP_FILTER;
+ }
+ /* TODO: Decide this more intelligently */
+ xd->allow_high_precision_mv = (Q < HIGH_PRECISION_MV_QTHRESH);
+ }
+
+#if CONFIG_POSTPROC
+
+ if (cpi->oxcf.noise_sensitivity > 0) {
+ unsigned char *src;
+ int l = 0;
+
+ switch (cpi->oxcf.noise_sensitivity) {
+ case 1:
+ l = 20;
+ break;
+ case 2:
+ l = 40;
+ break;
+ case 3:
+ l = 60;
+ break;
+ case 4:
+
+ case 5:
+ l = 100;
+ break;
+ case 6:
+ l = 150;
+ break;
+ }
+
+
+ if (cm->frame_type == KEY_FRAME) {
+ vp9_de_noise(cpi->Source, cpi->Source, l, 1, 0, RTCD(postproc));
+ } else {
+ vp9_de_noise(cpi->Source, cpi->Source, l, 1, 0, RTCD(postproc));
+
+ src = cpi->Source->y_buffer;
+
+ if (cpi->Source->y_stride < 0) {
+ src += cpi->Source->y_stride * (cpi->Source->y_height - 1);
+ }
+ }
+ }
+
+#endif
+
+#ifdef OUTPUT_YUV_SRC
+ vp9_write_yuv_frame(cpi->Source);
+#endif
+
+#if RESET_FOREACH_FILTER
+ if (sf->search_best_filter) {
+ q_low0 = q_low;
+ q_high0 = q_high;
+ Q0 = Q;
+ zbin_oq_low0 = zbin_oq_low;
+ zbin_oq_high0 = zbin_oq_high;
+ last_zbin_oq0 = last_zbin_oq;
+ rate_correction_factor0 = cpi->rate_correction_factor;
+ gf_rate_correction_factor0 = cpi->gf_rate_correction_factor;
+ active_best_quality0 = cpi->active_best_quality;
+ active_worst_quality0 = cpi->active_worst_quality;
+ }
+#endif
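+ // Main recode loop: quantize and encode the frame, do a dummy bitstream
+ // pack to estimate the coded size, and if that size is outside the target
+ // bounds adjust Q (and the zbin over-quant value) and encode again.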
+ do {
+ vp9_clear_system_state(); // __asm emms;
+
+ vp9_set_quantizer(cpi, Q);
+ this_q = Q;
+
+ if (loop_count == 0) {
+
+ // Set up skip probabilities for costing in the mode/MV decision
+ if (cpi->common.mb_no_coeff_skip) {
+ int k;
+ for (k = 0; k < MBSKIP_CONTEXTS; k++)
+ cm->mbskip_pred_probs[k] = cpi->base_skip_false_prob[Q][k];
+
+ if (cm->frame_type != KEY_FRAME) {
+ if (cpi->common.refresh_alt_ref_frame) {
+ for (k = 0; k < MBSKIP_CONTEXTS; k++) {
+ if (cpi->last_skip_false_probs[2][k] != 0)
+ cm->mbskip_pred_probs[k] = cpi->last_skip_false_probs[2][k];
+ }
+ } else if (cpi->common.refresh_golden_frame) {
+ for (k = 0; k < MBSKIP_CONTEXTS; k++) {
+ if (cpi->last_skip_false_probs[1][k] != 0)
+ cm->mbskip_pred_probs[k] = cpi->last_skip_false_probs[1][k];
+ }
+ } else {
+ int k;
+ for (k = 0; k < MBSKIP_CONTEXTS; k++) {
+ if (cpi->last_skip_false_probs[0][k] != 0)
+ cm->mbskip_pred_probs[k] = cpi->last_skip_false_probs[0][k];
+ }
+ }
+
+ // As this is for a cost estimate, make sure it does not
+ // get extreme either way
+ {
+ int k;
+ for (k = 0; k < MBSKIP_CONTEXTS; ++k) {
+ if (cm->mbskip_pred_probs[k] < 5)
+ cm->mbskip_pred_probs[k] = 5;
+
+ if (cm->mbskip_pred_probs[k] > 250)
+ cm->mbskip_pred_probs[k] = 250;
+
+ if (cpi->is_src_frame_alt_ref)
+ cm->mbskip_pred_probs[k] = 1;
+ }
+ }
+ }
+ }
+
+ // Set up entropy depending on frame type.
+ if (cm->frame_type == KEY_FRAME)
+ vp9_setup_key_frame(cpi);
+ else
+ vp9_setup_inter_frame(cpi);
+ }
+
+ // Transform / motion compensation: build the reconstruction frame
+
+ vp9_encode_frame(cpi);
+
+ // Update the skip mb flag probabilities based on the distribution
+ // seen in the last encoder iteration.
+ update_base_skip_probs(cpi);
+
+ vp9_clear_system_state(); // __asm emms;
+
+#if CONFIG_PRED_FILTER
+ // Update prediction filter on/off probability based on
+ // selection made for the current frame
+ if (cm->frame_type != KEY_FRAME)
+ update_pred_filt_prob(cpi);
+#endif
+
+ // Dummy pack of the bitstream using up to date stats to get an
+ // accurate estimate of output frame size to determine if we need
+ // to recode.
+ vp9_save_coding_context(cpi);
+ cpi->dummy_packing = 1;
+ vp9_pack_bitstream(cpi, dest, size);
+ cpi->projected_frame_size = (*size) << 3;
+ vp9_restore_coding_context(cpi);
+
+ if (frame_over_shoot_limit == 0)
+ frame_over_shoot_limit = 1;
+ active_worst_qchanged = FALSE;
+
+ // Special case handling for forced key frames
+ if ((cm->frame_type == KEY_FRAME) && cpi->this_key_frame_forced) {
+ int last_q = Q;
+ int kf_err = vp9_calc_ss_err(cpi->Source,
+ &cm->yv12_fb[cm->new_fb_idx]);
+
+ int high_err_target = cpi->ambient_err;
+ int low_err_target = (cpi->ambient_err >> 1);
+
+ // Prevent possible divide by zero error below for perfect KF
+ kf_err += (!kf_err);
+
+ // The key frame is not good enough or we can afford
+ // to make it better without undue risk of popping.
+ if (((kf_err > high_err_target) &&
+ (cpi->projected_frame_size <= frame_over_shoot_limit)) ||
+ ((kf_err > low_err_target) &&
+ (cpi->projected_frame_size <= frame_under_shoot_limit))) {
+ // Lower q_high
+ q_high = (Q > q_low) ? (Q - 1) : q_low;
+
+ // Adjust Q
+ Q = (Q * high_err_target) / kf_err;
+ if (Q < ((q_high + q_low) >> 1))
+ Q = (q_high + q_low) >> 1;
+ }
+ // The key frame is much better than the previous frame
+ else if ((kf_err < low_err_target) &&
+ (cpi->projected_frame_size >= frame_under_shoot_limit)) {
+ // Raise q_low
+ q_low = (Q < q_high) ? (Q + 1) : q_high;
+
+ // Adjust Q
+ Q = (Q * low_err_target) / kf_err;
+ if (Q > ((q_high + q_low + 1) >> 1))
+ Q = (q_high + q_low + 1) >> 1;
+ }
+
+ // Clamp Q to upper and lower limits:
+ if (Q > q_high)
+ Q = q_high;
+ else if (Q < q_low)
+ Q = q_low;
+
+ Loop = ((Q != last_q)) ? TRUE : FALSE;
+ }
+
+ // Is the projected frame size out of range and are we allowed to attempt a recode?
+ else if (recode_loop_test(cpi,
+ frame_over_shoot_limit, frame_under_shoot_limit,
+ Q, top_index, bottom_index)) {
+ int last_q = Q;
+ int Retries = 0;
+
+ // Frame size out of permitted range:
+ // Update correction factor & compute new Q to try...
+
+ // Frame is too large
+ if (cpi->projected_frame_size > cpi->this_frame_target) {
+ q_low = (Q < q_high) ? (Q + 1) : q_high; // Raise q_low to at least the current value
+
+ if (cpi->zbin_over_quant > 0) // If we are using over quant do the same for zbin_oq_low
+ zbin_oq_low = (cpi->zbin_over_quant < zbin_oq_high) ? (cpi->zbin_over_quant + 1) : zbin_oq_high;
+
+ if (undershoot_seen || (loop_count > 1)) {
+ // Update rate_correction_factor unless cpi->active_worst_quality has changed.
+ if (!active_worst_qchanged)
+ vp9_update_rate_correction_factors(cpi, 1);
+
+ Q = (q_high + q_low + 1) / 2;
+
+ // Adjust cpi->zbin_over_quant (only allowed when Q is max)
+ if (Q < MAXQ)
+ cpi->zbin_over_quant = 0;
+ else {
+ zbin_oq_low = (cpi->zbin_over_quant < zbin_oq_high) ? (cpi->zbin_over_quant + 1) : zbin_oq_high;
+ cpi->zbin_over_quant = (zbin_oq_high + zbin_oq_low) / 2;
+ }
+ } else {
+ // Update rate_correction_factor unless cpi->active_worst_quality has changed.
+ if (!active_worst_qchanged)
+ vp9_update_rate_correction_factors(cpi, 0);
+
+ Q = vp9_regulate_q(cpi, cpi->this_frame_target);
+
+ while (((Q < q_low) || (cpi->zbin_over_quant < zbin_oq_low)) && (Retries < 10)) {
+ vp9_update_rate_correction_factors(cpi, 0);
+ Q = vp9_regulate_q(cpi, cpi->this_frame_target);
+ Retries++;
+ }
+ }
+
+ overshoot_seen = TRUE;
+ }
+ // Frame is too small
+ else {
+ if (cpi->zbin_over_quant == 0)
+ q_high = (Q > q_low) ? (Q - 1) : q_low; // Lower q_high if not using over quant
+ else // else lower zbin_oq_high
+ zbin_oq_high = (cpi->zbin_over_quant > zbin_oq_low) ? (cpi->zbin_over_quant - 1) : zbin_oq_low;
+
+ if (overshoot_seen || (loop_count > 1)) {
+ // Update rate_correction_factor unless cpi->active_worst_quality has changed.
+ if (!active_worst_qchanged)
+ vp9_update_rate_correction_factors(cpi, 1);
+
+ Q = (q_high + q_low) / 2;
+
+ // Adjust cpi->zbin_over_quant (only allowed when Q is max)
+ if (Q < MAXQ)
+ cpi->zbin_over_quant = 0;
+ else
+ cpi->zbin_over_quant = (zbin_oq_high + zbin_oq_low) / 2;
+ } else {
+ // Update rate_correction_factor unless cpi->active_worst_quality has changed.
+ if (!active_worst_qchanged)
+ vp9_update_rate_correction_factors(cpi, 0);
+
+ Q = vp9_regulate_q(cpi, cpi->this_frame_target);
+
+ // Special case reset for qlow for constrained quality.
+ // This should only trigger where there is very substantial
+ // undershoot on a frame and the auto cq level is above
+ // the user passed in value.
+ if ((cpi->oxcf.end_usage == USAGE_CONSTRAINED_QUALITY) &&
+ (Q < q_low)) {
+ q_low = Q;
+ }
+
+ while (((Q > q_high) || (cpi->zbin_over_quant > zbin_oq_high)) && (Retries < 10)) {
+ vp9_update_rate_correction_factors(cpi, 0);
+ Q = vp9_regulate_q(cpi, cpi->this_frame_target);
+ Retries++;
+ }
+ }
+
+ undershoot_seen = TRUE;
+ }
+
+ // Clamp Q to upper and lower limits:
+ if (Q > q_high)
+ Q = q_high;
+ else if (Q < q_low)
+ Q = q_low;
+
+ // Clamp cpi->zbin_over_quant
+ cpi->zbin_over_quant = (cpi->zbin_over_quant < zbin_oq_low) ?
+ zbin_oq_low : (cpi->zbin_over_quant > zbin_oq_high) ?
+ zbin_oq_high : cpi->zbin_over_quant;
+
+ // Loop = ((Q != last_q) || (last_zbin_oq != cpi->zbin_over_quant)) ? TRUE : FALSE;
+ Loop = ((Q != last_q)) ? TRUE : FALSE;
+ last_zbin_oq = cpi->zbin_over_quant;
+ } else
+ Loop = FALSE;
+
+ if (cpi->is_src_frame_alt_ref)
+ Loop = FALSE;
+
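+ // When the SWITCHABLE interpolation filter is in use, check whether the
+ // per-MB selections are dominated by a single filter; if so, switch to
+ // signalling that filter at the frame level and run the loop again.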
+ if (cm->frame_type != KEY_FRAME &&
+ !sf->search_best_filter &&
+ cm->mcomp_filter_type == SWITCHABLE) {
+ int interp_factor = Q / 3; /* denominator is 256 */
+ int count[VP9_SWITCHABLE_FILTERS];
+ int tot_count = 0, c = 0, thr;
+ int i, j;
+ for (i = 0; i < VP9_SWITCHABLE_FILTERS; ++i) {
+ count[i] = 0;
+ for (j = 0; j <= VP9_SWITCHABLE_FILTERS; ++j) {
+ count[i] += cpi->switchable_interp_count[j][i];
+ }
+ tot_count += count[i];
+ }
+
+ thr = ((tot_count * interp_factor + 128) >> 8);
+ for (i = 0; i < VP9_SWITCHABLE_FILTERS; ++i) {
+ c += (count[i] >= thr);
+ }
+ if (c == 1) {
+ /* Mostly one filter is used. So set the filter at frame level */
+ for (i = 0; i < VP9_SWITCHABLE_FILTERS; ++i) {
+ if (count[i]) {
+ cm->mcomp_filter_type = vp9_switchable_interp[i];
+ Loop = TRUE; /* Make sure to loop since the filter changed */
+ break;
+ }
+ }
+ }
+ }
+
+ if (Loop == FALSE && cm->frame_type != KEY_FRAME && sf->search_best_filter) {
+ if (mcomp_filter_index < mcomp_filters) {
+ INT64 err = vp9_calc_ss_err(cpi->Source,
+ &cm->yv12_fb[cm->new_fb_idx]);
+ INT64 rate = cpi->projected_frame_size << 8;
+ mcomp_filter_cost[mcomp_filter_index] =
+ (RDCOST(cpi->RDMULT, cpi->RDDIV, rate, err));
+ mcomp_filter_index++;
+ if (mcomp_filter_index < mcomp_filters) {
+ cm->mcomp_filter_type = mcomp_filters_to_search[mcomp_filter_index];
+ loop_count = -1;
+ Loop = TRUE;
+ } else {
+ int f;
+ INT64 best_cost = mcomp_filter_cost[0];
+ int mcomp_best_filter = mcomp_filters_to_search[0];
+ for (f = 1; f < mcomp_filters; f++) {
+ if (mcomp_filter_cost[f] < best_cost) {
+ mcomp_best_filter = mcomp_filters_to_search[f];
+ best_cost = mcomp_filter_cost[f];
+ }
+ }
+ if (mcomp_best_filter != mcomp_filters_to_search[mcomp_filters - 1]) {
+ loop_count = -1;
+ Loop = TRUE;
+ cm->mcomp_filter_type = mcomp_best_filter;
+ }
+ /*
+ printf(" best filter = %d, ( ", mcomp_best_filter);
+ for (f=0;f<mcomp_filters; f++) printf("%d ", mcomp_filter_cost[f]);
+ printf(")\n");
+ */
+ }
+#if RESET_FOREACH_FILTER
+ if (Loop == TRUE) {
+ overshoot_seen = FALSE;
+ undershoot_seen = FALSE;
+ zbin_oq_low = zbin_oq_low0;
+ zbin_oq_high = zbin_oq_high0;
+ q_low = q_low0;
+ q_high = q_high0;
+ Q = Q0;
+ cpi->zbin_over_quant = last_zbin_oq = last_zbin_oq0;
+ cpi->rate_correction_factor = rate_correction_factor0;
+ cpi->gf_rate_correction_factor = gf_rate_correction_factor0;
+ cpi->active_best_quality = active_best_quality0;
+ cpi->active_worst_quality = active_worst_quality0;
+ }
+#endif
+ }
+ }
+
+ if (Loop == TRUE) {
+ loop_count++;
+#if CONFIG_INTERNAL_STATS
+ cpi->tot_recode_hits++;
+#endif
+ }
+ } while (Loop == TRUE);
+
+ // Special case code to reduce pulsing when key frames are forced at a
+ // fixed interval. Note the reconstruction error if it is the frame before
+ // the forced key frame
+ if (cpi->next_key_frame_forced && (cpi->twopass.frames_to_key == 0)) {
+ cpi->ambient_err = vp9_calc_ss_err(cpi->Source,
+ &cm->yv12_fb[cm->new_fb_idx]);
+ }
+
+ // This frame's MVs are saved and will be used in the next frame's MV
+ // prediction. The last frame has one more line (added to the bottom) and
+ // one more column (added to the right) than cm->mip. The edge elements are
+ // initialized to 0.
+ if (cm->show_frame) { // do not save for altref frame
+ int mb_row;
+ int mb_col;
+ MODE_INFO *tmp = cm->mip;
+
+ if (cm->frame_type != KEY_FRAME) {
+ for (mb_row = 0; mb_row < cm->mb_rows + 1; mb_row ++) {
+ for (mb_col = 0; mb_col < cm->mb_cols + 1; mb_col ++) {
+ if (tmp->mbmi.ref_frame != INTRA_FRAME)
+ cpi->lfmv[mb_col + mb_row * (cm->mode_info_stride + 1)].as_int = tmp->mbmi.mv[0].as_int;
+
+ cpi->lf_ref_frame_sign_bias[mb_col + mb_row * (cm->mode_info_stride + 1)] = cm->ref_frame_sign_bias[tmp->mbmi.ref_frame];
+ cpi->lf_ref_frame[mb_col + mb_row * (cm->mode_info_stride + 1)] = tmp->mbmi.ref_frame;
+ tmp++;
+ }
+ }
+ }
+ }
+
+ // Update the GF usage maps.
+ // This is done after completing the compression of a frame when all modes
+ // etc. are finalized but before loop filter
+ vp9_update_gf_useage_maps(cpi, cm, &cpi->mb);
+
+ if (cm->frame_type == KEY_FRAME)
+ cm->refresh_last_frame = 1;
+
+#if 0
+ {
+ FILE *f = fopen("gfactive.stt", "a");
+ fprintf(f, "%8d %8d %8d %8d %8d\n", cm->current_video_frame, (100 * cpi->gf_active_count) / (cpi->common.mb_rows * cpi->common.mb_cols), cpi->this_iiratio, cpi->next_iiratio, cm->refresh_golden_frame);
+ fclose(f);
+ }
+#endif
+
+ cm->frame_to_show = &cm->yv12_fb[cm->new_fb_idx];
+
+#if WRITE_RECON_BUFFER
+ if (cm->show_frame)
+ write_cx_frame_to_file(cm->frame_to_show,
+ cm->current_video_frame);
+ else
+ write_cx_frame_to_file(cm->frame_to_show,
+ cm->current_video_frame + 1000);
+#endif
+
+ // Pick the loop filter level for the frame.
+ loopfilter_frame(cpi, cm);
+
+ // build the bitstream
+ cpi->dummy_packing = 0;
+ vp9_pack_bitstream(cpi, dest, size);
+
+ if (cpi->mb.e_mbd.update_mb_segmentation_map) {
+ update_reference_segmentation_map(cpi);
+ }
+
+#if CONFIG_PRED_FILTER
+ // Select the prediction filtering mode to use for the
+ // next frame based on the current frame selections
+ if (cm->frame_type != KEY_FRAME)
+ select_pred_filter_mode(cpi);
+#endif
+
+ update_reference_frames(cm);
+ vp9_copy(cpi->common.fc.coef_counts, cpi->coef_counts);
+ vp9_copy(cpi->common.fc.hybrid_coef_counts, cpi->hybrid_coef_counts);
+ vp9_copy(cpi->common.fc.coef_counts_8x8, cpi->coef_counts_8x8);
+ vp9_copy(cpi->common.fc.hybrid_coef_counts_8x8, cpi->hybrid_coef_counts_8x8);
+ vp9_copy(cpi->common.fc.coef_counts_16x16, cpi->coef_counts_16x16);
+ vp9_copy(cpi->common.fc.hybrid_coef_counts_16x16,
+ cpi->hybrid_coef_counts_16x16);
+ vp9_adapt_coef_probs(&cpi->common);
+ if (cpi->common.frame_type != KEY_FRAME) {
+ vp9_copy(cpi->common.fc.ymode_counts, cpi->ymode_count);
+ vp9_copy(cpi->common.fc.uv_mode_counts, cpi->y_uv_mode_count);
+ vp9_copy(cpi->common.fc.bmode_counts, cpi->bmode_count);
+ vp9_copy(cpi->common.fc.i8x8_mode_counts, cpi->i8x8_mode_count);
+ vp9_copy(cpi->common.fc.sub_mv_ref_counts, cpi->sub_mv_ref_count);
+ vp9_copy(cpi->common.fc.mbsplit_counts, cpi->mbsplit_count);
+ vp9_adapt_mode_probs(&cpi->common);
+
+ cpi->common.fc.NMVcount = cpi->NMVcount;
+ vp9_adapt_nmv_probs(&cpi->common, cpi->mb.e_mbd.allow_high_precision_mv);
+ vp9_update_mode_context(&cpi->common);
+ }
+
+ /* Move storing frame_type out of the above loop since it is also
+ * needed in motion search besides loopfilter */
+ cm->last_frame_type = cm->frame_type;
+
+ // Keep a copy of the size estimate used in the loop
+ loop_size_estimate = cpi->projected_frame_size;
+
+ // Update rate control heuristics
+ cpi->total_byte_count += (*size);
+ cpi->projected_frame_size = (*size) << 3;
+
+ if (!active_worst_qchanged)
+ vp9_update_rate_correction_factors(cpi, 2);
+
+ cpi->last_q[cm->frame_type] = cm->base_qindex;
+
+ // Keep a record of the last boosted (KF/GF/ARF) Q value.
+ // If the current frame is coded at a lower Q then we also update it.
+ // If all mbs in this group are skipped only update if the Q value is
+ // better than that already stored.
+ // This is used to help set quality in forced key frames to reduce popping
+ if ((cm->base_qindex < cpi->last_boosted_qindex) ||
+ ((cpi->static_mb_pct < 100) &&
+ ((cm->frame_type == KEY_FRAME) ||
+ cm->refresh_alt_ref_frame ||
+ (cm->refresh_golden_frame && !cpi->is_src_frame_alt_ref)))) {
+ cpi->last_boosted_qindex = cm->base_qindex;
+ }
+
+ if (cm->frame_type == KEY_FRAME) {
+ vp9_adjust_key_frame_context(cpi);
+ }
+
+ // Keep a record of ambient average Q.
+ if (cm->frame_type != KEY_FRAME)
+ cpi->avg_frame_qindex = (2 + 3 * cpi->avg_frame_qindex + cm->base_qindex) >> 2;
+
+ // Keep a record from which we can calculate the average Q excluding GF updates and key frames
+ if ((cm->frame_type != KEY_FRAME) && !cm->refresh_golden_frame && !cm->refresh_alt_ref_frame) {
+ cpi->ni_frames++;
+ cpi->tot_q += vp9_convert_qindex_to_q(Q);
+ cpi->avg_q = cpi->tot_q / (double)cpi->ni_frames;
+
+ // Calculate the average Q for normal inter frames (not key or GFU
+ // frames).
+ cpi->ni_tot_qi += Q;
+ cpi->ni_av_qi = (cpi->ni_tot_qi / cpi->ni_frames);
+ }
+
+ // Update the buffer level variable.
+ // Non-viewable frames are a special case and are treated as pure overhead.
+ if (!cm->show_frame)
+ cpi->bits_off_target -= cpi->projected_frame_size;
+ else
+ cpi->bits_off_target += cpi->av_per_frame_bandwidth - cpi->projected_frame_size;
+
+ // Clip the buffer level at the maximum buffer size
+ if (cpi->bits_off_target > cpi->oxcf.maximum_buffer_size)
+ cpi->bits_off_target = cpi->oxcf.maximum_buffer_size;
+
+ // Rolling monitors of whether we are over- or under-spending, used to help regulate min and max Q in two pass.
+ cpi->rolling_target_bits = ((cpi->rolling_target_bits * 3) + cpi->this_frame_target + 2) / 4;
+ cpi->rolling_actual_bits = ((cpi->rolling_actual_bits * 3) + cpi->projected_frame_size + 2) / 4;
+ cpi->long_rolling_target_bits = ((cpi->long_rolling_target_bits * 31) + cpi->this_frame_target + 16) / 32;
+ cpi->long_rolling_actual_bits = ((cpi->long_rolling_actual_bits * 31) + cpi->projected_frame_size + 16) / 32;
+
+ // Actual bits spent
+ cpi->total_actual_bits += cpi->projected_frame_size;
+
+ // Debug stats
+ cpi->total_target_vs_actual += (cpi->this_frame_target - cpi->projected_frame_size);
+
+ cpi->buffer_level = cpi->bits_off_target;
+
+ // Update bits left to the kf and gf groups to account for overshoot or undershoot on these frames
+ if (cm->frame_type == KEY_FRAME) {
+ cpi->twopass.kf_group_bits += cpi->this_frame_target - cpi->projected_frame_size;
+
+ if (cpi->twopass.kf_group_bits < 0)
+ cpi->twopass.kf_group_bits = 0;
+ } else if (cm->refresh_golden_frame || cm->refresh_alt_ref_frame) {
+ cpi->twopass.gf_group_bits += cpi->this_frame_target - cpi->projected_frame_size;
+
+ if (cpi->twopass.gf_group_bits < 0)
+ cpi->twopass.gf_group_bits = 0;
+ }
+
+ // Update the skip mb flag probabilities based on the distribution seen
+ // in this frame.
+ update_base_skip_probs(cpi);
+
+#if 0 //CONFIG_NEW_MVREF && CONFIG_INTERNAL_STATS
+ {
+ FILE *f = fopen("mv_ref_dist.stt", "a");
+ unsigned int i;
+ for (i = 0; i < MAX_MV_REFS; ++i) {
+ fprintf(f, "%10d", cpi->best_ref_index_counts[0][i]);
+ }
+ fprintf(f, "\n" );
+
+ fclose(f);
+ }
+#endif
+
+#if 0// 1 && CONFIG_INTERNAL_STATS
+ {
+ FILE *f = fopen("tmp.stt", "a");
+ int recon_err;
+
+ vp9_clear_system_state(); // __asm emms;
+
+ recon_err = vp9_calc_ss_err(cpi->Source,
+ &cm->yv12_fb[cm->new_fb_idx]);
+
+ if (cpi->twopass.total_left_stats->coded_error != 0.0)
+ fprintf(f, "%10d %10d %10d %10d %10d %10d %10d %10d"
+ "%7.2f %7.2f %7.2f %7.2f %7.2f %7.2f %7.2f"
+ "%6d %5d %5d %5d %8d %8.2f %10d %10.3f"
+ "%10.3f %8d %10d %10d %10d\n",
+ cpi->common.current_video_frame, cpi->this_frame_target,
+ cpi->projected_frame_size, loop_size_estimate,
+ (cpi->projected_frame_size - cpi->this_frame_target),
+ (int)cpi->total_target_vs_actual,
+ (cpi->oxcf.starting_buffer_level - cpi->bits_off_target),
+ (int)cpi->total_actual_bits,
+ vp9_convert_qindex_to_q(cm->base_qindex),
+ (double)vp9_dc_quant(cm->base_qindex, 0) / 4.0,
+ vp9_convert_qindex_to_q(cpi->active_best_quality),
+ vp9_convert_qindex_to_q(cpi->active_worst_quality),
+ cpi->avg_q,
+ vp9_convert_qindex_to_q(cpi->ni_av_qi),
+ vp9_convert_qindex_to_q(cpi->cq_target_quality),
+ cpi->zbin_over_quant,
+ // cpi->avg_frame_qindex, cpi->zbin_over_quant,
+ cm->refresh_golden_frame, cm->refresh_alt_ref_frame,
+ cm->frame_type, cpi->gfu_boost,
+ cpi->twopass.est_max_qcorrection_factor,
+ (int)cpi->twopass.bits_left,
+ cpi->twopass.total_left_stats->coded_error,
+ (double)cpi->twopass.bits_left /
+ cpi->twopass.total_left_stats->coded_error,
+ cpi->tot_recode_hits, recon_err, cpi->kf_boost,
+ cpi->kf_zeromotion_pct);
+ else
+ fprintf(f, "%10d %10d %10d %10d %10d %10d %10d %10d"
+ "%7.2f %7.2f %7.2f %7.2f %7.2f %7.2f %7.2f"
+ "%6d %5d %5d %5d %8d %8.2f %10d %10.3f"
+ "%8d %10d %10d %10d\n",
+ cpi->common.current_video_frame,
+ cpi->this_frame_target, cpi->projected_frame_size,
+ loop_size_estimate,
+ (cpi->projected_frame_size - cpi->this_frame_target),
+ (int)cpi->total_target_vs_actual,
+ (cpi->oxcf.starting_buffer_level - cpi->bits_off_target),
+ (int)cpi->total_actual_bits,
+ vp9_convert_qindex_to_q(cm->base_qindex),
+ (double)vp9_dc_quant(cm->base_qindex, 0) / 4.0,
+ vp9_convert_qindex_to_q(cpi->active_best_quality),
+ vp9_convert_qindex_to_q(cpi->active_worst_quality),
+ cpi->avg_q,
+ vp9_convert_qindex_to_q(cpi->ni_av_qi),
+ vp9_convert_qindex_to_q(cpi->cq_target_quality),
+ cpi->zbin_over_quant,
+ // cpi->avg_frame_qindex, cpi->zbin_over_quant,
+ cm->refresh_golden_frame, cm->refresh_alt_ref_frame,
+ cm->frame_type, cpi->gfu_boost,
+ cpi->twopass.est_max_qcorrection_factor,
+ (int)cpi->twopass.bits_left,
+ cpi->twopass.total_left_stats->coded_error,
+ cpi->tot_recode_hits, recon_err, cpi->kf_boost,
+ cpi->kf_zeromotion_pct);
+
+ fclose(f);
+
+ if (0) {
+ FILE *fmodes = fopen("Modes.stt", "a");
+ int i;
+
+ fprintf(fmodes, "%6d:%1d:%1d:%1d ",
+ cpi->common.current_video_frame,
+ cm->frame_type, cm->refresh_golden_frame,
+ cm->refresh_alt_ref_frame);
+
+ for (i = 0; i < MAX_MODES; i++)
+ fprintf(fmodes, "%5d ", cpi->mode_chosen_counts[i]);
+
+ fprintf(fmodes, "\n");
+
+ fclose(fmodes);
+ }
+ }
+
+#endif
+
+#if 0
+ // Debug stats for segment feature experiments.
+ print_seg_map(cpi);
+#endif
+
+ // If this was a KF or GF, note the Q
+ if ((cm->frame_type == KEY_FRAME) || cm->refresh_golden_frame || cm->refresh_alt_ref_frame)
+ cm->last_kf_gf_q = cm->base_qindex;
+
+ if (cm->refresh_golden_frame == 1)
+ cm->frame_flags = cm->frame_flags | FRAMEFLAGS_GOLDEN;
+ else
+ cm->frame_flags = cm->frame_flags&~FRAMEFLAGS_GOLDEN;
+
+ if (cm->refresh_alt_ref_frame == 1)
+ cm->frame_flags = cm->frame_flags | FRAMEFLAGS_ALTREF;
+ else
+ cm->frame_flags = cm->frame_flags&~FRAMEFLAGS_ALTREF;
+
+
+ if (cm->refresh_last_frame & cm->refresh_golden_frame) // both refreshed
+ cpi->gold_is_last = 1;
+ else if (cm->refresh_last_frame ^ cm->refresh_golden_frame) // 1 refreshed but not the other
+ cpi->gold_is_last = 0;
+
+ if (cm->refresh_last_frame & cm->refresh_alt_ref_frame) // both refreshed
+ cpi->alt_is_last = 1;
+ else if (cm->refresh_last_frame ^ cm->refresh_alt_ref_frame) // 1 refreshed but not the other
+ cpi->alt_is_last = 0;
+
+ if (cm->refresh_alt_ref_frame & cm->refresh_golden_frame) // both refreshed
+ cpi->gold_is_alt = 1;
+ else if (cm->refresh_alt_ref_frame ^ cm->refresh_golden_frame) // 1 refreshed but not the other
+ cpi->gold_is_alt = 0;
+
+ cpi->ref_frame_flags = VP9_ALT_FLAG | VP9_GOLD_FLAG | VP9_LAST_FLAG;
+
+ if (cpi->gold_is_last)
+ cpi->ref_frame_flags &= ~VP9_GOLD_FLAG;
+
+ if (cpi->alt_is_last)
+ cpi->ref_frame_flags &= ~VP9_ALT_FLAG;
+
+ if (cpi->gold_is_alt)
+ cpi->ref_frame_flags &= ~VP9_ALT_FLAG;
+
+ if (cpi->oxcf.play_alternate && cm->refresh_alt_ref_frame && (cm->frame_type != KEY_FRAME))
+ // Update the alternate reference frame stats as appropriate.
+ update_alt_ref_frame_stats(cpi);
+ else
+ // Update the Golden frame stats as appropriate.
+ update_golden_frame_stats(cpi);
+
+ if (cm->frame_type == KEY_FRAME) {
+ // Tell the caller that the frame was coded as a key frame
+ *frame_flags = cm->frame_flags | FRAMEFLAGS_KEY;
+
+ // As this frame is a key frame the next defaults to an inter frame.
+ cm->frame_type = INTER_FRAME;
+ } else {
+ *frame_flags = cm->frame_flags&~FRAMEFLAGS_KEY;
+ }
+
+ // Clear the one shot update flags for segmentation map and mode/ref loop filter deltas.
+ xd->update_mb_segmentation_map = 0;
+ xd->update_mb_segmentation_data = 0;
+ xd->mode_ref_lf_delta_update = 0;
+
+
+ // Don't increment frame counters if this was an altref buffer update, not a real frame
+ if (cm->show_frame) {
+ cm->current_video_frame++;
+ cpi->frames_since_key++;
+ }
+
+ // reset to normal state now that we are done.
+
+
+
+#if 0
+ {
+ char filename[512];
+ FILE *recon_file;
+ sprintf(filename, "enc%04d.yuv", (int) cm->current_video_frame);
+ recon_file = fopen(filename, "wb");
+ fwrite(cm->yv12_fb[cm->lst_fb_idx].buffer_alloc,
+ cm->yv12_fb[cm->lst_fb_idx].frame_size, 1, recon_file);
+ fclose(recon_file);
+ }
+#endif
+#ifdef OUTPUT_YUV_REC
+ vp9_write_yuv_rec_frame(cm);
+#endif
+
+ if (cm->show_frame) {
+ vpx_memcpy(cm->prev_mip, cm->mip,
+ (cm->mb_cols + 1) * (cm->mb_rows + 1)* sizeof(MODE_INFO));
+ } else {
+ vpx_memset(cm->prev_mip, 0,
+ (cm->mb_cols + 1) * (cm->mb_rows + 1)* sizeof(MODE_INFO));
+ }
+}
+
+static void Pass2Encode(VP9_COMP *cpi, unsigned long *size,
+ unsigned char *dest, unsigned int *frame_flags) {
+
+ if (!cpi->common.refresh_alt_ref_frame)
+ vp9_second_pass(cpi);
+
+ encode_frame_to_data_rate(cpi, size, dest, frame_flags);
+ cpi->twopass.bits_left -= 8 * *size;
+
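+ // Credit back the guaranteed two-pass minimum rate for this frame so that
+ // bits_left only accounts for spending above the vbrmin section rate.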
+ if (!cpi->common.refresh_alt_ref_frame) {
+ double lower_bounds_min_rate = FRAME_OVERHEAD_BITS * cpi->oxcf.frame_rate;
+ double two_pass_min_rate = (double)(cpi->oxcf.target_bandwidth
+ * cpi->oxcf.two_pass_vbrmin_section / 100);
+
+ if (two_pass_min_rate < lower_bounds_min_rate)
+ two_pass_min_rate = lower_bounds_min_rate;
+
+ cpi->twopass.bits_left += (int64_t)(two_pass_min_rate / cpi->oxcf.frame_rate);
+ }
+}
+
+// For ARM NEON, d8-d15 are callee-saved registers, and need to be saved by us.
+#if HAVE_ARMV7
+extern void vp9_push_neon(int64_t *store);
+extern void vp9_pop_neon(int64_t *store);
+#endif
+
+
+int vp9_receive_raw_frame(VP9_PTR ptr, unsigned int frame_flags,
+ YV12_BUFFER_CONFIG *sd, int64_t time_stamp,
+ int64_t end_time) {
+#if HAVE_ARMV7
+ int64_t store_reg[8];
+#endif
+ VP9_COMP *cpi = (VP9_COMP *) ptr;
+ VP9_COMMON *cm = &cpi->common;
+ struct vpx_usec_timer timer;
+ int res = 0;
+
+#if HAVE_ARMV7
+#if CONFIG_RUNTIME_CPU_DETECT
+ if (cm->rtcd.flags & HAS_NEON)
+#endif
+ {
+ vp9_push_neon(store_reg);
+ }
+#endif
+
+ vpx_usec_timer_start(&timer);
+ if (vp9_lookahead_push(cpi->lookahead, sd, time_stamp, end_time, frame_flags,
+ cpi->active_map_enabled ? cpi->active_map : NULL))
+ res = -1;
+ cm->clr_type = sd->clrtype;
+ vpx_usec_timer_mark(&timer);
+ cpi->time_receive_data += vpx_usec_timer_elapsed(&timer);
+
+#if HAVE_ARMV7
+#if CONFIG_RUNTIME_CPU_DETECT
+ if (cm->rtcd.flags & HAS_NEON)
+#endif
+ {
+ vp9_pop_neon(store_reg);
+ }
+#endif
+
+ return res;
+}
+
+
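+// A frame counts as a reference (and so is not droppable) if it refreshes or
+// copies into any reference buffer, or updates persistent state such as the
+// entropy probabilities, loop filter deltas or segmentation data.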
+static int frame_is_reference(const VP9_COMP *cpi) {
+ const VP9_COMMON *cm = &cpi->common;
+ const MACROBLOCKD *xd = &cpi->mb.e_mbd;
+
+ return cm->frame_type == KEY_FRAME || cm->refresh_last_frame
+ || cm->refresh_golden_frame || cm->refresh_alt_ref_frame
+ || cm->copy_buffer_to_gf || cm->copy_buffer_to_arf
+ || cm->refresh_entropy_probs
+ || xd->mode_ref_lf_delta_update
+ || xd->update_mb_segmentation_map || xd->update_mb_segmentation_data;
+}
+
+
+int vp9_get_compressed_data(VP9_PTR ptr, unsigned int *frame_flags,
+ unsigned long *size, unsigned char *dest,
+ int64_t *time_stamp, int64_t *time_end, int flush) {
+#if HAVE_ARMV7
+ int64_t store_reg[8];
+#endif
+ VP9_COMP *cpi = (VP9_COMP *) ptr;
+ VP9_COMMON *cm = &cpi->common;
+ struct vpx_usec_timer cmptimer;
+ YV12_BUFFER_CONFIG *force_src_buffer = NULL;
+
+ if (!cpi)
+ return -1;
+
+#if HAVE_ARMV7
+#if CONFIG_RUNTIME_CPU_DETECT
+ if (cm->rtcd.flags & HAS_NEON)
+#endif
+ {
+ vp9_push_neon(store_reg);
+ }
+#endif
+
+ vpx_usec_timer_start(&cmptimer);
+
+ cpi->source = NULL;
+
+ cpi->mb.e_mbd.allow_high_precision_mv = ALTREF_HIGH_PRECISION_MV;
+ // Should we code an alternate reference frame
+ if (cpi->oxcf.play_alternate &&
+ cpi->source_alt_ref_pending) {
+ if ((cpi->source = vp9_lookahead_peek(cpi->lookahead,
+ cpi->frames_till_gf_update_due))) {
+ cpi->alt_ref_source = cpi->source;
+ if (cpi->oxcf.arnr_max_frames > 0) {
+ vp9_temporal_filter_prepare_c(cpi,
+ cpi->frames_till_gf_update_due);
+ force_src_buffer = &cpi->alt_ref_buffer;
+ }
+ cm->frames_till_alt_ref_frame = cpi->frames_till_gf_update_due;
+ cm->refresh_alt_ref_frame = 1;
+ cm->refresh_golden_frame = 0;
+ cm->refresh_last_frame = 0;
+ cm->show_frame = 0;
+ cpi->source_alt_ref_pending = FALSE; // Clear pending alt ref flag.
+ cpi->is_src_frame_alt_ref = 0;
+ }
+ }
+
+ if (!cpi->source) {
+ if ((cpi->source = vp9_lookahead_pop(cpi->lookahead, flush))) {
+ cm->show_frame = 1;
+
+ cpi->is_src_frame_alt_ref = cpi->alt_ref_source
+ && (cpi->source == cpi->alt_ref_source);
+
+ if (cpi->is_src_frame_alt_ref)
+ cpi->alt_ref_source = NULL;
+ }
+ }
+
+ if (cpi->source) {
+ cpi->un_scaled_source =
+ cpi->Source = force_src_buffer ? force_src_buffer : &cpi->source->img;
+ *time_stamp = cpi->source->ts_start;
+ *time_end = cpi->source->ts_end;
+ *frame_flags = cpi->source->flags;
+ } else {
+ *size = 0;
+ if (flush && cpi->pass == 1 && !cpi->twopass.first_pass_done) {
+ vp9_end_first_pass(cpi); /* get last stats packet */
+ cpi->twopass.first_pass_done = 1;
+ }
+
+#if HAVE_ARMV7
+#if CONFIG_RUNTIME_CPU_DETECT
+ if (cm->rtcd.flags & HAS_NEON)
+#endif
+ {
+ vp9_pop_neon(store_reg);
+ }
+#endif
+ return -1;
+ }
+
+ if (cpi->source->ts_start < cpi->first_time_stamp_ever) {
+ cpi->first_time_stamp_ever = cpi->source->ts_start;
+ cpi->last_end_time_stamp_seen = cpi->source->ts_start;
+ }
+
+ // adjust frame rates based on timestamps given
+ if (!cm->refresh_alt_ref_frame) {
+ int64_t this_duration;
+ int step = 0;
+
+ if (cpi->source->ts_start == cpi->first_time_stamp_ever) {
+ this_duration = cpi->source->ts_end - cpi->source->ts_start;
+ step = 1;
+ } else {
+ int64_t last_duration;
+
+ this_duration = cpi->source->ts_end - cpi->last_end_time_stamp_seen;
+ last_duration = cpi->last_end_time_stamp_seen
+ - cpi->last_time_stamp_seen;
+ // do a step update if the duration changes by 10%
+ if (last_duration)
+ step = ((this_duration - last_duration) * 10 / last_duration);
+ }
+
+ if (this_duration) {
+ if (step)
+ vp9_new_frame_rate(cpi, 10000000.0 / this_duration);
+ else {
+ double avg_duration, interval;
+
+ /* Average this frame's rate into the last second's average
+ * frame rate. If we haven't seen 1 second yet, then average
+ * over the whole interval seen.
+ */
+ interval = cpi->source->ts_end - cpi->first_time_stamp_ever;
+ if (interval > 10000000.0)
+ interval = 10000000;
+
+ avg_duration = 10000000.0 / cpi->oxcf.frame_rate;
+ avg_duration *= (interval - avg_duration + this_duration);
+ avg_duration /= interval;
+
+ vp9_new_frame_rate(cpi, 10000000.0 / avg_duration);
+ }
+ }
+
+ cpi->last_time_stamp_seen = cpi->source->ts_start;
+ cpi->last_end_time_stamp_seen = cpi->source->ts_end;
+ }
+
+ // start with a 0 size frame
+ *size = 0;
+
+ // Clear down mmx registers
+ vp9_clear_system_state(); // __asm emms;
+
+ cm->frame_type = INTER_FRAME;
+ cm->frame_flags = *frame_flags;
+
+#if 0
+
+ if (cm->refresh_alt_ref_frame) {
+ // cm->refresh_golden_frame = 1;
+ cm->refresh_golden_frame = 0;
+ cm->refresh_last_frame = 0;
+ } else {
+ cm->refresh_golden_frame = 0;
+ cm->refresh_last_frame = 1;
+ }
+
+#endif
+ /* find a free buffer for the new frame */
+ {
+ int i = 0;
+ for (; i < NUM_YV12_BUFFERS; i++) {
+ if (!cm->yv12_fb[i].flags) {
+ cm->new_fb_idx = i;
+ break;
+ }
+ }
+
+ assert(i < NUM_YV12_BUFFERS);
+ }
+ if (cpi->pass == 1) {
+ Pass1Encode(cpi, size, dest, frame_flags);
+ } else if (cpi->pass == 2) {
+ Pass2Encode(cpi, size, dest, frame_flags);
+ } else {
+ encode_frame_to_data_rate(cpi, size, dest, frame_flags);
+ }
+
+ if (cm->refresh_entropy_probs) {
+ if (cm->refresh_alt_ref_frame)
+ vpx_memcpy(&cm->lfc_a, &cm->fc, sizeof(cm->fc));
+ else
+ vpx_memcpy(&cm->lfc, &cm->fc, sizeof(cm->fc));
+ }
+
+ // If it's a dropped frame, honor the requests on subsequent frames
+ if (*size > 0) {
+ cpi->droppable = !frame_is_reference(cpi);
+
+ // return to normal state
+ cm->refresh_entropy_probs = 1;
+ cm->refresh_alt_ref_frame = 0;
+ cm->refresh_golden_frame = 0;
+ cm->refresh_last_frame = 1;
+ cm->frame_type = INTER_FRAME;
+
+ }
+
+ vpx_usec_timer_mark(&cmptimer);
+ cpi->time_compress_data += vpx_usec_timer_elapsed(&cmptimer);
+
+ if (cpi->b_calculate_psnr && cpi->pass != 1 && cm->show_frame) {
+ generate_psnr_packet(cpi);
+ }
+
+#if CONFIG_INTERNAL_STATS
+
+ if (cpi->pass != 1) {
+ cpi->bytes += *size;
+
+ if (cm->show_frame) {
+
+ cpi->count++;
+
+ if (cpi->b_calculate_psnr) {
+ double ye, ue, ve;
+ double frame_psnr;
+ YV12_BUFFER_CONFIG *orig = cpi->Source;
+ YV12_BUFFER_CONFIG *recon = cpi->common.frame_to_show;
+ YV12_BUFFER_CONFIG *pp = &cm->post_proc_buffer;
+ int y_samples = orig->y_height * orig->y_width;
+ int uv_samples = orig->uv_height * orig->uv_width;
+ int t_samples = y_samples + 2 * uv_samples;
+ int64_t sq_error;
+
+ ye = calc_plane_error(orig->y_buffer, orig->y_stride,
+ recon->y_buffer, recon->y_stride, orig->y_width,
+ orig->y_height);
+
+ ue = calc_plane_error(orig->u_buffer, orig->uv_stride,
+ recon->u_buffer, recon->uv_stride, orig->uv_width,
+ orig->uv_height);
+
+ ve = calc_plane_error(orig->v_buffer, orig->uv_stride,
+ recon->v_buffer, recon->uv_stride, orig->uv_width,
+ orig->uv_height);
+
+ sq_error = ye + ue + ve;
+
+ frame_psnr = vp9_mse2psnr(t_samples, 255.0, sq_error);
+
+ cpi->total_y += vp9_mse2psnr(y_samples, 255.0, ye);
+ cpi->total_u += vp9_mse2psnr(uv_samples, 255.0, ue);
+ cpi->total_v += vp9_mse2psnr(uv_samples, 255.0, ve);
+ cpi->total_sq_error += sq_error;
+ cpi->total += frame_psnr;
+ {
+ double frame_psnr2, frame_ssim2 = 0;
+ double weight = 0;
+#if CONFIG_POSTPROC
+ vp9_deblock(cm->frame_to_show, &cm->post_proc_buffer, cm->filter_level * 10 / 6, 1, 0, IF_RTCD(&cm->rtcd.postproc));
+#endif
+ vp9_clear_system_state();
+
+ ye = calc_plane_error(orig->y_buffer, orig->y_stride,
+ pp->y_buffer, pp->y_stride, orig->y_width,
+ orig->y_height);
+
+ ue = calc_plane_error(orig->u_buffer, orig->uv_stride,
+ pp->u_buffer, pp->uv_stride, orig->uv_width,
+ orig->uv_height);
+
+ ve = calc_plane_error(orig->v_buffer, orig->uv_stride,
+ pp->v_buffer, pp->uv_stride, orig->uv_width,
+ orig->uv_height);
+
+ sq_error = ye + ue + ve;
+
+ frame_psnr2 = vp9_mse2psnr(t_samples, 255.0, sq_error);
+
+ cpi->totalp_y += vp9_mse2psnr(y_samples, 255.0, ye);
+ cpi->totalp_u += vp9_mse2psnr(uv_samples, 255.0, ue);
+ cpi->totalp_v += vp9_mse2psnr(uv_samples, 255.0, ve);
+ cpi->total_sq_error2 += sq_error;
+ cpi->totalp += frame_psnr2;
+
+ frame_ssim2 = vp9_calc_ssim(cpi->Source,
+ &cm->post_proc_buffer, 1, &weight);
+
+ cpi->summed_quality += frame_ssim2 * weight;
+ cpi->summed_weights += weight;
+#if 0
+ {
+ FILE *f = fopen("q_used.stt", "a");
+ fprintf(f, "%5d : Y%f7.3:U%f7.3:V%f7.3:F%f7.3:S%7.3f\n",
+ cpi->common.current_video_frame, y2, u2, v2,
+ frame_psnr2, frame_ssim2);
+ fclose(f);
+ }
+#endif
+ }
+ }
+
+ if (cpi->b_calculate_ssimg) {
+ double y, u, v, frame_all;
+ frame_all = vp9_calc_ssimg(cpi->Source, cm->frame_to_show,
+ &y, &u, &v);
+ cpi->total_ssimg_y += y;
+ cpi->total_ssimg_u += u;
+ cpi->total_ssimg_v += v;
+ cpi->total_ssimg_all += frame_all;
+ }
+
+ }
+ }
+
+#endif
+
+#if HAVE_ARMV7
+#if CONFIG_RUNTIME_CPU_DETECT
+ if (cm->rtcd.flags & HAS_NEON)
+#endif
+ {
+ vp9_pop_neon(store_reg);
+ }
+#endif
+
+ return 0;
+}
+
+int vp9_get_preview_raw_frame(VP9_PTR comp, YV12_BUFFER_CONFIG *dest,
+ vp9_ppflags_t *flags) {
+ VP9_COMP *cpi = (VP9_COMP *) comp;
+
+ if (cpi->common.refresh_alt_ref_frame)
+ return -1;
+ else {
+ int ret;
+#if CONFIG_POSTPROC
+ ret = vp9_post_proc_frame(&cpi->common, dest, flags);
+#else
+
+ if (cpi->common.frame_to_show) {
+ *dest = *cpi->common.frame_to_show;
+ dest->y_width = cpi->common.Width;
+ dest->y_height = cpi->common.Height;
+ dest->uv_height = cpi->common.Height / 2;
+ ret = 0;
+ } else {
+ ret = -1;
+ }
+
+#endif // !CONFIG_POSTPROC
+ vp9_clear_system_state();
+ return ret;
+ }
+}
+
+int vp9_set_roimap(VP9_PTR comp, unsigned char *map, unsigned int rows,
+ unsigned int cols, int delta_q[4], int delta_lf[4],
+ unsigned int threshold[4]) {
+ VP9_COMP *cpi = (VP9_COMP *) comp;
+ signed char feature_data[SEG_LVL_MAX][MAX_MB_SEGMENTS];
+ MACROBLOCKD *xd = &cpi->mb.e_mbd;
+ int i;
+
+ if (cpi->common.mb_rows != rows || cpi->common.mb_cols != cols)
+ return -1;
+
+ if (!map) {
+ vp9_disable_segmentation((VP9_PTR)cpi);
+ return 0;
+ }
+
+ // Set the segmentation Map
+ vp9_set_segmentation_map((VP9_PTR)cpi, map);
+
+ // Activate segmentation.
+ vp9_enable_segmentation((VP9_PTR)cpi);
+
+ // Set up the quant segment data
+ feature_data[SEG_LVL_ALT_Q][0] = delta_q[0];
+ feature_data[SEG_LVL_ALT_Q][1] = delta_q[1];
+ feature_data[SEG_LVL_ALT_Q][2] = delta_q[2];
+ feature_data[SEG_LVL_ALT_Q][3] = delta_q[3];
+
+ // Set up the loop filter segment data
+ feature_data[SEG_LVL_ALT_LF][0] = delta_lf[0];
+ feature_data[SEG_LVL_ALT_LF][1] = delta_lf[1];
+ feature_data[SEG_LVL_ALT_LF][2] = delta_lf[2];
+ feature_data[SEG_LVL_ALT_LF][3] = delta_lf[3];
+
+ cpi->segment_encode_breakout[0] = threshold[0];
+ cpi->segment_encode_breakout[1] = threshold[1];
+ cpi->segment_encode_breakout[2] = threshold[2];
+ cpi->segment_encode_breakout[3] = threshold[3];
+
+ // Enable the loop and quant changes in the feature mask
+ for (i = 0; i < 4; i++) {
+ if (delta_q[i])
+ vp9_enable_segfeature(xd, i, SEG_LVL_ALT_Q);
+ else
+ vp9_disable_segfeature(xd, i, SEG_LVL_ALT_Q);
+
+ if (delta_lf[i])
+ vp9_enable_segfeature(xd, i, SEG_LVL_ALT_LF);
+ else
+ vp9_disable_segfeature(xd, i, SEG_LVL_ALT_LF);
+ }
+
+ // Initialise the feature data structure
+ // SEGMENT_DELTADATA 0, SEGMENT_ABSDATA 1
+ vp9_set_segment_data((VP9_PTR)cpi, &feature_data[0][0], SEGMENT_DELTADATA);
+
+ return 0;
+}
+
+int vp9_set_active_map(VP9_PTR comp, unsigned char *map,
+ unsigned int rows, unsigned int cols) {
+ VP9_COMP *cpi = (VP9_COMP *) comp;
+
+ if (rows == cpi->common.mb_rows && cols == cpi->common.mb_cols) {
+ if (map) {
+ vpx_memcpy(cpi->active_map, map, rows * cols);
+ cpi->active_map_enabled = 1;
+ } else
+ cpi->active_map_enabled = 0;
+
+ return 0;
+ } else {
+ // cpi->active_map_enabled = 0;
+ return -1;
+ }
+}
+
+int vp9_set_internal_size(VP9_PTR comp,
+ VPX_SCALING horiz_mode, VPX_SCALING vert_mode) {
+ VP9_COMP *cpi = (VP9_COMP *) comp;
+
+ if (horiz_mode <= ONETWO)
+ cpi->common.horiz_scale = horiz_mode;
+ else
+ return -1;
+
+ if (vert_mode <= ONETWO)
+ cpi->common.vert_scale = vert_mode;
+ else
+ return -1;
+
+ return 0;
+}
+
+
+
+int vp9_calc_ss_err(YV12_BUFFER_CONFIG *source, YV12_BUFFER_CONFIG *dest) {
+ int i, j;
+ int Total = 0;
+
+ unsigned char *src = source->y_buffer;
+ unsigned char *dst = dest->y_buffer;
+
+ // Loop through the Y plane raw and reconstruction data summing the squared differences
+ for (i = 0; i < source->y_height; i += 16) {
+ for (j = 0; j < source->y_width; j += 16) {
+ unsigned int sse;
+ Total += vp9_mse16x16(src + j, source->y_stride, dst + j, dest->y_stride,
+ &sse);
+ }
+
+ src += 16 * source->y_stride;
+ dst += 16 * dest->y_stride;
+ }
+
+ return Total;
+}
+
+
+int vp9_get_quantizer(VP9_PTR c) {
+ VP9_COMP *cpi = (VP9_COMP *) c;
+ return cpi->common.base_qindex;
+}
--- /dev/null
+++ b/vp9/encoder/onyx_int.h
@@ -1,0 +1,788 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef __INC_ONYX_INT_H
+#define __INC_ONYX_INT_H
+
+#include <stdio.h>
+#include "vpx_ports/config.h"
+#include "vp9/common/onyx.h"
+#include "treewriter.h"
+#include "tokenize.h"
+#include "vp9/common/onyxc_int.h"
+#include "variance.h"
+#include "encodemb.h"
+#include "quantize.h"
+#include "vp9/common/entropy.h"
+#include "vp9/common/entropymode.h"
+#include "vpx_ports/mem.h"
+#include "vpx/internal/vpx_codec_internal.h"
+#include "mcomp.h"
+#include "temporal_filter.h"
+#include "vp9/common/findnearmv.h"
+#include "lookahead.h"
+
+// #define SPEEDSTATS 1
+#define MIN_GF_INTERVAL 4
+#define DEFAULT_GF_INTERVAL 7
+
+#define KEY_FRAME_CONTEXT 5
+
+#define MAX_LAG_BUFFERS 25
+
+#define AF_THRESH 25
+#define AF_THRESH2 100
+#define ARF_DECAY_THRESH 12
+
+#if CONFIG_PRED_FILTER
+#define MAX_MODES 54
+#else // CONFIG_PRED_FILTER
+#define MAX_MODES 42
+#endif // CONFIG_PRED_FILTER
+
+#define MIN_THRESHMULT 32
+#define MAX_THRESHMULT 512
+
+#define GF_ZEROMV_ZBIN_BOOST 12
+#define LF_ZEROMV_ZBIN_BOOST 6
+#define MV_ZBIN_BOOST 4
+#define ZBIN_OQ_MAX 192
+
+#define VP9_TEMPORAL_ALT_REF 1
+
+typedef struct {
+ nmv_context nmvc;
+ int nmvjointcost[MV_JOINTS];
+ int nmvcosts[2][MV_VALS];
+ int nmvcosts_hp[2][MV_VALS];
+
+#ifdef MODE_STATS
+ // Stats
+ int y_modes[VP9_YMODES];
+ int uv_modes[VP9_UV_MODES];
+ int i8x8_modes[VP9_I8X8_MODES];
+ int b_modes[B_MODE_COUNT];
+ int inter_y_modes[MB_MODE_COUNT];
+ int inter_uv_modes[VP9_UV_MODES];
+ int inter_b_modes[B_MODE_COUNT];
+#endif
+
+ vp9_prob segment_pred_probs[PREDICTION_PROBS];
+ unsigned char ref_pred_probs_update[PREDICTION_PROBS];
+ vp9_prob ref_pred_probs[PREDICTION_PROBS];
+ vp9_prob prob_comppred[COMP_PRED_CONTEXTS];
+
+ unsigned char *last_frame_seg_map_copy;
+
+ // 0 = Intra, Last, GF, ARF
+ signed char last_ref_lf_deltas[MAX_REF_LF_DELTAS];
+ // 0 = BPRED, ZERO_MV, MV, SPLIT
+ signed char last_mode_lf_deltas[MAX_MODE_LF_DELTAS];
+
+ vp9_prob coef_probs[BLOCK_TYPES]
+ [COEF_BANDS][PREV_COEF_CONTEXTS][ENTROPY_NODES];
+ vp9_prob hybrid_coef_probs[BLOCK_TYPES]
+ [COEF_BANDS][PREV_COEF_CONTEXTS][ENTROPY_NODES];
+
+ vp9_prob coef_probs_8x8[BLOCK_TYPES_8X8]
+ [COEF_BANDS][PREV_COEF_CONTEXTS][ENTROPY_NODES];
+ vp9_prob hybrid_coef_probs_8x8[BLOCK_TYPES_8X8]
+ [COEF_BANDS][PREV_COEF_CONTEXTS][ENTROPY_NODES];
+
+ vp9_prob coef_probs_16x16[BLOCK_TYPES_16X16]
+ [COEF_BANDS][PREV_COEF_CONTEXTS][ENTROPY_NODES];
+ vp9_prob hybrid_coef_probs_16x16[BLOCK_TYPES_16X16]
+ [COEF_BANDS][PREV_COEF_CONTEXTS][ENTROPY_NODES];
+
+ vp9_prob ymode_prob [VP9_YMODES - 1]; /* interframe intra mode probs */
+ vp9_prob uv_mode_prob [VP9_YMODES][VP9_UV_MODES - 1];
+ vp9_prob bmode_prob [VP9_BINTRAMODES - 1];
+ vp9_prob i8x8_mode_prob [VP9_I8X8_MODES - 1];
+ vp9_prob sub_mv_ref_prob [SUBMVREF_COUNT][VP9_SUBMVREFS - 1];
+ vp9_prob mbsplit_prob [VP9_NUMMBSPLITS - 1];
+
+ vp9_prob switchable_interp_prob[VP9_SWITCHABLE_FILTERS + 1]
+ [VP9_SWITCHABLE_FILTERS - 1];
+
+ int mv_ref_ct[6][4][2];
+ int mode_context[6][4];
+ int mv_ref_ct_a[6][4][2];
+ int mode_context_a[6][4];
+
+} CODING_CONTEXT;
+
+typedef struct {
+ double frame;
+ double intra_error;
+ double coded_error;
+ double sr_coded_error;
+ double ssim_weighted_pred_err;
+ double pcnt_inter;
+ double pcnt_motion;
+ double pcnt_second_ref;
+ double pcnt_neutral;
+ double MVr;
+ double mvr_abs;
+ double MVc;
+ double mvc_abs;
+ double MVrv;
+ double MVcv;
+ double mv_in_out_count;
+ double new_mv_count;
+ double duration;
+ double count;
+}
+FIRSTPASS_STATS;
+
+typedef struct {
+ int frames_so_far;
+ double frame_intra_error;
+ double frame_coded_error;
+ double frame_pcnt_inter;
+ double frame_pcnt_motion;
+ double frame_mvr;
+ double frame_mvr_abs;
+ double frame_mvc;
+ double frame_mvc_abs;
+
+} ONEPASS_FRAMESTATS;
+
+typedef struct {
+ struct {
+ int err;
+ union {
+ int_mv mv;
+ MB_PREDICTION_MODE mode;
+ } m;
+ } ref[MAX_REF_FRAMES];
+} MBGRAPH_MB_STATS;
+
+typedef struct {
+ MBGRAPH_MB_STATS *mb_stats;
+} MBGRAPH_FRAME_STATS;
+
+#if CONFIG_PRED_FILTER
+typedef enum {
+ THR_ZEROMV,
+ THR_ZEROMV_FILT,
+ THR_DC,
+
+ THR_NEARESTMV,
+ THR_NEARESTMV_FILT,
+ THR_NEARMV,
+ THR_NEARMV_FILT,
+
+ THR_ZEROG,
+ THR_ZEROG_FILT,
+ THR_NEARESTG,
+ THR_NEARESTG_FILT,
+
+ THR_ZEROA,
+ THR_ZEROA_FILT,
+ THR_NEARESTA,
+ THR_NEARESTA_FILT,
+
+ THR_NEARG,
+ THR_NEARG_FILT,
+ THR_NEARA,
+ THR_NEARA_FILT,
+
+ THR_V_PRED,
+ THR_H_PRED,
+ THR_D45_PRED,
+ THR_D135_PRED,
+ THR_D117_PRED,
+ THR_D153_PRED,
+ THR_D27_PRED,
+ THR_D63_PRED,
+ THR_TM,
+
+ THR_NEWMV,
+ THR_NEWMV_FILT,
+ THR_NEWG,
+ THR_NEWG_FILT,
+ THR_NEWA,
+ THR_NEWA_FILT,
+
+ THR_SPLITMV,
+ THR_SPLITG,
+ THR_SPLITA,
+
+ THR_B_PRED,
+ THR_I8X8_PRED,
+
+ THR_COMP_ZEROLG,
+ THR_COMP_NEARESTLG,
+ THR_COMP_NEARLG,
+
+ THR_COMP_ZEROLA,
+ THR_COMP_NEARESTLA,
+ THR_COMP_NEARLA,
+
+ THR_COMP_ZEROGA,
+ THR_COMP_NEARESTGA,
+ THR_COMP_NEARGA,
+
+ THR_COMP_NEWLG,
+ THR_COMP_NEWLA,
+ THR_COMP_NEWGA,
+
+ THR_COMP_SPLITLG,
+ THR_COMP_SPLITLA,
+ THR_COMP_SPLITGA,
+}
+THR_MODES;
+#else
+typedef enum {
+ THR_ZEROMV,
+ THR_DC,
+
+ THR_NEARESTMV,
+ THR_NEARMV,
+
+ THR_ZEROG,
+ THR_NEARESTG,
+
+ THR_ZEROA,
+ THR_NEARESTA,
+
+ THR_NEARG,
+ THR_NEARA,
+
+ THR_V_PRED,
+ THR_H_PRED,
+ THR_D45_PRED,
+ THR_D135_PRED,
+ THR_D117_PRED,
+ THR_D153_PRED,
+ THR_D27_PRED,
+ THR_D63_PRED,
+ THR_TM,
+
+ THR_NEWMV,
+ THR_NEWG,
+ THR_NEWA,
+
+ THR_SPLITMV,
+ THR_SPLITG,
+ THR_SPLITA,
+
+ THR_B_PRED,
+ THR_I8X8_PRED,
+
+ THR_COMP_ZEROLG,
+ THR_COMP_NEARESTLG,
+ THR_COMP_NEARLG,
+
+ THR_COMP_ZEROLA,
+ THR_COMP_NEARESTLA,
+ THR_COMP_NEARLA,
+
+ THR_COMP_ZEROGA,
+ THR_COMP_NEARESTGA,
+ THR_COMP_NEARGA,
+
+ THR_COMP_NEWLG,
+ THR_COMP_NEWLA,
+ THR_COMP_NEWGA,
+
+ THR_COMP_SPLITLG,
+ THR_COMP_SPLITLA,
+ THR_COMP_SPLITGA
+}
+THR_MODES;
+#endif
+
+typedef enum {
+ DIAMOND = 0,
+ NSTEP = 1,
+ HEX = 2
+} SEARCH_METHODS;
+
+typedef struct {
+ int RD;
+ SEARCH_METHODS search_method;
+ int improved_dct;
+ int auto_filter;
+ int recode_loop;
+ int iterative_sub_pixel;
+ int half_pixel_search;
+ int quarter_pixel_search;
+ int thresh_mult[MAX_MODES];
+ int max_step_search_steps;
+ int first_step;
+ int optimize_coefficients;
+ int no_skip_block4x4_search;
+ int improved_mv_pred;
+ int search_best_filter;
+
+} SPEED_FEATURES;
+
+typedef struct {
+ MACROBLOCK mb;
+ int totalrate;
+} MB_ROW_COMP;
+
+typedef struct {
+ TOKENEXTRA *start;
+ TOKENEXTRA *stop;
+} TOKENLIST;
+
+typedef struct {
+ int ithread;
+ void *ptr1;
+ void *ptr2;
+} ENCODETHREAD_DATA;
+typedef struct {
+ int ithread;
+ void *ptr1;
+} LPFTHREAD_DATA;
+
+
+typedef struct VP9_ENCODER_RTCD {
+ VP9_COMMON_RTCD *common;
+ vp9_search_rtcd_vtable_t search;
+ vp9_temporal_rtcd_vtable_t temporal;
+} VP9_ENCODER_RTCD;
+
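+// Block sizes used as indices into the per-size function pointer table
+// (fn_ptr) in VP9_COMP; the first four entries alias the macroblock
+// partitionings of the same dimensions.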
+enum BlockSize {
+ BLOCK_16X8 = PARTITIONING_16X8,
+ BLOCK_8X16 = PARTITIONING_8X16,
+ BLOCK_8X8 = PARTITIONING_8X8,
+ BLOCK_4X4 = PARTITIONING_4X4,
+ BLOCK_16X16,
+ BLOCK_MAX_SEGMENTS,
+ BLOCK_32X32 = BLOCK_MAX_SEGMENTS,
+ BLOCK_MAX_SB_SEGMENTS,
+};
+
+typedef struct VP9_COMP {
+
+ DECLARE_ALIGNED(16, short, Y1quant[QINDEX_RANGE][16]);
+ DECLARE_ALIGNED(16, unsigned char, Y1quant_shift[QINDEX_RANGE][16]);
+ DECLARE_ALIGNED(16, short, Y1zbin[QINDEX_RANGE][16]);
+ DECLARE_ALIGNED(16, short, Y1round[QINDEX_RANGE][16]);
+
+ DECLARE_ALIGNED(16, short, Y2quant[QINDEX_RANGE][16]);
+ DECLARE_ALIGNED(16, unsigned char, Y2quant_shift[QINDEX_RANGE][16]);
+ DECLARE_ALIGNED(16, short, Y2zbin[QINDEX_RANGE][16]);
+ DECLARE_ALIGNED(16, short, Y2round[QINDEX_RANGE][16]);
+
+ DECLARE_ALIGNED(16, short, UVquant[QINDEX_RANGE][16]);
+ DECLARE_ALIGNED(16, unsigned char, UVquant_shift[QINDEX_RANGE][16]);
+ DECLARE_ALIGNED(16, short, UVzbin[QINDEX_RANGE][16]);
+ DECLARE_ALIGNED(16, short, UVround[QINDEX_RANGE][16]);
+
+ DECLARE_ALIGNED(16, short, zrun_zbin_boost_y1[QINDEX_RANGE][16]);
+ DECLARE_ALIGNED(16, short, zrun_zbin_boost_y2[QINDEX_RANGE][16]);
+ DECLARE_ALIGNED(16, short, zrun_zbin_boost_uv[QINDEX_RANGE][16]);
+
+ DECLARE_ALIGNED(64, short, Y1zbin_8x8[QINDEX_RANGE][64]);
+ DECLARE_ALIGNED(64, short, Y2zbin_8x8[QINDEX_RANGE][64]);
+ DECLARE_ALIGNED(64, short, UVzbin_8x8[QINDEX_RANGE][64]);
+ DECLARE_ALIGNED(64, short, zrun_zbin_boost_y1_8x8[QINDEX_RANGE][64]);
+ DECLARE_ALIGNED(64, short, zrun_zbin_boost_y2_8x8[QINDEX_RANGE][64]);
+ DECLARE_ALIGNED(64, short, zrun_zbin_boost_uv_8x8[QINDEX_RANGE][64]);
+
+ DECLARE_ALIGNED(16, short, Y1zbin_16x16[QINDEX_RANGE][256]);
+ DECLARE_ALIGNED(16, short, Y2zbin_16x16[QINDEX_RANGE][256]);
+ DECLARE_ALIGNED(16, short, UVzbin_16x16[QINDEX_RANGE][256]);
+ DECLARE_ALIGNED(16, short, zrun_zbin_boost_y1_16x16[QINDEX_RANGE][256]);
+ DECLARE_ALIGNED(16, short, zrun_zbin_boost_y2_16x16[QINDEX_RANGE][256]);
+ DECLARE_ALIGNED(16, short, zrun_zbin_boost_uv_16x16[QINDEX_RANGE][256]);
+
+ MACROBLOCK mb;
+ VP9_COMMON common;
+ VP9_CONFIG oxcf;
+
+ struct lookahead_ctx *lookahead;
+ struct lookahead_entry *source;
+ struct lookahead_entry *alt_ref_source;
+
+ YV12_BUFFER_CONFIG *Source;
+ YV12_BUFFER_CONFIG *un_scaled_source;
+ YV12_BUFFER_CONFIG scaled_source;
+
+ int source_alt_ref_pending; // frame in src_buffers has been identified to be encoded as an alt ref
+ int source_alt_ref_active; // an alt ref frame has been encoded and is usable
+
+ int is_src_frame_alt_ref; // source of frame to encode is an exact copy of an alt ref frame
+
+ int gold_is_last; // golden frame same as last frame ( short circuit gold searches)
+ int alt_is_last; // Alt reference frame same as last ( short circuit altref search)
+ int gold_is_alt; // don't do both alt and gold search ( just do gold).
+
+ // int refresh_alt_ref_frame;
+ YV12_BUFFER_CONFIG last_frame_uf;
+
+ TOKENEXTRA *tok;
+ unsigned int tok_count;
+
+
+ unsigned int frames_since_key;
+ unsigned int key_frame_frequency;
+ unsigned int this_key_frame_forced;
+ unsigned int next_key_frame_forced;
+
+  // Ambient reconstruction error target for forced key frames
+ int ambient_err;
+
+ unsigned int mode_check_freq[MAX_MODES];
+ unsigned int mode_test_hit_counts[MAX_MODES];
+ unsigned int mode_chosen_counts[MAX_MODES];
+
+ int rd_thresh_mult[MAX_MODES];
+ int rd_baseline_thresh[MAX_MODES];
+ int rd_threshes[MAX_MODES];
+ int64_t rd_comp_pred_diff[NB_PREDICTION_TYPES];
+ int rd_prediction_type_threshes[4][NB_PREDICTION_TYPES];
+ int comp_pred_count[COMP_PRED_CONTEXTS];
+ int single_pred_count[COMP_PRED_CONTEXTS];
+ // FIXME contextualize
+ int txfm_count[TX_SIZE_MAX];
+ int txfm_count_8x8p[TX_SIZE_MAX - 1];
+ int64_t rd_tx_select_diff[NB_TXFM_MODES];
+ int rd_tx_select_threshes[4][NB_TXFM_MODES];
+
+ int RDMULT;
+ int RDDIV;
+
+ CODING_CONTEXT coding_context;
+
+  // Rate targeting variables
+ int64_t prediction_error;
+ int64_t last_prediction_error;
+ int64_t intra_error;
+ int64_t last_intra_error;
+
+ int this_frame_target;
+ int projected_frame_size;
+ int last_q[2]; // Separate values for Intra/Inter
+ int last_boosted_qindex; // Last boosted GF/KF/ARF q
+
+ double rate_correction_factor;
+ double key_frame_rate_correction_factor;
+ double gf_rate_correction_factor;
+
+ int frames_till_gf_update_due; // Count down till next GF
+ int current_gf_interval; // GF interval chosen when we coded the last GF
+
+  int gf_overspend_bits; // Total bits overspent because of GF boost (cumulative)
+
+ int non_gf_bitrate_adjustment; // Used in the few frames following a GF to recover the extra bits spent in that GF
+
+ int kf_overspend_bits; // Extra bits spent on key frames that need to be recovered on inter frames
+  int kf_bitrate_adjustment; // Current number of bits to try to recover on each inter frame.
+ int max_gf_interval;
+ int baseline_gf_interval;
+ int active_arnr_frames; // <= cpi->oxcf.arnr_max_frames
+
+ int64_t key_frame_count;
+ int prior_key_frame_distance[KEY_FRAME_CONTEXT];
+ int per_frame_bandwidth; // Current section per frame bandwidth target
+ int av_per_frame_bandwidth; // Average frame size target for clip
+ int min_frame_bandwidth; // Minimum allocation that should be used for any frame
+ int inter_frame_target;
+ double output_frame_rate;
+ int64_t last_time_stamp_seen;
+ int64_t last_end_time_stamp_seen;
+ int64_t first_time_stamp_ever;
+
+ int ni_av_qi;
+ int ni_tot_qi;
+ int ni_frames;
+ int avg_frame_qindex;
+ double tot_q;
+ double avg_q;
+
+ int zbin_over_quant;
+ int zbin_mode_boost;
+ int zbin_mode_boost_enabled;
+
+ int64_t total_byte_count;
+
+ int buffered_mode;
+
+ int buffer_level;
+ int bits_off_target;
+
+ int rolling_target_bits;
+ int rolling_actual_bits;
+
+ int long_rolling_target_bits;
+ int long_rolling_actual_bits;
+
+ int64_t total_actual_bits;
+ int total_target_vs_actual; // debug stats
+
+ int worst_quality;
+ int active_worst_quality;
+ int best_quality;
+ int active_best_quality;
+
+ int cq_target_quality;
+
+#if CONFIG_SUPERBLOCKS
+ int sb_count;
+ int sb_ymode_count [VP9_I32X32_MODES];
+#endif
+ int ymode_count [VP9_YMODES]; /* intra MB type cts this frame */
+ int bmode_count [VP9_BINTRAMODES];
+ int i8x8_mode_count [VP9_I8X8_MODES];
+ int sub_mv_ref_count [SUBMVREF_COUNT][VP9_SUBMVREFS];
+ int mbsplit_count [VP9_NUMMBSPLITS];
+ // int uv_mode_count[VP9_UV_MODES]; /* intra MB type cts this frame */
+ int y_uv_mode_count[VP9_YMODES][VP9_UV_MODES];
+
+ nmv_context_counts NMVcount;
+
+ unsigned int coef_counts [BLOCK_TYPES] [COEF_BANDS] [PREV_COEF_CONTEXTS] [MAX_ENTROPY_TOKENS]; /* for this frame */
+ vp9_prob frame_coef_probs [BLOCK_TYPES] [COEF_BANDS] [PREV_COEF_CONTEXTS] [ENTROPY_NODES];
+ unsigned int frame_branch_ct [BLOCK_TYPES] [COEF_BANDS] [PREV_COEF_CONTEXTS] [ENTROPY_NODES][2];
+ unsigned int hybrid_coef_counts [BLOCK_TYPES] [COEF_BANDS] [PREV_COEF_CONTEXTS] [MAX_ENTROPY_TOKENS]; /* for this frame */
+ vp9_prob frame_hybrid_coef_probs [BLOCK_TYPES] [COEF_BANDS] [PREV_COEF_CONTEXTS] [ENTROPY_NODES];
+ unsigned int frame_hybrid_branch_ct [BLOCK_TYPES] [COEF_BANDS] [PREV_COEF_CONTEXTS] [ENTROPY_NODES][2];
+
+ unsigned int coef_counts_8x8 [BLOCK_TYPES_8X8] [COEF_BANDS] [PREV_COEF_CONTEXTS] [MAX_ENTROPY_TOKENS]; /* for this frame */
+ vp9_prob frame_coef_probs_8x8 [BLOCK_TYPES_8X8] [COEF_BANDS] [PREV_COEF_CONTEXTS] [ENTROPY_NODES];
+ unsigned int frame_branch_ct_8x8 [BLOCK_TYPES_8X8] [COEF_BANDS] [PREV_COEF_CONTEXTS] [ENTROPY_NODES][2];
+ unsigned int hybrid_coef_counts_8x8 [BLOCK_TYPES_8X8] [COEF_BANDS] [PREV_COEF_CONTEXTS] [MAX_ENTROPY_TOKENS]; /* for this frame */
+ vp9_prob frame_hybrid_coef_probs_8x8 [BLOCK_TYPES_8X8] [COEF_BANDS] [PREV_COEF_CONTEXTS] [ENTROPY_NODES];
+ unsigned int frame_hybrid_branch_ct_8x8 [BLOCK_TYPES_8X8] [COEF_BANDS] [PREV_COEF_CONTEXTS] [ENTROPY_NODES][2];
+
+ unsigned int coef_counts_16x16 [BLOCK_TYPES_16X16] [COEF_BANDS] [PREV_COEF_CONTEXTS] [MAX_ENTROPY_TOKENS]; /* for this frame */
+ vp9_prob frame_coef_probs_16x16 [BLOCK_TYPES_16X16] [COEF_BANDS] [PREV_COEF_CONTEXTS] [ENTROPY_NODES];
+ unsigned int frame_branch_ct_16x16 [BLOCK_TYPES_16X16] [COEF_BANDS] [PREV_COEF_CONTEXTS] [ENTROPY_NODES][2];
+ unsigned int hybrid_coef_counts_16x16 [BLOCK_TYPES_16X16] [COEF_BANDS] [PREV_COEF_CONTEXTS] [MAX_ENTROPY_TOKENS]; /* for this frame */
+ vp9_prob frame_hybrid_coef_probs_16x16 [BLOCK_TYPES_16X16] [COEF_BANDS] [PREV_COEF_CONTEXTS] [ENTROPY_NODES];
+ unsigned int frame_hybrid_branch_ct_16x16 [BLOCK_TYPES_16X16] [COEF_BANDS] [PREV_COEF_CONTEXTS] [ENTROPY_NODES][2];
+
+ int gfu_boost;
+ int last_boost;
+ int kf_boost;
+ int kf_zeromotion_pct;
+
+ int target_bandwidth;
+ struct vpx_codec_pkt_list *output_pkt_list;
+
+#if 0
+ // Experimental code for lagged and one pass
+ ONEPASS_FRAMESTATS one_pass_frame_stats[MAX_LAG_BUFFERS];
+ int one_pass_frame_index;
+#endif
+ MBGRAPH_FRAME_STATS mbgraph_stats[MAX_LAG_BUFFERS];
+ int mbgraph_n_frames; // number of frames filled in the above
+ int static_mb_pct; // % forced skip mbs by segmentation
+ int seg0_progress, seg0_idx, seg0_cnt;
+ int ref_pred_count[3][2];
+
+ int decimation_factor;
+ int decimation_count;
+
+ // for real time encoding
+ int avg_encode_time; // microsecond
+ int avg_pick_mode_time; // microsecond
+ int Speed;
+  unsigned int cpu_freq; // MHz
+ int compressor_speed;
+
+ int interquantizer;
+ int goldfreq;
+ int auto_worst_q;
+ int cpu_used;
+ int horiz_scale;
+ int vert_scale;
+ int pass;
+
+ vp9_prob last_skip_false_probs[3][MBSKIP_CONTEXTS];
+ int last_skip_probs_q[3];
+
+ int recent_ref_frame_usage[MAX_REF_FRAMES];
+ int count_mb_ref_frame_usage[MAX_REF_FRAMES];
+ int ref_frame_flags;
+
+ unsigned char ref_pred_probs_update[PREDICTION_PROBS];
+
+ SPEED_FEATURES sf;
+ int error_bins[1024];
+
+ // Data used for real time conferencing mode to help determine if it would be good to update the gf
+ int inter_zz_count;
+ int gf_bad_count;
+ int gf_update_recommended;
+ int skip_true_count[3];
+ int skip_false_count[3];
+
+ unsigned char *segmentation_map;
+
+  // segment threshold for encode breakout
+ int segment_encode_breakout[MAX_MB_SEGMENTS];
+
+ unsigned char *active_map;
+ unsigned int active_map_enabled;
+
+ TOKENLIST *tplist;
+
+ fractional_mv_step_fp *find_fractional_mv_step;
+ vp9_full_search_fn_t full_search_sad;
+ vp9_refining_search_fn_t refining_search_sad;
+ vp9_diamond_search_fn_t diamond_search_sad;
+ vp9_variance_fn_ptr_t fn_ptr[BLOCK_MAX_SB_SEGMENTS];
+ uint64_t time_receive_data;
+ uint64_t time_compress_data;
+ uint64_t time_pick_lpf;
+ uint64_t time_encode_mb_row;
+
+ int base_skip_false_prob[QINDEX_RANGE][3];
+
+ struct twopass_rc {
+ unsigned int section_intra_rating;
+ unsigned int next_iiratio;
+ unsigned int this_iiratio;
+ FIRSTPASS_STATS *total_stats;
+ FIRSTPASS_STATS *this_frame_stats;
+ FIRSTPASS_STATS *stats_in, *stats_in_end, *stats_in_start;
+ FIRSTPASS_STATS *total_left_stats;
+ int first_pass_done;
+ int64_t bits_left;
+ int64_t clip_bits_total;
+ double avg_iiratio;
+ double modified_error_total;
+ double modified_error_used;
+ double modified_error_left;
+ double kf_intra_err_min;
+ double gf_intra_err_min;
+ int frames_to_key;
+ int maxq_max_limit;
+ int maxq_min_limit;
+ int static_scene_max_gf_interval;
+ int kf_bits;
+ int gf_group_error_left; // Remaining error from uncoded frames in a gf group. Two pass use only
+
+ // Projected total bits available for a key frame group of frames
+ int64_t kf_group_bits;
+
+ // Error score of frames still to be coded in kf group
+ int64_t kf_group_error_left;
+
+ int gf_group_bits; // Projected Bits available for a group of frames including 1 GF or ARF
+ int gf_bits; // Bits for the golden frame or ARF - 2 pass only
+ int alt_extra_bits;
+
+ int sr_update_lag;
+ double est_max_qcorrection_factor;
+ } twopass;
+
+#if CONFIG_RUNTIME_CPU_DETECT
+ VP9_ENCODER_RTCD rtcd;
+#endif
+#if VP9_TEMPORAL_ALT_REF
+ YV12_BUFFER_CONFIG alt_ref_buffer;
+ YV12_BUFFER_CONFIG *frames[MAX_LAG_BUFFERS];
+ int fixed_divide[512];
+#endif
+
+#if CONFIG_INTERNAL_STATS
+ int count;
+ double total_y;
+ double total_u;
+ double total_v;
+ double total;
+ double total_sq_error;
+ double totalp_y;
+ double totalp_u;
+ double totalp_v;
+ double totalp;
+ double total_sq_error2;
+ int bytes;
+ double summed_quality;
+ double summed_weights;
+ unsigned int tot_recode_hits;
+
+
+ double total_ssimg_y;
+ double total_ssimg_u;
+ double total_ssimg_v;
+ double total_ssimg_all;
+
+ int b_calculate_ssimg;
+#endif
+ int b_calculate_psnr;
+
+ // Per MB activity measurement
+ unsigned int activity_avg;
+ unsigned int *mb_activity_map;
+ int *mb_norm_activity_map;
+
+ // Record of which MBs still refer to last golden frame either
+ // directly or through 0,0
+ unsigned char *gf_active_flags;
+ int gf_active_count;
+
+ int output_partition;
+
+ // Store last frame's MV info for next frame MV prediction
+ int_mv *lfmv;
+ int *lf_ref_frame_sign_bias;
+ int *lf_ref_frame;
+
+ /* force next frame to intra when kf_auto says so */
+ int force_next_frame_intra;
+
+ int droppable;
+
+ // TODO Do we still need this??
+ int update_context;
+
+ int dummy_packing; /* flag to indicate if packing is dummy */
+
+#if CONFIG_PRED_FILTER
+ int pred_filter_on_count;
+ int pred_filter_off_count;
+#endif
+ unsigned int switchable_interp_count[VP9_SWITCHABLE_FILTERS + 1]
+ [VP9_SWITCHABLE_FILTERS];
+
+#if CONFIG_NEW_MVREF
+ unsigned int best_ref_index_counts[MAX_REF_FRAMES][MAX_MV_REFS];
+#endif
+
+} VP9_COMP;
+
+void vp9_encode_frame(VP9_COMP *cpi);
+
+void vp9_pack_bitstream(VP9_COMP *cpi, unsigned char *dest,
+ unsigned long *size);
+
+void vp9_activity_masking(VP9_COMP *cpi, MACROBLOCK *x);
+
+void vp9_tokenize_mb(VP9_COMP *, MACROBLOCKD *, TOKENEXTRA **, int dry_run);
+void vp9_stuff_mb(VP9_COMP *cpi, MACROBLOCKD *xd, TOKENEXTRA **t, int dry_run);
+
+void vp9_set_speed_features(VP9_COMP *cpi);
+
+#if CONFIG_DEBUG
+#define CHECK_MEM_ERROR(lval,expr) do {\
+ lval = (expr); \
+ if(!lval) \
+ vpx_internal_error(&cpi->common.error, VPX_CODEC_MEM_ERROR,\
+ "Failed to allocate "#lval" at %s:%d", \
+ __FILE__,__LINE__);\
+ } while(0)
+#else
+#define CHECK_MEM_ERROR(lval,expr) do {\
+ lval = (expr); \
+ if(!lval) \
+ vpx_internal_error(&cpi->common.error, VPX_CODEC_MEM_ERROR,\
+ "Failed to allocate "#lval);\
+ } while(0)
+#endif
+#endif // __INC_ONYX_INT_H
--- /dev/null
+++ b/vp9/encoder/picklpf.c
@@ -1,0 +1,420 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "vp9/common/onyxc_int.h"
+#include "onyx_int.h"
+#include "quantize.h"
+#include "vpx_mem/vpx_mem.h"
+#include "vpx_scale/yv12extend.h"
+#include "vpx_scale/vpxscale.h"
+#include "vp9/common/alloccommon.h"
+#include "vp9/common/loopfilter.h"
+#if ARCH_ARM
+#include "vpx_ports/arm.h"
+#endif
+
+extern int vp9_calc_ss_err(YV12_BUFFER_CONFIG *source,
+ YV12_BUFFER_CONFIG *dest);
+#if HAVE_ARMV7
+extern void vp8_yv12_copy_frame_yonly_no_extend_frame_borders_neon(YV12_BUFFER_CONFIG *src_ybc, YV12_BUFFER_CONFIG *dst_ybc);
+#endif
+
+#if CONFIG_RUNTIME_CPU_DETECT
+#define IF_RTCD(x) (x)
+#else
+#define IF_RTCD(x) NULL
+#endif
+
+extern void(*vp9_yv12_copy_partial_frame_ptr)(YV12_BUFFER_CONFIG *src_ybc,
+ YV12_BUFFER_CONFIG *dst_ybc,
+ int fraction);
+
+void vp9_yv12_copy_partial_frame(YV12_BUFFER_CONFIG *src_ybc,
+ YV12_BUFFER_CONFIG *dst_ybc, int Fraction) {
+ unsigned char *src_y, *dst_y;
+ int yheight;
+ int ystride;
+ int border;
+ int yoffset;
+ int linestocopy;
+
+ border = src_ybc->border;
+ yheight = src_ybc->y_height;
+ ystride = src_ybc->y_stride;
+
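+  // Copy only a horizontal band of the Y plane: roughly (yheight >> Fraction)
+  // rows, rounded down to a multiple of 16, starting just above the vertical
+  // centre of the frame.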
+ linestocopy = (yheight >> (Fraction + 4));
+
+ if (linestocopy < 1)
+ linestocopy = 1;
+
+ linestocopy <<= 4;
+
+ yoffset = ystride * ((yheight >> 5) * 16 - 8);
+ src_y = src_ybc->y_buffer + yoffset;
+ dst_y = dst_ybc->y_buffer + yoffset;
+
+ vpx_memcpy(dst_y, src_y, ystride * (linestocopy + 16));
+}
+
+static int calc_partial_ssl_err(YV12_BUFFER_CONFIG *source,
+ YV12_BUFFER_CONFIG *dest, int Fraction) {
+ int i, j;
+ int Total = 0;
+ int srcoffset, dstoffset;
+ unsigned char *src = source->y_buffer;
+ unsigned char *dst = dest->y_buffer;
+
+ int linestocopy = (source->y_height >> (Fraction + 4));
+
+ if (linestocopy < 1)
+ linestocopy = 1;
+
+ linestocopy <<= 4;
+
+
+ srcoffset = source->y_stride * (dest->y_height >> 5) * 16;
+ dstoffset = dest->y_stride * (dest->y_height >> 5) * 16;
+
+ src += srcoffset;
+ dst += dstoffset;
+
+  // Loop through the Y plane of the raw and reconstructed data, summing squared differences
+ for (i = 0; i < linestocopy; i += 16) {
+ for (j = 0; j < source->y_width; j += 16) {
+ unsigned int sse;
+ Total += vp9_mse16x16(src + j, source->y_stride, dst + j, dest->y_stride,
+ &sse);
+ }
+
+ src += 16 * source->y_stride;
+ dst += 16 * dest->y_stride;
+ }
+
+ return Total;
+}
+
+// Enforce a minimum filter level based upon baseline Q
+static int get_min_filter_level(VP9_COMP *cpi, int base_qindex) {
+ int min_filter_level;
+ /*int q = (int) vp9_convert_qindex_to_q(base_qindex);
+
+ if (cpi->source_alt_ref_active && cpi->common.refresh_golden_frame && !cpi->common.refresh_alt_ref_frame)
+ min_filter_level = 0;
+ else
+ {
+ if (q <= 10)
+ min_filter_level = 0;
+ else if (q <= 64)
+ min_filter_level = 1;
+ else
+ min_filter_level = (q >> 6);
+ }
+ */
+ min_filter_level = 0;
+
+ return min_filter_level;
+}
+
+// Enforce a maximum filter level based upon baseline Q
+static int get_max_filter_level(VP9_COMP *cpi, int base_qindex) {
+ // PGW August 2006: Highest filter values almost always a bad idea
+
+  // jbb chg: 20100118 - no longer the case with the overquant changes; allow high values
+  // when a lot of intra content is coming in.
+ int max_filter_level = MAX_LOOP_FILTER;// * 3 / 4;
+ (void)base_qindex;
+
+ if (cpi->twopass.section_intra_rating > 8)
+ max_filter_level = MAX_LOOP_FILTER * 3 / 4;
+
+ return max_filter_level;
+}
+
+void vp9_pick_filter_level_fast(YV12_BUFFER_CONFIG *sd, VP9_COMP *cpi) {
+ VP9_COMMON *cm = &cpi->common;
+
+ int best_err = 0;
+ int filt_err = 0;
+ int min_filter_level = get_min_filter_level(cpi, cm->base_qindex);
+ int max_filter_level = get_max_filter_level(cpi, cm->base_qindex);
+ int filt_val;
+ int best_filt_val = cm->filter_level;
+
+ // Make a copy of the unfiltered / processed recon buffer
+ vp9_yv12_copy_partial_frame_ptr(cm->frame_to_show, &cpi->last_frame_uf, 3);
+
+ if (cm->frame_type == KEY_FRAME)
+ cm->sharpness_level = 0;
+ else
+ cm->sharpness_level = cpi->oxcf.Sharpness;
+
+ if (cm->sharpness_level != cm->last_sharpness_level) {
+ vp9_loop_filter_update_sharpness(&cm->lf_info, cm->sharpness_level);
+ cm->last_sharpness_level = cm->sharpness_level;
+ }
+
+ // Start the search at the previous frame filter level unless it is now out of range.
+ if (cm->filter_level < min_filter_level)
+ cm->filter_level = min_filter_level;
+ else if (cm->filter_level > max_filter_level)
+ cm->filter_level = max_filter_level;
+
+ filt_val = cm->filter_level;
+ best_filt_val = filt_val;
+
+ // Get the err using the previous frame's filter value.
+ vp9_loop_filter_partial_frame(cm, &cpi->mb.e_mbd, filt_val);
+
+ best_err = calc_partial_ssl_err(sd, cm->frame_to_show, 3);
+
+ // Re-instate the unfiltered frame
+ vp9_yv12_copy_partial_frame_ptr(&cpi->last_frame_uf, cm->frame_to_show, 3);
+
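+  // Step down one filter level (two levels when the current value is above 10)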
+ filt_val -= (1 + ((filt_val > 10) ? 1 : 0));
+
+ // Search lower filter levels
+ while (filt_val >= min_filter_level) {
+ // Apply the loop filter
+ vp9_loop_filter_partial_frame(cm, &cpi->mb.e_mbd, filt_val);
+
+ // Get the err for filtered frame
+ filt_err = calc_partial_ssl_err(sd, cm->frame_to_show, 3);
+
+ // Re-instate the unfiltered frame
+ vp9_yv12_copy_partial_frame_ptr(&cpi->last_frame_uf, cm->frame_to_show, 3);
+
+
+ // Update the best case record or exit loop.
+ if (filt_err < best_err) {
+ best_err = filt_err;
+ best_filt_val = filt_val;
+ } else
+ break;
+
+ // Adjust filter level
+ filt_val -= (1 + ((filt_val > 10) ? 1 : 0));
+ }
+
+ // Search up (note that we have already done filt_val = cm->filter_level)
+ filt_val = cm->filter_level + (1 + ((filt_val > 10) ? 1 : 0));
+
+ if (best_filt_val == cm->filter_level) {
+ // Resist raising filter level for very small gains
+ best_err -= (best_err >> 10);
+
+ while (filt_val < max_filter_level) {
+ // Apply the loop filter
+ vp9_loop_filter_partial_frame(cm, &cpi->mb.e_mbd, filt_val);
+
+ // Get the err for filtered frame
+ filt_err = calc_partial_ssl_err(sd, cm->frame_to_show, 3);
+
+ // Re-instate the unfiltered frame
+ vp9_yv12_copy_partial_frame_ptr(&cpi->last_frame_uf,
+ cm->frame_to_show, 3);
+
+ // Update the best case record or exit loop.
+ if (filt_err < best_err) {
+        // Do not raise filter level if improvement is < 1 part in 1024
+ best_err = filt_err - (filt_err >> 10);
+
+ best_filt_val = filt_val;
+ } else
+ break;
+
+ // Adjust filter level
+ filt_val += (1 + ((filt_val > 10) ? 1 : 0));
+ }
+ }
+
+ cm->filter_level = best_filt_val;
+
+ if (cm->filter_level < min_filter_level)
+ cm->filter_level = min_filter_level;
+
+ if (cm->filter_level > max_filter_level)
+ cm->filter_level = max_filter_level;
+}
+
+// Stub function for now; Alt LF not used
+void vp9_set_alt_lf_level(VP9_COMP *cpi, int filt_val) {
+}
+
+void vp9_pick_filter_level(YV12_BUFFER_CONFIG *sd, VP9_COMP *cpi) {
+ VP9_COMMON *cm = &cpi->common;
+
+ int best_err = 0;
+ int filt_err = 0;
+ int min_filter_level = get_min_filter_level(cpi, cm->base_qindex);
+ int max_filter_level = get_max_filter_level(cpi, cm->base_qindex);
+
+ int filter_step;
+ int filt_high = 0;
+ int filt_mid = cm->filter_level; // Start search at previous frame filter level
+ int filt_low = 0;
+ int filt_best;
+ int filt_direction = 0;
+
+ int Bias = 0; // Bias against raising loop filter and in favour of lowering it
+
+ // Make a copy of the unfiltered / processed recon buffer
+#if HAVE_ARMV7
+#if CONFIG_RUNTIME_CPU_DETECT
+ if (cm->rtcd.flags & HAS_NEON)
+#endif
+ {
+ vp8_yv12_copy_frame_yonly_no_extend_frame_borders_neon(cm->frame_to_show, &cpi->last_frame_uf);
+ }
+#if CONFIG_RUNTIME_CPU_DETECT
+ else
+#endif
+#endif
+#if !HAVE_ARMV7 || CONFIG_RUNTIME_CPU_DETECT
+ {
+ vp8_yv12_copy_frame_ptr(cm->frame_to_show, &cpi->last_frame_uf);
+ }
+#endif
+
+ if (cm->frame_type == KEY_FRAME)
+ cm->sharpness_level = 0;
+ else
+ cm->sharpness_level = cpi->oxcf.Sharpness;
+
+ // Start the search at the previous frame filter level unless it is now out of range.
+ filt_mid = cm->filter_level;
+
+ if (filt_mid < min_filter_level)
+ filt_mid = min_filter_level;
+ else if (filt_mid > max_filter_level)
+ filt_mid = max_filter_level;
+
+ // Define the initial step size
+ filter_step = (filt_mid < 16) ? 4 : filt_mid / 4;
+
+ // Get baseline error score
+ vp9_set_alt_lf_level(cpi, filt_mid);
+ vp9_loop_filter_frame_yonly(cm, &cpi->mb.e_mbd, filt_mid);
+
+ best_err = vp9_calc_ss_err(sd, cm->frame_to_show);
+ filt_best = filt_mid;
+
+ // Re-instate the unfiltered frame
+#if HAVE_ARMV7
+#if CONFIG_RUNTIME_CPU_DETECT
+ if (cm->rtcd.flags & HAS_NEON)
+#endif
+ {
+ vp8_yv12_copy_frame_yonly_no_extend_frame_borders_neon(&cpi->last_frame_uf, cm->frame_to_show);
+ }
+#if CONFIG_RUNTIME_CPU_DETECT
+ else
+#endif
+#endif
+#if !HAVE_ARMV7 || CONFIG_RUNTIME_CPU_DETECT
+ {
+ vp8_yv12_copy_frame_yonly_ptr(&cpi->last_frame_uf, cm->frame_to_show);
+ }
+#endif
+
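+  // Search for the best filter level: test one step above and below the
+  // current mid point, move the mid point towards whichever reduces the
+  // error, and halve the step size whenever the mid point remains best.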
+ while (filter_step > 0) {
+ Bias = (best_err >> (15 - (filt_mid / 8))) * filter_step; // PGW change 12/12/06 for small images
+
+    // jbb chg: 20100118 - in sections with lots of new material coming in, don't bias as much towards a low filter value
+ if (cpi->twopass.section_intra_rating < 20)
+ Bias = Bias * cpi->twopass.section_intra_rating / 20;
+
+ // yx, bias less for large block size
+ if (cpi->common.txfm_mode != ONLY_4X4)
+ Bias >>= 1;
+
+ filt_high = ((filt_mid + filter_step) > max_filter_level) ? max_filter_level : (filt_mid + filter_step);
+ filt_low = ((filt_mid - filter_step) < min_filter_level) ? min_filter_level : (filt_mid - filter_step);
+
+ if ((filt_direction <= 0) && (filt_low != filt_mid)) {
+ // Get Low filter error score
+ vp9_set_alt_lf_level(cpi, filt_low);
+ vp9_loop_filter_frame_yonly(cm, &cpi->mb.e_mbd, filt_low);
+
+ filt_err = vp9_calc_ss_err(sd, cm->frame_to_show);
+
+ // Re-instate the unfiltered frame
+#if HAVE_ARMV7
+#if CONFIG_RUNTIME_CPU_DETECT
+ if (cm->rtcd.flags & HAS_NEON)
+#endif
+ {
+ vp8_yv12_copy_frame_yonly_no_extend_frame_borders_neon(&cpi->last_frame_uf, cm->frame_to_show);
+ }
+#if CONFIG_RUNTIME_CPU_DETECT
+ else
+#endif
+#endif
+#if !HAVE_ARMV7 || CONFIG_RUNTIME_CPU_DETECT
+ {
+ vp8_yv12_copy_frame_yonly_ptr(&cpi->last_frame_uf, cm->frame_to_show);
+ }
+#endif
+
+ // If value is close to the best so far then bias towards a lower loop filter value.
+ if ((filt_err - Bias) < best_err) {
+ // Was it actually better than the previous best?
+ if (filt_err < best_err)
+ best_err = filt_err;
+
+ filt_best = filt_low;
+ }
+ }
+
+ // Now look at filt_high
+ if ((filt_direction >= 0) && (filt_high != filt_mid)) {
+ vp9_set_alt_lf_level(cpi, filt_high);
+ vp9_loop_filter_frame_yonly(cm, &cpi->mb.e_mbd, filt_high);
+
+ filt_err = vp9_calc_ss_err(sd, cm->frame_to_show);
+
+ // Re-instate the unfiltered frame
+#if HAVE_ARMV7
+#if CONFIG_RUNTIME_CPU_DETECT
+ if (cm->rtcd.flags & HAS_NEON)
+#endif
+ {
+ vp8_yv12_copy_frame_yonly_no_extend_frame_borders_neon(&cpi->last_frame_uf, cm->frame_to_show);
+ }
+#if CONFIG_RUNTIME_CPU_DETECT
+ else
+#endif
+#endif
+#if !HAVE_ARMV7 || CONFIG_RUNTIME_CPU_DETECT
+ {
+ vp8_yv12_copy_frame_yonly_ptr(&cpi->last_frame_uf, cm->frame_to_show);
+ }
+#endif
+
+ // Was it better than the previous best?
+ if (filt_err < (best_err - Bias)) {
+ best_err = filt_err;
+ filt_best = filt_high;
+ }
+ }
+
+ // Half the step distance if the best filter value was the same as last time
+ if (filt_best == filt_mid) {
+ filter_step = filter_step / 2;
+ filt_direction = 0;
+ } else {
+ filt_direction = (filt_best < filt_mid) ? -1 : 1;
+ filt_mid = filt_best;
+ }
+ }
+
+ cm->filter_level = filt_best;
+}
+
--- /dev/null
+++ b/vp9/encoder/ppc/csystemdependent.c
@@ -1,0 +1,155 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "vp9/encoder/variance.h"
+#include "vp9/encoder/onyx_int.h"
+
+SADFunction *vp9_sad16x16;
+SADFunction *vp9_sad16x8;
+SADFunction *vp9_sad8x16;
+SADFunction *vp9_sad8x8;
+SADFunction *vp9_sad4x4;
+
+variance_function *vp9_variance4x4;
+variance_function *vp9_variance8x8;
+variance_function *vp9_variance8x16;
+variance_function *vp9_variance16x8;
+variance_function *vp9_variance16x16;
+
+variance_function *vp9_mse16x16;
+
+sub_pixel_variance_function *vp9_sub_pixel_variance4x4;
+sub_pixel_variance_function *vp9_sub_pixel_variance8x8;
+sub_pixel_variance_function *vp9_sub_pixel_variance8x16;
+sub_pixel_variance_function *vp9_sub_pixel_variance16x8;
+sub_pixel_variance_function *vp9_sub_pixel_variance16x16;
+
+int (*vp9_block_error)(short *coeff, short *dqcoeff);
+int (*vp9_mbblock_error)(MACROBLOCK *mb, int dc);
+
+int (*vp9_mbuverror)(MACROBLOCK *mb);
+unsigned int (*vp9_get_mb_ss)(short *);
+void (*vp9_short_fdct4x4)(short *input, short *output, int pitch);
+void (*vp9_short_fdct8x4)(short *input, short *output, int pitch);
+void (*vp8_fast_fdct4x4)(short *input, short *output, int pitch);
+void (*vp8_fast_fdct8x4)(short *input, short *output, int pitch);
+void (*short_walsh4x4)(short *input, short *output, int pitch);
+
+void (*vp9_subtract_b)(BLOCK *be, BLOCKD *bd, int pitch);
+void (*vp9_subtract_mby)(short *diff, unsigned char *src, unsigned char *pred, int stride);
+void (*vp9_subtract_mbuv)(short *diff, unsigned char *usrc, unsigned char *vsrc, unsigned char *pred, int stride);
+void (*vp8_fast_quantize_b)(BLOCK *b, BLOCKD *d);
+
+// c imports
+extern int block_error_c(short *coeff, short *dqcoeff);
+extern int vp9_mbblock_error_c(MACROBLOCK *mb, int dc);
+
+extern int vp9_mbuverror_c(MACROBLOCK *mb);
+extern unsigned int vp8_get8x8var_c(unsigned char *src_ptr, int source_stride, unsigned char *ref_ptr, int recon_stride, unsigned int *SSE, int *Sum);
+extern void short_fdct4x4_c(short *input, short *output, int pitch);
+extern void short_fdct8x4_c(short *input, short *output, int pitch);
+extern void vp9_short_walsh4x4_c(short *input, short *output, int pitch);
+
+extern void vp9_subtract_b_c(BLOCK *be, BLOCKD *bd, int pitch);
+extern void subtract_mby_c(short *diff, unsigned char *src, unsigned char *pred, int stride);
+extern void subtract_mbuv_c(short *diff, unsigned char *usrc, unsigned char *vsrc, unsigned char *pred, int stride);
+extern void vp8_fast_quantize_b_c(BLOCK *b, BLOCKD *d);
+
+extern SADFunction sad16x16_c;
+extern SADFunction sad16x8_c;
+extern SADFunction sad8x16_c;
+extern SADFunction sad8x8_c;
+extern SADFunction sad4x4_c;
+
+extern variance_function variance16x16_c;
+extern variance_function variance8x16_c;
+extern variance_function variance16x8_c;
+extern variance_function variance8x8_c;
+extern variance_function variance4x4_c;
+extern variance_function mse16x16_c;
+
+extern sub_pixel_variance_function sub_pixel_variance4x4_c;
+extern sub_pixel_variance_function sub_pixel_variance8x8_c;
+extern sub_pixel_variance_function sub_pixel_variance8x16_c;
+extern sub_pixel_variance_function sub_pixel_variance16x8_c;
+extern sub_pixel_variance_function sub_pixel_variance16x16_c;
+
+extern unsigned int vp9_get_mb_ss_c(short *);
+
+// ppc
+extern int vp9_block_error_ppc(short *coeff, short *dqcoeff);
+
+extern void vp9_short_fdct4x4_ppc(short *input, short *output, int pitch);
+extern void vp9_short_fdct8x4_ppc(short *input, short *output, int pitch);
+
+extern void vp9_subtract_mby_ppc(short *diff, unsigned char *src, unsigned char *pred, int stride);
+extern void vp9_subtract_mbuv_ppc(short *diff, unsigned char *usrc, unsigned char *vsrc, unsigned char *pred, int stride);
+
+extern SADFunction vp9_sad16x16_ppc;
+extern SADFunction vp9_sad16x8_ppc;
+extern SADFunction vp9_sad8x16_ppc;
+extern SADFunction vp9_sad8x8_ppc;
+extern SADFunction vp9_sad4x4_ppc;
+
+extern variance_function vp9_variance16x16_ppc;
+extern variance_function vp9_variance8x16_ppc;
+extern variance_function vp9_variance16x8_ppc;
+extern variance_function vp9_variance8x8_ppc;
+extern variance_function vp9_variance4x4_ppc;
+extern variance_function vp9_mse16x16_ppc;
+
+extern sub_pixel_variance_function vp9_sub_pixel_variance4x4_ppc;
+extern sub_pixel_variance_function vp9_sub_pixel_variance8x8_ppc;
+extern sub_pixel_variance_function vp9_sub_pixel_variance8x16_ppc;
+extern sub_pixel_variance_function vp9_sub_pixel_variance16x8_ppc;
+extern sub_pixel_variance_function vp9_sub_pixel_variance16x16_ppc;
+
+extern unsigned int vp8_get8x8var_ppc(unsigned char *src_ptr, int source_stride, unsigned char *ref_ptr, int recon_stride, unsigned int *SSE, int *Sum);
+extern unsigned int vp8_get16x16var_ppc(unsigned char *src_ptr, int source_stride, unsigned char *ref_ptr, int recon_stride, unsigned int *SSE, int *Sum);
+
+void vp9_cmachine_specific_config(void) {
+ // Pure C:
+ vp9_mbuverror = vp9_mbuverror_c;
+ vp8_fast_quantize_b = vp8_fast_quantize_b_c;
+ vp9_short_fdct4x4 = vp9_short_fdct4x4_ppc;
+ vp9_short_fdct8x4 = vp9_short_fdct8x4_ppc;
+ vp8_fast_fdct4x4 = vp9_short_fdct4x4_ppc;
+ vp8_fast_fdct8x4 = vp9_short_fdct8x4_ppc;
+ short_walsh4x4 = vp9_short_walsh4x4_c;
+
+ vp9_variance4x4 = vp9_variance4x4_ppc;
+ vp9_variance8x8 = vp9_variance8x8_ppc;
+ vp9_variance8x16 = vp9_variance8x16_ppc;
+ vp9_variance16x8 = vp9_variance16x8_ppc;
+ vp9_variance16x16 = vp9_variance16x16_ppc;
+ vp9_mse16x16 = vp9_mse16x16_ppc;
+
+ vp9_sub_pixel_variance4x4 = vp9_sub_pixel_variance4x4_ppc;
+ vp9_sub_pixel_variance8x8 = vp9_sub_pixel_variance8x8_ppc;
+ vp9_sub_pixel_variance8x16 = vp9_sub_pixel_variance8x16_ppc;
+ vp9_sub_pixel_variance16x8 = vp9_sub_pixel_variance16x8_ppc;
+ vp9_sub_pixel_variance16x16 = vp9_sub_pixel_variance16x16_ppc;
+
+ vp9_get_mb_ss = vp9_get_mb_ss_c;
+
+ vp9_sad16x16 = vp9_sad16x16_ppc;
+ vp9_sad16x8 = vp9_sad16x8_ppc;
+ vp9_sad8x16 = vp9_sad8x16_ppc;
+ vp9_sad8x8 = vp9_sad8x8_ppc;
+ vp9_sad4x4 = vp9_sad4x4_ppc;
+
+ vp9_block_error = vp9_block_error_ppc;
+ vp9_mbblock_error = vp9_mbblock_error_c;
+
+ vp9_subtract_b = vp9_subtract_b_c;
+ vp9_subtract_mby = vp9_subtract_mby_ppc;
+ vp9_subtract_mbuv = vp9_subtract_mbuv_ppc;
+}
--- /dev/null
+++ b/vp9/encoder/ppc/encodemb_altivec.asm
@@ -1,0 +1,153 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+ .globl vp8_subtract_mbuv_ppc
+ .globl vp8_subtract_mby_ppc
+
+;# r3 short *diff
+;# r4 unsigned char *usrc
+;# r5 unsigned char *vsrc
+;# r6 unsigned char *pred
+;# r7 int stride
+vp8_subtract_mbuv_ppc:
+ mfspr r11, 256 ;# get old VRSAVE
+ oris r12, r11, 0xf000
+ mtspr 256, r12 ;# set VRSAVE
+
+ li r9, 256
+ add r3, r3, r9
+ add r3, r3, r9
+ add r6, r6, r9
+
+ li r10, 16
+ li r9, 4
+ mtctr r9
+
+ vspltisw v0, 0
+
+mbu_loop:
+ lvsl v5, 0, r4 ;# permutate value for alignment
+ lvx v1, 0, r4 ;# src
+ lvx v2, 0, r6 ;# pred
+
+ add r4, r4, r7
+ addi r6, r6, 16
+
+ vperm v1, v1, v0, v5
+
+ vmrghb v3, v0, v1 ;# unpack high src to short
+ vmrghb v4, v0, v2 ;# unpack high pred to short
+
+ lvsl v5, 0, r4 ;# permutate value for alignment
+ lvx v1, 0, r4 ;# src
+
+ add r4, r4, r7
+
+ vsubshs v3, v3, v4
+
+ stvx v3, 0, r3 ;# store out diff
+
+ vperm v1, v1, v0, v5
+
+ vmrghb v3, v0, v1 ;# unpack high src to short
+ vmrglb v4, v0, v2 ;# unpack high pred to short
+
+ vsubshs v3, v3, v4
+
+ stvx v3, r10, r3 ;# store out diff
+
+ addi r3, r3, 32
+
+ bdnz mbu_loop
+
+ mtctr r9
+
+mbv_loop:
+ lvsl v5, 0, r5 ;# permutate value for alignment
+ lvx v1, 0, r5 ;# src
+ lvx v2, 0, r6 ;# pred
+
+ add r5, r5, r7
+ addi r6, r6, 16
+
+ vperm v1, v1, v0, v5
+
+ vmrghb v3, v0, v1 ;# unpack high src to short
+ vmrghb v4, v0, v2 ;# unpack high pred to short
+
+ lvsl v5, 0, r5 ;# permutate value for alignment
+ lvx v1, 0, r5 ;# src
+
+ add r5, r5, r7
+
+ vsubshs v3, v3, v4
+
+ stvx v3, 0, r3 ;# store out diff
+
+ vperm v1, v1, v0, v5
+
+ vmrghb v3, v0, v1 ;# unpack high src to short
+ vmrglb v4, v0, v2 ;# unpack high pred to short
+
+ vsubshs v3, v3, v4
+
+ stvx v3, r10, r3 ;# store out diff
+
+ addi r3, r3, 32
+
+ bdnz mbv_loop
+
+ mtspr 256, r11 ;# reset old VRSAVE
+
+ blr
+
+;# r3 short *diff
+;# r4 unsigned char *src
+;# r5 unsigned char *pred
+;# r6 int stride
+vp8_subtract_mby_ppc:
+ mfspr r11, 256 ;# get old VRSAVE
+ oris r12, r11, 0xf800
+ mtspr 256, r12 ;# set VRSAVE
+
+ li r10, 16
+ mtctr r10
+
+ vspltisw v0, 0
+
+mby_loop:
+ lvx v1, 0, r4 ;# src
+ lvx v2, 0, r5 ;# pred
+
+ add r4, r4, r6
+ addi r5, r5, 16
+
+ vmrghb v3, v0, v1 ;# unpack high src to short
+ vmrghb v4, v0, v2 ;# unpack high pred to short
+
+ vsubshs v3, v3, v4
+
+ stvx v3, 0, r3 ;# store out diff
+
+ vmrglb v3, v0, v1 ;# unpack low src to short
+ vmrglb v4, v0, v2 ;# unpack low pred to short
+
+ vsubshs v3, v3, v4
+
+ stvx v3, r10, r3 ;# store out diff
+
+ addi r3, r3, 32
+
+ bdnz mby_loop
+
+ mtspr 256, r11 ;# reset old VRSAVE
+
+ blr
--- /dev/null
+++ b/vp9/encoder/ppc/fdct_altivec.asm
@@ -1,0 +1,205 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+ .globl vp8_short_fdct4x4_ppc
+ .globl vp8_short_fdct8x4_ppc
+
+.macro load_c V, LABEL, OFF, R0, R1
+ lis \R0, \LABEL@ha
+ la \R1, \LABEL@l(\R0)
+ lvx \V, \OFF, \R1
+.endm
+
+;# Forward and inverse DCTs are nearly identical; only differences are
+;# in normalization (fwd is twice unitary, inv is half unitary)
+;# and that they are of course transposes of each other.
+;#
+;# The following three accomplish most of the implementation and
+;# are used only by ppc_idct.c and ppc_fdct.c.
+.macro prologue
+ mfspr r11, 256 ;# get old VRSAVE
+ oris r12, r11, 0xfffc
+ mtspr 256, r12 ;# set VRSAVE
+
+ stwu r1,-32(r1) ;# create space on the stack
+
+ li r6, 16
+
+ load_c v0, dct_tab, 0, r9, r10
+ lvx v1, r6, r10
+ addi r10, r10, 32
+ lvx v2, 0, r10
+ lvx v3, r6, r10
+
+ load_c v4, ppc_dctperm_tab, 0, r9, r10
+ load_c v5, ppc_dctperm_tab, r6, r9, r10
+
+ load_c v6, round_tab, 0, r10, r9
+.endm
+
+.macro epilogue
+ addi r1, r1, 32 ;# recover stack
+
+ mtspr 256, r11 ;# reset old VRSAVE
+.endm
+
+;# Do horiz xf on two rows of coeffs v8 = a0 a1 a2 a3 b0 b1 b2 b3.
+;# a/A are the even rows 0,2 b/B are the odd rows 1,3
+;# For fwd transform, indices are horizontal positions, then frequencies.
+;# For inverse transform, frequencies then positions.
+;# The two resulting A0..A3 B0..B3 are later combined
+;# and vertically transformed.
+
+.macro two_rows_horiz Dst
+ vperm v9, v8, v8, v4 ;# v9 = a2 a3 a0 a1 b2 b3 b0 b1
+
+ vmsumshm v10, v0, v8, v6
+ vmsumshm v10, v1, v9, v10
+ vsraw v10, v10, v7 ;# v10 = A0 A1 B0 B1
+
+ vmsumshm v11, v2, v8, v6
+ vmsumshm v11, v3, v9, v11
+ vsraw v11, v11, v7 ;# v11 = A2 A3 B2 B3
+
+ vpkuwum v10, v10, v11 ;# v10 = A0 A1 B0 B1 A2 A3 B2 B3
+ vperm \Dst, v10, v10, v5 ;# Dest = A0 B0 A1 B1 A2 B2 A3 B3
+.endm
+
+;# Vertical xf on two rows. DCT values in comments are for inverse transform;
+;# forward transform uses transpose.
+
+.macro two_rows_vert Ceven, Codd
+ vspltw v8, \Ceven, 0 ;# v8 = c00 c10 or c02 c12 four times
+ vspltw v9, \Codd, 0 ;# v9 = c20 c30 or c22 c32 ""
+ vmsumshm v8, v8, v12, v6
+ vmsumshm v8, v9, v13, v8
+ vsraw v10, v8, v7
+
+ vspltw v8, \Codd, 1 ;# v8 = c01 c11 or c03 c13
+ vspltw v9, \Ceven, 1 ;# v9 = c21 c31 or c23 c33
+ vmsumshm v8, v8, v12, v6
+ vmsumshm v8, v9, v13, v8
+ vsraw v8, v8, v7
+
+ vpkuwum v8, v10, v8 ;# v8 = rows 0,1 or 2,3
+.endm
+
+.macro two_rows_h Dest
+ stw r0, 0(r8)
+ lwz r0, 4(r3)
+ stw r0, 4(r8)
+ lwzux r0, r3,r5
+ stw r0, 8(r8)
+ lwz r0, 4(r3)
+ stw r0, 12(r8)
+ lvx v8, 0,r8
+ two_rows_horiz \Dest
+.endm
+
+ .align 2
+;# r3 short *input
+;# r4 short *output
+;# r5 int pitch
+vp8_short_fdct4x4_ppc:
+
+ prologue
+
+ vspltisw v7, 14 ;# == 14, fits in 5 signed bits
+ addi r8, r1, 0
+
+
+ lwz r0, 0(r3)
+ two_rows_h v12 ;# v12 = H00 H10 H01 H11 H02 H12 H03 H13
+
+ lwzux r0, r3, r5
+ two_rows_h v13 ;# v13 = H20 H30 H21 H31 H22 H32 H23 H33
+
+ lvx v6, r6, r9 ;# v6 = Vround
+ vspltisw v7, -16 ;# == 16 == -16, only low 5 bits matter
+
+ two_rows_vert v0, v1
+ stvx v8, 0, r4
+ two_rows_vert v2, v3
+ stvx v8, r6, r4
+
+ epilogue
+
+ blr
+
+ .align 2
+;# r3 short *input
+;# r4 short *output
+;# r5 int pitch
+vp8_short_fdct8x4_ppc:
+ prologue
+
+ vspltisw v7, 14 ;# == 14, fits in 5 signed bits
+ addi r8, r1, 0
+ addi r10, r3, 0
+
+ lwz r0, 0(r3)
+ two_rows_h v12 ;# v12 = H00 H10 H01 H11 H02 H12 H03 H13
+
+ lwzux r0, r3, r5
+ two_rows_h v13 ;# v13 = H20 H30 H21 H31 H22 H32 H23 H33
+
+ lvx v6, r6, r9 ;# v6 = Vround
+ vspltisw v7, -16 ;# == 16 == -16, only low 5 bits matter
+
+ two_rows_vert v0, v1
+ stvx v8, 0, r4
+ two_rows_vert v2, v3
+ stvx v8, r6, r4
+
+ ;# Next block
+ addi r3, r10, 8
+ addi r4, r4, 32
+ lvx v6, 0, r9 ;# v6 = Hround
+
+ vspltisw v7, 14 ;# == 14, fits in 5 signed bits
+ addi r8, r1, 0
+
+ lwz r0, 0(r3)
+ two_rows_h v12 ;# v12 = H00 H10 H01 H11 H02 H12 H03 H13
+
+ lwzux r0, r3, r5
+ two_rows_h v13 ;# v13 = H20 H30 H21 H31 H22 H32 H23 H33
+
+ lvx v6, r6, r9 ;# v6 = Vround
+ vspltisw v7, -16 ;# == 16 == -16, only low 5 bits matter
+
+ two_rows_vert v0, v1
+ stvx v8, 0, r4
+ two_rows_vert v2, v3
+ stvx v8, r6, r4
+
+ epilogue
+
+ blr
+
+ .data
+ .align 4
+ppc_dctperm_tab:
+ .byte 4,5,6,7, 0,1,2,3, 12,13,14,15, 8,9,10,11
+ .byte 0,1,4,5, 2,3,6,7, 8,9,12,13, 10,11,14,15
+
+ .align 4
+dct_tab:
+ .short 23170, 23170,-12540,-30274, 23170, 23170,-12540,-30274
+ .short 23170, 23170, 30274, 12540, 23170, 23170, 30274, 12540
+
+ .short 23170,-23170, 30274,-12540, 23170,-23170, 30274,-12540
+ .short -23170, 23170, 12540,-30274,-23170, 23170, 12540,-30274
+
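+;# Rounding constants added in before the downshift: half of 1 << 14 for the
+;# horizontal pass and half of 1 << 16 for the vertical pass.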
+ .align 4
+round_tab:
+ .long (1 << (14-1)), (1 << (14-1)), (1 << (14-1)), (1 << (14-1))
+ .long (1 << (16-1)), (1 << (16-1)), (1 << (16-1)), (1 << (16-1))
--- /dev/null
+++ b/vp9/encoder/ppc/rdopt_altivec.asm
@@ -1,0 +1,51 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+ .globl vp8_block_error_ppc
+
+ .align 2
+;# r3 short *Coeff
+;# r4 short *dqcoeff
+vp8_block_error_ppc:
+ mfspr r11, 256 ;# get old VRSAVE
+ oris r12, r11, 0xf800
+ mtspr 256, r12 ;# set VRSAVE
+
+ stwu r1,-32(r1) ;# create space on the stack
+
+    stw     r5, 12(r1)          ;# transfer dc to vector register
+
+ lvx v0, 0, r3 ;# Coeff
+ lvx v1, 0, r4 ;# dqcoeff
+
+ li r10, 16
+
+ vspltisw v3, 0
+
+ vsubshs v0, v0, v1
+
+ vmsumshm v2, v0, v0, v3 ;# multiply differences
+
+ lvx v0, r10, r3 ;# Coeff
+ lvx v1, r10, r4 ;# dqcoeff
+
+ vsubshs v0, v0, v1
+
+ vmsumshm v1, v0, v0, v2 ;# multiply differences
+ vsumsws v1, v1, v3 ;# sum up
+
+ stvx v1, 0, r1
+ lwz r3, 12(r1) ;# return value
+
+ addi r1, r1, 32 ;# recover stack
+ mtspr 256, r11 ;# reset old VRSAVE
+
+ blr
--- /dev/null
+++ b/vp9/encoder/ppc/sad_altivec.asm
@@ -1,0 +1,277 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+ .globl vp8_sad16x16_ppc
+ .globl vp8_sad16x8_ppc
+ .globl vp8_sad8x16_ppc
+ .globl vp8_sad8x8_ppc
+ .globl vp8_sad4x4_ppc
+
+.macro load_aligned_16 V R O
+ lvsl v3, 0, \R ;# permutate value for alignment
+
+ lvx v1, 0, \R
+ lvx v2, \O, \R
+
+ vperm \V, v1, v2, v3
+.endm
+
+.macro prologue
+ mfspr r11, 256 ;# get old VRSAVE
+ oris r12, r11, 0xffc0
+ mtspr 256, r12 ;# set VRSAVE
+
+ stwu r1, -32(r1) ;# create space on the stack
+
+ li r10, 16 ;# load offset and loop counter
+
+ vspltisw v8, 0 ;# zero out total to start
+.endm
+
+.macro epilogue
+ addi r1, r1, 32 ;# recover stack
+
+ mtspr 256, r11 ;# reset old VRSAVE
+.endm
+
+.macro SAD_16
+ ;# v6 = abs (v4 - v5)
+ vsububs v6, v4, v5
+ vsububs v7, v5, v4
+ vor v6, v6, v7
+
+ ;# v8 += abs (v4 - v5)
+ vsum4ubs v8, v6, v8
+.endm
+
+.macro sad_16_loop loop_label
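+    ;# Each pass of the loop below processes two rows of the block, so the
+    ;# callers load the count register with half the block height.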
+ lvsl v3, 0, r5 ;# only needs to be done once per block
+
+ ;# preload a line of data before getting into the loop
+ lvx v4, 0, r3
+ lvx v1, 0, r5
+ lvx v2, r10, r5
+
+ add r5, r5, r6
+ add r3, r3, r4
+
+ vperm v5, v1, v2, v3
+
+ .align 4
+\loop_label:
+ ;# compute difference on first row
+ vsububs v6, v4, v5
+ vsububs v7, v5, v4
+
+ ;# load up next set of data
+ lvx v9, 0, r3
+ lvx v1, 0, r5
+ lvx v2, r10, r5
+
+ ;# perform abs() of difference
+ vor v6, v6, v7
+ add r3, r3, r4
+
+ ;# add to the running tally
+ vsum4ubs v8, v6, v8
+
+ ;# now onto the next line
+ vperm v5, v1, v2, v3
+ add r5, r5, r6
+ lvx v4, 0, r3
+
+ ;# compute difference on second row
+ vsububs v6, v9, v5
+ lvx v1, 0, r5
+ vsububs v7, v5, v9
+ lvx v2, r10, r5
+ vor v6, v6, v7
+ add r3, r3, r4
+ vsum4ubs v8, v6, v8
+ vperm v5, v1, v2, v3
+ add r5, r5, r6
+
+ bdnz \loop_label
+
+ vspltisw v7, 0
+
+ vsumsws v8, v8, v7
+
+ stvx v8, 0, r1
+ lwz r3, 12(r1)
+.endm
+
+.macro sad_8_loop loop_label
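+    ;# Each pass handles two 8-pixel rows: the first eight bytes of both rows
+    ;# are merged into one 16-byte vector (for source and reference alike)
+    ;# before SAD_16 runs, so the count register holds half the block height.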
+ .align 4
+\loop_label:
+ ;# only one of the inputs should need to be aligned.
+ load_aligned_16 v4, r3, r10
+ load_aligned_16 v5, r5, r10
+
+ ;# move onto the next line
+ add r3, r3, r4
+ add r5, r5, r6
+
+ ;# only one of the inputs should need to be aligned.
+ load_aligned_16 v6, r3, r10
+ load_aligned_16 v7, r5, r10
+
+ ;# move onto the next line
+ add r3, r3, r4
+ add r5, r5, r6
+
+ vmrghb v4, v4, v6
+ vmrghb v5, v5, v7
+
+ SAD_16
+
+ bdnz \loop_label
+
+ vspltisw v7, 0
+
+ vsumsws v8, v8, v7
+
+ stvx v8, 0, r1
+ lwz r3, 12(r1)
+.endm
+
+ .align 2
+;# r3 unsigned char *src_ptr
+;# r4 int src_stride
+;# r5 unsigned char *ref_ptr
+;# r6 int ref_stride
+;#
+;# r3 return value
+vp8_sad16x16_ppc:
+
+ prologue
+
+ li r9, 8
+ mtctr r9
+
+ sad_16_loop sad16x16_loop
+
+ epilogue
+
+ blr
+
+ .align 2
+;# r3 unsigned char *src_ptr
+;# r4 int src_stride
+;# r5 unsigned char *ref_ptr
+;# r6 int ref_stride
+;#
+;# r3 return value
+vp8_sad16x8_ppc:
+
+ prologue
+
+ li r9, 4
+ mtctr r9
+
+ sad_16_loop sad16x8_loop
+
+ epilogue
+
+ blr
+
+ .align 2
+;# r3 unsigned char *src_ptr
+;# r4 int src_stride
+;# r5 unsigned char *ref_ptr
+;# r6 int ref_stride
+;#
+;# r3 return value
+vp8_sad8x16_ppc:
+
+ prologue
+
+ li r9, 8
+ mtctr r9
+
+ sad_8_loop sad8x16_loop
+
+ epilogue
+
+ blr
+
+ .align 2
+;# r3 unsigned char *src_ptr
+;# r4 int src_stride
+;# r5 unsigned char *ref_ptr
+;# r6 int ref_stride
+;#
+;# r3 return value
+vp8_sad8x8_ppc:
+
+ prologue
+
+ li r9, 4
+ mtctr r9
+
+ sad_8_loop sad8x8_loop
+
+ epilogue
+
+ blr
+
+.macro transfer_4x4 I P
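+    ;# Gather four 4-byte rows into 16 contiguous bytes on the stack so the
+    ;# caller can load them into a single vector register.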
+ lwz r0, 0(\I)
+ add \I, \I, \P
+
+ lwz r7, 0(\I)
+ add \I, \I, \P
+
+ lwz r8, 0(\I)
+ add \I, \I, \P
+
+ lwz r9, 0(\I)
+
+ stw r0, 0(r1)
+ stw r7, 4(r1)
+ stw r8, 8(r1)
+ stw r9, 12(r1)
+.endm
+
+ .align 2
+;# r3 unsigned char *src_ptr
+;# r4 int src_stride
+;# r5 unsigned char *ref_ptr
+;# r6 int ref_stride
+;#
+;# r3 return value
+vp8_sad4x4_ppc:
+
+ prologue
+
+ transfer_4x4 r3, r4
+ lvx v4, 0, r1
+
+ transfer_4x4 r5, r6
+ lvx v5, 0, r1
+
+ vspltisw v8, 0 ;# zero out total to start
+
+ ;# v6 = abs (v4 - v5)
+ vsububs v6, v4, v5
+ vsububs v7, v5, v4
+ vor v6, v6, v7
+
+ ;# v8 += abs (v4 - v5)
+ vsum4ubs v7, v6, v8
+ vsumsws v7, v7, v8
+
+ stvx v7, 0, r1
+ lwz r3, 12(r1)
+
+ epilogue
+
+ blr
--- /dev/null
+++ b/vp9/encoder/ppc/variance_altivec.asm
@@ -1,0 +1,375 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+ .globl vp8_get8x8var_ppc
+ .globl vp8_get16x16var_ppc
+ .globl vp8_mse16x16_ppc
+ .globl vp9_variance16x16_ppc
+ .globl vp9_variance16x8_ppc
+ .globl vp9_variance8x16_ppc
+ .globl vp9_variance8x8_ppc
+ .globl vp9_variance4x4_ppc
+
+.macro load_aligned_16 V R O
+ lvsl v3, 0, \R ;# permutate value for alignment
+
+ lvx v1, 0, \R
+ lvx v2, \O, \R
+
+ vperm \V, v1, v2, v3
+.endm
+
+.macro prologue
+ mfspr r11, 256 ;# get old VRSAVE
+ oris r12, r11, 0xffc0
+ mtspr 256, r12 ;# set VRSAVE
+
+ stwu r1, -32(r1) ;# create space on the stack
+
+ li r10, 16 ;# load offset and loop counter
+
+ vspltisw v7, 0 ;# zero for merging
+ vspltisw v8, 0 ;# zero out total to start
+ vspltisw v9, 0 ;# zero out total for dif^2
+.endm
+
+.macro epilogue
+ addi r1, r1, 32 ;# recover stack
+
+ mtspr 256, r11 ;# reset old VRSAVE
+.endm
+
+.macro compute_sum_sse
+    ;# Compute sum first.  Unpack so a signed subtract
+    ;# can be used.  Only a halfword signed subtract
+    ;# is available.  Do high, then low.
+ vmrghb v2, v7, v4
+ vmrghb v3, v7, v5
+ vsubshs v2, v2, v3
+ vsum4shs v8, v2, v8
+
+ vmrglb v2, v7, v4
+ vmrglb v3, v7, v5
+ vsubshs v2, v2, v3
+ vsum4shs v8, v2, v8
+
+ ;# Now compute sse.
+ vsububs v2, v4, v5
+ vsububs v3, v5, v4
+ vor v2, v2, v3
+
+ vmsumubm v9, v2, v2, v9
+.endm
+
+.macro variance_16 DS loop_label store_sum
+\loop_label:
+ ;# only one of the inputs should need to be aligned.
+ load_aligned_16 v4, r3, r10
+ load_aligned_16 v5, r5, r10
+
+ ;# move onto the next line
+ add r3, r3, r4
+ add r5, r5, r6
+
+ compute_sum_sse
+
+ bdnz \loop_label
+
+ vsumsws v8, v8, v7
+ vsumsws v9, v9, v7
+
+ stvx v8, 0, r1
+ lwz r3, 12(r1)
+
+ stvx v9, 0, r1
+ lwz r4, 12(r1)
+
+.if \store_sum
+ stw r3, 0(r8) ;# sum
+.endif
+ stw r4, 0(r7) ;# sse
+
+ mullw r3, r3, r3 ;# sum*sum
+ srawi r3, r3, \DS ;# (sum*sum) >> DS
+ subf r3, r3, r4 ;# sse - ((sum*sum) >> DS)
+.endm
+
+.macro variance_8 DS loop_label store_sum
+\loop_label:
+ ;# only one of the inputs should need to be aligned.
+ load_aligned_16 v4, r3, r10
+ load_aligned_16 v5, r5, r10
+
+ ;# move onto the next line
+ add r3, r3, r4
+ add r5, r5, r6
+
+ ;# only one of the inputs should need to be aligned.
+ load_aligned_16 v6, r3, r10
+ load_aligned_16 v0, r5, r10
+
+ ;# move onto the next line
+ add r3, r3, r4
+ add r5, r5, r6
+
+ vmrghb v4, v4, v6
+ vmrghb v5, v5, v0
+
+ compute_sum_sse
+
+ bdnz \loop_label
+
+ vsumsws v8, v8, v7
+ vsumsws v9, v9, v7
+
+ stvx v8, 0, r1
+ lwz r3, 12(r1)
+
+ stvx v9, 0, r1
+ lwz r4, 12(r1)
+
+.if \store_sum
+ stw r3, 0(r8) ;# sum
+.endif
+ stw r4, 0(r7) ;# sse
+
+ mullw r3, r3, r3 ;# sum*sum
+ srawi r3, r3, \DS ;# (sum*sum) >> 8
+ subf r3, r3, r4 ;# sse - ((sum*sum) >> 8)
+.endm
+
+ .align 2
+;# r3 unsigned char *src_ptr
+;# r4 int source_stride
+;# r5 unsigned char *ref_ptr
+;# r6 int recon_stride
+;# r7 unsigned int *SSE
+;# r8 int *Sum
+;#
+;# r3 return value
+vp8_get8x8var_ppc:
+
+ prologue
+
+ li r9, 4
+ mtctr r9
+
+ variance_8 6, get8x8var_loop, 1
+
+ epilogue
+
+ blr
+
+ .align 2
+;# r3 unsigned char *src_ptr
+;# r4 int source_stride
+;# r5 unsigned char *ref_ptr
+;# r6 int recon_stride
+;# r7 unsigned int *SSE
+;# r8 int *Sum
+;#
+;# r3 return value
+vp8_get16x16var_ppc:
+
+ prologue
+
+ mtctr r10
+
+ variance_16 8, get16x16var_loop, 1
+
+ epilogue
+
+ blr
+
+ .align 2
+;# r3 unsigned char *src_ptr
+;# r4 int source_stride
+;# r5 unsigned char *ref_ptr
+;# r6 int recon_stride
+;# r7 unsigned int *sse
+;#
+;# r 3 return value
+vp8_mse16x16_ppc:
+ prologue
+
+ mtctr r10
+
+mse16x16_loop:
+ ;# only one of the inputs should need to be aligned.
+ load_aligned_16 v4, r3, r10
+ load_aligned_16 v5, r5, r10
+
+ ;# move onto the next line
+ add r3, r3, r4
+ add r5, r5, r6
+
+ ;# Now compute sse.
+ vsububs v2, v4, v5
+ vsububs v3, v5, v4
+ vor v2, v2, v3
+
+ vmsumubm v9, v2, v2, v9
+
+ bdnz mse16x16_loop
+
+ vsumsws v9, v9, v7
+
+ stvx v9, 0, r1
+ lwz r3, 12(r1)
+
+ stvx v9, 0, r1
+ lwz r3, 12(r1)
+
+ stw r3, 0(r7) ;# sse
+
+ epilogue
+
+ blr
+
+ .align 2
+;# r3 unsigned char *src_ptr
+;# r4 int source_stride
+;# r5 unsigned char *ref_ptr
+;# r6 int recon_stride
+;# r7 unsigned int *sse
+;#
+;# r3 return value
+vp9_variance16x16_ppc:
+
+ prologue
+
+ mtctr r10
+
+ variance_16 8, variance16x16_loop, 0
+
+ epilogue
+
+ blr
+
+ .align 2
+;# r3 unsigned char *src_ptr
+;# r4 int source_stride
+;# r5 unsigned char *ref_ptr
+;# r6 int recon_stride
+;# r7 unsigned int *sse
+;#
+;# r3 return value
+vp9_variance16x8_ppc:
+
+ prologue
+
+ li r9, 8
+ mtctr r9
+
+ variance_16 7, variance16x8_loop, 0
+
+ epilogue
+
+ blr
+
+ .align 2
+;# r3 unsigned char *src_ptr
+;# r4 int source_stride
+;# r5 unsigned char *ref_ptr
+;# r6 int recon_stride
+;# r7 unsigned int *sse
+;#
+;# r3 return value
+vp9_variance8x16_ppc:
+
+ prologue
+
+ li r9, 8
+ mtctr r9
+
+ variance_8 7, variance8x16_loop, 0
+
+ epilogue
+
+ blr
+
+ .align 2
+;# r3 unsigned char *src_ptr
+;# r4 int source_stride
+;# r5 unsigned char *ref_ptr
+;# r6 int recon_stride
+;# r7 unsigned int *sse
+;#
+;# r3 return value
+vp9_variance8x8_ppc:
+
+ prologue
+
+ li r9, 4
+ mtctr r9
+
+ variance_8 6, variance8x8_loop, 0
+
+ epilogue
+
+ blr
+
+.macro transfer_4x4 I P
+ lwz r0, 0(\I)
+ add \I, \I, \P
+
+ lwz r10,0(\I)
+ add \I, \I, \P
+
+ lwz r8, 0(\I)
+ add \I, \I, \P
+
+ lwz r9, 0(\I)
+
+ stw r0, 0(r1)
+ stw r10, 4(r1)
+ stw r8, 8(r1)
+ stw r9, 12(r1)
+.endm
+
+ .align 2
+;# r3 unsigned char *src_ptr
+;# r4 int source_stride
+;# r5 unsigned char *ref_ptr
+;# r6 int recon_stride
+;# r7 unsigned int *sse
+;#
+;# r3 return value
+vp9_variance4x4_ppc:
+
+ prologue
+
+ transfer_4x4 r3, r4
+ lvx v4, 0, r1
+
+ transfer_4x4 r5, r6
+ lvx v5, 0, r1
+
+ compute_sum_sse
+
+ vsumsws v8, v8, v7
+ vsumsws v9, v9, v7
+
+ stvx v8, 0, r1
+ lwz r3, 12(r1)
+
+ stvx v9, 0, r1
+ lwz r4, 12(r1)
+
+ stw r4, 0(r7) ;# sse
+
+ mullw r3, r3, r3 ;# sum*sum
+ srawi r3, r3, 4 ;# (sum*sum) >> 4
+ subf r3, r3, r4 ;# sse - ((sum*sum) >> 4)
+
+ epilogue
+
+ blr
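
For orientation, here is a plain-C sketch of the computation the variance_16 and variance_8 AltiVec macros above vectorize; the helper name is hypothetical and this is not part of the patch. DS is the log2 of the pixel count of the block (8 for 16x16, 7 for 16x8 and 8x16, 6 for 8x8, 4 for 4x4).

/* Illustrative scalar reference only (not part of this patch). */
static unsigned int variance_ref(const unsigned char *src, int src_stride,
                                 const unsigned char *ref, int ref_stride,
                                 int w, int h, int DS, unsigned int *sse) {
  int sum = 0;
  unsigned int sq = 0;
  int r, c;

  for (r = 0; r < h; r++) {
    for (c = 0; c < w; c++) {
      const int d = src[c] - ref[c];
      sum += d;                 /* signed sum of differences */
      sq += d * d;              /* sum of squared differences */
    }
    src += src_stride;
    ref += ref_stride;
  }

  *sse = sq;
  return sq - (unsigned int)((sum * sum) >> DS);  /* sse - ((sum*sum) >> DS) */
}
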
--- /dev/null
+++ b/vp9/encoder/ppc/variance_subpixel_altivec.asm
@@ -1,0 +1,865 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+ .globl vp9_sub_pixel_variance4x4_ppc
+ .globl vp9_sub_pixel_variance8x8_ppc
+ .globl vp9_sub_pixel_variance8x16_ppc
+ .globl vp9_sub_pixel_variance16x8_ppc
+ .globl vp9_sub_pixel_variance16x16_ppc
+
+.macro load_c V, LABEL, OFF, R0, R1
+ lis \R0, \LABEL@ha
+ la \R1, \LABEL@l(\R0)
+ lvx \V, \OFF, \R1
+.endm
+
+.macro load_vfilter V0, V1
+ load_c \V0, vfilter_b, r6, r12, r10
+
+ addi r6, r6, 16
+ lvx \V1, r6, r10
+.endm
+
+.macro HProlog jump_label
+ ;# load up horizontal filter
+ slwi. r5, r5, 4 ;# index into horizontal filter array
+
+ ;# index to the next set of vectors in the row.
+ li r10, 16
+
+ ;# downshift by 7 ( divide by 128 ) at the end
+ vspltish v19, 7
+
+ ;# If there isn't any filtering to be done for the horizontal, then
+ ;# just skip to the second pass.
+ beq \jump_label
+
+ load_c v20, hfilter_b, r5, r12, r0
+
+ ;# setup constants
+ ;# v14 permutation value for alignment
+ load_c v28, b_hperm_b, 0, r12, r0
+
+ ;# index to the next set of vectors in the row.
+ li r12, 32
+
+ ;# rounding added in on the multiply
+ vspltisw v21, 8
+ vspltisw v18, 3
+ vslw v18, v21, v18 ;# 0x00000040000000400000004000000040
+
+ slwi. r6, r6, 5 ;# index into vertical filter array
+.endm
+
+;# Filters a horizontal line
+;# expects:
+;# r3 src_ptr
+;# r4 pitch
+;# r10 16
+;# r12 32
+;# v17 perm input
+;# v18 rounding
+;# v19 shift
+;# v20 filter taps
+;# v21 tmp
+;# v22 tmp
+;# v23 tmp
+;# v24 tmp
+;# v25 tmp
+;# v26 tmp
+;# v27 tmp
+;# v28 perm output
+;#
+
+.macro hfilter_8 V, hp, lp, increment_counter
+ lvsl v17, 0, r3 ;# permutate value for alignment
+
+ ;# input to filter is 9 bytes wide, output is 8 bytes.
+ lvx v21, 0, r3
+ lvx v22, r10, r3
+
+.if \increment_counter
+ add r3, r3, r4
+.endif
+ vperm v21, v21, v22, v17
+
+    vperm   v24, v21, v21, \hp ;# v24 = 0123 1234 2345 3456
+    vperm   v25, v21, v21, \lp ;# v25 = 4567 5678 6789 789A
+
+ vmsummbm v24, v20, v24, v18
+ vmsummbm v25, v20, v25, v18
+
+ vpkswus v24, v24, v25 ;# v24 = 0 4 8 C 1 5 9 D (16-bit)
+
+ vsrh v24, v24, v19 ;# divide v0, v1 by 128
+
+ vpkuhus \V, v24, v24 ;# \V = scrambled 8-bit result
+.endm
+
+.macro vfilter_16 P0 P1
+ vmuleub v22, \P0, v20 ;# 64 + 4 positive taps
+ vadduhm v22, v18, v22
+ vmuloub v23, \P0, v20
+ vadduhm v23, v18, v23
+
+ vmuleub v24, \P1, v21
+ vadduhm v22, v22, v24 ;# Re = evens, saturation unnecessary
+ vmuloub v25, \P1, v21
+ vadduhm v23, v23, v25 ;# Ro = odds
+
+ vsrh v22, v22, v19 ;# divide by 128
+    vsrh    v23, v23, v19      ;# v22 v23 = evens, odds
+    vmrghh  \P0, v22, v23      ;# \P0 v23 = 16-bit result in order
+ vmrglh v23, v22, v23
+ vpkuhus \P0, \P0, v23 ;# P0 = 8-bit result
+.endm
+
+.macro compute_sum_sse src, ref, sum, sse, t1, t2, z0
+    ;# Compute sum first.  Unpack so that a signed subtract
+    ;#  can be used.  Only a half-word signed subtract is
+    ;#  available.  Do high, then low.
+ vmrghb \t1, \z0, \src
+ vmrghb \t2, \z0, \ref
+ vsubshs \t1, \t1, \t2
+ vsum4shs \sum, \t1, \sum
+
+ vmrglb \t1, \z0, \src
+ vmrglb \t2, \z0, \ref
+ vsubshs \t1, \t1, \t2
+ vsum4shs \sum, \t1, \sum
+
+ ;# Now compute sse.
+ vsububs \t1, \src, \ref
+ vsububs \t2, \ref, \src
+ vor \t1, \t1, \t2
+
+ vmsumubm \sse, \t1, \t1, \sse
+.endm
+
+.macro variance_final sum, sse, z0, DS
+ vsumsws \sum, \sum, \z0
+ vsumsws \sse, \sse, \z0
+
+ stvx \sum, 0, r1
+ lwz r3, 12(r1)
+
+ stvx \sse, 0, r1
+ lwz r4, 12(r1)
+
+ stw r4, 0(r9) ;# sse
+
+ mullw r3, r3, r3 ;# sum*sum
+    srawi   r3, r3, \DS        ;# (sum*sum) >> DS
+    subf    r3, r3, r4         ;# sse - ((sum*sum) >> DS)
+.endm
+
+.macro compute_sum_sse_16 V, increment_counter
+ load_and_align_16 v16, r7, r8, \increment_counter
+ compute_sum_sse \V, v16, v18, v19, v20, v21, v23
+.endm
+
+.macro load_and_align_16 V, R, P, increment_counter
+ lvsl v17, 0, \R ;# permutate value for alignment
+
+ ;# input to filter is 21 bytes wide, output is 16 bytes.
+    ;# input can span three vectors if not aligned correctly.
+ lvx v21, 0, \R
+ lvx v22, r10, \R
+
+.if \increment_counter
+ add \R, \R, \P
+.endif
+
+ vperm \V, v21, v22, v17
+.endm
+
+ .align 2
+;# r3 unsigned char *src_ptr
+;# r4 int src_pixels_per_line
+;# r5 int xoffset
+;# r6 int yoffset
+;# r7 unsigned char *dst_ptr
+;# r8 int dst_pixels_per_line
+;# r9 unsigned int *sse
+;#
+;# r3 return value
+vp9_sub_pixel_variance4x4_ppc:
+ mfspr r11, 256 ;# get old VRSAVE
+ oris r12, r11, 0xf830
+ ori r12, r12, 0xfff8
+ mtspr 256, r12 ;# set VRSAVE
+
+ stwu r1,-32(r1) ;# create space on the stack
+
+ HProlog second_pass_4x4_pre_copy_b
+
+ ;# Load up permutation constants
+ load_c v10, b_0123_b, 0, r12, r0
+ load_c v11, b_4567_b, 0, r12, r0
+
+ hfilter_8 v0, v10, v11, 1
+ hfilter_8 v1, v10, v11, 1
+ hfilter_8 v2, v10, v11, 1
+ hfilter_8 v3, v10, v11, 1
+
+ ;# Finished filtering main horizontal block. If there is no
+ ;# vertical filtering, jump to storing the data. Otherwise
+ ;# load up and filter the additional line that is needed
+ ;# for the vertical filter.
+ beq compute_sum_sse_4x4_b
+
+ hfilter_8 v4, v10, v11, 0
+
+ b second_pass_4x4_b
+
+second_pass_4x4_pre_copy_b:
+ slwi r6, r6, 5 ;# index into vertical filter array
+
+ load_and_align_16 v0, r3, r4, 1
+ load_and_align_16 v1, r3, r4, 1
+ load_and_align_16 v2, r3, r4, 1
+ load_and_align_16 v3, r3, r4, 1
+ load_and_align_16 v4, r3, r4, 0
+
+second_pass_4x4_b:
+ vspltish v20, 8
+ vspltish v18, 3
+ vslh v18, v20, v18 ;# 0x0040 0040 0040 0040 0040 0040 0040 0040
+
+ load_vfilter v20, v21
+
+ vfilter_16 v0, v1
+ vfilter_16 v1, v2
+ vfilter_16 v2, v3
+ vfilter_16 v3, v4
+
+compute_sum_sse_4x4_b:
+ vspltish v18, 0 ;# sum
+ vspltish v19, 0 ;# sse
+ vspltish v23, 0 ;# unpack
+ li r10, 16
+
+ load_and_align_16 v4, r7, r8, 1
+ load_and_align_16 v5, r7, r8, 1
+ load_and_align_16 v6, r7, r8, 1
+ load_and_align_16 v7, r7, r8, 1
+
+ vmrghb v0, v0, v1
+ vmrghb v1, v2, v3
+
+ vmrghb v2, v4, v5
+ vmrghb v3, v6, v7
+
+ load_c v10, b_hilo_b, 0, r12, r0
+
+ vperm v0, v0, v1, v10
+ vperm v1, v2, v3, v10
+
+ compute_sum_sse v0, v1, v18, v19, v20, v21, v23
+
+ variance_final v18, v19, v23, 4
+
+ addi r1, r1, 32 ;# recover stack
+ mtspr 256, r11 ;# reset old VRSAVE
+
+ blr
+
+ .align 2
+;# r3 unsigned char *src_ptr
+;# r4 int src_pixels_per_line
+;# r5 int xoffset
+;# r6 int yoffset
+;# r7 unsigned char *dst_ptr
+;# r8 int dst_pixels_per_line
+;# r9 unsigned int *sse
+;#
+;# r3 return value
+vp9_sub_pixel_variance8x8_ppc:
+ mfspr r11, 256 ;# get old VRSAVE
+ oris r12, r11, 0xfff0
+ ori r12, r12, 0xffff
+ mtspr 256, r12 ;# set VRSAVE
+
+ stwu r1,-32(r1) ;# create space on the stack
+
+ HProlog second_pass_8x8_pre_copy_b
+
+ ;# Load up permutation constants
+ load_c v10, b_0123_b, 0, r12, r0
+ load_c v11, b_4567_b, 0, r12, r0
+
+ hfilter_8 v0, v10, v11, 1
+ hfilter_8 v1, v10, v11, 1
+ hfilter_8 v2, v10, v11, 1
+ hfilter_8 v3, v10, v11, 1
+ hfilter_8 v4, v10, v11, 1
+ hfilter_8 v5, v10, v11, 1
+ hfilter_8 v6, v10, v11, 1
+ hfilter_8 v7, v10, v11, 1
+
+ ;# Finished filtering main horizontal block. If there is no
+ ;# vertical filtering, jump to storing the data. Otherwise
+ ;# load up and filter the additional line that is needed
+ ;# for the vertical filter.
+ beq compute_sum_sse_8x8_b
+
+ hfilter_8 v8, v10, v11, 0
+
+ b second_pass_8x8_b
+
+second_pass_8x8_pre_copy_b:
+ slwi. r6, r6, 5 ;# index into vertical filter array
+
+ load_and_align_16 v0, r3, r4, 1
+ load_and_align_16 v1, r3, r4, 1
+ load_and_align_16 v2, r3, r4, 1
+ load_and_align_16 v3, r3, r4, 1
+ load_and_align_16 v4, r3, r4, 1
+ load_and_align_16 v5, r3, r4, 1
+ load_and_align_16 v6, r3, r4, 1
+ load_and_align_16 v7, r3, r4, 1
+ load_and_align_16 v8, r3, r4, 0
+
+ beq compute_sum_sse_8x8_b
+
+second_pass_8x8_b:
+ vspltish v20, 8
+ vspltish v18, 3
+ vslh v18, v20, v18 ;# 0x0040 0040 0040 0040 0040 0040 0040 0040
+
+ load_vfilter v20, v21
+
+ vfilter_16 v0, v1
+ vfilter_16 v1, v2
+ vfilter_16 v2, v3
+ vfilter_16 v3, v4
+ vfilter_16 v4, v5
+ vfilter_16 v5, v6
+ vfilter_16 v6, v7
+ vfilter_16 v7, v8
+
+compute_sum_sse_8x8_b:
+ vspltish v18, 0 ;# sum
+ vspltish v19, 0 ;# sse
+ vspltish v23, 0 ;# unpack
+ li r10, 16
+
+ vmrghb v0, v0, v1
+ vmrghb v1, v2, v3
+ vmrghb v2, v4, v5
+ vmrghb v3, v6, v7
+
+ load_and_align_16 v4, r7, r8, 1
+ load_and_align_16 v5, r7, r8, 1
+ load_and_align_16 v6, r7, r8, 1
+ load_and_align_16 v7, r7, r8, 1
+ load_and_align_16 v8, r7, r8, 1
+ load_and_align_16 v9, r7, r8, 1
+ load_and_align_16 v10, r7, r8, 1
+ load_and_align_16 v11, r7, r8, 0
+
+ vmrghb v4, v4, v5
+ vmrghb v5, v6, v7
+ vmrghb v6, v8, v9
+ vmrghb v7, v10, v11
+
+ compute_sum_sse v0, v4, v18, v19, v20, v21, v23
+ compute_sum_sse v1, v5, v18, v19, v20, v21, v23
+ compute_sum_sse v2, v6, v18, v19, v20, v21, v23
+ compute_sum_sse v3, v7, v18, v19, v20, v21, v23
+
+ variance_final v18, v19, v23, 6
+
+ addi r1, r1, 32 ;# recover stack
+ mtspr 256, r11 ;# reset old VRSAVE
+ blr
+
+ .align 2
+;# r3 unsigned char *src_ptr
+;# r4 int src_pixels_per_line
+;# r5 int xoffset
+;# r6 int yoffset
+;# r7 unsigned char *dst_ptr
+;# r8 int dst_pixels_per_line
+;# r9 unsigned int *sse
+;#
+;# r3 return value
+vp9_sub_pixel_variance8x16_ppc:
+ mfspr r11, 256 ;# get old VRSAVE
+ oris r12, r11, 0xffff
+ ori r12, r12, 0xfffc
+ mtspr 256, r12 ;# set VRSAVE
+
+ stwu r1,-32(r1) ;# create space on the stack
+
+ HProlog second_pass_8x16_pre_copy_b
+
+ ;# Load up permutation constants
+ load_c v29, b_0123_b, 0, r12, r0
+ load_c v30, b_4567_b, 0, r12, r0
+
+ hfilter_8 v0, v29, v30, 1
+ hfilter_8 v1, v29, v30, 1
+ hfilter_8 v2, v29, v30, 1
+ hfilter_8 v3, v29, v30, 1
+ hfilter_8 v4, v29, v30, 1
+ hfilter_8 v5, v29, v30, 1
+ hfilter_8 v6, v29, v30, 1
+ hfilter_8 v7, v29, v30, 1
+ hfilter_8 v8, v29, v30, 1
+ hfilter_8 v9, v29, v30, 1
+ hfilter_8 v10, v29, v30, 1
+ hfilter_8 v11, v29, v30, 1
+ hfilter_8 v12, v29, v30, 1
+ hfilter_8 v13, v29, v30, 1
+ hfilter_8 v14, v29, v30, 1
+ hfilter_8 v15, v29, v30, 1
+
+ ;# Finished filtering main horizontal block. If there is no
+ ;# vertical filtering, jump to storing the data. Otherwise
+ ;# load up and filter the additional line that is needed
+ ;# for the vertical filter.
+ beq compute_sum_sse_8x16_b
+
+ hfilter_8 v16, v29, v30, 0
+
+ b second_pass_8x16_b
+
+second_pass_8x16_pre_copy_b:
+ slwi. r6, r6, 5 ;# index into vertical filter array
+
+ load_and_align_16 v0, r3, r4, 1
+ load_and_align_16 v1, r3, r4, 1
+ load_and_align_16 v2, r3, r4, 1
+ load_and_align_16 v3, r3, r4, 1
+ load_and_align_16 v4, r3, r4, 1
+ load_and_align_16 v5, r3, r4, 1
+ load_and_align_16 v6, r3, r4, 1
+ load_and_align_16 v7, r3, r4, 1
+ load_and_align_16 v8, r3, r4, 1
+ load_and_align_16 v9, r3, r4, 1
+ load_and_align_16 v10, r3, r4, 1
+ load_and_align_16 v11, r3, r4, 1
+ load_and_align_16 v12, r3, r4, 1
+ load_and_align_16 v13, r3, r4, 1
+ load_and_align_16 v14, r3, r4, 1
+ load_and_align_16 v15, r3, r4, 1
+ load_and_align_16 v16, r3, r4, 0
+
+ beq compute_sum_sse_8x16_b
+
+second_pass_8x16_b:
+ vspltish v20, 8
+ vspltish v18, 3
+ vslh v18, v20, v18 ;# 0x0040 0040 0040 0040 0040 0040 0040 0040
+
+ load_vfilter v20, v21
+
+ vfilter_16 v0, v1
+ vfilter_16 v1, v2
+ vfilter_16 v2, v3
+ vfilter_16 v3, v4
+ vfilter_16 v4, v5
+ vfilter_16 v5, v6
+ vfilter_16 v6, v7
+ vfilter_16 v7, v8
+ vfilter_16 v8, v9
+ vfilter_16 v9, v10
+ vfilter_16 v10, v11
+ vfilter_16 v11, v12
+ vfilter_16 v12, v13
+ vfilter_16 v13, v14
+ vfilter_16 v14, v15
+ vfilter_16 v15, v16
+
+compute_sum_sse_8x16_b:
+ vspltish v18, 0 ;# sum
+ vspltish v19, 0 ;# sse
+ vspltish v23, 0 ;# unpack
+ li r10, 16
+
+ vmrghb v0, v0, v1
+ vmrghb v1, v2, v3
+ vmrghb v2, v4, v5
+ vmrghb v3, v6, v7
+ vmrghb v4, v8, v9
+ vmrghb v5, v10, v11
+ vmrghb v6, v12, v13
+ vmrghb v7, v14, v15
+
+ load_and_align_16 v8, r7, r8, 1
+ load_and_align_16 v9, r7, r8, 1
+ load_and_align_16 v10, r7, r8, 1
+ load_and_align_16 v11, r7, r8, 1
+ load_and_align_16 v12, r7, r8, 1
+ load_and_align_16 v13, r7, r8, 1
+ load_and_align_16 v14, r7, r8, 1
+ load_and_align_16 v15, r7, r8, 1
+
+ vmrghb v8, v8, v9
+ vmrghb v9, v10, v11
+ vmrghb v10, v12, v13
+ vmrghb v11, v14, v15
+
+ compute_sum_sse v0, v8, v18, v19, v20, v21, v23
+ compute_sum_sse v1, v9, v18, v19, v20, v21, v23
+ compute_sum_sse v2, v10, v18, v19, v20, v21, v23
+ compute_sum_sse v3, v11, v18, v19, v20, v21, v23
+
+ load_and_align_16 v8, r7, r8, 1
+ load_and_align_16 v9, r7, r8, 1
+ load_and_align_16 v10, r7, r8, 1
+ load_and_align_16 v11, r7, r8, 1
+ load_and_align_16 v12, r7, r8, 1
+ load_and_align_16 v13, r7, r8, 1
+ load_and_align_16 v14, r7, r8, 1
+ load_and_align_16 v15, r7, r8, 0
+
+ vmrghb v8, v8, v9
+ vmrghb v9, v10, v11
+ vmrghb v10, v12, v13
+ vmrghb v11, v14, v15
+
+ compute_sum_sse v4, v8, v18, v19, v20, v21, v23
+ compute_sum_sse v5, v9, v18, v19, v20, v21, v23
+ compute_sum_sse v6, v10, v18, v19, v20, v21, v23
+ compute_sum_sse v7, v11, v18, v19, v20, v21, v23
+
+ variance_final v18, v19, v23, 7
+
+ addi r1, r1, 32 ;# recover stack
+ mtspr 256, r11 ;# reset old VRSAVE
+ blr
+
+;# Filters a horizontal line
+;# expects:
+;# r3 src_ptr
+;# r4 pitch
+;# r10 16
+;# r12 32
+;# v17 perm input
+;# v18 rounding
+;# v19 shift
+;# v20 filter taps
+;# v21 tmp
+;# v22 tmp
+;# v23 tmp
+;# v24 tmp
+;# v25 tmp
+;# v26 tmp
+;# v27 tmp
+;# v28 perm output
+;#
+.macro hfilter_16 V, increment_counter
+
+ lvsl v17, 0, r3 ;# permutate value for alignment
+
+ ;# input to filter is 21 bytes wide, output is 16 bytes.
+    ;# input can span three vectors if not aligned correctly.
+ lvx v21, 0, r3
+ lvx v22, r10, r3
+ lvx v23, r12, r3
+
+.if \increment_counter
+ add r3, r3, r4
+.endif
+ vperm v21, v21, v22, v17
+ vperm v22, v22, v23, v17 ;# v8 v9 = 21 input pixels left-justified
+
+ ;# set 0
+ vmsummbm v24, v20, v21, v18 ;# taps times elements
+
+ ;# set 1
+ vsldoi v23, v21, v22, 1
+ vmsummbm v25, v20, v23, v18
+
+ ;# set 2
+ vsldoi v23, v21, v22, 2
+ vmsummbm v26, v20, v23, v18
+
+ ;# set 3
+ vsldoi v23, v21, v22, 3
+ vmsummbm v27, v20, v23, v18
+
+ vpkswus v24, v24, v25 ;# v24 = 0 4 8 C 1 5 9 D (16-bit)
+ vpkswus v25, v26, v27 ;# v25 = 2 6 A E 3 7 B F
+
+ vsrh v24, v24, v19 ;# divide v0, v1 by 128
+ vsrh v25, v25, v19
+
+ vpkuhus \V, v24, v25 ;# \V = scrambled 8-bit result
+ vperm \V, \V, v0, v28 ;# \V = correctly-ordered result
+.endm
+
+ .align 2
+;# r3 unsigned char *src_ptr
+;# r4 int src_pixels_per_line
+;# r5 int xoffset
+;# r6 int yoffset
+;# r7 unsigned char *dst_ptr
+;# r8 int dst_pixels_per_line
+;# r9 unsigned int *sse
+;#
+;# r3 return value
+vp9_sub_pixel_variance16x8_ppc:
+ mfspr r11, 256 ;# get old VRSAVE
+ oris r12, r11, 0xffff
+ ori r12, r12, 0xfff8
+ mtspr 256, r12 ;# set VRSAVE
+
+ stwu r1, -32(r1) ;# create space on the stack
+
+ HProlog second_pass_16x8_pre_copy_b
+
+ hfilter_16 v0, 1
+ hfilter_16 v1, 1
+ hfilter_16 v2, 1
+ hfilter_16 v3, 1
+ hfilter_16 v4, 1
+ hfilter_16 v5, 1
+ hfilter_16 v6, 1
+ hfilter_16 v7, 1
+
+ ;# Finished filtering main horizontal block. If there is no
+ ;# vertical filtering, jump to storing the data. Otherwise
+ ;# load up and filter the additional line that is needed
+ ;# for the vertical filter.
+ beq compute_sum_sse_16x8_b
+
+ hfilter_16 v8, 0
+
+ b second_pass_16x8_b
+
+second_pass_16x8_pre_copy_b:
+ slwi. r6, r6, 5 ;# index into vertical filter array
+
+ load_and_align_16 v0, r3, r4, 1
+ load_and_align_16 v1, r3, r4, 1
+ load_and_align_16 v2, r3, r4, 1
+ load_and_align_16 v3, r3, r4, 1
+ load_and_align_16 v4, r3, r4, 1
+ load_and_align_16 v5, r3, r4, 1
+ load_and_align_16 v6, r3, r4, 1
+ load_and_align_16 v7, r3, r4, 1
+ load_and_align_16 v8, r3, r4, 1
+
+ beq compute_sum_sse_16x8_b
+
+second_pass_16x8_b:
+ vspltish v20, 8
+ vspltish v18, 3
+ vslh v18, v20, v18 ;# 0x0040 0040 0040 0040 0040 0040 0040 0040
+
+ load_vfilter v20, v21
+
+ vfilter_16 v0, v1
+ vfilter_16 v1, v2
+ vfilter_16 v2, v3
+ vfilter_16 v3, v4
+ vfilter_16 v4, v5
+ vfilter_16 v5, v6
+ vfilter_16 v6, v7
+ vfilter_16 v7, v8
+
+compute_sum_sse_16x8_b:
+ vspltish v18, 0 ;# sum
+ vspltish v19, 0 ;# sse
+ vspltish v23, 0 ;# unpack
+ li r10, 16
+
+ compute_sum_sse_16 v0, 1
+ compute_sum_sse_16 v1, 1
+ compute_sum_sse_16 v2, 1
+ compute_sum_sse_16 v3, 1
+ compute_sum_sse_16 v4, 1
+ compute_sum_sse_16 v5, 1
+ compute_sum_sse_16 v6, 1
+ compute_sum_sse_16 v7, 0
+
+ variance_final v18, v19, v23, 7
+
+ addi r1, r1, 32 ;# recover stack
+
+ mtspr 256, r11 ;# reset old VRSAVE
+
+ blr
+
+ .align 2
+;# r3 unsigned char *src_ptr
+;# r4 int src_pixels_per_line
+;# r5 int xoffset
+;# r6 int yoffset
+;# r7 unsigned char *dst_ptr
+;# r8 int dst_pixels_per_line
+;# r9 unsigned int *sse
+;#
+;# r3 return value
+vp9_sub_pixel_variance16x16_ppc:
+ mfspr r11, 256 ;# get old VRSAVE
+ oris r12, r11, 0xffff
+ ori r12, r12, 0xfff8
+ mtspr 256, r12 ;# set VRSAVE
+
+ stwu r1, -32(r1) ;# create space on the stack
+
+ HProlog second_pass_16x16_pre_copy_b
+
+ hfilter_16 v0, 1
+ hfilter_16 v1, 1
+ hfilter_16 v2, 1
+ hfilter_16 v3, 1
+ hfilter_16 v4, 1
+ hfilter_16 v5, 1
+ hfilter_16 v6, 1
+ hfilter_16 v7, 1
+ hfilter_16 v8, 1
+ hfilter_16 v9, 1
+ hfilter_16 v10, 1
+ hfilter_16 v11, 1
+ hfilter_16 v12, 1
+ hfilter_16 v13, 1
+ hfilter_16 v14, 1
+ hfilter_16 v15, 1
+
+ ;# Finished filtering main horizontal block. If there is no
+ ;# vertical filtering, jump to storing the data. Otherwise
+ ;# load up and filter the additional line that is needed
+ ;# for the vertical filter.
+ beq compute_sum_sse_16x16_b
+
+ hfilter_16 v16, 0
+
+ b second_pass_16x16_b
+
+second_pass_16x16_pre_copy_b:
+ slwi. r6, r6, 5 ;# index into vertical filter array
+
+ load_and_align_16 v0, r3, r4, 1
+ load_and_align_16 v1, r3, r4, 1
+ load_and_align_16 v2, r3, r4, 1
+ load_and_align_16 v3, r3, r4, 1
+ load_and_align_16 v4, r3, r4, 1
+ load_and_align_16 v5, r3, r4, 1
+ load_and_align_16 v6, r3, r4, 1
+ load_and_align_16 v7, r3, r4, 1
+ load_and_align_16 v8, r3, r4, 1
+ load_and_align_16 v9, r3, r4, 1
+ load_and_align_16 v10, r3, r4, 1
+ load_and_align_16 v11, r3, r4, 1
+ load_and_align_16 v12, r3, r4, 1
+ load_and_align_16 v13, r3, r4, 1
+ load_and_align_16 v14, r3, r4, 1
+ load_and_align_16 v15, r3, r4, 1
+ load_and_align_16 v16, r3, r4, 0
+
+ beq compute_sum_sse_16x16_b
+
+second_pass_16x16_b:
+ vspltish v20, 8
+ vspltish v18, 3
+ vslh v18, v20, v18 ;# 0x0040 0040 0040 0040 0040 0040 0040 0040
+
+ load_vfilter v20, v21
+
+ vfilter_16 v0, v1
+ vfilter_16 v1, v2
+ vfilter_16 v2, v3
+ vfilter_16 v3, v4
+ vfilter_16 v4, v5
+ vfilter_16 v5, v6
+ vfilter_16 v6, v7
+ vfilter_16 v7, v8
+ vfilter_16 v8, v9
+ vfilter_16 v9, v10
+ vfilter_16 v10, v11
+ vfilter_16 v11, v12
+ vfilter_16 v12, v13
+ vfilter_16 v13, v14
+ vfilter_16 v14, v15
+ vfilter_16 v15, v16
+
+compute_sum_sse_16x16_b:
+ vspltish v18, 0 ;# sum
+ vspltish v19, 0 ;# sse
+ vspltish v23, 0 ;# unpack
+ li r10, 16
+
+ compute_sum_sse_16 v0, 1
+ compute_sum_sse_16 v1, 1
+ compute_sum_sse_16 v2, 1
+ compute_sum_sse_16 v3, 1
+ compute_sum_sse_16 v4, 1
+ compute_sum_sse_16 v5, 1
+ compute_sum_sse_16 v6, 1
+ compute_sum_sse_16 v7, 1
+ compute_sum_sse_16 v8, 1
+ compute_sum_sse_16 v9, 1
+ compute_sum_sse_16 v10, 1
+ compute_sum_sse_16 v11, 1
+ compute_sum_sse_16 v12, 1
+ compute_sum_sse_16 v13, 1
+ compute_sum_sse_16 v14, 1
+ compute_sum_sse_16 v15, 0
+
+ variance_final v18, v19, v23, 8
+
+ addi r1, r1, 32 ;# recover stack
+
+ mtspr 256, r11 ;# reset old VRSAVE
+
+ blr
+
+ .data
+
+ .align 4
+hfilter_b:
+ .byte 128, 0, 0, 0,128, 0, 0, 0,128, 0, 0, 0,128, 0, 0, 0
+ .byte 112, 16, 0, 0,112, 16, 0, 0,112, 16, 0, 0,112, 16, 0, 0
+ .byte 96, 32, 0, 0, 96, 32, 0, 0, 96, 32, 0, 0, 96, 32, 0, 0
+ .byte 80, 48, 0, 0, 80, 48, 0, 0, 80, 48, 0, 0, 80, 48, 0, 0
+ .byte 64, 64, 0, 0, 64, 64, 0, 0, 64, 64, 0, 0, 64, 64, 0, 0
+ .byte 48, 80, 0, 0, 48, 80, 0, 0, 48, 80, 0, 0, 48, 80, 0, 0
+ .byte 32, 96, 0, 0, 32, 96, 0, 0, 32, 96, 0, 0, 32, 96, 0, 0
+ .byte 16,112, 0, 0, 16,112, 0, 0, 16,112, 0, 0, 16,112, 0, 0
+
+ .align 4
+vfilter_b:
+ .byte 128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128
+ .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+ .byte 112,112,112,112,112,112,112,112,112,112,112,112,112,112,112,112
+ .byte 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
+ .byte 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96
+ .byte 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32
+ .byte 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80
+ .byte 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48
+ .byte 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64
+ .byte 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64
+ .byte 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48
+ .byte 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80
+ .byte 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32
+ .byte 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96
+ .byte 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
+ .byte 112,112,112,112,112,112,112,112,112,112,112,112,112,112,112,112
+
+ .align 4
+b_hperm_b:
+ .byte 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15
+
+ .align 4
+b_0123_b:
+ .byte 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6
+
+ .align 4
+b_4567_b:
+ .byte 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10
+
+b_hilo_b:
+ .byte 0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23
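
The sub-pixel variance routines above run a first-pass horizontal bilinear filter whose taps come from hfilter_b (each tap pair sums to 128, with a rounding constant of 64 and a downshift of 7), then a second-pass vertical filter using vfilter_b, before feeding the filtered block into the same sum/sse reduction as the plain variance code. A minimal scalar sketch of one tap pair, purely for orientation; the helper name is hypothetical and not part of the patch.

/* Illustrative only: one bilinear tap pair as applied in each filter pass.
 * t0 + t1 == 128, so add the 64 rounding term and shift right by 7,
 * matching the hfilter_b / vfilter_b tables above. */
static unsigned char bilinear_tap(unsigned char a, unsigned char b,
                                  int t0, int t1) {
  return (unsigned char)((a * t0 + b * t1 + 64) >> 7);
}
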
--- /dev/null
+++ b/vp9/encoder/psnr.c
@@ -1,0 +1,30 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "vpx_scale/yv12config.h"
+#include "math.h"
+#include "vp9/common/systemdependent.h" /* for vp9_clear_system_state() */
+
+#define MAX_PSNR 100
+
+double vp9_mse2psnr(double Samples, double Peak, double Mse) {
+ double psnr;
+
+ if ((double)Mse > 0.0)
+ psnr = 10.0 * log10(Peak * Peak * Samples / Mse);
+ else
+ psnr = MAX_PSNR; // Limit to prevent / 0
+
+ if (psnr > MAX_PSNR)
+ psnr = MAX_PSNR;
+
+ return psnr;
+}
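
One usage note, inferred from the formula above rather than stated by the patch: because the function computes 10 * log10(Peak * Peak * Samples / Mse), the Mse argument is expected to carry the total squared error over Samples values; the Samples factor then reduces it to a per-sample MSE. A hypothetical caller for an 8-bit w x h plane might look like this.

/* Illustrative usage only: PSNR of an 8-bit w x h plane from its total
 * squared error. Peak is 255 for 8-bit samples. */
static double plane_psnr(int w, int h, double total_sq_error) {
  return vp9_mse2psnr((double)w * h, 255.0, total_sq_error);
}
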
--- /dev/null
+++ b/vp9/encoder/psnr.h
@@ -1,0 +1,17 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef __INC_PSNR_H
+#define __INC_PSNR_H
+
+extern double vp9_mse2psnr(double Samples, double Peak, double Mse);
+
+#endif
--- /dev/null
+++ b/vp9/encoder/quantize.c
@@ -1,0 +1,716 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <math.h>
+#include "vpx_mem/vpx_mem.h"
+
+#include "onyx_int.h"
+#include "quantize.h"
+#include "vp9/common/quant_common.h"
+
+#include "vp9/common/seg_common.h"
+
+#ifdef ENC_DEBUG
+extern int enc_debug;
+#endif
+
+void vp9_ht_quantize_b_4x4(BLOCK *b, BLOCKD *d, TX_TYPE tx_type) {
+ int i, rc, eob;
+ int zbin;
+ int x, y, z, sz;
+ short *zbin_boost_ptr = b->zrun_zbin_boost;
+ short *coeff_ptr = b->coeff;
+ short *zbin_ptr = b->zbin;
+ short *round_ptr = b->round;
+ short *quant_ptr = b->quant;
+ unsigned char *quant_shift_ptr = b->quant_shift;
+ short *qcoeff_ptr = d->qcoeff;
+ short *dqcoeff_ptr = d->dqcoeff;
+ short *dequant_ptr = d->dequant;
+ short zbin_oq_value = b->zbin_extra;
+
+  const int *pt_scan;
+
+ switch (tx_type) {
+ case ADST_DCT :
+ pt_scan = vp9_row_scan;
+ break;
+
+ case DCT_ADST :
+ pt_scan = vp9_col_scan;
+ break;
+
+ default :
+ pt_scan = vp9_default_zig_zag1d;
+ break;
+ }
+
+ vpx_memset(qcoeff_ptr, 0, 32);
+ vpx_memset(dqcoeff_ptr, 0, 32);
+
+ eob = -1;
+
+ for (i = 0; i < b->eob_max_offset; i++) {
+ rc = pt_scan[i];
+ z = coeff_ptr[rc];
+
+ zbin = zbin_ptr[rc] + *zbin_boost_ptr + zbin_oq_value;
+ zbin_boost_ptr ++;
+
+ sz = (z >> 31); // sign of z
+ x = (z ^ sz) - sz; // x = abs(z)
+
+ if (x >= zbin) {
+ x += round_ptr[rc];
+ y = (((x * quant_ptr[rc]) >> 16) + x)
+ >> quant_shift_ptr[rc]; // quantize (x)
+ x = (y ^ sz) - sz; // get the sign back
+ qcoeff_ptr[rc] = x; // write to destination
+ dqcoeff_ptr[rc] = x * dequant_ptr[rc]; // dequantized value
+
+ if (y) {
+ eob = i; // last nonzero coeffs
+ zbin_boost_ptr = b->zrun_zbin_boost; // reset zero runlength
+ }
+ }
+ }
+
+ d->eob = eob + 1;
+}
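
A brief restatement of the sign idiom used in this and the other quantizers below (illustrative only, not new behavior): for a 32-bit two's-complement int, sz = z >> 31 is 0 for non-negative z and -1 otherwise, so (z ^ sz) - sz yields abs(z), and applying the same operation to the quantized magnitude restores the original sign.

/* Illustrative helper (hypothetical name): (m ^ sz) - sz with sz in {0, -1}
 * negates m when sz == -1 and leaves it unchanged when sz == 0. */
static int apply_sign(int magnitude, int sz) {
  return (magnitude ^ sz) - sz;
}
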
+
+void vp9_regular_quantize_b_4x4(BLOCK *b, BLOCKD *d) {
+ int i, rc, eob;
+ int zbin;
+ int x, y, z, sz;
+ short *zbin_boost_ptr = b->zrun_zbin_boost;
+ short *coeff_ptr = b->coeff;
+ short *zbin_ptr = b->zbin;
+ short *round_ptr = b->round;
+ short *quant_ptr = b->quant;
+ unsigned char *quant_shift_ptr = b->quant_shift;
+ short *qcoeff_ptr = d->qcoeff;
+ short *dqcoeff_ptr = d->dqcoeff;
+ short *dequant_ptr = d->dequant;
+ short zbin_oq_value = b->zbin_extra;
+
+ vpx_memset(qcoeff_ptr, 0, 32);
+ vpx_memset(dqcoeff_ptr, 0, 32);
+
+ eob = -1;
+
+ for (i = 0; i < b->eob_max_offset; i++) {
+ rc = vp9_default_zig_zag1d[i];
+ z = coeff_ptr[rc];
+
+ zbin = zbin_ptr[rc] + *zbin_boost_ptr + zbin_oq_value;
+ zbin_boost_ptr ++;
+
+ sz = (z >> 31); // sign of z
+ x = (z ^ sz) - sz; // x = abs(z)
+
+ if (x >= zbin) {
+ x += round_ptr[rc];
+
+ y = (((x * quant_ptr[rc]) >> 16) + x)
+ >> quant_shift_ptr[rc]; // quantize (x)
+ x = (y ^ sz) - sz; // get the sign back
+ qcoeff_ptr[rc] = x; // write to destination
+ dqcoeff_ptr[rc] = x * dequant_ptr[rc]; // dequantized value
+
+ if (y) {
+ eob = i; // last nonzero coeffs
+ zbin_boost_ptr = b->zrun_zbin_boost; // reset zero runlength
+ }
+ }
+ }
+
+ d->eob = eob + 1;
+}
+
+void vp9_quantize_mby_4x4_c(MACROBLOCK *x) {
+ int i;
+ int has_2nd_order = x->e_mbd.mode_info_context->mbmi.mode != SPLITMV;
+
+ for (i = 0; i < 16; i++)
+ x->quantize_b_4x4(&x->block[i], &x->e_mbd.block[i]);
+
+ if (has_2nd_order)
+ x->quantize_b_4x4(&x->block[24], &x->e_mbd.block[24]);
+}
+
+void vp9_quantize_mbuv_4x4_c(MACROBLOCK *x) {
+ int i;
+
+ for (i = 16; i < 24; i++)
+ x->quantize_b_4x4(&x->block[i], &x->e_mbd.block[i]);
+}
+
+void vp9_quantize_mb_4x4_c(MACROBLOCK *x) {
+ vp9_quantize_mby_4x4_c(x);
+ vp9_quantize_mbuv_4x4_c(x);
+}
+
+void vp9_regular_quantize_b_2x2(BLOCK *b, BLOCKD *d) {
+ int i, rc, eob;
+ int zbin;
+ int x, y, z, sz;
+ short *zbin_boost_ptr = b->zrun_zbin_boost;
+ int zbin_zrun_index = 0;
+ short *coeff_ptr = b->coeff;
+ short *zbin_ptr = b->zbin;
+ short *round_ptr = b->round;
+ short *quant_ptr = b->quant;
+ unsigned char *quant_shift_ptr = b->quant_shift;
+ short *qcoeff_ptr = d->qcoeff;
+ short *dqcoeff_ptr = d->dqcoeff;
+ short *dequant_ptr = d->dequant;
+ short zbin_oq_value = b->zbin_extra;
+ // double q2nd = 4;
+ vpx_memset(qcoeff_ptr, 0, 32);
+ vpx_memset(dqcoeff_ptr, 0, 32);
+
+ eob = -1;
+
+ for (i = 0; i < b->eob_max_offset_8x8; i++) {
+ rc = vp9_default_zig_zag1d[i];
+ z = coeff_ptr[rc];
+
+ zbin_boost_ptr = &b->zrun_zbin_boost[zbin_zrun_index];
+ zbin_zrun_index += 4;
+ zbin = (zbin_ptr[rc] + *zbin_boost_ptr + zbin_oq_value);
+
+ sz = (z >> 31); // sign of z
+ x = (z ^ sz) - sz; // x = abs(z)
+
+ if (x >= zbin) {
+ x += (round_ptr[rc]);
+ y = ((int)((int)(x * quant_ptr[rc]) >> 16) + x)
+ >> quant_shift_ptr[rc]; // quantize (x)
+ x = (y ^ sz) - sz; // get the sign back
+ qcoeff_ptr[rc] = x; // write to destination
+ dqcoeff_ptr[rc] = x * dequant_ptr[rc]; // dequantized value
+
+ if (y) {
+ eob = i; // last nonzero coeffs
+ zbin_zrun_index = 0;
+ }
+ }
+ }
+
+ d->eob = eob + 1;
+}
+
+void vp9_regular_quantize_b_8x8(BLOCK *b, BLOCKD *d) {
+ int i, rc, eob;
+ int zbin;
+ int x, y, z, sz;
+ short *zbin_boost_ptr = b->zrun_zbin_boost_8x8;
+ short *coeff_ptr = b->coeff;
+ short *zbin_ptr = b->zbin_8x8;
+ short *round_ptr = b->round;
+ short *quant_ptr = b->quant;
+ unsigned char *quant_shift_ptr = b->quant_shift;
+ short *qcoeff_ptr = d->qcoeff;
+ short *dqcoeff_ptr = d->dqcoeff;
+ short *dequant_ptr = d->dequant;
+ short zbin_oq_value = b->zbin_extra;
+
+ vpx_memset(qcoeff_ptr, 0, 64 * sizeof(short));
+ vpx_memset(dqcoeff_ptr, 0, 64 * sizeof(short));
+
+ eob = -1;
+
+ for (i = 0; i < b->eob_max_offset_8x8; i++) {
+ rc = vp9_default_zig_zag1d_8x8[i];
+ z = coeff_ptr[rc];
+
+ zbin = (zbin_ptr[rc != 0] + *zbin_boost_ptr + zbin_oq_value);
+ zbin_boost_ptr++;
+
+ sz = (z >> 31); // sign of z
+ x = (z ^ sz) - sz; // x = abs(z)
+
+ if (x >= zbin) {
+ x += (round_ptr[rc != 0]);
+ y = ((int)(((int)(x * quant_ptr[rc != 0]) >> 16) + x))
+ >> quant_shift_ptr[rc != 0]; // quantize (x)
+ x = (y ^ sz) - sz; // get the sign back
+ qcoeff_ptr[rc] = x; // write to destination
+ dqcoeff_ptr[rc] = x * dequant_ptr[rc != 0]; // dequantized value
+
+ if (y) {
+ eob = i; // last nonzero coeffs
+ zbin_boost_ptr = b->zrun_zbin_boost_8x8;
+ }
+ }
+ }
+
+ d->eob = eob + 1;
+}
+
+void vp9_quantize_mby_8x8(MACROBLOCK *x) {
+ int i;
+ int has_2nd_order = x->e_mbd.mode_info_context->mbmi.mode != SPLITMV;
+
+ for (i = 0; i < 16; i ++) {
+ x->e_mbd.block[i].eob = 0;
+ }
+ x->e_mbd.block[24].eob = 0;
+ for (i = 0; i < 16; i += 4)
+ x->quantize_b_8x8(&x->block[i], &x->e_mbd.block[i]);
+
+ if (has_2nd_order)
+ x->quantize_b_2x2(&x->block[24], &x->e_mbd.block[24]);
+}
+
+void vp9_quantize_mbuv_8x8(MACROBLOCK *x) {
+ int i;
+
+ for (i = 16; i < 24; i ++)
+ x->e_mbd.block[i].eob = 0;
+ for (i = 16; i < 24; i += 4)
+ x->quantize_b_8x8(&x->block[i], &x->e_mbd.block[i]);
+}
+
+void vp9_quantize_mb_8x8(MACROBLOCK *x) {
+ vp9_quantize_mby_8x8(x);
+ vp9_quantize_mbuv_8x8(x);
+}
+
+void vp9_quantize_mby_16x16(MACROBLOCK *x) {
+ int i;
+
+ for (i = 0; i < 16; i++)
+ x->e_mbd.block[i].eob = 0;
+ x->e_mbd.block[24].eob = 0;
+ x->quantize_b_16x16(&x->block[0], &x->e_mbd.block[0]);
+}
+
+void vp9_quantize_mb_16x16(MACROBLOCK *x) {
+ vp9_quantize_mby_16x16(x);
+ vp9_quantize_mbuv_8x8(x);
+}
+
+void vp9_regular_quantize_b_16x16(BLOCK *b, BLOCKD *d) {
+ int i, rc, eob;
+ int zbin;
+ int x, y, z, sz;
+ short *zbin_boost_ptr = b->zrun_zbin_boost_16x16;
+ short *coeff_ptr = b->coeff;
+ short *zbin_ptr = b->zbin_16x16;
+ short *round_ptr = b->round;
+ short *quant_ptr = b->quant;
+ unsigned char *quant_shift_ptr = b->quant_shift;
+ short *qcoeff_ptr = d->qcoeff;
+ short *dqcoeff_ptr = d->dqcoeff;
+ short *dequant_ptr = d->dequant;
+ short zbin_oq_value = b->zbin_extra;
+
+ vpx_memset(qcoeff_ptr, 0, 256*sizeof(short));
+ vpx_memset(dqcoeff_ptr, 0, 256*sizeof(short));
+
+ eob = -1;
+ for (i = 0; i < b->eob_max_offset_16x16; i++) {
+ rc = vp9_default_zig_zag1d_16x16[i];
+ z = coeff_ptr[rc];
+
+ zbin = (zbin_ptr[rc!=0] + *zbin_boost_ptr + zbin_oq_value);
+ zbin_boost_ptr ++;
+
+ sz = (z >> 31); // sign of z
+ x = (z ^ sz) - sz; // x = abs(z)
+
+ if (x >= zbin) {
+ x += (round_ptr[rc!=0]);
+ y = ((int)(((int)(x * quant_ptr[rc!=0]) >> 16) + x))
+ >> quant_shift_ptr[rc!=0]; // quantize (x)
+ x = (y ^ sz) - sz; // get the sign back
+ qcoeff_ptr[rc] = x; // write to destination
+ dqcoeff_ptr[rc] = x * dequant_ptr[rc!=0]; // dequantized value
+
+ if (y) {
+ eob = i; // last nonzero coeffs
+ zbin_boost_ptr = b->zrun_zbin_boost_16x16;
+ }
+ }
+ }
+
+ d->eob = eob + 1;
+}
+
+/* The quantize_b_pair function pointer in the MACROBLOCK structure is set to
+ * this C function if a corresponding optimized routine is not available. The
+ * NEON optimized version currently implements fast quantization for a pair
+ * of blocks. */
+void vp9_regular_quantize_b_4x4_pair(BLOCK *b1, BLOCK *b2,
+ BLOCKD *d1, BLOCKD *d2) {
+ vp9_regular_quantize_b_4x4(b1, d1);
+ vp9_regular_quantize_b_4x4(b2, d2);
+}
+
+static void invert_quant(short *quant,
+ unsigned char *shift, short d) {
+ unsigned t;
+ int l;
+ t = d;
+ for (l = 0; t > 1; l++)
+ t >>= 1;
+ t = 1 + (1 << (16 + l)) / d;
+ *quant = (short)(t - (1 << 16));
+ *shift = l;
+}
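
A short worked check of invert_quant() may help (illustrative numbers, not part of the patch): for a dequant step d = 20, the loop gives l = 4, t = 1 + (1 << 20) / 20 = 52429, so the stored quant is 52429 - 65536 = -13107 and the shift is 4. Plugging these into the quantizer formula used throughout this file recovers the division without an actual divide.

/* Illustrative only: the reciprocal pair written by invert_quant() replaces a
 * divide by the dequant step with a multiply and two shifts. */
static int quantize_with_reciprocal(int x, short quant, unsigned char shift) {
  return (((x * quant) >> 16) + x) >> shift;  /* same formula as the quantizers above */
}
/* e.g. quantize_with_reciprocal(100, -13107, 4) == 5, i.e. 100 / 20. */
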
+
+void vp9_init_quantizer(VP9_COMP *cpi) {
+ int i;
+ int quant_val;
+ int Q;
+ static const int zbin_boost[16] = { 0, 0, 8, 10, 12, 14, 16, 20,
+ 24, 28, 32, 36, 40, 44, 44, 44
+ };
+
+ static const int zbin_boost_8x8[64] = { 0, 0, 0, 8, 8, 8, 10, 12,
+ 14, 16, 18, 20, 22, 24, 26, 28,
+ 30, 32, 34, 36, 38, 40, 42, 44,
+ 46, 48, 48, 48, 48, 48, 48, 48,
+ 48, 48, 48, 48, 48, 48, 48, 48,
+ 48, 48, 48, 48, 48, 48, 48, 48,
+ 48, 48, 48, 48, 48, 48, 48, 48,
+ 48, 48, 48, 48, 48, 48, 48, 48
+ };
+ static const int zbin_boost_16x16[256] = {
+ 0, 0, 0, 8, 8, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28,
+ 30, 32, 34, 36, 38, 40, 42, 44, 46, 48, 48, 48, 48, 48, 48, 48,
+ 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
+ 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
+ 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
+ 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
+ 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
+ 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
+ 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
+ 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
+ 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
+ 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
+ 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
+ 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
+ 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
+ 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
+ };
+ int qrounding_factor = 48;
+
+
+ for (Q = 0; Q < QINDEX_RANGE; Q++) {
+ int qzbin_factor = (vp9_dc_quant(Q, 0) < 148) ? 84 : 80;
+
+#if CONFIG_LOSSLESS
+ if (cpi->oxcf.lossless) {
+ if (Q == 0) {
+ qzbin_factor = 64;
+ qrounding_factor = 64;
+ }
+ }
+#endif
+
+ // dc values
+ quant_val = vp9_dc_quant(Q, cpi->common.y1dc_delta_q);
+ invert_quant(cpi->Y1quant[Q] + 0,
+ cpi->Y1quant_shift[Q] + 0, quant_val);
+ cpi->Y1zbin[Q][0] = ((qzbin_factor * quant_val) + 64) >> 7;
+ cpi->Y1zbin_8x8[Q][0] = ((qzbin_factor * quant_val) + 64) >> 7;
+ cpi->Y1zbin_16x16[Q][0] = ((qzbin_factor * quant_val) + 64) >> 7;
+ cpi->Y1round[Q][0] = (qrounding_factor * quant_val) >> 7;
+ cpi->common.Y1dequant[Q][0] = quant_val;
+ cpi->zrun_zbin_boost_y1[Q][0] = (quant_val * zbin_boost[0]) >> 7;
+ cpi->zrun_zbin_boost_y1_8x8[Q][0] =
+ ((quant_val * zbin_boost_8x8[0]) + 64) >> 7;
+ cpi->zrun_zbin_boost_y1_16x16[Q][0] = ((quant_val * zbin_boost_16x16[0]) + 64) >> 7;
+
+
+ quant_val = vp9_dc2quant(Q, cpi->common.y2dc_delta_q);
+ invert_quant(cpi->Y2quant[Q] + 0,
+ cpi->Y2quant_shift[Q] + 0, quant_val);
+ cpi->Y2zbin[Q][0] = ((qzbin_factor * quant_val) + 64) >> 7;
+ cpi->Y2zbin_8x8[Q][0] = ((qzbin_factor * quant_val) + 64) >> 7;
+ cpi->Y2zbin_16x16[Q][0] = ((qzbin_factor * quant_val) + 64) >> 7;
+ cpi->Y2round[Q][0] = (qrounding_factor * quant_val) >> 7;
+ cpi->common.Y2dequant[Q][0] = quant_val;
+ cpi->zrun_zbin_boost_y2[Q][0] = (quant_val * zbin_boost[0]) >> 7;
+ cpi->zrun_zbin_boost_y2_8x8[Q][0] =
+ ((quant_val * zbin_boost_8x8[0]) + 64) >> 7;
+ cpi->zrun_zbin_boost_y2_16x16[Q][0] = ((quant_val * zbin_boost_16x16[0]) + 64) >> 7;
+
+ quant_val = vp9_dc_uv_quant(Q, cpi->common.uvdc_delta_q);
+ invert_quant(cpi->UVquant[Q] + 0,
+ cpi->UVquant_shift[Q] + 0, quant_val);
+ cpi->UVzbin[Q][0] = ((qzbin_factor * quant_val) + 64) >> 7;
+ cpi->UVzbin_8x8[Q][0] = ((qzbin_factor * quant_val) + 64) >> 7;
+ cpi->UVzbin_16x16[Q][0] = ((qzbin_factor * quant_val) + 64) >> 7;
+ cpi->UVround[Q][0] = (qrounding_factor * quant_val) >> 7;
+ cpi->common.UVdequant[Q][0] = quant_val;
+ cpi->zrun_zbin_boost_uv[Q][0] = (quant_val * zbin_boost[0]) >> 7;
+ cpi->zrun_zbin_boost_uv_8x8[Q][0] =
+ ((quant_val * zbin_boost_8x8[0]) + 64) >> 7;
+ cpi->zrun_zbin_boost_uv_16x16[Q][0] = ((quant_val * zbin_boost_16x16[0]) + 64) >> 7;
+
+    // all the 4x4 ac values
+ for (i = 1; i < 16; i++) {
+ int rc = vp9_default_zig_zag1d[i];
+
+ quant_val = vp9_ac_yquant(Q);
+ invert_quant(cpi->Y1quant[Q] + rc,
+ cpi->Y1quant_shift[Q] + rc, quant_val);
+ cpi->Y1zbin[Q][rc] = ((qzbin_factor * quant_val) + 64) >> 7;
+ cpi->Y1round[Q][rc] = (qrounding_factor * quant_val) >> 7;
+ cpi->common.Y1dequant[Q][rc] = quant_val;
+ cpi->zrun_zbin_boost_y1[Q][i] =
+ ((quant_val * zbin_boost[i]) + 64) >> 7;
+
+ quant_val = vp9_ac2quant(Q, cpi->common.y2ac_delta_q);
+ invert_quant(cpi->Y2quant[Q] + rc,
+ cpi->Y2quant_shift[Q] + rc, quant_val);
+ cpi->Y2zbin[Q][rc] = ((qzbin_factor * quant_val) + 64) >> 7;
+ cpi->Y2round[Q][rc] = (qrounding_factor * quant_val) >> 7;
+ cpi->common.Y2dequant[Q][rc] = quant_val;
+ cpi->zrun_zbin_boost_y2[Q][i] =
+ ((quant_val * zbin_boost[i]) + 64) >> 7;
+
+ quant_val = vp9_ac_uv_quant(Q, cpi->common.uvac_delta_q);
+ invert_quant(cpi->UVquant[Q] + rc,
+ cpi->UVquant_shift[Q] + rc, quant_val);
+ cpi->UVzbin[Q][rc] = ((qzbin_factor * quant_val) + 64) >> 7;
+ cpi->UVround[Q][rc] = (qrounding_factor * quant_val) >> 7;
+ cpi->common.UVdequant[Q][rc] = quant_val;
+ cpi->zrun_zbin_boost_uv[Q][i] =
+ ((quant_val * zbin_boost[i]) + 64) >> 7;
+ }
+
+    // 8x8 structures... only zbin separated out for now
+    // This needs cleaning up for 8x8 especially if we are to add
+    // support for non-flat Q matrices
+ for (i = 1; i < 64; i++) {
+ int rc = vp9_default_zig_zag1d_8x8[i];
+
+ quant_val = vp9_ac_yquant(Q);
+ cpi->Y1zbin_8x8[Q][rc] = ((qzbin_factor * quant_val) + 64) >> 7;
+ cpi->zrun_zbin_boost_y1_8x8[Q][i] =
+ ((quant_val * zbin_boost_8x8[i]) + 64) >> 7;
+
+ quant_val = vp9_ac2quant(Q, cpi->common.y2ac_delta_q);
+ cpi->Y2zbin_8x8[Q][rc] = ((qzbin_factor * quant_val) + 64) >> 7;
+ cpi->zrun_zbin_boost_y2_8x8[Q][i] =
+ ((quant_val * zbin_boost_8x8[i]) + 64) >> 7;
+
+ quant_val = vp9_ac_uv_quant(Q, cpi->common.uvac_delta_q);
+ cpi->UVzbin_8x8[Q][rc] = ((qzbin_factor * quant_val) + 64) >> 7;
+ cpi->zrun_zbin_boost_uv_8x8[Q][i] =
+ ((quant_val * zbin_boost_8x8[i]) + 64) >> 7;
+ }
+
+ // 16x16 structures. Same comment above applies.
+ for (i = 1; i < 256; i++) {
+ int rc = vp9_default_zig_zag1d_16x16[i];
+
+ quant_val = vp9_ac_yquant(Q);
+ cpi->Y1zbin_16x16[Q][rc] = ((qzbin_factor * quant_val) + 64) >> 7;
+ cpi->zrun_zbin_boost_y1_16x16[Q][i] = ((quant_val * zbin_boost_16x16[i]) + 64) >> 7;
+
+ quant_val = vp9_ac2quant(Q, cpi->common.y2ac_delta_q);
+ cpi->Y2zbin_16x16[Q][rc] = ((qzbin_factor * quant_val) + 64) >> 7;
+ cpi->zrun_zbin_boost_y2_16x16[Q][i] = ((quant_val * zbin_boost_16x16[i]) + 64) >> 7;
+
+ quant_val = vp9_ac_uv_quant(Q, cpi->common.uvac_delta_q);
+ cpi->UVzbin_16x16[Q][rc] = ((qzbin_factor * quant_val) + 64) >> 7;
+ cpi->zrun_zbin_boost_uv_16x16[Q][i] = ((quant_val * zbin_boost_16x16[i]) + 64) >> 7;
+ }
+ }
+}
+
+void vp9_mb_init_quantizer(VP9_COMP *cpi, MACROBLOCK *x) {
+ int i;
+ int QIndex;
+ MACROBLOCKD *xd = &x->e_mbd;
+ int zbin_extra;
+ int segment_id = xd->mode_info_context->mbmi.segment_id;
+
+ // Select the baseline MB Q index allowing for any segment level change.
+ if (vp9_segfeature_active(xd, segment_id, SEG_LVL_ALT_Q)) {
+ // Abs Value
+ if (xd->mb_segment_abs_delta == SEGMENT_ABSDATA)
+ QIndex = vp9_get_segdata(xd, segment_id, SEG_LVL_ALT_Q);
+
+ // Delta Value
+ else {
+ QIndex = cpi->common.base_qindex +
+ vp9_get_segdata(xd, segment_id, SEG_LVL_ALT_Q);
+
+ // Clamp to valid range
+ QIndex = (QIndex >= 0) ? ((QIndex <= MAXQ) ? QIndex : MAXQ) : 0;
+ }
+ } else
+ QIndex = cpi->common.base_qindex;
+
+ // Y
+ zbin_extra = (cpi->common.Y1dequant[QIndex][1] *
+ (cpi->zbin_over_quant +
+ cpi->zbin_mode_boost +
+ x->act_zbin_adj)) >> 7;
+
+ for (i = 0; i < 16; i++) {
+ x->block[i].quant = cpi->Y1quant[QIndex];
+ x->block[i].quant_shift = cpi->Y1quant_shift[QIndex];
+ x->block[i].zbin = cpi->Y1zbin[QIndex];
+ x->block[i].zbin_8x8 = cpi->Y1zbin_8x8[QIndex];
+ x->block[i].zbin_16x16 = cpi->Y1zbin_16x16[QIndex];
+ x->block[i].round = cpi->Y1round[QIndex];
+ x->e_mbd.block[i].dequant = cpi->common.Y1dequant[QIndex];
+ x->block[i].zrun_zbin_boost = cpi->zrun_zbin_boost_y1[QIndex];
+ x->block[i].zrun_zbin_boost_8x8 = cpi->zrun_zbin_boost_y1_8x8[QIndex];
+ x->block[i].zrun_zbin_boost_16x16 = cpi->zrun_zbin_boost_y1_16x16[QIndex];
+ x->block[i].zbin_extra = (short)zbin_extra;
+
+ // Segment max eob offset feature.
+ if (vp9_segfeature_active(xd, segment_id, SEG_LVL_EOB)) {
+ x->block[i].eob_max_offset =
+ vp9_get_segdata(xd, segment_id, SEG_LVL_EOB);
+ x->block[i].eob_max_offset_8x8 =
+ vp9_get_segdata(xd, segment_id, SEG_LVL_EOB);
+ x->block[i].eob_max_offset_16x16 =
+ vp9_get_segdata(xd, segment_id, SEG_LVL_EOB);
+ } else {
+ x->block[i].eob_max_offset = 16;
+ x->block[i].eob_max_offset_8x8 = 64;
+ x->block[i].eob_max_offset_16x16 = 256;
+ }
+ }
+
+ // UV
+ zbin_extra = (cpi->common.UVdequant[QIndex][1] *
+ (cpi->zbin_over_quant +
+ cpi->zbin_mode_boost +
+ x->act_zbin_adj)) >> 7;
+
+ for (i = 16; i < 24; i++) {
+ x->block[i].quant = cpi->UVquant[QIndex];
+ x->block[i].quant_shift = cpi->UVquant_shift[QIndex];
+ x->block[i].zbin = cpi->UVzbin[QIndex];
+ x->block[i].zbin_8x8 = cpi->UVzbin_8x8[QIndex];
+ x->block[i].zbin_16x16 = cpi->UVzbin_16x16[QIndex];
+ x->block[i].round = cpi->UVround[QIndex];
+ x->e_mbd.block[i].dequant = cpi->common.UVdequant[QIndex];
+ x->block[i].zrun_zbin_boost = cpi->zrun_zbin_boost_uv[QIndex];
+ x->block[i].zrun_zbin_boost_8x8 = cpi->zrun_zbin_boost_uv_8x8[QIndex];
+ x->block[i].zrun_zbin_boost_16x16 = cpi->zrun_zbin_boost_uv_16x16[QIndex];
+
+ x->block[i].zbin_extra = (short)zbin_extra;
+
+ // Segment max eob offset feature.
+ if (vp9_segfeature_active(xd, segment_id, SEG_LVL_EOB)) {
+ x->block[i].eob_max_offset =
+ vp9_get_segdata(xd, segment_id, SEG_LVL_EOB);
+ x->block[i].eob_max_offset_8x8 =
+ vp9_get_segdata(xd, segment_id, SEG_LVL_EOB);
+ } else {
+ x->block[i].eob_max_offset = 16;
+ x->block[i].eob_max_offset_8x8 = 64;
+ }
+ }
+
+ // Y2
+ zbin_extra = (cpi->common.Y2dequant[QIndex][1] *
+ ((cpi->zbin_over_quant / 2) +
+ cpi->zbin_mode_boost +
+ x->act_zbin_adj)) >> 7;
+
+ x->block[24].quant = cpi->Y2quant[QIndex];
+ x->block[24].quant_shift = cpi->Y2quant_shift[QIndex];
+ x->block[24].zbin = cpi->Y2zbin[QIndex];
+ x->block[24].zbin_8x8 = cpi->Y2zbin_8x8[QIndex];
+ x->block[24].zbin_16x16 = cpi->Y2zbin_16x16[QIndex];
+ x->block[24].round = cpi->Y2round[QIndex];
+ x->e_mbd.block[24].dequant = cpi->common.Y2dequant[QIndex];
+ x->block[24].zrun_zbin_boost = cpi->zrun_zbin_boost_y2[QIndex];
+ x->block[24].zrun_zbin_boost_8x8 = cpi->zrun_zbin_boost_y2_8x8[QIndex];
+ x->block[24].zrun_zbin_boost_16x16 = cpi->zrun_zbin_boost_y2_16x16[QIndex];
+ x->block[24].zbin_extra = (short)zbin_extra;
+
+ // TBD perhaps not use for Y2
+ // Segment max eob offset feature.
+ if (vp9_segfeature_active(xd, segment_id, SEG_LVL_EOB)) {
+ x->block[24].eob_max_offset =
+ vp9_get_segdata(xd, segment_id, SEG_LVL_EOB);
+ x->block[24].eob_max_offset_8x8 =
+ vp9_get_segdata(xd, segment_id, SEG_LVL_EOB);
+ } else {
+ x->block[24].eob_max_offset = 16;
+ x->block[24].eob_max_offset_8x8 = 4;
+ }
+
+ /* save this macroblock QIndex for vp9_update_zbin_extra() */
+ x->e_mbd.q_index = QIndex;
+}
+
+void vp9_update_zbin_extra(VP9_COMP *cpi, MACROBLOCK *x) {
+ int i;
+ int QIndex = x->e_mbd.q_index;
+ int zbin_extra;
+
+ // Y
+ zbin_extra = (cpi->common.Y1dequant[QIndex][1] *
+ (cpi->zbin_over_quant +
+ cpi->zbin_mode_boost +
+ x->act_zbin_adj)) >> 7;
+ for (i = 0; i < 16; i++) {
+ x->block[i].zbin_extra = (short)zbin_extra;
+ }
+
+ // UV
+ zbin_extra = (cpi->common.UVdequant[QIndex][1] *
+ (cpi->zbin_over_quant +
+ cpi->zbin_mode_boost +
+ x->act_zbin_adj)) >> 7;
+
+ for (i = 16; i < 24; i++) {
+ x->block[i].zbin_extra = (short)zbin_extra;
+ }
+
+ // Y2
+ zbin_extra = (cpi->common.Y2dequant[QIndex][1] *
+ ((cpi->zbin_over_quant / 2) +
+ cpi->zbin_mode_boost +
+ x->act_zbin_adj)) >> 7;
+
+ x->block[24].zbin_extra = (short)zbin_extra;
+}
+
+void vp9_frame_init_quantizer(VP9_COMP *cpi) {
+ // Clear Zbin mode boost for default case
+ cpi->zbin_mode_boost = 0;
+
+ // MB level quantizer setup
+ vp9_mb_init_quantizer(cpi, &cpi->mb);
+}
+
+void vp9_set_quantizer(struct VP9_COMP *cpi, int Q) {
+ VP9_COMMON *cm = &cpi->common;
+
+ cm->base_qindex = Q;
+
+ // if any of the delta_q values are changing update flag will
+ // have to be set.
+ cm->y1dc_delta_q = 0;
+ cm->y2ac_delta_q = 0;
+ cm->uvdc_delta_q = 0;
+ cm->uvac_delta_q = 0;
+ cm->y2dc_delta_q = 0;
+
+ // quantizer has to be reinitialized if any delta_q changes.
+ // As there are not any here for now this is inactive code.
+ // if(update)
+ // vp9_init_quantizer(cpi);
+}
--- /dev/null
+++ b/vp9/encoder/quantize.h
@@ -1,0 +1,97 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef __INC_QUANTIZE_H
+#define __INC_QUANTIZE_H
+
+#include "block.h"
+
+#define prototype_quantize_block(sym) \
+ void (sym)(BLOCK *b,BLOCKD *d)
+
+#define prototype_quantize_block_pair(sym) \
+ void (sym)(BLOCK *b1, BLOCK *b2, BLOCKD *d1, BLOCKD *d2)
+
+#define prototype_quantize_mb(sym) \
+ void (sym)(MACROBLOCK *x)
+
+#if ARCH_X86 || ARCH_X86_64
+#include "x86/quantize_x86.h"
+#endif
+
+#if ARCH_ARM
+#include "arm/quantize_arm.h"
+#endif
+
+#define prototype_quantize_block_type(sym) \
+ void (sym)(BLOCK *b, BLOCKD *d, TX_TYPE type)
+extern prototype_quantize_block_type(vp9_ht_quantize_b_4x4);
+
+#ifndef vp9_quantize_quantb_4x4
+#define vp9_quantize_quantb_4x4 vp9_regular_quantize_b_4x4
+#endif
+extern prototype_quantize_block(vp9_quantize_quantb_4x4);
+
+#ifndef vp9_quantize_quantb_4x4_pair
+#define vp9_quantize_quantb_4x4_pair vp9_regular_quantize_b_4x4_pair
+#endif
+extern prototype_quantize_block_pair(vp9_quantize_quantb_4x4_pair);
+
+#ifndef vp9_quantize_quantb_8x8
+#define vp9_quantize_quantb_8x8 vp9_regular_quantize_b_8x8
+#endif
+extern prototype_quantize_block(vp9_quantize_quantb_8x8);
+
+#ifndef vp9_quantize_quantb_16x16
+#define vp9_quantize_quantb_16x16 vp9_regular_quantize_b_16x16
+#endif
+extern prototype_quantize_block(vp9_quantize_quantb_16x16);
+
+#ifndef vp9_quantize_quantb_2x2
+#define vp9_quantize_quantb_2x2 vp9_regular_quantize_b_2x2
+#endif
+extern prototype_quantize_block(vp9_quantize_quantb_2x2);
+
+#ifndef vp9_quantize_mb_4x4
+#define vp9_quantize_mb_4x4 vp9_quantize_mb_4x4_c
+#endif
+extern prototype_quantize_mb(vp9_quantize_mb_4x4);
+void vp9_quantize_mb_8x8(MACROBLOCK *x);
+
+#ifndef vp9_quantize_mbuv_4x4
+#define vp9_quantize_mbuv_4x4 vp9_quantize_mbuv_4x4_c
+#endif
+extern prototype_quantize_mb(vp9_quantize_mbuv_4x4);
+
+#ifndef vp9_quantize_mby_4x4
+#define vp9_quantize_mby_4x4 vp9_quantize_mby_4x4_c
+#endif
+extern prototype_quantize_mb(vp9_quantize_mby_4x4);
+
+extern prototype_quantize_mb(vp9_quantize_mby_8x8);
+extern prototype_quantize_mb(vp9_quantize_mbuv_8x8);
+
+void vp9_quantize_mb_16x16(MACROBLOCK *x);
+extern prototype_quantize_block(vp9_quantize_quantb_16x16);
+extern prototype_quantize_mb(vp9_quantize_mby_16x16);
+
+struct VP9_COMP;
+
+extern void vp9_set_quantizer(struct VP9_COMP *cpi, int Q);
+
+extern void vp9_frame_init_quantizer(struct VP9_COMP *cpi);
+
+extern void vp9_update_zbin_extra(struct VP9_COMP *cpi, MACROBLOCK *x);
+
+extern void vp9_mb_init_quantizer(struct VP9_COMP *cpi, MACROBLOCK *x);
+
+extern void vp9_init_quantizer(struct VP9_COMP *cpi);
+
+#endif
--- /dev/null
+++ b/vp9/encoder/ratectrl.c
@@ -1,0 +1,698 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <limits.h>
+#include <assert.h>
+
+#include "math.h"
+#include "vp9/common/alloccommon.h"
+#include "vp9/common/common.h"
+#include "ratectrl.h"
+#include "vp9/common/entropymode.h"
+#include "vpx_mem/vpx_mem.h"
+#include "vp9/common/systemdependent.h"
+#include "encodemv.h"
+#include "vp9/common/quant_common.h"
+
+#define MIN_BPB_FACTOR 0.005
+#define MAX_BPB_FACTOR 50
+
+#ifdef MODE_STATS
+extern unsigned int y_modes[VP9_YMODES];
+extern unsigned int uv_modes[VP9_UV_MODES];
+extern unsigned int b_modes[B_MODE_COUNT];
+
+extern unsigned int inter_y_modes[MB_MODE_COUNT];
+extern unsigned int inter_uv_modes[VP9_UV_MODES];
+extern unsigned int inter_b_modes[B_MODE_COUNT];
+#endif
+
+// Bits Per MB at different Q (Multiplied by 512)
+#define BPER_MB_NORMBITS 9
+
+// % adjustment to target kf size based on separation from previous frame
+static const int kf_boost_seperation_adjustment[16] = {
+ 30, 40, 50, 55, 60, 65, 70, 75,
+ 80, 85, 90, 95, 100, 100, 100, 100,
+};
+
+static const int gf_adjust_table[101] = {
+ 100,
+ 115, 130, 145, 160, 175, 190, 200, 210, 220, 230,
+ 240, 260, 270, 280, 290, 300, 310, 320, 330, 340,
+ 350, 360, 370, 380, 390, 400, 400, 400, 400, 400,
+ 400, 400, 400, 400, 400, 400, 400, 400, 400, 400,
+ 400, 400, 400, 400, 400, 400, 400, 400, 400, 400,
+ 400, 400, 400, 400, 400, 400, 400, 400, 400, 400,
+ 400, 400, 400, 400, 400, 400, 400, 400, 400, 400,
+ 400, 400, 400, 400, 400, 400, 400, 400, 400, 400,
+ 400, 400, 400, 400, 400, 400, 400, 400, 400, 400,
+ 400, 400, 400, 400, 400, 400, 400, 400, 400, 400,
+};
+
+static const int gf_intra_usage_adjustment[20] = {
+ 125, 120, 115, 110, 105, 100, 95, 85, 80, 75,
+ 70, 65, 60, 55, 50, 50, 50, 50, 50, 50,
+};
+
+static const int gf_interval_table[101] = {
+ 7,
+ 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+ 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+ 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+ 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
+ 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
+ 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
+ 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
+ 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
+ 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
+ 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
+};
+
+static const unsigned int prior_key_frame_weight[KEY_FRAME_CONTEXT] = { 1, 2, 3, 4, 5 };
+
+// These functions use formulaic calculations to make playing with the
+// quantizer tables easier. If necessary they can be replaced by lookup
+// tables if and when things settle down in the experimental bitstream
+double vp9_convert_qindex_to_q(int qindex) {
+ // Convert the index to a real Q value (scaled down to match old Q values)
+ return (double)vp9_ac_yquant(qindex) / 4.0;
+}
+
+int vp9_gfboost_qadjust(int qindex) {
+ int retval;
+ double q;
+
+ q = vp9_convert_qindex_to_q(qindex);
+ retval = (int)((0.00000828 * q * q * q) +
+ (-0.0055 * q * q) +
+ (1.32 * q) + 79.3);
+ return retval;
+}
+
+static int kfboost_qadjust(int qindex) {
+ int retval;
+ double q;
+
+ q = vp9_convert_qindex_to_q(qindex);
+ retval = (int)((0.00000973 * q * q * q) +
+ (-0.00613 * q * q) +
+ (1.316 * q) + 121.2);
+ return retval;
+}
+
+int vp9_bits_per_mb(FRAME_TYPE frame_type, int qindex) {
+ if (frame_type == KEY_FRAME)
+ return (int)(4500000 / vp9_convert_qindex_to_q(qindex));
+ else
+ return (int)(2850000 / vp9_convert_qindex_to_q(qindex));
+}
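
For orientation (illustrative numbers, not from the patch): these constants appear to live in the scaled domain described by the BPER_MB_NORMBITS comment above, so the return value is bits-per-MB multiplied by 512. At a qindex where vp9_ac_yquant() is 40, vp9_convert_qindex_to_q() returns 10.0 and an inter frame is estimated at 2850000 / 10 = 285000 scaled units, roughly 285000 >> 9, or about 556 actual bits per macroblock.

/* Illustrative only: undo the 1 << BPER_MB_NORMBITS scaling that
 * vp9_bits_per_mb() appears to carry (see estimate_bits_at_q below). */
static int bits_per_mb_unscaled(FRAME_TYPE frame_type, int qindex) {
  return vp9_bits_per_mb(frame_type, qindex) >> BPER_MB_NORMBITS;
}
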
+
+
+void vp9_save_coding_context(VP9_COMP *cpi) {
+ CODING_CONTEXT *const cc = &cpi->coding_context;
+ VP9_COMMON *cm = &cpi->common;
+ MACROBLOCKD *xd = &cpi->mb.e_mbd;
+
+ // Stores a snapshot of key state variables which can subsequently be
+ // restored with a call to vp9_restore_coding_context. These functions are
+ // intended for use in a re-code loop in vp9_compress_frame where the
+ // quantizer value is adjusted between loop iterations.
+
+ cc->nmvc = cm->fc.nmvc;
+ vp9_copy(cc->nmvjointcost, cpi->mb.nmvjointcost);
+ vp9_copy(cc->nmvcosts, cpi->mb.nmvcosts);
+ vp9_copy(cc->nmvcosts_hp, cpi->mb.nmvcosts_hp);
+
+ vp9_copy(cc->mv_ref_ct, cm->fc.mv_ref_ct);
+ vp9_copy(cc->mode_context, cm->fc.mode_context);
+ vp9_copy(cc->mv_ref_ct_a, cm->fc.mv_ref_ct_a);
+ vp9_copy(cc->mode_context_a, cm->fc.mode_context_a);
+
+ vp9_copy(cc->ymode_prob, cm->fc.ymode_prob);
+ vp9_copy(cc->bmode_prob, cm->fc.bmode_prob);
+ vp9_copy(cc->uv_mode_prob, cm->fc.uv_mode_prob);
+ vp9_copy(cc->i8x8_mode_prob, cm->fc.i8x8_mode_prob);
+ vp9_copy(cc->sub_mv_ref_prob, cm->fc.sub_mv_ref_prob);
+ vp9_copy(cc->mbsplit_prob, cm->fc.mbsplit_prob);
+
+ // Stats
+#ifdef MODE_STATS
+ vp9_copy(cc->y_modes, y_modes);
+ vp9_copy(cc->uv_modes, uv_modes);
+ vp9_copy(cc->b_modes, b_modes);
+ vp9_copy(cc->inter_y_modes, inter_y_modes);
+ vp9_copy(cc->inter_uv_modes, inter_uv_modes);
+ vp9_copy(cc->inter_b_modes, inter_b_modes);
+#endif
+
+ vp9_copy(cc->segment_pred_probs, cm->segment_pred_probs);
+ vp9_copy(cc->ref_pred_probs_update, cpi->ref_pred_probs_update);
+ vp9_copy(cc->ref_pred_probs, cm->ref_pred_probs);
+ vp9_copy(cc->prob_comppred, cm->prob_comppred);
+
+ vpx_memcpy(cpi->coding_context.last_frame_seg_map_copy,
+ cm->last_frame_seg_map, (cm->mb_rows * cm->mb_cols));
+
+ vp9_copy(cc->last_ref_lf_deltas, xd->last_ref_lf_deltas);
+ vp9_copy(cc->last_mode_lf_deltas, xd->last_mode_lf_deltas);
+
+ vp9_copy(cc->coef_probs, cm->fc.coef_probs);
+ vp9_copy(cc->hybrid_coef_probs, cm->fc.hybrid_coef_probs);
+ vp9_copy(cc->coef_probs_8x8, cm->fc.coef_probs_8x8);
+ vp9_copy(cc->hybrid_coef_probs_8x8, cm->fc.hybrid_coef_probs_8x8);
+ vp9_copy(cc->coef_probs_16x16, cm->fc.coef_probs_16x16);
+ vp9_copy(cc->hybrid_coef_probs_16x16, cm->fc.hybrid_coef_probs_16x16);
+ vp9_copy(cc->switchable_interp_prob, cm->fc.switchable_interp_prob);
+}
+
+void vp9_restore_coding_context(VP9_COMP *cpi) {
+ CODING_CONTEXT *const cc = &cpi->coding_context;
+ VP9_COMMON *cm = &cpi->common;
+ MACROBLOCKD *xd = &cpi->mb.e_mbd;
+
+ // Restore key state variables to the snapshot state stored in the
+ // previous call to vp9_save_coding_context.
+
+ cm->fc.nmvc = cc->nmvc;
+ vp9_copy(cpi->mb.nmvjointcost, cc->nmvjointcost);
+ vp9_copy(cpi->mb.nmvcosts, cc->nmvcosts);
+ vp9_copy(cpi->mb.nmvcosts_hp, cc->nmvcosts_hp);
+
+ vp9_copy(cm->fc.mv_ref_ct, cc->mv_ref_ct);
+ vp9_copy(cm->fc.mode_context, cc->mode_context);
+ vp9_copy(cm->fc.mv_ref_ct_a, cc->mv_ref_ct_a);
+ vp9_copy(cm->fc.mode_context_a, cc->mode_context_a);
+
+ vp9_copy(cm->fc.ymode_prob, cc->ymode_prob);
+ vp9_copy(cm->fc.bmode_prob, cc->bmode_prob);
+ vp9_copy(cm->fc.i8x8_mode_prob, cc->i8x8_mode_prob);
+ vp9_copy(cm->fc.uv_mode_prob, cc->uv_mode_prob);
+ vp9_copy(cm->fc.sub_mv_ref_prob, cc->sub_mv_ref_prob);
+ vp9_copy(cm->fc.mbsplit_prob, cc->mbsplit_prob);
+
+ // Stats
+#ifdef MODE_STATS
+ vp9_copy(y_modes, cc->y_modes);
+ vp9_copy(uv_modes, cc->uv_modes);
+ vp9_copy(b_modes, cc->b_modes);
+ vp9_copy(inter_y_modes, cc->inter_y_modes);
+ vp9_copy(inter_uv_modes, cc->inter_uv_modes);
+ vp9_copy(inter_b_modes, cc->inter_b_modes);
+#endif
+
+ vp9_copy(cm->segment_pred_probs, cc->segment_pred_probs);
+ vp9_copy(cpi->ref_pred_probs_update, cc->ref_pred_probs_update);
+ vp9_copy(cm->ref_pred_probs, cc->ref_pred_probs);
+ vp9_copy(cm->prob_comppred, cc->prob_comppred);
+
+ vpx_memcpy(cm->last_frame_seg_map,
+ cpi->coding_context.last_frame_seg_map_copy,
+ (cm->mb_rows * cm->mb_cols));
+
+ vp9_copy(xd->last_ref_lf_deltas, cc->last_ref_lf_deltas);
+ vp9_copy(xd->last_mode_lf_deltas, cc->last_mode_lf_deltas);
+
+ vp9_copy(cm->fc.coef_probs, cc->coef_probs);
+ vp9_copy(cm->fc.hybrid_coef_probs, cc->hybrid_coef_probs);
+ vp9_copy(cm->fc.coef_probs_8x8, cc->coef_probs_8x8);
+ vp9_copy(cm->fc.hybrid_coef_probs_8x8, cc->hybrid_coef_probs_8x8);
+ vp9_copy(cm->fc.coef_probs_16x16, cc->coef_probs_16x16);
+ vp9_copy(cm->fc.hybrid_coef_probs_16x16, cc->hybrid_coef_probs_16x16);
+ vp9_copy(cm->fc.switchable_interp_prob, cc->switchable_interp_prob);
+}
+
+
+void vp9_setup_key_frame(VP9_COMP *cpi) {
+ VP9_COMMON *cm = &cpi->common;
+ // Setup for Key frame:
+ vp9_default_coef_probs(& cpi->common);
+ vp9_kf_default_bmode_probs(cpi->common.kf_bmode_prob);
+ vp9_init_mbmode_probs(& cpi->common);
+ vp9_default_bmode_probs(cm->fc.bmode_prob);
+
+ vp9_init_mv_probs(& cpi->common);
+
+ // cpi->common.filter_level = 0; // Reset every key frame.
+ cpi->common.filter_level = cpi->common.base_qindex * 3 / 8;
+
+ // interval before next GF
+ cpi->frames_till_gf_update_due = cpi->baseline_gf_interval;
+
+ cpi->common.refresh_golden_frame = TRUE;
+ cpi->common.refresh_alt_ref_frame = TRUE;
+
+ vp9_init_mode_contexts(&cpi->common);
+ vpx_memcpy(&cpi->common.lfc, &cpi->common.fc, sizeof(cpi->common.fc));
+ vpx_memcpy(&cpi->common.lfc_a, &cpi->common.fc, sizeof(cpi->common.fc));
+
+ vpx_memset(cm->prev_mip, 0,
+ (cm->mb_cols + 1) * (cm->mb_rows + 1)* sizeof(MODE_INFO));
+ vpx_memset(cm->mip, 0,
+ (cm->mb_cols + 1) * (cm->mb_rows + 1)* sizeof(MODE_INFO));
+
+ vp9_update_mode_info_border(cm, cm->mip);
+ vp9_update_mode_info_in_image(cm, cm->mi);
+}
+
+void vp9_setup_inter_frame(VP9_COMP *cpi) {
+ if (cpi->common.refresh_alt_ref_frame) {
+ vpx_memcpy(&cpi->common.fc,
+ &cpi->common.lfc_a,
+ sizeof(cpi->common.fc));
+ vpx_memcpy(cpi->common.fc.vp8_mode_contexts,
+ cpi->common.fc.mode_context_a,
+ sizeof(cpi->common.fc.vp8_mode_contexts));
+ } else {
+ vpx_memcpy(&cpi->common.fc,
+ &cpi->common.lfc,
+ sizeof(cpi->common.fc));
+ vpx_memcpy(cpi->common.fc.vp8_mode_contexts,
+ cpi->common.fc.mode_context,
+ sizeof(cpi->common.fc.vp8_mode_contexts));
+ }
+}
+
+
+static int estimate_bits_at_q(int frame_kind, int Q, int MBs,
+ double correction_factor) {
+ int Bpm = (int)(.5 + correction_factor * vp9_bits_per_mb(frame_kind, Q));
+
+ /* Attempt to retain reasonable accuracy without overflow. The cutoff is
+ * chosen such that the maximum product of Bpm and MBs fits 31 bits. The
+ * largest Bpm takes 20 bits.
+ */
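+ /* Example, assuming BPER_MB_NORMBITS is 9: for a 1080p frame MBs is 8160
+  * (> 1 << 11), so we compute (Bpm >> 9) * MBs rather than (Bpm * MBs) >> 9,
+  * trading a little precision for 32-bit headroom.
+  */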
+ if (MBs > (1 << 11))
+ return (Bpm >> BPER_MB_NORMBITS) * MBs;
+ else
+ return (Bpm * MBs) >> BPER_MB_NORMBITS;
+}
+
+
+static void calc_iframe_target_size(VP9_COMP *cpi) {
+ // boost defaults to half second
+ int target;
+
+ // Clear down mmx registers to allow floating point in what follows
+ vp9_clear_system_state(); // __asm emms;
+
+ // New Two pass RC
+ target = cpi->per_frame_bandwidth;
+
+ if (cpi->oxcf.rc_max_intra_bitrate_pct) {
+ unsigned int max_rate = cpi->per_frame_bandwidth
+ * cpi->oxcf.rc_max_intra_bitrate_pct / 100;
+
+ if (target > max_rate)
+ target = max_rate;
+ }
+
+ cpi->this_frame_target = target;
+
+}
+
+
+// Do the best we can to define the parameters for the next GF, based
+// on what information we have available.
+//
+// In this experimental code only two pass is supported
+// so we just use the interval determined in the two pass code.
+static void calc_gf_params(VP9_COMP *cpi) {
+ // Set the gf interval
+ cpi->frames_till_gf_update_due = cpi->baseline_gf_interval;
+}
+
+
+static void calc_pframe_target_size(VP9_COMP *cpi) {
+ int min_frame_target = cpi->min_frame_bandwidth;
+
+ if (min_frame_target < (cpi->av_per_frame_bandwidth >> 5))
+ min_frame_target = cpi->av_per_frame_bandwidth >> 5;
+
+
+ // Special alt reference frame case
+ if (cpi->common.refresh_alt_ref_frame) {
+ // Per frame bit target for the alt ref frame
+ cpi->per_frame_bandwidth = cpi->twopass.gf_bits;
+ cpi->this_frame_target = cpi->per_frame_bandwidth;
+ }
+
+ // Normal frames (gf and inter)
+ else {
+ cpi->this_frame_target = cpi->per_frame_bandwidth;
+ }
+
+ // Sanity check that the total sum of adjustments is not above the maximum
+ // allowed. That is, having allowed for the KF and GF penalties, we have not
+ // pushed the current inter-frame target too low. If the adjustment we apply
+ // here cannot recover all the extra bits spent on the KF or GF, the
+ // remainder will have to be recovered over a longer time span via other
+ // buffer / rate control mechanisms.
+ if (cpi->this_frame_target < min_frame_target)
+ cpi->this_frame_target = min_frame_target;
+
+ if (!cpi->common.refresh_alt_ref_frame)
+ // Note the baseline target data rate for this inter frame.
+ cpi->inter_frame_target = cpi->this_frame_target;
+
+ // Adjust target frame size for Golden Frames:
+ if (cpi->frames_till_gf_update_due == 0) {
+ // int Boost = 0;
+ int Q = (cpi->oxcf.fixed_q < 0) ? cpi->last_q[INTER_FRAME] : cpi->oxcf.fixed_q;
+
+ cpi->common.refresh_golden_frame = TRUE;
+
+ calc_gf_params(cpi);
+
+ // If we are using an alternate ref instead of a gf then do not apply the
+ // boost here; it will instead be applied to the altref update.
+ // Jim's modified boost
+ if (!cpi->source_alt_ref_active) {
+ if (cpi->oxcf.fixed_q < 0) {
+ // The spend on the GF is defined in the two pass code
+ // for two pass encodes
+ cpi->this_frame_target = cpi->per_frame_bandwidth;
+ } else
+ cpi->this_frame_target =
+ (estimate_bits_at_q(1, Q, cpi->common.MBs, 1.0)
+ * cpi->last_boost) / 100;
+
+ }
+ // If there is an active ARF at this location use the minimum
+ // bits on this frame even if it is a constructed arf.
+ // The active maximum quantizer ensures that an appropriate
+ // number of bits will be spent if needed for constructed ARFs.
+ else {
+ cpi->this_frame_target = 0;
+ }
+
+ cpi->current_gf_interval = cpi->frames_till_gf_update_due;
+ }
+}
+
+
+void vp9_update_rate_correction_factors(VP9_COMP *cpi, int damp_var) {
+ int Q = cpi->common.base_qindex;
+ int correction_factor = 100;
+ double rate_correction_factor;
+ double adjustment_limit;
+
+ int projected_size_based_on_q = 0;
+
+ // Clear down mmx registers to allow floating point in what follows
+ vp9_clear_system_state(); // __asm emms;
+
+ if (cpi->common.frame_type == KEY_FRAME) {
+ rate_correction_factor = cpi->key_frame_rate_correction_factor;
+ } else {
+ if (cpi->common.refresh_alt_ref_frame || cpi->common.refresh_golden_frame)
+ rate_correction_factor = cpi->gf_rate_correction_factor;
+ else
+ rate_correction_factor = cpi->rate_correction_factor;
+ }
+
+ // Work out how big we would have expected the frame to be at this Q given
+ // the current correction factor. Stay in double to avoid int overflow when
+ // values are large.
+ projected_size_based_on_q =
+ (int)(((.5 + rate_correction_factor *
+ vp9_bits_per_mb(cpi->common.frame_type, Q)) *
+ cpi->common.MBs) / (1 << BPER_MB_NORMBITS));
+
+ // Make some allowance for cpi->zbin_over_quant
+ if (cpi->zbin_over_quant > 0) {
+ int Z = cpi->zbin_over_quant;
+ double Factor = 0.99;
+ double factor_adjustment = 0.01 / 256.0; // (double)ZBIN_OQ_MAX;
+
+ while (Z > 0) {
+ Z--;
+ projected_size_based_on_q =
+ (int)(Factor * projected_size_based_on_q);
+ Factor += factor_adjustment;
+
+ if (Factor >= 0.999)
+ Factor = 0.999;
+ }
+ }
+
+ // Work out a size correction factor.
+ // if ( cpi->this_frame_target > 0 )
+ // correction_factor = (100 * cpi->projected_frame_size) / cpi->this_frame_target;
+ if (projected_size_based_on_q > 0)
+ correction_factor = (100 * cpi->projected_frame_size) / projected_size_based_on_q;
+
+ // A more heavily damped adjustment is used if we have been oscillating
+ // either side of the target.
+ switch (damp_var) {
+ case 0:
+ adjustment_limit = 0.75;
+ break;
+ case 1:
+ adjustment_limit = 0.375;
+ break;
+ case 2:
+ default:
+ adjustment_limit = 0.25;
+ break;
+ }
+
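+ // Worked example: if the frame came out 10% larger than projected
+ // (correction_factor == 110) and damp_var == 0, the damped factor is
+ // (int)(100.5 + 10 * 0.75) == 108, so rate_correction_factor is scaled
+ // up by 8%.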
+ // if ( (correction_factor > 102) && (Q < cpi->active_worst_quality) )
+ if (correction_factor > 102) {
+ // We are not already at the worst allowable quality
+ correction_factor = (int)(100.5 + ((correction_factor - 100) * adjustment_limit));
+ rate_correction_factor = ((rate_correction_factor * correction_factor) / 100);
+
+ // Keep rate_correction_factor within limits
+ if (rate_correction_factor > MAX_BPB_FACTOR)
+ rate_correction_factor = MAX_BPB_FACTOR;
+ }
+ // else if ( (correction_factor < 99) && (Q > cpi->active_best_quality) )
+ else if (correction_factor < 99) {
+ // We are not already at the best allowable quality
+ correction_factor = (int)(100.5 - ((100 - correction_factor) * adjustment_limit));
+ rate_correction_factor = ((rate_correction_factor * correction_factor) / 100);
+
+ // Keep rate_correction_factor within limits
+ if (rate_correction_factor < MIN_BPB_FACTOR)
+ rate_correction_factor = MIN_BPB_FACTOR;
+ }
+
+ if (cpi->common.frame_type == KEY_FRAME)
+ cpi->key_frame_rate_correction_factor = rate_correction_factor;
+ else {
+ if (cpi->common.refresh_alt_ref_frame || cpi->common.refresh_golden_frame)
+ cpi->gf_rate_correction_factor = rate_correction_factor;
+ else
+ cpi->rate_correction_factor = rate_correction_factor;
+ }
+}
+
+
+int vp9_regulate_q(VP9_COMP *cpi, int target_bits_per_frame) {
+ int Q = cpi->active_worst_quality;
+
+ int i;
+ int last_error = INT_MAX;
+ int target_bits_per_mb;
+ int bits_per_mb_at_this_q;
+ double correction_factor;
+
+ // Reset Zbin OQ value
+ cpi->zbin_over_quant = 0;
+
+ // Select the appropriate correction factor based upon type of frame.
+ if (cpi->common.frame_type == KEY_FRAME)
+ correction_factor = cpi->key_frame_rate_correction_factor;
+ else {
+ if (cpi->common.refresh_alt_ref_frame || cpi->common.refresh_golden_frame)
+ correction_factor = cpi->gf_rate_correction_factor;
+ else
+ correction_factor = cpi->rate_correction_factor;
+ }
+
+ // Calculate the required scaling factor based on the target frame size and
+ // the size of the frame produced using the previous Q.
+ if (target_bits_per_frame >= (INT_MAX >> BPER_MB_NORMBITS))
+ target_bits_per_mb = (target_bits_per_frame / cpi->common.MBs) << BPER_MB_NORMBITS; // Case where we would overflow int
+ else
+ target_bits_per_mb = (target_bits_per_frame << BPER_MB_NORMBITS) / cpi->common.MBs;
+
+ i = cpi->active_best_quality;
+
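+ // Walk up from active_best_quality towards active_worst_quality and stop
+ // at the first Q whose projected bits/MB is at or below the target. If the
+ // undershoot at that Q is larger than the overshoot recorded at the
+ // previous Q, step back to the previous value.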
+ do {
+ bits_per_mb_at_this_q =
+ (int)(.5 + correction_factor *
+ vp9_bits_per_mb(cpi->common.frame_type, i));
+
+ if (bits_per_mb_at_this_q <= target_bits_per_mb) {
+ if ((target_bits_per_mb - bits_per_mb_at_this_q) <= last_error)
+ Q = i;
+ else
+ Q = i - 1;
+
+ break;
+ } else
+ last_error = bits_per_mb_at_this_q - target_bits_per_mb;
+ } while (++i <= cpi->active_worst_quality);
+
+
+ // If we are at MAXQ then enable Q over-run, which seeks to claw back
+ // additional bits through things like the RD multiplier and the zero bin
+ // size.
+ if (Q >= MAXQ) {
+ int zbin_oqmax;
+
+ double Factor = 0.99;
+ double factor_adjustment = 0.01 / 256.0; // (double)ZBIN_OQ_MAX;
+
+ if (cpi->common.frame_type == KEY_FRAME)
+ zbin_oqmax = 0; // ZBIN_OQ_MAX/16
+ else if (cpi->common.refresh_alt_ref_frame || (cpi->common.refresh_golden_frame && !cpi->source_alt_ref_active))
+ zbin_oqmax = 16;
+ else
+ zbin_oqmax = ZBIN_OQ_MAX;
+
+ // Each increment in the zbin is assumed to have a fixed effect on bitrate.
+ // This is of course not true: the effect will be highly clip dependent and
+ // may well have sudden steps. The idea here is to achieve higher effective
+ // quantizers than the normal maximum by expanding the zero bin and hence
+ // decreasing the number of low-magnitude non-zero coefficients.
+ while (cpi->zbin_over_quant < zbin_oqmax) {
+ cpi->zbin_over_quant++;
+
+ if (cpi->zbin_over_quant > zbin_oqmax)
+ cpi->zbin_over_quant = zbin_oqmax;
+
+ // Adjust bits_per_mb_at_this_q estimate
+ bits_per_mb_at_this_q = (int)(Factor * bits_per_mb_at_this_q);
+ Factor += factor_adjustment;
+
+ if (Factor >= 0.999)
+ Factor = 0.999;
+
+ // Break out if we get down to the target rate
+ if (bits_per_mb_at_this_q <= target_bits_per_mb)
+ break;
+ }
+
+ }
+
+ return Q;
+}
+
+
+static int estimate_keyframe_frequency(VP9_COMP *cpi) {
+ int i;
+
+ // Average key frame frequency
+ int av_key_frame_frequency = 0;
+
+ /* First key frame at start of sequence is a special case. We have no
+ * frequency data.
+ */
+ if (cpi->key_frame_count == 1) {
+ /* Assume a default of 1 kf every 2 seconds, or the max kf interval,
+ * whichever is smaller.
+ */
+ int key_freq = cpi->oxcf.key_freq > 0 ? cpi->oxcf.key_freq : 1;
+ av_key_frame_frequency = (int)cpi->output_frame_rate * 2;
+
+ if (cpi->oxcf.auto_key && av_key_frame_frequency > key_freq)
+ av_key_frame_frequency = cpi->oxcf.key_freq;
+
+ cpi->prior_key_frame_distance[KEY_FRAME_CONTEXT - 1]
+ = av_key_frame_frequency;
+ } else {
+ unsigned int total_weight = 0;
+ int last_kf_interval =
+ (cpi->frames_since_key > 0) ? cpi->frames_since_key : 1;
+
+ /* reset keyframe context and calculate weighted average of last
+ * KEY_FRAME_CONTEXT keyframes
+ */
+ for (i = 0; i < KEY_FRAME_CONTEXT; i++) {
+ if (i < KEY_FRAME_CONTEXT - 1)
+ cpi->prior_key_frame_distance[i]
+ = cpi->prior_key_frame_distance[i + 1];
+ else
+ cpi->prior_key_frame_distance[i] = last_kf_interval;
+
+ av_key_frame_frequency += prior_key_frame_weight[i]
+ * cpi->prior_key_frame_distance[i];
+ total_weight += prior_key_frame_weight[i];
+ }
+
+ av_key_frame_frequency /= total_weight;
+
+ }
+ return av_key_frame_frequency;
+}
+
+
+void vp9_adjust_key_frame_context(VP9_COMP *cpi) {
+ // Clear down mmx registers to allow floating point in what follows
+ vp9_clear_system_state();
+
+ cpi->frames_since_key = 0;
+ cpi->key_frame_count++;
+}
+
+
+void vp9_compute_frame_size_bounds(VP9_COMP *cpi, int *frame_under_shoot_limit,
+ int *frame_over_shoot_limit) {
+ // Set-up bounds on acceptable frame size:
+ if (cpi->oxcf.fixed_q >= 0) {
+ // Fixed Q scenario: the frame size can never miss its target
+ // (there is no target!).
+ *frame_under_shoot_limit = 0;
+ *frame_over_shoot_limit = INT_MAX;
+ } else {
+ if (cpi->common.frame_type == KEY_FRAME) {
+ *frame_over_shoot_limit = cpi->this_frame_target * 9 / 8;
+ *frame_under_shoot_limit = cpi->this_frame_target * 7 / 8;
+ } else {
+ if (cpi->common.refresh_alt_ref_frame || cpi->common.refresh_golden_frame) {
+ *frame_over_shoot_limit = cpi->this_frame_target * 9 / 8;
+ *frame_under_shoot_limit = cpi->this_frame_target * 7 / 8;
+ } else {
+ // Stronger overshoot limit for constrained quality
+ if (cpi->oxcf.end_usage == USAGE_CONSTRAINED_QUALITY) {
+ *frame_over_shoot_limit = cpi->this_frame_target * 11 / 8;
+ *frame_under_shoot_limit = cpi->this_frame_target * 2 / 8;
+ } else {
+ *frame_over_shoot_limit = cpi->this_frame_target * 11 / 8;
+ *frame_under_shoot_limit = cpi->this_frame_target * 5 / 8;
+ }
+ }
+ }
+
+ // For very small rate targets, where the fractional adjustment
+ // (e.g. * 7/8) may be tiny, make sure there is at least a minimum
+ // range.
+ *frame_over_shoot_limit += 200;
+ *frame_under_shoot_limit -= 200;
+ if (*frame_under_shoot_limit < 0)
+ *frame_under_shoot_limit = 0;
+ }
+}
+
+
+// return of 0 means drop frame
+int vp9_pick_frame_size(VP9_COMP *cpi) {
+ VP9_COMMON *cm = &cpi->common;
+
+ if (cm->frame_type == KEY_FRAME)
+ calc_iframe_target_size(cpi);
+ else
+ calc_pframe_target_size(cpi);
+
+ return 1;
+}
--- /dev/null
+++ b/vp9/encoder/ratectrl.h
@@ -1,0 +1,37 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#if !defined __INC_RATECTRL_H
+#define __INC_RATECTRL_H
+
+#include "onyx_int.h"
+
+#define FRAME_OVERHEAD_BITS 200
+
+extern void vp9_save_coding_context(VP9_COMP *cpi);
+extern void vp9_restore_coding_context(VP9_COMP *cpi);
+
+extern void vp9_setup_key_frame(VP9_COMP *cpi);
+extern void vp9_update_rate_correction_factors(VP9_COMP *cpi, int damp_var);
+extern int vp9_regulate_q(VP9_COMP *cpi, int target_bits_per_frame);
+extern void vp9_adjust_key_frame_context(VP9_COMP *cpi);
+extern void vp9_compute_frame_size_bounds(VP9_COMP *cpi,
+ int *frame_under_shoot_limit,
+ int *frame_over_shoot_limit);
+
+// return of 0 means drop frame
+extern int vp9_pick_frame_size(VP9_COMP *cpi);
+
+extern double vp9_convert_qindex_to_q(int qindex);
+extern int vp9_gfboost_qadjust(int qindex);
+extern int vp9_bits_per_mb(FRAME_TYPE frame_type, int qindex);
+void vp9_setup_inter_frame(VP9_COMP *cpi);
+
+#endif
--- /dev/null
+++ b/vp9/encoder/rdopt.c
@@ -1,0 +1,4854 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include <stdio.h>
+#include <math.h>
+#include <limits.h>
+#include <assert.h>
+#include "vp9/common/pragmas.h"
+
+#include "tokenize.h"
+#include "treewriter.h"
+#include "onyx_int.h"
+#include "modecosts.h"
+#include "encodeintra.h"
+#include "vp9/common/entropymode.h"
+#include "vp9/common/reconinter.h"
+#include "vp9/common/reconintra.h"
+#include "vp9/common/reconintra4x4.h"
+#include "vp9/common/findnearmv.h"
+#include "vp9/common/quant_common.h"
+#include "encodemb.h"
+#include "quantize.h"
+#include "vp9/common/idct.h"
+#include "variance.h"
+#include "mcomp.h"
+#include "rdopt.h"
+#include "ratectrl.h"
+#include "vpx_mem/vpx_mem.h"
+#include "vp9/common/systemdependent.h"
+#include "vp9/encoder/encodemv.h"
+
+#include "vp9/common/seg_common.h"
+#include "vp9/common/pred_common.h"
+#include "vp9/common/entropy.h"
+#include "vpx_rtcd.h"
+#if CONFIG_NEWBESTREFMV
+#include "vp9/common/mvref_common.h"
+#endif
+
+#if CONFIG_RUNTIME_CPU_DETECT
+#define IF_RTCD(x) (x)
+#else
+#define IF_RTCD(x) NULL
+#endif
+
+extern void vp9_mb_init_quantizer(VP9_COMP *cpi, MACROBLOCK *x);
+extern void vp9_update_zbin_extra(VP9_COMP *cpi, MACROBLOCK *x);
+
+#define MAXF(a,b) (((a) > (b)) ? (a) : (b))
+
+#define INVALID_MV 0x80008000
+
+/* Factor to weigh the rate for switchable interp filters */
+#define SWITCHABLE_INTERP_RATE_FACTOR 1
+
+static const int auto_speed_thresh[17] = {
+ 1000,
+ 200,
+ 150,
+ 130,
+ 150,
+ 125,
+ 120,
+ 115,
+ 115,
+ 115,
+ 115,
+ 115,
+ 115,
+ 115,
+ 115,
+ 115,
+ 105
+};
+
+#if CONFIG_PRED_FILTER
+const MODE_DEFINITION vp9_mode_order[MAX_MODES] = {
+ {ZEROMV, LAST_FRAME, 0, 0},
+ {ZEROMV, LAST_FRAME, 0, 1},
+ {DC_PRED, INTRA_FRAME, 0, 0},
+
+ {NEARESTMV, LAST_FRAME, 0, 0},
+ {NEARESTMV, LAST_FRAME, 0, 1},
+ {NEARMV, LAST_FRAME, 0, 0},
+ {NEARMV, LAST_FRAME, 0, 1},
+
+ {ZEROMV, GOLDEN_FRAME, 0, 0},
+ {ZEROMV, GOLDEN_FRAME, 0, 1},
+ {NEARESTMV, GOLDEN_FRAME, 0, 0},
+ {NEARESTMV, GOLDEN_FRAME, 0, 1},
+
+ {ZEROMV, ALTREF_FRAME, 0, 0},
+ {ZEROMV, ALTREF_FRAME, 0, 1},
+ {NEARESTMV, ALTREF_FRAME, 0, 0},
+ {NEARESTMV, ALTREF_FRAME, 0, 1},
+
+ {NEARMV, GOLDEN_FRAME, 0, 0},
+ {NEARMV, GOLDEN_FRAME, 0, 1},
+ {NEARMV, ALTREF_FRAME, 0, 0},
+ {NEARMV, ALTREF_FRAME, 0, 1},
+
+ {V_PRED, INTRA_FRAME, 0, 0},
+ {H_PRED, INTRA_FRAME, 0, 0},
+ {D45_PRED, INTRA_FRAME, 0, 0},
+ {D135_PRED, INTRA_FRAME, 0, 0},
+ {D117_PRED, INTRA_FRAME, 0, 0},
+ {D153_PRED, INTRA_FRAME, 0, 0},
+ {D27_PRED, INTRA_FRAME, 0, 0},
+ {D63_PRED, INTRA_FRAME, 0, 0},
+
+ {TM_PRED, INTRA_FRAME, 0, 0},
+
+ {NEWMV, LAST_FRAME, 0, 0},
+ {NEWMV, LAST_FRAME, 0, 1},
+ {NEWMV, GOLDEN_FRAME, 0, 0},
+ {NEWMV, GOLDEN_FRAME, 0, 1},
+ {NEWMV, ALTREF_FRAME, 0, 0},
+ {NEWMV, ALTREF_FRAME, 0, 1},
+
+ {SPLITMV, LAST_FRAME, 0, 0},
+ {SPLITMV, GOLDEN_FRAME, 0, 0},
+ {SPLITMV, ALTREF_FRAME, 0, 0},
+
+ {B_PRED, INTRA_FRAME, 0, 0},
+ {I8X8_PRED, INTRA_FRAME, 0, 0},
+
+ /* compound prediction modes */
+ {ZEROMV, LAST_FRAME, GOLDEN_FRAME, 0},
+ {NEARESTMV, LAST_FRAME, GOLDEN_FRAME, 0},
+ {NEARMV, LAST_FRAME, GOLDEN_FRAME, 0},
+
+ {ZEROMV, ALTREF_FRAME, LAST_FRAME, 0},
+ {NEARESTMV, ALTREF_FRAME, LAST_FRAME, 0},
+ {NEARMV, ALTREF_FRAME, LAST_FRAME, 0},
+
+ {ZEROMV, GOLDEN_FRAME, ALTREF_FRAME, 0},
+ {NEARESTMV, GOLDEN_FRAME, ALTREF_FRAME, 0},
+ {NEARMV, GOLDEN_FRAME, ALTREF_FRAME, 0},
+
+ {NEWMV, LAST_FRAME, GOLDEN_FRAME, 0},
+ {NEWMV, ALTREF_FRAME, LAST_FRAME, 0},
+ {NEWMV, GOLDEN_FRAME, ALTREF_FRAME, 0},
+
+ {SPLITMV, LAST_FRAME, GOLDEN_FRAME, 0},
+ {SPLITMV, ALTREF_FRAME, LAST_FRAME, 0},
+ {SPLITMV, GOLDEN_FRAME, ALTREF_FRAME, 0}
+};
+#else
+const MODE_DEFINITION vp9_mode_order[MAX_MODES] = {
+ {ZEROMV, LAST_FRAME, 0},
+ {DC_PRED, INTRA_FRAME, 0},
+
+ {NEARESTMV, LAST_FRAME, 0},
+ {NEARMV, LAST_FRAME, 0},
+
+ {ZEROMV, GOLDEN_FRAME, 0},
+ {NEARESTMV, GOLDEN_FRAME, 0},
+
+ {ZEROMV, ALTREF_FRAME, 0},
+ {NEARESTMV, ALTREF_FRAME, 0},
+
+ {NEARMV, GOLDEN_FRAME, 0},
+ {NEARMV, ALTREF_FRAME, 0},
+
+ {V_PRED, INTRA_FRAME, 0},
+ {H_PRED, INTRA_FRAME, 0},
+ {D45_PRED, INTRA_FRAME, 0},
+ {D135_PRED, INTRA_FRAME, 0},
+ {D117_PRED, INTRA_FRAME, 0},
+ {D153_PRED, INTRA_FRAME, 0},
+ {D27_PRED, INTRA_FRAME, 0},
+ {D63_PRED, INTRA_FRAME, 0},
+
+ {TM_PRED, INTRA_FRAME, 0},
+
+ {NEWMV, LAST_FRAME, 0},
+ {NEWMV, GOLDEN_FRAME, 0},
+ {NEWMV, ALTREF_FRAME, 0},
+
+ {SPLITMV, LAST_FRAME, 0},
+ {SPLITMV, GOLDEN_FRAME, 0},
+ {SPLITMV, ALTREF_FRAME, 0},
+
+ {B_PRED, INTRA_FRAME, 0},
+ {I8X8_PRED, INTRA_FRAME, 0},
+
+ /* compound prediction modes */
+ {ZEROMV, LAST_FRAME, GOLDEN_FRAME},
+ {NEARESTMV, LAST_FRAME, GOLDEN_FRAME},
+ {NEARMV, LAST_FRAME, GOLDEN_FRAME},
+
+ {ZEROMV, ALTREF_FRAME, LAST_FRAME},
+ {NEARESTMV, ALTREF_FRAME, LAST_FRAME},
+ {NEARMV, ALTREF_FRAME, LAST_FRAME},
+
+ {ZEROMV, GOLDEN_FRAME, ALTREF_FRAME},
+ {NEARESTMV, GOLDEN_FRAME, ALTREF_FRAME},
+ {NEARMV, GOLDEN_FRAME, ALTREF_FRAME},
+
+ {NEWMV, LAST_FRAME, GOLDEN_FRAME},
+ {NEWMV, ALTREF_FRAME, LAST_FRAME },
+ {NEWMV, GOLDEN_FRAME, ALTREF_FRAME},
+
+ {SPLITMV, LAST_FRAME, GOLDEN_FRAME},
+ {SPLITMV, ALTREF_FRAME, LAST_FRAME },
+ {SPLITMV, GOLDEN_FRAME, ALTREF_FRAME}
+};
+#endif
+
+static void fill_token_costs(
+ unsigned int (*c)[COEF_BANDS][PREV_COEF_CONTEXTS][MAX_ENTROPY_TOKENS],
+ const vp9_prob(*p)[COEF_BANDS][PREV_COEF_CONTEXTS][ENTROPY_NODES],
+ int block_type_counts) {
+ int i, j, k;
+
+ for (i = 0; i < block_type_counts; i++)
+ for (j = 0; j < COEF_BANDS; j++)
+ for (k = 0; k < PREV_COEF_CONTEXTS; k++) {
+ if (k == 0 && ((j > 0 && i > 0) || (j > 1 && i == 0)))
+ vp9_cost_tokens_skip((int *)(c[i][j][k]),
+ p[i][j][k],
+ vp9_coef_tree);
+ else
+ vp9_cost_tokens((int *)(c[i][j][k]),
+ p[i][j][k],
+ vp9_coef_tree);
+ }
+}
+
+
+static int rd_iifactor[32] = { 4, 4, 3, 2, 1, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, };
+
+// 3* dc_qlookup[Q]*dc_qlookup[Q];
+
+/* values are now correlated to quantizer */
+static int sad_per_bit16lut[QINDEX_RANGE];
+static int sad_per_bit4lut[QINDEX_RANGE];
+
+void vp9_init_me_luts() {
+ int i;
+
+ // Initialize the sad lut tables using a formulaic calculation for now
+ // This is to make it easier to resolve the impact of experimental changes
+ // to the quantizer tables.
+ for (i = 0; i < QINDEX_RANGE; i++) {
+ sad_per_bit16lut[i] =
+ (int)((0.0418 * vp9_convert_qindex_to_q(i)) + 2.4107);
+ sad_per_bit4lut[i] = (int)((0.063 * vp9_convert_qindex_to_q(i)) + 2.742);
+ }
+}
+
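+// The RD multiplier (lambda) grows with the square of the DC dequantization
+// step: 11 * q^2 / 64. This keeps the weighted rate term commensurate with
+// the squared-error distortion as Q rises.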
+static int compute_rd_mult(int qindex) {
+ int q;
+
+ q = vp9_dc_quant(qindex, 0);
+ return (11 * q * q) >> 6;
+}
+
+void vp9_initialize_me_consts(VP9_COMP *cpi, int QIndex) {
+ cpi->mb.sadperbit16 = sad_per_bit16lut[QIndex];
+ cpi->mb.sadperbit4 = sad_per_bit4lut[QIndex];
+}
+
+
+void vp9_initialize_rd_consts(VP9_COMP *cpi, int QIndex) {
+ int q, i;
+
+ vp9_clear_system_state(); // __asm emms;
+
+ // Further tests required to see if optimum is different
+ // for key frames, golden frames and arf frames.
+ // if (cpi->common.refresh_golden_frame ||
+ // cpi->common.refresh_alt_ref_frame)
+ QIndex = (QIndex < 0) ? 0 : ((QIndex > MAXQ) ? MAXQ : QIndex);
+
+ cpi->RDMULT = compute_rd_mult(QIndex);
+
+ // Extend the rate multiplier alongside quantizer zbin increases
+ if (cpi->zbin_over_quant > 0) {
+ double oq_factor;
+
+ // Experimental code using the same basic equation as used for Q above
+ // The units of cpi->zbin_over_quant are 1/128 of Q bin size
+ oq_factor = 1.0 + ((double)0.0015625 * cpi->zbin_over_quant);
+ cpi->RDMULT = (int)((double)cpi->RDMULT * oq_factor * oq_factor);
+ }
+
+ if (cpi->pass == 2 && (cpi->common.frame_type != KEY_FRAME)) {
+ if (cpi->twopass.next_iiratio > 31)
+ cpi->RDMULT += (cpi->RDMULT * rd_iifactor[31]) >> 4;
+ else
+ cpi->RDMULT +=
+ (cpi->RDMULT * rd_iifactor[cpi->twopass.next_iiratio]) >> 4;
+ }
+
+ if (cpi->RDMULT < 7)
+ cpi->RDMULT = 7;
+
+ cpi->mb.errorperbit = (cpi->RDMULT / 110);
+ cpi->mb.errorperbit += (cpi->mb.errorperbit == 0);
+
+ vp9_set_speed_features(cpi);
+
+ q = (int)pow(vp9_dc_quant(QIndex, 0) >> 2, 1.25);
+ q = q << 2;
+ cpi->RDMULT = cpi->RDMULT << 4;
+
+ if (q < 8)
+ q = 8;
+
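+ // If RDMULT has become large, divide both RDMULT and RDDIV by 100 (RDDIV
+ // drops from 100 to 1) so their ratio is preserved while intermediate
+ // products stay within integer range; the per-mode thresholds below are
+ // scaled by q under the matching convention (q / 100 vs. q).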
+ if (cpi->RDMULT > 1000) {
+ cpi->RDDIV = 1;
+ cpi->RDMULT /= 100;
+
+ for (i = 0; i < MAX_MODES; i++) {
+ if (cpi->sf.thresh_mult[i] < INT_MAX) {
+ cpi->rd_threshes[i] = cpi->sf.thresh_mult[i] * q / 100;
+ } else {
+ cpi->rd_threshes[i] = INT_MAX;
+ }
+
+ cpi->rd_baseline_thresh[i] = cpi->rd_threshes[i];
+ }
+ } else {
+ cpi->RDDIV = 100;
+
+ for (i = 0; i < MAX_MODES; i++) {
+ if (cpi->sf.thresh_mult[i] < (INT_MAX / q)) {
+ cpi->rd_threshes[i] = cpi->sf.thresh_mult[i] * q;
+ } else {
+ cpi->rd_threshes[i] = INT_MAX;
+ }
+
+ cpi->rd_baseline_thresh[i] = cpi->rd_threshes[i];
+ }
+ }
+
+ fill_token_costs(
+ cpi->mb.token_costs[TX_4X4],
+ (const vp9_prob( *)[8][PREV_COEF_CONTEXTS][11]) cpi->common.fc.coef_probs,
+ BLOCK_TYPES);
+ fill_token_costs(
+ cpi->mb.hybrid_token_costs[TX_4X4],
+ (const vp9_prob( *)[8][PREV_COEF_CONTEXTS][11])
+ cpi->common.fc.hybrid_coef_probs,
+ BLOCK_TYPES);
+
+ fill_token_costs(
+ cpi->mb.token_costs[TX_8X8],
+ (const vp9_prob( *)[8][PREV_COEF_CONTEXTS][11]) cpi->common.fc.coef_probs_8x8,
+ BLOCK_TYPES_8X8);
+ fill_token_costs(
+ cpi->mb.hybrid_token_costs[TX_8X8],
+ (const vp9_prob( *)[8][PREV_COEF_CONTEXTS][11])
+ cpi->common.fc.hybrid_coef_probs_8x8,
+ BLOCK_TYPES_8X8);
+
+ fill_token_costs(
+ cpi->mb.token_costs[TX_16X16],
+ (const vp9_prob(*)[8][PREV_COEF_CONTEXTS][11]) cpi->common.fc.coef_probs_16x16,
+ BLOCK_TYPES_16X16);
+ fill_token_costs(
+ cpi->mb.hybrid_token_costs[TX_16X16],
+ (const vp9_prob(*)[8][PREV_COEF_CONTEXTS][11])
+ cpi->common.fc.hybrid_coef_probs_16x16,
+ BLOCK_TYPES_16X16);
+
+ /*rough estimate for costing*/
+ cpi->common.kf_ymode_probs_index = cpi->common.base_qindex >> 4;
+ vp9_init_mode_costs(cpi);
+
+ if (cpi->common.frame_type != KEY_FRAME)
+ {
+ vp9_build_nmv_cost_table(
+ cpi->mb.nmvjointcost,
+ cpi->mb.e_mbd.allow_high_precision_mv ?
+ cpi->mb.nmvcost_hp : cpi->mb.nmvcost,
+ &cpi->common.fc.nmvc,
+ cpi->mb.e_mbd.allow_high_precision_mv, 1, 1);
+ }
+}
+
+void vp9_auto_select_speed(VP9_COMP *cpi) {
+ int milliseconds_for_compress = (int)(1000000 / cpi->oxcf.frame_rate);
+
+ milliseconds_for_compress = milliseconds_for_compress * (16 - cpi->oxcf.cpu_used) / 16;
+
+ /*
+ // this is done during parameter valid check
+ if( cpi->oxcf.cpu_used > 16)
+ cpi->oxcf.cpu_used = 16;
+ if( cpi->oxcf.cpu_used < -16)
+ cpi->oxcf.cpu_used = -16;
+ */
+
+ if (cpi->avg_pick_mode_time < milliseconds_for_compress &&
+ (cpi->avg_encode_time - cpi->avg_pick_mode_time) <
+ milliseconds_for_compress) {
+ if (cpi->avg_pick_mode_time == 0) {
+ cpi->Speed = 4;
+ } else {
+ if (milliseconds_for_compress * 100 < cpi->avg_encode_time * 95) {
+ cpi->Speed += 2;
+ cpi->avg_pick_mode_time = 0;
+ cpi->avg_encode_time = 0;
+
+ if (cpi->Speed > 16) {
+ cpi->Speed = 16;
+ }
+ }
+
+ if (milliseconds_for_compress * 100 >
+ cpi->avg_encode_time * auto_speed_thresh[cpi->Speed]) {
+ cpi->Speed -= 1;
+ cpi->avg_pick_mode_time = 0;
+ cpi->avg_encode_time = 0;
+
+ // In real-time mode, cpi->speed is in [4, 16].
+ if (cpi->Speed < 4) { // if ( cpi->Speed < 0 )
+ cpi->Speed = 4; // cpi->Speed = 0;
+ }
+ }
+ }
+ } else {
+ cpi->Speed += 4;
+
+ if (cpi->Speed > 16)
+ cpi->Speed = 16;
+
+
+ cpi->avg_pick_mode_time = 0;
+ cpi->avg_encode_time = 0;
+ }
+}
+
+int vp9_block_error_c(short *coeff, short *dqcoeff, int block_size) {
+ int i, error = 0;
+
+ for (i = 0; i < block_size; i++) {
+ int this_diff = coeff[i] - dqcoeff[i];
+ error += this_diff * this_diff;
+ }
+
+ return error;
+}
+
+int vp9_mbblock_error_c(MACROBLOCK *mb, int dc) {
+ BLOCK *be;
+ BLOCKD *bd;
+ int i, j;
+ int berror, error = 0;
+
+ for (i = 0; i < 16; i++) {
+ be = &mb->block[i];
+ bd = &mb->e_mbd.block[i];
+
+ berror = 0;
+
+ for (j = dc; j < 16; j++) {
+ int this_diff = be->coeff[j] - bd->dqcoeff[j];
+ berror += this_diff * this_diff;
+ }
+
+ error += berror;
+ }
+
+ return error;
+}
+
+int vp9_mbuverror_c(MACROBLOCK *mb) {
+ BLOCK *be;
+ BLOCKD *bd;
+
+ int i, error = 0;
+
+ for (i = 16; i < 24; i++) {
+ be = &mb->block[i];
+ bd = &mb->e_mbd.block[i];
+
+ error += vp9_block_error_c(be->coeff, bd->dqcoeff, 16);
+ }
+
+ return error;
+}
+
+int vp9_uvsse(MACROBLOCK *x) {
+ unsigned char *uptr, *vptr;
+ unsigned char *upred_ptr = (*(x->block[16].base_src) + x->block[16].src);
+ unsigned char *vpred_ptr = (*(x->block[20].base_src) + x->block[20].src);
+ int uv_stride = x->block[16].src_stride;
+
+ unsigned int sse1 = 0;
+ unsigned int sse2 = 0;
+ int mv_row = x->e_mbd.mode_info_context->mbmi.mv[0].as_mv.row;
+ int mv_col = x->e_mbd.mode_info_context->mbmi.mv[0].as_mv.col;
+ int offset;
+ int pre_stride = x->e_mbd.block[16].pre_stride;
+
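+ // Scale the luma motion vector (1/8-pel units) down to chroma resolution,
+ // rounding half away from zero. The >> 3 below gives the whole-pixel offset
+ // into the chroma prediction buffer and the & 7 remainder selects the
+ // sub-pel filter phase.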
+ if (mv_row < 0)
+ mv_row -= 1;
+ else
+ mv_row += 1;
+
+ if (mv_col < 0)
+ mv_col -= 1;
+ else
+ mv_col += 1;
+
+ mv_row /= 2;
+ mv_col /= 2;
+
+ offset = (mv_row >> 3) * pre_stride + (mv_col >> 3);
+ uptr = x->e_mbd.pre.u_buffer + offset;
+ vptr = x->e_mbd.pre.v_buffer + offset;
+
+ if ((mv_row | mv_col) & 7) {
+ vp9_sub_pixel_variance8x8(uptr, pre_stride, (mv_col & 7) << 1,
+ (mv_row & 7) << 1, upred_ptr, uv_stride, &sse2);
+ vp9_sub_pixel_variance8x8(vptr, pre_stride, (mv_col & 7) << 1,
+ (mv_row & 7) << 1, vpred_ptr, uv_stride, &sse1);
+ sse2 += sse1;
+ } else {
+ vp9_variance8x8(uptr, pre_stride, upred_ptr, uv_stride, &sse2);
+ vp9_variance8x8(vptr, pre_stride, vpred_ptr, uv_stride, &sse1);
+ sse2 += sse1;
+ }
+ return sse2;
+
+}
+
+static int cost_coeffs_2x2(MACROBLOCK *mb,
+ BLOCKD *b, PLANE_TYPE type,
+ ENTROPY_CONTEXT *a, ENTROPY_CONTEXT *l) {
+ int c = (type == PLANE_TYPE_Y_NO_DC); /* start at coef 0, unless Y with Y2 */
+ int eob = b->eob;
+ int pt; /* surrounding block/prev coef predictor */
+ int cost = 0;
+ short *qcoeff_ptr = b->qcoeff;
+
+ VP9_COMBINEENTROPYCONTEXTS(pt, *a, *l);
+ assert(eob <= 4);
+
+ for (; c < eob; c++) {
+ int v = qcoeff_ptr[vp9_default_zig_zag1d[c]];
+ int t = vp9_dct_value_tokens_ptr[v].Token;
+ cost += mb->token_costs[TX_8X8][type][vp9_coef_bands[c]][pt][t];
+ cost += vp9_dct_value_cost_ptr[v];
+ pt = vp9_prev_token_class[t];
+ }
+
+ if (c < 4)
+ cost += mb->token_costs[TX_8X8][type][vp9_coef_bands[c]]
+ [pt] [DCT_EOB_TOKEN];
+
+ // Context for the next block: 1 if this block has any coded coefficients.
+ pt = (c != !type);
+ *a = *l = pt;
+ return cost;
+}
+
+static int cost_coeffs(MACROBLOCK *mb, BLOCKD *b, PLANE_TYPE type,
+ ENTROPY_CONTEXT *a, ENTROPY_CONTEXT *l,
+ int tx_size) {
+ const int eob = b->eob;
+ int c = (type == PLANE_TYPE_Y_NO_DC); /* start at coef 0, unless Y with Y2 */
+ int cost = 0, default_eob, seg_eob;
+ int pt; /* surrounding block/prev coef predictor */
+ int const *scan, *band;
+ short *qcoeff_ptr = b->qcoeff;
+ MACROBLOCKD *xd = &mb->e_mbd;
+ MB_MODE_INFO *mbmi = &mb->e_mbd.mode_info_context->mbmi;
+ TX_TYPE tx_type = DCT_DCT;
+ int segment_id = mbmi->segment_id;
+
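+ // Select the scan order, band mapping and default eob for this transform
+ // size; 4x4 luma blocks whose mode implies an ADST in one dimension use the
+ // row or column scan in place of the zigzag.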
+ switch (tx_size) {
+ case TX_4X4:
+ scan = vp9_default_zig_zag1d;
+ band = vp9_coef_bands;
+ default_eob = 16;
+ if (type == PLANE_TYPE_Y_WITH_DC) {
+ tx_type = get_tx_type_4x4(xd, b);
+ if (tx_type != DCT_DCT) {
+ switch (tx_type) {
+ case ADST_DCT:
+ scan = vp9_row_scan;
+ break;
+
+ case DCT_ADST:
+ scan = vp9_col_scan;
+ break;
+
+ default:
+ scan = vp9_default_zig_zag1d;
+ break;
+ }
+ }
+ }
+
+ break;
+ case TX_8X8:
+ scan = vp9_default_zig_zag1d_8x8;
+ band = vp9_coef_bands_8x8;
+ default_eob = 64;
+ if (type == PLANE_TYPE_Y_WITH_DC) {
+ BLOCKD *bb;
+ int ib = (b - xd->block);
+ if (ib < 16) {
+ ib = (ib & 8) + ((ib & 4) >> 1);
+ bb = xd->block + ib;
+ tx_type = get_tx_type_8x8(xd, bb);
+ }
+ }
+ break;
+ case TX_16X16:
+ scan = vp9_default_zig_zag1d_16x16;
+ band = vp9_coef_bands_16x16;
+ default_eob = 256;
+ if (type == PLANE_TYPE_Y_WITH_DC) {
+ tx_type = get_tx_type_16x16(xd, b);
+ }
+ break;
+ default:
+ break;
+ }
+ if (vp9_segfeature_active(&mb->e_mbd, segment_id, SEG_LVL_EOB))
+ seg_eob = vp9_get_segdata(&mb->e_mbd, segment_id, SEG_LVL_EOB);
+ else
+ seg_eob = default_eob;
+
+ VP9_COMBINEENTROPYCONTEXTS(pt, *a, *l);
+
+ if (tx_type != DCT_DCT) {
+ for (; c < eob; c++) {
+ int v = qcoeff_ptr[scan[c]];
+ int t = vp9_dct_value_tokens_ptr[v].Token;
+ cost += mb->hybrid_token_costs[tx_size][type][band[c]][pt][t];
+ cost += vp9_dct_value_cost_ptr[v];
+ pt = vp9_prev_token_class[t];
+ }
+ if (c < seg_eob)
+ cost += mb->hybrid_token_costs[tx_size][type][band[c]]
+ [pt][DCT_EOB_TOKEN];
+ } else {
+ for (; c < eob; c++) {
+ int v = qcoeff_ptr[scan[c]];
+ int t = vp9_dct_value_tokens_ptr[v].Token;
+ cost += mb->token_costs[tx_size][type][band[c]][pt][t];
+ cost += vp9_dct_value_cost_ptr[v];
+ pt = vp9_prev_token_class[t];
+ }
+ if (c < seg_eob)
+ cost += mb->token_costs[tx_size][type][band[c]]
+ [pt][DCT_EOB_TOKEN];
+ }
+
+ // Context for the next block: 1 if this block has any coded coefficients.
+ pt = (c != !type);
+ *a = *l = pt;
+ return cost;
+}
+
+static int rdcost_mby_4x4(MACROBLOCK *mb) {
+ int cost = 0;
+ int b;
+ MACROBLOCKD *xd = &mb->e_mbd;
+ ENTROPY_CONTEXT_PLANES t_above, t_left;
+ ENTROPY_CONTEXT *ta;
+ ENTROPY_CONTEXT *tl;
+
+ vpx_memcpy(&t_above, xd->above_context, sizeof(ENTROPY_CONTEXT_PLANES));
+ vpx_memcpy(&t_left, xd->left_context, sizeof(ENTROPY_CONTEXT_PLANES));
+
+ ta = (ENTROPY_CONTEXT *)&t_above;
+ tl = (ENTROPY_CONTEXT *)&t_left;
+
+ for (b = 0; b < 16; b++)
+ cost += cost_coeffs(mb, xd->block + b, PLANE_TYPE_Y_NO_DC,
+ ta + vp9_block2above[b], tl + vp9_block2left[b],
+ TX_4X4);
+
+ cost += cost_coeffs(mb, xd->block + 24, PLANE_TYPE_Y2,
+ ta + vp9_block2above[24], tl + vp9_block2left[24],
+ TX_4X4);
+
+ return cost;
+}
+
+static void macro_block_yrd_4x4(MACROBLOCK *mb,
+ int *Rate,
+ int *Distortion,
+ const VP9_ENCODER_RTCD *rtcd,
+ int *skippable) {
+ int b;
+ MACROBLOCKD *const xd = &mb->e_mbd;
+ BLOCK *const mb_y2 = mb->block + 24;
+ BLOCKD *const x_y2 = xd->block + 24;
+ short *Y2DCPtr = mb_y2->src_diff;
+ BLOCK *beptr;
+ int d;
+
+ vp9_subtract_mby(mb->src_diff, *(mb->block[0].base_src), xd->predictor,
+ mb->block[0].src_stride);
+
+ // Fdct and building the 2nd order block
+ for (beptr = mb->block; beptr < mb->block + 16; beptr += 2) {
+ mb->vp9_short_fdct8x4(beptr->src_diff, beptr->coeff, 32);
+ *Y2DCPtr++ = beptr->coeff[0];
+ *Y2DCPtr++ = beptr->coeff[16];
+ }
+
+ // 2nd order fdct
+ mb->short_walsh4x4(mb_y2->src_diff, mb_y2->coeff, 8);
+
+ // Quantization
+ for (b = 0; b < 16; b++) {
+ mb->quantize_b_4x4(&mb->block[b], &xd->block[b]);
+ }
+
+ // DC prediction and quantization of the 2nd order block
+ mb->quantize_b_4x4(mb_y2, x_y2);
+
+ // Distortion
+ d = vp9_mbblock_error(mb, 1);
+
+ d += vp9_block_error(mb_y2->coeff, x_y2->dqcoeff, 16);
+
+ *Distortion = (d >> 2);
+ // rate
+ *Rate = rdcost_mby_4x4(mb);
+ *skippable = vp9_mby_is_skippable_4x4(&mb->e_mbd, 1);
+}
+
+static int rdcost_mby_8x8(MACROBLOCK *mb, int backup) {
+ int cost = 0;
+ int b;
+ MACROBLOCKD *xd = &mb->e_mbd;
+ ENTROPY_CONTEXT_PLANES t_above, t_left;
+ ENTROPY_CONTEXT *ta;
+ ENTROPY_CONTEXT *tl;
+
+ if (backup) {
+ vpx_memcpy(&t_above,xd->above_context, sizeof(ENTROPY_CONTEXT_PLANES));
+ vpx_memcpy(&t_left, xd->left_context, sizeof(ENTROPY_CONTEXT_PLANES));
+
+ ta = (ENTROPY_CONTEXT *)&t_above;
+ tl = (ENTROPY_CONTEXT *)&t_left;
+ } else {
+ ta = (ENTROPY_CONTEXT *)mb->e_mbd.above_context;
+ tl = (ENTROPY_CONTEXT *)mb->e_mbd.left_context;
+ }
+
+ for (b = 0; b < 16; b += 4)
+ cost += cost_coeffs(mb, xd->block + b, PLANE_TYPE_Y_NO_DC,
+ ta + vp9_block2above_8x8[b], tl + vp9_block2left_8x8[b],
+ TX_8X8);
+
+ cost += cost_coeffs_2x2(mb, xd->block + 24, PLANE_TYPE_Y2,
+ ta + vp9_block2above[24], tl + vp9_block2left[24]);
+ return cost;
+}
+
+static void macro_block_yrd_8x8(MACROBLOCK *mb,
+ int *Rate,
+ int *Distortion,
+ const VP9_ENCODER_RTCD *rtcd,
+ int *skippable) {
+ MACROBLOCKD *const xd = &mb->e_mbd;
+ BLOCK *const mb_y2 = mb->block + 24;
+ BLOCKD *const x_y2 = xd->block + 24;
+ int d;
+
+ vp9_subtract_mby(mb->src_diff, *(mb->block[0].base_src), xd->predictor,
+ mb->block[0].src_stride);
+
+ vp9_transform_mby_8x8(mb);
+ vp9_quantize_mby_8x8(mb);
+
+ /* remove 1st order dc to properly combine 1st/2nd order distortion */
+ mb->coeff[0] = 0;
+ mb->coeff[64] = 0;
+ mb->coeff[128] = 0;
+ mb->coeff[192] = 0;
+ xd->dqcoeff[0] = 0;
+ xd->dqcoeff[64] = 0;
+ xd->dqcoeff[128] = 0;
+ xd->dqcoeff[192] = 0;
+
+ d = vp9_mbblock_error(mb, 0);
+ d += vp9_block_error(mb_y2->coeff, x_y2->dqcoeff, 16);
+
+ *Distortion = (d >> 2);
+ // rate
+ *Rate = rdcost_mby_8x8(mb, 1);
+ *skippable = vp9_mby_is_skippable_8x8(&mb->e_mbd, 1);
+}
+
+static int rdcost_mby_16x16(MACROBLOCK *mb) {
+ int cost;
+ MACROBLOCKD *xd = &mb->e_mbd;
+ ENTROPY_CONTEXT_PLANES t_above, t_left;
+ ENTROPY_CONTEXT *ta, *tl;
+
+ vpx_memcpy(&t_above, xd->above_context, sizeof(ENTROPY_CONTEXT_PLANES));
+ vpx_memcpy(&t_left, xd->left_context, sizeof(ENTROPY_CONTEXT_PLANES));
+
+ ta = (ENTROPY_CONTEXT *)&t_above;
+ tl = (ENTROPY_CONTEXT *)&t_left;
+
+ cost = cost_coeffs(mb, xd->block, PLANE_TYPE_Y_WITH_DC, ta, tl, TX_16X16);
+ return cost;
+}
+
+static void macro_block_yrd_16x16(MACROBLOCK *mb, int *Rate, int *Distortion,
+ const VP9_ENCODER_RTCD *rtcd, int *skippable) {
+ int d;
+ MACROBLOCKD *xd = &mb->e_mbd;
+ BLOCKD *b = &mb->e_mbd.block[0];
+ BLOCK *be = &mb->block[0];
+ TX_TYPE tx_type;
+
+ vp9_subtract_mby(mb->src_diff, *(mb->block[0].base_src), mb->e_mbd.predictor,
+ mb->block[0].src_stride);
+
+ tx_type = get_tx_type_16x16(xd, b);
+ if (tx_type != DCT_DCT) {
+ vp9_fht(be->src_diff, 32, be->coeff, tx_type, 16);
+ } else
+ vp9_transform_mby_16x16(mb);
+
+ vp9_quantize_mby_16x16(mb);
+ // TODO(jingning) is it possible to quickly determine whether to force
+ // trailing coefficients to be zero, instead of running trellis
+ // optimization in the rate-distortion optimization loop?
+ if (mb->e_mbd.mode_info_context->mbmi.mode < I8X8_PRED)
+ vp9_optimize_mby_16x16(mb, rtcd);
+
+ d = vp9_mbblock_error(mb, 0);
+
+ *Distortion = (d >> 2);
+ // rate
+ *Rate = rdcost_mby_16x16(mb);
+ *skippable = vp9_mby_is_skippable_16x16(&mb->e_mbd);
+}
+
+static void macro_block_yrd(VP9_COMP *cpi, MACROBLOCK *x, int *rate,
+ int *distortion, int *skippable,
+ int64_t txfm_cache[NB_TXFM_MODES]) {
+ VP9_COMMON *cm = &cpi->common;
+ MB_MODE_INFO *mbmi = &x->e_mbd.mode_info_context->mbmi;
+
+ MACROBLOCKD *xd = &x->e_mbd;
+ int can_skip = cm->mb_no_coeff_skip;
+ vp9_prob skip_prob = can_skip ? vp9_get_pred_prob(cm, xd, PRED_MBSKIP) : 128;
+ int s0, s1;
+ int r4x4, r4x4s, r8x8, r8x8s, d4x4, d8x8, s4x4, s8x8;
+ int64_t rd4x4, rd8x8, rd4x4s, rd8x8s;
+ int d16x16, r16x16, r16x16s, s16x16;
+ int64_t rd16x16, rd16x16s;
+
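+ // Each transform size is costed twice: once as if the frame signals a
+ // single fixed transform size (r4x4/r8x8/r16x16) and once for per-MB
+ // selection (the *s variants), where the rate also includes the tx-size
+ // bits coded against cm->prob_tx. When the block is fully skippable and
+ // mb_no_coeff_skip is allowed, the skip-flag cost s1 replaces the token
+ // rate; otherwise s0 is added on top of it.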
+ // FIXME don't do sub x3
+ if (skip_prob == 0)
+ skip_prob = 1;
+ s0 = vp9_cost_bit(skip_prob, 0);
+ s1 = vp9_cost_bit(skip_prob, 1);
+ macro_block_yrd_16x16(x, &r16x16, &d16x16, IF_RTCD(&cpi->rtcd), &s16x16);
+ if (can_skip) {
+ if (s16x16) {
+ rd16x16 = RDCOST(x->rdmult, x->rddiv, s1, d16x16);
+ } else {
+ rd16x16 = RDCOST(x->rdmult, x->rddiv, r16x16 + s0, d16x16);
+ }
+ } else {
+ rd16x16 = RDCOST(x->rdmult, x->rddiv, r16x16, d16x16);
+ }
+ r16x16s = r16x16 + vp9_cost_one(cm->prob_tx[0]) + vp9_cost_one(cm->prob_tx[1]);
+ if (can_skip) {
+ if (s16x16) {
+ rd16x16s = RDCOST(x->rdmult, x->rddiv, s1, d16x16);
+ } else {
+ rd16x16s = RDCOST(x->rdmult, x->rddiv, r16x16s + s0, d16x16);
+ }
+ } else {
+ rd16x16s = RDCOST(x->rdmult, x->rddiv, r16x16s, d16x16);
+ }
+ macro_block_yrd_8x8(x, &r8x8, &d8x8, IF_RTCD(&cpi->rtcd), &s8x8);
+ if (can_skip) {
+ if (s8x8) {
+ rd8x8 = RDCOST(x->rdmult, x->rddiv, s1, d8x8);
+ } else {
+ rd8x8 = RDCOST(x->rdmult, x->rddiv, r8x8 + s0, d8x8);
+ }
+ } else {
+ rd8x8 = RDCOST(x->rdmult, x->rddiv, r8x8, d8x8);
+ }
+ r8x8s = r8x8 + vp9_cost_one(cm->prob_tx[0]);
+ r8x8s += vp9_cost_zero(cm->prob_tx[1]);
+ if (can_skip) {
+ if (s8x8) {
+ rd8x8s = RDCOST(x->rdmult, x->rddiv, s1, d8x8);
+ } else {
+ rd8x8s = RDCOST(x->rdmult, x->rddiv, r8x8s + s0, d8x8);
+ }
+ } else {
+ rd8x8s = RDCOST(x->rdmult, x->rddiv, r8x8s, d8x8);
+ }
+ macro_block_yrd_4x4(x, &r4x4, &d4x4, IF_RTCD(&cpi->rtcd), &s4x4);
+ if (can_skip) {
+ if (s4x4) {
+ rd4x4 = RDCOST(x->rdmult, x->rddiv, s1, d4x4);
+ } else {
+ rd4x4 = RDCOST(x->rdmult, x->rddiv, r4x4 + s0, d4x4);
+ }
+ } else {
+ rd4x4 = RDCOST(x->rdmult, x->rddiv, r4x4, d4x4);
+ }
+ r4x4s = r4x4 + vp9_cost_zero(cm->prob_tx[0]);
+ if (can_skip) {
+ if (s4x4) {
+ rd4x4s = RDCOST(x->rdmult, x->rddiv, s1, d4x4);
+ } else {
+ rd4x4s = RDCOST(x->rdmult, x->rddiv, r4x4s + s0, d4x4);
+ }
+ } else {
+ rd4x4s = RDCOST(x->rdmult, x->rddiv, r4x4s, d4x4);
+ }
+
+ if ( cpi->common.txfm_mode == ALLOW_16X16 ||
+ (cpi->common.txfm_mode == TX_MODE_SELECT &&
+ rd16x16s < rd8x8s && rd16x16s < rd4x4s)) {
+ mbmi->txfm_size = TX_16X16;
+ *skippable = s16x16;
+ *distortion = d16x16;
+ *rate = (cpi->common.txfm_mode == ALLOW_16X16) ? r16x16 : r16x16s;
+ } else
+ if ( cpi->common.txfm_mode == ALLOW_8X8 ||
+ (cpi->common.txfm_mode == TX_MODE_SELECT && rd8x8s < rd4x4s)) {
+ mbmi->txfm_size = TX_8X8;
+ *skippable = s8x8;
+ *distortion = d8x8;
+ *rate = (cpi->common.txfm_mode == ALLOW_8X8) ? r8x8 : r8x8s;
+ } else {
+ assert(cpi->common.txfm_mode == ONLY_4X4 ||
+ (cpi->common.txfm_mode == TX_MODE_SELECT && rd4x4s <= rd8x8s));
+ mbmi->txfm_size = TX_4X4;
+ *skippable = s4x4;
+ *distortion = d4x4;
+ *rate = (cpi->common.txfm_mode == ONLY_4X4) ? r4x4 : r4x4s;
+ }
+
+ txfm_cache[ONLY_4X4] = rd4x4;
+ txfm_cache[ALLOW_8X8] = rd8x8;
+ txfm_cache[ALLOW_16X16] = rd16x16;
+ if (rd16x16s < rd8x8s && rd16x16s < rd4x4s)
+ txfm_cache[TX_MODE_SELECT] = rd16x16s;
+ else
+ txfm_cache[TX_MODE_SELECT] = rd4x4s < rd8x8s ? rd4x4s : rd8x8s;
+
+}
+
+static void copy_predictor(unsigned char *dst, const unsigned char *predictor) {
+ const unsigned int *p = (const unsigned int *)predictor;
+ unsigned int *d = (unsigned int *)dst;
+ d[0] = p[0];
+ d[4] = p[4];
+ d[8] = p[8];
+ d[12] = p[12];
+}
+
+#if CONFIG_SUPERBLOCKS
+static void super_block_yrd_8x8(MACROBLOCK *x,
+ int *rate,
+ int *distortion,
+ const VP9_ENCODER_RTCD *rtcd, int *skip)
+{
+ MACROBLOCKD *const xd = &x->e_mbd;
+ BLOCK *const by2 = x->block + 24;
+ BLOCKD *const bdy2 = xd->block + 24;
+ int d = 0, r = 0, n;
+ const uint8_t *src = x->src.y_buffer, *dst = xd->dst.y_buffer;
+ int src_y_stride = x->src.y_stride, dst_y_stride = xd->dst.y_stride;
+ ENTROPY_CONTEXT_PLANES *ta = xd->above_context;
+ ENTROPY_CONTEXT_PLANES *tl = xd->left_context;
+ ENTROPY_CONTEXT_PLANES t_above[2];
+ ENTROPY_CONTEXT_PLANES t_left[2];
+ int skippable = 1;
+
+ vpx_memcpy(t_above, xd->above_context, sizeof(t_above));
+ vpx_memcpy(t_left, xd->left_context, sizeof(t_left));
+
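+ // A 32x32 superblock is costed as four 16x16 quadrants: accumulate rate,
+ // distortion and skippability per quadrant, temporarily pointing the
+ // above/left entropy contexts at the quadrant being processed.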
+ for (n = 0; n < 4; n++) {
+ int x_idx = n & 1, y_idx = n >> 1;
+
+ vp9_subtract_mby_s_c(x->src_diff,
+ src + x_idx * 16 + y_idx * 16 * src_y_stride,
+ src_y_stride,
+ dst + x_idx * 16 + y_idx * 16 * dst_y_stride,
+ dst_y_stride);
+ vp9_transform_mby_8x8(x);
+ vp9_quantize_mby_8x8(x);
+
+ /* remove 1st order dc to properly combine 1st/2nd order distortion */
+ x->coeff[ 0] = 0;
+ x->coeff[ 64] = 0;
+ x->coeff[128] = 0;
+ x->coeff[192] = 0;
+ xd->dqcoeff[ 0] = 0;
+ xd->dqcoeff[ 64] = 0;
+ xd->dqcoeff[128] = 0;
+ xd->dqcoeff[192] = 0;
+
+ d += vp9_mbblock_error(x, 0);
+ d += vp9_block_error(by2->coeff, bdy2->dqcoeff, 16);
+ xd->above_context = ta + x_idx;
+ xd->left_context = tl + y_idx;
+ r += rdcost_mby_8x8(x, 0);
+ skippable = skippable && vp9_mby_is_skippable_8x8(xd, 1);
+ }
+
+ *distortion = (d >> 2);
+ *rate = r;
+ if (skip) *skip = skippable;
+ xd->above_context = ta;
+ xd->left_context = tl;
+ vpx_memcpy(xd->above_context, &t_above, sizeof(t_above));
+ vpx_memcpy(xd->left_context, &t_left, sizeof(t_left));
+}
+#endif
+
+static void copy_predictor_8x8(unsigned char *dst, const unsigned char *predictor) {
+ const unsigned int *p = (const unsigned int *)predictor;
+ unsigned int *d = (unsigned int *)dst;
+ d[0] = p[0];
+ d[1] = p[1];
+ d[4] = p[4];
+ d[5] = p[5];
+ d[8] = p[8];
+ d[9] = p[9];
+ d[12] = p[12];
+ d[13] = p[13];
+ d[16] = p[16];
+ d[17] = p[17];
+ d[20] = p[20];
+ d[21] = p[21];
+ d[24] = p[24];
+ d[25] = p[25];
+ d[28] = p[28];
+ d[29] = p[29];
+}
+
+static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, BLOCK *be,
+ BLOCKD *b, B_PREDICTION_MODE *best_mode,
+#if CONFIG_COMP_INTRA_PRED
+ B_PREDICTION_MODE *best_second_mode,
+ int allow_comp,
+#endif
+ int *bmode_costs,
+ ENTROPY_CONTEXT *a, ENTROPY_CONTEXT *l,
+ int *bestrate, int *bestratey,
+ int *bestdistortion) {
+ B_PREDICTION_MODE mode;
+ MACROBLOCKD *xd = &x->e_mbd;
+
+#if CONFIG_COMP_INTRA_PRED
+ B_PREDICTION_MODE mode2;
+#endif
+ int64_t best_rd = INT64_MAX;
+ int rate = 0;
+ int distortion;
+
+ ENTROPY_CONTEXT ta = *a, tempa = *a;
+ ENTROPY_CONTEXT tl = *l, templ = *l;
+ TX_TYPE tx_type = DCT_DCT;
+ TX_TYPE best_tx_type = DCT_DCT;
+ /*
+ * The predictor buffer is a 2d buffer with a stride of 16. Create
+ * a temp buffer that meets the stride requirements, but we are only
+ * interested in the left 4x4 block
+ * */
+ DECLARE_ALIGNED_ARRAY(16, unsigned char, best_predictor, 16 * 4);
+ DECLARE_ALIGNED_ARRAY(16, short, best_dqcoeff, 16);
+
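+ // For each candidate mode: predict, transform the residual (hybrid
+ // ADST/DCT where the mode implies it, plain DCT otherwise), quantize and
+ // cost the tokens, and keep the mode with the lowest RD cost. The winning
+ // predictor and dequantized coefficients are saved so the block can be
+ // reconstructed afterwards without repeating the search.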
+ for (mode = B_DC_PRED; mode <= B_HU_PRED; mode++) {
+#if CONFIG_COMP_INTRA_PRED
+ for (mode2 = (allow_comp ? 0 : (B_DC_PRED - 1));
+ mode2 != (allow_comp ? (mode + 1) : 0); mode2++) {
+#endif
+ int64_t this_rd;
+ int ratey;
+
+ b->bmi.as_mode.first = mode;
+ rate = bmode_costs[mode];
+
+#if CONFIG_COMP_INTRA_PRED
+ if (mode2 == (B_PREDICTION_MODE)(B_DC_PRED - 1)) {
+#endif
+ vp9_intra4x4_predict(b, mode, b->predictor);
+#if CONFIG_COMP_INTRA_PRED
+ } else {
+ vp9_comp_intra4x4_predict(b, mode, mode2, b->predictor);
+ rate += bmode_costs[mode2];
+ }
+#endif
+ vp9_subtract_b(be, b, 16);
+
+ b->bmi.as_mode.first = mode;
+ tx_type = get_tx_type_4x4(xd, b);
+ if (tx_type != DCT_DCT) {
+ vp9_fht(be->src_diff, 32, be->coeff, tx_type, 4);
+ vp9_ht_quantize_b_4x4(be, b, tx_type);
+ } else {
+ x->vp9_short_fdct4x4(be->src_diff, be->coeff, 32);
+ x->quantize_b_4x4(be, b);
+ }
+
+ tempa = ta;
+ templ = tl;
+
+ ratey = cost_coeffs(x, b, PLANE_TYPE_Y_WITH_DC, &tempa, &templ, TX_4X4);
+ rate += ratey;
+ distortion = vp9_block_error(be->coeff, b->dqcoeff, 16) >> 2;
+
+ this_rd = RDCOST(x->rdmult, x->rddiv, rate, distortion);
+
+ if (this_rd < best_rd) {
+ *bestrate = rate;
+ *bestratey = ratey;
+ *bestdistortion = distortion;
+ best_rd = this_rd;
+ *best_mode = mode;
+ best_tx_type = tx_type;
+
+#if CONFIG_COMP_INTRA_PRED
+ *best_second_mode = mode2;
+#endif
+ *a = tempa;
+ *l = templ;
+ copy_predictor(best_predictor, b->predictor);
+ vpx_memcpy(best_dqcoeff, b->dqcoeff, 32);
+ }
+#if CONFIG_COMP_INTRA_PRED
+ }
+#endif
+ }
+ b->bmi.as_mode.first = (B_PREDICTION_MODE)(*best_mode);
+#if CONFIG_COMP_INTRA_PRED
+ b->bmi.as_mode.second = (B_PREDICTION_MODE)(*best_second_mode);
+#endif
+
+ // inverse transform
+ if (best_tx_type != DCT_DCT)
+ vp9_ihtllm_c(best_dqcoeff, b->diff, 32, best_tx_type, 4);
+ else
+ IDCT_INVOKE(IF_RTCD(&cpi->rtcd.common->idct), idct16)(
+ best_dqcoeff, b->diff, 32);
+
+ vp9_recon_b(best_predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride);
+
+ return best_rd;
+}
+
+static int64_t rd_pick_intra4x4mby_modes(VP9_COMP *cpi, MACROBLOCK *mb, int *Rate,
+ int *rate_y, int *Distortion, int64_t best_rd,
+#if CONFIG_COMP_INTRA_PRED
+ int allow_comp,
+#endif
+ int update_contexts) {
+ int i;
+ MACROBLOCKD *const xd = &mb->e_mbd;
+ int cost = mb->mbmode_cost [xd->frame_type] [B_PRED];
+ int distortion = 0;
+ int tot_rate_y = 0;
+ int64_t total_rd = 0;
+ ENTROPY_CONTEXT_PLANES t_above, t_left;
+ ENTROPY_CONTEXT *ta, *tl;
+ int *bmode_costs;
+
+ if (update_contexts) {
+ ta = (ENTROPY_CONTEXT *)xd->above_context;
+ tl = (ENTROPY_CONTEXT *)xd->left_context;
+ } else {
+ vpx_memcpy(&t_above, xd->above_context,
+ sizeof(ENTROPY_CONTEXT_PLANES));
+ vpx_memcpy(&t_left, xd->left_context,
+ sizeof(ENTROPY_CONTEXT_PLANES));
+
+ ta = (ENTROPY_CONTEXT *)&t_above;
+ tl = (ENTROPY_CONTEXT *)&t_left;
+ }
+
+ xd->mode_info_context->mbmi.mode = B_PRED;
+ bmode_costs = mb->inter_bmode_costs;
+
+ for (i = 0; i < 16; i++) {
+ MODE_INFO *const mic = xd->mode_info_context;
+ const int mis = xd->mode_info_stride;
+ B_PREDICTION_MODE UNINITIALIZED_IS_SAFE(best_mode);
+#if CONFIG_COMP_INTRA_PRED
+ B_PREDICTION_MODE UNINITIALIZED_IS_SAFE(best_second_mode);
+#endif
+ int UNINITIALIZED_IS_SAFE(r), UNINITIALIZED_IS_SAFE(ry), UNINITIALIZED_IS_SAFE(d);
+
+ if (xd->frame_type == KEY_FRAME) {
+ const B_PREDICTION_MODE A = above_block_mode(mic, i, mis);
+ const B_PREDICTION_MODE L = left_block_mode(mic, i);
+
+ bmode_costs = mb->bmode_costs[A][L];
+ }
+
+ total_rd += rd_pick_intra4x4block(
+ cpi, mb, mb->block + i, xd->block + i, &best_mode,
+#if CONFIG_COMP_INTRA_PRED
+ & best_second_mode, allow_comp,
+#endif
+ bmode_costs, ta + vp9_block2above[i],
+ tl + vp9_block2left[i], &r, &ry, &d);
+
+ cost += r;
+ distortion += d;
+ tot_rate_y += ry;
+
+ mic->bmi[i].as_mode.first = best_mode;
+#if CONFIG_COMP_INTRA_PRED
+ mic->bmi[i].as_mode.second = best_second_mode;
+#endif
+
+ if (total_rd >= best_rd)
+ break;
+ }
+
+ if (total_rd >= best_rd)
+ return INT64_MAX;
+
+#if CONFIG_COMP_INTRA_PRED
+ cost += vp9_cost_bit(128, allow_comp);
+#endif
+ *Rate = cost;
+ *rate_y += tot_rate_y;
+ *Distortion = distortion;
+
+ return RDCOST(mb->rdmult, mb->rddiv, cost, distortion);
+}
+
+#if CONFIG_SUPERBLOCKS
+static int64_t rd_pick_intra_sby_mode(VP9_COMP *cpi,
+ MACROBLOCK *x,
+ int *rate,
+ int *rate_tokenonly,
+ int *distortion,
+ int *skippable) {
+ MB_PREDICTION_MODE mode;
+ MB_PREDICTION_MODE UNINITIALIZED_IS_SAFE(mode_selected);
+ int this_rate, this_rate_tokenonly;
+ int this_distortion, s;
+ int64_t best_rd = INT64_MAX, this_rd;
+
+ /* Y Search for 32x32 intra prediction mode */
+ for (mode = DC_PRED; mode <= TM_PRED; mode++) {
+ x->e_mbd.mode_info_context->mbmi.mode = mode;
+ vp9_build_intra_predictors_sby_s(&x->e_mbd);
+
+ super_block_yrd_8x8(x, &this_rate_tokenonly,
+ &this_distortion, IF_RTCD(&cpi->rtcd), &s);
+ this_rate = this_rate_tokenonly +
+ x->mbmode_cost[x->e_mbd.frame_type]
+ [x->e_mbd.mode_info_context->mbmi.mode];
+ this_rd = RDCOST(x->rdmult, x->rddiv, this_rate, this_distortion);
+
+ if (this_rd < best_rd) {
+ mode_selected = mode;
+ best_rd = this_rd;
+ *rate = this_rate;
+ *rate_tokenonly = this_rate_tokenonly;
+ *distortion = this_distortion;
+ *skippable = s;
+ }
+ }
+
+ x->e_mbd.mode_info_context->mbmi.mode = mode_selected;
+
+ return best_rd;
+}
+#endif
+
+static int64_t rd_pick_intra16x16mby_mode(VP9_COMP *cpi,
+ MACROBLOCK *x,
+ int *Rate,
+ int *rate_y,
+ int *Distortion,
+ int *skippable,
+ int64_t txfm_cache[NB_TXFM_MODES]) {
+ MB_PREDICTION_MODE mode;
+ TX_SIZE txfm_size;
+ MB_PREDICTION_MODE UNINITIALIZED_IS_SAFE(mode_selected);
+#if CONFIG_COMP_INTRA_PRED
+ MB_PREDICTION_MODE mode2;
+ MB_PREDICTION_MODE UNINITIALIZED_IS_SAFE(mode2_selected);
+#endif
+ MB_MODE_INFO * mbmi = &x->e_mbd.mode_info_context->mbmi;
+ int rate, ratey;
+ int distortion, skip;
+ int64_t best_rd = INT64_MAX;
+ int64_t this_rd;
+ MACROBLOCKD *xd = &x->e_mbd;
+
+ int i;
+ for (i = 0; i < NB_TXFM_MODES; i++)
+ txfm_cache[i] = INT64_MAX;
+
+ // Y Search for 16x16 intra prediction mode
+ for (mode = DC_PRED; mode <= TM_PRED; mode++) {
+ int64_t local_txfm_cache[NB_TXFM_MODES];
+
+ mbmi->mode = mode;
+
+#if CONFIG_COMP_INTRA_PRED
+ for (mode2 = DC_PRED - 1; mode2 != TM_PRED + 1; mode2++) {
+ mbmi->second_mode = mode2;
+ if (mode2 == (MB_PREDICTION_MODE)(DC_PRED - 1)) {
+#endif
+ vp9_build_intra_predictors_mby(&x->e_mbd);
+#if CONFIG_COMP_INTRA_PRED
+ } else {
+ continue; // i.e. disable for now
+ vp9_build_comp_intra_predictors_mby(&x->e_mbd);
+ }
+#endif
+
+ macro_block_yrd(cpi, x, &ratey, &distortion, &skip, local_txfm_cache);
+
+ // FIXME add compoundmode cost
+ // FIXME add rate for mode2
+ rate = ratey + x->mbmode_cost[x->e_mbd.frame_type][mbmi->mode];
+
+ this_rd = RDCOST(x->rdmult, x->rddiv, rate, distortion);
+
+
+ if (this_rd < best_rd) {
+ mode_selected = mode;
+ txfm_size = mbmi->txfm_size;
+#if CONFIG_COMP_INTRA_PRED
+ mode2_selected = mode2;
+#endif
+ best_rd = this_rd;
+ *Rate = rate;
+ *rate_y = ratey;
+ *Distortion = distortion;
+ *skippable = skip;
+ }
+
+ for (i = 0; i < NB_TXFM_MODES; i++) {
+ int64_t adj_rd = this_rd + local_txfm_cache[i] -
+ local_txfm_cache[cpi->common.txfm_mode];
+ if (adj_rd < txfm_cache[i]) {
+ txfm_cache[i] = adj_rd;
+ }
+ }
+
+#if CONFIG_COMP_INTRA_PRED
+ }
+#endif
+ }
+
+ mbmi->txfm_size = txfm_size;
+ mbmi->mode = mode_selected;
+
+#if CONFIG_COMP_INTRA_PRED
+ mbmi->second_mode = mode2_selected;
+#endif
+ return best_rd;
+}
+
+
+static int64_t rd_pick_intra8x8block(VP9_COMP *cpi, MACROBLOCK *x, int ib,
+ B_PREDICTION_MODE *best_mode,
+#if CONFIG_COMP_INTRA_PRED
+ B_PREDICTION_MODE *best_second_mode,
+#endif
+ int *mode_costs,
+ ENTROPY_CONTEXT *a, ENTROPY_CONTEXT *l,
+ int *bestrate, int *bestratey,
+ int *bestdistortion) {
+ MB_PREDICTION_MODE mode;
+#if CONFIG_COMP_INTRA_PRED
+ MB_PREDICTION_MODE mode2;
+#endif
+ MACROBLOCKD *xd = &x->e_mbd;
+ int64_t best_rd = INT64_MAX;
+ int distortion, rate = 0;
+ BLOCK *be = x->block + ib;
+ BLOCKD *b = xd->block + ib;
+ ENTROPY_CONTEXT ta0, ta1, besta0 = 0, besta1 = 0;
+ ENTROPY_CONTEXT tl0, tl1, bestl0 = 0, bestl1 = 0;
+
+ /*
+ * The predictor buffer is a 2d buffer with a stride of 16. Create
+ * a temp buffer that meets the stride requirements, but we are only
+ * interested in the left 8x8 block
+ * */
+ DECLARE_ALIGNED_ARRAY(16, unsigned char, best_predictor, 16 * 8);
+ DECLARE_ALIGNED_ARRAY(16, short, best_dqcoeff, 16 * 4);
+
+ // perform transformation of dimension 8x8
+ // note the input and output index mapping
+ int idx = (ib & 0x02) ? (ib + 2) : ib;
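+ // ib is the top-left 4x4 block of this 8x8 (0, 2, 8 or 10); the 8x8
+ // transform output occupies four consecutive 4x4 coefficient slots, so it
+ // is written starting at block 0, 4, 8 or 12 respectively.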
+
+ for (mode = DC_PRED; mode <= TM_PRED; mode++) {
+#if CONFIG_COMP_INTRA_PRED
+ for (mode2 = DC_PRED - 1; mode2 != TM_PRED + 1; mode2++) {
+#endif
+ int64_t this_rd;
+ int rate_t;
+
+ // FIXME rate for compound mode and second intrapred mode
+ rate = mode_costs[mode];
+ b->bmi.as_mode.first = mode;
+
+#if CONFIG_COMP_INTRA_PRED
+ if (mode2 == (MB_PREDICTION_MODE)(DC_PRED - 1)) {
+#endif
+ vp9_intra8x8_predict(b, mode, b->predictor);
+#if CONFIG_COMP_INTRA_PRED
+ } else {
+ continue; // i.e. disable for now
+ vp9_comp_intra8x8_predict(b, mode, mode2, b->predictor);
+ }
+#endif
+
+ vp9_subtract_4b_c(be, b, 16);
+
+ if (xd->mode_info_context->mbmi.txfm_size == TX_8X8) {
+ TX_TYPE tx_type = get_tx_type_8x8(xd, b);
+ if (tx_type != DCT_DCT)
+ vp9_fht(be->src_diff, 32, (x->block + idx)->coeff, tx_type, 8);
+ else
+ x->vp9_short_fdct8x8(be->src_diff, (x->block + idx)->coeff, 32);
+ x->quantize_b_8x8(x->block + idx, xd->block + idx);
+
+ // compute quantization mse of 8x8 block
+ distortion = vp9_block_error_c((x->block + idx)->coeff,
+ (xd->block + idx)->dqcoeff, 64);
+ ta0 = a[vp9_block2above_8x8[idx]];
+ tl0 = l[vp9_block2left_8x8[idx]];
+
+ rate_t = cost_coeffs(x, xd->block + idx, PLANE_TYPE_Y_WITH_DC,
+ &ta0, &tl0, TX_8X8);
+
+ rate += rate_t;
+ ta1 = ta0;
+ tl1 = tl0;
+ } else {
+ x->vp9_short_fdct8x4(be->src_diff, be->coeff, 32);
+ x->vp9_short_fdct8x4((be + 4)->src_diff, (be + 4)->coeff, 32);
+
+ x->quantize_b_4x4_pair(x->block + ib, x->block + ib + 1,
+ xd->block + ib, xd->block + ib + 1);
+ x->quantize_b_4x4_pair(x->block + ib + 4, x->block + ib + 5,
+ xd->block + ib + 4, xd->block + ib + 5);
+
+ distortion = vp9_block_error_c((x->block + ib)->coeff,
+ (xd->block + ib)->dqcoeff, 16);
+ distortion += vp9_block_error_c((x->block + ib + 1)->coeff,
+ (xd->block + ib + 1)->dqcoeff, 16);
+ distortion += vp9_block_error_c((x->block + ib + 4)->coeff,
+ (xd->block + ib + 4)->dqcoeff, 16);
+ distortion += vp9_block_error_c((x->block + ib + 5)->coeff,
+ (xd->block + ib + 5)->dqcoeff, 16);
+
+ ta0 = a[vp9_block2above[ib]];
+ ta1 = a[vp9_block2above[ib + 1]];
+ tl0 = l[vp9_block2left[ib]];
+ tl1 = l[vp9_block2left[ib + 4]];
+ rate_t = cost_coeffs(x, xd->block + ib, PLANE_TYPE_Y_WITH_DC,
+ &ta0, &tl0, TX_4X4);
+ rate_t += cost_coeffs(x, xd->block + ib + 1, PLANE_TYPE_Y_WITH_DC,
+ &ta1, &tl0, TX_4X4);
+ rate_t += cost_coeffs(x, xd->block + ib + 4, PLANE_TYPE_Y_WITH_DC,
+ &ta0, &tl1, TX_4X4);
+ rate_t += cost_coeffs(x, xd->block + ib + 5, PLANE_TYPE_Y_WITH_DC,
+ &ta1, &tl1, TX_4X4);
+ rate += rate_t;
+ }
+
+ distortion >>= 2;
+ this_rd = RDCOST(x->rdmult, x->rddiv, rate, distortion);
+ if (this_rd < best_rd) {
+ *bestrate = rate;
+ *bestratey = rate_t;
+ *bestdistortion = distortion;
+ besta0 = ta0;
+ besta1 = ta1;
+ bestl0 = tl0;
+ bestl1 = tl1;
+ best_rd = this_rd;
+ *best_mode = mode;
+#if CONFIG_COMP_INTRA_PRED
+ *best_second_mode = mode2;
+#endif
+ copy_predictor_8x8(best_predictor, b->predictor);
+ vpx_memcpy(best_dqcoeff, b->dqcoeff, 64);
+ vpx_memcpy(best_dqcoeff + 32, b->dqcoeff + 64, 64);
+#if CONFIG_COMP_INTRA_PRED
+ }
+#endif
+ }
+ }
+ b->bmi.as_mode.first = (*best_mode);
+#if CONFIG_COMP_INTRA_PRED
+ b->bmi.as_mode.second = (*best_second_mode);
+#endif
+ vp9_encode_intra8x8(IF_RTCD(&cpi->rtcd), x, ib);
+
+ if (xd->mode_info_context->mbmi.txfm_size == TX_8X8) {
+ a[vp9_block2above_8x8[idx]] = besta0;
+ a[vp9_block2above_8x8[idx] + 1] = besta1;
+ l[vp9_block2left_8x8[idx]] = bestl0;
+ l[vp9_block2left_8x8[idx] + 1] = bestl1;
+ } else {
+ a[vp9_block2above[ib]] = besta0;
+ a[vp9_block2above[ib + 1]] = besta1;
+ l[vp9_block2left[ib]] = bestl0;
+ l[vp9_block2left[ib + 4]] = bestl1;
+ }
+
+ return best_rd;
+}
+
+static int64_t rd_pick_intra8x8mby_modes(VP9_COMP *cpi, MACROBLOCK *mb,
+ int *Rate, int *rate_y,
+ int *Distortion, int64_t best_rd) {
+ MACROBLOCKD *const xd = &mb->e_mbd;
+ int i, ib;
+ int cost = mb->mbmode_cost [xd->frame_type] [I8X8_PRED];
+ int distortion = 0;
+ int tot_rate_y = 0;
+ long long total_rd = 0;
+ ENTROPY_CONTEXT_PLANES t_above, t_left;
+ ENTROPY_CONTEXT *ta, *tl;
+ int *i8x8mode_costs;
+
+ vpx_memcpy(&t_above, xd->above_context, sizeof(ENTROPY_CONTEXT_PLANES));
+ vpx_memcpy(&t_left, xd->left_context, sizeof(ENTROPY_CONTEXT_PLANES));
+
+ ta = (ENTROPY_CONTEXT *)&t_above;
+ tl = (ENTROPY_CONTEXT *)&t_left;
+
+ xd->mode_info_context->mbmi.mode = I8X8_PRED;
+ i8x8mode_costs = mb->i8x8_mode_costs;
+
+ for (i = 0; i < 4; i++) {
+ MODE_INFO *const mic = xd->mode_info_context;
+ B_PREDICTION_MODE UNINITIALIZED_IS_SAFE(best_mode);
+#if CONFIG_COMP_INTRA_PRED
+ B_PREDICTION_MODE UNINITIALIZED_IS_SAFE(best_second_mode);
+#endif
+ int UNINITIALIZED_IS_SAFE(r), UNINITIALIZED_IS_SAFE(ry), UNINITIALIZED_IS_SAFE(d);
+
+ ib = vp9_i8x8_block[i];
+ total_rd += rd_pick_intra8x8block(
+ cpi, mb, ib, &best_mode,
+#if CONFIG_COMP_INTRA_PRED
+ & best_second_mode,
+#endif
+ i8x8mode_costs, ta, tl, &r, &ry, &d);
+ cost += r;
+ distortion += d;
+ tot_rate_y += ry;
+ mic->bmi[ib].as_mode.first = best_mode;
+#if CONFIG_COMP_INTRA_PRED
+ mic->bmi[ib].as_mode.second = best_second_mode;
+#endif
+ }
+ *Rate = cost;
+ *rate_y += tot_rate_y;
+ *Distortion = distortion;
+ return RDCOST(mb->rdmult, mb->rddiv, cost, distortion);
+}
+
+static int rd_cost_mbuv(MACROBLOCK *mb) {
+ int b;
+ int cost = 0;
+ MACROBLOCKD *xd = &mb->e_mbd;
+ ENTROPY_CONTEXT_PLANES t_above, t_left;
+ ENTROPY_CONTEXT *ta, *tl;
+
+ vpx_memcpy(&t_above, xd->above_context, sizeof(ENTROPY_CONTEXT_PLANES));
+ vpx_memcpy(&t_left, xd->left_context, sizeof(ENTROPY_CONTEXT_PLANES));
+
+ ta = (ENTROPY_CONTEXT *)&t_above;
+ tl = (ENTROPY_CONTEXT *)&t_left;
+
+ for (b = 16; b < 24; b++)
+ cost += cost_coeffs(mb, xd->block + b, PLANE_TYPE_UV,
+ ta + vp9_block2above[b], tl + vp9_block2left[b],
+ TX_4X4);
+
+ return cost;
+}
+
+
+static int64_t rd_inter16x16_uv(VP9_COMP *cpi, MACROBLOCK *x, int *rate,
+ int *distortion, int fullpixel, int *skip) {
+ vp9_subtract_mbuv(x->src_diff, x->src.u_buffer, x->src.v_buffer,
+ x->e_mbd.predictor, x->src.uv_stride);
+
+ vp9_transform_mbuv_4x4(x);
+ vp9_quantize_mbuv_4x4(x);
+
+ *rate = rd_cost_mbuv(x);
+ *distortion = vp9_mbuverror(x) / 4;
+ *skip = vp9_mbuv_is_skippable_4x4(&x->e_mbd);
+
+ return RDCOST(x->rdmult, x->rddiv, *rate, *distortion);
+}
+
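+// Rate cost of the chroma coefficients with an 8x8 transform. When backup is
+// nonzero, costing is done against local copies of the entropy contexts so
+// that the real above/left contexts are left untouched.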
+static int rd_cost_mbuv_8x8(MACROBLOCK *mb, int backup) {
+ int b;
+ int cost = 0;
+ MACROBLOCKD *xd = &mb->e_mbd;
+ ENTROPY_CONTEXT_PLANES t_above, t_left;
+ ENTROPY_CONTEXT *ta, *tl;
+
+ if (backup) {
+ vpx_memcpy(&t_above, xd->above_context, sizeof(ENTROPY_CONTEXT_PLANES));
+ vpx_memcpy(&t_left, xd->left_context, sizeof(ENTROPY_CONTEXT_PLANES));
+
+ ta = (ENTROPY_CONTEXT *)&t_above;
+ tl = (ENTROPY_CONTEXT *)&t_left;
+ } else {
+ ta = (ENTROPY_CONTEXT *)mb->e_mbd.above_context;
+ tl = (ENTROPY_CONTEXT *)mb->e_mbd.left_context;
+ }
+
+ for (b = 16; b < 24; b += 4)
+ cost += cost_coeffs(mb, xd->block + b, PLANE_TYPE_UV,
+ ta + vp9_block2above_8x8[b],
+ tl + vp9_block2left_8x8[b], TX_8X8);
+
+ return cost;
+}
+
+#if CONFIG_SUPERBLOCKS
+static int64_t rd_inter32x32_uv_8x8(VP9_COMP *cpi, MACROBLOCK *x, int *rate,
+ int *distortion, int fullpixel, int *skip) {
+ MACROBLOCKD *xd = &x->e_mbd;
+ int n, r = 0, d = 0;
+ const uint8_t *usrc = x->src.u_buffer, *udst = xd->dst.u_buffer;
+ const uint8_t *vsrc = x->src.v_buffer, *vdst = xd->dst.v_buffer;
+ int src_uv_stride = x->src.uv_stride, dst_uv_stride = xd->dst.uv_stride;
+ int skippable = 1;
+ ENTROPY_CONTEXT_PLANES t_above[2], t_left[2];
+ ENTROPY_CONTEXT_PLANES *ta = xd->above_context;
+ ENTROPY_CONTEXT_PLANES *tl = xd->left_context;
+
+ memcpy(t_above, xd->above_context, sizeof(t_above));
+ memcpy(t_left, xd->left_context, sizeof(t_left));
+
+ for (n = 0; n < 4; n++) {
+ int x_idx = n & 1, y_idx = n >> 1;
+
+ vp9_subtract_mbuv_s_c(x->src_diff,
+ usrc + x_idx * 8 + y_idx * 8 * src_uv_stride,
+ vsrc + x_idx * 8 + y_idx * 8 * src_uv_stride,
+ src_uv_stride,
+ udst + x_idx * 8 + y_idx * 8 * dst_uv_stride,
+ vdst + x_idx * 8 + y_idx * 8 * dst_uv_stride,
+ dst_uv_stride);
+
+ vp9_transform_mbuv_8x8(x);
+ vp9_quantize_mbuv_8x8(x);
+
+ xd->above_context = ta + x_idx;
+ xd->left_context = tl + y_idx;
+ r += rd_cost_mbuv_8x8(x, 0);
+ d += vp9_mbuverror(x) / 4;
+ skippable = skippable && vp9_mbuv_is_skippable_8x8(xd);
+ }
+
+ *rate = r;
+ *distortion = d;
+ if (skip) *skip = skippable;
+ xd->left_context = tl;
+ xd->above_context = ta;
+ memcpy(xd->above_context, t_above, sizeof(t_above));
+ memcpy(xd->left_context, t_left, sizeof(t_left));
+
+ return RDCOST(x->rdmult, x->rddiv, *rate, *distortion);
+}
+#endif
+
+static int64_t rd_inter16x16_uv_8x8(VP9_COMP *cpi, MACROBLOCK *x, int *rate,
+ int *distortion, int fullpixel, int *skip) {
+ vp9_subtract_mbuv(x->src_diff, x->src.u_buffer, x->src.v_buffer,
+ x->e_mbd.predictor, x->src.uv_stride);
+
+ vp9_transform_mbuv_8x8(x);
+ vp9_quantize_mbuv_8x8(x);
+
+ *rate = rd_cost_mbuv_8x8(x, 1);
+ *distortion = vp9_mbuverror(x) / 4;
+ *skip = vp9_mbuv_is_skippable_8x8(&x->e_mbd);
+
+ return RDCOST(x->rdmult, x->rddiv, *rate, *distortion);
+}
+
+
+static int64_t rd_inter4x4_uv(VP9_COMP *cpi, MACROBLOCK *x, int *rate,
+ int *distortion, int *skippable, int fullpixel) {
+ vp9_build_inter4x4_predictors_mbuv(&x->e_mbd);
+ vp9_subtract_mbuv(x->src_diff, x->src.u_buffer, x->src.v_buffer,
+ x->e_mbd.predictor, x->src.uv_stride);
+
+ vp9_transform_mbuv_4x4(x);
+ vp9_quantize_mbuv_4x4(x);
+
+ *rate = rd_cost_mbuv(x);
+ *distortion = vp9_mbuverror(x) / 4;
+ *skippable = vp9_mbuv_is_skippable_4x4(&x->e_mbd);
+
+ return RDCOST(x->rdmult, x->rddiv, *rate, *distortion);
+}
+
+static void rd_pick_intra_mbuv_mode(VP9_COMP *cpi,
+ MACROBLOCK *x,
+ int *rate,
+ int *rate_tokenonly,
+ int *distortion,
+ int *skippable) {
+ MB_PREDICTION_MODE mode;
+ MB_PREDICTION_MODE UNINITIALIZED_IS_SAFE(mode_selected);
+#if CONFIG_COMP_INTRA_PRED
+ MB_PREDICTION_MODE mode2;
+ MB_PREDICTION_MODE UNINITIALIZED_IS_SAFE(mode2_selected);
+#endif
+ MACROBLOCKD *xd = &x->e_mbd;
+ MB_MODE_INFO * mbmi = &x->e_mbd.mode_info_context->mbmi;
+ int64_t best_rd = INT64_MAX;
+ int UNINITIALIZED_IS_SAFE(d), UNINITIALIZED_IS_SAFE(r);
+ int rate_to, UNINITIALIZED_IS_SAFE(skip);
+
+ for (mode = DC_PRED; mode <= TM_PRED; mode++) {
+#if CONFIG_COMP_INTRA_PRED
+ for (mode2 = DC_PRED - 1; mode2 != TM_PRED + 1; mode2++) {
+#endif
+ int rate;
+ int distortion;
+ int64_t this_rd;
+
+ mbmi->uv_mode = mode;
+#if CONFIG_COMP_INTRA_PRED
+ mbmi->second_uv_mode = mode2;
+ if (mode2 == (MB_PREDICTION_MODE)(DC_PRED - 1)) {
+#endif
+ vp9_build_intra_predictors_mbuv(&x->e_mbd);
+#if CONFIG_COMP_INTRA_PRED
+ } else {
+ continue;
+ vp9_build_comp_intra_predictors_mbuv(&x->e_mbd);
+ }
+#endif
+
+ vp9_subtract_mbuv(x->src_diff, x->src.u_buffer, x->src.v_buffer,
+ x->e_mbd.predictor, x->src.uv_stride);
+ vp9_transform_mbuv_4x4(x);
+ vp9_quantize_mbuv_4x4(x);
+
+ rate_to = rd_cost_mbuv(x);
+ rate = rate_to
+ + x->intra_uv_mode_cost[x->e_mbd.frame_type][mbmi->uv_mode];
+
+ distortion = vp9_mbuverror(x) / 4;
+
+ this_rd = RDCOST(x->rdmult, x->rddiv, rate, distortion);
+
+ if (this_rd < best_rd) {
+ skip = vp9_mbuv_is_skippable_4x4(xd);
+ best_rd = this_rd;
+ d = distortion;
+ r = rate;
+ *rate_tokenonly = rate_to;
+ mode_selected = mode;
+#if CONFIG_COMP_INTRA_PRED
+ mode2_selected = mode2;
+ }
+#endif
+ }
+ }
+
+ *rate = r;
+ *distortion = d;
+ *skippable = skip;
+
+ mbmi->uv_mode = mode_selected;
+#if CONFIG_COMP_INTRA_PRED
+ mbmi->second_uv_mode = mode2_selected;
+#endif
+}
+
+static void rd_pick_intra_mbuv_mode_8x8(VP9_COMP *cpi,
+ MACROBLOCK *x,
+ int *rate,
+ int *rate_tokenonly,
+ int *distortion,
+ int *skippable) {
+ MACROBLOCKD *xd = &x->e_mbd;
+ MB_PREDICTION_MODE mode;
+ MB_PREDICTION_MODE UNINITIALIZED_IS_SAFE(mode_selected);
+ MB_MODE_INFO * mbmi = &x->e_mbd.mode_info_context->mbmi;
+ int64_t best_rd = INT64_MAX;
+ int UNINITIALIZED_IS_SAFE(d), UNINITIALIZED_IS_SAFE(r);
+ int rate_to, UNINITIALIZED_IS_SAFE(skip);
+
+ for (mode = DC_PRED; mode <= TM_PRED; mode++) {
+ int rate;
+ int distortion;
+ int64_t this_rd;
+
+ mbmi->uv_mode = mode;
+ vp9_build_intra_predictors_mbuv(&x->e_mbd);
+ vp9_subtract_mbuv(x->src_diff, x->src.u_buffer, x->src.v_buffer,
+ x->e_mbd.predictor, x->src.uv_stride);
+ vp9_transform_mbuv_8x8(x);
+
+ vp9_quantize_mbuv_8x8(x);
+
+ rate_to = rd_cost_mbuv_8x8(x, 1);
+ rate = rate_to + x->intra_uv_mode_cost[x->e_mbd.frame_type][mbmi->uv_mode];
+
+ distortion = vp9_mbuverror(x) / 4;
+ this_rd = RDCOST(x->rdmult, x->rddiv, rate, distortion);
+
+ if (this_rd < best_rd) {
+ skip = vp9_mbuv_is_skippable_8x8(xd);
+ best_rd = this_rd;
+ d = distortion;
+ r = rate;
+ *rate_tokenonly = rate_to;
+ mode_selected = mode;
+ }
+ }
+ *rate = r;
+ *distortion = d;
+ *skippable = skip;
+ mbmi->uv_mode = mode_selected;
+}
+
+#if CONFIG_SUPERBLOCKS
+static void super_block_uvrd_8x8(MACROBLOCK *x,
+ int *rate,
+ int *distortion,
+ const VP9_ENCODER_RTCD *rtcd,
+ int *skippable) {
+ MACROBLOCKD *const xd = &x->e_mbd;
+ int d = 0, r = 0, n, s = 1;
+ const uint8_t *usrc = x->src.u_buffer, *udst = xd->dst.u_buffer;
+ const uint8_t *vsrc = x->src.v_buffer, *vdst = xd->dst.v_buffer;
+ int src_uv_stride = x->src.uv_stride, dst_uv_stride = xd->dst.uv_stride;
+ ENTROPY_CONTEXT_PLANES t_above[2], t_left[2];
+ ENTROPY_CONTEXT_PLANES *ta = xd->above_context;
+ ENTROPY_CONTEXT_PLANES *tl = xd->left_context;
+
+ memcpy(t_above, xd->above_context, sizeof(t_above));
+ memcpy(t_left, xd->left_context, sizeof(t_left));
+
+ for (n = 0; n < 4; n++) {
+ int x_idx = n & 1, y_idx = n >> 1;
+
+ vp9_subtract_mbuv_s_c(x->src_diff,
+ usrc + x_idx * 8 + y_idx * 8 * src_uv_stride,
+ vsrc + x_idx * 8 + y_idx * 8 * src_uv_stride,
+ src_uv_stride,
+ udst + x_idx * 8 + y_idx * 8 * dst_uv_stride,
+ vdst + x_idx * 8 + y_idx * 8 * dst_uv_stride,
+ dst_uv_stride);
+ vp9_transform_mbuv_8x8(x);
+ vp9_quantize_mbuv_8x8(x);
+ s &= vp9_mbuv_is_skippable_8x8(xd);
+
+ d += vp9_mbuverror(x) >> 2;
+ xd->above_context = ta + x_idx;
+ xd->left_context = tl + y_idx;
+ r += rd_cost_mbuv_8x8(x, 0);
+ }
+
+ xd->above_context = ta;
+ xd->left_context = tl;
+ *distortion = d;
+ *rate = r;
+ *skippable = s;
+
+ memcpy(xd->above_context, t_above, sizeof(t_above));
+ memcpy(xd->left_context, t_left, sizeof(t_left));
+}
+
+static int64_t rd_pick_intra_sbuv_mode(VP9_COMP *cpi,
+ MACROBLOCK *x,
+ int *rate,
+ int *rate_tokenonly,
+ int *distortion,
+ int *skippable) {
+ MB_PREDICTION_MODE mode;
+ MB_PREDICTION_MODE UNINITIALIZED_IS_SAFE(mode_selected);
+ int64_t best_rd = INT64_MAX, this_rd;
+ int this_rate_tokenonly, this_rate;
+ int this_distortion, s;
+
+ for (mode = DC_PRED; mode <= TM_PRED; mode++) {
+ x->e_mbd.mode_info_context->mbmi.uv_mode = mode;
+ vp9_build_intra_predictors_sbuv_s(&x->e_mbd);
+
+ super_block_uvrd_8x8(x, &this_rate_tokenonly,
+ &this_distortion, IF_RTCD(&cpi->rtcd), &s);
+ this_rate = this_rate_tokenonly +
+ x->mbmode_cost[x->e_mbd.frame_type]
+ [x->e_mbd.mode_info_context->mbmi.mode];
+ this_rd = RDCOST(x->rdmult, x->rddiv, this_rate, this_distortion);
+
+ if (this_rd < best_rd) {
+ mode_selected = mode;
+ best_rd = this_rd;
+ *rate = this_rate;
+ *rate_tokenonly = this_rate_tokenonly;
+ *distortion = this_distortion;
+ *skippable = s;
+ }
+ }
+
+ x->e_mbd.mode_info_context->mbmi.uv_mode = mode_selected;
+
+ return best_rd;
+}
+#endif
+
+int vp9_cost_mv_ref(VP9_COMP *cpi,
+ MB_PREDICTION_MODE m,
+ const int near_mv_ref_ct[4]) {
+ MACROBLOCKD *xd = &cpi->mb.e_mbd;
+ int segment_id = xd->mode_info_context->mbmi.segment_id;
+
+ // If the mode coding is done entirely at the segment level
+ // we should not account for it at the per mb level in rd code.
+ // Note that if the segment level coding is expanded from single mode
+ // to multiple mode masks as per reference frame coding we will need
+ // to do something different here.
+ if (!vp9_segfeature_active(xd, segment_id, SEG_LVL_MODE)) {
+ VP9_COMMON *pc = &cpi->common;
+
+ vp9_prob p [VP9_MVREFS - 1];
+ assert(NEARESTMV <= m && m <= SPLITMV);
+ vp9_mv_ref_probs(pc, p, near_mv_ref_ct);
+ return cost_token(vp9_mv_ref_tree, p,
+ vp9_mv_ref_encoding_array - NEARESTMV + m);
+ } else
+ return 0;
+}
+
+void vp9_set_mbmode_and_mvs(MACROBLOCK *x, MB_PREDICTION_MODE mb, int_mv *mv) {
+ x->e_mbd.mode_info_context->mbmi.mode = mb;
+ x->e_mbd.mode_info_context->mbmi.mv[0].as_int = mv->as_int;
+}
+
+static int labels2mode(
+ MACROBLOCK *x,
+ int const *labelings, int which_label,
+ B_PREDICTION_MODE this_mode,
+ int_mv *this_mv, int_mv *this_second_mv,
+ int_mv seg_mvs[MAX_REF_FRAMES - 1],
+ int_mv *best_ref_mv,
+ int_mv *second_best_ref_mv,
+ DEC_MVCOSTS) {
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MODE_INFO *const mic = xd->mode_info_context;
+ MB_MODE_INFO * mbmi = &mic->mbmi;
+ const int mis = xd->mode_info_stride;
+
+ int i, cost = 0, thismvcost = 0;
+
+ /* We have to be careful retrieving previously-encoded motion vectors.
+ Ones from this macroblock have to be pulled from the BLOCKD array
+ as they have not yet made it to the bmi array in our MB_MODE_INFO. */
+ for (i = 0; i < 16; ++i) {
+ BLOCKD *const d = xd->block + i;
+ const int row = i >> 2, col = i & 3;
+
+ B_PREDICTION_MODE m;
+
+ if (labelings[i] != which_label)
+ continue;
+
+ if (col && labelings[i] == labelings[i - 1])
+ m = LEFT4X4;
+ else if (row && labelings[i] == labelings[i - 4])
+ m = ABOVE4X4;
+ else {
+ // the only time we should do costing for new motion vector or mode
+ // is when we are on a new label (jbb May 08, 2007)
+ switch (m = this_mode) {
+ case NEW4X4 :
+ if (mbmi->second_ref_frame) {
+ this_mv->as_int = seg_mvs[mbmi->ref_frame - 1].as_int;
+ this_second_mv->as_int =
+ seg_mvs[mbmi->second_ref_frame - 1].as_int;
+ }
+
+ thismvcost = vp9_mv_bit_cost(this_mv, best_ref_mv, MVCOSTS,
+ 102, xd->allow_high_precision_mv);
+ if (mbmi->second_ref_frame) {
+ thismvcost += vp9_mv_bit_cost(this_second_mv, second_best_ref_mv,
+ MVCOSTS, 102,
+ xd->allow_high_precision_mv);
+ }
+ break;
+ case LEFT4X4:
+ this_mv->as_int = col ? d[-1].bmi.as_mv.first.as_int : left_block_mv(mic, i);
+ if (mbmi->second_ref_frame)
+ this_second_mv->as_int = col ? d[-1].bmi.as_mv.second.as_int : left_block_second_mv(mic, i);
+ break;
+ case ABOVE4X4:
+ this_mv->as_int = row ? d[-4].bmi.as_mv.first.as_int : above_block_mv(mic, i, mis);
+ if (mbmi->second_ref_frame)
+ this_second_mv->as_int = row ? d[-4].bmi.as_mv.second.as_int : above_block_second_mv(mic, i, mis);
+ break;
+ case ZERO4X4:
+ this_mv->as_int = 0;
+ if (mbmi->second_ref_frame)
+ this_second_mv->as_int = 0;
+ break;
+ default:
+ break;
+ }
+
+ if (m == ABOVE4X4) { // replace above with left if same
+ int_mv left_mv, left_second_mv;
+
+ left_second_mv.as_int = 0;
+ left_mv.as_int = col ? d[-1].bmi.as_mv.first.as_int :
+ left_block_mv(mic, i);
+ if (mbmi->second_ref_frame)
+ left_second_mv.as_int = col ? d[-1].bmi.as_mv.second.as_int :
+ left_block_second_mv(mic, i);
+
+ if (left_mv.as_int == this_mv->as_int &&
+ (!mbmi->second_ref_frame ||
+ left_second_mv.as_int == this_second_mv->as_int))
+ m = LEFT4X4;
+ }
+
+ cost = x->inter_bmode_costs[ m];
+ }
+
+ d->bmi.as_mv.first.as_int = this_mv->as_int;
+ if (mbmi->second_ref_frame)
+ d->bmi.as_mv.second.as_int = this_second_mv->as_int;
+
+ x->partition_info->bmi[i].mode = m;
+ x->partition_info->bmi[i].mv.as_int = this_mv->as_int;
+ if (mbmi->second_ref_frame)
+ x->partition_info->bmi[i].second_mv.as_int = this_second_mv->as_int;
+ }
+
+ cost += thismvcost;
+ return cost;
+}
+
+static int64_t encode_inter_mb_segment(MACROBLOCK *x,
+ int const *labels,
+ int which_label,
+ int *labelyrate,
+ int *distortion,
+ ENTROPY_CONTEXT *ta,
+ ENTROPY_CONTEXT *tl,
+ const VP9_ENCODER_RTCD *rtcd) {
+ int i;
+ MACROBLOCKD *xd = &x->e_mbd;
+
+ *labelyrate = 0;
+ *distortion = 0;
+ for (i = 0; i < 16; i++) {
+ if (labels[i] == which_label) {
+ BLOCKD *bd = &x->e_mbd.block[i];
+ BLOCK *be = &x->block[i];
+ int thisdistortion;
+
+ vp9_build_inter_predictors_b(bd, 16, xd->subpixel_predict);
+ if (xd->mode_info_context->mbmi.second_ref_frame)
+ vp9_build_2nd_inter_predictors_b(bd, 16, xd->subpixel_predict_avg);
+ vp9_subtract_b(be, bd, 16);
+ x->vp9_short_fdct4x4(be->src_diff, be->coeff, 32);
+ x->quantize_b_4x4(be, bd);
+ thisdistortion = vp9_block_error(be->coeff, bd->dqcoeff, 16);
+ *distortion += thisdistortion;
+ *labelyrate += cost_coeffs(x, bd, PLANE_TYPE_Y_WITH_DC,
+ ta + vp9_block2above[i],
+ tl + vp9_block2left[i], TX_4X4);
+ }
+ }
+ *distortion >>= 2;
+ return RDCOST(x->rdmult, x->rddiv, *labelyrate, *distortion);
+}
+
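+// Like encode_inter_mb_segment(), but for 8x8-based partitionings. If otherrd
+// is non-NULL, the residual is additionally costed with the other transform
+// size and that rd value is returned in *otherrd.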
+static int64_t encode_inter_mb_segment_8x8(MACROBLOCK *x,
+ int const *labels,
+ int which_label,
+ int *labelyrate,
+ int *distortion,
+ int64_t *otherrd,
+ ENTROPY_CONTEXT *ta,
+ ENTROPY_CONTEXT *tl,
+ const VP9_ENCODER_RTCD *rtcd) {
+ int i, j;
+ MACROBLOCKD *xd = &x->e_mbd;
+ const int iblock[4] = { 0, 1, 4, 5 };
+ int othercost = 0, otherdist = 0;
+ ENTROPY_CONTEXT_PLANES tac, tlc;
+ ENTROPY_CONTEXT *tacp = (ENTROPY_CONTEXT *) &tac,
+ *tlcp = (ENTROPY_CONTEXT *) &tlc;
+
+ if (otherrd) {
+ memcpy(&tac, ta, sizeof(ENTROPY_CONTEXT_PLANES));
+ memcpy(&tlc, tl, sizeof(ENTROPY_CONTEXT_PLANES));
+ }
+
+ *distortion = 0;
+ *labelyrate = 0;
+ for (i = 0; i < 4; i++) {
+ int ib = vp9_i8x8_block[i];
+
+ if (labels[ib] == which_label) {
+ int idx = (ib & 8) + ((ib & 2) << 1);
+ BLOCKD *bd = &xd->block[ib], *bd2 = &xd->block[idx];
+ BLOCK *be = &x->block[ib], *be2 = &x->block[idx];
+ int thisdistortion;
+
+ vp9_build_inter_predictors4b(xd, bd, 16);
+ if (xd->mode_info_context->mbmi.second_ref_frame)
+ vp9_build_2nd_inter_predictors4b(xd, bd, 16);
+ vp9_subtract_4b_c(be, bd, 16);
+
+ if (xd->mode_info_context->mbmi.txfm_size == TX_4X4) {
+ if (otherrd) {
+ x->vp9_short_fdct8x8(be->src_diff, be2->coeff, 32);
+ x->quantize_b_8x8(be2, bd2);
+ thisdistortion = vp9_block_error_c(be2->coeff, bd2->dqcoeff, 64);
+ otherdist += thisdistortion;
+ othercost += cost_coeffs(x, bd2, PLANE_TYPE_Y_WITH_DC,
+ tacp + vp9_block2above_8x8[idx],
+ tlcp + vp9_block2left_8x8[idx], TX_8X8);
+ }
+ for (j = 0; j < 4; j += 2) {
+ bd = &xd->block[ib + iblock[j]];
+ be = &x->block[ib + iblock[j]];
+ x->vp9_short_fdct8x4(be->src_diff, be->coeff, 32);
+ x->quantize_b_4x4_pair(be, be + 1, bd, bd + 1);
+ thisdistortion = vp9_block_error_c(be->coeff, bd->dqcoeff, 32);
+ *distortion += thisdistortion;
+ *labelyrate += cost_coeffs(x, bd, PLANE_TYPE_Y_WITH_DC,
+ ta + vp9_block2above[ib + iblock[j]],
+ tl + vp9_block2left[ib + iblock[j]],
+ TX_4X4);
+ *labelyrate += cost_coeffs(x, bd + 1, PLANE_TYPE_Y_WITH_DC,
+ ta + vp9_block2above[ib + iblock[j] + 1],
+ tl + vp9_block2left[ib + iblock[j]],
+ TX_4X4);
+ }
+ } else /* 8x8 */ {
+ if (otherrd) {
+ for (j = 0; j < 4; j += 2) {
+ BLOCKD *bd3 = &xd->block[ib + iblock[j]];
+ BLOCK *be3 = &x->block[ib + iblock[j]];
+ x->vp9_short_fdct8x4(be3->src_diff, be3->coeff, 32);
+ x->quantize_b_4x4_pair(be3, be3 + 1, bd3, bd3 + 1);
+ thisdistortion = vp9_block_error_c(be3->coeff, bd3->dqcoeff, 32);
+ otherdist += thisdistortion;
+ othercost += cost_coeffs(x, bd3, PLANE_TYPE_Y_WITH_DC,
+ tacp + vp9_block2above[ib + iblock[j]],
+ tlcp + vp9_block2left[ib + iblock[j]],
+ TX_4X4);
+ othercost += cost_coeffs(x, bd3 + 1, PLANE_TYPE_Y_WITH_DC,
+ tacp + vp9_block2above[ib + iblock[j] + 1],
+ tlcp + vp9_block2left[ib + iblock[j]],
+ TX_4X4);
+ }
+ }
+ x->vp9_short_fdct8x8(be->src_diff, be2->coeff, 32);
+ x->quantize_b_8x8(be2, bd2);
+ thisdistortion = vp9_block_error_c(be2->coeff, bd2->dqcoeff, 64);
+ *distortion += thisdistortion;
+ *labelyrate += cost_coeffs(x, bd2, PLANE_TYPE_Y_WITH_DC,
+ ta + vp9_block2above_8x8[idx],
+ tl + vp9_block2left_8x8[idx], TX_8X8);
+ }
+ }
+ }
+ *distortion >>= 2;
+ if (otherrd) {
+ otherdist >>= 2;
+ *otherrd = RDCOST(x->rdmult, x->rddiv, othercost, otherdist);
+ }
+ return RDCOST(x->rdmult, x->rddiv, *labelyrate, *distortion);
+}
+
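+// log2 of the number of 4x4 blocks in one partition segment, used to
+// normalize the motion search error of a segment to a per-block scale.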
+static const unsigned int segmentation_to_sseshift[4] = {3, 3, 2, 0};
+
+
+typedef struct {
+ int_mv *ref_mv, *second_ref_mv;
+ int_mv mvp;
+
+ int64_t segment_rd;
+ SPLITMV_PARTITIONING_TYPE segment_num;
+ TX_SIZE txfm_size;
+ int r;
+ int d;
+ int segment_yrate;
+ B_PREDICTION_MODE modes[16];
+ int_mv mvs[16], second_mvs[16];
+ int eobs[16];
+
+ int mvthresh;
+ int *mdcounts;
+
+ int_mv sv_mvp[4]; // save 4 mvp from 8x8
+ int sv_istep[2]; // save 2 initial step_param for 16x8/8x16
+
+} BEST_SEG_INFO;
+
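+// Returns nonzero if the motion vector (in 1/8-pel units) points outside the
+// allowed full-pel search range.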
+static __inline
+int mv_check_bounds(MACROBLOCK *x, int_mv *mv) {
+ int r = 0;
+ r |= (mv->as_mv.row >> 3) < x->mv_row_min;
+ r |= (mv->as_mv.row >> 3) > x->mv_row_max;
+ r |= (mv->as_mv.col >> 3) < x->mv_col_min;
+ r |= (mv->as_mv.col >> 3) > x->mv_col_max;
+ return r;
+}
+
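+// Evaluate one SPLITMV partitioning at a fixed transform size. rds[] returns
+// the cumulative rd cost after each label, otherrds[] (if non-NULL) an
+// estimate of the same cost with the other transform size, and *completed the
+// number of labels processed before any early termination.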
+static void rd_check_segment_txsize(VP9_COMP *cpi, MACROBLOCK *x,
+ BEST_SEG_INFO *bsi,
+ SPLITMV_PARTITIONING_TYPE segmentation,
+ TX_SIZE tx_size, int64_t *otherrds,
+ int64_t *rds, int *completed,
+ /* 16 = n_blocks */
+ int_mv seg_mvs[16 /* n_blocks */]
+ [MAX_REF_FRAMES - 1]) {
+ int i, j;
+ int const *labels;
+ int br = 0, bd = 0;
+ B_PREDICTION_MODE this_mode;
+ MB_MODE_INFO * mbmi = &x->e_mbd.mode_info_context->mbmi;
+
+ int label_count;
+ int64_t this_segment_rd = 0, other_segment_rd;
+ int label_mv_thresh;
+ int rate = 0;
+ int sbr = 0, sbd = 0;
+ int segmentyrate = 0;
+ int best_eobs[16] = { 0 };
+
+ vp9_variance_fn_ptr_t *v_fn_ptr;
+
+ ENTROPY_CONTEXT_PLANES t_above, t_left;
+ ENTROPY_CONTEXT *ta, *tl;
+ ENTROPY_CONTEXT_PLANES t_above_b, t_left_b;
+ ENTROPY_CONTEXT *ta_b, *tl_b;
+
+ vpx_memcpy(&t_above, x->e_mbd.above_context, sizeof(ENTROPY_CONTEXT_PLANES));
+ vpx_memcpy(&t_left, x->e_mbd.left_context, sizeof(ENTROPY_CONTEXT_PLANES));
+
+ ta = (ENTROPY_CONTEXT *)&t_above;
+ tl = (ENTROPY_CONTEXT *)&t_left;
+ ta_b = (ENTROPY_CONTEXT *)&t_above_b;
+ tl_b = (ENTROPY_CONTEXT *)&t_left_b;
+
+ v_fn_ptr = &cpi->fn_ptr[segmentation];
+ labels = vp9_mbsplits[segmentation];
+ label_count = vp9_mbsplit_count[segmentation];
+
+ // The multiplier scales the per-label mv threshold: 64 would make it so
+ // large that mvs are very rarely checked on segments, while 1 (used here)
+ // makes it roughly equal to the macroblock threshold.
+ label_mv_thresh = 1 * bsi->mvthresh / label_count;
+
+ // Segmentation method overheads
+ rate = cost_token(vp9_mbsplit_tree, vp9_mbsplit_probs,
+ vp9_mbsplit_encodings + segmentation);
+ rate += vp9_cost_mv_ref(cpi, SPLITMV, bsi->mdcounts);
+ this_segment_rd += RDCOST(x->rdmult, x->rddiv, rate, 0);
+ br += rate;
+ other_segment_rd = this_segment_rd;
+
+ mbmi->txfm_size = tx_size;
+ for (i = 0; i < label_count && this_segment_rd < bsi->segment_rd; i++) {
+ int_mv mode_mv[B_MODE_COUNT], second_mode_mv[B_MODE_COUNT];
+ int64_t best_label_rd = INT64_MAX, best_other_rd = INT64_MAX;
+ B_PREDICTION_MODE mode_selected = ZERO4X4;
+ int bestlabelyrate = 0;
+
+ // search for the best motion vector on this segment
+ for (this_mode = LEFT4X4; this_mode <= NEW4X4; this_mode ++) {
+ int64_t this_rd, other_rd;
+ int distortion;
+ int labelyrate;
+ ENTROPY_CONTEXT_PLANES t_above_s, t_left_s;
+ ENTROPY_CONTEXT *ta_s;
+ ENTROPY_CONTEXT *tl_s;
+
+ vpx_memcpy(&t_above_s, &t_above, sizeof(ENTROPY_CONTEXT_PLANES));
+ vpx_memcpy(&t_left_s, &t_left, sizeof(ENTROPY_CONTEXT_PLANES));
+
+ ta_s = (ENTROPY_CONTEXT *)&t_above_s;
+ tl_s = (ENTROPY_CONTEXT *)&t_left_s;
+
+ // motion search for newmv (single predictor case only)
+ if (!mbmi->second_ref_frame && this_mode == NEW4X4) {
+ int sseshift, n;
+ int step_param = 0;
+ int further_steps;
+ int thissme, bestsme = INT_MAX;
+ BLOCK *c;
+ BLOCKD *e;
+
+ /* Is the best so far sufficiently good that we can't justify doing
+ * a new motion search? */
+ if (best_label_rd < label_mv_thresh)
+ break;
+
+ if (cpi->compressor_speed) {
+ if (segmentation == PARTITIONING_8X16 ||
+ segmentation == PARTITIONING_16X8) {
+ bsi->mvp.as_int = bsi->sv_mvp[i].as_int;
+ if (i == 1 && segmentation == PARTITIONING_16X8)
+ bsi->mvp.as_int = bsi->sv_mvp[2].as_int;
+
+ step_param = bsi->sv_istep[i];
+ }
+
+ // use previous block's result as next block's MV predictor.
+ if (segmentation == PARTITIONING_4X4 && i > 0) {
+ bsi->mvp.as_int = x->e_mbd.block[i - 1].bmi.as_mv.first.as_int;
+ if (i == 4 || i == 8 || i == 12)
+ bsi->mvp.as_int = x->e_mbd.block[i - 4].bmi.as_mv.first.as_int;
+ step_param = 2;
+ }
+ }
+
+ further_steps = (MAX_MVSEARCH_STEPS - 1) - step_param;
+
+ {
+ int sadpb = x->sadperbit4;
+ int_mv mvp_full;
+
+ mvp_full.as_mv.row = bsi->mvp.as_mv.row >> 3;
+ mvp_full.as_mv.col = bsi->mvp.as_mv.col >> 3;
+
+ // find first label
+ n = vp9_mbsplit_offset[segmentation][i];
+
+ c = &x->block[n];
+ e = &x->e_mbd.block[n];
+
+ bestsme = vp9_full_pixel_diamond(cpi, x, c, e, &mvp_full, step_param,
+ sadpb, further_steps, 0, v_fn_ptr,
+ bsi->ref_mv, &mode_mv[NEW4X4]);
+
+ sseshift = segmentation_to_sseshift[segmentation];
+
+ // Should we do a full search (best quality only)
+ if ((cpi->compressor_speed == 0) && (bestsme >> sseshift) > 4000) {
+ /* Clamp mvp_full to the valid motion vector range. */
+ clamp_mv(&mvp_full, x->mv_col_min, x->mv_col_max,
+ x->mv_row_min, x->mv_row_max);
+
+ thissme = cpi->full_search_sad(x, c, e, &mvp_full,
+ sadpb, 16, v_fn_ptr,
+ XMVCOST, bsi->ref_mv);
+
+ if (thissme < bestsme) {
+ bestsme = thissme;
+ mode_mv[NEW4X4].as_int = e->bmi.as_mv.first.as_int;
+ } else {
+ /* The full search result is actually worse so re-instate the
+ * previous best vector */
+ e->bmi.as_mv.first.as_int = mode_mv[NEW4X4].as_int;
+ }
+ }
+ }
+
+ if (bestsme < INT_MAX) {
+ int distortion;
+ unsigned int sse;
+ cpi->find_fractional_mv_step(x, c, e, &mode_mv[NEW4X4],
+ bsi->ref_mv, x->errorperbit, v_fn_ptr,
+ XMVCOST, &distortion, &sse);
+
+ // save the motion search result for use in compound prediction
+ seg_mvs[i][mbmi->ref_frame - 1].as_int = mode_mv[NEW4X4].as_int;
+ }
+ } /* NEW4X4 */
+ else if (mbmi->second_ref_frame && this_mode == NEW4X4) {
+ /* motion search not completed? Then skip newmv for this block with
+ * comppred */
+ if (seg_mvs[i][mbmi->second_ref_frame - 1].as_int == INVALID_MV ||
+ seg_mvs[i][mbmi->ref_frame - 1].as_int == INVALID_MV) {
+ continue;
+ }
+ }
+
+ rate = labels2mode(x, labels, i, this_mode, &mode_mv[this_mode],
+ &second_mode_mv[this_mode], seg_mvs[i],
+ bsi->ref_mv, bsi->second_ref_mv, XMVCOST);
+
+ // Trap vectors that reach beyond the UMV borders
+ if (((mode_mv[this_mode].as_mv.row >> 3) < x->mv_row_min) ||
+ ((mode_mv[this_mode].as_mv.row >> 3) > x->mv_row_max) ||
+ ((mode_mv[this_mode].as_mv.col >> 3) < x->mv_col_min) ||
+ ((mode_mv[this_mode].as_mv.col >> 3) > x->mv_col_max)) {
+ continue;
+ }
+ if (mbmi->second_ref_frame &&
+ mv_check_bounds(x, &second_mode_mv[this_mode]))
+ continue;
+
+ if (segmentation == PARTITIONING_4X4) {
+ this_rd = encode_inter_mb_segment(x, labels, i, &labelyrate,
+ &distortion,
+ ta_s, tl_s, IF_RTCD(&cpi->rtcd));
+ other_rd = this_rd;
+ } else {
+ this_rd = encode_inter_mb_segment_8x8(x, labels, i, &labelyrate,
+ &distortion, &other_rd,
+ ta_s, tl_s, IF_RTCD(&cpi->rtcd));
+ }
+ this_rd += RDCOST(x->rdmult, x->rddiv, rate, 0);
+ rate += labelyrate;
+
+ if (this_rd < best_label_rd) {
+ sbr = rate;
+ sbd = distortion;
+ bestlabelyrate = labelyrate;
+ mode_selected = this_mode;
+ best_label_rd = this_rd;
+ if (x->e_mbd.mode_info_context->mbmi.txfm_size == TX_4X4) {
+ for (j = 0; j < 16; j++)
+ if (labels[j] == i)
+ best_eobs[j] = x->e_mbd.block[j].eob;
+ } else {
+ for (j = 0; j < 4; j++) {
+ int ib = vp9_i8x8_block[j], idx = j * 4;
+
+ if (labels[ib] == i)
+ best_eobs[idx] = x->e_mbd.block[idx].eob;
+ }
+ }
+ if (other_rd < best_other_rd)
+ best_other_rd = other_rd;
+
+ vpx_memcpy(ta_b, ta_s, sizeof(ENTROPY_CONTEXT_PLANES));
+ vpx_memcpy(tl_b, tl_s, sizeof(ENTROPY_CONTEXT_PLANES));
+
+ }
+ } /*for each 4x4 mode*/
+
+ vpx_memcpy(ta, ta_b, sizeof(ENTROPY_CONTEXT_PLANES));
+ vpx_memcpy(tl, tl_b, sizeof(ENTROPY_CONTEXT_PLANES));
+
+ labels2mode(x, labels, i, mode_selected, &mode_mv[mode_selected],
+ &second_mode_mv[mode_selected], seg_mvs[i],
+ bsi->ref_mv, bsi->second_ref_mv, XMVCOST);
+
+ br += sbr;
+ bd += sbd;
+ segmentyrate += bestlabelyrate;
+ this_segment_rd += best_label_rd;
+ other_segment_rd += best_other_rd;
+ if (rds)
+ rds[i] = this_segment_rd;
+ if (otherrds)
+ otherrds[i] = other_segment_rd;
+ } /* for each label */
+
+ if (this_segment_rd < bsi->segment_rd) {
+ bsi->r = br;
+ bsi->d = bd;
+ bsi->segment_yrate = segmentyrate;
+ bsi->segment_rd = this_segment_rd;
+ bsi->segment_num = segmentation;
+ bsi->txfm_size = mbmi->txfm_size;
+
+ // store everything needed to come back to this!!
+ for (i = 0; i < 16; i++) {
+ BLOCKD *bd = &x->e_mbd.block[i];
+
+ bsi->mvs[i].as_mv = x->partition_info->bmi[i].mv.as_mv;
+ if (mbmi->second_ref_frame)
+ bsi->second_mvs[i].as_mv = x->partition_info->bmi[i].second_mv.as_mv;
+ bsi->modes[i] = x->partition_info->bmi[i].mode;
+ bsi->eobs[i] = best_eobs[i];
+ }
+ }
+
+ if (completed) {
+ *completed = i;
+ }
+}
+
+static void rd_check_segment(VP9_COMP *cpi, MACROBLOCK *x,
+ BEST_SEG_INFO *bsi,
+ unsigned int segmentation,
+ /* 16 = n_blocks */
+ int_mv seg_mvs[16][MAX_REF_FRAMES - 1],
+ int64_t txfm_cache[NB_TXFM_MODES]) {
+ int i, n, c = vp9_mbsplit_count[segmentation];
+
+ if (segmentation == PARTITIONING_4X4) {
+ int64_t rd[16];
+
+ rd_check_segment_txsize(cpi, x, bsi, segmentation, TX_4X4, NULL,
+ rd, &n, seg_mvs);
+ if (n == c) {
+ for (i = 0; i < NB_TXFM_MODES; i++) {
+ if (rd[c - 1] < txfm_cache[i])
+ txfm_cache[i] = rd[c - 1];
+ }
+ }
+ } else {
+ int64_t diff, base_rd;
+ int cost4x4 = vp9_cost_bit(cpi->common.prob_tx[0], 0);
+ int cost8x8 = vp9_cost_bit(cpi->common.prob_tx[0], 1);
+
+ if (cpi->common.txfm_mode == TX_MODE_SELECT) {
+ int64_t rd4x4[4], rd8x8[4];
+ int n4x4, n8x8, nmin;
+ BEST_SEG_INFO bsi4x4, bsi8x8;
+
+ /* factor the 4x4/8x8 transform signalling cost into the decision */
+ vpx_memcpy(&bsi4x4, bsi, sizeof(*bsi));
+ vpx_memcpy(&bsi8x8, bsi, sizeof(*bsi));
+ rd_check_segment_txsize(cpi, x, &bsi4x4, segmentation,
+ TX_4X4, NULL, rd4x4, &n4x4, seg_mvs);
+ rd_check_segment_txsize(cpi, x, &bsi8x8, segmentation,
+ TX_8X8, NULL, rd8x8, &n8x8, seg_mvs);
+ if (bsi4x4.segment_num == segmentation) {
+ bsi4x4.segment_rd += RDCOST(x->rdmult, x->rddiv, cost4x4, 0);
+ if (bsi4x4.segment_rd < bsi->segment_rd)
+ vpx_memcpy(bsi, &bsi4x4, sizeof(*bsi));
+ }
+ if (bsi8x8.segment_num == segmentation) {
+ bsi8x8.segment_rd += RDCOST(x->rdmult, x->rddiv, cost8x8, 0);
+ if (bsi8x8.segment_rd < bsi->segment_rd)
+ vpx_memcpy(bsi, &bsi8x8, sizeof(*bsi));
+ }
+ n = n4x4 > n8x8 ? n4x4 : n8x8;
+ if (n == c) {
+ nmin = n4x4 < n8x8 ? n4x4 : n8x8;
+ diff = rd8x8[nmin - 1] - rd4x4[nmin - 1];
+ if (n == n4x4) {
+ base_rd = rd4x4[c - 1];
+ } else {
+ base_rd = rd8x8[c - 1] - diff;
+ }
+ }
+ } else {
+ int64_t rd[4], otherrd[4];
+
+ if (cpi->common.txfm_mode == ONLY_4X4) {
+ rd_check_segment_txsize(cpi, x, bsi, segmentation, TX_4X4, otherrd,
+ rd, &n, seg_mvs);
+ if (n == c) {
+ base_rd = rd[c - 1];
+ diff = otherrd[c - 1] - rd[c - 1];
+ }
+ } else /* use 8x8 transform */ {
+ rd_check_segment_txsize(cpi, x, bsi, segmentation, TX_8X8, otherrd,
+ rd, &n, seg_mvs);
+ if (n == c) {
+ diff = rd[c - 1] - otherrd[c - 1];
+ base_rd = otherrd[c - 1];
+ }
+ }
+ }
+
+ if (n == c) {
+ if (base_rd < txfm_cache[ONLY_4X4]) {
+ txfm_cache[ONLY_4X4] = base_rd;
+ }
+ if (base_rd + diff < txfm_cache[1]) {
+ txfm_cache[ALLOW_8X8] = txfm_cache[ALLOW_16X16] = base_rd + diff;
+ }
+ if (diff < 0) {
+ base_rd += diff + RDCOST(x->rdmult, x->rddiv, cost8x8, 0);
+ } else {
+ base_rd += RDCOST(x->rdmult, x->rddiv, cost4x4, 0);
+ }
+ if (base_rd < txfm_cache[TX_MODE_SELECT]) {
+ txfm_cache[TX_MODE_SELECT] = base_rd;
+ }
+ }
+ }
+}
+
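+// Convert a motion search range (in full pels) into an initial step_param for
+// the diamond search: a larger range yields a smaller step_param and hence
+// more search steps.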
+static __inline void cal_step_param(int sr, int *sp) {
+ int step = 0;
+
+ if (sr > MAX_FIRST_STEP) sr = MAX_FIRST_STEP;
+ else if (sr < 1) sr = 1;
+
+ while (sr >>= 1)
+ step++;
+
+ *sp = MAX_MVSEARCH_STEPS - 1 - step;
+}
+
+static int rd_pick_best_mbsegmentation(VP9_COMP *cpi, MACROBLOCK *x,
+ int_mv *best_ref_mv,
+ int_mv *second_best_ref_mv,
+ int64_t best_rd,
+ int *mdcounts,
+ int *returntotrate,
+ int *returnyrate,
+ int *returndistortion,
+ int *skippable, int mvthresh,
+ int_mv seg_mvs[NB_PARTITIONINGS]
+ [16 /* n_blocks */]
+ [MAX_REF_FRAMES - 1],
+ int64_t txfm_cache[NB_TXFM_MODES]) {
+ int i;
+ BEST_SEG_INFO bsi;
+ MB_MODE_INFO * mbmi = &x->e_mbd.mode_info_context->mbmi;
+
+ vpx_memset(&bsi, 0, sizeof(bsi));
+ for (i = 0; i < NB_TXFM_MODES; i++)
+ txfm_cache[i] = INT64_MAX;
+
+ bsi.segment_rd = best_rd;
+ bsi.ref_mv = best_ref_mv;
+ bsi.second_ref_mv = second_best_ref_mv;
+ bsi.mvp.as_int = best_ref_mv->as_int;
+ bsi.mvthresh = mvthresh;
+ bsi.mdcounts = mdcounts;
+ bsi.txfm_size = TX_4X4;
+
+ for (i = 0; i < 16; i++)
+ bsi.modes[i] = ZERO4X4;
+
+ if (cpi->compressor_speed == 0) {
+ /* for now, we will keep the original segmentation order
+ when in best quality mode */
+ rd_check_segment(cpi, x, &bsi, PARTITIONING_16X8,
+ seg_mvs[PARTITIONING_16X8], txfm_cache);
+ rd_check_segment(cpi, x, &bsi, PARTITIONING_8X16,
+ seg_mvs[PARTITIONING_8X16], txfm_cache);
+ rd_check_segment(cpi, x, &bsi, PARTITIONING_8X8,
+ seg_mvs[PARTITIONING_8X8], txfm_cache);
+ rd_check_segment(cpi, x, &bsi, PARTITIONING_4X4,
+ seg_mvs[PARTITIONING_4X4], txfm_cache);
+ } else {
+ int sr;
+
+ rd_check_segment(cpi, x, &bsi, PARTITIONING_8X8,
+ seg_mvs[PARTITIONING_8X8], txfm_cache);
+
+ if (bsi.segment_rd < best_rd) {
+ int tmp_col_min = x->mv_col_min;
+ int tmp_col_max = x->mv_col_max;
+ int tmp_row_min = x->mv_row_min;
+ int tmp_row_max = x->mv_row_max;
+
+ vp9_clamp_mv_min_max(x, best_ref_mv);
+
+ /* Get 8x8 result */
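+ /* (the four 8x8 partitions correspond to 4x4 blocks 0, 2, 8 and 10) */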
+ bsi.sv_mvp[0].as_int = bsi.mvs[0].as_int;
+ bsi.sv_mvp[1].as_int = bsi.mvs[2].as_int;
+ bsi.sv_mvp[2].as_int = bsi.mvs[8].as_int;
+ bsi.sv_mvp[3].as_int = bsi.mvs[10].as_int;
+
+ /* Use 8x8 result as 16x8/8x16's predictor MV. Adjust search range
+ * according to the closeness of 2 MV. */
+ /* block 8X16 */
+ sr = MAXF((abs(bsi.sv_mvp[0].as_mv.row - bsi.sv_mvp[2].as_mv.row)) >> 3,
+ (abs(bsi.sv_mvp[0].as_mv.col - bsi.sv_mvp[2].as_mv.col)) >> 3);
+ cal_step_param(sr, &bsi.sv_istep[0]);
+
+ sr = MAXF((abs(bsi.sv_mvp[1].as_mv.row - bsi.sv_mvp[3].as_mv.row)) >> 3,
+ (abs(bsi.sv_mvp[1].as_mv.col - bsi.sv_mvp[3].as_mv.col)) >> 3);
+ cal_step_param(sr, &bsi.sv_istep[1]);
+
+ rd_check_segment(cpi, x, &bsi, PARTITIONING_8X16,
+ seg_mvs[PARTITIONING_8X16], txfm_cache);
+
+ /* block 16X8 */
+ sr = MAXF((abs(bsi.sv_mvp[0].as_mv.row - bsi.sv_mvp[1].as_mv.row)) >> 3,
+ (abs(bsi.sv_mvp[0].as_mv.col - bsi.sv_mvp[1].as_mv.col)) >> 3);
+ cal_step_param(sr, &bsi.sv_istep[0]);
+
+ sr = MAXF((abs(bsi.sv_mvp[2].as_mv.row - bsi.sv_mvp[3].as_mv.row)) >> 3,
+ (abs(bsi.sv_mvp[2].as_mv.col - bsi.sv_mvp[3].as_mv.col)) >> 3);
+ cal_step_param(sr, &bsi.sv_istep[1]);
+
+ rd_check_segment(cpi, x, &bsi, PARTITIONING_16X8,
+ seg_mvs[PARTITIONING_16X8], txfm_cache);
+
+ /* If 8x8 was better than 16x8/8x16, then also do the 4x4 search; */
+ /* never skip 4x4 when speed=0 (good quality) */
+ if (cpi->sf.no_skip_block4x4_search ||
+ bsi.segment_num == PARTITIONING_8X8) {
+ /* || (sv_segment_rd8x8-bsi.segment_rd) < sv_segment_rd8x8>>5) */
+ bsi.mvp.as_int = bsi.sv_mvp[0].as_int;
+ rd_check_segment(cpi, x, &bsi, PARTITIONING_4X4,
+ seg_mvs[PARTITIONING_4X4], txfm_cache);
+ }
+
+ /* restore UMV window */
+ x->mv_col_min = tmp_col_min;
+ x->mv_col_max = tmp_col_max;
+ x->mv_row_min = tmp_row_min;
+ x->mv_row_max = tmp_row_max;
+ }
+ }
+
+ /* set it to the best */
+ for (i = 0; i < 16; i++) {
+ BLOCKD *bd = &x->e_mbd.block[i];
+
+ bd->bmi.as_mv.first.as_int = bsi.mvs[i].as_int;
+ if (mbmi->second_ref_frame)
+ bd->bmi.as_mv.second.as_int = bsi.second_mvs[i].as_int;
+ bd->eob = bsi.eobs[i];
+ }
+
+ *returntotrate = bsi.r;
+ *returndistortion = bsi.d;
+ *returnyrate = bsi.segment_yrate;
+ *skippable = bsi.txfm_size == TX_4X4 ?
+ vp9_mby_is_skippable_4x4(&x->e_mbd, 0) :
+ vp9_mby_is_skippable_8x8(&x->e_mbd, 0);
+
+ /* save partitions */
+ mbmi->txfm_size = bsi.txfm_size;
+ mbmi->partitioning = bsi.segment_num;
+ x->partition_info->count = vp9_mbsplit_count[bsi.segment_num];
+
+ for (i = 0; i < x->partition_info->count; i++) {
+ int j;
+
+ j = vp9_mbsplit_offset[bsi.segment_num][i];
+
+ x->partition_info->bmi[i].mode = bsi.modes[j];
+ x->partition_info->bmi[i].mv.as_mv = bsi.mvs[j].as_mv;
+ if (mbmi->second_ref_frame)
+ x->partition_info->bmi[i].second_mv.as_mv = bsi.second_mvs[j].as_mv;
+ }
+ /*
+ * used to set mbmi->mv.as_int
+ */
+ x->partition_info->bmi[15].mv.as_int = bsi.mvs[15].as_int;
+ if (mbmi->second_ref_frame)
+ x->partition_info->bmi[15].second_mv.as_int = bsi.second_mvs[15].as_int;
+
+ return bsi.segment_rd;
+}
+
+/* Sort arr in increasing order; insertsortsad below also records the original positions in idx */
+static void insertsortmv(int arr[], int len) {
+ int i, j, k;
+
+ for (i = 1; i <= len - 1; i++) {
+ for (j = 0; j < i; j++) {
+ if (arr[j] > arr[i]) {
+ int temp;
+
+ temp = arr[i];
+
+ for (k = i; k > j; k--)
+ arr[k] = arr[k - 1];
+
+ arr[j] = temp;
+ }
+ }
+ }
+}
+
+static void insertsortsad(int arr[], int idx[], int len) {
+ int i, j, k;
+
+ for (i = 1; i <= len - 1; i++) {
+ for (j = 0; j < i; j++) {
+ if (arr[j] > arr[i]) {
+ int temp, tempi;
+
+ temp = arr[i];
+ tempi = idx[i];
+
+ for (k = i; k > j; k--) {
+ arr[k] = arr[k - 1];
+ idx[k] = idx[k - 1];
+ }
+
+ arr[j] = temp;
+ idx[j] = tempi;
+ }
+ }
+ }
+}
+
+// The improved MV prediction
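+// Collect up to 8 candidate MVs: above, left and above-left in the current
+// frame, plus co-located and neighboring blocks in the last frame. The first
+// candidate, in SAD order, that used the same reference frame is chosen;
+// otherwise the component-wise median of the candidates is used.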
+void vp9_mv_pred(VP9_COMP *cpi, MACROBLOCKD *xd, const MODE_INFO *here,
+ int_mv *mvp, int refframe, int *ref_frame_sign_bias,
+ int *sr, int near_sadidx[]) {
+ const MODE_INFO *above = here - xd->mode_info_stride;
+ const MODE_INFO *left = here - 1;
+ const MODE_INFO *aboveleft = above - 1;
+ int_mv near_mvs[8];
+ int near_ref[8];
+ int_mv mv;
+ int vcnt = 0;
+ int find = 0;
+ int mb_offset;
+
+ int mvx[8];
+ int mvy[8];
+ int i;
+
+ mv.as_int = 0;
+
+ if (here->mbmi.ref_frame != INTRA_FRAME) {
+ near_mvs[0].as_int = near_mvs[1].as_int = near_mvs[2].as_int =
+ near_mvs[3].as_int = near_mvs[4].as_int = near_mvs[5].as_int =
+ near_mvs[6].as_int = near_mvs[7].as_int = 0;
+ near_ref[0] = near_ref[1] = near_ref[2] = near_ref[3] = near_ref[4] =
+ near_ref[5] = near_ref[6] = near_ref[7] = 0;
+
+ // read in 3 nearby block's MVs from current frame as prediction candidates.
+ if (above->mbmi.ref_frame != INTRA_FRAME) {
+ near_mvs[vcnt].as_int = above->mbmi.mv[0].as_int;
+ mv_bias(ref_frame_sign_bias[above->mbmi.ref_frame], refframe, &near_mvs[vcnt], ref_frame_sign_bias);
+ near_ref[vcnt] = above->mbmi.ref_frame;
+ }
+ vcnt++;
+ if (left->mbmi.ref_frame != INTRA_FRAME) {
+ near_mvs[vcnt].as_int = left->mbmi.mv[0].as_int;
+ mv_bias(ref_frame_sign_bias[left->mbmi.ref_frame], refframe, &near_mvs[vcnt], ref_frame_sign_bias);
+ near_ref[vcnt] = left->mbmi.ref_frame;
+ }
+ vcnt++;
+ if (aboveleft->mbmi.ref_frame != INTRA_FRAME) {
+ near_mvs[vcnt].as_int = aboveleft->mbmi.mv[0].as_int;
+ mv_bias(ref_frame_sign_bias[aboveleft->mbmi.ref_frame], refframe, &near_mvs[vcnt], ref_frame_sign_bias);
+ near_ref[vcnt] = aboveleft->mbmi.ref_frame;
+ }
+ vcnt++;
+
+ // read in 5 nearby block's MVs from last frame.
+ if (cpi->common.last_frame_type != KEY_FRAME) {
+ mb_offset = (-xd->mb_to_top_edge / 128 + 1) * (xd->mode_info_stride + 1) + (-xd->mb_to_left_edge / 128 + 1);
+
+ // current in last frame
+ if (cpi->lf_ref_frame[mb_offset] != INTRA_FRAME) {
+ near_mvs[vcnt].as_int = cpi->lfmv[mb_offset].as_int;
+ mv_bias(cpi->lf_ref_frame_sign_bias[mb_offset], refframe, &near_mvs[vcnt], ref_frame_sign_bias);
+ near_ref[vcnt] = cpi->lf_ref_frame[mb_offset];
+ }
+ vcnt++;
+
+ // above in last frame
+ if (cpi->lf_ref_frame[mb_offset - xd->mode_info_stride - 1] != INTRA_FRAME) {
+ near_mvs[vcnt].as_int = cpi->lfmv[mb_offset - xd->mode_info_stride - 1].as_int;
+ mv_bias(cpi->lf_ref_frame_sign_bias[mb_offset - xd->mode_info_stride - 1], refframe, &near_mvs[vcnt], ref_frame_sign_bias);
+ near_ref[vcnt] = cpi->lf_ref_frame[mb_offset - xd->mode_info_stride - 1];
+ }
+ vcnt++;
+
+ // left in last frame
+ if (cpi->lf_ref_frame[mb_offset - 1] != INTRA_FRAME) {
+ near_mvs[vcnt].as_int = cpi->lfmv[mb_offset - 1].as_int;
+ mv_bias(cpi->lf_ref_frame_sign_bias[mb_offset - 1], refframe, &near_mvs[vcnt], ref_frame_sign_bias);
+ near_ref[vcnt] = cpi->lf_ref_frame[mb_offset - 1];
+ }
+ vcnt++;
+
+ // right in last frame
+ if (cpi->lf_ref_frame[mb_offset + 1] != INTRA_FRAME) {
+ near_mvs[vcnt].as_int = cpi->lfmv[mb_offset + 1].as_int;
+ mv_bias(cpi->lf_ref_frame_sign_bias[mb_offset + 1], refframe, &near_mvs[vcnt], ref_frame_sign_bias);
+ near_ref[vcnt] = cpi->lf_ref_frame[mb_offset + 1];
+ }
+ vcnt++;
+
+ // below in last frame
+ if (cpi->lf_ref_frame[mb_offset + xd->mode_info_stride + 1] != INTRA_FRAME) {
+ near_mvs[vcnt].as_int = cpi->lfmv[mb_offset + xd->mode_info_stride + 1].as_int;
+ mv_bias(cpi->lf_ref_frame_sign_bias[mb_offset + xd->mode_info_stride + 1], refframe, &near_mvs[vcnt], ref_frame_sign_bias);
+ near_ref[vcnt] = cpi->lf_ref_frame[mb_offset + xd->mode_info_stride + 1];
+ }
+ vcnt++;
+ }
+
+ for (i = 0; i < vcnt; i++) {
+ if (near_ref[near_sadidx[i]] != INTRA_FRAME) {
+ if (here->mbmi.ref_frame == near_ref[near_sadidx[i]]) {
+ mv.as_int = near_mvs[near_sadidx[i]].as_int;
+ find = 1;
+ if (i < 3)
+ *sr = 3;
+ else
+ *sr = 2;
+ break;
+ }
+ }
+ }
+
+ if (!find) {
+ for (i = 0; i < vcnt; i++) {
+ mvx[i] = near_mvs[i].as_mv.row;
+ mvy[i] = near_mvs[i].as_mv.col;
+ }
+
+ insertsortmv(mvx, vcnt);
+ insertsortmv(mvy, vcnt);
+ mv.as_mv.row = mvx[vcnt / 2];
+ mv.as_mv.col = mvy[vcnt / 2];
+
+ find = 1;
+ // sr is set to 0 to allow calling function to decide the search range.
+ *sr = 0;
+ }
+ }
+
+ /* Set up return values */
+ mvp->as_int = mv.as_int;
+ clamp_mv2(mvp, xd);
+}
+
+static void cal_sad(VP9_COMP *cpi, MACROBLOCKD *xd, MACROBLOCK *x,
+ int recon_yoffset, int near_sadidx[],
+ enum BlockSize block_size) {
+ /* near_sad indices: 0-2 = above/left/above-left in the current frame,
+ * 3-7 = co-located/above/left/right/below in the last frame */
+ int near_sad[8] = {0};
+ BLOCK *b = &x->block[0];
+ unsigned char *src_y_ptr = *(b->base_src);
+ const unsigned char *dst_y_ptr = xd->dst.y_buffer;
+ const int bs = (block_size == BLOCK_16X16) ? 16 : 32;
+ const int dst_y_str = xd->dst.y_stride;
+
+ // calculate sad for current frame 3 nearby MBs.
+ if (xd->mb_to_top_edge == 0 && xd->mb_to_left_edge == 0) {
+ near_sad[0] = near_sad[1] = near_sad[2] = INT_MAX;
+ } else if (xd->mb_to_top_edge == 0) {
+ // only has left MB for sad calculation.
+ near_sad[0] = near_sad[2] = INT_MAX;
+ near_sad[1] = cpi->fn_ptr[block_size].sdf(src_y_ptr, b->src_stride,
+ dst_y_ptr - bs,
+ dst_y_str, 0x7fffffff);
+ } else if (xd->mb_to_left_edge == 0) {
+ // only has above MB for sad calculation.
+ near_sad[1] = near_sad[2] = INT_MAX;
+ near_sad[0] = cpi->fn_ptr[block_size].sdf(src_y_ptr, b->src_stride,
+ dst_y_ptr - dst_y_str * bs,
+ dst_y_str, 0x7fffffff);
+ } else {
+ near_sad[0] = cpi->fn_ptr[block_size].sdf(src_y_ptr, b->src_stride,
+ dst_y_ptr - dst_y_str * bs,
+ dst_y_str, 0x7fffffff);
+ near_sad[1] = cpi->fn_ptr[block_size].sdf(src_y_ptr, b->src_stride,
+ dst_y_ptr - bs,
+ dst_y_str, 0x7fffffff);
+ near_sad[2] = cpi->fn_ptr[block_size].sdf(src_y_ptr, b->src_stride,
+ dst_y_ptr - dst_y_str * bs - bs,
+ dst_y_str, 0x7fffffff);
+ }
+
+ if (cpi->common.last_frame_type != KEY_FRAME) {
+ // calculate sad for last frame 5 nearby MBs.
+ unsigned char *pre_y_buffer = cpi->common.yv12_fb[cpi->common.lst_fb_idx].y_buffer + recon_yoffset;
+ const int pre_y_str = cpi->common.yv12_fb[cpi->common.lst_fb_idx].y_stride;
+
+ if (xd->mb_to_top_edge == 0) near_sad[4] = INT_MAX;
+ if (xd->mb_to_left_edge == 0) near_sad[5] = INT_MAX;
+ if (xd->mb_to_right_edge == 0) near_sad[6] = INT_MAX;
+ if (xd->mb_to_bottom_edge == 0) near_sad[7] = INT_MAX;
+
+ near_sad[3] = cpi->fn_ptr[block_size].sdf(src_y_ptr, b->src_stride,
+ pre_y_buffer,
+ pre_y_str, 0x7fffffff);
+ if (near_sad[4] != INT_MAX)
+ near_sad[4] = cpi->fn_ptr[block_size].sdf(src_y_ptr, b->src_stride,
+ pre_y_buffer - pre_y_str * bs,
+ pre_y_str, 0x7fffffff);
+ if (near_sad[5] != INT_MAX)
+ near_sad[5] = cpi->fn_ptr[block_size].sdf(src_y_ptr, b->src_stride,
+ pre_y_buffer - bs,
+ pre_y_str, 0x7fffffff);
+ if (near_sad[6] != INT_MAX)
+ near_sad[6] = cpi->fn_ptr[block_size].sdf(src_y_ptr, b->src_stride,
+ pre_y_buffer + bs,
+ pre_y_str, 0x7fffffff);
+ if (near_sad[7] != INT_MAX)
+ near_sad[7] = cpi->fn_ptr[block_size].sdf(src_y_ptr, b->src_stride,
+ pre_y_buffer + pre_y_str * bs,
+ pre_y_str, 0x7fffffff);
+ }
+
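+ // order the candidates by SAD; the resulting near_sadidx[] ordering is used
+ // by vp9_mv_pred()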
+ if (cpi->common.last_frame_type != KEY_FRAME) {
+ insertsortsad(near_sad, near_sadidx, 8);
+ } else {
+ insertsortsad(near_sad, near_sadidx, 3);
+ }
+}
+
+static void set_i8x8_block_modes(MACROBLOCK *x, int modes[2][4]) {
+ int i;
+ MACROBLOCKD *xd = &x->e_mbd;
+ for (i = 0; i < 4; i++) {
+ int ib = vp9_i8x8_block[i];
+ xd->mode_info_context->bmi[ib + 0].as_mode.first = modes[0][i];
+ xd->mode_info_context->bmi[ib + 1].as_mode.first = modes[0][i];
+ xd->mode_info_context->bmi[ib + 4].as_mode.first = modes[0][i];
+ xd->mode_info_context->bmi[ib + 5].as_mode.first = modes[0][i];
+#if CONFIG_COMP_INTRA_PRED
+ xd->mode_info_context->bmi[ib + 0].as_mode.second = modes[1][i];
+ xd->mode_info_context->bmi[ib + 1].as_mode.second = modes[1][i];
+ xd->mode_info_context->bmi[ib + 4].as_mode.second = modes[1][i];
+ xd->mode_info_context->bmi[ib + 5].as_mode.second = modes[1][i];
+#endif
+ // printf("%d,%d,%d,%d %d,%d,%d,%d\n",
+ // modes[0][0], modes[0][1], modes[0][2], modes[0][3],
+ // modes[1][0], modes[1][1], modes[1][2], modes[1][3]);
+ }
+
+ for (i = 0; i < 16; i++) {
+ xd->block[i].bmi = xd->mode_info_context->bmi[i];
+ }
+}
+
+extern void vp9_calc_ref_probs(int *count, vp9_prob *probs);
+static void estimate_curframe_refprobs(VP9_COMP *cpi, vp9_prob mod_refprobs[3], int pred_ref) {
+ int norm_cnt[MAX_REF_FRAMES];
+ const int *const rfct = cpi->count_mb_ref_frame_usage;
+ int intra_count = rfct[INTRA_FRAME];
+ int last_count = rfct[LAST_FRAME];
+ int gf_count = rfct[GOLDEN_FRAME];
+ int arf_count = rfct[ALTREF_FRAME];
+
+ // Work out modified reference frame probabilities to use where prediction
+ // of the reference frame fails
+ if (pred_ref == INTRA_FRAME) {
+ norm_cnt[0] = 0;
+ norm_cnt[1] = last_count;
+ norm_cnt[2] = gf_count;
+ norm_cnt[3] = arf_count;
+ vp9_calc_ref_probs(norm_cnt, mod_refprobs);
+ mod_refprobs[0] = 0; // This branch implicit
+ } else if (pred_ref == LAST_FRAME) {
+ norm_cnt[0] = intra_count;
+ norm_cnt[1] = 0;
+ norm_cnt[2] = gf_count;
+ norm_cnt[3] = arf_count;
+ vp9_calc_ref_probs(norm_cnt, mod_refprobs);
+ mod_refprobs[1] = 0; // This branch implicit
+ } else if (pred_ref == GOLDEN_FRAME) {
+ norm_cnt[0] = intra_count;
+ norm_cnt[1] = last_count;
+ norm_cnt[2] = 0;
+ norm_cnt[3] = arf_count;
+ vp9_calc_ref_probs(norm_cnt, mod_refprobs);
+ mod_refprobs[2] = 0; // This branch implicit
+ } else {
+ norm_cnt[0] = intra_count;
+ norm_cnt[1] = last_count;
+ norm_cnt[2] = gf_count;
+ norm_cnt[3] = 0;
+ vp9_calc_ref_probs(norm_cnt, mod_refprobs);
+ mod_refprobs[2] = 0; // This branch implicit
+ }
+}
+
+static __inline unsigned weighted_cost(vp9_prob *tab0, vp9_prob *tab1, int idx, int val, int weight) {
+ unsigned cost0 = tab0[idx] ? vp9_cost_bit(tab0[idx], val) : 0;
+ unsigned cost1 = tab1[idx] ? vp9_cost_bit(tab1[idx], val) : 0;
+ // weight is 16-bit fixed point, so this basically calculates:
+ // 0.5 + weight * cost1 + (1.0 - weight) * cost0
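+ // (e.g. weight == 0x8000, i.e. 0.5, yields the rounded average of the two)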
+ return (0x8000 + weight * cost1 + (0x10000 - weight) * cost0) >> 16;
+}
+
+static void estimate_ref_frame_costs(VP9_COMP *cpi, int segment_id, unsigned int *ref_costs) {
+ VP9_COMMON *cm = &cpi->common;
+ MACROBLOCKD *xd = &cpi->mb.e_mbd;
+ vp9_prob *mod_refprobs;
+
+ unsigned int cost;
+ int pred_ref;
+ int pred_flag;
+ int pred_ctx;
+ int i;
+ int tot_count;
+
+ vp9_prob pred_prob, new_pred_prob;
+ int seg_ref_active;
+ int seg_ref_count = 0;
+ seg_ref_active = vp9_segfeature_active(xd,
+ segment_id,
+ SEG_LVL_REF_FRAME);
+
+ if (seg_ref_active) {
+ seg_ref_count = vp9_check_segref(xd, segment_id, INTRA_FRAME) +
+ vp9_check_segref(xd, segment_id, LAST_FRAME) +
+ vp9_check_segref(xd, segment_id, GOLDEN_FRAME) +
+ vp9_check_segref(xd, segment_id, ALTREF_FRAME);
+ }
+
+ // Get the predicted reference for this mb
+ pred_ref = vp9_get_pred_ref(cm, xd);
+
+ // Get the context probability for the prediction flag (based on last frame)
+ pred_prob = vp9_get_pred_prob(cm, xd, PRED_REF);
+
+ // Predict probability for current frame based on stats so far
+ pred_ctx = vp9_get_pred_context(cm, xd, PRED_REF);
+ tot_count = cpi->ref_pred_count[pred_ctx][0] + cpi->ref_pred_count[pred_ctx][1];
+ if (tot_count) {
+ new_pred_prob =
+ (cpi->ref_pred_count[pred_ctx][0] * 255 + (tot_count >> 1)) / tot_count;
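+ // keep the estimated probability nonzero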
+ new_pred_prob += !new_pred_prob;
+ } else
+ new_pred_prob = 128;
+
+ // Get the set of probabilities to use if prediction fails
+ mod_refprobs = cm->mod_refprobs[pred_ref];
+
+ // For each possible selected reference frame work out a cost.
+ for (i = 0; i < MAX_REF_FRAMES; i++) {
+ if (seg_ref_active && seg_ref_count == 1) {
+ cost = 0;
+ } else {
+ pred_flag = (i == pred_ref);
+
+ // Get the prediction for the current mb
+ cost = weighted_cost(&pred_prob, &new_pred_prob, 0,
+ pred_flag, cpi->seg0_progress);
+ if (cost > 1024) cost = 768; // i.e. account for 4 bits max.
+
+ // for incorrectly predicted cases
+ if (! pred_flag) {
+ vp9_prob curframe_mod_refprobs[3];
+
+ if (cpi->seg0_progress) {
+ estimate_curframe_refprobs(cpi, curframe_mod_refprobs, pred_ref);
+ } else {
+ vpx_memset(curframe_mod_refprobs, 0, sizeof(curframe_mod_refprobs));
+ }
+
+ cost += weighted_cost(mod_refprobs, curframe_mod_refprobs, 0,
+ (i != INTRA_FRAME), cpi->seg0_progress);
+ if (i != INTRA_FRAME) {
+ cost += weighted_cost(mod_refprobs, curframe_mod_refprobs, 1,
+ (i != LAST_FRAME), cpi->seg0_progress);
+ if (i != LAST_FRAME) {
+ cost += weighted_cost(mod_refprobs, curframe_mod_refprobs, 2,
+ (i != GOLDEN_FRAME), cpi->seg0_progress);
+ }
+ }
+ }
+ }
+
+ ref_costs[i] = cost;
+ }
+}
+
+static void store_coding_context(MACROBLOCK *x, PICK_MODE_CONTEXT *ctx,
+ int mode_index,
+ PARTITION_INFO *partition,
+ int_mv *ref_mv,
+ int_mv *second_ref_mv,
+ int single_pred_diff,
+ int comp_pred_diff,
+ int hybrid_pred_diff,
+ int64_t txfm_size_diff[NB_TXFM_MODES]) {
+ MACROBLOCKD *xd = &x->e_mbd;
+ MB_MODE_INFO *mbmi = &xd->mode_info_context->mbmi;
+
+ // Take a snapshot of the coding context so it can be
+ // restored if we decide to encode this way
+ ctx->best_mode_index = mode_index;
+ vpx_memcpy(&ctx->mic, xd->mode_info_context,
+ sizeof(MODE_INFO));
+ if (partition)
+ vpx_memcpy(&ctx->partition_info, partition,
+ sizeof(PARTITION_INFO));
+ ctx->best_ref_mv.as_int = ref_mv->as_int;
+ ctx->second_best_ref_mv.as_int = second_ref_mv->as_int;
+
+ // ctx[mb_index].rddiv = x->rddiv;
+ // ctx[mb_index].rdmult = x->rdmult;
+
+ ctx->single_pred_diff = single_pred_diff;
+ ctx->comp_pred_diff = comp_pred_diff;
+ ctx->hybrid_pred_diff = hybrid_pred_diff;
+
+ if (txfm_size_diff) {
+ memcpy(ctx->txfm_rd_diff, txfm_size_diff, sizeof(ctx->txfm_rd_diff));
+ } else {
+ memset(ctx->txfm_rd_diff, 0, sizeof(ctx->txfm_rd_diff));
+ }
+}
+
+static void inter_mode_cost(VP9_COMP *cpi, MACROBLOCK *x, int this_mode,
+ int *rate2, int *distortion2, int *rate_y,
+ int *distortion, int* rate_uv, int *distortion_uv,
+ int *skippable, int64_t txfm_cache[NB_TXFM_MODES]) {
+ int y_skippable, uv_skippable;
+
+ // Y cost and distortion
+ macro_block_yrd(cpi, x, rate_y, distortion, &y_skippable, txfm_cache);
+
+ *rate2 += *rate_y;
+ *distortion2 += *distortion;
+
+ // UV cost and distortion
+ if (x->e_mbd.mode_info_context->mbmi.txfm_size != TX_4X4)
+ rd_inter16x16_uv_8x8(cpi, x, rate_uv, distortion_uv,
+ cpi->common.full_pixel, &uv_skippable);
+ else
+ rd_inter16x16_uv(cpi, x, rate_uv, distortion_uv, cpi->common.full_pixel,
+ &uv_skippable);
+ *rate2 += *rate_uv;
+ *distortion2 += *distortion_uv;
+ *skippable = y_skippable && uv_skippable;
+}
+
+#define MIN(x,y) (((x)<(y))?(x):(y))
+#define MAX(x,y) (((x)>(y))?(x):(y))
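+// Set up prediction buffer pointers, nearest/near/best reference MVs and the
+// mv reference counts for the given reference frame.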
+static void setup_buffer_inter(VP9_COMP *cpi, MACROBLOCK *x,
+ int idx, int frame_type,
+ int recon_yoffset, int recon_uvoffset,
+ int_mv frame_nearest_mv[4],
+ int_mv frame_near_mv[4],
+ int_mv frame_best_ref_mv[4],
+ int frame_mdcounts[4][4],
+ unsigned char *y_buffer[4],
+ unsigned char *u_buffer[4],
+ unsigned char *v_buffer[4]) {
+ YV12_BUFFER_CONFIG *yv12 = &cpi->common.yv12_fb[idx];
+ MACROBLOCKD *xd = &x->e_mbd;
+ MB_MODE_INFO * mbmi = &xd->mode_info_context->mbmi;
+
+
+ vp9_find_near_mvs(xd, xd->mode_info_context,
+ xd->prev_mode_info_context,
+ &frame_nearest_mv[frame_type], &frame_near_mv[frame_type],
+ &frame_best_ref_mv[frame_type], frame_mdcounts[frame_type],
+ frame_type, cpi->common.ref_frame_sign_bias);
+
+ y_buffer[frame_type] = yv12->y_buffer + recon_yoffset;
+ u_buffer[frame_type] = yv12->u_buffer + recon_uvoffset;
+ v_buffer[frame_type] = yv12->v_buffer + recon_uvoffset;
+
+#if CONFIG_NEWBESTREFMV
+ vp9_find_mv_refs(xd, xd->mode_info_context,
+ xd->prev_mode_info_context,
+ frame_type,
+ mbmi->ref_mvs[frame_type],
+ cpi->common.ref_frame_sign_bias);
+
+ vp9_find_best_ref_mvs(xd, y_buffer[frame_type],
+ yv12->y_stride,
+ mbmi->ref_mvs[frame_type],
+ &frame_best_ref_mv[frame_type],
+ &frame_nearest_mv[frame_type],
+ &frame_near_mv[frame_type]);
+#endif
+}
+
+static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
+ enum BlockSize block_size,
+ int *saddone, int near_sadidx[],
+ int mdcounts[4], int64_t txfm_cache[],
+ int *rate2, int *distortion, int *skippable,
+ int *compmode_cost,
+ int *rate_y, int *distortion_y,
+ int *rate_uv, int *distortion_uv,
+ int *mode_excluded, int *disable_skip,
+ int recon_yoffset, int mode_index,
+ int_mv frame_mv[MB_MODE_COUNT][MAX_REF_FRAMES],
+ int_mv frame_best_ref_mv[4]) {
+ VP9_COMMON *cm = &cpi->common;
+ MACROBLOCKD *xd = &x->e_mbd;
+ MB_MODE_INFO *mbmi = &xd->mode_info_context->mbmi;
+ BLOCK *b = &x->block[0];
+ BLOCKD *d = &xd->block[0];
+ const int is_comp_pred = (mbmi->second_ref_frame != 0);
+ const int num_refs = is_comp_pred ? 2 : 1;
+ const int this_mode = mbmi->mode;
+ int i;
+ int refs[2] = { mbmi->ref_frame, mbmi->second_ref_frame };
+ int_mv cur_mv[2];
+ int_mv mvp;
+ int64_t this_rd = 0;
+
+ switch (this_mode) {
+ case NEWMV:
+ if (is_comp_pred) {
+ if (frame_mv[NEWMV][refs[0]].as_int == INVALID_MV ||
+ frame_mv[NEWMV][refs[1]].as_int == INVALID_MV)
+ return INT64_MAX;
+ *rate2 += vp9_mv_bit_cost(&frame_mv[NEWMV][refs[0]],
+ &frame_best_ref_mv[refs[0]],
+ XMVCOST, 96,
+ x->e_mbd.allow_high_precision_mv);
+ *rate2 += vp9_mv_bit_cost(&frame_mv[NEWMV][refs[1]],
+ &frame_best_ref_mv[refs[1]],
+ XMVCOST, 96,
+ x->e_mbd.allow_high_precision_mv);
+ } else {
+ int bestsme = INT_MAX;
+ int further_steps, step_param = cpi->sf.first_step;
+ int sadpb = x->sadperbit16;
+ int_mv mvp_full, tmp_mv;
+ // search range got from mv_pred(). It uses step_param levels. (0-7)
+ int sr = 0;
+
+ int tmp_col_min = x->mv_col_min;
+ int tmp_col_max = x->mv_col_max;
+ int tmp_row_min = x->mv_row_min;
+ int tmp_row_max = x->mv_row_max;
+
+ vp9_clamp_mv_min_max(x, &frame_best_ref_mv[refs[0]]);
+
+ if (!*saddone) {
+ cal_sad(cpi, xd, x, recon_yoffset, &near_sadidx[0], block_size);
+ *saddone = 1;
+ }
+
+ vp9_mv_pred(cpi, &x->e_mbd, x->e_mbd.mode_info_context, &mvp,
+ mbmi->ref_frame, cpi->common.ref_frame_sign_bias,
+ &sr, &near_sadidx[0]);
+
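+        // The predicted MV is in 1/8-pel units; convert to full-pel units
+        // for the full-pixel diamond search below.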
+ mvp_full.as_mv.col = mvp.as_mv.col >> 3;
+ mvp_full.as_mv.row = mvp.as_mv.row >> 3;
+
+ // adjust search range according to sr from mv prediction
+ step_param = MAX(step_param, sr);
+
+ // Further step/diamond searches as necessary
+ further_steps = (cpi->sf.max_step_search_steps - 1) - step_param;
+
+ bestsme = vp9_full_pixel_diamond(cpi, x, b, d, &mvp_full, step_param,
+ sadpb, further_steps, 1,
+ &cpi->fn_ptr[block_size],
+ &frame_best_ref_mv[refs[0]], &tmp_mv);
+
+ x->mv_col_min = tmp_col_min;
+ x->mv_col_max = tmp_col_max;
+ x->mv_row_min = tmp_row_min;
+ x->mv_row_max = tmp_row_max;
+
+ if (bestsme < INT_MAX) {
+ int dis; /* TODO: use dis in distortion calculation later. */
+ unsigned int sse;
+ cpi->find_fractional_mv_step(x, b, d, &tmp_mv,
+ &frame_best_ref_mv[refs[0]],
+ x->errorperbit,
+ &cpi->fn_ptr[block_size],
+ XMVCOST, &dis, &sse);
+ }
+ d->bmi.as_mv.first.as_int = tmp_mv.as_int;
+ frame_mv[NEWMV][refs[0]].as_int = d->bmi.as_mv.first.as_int;
+
+ // Add the new motion vector cost to our rolling cost variable
+ *rate2 += vp9_mv_bit_cost(&tmp_mv, &frame_best_ref_mv[refs[0]],
+ XMVCOST, 96, xd->allow_high_precision_mv);
+ }
+ break;
+ case NEARESTMV:
+ case NEARMV:
+ // Do not bother proceeding if the vector (from newmv, nearest or
+ // near) is 0,0 as this should then be coded using the zeromv mode.
+ for (i = 0; i < num_refs; ++i)
+ if (frame_mv[this_mode][refs[i]].as_int == 0)
+ return INT64_MAX;
+ case ZEROMV:
+ default:
+ break;
+ }
+ for (i = 0; i < num_refs; ++i) {
+ cur_mv[i] = frame_mv[this_mode][refs[i]];
+    // Clip "next_nearest" so that it does not extend too far out of the image
+ clamp_mv2(&cur_mv[i], xd);
+ if (mv_check_bounds(x, &cur_mv[i]))
+ return INT64_MAX;
+ mbmi->mv[i].as_int = cur_mv[i].as_int;
+ }
+
+#if CONFIG_PRED_FILTER
+ // Filtered prediction:
+ mbmi->pred_filter_enabled = vp9_mode_order[mode_index].pred_filter_flag;
+ *rate2 += vp9_cost_bit(cpi->common.prob_pred_filter_off,
+ mbmi->pred_filter_enabled);
+#endif
+ if (cpi->common.mcomp_filter_type == SWITCHABLE) {
+ const int c = vp9_get_pred_context(cm, xd, PRED_SWITCHABLE_INTERP);
+ const int m = vp9_switchable_interp_map[mbmi->interp_filter];
+ *rate2 += SWITCHABLE_INTERP_RATE_FACTOR * x->switchable_interp_costs[c][m];
+ }
+
+ /* We don't include the cost of the second reference here, because there
+ * are only three options: Last/Golden, ARF/Last or Golden/ARF, or in other
+ * words if you present them in that order, the second one is always known
+ * if the first is known */
+ *compmode_cost = vp9_cost_bit(vp9_get_pred_prob(cm, xd, PRED_COMP),
+ is_comp_pred);
+ *rate2 += vp9_cost_mv_ref(cpi, this_mode, mdcounts);
+
+ if (block_size == BLOCK_16X16) {
+ vp9_build_1st_inter16x16_predictors_mby(xd, xd->predictor, 16, 0);
+ if (is_comp_pred)
+ vp9_build_2nd_inter16x16_predictors_mby(xd, xd->predictor, 16);
+ } else {
+#if CONFIG_SUPERBLOCKS
+ vp9_build_inter32x32_predictors_sb(xd,
+ xd->dst.y_buffer,
+ xd->dst.u_buffer,
+ xd->dst.v_buffer,
+ xd->dst.y_stride,
+ xd->dst.uv_stride);
+#endif
+ }
+
+ if (cpi->active_map_enabled && x->active_ptr[0] == 0)
+ x->skip = 1;
+ else if (x->encode_breakout) {
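+    // Encode breakout: if the prediction error (SSE) is below a threshold
+    // derived from the dequantizer (or the user-set encode_breakout value),
+    // the residual is assumed to code to almost nothing and the block is
+    // marked to be skipped.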
+ unsigned int sse, var;
+ int threshold = (xd->block[0].dequant[1]
+ * xd->block[0].dequant[1] >> 4);
+
+ if (threshold < x->encode_breakout)
+ threshold = x->encode_breakout;
+
+ if (block_size == BLOCK_16X16) {
+ var = vp9_variance16x16(*(b->base_src), b->src_stride,
+ xd->predictor, 16, &sse);
+ } else {
+#if CONFIG_SUPERBLOCKS
+ var = vp9_variance32x32(*(b->base_src), b->src_stride,
+ xd->dst.y_buffer, xd->dst.y_stride, &sse);
+#endif
+ }
+
+ if (sse < threshold) {
+ unsigned int q2dc = xd->block[24].dequant[0];
+        /* If there is no codable 2nd order DC
+           or a very small uniform pixel change */
+ if ((sse - var < q2dc * q2dc >> 4) ||
+ (sse / 2 > var && sse - var < 64)) {
+ // Check u and v to make sure skip is ok
+ int sse2;
+
+ if (block_size == BLOCK_16X16) {
+ sse2 = vp9_uvsse(x);
+ } else {
+ unsigned int sse2u, sse2v;
+ var = vp9_variance16x16(x->src.u_buffer, x->src.uv_stride,
+ xd->dst.u_buffer, xd->dst.uv_stride, &sse2u);
+ var = vp9_variance16x16(x->src.v_buffer, x->src.uv_stride,
+ xd->dst.v_buffer, xd->dst.uv_stride, &sse2v);
+ sse2 = sse2u + sse2v;
+ }
+
+ if (sse2 * 2 < threshold) {
+ x->skip = 1;
+ *distortion = sse + sse2;
+ *rate2 = 500;
+
+ /* for best_yrd calculation */
+ *rate_uv = 0;
+ *distortion_uv = sse2;
+
+ *disable_skip = 1;
+ this_rd = RDCOST(x->rdmult, x->rddiv, *rate2, *distortion);
+ }
+ }
+ }
+ }
+
+ if (!x->skip) {
+ if (block_size == BLOCK_16X16) {
+ vp9_build_1st_inter16x16_predictors_mbuv(xd, &xd->predictor[256],
+ &xd->predictor[320], 8);
+ if (is_comp_pred)
+ vp9_build_2nd_inter16x16_predictors_mbuv(xd, &xd->predictor[256],
+ &xd->predictor[320], 8);
+ inter_mode_cost(cpi, x, this_mode, rate2, distortion,
+ rate_y, distortion_y, rate_uv, distortion_uv,
+ skippable, txfm_cache);
+ } else {
+#if CONFIG_SUPERBLOCKS
+ int skippable_y, skippable_uv;
+
+ // Y cost and distortion - FIXME support other transform sizes
+ super_block_yrd_8x8(x, rate_y, distortion_y,
+ IF_RTCD(&cpi->rtcd), &skippable_y);
+ *rate2 += *rate_y;
+ *distortion += *distortion_y;
+
+ rd_inter32x32_uv_8x8(cpi, x, rate_uv, distortion_uv,
+ cm->full_pixel, &skippable_uv);
+
+ *rate2 += *rate_uv;
+ *distortion += *distortion_uv;
+ *skippable = skippable_y && skippable_uv;
+#endif
+ }
+ }
+ if (is_comp_pred) {
+ *mode_excluded = (cpi->common.comp_pred_mode == SINGLE_PREDICTION_ONLY);
+ } else {
+ *mode_excluded = (cpi->common.comp_pred_mode == COMP_PREDICTION_ONLY);
+ }
+
+ return this_rd; // if 0, this will be re-calculated by caller
+}
+
+void vp9_rd_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
+ int recon_yoffset, int recon_uvoffset,
+ int *returnrate, int *returndistortion,
+ int64_t *returnintra) {
+ VP9_COMMON *cm = &cpi->common;
+ MACROBLOCKD *xd = &x->e_mbd;
+ union b_mode_info best_bmodes[16];
+ MB_MODE_INFO best_mbmode;
+ PARTITION_INFO best_partition;
+ int_mv best_ref_mv, second_best_ref_mv;
+ MB_PREDICTION_MODE this_mode;
+ MB_MODE_INFO * mbmi = &xd->mode_info_context->mbmi;
+ int i, best_mode_index = 0;
+ int mode8x8[2][4];
+ unsigned char segment_id = mbmi->segment_id;
+
+ int mode_index;
+ int mdcounts[4];
+ int rate, distortion;
+ int rate2, distortion2;
+ int64_t best_txfm_rd[NB_TXFM_MODES];
+ int64_t best_txfm_diff[NB_TXFM_MODES];
+ int64_t best_pred_diff[NB_PREDICTION_TYPES];
+ int64_t best_pred_rd[NB_PREDICTION_TYPES];
+ int64_t best_rd = INT64_MAX, best_intra_rd = INT64_MAX;
+#if CONFIG_PRED_FILTER
+ int64_t best_overall_rd = INT64_MAX;
+#endif
+ int uv_intra_rate, uv_intra_distortion, uv_intra_rate_tokenonly;
+ int uv_intra_skippable = 0;
+  int uv_intra_rate_8x8 = 0, uv_intra_distortion_8x8 = 0;
+  int uv_intra_rate_tokenonly_8x8 = 0;
+ int uv_intra_skippable_8x8 = 0;
+ int rate_y, UNINITIALIZED_IS_SAFE(rate_uv);
+ int distortion_uv = INT_MAX;
+ int64_t best_yrd = INT64_MAX;
+#if CONFIG_PRED_FILTER
+ int best_filter_state;
+#endif
+ int switchable_filter_index = 0;
+
+ MB_PREDICTION_MODE uv_intra_mode;
+ MB_PREDICTION_MODE uv_intra_mode_8x8 = 0;
+
+ int near_sadidx[8] = {0, 1, 2, 3, 4, 5, 6, 7};
+ int saddone = 0;
+
+ int_mv frame_mv[MB_MODE_COUNT][MAX_REF_FRAMES];
+ int_mv frame_best_ref_mv[4];
+ int frame_mdcounts[4][4];
+ unsigned char *y_buffer[4], *u_buffer[4], *v_buffer[4];
+
+ unsigned int ref_costs[MAX_REF_FRAMES];
+ int_mv seg_mvs[NB_PARTITIONINGS][16 /* n_blocks */][MAX_REF_FRAMES - 1];
+
+ vpx_memset(mode8x8, 0, sizeof(mode8x8));
+ vpx_memset(&frame_mv, 0, sizeof(frame_mv));
+ vpx_memset(&best_mbmode, 0, sizeof(best_mbmode));
+ vpx_memset(&best_bmodes, 0, sizeof(best_bmodes));
+ vpx_memset(&x->mb_context[xd->mb_index], 0, sizeof(PICK_MODE_CONTEXT));
+
+ for (i = 0; i < MAX_REF_FRAMES; i++)
+ frame_mv[NEWMV][i].as_int = INVALID_MV;
+ for (i = 0; i < NB_PREDICTION_TYPES; ++i)
+ best_pred_rd[i] = INT64_MAX;
+ for (i = 0; i < NB_TXFM_MODES; i++)
+ best_txfm_rd[i] = INT64_MAX;
+
+ for (i = 0; i < NB_PARTITIONINGS; i++) {
+ int j, k;
+
+ for (j = 0; j < 16; j++)
+ for (k = 0; k < MAX_REF_FRAMES - 1; k++)
+ seg_mvs[i][j][k].as_int = INVALID_MV;
+ }
+
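+  // Set up prediction buffers and motion vector candidates for each
+  // reference frame that is available for this frame.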
+ if (cpi->ref_frame_flags & VP9_LAST_FLAG) {
+ setup_buffer_inter(cpi, x, cpi->common.lst_fb_idx, LAST_FRAME,
+ recon_yoffset, recon_uvoffset, frame_mv[NEARESTMV],
+ frame_mv[NEARMV], frame_best_ref_mv,
+ frame_mdcounts, y_buffer, u_buffer, v_buffer);
+ }
+
+ if (cpi->ref_frame_flags & VP9_GOLD_FLAG) {
+ setup_buffer_inter(cpi, x, cpi->common.gld_fb_idx, GOLDEN_FRAME,
+ recon_yoffset, recon_uvoffset, frame_mv[NEARESTMV],
+ frame_mv[NEARMV], frame_best_ref_mv,
+ frame_mdcounts, y_buffer, u_buffer, v_buffer);
+ }
+
+ if (cpi->ref_frame_flags & VP9_ALT_FLAG) {
+ setup_buffer_inter(cpi, x, cpi->common.alt_fb_idx, ALTREF_FRAME,
+ recon_yoffset, recon_uvoffset, frame_mv[NEARESTMV],
+ frame_mv[NEARMV], frame_best_ref_mv,
+ frame_mdcounts, y_buffer, u_buffer, v_buffer);
+ }
+
+ *returnintra = INT64_MAX;
+
+ x->skip = 0;
+
+ mbmi->ref_frame = INTRA_FRAME;
+
+ /* Initialize zbin mode boost for uv costing */
+ cpi->zbin_mode_boost = 0;
+ vp9_update_zbin_extra(cpi, x);
+
+ rd_pick_intra_mbuv_mode(cpi, x, &uv_intra_rate,
+ &uv_intra_rate_tokenonly, &uv_intra_distortion,
+ &uv_intra_skippable);
+ uv_intra_mode = mbmi->uv_mode;
+
+ /* rough estimate for now */
+ if (cpi->common.txfm_mode != ONLY_4X4) {
+ rd_pick_intra_mbuv_mode_8x8(cpi, x, &uv_intra_rate_8x8,
+ &uv_intra_rate_tokenonly_8x8,
+ &uv_intra_distortion_8x8,
+ &uv_intra_skippable_8x8);
+ uv_intra_mode_8x8 = mbmi->uv_mode;
+ }
+
+ // Get estimates of reference frame costs for each reference frame
+ // that depend on the current prediction etc.
+ estimate_ref_frame_costs(cpi, segment_id, ref_costs);
+
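+  // Main mode search loop. With a switchable interpolation filter, each
+  // inter mode is evaluated once per filter: mode_index only advances once
+  // switchable_filter_index has wrapped back to 0.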
+ for (mode_index = 0; mode_index < MAX_MODES;
+ mode_index += (!switchable_filter_index)) {
+ int64_t this_rd = INT64_MAX;
+ int disable_skip = 0, skippable = 0;
+ int other_cost = 0;
+ int compmode_cost = 0;
+ int mode_excluded = 0;
+ int64_t txfm_cache[NB_TXFM_MODES] = { 0 };
+
+    // These variables hold the rolling total cost and distortion for this mode
+ rate2 = 0;
+ distortion2 = 0;
+ rate_y = 0;
+ rate_uv = 0;
+
+ this_mode = vp9_mode_order[mode_index].mode;
+ mbmi->mode = this_mode;
+ mbmi->uv_mode = DC_PRED;
+ mbmi->ref_frame = vp9_mode_order[mode_index].ref_frame;
+ mbmi->second_ref_frame = vp9_mode_order[mode_index].second_ref_frame;
+#if CONFIG_PRED_FILTER
+ mbmi->pred_filter_enabled = 0;
+#endif
+ if (cpi->common.mcomp_filter_type == SWITCHABLE &&
+ this_mode >= NEARESTMV && this_mode <= SPLITMV) {
+ mbmi->interp_filter =
+ vp9_switchable_interp[switchable_filter_index++];
+ if (switchable_filter_index == VP9_SWITCHABLE_FILTERS)
+ switchable_filter_index = 0;
+ } else {
+ mbmi->interp_filter = cpi->common.mcomp_filter_type;
+ }
+ vp9_setup_interp_filters(xd, mbmi->interp_filter, &cpi->common);
+
+ // Test best rd so far against threshold for trying this mode.
+ if (best_rd <= cpi->rd_threshes[mode_index])
+ continue;
+
+ // current coding mode under rate-distortion optimization test loop
+#if CONFIG_COMP_INTRA_PRED
+ mbmi->second_mode = (MB_PREDICTION_MODE)(DC_PRED - 1);
+ mbmi->second_uv_mode = (MB_PREDICTION_MODE)(DC_PRED - 1);
+#endif
+
+ // If the segment reference frame feature is enabled....
+ // then do nothing if the current ref frame is not allowed..
+ if (vp9_segfeature_active(xd, segment_id, SEG_LVL_REF_FRAME) &&
+ !vp9_check_segref(xd, segment_id, mbmi->ref_frame)) {
+ continue;
+ // If the segment mode feature is enabled....
+ // then do nothing if the current mode is not allowed..
+ } else if (vp9_segfeature_active(xd, segment_id, SEG_LVL_MODE) &&
+ (this_mode !=
+ vp9_get_segdata(xd, segment_id, SEG_LVL_MODE))) {
+ continue;
+ // Disable this drop out case if either the mode or ref frame
+ // segment level feature is enabled for this segment. This is to
+    // prevent the possibility that we end up unable to pick any mode.
+ } else if (!vp9_segfeature_active(xd, segment_id, SEG_LVL_REF_FRAME) &&
+ !vp9_segfeature_active(xd, segment_id, SEG_LVL_MODE)) {
+ // Only consider ZEROMV/ALTREF_FRAME for alt ref frame,
+ // unless ARNR filtering is enabled in which case we want
+ // an unfiltered alternative
+ if (cpi->is_src_frame_alt_ref && (cpi->oxcf.arnr_max_frames == 0)) {
+ if (this_mode != ZEROMV ||
+ mbmi->ref_frame != ALTREF_FRAME) {
+ continue;
+ }
+ }
+ }
+
+ /* everything but intra */
+ if (mbmi->ref_frame) {
+ int ref = mbmi->ref_frame;
+
+ xd->pre.y_buffer = y_buffer[ref];
+ xd->pre.u_buffer = u_buffer[ref];
+ xd->pre.v_buffer = v_buffer[ref];
+ best_ref_mv = frame_best_ref_mv[ref];
+ vpx_memcpy(mdcounts, frame_mdcounts[ref], sizeof(mdcounts));
+ }
+
+ if (mbmi->second_ref_frame) {
+ int ref = mbmi->second_ref_frame;
+
+ xd->second_pre.y_buffer = y_buffer[ref];
+ xd->second_pre.u_buffer = u_buffer[ref];
+ xd->second_pre.v_buffer = v_buffer[ref];
+ second_best_ref_mv = frame_best_ref_mv[ref];
+ }
+
+ // Experimental code. Special case for gf and arf zeromv modes.
+ // Increase zbin size to suppress noise
+ if (cpi->zbin_mode_boost_enabled) {
+ if (vp9_mode_order[mode_index].ref_frame == INTRA_FRAME)
+ cpi->zbin_mode_boost = 0;
+ else {
+ if (vp9_mode_order[mode_index].mode == ZEROMV) {
+ if (vp9_mode_order[mode_index].ref_frame != LAST_FRAME)
+ cpi->zbin_mode_boost = GF_ZEROMV_ZBIN_BOOST;
+ else
+ cpi->zbin_mode_boost = LF_ZEROMV_ZBIN_BOOST;
+ } else if (vp9_mode_order[mode_index].mode == SPLITMV)
+ cpi->zbin_mode_boost = 0;
+ else
+ cpi->zbin_mode_boost = MV_ZBIN_BOOST;
+ }
+
+ vp9_update_zbin_extra(cpi, x);
+ }
+
+ // Intra
+ if (!mbmi->ref_frame) {
+ switch (this_mode) {
+ default:
+ case DC_PRED:
+ case V_PRED:
+ case H_PRED:
+ case TM_PRED:
+ case D45_PRED:
+ case D135_PRED:
+ case D117_PRED:
+ case D153_PRED:
+ case D27_PRED:
+ case D63_PRED:
+ mbmi->ref_frame = INTRA_FRAME;
+ // FIXME compound intra prediction
+ vp9_build_intra_predictors_mby(&x->e_mbd);
+ macro_block_yrd(cpi, x, &rate_y, &distortion, &skippable, txfm_cache);
+ rate2 += rate_y;
+ distortion2 += distortion;
+ rate2 += x->mbmode_cost[xd->frame_type][mbmi->mode];
+ if (mbmi->txfm_size != TX_4X4) {
+ rate2 += uv_intra_rate_8x8;
+ rate_uv = uv_intra_rate_tokenonly_8x8;
+ distortion2 += uv_intra_distortion_8x8;
+ distortion_uv = uv_intra_distortion_8x8;
+ skippable = skippable && uv_intra_skippable_8x8;
+ } else {
+ rate2 += uv_intra_rate;
+ rate_uv = uv_intra_rate_tokenonly;
+ distortion2 += uv_intra_distortion;
+ distortion_uv = uv_intra_distortion;
+ skippable = skippable && uv_intra_skippable;
+ }
+ break;
+ case B_PRED: {
+ int64_t tmp_rd;
+
+ // Note the rate value returned here includes the cost of coding
+ // the BPRED mode : x->mbmode_cost[xd->frame_type][BPRED];
+ mbmi->txfm_size = TX_4X4;
+ tmp_rd = rd_pick_intra4x4mby_modes(cpi, x, &rate, &rate_y, &distortion, best_yrd,
+#if CONFIG_COMP_INTRA_PRED
+ 0,
+#endif
+ 0);
+ rate2 += rate;
+ distortion2 += distortion;
+
+ if (tmp_rd < best_yrd) {
+ rate2 += uv_intra_rate;
+ rate_uv = uv_intra_rate_tokenonly;
+ distortion2 += uv_intra_distortion;
+ distortion_uv = uv_intra_distortion;
+ } else {
+ this_rd = INT64_MAX;
+ disable_skip = 1;
+ }
+ }
+ break;
+ case I8X8_PRED: {
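+          // Evaluate 8x8 intra prediction with both 4x4 and 8x8 transforms;
+          // cost0/cost1 are the bit costs of signalling each transform size
+          // when the frame uses TX_MODE_SELECT.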
+ int cost0 = vp9_cost_bit(cm->prob_tx[0], 0);
+ int cost1 = vp9_cost_bit(cm->prob_tx[0], 1);
+ int64_t tmp_rd_4x4s, tmp_rd_8x8s;
+ int64_t tmp_rd_4x4, tmp_rd_8x8, tmp_rd;
+ int r4x4, tok4x4, d4x4, r8x8, tok8x8, d8x8;
+ mbmi->txfm_size = TX_4X4;
+ tmp_rd_4x4 = rd_pick_intra8x8mby_modes(cpi, x, &r4x4, &tok4x4,
+ &d4x4, best_yrd);
+ mode8x8[0][0] = xd->mode_info_context->bmi[0].as_mode.first;
+ mode8x8[0][1] = xd->mode_info_context->bmi[2].as_mode.first;
+ mode8x8[0][2] = xd->mode_info_context->bmi[8].as_mode.first;
+ mode8x8[0][3] = xd->mode_info_context->bmi[10].as_mode.first;
+#if CONFIG_COMP_INTRA_PRED
+ mode8x8[1][0] = xd->mode_info_context->bmi[0].as_mode.second;
+ mode8x8[1][1] = xd->mode_info_context->bmi[2].as_mode.second;
+ mode8x8[1][2] = xd->mode_info_context->bmi[8].as_mode.second;
+ mode8x8[1][3] = xd->mode_info_context->bmi[10].as_mode.second;
+#endif
+ mbmi->txfm_size = TX_8X8;
+ tmp_rd_8x8 = rd_pick_intra8x8mby_modes(cpi, x, &r8x8, &tok8x8,
+ &d8x8, best_yrd);
+ txfm_cache[ONLY_4X4] = tmp_rd_4x4;
+ txfm_cache[ALLOW_8X8] = tmp_rd_8x8;
+ txfm_cache[ALLOW_16X16] = tmp_rd_8x8;
+ tmp_rd_4x4s = tmp_rd_4x4 + RDCOST(x->rdmult, x->rddiv, cost0, 0);
+ tmp_rd_8x8s = tmp_rd_8x8 + RDCOST(x->rdmult, x->rddiv, cost1, 0);
+ txfm_cache[TX_MODE_SELECT] = tmp_rd_4x4s < tmp_rd_8x8s ? tmp_rd_4x4s : tmp_rd_8x8s;
+ if (cm->txfm_mode == TX_MODE_SELECT) {
+ if (tmp_rd_4x4s < tmp_rd_8x8s) {
+ rate = r4x4 + cost0;
+ rate_y = tok4x4 + cost0;
+ distortion = d4x4;
+ mbmi->txfm_size = TX_4X4;
+ tmp_rd = tmp_rd_4x4s;
+ } else {
+ rate = r8x8 + cost1;
+ rate_y = tok8x8 + cost1;
+ distortion = d8x8;
+ mbmi->txfm_size = TX_8X8;
+ tmp_rd = tmp_rd_8x8s;
+
+ mode8x8[0][0] = xd->mode_info_context->bmi[0].as_mode.first;
+ mode8x8[0][1] = xd->mode_info_context->bmi[2].as_mode.first;
+ mode8x8[0][2] = xd->mode_info_context->bmi[8].as_mode.first;
+ mode8x8[0][3] = xd->mode_info_context->bmi[10].as_mode.first;
+#if CONFIG_COMP_INTRA_PRED
+ mode8x8[1][0] = xd->mode_info_context->bmi[0].as_mode.second;
+ mode8x8[1][1] = xd->mode_info_context->bmi[2].as_mode.second;
+ mode8x8[1][2] = xd->mode_info_context->bmi[8].as_mode.second;
+ mode8x8[1][3] = xd->mode_info_context->bmi[10].as_mode.second;
+#endif
+ }
+ } else if (cm->txfm_mode == ONLY_4X4) {
+ rate = r4x4;
+ rate_y = tok4x4;
+ distortion = d4x4;
+ mbmi->txfm_size = TX_4X4;
+ tmp_rd = tmp_rd_4x4;
+ } else {
+ rate = r8x8;
+ rate_y = tok8x8;
+ distortion = d8x8;
+ mbmi->txfm_size = TX_8X8;
+ tmp_rd = tmp_rd_8x8;
+
+ mode8x8[0][0] = xd->mode_info_context->bmi[0].as_mode.first;
+ mode8x8[0][1] = xd->mode_info_context->bmi[2].as_mode.first;
+ mode8x8[0][2] = xd->mode_info_context->bmi[8].as_mode.first;
+ mode8x8[0][3] = xd->mode_info_context->bmi[10].as_mode.first;
+#if CONFIG_COMP_INTRA_PRED
+ mode8x8[1][0] = xd->mode_info_context->bmi[0].as_mode.second;
+ mode8x8[1][1] = xd->mode_info_context->bmi[2].as_mode.second;
+ mode8x8[1][2] = xd->mode_info_context->bmi[8].as_mode.second;
+ mode8x8[1][3] = xd->mode_info_context->bmi[10].as_mode.second;
+#endif
+ }
+
+ rate2 += rate;
+ distortion2 += distortion;
+
+          /* TODO: UV rate may be over-estimated here since there is a UV intra
+             mode coded in I8X8_PRED prediction */
+ if (tmp_rd < best_yrd) {
+ rate2 += uv_intra_rate;
+ rate_uv = uv_intra_rate_tokenonly;
+ distortion2 += uv_intra_distortion;
+ distortion_uv = uv_intra_distortion;
+ } else {
+ this_rd = INT64_MAX;
+ disable_skip = 1;
+ }
+ }
+ break;
+ }
+ }
+ // Split MV. The code is very different from the other inter modes so
+ // special case it.
+ else if (this_mode == SPLITMV) {
+ const int is_comp_pred = mbmi->second_ref_frame != 0;
+ int64_t tmp_rd, this_rd_thresh;
+ int_mv *second_ref = is_comp_pred ? &second_best_ref_mv : NULL;
+
+ this_rd_thresh =
+ (mbmi->ref_frame == LAST_FRAME) ?
+ cpi->rd_threshes[THR_NEWMV] : cpi->rd_threshes[THR_NEWA];
+ this_rd_thresh =
+ (mbmi->ref_frame == GOLDEN_FRAME) ?
+ cpi->rd_threshes[THR_NEWG] : this_rd_thresh;
+
+ tmp_rd = rd_pick_best_mbsegmentation(cpi, x, &best_ref_mv,
+ second_ref, best_yrd, mdcounts,
+ &rate, &rate_y, &distortion,
+ &skippable,
+ this_rd_thresh, seg_mvs,
+ txfm_cache);
+ rate2 += rate;
+ distortion2 += distortion;
+
+ if (cpi->common.mcomp_filter_type == SWITCHABLE)
+ rate2 += SWITCHABLE_INTERP_RATE_FACTOR * x->switchable_interp_costs
+ [vp9_get_pred_context(&cpi->common, xd, PRED_SWITCHABLE_INTERP)]
+ [vp9_switchable_interp_map[mbmi->interp_filter]];
+ // If even the 'Y' rd value of split is higher than best so far
+      // then don't bother looking at UV
+ if (tmp_rd < best_yrd) {
+ int uv_skippable;
+
+ rd_inter4x4_uv(cpi, x, &rate_uv, &distortion_uv, &uv_skippable,
+ cpi->common.full_pixel);
+ rate2 += rate_uv;
+ distortion2 += distortion_uv;
+ skippable = skippable && uv_skippable;
+ } else {
+ this_rd = INT64_MAX;
+ disable_skip = 1;
+ }
+
+ if (is_comp_pred)
+ mode_excluded = cpi->common.comp_pred_mode == SINGLE_PREDICTION_ONLY;
+ else
+ mode_excluded = cpi->common.comp_pred_mode == COMP_PREDICTION_ONLY;
+
+ compmode_cost =
+ vp9_cost_bit(vp9_get_pred_prob(cm, xd, PRED_COMP), is_comp_pred);
+ mbmi->mode = this_mode;
+ }
+ else {
+ this_rd = handle_inter_mode(cpi, x, BLOCK_16X16,
+ &saddone, near_sadidx, mdcounts, txfm_cache,
+ &rate2, &distortion2, &skippable,
+ &compmode_cost, &rate_y, &distortion,
+ &rate_uv, &distortion_uv,
+ &mode_excluded, &disable_skip, recon_yoffset,
+ mode_index, frame_mv, frame_best_ref_mv);
+ if (this_rd == INT64_MAX)
+ continue;
+ }
+
+ if (cpi->common.comp_pred_mode == HYBRID_PREDICTION)
+ rate2 += compmode_cost;
+
+ // Estimate the reference frame signaling cost and add it
+ // to the rolling cost variable.
+ rate2 += ref_costs[mbmi->ref_frame];
+
+ if (!disable_skip) {
+ // Test for the condition where skip block will be activated
+      // because there are no non-zero coefficients and make any
+      // necessary adjustment for rate. Ignore if skip is coded at
+      // segment level as the cost won't have been added in.
+ if (cpi->common.mb_no_coeff_skip) {
+ int mb_skip_allowed;
+
+        // Is MB level skip allowed for this MB?
+ mb_skip_allowed =
+ !vp9_segfeature_active(xd, segment_id, SEG_LVL_EOB) ||
+ vp9_get_segdata(xd, segment_id, SEG_LVL_EOB);
+
+ if (skippable) {
+ mbmi->mb_skip_coeff = 1;
+
+ // Back out the coefficient coding costs
+ rate2 -= (rate_y + rate_uv);
+ // for best_yrd calculation
+ rate_uv = 0;
+
+ if (mb_skip_allowed) {
+ int prob_skip_cost;
+
+ // Cost the skip mb case
+ vp9_prob skip_prob =
+ vp9_get_pred_prob(cm, &x->e_mbd, PRED_MBSKIP);
+
+ if (skip_prob) {
+ prob_skip_cost = vp9_cost_bit(skip_prob, 1);
+ rate2 += prob_skip_cost;
+ other_cost += prob_skip_cost;
+ }
+ }
+ }
+ // Add in the cost of the no skip flag.
+ else {
+ mbmi->mb_skip_coeff = 0;
+ if (mb_skip_allowed) {
+ int prob_skip_cost = vp9_cost_bit(
+ vp9_get_pred_prob(cm, &x->e_mbd, PRED_MBSKIP), 0);
+ rate2 += prob_skip_cost;
+ other_cost += prob_skip_cost;
+ }
+ }
+ }
+
+ // Calculate the final RD estimate for this mode.
+ this_rd = RDCOST(x->rdmult, x->rddiv, rate2, distortion2);
+ }
+
+ // Keep record of best intra distortion
+ if ((mbmi->ref_frame == INTRA_FRAME) &&
+ (this_rd < best_intra_rd)) {
+ best_intra_rd = this_rd;
+ *returnintra = distortion2;
+ }
+
+ if (!disable_skip && mbmi->ref_frame == INTRA_FRAME)
+ for (i = 0; i < NB_PREDICTION_TYPES; ++i)
+ best_pred_rd[i] = MIN(best_pred_rd[i], this_rd);
+
+#if CONFIG_PRED_FILTER
+ // Keep track of the best mode irrespective of prediction filter state
+ if (this_rd < best_overall_rd) {
+ best_overall_rd = this_rd;
+ best_filter_state = mbmi->pred_filter_enabled;
+ }
+
+ // Ignore modes where the prediction filter state doesn't
+ // match the state signaled at the frame level
+ if ((cm->pred_filter_mode == 2) ||
+ (cm->pred_filter_mode ==
+ mbmi->pred_filter_enabled)) {
+#endif
+      // Did this mode help, i.e. is it the new best mode?
+ if (this_rd < best_rd || x->skip) {
+ if (!mode_excluded) {
+ // Note index of best mode so far
+ best_mode_index = mode_index;
+
+ if (this_mode <= B_PRED) {
+ if (mbmi->txfm_size != TX_4X4
+ && this_mode != B_PRED
+ && this_mode != I8X8_PRED)
+ mbmi->uv_mode = uv_intra_mode_8x8;
+ else
+ mbmi->uv_mode = uv_intra_mode;
+ /* required for left and above block mv */
+ mbmi->mv[0].as_int = 0;
+ }
+
+ other_cost += ref_costs[mbmi->ref_frame];
+
+ /* Calculate the final y RD estimate for this mode */
+ best_yrd = RDCOST(x->rdmult, x->rddiv, (rate2 - rate_uv - other_cost),
+ (distortion2 - distortion_uv));
+
+ *returnrate = rate2;
+ *returndistortion = distortion2;
+ best_rd = this_rd;
+ vpx_memcpy(&best_mbmode, mbmi, sizeof(MB_MODE_INFO));
+ vpx_memcpy(&best_partition, x->partition_info, sizeof(PARTITION_INFO));
+
+ if ((this_mode == B_PRED)
+ || (this_mode == I8X8_PRED)
+ || (this_mode == SPLITMV))
+ for (i = 0; i < 16; i++) {
+ best_bmodes[i] = xd->block[i].bmi;
+ }
+ }
+
+ // Testing this mode gave rise to an improvement in best error score.
+ // Lower threshold a bit for next time
+ cpi->rd_thresh_mult[mode_index] =
+ (cpi->rd_thresh_mult[mode_index] >= (MIN_THRESHMULT + 2)) ?
+ cpi->rd_thresh_mult[mode_index] - 2 : MIN_THRESHMULT;
+ cpi->rd_threshes[mode_index] =
+ (cpi->rd_baseline_thresh[mode_index] >> 7) *
+ cpi->rd_thresh_mult[mode_index];
+ }
+ // If the mode did not help improve the best error case then raise the
+ // threshold for testing that mode next time around.
+ else {
+ cpi->rd_thresh_mult[mode_index] += 4;
+
+ if (cpi->rd_thresh_mult[mode_index] > MAX_THRESHMULT)
+ cpi->rd_thresh_mult[mode_index] = MAX_THRESHMULT;
+
+      cpi->rd_threshes[mode_index] =
+          (cpi->rd_baseline_thresh[mode_index] >> 7) *
+          cpi->rd_thresh_mult[mode_index];
+ }
+
+ /* keep record of best compound/single-only prediction */
+ if (!disable_skip &&
+ mbmi->ref_frame != INTRA_FRAME) {
+ int64_t single_rd, hybrid_rd;
+ int single_rate, hybrid_rate;
+
+ if (cpi->common.comp_pred_mode == HYBRID_PREDICTION) {
+ single_rate = rate2 - compmode_cost;
+ hybrid_rate = rate2;
+ } else {
+ single_rate = rate2;
+ hybrid_rate = rate2 + compmode_cost;
+ }
+
+ single_rd = RDCOST(x->rdmult, x->rddiv, single_rate, distortion2);
+ hybrid_rd = RDCOST(x->rdmult, x->rddiv, hybrid_rate, distortion2);
+
+ if (mbmi->second_ref_frame == INTRA_FRAME &&
+ single_rd < best_pred_rd[SINGLE_PREDICTION_ONLY]) {
+ best_pred_rd[SINGLE_PREDICTION_ONLY] = single_rd;
+ } else if (mbmi->second_ref_frame != INTRA_FRAME &&
+ single_rd < best_pred_rd[COMP_PREDICTION_ONLY]) {
+ best_pred_rd[COMP_PREDICTION_ONLY] = single_rd;
+ }
+ if (hybrid_rd < best_pred_rd[HYBRID_PREDICTION])
+ best_pred_rd[HYBRID_PREDICTION] = hybrid_rd;
+ }
+
+ /* keep record of best txfm size */
+ if (!mode_excluded && this_rd != INT64_MAX) {
+ for (i = 0; i < NB_TXFM_MODES; i++) {
+ int64_t adj_rd;
+ if (this_mode != B_PRED) {
+ adj_rd = this_rd + txfm_cache[i] - txfm_cache[cm->txfm_mode];
+ } else {
+ adj_rd = this_rd;
+ }
+ if (adj_rd < best_txfm_rd[i])
+ best_txfm_rd[i] = adj_rd;
+ }
+ }
+#if CONFIG_PRED_FILTER
+ }
+#endif
+
+ if (x->skip && !mode_excluded)
+ break;
+ }
+
+#if CONFIG_PRED_FILTER
+ // Update counts for prediction filter usage
+ if (best_filter_state != 0)
+ ++cpi->pred_filter_on_count;
+ else
+ ++cpi->pred_filter_off_count;
+#endif
+ if (cpi->common.mcomp_filter_type == SWITCHABLE &&
+ best_mbmode.mode >= NEARESTMV &&
+ best_mbmode.mode <= SPLITMV) {
+ ++cpi->switchable_interp_count
+ [vp9_get_pred_context(&cpi->common, xd, PRED_SWITCHABLE_INTERP)]
+ [vp9_switchable_interp_map[best_mbmode.interp_filter]];
+ }
+
+ // Reduce the activation RD thresholds for the best choice mode
+ if ((cpi->rd_baseline_thresh[best_mode_index] > 0) &&
+ (cpi->rd_baseline_thresh[best_mode_index] < (INT_MAX >> 2))) {
+ int best_adjustment = (cpi->rd_thresh_mult[best_mode_index] >> 2);
+
+ cpi->rd_thresh_mult[best_mode_index] =
+ (cpi->rd_thresh_mult[best_mode_index] >=
+ (MIN_THRESHMULT + best_adjustment)) ?
+ cpi->rd_thresh_mult[best_mode_index] - best_adjustment : MIN_THRESHMULT;
+ cpi->rd_threshes[best_mode_index] =
+ (cpi->rd_baseline_thresh[best_mode_index] >> 7) *
+ cpi->rd_thresh_mult[best_mode_index];
+ }
+
+  // This code forces Altref,0,0 and skip for the frame that overlays
+  // an altref unless Altref is filtered. However, this is unsafe if
+ // segment level coding of ref frame or mode is enabled for this
+ // segment.
+ if (!vp9_segfeature_active(xd, segment_id, SEG_LVL_REF_FRAME) &&
+ !vp9_segfeature_active(xd, segment_id, SEG_LVL_MODE) &&
+ cpi->is_src_frame_alt_ref &&
+ (cpi->oxcf.arnr_max_frames == 0) &&
+ (best_mbmode.mode != ZEROMV || best_mbmode.ref_frame != ALTREF_FRAME)) {
+ mbmi->mode = ZEROMV;
+ if (cm->txfm_mode != TX_MODE_SELECT)
+ mbmi->txfm_size = cm->txfm_mode;
+ else
+ mbmi->txfm_size = TX_16X16;
+ mbmi->ref_frame = ALTREF_FRAME;
+ mbmi->mv[0].as_int = 0;
+ mbmi->uv_mode = DC_PRED;
+ mbmi->mb_skip_coeff =
+ (cpi->common.mb_no_coeff_skip) ? 1 : 0;
+ mbmi->partitioning = 0;
+
+ vpx_memset(best_pred_diff, 0, sizeof(best_pred_diff));
+ vpx_memset(best_txfm_diff, 0, sizeof(best_txfm_diff));
+ goto end;
+ }
+
+ // macroblock modes
+ vpx_memcpy(mbmi, &best_mbmode, sizeof(MB_MODE_INFO));
+ if (best_mbmode.mode == B_PRED) {
+ for (i = 0; i < 16; i++) {
+ xd->mode_info_context->bmi[i].as_mode = best_bmodes[i].as_mode;
+ xd->block[i].bmi.as_mode = xd->mode_info_context->bmi[i].as_mode;
+ }
+ }
+
+ if (best_mbmode.mode == I8X8_PRED)
+ set_i8x8_block_modes(x, mode8x8);
+
+ if (best_mbmode.mode == SPLITMV) {
+    for (i = 0; i < 16; i++)
+      xd->mode_info_context->bmi[i].as_mv.first.as_int =
+          best_bmodes[i].as_mv.first.as_int;
+    if (mbmi->second_ref_frame)
+      for (i = 0; i < 16; i++)
+        xd->mode_info_context->bmi[i].as_mv.second.as_int =
+            best_bmodes[i].as_mv.second.as_int;
+
+ vpx_memcpy(x->partition_info, &best_partition, sizeof(PARTITION_INFO));
+
+ mbmi->mv[0].as_int = x->partition_info->bmi[15].mv.as_int;
+ mbmi->mv[1].as_int = x->partition_info->bmi[15].second_mv.as_int;
+ }
+
+ for (i = 0; i < NB_PREDICTION_TYPES; ++i) {
+ if (best_pred_rd[i] == INT64_MAX)
+ best_pred_diff[i] = INT_MIN;
+ else
+ best_pred_diff[i] = best_rd - best_pred_rd[i];
+ }
+
+ if (!x->skip) {
+ for (i = 0; i < NB_TXFM_MODES; i++) {
+ if (best_txfm_rd[i] == INT64_MAX)
+ best_txfm_diff[i] = INT_MIN;
+ else
+ best_txfm_diff[i] = best_rd - best_txfm_rd[i];
+ }
+ } else {
+ vpx_memset(best_txfm_diff, 0, sizeof(best_txfm_diff));
+ }
+
+end:
+ store_coding_context(x, &x->mb_context[xd->mb_index], best_mode_index, &best_partition,
+ &frame_best_ref_mv[xd->mode_info_context->mbmi.ref_frame],
+ &frame_best_ref_mv[xd->mode_info_context->mbmi.second_ref_frame],
+ best_pred_diff[0], best_pred_diff[1], best_pred_diff[2],
+ best_txfm_diff);
+}
+
+#if CONFIG_SUPERBLOCKS
+void vp9_rd_pick_intra_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
+ int *returnrate,
+ int *returndist) {
+ VP9_COMMON *cm = &cpi->common;
+ MACROBLOCKD *xd = &x->e_mbd;
+ int rate_y, rate_uv;
+ int rate_y_tokenonly, rate_uv_tokenonly;
+ int error_y, error_uv;
+ int dist_y, dist_uv;
+ int y_skip, uv_skip;
+
+ xd->mode_info_context->mbmi.txfm_size = TX_8X8;
+
+ error_uv = rd_pick_intra_sbuv_mode(cpi, x, &rate_uv, &rate_uv_tokenonly,
+ &dist_uv, &uv_skip);
+ error_y = rd_pick_intra_sby_mode(cpi, x, &rate_y, &rate_y_tokenonly,
+ &dist_y, &y_skip);
+
+ if (cpi->common.mb_no_coeff_skip && y_skip && uv_skip) {
+ *returnrate = rate_y + rate_uv - rate_y_tokenonly - rate_uv_tokenonly +
+ vp9_cost_bit(vp9_get_pred_prob(cm, xd, PRED_MBSKIP), 1);
+ *returndist = dist_y + (dist_uv >> 2);
+ } else {
+ *returnrate = rate_y + rate_uv;
+ if (cpi->common.mb_no_coeff_skip)
+ *returnrate += vp9_cost_bit(vp9_get_pred_prob(cm, xd, PRED_MBSKIP), 0);
+ *returndist = dist_y + (dist_uv >> 2);
+ }
+}
+#endif
+
+void vp9_rd_pick_intra_mode(VP9_COMP *cpi, MACROBLOCK *x,
+ int *returnrate, int *returndist) {
+ VP9_COMMON *cm = &cpi->common;
+ MACROBLOCKD *xd = &x->e_mbd;
+ MB_MODE_INFO * mbmi = &x->e_mbd.mode_info_context->mbmi;
+ int64_t error4x4, error16x16;
+#if CONFIG_COMP_INTRA_PRED
+ int64_t error4x4d;
+ int rate4x4d, dist4x4d;
+#endif
+ int rate4x4, rate16x16 = 0, rateuv, rateuv8x8;
+ int dist4x4, dist16x16, distuv, distuv8x8;
+ int rate;
+ int rate4x4_tokenonly = 0;
+ int rate16x16_tokenonly = 0;
+ int rateuv_tokenonly = 0, rateuv8x8_tokenonly = 0;
+ int64_t error8x8;
+  int rate8x8_tokenonly = 0;
+ int rate8x8, dist8x8;
+ int mode16x16;
+ int mode8x8[2][4];
+ int dist;
+ int modeuv, modeuv8x8, uv_intra_skippable, uv_intra_skippable_8x8;
+ int y_intra16x16_skippable;
+ int64_t txfm_cache[NB_TXFM_MODES];
+ TX_SIZE txfm_size_16x16;
+ int i;
+
+ mbmi->ref_frame = INTRA_FRAME;
+ rd_pick_intra_mbuv_mode(cpi, x, &rateuv, &rateuv_tokenonly, &distuv,
+ &uv_intra_skippable);
+ modeuv = mbmi->uv_mode;
+ if (cpi->common.txfm_mode != ONLY_4X4) {
+ rd_pick_intra_mbuv_mode_8x8(cpi, x, &rateuv8x8, &rateuv8x8_tokenonly,
+ &distuv8x8, &uv_intra_skippable_8x8);
+ modeuv8x8 = mbmi->uv_mode;
+ } else {
+ uv_intra_skippable_8x8 = uv_intra_skippable;
+ rateuv8x8 = rateuv;
+ distuv8x8 = distuv;
+ rateuv8x8_tokenonly = rateuv_tokenonly;
+ modeuv8x8 = modeuv;
+ }
+
+ // current macroblock under rate-distortion optimization test loop
+ error16x16 = rd_pick_intra16x16mby_mode(cpi, x, &rate16x16,
+ &rate16x16_tokenonly, &dist16x16,
+ &y_intra16x16_skippable, txfm_cache);
+ mode16x16 = mbmi->mode;
+ txfm_size_16x16 = mbmi->txfm_size;
+
+ // FIXME(rbultje) support transform-size selection
+ mbmi->txfm_size = (cm->txfm_mode == ONLY_4X4) ? TX_4X4 : TX_8X8;
+ error8x8 = rd_pick_intra8x8mby_modes(cpi, x, &rate8x8, &rate8x8_tokenonly,
+ &dist8x8, error16x16);
+  mode8x8[0][0] = xd->mode_info_context->bmi[0].as_mode.first;
+  mode8x8[0][1] = xd->mode_info_context->bmi[2].as_mode.first;
+  mode8x8[0][2] = xd->mode_info_context->bmi[8].as_mode.first;
+  mode8x8[0][3] = xd->mode_info_context->bmi[10].as_mode.first;
+#if CONFIG_COMP_INTRA_PRED
+ mode8x8[1][0] = xd->mode_info_context->bmi[0].as_mode.second;
+ mode8x8[1][1] = xd->mode_info_context->bmi[2].as_mode.second;
+ mode8x8[1][2] = xd->mode_info_context->bmi[8].as_mode.second;
+ mode8x8[1][3] = xd->mode_info_context->bmi[10].as_mode.second;
+#endif
+
+ error4x4 = rd_pick_intra4x4mby_modes(cpi, x,
+ &rate4x4, &rate4x4_tokenonly,
+ &dist4x4, error16x16,
+#if CONFIG_COMP_INTRA_PRED
+ 0,
+#endif
+ 0);
+#if CONFIG_COMP_INTRA_PRED
+ error4x4d = rd_pick_intra4x4mby_modes(cpi, x,
+ &rate4x4d, &rate4x4_tokenonly,
+ &dist4x4d, error16x16, 1, 0);
+#endif
+
+ mbmi->mb_skip_coeff = 0;
+ if (cpi->common.mb_no_coeff_skip &&
+ y_intra16x16_skippable && uv_intra_skippable_8x8) {
+ mbmi->mb_skip_coeff = 1;
+ mbmi->mode = mode16x16;
+ mbmi->uv_mode = modeuv;
+ rate = rateuv8x8 + rate16x16 - rateuv8x8_tokenonly - rate16x16_tokenonly +
+ vp9_cost_bit(vp9_get_pred_prob(cm, xd, PRED_MBSKIP), 1);
+ dist = dist16x16 + (distuv8x8 >> 2);
+ mbmi->txfm_size = txfm_size_16x16;
+ memset(x->mb_context[xd->mb_index].txfm_rd_diff, 0,
+ sizeof(x->mb_context[xd->mb_index].txfm_rd_diff));
+ } else if (error8x8 > error16x16) {
+ if (error4x4 < error16x16) {
+ rate = rateuv;
+#if CONFIG_COMP_INTRA_PRED
+ rate += (error4x4d < error4x4) ? rate4x4d : rate4x4;
+ if (error4x4d >= error4x4) // FIXME save original modes etc.
+ error4x4 = rd_pick_intra4x4mby_modes(cpi, x, &rate4x4,
+ &rate4x4_tokenonly,
+ &dist4x4, error16x16, 0,
+ cpi->update_context);
+#else
+ rate += rate4x4;
+#endif
+ mbmi->mode = B_PRED;
+ mbmi->txfm_size = TX_4X4;
+ dist = dist4x4 + (distuv >> 2);
+ memset(x->mb_context[xd->mb_index].txfm_rd_diff, 0,
+ sizeof(x->mb_context[xd->mb_index].txfm_rd_diff));
+ } else {
+ mbmi->txfm_size = txfm_size_16x16;
+ mbmi->mode = mode16x16;
+ rate = rate16x16 + rateuv8x8;
+ dist = dist16x16 + (distuv8x8 >> 2);
+ for (i = 0; i < NB_TXFM_MODES; i++) {
+ x->mb_context[xd->mb_index].txfm_rd_diff[i] = error16x16 - txfm_cache[i];
+ }
+ }
+ if (cpi->common.mb_no_coeff_skip)
+ rate += vp9_cost_bit(vp9_get_pred_prob(cm, xd, PRED_MBSKIP), 0);
+ } else {
+ if (error4x4 < error8x8) {
+ rate = rateuv;
+#if CONFIG_COMP_INTRA_PRED
+ rate += (error4x4d < error4x4) ? rate4x4d : rate4x4;
+ if (error4x4d >= error4x4) // FIXME save original modes etc.
+ error4x4 = rd_pick_intra4x4mby_modes(cpi, x, &rate4x4,
+ &rate4x4_tokenonly,
+ &dist4x4, error16x16, 0,
+ cpi->update_context);
+#else
+ rate += rate4x4;
+#endif
+ mbmi->mode = B_PRED;
+ mbmi->txfm_size = TX_4X4;
+ dist = dist4x4 + (distuv >> 2);
+ memset(x->mb_context[xd->mb_index].txfm_rd_diff, 0,
+ sizeof(x->mb_context[xd->mb_index].txfm_rd_diff));
+ } else {
+ // FIXME(rbultje) support transform-size selection
+ mbmi->mode = I8X8_PRED;
+ mbmi->txfm_size = (cm->txfm_mode == ONLY_4X4) ? TX_4X4 : TX_8X8;
+ set_i8x8_block_modes(x, mode8x8);
+ rate = rate8x8 + rateuv;
+ dist = dist8x8 + (distuv >> 2);
+ memset(x->mb_context[xd->mb_index].txfm_rd_diff, 0,
+ sizeof(x->mb_context[xd->mb_index].txfm_rd_diff));
+ }
+ if (cpi->common.mb_no_coeff_skip)
+ rate += vp9_cost_bit(vp9_get_pred_prob(cm, xd, PRED_MBSKIP), 0);
+ }
+
+ *returnrate = rate;
+ *returndist = dist;
+}
+
+#if CONFIG_SUPERBLOCKS
+int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
+ int recon_yoffset, int recon_uvoffset,
+ int *returnrate, int *returndistortion) {
+ VP9_COMMON *cm = &cpi->common;
+ MACROBLOCKD *xd = &x->e_mbd;
+ MB_MODE_INFO *mbmi = &xd->mode_info_context->mbmi;
+ MB_PREDICTION_MODE this_mode;
+ MV_REFERENCE_FRAME ref_frame;
+ unsigned char segment_id = xd->mode_info_context->mbmi.segment_id;
+ int comp_pred;
+ int_mv frame_mv[MB_MODE_COUNT][MAX_REF_FRAMES];
+ int_mv frame_best_ref_mv[4];
+ int frame_mdcounts[4][4];
+ unsigned char *y_buffer[4];
+ unsigned char *u_buffer[4];
+ unsigned char *v_buffer[4];
+ static const int flag_list[4] = { 0, VP9_LAST_FLAG, VP9_GOLD_FLAG,
+ VP9_ALT_FLAG };
+ int idx_list[4] = { 0, cpi->common.lst_fb_idx, cpi->common.gld_fb_idx,
+ cpi->common.alt_fb_idx };
+ int mdcounts[4];
+ int near_sadidx[8] = { 0, 1, 2, 3, 4, 5, 6, 7 };
+ int saddone = 0;
+ int64_t best_rd = INT64_MAX;
+ int64_t best_comp_rd = INT64_MAX;
+ int64_t best_single_rd = INT64_MAX;
+ int64_t best_hybrid_rd = INT64_MAX;
+ int64_t best_yrd = INT64_MAX;
+ MB_MODE_INFO best_mbmode;
+ int mode_index, best_mode_index;
+ unsigned int ref_costs[MAX_REF_FRAMES];
+
+ x->skip = 0;
+ xd->mode_info_context->mbmi.segment_id = segment_id;
+ estimate_ref_frame_costs(cpi, segment_id, ref_costs);
+ vpx_memset(&best_mbmode, 0, sizeof(best_mbmode));
+
+ for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ref_frame++) {
+ if (cpi->ref_frame_flags & flag_list[ref_frame]) {
+ setup_buffer_inter(cpi, x, idx_list[ref_frame], ref_frame,
+ recon_yoffset, recon_uvoffset, frame_mv[NEARESTMV],
+ frame_mv[NEARMV], frame_best_ref_mv,
+ frame_mdcounts, y_buffer, u_buffer, v_buffer);
+ }
+ frame_mv[NEWMV][ref_frame].as_int = INVALID_MV;
+ frame_mv[ZEROMV][ref_frame].as_int = 0;
+ }
+
+ for (mode_index = 0; mode_index < MAX_MODES; mode_index++) {
+ int mode_excluded;
+ int64_t this_rd = INT64_MAX;
+ int disable_skip = 0;
+ int other_cost = 0;
+ int compmode_cost = 0;
+ int rate2 = 0, rate_y = 0, rate_uv = 0;
+ int distortion2 = 0, distortion_y = 0, distortion_uv = 0;
+ int skippable;
+ int64_t txfm_cache[NB_TXFM_MODES];
+
+ // Test best rd so far against threshold for trying this mode.
+ if (best_rd <= cpi->rd_threshes[mode_index]) {
+ continue;
+ }
+
+ this_mode = vp9_mode_order[mode_index].mode;
+ ref_frame = vp9_mode_order[mode_index].ref_frame;
+ mbmi->ref_frame = ref_frame;
+ comp_pred = vp9_mode_order[mode_index].second_ref_frame != INTRA_FRAME;
+ mbmi->mode = this_mode;
+ mbmi->uv_mode = DC_PRED;
+#if CONFIG_COMP_INTRA_PRED
+ mbmi->second_mode = (MB_PREDICTION_MODE)(DC_PRED - 1);
+ mbmi->second_uv_mode = (MB_PREDICTION_MODE)(DC_PRED - 1);
+#endif
+
+ if (!(cpi->ref_frame_flags & flag_list[ref_frame]))
+ continue;
+
+ // not yet supported or not superblocky
+ // TODO(rbultje): support intra coding
+ if (ref_frame == INTRA_FRAME || this_mode == SPLITMV)
+ continue;
+
+ if (comp_pred) {
+ int second_ref;
+
+ if (ref_frame == ALTREF_FRAME) {
+ second_ref = LAST_FRAME;
+ } else {
+ second_ref = ref_frame + 1;
+ }
+ if (!(cpi->ref_frame_flags & flag_list[second_ref]))
+ continue;
+ mbmi->second_ref_frame = second_ref;
+
+ xd->second_pre.y_buffer = y_buffer[second_ref];
+ xd->second_pre.u_buffer = u_buffer[second_ref];
+ xd->second_pre.v_buffer = v_buffer[second_ref];
+ mode_excluded = cm->comp_pred_mode == SINGLE_PREDICTION_ONLY;
+ } else {
+ mbmi->second_ref_frame = INTRA_FRAME;
+ mode_excluded = cm->comp_pred_mode == COMP_PREDICTION_ONLY;
+ }
+
+ xd->pre.y_buffer = y_buffer[ref_frame];
+ xd->pre.u_buffer = u_buffer[ref_frame];
+ xd->pre.v_buffer = v_buffer[ref_frame];
+ vpx_memcpy(mdcounts, frame_mdcounts[ref_frame], sizeof(mdcounts));
+
+ // If the segment reference frame feature is enabled....
+ // then do nothing if the current ref frame is not allowed..
+ if (vp9_segfeature_active(xd, segment_id, SEG_LVL_REF_FRAME) &&
+ !vp9_check_segref(xd, segment_id, ref_frame)) {
+ continue;
+ // If the segment mode feature is enabled....
+ // then do nothing if the current mode is not allowed..
+ } else if (vp9_segfeature_active(xd, segment_id, SEG_LVL_MODE) &&
+ (this_mode != vp9_get_segdata(xd, segment_id, SEG_LVL_MODE))) {
+ continue;
+ // Disable this drop out case if either the mode or ref frame
+ // segment level feature is enabled for this segment. This is to
+ // prevent the possibility that we end up unable to pick any mode.
+ } else if (!vp9_segfeature_active(xd, segment_id, SEG_LVL_REF_FRAME) &&
+ !vp9_segfeature_active(xd, segment_id, SEG_LVL_MODE)) {
+ // Only consider ZEROMV/ALTREF_FRAME for alt ref frame,
+ // unless ARNR filtering is enabled in which case we want
+ // an unfiltered alternative
+ if (cpi->is_src_frame_alt_ref && (cpi->oxcf.arnr_max_frames == 0)) {
+ if (this_mode != ZEROMV || ref_frame != ALTREF_FRAME) {
+ continue;
+ }
+ }
+ }
+
+ this_rd = handle_inter_mode(cpi, x, BLOCK_32X32,
+ &saddone, near_sadidx, mdcounts, txfm_cache,
+ &rate2, &distortion2, &skippable,
+ &compmode_cost, &rate_y, &distortion_y,
+ &rate_uv, &distortion_uv,
+ &mode_excluded, &disable_skip, recon_yoffset,
+ mode_index, frame_mv, frame_best_ref_mv);
+ if (this_rd == INT64_MAX)
+ continue;
+
+ if (cpi->common.comp_pred_mode == HYBRID_PREDICTION) {
+ rate2 += compmode_cost;
+ }
+
+ // Estimate the reference frame signaling cost and add it
+ // to the rolling cost variable.
+ rate2 += ref_costs[xd->mode_info_context->mbmi.ref_frame];
+
+ if (!disable_skip) {
+ // Test for the condition where skip block will be activated
+      // because there are no non-zero coefficients and make any
+      // necessary adjustment for rate. Ignore if skip is coded at
+      // segment level as the cost won't have been added in.
+ if (cpi->common.mb_no_coeff_skip) {
+ int mb_skip_allowed;
+
+        // Is MB level skip allowed for this MB?
+ mb_skip_allowed =
+ !vp9_segfeature_active(xd, segment_id, SEG_LVL_EOB) ||
+ vp9_get_segdata(xd, segment_id, SEG_LVL_EOB);
+
+ if (skippable) {
+ // Back out the coefficient coding costs
+ rate2 -= (rate_y + rate_uv);
+ // for best_yrd calculation
+ rate_uv = 0;
+
+ if (mb_skip_allowed) {
+ int prob_skip_cost;
+
+ // Cost the skip mb case
+ vp9_prob skip_prob =
+ vp9_get_pred_prob(cm, xd, PRED_MBSKIP);
+
+ if (skip_prob) {
+ prob_skip_cost = vp9_cost_bit(skip_prob, 1);
+ rate2 += prob_skip_cost;
+ other_cost += prob_skip_cost;
+ }
+ }
+ }
+ // Add in the cost of the no skip flag.
+ else if (mb_skip_allowed) {
+ int prob_skip_cost = vp9_cost_bit(vp9_get_pred_prob(cm, xd,
+ PRED_MBSKIP), 0);
+ rate2 += prob_skip_cost;
+ other_cost += prob_skip_cost;
+ }
+ }
+
+ // Calculate the final RD estimate for this mode.
+ this_rd = RDCOST(x->rdmult, x->rddiv, rate2, distortion2);
+ }
+
+#if 0
+ // Keep record of best intra distortion
+ if ((xd->mode_info_context->mbmi.ref_frame == INTRA_FRAME) &&
+ (this_rd < best_intra_rd)) {
+ best_intra_rd = this_rd;
+ *returnintra = distortion2;
+ }
+#endif
+
+ if (!disable_skip && xd->mode_info_context->mbmi.ref_frame == INTRA_FRAME) {
+ if (this_rd < best_comp_rd)
+ best_comp_rd = this_rd;
+ if (this_rd < best_single_rd)
+ best_single_rd = this_rd;
+ if (this_rd < best_hybrid_rd)
+ best_hybrid_rd = this_rd;
+ }
+
+    // Did this mode help, i.e. is it the new best mode?
+ if (this_rd < best_rd || x->skip) {
+ if (!mode_excluded) {
+ // Note index of best mode so far
+ best_mode_index = mode_index;
+
+#if 0
+ if (this_mode <= B_PRED) {
+ xd->mode_info_context->mbmi.uv_mode = uv_intra_mode_8x8;
+ /* required for left and above block mv */
+ xd->mode_info_context->mbmi.mv.as_int = 0;
+ }
+#endif
+
+ other_cost += ref_costs[xd->mode_info_context->mbmi.ref_frame];
+
+ /* Calculate the final y RD estimate for this mode */
+ best_yrd = RDCOST(x->rdmult, x->rddiv, (rate2 - rate_uv - other_cost),
+ (distortion2 - distortion_uv));
+
+ *returnrate = rate2;
+ *returndistortion = distortion2;
+ best_rd = this_rd;
+ vpx_memcpy(&best_mbmode, mbmi, sizeof(MB_MODE_INFO));
+ }
+#if 0
+ // Testing this mode gave rise to an improvement in best error score. Lower threshold a bit for next time
+ cpi->rd_thresh_mult[mode_index] = (cpi->rd_thresh_mult[mode_index] >= (MIN_THRESHMULT + 2)) ? cpi->rd_thresh_mult[mode_index] - 2 : MIN_THRESHMULT;
+ cpi->rd_threshes[mode_index] = (cpi->rd_baseline_thresh[mode_index] >> 7) * cpi->rd_thresh_mult[mode_index];
+#endif
+ }
+    // If the mode did not help improve the best error case then raise the
+    // threshold for testing that mode next time around.
+ else {
+#if 0
+ cpi->rd_thresh_mult[mode_index] += 4;
+
+ if (cpi->rd_thresh_mult[mode_index] > MAX_THRESHMULT)
+ cpi->rd_thresh_mult[mode_index] = MAX_THRESHMULT;
+
+ cpi->rd_threshes[mode_index] = (cpi->rd_baseline_thresh[mode_index] >> 7) * cpi->rd_thresh_mult[mode_index];
+#endif
+ }
+
+ /* keep record of best compound/single-only prediction */
+ if (!disable_skip && mbmi->ref_frame != INTRA_FRAME) {
+ int single_rd, hybrid_rd, single_rate, hybrid_rate;
+
+ if (cpi->common.comp_pred_mode == HYBRID_PREDICTION) {
+ single_rate = rate2 - compmode_cost;
+ hybrid_rate = rate2;
+ } else {
+ single_rate = rate2;
+ hybrid_rate = rate2 + compmode_cost;
+ }
+
+ single_rd = RDCOST(x->rdmult, x->rddiv, single_rate, distortion2);
+ hybrid_rd = RDCOST(x->rdmult, x->rddiv, hybrid_rate, distortion2);
+
+ if (mbmi->second_ref_frame == INTRA_FRAME && single_rd < best_single_rd) {
+ best_single_rd = single_rd;
+ } else if (mbmi->second_ref_frame != INTRA_FRAME &&
+ single_rd < best_comp_rd) {
+ best_comp_rd = single_rd;
+ }
+ if (hybrid_rd < best_hybrid_rd) {
+ best_hybrid_rd = hybrid_rd;
+ }
+ }
+
+ if (x->skip && !mode_excluded)
+ break;
+ }
+
+ // TODO(rbultje) integrate with RD thresholding
+#if 0
+ // Reduce the activation RD thresholds for the best choice mode
+ if ((cpi->rd_baseline_thresh[best_mode_index] > 0) &&
+ (cpi->rd_baseline_thresh[best_mode_index] < (INT_MAX >> 2))) {
+ int best_adjustment = (cpi->rd_thresh_mult[best_mode_index] >> 2);
+
+ cpi->rd_thresh_mult[best_mode_index] =
+ (cpi->rd_thresh_mult[best_mode_index] >= (MIN_THRESHMULT + best_adjustment)) ?
+ cpi->rd_thresh_mult[best_mode_index] - best_adjustment : MIN_THRESHMULT;
+ cpi->rd_threshes[best_mode_index] =
+ (cpi->rd_baseline_thresh[best_mode_index] >> 7) * cpi->rd_thresh_mult[best_mode_index];
+ }
+#endif
+
+  // This code forces Altref,0,0 and skip for the frame that overlays
+  // an altref unless Altref is filtered. However, this is unsafe if
+ // segment level coding of ref frame or mode is enabled for this
+ // segment.
+ if (!vp9_segfeature_active(xd, segment_id, SEG_LVL_REF_FRAME) &&
+ !vp9_segfeature_active(xd, segment_id, SEG_LVL_MODE) &&
+ cpi->is_src_frame_alt_ref &&
+ (cpi->oxcf.arnr_max_frames == 0) &&
+ (best_mbmode.mode != ZEROMV || best_mbmode.ref_frame != ALTREF_FRAME)) {
+ mbmi->mode = ZEROMV;
+ mbmi->ref_frame = ALTREF_FRAME;
+ mbmi->second_ref_frame = 0;
+ mbmi->mv[0].as_int = 0;
+ mbmi->uv_mode = DC_PRED;
+ mbmi->mb_skip_coeff = (cpi->common.mb_no_coeff_skip) ? 1 : 0;
+ mbmi->partitioning = 0;
+ mbmi->txfm_size = TX_8X8;
+
+ if (best_rd != INT64_MAX)
+ store_coding_context(x, &x->sb_context[0], best_mode_index, NULL,
+ &frame_best_ref_mv[mbmi->ref_frame],
+ &frame_best_ref_mv[mbmi->second_ref_frame],
+ 0, 0, 0, NULL);
+ return best_rd;
+ }
+
+ // macroblock modes
+ vpx_memcpy(mbmi, &best_mbmode, sizeof(MB_MODE_INFO));
+ mbmi->txfm_size = TX_8X8;
+
+ if (best_rd != INT64_MAX)
+ store_coding_context(x, &x->sb_context[0], best_mode_index, NULL,
+ &frame_best_ref_mv[mbmi->ref_frame],
+ &frame_best_ref_mv[mbmi->second_ref_frame],
+ (best_single_rd == INT64_MAX) ? INT_MIN :
+ (best_rd - best_single_rd),
+ (best_comp_rd == INT64_MAX) ? INT_MIN :
+ (best_rd - best_comp_rd),
+ (best_hybrid_rd == INT64_MAX) ? INT_MIN :
+ (best_rd - best_hybrid_rd),
+ NULL);
+
+ return best_rd;
+}
+#endif
+
+void vp9_pick_mode_inter_macroblock(VP9_COMP *cpi, MACROBLOCK *x,
+ int recon_yoffset,
+ int recon_uvoffset,
+ int *totalrate, int *totaldist) {
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO * mbmi = &x->e_mbd.mode_info_context->mbmi;
+ int rate, distortion;
+ int64_t intra_error = 0;
+ unsigned char *segment_id = &mbmi->segment_id;
+
+ if (xd->segmentation_enabled)
+ x->encode_breakout = cpi->segment_encode_breakout[*segment_id];
+ else
+ x->encode_breakout = cpi->oxcf.encode_breakout;
+
+ // if (cpi->sf.RD)
+ // For now this codebase is limited to a single rd encode path
+ {
+ int zbin_mode_boost_enabled = cpi->zbin_mode_boost_enabled;
+
+ vp9_rd_pick_inter_mode(cpi, x, recon_yoffset, recon_uvoffset, &rate,
+ &distortion, &intra_error);
+
+ /* restore cpi->zbin_mode_boost_enabled */
+ cpi->zbin_mode_boost_enabled = zbin_mode_boost_enabled;
+ }
+ // else
+ // The non rd encode path has been deleted from this code base
+ // to simplify development
+ // vp9_pick_inter_mode
+
+  // Store metrics so they can be added into the totals if this mode is picked
+ x->mb_context[xd->mb_index].distortion = distortion;
+ x->mb_context[xd->mb_index].intra_error = intra_error;
+
+ *totalrate = rate;
+ *totaldist = distortion;
+}
--- /dev/null
+++ b/vp9/encoder/rdopt.h
@@ -1,0 +1,41 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef __INC_RDOPT_H
+#define __INC_RDOPT_H
+
+#define RDCOST(RM,DM,R,D) ( ((128+((int64_t)R)*(RM)) >> 8) + ((int64_t)DM)*(D) )
+#define RDCOST_8x8(RM,DM,R,D) ( ((128+((int64_t)R)*(RM)) >> 8) + ((int64_t)DM)*(D) )
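+// RDCOST combines a rate term (R weighted by the rate multiplier RM, with
+// rounding and a >>8 normalization) and a distortion term (D weighted by DM)
+// into a single 64-bit cost.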
+
+extern void vp9_initialize_rd_consts(VP9_COMP *cpi, int Qvalue);
+
+extern void vp9_rd_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
+ int recon_yoffset, int recon_uvoffset,
+ int *returnrate, int *returndistortion,
+ int64_t *returnintra);
+
+extern void vp9_rd_pick_intra_mode(VP9_COMP *cpi, MACROBLOCK *x,
+ int *r, int *d);
+
+extern void vp9_rd_pick_intra_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
+ int *r, int *d);
+
+extern void vp9_mv_pred(VP9_COMP *cpi, MACROBLOCKD *xd,
+ const MODE_INFO *here, int_mv *mvp,
+ int refframe, int *ref_frame_sign_bias,
+ int *sr, int near_sadidx[]);
+
+extern void vp9_init_me_luts(void);
+
+extern void vp9_set_mbmode_and_mvs(MACROBLOCK *x,
+ MB_PREDICTION_MODE mb, int_mv *mv);
+
+#endif
--- /dev/null
+++ b/vp9/encoder/sad_c.c
@@ -1,0 +1,480 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include <stdlib.h>
+#include "vp9/common/sadmxn.h"
+#include "vpx_ports/config.h"
+#include "vpx/vpx_integer.h"
+
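+// Plain C reference implementations of the sum-of-absolute-differences (SAD)
+// functions used by the motion search. The max_sad argument is kept for
+// interface compatibility with the optimized versions but is not used here.
+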
+unsigned int vp9_sad32x32_c(const unsigned char *src_ptr,
+ int src_stride,
+ const unsigned char *ref_ptr,
+ int ref_stride,
+ int max_sad) {
+ return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, 32, 32);
+}
+
+unsigned int vp9_sad16x16_c(const unsigned char *src_ptr,
+ int src_stride,
+ const unsigned char *ref_ptr,
+ int ref_stride,
+ int max_sad) {
+ return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, 16, 16);
+}
+
+unsigned int vp9_sad8x8_c(const unsigned char *src_ptr,
+ int src_stride,
+ const unsigned char *ref_ptr,
+ int ref_stride,
+ int max_sad) {
+ return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, 8, 8);
+}
+
+
+unsigned int vp9_sad16x8_c(const unsigned char *src_ptr,
+ int src_stride,
+ const unsigned char *ref_ptr,
+ int ref_stride,
+ int max_sad) {
+ return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, 16, 8);
+}
+
+unsigned int vp9_sad8x16_c(const unsigned char *src_ptr,
+ int src_stride,
+ const unsigned char *ref_ptr,
+ int ref_stride,
+ int max_sad) {
+ return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, 8, 16);
+}
+
+
+unsigned int vp9_sad4x4_c(const unsigned char *src_ptr,
+ int src_stride,
+ const unsigned char *ref_ptr,
+ int ref_stride,
+ int max_sad) {
+ return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, 4, 4);
+}
+
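+// The x3/x8 variants compute SAD against 3 or 8 horizontally consecutive
+// reference positions (ref_ptr + 0 .. ref_ptr + 2 or + 7), as used by the
+// multi-candidate steps of the motion search.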
+void vp9_sad32x32x3_c(const unsigned char *src_ptr,
+ int src_stride,
+ const unsigned char *ref_ptr,
+ int ref_stride,
+                      unsigned int *sad_array) {
+ sad_array[0] = vp9_sad32x32_c(src_ptr, src_stride,
+ ref_ptr, ref_stride, 0x7fffffff);
+ sad_array[1] = vp9_sad32x32_c(src_ptr, src_stride,
+ ref_ptr + 1, ref_stride, 0x7fffffff);
+ sad_array[2] = vp9_sad32x32_c(src_ptr, src_stride,
+ ref_ptr + 2, ref_stride, 0x7fffffff);
+}
+
+void vp9_sad32x32x8_c(const unsigned char *src_ptr,
+ int src_stride,
+ const unsigned char *ref_ptr,
+ int ref_stride,
+                      unsigned short *sad_array) {
+ sad_array[0] = (unsigned short)vp9_sad32x32_c(src_ptr, src_stride,
+ ref_ptr, ref_stride,
+ 0x7fffffff);
+ sad_array[1] = (unsigned short)vp9_sad32x32_c(src_ptr, src_stride,
+ ref_ptr + 1, ref_stride,
+ 0x7fffffff);
+ sad_array[2] = (unsigned short)vp9_sad32x32_c(src_ptr, src_stride,
+ ref_ptr + 2, ref_stride,
+ 0x7fffffff);
+ sad_array[3] = (unsigned short)vp9_sad32x32_c(src_ptr, src_stride,
+ ref_ptr + 3, ref_stride,
+ 0x7fffffff);
+ sad_array[4] = (unsigned short)vp9_sad32x32_c(src_ptr, src_stride,
+ ref_ptr + 4, ref_stride,
+ 0x7fffffff);
+ sad_array[5] = (unsigned short)vp9_sad32x32_c(src_ptr, src_stride,
+ ref_ptr + 5, ref_stride,
+ 0x7fffffff);
+ sad_array[6] = (unsigned short)vp9_sad32x32_c(src_ptr, src_stride,
+ ref_ptr + 6, ref_stride,
+ 0x7fffffff);
+ sad_array[7] = (unsigned short)vp9_sad32x32_c(src_ptr, src_stride,
+ ref_ptr + 7, ref_stride,
+ 0x7fffffff);
+}
+
+void vp9_sad16x16x3_c(const unsigned char *src_ptr,
+ int src_stride,
+ const unsigned char *ref_ptr,
+ int ref_stride,
+ unsigned int *sad_array) {
+ sad_array[0] = vp9_sad16x16_c(src_ptr, src_stride,
+ ref_ptr, ref_stride, 0x7fffffff);
+ sad_array[1] = vp9_sad16x16_c(src_ptr, src_stride,
+ ref_ptr + 1, ref_stride, 0x7fffffff);
+ sad_array[2] = vp9_sad16x16_c(src_ptr, src_stride,
+ ref_ptr + 2, ref_stride, 0x7fffffff);
+}
+
+void vp9_sad16x16x8_c(const unsigned char *src_ptr,
+ int src_stride,
+ const unsigned char *ref_ptr,
+ int ref_stride,
+ unsigned short *sad_array) {
+ sad_array[0] = (unsigned short)vp9_sad16x16_c(src_ptr, src_stride,
+ ref_ptr, ref_stride,
+ 0x7fffffff);
+ sad_array[1] = (unsigned short)vp9_sad16x16_c(src_ptr, src_stride,
+ ref_ptr + 1, ref_stride,
+ 0x7fffffff);
+ sad_array[2] = (unsigned short)vp9_sad16x16_c(src_ptr, src_stride,
+ ref_ptr + 2, ref_stride,
+ 0x7fffffff);
+ sad_array[3] = (unsigned short)vp9_sad16x16_c(src_ptr, src_stride,
+ ref_ptr + 3, ref_stride,
+ 0x7fffffff);
+ sad_array[4] = (unsigned short)vp9_sad16x16_c(src_ptr, src_stride,
+ ref_ptr + 4, ref_stride,
+ 0x7fffffff);
+ sad_array[5] = (unsigned short)vp9_sad16x16_c(src_ptr, src_stride,
+ ref_ptr + 5, ref_stride,
+ 0x7fffffff);
+ sad_array[6] = (unsigned short)vp9_sad16x16_c(src_ptr, src_stride,
+ ref_ptr + 6, ref_stride,
+ 0x7fffffff);
+ sad_array[7] = (unsigned short)vp9_sad16x16_c(src_ptr, src_stride,
+ ref_ptr + 7, ref_stride,
+ 0x7fffffff);
+}
+
+void vp9_sad16x8x3_c(const unsigned char *src_ptr,
+ int src_stride,
+ const unsigned char *ref_ptr,
+ int ref_stride,
+ unsigned int *sad_array) {
+ sad_array[0] = vp9_sad16x8_c(src_ptr, src_stride,
+ ref_ptr, ref_stride, 0x7fffffff);
+ sad_array[1] = vp9_sad16x8_c(src_ptr, src_stride,
+ ref_ptr + 1, ref_stride, 0x7fffffff);
+ sad_array[2] = vp9_sad16x8_c(src_ptr, src_stride,
+ ref_ptr + 2, ref_stride, 0x7fffffff);
+}
+
+void vp9_sad16x8x8_c(const unsigned char *src_ptr,
+ int src_stride,
+ const unsigned char *ref_ptr,
+ int ref_stride,
+ unsigned short *sad_array) {
+ sad_array[0] = (unsigned short)vp9_sad16x8_c(src_ptr, src_stride,
+ ref_ptr, ref_stride,
+ 0x7fffffff);
+ sad_array[1] = (unsigned short)vp9_sad16x8_c(src_ptr, src_stride,
+ ref_ptr + 1, ref_stride,
+ 0x7fffffff);
+ sad_array[2] = (unsigned short)vp9_sad16x8_c(src_ptr, src_stride,
+ ref_ptr + 2, ref_stride,
+ 0x7fffffff);
+ sad_array[3] = (unsigned short)vp9_sad16x8_c(src_ptr, src_stride,
+ ref_ptr + 3, ref_stride,
+ 0x7fffffff);
+ sad_array[4] = (unsigned short)vp9_sad16x8_c(src_ptr, src_stride,
+ ref_ptr + 4, ref_stride,
+ 0x7fffffff);
+ sad_array[5] = (unsigned short)vp9_sad16x8_c(src_ptr, src_stride,
+ ref_ptr + 5, ref_stride,
+ 0x7fffffff);
+ sad_array[6] = (unsigned short)vp9_sad16x8_c(src_ptr, src_stride,
+ ref_ptr + 6, ref_stride,
+ 0x7fffffff);
+ sad_array[7] = (unsigned short)vp9_sad16x8_c(src_ptr, src_stride,
+ ref_ptr + 7, ref_stride,
+ 0x7fffffff);
+}
+
+void vp9_sad8x8x3_c(const unsigned char *src_ptr,
+ int src_stride,
+ const unsigned char *ref_ptr,
+ int ref_stride,
+ unsigned int *sad_array) {
+ sad_array[0] = vp9_sad8x8_c(src_ptr, src_stride,
+ ref_ptr, ref_stride, 0x7fffffff);
+ sad_array[1] = vp9_sad8x8_c(src_ptr, src_stride,
+ ref_ptr + 1, ref_stride, 0x7fffffff);
+ sad_array[2] = vp9_sad8x8_c(src_ptr, src_stride,
+ ref_ptr + 2, ref_stride, 0x7fffffff);
+}
+
+void vp9_sad8x8x8_c(const unsigned char *src_ptr,
+ int src_stride,
+ const unsigned char *ref_ptr,
+ int ref_stride,
+ unsigned short *sad_array) {
+ sad_array[0] = (unsigned short)vp9_sad8x8_c(src_ptr, src_stride,
+ ref_ptr, ref_stride,
+ 0x7fffffff);
+ sad_array[1] = (unsigned short)vp9_sad8x8_c(src_ptr, src_stride,
+ ref_ptr + 1, ref_stride,
+ 0x7fffffff);
+ sad_array[2] = (unsigned short)vp9_sad8x8_c(src_ptr, src_stride,
+ ref_ptr + 2, ref_stride,
+ 0x7fffffff);
+ sad_array[3] = (unsigned short)vp9_sad8x8_c(src_ptr, src_stride,
+ ref_ptr + 3, ref_stride,
+ 0x7fffffff);
+ sad_array[4] = (unsigned short)vp9_sad8x8_c(src_ptr, src_stride,
+ ref_ptr + 4, ref_stride,
+ 0x7fffffff);
+ sad_array[5] = (unsigned short)vp9_sad8x8_c(src_ptr, src_stride,
+ ref_ptr + 5, ref_stride,
+ 0x7fffffff);
+ sad_array[6] = (unsigned short)vp9_sad8x8_c(src_ptr, src_stride,
+ ref_ptr + 6, ref_stride,
+ 0x7fffffff);
+ sad_array[7] = (unsigned short)vp9_sad8x8_c(src_ptr, src_stride,
+ ref_ptr + 7, ref_stride,
+ 0x7fffffff);
+}
+
+void vp9_sad8x16x3_c(const unsigned char *src_ptr,
+ int src_stride,
+ const unsigned char *ref_ptr,
+ int ref_stride,
+ unsigned int *sad_array) {
+ sad_array[0] = vp9_sad8x16_c(src_ptr, src_stride,
+ ref_ptr, ref_stride, 0x7fffffff);
+ sad_array[1] = vp9_sad8x16_c(src_ptr, src_stride,
+ ref_ptr + 1, ref_stride, 0x7fffffff);
+ sad_array[2] = vp9_sad8x16_c(src_ptr, src_stride,
+ ref_ptr + 2, ref_stride, 0x7fffffff);
+}
+
+void vp9_sad8x16x8_c(const unsigned char *src_ptr,
+ int src_stride,
+ const unsigned char *ref_ptr,
+ int ref_stride,
+ unsigned short *sad_array) {
+ sad_array[0] = (unsigned short)vp9_sad8x16_c(src_ptr, src_stride,
+ ref_ptr, ref_stride,
+ 0x7fffffff);
+ sad_array[1] = (unsigned short)vp9_sad8x16_c(src_ptr, src_stride,
+ ref_ptr + 1, ref_stride,
+ 0x7fffffff);
+ sad_array[2] = (unsigned short)vp9_sad8x16_c(src_ptr, src_stride,
+ ref_ptr + 2, ref_stride,
+ 0x7fffffff);
+ sad_array[3] = (unsigned short)vp9_sad8x16_c(src_ptr, src_stride,
+ ref_ptr + 3, ref_stride,
+ 0x7fffffff);
+ sad_array[4] = (unsigned short)vp9_sad8x16_c(src_ptr, src_stride,
+ ref_ptr + 4, ref_stride,
+ 0x7fffffff);
+ sad_array[5] = (unsigned short)vp9_sad8x16_c(src_ptr, src_stride,
+ ref_ptr + 5, ref_stride,
+ 0x7fffffff);
+ sad_array[6] = (unsigned short)vp9_sad8x16_c(src_ptr, src_stride,
+ ref_ptr + 6, ref_stride,
+ 0x7fffffff);
+ sad_array[7] = (unsigned short)vp9_sad8x16_c(src_ptr, src_stride,
+ ref_ptr + 7, ref_stride,
+ 0x7fffffff);
+}
+
+void vp9_sad4x4x3_c(const unsigned char *src_ptr,
+ int src_stride,
+ const unsigned char *ref_ptr,
+ int ref_stride,
+ unsigned int *sad_array) {
+ sad_array[0] = vp9_sad4x4_c(src_ptr, src_stride,
+ ref_ptr, ref_stride, 0x7fffffff);
+ sad_array[1] = vp9_sad4x4_c(src_ptr, src_stride,
+ ref_ptr + 1, ref_stride, 0x7fffffff);
+ sad_array[2] = vp9_sad4x4_c(src_ptr, src_stride,
+ ref_ptr + 2, ref_stride, 0x7fffffff);
+}
+
+void vp9_sad4x4x8_c(const unsigned char *src_ptr,
+ int src_stride,
+ const unsigned char *ref_ptr,
+ int ref_stride,
+ unsigned short *sad_array) {
+ sad_array[0] = (unsigned short)vp9_sad4x4_c(src_ptr, src_stride,
+ ref_ptr, ref_stride,
+ 0x7fffffff);
+ sad_array[1] = (unsigned short)vp9_sad4x4_c(src_ptr, src_stride,
+ ref_ptr + 1, ref_stride,
+ 0x7fffffff);
+ sad_array[2] = (unsigned short)vp9_sad4x4_c(src_ptr, src_stride,
+ ref_ptr + 2, ref_stride,
+ 0x7fffffff);
+ sad_array[3] = (unsigned short)vp9_sad4x4_c(src_ptr, src_stride,
+ ref_ptr + 3, ref_stride,
+ 0x7fffffff);
+ sad_array[4] = (unsigned short)vp9_sad4x4_c(src_ptr, src_stride,
+ ref_ptr + 4, ref_stride,
+ 0x7fffffff);
+ sad_array[5] = (unsigned short)vp9_sad4x4_c(src_ptr, src_stride,
+ ref_ptr + 5, ref_stride,
+ 0x7fffffff);
+ sad_array[6] = (unsigned short)vp9_sad4x4_c(src_ptr, src_stride,
+ ref_ptr + 6, ref_stride,
+ 0x7fffffff);
+ sad_array[7] = (unsigned short)vp9_sad4x4_c(src_ptr, src_stride,
+ ref_ptr + 7, ref_stride,
+ 0x7fffffff);
+}
+
+void vp9_sad32x32x4d_c(const unsigned char *src_ptr,
+ int src_stride,
+ unsigned char *ref_ptr[],
+ int ref_stride,
+ unsigned int *sad_array
+ ) {
+ sad_array[0] = vp9_sad32x32_c(src_ptr, src_stride,
+ ref_ptr[0], ref_stride, 0x7fffffff);
+ sad_array[1] = vp9_sad32x32_c(src_ptr, src_stride,
+ ref_ptr[1], ref_stride, 0x7fffffff);
+ sad_array[2] = vp9_sad32x32_c(src_ptr, src_stride,
+ ref_ptr[2], ref_stride, 0x7fffffff);
+ sad_array[3] = vp9_sad32x32_c(src_ptr, src_stride,
+ ref_ptr[3], ref_stride, 0x7fffffff);
+}
+
+void vp9_sad16x16x4d_c(const unsigned char *src_ptr,
+ int src_stride,
+ unsigned char *ref_ptr[],
+ int ref_stride,
+ unsigned int *sad_array) {
+ sad_array[0] = vp9_sad16x16_c(src_ptr, src_stride,
+ ref_ptr[0], ref_stride, 0x7fffffff);
+ sad_array[1] = vp9_sad16x16_c(src_ptr, src_stride,
+ ref_ptr[1], ref_stride, 0x7fffffff);
+ sad_array[2] = vp9_sad16x16_c(src_ptr, src_stride,
+ ref_ptr[2], ref_stride, 0x7fffffff);
+ sad_array[3] = vp9_sad16x16_c(src_ptr, src_stride,
+ ref_ptr[3], ref_stride, 0x7fffffff);
+}
+
+void vp9_sad16x8x4d_c(const unsigned char *src_ptr,
+ int src_stride,
+ unsigned char *ref_ptr[],
+ int ref_stride,
+ unsigned int *sad_array) {
+ sad_array[0] = vp9_sad16x8_c(src_ptr, src_stride,
+ ref_ptr[0], ref_stride, 0x7fffffff);
+ sad_array[1] = vp9_sad16x8_c(src_ptr, src_stride,
+ ref_ptr[1], ref_stride, 0x7fffffff);
+ sad_array[2] = vp9_sad16x8_c(src_ptr, src_stride,
+ ref_ptr[2], ref_stride, 0x7fffffff);
+ sad_array[3] = vp9_sad16x8_c(src_ptr, src_stride,
+ ref_ptr[3], ref_stride, 0x7fffffff);
+}
+
+void vp9_sad8x8x4d_c(const unsigned char *src_ptr,
+ int src_stride,
+ unsigned char *ref_ptr[],
+ int ref_stride,
+ unsigned int *sad_array) {
+ sad_array[0] = vp9_sad8x8_c(src_ptr, src_stride,
+ ref_ptr[0], ref_stride, 0x7fffffff);
+ sad_array[1] = vp9_sad8x8_c(src_ptr, src_stride,
+ ref_ptr[1], ref_stride, 0x7fffffff);
+ sad_array[2] = vp9_sad8x8_c(src_ptr, src_stride,
+ ref_ptr[2], ref_stride, 0x7fffffff);
+ sad_array[3] = vp9_sad8x8_c(src_ptr, src_stride,
+ ref_ptr[3], ref_stride, 0x7fffffff);
+}
+
+void vp9_sad8x16x4d_c(const unsigned char *src_ptr,
+ int src_stride,
+ unsigned char *ref_ptr[],
+ int ref_stride,
+ unsigned int *sad_array) {
+ sad_array[0] = vp9_sad8x16_c(src_ptr, src_stride,
+ ref_ptr[0], ref_stride, 0x7fffffff);
+ sad_array[1] = vp9_sad8x16_c(src_ptr, src_stride,
+ ref_ptr[1], ref_stride, 0x7fffffff);
+ sad_array[2] = vp9_sad8x16_c(src_ptr, src_stride,
+ ref_ptr[2], ref_stride, 0x7fffffff);
+ sad_array[3] = vp9_sad8x16_c(src_ptr, src_stride,
+ ref_ptr[3], ref_stride, 0x7fffffff);
+}
+
+void vp9_sad4x4x4d_c(const unsigned char *src_ptr,
+ int src_stride,
+ unsigned char *ref_ptr[],
+ int ref_stride,
+ unsigned int *sad_array) {
+ sad_array[0] = vp9_sad4x4_c(src_ptr, src_stride,
+ ref_ptr[0], ref_stride, 0x7fffffff);
+ sad_array[1] = vp9_sad4x4_c(src_ptr, src_stride,
+ ref_ptr[1], ref_stride, 0x7fffffff);
+ sad_array[2] = vp9_sad4x4_c(src_ptr, src_stride,
+ ref_ptr[2], ref_stride, 0x7fffffff);
+ sad_array[3] = vp9_sad4x4_c(src_ptr, src_stride,
+ ref_ptr[3], ref_stride, 0x7fffffff);
+}
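
The x3 and x8 helpers above evaluate the block SAD at three or eight
consecutive horizontal offsets of ref_ptr, and the x4d helpers at four
independent candidate pointers; all of them delegate to the plain per-block
kernels defined earlier in sad_c.c (outside this hunk). A minimal sketch of
what such a kernel computes; the name, signature and the abs() call from
<stdlib.h> are illustrative, not taken from the patch:

    static unsigned int sad_wxh_sketch(const unsigned char *src_ptr,
                                       int src_stride,
                                       const unsigned char *ref_ptr,
                                       int ref_stride,
                                       int width, int height) {
      unsigned int sad = 0;
      int r, c;

      /* Sum of absolute differences over a width x height block. The real
       * kernels also take a max_sad argument, which the wrappers above
       * neutralize by passing 0x7fffffff. */
      for (r = 0; r < height; r++) {
        for (c = 0; c < width; c++)
          sad += abs(src_ptr[c] - ref_ptr[c]);
        src_ptr += src_stride;
        ref_ptr += ref_stride;
      }
      return sad;
    }
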
+
+/* Copy a 32-pixel-wide strip (two macroblocks wide) of height rows to a buffer */
+void vp9_copy32xn_c(unsigned char *src_ptr,
+ int src_stride,
+ unsigned char *dst_ptr,
+ int dst_stride,
+ int height) {
+ int r;
+
+ for (r = 0; r < height; r++) {
+#if !(CONFIG_FAST_UNALIGNED)
+ dst_ptr[0] = src_ptr[0];
+ dst_ptr[1] = src_ptr[1];
+ dst_ptr[2] = src_ptr[2];
+ dst_ptr[3] = src_ptr[3];
+ dst_ptr[4] = src_ptr[4];
+ dst_ptr[5] = src_ptr[5];
+ dst_ptr[6] = src_ptr[6];
+ dst_ptr[7] = src_ptr[7];
+ dst_ptr[8] = src_ptr[8];
+ dst_ptr[9] = src_ptr[9];
+ dst_ptr[10] = src_ptr[10];
+ dst_ptr[11] = src_ptr[11];
+ dst_ptr[12] = src_ptr[12];
+ dst_ptr[13] = src_ptr[13];
+ dst_ptr[14] = src_ptr[14];
+ dst_ptr[15] = src_ptr[15];
+ dst_ptr[16] = src_ptr[16];
+ dst_ptr[17] = src_ptr[17];
+ dst_ptr[18] = src_ptr[18];
+ dst_ptr[19] = src_ptr[19];
+ dst_ptr[20] = src_ptr[20];
+ dst_ptr[21] = src_ptr[21];
+ dst_ptr[22] = src_ptr[22];
+ dst_ptr[23] = src_ptr[23];
+ dst_ptr[24] = src_ptr[24];
+ dst_ptr[25] = src_ptr[25];
+ dst_ptr[26] = src_ptr[26];
+ dst_ptr[27] = src_ptr[27];
+ dst_ptr[28] = src_ptr[28];
+ dst_ptr[29] = src_ptr[29];
+ dst_ptr[30] = src_ptr[30];
+ dst_ptr[31] = src_ptr[31];
+#else
+ ((uint32_t *)dst_ptr)[0] = ((uint32_t *)src_ptr)[0];
+ ((uint32_t *)dst_ptr)[1] = ((uint32_t *)src_ptr)[1];
+ ((uint32_t *)dst_ptr)[2] = ((uint32_t *)src_ptr)[2];
+ ((uint32_t *)dst_ptr)[3] = ((uint32_t *)src_ptr)[3];
+ ((uint32_t *)dst_ptr)[4] = ((uint32_t *)src_ptr)[4];
+ ((uint32_t *)dst_ptr)[5] = ((uint32_t *)src_ptr)[5];
+ ((uint32_t *)dst_ptr)[6] = ((uint32_t *)src_ptr)[6];
+ ((uint32_t *)dst_ptr)[7] = ((uint32_t *)src_ptr)[7];
+#endif
+ src_ptr += src_stride;
+ dst_ptr += dst_stride;
+
+ }
+}
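
vp9_copy32xn_c copies a 32-pixel-wide strip row by row; with
CONFIG_FAST_UNALIGNED it issues eight 32-bit loads/stores per row instead of
32 byte copies. Either branch behaves like a per-row memcpy; a behaviourally
equivalent sketch (assumes <string.h>, not part of the patch):

    static void copy32xn_sketch(const unsigned char *src_ptr, int src_stride,
                                unsigned char *dst_ptr, int dst_stride,
                                int height) {
      int r;

      /* Copy 32 bytes per row, stepping each pointer by its own stride. */
      for (r = 0; r < height; r++) {
        memcpy(dst_ptr, src_ptr, 32);
        src_ptr += src_stride;
        dst_ptr += dst_stride;
      }
    }
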
--- /dev/null
+++ b/vp9/encoder/satd_c.c
@@ -1,0 +1,47 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <stdlib.h>
+#include "vpx_ports/mem.h"
+#include "./vpx_rtcd.h"
+unsigned int vp9_satd16x16_c(const unsigned char *src_ptr,
+ int src_stride,
+ const unsigned char *ref_ptr,
+ int ref_stride,
+ unsigned int *psatd) {
+ int r, c, i;
+ unsigned int satd = 0;
+ DECLARE_ALIGNED(16, short, diff_in[256]);
+ DECLARE_ALIGNED(16, short, diff_out[16]);
+ short *in;
+
+ for (r = 0; r < 16; r++) {
+ for (c = 0; c < 16; c++) {
+ diff_in[r * 16 + c] = src_ptr[c] - ref_ptr[c];
+ }
+ src_ptr += src_stride;
+ ref_ptr += ref_stride;
+ }
+
+ in = diff_in;
+ for (r = 0; r < 16; r += 4) {
+ for (c = 0; c < 16; c += 4) {
+ vp9_short_walsh4x4_c(in + c, diff_out, 32);
+ for (i = 0; i < 16; i++)
+ satd += abs(diff_out[i]);
+ }
+ in += 64;
+ }
+
+ if (psatd)
+ *psatd = satd;
+
+ return satd;
+}
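
vp9_satd16x16_c forms the 16x16 difference block, applies
vp9_short_walsh4x4_c to each 4x4 sub-block and sums the absolute values of
the transformed coefficients. The transform is a separable 4-point
Walsh-Hadamard butterfly; an unnormalized, self-contained sketch of that
idea follows (the production vp9_short_walsh4x4_c lives in the common DCT
code and also applies fixed-point scaling, so this is illustrative only):

    static void hadamard4x4_sketch(const short diff[16], short out[16]) {
      short tmp[16];
      int i;

      /* Transform rows. */
      for (i = 0; i < 4; i++) {
        short a = diff[i * 4 + 0], b = diff[i * 4 + 1];
        short c = diff[i * 4 + 2], d = diff[i * 4 + 3];
        tmp[i * 4 + 0] = a + b + c + d;
        tmp[i * 4 + 1] = a - b + c - d;
        tmp[i * 4 + 2] = a + b - c - d;
        tmp[i * 4 + 3] = a - b - c + d;
      }
      /* Transform columns of the row-transformed block. */
      for (i = 0; i < 4; i++) {
        short a = tmp[0 * 4 + i], b = tmp[1 * 4 + i];
        short c = tmp[2 * 4 + i], d = tmp[3 * 4 + i];
        out[0 * 4 + i] = a + b + c + d;
        out[1 * 4 + i] = a - b + c - d;
        out[2 * 4 + i] = a + b - c - d;
        out[3 * 4 + i] = a - b - c + d;
      }
    }
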
--- /dev/null
+++ b/vp9/encoder/segmentation.c
@@ -1,0 +1,327 @@
+/*
+ * Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "limits.h"
+#include "vpx_mem/vpx_mem.h"
+#include "segmentation.h"
+#include "vp9/common/pred_common.h"
+
+void vp9_update_gf_useage_maps(VP9_COMP *cpi, VP9_COMMON *cm, MACROBLOCK *x) {
+ int mb_row, mb_col;
+
+ MODE_INFO *this_mb_mode_info = cm->mi;
+
+ x->gf_active_ptr = (signed char *)cpi->gf_active_flags;
+
+ if ((cm->frame_type == KEY_FRAME) || (cm->refresh_golden_frame)) {
+    // Reset GF usage monitors
+ vpx_memset(cpi->gf_active_flags, 1, (cm->mb_rows * cm->mb_cols));
+ cpi->gf_active_count = cm->mb_rows * cm->mb_cols;
+ } else {
+ // for each macroblock row in image
+ for (mb_row = 0; mb_row < cm->mb_rows; mb_row++) {
+ // for each macroblock col in image
+ for (mb_col = 0; mb_col < cm->mb_cols; mb_col++) {
+
+        // If this MB uses the golden or altref frame, set the GF active
+        // flag if it is not already set. If it uses last-frame 0,0 mode,
+        // leave the flag as it is; otherwise (non-0,0 motion or intra
+        // modes) clear the flag if it is currently set.
+ if ((this_mb_mode_info->mbmi.ref_frame == GOLDEN_FRAME) ||
+ (this_mb_mode_info->mbmi.ref_frame == ALTREF_FRAME)) {
+ if (*(x->gf_active_ptr) == 0) {
+ *(x->gf_active_ptr) = 1;
+ cpi->gf_active_count++;
+ }
+ } else if ((this_mb_mode_info->mbmi.mode != ZEROMV) &&
+ *(x->gf_active_ptr)) {
+ *(x->gf_active_ptr) = 0;
+ cpi->gf_active_count--;
+ }
+
+ x->gf_active_ptr++; // Step onto next entry
+ this_mb_mode_info++; // skip to next mb
+
+ }
+
+ // this is to account for the border
+ this_mb_mode_info++;
+ }
+ }
+}
+
+void vp9_enable_segmentation(VP9_PTR ptr) {
+ VP9_COMP *cpi = (VP9_COMP *)(ptr);
+
+ // Set the appropriate feature bit
+ cpi->mb.e_mbd.segmentation_enabled = 1;
+ cpi->mb.e_mbd.update_mb_segmentation_map = 1;
+ cpi->mb.e_mbd.update_mb_segmentation_data = 1;
+}
+
+void vp9_disable_segmentation(VP9_PTR ptr) {
+ VP9_COMP *cpi = (VP9_COMP *)(ptr);
+
+ // Clear the appropriate feature bit
+ cpi->mb.e_mbd.segmentation_enabled = 0;
+}
+
+void vp9_set_segmentation_map(VP9_PTR ptr,
+ unsigned char *segmentation_map) {
+ VP9_COMP *cpi = (VP9_COMP *)(ptr);
+
+ // Copy in the new segmentation map
+ vpx_memcpy(cpi->segmentation_map, segmentation_map,
+ (cpi->common.mb_rows * cpi->common.mb_cols));
+
+ // Signal that the map should be updated.
+ cpi->mb.e_mbd.update_mb_segmentation_map = 1;
+ cpi->mb.e_mbd.update_mb_segmentation_data = 1;
+}
+
+void vp9_set_segment_data(VP9_PTR ptr,
+ signed char *feature_data,
+ unsigned char abs_delta) {
+ VP9_COMP *cpi = (VP9_COMP *)(ptr);
+
+ cpi->mb.e_mbd.mb_segment_abs_delta = abs_delta;
+
+ vpx_memcpy(cpi->mb.e_mbd.segment_feature_data, feature_data,
+ sizeof(cpi->mb.e_mbd.segment_feature_data));
+
+ // TBD ?? Set the feature mask
+ // vpx_memcpy(cpi->mb.e_mbd.segment_feature_mask, 0,
+ // sizeof(cpi->mb.e_mbd.segment_feature_mask));
+}
+
+// Based on set of segment counts calculate a probability tree
+static void calc_segtree_probs(MACROBLOCKD *xd,
+ int *segcounts,
+ vp9_prob *segment_tree_probs) {
+ int count1, count2;
+ int tot_count;
+ int i;
+
+  // Blank the structure to start with
+ vpx_memset(segment_tree_probs, 0,
+ MB_FEATURE_TREE_PROBS * sizeof(*segment_tree_probs));
+
+ // Total count for all segments
+ count1 = segcounts[0] + segcounts[1];
+ count2 = segcounts[2] + segcounts[3];
+ tot_count = count1 + count2;
+
+ // Work out probabilities of each segment
+ if (tot_count)
+ segment_tree_probs[0] = (count1 * 255) / tot_count;
+ if (count1 > 0)
+ segment_tree_probs[1] = (segcounts[0] * 255) / count1;
+ if (count2 > 0)
+ segment_tree_probs[2] = (segcounts[2] * 255) / count2;
+
+ // Clamp probabilities to minimum allowed value
+ for (i = 0; i < MB_FEATURE_TREE_PROBS; i++) {
+ if (segment_tree_probs[i] == 0)
+ segment_tree_probs[i] = 1;
+ }
+}
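
As a concrete check of the arithmetic in calc_segtree_probs, with
illustrative counts segcounts = {10, 5, 3, 2}:

    /* count1 = 10 + 5 = 15, count2 = 3 + 2 = 5, tot_count = 20
     *   segment_tree_probs[0] = 15 * 255 / 20 = 191   (segment is 0 or 1)
     *   segment_tree_probs[1] = 10 * 255 / 15 = 170   (segment 0, given 0/1)
     *   segment_tree_probs[2] =  3 * 255 /  5 = 153   (segment 2, given 2/3)
     * Any probability that comes out as 0 is clamped to 1 so the arithmetic
     * coder never sees an impossible branch. */
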
+
+// Based on set of segment counts and probabilities calculate a cost estimate
+static int cost_segmap(MACROBLOCKD *xd,
+ int *segcounts,
+ vp9_prob *probs) {
+ int cost;
+ int count1, count2;
+
+ // Cost the top node of the tree
+ count1 = segcounts[0] + segcounts[1];
+ count2 = segcounts[2] + segcounts[3];
+ cost = count1 * vp9_cost_zero(probs[0]) +
+ count2 * vp9_cost_one(probs[0]);
+
+ // Now add the cost of each individual segment branch
+ if (count1 > 0)
+ cost += segcounts[0] * vp9_cost_zero(probs[1]) +
+ segcounts[1] * vp9_cost_one(probs[1]);
+
+ if (count2 > 0)
+ cost += segcounts[2] * vp9_cost_zero(probs[2]) +
+ segcounts[3] * vp9_cost_one(probs[2]);
+
+ return cost;
+
+}
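
vp9_cost_zero()/vp9_cost_one() return the cost of a single binary decision
in fractional-bit units (roughly 256 units per bit), so cost_segmap
estimates the total bits needed to code the segment map under the given
tree probabilities. Continuing the illustrative numbers above:

    /* With probs[0] = 191:
     *   vp9_cost_zero(191) ~= 256 * -log2(191/256) ~= 108   (about 0.42 bit)
     *   vp9_cost_one(191)  ~= 256 * -log2( 65/256) ~= 506   (about 1.98 bit)
     * so 15 "zero" and 5 "one" decisions at the top node cost roughly
     *   15 * 108 + 5 * 506 = 4150 units, i.e. about 16 bits.             */
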
+
+void vp9_choose_segmap_coding_method(VP9_COMP *cpi) {
+ VP9_COMMON *const cm = &cpi->common;
+ MACROBLOCKD *const xd = &cpi->mb.e_mbd;
+
+ const int mis = cm->mode_info_stride;
+ int i;
+ int tot_count;
+ int no_pred_cost;
+ int t_pred_cost = INT_MAX;
+ int pred_context;
+
+ int mb_row, mb_col;
+ int segmap_index = 0;
+ unsigned char segment_id;
+
+ int temporal_predictor_count[PREDICTION_PROBS][2];
+ int no_pred_segcounts[MAX_MB_SEGMENTS];
+ int t_unpred_seg_counts[MAX_MB_SEGMENTS];
+
+ vp9_prob no_pred_tree[MB_FEATURE_TREE_PROBS];
+ vp9_prob t_pred_tree[MB_FEATURE_TREE_PROBS];
+ vp9_prob t_nopred_prob[PREDICTION_PROBS];
+
+ // Set default state for the segment tree probabilities and the
+ // temporal coding probabilities
+ vpx_memset(xd->mb_segment_tree_probs, 255,
+ sizeof(xd->mb_segment_tree_probs));
+ vpx_memset(cm->segment_pred_probs, 255,
+ sizeof(cm->segment_pred_probs));
+
+ vpx_memset(no_pred_segcounts, 0, sizeof(no_pred_segcounts));
+ vpx_memset(t_unpred_seg_counts, 0, sizeof(t_unpred_seg_counts));
+ vpx_memset(temporal_predictor_count, 0, sizeof(temporal_predictor_count));
+
+ // First of all generate stats regarding how well the last segment map
+ // predicts this one
+
+ // Initialize macroblock decoder mode info context for the first mb
+ // in the frame
+ xd->mode_info_context = cm->mi;
+
+ for (mb_row = 0; mb_row < cm->mb_rows; mb_row += 2) {
+ for (mb_col = 0; mb_col < cm->mb_cols; mb_col += 2) {
+ for (i = 0; i < 4; i++) {
+ static const int dx[4] = { +1, -1, +1, +1 };
+ static const int dy[4] = { 0, +1, 0, -1 };
+ int x_idx = i & 1, y_idx = i >> 1;
+
+ if (mb_col + x_idx >= cm->mb_cols ||
+ mb_row + y_idx >= cm->mb_rows) {
+ goto end;
+ }
+
+ xd->mb_to_top_edge = -((mb_row * 16) << 3);
+ xd->mb_to_bottom_edge = ((cm->mb_rows - 1 - mb_row) * 16) << 3;
+ xd->mb_to_left_edge = -((mb_col * 16) << 3);
+        xd->mb_to_right_edge = ((cm->mb_cols - 1 - mb_col) * 16) << 3;
+
+ segmap_index = (mb_row + y_idx) * cm->mb_cols + mb_col + x_idx;
+ segment_id = xd->mode_info_context->mbmi.segment_id;
+#if CONFIG_SUPERBLOCKS
+ if (xd->mode_info_context->mbmi.encoded_as_sb) {
+ if (mb_col + 1 < cm->mb_cols)
+ segment_id = segment_id &&
+ xd->mode_info_context[1].mbmi.segment_id;
+ if (mb_row + 1 < cm->mb_rows) {
+ segment_id = segment_id &&
+ xd->mode_info_context[mis].mbmi.segment_id;
+ if (mb_col + 1 < cm->mb_cols)
+ segment_id = segment_id &&
+ xd->mode_info_context[mis + 1].mbmi.segment_id;
+ }
+ }
+#endif
+
+ // Count the number of hits on each segment with no prediction
+ no_pred_segcounts[segment_id]++;
+
+ // Temporal prediction not allowed on key frames
+ if (cm->frame_type != KEY_FRAME) {
+ // Test to see if the segment id matches the predicted value.
+ int seg_predicted =
+ (segment_id == vp9_get_pred_mb_segid(cm, xd, segmap_index));
+
+ // Get the segment id prediction context
+ pred_context =
+ vp9_get_pred_context(cm, xd, PRED_SEG_ID);
+
+ // Store the prediction status for this mb and update counts
+ // as appropriate
+ vp9_set_pred_flag(xd, PRED_SEG_ID, seg_predicted);
+ temporal_predictor_count[pred_context][seg_predicted]++;
+
+ if (!seg_predicted)
+ // Update the "unpredicted" segment count
+ t_unpred_seg_counts[segment_id]++;
+ }
+
+#if CONFIG_SUPERBLOCKS
+ if (xd->mode_info_context->mbmi.encoded_as_sb) {
+ assert(!i);
+ xd->mode_info_context += 2;
+ break;
+ }
+#endif
+ end:
+ xd->mode_info_context += dx[i] + dy[i] * cm->mode_info_stride;
+ }
+ }
+
+ // this is to account for the border in mode_info_context
+ xd->mode_info_context -= mb_col;
+ xd->mode_info_context += cm->mode_info_stride * 2;
+ }
+
+ // Work out probability tree for coding segments without prediction
+ // and the cost.
+ calc_segtree_probs(xd, no_pred_segcounts, no_pred_tree);
+ no_pred_cost = cost_segmap(xd, no_pred_segcounts, no_pred_tree);
+
+ // Key frames cannot use temporal prediction
+ if (cm->frame_type != KEY_FRAME) {
+ // Work out probability tree for coding those segments not
+ // predicted using the temporal method and the cost.
+ calc_segtree_probs(xd, t_unpred_seg_counts, t_pred_tree);
+ t_pred_cost = cost_segmap(xd, t_unpred_seg_counts, t_pred_tree);
+
+ // Add in the cost of the signalling for each prediction context
+ for (i = 0; i < PREDICTION_PROBS; i++) {
+ tot_count = temporal_predictor_count[i][0] +
+ temporal_predictor_count[i][1];
+
+ // Work out the context probabilities for the segment
+ // prediction flag
+ if (tot_count) {
+ t_nopred_prob[i] = (temporal_predictor_count[i][0] * 255) /
+ tot_count;
+
+ // Clamp to minimum allowed value
+ if (t_nopred_prob[i] < 1)
+ t_nopred_prob[i] = 1;
+ } else
+ t_nopred_prob[i] = 1;
+
+ // Add in the predictor signaling cost
+ t_pred_cost += (temporal_predictor_count[i][0] *
+ vp9_cost_zero(t_nopred_prob[i])) +
+ (temporal_predictor_count[i][1] *
+ vp9_cost_one(t_nopred_prob[i]));
+ }
+ }
+
+ // Now choose which coding method to use.
+ if (t_pred_cost < no_pred_cost) {
+ cm->temporal_update = 1;
+ vpx_memcpy(xd->mb_segment_tree_probs,
+ t_pred_tree, sizeof(t_pred_tree));
+ vpx_memcpy(&cm->segment_pred_probs,
+ t_nopred_prob, sizeof(t_nopred_prob));
+ } else {
+ cm->temporal_update = 0;
+ vpx_memcpy(xd->mb_segment_tree_probs,
+ no_pred_tree, sizeof(no_pred_tree));
+ }
+}
--- /dev/null
+++ b/vp9/encoder/segmentation.h
@@ -1,0 +1,46 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "string.h"
+#include "vp9/common/blockd.h"
+#include "onyx_int.h"
+
+#ifndef __INC_SEGMENTATION_H__
+#define __INC_SEGMENTATION_H__ 1
+
+extern void vp9_update_gf_useage_maps(VP9_COMP *cpi, VP9_COMMON *cm,
+ MACROBLOCK *x);
+
+extern void vp9_enable_segmentation(VP9_PTR ptr);
+extern void vp9_disable_segmentation(VP9_PTR ptr);
+
+// Valid values for a segment are 0 to 3
+// Segmentation map is arranged as [Rows][Columns]
+extern void vp9_set_segmentation_map(VP9_PTR ptr,
+ unsigned char *segmentation_map);
+
+// The values given for each segment can be either deltas (from the default
+// value chosen for the frame) or absolute values.
+//
+// Valid range for abs values is (0-127 for MB_LVL_ALT_Q), (0-63 for
+// SEGMENT_ALT_LF)
+// Valid range for delta values is (+/-127 for MB_LVL_ALT_Q), (+/-63 for
+// SEGMENT_ALT_LF)
+//
+// abs_delta = SEGMENT_DELTADATA (deltas) abs_delta = SEGMENT_ABSDATA (use
+// the absolute values given).
+//
+extern void vp9_set_segment_data(VP9_PTR ptr, signed char *feature_data,
+ unsigned char abs_delta);
+
+extern void vp9_choose_segmap_coding_method(VP9_COMP *cpi);
+
+#endif /* __INC_SEGMENTATION_H__ */
--- /dev/null
+++ b/vp9/encoder/ssim.c
@@ -1,0 +1,147 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "onyx_int.h"
+
+void vp9_ssim_parms_16x16_c(unsigned char *s, int sp, unsigned char *r,
+ int rp, unsigned long *sum_s, unsigned long *sum_r,
+ unsigned long *sum_sq_s, unsigned long *sum_sq_r,
+ unsigned long *sum_sxr) {
+ int i, j;
+ for (i = 0; i < 16; i++, s += sp, r += rp) {
+ for (j = 0; j < 16; j++) {
+ *sum_s += s[j];
+ *sum_r += r[j];
+ *sum_sq_s += s[j] * s[j];
+ *sum_sq_r += r[j] * r[j];
+ *sum_sxr += s[j] * r[j];
+ }
+ }
+}
+void vp9_ssim_parms_8x8_c(unsigned char *s, int sp, unsigned char *r, int rp,
+ unsigned long *sum_s, unsigned long *sum_r,
+ unsigned long *sum_sq_s, unsigned long *sum_sq_r,
+ unsigned long *sum_sxr) {
+ int i, j;
+ for (i = 0; i < 8; i++, s += sp, r += rp) {
+ for (j = 0; j < 8; j++) {
+ *sum_s += s[j];
+ *sum_r += r[j];
+ *sum_sq_s += s[j] * s[j];
+ *sum_sq_r += r[j] * r[j];
+ *sum_sxr += s[j] * r[j];
+ }
+ }
+}
+
+static const int64_t cc1 =  26634; // (64^2*(.01*255)^2)
+static const int64_t cc2 = 239708; // (64^2*(.03*255)^2)
+
+static double similarity(unsigned long sum_s, unsigned long sum_r,
+ unsigned long sum_sq_s, unsigned long sum_sq_r,
+ unsigned long sum_sxr, int count) {
+ int64_t ssim_n, ssim_d;
+ int64_t c1, c2;
+
+ // scale the constants by number of pixels
+ c1 = (cc1 * count * count) >> 12;
+ c2 = (cc2 * count * count) >> 12;
+
+ ssim_n = (2 * sum_s * sum_r + c1) * ((int64_t) 2 * count * sum_sxr -
+ (int64_t) 2 * sum_s * sum_r + c2);
+
+ ssim_d = (sum_s * sum_s + sum_r * sum_r + c1) *
+ ((int64_t)count * sum_sq_s - (int64_t)sum_s * sum_s +
+ (int64_t)count * sum_sq_r - (int64_t) sum_r * sum_r + c2);
+
+ return ssim_n * 1.0 / ssim_d;
+}
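
The integer expression above is the standard SSIM ratio computed from raw
sums: writing n = count and letting sum_s, sum_r, sum_sq_s, sum_sq_r and
sum_sxr stand for the sums of s, r, s^2, r^2 and s*r, each factor picks up
the same n^2 scale, which cancels between numerator and denominator,
leaving:

    /*
     *            (2*mu_s*mu_r + C1) * (2*cov(s,r) + C2)
     *   SSIM = -----------------------------------------------
     *          (mu_s^2 + mu_r^2 + C1) * (var(s) + var(r) + C2)
     *
     * with C1 = (0.01*255)^2 and C2 = (0.03*255)^2. cc1/cc2 are those
     * constants pre-scaled by 64^2 = 4096, so c1 = (cc1*count*count) >> 12
     * recovers count^2 * C1 in pure integer arithmetic.
     */
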
+
+static double ssim_16x16(unsigned char *s, int sp, unsigned char *r, int rp) {
+ unsigned long sum_s = 0, sum_r = 0, sum_sq_s = 0, sum_sq_r = 0, sum_sxr = 0;
+ vp9_ssim_parms_16x16(s, sp, r, rp, &sum_s, &sum_r, &sum_sq_s, &sum_sq_r,
+ &sum_sxr);
+ return similarity(sum_s, sum_r, sum_sq_s, sum_sq_r, sum_sxr, 256);
+}
+static double ssim_8x8(unsigned char *s, int sp, unsigned char *r, int rp) {
+ unsigned long sum_s = 0, sum_r = 0, sum_sq_s = 0, sum_sq_r = 0, sum_sxr = 0;
+ vp9_ssim_parms_8x8(s, sp, r, rp, &sum_s, &sum_r, &sum_sq_s, &sum_sq_r,
+ &sum_sxr);
+ return similarity(sum_s, sum_r, sum_sq_s, sum_sq_r, sum_sxr, 64);
+}
+
+// We use an 8x8 moving window whose starting locations step along the 4x4
+// pixel grid. Such an arrangement allows the windows to overlap block
+// boundaries and thus penalize blocking artifacts.
+double vp9_ssim2(unsigned char *img1, unsigned char *img2, int stride_img1,
+ int stride_img2, int width, int height) {
+ int i, j;
+ int samples = 0;
+ double ssim_total = 0;
+
+  // sample points start at each 4x4 location
+ for (i = 0; i < height - 8; i += 4, img1 += stride_img1 * 4, img2 += stride_img2 * 4) {
+ for (j = 0; j < width - 8; j += 4) {
+ double v = ssim_8x8(img1 + j, stride_img1, img2 + j, stride_img2);
+ ssim_total += v;
+ samples++;
+ }
+ }
+ ssim_total /= samples;
+ return ssim_total;
+}
+double vp9_calc_ssim(YV12_BUFFER_CONFIG *source, YV12_BUFFER_CONFIG *dest,
+ int lumamask, double *weight) {
+ double a, b, c;
+ double ssimv;
+
+ a = vp9_ssim2(source->y_buffer, dest->y_buffer,
+ source->y_stride, dest->y_stride, source->y_width,
+ source->y_height);
+
+ b = vp9_ssim2(source->u_buffer, dest->u_buffer,
+ source->uv_stride, dest->uv_stride, source->uv_width,
+ source->uv_height);
+
+ c = vp9_ssim2(source->v_buffer, dest->v_buffer,
+ source->uv_stride, dest->uv_stride, source->uv_width,
+ source->uv_height);
+
+ ssimv = a * .8 + .1 * (b + c);
+
+ *weight = 1;
+
+ return ssimv;
+}
+
+double vp9_calc_ssimg(YV12_BUFFER_CONFIG *source, YV12_BUFFER_CONFIG *dest,
+ double *ssim_y, double *ssim_u, double *ssim_v) {
+ double ssim_all = 0;
+ double a, b, c;
+
+ a = vp9_ssim2(source->y_buffer, dest->y_buffer,
+ source->y_stride, dest->y_stride, source->y_width,
+ source->y_height);
+
+ b = vp9_ssim2(source->u_buffer, dest->u_buffer,
+ source->uv_stride, dest->uv_stride, source->uv_width,
+ source->uv_height);
+
+ c = vp9_ssim2(source->v_buffer, dest->v_buffer,
+ source->uv_stride, dest->uv_stride, source->uv_width,
+ source->uv_height);
+ *ssim_y = a;
+ *ssim_u = b;
+ *ssim_v = c;
+ ssim_all = (a * 4 + b + c) / 6;
+
+ return ssim_all;
+}
--- /dev/null
+++ b/vp9/encoder/temporal_filter.c
@@ -1,0 +1,516 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "vp9/common/onyxc_int.h"
+#include "onyx_int.h"
+#include "vp9/common/systemdependent.h"
+#include "quantize.h"
+#include "vp9/common/alloccommon.h"
+#include "mcomp.h"
+#include "firstpass.h"
+#include "psnr.h"
+#include "vpx_scale/vpxscale.h"
+#include "vp9/common/extend.h"
+#include "ratectrl.h"
+#include "vp9/common/quant_common.h"
+#include "segmentation.h"
+#include "vpx_scale/yv12extend.h"
+#include "vpx_mem/vpx_mem.h"
+#include "vp9/common/swapyv12buffer.h"
+#include "vpx_ports/vpx_timer.h"
+
+#include <math.h>
+#include <limits.h>
+
+#define ALT_REF_MC_ENABLED 1    // enable/disable MC in AltRef filtering
+#define ALT_REF_SUBPEL_ENABLED 1 // enable/disable subpel in MC AltRef filtering
+
+#if VP9_TEMPORAL_ALT_REF
+
+
+static void temporal_filter_predictors_mb_c
+(
+ MACROBLOCKD *xd,
+ unsigned char *y_mb_ptr,
+ unsigned char *u_mb_ptr,
+ unsigned char *v_mb_ptr,
+ int stride,
+ int mv_row,
+ int mv_col,
+ unsigned char *pred
+) {
+ int offset;
+ unsigned char *yptr, *uptr, *vptr;
+ int omv_row, omv_col;
+
+ // Y
+ yptr = y_mb_ptr + (mv_row >> 3) * stride + (mv_col >> 3);
+
+ if ((mv_row | mv_col) & 7) {
+ xd->subpixel_predict16x16(yptr, stride,
+ (mv_col & 7) << 1, (mv_row & 7) << 1, &pred[0], 16);
+ } else {
+ vp9_copy_mem16x16(yptr, stride, &pred[0], 16);
+ }
+
+ // U & V
+ omv_row = mv_row;
+ omv_col = mv_col;
+ mv_row >>= 1;
+ mv_col >>= 1;
+ stride = (stride + 1) >> 1;
+ offset = (mv_row >> 3) * stride + (mv_col >> 3);
+ uptr = u_mb_ptr + offset;
+ vptr = v_mb_ptr + offset;
+
+ if ((omv_row | omv_col) & 15) {
+ xd->subpixel_predict8x8(uptr, stride,
+ (omv_col & 15), (omv_row & 15), &pred[256], 8);
+ xd->subpixel_predict8x8(vptr, stride,
+ (omv_col & 15), (omv_row & 15), &pred[320], 8);
+  } else {
+ vp9_copy_mem8x8(uptr, stride, &pred[256], 8);
+ vp9_copy_mem8x8(vptr, stride, &pred[320], 8);
+ }
+}
+void vp9_temporal_filter_apply_c
+(
+ unsigned char *frame1,
+ unsigned int stride,
+ unsigned char *frame2,
+ unsigned int block_size,
+ int strength,
+ int filter_weight,
+ unsigned int *accumulator,
+ unsigned short *count
+) {
+ unsigned int i, j, k;
+ int modifier;
+ int byte = 0;
+
+ for (i = 0, k = 0; i < block_size; i++) {
+ for (j = 0; j < block_size; j++, k++) {
+
+ int src_byte = frame1[byte];
+ int pixel_value = *frame2++;
+
+ modifier = src_byte - pixel_value;
+ // This is an integer approximation of:
+      // float coeff = (3.0 * modifier * modifier) / pow(2, strength);
+ // modifier = (int)roundf(coeff > 16 ? 0 : 16-coeff);
+ modifier *= modifier;
+ modifier *= 3;
+ modifier += 1 << (strength - 1);
+ modifier >>= strength;
+
+ if (modifier > 16)
+ modifier = 16;
+
+ modifier = 16 - modifier;
+ modifier *= filter_weight;
+
+ count[k] += modifier;
+ accumulator[k] += modifier * pixel_value;
+
+ byte++;
+ }
+
+ byte += stride - block_size;
+ }
+}
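
Each pass above adds a (weight, weight * pixel_value) pair into count[] and
accumulator[]; the per-pixel weight falls off quadratically with the
difference from the frame being built. A worked example with illustrative
numbers (strength = 6, filter_weight = 2, difference d = 4):

    /*   modifier = d * d * 3          = 48
     *   modifier += 1 << (6 - 1)      -> 80   (rounding term)
     *   modifier >>= 6                -> 1
     *   modifier = 16 - 1             -> 15   (after the clamp to <= 16)
     *   modifier *= 2                 -> 30
     * so count[k] grows by 30 and accumulator[k] by 30 * pixel_value;
     * a difference of 0 gives the maximum per-frame weight of 32.       */
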
+
+#if ALT_REF_MC_ENABLED
+
+static int temporal_filter_find_matching_mb_c
+(
+ VP9_COMP *cpi,
+ YV12_BUFFER_CONFIG *arf_frame,
+ YV12_BUFFER_CONFIG *frame_ptr,
+ int mb_offset,
+ int error_thresh
+) {
+ MACROBLOCK *x = &cpi->mb;
+ int step_param;
+ int further_steps;
+ int sadpb = x->sadperbit16;
+ int bestsme = INT_MAX;
+
+ BLOCK *b = &x->block[0];
+ BLOCKD *d = &x->e_mbd.block[0];
+ int_mv best_ref_mv1;
+ int_mv best_ref_mv1_full; /* full-pixel value of best_ref_mv1 */
+
+ // Save input state
+ unsigned char **base_src = b->base_src;
+ int src = b->src;
+ int src_stride = b->src_stride;
+ unsigned char **base_pre = d->base_pre;
+ int pre = d->pre;
+ int pre_stride = d->pre_stride;
+
+ best_ref_mv1.as_int = 0;
+ best_ref_mv1_full.as_mv.col = best_ref_mv1.as_mv.col >> 3;
+ best_ref_mv1_full.as_mv.row = best_ref_mv1.as_mv.row >> 3;
+
+ // Setup frame pointers
+ b->base_src = &arf_frame->y_buffer;
+ b->src_stride = arf_frame->y_stride;
+ b->src = mb_offset;
+
+ d->base_pre = &frame_ptr->y_buffer;
+ d->pre_stride = frame_ptr->y_stride;
+ d->pre = mb_offset;
+
+ // Further step/diamond searches as necessary
+ if (cpi->Speed < 8) {
+ step_param = cpi->sf.first_step +
+ ((cpi->Speed > 5) ? 1 : 0);
+ further_steps =
+ (cpi->sf.max_step_search_steps - 1) - step_param;
+ } else {
+ step_param = cpi->sf.first_step + 2;
+ further_steps = 0;
+ }
+
+ /*cpi->sf.search_method == HEX*/
+ // TODO Check that the 16x16 vf & sdf are selected here
+ // Ignore mv costing by sending NULL pointer instead of cost arrays
+ bestsme = vp9_hex_search(x, b, d, &best_ref_mv1_full, &d->bmi.as_mv.first,
+ step_param, sadpb, &cpi->fn_ptr[BLOCK_16X16],
+ NULLMVCOST, NULLMVCOST,
+ &best_ref_mv1);
+
+#if ALT_REF_SUBPEL_ENABLED
+ // Try sub-pixel MC?
+ // if (bestsme > error_thresh && bestsme < INT_MAX)
+ {
+ int distortion;
+ unsigned int sse;
+ // Ignore mv costing by sending NULL pointer instead of cost array
+ bestsme = cpi->find_fractional_mv_step(x, b, d, &d->bmi.as_mv.first,
+ &best_ref_mv1,
+ x->errorperbit,
+ &cpi->fn_ptr[BLOCK_16X16],
+ NULLMVCOST,
+ &distortion, &sse);
+ }
+#endif
+
+  // Restore input state
+ b->base_src = base_src;
+ b->src = src;
+ b->src_stride = src_stride;
+ d->base_pre = base_pre;
+ d->pre = pre;
+ d->pre_stride = pre_stride;
+
+ return bestsme;
+}
+#endif
+
+static void temporal_filter_iterate_c
+(
+ VP9_COMP *cpi,
+ int frame_count,
+ int alt_ref_index,
+ int strength
+) {
+ int byte;
+ int frame;
+ int mb_col, mb_row;
+ unsigned int filter_weight;
+ int mb_cols = cpi->common.mb_cols;
+ int mb_rows = cpi->common.mb_rows;
+ int mb_y_offset = 0;
+ int mb_uv_offset = 0;
+ DECLARE_ALIGNED_ARRAY(16, unsigned int, accumulator, 16 * 16 + 8 * 8 + 8 * 8);
+ DECLARE_ALIGNED_ARRAY(16, unsigned short, count, 16 * 16 + 8 * 8 + 8 * 8);
+ MACROBLOCKD *mbd = &cpi->mb.e_mbd;
+ YV12_BUFFER_CONFIG *f = cpi->frames[alt_ref_index];
+ unsigned char *dst1, *dst2;
+ DECLARE_ALIGNED_ARRAY(16, unsigned char, predictor, 16 * 16 + 8 * 8 + 8 * 8);
+
+ // Save input state
+ unsigned char *y_buffer = mbd->pre.y_buffer;
+ unsigned char *u_buffer = mbd->pre.u_buffer;
+ unsigned char *v_buffer = mbd->pre.v_buffer;
+
+ for (mb_row = 0; mb_row < mb_rows; mb_row++) {
+#if ALT_REF_MC_ENABLED
+    // Source frames are extended to 16 pixels. This is different from
+    // L/A/G reference frames that have a border of 32 (VP8BORDERINPIXELS).
+    // A 6/8 tap filter is used for motion search, which requires 2 pixels
+    // before and 3 pixels after, so the largest Y mv on a border would
+    // then be 16 - INTERP_EXTEND. The UV blocks are half the size of the Y
+    // blocks and are therefore only extended by 8, so the largest mv a UV
+    // block can support is 8 - INTERP_EXTEND. A UV mv is half of a Y mv,
+    // and (16 - INTERP_EXTEND) >> 1 is greater than 8 - INTERP_EXTEND.
+ // To keep the mv in play for both Y and UV planes the max that it
+ // can be on a border is therefore 16 - (2*INTERP_EXTEND+1).
+ cpi->mb.mv_row_min = -((mb_row * 16) + (17 - 2 * INTERP_EXTEND));
+ cpi->mb.mv_row_max = ((cpi->common.mb_rows - 1 - mb_row) * 16)
+ + (17 - 2 * INTERP_EXTEND);
+#endif
+
+ for (mb_col = 0; mb_col < mb_cols; mb_col++) {
+ int i, j, k;
+ int stride;
+
+ vpx_memset(accumulator, 0, 384 * sizeof(unsigned int));
+ vpx_memset(count, 0, 384 * sizeof(unsigned short));
+
+#if ALT_REF_MC_ENABLED
+ cpi->mb.mv_col_min = -((mb_col * 16) + (17 - 2 * INTERP_EXTEND));
+ cpi->mb.mv_col_max = ((cpi->common.mb_cols - 1 - mb_col) * 16)
+ + (17 - 2 * INTERP_EXTEND);
+#endif
+
+ for (frame = 0; frame < frame_count; frame++) {
+ if (cpi->frames[frame] == NULL)
+ continue;
+
+ mbd->block[0].bmi.as_mv.first.as_mv.row = 0;
+ mbd->block[0].bmi.as_mv.first.as_mv.col = 0;
+
+ if (frame == alt_ref_index) {
+ filter_weight = 2;
+ } else {
+ int err = 0;
+#if ALT_REF_MC_ENABLED
+#define THRESH_LOW 10000
+#define THRESH_HIGH 20000
+
+ // Find best match in this frame by MC
+ err = temporal_filter_find_matching_mb_c
+ (cpi,
+ cpi->frames[alt_ref_index],
+ cpi->frames[frame],
+ mb_y_offset,
+ THRESH_LOW);
+#endif
+          // Assign a higher weight to the matching MB if its error
+          // score is lower. If MC is not applied, the default behavior
+          // is to weight all MBs equally.
+ filter_weight = err < THRESH_LOW
+ ? 2 : err < THRESH_HIGH ? 1 : 0;
+ }
+
+ if (filter_weight != 0) {
+ // Construct the predictors
+ temporal_filter_predictors_mb_c
+ (mbd,
+ cpi->frames[frame]->y_buffer + mb_y_offset,
+ cpi->frames[frame]->u_buffer + mb_uv_offset,
+ cpi->frames[frame]->v_buffer + mb_uv_offset,
+ cpi->frames[frame]->y_stride,
+ mbd->block[0].bmi.as_mv.first.as_mv.row,
+ mbd->block[0].bmi.as_mv.first.as_mv.col,
+ predictor);
+
+ // Apply the filter (YUV)
+ TEMPORAL_INVOKE(&cpi->rtcd.temporal, apply)
+ (f->y_buffer + mb_y_offset,
+ f->y_stride,
+ predictor,
+ 16,
+ strength,
+ filter_weight,
+ accumulator,
+ count);
+
+ TEMPORAL_INVOKE(&cpi->rtcd.temporal, apply)
+ (f->u_buffer + mb_uv_offset,
+ f->uv_stride,
+ predictor + 256,
+ 8,
+ strength,
+ filter_weight,
+ accumulator + 256,
+ count + 256);
+
+ TEMPORAL_INVOKE(&cpi->rtcd.temporal, apply)
+ (f->v_buffer + mb_uv_offset,
+ f->uv_stride,
+ predictor + 320,
+ 8,
+ strength,
+ filter_weight,
+ accumulator + 320,
+ count + 320);
+ }
+ }
+
+ // Normalize filter output to produce AltRef frame
+ dst1 = cpi->alt_ref_buffer.y_buffer;
+ stride = cpi->alt_ref_buffer.y_stride;
+ byte = mb_y_offset;
+ for (i = 0, k = 0; i < 16; i++) {
+ for (j = 0; j < 16; j++, k++) {
+ unsigned int pval = accumulator[k] + (count[k] >> 1);
+ pval *= cpi->fixed_divide[count[k]];
+ pval >>= 19;
+
+ dst1[byte] = (unsigned char)pval;
+
+ // move to next pixel
+ byte++;
+ }
+
+ byte += stride - 16;
+ }
+
+ dst1 = cpi->alt_ref_buffer.u_buffer;
+ dst2 = cpi->alt_ref_buffer.v_buffer;
+ stride = cpi->alt_ref_buffer.uv_stride;
+ byte = mb_uv_offset;
+ for (i = 0, k = 256; i < 8; i++) {
+ for (j = 0; j < 8; j++, k++) {
+ int m = k + 64;
+
+ // U
+ unsigned int pval = accumulator[k] + (count[k] >> 1);
+ pval *= cpi->fixed_divide[count[k]];
+ pval >>= 19;
+ dst1[byte] = (unsigned char)pval;
+
+ // V
+ pval = accumulator[m] + (count[m] >> 1);
+ pval *= cpi->fixed_divide[count[m]];
+ pval >>= 19;
+ dst2[byte] = (unsigned char)pval;
+
+ // move to next pixel
+ byte++;
+ }
+
+ byte += stride - 8;
+ }
+
+ mb_y_offset += 16;
+ mb_uv_offset += 8;
+ }
+
+ mb_y_offset += 16 * (f->y_stride - mb_cols);
+ mb_uv_offset += 8 * (f->uv_stride - mb_cols);
+ }
+
+ // Restore input state
+ mbd->pre.y_buffer = y_buffer;
+ mbd->pre.u_buffer = u_buffer;
+ mbd->pre.v_buffer = v_buffer;
+}
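
The normalization step inside the loop above turns each accumulated pair
back into a pixel via a rounded division by count, done with a precomputed
reciprocal. Assuming cpi->fixed_divide[i] holds (1 << 19) / i (it is
initialized elsewhere in the encoder, outside this hunk), the expression
reduces to:

    /*   pval = (accumulator[k] + count[k] / 2) * fixed_divide[count[k]] >> 19
     *        ~= (accumulator[k] + count[k] / 2) / count[k]
     *        =  accumulator[k] / count[k], rounded to nearest.            */
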
+
+void vp9_temporal_filter_prepare_c
+(
+ VP9_COMP *cpi,
+ int distance
+) {
+ int frame = 0;
+
+ int num_frames_backward = 0;
+ int num_frames_forward = 0;
+ int frames_to_blur_backward = 0;
+ int frames_to_blur_forward = 0;
+ int frames_to_blur = 0;
+ int start_frame = 0;
+
+ int strength = cpi->oxcf.arnr_strength;
+
+ int blur_type = cpi->oxcf.arnr_type;
+
+ int max_frames = cpi->active_arnr_frames;
+
+ num_frames_backward = distance;
+ num_frames_forward = vp9_lookahead_depth(cpi->lookahead)
+ - (num_frames_backward + 1);
+
+ switch (blur_type) {
+ case 1:
+ /////////////////////////////////////////
+ // Backward Blur
+
+ frames_to_blur_backward = num_frames_backward;
+
+ if (frames_to_blur_backward >= max_frames)
+ frames_to_blur_backward = max_frames - 1;
+
+ frames_to_blur = frames_to_blur_backward + 1;
+ break;
+
+ case 2:
+ /////////////////////////////////////////
+ // Forward Blur
+
+ frames_to_blur_forward = num_frames_forward;
+
+ if (frames_to_blur_forward >= max_frames)
+ frames_to_blur_forward = max_frames - 1;
+
+ frames_to_blur = frames_to_blur_forward + 1;
+ break;
+
+ case 3:
+ default:
+ /////////////////////////////////////////
+ // Center Blur
+ frames_to_blur_forward = num_frames_forward;
+ frames_to_blur_backward = num_frames_backward;
+
+ if (frames_to_blur_forward > frames_to_blur_backward)
+ frames_to_blur_forward = frames_to_blur_backward;
+
+ if (frames_to_blur_backward > frames_to_blur_forward)
+ frames_to_blur_backward = frames_to_blur_forward;
+
+ // When max_frames is even we have 1 more frame backward than forward
+ if (frames_to_blur_forward > (max_frames - 1) / 2)
+ frames_to_blur_forward = ((max_frames - 1) / 2);
+
+ if (frames_to_blur_backward > (max_frames / 2))
+ frames_to_blur_backward = (max_frames / 2);
+
+ frames_to_blur = frames_to_blur_backward + frames_to_blur_forward + 1;
+ break;
+ }
+
+ start_frame = distance + frames_to_blur_forward;
+
+#ifdef DEBUGFWG
+ // DEBUG FWG
+ printf("max:%d FBCK:%d FFWD:%d ftb:%d ftbbck:%d ftbfwd:%d sei:%d lasei:%d start:%d"
+, max_frames
+, num_frames_backward
+, num_frames_forward
+, frames_to_blur
+, frames_to_blur_backward
+, frames_to_blur_forward
+, cpi->source_encode_index
+, cpi->last_alt_ref_sei
+, start_frame);
+#endif
+
+ // Setup frame pointers, NULL indicates frame not included in filter
+ vpx_memset(cpi->frames, 0, max_frames * sizeof(YV12_BUFFER_CONFIG *));
+ for (frame = 0; frame < frames_to_blur; frame++) {
+ int which_buffer = start_frame - frame;
+ struct lookahead_entry *buf = vp9_lookahead_peek(cpi->lookahead,
+ which_buffer);
+ cpi->frames[frames_to_blur - 1 - frame] = &buf->img;
+ }
+
+ temporal_filter_iterate_c(
+ cpi,
+ frames_to_blur,
+ frames_to_blur_backward,
+ strength);
+}
+#endif
--- /dev/null
+++ b/vp9/encoder/temporal_filter.h
@@ -1,0 +1,47 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef __INC_TEMPORAL_FILTER_H
+#define __INC_TEMPORAL_FILTER_H
+
+#define prototype_apply(sym)\
+ void (sym) \
+ ( \
+ unsigned char *frame1, \
+ unsigned int stride, \
+ unsigned char *frame2, \
+ unsigned int block_size, \
+ int strength, \
+ int filter_weight, \
+ unsigned int *accumulator, \
+ unsigned short *count \
+ )
+
+#if ARCH_X86 || ARCH_X86_64
+#include "x86/temporal_filter_x86.h"
+#endif
+
+#ifndef vp9_temporal_filter_apply
+#define vp9_temporal_filter_apply vp9_temporal_filter_apply_c
+#endif
+extern prototype_apply(vp9_temporal_filter_apply);
+
+typedef struct {
+ prototype_apply(*apply);
+} vp9_temporal_rtcd_vtable_t;
+
+#if CONFIG_RUNTIME_CPU_DETECT
+#define TEMPORAL_INVOKE(ctx,fn) (ctx)->fn
+#else
+#define TEMPORAL_INVOKE(ctx,fn) vp9_temporal_filter_##fn
+#endif
+
+#endif // __INC_TEMPORAL_FILTER_H
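
The vtable and macro above follow the usual runtime-CPU-detection dispatch
pattern: with CONFIG_RUNTIME_CPU_DETECT the call goes through the
per-context function pointer, otherwise the macro collapses at compile time
to vp9_temporal_filter_apply, which defaults to the C implementation in
temporal_filter.c unless an arch-specific header overrides the #define. The
apply calls in temporal_filter.c therefore expand roughly as:

    /* CONFIG_RUNTIME_CPU_DETECT = 1:
     *   TEMPORAL_INVOKE(&cpi->rtcd.temporal, apply)(args...)
     *     -> (&cpi->rtcd.temporal)->apply(args...)
     * CONFIG_RUNTIME_CPU_DETECT = 0:
     *   TEMPORAL_INVOKE(&cpi->rtcd.temporal, apply)(args...)
     *     -> vp9_temporal_filter_apply(args...)
     *     -> vp9_temporal_filter_apply_c(args...)                        */
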
--- /dev/null
+++ b/vp9/encoder/tokenize.c
@@ -1,0 +1,868 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include <math.h>
+#include <stdio.h>
+#include <string.h>
+#include <assert.h>
+#include "onyx_int.h"
+#include "tokenize.h"
+#include "vpx_mem/vpx_mem.h"
+
+#include "vp9/common/pred_common.h"
+#include "vp9/common/seg_common.h"
+#include "vp9/common/entropy.h"
+
+/* Global event counters used for accumulating statistics across several
+ compressions, then generating context.c = initial stats. */
+
+#ifdef ENTROPY_STATS
+INT64 context_counters[BLOCK_TYPES] [COEF_BANDS] [PREV_COEF_CONTEXTS] [MAX_ENTROPY_TOKENS];
+INT64 hybrid_context_counters[BLOCK_TYPES] [COEF_BANDS] [PREV_COEF_CONTEXTS] [MAX_ENTROPY_TOKENS];
+
+INT64 context_counters_8x8[BLOCK_TYPES_8X8] [COEF_BANDS] [PREV_COEF_CONTEXTS] [MAX_ENTROPY_TOKENS];
+INT64 hybrid_context_counters_8x8[BLOCK_TYPES_8X8] [COEF_BANDS] [PREV_COEF_CONTEXTS] [MAX_ENTROPY_TOKENS];
+
+INT64 context_counters_16x16[BLOCK_TYPES_16X16] [COEF_BANDS] [PREV_COEF_CONTEXTS] [MAX_ENTROPY_TOKENS];
+INT64 hybrid_context_counters_16x16[BLOCK_TYPES_16X16] [COEF_BANDS] [PREV_COEF_CONTEXTS] [MAX_ENTROPY_TOKENS];
+
+extern unsigned int tree_update_hist[BLOCK_TYPES][COEF_BANDS]
+ [PREV_COEF_CONTEXTS][ENTROPY_NODES][2];
+extern unsigned int hybrid_tree_update_hist[BLOCK_TYPES][COEF_BANDS]
+ [PREV_COEF_CONTEXTS][ENTROPY_NODES][2];
+extern unsigned int tree_update_hist_8x8[BLOCK_TYPES_8X8][COEF_BANDS]
+ [PREV_COEF_CONTEXTS][ENTROPY_NODES] [2];
+extern unsigned int hybrid_tree_update_hist_8x8[BLOCK_TYPES_8X8][COEF_BANDS]
+ [PREV_COEF_CONTEXTS][ENTROPY_NODES] [2];
+extern unsigned int tree_update_hist_16x16[BLOCK_TYPES_16X16][COEF_BANDS]
+ [PREV_COEF_CONTEXTS][ENTROPY_NODES] [2];
+extern unsigned int hybrid_tree_update_hist_16x16[BLOCK_TYPES_16X16][COEF_BANDS]
+ [PREV_COEF_CONTEXTS][ENTROPY_NODES] [2];
+#endif /* ENTROPY_STATS */
+
+void vp9_stuff_mb(VP9_COMP *cpi, MACROBLOCKD *xd, TOKENEXTRA **t, int dry_run);
+void vp9_fix_contexts(MACROBLOCKD *xd);
+
+static TOKENVALUE dct_value_tokens[DCT_MAX_VALUE * 2];
+const TOKENVALUE *vp9_dct_value_tokens_ptr;
+static int dct_value_cost[DCT_MAX_VALUE * 2];
+const int *vp9_dct_value_cost_ptr;
+
+static void fill_value_tokens() {
+
+ TOKENVALUE *const t = dct_value_tokens + DCT_MAX_VALUE;
+ vp9_extra_bit_struct *const e = vp9_extra_bits;
+
+ int i = -DCT_MAX_VALUE;
+ int sign = 1;
+
+ do {
+ if (!i)
+ sign = 0;
+
+ {
+ const int a = sign ? -i : i;
+ int eb = sign;
+
+ if (a > 4) {
+ int j = 4;
+
+ while (++j < 11 && e[j].base_val <= a) {}
+
+ t[i].Token = --j;
+ eb |= (a - e[j].base_val) << 1;
+ } else
+ t[i].Token = a;
+
+ t[i].Extra = eb;
+ }
+
+    // initialize the cost for extra bits for all possible coefficient values.
+ {
+ int cost = 0;
+ vp9_extra_bit_struct *p = vp9_extra_bits + t[i].Token;
+
+ if (p->base_val) {
+ const int extra = t[i].Extra;
+ const int Length = p->Len;
+
+ if (Length)
+ cost += treed_cost(p->tree, p->prob, extra >> 1, Length);
+
+ cost += vp9_cost_bit(vp9_prob_half, extra & 1); /* sign */
+ dct_value_cost[i + DCT_MAX_VALUE] = cost;
+ }
+
+ }
+
+ } while (++i < DCT_MAX_VALUE);
+
+ vp9_dct_value_tokens_ptr = dct_value_tokens + DCT_MAX_VALUE;
+ vp9_dct_value_cost_ptr = dct_value_cost + DCT_MAX_VALUE;
+}
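
fill_value_tokens precomputes, for every possible quantized coefficient
value, the token that codes it plus an Extra field that packs the sign into
bit 0 and the remainder above the token's base value into the higher bits.
A small worked example (the actual base values come from vp9_extra_bits in
the common entropy code, so the base_val of 7 here is only an assumption
for illustration):

    /* For v = -9: a = 9, sign = 1. If the matching token's base_val is 7,
     *   Extra = sign | ((a - base_val) << 1) = 1 | (2 << 1) = 5
     * and the cached cost adds the extra-bit tree cost of the remainder 2
     * plus one sign bit coded at probability 1/2 (vp9_prob_half).        */
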
+
+static void tokenize_b(VP9_COMP *cpi,
+ MACROBLOCKD *xd,
+ const BLOCKD * const b,
+ TOKENEXTRA **tp,
+ PLANE_TYPE type,
+ ENTROPY_CONTEXT *a,
+ ENTROPY_CONTEXT *l,
+ TX_SIZE tx_size,
+ int dry_run) {
+ int pt; /* near block/prev token context index */
+ int c = (type == PLANE_TYPE_Y_NO_DC) ? 1 : 0;
+ const int eob = b->eob; /* one beyond last nonzero coeff */
+ TOKENEXTRA *t = *tp; /* store tokens starting here */
+ const short *qcoeff_ptr = b->qcoeff;
+ int seg_eob;
+ int segment_id = xd->mode_info_context->mbmi.segment_id;
+ const int *bands, *scan;
+ unsigned int (*counts)[COEF_BANDS][PREV_COEF_CONTEXTS][MAX_ENTROPY_TOKENS];
+ vp9_prob (*probs)[COEF_BANDS][PREV_COEF_CONTEXTS][ENTROPY_NODES];
+ const TX_TYPE tx_type = (type == PLANE_TYPE_Y_WITH_DC) ?
+ get_tx_type(xd, b) : DCT_DCT;
+
+ VP9_COMBINEENTROPYCONTEXTS(pt, *a, *l);
+ switch (tx_size) {
+ default:
+ case TX_4X4:
+ seg_eob = 16;
+ bands = vp9_coef_bands;
+ scan = vp9_default_zig_zag1d;
+ if (tx_type != DCT_DCT) {
+ counts = cpi->hybrid_coef_counts;
+ probs = cpi->common.fc.hybrid_coef_probs;
+ if (tx_type == ADST_DCT) {
+ scan = vp9_row_scan;
+ } else if (tx_type == DCT_ADST) {
+ scan = vp9_col_scan;
+ }
+ } else {
+ counts = cpi->coef_counts;
+ probs = cpi->common.fc.coef_probs;
+ }
+ break;
+ case TX_8X8:
+ if (type == PLANE_TYPE_Y2) {
+ seg_eob = 4;
+ bands = vp9_coef_bands;
+ scan = vp9_default_zig_zag1d;
+ } else {
+ seg_eob = 64;
+ bands = vp9_coef_bands_8x8;
+ scan = vp9_default_zig_zag1d_8x8;
+ }
+ if (tx_type != DCT_DCT) {
+ counts = cpi->hybrid_coef_counts_8x8;
+ probs = cpi->common.fc.hybrid_coef_probs_8x8;
+ } else {
+ counts = cpi->coef_counts_8x8;
+ probs = cpi->common.fc.coef_probs_8x8;
+ }
+ break;
+ case TX_16X16:
+ seg_eob = 256;
+ bands = vp9_coef_bands_16x16;
+ scan = vp9_default_zig_zag1d_16x16;
+ if (tx_type != DCT_DCT) {
+ counts = cpi->hybrid_coef_counts_16x16;
+ probs = cpi->common.fc.hybrid_coef_probs_16x16;
+ } else {
+ counts = cpi->coef_counts_16x16;
+ probs = cpi->common.fc.coef_probs_16x16;
+ }
+ break;
+ }
+
+ if (vp9_segfeature_active(xd, segment_id, SEG_LVL_EOB))
+ seg_eob = vp9_get_segdata(xd, segment_id, SEG_LVL_EOB);
+
+ do {
+ const int band = bands[c];
+ int token;
+
+ if (c < eob) {
+ const int rc = scan[c];
+ const int v = qcoeff_ptr[rc];
+
+ assert(-DCT_MAX_VALUE <= v && v < DCT_MAX_VALUE);
+
+ t->Extra = vp9_dct_value_tokens_ptr[v].Extra;
+ token = vp9_dct_value_tokens_ptr[v].Token;
+ } else {
+ token = DCT_EOB_TOKEN;
+ }
+
+ t->Token = token;
+ t->context_tree = probs[type][band][pt];
+ t->skip_eob_node = (pt == 0) && ((band > 0 && type != PLANE_TYPE_Y_NO_DC) ||
+ (band > 1 && type == PLANE_TYPE_Y_NO_DC));
+ assert(vp9_coef_encodings[t->Token].Len - t->skip_eob_node > 0);
+ if (!dry_run) {
+ ++counts[type][band][pt][token];
+ }
+ pt = vp9_prev_token_class[token];
+ ++t;
+ } while (c < eob && ++c < seg_eob);
+
+ *tp = t;
+ *a = *l = (c != !type); /* 0 <-> all coeff data is zero */
+}
+
+int vp9_mby_is_skippable_4x4(MACROBLOCKD *xd, int has_y2_block) {
+ int skip = 1;
+ int i = 0;
+
+ if (has_y2_block) {
+ for (i = 0; i < 16; i++)
+ skip &= (xd->block[i].eob < 2);
+ skip &= (!xd->block[24].eob);
+ } else {
+ for (i = 0; i < 16; i++)
+ skip &= (!xd->block[i].eob);
+ }
+ return skip;
+}
+
+int vp9_mbuv_is_skippable_4x4(MACROBLOCKD *xd) {
+ int skip = 1;
+ int i;
+
+ for (i = 16; i < 24; i++)
+ skip &= (!xd->block[i].eob);
+ return skip;
+}
+
+static int mb_is_skippable_4x4(MACROBLOCKD *xd, int has_y2_block) {
+ return (vp9_mby_is_skippable_4x4(xd, has_y2_block) &
+ vp9_mbuv_is_skippable_4x4(xd));
+}
+
+int vp9_mby_is_skippable_8x8(MACROBLOCKD *xd, int has_y2_block) {
+ int skip = 1;
+ int i = 0;
+
+ if (has_y2_block) {
+ for (i = 0; i < 16; i += 4)
+ skip &= (xd->block[i].eob < 2);
+ skip &= (!xd->block[24].eob);
+ } else {
+ for (i = 0; i < 16; i += 4)
+ skip &= (!xd->block[i].eob);
+ }
+ return skip;
+}
+
+int vp9_mbuv_is_skippable_8x8(MACROBLOCKD *xd) {
+ return (!xd->block[16].eob) & (!xd->block[20].eob);
+}
+
+static int mb_is_skippable_8x8(MACROBLOCKD *xd, int has_y2_block) {
+ return (vp9_mby_is_skippable_8x8(xd, has_y2_block) &
+ vp9_mbuv_is_skippable_8x8(xd));
+}
+
+static int mb_is_skippable_8x8_4x4uv(MACROBLOCKD *xd, int has_y2_block) {
+ return (vp9_mby_is_skippable_8x8(xd, has_y2_block) &
+ vp9_mbuv_is_skippable_4x4(xd));
+}
+
+int vp9_mby_is_skippable_16x16(MACROBLOCKD *xd) {
+ int skip = 1;
+ skip &= !xd->block[0].eob;
+ return skip;
+}
+
+static int mb_is_skippable_16x16(MACROBLOCKD *xd) {
+ return (vp9_mby_is_skippable_16x16(xd) & vp9_mbuv_is_skippable_8x8(xd));
+}
+
+void vp9_tokenize_mb(VP9_COMP *cpi,
+ MACROBLOCKD *xd,
+ TOKENEXTRA **t,
+ int dry_run) {
+ PLANE_TYPE plane_type;
+ int has_y2_block;
+ int b;
+ int tx_size = xd->mode_info_context->mbmi.txfm_size;
+ int mb_skip_context = vp9_get_pred_context(&cpi->common, xd, PRED_MBSKIP);
+ TOKENEXTRA *t_backup = *t;
+ ENTROPY_CONTEXT * A = (ENTROPY_CONTEXT *) xd->above_context;
+ ENTROPY_CONTEXT * L = (ENTROPY_CONTEXT *) xd->left_context;
+
+ // If the MB is going to be skipped because of a segment level flag
+ // exclude this from the skip count stats used to calculate the
+ // transmitted skip probability;
+ int skip_inc;
+ int segment_id = xd->mode_info_context->mbmi.segment_id;
+
+ if (!vp9_segfeature_active(xd, segment_id, SEG_LVL_EOB) ||
+ (vp9_get_segdata(xd, segment_id, SEG_LVL_EOB) != 0)) {
+ skip_inc = 1;
+ } else
+ skip_inc = 0;
+
+ has_y2_block = (tx_size != TX_16X16
+ && xd->mode_info_context->mbmi.mode != B_PRED
+ && xd->mode_info_context->mbmi.mode != I8X8_PRED
+ && xd->mode_info_context->mbmi.mode != SPLITMV);
+
+ switch (tx_size) {
+ case TX_16X16:
+ xd->mode_info_context->mbmi.mb_skip_coeff = mb_is_skippable_16x16(xd);
+ break;
+ case TX_8X8:
+ if (xd->mode_info_context->mbmi.mode == I8X8_PRED ||
+ xd->mode_info_context->mbmi.mode == SPLITMV)
+ xd->mode_info_context->mbmi.mb_skip_coeff = mb_is_skippable_8x8_4x4uv(xd, 0);
+ else
+ xd->mode_info_context->mbmi.mb_skip_coeff = mb_is_skippable_8x8(xd, has_y2_block);
+ break;
+
+ default:
+ xd->mode_info_context->mbmi.mb_skip_coeff = mb_is_skippable_4x4(xd, has_y2_block);
+ break;
+ }
+
+ if (xd->mode_info_context->mbmi.mb_skip_coeff) {
+ if (!dry_run)
+ cpi->skip_true_count[mb_skip_context] += skip_inc;
+ if (!cpi->common.mb_no_coeff_skip) {
+ vp9_stuff_mb(cpi, xd, t, dry_run);
+ } else {
+ vp9_fix_contexts(xd);
+ }
+ if (dry_run)
+ *t = t_backup;
+ return;
+ }
+
+ if (!dry_run)
+ cpi->skip_false_count[mb_skip_context] += skip_inc;
+
+ if (has_y2_block) {
+ if (tx_size == TX_8X8) {
+ tokenize_b(cpi, xd, xd->block + 24, t, PLANE_TYPE_Y2,
+ A + vp9_block2above_8x8[24], L + vp9_block2left_8x8[24],
+ TX_8X8, dry_run);
+ } else {
+ tokenize_b(cpi, xd, xd->block + 24, t, PLANE_TYPE_Y2,
+ A + vp9_block2above[24], L + vp9_block2left[24],
+ TX_4X4, dry_run);
+ }
+
+ plane_type = PLANE_TYPE_Y_NO_DC;
+ } else
+ plane_type = PLANE_TYPE_Y_WITH_DC;
+
+ if (tx_size == TX_16X16) {
+ tokenize_b(cpi, xd, xd->block, t, PLANE_TYPE_Y_WITH_DC,
+ A, L, TX_16X16, dry_run);
+ A[1] = A[2] = A[3] = A[0];
+ L[1] = L[2] = L[3] = L[0];
+
+ for (b = 16; b < 24; b += 4) {
+ tokenize_b(cpi, xd, xd->block + b, t, PLANE_TYPE_UV,
+ A + vp9_block2above_8x8[b], L + vp9_block2left_8x8[b],
+ TX_8X8, dry_run);
+ A[vp9_block2above_8x8[b] + 1] = A[vp9_block2above_8x8[b]];
+ L[vp9_block2left_8x8[b] + 1] = L[vp9_block2left_8x8[b]];
+ }
+ vpx_memset(&A[8], 0, sizeof(A[8]));
+ vpx_memset(&L[8], 0, sizeof(L[8]));
+ } else if (tx_size == TX_8X8) {
+ for (b = 0; b < 16; b += 4) {
+ tokenize_b(cpi, xd, xd->block + b, t, plane_type,
+ A + vp9_block2above_8x8[b], L + vp9_block2left_8x8[b],
+ TX_8X8, dry_run);
+ A[vp9_block2above_8x8[b] + 1] = A[vp9_block2above_8x8[b]];
+ L[vp9_block2left_8x8[b] + 1] = L[vp9_block2left_8x8[b]];
+ }
+ if (xd->mode_info_context->mbmi.mode == I8X8_PRED ||
+ xd->mode_info_context->mbmi.mode == SPLITMV) {
+ for (b = 16; b < 24; b++) {
+ tokenize_b(cpi, xd, xd->block + b, t, PLANE_TYPE_UV,
+ A + vp9_block2above[b], L + vp9_block2left[b],
+ TX_4X4, dry_run);
+ }
+ } else {
+ for (b = 16; b < 24; b += 4) {
+ tokenize_b(cpi, xd, xd->block + b, t, PLANE_TYPE_UV,
+ A + vp9_block2above_8x8[b], L + vp9_block2left_8x8[b],
+ TX_8X8, dry_run);
+ A[vp9_block2above_8x8[b] + 1] = A[vp9_block2above_8x8[b]];
+ L[vp9_block2left_8x8[b] + 1] = L[vp9_block2left_8x8[b]];
+ }
+ }
+ } else {
+ for (b = 0; b < 16; b++) {
+ tokenize_b(cpi, xd, xd->block + b, t, plane_type,
+ A + vp9_block2above[b], L + vp9_block2left[b],
+ TX_4X4, dry_run);
+ }
+
+ for (b = 16; b < 24; b++) {
+ tokenize_b(cpi, xd, xd->block + b, t, PLANE_TYPE_UV,
+ A + vp9_block2above[b], L + vp9_block2left[b],
+ TX_4X4, dry_run);
+ }
+ }
+ if (dry_run)
+ *t = t_backup;
+}
+
+
+#ifdef ENTROPY_STATS
+void init_context_counters(void) {
+ FILE *f = fopen("context.bin", "rb");
+ if (!f) {
+ vpx_memset(context_counters, 0, sizeof(context_counters));
+ vpx_memset(context_counters_8x8, 0, sizeof(context_counters_8x8));
+ vpx_memset(context_counters_16x16, 0, sizeof(context_counters_16x16));
+ } else {
+ fread(context_counters, sizeof(context_counters), 1, f);
+ fread(context_counters_8x8, sizeof(context_counters_8x8), 1, f);
+ fread(context_counters_16x16, sizeof(context_counters_16x16), 1, f);
+ fclose(f);
+ }
+
+ f = fopen("treeupdate.bin", "rb");
+ if (!f) {
+ vpx_memset(tree_update_hist, 0, sizeof(tree_update_hist));
+ vpx_memset(tree_update_hist_8x8, 0, sizeof(tree_update_hist_8x8));
+ vpx_memset(tree_update_hist_16x16, 0, sizeof(tree_update_hist_16x16));
+ } else {
+ fread(tree_update_hist, sizeof(tree_update_hist), 1, f);
+ fread(tree_update_hist_8x8, sizeof(tree_update_hist_8x8), 1, f);
+ fread(tree_update_hist_16x16, sizeof(tree_update_hist_16x16), 1, f);
+ fclose(f);
+ }
+}
+
+void print_context_counters() {
+ int type, band, pt, t;
+ FILE *f = fopen("context.c", "w");
+
+ fprintf(f, "#include \"entropy.h\"\n");
+ fprintf(f, "\n/* *** GENERATED FILE: DO NOT EDIT *** */\n\n");
+ fprintf(f, "static const unsigned int\n"
+ "vp9_default_coef_counts[BLOCK_TYPES]\n"
+ " [COEF_BANDS]\n"
+ " [PREV_COEF_CONTEXTS]\n"
+ " [MAX_ENTROPY_TOKENS]={\n");
+
+# define Comma( X) (X? ",":"")
+ type = 0;
+ do {
+ fprintf(f, "%s\n { /* block Type %d */", Comma(type), type);
+ band = 0;
+ do {
+ fprintf(f, "%s\n { /* Coeff Band %d */", Comma(band), band);
+ pt = 0;
+ do {
+ fprintf(f, "%s\n {", Comma(pt));
+
+ t = 0;
+ do {
+ const INT64 x = context_counters [type] [band] [pt] [t];
+ const int y = (int) x;
+ assert(x == (INT64) y); /* no overflow handling yet */
+ fprintf(f, "%s %d", Comma(t), y);
+ } while (++t < MAX_ENTROPY_TOKENS);
+ fprintf(f, "}");
+ } while (++pt < PREV_COEF_CONTEXTS);
+ fprintf(f, "\n }");
+ } while (++band < COEF_BANDS);
+ fprintf(f, "\n }");
+ } while (++type < BLOCK_TYPES);
+ fprintf(f, "\n};\n");
+
+ fprintf(f, "static const unsigned int\nvp9_default_coef_counts_8x8"
+ "[BLOCK_TYPES_8X8] [COEF_BANDS]"
+ "[PREV_COEF_CONTEXTS] [MAX_ENTROPY_TOKENS] = {");
+ type = 0;
+ do {
+ fprintf(f, "%s\n { /* block Type %d */", Comma(type), type);
+ band = 0;
+ do {
+ fprintf(f, "%s\n { /* Coeff Band %d */", Comma(band), band);
+ pt = 0;
+ do {
+ fprintf(f, "%s\n {", Comma(pt));
+ t = 0;
+ do {
+ const INT64 x = context_counters_8x8 [type] [band] [pt] [t];
+ const int y = (int) x;
+
+ assert(x == (INT64) y); /* no overflow handling yet */
+ fprintf(f, "%s %d", Comma(t), y);
+
+ } while (++t < MAX_ENTROPY_TOKENS);
+
+ fprintf(f, "}");
+ } while (++pt < PREV_COEF_CONTEXTS);
+
+ fprintf(f, "\n }");
+
+ } while (++band < COEF_BANDS);
+
+ fprintf(f, "\n }");
+ } while (++type < BLOCK_TYPES_8X8);
+ fprintf(f, "\n};\n");
+
+ fprintf(f, "static const unsigned int\nvp9_default_coef_counts_16x16"
+ "[BLOCK_TYPES_16X16] [COEF_BANDS]"
+ "[PREV_COEF_CONTEXTS] [MAX_ENTROPY_TOKENS] = {");
+ type = 0;
+ do {
+ fprintf(f, "%s\n { /* block Type %d */", Comma(type), type);
+ band = 0;
+ do {
+ fprintf(f, "%s\n { /* Coeff Band %d */", Comma(band), band);
+ pt = 0;
+ do {
+ fprintf(f, "%s\n {", Comma(pt));
+ t = 0;
+ do {
+ const INT64 x = context_counters_16x16 [type] [band] [pt] [t];
+ const int y = (int) x;
+
+ assert(x == (INT64) y); /* no overflow handling yet */
+ fprintf(f, "%s %d", Comma(t), y);
+
+ } while (++t < MAX_ENTROPY_TOKENS);
+
+ fprintf(f, "}");
+ } while (++pt < PREV_COEF_CONTEXTS);
+
+ fprintf(f, "\n }");
+
+ } while (++band < COEF_BANDS);
+
+ fprintf(f, "\n }");
+ } while (++type < BLOCK_TYPES_16X16);
+ fprintf(f, "\n};\n");
+
+ fprintf(f, "static const vp9_prob\n"
+ "vp9_default_coef_probs[BLOCK_TYPES] [COEF_BANDS] \n"
+ "[PREV_COEF_CONTEXTS] [ENTROPY_NODES] = {");
+ type = 0;
+ do {
+ fprintf(f, "%s\n { /* block Type %d */", Comma(type), type);
+ band = 0;
+ do {
+ fprintf(f, "%s\n { /* Coeff Band %d */", Comma(band), band);
+ pt = 0;
+ do {
+ unsigned int branch_ct [ENTROPY_NODES] [2];
+ unsigned int coef_counts[MAX_ENTROPY_TOKENS];
+ vp9_prob coef_probs[ENTROPY_NODES];
+ for (t = 0; t < MAX_ENTROPY_TOKENS; ++t)
+ coef_counts[t] = context_counters [type] [band] [pt] [t];
+ vp9_tree_probs_from_distribution(
+ MAX_ENTROPY_TOKENS, vp9_coef_encodings, vp9_coef_tree,
+ coef_probs, branch_ct, coef_counts, 256, 1);
+ fprintf(f, "%s\n {", Comma(pt));
+
+ t = 0;
+ do {
+ fprintf(f, "%s %d", Comma(t), coef_probs[t]);
+
+ } while (++t < ENTROPY_NODES);
+
+ fprintf(f, "}");
+ } while (++pt < PREV_COEF_CONTEXTS);
+ fprintf(f, "\n }");
+ } while (++band < COEF_BANDS);
+ fprintf(f, "\n }");
+ } while (++type < BLOCK_TYPES);
+ fprintf(f, "\n};\n");
+
+ fprintf(f, "static const vp9_prob\n"
+ "vp9_default_coef_probs_8x8[BLOCK_TYPES_8X8] [COEF_BANDS]\n"
+ "[PREV_COEF_CONTEXTS] [ENTROPY_NODES] = {");
+ type = 0;
+ do {
+ fprintf(f, "%s\n { /* block Type %d */", Comma(type), type);
+ band = 0;
+ do {
+ fprintf(f, "%s\n { /* Coeff Band %d */", Comma(band), band);
+ pt = 0;
+ do {
+ unsigned int branch_ct [ENTROPY_NODES] [2];
+ unsigned int coef_counts[MAX_ENTROPY_TOKENS];
+ vp9_prob coef_probs[ENTROPY_NODES];
+ for (t = 0; t < MAX_ENTROPY_TOKENS; ++t)
+ coef_counts[t] = context_counters_8x8[type] [band] [pt] [t];
+ vp9_tree_probs_from_distribution(
+ MAX_ENTROPY_TOKENS, vp9_coef_encodings, vp9_coef_tree,
+ coef_probs, branch_ct, coef_counts, 256, 1);
+ fprintf(f, "%s\n {", Comma(pt));
+
+ t = 0;
+ do {
+ fprintf(f, "%s %d", Comma(t), coef_probs[t]);
+ } while (++t < ENTROPY_NODES);
+ fprintf(f, "}");
+ } while (++pt < PREV_COEF_CONTEXTS);
+ fprintf(f, "\n }");
+ } while (++band < COEF_BANDS);
+ fprintf(f, "\n }");
+ } while (++type < BLOCK_TYPES_8X8);
+ fprintf(f, "\n};\n");
+
+ fprintf(f, "static const vp9_prob\n"
+ "vp9_default_coef_probs_16x16[BLOCK_TYPES_16X16] [COEF_BANDS]\n"
+ "[PREV_COEF_CONTEXTS] [ENTROPY_NODES] = {");
+ type = 0;
+ do {
+ fprintf(f, "%s\n { /* block Type %d */", Comma(type), type);
+ band = 0;
+ do {
+ fprintf(f, "%s\n { /* Coeff Band %d */", Comma(band), band);
+ pt = 0;
+ do {
+ unsigned int branch_ct [ENTROPY_NODES] [2];
+ unsigned int coef_counts[MAX_ENTROPY_TOKENS];
+ vp9_prob coef_probs[ENTROPY_NODES];
+ for (t = 0; t < MAX_ENTROPY_TOKENS; ++t)
+ coef_counts[t] = context_counters_16x16[type] [band] [pt] [t];
+ vp9_tree_probs_from_distribution(
+ MAX_ENTROPY_TOKENS, vp9_coef_encodings, vp9_coef_tree,
+ coef_probs, branch_ct, coef_counts, 256, 1);
+ fprintf(f, "%s\n {", Comma(pt));
+
+ t = 0;
+ do {
+ fprintf(f, "%s %d", Comma(t), coef_probs[t]);
+ } while (++t < ENTROPY_NODES);
+ fprintf(f, "}");
+ } while (++pt < PREV_COEF_CONTEXTS);
+ fprintf(f, "\n }");
+ } while (++band < COEF_BANDS);
+ fprintf(f, "\n }");
+ } while (++type < BLOCK_TYPES_16X16);
+ fprintf(f, "\n};\n");
+
+ fclose(f);
+
+ f = fopen("context.bin", "wb");
+ fwrite(context_counters, sizeof(context_counters), 1, f);
+ fwrite(context_counters_8x8, sizeof(context_counters_8x8), 1, f);
+ fwrite(context_counters_16x16, sizeof(context_counters_16x16), 1, f);
+ fclose(f);
+}
+#endif
+
+void vp9_tokenize_initialize() {
+ fill_value_tokens();
+}
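The ENTROPY_STATS block above dumps the gathered token counts and then regenerates the vp9_default_coef_counts*/vp9_default_coef_probs* tables by running vp9_tree_probs_from_distribution over every (block type, coefficient band, context) cell. The heart of that step is collapsing a pair of branch counts into an 8-bit probability; the fragment below is a minimal, self-contained sketch of that mapping (the rounding and the clamp to [1, 255] are assumptions for illustration, not a copy of the library routine).

#include <stdio.h>

typedef unsigned char prob_sketch_t;

/* Map counts of the 0- and 1-branch of a tree node onto an 8-bit
 * probability of taking the 0-branch, clamped so neither outcome is
 * ever coded as impossible. Illustrative only. */
static prob_sketch_t prob_from_counts(unsigned int ct0, unsigned int ct1) {
  const unsigned int total = ct0 + ct1;
  unsigned int p;
  if (total == 0)
    return 128;                            /* no data: assume 50/50 */
  p = (255 * ct0 + total / 2) / total;     /* rounded scale to 0..255 */
  if (p == 0)
    p = 1;
  return (prob_sketch_t)p;
}

int main(void) {
  /* e.g. 900 hits on the 0-branch against 100 on the 1-branch */
  printf("%u\n", prob_from_counts(900, 100));  /* prints 230 */
  return 0;
}

With 900 of 1000 decisions taking the 0-branch, the sketch yields 230: strongly biased, but never a certainty, which is what the clamp guarantees.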
+
+static __inline void stuff_b(VP9_COMP *cpi,
+ MACROBLOCKD *xd,
+ const BLOCKD * const b,
+ TOKENEXTRA **tp,
+ PLANE_TYPE type,
+ ENTROPY_CONTEXT *a,
+ ENTROPY_CONTEXT *l,
+ TX_SIZE tx_size,
+ int dry_run) {
+ const int *bands;
+ unsigned int (*counts)[COEF_BANDS][PREV_COEF_CONTEXTS][MAX_ENTROPY_TOKENS];
+ vp9_prob (*probs)[COEF_BANDS][PREV_COEF_CONTEXTS][ENTROPY_NODES];
+ int pt, band;
+ TOKENEXTRA *t = *tp;
+ const TX_TYPE tx_type = (type == PLANE_TYPE_Y_WITH_DC) ?
+ get_tx_type(xd, b) : DCT_DCT;
+ VP9_COMBINEENTROPYCONTEXTS(pt, *a, *l);
+
+ switch (tx_size) {
+ default:
+ case TX_4X4:
+ bands = vp9_coef_bands;
+ if (tx_type != DCT_DCT) {
+ counts = cpi->hybrid_coef_counts;
+ probs = cpi->common.fc.hybrid_coef_probs;
+ } else {
+ counts = cpi->coef_counts;
+ probs = cpi->common.fc.coef_probs;
+ }
+ break;
+ case TX_8X8:
+ bands = vp9_coef_bands_8x8;
+ if (tx_type != DCT_DCT) {
+ counts = cpi->hybrid_coef_counts_8x8;
+ probs = cpi->common.fc.hybrid_coef_probs_8x8;
+ } else {
+ counts = cpi->coef_counts_8x8;
+ probs = cpi->common.fc.coef_probs_8x8;
+ }
+ break;
+ case TX_16X16:
+ bands = vp9_coef_bands_16x16;
+ if (tx_type != DCT_DCT) {
+ counts = cpi->hybrid_coef_counts_16x16;
+ probs = cpi->common.fc.hybrid_coef_probs_16x16;
+ } else {
+ counts = cpi->coef_counts_16x16;
+ probs = cpi->common.fc.coef_probs_16x16;
+ }
+ break;
+ }
+ band = bands[(type == PLANE_TYPE_Y_NO_DC) ? 1 : 0];
+ t->Token = DCT_EOB_TOKEN;
+ t->context_tree = probs[type][band][pt];
+ t->skip_eob_node = 0;
+ ++t;
+ *tp = t;
+ *a = *l = 0;
+ if (!dry_run) {
+ ++counts[type][band][pt][DCT_EOB_TOKEN];
+ }
+}
+
+static void stuff_mb_8x8(VP9_COMP *cpi, MACROBLOCKD *xd,
+ TOKENEXTRA **t, int dry_run) {
+ ENTROPY_CONTEXT *A = (ENTROPY_CONTEXT *)xd->above_context;
+ ENTROPY_CONTEXT *L = (ENTROPY_CONTEXT *)xd->left_context;
+ PLANE_TYPE plane_type;
+ int b;
+ const int has_y2_block = (xd->mode_info_context->mbmi.mode != B_PRED &&
+ xd->mode_info_context->mbmi.mode != I8X8_PRED &&
+ xd->mode_info_context->mbmi.mode != SPLITMV);
+
+ if (has_y2_block) {
+ stuff_b(cpi, xd, xd->block + 24, t, PLANE_TYPE_Y2,
+ A + vp9_block2above_8x8[24], L + vp9_block2left_8x8[24],
+ TX_8X8, dry_run);
+ plane_type = PLANE_TYPE_Y_NO_DC;
+ } else {
+ plane_type = PLANE_TYPE_Y_WITH_DC;
+ }
+
+ for (b = 0; b < 16; b += 4) {
+ stuff_b(cpi, xd, xd->block + b, t, plane_type, A + vp9_block2above_8x8[b],
+ L + vp9_block2left_8x8[b], TX_8X8, dry_run);
+ A[vp9_block2above_8x8[b] + 1] = A[vp9_block2above_8x8[b]];
+ L[vp9_block2left_8x8[b] + 1] = L[vp9_block2left_8x8[b]];
+ }
+
+ for (b = 16; b < 24; b += 4) {
+ stuff_b(cpi, xd, xd->block + b, t, PLANE_TYPE_UV,
+ A + vp9_block2above_8x8[b], L + vp9_block2left_8x8[b],
+ TX_8X8, dry_run);
+ A[vp9_block2above_8x8[b] + 1] = A[vp9_block2above_8x8[b]];
+ L[vp9_block2left_8x8[b] + 1] = L[vp9_block2left_8x8[b]];
+ }
+}
+
+static void stuff_mb_16x16(VP9_COMP *cpi, MACROBLOCKD *xd,
+ TOKENEXTRA **t, int dry_run) {
+ ENTROPY_CONTEXT * A = (ENTROPY_CONTEXT *)xd->above_context;
+ ENTROPY_CONTEXT * L = (ENTROPY_CONTEXT *)xd->left_context;
+ int b;
+
+ stuff_b(cpi, xd, xd->block, t, PLANE_TYPE_Y_WITH_DC, A, L, TX_16X16, dry_run);
+ A[1] = A[2] = A[3] = A[0];
+ L[1] = L[2] = L[3] = L[0];
+ for (b = 16; b < 24; b += 4) {
+ stuff_b(cpi, xd, xd->block + b, t, PLANE_TYPE_UV,
+ A + vp9_block2above_8x8[b], L + vp9_block2left_8x8[b],
+ TX_8X8, dry_run);
+ A[vp9_block2above_8x8[b] + 1] = A[vp9_block2above_8x8[b]];
+ L[vp9_block2left_8x8[b] + 1] = L[vp9_block2left_8x8[b]];
+ }
+ vpx_memset(&A[8], 0, sizeof(A[8]));
+ vpx_memset(&L[8], 0, sizeof(L[8]));
+}
+
+static void stuff_mb_4x4(VP9_COMP *cpi, MACROBLOCKD *xd,
+ TOKENEXTRA **t, int dry_run) {
+ ENTROPY_CONTEXT *A = (ENTROPY_CONTEXT *)xd->above_context;
+ ENTROPY_CONTEXT *L = (ENTROPY_CONTEXT *)xd->left_context;
+ int b;
+ PLANE_TYPE plane_type;
+ const int has_y2_block = (xd->mode_info_context->mbmi.mode != B_PRED &&
+ xd->mode_info_context->mbmi.mode != I8X8_PRED &&
+ xd->mode_info_context->mbmi.mode != SPLITMV);
+
+ if (has_y2_block) {
+ stuff_b(cpi, xd, xd->block + 24, t, PLANE_TYPE_Y2, A + vp9_block2above[24],
+ L + vp9_block2left[24], TX_4X4, dry_run);
+ plane_type = PLANE_TYPE_Y_NO_DC;
+ } else {
+ plane_type = PLANE_TYPE_Y_WITH_DC;
+ }
+
+ for (b = 0; b < 16; b++)
+ stuff_b(cpi, xd, xd->block + b, t, plane_type, A + vp9_block2above[b],
+ L + vp9_block2left[b], TX_4X4, dry_run);
+
+ for (b = 16; b < 24; b++)
+ stuff_b(cpi, xd, xd->block + b, t, PLANE_TYPE_UV, A + vp9_block2above[b],
+ L + vp9_block2left[b], TX_4X4, dry_run);
+}
+
+static void stuff_mb_8x8_4x4uv(VP9_COMP *cpi, MACROBLOCKD *xd,
+ TOKENEXTRA **t, int dry_run) {
+ ENTROPY_CONTEXT *A = (ENTROPY_CONTEXT *)xd->above_context;
+ ENTROPY_CONTEXT *L = (ENTROPY_CONTEXT *)xd->left_context;
+ int b;
+
+ for (b = 0; b < 16; b += 4) {
+ stuff_b(cpi, xd, xd->block + b, t, PLANE_TYPE_Y_WITH_DC,
+ A + vp9_block2above_8x8[b], L + vp9_block2left_8x8[b],
+ TX_8X8, dry_run);
+ A[vp9_block2above_8x8[b] + 1] = A[vp9_block2above_8x8[b]];
+ L[vp9_block2left_8x8[b] + 1] = L[vp9_block2left_8x8[b]];
+ }
+
+ for (b = 16; b < 24; b++)
+ stuff_b(cpi, xd, xd->block + b, t, PLANE_TYPE_UV, A + vp9_block2above[b],
+ L + vp9_block2left[b], TX_4X4, dry_run);
+}
+
+void vp9_stuff_mb(VP9_COMP *cpi, MACROBLOCKD *xd, TOKENEXTRA **t, int dry_run) {
+ TX_SIZE tx_size = xd->mode_info_context->mbmi.txfm_size;
+ TOKENEXTRA * const t_backup = *t;
+
+ if (tx_size == TX_16X16) {
+ stuff_mb_16x16(cpi, xd, t, dry_run);
+ } else if (tx_size == TX_8X8) {
+ if (xd->mode_info_context->mbmi.mode == I8X8_PRED ||
+ xd->mode_info_context->mbmi.mode == SPLITMV) {
+ stuff_mb_8x8_4x4uv(cpi, xd, t, dry_run);
+ } else {
+ stuff_mb_8x8(cpi, xd, t, dry_run);
+ }
+ } else {
+ stuff_mb_4x4(cpi, xd, t, dry_run);
+ }
+
+ if (dry_run) {
+ *t = t_backup;
+ }
+}
+
+void vp9_fix_contexts(MACROBLOCKD *xd) {
+ /* Clear entropy contexts for Y2 blocks */
+ if ((xd->mode_info_context->mbmi.mode != B_PRED
+ && xd->mode_info_context->mbmi.mode != I8X8_PRED
+ && xd->mode_info_context->mbmi.mode != SPLITMV)
+ || xd->mode_info_context->mbmi.txfm_size == TX_16X16
+ ) {
+ vpx_memset(xd->above_context, 0, sizeof(ENTROPY_CONTEXT_PLANES));
+ vpx_memset(xd->left_context, 0, sizeof(ENTROPY_CONTEXT_PLANES));
+ } else {
+ vpx_memset(xd->above_context, 0, sizeof(ENTROPY_CONTEXT_PLANES) - 1);
+ vpx_memset(xd->left_context, 0, sizeof(ENTROPY_CONTEXT_PLANES) - 1);
+ }
+}
--- /dev/null
+++ b/vp9/encoder/tokenize.h
@@ -1,0 +1,59 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef tokenize_h
+#define tokenize_h
+
+#include "vp9/common/entropy.h"
+#include "block.h"
+
+void vp9_tokenize_initialize();
+
+typedef struct {
+ short Token;
+ short Extra;
+} TOKENVALUE;
+
+typedef struct {
+ const vp9_prob *context_tree;
+ short Extra;
+ unsigned char Token;
+ unsigned char skip_eob_node;
+} TOKENEXTRA;
+
+int rd_cost_mby(MACROBLOCKD *);
+
+extern int vp9_mby_is_skippable_4x4(MACROBLOCKD *xd, int has_y2_block);
+extern int vp9_mbuv_is_skippable_4x4(MACROBLOCKD *xd);
+extern int vp9_mby_is_skippable_8x8(MACROBLOCKD *xd, int has_y2_block);
+extern int vp9_mbuv_is_skippable_8x8(MACROBLOCKD *xd);
+extern int vp9_mby_is_skippable_16x16(MACROBLOCKD *xd);
+
+#ifdef ENTROPY_STATS
+void init_context_counters();
+void print_context_counters();
+
+extern INT64 context_counters[BLOCK_TYPES][COEF_BANDS]
+ [PREV_COEF_CONTEXTS][MAX_ENTROPY_TOKENS];
+extern INT64 context_counters_8x8[BLOCK_TYPES_8X8][COEF_BANDS]
+ [PREV_COEF_CONTEXTS][MAX_ENTROPY_TOKENS];
+extern INT64 context_counters_16x16[BLOCK_TYPES_16X16][COEF_BANDS]
+ [PREV_COEF_CONTEXTS][MAX_ENTROPY_TOKENS];
+#endif
+
+extern const int *vp9_dct_value_cost_ptr;
+/* TODO: The Token field should be broken out into a separate char array to
+ * improve cache locality, since it's needed for costing when the rest of the
+ * fields are not.
+ */
+extern const TOKENVALUE *vp9_dct_value_tokens_ptr;
+
+#endif /* tokenize_h */
--- /dev/null
+++ b/vp9/encoder/treewriter.c
@@ -1,0 +1,39 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "treewriter.h"
+
+static void cost(
+ int *const C,
+ vp9_tree T,
+ const vp9_prob *const P,
+ int i,
+ int c
+) {
+ const vp9_prob p = P [i >> 1];
+
+ do {
+ const vp9_tree_index j = T[i];
+ const int d = c + vp9_cost_bit(p, i & 1);
+
+ if (j <= 0)
+ C[-j] = d;
+ else
+ cost(C, T, P, j, d);
+ } while (++i & 1);
+}
+void vp9_cost_tokens(int *c, const vp9_prob *p, vp9_tree t) {
+ cost(c, t, p, 0, 0);
+}
+
+void vp9_cost_tokens_skip(int *c, const vp9_prob *p, vp9_tree t) {
+ cost(c, t, p, 2, 0);
+}
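cost() above fills, for every leaf of a coding tree, the total price in 1/256-bit units of the boolean decisions that reach it; vp9_cost_tokens starts the walk at the root and vp9_cost_tokens_skip starts at index 2, skipping the first branch. The stand-alone sketch below reproduces the same recursion on a made-up three-symbol tree, with the table-driven bit cost replaced by an explicit -256*log2(p/256) approximation (an assumption; the real table is precomputed).

#include <math.h>
#include <stdio.h>

/* Tiny stand-ins for the libvpx types, for illustration only. */
typedef signed char tree_index_t;
typedef unsigned char prob_t;

/* Cost of coding a 0 or 1 with probability p of the 0-branch, in
 * 1/256-bit units; a log2 approximation of the precomputed table. */
static int cost_bit(prob_t p, int bit) {
  const double pr = (bit ? 256 - p : p) / 256.0;
  return (int)(-256.0 * log2(pr) + 0.5);
}

/* Same shape as the recursion in treewriter.c: visit both children of
 * the node at index i, accumulating cost c, and store the totals at the
 * leaves (encoded as non-positive tree indices). */
static void cost(int *C, const tree_index_t *T, const prob_t *P, int i, int c) {
  const prob_t p = P[i >> 1];
  do {
    const tree_index_t j = T[i];
    const int d = c + cost_bit(p, i & 1);
    if (j <= 0)
      C[-j] = d;
    else
      cost(C, T, P, j, d);
  } while (++i & 1);
}

int main(void) {
  /* Hypothetical alphabet: symbol 0 at the root, symbols 1 and 2 under
   * a second node. */
  static const tree_index_t tree[4] = { 0, 2, -1, -2 };
  static const prob_t probs[2] = { 128, 200 };
  int costs[3];
  cost(costs, tree, probs, 0, 0);
  printf("%d %d %d\n", costs[0], costs[1], costs[2]);
  return 0;
}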
--- /dev/null
+++ b/vp9/encoder/treewriter.h
@@ -1,0 +1,108 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef __INC_TREEWRITER_H
+#define __INC_TREEWRITER_H
+
+/* Trees map alphabets into huffman-like codes suitable for an arithmetic
+ bit coder. Timothy S Murphy 11 October 2004 */
+
+#include "vp9/common/treecoder.h"
+
+#include "boolhuff.h" /* for now */
+
+typedef BOOL_CODER vp9_writer;
+
+#define vp9_write encode_bool
+#define vp9_write_literal vp9_encode_value
+#define vp9_write_bit(W, V) vp9_write(W, V, vp9_prob_half)
+
+/* Approximate length of an encoded bool in 256ths of a bit at given prob */
+
+#define vp9_cost_zero(x) (vp9_prob_cost[x])
+#define vp9_cost_one(x) vp9_cost_zero(vp9_complement(x))
+
+#define vp9_cost_bit(x, b) vp9_cost_zero((b) ? vp9_complement(x) : (x))
+
+/* VP8BC version is scaled by 2^20 rather than 2^8; see bool_coder.h */
+
+
+/* Both of these return bits, not scaled bits. */
+
+static __inline unsigned int cost_branch(const unsigned int ct[2],
+ vp9_prob p) {
+ /* Imitate existing calculation */
+ return ((ct[0] * vp9_cost_zero(p))
+ + (ct[1] * vp9_cost_one(p))) >> 8;
+}
+
+static __inline unsigned int cost_branch256(const unsigned int ct[2],
+ vp9_prob p) {
+ /* Imitate existing calculation */
+ return ((ct[0] * vp9_cost_zero(p))
+ + (ct[1] * vp9_cost_one(p)));
+}
+
+/* Small functions to write explicit values and tokens, as well as
+ estimate their lengths. */
+
+static __inline void treed_write(vp9_writer *const w,
+ vp9_tree t,
+ const vp9_prob *const p,
+ int v,
+ /* number of bits in v, assumed nonzero */
+ int n) {
+ vp9_tree_index i = 0;
+
+ do {
+ const int b = (v >> --n) & 1;
+ vp9_write(w, b, p[i >> 1]);
+ i = t[i + b];
+ } while (n);
+}
+
+static __inline void write_token(vp9_writer *const w,
+ vp9_tree t,
+ const vp9_prob *const p,
+ vp9_token *const x) {
+ treed_write(w, t, p, x->value, x->Len);
+}
+
+static __inline int treed_cost(vp9_tree t,
+ const vp9_prob *const p,
+ int v,
+ /* number of bits in v, assumed nonzero */
+ int n) {
+ int c = 0;
+ vp9_tree_index i = 0;
+
+ do {
+ const int b = (v >> --n) & 1;
+ c += vp9_cost_bit(p[i >> 1], b);
+ i = t[i + b];
+ } while (n);
+
+ return c;
+}
+
+static __inline int cost_token(vp9_tree t,
+ const vp9_prob *const p,
+ vp9_token *const x) {
+ return treed_cost(t, p, x->value, x->Len);
+}
+
+/* Fill array of costs for all possible token values. */
+
+void vp9_cost_tokens(int *Costs, const vp9_prob *, vp9_tree);
+
+void vp9_cost_tokens_skip(int *c, const vp9_prob *p, vp9_tree t);
+
+#endif
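treed_write() codes a token by walking the tree from the root and writing one boolean per internal node; the value/Len pair held in a vp9_token is simply that sequence of left/right decisions packed into an integer. Below is a stand-alone illustration with a print-only stand-in for vp9_write and the same hypothetical three-symbol tree used in the treewriter.c sketch.

#include <stdio.h>

typedef signed char tree_index_t;
typedef unsigned char prob_t;

/* Print-only stand-in for vp9_write(): show which bit would be coded
 * at which node probability instead of driving a range coder. */
static void write_bool(int bit, prob_t p) {
  printf("bit %d at prob %u\n", bit, p);
}

/* Same traversal as treed_write(): consume the n bits of v from the
 * top, descending one tree level per bit. */
static void toy_treed_write(const tree_index_t *t, const prob_t *p, int v, int n) {
  tree_index_t i = 0;
  do {
    const int b = (v >> --n) & 1;
    write_bool(b, p[i >> 1]);
    i = t[i + b];
  } while (n);
}

int main(void) {
  static const tree_index_t tree[4] = { 0, 2, -1, -2 };
  static const prob_t probs[2] = { 128, 200 };   /* hypothetical */
  /* Symbol 2 of the toy alphabet is reached by the decisions 1, 1,
   * i.e. value 3 coded in 2 bits. */
  toy_treed_write(tree, probs, 3, 2);
  return 0;
}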
--- /dev/null
+++ b/vp9/encoder/variance.h
@@ -1,0 +1,84 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef VARIANCE_H
+#define VARIANCE_H
+
+typedef unsigned int(*vp9_sad_fn_t)(const unsigned char *src_ptr,
+ int source_stride,
+ const unsigned char *ref_ptr,
+ int ref_stride,
+ unsigned int max_sad);
+
+typedef void (*vp9_copy32xn_fn_t)(const unsigned char *src_ptr,
+ int source_stride,
+ const unsigned char *ref_ptr,
+ int ref_stride,
+ int n);
+
+typedef void (*vp9_sad_multi_fn_t)(const unsigned char *src_ptr,
+ int source_stride,
+ const unsigned char *ref_ptr,
+ int ref_stride,
+ unsigned int *sad_array);
+
+typedef void (*vp9_sad_multi1_fn_t)(const unsigned char *src_ptr,
+ int source_stride,
+ const unsigned char *ref_ptr,
+ int ref_stride,
+ unsigned short *sad_array);
+
+typedef void (*vp9_sad_multi_d_fn_t)(const unsigned char *src_ptr,
+ int source_stride,
+ const unsigned char * const ref_ptr[],
+ int ref_stride, unsigned int *sad_array);
+
+typedef unsigned int (*vp9_variance_fn_t)(const unsigned char *src_ptr,
+ int source_stride,
+ const unsigned char *ref_ptr,
+ int ref_stride,
+ unsigned int *sse);
+
+typedef unsigned int (*vp9_subpixvariance_fn_t)(const unsigned char *src_ptr,
+ int source_stride,
+ int xoffset,
+ int yoffset,
+ const unsigned char *ref_ptr,
+ int ref_stride,
+ unsigned int *sse);
+
+typedef void (*vp9_ssimpf_fn_t)(unsigned char *s, int sp, unsigned char *r,
+ int rp, unsigned long *sum_s,
+ unsigned long *sum_r, unsigned long *sum_sq_s,
+ unsigned long *sum_sq_r,
+ unsigned long *sum_sxr);
+
+typedef unsigned int (*vp9_getmbss_fn_t)(const short *);
+
+typedef unsigned int (*vp9_get16x16prederror_fn_t)(const unsigned char *src_ptr,
+ int source_stride,
+ const unsigned char *ref_ptr,
+ int ref_stride);
+
+typedef struct variance_vtable {
+ vp9_sad_fn_t sdf;
+ vp9_variance_fn_t vf;
+ vp9_subpixvariance_fn_t svf;
+ vp9_variance_fn_t svf_halfpix_h;
+ vp9_variance_fn_t svf_halfpix_v;
+ vp9_variance_fn_t svf_halfpix_hv;
+ vp9_sad_multi_fn_t sdx3f;
+ vp9_sad_multi1_fn_t sdx8f;
+ vp9_sad_multi_d_fn_t sdx4df;
+ vp9_copy32xn_fn_t copymem;
+} vp9_variance_fn_ptr_t;
+
+#endif
--- /dev/null
+++ b/vp9/encoder/variance_c.c
@@ -1,0 +1,540 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "variance.h"
+#include "vp9/common/filter.h"
+
+
+unsigned int vp9_get_mb_ss_c(const short *src_ptr) {
+ unsigned int i, sum = 0;
+
+ for (i = 0; i < 256; i++) {
+ sum += (src_ptr[i] * src_ptr[i]);
+ }
+
+ return sum;
+}
+
+
+static void variance(const unsigned char *src_ptr,
+ int source_stride,
+ const unsigned char *ref_ptr,
+ int recon_stride,
+ int w,
+ int h,
+ unsigned int *sse,
+ int *sum) {
+ int i, j;
+ int diff;
+
+ *sum = 0;
+ *sse = 0;
+
+ for (i = 0; i < h; i++) {
+ for (j = 0; j < w; j++) {
+ diff = src_ptr[j] - ref_ptr[j];
+ *sum += diff;
+ *sse += diff * diff;
+ }
+
+ src_ptr += source_stride;
+ ref_ptr += recon_stride;
+ }
+}
+
+#if CONFIG_SUPERBLOCKS
+unsigned int vp9_variance32x32_c(const unsigned char *src_ptr,
+ int source_stride,
+ const unsigned char *ref_ptr,
+ int recon_stride,
+ unsigned int *sse) {
+ unsigned int var;
+ int avg;
+
+ variance(src_ptr, source_stride, ref_ptr, recon_stride, 32, 32, &var, &avg);
+ *sse = var;
+ return (var - ((avg * avg) >> 10));
+}
+#endif
+
+unsigned int vp9_variance16x16_c(const unsigned char *src_ptr,
+ int source_stride,
+ const unsigned char *ref_ptr,
+ int recon_stride,
+ unsigned int *sse) {
+ unsigned int var;
+ int avg;
+
+ variance(src_ptr, source_stride, ref_ptr, recon_stride, 16, 16, &var, &avg);
+ *sse = var;
+ return (var - ((avg * avg) >> 8));
+}
+
+unsigned int vp9_variance8x16_c(const unsigned char *src_ptr,
+ int source_stride,
+ const unsigned char *ref_ptr,
+ int recon_stride,
+ unsigned int *sse) {
+ unsigned int var;
+ int avg;
+
+ variance(src_ptr, source_stride, ref_ptr, recon_stride, 8, 16, &var, &avg);
+ *sse = var;
+ return (var - ((avg * avg) >> 7));
+}
+
+unsigned int vp9_variance16x8_c(const unsigned char *src_ptr,
+ int source_stride,
+ const unsigned char *ref_ptr,
+ int recon_stride,
+ unsigned int *sse) {
+ unsigned int var;
+ int avg;
+
+ variance(src_ptr, source_stride, ref_ptr, recon_stride, 16, 8, &var, &avg);
+ *sse = var;
+ return (var - ((avg * avg) >> 7));
+}
+
+
+unsigned int vp9_variance8x8_c(const unsigned char *src_ptr,
+ int source_stride,
+ const unsigned char *ref_ptr,
+ int recon_stride,
+ unsigned int *sse) {
+ unsigned int var;
+ int avg;
+
+ variance(src_ptr, source_stride, ref_ptr, recon_stride, 8, 8, &var, &avg);
+ *sse = var;
+ return (var - ((avg * avg) >> 6));
+}
+
+unsigned int vp9_variance4x4_c(const unsigned char *src_ptr,
+ int source_stride,
+ const unsigned char *ref_ptr,
+ int recon_stride,
+ unsigned int *sse) {
+ unsigned int var;
+ int avg;
+
+ variance(src_ptr, source_stride, ref_ptr, recon_stride, 4, 4, &var, &avg);
+ *sse = var;
+ return (var - ((avg * avg) >> 4));
+}
+
+
+unsigned int vp9_mse16x16_c(const unsigned char *src_ptr,
+ int source_stride,
+ const unsigned char *ref_ptr,
+ int recon_stride,
+ unsigned int *sse) {
+ unsigned int var;
+ int avg;
+
+ variance(src_ptr, source_stride, ref_ptr, recon_stride, 16, 16, &var, &avg);
+ *sse = var;
+ return var;
+}
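Each fixed-size wrapper above calls variance() for the raw SSE and sum, then returns SSE - sum^2/N with the division written as a shift by log2 of the pixel count: 8 for 16x16, 7 for 16x8 and 8x16, 6 for 8x8, 4 for 4x4, and 10 for the 32x32 superblock case. A minimal restatement of that arithmetic, kept generic over the shift:

#include <assert.h>

/* Generic form of what each vp9_varianceWxH_c wrapper computes: SSE
 * minus the squared-mean correction sum^2 / N, with N a power of two
 * so the divide becomes a shift. Sketch only. */
static unsigned int variance_from_sums(unsigned int sse, int sum,
                                       int log2_pixel_count) {
  return sse - (unsigned int)(((long long)sum * sum) >> log2_pixel_count);
}

int main(void) {
  /* A 16x16 block has 256 pixels, so the wrapper shifts by 8:
   * sse = 5000, sum = 320  ->  5000 - 102400/256 = 4600. */
  assert(variance_from_sums(5000, 320, 8) == 4600);
  return 0;
}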
+
+
+/****************************************************************************
+ *
+ * ROUTINE : filter_block2d_bil_first_pass
+ *
+ * INPUTS : UINT8 *src_ptr : Pointer to source block.
+ * UINT32 src_pixels_per_line : Stride of input block.
+ * UINT32 pixel_step : Offset between filter input samples (see notes).
+ * UINT32 output_height : Input block height.
+ * UINT32 output_width : Input block width.
+ * INT16 *vp9_filter : Array of 2 bi-linear filter taps.
+ *
+ * OUTPUTS : UINT16 *output_ptr : Pointer to filtered block.
+ *
+ * RETURNS : void
+ *
+ * FUNCTION : Applies a 1-D 2-tap bi-linear filter to the source block in
+ * either horizontal or vertical direction to produce the
+ * filtered output block. Used to implement first-pass
+ * of 2-D separable filter.
+ *
+ * SPECIAL NOTES : Produces UINT16 output to retain precision for next pass.
+ * Two filter taps should sum to VP9_FILTER_WEIGHT.
+ * pixel_step defines whether the filter is applied
+ * horizontally (pixel_step=1) or vertically (pixel_step=stride).
+ * It defines the offset required to move from one input
+ * to the next.
+ *
+ ****************************************************************************/
+static void var_filter_block2d_bil_first_pass(const unsigned char *src_ptr,
+ unsigned short *output_ptr,
+ unsigned int src_pixels_per_line,
+ int pixel_step,
+ unsigned int output_height,
+ unsigned int output_width,
+ const short *vp9_filter) {
+ unsigned int i, j;
+
+ for (i = 0; i < output_height; i++) {
+ for (j = 0; j < output_width; j++) {
+ // Apply bilinear filter
+ output_ptr[j] = (((int)src_ptr[0] * vp9_filter[0]) +
+ ((int)src_ptr[pixel_step] * vp9_filter[1]) +
+ (VP9_FILTER_WEIGHT / 2)) >> VP9_FILTER_SHIFT;
+ src_ptr++;
+ }
+
+ // Next row...
+ src_ptr += src_pixels_per_line - output_width;
+ output_ptr += output_width;
+ }
+}
+
+/****************************************************************************
+ *
+ * ROUTINE : filter_block2d_bil_second_pass
+ *
+ * INPUTS : UINT16 *src_ptr : Pointer to source block.
+ * UINT32 src_pixels_per_line : Stride of input block.
+ * UINT32 pixel_step : Offset between filter input samples (see notes).
+ * UINT32 output_height : Input block height.
+ * UINT32 output_width : Input block width.
+ * INT16 *vp9_filter : Array of 2 bi-linear filter taps.
+ *
+ * OUTPUTS : UINT8 *output_ptr : Pointer to filtered block.
+ *
+ * RETURNS : void
+ *
+ * FUNCTION : Applies a 1-D 2-tap bi-linear filter to the source block in
+ * either horizontal or vertical direction to produce the
+ * filtered output block. Used to implement second-pass
+ * of 2-D separable filter.
+ *
+ * SPECIAL NOTES : Requires 16-bit input as produced by filter_block2d_bil_first_pass.
+ * Two filter taps should sum to VP9_FILTER_WEIGHT.
+ * pixel_step defines whether the filter is applied
+ * horizontally (pixel_step=1) or vertically (pixel_step=stride).
+ * It defines the offset required to move from one input
+ * to the next.
+ *
+ ****************************************************************************/
+static void var_filter_block2d_bil_second_pass(const unsigned short *src_ptr,
+ unsigned char *output_ptr,
+ unsigned int src_pixels_per_line,
+ unsigned int pixel_step,
+ unsigned int output_height,
+ unsigned int output_width,
+ const short *vp9_filter) {
+ unsigned int i, j;
+ int Temp;
+
+ for (i = 0; i < output_height; i++) {
+ for (j = 0; j < output_width; j++) {
+ // Apply filter
+ Temp = ((int)src_ptr[0] * vp9_filter[0]) +
+ ((int)src_ptr[pixel_step] * vp9_filter[1]) +
+ (VP9_FILTER_WEIGHT / 2);
+ output_ptr[j] = (unsigned char)(Temp >> VP9_FILTER_SHIFT);
+ src_ptr++;
+ }
+
+ // Next row...
+ src_ptr += src_pixels_per_line - output_width;
+ output_ptr += output_width;
+ }
+}
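The two passes above form a separable 2-tap bilinear interpolator: the first pass filters horizontally into a 16-bit buffer of (height + 1) rows so the vertical second pass has the extra row it needs (hence FData3 being sized 5x4 for the 4x4 case below, 17x16 for 16x16, and so on), and both passes compute the same rounded weighted average. A one-sample sketch of that kernel, assuming the usual VP9_FILTER_WEIGHT = 128 / VP9_FILTER_SHIFT = 7 convention:

/* One output sample of the 2-tap bilinear kernel used by both passes.
 * The 128/7 weight/shift pair is an assumption taken from the usual
 * filter convention; the taps themselves come from vp9_bilinear_filters. */
static unsigned char bilinear_sample(int a, int b, int f0, int f1) {
  return (unsigned char)((a * f0 + b * f1 + 64) >> 7);  /* 64 = weight / 2 */
}

At the half-pel offset the two taps are equal, so the kernel reduces to a rounded average of the two neighbouring samples, which is what the vp9_variance_halfpixvar*() wrappers further down rely on.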
+
+
+unsigned int vp9_sub_pixel_variance4x4_c(const unsigned char *src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ const unsigned char *dst_ptr,
+ int dst_pixels_per_line,
+ unsigned int *sse) {
+ unsigned char temp2[20 * 16];
+ const short *HFilter, *VFilter;
+ unsigned short FData3[5 * 4]; // Temp data buffer used in filtering
+
+ HFilter = vp9_bilinear_filters[xoffset];
+ VFilter = vp9_bilinear_filters[yoffset];
+
+ // First filter 1d Horizontal
+ var_filter_block2d_bil_first_pass(src_ptr, FData3, src_pixels_per_line, 1, 5, 4, HFilter);
+
+ // Now filter Vertically
+ var_filter_block2d_bil_second_pass(FData3, temp2, 4, 4, 4, 4, VFilter);
+
+ return vp9_variance4x4_c(temp2, 4, dst_ptr, dst_pixels_per_line, sse);
+}
+
+
+unsigned int vp9_sub_pixel_variance8x8_c(const unsigned char *src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ const unsigned char *dst_ptr,
+ int dst_pixels_per_line,
+ unsigned int *sse) {
+ unsigned short FData3[9 * 8]; // Temp data buffer used in filtering
+ unsigned char temp2[20 * 16];
+ const short *HFilter, *VFilter;
+
+ HFilter = vp9_bilinear_filters[xoffset];
+ VFilter = vp9_bilinear_filters[yoffset];
+
+ var_filter_block2d_bil_first_pass(src_ptr, FData3, src_pixels_per_line, 1, 9, 8, HFilter);
+ var_filter_block2d_bil_second_pass(FData3, temp2, 8, 8, 8, 8, VFilter);
+
+ return vp9_variance8x8_c(temp2, 8, dst_ptr, dst_pixels_per_line, sse);
+}
+
+unsigned int vp9_sub_pixel_variance16x16_c(const unsigned char *src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ const unsigned char *dst_ptr,
+ int dst_pixels_per_line,
+ unsigned int *sse) {
+ unsigned short FData3[17 * 16]; // Temp data buffer used in filtering
+ unsigned char temp2[20 * 16];
+ const short *HFilter, *VFilter;
+
+ HFilter = vp9_bilinear_filters[xoffset];
+ VFilter = vp9_bilinear_filters[yoffset];
+
+ var_filter_block2d_bil_first_pass(src_ptr, FData3, src_pixels_per_line, 1, 17, 16, HFilter);
+ var_filter_block2d_bil_second_pass(FData3, temp2, 16, 16, 16, 16, VFilter);
+
+ return vp9_variance16x16_c(temp2, 16, dst_ptr, dst_pixels_per_line, sse);
+}
+
+#if CONFIG_SUPERBLOCKS
+unsigned int vp9_sub_pixel_variance32x32_c(const unsigned char *src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ const unsigned char *dst_ptr,
+ int dst_pixels_per_line,
+ unsigned int *sse) {
+ unsigned short FData3[33 * 32]; // Temp data buffer used in filtering
+ unsigned char temp2[36 * 32];
+ const short *HFilter, *VFilter;
+
+ HFilter = vp9_bilinear_filters[xoffset];
+ VFilter = vp9_bilinear_filters[yoffset];
+
+ var_filter_block2d_bil_first_pass(src_ptr, FData3, src_pixels_per_line, 1, 33, 32, HFilter);
+ var_filter_block2d_bil_second_pass(FData3, temp2, 32, 32, 32, 32, VFilter);
+
+ return vp9_variance32x32_c(temp2, 32, dst_ptr, dst_pixels_per_line, sse);
+}
+#endif
+
+unsigned int vp9_variance_halfpixvar16x16_h_c(const unsigned char *src_ptr,
+ int source_stride,
+ const unsigned char *ref_ptr,
+ int recon_stride,
+ unsigned int *sse) {
+ return vp9_sub_pixel_variance16x16_c(src_ptr, source_stride, 8, 0,
+ ref_ptr, recon_stride, sse);
+}
+
+#if CONFIG_SUPERBLOCKS
+unsigned int vp9_variance_halfpixvar32x32_h_c(const unsigned char *src_ptr,
+ int source_stride,
+ const unsigned char *ref_ptr,
+ int recon_stride,
+ unsigned int *sse) {
+ return vp9_sub_pixel_variance32x32_c(src_ptr, source_stride, 8, 0,
+ ref_ptr, recon_stride, sse);
+}
+#endif
+
+
+unsigned int vp9_variance_halfpixvar16x16_v_c(const unsigned char *src_ptr,
+ int source_stride,
+ const unsigned char *ref_ptr,
+ int recon_stride,
+ unsigned int *sse) {
+ return vp9_sub_pixel_variance16x16_c(src_ptr, source_stride, 0, 8,
+ ref_ptr, recon_stride, sse);
+}
+
+#if CONFIG_SUPERBLOCKS
+unsigned int vp9_variance_halfpixvar32x32_v_c(const unsigned char *src_ptr,
+ int source_stride,
+ const unsigned char *ref_ptr,
+ int recon_stride,
+ unsigned int *sse) {
+ return vp9_sub_pixel_variance32x32_c(src_ptr, source_stride, 0, 8,
+ ref_ptr, recon_stride, sse);
+}
+#endif
+
+unsigned int vp9_variance_halfpixvar16x16_hv_c(const unsigned char *src_ptr,
+ int source_stride,
+ const unsigned char *ref_ptr,
+ int recon_stride,
+ unsigned int *sse) {
+ return vp9_sub_pixel_variance16x16_c(src_ptr, source_stride, 8, 8,
+ ref_ptr, recon_stride, sse);
+}
+
+#if CONFIG_SUPERBLOCKS
+unsigned int vp9_variance_halfpixvar32x32_hv_c(const unsigned char *src_ptr,
+ int source_stride,
+ const unsigned char *ref_ptr,
+ int recon_stride,
+ unsigned int *sse) {
+ return vp9_sub_pixel_variance32x32_c(src_ptr, source_stride, 8, 8,
+ ref_ptr, recon_stride, sse);
+}
+#endif
+
+unsigned int vp9_sub_pixel_mse16x16_c(const unsigned char *src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ const unsigned char *dst_ptr,
+ int dst_pixels_per_line,
+ unsigned int *sse) {
+ vp9_sub_pixel_variance16x16_c(src_ptr, src_pixels_per_line,
+ xoffset, yoffset, dst_ptr,
+ dst_pixels_per_line, sse);
+ return *sse;
+}
+
+#if CONFIG_SUPERBLOCKS
+unsigned int vp9_sub_pixel_mse32x32_c(const unsigned char *src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ const unsigned char *dst_ptr,
+ int dst_pixels_per_line,
+ unsigned int *sse) {
+ vp9_sub_pixel_variance32x32_c(src_ptr, src_pixels_per_line,
+ xoffset, yoffset, dst_ptr,
+ dst_pixels_per_line, sse);
+ return *sse;
+}
+#endif
+
+unsigned int vp9_sub_pixel_variance16x8_c(const unsigned char *src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ const unsigned char *dst_ptr,
+ int dst_pixels_per_line,
+ unsigned int *sse) {
+ unsigned short FData3[16 * 9]; // Temp data buffer used in filtering
+ unsigned char temp2[20 * 16];
+ const short *HFilter, *VFilter;
+
+ HFilter = vp9_bilinear_filters[xoffset];
+ VFilter = vp9_bilinear_filters[yoffset];
+
+ var_filter_block2d_bil_first_pass(src_ptr, FData3, src_pixels_per_line, 1, 9, 16, HFilter);
+ var_filter_block2d_bil_second_pass(FData3, temp2, 16, 16, 8, 16, VFilter);
+
+ return vp9_variance16x8_c(temp2, 16, dst_ptr, dst_pixels_per_line, sse);
+}
+
+unsigned int vp9_sub_pixel_variance8x16_c(const unsigned char *src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ const unsigned char *dst_ptr,
+ int dst_pixels_per_line,
+ unsigned int *sse) {
+ unsigned short FData3[9 * 16]; // Temp data buffer used in filtering
+ unsigned char temp2[20 * 16];
+ const short *HFilter, *VFilter;
+
+ HFilter = vp9_bilinear_filters[xoffset];
+ VFilter = vp9_bilinear_filters[yoffset];
+
+ var_filter_block2d_bil_first_pass(src_ptr, FData3, src_pixels_per_line,
+ 1, 17, 8, HFilter);
+ var_filter_block2d_bil_second_pass(FData3, temp2, 8, 8, 16, 8, VFilter);
+
+ return vp9_variance8x16_c(temp2, 8, dst_ptr, dst_pixels_per_line, sse);
+}
+
+#if CONFIG_NEWBESTREFMV
+unsigned int vp9_variance2x16_c(const unsigned char *src_ptr,
+ const int source_stride,
+ const unsigned char *ref_ptr,
+ const int recon_stride,
+ unsigned int *sse) {
+ unsigned int var;
+ int avg;
+
+ variance(src_ptr, source_stride, ref_ptr, recon_stride, 2, 16, &var, &avg);
+ *sse = var;
+ return (var - ((avg * avg) >> 5));
+}
+
+unsigned int vp9_variance16x2_c(const unsigned char *src_ptr,
+ const int source_stride,
+ const unsigned char *ref_ptr,
+ const int recon_stride,
+ unsigned int *sse) {
+ unsigned int var;
+ int avg;
+
+ variance(src_ptr, source_stride, ref_ptr, recon_stride, 16, 2, &var, &avg);
+ *sse = var;
+ return (var - ((avg * avg) >> 5));
+}
+
+unsigned int vp9_sub_pixel_variance16x2_c(const unsigned char *src_ptr,
+ const int src_pixels_per_line,
+ const int xoffset,
+ const int yoffset,
+ const unsigned char *dst_ptr,
+ const int dst_pixels_per_line,
+ unsigned int *sse) {
+ unsigned short FData3[16 * 3]; // Temp data buffer used in filtering
+ unsigned char temp2[20 * 16];
+ const short *HFilter, *VFilter;
+
+ HFilter = vp9_bilinear_filters[xoffset];
+ VFilter = vp9_bilinear_filters[yoffset];
+
+ var_filter_block2d_bil_first_pass(src_ptr, FData3,
+ src_pixels_per_line, 1, 3, 16, HFilter);
+ var_filter_block2d_bil_second_pass(FData3, temp2, 16, 16, 2, 16, VFilter);
+
+ return vp9_variance16x2_c(temp2, 16, dst_ptr, dst_pixels_per_line, sse);
+}
+
+unsigned int vp9_sub_pixel_variance2x16_c(const unsigned char *src_ptr,
+ const int src_pixels_per_line,
+ const int xoffset,
+ const int yoffset,
+ const unsigned char *dst_ptr,
+ const int dst_pixels_per_line,
+ unsigned int *sse) {
+ unsigned short FData3[2 * 17]; // Temp data buffer used in filtering
+ unsigned char temp2[2 * 16];
+ const short *HFilter, *VFilter;
+
+ HFilter = vp9_bilinear_filters[xoffset];
+ VFilter = vp9_bilinear_filters[yoffset];
+
+ var_filter_block2d_bil_first_pass(src_ptr, FData3,
+ src_pixels_per_line, 1, 17, 2, HFilter);
+ var_filter_block2d_bil_second_pass(FData3, temp2, 2, 2, 16, 2, VFilter);
+
+ return vp9_variance2x16_c(temp2, 2, dst_ptr, dst_pixels_per_line, sse);
+}
+#endif
--- /dev/null
+++ b/vp9/encoder/x86/dct_mmx.asm
@@ -1,0 +1,241 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+;void vp9_short_fdct4x4_mmx(short *input, short *output, int pitch)
+global sym(vp9_short_fdct4x4_mmx)
+sym(vp9_short_fdct4x4_mmx):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 3
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rsi, arg(0) ; input
+ mov rdi, arg(1) ; output
+
+ movsxd rax, dword ptr arg(2) ;pitch
+
+ lea rcx, [rsi + rax*2]
+ ; read the input data
+ movq mm0, [rsi]
+ movq mm1, [rsi + rax]
+
+ movq mm2, [rcx]
+ movq mm4, [rcx + rax]
+
+ ; transpose for the first stage
+ movq mm3, mm0 ; 00 01 02 03
+ movq mm5, mm2 ; 20 21 22 23
+
+ punpcklwd mm0, mm1 ; 00 10 01 11
+ punpckhwd mm3, mm1 ; 02 12 03 13
+
+ punpcklwd mm2, mm4 ; 20 30 21 31
+ punpckhwd mm5, mm4 ; 22 32 23 33
+
+ movq mm1, mm0 ; 00 10 01 11
+ punpckldq mm0, mm2 ; 00 10 20 30
+
+ punpckhdq mm1, mm2 ; 01 11 21 31
+
+ movq mm2, mm3 ; 02 12 03 13
+ punpckldq mm2, mm5 ; 02 12 22 32
+
+ punpckhdq mm3, mm5 ; 03 13 23 33
+
+ ; mm0 0
+ ; mm1 1
+ ; mm2 2
+ ; mm3 3
+
+ ; first stage
+ movq mm5, mm0
+ movq mm4, mm1
+
+ paddw mm0, mm3 ; a1 = 0 + 3
+ paddw mm1, mm2 ; b1 = 1 + 2
+
+ psubw mm4, mm2 ; c1 = 1 - 2
+ psubw mm5, mm3 ; d1 = 0 - 3
+
+ psllw mm5, 3
+ psllw mm4, 3
+
+ psllw mm0, 3
+ psllw mm1, 3
+
+ ; output 0 and 2
+ movq mm2, mm0 ; a1
+
+ paddw mm0, mm1 ; op[0] = a1 + b1
+ psubw mm2, mm1 ; op[2] = a1 - b1
+
+ ; output 1 and 3
+ ; interleave c1, d1
+ movq mm1, mm5 ; d1
+ punpcklwd mm1, mm4 ; c1 d1
+ punpckhwd mm5, mm4 ; c1 d1
+
+ movq mm3, mm1
+ movq mm4, mm5
+
+ pmaddwd mm1, MMWORD PTR[GLOBAL (_5352_2217)] ; c1*2217 + d1*5352
+ pmaddwd mm4, MMWORD PTR[GLOBAL (_5352_2217)] ; c1*2217 + d1*5352
+
+ pmaddwd mm3, MMWORD PTR[GLOBAL(_2217_neg5352)] ; d1*2217 - c1*5352
+ pmaddwd mm5, MMWORD PTR[GLOBAL(_2217_neg5352)] ; d1*2217 - c1*5352
+
+ paddd mm1, MMWORD PTR[GLOBAL(_14500)]
+ paddd mm4, MMWORD PTR[GLOBAL(_14500)]
+ paddd mm3, MMWORD PTR[GLOBAL(_7500)]
+ paddd mm5, MMWORD PTR[GLOBAL(_7500)]
+
+ psrad mm1, 12 ; (c1 * 2217 + d1 * 5352 + 14500)>>12
+ psrad mm4, 12 ; (c1 * 2217 + d1 * 5352 + 14500)>>12
+ psrad mm3, 12 ; (d1 * 2217 - c1 * 5352 + 7500)>>12
+ psrad mm5, 12 ; (d1 * 2217 - c1 * 5352 + 7500)>>12
+
+ packssdw mm1, mm4 ; op[1]
+ packssdw mm3, mm5 ; op[3]
+
+ ; done with vertical
+ ; transpose for the second stage
+ movq mm4, mm0 ; 00 10 20 30
+ movq mm5, mm2 ; 02 12 22 32
+
+ punpcklwd mm0, mm1 ; 00 01 10 11
+ punpckhwd mm4, mm1 ; 20 21 30 31
+
+ punpcklwd mm2, mm3 ; 02 03 12 13
+ punpckhwd mm5, mm3 ; 22 23 32 33
+
+ movq mm1, mm0 ; 00 01 10 11
+ punpckldq mm0, mm2 ; 00 01 02 03
+
+ punpckhdq mm1, mm2 ; 10 11 12 13
+
+ movq mm2, mm4 ; 20 21 30 31
+ punpckldq mm2, mm5 ; 20 21 22 23
+
+ punpckhdq mm4, mm5 ; 30 31 32 33
+
+ ; mm0 0
+ ; mm1 1
+ ; mm2 2
+ ; mm3 4
+
+ movq mm5, mm0
+ movq mm3, mm1
+
+ paddw mm0, mm4 ; a1 = 0 + 3
+ paddw mm1, mm2 ; b1 = 1 + 2
+
+ psubw mm3, mm2 ; c1 = 1 - 2
+ psubw mm5, mm4 ; d1 = 0 - 3
+
+ pxor mm6, mm6 ; zero out for compare
+
+ pcmpeqw mm6, mm5 ; d1 != 0
+
+ pandn mm6, MMWORD PTR[GLOBAL(_cmp_mask)] ; clear upper,
+ ; and keep bit 0 of lower
+
+ ; output 0 and 2
+ movq mm2, mm0 ; a1
+
+ paddw mm0, mm1 ; a1 + b1
+ psubw mm2, mm1 ; a1 - b1
+
+ paddw mm0, MMWORD PTR[GLOBAL(_7w)]
+ paddw mm2, MMWORD PTR[GLOBAL(_7w)]
+
+ psraw mm0, 4 ; op[0] = (a1 + b1 + 7)>>4
+ psraw mm2, 4 ; op[8] = (a1 - b1 + 7)>>4
+
+ movq MMWORD PTR[rdi + 0 ], mm0
+ movq MMWORD PTR[rdi + 16], mm2
+
+ ; output 1 and 3
+ ; interleave c1, d1
+ movq mm1, mm5 ; d1
+ punpcklwd mm1, mm3 ; c1 d1
+ punpckhwd mm5, mm3 ; c1 d1
+
+ movq mm3, mm1
+ movq mm4, mm5
+
+ pmaddwd mm1, MMWORD PTR[GLOBAL (_5352_2217)] ; c1*2217 + d1*5352
+ pmaddwd mm4, MMWORD PTR[GLOBAL (_5352_2217)] ; c1*2217 + d1*5352
+
+ pmaddwd mm3, MMWORD PTR[GLOBAL(_2217_neg5352)] ; d1*2217 - c1*5352
+ pmaddwd mm5, MMWORD PTR[GLOBAL(_2217_neg5352)] ; d1*2217 - c1*5352
+
+ paddd mm1, MMWORD PTR[GLOBAL(_12000)]
+ paddd mm4, MMWORD PTR[GLOBAL(_12000)]
+ paddd mm3, MMWORD PTR[GLOBAL(_51000)]
+ paddd mm5, MMWORD PTR[GLOBAL(_51000)]
+
+ psrad mm1, 16 ; (c1 * 2217 + d1 * 5352 + 12000)>>16
+ psrad mm4, 16 ; (c1 * 2217 + d1 * 5352 + 12000)>>16
+ psrad mm3, 16 ; (d1 * 2217 - c1 * 5352 + 51000)>>16
+ psrad mm5, 16 ; (d1 * 2217 - c1 * 5352 + 51000)>>16
+
+ packssdw mm1, mm4 ; op[4]
+ packssdw mm3, mm5 ; op[12]
+
+ paddw mm1, mm6 ; op[4] += (d1!=0)
+
+ movq MMWORD PTR[rdi + 8 ], mm1
+ movq MMWORD PTR[rdi + 24], mm3
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+SECTION_RODATA
+align 8
+_5352_2217:
+ dw 5352
+ dw 2217
+ dw 5352
+ dw 2217
+align 8
+_2217_neg5352:
+ dw 2217
+ dw -5352
+ dw 2217
+ dw -5352
+align 8
+_cmp_mask:
+ times 4 dw 1
+align 8
+_7w:
+ times 4 dw 7
+align 8
+_14500:
+ times 2 dd 14500
+align 8
+_7500:
+ times 2 dd 7500
+align 8
+_12000:
+ times 2 dd 12000
+align 8
+_51000:
+ times 2 dd 51000
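The MMX routine above (and the SSE2 versions in the next file) vectorize a two-pass 4x4 forward transform whose arithmetic is easiest to read in scalar form. The sketch below is a plain-C restatement assembled from the assembly's own constants and comments (the 5352/2217 multipliers, the 14500/7500 and 12000/51000 rounders, the <<3 pre-scale, the +7>>4 finish, and the d1!=0 correction on op[4]); treat it as a reading aid rather than the project's reference implementation.

/* Plain-C restatement of the two-pass 4x4 forward transform that the
 * MMX/SSE2 code implements; reconstructed from the assembly, so a
 * sketch rather than the canonical C version. pitch is in bytes. */
static void short_fdct4x4_scalar(const short *input, short *output, int pitch) {
  int i;
  const short *ip = input;
  short *op = output;

  for (i = 0; i < 4; i++) {                 /* first pass: rows */
    const int a1 = (ip[0] + ip[3]) << 3;
    const int b1 = (ip[1] + ip[2]) << 3;
    const int c1 = (ip[1] - ip[2]) << 3;
    const int d1 = (ip[0] - ip[3]) << 3;

    op[0] = (short)(a1 + b1);
    op[2] = (short)(a1 - b1);
    op[1] = (short)((c1 * 2217 + d1 * 5352 + 14500) >> 12);
    op[3] = (short)((d1 * 2217 - c1 * 5352 + 7500) >> 12);

    ip += pitch / 2;
    op += 4;
  }

  op = output;                              /* second pass: columns */
  for (i = 0; i < 4; i++) {
    const int a1 = op[0] + op[12];
    const int b1 = op[4] + op[8];
    const int c1 = op[4] - op[8];
    const int d1 = op[0] - op[12];

    op[0]  = (short)((a1 + b1 + 7) >> 4);
    op[8]  = (short)((a1 - b1 + 7) >> 4);
    op[4]  = (short)(((c1 * 2217 + d1 * 5352 + 12000) >> 16) + (d1 != 0));
    op[12] = (short)((d1 * 2217 - c1 * 5352 + 51000) >> 16);

    op++;
  }
}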
--- /dev/null
+++ b/vp9/encoder/x86/dct_sse2.asm
@@ -1,0 +1,432 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+%macro STACK_FRAME_CREATE 0
+%if ABI_IS_32BIT
+ %define input rsi
+ %define output rdi
+ %define pitch rax
+ push rbp
+ mov rbp, rsp
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rsi, arg(0)
+ mov rdi, arg(1)
+
+ movsxd rax, dword ptr arg(2)
+ lea rcx, [rsi + rax*2]
+%else
+ %ifidn __OUTPUT_FORMAT__,x64
+ %define input rcx
+ %define output rdx
+ %define pitch r8
+ SAVE_XMM 7, u
+ %else
+ %define input rdi
+ %define output rsi
+ %define pitch rdx
+ %endif
+%endif
+%endmacro
+
+%macro STACK_FRAME_DESTROY 0
+ %define input
+ %define output
+ %define pitch
+
+%if ABI_IS_32BIT
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ pop rbp
+%else
+ %ifidn __OUTPUT_FORMAT__,x64
+ RESTORE_XMM
+ %endif
+%endif
+ ret
+%endmacro
+
+;void vp9_short_fdct4x4_sse2(short *input, short *output, int pitch)
+global sym(vp9_short_fdct4x4_sse2)
+sym(vp9_short_fdct4x4_sse2):
+
+ STACK_FRAME_CREATE
+
+ movq xmm0, MMWORD PTR[input ] ;03 02 01 00
+ movq xmm2, MMWORD PTR[input+ pitch] ;13 12 11 10
+ lea input, [input+2*pitch]
+ movq xmm1, MMWORD PTR[input ] ;23 22 21 20
+ movq xmm3, MMWORD PTR[input+ pitch] ;33 32 31 30
+
+ punpcklqdq xmm0, xmm2 ;13 12 11 10 03 02 01 00
+ punpcklqdq xmm1, xmm3 ;33 32 31 30 23 22 21 20
+
+ movdqa xmm2, xmm0
+ punpckldq xmm0, xmm1 ;23 22 03 02 21 20 01 00
+ punpckhdq xmm2, xmm1 ;33 32 13 12 31 30 11 10
+ movdqa xmm1, xmm0
+ punpckldq xmm0, xmm2 ;31 21 30 20 11 10 01 00
+ pshufhw xmm1, xmm1, 0b1h ;22 23 02 03 xx xx xx xx
+ pshufhw xmm2, xmm2, 0b1h ;32 33 12 13 xx xx xx xx
+
+ punpckhdq xmm1, xmm2 ;32 33 22 23 12 13 02 03
+ movdqa xmm3, xmm0
+ paddw xmm0, xmm1 ;b1 a1 b1 a1 b1 a1 b1 a1
+ psubw xmm3, xmm1 ;c1 d1 c1 d1 c1 d1 c1 d1
+ psllw xmm0, 3 ;b1 <<= 3 a1 <<= 3
+ psllw xmm3, 3 ;c1 <<= 3 d1 <<= 3
+
+ movdqa xmm1, xmm0
+ pmaddwd xmm0, XMMWORD PTR[GLOBAL(_mult_add)] ;a1 + b1
+ pmaddwd xmm1, XMMWORD PTR[GLOBAL(_mult_sub)] ;a1 - b1
+ movdqa xmm4, xmm3
+ pmaddwd xmm3, XMMWORD PTR[GLOBAL(_5352_2217)] ;c1*2217 + d1*5352
+ pmaddwd xmm4, XMMWORD PTR[GLOBAL(_2217_neg5352)];d1*2217 - c1*5352
+
+ paddd xmm3, XMMWORD PTR[GLOBAL(_14500)]
+ paddd xmm4, XMMWORD PTR[GLOBAL(_7500)]
+ psrad xmm3, 12 ;(c1 * 2217 + d1 * 5352 + 14500)>>12
+ psrad xmm4, 12 ;(d1 * 2217 - c1 * 5352 + 7500)>>12
+
+ packssdw xmm0, xmm1 ;op[2] op[0]
+ packssdw xmm3, xmm4 ;op[3] op[1]
+ ; 23 22 21 20 03 02 01 00
+ ;
+ ; 33 32 31 30 13 12 11 10
+ ;
+ movdqa xmm2, xmm0
+ punpcklqdq xmm0, xmm3 ;13 12 11 10 03 02 01 00
+ punpckhqdq xmm2, xmm3 ;23 22 21 20 33 32 31 30
+
+ movdqa xmm3, xmm0
+ punpcklwd xmm0, xmm2 ;32 30 22 20 12 10 02 00
+ punpckhwd xmm3, xmm2 ;33 31 23 21 13 11 03 01
+ movdqa xmm2, xmm0
+ punpcklwd xmm0, xmm3 ;13 12 11 10 03 02 01 00
+ punpckhwd xmm2, xmm3 ;33 32 31 30 23 22 21 20
+
+ movdqa xmm5, XMMWORD PTR[GLOBAL(_7)]
+ pshufd xmm2, xmm2, 04eh
+ movdqa xmm3, xmm0
+ paddw xmm0, xmm2 ;b1 b1 b1 b1 a1 a1 a1 a1
+ psubw xmm3, xmm2 ;c1 c1 c1 c1 d1 d1 d1 d1
+
+ pshufd xmm0, xmm0, 0d8h ;b1 b1 a1 a1 b1 b1 a1 a1
+ movdqa xmm2, xmm3 ;save d1 for compare
+ pshufd xmm3, xmm3, 0d8h ;c1 c1 d1 d1 c1 c1 d1 d1
+ pshuflw xmm0, xmm0, 0d8h ;b1 b1 a1 a1 b1 a1 b1 a1
+ pshuflw xmm3, xmm3, 0d8h ;c1 c1 d1 d1 c1 d1 c1 d1
+ pshufhw xmm0, xmm0, 0d8h ;b1 a1 b1 a1 b1 a1 b1 a1
+ pshufhw xmm3, xmm3, 0d8h ;c1 d1 c1 d1 c1 d1 c1 d1
+ movdqa xmm1, xmm0
+ pmaddwd xmm0, XMMWORD PTR[GLOBAL(_mult_add)] ;a1 + b1
+ pmaddwd xmm1, XMMWORD PTR[GLOBAL(_mult_sub)] ;a1 - b1
+
+ pxor xmm4, xmm4 ;zero out for compare
+ paddd xmm0, xmm5
+ paddd xmm1, xmm5
+ pcmpeqw xmm2, xmm4
+ psrad xmm0, 4 ;(a1 + b1 + 7)>>4
+ psrad xmm1, 4 ;(a1 - b1 + 7)>>4
+ pandn xmm2, XMMWORD PTR[GLOBAL(_cmp_mask)] ;clear upper,
+ ;and keep bit 0 of lower
+
+ movdqa xmm4, xmm3
+ pmaddwd xmm3, XMMWORD PTR[GLOBAL(_5352_2217)] ;c1*2217 + d1*5352
+ pmaddwd xmm4, XMMWORD PTR[GLOBAL(_2217_neg5352)] ;d1*2217 - c1*5352
+ paddd xmm3, XMMWORD PTR[GLOBAL(_12000)]
+ paddd xmm4, XMMWORD PTR[GLOBAL(_51000)]
+ packssdw xmm0, xmm1 ;op[8] op[0]
+ psrad xmm3, 16 ;(c1 * 2217 + d1 * 5352 + 12000)>>16
+ psrad xmm4, 16 ;(d1 * 2217 - c1 * 5352 + 51000)>>16
+
+ packssdw xmm3, xmm4 ;op[12] op[4]
+ movdqa xmm1, xmm0
+ paddw xmm3, xmm2 ;op[4] += (d1!=0)
+ punpcklqdq xmm0, xmm3 ;op[4] op[0]
+ punpckhqdq xmm1, xmm3 ;op[12] op[8]
+
+ movdqa XMMWORD PTR[output + 0], xmm0
+ movdqa XMMWORD PTR[output + 16], xmm1
+
+ STACK_FRAME_DESTROY
+
+;void vp9_short_fdct8x4_sse2(short *input, short *output, int pitch)
+global sym(vp9_short_fdct8x4_sse2)
+sym(vp9_short_fdct8x4_sse2):
+
+ STACK_FRAME_CREATE
+
+ ; read the input data
+ movdqa xmm0, [input ]
+ movdqa xmm2, [input+ pitch]
+ lea input, [input+2*pitch]
+ movdqa xmm4, [input ]
+ movdqa xmm3, [input+ pitch]
+
+ ; transpose for the first stage
+ movdqa xmm1, xmm0 ; 00 01 02 03 04 05 06 07
+ movdqa xmm5, xmm4 ; 20 21 22 23 24 25 26 27
+
+ punpcklwd xmm0, xmm2 ; 00 10 01 11 02 12 03 13
+ punpckhwd xmm1, xmm2 ; 04 14 05 15 06 16 07 17
+
+ punpcklwd xmm4, xmm3 ; 20 30 21 31 22 32 23 33
+ punpckhwd xmm5, xmm3 ; 24 34 25 35 26 36 27 37
+
+ movdqa xmm2, xmm0 ; 00 10 01 11 02 12 03 13
+ punpckldq xmm0, xmm4 ; 00 10 20 30 01 11 21 31
+
+ punpckhdq xmm2, xmm4 ; 02 12 22 32 03 13 23 33
+
+ movdqa xmm4, xmm1 ; 04 14 05 15 06 16 07 17
+ punpckldq xmm4, xmm5 ; 04 14 24 34 05 15 25 35
+
+ punpckhdq xmm1, xmm5 ; 06 16 26 36 07 17 27 37
+ movdqa xmm3, xmm2 ; 02 12 22 32 03 13 23 33
+
+ punpckhqdq xmm3, xmm1 ; 03 13 23 33 07 17 27 37
+ punpcklqdq xmm2, xmm1 ; 02 12 22 32 06 16 26 36
+
+ movdqa xmm1, xmm0 ; 00 10 20 30 01 11 21 31
+ punpcklqdq xmm0, xmm4 ; 00 10 20 30 04 14 24 34
+
+ punpckhqdq xmm1, xmm4 ; 01 11 21 31 05 15 25 35
+
+ ; xmm0 0
+ ; xmm1 1
+ ; xmm2 2
+ ; xmm3 3
+
+ ; first stage
+ movdqa xmm5, xmm0
+ movdqa xmm4, xmm1
+
+ paddw xmm0, xmm3 ; a1 = 0 + 3
+ paddw xmm1, xmm2 ; b1 = 1 + 2
+
+ psubw xmm4, xmm2 ; c1 = 1 - 2
+ psubw xmm5, xmm3 ; d1 = 0 - 3
+
+ psllw xmm5, 3
+ psllw xmm4, 3
+
+ psllw xmm0, 3
+ psllw xmm1, 3
+
+ ; output 0 and 2
+ movdqa xmm2, xmm0 ; a1
+
+ paddw xmm0, xmm1 ; op[0] = a1 + b1
+ psubw xmm2, xmm1 ; op[2] = a1 - b1
+
+ ; output 1 and 3
+ ; interleave c1, d1
+ movdqa xmm1, xmm5 ; d1
+ punpcklwd xmm1, xmm4 ; c1 d1
+ punpckhwd xmm5, xmm4 ; c1 d1
+
+ movdqa xmm3, xmm1
+ movdqa xmm4, xmm5
+
+ pmaddwd xmm1, XMMWORD PTR[GLOBAL (_5352_2217)] ; c1*2217 + d1*5352
+ pmaddwd xmm4, XMMWORD PTR[GLOBAL (_5352_2217)] ; c1*2217 + d1*5352
+
+ pmaddwd xmm3, XMMWORD PTR[GLOBAL(_2217_neg5352)] ; d1*2217 - c1*5352
+ pmaddwd xmm5, XMMWORD PTR[GLOBAL(_2217_neg5352)] ; d1*2217 - c1*5352
+
+ paddd xmm1, XMMWORD PTR[GLOBAL(_14500)]
+ paddd xmm4, XMMWORD PTR[GLOBAL(_14500)]
+ paddd xmm3, XMMWORD PTR[GLOBAL(_7500)]
+ paddd xmm5, XMMWORD PTR[GLOBAL(_7500)]
+
+ psrad xmm1, 12 ; (c1 * 2217 + d1 * 5352 + 14500)>>12
+ psrad xmm4, 12 ; (c1 * 2217 + d1 * 5352 + 14500)>>12
+ psrad xmm3, 12 ; (d1 * 2217 - c1 * 5352 + 7500)>>12
+ psrad xmm5, 12 ; (d1 * 2217 - c1 * 5352 + 7500)>>12
+
+ packssdw xmm1, xmm4 ; op[1]
+ packssdw xmm3, xmm5 ; op[3]
+
+ ; done with vertical
+ ; transpose for the second stage
+ movdqa xmm4, xmm0 ; 00 10 20 30 04 14 24 34
+ movdqa xmm5, xmm2 ; 02 12 22 32 06 16 26 36
+
+ punpcklwd xmm0, xmm1 ; 00 01 10 11 20 21 30 31
+ punpckhwd xmm4, xmm1 ; 04 05 14 15 24 25 34 35
+
+ punpcklwd xmm2, xmm3 ; 02 03 12 13 22 23 32 33
+ punpckhwd xmm5, xmm3 ; 06 07 16 17 26 27 36 37
+
+ movdqa xmm1, xmm0 ; 00 01 10 11 20 21 30 31
+ punpckldq xmm0, xmm2 ; 00 01 02 03 10 11 12 13
+
+ punpckhdq xmm1, xmm2 ; 20 21 22 23 30 31 32 33
+
+ movdqa xmm2, xmm4 ; 04 05 14 15 24 25 34 35
+ punpckldq xmm2, xmm5 ; 04 05 06 07 14 15 16 17
+
+ punpckhdq xmm4, xmm5 ; 24 25 26 27 34 35 36 37
+ movdqa xmm3, xmm1 ; 20 21 22 23 30 31 32 33
+
+ punpckhqdq xmm3, xmm4 ; 30 31 32 33 34 35 36 37
+ punpcklqdq xmm1, xmm4 ; 20 21 22 23 24 25 26 27
+
+ movdqa xmm4, xmm0 ; 00 01 02 03 10 11 12 13
+ punpcklqdq xmm0, xmm2 ; 00 01 02 03 04 05 06 07
+
+ punpckhqdq xmm4, xmm2 ; 10 11 12 13 14 15 16 17
+
+ ; xmm0 0
+ ; xmm1 4
+ ; xmm2 1
+ ; xmm3 3
+
+ movdqa xmm5, xmm0
+ movdqa xmm2, xmm1
+
+ paddw xmm0, xmm3 ; a1 = 0 + 3
+ paddw xmm1, xmm4 ; b1 = 1 + 2
+
+ psubw xmm4, xmm2 ; c1 = 1 - 2
+ psubw xmm5, xmm3 ; d1 = 0 - 3
+
+ pxor xmm6, xmm6 ; zero out for compare
+
+ pcmpeqw xmm6, xmm5 ; d1 != 0
+
+ pandn xmm6, XMMWORD PTR[GLOBAL(_cmp_mask8x4)] ; clear upper,
+ ; and keep bit 0 of lower
+
+ ; output 0 and 2
+ movdqa xmm2, xmm0 ; a1
+
+ paddw xmm0, xmm1 ; a1 + b1
+ psubw xmm2, xmm1 ; a1 - b1
+
+ paddw xmm0, XMMWORD PTR[GLOBAL(_7w)]
+ paddw xmm2, XMMWORD PTR[GLOBAL(_7w)]
+
+ psraw xmm0, 4 ; op[0] = (a1 + b1 + 7)>>4
+ psraw xmm2, 4 ; op[8] = (a1 - b1 + 7)>>4
+
+ ; output 1 and 3
+ ; interleave c1, d1
+ movdqa xmm1, xmm5 ; d1
+ punpcklwd xmm1, xmm4 ; c1 d1
+ punpckhwd xmm5, xmm4 ; c1 d1
+
+ movdqa xmm3, xmm1
+ movdqa xmm4, xmm5
+
+ pmaddwd xmm1, XMMWORD PTR[GLOBAL (_5352_2217)] ; c1*2217 + d1*5352
+ pmaddwd xmm4, XMMWORD PTR[GLOBAL (_5352_2217)] ; c1*2217 + d1*5352
+
+ pmaddwd xmm3, XMMWORD PTR[GLOBAL(_2217_neg5352)] ; d1*2217 - c1*5352
+ pmaddwd xmm5, XMMWORD PTR[GLOBAL(_2217_neg5352)] ; d1*2217 - c1*5352
+
+ paddd xmm1, XMMWORD PTR[GLOBAL(_12000)]
+ paddd xmm4, XMMWORD PTR[GLOBAL(_12000)]
+ paddd xmm3, XMMWORD PTR[GLOBAL(_51000)]
+ paddd xmm5, XMMWORD PTR[GLOBAL(_51000)]
+
+ psrad xmm1, 16 ; (c1 * 2217 + d1 * 5352 + 12000)>>16
+ psrad xmm4, 16 ; (c1 * 2217 + d1 * 5352 + 12000)>>16
+ psrad xmm3, 16 ; (d1 * 2217 - c1 * 5352 + 51000)>>16
+ psrad xmm5, 16 ; (d1 * 2217 - c1 * 5352 + 51000)>>16
+
+ packssdw xmm1, xmm4 ; op[4]
+ packssdw xmm3, xmm5 ; op[12]
+
+ paddw xmm1, xmm6 ; op[4] += (d1!=0)
+
+ movdqa xmm4, xmm0
+ movdqa xmm5, xmm2
+
+ punpcklqdq xmm0, xmm1
+ punpckhqdq xmm4, xmm1
+
+ punpcklqdq xmm2, xmm3
+ punpckhqdq xmm5, xmm3
+
+ movdqa XMMWORD PTR[output + 0 ], xmm0
+ movdqa XMMWORD PTR[output + 16], xmm2
+ movdqa XMMWORD PTR[output + 32], xmm4
+ movdqa XMMWORD PTR[output + 48], xmm5
+
+ STACK_FRAME_DESTROY
+
+SECTION_RODATA
+align 16
+_5352_2217:
+ dw 5352
+ dw 2217
+ dw 5352
+ dw 2217
+ dw 5352
+ dw 2217
+ dw 5352
+ dw 2217
+align 16
+_2217_neg5352:
+ dw 2217
+ dw -5352
+ dw 2217
+ dw -5352
+ dw 2217
+ dw -5352
+ dw 2217
+ dw -5352
+align 16
+_mult_add:
+ times 8 dw 1
+align 16
+_cmp_mask:
+ times 4 dw 1
+ times 4 dw 0
+align 16
+_cmp_mask8x4:
+ times 8 dw 1
+align 16
+_mult_sub:
+ dw 1
+ dw -1
+ dw 1
+ dw -1
+ dw 1
+ dw -1
+ dw 1
+ dw -1
+align 16
+_7:
+ times 4 dd 7
+align 16
+_7w:
+ times 8 dw 7
+align 16
+_14500:
+ times 4 dd 14500
+align 16
+_7500:
+ times 4 dd 7500
+align 16
+_12000:
+ times 4 dd 12000
+align 16
+_51000:
+ times 4 dd 51000
--- /dev/null
+++ b/vp9/encoder/x86/encodeopt.asm
@@ -1,0 +1,386 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+;int vp9_block_error_xmm(short *coeff_ptr, short *dcoef_ptr)
+global sym(vp9_block_error_xmm)
+sym(vp9_block_error_xmm):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 2
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rsi, arg(0) ;coeff_ptr
+ mov rdi, arg(1) ;dcoef_ptr
+
+ movdqa xmm0, [rsi]
+ movdqa xmm1, [rdi]
+
+ movdqa xmm2, [rsi+16]
+ movdqa xmm3, [rdi+16]
+
+ psubw xmm0, xmm1
+ psubw xmm2, xmm3
+
+ pmaddwd xmm0, xmm0
+ pmaddwd xmm2, xmm2
+
+ paddd xmm0, xmm2
+
+ pxor xmm5, xmm5
+ movdqa xmm1, xmm0
+
+ punpckldq xmm0, xmm5
+ punpckhdq xmm1, xmm5
+
+ paddd xmm0, xmm1
+ movdqa xmm1, xmm0
+
+ psrldq xmm0, 8
+ paddd xmm0, xmm1
+
+ movq rax, xmm0
+
+ pop rdi
+ pop rsi
+ ; begin epilog
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+;int vp9_block_error_mmx(short *coeff_ptr, short *dcoef_ptr)
+global sym(vp9_block_error_mmx)
+sym(vp9_block_error_mmx):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 2
+ push rsi
+ push rdi
+ ; end prolog
+
+
+ mov rsi, arg(0) ;coeff_ptr
+ pxor mm7, mm7
+
+ mov rdi, arg(1) ;dcoef_ptr
+ movq mm3, [rsi]
+
+ movq mm4, [rdi]
+ movq mm5, [rsi+8]
+
+ movq mm6, [rdi+8]
+ pxor mm1, mm1 ; from movd mm1, dc ; dc =0
+
+ movq mm2, mm7
+ psubw mm5, mm6
+
+ por mm1, mm2
+ pmaddwd mm5, mm5
+
+ pcmpeqw mm1, mm7
+ psubw mm3, mm4
+
+ pand mm1, mm3
+ pmaddwd mm1, mm1
+
+ paddd mm1, mm5
+ movq mm3, [rsi+16]
+
+ movq mm4, [rdi+16]
+ movq mm5, [rsi+24]
+
+ movq mm6, [rdi+24]
+ psubw mm5, mm6
+
+ pmaddwd mm5, mm5
+ psubw mm3, mm4
+
+ pmaddwd mm3, mm3
+ paddd mm3, mm5
+
+ paddd mm1, mm3
+ movq mm0, mm1
+
+ psrlq mm1, 32
+ paddd mm0, mm1
+
+ movq rax, mm0
+
+ pop rdi
+ pop rsi
+ ; begin epilog
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+;int vp9_mbblock_error_mmx_impl(short *coeff_ptr, short *dcoef_ptr, int dc);
+global sym(vp9_mbblock_error_mmx_impl)
+sym(vp9_mbblock_error_mmx_impl):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 3
+ push rsi
+ push rdi
+ ; end prolog
+
+
+ mov rsi, arg(0) ;coeff_ptr
+ pxor mm7, mm7
+
+ mov rdi, arg(1) ;dcoef_ptr
+ pxor mm2, mm2
+
+ movd mm1, dword ptr arg(2) ;dc
+ por mm1, mm2
+
+ pcmpeqw mm1, mm7
+ mov rcx, 16
+
+.mberror_loop_mmx:
+ movq mm3, [rsi]
+ movq mm4, [rdi]
+
+ movq mm5, [rsi+8]
+ movq mm6, [rdi+8]
+
+
+ psubw mm5, mm6
+ pmaddwd mm5, mm5
+
+ psubw mm3, mm4
+ pand mm3, mm1
+
+ pmaddwd mm3, mm3
+ paddd mm2, mm5
+
+ paddd mm2, mm3
+ movq mm3, [rsi+16]
+
+ movq mm4, [rdi+16]
+ movq mm5, [rsi+24]
+
+ movq mm6, [rdi+24]
+ psubw mm5, mm6
+
+ pmaddwd mm5, mm5
+ psubw mm3, mm4
+
+ pmaddwd mm3, mm3
+ paddd mm2, mm5
+
+ paddd mm2, mm3
+ add rsi, 32
+
+ add rdi, 32
+ sub rcx, 1
+
+ jnz .mberror_loop_mmx
+
+ movq mm0, mm2
+ psrlq mm2, 32
+
+ paddd mm0, mm2
+ movq rax, mm0
+
+ pop rdi
+ pop rsi
+ ; begin epilog
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+;int vp9_mbblock_error_xmm_impl(short *coeff_ptr, short *dcoef_ptr, int dc);
+global sym(vp9_mbblock_error_xmm_impl)
+sym(vp9_mbblock_error_xmm_impl):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 3
+ SAVE_XMM 6
+ push rsi
+ push rdi
+ ; end prolog
+
+
+ mov rsi, arg(0) ;coeff_ptr
+ pxor xmm6, xmm6
+
+ mov rdi, arg(1) ;dcoef_ptr
+ pxor xmm4, xmm4
+
+ movd xmm5, dword ptr arg(2) ;dc
+ por xmm5, xmm4
+
+ pcmpeqw xmm5, xmm6
+ mov rcx, 16
+
+.mberror_loop:
+ movdqa xmm0, [rsi]
+ movdqa xmm1, [rdi]
+
+ movdqa xmm2, [rsi+16]
+ movdqa xmm3, [rdi+16]
+
+
+ psubw xmm2, xmm3
+ pmaddwd xmm2, xmm2
+
+ psubw xmm0, xmm1
+ pand xmm0, xmm5
+
+ pmaddwd xmm0, xmm0
+ add rsi, 32
+
+ add rdi, 32
+
+ sub rcx, 1
+ paddd xmm4, xmm2
+
+ paddd xmm4, xmm0
+ jnz .mberror_loop
+
+ movdqa xmm0, xmm4
+ punpckldq xmm0, xmm6
+
+ punpckhdq xmm4, xmm6
+ paddd xmm0, xmm4
+
+ movdqa xmm1, xmm0
+ psrldq xmm0, 8
+
+ paddd xmm0, xmm1
+ movq rax, xmm0
+
+ pop rdi
+ pop rsi
+ ; begin epilog
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+;int vp9_mbuverror_mmx_impl(short *s_ptr, short *d_ptr);
+global sym(vp9_mbuverror_mmx_impl)
+sym(vp9_mbuverror_mmx_impl):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 2
+ push rsi
+ push rdi
+ ; end prolog
+
+
+ mov rsi, arg(0) ;s_ptr
+ mov rdi, arg(1) ;d_ptr
+
+ mov rcx, 16
+ pxor mm7, mm7
+
+.mbuverror_loop_mmx:
+
+ movq mm1, [rsi]
+ movq mm2, [rdi]
+
+ psubw mm1, mm2
+ pmaddwd mm1, mm1
+
+
+ movq mm3, [rsi+8]
+ movq mm4, [rdi+8]
+
+ psubw mm3, mm4
+ pmaddwd mm3, mm3
+
+
+ paddd mm7, mm1
+ paddd mm7, mm3
+
+
+ add rsi, 16
+ add rdi, 16
+
+ dec rcx
+ jnz .mbuverror_loop_mmx
+
+ movq mm0, mm7
+ psrlq mm7, 32
+
+ paddd mm0, mm7
+ movq rax, mm0
+
+ pop rdi
+ pop rsi
+ ; begin epilog
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+;int vp9_mbuverror_xmm_impl(short *s_ptr, short *d_ptr);
+global sym(vp9_mbuverror_xmm_impl)
+sym(vp9_mbuverror_xmm_impl):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 2
+ push rsi
+ push rdi
+ ; end prolog
+
+
+ mov rsi, arg(0) ;s_ptr
+ mov rdi, arg(1) ;d_ptr
+
+ mov rcx, 16
+ pxor xmm3, xmm3
+
+.mbuverror_loop:
+
+ movdqa xmm1, [rsi]
+ movdqa xmm2, [rdi]
+
+ psubw xmm1, xmm2
+ pmaddwd xmm1, xmm1
+
+ paddd xmm3, xmm1
+
+ add rsi, 16
+ add rdi, 16
+
+ dec rcx
+ jnz .mbuverror_loop
+
+ pxor xmm0, xmm0
+ movdqa xmm1, xmm3
+
+ movdqa xmm2, xmm1
+ punpckldq xmm1, xmm0
+
+ punpckhdq xmm2, xmm0
+ paddd xmm1, xmm2
+
+ movdqa xmm2, xmm1
+
+ psrldq xmm1, 8
+ paddd xmm1, xmm2
+
+ movq rax, xmm1
+
+ pop rdi
+ pop rsi
+ ; begin epilog
+ UNSHADOW_ARGS
+ pop rbp
+ ret
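The routines above are straight sums of squared differences between original and dequantized transform coefficients: vp9_block_error_* covers one 4x4 block of 16 coefficients, vp9_mbblock_error_*_impl loops over the 16 luma blocks of a macroblock and, when its dc argument is non-zero, masks coefficient 0 of each block out of the sum via the pcmpeqw/pand sequence (presumably because that DC term is coded via the separate Y2 block), and vp9_mbuverror_*_impl does the plain sum over the eight chroma blocks. A scalar sketch of the first two, written from the assembly rather than copied from the C reference:

/* Sum of squared differences between the 16 original and dequantized
 * coefficients of one 4x4 block (scalar equivalent of vp9_block_error_*). */
static int block_error_scalar(const short *coeff, const short *dqcoeff) {
  int i, error = 0;
  for (i = 0; i < 16; i++) {
    const int diff = coeff[i] - dqcoeff[i];
    error += diff * diff;
  }
  return error;
}

/* Scalar equivalent of vp9_mbblock_error_*_impl: iterate the block error
 * over the 16 luma blocks; a non-zero dc argument excludes coefficient 0
 * of each block, mirroring the mask built by pcmpeqw/pand above. */
static int mbblock_error_scalar(const short *coeff, const short *dqcoeff, int dc) {
  int b, i, error = 0;
  for (b = 0; b < 16; b++, coeff += 16, dqcoeff += 16) {
    for (i = dc ? 1 : 0; i < 16; i++) {
      const int diff = coeff[i] - dqcoeff[i];
      error += diff * diff;
    }
  }
  return error;
}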
--- /dev/null
+++ b/vp9/encoder/x86/fwalsh_sse2.asm
@@ -1,0 +1,164 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+;void vp9_short_walsh4x4_sse2(short *input, short *output, int pitch)
+global sym(vp9_short_walsh4x4_sse2)
+sym(vp9_short_walsh4x4_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 3
+ SAVE_XMM 7
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rsi, arg(0) ; input
+ mov rdi, arg(1) ; output
+ movsxd rdx, dword ptr arg(2) ; pitch
+
+ ; first for loop
+ movq xmm0, MMWORD PTR [rsi] ; load input
+ movq xmm1, MMWORD PTR [rsi + rdx]
+ lea rsi, [rsi + rdx*2]
+ movq xmm2, MMWORD PTR [rsi]
+ movq xmm3, MMWORD PTR [rsi + rdx]
+
+ punpcklwd xmm0, xmm1
+ punpcklwd xmm2, xmm3
+
+ movdqa xmm1, xmm0
+ punpckldq xmm0, xmm2 ; ip[1] ip[0]
+ punpckhdq xmm1, xmm2 ; ip[3] ip[2]
+
+ movdqa xmm2, xmm0
+ paddw xmm0, xmm1
+ psubw xmm2, xmm1
+
+ psllw xmm0, 2 ; d1 a1
+ psllw xmm2, 2 ; c1 b1
+
+ movdqa xmm1, xmm0
+ punpcklqdq xmm0, xmm2 ; b1 a1
+ punpckhqdq xmm1, xmm2 ; c1 d1
+
+ pxor xmm6, xmm6
+ movq xmm6, xmm0
+ pxor xmm7, xmm7
+ pcmpeqw xmm7, xmm6
+ paddw xmm7, [GLOBAL(c1)]
+
+ movdqa xmm2, xmm0
+ paddw xmm0, xmm1 ; b1+c1 a1+d1
+ psubw xmm2, xmm1 ; b1-c1 a1-d1
+ paddw xmm0, xmm7 ; b1+c1 a1+d1+(a1!=0)
+
+ ; second for loop
+ ; input: 13 9 5 1 12 8 4 0 (xmm0)
+ ; 14 10 6 2 15 11 7 3 (xmm2)
+ ; after shuffle:
+ ; 13 5 9 1 12 4 8 0 (xmm0)
+ ; 14 6 10 2 15 7 11 3 (xmm1)
+ pshuflw xmm3, xmm0, 0xd8
+ pshufhw xmm0, xmm3, 0xd8
+ pshuflw xmm3, xmm2, 0xd8
+ pshufhw xmm1, xmm3, 0xd8
+
+ movdqa xmm2, xmm0
+ pmaddwd xmm0, [GLOBAL(c1)] ; d11 a11 d10 a10
+ pmaddwd xmm2, [GLOBAL(cn1)] ; c11 b11 c10 b10
+ movdqa xmm3, xmm1
+ pmaddwd xmm1, [GLOBAL(c1)] ; d12 a12 d13 a13
+ pmaddwd xmm3, [GLOBAL(cn1)] ; c12 b12 c13 b13
+
+ pshufd xmm4, xmm0, 0xd8 ; d11 d10 a11 a10
+ pshufd xmm5, xmm2, 0xd8 ; c11 c10 b11 b10
+ pshufd xmm6, xmm1, 0x72 ; d13 d12 a13 a12
+ pshufd xmm7, xmm3, 0x72 ; c13 c12 b13 b12
+
+ movdqa xmm0, xmm4
+ punpcklqdq xmm0, xmm5 ; b11 b10 a11 a10
+ punpckhqdq xmm4, xmm5 ; c11 c10 d11 d10
+ movdqa xmm1, xmm6
+ punpcklqdq xmm1, xmm7 ; b13 b12 a13 a12
+ punpckhqdq xmm6, xmm7 ; c13 c12 d13 d12
+
+ movdqa xmm2, xmm0
+ paddd xmm0, xmm4 ; b21 b20 a21 a20
+ psubd xmm2, xmm4 ; c21 c20 d21 d20
+ movdqa xmm3, xmm1
+ paddd xmm1, xmm6 ; b23 b22 a23 a22
+ psubd xmm3, xmm6 ; c23 c22 d23 d22
+
+ pxor xmm4, xmm4
+ movdqa xmm5, xmm4
+ pcmpgtd xmm4, xmm0
+ pcmpgtd xmm5, xmm2
+ pand xmm4, [GLOBAL(cd1)]
+ pand xmm5, [GLOBAL(cd1)]
+
+ pxor xmm6, xmm6
+ movdqa xmm7, xmm6
+ pcmpgtd xmm6, xmm1
+ pcmpgtd xmm7, xmm3
+ pand xmm6, [GLOBAL(cd1)]
+ pand xmm7, [GLOBAL(cd1)]
+
+ paddd xmm0, xmm4
+ paddd xmm2, xmm5
+ paddd xmm0, [GLOBAL(cd3)]
+ paddd xmm2, [GLOBAL(cd3)]
+ paddd xmm1, xmm6
+ paddd xmm3, xmm7
+ paddd xmm1, [GLOBAL(cd3)]
+ paddd xmm3, [GLOBAL(cd3)]
+
+ psrad xmm0, 3
+ psrad xmm1, 3
+ psrad xmm2, 3
+ psrad xmm3, 3
+ movdqa xmm4, xmm0
+ punpcklqdq xmm0, xmm1 ; a23 a22 a21 a20
+ punpckhqdq xmm4, xmm1 ; b23 b22 b21 b20
+ movdqa xmm5, xmm2
+ punpckhqdq xmm2, xmm3 ; c23 c22 c21 c20
+ punpcklqdq xmm5, xmm3 ; d23 d22 d21 d20
+
+ packssdw xmm0, xmm4 ; b23 b22 b21 b20 a23 a22 a21 a20
+ packssdw xmm2, xmm5 ; d23 d22 d21 d20 c23 c22 c21 c20
+
+ movdqa XMMWORD PTR [rdi], xmm0
+ movdqa XMMWORD PTR [rdi + 16], xmm2
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+SECTION_RODATA
+align 16
+c1:
+ dw 0x0001, 0x0001, 0x0001, 0x0001, 0x0001, 0x0001, 0x0001, 0x0001
+align 16
+cn1:
+ dw 0x0001, 0xffff, 0x0001, 0xffff, 0x0001, 0xffff, 0x0001, 0xffff
+align 16
+cd1:
+ dd 0x00000001, 0x00000001, 0x00000001, 0x00000001
+align 16
+cd3:
+ dd 0x00000003, 0x00000003, 0x00000003, 0x00000003
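
A plain-C sketch of the 4x4 Walsh-Hadamard forward transform that
vp9_short_walsh4x4_sse2 vectorizes may help when reading the shuffles above.
It is reconstructed from the assembly itself (the `(a1 != 0)` fixup is the
pcmpeqw/paddw pair against c1, the `+= (x < 0)` and `(x + 3) >> 3` steps are
the pcmpgtd/cd1/cd3 sequence), so treat it as an illustration rather than the
project's reference implementation; `pitch` is in bytes, as in the assembly.

    static void short_walsh4x4_ref(const short *input, short *output,
                                   int pitch) {
      short tmp[16];
      const short *ip = input;
      short *op = tmp;

      for (int i = 0; i < 4; i++) {            /* first pass: rows */
        const int a1 = (ip[0] + ip[2]) << 2;
        const int d1 = (ip[1] + ip[3]) << 2;
        const int c1 = (ip[1] - ip[3]) << 2;
        const int b1 = (ip[0] - ip[2]) << 2;

        op[0] = (short)(a1 + d1 + (a1 != 0));  /* keep a non-zero DC */
        op[1] = (short)(b1 + c1);
        op[2] = (short)(b1 - c1);
        op[3] = (short)(a1 - d1);

        ip += pitch / 2;                       /* pitch is in bytes */
        op += 4;
      }

      for (int i = 0; i < 4; i++) {            /* second pass: columns */
        const int a1 = tmp[i] + tmp[i + 12];
        const int b1 = tmp[i + 4] + tmp[i + 8];
        const int c1 = tmp[i + 4] - tmp[i + 8];
        const int d1 = tmp[i] - tmp[i + 12];

        int a2 = a1 + b1;
        int b2 = c1 + d1;
        int c2 = a1 - b1;
        int d2 = d1 - c1;

        a2 += (a2 < 0);                        /* bias negatives, then ... */
        b2 += (b2 < 0);
        c2 += (c2 < 0);
        d2 += (d2 < 0);

        output[i]      = (short)((a2 + 3) >> 3); /* ... round and shift by 3 */
        output[i + 4]  = (short)((b2 + 3) >> 3);
        output[i + 8]  = (short)((c2 + 3) >> 3);
        output[i + 12] = (short)((d2 + 3) >> 3);
      }
    }
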
--- /dev/null
+++ b/vp9/encoder/x86/mcomp_x86.h
@@ -1,0 +1,40 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef MCOMP_X86_H
+#define MCOMP_X86_H
+
+#if HAVE_SSE3
+#if !CONFIG_RUNTIME_CPU_DETECT
+
+#undef vp9_search_full_search
+#define vp9_search_full_search vp9_full_search_sadx3
+
+#undef vp9_search_refining_search
+#define vp9_search_refining_search vp9_refining_search_sadx4
+
+#undef vp9_search_diamond_search
+#define vp9_search_diamond_search vp9_diamond_search_sadx4
+
+#endif
+#endif
+
+#if HAVE_SSE4_1
+#if !CONFIG_RUNTIME_CPU_DETECT
+
+#undef vp9_search_full_search
+#define vp9_search_full_search vp9_full_search_sadx8
+
+#endif
+#endif
+
+#endif
+
--- /dev/null
+++ b/vp9/encoder/x86/quantize_mmx.asm
@@ -1,0 +1,286 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+;int vp9_fast_quantize_b_impl_mmx(short *coeff_ptr, short *zbin_ptr,
+; short *qcoeff_ptr,short *dequant_ptr,
+; short *scan_mask, short *round_ptr,
+; short *quant_ptr, short *dqcoeff_ptr);
+global sym(vp9_fast_quantize_b_impl_mmx)
+sym(vp9_fast_quantize_b_impl_mmx):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 8
+ push rsi
+ push rdi
+ ; end prolog
+
+
+ mov rsi, arg(0) ;coeff_ptr
+ movq mm0, [rsi]
+
+ mov rax, arg(1) ;zbin_ptr
+ movq mm1, [rax]
+
+ movq mm3, mm0
+ psraw mm0, 15
+
+ pxor mm3, mm0
+ psubw mm3, mm0 ; abs
+
+ movq mm2, mm3
+ pcmpgtw mm1, mm2
+
+ pandn mm1, mm2
+ movq mm3, mm1
+
+ mov rdx, arg(6) ;quant_ptr
+ movq mm1, [rdx]
+
+ mov rcx, arg(5) ;round_ptr
+ movq mm2, [rcx]
+
+ paddw mm3, mm2
+ pmulhuw mm3, mm1
+
+ pxor mm3, mm0
+        psubw mm3, mm0 ; restore the sign
+
+ mov rdi, arg(2) ;qcoeff_ptr
+ movq mm0, mm3
+
+ movq [rdi], mm3
+
+ mov rax, arg(3) ;dequant_ptr
+ movq mm2, [rax]
+
+ pmullw mm3, mm2
+ mov rax, arg(7) ;dqcoeff_ptr
+
+ movq [rax], mm3
+
+ ; next 8
+ movq mm4, [rsi+8]
+
+ mov rax, arg(1) ;zbin_ptr
+ movq mm5, [rax+8]
+
+ movq mm7, mm4
+ psraw mm4, 15
+
+ pxor mm7, mm4
+ psubw mm7, mm4 ; abs
+
+ movq mm6, mm7
+ pcmpgtw mm5, mm6
+
+ pandn mm5, mm6
+ movq mm7, mm5
+
+ movq mm5, [rdx+8]
+ movq mm6, [rcx+8]
+
+ paddw mm7, mm6
+ pmulhuw mm7, mm5
+
+ pxor mm7, mm4
+        psubw mm7, mm4 ; restore the sign
+
+ mov rdi, arg(2) ;qcoeff_ptr
+
+ movq mm1, mm7
+ movq [rdi+8], mm7
+
+ mov rax, arg(3) ;dequant_ptr
+ movq mm6, [rax+8]
+
+ pmullw mm7, mm6
+ mov rax, arg(7) ;dqcoeff_ptr
+
+ movq [rax+8], mm7
+
+
+ ; next 8
+ movq mm4, [rsi+16]
+
+ mov rax, arg(1) ;zbin_ptr
+ movq mm5, [rax+16]
+
+ movq mm7, mm4
+ psraw mm4, 15
+
+ pxor mm7, mm4
+ psubw mm7, mm4 ; abs
+
+ movq mm6, mm7
+ pcmpgtw mm5, mm6
+
+ pandn mm5, mm6
+ movq mm7, mm5
+
+ movq mm5, [rdx+16]
+ movq mm6, [rcx+16]
+
+ paddw mm7, mm6
+ pmulhuw mm7, mm5
+
+ pxor mm7, mm4
+        psubw mm7, mm4 ; restore the sign
+
+ mov rdi, arg(2) ;qcoeff_ptr
+
+ movq mm1, mm7
+ movq [rdi+16], mm7
+
+ mov rax, arg(3) ;dequant_ptr
+ movq mm6, [rax+16]
+
+ pmullw mm7, mm6
+ mov rax, arg(7) ;dqcoeff_ptr
+
+ movq [rax+16], mm7
+
+
+ ; next 8
+ movq mm4, [rsi+24]
+
+ mov rax, arg(1) ;zbin_ptr
+ movq mm5, [rax+24]
+
+ movq mm7, mm4
+ psraw mm4, 15
+
+ pxor mm7, mm4
+ psubw mm7, mm4 ; abs
+
+ movq mm6, mm7
+ pcmpgtw mm5, mm6
+
+ pandn mm5, mm6
+ movq mm7, mm5
+
+ movq mm5, [rdx+24]
+ movq mm6, [rcx+24]
+
+ paddw mm7, mm6
+ pmulhuw mm7, mm5
+
+ pxor mm7, mm4
+        psubw mm7, mm4 ; restore the sign
+
+ mov rdi, arg(2) ;qcoeff_ptr
+
+ movq mm1, mm7
+ movq [rdi+24], mm7
+
+ mov rax, arg(3) ;dequant_ptr
+ movq mm6, [rax+24]
+
+ pmullw mm7, mm6
+ mov rax, arg(7) ;dqcoeff_ptr
+
+ movq [rax+24], mm7
+
+
+
+ mov rdi, arg(4) ;scan_mask
+ mov rsi, arg(2) ;qcoeff_ptr
+
+ pxor mm5, mm5
+ pxor mm7, mm7
+
+ movq mm0, [rsi]
+ movq mm1, [rsi+8]
+
+ movq mm2, [rdi]
+        movq mm3, [rdi+8]
+
+ pcmpeqw mm0, mm7
+ pcmpeqw mm1, mm7
+
+ pcmpeqw mm6, mm6
+ pxor mm0, mm6
+
+ pxor mm1, mm6
+ psrlw mm0, 15
+
+ psrlw mm1, 15
+ pmaddwd mm0, mm2
+
+ pmaddwd mm1, mm3
+ movq mm5, mm0
+
+ paddd mm5, mm1
+
+ movq mm0, [rsi+16]
+ movq mm1, [rsi+24]
+
+ movq mm2, [rdi+16]
+        movq mm3, [rdi+24]
+
+ pcmpeqw mm0, mm7
+ pcmpeqw mm1, mm7
+
+ pcmpeqw mm6, mm6
+ pxor mm0, mm6
+
+ pxor mm1, mm6
+ psrlw mm0, 15
+
+ psrlw mm1, 15
+ pmaddwd mm0, mm2
+
+ pmaddwd mm1, mm3
+ paddd mm5, mm0
+
+ paddd mm5, mm1
+ movq mm0, mm5
+
+ psrlq mm5, 32
+ paddd mm0, mm5
+
+ ; eob adjustment begins here
+ movq rcx, mm0
+ and rcx, 0xffff
+
+ xor rdx, rdx
+ sub rdx, rcx ; rdx=-rcx
+
+ bsr rax, rcx
+ inc rax
+
+ sar rdx, 31
+ and rax, rdx
+        ; The branchless SSE-style sequence above replaces the old mixed MMX
+        ; assembly/C version; the original logic is kept below for reference
+ ; movq rcx, mm0
+ ; bsr rax, rcx
+ ;
+ ; mov eob, rax
+ ; mov eee, rcx
+ ;
+ ;if(eee==0)
+ ;{
+ ; eob=-1;
+ ;}
+ ;else if(eee<0)
+ ;{
+ ; eob=15;
+ ;}
+ ;d->eob = eob+1;
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ UNSHADOW_ARGS
+ pop rbp
+ ret
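
The bsr/sar/and sequence above computes the eob without the branches of the
reference logic kept in the comments. A C sketch of the equivalent
computation, assuming the scan_mask table holds `1 << zig_zag_position` for
each coefficient so that the pmaddwd/paddd reduction yields a bitmask of
occupied zig-zag positions (an assumption read off this file, not a
documented contract):

    /* mask: bit i set when the coefficient at zig-zag position i quantized
     * to a non-zero value (only the low 16 bits are meaningful). */
    static int eob_from_position_mask(unsigned int mask) {
      mask &= 0xffff;
      for (int i = 15; i >= 0; i--)        /* bsr: highest set bit */
        if (mask & (1u << i))
          return i + 1;                    /* eob = highest position + 1 */
      return 0;                            /* no non-zero coefficient */
    }

The sar/and pair in the assembly folds the all-zero case into the same
straight-line code, so eob becomes 0 without a compare-and-branch.
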
--- /dev/null
+++ b/vp9/encoder/x86/quantize_sse2.asm
@@ -1,0 +1,380 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license and patent
+; grant that can be found in the LICENSE file in the root of the source
+; tree. All contributing project authors may be found in the AUTHORS
+; file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+%include "asm_enc_offsets.asm"
+
+
+; void vp9_regular_quantize_b_sse2 | arg
+; (BLOCK *b, | 0
+; BLOCKD *d) | 1
+
+global sym(vp9_regular_quantize_b_sse2)
+sym(vp9_regular_quantize_b_sse2):
+ push rbp
+ mov rbp, rsp
+ SAVE_XMM 7
+ GET_GOT rbx
+
+%if ABI_IS_32BIT
+ push rdi
+ push rsi
+%else
+ %ifidn __OUTPUT_FORMAT__,x64
+ push rdi
+ push rsi
+ %endif
+%endif
+
+ ALIGN_STACK 16, rax
+ %define zrun_zbin_boost 0 ; 8
+ %define abs_minus_zbin 8 ; 32
+ %define temp_qcoeff 40 ; 32
+ %define qcoeff 72 ; 32
+ %define stack_size 104
+ sub rsp, stack_size
+ ; end prolog
+
+%if ABI_IS_32BIT
+ mov rdi, arg(0) ; BLOCK *b
+ mov rsi, arg(1) ; BLOCKD *d
+%else
+ %ifidn __OUTPUT_FORMAT__,x64
+ mov rdi, rcx ; BLOCK *b
+ mov rsi, rdx ; BLOCKD *d
+ %else
+ ;mov rdi, rdi ; BLOCK *b
+ ;mov rsi, rsi ; BLOCKD *d
+ %endif
+%endif
+
+ mov rdx, [rdi + vp9_block_coeff] ; coeff_ptr
+ mov rcx, [rdi + vp9_block_zbin] ; zbin_ptr
+ movd xmm7, [rdi + vp9_block_zbin_extra] ; zbin_oq_value
+
+ ; z
+ movdqa xmm0, [rdx]
+ movdqa xmm4, [rdx + 16]
+ mov rdx, [rdi + vp9_block_round] ; round_ptr
+
+ pshuflw xmm7, xmm7, 0
+ punpcklwd xmm7, xmm7 ; duplicated zbin_oq_value
+
+ movdqa xmm1, xmm0
+ movdqa xmm5, xmm4
+
+ ; sz
+ psraw xmm0, 15
+ psraw xmm4, 15
+
+ ; (z ^ sz)
+ pxor xmm1, xmm0
+ pxor xmm5, xmm4
+
+ ; x = abs(z)
+ psubw xmm1, xmm0
+ psubw xmm5, xmm4
+
+ movdqa xmm2, [rcx]
+ movdqa xmm3, [rcx + 16]
+ mov rcx, [rdi + vp9_block_quant] ; quant_ptr
+
+ ; *zbin_ptr + zbin_oq_value
+ paddw xmm2, xmm7
+ paddw xmm3, xmm7
+
+ ; x - (*zbin_ptr + zbin_oq_value)
+ psubw xmm1, xmm2
+ psubw xmm5, xmm3
+ movdqa [rsp + abs_minus_zbin], xmm1
+ movdqa [rsp + abs_minus_zbin + 16], xmm5
+
+ ; add (zbin_ptr + zbin_oq_value) back
+ paddw xmm1, xmm2
+ paddw xmm5, xmm3
+
+ movdqa xmm2, [rdx]
+ movdqa xmm6, [rdx + 16]
+
+ movdqa xmm3, [rcx]
+ movdqa xmm7, [rcx + 16]
+
+ ; x + round
+ paddw xmm1, xmm2
+ paddw xmm5, xmm6
+
+ ; y = x * quant_ptr >> 16
+ pmulhw xmm3, xmm1
+ pmulhw xmm7, xmm5
+
+ ; y += x
+ paddw xmm1, xmm3
+ paddw xmm5, xmm7
+
+ movdqa [rsp + temp_qcoeff], xmm1
+ movdqa [rsp + temp_qcoeff + 16], xmm5
+
+ pxor xmm6, xmm6
+ ; zero qcoeff
+ movdqa [rsp + qcoeff], xmm6
+ movdqa [rsp + qcoeff + 16], xmm6
+
+ mov rdx, [rdi + vp9_block_zrun_zbin_boost] ; zbin_boost_ptr
+ mov rax, [rdi + vp9_block_quant_shift] ; quant_shift_ptr
+ mov [rsp + zrun_zbin_boost], rdx
+
+%macro ZIGZAG_LOOP 1
+ ; x
+ movsx ecx, WORD PTR[rsp + abs_minus_zbin + %1 * 2]
+
+ ; if (x >= zbin)
+ sub cx, WORD PTR[rdx] ; x - zbin
+ lea rdx, [rdx + 2] ; zbin_boost_ptr++
+ jl .rq_zigzag_loop_%1 ; x < zbin
+
+ movsx edi, WORD PTR[rsp + temp_qcoeff + %1 * 2]
+
+ ; downshift by quant_shift[rc]
+ movsx cx, BYTE PTR[rax + %1] ; quant_shift_ptr[rc]
+ sar edi, cl ; also sets Z bit
+ je .rq_zigzag_loop_%1 ; !y
+ mov WORD PTR[rsp + qcoeff + %1 * 2], di ;qcoeff_ptr[rc] = temp_qcoeff[rc]
+ mov rdx, [rsp + zrun_zbin_boost] ; reset to b->zrun_zbin_boost
+.rq_zigzag_loop_%1:
+%endmacro
+; in vp9_default_zig_zag1d order: see vp9/common/entropy.c
+ZIGZAG_LOOP 0
+ZIGZAG_LOOP 1
+ZIGZAG_LOOP 4
+ZIGZAG_LOOP 8
+ZIGZAG_LOOP 5
+ZIGZAG_LOOP 2
+ZIGZAG_LOOP 3
+ZIGZAG_LOOP 6
+ZIGZAG_LOOP 9
+ZIGZAG_LOOP 12
+ZIGZAG_LOOP 13
+ZIGZAG_LOOP 10
+ZIGZAG_LOOP 7
+ZIGZAG_LOOP 11
+ZIGZAG_LOOP 14
+ZIGZAG_LOOP 15
+
+ movdqa xmm2, [rsp + qcoeff]
+ movdqa xmm3, [rsp + qcoeff + 16]
+
+ mov rcx, [rsi + vp9_blockd_dequant] ; dequant_ptr
+ mov rdi, [rsi + vp9_blockd_dqcoeff] ; dqcoeff_ptr
+
+ ; y ^ sz
+ pxor xmm2, xmm0
+ pxor xmm3, xmm4
+ ; x = (y ^ sz) - sz
+ psubw xmm2, xmm0
+ psubw xmm3, xmm4
+
+ ; dequant
+ movdqa xmm0, [rcx]
+ movdqa xmm1, [rcx + 16]
+
+ mov rcx, [rsi + vp9_blockd_qcoeff] ; qcoeff_ptr
+
+ pmullw xmm0, xmm2
+ pmullw xmm1, xmm3
+
+ movdqa [rcx], xmm2 ; store qcoeff
+ movdqa [rcx + 16], xmm3
+ movdqa [rdi], xmm0 ; store dqcoeff
+ movdqa [rdi + 16], xmm1
+
+ ; select the last value (in zig_zag order) for EOB
+ pcmpeqw xmm2, xmm6
+ pcmpeqw xmm3, xmm6
+ ; !
+ pcmpeqw xmm6, xmm6
+ pxor xmm2, xmm6
+ pxor xmm3, xmm6
+ ; mask inv_zig_zag
+ pand xmm2, [GLOBAL(inv_zig_zag)]
+ pand xmm3, [GLOBAL(inv_zig_zag + 16)]
+ ; select the max value
+ pmaxsw xmm2, xmm3
+ pshufd xmm3, xmm2, 00001110b
+ pmaxsw xmm2, xmm3
+ pshuflw xmm3, xmm2, 00001110b
+ pmaxsw xmm2, xmm3
+ pshuflw xmm3, xmm2, 00000001b
+ pmaxsw xmm2, xmm3
+ movd eax, xmm2
+ and eax, 0xff
+ mov [rsi + vp9_blockd_eob], eax
+
+ ; begin epilog
+ add rsp, stack_size
+ pop rsp
+%if ABI_IS_32BIT
+ pop rsi
+ pop rdi
+%else
+ %ifidn __OUTPUT_FORMAT__,x64
+ pop rsi
+ pop rdi
+ %endif
+%endif
+ RESTORE_GOT
+ RESTORE_XMM
+ pop rbp
+ ret
+
+; void vp9_fast_quantize_b_sse2 | arg
+; (BLOCK *b, | 0
+; BLOCKD *d) | 1
+
+global sym(vp9_fast_quantize_b_sse2)
+sym(vp9_fast_quantize_b_sse2):
+ push rbp
+ mov rbp, rsp
+ GET_GOT rbx
+
+%if ABI_IS_32BIT
+ push rdi
+ push rsi
+%else
+ %ifidn __OUTPUT_FORMAT__,x64
+ push rdi
+ push rsi
+ %else
+ ; these registers are used for passing arguments
+ %endif
+%endif
+
+ ; end prolog
+
+%if ABI_IS_32BIT
+ mov rdi, arg(0) ; BLOCK *b
+ mov rsi, arg(1) ; BLOCKD *d
+%else
+ %ifidn __OUTPUT_FORMAT__,x64
+ mov rdi, rcx ; BLOCK *b
+ mov rsi, rdx ; BLOCKD *d
+ %else
+ ;mov rdi, rdi ; BLOCK *b
+ ;mov rsi, rsi ; BLOCKD *d
+ %endif
+%endif
+
+ mov rax, [rdi + vp9_block_coeff]
+ mov rcx, [rdi + vp9_block_round]
+ mov rdx, [rdi + vp9_block_quant_fast]
+
+ ; z = coeff
+ movdqa xmm0, [rax]
+ movdqa xmm4, [rax + 16]
+
+ ; dup z so we can save sz
+ movdqa xmm1, xmm0
+ movdqa xmm5, xmm4
+
+ ; sz = z >> 15
+ psraw xmm0, 15
+ psraw xmm4, 15
+
+ ; x = abs(z) = (z ^ sz) - sz
+ pxor xmm1, xmm0
+ pxor xmm5, xmm4
+ psubw xmm1, xmm0
+ psubw xmm5, xmm4
+
+ ; x += round
+ paddw xmm1, [rcx]
+ paddw xmm5, [rcx + 16]
+
+ mov rax, [rsi + vp9_blockd_qcoeff]
+ mov rcx, [rsi + vp9_blockd_dequant]
+ mov rdi, [rsi + vp9_blockd_dqcoeff]
+
+ ; y = x * quant >> 16
+ pmulhw xmm1, [rdx]
+ pmulhw xmm5, [rdx + 16]
+
+ ; x = (y ^ sz) - sz
+ pxor xmm1, xmm0
+ pxor xmm5, xmm4
+ psubw xmm1, xmm0
+ psubw xmm5, xmm4
+
+ ; qcoeff = x
+ movdqa [rax], xmm1
+ movdqa [rax + 16], xmm5
+
+ ; x * dequant
+ movdqa xmm2, xmm1
+ movdqa xmm3, xmm5
+ pmullw xmm2, [rcx]
+ pmullw xmm3, [rcx + 16]
+
+ ; dqcoeff = x * dequant
+ movdqa [rdi], xmm2
+ movdqa [rdi + 16], xmm3
+
+ pxor xmm4, xmm4 ;clear all bits
+ pcmpeqw xmm1, xmm4
+ pcmpeqw xmm5, xmm4
+
+ pcmpeqw xmm4, xmm4 ;set all bits
+ pxor xmm1, xmm4
+ pxor xmm5, xmm4
+
+ pand xmm1, [GLOBAL(inv_zig_zag)]
+ pand xmm5, [GLOBAL(inv_zig_zag + 16)]
+
+ pmaxsw xmm1, xmm5
+
+ ; now down to 8
+ pshufd xmm5, xmm1, 00001110b
+
+ pmaxsw xmm1, xmm5
+
+ ; only 4 left
+ pshuflw xmm5, xmm1, 00001110b
+
+ pmaxsw xmm1, xmm5
+
+ ; okay, just 2!
+ pshuflw xmm5, xmm1, 00000001b
+
+ pmaxsw xmm1, xmm5
+
+ movd eax, xmm1
+ and eax, 0xff
+ mov [rsi + vp9_blockd_eob], eax
+
+ ; begin epilog
+%if ABI_IS_32BIT
+ pop rsi
+ pop rdi
+%else
+ %ifidn __OUTPUT_FORMAT__,x64
+ pop rsi
+ pop rdi
+ %endif
+%endif
+
+ RESTORE_GOT
+ pop rbp
+ ret
+
+SECTION_RODATA
+align 16
+inv_zig_zag:
+ dw 0x0001, 0x0002, 0x0006, 0x0007
+ dw 0x0003, 0x0005, 0x0008, 0x000d
+ dw 0x0004, 0x0009, 0x000c, 0x000e
+ dw 0x000a, 0x000b, 0x000f, 0x0010
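
For orientation, a plain-C sketch of the per-coefficient decision that each
ZIGZAG_LOOP expansion above makes. It is reconstructed from the assembly;
parameter names are illustrative, and the sign restore, dequantization and
eob selection happen afterwards in bulk, exactly as in the vector code.

    static short quantize_one(short abs_minus_zbin, /* |z| - (zbin + zbin_oq) */
                              short temp_qcoeff,    /* ((x + round) * q >> 16) + x */
                              unsigned char quant_shift,
                              const short **boost_ptr,
                              const short *boost_base) {
      const int x = abs_minus_zbin - **boost_ptr;  /* apply the zero-run boost */
      (*boost_ptr)++;                              /* boost grows along a zero run */
      if (x < 0)
        return 0;                                  /* below the (boosted) zbin */

      const int y = temp_qcoeff >> quant_shift;    /* final downshift */
      if (y != 0)
        *boost_ptr = boost_base;                   /* non-zero coeff resets the run */
      return (short)y;
    }

After the unrolled loop the eob is selected by masking the non-zero positions
with inv_zig_zag and taking the maximum, which is equivalent to the index of
the last non-zero coefficient in scan order plus one.
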
--- /dev/null
+++ b/vp9/encoder/x86/quantize_sse4.asm
@@ -1,0 +1,254 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license and patent
+; grant that can be found in the LICENSE file in the root of the source
+; tree. All contributing project authors may be found in the AUTHORS
+; file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+%include "asm_enc_offsets.asm"
+
+
+; void vp9_regular_quantize_b_sse4 | arg
+; (BLOCK *b, | 0
+; BLOCKD *d) | 1
+
+global sym(vp9_regular_quantize_b_sse4)
+sym(vp9_regular_quantize_b_sse4):
+
+%if ABI_IS_32BIT
+ push rbp
+ mov rbp, rsp
+ GET_GOT rbx
+ push rdi
+ push rsi
+
+ ALIGN_STACK 16, rax
+ %define qcoeff 0 ; 32
+ %define stack_size 32
+ sub rsp, stack_size
+%else
+ %ifidn __OUTPUT_FORMAT__,x64
+ SAVE_XMM 8, u
+ push rdi
+ push rsi
+ %endif
+%endif
+ ; end prolog
+
+%if ABI_IS_32BIT
+ mov rdi, arg(0) ; BLOCK *b
+ mov rsi, arg(1) ; BLOCKD *d
+%else
+ %ifidn __OUTPUT_FORMAT__,x64
+ mov rdi, rcx ; BLOCK *b
+ mov rsi, rdx ; BLOCKD *d
+ %else
+ ;mov rdi, rdi ; BLOCK *b
+ ;mov rsi, rsi ; BLOCKD *d
+ %endif
+%endif
+
+ mov rax, [rdi + vp9_block_coeff]
+ mov rcx, [rdi + vp9_block_zbin]
+ mov rdx, [rdi + vp9_block_round]
+ movd xmm7, [rdi + vp9_block_zbin_extra]
+
+ ; z
+ movdqa xmm0, [rax]
+ movdqa xmm1, [rax + 16]
+
+ ; duplicate zbin_oq_value
+ pshuflw xmm7, xmm7, 0
+ punpcklwd xmm7, xmm7
+
+ movdqa xmm2, xmm0
+ movdqa xmm3, xmm1
+
+ ; sz
+ psraw xmm0, 15
+ psraw xmm1, 15
+
+ ; (z ^ sz)
+ pxor xmm2, xmm0
+ pxor xmm3, xmm1
+
+ ; x = abs(z)
+ psubw xmm2, xmm0
+ psubw xmm3, xmm1
+
+ ; zbin
+ movdqa xmm4, [rcx]
+ movdqa xmm5, [rcx + 16]
+
+ ; *zbin_ptr + zbin_oq_value
+ paddw xmm4, xmm7
+ paddw xmm5, xmm7
+
+ movdqa xmm6, xmm2
+ movdqa xmm7, xmm3
+
+ ; x - (*zbin_ptr + zbin_oq_value)
+ psubw xmm6, xmm4
+ psubw xmm7, xmm5
+
+ ; round
+ movdqa xmm4, [rdx]
+ movdqa xmm5, [rdx + 16]
+
+ mov rax, [rdi + vp9_block_quant_shift]
+ mov rcx, [rdi + vp9_block_quant]
+ mov rdx, [rdi + vp9_block_zrun_zbin_boost]
+
+ ; x + round
+ paddw xmm2, xmm4
+ paddw xmm3, xmm5
+
+ ; quant
+ movdqa xmm4, [rcx]
+ movdqa xmm5, [rcx + 16]
+
+ ; y = x * quant_ptr >> 16
+ pmulhw xmm4, xmm2
+ pmulhw xmm5, xmm3
+
+ ; y += x
+ paddw xmm2, xmm4
+ paddw xmm3, xmm5
+
+ pxor xmm4, xmm4
+%if ABI_IS_32BIT
+ movdqa [rsp + qcoeff], xmm4
+ movdqa [rsp + qcoeff + 16], xmm4
+%else
+ pxor xmm8, xmm8
+%endif
+
+ ; quant_shift
+ movdqa xmm5, [rax]
+
+ ; zrun_zbin_boost
+ mov rax, rdx
+
+%macro ZIGZAG_LOOP 5
+ ; x
+ pextrw ecx, %4, %2
+
+ ; if (x >= zbin)
+ sub cx, WORD PTR[rdx] ; x - zbin
+ lea rdx, [rdx + 2] ; zbin_boost_ptr++
+ jl .rq_zigzag_loop_%1 ; x < zbin
+
+ pextrw edi, %3, %2 ; y
+
+ ; downshift by quant_shift[rc]
+ pextrb ecx, xmm5, %1 ; quant_shift[rc]
+ sar edi, cl ; also sets Z bit
+ je .rq_zigzag_loop_%1 ; !y
+%if ABI_IS_32BIT
+ mov WORD PTR[rsp + qcoeff + %1 *2], di
+%else
+ pinsrw %5, edi, %2 ; qcoeff[rc]
+%endif
+ mov rdx, rax ; reset to b->zrun_zbin_boost
+.rq_zigzag_loop_%1:
+%endmacro
+; in vp9_default_zig_zag1d order: see vp9/common/entropy.c
+ZIGZAG_LOOP 0, 0, xmm2, xmm6, xmm4
+ZIGZAG_LOOP 1, 1, xmm2, xmm6, xmm4
+ZIGZAG_LOOP 4, 4, xmm2, xmm6, xmm4
+ZIGZAG_LOOP 8, 0, xmm3, xmm7, xmm8
+ZIGZAG_LOOP 5, 5, xmm2, xmm6, xmm4
+ZIGZAG_LOOP 2, 2, xmm2, xmm6, xmm4
+ZIGZAG_LOOP 3, 3, xmm2, xmm6, xmm4
+ZIGZAG_LOOP 6, 6, xmm2, xmm6, xmm4
+ZIGZAG_LOOP 9, 1, xmm3, xmm7, xmm8
+ZIGZAG_LOOP 12, 4, xmm3, xmm7, xmm8
+ZIGZAG_LOOP 13, 5, xmm3, xmm7, xmm8
+ZIGZAG_LOOP 10, 2, xmm3, xmm7, xmm8
+ZIGZAG_LOOP 7, 7, xmm2, xmm6, xmm4
+ZIGZAG_LOOP 11, 3, xmm3, xmm7, xmm8
+ZIGZAG_LOOP 14, 6, xmm3, xmm7, xmm8
+ZIGZAG_LOOP 15, 7, xmm3, xmm7, xmm8
+
+ mov rcx, [rsi + vp9_blockd_dequant]
+ mov rdi, [rsi + vp9_blockd_dqcoeff]
+
+%if ABI_IS_32BIT
+ movdqa xmm4, [rsp + qcoeff]
+ movdqa xmm5, [rsp + qcoeff + 16]
+%else
+ %define xmm5 xmm8
+%endif
+
+ ; y ^ sz
+ pxor xmm4, xmm0
+ pxor xmm5, xmm1
+ ; x = (y ^ sz) - sz
+ psubw xmm4, xmm0
+ psubw xmm5, xmm1
+
+ ; dequant
+ movdqa xmm0, [rcx]
+ movdqa xmm1, [rcx + 16]
+
+ mov rcx, [rsi + vp9_blockd_qcoeff]
+
+ pmullw xmm0, xmm4
+ pmullw xmm1, xmm5
+
+ ; store qcoeff
+ movdqa [rcx], xmm4
+ movdqa [rcx + 16], xmm5
+
+ ; store dqcoeff
+ movdqa [rdi], xmm0
+ movdqa [rdi + 16], xmm1
+
+ ; select the last value (in zig_zag order) for EOB
+ pxor xmm6, xmm6
+ pcmpeqw xmm4, xmm6
+ pcmpeqw xmm5, xmm6
+
+ packsswb xmm4, xmm5
+ pshufb xmm4, [GLOBAL(zig_zag1d)]
+ pmovmskb edx, xmm4
+ xor rdi, rdi
+ mov eax, -1
+ xor dx, ax
+ bsr eax, edx
+ sub edi, edx
+ sar edi, 31
+ add eax, 1
+ and eax, edi
+
+ mov [rsi + vp9_blockd_eob], eax
+
+ ; begin epilog
+%if ABI_IS_32BIT
+ add rsp, stack_size
+ pop rsp
+
+ pop rsi
+ pop rdi
+ RESTORE_GOT
+ pop rbp
+%else
+ %undef xmm5
+ %ifidn __OUTPUT_FORMAT__,x64
+ pop rsi
+ pop rdi
+ RESTORE_XMM
+ %endif
+%endif
+
+ ret
+
+SECTION_RODATA
+align 16
+; vp9/common/entropy.c: vp9_default_zig_zag1d
+zig_zag1d:
+ db 0, 1, 4, 8, 5, 2, 3, 6, 9, 12, 13, 10, 7, 11, 14, 15
--- /dev/null
+++ b/vp9/encoder/x86/quantize_ssse3.asm
@@ -1,0 +1,138 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license and patent
+; grant that can be found in the LICENSE file in the root of the source
+; tree. All contributing project authors may be found in the AUTHORS
+; file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+%include "asm_enc_offsets.asm"
+
+
+; void vp9_fast_quantize_b_ssse3 | arg
+; (BLOCK *b, | 0
+; BLOCKD *d) | 1
+;
+
+global sym(vp9_fast_quantize_b_ssse3)
+sym(vp9_fast_quantize_b_ssse3):
+ push rbp
+ mov rbp, rsp
+ GET_GOT rbx
+
+%if ABI_IS_32BIT
+ push rdi
+ push rsi
+%else
+ %ifidn __OUTPUT_FORMAT__,x64
+ push rdi
+ push rsi
+ %endif
+%endif
+ ; end prolog
+
+%if ABI_IS_32BIT
+ mov rdi, arg(0) ; BLOCK *b
+ mov rsi, arg(1) ; BLOCKD *d
+%else
+ %ifidn __OUTPUT_FORMAT__,x64
+ mov rdi, rcx ; BLOCK *b
+ mov rsi, rdx ; BLOCKD *d
+ %else
+ ;mov rdi, rdi ; BLOCK *b
+ ;mov rsi, rsi ; BLOCKD *d
+ %endif
+%endif
+
+ mov rax, [rdi + vp9_block_coeff]
+ mov rcx, [rdi + vp9_block_round]
+ mov rdx, [rdi + vp9_block_quant_fast]
+
+ ; coeff
+ movdqa xmm0, [rax]
+ movdqa xmm4, [rax + 16]
+
+ ; round
+ movdqa xmm2, [rcx]
+ movdqa xmm3, [rcx + 16]
+
+ movdqa xmm1, xmm0
+ movdqa xmm5, xmm4
+
+ ; sz = z >> 15
+ psraw xmm0, 15
+ psraw xmm4, 15
+
+ pabsw xmm1, xmm1
+ pabsw xmm5, xmm5
+
+ paddw xmm1, xmm2
+ paddw xmm5, xmm3
+
+ ; quant_fast
+ pmulhw xmm1, [rdx]
+ pmulhw xmm5, [rdx + 16]
+
+ mov rax, [rsi + vp9_blockd_qcoeff]
+ mov rdi, [rsi + vp9_blockd_dequant]
+ mov rcx, [rsi + vp9_blockd_dqcoeff]
+
+ pxor xmm1, xmm0
+ pxor xmm5, xmm4
+ psubw xmm1, xmm0
+ psubw xmm5, xmm4
+
+ movdqa [rax], xmm1
+ movdqa [rax + 16], xmm5
+
+ movdqa xmm2, [rdi]
+ movdqa xmm3, [rdi + 16]
+
+ pxor xmm4, xmm4
+ pmullw xmm2, xmm1
+ pmullw xmm3, xmm5
+
+        pcmpeqw xmm1, xmm4 ;mask of zero coefficients (flipped below)
+        pcmpeqw xmm5, xmm4 ;mask of zero coefficients (flipped below)
+ packsswb xmm1, xmm5
+ pshufb xmm1, [GLOBAL(zz_shuf)]
+
+ pmovmskb edx, xmm1
+
+ xor rdi, rdi
+ mov eax, -1
+ xor dx, ax ;flip the bits for bsr
+ bsr eax, edx
+
+ movdqa [rcx], xmm2 ;store dqcoeff
+ movdqa [rcx + 16], xmm3 ;store dqcoeff
+
+ sub edi, edx ;check for all zeros in bit mask
+ sar edi, 31 ;0 or -1
+ add eax, 1
+ and eax, edi ;if the bit mask was all zero,
+ ;then eob = 0
+ mov [rsi + vp9_blockd_eob], eax
+
+ ; begin epilog
+%if ABI_IS_32BIT
+ pop rsi
+ pop rdi
+%else
+ %ifidn __OUTPUT_FORMAT__,x64
+ pop rsi
+ pop rdi
+ %endif
+%endif
+
+ RESTORE_GOT
+ pop rbp
+ ret
+
+SECTION_RODATA
+align 16
+zz_shuf:
+ db 0, 1, 4, 8, 5, 2, 3, 6, 9, 12, 13, 10, 7, 11, 14, 15
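
The SSSE3 fast path drops the zbin test entirely. A hedged C sketch of what
it computes per block, reconstructed from the assembly (pointer names and the
zig_zag table argument are illustrative):

    #include <stdlib.h>

    static int fast_quantize_ref(const short *coeff, const short *round,
                                 const short *quant_fast, const short *dequant,
                                 const int *zig_zag,   /* 16-entry scan order */
                                 short *qcoeff, short *dqcoeff) {
      int eob = 0;
      for (int i = 0; i < 16; i++) {
        const int rc = zig_zag[i];
        const int z = coeff[rc];
        const int x = abs(z) + round[rc];
        int y = (x * quant_fast[rc]) >> 16;    /* pmulhw */
        if (z < 0)
          y = -y;                              /* restore the sign */
        qcoeff[rc] = (short)y;
        dqcoeff[rc] = (short)(y * dequant[rc]);
        if (y)
          eob = i + 1;                         /* last non-zero in scan order */
      }
      return eob;
    }

The packsswb/pshufb/pmovmskb/bsr sequence above reaches the same eob without
iterating in scan order: it builds a 16-bit mask of non-zero positions already
permuted by zz_shuf and takes the highest set bit plus one.
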
--- /dev/null
+++ b/vp9/encoder/x86/quantize_x86.h
@@ -1,0 +1,48 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license and patent
+ * grant that can be found in the LICENSE file in the root of the source
+ * tree. All contributing project authors may be found in the AUTHORS
+ * file in the root of the source tree.
+ */
+
+#ifndef QUANTIZE_X86_H
+#define QUANTIZE_X86_H
+
+
+/* Note:
+ *
+ * This platform is commonly built for runtime CPU detection. If you modify
+ * any of the function mappings present in this file, be sure to also update
+ * them in the function pointer initialization code
+ */
+#if HAVE_MMX
+
+#endif /* HAVE_MMX */
+
+
+#if HAVE_SSE2
+extern prototype_quantize_block(vp9_regular_quantize_b_sse2);
+#if !CONFIG_RUNTIME_CPU_DETECT
+
+#undef vp9_quantize_quantb
+#define vp9_quantize_quantb vp9_regular_quantize_b_sse2
+#endif /* !CONFIG_RUNTIME_CPU_DETECT */
+
+#endif /* HAVE_SSE2 */
+
+
+#if HAVE_SSE4_1
+extern prototype_quantize_block(vp9_regular_quantize_b_sse4);
+
+#if !CONFIG_RUNTIME_CPU_DETECT
+
+#undef vp9_quantize_quantb
+#define vp9_quantize_quantb vp9_regular_quantize_b_sse4
+
+#endif /* !CONFIG_RUNTIME_CPU_DETECT */
+
+#endif /* HAVE_SSE4_1 */
+
+#endif /* QUANTIZE_X86_H */
--- /dev/null
+++ b/vp9/encoder/x86/sad_mmx.asm
@@ -1,0 +1,427 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+global sym(vp9_sad16x16_mmx)
+global sym(vp9_sad8x16_mmx)
+global sym(vp9_sad8x8_mmx)
+global sym(vp9_sad4x4_mmx)
+global sym(vp9_sad16x8_mmx)
+
+;unsigned int vp9_sad16x16_mmx(
+; unsigned char *src_ptr,
+; int src_stride,
+; unsigned char *ref_ptr,
+; int ref_stride)
+sym(vp9_sad16x16_mmx):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 4
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(2) ;ref_ptr
+
+ movsxd rax, dword ptr arg(1) ;src_stride
+ movsxd rdx, dword ptr arg(3) ;ref_stride
+
+ lea rcx, [rsi+rax*8]
+
+ lea rcx, [rcx+rax*8]
+ pxor mm7, mm7
+
+ pxor mm6, mm6
+
+.x16x16sad_mmx_loop:
+
+ movq mm0, QWORD PTR [rsi]
+ movq mm2, QWORD PTR [rsi+8]
+
+ movq mm1, QWORD PTR [rdi]
+ movq mm3, QWORD PTR [rdi+8]
+
+ movq mm4, mm0
+ movq mm5, mm2
+
+ psubusb mm0, mm1
+ psubusb mm1, mm4
+
+ psubusb mm2, mm3
+ psubusb mm3, mm5
+
+ por mm0, mm1
+ por mm2, mm3
+
+ movq mm1, mm0
+ movq mm3, mm2
+
+ punpcklbw mm0, mm6
+ punpcklbw mm2, mm6
+
+ punpckhbw mm1, mm6
+ punpckhbw mm3, mm6
+
+ paddw mm0, mm2
+ paddw mm1, mm3
+
+
+ lea rsi, [rsi+rax]
+ add rdi, rdx
+
+ paddw mm7, mm0
+ paddw mm7, mm1
+
+ cmp rsi, rcx
+ jne .x16x16sad_mmx_loop
+
+
+ movq mm0, mm7
+
+ punpcklwd mm0, mm6
+ punpckhwd mm7, mm6
+
+ paddw mm0, mm7
+ movq mm7, mm0
+
+
+ psrlq mm0, 32
+ paddw mm7, mm0
+
+ movq rax, mm7
+
+ pop rdi
+ pop rsi
+ mov rsp, rbp
+ ; begin epilog
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+;unsigned int vp9_sad8x16_mmx(
+; unsigned char *src_ptr,
+; int src_stride,
+; unsigned char *ref_ptr,
+; int ref_stride)
+sym(vp9_sad8x16_mmx):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 4
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(2) ;ref_ptr
+
+ movsxd rax, dword ptr arg(1) ;src_stride
+ movsxd rdx, dword ptr arg(3) ;ref_stride
+
+ lea rcx, [rsi+rax*8]
+
+ lea rcx, [rcx+rax*8]
+ pxor mm7, mm7
+
+ pxor mm6, mm6
+
+.x8x16sad_mmx_loop:
+
+ movq mm0, QWORD PTR [rsi]
+ movq mm1, QWORD PTR [rdi]
+
+ movq mm2, mm0
+ psubusb mm0, mm1
+
+ psubusb mm1, mm2
+ por mm0, mm1
+
+ movq mm2, mm0
+ punpcklbw mm0, mm6
+
+ punpckhbw mm2, mm6
+ lea rsi, [rsi+rax]
+
+ add rdi, rdx
+ paddw mm7, mm0
+
+ paddw mm7, mm2
+ cmp rsi, rcx
+
+ jne .x8x16sad_mmx_loop
+
+ movq mm0, mm7
+ punpcklwd mm0, mm6
+
+ punpckhwd mm7, mm6
+ paddw mm0, mm7
+
+ movq mm7, mm0
+ psrlq mm0, 32
+
+ paddw mm7, mm0
+ movq rax, mm7
+
+ pop rdi
+ pop rsi
+ mov rsp, rbp
+ ; begin epilog
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+;unsigned int vp9_sad8x8_mmx(
+; unsigned char *src_ptr,
+; int src_stride,
+; unsigned char *ref_ptr,
+; int ref_stride)
+sym(vp9_sad8x8_mmx):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 4
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(2) ;ref_ptr
+
+ movsxd rax, dword ptr arg(1) ;src_stride
+ movsxd rdx, dword ptr arg(3) ;ref_stride
+
+ lea rcx, [rsi+rax*8]
+ pxor mm7, mm7
+
+ pxor mm6, mm6
+
+.x8x8sad_mmx_loop:
+
+ movq mm0, QWORD PTR [rsi]
+ movq mm1, QWORD PTR [rdi]
+
+ movq mm2, mm0
+ psubusb mm0, mm1
+
+ psubusb mm1, mm2
+ por mm0, mm1
+
+ movq mm2, mm0
+ punpcklbw mm0, mm6
+
+ punpckhbw mm2, mm6
+ paddw mm0, mm2
+
+ lea rsi, [rsi+rax]
+ add rdi, rdx
+
+ paddw mm7, mm0
+ cmp rsi, rcx
+
+ jne .x8x8sad_mmx_loop
+
+ movq mm0, mm7
+ punpcklwd mm0, mm6
+
+ punpckhwd mm7, mm6
+ paddw mm0, mm7
+
+ movq mm7, mm0
+ psrlq mm0, 32
+
+ paddw mm7, mm0
+ movq rax, mm7
+
+ pop rdi
+ pop rsi
+ mov rsp, rbp
+ ; begin epilog
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+;unsigned int vp9_sad4x4_mmx(
+; unsigned char *src_ptr,
+; int src_stride,
+; unsigned char *ref_ptr,
+; int ref_stride)
+sym(vp9_sad4x4_mmx):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 4
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(2) ;ref_ptr
+
+ movsxd rax, dword ptr arg(1) ;src_stride
+ movsxd rdx, dword ptr arg(3) ;ref_stride
+
+ movd mm0, DWORD PTR [rsi]
+ movd mm1, DWORD PTR [rdi]
+
+ movd mm2, DWORD PTR [rsi+rax]
+ movd mm3, DWORD PTR [rdi+rdx]
+
+ punpcklbw mm0, mm2
+ punpcklbw mm1, mm3
+
+ movq mm2, mm0
+ psubusb mm0, mm1
+
+ psubusb mm1, mm2
+ por mm0, mm1
+
+ movq mm2, mm0
+ pxor mm3, mm3
+
+ punpcklbw mm0, mm3
+ punpckhbw mm2, mm3
+
+ paddw mm0, mm2
+
+ lea rsi, [rsi+rax*2]
+ lea rdi, [rdi+rdx*2]
+
+ movd mm4, DWORD PTR [rsi]
+ movd mm5, DWORD PTR [rdi]
+
+ movd mm6, DWORD PTR [rsi+rax]
+ movd mm7, DWORD PTR [rdi+rdx]
+
+ punpcklbw mm4, mm6
+ punpcklbw mm5, mm7
+
+ movq mm6, mm4
+ psubusb mm4, mm5
+
+ psubusb mm5, mm6
+ por mm4, mm5
+
+ movq mm5, mm4
+ punpcklbw mm4, mm3
+
+ punpckhbw mm5, mm3
+ paddw mm4, mm5
+
+ paddw mm0, mm4
+ movq mm1, mm0
+
+ punpcklwd mm0, mm3
+ punpckhwd mm1, mm3
+
+ paddw mm0, mm1
+ movq mm1, mm0
+
+ psrlq mm0, 32
+ paddw mm0, mm1
+
+ movq rax, mm0
+
+ pop rdi
+ pop rsi
+ mov rsp, rbp
+ ; begin epilog
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+;unsigned int vp9_sad16x8_mmx(
+; unsigned char *src_ptr,
+; int src_stride,
+; unsigned char *ref_ptr,
+; int ref_stride)
+sym(vp9_sad16x8_mmx):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 4
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(2) ;ref_ptr
+
+ movsxd rax, dword ptr arg(1) ;src_stride
+ movsxd rdx, dword ptr arg(3) ;ref_stride
+
+ lea rcx, [rsi+rax*8]
+ pxor mm7, mm7
+
+ pxor mm6, mm6
+
+.x16x8sad_mmx_loop:
+
+ movq mm0, [rsi]
+ movq mm1, [rdi]
+
+ movq mm2, [rsi+8]
+ movq mm3, [rdi+8]
+
+ movq mm4, mm0
+ movq mm5, mm2
+
+ psubusb mm0, mm1
+ psubusb mm1, mm4
+
+ psubusb mm2, mm3
+ psubusb mm3, mm5
+
+ por mm0, mm1
+ por mm2, mm3
+
+ movq mm1, mm0
+ movq mm3, mm2
+
+ punpcklbw mm0, mm6
+ punpckhbw mm1, mm6
+
+ punpcklbw mm2, mm6
+ punpckhbw mm3, mm6
+
+
+ paddw mm0, mm2
+ paddw mm1, mm3
+
+ paddw mm0, mm1
+ lea rsi, [rsi+rax]
+
+ add rdi, rdx
+ paddw mm7, mm0
+
+ cmp rsi, rcx
+ jne .x16x8sad_mmx_loop
+
+ movq mm0, mm7
+ punpcklwd mm0, mm6
+
+ punpckhwd mm7, mm6
+ paddw mm0, mm7
+
+ movq mm7, mm0
+ psrlq mm0, 32
+
+ paddw mm7, mm0
+ movq rax, mm7
+
+ pop rdi
+ pop rsi
+ mov rsp, rbp
+ ; begin epilog
+ UNSHADOW_ARGS
+ pop rbp
+ ret
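
All five routines in this file compute the same quantity for different block
sizes. A plain-C reference, offered as a sketch rather than the project's
actual C fallback:

    /* Sum of absolute differences over a w x h block. */
    static unsigned int sad_ref(const unsigned char *src, int src_stride,
                                const unsigned char *ref, int ref_stride,
                                int w, int h) {
      unsigned int sad = 0;
      for (int r = 0; r < h; r++) {
        for (int c = 0; c < w; c++) {
          const int d = src[c] - ref[c];
          sad += (unsigned int)(d < 0 ? -d : d);
        }
        src += src_stride;
        ref += ref_stride;
      }
      return sad;
    }

vp9_sad16x16_mmx corresponds to sad_ref(..., 16, 16), and so on. The MMX code
gets |src - ref| from the psubusb/psubusb/por pair, since unsigned saturating
subtraction zeroes whichever operand order has the wrong sign.
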
--- /dev/null
+++ b/vp9/encoder/x86/sad_sse2.asm
@@ -1,0 +1,410 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+;unsigned int vp9_sad16x16_wmt(
+; unsigned char *src_ptr,
+; int src_stride,
+; unsigned char *ref_ptr,
+; int ref_stride)
+global sym(vp9_sad16x16_wmt)
+sym(vp9_sad16x16_wmt):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 4
+ SAVE_XMM 6
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(2) ;ref_ptr
+
+ movsxd rax, dword ptr arg(1) ;src_stride
+ movsxd rdx, dword ptr arg(3) ;ref_stride
+
+ lea rcx, [rsi+rax*8]
+
+ lea rcx, [rcx+rax*8]
+ pxor xmm6, xmm6
+
+.x16x16sad_wmt_loop:
+
+ movq xmm0, QWORD PTR [rsi]
+ movq xmm2, QWORD PTR [rsi+8]
+
+ movq xmm1, QWORD PTR [rdi]
+ movq xmm3, QWORD PTR [rdi+8]
+
+ movq xmm4, QWORD PTR [rsi+rax]
+ movq xmm5, QWORD PTR [rdi+rdx]
+
+
+ punpcklbw xmm0, xmm2
+ punpcklbw xmm1, xmm3
+
+ psadbw xmm0, xmm1
+ movq xmm2, QWORD PTR [rsi+rax+8]
+
+ movq xmm3, QWORD PTR [rdi+rdx+8]
+ lea rsi, [rsi+rax*2]
+
+ lea rdi, [rdi+rdx*2]
+ punpcklbw xmm4, xmm2
+
+ punpcklbw xmm5, xmm3
+ psadbw xmm4, xmm5
+
+ paddw xmm6, xmm0
+ paddw xmm6, xmm4
+
+ cmp rsi, rcx
+ jne .x16x16sad_wmt_loop
+
+ movq xmm0, xmm6
+ psrldq xmm6, 8
+
+ paddw xmm0, xmm6
+ movq rax, xmm0
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+;unsigned int vp9_sad8x16_wmt(
+; unsigned char *src_ptr,
+; int src_stride,
+; unsigned char *ref_ptr,
+; int ref_stride,
+; int max_err)
+global sym(vp9_sad8x16_wmt)
+sym(vp9_sad8x16_wmt):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 5
+ push rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(2) ;ref_ptr
+
+ movsxd rbx, dword ptr arg(1) ;src_stride
+ movsxd rdx, dword ptr arg(3) ;ref_stride
+
+ lea rcx, [rsi+rbx*8]
+
+ lea rcx, [rcx+rbx*8]
+ pxor mm7, mm7
+
+.x8x16sad_wmt_loop:
+
+ movq rax, mm7
+ cmp eax, arg(4)
+ jg .x8x16sad_wmt_early_exit
+
+ movq mm0, QWORD PTR [rsi]
+ movq mm1, QWORD PTR [rdi]
+
+ movq mm2, QWORD PTR [rsi+rbx]
+ movq mm3, QWORD PTR [rdi+rdx]
+
+ psadbw mm0, mm1
+ psadbw mm2, mm3
+
+ lea rsi, [rsi+rbx*2]
+ lea rdi, [rdi+rdx*2]
+
+ paddw mm7, mm0
+ paddw mm7, mm2
+
+ cmp rsi, rcx
+ jne .x8x16sad_wmt_loop
+
+ movq rax, mm7
+
+.x8x16sad_wmt_early_exit:
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ pop rbx
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+;unsigned int vp9_sad8x8_wmt(
+; unsigned char *src_ptr,
+; int src_stride,
+; unsigned char *ref_ptr,
+; int ref_stride)
+global sym(vp9_sad8x8_wmt)
+sym(vp9_sad8x8_wmt):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 5
+ push rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(2) ;ref_ptr
+
+ movsxd rbx, dword ptr arg(1) ;src_stride
+ movsxd rdx, dword ptr arg(3) ;ref_stride
+
+ lea rcx, [rsi+rbx*8]
+ pxor mm7, mm7
+
+.x8x8sad_wmt_loop:
+
+ movq rax, mm7
+ cmp eax, arg(4)
+ jg .x8x8sad_wmt_early_exit
+
+ movq mm0, QWORD PTR [rsi]
+ movq mm1, QWORD PTR [rdi]
+
+ psadbw mm0, mm1
+ lea rsi, [rsi+rbx]
+
+ add rdi, rdx
+ paddw mm7, mm0
+
+ cmp rsi, rcx
+ jne .x8x8sad_wmt_loop
+
+ movq rax, mm7
+.x8x8sad_wmt_early_exit:
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ pop rbx
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+;unsigned int vp9_sad4x4_wmt(
+; unsigned char *src_ptr,
+; int src_stride,
+; unsigned char *ref_ptr,
+; int ref_stride)
+global sym(vp9_sad4x4_wmt)
+sym(vp9_sad4x4_wmt):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 4
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(2) ;ref_ptr
+
+ movsxd rax, dword ptr arg(1) ;src_stride
+ movsxd rdx, dword ptr arg(3) ;ref_stride
+
+ movd mm0, DWORD PTR [rsi]
+ movd mm1, DWORD PTR [rdi]
+
+ movd mm2, DWORD PTR [rsi+rax]
+ movd mm3, DWORD PTR [rdi+rdx]
+
+ punpcklbw mm0, mm2
+ punpcklbw mm1, mm3
+
+ psadbw mm0, mm1
+ lea rsi, [rsi+rax*2]
+
+ lea rdi, [rdi+rdx*2]
+ movd mm4, DWORD PTR [rsi]
+
+ movd mm5, DWORD PTR [rdi]
+ movd mm6, DWORD PTR [rsi+rax]
+
+ movd mm7, DWORD PTR [rdi+rdx]
+ punpcklbw mm4, mm6
+
+ punpcklbw mm5, mm7
+ psadbw mm4, mm5
+
+ paddw mm0, mm4
+ movq rax, mm0
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+;unsigned int vp9_sad16x8_wmt(
+; unsigned char *src_ptr,
+; int src_stride,
+; unsigned char *ref_ptr,
+; int ref_stride)
+global sym(vp9_sad16x8_wmt)
+sym(vp9_sad16x8_wmt):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 5
+ push rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(2) ;ref_ptr
+
+ movsxd rbx, dword ptr arg(1) ;src_stride
+ movsxd rdx, dword ptr arg(3) ;ref_stride
+
+ lea rcx, [rsi+rbx*8]
+ pxor mm7, mm7
+
+.x16x8sad_wmt_loop:
+
+ movq rax, mm7
+ cmp eax, arg(4)
+ jg .x16x8sad_wmt_early_exit
+
+ movq mm0, QWORD PTR [rsi]
+ movq mm2, QWORD PTR [rsi+8]
+
+ movq mm1, QWORD PTR [rdi]
+ movq mm3, QWORD PTR [rdi+8]
+
+ movq mm4, QWORD PTR [rsi+rbx]
+ movq mm5, QWORD PTR [rdi+rdx]
+
+ psadbw mm0, mm1
+ psadbw mm2, mm3
+
+ movq mm1, QWORD PTR [rsi+rbx+8]
+ movq mm3, QWORD PTR [rdi+rdx+8]
+
+ psadbw mm4, mm5
+ psadbw mm1, mm3
+
+ lea rsi, [rsi+rbx*2]
+ lea rdi, [rdi+rdx*2]
+
+ paddw mm0, mm2
+ paddw mm4, mm1
+
+ paddw mm7, mm0
+ paddw mm7, mm4
+
+ cmp rsi, rcx
+ jne .x16x8sad_wmt_loop
+
+ movq rax, mm7
+
+.x16x8sad_wmt_early_exit:
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ pop rbx
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+;void vp9_copy32xn_sse2(
+; unsigned char *src_ptr,
+; int src_stride,
+; unsigned char *dst_ptr,
+; int dst_stride,
+; int height);
+global sym(vp9_copy32xn_sse2)
+sym(vp9_copy32xn_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 5
+ SAVE_XMM 7
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(2) ;dst_ptr
+
+ movsxd rax, dword ptr arg(1) ;src_stride
+ movsxd rdx, dword ptr arg(3) ;dst_stride
+ movsxd rcx, dword ptr arg(4) ;height
+
+.block_copy_sse2_loopx4:
+ movdqu xmm0, XMMWORD PTR [rsi]
+ movdqu xmm1, XMMWORD PTR [rsi + 16]
+ movdqu xmm2, XMMWORD PTR [rsi + rax]
+ movdqu xmm3, XMMWORD PTR [rsi + rax + 16]
+
+ lea rsi, [rsi+rax*2]
+
+ movdqu xmm4, XMMWORD PTR [rsi]
+ movdqu xmm5, XMMWORD PTR [rsi + 16]
+ movdqu xmm6, XMMWORD PTR [rsi + rax]
+ movdqu xmm7, XMMWORD PTR [rsi + rax + 16]
+
+ lea rsi, [rsi+rax*2]
+
+ movdqa XMMWORD PTR [rdi], xmm0
+ movdqa XMMWORD PTR [rdi + 16], xmm1
+ movdqa XMMWORD PTR [rdi + rdx], xmm2
+ movdqa XMMWORD PTR [rdi + rdx + 16], xmm3
+
+ lea rdi, [rdi+rdx*2]
+
+ movdqa XMMWORD PTR [rdi], xmm4
+ movdqa XMMWORD PTR [rdi + 16], xmm5
+ movdqa XMMWORD PTR [rdi + rdx], xmm6
+ movdqa XMMWORD PTR [rdi + rdx + 16], xmm7
+
+ lea rdi, [rdi+rdx*2]
+
+ sub rcx, 4
+ cmp rcx, 4
+ jge .block_copy_sse2_loopx4
+
+ cmp rcx, 0
+ je .copy_is_done
+
+.block_copy_sse2_loop:
+ movdqu xmm0, XMMWORD PTR [rsi]
+ movdqu xmm1, XMMWORD PTR [rsi + 16]
+ lea rsi, [rsi+rax]
+
+ movdqa XMMWORD PTR [rdi], xmm0
+ movdqa XMMWORD PTR [rdi + 16], xmm1
+ lea rdi, [rdi+rdx]
+
+ sub rcx, 1
+ jne .block_copy_sse2_loop
+
+.copy_is_done:
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
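
The *_wmt SAD routines mirror the MMX versions using psadbw, and the 8x16,
8x8 and 16x8 variants additionally take a max_err argument so the loop can
bail out once the running SAD exceeds it. vp9_copy32xn_sse2 is a straight
block copy; a C sketch of its contract (reconstructed from the assembly,
which uses unaligned loads and aligned stores, four rows per iteration plus
a single-row tail):

    #include <string.h>

    /* Copy a 32-byte-wide column of `height` rows. */
    static void copy32xn_ref(const unsigned char *src, int src_stride,
                             unsigned char *dst, int dst_stride, int height) {
      for (int r = 0; r < height; r++) {
        memcpy(dst, src, 32);
        src += src_stride;
        dst += dst_stride;
      }
    }
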
--- /dev/null
+++ b/vp9/encoder/x86/sad_sse3.asm
@@ -1,0 +1,960 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+%include "vpx_ports/x86_abi_support.asm"
+
+%macro STACK_FRAME_CREATE_X3 0
+%if ABI_IS_32BIT
+ %define src_ptr rsi
+ %define src_stride rax
+ %define ref_ptr rdi
+ %define ref_stride rdx
+ %define end_ptr rcx
+ %define ret_var rbx
+ %define result_ptr arg(4)
+ %define max_err arg(4)
+ %define height dword ptr arg(4)
+ push rbp
+ mov rbp, rsp
+ push rsi
+ push rdi
+ push rbx
+
+ mov rsi, arg(0) ; src_ptr
+ mov rdi, arg(2) ; ref_ptr
+
+ movsxd rax, dword ptr arg(1) ; src_stride
+ movsxd rdx, dword ptr arg(3) ; ref_stride
+%else
+ %ifidn __OUTPUT_FORMAT__,x64
+ SAVE_XMM 7, u
+ %define src_ptr rcx
+ %define src_stride rdx
+ %define ref_ptr r8
+ %define ref_stride r9
+ %define end_ptr r10
+ %define ret_var r11
+ %define result_ptr [rsp+xmm_stack_space+8+4*8]
+ %define max_err [rsp+xmm_stack_space+8+4*8]
+ %define height dword ptr [rsp+xmm_stack_space+8+4*8]
+ %else
+ %define src_ptr rdi
+ %define src_stride rsi
+ %define ref_ptr rdx
+ %define ref_stride rcx
+ %define end_ptr r9
+ %define ret_var r10
+ %define result_ptr r8
+ %define max_err r8
+ %define height r8
+ %endif
+%endif
+
+%endmacro
+
+%macro STACK_FRAME_DESTROY_X3 0
+ %define src_ptr
+ %define src_stride
+ %define ref_ptr
+ %define ref_stride
+ %define end_ptr
+ %define ret_var
+ %define result_ptr
+ %define max_err
+ %define height
+
+%if ABI_IS_32BIT
+ pop rbx
+ pop rdi
+ pop rsi
+ pop rbp
+%else
+ %ifidn __OUTPUT_FORMAT__,x64
+ RESTORE_XMM
+ %endif
+%endif
+ ret
+%endmacro
+
+%macro STACK_FRAME_CREATE_X4 0
+%if ABI_IS_32BIT
+ %define src_ptr rsi
+ %define src_stride rax
+ %define r0_ptr rcx
+ %define r1_ptr rdx
+ %define r2_ptr rbx
+ %define r3_ptr rdi
+ %define ref_stride rbp
+ %define result_ptr arg(4)
+ push rbp
+ mov rbp, rsp
+ push rsi
+ push rdi
+ push rbx
+
+ push rbp
+ mov rdi, arg(2) ; ref_ptr_base
+
+ LOAD_X4_ADDRESSES rdi, rcx, rdx, rax, rdi
+
+ mov rsi, arg(0) ; src_ptr
+
+ movsxd rbx, dword ptr arg(1) ; src_stride
+ movsxd rbp, dword ptr arg(3) ; ref_stride
+
+ xchg rbx, rax
+%else
+ %ifidn __OUTPUT_FORMAT__,x64
+ SAVE_XMM 7, u
+ %define src_ptr rcx
+ %define src_stride rdx
+ %define r0_ptr rsi
+ %define r1_ptr r10
+ %define r2_ptr r11
+ %define r3_ptr r8
+ %define ref_stride r9
+ %define result_ptr [rsp+xmm_stack_space+16+4*8]
+ push rsi
+
+ LOAD_X4_ADDRESSES r8, r0_ptr, r1_ptr, r2_ptr, r3_ptr
+ %else
+ %define src_ptr rdi
+ %define src_stride rsi
+ %define r0_ptr r9
+ %define r1_ptr r10
+ %define r2_ptr r11
+ %define r3_ptr rdx
+ %define ref_stride rcx
+ %define result_ptr r8
+
+ LOAD_X4_ADDRESSES rdx, r0_ptr, r1_ptr, r2_ptr, r3_ptr
+
+ %endif
+%endif
+%endmacro
+
+%macro STACK_FRAME_DESTROY_X4 0
+ %define src_ptr
+ %define src_stride
+ %define r0_ptr
+ %define r1_ptr
+ %define r2_ptr
+ %define r3_ptr
+ %define ref_stride
+ %define result_ptr
+
+%if ABI_IS_32BIT
+ pop rbx
+ pop rdi
+ pop rsi
+ pop rbp
+%else
+ %ifidn __OUTPUT_FORMAT__,x64
+ pop rsi
+ RESTORE_XMM
+ %endif
+%endif
+ ret
+%endmacro
+
+%macro PROCESS_16X2X3 5
+%if %1==0
+ movdqa xmm0, XMMWORD PTR [%2]
+ lddqu xmm5, XMMWORD PTR [%3]
+ lddqu xmm6, XMMWORD PTR [%3+1]
+ lddqu xmm7, XMMWORD PTR [%3+2]
+
+ psadbw xmm5, xmm0
+ psadbw xmm6, xmm0
+ psadbw xmm7, xmm0
+%else
+ movdqa xmm0, XMMWORD PTR [%2]
+ lddqu xmm1, XMMWORD PTR [%3]
+ lddqu xmm2, XMMWORD PTR [%3+1]
+ lddqu xmm3, XMMWORD PTR [%3+2]
+
+ psadbw xmm1, xmm0
+ psadbw xmm2, xmm0
+ psadbw xmm3, xmm0
+
+ paddw xmm5, xmm1
+ paddw xmm6, xmm2
+ paddw xmm7, xmm3
+%endif
+ movdqa xmm0, XMMWORD PTR [%2+%4]
+ lddqu xmm1, XMMWORD PTR [%3+%5]
+ lddqu xmm2, XMMWORD PTR [%3+%5+1]
+ lddqu xmm3, XMMWORD PTR [%3+%5+2]
+
+%if %1==0 || %1==1
+ lea %2, [%2+%4*2]
+ lea %3, [%3+%5*2]
+%endif
+
+ psadbw xmm1, xmm0
+ psadbw xmm2, xmm0
+ psadbw xmm3, xmm0
+
+ paddw xmm5, xmm1
+ paddw xmm6, xmm2
+ paddw xmm7, xmm3
+%endmacro
+
+%macro PROCESS_8X2X3 5
+%if %1==0
+ movq mm0, QWORD PTR [%2]
+ movq mm5, QWORD PTR [%3]
+ movq mm6, QWORD PTR [%3+1]
+ movq mm7, QWORD PTR [%3+2]
+
+ psadbw mm5, mm0
+ psadbw mm6, mm0
+ psadbw mm7, mm0
+%else
+ movq mm0, QWORD PTR [%2]
+ movq mm1, QWORD PTR [%3]
+ movq mm2, QWORD PTR [%3+1]
+ movq mm3, QWORD PTR [%3+2]
+
+ psadbw mm1, mm0
+ psadbw mm2, mm0
+ psadbw mm3, mm0
+
+ paddw mm5, mm1
+ paddw mm6, mm2
+ paddw mm7, mm3
+%endif
+ movq mm0, QWORD PTR [%2+%4]
+ movq mm1, QWORD PTR [%3+%5]
+ movq mm2, QWORD PTR [%3+%5+1]
+ movq mm3, QWORD PTR [%3+%5+2]
+
+%if %1==0 || %1==1
+ lea %2, [%2+%4*2]
+ lea %3, [%3+%5*2]
+%endif
+
+ psadbw mm1, mm0
+ psadbw mm2, mm0
+ psadbw mm3, mm0
+
+ paddw mm5, mm1
+ paddw mm6, mm2
+ paddw mm7, mm3
+%endmacro
+
+%macro LOAD_X4_ADDRESSES 5
+ mov %2, [%1+REG_SZ_BYTES*0]
+ mov %3, [%1+REG_SZ_BYTES*1]
+
+ mov %4, [%1+REG_SZ_BYTES*2]
+ mov %5, [%1+REG_SZ_BYTES*3]
+%endmacro
+
+%macro PROCESS_16X2X4 8
+%if %1==0
+ movdqa xmm0, XMMWORD PTR [%2]
+ lddqu xmm4, XMMWORD PTR [%3]
+ lddqu xmm5, XMMWORD PTR [%4]
+ lddqu xmm6, XMMWORD PTR [%5]
+ lddqu xmm7, XMMWORD PTR [%6]
+
+ psadbw xmm4, xmm0
+ psadbw xmm5, xmm0
+ psadbw xmm6, xmm0
+ psadbw xmm7, xmm0
+%else
+ movdqa xmm0, XMMWORD PTR [%2]
+ lddqu xmm1, XMMWORD PTR [%3]
+ lddqu xmm2, XMMWORD PTR [%4]
+ lddqu xmm3, XMMWORD PTR [%5]
+
+ psadbw xmm1, xmm0
+ psadbw xmm2, xmm0
+ psadbw xmm3, xmm0
+
+ paddw xmm4, xmm1
+ lddqu xmm1, XMMWORD PTR [%6]
+ paddw xmm5, xmm2
+ paddw xmm6, xmm3
+
+ psadbw xmm1, xmm0
+ paddw xmm7, xmm1
+%endif
+ movdqa xmm0, XMMWORD PTR [%2+%7]
+ lddqu xmm1, XMMWORD PTR [%3+%8]
+ lddqu xmm2, XMMWORD PTR [%4+%8]
+ lddqu xmm3, XMMWORD PTR [%5+%8]
+
+ psadbw xmm1, xmm0
+ psadbw xmm2, xmm0
+ psadbw xmm3, xmm0
+
+ paddw xmm4, xmm1
+ lddqu xmm1, XMMWORD PTR [%6+%8]
+ paddw xmm5, xmm2
+ paddw xmm6, xmm3
+
+%if %1==0 || %1==1
+ lea %2, [%2+%7*2]
+ lea %3, [%3+%8*2]
+
+ lea %4, [%4+%8*2]
+ lea %5, [%5+%8*2]
+
+ lea %6, [%6+%8*2]
+%endif
+ psadbw xmm1, xmm0
+ paddw xmm7, xmm1
+
+%endmacro
+
+%macro PROCESS_8X2X4 8
+%if %1==0
+ movq mm0, QWORD PTR [%2]
+ movq mm4, QWORD PTR [%3]
+ movq mm5, QWORD PTR [%4]
+ movq mm6, QWORD PTR [%5]
+ movq mm7, QWORD PTR [%6]
+
+ psadbw mm4, mm0
+ psadbw mm5, mm0
+ psadbw mm6, mm0
+ psadbw mm7, mm0
+%else
+ movq mm0, QWORD PTR [%2]
+ movq mm1, QWORD PTR [%3]
+ movq mm2, QWORD PTR [%4]
+ movq mm3, QWORD PTR [%5]
+
+ psadbw mm1, mm0
+ psadbw mm2, mm0
+ psadbw mm3, mm0
+
+ paddw mm4, mm1
+ movq mm1, QWORD PTR [%6]
+ paddw mm5, mm2
+ paddw mm6, mm3
+
+ psadbw mm1, mm0
+ paddw mm7, mm1
+%endif
+ movq mm0, QWORD PTR [%2+%7]
+ movq mm1, QWORD PTR [%3+%8]
+ movq mm2, QWORD PTR [%4+%8]
+ movq mm3, QWORD PTR [%5+%8]
+
+ psadbw mm1, mm0
+ psadbw mm2, mm0
+ psadbw mm3, mm0
+
+ paddw mm4, mm1
+ movq mm1, QWORD PTR [%6+%8]
+ paddw mm5, mm2
+ paddw mm6, mm3
+
+%if %1==0 || %1==1
+ lea %2, [%2+%7*2]
+ lea %3, [%3+%8*2]
+
+ lea %4, [%4+%8*2]
+ lea %5, [%5+%8*2]
+
+ lea %6, [%6+%8*2]
+%endif
+ psadbw mm1, mm0
+ paddw mm7, mm1
+
+%endmacro
+
+;void vp9_sad16x16x3_sse3(
+; unsigned char *src_ptr,
+; int src_stride,
+; unsigned char *ref_ptr,
+; int ref_stride,
+; int *results)
+global sym(vp9_sad16x16x3_sse3)
+sym(vp9_sad16x16x3_sse3):
+
+ STACK_FRAME_CREATE_X3
+
+ PROCESS_16X2X3 0, src_ptr, ref_ptr, src_stride, ref_stride
+ PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
+ PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
+ PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
+ PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
+ PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
+ PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
+ PROCESS_16X2X3 2, src_ptr, ref_ptr, src_stride, ref_stride
+
+ mov rcx, result_ptr
+
+ movq xmm0, xmm5
+ psrldq xmm5, 8
+
+ paddw xmm0, xmm5
+ movd [rcx], xmm0
+;-
+ movq xmm0, xmm6
+ psrldq xmm6, 8
+
+ paddw xmm0, xmm6
+ movd [rcx+4], xmm0
+;-
+ movq xmm0, xmm7
+ psrldq xmm7, 8
+
+ paddw xmm0, xmm7
+ movd [rcx+8], xmm0
+
+ STACK_FRAME_DESTROY_X3
+
+;void vp9_sad16x8x3_sse3(
+; unsigned char *src_ptr,
+; int src_stride,
+; unsigned char *ref_ptr,
+; int ref_stride,
+; int *results)
+global sym(vp9_sad16x8x3_sse3)
+sym(vp9_sad16x8x3_sse3):
+
+ STACK_FRAME_CREATE_X3
+
+ PROCESS_16X2X3 0, src_ptr, ref_ptr, src_stride, ref_stride
+ PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
+ PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
+ PROCESS_16X2X3 2, src_ptr, ref_ptr, src_stride, ref_stride
+
+ mov rcx, result_ptr
+
+ movq xmm0, xmm5
+ psrldq xmm5, 8
+
+ paddw xmm0, xmm5
+ movd [rcx], xmm0
+;-
+ movq xmm0, xmm6
+ psrldq xmm6, 8
+
+ paddw xmm0, xmm6
+ movd [rcx+4], xmm0
+;-
+ movq xmm0, xmm7
+ psrldq xmm7, 8
+
+ paddw xmm0, xmm7
+ movd [rcx+8], xmm0
+
+ STACK_FRAME_DESTROY_X3
+
+;void vp9_sad8x16x3_sse3(
+; unsigned char *src_ptr,
+; int src_stride,
+; unsigned char *ref_ptr,
+; int ref_stride,
+; int *results)
+global sym(vp9_sad8x16x3_sse3)
+sym(vp9_sad8x16x3_sse3):
+
+ STACK_FRAME_CREATE_X3
+
+ PROCESS_8X2X3 0, src_ptr, ref_ptr, src_stride, ref_stride
+ PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
+ PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
+ PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
+ PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
+ PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
+ PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
+ PROCESS_8X2X3 2, src_ptr, ref_ptr, src_stride, ref_stride
+
+ mov rcx, result_ptr
+
+ punpckldq mm5, mm6
+
+ movq [rcx], mm5
+ movd [rcx+8], mm7
+
+ STACK_FRAME_DESTROY_X3
+
+;void vp9_sad8x8x3_sse3(
+; unsigned char *src_ptr,
+; int src_stride,
+; unsigned char *ref_ptr,
+; int ref_stride,
+; int *results)
+global sym(vp9_sad8x8x3_sse3)
+sym(vp9_sad8x8x3_sse3):
+
+ STACK_FRAME_CREATE_X3
+
+ PROCESS_8X2X3 0, src_ptr, ref_ptr, src_stride, ref_stride
+ PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
+ PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
+ PROCESS_8X2X3 2, src_ptr, ref_ptr, src_stride, ref_stride
+
+ mov rcx, result_ptr
+
+ punpckldq mm5, mm6
+
+ movq [rcx], mm5
+ movd [rcx+8], mm7
+
+ STACK_FRAME_DESTROY_X3
+
+;void vp9_sad4x4x3_sse3(
+; unsigned char *src_ptr,
+; int src_stride,
+; unsigned char *ref_ptr,
+; int ref_stride,
+; int *results)
+global sym(vp9_sad4x4x3_sse3)
+sym(vp9_sad4x4x3_sse3):
+
+ STACK_FRAME_CREATE_X3
+
+ movd mm0, DWORD PTR [src_ptr]
+ movd mm1, DWORD PTR [ref_ptr]
+
+ movd mm2, DWORD PTR [src_ptr+src_stride]
+ movd mm3, DWORD PTR [ref_ptr+ref_stride]
+
+ punpcklbw mm0, mm2
+ punpcklbw mm1, mm3
+
+ movd mm4, DWORD PTR [ref_ptr+1]
+ movd mm5, DWORD PTR [ref_ptr+2]
+
+ movd mm2, DWORD PTR [ref_ptr+ref_stride+1]
+ movd mm3, DWORD PTR [ref_ptr+ref_stride+2]
+
+ psadbw mm1, mm0
+
+ punpcklbw mm4, mm2
+ punpcklbw mm5, mm3
+
+ psadbw mm4, mm0
+ psadbw mm5, mm0
+
+ lea src_ptr, [src_ptr+src_stride*2]
+ lea ref_ptr, [ref_ptr+ref_stride*2]
+
+ movd mm0, DWORD PTR [src_ptr]
+ movd mm2, DWORD PTR [ref_ptr]
+
+ movd mm3, DWORD PTR [src_ptr+src_stride]
+ movd mm6, DWORD PTR [ref_ptr+ref_stride]
+
+ punpcklbw mm0, mm3
+ punpcklbw mm2, mm6
+
+ movd mm3, DWORD PTR [ref_ptr+1]
+ movd mm7, DWORD PTR [ref_ptr+2]
+
+ psadbw mm2, mm0
+
+ paddw mm1, mm2
+
+ movd mm2, DWORD PTR [ref_ptr+ref_stride+1]
+ movd mm6, DWORD PTR [ref_ptr+ref_stride+2]
+
+ punpcklbw mm3, mm2
+ punpcklbw mm7, mm6
+
+ psadbw mm3, mm0
+ psadbw mm7, mm0
+
+ paddw mm3, mm4
+ paddw mm7, mm5
+
+ mov rcx, result_ptr
+
+ punpckldq mm1, mm3
+
+ movq [rcx], mm1
+ movd [rcx+8], mm7
+
+ STACK_FRAME_DESTROY_X3
+
+;unsigned int vp9_sad16x16_sse3(
+; unsigned char *src_ptr,
+; int src_stride,
+; unsigned char *ref_ptr,
+; int ref_stride,
+; int max_err)
+;%define lddqu movdqu
+global sym(vp9_sad16x16_sse3)
+sym(vp9_sad16x16_sse3):
+
+ STACK_FRAME_CREATE_X3
+
+ mov end_ptr, 4
+ pxor xmm7, xmm7
+
+.vp9_sad16x16_sse3_loop:
+ movdqa xmm0, XMMWORD PTR [src_ptr]
+ movdqu xmm1, XMMWORD PTR [ref_ptr]
+ movdqa xmm2, XMMWORD PTR [src_ptr+src_stride]
+ movdqu xmm3, XMMWORD PTR [ref_ptr+ref_stride]
+
+ lea src_ptr, [src_ptr+src_stride*2]
+ lea ref_ptr, [ref_ptr+ref_stride*2]
+
+ movdqa xmm4, XMMWORD PTR [src_ptr]
+ movdqu xmm5, XMMWORD PTR [ref_ptr]
+ movdqa xmm6, XMMWORD PTR [src_ptr+src_stride]
+
+ psadbw xmm0, xmm1
+
+ movdqu xmm1, XMMWORD PTR [ref_ptr+ref_stride]
+
+ psadbw xmm2, xmm3
+ psadbw xmm4, xmm5
+ psadbw xmm6, xmm1
+
+ lea src_ptr, [src_ptr+src_stride*2]
+ lea ref_ptr, [ref_ptr+ref_stride*2]
+
+ paddw xmm7, xmm0
+ paddw xmm7, xmm2
+ paddw xmm7, xmm4
+ paddw xmm7, xmm6
+
+ sub end_ptr, 1
+ jne .vp9_sad16x16_sse3_loop
+
+ movq xmm0, xmm7
+ psrldq xmm7, 8
+ paddw xmm0, xmm7
+ movq rax, xmm0
+
+ STACK_FRAME_DESTROY_X3
+
+;void vp9_copy32xn_sse3(
+; unsigned char *src_ptr,
+; int src_stride,
+; unsigned char *dst_ptr,
+; int dst_stride,
+; int height);
+global sym(vp9_copy32xn_sse3)
+sym(vp9_copy32xn_sse3):
+
+ STACK_FRAME_CREATE_X3
+
+.block_copy_sse3_loopx4:
+ lea end_ptr, [src_ptr+src_stride*2]
+
+ movdqu xmm0, XMMWORD PTR [src_ptr]
+ movdqu xmm1, XMMWORD PTR [src_ptr + 16]
+ movdqu xmm2, XMMWORD PTR [src_ptr + src_stride]
+ movdqu xmm3, XMMWORD PTR [src_ptr + src_stride + 16]
+ movdqu xmm4, XMMWORD PTR [end_ptr]
+ movdqu xmm5, XMMWORD PTR [end_ptr + 16]
+ movdqu xmm6, XMMWORD PTR [end_ptr + src_stride]
+ movdqu xmm7, XMMWORD PTR [end_ptr + src_stride + 16]
+
+ lea src_ptr, [src_ptr+src_stride*4]
+
+ lea end_ptr, [ref_ptr+ref_stride*2]
+
+ movdqa XMMWORD PTR [ref_ptr], xmm0
+ movdqa XMMWORD PTR [ref_ptr + 16], xmm1
+ movdqa XMMWORD PTR [ref_ptr + ref_stride], xmm2
+ movdqa XMMWORD PTR [ref_ptr + ref_stride + 16], xmm3
+ movdqa XMMWORD PTR [end_ptr], xmm4
+ movdqa XMMWORD PTR [end_ptr + 16], xmm5
+ movdqa XMMWORD PTR [end_ptr + ref_stride], xmm6
+ movdqa XMMWORD PTR [end_ptr + ref_stride + 16], xmm7
+
+ lea ref_ptr, [ref_ptr+ref_stride*4]
+
+ sub height, 4
+ cmp height, 4
+ jge .block_copy_sse3_loopx4
+
+        ;Check whether there are more rows that need to be copied.
+ cmp height, 0
+ je .copy_is_done
+
+.block_copy_sse3_loop:
+ movdqu xmm0, XMMWORD PTR [src_ptr]
+ movdqu xmm1, XMMWORD PTR [src_ptr + 16]
+ lea src_ptr, [src_ptr+src_stride]
+
+ movdqa XMMWORD PTR [ref_ptr], xmm0
+ movdqa XMMWORD PTR [ref_ptr + 16], xmm1
+ lea ref_ptr, [ref_ptr+ref_stride]
+
+ sub height, 1
+ jne .block_copy_sse3_loop
+
+.copy_is_done:
+ STACK_FRAME_DESTROY_X3
+
+;void vp9_sad16x16x4d_sse3(
+; unsigned char *src_ptr,
+; int src_stride,
+; unsigned char *ref_ptr_base,
+; int ref_stride,
+; int *results)
+global sym(vp9_sad16x16x4d_sse3)
+sym(vp9_sad16x16x4d_sse3):
+
+ STACK_FRAME_CREATE_X4
+
+ PROCESS_16X2X4 0, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
+ PROCESS_16X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
+ PROCESS_16X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
+ PROCESS_16X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
+ PROCESS_16X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
+ PROCESS_16X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
+ PROCESS_16X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
+ PROCESS_16X2X4 2, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
+
+%if ABI_IS_32BIT
+ pop rbp
+%endif
+ mov rcx, result_ptr
+
+ movq xmm0, xmm4
+ psrldq xmm4, 8
+
+ paddw xmm0, xmm4
+ movd [rcx], xmm0
+;-
+ movq xmm0, xmm5
+ psrldq xmm5, 8
+
+ paddw xmm0, xmm5
+ movd [rcx+4], xmm0
+;-
+ movq xmm0, xmm6
+ psrldq xmm6, 8
+
+ paddw xmm0, xmm6
+ movd [rcx+8], xmm0
+;-
+ movq xmm0, xmm7
+ psrldq xmm7, 8
+
+ paddw xmm0, xmm7
+ movd [rcx+12], xmm0
+
+ STACK_FRAME_DESTROY_X4
+
+;void vp9_sad16x8x4d_sse3(
+; unsigned char *src_ptr,
+; int src_stride,
+; unsigned char *ref_ptr_base,
+; int ref_stride,
+; int *results)
+global sym(vp9_sad16x8x4d_sse3)
+sym(vp9_sad16x8x4d_sse3):
+
+ STACK_FRAME_CREATE_X4
+
+ PROCESS_16X2X4 0, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
+ PROCESS_16X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
+ PROCESS_16X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
+ PROCESS_16X2X4 2, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
+
+%if ABI_IS_32BIT
+ pop rbp
+%endif
+ mov rcx, result_ptr
+
+ movq xmm0, xmm4
+ psrldq xmm4, 8
+
+ paddw xmm0, xmm4
+ movd [rcx], xmm0
+;-
+ movq xmm0, xmm5
+ psrldq xmm5, 8
+
+ paddw xmm0, xmm5
+ movd [rcx+4], xmm0
+;-
+ movq xmm0, xmm6
+ psrldq xmm6, 8
+
+ paddw xmm0, xmm6
+ movd [rcx+8], xmm0
+;-
+ movq xmm0, xmm7
+ psrldq xmm7, 8
+
+ paddw xmm0, xmm7
+ movd [rcx+12], xmm0
+
+ STACK_FRAME_DESTROY_X4
+
+;void vp9_sad8x16x4d_sse3(
+; unsigned char *src_ptr,
+; int src_stride,
+; unsigned char *ref_ptr,
+; int ref_stride,
+; int *results)
+global sym(vp9_sad8x16x4d_sse3)
+sym(vp9_sad8x16x4d_sse3):
+
+ STACK_FRAME_CREATE_X4
+
+ PROCESS_8X2X4 0, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
+ PROCESS_8X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
+ PROCESS_8X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
+ PROCESS_8X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
+ PROCESS_8X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
+ PROCESS_8X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
+ PROCESS_8X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
+ PROCESS_8X2X4 2, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
+
+%if ABI_IS_32BIT
+ pop rbp
+%endif
+ mov rcx, result_ptr
+
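+        ; for 8-pixel-wide blocks each mm register holds a single SAD in its
+        ; low dword; interleave the pairs and store all four results at once.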
+ punpckldq mm4, mm5
+ punpckldq mm6, mm7
+
+ movq [rcx], mm4
+ movq [rcx+8], mm6
+
+ STACK_FRAME_DESTROY_X4
+
+;void vp9_sad8x8x4d_sse3(
+; unsigned char *src_ptr,
+; int src_stride,
+; unsigned char *ref_ptr,
+; int ref_stride,
+; int *results)
+global sym(vp9_sad8x8x4d_sse3)
+sym(vp9_sad8x8x4d_sse3):
+
+ STACK_FRAME_CREATE_X4
+
+ PROCESS_8X2X4 0, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
+ PROCESS_8X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
+ PROCESS_8X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
+ PROCESS_8X2X4 2, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
+
+%if ABI_IS_32BIT
+ pop rbp
+%endif
+ mov rcx, result_ptr
+
+ punpckldq mm4, mm5
+ punpckldq mm6, mm7
+
+ movq [rcx], mm4
+ movq [rcx+8], mm6
+
+ STACK_FRAME_DESTROY_X4
+
+;void vp9_sad4x4x4d_sse3(
+; unsigned char *src_ptr,
+; int src_stride,
+; unsigned char *ref_ptr,
+; int ref_stride,
+; int *results)
+global sym(vp9_sad4x4x4d_sse3)
+sym(vp9_sad4x4x4d_sse3):
+
+ STACK_FRAME_CREATE_X4
+
+ movd mm0, DWORD PTR [src_ptr]
+ movd mm1, DWORD PTR [r0_ptr]
+
+ movd mm2, DWORD PTR [src_ptr+src_stride]
+ movd mm3, DWORD PTR [r0_ptr+ref_stride]
+
+ punpcklbw mm0, mm2
+ punpcklbw mm1, mm3
+
+ movd mm4, DWORD PTR [r1_ptr]
+ movd mm5, DWORD PTR [r2_ptr]
+
+ movd mm6, DWORD PTR [r3_ptr]
+ movd mm2, DWORD PTR [r1_ptr+ref_stride]
+
+ movd mm3, DWORD PTR [r2_ptr+ref_stride]
+ movd mm7, DWORD PTR [r3_ptr+ref_stride]
+
+ psadbw mm1, mm0
+
+ punpcklbw mm4, mm2
+ punpcklbw mm5, mm3
+
+ punpcklbw mm6, mm7
+ psadbw mm4, mm0
+
+ psadbw mm5, mm0
+ psadbw mm6, mm0
+
+
+
+ lea src_ptr, [src_ptr+src_stride*2]
+ lea r0_ptr, [r0_ptr+ref_stride*2]
+
+ lea r1_ptr, [r1_ptr+ref_stride*2]
+ lea r2_ptr, [r2_ptr+ref_stride*2]
+
+ lea r3_ptr, [r3_ptr+ref_stride*2]
+
+ movd mm0, DWORD PTR [src_ptr]
+ movd mm2, DWORD PTR [r0_ptr]
+
+ movd mm3, DWORD PTR [src_ptr+src_stride]
+ movd mm7, DWORD PTR [r0_ptr+ref_stride]
+
+ punpcklbw mm0, mm3
+ punpcklbw mm2, mm7
+
+ movd mm3, DWORD PTR [r1_ptr]
+ movd mm7, DWORD PTR [r2_ptr]
+
+ psadbw mm2, mm0
+%if ABI_IS_32BIT
+ mov rax, rbp
+
+ pop rbp
+%define ref_stride rax
+%endif
+ mov rsi, result_ptr
+
+ paddw mm1, mm2
+ movd [rsi], mm1
+
+ movd mm2, DWORD PTR [r1_ptr+ref_stride]
+ movd mm1, DWORD PTR [r2_ptr+ref_stride]
+
+ punpcklbw mm3, mm2
+ punpcklbw mm7, mm1
+
+ psadbw mm3, mm0
+ psadbw mm7, mm0
+
+ movd mm2, DWORD PTR [r3_ptr]
+ movd mm1, DWORD PTR [r3_ptr+ref_stride]
+
+ paddw mm3, mm4
+ paddw mm7, mm5
+
+ movd [rsi+4], mm3
+ punpcklbw mm2, mm1
+
+ movd [rsi+8], mm7
+ psadbw mm2, mm0
+
+ paddw mm2, mm6
+ movd [rsi+12], mm2
+
+
+ STACK_FRAME_DESTROY_X4
+
--- /dev/null
+++ b/vp9/encoder/x86/sad_sse4.asm
@@ -1,0 +1,353 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
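+; PROCESS_16X2X8: accumulate SADs for two 16-pixel source rows against eight
+; consecutive reference offsets.  Each mpsadbw computes eight SADs of a 4-byte
+; source group against sliding reference positions; summing the 0x0 and 0x5
+; results for both 8-byte halves covers the full 16-byte row, with the running
+; totals kept in xmm1.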
+%macro PROCESS_16X2X8 1
+%if %1
+ movdqa xmm0, XMMWORD PTR [rsi]
+ movq xmm1, MMWORD PTR [rdi]
+ movq xmm3, MMWORD PTR [rdi+8]
+ movq xmm2, MMWORD PTR [rdi+16]
+ punpcklqdq xmm1, xmm3
+ punpcklqdq xmm3, xmm2
+
+ movdqa xmm2, xmm1
+ mpsadbw xmm1, xmm0, 0x0
+ mpsadbw xmm2, xmm0, 0x5
+
+ psrldq xmm0, 8
+
+ movdqa xmm4, xmm3
+ mpsadbw xmm3, xmm0, 0x0
+ mpsadbw xmm4, xmm0, 0x5
+
+ paddw xmm1, xmm2
+ paddw xmm1, xmm3
+ paddw xmm1, xmm4
+%else
+ movdqa xmm0, XMMWORD PTR [rsi]
+ movq xmm5, MMWORD PTR [rdi]
+ movq xmm3, MMWORD PTR [rdi+8]
+ movq xmm2, MMWORD PTR [rdi+16]
+ punpcklqdq xmm5, xmm3
+ punpcklqdq xmm3, xmm2
+
+ movdqa xmm2, xmm5
+ mpsadbw xmm5, xmm0, 0x0
+ mpsadbw xmm2, xmm0, 0x5
+
+ psrldq xmm0, 8
+
+ movdqa xmm4, xmm3
+ mpsadbw xmm3, xmm0, 0x0
+ mpsadbw xmm4, xmm0, 0x5
+
+ paddw xmm5, xmm2
+ paddw xmm5, xmm3
+ paddw xmm5, xmm4
+
+ paddw xmm1, xmm5
+%endif
+ movdqa xmm0, XMMWORD PTR [rsi + rax]
+ movq xmm5, MMWORD PTR [rdi+ rdx]
+ movq xmm3, MMWORD PTR [rdi+ rdx+8]
+ movq xmm2, MMWORD PTR [rdi+ rdx+16]
+ punpcklqdq xmm5, xmm3
+ punpcklqdq xmm3, xmm2
+
+ lea rsi, [rsi+rax*2]
+ lea rdi, [rdi+rdx*2]
+
+ movdqa xmm2, xmm5
+ mpsadbw xmm5, xmm0, 0x0
+ mpsadbw xmm2, xmm0, 0x5
+
+ psrldq xmm0, 8
+ movdqa xmm4, xmm3
+ mpsadbw xmm3, xmm0, 0x0
+ mpsadbw xmm4, xmm0, 0x5
+
+ paddw xmm5, xmm2
+ paddw xmm5, xmm3
+ paddw xmm5, xmm4
+
+ paddw xmm1, xmm5
+%endmacro
+
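+; PROCESS_8X2X8: same idea for 8-pixel-wide rows; two mpsadbw results per row
+; cover the eight reference offsets.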
+%macro PROCESS_8X2X8 1
+%if %1
+ movq xmm0, MMWORD PTR [rsi]
+ movq xmm1, MMWORD PTR [rdi]
+ movq xmm3, MMWORD PTR [rdi+8]
+ punpcklqdq xmm1, xmm3
+
+ movdqa xmm2, xmm1
+ mpsadbw xmm1, xmm0, 0x0
+ mpsadbw xmm2, xmm0, 0x5
+ paddw xmm1, xmm2
+%else
+ movq xmm0, MMWORD PTR [rsi]
+ movq xmm5, MMWORD PTR [rdi]
+ movq xmm3, MMWORD PTR [rdi+8]
+ punpcklqdq xmm5, xmm3
+
+ movdqa xmm2, xmm5
+ mpsadbw xmm5, xmm0, 0x0
+ mpsadbw xmm2, xmm0, 0x5
+ paddw xmm5, xmm2
+
+ paddw xmm1, xmm5
+%endif
+ movq xmm0, MMWORD PTR [rsi + rax]
+ movq xmm5, MMWORD PTR [rdi+ rdx]
+ movq xmm3, MMWORD PTR [rdi+ rdx+8]
+ punpcklqdq xmm5, xmm3
+
+ lea rsi, [rsi+rax*2]
+ lea rdi, [rdi+rdx*2]
+
+ movdqa xmm2, xmm5
+ mpsadbw xmm5, xmm0, 0x0
+ mpsadbw xmm2, xmm0, 0x5
+ paddw xmm5, xmm2
+
+ paddw xmm1, xmm5
+%endmacro
+
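+; PROCESS_4X2X8: a 4-pixel-wide row needs only one mpsadbw per line.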
+%macro PROCESS_4X2X8 1
+%if %1
+ movd xmm0, [rsi]
+ movq xmm1, MMWORD PTR [rdi]
+ movq xmm3, MMWORD PTR [rdi+8]
+ punpcklqdq xmm1, xmm3
+
+ mpsadbw xmm1, xmm0, 0x0
+%else
+ movd xmm0, [rsi]
+ movq xmm5, MMWORD PTR [rdi]
+ movq xmm3, MMWORD PTR [rdi+8]
+ punpcklqdq xmm5, xmm3
+
+ mpsadbw xmm5, xmm0, 0x0
+
+ paddw xmm1, xmm5
+%endif
+ movd xmm0, [rsi + rax]
+ movq xmm5, MMWORD PTR [rdi+ rdx]
+ movq xmm3, MMWORD PTR [rdi+ rdx+8]
+ punpcklqdq xmm5, xmm3
+
+ lea rsi, [rsi+rax*2]
+ lea rdi, [rdi+rdx*2]
+
+ mpsadbw xmm5, xmm0, 0x0
+
+ paddw xmm1, xmm5
+%endmacro
+
+
+;void vp9_sad16x16x8_sse4(
+; const unsigned char *src_ptr,
+; int src_stride,
+; const unsigned char *ref_ptr,
+; int ref_stride,
+; unsigned short *sad_array);
+global sym(vp9_sad16x16x8_sse4)
+sym(vp9_sad16x16x8_sse4):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 5
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(2) ;ref_ptr
+
+ movsxd rax, dword ptr arg(1) ;src_stride
+ movsxd rdx, dword ptr arg(3) ;ref_stride
+
+ PROCESS_16X2X8 1
+ PROCESS_16X2X8 0
+ PROCESS_16X2X8 0
+ PROCESS_16X2X8 0
+ PROCESS_16X2X8 0
+ PROCESS_16X2X8 0
+ PROCESS_16X2X8 0
+ PROCESS_16X2X8 0
+
+ mov rdi, arg(4) ;Results
+ movdqa XMMWORD PTR [rdi], xmm1
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+;void vp9_sad16x8x8_sse4(
+; const unsigned char *src_ptr,
+; int src_stride,
+; const unsigned char *ref_ptr,
+; int ref_stride,
+; unsigned short *sad_array
+;);
+global sym(vp9_sad16x8x8_sse4)
+sym(vp9_sad16x8x8_sse4):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 5
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(2) ;ref_ptr
+
+ movsxd rax, dword ptr arg(1) ;src_stride
+ movsxd rdx, dword ptr arg(3) ;ref_stride
+
+ PROCESS_16X2X8 1
+ PROCESS_16X2X8 0
+ PROCESS_16X2X8 0
+ PROCESS_16X2X8 0
+
+ mov rdi, arg(4) ;Results
+ movdqa XMMWORD PTR [rdi], xmm1
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+;void vp9_sad8x8x8_sse4(
+; const unsigned char *src_ptr,
+; int src_stride,
+; const unsigned char *ref_ptr,
+; int ref_stride,
+; unsigned short *sad_array
+;);
+global sym(vp9_sad8x8x8_sse4)
+sym(vp9_sad8x8x8_sse4):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 5
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(2) ;ref_ptr
+
+ movsxd rax, dword ptr arg(1) ;src_stride
+ movsxd rdx, dword ptr arg(3) ;ref_stride
+
+ PROCESS_8X2X8 1
+ PROCESS_8X2X8 0
+ PROCESS_8X2X8 0
+ PROCESS_8X2X8 0
+
+ mov rdi, arg(4) ;Results
+ movdqa XMMWORD PTR [rdi], xmm1
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+;void vp9_sad8x16x8_sse4(
+; const unsigned char *src_ptr,
+; int src_stride,
+; const unsigned char *ref_ptr,
+; int ref_stride,
+; unsigned short *sad_array
+;);
+global sym(vp9_sad8x16x8_sse4)
+sym(vp9_sad8x16x8_sse4):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 5
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(2) ;ref_ptr
+
+ movsxd rax, dword ptr arg(1) ;src_stride
+ movsxd rdx, dword ptr arg(3) ;ref_stride
+
+ PROCESS_8X2X8 1
+ PROCESS_8X2X8 0
+ PROCESS_8X2X8 0
+ PROCESS_8X2X8 0
+ PROCESS_8X2X8 0
+ PROCESS_8X2X8 0
+ PROCESS_8X2X8 0
+ PROCESS_8X2X8 0
+ mov rdi, arg(4) ;Results
+ movdqa XMMWORD PTR [rdi], xmm1
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+;void vp9_sad4x4x8_sse4(
+; const unsigned char *src_ptr,
+; int src_stride,
+; const unsigned char *ref_ptr,
+; int ref_stride,
+; unsigned short *sad_array
+;);
+global sym(vp9_sad4x4x8_sse4)
+sym(vp9_sad4x4x8_sse4):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 5
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(2) ;ref_ptr
+
+ movsxd rax, dword ptr arg(1) ;src_stride
+ movsxd rdx, dword ptr arg(3) ;ref_stride
+
+ PROCESS_4X2X8 1
+ PROCESS_4X2X8 0
+
+ mov rdi, arg(4) ;Results
+ movdqa XMMWORD PTR [rdi], xmm1
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+
+
--- /dev/null
+++ b/vp9/encoder/x86/sad_ssse3.asm
@@ -1,0 +1,370 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
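+; PROCESS_16X2X3: accumulate SADs for two source rows against the reference at
+; offsets 0, +1 and +2, using unaligned lddqu loads; running totals are kept
+; in xmm5, xmm6 and xmm7.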
+%macro PROCESS_16X2X3 1
+%if %1
+ movdqa xmm0, XMMWORD PTR [rsi]
+ lddqu xmm5, XMMWORD PTR [rdi]
+ lddqu xmm6, XMMWORD PTR [rdi+1]
+ lddqu xmm7, XMMWORD PTR [rdi+2]
+
+ psadbw xmm5, xmm0
+ psadbw xmm6, xmm0
+ psadbw xmm7, xmm0
+%else
+ movdqa xmm0, XMMWORD PTR [rsi]
+ lddqu xmm1, XMMWORD PTR [rdi]
+ lddqu xmm2, XMMWORD PTR [rdi+1]
+ lddqu xmm3, XMMWORD PTR [rdi+2]
+
+ psadbw xmm1, xmm0
+ psadbw xmm2, xmm0
+ psadbw xmm3, xmm0
+
+ paddw xmm5, xmm1
+ paddw xmm6, xmm2
+ paddw xmm7, xmm3
+%endif
+ movdqa xmm0, XMMWORD PTR [rsi+rax]
+ lddqu xmm1, XMMWORD PTR [rdi+rdx]
+ lddqu xmm2, XMMWORD PTR [rdi+rdx+1]
+ lddqu xmm3, XMMWORD PTR [rdi+rdx+2]
+
+ lea rsi, [rsi+rax*2]
+ lea rdi, [rdi+rdx*2]
+
+ psadbw xmm1, xmm0
+ psadbw xmm2, xmm0
+ psadbw xmm3, xmm0
+
+ paddw xmm5, xmm1
+ paddw xmm6, xmm2
+ paddw xmm7, xmm3
+%endmacro
+
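+; PROCESS_16X2X3_OFFSET: alignment-aware variant.  The reference rows are read
+; with aligned loads and the three candidate offsets are reconstructed with
+; palignr, using the byte offset passed in %2.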
+%macro PROCESS_16X2X3_OFFSET 2
+%if %1
+ movdqa xmm0, XMMWORD PTR [rsi]
+ movdqa xmm4, XMMWORD PTR [rdi]
+ movdqa xmm7, XMMWORD PTR [rdi+16]
+
+ movdqa xmm5, xmm7
+ palignr xmm5, xmm4, %2
+
+ movdqa xmm6, xmm7
+ palignr xmm6, xmm4, (%2+1)
+
+ palignr xmm7, xmm4, (%2+2)
+
+ psadbw xmm5, xmm0
+ psadbw xmm6, xmm0
+ psadbw xmm7, xmm0
+%else
+ movdqa xmm0, XMMWORD PTR [rsi]
+ movdqa xmm4, XMMWORD PTR [rdi]
+ movdqa xmm3, XMMWORD PTR [rdi+16]
+
+ movdqa xmm1, xmm3
+ palignr xmm1, xmm4, %2
+
+ movdqa xmm2, xmm3
+ palignr xmm2, xmm4, (%2+1)
+
+ palignr xmm3, xmm4, (%2+2)
+
+ psadbw xmm1, xmm0
+ psadbw xmm2, xmm0
+ psadbw xmm3, xmm0
+
+ paddw xmm5, xmm1
+ paddw xmm6, xmm2
+ paddw xmm7, xmm3
+%endif
+ movdqa xmm0, XMMWORD PTR [rsi+rax]
+ movdqa xmm4, XMMWORD PTR [rdi+rdx]
+ movdqa xmm3, XMMWORD PTR [rdi+rdx+16]
+
+ movdqa xmm1, xmm3
+ palignr xmm1, xmm4, %2
+
+ movdqa xmm2, xmm3
+ palignr xmm2, xmm4, (%2+1)
+
+ palignr xmm3, xmm4, (%2+2)
+
+ lea rsi, [rsi+rax*2]
+ lea rdi, [rdi+rdx*2]
+
+ psadbw xmm1, xmm0
+ psadbw xmm2, xmm0
+ psadbw xmm3, xmm0
+
+ paddw xmm5, xmm1
+ paddw xmm6, xmm2
+ paddw xmm7, xmm3
+%endmacro
+
+%macro PROCESS_16X16X3_OFFSET 2
+%2_aligned_by_%1:
+
+ sub rdi, %1
+
+ PROCESS_16X2X3_OFFSET 1, %1
+ PROCESS_16X2X3_OFFSET 0, %1
+ PROCESS_16X2X3_OFFSET 0, %1
+ PROCESS_16X2X3_OFFSET 0, %1
+ PROCESS_16X2X3_OFFSET 0, %1
+ PROCESS_16X2X3_OFFSET 0, %1
+ PROCESS_16X2X3_OFFSET 0, %1
+ PROCESS_16X2X3_OFFSET 0, %1
+
+ jmp %2_store_off
+
+%endmacro
+
+%macro PROCESS_16X8X3_OFFSET 2
+%2_aligned_by_%1:
+
+ sub rdi, %1
+
+ PROCESS_16X2X3_OFFSET 1, %1
+ PROCESS_16X2X3_OFFSET 0, %1
+ PROCESS_16X2X3_OFFSET 0, %1
+ PROCESS_16X2X3_OFFSET 0, %1
+
+ jmp %2_store_off
+
+%endmacro
+
+;void vp9_sad16x16x3_ssse3(
+; unsigned char *src_ptr,
+; int src_stride,
+; unsigned char *ref_ptr,
+; int ref_stride,
+; int *results)
+global sym(vp9_sad16x16x3_ssse3)
+sym(vp9_sad16x16x3_ssse3):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 5
+ SAVE_XMM 7
+ push rsi
+ push rdi
+ push rcx
+ ; end prolog
+
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(2) ;ref_ptr
+
+ mov rdx, 0xf
+ and rdx, rdi
+
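+        ; rdx = ref_ptr & 15; use it to dispatch to the palignr variant built
+        ; for that alignment (the jump table below holds relative offsets).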
+ jmp .vp9_sad16x16x3_ssse3_skiptable
+.vp9_sad16x16x3_ssse3_jumptable:
+ dd .vp9_sad16x16x3_ssse3_aligned_by_0 - .vp9_sad16x16x3_ssse3_do_jump
+ dd .vp9_sad16x16x3_ssse3_aligned_by_1 - .vp9_sad16x16x3_ssse3_do_jump
+ dd .vp9_sad16x16x3_ssse3_aligned_by_2 - .vp9_sad16x16x3_ssse3_do_jump
+ dd .vp9_sad16x16x3_ssse3_aligned_by_3 - .vp9_sad16x16x3_ssse3_do_jump
+ dd .vp9_sad16x16x3_ssse3_aligned_by_4 - .vp9_sad16x16x3_ssse3_do_jump
+ dd .vp9_sad16x16x3_ssse3_aligned_by_5 - .vp9_sad16x16x3_ssse3_do_jump
+ dd .vp9_sad16x16x3_ssse3_aligned_by_6 - .vp9_sad16x16x3_ssse3_do_jump
+ dd .vp9_sad16x16x3_ssse3_aligned_by_7 - .vp9_sad16x16x3_ssse3_do_jump
+ dd .vp9_sad16x16x3_ssse3_aligned_by_8 - .vp9_sad16x16x3_ssse3_do_jump
+ dd .vp9_sad16x16x3_ssse3_aligned_by_9 - .vp9_sad16x16x3_ssse3_do_jump
+ dd .vp9_sad16x16x3_ssse3_aligned_by_10 - .vp9_sad16x16x3_ssse3_do_jump
+ dd .vp9_sad16x16x3_ssse3_aligned_by_11 - .vp9_sad16x16x3_ssse3_do_jump
+ dd .vp9_sad16x16x3_ssse3_aligned_by_12 - .vp9_sad16x16x3_ssse3_do_jump
+ dd .vp9_sad16x16x3_ssse3_aligned_by_13 - .vp9_sad16x16x3_ssse3_do_jump
+ dd .vp9_sad16x16x3_ssse3_aligned_by_14 - .vp9_sad16x16x3_ssse3_do_jump
+ dd .vp9_sad16x16x3_ssse3_aligned_by_15 - .vp9_sad16x16x3_ssse3_do_jump
+.vp9_sad16x16x3_ssse3_skiptable:
+
+ call .vp9_sad16x16x3_ssse3_do_jump
+.vp9_sad16x16x3_ssse3_do_jump:
+ pop rcx ; get the address of do_jump
+ mov rax, .vp9_sad16x16x3_ssse3_jumptable - .vp9_sad16x16x3_ssse3_do_jump
+ add rax, rcx ; get the absolute address of vp9_sad16x16x3_ssse3_jumptable
+
+ movsxd rax, dword [rax + 4*rdx] ; get the 32 bit offset from the jumptable
+ add rcx, rax
+
+ movsxd rax, dword ptr arg(1) ;src_stride
+ movsxd rdx, dword ptr arg(3) ;ref_stride
+
+ jmp rcx
+
+ PROCESS_16X16X3_OFFSET 0, .vp9_sad16x16x3_ssse3
+ PROCESS_16X16X3_OFFSET 1, .vp9_sad16x16x3_ssse3
+ PROCESS_16X16X3_OFFSET 2, .vp9_sad16x16x3_ssse3
+ PROCESS_16X16X3_OFFSET 3, .vp9_sad16x16x3_ssse3
+ PROCESS_16X16X3_OFFSET 4, .vp9_sad16x16x3_ssse3
+ PROCESS_16X16X3_OFFSET 5, .vp9_sad16x16x3_ssse3
+ PROCESS_16X16X3_OFFSET 6, .vp9_sad16x16x3_ssse3
+ PROCESS_16X16X3_OFFSET 7, .vp9_sad16x16x3_ssse3
+ PROCESS_16X16X3_OFFSET 8, .vp9_sad16x16x3_ssse3
+ PROCESS_16X16X3_OFFSET 9, .vp9_sad16x16x3_ssse3
+ PROCESS_16X16X3_OFFSET 10, .vp9_sad16x16x3_ssse3
+ PROCESS_16X16X3_OFFSET 11, .vp9_sad16x16x3_ssse3
+ PROCESS_16X16X3_OFFSET 12, .vp9_sad16x16x3_ssse3
+ PROCESS_16X16X3_OFFSET 13, .vp9_sad16x16x3_ssse3
+ PROCESS_16X16X3_OFFSET 14, .vp9_sad16x16x3_ssse3
+
+.vp9_sad16x16x3_ssse3_aligned_by_15:
+ PROCESS_16X2X3 1
+ PROCESS_16X2X3 0
+ PROCESS_16X2X3 0
+ PROCESS_16X2X3 0
+ PROCESS_16X2X3 0
+ PROCESS_16X2X3 0
+ PROCESS_16X2X3 0
+ PROCESS_16X2X3 0
+
+.vp9_sad16x16x3_ssse3_store_off:
+ mov rdi, arg(4) ;Results
+
+ movq xmm0, xmm5
+ psrldq xmm5, 8
+
+ paddw xmm0, xmm5
+ movd [rdi], xmm0
+;-
+ movq xmm0, xmm6
+ psrldq xmm6, 8
+
+ paddw xmm0, xmm6
+ movd [rdi+4], xmm0
+;-
+ movq xmm0, xmm7
+ psrldq xmm7, 8
+
+ paddw xmm0, xmm7
+ movd [rdi+8], xmm0
+
+ ; begin epilog
+ pop rcx
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+;void vp9_sad16x8x3_ssse3(
+; unsigned char *src_ptr,
+; int src_stride,
+; unsigned char *ref_ptr,
+; int ref_stride,
+; int *results)
+global sym(vp9_sad16x8x3_ssse3)
+sym(vp9_sad16x8x3_ssse3):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 5
+ SAVE_XMM 7
+ push rsi
+ push rdi
+ push rcx
+ ; end prolog
+
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(2) ;ref_ptr
+
+ mov rdx, 0xf
+ and rdx, rdi
+
+ jmp .vp9_sad16x8x3_ssse3_skiptable
+.vp9_sad16x8x3_ssse3_jumptable:
+ dd .vp9_sad16x8x3_ssse3_aligned_by_0 - .vp9_sad16x8x3_ssse3_do_jump
+ dd .vp9_sad16x8x3_ssse3_aligned_by_1 - .vp9_sad16x8x3_ssse3_do_jump
+ dd .vp9_sad16x8x3_ssse3_aligned_by_2 - .vp9_sad16x8x3_ssse3_do_jump
+ dd .vp9_sad16x8x3_ssse3_aligned_by_3 - .vp9_sad16x8x3_ssse3_do_jump
+ dd .vp9_sad16x8x3_ssse3_aligned_by_4 - .vp9_sad16x8x3_ssse3_do_jump
+ dd .vp9_sad16x8x3_ssse3_aligned_by_5 - .vp9_sad16x8x3_ssse3_do_jump
+ dd .vp9_sad16x8x3_ssse3_aligned_by_6 - .vp9_sad16x8x3_ssse3_do_jump
+ dd .vp9_sad16x8x3_ssse3_aligned_by_7 - .vp9_sad16x8x3_ssse3_do_jump
+ dd .vp9_sad16x8x3_ssse3_aligned_by_8 - .vp9_sad16x8x3_ssse3_do_jump
+ dd .vp9_sad16x8x3_ssse3_aligned_by_9 - .vp9_sad16x8x3_ssse3_do_jump
+ dd .vp9_sad16x8x3_ssse3_aligned_by_10 - .vp9_sad16x8x3_ssse3_do_jump
+ dd .vp9_sad16x8x3_ssse3_aligned_by_11 - .vp9_sad16x8x3_ssse3_do_jump
+ dd .vp9_sad16x8x3_ssse3_aligned_by_12 - .vp9_sad16x8x3_ssse3_do_jump
+ dd .vp9_sad16x8x3_ssse3_aligned_by_13 - .vp9_sad16x8x3_ssse3_do_jump
+ dd .vp9_sad16x8x3_ssse3_aligned_by_14 - .vp9_sad16x8x3_ssse3_do_jump
+ dd .vp9_sad16x8x3_ssse3_aligned_by_15 - .vp9_sad16x8x3_ssse3_do_jump
+.vp9_sad16x8x3_ssse3_skiptable:
+
+ call .vp9_sad16x8x3_ssse3_do_jump
+.vp9_sad16x8x3_ssse3_do_jump:
+ pop rcx ; get the address of do_jump
+ mov rax, .vp9_sad16x8x3_ssse3_jumptable - .vp9_sad16x8x3_ssse3_do_jump
+ add rax, rcx ; get the absolute address of vp9_sad16x8x3_ssse3_jumptable
+
+ movsxd rax, dword [rax + 4*rdx] ; get the 32 bit offset from the jumptable
+ add rcx, rax
+
+ movsxd rax, dword ptr arg(1) ;src_stride
+ movsxd rdx, dword ptr arg(3) ;ref_stride
+
+ jmp rcx
+
+ PROCESS_16X8X3_OFFSET 0, .vp9_sad16x8x3_ssse3
+ PROCESS_16X8X3_OFFSET 1, .vp9_sad16x8x3_ssse3
+ PROCESS_16X8X3_OFFSET 2, .vp9_sad16x8x3_ssse3
+ PROCESS_16X8X3_OFFSET 3, .vp9_sad16x8x3_ssse3
+ PROCESS_16X8X3_OFFSET 4, .vp9_sad16x8x3_ssse3
+ PROCESS_16X8X3_OFFSET 5, .vp9_sad16x8x3_ssse3
+ PROCESS_16X8X3_OFFSET 6, .vp9_sad16x8x3_ssse3
+ PROCESS_16X8X3_OFFSET 7, .vp9_sad16x8x3_ssse3
+ PROCESS_16X8X3_OFFSET 8, .vp9_sad16x8x3_ssse3
+ PROCESS_16X8X3_OFFSET 9, .vp9_sad16x8x3_ssse3
+ PROCESS_16X8X3_OFFSET 10, .vp9_sad16x8x3_ssse3
+ PROCESS_16X8X3_OFFSET 11, .vp9_sad16x8x3_ssse3
+ PROCESS_16X8X3_OFFSET 12, .vp9_sad16x8x3_ssse3
+ PROCESS_16X8X3_OFFSET 13, .vp9_sad16x8x3_ssse3
+ PROCESS_16X8X3_OFFSET 14, .vp9_sad16x8x3_ssse3
+
+.vp9_sad16x8x3_ssse3_aligned_by_15:
+
+ PROCESS_16X2X3 1
+ PROCESS_16X2X3 0
+ PROCESS_16X2X3 0
+ PROCESS_16X2X3 0
+
+.vp9_sad16x8x3_ssse3_store_off:
+ mov rdi, arg(4) ;Results
+
+ movq xmm0, xmm5
+ psrldq xmm5, 8
+
+ paddw xmm0, xmm5
+ movd [rdi], xmm0
+;-
+ movq xmm0, xmm6
+ psrldq xmm6, 8
+
+ paddw xmm0, xmm6
+ movd [rdi+4], xmm0
+;-
+ movq xmm0, xmm7
+ psrldq xmm7, 8
+
+ paddw xmm0, xmm7
+ movd [rdi+8], xmm0
+
+ ; begin epilog
+ pop rcx
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
--- /dev/null
+++ b/vp9/encoder/x86/ssim_opt.asm
@@ -1,0 +1,216 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+%include "vpx_ports/x86_abi_support.asm"
+
+; tabulate_ssim - sums sum_s,sum_r,sum_sq_s,sum_sq_r, sum_sxr
+%macro TABULATE_SSIM 0
+ paddusw xmm15, xmm3 ; sum_s
+ paddusw xmm14, xmm4 ; sum_r
+ movdqa xmm1, xmm3
+ pmaddwd xmm1, xmm1
+ paddd xmm13, xmm1 ; sum_sq_s
+ movdqa xmm2, xmm4
+ pmaddwd xmm2, xmm2
+ paddd xmm12, xmm2 ; sum_sq_r
+ pmaddwd xmm3, xmm4
+ paddd xmm11, xmm3 ; sum_sxr
+%endmacro
+
+; Sum across the register %1 starting with q words
+%macro SUM_ACROSS_Q 1
+ movdqa xmm2,%1
+ punpckldq %1,xmm0
+ punpckhdq xmm2,xmm0
+ paddq %1,xmm2
+ movdqa xmm2,%1
+ punpcklqdq %1,xmm0
+ punpckhqdq xmm2,xmm0
+ paddq %1,xmm2
+%endmacro
+
+; Sum across the register %1 starting with words
+%macro SUM_ACROSS_W 1
+ movdqa xmm1, %1
+ punpcklwd %1,xmm0
+ punpckhwd xmm1,xmm0
+ paddd %1, xmm1
+ SUM_ACROSS_Q %1
+%endmacro
+;void vp9_ssim_parms_16x16_sse2(
+; unsigned char *s,
+; int sp,
+; unsigned char *r,
+;            int rp,
+; unsigned long *sum_s,
+; unsigned long *sum_r,
+; unsigned long *sum_sq_s,
+; unsigned long *sum_sq_r,
+; unsigned long *sum_sxr);
+;
+; TODO: Pass the parameters through a structure; the pxors are probably not
+; needed (the calling app will initialize the sums to 0), everything could fit
+; in sse2 without too much hassle, and better estimates are probably possible
+; with psadbw or pavgb.  At this point this is just meant to be a first pass
+; at calculating all the parms needed for 16x16 ssim so we can play with
+; dssim as distortion in the mode selection code.
+global sym(vp9_ssim_parms_16x16_sse2)
+sym(vp9_ssim_parms_16x16_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 9
+ SAVE_XMM 15
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rsi, arg(0) ;s
+ mov rcx, arg(1) ;sp
+ mov rdi, arg(2) ;r
+ mov rax, arg(3) ;rp
+
+ pxor xmm0, xmm0
+ pxor xmm15,xmm15 ;sum_s
+ pxor xmm14,xmm14 ;sum_r
+ pxor xmm13,xmm13 ;sum_sq_s
+ pxor xmm12,xmm12 ;sum_sq_r
+ pxor xmm11,xmm11 ;sum_sxr
+
+ mov rdx, 16 ;row counter
+.NextRow:
+
+ ;grab source and reference pixels
+ movdqu xmm5, [rsi]
+ movdqu xmm6, [rdi]
+ movdqa xmm3, xmm5
+ movdqa xmm4, xmm6
+ punpckhbw xmm3, xmm0 ; high_s
+ punpckhbw xmm4, xmm0 ; high_r
+
+ TABULATE_SSIM
+
+ movdqa xmm3, xmm5
+ movdqa xmm4, xmm6
+ punpcklbw xmm3, xmm0 ; low_s
+ punpcklbw xmm4, xmm0 ; low_r
+
+ TABULATE_SSIM
+
+ add rsi, rcx ; next s row
+ add rdi, rax ; next r row
+
+ dec rdx ; counter
+ jnz .NextRow
+
+ SUM_ACROSS_W xmm15
+ SUM_ACROSS_W xmm14
+ SUM_ACROSS_Q xmm13
+ SUM_ACROSS_Q xmm12
+ SUM_ACROSS_Q xmm11
+
+ mov rdi,arg(4)
+ movd [rdi], xmm15;
+ mov rdi,arg(5)
+ movd [rdi], xmm14;
+ mov rdi,arg(6)
+ movd [rdi], xmm13;
+ mov rdi,arg(7)
+ movd [rdi], xmm12;
+ mov rdi,arg(8)
+ movd [rdi], xmm11;
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+;void vp9_ssim_parms_8x8_sse2(
+; unsigned char *s,
+; int sp,
+; unsigned char *r,
+;            int rp,
+; unsigned long *sum_s,
+; unsigned long *sum_r,
+; unsigned long *sum_sq_s,
+; unsigned long *sum_sq_r,
+; unsigned long *sum_sxr);
+;
+; TODO: Pass the parameters through a structure; the pxors are probably not
+; needed (the calling app will initialize the sums to 0), everything could fit
+; in sse2 without too much hassle, and better estimates are probably possible
+; with psadbw or pavgb.  At this point this is just meant to be a first pass
+; at calculating all the parms needed for 16x16 ssim so we can play with
+; dssim as distortion in the mode selection code.
+global sym(vp9_ssim_parms_8x8_sse2)
+sym(vp9_ssim_parms_8x8_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 9
+ SAVE_XMM 15
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rsi, arg(0) ;s
+ mov rcx, arg(1) ;sp
+ mov rdi, arg(2) ;r
+ mov rax, arg(3) ;rp
+
+ pxor xmm0, xmm0
+ pxor xmm15,xmm15 ;sum_s
+ pxor xmm14,xmm14 ;sum_r
+ pxor xmm13,xmm13 ;sum_sq_s
+ pxor xmm12,xmm12 ;sum_sq_r
+ pxor xmm11,xmm11 ;sum_sxr
+
+ mov rdx, 8 ;row counter
+.NextRow:
+
+ ;grab source and reference pixels
+ movq xmm3, [rsi]
+ movq xmm4, [rdi]
+ punpcklbw xmm3, xmm0 ; low_s
+ punpcklbw xmm4, xmm0 ; low_r
+
+ TABULATE_SSIM
+
+ add rsi, rcx ; next s row
+ add rdi, rax ; next r row
+
+ dec rdx ; counter
+ jnz .NextRow
+
+ SUM_ACROSS_W xmm15
+ SUM_ACROSS_W xmm14
+ SUM_ACROSS_Q xmm13
+ SUM_ACROSS_Q xmm12
+ SUM_ACROSS_Q xmm11
+
+ mov rdi,arg(4)
+ movd [rdi], xmm15;
+ mov rdi,arg(5)
+ movd [rdi], xmm14;
+ mov rdi,arg(6)
+ movd [rdi], xmm13;
+ mov rdi,arg(7)
+ movd [rdi], xmm12;
+ mov rdi,arg(8)
+ movd [rdi], xmm11;
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
--- /dev/null
+++ b/vp9/encoder/x86/subtract_mmx.asm
@@ -1,0 +1,432 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+;void vp9_subtract_b_mmx_impl(unsigned char *z, int src_stride,
+; short *diff, unsigned char *Predictor,
+; int pitch);
+global sym(vp9_subtract_b_mmx_impl)
+sym(vp9_subtract_b_mmx_impl):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 5
+ push rsi
+ push rdi
+ ; end prolog
+
+
+ mov rdi, arg(2) ;diff
+ mov rax, arg(3) ;Predictor
+ mov rsi, arg(0) ;z
+ movsxd rdx, dword ptr arg(1);src_stride;
+ movsxd rcx, dword ptr arg(4);pitch
+ pxor mm7, mm7
+
+ movd mm0, [rsi]
+ movd mm1, [rax]
+ punpcklbw mm0, mm7
+ punpcklbw mm1, mm7
+ psubw mm0, mm1
+ movq [rdi], mm0
+
+
+ movd mm0, [rsi+rdx]
+ movd mm1, [rax+rcx]
+ punpcklbw mm0, mm7
+ punpcklbw mm1, mm7
+ psubw mm0, mm1
+ movq [rdi+rcx*2],mm0
+
+
+ movd mm0, [rsi+rdx*2]
+ movd mm1, [rax+rcx*2]
+ punpcklbw mm0, mm7
+ punpcklbw mm1, mm7
+ psubw mm0, mm1
+ movq [rdi+rcx*4], mm0
+
+ lea rsi, [rsi+rdx*2]
+ lea rcx, [rcx+rcx*2]
+
+
+
+ movd mm0, [rsi+rdx]
+ movd mm1, [rax+rcx]
+ punpcklbw mm0, mm7
+ punpcklbw mm1, mm7
+ psubw mm0, mm1
+ movq [rdi+rcx*2], mm0
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+;void vp9_subtract_mby_mmx(short *diff, unsigned char *src, unsigned char *pred, int stride)
+global sym(vp9_subtract_mby_mmx)
+sym(vp9_subtract_mby_mmx):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 4
+ push rsi
+ push rdi
+ ; end prolog
+
+
+ mov rsi, arg(1) ;src
+ mov rdi, arg(0) ;diff
+
+ mov rax, arg(2) ;pred
+ movsxd rdx, dword ptr arg(3) ;stride
+
+ mov rcx, 16
+ pxor mm0, mm0
+
+.submby_loop:
+
+ movq mm1, [rsi]
+ movq mm3, [rax]
+
+ movq mm2, mm1
+ movq mm4, mm3
+
+ punpcklbw mm1, mm0
+ punpcklbw mm3, mm0
+
+ punpckhbw mm2, mm0
+ punpckhbw mm4, mm0
+
+ psubw mm1, mm3
+ psubw mm2, mm4
+
+ movq [rdi], mm1
+ movq [rdi+8], mm2
+
+
+ movq mm1, [rsi+8]
+ movq mm3, [rax+8]
+
+ movq mm2, mm1
+ movq mm4, mm3
+
+ punpcklbw mm1, mm0
+ punpcklbw mm3, mm0
+
+ punpckhbw mm2, mm0
+ punpckhbw mm4, mm0
+
+ psubw mm1, mm3
+ psubw mm2, mm4
+
+ movq [rdi+16], mm1
+ movq [rdi+24], mm2
+
+
+ add rdi, 32
+ add rax, 16
+
+ lea rsi, [rsi+rdx]
+
+ sub rcx, 1
+ jnz .submby_loop
+
+ pop rdi
+ pop rsi
+ ; begin epilog
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+;void vp9_subtract_mbuv_mmx(short *diff, unsigned char *usrc, unsigned char *vsrc, unsigned char *pred, int stride)
+global sym(vp9_subtract_mbuv_mmx)
+sym(vp9_subtract_mbuv_mmx):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 5
+ push rsi
+ push rdi
+ ; end prolog
+
+ ;short *udiff = diff + 256;
+ ;short *vdiff = diff + 320;
+ ;unsigned char *upred = pred + 256;
+ ;unsigned char *vpred = pred + 320;
+
+ ;unsigned char *z = usrc;
+ ;unsigned short *diff = udiff;
+ ;unsigned char *Predictor= upred;
+
+ mov rdi, arg(0) ;diff
+ mov rax, arg(3) ;pred
+ mov rsi, arg(1) ;z = usrc
+ add rdi, 256*2 ;diff = diff + 256 (shorts)
+ add rax, 256 ;Predictor = pred + 256
+ movsxd rdx, dword ptr arg(4) ;stride;
+ pxor mm7, mm7
+
+ movq mm0, [rsi]
+ movq mm1, [rax]
+ movq mm3, mm0
+ movq mm4, mm1
+ punpcklbw mm0, mm7
+ punpcklbw mm1, mm7
+ punpckhbw mm3, mm7
+ punpckhbw mm4, mm7
+ psubw mm0, mm1
+ psubw mm3, mm4
+ movq [rdi], mm0
+ movq [rdi+8], mm3
+
+
+ movq mm0, [rsi+rdx]
+ movq mm1, [rax+8]
+ movq mm3, mm0
+ movq mm4, mm1
+ punpcklbw mm0, mm7
+ punpcklbw mm1, mm7
+ punpckhbw mm3, mm7
+ punpckhbw mm4, mm7
+ psubw mm0, mm1
+ psubw mm3, mm4
+ movq [rdi+16], mm0
+ movq [rdi+24], mm3
+
+ movq mm0, [rsi+rdx*2]
+ movq mm1, [rax+16]
+ movq mm3, mm0
+ movq mm4, mm1
+ punpcklbw mm0, mm7
+ punpcklbw mm1, mm7
+ punpckhbw mm3, mm7
+ punpckhbw mm4, mm7
+ psubw mm0, mm1
+ psubw mm3, mm4
+ movq [rdi+32], mm0
+ movq [rdi+40], mm3
+ lea rsi, [rsi+rdx*2]
+
+
+ movq mm0, [rsi+rdx]
+ movq mm1, [rax+24]
+ movq mm3, mm0
+ movq mm4, mm1
+ punpcklbw mm0, mm7
+ punpcklbw mm1, mm7
+ punpckhbw mm3, mm7
+ punpckhbw mm4, mm7
+ psubw mm0, mm1
+ psubw mm3, mm4
+
+ movq [rdi+48], mm0
+ movq [rdi+56], mm3
+
+
+ add rdi, 64
+ add rax, 32
+ lea rsi, [rsi+rdx*2]
+
+
+ movq mm0, [rsi]
+ movq mm1, [rax]
+ movq mm3, mm0
+ movq mm4, mm1
+ punpcklbw mm0, mm7
+ punpcklbw mm1, mm7
+ punpckhbw mm3, mm7
+ punpckhbw mm4, mm7
+ psubw mm0, mm1
+ psubw mm3, mm4
+ movq [rdi], mm0
+ movq [rdi+8], mm3
+
+
+ movq mm0, [rsi+rdx]
+ movq mm1, [rax+8]
+ movq mm3, mm0
+ movq mm4, mm1
+ punpcklbw mm0, mm7
+ punpcklbw mm1, mm7
+ punpckhbw mm3, mm7
+ punpckhbw mm4, mm7
+ psubw mm0, mm1
+ psubw mm3, mm4
+ movq [rdi+16], mm0
+ movq [rdi+24], mm3
+
+ movq mm0, [rsi+rdx*2]
+ movq mm1, [rax+16]
+ movq mm3, mm0
+ movq mm4, mm1
+ punpcklbw mm0, mm7
+ punpcklbw mm1, mm7
+ punpckhbw mm3, mm7
+ punpckhbw mm4, mm7
+ psubw mm0, mm1
+ psubw mm3, mm4
+ movq [rdi+32], mm0
+ movq [rdi+40], mm3
+ lea rsi, [rsi+rdx*2]
+
+
+ movq mm0, [rsi+rdx]
+ movq mm1, [rax+24]
+ movq mm3, mm0
+ movq mm4, mm1
+ punpcklbw mm0, mm7
+ punpcklbw mm1, mm7
+ punpckhbw mm3, mm7
+ punpckhbw mm4, mm7
+ psubw mm0, mm1
+ psubw mm3, mm4
+
+ movq [rdi+48], mm0
+ movq [rdi+56], mm3
+
+ ;unsigned char *z = vsrc;
+ ;unsigned short *diff = vdiff;
+ ;unsigned char *Predictor= vpred;
+
+ mov rdi, arg(0) ;diff
+ mov rax, arg(3) ;pred
+ mov rsi, arg(2) ;z = usrc
+ add rdi, 320*2 ;diff = diff + 320 (shorts)
+ add rax, 320 ;Predictor = pred + 320
+ movsxd rdx, dword ptr arg(4) ;stride;
+ pxor mm7, mm7
+
+ movq mm0, [rsi]
+ movq mm1, [rax]
+ movq mm3, mm0
+ movq mm4, mm1
+ punpcklbw mm0, mm7
+ punpcklbw mm1, mm7
+ punpckhbw mm3, mm7
+ punpckhbw mm4, mm7
+ psubw mm0, mm1
+ psubw mm3, mm4
+ movq [rdi], mm0
+ movq [rdi+8], mm3
+
+
+ movq mm0, [rsi+rdx]
+ movq mm1, [rax+8]
+ movq mm3, mm0
+ movq mm4, mm1
+ punpcklbw mm0, mm7
+ punpcklbw mm1, mm7
+ punpckhbw mm3, mm7
+ punpckhbw mm4, mm7
+ psubw mm0, mm1
+ psubw mm3, mm4
+ movq [rdi+16], mm0
+ movq [rdi+24], mm3
+
+ movq mm0, [rsi+rdx*2]
+ movq mm1, [rax+16]
+ movq mm3, mm0
+ movq mm4, mm1
+ punpcklbw mm0, mm7
+ punpcklbw mm1, mm7
+ punpckhbw mm3, mm7
+ punpckhbw mm4, mm7
+ psubw mm0, mm1
+ psubw mm3, mm4
+ movq [rdi+32], mm0
+ movq [rdi+40], mm3
+ lea rsi, [rsi+rdx*2]
+
+
+ movq mm0, [rsi+rdx]
+ movq mm1, [rax+24]
+ movq mm3, mm0
+ movq mm4, mm1
+ punpcklbw mm0, mm7
+ punpcklbw mm1, mm7
+ punpckhbw mm3, mm7
+ punpckhbw mm4, mm7
+ psubw mm0, mm1
+ psubw mm3, mm4
+
+ movq [rdi+48], mm0
+ movq [rdi+56], mm3
+
+
+ add rdi, 64
+ add rax, 32
+ lea rsi, [rsi+rdx*2]
+
+
+ movq mm0, [rsi]
+ movq mm1, [rax]
+ movq mm3, mm0
+ movq mm4, mm1
+ punpcklbw mm0, mm7
+ punpcklbw mm1, mm7
+ punpckhbw mm3, mm7
+ punpckhbw mm4, mm7
+ psubw mm0, mm1
+ psubw mm3, mm4
+ movq [rdi], mm0
+ movq [rdi+8], mm3
+
+
+ movq mm0, [rsi+rdx]
+ movq mm1, [rax+8]
+ movq mm3, mm0
+ movq mm4, mm1
+ punpcklbw mm0, mm7
+ punpcklbw mm1, mm7
+ punpckhbw mm3, mm7
+ punpckhbw mm4, mm7
+ psubw mm0, mm1
+ psubw mm3, mm4
+ movq [rdi+16], mm0
+ movq [rdi+24], mm3
+
+ movq mm0, [rsi+rdx*2]
+ movq mm1, [rax+16]
+ movq mm3, mm0
+ movq mm4, mm1
+ punpcklbw mm0, mm7
+ punpcklbw mm1, mm7
+ punpckhbw mm3, mm7
+ punpckhbw mm4, mm7
+ psubw mm0, mm1
+ psubw mm3, mm4
+ movq [rdi+32], mm0
+ movq [rdi+40], mm3
+ lea rsi, [rsi+rdx*2]
+
+
+ movq mm0, [rsi+rdx]
+ movq mm1, [rax+24]
+ movq mm3, mm0
+ movq mm4, mm1
+ punpcklbw mm0, mm7
+ punpcklbw mm1, mm7
+ punpckhbw mm3, mm7
+ punpckhbw mm4, mm7
+ psubw mm0, mm1
+ psubw mm3, mm4
+
+ movq [rdi+48], mm0
+ movq [rdi+56], mm3
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ UNSHADOW_ARGS
+ pop rbp
+ ret
--- /dev/null
+++ b/vp9/encoder/x86/subtract_sse2.asm
@@ -1,0 +1,356 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+;void vp9_subtract_b_sse2_impl(unsigned char *z, int src_stride,
+; short *diff, unsigned char *Predictor,
+; int pitch);
+global sym(vp9_subtract_b_sse2_impl)
+sym(vp9_subtract_b_sse2_impl):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 5
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rdi, arg(2) ;diff
+ mov rax, arg(3) ;Predictor
+ mov rsi, arg(0) ;z
+ movsxd rdx, dword ptr arg(1);src_stride;
+ movsxd rcx, dword ptr arg(4);pitch
+ pxor mm7, mm7
+
+ movd mm0, [rsi]
+ movd mm1, [rax]
+ punpcklbw mm0, mm7
+ punpcklbw mm1, mm7
+ psubw mm0, mm1
+ movq MMWORD PTR [rdi], mm0
+
+ movd mm0, [rsi+rdx]
+ movd mm1, [rax+rcx]
+ punpcklbw mm0, mm7
+ punpcklbw mm1, mm7
+ psubw mm0, mm1
+ movq MMWORD PTR [rdi+rcx*2], mm0
+
+ movd mm0, [rsi+rdx*2]
+ movd mm1, [rax+rcx*2]
+ punpcklbw mm0, mm7
+ punpcklbw mm1, mm7
+ psubw mm0, mm1
+ movq MMWORD PTR [rdi+rcx*4], mm0
+
+ lea rsi, [rsi+rdx*2]
+ lea rcx, [rcx+rcx*2]
+
+ movd mm0, [rsi+rdx]
+ movd mm1, [rax+rcx]
+ punpcklbw mm0, mm7
+ punpcklbw mm1, mm7
+ psubw mm0, mm1
+ movq MMWORD PTR [rdi+rcx*2], mm0
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+;void vp9_subtract_mby_sse2(short *diff, unsigned char *src, unsigned char *pred, int stride)
+global sym(vp9_subtract_mby_sse2)
+sym(vp9_subtract_mby_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 4
+ SAVE_XMM 7
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rsi, arg(1) ;src
+ mov rdi, arg(0) ;diff
+
+ mov rax, arg(2) ;pred
+ movsxd rdx, dword ptr arg(3) ;stride
+
+ mov rcx, 8 ; do two lines at one time
+
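+        ; psubb only yields the low 8 bits of each difference; the
+        ; pxor/pcmpgtb sequence below recovers the sign so punpcklbw/punpckhbw
+        ; can widen the results to correct 16-bit signed differences.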
+.submby_loop:
+ movdqa xmm0, XMMWORD PTR [rsi] ; src
+ movdqa xmm1, XMMWORD PTR [rax] ; pred
+
+ movdqa xmm2, xmm0
+ psubb xmm0, xmm1
+
+ pxor xmm1, [GLOBAL(t80)] ;convert to signed values
+ pxor xmm2, [GLOBAL(t80)]
+ pcmpgtb xmm1, xmm2 ; obtain sign information
+
+ movdqa xmm2, xmm0
+ movdqa xmm3, xmm1
+ punpcklbw xmm0, xmm1 ; put sign back to subtraction
+ punpckhbw xmm2, xmm3 ; put sign back to subtraction
+
+ movdqa XMMWORD PTR [rdi], xmm0
+ movdqa XMMWORD PTR [rdi +16], xmm2
+
+ movdqa xmm4, XMMWORD PTR [rsi + rdx]
+ movdqa xmm5, XMMWORD PTR [rax + 16]
+
+ movdqa xmm6, xmm4
+ psubb xmm4, xmm5
+
+ pxor xmm5, [GLOBAL(t80)] ;convert to signed values
+ pxor xmm6, [GLOBAL(t80)]
+ pcmpgtb xmm5, xmm6 ; obtain sign information
+
+ movdqa xmm6, xmm4
+ movdqa xmm7, xmm5
+ punpcklbw xmm4, xmm5 ; put sign back to subtraction
+ punpckhbw xmm6, xmm7 ; put sign back to subtraction
+
+ movdqa XMMWORD PTR [rdi +32], xmm4
+ movdqa XMMWORD PTR [rdi +48], xmm6
+
+ add rdi, 64
+ add rax, 32
+ lea rsi, [rsi+rdx*2]
+
+ sub rcx, 1
+ jnz .submby_loop
+
+ pop rdi
+ pop rsi
+ ; begin epilog
+ RESTORE_GOT
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+;void vp9_subtract_mbuv_sse2(short *diff, unsigned char *usrc, unsigned char *vsrc, unsigned char *pred, int stride)
+global sym(vp9_subtract_mbuv_sse2)
+sym(vp9_subtract_mbuv_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 5
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rdi, arg(0) ;diff
+ mov rax, arg(3) ;pred
+ mov rsi, arg(1) ;z = usrc
+ add rdi, 256*2 ;diff = diff + 256 (shorts)
+ add rax, 256 ;Predictor = pred + 256
+ movsxd rdx, dword ptr arg(4) ;stride;
+ lea rcx, [rdx + rdx*2]
+
+ ;u
+ ;line 0 1
+ movq xmm0, MMWORD PTR [rsi] ; src
+ movq xmm2, MMWORD PTR [rsi+rdx]
+ movdqa xmm1, XMMWORD PTR [rax] ; pred
+ punpcklqdq xmm0, xmm2
+
+ movdqa xmm2, xmm0
+ psubb xmm0, xmm1 ; subtraction with sign missed
+
+ pxor xmm1, [GLOBAL(t80)] ;convert to signed values
+ pxor xmm2, [GLOBAL(t80)]
+ pcmpgtb xmm1, xmm2 ; obtain sign information
+
+ movdqa xmm2, xmm0
+ movdqa xmm3, xmm1
+ punpcklbw xmm0, xmm1 ; put sign back to subtraction
+ punpckhbw xmm2, xmm3 ; put sign back to subtraction
+
+ movdqa XMMWORD PTR [rdi], xmm0
+ movdqa XMMWORD PTR [rdi +16], xmm2
+
+ ;line 2 3
+ movq xmm0, MMWORD PTR [rsi+rdx*2] ; src
+ movq xmm2, MMWORD PTR [rsi+rcx]
+ movdqa xmm1, XMMWORD PTR [rax+16] ; pred
+ punpcklqdq xmm0, xmm2
+
+ movdqa xmm2, xmm0
+ psubb xmm0, xmm1 ; subtraction with sign missed
+
+ pxor xmm1, [GLOBAL(t80)] ;convert to signed values
+ pxor xmm2, [GLOBAL(t80)]
+ pcmpgtb xmm1, xmm2 ; obtain sign information
+
+ movdqa xmm2, xmm0
+ movdqa xmm3, xmm1
+ punpcklbw xmm0, xmm1 ; put sign back to subtraction
+ punpckhbw xmm2, xmm3 ; put sign back to subtraction
+
+ movdqa XMMWORD PTR [rdi + 32], xmm0
+ movdqa XMMWORD PTR [rdi + 48], xmm2
+
+ ;line 4 5
+ lea rsi, [rsi + rdx*4]
+
+ movq xmm0, MMWORD PTR [rsi] ; src
+ movq xmm2, MMWORD PTR [rsi+rdx]
+ movdqa xmm1, XMMWORD PTR [rax + 32] ; pred
+ punpcklqdq xmm0, xmm2
+
+ movdqa xmm2, xmm0
+ psubb xmm0, xmm1 ; subtraction with sign missed
+
+ pxor xmm1, [GLOBAL(t80)] ;convert to signed values
+ pxor xmm2, [GLOBAL(t80)]
+ pcmpgtb xmm1, xmm2 ; obtain sign information
+
+ movdqa xmm2, xmm0
+ movdqa xmm3, xmm1
+ punpcklbw xmm0, xmm1 ; put sign back to subtraction
+ punpckhbw xmm2, xmm3 ; put sign back to subtraction
+
+ movdqa XMMWORD PTR [rdi + 64], xmm0
+ movdqa XMMWORD PTR [rdi + 80], xmm2
+
+ ;line 6 7
+ movq xmm0, MMWORD PTR [rsi+rdx*2] ; src
+ movq xmm2, MMWORD PTR [rsi+rcx]
+ movdqa xmm1, XMMWORD PTR [rax+ 48] ; pred
+ punpcklqdq xmm0, xmm2
+
+ movdqa xmm2, xmm0
+ psubb xmm0, xmm1 ; subtraction with sign missed
+
+ pxor xmm1, [GLOBAL(t80)] ;convert to signed values
+ pxor xmm2, [GLOBAL(t80)]
+ pcmpgtb xmm1, xmm2 ; obtain sign information
+
+ movdqa xmm2, xmm0
+ movdqa xmm3, xmm1
+ punpcklbw xmm0, xmm1 ; put sign back to subtraction
+ punpckhbw xmm2, xmm3 ; put sign back to subtraction
+
+ movdqa XMMWORD PTR [rdi + 96], xmm0
+ movdqa XMMWORD PTR [rdi + 112], xmm2
+
+ ;v
+ mov rsi, arg(2) ;z = vsrc
+ add rdi, 64*2 ;diff = diff + 320 (shorts)
+ add rax, 64 ;Predictor = pred + 320
+
+ ;line 0 1
+ movq xmm0, MMWORD PTR [rsi] ; src
+ movq xmm2, MMWORD PTR [rsi+rdx]
+ movdqa xmm1, XMMWORD PTR [rax] ; pred
+ punpcklqdq xmm0, xmm2
+
+ movdqa xmm2, xmm0
+ psubb xmm0, xmm1 ; subtraction with sign missed
+
+ pxor xmm1, [GLOBAL(t80)] ;convert to signed values
+ pxor xmm2, [GLOBAL(t80)]
+ pcmpgtb xmm1, xmm2 ; obtain sign information
+
+ movdqa xmm2, xmm0
+ movdqa xmm3, xmm1
+ punpcklbw xmm0, xmm1 ; put sign back to subtraction
+ punpckhbw xmm2, xmm3 ; put sign back to subtraction
+
+ movdqa XMMWORD PTR [rdi], xmm0
+ movdqa XMMWORD PTR [rdi +16], xmm2
+
+ ;line 2 3
+ movq xmm0, MMWORD PTR [rsi+rdx*2] ; src
+ movq xmm2, MMWORD PTR [rsi+rcx]
+ movdqa xmm1, XMMWORD PTR [rax+16] ; pred
+ punpcklqdq xmm0, xmm2
+
+ movdqa xmm2, xmm0
+ psubb xmm0, xmm1 ; subtraction with sign missed
+
+ pxor xmm1, [GLOBAL(t80)] ;convert to signed values
+ pxor xmm2, [GLOBAL(t80)]
+ pcmpgtb xmm1, xmm2 ; obtain sign information
+
+ movdqa xmm2, xmm0
+ movdqa xmm3, xmm1
+ punpcklbw xmm0, xmm1 ; put sign back to subtraction
+ punpckhbw xmm2, xmm3 ; put sign back to subtraction
+
+ movdqa XMMWORD PTR [rdi + 32], xmm0
+ movdqa XMMWORD PTR [rdi + 48], xmm2
+
+ ;line 4 5
+ lea rsi, [rsi + rdx*4]
+
+ movq xmm0, MMWORD PTR [rsi] ; src
+ movq xmm2, MMWORD PTR [rsi+rdx]
+ movdqa xmm1, XMMWORD PTR [rax + 32] ; pred
+ punpcklqdq xmm0, xmm2
+
+ movdqa xmm2, xmm0
+ psubb xmm0, xmm1 ; subtraction with sign missed
+
+ pxor xmm1, [GLOBAL(t80)] ;convert to signed values
+ pxor xmm2, [GLOBAL(t80)]
+ pcmpgtb xmm1, xmm2 ; obtain sign information
+
+ movdqa xmm2, xmm0
+ movdqa xmm3, xmm1
+ punpcklbw xmm0, xmm1 ; put sign back to subtraction
+ punpckhbw xmm2, xmm3 ; put sign back to subtraction
+
+ movdqa XMMWORD PTR [rdi + 64], xmm0
+ movdqa XMMWORD PTR [rdi + 80], xmm2
+
+ ;line 6 7
+ movq xmm0, MMWORD PTR [rsi+rdx*2] ; src
+ movq xmm2, MMWORD PTR [rsi+rcx]
+ movdqa xmm1, XMMWORD PTR [rax+ 48] ; pred
+ punpcklqdq xmm0, xmm2
+
+ movdqa xmm2, xmm0
+ psubb xmm0, xmm1 ; subtraction with sign missed
+
+ pxor xmm1, [GLOBAL(t80)] ;convert to signed values
+ pxor xmm2, [GLOBAL(t80)]
+ pcmpgtb xmm1, xmm2 ; obtain sign information
+
+ movdqa xmm2, xmm0
+ movdqa xmm3, xmm1
+ punpcklbw xmm0, xmm1 ; put sign back to subtraction
+ punpckhbw xmm2, xmm3 ; put sign back to subtraction
+
+ movdqa XMMWORD PTR [rdi + 96], xmm0
+ movdqa XMMWORD PTR [rdi + 112], xmm2
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+SECTION_RODATA
+align 16
+t80:
+ times 16 db 0x80
--- /dev/null
+++ b/vp9/encoder/x86/temporal_filter_apply_sse2.asm
@@ -1,0 +1,207 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+; void vp9_temporal_filter_apply_sse2 | arg
+; (unsigned char *frame1, | 0
+; unsigned int stride, | 1
+; unsigned char *frame2, | 2
+; unsigned int block_size, | 3
+; int strength, | 4
+; int filter_weight, | 5
+; unsigned int *accumulator, | 6
+; unsigned short *count) | 7
+global sym(vp9_temporal_filter_apply_sse2)
+sym(vp9_temporal_filter_apply_sse2):
+
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 8
+ SAVE_XMM 7
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ALIGN_STACK 16, rax
+ %define block_size 0
+ %define strength 16
+ %define filter_weight 32
+ %define rounding_bit 48
+ %define rbp_backup 64
+ %define stack_size 80
+ sub rsp, stack_size
+ mov [rsp + rbp_backup], rbp
+ ; end prolog
+
+ mov rdx, arg(3)
+ mov [rsp + block_size], rdx
+ movd xmm6, arg(4)
+ movdqa [rsp + strength], xmm6 ; where strength is used, all 16 bytes are read
+
+ ; calculate the rounding bit outside the loop
+ ; 0x8000 >> (16 - strength)
+ mov rdx, 16
+ sub rdx, arg(4) ; 16 - strength
+ movd xmm4, rdx ; can't use rdx w/ shift
+ movdqa xmm5, [GLOBAL(_const_top_bit)]
+ psrlw xmm5, xmm4
+ movdqa [rsp + rounding_bit], xmm5
+
+ mov rsi, arg(0) ; src/frame1
+ mov rdx, arg(2) ; predictor frame
+ mov rdi, arg(6) ; accumulator
+ mov rax, arg(7) ; count
+
+ ; dup the filter weight and store for later
+ movd xmm0, arg(5) ; filter_weight
+ pshuflw xmm0, xmm0, 0
+ punpcklwd xmm0, xmm0
+ movdqa [rsp + filter_weight], xmm0
+
+ mov rbp, arg(1) ; stride
+ pxor xmm7, xmm7 ; zero for extraction
+
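+        ; rcx marks the end of the predictor block: 16*16 bytes by default,
+        ; 8*8 bytes when block_size == 8.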
+ lea rcx, [rdx + 16*16*1]
+ cmp dword ptr [rsp + block_size], 8
+ jne .temporal_filter_apply_load_16
+ lea rcx, [rdx + 8*8*1]
+
+.temporal_filter_apply_load_8:
+ movq xmm0, [rsi] ; first row
+ lea rsi, [rsi + rbp] ; += stride
+ punpcklbw xmm0, xmm7 ; src[ 0- 7]
+ movq xmm1, [rsi] ; second row
+ lea rsi, [rsi + rbp] ; += stride
+ punpcklbw xmm1, xmm7 ; src[ 8-15]
+ jmp .temporal_filter_apply_load_finished
+
+.temporal_filter_apply_load_16:
+ movdqa xmm0, [rsi] ; src (frame1)
+ lea rsi, [rsi + rbp] ; += stride
+ movdqa xmm1, xmm0
+ punpcklbw xmm0, xmm7 ; src[ 0- 7]
+ punpckhbw xmm1, xmm7 ; src[ 8-15]
+
+.temporal_filter_apply_load_finished:
+ movdqa xmm2, [rdx] ; predictor (frame2)
+ movdqa xmm3, xmm2
+ punpcklbw xmm2, xmm7 ; pred[ 0- 7]
+ punpckhbw xmm3, xmm7 ; pred[ 8-15]
+
+ ; modifier = src_byte - pixel_value
+ psubw xmm0, xmm2 ; src - pred[ 0- 7]
+ psubw xmm1, xmm3 ; src - pred[ 8-15]
+
+ ; modifier *= modifier
+        pmullw  xmm0, xmm0              ; modifier[ 0- 7]^2
+        pmullw  xmm1, xmm1              ; modifier[ 8-15]^2
+
+ ; modifier *= 3
+ pmullw xmm0, [GLOBAL(_const_3w)]
+ pmullw xmm1, [GLOBAL(_const_3w)]
+
+        ; modifier += 0x8000 >> (16 - strength)
+ paddw xmm0, [rsp + rounding_bit]
+ paddw xmm1, [rsp + rounding_bit]
+
+ ; modifier >>= strength
+ psrlw xmm0, [rsp + strength]
+ psrlw xmm1, [rsp + strength]
+
+ ; modifier = 16 - modifier
+ ; saturation takes care of modifier > 16
+ movdqa xmm3, [GLOBAL(_const_16w)]
+ movdqa xmm2, [GLOBAL(_const_16w)]
+ psubusw xmm3, xmm1
+ psubusw xmm2, xmm0
+
+ ; modifier *= filter_weight
+ pmullw xmm2, [rsp + filter_weight]
+ pmullw xmm3, [rsp + filter_weight]
+
+ ; count
+ movdqa xmm4, [rax]
+ movdqa xmm5, [rax+16]
+ ; += modifier
+ paddw xmm4, xmm2
+ paddw xmm5, xmm3
+ ; write back
+ movdqa [rax], xmm4
+ movdqa [rax+16], xmm5
+ lea rax, [rax + 16*2] ; count += 16*(sizeof(short))
+
+ ; load and extract the predictor up to shorts
+ pxor xmm7, xmm7
+ movdqa xmm0, [rdx]
+ lea rdx, [rdx + 16*1] ; pred += 16*(sizeof(char))
+ movdqa xmm1, xmm0
+ punpcklbw xmm0, xmm7 ; pred[ 0- 7]
+ punpckhbw xmm1, xmm7 ; pred[ 8-15]
+
+ ; modifier *= pixel_value
+ pmullw xmm0, xmm2
+ pmullw xmm1, xmm3
+
+ ; expand to double words
+ movdqa xmm2, xmm0
+ punpcklwd xmm0, xmm7 ; [ 0- 3]
+ punpckhwd xmm2, xmm7 ; [ 4- 7]
+ movdqa xmm3, xmm1
+ punpcklwd xmm1, xmm7 ; [ 8-11]
+ punpckhwd xmm3, xmm7 ; [12-15]
+
+ ; accumulator
+ movdqa xmm4, [rdi]
+ movdqa xmm5, [rdi+16]
+ movdqa xmm6, [rdi+32]
+ movdqa xmm7, [rdi+48]
+ ; += modifier
+ paddd xmm4, xmm0
+ paddd xmm5, xmm2
+ paddd xmm6, xmm1
+ paddd xmm7, xmm3
+ ; write back
+ movdqa [rdi], xmm4
+ movdqa [rdi+16], xmm5
+ movdqa [rdi+32], xmm6
+ movdqa [rdi+48], xmm7
+ lea rdi, [rdi + 16*4] ; accumulator += 16*(sizeof(int))
+
+ cmp rdx, rcx
+ je .temporal_filter_apply_epilog
+ pxor xmm7, xmm7 ; zero for extraction
+ cmp dword ptr [rsp + block_size], 16
+ je .temporal_filter_apply_load_16
+ jmp .temporal_filter_apply_load_8
+
+.temporal_filter_apply_epilog:
+ ; begin epilog
+ mov rbp, [rsp + rbp_backup]
+ add rsp, stack_size
+ pop rsp
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+SECTION_RODATA
+align 16
+_const_3w:
+ times 8 dw 3
+align 16
+_const_top_bit:
+ times 8 dw 1<<15
+align 16
+_const_16w:
+ times 8 dw 16
--- /dev/null
+++ b/vp9/encoder/x86/temporal_filter_x86.h
@@ -1,0 +1,27 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef __INC_TEMPORAL_FILTER_X86_H
+#define __INC_TEMPORAL_FILTER_X86_H
+
+#if HAVE_SSE2
+extern prototype_apply(vp9_temporal_filter_apply_sse2);
+
+#if !CONFIG_RUNTIME_CPU_DETECT
+
+#undef vp9_temporal_filter_apply
+#define vp9_temporal_filter_apply vp9_temporal_filter_apply_sse2
+
+#endif
+
+#endif
+
+#endif // __INC_TEMPORAL_FILTER_X86_H
--- /dev/null
+++ b/vp9/encoder/x86/variance_impl_mmx.asm
@@ -1,0 +1,851 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
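+; vp9_get_mb_ss_mmx sums the squares of the 256 16-bit coefficients of a
+; macroblock (16 iterations of 16 values) with pmaddwd, accumulating in mm4.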
+;unsigned int vp9_get_mb_ss_mmx( short *src_ptr )
+global sym(vp9_get_mb_ss_mmx)
+sym(vp9_get_mb_ss_mmx):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 7
+ GET_GOT rbx
+ push rsi
+ push rdi
+ sub rsp, 8
+ ; end prolog
+
+ mov rax, arg(0) ;src_ptr
+ mov rcx, 16
+ pxor mm4, mm4
+
+.NEXTROW:
+ movq mm0, [rax]
+ movq mm1, [rax+8]
+ movq mm2, [rax+16]
+ movq mm3, [rax+24]
+ pmaddwd mm0, mm0
+ pmaddwd mm1, mm1
+ pmaddwd mm2, mm2
+ pmaddwd mm3, mm3
+
+ paddd mm4, mm0
+ paddd mm4, mm1
+ paddd mm4, mm2
+ paddd mm4, mm3
+
+ add rax, 32
+ dec rcx
+ ja .NEXTROW
+ movq QWORD PTR [rsp], mm4
+
+ ;return sum[0]+sum[1];
+ movsxd rax, dword ptr [rsp]
+ movsxd rcx, dword ptr [rsp+4]
+ add rax, rcx
+
+
+ ; begin epilog
+ add rsp, 8
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+;unsigned int vp9_get8x8var_mmx
+;(
+; unsigned char *src_ptr,
+; int source_stride,
+; unsigned char *ref_ptr,
+; int recon_stride,
+; unsigned int *SSE,
+; int *Sum
+;)
+global sym(vp9_get8x8var_mmx)
+sym(vp9_get8x8var_mmx):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ push rsi
+ push rdi
+ push rbx
+ sub rsp, 16
+ ; end prolog
+
+
+        pxor    mm5, mm5                ; clear mm5 (running sum of differences)
+        pxor    mm6, mm6                ; clear mm6 (zero, used for unpacking)
+        pxor    mm7, mm7                ; clear mm7 (running sum of squared differences)
+
+ mov rax, arg(0) ;[src_ptr] ; Load base addresses
+ mov rbx, arg(2) ;[ref_ptr]
+ movsxd rcx, dword ptr arg(1) ;[source_stride]
+ movsxd rdx, dword ptr arg(3) ;[recon_stride]
+
+ ; Row 1
+ movq mm0, [rax] ; Copy eight bytes to mm0
+ movq mm1, [rbx] ; Copy eight bytes to mm1
+ movq mm2, mm0 ; Take copies
+ movq mm3, mm1 ; Take copies
+
+        punpcklbw mm0, mm6              ; unpack to higher precision
+        punpcklbw mm1, mm6
+        punpckhbw mm2, mm6              ; unpack to higher precision
+ punpckhbw mm3, mm6
+ psubsw mm0, mm1 ; A-B (low order) to MM0
+ psubsw mm2, mm3 ; A-B (high order) to MM2
+
+ paddw mm5, mm0 ; accumulate differences in mm5
+ paddw mm5, mm2 ; accumulate differences in mm5
+
+ pmaddwd mm0, mm0 ; square and accumulate
+ pmaddwd mm2, mm2 ; square and accumulate
+ add rbx,rdx ; Inc pointer into ref data
+ add rax,rcx ; Inc pointer into the new data
+ movq mm1, [rbx] ; Copy eight bytes to mm1
+ paddd mm7, mm0 ; accumulate in mm7
+ paddd mm7, mm2 ; accumulate in mm7
+
+
+ ; Row 2
+ movq mm0, [rax] ; Copy eight bytes to mm0
+ movq mm2, mm0 ; Take copies
+ movq mm3, mm1 ; Take copies
+
+        punpcklbw mm0, mm6              ; unpack to higher precision
+        punpcklbw mm1, mm6
+        punpckhbw mm2, mm6              ; unpack to higher precision
+ punpckhbw mm3, mm6
+ psubsw mm0, mm1 ; A-B (low order) to MM0
+ psubsw mm2, mm3 ; A-B (high order) to MM2
+
+ paddw mm5, mm0 ; accumulate differences in mm5
+ paddw mm5, mm2 ; accumulate differences in mm5
+
+ pmaddwd mm0, mm0 ; square and accumulate
+ pmaddwd mm2, mm2 ; square and accumulate
+ add rbx,rdx ; Inc pointer into ref data
+ add rax,rcx ; Inc pointer into the new data
+ movq mm1, [rbx] ; Copy eight bytes to mm1
+ paddd mm7, mm0 ; accumulate in mm7
+ paddd mm7, mm2 ; accumulate in mm7
+
+ ; Row 3
+ movq mm0, [rax] ; Copy eight bytes to mm0
+ movq mm2, mm0 ; Take copies
+ movq mm3, mm1 ; Take copies
+
+        punpcklbw mm0, mm6              ; unpack to higher precision
+        punpcklbw mm1, mm6
+        punpckhbw mm2, mm6              ; unpack to higher precision
+ punpckhbw mm3, mm6
+ psubsw mm0, mm1 ; A-B (low order) to MM0
+ psubsw mm2, mm3 ; A-B (high order) to MM2
+
+ paddw mm5, mm0 ; accumulate differences in mm5
+ paddw mm5, mm2 ; accumulate differences in mm5
+
+ pmaddwd mm0, mm0 ; square and accumulate
+ pmaddwd mm2, mm2 ; square and accumulate
+ add rbx,rdx ; Inc pointer into ref data
+ add rax,rcx ; Inc pointer into the new data
+ movq mm1, [rbx] ; Copy eight bytes to mm1
+ paddd mm7, mm0 ; accumulate in mm7
+ paddd mm7, mm2 ; accumulate in mm7
+
+ ; Row 4
+ movq mm0, [rax] ; Copy eight bytes to mm0
+ movq mm2, mm0 ; Take copies
+ movq mm3, mm1 ; Take copies
+
+        punpcklbw mm0, mm6              ; unpack to higher precision
+        punpcklbw mm1, mm6
+        punpckhbw mm2, mm6              ; unpack to higher precision
+ punpckhbw mm3, mm6
+ psubsw mm0, mm1 ; A-B (low order) to MM0
+ psubsw mm2, mm3 ; A-B (high order) to MM2
+
+ paddw mm5, mm0 ; accumulate differences in mm5
+ paddw mm5, mm2 ; accumulate differences in mm5
+
+ pmaddwd mm0, mm0 ; square and accumulate
+ pmaddwd mm2, mm2 ; square and accumulate
+ add rbx,rdx ; Inc pointer into ref data
+ add rax,rcx ; Inc pointer into the new data
+ movq mm1, [rbx] ; Copy eight bytes to mm1
+ paddd mm7, mm0 ; accumulate in mm7
+ paddd mm7, mm2 ; accumulate in mm7
+
+ ; Row 5
+ movq mm0, [rax] ; Copy eight bytes to mm0
+ movq mm2, mm0 ; Take copies
+ movq mm3, mm1 ; Take copies
+
+        punpcklbw mm0, mm6              ; unpack to higher precision
+        punpcklbw mm1, mm6
+        punpckhbw mm2, mm6              ; unpack to higher precision
+ punpckhbw mm3, mm6
+ psubsw mm0, mm1 ; A-B (low order) to MM0
+ psubsw mm2, mm3 ; A-B (high order) to MM2
+
+ paddw mm5, mm0 ; accumulate differences in mm5
+ paddw mm5, mm2 ; accumulate differences in mm5
+
+ pmaddwd mm0, mm0 ; square and accumulate
+ pmaddwd mm2, mm2 ; square and accumulate
+ add rbx,rdx ; Inc pointer into ref data
+ add rax,rcx ; Inc pointer into the new data
+ movq mm1, [rbx] ; Copy eight bytes to mm1
+ ; movq mm4, [rbx + rdx]
+ paddd mm7, mm0 ; accumulate in mm7
+ paddd mm7, mm2 ; accumulate in mm7
+
+ ; Row 6
+ movq mm0, [rax] ; Copy eight bytes to mm0
+ movq mm2, mm0 ; Take copies
+ movq mm3, mm1 ; Take copies
+
+        punpcklbw mm0, mm6              ; unpack to higher precision
+        punpcklbw mm1, mm6
+        punpckhbw mm2, mm6              ; unpack to higher precision
+ punpckhbw mm3, mm6
+ psubsw mm0, mm1 ; A-B (low order) to MM0
+ psubsw mm2, mm3 ; A-B (high order) to MM2
+
+ paddw mm5, mm0 ; accumulate differences in mm5
+ paddw mm5, mm2 ; accumulate differences in mm5
+
+ pmaddwd mm0, mm0 ; square and accumulate
+ pmaddwd mm2, mm2 ; square and accumulate
+ add rbx,rdx ; Inc pointer into ref data
+ add rax,rcx ; Inc pointer into the new data
+ movq mm1, [rbx] ; Copy eight bytes to mm1
+ paddd mm7, mm0 ; accumulate in mm7
+ paddd mm7, mm2 ; accumulate in mm7
+
+ ; Row 7
+ movq mm0, [rax] ; Copy eight bytes to mm0
+ movq mm2, mm0 ; Take copies
+ movq mm3, mm1 ; Take copies
+
+        punpcklbw mm0, mm6              ; unpack to higher precision
+        punpcklbw mm1, mm6
+        punpckhbw mm2, mm6              ; unpack to higher precision
+ punpckhbw mm3, mm6
+ psubsw mm0, mm1 ; A-B (low order) to MM0
+ psubsw mm2, mm3 ; A-B (high order) to MM2
+
+ paddw mm5, mm0 ; accumulate differences in mm5
+ paddw mm5, mm2 ; accumulate differences in mm5
+
+ pmaddwd mm0, mm0 ; square and accumulate
+ pmaddwd mm2, mm2 ; square and accumulate
+ add rbx,rdx ; Inc pointer into ref data
+ add rax,rcx ; Inc pointer into the new data
+ movq mm1, [rbx] ; Copy eight bytes to mm1
+ paddd mm7, mm0 ; accumulate in mm7
+ paddd mm7, mm2 ; accumulate in mm7
+
+ ; Row 8
+ movq mm0, [rax] ; Copy eight bytes to mm0
+ movq mm2, mm0 ; Take copies
+ movq mm3, mm1 ; Take copies
+
+        punpcklbw mm0, mm6              ; unpack to higher precision
+        punpcklbw mm1, mm6
+        punpckhbw mm2, mm6              ; unpack to higher precision
+ punpckhbw mm3, mm6
+ psubsw mm0, mm1 ; A-B (low order) to MM0
+ psubsw mm2, mm3 ; A-B (high order) to MM2
+
+ paddw mm5, mm0 ; accumulate differences in mm5
+ paddw mm5, mm2 ; accumulate differences in mm5
+
+ pmaddwd mm0, mm0 ; square and accumulate
+ pmaddwd mm2, mm2 ; square and accumulate
+ add rbx,rdx ; Inc pointer into ref data
+ add rax,rcx ; Inc pointer into the new data
+ paddd mm7, mm0 ; accumulate in mm7
+ paddd mm7, mm2 ; accumulate in mm7
+
+ ; Now accumulate the final results.
+ movq QWORD PTR [rsp+8], mm5 ; copy back accumulated results into normal memory
+ movq QWORD PTR [rsp], mm7 ; copy back accumulated results into normal memory
+ movsx rdx, WORD PTR [rsp+8]
+ movsx rcx, WORD PTR [rsp+10]
+ movsx rbx, WORD PTR [rsp+12]
+ movsx rax, WORD PTR [rsp+14]
+ add rdx, rcx
+ add rbx, rax
+ add rdx, rbx ;XSum
+ movsxd rax, DWORD PTR [rsp]
+ movsxd rcx, DWORD PTR [rsp+4]
+ add rax, rcx ;XXSum
+ mov rsi, arg(4) ;SSE
+ mov rdi, arg(5) ;Sum
+ mov dword ptr [rsi], eax
+ mov dword ptr [rdi], edx
+ xor rax, rax ; return 0
+
+
+ ; begin epilog
+ add rsp, 16
+ pop rbx
+ pop rdi
+ pop rsi
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+
+;unsigned int
+;vp9_get4x4var_mmx
+;(
+; unsigned char *src_ptr,
+; int source_stride,
+; unsigned char *ref_ptr,
+; int recon_stride,
+; unsigned int *SSE,
+; int *Sum
+;)
+global sym(vp9_get4x4var_mmx)
+sym(vp9_get4x4var_mmx):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ push rsi
+ push rdi
+ push rbx
+ sub rsp, 16
+ ; end prolog
+
+
+        pxor    mm5, mm5                ; clear mm5 (running sum of differences)
+        pxor    mm6, mm6                ; clear mm6 (zero, used for unpacking)
+        pxor    mm7, mm7                ; clear mm7 (running sum of squared differences)
+
+ mov rax, arg(0) ;[src_ptr] ; Load base addresses
+ mov rbx, arg(2) ;[ref_ptr]
+ movsxd rcx, dword ptr arg(1) ;[source_stride]
+ movsxd rdx, dword ptr arg(3) ;[recon_stride]
+
+ ; Row 1
+ movq mm0, [rax] ; Copy eight bytes to mm0
+ movq mm1, [rbx] ; Copy eight bytes to mm1
+        punpcklbw mm0, mm6              ; unpack to higher precision
+ punpcklbw mm1, mm6
+ psubsw mm0, mm1 ; A-B (low order) to MM0
+ paddw mm5, mm0 ; accumulate differences in mm5
+ pmaddwd mm0, mm0 ; square and accumulate
+ add rbx,rdx ; Inc pointer into ref data
+ add rax,rcx ; Inc pointer into the new data
+ movq mm1, [rbx] ; Copy eight bytes to mm1
+ paddd mm7, mm0 ; accumulate in mm7
+
+
+ ; Row 2
+ movq mm0, [rax] ; Copy eight bytes to mm0
+        punpcklbw       mm0, mm6                ; unpack to higher precision
+ punpcklbw mm1, mm6
+ psubsw mm0, mm1 ; A-B (low order) to MM0
+ paddw mm5, mm0 ; accumulate differences in mm5
+
+ pmaddwd mm0, mm0 ; square and accumulate
+ add rbx,rdx ; Inc pointer into ref data
+ add rax,rcx ; Inc pointer into the new data
+ movq mm1, [rbx] ; Copy eight bytes to mm1
+ paddd mm7, mm0 ; accumulate in mm7
+
+ ; Row 3
+ movq mm0, [rax] ; Copy eight bytes to mm0
+        punpcklbw       mm0, mm6                ; unpack to higher precision
+ punpcklbw mm1, mm6
+ psubsw mm0, mm1 ; A-B (low order) to MM0
+ paddw mm5, mm0 ; accumulate differences in mm5
+
+ pmaddwd mm0, mm0 ; square and accumulate
+ add rbx,rdx ; Inc pointer into ref data
+ add rax,rcx ; Inc pointer into the new data
+ movq mm1, [rbx] ; Copy eight bytes to mm1
+ paddd mm7, mm0 ; accumulate in mm7
+
+ ; Row 4
+ movq mm0, [rax] ; Copy eight bytes to mm0
+
+        punpcklbw       mm0, mm6                ; unpack to higher precision
+ punpcklbw mm1, mm6
+ psubsw mm0, mm1 ; A-B (low order) to MM0
+
+ paddw mm5, mm0 ; accumulate differences in mm5
+
+ pmaddwd mm0, mm0 ; square and accumulate
+ paddd mm7, mm0 ; accumulate in mm7
+
+
+ ; Now accumulate the final results.
+ movq QWORD PTR [rsp+8], mm5 ; copy back accumulated results into normal memory
+ movq QWORD PTR [rsp], mm7 ; copy back accumulated results into normal memory
+ movsx rdx, WORD PTR [rsp+8]
+ movsx rcx, WORD PTR [rsp+10]
+ movsx rbx, WORD PTR [rsp+12]
+ movsx rax, WORD PTR [rsp+14]
+ add rdx, rcx
+ add rbx, rax
+ add rdx, rbx ;XSum
+ movsxd rax, DWORD PTR [rsp]
+ movsxd rcx, DWORD PTR [rsp+4]
+ add rax, rcx ;XXSum
+ mov rsi, arg(4) ;SSE
+ mov rdi, arg(5) ;Sum
+ mov dword ptr [rsi], eax
+ mov dword ptr [rdi], edx
+ xor rax, rax ; return 0
+
+
+ ; begin epilog
+ add rsp, 16
+ pop rbx
+ pop rdi
+ pop rsi
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+
+;unsigned int
+;vp9_get4x4sse_cs_mmx
+;(
+; unsigned char *src_ptr,
+; int source_stride,
+; unsigned char *ref_ptr,
+; int recon_stride
+;)
+global sym(vp9_get4x4sse_cs_mmx)
+sym(vp9_get4x4sse_cs_mmx):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 4
+ push rsi
+ push rdi
+ push rbx
+ ; end prolog
+
+
+        pxor            mm6, mm6                    ; Blank mm6
+ pxor mm7, mm7 ; Blank mmx7
+
+ mov rax, arg(0) ;[src_ptr] ; Load base addresses
+ mov rbx, arg(2) ;[ref_ptr]
+ movsxd rcx, dword ptr arg(1) ;[source_stride]
+ movsxd rdx, dword ptr arg(3) ;[recon_stride]
+ ; Row 1
+        movd            mm0, [rax]              ; Copy four bytes to mm0
+        movd            mm1, [rbx]              ; Copy four bytes to mm1
+        punpcklbw       mm0, mm6                ; unpack to higher precision
+ punpcklbw mm1, mm6
+ psubsw mm0, mm1 ; A-B (low order) to MM0
+ pmaddwd mm0, mm0 ; square and accumulate
+ add rbx,rdx ; Inc pointer into ref data
+ add rax,rcx ; Inc pointer into the new data
+        movd            mm1, [rbx]              ; Copy four bytes to mm1
+ paddd mm7, mm0 ; accumulate in mm7
+
+ ; Row 2
+        movd            mm0, [rax]              ; Copy four bytes to mm0
+        punpcklbw       mm0, mm6                ; unpack to higher precision
+ punpcklbw mm1, mm6
+ psubsw mm0, mm1 ; A-B (low order) to MM0
+ pmaddwd mm0, mm0 ; square and accumulate
+ add rbx,rdx ; Inc pointer into ref data
+ add rax,rcx ; Inc pointer into the new data
+        movd            mm1, [rbx]              ; Copy four bytes to mm1
+ paddd mm7, mm0 ; accumulate in mm7
+
+ ; Row 3
+        movd            mm0, [rax]              ; Copy four bytes to mm0
+        punpcklbw       mm1, mm6
+        punpcklbw       mm0, mm6                ; unpack to higher precision
+ psubsw mm0, mm1 ; A-B (low order) to MM0
+
+ pmaddwd mm0, mm0 ; square and accumulate
+ add rbx,rdx ; Inc pointer into ref data
+ add rax,rcx ; Inc pointer into the new data
+        movd            mm1, [rbx]              ; Copy four bytes to mm1
+ paddd mm7, mm0 ; accumulate in mm7
+
+ ; Row 4
+        movd            mm0, [rax]              ; Copy four bytes to mm0
+        punpcklbw       mm0, mm6                ; unpack to higher precision
+ punpcklbw mm1, mm6
+ psubsw mm0, mm1 ; A-B (low order) to MM0
+ pmaddwd mm0, mm0 ; square and accumulate
+ paddd mm7, mm0 ; accumulate in mm7
+
+ movq mm0, mm7 ;
+ psrlq mm7, 32
+
+ paddd mm0, mm7
+ movq rax, mm0
+
+
+ ; begin epilog
+ pop rbx
+ pop rdi
+ pop rsi
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+%define mmx_filter_shift 7
+
+;void vp9_filter_block2d_bil4x4_var_mmx
+;(
+; unsigned char *ref_ptr,
+; int ref_pixels_per_line,
+; unsigned char *src_ptr,
+; int src_pixels_per_line,
+; unsigned short *HFilter,
+; unsigned short *VFilter,
+; int *sum,
+; unsigned int *sumsquared
+;)
+global sym(vp9_filter_block2d_bil4x4_var_mmx)
+sym(vp9_filter_block2d_bil4x4_var_mmx):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 8
+ GET_GOT rbx
+ push rsi
+ push rdi
+ sub rsp, 16
+ ; end prolog
+
+
+ pxor mm6, mm6 ;
+ pxor mm7, mm7 ;
+
+ mov rax, arg(4) ;HFilter ;
+ mov rdx, arg(5) ;VFilter ;
+
+ mov rsi, arg(0) ;ref_ptr ;
+ mov rdi, arg(2) ;src_ptr ;
+
+ mov rcx, 4 ;
+ pxor mm0, mm0 ;
+
+ movd mm1, [rsi] ;
+ movd mm3, [rsi+1] ;
+
+ punpcklbw mm1, mm0 ;
+ pmullw mm1, [rax] ;
+
+ punpcklbw mm3, mm0 ;
+ pmullw mm3, [rax+8] ;
+
+ paddw mm1, mm3 ;
+ paddw mm1, [GLOBAL(mmx_bi_rd)] ;
+
+ psraw mm1, mmx_filter_shift ;
+ movq mm5, mm1
+
+%if ABI_IS_32BIT
+ add rsi, dword ptr arg(1) ;ref_pixels_per_line ;
+%else
+ movsxd r8, dword ptr arg(1) ;ref_pixels_per_line ;
+ add rsi, r8
+%endif
+
+.filter_block2d_bil4x4_var_mmx_loop:
+
+ movd mm1, [rsi] ;
+ movd mm3, [rsi+1] ;
+
+ punpcklbw mm1, mm0 ;
+ pmullw mm1, [rax] ;
+
+ punpcklbw mm3, mm0 ;
+ pmullw mm3, [rax+8] ;
+
+ paddw mm1, mm3 ;
+ paddw mm1, [GLOBAL(mmx_bi_rd)] ;
+
+ psraw mm1, mmx_filter_shift ;
+ movq mm3, mm5 ;
+
+ movq mm5, mm1 ;
+ pmullw mm3, [rdx] ;
+
+ pmullw mm1, [rdx+8] ;
+ paddw mm1, mm3 ;
+
+
+ paddw mm1, [GLOBAL(mmx_bi_rd)] ;
+ psraw mm1, mmx_filter_shift ;
+
+ movd mm3, [rdi] ;
+ punpcklbw mm3, mm0 ;
+
+ psubw mm1, mm3 ;
+ paddw mm6, mm1 ;
+
+ pmaddwd mm1, mm1 ;
+ paddd mm7, mm1 ;
+
+%if ABI_IS_32BIT
+ add rsi, dword ptr arg(1) ;ref_pixels_per_line ;
+ add rdi, dword ptr arg(3) ;src_pixels_per_line ;
+%else
+ movsxd r8, dword ptr arg(1) ;ref_pixels_per_line
+ movsxd r9, dword ptr arg(3) ;src_pixels_per_line
+ add rsi, r8
+ add rdi, r9
+%endif
+ sub rcx, 1 ;
+ jnz .filter_block2d_bil4x4_var_mmx_loop ;
+
+
+ pxor mm3, mm3 ;
+ pxor mm2, mm2 ;
+
+ punpcklwd mm2, mm6 ;
+ punpckhwd mm3, mm6 ;
+
+ paddd mm2, mm3 ;
+ movq mm6, mm2 ;
+
+ psrlq mm6, 32 ;
+ paddd mm2, mm6 ;
+
+ psrad mm2, 16 ;
+ movq mm4, mm7 ;
+
+ psrlq mm4, 32 ;
+ paddd mm4, mm7 ;
+
+ mov rdi, arg(6) ;sum
+ mov rsi, arg(7) ;sumsquared
+
+ movd dword ptr [rdi], mm2 ;
+ movd dword ptr [rsi], mm4 ;
+
+
+
+ ; begin epilog
+ add rsp, 16
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+
+
+;void vp9_filter_block2d_bil_var_mmx
+;(
+; unsigned char *ref_ptr,
+; int ref_pixels_per_line,
+; unsigned char *src_ptr,
+; int src_pixels_per_line,
+; unsigned int Height,
+; unsigned short *HFilter,
+; unsigned short *VFilter,
+; int *sum,
+; unsigned int *sumsquared
+;)
+global sym(vp9_filter_block2d_bil_var_mmx)
+sym(vp9_filter_block2d_bil_var_mmx):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 9
+ GET_GOT rbx
+ push rsi
+ push rdi
+ sub rsp, 16
+ ; end prolog
+
+ pxor mm6, mm6 ;
+ pxor mm7, mm7 ;
+ mov rax, arg(5) ;HFilter ;
+
+ mov rdx, arg(6) ;VFilter ;
+ mov rsi, arg(0) ;ref_ptr ;
+
+ mov rdi, arg(2) ;src_ptr ;
+ movsxd rcx, dword ptr arg(4) ;Height ;
+
+ pxor mm0, mm0 ;
+ movq mm1, [rsi] ;
+
+ movq mm3, [rsi+1] ;
+ movq mm2, mm1 ;
+
+ movq mm4, mm3 ;
+ punpcklbw mm1, mm0 ;
+
+ punpckhbw mm2, mm0 ;
+ pmullw mm1, [rax] ;
+
+ pmullw mm2, [rax] ;
+ punpcklbw mm3, mm0 ;
+
+ punpckhbw mm4, mm0 ;
+ pmullw mm3, [rax+8] ;
+
+ pmullw mm4, [rax+8] ;
+ paddw mm1, mm3 ;
+
+ paddw mm2, mm4 ;
+ paddw mm1, [GLOBAL(mmx_bi_rd)] ;
+
+ psraw mm1, mmx_filter_shift ;
+ paddw mm2, [GLOBAL(mmx_bi_rd)] ;
+
+ psraw mm2, mmx_filter_shift ;
+ movq mm5, mm1
+
+ packuswb mm5, mm2 ;
+%if ABI_IS_32BIT
+ add rsi, dword ptr arg(1) ;ref_pixels_per_line
+%else
+ movsxd r8, dword ptr arg(1) ;ref_pixels_per_line
+ add rsi, r8
+%endif
+
+.filter_block2d_bil_var_mmx_loop:
+
+ movq mm1, [rsi] ;
+ movq mm3, [rsi+1] ;
+
+ movq mm2, mm1 ;
+ movq mm4, mm3 ;
+
+ punpcklbw mm1, mm0 ;
+ punpckhbw mm2, mm0 ;
+
+ pmullw mm1, [rax] ;
+ pmullw mm2, [rax] ;
+
+ punpcklbw mm3, mm0 ;
+ punpckhbw mm4, mm0 ;
+
+ pmullw mm3, [rax+8] ;
+ pmullw mm4, [rax+8] ;
+
+ paddw mm1, mm3 ;
+ paddw mm2, mm4 ;
+
+ paddw mm1, [GLOBAL(mmx_bi_rd)] ;
+ psraw mm1, mmx_filter_shift ;
+
+ paddw mm2, [GLOBAL(mmx_bi_rd)] ;
+ psraw mm2, mmx_filter_shift ;
+
+ movq mm3, mm5 ;
+ movq mm4, mm5 ;
+
+ punpcklbw mm3, mm0 ;
+ punpckhbw mm4, mm0 ;
+
+ movq mm5, mm1 ;
+ packuswb mm5, mm2 ;
+
+ pmullw mm3, [rdx] ;
+ pmullw mm4, [rdx] ;
+
+ pmullw mm1, [rdx+8] ;
+ pmullw mm2, [rdx+8] ;
+
+ paddw mm1, mm3 ;
+ paddw mm2, mm4 ;
+
+ paddw mm1, [GLOBAL(mmx_bi_rd)] ;
+ paddw mm2, [GLOBAL(mmx_bi_rd)] ;
+
+ psraw mm1, mmx_filter_shift ;
+ psraw mm2, mmx_filter_shift ;
+
+ movq mm3, [rdi] ;
+ movq mm4, mm3 ;
+
+ punpcklbw mm3, mm0 ;
+ punpckhbw mm4, mm0 ;
+
+ psubw mm1, mm3 ;
+ psubw mm2, mm4 ;
+
+ paddw mm6, mm1 ;
+ pmaddwd mm1, mm1 ;
+
+ paddw mm6, mm2 ;
+ pmaddwd mm2, mm2 ;
+
+ paddd mm7, mm1 ;
+ paddd mm7, mm2 ;
+
+%if ABI_IS_32BIT
+ add rsi, dword ptr arg(1) ;ref_pixels_per_line ;
+ add rdi, dword ptr arg(3) ;src_pixels_per_line ;
+%else
+ movsxd r8, dword ptr arg(1) ;ref_pixels_per_line ;
+ movsxd r9, dword ptr arg(3) ;src_pixels_per_line ;
+ add rsi, r8
+ add rdi, r9
+%endif
+ sub rcx, 1 ;
+ jnz .filter_block2d_bil_var_mmx_loop ;
+
+
+ pxor mm3, mm3 ;
+ pxor mm2, mm2 ;
+
+ punpcklwd mm2, mm6 ;
+ punpckhwd mm3, mm6 ;
+
+ paddd mm2, mm3 ;
+ movq mm6, mm2 ;
+
+ psrlq mm6, 32 ;
+ paddd mm2, mm6 ;
+
+ psrad mm2, 16 ;
+ movq mm4, mm7 ;
+
+ psrlq mm4, 32 ;
+ paddd mm4, mm7 ;
+
+ mov rdi, arg(7) ;sum
+ mov rsi, arg(8) ;sumsquared
+
+ movd dword ptr [rdi], mm2 ;
+ movd dword ptr [rsi], mm4 ;
+
+ ; begin epilog
+ add rsp, 16
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+SECTION_RODATA
+;short mmx_bi_rd[4] = { 64, 64, 64, 64};
+align 16
+mmx_bi_rd:
+ times 4 dw 64
--- /dev/null
+++ b/vp9/encoder/x86/variance_impl_sse2.asm
@@ -1,0 +1,1367 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+%define xmm_filter_shift 7
+
+;unsigned int vp9_get_mb_ss_sse2
+;(
+; short *src_ptr
+;)
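+;Note: computes the sum of squares of 256 short values (a 16x16 block of
+;differences); the loop below consumes 64 bytes (32 values) per iteration
+;for eight iterations and returns the scalar total in rax.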
+global sym(vp9_get_mb_ss_sse2)
+sym(vp9_get_mb_ss_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 1
+ GET_GOT rbx
+ push rsi
+ push rdi
+ sub rsp, 16
+ ; end prolog
+
+
+ mov rax, arg(0) ;[src_ptr]
+ mov rcx, 8
+ pxor xmm4, xmm4
+
+.NEXTROW:
+ movdqa xmm0, [rax]
+ movdqa xmm1, [rax+16]
+ movdqa xmm2, [rax+32]
+ movdqa xmm3, [rax+48]
+ pmaddwd xmm0, xmm0
+ pmaddwd xmm1, xmm1
+ pmaddwd xmm2, xmm2
+ pmaddwd xmm3, xmm3
+
+ paddd xmm0, xmm1
+ paddd xmm2, xmm3
+ paddd xmm4, xmm0
+ paddd xmm4, xmm2
+
+ add rax, 0x40
+ dec rcx
+ ja .NEXTROW
+
+ movdqa xmm3,xmm4
+ psrldq xmm4,8
+ paddd xmm4,xmm3
+ movdqa xmm3,xmm4
+ psrldq xmm4,4
+ paddd xmm4,xmm3
+ movq rax,xmm4
+
+
+ ; begin epilog
+ add rsp, 16
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+;unsigned int vp9_get16x16var_sse2
+;(
+; unsigned char * src_ptr,
+; int source_stride,
+; unsigned char * ref_ptr,
+; int recon_stride,
+; unsigned int * SSE,
+; int * Sum
+;)
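+;Note: accumulates the signed sum of differences into *Sum and the sum of
+;squared differences into *SSE for the 16x16 block; callers derive the
+;variance as SSE - Sum*Sum/256.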
+global sym(vp9_get16x16var_sse2)
+sym(vp9_get16x16var_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ push rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rsi, arg(0) ;[src_ptr]
+ mov rdi, arg(2) ;[ref_ptr]
+
+ movsxd rax, DWORD PTR arg(1) ;[source_stride]
+ movsxd rdx, DWORD PTR arg(3) ;[recon_stride]
+
+ ; Prefetch data
+ lea rcx, [rax+rax*2]
+ prefetcht0 [rsi]
+ prefetcht0 [rsi+rax]
+ prefetcht0 [rsi+rax*2]
+ prefetcht0 [rsi+rcx]
+ lea rbx, [rsi+rax*4]
+ prefetcht0 [rbx]
+ prefetcht0 [rbx+rax]
+ prefetcht0 [rbx+rax*2]
+ prefetcht0 [rbx+rcx]
+
+ lea rcx, [rdx+rdx*2]
+ prefetcht0 [rdi]
+ prefetcht0 [rdi+rdx]
+ prefetcht0 [rdi+rdx*2]
+ prefetcht0 [rdi+rcx]
+ lea rbx, [rdi+rdx*4]
+ prefetcht0 [rbx]
+ prefetcht0 [rbx+rdx]
+ prefetcht0 [rbx+rdx*2]
+ prefetcht0 [rbx+rcx]
+
+ pxor xmm0, xmm0 ; clear xmm0 for unpack
+ pxor xmm7, xmm7 ; clear xmm7 for accumulating diffs
+
+ pxor xmm6, xmm6 ; clear xmm6 for accumulating sse
+ mov rcx, 16
+
+.var16loop:
+ movdqu xmm1, XMMWORD PTR [rsi]
+ movdqu xmm2, XMMWORD PTR [rdi]
+
+ prefetcht0 [rsi+rax*8]
+ prefetcht0 [rdi+rdx*8]
+
+ movdqa xmm3, xmm1
+ movdqa xmm4, xmm2
+
+
+ punpcklbw xmm1, xmm0
+ punpckhbw xmm3, xmm0
+
+ punpcklbw xmm2, xmm0
+ punpckhbw xmm4, xmm0
+
+
+ psubw xmm1, xmm2
+ psubw xmm3, xmm4
+
+ paddw xmm7, xmm1
+ pmaddwd xmm1, xmm1
+
+ paddw xmm7, xmm3
+ pmaddwd xmm3, xmm3
+
+ paddd xmm6, xmm1
+ paddd xmm6, xmm3
+
+ add rsi, rax
+ add rdi, rdx
+
+ sub rcx, 1
+ jnz .var16loop
+
+
+ movdqa xmm1, xmm6
+ pxor xmm6, xmm6
+
+ pxor xmm5, xmm5
+ punpcklwd xmm6, xmm7
+
+ punpckhwd xmm5, xmm7
+ psrad xmm5, 16
+
+ psrad xmm6, 16
+ paddd xmm6, xmm5
+
+ movdqa xmm2, xmm1
+ punpckldq xmm1, xmm0
+
+ punpckhdq xmm2, xmm0
+ movdqa xmm7, xmm6
+
+ paddd xmm1, xmm2
+ punpckldq xmm6, xmm0
+
+ punpckhdq xmm7, xmm0
+ paddd xmm6, xmm7
+
+ movdqa xmm2, xmm1
+ movdqa xmm7, xmm6
+
+ psrldq xmm1, 8
+ psrldq xmm6, 8
+
+ paddd xmm7, xmm6
+ paddd xmm1, xmm2
+
+ mov rax, arg(5) ;[Sum]
+ mov rdi, arg(4) ;[SSE]
+
+ movd DWORD PTR [rax], xmm7
+ movd DWORD PTR [rdi], xmm1
+
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ pop rbx
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+
+
+;unsigned int vp9_get8x8var_sse2
+;(
+; unsigned char * src_ptr,
+; int source_stride,
+; unsigned char * ref_ptr,
+; int recon_stride,
+; unsigned int * SSE,
+; int * Sum
+;)
+global sym(vp9_get8x8var_sse2)
+sym(vp9_get8x8var_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ GET_GOT rbx
+ push rsi
+ push rdi
+ sub rsp, 16
+ ; end prolog
+
+ mov rsi, arg(0) ;[src_ptr]
+ mov rdi, arg(2) ;[ref_ptr]
+
+ movsxd rax, DWORD PTR arg(1) ;[source_stride]
+ movsxd rdx, DWORD PTR arg(3) ;[recon_stride]
+
+ pxor xmm0, xmm0 ; clear xmm0 for unpack
+ pxor xmm7, xmm7 ; clear xmm7 for accumulating diffs
+
+ movq xmm1, QWORD PTR [rsi]
+ movq xmm2, QWORD PTR [rdi]
+
+ punpcklbw xmm1, xmm0
+ punpcklbw xmm2, xmm0
+
+ psubsw xmm1, xmm2
+ paddw xmm7, xmm1
+
+ pmaddwd xmm1, xmm1
+
+ movq xmm2, QWORD PTR[rsi + rax]
+ movq xmm3, QWORD PTR[rdi + rdx]
+
+ punpcklbw xmm2, xmm0
+ punpcklbw xmm3, xmm0
+
+ psubsw xmm2, xmm3
+ paddw xmm7, xmm2
+
+ pmaddwd xmm2, xmm2
+ paddd xmm1, xmm2
+
+
+ movq xmm2, QWORD PTR[rsi + rax * 2]
+ movq xmm3, QWORD PTR[rdi + rdx * 2]
+
+ punpcklbw xmm2, xmm0
+ punpcklbw xmm3, xmm0
+
+ psubsw xmm2, xmm3
+ paddw xmm7, xmm2
+
+ pmaddwd xmm2, xmm2
+ paddd xmm1, xmm2
+
+
+ lea rsi, [rsi + rax * 2]
+ lea rdi, [rdi + rdx * 2]
+ movq xmm2, QWORD PTR[rsi + rax]
+ movq xmm3, QWORD PTR[rdi + rdx]
+
+ punpcklbw xmm2, xmm0
+ punpcklbw xmm3, xmm0
+
+ psubsw xmm2, xmm3
+ paddw xmm7, xmm2
+
+ pmaddwd xmm2, xmm2
+ paddd xmm1, xmm2
+
+ movq xmm2, QWORD PTR[rsi + rax *2]
+ movq xmm3, QWORD PTR[rdi + rdx *2]
+
+ punpcklbw xmm2, xmm0
+ punpcklbw xmm3, xmm0
+
+ psubsw xmm2, xmm3
+ paddw xmm7, xmm2
+
+ pmaddwd xmm2, xmm2
+ paddd xmm1, xmm2
+
+
+ lea rsi, [rsi + rax * 2]
+ lea rdi, [rdi + rdx * 2]
+
+
+ movq xmm2, QWORD PTR[rsi + rax]
+ movq xmm3, QWORD PTR[rdi + rdx]
+
+ punpcklbw xmm2, xmm0
+ punpcklbw xmm3, xmm0
+
+ psubsw xmm2, xmm3
+ paddw xmm7, xmm2
+
+ pmaddwd xmm2, xmm2
+ paddd xmm1, xmm2
+
+ movq xmm2, QWORD PTR[rsi + rax *2]
+ movq xmm3, QWORD PTR[rdi + rdx *2]
+
+ punpcklbw xmm2, xmm0
+ punpcklbw xmm3, xmm0
+
+ psubsw xmm2, xmm3
+ paddw xmm7, xmm2
+
+ pmaddwd xmm2, xmm2
+ paddd xmm1, xmm2
+
+
+ lea rsi, [rsi + rax * 2]
+ lea rdi, [rdi + rdx * 2]
+
+ movq xmm2, QWORD PTR[rsi + rax]
+ movq xmm3, QWORD PTR[rdi + rdx]
+
+ punpcklbw xmm2, xmm0
+ punpcklbw xmm3, xmm0
+
+ psubsw xmm2, xmm3
+ paddw xmm7, xmm2
+
+ pmaddwd xmm2, xmm2
+ paddd xmm1, xmm2
+
+
+ movdqa xmm6, xmm7
+ punpcklwd xmm6, xmm0
+
+ punpckhwd xmm7, xmm0
+ movdqa xmm2, xmm1
+
+ paddw xmm6, xmm7
+ punpckldq xmm1, xmm0
+
+ punpckhdq xmm2, xmm0
+ movdqa xmm7, xmm6
+
+ paddd xmm1, xmm2
+ punpckldq xmm6, xmm0
+
+ punpckhdq xmm7, xmm0
+ paddw xmm6, xmm7
+
+ movdqa xmm2, xmm1
+ movdqa xmm7, xmm6
+
+ psrldq xmm1, 8
+ psrldq xmm6, 8
+
+ paddw xmm7, xmm6
+ paddd xmm1, xmm2
+
+ mov rax, arg(5) ;[Sum]
+ mov rdi, arg(4) ;[SSE]
+
+ movq rdx, xmm7
+ movsx rcx, dx
+
+ mov dword ptr [rax], ecx
+ movd DWORD PTR [rdi], xmm1
+
+ ; begin epilog
+ add rsp, 16
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+;void vp9_filter_block2d_bil_var_sse2
+;(
+; unsigned char *ref_ptr,
+; int ref_pixels_per_line,
+; unsigned char *src_ptr,
+; int src_pixels_per_line,
+; unsigned int Height,
+; int xoffset,
+; int yoffset,
+; int *sum,
+;    unsigned int *sumsquared
+;)
+global sym(vp9_filter_block2d_bil_var_sse2)
+sym(vp9_filter_block2d_bil_var_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 9
+ SAVE_XMM 7
+ GET_GOT rbx
+ push rsi
+ push rdi
+ push rbx
+ ; end prolog
+
+ pxor xmm6, xmm6 ;
+ pxor xmm7, xmm7 ;
+
+ lea rsi, [GLOBAL(xmm_bi_rd)] ; rounding
+ movdqa xmm4, XMMWORD PTR [rsi]
+
+ lea rcx, [GLOBAL(bilinear_filters_sse2)]
+ movsxd rax, dword ptr arg(5) ; xoffset
+
+ cmp rax, 0 ; skip first_pass filter if xoffset=0
+ je filter_block2d_bil_var_sse2_sp_only
+
+ shl rax, 5 ; point to filter coeff with xoffset
+ lea rax, [rax + rcx] ; HFilter
+
+ movsxd rdx, dword ptr arg(6) ; yoffset
+
+ cmp rdx, 0 ; skip second_pass filter if yoffset=0
+ je filter_block2d_bil_var_sse2_fp_only
+
+ shl rdx, 5
+ lea rdx, [rdx + rcx] ; VFilter
+
+ mov rsi, arg(0) ;ref_ptr
+ mov rdi, arg(2) ;src_ptr
+ movsxd rcx, dword ptr arg(4) ;Height
+
+ pxor xmm0, xmm0 ;
+ movq xmm1, QWORD PTR [rsi] ;
+ movq xmm3, QWORD PTR [rsi+1] ;
+
+ punpcklbw xmm1, xmm0 ;
+ pmullw xmm1, [rax] ;
+ punpcklbw xmm3, xmm0
+ pmullw xmm3, [rax+16] ;
+
+ paddw xmm1, xmm3 ;
+ paddw xmm1, xmm4 ;
+ psraw xmm1, xmm_filter_shift ;
+ movdqa xmm5, xmm1
+
+ movsxd rbx, dword ptr arg(1) ;ref_pixels_per_line
+ lea rsi, [rsi + rbx]
+%if ABI_IS_32BIT=0
+ movsxd r9, dword ptr arg(3) ;src_pixels_per_line
+%endif
+
+filter_block2d_bil_var_sse2_loop:
+ movq xmm1, QWORD PTR [rsi] ;
+ movq xmm3, QWORD PTR [rsi+1] ;
+
+ punpcklbw xmm1, xmm0 ;
+ pmullw xmm1, [rax] ;
+ punpcklbw xmm3, xmm0 ;
+ pmullw xmm3, [rax+16] ;
+
+ paddw xmm1, xmm3 ;
+ paddw xmm1, xmm4 ;
+ psraw xmm1, xmm_filter_shift ;
+
+ movdqa xmm3, xmm5 ;
+ movdqa xmm5, xmm1 ;
+
+ pmullw xmm3, [rdx] ;
+ pmullw xmm1, [rdx+16] ;
+ paddw xmm1, xmm3 ;
+ paddw xmm1, xmm4 ;
+ psraw xmm1, xmm_filter_shift ;
+
+ movq xmm3, QWORD PTR [rdi] ;
+ punpcklbw xmm3, xmm0 ;
+
+ psubw xmm1, xmm3 ;
+ paddw xmm6, xmm1 ;
+
+ pmaddwd xmm1, xmm1 ;
+ paddd xmm7, xmm1 ;
+
+ lea rsi, [rsi + rbx] ;ref_pixels_per_line
+%if ABI_IS_32BIT
+ add rdi, dword ptr arg(3) ;src_pixels_per_line
+%else
+ lea rdi, [rdi + r9]
+%endif
+
+ sub rcx, 1 ;
+ jnz filter_block2d_bil_var_sse2_loop ;
+
+ jmp filter_block2d_bil_variance
+
+filter_block2d_bil_var_sse2_sp_only:
+ movsxd rdx, dword ptr arg(6) ; yoffset
+
+ cmp rdx, 0 ; skip all if both xoffset=0 and yoffset=0
+ je filter_block2d_bil_var_sse2_full_pixel
+
+ shl rdx, 5
+ lea rdx, [rdx + rcx] ; VFilter
+
+ mov rsi, arg(0) ;ref_ptr
+ mov rdi, arg(2) ;src_ptr
+ movsxd rcx, dword ptr arg(4) ;Height
+ movsxd rax, dword ptr arg(1) ;ref_pixels_per_line
+
+ pxor xmm0, xmm0 ;
+ movq xmm1, QWORD PTR [rsi] ;
+ punpcklbw xmm1, xmm0 ;
+
+ movsxd rbx, dword ptr arg(3) ;src_pixels_per_line
+ lea rsi, [rsi + rax]
+
+filter_block2d_bil_sp_only_loop:
+ movq xmm3, QWORD PTR [rsi] ;
+ punpcklbw xmm3, xmm0 ;
+ movdqa xmm5, xmm3
+
+ pmullw xmm1, [rdx] ;
+ pmullw xmm3, [rdx+16] ;
+ paddw xmm1, xmm3 ;
+ paddw xmm1, xmm4 ;
+ psraw xmm1, xmm_filter_shift ;
+
+ movq xmm3, QWORD PTR [rdi] ;
+ punpcklbw xmm3, xmm0 ;
+
+ psubw xmm1, xmm3 ;
+ paddw xmm6, xmm1 ;
+
+ pmaddwd xmm1, xmm1 ;
+ paddd xmm7, xmm1 ;
+
+ movdqa xmm1, xmm5 ;
+ lea rsi, [rsi + rax] ;ref_pixels_per_line
+ lea rdi, [rdi + rbx] ;src_pixels_per_line
+
+ sub rcx, 1 ;
+ jnz filter_block2d_bil_sp_only_loop ;
+
+ jmp filter_block2d_bil_variance
+
+filter_block2d_bil_var_sse2_full_pixel:
+ mov rsi, arg(0) ;ref_ptr
+ mov rdi, arg(2) ;src_ptr
+ movsxd rcx, dword ptr arg(4) ;Height
+ movsxd rax, dword ptr arg(1) ;ref_pixels_per_line
+ movsxd rbx, dword ptr arg(3) ;src_pixels_per_line
+ pxor xmm0, xmm0 ;
+
+filter_block2d_bil_full_pixel_loop:
+ movq xmm1, QWORD PTR [rsi] ;
+ punpcklbw xmm1, xmm0 ;
+
+ movq xmm2, QWORD PTR [rdi] ;
+ punpcklbw xmm2, xmm0 ;
+
+ psubw xmm1, xmm2 ;
+ paddw xmm6, xmm1 ;
+
+ pmaddwd xmm1, xmm1 ;
+ paddd xmm7, xmm1 ;
+
+ lea rsi, [rsi + rax] ;ref_pixels_per_line
+ lea rdi, [rdi + rbx] ;src_pixels_per_line
+
+ sub rcx, 1 ;
+ jnz filter_block2d_bil_full_pixel_loop ;
+
+ jmp filter_block2d_bil_variance
+
+filter_block2d_bil_var_sse2_fp_only:
+ mov rsi, arg(0) ;ref_ptr
+ mov rdi, arg(2) ;src_ptr
+ movsxd rcx, dword ptr arg(4) ;Height
+ movsxd rdx, dword ptr arg(1) ;ref_pixels_per_line
+
+ pxor xmm0, xmm0 ;
+ movsxd rbx, dword ptr arg(3) ;src_pixels_per_line
+
+filter_block2d_bil_fp_only_loop:
+ movq xmm1, QWORD PTR [rsi] ;
+ movq xmm3, QWORD PTR [rsi+1] ;
+
+ punpcklbw xmm1, xmm0 ;
+ pmullw xmm1, [rax] ;
+ punpcklbw xmm3, xmm0 ;
+ pmullw xmm3, [rax+16] ;
+
+ paddw xmm1, xmm3 ;
+ paddw xmm1, xmm4 ;
+ psraw xmm1, xmm_filter_shift ;
+
+ movq xmm3, QWORD PTR [rdi] ;
+ punpcklbw xmm3, xmm0 ;
+
+ psubw xmm1, xmm3 ;
+ paddw xmm6, xmm1 ;
+
+ pmaddwd xmm1, xmm1 ;
+ paddd xmm7, xmm1 ;
+ lea rsi, [rsi + rdx]
+ lea rdi, [rdi + rbx] ;src_pixels_per_line
+
+ sub rcx, 1 ;
+ jnz filter_block2d_bil_fp_only_loop ;
+
+ jmp filter_block2d_bil_variance
+
+filter_block2d_bil_variance:
+ movdq2q mm6, xmm6 ;
+ movdq2q mm7, xmm7 ;
+
+ psrldq xmm6, 8
+ psrldq xmm7, 8
+
+ movdq2q mm2, xmm6
+ movdq2q mm3, xmm7
+
+ paddw mm6, mm2
+ paddd mm7, mm3
+
+ pxor mm3, mm3 ;
+ pxor mm2, mm2 ;
+
+ punpcklwd mm2, mm6 ;
+ punpckhwd mm3, mm6 ;
+
+ paddd mm2, mm3 ;
+ movq mm6, mm2 ;
+
+ psrlq mm6, 32 ;
+ paddd mm2, mm6 ;
+
+ psrad mm2, 16 ;
+ movq mm4, mm7 ;
+
+ psrlq mm4, 32 ;
+ paddd mm4, mm7 ;
+
+ mov rsi, arg(7) ; sum
+ mov rdi, arg(8) ; sumsquared
+
+ movd [rsi], mm2 ; xsum
+ movd [rdi], mm4 ; xxsum
+
+ ; begin epilog
+ pop rbx
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+;void vp9_half_horiz_vert_variance8x_h_sse2
+;(
+; unsigned char *ref_ptr,
+; int ref_pixels_per_line,
+; unsigned char *src_ptr,
+; int src_pixels_per_line,
+; unsigned int Height,
+; int *sum,
+; unsigned int *sumsquared
+;)
+global sym(vp9_half_horiz_vert_variance8x_h_sse2)
+sym(vp9_half_horiz_vert_variance8x_h_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 7
+ SAVE_XMM 7
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+%if ABI_IS_32BIT=0
+ movsxd r8, dword ptr arg(1) ;ref_pixels_per_line
+ movsxd r9, dword ptr arg(3) ;src_pixels_per_line
+%endif
+
+ pxor xmm6, xmm6 ; error accumulator
+        pxor            xmm7, xmm7                  ; sse accumulator
+ mov rsi, arg(0) ;ref_ptr ;
+
+ mov rdi, arg(2) ;src_ptr ;
+ movsxd rcx, dword ptr arg(4) ;Height ;
+ movsxd rax, dword ptr arg(1) ;ref_pixels_per_line
+
+ pxor xmm0, xmm0 ;
+
+ movq xmm5, QWORD PTR [rsi] ; xmm5 = s0,s1,s2..s8
+ movq xmm3, QWORD PTR [rsi+1] ; xmm3 = s1,s2,s3..s9
+ pavgb xmm5, xmm3 ; xmm5 = avg(xmm1,xmm3) horizontal line 1
+
+%if ABI_IS_32BIT
+ add rsi, dword ptr arg(1) ;ref_pixels_per_line ; next source
+%else
+ add rsi, r8
+%endif
+
+.half_horiz_vert_variance8x_h_1:
+
+ movq xmm1, QWORD PTR [rsi] ;
+ movq xmm2, QWORD PTR [rsi+1] ;
+ pavgb xmm1, xmm2 ; xmm1 = avg(xmm1,xmm3) horizontal line i+1
+
+ pavgb xmm5, xmm1 ; xmm = vertical average of the above
+ punpcklbw xmm5, xmm0 ; xmm5 = words of above
+
+ movq xmm3, QWORD PTR [rdi] ; xmm3 = d0,d1,d2..d8
+ punpcklbw xmm3, xmm0 ; xmm3 = words of above
+
+ psubw xmm5, xmm3 ; xmm5 -= xmm3
+ paddw xmm6, xmm5 ; xmm6 += accumulated column differences
+ pmaddwd xmm5, xmm5 ; xmm5 *= xmm5
+ paddd xmm7, xmm5 ; xmm7 += accumulated square column differences
+
+ movdqa xmm5, xmm1 ; save xmm1 for use on the next row
+
+%if ABI_IS_32BIT
+ add esi, dword ptr arg(1) ;ref_pixels_per_line ; next source
+ add edi, dword ptr arg(3) ;src_pixels_per_line ; next destination
+%else
+ add rsi, r8
+ add rdi, r9
+%endif
+
+ sub rcx, 1 ;
+ jnz .half_horiz_vert_variance8x_h_1 ;
+
+ movdq2q mm6, xmm6 ;
+ movdq2q mm7, xmm7 ;
+
+ psrldq xmm6, 8
+ psrldq xmm7, 8
+
+ movdq2q mm2, xmm6
+ movdq2q mm3, xmm7
+
+ paddw mm6, mm2
+ paddd mm7, mm3
+
+ pxor mm3, mm3 ;
+ pxor mm2, mm2 ;
+
+ punpcklwd mm2, mm6 ;
+ punpckhwd mm3, mm6 ;
+
+ paddd mm2, mm3 ;
+ movq mm6, mm2 ;
+
+ psrlq mm6, 32 ;
+ paddd mm2, mm6 ;
+
+ psrad mm2, 16 ;
+ movq mm4, mm7 ;
+
+ psrlq mm4, 32 ;
+ paddd mm4, mm7 ;
+
+ mov rsi, arg(5) ; sum
+ mov rdi, arg(6) ; sumsquared
+
+ movd [rsi], mm2 ;
+ movd [rdi], mm4 ;
+
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+;void vp9_half_horiz_vert_variance16x_h_sse2
+;(
+; unsigned char *ref_ptr,
+; int ref_pixels_per_line,
+; unsigned char *src_ptr,
+; int src_pixels_per_line,
+; unsigned int Height,
+; int *sum,
+; unsigned int *sumsquared
+;)
+global sym(vp9_half_horiz_vert_variance16x_h_sse2)
+sym(vp9_half_horiz_vert_variance16x_h_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 7
+ SAVE_XMM 7
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ pxor xmm6, xmm6 ; error accumulator
+        pxor            xmm7, xmm7                  ; sse accumulator
+ mov rsi, arg(0) ;ref_ptr ;
+
+ mov rdi, arg(2) ;src_ptr ;
+ movsxd rcx, dword ptr arg(4) ;Height ;
+ movsxd rax, dword ptr arg(1) ;ref_pixels_per_line
+ movsxd rdx, dword ptr arg(3) ;src_pixels_per_line
+
+ pxor xmm0, xmm0 ;
+
+ movdqu xmm5, XMMWORD PTR [rsi]
+ movdqu xmm3, XMMWORD PTR [rsi+1]
+ pavgb xmm5, xmm3 ; xmm5 = avg(xmm1,xmm3) horizontal line 1
+
+ lea rsi, [rsi + rax]
+
+.half_horiz_vert_variance16x_h_1:
+ movdqu xmm1, XMMWORD PTR [rsi] ;
+ movdqu xmm2, XMMWORD PTR [rsi+1] ;
+ pavgb xmm1, xmm2 ; xmm1 = avg(xmm1,xmm3) horizontal line i+1
+
+ pavgb xmm5, xmm1 ; xmm = vertical average of the above
+
+ movdqa xmm4, xmm5
+ punpcklbw xmm5, xmm0 ; xmm5 = words of above
+ punpckhbw xmm4, xmm0
+
+ movq xmm3, QWORD PTR [rdi] ; xmm3 = d0,d1,d2..d7
+ punpcklbw xmm3, xmm0 ; xmm3 = words of above
+ psubw xmm5, xmm3 ; xmm5 -= xmm3
+
+ movq xmm3, QWORD PTR [rdi+8]
+ punpcklbw xmm3, xmm0
+ psubw xmm4, xmm3
+
+ paddw xmm6, xmm5 ; xmm6 += accumulated column differences
+ paddw xmm6, xmm4
+ pmaddwd xmm5, xmm5 ; xmm5 *= xmm5
+ pmaddwd xmm4, xmm4
+ paddd xmm7, xmm5 ; xmm7 += accumulated square column differences
+ paddd xmm7, xmm4
+
+ movdqa xmm5, xmm1 ; save xmm1 for use on the next row
+
+ lea rsi, [rsi + rax]
+ lea rdi, [rdi + rdx]
+
+ sub rcx, 1 ;
+ jnz .half_horiz_vert_variance16x_h_1 ;
+
+ pxor xmm1, xmm1
+ pxor xmm5, xmm5
+
+ punpcklwd xmm0, xmm6
+ punpckhwd xmm1, xmm6
+ psrad xmm0, 16
+ psrad xmm1, 16
+ paddd xmm0, xmm1
+ movdqa xmm1, xmm0
+
+ movdqa xmm6, xmm7
+ punpckldq xmm6, xmm5
+ punpckhdq xmm7, xmm5
+ paddd xmm6, xmm7
+
+ punpckldq xmm0, xmm5
+ punpckhdq xmm1, xmm5
+ paddd xmm0, xmm1
+
+ movdqa xmm7, xmm6
+ movdqa xmm1, xmm0
+
+ psrldq xmm7, 8
+ psrldq xmm1, 8
+
+ paddd xmm6, xmm7
+ paddd xmm0, xmm1
+
+ mov rsi, arg(5) ;[Sum]
+ mov rdi, arg(6) ;[SSE]
+
+ movd [rsi], xmm0
+ movd [rdi], xmm6
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+;void vp9_half_vert_variance8x_h_sse2
+;(
+; unsigned char *ref_ptr,
+; int ref_pixels_per_line,
+; unsigned char *src_ptr,
+; int src_pixels_per_line,
+; unsigned int Height,
+; int *sum,
+; unsigned int *sumsquared
+;)
+global sym(vp9_half_vert_variance8x_h_sse2)
+sym(vp9_half_vert_variance8x_h_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 7
+ SAVE_XMM 7
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+%if ABI_IS_32BIT=0
+ movsxd r8, dword ptr arg(1) ;ref_pixels_per_line
+ movsxd r9, dword ptr arg(3) ;src_pixels_per_line
+%endif
+
+ pxor xmm6, xmm6 ; error accumulator
+        pxor            xmm7, xmm7                  ; sse accumulator
+ mov rsi, arg(0) ;ref_ptr ;
+
+ mov rdi, arg(2) ;src_ptr ;
+ movsxd rcx, dword ptr arg(4) ;Height ;
+ movsxd rax, dword ptr arg(1) ;ref_pixels_per_line
+
+ pxor xmm0, xmm0 ;
+.half_vert_variance8x_h_1:
+ movq xmm5, QWORD PTR [rsi] ; xmm5 = s0,s1,s2..s8
+        movq            xmm3, QWORD PTR [rsi+rax]       ; xmm3 = s0,s1,s2..s7 of the next row
+
+ pavgb xmm5, xmm3 ; xmm5 = avg(xmm1,xmm3)
+ punpcklbw xmm5, xmm0 ; xmm5 = words of above
+
+ movq xmm3, QWORD PTR [rdi] ; xmm3 = d0,d1,d2..d8
+ punpcklbw xmm3, xmm0 ; xmm3 = words of above
+
+ psubw xmm5, xmm3 ; xmm5 -= xmm3
+ paddw xmm6, xmm5 ; xmm6 += accumulated column differences
+ pmaddwd xmm5, xmm5 ; xmm5 *= xmm5
+ paddd xmm7, xmm5 ; xmm7 += accumulated square column differences
+
+%if ABI_IS_32BIT
+ add esi, dword ptr arg(1) ;ref_pixels_per_line ; next source
+ add edi, dword ptr arg(3) ;src_pixels_per_line ; next destination
+%else
+ add rsi, r8
+ add rdi, r9
+%endif
+
+ sub rcx, 1 ;
+ jnz .half_vert_variance8x_h_1 ;
+
+ movdq2q mm6, xmm6 ;
+ movdq2q mm7, xmm7 ;
+
+ psrldq xmm6, 8
+ psrldq xmm7, 8
+
+ movdq2q mm2, xmm6
+ movdq2q mm3, xmm7
+
+ paddw mm6, mm2
+ paddd mm7, mm3
+
+ pxor mm3, mm3 ;
+ pxor mm2, mm2 ;
+
+ punpcklwd mm2, mm6 ;
+ punpckhwd mm3, mm6 ;
+
+ paddd mm2, mm3 ;
+ movq mm6, mm2 ;
+
+ psrlq mm6, 32 ;
+ paddd mm2, mm6 ;
+
+ psrad mm2, 16 ;
+ movq mm4, mm7 ;
+
+ psrlq mm4, 32 ;
+ paddd mm4, mm7 ;
+
+ mov rsi, arg(5) ; sum
+ mov rdi, arg(6) ; sumsquared
+
+ movd [rsi], mm2 ;
+ movd [rdi], mm4 ;
+
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+;void vp9_half_vert_variance16x_h_sse2
+;(
+; unsigned char *ref_ptr,
+; int ref_pixels_per_line,
+; unsigned char *src_ptr,
+; int src_pixels_per_line,
+; unsigned int Height,
+; int *sum,
+; unsigned int *sumsquared
+;)
+global sym(vp9_half_vert_variance16x_h_sse2)
+sym(vp9_half_vert_variance16x_h_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 7
+ SAVE_XMM 7
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ pxor xmm6, xmm6 ; error accumulator
+        pxor            xmm7, xmm7                      ; sse accumulator
+ mov rsi, arg(0) ;ref_ptr
+
+ mov rdi, arg(2) ;src_ptr
+ movsxd rcx, dword ptr arg(4) ;Height
+ movsxd rax, dword ptr arg(1) ;ref_pixels_per_line
+ movsxd rdx, dword ptr arg(3) ;src_pixels_per_line
+
+ movdqu xmm5, XMMWORD PTR [rsi]
+ lea rsi, [rsi + rax ]
+ pxor xmm0, xmm0
+
+.half_vert_variance16x_h_1:
+ movdqu xmm3, XMMWORD PTR [rsi]
+
+ pavgb xmm5, xmm3 ; xmm5 = avg(xmm1,xmm3)
+ movdqa xmm4, xmm5
+ punpcklbw xmm5, xmm0
+ punpckhbw xmm4, xmm0
+
+ movq xmm2, QWORD PTR [rdi]
+ punpcklbw xmm2, xmm0
+ psubw xmm5, xmm2
+ movq xmm2, QWORD PTR [rdi+8]
+ punpcklbw xmm2, xmm0
+ psubw xmm4, xmm2
+
+ paddw xmm6, xmm5 ; xmm6 += accumulated column differences
+ paddw xmm6, xmm4
+ pmaddwd xmm5, xmm5 ; xmm5 *= xmm5
+ pmaddwd xmm4, xmm4
+ paddd xmm7, xmm5 ; xmm7 += accumulated square column differences
+ paddd xmm7, xmm4
+
+ movdqa xmm5, xmm3
+
+ lea rsi, [rsi + rax]
+ lea rdi, [rdi + rdx]
+
+ sub rcx, 1
+ jnz .half_vert_variance16x_h_1
+
+ pxor xmm1, xmm1
+ pxor xmm5, xmm5
+
+ punpcklwd xmm0, xmm6
+ punpckhwd xmm1, xmm6
+ psrad xmm0, 16
+ psrad xmm1, 16
+ paddd xmm0, xmm1
+ movdqa xmm1, xmm0
+
+ movdqa xmm6, xmm7
+ punpckldq xmm6, xmm5
+ punpckhdq xmm7, xmm5
+ paddd xmm6, xmm7
+
+ punpckldq xmm0, xmm5
+ punpckhdq xmm1, xmm5
+ paddd xmm0, xmm1
+
+ movdqa xmm7, xmm6
+ movdqa xmm1, xmm0
+
+ psrldq xmm7, 8
+ psrldq xmm1, 8
+
+ paddd xmm6, xmm7
+ paddd xmm0, xmm1
+
+ mov rsi, arg(5) ;[Sum]
+ mov rdi, arg(6) ;[SSE]
+
+ movd [rsi], xmm0
+ movd [rdi], xmm6
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+;void vp9_half_horiz_variance8x_h_sse2
+;(
+; unsigned char *ref_ptr,
+; int ref_pixels_per_line,
+; unsigned char *src_ptr,
+; int src_pixels_per_line,
+; unsigned int Height,
+; int *sum,
+; unsigned int *sumsquared
+;)
+global sym(vp9_half_horiz_variance8x_h_sse2)
+sym(vp9_half_horiz_variance8x_h_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 7
+ SAVE_XMM 7
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+%if ABI_IS_32BIT=0
+ movsxd r8, dword ptr arg(1) ;ref_pixels_per_line
+ movsxd r9, dword ptr arg(3) ;src_pixels_per_line
+%endif
+
+ pxor xmm6, xmm6 ; error accumulator
+        pxor            xmm7, xmm7                  ; sse accumulator
+ mov rsi, arg(0) ;ref_ptr ;
+
+ mov rdi, arg(2) ;src_ptr ;
+ movsxd rcx, dword ptr arg(4) ;Height ;
+
+ pxor xmm0, xmm0 ;
+.half_horiz_variance8x_h_1:
+ movq xmm5, QWORD PTR [rsi] ; xmm5 = s0,s1,s2..s8
+ movq xmm3, QWORD PTR [rsi+1] ; xmm3 = s1,s2,s3..s9
+
+ pavgb xmm5, xmm3 ; xmm5 = avg(xmm1,xmm3)
+ punpcklbw xmm5, xmm0 ; xmm5 = words of above
+
+ movq xmm3, QWORD PTR [rdi] ; xmm3 = d0,d1,d2..d8
+ punpcklbw xmm3, xmm0 ; xmm3 = words of above
+
+ psubw xmm5, xmm3 ; xmm5 -= xmm3
+ paddw xmm6, xmm5 ; xmm6 += accumulated column differences
+ pmaddwd xmm5, xmm5 ; xmm5 *= xmm5
+ paddd xmm7, xmm5 ; xmm7 += accumulated square column differences
+
+%if ABI_IS_32BIT
+ add esi, dword ptr arg(1) ;ref_pixels_per_line ; next source
+ add edi, dword ptr arg(3) ;src_pixels_per_line ; next destination
+%else
+ add rsi, r8
+ add rdi, r9
+%endif
+ sub rcx, 1 ;
+ jnz .half_horiz_variance8x_h_1 ;
+
+ movdq2q mm6, xmm6 ;
+ movdq2q mm7, xmm7 ;
+
+ psrldq xmm6, 8
+ psrldq xmm7, 8
+
+ movdq2q mm2, xmm6
+ movdq2q mm3, xmm7
+
+ paddw mm6, mm2
+ paddd mm7, mm3
+
+ pxor mm3, mm3 ;
+ pxor mm2, mm2 ;
+
+ punpcklwd mm2, mm6 ;
+ punpckhwd mm3, mm6 ;
+
+ paddd mm2, mm3 ;
+ movq mm6, mm2 ;
+
+ psrlq mm6, 32 ;
+ paddd mm2, mm6 ;
+
+ psrad mm2, 16 ;
+ movq mm4, mm7 ;
+
+ psrlq mm4, 32 ;
+ paddd mm4, mm7 ;
+
+ mov rsi, arg(5) ; sum
+ mov rdi, arg(6) ; sumsquared
+
+ movd [rsi], mm2 ;
+ movd [rdi], mm4 ;
+
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+;void vp9_half_horiz_variance16x_h_sse2
+;(
+; unsigned char *ref_ptr,
+; int ref_pixels_per_line,
+; unsigned char *src_ptr,
+; int src_pixels_per_line,
+; unsigned int Height,
+; int *sum,
+; unsigned int *sumsquared
+;)
+global sym(vp9_half_horiz_variance16x_h_sse2)
+sym(vp9_half_horiz_variance16x_h_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 7
+ SAVE_XMM 7
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ pxor xmm6, xmm6 ; error accumulator
+        pxor            xmm7, xmm7                  ; sse accumulator
+ mov rsi, arg(0) ;ref_ptr ;
+
+ mov rdi, arg(2) ;src_ptr ;
+ movsxd rcx, dword ptr arg(4) ;Height ;
+ movsxd rax, dword ptr arg(1) ;ref_pixels_per_line
+ movsxd rdx, dword ptr arg(3) ;src_pixels_per_line
+
+ pxor xmm0, xmm0 ;
+
+.half_horiz_variance16x_h_1:
+ movdqu xmm5, XMMWORD PTR [rsi] ; xmm5 = s0,s1,s2..s15
+ movdqu xmm3, XMMWORD PTR [rsi+1] ; xmm3 = s1,s2,s3..s16
+
+ pavgb xmm5, xmm3 ; xmm5 = avg(xmm1,xmm3)
+ movdqa xmm1, xmm5
+ punpcklbw xmm5, xmm0 ; xmm5 = words of above
+ punpckhbw xmm1, xmm0
+
+ movq xmm3, QWORD PTR [rdi] ; xmm3 = d0,d1,d2..d7
+ punpcklbw xmm3, xmm0 ; xmm3 = words of above
+ movq xmm2, QWORD PTR [rdi+8]
+ punpcklbw xmm2, xmm0
+
+ psubw xmm5, xmm3 ; xmm5 -= xmm3
+ psubw xmm1, xmm2
+ paddw xmm6, xmm5 ; xmm6 += accumulated column differences
+ paddw xmm6, xmm1
+ pmaddwd xmm5, xmm5 ; xmm5 *= xmm5
+ pmaddwd xmm1, xmm1
+ paddd xmm7, xmm5 ; xmm7 += accumulated square column differences
+ paddd xmm7, xmm1
+
+ lea rsi, [rsi + rax]
+ lea rdi, [rdi + rdx]
+
+ sub rcx, 1 ;
+ jnz .half_horiz_variance16x_h_1 ;
+
+ pxor xmm1, xmm1
+ pxor xmm5, xmm5
+
+ punpcklwd xmm0, xmm6
+ punpckhwd xmm1, xmm6
+ psrad xmm0, 16
+ psrad xmm1, 16
+ paddd xmm0, xmm1
+ movdqa xmm1, xmm0
+
+ movdqa xmm6, xmm7
+ punpckldq xmm6, xmm5
+ punpckhdq xmm7, xmm5
+ paddd xmm6, xmm7
+
+ punpckldq xmm0, xmm5
+ punpckhdq xmm1, xmm5
+ paddd xmm0, xmm1
+
+ movdqa xmm7, xmm6
+ movdqa xmm1, xmm0
+
+ psrldq xmm7, 8
+ psrldq xmm1, 8
+
+ paddd xmm6, xmm7
+ paddd xmm0, xmm1
+
+ mov rsi, arg(5) ;[Sum]
+ mov rdi, arg(6) ;[SSE]
+
+ movd [rsi], xmm0
+ movd [rdi], xmm6
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+SECTION_RODATA
+; short xmm_bi_rd[8] = { 64, 64, 64, 64,64, 64, 64, 64};
+align 16
+xmm_bi_rd:
+ times 8 dw 64
+align 16
+bilinear_filters_sse2:
+ dw 128, 128, 128, 128, 128, 128, 128, 128, 0, 0, 0, 0, 0, 0, 0, 0
+ dw 120, 120, 120, 120, 120, 120, 120, 120, 8, 8, 8, 8, 8, 8, 8, 8
+ dw 112, 112, 112, 112, 112, 112, 112, 112, 16, 16, 16, 16, 16, 16, 16, 16
+ dw 104, 104, 104, 104, 104, 104, 104, 104, 24, 24, 24, 24, 24, 24, 24, 24
+ dw 96, 96, 96, 96, 96, 96, 96, 96, 32, 32, 32, 32, 32, 32, 32, 32
+ dw 88, 88, 88, 88, 88, 88, 88, 88, 40, 40, 40, 40, 40, 40, 40, 40
+ dw 80, 80, 80, 80, 80, 80, 80, 80, 48, 48, 48, 48, 48, 48, 48, 48
+ dw 72, 72, 72, 72, 72, 72, 72, 72, 56, 56, 56, 56, 56, 56, 56, 56
+ dw 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64
+ dw 56, 56, 56, 56, 56, 56, 56, 56, 72, 72, 72, 72, 72, 72, 72, 72
+ dw 48, 48, 48, 48, 48, 48, 48, 48, 80, 80, 80, 80, 80, 80, 80, 80
+ dw 40, 40, 40, 40, 40, 40, 40, 40, 88, 88, 88, 88, 88, 88, 88, 88
+ dw 32, 32, 32, 32, 32, 32, 32, 32, 96, 96, 96, 96, 96, 96, 96, 96
+ dw 24, 24, 24, 24, 24, 24, 24, 24, 104, 104, 104, 104, 104, 104, 104, 104
+ dw 16, 16, 16, 16, 16, 16, 16, 16, 112, 112, 112, 112, 112, 112, 112, 112
+ dw 8, 8, 8, 8, 8, 8, 8, 8, 120, 120, 120, 120, 120, 120, 120, 120
--- /dev/null
+++ b/vp9/encoder/x86/variance_impl_ssse3.asm
@@ -1,0 +1,372 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+%define xmm_filter_shift 7
+
+
+;void vp9_filter_block2d_bil_var_ssse3
+;(
+; unsigned char *ref_ptr,
+; int ref_pixels_per_line,
+; unsigned char *src_ptr,
+; int src_pixels_per_line,
+; unsigned int Height,
+; int xoffset,
+; int yoffset,
+; int *sum,
+;    unsigned int *sumsquared
+;)
+;Note: The filter coefficient at offset=0 is 128. Since the second operand
+;of pmaddubsw holds signed bytes, we must calculate the zero offset separately.
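+;With a zero offset the other tap would be 0, so the code below branches to
+;the sp_only/fp_only/full_pixel paths instead of running pmaddubsw with an
+;unrepresentable coefficient.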
+global sym(vp9_filter_block2d_bil_var_ssse3)
+sym(vp9_filter_block2d_bil_var_ssse3):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 9
+ SAVE_XMM 7
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ pxor xmm6, xmm6
+ pxor xmm7, xmm7
+
+ lea rcx, [GLOBAL(bilinear_filters_ssse3)]
+ movsxd rax, dword ptr arg(5) ; xoffset
+
+ cmp rax, 0 ; skip first_pass filter if xoffset=0
+ je .filter_block2d_bil_var_ssse3_sp_only
+
+ shl rax, 4 ; point to filter coeff with xoffset
+ lea rax, [rax + rcx] ; HFilter
+
+ movsxd rdx, dword ptr arg(6) ; yoffset
+
+ cmp rdx, 0 ; skip second_pass filter if yoffset=0
+ je .filter_block2d_bil_var_ssse3_fp_only
+
+ shl rdx, 4
+ lea rdx, [rdx + rcx] ; VFilter
+
+ mov rsi, arg(0) ;ref_ptr
+ mov rdi, arg(2) ;src_ptr
+ movsxd rcx, dword ptr arg(4) ;Height
+
+ movdqu xmm0, XMMWORD PTR [rsi]
+ movdqu xmm1, XMMWORD PTR [rsi+1]
+ movdqa xmm2, xmm0
+
+ punpcklbw xmm0, xmm1
+ punpckhbw xmm2, xmm1
+ pmaddubsw xmm0, [rax]
+ pmaddubsw xmm2, [rax]
+
+ paddw xmm0, [GLOBAL(xmm_bi_rd)]
+ paddw xmm2, [GLOBAL(xmm_bi_rd)]
+ psraw xmm0, xmm_filter_shift
+ psraw xmm2, xmm_filter_shift
+
+ packuswb xmm0, xmm2
+
+%if ABI_IS_32BIT
+ add rsi, dword ptr arg(1) ;ref_pixels_per_line
+%else
+ movsxd r8, dword ptr arg(1) ;ref_pixels_per_line
+ movsxd r9, dword ptr arg(3) ;src_pixels_per_line
+ lea rsi, [rsi + r8]
+%endif
+
+.filter_block2d_bil_var_ssse3_loop:
+ movdqu xmm1, XMMWORD PTR [rsi]
+ movdqu xmm2, XMMWORD PTR [rsi+1]
+ movdqa xmm3, xmm1
+
+ punpcklbw xmm1, xmm2
+ punpckhbw xmm3, xmm2
+ pmaddubsw xmm1, [rax]
+ pmaddubsw xmm3, [rax]
+
+ paddw xmm1, [GLOBAL(xmm_bi_rd)]
+ paddw xmm3, [GLOBAL(xmm_bi_rd)]
+ psraw xmm1, xmm_filter_shift
+ psraw xmm3, xmm_filter_shift
+ packuswb xmm1, xmm3
+
+ movdqa xmm2, xmm0
+ movdqa xmm0, xmm1
+ movdqa xmm3, xmm2
+
+ punpcklbw xmm2, xmm1
+ punpckhbw xmm3, xmm1
+ pmaddubsw xmm2, [rdx]
+ pmaddubsw xmm3, [rdx]
+
+ paddw xmm2, [GLOBAL(xmm_bi_rd)]
+ paddw xmm3, [GLOBAL(xmm_bi_rd)]
+ psraw xmm2, xmm_filter_shift
+ psraw xmm3, xmm_filter_shift
+
+ movq xmm1, QWORD PTR [rdi]
+ pxor xmm4, xmm4
+ punpcklbw xmm1, xmm4
+ movq xmm5, QWORD PTR [rdi+8]
+ punpcklbw xmm5, xmm4
+
+ psubw xmm2, xmm1
+ psubw xmm3, xmm5
+ paddw xmm6, xmm2
+ paddw xmm6, xmm3
+ pmaddwd xmm2, xmm2
+ pmaddwd xmm3, xmm3
+ paddd xmm7, xmm2
+ paddd xmm7, xmm3
+
+%if ABI_IS_32BIT
+ add rsi, dword ptr arg(1) ;ref_pixels_per_line
+ add rdi, dword ptr arg(3) ;src_pixels_per_line
+%else
+ lea rsi, [rsi + r8]
+ lea rdi, [rdi + r9]
+%endif
+
+ sub rcx, 1
+ jnz .filter_block2d_bil_var_ssse3_loop
+
+ jmp .filter_block2d_bil_variance
+
+.filter_block2d_bil_var_ssse3_sp_only:
+ movsxd rdx, dword ptr arg(6) ; yoffset
+
+ cmp rdx, 0 ; Both xoffset =0 and yoffset=0
+ je .filter_block2d_bil_var_ssse3_full_pixel
+
+ shl rdx, 4
+ lea rdx, [rdx + rcx] ; VFilter
+
+ mov rsi, arg(0) ;ref_ptr
+ mov rdi, arg(2) ;src_ptr
+ movsxd rcx, dword ptr arg(4) ;Height
+ movsxd rax, dword ptr arg(1) ;ref_pixels_per_line
+
+ movdqu xmm1, XMMWORD PTR [rsi]
+ movdqa xmm0, xmm1
+
+%if ABI_IS_32BIT=0
+ movsxd r9, dword ptr arg(3) ;src_pixels_per_line
+%endif
+
+ lea rsi, [rsi + rax]
+
+.filter_block2d_bil_sp_only_loop:
+ movdqu xmm3, XMMWORD PTR [rsi]
+ movdqa xmm2, xmm1
+ movdqa xmm0, xmm3
+
+ punpcklbw xmm1, xmm3
+ punpckhbw xmm2, xmm3
+ pmaddubsw xmm1, [rdx]
+ pmaddubsw xmm2, [rdx]
+
+ paddw xmm1, [GLOBAL(xmm_bi_rd)]
+ paddw xmm2, [GLOBAL(xmm_bi_rd)]
+ psraw xmm1, xmm_filter_shift
+ psraw xmm2, xmm_filter_shift
+
+ movq xmm3, QWORD PTR [rdi]
+ pxor xmm4, xmm4
+ punpcklbw xmm3, xmm4
+ movq xmm5, QWORD PTR [rdi+8]
+ punpcklbw xmm5, xmm4
+
+ psubw xmm1, xmm3
+ psubw xmm2, xmm5
+ paddw xmm6, xmm1
+ paddw xmm6, xmm2
+ pmaddwd xmm1, xmm1
+ pmaddwd xmm2, xmm2
+ paddd xmm7, xmm1
+ paddd xmm7, xmm2
+
+ movdqa xmm1, xmm0
+ lea rsi, [rsi + rax] ;ref_pixels_per_line
+
+%if ABI_IS_32BIT
+ add rdi, dword ptr arg(3) ;src_pixels_per_line
+%else
+ lea rdi, [rdi + r9]
+%endif
+
+ sub rcx, 1
+ jnz .filter_block2d_bil_sp_only_loop
+
+ jmp .filter_block2d_bil_variance
+
+.filter_block2d_bil_var_ssse3_full_pixel:
+ mov rsi, arg(0) ;ref_ptr
+ mov rdi, arg(2) ;src_ptr
+ movsxd rcx, dword ptr arg(4) ;Height
+ movsxd rax, dword ptr arg(1) ;ref_pixels_per_line
+ movsxd rdx, dword ptr arg(3) ;src_pixels_per_line
+ pxor xmm0, xmm0
+
+.filter_block2d_bil_full_pixel_loop:
+ movq xmm1, QWORD PTR [rsi]
+ punpcklbw xmm1, xmm0
+ movq xmm2, QWORD PTR [rsi+8]
+ punpcklbw xmm2, xmm0
+
+ movq xmm3, QWORD PTR [rdi]
+ punpcklbw xmm3, xmm0
+ movq xmm4, QWORD PTR [rdi+8]
+ punpcklbw xmm4, xmm0
+
+ psubw xmm1, xmm3
+ psubw xmm2, xmm4
+ paddw xmm6, xmm1
+ paddw xmm6, xmm2
+ pmaddwd xmm1, xmm1
+ pmaddwd xmm2, xmm2
+ paddd xmm7, xmm1
+ paddd xmm7, xmm2
+
+ lea rsi, [rsi + rax] ;ref_pixels_per_line
+ lea rdi, [rdi + rdx] ;src_pixels_per_line
+ sub rcx, 1
+ jnz .filter_block2d_bil_full_pixel_loop
+
+ jmp .filter_block2d_bil_variance
+
+.filter_block2d_bil_var_ssse3_fp_only:
+ mov rsi, arg(0) ;ref_ptr
+ mov rdi, arg(2) ;src_ptr
+ movsxd rcx, dword ptr arg(4) ;Height
+ movsxd rdx, dword ptr arg(1) ;ref_pixels_per_line
+
+ pxor xmm0, xmm0
+
+%if ABI_IS_32BIT=0
+ movsxd r9, dword ptr arg(3) ;src_pixels_per_line
+%endif
+
+.filter_block2d_bil_fp_only_loop:
+ movdqu xmm1, XMMWORD PTR [rsi]
+ movdqu xmm2, XMMWORD PTR [rsi+1]
+ movdqa xmm3, xmm1
+
+ punpcklbw xmm1, xmm2
+ punpckhbw xmm3, xmm2
+ pmaddubsw xmm1, [rax]
+ pmaddubsw xmm3, [rax]
+
+ paddw xmm1, [GLOBAL(xmm_bi_rd)]
+ paddw xmm3, [GLOBAL(xmm_bi_rd)]
+ psraw xmm1, xmm_filter_shift
+ psraw xmm3, xmm_filter_shift
+
+        movq            xmm2, QWORD PTR [rdi]
+ pxor xmm4, xmm4
+ punpcklbw xmm2, xmm4
+ movq xmm5, QWORD PTR [rdi+8]
+ punpcklbw xmm5, xmm4
+
+ psubw xmm1, xmm2
+ psubw xmm3, xmm5
+ paddw xmm6, xmm1
+ paddw xmm6, xmm3
+ pmaddwd xmm1, xmm1
+ pmaddwd xmm3, xmm3
+ paddd xmm7, xmm1
+ paddd xmm7, xmm3
+
+ lea rsi, [rsi + rdx]
+%if ABI_IS_32BIT
+ add rdi, dword ptr arg(3) ;src_pixels_per_line
+%else
+ lea rdi, [rdi + r9]
+%endif
+
+ sub rcx, 1
+ jnz .filter_block2d_bil_fp_only_loop
+
+ jmp .filter_block2d_bil_variance
+
+.filter_block2d_bil_variance:
+ pxor xmm0, xmm0
+ pxor xmm1, xmm1
+ pxor xmm5, xmm5
+
+ punpcklwd xmm0, xmm6
+ punpckhwd xmm1, xmm6
+ psrad xmm0, 16
+ psrad xmm1, 16
+ paddd xmm0, xmm1
+ movdqa xmm1, xmm0
+
+ movdqa xmm6, xmm7
+ punpckldq xmm6, xmm5
+ punpckhdq xmm7, xmm5
+ paddd xmm6, xmm7
+
+ punpckldq xmm0, xmm5
+ punpckhdq xmm1, xmm5
+ paddd xmm0, xmm1
+
+ movdqa xmm7, xmm6
+ movdqa xmm1, xmm0
+
+ psrldq xmm7, 8
+ psrldq xmm1, 8
+
+ paddd xmm6, xmm7
+ paddd xmm0, xmm1
+
+ mov rsi, arg(7) ;[Sum]
+ mov rdi, arg(8) ;[SSE]
+
+ movd [rsi], xmm0
+ movd [rdi], xmm6
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+SECTION_RODATA
+align 16
+xmm_bi_rd:
+ times 8 dw 64
+align 16
+bilinear_filters_ssse3:
+ times 8 db 128, 0
+ times 8 db 120, 8
+ times 8 db 112, 16
+ times 8 db 104, 24
+ times 8 db 96, 32
+ times 8 db 88, 40
+ times 8 db 80, 48
+ times 8 db 72, 56
+ times 8 db 64, 64
+ times 8 db 56, 72
+ times 8 db 48, 80
+ times 8 db 40, 88
+ times 8 db 32, 96
+ times 8 db 24, 104
+ times 8 db 16, 112
+ times 8 db 8, 120
--- /dev/null
+++ b/vp9/encoder/x86/variance_mmx.c
@@ -1,0 +1,406 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "vpx_config.h"
+#include "vp9/encoder/variance.h"
+#include "vp9/common/pragmas.h"
+#include "vpx_ports/mem.h"
+
+extern void filter_block1d_h6_mmx
+(
+ const unsigned char *src_ptr,
+ unsigned short *output_ptr,
+ unsigned int src_pixels_per_line,
+ unsigned int pixel_step,
+ unsigned int output_height,
+ unsigned int output_width,
+ short *vp7_filter
+);
+extern void filter_block1d_v6_mmx
+(
+ const short *src_ptr,
+ unsigned char *output_ptr,
+ unsigned int pixels_per_line,
+ unsigned int pixel_step,
+ unsigned int output_height,
+ unsigned int output_width,
+ short *vp7_filter
+);
+
+extern unsigned int vp9_get_mb_ss_mmx(const short *src_ptr);
+extern unsigned int vp9_get8x8var_mmx
+(
+ const unsigned char *src_ptr,
+ int source_stride,
+ const unsigned char *ref_ptr,
+ int recon_stride,
+ unsigned int *SSE,
+ int *Sum
+);
+extern unsigned int vp9_get4x4var_mmx
+(
+ const unsigned char *src_ptr,
+ int source_stride,
+ const unsigned char *ref_ptr,
+ int recon_stride,
+ unsigned int *SSE,
+ int *Sum
+);
+extern void vp9_filter_block2d_bil4x4_var_mmx
+(
+ const unsigned char *ref_ptr,
+ int ref_pixels_per_line,
+ const unsigned char *src_ptr,
+ int src_pixels_per_line,
+ const short *HFilter,
+ const short *VFilter,
+ int *sum,
+ unsigned int *sumsquared
+);
+extern void vp9_filter_block2d_bil_var_mmx
+(
+ const unsigned char *ref_ptr,
+ int ref_pixels_per_line,
+ const unsigned char *src_ptr,
+ int src_pixels_per_line,
+ unsigned int Height,
+ const short *HFilter,
+ const short *VFilter,
+ int *sum,
+ unsigned int *sumsquared
+);
+
+
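+/* The wrappers below turn the helper outputs into a variance:
+ * variance = SSE - Sum*Sum / N, where N is the pixel count of the block
+ * (16, 64, 128 or 256), which is why the returns shift the squared sum
+ * right by 4, 6, 7 or 8. */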
+unsigned int vp9_variance4x4_mmx(
+ const unsigned char *src_ptr,
+ int source_stride,
+ const unsigned char *ref_ptr,
+ int recon_stride,
+ unsigned int *sse) {
+ unsigned int var;
+ int avg;
+
+ vp9_get4x4var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &var, &avg);
+ *sse = var;
+ return (var - ((avg * avg) >> 4));
+
+}
+
+unsigned int vp9_variance8x8_mmx(
+ const unsigned char *src_ptr,
+ int source_stride,
+ const unsigned char *ref_ptr,
+ int recon_stride,
+ unsigned int *sse) {
+ unsigned int var;
+ int avg;
+
+ vp9_get8x8var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &var, &avg);
+ *sse = var;
+
+ return (var - ((avg * avg) >> 6));
+
+}
+
+unsigned int vp9_mse16x16_mmx(
+ const unsigned char *src_ptr,
+ int source_stride,
+ const unsigned char *ref_ptr,
+ int recon_stride,
+ unsigned int *sse) {
+ unsigned int sse0, sse1, sse2, sse3, var;
+ int sum0, sum1, sum2, sum3;
+
+
+ vp9_get8x8var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0);
+ vp9_get8x8var_mmx(src_ptr + 8, source_stride, ref_ptr + 8, recon_stride, &sse1, &sum1);
+ vp9_get8x8var_mmx(src_ptr + 8 * source_stride, source_stride, ref_ptr + 8 * recon_stride, recon_stride, &sse2, &sum2);
+ vp9_get8x8var_mmx(src_ptr + 8 * source_stride + 8, source_stride, ref_ptr + 8 * recon_stride + 8, recon_stride, &sse3, &sum3);
+
+ var = sse0 + sse1 + sse2 + sse3;
+ *sse = var;
+ return var;
+}
+
+
+unsigned int vp9_variance16x16_mmx(
+ const unsigned char *src_ptr,
+ int source_stride,
+ const unsigned char *ref_ptr,
+ int recon_stride,
+ unsigned int *sse) {
+ unsigned int sse0, sse1, sse2, sse3, var;
+ int sum0, sum1, sum2, sum3, avg;
+
+
+ vp9_get8x8var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0);
+ vp9_get8x8var_mmx(src_ptr + 8, source_stride, ref_ptr + 8, recon_stride, &sse1, &sum1);
+ vp9_get8x8var_mmx(src_ptr + 8 * source_stride, source_stride, ref_ptr + 8 * recon_stride, recon_stride, &sse2, &sum2);
+ vp9_get8x8var_mmx(src_ptr + 8 * source_stride + 8, source_stride, ref_ptr + 8 * recon_stride + 8, recon_stride, &sse3, &sum3);
+
+ var = sse0 + sse1 + sse2 + sse3;
+ avg = sum0 + sum1 + sum2 + sum3;
+ *sse = var;
+ return (var - ((avg * avg) >> 8));
+}
+
+unsigned int vp9_variance16x8_mmx(
+ const unsigned char *src_ptr,
+ int source_stride,
+ const unsigned char *ref_ptr,
+ int recon_stride,
+ unsigned int *sse) {
+ unsigned int sse0, sse1, var;
+ int sum0, sum1, avg;
+
+ vp9_get8x8var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0);
+ vp9_get8x8var_mmx(src_ptr + 8, source_stride, ref_ptr + 8, recon_stride, &sse1, &sum1);
+
+ var = sse0 + sse1;
+ avg = sum0 + sum1;
+ *sse = var;
+ return (var - ((avg * avg) >> 7));
+
+}
+
+
+unsigned int vp9_variance8x16_mmx(
+ const unsigned char *src_ptr,
+ int source_stride,
+ const unsigned char *ref_ptr,
+ int recon_stride,
+ unsigned int *sse) {
+ unsigned int sse0, sse1, var;
+ int sum0, sum1, avg;
+
+ vp9_get8x8var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0);
+ vp9_get8x8var_mmx(src_ptr + 8 * source_stride, source_stride, ref_ptr + 8 * recon_stride, recon_stride, &sse1, &sum1);
+
+ var = sse0 + sse1;
+ avg = sum0 + sum1;
+ *sse = var;
+
+ return (var - ((avg * avg) >> 7));
+
+}
+
+
+
+
+///////////////////////////////////////////////////////////////////////////
+// the mmx function that does the bilinear filtering and var calculation //
+// in one pass (a scalar reference sketch follows the filter table below)//
+///////////////////////////////////////////////////////////////////////////
+DECLARE_ALIGNED(16, const short, vp9_bilinear_filters_mmx[16][8]) = {
+ { 128, 128, 128, 128, 0, 0, 0, 0 },
+ { 120, 120, 120, 120, 8, 8, 8, 8 },
+ { 112, 112, 112, 112, 16, 16, 16, 16 },
+ { 104, 104, 104, 104, 24, 24, 24, 24 },
+ { 96, 96, 96, 96, 32, 32, 32, 32 },
+ { 88, 88, 88, 88, 40, 40, 40, 40 },
+ { 80, 80, 80, 80, 48, 48, 48, 48 },
+ { 72, 72, 72, 72, 56, 56, 56, 56 },
+ { 64, 64, 64, 64, 64, 64, 64, 64 },
+ { 56, 56, 56, 56, 72, 72, 72, 72 },
+ { 48, 48, 48, 48, 80, 80, 80, 80 },
+ { 40, 40, 40, 40, 88, 88, 88, 88 },
+ { 32, 32, 32, 32, 96, 96, 96, 96 },
+ { 24, 24, 24, 24, 104, 104, 104, 104 },
+ { 16, 16, 16, 16, 112, 112, 112, 112 },
+ { 8, 8, 8, 8, 120, 120, 120, 120 }
+};
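+
+/* Illustrative scalar sketch (not part of the original sources): the MMX
+ * routines above implement the equivalent of the loop below, several pixels
+ * at a time.  Each predicted pixel is a two-tap horizontal blend followed by
+ * a two-tap vertical blend using one row of the table above, rounded by 64
+ * and shifted right by 7; the signed difference against the source pixel is
+ * accumulated into *sum and its square into *sse.  The function and
+ * parameter names here are hypothetical. */
+static void vp9_bilinear_var_sketch_c(const unsigned char *ref_ptr,
+                                      int ref_stride,
+                                      const unsigned char *src_ptr,
+                                      int src_stride,
+                                      int w, int h,
+                                      const short *hfilter,  /* one table row */
+                                      const short *vfilter,  /* one table row */
+                                      int *sum, unsigned int *sse) {
+  int r, c;
+  *sum = 0;
+  *sse = 0;
+  for (r = 0; r < h; r++) {
+    for (c = 0; c < w; c++) {
+      /* first pass: horizontal blend on the current and the next row */
+      int a = (ref_ptr[c] * hfilter[0] +
+               ref_ptr[c + 1] * hfilter[4] + 64) >> 7;
+      int b = (ref_ptr[c + ref_stride] * hfilter[0] +
+               ref_ptr[c + ref_stride + 1] * hfilter[4] + 64) >> 7;
+      /* second pass: vertical blend of the two filtered rows */
+      int pred = (a * vfilter[0] + b * vfilter[4] + 64) >> 7;
+      int diff = pred - src_ptr[c];
+      *sum += diff;
+      *sse += diff * diff;
+    }
+    ref_ptr += ref_stride;
+    src_ptr += src_stride;
+  }
+}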
+
+unsigned int vp9_sub_pixel_variance4x4_mmx
+(
+ const unsigned char *src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ const unsigned char *dst_ptr,
+ int dst_pixels_per_line,
+ unsigned int *sse)
+
+{
+ int xsum;
+ unsigned int xxsum;
+ vp9_filter_block2d_bil4x4_var_mmx(
+ src_ptr, src_pixels_per_line,
+ dst_ptr, dst_pixels_per_line,
+ vp9_bilinear_filters_mmx[xoffset], vp9_bilinear_filters_mmx[yoffset],
+ &xsum, &xxsum
+ );
+ *sse = xxsum;
+ return (xxsum - ((xsum * xsum) >> 4));
+}
+
+
+unsigned int vp9_sub_pixel_variance8x8_mmx
+(
+ const unsigned char *src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ const unsigned char *dst_ptr,
+ int dst_pixels_per_line,
+ unsigned int *sse
+) {
+
+ int xsum;
+ unsigned int xxsum;
+ vp9_filter_block2d_bil_var_mmx(
+ src_ptr, src_pixels_per_line,
+ dst_ptr, dst_pixels_per_line, 8,
+ vp9_bilinear_filters_mmx[xoffset], vp9_bilinear_filters_mmx[yoffset],
+ &xsum, &xxsum
+ );
+ *sse = xxsum;
+ return (xxsum - ((xsum * xsum) >> 6));
+}
+
+unsigned int vp9_sub_pixel_variance16x16_mmx
+(
+ const unsigned char *src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ const unsigned char *dst_ptr,
+ int dst_pixels_per_line,
+ unsigned int *sse
+) {
+
+ int xsum0, xsum1;
+ unsigned int xxsum0, xxsum1;
+
+ vp9_filter_block2d_bil_var_mmx(
+ src_ptr, src_pixels_per_line,
+ dst_ptr, dst_pixels_per_line, 16,
+ vp9_bilinear_filters_mmx[xoffset], vp9_bilinear_filters_mmx[yoffset],
+ &xsum0, &xxsum0
+ );
+
+ vp9_filter_block2d_bil_var_mmx(
+ src_ptr + 8, src_pixels_per_line,
+ dst_ptr + 8, dst_pixels_per_line, 16,
+ vp9_bilinear_filters_mmx[xoffset], vp9_bilinear_filters_mmx[yoffset],
+ &xsum1, &xxsum1
+ );
+
+ xsum0 += xsum1;
+ xxsum0 += xxsum1;
+
+ *sse = xxsum0;
+ return (xxsum0 - ((xsum0 * xsum0) >> 8));
+
+
+}
+
+unsigned int vp9_sub_pixel_mse16x16_mmx(
+ const unsigned char *src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ const unsigned char *dst_ptr,
+ int dst_pixels_per_line,
+ unsigned int *sse
+) {
+ vp9_sub_pixel_variance16x16_mmx(src_ptr, src_pixels_per_line, xoffset, yoffset, dst_ptr, dst_pixels_per_line, sse);
+ return *sse;
+}
+
+unsigned int vp9_sub_pixel_variance16x8_mmx
+(
+ const unsigned char *src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ const unsigned char *dst_ptr,
+ int dst_pixels_per_line,
+ unsigned int *sse
+) {
+ int xsum0, xsum1;
+ unsigned int xxsum0, xxsum1;
+
+
+ vp9_filter_block2d_bil_var_mmx(
+ src_ptr, src_pixels_per_line,
+ dst_ptr, dst_pixels_per_line, 8,
+ vp9_bilinear_filters_mmx[xoffset], vp9_bilinear_filters_mmx[yoffset],
+ &xsum0, &xxsum0
+ );
+
+
+ vp9_filter_block2d_bil_var_mmx(
+ src_ptr + 8, src_pixels_per_line,
+ dst_ptr + 8, dst_pixels_per_line, 8,
+ vp9_bilinear_filters_mmx[xoffset], vp9_bilinear_filters_mmx[yoffset],
+ &xsum1, &xxsum1
+ );
+
+ xsum0 += xsum1;
+ xxsum0 += xxsum1;
+
+ *sse = xxsum0;
+ return (xxsum0 - ((xsum0 * xsum0) >> 7));
+}
+
+unsigned int vp9_sub_pixel_variance8x16_mmx
+(
+ const unsigned char *src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ const unsigned char *dst_ptr,
+ int dst_pixels_per_line,
+ unsigned int *sse
+) {
+ int xsum;
+ unsigned int xxsum;
+ vp9_filter_block2d_bil_var_mmx(
+ src_ptr, src_pixels_per_line,
+ dst_ptr, dst_pixels_per_line, 16,
+ vp9_bilinear_filters_mmx[xoffset], vp9_bilinear_filters_mmx[yoffset],
+ &xsum, &xxsum
+ );
+ *sse = xxsum;
+ return (xxsum - ((xsum * xsum) >> 7));
+}
+
+
+unsigned int vp9_variance_halfpixvar16x16_h_mmx(
+ const unsigned char *src_ptr,
+ int source_stride,
+ const unsigned char *ref_ptr,
+ int recon_stride,
+ unsigned int *sse) {
+ return vp9_sub_pixel_variance16x16_mmx(src_ptr, source_stride, 8, 0,
+ ref_ptr, recon_stride, sse);
+}
+
+
+unsigned int vp9_variance_halfpixvar16x16_v_mmx(
+ const unsigned char *src_ptr,
+ int source_stride,
+ const unsigned char *ref_ptr,
+ int recon_stride,
+ unsigned int *sse) {
+ return vp9_sub_pixel_variance16x16_mmx(src_ptr, source_stride, 0, 8,
+ ref_ptr, recon_stride, sse);
+}
+
+
+unsigned int vp9_variance_halfpixvar16x16_hv_mmx(
+ const unsigned char *src_ptr,
+ int source_stride,
+ const unsigned char *ref_ptr,
+ int recon_stride,
+ unsigned int *sse) {
+ return vp9_sub_pixel_variance16x16_mmx(src_ptr, source_stride, 8, 8,
+ ref_ptr, recon_stride, sse);
+}
--- /dev/null
+++ b/vp9/encoder/x86/variance_sse2.c
@@ -1,0 +1,517 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "vpx_config.h"
+#include "vp9/encoder/variance.h"
+#include "vp9/common/pragmas.h"
+#include "vpx_ports/mem.h"
+
+#define HALFNDX 8
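+/* Index 8 of the 16-entry bilinear filter table is the {64, 64} half-pixel
+ * filter, so offsets equal to HALFNDX can be served by the specialised
+ * pavgb-based half-pixel variance routines declared below. */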
+
+extern void filter_block1d_h6_mmx(const unsigned char *src_ptr, unsigned short *output_ptr, unsigned int src_pixels_per_line, unsigned int pixel_step, unsigned int output_height, unsigned int output_width, short *vp7_filter);
+extern void filter_block1d_v6_mmx(const short *src_ptr, unsigned char *output_ptr, unsigned int pixels_per_line, unsigned int pixel_step, unsigned int output_height, unsigned int output_width, short *vp7_filter);
+extern void filter_block1d8_h6_sse2(const unsigned char *src_ptr, unsigned short *output_ptr, unsigned int src_pixels_per_line, unsigned int pixel_step, unsigned int output_height, unsigned int output_width, short *vp7_filter);
+extern void filter_block1d8_v6_sse2(const short *src_ptr, unsigned char *output_ptr, unsigned int pixels_per_line, unsigned int pixel_step, unsigned int output_height, unsigned int output_width, short *vp7_filter);
+
+extern void vp9_filter_block2d_bil4x4_var_mmx
+(
+ const unsigned char *ref_ptr,
+ int ref_pixels_per_line,
+ const unsigned char *src_ptr,
+ int src_pixels_per_line,
+ const short *HFilter,
+ const short *VFilter,
+ int *sum,
+ unsigned int *sumsquared
+);
+
+extern unsigned int vp9_get4x4var_mmx
+(
+ const unsigned char *src_ptr,
+ int source_stride,
+ const unsigned char *ref_ptr,
+ int recon_stride,
+ unsigned int *SSE,
+ int *Sum
+);
+
+unsigned int vp9_get_mb_ss_sse2
+(
+ const short *src_ptr
+);
+unsigned int vp9_get16x16var_sse2
+(
+ const unsigned char *src_ptr,
+ int source_stride,
+ const unsigned char *ref_ptr,
+ int recon_stride,
+ unsigned int *SSE,
+ int *Sum
+);
+unsigned int vp9_get8x8var_sse2
+(
+ const unsigned char *src_ptr,
+ int source_stride,
+ const unsigned char *ref_ptr,
+ int recon_stride,
+ unsigned int *SSE,
+ int *Sum
+);
+void vp9_filter_block2d_bil_var_sse2
+(
+ const unsigned char *ref_ptr,
+ int ref_pixels_per_line,
+ const unsigned char *src_ptr,
+ int src_pixels_per_line,
+ unsigned int Height,
+ int xoffset,
+ int yoffset,
+ int *sum,
+ unsigned int *sumsquared
+);
+void vp9_half_horiz_vert_variance8x_h_sse2
+(
+ const unsigned char *ref_ptr,
+ int ref_pixels_per_line,
+ const unsigned char *src_ptr,
+ int src_pixels_per_line,
+ unsigned int Height,
+ int *sum,
+ unsigned int *sumsquared
+);
+void vp9_half_horiz_vert_variance16x_h_sse2
+(
+ const unsigned char *ref_ptr,
+ int ref_pixels_per_line,
+ const unsigned char *src_ptr,
+ int src_pixels_per_line,
+ unsigned int Height,
+ int *sum,
+ unsigned int *sumsquared
+);
+void vp9_half_horiz_variance8x_h_sse2
+(
+ const unsigned char *ref_ptr,
+ int ref_pixels_per_line,
+ const unsigned char *src_ptr,
+ int src_pixels_per_line,
+ unsigned int Height,
+ int *sum,
+ unsigned int *sumsquared
+);
+void vp9_half_horiz_variance16x_h_sse2
+(
+ const unsigned char *ref_ptr,
+ int ref_pixels_per_line,
+ const unsigned char *src_ptr,
+ int src_pixels_per_line,
+ unsigned int Height,
+ int *sum,
+ unsigned int *sumsquared
+);
+void vp9_half_vert_variance8x_h_sse2
+(
+ const unsigned char *ref_ptr,
+ int ref_pixels_per_line,
+ const unsigned char *src_ptr,
+ int src_pixels_per_line,
+ unsigned int Height,
+ int *sum,
+ unsigned int *sumsquared
+);
+void vp9_half_vert_variance16x_h_sse2
+(
+ const unsigned char *ref_ptr,
+ int ref_pixels_per_line,
+ const unsigned char *src_ptr,
+ int src_pixels_per_line,
+ unsigned int Height,
+ int *sum,
+ unsigned int *sumsquared
+);
+
+DECLARE_ALIGNED(16, extern short, vp9_bilinear_filters_mmx[16][8]);
+
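+/* Each variance helper returns the sum and sum-of-squares of the pixel
+ * differences; the value returned below is SSE - sum^2 / N, so the shift in
+ * each return statement is log2(N) for the block size: 4 for 4x4, 6 for 8x8,
+ * 7 for 16x8 and 8x16, 8 for 16x16. */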
+unsigned int vp9_variance4x4_wmt(
+ const unsigned char *src_ptr,
+ int source_stride,
+ const unsigned char *ref_ptr,
+ int recon_stride,
+ unsigned int *sse) {
+ unsigned int var;
+ int avg;
+
+ vp9_get4x4var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &var, &avg);
+ *sse = var;
+ return (var - ((avg * avg) >> 4));
+
+}
+
+unsigned int vp9_variance8x8_wmt
+(
+ const unsigned char *src_ptr,
+ int source_stride,
+ const unsigned char *ref_ptr,
+ int recon_stride,
+ unsigned int *sse) {
+ unsigned int var;
+ int avg;
+
+ vp9_get8x8var_sse2(src_ptr, source_stride, ref_ptr, recon_stride, &var, &avg);
+ *sse = var;
+ return (var - ((avg * avg) >> 6));
+
+}
+
+
+unsigned int vp9_variance16x16_wmt
+(
+ const unsigned char *src_ptr,
+ int source_stride,
+ const unsigned char *ref_ptr,
+ int recon_stride,
+ unsigned int *sse) {
+ unsigned int sse0;
+ int sum0;
+
+
+ vp9_get16x16var_sse2(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0);
+ *sse = sse0;
+ return (sse0 - ((sum0 * sum0) >> 8));
+}
+unsigned int vp9_mse16x16_wmt(
+ const unsigned char *src_ptr,
+ int source_stride,
+ const unsigned char *ref_ptr,
+ int recon_stride,
+ unsigned int *sse) {
+
+ unsigned int sse0;
+ int sum0;
+ vp9_get16x16var_sse2(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0);
+ *sse = sse0;
+ return sse0;
+
+}
+
+
+unsigned int vp9_variance16x8_wmt
+(
+ const unsigned char *src_ptr,
+ int source_stride,
+ const unsigned char *ref_ptr,
+ int recon_stride,
+ unsigned int *sse) {
+ unsigned int sse0, sse1, var;
+ int sum0, sum1, avg;
+
+ vp9_get8x8var_sse2(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0);
+ vp9_get8x8var_sse2(src_ptr + 8, source_stride, ref_ptr + 8, recon_stride, &sse1, &sum1);
+
+ var = sse0 + sse1;
+ avg = sum0 + sum1;
+ *sse = var;
+ return (var - ((avg * avg) >> 7));
+
+}
+
+unsigned int vp9_variance8x16_wmt
+(
+ const unsigned char *src_ptr,
+ int source_stride,
+ const unsigned char *ref_ptr,
+ int recon_stride,
+ unsigned int *sse) {
+ unsigned int sse0, sse1, var;
+ int sum0, sum1, avg;
+
+ vp9_get8x8var_sse2(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0);
+ vp9_get8x8var_sse2(src_ptr + 8 * source_stride, source_stride, ref_ptr + 8 * recon_stride, recon_stride, &sse1, &sum1);
+
+ var = sse0 + sse1;
+ avg = sum0 + sum1;
+ *sse = var;
+ return (var - ((avg * avg) >> 7));
+
+}
+
+unsigned int vp9_sub_pixel_variance4x4_wmt
+(
+ const unsigned char *src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ const unsigned char *dst_ptr,
+ int dst_pixels_per_line,
+ unsigned int *sse
+) {
+ int xsum;
+ unsigned int xxsum;
+ vp9_filter_block2d_bil4x4_var_mmx(
+ src_ptr, src_pixels_per_line,
+ dst_ptr, dst_pixels_per_line,
+ vp9_bilinear_filters_mmx[xoffset], vp9_bilinear_filters_mmx[yoffset],
+ &xsum, &xxsum
+ );
+ *sse = xxsum;
+ return (xxsum - ((xsum * xsum) >> 4));
+}
+
+
+unsigned int vp9_sub_pixel_variance8x8_wmt
+(
+ const unsigned char *src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ const unsigned char *dst_ptr,
+ int dst_pixels_per_line,
+ unsigned int *sse
+) {
+ int xsum;
+ unsigned int xxsum;
+
+ if (xoffset == HALFNDX && yoffset == 0) {
+ vp9_half_horiz_variance8x_h_sse2(
+ src_ptr, src_pixels_per_line,
+ dst_ptr, dst_pixels_per_line, 8,
+ &xsum, &xxsum);
+ } else if (xoffset == 0 && yoffset == HALFNDX) {
+ vp9_half_vert_variance8x_h_sse2(
+ src_ptr, src_pixels_per_line,
+ dst_ptr, dst_pixels_per_line, 8,
+ &xsum, &xxsum);
+ } else if (xoffset == HALFNDX && yoffset == HALFNDX) {
+ vp9_half_horiz_vert_variance8x_h_sse2(
+ src_ptr, src_pixels_per_line,
+ dst_ptr, dst_pixels_per_line, 8,
+ &xsum, &xxsum);
+ } else {
+ vp9_filter_block2d_bil_var_sse2(
+ src_ptr, src_pixels_per_line,
+ dst_ptr, dst_pixels_per_line, 8,
+ xoffset, yoffset,
+ &xsum, &xxsum);
+ }
+
+ *sse = xxsum;
+ return (xxsum - ((xsum * xsum) >> 6));
+}
+
+unsigned int vp9_sub_pixel_variance16x16_wmt
+(
+ const unsigned char *src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ const unsigned char *dst_ptr,
+ int dst_pixels_per_line,
+ unsigned int *sse
+) {
+ int xsum0, xsum1;
+ unsigned int xxsum0, xxsum1;
+
+
+ // Note: we could avoid these if statements if the calling function
+ // just called the appropriate function directly.
+ if (xoffset == HALFNDX && yoffset == 0) {
+ vp9_half_horiz_variance16x_h_sse2(
+ src_ptr, src_pixels_per_line,
+ dst_ptr, dst_pixels_per_line, 16,
+ &xsum0, &xxsum0);
+ } else if (xoffset == 0 && yoffset == HALFNDX) {
+ vp9_half_vert_variance16x_h_sse2(
+ src_ptr, src_pixels_per_line,
+ dst_ptr, dst_pixels_per_line, 16,
+ &xsum0, &xxsum0);
+ } else if (xoffset == HALFNDX && yoffset == HALFNDX) {
+ vp9_half_horiz_vert_variance16x_h_sse2(
+ src_ptr, src_pixels_per_line,
+ dst_ptr, dst_pixels_per_line, 16,
+ &xsum0, &xxsum0);
+ } else {
+ vp9_filter_block2d_bil_var_sse2(
+ src_ptr, src_pixels_per_line,
+ dst_ptr, dst_pixels_per_line, 16,
+ xoffset, yoffset,
+ &xsum0, &xxsum0
+ );
+
+ vp9_filter_block2d_bil_var_sse2(
+ src_ptr + 8, src_pixels_per_line,
+ dst_ptr + 8, dst_pixels_per_line, 16,
+ xoffset, yoffset,
+ &xsum1, &xxsum1
+ );
+ xsum0 += xsum1;
+ xxsum0 += xxsum1;
+ }
+
+ *sse = xxsum0;
+ return (xxsum0 - ((xsum0 * xsum0) >> 8));
+}
+
+unsigned int vp9_sub_pixel_mse16x16_wmt(
+ const unsigned char *src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ const unsigned char *dst_ptr,
+ int dst_pixels_per_line,
+ unsigned int *sse
+) {
+ vp9_sub_pixel_variance16x16_wmt(src_ptr, src_pixels_per_line, xoffset, yoffset, dst_ptr, dst_pixels_per_line, sse);
+ return *sse;
+}
+
+unsigned int vp9_sub_pixel_variance16x8_wmt
+(
+ const unsigned char *src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ const unsigned char *dst_ptr,
+ int dst_pixels_per_line,
+ unsigned int *sse
+
+) {
+ int xsum0, xsum1;
+ unsigned int xxsum0, xxsum1;
+
+ if (xoffset == HALFNDX && yoffset == 0) {
+ vp9_half_horiz_variance16x_h_sse2(
+ src_ptr, src_pixels_per_line,
+ dst_ptr, dst_pixels_per_line, 8,
+ &xsum0, &xxsum0);
+ } else if (xoffset == 0 && yoffset == HALFNDX) {
+ vp9_half_vert_variance16x_h_sse2(
+ src_ptr, src_pixels_per_line,
+ dst_ptr, dst_pixels_per_line, 8,
+ &xsum0, &xxsum0);
+ } else if (xoffset == HALFNDX && yoffset == HALFNDX) {
+ vp9_half_horiz_vert_variance16x_h_sse2(
+ src_ptr, src_pixels_per_line,
+ dst_ptr, dst_pixels_per_line, 8,
+ &xsum0, &xxsum0);
+ } else {
+ vp9_filter_block2d_bil_var_sse2(
+ src_ptr, src_pixels_per_line,
+ dst_ptr, dst_pixels_per_line, 8,
+ xoffset, yoffset,
+ &xsum0, &xxsum0);
+
+ vp9_filter_block2d_bil_var_sse2(
+ src_ptr + 8, src_pixels_per_line,
+ dst_ptr + 8, dst_pixels_per_line, 8,
+ xoffset, yoffset,
+ &xsum1, &xxsum1);
+ xsum0 += xsum1;
+ xxsum0 += xxsum1;
+ }
+
+ *sse = xxsum0;
+ return (xxsum0 - ((xsum0 * xsum0) >> 7));
+}
+
+unsigned int vp9_sub_pixel_variance8x16_wmt
+(
+ const unsigned char *src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ const unsigned char *dst_ptr,
+ int dst_pixels_per_line,
+ unsigned int *sse
+) {
+ int xsum;
+ unsigned int xxsum;
+
+ if (xoffset == HALFNDX && yoffset == 0) {
+ vp9_half_horiz_variance8x_h_sse2(
+ src_ptr, src_pixels_per_line,
+ dst_ptr, dst_pixels_per_line, 16,
+ &xsum, &xxsum);
+ } else if (xoffset == 0 && yoffset == HALFNDX) {
+ vp9_half_vert_variance8x_h_sse2(
+ src_ptr, src_pixels_per_line,
+ dst_ptr, dst_pixels_per_line, 16,
+ &xsum, &xxsum);
+ } else if (xoffset == HALFNDX && yoffset == HALFNDX) {
+ vp9_half_horiz_vert_variance8x_h_sse2(
+ src_ptr, src_pixels_per_line,
+ dst_ptr, dst_pixels_per_line, 16,
+ &xsum, &xxsum);
+ } else {
+ vp9_filter_block2d_bil_var_sse2(
+ src_ptr, src_pixels_per_line,
+ dst_ptr, dst_pixels_per_line, 16,
+ xoffset, yoffset,
+ &xsum, &xxsum);
+ }
+
+ *sse = xxsum;
+ return (xxsum - ((xsum * xsum) >> 7));
+}
+
+
+unsigned int vp9_variance_halfpixvar16x16_h_wmt(
+ const unsigned char *src_ptr,
+ int src_pixels_per_line,
+ const unsigned char *dst_ptr,
+ int dst_pixels_per_line,
+ unsigned int *sse) {
+ int xsum0;
+ unsigned int xxsum0;
+
+ vp9_half_horiz_variance16x_h_sse2(
+ src_ptr, src_pixels_per_line,
+ dst_ptr, dst_pixels_per_line, 16,
+ &xsum0, &xxsum0);
+
+ *sse = xxsum0;
+ return (xxsum0 - ((xsum0 * xsum0) >> 8));
+}
+
+
+unsigned int vp9_variance_halfpixvar16x16_v_wmt(
+ const unsigned char *src_ptr,
+ int src_pixels_per_line,
+ const unsigned char *dst_ptr,
+ int dst_pixels_per_line,
+ unsigned int *sse) {
+ int xsum0;
+ unsigned int xxsum0;
+ vp9_half_vert_variance16x_h_sse2(
+ src_ptr, src_pixels_per_line,
+ dst_ptr, dst_pixels_per_line, 16,
+ &xsum0, &xxsum0);
+
+ *sse = xxsum0;
+ return (xxsum0 - ((xsum0 * xsum0) >> 8));
+}
+
+
+unsigned int vp9_variance_halfpixvar16x16_hv_wmt(
+ const unsigned char *src_ptr,
+ int src_pixels_per_line,
+ const unsigned char *dst_ptr,
+ int dst_pixels_per_line,
+ unsigned int *sse) {
+ int xsum0;
+ unsigned int xxsum0;
+
+ vp9_half_horiz_vert_variance16x_h_sse2(
+ src_ptr, src_pixels_per_line,
+ dst_ptr, dst_pixels_per_line, 16,
+ &xsum0, &xxsum0);
+
+ *sse = xxsum0;
+ return (xxsum0 - ((xsum0 * xsum0) >> 8));
+}
--- /dev/null
+++ b/vp9/encoder/x86/variance_ssse3.c
@@ -1,0 +1,151 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "vpx_config.h"
+#include "vp9/encoder/variance.h"
+#include "vp9/common/pragmas.h"
+#include "vpx_ports/mem.h"
+
+#define HALFNDX 8
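+// HALFNDX is the half-pel offset index: the bilinear sub-pel offsets run
+// 0..15, so 8 selects the dedicated half-pixel variance kernels below.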
+
+extern unsigned int vp9_get16x16var_sse2
+(
+ const unsigned char *src_ptr,
+ int source_stride,
+ const unsigned char *ref_ptr,
+ int recon_stride,
+ unsigned int *SSE,
+ int *Sum
+);
+extern void vp9_half_horiz_vert_variance16x_h_sse2
+(
+ const unsigned char *ref_ptr,
+ int ref_pixels_per_line,
+ const unsigned char *src_ptr,
+ int src_pixels_per_line,
+ unsigned int Height,
+ int *sum,
+ unsigned int *sumsquared
+);
+extern void vp9_half_horiz_variance16x_h_sse2
+(
+ const unsigned char *ref_ptr,
+ int ref_pixels_per_line,
+ const unsigned char *src_ptr,
+ int src_pixels_per_line,
+ unsigned int Height,
+ int *sum,
+ unsigned int *sumsquared
+);
+extern void vp9_half_vert_variance16x_h_sse2
+(
+ const unsigned char *ref_ptr,
+ int ref_pixels_per_line,
+ const unsigned char *src_ptr,
+ int src_pixels_per_line,
+ unsigned int Height,
+ int *sum,
+ unsigned int *sumsquared
+);
+extern void vp9_filter_block2d_bil_var_ssse3
+(
+ const unsigned char *ref_ptr,
+ int ref_pixels_per_line,
+ const unsigned char *src_ptr,
+ int src_pixels_per_line,
+ unsigned int Height,
+ int xoffset,
+ int yoffset,
+ int *sum,
+ unsigned int *sumsquared
+);
+
+unsigned int vp9_sub_pixel_variance16x16_ssse3
+(
+ const unsigned char *src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ const unsigned char *dst_ptr,
+ int dst_pixels_per_line,
+ unsigned int *sse
+) {
+ int xsum0;
+ unsigned int xxsum0;
+
+ // Note: we could avoid these if statements if the calling function
+ // just called the appropriate function directly.
+ if (xoffset == HALFNDX && yoffset == 0) {
+ vp9_half_horiz_variance16x_h_sse2(
+ src_ptr, src_pixels_per_line,
+ dst_ptr, dst_pixels_per_line, 16,
+ &xsum0, &xxsum0);
+ } else if (xoffset == 0 && yoffset == HALFNDX) {
+ vp9_half_vert_variance16x_h_sse2(
+ src_ptr, src_pixels_per_line,
+ dst_ptr, dst_pixels_per_line, 16,
+ &xsum0, &xxsum0);
+ } else if (xoffset == HALFNDX && yoffset == HALFNDX) {
+ vp9_half_horiz_vert_variance16x_h_sse2(
+ src_ptr, src_pixels_per_line,
+ dst_ptr, dst_pixels_per_line, 16,
+ &xsum0, &xxsum0);
+ } else {
+ vp9_filter_block2d_bil_var_ssse3(
+ src_ptr, src_pixels_per_line,
+ dst_ptr, dst_pixels_per_line, 16,
+ xoffset, yoffset,
+ &xsum0, &xxsum0);
+ }
+
+ *sse = xxsum0;
+ return (xxsum0 - ((xsum0 * xsum0) >> 8));
+}
+
+unsigned int vp9_sub_pixel_variance16x8_ssse3
+(
+ const unsigned char *src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ const unsigned char *dst_ptr,
+ int dst_pixels_per_line,
+ unsigned int *sse
+
+) {
+ int xsum0;
+ unsigned int xxsum0;
+
+ if (xoffset == HALFNDX && yoffset == 0) {
+ vp9_half_horiz_variance16x_h_sse2(
+ src_ptr, src_pixels_per_line,
+ dst_ptr, dst_pixels_per_line, 8,
+ &xsum0, &xxsum0);
+ } else if (xoffset == 0 && yoffset == HALFNDX) {
+ vp9_half_vert_variance16x_h_sse2(
+ src_ptr, src_pixels_per_line,
+ dst_ptr, dst_pixels_per_line, 8,
+ &xsum0, &xxsum0);
+ } else if (xoffset == HALFNDX && yoffset == HALFNDX) {
+ vp9_half_horiz_vert_variance16x_h_sse2(
+ src_ptr, src_pixels_per_line,
+ dst_ptr, dst_pixels_per_line, 8,
+ &xsum0, &xxsum0);
+ } else {
+ vp9_filter_block2d_bil_var_ssse3(
+ src_ptr, src_pixels_per_line,
+ dst_ptr, dst_pixels_per_line, 8,
+ xoffset, yoffset,
+ &xsum0, &xxsum0);
+ }
+
+ *sse = xxsum0;
+ return (xxsum0 - ((xsum0 * xsum0) >> 7));
+}
--- /dev/null
+++ b/vp9/encoder/x86/x86_csystemdependent.c
@@ -1,0 +1,114 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "vpx_ports/config.h"
+#include "vpx_ports/x86.h"
+#include "vp9/encoder/variance.h"
+#include "vp9/encoder/onyx_int.h"
+
+
+#if HAVE_MMX
+void vp9_short_fdct8x4_mmx(short *input, short *output, int pitch) {
+ vp9_short_fdct4x4_mmx(input, output, pitch);
+ vp9_short_fdct4x4_mmx(input + 4, output + 16, pitch);
+}
+
+int vp9_mbblock_error_mmx_impl(short *coeff_ptr, short *dcoef_ptr, int dc);
+int vp9_mbblock_error_mmx(MACROBLOCK *mb, int dc) {
+ short *coeff_ptr = mb->block[0].coeff;
+ short *dcoef_ptr = mb->e_mbd.block[0].dqcoeff;
+ return vp9_mbblock_error_mmx_impl(coeff_ptr, dcoef_ptr, dc);
+}
+
+int vp9_mbuverror_mmx_impl(short *s_ptr, short *d_ptr);
+int vp9_mbuverror_mmx(MACROBLOCK *mb) {
+ short *s_ptr = &mb->coeff[256];
+ short *d_ptr = &mb->e_mbd.dqcoeff[256];
+ return vp9_mbuverror_mmx_impl(s_ptr, d_ptr);
+}
+
+void vp9_subtract_b_mmx_impl(unsigned char *z, int src_stride,
+ short *diff, unsigned char *predictor,
+ int pitch);
+void vp9_subtract_b_mmx(BLOCK *be, BLOCKD *bd, int pitch) {
+ unsigned char *z = *(be->base_src) + be->src;
+ unsigned int src_stride = be->src_stride;
+ short *diff = &be->src_diff[0];
+ unsigned char *predictor = &bd->predictor[0];
+ vp9_subtract_b_mmx_impl(z, src_stride, diff, predictor, pitch);
+}
+
+#endif
+
+#if HAVE_SSE2
+int vp9_mbblock_error_xmm_impl(short *coeff_ptr, short *dcoef_ptr, int dc);
+int vp9_mbblock_error_xmm(MACROBLOCK *mb, int dc) {
+ short *coeff_ptr = mb->block[0].coeff;
+ short *dcoef_ptr = mb->e_mbd.block[0].dqcoeff;
+ return vp9_mbblock_error_xmm_impl(coeff_ptr, dcoef_ptr, dc);
+}
+
+int vp9_mbuverror_xmm_impl(short *s_ptr, short *d_ptr);
+int vp9_mbuverror_xmm(MACROBLOCK *mb) {
+ short *s_ptr = &mb->coeff[256];
+ short *d_ptr = &mb->e_mbd.dqcoeff[256];
+ return vp9_mbuverror_xmm_impl(s_ptr, d_ptr);
+}
+
+void vp9_subtract_b_sse2_impl(unsigned char *z, int src_stride,
+ short *diff, unsigned char *predictor,
+ int pitch);
+void vp9_subtract_b_sse2(BLOCK *be, BLOCKD *bd, int pitch) {
+ unsigned char *z = *(be->base_src) + be->src;
+ unsigned int src_stride = be->src_stride;
+ short *diff = &be->src_diff[0];
+ unsigned char *predictor = &bd->predictor[0];
+ vp9_subtract_b_sse2_impl(z, src_stride, diff, predictor, pitch);
+}
+
+#endif
+
+void vp9_arch_x86_encoder_init(VP9_COMP *cpi) {
+#if CONFIG_RUNTIME_CPU_DETECT
+ int flags = x86_simd_caps();
+
+ /* Note:
+ *
+ * This platform can be built without runtime CPU detection as well. If
+ * you modify any of the function mappings present in this file, be sure
+ * to also update them in the static mappings (<arch>/filename_<arch>.h).
+ */
+
+ /* Override default functions with fastest ones for this CPU. */
+#if HAVE_SSE2
+ if (flags & HAS_SSE2) {
+ cpi->rtcd.temporal.apply = vp9_temporal_filter_apply_sse2;
+
+ }
+#endif
+
+#if HAVE_SSE3
+ if (flags & HAS_SSE3) {
+ cpi->rtcd.search.full_search = vp9_full_search_sadx3;
+ cpi->rtcd.search.diamond_search = vp9_diamond_search_sadx4;
+ cpi->rtcd.search.refining_search = vp9_refining_search_sadx4;
+ }
+#endif
+
+
+#if HAVE_SSE4_1
+ if (flags & HAS_SSE4_1) {
+ cpi->rtcd.search.full_search = vp9_full_search_sadx8;
+ }
+#endif
+
+#endif
+}
--- /dev/null
+++ b/vp9/exports_dec
@@ -1,0 +1,2 @@
+data vpx_codec_vp8_dx_algo
+text vpx_codec_vp8_dx
--- /dev/null
+++ b/vp9/exports_enc
@@ -1,0 +1,4 @@
+data vpx_codec_vp8_cx_algo
+text vpx_codec_vp8_cx
+data vpx_codec_vp8x_cx_algo
+text vpx_codec_vp8x_cx
--- /dev/null
+++ b/vp9/vp9_common.mk
@@ -1,0 +1,179 @@
+##
+## Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+##
+## Use of this source code is governed by a BSD-style license
+## that can be found in the LICENSE file in the root of the source
+## tree. An additional intellectual property rights grant can be found
+## in the file PATENTS. All contributing project authors may
+## be found in the AUTHORS file in the root of the source tree.
+##
+
+VP9_COMMON_SRCS-yes += vp9_common.mk
+VP9_COMMON_SRCS-yes += common/type_aliases.h
+VP9_COMMON_SRCS-yes += common/pragmas.h
+VP9_COMMON_SRCS-yes += common/ppflags.h
+VP9_COMMON_SRCS-yes += common/onyx.h
+VP9_COMMON_SRCS-yes += common/onyxd.h
+VP9_COMMON_SRCS-yes += common/alloccommon.c
+VP9_COMMON_SRCS-yes += common/asm_com_offsets.c
+VP9_COMMON_SRCS-yes += common/blockd.c
+VP9_COMMON_SRCS-yes += common/coefupdateprobs.h
+VP9_COMMON_SRCS-yes += common/debugmodes.c
+VP9_COMMON_SRCS-yes += common/entropy.c
+VP9_COMMON_SRCS-yes += common/entropymode.c
+VP9_COMMON_SRCS-yes += common/entropymv.c
+VP9_COMMON_SRCS-yes += common/extend.c
+VP9_COMMON_SRCS-yes += common/filter.c
+VP9_COMMON_SRCS-yes += common/filter.h
+VP9_COMMON_SRCS-yes += common/findnearmv.c
+VP9_COMMON_SRCS-yes += common/generic/systemdependent.c
+VP9_COMMON_SRCS-yes += common/idctllm.c
+VP9_COMMON_SRCS-yes += common/alloccommon.h
+VP9_COMMON_SRCS-yes += common/blockd.h
+VP9_COMMON_SRCS-yes += common/common.h
+VP9_COMMON_SRCS-yes += common/common_types.h
+VP9_COMMON_SRCS-yes += common/entropy.h
+VP9_COMMON_SRCS-yes += common/entropymode.h
+VP9_COMMON_SRCS-yes += common/entropymv.h
+VP9_COMMON_SRCS-yes += common/extend.h
+VP9_COMMON_SRCS-yes += common/findnearmv.h
+VP9_COMMON_SRCS-yes += common/header.h
+VP9_COMMON_SRCS-yes += common/idct.h
+VP9_COMMON_SRCS-yes += common/invtrans.h
+VP9_COMMON_SRCS-yes += common/loopfilter.h
+VP9_COMMON_SRCS-yes += common/modecont.h
+VP9_COMMON_SRCS-yes += common/mv.h
+VP9_COMMON_SRCS-yes += common/onyxc_int.h
+VP9_COMMON_SRCS-yes += common/pred_common.h
+VP9_COMMON_SRCS-yes += common/pred_common.c
+VP9_COMMON_SRCS-yes += common/quant_common.h
+VP9_COMMON_SRCS-yes += common/reconinter.h
+VP9_COMMON_SRCS-yes += common/reconintra.h
+VP9_COMMON_SRCS-yes += common/reconintra4x4.h
+VP9_COMMON_SRCS-yes += common/rtcd.c
+VP9_COMMON_SRCS-yes += common/rtcd_defs.sh
+VP9_COMMON_SRCS-yes += common/sadmxn.h
+VP9_COMMON_SRCS-yes += common/seg_common.h
+VP9_COMMON_SRCS-yes += common/seg_common.c
+VP9_COMMON_SRCS-yes += common/setupintrarecon.h
+VP9_COMMON_SRCS-yes += common/subpixel.h
+VP9_COMMON_SRCS-yes += common/swapyv12buffer.h
+VP9_COMMON_SRCS-yes += common/systemdependent.h
+VP9_COMMON_SRCS-yes += common/treecoder.h
+VP9_COMMON_SRCS-yes += common/invtrans.c
+VP9_COMMON_SRCS-yes += common/loopfilter.c
+VP9_COMMON_SRCS-yes += common/loopfilter_filters.c
+VP9_COMMON_SRCS-yes += common/mbpitch.c
+VP9_COMMON_SRCS-yes += common/modecont.c
+VP9_COMMON_SRCS-yes += common/modecontext.c
+VP9_COMMON_SRCS-yes += common/mvref_common.c
+VP9_COMMON_SRCS-yes += common/mvref_common.h
+VP9_COMMON_SRCS-yes += common/quant_common.c
+VP9_COMMON_SRCS-yes += common/recon.c
+VP9_COMMON_SRCS-yes += common/reconinter.c
+VP9_COMMON_SRCS-yes += common/reconintra.c
+VP9_COMMON_SRCS-yes += common/reconintra4x4.c
+VP9_COMMON_SRCS-yes += common/setupintrarecon.c
+VP9_COMMON_SRCS-yes += common/swapyv12buffer.c
+VP9_COMMON_SRCS-$(CONFIG_POSTPROC_VISUALIZER) += common/textblit.c
+VP9_COMMON_SRCS-yes += common/treecoder.c
+VP9_COMMON_SRCS-$(CONFIG_IMPLICIT_SEGMENTATION) += common/implicit_segmentation.c
+
+VP9_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/idct_x86.h
+VP9_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/subpixel_x86.h
+VP9_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/loopfilter_x86.h
+VP9_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/postproc_x86.h
+VP9_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/x86_systemdependent.c
+VP9_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/vp8_asm_stubs.c
+VP9_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/loopfilter_x86.c
+VP9_COMMON_SRCS-$(CONFIG_POSTPROC) += common/postproc.h
+VP9_COMMON_SRCS-$(CONFIG_POSTPROC) += common/postproc.c
+VP9_COMMON_SRCS-$(HAVE_MMX) += common/x86/idctllm_mmx.asm
+VP9_COMMON_SRCS-$(HAVE_MMX) += common/x86/iwalsh_mmx.asm
+VP9_COMMON_SRCS-$(HAVE_MMX) += common/x86/recon_mmx.asm
+VP9_COMMON_SRCS-$(HAVE_MMX) += common/x86/subpixel_mmx.asm
+VP9_COMMON_SRCS-$(HAVE_MMX) += common/x86/loopfilter_mmx.asm
+VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/idctllm_sse2.asm
+VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/recon_sse2.asm
+VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/recon_wrapper_sse2.c
+VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/subpixel_sse2.asm
+VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/loopfilter_sse2.asm
+VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/iwalsh_sse2.asm
+VP9_COMMON_SRCS-$(HAVE_SSSE3) += common/x86/subpixel_8t_ssse3.asm
+VP9_COMMON_SRCS-$(HAVE_SSSE3) += common/x86/subpixel_ssse3.asm
+ifeq ($(CONFIG_POSTPROC),yes)
+VP9_COMMON_SRCS-$(HAVE_MMX) += common/x86/postproc_mmx.asm
+VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/postproc_sse2.asm
+endif
+
+# common (c)
+ifeq ($(CONFIG_CSM),yes)
+VP9_COMMON_SRCS-yes += common/maskingmv.c
+VP9_COMMON_SRCS-$(HAVE_SSE3) += common/x86/mask_sse3.asm
+endif
+
+VP9_COMMON_SRCS-$(HAVE_SSE4_1) += common/x86/filter_sse4.c
+ifeq ($(HAVE_SSE4_1),yes)
+vp9/common/x86/filter_sse4.c.o: CFLAGS += -msse4
+endif
+
+VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/filter_sse2.c
+VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/sadmxn_x86.c
+ifeq ($(HAVE_SSE2),yes)
+vp9/common/x86/filter_sse2.c.o: CFLAGS += -msse2
+vp9/common/x86/loopfilter_x86.c.o: CFLAGS += -msse2
+vp9/common/x86/sadmxn_x86.c.o: CFLAGS += -msse2
+endif
+
+VP9_COMMON_SRCS-$(ARCH_ARM) += common/arm/arm_systemdependent.c
+VP9_COMMON_SRCS-$(ARCH_ARM) += common/arm/bilinearfilter_arm.c
+VP9_COMMON_SRCS-$(ARCH_ARM) += common/arm/bilinearfilter_arm.h
+VP9_COMMON_SRCS-$(ARCH_ARM) += common/arm/filter_arm.c
+VP9_COMMON_SRCS-$(ARCH_ARM) += common/arm/idct_arm.h
+VP9_COMMON_SRCS-$(ARCH_ARM) += common/arm/loopfilter_arm.c
+VP9_COMMON_SRCS-$(ARCH_ARM) += common/arm/loopfilter_arm.h
+VP9_COMMON_SRCS-$(ARCH_ARM) += common/arm/recon_arm.h
+VP9_COMMON_SRCS-$(ARCH_ARM) += common/arm/reconintra_arm.c
+VP9_COMMON_SRCS-$(ARCH_ARM) += common/arm/subpixel_arm.h
+
+# common (armv6)
+VP9_COMMON_SRCS-$(HAVE_ARMV6) += common/arm/armv6/bilinearfilter_v6$(ASM)
+VP9_COMMON_SRCS-$(HAVE_ARMV6) += common/arm/armv6/copymem8x4_v6$(ASM)
+VP9_COMMON_SRCS-$(HAVE_ARMV6) += common/arm/armv6/copymem8x8_v6$(ASM)
+VP9_COMMON_SRCS-$(HAVE_ARMV6) += common/arm/armv6/copymem16x16_v6$(ASM)
+VP9_COMMON_SRCS-$(HAVE_ARMV6) += common/arm/armv6/dc_only_idct_add_v6$(ASM)
+VP9_COMMON_SRCS-$(HAVE_ARMV6) += common/arm/armv6/iwalsh_v6$(ASM)
+VP9_COMMON_SRCS-$(HAVE_ARMV6) += common/arm/armv6/filter_v6$(ASM)
+VP9_COMMON_SRCS-$(HAVE_ARMV6) += common/arm/armv6/idct_v6$(ASM)
+VP9_COMMON_SRCS-$(HAVE_ARMV6) += common/arm/armv6/loopfilter_v6$(ASM)
+VP9_COMMON_SRCS-$(HAVE_ARMV6) += common/arm/armv6/recon_v6$(ASM)
+VP9_COMMON_SRCS-$(HAVE_ARMV6) += common/arm/armv6/simpleloopfilter_v6$(ASM)
+VP9_COMMON_SRCS-$(HAVE_ARMV6) += common/arm/armv6/sixtappredict8x4_v6$(ASM)
+
+# common (neon)
+VP9_COMMON_SRCS-$(HAVE_ARMV7) += common/arm/neon/bilinearpredict4x4_neon$(ASM)
+VP9_COMMON_SRCS-$(HAVE_ARMV7) += common/arm/neon/bilinearpredict8x4_neon$(ASM)
+VP9_COMMON_SRCS-$(HAVE_ARMV7) += common/arm/neon/bilinearpredict8x8_neon$(ASM)
+VP9_COMMON_SRCS-$(HAVE_ARMV7) += common/arm/neon/bilinearpredict16x16_neon$(ASM)
+VP9_COMMON_SRCS-$(HAVE_ARMV7) += common/arm/neon/copymem8x4_neon$(ASM)
+VP9_COMMON_SRCS-$(HAVE_ARMV7) += common/arm/neon/copymem8x8_neon$(ASM)
+VP9_COMMON_SRCS-$(HAVE_ARMV7) += common/arm/neon/copymem16x16_neon$(ASM)
+VP9_COMMON_SRCS-$(HAVE_ARMV7) += common/arm/neon/dc_only_idct_add_neon$(ASM)
+VP9_COMMON_SRCS-$(HAVE_ARMV7) += common/arm/neon/iwalsh_neon$(ASM)
+VP9_COMMON_SRCS-$(HAVE_ARMV7) += common/arm/neon/loopfilter_neon$(ASM)
+VP9_COMMON_SRCS-$(HAVE_ARMV7) += common/arm/neon/loopfiltersimplehorizontaledge_neon$(ASM)
+VP9_COMMON_SRCS-$(HAVE_ARMV7) += common/arm/neon/loopfiltersimpleverticaledge_neon$(ASM)
+VP9_COMMON_SRCS-$(HAVE_ARMV7) += common/arm/neon/mbloopfilter_neon$(ASM)
+VP9_COMMON_SRCS-$(HAVE_ARMV7) += common/arm/neon/recon2b_neon$(ASM)
+VP9_COMMON_SRCS-$(HAVE_ARMV7) += common/arm/neon/recon4b_neon$(ASM)
+VP9_COMMON_SRCS-$(HAVE_ARMV7) += common/arm/neon/reconb_neon$(ASM)
+VP9_COMMON_SRCS-$(HAVE_ARMV7) += common/arm/neon/shortidct4x4llm_1_neon$(ASM)
+VP9_COMMON_SRCS-$(HAVE_ARMV7) += common/arm/neon/shortidct4x4llm_neon$(ASM)
+VP9_COMMON_SRCS-$(HAVE_ARMV7) += common/arm/neon/sixtappredict4x4_neon$(ASM)
+VP9_COMMON_SRCS-$(HAVE_ARMV7) += common/arm/neon/sixtappredict8x4_neon$(ASM)
+VP9_COMMON_SRCS-$(HAVE_ARMV7) += common/arm/neon/sixtappredict8x8_neon$(ASM)
+VP9_COMMON_SRCS-$(HAVE_ARMV7) += common/arm/neon/sixtappredict16x16_neon$(ASM)
+VP9_COMMON_SRCS-$(HAVE_ARMV7) += common/arm/neon/recon16x16mb_neon$(ASM)
+VP9_COMMON_SRCS-$(HAVE_ARMV7) += common/arm/neon/buildintrapredictorsmby_neon$(ASM)
+VP9_COMMON_SRCS-$(HAVE_ARMV7) += common/arm/neon/save_neon_reg$(ASM)
+VP9_COMMON_SRCS-$(HAVE_ARMV7) += common/arm/neon/recon_neon.c
--- /dev/null
+++ b/vp9/vp9_cx_iface.c
@@ -1,0 +1,1169 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "vpx/vpx_codec.h"
+#include "vpx/internal/vpx_codec_internal.h"
+#include "vpx_version.h"
+#include "vp9/encoder/onyx_int.h"
+#include "vpx/vp8e.h"
+#include "vp9/encoder/firstpass.h"
+#include "vp9/common/onyx.h"
+#include <stdlib.h>
+#include <string.h>
+
+/* This value is a sentinel for determining whether the user has set a mode
+ * directly through the deprecated VP8E_SET_ENCODING_MODE control.
+ */
+#define NO_MODE_SET 255
+
+struct vp8_extracfg {
+ struct vpx_codec_pkt_list *pkt_list;
+ vp8e_encoding_mode encoding_mode; /** best, good, realtime */
+ int cpu_used; /** available cpu percentage in 1/16 */
+ unsigned int enable_auto_alt_ref; /** if encoder decides to use an alternate reference frame */
+ unsigned int noise_sensitivity;
+ unsigned int Sharpness;
+ unsigned int static_thresh;
+ unsigned int token_partitions;
+ unsigned int arnr_max_frames; /* alt_ref Noise Reduction Max Frame Count */
+ unsigned int arnr_strength; /* alt_ref Noise Reduction Strength */
+ unsigned int arnr_type; /* alt_ref filter type */
+ unsigned int experimental;
+ vp8e_tuning tuning;
+ unsigned int cq_level; /* constrained quality level */
+ unsigned int rc_max_intra_bitrate_pct;
+
+};
+
+struct extraconfig_map {
+ int usage;
+ struct vp8_extracfg cfg;
+};
+
+static const struct extraconfig_map extracfg_map[] = {
+ {
+ 0,
+ {
+ NULL,
+ VP8_BEST_QUALITY_ENCODING, /* Encoding Mode */
+ 0, /* cpu_used */
+ 0, /* enable_auto_alt_ref */
+ 0, /* noise_sensitivity */
+ 0, /* Sharpness */
+ 0, /* static_thresh */
+ VP8_ONE_TOKENPARTITION, /* token_partitions */
+ 0, /* arnr_max_frames */
+ 3, /* arnr_strength */
+ 3, /* arnr_type*/
+ 0, /* experimental mode */
+ 0, /* tuning*/
+ 10, /* cq_level */
+ 0, /* rc_max_intra_bitrate_pct */
+ }
+ }
+};
+
+struct vpx_codec_alg_priv {
+ vpx_codec_priv_t base;
+ vpx_codec_enc_cfg_t cfg;
+ struct vp8_extracfg vp8_cfg;
+ VP9_CONFIG oxcf;
+ VP9_PTR cpi;
+ unsigned char *cx_data;
+ unsigned int cx_data_sz;
+ vpx_image_t preview_img;
+ unsigned int next_frame_flag;
+ vp8_postproc_cfg_t preview_ppcfg;
+ vpx_codec_pkt_list_decl(64) pkt_list; // changed to accommodate the maximum number of lagged frames allowed
+ int deprecated_mode;
+ unsigned int fixed_kf_cntr;
+};
+
+
+static vpx_codec_err_t
+update_error_state(vpx_codec_alg_priv_t *ctx,
+ const struct vpx_internal_error_info *error) {
+ vpx_codec_err_t res;
+
+ if ((res = error->error_code))
+ ctx->base.err_detail = error->has_detail
+ ? error->detail
+ : NULL;
+
+ return res;
+}
+
+
+#undef ERROR
+#define ERROR(str) do {\
+ ctx->base.err_detail = str;\
+ return VPX_CODEC_INVALID_PARAM;\
+ } while(0)
+
+#define RANGE_CHECK(p,memb,lo,hi) do {\
+ if(!(((p)->memb == lo || (p)->memb > (lo)) && (p)->memb <= hi)) \
+ ERROR(#memb " out of range ["#lo".."#hi"]");\
+ } while(0)
+
+#define RANGE_CHECK_HI(p,memb,hi) do {\
+ if(!((p)->memb <= (hi))) \
+ ERROR(#memb " out of range [.."#hi"]");\
+ } while(0)
+
+#define RANGE_CHECK_LO(p,memb,lo) do {\
+ if(!((p)->memb >= (lo))) \
+ ERROR(#memb " out of range ["#lo"..]");\
+ } while(0)
+
+#define RANGE_CHECK_BOOL(p,memb) do {\
+ if(!!((p)->memb) != (p)->memb) ERROR(#memb " expected boolean");\
+ } while(0)
+
+static vpx_codec_err_t validate_config(vpx_codec_alg_priv_t *ctx,
+ const vpx_codec_enc_cfg_t *cfg,
+ const struct vp8_extracfg *vp8_cfg) {
+ RANGE_CHECK(cfg, g_w, 1, 16383); /* 14 bits available */
+ RANGE_CHECK(cfg, g_h, 1, 16383); /* 14 bits available */
+ RANGE_CHECK(cfg, g_timebase.den, 1, 1000000000);
+ RANGE_CHECK(cfg, g_timebase.num, 1, cfg->g_timebase.den);
+ RANGE_CHECK_HI(cfg, g_profile, 3);
+ RANGE_CHECK_HI(cfg, rc_max_quantizer, 63);
+ RANGE_CHECK_HI(cfg, rc_min_quantizer, cfg->rc_max_quantizer);
+ RANGE_CHECK_HI(cfg, g_threads, 64);
+ RANGE_CHECK_HI(cfg, g_lag_in_frames, MAX_LAG_BUFFERS);
+ RANGE_CHECK(cfg, rc_end_usage, VPX_VBR, VPX_CQ);
+ RANGE_CHECK_HI(cfg, rc_undershoot_pct, 1000);
+ RANGE_CHECK_HI(cfg, rc_overshoot_pct, 1000);
+ RANGE_CHECK_HI(cfg, rc_2pass_vbr_bias_pct, 100);
+ RANGE_CHECK(cfg, kf_mode, VPX_KF_DISABLED, VPX_KF_AUTO);
+ // RANGE_CHECK_BOOL(cfg, g_delete_firstpassfile);
+ RANGE_CHECK_BOOL(cfg, rc_resize_allowed);
+ RANGE_CHECK_HI(cfg, rc_dropframe_thresh, 100);
+ RANGE_CHECK_HI(cfg, rc_resize_up_thresh, 100);
+ RANGE_CHECK_HI(cfg, rc_resize_down_thresh, 100);
+ RANGE_CHECK(cfg, g_pass, VPX_RC_ONE_PASS, VPX_RC_LAST_PASS);
+
+ /* VP8 does not support a lower bound on the keyframe interval in
+ * automatic keyframe placement mode.
+ */
+ if (cfg->kf_mode != VPX_KF_DISABLED && cfg->kf_min_dist != cfg->kf_max_dist
+ && cfg->kf_min_dist > 0)
+ ERROR("kf_min_dist not supported in auto mode, use 0 "
+ "or kf_max_dist instead.");
+
+ RANGE_CHECK_BOOL(vp8_cfg, enable_auto_alt_ref);
+ RANGE_CHECK(vp8_cfg, cpu_used, -16, 16);
+
+ RANGE_CHECK(vp8_cfg, encoding_mode, VP8_BEST_QUALITY_ENCODING, VP8_REAL_TIME_ENCODING);
+ RANGE_CHECK_HI(vp8_cfg, noise_sensitivity, 6);
+
+ RANGE_CHECK(vp8_cfg, token_partitions, VP8_ONE_TOKENPARTITION, VP8_EIGHT_TOKENPARTITION);
+ RANGE_CHECK_HI(vp8_cfg, Sharpness, 7);
+ RANGE_CHECK(vp8_cfg, arnr_max_frames, 0, 15);
+ RANGE_CHECK_HI(vp8_cfg, arnr_strength, 6);
+ RANGE_CHECK(vp8_cfg, arnr_type, 1, 3);
+ RANGE_CHECK(vp8_cfg, cq_level, 0, 63);
+
+ if (cfg->g_pass == VPX_RC_LAST_PASS) {
+ size_t packet_sz = sizeof(FIRSTPASS_STATS);
+ int n_packets = cfg->rc_twopass_stats_in.sz / packet_sz;
+ FIRSTPASS_STATS *stats;
+
+ if (!cfg->rc_twopass_stats_in.buf)
+ ERROR("rc_twopass_stats_in.buf not set.");
+
+ if (cfg->rc_twopass_stats_in.sz % packet_sz)
+ ERROR("rc_twopass_stats_in.sz indicates truncated packet.");
+
+ if (cfg->rc_twopass_stats_in.sz < 2 * packet_sz)
+ ERROR("rc_twopass_stats_in requires at least two packets.");
+
+ stats = (void *)((char *)cfg->rc_twopass_stats_in.buf
+ + (n_packets - 1) * packet_sz);
+
+ if ((int)(stats->count + 0.5) != n_packets - 1)
+ ERROR("rc_twopass_stats_in missing EOS stats packet");
+ }
+
+ return VPX_CODEC_OK;
+}
+
+
+static vpx_codec_err_t validate_img(vpx_codec_alg_priv_t *ctx,
+ const vpx_image_t *img) {
+ switch (img->fmt) {
+ case VPX_IMG_FMT_YV12:
+ case VPX_IMG_FMT_I420:
+ case VPX_IMG_FMT_VPXI420:
+ case VPX_IMG_FMT_VPXYV12:
+ break;
+ default:
+ ERROR("Invalid image format. Only YV12 and I420 images are supported");
+ }
+
+ if ((img->d_w != ctx->cfg.g_w) || (img->d_h != ctx->cfg.g_h))
+ ERROR("Image size must match encoder init configuration size");
+
+ return VPX_CODEC_OK;
+}
+
+
+static vpx_codec_err_t set_vp8e_config(VP9_CONFIG *oxcf,
+ vpx_codec_enc_cfg_t cfg,
+ struct vp8_extracfg vp8_cfg) {
+ oxcf->Version = cfg.g_profile;
+ oxcf->Version |= vp8_cfg.experimental ? 0x4 : 0;
+
+ oxcf->Width = cfg.g_w;
+ oxcf->Height = cfg.g_h;
+ /* derive the frame rate from the timebase; if it is out of whack, fall back to 30 */
+ oxcf->frame_rate = (double)(cfg.g_timebase.den) / (double)(cfg.g_timebase.num);
+
+ if (oxcf->frame_rate > 180) {
+ oxcf->frame_rate = 30;
+ }
+
+ switch (cfg.g_pass) {
+ case VPX_RC_ONE_PASS:
+ oxcf->Mode = MODE_BESTQUALITY;
+ break;
+ case VPX_RC_FIRST_PASS:
+ oxcf->Mode = MODE_FIRSTPASS;
+ break;
+ case VPX_RC_LAST_PASS:
+ oxcf->Mode = MODE_SECONDPASS_BEST;
+ break;
+ }
+
+ if (cfg.g_pass == VPX_RC_FIRST_PASS) {
+ oxcf->allow_lag = 0;
+ oxcf->lag_in_frames = 0;
+ } else {
+ oxcf->allow_lag = (cfg.g_lag_in_frames) > 0;
+ oxcf->lag_in_frames = cfg.g_lag_in_frames;
+ }
+
+ // VBR only supported for now.
+ // CBR code has been deprecated for the experimental phase.
+ // CQ mode not yet tested
+ oxcf->end_usage = USAGE_LOCAL_FILE_PLAYBACK;
+ /*if (cfg.rc_end_usage == VPX_CQ)
+ oxcf->end_usage = USAGE_CONSTRAINED_QUALITY;
+ else
+ oxcf->end_usage = USAGE_LOCAL_FILE_PLAYBACK;*/
+
+ oxcf->target_bandwidth = cfg.rc_target_bitrate;
+ oxcf->rc_max_intra_bitrate_pct = vp8_cfg.rc_max_intra_bitrate_pct;
+
+ oxcf->best_allowed_q = cfg.rc_min_quantizer;
+ oxcf->worst_allowed_q = cfg.rc_max_quantizer;
+ oxcf->cq_level = vp8_cfg.cq_level;
+ oxcf->fixed_q = -1;
+
+ oxcf->under_shoot_pct = cfg.rc_undershoot_pct;
+ oxcf->over_shoot_pct = cfg.rc_overshoot_pct;
+
+ oxcf->maximum_buffer_size = cfg.rc_buf_sz;
+ oxcf->starting_buffer_level = cfg.rc_buf_initial_sz;
+ oxcf->optimal_buffer_level = cfg.rc_buf_optimal_sz;
+
+ oxcf->two_pass_vbrbias = cfg.rc_2pass_vbr_bias_pct;
+ oxcf->two_pass_vbrmin_section = cfg.rc_2pass_vbr_minsection_pct;
+ oxcf->two_pass_vbrmax_section = cfg.rc_2pass_vbr_maxsection_pct;
+
+ oxcf->auto_key = cfg.kf_mode == VPX_KF_AUTO
+ && cfg.kf_min_dist != cfg.kf_max_dist;
+ // oxcf->kf_min_dist = cfg.kf_min_dis;
+ oxcf->key_freq = cfg.kf_max_dist;
+
+ // oxcf->delete_first_pass_file = cfg.g_delete_firstpassfile;
+ // strcpy(oxcf->first_pass_file, cfg.g_firstpass_file);
+
+ oxcf->cpu_used = vp8_cfg.cpu_used;
+ oxcf->encode_breakout = vp8_cfg.static_thresh;
+ oxcf->play_alternate = vp8_cfg.enable_auto_alt_ref;
+ oxcf->noise_sensitivity = vp8_cfg.noise_sensitivity;
+ oxcf->Sharpness = vp8_cfg.Sharpness;
+
+ oxcf->two_pass_stats_in = cfg.rc_twopass_stats_in;
+ oxcf->output_pkt_list = vp8_cfg.pkt_list;
+
+ oxcf->arnr_max_frames = vp8_cfg.arnr_max_frames;
+ oxcf->arnr_strength = vp8_cfg.arnr_strength;
+ oxcf->arnr_type = vp8_cfg.arnr_type;
+
+ oxcf->tuning = vp8_cfg.tuning;
+
+#if CONFIG_LOSSLESS
+ oxcf->lossless = cfg.lossless;
+#endif
+
+ /*
+ printf("Current VP8 Settings: \n");
+ printf("target_bandwidth: %d\n", oxcf->target_bandwidth);
+ printf("noise_sensitivity: %d\n", oxcf->noise_sensitivity);
+ printf("Sharpness: %d\n", oxcf->Sharpness);
+ printf("cpu_used: %d\n", oxcf->cpu_used);
+ printf("Mode: %d\n", oxcf->Mode);
+ printf("delete_first_pass_file: %d\n", oxcf->delete_first_pass_file);
+ printf("auto_key: %d\n", oxcf->auto_key);
+ printf("key_freq: %d\n", oxcf->key_freq);
+ printf("end_usage: %d\n", oxcf->end_usage);
+ printf("under_shoot_pct: %d\n", oxcf->under_shoot_pct);
+ printf("over_shoot_pct: %d\n", oxcf->over_shoot_pct);
+ printf("starting_buffer_level: %d\n", oxcf->starting_buffer_level);
+ printf("optimal_buffer_level: %d\n", oxcf->optimal_buffer_level);
+ printf("maximum_buffer_size: %d\n", oxcf->maximum_buffer_size);
+ printf("fixed_q: %d\n", oxcf->fixed_q);
+ printf("worst_allowed_q: %d\n", oxcf->worst_allowed_q);
+ printf("best_allowed_q: %d\n", oxcf->best_allowed_q);
+ printf("two_pass_vbrbias: %d\n", oxcf->two_pass_vbrbias);
+ printf("two_pass_vbrmin_section: %d\n", oxcf->two_pass_vbrmin_section);
+ printf("two_pass_vbrmax_section: %d\n", oxcf->two_pass_vbrmax_section);
+ printf("allow_lag: %d\n", oxcf->allow_lag);
+ printf("lag_in_frames: %d\n", oxcf->lag_in_frames);
+ printf("play_alternate: %d\n", oxcf->play_alternate);
+ printf("Version: %d\n", oxcf->Version);
+ printf("encode_breakout: %d\n", oxcf->encode_breakout);
+ */
+ return VPX_CODEC_OK;
+}
+
+static vpx_codec_err_t vp8e_set_config(vpx_codec_alg_priv_t *ctx,
+ const vpx_codec_enc_cfg_t *cfg) {
+ vpx_codec_err_t res;
+
+ if ((cfg->g_w != ctx->cfg.g_w) || (cfg->g_h != ctx->cfg.g_h))
+ ERROR("Cannot change width or height after initialization");
+
+ /* Prevent increasing lag_in_frames. This check is stricter than it needs
+ * to be -- the limit is not increasing past the first lag_in_frames
+ * value, but we don't track the initial config, only the last successful
+ * config.
+ */
+ if ((cfg->g_lag_in_frames > ctx->cfg.g_lag_in_frames))
+ ERROR("Cannot increase lag_in_frames");
+
+ res = validate_config(ctx, cfg, &ctx->vp8_cfg);
+
+ if (!res) {
+ ctx->cfg = *cfg;
+ set_vp8e_config(&ctx->oxcf, ctx->cfg, ctx->vp8_cfg);
+ vp9_change_config(ctx->cpi, &ctx->oxcf);
+ }
+
+ return res;
+}
+
+
+int vp9_reverse_trans(int q);
+
+
+static vpx_codec_err_t get_param(vpx_codec_alg_priv_t *ctx,
+ int ctrl_id,
+ va_list args) {
+ void *arg = va_arg(args, void *);
+
+#define MAP(id, var) case id: *(RECAST(id, arg)) = var; break
+
+ if (!arg)
+ return VPX_CODEC_INVALID_PARAM;
+
+ switch (ctrl_id) {
+ MAP(VP8E_GET_LAST_QUANTIZER, vp9_get_quantizer(ctx->cpi));
+ MAP(VP8E_GET_LAST_QUANTIZER_64,
+ vp9_reverse_trans(vp9_get_quantizer(ctx->cpi)));
+ }
+
+ return VPX_CODEC_OK;
+#undef MAP
+}
+
+
+static vpx_codec_err_t set_param(vpx_codec_alg_priv_t *ctx,
+ int ctrl_id,
+ va_list args) {
+ vpx_codec_err_t res = VPX_CODEC_OK;
+ struct vp8_extracfg xcfg = ctx->vp8_cfg;
+
+#define MAP(id, var) case id: var = CAST(id, args); break;
+
+ switch (ctrl_id) {
+ MAP(VP8E_SET_ENCODING_MODE, ctx->deprecated_mode);
+ MAP(VP8E_SET_CPUUSED, xcfg.cpu_used);
+ MAP(VP8E_SET_ENABLEAUTOALTREF, xcfg.enable_auto_alt_ref);
+ MAP(VP8E_SET_NOISE_SENSITIVITY, xcfg.noise_sensitivity);
+ MAP(VP8E_SET_SHARPNESS, xcfg.Sharpness);
+ MAP(VP8E_SET_STATIC_THRESHOLD, xcfg.static_thresh);
+ MAP(VP8E_SET_TOKEN_PARTITIONS, xcfg.token_partitions);
+
+ MAP(VP8E_SET_ARNR_MAXFRAMES, xcfg.arnr_max_frames);
+ MAP(VP8E_SET_ARNR_STRENGTH, xcfg.arnr_strength);
+ MAP(VP8E_SET_ARNR_TYPE, xcfg.arnr_type);
+ MAP(VP8E_SET_TUNING, xcfg.tuning);
+ MAP(VP8E_SET_CQ_LEVEL, xcfg.cq_level);
+ MAP(VP8E_SET_MAX_INTRA_BITRATE_PCT, xcfg.rc_max_intra_bitrate_pct);
+
+ }
+
+ res = validate_config(ctx, &ctx->cfg, &xcfg);
+
+ if (!res) {
+ ctx->vp8_cfg = xcfg;
+ set_vp8e_config(&ctx->oxcf, ctx->cfg, ctx->vp8_cfg);
+ vp9_change_config(ctx->cpi, &ctx->oxcf);
+ }
+
+ return res;
+#undef MAP
+}
+
+
+static vpx_codec_err_t vp8e_common_init(vpx_codec_ctx_t *ctx,
+ int experimental) {
+ vpx_codec_err_t res = VPX_DEC_OK;
+ struct vpx_codec_alg_priv *priv;
+ vpx_codec_enc_cfg_t *cfg;
+ unsigned int i;
+
+ VP9_PTR optr;
+
+ if (!ctx->priv) {
+ priv = calloc(1, sizeof(struct vpx_codec_alg_priv));
+
+ if (!priv) {
+ return VPX_CODEC_MEM_ERROR;
+ }
+
+ ctx->priv = &priv->base;
+ ctx->priv->sz = sizeof(*ctx->priv);
+ ctx->priv->iface = ctx->iface;
+ ctx->priv->alg_priv = priv;
+ ctx->priv->init_flags = ctx->init_flags;
+
+ if (ctx->config.enc) {
+ /* Update the reference to the config structure to an
+ * internal copy.
+ */
+ ctx->priv->alg_priv->cfg = *ctx->config.enc;
+ ctx->config.enc = &ctx->priv->alg_priv->cfg;
+ }
+
+ cfg = &ctx->priv->alg_priv->cfg;
+
+ /* Select the extra vp8 configuration table based on the current
+ * usage value. If the current usage value isn't found, use the
+ * values for usage case 0.
+ */
+ for (i = 0;
+ extracfg_map[i].usage && extracfg_map[i].usage != cfg->g_usage;
+ i++);
+
+ priv->vp8_cfg = extracfg_map[i].cfg;
+ priv->vp8_cfg.pkt_list = &priv->pkt_list.head;
+ priv->vp8_cfg.experimental = experimental;
+
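+ /* Size the output buffer at twice one uncompressed I420 frame
+ * (w * h * 3/2 bytes), with a 4 KB minimum applied below. */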
+ priv->cx_data_sz = priv->cfg.g_w * priv->cfg.g_h * 3 / 2 * 2;
+
+ if (priv->cx_data_sz < 4096) priv->cx_data_sz = 4096;
+
+ priv->cx_data = malloc(priv->cx_data_sz);
+
+ if (!priv->cx_data) {
+ return VPX_CODEC_MEM_ERROR;
+ }
+
+ priv->deprecated_mode = NO_MODE_SET;
+
+ vp9_initialize_enc();
+
+ res = validate_config(priv, &priv->cfg, &priv->vp8_cfg);
+
+ if (!res) {
+ set_vp8e_config(&ctx->priv->alg_priv->oxcf,
+ ctx->priv->alg_priv->cfg,
+ ctx->priv->alg_priv->vp8_cfg);
+ optr = vp9_create_compressor(&ctx->priv->alg_priv->oxcf);
+
+ if (!optr)
+ res = VPX_CODEC_MEM_ERROR;
+ else
+ ctx->priv->alg_priv->cpi = optr;
+ }
+ }
+
+ return res;
+}
+
+
+static vpx_codec_err_t vp8e_init(vpx_codec_ctx_t *ctx) {
+ return vp8e_common_init(ctx, 0);
+}
+
+
+#if CONFIG_EXPERIMENTAL
+static vpx_codec_err_t vp8e_exp_init(vpx_codec_ctx_t *ctx) {
+ return vp8e_common_init(ctx, 1);
+}
+#endif
+
+
+static vpx_codec_err_t vp8e_destroy(vpx_codec_alg_priv_t *ctx) {
+
+ free(ctx->cx_data);
+ vp9_remove_compressor(&ctx->cpi);
+ free(ctx);
+ return VPX_CODEC_OK;
+}
+
+static vpx_codec_err_t image2yuvconfig(const vpx_image_t *img,
+ YV12_BUFFER_CONFIG *yv12) {
+ vpx_codec_err_t res = VPX_CODEC_OK;
+ yv12->y_buffer = img->planes[VPX_PLANE_Y];
+ yv12->u_buffer = img->planes[VPX_PLANE_U];
+ yv12->v_buffer = img->planes[VPX_PLANE_V];
+
+ yv12->y_width = img->d_w;
+ yv12->y_height = img->d_h;
+ yv12->uv_width = (1 + yv12->y_width) / 2;
+ yv12->uv_height = (1 + yv12->y_height) / 2;
+
+ yv12->y_stride = img->stride[VPX_PLANE_Y];
+ yv12->uv_stride = img->stride[VPX_PLANE_U];
+
+ yv12->border = (img->stride[VPX_PLANE_Y] - img->w) / 2;
+ yv12->clrtype = (img->fmt == VPX_IMG_FMT_VPXI420 || img->fmt == VPX_IMG_FMT_VPXYV12); // REG_YUV = 0
+ return res;
+}
+
+static void pick_quickcompress_mode(vpx_codec_alg_priv_t *ctx,
+ unsigned long duration,
+ unsigned long deadline) {
+ unsigned int new_qc;
+
+ /* Use best quality mode if no deadline is given. */
+ if (deadline)
+ new_qc = MODE_GOODQUALITY;
+ else
+ new_qc = MODE_BESTQUALITY;
+
+ if (ctx->cfg.g_pass == VPX_RC_FIRST_PASS)
+ new_qc = MODE_FIRSTPASS;
+ else if (ctx->cfg.g_pass == VPX_RC_LAST_PASS)
+ new_qc = (new_qc == MODE_BESTQUALITY)
+ ? MODE_SECONDPASS_BEST
+ : MODE_SECONDPASS;
+
+ if (ctx->oxcf.Mode != new_qc) {
+ ctx->oxcf.Mode = new_qc;
+ vp9_change_config(ctx->cpi, &ctx->oxcf);
+ }
+}
+
+
+static vpx_codec_err_t vp8e_encode(vpx_codec_alg_priv_t *ctx,
+ const vpx_image_t *img,
+ vpx_codec_pts_t pts,
+ unsigned long duration,
+ vpx_enc_frame_flags_t flags,
+ unsigned long deadline) {
+ vpx_codec_err_t res = VPX_CODEC_OK;
+
+ if (img)
+ res = validate_img(ctx, img);
+
+ pick_quickcompress_mode(ctx, duration, deadline);
+ vpx_codec_pkt_list_init(&ctx->pkt_list);
+
+ /* Handle Flags */
+ if (((flags & VP8_EFLAG_NO_UPD_GF) && (flags & VP8_EFLAG_FORCE_GF))
+ || ((flags & VP8_EFLAG_NO_UPD_ARF) && (flags & VP8_EFLAG_FORCE_ARF))) {
+ ctx->base.err_detail = "Conflicting flags.";
+ return VPX_CODEC_INVALID_PARAM;
+ }
+
+ if (flags & (VP8_EFLAG_NO_REF_LAST | VP8_EFLAG_NO_REF_GF
+ | VP8_EFLAG_NO_REF_ARF)) {
+ int ref = 7;
+
+ if (flags & VP8_EFLAG_NO_REF_LAST)
+ ref ^= VP9_LAST_FLAG;
+
+ if (flags & VP8_EFLAG_NO_REF_GF)
+ ref ^= VP9_GOLD_FLAG;
+
+ if (flags & VP8_EFLAG_NO_REF_ARF)
+ ref ^= VP9_ALT_FLAG;
+
+ vp9_use_as_reference(ctx->cpi, ref);
+ }
+
+ if (flags & (VP8_EFLAG_NO_UPD_LAST | VP8_EFLAG_NO_UPD_GF
+ | VP8_EFLAG_NO_UPD_ARF | VP8_EFLAG_FORCE_GF
+ | VP8_EFLAG_FORCE_ARF)) {
+ int upd = 7;
+
+ if (flags & VP8_EFLAG_NO_UPD_LAST)
+ upd ^= VP9_LAST_FLAG;
+
+ if (flags & VP8_EFLAG_NO_UPD_GF)
+ upd ^= VP9_GOLD_FLAG;
+
+ if (flags & VP8_EFLAG_NO_UPD_ARF)
+ upd ^= VP9_ALT_FLAG;
+
+ vp9_update_reference(ctx->cpi, upd);
+ }
+
+ if (flags & VP8_EFLAG_NO_UPD_ENTROPY) {
+ vp9_update_entropy(ctx->cpi, 0);
+ }
+
+ /* Handle fixed keyframe intervals */
+ if (ctx->cfg.kf_mode == VPX_KF_AUTO
+ && ctx->cfg.kf_min_dist == ctx->cfg.kf_max_dist) {
+ if (++ctx->fixed_kf_cntr > ctx->cfg.kf_min_dist) {
+ flags |= VPX_EFLAG_FORCE_KF;
+ ctx->fixed_kf_cntr = 1;
+ }
+ }
+
+ /* Initialize the encoder instance on the first frame*/
+ if (!res && ctx->cpi) {
+ unsigned int lib_flags;
+ YV12_BUFFER_CONFIG sd;
+ int64_t dst_time_stamp, dst_end_time_stamp;
+ unsigned long size, cx_data_sz;
+ unsigned char *cx_data;
+
+ /* Set up internal flags */
+ if (ctx->base.init_flags & VPX_CODEC_USE_PSNR)
+ ((VP9_COMP *)ctx->cpi)->b_calculate_psnr = 1;
+
+ // if (ctx->base.init_flags & VPX_CODEC_USE_OUTPUT_PARTITION)
+ // ((VP9_COMP *)ctx->cpi)->output_partition = 1;
+
+ /* Convert API flags to internal codec lib flags */
+ lib_flags = (flags & VPX_EFLAG_FORCE_KF) ? FRAMEFLAGS_KEY : 0;
+
+ /* vp8 uses 10,000,000 ticks/second as its timestamp unit */
+ dst_time_stamp = pts * 10000000 * ctx->cfg.g_timebase.num / ctx->cfg.g_timebase.den;
+ dst_end_time_stamp = (pts + duration) * 10000000 * ctx->cfg.g_timebase.num / ctx->cfg.g_timebase.den;
+
+ if (img != NULL) {
+ res = image2yuvconfig(img, &sd);
+
+ if (vp9_receive_raw_frame(ctx->cpi, ctx->next_frame_flag | lib_flags,
+ &sd, dst_time_stamp, dst_end_time_stamp)) {
+ VP9_COMP *cpi = (VP9_COMP *)ctx->cpi;
+ res = update_error_state(ctx, &cpi->common.error);
+ }
+
+ /* reset for next frame */
+ ctx->next_frame_flag = 0;
+ }
+
+ cx_data = ctx->cx_data;
+ cx_data_sz = ctx->cx_data_sz;
+ lib_flags = 0;
+
+ while (cx_data_sz >= ctx->cx_data_sz / 2 &&
+ -1 != vp9_get_compressed_data(ctx->cpi, &lib_flags, &size,
+ cx_data, &dst_time_stamp,
+ &dst_end_time_stamp, !img)) {
+ if (size) {
+ vpx_codec_pts_t round, delta;
+ vpx_codec_cx_pkt_t pkt;
+ VP9_COMP *cpi = (VP9_COMP *)ctx->cpi;
+
+ /* Add the frame packet to the list of returned packets. */
+ round = 1000000 * ctx->cfg.g_timebase.num / 2 - 1;
+ delta = (dst_end_time_stamp - dst_time_stamp);
+ pkt.kind = VPX_CODEC_CX_FRAME_PKT;
+ pkt.data.frame.pts =
+ (dst_time_stamp * ctx->cfg.g_timebase.den + round)
+ / ctx->cfg.g_timebase.num / 10000000;
+ pkt.data.frame.duration =
+ (delta * ctx->cfg.g_timebase.den + round)
+ / ctx->cfg.g_timebase.num / 10000000;
+ pkt.data.frame.flags = lib_flags << 16;
+
+ if (lib_flags & FRAMEFLAGS_KEY)
+ pkt.data.frame.flags |= VPX_FRAME_IS_KEY;
+
+ if (!cpi->common.show_frame) {
+ pkt.data.frame.flags |= VPX_FRAME_IS_INVISIBLE;
+
+ // This timestamp should be as close as possible to the prior PTS
+ // so that a decoder scheduling presentation by pts processes this
+ // frame right after the last visible frame was decoded.
+ // Invisible frames have no duration.
+ pkt.data.frame.pts = ((cpi->last_time_stamp_seen
+ * ctx->cfg.g_timebase.den + round)
+ / ctx->cfg.g_timebase.num / 10000000) + 1;
+ pkt.data.frame.duration = 0;
+ }
+
+ if (cpi->droppable)
+ pkt.data.frame.flags |= VPX_FRAME_IS_DROPPABLE;
+
+ /*if (cpi->output_partition)
+ {
+ int i;
+ const int num_partitions = 1;
+
+ pkt.data.frame.flags |= VPX_FRAME_IS_FRAGMENT;
+
+ for (i = 0; i < num_partitions; ++i)
+ {
+ pkt.data.frame.buf = cx_data;
+ pkt.data.frame.sz = cpi->partition_sz[i];
+ pkt.data.frame.partition_id = i;
+ // don't set the fragment bit for the last partition
+ if (i == (num_partitions - 1))
+ pkt.data.frame.flags &= ~VPX_FRAME_IS_FRAGMENT;
+ vpx_codec_pkt_list_add(&ctx->pkt_list.head, &pkt);
+ cx_data += cpi->partition_sz[i];
+ cx_data_sz -= cpi->partition_sz[i];
+ }
+ }
+ else*/
+ {
+ pkt.data.frame.buf = cx_data;
+ pkt.data.frame.sz = size;
+ pkt.data.frame.partition_id = -1;
+ vpx_codec_pkt_list_add(&ctx->pkt_list.head, &pkt);
+ cx_data += size;
+ cx_data_sz -= size;
+ }
+
+ // printf("timestamp: %lld, duration: %d\n", pkt->data.frame.pts, pkt->data.frame.duration);
+ }
+ }
+ }
+
+ return res;
+}
+
+
+static const vpx_codec_cx_pkt_t *vp8e_get_cxdata(vpx_codec_alg_priv_t *ctx,
+ vpx_codec_iter_t *iter) {
+ return vpx_codec_pkt_list_get(&ctx->pkt_list.head, iter);
+}
+
+static vpx_codec_err_t vp8e_set_reference(vpx_codec_alg_priv_t *ctx,
+ int ctr_id,
+ va_list args) {
+ vpx_ref_frame_t *data = va_arg(args, vpx_ref_frame_t *);
+
+ if (data) {
+ vpx_ref_frame_t *frame = (vpx_ref_frame_t *)data;
+ YV12_BUFFER_CONFIG sd;
+
+ image2yuvconfig(&frame->img, &sd);
+ vp9_set_reference_enc(ctx->cpi, frame->frame_type, &sd);
+ return VPX_CODEC_OK;
+ } else
+ return VPX_CODEC_INVALID_PARAM;
+
+}
+
+static vpx_codec_err_t vp8e_get_reference(vpx_codec_alg_priv_t *ctx,
+ int ctr_id,
+ va_list args) {
+
+ vpx_ref_frame_t *data = va_arg(args, vpx_ref_frame_t *);
+
+ if (data) {
+ vpx_ref_frame_t *frame = (vpx_ref_frame_t *)data;
+ YV12_BUFFER_CONFIG sd;
+
+ image2yuvconfig(&frame->img, &sd);
+ vp9_get_reference_enc(ctx->cpi, frame->frame_type, &sd);
+ return VPX_CODEC_OK;
+ } else
+ return VPX_CODEC_INVALID_PARAM;
+}
+
+static vpx_codec_err_t vp8e_set_previewpp(vpx_codec_alg_priv_t *ctx,
+ int ctr_id,
+ va_list args) {
+#if CONFIG_POSTPROC
+ vp8_postproc_cfg_t *data = va_arg(args, vp8_postproc_cfg_t *);
+ (void)ctr_id;
+
+ if (data) {
+ ctx->preview_ppcfg = *((vp8_postproc_cfg_t *)data);
+ return VPX_CODEC_OK;
+ } else
+ return VPX_CODEC_INVALID_PARAM;
+#else
+ (void)ctx;
+ (void)ctr_id;
+ (void)args;
+ return VPX_CODEC_INCAPABLE;
+#endif
+}
+
+
+static vpx_image_t *vp8e_get_preview(vpx_codec_alg_priv_t *ctx) {
+
+ YV12_BUFFER_CONFIG sd;
+ vp9_ppflags_t flags = {0};
+
+ if (ctx->preview_ppcfg.post_proc_flag) {
+ flags.post_proc_flag = ctx->preview_ppcfg.post_proc_flag;
+ flags.deblocking_level = ctx->preview_ppcfg.deblocking_level;
+ flags.noise_level = ctx->preview_ppcfg.noise_level;
+ }
+
+ if (0 == vp9_get_preview_raw_frame(ctx->cpi, &sd, &flags)) {
+
+ /*
+ vpx_img_wrap(&ctx->preview_img, VPX_IMG_FMT_YV12,
+ sd.y_width + 2*VP8BORDERINPIXELS,
+ sd.y_height + 2*VP8BORDERINPIXELS,
+ 1,
+ sd.buffer_alloc);
+ vpx_img_set_rect(&ctx->preview_img,
+ VP8BORDERINPIXELS, VP8BORDERINPIXELS,
+ sd.y_width, sd.y_height);
+ */
+
+ ctx->preview_img.bps = 12;
+ ctx->preview_img.planes[VPX_PLANE_Y] = sd.y_buffer;
+ ctx->preview_img.planes[VPX_PLANE_U] = sd.u_buffer;
+ ctx->preview_img.planes[VPX_PLANE_V] = sd.v_buffer;
+
+ if (sd.clrtype == REG_YUV)
+ ctx->preview_img.fmt = VPX_IMG_FMT_I420;
+ else
+ ctx->preview_img.fmt = VPX_IMG_FMT_VPXI420;
+
+ ctx->preview_img.x_chroma_shift = 1;
+ ctx->preview_img.y_chroma_shift = 1;
+
+ ctx->preview_img.d_w = sd.y_width;
+ ctx->preview_img.d_h = sd.y_height;
+ ctx->preview_img.stride[VPX_PLANE_Y] = sd.y_stride;
+ ctx->preview_img.stride[VPX_PLANE_U] = sd.uv_stride;
+ ctx->preview_img.stride[VPX_PLANE_V] = sd.uv_stride;
+ ctx->preview_img.w = sd.y_width;
+ ctx->preview_img.h = sd.y_height;
+
+ return &ctx->preview_img;
+ } else
+ return NULL;
+}
+
+static vpx_codec_err_t vp8e_update_entropy(vpx_codec_alg_priv_t *ctx,
+ int ctr_id,
+ va_list args) {
+ int update = va_arg(args, int);
+ vp9_update_entropy(ctx->cpi, update);
+ return VPX_CODEC_OK;
+
+}
+
+static vpx_codec_err_t vp8e_update_reference(vpx_codec_alg_priv_t *ctx,
+ int ctr_id,
+ va_list args) {
+ int update = va_arg(args, int);
+ vp9_update_reference(ctx->cpi, update);
+ return VPX_CODEC_OK;
+}
+
+static vpx_codec_err_t vp8e_use_reference(vpx_codec_alg_priv_t *ctx,
+ int ctr_id,
+ va_list args) {
+ int reference_flag = va_arg(args, int);
+ vp9_use_as_reference(ctx->cpi, reference_flag);
+ return VPX_CODEC_OK;
+}
+
+static vpx_codec_err_t vp8e_set_roi_map(vpx_codec_alg_priv_t *ctx,
+ int ctr_id,
+ va_list args) {
+ vpx_roi_map_t *data = va_arg(args, vpx_roi_map_t *);
+
+ if (data) {
+ vpx_roi_map_t *roi = (vpx_roi_map_t *)data;
+
+ if (!vp9_set_roimap(ctx->cpi, roi->roi_map, roi->rows, roi->cols,
+ roi->delta_q, roi->delta_lf, roi->static_threshold))
+ return VPX_CODEC_OK;
+ else
+ return VPX_CODEC_INVALID_PARAM;
+ } else
+ return VPX_CODEC_INVALID_PARAM;
+}
+
+
+static vpx_codec_err_t vp8e_set_activemap(vpx_codec_alg_priv_t *ctx,
+ int ctr_id,
+ va_list args) {
+ vpx_active_map_t *data = va_arg(args, vpx_active_map_t *);
+
+ if (data) {
+
+ vpx_active_map_t *map = (vpx_active_map_t *)data;
+
+ if (!vp9_set_active_map(ctx->cpi, map->active_map, map->rows, map->cols))
+ return VPX_CODEC_OK;
+ else
+ return VPX_CODEC_INVALID_PARAM;
+ } else
+ return VPX_CODEC_INVALID_PARAM;
+}
+
+static vpx_codec_err_t vp8e_set_scalemode(vpx_codec_alg_priv_t *ctx,
+ int ctr_id,
+ va_list args) {
+
+ vpx_scaling_mode_t *data = va_arg(args, vpx_scaling_mode_t *);
+
+ if (data) {
+ int res;
+ vpx_scaling_mode_t scalemode = *(vpx_scaling_mode_t *)data;
+ res = vp9_set_internal_size(ctx->cpi, scalemode.h_scaling_mode,
+ scalemode.v_scaling_mode);
+
+ if (!res) {
+ /* force the next frame to be a key frame so the scaling mode takes effect */
+ ctx->next_frame_flag |= FRAMEFLAGS_KEY;
+ return VPX_CODEC_OK;
+ } else
+ return VPX_CODEC_INVALID_PARAM;
+ } else
+ return VPX_CODEC_INVALID_PARAM;
+}
+
+
+static vpx_codec_ctrl_fn_map_t vp8e_ctf_maps[] = {
+ {VP8_SET_REFERENCE, vp8e_set_reference},
+ {VP8_COPY_REFERENCE, vp8e_get_reference},
+ {VP8_SET_POSTPROC, vp8e_set_previewpp},
+ {VP8E_UPD_ENTROPY, vp8e_update_entropy},
+ {VP8E_UPD_REFERENCE, vp8e_update_reference},
+ {VP8E_USE_REFERENCE, vp8e_use_reference},
+ {VP8E_SET_ROI_MAP, vp8e_set_roi_map},
+ {VP8E_SET_ACTIVEMAP, vp8e_set_activemap},
+ {VP8E_SET_SCALEMODE, vp8e_set_scalemode},
+ {VP8E_SET_ENCODING_MODE, set_param},
+ {VP8E_SET_CPUUSED, set_param},
+ {VP8E_SET_NOISE_SENSITIVITY, set_param},
+ {VP8E_SET_ENABLEAUTOALTREF, set_param},
+ {VP8E_SET_SHARPNESS, set_param},
+ {VP8E_SET_STATIC_THRESHOLD, set_param},
+ {VP8E_SET_TOKEN_PARTITIONS, set_param},
+ {VP8E_GET_LAST_QUANTIZER, get_param},
+ {VP8E_GET_LAST_QUANTIZER_64, get_param},
+ {VP8E_SET_ARNR_MAXFRAMES, set_param},
+ {VP8E_SET_ARNR_STRENGTH, set_param},
+ {VP8E_SET_ARNR_TYPE, set_param},
+ {VP8E_SET_TUNING, set_param},
+ {VP8E_SET_CQ_LEVEL, set_param},
+ {VP8E_SET_MAX_INTRA_BITRATE_PCT, set_param},
+ { -1, NULL},
+};
+
+static vpx_codec_enc_cfg_map_t vp8e_usage_cfg_map[] = {
+ {
+ 0,
+ {
+ 0, /* g_usage */
+ 0, /* g_threads */
+ 0, /* g_profile */
+
+ 320, /* g_width */
+ 240, /* g_height */
+ {1, 30}, /* g_timebase */
+
+ 0, /* g_error_resilient */
+
+ VPX_RC_ONE_PASS, /* g_pass */
+
+ 0, /* g_lag_in_frames */
+
+ 0, /* rc_dropframe_thresh */
+ 0, /* rc_resize_allowed */
+        60,                 /* rc_resize_down_thresh */
+        30,                 /* rc_resize_up_thresh */
+
+ VPX_VBR, /* rc_end_usage */
+#if VPX_ENCODER_ABI_VERSION > (1 + VPX_CODEC_ABI_VERSION)
+ {0}, /* rc_twopass_stats_in */
+#endif
+ 256, /* rc_target_bandwidth */
+ 4, /* rc_min_quantizer */
+ 63, /* rc_max_quantizer */
+ 100, /* rc_undershoot_pct */
+ 100, /* rc_overshoot_pct */
+
+ 6000, /* rc_max_buffer_size */
+        4000,               /* rc_buffer_initial_size */
+        5000,               /* rc_buffer_optimal_size */
+
+ 50, /* rc_two_pass_vbrbias */
+ 0, /* rc_two_pass_vbrmin_section */
+ 400, /* rc_two_pass_vbrmax_section */
+
+ /* keyframing settings (kf) */
+        VPX_KF_AUTO,        /* kf_mode */
+ 0, /* kf_min_dist */
+ 9999, /* kf_max_dist */
+
+#if VPX_ENCODER_ABI_VERSION == (1 + VPX_CODEC_ABI_VERSION)
+ 1, /* g_delete_first_pass_file */
+ "vp8.fpf" /* first pass filename */
+#endif
+ }
+ },
+ { -1, {NOT_IMPLEMENTED}}
+};
+
+
+#ifndef VERSION_STRING
+#define VERSION_STRING
+#endif
+CODEC_INTERFACE(vpx_codec_vp8_cx) = {
+ "WebM Project VP8 Encoder" VERSION_STRING,
+ VPX_CODEC_INTERNAL_ABI_VERSION,
+ VPX_CODEC_CAP_ENCODER | VPX_CODEC_CAP_PSNR |
+ VPX_CODEC_CAP_OUTPUT_PARTITION,
+ /* vpx_codec_caps_t caps; */
+ vp8e_init, /* vpx_codec_init_fn_t init; */
+ vp8e_destroy, /* vpx_codec_destroy_fn_t destroy; */
+ vp8e_ctf_maps, /* vpx_codec_ctrl_fn_map_t *ctrl_maps; */
+ NOT_IMPLEMENTED, /* vpx_codec_get_mmap_fn_t get_mmap; */
+ NOT_IMPLEMENTED, /* vpx_codec_set_mmap_fn_t set_mmap; */
+ {
+ NOT_IMPLEMENTED, /* vpx_codec_peek_si_fn_t peek_si; */
+ NOT_IMPLEMENTED, /* vpx_codec_get_si_fn_t get_si; */
+ NOT_IMPLEMENTED, /* vpx_codec_decode_fn_t decode; */
+ NOT_IMPLEMENTED, /* vpx_codec_frame_get_fn_t frame_get; */
+ },
+ {
+    vp8e_usage_cfg_map,         /* vpx_codec_enc_cfg_map_t *cfg_maps; */
+    vp8e_encode,                /* vpx_codec_encode_fn_t encode; */
+    vp8e_get_cxdata,            /* vpx_codec_get_cx_data_fn_t get_cx_data; */
+ vp8e_set_config,
+ NOT_IMPLEMENTED,
+ vp8e_get_preview,
+ } /* encoder functions */
+};
+
+
+#if CONFIG_EXPERIMENTAL
+
+CODEC_INTERFACE(vpx_codec_vp8x_cx) = {
+ "VP8 Experimental Encoder" VERSION_STRING,
+ VPX_CODEC_INTERNAL_ABI_VERSION,
+ VPX_CODEC_CAP_ENCODER | VPX_CODEC_CAP_PSNR,
+ /* vpx_codec_caps_t caps; */
+ vp8e_exp_init, /* vpx_codec_init_fn_t init; */
+ vp8e_destroy, /* vpx_codec_destroy_fn_t destroy; */
+ vp8e_ctf_maps, /* vpx_codec_ctrl_fn_map_t *ctrl_maps; */
+ NOT_IMPLEMENTED, /* vpx_codec_get_mmap_fn_t get_mmap; */
+ NOT_IMPLEMENTED, /* vpx_codec_set_mmap_fn_t set_mmap; */
+ {
+ NOT_IMPLEMENTED, /* vpx_codec_peek_si_fn_t peek_si; */
+ NOT_IMPLEMENTED, /* vpx_codec_get_si_fn_t get_si; */
+ NOT_IMPLEMENTED, /* vpx_codec_decode_fn_t decode; */
+ NOT_IMPLEMENTED, /* vpx_codec_frame_get_fn_t frame_get; */
+ },
+ {
+    vp8e_usage_cfg_map,         /* vpx_codec_enc_cfg_map_t *cfg_maps; */
+    vp8e_encode,                /* vpx_codec_encode_fn_t encode; */
+    vp8e_get_cxdata,            /* vpx_codec_get_cx_data_fn_t get_cx_data; */
+ vp8e_set_config,
+ NOT_IMPLEMENTED,
+ vp8e_get_preview,
+ } /* encoder functions */
+};
+#endif
+
+
+/*
+ * BEGIN BACKWARDS COMPATIBILITY SHIM.
+ */
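+/* Stored in ctx->base.enc.tbd by VP8E_SET_FRAMETYPE to force the next
+ * encoded frame to be a key frame (see api1_control / api1_encode below).
+ */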
+#define FORCE_KEY 2
+static vpx_codec_err_t api1_control(vpx_codec_alg_priv_t *ctx,
+ int ctrl_id,
+ va_list args) {
+ vpx_codec_ctrl_fn_map_t *entry;
+
+ switch (ctrl_id) {
+ case VP8E_SET_FLUSHFLAG:
+ /* VP8 sample code did VP8E_SET_FLUSHFLAG followed by
+ * vpx_codec_get_cx_data() rather than vpx_codec_encode().
+ */
+ return vp8e_encode(ctx, NULL, 0, 0, 0, 0);
+ case VP8E_SET_FRAMETYPE:
+ ctx->base.enc.tbd |= FORCE_KEY;
+ return VPX_CODEC_OK;
+ }
+
+ for (entry = vp8e_ctf_maps; entry && entry->fn; entry++) {
+ if (!entry->ctrl_id || entry->ctrl_id == ctrl_id) {
+ return entry->fn(ctx, ctrl_id, args);
+ }
+ }
+
+ return VPX_CODEC_ERROR;
+}
+
+
+static vpx_codec_ctrl_fn_map_t api1_ctrl_maps[] = {
+ {0, api1_control},
+ { -1, NULL}
+};
+
+
+static vpx_codec_err_t api1_encode(vpx_codec_alg_priv_t *ctx,
+ const vpx_image_t *img,
+ vpx_codec_pts_t pts,
+ unsigned long duration,
+ vpx_enc_frame_flags_t flags,
+ unsigned long deadline) {
+ int force = ctx->base.enc.tbd;
+
+ ctx->base.enc.tbd = 0;
+ return vp8e_encode
+ (ctx,
+ img,
+ pts,
+ duration,
+ flags | ((force & FORCE_KEY) ? VPX_EFLAG_FORCE_KF : 0),
+ deadline);
+}
+
+
+vpx_codec_iface_t vpx_enc_vp8_algo = {
+ "WebM Project VP8 Encoder (Deprecated API)" VERSION_STRING,
+ VPX_CODEC_INTERNAL_ABI_VERSION,
+ VPX_CODEC_CAP_ENCODER,
+ /* vpx_codec_caps_t caps; */
+ vp8e_init, /* vpx_codec_init_fn_t init; */
+ vp8e_destroy, /* vpx_codec_destroy_fn_t destroy; */
+ api1_ctrl_maps, /* vpx_codec_ctrl_fn_map_t *ctrl_maps; */
+ NOT_IMPLEMENTED, /* vpx_codec_get_mmap_fn_t get_mmap; */
+ NOT_IMPLEMENTED, /* vpx_codec_set_mmap_fn_t set_mmap; */
+ {NOT_IMPLEMENTED}, /* decoder functions */
+ {
+    vp8e_usage_cfg_map,         /* vpx_codec_enc_cfg_map_t *cfg_maps; */
+    api1_encode,                /* vpx_codec_encode_fn_t encode; */
+    vp8e_get_cxdata,            /* vpx_codec_get_cx_data_fn_t get_cx_data; */
+ vp8e_set_config,
+ NOT_IMPLEMENTED,
+ vp8e_get_preview,
+ } /* encoder functions */
+};
--- /dev/null
+++ b/vp9/vp9_dx_iface.c
@@ -1,0 +1,717 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include <stdlib.h>
+#include <string.h>
+#include "vpx/vpx_decoder.h"
+#include "vpx/vp8dx.h"
+#include "vpx/internal/vpx_codec_internal.h"
+#include "vpx_version.h"
+#include "common/onyxd.h"
+#include "decoder/onyxd_int.h"
+
+#define VP8_CAP_POSTPROC (CONFIG_POSTPROC ? VPX_CODEC_CAP_POSTPROC : 0)
+typedef vpx_codec_stream_info_t vp8_stream_info_t;
+
+/* Structures for handling memory allocations */
+typedef enum {
+ VP8_SEG_ALG_PRIV = 256,
+ VP8_SEG_MAX
+} mem_seg_id_t;
+#define NELEMENTS(x) ((int)(sizeof(x)/sizeof(x[0])))
+
+static unsigned long vp8_priv_sz(const vpx_codec_dec_cfg_t *si, vpx_codec_flags_t);
+
+typedef struct {
+ unsigned int id;
+ unsigned long sz;
+ unsigned int align;
+ unsigned int flags;
+ unsigned long(*calc_sz)(const vpx_codec_dec_cfg_t *, vpx_codec_flags_t);
+} mem_req_t;
+
+static const mem_req_t vp8_mem_req_segs[] = {
+ {VP8_SEG_ALG_PRIV, 0, 8, VPX_CODEC_MEM_ZERO, vp8_priv_sz},
+ {VP8_SEG_MAX, 0, 0, 0, NULL}
+};
+
+struct vpx_codec_alg_priv {
+ vpx_codec_priv_t base;
+ vpx_codec_mmap_t mmaps[NELEMENTS(vp8_mem_req_segs) - 1];
+ vpx_codec_dec_cfg_t cfg;
+ vp8_stream_info_t si;
+ int defer_alloc;
+ int decoder_init;
+ VP9D_PTR pbi;
+ int postproc_cfg_set;
+ vp8_postproc_cfg_t postproc_cfg;
+#if CONFIG_POSTPROC_VISUALIZER
+ unsigned int dbg_postproc_flag;
+ int dbg_color_ref_frame_flag;
+ int dbg_color_mb_modes_flag;
+ int dbg_color_b_modes_flag;
+ int dbg_display_mv_flag;
+#endif
+ vpx_image_t img;
+ int img_setup;
+ int img_avail;
+};
+
+static unsigned long vp8_priv_sz(const vpx_codec_dec_cfg_t *si,
+ vpx_codec_flags_t flags) {
+ /* Although this declaration is constant, we can't use it in the requested
+ * segments list because we want to define the requested segments list
+ * before defining the private type (so that the number of memory maps is
+ * known)
+ */
+ (void)si;
+ return sizeof(vpx_codec_alg_priv_t);
+}
+
+
+static void vp8_mmap_dtor(vpx_codec_mmap_t *mmap) {
+ free(mmap->priv);
+}
+
+static vpx_codec_err_t vp8_mmap_alloc(vpx_codec_mmap_t *mmap) {
+ vpx_codec_err_t res;
+ unsigned int align;
+
+ align = mmap->align ? mmap->align - 1 : 0;
+
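+  /* Over-allocate by up to align extra bytes so the base pointer can be
+   * rounded up to the requested alignment below.
+   */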
+ if (mmap->flags & VPX_CODEC_MEM_ZERO)
+ mmap->priv = calloc(1, mmap->sz + align);
+ else
+ mmap->priv = malloc(mmap->sz + align);
+
+ res = (mmap->priv) ? VPX_CODEC_OK : VPX_CODEC_MEM_ERROR;
+ mmap->base = (void *)((((uintptr_t)mmap->priv) + align) & ~(uintptr_t)align);
+ mmap->dtor = vp8_mmap_dtor;
+ return res;
+}
+
+static vpx_codec_err_t vp8_validate_mmaps(const vp8_stream_info_t *si,
+ const vpx_codec_mmap_t *mmaps,
+ vpx_codec_flags_t init_flags) {
+ int i;
+ vpx_codec_err_t res = VPX_CODEC_OK;
+
+ for (i = 0; i < NELEMENTS(vp8_mem_req_segs) - 1; i++) {
+ /* Ensure the segment has been allocated */
+ if (!mmaps[i].base) {
+ res = VPX_CODEC_MEM_ERROR;
+ break;
+ }
+
+ /* Verify variable size segment is big enough for the current si. */
+ if (vp8_mem_req_segs[i].calc_sz) {
+ vpx_codec_dec_cfg_t cfg;
+
+ cfg.w = si->w;
+ cfg.h = si->h;
+
+ if (mmaps[i].sz < vp8_mem_req_segs[i].calc_sz(&cfg, init_flags)) {
+ res = VPX_CODEC_MEM_ERROR;
+ break;
+ }
+ }
+ }
+
+ return res;
+}
+
+static void vp8_init_ctx(vpx_codec_ctx_t *ctx, const vpx_codec_mmap_t *mmap) {
+ int i;
+
+ ctx->priv = mmap->base;
+ ctx->priv->sz = sizeof(*ctx->priv);
+ ctx->priv->iface = ctx->iface;
+ ctx->priv->alg_priv = mmap->base;
+
+ for (i = 0; i < NELEMENTS(ctx->priv->alg_priv->mmaps); i++)
+ ctx->priv->alg_priv->mmaps[i].id = vp8_mem_req_segs[i].id;
+
+ ctx->priv->alg_priv->mmaps[0] = *mmap;
+ ctx->priv->alg_priv->si.sz = sizeof(ctx->priv->alg_priv->si);
+ ctx->priv->init_flags = ctx->init_flags;
+
+ if (ctx->config.dec) {
+ /* Update the reference to the config structure to an internal copy. */
+ ctx->priv->alg_priv->cfg = *ctx->config.dec;
+ ctx->config.dec = &ctx->priv->alg_priv->cfg;
+ }
+}
+
+static void *mmap_lkup(vpx_codec_alg_priv_t *ctx, unsigned int id) {
+ int i;
+
+ for (i = 0; i < NELEMENTS(ctx->mmaps); i++)
+ if (ctx->mmaps[i].id == id)
+ return ctx->mmaps[i].base;
+
+ return NULL;
+}
+static void vp8_finalize_mmaps(vpx_codec_alg_priv_t *ctx) {
+ /* nothing to clean up */
+}
+
+static vpx_codec_err_t vp8_init(vpx_codec_ctx_t *ctx) {
+ vpx_codec_err_t res = VPX_CODEC_OK;
+
+ /* This function only allocates space for the vpx_codec_alg_priv_t
+ * structure. More memory may be required at the time the stream
+ * information becomes known.
+ */
+ if (!ctx->priv) {
+ vpx_codec_mmap_t mmap;
+
+ mmap.id = vp8_mem_req_segs[0].id;
+ mmap.sz = sizeof(vpx_codec_alg_priv_t);
+ mmap.align = vp8_mem_req_segs[0].align;
+ mmap.flags = vp8_mem_req_segs[0].flags;
+
+ res = vp8_mmap_alloc(&mmap);
+
+ if (!res) {
+ vp8_init_ctx(ctx, &mmap);
+
+ ctx->priv->alg_priv->defer_alloc = 1;
+ /*post processing level initialized to do nothing */
+ }
+ }
+
+ return res;
+}
+
+static vpx_codec_err_t vp8_destroy(vpx_codec_alg_priv_t *ctx) {
+ int i;
+
+ vp9_remove_decompressor(ctx->pbi);
+
+ for (i = NELEMENTS(ctx->mmaps) - 1; i >= 0; i--) {
+ if (ctx->mmaps[i].dtor)
+ ctx->mmaps[i].dtor(&ctx->mmaps[i]);
+ }
+
+ return VPX_CODEC_OK;
+}
+
+static vpx_codec_err_t vp8_peek_si(const uint8_t *data,
+ unsigned int data_sz,
+ vpx_codec_stream_info_t *si) {
+ vpx_codec_err_t res = VPX_CODEC_OK;
+
+ if (data + data_sz <= data)
+ res = VPX_CODEC_INVALID_PARAM;
+ else {
+    /* Parse the uncompressed part of the key frame header.
+     * 3 bytes: version, frame type and an offset
+     * 3 bytes: sync code (0x9d, 0x01, 0x2a)
+     * 4 bytes: image width and height in the lowest 14 bits
+     *          of each 2-byte value.
+     */
+ si->is_kf = 0;
+
+ if (data_sz >= 10 && !(data[0] & 0x01)) { /* I-Frame */
+ const uint8_t *c = data + 3;
+ si->is_kf = 1;
+
+ /* vet via sync code */
+ if (c[0] != 0x9d || c[1] != 0x01 || c[2] != 0x2a)
+ res = VPX_CODEC_UNSUP_BITSTREAM;
+
+ si->w = (c[3] | (c[4] << 8)) & 0x3fff;
+ si->h = (c[5] | (c[6] << 8)) & 0x3fff;
+
+ /*printf("w=%d, h=%d\n", si->w, si->h);*/
+ if (!(si->h | si->w))
+ res = VPX_CODEC_UNSUP_BITSTREAM;
+ } else
+ res = VPX_CODEC_UNSUP_BITSTREAM;
+ }
+
+ return res;
+
+}
+
+static vpx_codec_err_t vp8_get_si(vpx_codec_alg_priv_t *ctx,
+ vpx_codec_stream_info_t *si) {
+
+ unsigned int sz;
+
+ if (si->sz >= sizeof(vp8_stream_info_t))
+ sz = sizeof(vp8_stream_info_t);
+ else
+ sz = sizeof(vpx_codec_stream_info_t);
+
+ memcpy(si, &ctx->si, sz);
+ si->sz = sz;
+
+ return VPX_CODEC_OK;
+}
+
+
+static vpx_codec_err_t
+update_error_state(vpx_codec_alg_priv_t *ctx,
+ const struct vpx_internal_error_info *error) {
+ vpx_codec_err_t res;
+
+ if ((res = error->error_code))
+ ctx->base.err_detail = error->has_detail
+ ? error->detail
+ : NULL;
+
+ return res;
+}
+
+static void yuvconfig2image(vpx_image_t *img,
+ const YV12_BUFFER_CONFIG *yv12,
+ void *user_priv) {
+  /* vpx_img_wrap() doesn't allow specifying independent strides for
+   * the Y, U, and V planes, nor other alignment adjustments that
+   * might be representable by a YV12_BUFFER_CONFIG, so we just
+   * initialize all the fields.
+   */
+ img->fmt = yv12->clrtype == REG_YUV ?
+ VPX_IMG_FMT_I420 : VPX_IMG_FMT_VPXI420;
+ img->w = yv12->y_stride;
+ img->h = (yv12->y_height + 2 * VP8BORDERINPIXELS + 15) & ~15;
+ img->d_w = yv12->y_width;
+ img->d_h = yv12->y_height;
+ img->x_chroma_shift = 1;
+ img->y_chroma_shift = 1;
+ img->planes[VPX_PLANE_Y] = yv12->y_buffer;
+ img->planes[VPX_PLANE_U] = yv12->u_buffer;
+ img->planes[VPX_PLANE_V] = yv12->v_buffer;
+ img->planes[VPX_PLANE_ALPHA] = NULL;
+ img->stride[VPX_PLANE_Y] = yv12->y_stride;
+ img->stride[VPX_PLANE_U] = yv12->uv_stride;
+ img->stride[VPX_PLANE_V] = yv12->uv_stride;
+ img->stride[VPX_PLANE_ALPHA] = yv12->y_stride;
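+  /* 4:2:0 planar data averages 12 bits per pixel (8 luma + 4 chroma). */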
+ img->bps = 12;
+ img->user_priv = user_priv;
+ img->img_data = yv12->buffer_alloc;
+ img->img_data_owner = 0;
+ img->self_allocd = 0;
+}
+
+static vpx_codec_err_t vp8_decode(vpx_codec_alg_priv_t *ctx,
+ const uint8_t *data,
+ unsigned int data_sz,
+ void *user_priv,
+ long deadline) {
+ vpx_codec_err_t res = VPX_CODEC_OK;
+
+ ctx->img_avail = 0;
+
+ /* Determine the stream parameters. Note that we rely on peek_si to
+ * validate that we have a buffer that does not wrap around the top
+ * of the heap.
+ */
+ if (!ctx->si.h)
+ res = ctx->base.iface->dec.peek_si(data, data_sz, &ctx->si);
+
+
+ /* Perform deferred allocations, if required */
+ if (!res && ctx->defer_alloc) {
+ int i;
+
+ for (i = 1; !res && i < NELEMENTS(ctx->mmaps); i++) {
+ vpx_codec_dec_cfg_t cfg;
+
+ cfg.w = ctx->si.w;
+ cfg.h = ctx->si.h;
+ ctx->mmaps[i].id = vp8_mem_req_segs[i].id;
+ ctx->mmaps[i].sz = vp8_mem_req_segs[i].sz;
+ ctx->mmaps[i].align = vp8_mem_req_segs[i].align;
+ ctx->mmaps[i].flags = vp8_mem_req_segs[i].flags;
+
+ if (!ctx->mmaps[i].sz)
+ ctx->mmaps[i].sz = vp8_mem_req_segs[i].calc_sz(&cfg,
+ ctx->base.init_flags);
+
+ res = vp8_mmap_alloc(&ctx->mmaps[i]);
+ }
+
+ if (!res)
+ vp8_finalize_mmaps(ctx);
+
+ ctx->defer_alloc = 0;
+ }
+
+ /* Initialize the decoder instance on the first frame*/
+ if (!res && !ctx->decoder_init) {
+ res = vp8_validate_mmaps(&ctx->si, ctx->mmaps, ctx->base.init_flags);
+
+ if (!res) {
+ VP9D_CONFIG oxcf;
+ VP9D_PTR optr;
+
+ vp9_initialize_dec();
+
+ oxcf.Width = ctx->si.w;
+ oxcf.Height = ctx->si.h;
+ oxcf.Version = 9;
+ oxcf.postprocess = 0;
+ oxcf.max_threads = ctx->cfg.threads;
+ optr = vp9_create_decompressor(&oxcf);
+
+ /* If postprocessing was enabled by the application and a
+ * configuration has not been provided, default it.
+ */
+ if (!ctx->postproc_cfg_set
+ && (ctx->base.init_flags & VPX_CODEC_USE_POSTPROC)) {
+ ctx->postproc_cfg.post_proc_flag =
+ VP8_DEBLOCK | VP8_DEMACROBLOCK;
+ ctx->postproc_cfg.deblocking_level = 4;
+ ctx->postproc_cfg.noise_level = 0;
+ }
+
+ if (!optr)
+ res = VPX_CODEC_ERROR;
+ else
+ ctx->pbi = optr;
+ }
+
+ ctx->decoder_init = 1;
+ }
+
+ if (!res && ctx->pbi) {
+ YV12_BUFFER_CONFIG sd;
+ int64_t time_stamp = 0, time_end_stamp = 0;
+ vp9_ppflags_t flags = {0};
+
+ if (ctx->base.init_flags & VPX_CODEC_USE_POSTPROC) {
+ flags.post_proc_flag = ctx->postproc_cfg.post_proc_flag
+#if CONFIG_POSTPROC_VISUALIZER
+
+ | ((ctx->dbg_color_ref_frame_flag != 0) ? VP9D_DEBUG_CLR_FRM_REF_BLKS : 0)
+ | ((ctx->dbg_color_mb_modes_flag != 0) ? VP9D_DEBUG_CLR_BLK_MODES : 0)
+ | ((ctx->dbg_color_b_modes_flag != 0) ? VP9D_DEBUG_CLR_BLK_MODES : 0)
+ | ((ctx->dbg_display_mv_flag != 0) ? VP9D_DEBUG_DRAW_MV : 0)
+#endif
+;
+ flags.deblocking_level = ctx->postproc_cfg.deblocking_level;
+ flags.noise_level = ctx->postproc_cfg.noise_level;
+#if CONFIG_POSTPROC_VISUALIZER
+ flags.display_ref_frame_flag = ctx->dbg_color_ref_frame_flag;
+ flags.display_mb_modes_flag = ctx->dbg_color_mb_modes_flag;
+ flags.display_b_modes_flag = ctx->dbg_color_b_modes_flag;
+ flags.display_mv_flag = ctx->dbg_display_mv_flag;
+#endif
+ }
+
+ if (vp9_receive_compressed_data(ctx->pbi, data_sz, data, deadline)) {
+ VP9D_COMP *pbi = (VP9D_COMP *)ctx->pbi;
+ res = update_error_state(ctx, &pbi->common.error);
+ }
+
+ if (!res && 0 == vp9_get_raw_frame(ctx->pbi, &sd, &time_stamp,
+ &time_end_stamp, &flags)) {
+ yuvconfig2image(&ctx->img, &sd, user_priv);
+ ctx->img_avail = 1;
+ }
+ }
+
+ return res;
+}
+
+static vpx_image_t *vp8_get_frame(vpx_codec_alg_priv_t *ctx,
+ vpx_codec_iter_t *iter) {
+ vpx_image_t *img = NULL;
+
+ if (ctx->img_avail) {
+ /* iter acts as a flip flop, so an image is only returned on the first
+ * call to get_frame.
+ */
+ if (!(*iter)) {
+ img = &ctx->img;
+ *iter = img;
+ }
+ }
+
+ return img;
+}
+
+
+static
+vpx_codec_err_t vp8_xma_get_mmap(const vpx_codec_ctx_t *ctx,
+ vpx_codec_mmap_t *mmap,
+ vpx_codec_iter_t *iter) {
+ vpx_codec_err_t res;
+ const mem_req_t *seg_iter = *iter;
+
+ /* Get address of next segment request */
+ do {
+ if (!seg_iter)
+ seg_iter = vp8_mem_req_segs;
+ else if (seg_iter->id != VP8_SEG_MAX)
+ seg_iter++;
+
+ *iter = (vpx_codec_iter_t)seg_iter;
+
+ if (seg_iter->id != VP8_SEG_MAX) {
+ mmap->id = seg_iter->id;
+ mmap->sz = seg_iter->sz;
+ mmap->align = seg_iter->align;
+ mmap->flags = seg_iter->flags;
+
+ if (!seg_iter->sz)
+ mmap->sz = seg_iter->calc_sz(ctx->config.dec, ctx->init_flags);
+
+ res = VPX_CODEC_OK;
+ } else
+ res = VPX_CODEC_LIST_END;
+ } while (!mmap->sz && res != VPX_CODEC_LIST_END);
+
+ return res;
+}
+
+static vpx_codec_err_t vp8_xma_set_mmap(vpx_codec_ctx_t *ctx,
+ const vpx_codec_mmap_t *mmap) {
+ vpx_codec_err_t res = VPX_CODEC_MEM_ERROR;
+ int i, done;
+
+ if (!ctx->priv) {
+ if (mmap->id == VP8_SEG_ALG_PRIV) {
+ if (!ctx->priv) {
+ vp8_init_ctx(ctx, mmap);
+ res = VPX_CODEC_OK;
+ }
+ }
+ }
+
+ done = 1;
+
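+  /* Record the supplied segment, then check whether every required mmap
+   * is now in place.
+   */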
+ if (!res && ctx->priv->alg_priv) {
+ for (i = 0; i < NELEMENTS(ctx->priv->alg_priv->mmaps); i++) {
+ if (ctx->priv->alg_priv->mmaps[i].id == mmap->id)
+ if (!ctx->priv->alg_priv->mmaps[i].base) {
+ ctx->priv->alg_priv->mmaps[i] = *mmap;
+ res = VPX_CODEC_OK;
+ }
+
+ done &= (ctx->priv->alg_priv->mmaps[i].base != NULL);
+ }
+ }
+
+ if (done && !res) {
+ vp8_finalize_mmaps(ctx->priv->alg_priv);
+ res = ctx->iface->init(ctx);
+ }
+
+ return res;
+}
+
+static vpx_codec_err_t image2yuvconfig(const vpx_image_t *img,
+ YV12_BUFFER_CONFIG *yv12) {
+ vpx_codec_err_t res = VPX_CODEC_OK;
+ yv12->y_buffer = img->planes[VPX_PLANE_Y];
+ yv12->u_buffer = img->planes[VPX_PLANE_U];
+ yv12->v_buffer = img->planes[VPX_PLANE_V];
+
+ yv12->y_width = img->d_w;
+ yv12->y_height = img->d_h;
+ yv12->uv_width = yv12->y_width / 2;
+ yv12->uv_height = yv12->y_height / 2;
+
+ yv12->y_stride = img->stride[VPX_PLANE_Y];
+ yv12->uv_stride = img->stride[VPX_PLANE_U];
+
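+  /* Infer the frame border from the stride padding, assuming it is split
+   * evenly between the left and right edges.
+   */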
+ yv12->border = (img->stride[VPX_PLANE_Y] - img->d_w) / 2;
+ yv12->clrtype = (img->fmt == VPX_IMG_FMT_VPXI420 ||
+ img->fmt == VPX_IMG_FMT_VPXYV12);
+
+ return res;
+}
+
+
+static vpx_codec_err_t vp9_set_reference(vpx_codec_alg_priv_t *ctx,
+ int ctr_id,
+ va_list args) {
+
+ vpx_ref_frame_t *data = va_arg(args, vpx_ref_frame_t *);
+
+ if (data) {
+ vpx_ref_frame_t *frame = (vpx_ref_frame_t *)data;
+ YV12_BUFFER_CONFIG sd;
+
+ image2yuvconfig(&frame->img, &sd);
+
+ return vp9_set_reference_dec(ctx->pbi, frame->frame_type, &sd);
+ } else
+ return VPX_CODEC_INVALID_PARAM;
+
+}
+
+static vpx_codec_err_t vp9_get_reference(vpx_codec_alg_priv_t *ctx,
+ int ctr_id,
+ va_list args) {
+
+ vpx_ref_frame_t *data = va_arg(args, vpx_ref_frame_t *);
+
+ if (data) {
+ vpx_ref_frame_t *frame = (vpx_ref_frame_t *)data;
+ YV12_BUFFER_CONFIG sd;
+
+ image2yuvconfig(&frame->img, &sd);
+
+ return vp9_get_reference_dec(ctx->pbi, frame->frame_type, &sd);
+ } else
+ return VPX_CODEC_INVALID_PARAM;
+
+}
+
+static vpx_codec_err_t vp8_set_postproc(vpx_codec_alg_priv_t *ctx,
+ int ctr_id,
+ va_list args) {
+#if CONFIG_POSTPROC
+ vp8_postproc_cfg_t *data = va_arg(args, vp8_postproc_cfg_t *);
+
+ if (data) {
+ ctx->postproc_cfg_set = 1;
+ ctx->postproc_cfg = *((vp8_postproc_cfg_t *)data);
+ return VPX_CODEC_OK;
+ } else
+ return VPX_CODEC_INVALID_PARAM;
+
+#else
+ return VPX_CODEC_INCAPABLE;
+#endif
+}
+
+static vpx_codec_err_t vp8_set_dbg_options(vpx_codec_alg_priv_t *ctx,
+ int ctrl_id,
+ va_list args) {
+#if CONFIG_POSTPROC_VISUALIZER && CONFIG_POSTPROC
+ int data = va_arg(args, int);
+
+#define MAP(id, var) case id: var = data; break;
+
+ switch (ctrl_id) {
+ MAP(VP8_SET_DBG_COLOR_REF_FRAME, ctx->dbg_color_ref_frame_flag);
+ MAP(VP8_SET_DBG_COLOR_MB_MODES, ctx->dbg_color_mb_modes_flag);
+ MAP(VP8_SET_DBG_COLOR_B_MODES, ctx->dbg_color_b_modes_flag);
+ MAP(VP8_SET_DBG_DISPLAY_MV, ctx->dbg_display_mv_flag);
+ }
+
+ return VPX_CODEC_OK;
+#else
+ return VPX_CODEC_INCAPABLE;
+#endif
+}
+
+static vpx_codec_err_t vp8_get_last_ref_updates(vpx_codec_alg_priv_t *ctx,
+ int ctrl_id,
+ va_list args) {
+ int *update_info = va_arg(args, int *);
+ VP9D_COMP *pbi = (VP9D_COMP *)ctx->pbi;
+
+ if (update_info) {
+ *update_info = pbi->common.refresh_alt_ref_frame * (int) VP8_ALTR_FRAME
+ + pbi->common.refresh_golden_frame * (int) VP8_GOLD_FRAME
+ + pbi->common.refresh_last_frame * (int) VP8_LAST_FRAME;
+
+ return VPX_CODEC_OK;
+ } else
+ return VPX_CODEC_INVALID_PARAM;
+}
+
+
+static vpx_codec_err_t vp8_get_frame_corrupted(vpx_codec_alg_priv_t *ctx,
+ int ctrl_id,
+ va_list args) {
+
+ int *corrupted = va_arg(args, int *);
+
+ if (corrupted) {
+ VP9D_COMP *pbi = (VP9D_COMP *)ctx->pbi;
+ *corrupted = pbi->common.frame_to_show->corrupted;
+
+ return VPX_CODEC_OK;
+ } else
+ return VPX_CODEC_INVALID_PARAM;
+
+}
+
+static vpx_codec_ctrl_fn_map_t ctf_maps[] = {
+ {VP8_SET_REFERENCE, vp9_set_reference},
+ {VP8_COPY_REFERENCE, vp9_get_reference},
+ {VP8_SET_POSTPROC, vp8_set_postproc},
+ {VP8_SET_DBG_COLOR_REF_FRAME, vp8_set_dbg_options},
+ {VP8_SET_DBG_COLOR_MB_MODES, vp8_set_dbg_options},
+ {VP8_SET_DBG_COLOR_B_MODES, vp8_set_dbg_options},
+ {VP8_SET_DBG_DISPLAY_MV, vp8_set_dbg_options},
+ {VP8D_GET_LAST_REF_UPDATES, vp8_get_last_ref_updates},
+ {VP8D_GET_FRAME_CORRUPTED, vp8_get_frame_corrupted},
+ { -1, NULL},
+};
+
+
+#ifndef VERSION_STRING
+#define VERSION_STRING
+#endif
+CODEC_INTERFACE(vpx_codec_vp8_dx) = {
+ "WebM Project VP8 Decoder" VERSION_STRING,
+ VPX_CODEC_INTERNAL_ABI_VERSION,
+ VPX_CODEC_CAP_DECODER | VP8_CAP_POSTPROC |
+ VPX_CODEC_CAP_INPUT_PARTITION,
+ /* vpx_codec_caps_t caps; */
+ vp8_init, /* vpx_codec_init_fn_t init; */
+ vp8_destroy, /* vpx_codec_destroy_fn_t destroy; */
+ ctf_maps, /* vpx_codec_ctrl_fn_map_t *ctrl_maps; */
+ vp8_xma_get_mmap, /* vpx_codec_get_mmap_fn_t get_mmap; */
+ vp8_xma_set_mmap, /* vpx_codec_set_mmap_fn_t set_mmap; */
+ {
+ vp8_peek_si, /* vpx_codec_peek_si_fn_t peek_si; */
+ vp8_get_si, /* vpx_codec_get_si_fn_t get_si; */
+ vp8_decode, /* vpx_codec_decode_fn_t decode; */
+ vp8_get_frame, /* vpx_codec_frame_get_fn_t frame_get; */
+ },
+ {
+ /* encoder functions */
+ NOT_IMPLEMENTED,
+ NOT_IMPLEMENTED,
+ NOT_IMPLEMENTED,
+ NOT_IMPLEMENTED,
+ NOT_IMPLEMENTED,
+ NOT_IMPLEMENTED
+ }
+};
+
+/*
+ * BEGIN BACKWARDS COMPATIBILITY SHIM.
+ */
+vpx_codec_iface_t vpx_codec_vp8_algo = {
+ "WebM Project VP8 Decoder (Deprecated API)" VERSION_STRING,
+ VPX_CODEC_INTERNAL_ABI_VERSION,
+ VPX_CODEC_CAP_DECODER | VP8_CAP_POSTPROC,
+ /* vpx_codec_caps_t caps; */
+ vp8_init, /* vpx_codec_init_fn_t init; */
+ vp8_destroy, /* vpx_codec_destroy_fn_t destroy; */
+ ctf_maps, /* vpx_codec_ctrl_fn_map_t *ctrl_maps; */
+ vp8_xma_get_mmap, /* vpx_codec_get_mmap_fn_t get_mmap; */
+ vp8_xma_set_mmap, /* vpx_codec_set_mmap_fn_t set_mmap; */
+ {
+ vp8_peek_si, /* vpx_codec_peek_si_fn_t peek_si; */
+ vp8_get_si, /* vpx_codec_get_si_fn_t get_si; */
+ vp8_decode, /* vpx_codec_decode_fn_t decode; */
+ vp8_get_frame, /* vpx_codec_frame_get_fn_t frame_get; */
+ },
+ {
+ /* encoder functions */
+ NOT_IMPLEMENTED,
+ NOT_IMPLEMENTED,
+ NOT_IMPLEMENTED,
+ NOT_IMPLEMENTED,
+ NOT_IMPLEMENTED,
+ NOT_IMPLEMENTED
+ }
+};
--- /dev/null
+++ b/vp9/vp9cx.mk
@@ -1,0 +1,120 @@
+##
+## Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+##
+## Use of this source code is governed by a BSD-style license
+## that can be found in the LICENSE file in the root of the source
+## tree. An additional intellectual property rights grant can be found
+## in the file PATENTS. All contributing project authors may
+## be found in the AUTHORS file in the root of the source tree.
+##
+
+
+include $(SRC_PATH_BARE)/$(VP9_PREFIX)vp9_common.mk
+
+VP9_CX_EXPORTS += exports_enc
+
+VP9_CX_SRCS-yes += $(VP9_COMMON_SRCS-yes)
+VP9_CX_SRCS-no += $(VP9_COMMON_SRCS-no)
+VP9_CX_SRCS_REMOVE-yes += $(VP9_COMMON_SRCS_REMOVE-yes)
+VP9_CX_SRCS_REMOVE-no += $(VP9_COMMON_SRCS_REMOVE-no)
+
+ifeq ($(ARCH_ARM),yes)
+ include $(SRC_PATH_BARE)/$(VP9_PREFIX)vp9cx_arm.mk
+endif
+
+VP9_CX_SRCS-yes += vp9_cx_iface.c
+
+# encoder
+#INCLUDES += algo/vpx_common/vpx_mem/include
+#INCLUDES += common
+#INCLUDES += common
+#INCLUDES += common
+#INCLUDES += algo/vpx_ref/cpu_id/include
+#INCLUDES += common
+#INCLUDES += encoder
+
+VP9_CX_SRCS-yes += encoder/asm_enc_offsets.c
+VP9_CX_SRCS-yes += encoder/bitstream.c
+VP9_CX_SRCS-yes += encoder/boolhuff.c
+VP9_CX_SRCS-yes += encoder/dct.c
+VP9_CX_SRCS-yes += encoder/encodeframe.c
+VP9_CX_SRCS-yes += encoder/encodeintra.c
+VP9_CX_SRCS-yes += encoder/encodemb.c
+VP9_CX_SRCS-yes += encoder/encodemv.c
+VP9_CX_SRCS-yes += encoder/firstpass.c
+VP9_CX_SRCS-yes += encoder/generic/csystemdependent.c
+VP9_CX_SRCS-yes += encoder/block.h
+VP9_CX_SRCS-yes += encoder/boolhuff.h
+VP9_CX_SRCS-yes += encoder/bitstream.h
+VP9_CX_SRCS-yes += encoder/encodeintra.h
+VP9_CX_SRCS-yes += encoder/encodemb.h
+VP9_CX_SRCS-yes += encoder/encodemv.h
+VP9_CX_SRCS-yes += encoder/firstpass.h
+VP9_CX_SRCS-yes += encoder/lookahead.c
+VP9_CX_SRCS-yes += encoder/lookahead.h
+VP9_CX_SRCS-yes += encoder/mcomp.h
+VP9_CX_SRCS-yes += encoder/modecosts.h
+VP9_CX_SRCS-yes += encoder/onyx_int.h
+VP9_CX_SRCS-yes += encoder/psnr.h
+VP9_CX_SRCS-yes += encoder/quantize.h
+VP9_CX_SRCS-yes += encoder/ratectrl.h
+VP9_CX_SRCS-yes += encoder/rdopt.h
+VP9_CX_SRCS-yes += encoder/tokenize.h
+VP9_CX_SRCS-yes += encoder/treewriter.h
+VP9_CX_SRCS-yes += encoder/variance.h
+VP9_CX_SRCS-yes += encoder/mcomp.c
+VP9_CX_SRCS-yes += encoder/modecosts.c
+VP9_CX_SRCS-yes += encoder/onyx_if.c
+VP9_CX_SRCS-yes += encoder/picklpf.c
+VP9_CX_SRCS-yes += encoder/psnr.c
+VP9_CX_SRCS-yes += encoder/quantize.c
+VP9_CX_SRCS-yes += encoder/ratectrl.c
+VP9_CX_SRCS-yes += encoder/rdopt.c
+VP9_CX_SRCS-yes += encoder/sad_c.c
+VP9_CX_SRCS-yes += encoder/satd_c.c
+VP9_CX_SRCS-yes += encoder/segmentation.c
+VP9_CX_SRCS-yes += encoder/segmentation.h
+VP9_CX_SRCS-$(CONFIG_INTERNAL_STATS) += encoder/ssim.c
+VP9_CX_SRCS-yes += encoder/tokenize.c
+VP9_CX_SRCS-yes += encoder/treewriter.c
+VP9_CX_SRCS-yes += encoder/variance_c.c
+ifeq ($(CONFIG_POSTPROC),yes)
+VP9_CX_SRCS-$(CONFIG_INTERNAL_STATS) += common/postproc.h
+VP9_CX_SRCS-$(CONFIG_INTERNAL_STATS) += common/postproc.c
+endif
+VP9_CX_SRCS-yes += encoder/temporal_filter.c
+VP9_CX_SRCS-yes += encoder/temporal_filter.h
+VP9_CX_SRCS-yes += encoder/mbgraph.c
+VP9_CX_SRCS-yes += encoder/mbgraph.h
+
+
+VP9_CX_SRCS-$(ARCH_X86)$(ARCH_X86_64) += encoder/x86/mcomp_x86.h
+VP9_CX_SRCS-$(ARCH_X86)$(ARCH_X86_64) += encoder/x86/quantize_x86.h
+VP9_CX_SRCS-$(ARCH_X86)$(ARCH_X86_64) += encoder/x86/temporal_filter_x86.h
+VP9_CX_SRCS-$(ARCH_X86)$(ARCH_X86_64) += encoder/x86/x86_csystemdependent.c
+VP9_CX_SRCS-$(HAVE_MMX) += encoder/x86/variance_mmx.c
+VP9_CX_SRCS-$(HAVE_MMX) += encoder/x86/variance_impl_mmx.asm
+VP9_CX_SRCS-$(HAVE_MMX) += encoder/x86/sad_mmx.asm
+VP9_CX_SRCS-$(HAVE_MMX) += encoder/x86/dct_mmx.asm
+VP9_CX_SRCS-$(HAVE_MMX) += encoder/x86/subtract_mmx.asm
+VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/dct_sse2.asm
+VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/variance_sse2.c
+VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/variance_impl_sse2.asm
+VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/sad_sse2.asm
+VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/fwalsh_sse2.asm
+VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/quantize_sse2.asm
+VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/subtract_sse2.asm
+VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/temporal_filter_apply_sse2.asm
+VP9_CX_SRCS-$(HAVE_SSE3) += encoder/x86/sad_sse3.asm
+VP9_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/sad_ssse3.asm
+VP9_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/variance_ssse3.c
+VP9_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/variance_impl_ssse3.asm
+VP9_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/quantize_ssse3.asm
+VP9_CX_SRCS-$(HAVE_SSE4_1) += encoder/x86/sad_sse4.asm
+VP9_CX_SRCS-$(HAVE_SSE4_1) += encoder/x86/quantize_sse4.asm
+VP9_CX_SRCS-$(ARCH_X86)$(ARCH_X86_64) += encoder/x86/quantize_mmx.asm
+VP9_CX_SRCS-$(ARCH_X86)$(ARCH_X86_64) += encoder/x86/encodeopt.asm
+VP9_CX_SRCS-$(ARCH_X86_64) += encoder/x86/ssim_opt.asm
+
+
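+# Drop any sources that the platform-specific fragments asked to remove.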
+VP9_CX_SRCS-yes := $(filter-out $(VP9_CX_SRCS_REMOVE-yes),$(VP9_CX_SRCS-yes))
--- /dev/null
+++ b/vp9/vp9cx_arm.mk
@@ -1,0 +1,63 @@
+##
+## Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+##
+## Use of this source code is governed by a BSD-style license
+## that can be found in the LICENSE file in the root of the source
+## tree. An additional intellectual property rights grant can be found
+## in the file PATENTS. All contributing project authors may
+## be found in the AUTHORS file in the root of the source tree.
+##
+
+
+#VP9_CX_SRCS list is modified according to different platforms.
+
+#File list for arm
+# encoder
+VP9_CX_SRCS-$(ARCH_ARM) += encoder/arm/arm_csystemdependent.c
+
+VP9_CX_SRCS-$(ARCH_ARM) += encoder/arm/dct_arm.c
+VP9_CX_SRCS-$(ARCH_ARM) += encoder/arm/dct_arm.h
+VP9_CX_SRCS-$(ARCH_ARM) += encoder/arm/encodemb_arm.h
+VP9_CX_SRCS-$(ARCH_ARM) += encoder/arm/quantize_arm.c
+VP9_CX_SRCS-$(ARCH_ARM) += encoder/arm/quantize_arm.h
+VP9_CX_SRCS-$(ARCH_ARM) += encoder/arm/variance_arm.c
+VP9_CX_SRCS-$(ARCH_ARM) += encoder/arm/variance_arm.h
+
+#File list for armv5te
+# encoder
+VP9_CX_SRCS-$(HAVE_ARMV5TE) += encoder/arm/boolhuff_arm.c
+VP9_CX_SRCS_REMOVE-$(HAVE_ARMV5TE) += encoder/boolhuff.c
+VP9_CX_SRCS-$(HAVE_ARMV5TE) += encoder/arm/armv5te/boolhuff_armv5te$(ASM)
+VP9_CX_SRCS-$(HAVE_ARMV5TE) += encoder/arm/armv5te/vp8_packtokens_armv5$(ASM)
+VP9_CX_SRCS-$(HAVE_ARMV5TE) += encoder/arm/armv5te/vp8_packtokens_mbrow_armv5$(ASM)
+VP9_CX_SRCS-$(HAVE_ARMV5TE) += encoder/arm/armv5te/vp8_packtokens_partitions_armv5$(ASM)
+
+#File list for armv6
+# encoder
+VP9_CX_SRCS-$(HAVE_ARMV6) += encoder/arm/armv6/vp8_subtract_armv6$(ASM)
+VP9_CX_SRCS-$(HAVE_ARMV6) += encoder/arm/armv6/vp8_short_fdct4x4_armv6$(ASM)
+VP9_CX_SRCS-$(HAVE_ARMV6) += encoder/arm/armv6/vp8_fast_quantize_b_armv6$(ASM)
+VP9_CX_SRCS-$(HAVE_ARMV6) += encoder/arm/armv6/vp8_sad16x16_armv6$(ASM)
+VP9_CX_SRCS-$(HAVE_ARMV6) += encoder/arm/armv6/vp8_variance16x16_armv6$(ASM)
+VP9_CX_SRCS-$(HAVE_ARMV6) += encoder/arm/armv6/vp8_variance_halfpixvar16x16_h_armv6$(ASM)
+VP9_CX_SRCS-$(HAVE_ARMV6) += encoder/arm/armv6/vp8_variance_halfpixvar16x16_v_armv6$(ASM)
+VP9_CX_SRCS-$(HAVE_ARMV6) += encoder/arm/armv6/vp8_variance_halfpixvar16x16_hv_armv6$(ASM)
+VP9_CX_SRCS-$(HAVE_ARMV6) += encoder/arm/armv6/vp8_mse16x16_armv6$(ASM)
+VP9_CX_SRCS-$(HAVE_ARMV6) += encoder/arm/armv6/vp8_variance8x8_armv6$(ASM)
+VP9_CX_SRCS-$(HAVE_ARMV6) += encoder/arm/armv6/walsh_v6$(ASM)
+
+#File list for neon
+# encoder
+VP9_CX_SRCS-$(HAVE_ARMV7) += encoder/arm/neon/fastquantizeb_neon$(ASM)
+VP9_CX_SRCS-$(HAVE_ARMV7) += encoder/arm/neon/picklpf_arm.c
+VP9_CX_SRCS-$(HAVE_ARMV7) += encoder/arm/neon/sad8_neon$(ASM)
+VP9_CX_SRCS-$(HAVE_ARMV7) += encoder/arm/neon/sad16_neon$(ASM)
+VP9_CX_SRCS-$(HAVE_ARMV7) += encoder/arm/neon/shortfdct_neon$(ASM)
+VP9_CX_SRCS-$(HAVE_ARMV7) += encoder/arm/neon/subtract_neon$(ASM)
+VP9_CX_SRCS-$(HAVE_ARMV7) += encoder/arm/neon/variance_neon$(ASM)
+VP9_CX_SRCS-$(HAVE_ARMV7) += encoder/arm/neon/vp8_mse16x16_neon$(ASM)
+VP9_CX_SRCS-$(HAVE_ARMV7) += encoder/arm/neon/vp8_subpixelvariance8x8_neon$(ASM)
+VP9_CX_SRCS-$(HAVE_ARMV7) += encoder/arm/neon/vp8_subpixelvariance16x16_neon$(ASM)
+VP9_CX_SRCS-$(HAVE_ARMV7) += encoder/arm/neon/vp8_subpixelvariance16x16s_neon$(ASM)
+VP9_CX_SRCS-$(HAVE_ARMV7) += encoder/arm/neon/vp8_memcpy_neon$(ASM)
+VP9_CX_SRCS-$(HAVE_ARMV7) += encoder/arm/neon/vp8_shortwalsh4x4_neon$(ASM)
--- /dev/null
+++ b/vp9/vp9dx.mk
@@ -1,0 +1,71 @@
+##
+## Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+##
+## Use of this source code is governed by a BSD-style license
+## that can be found in the LICENSE file in the root of the source
+## tree. An additional intellectual property rights grant can be found
+## in the file PATENTS. All contributing project authors may
+## be found in the AUTHORS file in the root of the source tree.
+##
+
+
+include $(SRC_PATH_BARE)/$(VP9_PREFIX)vp9_common.mk
+
+VP9_DX_EXPORTS += exports_dec
+
+VP9_DX_SRCS-yes += $(VP9_COMMON_SRCS-yes)
+VP9_DX_SRCS-no += $(VP9_COMMON_SRCS-no)
+VP9_DX_SRCS_REMOVE-yes += $(VP9_COMMON_SRCS_REMOVE-yes)
+VP9_DX_SRCS_REMOVE-no += $(VP9_COMMON_SRCS_REMOVE-no)
+
+ifeq ($(ARCH_ARM),yes)
+ include $(SRC_PATH_BARE)/$(VP9_PREFIX)vp9dx_arm.mk
+endif
+
+VP9_DX_SRCS-yes += vp9_dx_iface.c
+
+# common
+#define ARM
+#define DISABLE_THREAD
+
+#INCLUDES += algo/vpx_common/vpx_mem/include
+#INCLUDES += common
+#INCLUDES += common
+#INCLUDES += common
+#INCLUDES += common
+#INCLUDES += decoder
+
+
+
+# decoder
+#define ARM
+#define DISABLE_THREAD
+
+#INCLUDES += algo/vpx_common/vpx_mem/include
+#INCLUDES += common
+#INCLUDES += common
+#INCLUDES += common
+#INCLUDES += common
+#INCLUDES += decoder
+
+VP9_DX_SRCS-yes += decoder/asm_dec_offsets.c
+VP9_DX_SRCS-yes += decoder/dboolhuff.c
+VP9_DX_SRCS-yes += decoder/decodemv.c
+VP9_DX_SRCS-yes += decoder/decodframe.c
+VP9_DX_SRCS-yes += decoder/dequantize.c
+VP9_DX_SRCS-yes += decoder/detokenize.c
+VP9_DX_SRCS-yes += decoder/dboolhuff.h
+VP9_DX_SRCS-yes += decoder/decodemv.h
+VP9_DX_SRCS-yes += decoder/dequantize.h
+VP9_DX_SRCS-yes += decoder/detokenize.h
+VP9_DX_SRCS-yes += decoder/onyxd_int.h
+VP9_DX_SRCS-yes += decoder/treereader.h
+VP9_DX_SRCS-yes += decoder/onyxd_if.c
+VP9_DX_SRCS-yes += decoder/idct_blk.c
+
+VP9_DX_SRCS-yes := $(filter-out $(VP9_DX_SRCS_REMOVE-yes),$(VP9_DX_SRCS-yes))
+
+VP9_DX_SRCS-$(ARCH_X86)$(ARCH_X86_64) += decoder/x86/x86_dsystemdependent.c
+VP9_DX_SRCS-$(HAVE_MMX) += decoder/x86/dequantize_mmx.asm
+VP9_DX_SRCS-$(HAVE_MMX) += decoder/x86/idct_blk_mmx.c
+VP9_DX_SRCS-$(HAVE_SSE2) += decoder/x86/idct_blk_sse2.c
--- /dev/null
+++ b/vp9/vp9dx_arm.mk
@@ -1,0 +1,29 @@
+##
+## Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+##
+## Use of this source code is governed by a BSD-style license
+## that can be found in the LICENSE file in the root of the source
+## tree. An additional intellectual property rights grant can be found
+## in the file PATENTS. All contributing project authors may
+## be found in the AUTHORS file in the root of the source tree.
+##
+
+
+#VP9_DX_SRCS list is modified according to different platforms.
+
+VP9_DX_SRCS-$(ARCH_ARM) += decoder/arm/dequantize_arm.c
+
+#File list for armv6
+VP9_DX_SRCS-$(HAVE_ARMV6) += decoder/arm/armv6/dequant_dc_idct_v6$(ASM)
+VP9_DX_SRCS-$(HAVE_ARMV6) += decoder/arm/armv6/dequant_idct_v6$(ASM)
+VP9_DX_SRCS-$(HAVE_ARMV6) += decoder/arm/armv6/dequantize_v6$(ASM)
+VP9_DX_SRCS-$(HAVE_ARMV6) += decoder/arm/armv6/idct_blk_v6.c
+
+#File list for neon
+VP9_DX_SRCS-$(HAVE_ARMV7) += decoder/arm/neon/idct_dequant_dc_full_2x_neon$(ASM)
+VP9_DX_SRCS-$(HAVE_ARMV7) += decoder/arm/neon/idct_dequant_dc_0_2x_neon$(ASM)
+VP9_DX_SRCS-$(HAVE_ARMV7) += decoder/arm/neon/dequant_idct_neon$(ASM)
+VP9_DX_SRCS-$(HAVE_ARMV7) += decoder/arm/neon/idct_dequant_full_2x_neon$(ASM)
+VP9_DX_SRCS-$(HAVE_ARMV7) += decoder/arm/neon/idct_dequant_0_2x_neon$(ASM)
+VP9_DX_SRCS-$(HAVE_ARMV7) += decoder/arm/neon/dequantizeb_neon$(ASM)
+VP9_DX_SRCS-$(HAVE_ARMV7) += decoder/arm/neon/idct_blk_neon.c
--- a/vpx/vp8.h
+++ b/vpx/vp8.h
@@ -28,8 +28,8 @@
/*!\file
* \brief Provides controls common to both the VP8 encoder and decoder.
*/
-#ifndef VP8_H
-#define VP8_H
+#ifndef VP9_H
+#define VP9_H
#include "vpx_codec_impl_top.h"
/*!\brief Control functions
--- a/vpx/vp8cx.h
+++ b/vpx/vp8cx.h
@@ -20,8 +20,8 @@
* \brief Provides definitions for using the VP8 encoder algorithm within the
* vpx Codec Interface.
*/
-#ifndef VP8CX_H
-#define VP8CX_H
+#ifndef VP9CX_H
+#define VP9CX_H
#include "vpx_config.h"
#include "vpx_codec_impl_top.h"
--- a/vpx/vp8dx.h
+++ b/vpx/vp8dx.h
@@ -20,8 +20,8 @@
* \brief Provides definitions for using the VP8 algorithm within the vpx Decoder
* interface.
*/
-#ifndef VP8DX_H
-#define VP8DX_H
+#ifndef VP9DX_H
+#define VP9DX_H
#include "vpx_codec_impl_top.h"
/*!\name Algorithm interface for VP8
--- a/vpx/vp8e.h
+++ b/vpx/vp8e.h
@@ -12,8 +12,8 @@
/* This file contains backwards compatibility stubs for applications using
* the VP8 version 1.0 API.
*/
-#ifndef VP8E_H
-#define VP8E_H
+#ifndef VP9E_H
+#define VP9E_H
#include "vpx_codec_impl_top.h"
#if defined(VPX_CODEC_DISABLE_COMPAT) && VPX_CODEC_DISABLE_COMPAT
--- a/vpxdec.c
+++ b/vpxdec.c
@@ -22,7 +22,7 @@
#include "vpx_config.h"
#include "vpx/vpx_decoder.h"
#include "vpx_ports/vpx_timer.h"
-#if CONFIG_VP8_DECODER
+#if CONFIG_VP9_DECODER
#include "vpx/vp8dx.h"
#endif
#if CONFIG_MD5
@@ -56,8 +56,8 @@
unsigned int fourcc;
unsigned int fourcc_mask;
} ifaces[] = {
-#if CONFIG_VP8_DECODER
- {"vp8", vpx_codec_vp8_dx, VP8_FOURCC, 0x00FFFFFF},
+#if CONFIG_VP9_DECODER
+ {"vp9", vpx_codec_vp8_dx, VP8_FOURCC, 0x00FFFFFF},
#endif
};
@@ -104,7 +104,7 @@
NULL
};
-#if CONFIG_VP8_DECODER
+#if CONFIG_VP9_DECODER
static const arg_def_t addnoise_level = ARG_DEF(NULL, "noise-level", 1,
"Enable VP8 postproc add noise");
static const arg_def_t deblock = ARG_DEF(NULL, "deblock", 0,
@@ -135,7 +135,7 @@
fprintf(stderr, "Usage: %s <options> filename\n\n"
"Options:\n", exec_name);
arg_show_usage(stderr, all_args);
-#if CONFIG_VP8_DECODER
+#if CONFIG_VP9_DECODER
fprintf(stderr, "\nVP8 Postprocessing Options:\n");
arg_show_usage(stderr, vp8_pp_args);
#endif
@@ -684,7 +684,7 @@
unsigned int fps_num;
void *out = NULL;
vpx_codec_dec_cfg_t cfg = {0};
-#if CONFIG_VP8_DECODER
+#if CONFIG_VP9_DECODER
vp8_postproc_cfg_t vp8_pp_cfg = {0};
int vp8_dbg_color_ref_frame = 0;
int vp8_dbg_color_mb_modes = 0;
@@ -744,7 +744,7 @@
else if (arg_match(&arg, &verbosearg, argi))
quiet = 0;
-#if CONFIG_VP8_DECODER
+#if CONFIG_VP9_DECODER
else if (arg_match(&arg, &addnoise_level, argi)) {
postproc = 1;
vp8_pp_cfg.post_proc_flag |= VP8_ADDNOISE;
@@ -909,7 +909,7 @@
if (!quiet)
fprintf(stderr, "%s\n", decoder.name);
-#if CONFIG_VP8_DECODER
+#if CONFIG_VP9_DECODER
if (vp8_pp_cfg.post_proc_flag
&& vpx_codec_control(&decoder, VP8_SET_POSTPROC, &vp8_pp_cfg)) {
--- a/vpxenc.c
+++ b/vpxenc.c
@@ -82,8 +82,8 @@
unsigned int fourcc;
unsigned int fourcc_mask;
} ifaces[] = {
-#if CONFIG_VP8_DECODER
- {"vp8", &vpx_codec_vp8_dx, VP8_FOURCC, 0x00FFFFFF},
+#if CONFIG_VP9_DECODER
+ {"vp9", &vpx_codec_vp8_dx, VP8_FOURCC, 0x00FFFFFF},
#endif
};
@@ -93,8 +93,8 @@
unsigned int fourcc;
unsigned int fourcc_mask;
} codecs[] = {
-#if CONFIG_VP8_ENCODER
- {"vp8", vpx_codec_vp8x_cx, VP8_FOURCC, 0x00FFFFFF},
+#if CONFIG_VP9_ENCODER
+ {"vp9", vpx_codec_vp8x_cx, VP8_FOURCC, 0x00FFFFFF},
#endif
};
@@ -1011,7 +1011,7 @@
};
-#if CONFIG_VP8_ENCODER
+#if CONFIG_VP9_ENCODER
static const arg_def_t noise_sens = ARG_DEF(NULL, "noise-sensitivity", 1,
"Noise sensitivity (frames to blur)");
static const arg_def_t sharpness = ARG_DEF(NULL, "sharpness", 1,
@@ -1020,13 +1020,13 @@
"Motion detection threshold");
#endif
-#if CONFIG_VP8_ENCODER
+#if CONFIG_VP9_ENCODER
static const arg_def_t cpu_used = ARG_DEF(NULL, "cpu-used", 1,
"CPU Used (-16..16)");
#endif
-#if CONFIG_VP8_ENCODER
+#if CONFIG_VP9_ENCODER
static const arg_def_t token_parts = ARG_DEF(NULL, "token-parts", 1,
"Number of token partitions to use, log2");
static const arg_def_t auto_altref = ARG_DEF(NULL, "auto-alt-ref", 1,
@@ -1081,7 +1081,7 @@
arg_show_usage(stdout, rc_twopass_args);
fprintf(stderr, "\nKeyframe Placement Options:\n");
arg_show_usage(stdout, kf_args);
-#if CONFIG_VP8_ENCODER
+#if CONFIG_VP9_ENCODER
fprintf(stderr, "\nVP8 Specific Options:\n");
arg_show_usage(stdout, vp8_args);
#endif
@@ -1659,7 +1659,7 @@
#endif
/* Handle codec specific options */
-#if CONFIG_VP8_ENCODER
+#if CONFIG_VP9_ENCODER
if (codec->fourcc == VP8_FOURCC) {
ctrl_args = vp8_args;