ref: 1ea4c2924c6595a542713bbf115965b6e8ddabfe
parent: 9857ebce96e9c4d92b961f676f68ae681ab8efa2
parent: 692b10858d8be54706f3e02ba3075c9718d7d683
author: John Koleszar <jkoleszar@google.com>
date: Thu Nov 11 04:22:46 EST 2010
Merge remote branch 'internal/upstream' into HEAD Conflicts: configure Change-Id: I1c7bae5241f999387cae3f2abf2dfc84fe3f6651
--- a/configure
+++ b/configure
@@ -41,6 +41,7 @@
${toggle_shared} shared library support
${toggle_small} favor smaller size over speed
${toggle_arm_asm_detok} assembly version of the detokenizer (ARM platforms only)
+ ${toggle_postproc_visualizer} macro block / block level visualizers
Codecs:
Codecs can be selectively enabled or disabled individually, or by family:
@@ -252,6 +253,7 @@
shared
small
arm_asm_detok
+ postproc_visualizer
experimental
${EXPERIMENT_LIST}
@@ -294,6 +296,8 @@
shared
small
arm_asm_detok
+ postproc_visualizer
+
experimental
"
@@ -343,8 +347,6 @@
for c in ${CODECS}; do
enabled ${c} && enable ${c##*_}s
done
-
-
}
@@ -554,6 +556,10 @@
# Other toolchain specific defaults
case $toolchain in x86*|ppc*|universal*) soft_enable postproc;; esac
+
+ if enabled postproc_visualizer; then
+ enabled postproc || die "postproc_visualizer requires postproc to be enabled"
+ fi
}
--- a/vp8/common/onyx.h
+++ b/vp8/common/onyx.h
@@ -204,7 +204,7 @@
// and not just a copy of the pointer..
int vp8_receive_raw_frame(VP8_PTR comp, unsigned int frame_flags, YV12_BUFFER_CONFIG *sd, INT64 time_stamp, INT64 end_time_stamp);
int vp8_get_compressed_data(VP8_PTR comp, unsigned int *frame_flags, unsigned long *size, unsigned char *dest, INT64 *time_stamp, INT64 *time_end, int flush);
- int vp8_get_preview_raw_frame(VP8_PTR comp, YV12_BUFFER_CONFIG *dest, int deblock_level, int noise_level, int flags);
+ int vp8_get_preview_raw_frame(VP8_PTR comp, YV12_BUFFER_CONFIG *dest, vp8_ppflags_t *flags);
int vp8_use_as_reference(VP8_PTR comp, int ref_frame_flags);
int vp8_update_reference(VP8_PTR comp, int ref_frame_flags);
--- a/vp8/common/onyxd.h
+++ b/vp8/common/onyxd.h
@@ -51,7 +51,7 @@
int vp8dx_get_setting(VP8D_PTR comp, VP8D_SETTING oxst);
int vp8dx_receive_compressed_data(VP8D_PTR comp, unsigned long size, const unsigned char *dest, INT64 time_stamp);
- int vp8dx_get_raw_frame(VP8D_PTR comp, YV12_BUFFER_CONFIG *sd, INT64 *time_stamp, INT64 *time_end_stamp, int deblock_level, int noise_level, int flags);
+ int vp8dx_get_raw_frame(VP8D_PTR comp, YV12_BUFFER_CONFIG *sd, INT64 *time_stamp, INT64 *time_end_stamp, vp8_ppflags_t *flags);
int vp8dx_get_reference(VP8D_PTR comp, VP8_REFFRAME ref_frame_flag, YV12_BUFFER_CONFIG *sd);
int vp8dx_set_reference(VP8D_PTR comp, VP8_REFFRAME ref_frame_flag, YV12_BUFFER_CONFIG *sd);
--- a/vp8/common/postproc.c
+++ b/vp8/common/postproc.c
@@ -26,7 +26,7 @@
( (0.439*(float)(t>>16)) - (0.368*(float)(t>>8&0xff)) - (0.071*(float)(t&0xff)) + 128)
/* global constants */
-
+#if CONFIG_POSTPROC_VISUALIZER
static const unsigned char MB_PREDICTION_MODE_colors[MB_MODE_COUNT][3] =
{
{ RGB_TO_YUV(0x98FB98) }, /* PaleGreen */
@@ -59,7 +59,7 @@
{ RGB_TO_YUV(0xccff33) }, /* Yellow */
};
-static const unsigned char MV_REFERENCE_FRAME_colors[MB_MODE_COUNT][3] =
+static const unsigned char MV_REFERENCE_FRAME_colors[MAX_REF_FRAMES][3] =
{
{ RGB_TO_YUV(0x00ff00) }, /* Green */
{ RGB_TO_YUV(0x0000ff) }, /* Blue */
@@ -66,6 +66,7 @@
{ RGB_TO_YUV(0xffff00) }, /* Yellow */
{ RGB_TO_YUV(0xff0000) }, /* Red */
};
+#endif
static const short kernel5[] =
{
@@ -677,10 +678,13 @@
#define RTCD_VTABLE(oci) NULL
#endif
-int vp8_post_proc_frame(VP8_COMMON *oci, YV12_BUFFER_CONFIG *dest, int deblock_level, int noise_level, int flags)
+int vp8_post_proc_frame(VP8_COMMON *oci, YV12_BUFFER_CONFIG *dest, vp8_ppflags_t *ppflags)
{
char message[512];
int q = oci->filter_level * 10 / 6;
+ int flags = ppflags->post_proc_flag;
+ int deblock_level = ppflags->deblocking_level;
+ int noise_level = ppflags->noise_level;
if (!oci->frame_to_show)
return -1;
@@ -737,7 +741,8 @@
oci->post_proc_buffer.y_stride);
}
- if (flags & VP8D_DEBUG_LEVEL1)
+#if CONFIG_POSTPROC_VISUALIZER
+ if (flags & VP8D_DEBUG_TXT_FRAME_INFO)
{
sprintf(message, "F%1dG%1dQ%3dF%3dP%d_s%dx%d",
(oci->frame_type == KEY_FRAME),
@@ -749,7 +754,7 @@
vp8_blit_text(message, oci->post_proc_buffer.y_buffer, oci->post_proc_buffer.y_stride);
}
- if (flags & VP8D_DEBUG_LEVEL2)
+ if (flags & VP8D_DEBUG_TXT_MBLK_MODES)
{
int i, j;
unsigned char *y_ptr;
@@ -781,7 +786,7 @@
}
}
- if (flags & VP8D_DEBUG_LEVEL3)
+ if (flags & VP8D_DEBUG_TXT_DC_DIFF)
{
int i, j;
unsigned char *y_ptr;
@@ -816,45 +821,14 @@
}
}
- if (flags & VP8D_DEBUG_LEVEL4)
+ if (flags & VP8D_DEBUG_TXT_RATE_INFO)
{
sprintf(message, "Bitrate: %10.2f frame_rate: %10.2f ", oci->bitrate, oci->framerate);
vp8_blit_text(message, oci->post_proc_buffer.y_buffer, oci->post_proc_buffer.y_stride);
-#if 0
- int i, j;
- unsigned char *y_ptr;
- YV12_BUFFER_CONFIG *post = &oci->post_proc_buffer;
- int mb_rows = post->y_height >> 4;
- int mb_cols = post->y_width >> 4;
- int mb_index = 0;
- MODE_INFO *mi = oci->mi;
-
- y_ptr = post->y_buffer + 4 * post->y_stride + 4;
-
- /* vp8_filter each macro block */
- for (i = 0; i < mb_rows; i++)
- {
- for (j = 0; j < mb_cols; j++)
- {
- char zz[4];
-
- sprintf(zz, "%c", mi[mb_index].mbmi.dc_diff + '0');
- vp8_blit_text(zz, y_ptr, post->y_stride);
- mb_index ++;
- y_ptr += 16;
- }
-
- mb_index ++; /* border */
- y_ptr += post->y_stride * 16 - post->y_width;
-
- }
-
-#endif
-
}
/* Draw motion vectors */
- if (flags & VP8D_DEBUG_DRAW_MV)
+ if ((flags & VP8D_DEBUG_DRAW_MV) && ppflags->display_mv_flag)
{
YV12_BUFFER_CONFIG *post = &oci->post_proc_buffer;
int width = post->y_width;
@@ -871,6 +845,12 @@
{
int x1, y1;
+ if (!(ppflags->display_mv_flag & (1<<mi->mbmi.mode)))
+ {
+ mi++;
+ continue;
+ }
+
if (mi->mbmi.mode == SPLITMV)
{
switch (mi->mbmi.partitioning)
@@ -996,6 +976,7 @@
else
vp8_blit_line (lx0, x1, ly0, y1, y_buffer, y_stride);
}
+
mi++;
}
mi++;
@@ -1003,7 +984,8 @@
}
/* Color in block modes */
- if (flags & VP8D_DEBUG_CLR_BLK_MODES)
+ if ((flags & VP8D_DEBUG_CLR_BLK_MODES)
+ && (ppflags->display_mb_modes_flag || ppflags->display_b_modes_flag))
{
int y, x;
YV12_BUFFER_CONFIG *post = &oci->post_proc_buffer;
@@ -1021,7 +1003,8 @@
{
int Y = 0, U = 0, V = 0;
- if (mi->mbmi.mode == B_PRED)
+ if (mi->mbmi.mode == B_PRED && /* display_mb_modes_flag is a per-mode bit mask: test bit (1<<B_PRED), not the raw enum value */
+ ((ppflags->display_mb_modes_flag & (1<<B_PRED)) || ppflags->display_b_modes_flag))
{
int by, bx;
unsigned char *yl, *ul, *vl;
@@ -1035,13 +1018,16 @@
{
for (bx = 0; bx < 16; bx += 4)
{
- Y = B_PREDICTION_MODE_colors[bmi->mode][0];
- U = B_PREDICTION_MODE_colors[bmi->mode][1];
- V = B_PREDICTION_MODE_colors[bmi->mode][2];
+ if ((ppflags->display_b_modes_flag & (1<<bmi->mode)) /* filter on this sub-block's own mode; mi->mbmi.mode is always B_PRED here */
+ || (ppflags->display_mb_modes_flag & (1<<B_PRED)))
+ {
+ Y = B_PREDICTION_MODE_colors[bmi->mode][0];
+ U = B_PREDICTION_MODE_colors[bmi->mode][1];
+ V = B_PREDICTION_MODE_colors[bmi->mode][2];
- POSTPROC_INVOKE(RTCD_VTABLE(oci), blend_b)
- (yl+bx, ul+(bx>>1), vl+(bx>>1), Y, U, V, 0xc000, y_stride);
-
+ POSTPROC_INVOKE(RTCD_VTABLE(oci), blend_b)
+ (yl+bx, ul+(bx>>1), vl+(bx>>1), Y, U, V, 0xc000, y_stride);
+ }
bmi++;
}
@@ -1050,7 +1036,7 @@
vl += y_stride*1;
}
}
- else
+ else if (ppflags->display_mb_modes_flag & (1<<mi->mbmi.mode))
{
Y = MB_PREDICTION_MODE_colors[mi->mbmi.mode][0];
U = MB_PREDICTION_MODE_colors[mi->mbmi.mode][1];
@@ -1059,6 +1045,7 @@
POSTPROC_INVOKE(RTCD_VTABLE(oci), blend_mb_inner)
(y_ptr+x, u_ptr+(x>>1), v_ptr+(x>>1), Y, U, V, 0xc000, y_stride);
}
+
mi++;
}
y_ptr += y_stride*16;
@@ -1070,7 +1057,7 @@
}
/* Color in frame reference blocks */
- if (flags & VP8D_DEBUG_CLR_FRM_REF_BLKS)
+ if ((flags & VP8D_DEBUG_CLR_FRM_REF_BLKS) && ppflags->display_ref_frame_flag)
{
int y, x;
YV12_BUFFER_CONFIG *post = &oci->post_proc_buffer;
@@ -1088,12 +1075,15 @@
{
int Y = 0, U = 0, V = 0;
- Y = MV_REFERENCE_FRAME_colors[mi->mbmi.ref_frame][0];
- U = MV_REFERENCE_FRAME_colors[mi->mbmi.ref_frame][1];
- V = MV_REFERENCE_FRAME_colors[mi->mbmi.ref_frame][2];
+ if (ppflags->display_ref_frame_flag & (1<<mi->mbmi.ref_frame))
+ {
+ Y = MV_REFERENCE_FRAME_colors[mi->mbmi.ref_frame][0];
+ U = MV_REFERENCE_FRAME_colors[mi->mbmi.ref_frame][1];
+ V = MV_REFERENCE_FRAME_colors[mi->mbmi.ref_frame][2];
- POSTPROC_INVOKE(RTCD_VTABLE(oci), blend_mb_outer)
- (y_ptr+x, u_ptr+(x>>1), v_ptr+(x>>1), Y, U, V, 0xc000, y_stride);
+ POSTPROC_INVOKE(RTCD_VTABLE(oci), blend_mb_outer)
+ (y_ptr+x, u_ptr+(x>>1), v_ptr+(x>>1), Y, U, V, 0xc000, y_stride);
+ }
mi++;
}
@@ -1104,6 +1094,7 @@
mi++;
}
}
+#endif
*dest = oci->post_proc_buffer;
--- a/vp8/common/postproc.h
+++ b/vp8/common/postproc.h
@@ -111,7 +111,7 @@
#include "onyxc_int.h"
#include "ppflags.h"
int vp8_post_proc_frame(struct VP8Common *oci, YV12_BUFFER_CONFIG *dest,
- int deblock_level, int noise_level, int flags);
+ vp8_ppflags_t *flags);
void vp8_de_noise(YV12_BUFFER_CONFIG *source,
--- a/vp8/common/ppflags.h
+++ b/vp8/common/ppflags.h
@@ -17,13 +17,24 @@
VP8D_DEBLOCK = 1<<0,
VP8D_DEMACROBLOCK = 1<<1,
VP8D_ADDNOISE = 1<<2,
- VP8D_DEBUG_LEVEL1 = 1<<3,
- VP8D_DEBUG_LEVEL2 = 1<<4,
- VP8D_DEBUG_LEVEL3 = 1<<5,
- VP8D_DEBUG_LEVEL4 = 1<<6,
+ VP8D_DEBUG_TXT_FRAME_INFO = 1<<3,
+ VP8D_DEBUG_TXT_MBLK_MODES = 1<<4,
+ VP8D_DEBUG_TXT_DC_DIFF = 1<<5,
+ VP8D_DEBUG_TXT_RATE_INFO = 1<<6,
VP8D_DEBUG_DRAW_MV = 1<<7,
VP8D_DEBUG_CLR_BLK_MODES = 1<<8,
VP8D_DEBUG_CLR_FRM_REF_BLKS = 1<<9
};
+
+typedef struct
+{
+ int post_proc_flag; /* bit mask tested against the VP8D_* flag values */
+ int deblocking_level; /* read by vp8_post_proc_frame as its deblock level */
+ int noise_level; /* read by vp8_post_proc_frame as its noise level */
+ int display_ref_frame_flag; /* bit (1<<ref_frame) selects which reference frames get colored */
+ int display_mb_modes_flag; /* bit (1<<mode) selects which macroblock modes get colored */
+ int display_b_modes_flag; /* nonzero enables coloring of B_PRED sub-blocks; presumably a per-b-mode mask -- TODO confirm */
+ int display_mv_flag; /* bit (1<<mode) selects which macroblock modes get motion vectors drawn */
+} vp8_ppflags_t;
#endif
--- a/vp8/decoder/onyxd_if.c
+++ b/vp8/decoder/onyxd_if.c
@@ -506,7 +506,7 @@
pbi->common.error.setjmp = 0;
return retcode;
}
-int vp8dx_get_raw_frame(VP8D_PTR ptr, YV12_BUFFER_CONFIG *sd, INT64 *time_stamp, INT64 *time_end_stamp, int deblock_level, int noise_level, int flags)
+int vp8dx_get_raw_frame(VP8D_PTR ptr, YV12_BUFFER_CONFIG *sd, INT64 *time_stamp, INT64 *time_end_stamp, vp8_ppflags_t *flags)
{
int ret = -1;
VP8D_COMP *pbi = (VP8D_COMP *) ptr;
@@ -524,7 +524,7 @@
sd->clrtype = pbi->common.clr_type;
#if CONFIG_POSTPROC
- ret = vp8_post_proc_frame(&pbi->common, sd, deblock_level, noise_level, flags);
+ ret = vp8_post_proc_frame(&pbi->common, sd, flags);
#else
if (pbi->common.frame_to_show)
--- a/vp8/encoder/encodeframe.c
+++ b/vp8/encoder/encodeframe.c
@@ -62,7 +62,6 @@
static const int qrounding_factors[129] =
{
- 56, 56, 56, 56, 48, 48, 56, 56,
48, 48, 48, 48, 48, 48, 48, 48,
48, 48, 48, 48, 48, 48, 48, 48,
48, 48, 48, 48, 48, 48, 48, 48,
@@ -78,12 +77,18 @@
48, 48, 48, 48, 48, 48, 48, 48,
48, 48, 48, 48, 48, 48, 48, 48,
48, 48, 48, 48, 48, 48, 48, 48,
- 48,
+ 48, 48, 48, 48, 48, 48, 48, 48,
+ 48
};
static const int qzbin_factors[129] =
{
- 72, 72, 72, 72, 80, 80, 72, 72,
+ 84, 84, 84, 84, 84, 84, 84, 84,
+ 84, 84, 84, 84, 84, 84, 84, 84,
+ 84, 84, 84, 84, 84, 84, 84, 84,
+ 84, 84, 84, 84, 84, 84, 84, 84,
+ 84, 84, 84, 84, 84, 84, 84, 84,
+ 84, 84, 84, 84, 84, 84, 84, 84,
80, 80, 80, 80, 80, 80, 80, 80,
80, 80, 80, 80, 80, 80, 80, 80,
80, 80, 80, 80, 80, 80, 80, 80,
@@ -94,17 +99,11 @@
80, 80, 80, 80, 80, 80, 80, 80,
80, 80, 80, 80, 80, 80, 80, 80,
80, 80, 80, 80, 80, 80, 80, 80,
- 80, 80, 80, 80, 80, 80, 80, 80,
- 80, 80, 80, 80, 80, 80, 80, 80,
- 80, 80, 80, 80, 80, 80, 80, 80,
- 80, 80, 80, 80, 80, 80, 80, 80,
- 80, 80, 80, 80, 80, 80, 80, 80,
- 80,
+ 80
};
static const int qrounding_factors_y2[129] =
{
- 56, 56, 56, 56, 48, 48, 56, 56,
48, 48, 48, 48, 48, 48, 48, 48,
48, 48, 48, 48, 48, 48, 48, 48,
48, 48, 48, 48, 48, 48, 48, 48,
@@ -120,12 +119,18 @@
48, 48, 48, 48, 48, 48, 48, 48,
48, 48, 48, 48, 48, 48, 48, 48,
48, 48, 48, 48, 48, 48, 48, 48,
- 48,
+ 48, 48, 48, 48, 48, 48, 48, 48,
+ 48
};
static const int qzbin_factors_y2[129] =
{
- 72, 72, 72, 72, 80, 80, 72, 72,
+ 84, 84, 84, 84, 84, 84, 84, 84,
+ 84, 84, 84, 84, 84, 84, 84, 84,
+ 84, 84, 84, 84, 84, 84, 84, 84,
+ 84, 84, 84, 84, 84, 84, 84, 84,
+ 84, 84, 84, 84, 84, 84, 84, 84,
+ 84, 84, 84, 84, 84, 84, 84, 84,
80, 80, 80, 80, 80, 80, 80, 80,
80, 80, 80, 80, 80, 80, 80, 80,
80, 80, 80, 80, 80, 80, 80, 80,
@@ -136,15 +141,10 @@
80, 80, 80, 80, 80, 80, 80, 80,
80, 80, 80, 80, 80, 80, 80, 80,
80, 80, 80, 80, 80, 80, 80, 80,
- 80, 80, 80, 80, 80, 80, 80, 80,
- 80, 80, 80, 80, 80, 80, 80, 80,
- 80, 80, 80, 80, 80, 80, 80, 80,
- 80, 80, 80, 80, 80, 80, 80, 80,
- 80, 80, 80, 80, 80, 80, 80, 80,
- 80,
+ 80
};
-//#define EXACT_QUANT
+#define EXACT_QUANT
#ifdef EXACT_QUANT
static void vp8cx_invert_quant(short *quant, short *shift, short d)
{
@@ -351,6 +351,9 @@
void vp8cx_frame_init_quantizer(VP8_COMP *cpi)
{
+ // Clear Zbin mode boost for default case
+ cpi->zbin_mode_boost = 0;
+
// vp8cx_init_quantizer() is first called in vp8_create_compressor(). A check is added here so that vp8cx_init_quantizer() is only called
// when these values are not all zero.
if (cpi->common.y1dc_delta_q | cpi->common.y2dc_delta_q | cpi->common.uvdc_delta_q | cpi->common.y2ac_delta_q | cpi->common.uvac_delta_q)
@@ -1214,11 +1217,25 @@
// Experimental code. Special case for gf and arf zeromv modes. Increase zbin size to suppress noise
if (cpi->zbin_mode_boost_enabled)
{
- if ((xd->mode_info_context->mbmi.mode == ZEROMV) && (xd->mode_info_context->mbmi.ref_frame != LAST_FRAME))
- cpi->zbin_mode_boost = GF_ZEROMV_ZBIN_BOOST;
+ if ( xd->mode_info_context->mbmi.ref_frame == INTRA_FRAME )
+ cpi->zbin_mode_boost = 0;
else
- cpi->zbin_mode_boost = 0;
+ {
+ if (xd->mode_info_context->mbmi.mode == ZEROMV)
+ {
+ if (xd->mode_info_context->mbmi.ref_frame != LAST_FRAME)
+ cpi->zbin_mode_boost = GF_ZEROMV_ZBIN_BOOST;
+ else
+ cpi->zbin_mode_boost = LF_ZEROMV_ZBIN_BOOST;
+ }
+ else if (xd->mode_info_context->mbmi.mode == SPLITMV)
+ cpi->zbin_mode_boost = 0;
+ else
+ cpi->zbin_mode_boost = MV_ZBIN_BOOST;
+ }
}
+ else
+ cpi->zbin_mode_boost = 0;
vp8cx_mb_init_quantizer(cpi, x);
}
--- a/vp8/encoder/onyx_if.c
+++ b/vp8/encoder/onyx_if.c
@@ -3494,8 +3494,18 @@
cpi->zbin_over_quant = 0;
cpi->zbin_mode_boost = 0;
- // Enable mode based tweaking of the zbin
+ // Enable or disable mode based tweaking of the zbin
+ // For 2 Pass Only used where GF/ARF prediction quality
+ // is above a threshold
+ cpi->zbin_mode_boost = 0;
cpi->zbin_mode_boost_enabled = TRUE;
+ if (cpi->pass == 2)
+ {
+ if ( cpi->gfu_boost <= 400 )
+ {
+ cpi->zbin_mode_boost_enabled = FALSE;
+ }
+ }
// Current default encoder behaviour for the altref sign bias
if (cpi->source_alt_ref_active)
@@ -5214,7 +5224,7 @@
return 0;
}
-int vp8_get_preview_raw_frame(VP8_PTR comp, YV12_BUFFER_CONFIG *dest, int deblock_level, int noise_level, int flags)
+int vp8_get_preview_raw_frame(VP8_PTR comp, YV12_BUFFER_CONFIG *dest, vp8_ppflags_t *flags)
{
VP8_COMP *cpi = (VP8_COMP *) comp;
@@ -5224,7 +5234,7 @@
{
int ret;
#if CONFIG_POSTPROC
- ret = vp8_post_proc_frame(&cpi->common, dest, deblock_level, noise_level, flags);
+ ret = vp8_post_proc_frame(&cpi->common, dest, flags);
#else
if (cpi->common.frame_to_show)
--- a/vp8/encoder/onyx_int.h
+++ b/vp8/encoder/onyx_int.h
@@ -46,6 +46,8 @@
#define MAX_THRESHMULT 512
#define GF_ZEROMV_ZBIN_BOOST 24
+#define LF_ZEROMV_ZBIN_BOOST 12
+#define MV_ZBIN_BOOST 4
#define ZBIN_OQ_MAX 192
#define VP8_TEMPORAL_ALT_REF 1
--- a/vp8/encoder/quantize.c
+++ b/vp8/encoder/quantize.c
@@ -16,7 +16,7 @@
#include "entropy.h"
#include "predictdc.h"
-//#define EXACT_QUANT
+#define EXACT_QUANT
#ifdef EXACT_QUANT
void vp8_fast_quantize_b_c(BLOCK *b, BLOCKD *d)
{
--- a/vp8/encoder/rdopt.c
+++ b/vp8/encoder/rdopt.c
@@ -1626,10 +1626,22 @@
// Experimental code. Special case for gf and arf zeromv modes. Increase zbin size to suppress noise
if (cpi->zbin_mode_boost_enabled)
{
- if ((vp8_mode_order[mode_index] == ZEROMV) && (vp8_ref_frame_order[mode_index] != LAST_FRAME))
- cpi->zbin_mode_boost = GF_ZEROMV_ZBIN_BOOST;
- else
+ if ( vp8_ref_frame_order[mode_index] == INTRA_FRAME )
cpi->zbin_mode_boost = 0;
+ else
+ {
+ if (vp8_mode_order[mode_index] == ZEROMV) // zero motion: boost depends on which reference frame is used
+ {
+ if (vp8_ref_frame_order[mode_index] != LAST_FRAME)
+ cpi->zbin_mode_boost = GF_ZEROMV_ZBIN_BOOST;
+ else
+ cpi->zbin_mode_boost = LF_ZEROMV_ZBIN_BOOST;
+ }
+ else if (vp8_mode_order[mode_index] == SPLITMV) // SPLITMV is a prediction mode, so look it up in the mode table, not the ref-frame table
+ cpi->zbin_mode_boost = 0;
+ else
+ cpi->zbin_mode_boost = MV_ZBIN_BOOST;
+ }
vp8cx_mb_init_quantizer(cpi, x);
}
--- a/vp8/encoder/x86/dct_mmx.asm
+++ b/vp8/encoder/x86/dct_mmx.asm
@@ -11,511 +11,231 @@
%include "vpx_ports/x86_abi_support.asm"
-section .text
- global sym(vp8_short_fdct4x4_mmx)
- global sym(vp8_short_fdct8x4_wmt)
-
-
-%define DCTCONSTANTSBITS (16)
-%define DCTROUNDINGVALUE (1<< (DCTCONSTANTSBITS-1))
-%define x_c1 (60547) ; cos(pi /8) * (1<<15)
-%define x_c2 (46341) ; cos(pi*2/8) * (1<<15)
-%define x_c3 (25080) ; cos(pi*3/8) * (1<<15)
-
-
;void vp8_short_fdct4x4_mmx(short *input, short *output, int pitch)
+global sym(vp8_short_fdct4x4_mmx)
sym(vp8_short_fdct4x4_mmx):
push rbp
- mov rbp, rsp
+ mov rbp, rsp
SHADOW_ARGS_TO_STACK 3
GET_GOT rbx
- push rsi
- push rdi
+ push rsi
+ push rdi
; end prolog
- mov rsi, arg(0) ;input
- mov rdi, arg(1) ;output
- lea rdx, [GLOBAL(dct_const_mmx)]
- movsxd rax, dword ptr arg(2) ;pitch
+ mov rsi, arg(0) ; input
+ mov rdi, arg(1) ; output
- lea rcx, [rsi + rax*2]
+ movsxd rax, dword ptr arg(2) ;pitch
+
+ lea rcx, [rsi + rax*2]
; read the input data
- movq mm0, [rsi]
- movq mm1, [rsi + rax ]
+ movq mm0, [rsi]
+ movq mm1, [rsi + rax]
- movq mm2, [rcx]
- movq mm3, [rcx + rax]
- ; get the constants
- ;shift to left by 1 for prescision
- psllw mm0, 3
- psllw mm1, 3
+ movq mm2, [rcx]
+ movq mm4, [rcx + rax]
- psllw mm2, 3
- psllw mm3, 3
+ ; transpose for the first stage
+ movq mm3, mm0 ; 00 01 02 03
+ movq mm5, mm2 ; 20 21 22 23
- ; transpose for the second stage
- movq mm4, mm0 ; 00 01 02 03
- movq mm5, mm2 ; 10 11 12 03
+ punpcklwd mm0, mm1 ; 00 10 01 11
+ punpckhwd mm3, mm1 ; 02 12 03 13
- punpcklwd mm0, mm1 ; 00 10 01 11
- punpckhwd mm4, mm1 ; 02 12 03 13
+ punpcklwd mm2, mm4 ; 20 30 21 31
+ punpckhwd mm5, mm4 ; 22 32 23 33
- punpcklwd mm2, mm3 ; 20 30 21 31
- punpckhwd mm5, mm3 ; 22 32 23 33
+ movq mm1, mm0 ; 00 10 01 11
+ punpckldq mm0, mm2 ; 00 10 20 30
+ punpckhdq mm1, mm2 ; 01 11 21 31
- movq mm1, mm0 ; 00 10 01 11
- punpckldq mm0, mm2 ; 00 10 20 30
+ movq mm2, mm3 ; 02 12 03 13
+ punpckldq mm2, mm5 ; 02 12 22 32
- punpckhdq mm1, mm2 ; 01 11 21 31
+ punpckhdq mm3, mm5 ; 03 13 23 33
- movq mm2, mm4 ; 02 12 03 13
- punpckldq mm2, mm5 ; 02 12 22 32
+ ; mm0 0
+ ; mm1 1
+ ; mm2 2
+ ; mm3 3
- punpckhdq mm4, mm5 ; 03 13 23 33
- movq mm3, mm4
-
-
; first stage
- movq mm5, mm0
- movq mm4, mm1
+ movq mm5, mm0
+ movq mm4, mm1
- paddw mm0, mm3 ; a = 0 + 3
- paddw mm1, mm2 ; b = 1 + 2
+ paddw mm0, mm3 ; a1 = 0 + 3
+ paddw mm1, mm2 ; b1 = 1 + 2
- psubw mm4, mm2 ; c = 1 - 2
- psubw mm5, mm3 ; d = 0 - 3
+ psubw mm4, mm2 ; c1 = 1 - 2
+ psubw mm5, mm3 ; d1 = 0 - 3
+ psllw mm5, 3
+ psllw mm4, 3
- ; output 0 and 2
- movq mm6, [rdx + 16] ; c2
- movq mm2, mm0 ; a
+ psllw mm0, 3
+ psllw mm1, 3
- paddw mm0, mm1 ; a + b
- psubw mm2, mm1 ; a - b
-
- movq mm1, mm0 ; a + b
- pmulhw mm0, mm6 ; 00 01 02 03
-
- paddw mm0, mm1 ; output 00 01 02 03
- pmulhw mm6, mm2 ; 20 21 22 23
-
- paddw mm2, mm6 ; output 20 21 22 23
-
- ; output 1 and 3
- movq mm6, [rdx + 8] ; c1
- movq mm7, [rdx + 24] ; c3
-
- movq mm1, mm4 ; c
- movq mm3, mm5 ; d
-
- pmulhw mm1, mm7 ; c * c3
- pmulhw mm3, mm6 ; d * c1
-
- paddw mm3, mm5 ; d * c1 rounded
- paddw mm1, mm3 ; output 10 11 12 13
-
- movq mm3, mm4 ; c
- pmulhw mm5, mm7 ; d * c3
-
- pmulhw mm4, mm6 ; c * c1
- paddw mm3, mm4 ; round c* c1
-
- psubw mm5, mm3 ; output 30 31 32 33
- movq mm3, mm5
-
-
- ; done with vertical
- ; transpose for the second stage
- movq mm4, mm0 ; 00 01 02 03
- movq mm5, mm2 ; 10 11 12 03
-
- punpcklwd mm0, mm1 ; 00 10 01 11
- punpckhwd mm4, mm1 ; 02 12 03 13
-
- punpcklwd mm2, mm3 ; 20 30 21 31
- punpckhwd mm5, mm3 ; 22 32 23 33
-
-
- movq mm1, mm0 ; 00 10 01 11
- punpckldq mm0, mm2 ; 00 10 20 30
-
- punpckhdq mm1, mm2 ; 01 11 21 31
-
- movq mm2, mm4 ; 02 12 03 13
- punpckldq mm2, mm5 ; 02 12 22 32
-
- punpckhdq mm4, mm5 ; 03 13 23 33
- movq mm3, mm4
-
-
- ; first stage
- movq mm5, mm0
- movq mm4, mm1
-
- paddw mm0, mm3 ; a = 0 + 3
- paddw mm1, mm2 ; b = 1 + 2
-
- psubw mm4, mm2 ; c = 1 - 2
- psubw mm5, mm3 ; d = 0 - 3
-
-
; output 0 and 2
- movq mm6, [rdx + 16] ; c2
- movq mm2, mm0 ; a
- paddw mm0, mm1 ; a + b
+ movq mm2, mm0 ; a1
- psubw mm2, mm1 ; a - b
+ paddw mm0, mm1 ; op[0] = a1 + b1
+ psubw mm2, mm1 ; op[2] = a1 - b1
- movq mm1, mm0 ; a + b
- pmulhw mm0, mm6 ; 00 01 02 03
-
- paddw mm0, mm1 ; output 00 01 02 03
- pmulhw mm6, mm2 ; 20 21 22 23
-
- paddw mm2, mm6 ; output 20 21 22 23
-
-
; output 1 and 3
- movq mm6, [rdx + 8] ; c1
- movq mm7, [rdx + 24] ; c3
+ ; interleave c1, d1
+ movq mm1, mm5 ; d1
+ punpcklwd mm1, mm4 ; c1 d1
+ punpckhwd mm5, mm4 ; c1 d1
- movq mm1, mm4 ; c
- movq mm3, mm5 ; d
+ movq mm3, mm1
+ movq mm4, mm5
- pmulhw mm1, mm7 ; c * c3
- pmulhw mm3, mm6 ; d * c1
+ pmaddwd mm1, MMWORD PTR[GLOBAL (_5352_2217)] ; c1*2217 + d1*5352
+ pmaddwd mm4, MMWORD PTR[GLOBAL (_5352_2217)] ; c1*2217 + d1*5352
- paddw mm3, mm5 ; d * c1 rounded
- paddw mm1, mm3 ; output 10 11 12 13
+ pmaddwd mm3, MMWORD PTR[GLOBAL(_2217_neg5352)] ; d1*2217 - c1*5352
+ pmaddwd mm5, MMWORD PTR[GLOBAL(_2217_neg5352)] ; d1*2217 - c1*5352
- movq mm3, mm4 ; c
- pmulhw mm5, mm7 ; d * c3
+ paddd mm1, MMWORD PTR[GLOBAL(_14500)]
+ paddd mm4, MMWORD PTR[GLOBAL(_14500)]
+ paddd mm3, MMWORD PTR[GLOBAL(_7500)]
+ paddd mm5, MMWORD PTR[GLOBAL(_7500)]
- pmulhw mm4, mm6 ; c * c1
- paddw mm3, mm4 ; round c* c1
+ psrad mm1, 12 ; (c1 * 2217 + d1 * 5352 + 14500)>>12
+ psrad mm4, 12 ; (c1 * 2217 + d1 * 5352 + 14500)>>12
+ psrad mm3, 12 ; (d1 * 2217 - c1 * 5352 + 7500)>>12
+ psrad mm5, 12 ; (d1 * 2217 - c1 * 5352 + 7500)>>12
- psubw mm5, mm3 ; output 30 31 32 33
- movq mm3, mm5
- ; done with vertical
+ packssdw mm1, mm4 ; op[1]
+ packssdw mm3, mm5 ; op[3]
- pcmpeqw mm4, mm4
- pcmpeqw mm5, mm5
- psrlw mm4, 15
- psrlw mm5, 15
-
- psllw mm4, 2
- psllw mm5, 2
-
- paddw mm0, mm4
- paddw mm1, mm5
- paddw mm2, mm4
- paddw mm3, mm5
-
- psraw mm0, 3
- psraw mm1, 3
- psraw mm2, 3
- psraw mm3, 3
-
- movq [rdi ], mm0
- movq [rdi+ 8], mm1
- movq [rdi+16], mm2
- movq [rdi+24], mm3
-
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_GOT
- UNSHADOW_ARGS
- pop rbp
- ret
-
-
-;void vp8_short_fdct8x4_wmt(short *input, short *output, int pitch)
-sym(vp8_short_fdct8x4_wmt):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 3
- GET_GOT rbx
- push rsi
- push rdi
- ; end prolog
- mov rsi, arg(0) ;input
- mov rdi, arg(1) ;output
-
- lea rdx, [GLOBAL(dct_const_xmm)]
- movsxd rax, dword ptr arg(2) ;pitch
-
- lea rcx, [rsi + rax*2]
- ; read the input data
- movdqa xmm0, [rsi]
- movdqa xmm2, [rsi + rax]
-
- movdqa xmm4, [rcx]
- movdqa xmm3, [rcx + rax]
- ; get the constants
- ;shift to left by 1 for prescision
- psllw xmm0, 3
- psllw xmm2, 3
-
- psllw xmm4, 3
- psllw xmm3, 3
-
- ; transpose for the second stage
- movdqa xmm1, xmm0 ; 00 01 02 03 04 05 06 07
- movdqa xmm5, xmm4 ; 20 21 22 23 24 25 26 27
-
- punpcklwd xmm0, xmm2 ; 00 10 01 11 02 12 03 13
- punpckhwd xmm1, xmm2 ; 04 14 05 15 06 16 07 17
-
- punpcklwd xmm4, xmm3 ; 20 30 21 31 22 32 23 33
- punpckhwd xmm5, xmm3 ; 24 34 25 35 26 36 27 37
-
- movdqa xmm2, xmm0 ; 00 10 01 11 02 12 03 13
- punpckldq xmm0, xmm4 ; 00 10 20 30 01 11 21 31
-
- punpckhdq xmm2, xmm4 ; 02 12 22 32 03 13 23 33
-
-
- movdqa xmm4, xmm1 ; 04 14 05 15 06 16 07 17
- punpckldq xmm4, xmm5 ; 04 14 24 34 05 15 25 35
-
- punpckhdq xmm1, xmm5 ; 06 16 26 36 07 17 27 37
- movdqa xmm3, xmm2 ; 02 12 22 32 03 13 23 33
-
- punpckhqdq xmm3, xmm1 ; 03 13 23 33 07 17 27 37
- punpcklqdq xmm2, xmm1 ; 02 12 22 32 06 16 26 36
-
- movdqa xmm1, xmm0 ; 00 10 20 30 01 11 21 31
- punpcklqdq xmm0, xmm4 ; 00 10 20 30 04 14 24 34
-
- punpckhqdq xmm1, xmm4 ; 01 11 21 32 05 15 25 35
-
- ; xmm0 0
- ; xmm1 1
- ; xmm2 2
- ; xmm3 3
-
- ; first stage
- movdqa xmm5, xmm0
- movdqa xmm4, xmm1
-
- paddw xmm0, xmm3 ; a = 0 + 3
- paddw xmm1, xmm2 ; b = 1 + 2
-
- psubw xmm4, xmm2 ; c = 1 - 2
- psubw xmm5, xmm3 ; d = 0 - 3
-
-
- ; output 0 and 2
- movdqa xmm6, [rdx + 32] ; c2
- movdqa xmm2, xmm0 ; a
-
- paddw xmm0, xmm1 ; a + b
- psubw xmm2, xmm1 ; a - b
-
- movdqa xmm1, xmm0 ; a + b
- pmulhw xmm0, xmm6 ; 00 01 02 03
-
- paddw xmm0, xmm1 ; output 00 01 02 03
- pmulhw xmm6, xmm2 ; 20 21 22 23
-
- paddw xmm2, xmm6 ; output 20 21 22 23
-
- ; output 1 and 3
- movdqa xmm6, [rdx + 16] ; c1
- movdqa xmm7, [rdx + 48] ; c3
-
- movdqa xmm1, xmm4 ; c
- movdqa xmm3, xmm5 ; d
-
- pmulhw xmm1, xmm7 ; c * c3
- pmulhw xmm3, xmm6 ; d * c1
-
- paddw xmm3, xmm5 ; d * c1 rounded
- paddw xmm1, xmm3 ; output 10 11 12 13
-
- movdqa xmm3, xmm4 ; c
- pmulhw xmm5, xmm7 ; d * c3
-
- pmulhw xmm4, xmm6 ; c * c1
- paddw xmm3, xmm4 ; round c* c1
-
- psubw xmm5, xmm3 ; output 30 31 32 33
- movdqa xmm3, xmm5
-
-
; done with vertical
; transpose for the second stage
- movdqa xmm4, xmm2 ; 02 12 22 32 06 16 26 36
- movdqa xmm2, xmm1 ; 01 11 21 31 05 15 25 35
+ movq mm4, mm0 ; 00 10 20 30
+ movq mm5, mm2 ; 02 12 22 32
- movdqa xmm1, xmm0 ; 00 10 20 30 04 14 24 34
- movdqa xmm5, xmm4 ; 02 12 22 32 06 16 26 36
+ punpcklwd mm0, mm1 ; 00 01 10 11
+ punpckhwd mm4, mm1 ; 20 21 30 31
- punpcklwd xmm0, xmm2 ; 00 01 10 11 20 21 30 31
- punpckhwd xmm1, xmm2 ; 04 05 14 15 24 25 34 35
+ punpcklwd mm2, mm3 ; 02 03 12 13
+ punpckhwd mm5, mm3 ; 22 23 32 33
- punpcklwd xmm4, xmm3 ; 02 03 12 13 22 23 32 33
- punpckhwd xmm5, xmm3 ; 06 07 16 17 26 27 36 37
+ movq mm1, mm0 ; 00 01 10 11
+ punpckldq mm0, mm2 ; 00 01 02 03
- movdqa xmm2, xmm0 ; 00 01 10 11 20 21 30 31
- punpckldq xmm0, xmm4 ; 00 01 02 03 10 11 12 13
+ punpckhdq mm1, mm2 ; 10 11 12 13
- punpckhdq xmm2, xmm4 ; 20 21 22 23 30 31 32 33
+ movq mm2, mm4 ; 20 21 30 31
+ punpckldq mm2, mm5 ; 20 21 22 23
+ punpckhdq mm4, mm5 ; 30 31 32 33
- movdqa xmm4, xmm1 ; 04 05 14 15 24 25 34 35
- punpckldq xmm4, xmm5 ; 04 05 06 07 14 15 16 17
+ ; mm0 0
+ ; mm1 1
+ ; mm2 2
+ ; mm4 3
- punpckhdq xmm1, xmm5 ; 24 25 26 27 34 35 36 37
- movdqa xmm3, xmm2 ; 20 21 22 23 30 31 32 33
+ movq mm5, mm0
+ movq mm3, mm1
- punpckhqdq xmm3, xmm1 ; 30 31 32 33 34 35 36 37
- punpcklqdq xmm2, xmm1 ; 20 21 22 23 24 25 26 27
+ paddw mm0, mm4 ; a1 = 0 + 3
+ paddw mm1, mm2 ; b1 = 1 + 2
- movdqa xmm1, xmm0 ; 00 01 02 03 10 11 12 13
- punpcklqdq xmm0, xmm4 ; 00 01 02 03 04 05 06 07
+ psubw mm3, mm2 ; c1 = 1 - 2
+ psubw mm5, mm4 ; d1 = 0 - 3
- punpckhqdq xmm1, xmm4 ; 10 11 12 13 14 15 16 17
+ pxor mm6, mm6 ; zero out for compare
- ; first stage
- movdqa xmm5, xmm0
- movdqa xmm4, xmm1
+ pcmpeqw mm6, mm5 ; d1 != 0
- paddw xmm0, xmm3 ; a = 0 + 3
- paddw xmm1, xmm2 ; b = 1 + 2
+ pandn mm6, MMWORD PTR[GLOBAL(_cmp_mask)] ; clear upper,
+ ; and keep bit 0 of lower
- psubw xmm4, xmm2 ; c = 1 - 2
- psubw xmm5, xmm3 ; d = 0 - 3
-
-
; output 0 and 2
- movdqa xmm6, [rdx + 32] ; c2
- movdqa xmm2, xmm0 ; a
+ movq mm2, mm0 ; a1
- paddw xmm0, xmm1 ; a + b
- psubw xmm2, xmm1 ; a - b
+ paddw mm0, mm1 ; a1 + b1
+ psubw mm2, mm1 ; a1 - b1
- movdqa xmm1, xmm0 ; a + b
- pmulhw xmm0, xmm6 ; 00 01 02 03
+ paddw mm0, MMWORD PTR[GLOBAL(_7w)]
+ paddw mm2, MMWORD PTR[GLOBAL(_7w)]
- paddw xmm0, xmm1 ; output 00 01 02 03
- pmulhw xmm6, xmm2 ; 20 21 22 23
+ psraw mm0, 4 ; op[0] = (a1 + b1 + 7)>>4
+ psraw mm2, 4 ; op[8] = (a1 - b1 + 7)>>4
- paddw xmm2, xmm6 ; output 20 21 22 23
+ movq MMWORD PTR[rdi + 0 ], mm0
+ movq MMWORD PTR[rdi + 16], mm2
; output 1 and 3
- movdqa xmm6, [rdx + 16] ; c1
- movdqa xmm7, [rdx + 48] ; c3
+ ; interleave c1, d1
+ movq mm1, mm5 ; d1
+ punpcklwd mm1, mm3 ; c1 d1
+ punpckhwd mm5, mm3 ; c1 d1
- movdqa xmm1, xmm4 ; c
- movdqa xmm3, xmm5 ; d
+ movq mm3, mm1
+ movq mm4, mm5
- pmulhw xmm1, xmm7 ; c * c3
- pmulhw xmm3, xmm6 ; d * c1
+ pmaddwd mm1, MMWORD PTR[GLOBAL (_5352_2217)] ; c1*2217 + d1*5352
+ pmaddwd mm4, MMWORD PTR[GLOBAL (_5352_2217)] ; c1*2217 + d1*5352
- paddw xmm3, xmm5 ; d * c1 rounded
- paddw xmm1, xmm3 ; output 10 11 12 13
+ pmaddwd mm3, MMWORD PTR[GLOBAL(_2217_neg5352)] ; d1*2217 - c1*5352
+ pmaddwd mm5, MMWORD PTR[GLOBAL(_2217_neg5352)] ; d1*2217 - c1*5352
- movdqa xmm3, xmm4 ; c
- pmulhw xmm5, xmm7 ; d * c3
+ paddd mm1, MMWORD PTR[GLOBAL(_12000)]
+ paddd mm4, MMWORD PTR[GLOBAL(_12000)]
+ paddd mm3, MMWORD PTR[GLOBAL(_51000)]
+ paddd mm5, MMWORD PTR[GLOBAL(_51000)]
- pmulhw xmm4, xmm6 ; c * c1
- paddw xmm3, xmm4 ; round c* c1
+ psrad mm1, 16 ; (c1 * 2217 + d1 * 5352 + 12000)>>16
+ psrad mm4, 16 ; (c1 * 2217 + d1 * 5352 + 12000)>>16
+ psrad mm3, 16 ; (d1 * 2217 - c1 * 5352 + 51000)>>16
+ psrad mm5, 16 ; (d1 * 2217 - c1 * 5352 + 51000)>>16
- psubw xmm5, xmm3 ; output 30 31 32 33
- movdqa xmm3, xmm5
- ; done with vertical
+ packssdw mm1, mm4 ; op[4]
+ packssdw mm3, mm5 ; op[12]
+ paddw mm1, mm6 ; op[4] += (d1!=0)
- pcmpeqw xmm4, xmm4
- pcmpeqw xmm5, xmm5;
- psrlw xmm4, 15
- psrlw xmm5, 15
+ movq MMWORD PTR[rdi + 8 ], mm1
+ movq MMWORD PTR[rdi + 24], mm3
- psllw xmm4, 2
- psllw xmm5, 2
-
- paddw xmm0, xmm4
- paddw xmm1, xmm5
- paddw xmm2, xmm4
- paddw xmm3, xmm5
-
- psraw xmm0, 3
- psraw xmm1, 3
- psraw xmm2, 3
- psraw xmm3, 3
-
- movq QWORD PTR[rdi ], xmm0
- movq QWORD PTR[rdi+ 8], xmm1
- movq QWORD PTR[rdi+16], xmm2
- movq QWORD PTR[rdi+24], xmm3
-
- psrldq xmm0, 8
- psrldq xmm1, 8
- psrldq xmm2, 8
- psrldq xmm3, 8
-
- movq QWORD PTR[rdi+32], xmm0
- movq QWORD PTR[rdi+40], xmm1
- movq QWORD PTR[rdi+48], xmm2
- movq QWORD PTR[rdi+56], xmm3
- ; begin epilog
- pop rdi
- pop rsi
+ ; begin epilog
+ pop rdi
+ pop rsi
RESTORE_GOT
UNSHADOW_ARGS
pop rbp
ret
-
SECTION_RODATA
-;static const unsigned int dct1st_stage_rounding_mmx[2] =
-align 16
-dct1st_stage_rounding_mmx:
- times 2 dd 8192
-
-
-;static const unsigned int dct2nd_stage_rounding_mmx[2] =
-align 16
-dct2nd_stage_rounding_mmx:
- times 2 dd 32768
-
-
-;static const short dct_matrix[4][4]=
-align 16
-dct_matrix:
- times 4 dw 23170
-
- dw 30274
- dw 12540
- dw -12540
- dw -30274
-
- dw 23170
- times 2 dw -23170
- dw 23170
-
- dw 12540
- dw -30274
- dw 30274
- dw -12540
-
-
-;static const unsigned short dct_const_mmx[4 * 4]=
-align 16
-dct_const_mmx:
- times 4 dw 0
- times 4 dw 60547
- times 4 dw 46341
- times 4 dw 25080
-
-
-;static const unsigned short dct_const_xmm[8 * 4]=
-align 16
-dct_const_xmm:
- times 8 dw 0
- times 8 dw 60547
- times 8 dw 46341
- times 8 dw 25080
+align 8
+_5352_2217:
+ dw 5352
+ dw 2217
+ dw 5352
+ dw 2217
+align 8
+_2217_neg5352:
+ dw 2217
+ dw -5352
+ dw 2217
+ dw -5352
+align 8
+_cmp_mask:
+ times 4 dw 1
+align 8
+_7w:
+ times 4 dw 7
+align 8
+_14500:
+ times 2 dd 14500
+align 8
+_7500:
+ times 2 dd 7500
+align 8
+_12000:
+ times 2 dd 12000
+align 8
+_51000:
+ times 2 dd 51000
--- a/vp8/encoder/x86/dct_sse2.asm
+++ b/vp8/encoder/x86/dct_sse2.asm
@@ -11,13 +11,13 @@
%include "vpx_ports/x86_abi_support.asm"
-;void vp8_short_fdct4x4_sse2(short *input, short *output, int pitch)
-global sym(vp8_short_fdct4x4_sse2)
-sym(vp8_short_fdct4x4_sse2):
+%macro STACK_FRAME_CREATE 0
+%if ABI_IS_32BIT
+ %define input rsi
+ %define output rdi
+ %define pitch rax
push rbp
mov rbp, rsp
- SHADOW_ARGS_TO_STACK 3
-;; SAVE_XMM
GET_GOT rbx
push rsi
push rdi
@@ -24,19 +24,55 @@
; end prolog
mov rsi, arg(0)
- movsxd rax, DWORD PTR arg(2)
- lea rdi, [rsi + rax*2]
+ mov rdi, arg(1)
- movq xmm0, MMWORD PTR[rsi ] ;03 02 01 00
- movq xmm2, MMWORD PTR[rsi + rax] ;13 12 11 10
- movq xmm1, MMWORD PTR[rsi + rax*2] ;23 22 21 20
- movq xmm3, MMWORD PTR[rdi + rax] ;33 32 31 30
+ movsxd rax, dword ptr arg(2)
+ lea rcx, [rsi + rax*2]
+%else
+ %ifidn __OUTPUT_FORMAT__,x64
+ %define input rcx
+ %define output rdx
+ %define pitch r8
+ %else
+ %define input rdi
+ %define output rsi
+ %define pitch rdx
+ %endif
+%endif
+%endmacro
+%macro STACK_FRAME_DESTROY 0
+ %define input
+ %define output
+ %define pitch
+
+%if ABI_IS_32BIT
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ pop rbp
+%else
+ %ifidn __OUTPUT_FORMAT__,x64
+ %endif
+%endif
+ ret
+%endmacro
+
+;void vp8_short_fdct4x4_sse2(short *input, short *output, int pitch)
+global sym(vp8_short_fdct4x4_sse2)
+sym(vp8_short_fdct4x4_sse2):
+
+ STACK_FRAME_CREATE
+
+ movq xmm0, MMWORD PTR[input ] ;03 02 01 00
+ movq xmm2, MMWORD PTR[input+ pitch] ;13 12 11 10
+ lea input, [input+2*pitch]
+ movq xmm1, MMWORD PTR[input ] ;23 22 21 20
+ movq xmm3, MMWORD PTR[input+ pitch] ;33 32 31 30
+
punpcklqdq xmm0, xmm2 ;13 12 11 10 03 02 01 00
punpcklqdq xmm1, xmm3 ;33 32 31 30 23 22 21 20
- mov rdi, arg(1)
-
movdqa xmm2, xmm0
punpckldq xmm0, xmm1 ;23 22 03 02 21 20 01 00
punpckhdq xmm2, xmm1 ;33 32 13 12 31 30 11 10
@@ -51,6 +87,7 @@
psubw xmm3, xmm1 ;c1 d1 c1 d1 c1 d1 c1 d1
psllw xmm0, 3 ;b1 <<= 3 a1 <<= 3
psllw xmm3, 3 ;c1 <<= 3 d1 <<= 3
+
movdqa xmm1, xmm0
pmaddwd xmm0, XMMWORD PTR[GLOBAL(_mult_add)] ;a1 + b1
pmaddwd xmm1, XMMWORD PTR[GLOBAL(_mult_sub)] ;a1 - b1
@@ -121,18 +158,217 @@
punpcklqdq xmm0, xmm3 ;op[4] op[0]
punpckhqdq xmm1, xmm3 ;op[12] op[8]
- movdqa XMMWORD PTR[rdi + 0], xmm0
- movdqa XMMWORD PTR[rdi + 16], xmm1
+ movdqa XMMWORD PTR[output + 0], xmm0
+ movdqa XMMWORD PTR[output + 16], xmm1
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_GOT
-;; RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
+ STACK_FRAME_DESTROY
+;void vp8_short_fdct8x4_sse2(short *input, short *output, int pitch)
+global sym(vp8_short_fdct8x4_sse2)
+sym(vp8_short_fdct8x4_sse2):
+
+ STACK_FRAME_CREATE
+
+ ; read four rows of eight 16-bit samples (two side-by-side 4x4 blocks); pitch is a byte offset
+ movdqa xmm0, [input ]
+ movdqa xmm2, [input+ pitch]
+ lea input, [input+2*pitch]
+ movdqa xmm4, [input ]
+ movdqa xmm3, [input+ pitch]
+
+ ; transpose for the first stage
+ movdqa xmm1, xmm0 ; 00 01 02 03 04 05 06 07
+ movdqa xmm5, xmm4 ; 20 21 22 23 24 25 26 27
+
+ punpcklwd xmm0, xmm2 ; 00 10 01 11 02 12 03 13
+ punpckhwd xmm1, xmm2 ; 04 14 05 15 06 16 07 17
+
+ punpcklwd xmm4, xmm3 ; 20 30 21 31 22 32 23 33
+ punpckhwd xmm5, xmm3 ; 24 34 25 35 26 36 27 37
+
+ movdqa xmm2, xmm0 ; 00 10 01 11 02 12 03 13
+ punpckldq xmm0, xmm4 ; 00 10 20 30 01 11 21 31
+
+ punpckhdq xmm2, xmm4 ; 02 12 22 32 03 13 23 33
+
+ movdqa xmm4, xmm1 ; 04 14 05 15 06 16 07 17
+ punpckldq xmm4, xmm5 ; 04 14 24 34 05 15 25 35
+
+ punpckhdq xmm1, xmm5 ; 06 16 26 36 07 17 27 37
+ movdqa xmm3, xmm2 ; 02 12 22 32 03 13 23 33
+
+ punpckhqdq xmm3, xmm1 ; 03 13 23 33 07 17 27 37
+ punpcklqdq xmm2, xmm1 ; 02 12 22 32 06 16 26 36
+
+ movdqa xmm1, xmm0 ; 00 10 20 30 01 11 21 31
+ punpcklqdq xmm0, xmm4 ; 00 10 20 30 04 14 24 34
+
+ punpckhqdq xmm1, xmm4 ; 01 11 21 31 05 15 25 35
+
+ ; xmm0 0
+ ; xmm1 1
+ ; xmm2 2
+ ; xmm3 3
+
+ ; first stage
+ movdqa xmm5, xmm0 ; copy of 0, becomes d1
+ movdqa xmm4, xmm1 ; copy of 1, becomes c1
+
+ paddw xmm0, xmm3 ; a1 = 0 + 3
+ paddw xmm1, xmm2 ; b1 = 1 + 2
+
+ psubw xmm4, xmm2 ; c1 = 1 - 2
+ psubw xmm5, xmm3 ; d1 = 0 - 3
+
+ psllw xmm5, 3 ; d1 <<= 3
+ psllw xmm4, 3 ; c1 <<= 3
+
+ psllw xmm0, 3 ; a1 <<= 3
+ psllw xmm1, 3 ; b1 <<= 3
+
+ ; output 0 and 2
+ movdqa xmm2, xmm0 ; a1
+
+ paddw xmm0, xmm1 ; op[0] = a1 + b1
+ psubw xmm2, xmm1 ; op[2] = a1 - b1
+
+ ; output 1 and 3
+ ; interleave c1, d1
+ movdqa xmm1, xmm5 ; d1
+ punpcklwd xmm1, xmm4 ; c1 d1
+ punpckhwd xmm5, xmm4 ; c1 d1
+
+ movdqa xmm3, xmm1
+ movdqa xmm4, xmm5
+
+ pmaddwd xmm1, XMMWORD PTR[GLOBAL (_5352_2217)] ; c1*2217 + d1*5352
+ pmaddwd xmm4, XMMWORD PTR[GLOBAL (_5352_2217)] ; c1*2217 + d1*5352
+
+ pmaddwd xmm3, XMMWORD PTR[GLOBAL(_2217_neg5352)] ; d1*2217 - c1*5352
+ pmaddwd xmm5, XMMWORD PTR[GLOBAL(_2217_neg5352)] ; d1*2217 - c1*5352
+
+ paddd xmm1, XMMWORD PTR[GLOBAL(_14500)]
+ paddd xmm4, XMMWORD PTR[GLOBAL(_14500)]
+ paddd xmm3, XMMWORD PTR[GLOBAL(_7500)]
+ paddd xmm5, XMMWORD PTR[GLOBAL(_7500)]
+
+ psrad xmm1, 12 ; (c1 * 2217 + d1 * 5352 + 14500)>>12
+ psrad xmm4, 12 ; (c1 * 2217 + d1 * 5352 + 14500)>>12
+ psrad xmm3, 12 ; (d1 * 2217 - c1 * 5352 + 7500)>>12
+ psrad xmm5, 12 ; (d1 * 2217 - c1 * 5352 + 7500)>>12
+
+ packssdw xmm1, xmm4 ; op[1]
+ packssdw xmm3, xmm5 ; op[3]
+
+ ; done with vertical
+ ; transpose for the second stage
+ movdqa xmm4, xmm0 ; 00 10 20 30 04 14 24 34
+ movdqa xmm5, xmm2 ; 02 12 22 32 06 16 26 36
+
+ punpcklwd xmm0, xmm1 ; 00 01 10 11 20 21 30 31
+ punpckhwd xmm4, xmm1 ; 04 05 14 15 24 25 34 35
+
+ punpcklwd xmm2, xmm3 ; 02 03 12 13 22 23 32 33
+ punpckhwd xmm5, xmm3 ; 06 07 16 17 26 27 36 37
+
+ movdqa xmm1, xmm0 ; 00 01 10 11 20 21 30 31
+ punpckldq xmm0, xmm2 ; 00 01 02 03 10 11 12 13
+
+ punpckhdq xmm1, xmm2 ; 20 21 22 23 30 31 32 33
+
+ movdqa xmm2, xmm4 ; 04 05 14 15 24 25 34 35
+ punpckldq xmm2, xmm5 ; 04 05 06 07 14 15 16 17
+
+ punpckhdq xmm4, xmm5 ; 24 25 26 27 34 35 36 37
+ movdqa xmm3, xmm1 ; 20 21 22 23 30 31 32 33
+
+ punpckhqdq xmm3, xmm4 ; 30 31 32 33 34 35 36 37
+ punpcklqdq xmm1, xmm4 ; 20 21 22 23 24 25 26 27
+
+ movdqa xmm4, xmm0 ; 00 01 02 03 10 11 12 13
+ punpcklqdq xmm0, xmm2 ; 00 01 02 03 04 05 06 07
+
+ punpckhqdq xmm4, xmm2 ; 10 11 12 13 14 15 16 17
+
+ ; xmm0 0
+ ; xmm4 1
+ ; xmm1 2
+ ; xmm3 3
+
+ movdqa xmm5, xmm0 ; copy of 0, becomes d1
+ movdqa xmm2, xmm1 ; copy of 2, subtracted from 1 below
+
+ paddw xmm0, xmm3 ; a1 = 0 + 3
+ paddw xmm1, xmm4 ; b1 = 1 + 2
+
+ psubw xmm4, xmm2 ; c1 = 1 - 2
+ psubw xmm5, xmm3 ; d1 = 0 - 3
+
+ pxor xmm6, xmm6 ; zero out for compare
+
+ pcmpeqw xmm6, xmm5 ; 0xffff in each word where d1 == 0
+
+ pandn xmm6, XMMWORD PTR[GLOBAL(_cmp_mask8x4)] ; invert and keep bit 0:
+ ; 1 in each of the 8 words where d1 != 0
+
+ ; output 0 and 2
+ movdqa xmm2, xmm0 ; a1
+
+ paddw xmm0, xmm1 ; a1 + b1
+ psubw xmm2, xmm1 ; a1 - b1
+
+ paddw xmm0, XMMWORD PTR[GLOBAL(_7w)]
+ paddw xmm2, XMMWORD PTR[GLOBAL(_7w)]
+
+ psraw xmm0, 4 ; op[0] = (a1 + b1 + 7)>>4
+ psraw xmm2, 4 ; op[8] = (a1 - b1 + 7)>>4
+
+ ; output 1 and 3
+ ; interleave c1, d1
+ movdqa xmm1, xmm5 ; d1
+ punpcklwd xmm1, xmm4 ; c1 d1
+ punpckhwd xmm5, xmm4 ; c1 d1
+
+ movdqa xmm3, xmm1
+ movdqa xmm4, xmm5
+
+ pmaddwd xmm1, XMMWORD PTR[GLOBAL (_5352_2217)] ; c1*2217 + d1*5352
+ pmaddwd xmm4, XMMWORD PTR[GLOBAL (_5352_2217)] ; c1*2217 + d1*5352
+
+ pmaddwd xmm3, XMMWORD PTR[GLOBAL(_2217_neg5352)] ; d1*2217 - c1*5352
+ pmaddwd xmm5, XMMWORD PTR[GLOBAL(_2217_neg5352)] ; d1*2217 - c1*5352
+
+ paddd xmm1, XMMWORD PTR[GLOBAL(_12000)]
+ paddd xmm4, XMMWORD PTR[GLOBAL(_12000)]
+ paddd xmm3, XMMWORD PTR[GLOBAL(_51000)]
+ paddd xmm5, XMMWORD PTR[GLOBAL(_51000)]
+
+ psrad xmm1, 16 ; (c1 * 2217 + d1 * 5352 + 12000)>>16
+ psrad xmm4, 16 ; (c1 * 2217 + d1 * 5352 + 12000)>>16
+ psrad xmm3, 16 ; (d1 * 2217 - c1 * 5352 + 51000)>>16
+ psrad xmm5, 16 ; (d1 * 2217 - c1 * 5352 + 51000)>>16
+
+ packssdw xmm1, xmm4 ; op[4]
+ packssdw xmm3, xmm5 ; op[12]
+
+ paddw xmm1, xmm6 ; op[4] += (d1!=0)
+
+ movdqa xmm4, xmm0
+ movdqa xmm5, xmm2
+
+ punpcklqdq xmm0, xmm1 ; low qwords of op[0] and op[4]
+ punpckhqdq xmm4, xmm1 ; high qwords of op[0] and op[4]
+
+ punpcklqdq xmm2, xmm3 ; low qwords of op[8] and op[12]
+ punpckhqdq xmm5, xmm3 ; high qwords of op[8] and op[12]
+
+ movdqa XMMWORD PTR[output + 0 ], xmm0
+ movdqa XMMWORD PTR[output + 16], xmm2
+ movdqa XMMWORD PTR[output + 32], xmm4
+ movdqa XMMWORD PTR[output + 48], xmm5
+
+ STACK_FRAME_DESTROY
+
SECTION_RODATA
align 16
_5352_2217:
@@ -161,8 +397,10 @@
_cmp_mask:
times 4 dw 1
times 4 dw 0
-
align 16
+_cmp_mask8x4:
+ times 8 dw 1
+align 16
_mult_sub:
dw 1
dw -1
@@ -175,6 +413,9 @@
align 16
_7:
times 4 dd 7
+align 16
+_7w:
+ times 8 dw 7
align 16
_14500:
times 4 dd 14500
--- a/vp8/encoder/x86/dct_x86.h
+++ b/vp8/encoder/x86/dct_x86.h
@@ -24,33 +24,31 @@
extern prototype_fdct(vp8_short_fdct8x4_mmx);
#if !CONFIG_RUNTIME_CPU_DETECT
-#if 0
+
#undef vp8_fdct_short4x4
#define vp8_fdct_short4x4 vp8_short_fdct4x4_mmx
#undef vp8_fdct_short8x4
#define vp8_fdct_short8x4 vp8_short_fdct8x4_mmx
+
#endif
#endif
-#endif
#if HAVE_SSE2
-extern prototype_fdct(vp8_short_fdct8x4_wmt);
+extern prototype_fdct(vp8_short_fdct8x4_sse2);
extern prototype_fdct(vp8_short_walsh4x4_sse2);
extern prototype_fdct(vp8_short_fdct4x4_sse2);
#if !CONFIG_RUNTIME_CPU_DETECT
-#if 1
-/* short SSE2 DCT currently disabled, does not match the MMX version */
+
#undef vp8_fdct_short4x4
#define vp8_fdct_short4x4 vp8_short_fdct4x4_sse2
#undef vp8_fdct_short8x4
#define vp8_fdct_short8x4 vp8_short_fdct8x4_sse2
-#endif
#undef vp8_fdct_fast4x4
#define vp8_fdct_fast4x4 vp8_short_fdct4x4_sse2
@@ -58,7 +56,7 @@
#undef vp8_fdct_fast8x4
#define vp8_fdct_fast8x4 vp8_short_fdct8x4_sse2
-#undef vp8_fdct_walsh_short4x4
+#undef vp8_fdct_walsh_short4x4
#define vp8_fdct_walsh_short4x4 vp8_short_walsh4x4_sse2
#endif
--- a/vp8/encoder/x86/sad_sse3.asm
+++ b/vp8/encoder/x86/sad_sse3.asm
@@ -8,24 +8,171 @@
; be found in the AUTHORS file in the root of the source tree.
;
-
%include "vpx_ports/x86_abi_support.asm"
-%macro PROCESS_16X2X3 1
-%if %1
- movdqa xmm0, XMMWORD PTR [rsi]
- lddqu xmm5, XMMWORD PTR [rdi]
- lddqu xmm6, XMMWORD PTR [rdi+1]
- lddqu xmm7, XMMWORD PTR [rdi+2]
+%macro STACK_FRAME_CREATE_X3 0
+%if ABI_IS_32BIT
+ %define src_ptr rsi
+ %define src_stride rax
+ %define ref_ptr rdi
+ %define ref_stride rdx
+ %define end_ptr rcx
+ %define ret_var rbx
+ %define result_ptr arg(4)
+ %define max_err arg(4)
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 5
+ push rsi ; rsi/rdi are reused as src_ptr/ref_ptr below, so the caller's values are saved first
+ push rdi ; NOTE(review): ret_var aliases rbx on this path but rbx is not saved here - confirm this is safe
+ mov rsi, arg(0) ; src_ptr
+ mov rdi, arg(2) ; ref_ptr
+
+ movsxd rax, dword ptr arg(1) ; src_stride, sign-extended from int
+ movsxd rdx, dword ptr arg(3) ; ref_stride, sign-extended from int
+%else
+ %ifidn __OUTPUT_FORMAT__,x64
+ %define src_ptr rcx
+ %define src_stride rdx
+ %define ref_ptr r8
+ %define ref_stride r9
+ %define end_ptr r10
+ %define ret_var r11
+ %define result_ptr [rsp+8+4*8]
+ %define max_err [rsp+8+4*8]
+ %else
+ %define src_ptr rdi
+ %define src_stride rsi
+ %define ref_ptr rdx
+ %define ref_stride rcx
+ %define end_ptr r9
+ %define ret_var r10
+ %define result_ptr r8
+ %define max_err r8
+ %endif
+%endif
+
+%endmacro
+
+%macro STACK_FRAME_DESTROY_X3 0
+ %define src_ptr
+ %define src_stride
+ %define ref_ptr
+ %define ref_stride
+ %define end_ptr
+ %define ret_var
+ %define result_ptr
+ %define max_err
+
+%if ABI_IS_32BIT
+ pop rdi ; 32-bit only: undo the pushes made by STACK_FRAME_CREATE_X3, last pushed first
+ pop rsi
+ UNSHADOW_ARGS
+ pop rbp
+%else
+ %ifidn __OUTPUT_FORMAT__,x64
+ %endif
+%endif
+ ret ; shared by every ABI variant: the macro ends the function
+%endmacro
+
+%macro STACK_FRAME_CREATE_X4 0
+%if ABI_IS_32BIT
+ %define src_ptr rsi
+ %define src_stride rax
+ %define r0_ptr rcx
+ %define r1_ptr rdx
+ %define r2_ptr rbx
+ %define r3_ptr rdi
+ %define ref_stride rbp
+ %define result_ptr arg(4)
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 5
+ push rsi ; rsi/rdi/rbx are reused as src_ptr/r3_ptr/r2_ptr, so the caller's values are saved first
+ push rdi
+ push rbx
+
+ push rbp ; second save: rbp is reused as ref_stride below; the x4d function bodies pop it before reading result_ptr
+ mov rdi, arg(2) ; ref_ptr_base
+
+ LOAD_X4_ADDRESSES rdi, rcx, rdx, rax, rdi
+
+ mov rsi, arg(0) ; src_ptr
+
+ movsxd rbx, dword ptr arg(1) ; src_stride
+ movsxd rbp, dword ptr arg(3) ; ref_stride; last arg() read in this macro, rbp now holds the stride
+
+ xchg rbx, rax ; swap so rax = src_stride and rbx = third reference pointer, matching the defines above
+%else
+ %ifidn __OUTPUT_FORMAT__,x64
+ %define src_ptr rcx
+ %define src_stride rdx
+ %define r0_ptr rsi
+ %define r1_ptr r10
+ %define r2_ptr r11
+ %define r3_ptr r8
+ %define ref_stride r9
+ %define result_ptr [rsp+16+4*8]
+ push rsi ; rsi is reused as r0_ptr; result_ptr above is rsp-relative and includes these 8 bytes
+
+ LOAD_X4_ADDRESSES r8, r0_ptr, r1_ptr, r2_ptr, r3_ptr
+ %else
+ %define src_ptr rdi
+ %define src_stride rsi
+ %define r0_ptr r9
+ %define r1_ptr r10
+ %define r2_ptr r11
+ %define r3_ptr rdx
+ %define ref_stride rcx
+ %define result_ptr r8
+
+ LOAD_X4_ADDRESSES rdx, r0_ptr, r1_ptr, r2_ptr, r3_ptr
+
+ %endif
+%endif
+%endmacro
+
+%macro STACK_FRAME_DESTROY_X4 0
+ %define src_ptr
+ %define src_stride
+ %define r0_ptr
+ %define r1_ptr
+ %define r2_ptr
+ %define r3_ptr
+ %define ref_stride
+ %define result_ptr
+
+%if ABI_IS_32BIT
+ pop rbx ; 32-bit only: undo STACK_FRAME_CREATE_X4's pushes, last pushed first; the extra rbp save is popped by the function body
+ pop rdi
+ pop rsi
+ UNSHADOW_ARGS
+ pop rbp
+%else
+ %ifidn __OUTPUT_FORMAT__,x64
+ pop rsi ; matches the push rsi in the x64 branch of STACK_FRAME_CREATE_X4
+ %endif
+%endif
+ ret ; shared by every ABI variant: the macro ends the function
+%endmacro
+
+%macro PROCESS_16X2X3 5
+%if %1==0
+ movdqa xmm0, XMMWORD PTR [%2]
+ lddqu xmm5, XMMWORD PTR [%3]
+ lddqu xmm6, XMMWORD PTR [%3+1]
+ lddqu xmm7, XMMWORD PTR [%3+2]
+
psadbw xmm5, xmm0
psadbw xmm6, xmm0
psadbw xmm7, xmm0
%else
- movdqa xmm0, XMMWORD PTR [rsi]
- lddqu xmm1, XMMWORD PTR [rdi]
- lddqu xmm2, XMMWORD PTR [rdi+1]
- lddqu xmm3, XMMWORD PTR [rdi+2]
+ movdqa xmm0, XMMWORD PTR [%2]
+ lddqu xmm1, XMMWORD PTR [%3]
+ lddqu xmm2, XMMWORD PTR [%3+1]
+ lddqu xmm3, XMMWORD PTR [%3+2]
psadbw xmm1, xmm0
psadbw xmm2, xmm0
@@ -35,13 +182,15 @@
paddw xmm6, xmm2
paddw xmm7, xmm3
%endif
- movdqa xmm0, XMMWORD PTR [rsi+rax]
- lddqu xmm1, XMMWORD PTR [rdi+rdx]
- lddqu xmm2, XMMWORD PTR [rdi+rdx+1]
- lddqu xmm3, XMMWORD PTR [rdi+rdx+2]
+ movdqa xmm0, XMMWORD PTR [%2+%4]
+ lddqu xmm1, XMMWORD PTR [%3+%5]
+ lddqu xmm2, XMMWORD PTR [%3+%5+1]
+ lddqu xmm3, XMMWORD PTR [%3+%5+2]
- lea rsi, [rsi+rax*2]
- lea rdi, [rdi+rdx*2]
+%if %1==0 || %1==1
+ lea %2, [%2+%4*2]
+ lea %3, [%3+%5*2]
+%endif
psadbw xmm1, xmm0
psadbw xmm2, xmm0
@@ -52,21 +201,21 @@
paddw xmm7, xmm3
%endmacro
-%macro PROCESS_8X2X3 1
-%if %1
- movq mm0, QWORD PTR [rsi]
- movq mm5, QWORD PTR [rdi]
- movq mm6, QWORD PTR [rdi+1]
- movq mm7, QWORD PTR [rdi+2]
+%macro PROCESS_8X2X3 5
+%if %1==0
+ movq mm0, QWORD PTR [%2]
+ movq mm5, QWORD PTR [%3]
+ movq mm6, QWORD PTR [%3+1]
+ movq mm7, QWORD PTR [%3+2]
psadbw mm5, mm0
psadbw mm6, mm0
psadbw mm7, mm0
%else
- movq mm0, QWORD PTR [rsi]
- movq mm1, QWORD PTR [rdi]
- movq mm2, QWORD PTR [rdi+1]
- movq mm3, QWORD PTR [rdi+2]
+ movq mm0, QWORD PTR [%2]
+ movq mm1, QWORD PTR [%3]
+ movq mm2, QWORD PTR [%3+1]
+ movq mm3, QWORD PTR [%3+2]
psadbw mm1, mm0
psadbw mm2, mm0
@@ -76,13 +225,15 @@
paddw mm6, mm2
paddw mm7, mm3
%endif
- movq mm0, QWORD PTR [rsi+rax]
- movq mm1, QWORD PTR [rdi+rdx]
- movq mm2, QWORD PTR [rdi+rdx+1]
- movq mm3, QWORD PTR [rdi+rdx+2]
+ movq mm0, QWORD PTR [%2+%4]
+ movq mm1, QWORD PTR [%3+%5]
+ movq mm2, QWORD PTR [%3+%5+1]
+ movq mm3, QWORD PTR [%3+%5+2]
- lea rsi, [rsi+rax*2]
- lea rdi, [rdi+rdx*2]
+%if %1==0 || %1==1
+ lea %2, [%2+%4*2]
+ lea %3, [%3+%5*2]
+%endif
psadbw mm1, mm0
psadbw mm2, mm0
@@ -101,13 +252,13 @@
mov %5, [%1+REG_SZ_BYTES*3]
%endmacro
-%macro PROCESS_16X2X4 1
-%if %1
- movdqa xmm0, XMMWORD PTR [rsi]
- lddqu xmm4, XMMWORD PTR [rcx]
- lddqu xmm5, XMMWORD PTR [rdx]
- lddqu xmm6, XMMWORD PTR [rbx]
- lddqu xmm7, XMMWORD PTR [rdi]
+%macro PROCESS_16X2X4 8
+%if %1==0
+ movdqa xmm0, XMMWORD PTR [%2]
+ lddqu xmm4, XMMWORD PTR [%3]
+ lddqu xmm5, XMMWORD PTR [%4]
+ lddqu xmm6, XMMWORD PTR [%5]
+ lddqu xmm7, XMMWORD PTR [%6]
psadbw xmm4, xmm0
psadbw xmm5, xmm0
@@ -114,10 +265,10 @@
psadbw xmm6, xmm0
psadbw xmm7, xmm0
%else
- movdqa xmm0, XMMWORD PTR [rsi]
- lddqu xmm1, XMMWORD PTR [rcx]
- lddqu xmm2, XMMWORD PTR [rdx]
- lddqu xmm3, XMMWORD PTR [rbx]
+ movdqa xmm0, XMMWORD PTR [%2]
+ lddqu xmm1, XMMWORD PTR [%3]
+ lddqu xmm2, XMMWORD PTR [%4]
+ lddqu xmm3, XMMWORD PTR [%5]
psadbw xmm1, xmm0
psadbw xmm2, xmm0
@@ -124,7 +275,7 @@
psadbw xmm3, xmm0
paddw xmm4, xmm1
- lddqu xmm1, XMMWORD PTR [rdi]
+ lddqu xmm1, XMMWORD PTR [%6]
paddw xmm5, xmm2
paddw xmm6, xmm3
@@ -131,10 +282,10 @@
psadbw xmm1, xmm0
paddw xmm7, xmm1
%endif
- movdqa xmm0, XMMWORD PTR [rsi+rax]
- lddqu xmm1, XMMWORD PTR [rcx+rbp]
- lddqu xmm2, XMMWORD PTR [rdx+rbp]
- lddqu xmm3, XMMWORD PTR [rbx+rbp]
+ movdqa xmm0, XMMWORD PTR [%2+%7]
+ lddqu xmm1, XMMWORD PTR [%3+%8]
+ lddqu xmm2, XMMWORD PTR [%4+%8]
+ lddqu xmm3, XMMWORD PTR [%5+%8]
psadbw xmm1, xmm0
psadbw xmm2, xmm0
@@ -141,30 +292,31 @@
psadbw xmm3, xmm0
paddw xmm4, xmm1
- lddqu xmm1, XMMWORD PTR [rdi+rbp]
+ lddqu xmm1, XMMWORD PTR [%6+%8]
paddw xmm5, xmm2
paddw xmm6, xmm3
- lea rsi, [rsi+rax*2]
- lea rcx, [rcx+rbp*2]
+%if %1==0 || %1==1
+ lea %2, [%2+%7*2]
+ lea %3, [%3+%8*2]
- lea rdx, [rdx+rbp*2]
- lea rbx, [rbx+rbp*2]
+ lea %4, [%4+%8*2]
+ lea %5, [%5+%8*2]
- lea rdi, [rdi+rbp*2]
-
+ lea %6, [%6+%8*2]
+%endif
psadbw xmm1, xmm0
paddw xmm7, xmm1
%endmacro
-%macro PROCESS_8X2X4 1
-%if %1
- movq mm0, QWORD PTR [rsi]
- movq mm4, QWORD PTR [rcx]
- movq mm5, QWORD PTR [rdx]
- movq mm6, QWORD PTR [rbx]
- movq mm7, QWORD PTR [rdi]
+%macro PROCESS_8X2X4 8
+%if %1==0
+ movq mm0, QWORD PTR [%2]
+ movq mm4, QWORD PTR [%3]
+ movq mm5, QWORD PTR [%4]
+ movq mm6, QWORD PTR [%5]
+ movq mm7, QWORD PTR [%6]
psadbw mm4, mm0
psadbw mm5, mm0
@@ -171,10 +323,10 @@
psadbw mm6, mm0
psadbw mm7, mm0
%else
- movq mm0, QWORD PTR [rsi]
- movq mm1, QWORD PTR [rcx]
- movq mm2, QWORD PTR [rdx]
- movq mm3, QWORD PTR [rbx]
+ movq mm0, QWORD PTR [%2]
+ movq mm1, QWORD PTR [%3]
+ movq mm2, QWORD PTR [%4]
+ movq mm3, QWORD PTR [%5]
psadbw mm1, mm0
psadbw mm2, mm0
@@ -181,7 +333,7 @@
psadbw mm3, mm0
paddw mm4, mm1
- movq mm1, QWORD PTR [rdi]
+ movq mm1, QWORD PTR [%6]
paddw mm5, mm2
paddw mm6, mm3
@@ -188,10 +340,10 @@
psadbw mm1, mm0
paddw mm7, mm1
%endif
- movq mm0, QWORD PTR [rsi+rax]
- movq mm1, QWORD PTR [rcx+rbp]
- movq mm2, QWORD PTR [rdx+rbp]
- movq mm3, QWORD PTR [rbx+rbp]
+ movq mm0, QWORD PTR [%2+%7]
+ movq mm1, QWORD PTR [%3+%8]
+ movq mm2, QWORD PTR [%4+%8]
+ movq mm3, QWORD PTR [%5+%8]
psadbw mm1, mm0
psadbw mm2, mm0
@@ -198,18 +350,19 @@
psadbw mm3, mm0
paddw mm4, mm1
- movq mm1, QWORD PTR [rdi+rbp]
+ movq mm1, QWORD PTR [%6+%8]
paddw mm5, mm2
paddw mm6, mm3
- lea rsi, [rsi+rax*2]
- lea rcx, [rcx+rbp*2]
+%if %1==0 || %1==1
+ lea %2, [%2+%7*2]
+ lea %3, [%3+%8*2]
- lea rdx, [rdx+rbp*2]
- lea rbx, [rbx+rbp*2]
+ lea %4, [%4+%8*2]
+ lea %5, [%5+%8*2]
- lea rdi, [rdi+rbp*2]
-
+ lea %6, [%6+%8*2]
+%endif
psadbw mm1, mm0
paddw mm7, mm1
@@ -223,54 +376,39 @@
; int *results)
global sym(vp8_sad16x16x3_sse3)
sym(vp8_sad16x16x3_sse3):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 5
- push rsi
- push rdi
- ; end prolog
- mov rsi, arg(0) ;src_ptr
- mov rdi, arg(2) ;ref_ptr
+ STACK_FRAME_CREATE_X3
- movsxd rax, dword ptr arg(1) ;src_stride
- movsxd rdx, dword ptr arg(3) ;ref_stride
+ PROCESS_16X2X3 0, src_ptr, ref_ptr, src_stride, ref_stride
+ PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
+ PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
+ PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
+ PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
+ PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
+ PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
+ PROCESS_16X2X3 2, src_ptr, ref_ptr, src_stride, ref_stride
- PROCESS_16X2X3 1
- PROCESS_16X2X3 0
- PROCESS_16X2X3 0
- PROCESS_16X2X3 0
- PROCESS_16X2X3 0
- PROCESS_16X2X3 0
- PROCESS_16X2X3 0
- PROCESS_16X2X3 0
+ mov rcx, result_ptr
- mov rdi, arg(4) ;Results
-
movq xmm0, xmm5
psrldq xmm5, 8
paddw xmm0, xmm5
- movd [rdi], xmm0
+ movd [rcx], xmm0
;-
movq xmm0, xmm6
psrldq xmm6, 8
paddw xmm0, xmm6
- movd [rdi+4], xmm0
+ movd [rcx+4], xmm0
;-
movq xmm0, xmm7
psrldq xmm7, 8
paddw xmm0, xmm7
- movd [rdi+8], xmm0
+ movd [rcx+8], xmm0
- ; begin epilog
- pop rdi
- pop rsi
- UNSHADOW_ARGS
- pop rbp
- ret
+ STACK_FRAME_DESTROY_X3
;void int vp8_sad16x8x3_sse3(
; unsigned char *src_ptr,
@@ -280,50 +418,35 @@
; int *results)
global sym(vp8_sad16x8x3_sse3)
sym(vp8_sad16x8x3_sse3):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 5
- push rsi
- push rdi
- ; end prolog
- mov rsi, arg(0) ;src_ptr
- mov rdi, arg(2) ;ref_ptr
+ STACK_FRAME_CREATE_X3
- movsxd rax, dword ptr arg(1) ;src_stride
- movsxd rdx, dword ptr arg(3) ;ref_stride
+ PROCESS_16X2X3 0, src_ptr, ref_ptr, src_stride, ref_stride
+ PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
+ PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
+ PROCESS_16X2X3 2, src_ptr, ref_ptr, src_stride, ref_stride
- PROCESS_16X2X3 1
- PROCESS_16X2X3 0
- PROCESS_16X2X3 0
- PROCESS_16X2X3 0
+ mov rcx, result_ptr
- mov rdi, arg(4) ;Results
-
movq xmm0, xmm5
psrldq xmm5, 8
paddw xmm0, xmm5
- movd [rdi], xmm0
+ movd [rcx], xmm0
;-
movq xmm0, xmm6
psrldq xmm6, 8
paddw xmm0, xmm6
- movd [rdi+4], xmm0
+ movd [rcx+4], xmm0
;-
movq xmm0, xmm7
psrldq xmm7, 8
paddw xmm0, xmm7
- movd [rdi+8], xmm0
+ movd [rcx+8], xmm0
- ; begin epilog
- pop rdi
- pop rsi
- UNSHADOW_ARGS
- pop rbp
- ret
+ STACK_FRAME_DESTROY_X3
;void int vp8_sad8x16x3_sse3(
; unsigned char *src_ptr,
@@ -333,40 +456,26 @@
; int *results)
global sym(vp8_sad8x16x3_sse3)
sym(vp8_sad8x16x3_sse3):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 5
- push rsi
- push rdi
- ; end prolog
- mov rsi, arg(0) ;src_ptr
- mov rdi, arg(2) ;ref_ptr
+ STACK_FRAME_CREATE_X3
- movsxd rax, dword ptr arg(1) ;src_stride
- movsxd rdx, dword ptr arg(3) ;ref_stride
+ PROCESS_8X2X3 0, src_ptr, ref_ptr, src_stride, ref_stride
+ PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
+ PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
+ PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
+ PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
+ PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
+ PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
+ PROCESS_8X2X3 2, src_ptr, ref_ptr, src_stride, ref_stride
- PROCESS_8X2X3 1
- PROCESS_8X2X3 0
- PROCESS_8X2X3 0
- PROCESS_8X2X3 0
- PROCESS_8X2X3 0
- PROCESS_8X2X3 0
- PROCESS_8X2X3 0
- PROCESS_8X2X3 0
+ mov rcx, result_ptr
- mov rdi, arg(4) ;Results
+ punpckldq mm5, mm6
- movd [rdi], mm5
- movd [rdi+4], mm6
- movd [rdi+8], mm7
+ movq [rcx], mm5
+ movd [rcx+8], mm7
- ; begin epilog
- pop rdi
- pop rsi
- UNSHADOW_ARGS
- pop rbp
- ret
+ STACK_FRAME_DESTROY_X3
;void int vp8_sad8x8x3_sse3(
; unsigned char *src_ptr,
@@ -376,36 +485,22 @@
; int *results)
global sym(vp8_sad8x8x3_sse3)
sym(vp8_sad8x8x3_sse3):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 5
- push rsi
- push rdi
- ; end prolog
- mov rsi, arg(0) ;src_ptr
- mov rdi, arg(2) ;ref_ptr
+ STACK_FRAME_CREATE_X3
- movsxd rax, dword ptr arg(1) ;src_stride
- movsxd rdx, dword ptr arg(3) ;ref_stride
+ PROCESS_8X2X3 0, src_ptr, ref_ptr, src_stride, ref_stride
+ PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
+ PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
+ PROCESS_8X2X3 2, src_ptr, ref_ptr, src_stride, ref_stride
- PROCESS_8X2X3 1
- PROCESS_8X2X3 0
- PROCESS_8X2X3 0
- PROCESS_8X2X3 0
+ mov rcx, result_ptr
- mov rdi, arg(4) ;Results
+ punpckldq mm5, mm6
- movd [rdi], mm5
- movd [rdi+4], mm6
- movd [rdi+8], mm7
+ movq [rcx], mm5
+ movd [rcx+8], mm7
- ; begin epilog
- pop rdi
- pop rsi
- UNSHADOW_ARGS
- pop rbp
- ret
+ STACK_FRAME_DESTROY_X3
;void int vp8_sad4x4x3_sse3(
; unsigned char *src_ptr,
@@ -415,33 +510,23 @@
; int *results)
global sym(vp8_sad4x4x3_sse3)
sym(vp8_sad4x4x3_sse3):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 5
- push rsi
- push rdi
- ; end prolog
- mov rsi, arg(0) ;src_ptr
- mov rdi, arg(2) ;ref_ptr
+ STACK_FRAME_CREATE_X3
- movsxd rax, dword ptr arg(1) ;src_stride
- movsxd rdx, dword ptr arg(3) ;ref_stride
+ movd mm0, DWORD PTR [src_ptr]
+ movd mm1, DWORD PTR [ref_ptr]
- movd mm0, DWORD PTR [rsi]
- movd mm1, DWORD PTR [rdi]
+ movd mm2, DWORD PTR [src_ptr+src_stride]
+ movd mm3, DWORD PTR [ref_ptr+ref_stride]
- movd mm2, DWORD PTR [rsi+rax]
- movd mm3, DWORD PTR [rdi+rdx]
-
punpcklbw mm0, mm2
punpcklbw mm1, mm3
- movd mm4, DWORD PTR [rdi+1]
- movd mm5, DWORD PTR [rdi+2]
+ movd mm4, DWORD PTR [ref_ptr+1]
+ movd mm5, DWORD PTR [ref_ptr+2]
- movd mm2, DWORD PTR [rdi+rdx+1]
- movd mm3, DWORD PTR [rdi+rdx+2]
+ movd mm2, DWORD PTR [ref_ptr+ref_stride+1]
+ movd mm3, DWORD PTR [ref_ptr+ref_stride+2]
psadbw mm1, mm0
@@ -451,29 +536,27 @@
psadbw mm4, mm0
psadbw mm5, mm0
+ lea src_ptr, [src_ptr+src_stride*2]
+ lea ref_ptr, [ref_ptr+ref_stride*2]
+ movd mm0, DWORD PTR [src_ptr]
+ movd mm2, DWORD PTR [ref_ptr]
- lea rsi, [rsi+rax*2]
- lea rdi, [rdi+rdx*2]
+ movd mm3, DWORD PTR [src_ptr+src_stride]
+ movd mm6, DWORD PTR [ref_ptr+ref_stride]
- movd mm0, DWORD PTR [rsi]
- movd mm2, DWORD PTR [rdi]
-
- movd mm3, DWORD PTR [rsi+rax]
- movd mm6, DWORD PTR [rdi+rdx]
-
punpcklbw mm0, mm3
punpcklbw mm2, mm6
- movd mm3, DWORD PTR [rdi+1]
- movd mm7, DWORD PTR [rdi+2]
+ movd mm3, DWORD PTR [ref_ptr+1]
+ movd mm7, DWORD PTR [ref_ptr+2]
psadbw mm2, mm0
paddw mm1, mm2
- movd mm2, DWORD PTR [rdi+rdx+1]
- movd mm6, DWORD PTR [rdi+rdx+2]
+ movd mm2, DWORD PTR [ref_ptr+ref_stride+1]
+ movd mm6, DWORD PTR [ref_ptr+ref_stride+2]
punpcklbw mm3, mm2
punpcklbw mm7, mm6
@@ -484,19 +567,14 @@
paddw mm3, mm4
paddw mm7, mm5
- mov rdi, arg(4) ;Results
- movd [rdi], mm1
+ mov rcx, result_ptr
- movd [rdi+4], mm3
- movd [rdi+8], mm7
+ punpckldq mm1, mm3
+ movq [rcx], mm1
+ movd [rcx+8], mm7
- ; begin epilog
- pop rdi
- pop rsi
- UNSHADOW_ARGS
- pop rbp
- ret
+ STACK_FRAME_DESTROY_X3
;unsigned int vp8_sad16x16_sse3(
; unsigned char *src_ptr,
@@ -507,51 +585,40 @@
;%define lddqu movdqu
global sym(vp8_sad16x16_sse3)
sym(vp8_sad16x16_sse3):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 5
- push rbx
- push rsi
- push rdi
- ; end prolog
- mov rsi, arg(0) ;src_ptr
- mov rdi, arg(2) ;ref_ptr
+ STACK_FRAME_CREATE_X3
- movsxd rbx, dword ptr arg(1) ;src_stride
- movsxd rdx, dword ptr arg(3) ;ref_stride
+ lea end_ptr, [src_ptr+src_stride*8]
- lea rcx, [rsi+rbx*8]
-
- lea rcx, [rcx+rbx*8]
+ lea end_ptr, [end_ptr+src_stride*8]
pxor mm7, mm7
-vp8_sad16x16_sse3_loop:
+.vp8_sad16x16_sse3_loop:
- movq rax, mm7
- cmp rax, arg(4)
- jg vp8_sad16x16_early_exit
+ movq ret_var, mm7
+ cmp ret_var, max_err
+ jg .vp8_sad16x16_early_exit
- movq mm0, QWORD PTR [rsi]
- movq mm2, QWORD PTR [rsi+8]
+ movq mm0, QWORD PTR [src_ptr]
+ movq mm2, QWORD PTR [src_ptr+8]
- movq mm1, QWORD PTR [rdi]
- movq mm3, QWORD PTR [rdi+8]
+ movq mm1, QWORD PTR [ref_ptr]
+ movq mm3, QWORD PTR [ref_ptr+8]
- movq mm4, QWORD PTR [rsi+rbx]
- movq mm5, QWORD PTR [rdi+rdx]
+ movq mm4, QWORD PTR [src_ptr+src_stride]
+ movq mm5, QWORD PTR [ref_ptr+ref_stride]
psadbw mm0, mm1
psadbw mm2, mm3
- movq mm1, QWORD PTR [rsi+rbx+8]
- movq mm3, QWORD PTR [rdi+rdx+8]
+ movq mm1, QWORD PTR [src_ptr+src_stride+8]
+ movq mm3, QWORD PTR [ref_ptr+ref_stride+8]
psadbw mm4, mm5
psadbw mm1, mm3
- lea rsi, [rsi+rbx*2]
- lea rdi, [rdi+rdx*2]
+ lea src_ptr, [src_ptr+src_stride*2]
+ lea ref_ptr, [ref_ptr+ref_stride*2]
paddw mm0, mm2
paddw mm4, mm1
@@ -559,21 +626,17 @@
paddw mm7, mm0
paddw mm7, mm4
- cmp rsi, rcx
- jne vp8_sad16x16_sse3_loop
+ cmp src_ptr, end_ptr
+ jne .vp8_sad16x16_sse3_loop
- movq rax, mm7
+ movq ret_var, mm7
-vp8_sad16x16_early_exit:
+.vp8_sad16x16_early_exit:
- ; begin epilog
- pop rdi
- pop rsi
- pop rbx
- UNSHADOW_ARGS
- pop rbp
- ret
+ mov rax, ret_var
+ STACK_FRAME_DESTROY_X3
+
;void vp8_sad16x16x4d_sse3(
; unsigned char *src_ptr,
; int src_stride,
@@ -582,69 +645,48 @@
; int *results)
global sym(vp8_sad16x16x4d_sse3)
sym(vp8_sad16x16x4d_sse3):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 5
- push rsi
- push rdi
- push rbx
- ; end prolog
- push rbp
- mov rdi, arg(2) ; ref_ptr_base
+ STACK_FRAME_CREATE_X4
- LOAD_X4_ADDRESSES rdi, rcx, rdx, rax, rdi
+ PROCESS_16X2X4 0, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
+ PROCESS_16X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
+ PROCESS_16X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
+ PROCESS_16X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
+ PROCESS_16X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
+ PROCESS_16X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
+ PROCESS_16X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
+ PROCESS_16X2X4 2, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
- mov rsi, arg(0) ;src_ptr
-
- movsxd rbx, dword ptr arg(1) ;src_stride
- movsxd rbp, dword ptr arg(3) ;ref_stride
-
- xchg rbx, rax
-
- PROCESS_16X2X4 1
- PROCESS_16X2X4 0
- PROCESS_16X2X4 0
- PROCESS_16X2X4 0
- PROCESS_16X2X4 0
- PROCESS_16X2X4 0
- PROCESS_16X2X4 0
- PROCESS_16X2X4 0
-
+%if ABI_IS_32BIT
pop rbp
- mov rdi, arg(4) ;Results
+%endif
+ mov rcx, result_ptr
movq xmm0, xmm4
psrldq xmm4, 8
paddw xmm0, xmm4
- movd [rdi], xmm0
+ movd [rcx], xmm0
;-
movq xmm0, xmm5
psrldq xmm5, 8
paddw xmm0, xmm5
- movd [rdi+4], xmm0
+ movd [rcx+4], xmm0
;-
movq xmm0, xmm6
psrldq xmm6, 8
paddw xmm0, xmm6
- movd [rdi+8], xmm0
+ movd [rcx+8], xmm0
;-
movq xmm0, xmm7
psrldq xmm7, 8
paddw xmm0, xmm7
- movd [rdi+12], xmm0
+ movd [rcx+12], xmm0
- ; begin epilog
- pop rbx
- pop rdi
- pop rsi
- UNSHADOW_ARGS
- pop rbp
- ret
+ STACK_FRAME_DESTROY_X4
;void vp8_sad16x8x4d_sse3(
; unsigned char *src_ptr,
@@ -654,65 +696,44 @@
; int *results)
global sym(vp8_sad16x8x4d_sse3)
sym(vp8_sad16x8x4d_sse3):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 5
- push rsi
- push rdi
- push rbx
- ; end prolog
- push rbp
- mov rdi, arg(2) ; ref_ptr_base
+ STACK_FRAME_CREATE_X4
- LOAD_X4_ADDRESSES rdi, rcx, rdx, rax, rdi
+ PROCESS_16X2X4 0, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
+ PROCESS_16X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
+ PROCESS_16X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
+ PROCESS_16X2X4 2, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
- mov rsi, arg(0) ;src_ptr
-
- movsxd rbx, dword ptr arg(1) ;src_stride
- movsxd rbp, dword ptr arg(3) ;ref_stride
-
- xchg rbx, rax
-
- PROCESS_16X2X4 1
- PROCESS_16X2X4 0
- PROCESS_16X2X4 0
- PROCESS_16X2X4 0
-
+%if ABI_IS_32BIT
pop rbp
- mov rdi, arg(4) ;Results
+%endif
+ mov rcx, result_ptr
movq xmm0, xmm4
psrldq xmm4, 8
paddw xmm0, xmm4
- movd [rdi], xmm0
+ movd [rcx], xmm0
;-
movq xmm0, xmm5
psrldq xmm5, 8
paddw xmm0, xmm5
- movd [rdi+4], xmm0
+ movd [rcx+4], xmm0
;-
movq xmm0, xmm6
psrldq xmm6, 8
paddw xmm0, xmm6
- movd [rdi+8], xmm0
+ movd [rcx+8], xmm0
;-
movq xmm0, xmm7
psrldq xmm7, 8
paddw xmm0, xmm7
- movd [rdi+12], xmm0
+ movd [rcx+12], xmm0
- ; begin epilog
- pop rbx
- pop rdi
- pop rsi
- UNSHADOW_ARGS
- pop rbp
- ret
+ STACK_FRAME_DESTROY_X4
;void int vp8_sad8x16x4d_sse3(
; unsigned char *src_ptr,
@@ -722,51 +743,31 @@
; int *results)
global sym(vp8_sad8x16x4d_sse3)
sym(vp8_sad8x16x4d_sse3):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 5
- push rsi
- push rdi
- push rbx
- ; end prolog
- push rbp
- mov rdi, arg(2) ; ref_ptr_base
+ STACK_FRAME_CREATE_X4
- LOAD_X4_ADDRESSES rdi, rcx, rdx, rax, rdi
+ PROCESS_8X2X4 0, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
+ PROCESS_8X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
+ PROCESS_8X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
+ PROCESS_8X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
+ PROCESS_8X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
+ PROCESS_8X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
+ PROCESS_8X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
+ PROCESS_8X2X4 2, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
- mov rsi, arg(0) ;src_ptr
-
- movsxd rbx, dword ptr arg(1) ;src_stride
- movsxd rbp, dword ptr arg(3) ;ref_stride
-
- xchg rbx, rax
-
- PROCESS_8X2X4 1
- PROCESS_8X2X4 0
- PROCESS_8X2X4 0
- PROCESS_8X2X4 0
- PROCESS_8X2X4 0
- PROCESS_8X2X4 0
- PROCESS_8X2X4 0
- PROCESS_8X2X4 0
-
+%if ABI_IS_32BIT
pop rbp
- mov rdi, arg(4) ;Results
+%endif
+ mov rcx, result_ptr
- movd [rdi], mm4
- movd [rdi+4], mm5
- movd [rdi+8], mm6
- movd [rdi+12], mm7
+ punpckldq mm4, mm5
+ punpckldq mm6, mm7
- ; begin epilog
- pop rbx
- pop rdi
- pop rsi
- UNSHADOW_ARGS
- pop rbp
- ret
+ movq [rcx], mm4
+ movq [rcx+8], mm6
+ STACK_FRAME_DESTROY_X4
+
;void int vp8_sad8x8x4d_sse3(
; unsigned char *src_ptr,
; int src_stride,
@@ -775,47 +776,27 @@
; int *results)
global sym(vp8_sad8x8x4d_sse3)
sym(vp8_sad8x8x4d_sse3):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 5
- push rsi
- push rdi
- push rbx
- ; end prolog
- push rbp
- mov rdi, arg(2) ; ref_ptr_base
+ STACK_FRAME_CREATE_X4
- LOAD_X4_ADDRESSES rdi, rcx, rdx, rax, rdi
+ PROCESS_8X2X4 0, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
+ PROCESS_8X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
+ PROCESS_8X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
+ PROCESS_8X2X4 2, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
- mov rsi, arg(0) ;src_ptr
-
- movsxd rbx, dword ptr arg(1) ;src_stride
- movsxd rbp, dword ptr arg(3) ;ref_stride
-
- xchg rbx, rax
-
- PROCESS_8X2X4 1
- PROCESS_8X2X4 0
- PROCESS_8X2X4 0
- PROCESS_8X2X4 0
-
+%if ABI_IS_32BIT
pop rbp
- mov rdi, arg(4) ;Results
+%endif
+ mov rcx, result_ptr
- movd [rdi], mm4
- movd [rdi+4], mm5
- movd [rdi+8], mm6
- movd [rdi+12], mm7
+ punpckldq mm4, mm5
+ punpckldq mm6, mm7
- ; begin epilog
- pop rbx
- pop rdi
- pop rsi
- UNSHADOW_ARGS
- pop rbp
- ret
+ movq [rcx], mm4
+ movq [rcx+8], mm6
+ STACK_FRAME_DESTROY_X4
+
;void int vp8_sad4x4x4d_sse3(
; unsigned char *src_ptr,
; int src_stride,
@@ -824,43 +805,26 @@
; int *results)
global sym(vp8_sad4x4x4d_sse3)
sym(vp8_sad4x4x4d_sse3):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 5
- push rsi
- push rdi
- push rbx
- ; end prolog
- push rbp
- mov rdi, arg(2) ; ref_ptr_base
+ STACK_FRAME_CREATE_X4
- LOAD_X4_ADDRESSES rdi, rcx, rdx, rax, rdi
+ movd mm0, DWORD PTR [src_ptr]
+ movd mm1, DWORD PTR [r0_ptr]
- mov rsi, arg(0) ;src_ptr
+ movd mm2, DWORD PTR [src_ptr+src_stride]
+ movd mm3, DWORD PTR [r0_ptr+ref_stride]
- movsxd rbx, dword ptr arg(1) ;src_stride
- movsxd rbp, dword ptr arg(3) ;ref_stride
-
- xchg rbx, rax
-
- movd mm0, DWORD PTR [rsi]
- movd mm1, DWORD PTR [rcx]
-
- movd mm2, DWORD PTR [rsi+rax]
- movd mm3, DWORD PTR [rcx+rbp]
-
punpcklbw mm0, mm2
punpcklbw mm1, mm3
- movd mm4, DWORD PTR [rdx]
- movd mm5, DWORD PTR [rbx]
+ movd mm4, DWORD PTR [r1_ptr]
+ movd mm5, DWORD PTR [r2_ptr]
- movd mm6, DWORD PTR [rdi]
- movd mm2, DWORD PTR [rdx+rbp]
+ movd mm6, DWORD PTR [r3_ptr]
+ movd mm2, DWORD PTR [r1_ptr+ref_stride]
- movd mm3, DWORD PTR [rbx+rbp]
- movd mm7, DWORD PTR [rdi+rbp]
+ movd mm3, DWORD PTR [r2_ptr+ref_stride]
+ movd mm7, DWORD PTR [r3_ptr+ref_stride]
psadbw mm1, mm0
@@ -875,37 +839,40 @@
- lea rsi, [rsi+rax*2]
- lea rcx, [rcx+rbp*2]
+ lea src_ptr, [src_ptr+src_stride*2]
+ lea r0_ptr, [r0_ptr+ref_stride*2]
- lea rdx, [rdx+rbp*2]
- lea rbx, [rbx+rbp*2]
+ lea r1_ptr, [r1_ptr+ref_stride*2]
+ lea r2_ptr, [r2_ptr+ref_stride*2]
- lea rdi, [rdi+rbp*2]
+ lea r3_ptr, [r3_ptr+ref_stride*2]
- movd mm0, DWORD PTR [rsi]
- movd mm2, DWORD PTR [rcx]
+ movd mm0, DWORD PTR [src_ptr]
+ movd mm2, DWORD PTR [r0_ptr]
- movd mm3, DWORD PTR [rsi+rax]
- movd mm7, DWORD PTR [rcx+rbp]
+ movd mm3, DWORD PTR [src_ptr+src_stride]
+ movd mm7, DWORD PTR [r0_ptr+ref_stride]
punpcklbw mm0, mm3
punpcklbw mm2, mm7
- movd mm3, DWORD PTR [rdx]
- movd mm7, DWORD PTR [rbx]
+ movd mm3, DWORD PTR [r1_ptr]
+ movd mm7, DWORD PTR [r2_ptr]
psadbw mm2, mm0
+%if ABI_IS_32BIT
mov rax, rbp
pop rbp
- mov rsi, arg(4) ;Results
+%define ref_stride rax
+%endif
+ mov rsi, result_ptr
paddw mm1, mm2
movd [rsi], mm1
- movd mm2, DWORD PTR [rdx+rax]
- movd mm1, DWORD PTR [rbx+rax]
+ movd mm2, DWORD PTR [r1_ptr+ref_stride]
+ movd mm1, DWORD PTR [r2_ptr+ref_stride]
punpcklbw mm3, mm2
punpcklbw mm7, mm1
@@ -913,8 +880,8 @@
psadbw mm3, mm0
psadbw mm7, mm0
- movd mm2, DWORD PTR [rdi]
- movd mm1, DWORD PTR [rdi+rax]
+ movd mm2, DWORD PTR [r3_ptr]
+ movd mm1, DWORD PTR [r3_ptr+ref_stride]
paddw mm3, mm4
paddw mm7, mm5
@@ -929,10 +896,4 @@
movd [rsi+12], mm2
- ; begin epilog
- pop rbx
- pop rdi
- pop rsi
- UNSHADOW_ARGS
- pop rbp
- ret
+ STACK_FRAME_DESTROY_X4
--- a/vp8/encoder/x86/x86_csystemdependent.c
+++ b/vp8/encoder/x86/x86_csystemdependent.c
@@ -18,11 +18,10 @@
#if HAVE_MMX
void vp8_short_fdct8x4_mmx(short *input, short *output, int pitch)
{
- vp8_short_fdct4x4_c(input, output, pitch);
- vp8_short_fdct4x4_c(input + 4, output + 16, pitch);
+ vp8_short_fdct4x4_mmx(input, output, pitch);
+ vp8_short_fdct4x4_mmx(input + 4, output + 16, pitch);
}
-
int vp8_fast_quantize_b_impl_mmx(short *coeff_ptr, short *zbin_ptr,
short *qcoeff_ptr, short *dequant_ptr,
short *scan_mask, short *round_ptr,
@@ -82,12 +81,6 @@
#endif
#if HAVE_SSE2
-void vp8_short_fdct8x4_sse2(short *input, short *output, int pitch)
-{
- vp8_short_fdct4x4_sse2(input, output, pitch);
- vp8_short_fdct4x4_sse2(input + 4, output + 16, pitch);
-}
-
int vp8_fast_quantize_b_impl_sse2(short *coeff_ptr,
short *qcoeff_ptr, short *dequant_ptr,
short *scan_mask, short *round_ptr,
@@ -249,18 +242,11 @@
cpi->rtcd.variance.get8x8var = vp8_get8x8var_mmx;
cpi->rtcd.variance.get16x16var = vp8_get16x16var_mmx;
cpi->rtcd.variance.get4x4sse_cs = vp8_get4x4sse_cs_mmx;
-#if 0 // new fdct
+
cpi->rtcd.fdct.short4x4 = vp8_short_fdct4x4_mmx;
cpi->rtcd.fdct.short8x4 = vp8_short_fdct8x4_mmx;
cpi->rtcd.fdct.fast4x4 = vp8_short_fdct4x4_mmx;
cpi->rtcd.fdct.fast8x4 = vp8_short_fdct8x4_mmx;
-#else
- cpi->rtcd.fdct.short4x4 = vp8_short_fdct4x4_c;
- cpi->rtcd.fdct.short8x4 = vp8_short_fdct8x4_c;
- cpi->rtcd.fdct.fast4x4 = vp8_short_fdct4x4_c;
- cpi->rtcd.fdct.fast8x4 = vp8_short_fdct8x4_c;
-
-#endif
cpi->rtcd.fdct.walsh_short4x4 = vp8_short_walsh4x4_c;
--- a/vp8/vp8_cx_iface.c
+++ b/vp8/vp8_cx_iface.c
@@ -881,8 +881,16 @@
{
YV12_BUFFER_CONFIG sd;
+ vp8_ppflags_t flags = {0};
- if (0 == vp8_get_preview_raw_frame(ctx->cpi, &sd, ctx->preview_ppcfg.deblocking_level, ctx->preview_ppcfg.noise_level, ctx->preview_ppcfg.post_proc_flag))
+ if (ctx->preview_ppcfg.post_proc_flag)
+ {
+ flags.post_proc_flag = ctx->preview_ppcfg.post_proc_flag;
+ flags.deblocking_level = ctx->preview_ppcfg.deblocking_level;
+ flags.noise_level = ctx->preview_ppcfg.noise_level;
+ }
+
+ if (0 == vp8_get_preview_raw_frame(ctx->cpi, &sd, &flags))
{
/*
--- a/vp8/vp8_dx_iface.c
+++ b/vp8/vp8_dx_iface.c
@@ -65,12 +65,19 @@
vpx_codec_priv_t base;
vpx_codec_mmap_t mmaps[NELEMENTS(vp8_mem_req_segs)-1];
vpx_codec_dec_cfg_t cfg;
- vp8_stream_info_t si;
+ vp8_stream_info_t si;
int defer_alloc;
int decoder_init;
VP8D_PTR pbi;
int postproc_cfg_set;
vp8_postproc_cfg_t postproc_cfg;
+#if CONFIG_POSTPROC_VISUALIZER
+ unsigned int dbg_postproc_flag;
+ int dbg_color_ref_frame_flag;
+ int dbg_color_mb_modes_flag;
+ int dbg_color_b_modes_flag;
+ int dbg_display_mv_flag;
+#endif
vpx_image_t img;
int img_setup;
int img_avail;
@@ -416,15 +423,27 @@
{
YV12_BUFFER_CONFIG sd;
INT64 time_stamp = 0, time_end_stamp = 0;
- int ppflag = 0;
- int ppdeblocking = 0;
- int ppnoise = 0;
+ vp8_ppflags_t flags = {0};
if (ctx->base.init_flags & VPX_CODEC_USE_POSTPROC)
{
- ppflag = ctx->postproc_cfg.post_proc_flag;
- ppdeblocking = ctx->postproc_cfg.deblocking_level;
- ppnoise = ctx->postproc_cfg.noise_level;
+ flags.post_proc_flag= ctx->postproc_cfg.post_proc_flag
+#if CONFIG_POSTPROC_VISUALIZER
+
+ | ((ctx->dbg_color_ref_frame_flag != 0) ? VP8D_DEBUG_CLR_FRM_REF_BLKS : 0)
+ | ((ctx->dbg_color_mb_modes_flag != 0) ? VP8D_DEBUG_CLR_BLK_MODES : 0)
+ | ((ctx->dbg_color_b_modes_flag != 0) ? VP8D_DEBUG_CLR_BLK_MODES : 0)
+ | ((ctx->dbg_display_mv_flag != 0) ? VP8D_DEBUG_DRAW_MV : 0)
+#endif
+ ;
+ flags.deblocking_level = ctx->postproc_cfg.deblocking_level;
+ flags.noise_level = ctx->postproc_cfg.noise_level;
+#if CONFIG_POSTPROC_VISUALIZER
+ flags.display_ref_frame_flag= ctx->dbg_color_ref_frame_flag;
+ flags.display_mb_modes_flag = ctx->dbg_color_mb_modes_flag;
+ flags.display_b_modes_flag = ctx->dbg_color_b_modes_flag;
+ flags.display_mv_flag = ctx->dbg_display_mv_flag;
+#endif
}
if (vp8dx_receive_compressed_data(ctx->pbi, data_sz, data, deadline))
@@ -433,7 +452,7 @@
res = update_error_state(ctx, &pbi->common.error);
}
- if (!res && 0 == vp8dx_get_raw_frame(ctx->pbi, &sd, &time_stamp, &time_end_stamp, ppdeblocking, ppnoise, ppflag))
+ if (!res && 0 == vp8dx_get_raw_frame(ctx->pbi, &sd, &time_stamp, &time_end_stamp, &flags))
{
/* Align width/height */
unsigned int a_w = (sd.y_width + 15) & ~15;
@@ -646,12 +665,55 @@
#endif
}
+/* Control handler shared by the four VP8_SET_DBG_* decoder controls.
+ *
+ * Reads one int from args and stores it in the ctx debug field selected by
+ * ctrl_id. Returns VPX_CODEC_OK on success, VPX_CODEC_INVALID_PARAM for a
+ * ctrl_id this handler does not serve, and VPX_CODEC_INCAPABLE when the
+ * build lacks CONFIG_POSTPROC_VISUALIZER or CONFIG_POSTPROC.
+ */
+static vpx_codec_err_t vp8_set_dbg_options(vpx_codec_alg_priv_t *ctx,
+                                           int ctrl_id,
+                                           va_list args)
+{
+#if CONFIG_POSTPROC_VISUALIZER && CONFIG_POSTPROC
+    int data = va_arg(args, int);
+
+    switch (ctrl_id)
+    {
+    case VP8_SET_DBG_COLOR_REF_FRAME:
+        ctx->dbg_color_ref_frame_flag = data;
+        break;
+    case VP8_SET_DBG_COLOR_MB_MODES:
+        ctx->dbg_color_mb_modes_flag = data;
+        break;
+    case VP8_SET_DBG_COLOR_B_MODES:
+        ctx->dbg_color_b_modes_flag = data;
+        break;
+    case VP8_SET_DBG_DISPLAY_MV:
+        ctx->dbg_display_mv_flag = data;
+        break;
+    default:
+        /* Not one of the ids this handler is registered for. */
+        return VPX_CODEC_INVALID_PARAM;
+    }
+
+    return VPX_CODEC_OK;
+#else
+    return VPX_CODEC_INCAPABLE;
+#endif
+}
+
vpx_codec_ctrl_fn_map_t vp8_ctf_maps[] =
{
- {VP8_SET_REFERENCE, vp8_set_reference},
- {VP8_COPY_REFERENCE, vp8_get_reference},
- {VP8_SET_POSTPROC, vp8_set_postproc},
+ {VP8_SET_REFERENCE, vp8_set_reference},
+ {VP8_COPY_REFERENCE, vp8_get_reference},
+ {VP8_SET_POSTPROC, vp8_set_postproc},
+ {VP8_SET_DBG_COLOR_REF_FRAME, vp8_set_dbg_options},
+ {VP8_SET_DBG_COLOR_MB_MODES, vp8_set_dbg_options},
+ {VP8_SET_DBG_COLOR_B_MODES, vp8_set_dbg_options},
+ {VP8_SET_DBG_DISPLAY_MV, vp8_set_dbg_options},
{ -1, NULL},
};
--- a/vpx/vp8.h
+++ b/vpx/vp8.h
@@ -38,9 +38,13 @@
*/
enum vp8_dec_control_id
{
- VP8_SET_REFERENCE = 1, /**< pass in an external frame into decoder to be used as reference frame */
- VP8_COPY_REFERENCE = 2, /**< get a copy of reference frame from the decoder */
- VP8_SET_POSTPROC = 3, /**< set decoder's the post processing settings */
+ VP8_SET_REFERENCE = 1, /**< pass an external frame into the decoder to be used as a reference frame */
+ VP8_COPY_REFERENCE = 2, /**< get a copy of a reference frame from the decoder */
+ VP8_SET_POSTPROC = 3, /**< set the decoder's post processing settings */
+ VP8_SET_DBG_COLOR_REF_FRAME = 4, /**< set the reference frames to color for each macro block */
+ VP8_SET_DBG_COLOR_MB_MODES = 5, /**< set which macro block modes to color */
+ VP8_SET_DBG_COLOR_B_MODES = 6, /**< set which block modes to color */
+ VP8_SET_DBG_DISPLAY_MV = 7, /**< set which motion vector modes to draw */
VP8_COMMON_CTRL_ID_MAX
};
@@ -50,10 +54,14 @@
*/
enum vp8_postproc_level
{
- VP8_NOFILTERING = 0,
- VP8_DEBLOCK = 1,
- VP8_DEMACROBLOCK = 2,
- VP8_ADDNOISE = 4
+ VP8_NOFILTERING = 0, /**< no post processing flag bits set */
+ VP8_DEBLOCK = 1<<0, /**< deblocking */
+ VP8_DEMACROBLOCK = 1<<1, /**< demacroblocking */
+ VP8_ADDNOISE = 1<<2, /**< additive noise */
+ VP8_DEBUG_TXT_FRAME_INFO = 1<<3, /**< print frame information */
+ VP8_DEBUG_TXT_MBLK_MODES = 1<<4, /**< print macro block modes over each macro block */
+ VP8_DEBUG_TXT_DC_DIFF = 1<<5, /**< print dc diff for each macro block */
+ VP8_DEBUG_TXT_RATE_INFO = 1<<6 /**< print video rate info (encoder only) */
};
/*!\brief post process flags
@@ -65,9 +73,9 @@
typedef struct vp8_postproc_cfg
{
- int post_proc_flag; /**< the types of post processing to be done, should be combination of "vp8_postproc_level" */
- int deblocking_level; /**< the strength of deblocking, valid range [0, 16] */
- int noise_level; /**< the strength of additive noise, valid range [0, 16] */
+ int post_proc_flag; /**< the types of post processing to be done, should be combination of "vp8_postproc_level" */
+ int deblocking_level; /**< the strength of deblocking, valid range [0, 16] */
+ int noise_level; /**< the strength of additive noise, valid range [0, 16] */
} vp8_postproc_cfg_t;
/*!\brief reference frame type
@@ -95,12 +103,16 @@
/*!\brief vp8 decoder control funciton parameter type
*
- * defines the data type for each of VP8 decoder control funciton requires
+ * defines the data type that each VP8 decoder control function requires
*/
VPX_CTRL_USE_TYPE(VP8_SET_REFERENCE, vpx_ref_frame_t *)
VPX_CTRL_USE_TYPE(VP8_COPY_REFERENCE, vpx_ref_frame_t *)
VPX_CTRL_USE_TYPE(VP8_SET_POSTPROC, vp8_postproc_cfg_t *)
+VPX_CTRL_USE_TYPE(VP8_SET_DBG_COLOR_REF_FRAME, int)
+VPX_CTRL_USE_TYPE(VP8_SET_DBG_COLOR_MB_MODES, int)
+VPX_CTRL_USE_TYPE(VP8_SET_DBG_COLOR_B_MODES, int)
+VPX_CTRL_USE_TYPE(VP8_SET_DBG_DISPLAY_MV, int)
/*! @} - end defgroup vp8 */
--- a/vpxdec.c
+++ b/vpxdec.c
@@ -108,11 +108,19 @@
"Enable VP8 demacroblocking, w/ level");
static const arg_def_t pp_debug_info = ARG_DEF(NULL, "pp-debug-info", 1,
"Enable VP8 visible debug info");
+static const arg_def_t pp_disp_ref_frame = ARG_DEF(NULL, "pp-dbg-ref-frame", 1,
+ "Display only selected reference frame per macro block");
+static const arg_def_t pp_disp_mb_modes = ARG_DEF(NULL, "pp-dbg-mb-modes", 1,
+ "Display only selected macro block modes");
+static const arg_def_t pp_disp_b_modes = ARG_DEF(NULL, "pp-dbg-b-modes", 1,
+ "Display only selected block modes");
+static const arg_def_t pp_disp_mvs = ARG_DEF(NULL, "pp-dbg-mvs", 1,
+ "Draw only selected motion vectors");
-
static const arg_def_t *vp8_pp_args[] =
{
&addnoise_level, &deblock, &demacroblock_level, &pp_debug_info,
+ &pp_disp_ref_frame, &pp_disp_mb_modes, &pp_disp_b_modes, &pp_disp_mvs,
NULL
};
#endif
@@ -705,6 +713,10 @@
vpx_codec_dec_cfg_t cfg = {0};
#if CONFIG_VP8_DECODER
vp8_postproc_cfg_t vp8_pp_cfg = {0};
+ int vp8_dbg_color_ref_frame = 0;
+ int vp8_dbg_color_mb_modes = 0;
+ int vp8_dbg_color_b_modes = 0;
+ int vp8_dbg_display_mv = 0;
#endif
struct input_ctx input = {0};
@@ -790,6 +802,42 @@
if (level)
vp8_pp_cfg.post_proc_flag |= level;
}
+ else if (arg_match(&arg, &pp_disp_ref_frame, argi))
+ {
+ unsigned int flags = arg_parse_int(&arg);
+ if (flags)
+ {
+ postproc = 1;
+ vp8_dbg_color_ref_frame = flags;
+ }
+ }
+ else if (arg_match(&arg, &pp_disp_mb_modes, argi))
+ {
+ unsigned int flags = arg_parse_int(&arg);
+ if (flags)
+ {
+ postproc = 1;
+ vp8_dbg_color_mb_modes = flags;
+ }
+ }
+ else if (arg_match(&arg, &pp_disp_b_modes, argi))
+ {
+ unsigned int flags = arg_parse_int(&arg);
+ if (flags)
+ {
+ postproc = 1;
+ vp8_dbg_color_b_modes = flags;
+ }
+ }
+ else if (arg_match(&arg, &pp_disp_mvs, argi))
+ {
+ unsigned int flags = arg_parse_int(&arg);
+ if (flags)
+ {
+ postproc = 1;
+ vp8_dbg_display_mv = flags;
+ }
+ }
#endif
else
@@ -929,6 +977,33 @@
return EXIT_FAILURE;
}
+ if (vp8_dbg_color_ref_frame
+ && vpx_codec_control(&decoder, VP8_SET_DBG_COLOR_REF_FRAME, vp8_dbg_color_ref_frame))
+ {
+ fprintf(stderr, "Failed to configure reference block visualizer: %s\n", vpx_codec_error(&decoder));
+ return EXIT_FAILURE;
+ }
+
+ if (vp8_dbg_color_mb_modes
+ && vpx_codec_control(&decoder, VP8_SET_DBG_COLOR_MB_MODES, vp8_dbg_color_mb_modes))
+ {
+ fprintf(stderr, "Failed to configure macro block visualizer: %s\n", vpx_codec_error(&decoder));
+ return EXIT_FAILURE;
+ }
+
+ if (vp8_dbg_color_b_modes
+ && vpx_codec_control(&decoder, VP8_SET_DBG_COLOR_B_MODES, vp8_dbg_color_b_modes))
+ {
+ fprintf(stderr, "Failed to configure block visualizer: %s\n", vpx_codec_error(&decoder));
+ return EXIT_FAILURE;
+ }
+
+ if (vp8_dbg_display_mv
+ && vpx_codec_control(&decoder, VP8_SET_DBG_DISPLAY_MV, vp8_dbg_display_mv))
+ {
+ fprintf(stderr, "Failed to configure motion vector visualizer: %s\n", vpx_codec_error(&decoder));
+ return EXIT_FAILURE;
+ }
#endif
/* Decode file */