ref: f1c56a8c8cb0959734a14446c60c132de676388a
parent: 879cb7d96259a71eea0038452a00241650589084
parent: 8d568312a2e6882a336eb3525fbe6b9e752163f3
author: Yunqing Wang <yunqingwang@google.com>
date: Tue Jan 8 07:59:08 EST 2013
Merge "vp9_sub_pixel_variance16x2 SSE2 optimization" into experimental
--- a/vp9/common/vp9_findnearmv.c
+++ b/vp9/common/vp9_findnearmv.c
@@ -185,33 +185,33 @@
offset = ref_y_stride * row_offset + col_offset;
score = 0;
if (xd->up_available) {
- vp9_sub_pixel_variance16x2_c(above_ref + offset, ref_y_stride,
- SP(this_mv.as_mv.col),
- SP(this_mv.as_mv.row),
- above_src, xd->dst.y_stride, &sse);
+ vp9_sub_pixel_variance16x2(above_ref + offset, ref_y_stride,
+ SP(this_mv.as_mv.col),
+ SP(this_mv.as_mv.row),
+ above_src, xd->dst.y_stride, &sse);
score += sse;
#if CONFIG_SUPERBLOCKS
if (xd->mode_info_context->mbmi.sb_type >= BLOCK_SIZE_SB32X32) {
- vp9_sub_pixel_variance16x2_c(above_ref + offset + 16,
- ref_y_stride,
- SP(this_mv.as_mv.col),
- SP(this_mv.as_mv.row),
- above_src + 16, xd->dst.y_stride, &sse);
+ vp9_sub_pixel_variance16x2(above_ref + offset + 16,
+ ref_y_stride,
+ SP(this_mv.as_mv.col),
+ SP(this_mv.as_mv.row),
+ above_src + 16, xd->dst.y_stride, &sse);
score += sse;
}
#if CONFIG_SUPERBLOCKS64
if (xd->mode_info_context->mbmi.sb_type >= BLOCK_SIZE_SB64X64) {
- vp9_sub_pixel_variance16x2_c(above_ref + offset + 32,
- ref_y_stride,
- SP(this_mv.as_mv.col),
- SP(this_mv.as_mv.row),
- above_src + 32, xd->dst.y_stride, &sse);
+ vp9_sub_pixel_variance16x2(above_ref + offset + 32,
+ ref_y_stride,
+ SP(this_mv.as_mv.col),
+ SP(this_mv.as_mv.row),
+ above_src + 32, xd->dst.y_stride, &sse);
score += sse;
- vp9_sub_pixel_variance16x2_c(above_ref + offset + 48,
- ref_y_stride,
- SP(this_mv.as_mv.col),
- SP(this_mv.as_mv.row),
- above_src + 48, xd->dst.y_stride, &sse);
+ vp9_sub_pixel_variance16x2(above_ref + offset + 48,
+ ref_y_stride,
+ SP(this_mv.as_mv.col),
+ SP(this_mv.as_mv.row),
+ above_src + 48, xd->dst.y_stride, &sse);
score += sse;
}
#endif
--- a/vp9/common/vp9_rtcd_defs.sh
+++ b/vp9/common/vp9_rtcd_defs.sh
@@ -254,6 +254,11 @@
prototype unsigned int vp9_sad3x16 "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"
specialize vp9_sad3x16 sse2
+if [ "$CONFIG_SUBPELREFMV" = "yes" ]; then
+prototype unsigned int vp9_sub_pixel_variance16x2 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int Refstride, unsigned int *sse"
+specialize vp9_sub_pixel_variance16x2 sse2
+fi
+
#
# Sub Pixel Filters
#
--- /dev/null
+++ b/vp9/common/x86/vp9_subpixel_variance_sse2.c
@@ -1,0 +1,90 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#define HALFNDX 8  /* xoffset/yoffset value at which vp9_sub_pixel_variance16x2_sse2 dispatches to the vp9_half_* helpers */
+
+void vp9_half_horiz_variance16x_h_sse2(const unsigned char *ref_ptr,  /* prototype only: no definition in this file (presumably SSE2 assembly - TODO confirm) */
+ int ref_pixels_per_line,
+ const unsigned char *src_ptr,  /* NOTE(review): vp9_sub_pixel_variance16x2_sse2 passes its src_ptr as ref_ptr and its dst_ptr here */
+ int src_pixels_per_line,
+ unsigned int Height,  /* the 16x2 caller in this file passes 2 */
+ int *sum,  /* out: the caller squares it and shifts right by 5 when forming its return value */
+ unsigned int *sumsquared);  /* out: the caller stores it to *sse; selected when xoffset == HALFNDX and yoffset == 0 */
+
+void vp9_half_vert_variance16x_h_sse2(const unsigned char *ref_ptr,  /* prototype only: no definition in this file (presumably SSE2 assembly - TODO confirm) */
+ int ref_pixels_per_line,
+ const unsigned char *src_ptr,  /* NOTE(review): vp9_sub_pixel_variance16x2_sse2 passes its src_ptr as ref_ptr and its dst_ptr here */
+ int src_pixels_per_line,
+ unsigned int Height,  /* the 16x2 caller in this file passes 2 */
+ int *sum,  /* out: the caller squares it and shifts right by 5 when forming its return value */
+ unsigned int *sumsquared);  /* out: the caller stores it to *sse; selected when xoffset == 0 and yoffset == HALFNDX */
+
+void vp9_half_horiz_vert_variance16x_h_sse2(const unsigned char *ref_ptr,  /* prototype only: no definition in this file (presumably SSE2 assembly - TODO confirm) */
+ int ref_pixels_per_line,
+ const unsigned char *src_ptr,  /* NOTE(review): vp9_sub_pixel_variance16x2_sse2 passes its src_ptr as ref_ptr and its dst_ptr here */
+ int src_pixels_per_line,
+ unsigned int Height,  /* the 16x2 caller in this file passes 2 */
+ int *sum,  /* out: the caller squares it and shifts right by 5 when forming its return value */
+ unsigned int *sumsquared);  /* out: the caller stores it to *sse; selected when xoffset and yoffset both equal HALFNDX */
+
+void vp9_filter_block2d_bil_var_sse2(const unsigned char *ref_ptr,  /* prototype only: no definition in this file (presumably SSE2 assembly - TODO confirm) */
+ int ref_pixels_per_line,
+ const unsigned char *src_ptr,  /* NOTE(review): vp9_sub_pixel_variance16x2_sse2 passes its src_ptr as ref_ptr and its dst_ptr here */
+ int src_pixels_per_line,
+ unsigned int Height,  /* the 16x2 caller in this file passes 2 */
+ int xoffset,  /* forwarded unchanged from the caller's xoffset */
+ int yoffset,  /* forwarded unchanged from the caller's yoffset */
+ int *sum,  /* out: the caller invokes this twice (pointers advanced by 8 the second time) and adds the two values */
+ unsigned int *sumsquared);  /* out: likewise added across the two calls before being stored to *sse */
+
+/* 16x2 sub-pixel variance via the SSE2 helpers: *sse receives their sum of
+ * squares; the return value is that minus (sum * sum) >> 5 (16 * 2 = 32). */
+unsigned int vp9_sub_pixel_variance16x2_sse2(const unsigned char *src_ptr,
+                                             int src_pixels_per_line,
+                                             int xoffset,
+                                             int yoffset,
+                                             const unsigned char *dst_ptr,
+                                             int dst_pixels_per_line,
+                                             unsigned int *sse) {
+  const int x_is_half = (xoffset == HALFNDX);
+  const int y_is_half = (yoffset == HALFNDX);
+  int total;
+  unsigned int total_sq;
+
+  if (x_is_half && y_is_half) {
+    vp9_half_horiz_vert_variance16x_h_sse2(src_ptr, src_pixels_per_line,
+                                           dst_ptr, dst_pixels_per_line, 2,
+                                           &total, &total_sq);
+  } else if (x_is_half && yoffset == 0) {
+    vp9_half_horiz_variance16x_h_sse2(src_ptr, src_pixels_per_line,
+                                      dst_ptr, dst_pixels_per_line, 2,
+                                      &total, &total_sq);
+  } else if (xoffset == 0 && y_is_half) {
+    vp9_half_vert_variance16x_h_sse2(src_ptr, src_pixels_per_line,
+                                     dst_ptr, dst_pixels_per_line, 2,
+                                     &total, &total_sq);
+  } else {
+    /* Generic bilinear path: run at column offsets 0 and 8, then merged. */
+    int right;
+    unsigned int right_sq;
+
+    vp9_filter_block2d_bil_var_sse2(src_ptr, src_pixels_per_line,
+                                    dst_ptr, dst_pixels_per_line, 2,
+                                    xoffset, yoffset, &total, &total_sq);
+    vp9_filter_block2d_bil_var_sse2(src_ptr + 8, src_pixels_per_line,
+                                    dst_ptr + 8, dst_pixels_per_line, 2,
+                                    xoffset, yoffset, &right, &right_sq);
+    total += right;
+    total_sq += right_sq;
+  }
+
+  *sse = total_sq;
+  return total_sq - (((unsigned int)total * total) >> 5);
+}
--- a/vp9/vp9_common.mk
+++ b/vp9/vp9_common.mk
@@ -96,6 +96,9 @@
VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_subpixel_sse2.asm
VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_loopfilter_sse2.asm
VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_iwalsh_sse2.asm
+ifeq ($(CONFIG_SUBPELREFMV),yes)
+VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_subpixel_variance_sse2.c
+endif
VP9_COMMON_SRCS-$(HAVE_SSSE3) += common/x86/vp9_subpixel_8t_ssse3.asm
VP9_COMMON_SRCS-$(HAVE_SSSE3) += common/x86/vp9_subpixel_ssse3.asm
ifeq ($(CONFIG_POSTPROC),yes)
--
⑨