shithub: libvpx

Download patch

ref: f1c56a8c8cb0959734a14446c60c132de676388a
parent: 879cb7d96259a71eea0038452a00241650589084
parent: 8d568312a2e6882a336eb3525fbe6b9e752163f3
author: Yunqing Wang <yunqingwang@google.com>
date: Tue Jan 8 07:59:08 EST 2013

Merge "vp9_sub_pixel_variance16x2 SSE2 optimization" into experimental

--- a/vp9/common/vp9_findnearmv.c
+++ b/vp9/common/vp9_findnearmv.c
@@ -185,33 +185,33 @@
     offset = ref_y_stride * row_offset + col_offset;
     score = 0;
     if (xd->up_available) {
-      vp9_sub_pixel_variance16x2_c(above_ref + offset, ref_y_stride,
-                                   SP(this_mv.as_mv.col),
-                                   SP(this_mv.as_mv.row),
-                                   above_src, xd->dst.y_stride, &sse);
+      vp9_sub_pixel_variance16x2(above_ref + offset, ref_y_stride,
+                                 SP(this_mv.as_mv.col),
+                                 SP(this_mv.as_mv.row),
+                                 above_src, xd->dst.y_stride, &sse);
       score += sse;
 #if CONFIG_SUPERBLOCKS
       if (xd->mode_info_context->mbmi.sb_type >= BLOCK_SIZE_SB32X32) {
-        vp9_sub_pixel_variance16x2_c(above_ref + offset + 16,
-                                     ref_y_stride,
-                                     SP(this_mv.as_mv.col),
-                                     SP(this_mv.as_mv.row),
-                                     above_src + 16, xd->dst.y_stride, &sse);
+        vp9_sub_pixel_variance16x2(above_ref + offset + 16,
+                                   ref_y_stride,
+                                   SP(this_mv.as_mv.col),
+                                   SP(this_mv.as_mv.row),
+                                   above_src + 16, xd->dst.y_stride, &sse);
         score += sse;
       }
 #if CONFIG_SUPERBLOCKS64
       if (xd->mode_info_context->mbmi.sb_type >= BLOCK_SIZE_SB64X64) {
-        vp9_sub_pixel_variance16x2_c(above_ref + offset + 32,
-                                     ref_y_stride,
-                                     SP(this_mv.as_mv.col),
-                                     SP(this_mv.as_mv.row),
-                                     above_src + 32, xd->dst.y_stride, &sse);
+        vp9_sub_pixel_variance16x2(above_ref + offset + 32,
+                                   ref_y_stride,
+                                   SP(this_mv.as_mv.col),
+                                   SP(this_mv.as_mv.row),
+                                   above_src + 32, xd->dst.y_stride, &sse);
         score += sse;
-        vp9_sub_pixel_variance16x2_c(above_ref + offset + 48,
-                                     ref_y_stride,
-                                     SP(this_mv.as_mv.col),
-                                     SP(this_mv.as_mv.row),
-                                     above_src + 48, xd->dst.y_stride, &sse);
+        vp9_sub_pixel_variance16x2(above_ref + offset + 48,
+                                   ref_y_stride,
+                                   SP(this_mv.as_mv.col),
+                                   SP(this_mv.as_mv.row),
+                                   above_src + 48, xd->dst.y_stride, &sse);
         score += sse;
       }
 #endif
--- a/vp9/common/vp9_rtcd_defs.sh
+++ b/vp9/common/vp9_rtcd_defs.sh
@@ -254,6 +254,11 @@
 prototype unsigned int vp9_sad3x16 "const uint8_t *src_ptr, int  src_stride, const uint8_t *ref_ptr, int ref_stride"
 specialize vp9_sad3x16 sse2
 
+if [ "$CONFIG_SUBPELREFMV" = "yes" ]; then
+prototype unsigned int vp9_sub_pixel_variance16x2 "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int Refstride, unsigned int *sse"
+specialize vp9_sub_pixel_variance16x2 sse2
+fi
+
 #
 # Sub Pixel Filters
 #
--- /dev/null
+++ b/vp9/common/x86/vp9_subpixel_variance_sse2.c
@@ -1,0 +1,90 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#define HALFNDX 8  /* offset value that selects the vp9_half_* helpers below */
+/* Helpers declared here but defined outside this file (not visible here). */
+void vp9_half_horiz_variance16x_h_sse2(const unsigned char *ref_ptr,
+                                       int ref_pixels_per_line,
+                                       const unsigned char *src_ptr,
+                                       int src_pixels_per_line,
+                                       unsigned int Height,
+                                       int *sum,
+                                       unsigned int *sumsquared);
+
+void vp9_half_vert_variance16x_h_sse2(const unsigned char *ref_ptr,
+                                      int ref_pixels_per_line,
+                                      const unsigned char *src_ptr,
+                                      int src_pixels_per_line,
+                                      unsigned int Height,
+                                      int *sum,
+                                      unsigned int *sumsquared);
+
+void vp9_half_horiz_vert_variance16x_h_sse2(const unsigned char *ref_ptr,
+                                            int ref_pixels_per_line,
+                                            const unsigned char *src_ptr,
+                                            int src_pixels_per_line,
+                                            unsigned int Height,
+                                            int *sum,
+                                            unsigned int *sumsquared);
+
+void vp9_filter_block2d_bil_var_sse2(const unsigned char *ref_ptr,
+                                     int ref_pixels_per_line,
+                                     const unsigned char *src_ptr,
+                                     int src_pixels_per_line,
+                                     unsigned int Height,
+                                     int  xoffset,
+                                     int  yoffset,
+                                     int *sum,
+                                     unsigned int *sumsquared);
+/* Stores the helpers' sum of squares in *sse; returns it minus sum*sum/32. */
+unsigned int vp9_sub_pixel_variance16x2_sse2(const unsigned char  *src_ptr,
+                                             int  src_pixels_per_line,
+                                             int  xoffset,
+                                             int  yoffset,
+                                             const unsigned char *dst_ptr,
+                                             int dst_pixels_per_line,
+                                             unsigned int *sse) {
+  int xsum0, xsum1;
+  unsigned int xxsum0, xxsum1;
+  /* Offsets equal to HALFNDX/0 use the dedicated helpers, called once. */
+  if (xoffset == HALFNDX && yoffset == 0) {
+    vp9_half_horiz_variance16x_h_sse2(
+      src_ptr, src_pixels_per_line,
+      dst_ptr, dst_pixels_per_line, 2,
+      &xsum0, &xxsum0);
+  } else if (xoffset == 0 && yoffset == HALFNDX) {
+    vp9_half_vert_variance16x_h_sse2(
+      src_ptr, src_pixels_per_line,
+      dst_ptr, dst_pixels_per_line, 2,
+      &xsum0, &xxsum0);
+  } else if (xoffset == HALFNDX && yoffset == HALFNDX) {
+    vp9_half_horiz_vert_variance16x_h_sse2(
+      src_ptr, src_pixels_per_line,
+      dst_ptr, dst_pixels_per_line, 2,
+      &xsum0, &xxsum0);
+  } else {
+    vp9_filter_block2d_bil_var_sse2(
+      src_ptr, src_pixels_per_line,
+      dst_ptr, dst_pixels_per_line, 2,
+      xoffset, yoffset,
+      &xsum0, &xxsum0);
+    /* Again at +8 bytes; presumably the helper is 8 columns wide (confirm). */
+    vp9_filter_block2d_bil_var_sse2(
+      src_ptr + 8, src_pixels_per_line,
+      dst_ptr + 8, dst_pixels_per_line, 2,
+      xoffset, yoffset,
+      &xsum1, &xxsum1);
+    xsum0 += xsum1;
+    xxsum0 += xxsum1;
+  }
+  /* >> 5 divides by 32, i.e. 16 * 2 as in the function name. */
+  *sse = xxsum0;
+  return (xxsum0 - (((unsigned int)xsum0 * xsum0) >> 5));
+}
--- a/vp9/vp9_common.mk
+++ b/vp9/vp9_common.mk
@@ -96,6 +96,9 @@
 VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_subpixel_sse2.asm
 VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_loopfilter_sse2.asm
 VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_iwalsh_sse2.asm
+ifeq ($(CONFIG_SUBPELREFMV),yes)
+VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_subpixel_variance_sse2.c
+endif
 VP9_COMMON_SRCS-$(HAVE_SSSE3) += common/x86/vp9_subpixel_8t_ssse3.asm
 VP9_COMMON_SRCS-$(HAVE_SSSE3) += common/x86/vp9_subpixel_ssse3.asm
 ifeq ($(CONFIG_POSTPROC),yes)