shithub: libvpx

Download patch

ref: d393885af195a58e37e6966352708b3b04892c90
parent: c325fb748a6c395ccca392e1fb5dadefb32a1cec
author: Johann <johannkoenig@google.com>
date: Tue Aug 23 12:58:07 EDT 2016

Remove halfpix specialization

These functions only exist as shortcuts to subpixel variance with
predefined offsets: xoffset = 4 for horizontal, yoffset = 4 for vertical,
and both for "hv".

Removing this allows the existing optimizations for the variance
functions to be called. Instead of having only sse2 optimizations, this
gives sse2, ssse3, msa and neon.

BUG=webm:1273

Change-Id: Ieb407b423b91b87d33c4263c6a1ad5e673b0efd6

--- a/vp8/encoder/mcomp.c
+++ b/vp8/encoder/mcomp.c
@@ -409,7 +409,8 @@
   /* go left then right and check error */
   this_mv.as_mv.row = startmv.as_mv.row;
   this_mv.as_mv.col = ((startmv.as_mv.col - 8) | 4);
-  thismse = vfp->svf_halfpix_h(y - 1, y_stride, z, b->src_stride, &sse);
+  /* "halfpix" horizontal variance */
+  thismse = vfp->svf(y - 1, y_stride, 4, 0, z, b->src_stride, &sse);
   left = thismse + mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
 
   if (left < bestmse) {
@@ -420,7 +421,8 @@
   }
 
   this_mv.as_mv.col += 8;
-  thismse = vfp->svf_halfpix_h(y, y_stride, z, b->src_stride, &sse);
+  /* "halfpix" horizontal variance */
+  thismse = vfp->svf(y, y_stride, 4, 0, z, b->src_stride, &sse);
   right = thismse + mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
 
   if (right < bestmse) {
@@ -433,7 +435,8 @@
   /* go up then down and check error */
   this_mv.as_mv.col = startmv.as_mv.col;
   this_mv.as_mv.row = ((startmv.as_mv.row - 8) | 4);
-  thismse = vfp->svf_halfpix_v(y - y_stride, y_stride, z, b->src_stride, &sse);
+  /* "halfpix" vertical variance */
+  thismse = vfp->svf(y - y_stride, y_stride, 0, 4, z, b->src_stride, &sse);
   up = thismse + mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
 
   if (up < bestmse) {
@@ -444,7 +447,8 @@
   }
 
   this_mv.as_mv.row += 8;
-  thismse = vfp->svf_halfpix_v(y, y_stride, z, b->src_stride, &sse);
+  /* "halfpix" vertical variance */
+  thismse = vfp->svf(y, y_stride, 0, 4, z, b->src_stride, &sse);
   down = thismse + mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
 
   if (down < bestmse) {
@@ -462,25 +466,28 @@
     case 0:
       this_mv.as_mv.col = (this_mv.as_mv.col - 8) | 4;
       this_mv.as_mv.row = (this_mv.as_mv.row - 8) | 4;
-      thismse = vfp->svf_halfpix_hv(y - 1 - y_stride, y_stride, z,
-                                    b->src_stride, &sse);
+      /* "halfpix" horizontal/vertical variance */
+      thismse =
+          vfp->svf(y - 1 - y_stride, y_stride, 4, 4, z, b->src_stride, &sse);
       break;
     case 1:
       this_mv.as_mv.col += 4;
       this_mv.as_mv.row = (this_mv.as_mv.row - 8) | 4;
-      thismse =
-          vfp->svf_halfpix_hv(y - y_stride, y_stride, z, b->src_stride, &sse);
+      /* "halfpix" horizontal/vertical variance */
+      thismse = vfp->svf(y - y_stride, y_stride, 4, 4, z, b->src_stride, &sse);
       break;
     case 2:
       this_mv.as_mv.col = (this_mv.as_mv.col - 8) | 4;
       this_mv.as_mv.row += 4;
-      thismse = vfp->svf_halfpix_hv(y - 1, y_stride, z, b->src_stride, &sse);
+      /* "halfpix" horizontal/vertical variance */
+      thismse = vfp->svf(y - 1, y_stride, 4, 4, z, b->src_stride, &sse);
       break;
     case 3:
     default:
       this_mv.as_mv.col += 4;
       this_mv.as_mv.row += 4;
-      thismse = vfp->svf_halfpix_hv(y, y_stride, z, b->src_stride, &sse);
+      /* "halfpix" horizontal/vertical variance */
+      thismse = vfp->svf(y, y_stride, 4, 4, z, b->src_stride, &sse);
       break;
   }
 
@@ -698,7 +705,8 @@
   /* go left then right and check error */
   this_mv.as_mv.row = startmv.as_mv.row;
   this_mv.as_mv.col = ((startmv.as_mv.col - 8) | 4);
-  thismse = vfp->svf_halfpix_h(y - 1, y_stride, z, b->src_stride, &sse);
+  /* "halfpix" horizontal variance */
+  thismse = vfp->svf(y - 1, y_stride, 4, 0, z, b->src_stride, &sse);
   left = thismse + mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
 
   if (left < bestmse) {
@@ -709,7 +717,8 @@
   }
 
   this_mv.as_mv.col += 8;
-  thismse = vfp->svf_halfpix_h(y, y_stride, z, b->src_stride, &sse);
+  /* "halfpix" horizontal variance */
+  thismse = vfp->svf(y, y_stride, 4, 0, z, b->src_stride, &sse);
   right = thismse + mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
 
   if (right < bestmse) {
@@ -722,7 +731,8 @@
   /* go up then down and check error */
   this_mv.as_mv.col = startmv.as_mv.col;
   this_mv.as_mv.row = ((startmv.as_mv.row - 8) | 4);
-  thismse = vfp->svf_halfpix_v(y - y_stride, y_stride, z, b->src_stride, &sse);
+  /* "halfpix" vertical variance */
+  thismse = vfp->svf(y - y_stride, y_stride, 0, 4, z, b->src_stride, &sse);
   up = thismse + mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
 
   if (up < bestmse) {
@@ -733,7 +743,8 @@
   }
 
   this_mv.as_mv.row += 8;
-  thismse = vfp->svf_halfpix_v(y, y_stride, z, b->src_stride, &sse);
+  /* "halfpix" vertical variance */
+  thismse = vfp->svf(y, y_stride, 0, 4, z, b->src_stride, &sse);
   down = thismse + mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
 
   if (down < bestmse) {
@@ -751,25 +762,28 @@
     case 0:
       this_mv.as_mv.col = (this_mv.as_mv.col - 8) | 4;
       this_mv.as_mv.row = (this_mv.as_mv.row - 8) | 4;
-      thismse = vfp->svf_halfpix_hv(y - 1 - y_stride, y_stride, z,
-                                    b->src_stride, &sse);
+      /* "halfpix" horizontal/vertical variance */
+      thismse =
+          vfp->svf(y - 1 - y_stride, y_stride, 4, 4, z, b->src_stride, &sse);
       break;
     case 1:
       this_mv.as_mv.col += 4;
       this_mv.as_mv.row = (this_mv.as_mv.row - 8) | 4;
-      thismse =
-          vfp->svf_halfpix_hv(y - y_stride, y_stride, z, b->src_stride, &sse);
+      /* "halfpix" horizontal/vertical variance */
+      thismse = vfp->svf(y - y_stride, y_stride, 4, 4, z, b->src_stride, &sse);
       break;
     case 2:
       this_mv.as_mv.col = (this_mv.as_mv.col - 8) | 4;
       this_mv.as_mv.row += 4;
-      thismse = vfp->svf_halfpix_hv(y - 1, y_stride, z, b->src_stride, &sse);
+      /* "halfpix" horizontal/vertical variance */
+      thismse = vfp->svf(y - 1, y_stride, 4, 4, z, b->src_stride, &sse);
       break;
     case 3:
     default:
       this_mv.as_mv.col += 4;
       this_mv.as_mv.row += 4;
-      thismse = vfp->svf_halfpix_hv(y, y_stride, z, b->src_stride, &sse);
+      /* "halfpix" horizontal/vertical variance */
+      thismse = vfp->svf(y, y_stride, 4, 4, z, b->src_stride, &sse);
       break;
   }
 
--- a/vp8/encoder/onyx_if.c
+++ b/vp8/encoder/onyx_if.c
@@ -1914,9 +1914,6 @@
   cpi->fn_ptr[BLOCK_16X16].sdf = vpx_sad16x16;
   cpi->fn_ptr[BLOCK_16X16].vf = vpx_variance16x16;
   cpi->fn_ptr[BLOCK_16X16].svf = vpx_sub_pixel_variance16x16;
-  cpi->fn_ptr[BLOCK_16X16].svf_halfpix_h = vpx_variance_halfpixvar16x16_h;
-  cpi->fn_ptr[BLOCK_16X16].svf_halfpix_v = vpx_variance_halfpixvar16x16_v;
-  cpi->fn_ptr[BLOCK_16X16].svf_halfpix_hv = vpx_variance_halfpixvar16x16_hv;
   cpi->fn_ptr[BLOCK_16X16].sdx3f = vpx_sad16x16x3;
   cpi->fn_ptr[BLOCK_16X16].sdx8f = vpx_sad16x16x8;
   cpi->fn_ptr[BLOCK_16X16].sdx4df = vpx_sad16x16x4d;
@@ -1924,9 +1921,6 @@
   cpi->fn_ptr[BLOCK_16X8].sdf = vpx_sad16x8;
   cpi->fn_ptr[BLOCK_16X8].vf = vpx_variance16x8;
   cpi->fn_ptr[BLOCK_16X8].svf = vpx_sub_pixel_variance16x8;
-  cpi->fn_ptr[BLOCK_16X8].svf_halfpix_h = NULL;
-  cpi->fn_ptr[BLOCK_16X8].svf_halfpix_v = NULL;
-  cpi->fn_ptr[BLOCK_16X8].svf_halfpix_hv = NULL;
   cpi->fn_ptr[BLOCK_16X8].sdx3f = vpx_sad16x8x3;
   cpi->fn_ptr[BLOCK_16X8].sdx8f = vpx_sad16x8x8;
   cpi->fn_ptr[BLOCK_16X8].sdx4df = vpx_sad16x8x4d;
@@ -1934,9 +1928,6 @@
   cpi->fn_ptr[BLOCK_8X16].sdf = vpx_sad8x16;
   cpi->fn_ptr[BLOCK_8X16].vf = vpx_variance8x16;
   cpi->fn_ptr[BLOCK_8X16].svf = vpx_sub_pixel_variance8x16;
-  cpi->fn_ptr[BLOCK_8X16].svf_halfpix_h = NULL;
-  cpi->fn_ptr[BLOCK_8X16].svf_halfpix_v = NULL;
-  cpi->fn_ptr[BLOCK_8X16].svf_halfpix_hv = NULL;
   cpi->fn_ptr[BLOCK_8X16].sdx3f = vpx_sad8x16x3;
   cpi->fn_ptr[BLOCK_8X16].sdx8f = vpx_sad8x16x8;
   cpi->fn_ptr[BLOCK_8X16].sdx4df = vpx_sad8x16x4d;
@@ -1944,9 +1935,6 @@
   cpi->fn_ptr[BLOCK_8X8].sdf = vpx_sad8x8;
   cpi->fn_ptr[BLOCK_8X8].vf = vpx_variance8x8;
   cpi->fn_ptr[BLOCK_8X8].svf = vpx_sub_pixel_variance8x8;
-  cpi->fn_ptr[BLOCK_8X8].svf_halfpix_h = NULL;
-  cpi->fn_ptr[BLOCK_8X8].svf_halfpix_v = NULL;
-  cpi->fn_ptr[BLOCK_8X8].svf_halfpix_hv = NULL;
   cpi->fn_ptr[BLOCK_8X8].sdx3f = vpx_sad8x8x3;
   cpi->fn_ptr[BLOCK_8X8].sdx8f = vpx_sad8x8x8;
   cpi->fn_ptr[BLOCK_8X8].sdx4df = vpx_sad8x8x4d;
@@ -1954,9 +1942,6 @@
   cpi->fn_ptr[BLOCK_4X4].sdf = vpx_sad4x4;
   cpi->fn_ptr[BLOCK_4X4].vf = vpx_variance4x4;
   cpi->fn_ptr[BLOCK_4X4].svf = vpx_sub_pixel_variance4x4;
-  cpi->fn_ptr[BLOCK_4X4].svf_halfpix_h = NULL;
-  cpi->fn_ptr[BLOCK_4X4].svf_halfpix_v = NULL;
-  cpi->fn_ptr[BLOCK_4X4].svf_halfpix_hv = NULL;
   cpi->fn_ptr[BLOCK_4X4].sdx3f = vpx_sad4x4x3;
   cpi->fn_ptr[BLOCK_4X4].sdx8f = vpx_sad4x4x8;
   cpi->fn_ptr[BLOCK_4X4].sdx4df = vpx_sad4x4x4d;
--- a/vpx_dsp/variance.c
+++ b/vpx_dsp/variance.c
@@ -49,24 +49,6 @@
   return sum;
 }
 
-uint32_t vpx_variance_halfpixvar16x16_h_c(const uint8_t *a, int a_stride,
-                                          const uint8_t *b, int b_stride,
-                                          uint32_t *sse) {
-  return vpx_sub_pixel_variance16x16_c(a, a_stride, 4, 0, b, b_stride, sse);
-}
-
-uint32_t vpx_variance_halfpixvar16x16_v_c(const uint8_t *a, int a_stride,
-                                          const uint8_t *b, int b_stride,
-                                          uint32_t *sse) {
-  return vpx_sub_pixel_variance16x16_c(a, a_stride, 0, 4, b, b_stride, sse);
-}
-
-uint32_t vpx_variance_halfpixvar16x16_hv_c(const uint8_t *a, int a_stride,
-                                           const uint8_t *b, int b_stride,
-                                           uint32_t *sse) {
-  return vpx_sub_pixel_variance16x16_c(a, a_stride, 4, 4, b, b_stride, sse);
-}
-
 static void variance(const uint8_t *a, int a_stride, const uint8_t *b,
                      int b_stride, int w, int h, uint32_t *sse, int *sum) {
   int i, j;
--- a/vpx_dsp/variance.h
+++ b/vpx_dsp/variance.h
@@ -58,9 +58,6 @@
   vpx_sad_fn_t sdf;
   vpx_variance_fn_t vf;
   vpx_subpixvariance_fn_t svf;
-  vpx_variance_fn_t svf_halfpix_h;
-  vpx_variance_fn_t svf_halfpix_v;
-  vpx_variance_fn_t svf_halfpix_hv;
   vpx_sad_multi_fn_t sdx3f;
   vpx_sad_multi_fn_t sdx8f;
   vpx_sad_multi_d_fn_t sdx4df;
--- a/vpx_dsp/vpx_dsp.mk
+++ b/vpx_dsp/vpx_dsp.mk
@@ -310,8 +310,6 @@
 
 DSP_SRCS-$(HAVE_SSE)    += x86/variance_sse2.c
 DSP_SRCS-$(HAVE_SSE2)   += x86/variance_sse2.c  # Contains SSE2 and SSSE3
-DSP_SRCS-$(HAVE_SSE2)   += x86/halfpix_variance_sse2.c
-DSP_SRCS-$(HAVE_SSE2)   += x86/halfpix_variance_impl_sse2.asm
 DSP_SRCS-$(HAVE_AVX2)   += x86/variance_avx2.c
 DSP_SRCS-$(HAVE_AVX2)   += x86/variance_impl_avx2.c
 
--- a/vpx_dsp/vpx_dsp_rtcd_defs.pl
+++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl
@@ -1511,23 +1511,6 @@
 add_proto qw/uint32_t vpx_sub_pixel_avg_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
   specialize qw/vpx_sub_pixel_avg_variance4x4 msa sse2 ssse3/;
 
-#
-# Specialty Subpixel
-#
-# TODO(johannkoenig): Add neon implementations of
-#  vpx_variance_halfpixvar16x16_h
-#  vpx_variance_halfpixvar16x16_v
-#  vpx_variance_halfpixvar16x16_hv
-# https://bugs.chromium.org/p/webm/issues/detail?id=1273
-add_proto qw/uint32_t vpx_variance_halfpixvar16x16_h/, "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, uint32_t *sse";
-  specialize qw/vpx_variance_halfpixvar16x16_h sse2/;
-
-add_proto qw/uint32_t vpx_variance_halfpixvar16x16_v/, "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, uint32_t *sse";
-  specialize qw/vpx_variance_halfpixvar16x16_v sse2/;
-
-add_proto qw/uint32_t vpx_variance_halfpixvar16x16_hv/, "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, uint32_t *sse";
-  specialize qw/vpx_variance_halfpixvar16x16_hv sse2/;
-
 if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
   add_proto qw/unsigned int vpx_highbd_12_variance64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
   specialize qw/vpx_highbd_12_variance64x64 sse2/;
--- a/vpx_dsp/x86/halfpix_variance_impl_sse2.asm
+++ /dev/null
@@ -1,346 +1,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-%include "vpx_ports/x86_abi_support.asm"
-
-;void vpx_half_horiz_vert_variance16x_h_sse2(unsigned char *ref,
-;                                            int ref_stride,
-;                                            unsigned char *src,
-;                                            int src_stride,
-;                                            unsigned int height,
-;                                            int *sum,
-;                                            unsigned int *sumsquared)
-global sym(vpx_half_horiz_vert_variance16x_h_sse2) PRIVATE
-sym(vpx_half_horiz_vert_variance16x_h_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 7
-    SAVE_XMM 7
-    GET_GOT     rbx
-    push rsi
-    push rdi
-    ; end prolog
-
-        pxor            xmm6,           xmm6                ;  error accumulator
-        pxor            xmm7,           xmm7                ;  sse eaccumulator
-        mov             rsi,            arg(0) ;ref
-
-        mov             rdi,            arg(2) ;src
-        movsxd          rcx,            dword ptr arg(4) ;height
-        movsxd          rax,            dword ptr arg(1) ;ref_stride
-        movsxd          rdx,            dword ptr arg(3)    ;src_stride
-
-        pxor            xmm0,           xmm0                ;
-
-        movdqu          xmm5,           XMMWORD PTR [rsi]
-        movdqu          xmm3,           XMMWORD PTR [rsi+1]
-        pavgb           xmm5,           xmm3                ;  xmm5 = avg(xmm1,xmm3) horizontal line 1
-
-        lea             rsi,            [rsi + rax]
-
-vpx_half_horiz_vert_variance16x_h_1:
-        movdqu          xmm1,           XMMWORD PTR [rsi]     ;
-        movdqu          xmm2,           XMMWORD PTR [rsi+1]   ;
-        pavgb           xmm1,           xmm2                ;  xmm1 = avg(xmm1,xmm3) horizontal line i+1
-
-        pavgb           xmm5,           xmm1                ;  xmm = vertical average of the above
-
-        movdqa          xmm4,           xmm5
-        punpcklbw       xmm5,           xmm0                ;  xmm5 = words of above
-        punpckhbw       xmm4,           xmm0
-
-        movq            xmm3,           QWORD PTR [rdi]     ;  xmm3 = d0,d1,d2..d7
-        punpcklbw       xmm3,           xmm0                ;  xmm3 = words of above
-        psubw           xmm5,           xmm3                ;  xmm5 -= xmm3
-
-        movq            xmm3,           QWORD PTR [rdi+8]
-        punpcklbw       xmm3,           xmm0
-        psubw           xmm4,           xmm3
-
-        paddw           xmm6,           xmm5                ;  xmm6 += accumulated column differences
-        paddw           xmm6,           xmm4
-        pmaddwd         xmm5,           xmm5                ;  xmm5 *= xmm5
-        pmaddwd         xmm4,           xmm4
-        paddd           xmm7,           xmm5                ;  xmm7 += accumulated square column differences
-        paddd           xmm7,           xmm4
-
-        movdqa          xmm5,           xmm1                ;  save xmm1 for use on the next row
-
-        lea             rsi,            [rsi + rax]
-        lea             rdi,            [rdi + rdx]
-
-        sub             rcx,            1                   ;
-        jnz             vpx_half_horiz_vert_variance16x_h_1     ;
-
-        pxor        xmm1,           xmm1
-        pxor        xmm5,           xmm5
-
-        punpcklwd   xmm0,           xmm6
-        punpckhwd   xmm1,           xmm6
-        psrad       xmm0,           16
-        psrad       xmm1,           16
-        paddd       xmm0,           xmm1
-        movdqa      xmm1,           xmm0
-
-        movdqa      xmm6,           xmm7
-        punpckldq   xmm6,           xmm5
-        punpckhdq   xmm7,           xmm5
-        paddd       xmm6,           xmm7
-
-        punpckldq   xmm0,           xmm5
-        punpckhdq   xmm1,           xmm5
-        paddd       xmm0,           xmm1
-
-        movdqa      xmm7,           xmm6
-        movdqa      xmm1,           xmm0
-
-        psrldq      xmm7,           8
-        psrldq      xmm1,           8
-
-        paddd       xmm6,           xmm7
-        paddd       xmm0,           xmm1
-
-        mov         rsi,            arg(5) ;[Sum]
-        mov         rdi,            arg(6) ;[SSE]
-
-        movd        [rsi],       xmm0
-        movd        [rdi],       xmm6
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
-;void vpx_half_vert_variance16x_h_sse2(unsigned char *ref,
-;                                      int ref_stride,
-;                                      unsigned char *src,
-;                                      int src_stride,
-;                                      unsigned int height,
-;                                      int *sum,
-;                                      unsigned int *sumsquared)
-global sym(vpx_half_vert_variance16x_h_sse2) PRIVATE
-sym(vpx_half_vert_variance16x_h_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 7
-    SAVE_XMM 7
-    GET_GOT     rbx
-    push rsi
-    push rdi
-    ; end prolog
-
-        pxor            xmm6,           xmm6                ;  error accumulator
-        pxor            xmm7,           xmm7                ;  sse eaccumulator
-        mov             rsi,            arg(0)              ;ref
-
-        mov             rdi,            arg(2)              ;src
-        movsxd          rcx,            dword ptr arg(4)    ;height
-        movsxd          rax,            dword ptr arg(1)    ;ref_stride
-        movsxd          rdx,            dword ptr arg(3)    ;src_stride
-
-        movdqu          xmm5,           XMMWORD PTR [rsi]
-        lea             rsi,            [rsi + rax          ]
-        pxor            xmm0,           xmm0
-
-vpx_half_vert_variance16x_h_1:
-        movdqu          xmm3,           XMMWORD PTR [rsi]
-
-        pavgb           xmm5,           xmm3                ;  xmm5 = avg(xmm1,xmm3)
-        movdqa          xmm4,           xmm5
-        punpcklbw       xmm5,           xmm0
-        punpckhbw       xmm4,           xmm0
-
-        movq            xmm2,           QWORD PTR [rdi]
-        punpcklbw       xmm2,           xmm0
-        psubw           xmm5,           xmm2
-        movq            xmm2,           QWORD PTR [rdi+8]
-        punpcklbw       xmm2,           xmm0
-        psubw           xmm4,           xmm2
-
-        paddw           xmm6,           xmm5                ;  xmm6 += accumulated column differences
-        paddw           xmm6,           xmm4
-        pmaddwd         xmm5,           xmm5                ;  xmm5 *= xmm5
-        pmaddwd         xmm4,           xmm4
-        paddd           xmm7,           xmm5                ;  xmm7 += accumulated square column differences
-        paddd           xmm7,           xmm4
-
-        movdqa          xmm5,           xmm3
-
-        lea             rsi,            [rsi + rax]
-        lea             rdi,            [rdi + rdx]
-
-        sub             rcx,            1
-        jnz             vpx_half_vert_variance16x_h_1
-
-        pxor        xmm1,           xmm1
-        pxor        xmm5,           xmm5
-
-        punpcklwd   xmm0,           xmm6
-        punpckhwd   xmm1,           xmm6
-        psrad       xmm0,           16
-        psrad       xmm1,           16
-        paddd       xmm0,           xmm1
-        movdqa      xmm1,           xmm0
-
-        movdqa      xmm6,           xmm7
-        punpckldq   xmm6,           xmm5
-        punpckhdq   xmm7,           xmm5
-        paddd       xmm6,           xmm7
-
-        punpckldq   xmm0,           xmm5
-        punpckhdq   xmm1,           xmm5
-        paddd       xmm0,           xmm1
-
-        movdqa      xmm7,           xmm6
-        movdqa      xmm1,           xmm0
-
-        psrldq      xmm7,           8
-        psrldq      xmm1,           8
-
-        paddd       xmm6,           xmm7
-        paddd       xmm0,           xmm1
-
-        mov         rsi,            arg(5) ;[Sum]
-        mov         rdi,            arg(6) ;[SSE]
-
-        movd        [rsi],       xmm0
-        movd        [rdi],       xmm6
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
-;void vpx_half_horiz_variance16x_h_sse2(unsigned char *ref,
-;                                       int ref_stride
-;                                       unsigned char *src,
-;                                       int src_stride,
-;                                       unsigned int height,
-;                                       int *sum,
-;                                       unsigned int *sumsquared)
-global sym(vpx_half_horiz_variance16x_h_sse2) PRIVATE
-sym(vpx_half_horiz_variance16x_h_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 7
-    SAVE_XMM 7
-    GET_GOT     rbx
-    push rsi
-    push rdi
-    ; end prolog
-
-        pxor            xmm6,           xmm6                ;  error accumulator
-        pxor            xmm7,           xmm7                ;  sse eaccumulator
-        mov             rsi,            arg(0) ;ref
-
-        mov             rdi,            arg(2) ;src
-        movsxd          rcx,            dword ptr arg(4) ;height
-        movsxd          rax,            dword ptr arg(1) ;ref_stride
-        movsxd          rdx,            dword ptr arg(3)    ;src_stride
-
-        pxor            xmm0,           xmm0                ;
-
-vpx_half_horiz_variance16x_h_1:
-        movdqu          xmm5,           XMMWORD PTR [rsi]     ;  xmm5 = s0,s1,s2..s15
-        movdqu          xmm3,           XMMWORD PTR [rsi+1]   ;  xmm3 = s1,s2,s3..s16
-
-        pavgb           xmm5,           xmm3                ;  xmm5 = avg(xmm1,xmm3)
-        movdqa          xmm1,           xmm5
-        punpcklbw       xmm5,           xmm0                ;  xmm5 = words of above
-        punpckhbw       xmm1,           xmm0
-
-        movq            xmm3,           QWORD PTR [rdi]     ;  xmm3 = d0,d1,d2..d7
-        punpcklbw       xmm3,           xmm0                ;  xmm3 = words of above
-        movq            xmm2,           QWORD PTR [rdi+8]
-        punpcklbw       xmm2,           xmm0
-
-        psubw           xmm5,           xmm3                ;  xmm5 -= xmm3
-        psubw           xmm1,           xmm2
-        paddw           xmm6,           xmm5                ;  xmm6 += accumulated column differences
-        paddw           xmm6,           xmm1
-        pmaddwd         xmm5,           xmm5                ;  xmm5 *= xmm5
-        pmaddwd         xmm1,           xmm1
-        paddd           xmm7,           xmm5                ;  xmm7 += accumulated square column differences
-        paddd           xmm7,           xmm1
-
-        lea             rsi,            [rsi + rax]
-        lea             rdi,            [rdi + rdx]
-
-        sub             rcx,            1                   ;
-        jnz             vpx_half_horiz_variance16x_h_1        ;
-
-        pxor        xmm1,           xmm1
-        pxor        xmm5,           xmm5
-
-        punpcklwd   xmm0,           xmm6
-        punpckhwd   xmm1,           xmm6
-        psrad       xmm0,           16
-        psrad       xmm1,           16
-        paddd       xmm0,           xmm1
-        movdqa      xmm1,           xmm0
-
-        movdqa      xmm6,           xmm7
-        punpckldq   xmm6,           xmm5
-        punpckhdq   xmm7,           xmm5
-        paddd       xmm6,           xmm7
-
-        punpckldq   xmm0,           xmm5
-        punpckhdq   xmm1,           xmm5
-        paddd       xmm0,           xmm1
-
-        movdqa      xmm7,           xmm6
-        movdqa      xmm1,           xmm0
-
-        psrldq      xmm7,           8
-        psrldq      xmm1,           8
-
-        paddd       xmm6,           xmm7
-        paddd       xmm0,           xmm1
-
-        mov         rsi,            arg(5) ;[Sum]
-        mov         rdi,            arg(6) ;[SSE]
-
-        movd        [rsi],       xmm0
-        movd        [rdi],       xmm6
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-SECTION_RODATA
-;    short xmm_bi_rd[8] = { 64, 64, 64, 64,64, 64, 64, 64};
-align 16
-xmm_bi_rd:
-    times 8 dw 64
-align 16
-vpx_bilinear_filters_sse2:
-    dw 128, 128, 128, 128, 128, 128, 128, 128,  0,  0,  0,  0,  0,  0,  0,  0
-    dw 112, 112, 112, 112, 112, 112, 112, 112, 16, 16, 16, 16, 16, 16, 16, 16
-    dw 96, 96, 96, 96, 96, 96, 96, 96, 32, 32, 32, 32, 32, 32, 32, 32
-    dw 80, 80, 80, 80, 80, 80, 80, 80, 48, 48, 48, 48, 48, 48, 48, 48
-    dw 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64
-    dw 48, 48, 48, 48, 48, 48, 48, 48, 80, 80, 80, 80, 80, 80, 80, 80
-    dw 32, 32, 32, 32, 32, 32, 32, 32, 96, 96, 96, 96, 96, 96, 96, 96
-    dw 16, 16, 16, 16, 16, 16, 16, 16, 112, 112, 112, 112, 112, 112, 112, 112
--- a/vpx_dsp/x86/halfpix_variance_sse2.c
+++ /dev/null
@@ -1,76 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#include <assert.h>
-
-#include "./vpx_config.h"
-#include "./vpx_dsp_rtcd.h"
-#include "vpx/vpx_integer.h"
-
-void vpx_half_horiz_vert_variance16x_h_sse2(const unsigned char *ref,
-                                            int ref_stride,
-                                            const unsigned char *src,
-                                            int src_stride, unsigned int height,
-                                            int *sum, unsigned int *sumsquared);
-void vpx_half_horiz_variance16x_h_sse2(const unsigned char *ref, int ref_stride,
-                                       const unsigned char *src, int src_stride,
-                                       unsigned int height, int *sum,
-                                       unsigned int *sumsquared);
-void vpx_half_vert_variance16x_h_sse2(const unsigned char *ref, int ref_stride,
-                                      const unsigned char *src, int src_stride,
-                                      unsigned int height, int *sum,
-                                      unsigned int *sumsquared);
-
-uint32_t vpx_variance_halfpixvar16x16_h_sse2(const unsigned char *src,
-                                             int src_stride,
-                                             const unsigned char *dst,
-                                             int dst_stride, uint32_t *sse) {
-  int xsum0;
-  unsigned int xxsum0;
-
-  vpx_half_horiz_variance16x_h_sse2(src, src_stride, dst, dst_stride, 16,
-                                    &xsum0, &xxsum0);
-
-  *sse = xxsum0;
-  assert(xsum0 <= 255 * 16 * 16);
-  assert(xsum0 >= -255 * 16 * 16);
-  return (xxsum0 - ((uint32_t)((int64_t)xsum0 * xsum0) >> 8));
-}
-
-uint32_t vpx_variance_halfpixvar16x16_v_sse2(const unsigned char *src,
-                                             int src_stride,
-                                             const unsigned char *dst,
-                                             int dst_stride, uint32_t *sse) {
-  int xsum0;
-  unsigned int xxsum0;
-  vpx_half_vert_variance16x_h_sse2(src, src_stride, dst, dst_stride, 16, &xsum0,
-                                   &xxsum0);
-
-  *sse = xxsum0;
-  assert(xsum0 <= 255 * 16 * 16);
-  assert(xsum0 >= -255 * 16 * 16);
-  return (xxsum0 - ((uint32_t)((int64_t)xsum0 * xsum0) >> 8));
-}
-
-uint32_t vpx_variance_halfpixvar16x16_hv_sse2(const unsigned char *src,
-                                              int src_stride,
-                                              const unsigned char *dst,
-                                              int dst_stride, uint32_t *sse) {
-  int xsum0;
-  unsigned int xxsum0;
-
-  vpx_half_horiz_vert_variance16x_h_sse2(src, src_stride, dst, dst_stride, 16,
-                                         &xsum0, &xxsum0);
-
-  *sse = xxsum0;
-  assert(xsum0 <= 255 * 16 * 16);
-  assert(xsum0 >= -255 * 16 * 16);
-  return (xxsum0 - ((uint32_t)((int64_t)xsum0 * xsum0) >> 8));
-}