ref: a70861c43598fce722843f8e7c765b1b154fb17c
parent: 58a497dc29a722412f62fa3d9ea9bcd42abf424b
parent: d393885af195a58e37e6966352708b3b04892c90
author: Johann Koenig <johannkoenig@google.com>
date: Fri Aug 26 17:28:01 EDT 2016
Merge "Remove halfpix specialization"
--- a/vp8/encoder/mcomp.c
+++ b/vp8/encoder/mcomp.c
@@ -409,7 +409,8 @@
/* go left then right and check error */
this_mv.as_mv.row = startmv.as_mv.row;
this_mv.as_mv.col = ((startmv.as_mv.col - 8) | 4);
- thismse = vfp->svf_halfpix_h(y - 1, y_stride, z, b->src_stride, &sse);
+ /* "halfpix" horizontal variance */
+ thismse = vfp->svf(y - 1, y_stride, 4, 0, z, b->src_stride, &sse);
left = thismse + mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
if (left < bestmse) {
@@ -420,7 +421,8 @@
}
this_mv.as_mv.col += 8;
- thismse = vfp->svf_halfpix_h(y, y_stride, z, b->src_stride, &sse);
+ /* "halfpix" horizontal variance */
+ thismse = vfp->svf(y, y_stride, 4, 0, z, b->src_stride, &sse);
right = thismse + mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
if (right < bestmse) {
@@ -433,7 +435,8 @@
/* go up then down and check error */
this_mv.as_mv.col = startmv.as_mv.col;
this_mv.as_mv.row = ((startmv.as_mv.row - 8) | 4);
- thismse = vfp->svf_halfpix_v(y - y_stride, y_stride, z, b->src_stride, &sse);
+ /* "halfpix" vertical variance */
+ thismse = vfp->svf(y - y_stride, y_stride, 0, 4, z, b->src_stride, &sse);
up = thismse + mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
if (up < bestmse) {
@@ -444,7 +447,8 @@
}
this_mv.as_mv.row += 8;
- thismse = vfp->svf_halfpix_v(y, y_stride, z, b->src_stride, &sse);
+ /* "halfpix" vertical variance */
+ thismse = vfp->svf(y, y_stride, 0, 4, z, b->src_stride, &sse);
down = thismse + mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
if (down < bestmse) {
@@ -462,25 +466,28 @@
case 0:
this_mv.as_mv.col = (this_mv.as_mv.col - 8) | 4;
this_mv.as_mv.row = (this_mv.as_mv.row - 8) | 4;
- thismse = vfp->svf_halfpix_hv(y - 1 - y_stride, y_stride, z,
- b->src_stride, &sse);
+ /* "halfpix" horizontal/vertical variance */
+ thismse =
+ vfp->svf(y - 1 - y_stride, y_stride, 4, 4, z, b->src_stride, &sse);
break;
case 1:
this_mv.as_mv.col += 4;
this_mv.as_mv.row = (this_mv.as_mv.row - 8) | 4;
- thismse =
- vfp->svf_halfpix_hv(y - y_stride, y_stride, z, b->src_stride, &sse);
+ /* "halfpix" horizontal/vertical variance */
+ thismse = vfp->svf(y - y_stride, y_stride, 4, 4, z, b->src_stride, &sse);
break;
case 2:
this_mv.as_mv.col = (this_mv.as_mv.col - 8) | 4;
this_mv.as_mv.row += 4;
- thismse = vfp->svf_halfpix_hv(y - 1, y_stride, z, b->src_stride, &sse);
+ /* "halfpix" horizontal/vertical variance */
+ thismse = vfp->svf(y - 1, y_stride, 4, 4, z, b->src_stride, &sse);
break;
case 3:
default:
this_mv.as_mv.col += 4;
this_mv.as_mv.row += 4;
- thismse = vfp->svf_halfpix_hv(y, y_stride, z, b->src_stride, &sse);
+ /* "halfpix" horizontal/vertical variance */
+ thismse = vfp->svf(y, y_stride, 4, 4, z, b->src_stride, &sse);
break;
}
@@ -698,7 +705,8 @@
/* go left then right and check error */
this_mv.as_mv.row = startmv.as_mv.row;
this_mv.as_mv.col = ((startmv.as_mv.col - 8) | 4);
- thismse = vfp->svf_halfpix_h(y - 1, y_stride, z, b->src_stride, &sse);
+ /* "halfpix" horizontal variance */
+ thismse = vfp->svf(y - 1, y_stride, 4, 0, z, b->src_stride, &sse);
left = thismse + mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
if (left < bestmse) {
@@ -709,7 +717,8 @@
}
this_mv.as_mv.col += 8;
- thismse = vfp->svf_halfpix_h(y, y_stride, z, b->src_stride, &sse);
+ /* "halfpix" horizontal variance */
+ thismse = vfp->svf(y, y_stride, 4, 0, z, b->src_stride, &sse);
right = thismse + mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
if (right < bestmse) {
@@ -722,7 +731,8 @@
/* go up then down and check error */
this_mv.as_mv.col = startmv.as_mv.col;
this_mv.as_mv.row = ((startmv.as_mv.row - 8) | 4);
- thismse = vfp->svf_halfpix_v(y - y_stride, y_stride, z, b->src_stride, &sse);
+ /* "halfpix" vertical variance */
+ thismse = vfp->svf(y - y_stride, y_stride, 0, 4, z, b->src_stride, &sse);
up = thismse + mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
if (up < bestmse) {
@@ -733,7 +743,8 @@
}
this_mv.as_mv.row += 8;
- thismse = vfp->svf_halfpix_v(y, y_stride, z, b->src_stride, &sse);
+ /* "halfpix" vertical variance */
+ thismse = vfp->svf(y, y_stride, 0, 4, z, b->src_stride, &sse);
down = thismse + mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
if (down < bestmse) {
@@ -751,25 +762,28 @@
case 0:
this_mv.as_mv.col = (this_mv.as_mv.col - 8) | 4;
this_mv.as_mv.row = (this_mv.as_mv.row - 8) | 4;
- thismse = vfp->svf_halfpix_hv(y - 1 - y_stride, y_stride, z,
- b->src_stride, &sse);
+ /* "halfpix" horizontal/vertical variance */
+ thismse =
+ vfp->svf(y - 1 - y_stride, y_stride, 4, 4, z, b->src_stride, &sse);
break;
case 1:
this_mv.as_mv.col += 4;
this_mv.as_mv.row = (this_mv.as_mv.row - 8) | 4;
- thismse =
- vfp->svf_halfpix_hv(y - y_stride, y_stride, z, b->src_stride, &sse);
+ /* "halfpix" horizontal/vertical variance */
+ thismse = vfp->svf(y - y_stride, y_stride, 4, 4, z, b->src_stride, &sse);
break;
case 2:
this_mv.as_mv.col = (this_mv.as_mv.col - 8) | 4;
this_mv.as_mv.row += 4;
- thismse = vfp->svf_halfpix_hv(y - 1, y_stride, z, b->src_stride, &sse);
+ /* "halfpix" horizontal/vertical variance */
+ thismse = vfp->svf(y - 1, y_stride, 4, 4, z, b->src_stride, &sse);
break;
case 3:
default:
this_mv.as_mv.col += 4;
this_mv.as_mv.row += 4;
- thismse = vfp->svf_halfpix_hv(y, y_stride, z, b->src_stride, &sse);
+ /* "halfpix" horizontal/vertical variance */
+ thismse = vfp->svf(y, y_stride, 4, 4, z, b->src_stride, &sse);
break;
}
--- a/vp8/encoder/onyx_if.c
+++ b/vp8/encoder/onyx_if.c
@@ -1914,9 +1914,6 @@
cpi->fn_ptr[BLOCK_16X16].sdf = vpx_sad16x16;
cpi->fn_ptr[BLOCK_16X16].vf = vpx_variance16x16;
cpi->fn_ptr[BLOCK_16X16].svf = vpx_sub_pixel_variance16x16;
- cpi->fn_ptr[BLOCK_16X16].svf_halfpix_h = vpx_variance_halfpixvar16x16_h;
- cpi->fn_ptr[BLOCK_16X16].svf_halfpix_v = vpx_variance_halfpixvar16x16_v;
- cpi->fn_ptr[BLOCK_16X16].svf_halfpix_hv = vpx_variance_halfpixvar16x16_hv;
cpi->fn_ptr[BLOCK_16X16].sdx3f = vpx_sad16x16x3;
cpi->fn_ptr[BLOCK_16X16].sdx8f = vpx_sad16x16x8;
cpi->fn_ptr[BLOCK_16X16].sdx4df = vpx_sad16x16x4d;
@@ -1924,9 +1921,6 @@
cpi->fn_ptr[BLOCK_16X8].sdf = vpx_sad16x8;
cpi->fn_ptr[BLOCK_16X8].vf = vpx_variance16x8;
cpi->fn_ptr[BLOCK_16X8].svf = vpx_sub_pixel_variance16x8;
- cpi->fn_ptr[BLOCK_16X8].svf_halfpix_h = NULL;
- cpi->fn_ptr[BLOCK_16X8].svf_halfpix_v = NULL;
- cpi->fn_ptr[BLOCK_16X8].svf_halfpix_hv = NULL;
cpi->fn_ptr[BLOCK_16X8].sdx3f = vpx_sad16x8x3;
cpi->fn_ptr[BLOCK_16X8].sdx8f = vpx_sad16x8x8;
cpi->fn_ptr[BLOCK_16X8].sdx4df = vpx_sad16x8x4d;
@@ -1934,9 +1928,6 @@
cpi->fn_ptr[BLOCK_8X16].sdf = vpx_sad8x16;
cpi->fn_ptr[BLOCK_8X16].vf = vpx_variance8x16;
cpi->fn_ptr[BLOCK_8X16].svf = vpx_sub_pixel_variance8x16;
- cpi->fn_ptr[BLOCK_8X16].svf_halfpix_h = NULL;
- cpi->fn_ptr[BLOCK_8X16].svf_halfpix_v = NULL;
- cpi->fn_ptr[BLOCK_8X16].svf_halfpix_hv = NULL;
cpi->fn_ptr[BLOCK_8X16].sdx3f = vpx_sad8x16x3;
cpi->fn_ptr[BLOCK_8X16].sdx8f = vpx_sad8x16x8;
cpi->fn_ptr[BLOCK_8X16].sdx4df = vpx_sad8x16x4d;
@@ -1944,9 +1935,6 @@
cpi->fn_ptr[BLOCK_8X8].sdf = vpx_sad8x8;
cpi->fn_ptr[BLOCK_8X8].vf = vpx_variance8x8;
cpi->fn_ptr[BLOCK_8X8].svf = vpx_sub_pixel_variance8x8;
- cpi->fn_ptr[BLOCK_8X8].svf_halfpix_h = NULL;
- cpi->fn_ptr[BLOCK_8X8].svf_halfpix_v = NULL;
- cpi->fn_ptr[BLOCK_8X8].svf_halfpix_hv = NULL;
cpi->fn_ptr[BLOCK_8X8].sdx3f = vpx_sad8x8x3;
cpi->fn_ptr[BLOCK_8X8].sdx8f = vpx_sad8x8x8;
cpi->fn_ptr[BLOCK_8X8].sdx4df = vpx_sad8x8x4d;
@@ -1954,9 +1942,6 @@
cpi->fn_ptr[BLOCK_4X4].sdf = vpx_sad4x4;
cpi->fn_ptr[BLOCK_4X4].vf = vpx_variance4x4;
cpi->fn_ptr[BLOCK_4X4].svf = vpx_sub_pixel_variance4x4;
- cpi->fn_ptr[BLOCK_4X4].svf_halfpix_h = NULL;
- cpi->fn_ptr[BLOCK_4X4].svf_halfpix_v = NULL;
- cpi->fn_ptr[BLOCK_4X4].svf_halfpix_hv = NULL;
cpi->fn_ptr[BLOCK_4X4].sdx3f = vpx_sad4x4x3;
cpi->fn_ptr[BLOCK_4X4].sdx8f = vpx_sad4x4x8;
cpi->fn_ptr[BLOCK_4X4].sdx4df = vpx_sad4x4x4d;
--- a/vpx_dsp/variance.c
+++ b/vpx_dsp/variance.c
@@ -49,24 +49,6 @@
return sum;
}
-uint32_t vpx_variance_halfpixvar16x16_h_c(const uint8_t *a, int a_stride,
- const uint8_t *b, int b_stride,
- uint32_t *sse) {
- return vpx_sub_pixel_variance16x16_c(a, a_stride, 4, 0, b, b_stride, sse);
-}
-
-uint32_t vpx_variance_halfpixvar16x16_v_c(const uint8_t *a, int a_stride,
- const uint8_t *b, int b_stride,
- uint32_t *sse) {
- return vpx_sub_pixel_variance16x16_c(a, a_stride, 0, 4, b, b_stride, sse);
-}
-
-uint32_t vpx_variance_halfpixvar16x16_hv_c(const uint8_t *a, int a_stride,
- const uint8_t *b, int b_stride,
- uint32_t *sse) {
- return vpx_sub_pixel_variance16x16_c(a, a_stride, 4, 4, b, b_stride, sse);
-}
-
static void variance(const uint8_t *a, int a_stride, const uint8_t *b,
int b_stride, int w, int h, uint32_t *sse, int *sum) {
int i, j;
--- a/vpx_dsp/variance.h
+++ b/vpx_dsp/variance.h
@@ -58,9 +58,6 @@
vpx_sad_fn_t sdf;
vpx_variance_fn_t vf;
vpx_subpixvariance_fn_t svf;
- vpx_variance_fn_t svf_halfpix_h;
- vpx_variance_fn_t svf_halfpix_v;
- vpx_variance_fn_t svf_halfpix_hv;
vpx_sad_multi_fn_t sdx3f;
vpx_sad_multi_fn_t sdx8f;
vpx_sad_multi_d_fn_t sdx4df;
--- a/vpx_dsp/vpx_dsp.mk
+++ b/vpx_dsp/vpx_dsp.mk
@@ -310,8 +310,6 @@
DSP_SRCS-$(HAVE_SSE) += x86/variance_sse2.c
DSP_SRCS-$(HAVE_SSE2) += x86/variance_sse2.c # Contains SSE2 and SSSE3
-DSP_SRCS-$(HAVE_SSE2) += x86/halfpix_variance_sse2.c
-DSP_SRCS-$(HAVE_SSE2) += x86/halfpix_variance_impl_sse2.asm
DSP_SRCS-$(HAVE_AVX2) += x86/variance_avx2.c
DSP_SRCS-$(HAVE_AVX2) += x86/variance_impl_avx2.c
--- a/vpx_dsp/vpx_dsp_rtcd_defs.pl
+++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl
@@ -1511,23 +1511,6 @@
add_proto qw/uint32_t vpx_sub_pixel_avg_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
specialize qw/vpx_sub_pixel_avg_variance4x4 msa sse2 ssse3/;
-#
-# Specialty Subpixel
-#
-# TODO(johannkoenig): Add neon implementations of
-# vpx_variance_halfpixvar16x16_h
-# vpx_variance_halfpixvar16x16_v
-# vpx_variance_halfpixvar16x16_hv
-# https://bugs.chromium.org/p/webm/issues/detail?id=1273
-add_proto qw/uint32_t vpx_variance_halfpixvar16x16_h/, "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, uint32_t *sse";
- specialize qw/vpx_variance_halfpixvar16x16_h sse2/;
-
-add_proto qw/uint32_t vpx_variance_halfpixvar16x16_v/, "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, uint32_t *sse";
- specialize qw/vpx_variance_halfpixvar16x16_v sse2/;
-
-add_proto qw/uint32_t vpx_variance_halfpixvar16x16_hv/, "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, uint32_t *sse";
- specialize qw/vpx_variance_halfpixvar16x16_hv sse2/;
-
if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
add_proto qw/unsigned int vpx_highbd_12_variance64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vpx_highbd_12_variance64x64 sse2/;
--- a/vpx_dsp/x86/halfpix_variance_impl_sse2.asm
+++ /dev/null
@@ -1,346 +1,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-%include "vpx_ports/x86_abi_support.asm"
-
-;void vpx_half_horiz_vert_variance16x_h_sse2(unsigned char *ref,
-; int ref_stride,
-; unsigned char *src,
-; int src_stride,
-; unsigned int height,
-; int *sum,
-; unsigned int *sumsquared)
-global sym(vpx_half_horiz_vert_variance16x_h_sse2) PRIVATE
-sym(vpx_half_horiz_vert_variance16x_h_sse2):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 7
- SAVE_XMM 7
- GET_GOT rbx
- push rsi
- push rdi
- ; end prolog
-
- pxor xmm6, xmm6 ; error accumulator
- pxor xmm7, xmm7 ; sse eaccumulator
- mov rsi, arg(0) ;ref
-
- mov rdi, arg(2) ;src
- movsxd rcx, dword ptr arg(4) ;height
- movsxd rax, dword ptr arg(1) ;ref_stride
- movsxd rdx, dword ptr arg(3) ;src_stride
-
- pxor xmm0, xmm0 ;
-
- movdqu xmm5, XMMWORD PTR [rsi]
- movdqu xmm3, XMMWORD PTR [rsi+1]
- pavgb xmm5, xmm3 ; xmm5 = avg(xmm1,xmm3) horizontal line 1
-
- lea rsi, [rsi + rax]
-
-vpx_half_horiz_vert_variance16x_h_1:
- movdqu xmm1, XMMWORD PTR [rsi] ;
- movdqu xmm2, XMMWORD PTR [rsi+1] ;
- pavgb xmm1, xmm2 ; xmm1 = avg(xmm1,xmm3) horizontal line i+1
-
- pavgb xmm5, xmm1 ; xmm = vertical average of the above
-
- movdqa xmm4, xmm5
- punpcklbw xmm5, xmm0 ; xmm5 = words of above
- punpckhbw xmm4, xmm0
-
- movq xmm3, QWORD PTR [rdi] ; xmm3 = d0,d1,d2..d7
- punpcklbw xmm3, xmm0 ; xmm3 = words of above
- psubw xmm5, xmm3 ; xmm5 -= xmm3
-
- movq xmm3, QWORD PTR [rdi+8]
- punpcklbw xmm3, xmm0
- psubw xmm4, xmm3
-
- paddw xmm6, xmm5 ; xmm6 += accumulated column differences
- paddw xmm6, xmm4
- pmaddwd xmm5, xmm5 ; xmm5 *= xmm5
- pmaddwd xmm4, xmm4
- paddd xmm7, xmm5 ; xmm7 += accumulated square column differences
- paddd xmm7, xmm4
-
- movdqa xmm5, xmm1 ; save xmm1 for use on the next row
-
- lea rsi, [rsi + rax]
- lea rdi, [rdi + rdx]
-
- sub rcx, 1 ;
- jnz vpx_half_horiz_vert_variance16x_h_1 ;
-
- pxor xmm1, xmm1
- pxor xmm5, xmm5
-
- punpcklwd xmm0, xmm6
- punpckhwd xmm1, xmm6
- psrad xmm0, 16
- psrad xmm1, 16
- paddd xmm0, xmm1
- movdqa xmm1, xmm0
-
- movdqa xmm6, xmm7
- punpckldq xmm6, xmm5
- punpckhdq xmm7, xmm5
- paddd xmm6, xmm7
-
- punpckldq xmm0, xmm5
- punpckhdq xmm1, xmm5
- paddd xmm0, xmm1
-
- movdqa xmm7, xmm6
- movdqa xmm1, xmm0
-
- psrldq xmm7, 8
- psrldq xmm1, 8
-
- paddd xmm6, xmm7
- paddd xmm0, xmm1
-
- mov rsi, arg(5) ;[Sum]
- mov rdi, arg(6) ;[SSE]
-
- movd [rsi], xmm0
- movd [rdi], xmm6
-
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_GOT
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
-
-
-;void vpx_half_vert_variance16x_h_sse2(unsigned char *ref,
-; int ref_stride,
-; unsigned char *src,
-; int src_stride,
-; unsigned int height,
-; int *sum,
-; unsigned int *sumsquared)
-global sym(vpx_half_vert_variance16x_h_sse2) PRIVATE
-sym(vpx_half_vert_variance16x_h_sse2):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 7
- SAVE_XMM 7
- GET_GOT rbx
- push rsi
- push rdi
- ; end prolog
-
- pxor xmm6, xmm6 ; error accumulator
- pxor xmm7, xmm7 ; sse eaccumulator
- mov rsi, arg(0) ;ref
-
- mov rdi, arg(2) ;src
- movsxd rcx, dword ptr arg(4) ;height
- movsxd rax, dword ptr arg(1) ;ref_stride
- movsxd rdx, dword ptr arg(3) ;src_stride
-
- movdqu xmm5, XMMWORD PTR [rsi]
- lea rsi, [rsi + rax ]
- pxor xmm0, xmm0
-
-vpx_half_vert_variance16x_h_1:
- movdqu xmm3, XMMWORD PTR [rsi]
-
- pavgb xmm5, xmm3 ; xmm5 = avg(xmm1,xmm3)
- movdqa xmm4, xmm5
- punpcklbw xmm5, xmm0
- punpckhbw xmm4, xmm0
-
- movq xmm2, QWORD PTR [rdi]
- punpcklbw xmm2, xmm0
- psubw xmm5, xmm2
- movq xmm2, QWORD PTR [rdi+8]
- punpcklbw xmm2, xmm0
- psubw xmm4, xmm2
-
- paddw xmm6, xmm5 ; xmm6 += accumulated column differences
- paddw xmm6, xmm4
- pmaddwd xmm5, xmm5 ; xmm5 *= xmm5
- pmaddwd xmm4, xmm4
- paddd xmm7, xmm5 ; xmm7 += accumulated square column differences
- paddd xmm7, xmm4
-
- movdqa xmm5, xmm3
-
- lea rsi, [rsi + rax]
- lea rdi, [rdi + rdx]
-
- sub rcx, 1
- jnz vpx_half_vert_variance16x_h_1
-
- pxor xmm1, xmm1
- pxor xmm5, xmm5
-
- punpcklwd xmm0, xmm6
- punpckhwd xmm1, xmm6
- psrad xmm0, 16
- psrad xmm1, 16
- paddd xmm0, xmm1
- movdqa xmm1, xmm0
-
- movdqa xmm6, xmm7
- punpckldq xmm6, xmm5
- punpckhdq xmm7, xmm5
- paddd xmm6, xmm7
-
- punpckldq xmm0, xmm5
- punpckhdq xmm1, xmm5
- paddd xmm0, xmm1
-
- movdqa xmm7, xmm6
- movdqa xmm1, xmm0
-
- psrldq xmm7, 8
- psrldq xmm1, 8
-
- paddd xmm6, xmm7
- paddd xmm0, xmm1
-
- mov rsi, arg(5) ;[Sum]
- mov rdi, arg(6) ;[SSE]
-
- movd [rsi], xmm0
- movd [rdi], xmm6
-
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_GOT
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
-
-
-;void vpx_half_horiz_variance16x_h_sse2(unsigned char *ref,
-; int ref_stride
-; unsigned char *src,
-; int src_stride,
-; unsigned int height,
-; int *sum,
-; unsigned int *sumsquared)
-global sym(vpx_half_horiz_variance16x_h_sse2) PRIVATE
-sym(vpx_half_horiz_variance16x_h_sse2):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 7
- SAVE_XMM 7
- GET_GOT rbx
- push rsi
- push rdi
- ; end prolog
-
- pxor xmm6, xmm6 ; error accumulator
- pxor xmm7, xmm7 ; sse eaccumulator
- mov rsi, arg(0) ;ref
-
- mov rdi, arg(2) ;src
- movsxd rcx, dword ptr arg(4) ;height
- movsxd rax, dword ptr arg(1) ;ref_stride
- movsxd rdx, dword ptr arg(3) ;src_stride
-
- pxor xmm0, xmm0 ;
-
-vpx_half_horiz_variance16x_h_1:
- movdqu xmm5, XMMWORD PTR [rsi] ; xmm5 = s0,s1,s2..s15
- movdqu xmm3, XMMWORD PTR [rsi+1] ; xmm3 = s1,s2,s3..s16
-
- pavgb xmm5, xmm3 ; xmm5 = avg(xmm1,xmm3)
- movdqa xmm1, xmm5
- punpcklbw xmm5, xmm0 ; xmm5 = words of above
- punpckhbw xmm1, xmm0
-
- movq xmm3, QWORD PTR [rdi] ; xmm3 = d0,d1,d2..d7
- punpcklbw xmm3, xmm0 ; xmm3 = words of above
- movq xmm2, QWORD PTR [rdi+8]
- punpcklbw xmm2, xmm0
-
- psubw xmm5, xmm3 ; xmm5 -= xmm3
- psubw xmm1, xmm2
- paddw xmm6, xmm5 ; xmm6 += accumulated column differences
- paddw xmm6, xmm1
- pmaddwd xmm5, xmm5 ; xmm5 *= xmm5
- pmaddwd xmm1, xmm1
- paddd xmm7, xmm5 ; xmm7 += accumulated square column differences
- paddd xmm7, xmm1
-
- lea rsi, [rsi + rax]
- lea rdi, [rdi + rdx]
-
- sub rcx, 1 ;
- jnz vpx_half_horiz_variance16x_h_1 ;
-
- pxor xmm1, xmm1
- pxor xmm5, xmm5
-
- punpcklwd xmm0, xmm6
- punpckhwd xmm1, xmm6
- psrad xmm0, 16
- psrad xmm1, 16
- paddd xmm0, xmm1
- movdqa xmm1, xmm0
-
- movdqa xmm6, xmm7
- punpckldq xmm6, xmm5
- punpckhdq xmm7, xmm5
- paddd xmm6, xmm7
-
- punpckldq xmm0, xmm5
- punpckhdq xmm1, xmm5
- paddd xmm0, xmm1
-
- movdqa xmm7, xmm6
- movdqa xmm1, xmm0
-
- psrldq xmm7, 8
- psrldq xmm1, 8
-
- paddd xmm6, xmm7
- paddd xmm0, xmm1
-
- mov rsi, arg(5) ;[Sum]
- mov rdi, arg(6) ;[SSE]
-
- movd [rsi], xmm0
- movd [rdi], xmm6
-
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_GOT
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
-
-SECTION_RODATA
-; short xmm_bi_rd[8] = { 64, 64, 64, 64,64, 64, 64, 64};
-align 16
-xmm_bi_rd:
- times 8 dw 64
-align 16
-vpx_bilinear_filters_sse2:
- dw 128, 128, 128, 128, 128, 128, 128, 128, 0, 0, 0, 0, 0, 0, 0, 0
- dw 112, 112, 112, 112, 112, 112, 112, 112, 16, 16, 16, 16, 16, 16, 16, 16
- dw 96, 96, 96, 96, 96, 96, 96, 96, 32, 32, 32, 32, 32, 32, 32, 32
- dw 80, 80, 80, 80, 80, 80, 80, 80, 48, 48, 48, 48, 48, 48, 48, 48
- dw 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64
- dw 48, 48, 48, 48, 48, 48, 48, 48, 80, 80, 80, 80, 80, 80, 80, 80
- dw 32, 32, 32, 32, 32, 32, 32, 32, 96, 96, 96, 96, 96, 96, 96, 96
- dw 16, 16, 16, 16, 16, 16, 16, 16, 112, 112, 112, 112, 112, 112, 112, 112
--- a/vpx_dsp/x86/halfpix_variance_sse2.c
+++ /dev/null
@@ -1,76 +1,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include <assert.h>
-
-#include "./vpx_config.h"
-#include "./vpx_dsp_rtcd.h"
-#include "vpx/vpx_integer.h"
-
-void vpx_half_horiz_vert_variance16x_h_sse2(const unsigned char *ref,
- int ref_stride,
- const unsigned char *src,
- int src_stride, unsigned int height,
- int *sum, unsigned int *sumsquared);
-void vpx_half_horiz_variance16x_h_sse2(const unsigned char *ref, int ref_stride,
- const unsigned char *src, int src_stride,
- unsigned int height, int *sum,
- unsigned int *sumsquared);
-void vpx_half_vert_variance16x_h_sse2(const unsigned char *ref, int ref_stride,
- const unsigned char *src, int src_stride,
- unsigned int height, int *sum,
- unsigned int *sumsquared);
-
-uint32_t vpx_variance_halfpixvar16x16_h_sse2(const unsigned char *src,
- int src_stride,
- const unsigned char *dst,
- int dst_stride, uint32_t *sse) {
- int xsum0;
- unsigned int xxsum0;
-
- vpx_half_horiz_variance16x_h_sse2(src, src_stride, dst, dst_stride, 16,
- &xsum0, &xxsum0);
-
- *sse = xxsum0;
- assert(xsum0 <= 255 * 16 * 16);
- assert(xsum0 >= -255 * 16 * 16);
- return (xxsum0 - ((uint32_t)((int64_t)xsum0 * xsum0) >> 8));
-}
-
-uint32_t vpx_variance_halfpixvar16x16_v_sse2(const unsigned char *src,
- int src_stride,
- const unsigned char *dst,
- int dst_stride, uint32_t *sse) {
- int xsum0;
- unsigned int xxsum0;
- vpx_half_vert_variance16x_h_sse2(src, src_stride, dst, dst_stride, 16, &xsum0,
- &xxsum0);
-
- *sse = xxsum0;
- assert(xsum0 <= 255 * 16 * 16);
- assert(xsum0 >= -255 * 16 * 16);
- return (xxsum0 - ((uint32_t)((int64_t)xsum0 * xsum0) >> 8));
-}
-
-uint32_t vpx_variance_halfpixvar16x16_hv_sse2(const unsigned char *src,
- int src_stride,
- const unsigned char *dst,
- int dst_stride, uint32_t *sse) {
- int xsum0;
- unsigned int xxsum0;
-
- vpx_half_horiz_vert_variance16x_h_sse2(src, src_stride, dst, dst_stride, 16,
- &xsum0, &xxsum0);
-
- *sse = xxsum0;
- assert(xsum0 <= 255 * 16 * 16);
- assert(xsum0 >= -255 * 16 * 16);
- return (xxsum0 - ((uint32_t)((int64_t)xsum0 * xsum0) >> 8));
-}