ref: 5ac141187a38175eb4981a370678e0f44964d324
parent: d6ff6fe2edb774ae979447aa394eeade35e21d18
parent: 7ad8dbe417b03619855892e32726b75de9e9da1a
author: John Koleszar <jkoleszar@google.com>
date: Wed Feb 27 07:23:45 EST 2013
Merge "Remove unused vp9_copy32xn" into experimental
--- a/vp9/common/vp9_rtcd_defs.sh
+++ b/vp9/common/vp9_rtcd_defs.sh
@@ -491,16 +491,6 @@
prototype void vp9_sad4x4x4d "const uint8_t *src_ptr, int src_stride, const uint8_t **ref_ptr, int ref_stride, unsigned int *sad_array"
specialize vp9_sad4x4x4d sse
-#
-# Block copy
-#
-case $arch in
- x86*)
- prototype void vp9_copy32xn "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, int n"
- specialize vp9_copy32xn sse2 sse3
- ;;
-esac
-
prototype unsigned int vp9_sub_pixel_mse16x16 "const uint8_t *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, const uint8_t *dst_ptr, int dst_pixels_per_line, unsigned int *sse"
specialize vp9_sub_pixel_mse16x16 sse2 mmx
--- a/vp9/encoder/vp9_onyx_if.c
+++ b/vp9/encoder/vp9_onyx_if.c
@@ -1683,14 +1683,6 @@
BFP(BLOCK_4X4, vp9_sad4x4, vp9_variance4x4, vp9_sub_pixel_variance4x4,
NULL, NULL, NULL, vp9_sad4x4x3, vp9_sad4x4x8, vp9_sad4x4x4d)
-#if ARCH_X86 || ARCH_X86_64
- cpi->fn_ptr[BLOCK_16X16].copymem = vp9_copy32xn;
- cpi->fn_ptr[BLOCK_16X8].copymem = vp9_copy32xn;
- cpi->fn_ptr[BLOCK_8X16].copymem = vp9_copy32xn;
- cpi->fn_ptr[BLOCK_8X8].copymem = vp9_copy32xn;
- cpi->fn_ptr[BLOCK_4X4].copymem = vp9_copy32xn;
-#endif
-
cpi->full_search_sad = vp9_full_search_sad;
cpi->diamond_search_sad = vp9_diamond_search_sad;
cpi->refining_search_sad = vp9_refining_search_sad;
--- a/vp9/encoder/vp9_sad_c.c
+++ b/vp9/encoder/vp9_sad_c.c
@@ -484,61 +484,3 @@
sad_array[3] = vp9_sad4x4_c(src_ptr, src_stride,
ref_ptr[3], ref_stride, 0x7fffffff);
}
-
-/* Copy 2 macroblocks to a buffer */
-void vp9_copy32xn_c(uint8_t *src_ptr,
- int src_stride,
- uint8_t *dst_ptr,
- int dst_stride,
- int height) {
- int r;
-
- for (r = 0; r < height; r++) {
-#if !(CONFIG_FAST_UNALIGNED)
- dst_ptr[0] = src_ptr[0];
- dst_ptr[1] = src_ptr[1];
- dst_ptr[2] = src_ptr[2];
- dst_ptr[3] = src_ptr[3];
- dst_ptr[4] = src_ptr[4];
- dst_ptr[5] = src_ptr[5];
- dst_ptr[6] = src_ptr[6];
- dst_ptr[7] = src_ptr[7];
- dst_ptr[8] = src_ptr[8];
- dst_ptr[9] = src_ptr[9];
- dst_ptr[10] = src_ptr[10];
- dst_ptr[11] = src_ptr[11];
- dst_ptr[12] = src_ptr[12];
- dst_ptr[13] = src_ptr[13];
- dst_ptr[14] = src_ptr[14];
- dst_ptr[15] = src_ptr[15];
- dst_ptr[16] = src_ptr[16];
- dst_ptr[17] = src_ptr[17];
- dst_ptr[18] = src_ptr[18];
- dst_ptr[19] = src_ptr[19];
- dst_ptr[20] = src_ptr[20];
- dst_ptr[21] = src_ptr[21];
- dst_ptr[22] = src_ptr[22];
- dst_ptr[23] = src_ptr[23];
- dst_ptr[24] = src_ptr[24];
- dst_ptr[25] = src_ptr[25];
- dst_ptr[26] = src_ptr[26];
- dst_ptr[27] = src_ptr[27];
- dst_ptr[28] = src_ptr[28];
- dst_ptr[29] = src_ptr[29];
- dst_ptr[30] = src_ptr[30];
- dst_ptr[31] = src_ptr[31];
-#else
- ((uint32_t *)dst_ptr)[0] = ((uint32_t *)src_ptr)[0];
- ((uint32_t *)dst_ptr)[1] = ((uint32_t *)src_ptr)[1];
- ((uint32_t *)dst_ptr)[2] = ((uint32_t *)src_ptr)[2];
- ((uint32_t *)dst_ptr)[3] = ((uint32_t *)src_ptr)[3];
- ((uint32_t *)dst_ptr)[4] = ((uint32_t *)src_ptr)[4];
- ((uint32_t *)dst_ptr)[5] = ((uint32_t *)src_ptr)[5];
- ((uint32_t *)dst_ptr)[6] = ((uint32_t *)src_ptr)[6];
- ((uint32_t *)dst_ptr)[7] = ((uint32_t *)src_ptr)[7];
-#endif
- src_ptr += src_stride;
- dst_ptr += dst_stride;
-
- }
-}
--- a/vp9/encoder/vp9_variance.h
+++ b/vp9/encoder/vp9_variance.h
@@ -19,12 +19,6 @@
int ref_stride,
unsigned int max_sad);
-typedef void (*vp9_copy32xn_fn_t)(const uint8_t *src_ptr,
- int source_stride,
- const uint8_t *ref_ptr,
- int ref_stride,
- int n);
-
typedef void (*vp9_sad_multi_fn_t)(const uint8_t *src_ptr,
int source_stride,
const uint8_t *ref_ptr,
@@ -79,7 +73,6 @@
vp9_sad_multi_fn_t sdx3f;
vp9_sad_multi1_fn_t sdx8f;
vp9_sad_multi_d_fn_t sdx4df;
- vp9_copy32xn_fn_t copymem;
} vp9_variance_fn_ptr_t;
#endif // VP9_ENCODER_VP9_VARIANCE_H_
--- /dev/null
+++ b/vp9/encoder/x86/vp9_sad4d_sse2.asm
@@ -0,0 +1,225 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+%include "third_party/x86inc/x86inc.asm"
+
+SECTION .text
+
+; PROCESS_4x2x4 first, off_{first,second}_{src,ref}, advance_at_end
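+; Computes SADs for two rows of a 4-wide block against four candidate
+; references. On the first call (%1 == 1) the running sums are seeded
+; into m6 (ref1/ref2 packed as dwords) and m7 (ref3/ref4); later calls
+; accumulate into them with paddd.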
+%macro PROCESS_4x2x4 5-6 0
+ movd m0, [srcq +%2]
+%if %1 == 1
+ movd m6, [ref1q+%3]
+ movd m4, [ref2q+%3]
+ movd m7, [ref3q+%3]
+ movd m5, [ref4q+%3]
+ punpckldq m0, [srcq +%4]
+ punpckldq m6, [ref1q+%5]
+ punpckldq m4, [ref2q+%5]
+ punpckldq m7, [ref3q+%5]
+ punpckldq m5, [ref4q+%5]
+ psadbw m6, m0
+ psadbw m4, m0
+ psadbw m7, m0
+ psadbw m5, m0
+ punpckldq m6, m4
+ punpckldq m7, m5
+%else
+ movd m1, [ref1q+%3]
+ movd m2, [ref2q+%3]
+ movd m3, [ref3q+%3]
+ movd m4, [ref4q+%3]
+ punpckldq m0, [srcq +%4]
+ punpckldq m1, [ref1q+%5]
+ punpckldq m2, [ref2q+%5]
+ punpckldq m3, [ref3q+%5]
+ punpckldq m4, [ref4q+%5]
+ psadbw m1, m0
+ psadbw m2, m0
+ psadbw m3, m0
+ psadbw m4, m0
+ punpckldq m1, m2
+ punpckldq m3, m4
+ paddd m6, m1
+ paddd m7, m3
+%endif
+%if %6 == 1
+ lea srcq, [srcq +src_strideq*2]
+ lea ref1q, [ref1q+ref_strideq*2]
+ lea ref2q, [ref2q+ref_strideq*2]
+ lea ref3q, [ref3q+ref_strideq*2]
+ lea ref4q, [ref4q+ref_strideq*2]
+%endif
+%endmacro
+
+; PROCESS_8x2x4 first, off_{first,second}_{src,ref}, advance_at_end
+%macro PROCESS_8x2x4 5-6 0
+ movh m0, [srcq +%2]
+%if %1 == 1
+ movh m4, [ref1q+%3]
+ movh m5, [ref2q+%3]
+ movh m6, [ref3q+%3]
+ movh m7, [ref4q+%3]
+ movhps m0, [srcq +%4]
+ movhps m4, [ref1q+%5]
+ movhps m5, [ref2q+%5]
+ movhps m6, [ref3q+%5]
+ movhps m7, [ref4q+%5]
+ psadbw m4, m0
+ psadbw m5, m0
+ psadbw m6, m0
+ psadbw m7, m0
+%else
+ movh m1, [ref1q+%3]
+ movh m2, [ref2q+%3]
+ movh m3, [ref3q+%3]
+ movhps m0, [srcq +%4]
+ movhps m1, [ref1q+%5]
+ movhps m2, [ref2q+%5]
+ movhps m3, [ref3q+%5]
+ psadbw m1, m0
+ psadbw m2, m0
+ psadbw m3, m0
+ paddd m4, m1
+ movh m1, [ref4q+%3]
+ movhps m1, [ref4q+%5]
+ paddd m5, m2
+ paddd m6, m3
+ psadbw m1, m0
+ paddd m7, m1
+%endif
+%if %6 == 1
+ lea srcq, [srcq +src_strideq*2]
+ lea ref1q, [ref1q+ref_strideq*2]
+ lea ref2q, [ref2q+ref_strideq*2]
+ lea ref3q, [ref3q+ref_strideq*2]
+ lea ref4q, [ref4q+ref_strideq*2]
+%endif
+%endmacro
+
+; PROCESS_16x2x4 first, off_{first,second}_{src,ref}, advance_at_end
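+; src rows are loaded aligned (mova) while the four candidate refs may
+; sit at arbitrary offsets, hence movu.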
+%macro PROCESS_16x2x4 5-6 0
+ ; 1st 16 px
+ mova m0, [srcq +%2]
+%if %1 == 1
+ movu m4, [ref1q+%3]
+ movu m5, [ref2q+%3]
+ movu m6, [ref3q+%3]
+ movu m7, [ref4q+%3]
+ psadbw m4, m0
+ psadbw m5, m0
+ psadbw m6, m0
+ psadbw m7, m0
+%else
+ movu m1, [ref1q+%3]
+ movu m2, [ref2q+%3]
+ movu m3, [ref3q+%3]
+ psadbw m1, m0
+ psadbw m2, m0
+ psadbw m3, m0
+ paddd m4, m1
+ movu m1, [ref4q+%3]
+ paddd m5, m2
+ paddd m6, m3
+ psadbw m1, m0
+ paddd m7, m1
+%endif
+
+ ; 2nd 16 px
+ mova m0, [srcq +%4]
+ movu m1, [ref1q+%5]
+ movu m2, [ref2q+%5]
+ movu m3, [ref3q+%5]
+ psadbw m1, m0
+ psadbw m2, m0
+ psadbw m3, m0
+ paddd m4, m1
+ movu m1, [ref4q+%5]
+ paddd m5, m2
+ paddd m6, m3
+%if %6 == 1
+ lea srcq, [srcq +src_strideq*2]
+ lea ref1q, [ref1q+ref_strideq*2]
+ lea ref2q, [ref2q+ref_strideq*2]
+ lea ref3q, [ref3q+ref_strideq*2]
+ lea ref4q, [ref4q+ref_strideq*2]
+%endif
+ psadbw m1, m0
+ paddd m7, m1
+%endmacro
+
+; PROCESS_32x2x4 first, off_{first,second}_{src,ref}, advance_at_end
+%macro PROCESS_32x2x4 5-6 0
+ PROCESS_16x2x4 %1, %2, %3, %2 + 16, %3 + 16
+ PROCESS_16x2x4 0, %4, %5, %4 + 16, %5 + 16, %6
+%endmacro
+
+; PROCESS_64x2x4 first, off_{first,second}_{src,ref}, advance_at_end
+%macro PROCESS_64x2x4 5-6 0
+ PROCESS_32x2x4 %1, %2, %3, %2 + 32, %3 + 32
+ PROCESS_32x2x4 0, %4, %5, %4 + 32, %5 + 32, %6
+%endmacro
+
+; void vp9_sadNxNx4d_sse2(uint8_t *src, int src_stride,
+; uint8_t *ref[4], int ref_stride,
+; unsigned int res[4]);
+; where NxN = 64x64, 32x32, 16x16, 16x8, 8x16 or 8x8
+; (the 4x4 variant is emitted below as vp9_sad4x4x4d_sse, using MMX)
+%macro SADNXN4D 2
+%if UNIX64
+cglobal sad%1x%2x4d, 5, 8, 8, src, src_stride, ref1, ref_stride, \
+ res, ref2, ref3, ref4
+%else
+cglobal sad%1x%2x4d, 4, 7, 8, src, src_stride, ref1, ref_stride, \
+ ref2, ref3, ref4
+%endif
+ movsxdifnidn src_strideq, src_strided
+ movsxdifnidn ref_strideq, ref_strided
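+  ; ref1q holds the uint8_t *ref[4] array; fan it out into the four
+  ; pointer registers, loading ref1q itself last so the array base is
+  ; not clobbered before the other loads.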
+ mov ref2q, [ref1q+gprsize*1]
+ mov ref3q, [ref1q+gprsize*2]
+ mov ref4q, [ref1q+gprsize*3]
+ mov ref1q, [ref1q+gprsize*0]
+
+ PROCESS_%1x2x4 1, 0, 0, src_strideq, ref_strideq, 1
+%rep (%2-4)/2
+ PROCESS_%1x2x4 0, 0, 0, src_strideq, ref_strideq, 1
+%endrep
+ PROCESS_%1x2x4 0, 0, 0, src_strideq, ref_strideq, 0
+
+%if mmsize == 16
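+  ; each psadbw accumulator holds two partial sums, one per qword;
+  ; fold the four accumulators into a single xmm of four dword SADs.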
+ pslldq m5, 4
+ pslldq m7, 4
+ por m4, m5
+ por m6, m7
+ mova m5, m4
+ mova m7, m6
+ punpcklqdq m4, m6
+ punpckhqdq m5, m7
+ movifnidn r4, r4mp
+ paddd m4, m5
+ movu [r4], m4
+ RET
+%else
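+  ; MMX (4x4) case: m6/m7 already hold the four dword sums.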
+ movifnidn r4, r4mp
+ movq [r4+0], m6
+ movq [r4+8], m7
+ RET
+%endif
+%endmacro
+
+INIT_XMM sse2
+SADNXN4D 64, 64
+SADNXN4D 32, 32
+SADNXN4D 16, 16
+SADNXN4D 16, 8
+SADNXN4D 8, 16
+SADNXN4D 8, 8
+
+INIT_MMX sse
+SADNXN4D 4, 4
--- a/vp9/encoder/x86/vp9_sad4d_sse2_yasm.asm
+++ /dev/null
@@ -1,225 +0,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-%include "third_party/x86inc/x86inc.asm"
-
-SECTION .text
-
-; PROCESS_4x2x4 first, off_{first,second}_{src,ref}, advance_at_end
-%macro PROCESS_4x2x4 5-6 0
- movd m0, [srcq +%2]
-%if %1 == 1
- movd m6, [ref1q+%3]
- movd m4, [ref2q+%3]
- movd m7, [ref3q+%3]
- movd m5, [ref4q+%3]
- punpckldq m0, [srcq +%4]
- punpckldq m6, [ref1q+%5]
- punpckldq m4, [ref2q+%5]
- punpckldq m7, [ref3q+%5]
- punpckldq m5, [ref4q+%5]
- psadbw m6, m0
- psadbw m4, m0
- psadbw m7, m0
- psadbw m5, m0
- punpckldq m6, m4
- punpckldq m7, m5
-%else
- movd m1, [ref1q+%3]
- movd m2, [ref2q+%3]
- movd m3, [ref3q+%3]
- movd m4, [ref4q+%3]
- punpckldq m0, [srcq +%4]
- punpckldq m1, [ref1q+%5]
- punpckldq m2, [ref2q+%5]
- punpckldq m3, [ref3q+%5]
- punpckldq m4, [ref4q+%5]
- psadbw m1, m0
- psadbw m2, m0
- psadbw m3, m0
- psadbw m4, m0
- punpckldq m1, m2
- punpckldq m3, m4
- paddd m6, m1
- paddd m7, m3
-%endif
-%if %6 == 1
- lea srcq, [srcq +src_strideq*2]
- lea ref1q, [ref1q+ref_strideq*2]
- lea ref2q, [ref2q+ref_strideq*2]
- lea ref3q, [ref3q+ref_strideq*2]
- lea ref4q, [ref4q+ref_strideq*2]
-%endif
-%endmacro
-
-; PROCESS_8x2x4 first, off_{first,second}_{src,ref}, advance_at_end
-%macro PROCESS_8x2x4 5-6 0
- movh m0, [srcq +%2]
-%if %1 == 1
- movh m4, [ref1q+%3]
- movh m5, [ref2q+%3]
- movh m6, [ref3q+%3]
- movh m7, [ref4q+%3]
- movhps m0, [srcq +%4]
- movhps m4, [ref1q+%5]
- movhps m5, [ref2q+%5]
- movhps m6, [ref3q+%5]
- movhps m7, [ref4q+%5]
- psadbw m4, m0
- psadbw m5, m0
- psadbw m6, m0
- psadbw m7, m0
-%else
- movh m1, [ref1q+%3]
- movh m2, [ref2q+%3]
- movh m3, [ref3q+%3]
- movhps m0, [srcq +%4]
- movhps m1, [ref1q+%5]
- movhps m2, [ref2q+%5]
- movhps m3, [ref3q+%5]
- psadbw m1, m0
- psadbw m2, m0
- psadbw m3, m0
- paddd m4, m1
- movh m1, [ref4q+%3]
- movhps m1, [ref4q+%5]
- paddd m5, m2
- paddd m6, m3
- psadbw m1, m0
- paddd m7, m1
-%endif
-%if %6 == 1
- lea srcq, [srcq +src_strideq*2]
- lea ref1q, [ref1q+ref_strideq*2]
- lea ref2q, [ref2q+ref_strideq*2]
- lea ref3q, [ref3q+ref_strideq*2]
- lea ref4q, [ref4q+ref_strideq*2]
-%endif
-%endmacro
-
-; PROCESS_16x2x4 first, off_{first,second}_{src,ref}, advance_at_end
-%macro PROCESS_16x2x4 5-6 0
- ; 1st 16 px
- mova m0, [srcq +%2]
-%if %1 == 1
- movu m4, [ref1q+%3]
- movu m5, [ref2q+%3]
- movu m6, [ref3q+%3]
- movu m7, [ref4q+%3]
- psadbw m4, m0
- psadbw m5, m0
- psadbw m6, m0
- psadbw m7, m0
-%else
- movu m1, [ref1q+%3]
- movu m2, [ref2q+%3]
- movu m3, [ref3q+%3]
- psadbw m1, m0
- psadbw m2, m0
- psadbw m3, m0
- paddd m4, m1
- movu m1, [ref4q+%3]
- paddd m5, m2
- paddd m6, m3
- psadbw m1, m0
- paddd m7, m1
-%endif
-
- ; 2nd 16 px
- mova m0, [srcq +%4]
- movu m1, [ref1q+%5]
- movu m2, [ref2q+%5]
- movu m3, [ref3q+%5]
- psadbw m1, m0
- psadbw m2, m0
- psadbw m3, m0
- paddd m4, m1
- movu m1, [ref4q+%5]
- paddd m5, m2
- paddd m6, m3
-%if %6 == 1
- lea srcq, [srcq +src_strideq*2]
- lea ref1q, [ref1q+ref_strideq*2]
- lea ref2q, [ref2q+ref_strideq*2]
- lea ref3q, [ref3q+ref_strideq*2]
- lea ref4q, [ref4q+ref_strideq*2]
-%endif
- psadbw m1, m0
- paddd m7, m1
-%endmacro
-
-; PROCESS_32x2x4 first, off_{first,second}_{src,ref}, advance_at_end
-%macro PROCESS_32x2x4 5-6 0
- PROCESS_16x2x4 %1, %2, %3, %2 + 16, %3 + 16
- PROCESS_16x2x4 0, %4, %5, %4 + 16, %5 + 16, %6
-%endmacro
-
-; PROCESS_64x2x4 first, off_{first,second}_{src,ref}, advance_at_end
-%macro PROCESS_64x2x4 5-6 0
- PROCESS_32x2x4 %1, %2, %3, %2 + 32, %3 + 32
- PROCESS_32x2x4 0, %4, %5, %4 + 32, %5 + 32, %6
-%endmacro
-
-; void vp9_sadNxNx4d_sse2(uint8_t *src, int src_stride,
-; uint8_t *ref[4], int ref_stride,
-; unsigned int res[4]);
-; where NxN = 64x64, 32x32, 16x16, 16x8, 8x16 or 8x8
-%macro SADNXN4D 2
-%if UNIX64
-cglobal sad%1x%2x4d, 5, 8, 8, src, src_stride, ref1, ref_stride, \
- res, ref2, ref3, ref4
-%else
-cglobal sad%1x%2x4d, 4, 7, 8, src, src_stride, ref1, ref_stride, \
- ref2, ref3, ref4
-%endif
- movsxdifnidn src_strideq, src_strided
- movsxdifnidn ref_strideq, ref_strided
- mov ref2q, [ref1q+gprsize*1]
- mov ref3q, [ref1q+gprsize*2]
- mov ref4q, [ref1q+gprsize*3]
- mov ref1q, [ref1q+gprsize*0]
-
- PROCESS_%1x2x4 1, 0, 0, src_strideq, ref_strideq, 1
-%rep (%2-4)/2
- PROCESS_%1x2x4 0, 0, 0, src_strideq, ref_strideq, 1
-%endrep
- PROCESS_%1x2x4 0, 0, 0, src_strideq, ref_strideq, 0
-
-%if mmsize == 16
- pslldq m5, 4
- pslldq m7, 4
- por m4, m5
- por m6, m7
- mova m5, m4
- mova m7, m6
- punpcklqdq m4, m6
- punpckhqdq m5, m7
- movifnidn r4, r4mp
- paddd m4, m5
- movu [r4], m4
- RET
-%else
- movifnidn r4, r4mp
- movq [r4+0], m6
- movq [r4+8], m7
- RET
-%endif
-%endmacro
-
-INIT_XMM sse2
-SADNXN4D 64, 64
-SADNXN4D 32, 32
-SADNXN4D 16, 16
-SADNXN4D 16, 8
-SADNXN4D 8, 16
-SADNXN4D 8, 8
-
-INIT_MMX sse
-SADNXN4D 4, 4
--- a/vp9/encoder/x86/vp9_sad_sse2.asm
+++ b/vp9/encoder/x86/vp9_sad_sse2.asm
@@ -8,85 +8,175 @@
; be found in the AUTHORS file in the root of the source tree.
;
+%include "third_party/x86inc/x86inc.asm"
-%include "vpx_ports/x86_abi_support.asm"
+SECTION .text
-;void vp9_copy32xn_sse2(
-; unsigned char *src_ptr,
-; int src_stride,
-; unsigned char *dst_ptr,
-; int dst_stride,
-; int height);
-global sym(vp9_copy32xn_sse2) PRIVATE
-sym(vp9_copy32xn_sse2):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 5
- SAVE_XMM 7
- push rsi
- push rdi
- ; end prolog
+; unsigned int vp9_sad64x64_sse2(uint8_t *src, int src_stride,
+; uint8_t *ref, int ref_stride);
+INIT_XMM sse2
+cglobal sad64x64, 4, 5, 5, src, src_stride, ref, ref_stride, n_rows
+ movsxdifnidn src_strideq, src_strided
+ movsxdifnidn ref_strideq, ref_strided
+ mov n_rowsd, 64
+ pxor m0, m0
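+  ; one 64-pixel row per iteration; ref rows may be unaligned (movu),
+  ; while the psadbw memory operands assume a 16-byte-aligned src.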
+.loop:
+ movu m1, [refq]
+ movu m2, [refq+16]
+ movu m3, [refq+32]
+ movu m4, [refq+48]
+ psadbw m1, [srcq]
+ psadbw m2, [srcq+16]
+ psadbw m3, [srcq+32]
+ psadbw m4, [srcq+48]
+ paddd m1, m2
+ paddd m3, m4
+ add refq, ref_strideq
+ paddd m0, m1
+ add srcq, src_strideq
+ paddd m0, m3
+ dec n_rowsd
+ jg .loop
- mov rsi, arg(0) ;src_ptr
- mov rdi, arg(2) ;dst_ptr
+ movhlps m1, m0
+ paddd m0, m1
+ movd eax, m0
+ RET
- movsxd rax, dword ptr arg(1) ;src_stride
- movsxd rdx, dword ptr arg(3) ;dst_stride
- movsxd rcx, dword ptr arg(4) ;height
+; unsigned int vp9_sad32x32_sse2(uint8_t *src, int src_stride,
+; uint8_t *ref, int ref_stride);
+INIT_XMM sse2
+cglobal sad32x32, 4, 5, 5, src, src_stride, ref, ref_stride, n_rows
+ movsxdifnidn src_strideq, src_strided
+ movsxdifnidn ref_strideq, ref_strided
+ mov n_rowsd, 16
+ pxor m0, m0
-.block_copy_sse2_loopx4:
- movdqu xmm0, XMMWORD PTR [rsi]
- movdqu xmm1, XMMWORD PTR [rsi + 16]
- movdqu xmm2, XMMWORD PTR [rsi + rax]
- movdqu xmm3, XMMWORD PTR [rsi + rax + 16]
+.loop:
+ movu m1, [refq]
+ movu m2, [refq+16]
+ movu m3, [refq+ref_strideq]
+ movu m4, [refq+ref_strideq+16]
+ psadbw m1, [srcq]
+ psadbw m2, [srcq+16]
+ psadbw m3, [srcq+src_strideq]
+ psadbw m4, [srcq+src_strideq+16]
+ paddd m1, m2
+ paddd m3, m4
+ lea refq, [refq+ref_strideq*2]
+ paddd m0, m1
+ lea srcq, [srcq+src_strideq*2]
+ paddd m0, m3
+ dec n_rowsd
+ jg .loop
- lea rsi, [rsi+rax*2]
+ movhlps m1, m0
+ paddd m0, m1
+ movd eax, m0
+ RET
- movdqu xmm4, XMMWORD PTR [rsi]
- movdqu xmm5, XMMWORD PTR [rsi + 16]
- movdqu xmm6, XMMWORD PTR [rsi + rax]
- movdqu xmm7, XMMWORD PTR [rsi + rax + 16]
+; unsigned int vp9_sad16x{8,16}_sse2(uint8_t *src, int src_stride,
+; uint8_t *ref, int ref_stride);
+%macro SAD16XN 1
+cglobal sad16x%1, 4, 7, 5, src, src_stride, ref, ref_stride, \
+ src_stride3, ref_stride3, n_rows
+ movsxdifnidn src_strideq, src_strided
+ movsxdifnidn ref_strideq, ref_strided
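+  ; precompute 3*stride so the loop can address four consecutive rows
+  ; without bumping the base pointers mid-iteration.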
+ lea src_stride3q, [src_strideq*3]
+ lea ref_stride3q, [ref_strideq*3]
+ mov n_rowsd, %1/4
+ pxor m0, m0
- lea rsi, [rsi+rax*2]
+.loop:
+ movu m1, [refq]
+ movu m2, [refq+ref_strideq]
+ movu m3, [refq+ref_strideq*2]
+ movu m4, [refq+ref_stride3q]
+ psadbw m1, [srcq]
+ psadbw m2, [srcq+src_strideq]
+ psadbw m3, [srcq+src_strideq*2]
+ psadbw m4, [srcq+src_stride3q]
+ paddd m1, m2
+ paddd m3, m4
+ lea refq, [refq+ref_strideq*4]
+ paddd m0, m1
+ lea srcq, [srcq+src_strideq*4]
+ paddd m0, m3
+ dec n_rowsd
+ jg .loop
- movdqa XMMWORD PTR [rdi], xmm0
- movdqa XMMWORD PTR [rdi + 16], xmm1
- movdqa XMMWORD PTR [rdi + rdx], xmm2
- movdqa XMMWORD PTR [rdi + rdx + 16], xmm3
+ movhlps m1, m0
+ paddd m0, m1
+ movd eax, m0
+ RET
+%endmacro
- lea rdi, [rdi+rdx*2]
+INIT_XMM sse2
+SAD16XN 16 ; sad16x16_sse2
+SAD16XN 8 ; sad16x8_sse2
- movdqa XMMWORD PTR [rdi], xmm4
- movdqa XMMWORD PTR [rdi + 16], xmm5
- movdqa XMMWORD PTR [rdi + rdx], xmm6
- movdqa XMMWORD PTR [rdi + rdx + 16], xmm7
+; unsigned int vp9_sad8x{8,16}_sse2(uint8_t *src, int src_stride,
+; uint8_t *ref, int ref_stride);
+%macro SAD8XN 1
+cglobal sad8x%1, 4, 7, 5, src, src_stride, ref, ref_stride, \
+ src_stride3, ref_stride3, n_rows
+ movsxdifnidn src_strideq, src_strided
+ movsxdifnidn ref_strideq, ref_strided
+ lea src_stride3q, [src_strideq*3]
+ lea ref_stride3q, [ref_strideq*3]
+ mov n_rowsd, %1/4
+ pxor m0, m0
- lea rdi, [rdi+rdx*2]
+.loop:
+ movh m1, [refq]
+ movhps m1, [refq+ref_strideq]
+ movh m2, [refq+ref_strideq*2]
+ movhps m2, [refq+ref_stride3q]
+ movh m3, [srcq]
+ movhps m3, [srcq+src_strideq]
+ movh m4, [srcq+src_strideq*2]
+ movhps m4, [srcq+src_stride3q]
+ psadbw m1, m3
+ psadbw m2, m4
+ lea refq, [refq+ref_strideq*4]
+ paddd m0, m1
+ lea srcq, [srcq+src_strideq*4]
+ paddd m0, m2
+ dec n_rowsd
+ jg .loop
- sub rcx, 4
- cmp rcx, 4
- jge .block_copy_sse2_loopx4
+ movhlps m1, m0
+ paddd m0, m1
+ movd eax, m0
+ RET
+%endmacro
- cmp rcx, 0
- je .copy_is_done
+INIT_XMM sse2
+SAD8XN 16 ; sad8x16_sse2
+SAD8XN 8 ; sad8x8_sse2
-.block_copy_sse2_loop:
- movdqu xmm0, XMMWORD PTR [rsi]
- movdqu xmm1, XMMWORD PTR [rsi + 16]
- lea rsi, [rsi+rax]
-
- movdqa XMMWORD PTR [rdi], xmm0
- movdqa XMMWORD PTR [rdi + 16], xmm1
- lea rdi, [rdi+rdx]
-
- sub rcx, 1
- jne .block_copy_sse2_loop
-
-.copy_is_done:
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
+; unsigned int vp9_sad4x4_sse(uint8_t *src, int src_stride,
+; uint8_t *ref, int ref_stride);
+INIT_MMX sse
+cglobal sad4x4, 4, 4, 8, src, src_stride, ref, ref_stride
+ movsxdifnidn src_strideq, src_strided
+ movsxdifnidn ref_strideq, ref_strided
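+  ; the whole 4x4 block fits in registers: pack two rows per register
+  ; with punpckldq and reduce with two psadbw ops, no loop needed.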
+ movd m0, [refq]
+ movd m1, [refq+ref_strideq]
+ movd m2, [srcq]
+ movd m3, [srcq+src_strideq]
+ lea refq, [refq+ref_strideq*2]
+ lea srcq, [srcq+src_strideq*2]
+ movd m4, [refq]
+ movd m5, [refq+ref_strideq]
+ movd m6, [srcq]
+ movd m7, [srcq+src_strideq]
+ punpckldq m0, m1
+ punpckldq m2, m3
+ punpckldq m4, m5
+ punpckldq m6, m7
+ psadbw m0, m2
+ psadbw m4, m6
+ paddd m0, m4
+ movd eax, m0
+ RET
--- a/vp9/encoder/x86/vp9_sad_sse2_yasm.asm
+++ /dev/null
@@ -1,182 +0,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-%include "third_party/x86inc/x86inc.asm"
-
-SECTION .text
-
-; unsigned int vp9_sad64x64_sse2(uint8_t *src, int src_stride,
-; uint8_t *ref, int ref_stride);
-INIT_XMM sse2
-cglobal sad64x64, 4, 5, 5, src, src_stride, ref, ref_stride, n_rows
- movsxdifnidn src_strideq, src_strided
- movsxdifnidn ref_strideq, ref_strided
- mov n_rowsd, 64
- pxor m0, m0
-.loop:
- movu m1, [refq]
- movu m2, [refq+16]
- movu m3, [refq+32]
- movu m4, [refq+48]
- psadbw m1, [srcq]
- psadbw m2, [srcq+16]
- psadbw m3, [srcq+32]
- psadbw m4, [srcq+48]
- paddd m1, m2
- paddd m3, m4
- add refq, ref_strideq
- paddd m0, m1
- add srcq, src_strideq
- paddd m0, m3
- dec n_rowsd
- jg .loop
-
- movhlps m1, m0
- paddd m0, m1
- movd eax, m0
- RET
-
-; unsigned int vp9_sad32x32_sse2(uint8_t *src, int src_stride,
-; uint8_t *ref, int ref_stride);
-INIT_XMM sse2
-cglobal sad32x32, 4, 5, 5, src, src_stride, ref, ref_stride, n_rows
- movsxdifnidn src_strideq, src_strided
- movsxdifnidn ref_strideq, ref_strided
- mov n_rowsd, 16
- pxor m0, m0
-
-.loop:
- movu m1, [refq]
- movu m2, [refq+16]
- movu m3, [refq+ref_strideq]
- movu m4, [refq+ref_strideq+16]
- psadbw m1, [srcq]
- psadbw m2, [srcq+16]
- psadbw m3, [srcq+src_strideq]
- psadbw m4, [srcq+src_strideq+16]
- paddd m1, m2
- paddd m3, m4
- lea refq, [refq+ref_strideq*2]
- paddd m0, m1
- lea srcq, [srcq+src_strideq*2]
- paddd m0, m3
- dec n_rowsd
- jg .loop
-
- movhlps m1, m0
- paddd m0, m1
- movd eax, m0
- RET
-
-; unsigned int vp9_sad16x{8,16}_sse2(uint8_t *src, int src_stride,
-; uint8_t *ref, int ref_stride);
-%macro SAD16XN 1
-cglobal sad16x%1, 4, 7, 5, src, src_stride, ref, ref_stride, \
- src_stride3, ref_stride3, n_rows
- movsxdifnidn src_strideq, src_strided
- movsxdifnidn ref_strideq, ref_strided
- lea src_stride3q, [src_strideq*3]
- lea ref_stride3q, [ref_strideq*3]
- mov n_rowsd, %1/4
- pxor m0, m0
-
-.loop:
- movu m1, [refq]
- movu m2, [refq+ref_strideq]
- movu m3, [refq+ref_strideq*2]
- movu m4, [refq+ref_stride3q]
- psadbw m1, [srcq]
- psadbw m2, [srcq+src_strideq]
- psadbw m3, [srcq+src_strideq*2]
- psadbw m4, [srcq+src_stride3q]
- paddd m1, m2
- paddd m3, m4
- lea refq, [refq+ref_strideq*4]
- paddd m0, m1
- lea srcq, [srcq+src_strideq*4]
- paddd m0, m3
- dec n_rowsd
- jg .loop
-
- movhlps m1, m0
- paddd m0, m1
- movd eax, m0
- RET
-%endmacro
-
-INIT_XMM sse2
-SAD16XN 16 ; sad16x16_sse2
-SAD16XN 8 ; sad16x8_sse2
-
-; unsigned int vp9_sad8x{8,16}_sse2(uint8_t *src, int src_stride,
-; uint8_t *ref, int ref_stride);
-%macro SAD8XN 1
-cglobal sad8x%1, 4, 7, 5, src, src_stride, ref, ref_stride, \
- src_stride3, ref_stride3, n_rows
- movsxdifnidn src_strideq, src_strided
- movsxdifnidn ref_strideq, ref_strided
- lea src_stride3q, [src_strideq*3]
- lea ref_stride3q, [ref_strideq*3]
- mov n_rowsd, %1/4
- pxor m0, m0
-
-.loop:
- movh m1, [refq]
- movhps m1, [refq+ref_strideq]
- movh m2, [refq+ref_strideq*2]
- movhps m2, [refq+ref_stride3q]
- movh m3, [srcq]
- movhps m3, [srcq+src_strideq]
- movh m4, [srcq+src_strideq*2]
- movhps m4, [srcq+src_stride3q]
- psadbw m1, m3
- psadbw m2, m4
- lea refq, [refq+ref_strideq*4]
- paddd m0, m1
- lea srcq, [srcq+src_strideq*4]
- paddd m0, m2
- dec n_rowsd
- jg .loop
-
- movhlps m1, m0
- paddd m0, m1
- movd eax, m0
- RET
-%endmacro
-
-INIT_XMM sse2
-SAD8XN 16 ; sad8x16_sse2
-SAD8XN 8 ; sad8x8_sse2
-
-; unsigned int vp9_sad4x4_sse(uint8_t *src, int src_stride,
-; uint8_t *ref, int ref_stride);
-INIT_MMX sse
-cglobal sad4x4, 4, 4, 8, src, src_stride, ref, ref_stride
- movsxdifnidn src_strideq, src_strided
- movsxdifnidn ref_strideq, ref_strided
- movd m0, [refq]
- movd m1, [refq+ref_strideq]
- movd m2, [srcq]
- movd m3, [srcq+src_strideq]
- lea refq, [refq+ref_strideq*2]
- lea srcq, [srcq+src_strideq*2]
- movd m4, [refq]
- movd m5, [refq+ref_strideq]
- movd m6, [srcq]
- movd m7, [srcq+src_strideq]
- punpckldq m0, m1
- punpckldq m2, m3
- punpckldq m4, m5
- punpckldq m6, m7
- psadbw m0, m2
- psadbw m4, m6
- paddd m0, m4
- movd eax, m0
- RET
--- a/vp9/encoder/x86/vp9_sad_sse3.asm
+++ b/vp9/encoder/x86/vp9_sad_sse3.asm
@@ -376,64 +376,3 @@
movd [rcx+8], mm7
STACK_FRAME_DESTROY_X3
-
-;void vp9_copy32xn_sse3(
-; unsigned char *src_ptr,
-; int src_stride,
-; unsigned char *dst_ptr,
-; int dst_stride,
-; int height);
-global sym(vp9_copy32xn_sse3) PRIVATE
-sym(vp9_copy32xn_sse3):
-
- STACK_FRAME_CREATE_X3
-
-.block_copy_sse3_loopx4:
- lea end_ptr, [src_ptr+src_stride*2]
-
- movdqu xmm0, XMMWORD PTR [src_ptr]
- movdqu xmm1, XMMWORD PTR [src_ptr + 16]
- movdqu xmm2, XMMWORD PTR [src_ptr + src_stride]
- movdqu xmm3, XMMWORD PTR [src_ptr + src_stride + 16]
- movdqu xmm4, XMMWORD PTR [end_ptr]
- movdqu xmm5, XMMWORD PTR [end_ptr + 16]
- movdqu xmm6, XMMWORD PTR [end_ptr + src_stride]
- movdqu xmm7, XMMWORD PTR [end_ptr + src_stride + 16]
-
- lea src_ptr, [src_ptr+src_stride*4]
-
- lea end_ptr, [ref_ptr+ref_stride*2]
-
- movdqa XMMWORD PTR [ref_ptr], xmm0
- movdqa XMMWORD PTR [ref_ptr + 16], xmm1
- movdqa XMMWORD PTR [ref_ptr + ref_stride], xmm2
- movdqa XMMWORD PTR [ref_ptr + ref_stride + 16], xmm3
- movdqa XMMWORD PTR [end_ptr], xmm4
- movdqa XMMWORD PTR [end_ptr + 16], xmm5
- movdqa XMMWORD PTR [end_ptr + ref_stride], xmm6
- movdqa XMMWORD PTR [end_ptr + ref_stride + 16], xmm7
-
- lea ref_ptr, [ref_ptr+ref_stride*4]
-
- sub height, 4
- cmp height, 4
- jge .block_copy_sse3_loopx4
-
- ;Check to see if there are more rows that need to be copied.
- cmp height, 0
- je .copy_is_done
-
-.block_copy_sse3_loop:
- movdqu xmm0, XMMWORD PTR [src_ptr]
- movdqu xmm1, XMMWORD PTR [src_ptr + 16]
- lea src_ptr, [src_ptr+src_stride]
-
- movdqa XMMWORD PTR [ref_ptr], xmm0
- movdqa XMMWORD PTR [ref_ptr + 16], xmm1
- lea ref_ptr, [ref_ptr+ref_stride]
-
- sub height, 1
- jne .block_copy_sse3_loop
-
-.copy_is_done:
- STACK_FRAME_DESTROY_X3
--- a/vp9/vp9cx.mk
+++ b/vp9/vp9cx.mk
@@ -94,8 +94,7 @@
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_variance_sse2.c
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_variance_impl_sse2.asm
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_sad_sse2.asm
-VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_sad_sse2_yasm.asm
-VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_sad4d_sse2_yasm.asm
+VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_sad4d_sse2.asm
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_fwalsh_sse2.asm
#VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_quantize_sse2.asm
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_subtract_sse2.asm
--