shithub: libvpx

Download patch

ref: 1309679e96f82ef32f5d56d635342557b4ff7945
parent: 5957b2b514f12fcb9549f3a97ec82dbab34f1ad7
parent: f60f6db7162e9ce15e6adbcab29851a2fa6b0033
author: Yaowu Xu <yaowu@google.com>
date: Mon Mar 4 08:13:29 EST 2013

Merge "Rename quantize_sse2.c to quantize_sse2_intrinsics.c to avoid collision." into experimental

--- a/vp8/encoder/x86/quantize_sse2.c
+++ /dev/null
@@ -1,103 +1,0 @@
-/*
- *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include "vp8/common/blockd.h"
-#include "vp8/common/entropy.h"
-#include "vp8/encoder/block.h"
-
-#include <mmintrin.h> //MMX
-#include <xmmintrin.h> //SSE
-#include <emmintrin.h> //SSE2
-
-void vp8_fast_quantize_b_sse2(BLOCK *b, BLOCKD *d)
-{
-  __m128i z0 = _mm_load_si128((__m128i *)(b->coeff));
-  __m128i z1 = _mm_load_si128((__m128i *)(b->coeff + 8));
-  __m128i round0 = _mm_load_si128((__m128i *)(b->round));
-  __m128i round1 = _mm_load_si128((__m128i *)(b->round + 8));
-  __m128i quant_fast0 = _mm_load_si128((__m128i *)(b->quant_fast));
-  __m128i quant_fast1 = _mm_load_si128((__m128i *)(b->quant_fast + 8));
-  __m128i dequant0 = _mm_load_si128((__m128i *)(d->dequant));
-  __m128i dequant1 = _mm_load_si128((__m128i *)(d->dequant + 8));
-  __m128i inv_zig_zag0 = _mm_load_si128((const __m128i *)(vp8_default_inv_zig_zag));
-  __m128i inv_zig_zag1 = _mm_load_si128((const __m128i *)(vp8_default_inv_zig_zag + 8));
-
-  __m128i sz0, sz1, x0, x1, y0, y1, xdq0, xdq1, zeros, ones;
-
-  /* sign of z: z >> 15 */
-  sz0 = _mm_srai_epi16(z0, 15);
-  sz1 = _mm_srai_epi16(z1, 15);
-
-  /* x = abs(z): (z ^ sz) - sz */
-  x0 = _mm_xor_si128(z0, sz0);
-  x1 = _mm_xor_si128(z1, sz1);
-  x0 = _mm_sub_epi16(x0, sz0);
-  x1 = _mm_sub_epi16(x1, sz1);
-
-  /* x += round */
-  x0 = _mm_add_epi16(x0, round0);
-  x1 = _mm_add_epi16(x1, round1);
-
-  /* y = (x * quant) >> 16 */
-  y0 = _mm_mulhi_epi16(x0, quant_fast0);
-  y1 = _mm_mulhi_epi16(x1, quant_fast1);
-
-  /* x = abs(y) = (y ^ sz) - sz */
-  y0 = _mm_xor_si128(y0, sz0);
-  y1 = _mm_xor_si128(y1, sz1);
-  x0 = _mm_sub_epi16(y0, sz0);
-  x1 = _mm_sub_epi16(y1, sz1);
-
-  /* qcoeff = x */
-  _mm_store_si128((__m128i *)(d->qcoeff), x0);
-  _mm_store_si128((__m128i *)(d->qcoeff + 8), x1);
-
-  /* x * dequant */
-  xdq0 = _mm_mullo_epi16(x0, dequant0);
-  xdq1 = _mm_mullo_epi16(x1, dequant1);
-
-  /* dqcoeff = x * dequant */
-  _mm_store_si128((__m128i *)(d->dqcoeff), xdq0);
-  _mm_store_si128((__m128i *)(d->dqcoeff + 8), xdq1);
-
-  /* build a mask for the zig zag */
-  zeros = _mm_setzero_si128();
-
-  x0 = _mm_cmpeq_epi16(x0, zeros);
-  x1 = _mm_cmpeq_epi16(x1, zeros);
-
-  ones = _mm_cmpeq_epi16(zeros, zeros);
-
-  x0 = _mm_xor_si128(x0, ones);
-  x1 = _mm_xor_si128(x1, ones);
-
-  x0 = _mm_and_si128(x0, inv_zig_zag0);
-  x1 = _mm_and_si128(x1, inv_zig_zag1);
-
-  x0 = _mm_max_epi16(x0, x1);
-
-  /* now down to 8 */
-  x1 = _mm_shuffle_epi32(x0, 0xE); // 0b00001110
-
-  x0 = _mm_max_epi16(x0, x1);
-
-  /* only 4 left */
-  x1 = _mm_shufflelo_epi16(x0, 0xE); // 0b00001110
-
-  x0 = _mm_max_epi16(x0, x1);
-
-  /* okay, just 2! */
-  x1 = _mm_shufflelo_epi16(x0, 0x1); // 0b00000001
-
-  x0 = _mm_max_epi16(x0, x1);
-
-  *d->eob = 0xFF & _mm_cvtsi128_si32(x0);
-}
--- /dev/null
+++ b/vp8/encoder/x86/quantize_sse2_intrinsics.c
@@ -1,0 +1,103 @@
+/*
+ *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "vp8/common/blockd.h"
+#include "vp8/common/entropy.h"
+#include "vp8/encoder/block.h"
+
+#include <mmintrin.h> //MMX
+#include <xmmintrin.h> //SSE
+#include <emmintrin.h> //SSE2
+
+void vp8_fast_quantize_b_sse2(BLOCK *b, BLOCKD *d)
+{
+  __m128i z0 = _mm_load_si128((__m128i *)(b->coeff));
+  __m128i z1 = _mm_load_si128((__m128i *)(b->coeff + 8));
+  __m128i round0 = _mm_load_si128((__m128i *)(b->round));
+  __m128i round1 = _mm_load_si128((__m128i *)(b->round + 8));
+  __m128i quant_fast0 = _mm_load_si128((__m128i *)(b->quant_fast));
+  __m128i quant_fast1 = _mm_load_si128((__m128i *)(b->quant_fast + 8));
+  __m128i dequant0 = _mm_load_si128((__m128i *)(d->dequant));
+  __m128i dequant1 = _mm_load_si128((__m128i *)(d->dequant + 8));
+  __m128i inv_zig_zag0 = _mm_load_si128((const __m128i *)(vp8_default_inv_zig_zag));
+  __m128i inv_zig_zag1 = _mm_load_si128((const __m128i *)(vp8_default_inv_zig_zag + 8));
+
+  __m128i sz0, sz1, x0, x1, y0, y1, xdq0, xdq1, zeros, ones;
+
+  /* sign of z: z >> 15 */
+  sz0 = _mm_srai_epi16(z0, 15);
+  sz1 = _mm_srai_epi16(z1, 15);
+
+  /* x = abs(z): (z ^ sz) - sz */
+  x0 = _mm_xor_si128(z0, sz0);
+  x1 = _mm_xor_si128(z1, sz1);
+  x0 = _mm_sub_epi16(x0, sz0);
+  x1 = _mm_sub_epi16(x1, sz1);
+
+  /* x += round */
+  x0 = _mm_add_epi16(x0, round0);
+  x1 = _mm_add_epi16(x1, round1);
+
+  /* y = (x * quant) >> 16 */
+  y0 = _mm_mulhi_epi16(x0, quant_fast0);
+  y1 = _mm_mulhi_epi16(x1, quant_fast1);
+
+  /* x = abs(y) = (y ^ sz) - sz */
+  y0 = _mm_xor_si128(y0, sz0);
+  y1 = _mm_xor_si128(y1, sz1);
+  x0 = _mm_sub_epi16(y0, sz0);
+  x1 = _mm_sub_epi16(y1, sz1);
+
+  /* qcoeff = x */
+  _mm_store_si128((__m128i *)(d->qcoeff), x0);
+  _mm_store_si128((__m128i *)(d->qcoeff + 8), x1);
+
+  /* x * dequant */
+  xdq0 = _mm_mullo_epi16(x0, dequant0);
+  xdq1 = _mm_mullo_epi16(x1, dequant1);
+
+  /* dqcoeff = x * dequant */
+  _mm_store_si128((__m128i *)(d->dqcoeff), xdq0);
+  _mm_store_si128((__m128i *)(d->dqcoeff + 8), xdq1);
+
+  /* build a mask for the zig zag */
+  zeros = _mm_setzero_si128();
+
+  x0 = _mm_cmpeq_epi16(x0, zeros);
+  x1 = _mm_cmpeq_epi16(x1, zeros);
+
+  ones = _mm_cmpeq_epi16(zeros, zeros);
+
+  x0 = _mm_xor_si128(x0, ones);
+  x1 = _mm_xor_si128(x1, ones);
+
+  x0 = _mm_and_si128(x0, inv_zig_zag0);
+  x1 = _mm_and_si128(x1, inv_zig_zag1);
+
+  x0 = _mm_max_epi16(x0, x1);
+
+  /* now down to 8 */
+  x1 = _mm_shuffle_epi32(x0, 0xE); // 0b00001110
+
+  x0 = _mm_max_epi16(x0, x1);
+
+  /* only 4 left */
+  x1 = _mm_shufflelo_epi16(x0, 0xE); // 0b00001110
+
+  x0 = _mm_max_epi16(x0, x1);
+
+  /* okay, just 2! */
+  x1 = _mm_shufflelo_epi16(x0, 0x1); // 0b00000001
+
+  x0 = _mm_max_epi16(x0, x1);
+
+  *d->eob = 0xFF & _mm_cvtsi128_si32(x0);
+}
--- a/vp8/vp8cx.mk
+++ b/vp8/vp8cx.mk
@@ -89,13 +89,13 @@
 VP8_CX_SRCS-$(HAVE_MMX) += encoder/x86/vp8_enc_stubs_mmx.c
 VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/dct_sse2.asm
 VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/fwalsh_sse2.asm
-VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/quantize_sse2.c
+VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/quantize_sse2_intrinsics.c
 VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/quantize_sse2.asm
 
 # TODO(johann) make this generic
 ifeq ($(HAVE_SSE2),yes)
-vp8/encoder/x86/quantize_sse2.c.o: CFLAGS += -msse2
-vp8/encoder/x86/quantize_sse2.c.d: CFLAGS += -msse2
+vp8/encoder/x86/quantize_sse2_intrinsics.c.o: CFLAGS += -msse2
+vp8/encoder/x86/quantize_sse2_intrinsics.c.d: CFLAGS += -msse2
 endif
 
 ifeq ($(CONFIG_TEMPORAL_DENOISING),yes)