ref: c8f6ed77b95bfa376008e54a24bdce5485bba5ca
parent: 3a6a81fc9adace4e31522215ab9baa2bdc7d0079
parent: 291033032ed1abf4f43ad52165caea31a35fb33d
author: James Bankoski <jimbankoski@google.com>
date: Wed Jun 29 19:20:25 EDT 2016
Merge "Revert "libyuv: update to b8ddb5a2""
--- a/third_party/libyuv/README.libvpx
+++ b/third_party/libyuv/README.libvpx
@@ -1,6 +1,6 @@
Name: libyuv
URL: http://code.google.com/p/libyuv/
-Version: b8ddb5a2
+Version: 1456
License: BSD
License File: LICENSE
@@ -13,13 +13,3 @@
in order to encode multiple resolution bit streams.
Local Modifications:
-
-Remove files unnecessary to libvpx build.
-
-rm -rf .gitignore .gn AUTHORS Android.mk BUILD.gn CMakeLists.txt DEPS LICENSE \
- LICENSE_THIRD_PARTY OWNERS PATENTS PRESUBMIT.py README.chromium README.md \
- all.gyp build_overrides/ chromium/ codereview.settings docs/ \
- download_vs_toolchain.py gyp_libyuv gyp_libyuv.py include/libyuv.h \
- include/libyuv/compare_row.h libyuv.gyp libyuv.gypi libyuv_nacl.gyp \
- libyuv_test.gyp linux.mk public.mk setup_links.py sync_chromium.py \
- third_party/ tools/ unit_test/ util/ winarm.mk
--- a/third_party/libyuv/include/libyuv/convert.h
+++ b/third_party/libyuv/include/libyuv/convert.h
@@ -12,8 +12,10 @@
#define INCLUDE_LIBYUV_CONVERT_H_
#include "libyuv/basic_types.h"
-
-#include "libyuv/rotate.h" // For enum RotationMode.
+// TODO(fbarchard): Remove the following headers includes.
+#include "libyuv/convert_from.h"
+#include "libyuv/planar_functions.h"
+#include "libyuv/rotate.h"
#ifdef __cplusplus
namespace libyuv {
--- a/third_party/libyuv/include/libyuv/convert_argb.h
+++ b/third_party/libyuv/include/libyuv/convert_argb.h
@@ -12,9 +12,11 @@
#define INCLUDE_LIBYUV_CONVERT_ARGB_H_
#include "libyuv/basic_types.h"
+// TODO(fbarchard): Remove the following headers includes
+#include "libyuv/convert_from.h"
+#include "libyuv/planar_functions.h"
+#include "libyuv/rotate.h"
-#include "libyuv/rotate.h" // For enum RotationMode.
-
// TODO(fbarchard): This set of functions should exactly match convert.h
// TODO(fbarchard): Add tests. Create random content of right size and convert
// with C vs Opt and or to I420 and compare.
@@ -58,22 +60,6 @@
uint8* dst_argb, int dst_stride_argb,
int width, int height);
-// Convert J444 to ARGB.
-LIBYUV_API
-int J444ToARGB(const uint8* src_y, int src_stride_y,
- const uint8* src_u, int src_stride_u,
- const uint8* src_v, int src_stride_v,
- uint8* dst_argb, int dst_stride_argb,
- int width, int height);
-
-// Convert I444 to ABGR.
-LIBYUV_API
-int I444ToABGR(const uint8* src_y, int src_stride_y,
- const uint8* src_u, int src_stride_u,
- const uint8* src_v, int src_stride_v,
- uint8* dst_abgr, int dst_stride_abgr,
- int width, int height);
-
// Convert I411 to ARGB.
LIBYUV_API
int I411ToARGB(const uint8* src_y, int src_stride_y,
@@ -82,24 +68,6 @@
uint8* dst_argb, int dst_stride_argb,
int width, int height);
-// Convert I420 with Alpha to preattenuated ARGB.
-LIBYUV_API
-int I420AlphaToARGB(const uint8* src_y, int src_stride_y,
- const uint8* src_u, int src_stride_u,
- const uint8* src_v, int src_stride_v,
- const uint8* src_a, int src_stride_a,
- uint8* dst_argb, int dst_stride_argb,
- int width, int height, int attenuate);
-
-// Convert I420 with Alpha to preattenuated ABGR.
-LIBYUV_API
-int I420AlphaToABGR(const uint8* src_y, int src_stride_y,
- const uint8* src_u, int src_stride_u,
- const uint8* src_v, int src_stride_v,
- const uint8* src_a, int src_stride_a,
- uint8* dst_abgr, int dst_stride_abgr,
- int width, int height, int attenuate);
-
// Convert I400 (grey) to ARGB. Reverse of ARGBToI400.
LIBYUV_API
int I400ToARGB(const uint8* src_y, int src_stride_y,
@@ -161,54 +129,6 @@
const uint8* src_u, int src_stride_u,
const uint8* src_v, int src_stride_v,
uint8* dst_argb, int dst_stride_argb,
- int width, int height);
-
-// Convert J420 to ABGR.
-LIBYUV_API
-int J420ToABGR(const uint8* src_y, int src_stride_y,
- const uint8* src_u, int src_stride_u,
- const uint8* src_v, int src_stride_v,
- uint8* dst_abgr, int dst_stride_abgr,
- int width, int height);
-
-// Convert J422 to ABGR.
-LIBYUV_API
-int J422ToABGR(const uint8* src_y, int src_stride_y,
- const uint8* src_u, int src_stride_u,
- const uint8* src_v, int src_stride_v,
- uint8* dst_abgr, int dst_stride_abgr,
- int width, int height);
-
-// Convert H420 to ARGB.
-LIBYUV_API
-int H420ToARGB(const uint8* src_y, int src_stride_y,
- const uint8* src_u, int src_stride_u,
- const uint8* src_v, int src_stride_v,
- uint8* dst_argb, int dst_stride_argb,
- int width, int height);
-
-// Convert H422 to ARGB.
-LIBYUV_API
-int H422ToARGB(const uint8* src_y, int src_stride_y,
- const uint8* src_u, int src_stride_u,
- const uint8* src_v, int src_stride_v,
- uint8* dst_argb, int dst_stride_argb,
- int width, int height);
-
-// Convert H420 to ABGR.
-LIBYUV_API
-int H420ToABGR(const uint8* src_y, int src_stride_y,
- const uint8* src_u, int src_stride_u,
- const uint8* src_v, int src_stride_v,
- uint8* dst_abgr, int dst_stride_abgr,
- int width, int height);
-
-// Convert H422 to ABGR.
-LIBYUV_API
-int H422ToABGR(const uint8* src_y, int src_stride_y,
- const uint8* src_u, int src_stride_u,
- const uint8* src_v, int src_stride_v,
- uint8* dst_abgr, int dst_stride_abgr,
int width, int height);
// BGRA little endian (argb in memory) to ARGB.
--- a/third_party/libyuv/include/libyuv/convert_from.h
+++ b/third_party/libyuv/include/libyuv/convert_from.h
@@ -56,6 +56,8 @@
uint8* dst_y, int dst_stride_y,
int width, int height);
+// TODO(fbarchard): I420ToM420
+
LIBYUV_API
int I420ToNV12(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u,
--- a/third_party/libyuv/include/libyuv/cpu_id.h
+++ b/third_party/libyuv/include/libyuv/cpu_id.h
@@ -18,8 +18,9 @@
extern "C" {
#endif
+// TODO(fbarchard): Consider overlapping bits for different architectures.
// Internal flag to indicate cpuid requires initialization.
-static const int kCpuInitialized = 0x1;
+#define kCpuInit 0x1
// These flags are only valid on ARM processors.
static const int kCpuHasARM = 0x2;
@@ -36,12 +37,12 @@
static const int kCpuHasAVX2 = 0x400;
static const int kCpuHasERMS = 0x800;
static const int kCpuHasFMA3 = 0x1000;
-static const int kCpuHasAVX3 = 0x2000;
// 0x2000, 0x4000, 0x8000 reserved for future X86 flags.
// These flags are only valid on MIPS processors.
static const int kCpuHasMIPS = 0x10000;
-static const int kCpuHasDSPR2 = 0x20000;
+static const int kCpuHasMIPS_DSP = 0x20000;
+static const int kCpuHasMIPS_DSPR2 = 0x40000;
// Internal function used to auto-init.
LIBYUV_API
@@ -56,13 +57,13 @@
// returns non-zero if instruction set is detected
static __inline int TestCpuFlag(int test_flag) {
LIBYUV_API extern int cpu_info_;
- return (!cpu_info_ ? InitCpuFlags() : cpu_info_) & test_flag;
+ return (cpu_info_ == kCpuInit ? InitCpuFlags() : cpu_info_) & test_flag;
}
// For testing, allow CPU flags to be disabled.
// ie MaskCpuFlags(~kCpuHasSSSE3) to disable SSSE3.
// MaskCpuFlags(-1) to enable all cpu specific optimizations.
-// MaskCpuFlags(1) to disable all cpu specific optimizations.
+// MaskCpuFlags(0) to disable all cpu specific optimizations.
LIBYUV_API
void MaskCpuFlags(int enable_flags);
--- a/third_party/libyuv/include/libyuv/planar_functions.h
+++ b/third_party/libyuv/include/libyuv/planar_functions.h
@@ -145,6 +145,13 @@
uint8* dst_rgb565, int dst_stride_rgb565,
int width, int height);
+// Convert NV21 to RGB565.
+LIBYUV_API
+int NV21ToRGB565(const uint8* src_y, int src_stride_y,
+ const uint8* src_uv, int src_stride_uv,
+ uint8* dst_rgb565, int dst_stride_rgb565,
+ int width, int height);
+
// I422ToARGB is in convert_argb.h
// Convert I422 to BGRA.
LIBYUV_API
@@ -170,14 +177,6 @@
uint8* dst_rgba, int dst_stride_rgba,
int width, int height);
-// Alias
-#define RGB24ToRAW RAWToRGB24
-
-LIBYUV_API
-int RAWToRGB24(const uint8* src_raw, int src_stride_raw,
- uint8* dst_rgb24, int dst_stride_rgb24,
- int width, int height);
-
// Draw a rectangle into I420.
LIBYUV_API
int I420Rect(uint8* dst_y, int dst_stride_y,
@@ -282,20 +281,14 @@
uint8* dst_argb, int dst_stride_argb,
int width, int height);
-// Copy Alpha channel of ARGB to alpha of ARGB.
+// Copy ARGB to ARGB.
LIBYUV_API
int ARGBCopyAlpha(const uint8* src_argb, int src_stride_argb,
uint8* dst_argb, int dst_stride_argb,
int width, int height);
-// Extract the alpha channel from ARGB.
+// Copy ARGB to ARGB.
LIBYUV_API
-int ARGBExtractAlpha(const uint8* src_argb, int src_stride_argb,
- uint8* dst_a, int dst_stride_a,
- int width, int height);
-
-// Copy Y channel to Alpha of ARGB.
-LIBYUV_API
int ARGBCopyYToAlpha(const uint8* src_y, int src_stride_y,
uint8* dst_argb, int dst_stride_argb,
int width, int height);
@@ -308,7 +301,6 @@
ARGBBlendRow GetARGBBlend();
// Alpha Blend ARGB images and store to destination.
-// Source is pre-multiplied by alpha using ARGBAttenuate.
// Alpha of destination is set to 255.
LIBYUV_API
int ARGBBlend(const uint8* src_argb0, int src_stride_argb0,
@@ -316,31 +308,6 @@
uint8* dst_argb, int dst_stride_argb,
int width, int height);
-// Alpha Blend plane and store to destination.
-// Source is not pre-multiplied by alpha.
-LIBYUV_API
-int BlendPlane(const uint8* src_y0, int src_stride_y0,
- const uint8* src_y1, int src_stride_y1,
- const uint8* alpha, int alpha_stride,
- uint8* dst_y, int dst_stride_y,
- int width, int height);
-
-// Alpha Blend YUV images and store to destination.
-// Source is not pre-multiplied by alpha.
-// Alpha is full width x height and subsampled to half size to apply to UV.
-LIBYUV_API
-int I420Blend(const uint8* src_y0, int src_stride_y0,
- const uint8* src_u0, int src_stride_u0,
- const uint8* src_v0, int src_stride_v0,
- const uint8* src_y1, int src_stride_y1,
- const uint8* src_u1, int src_stride_u1,
- const uint8* src_v1, int src_stride_v1,
- const uint8* alpha, int alpha_stride,
- uint8* dst_y, int dst_stride_y,
- uint8* dst_u, int dst_stride_u,
- uint8* dst_v, int dst_stride_v,
- int width, int height);
-
// Multiply ARGB image by ARGB image. Shifted down by 8. Saturates to 255.
LIBYUV_API
int ARGBMultiply(const uint8* src_argb0, int src_stride_argb0,
@@ -390,6 +357,12 @@
uint8* dst_argb, int dst_stride_argb,
int width, int height);
+// Convert MJPG to ARGB.
+LIBYUV_API
+int MJPGToARGB(const uint8* sample, size_t sample_size,
+ uint8* argb, int argb_stride,
+ int w, int h, int dw, int dh);
+
// Internal function - do not call directly.
// Computes table of cumulative sum for image where the value is the sum
// of all values above and to the left of the entry. Used by ARGBBlur.
@@ -416,48 +389,21 @@
uint8* dst_argb, int dst_stride_argb,
int width, int height, uint32 value);
-// Interpolate between two images using specified amount of interpolation
+// Interpolate between two ARGB images using specified amount of interpolation
// (0 to 255) and store to destination.
-// 'interpolation' is specified as 8 bit fraction where 0 means 100% src0
-// and 255 means 1% src0 and 99% src1.
+// 'interpolation' is specified as 8 bit fraction where 0 means 100% src_argb0
+// and 255 means 1% src_argb0 and 99% src_argb1.
+// Internally uses ARGBScale bilinear filtering.
+// Caveat: This function will write up to 16 bytes beyond the end of dst_argb.
LIBYUV_API
-int InterpolatePlane(const uint8* src0, int src_stride0,
- const uint8* src1, int src_stride1,
- uint8* dst, int dst_stride,
- int width, int height, int interpolation);
-
-// Interpolate between two ARGB images using specified amount of interpolation
-// Internally calls InterpolatePlane with width * 4 (bpp).
-LIBYUV_API
int ARGBInterpolate(const uint8* src_argb0, int src_stride_argb0,
const uint8* src_argb1, int src_stride_argb1,
uint8* dst_argb, int dst_stride_argb,
int width, int height, int interpolation);
-// Interpolate between two YUV images using specified amount of interpolation
-// Internally calls InterpolatePlane on each plane where the U and V planes
-// are half width and half height.
-LIBYUV_API
-int I420Interpolate(const uint8* src0_y, int src0_stride_y,
- const uint8* src0_u, int src0_stride_u,
- const uint8* src0_v, int src0_stride_v,
- const uint8* src1_y, int src1_stride_y,
- const uint8* src1_u, int src1_stride_u,
- const uint8* src1_v, int src1_stride_v,
- uint8* dst_y, int dst_stride_y,
- uint8* dst_u, int dst_stride_u,
- uint8* dst_v, int dst_stride_v,
- int width, int height, int interpolation);
-
#if defined(__pnacl__) || defined(__CLR_VER) || \
(defined(__i386__) && !defined(__SSE2__))
#define LIBYUV_DISABLE_X86
-#endif
-// MemorySanitizer does not support assembly code yet. http://crbug.com/344505
-#if defined(__has_feature)
-#if __has_feature(memory_sanitizer)
-#define LIBYUV_DISABLE_X86
-#endif
#endif
// The following are available on all x86 platforms:
#if !defined(LIBYUV_DISABLE_X86) && \
--- a/third_party/libyuv/include/libyuv/rotate_row.h
+++ b/third_party/libyuv/include/libyuv/rotate_row.h
@@ -22,24 +22,53 @@
(defined(__i386__) && !defined(__SSE2__))
#define LIBYUV_DISABLE_X86
#endif
-// MemorySanitizer does not support assembly code yet. http://crbug.com/344505
-#if defined(__has_feature)
-#if __has_feature(memory_sanitizer)
-#define LIBYUV_DISABLE_X86
+
+// Visual C 2012 required for AVX2.
+#if defined(_M_IX86) && !defined(__clang__) && \
+ defined(_MSC_VER) && _MSC_VER >= 1700
+#define VISUALC_HAS_AVX2 1
+#endif // VisualStudio >= 2012
+
+// TODO(fbarchard): switch to standard form of inline; fails on clangcl.
+#if !defined(LIBYUV_DISABLE_X86) && \
+ (defined(_M_IX86) || defined(__x86_64__) || defined(__i386__))
+#if defined(__APPLE__) && defined(__i386__)
+#define DECLARE_FUNCTION(name) \
+ ".text \n" \
+ ".private_extern _" #name " \n" \
+ ".align 4,0x90 \n" \
+"_" #name ": \n"
+#elif defined(__MINGW32__) || defined(__CYGWIN__) && defined(__i386__)
+#define DECLARE_FUNCTION(name) \
+ ".text \n" \
+ ".align 4,0x90 \n" \
+"_" #name ": \n"
+#else
+#define DECLARE_FUNCTION(name) \
+ ".text \n" \
+ ".align 4,0x90 \n" \
+#name ": \n"
#endif
#endif
-// The following are available for Visual C and clangcl 32 bit:
-#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86)
+
+// The following are available for Visual C:
+#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && \
+ defined(_MSC_VER) && !defined(__clang__)
#define HAS_TRANSPOSEWX8_SSSE3
#define HAS_TRANSPOSEUVWX8_SSE2
#endif
-// The following are available for GCC 32 or 64 bit but not NaCL for 64 bit:
+// The following are available for GCC but not NaCL:
#if !defined(LIBYUV_DISABLE_X86) && \
(defined(__i386__) || (defined(__x86_64__) && !defined(__native_client__)))
#define HAS_TRANSPOSEWX8_SSSE3
#endif
+// The following are available for 32 bit GCC:
+#if !defined(LIBYUV_DISABLE_X86) && defined(__i386__) && !defined(__clang__)
+#define HAS_TRANSPOSEUVWX8_SSE2
+#endif
+
// The following are available for 64 bit GCC but not NaCL:
#if !defined(LIBYUV_DISABLE_X86) && !defined(__native_client__) && \
defined(__x86_64__)
@@ -56,8 +85,8 @@
#if !defined(LIBYUV_DISABLE_MIPS) && !defined(__native_client__) && \
defined(__mips__) && \
defined(__mips_dsp) && (__mips_dsp_rev >= 2)
-#define HAS_TRANSPOSEWX8_DSPR2
-#define HAS_TRANSPOSEUVWX8_DSPR2
+#define HAS_TRANSPOSEWX8_MIPS_DSPR2
+#define HAS_TRANSPOSEUVWx8_MIPS_DSPR2
#endif // defined(__mips__)
void TransposeWxH_C(const uint8* src, int src_stride,
@@ -71,9 +100,7 @@
uint8* dst, int dst_stride, int width);
void TransposeWx8_Fast_SSSE3(const uint8* src, int src_stride,
uint8* dst, int dst_stride, int width);
-void TransposeWx8_DSPR2(const uint8* src, int src_stride,
- uint8* dst, int dst_stride, int width);
-void TransposeWx8_Fast_DSPR2(const uint8* src, int src_stride,
+void TransposeWx8_MIPS_DSPR2(const uint8* src, int src_stride,
uint8* dst, int dst_stride, int width);
void TransposeWx8_Any_NEON(const uint8* src, int src_stride,
@@ -82,8 +109,8 @@
uint8* dst, int dst_stride, int width);
void TransposeWx8_Fast_Any_SSSE3(const uint8* src, int src_stride,
uint8* dst, int dst_stride, int width);
-void TransposeWx8_Any_DSPR2(const uint8* src, int src_stride,
- uint8* dst, int dst_stride, int width);
+void TransposeWx8_Any_MIPS_DSPR2(const uint8* src, int src_stride,
+ uint8* dst, int dst_stride, int width);
void TransposeUVWxH_C(const uint8* src, int src_stride,
uint8* dst_a, int dst_stride_a,
@@ -99,19 +126,9 @@
void TransposeUVWx8_NEON(const uint8* src, int src_stride,
uint8* dst_a, int dst_stride_a,
uint8* dst_b, int dst_stride_b, int width);
-void TransposeUVWx8_DSPR2(const uint8* src, int src_stride,
- uint8* dst_a, int dst_stride_a,
- uint8* dst_b, int dst_stride_b, int width);
-
-void TransposeUVWx8_Any_SSE2(const uint8* src, int src_stride,
- uint8* dst_a, int dst_stride_a,
- uint8* dst_b, int dst_stride_b, int width);
-void TransposeUVWx8_Any_NEON(const uint8* src, int src_stride,
- uint8* dst_a, int dst_stride_a,
- uint8* dst_b, int dst_stride_b, int width);
-void TransposeUVWx8_Any_DSPR2(const uint8* src, int src_stride,
- uint8* dst_a, int dst_stride_a,
- uint8* dst_b, int dst_stride_b, int width);
+void TransposeUVWx8_MIPS_DSPR2(const uint8* src, int src_stride,
+ uint8* dst_a, int dst_stride_a,
+ uint8* dst_b, int dst_stride_b, int width);
#ifdef __cplusplus
} // extern "C"
--- a/third_party/libyuv/include/libyuv/row.h
+++ b/third_party/libyuv/include/libyuv/row.h
@@ -41,12 +41,6 @@
(defined(__i386__) && !defined(__SSE2__))
#define LIBYUV_DISABLE_X86
#endif
-// MemorySanitizer does not support assembly code yet. http://crbug.com/344505
-#if defined(__has_feature)
-#if __has_feature(memory_sanitizer)
-#define LIBYUV_DISABLE_X86
-#endif
-#endif
// True if compiling for SSSE3 as a requirement.
#if defined(__SSSE3__) || (defined(_M_IX86_FP) && (_M_IX86_FP >= 3))
#define LIBYUV_SSSE3_ONLY
@@ -62,26 +56,6 @@
#endif // clang >= 3.5
#endif // __clang__
-// GCC >= 4.7.0 required for AVX2.
-#if defined(__GNUC__) && (defined(__x86_64__) || defined(__i386__))
-#if (__GNUC__ > 4) || (__GNUC__ == 4 && (__GNUC_MINOR__ >= 7))
-#define GCC_HAS_AVX2 1
-#endif // GNUC >= 4.7
-#endif // __GNUC__
-
-// clang >= 3.4.0 required for AVX2.
-#if defined(__clang__) && (defined(__x86_64__) || defined(__i386__))
-#if (__clang_major__ > 3) || (__clang_major__ == 3 && (__clang_minor__ >= 4))
-#define CLANG_HAS_AVX2 1
-#endif // clang >= 3.4
-#endif // __clang__
-
-// Visual C 2012 required for AVX2.
-#if defined(_M_IX86) && !defined(__clang__) && \
- defined(_MSC_VER) && _MSC_VER >= 1700
-#define VISUALC_HAS_AVX2 1
-#endif // VisualStudio >= 2012
-
// The following are available on all x86 platforms:
#if !defined(LIBYUV_DISABLE_X86) && \
(defined(_M_IX86) || defined(__x86_64__) || defined(__i386__))
@@ -97,23 +71,25 @@
#define HAS_ARGBTOARGB4444ROW_SSE2
#define HAS_ARGBTORAWROW_SSSE3
#define HAS_ARGBTORGB24ROW_SSSE3
-#define HAS_ARGBTORGB565DITHERROW_SSE2
#define HAS_ARGBTORGB565ROW_SSE2
+#define HAS_ARGBTOUV422ROW_SSSE3
#define HAS_ARGBTOUV444ROW_SSSE3
#define HAS_ARGBTOUVJROW_SSSE3
#define HAS_ARGBTOUVROW_SSSE3
#define HAS_ARGBTOYJROW_SSSE3
#define HAS_ARGBTOYROW_SSSE3
-#define HAS_ARGBEXTRACTALPHAROW_SSE2
#define HAS_BGRATOUVROW_SSSE3
#define HAS_BGRATOYROW_SSSE3
#define HAS_COPYROW_ERMS
#define HAS_COPYROW_SSE2
-#define HAS_H422TOARGBROW_SSSE3
#define HAS_I400TOARGBROW_SSE2
+#define HAS_I411TOARGBROW_SSSE3
+#define HAS_I422TOABGRROW_SSSE3
#define HAS_I422TOARGB1555ROW_SSSE3
#define HAS_I422TOARGB4444ROW_SSSE3
#define HAS_I422TOARGBROW_SSSE3
+#define HAS_I422TOBGRAROW_SSSE3
+#define HAS_I422TORAWROW_SSSE3
#define HAS_I422TORGB24ROW_SSSE3
#define HAS_I422TORGB565ROW_SSSE3
#define HAS_I422TORGBAROW_SSSE3
@@ -123,13 +99,15 @@
#define HAS_J400TOARGBROW_SSE2
#define HAS_J422TOARGBROW_SSSE3
#define HAS_MERGEUVROW_SSE2
+#define HAS_MIRRORROW_SSE2
#define HAS_MIRRORROW_SSSE3
+#define HAS_MIRRORROW_UV_SSSE3
#define HAS_MIRRORUVROW_SSSE3
#define HAS_NV12TOARGBROW_SSSE3
#define HAS_NV12TORGB565ROW_SSSE3
#define HAS_NV21TOARGBROW_SSSE3
+#define HAS_NV21TORGB565ROW_SSSE3
#define HAS_RAWTOARGBROW_SSSE3
-#define HAS_RAWTORGB24ROW_SSSE3
#define HAS_RAWTOYROW_SSSE3
#define HAS_RGB24TOARGBROW_SSSE3
#define HAS_RGB24TOYROW_SSSE3
@@ -167,9 +145,9 @@
#define HAS_ARGBSHADEROW_SSE2
#define HAS_ARGBSUBTRACTROW_SSE2
#define HAS_ARGBUNATTENUATEROW_SSE2
-#define HAS_BLENDPLANEROW_SSSE3
#define HAS_COMPUTECUMULATIVESUMROW_SSE2
#define HAS_CUMULATIVESUMTOAVERAGEROW_SSE2
+#define HAS_INTERPOLATEROW_SSE2
#define HAS_INTERPOLATEROW_SSSE3
#define HAS_RGBCOLORTABLEROW_X86
#define HAS_SOBELROW_SSE2
@@ -177,18 +155,54 @@
#define HAS_SOBELXROW_SSE2
#define HAS_SOBELXYROW_SSE2
#define HAS_SOBELYROW_SSE2
+#endif
-// The following functions fail on gcc/clang 32 bit with fpic and framepointer.
-// caveat: clangcl uses row_win.cc which works.
-#if defined(NDEBUG) || !(defined(_DEBUG) && defined(__i386__)) || \
- !defined(__i386__) || defined(_MSC_VER)
-// TODO(fbarchard): fix build error on x86 debug
-// https://code.google.com/p/libyuv/issues/detail?id=524
-#define HAS_I411TOARGBROW_SSSE3
-// TODO(fbarchard): fix build error on android_full_debug=1
-// https://code.google.com/p/libyuv/issues/detail?id=517
-#define HAS_I422ALPHATOARGBROW_SSSE3
+// The following are available on x64 Visual C and clangcl.
+#if !defined(LIBYUV_DISABLE_X86) && defined (_M_X64) && \
+ (!defined(__clang__) || defined(__SSSE3__))
+#define HAS_I422TOARGBROW_SSSE3
#endif
+
+// GCC >= 4.7.0 required for AVX2.
+#if defined(__GNUC__) && (defined(__x86_64__) || defined(__i386__))
+#if (__GNUC__ > 4) || (__GNUC__ == 4 && (__GNUC_MINOR__ >= 7))
+#define GCC_HAS_AVX2 1
+#endif // GNUC >= 4.7
+#endif // __GNUC__
+
+// clang >= 3.4.0 required for AVX2.
+#if defined(__clang__) && (defined(__x86_64__) || defined(__i386__))
+#if (__clang_major__ > 3) || (__clang_major__ == 3 && (__clang_minor__ >= 4))
+#define CLANG_HAS_AVX2 1
+#endif // clang >= 3.4
+#endif // __clang__
+
+// Visual C 2012 required for AVX2.
+#if defined(_M_IX86) && !defined(__clang__) && \
+ defined(_MSC_VER) && _MSC_VER >= 1700
+#define VISUALC_HAS_AVX2 1
+#endif // VisualStudio >= 2012
+
+// The following are available require VS2012. Port to GCC.
+#if !defined(LIBYUV_DISABLE_X86) && defined(VISUALC_HAS_AVX2)
+#define HAS_ARGB1555TOARGBROW_AVX2
+#define HAS_ARGB4444TOARGBROW_AVX2
+#define HAS_ARGBTOARGB1555ROW_AVX2
+#define HAS_ARGBTOARGB4444ROW_AVX2
+#define HAS_ARGBTORGB565DITHERROW_AVX2
+#define HAS_ARGBTORGB565DITHERROW_SSE2
+#define HAS_ARGBTORGB565ROW_AVX2
+#define HAS_I411TOARGBROW_AVX2
+#define HAS_I422TOARGB1555ROW_AVX2
+#define HAS_I422TOARGB4444ROW_AVX2
+#define HAS_I422TORGB565ROW_AVX2
+#define HAS_I444TOARGBROW_AVX2
+#define HAS_J400TOARGBROW_AVX2
+#define HAS_NV12TOARGBROW_AVX2
+#define HAS_NV12TORGB565ROW_AVX2
+#define HAS_NV21TOARGBROW_AVX2
+#define HAS_NV21TORGB565ROW_AVX2
+#define HAS_RGB565TOARGBROW_AVX2
#endif
// The following are available on all x86 platforms, but
@@ -201,34 +215,21 @@
#define HAS_ARGBMIRRORROW_AVX2
#define HAS_ARGBPOLYNOMIALROW_AVX2
#define HAS_ARGBSHUFFLEROW_AVX2
-#define HAS_ARGBTORGB565DITHERROW_AVX2
-#define HAS_ARGBTOUVJROW_AVX2
#define HAS_ARGBTOUVROW_AVX2
#define HAS_ARGBTOYJROW_AVX2
#define HAS_ARGBTOYROW_AVX2
#define HAS_COPYROW_AVX
-#define HAS_H422TOARGBROW_AVX2
#define HAS_I400TOARGBROW_AVX2
-#if !(defined(_DEBUG) && defined(__i386__))
-// TODO(fbarchard): fix build error on android_full_debug=1
-// https://code.google.com/p/libyuv/issues/detail?id=517
-#define HAS_I422ALPHATOARGBROW_AVX2
-#endif
-#define HAS_I411TOARGBROW_AVX2
-#define HAS_I422TOARGB1555ROW_AVX2
-#define HAS_I422TOARGB4444ROW_AVX2
+#define HAS_I422TOABGRROW_AVX2
#define HAS_I422TOARGBROW_AVX2
+#define HAS_I422TOBGRAROW_AVX2
+#define HAS_I422TORAWROW_AVX2
#define HAS_I422TORGB24ROW_AVX2
-#define HAS_I422TORGB565ROW_AVX2
#define HAS_I422TORGBAROW_AVX2
-#define HAS_I444TOARGBROW_AVX2
#define HAS_INTERPOLATEROW_AVX2
#define HAS_J422TOARGBROW_AVX2
#define HAS_MERGEUVROW_AVX2
#define HAS_MIRRORROW_AVX2
-#define HAS_NV12TOARGBROW_AVX2
-#define HAS_NV12TORGB565ROW_AVX2
-#define HAS_NV21TOARGBROW_AVX2
#define HAS_SPLITUVROW_AVX2
#define HAS_UYVYTOARGBROW_AVX2
#define HAS_UYVYTOUV422ROW_AVX2
@@ -245,29 +246,17 @@
#define HAS_ARGBMULTIPLYROW_AVX2
#define HAS_ARGBSUBTRACTROW_AVX2
#define HAS_ARGBUNATTENUATEROW_AVX2
-#define HAS_BLENDPLANEROW_AVX2
#endif
-// The following are available for AVX2 Visual C and clangcl 32 bit:
-// TODO(fbarchard): Port to gcc.
-#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && \
- (defined(VISUALC_HAS_AVX2) || defined(CLANG_HAS_AVX2))
-#define HAS_ARGB1555TOARGBROW_AVX2
-#define HAS_ARGB4444TOARGBROW_AVX2
-#define HAS_ARGBTOARGB1555ROW_AVX2
-#define HAS_ARGBTOARGB4444ROW_AVX2
-#define HAS_ARGBTORGB565ROW_AVX2
-#define HAS_J400TOARGBROW_AVX2
-#define HAS_RGB565TOARGBROW_AVX2
+// The following are disabled when SSSE3 is available:
+#if !defined(LIBYUV_DISABLE_X86) && \
+ (defined(_M_IX86) || defined(__x86_64__) || defined(__i386__)) && \
+ !defined(LIBYUV_SSSE3_ONLY)
+#define HAS_ARGBATTENUATEROW_SSE2
+#define HAS_ARGBBLENDROW_SSE2
+#define HAS_MIRRORROW_SSE2
#endif
-// The following are also available on x64 Visual C.
-#if !defined(LIBYUV_DISABLE_X86) && defined (_M_X64) && \
- (!defined(__clang__) || defined(__SSSE3__))
-#define HAS_I422ALPHATOARGBROW_SSSE3
-#define HAS_I422TOARGBROW_SSSE3
-#endif
-
// The following are available on Neon platforms:
#if !defined(LIBYUV_DISABLE_NEON) && \
(defined(__aarch64__) || defined(__ARM_NEON__) || defined(LIBYUV_NEON))
@@ -279,29 +268,29 @@
#define HAS_ARGB4444TOARGBROW_NEON
#define HAS_ARGB4444TOUVROW_NEON
#define HAS_ARGB4444TOYROW_NEON
-#define HAS_ARGBSETROW_NEON
#define HAS_ARGBTOARGB1555ROW_NEON
#define HAS_ARGBTOARGB4444ROW_NEON
#define HAS_ARGBTORAWROW_NEON
#define HAS_ARGBTORGB24ROW_NEON
-#define HAS_ARGBTORGB565DITHERROW_NEON
#define HAS_ARGBTORGB565ROW_NEON
#define HAS_ARGBTOUV411ROW_NEON
+#define HAS_ARGBTOUV422ROW_NEON
#define HAS_ARGBTOUV444ROW_NEON
#define HAS_ARGBTOUVJROW_NEON
#define HAS_ARGBTOUVROW_NEON
#define HAS_ARGBTOYJROW_NEON
#define HAS_ARGBTOYROW_NEON
-#define HAS_ARGBEXTRACTALPHAROW_NEON
#define HAS_BGRATOUVROW_NEON
#define HAS_BGRATOYROW_NEON
#define HAS_COPYROW_NEON
-#define HAS_I400TOARGBROW_NEON
+#define HAS_J400TOARGBROW_NEON
#define HAS_I411TOARGBROW_NEON
-#define HAS_I422ALPHATOARGBROW_NEON
+#define HAS_I422TOABGRROW_NEON
#define HAS_I422TOARGB1555ROW_NEON
#define HAS_I422TOARGB4444ROW_NEON
#define HAS_I422TOARGBROW_NEON
+#define HAS_I422TOBGRAROW_NEON
+#define HAS_I422TORAWROW_NEON
#define HAS_I422TORGB24ROW_NEON
#define HAS_I422TORGB565ROW_NEON
#define HAS_I422TORGBAROW_NEON
@@ -308,7 +297,6 @@
#define HAS_I422TOUYVYROW_NEON
#define HAS_I422TOYUY2ROW_NEON
#define HAS_I444TOARGBROW_NEON
-#define HAS_J400TOARGBROW_NEON
#define HAS_MERGEUVROW_NEON
#define HAS_MIRRORROW_NEON
#define HAS_MIRRORUVROW_NEON
@@ -315,8 +303,8 @@
#define HAS_NV12TOARGBROW_NEON
#define HAS_NV12TORGB565ROW_NEON
#define HAS_NV21TOARGBROW_NEON
+#define HAS_NV21TORGB565ROW_NEON
#define HAS_RAWTOARGBROW_NEON
-#define HAS_RAWTORGB24ROW_NEON
#define HAS_RAWTOUVROW_NEON
#define HAS_RAWTOYROW_NEON
#define HAS_RGB24TOARGBROW_NEON
@@ -328,21 +316,23 @@
#define HAS_RGBATOUVROW_NEON
#define HAS_RGBATOYROW_NEON
#define HAS_SETROW_NEON
+#define HAS_ARGBSETROW_NEON
#define HAS_SPLITUVROW_NEON
#define HAS_UYVYTOARGBROW_NEON
#define HAS_UYVYTOUV422ROW_NEON
#define HAS_UYVYTOUVROW_NEON
#define HAS_UYVYTOYROW_NEON
+#define HAS_I400TOARGBROW_NEON
#define HAS_YUY2TOARGBROW_NEON
#define HAS_YUY2TOUV422ROW_NEON
#define HAS_YUY2TOUVROW_NEON
#define HAS_YUY2TOYROW_NEON
+#define HAS_ARGBTORGB565DITHERROW_NEON
// Effects:
#define HAS_ARGBADDROW_NEON
#define HAS_ARGBATTENUATEROW_NEON
#define HAS_ARGBBLENDROW_NEON
-#define HAS_ARGBCOLORMATRIXROW_NEON
#define HAS_ARGBGRAYROW_NEON
#define HAS_ARGBMIRRORROW_NEON
#define HAS_ARGBMULTIPLYROW_NEON
@@ -349,7 +339,6 @@
#define HAS_ARGBQUANTIZEROW_NEON
#define HAS_ARGBSEPIAROW_NEON
#define HAS_ARGBSHADEROW_NEON
-#define HAS_ARGBSHUFFLEROW_NEON
#define HAS_ARGBSUBTRACTROW_NEON
#define HAS_INTERPOLATEROW_NEON
#define HAS_SOBELROW_NEON
@@ -357,6 +346,8 @@
#define HAS_SOBELXROW_NEON
#define HAS_SOBELXYROW_NEON
#define HAS_SOBELYROW_NEON
+#define HAS_ARGBCOLORMATRIXROW_NEON
+#define HAS_ARGBSHUFFLEROW_NEON
#endif
// The following are available on Mips platforms:
@@ -364,15 +355,17 @@
(_MIPS_SIM == _MIPS_SIM_ABI32) && (__mips_isa_rev < 6)
#define HAS_COPYROW_MIPS
#if defined(__mips_dsp) && (__mips_dsp_rev >= 2)
-#define HAS_I422TOARGBROW_DSPR2
-#define HAS_INTERPOLATEROW_DSPR2
-#define HAS_MIRRORROW_DSPR2
-#define HAS_MIRRORUVROW_DSPR2
-#define HAS_SPLITUVROW_DSPR2
+#define HAS_I422TOABGRROW_MIPS_DSPR2
+#define HAS_I422TOARGBROW_MIPS_DSPR2
+#define HAS_I422TOBGRAROW_MIPS_DSPR2
+#define HAS_INTERPOLATEROW_MIPS_DSPR2
+#define HAS_MIRRORROW_MIPS_DSPR2
+#define HAS_MIRRORUVROW_MIPS_DSPR2
+#define HAS_SPLITUVROW_MIPS_DSPR2
#endif
#endif
-#if defined(_MSC_VER) && !defined(__CLR_VER) && !defined(__clang__)
+#if defined(_MSC_VER) && !defined(__CLR_VER)
#define SIMD_ALIGNED(var) __declspec(align(16)) var
#define SIMD_ALIGNED32(var) __declspec(align(64)) var
typedef __declspec(align(16)) int16 vec16[8];
@@ -387,7 +380,7 @@
typedef __declspec(align(32)) uint16 ulvec16[16];
typedef __declspec(align(32)) uint32 ulvec32[8];
typedef __declspec(align(32)) uint8 ulvec8[32];
-#elif !defined(__pnacl__) && (defined(__GNUC__) || defined(__clang__))
+#elif defined(__GNUC__)
// Caveat GCC 4.2 to 4.7 have a known issue using vectors with const.
#define SIMD_ALIGNED(var) var __attribute__((aligned(16)))
#define SIMD_ALIGNED32(var) var __attribute__((aligned(64)))
@@ -420,56 +413,6 @@
typedef uint8 ulvec8[32];
#endif
-#if defined(__aarch64__)
-// This struct is for Arm64 color conversion.
-struct YuvConstants {
- uvec16 kUVToRB;
- uvec16 kUVToRB2;
- uvec16 kUVToG;
- uvec16 kUVToG2;
- vec16 kUVBiasBGR;
- vec32 kYToRgb;
-};
-#elif defined(__arm__)
-// This struct is for ArmV7 color conversion.
-struct YuvConstants {
- uvec8 kUVToRB;
- uvec8 kUVToG;
- vec16 kUVBiasBGR;
- vec32 kYToRgb;
-};
-#else
-// This struct is for Intel color conversion.
-struct YuvConstants {
- lvec8 kUVToB;
- lvec8 kUVToG;
- lvec8 kUVToR;
- lvec16 kUVBiasB;
- lvec16 kUVBiasG;
- lvec16 kUVBiasR;
- lvec16 kYToRgb;
-};
-
-// Offsets into YuvConstants structure
-#define KUVTOB 0
-#define KUVTOG 32
-#define KUVTOR 64
-#define KUVBIASB 96
-#define KUVBIASG 128
-#define KUVBIASR 160
-#define KYTORGB 192
-#endif
-
-// Conversion matrix for YUV to RGB
-extern const struct YuvConstants kYuvI601Constants; // BT.601
-extern const struct YuvConstants kYuvJPEGConstants; // JPeg color space
-extern const struct YuvConstants kYuvH709Constants; // BT.709
-
-// Conversion matrix for YVU to BGR
-extern const struct YuvConstants kYvuI601Constants; // BT.601
-extern const struct YuvConstants kYvuJPEGConstants; // JPeg color space
-extern const struct YuvConstants kYvuH709Constants; // BT.709
-
#if defined(__APPLE__) || defined(__x86_64__) || defined(__llvm__)
#define OMITFP
#else
@@ -559,166 +502,159 @@
const uint8* src_u,
const uint8* src_v,
uint8* dst_argb,
- const struct YuvConstants* yuvconstants,
int width);
void I422ToARGBRow_NEON(const uint8* src_y,
const uint8* src_u,
const uint8* src_v,
uint8* dst_argb,
- const struct YuvConstants* yuvconstants,
int width);
-void I422AlphaToARGBRow_NEON(const uint8* y_buf,
- const uint8* u_buf,
- const uint8* v_buf,
- const uint8* a_buf,
- uint8* dst_argb,
- const struct YuvConstants* yuvconstants,
- int width);
-void I422ToARGBRow_NEON(const uint8* src_y,
+void I411ToARGBRow_NEON(const uint8* src_y,
const uint8* src_u,
const uint8* src_v,
uint8* dst_argb,
- const struct YuvConstants* yuvconstants,
int width);
-void I411ToARGBRow_NEON(const uint8* src_y,
+void I422ToBGRARow_NEON(const uint8* src_y,
const uint8* src_u,
const uint8* src_v,
- uint8* dst_argb,
- const struct YuvConstants* yuvconstants,
+ uint8* dst_bgra,
int width);
+void I422ToABGRRow_NEON(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_abgr,
+ int width);
void I422ToRGBARow_NEON(const uint8* src_y,
const uint8* src_u,
const uint8* src_v,
uint8* dst_rgba,
- const struct YuvConstants* yuvconstants,
int width);
void I422ToRGB24Row_NEON(const uint8* src_y,
const uint8* src_u,
const uint8* src_v,
uint8* dst_rgb24,
- const struct YuvConstants* yuvconstants,
int width);
+void I422ToRAWRow_NEON(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_raw,
+ int width);
void I422ToRGB565Row_NEON(const uint8* src_y,
const uint8* src_u,
const uint8* src_v,
uint8* dst_rgb565,
- const struct YuvConstants* yuvconstants,
int width);
void I422ToARGB1555Row_NEON(const uint8* src_y,
const uint8* src_u,
const uint8* src_v,
uint8* dst_argb1555,
- const struct YuvConstants* yuvconstants,
int width);
void I422ToARGB4444Row_NEON(const uint8* src_y,
const uint8* src_u,
const uint8* src_v,
uint8* dst_argb4444,
- const struct YuvConstants* yuvconstants,
int width);
void NV12ToARGBRow_NEON(const uint8* src_y,
const uint8* src_uv,
uint8* dst_argb,
- const struct YuvConstants* yuvconstants,
int width);
-void NV12ToRGB565Row_NEON(const uint8* src_y,
- const uint8* src_uv,
- uint8* dst_rgb565,
- const struct YuvConstants* yuvconstants,
- int width);
void NV21ToARGBRow_NEON(const uint8* src_y,
const uint8* src_vu,
uint8* dst_argb,
- const struct YuvConstants* yuvconstants,
int width);
+void NV12ToRGB565Row_NEON(const uint8* src_y,
+ const uint8* src_uv,
+ uint8* dst_rgb565,
+ int width);
+void NV21ToRGB565Row_NEON(const uint8* src_y,
+ const uint8* src_vu,
+ uint8* dst_rgb565,
+ int width);
void YUY2ToARGBRow_NEON(const uint8* src_yuy2,
uint8* dst_argb,
- const struct YuvConstants* yuvconstants,
int width);
void UYVYToARGBRow_NEON(const uint8* src_uyvy,
uint8* dst_argb,
- const struct YuvConstants* yuvconstants,
int width);
-void ARGBToYRow_AVX2(const uint8* src_argb, uint8* dst_y, int width);
-void ARGBToYRow_Any_AVX2(const uint8* src_argb, uint8* dst_y, int width);
-void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int width);
-void ARGBToYJRow_AVX2(const uint8* src_argb, uint8* dst_y, int width);
-void ARGBToYJRow_Any_AVX2(const uint8* src_argb, uint8* dst_y, int width);
-void ARGBToYJRow_SSSE3(const uint8* src_argb, uint8* dst_y, int width);
-void BGRAToYRow_SSSE3(const uint8* src_bgra, uint8* dst_y, int width);
-void ABGRToYRow_SSSE3(const uint8* src_abgr, uint8* dst_y, int width);
-void RGBAToYRow_SSSE3(const uint8* src_rgba, uint8* dst_y, int width);
-void RGB24ToYRow_SSSE3(const uint8* src_rgb24, uint8* dst_y, int width);
-void RAWToYRow_SSSE3(const uint8* src_raw, uint8* dst_y, int width);
-void ARGBToYRow_NEON(const uint8* src_argb, uint8* dst_y, int width);
-void ARGBToYJRow_NEON(const uint8* src_argb, uint8* dst_y, int width);
+void ARGBToYRow_AVX2(const uint8* src_argb, uint8* dst_y, int pix);
+void ARGBToYRow_Any_AVX2(const uint8* src_argb, uint8* dst_y, int pix);
+void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix);
+void ARGBToYJRow_AVX2(const uint8* src_argb, uint8* dst_y, int pix);
+void ARGBToYJRow_Any_AVX2(const uint8* src_argb, uint8* dst_y, int pix);
+void ARGBToYJRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix);
+void BGRAToYRow_SSSE3(const uint8* src_bgra, uint8* dst_y, int pix);
+void ABGRToYRow_SSSE3(const uint8* src_abgr, uint8* dst_y, int pix);
+void RGBAToYRow_SSSE3(const uint8* src_rgba, uint8* dst_y, int pix);
+void RGB24ToYRow_SSSE3(const uint8* src_rgb24, uint8* dst_y, int pix);
+void RAWToYRow_SSSE3(const uint8* src_raw, uint8* dst_y, int pix);
+void ARGBToYRow_NEON(const uint8* src_argb, uint8* dst_y, int pix);
+void ARGBToYJRow_NEON(const uint8* src_argb, uint8* dst_y, int pix);
void ARGBToUV444Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
- int width);
+ int pix);
+void ARGBToUV422Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
+ int pix);
void ARGBToUV411Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
- int width);
+ int pix);
void ARGBToUVRow_NEON(const uint8* src_argb, int src_stride_argb,
- uint8* dst_u, uint8* dst_v, int width);
+ uint8* dst_u, uint8* dst_v, int pix);
void ARGBToUVJRow_NEON(const uint8* src_argb, int src_stride_argb,
- uint8* dst_u, uint8* dst_v, int width);
+ uint8* dst_u, uint8* dst_v, int pix);
void BGRAToUVRow_NEON(const uint8* src_bgra, int src_stride_bgra,
- uint8* dst_u, uint8* dst_v, int width);
+ uint8* dst_u, uint8* dst_v, int pix);
void ABGRToUVRow_NEON(const uint8* src_abgr, int src_stride_abgr,
- uint8* dst_u, uint8* dst_v, int width);
+ uint8* dst_u, uint8* dst_v, int pix);
void RGBAToUVRow_NEON(const uint8* src_rgba, int src_stride_rgba,
- uint8* dst_u, uint8* dst_v, int width);
+ uint8* dst_u, uint8* dst_v, int pix);
void RGB24ToUVRow_NEON(const uint8* src_rgb24, int src_stride_rgb24,
- uint8* dst_u, uint8* dst_v, int width);
+ uint8* dst_u, uint8* dst_v, int pix);
void RAWToUVRow_NEON(const uint8* src_raw, int src_stride_raw,
- uint8* dst_u, uint8* dst_v, int width);
+ uint8* dst_u, uint8* dst_v, int pix);
void RGB565ToUVRow_NEON(const uint8* src_rgb565, int src_stride_rgb565,
- uint8* dst_u, uint8* dst_v, int width);
+ uint8* dst_u, uint8* dst_v, int pix);
void ARGB1555ToUVRow_NEON(const uint8* src_argb1555, int src_stride_argb1555,
- uint8* dst_u, uint8* dst_v, int width);
+ uint8* dst_u, uint8* dst_v, int pix);
void ARGB4444ToUVRow_NEON(const uint8* src_argb4444, int src_stride_argb4444,
- uint8* dst_u, uint8* dst_v, int width);
-void BGRAToYRow_NEON(const uint8* src_bgra, uint8* dst_y, int width);
-void ABGRToYRow_NEON(const uint8* src_abgr, uint8* dst_y, int width);
-void RGBAToYRow_NEON(const uint8* src_rgba, uint8* dst_y, int width);
-void RGB24ToYRow_NEON(const uint8* src_rgb24, uint8* dst_y, int width);
-void RAWToYRow_NEON(const uint8* src_raw, uint8* dst_y, int width);
-void RGB565ToYRow_NEON(const uint8* src_rgb565, uint8* dst_y, int width);
-void ARGB1555ToYRow_NEON(const uint8* src_argb1555, uint8* dst_y, int width);
-void ARGB4444ToYRow_NEON(const uint8* src_argb4444, uint8* dst_y, int width);
-void ARGBToYRow_C(const uint8* src_argb, uint8* dst_y, int width);
-void ARGBToYJRow_C(const uint8* src_argb, uint8* dst_y, int width);
-void BGRAToYRow_C(const uint8* src_bgra, uint8* dst_y, int width);
-void ABGRToYRow_C(const uint8* src_abgr, uint8* dst_y, int width);
-void RGBAToYRow_C(const uint8* src_rgba, uint8* dst_y, int width);
-void RGB24ToYRow_C(const uint8* src_rgb24, uint8* dst_y, int width);
-void RAWToYRow_C(const uint8* src_raw, uint8* dst_y, int width);
-void RGB565ToYRow_C(const uint8* src_rgb565, uint8* dst_y, int width);
-void ARGB1555ToYRow_C(const uint8* src_argb1555, uint8* dst_y, int width);
-void ARGB4444ToYRow_C(const uint8* src_argb4444, uint8* dst_y, int width);
-void ARGBToYRow_Any_SSSE3(const uint8* src_argb, uint8* dst_y, int width);
-void ARGBToYJRow_Any_SSSE3(const uint8* src_argb, uint8* dst_y, int width);
-void BGRAToYRow_Any_SSSE3(const uint8* src_bgra, uint8* dst_y, int width);
-void ABGRToYRow_Any_SSSE3(const uint8* src_abgr, uint8* dst_y, int width);
-void RGBAToYRow_Any_SSSE3(const uint8* src_rgba, uint8* dst_y, int width);
-void RGB24ToYRow_Any_SSSE3(const uint8* src_rgb24, uint8* dst_y, int width);
-void RAWToYRow_Any_SSSE3(const uint8* src_raw, uint8* dst_y, int width);
-void ARGBToYRow_Any_NEON(const uint8* src_argb, uint8* dst_y, int width);
-void ARGBToYJRow_Any_NEON(const uint8* src_argb, uint8* dst_y, int width);
-void BGRAToYRow_Any_NEON(const uint8* src_bgra, uint8* dst_y, int width);
-void ABGRToYRow_Any_NEON(const uint8* src_abgr, uint8* dst_y, int width);
-void RGBAToYRow_Any_NEON(const uint8* src_rgba, uint8* dst_y, int width);
-void RGB24ToYRow_Any_NEON(const uint8* src_rgb24, uint8* dst_y, int width);
-void RAWToYRow_Any_NEON(const uint8* src_raw, uint8* dst_y, int width);
-void RGB565ToYRow_Any_NEON(const uint8* src_rgb565, uint8* dst_y, int width);
-void ARGB1555ToYRow_Any_NEON(const uint8* src_argb1555, uint8* dst_y,
- int width);
-void ARGB4444ToYRow_Any_NEON(const uint8* src_argb4444, uint8* dst_y,
- int width);
+ uint8* dst_u, uint8* dst_v, int pix);
+void BGRAToYRow_NEON(const uint8* src_bgra, uint8* dst_y, int pix);
+void ABGRToYRow_NEON(const uint8* src_abgr, uint8* dst_y, int pix);
+void RGBAToYRow_NEON(const uint8* src_rgba, uint8* dst_y, int pix);
+void RGB24ToYRow_NEON(const uint8* src_rgb24, uint8* dst_y, int pix);
+void RAWToYRow_NEON(const uint8* src_raw, uint8* dst_y, int pix);
+void RGB565ToYRow_NEON(const uint8* src_rgb565, uint8* dst_y, int pix);
+void ARGB1555ToYRow_NEON(const uint8* src_argb1555, uint8* dst_y, int pix);
+void ARGB4444ToYRow_NEON(const uint8* src_argb4444, uint8* dst_y, int pix);
+void ARGBToYRow_C(const uint8* src_argb, uint8* dst_y, int pix);
+void ARGBToYJRow_C(const uint8* src_argb, uint8* dst_y, int pix);
+void BGRAToYRow_C(const uint8* src_bgra, uint8* dst_y, int pix);
+void ABGRToYRow_C(const uint8* src_abgr, uint8* dst_y, int pix);
+void RGBAToYRow_C(const uint8* src_rgba, uint8* dst_y, int pix);
+void RGB24ToYRow_C(const uint8* src_rgb24, uint8* dst_y, int pix);
+void RAWToYRow_C(const uint8* src_raw, uint8* dst_y, int pix);
+void RGB565ToYRow_C(const uint8* src_rgb565, uint8* dst_y, int pix);
+void ARGB1555ToYRow_C(const uint8* src_argb1555, uint8* dst_y, int pix);
+void ARGB4444ToYRow_C(const uint8* src_argb4444, uint8* dst_y, int pix);
+void ARGBToYRow_Any_SSSE3(const uint8* src_argb, uint8* dst_y, int pix);
+void ARGBToYJRow_Any_SSSE3(const uint8* src_argb, uint8* dst_y, int pix);
+void BGRAToYRow_Any_SSSE3(const uint8* src_bgra, uint8* dst_y, int pix);
+void ABGRToYRow_Any_SSSE3(const uint8* src_abgr, uint8* dst_y, int pix);
+void RGBAToYRow_Any_SSSE3(const uint8* src_rgba, uint8* dst_y, int pix);
+void RGB24ToYRow_Any_SSSE3(const uint8* src_rgb24, uint8* dst_y, int pix);
+void RAWToYRow_Any_SSSE3(const uint8* src_raw, uint8* dst_y, int pix);
+void ARGBToYRow_Any_NEON(const uint8* src_argb, uint8* dst_y, int pix);
+void ARGBToYJRow_Any_NEON(const uint8* src_argb, uint8* dst_y, int pix);
+void BGRAToYRow_Any_NEON(const uint8* src_bgra, uint8* dst_y, int pix);
+void ABGRToYRow_Any_NEON(const uint8* src_abgr, uint8* dst_y, int pix);
+void RGBAToYRow_Any_NEON(const uint8* src_rgba, uint8* dst_y, int pix);
+void RGB24ToYRow_Any_NEON(const uint8* src_rgb24, uint8* dst_y, int pix);
+void RAWToYRow_Any_NEON(const uint8* src_raw, uint8* dst_y, int pix);
+void RGB565ToYRow_Any_NEON(const uint8* src_rgb565, uint8* dst_y, int pix);
+void ARGB1555ToYRow_Any_NEON(const uint8* src_argb1555, uint8* dst_y, int pix);
+void ARGB4444ToYRow_Any_NEON(const uint8* src_argb4444, uint8* dst_y, int pix);
void ARGBToUVRow_AVX2(const uint8* src_argb, int src_stride_argb,
uint8* dst_u, uint8* dst_v, int width);
-void ARGBToUVJRow_AVX2(const uint8* src_argb, int src_stride_argb,
- uint8* dst_u, uint8* dst_v, int width);
+void ARGBToUVRow_Any_AVX2(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_u, uint8* dst_v, int width);
void ARGBToUVRow_SSSE3(const uint8* src_argb, int src_stride_argb,
uint8* dst_u, uint8* dst_v, int width);
void ARGBToUVJRow_SSSE3(const uint8* src_argb, int src_stride_argb,
@@ -729,10 +665,6 @@
uint8* dst_u, uint8* dst_v, int width);
void RGBAToUVRow_SSSE3(const uint8* src_rgba, int src_stride_rgba,
uint8* dst_u, uint8* dst_v, int width);
-void ARGBToUVRow_Any_AVX2(const uint8* src_argb, int src_stride_argb,
- uint8* dst_u, uint8* dst_v, int width);
-void ARGBToUVJRow_Any_AVX2(const uint8* src_argb, int src_stride_argb,
- uint8* dst_u, uint8* dst_v, int width);
void ARGBToUVRow_Any_SSSE3(const uint8* src_argb, int src_stride_argb,
uint8* dst_u, uint8* dst_v, int width);
void ARGBToUVJRow_Any_SSSE3(const uint8* src_argb, int src_stride_argb,
@@ -744,31 +676,33 @@
void RGBAToUVRow_Any_SSSE3(const uint8* src_rgba, int src_stride_rgba,
uint8* dst_u, uint8* dst_v, int width);
void ARGBToUV444Row_Any_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
- int width);
+ int pix);
+void ARGBToUV422Row_Any_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
+ int pix);
void ARGBToUV411Row_Any_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
- int width);
+ int pix);
void ARGBToUVRow_Any_NEON(const uint8* src_argb, int src_stride_argb,
- uint8* dst_u, uint8* dst_v, int width);
+ uint8* dst_u, uint8* dst_v, int pix);
void ARGBToUVJRow_Any_NEON(const uint8* src_argb, int src_stride_argb,
- uint8* dst_u, uint8* dst_v, int width);
+ uint8* dst_u, uint8* dst_v, int pix);
void BGRAToUVRow_Any_NEON(const uint8* src_bgra, int src_stride_bgra,
- uint8* dst_u, uint8* dst_v, int width);
+ uint8* dst_u, uint8* dst_v, int pix);
void ABGRToUVRow_Any_NEON(const uint8* src_abgr, int src_stride_abgr,
- uint8* dst_u, uint8* dst_v, int width);
+ uint8* dst_u, uint8* dst_v, int pix);
void RGBAToUVRow_Any_NEON(const uint8* src_rgba, int src_stride_rgba,
- uint8* dst_u, uint8* dst_v, int width);
+ uint8* dst_u, uint8* dst_v, int pix);
void RGB24ToUVRow_Any_NEON(const uint8* src_rgb24, int src_stride_rgb24,
- uint8* dst_u, uint8* dst_v, int width);
+ uint8* dst_u, uint8* dst_v, int pix);
void RAWToUVRow_Any_NEON(const uint8* src_raw, int src_stride_raw,
- uint8* dst_u, uint8* dst_v, int width);
+ uint8* dst_u, uint8* dst_v, int pix);
void RGB565ToUVRow_Any_NEON(const uint8* src_rgb565, int src_stride_rgb565,
- uint8* dst_u, uint8* dst_v, int width);
+ uint8* dst_u, uint8* dst_v, int pix);
void ARGB1555ToUVRow_Any_NEON(const uint8* src_argb1555,
int src_stride_argb1555,
- uint8* dst_u, uint8* dst_v, int width);
+ uint8* dst_u, uint8* dst_v, int pix);
void ARGB4444ToUVRow_Any_NEON(const uint8* src_argb4444,
int src_stride_argb4444,
- uint8* dst_u, uint8* dst_v, int width);
+ uint8* dst_u, uint8* dst_v, int pix);
void ARGBToUVRow_C(const uint8* src_argb, int src_stride_argb,
uint8* dst_u, uint8* dst_v, int width);
void ARGBToUVJRow_C(const uint8* src_argb, int src_stride_argb,
@@ -795,15 +729,25 @@
void ARGBToUV444Row_Any_SSSE3(const uint8* src_argb,
uint8* dst_u, uint8* dst_v, int width);
+void ARGBToUV422Row_SSSE3(const uint8* src_argb,
+ uint8* dst_u, uint8* dst_v, int width);
+void ARGBToUV422Row_Any_SSSE3(const uint8* src_argb,
+ uint8* dst_u, uint8* dst_v, int width);
+
void ARGBToUV444Row_C(const uint8* src_argb,
uint8* dst_u, uint8* dst_v, int width);
+void ARGBToUV422Row_C(const uint8* src_argb,
+ uint8* dst_u, uint8* dst_v, int width);
void ARGBToUV411Row_C(const uint8* src_argb,
uint8* dst_u, uint8* dst_v, int width);
+void ARGBToUVJ422Row_C(const uint8* src_argb,
+ uint8* dst_u, uint8* dst_v, int width);
void MirrorRow_AVX2(const uint8* src, uint8* dst, int width);
void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width);
+void MirrorRow_SSE2(const uint8* src, uint8* dst, int width);
void MirrorRow_NEON(const uint8* src, uint8* dst, int width);
-void MirrorRow_DSPR2(const uint8* src, uint8* dst, int width);
+void MirrorRow_MIPS_DSPR2(const uint8* src, uint8* dst, int width);
void MirrorRow_C(const uint8* src, uint8* dst, int width);
void MirrorRow_Any_AVX2(const uint8* src, uint8* dst, int width);
void MirrorRow_Any_SSSE3(const uint8* src, uint8* dst, int width);
@@ -814,9 +758,10 @@
int width);
void MirrorUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
int width);
-void MirrorUVRow_DSPR2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
- int width);
-void MirrorUVRow_C(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int width);
+void MirrorUVRow_MIPS_DSPR2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
+ int width);
+void MirrorUVRow_C(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
+ int width);
void ARGBMirrorRow_AVX2(const uint8* src, uint8* dst, int width);
void ARGBMirrorRow_SSE2(const uint8* src, uint8* dst, int width);
@@ -826,23 +771,20 @@
void ARGBMirrorRow_Any_SSE2(const uint8* src, uint8* dst, int width);
void ARGBMirrorRow_Any_NEON(const uint8* src, uint8* dst, int width);
-void SplitUVRow_C(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int width);
-void SplitUVRow_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
- int width);
-void SplitUVRow_AVX2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
- int width);
-void SplitUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
- int width);
-void SplitUVRow_DSPR2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
- int width);
+void SplitUVRow_C(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix);
+void SplitUVRow_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix);
+void SplitUVRow_AVX2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix);
+void SplitUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix);
+void SplitUVRow_MIPS_DSPR2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
+ int pix);
void SplitUVRow_Any_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
- int width);
+ int pix);
void SplitUVRow_Any_AVX2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
- int width);
+ int pix);
void SplitUVRow_Any_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
- int width);
-void SplitUVRow_Any_DSPR2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
- int width);
+ int pix);
+void SplitUVRow_Any_MIPS_DSPR2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
+ int pix);
void MergeUVRow_C(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
int width);
@@ -874,26 +816,10 @@
void ARGBCopyAlphaRow_C(const uint8* src_argb, uint8* dst_argb, int width);
void ARGBCopyAlphaRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width);
void ARGBCopyAlphaRow_AVX2(const uint8* src_argb, uint8* dst_argb, int width);
-void ARGBCopyAlphaRow_Any_SSE2(const uint8* src_argb, uint8* dst_argb,
- int width);
-void ARGBCopyAlphaRow_Any_AVX2(const uint8* src_argb, uint8* dst_argb,
- int width);
-void ARGBExtractAlphaRow_C(const uint8* src_argb, uint8* dst_a, int width);
-void ARGBExtractAlphaRow_SSE2(const uint8* src_argb, uint8* dst_a, int width);
-void ARGBExtractAlphaRow_NEON(const uint8* src_argb, uint8* dst_a, int width);
-void ARGBExtractAlphaRow_Any_SSE2(const uint8* src_argb, uint8* dst_a,
- int width);
-void ARGBExtractAlphaRow_Any_NEON(const uint8* src_argb, uint8* dst_a,
- int width);
-
void ARGBCopyYToAlphaRow_C(const uint8* src_y, uint8* dst_argb, int width);
void ARGBCopyYToAlphaRow_SSE2(const uint8* src_y, uint8* dst_argb, int width);
void ARGBCopyYToAlphaRow_AVX2(const uint8* src_y, uint8* dst_argb, int width);
-void ARGBCopyYToAlphaRow_Any_SSE2(const uint8* src_y, uint8* dst_argb,
- int width);
-void ARGBCopyYToAlphaRow_Any_AVX2(const uint8* src_y, uint8* dst_argb,
- int width);
void SetRow_C(uint8* dst, uint8 v8, int count);
void SetRow_X86(uint8* dst, uint8 v8, int count);
@@ -909,541 +835,524 @@
// ARGBShufflers for BGRAToARGB etc.
void ARGBShuffleRow_C(const uint8* src_argb, uint8* dst_argb,
- const uint8* shuffler, int width);
+ const uint8* shuffler, int pix);
void ARGBShuffleRow_SSE2(const uint8* src_argb, uint8* dst_argb,
- const uint8* shuffler, int width);
+ const uint8* shuffler, int pix);
void ARGBShuffleRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
- const uint8* shuffler, int width);
+ const uint8* shuffler, int pix);
void ARGBShuffleRow_AVX2(const uint8* src_argb, uint8* dst_argb,
- const uint8* shuffler, int width);
+ const uint8* shuffler, int pix);
void ARGBShuffleRow_NEON(const uint8* src_argb, uint8* dst_argb,
- const uint8* shuffler, int width);
+ const uint8* shuffler, int pix);
void ARGBShuffleRow_Any_SSE2(const uint8* src_argb, uint8* dst_argb,
- const uint8* shuffler, int width);
+ const uint8* shuffler, int pix);
void ARGBShuffleRow_Any_SSSE3(const uint8* src_argb, uint8* dst_argb,
- const uint8* shuffler, int width);
+ const uint8* shuffler, int pix);
void ARGBShuffleRow_Any_AVX2(const uint8* src_argb, uint8* dst_argb,
- const uint8* shuffler, int width);
+ const uint8* shuffler, int pix);
void ARGBShuffleRow_Any_NEON(const uint8* src_argb, uint8* dst_argb,
- const uint8* shuffler, int width);
+ const uint8* shuffler, int pix);
-void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int width);
-void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb, int width);
-void RAWToRGB24Row_SSSE3(const uint8* src_raw, uint8* dst_rgb24, int width);
-void RGB565ToARGBRow_SSE2(const uint8* src_rgb565, uint8* dst_argb, int width);
+void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int pix);
+void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb, int pix);
+void RGB565ToARGBRow_SSE2(const uint8* src_rgb565, uint8* dst_argb, int pix);
void ARGB1555ToARGBRow_SSE2(const uint8* src_argb1555, uint8* dst_argb,
- int width);
+ int pix);
void ARGB4444ToARGBRow_SSE2(const uint8* src_argb4444, uint8* dst_argb,
- int width);
-void RGB565ToARGBRow_AVX2(const uint8* src_rgb565, uint8* dst_argb, int width);
+ int pix);
+void RGB565ToARGBRow_AVX2(const uint8* src_rgb565, uint8* dst_argb, int pix);
void ARGB1555ToARGBRow_AVX2(const uint8* src_argb1555, uint8* dst_argb,
- int width);
+ int pix);
void ARGB4444ToARGBRow_AVX2(const uint8* src_argb4444, uint8* dst_argb,
- int width);
+ int pix);
-void RGB24ToARGBRow_NEON(const uint8* src_rgb24, uint8* dst_argb, int width);
-void RAWToARGBRow_NEON(const uint8* src_raw, uint8* dst_argb, int width);
-void RAWToRGB24Row_NEON(const uint8* src_raw, uint8* dst_rgb24, int width);
-void RGB565ToARGBRow_NEON(const uint8* src_rgb565, uint8* dst_argb, int width);
+void RGB24ToARGBRow_NEON(const uint8* src_rgb24, uint8* dst_argb, int pix);
+void RAWToARGBRow_NEON(const uint8* src_raw, uint8* dst_argb, int pix);
+void RGB565ToARGBRow_NEON(const uint8* src_rgb565, uint8* dst_argb, int pix);
void ARGB1555ToARGBRow_NEON(const uint8* src_argb1555, uint8* dst_argb,
- int width);
+ int pix);
void ARGB4444ToARGBRow_NEON(const uint8* src_argb4444, uint8* dst_argb,
- int width);
-void RGB24ToARGBRow_C(const uint8* src_rgb24, uint8* dst_argb, int width);
-void RAWToARGBRow_C(const uint8* src_raw, uint8* dst_argb, int width);
-void RAWToRGB24Row_C(const uint8* src_raw, uint8* dst_rgb24, int width);
-void RGB565ToARGBRow_C(const uint8* src_rgb, uint8* dst_argb, int width);
-void ARGB1555ToARGBRow_C(const uint8* src_argb, uint8* dst_argb, int width);
-void ARGB4444ToARGBRow_C(const uint8* src_argb, uint8* dst_argb, int width);
-void RGB24ToARGBRow_Any_SSSE3(const uint8* src_rgb24, uint8* dst_argb,
- int width);
-void RAWToARGBRow_Any_SSSE3(const uint8* src_raw, uint8* dst_argb, int width);
-void RAWToRGB24Row_Any_SSSE3(const uint8* src_raw, uint8* dst_rgb24, int width);
+ int pix);
+void RGB24ToARGBRow_C(const uint8* src_rgb24, uint8* dst_argb, int pix);
+void RAWToARGBRow_C(const uint8* src_raw, uint8* dst_argb, int pix);
+void RGB565ToARGBRow_C(const uint8* src_rgb, uint8* dst_argb, int pix);
+void ARGB1555ToARGBRow_C(const uint8* src_argb, uint8* dst_argb, int pix);
+void ARGB4444ToARGBRow_C(const uint8* src_argb, uint8* dst_argb, int pix);
+void RGB24ToARGBRow_Any_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int pix);
+void RAWToARGBRow_Any_SSSE3(const uint8* src_raw, uint8* dst_argb, int pix);
void RGB565ToARGBRow_Any_SSE2(const uint8* src_rgb565, uint8* dst_argb,
- int width);
+ int pix);
void ARGB1555ToARGBRow_Any_SSE2(const uint8* src_argb1555, uint8* dst_argb,
- int width);
+ int pix);
void ARGB4444ToARGBRow_Any_SSE2(const uint8* src_argb4444, uint8* dst_argb,
- int width);
+ int pix);
void RGB565ToARGBRow_Any_AVX2(const uint8* src_rgb565, uint8* dst_argb,
- int width);
+ int pix);
void ARGB1555ToARGBRow_Any_AVX2(const uint8* src_argb1555, uint8* dst_argb,
- int width);
+ int pix);
void ARGB4444ToARGBRow_Any_AVX2(const uint8* src_argb4444, uint8* dst_argb,
- int width);
+ int pix);
-void RGB24ToARGBRow_Any_NEON(const uint8* src_rgb24, uint8* dst_argb,
- int width);
-void RAWToARGBRow_Any_NEON(const uint8* src_raw, uint8* dst_argb, int width);
-void RAWToRGB24Row_Any_NEON(const uint8* src_raw, uint8* dst_rgb24, int width);
+void RGB24ToARGBRow_Any_NEON(const uint8* src_rgb24, uint8* dst_argb, int pix);
+void RAWToARGBRow_Any_NEON(const uint8* src_raw, uint8* dst_argb, int pix);
void RGB565ToARGBRow_Any_NEON(const uint8* src_rgb565, uint8* dst_argb,
- int width);
+ int pix);
void ARGB1555ToARGBRow_Any_NEON(const uint8* src_argb1555, uint8* dst_argb,
- int width);
+ int pix);
void ARGB4444ToARGBRow_Any_NEON(const uint8* src_argb4444, uint8* dst_argb,
- int width);
+ int pix);
-void ARGBToRGB24Row_SSSE3(const uint8* src_argb, uint8* dst_rgb, int width);
-void ARGBToRAWRow_SSSE3(const uint8* src_argb, uint8* dst_rgb, int width);
-void ARGBToRGB565Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int width);
-void ARGBToARGB1555Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int width);
-void ARGBToARGB4444Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int width);
+void ARGBToRGB24Row_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix);
+void ARGBToRAWRow_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix);
+void ARGBToRGB565Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix);
+void ARGBToARGB1555Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix);
+void ARGBToARGB4444Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix);
void ARGBToRGB565DitherRow_C(const uint8* src_argb, uint8* dst_rgb,
- const uint32 dither4, int width);
+ const uint32 dither4, int pix);
void ARGBToRGB565DitherRow_SSE2(const uint8* src_argb, uint8* dst_rgb,
- const uint32 dither4, int width);
+ const uint32 dither4, int pix);
void ARGBToRGB565DitherRow_AVX2(const uint8* src_argb, uint8* dst_rgb,
- const uint32 dither4, int width);
+ const uint32 dither4, int pix);
-void ARGBToRGB565Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int width);
-void ARGBToARGB1555Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int width);
-void ARGBToARGB4444Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int width);
+void ARGBToRGB565Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int pix);
+void ARGBToARGB1555Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int pix);
+void ARGBToARGB4444Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int pix);
-void ARGBToRGB24Row_NEON(const uint8* src_argb, uint8* dst_rgb, int width);
-void ARGBToRAWRow_NEON(const uint8* src_argb, uint8* dst_rgb, int width);
-void ARGBToRGB565Row_NEON(const uint8* src_argb, uint8* dst_rgb, int width);
-void ARGBToARGB1555Row_NEON(const uint8* src_argb, uint8* dst_rgb, int width);
-void ARGBToARGB4444Row_NEON(const uint8* src_argb, uint8* dst_rgb, int width);
+void ARGBToRGB24Row_NEON(const uint8* src_argb, uint8* dst_rgb, int pix);
+void ARGBToRAWRow_NEON(const uint8* src_argb, uint8* dst_rgb, int pix);
+void ARGBToRGB565Row_NEON(const uint8* src_argb, uint8* dst_rgb, int pix);
+void ARGBToARGB1555Row_NEON(const uint8* src_argb, uint8* dst_rgb, int pix);
+void ARGBToARGB4444Row_NEON(const uint8* src_argb, uint8* dst_rgb, int pix);
void ARGBToRGB565DitherRow_NEON(const uint8* src_argb, uint8* dst_rgb,
const uint32 dither4, int width);
-void ARGBToRGBARow_C(const uint8* src_argb, uint8* dst_rgb, int width);
-void ARGBToRGB24Row_C(const uint8* src_argb, uint8* dst_rgb, int width);
-void ARGBToRAWRow_C(const uint8* src_argb, uint8* dst_rgb, int width);
-void ARGBToRGB565Row_C(const uint8* src_argb, uint8* dst_rgb, int width);
-void ARGBToARGB1555Row_C(const uint8* src_argb, uint8* dst_rgb, int width);
-void ARGBToARGB4444Row_C(const uint8* src_argb, uint8* dst_rgb, int width);
+void ARGBToRGBARow_C(const uint8* src_argb, uint8* dst_rgb, int pix);
+void ARGBToRGB24Row_C(const uint8* src_argb, uint8* dst_rgb, int pix);
+void ARGBToRAWRow_C(const uint8* src_argb, uint8* dst_rgb, int pix);
+void ARGBToRGB565Row_C(const uint8* src_argb, uint8* dst_rgb, int pix);
+void ARGBToARGB1555Row_C(const uint8* src_argb, uint8* dst_rgb, int pix);
+void ARGBToARGB4444Row_C(const uint8* src_argb, uint8* dst_rgb, int pix);
-void J400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int width);
-void J400ToARGBRow_AVX2(const uint8* src_y, uint8* dst_argb, int width);
-void J400ToARGBRow_NEON(const uint8* src_y, uint8* dst_argb, int width);
-void J400ToARGBRow_C(const uint8* src_y, uint8* dst_argb, int width);
-void J400ToARGBRow_Any_SSE2(const uint8* src_y, uint8* dst_argb, int width);
-void J400ToARGBRow_Any_AVX2(const uint8* src_y, uint8* dst_argb, int width);
-void J400ToARGBRow_Any_NEON(const uint8* src_y, uint8* dst_argb, int width);
+void J400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix);
+void J400ToARGBRow_AVX2(const uint8* src_y, uint8* dst_argb, int pix);
+void J400ToARGBRow_NEON(const uint8* src_y, uint8* dst_argb, int pix);
+void J400ToARGBRow_C(const uint8* src_y, uint8* dst_argb, int pix);
+void J400ToARGBRow_Any_SSE2(const uint8* src_y, uint8* dst_argb, int pix);
+void J400ToARGBRow_Any_AVX2(const uint8* src_y, uint8* dst_argb, int pix);
+void J400ToARGBRow_Any_NEON(const uint8* src_y, uint8* dst_argb, int pix);
void I444ToARGBRow_C(const uint8* src_y,
const uint8* src_u,
const uint8* src_v,
uint8* dst_argb,
- const struct YuvConstants* yuvconstants,
int width);
void I422ToARGBRow_C(const uint8* src_y,
const uint8* src_u,
const uint8* src_v,
uint8* dst_argb,
- const struct YuvConstants* yuvconstants,
int width);
-void I422ToARGBRow_C(const uint8* src_y,
- const uint8* src_u,
- const uint8* src_v,
- uint8* dst_argb,
- const struct YuvConstants* yuvconstants,
- int width);
-void I422AlphaToARGBRow_C(const uint8* y_buf,
- const uint8* u_buf,
- const uint8* v_buf,
- const uint8* a_buf,
- uint8* dst_argb,
- const struct YuvConstants* yuvconstants,
- int width);
void I411ToARGBRow_C(const uint8* src_y,
const uint8* src_u,
const uint8* src_v,
uint8* dst_argb,
- const struct YuvConstants* yuvconstants,
int width);
void NV12ToARGBRow_C(const uint8* src_y,
const uint8* src_uv,
uint8* dst_argb,
- const struct YuvConstants* yuvconstants,
int width);
+void NV21ToRGB565Row_C(const uint8* src_y,
+ const uint8* src_vu,
+ uint8* dst_argb,
+ int width);
void NV12ToRGB565Row_C(const uint8* src_y,
const uint8* src_uv,
uint8* dst_argb,
- const struct YuvConstants* yuvconstants,
int width);
void NV21ToARGBRow_C(const uint8* src_y,
- const uint8* src_uv,
+ const uint8* src_vu,
uint8* dst_argb,
- const struct YuvConstants* yuvconstants,
int width);
void YUY2ToARGBRow_C(const uint8* src_yuy2,
uint8* dst_argb,
- const struct YuvConstants* yuvconstants,
int width);
void UYVYToARGBRow_C(const uint8* src_uyvy,
uint8* dst_argb,
- const struct YuvConstants* yuvconstants,
int width);
+void J422ToARGBRow_C(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb,
+ int width);
+void I422ToBGRARow_C(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_bgra,
+ int width);
+void I422ToABGRRow_C(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_abgr,
+ int width);
void I422ToRGBARow_C(const uint8* src_y,
const uint8* src_u,
const uint8* src_v,
uint8* dst_rgba,
- const struct YuvConstants* yuvconstants,
int width);
void I422ToRGB24Row_C(const uint8* src_y,
const uint8* src_u,
const uint8* src_v,
uint8* dst_rgb24,
- const struct YuvConstants* yuvconstants,
int width);
+void I422ToRAWRow_C(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_raw,
+ int width);
void I422ToARGB4444Row_C(const uint8* src_y,
const uint8* src_u,
const uint8* src_v,
uint8* dst_argb4444,
- const struct YuvConstants* yuvconstants,
int width);
void I422ToARGB1555Row_C(const uint8* src_y,
const uint8* src_u,
const uint8* src_v,
uint8* dst_argb4444,
- const struct YuvConstants* yuvconstants,
int width);
void I422ToRGB565Row_C(const uint8* src_y,
const uint8* src_u,
const uint8* src_v,
uint8* dst_rgb565,
- const struct YuvConstants* yuvconstants,
int width);
void I422ToARGBRow_AVX2(const uint8* src_y,
const uint8* src_u,
const uint8* src_v,
uint8* dst_argb,
- const struct YuvConstants* yuvconstants,
int width);
-void I422ToARGBRow_AVX2(const uint8* src_y,
+void I422ToBGRARow_AVX2(const uint8* src_y,
const uint8* src_u,
const uint8* src_v,
uint8* dst_argb,
- const struct YuvConstants* yuvconstants,
int width);
void I422ToRGBARow_AVX2(const uint8* src_y,
const uint8* src_u,
const uint8* src_v,
uint8* dst_argb,
- const struct YuvConstants* yuvconstants,
int width);
-void I444ToARGBRow_SSSE3(const uint8* src_y,
- const uint8* src_u,
- const uint8* src_v,
- uint8* dst_argb,
- const struct YuvConstants* yuvconstants,
- int width);
-void I444ToARGBRow_AVX2(const uint8* src_y,
+void I422ToABGRRow_AVX2(const uint8* src_y,
const uint8* src_u,
const uint8* src_v,
uint8* dst_argb,
- const struct YuvConstants* yuvconstants,
int width);
void I444ToARGBRow_SSSE3(const uint8* src_y,
const uint8* src_u,
const uint8* src_v,
uint8* dst_argb,
- const struct YuvConstants* yuvconstants,
int width);
void I444ToARGBRow_AVX2(const uint8* src_y,
const uint8* src_u,
const uint8* src_v,
uint8* dst_argb,
- const struct YuvConstants* yuvconstants,
int width);
void I422ToARGBRow_SSSE3(const uint8* src_y,
const uint8* src_u,
const uint8* src_v,
uint8* dst_argb,
- const struct YuvConstants* yuvconstants,
int width);
-void I422AlphaToARGBRow_SSSE3(const uint8* y_buf,
- const uint8* u_buf,
- const uint8* v_buf,
- const uint8* a_buf,
- uint8* dst_argb,
- const struct YuvConstants* yuvconstants,
- int width);
-void I422AlphaToARGBRow_AVX2(const uint8* y_buf,
- const uint8* u_buf,
- const uint8* v_buf,
- const uint8* a_buf,
- uint8* dst_argb,
- const struct YuvConstants* yuvconstants,
- int width);
-void I422ToARGBRow_SSSE3(const uint8* src_y,
- const uint8* src_u,
- const uint8* src_v,
- uint8* dst_argb,
- const struct YuvConstants* yuvconstants,
- int width);
void I411ToARGBRow_SSSE3(const uint8* src_y,
const uint8* src_u,
const uint8* src_v,
uint8* dst_argb,
- const struct YuvConstants* yuvconstants,
int width);
void I411ToARGBRow_AVX2(const uint8* src_y,
const uint8* src_u,
const uint8* src_v,
uint8* dst_argb,
- const struct YuvConstants* yuvconstants,
int width);
void NV12ToARGBRow_SSSE3(const uint8* src_y,
const uint8* src_uv,
uint8* dst_argb,
- const struct YuvConstants* yuvconstants,
int width);
+void NV21ToARGBRow_SSSE3(const uint8* src_y,
+ const uint8* src_vu,
+ uint8* dst_argb,
+ int width);
void NV12ToARGBRow_AVX2(const uint8* src_y,
const uint8* src_uv,
uint8* dst_argb,
- const struct YuvConstants* yuvconstants,
int width);
+void NV21ToARGBRow_AVX2(const uint8* src_y,
+ const uint8* src_vu,
+ uint8* dst_argb,
+ int width);
void NV12ToRGB565Row_SSSE3(const uint8* src_y,
const uint8* src_uv,
uint8* dst_argb,
- const struct YuvConstants* yuvconstants,
int width);
+void NV21ToRGB565Row_SSSE3(const uint8* src_y,
+ const uint8* src_vu,
+ uint8* dst_argb,
+ int width);
void NV12ToRGB565Row_AVX2(const uint8* src_y,
const uint8* src_uv,
uint8* dst_argb,
- const struct YuvConstants* yuvconstants,
int width);
-void NV21ToARGBRow_SSSE3(const uint8* src_y,
- const uint8* src_uv,
- uint8* dst_argb,
- const struct YuvConstants* yuvconstants,
- int width);
-void NV21ToARGBRow_AVX2(const uint8* src_y,
- const uint8* src_uv,
- uint8* dst_argb,
- const struct YuvConstants* yuvconstants,
- int width);
+void NV21ToRGB565Row_AVX2(const uint8* src_y,
+ const uint8* src_vu,
+ uint8* dst_argb,
+ int width);
void YUY2ToARGBRow_SSSE3(const uint8* src_yuy2,
uint8* dst_argb,
- const struct YuvConstants* yuvconstants,
int width);
void UYVYToARGBRow_SSSE3(const uint8* src_uyvy,
uint8* dst_argb,
- const struct YuvConstants* yuvconstants,
int width);
void YUY2ToARGBRow_AVX2(const uint8* src_yuy2,
uint8* dst_argb,
- const struct YuvConstants* yuvconstants,
int width);
void UYVYToARGBRow_AVX2(const uint8* src_uyvy,
uint8* dst_argb,
- const struct YuvConstants* yuvconstants,
int width);
+void J422ToARGBRow_SSSE3(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb,
+ int width);
+void J422ToARGBRow_AVX2(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb,
+ int width);
+void I422ToBGRARow_SSSE3(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_bgra,
+ int width);
+void I422ToABGRRow_SSSE3(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_abgr,
+ int width);
void I422ToRGBARow_SSSE3(const uint8* src_y,
const uint8* src_u,
const uint8* src_v,
uint8* dst_rgba,
- const struct YuvConstants* yuvconstants,
int width);
void I422ToARGB4444Row_SSSE3(const uint8* src_y,
const uint8* src_u,
const uint8* src_v,
uint8* dst_argb,
- const struct YuvConstants* yuvconstants,
int width);
void I422ToARGB4444Row_AVX2(const uint8* src_y,
const uint8* src_u,
const uint8* src_v,
uint8* dst_argb,
- const struct YuvConstants* yuvconstants,
int width);
void I422ToARGB1555Row_SSSE3(const uint8* src_y,
const uint8* src_u,
const uint8* src_v,
uint8* dst_argb,
- const struct YuvConstants* yuvconstants,
int width);
void I422ToARGB1555Row_AVX2(const uint8* src_y,
const uint8* src_u,
const uint8* src_v,
uint8* dst_argb,
- const struct YuvConstants* yuvconstants,
int width);
void I422ToRGB565Row_SSSE3(const uint8* src_y,
const uint8* src_u,
const uint8* src_v,
uint8* dst_argb,
- const struct YuvConstants* yuvconstants,
int width);
void I422ToRGB565Row_AVX2(const uint8* src_y,
const uint8* src_u,
const uint8* src_v,
uint8* dst_argb,
- const struct YuvConstants* yuvconstants,
int width);
void I422ToRGB24Row_SSSE3(const uint8* src_y,
const uint8* src_u,
const uint8* src_v,
uint8* dst_rgb24,
- const struct YuvConstants* yuvconstants,
int width);
void I422ToRGB24Row_AVX2(const uint8* src_y,
const uint8* src_u,
const uint8* src_v,
uint8* dst_rgb24,
- const struct YuvConstants* yuvconstants,
int width);
+void I422ToRAWRow_SSSE3(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_raw,
+ int width);
+void I422ToRAWRow_AVX2(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_raw,
+ int width);
void I422ToARGBRow_Any_AVX2(const uint8* src_y,
const uint8* src_u,
const uint8* src_v,
uint8* dst_argb,
- const struct YuvConstants* yuvconstants,
int width);
+void I422ToBGRARow_Any_AVX2(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb,
+ int width);
void I422ToRGBARow_Any_AVX2(const uint8* src_y,
const uint8* src_u,
const uint8* src_v,
uint8* dst_argb,
- const struct YuvConstants* yuvconstants,
int width);
+void I422ToABGRRow_Any_AVX2(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb,
+ int width);
void I444ToARGBRow_Any_SSSE3(const uint8* src_y,
const uint8* src_u,
const uint8* src_v,
uint8* dst_argb,
- const struct YuvConstants* yuvconstants,
int width);
void I444ToARGBRow_Any_AVX2(const uint8* src_y,
const uint8* src_u,
const uint8* src_v,
uint8* dst_argb,
- const struct YuvConstants* yuvconstants,
int width);
void I422ToARGBRow_Any_SSSE3(const uint8* src_y,
const uint8* src_u,
const uint8* src_v,
uint8* dst_argb,
- const struct YuvConstants* yuvconstants,
int width);
-void I422AlphaToARGBRow_Any_SSSE3(const uint8* y_buf,
- const uint8* u_buf,
- const uint8* v_buf,
- const uint8* a_buf,
- uint8* dst_argb,
- const struct YuvConstants* yuvconstants,
- int width);
-void I422AlphaToARGBRow_Any_AVX2(const uint8* y_buf,
- const uint8* u_buf,
- const uint8* v_buf,
- const uint8* a_buf,
- uint8* dst_argb,
- const struct YuvConstants* yuvconstants,
- int width);
void I411ToARGBRow_Any_SSSE3(const uint8* src_y,
const uint8* src_u,
const uint8* src_v,
uint8* dst_argb,
- const struct YuvConstants* yuvconstants,
int width);
void I411ToARGBRow_Any_AVX2(const uint8* src_y,
const uint8* src_u,
const uint8* src_v,
uint8* dst_argb,
- const struct YuvConstants* yuvconstants,
int width);
void NV12ToARGBRow_Any_SSSE3(const uint8* src_y,
const uint8* src_uv,
uint8* dst_argb,
- const struct YuvConstants* yuvconstants,
int width);
-void NV12ToARGBRow_Any_AVX2(const uint8* src_y,
- const uint8* src_uv,
- uint8* dst_argb,
- const struct YuvConstants* yuvconstants,
- int width);
void NV21ToARGBRow_Any_SSSE3(const uint8* src_y,
const uint8* src_vu,
uint8* dst_argb,
- const struct YuvConstants* yuvconstants,
int width);
+void NV12ToARGBRow_Any_AVX2(const uint8* src_y,
+ const uint8* src_uv,
+ uint8* dst_argb,
+ int width);
void NV21ToARGBRow_Any_AVX2(const uint8* src_y,
const uint8* src_vu,
uint8* dst_argb,
- const struct YuvConstants* yuvconstants,
int width);
void NV12ToRGB565Row_Any_SSSE3(const uint8* src_y,
const uint8* src_uv,
uint8* dst_argb,
- const struct YuvConstants* yuvconstants,
int width);
+void NV21ToRGB565Row_Any_SSSE3(const uint8* src_y,
+ const uint8* src_vu,
+ uint8* dst_argb,
+ int width);
void NV12ToRGB565Row_Any_AVX2(const uint8* src_y,
const uint8* src_uv,
uint8* dst_argb,
- const struct YuvConstants* yuvconstants,
int width);
+void NV21ToRGB565Row_Any_AVX2(const uint8* src_y,
+ const uint8* src_vu,
+ uint8* dst_argb,
+ int width);
void YUY2ToARGBRow_Any_SSSE3(const uint8* src_yuy2,
uint8* dst_argb,
- const struct YuvConstants* yuvconstants,
int width);
void UYVYToARGBRow_Any_SSSE3(const uint8* src_uyvy,
uint8* dst_argb,
- const struct YuvConstants* yuvconstants,
int width);
void YUY2ToARGBRow_Any_AVX2(const uint8* src_yuy2,
uint8* dst_argb,
- const struct YuvConstants* yuvconstants,
int width);
void UYVYToARGBRow_Any_AVX2(const uint8* src_uyvy,
uint8* dst_argb,
- const struct YuvConstants* yuvconstants,
int width);
+void J422ToARGBRow_Any_SSSE3(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb,
+ int width);
+void J422ToARGBRow_Any_AVX2(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb,
+ int width);
+void I422ToBGRARow_Any_SSSE3(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_bgra,
+ int width);
+void I422ToABGRRow_Any_SSSE3(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_abgr,
+ int width);
void I422ToRGBARow_Any_SSSE3(const uint8* src_y,
const uint8* src_u,
const uint8* src_v,
uint8* dst_rgba,
- const struct YuvConstants* yuvconstants,
int width);
void I422ToARGB4444Row_Any_SSSE3(const uint8* src_y,
const uint8* src_u,
const uint8* src_v,
uint8* dst_rgba,
- const struct YuvConstants* yuvconstants,
int width);
void I422ToARGB4444Row_Any_AVX2(const uint8* src_y,
const uint8* src_u,
const uint8* src_v,
uint8* dst_rgba,
- const struct YuvConstants* yuvconstants,
int width);
void I422ToARGB1555Row_Any_SSSE3(const uint8* src_y,
const uint8* src_u,
const uint8* src_v,
uint8* dst_rgba,
- const struct YuvConstants* yuvconstants,
int width);
void I422ToARGB1555Row_Any_AVX2(const uint8* src_y,
const uint8* src_u,
const uint8* src_v,
uint8* dst_rgba,
- const struct YuvConstants* yuvconstants,
int width);
void I422ToRGB565Row_Any_SSSE3(const uint8* src_y,
const uint8* src_u,
const uint8* src_v,
uint8* dst_rgba,
- const struct YuvConstants* yuvconstants,
int width);
void I422ToRGB565Row_Any_AVX2(const uint8* src_y,
const uint8* src_u,
const uint8* src_v,
uint8* dst_rgba,
- const struct YuvConstants* yuvconstants,
int width);
void I422ToRGB24Row_Any_SSSE3(const uint8* src_y,
const uint8* src_u,
const uint8* src_v,
uint8* dst_argb,
- const struct YuvConstants* yuvconstants,
int width);
void I422ToRGB24Row_Any_AVX2(const uint8* src_y,
const uint8* src_u,
const uint8* src_v,
uint8* dst_argb,
- const struct YuvConstants* yuvconstants,
int width);
+void I422ToRAWRow_Any_SSSE3(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb,
+ int width);
+void I422ToRAWRow_Any_AVX2(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb,
+ int width);
void I400ToARGBRow_C(const uint8* src_y, uint8* dst_argb, int width);
void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int width);
@@ -1456,23 +1365,13 @@
// ARGB preattenuated alpha blend.
void ARGBBlendRow_SSSE3(const uint8* src_argb, const uint8* src_argb1,
uint8* dst_argb, int width);
+void ARGBBlendRow_SSE2(const uint8* src_argb, const uint8* src_argb1,
+ uint8* dst_argb, int width);
void ARGBBlendRow_NEON(const uint8* src_argb, const uint8* src_argb1,
uint8* dst_argb, int width);
void ARGBBlendRow_C(const uint8* src_argb, const uint8* src_argb1,
uint8* dst_argb, int width);
-// Unattenuated planar alpha blend.
-void BlendPlaneRow_SSSE3(const uint8* src0, const uint8* src1,
- const uint8* alpha, uint8* dst, int width);
-void BlendPlaneRow_Any_SSSE3(const uint8* src0, const uint8* src1,
- const uint8* alpha, uint8* dst, int width);
-void BlendPlaneRow_AVX2(const uint8* src0, const uint8* src1,
- const uint8* alpha, uint8* dst, int width);
-void BlendPlaneRow_Any_AVX2(const uint8* src0, const uint8* src1,
- const uint8* alpha, uint8* dst, int width);
-void BlendPlaneRow_C(const uint8* src0, const uint8* src1,
- const uint8* alpha, uint8* dst, int width);
-
// ARGB multiply images. Same API as Blend, but these require
// pointer and width alignment for SSE2.
void ARGBMultiplyRow_C(const uint8* src_argb, const uint8* src_argb1,
@@ -1523,32 +1422,26 @@
void ARGBSubtractRow_Any_NEON(const uint8* src_argb, const uint8* src_argb1,
uint8* dst_argb, int width);
-void ARGBToRGB24Row_Any_SSSE3(const uint8* src_argb, uint8* dst_rgb, int width);
-void ARGBToRAWRow_Any_SSSE3(const uint8* src_argb, uint8* dst_rgb, int width);
-void ARGBToRGB565Row_Any_SSE2(const uint8* src_argb, uint8* dst_rgb, int width);
-void ARGBToARGB1555Row_Any_SSE2(const uint8* src_argb, uint8* dst_rgb,
- int width);
-void ARGBToARGB4444Row_Any_SSE2(const uint8* src_argb, uint8* dst_rgb,
- int width);
+void ARGBToRGB24Row_Any_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix);
+void ARGBToRAWRow_Any_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix);
+void ARGBToRGB565Row_Any_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix);
+void ARGBToARGB1555Row_Any_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix);
+void ARGBToARGB4444Row_Any_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix);
void ARGBToRGB565DitherRow_Any_SSE2(const uint8* src_argb, uint8* dst_rgb,
- const uint32 dither4, int width);
+ const uint32 dither4, int pix);
void ARGBToRGB565DitherRow_Any_AVX2(const uint8* src_argb, uint8* dst_rgb,
- const uint32 dither4, int width);
+ const uint32 dither4, int pix);
-void ARGBToRGB565Row_Any_AVX2(const uint8* src_argb, uint8* dst_rgb, int width);
-void ARGBToARGB1555Row_Any_AVX2(const uint8* src_argb, uint8* dst_rgb,
- int width);
-void ARGBToARGB4444Row_Any_AVX2(const uint8* src_argb, uint8* dst_rgb,
- int width);
+void ARGBToRGB565Row_Any_AVX2(const uint8* src_argb, uint8* dst_rgb, int pix);
+void ARGBToARGB1555Row_Any_AVX2(const uint8* src_argb, uint8* dst_rgb, int pix);
+void ARGBToARGB4444Row_Any_AVX2(const uint8* src_argb, uint8* dst_rgb, int pix);
-void ARGBToRGB24Row_Any_NEON(const uint8* src_argb, uint8* dst_rgb, int width);
-void ARGBToRAWRow_Any_NEON(const uint8* src_argb, uint8* dst_rgb, int width);
-void ARGBToRGB565Row_Any_NEON(const uint8* src_argb, uint8* dst_rgb, int width);
-void ARGBToARGB1555Row_Any_NEON(const uint8* src_argb, uint8* dst_rgb,
- int width);
-void ARGBToARGB4444Row_Any_NEON(const uint8* src_argb, uint8* dst_rgb,
- int width);
+void ARGBToRGB24Row_Any_NEON(const uint8* src_argb, uint8* dst_rgb, int pix);
+void ARGBToRAWRow_Any_NEON(const uint8* src_argb, uint8* dst_rgb, int pix);
+void ARGBToRGB565Row_Any_NEON(const uint8* src_argb, uint8* dst_rgb, int pix);
+void ARGBToARGB1555Row_Any_NEON(const uint8* src_argb, uint8* dst_rgb, int pix);
+void ARGBToARGB4444Row_Any_NEON(const uint8* src_argb, uint8* dst_rgb, int pix);
void ARGBToRGB565DitherRow_Any_NEON(const uint8* src_argb, uint8* dst_rgb,
const uint32 dither4, int width);
@@ -1556,169 +1449,186 @@
const uint8* src_u,
const uint8* src_v,
uint8* dst_argb,
- const struct YuvConstants* yuvconstants,
int width);
void I422ToARGBRow_Any_NEON(const uint8* src_y,
const uint8* src_u,
const uint8* src_v,
uint8* dst_argb,
- const struct YuvConstants* yuvconstants,
int width);
-void I422AlphaToARGBRow_Any_NEON(const uint8* src_y,
- const uint8* src_u,
- const uint8* src_v,
- const uint8* src_a,
- uint8* dst_argb,
- const struct YuvConstants* yuvconstants,
- int width);
void I411ToARGBRow_Any_NEON(const uint8* src_y,
const uint8* src_u,
const uint8* src_v,
uint8* dst_argb,
- const struct YuvConstants* yuvconstants,
int width);
+void I422ToBGRARow_Any_NEON(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb,
+ int width);
+void I422ToABGRRow_Any_NEON(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb,
+ int width);
void I422ToRGBARow_Any_NEON(const uint8* src_y,
const uint8* src_u,
const uint8* src_v,
uint8* dst_argb,
- const struct YuvConstants* yuvconstants,
int width);
void I422ToRGB24Row_Any_NEON(const uint8* src_y,
const uint8* src_u,
const uint8* src_v,
uint8* dst_argb,
- const struct YuvConstants* yuvconstants,
int width);
+void I422ToRAWRow_Any_NEON(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb,
+ int width);
void I422ToARGB4444Row_Any_NEON(const uint8* src_y,
const uint8* src_u,
const uint8* src_v,
uint8* dst_argb,
- const struct YuvConstants* yuvconstants,
int width);
void I422ToARGB1555Row_Any_NEON(const uint8* src_y,
const uint8* src_u,
const uint8* src_v,
uint8* dst_argb,
- const struct YuvConstants* yuvconstants,
int width);
void I422ToRGB565Row_Any_NEON(const uint8* src_y,
const uint8* src_u,
const uint8* src_v,
uint8* dst_argb,
- const struct YuvConstants* yuvconstants,
int width);
void NV12ToARGBRow_Any_NEON(const uint8* src_y,
const uint8* src_uv,
uint8* dst_argb,
- const struct YuvConstants* yuvconstants,
int width);
void NV21ToARGBRow_Any_NEON(const uint8* src_y,
- const uint8* src_vu,
+ const uint8* src_uv,
uint8* dst_argb,
- const struct YuvConstants* yuvconstants,
int width);
void NV12ToRGB565Row_Any_NEON(const uint8* src_y,
const uint8* src_uv,
uint8* dst_argb,
- const struct YuvConstants* yuvconstants,
int width);
+void NV21ToRGB565Row_Any_NEON(const uint8* src_y,
+ const uint8* src_uv,
+ uint8* dst_argb,
+ int width);
void YUY2ToARGBRow_Any_NEON(const uint8* src_yuy2,
uint8* dst_argb,
- const struct YuvConstants* yuvconstants,
int width);
void UYVYToARGBRow_Any_NEON(const uint8* src_uyvy,
uint8* dst_argb,
- const struct YuvConstants* yuvconstants,
int width);
-void I422ToARGBRow_DSPR2(const uint8* src_y,
- const uint8* src_u,
- const uint8* src_v,
- uint8* dst_argb,
- const struct YuvConstants* yuvconstants,
- int width);
-void I422ToARGBRow_DSPR2(const uint8* src_y,
- const uint8* src_u,
- const uint8* src_v,
- uint8* dst_argb,
- const struct YuvConstants* yuvconstants,
- int width);
+void I422ToARGBRow_MIPS_DSPR2(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb,
+ int width);
+void I422ToBGRARow_MIPS_DSPR2(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb,
+ int width);
+void I422ToABGRRow_MIPS_DSPR2(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb,
+ int width);
+void I422ToARGBRow_MIPS_DSPR2(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb,
+ int width);
+void I422ToBGRARow_MIPS_DSPR2(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb,
+ int width);
+void I422ToABGRRow_MIPS_DSPR2(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb,
+ int width);
-void YUY2ToYRow_AVX2(const uint8* src_yuy2, uint8* dst_y, int width);
+void YUY2ToYRow_AVX2(const uint8* src_yuy2, uint8* dst_y, int pix);
void YUY2ToUVRow_AVX2(const uint8* src_yuy2, int stride_yuy2,
- uint8* dst_u, uint8* dst_v, int width);
+ uint8* dst_u, uint8* dst_v, int pix);
void YUY2ToUV422Row_AVX2(const uint8* src_yuy2,
- uint8* dst_u, uint8* dst_v, int width);
-void YUY2ToYRow_SSE2(const uint8* src_yuy2, uint8* dst_y, int width);
+ uint8* dst_u, uint8* dst_v, int pix);
+void YUY2ToYRow_SSE2(const uint8* src_yuy2, uint8* dst_y, int pix);
void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2,
- uint8* dst_u, uint8* dst_v, int width);
+ uint8* dst_u, uint8* dst_v, int pix);
void YUY2ToUV422Row_SSE2(const uint8* src_yuy2,
- uint8* dst_u, uint8* dst_v, int width);
-void YUY2ToYRow_NEON(const uint8* src_yuy2, uint8* dst_y, int width);
+ uint8* dst_u, uint8* dst_v, int pix);
+void YUY2ToYRow_NEON(const uint8* src_yuy2, uint8* dst_y, int pix);
void YUY2ToUVRow_NEON(const uint8* src_yuy2, int stride_yuy2,
- uint8* dst_u, uint8* dst_v, int width);
+ uint8* dst_u, uint8* dst_v, int pix);
void YUY2ToUV422Row_NEON(const uint8* src_yuy2,
- uint8* dst_u, uint8* dst_v, int width);
-void YUY2ToYRow_C(const uint8* src_yuy2, uint8* dst_y, int width);
+ uint8* dst_u, uint8* dst_v, int pix);
+void YUY2ToYRow_C(const uint8* src_yuy2, uint8* dst_y, int pix);
void YUY2ToUVRow_C(const uint8* src_yuy2, int stride_yuy2,
- uint8* dst_u, uint8* dst_v, int width);
+ uint8* dst_u, uint8* dst_v, int pix);
void YUY2ToUV422Row_C(const uint8* src_yuy2,
- uint8* dst_u, uint8* dst_v, int width);
-void YUY2ToYRow_Any_AVX2(const uint8* src_yuy2, uint8* dst_y, int width);
+ uint8* dst_u, uint8* dst_v, int pix);
+void YUY2ToYRow_Any_AVX2(const uint8* src_yuy2, uint8* dst_y, int pix);
void YUY2ToUVRow_Any_AVX2(const uint8* src_yuy2, int stride_yuy2,
- uint8* dst_u, uint8* dst_v, int width);
+ uint8* dst_u, uint8* dst_v, int pix);
void YUY2ToUV422Row_Any_AVX2(const uint8* src_yuy2,
- uint8* dst_u, uint8* dst_v, int width);
-void YUY2ToYRow_Any_SSE2(const uint8* src_yuy2, uint8* dst_y, int width);
+ uint8* dst_u, uint8* dst_v, int pix);
+void YUY2ToYRow_Any_SSE2(const uint8* src_yuy2, uint8* dst_y, int pix);
void YUY2ToUVRow_Any_SSE2(const uint8* src_yuy2, int stride_yuy2,
- uint8* dst_u, uint8* dst_v, int width);
+ uint8* dst_u, uint8* dst_v, int pix);
void YUY2ToUV422Row_Any_SSE2(const uint8* src_yuy2,
- uint8* dst_u, uint8* dst_v, int width);
-void YUY2ToYRow_Any_NEON(const uint8* src_yuy2, uint8* dst_y, int width);
+ uint8* dst_u, uint8* dst_v, int pix);
+void YUY2ToYRow_Any_NEON(const uint8* src_yuy2, uint8* dst_y, int pix);
void YUY2ToUVRow_Any_NEON(const uint8* src_yuy2, int stride_yuy2,
- uint8* dst_u, uint8* dst_v, int width);
+ uint8* dst_u, uint8* dst_v, int pix);
void YUY2ToUV422Row_Any_NEON(const uint8* src_yuy2,
- uint8* dst_u, uint8* dst_v, int width);
-void UYVYToYRow_AVX2(const uint8* src_uyvy, uint8* dst_y, int width);
+ uint8* dst_u, uint8* dst_v, int pix);
+void UYVYToYRow_AVX2(const uint8* src_uyvy, uint8* dst_y, int pix);
void UYVYToUVRow_AVX2(const uint8* src_uyvy, int stride_uyvy,
- uint8* dst_u, uint8* dst_v, int width);
+ uint8* dst_u, uint8* dst_v, int pix);
void UYVYToUV422Row_AVX2(const uint8* src_uyvy,
- uint8* dst_u, uint8* dst_v, int width);
-void UYVYToYRow_SSE2(const uint8* src_uyvy, uint8* dst_y, int width);
+ uint8* dst_u, uint8* dst_v, int pix);
+void UYVYToYRow_SSE2(const uint8* src_uyvy, uint8* dst_y, int pix);
void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy,
- uint8* dst_u, uint8* dst_v, int width);
+ uint8* dst_u, uint8* dst_v, int pix);
void UYVYToUV422Row_SSE2(const uint8* src_uyvy,
- uint8* dst_u, uint8* dst_v, int width);
-void UYVYToYRow_AVX2(const uint8* src_uyvy, uint8* dst_y, int width);
+ uint8* dst_u, uint8* dst_v, int pix);
+void UYVYToYRow_AVX2(const uint8* src_uyvy, uint8* dst_y, int pix);
void UYVYToUVRow_AVX2(const uint8* src_uyvy, int stride_uyvy,
- uint8* dst_u, uint8* dst_v, int width);
+ uint8* dst_u, uint8* dst_v, int pix);
void UYVYToUV422Row_AVX2(const uint8* src_uyvy,
- uint8* dst_u, uint8* dst_v, int width);
-void UYVYToYRow_NEON(const uint8* src_uyvy, uint8* dst_y, int width);
+ uint8* dst_u, uint8* dst_v, int pix);
+void UYVYToYRow_NEON(const uint8* src_uyvy, uint8* dst_y, int pix);
void UYVYToUVRow_NEON(const uint8* src_uyvy, int stride_uyvy,
- uint8* dst_u, uint8* dst_v, int width);
+ uint8* dst_u, uint8* dst_v, int pix);
void UYVYToUV422Row_NEON(const uint8* src_uyvy,
- uint8* dst_u, uint8* dst_v, int width);
+ uint8* dst_u, uint8* dst_v, int pix);
-void UYVYToYRow_C(const uint8* src_uyvy, uint8* dst_y, int width);
+void UYVYToYRow_C(const uint8* src_uyvy, uint8* dst_y, int pix);
void UYVYToUVRow_C(const uint8* src_uyvy, int stride_uyvy,
- uint8* dst_u, uint8* dst_v, int width);
+ uint8* dst_u, uint8* dst_v, int pix);
void UYVYToUV422Row_C(const uint8* src_uyvy,
- uint8* dst_u, uint8* dst_v, int width);
-void UYVYToYRow_Any_AVX2(const uint8* src_uyvy, uint8* dst_y, int width);
+ uint8* dst_u, uint8* dst_v, int pix);
+void UYVYToYRow_Any_AVX2(const uint8* src_uyvy, uint8* dst_y, int pix);
void UYVYToUVRow_Any_AVX2(const uint8* src_uyvy, int stride_uyvy,
- uint8* dst_u, uint8* dst_v, int width);
+ uint8* dst_u, uint8* dst_v, int pix);
void UYVYToUV422Row_Any_AVX2(const uint8* src_uyvy,
- uint8* dst_u, uint8* dst_v, int width);
-void UYVYToYRow_Any_SSE2(const uint8* src_uyvy, uint8* dst_y, int width);
+ uint8* dst_u, uint8* dst_v, int pix);
+void UYVYToYRow_Any_SSE2(const uint8* src_uyvy, uint8* dst_y, int pix);
void UYVYToUVRow_Any_SSE2(const uint8* src_uyvy, int stride_uyvy,
- uint8* dst_u, uint8* dst_v, int width);
+ uint8* dst_u, uint8* dst_v, int pix);
void UYVYToUV422Row_Any_SSE2(const uint8* src_uyvy,
- uint8* dst_u, uint8* dst_v, int width);
-void UYVYToYRow_Any_NEON(const uint8* src_uyvy, uint8* dst_y, int width);
+ uint8* dst_u, uint8* dst_v, int pix);
+void UYVYToYRow_Any_NEON(const uint8* src_uyvy, uint8* dst_y, int pix);
void UYVYToUVRow_Any_NEON(const uint8* src_uyvy, int stride_uyvy,
- uint8* dst_u, uint8* dst_v, int width);
+ uint8* dst_u, uint8* dst_v, int pix);
void UYVYToUV422Row_Any_NEON(const uint8* src_uyvy,
- uint8* dst_u, uint8* dst_v, int width);
+ uint8* dst_u, uint8* dst_v, int pix);
void I422ToYUY2Row_C(const uint8* src_y,
const uint8* src_u,
@@ -1763,6 +1673,7 @@
// Effects related row functions.
void ARGBAttenuateRow_C(const uint8* src_argb, uint8* dst_argb, int width);
+void ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width);
void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width);
void ARGBAttenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, int width);
void ARGBAttenuateRow_NEON(const uint8* src_argb, uint8* dst_argb, int width);
@@ -1842,6 +1753,9 @@
void InterpolateRow_C(uint8* dst_ptr, const uint8* src_ptr,
ptrdiff_t src_stride_ptr,
int width, int source_y_fraction);
+void InterpolateRow_SSE2(uint8* dst_ptr, const uint8* src_ptr,
+ ptrdiff_t src_stride_ptr, int width,
+ int source_y_fraction);
void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
ptrdiff_t src_stride_ptr, int width,
int source_y_fraction);
@@ -1851,12 +1765,15 @@
void InterpolateRow_NEON(uint8* dst_ptr, const uint8* src_ptr,
ptrdiff_t src_stride_ptr, int width,
int source_y_fraction);
-void InterpolateRow_DSPR2(uint8* dst_ptr, const uint8* src_ptr,
- ptrdiff_t src_stride_ptr, int width,
- int source_y_fraction);
+void InterpolateRow_MIPS_DSPR2(uint8* dst_ptr, const uint8* src_ptr,
+ ptrdiff_t src_stride_ptr, int width,
+ int source_y_fraction);
void InterpolateRow_Any_NEON(uint8* dst_ptr, const uint8* src_ptr,
ptrdiff_t src_stride_ptr, int width,
int source_y_fraction);
+void InterpolateRow_Any_SSE2(uint8* dst_ptr, const uint8* src_ptr,
+ ptrdiff_t src_stride_ptr, int width,
+ int source_y_fraction);
void InterpolateRow_Any_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
ptrdiff_t src_stride_ptr, int width,
int source_y_fraction);
@@ -1863,9 +1780,9 @@
void InterpolateRow_Any_AVX2(uint8* dst_ptr, const uint8* src_ptr,
ptrdiff_t src_stride_ptr, int width,
int source_y_fraction);
-void InterpolateRow_Any_DSPR2(uint8* dst_ptr, const uint8* src_ptr,
- ptrdiff_t src_stride_ptr, int width,
- int source_y_fraction);
+void InterpolateRow_Any_MIPS_DSPR2(uint8* dst_ptr, const uint8* src_ptr,
+ ptrdiff_t src_stride_ptr, int width,
+ int source_y_fraction);
void InterpolateRow_16_C(uint16* dst_ptr, const uint16* src_ptr,
ptrdiff_t src_stride_ptr,
--- a/third_party/libyuv/include/libyuv/scale_argb.h
+++ b/third_party/libyuv/include/libyuv/scale_argb.h
@@ -35,6 +35,7 @@
int clip_x, int clip_y, int clip_width, int clip_height,
enum FilterMode filtering);
+// TODO(fbarchard): Implement this.
// Scale with YUV conversion to ARGB and clipping.
LIBYUV_API
int YUVToARGBScaleClip(const uint8* src_y, int src_stride_y,
--- a/third_party/libyuv/include/libyuv/scale_row.h
+++ b/third_party/libyuv/include/libyuv/scale_row.h
@@ -23,27 +23,7 @@
(defined(__i386__) && !defined(__SSE2__))
#define LIBYUV_DISABLE_X86
#endif
-// MemorySanitizer does not support assembly code yet. http://crbug.com/344505
-#if defined(__has_feature)
-#if __has_feature(memory_sanitizer)
-#define LIBYUV_DISABLE_X86
-#endif
-#endif
-// GCC >= 4.7.0 required for AVX2.
-#if defined(__GNUC__) && (defined(__x86_64__) || defined(__i386__))
-#if (__GNUC__ > 4) || (__GNUC__ == 4 && (__GNUC_MINOR__ >= 7))
-#define GCC_HAS_AVX2 1
-#endif // GNUC >= 4.7
-#endif // __GNUC__
-
-// clang >= 3.4.0 required for AVX2.
-#if defined(__clang__) && (defined(__x86_64__) || defined(__i386__))
-#if (__clang_major__ > 3) || (__clang_major__ == 3 && (__clang_minor__ >= 4))
-#define CLANG_HAS_AVX2 1
-#endif // clang >= 3.4
-#endif // __clang__
-
// Visual C 2012 required for AVX2.
#if defined(_M_IX86) && !defined(__clang__) && \
defined(_MSC_VER) && _MSC_VER >= 1700
@@ -62,23 +42,24 @@
#define HAS_SCALEARGBROWDOWNEVEN_SSE2
#define HAS_SCALECOLSUP2_SSE2
#define HAS_SCALEFILTERCOLS_SSSE3
-#define HAS_SCALEROWDOWN2_SSSE3
+#define HAS_SCALEROWDOWN2_SSE2
#define HAS_SCALEROWDOWN34_SSSE3
#define HAS_SCALEROWDOWN38_SSSE3
-#define HAS_SCALEROWDOWN4_SSSE3
-#define HAS_SCALEADDROW_SSE2
+#define HAS_SCALEROWDOWN4_SSE2
#endif
-// The following are available on all x86 platforms, but
-// require VS2012, clang 3.4 or gcc 4.7.
-// The code supports NaCL but requires a new compiler and validator.
-#if !defined(LIBYUV_DISABLE_X86) && (defined(VISUALC_HAS_AVX2) || \
- defined(CLANG_HAS_AVX2) || defined(GCC_HAS_AVX2))
+// The following are available on VS2012:
+#if !defined(LIBYUV_DISABLE_X86) && defined(VISUALC_HAS_AVX2)
#define HAS_SCALEADDROW_AVX2
#define HAS_SCALEROWDOWN2_AVX2
#define HAS_SCALEROWDOWN4_AVX2
#endif
+// The following are available on Visual C:
+#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && !defined(__clang__)
+#define HAS_SCALEADDROW_SSE2
+#endif
+
// The following are available on Neon platforms:
#if !defined(LIBYUV_DISABLE_NEON) && !defined(__native_client__) && \
(defined(__ARM_NEON__) || defined(LIBYUV_NEON) || defined(__aarch64__))
@@ -96,10 +77,10 @@
// The following are available on Mips platforms:
#if !defined(LIBYUV_DISABLE_MIPS) && !defined(__native_client__) && \
defined(__mips__) && defined(__mips_dsp) && (__mips_dsp_rev >= 2)
-#define HAS_SCALEROWDOWN2_DSPR2
-#define HAS_SCALEROWDOWN4_DSPR2
-#define HAS_SCALEROWDOWN34_DSPR2
-#define HAS_SCALEROWDOWN38_DSPR2
+#define HAS_SCALEROWDOWN2_MIPS_DSPR2
+#define HAS_SCALEROWDOWN4_MIPS_DSPR2
+#define HAS_SCALEROWDOWN34_MIPS_DSPR2
+#define HAS_SCALEROWDOWN38_MIPS_DSPR2
#endif
// Scale ARGB vertically with bilinear interpolation.
@@ -152,8 +133,6 @@
uint16* dst, int dst_width);
void ScaleRowDown2Box_C(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst, int dst_width);
-void ScaleRowDown2Box_Odd_C(const uint8* src_ptr, ptrdiff_t src_stride,
- uint8* dst, int dst_width);
void ScaleRowDown2Box_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
uint16* dst, int dst_width);
void ScaleRowDown4_C(const uint8* src_ptr, ptrdiff_t src_stride,
@@ -235,12 +214,12 @@
int dst_width, int x, int dx);
// Specialized scalers for x86.
-void ScaleRowDown2_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
- uint8* dst_ptr, int dst_width);
-void ScaleRowDown2Linear_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
- uint8* dst_ptr, int dst_width);
-void ScaleRowDown2Box_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
- uint8* dst_ptr, int dst_width);
+void ScaleRowDown2_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width);
+void ScaleRowDown2Linear_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width);
+void ScaleRowDown2Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width);
void ScaleRowDown2_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width);
void ScaleRowDown2Linear_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
@@ -247,10 +226,10 @@
uint8* dst_ptr, int dst_width);
void ScaleRowDown2Box_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width);
-void ScaleRowDown4_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
- uint8* dst_ptr, int dst_width);
-void ScaleRowDown4Box_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
- uint8* dst_ptr, int dst_width);
+void ScaleRowDown4_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width);
+void ScaleRowDown4Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width);
void ScaleRowDown4_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width);
void ScaleRowDown4Box_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
@@ -272,26 +251,22 @@
void ScaleRowDown38_2_Box_SSSE3(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width);
-void ScaleRowDown2_Any_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
- uint8* dst_ptr, int dst_width);
-void ScaleRowDown2Linear_Any_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
- uint8* dst_ptr, int dst_width);
-void ScaleRowDown2Box_Any_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
- uint8* dst_ptr, int dst_width);
-void ScaleRowDown2Box_Odd_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
- uint8* dst_ptr, int dst_width);
+void ScaleRowDown2_Any_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width);
+void ScaleRowDown2Linear_Any_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width);
+void ScaleRowDown2Box_Any_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width);
void ScaleRowDown2_Any_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width);
void ScaleRowDown2Linear_Any_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width);
void ScaleRowDown2Box_Any_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width);
+void ScaleRowDown4_Any_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width);
+void ScaleRowDown4Box_Any_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width);
-void ScaleRowDown2Box_Odd_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
- uint8* dst_ptr, int dst_width);
-void ScaleRowDown4_Any_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
- uint8* dst_ptr, int dst_width);
-void ScaleRowDown4Box_Any_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
- uint8* dst_ptr, int dst_width);
void ScaleRowDown4_Any_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width);
void ScaleRowDown4Box_Any_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
@@ -443,8 +418,6 @@
uint8* dst, int dst_width);
void ScaleRowDown2Box_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst, int dst_width);
-void ScaleRowDown2Box_Odd_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
- uint8* dst, int dst_width);
void ScaleRowDown4_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width);
void ScaleRowDown4Box_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
@@ -474,26 +447,28 @@
void ScaleFilterCols_Any_NEON(uint8* dst_ptr, const uint8* src_ptr,
int dst_width, int x, int dx);
-void ScaleRowDown2_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
- uint8* dst, int dst_width);
-void ScaleRowDown2Box_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
- uint8* dst, int dst_width);
-void ScaleRowDown4_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
- uint8* dst, int dst_width);
-void ScaleRowDown4Box_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
- uint8* dst, int dst_width);
-void ScaleRowDown34_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
- uint8* dst, int dst_width);
-void ScaleRowDown34_0_Box_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
- uint8* d, int dst_width);
-void ScaleRowDown34_1_Box_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
- uint8* d, int dst_width);
-void ScaleRowDown38_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
- uint8* dst, int dst_width);
-void ScaleRowDown38_2_Box_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
- uint8* dst_ptr, int dst_width);
-void ScaleRowDown38_3_Box_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
- uint8* dst_ptr, int dst_width);
+
+void ScaleRowDown2_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst, int dst_width);
+void ScaleRowDown2Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst, int dst_width);
+void ScaleRowDown4_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst, int dst_width);
+void ScaleRowDown4Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst, int dst_width);
+void ScaleRowDown34_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst, int dst_width);
+void ScaleRowDown34_0_Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* d, int dst_width);
+void ScaleRowDown34_1_Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* d, int dst_width);
+void ScaleRowDown38_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst, int dst_width);
+void ScaleRowDown38_2_Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width);
+void ScaleRowDown38_3_Box_MIPS_DSPR2(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width);
#ifdef __cplusplus
} // extern "C"
--- a/third_party/libyuv/include/libyuv/version.h
+++ b/third_party/libyuv/include/libyuv/version.h
@@ -11,6 +11,6 @@
#ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT
#define INCLUDE_LIBYUV_VERSION_H_
-#define LIBYUV_VERSION 1601
+#define LIBYUV_VERSION 1456
#endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT
--- a/third_party/libyuv/include/libyuv/video_common.h
+++ b/third_party/libyuv/include/libyuv/video_common.h
@@ -62,7 +62,7 @@
// 2 Secondary YUV formats: row biplanar.
FOURCC_M420 = FOURCC('M', '4', '2', '0'),
- FOURCC_Q420 = FOURCC('Q', '4', '2', '0'), // deprecated.
+ FOURCC_Q420 = FOURCC('Q', '4', '2', '0'), // deprecated.
// 9 Primary RGB formats: 4 32 bpp, 2 24 bpp, 3 16 bpp.
FOURCC_ARGB = FOURCC('A', 'R', 'G', 'B'),
@@ -90,8 +90,7 @@
FOURCC_YV24 = FOURCC('Y', 'V', '2', '4'),
FOURCC_YU12 = FOURCC('Y', 'U', '1', '2'), // Linux version of I420.
FOURCC_J420 = FOURCC('J', '4', '2', '0'),
- FOURCC_J400 = FOURCC('J', '4', '0', '0'), // unofficial fourcc
- FOURCC_H420 = FOURCC('H', '4', '2', '0'), // unofficial fourcc
+ FOURCC_J400 = FOURCC('J', '4', '0', '0'),
// 14 Auxiliary aliases. CanonicalFourCC() maps these to canonical fourcc.
FOURCC_IYUV = FOURCC('I', 'Y', 'U', 'V'), // Alias for I420.
@@ -151,7 +150,6 @@
FOURCC_BPP_YU12 = 12,
FOURCC_BPP_J420 = 12,
FOURCC_BPP_J400 = 8,
- FOURCC_BPP_H420 = 12,
FOURCC_BPP_MJPG = 0, // 0 means unknown.
FOURCC_BPP_H264 = 0,
FOURCC_BPP_IYUV = 12,
--- a/third_party/libyuv/source/compare.cc
+++ b/third_party/libyuv/source/compare.cc
@@ -17,7 +17,6 @@
#endif
#include "libyuv/basic_types.h"
-#include "libyuv/compare_row.h"
#include "libyuv/cpu_id.h"
#include "libyuv/row.h"
#include "libyuv/video_common.h"
@@ -28,12 +27,29 @@
#endif
// hash seed of 5381 recommended.
+// Internal C version of HashDjb2 with int sized count for efficiency.
+uint32 HashDjb2_C(const uint8* src, int count, uint32 seed);
+
+// This module is for Visual C x86
+#if !defined(LIBYUV_DISABLE_X86) && \
+ (defined(_M_IX86) || \
+ (defined(__x86_64__) || (defined(__i386__) && !defined(__pic__))))
+#define HAS_HASHDJB2_SSE41
+uint32 HashDjb2_SSE41(const uint8* src, int count, uint32 seed);
+
+#ifdef VISUALC_HAS_AVX2
+#define HAS_HASHDJB2_AVX2
+uint32 HashDjb2_AVX2(const uint8* src, int count, uint32 seed);
+#endif
+
+#endif // HAS_HASHDJB2_SSE41
+
+// hash seed of 5381 recommended.
LIBYUV_API
uint32 HashDjb2(const uint8* src, uint64 count, uint32 seed) {
const int kBlockSize = 1 << 15; // 32768;
int remainder;
- uint32 (*HashDjb2_SSE)(const uint8* src, int count, uint32 seed) =
- HashDjb2_C;
+ uint32 (*HashDjb2_SSE)(const uint8* src, int count, uint32 seed) = HashDjb2_C;
#if defined(HAS_HASHDJB2_SSE41)
if (TestCpuFlag(kCpuHasSSE41)) {
HashDjb2_SSE = HashDjb2_SSE41;
@@ -110,6 +126,23 @@
}
return fourcc;
}
+
+uint32 SumSquareError_C(const uint8* src_a, const uint8* src_b, int count);
+#if !defined(LIBYUV_DISABLE_NEON) && \
+ (defined(__ARM_NEON__) || defined(LIBYUV_NEON) || defined(__aarch64__))
+#define HAS_SUMSQUAREERROR_NEON
+uint32 SumSquareError_NEON(const uint8* src_a, const uint8* src_b, int count);
+#endif
+#if !defined(LIBYUV_DISABLE_X86) && \
+ (defined(_M_IX86) || defined(__x86_64__) || defined(__i386__))
+#define HAS_SUMSQUAREERROR_SSE2
+uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count);
+#endif
+
+#ifdef VISUALC_HAS_AVX2
+#define HAS_SUMSQUAREERROR_AVX2
+uint32 SumSquareError_AVX2(const uint8* src_a, const uint8* src_b, int count);
+#endif
// TODO(fbarchard): Refactor into row function.
LIBYUV_API
--- a/third_party/libyuv/source/compare_common.cc
+++ b/third_party/libyuv/source/compare_common.cc
@@ -10,8 +10,6 @@
#include "libyuv/basic_types.h"
-#include "libyuv/compare_row.h"
-
#ifdef __cplusplus
namespace libyuv {
extern "C" {
--- a/third_party/libyuv/source/compare_gcc.cc
+++ b/third_party/libyuv/source/compare_gcc.cc
@@ -9,8 +9,6 @@
*/
#include "libyuv/basic_types.h"
-
-#include "libyuv/compare_row.h"
#include "libyuv/row.h"
#ifdef __cplusplus
@@ -18,13 +16,11 @@
extern "C" {
#endif
-// This module is for GCC x86 and x64.
-#if !defined(LIBYUV_DISABLE_X86) && \
- (defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER)))
+#if !defined(LIBYUV_DISABLE_X86) && (defined(__x86_64__) || defined(__i386__))
uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count) {
uint32 sse;
- asm volatile (
+ asm volatile ( // NOLINT
"pxor %%xmm0,%%xmm0 \n"
"pxor %%xmm5,%%xmm5 \n"
LABELALIGN
@@ -58,10 +54,15 @@
"+r"(count), // %2
"=g"(sse) // %3
:: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
- );
+ ); // NOLINT
return sse;
}
+#endif // defined(__x86_64__) || defined(__i386__)
+
+#if !defined(LIBYUV_DISABLE_X86) && \
+ (defined(__x86_64__) || (defined(__i386__) && !defined(__pic__)))
+#define HAS_HASHDJB2_SSE41
static uvec32 kHash16x33 = { 0x92d9e201, 0, 0, 0 }; // 33 ^ 16
static uvec32 kHashMul0 = {
0x0c3525e1, // 33 ^ 15
@@ -90,7 +91,7 @@
uint32 HashDjb2_SSE41(const uint8* src, int count, uint32 seed) {
uint32 hash;
- asm volatile (
+ asm volatile ( // NOLINT
"movd %2,%%xmm0 \n"
"pxor %%xmm7,%%xmm7 \n"
"movdqa %4,%%xmm6 \n"
@@ -139,7 +140,7 @@
"m"(kHashMul3) // %8
: "memory", "cc"
, "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
- );
+ ); // NOLINT
return hash;
}
#endif // defined(__x86_64__) || (defined(__i386__) && !defined(__pic__)))
--- a/third_party/libyuv/source/compare_neon.cc
+++ b/third_party/libyuv/source/compare_neon.cc
@@ -9,8 +9,6 @@
*/
#include "libyuv/basic_types.h"
-
-#include "libyuv/compare_row.h"
#include "libyuv/row.h"
#ifdef __cplusplus
@@ -29,6 +27,7 @@
"vmov.u8 q9, #0 \n"
"vmov.u8 q11, #0 \n"
+ ".p2align 2 \n"
"1: \n"
MEMACCESS(0)
"vld1.8 {q0}, [%0]! \n"
--- a/third_party/libyuv/source/compare_neon64.cc
+++ b/third_party/libyuv/source/compare_neon64.cc
@@ -9,8 +9,6 @@
*/
#include "libyuv/basic_types.h"
-
-#include "libyuv/compare_row.h"
#include "libyuv/row.h"
#ifdef __cplusplus
@@ -28,6 +26,7 @@
"eor v17.16b, v17.16b, v17.16b \n"
"eor v19.16b, v19.16b, v19.16b \n"
+ ".p2align 2 \n"
"1: \n"
MEMACCESS(0)
"ld1 {v0.16b}, [%0], #16 \n"
--- a/third_party/libyuv/source/compare_win.cc
+++ b/third_party/libyuv/source/compare_win.cc
@@ -9,8 +9,6 @@
*/
#include "libyuv/basic_types.h"
-
-#include "libyuv/compare_row.h"
#include "libyuv/row.h"
#ifdef __cplusplus
@@ -18,8 +16,9 @@
extern "C" {
#endif
-// This module is for 32 bit Visual C x86 and clangcl
-#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86)
+// This module is for Visual C x86.
+#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && \
+ defined(_MSC_VER) && !defined(__clang__)
__declspec(naked)
uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count) {
@@ -101,26 +100,27 @@
}
#endif // _MSC_VER >= 1700
-uvec32 kHash16x33 = { 0x92d9e201, 0, 0, 0 }; // 33 ^ 16
-uvec32 kHashMul0 = {
+#define HAS_HASHDJB2_SSE41
+static uvec32 kHash16x33 = { 0x92d9e201, 0, 0, 0 }; // 33 ^ 16
+static uvec32 kHashMul0 = {
0x0c3525e1, // 33 ^ 15
0xa3476dc1, // 33 ^ 14
0x3b4039a1, // 33 ^ 13
0x4f5f0981, // 33 ^ 12
};
-uvec32 kHashMul1 = {
+static uvec32 kHashMul1 = {
0x30f35d61, // 33 ^ 11
0x855cb541, // 33 ^ 10
0x040a9121, // 33 ^ 9
0x747c7101, // 33 ^ 8
};
-uvec32 kHashMul2 = {
+static uvec32 kHashMul2 = {
0xec41d4e1, // 33 ^ 7
0x4cfa3cc1, // 33 ^ 6
0x025528a1, // 33 ^ 5
0x00121881, // 33 ^ 4
};
-uvec32 kHashMul3 = {
+static uvec32 kHashMul3 = {
0x00008c61, // 33 ^ 3
0x00000441, // 33 ^ 2
0x00000021, // 33 ^ 1
@@ -127,6 +127,14 @@
0x00000001, // 33 ^ 0
};
+// 27: 66 0F 38 40 C6 pmulld xmm0,xmm6
+// 44: 66 0F 38 40 DD pmulld xmm3,xmm5
+// 59: 66 0F 38 40 E5 pmulld xmm4,xmm5
+// 72: 66 0F 38 40 D5 pmulld xmm2,xmm5
+// 83: 66 0F 38 40 CD pmulld xmm1,xmm5
+#define pmulld(reg) _asm _emit 0x66 _asm _emit 0x0F _asm _emit 0x38 \
+ _asm _emit 0x40 _asm _emit reg
+
__declspec(naked)
uint32 HashDjb2_SSE41(const uint8* src, int count, uint32 seed) {
__asm {
@@ -135,30 +143,30 @@
movd xmm0, [esp + 12] // seed
pxor xmm7, xmm7 // constant 0 for unpck
- movdqa xmm6, xmmword ptr kHash16x33
+ movdqa xmm6, kHash16x33
wloop:
movdqu xmm1, [eax] // src[0-15]
lea eax, [eax + 16]
- pmulld xmm0, xmm6 // hash *= 33 ^ 16
- movdqa xmm5, xmmword ptr kHashMul0
+ pmulld(0xc6) // pmulld xmm0,xmm6 hash *= 33 ^ 16
+ movdqa xmm5, kHashMul0
movdqa xmm2, xmm1
punpcklbw xmm2, xmm7 // src[0-7]
movdqa xmm3, xmm2
punpcklwd xmm3, xmm7 // src[0-3]
- pmulld xmm3, xmm5
- movdqa xmm5, xmmword ptr kHashMul1
+ pmulld(0xdd) // pmulld xmm3, xmm5
+ movdqa xmm5, kHashMul1
movdqa xmm4, xmm2
punpckhwd xmm4, xmm7 // src[4-7]
- pmulld xmm4, xmm5
- movdqa xmm5, xmmword ptr kHashMul2
+ pmulld(0xe5) // pmulld xmm4, xmm5
+ movdqa xmm5, kHashMul2
punpckhbw xmm1, xmm7 // src[8-15]
movdqa xmm2, xmm1
punpcklwd xmm2, xmm7 // src[8-11]
- pmulld xmm2, xmm5
- movdqa xmm5, xmmword ptr kHashMul3
+ pmulld(0xd5) // pmulld xmm2, xmm5
+ movdqa xmm5, kHashMul3
punpckhwd xmm1, xmm7 // src[12-15]
- pmulld xmm1, xmm5
+ pmulld(0xcd) // pmulld xmm1, xmm5
paddd xmm3, xmm4 // add 16 results
paddd xmm1, xmm2
paddd xmm1, xmm3
@@ -183,37 +191,36 @@
__asm {
mov eax, [esp + 4] // src
mov ecx, [esp + 8] // count
- vmovd xmm0, [esp + 12] // seed
+ movd xmm0, [esp + 12] // seed
+ movdqa xmm6, kHash16x33
wloop:
- vpmovzxbd xmm3, [eax] // src[0-3]
- vpmulld xmm0, xmm0, xmmword ptr kHash16x33 // hash *= 33 ^ 16
- vpmovzxbd xmm4, [eax + 4] // src[4-7]
- vpmulld xmm3, xmm3, xmmword ptr kHashMul0
- vpmovzxbd xmm2, [eax + 8] // src[8-11]
- vpmulld xmm4, xmm4, xmmword ptr kHashMul1
- vpmovzxbd xmm1, [eax + 12] // src[12-15]
- vpmulld xmm2, xmm2, xmmword ptr kHashMul2
+ vpmovzxbd xmm3, dword ptr [eax] // src[0-3]
+ pmulld xmm0, xmm6 // hash *= 33 ^ 16
+ vpmovzxbd xmm4, dword ptr [eax + 4] // src[4-7]
+ pmulld xmm3, kHashMul0
+ vpmovzxbd xmm2, dword ptr [eax + 8] // src[8-11]
+ pmulld xmm4, kHashMul1
+ vpmovzxbd xmm1, dword ptr [eax + 12] // src[12-15]
+ pmulld xmm2, kHashMul2
lea eax, [eax + 16]
- vpmulld xmm1, xmm1, xmmword ptr kHashMul3
- vpaddd xmm3, xmm3, xmm4 // add 16 results
- vpaddd xmm1, xmm1, xmm2
- vpaddd xmm1, xmm1, xmm3
- vpshufd xmm2, xmm1, 0x0e // upper 2 dwords
- vpaddd xmm1, xmm1,xmm2
- vpshufd xmm2, xmm1, 0x01
- vpaddd xmm1, xmm1, xmm2
- vpaddd xmm0, xmm0, xmm1
+ pmulld xmm1, kHashMul3
+ paddd xmm3, xmm4 // add 16 results
+ paddd xmm1, xmm2
+ paddd xmm1, xmm3
+ pshufd xmm2, xmm1, 0x0e // upper 2 dwords
+ paddd xmm1, xmm2
+ pshufd xmm2, xmm1, 0x01
+ paddd xmm1, xmm2
+ paddd xmm0, xmm1
sub ecx, 16
jg wloop
- vmovd eax, xmm0 // return hash
- vzeroupper
+ movd eax, xmm0 // return hash
ret
}
}
#endif // _MSC_VER >= 1700
-
#endif // !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86)
#ifdef __cplusplus
--- a/third_party/libyuv/source/convert.cc
+++ b/third_party/libyuv/source/convert.cc
@@ -245,8 +245,8 @@
int y;
int halfwidth = (width + 1) >> 1;
int halfheight = (height + 1) >> 1;
- void (*SplitUVRow)(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
- int width) = SplitUVRow_C;
+ void (*SplitUVRow)(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) =
+ SplitUVRow_C;
if (!src_y || !src_uv ||
!dst_y || !dst_u || !dst_v ||
width <= 0 || height == 0) {
@@ -303,14 +303,14 @@
}
}
#endif
-#if defined(HAS_SPLITUVROW_DSPR2)
- if (TestCpuFlag(kCpuHasDSPR2) &&
+#if defined(HAS_SPLITUVROW_MIPS_DSPR2)
+ if (TestCpuFlag(kCpuHasMIPS_DSPR2) &&
IS_ALIGNED(src_uv, 4) && IS_ALIGNED(src_stride_uv, 4) &&
IS_ALIGNED(dst_u, 4) && IS_ALIGNED(dst_stride_u, 4) &&
IS_ALIGNED(dst_v, 4) && IS_ALIGNED(dst_stride_v, 4)) {
- SplitUVRow = SplitUVRow_Any_DSPR2;
+ SplitUVRow = SplitUVRow_Any_MIPS_DSPR2;
if (IS_ALIGNED(halfwidth, 16)) {
- SplitUVRow = SplitUVRow_DSPR2;
+ SplitUVRow = SplitUVRow_MIPS_DSPR2;
}
}
#endif
@@ -390,9 +390,9 @@
int width, int height) {
int y;
void (*YUY2ToUVRow)(const uint8* src_yuy2, int src_stride_yuy2,
- uint8* dst_u, uint8* dst_v, int width) = YUY2ToUVRow_C;
+ uint8* dst_u, uint8* dst_v, int pix) = YUY2ToUVRow_C;
void (*YUY2ToYRow)(const uint8* src_yuy2,
- uint8* dst_y, int width) = YUY2ToYRow_C;
+ uint8* dst_y, int pix) = YUY2ToYRow_C;
// Negative height means invert the image.
if (height < 0) {
height = -height;
@@ -455,9 +455,9 @@
int width, int height) {
int y;
void (*UYVYToUVRow)(const uint8* src_uyvy, int src_stride_uyvy,
- uint8* dst_u, uint8* dst_v, int width) = UYVYToUVRow_C;
+ uint8* dst_u, uint8* dst_v, int pix) = UYVYToUVRow_C;
void (*UYVYToYRow)(const uint8* src_uyvy,
- uint8* dst_y, int width) = UYVYToYRow_C;
+ uint8* dst_y, int pix) = UYVYToYRow_C;
// Negative height means invert the image.
if (height < 0) {
height = -height;
@@ -521,7 +521,7 @@
int y;
void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb,
uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C;
- void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int width) =
+ void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) =
ARGBToYRow_C;
if (!src_argb ||
!dst_y || !dst_u || !dst_v ||
@@ -597,7 +597,7 @@
int y;
void (*BGRAToUVRow)(const uint8* src_bgra0, int src_stride_bgra,
uint8* dst_u, uint8* dst_v, int width) = BGRAToUVRow_C;
- void (*BGRAToYRow)(const uint8* src_bgra, uint8* dst_y, int width) =
+ void (*BGRAToYRow)(const uint8* src_bgra, uint8* dst_y, int pix) =
BGRAToYRow_C;
if (!src_bgra ||
!dst_y || !dst_u || !dst_v ||
@@ -663,7 +663,7 @@
int y;
void (*ABGRToUVRow)(const uint8* src_abgr0, int src_stride_abgr,
uint8* dst_u, uint8* dst_v, int width) = ABGRToUVRow_C;
- void (*ABGRToYRow)(const uint8* src_abgr, uint8* dst_y, int width) =
+ void (*ABGRToYRow)(const uint8* src_abgr, uint8* dst_y, int pix) =
ABGRToYRow_C;
if (!src_abgr ||
!dst_y || !dst_u || !dst_v ||
@@ -729,7 +729,7 @@
int y;
void (*RGBAToUVRow)(const uint8* src_rgba0, int src_stride_rgba,
uint8* dst_u, uint8* dst_v, int width) = RGBAToUVRow_C;
- void (*RGBAToYRow)(const uint8* src_rgba, uint8* dst_y, int width) =
+ void (*RGBAToYRow)(const uint8* src_rgba, uint8* dst_y, int pix) =
RGBAToYRow_C;
if (!src_rgba ||
!dst_y || !dst_u || !dst_v ||
@@ -796,14 +796,14 @@
#if defined(HAS_RGB24TOYROW_NEON)
void (*RGB24ToUVRow)(const uint8* src_rgb24, int src_stride_rgb24,
uint8* dst_u, uint8* dst_v, int width) = RGB24ToUVRow_C;
- void (*RGB24ToYRow)(const uint8* src_rgb24, uint8* dst_y, int width) =
+ void (*RGB24ToYRow)(const uint8* src_rgb24, uint8* dst_y, int pix) =
RGB24ToYRow_C;
#else
- void (*RGB24ToARGBRow)(const uint8* src_rgb, uint8* dst_argb, int width) =
+ void (*RGB24ToARGBRow)(const uint8* src_rgb, uint8* dst_argb, int pix) =
RGB24ToARGBRow_C;
void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb,
uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C;
- void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int width) =
+ void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) =
ARGBToYRow_C;
#endif
if (!src_rgb24 || !dst_y || !dst_u || !dst_v ||
@@ -910,14 +910,14 @@
#if defined(HAS_RAWTOYROW_NEON)
void (*RAWToUVRow)(const uint8* src_raw, int src_stride_raw,
uint8* dst_u, uint8* dst_v, int width) = RAWToUVRow_C;
- void (*RAWToYRow)(const uint8* src_raw, uint8* dst_y, int width) =
+ void (*RAWToYRow)(const uint8* src_raw, uint8* dst_y, int pix) =
RAWToYRow_C;
#else
- void (*RAWToARGBRow)(const uint8* src_rgb, uint8* dst_argb, int width) =
+ void (*RAWToARGBRow)(const uint8* src_rgb, uint8* dst_argb, int pix) =
RAWToARGBRow_C;
void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb,
uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C;
- void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int width) =
+ void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) =
ARGBToYRow_C;
#endif
if (!src_raw || !dst_y || !dst_u || !dst_v ||
@@ -1024,14 +1024,14 @@
#if defined(HAS_RGB565TOYROW_NEON)
void (*RGB565ToUVRow)(const uint8* src_rgb565, int src_stride_rgb565,
uint8* dst_u, uint8* dst_v, int width) = RGB565ToUVRow_C;
- void (*RGB565ToYRow)(const uint8* src_rgb565, uint8* dst_y, int width) =
+ void (*RGB565ToYRow)(const uint8* src_rgb565, uint8* dst_y, int pix) =
RGB565ToYRow_C;
#else
- void (*RGB565ToARGBRow)(const uint8* src_rgb, uint8* dst_argb, int width) =
+ void (*RGB565ToARGBRow)(const uint8* src_rgb, uint8* dst_argb, int pix) =
RGB565ToARGBRow_C;
void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb,
uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C;
- void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int width) =
+ void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) =
ARGBToYRow_C;
#endif
if (!src_rgb565 || !dst_y || !dst_u || !dst_v ||
@@ -1146,14 +1146,14 @@
#if defined(HAS_ARGB1555TOYROW_NEON)
void (*ARGB1555ToUVRow)(const uint8* src_argb1555, int src_stride_argb1555,
uint8* dst_u, uint8* dst_v, int width) = ARGB1555ToUVRow_C;
- void (*ARGB1555ToYRow)(const uint8* src_argb1555, uint8* dst_y, int width) =
+ void (*ARGB1555ToYRow)(const uint8* src_argb1555, uint8* dst_y, int pix) =
ARGB1555ToYRow_C;
#else
- void (*ARGB1555ToARGBRow)(const uint8* src_rgb, uint8* dst_argb, int width) =
+ void (*ARGB1555ToARGBRow)(const uint8* src_rgb, uint8* dst_argb, int pix) =
ARGB1555ToARGBRow_C;
void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb,
uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C;
- void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int width) =
+ void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) =
ARGBToYRow_C;
#endif
if (!src_argb1555 || !dst_y || !dst_u || !dst_v ||
@@ -1270,14 +1270,14 @@
#if defined(HAS_ARGB4444TOYROW_NEON)
void (*ARGB4444ToUVRow)(const uint8* src_argb4444, int src_stride_argb4444,
uint8* dst_u, uint8* dst_v, int width) = ARGB4444ToUVRow_C;
- void (*ARGB4444ToYRow)(const uint8* src_argb4444, uint8* dst_y, int width) =
+ void (*ARGB4444ToYRow)(const uint8* src_argb4444, uint8* dst_y, int pix) =
ARGB4444ToYRow_C;
#else
- void (*ARGB4444ToARGBRow)(const uint8* src_rgb, uint8* dst_argb, int width) =
+ void (*ARGB4444ToARGBRow)(const uint8* src_rgb, uint8* dst_argb, int pix) =
ARGB4444ToARGBRow_C;
void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb,
uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C;
- void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int width) =
+ void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) =
ARGBToYRow_C;
#endif
if (!src_argb4444 || !dst_y || !dst_u || !dst_v ||
--- a/third_party/libyuv/source/convert_argb.cc
+++ b/third_party/libyuv/source/convert_argb.cc
@@ -14,7 +14,6 @@
#ifdef HAVE_JPEG
#include "libyuv/mjpeg_decoder.h"
#endif
-#include "libyuv/planar_functions.h" // For CopyPlane and ARGBShuffle.
#include "libyuv/rotate_argb.h"
#include "libyuv/row.h"
#include "libyuv/video_common.h"
@@ -45,21 +44,21 @@
return 0;
}
-// Convert I422 to ARGB with matrix
-static int I420ToARGBMatrix(const uint8* src_y, int src_stride_y,
- const uint8* src_u, int src_stride_u,
- const uint8* src_v, int src_stride_v,
- uint8* dst_argb, int dst_stride_argb,
- const struct YuvConstants* yuvconstants,
- int width, int height) {
+// Convert I444 to ARGB.
+LIBYUV_API
+int I444ToARGB(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height) {
int y;
- void (*I422ToARGBRow)(const uint8* y_buf,
+ void (*I444ToARGBRow)(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
- const struct YuvConstants* yuvconstants,
- int width) = I422ToARGBRow_C;
- if (!src_y || !src_u || !src_v || !dst_argb ||
+ int width) = I444ToARGBRow_C;
+ if (!src_y || !src_u || !src_v ||
+ !dst_argb ||
width <= 0 || height == 0) {
return -1;
}
@@ -69,155 +68,62 @@
dst_argb = dst_argb + (height - 1) * dst_stride_argb;
dst_stride_argb = -dst_stride_argb;
}
-#if defined(HAS_I422TOARGBROW_SSSE3)
+ // Coalesce rows.
+ if (src_stride_y == width &&
+ src_stride_u == width &&
+ src_stride_v == width &&
+ dst_stride_argb == width * 4) {
+ width *= height;
+ height = 1;
+ src_stride_y = src_stride_u = src_stride_v = dst_stride_argb = 0;
+ }
+#if defined(HAS_I444TOARGBROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
- I422ToARGBRow = I422ToARGBRow_Any_SSSE3;
+ I444ToARGBRow = I444ToARGBRow_Any_SSSE3;
if (IS_ALIGNED(width, 8)) {
- I422ToARGBRow = I422ToARGBRow_SSSE3;
+ I444ToARGBRow = I444ToARGBRow_SSSE3;
}
}
#endif
-#if defined(HAS_I422TOARGBROW_AVX2)
+#if defined(HAS_I444TOARGBROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
- I422ToARGBRow = I422ToARGBRow_Any_AVX2;
+ I444ToARGBRow = I444ToARGBRow_Any_AVX2;
if (IS_ALIGNED(width, 16)) {
- I422ToARGBRow = I422ToARGBRow_AVX2;
+ I444ToARGBRow = I444ToARGBRow_AVX2;
}
}
#endif
-#if defined(HAS_I422TOARGBROW_NEON)
+#if defined(HAS_I444TOARGBROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
- I422ToARGBRow = I422ToARGBRow_Any_NEON;
+ I444ToARGBRow = I444ToARGBRow_Any_NEON;
if (IS_ALIGNED(width, 8)) {
- I422ToARGBRow = I422ToARGBRow_NEON;
+ I444ToARGBRow = I444ToARGBRow_NEON;
}
}
#endif
-#if defined(HAS_I422TOARGBROW_DSPR2)
- if (TestCpuFlag(kCpuHasDSPR2) && IS_ALIGNED(width, 4) &&
- IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) &&
- IS_ALIGNED(src_u, 2) && IS_ALIGNED(src_stride_u, 2) &&
- IS_ALIGNED(src_v, 2) && IS_ALIGNED(src_stride_v, 2) &&
- IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride_argb, 4)) {
- I422ToARGBRow = I422ToARGBRow_DSPR2;
- }
-#endif
for (y = 0; y < height; ++y) {
- I422ToARGBRow(src_y, src_u, src_v, dst_argb, yuvconstants, width);
+ I444ToARGBRow(src_y, src_u, src_v, dst_argb, width);
dst_argb += dst_stride_argb;
src_y += src_stride_y;
- if (y & 1) {
- src_u += src_stride_u;
- src_v += src_stride_v;
- }
+ src_u += src_stride_u;
+ src_v += src_stride_v;
}
return 0;
}
-// Convert I420 to ARGB.
+// Convert I422 to ARGB.
LIBYUV_API
-int I420ToARGB(const uint8* src_y, int src_stride_y,
+int I422ToARGB(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u,
const uint8* src_v, int src_stride_v,
uint8* dst_argb, int dst_stride_argb,
int width, int height) {
- return I420ToARGBMatrix(src_y, src_stride_y,
- src_u, src_stride_u,
- src_v, src_stride_v,
- dst_argb, dst_stride_argb,
- &kYuvI601Constants,
- width, height);
-}
-
-// Convert I420 to ABGR.
-LIBYUV_API
-int I420ToABGR(const uint8* src_y, int src_stride_y,
- const uint8* src_u, int src_stride_u,
- const uint8* src_v, int src_stride_v,
- uint8* dst_abgr, int dst_stride_abgr,
- int width, int height) {
- return I420ToARGBMatrix(src_y, src_stride_y,
- src_v, src_stride_v, // Swap U and V
- src_u, src_stride_u,
- dst_abgr, dst_stride_abgr,
- &kYvuI601Constants, // Use Yvu matrix
- width, height);
-}
-
-// Convert J420 to ARGB.
-LIBYUV_API
-int J420ToARGB(const uint8* src_y, int src_stride_y,
- const uint8* src_u, int src_stride_u,
- const uint8* src_v, int src_stride_v,
- uint8* dst_argb, int dst_stride_argb,
- int width, int height) {
- return I420ToARGBMatrix(src_y, src_stride_y,
- src_u, src_stride_u,
- src_v, src_stride_v,
- dst_argb, dst_stride_argb,
- &kYuvJPEGConstants,
- width, height);
-}
-
-// Convert J420 to ABGR.
-LIBYUV_API
-int J420ToABGR(const uint8* src_y, int src_stride_y,
- const uint8* src_u, int src_stride_u,
- const uint8* src_v, int src_stride_v,
- uint8* dst_abgr, int dst_stride_abgr,
- int width, int height) {
- return I420ToARGBMatrix(src_y, src_stride_y,
- src_v, src_stride_v, // Swap U and V
- src_u, src_stride_u,
- dst_abgr, dst_stride_abgr,
- &kYvuJPEGConstants, // Use Yvu matrix
- width, height);
-}
-
-// Convert H420 to ARGB.
-LIBYUV_API
-int H420ToARGB(const uint8* src_y, int src_stride_y,
- const uint8* src_u, int src_stride_u,
- const uint8* src_v, int src_stride_v,
- uint8* dst_argb, int dst_stride_argb,
- int width, int height) {
- return I420ToARGBMatrix(src_y, src_stride_y,
- src_u, src_stride_u,
- src_v, src_stride_v,
- dst_argb, dst_stride_argb,
- &kYuvH709Constants,
- width, height);
-}
-
-// Convert H420 to ABGR.
-LIBYUV_API
-int H420ToABGR(const uint8* src_y, int src_stride_y,
- const uint8* src_u, int src_stride_u,
- const uint8* src_v, int src_stride_v,
- uint8* dst_abgr, int dst_stride_abgr,
- int width, int height) {
- return I420ToARGBMatrix(src_y, src_stride_y,
- src_v, src_stride_v, // Swap U and V
- src_u, src_stride_u,
- dst_abgr, dst_stride_abgr,
- &kYvuH709Constants, // Use Yvu matrix
- width, height);
-}
-
-// Convert I422 to ARGB with matrix
-static int I422ToARGBMatrix(const uint8* src_y, int src_stride_y,
- const uint8* src_u, int src_stride_u,
- const uint8* src_v, int src_stride_v,
- uint8* dst_argb, int dst_stride_argb,
- const struct YuvConstants* yuvconstants,
- int width, int height) {
int y;
void (*I422ToARGBRow)(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
- const struct YuvConstants* yuvconstants,
int width) = I422ToARGBRow_C;
if (!src_y || !src_u || !src_v ||
!dst_argb ||
@@ -263,18 +169,18 @@
}
}
#endif
-#if defined(HAS_I422TOARGBROW_DSPR2)
- if (TestCpuFlag(kCpuHasDSPR2) && IS_ALIGNED(width, 4) &&
+#if defined(HAS_I422TOARGBROW_MIPS_DSPR2)
+ if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(width, 4) &&
IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) &&
IS_ALIGNED(src_u, 2) && IS_ALIGNED(src_stride_u, 2) &&
IS_ALIGNED(src_v, 2) && IS_ALIGNED(src_stride_v, 2) &&
IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride_argb, 4)) {
- I422ToARGBRow = I422ToARGBRow_DSPR2;
+ I422ToARGBRow = I422ToARGBRow_MIPS_DSPR2;
}
#endif
for (y = 0; y < height; ++y) {
- I422ToARGBRow(src_y, src_u, src_v, dst_argb, yuvconstants, width);
+ I422ToARGBRow(src_y, src_u, src_v, dst_argb, width);
dst_argb += dst_stride_argb;
src_y += src_stride_y;
src_u += src_stride_u;
@@ -283,210 +189,6 @@
return 0;
}
-// Convert I422 to ARGB.
-LIBYUV_API
-int I422ToARGB(const uint8* src_y, int src_stride_y,
- const uint8* src_u, int src_stride_u,
- const uint8* src_v, int src_stride_v,
- uint8* dst_argb, int dst_stride_argb,
- int width, int height) {
- return I422ToARGBMatrix(src_y, src_stride_y,
- src_u, src_stride_u,
- src_v, src_stride_v,
- dst_argb, dst_stride_argb,
- &kYuvI601Constants,
- width, height);
-}
-
-// Convert I422 to ABGR.
-LIBYUV_API
-int I422ToABGR(const uint8* src_y, int src_stride_y,
- const uint8* src_u, int src_stride_u,
- const uint8* src_v, int src_stride_v,
- uint8* dst_abgr, int dst_stride_abgr,
- int width, int height) {
- return I422ToARGBMatrix(src_y, src_stride_y,
- src_v, src_stride_v, // Swap U and V
- src_u, src_stride_u,
- dst_abgr, dst_stride_abgr,
- &kYvuI601Constants, // Use Yvu matrix
- width, height);
-}
-
-// Convert J422 to ARGB.
-LIBYUV_API
-int J422ToARGB(const uint8* src_y, int src_stride_y,
- const uint8* src_u, int src_stride_u,
- const uint8* src_v, int src_stride_v,
- uint8* dst_argb, int dst_stride_argb,
- int width, int height) {
- return I422ToARGBMatrix(src_y, src_stride_y,
- src_u, src_stride_u,
- src_v, src_stride_v,
- dst_argb, dst_stride_argb,
- &kYuvJPEGConstants,
- width, height);
-}
-
-// Convert J422 to ABGR.
-LIBYUV_API
-int J422ToABGR(const uint8* src_y, int src_stride_y,
- const uint8* src_u, int src_stride_u,
- const uint8* src_v, int src_stride_v,
- uint8* dst_abgr, int dst_stride_abgr,
- int width, int height) {
- return I422ToARGBMatrix(src_y, src_stride_y,
- src_v, src_stride_v, // Swap U and V
- src_u, src_stride_u,
- dst_abgr, dst_stride_abgr,
- &kYvuJPEGConstants, // Use Yvu matrix
- width, height);
-}
-
-// Convert H422 to ARGB.
-LIBYUV_API
-int H422ToARGB(const uint8* src_y, int src_stride_y,
- const uint8* src_u, int src_stride_u,
- const uint8* src_v, int src_stride_v,
- uint8* dst_argb, int dst_stride_argb,
- int width, int height) {
- return I422ToARGBMatrix(src_y, src_stride_y,
- src_u, src_stride_u,
- src_v, src_stride_v,
- dst_argb, dst_stride_argb,
- &kYuvH709Constants,
- width, height);
-}
-
-// Convert H422 to ABGR.
-LIBYUV_API
-int H422ToABGR(const uint8* src_y, int src_stride_y,
- const uint8* src_u, int src_stride_u,
- const uint8* src_v, int src_stride_v,
- uint8* dst_abgr, int dst_stride_abgr,
- int width, int height) {
- return I422ToARGBMatrix(src_y, src_stride_y,
- src_v, src_stride_v, // Swap U and V
- src_u, src_stride_u,
- dst_abgr, dst_stride_abgr,
- &kYvuH709Constants, // Use Yvu matrix
- width, height);
-}
-
-// Convert I444 to ARGB with matrix
-static int I444ToARGBMatrix(const uint8* src_y, int src_stride_y,
- const uint8* src_u, int src_stride_u,
- const uint8* src_v, int src_stride_v,
- uint8* dst_argb, int dst_stride_argb,
- const struct YuvConstants* yuvconstants,
- int width, int height) {
- int y;
- void (*I444ToARGBRow)(const uint8* y_buf,
- const uint8* u_buf,
- const uint8* v_buf,
- uint8* rgb_buf,
- const struct YuvConstants* yuvconstants,
- int width) = I444ToARGBRow_C;
- if (!src_y || !src_u || !src_v ||
- !dst_argb ||
- width <= 0 || height == 0) {
- return -1;
- }
- // Negative height means invert the image.
- if (height < 0) {
- height = -height;
- dst_argb = dst_argb + (height - 1) * dst_stride_argb;
- dst_stride_argb = -dst_stride_argb;
- }
- // Coalesce rows.
- if (src_stride_y == width &&
- src_stride_u == width &&
- src_stride_v == width &&
- dst_stride_argb == width * 4) {
- width *= height;
- height = 1;
- src_stride_y = src_stride_u = src_stride_v = dst_stride_argb = 0;
- }
-#if defined(HAS_I444TOARGBROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3)) {
- I444ToARGBRow = I444ToARGBRow_Any_SSSE3;
- if (IS_ALIGNED(width, 8)) {
- I444ToARGBRow = I444ToARGBRow_SSSE3;
- }
- }
-#endif
-#if defined(HAS_I444TOARGBROW_AVX2)
- if (TestCpuFlag(kCpuHasAVX2)) {
- I444ToARGBRow = I444ToARGBRow_Any_AVX2;
- if (IS_ALIGNED(width, 16)) {
- I444ToARGBRow = I444ToARGBRow_AVX2;
- }
- }
-#endif
-#if defined(HAS_I444TOARGBROW_NEON)
- if (TestCpuFlag(kCpuHasNEON)) {
- I444ToARGBRow = I444ToARGBRow_Any_NEON;
- if (IS_ALIGNED(width, 8)) {
- I444ToARGBRow = I444ToARGBRow_NEON;
- }
- }
-#endif
-
- for (y = 0; y < height; ++y) {
- I444ToARGBRow(src_y, src_u, src_v, dst_argb, yuvconstants, width);
- dst_argb += dst_stride_argb;
- src_y += src_stride_y;
- src_u += src_stride_u;
- src_v += src_stride_v;
- }
- return 0;
-}
-
-// Convert I444 to ARGB.
-LIBYUV_API
-int I444ToARGB(const uint8* src_y, int src_stride_y,
- const uint8* src_u, int src_stride_u,
- const uint8* src_v, int src_stride_v,
- uint8* dst_argb, int dst_stride_argb,
- int width, int height) {
- return I444ToARGBMatrix(src_y, src_stride_y,
- src_u, src_stride_u,
- src_v, src_stride_v,
- dst_argb, dst_stride_argb,
- &kYuvI601Constants,
- width, height);
-}
-
-// Convert I444 to ABGR.
-LIBYUV_API
-int I444ToABGR(const uint8* src_y, int src_stride_y,
- const uint8* src_u, int src_stride_u,
- const uint8* src_v, int src_stride_v,
- uint8* dst_abgr, int dst_stride_abgr,
- int width, int height) {
- return I444ToARGBMatrix(src_y, src_stride_y,
- src_v, src_stride_v, // Swap U and V
- src_u, src_stride_u,
- dst_abgr, dst_stride_abgr,
- &kYvuI601Constants, // Use Yvu matrix
- width, height);
-}
-
-// Convert J444 to ARGB.
-LIBYUV_API
-int J444ToARGB(const uint8* src_y, int src_stride_y,
- const uint8* src_u, int src_stride_u,
- const uint8* src_v, int src_stride_v,
- uint8* dst_argb, int dst_stride_argb,
- int width, int height) {
- return I444ToARGBMatrix(src_y, src_stride_y,
- src_u, src_stride_u,
- src_v, src_stride_v,
- dst_argb, dst_stride_argb,
- &kYuvJPEGConstants,
- width, height);
-}
-
// Convert I411 to ARGB.
LIBYUV_API
int I411ToARGB(const uint8* src_y, int src_stride_y,
@@ -499,7 +201,6 @@
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
- const struct YuvConstants* yuvconstants,
int width) = I411ToARGBRow_C;
if (!src_y || !src_u || !src_v ||
!dst_argb ||
@@ -547,7 +248,7 @@
#endif
for (y = 0; y < height; ++y) {
- I411ToARGBRow(src_y, src_u, src_v, dst_argb, &kYuvI601Constants, width);
+ I411ToARGBRow(src_y, src_u, src_v, dst_argb, width);
dst_argb += dst_stride_argb;
src_y += src_stride_y;
src_u += src_stride_u;
@@ -556,143 +257,6 @@
return 0;
}
-// Convert I420 with Alpha to preattenuated ARGB.
-static int I420AlphaToARGBMatrix(const uint8* src_y, int src_stride_y,
- const uint8* src_u, int src_stride_u,
- const uint8* src_v, int src_stride_v,
- const uint8* src_a, int src_stride_a,
- uint8* dst_argb, int dst_stride_argb,
- const struct YuvConstants* yuvconstants,
- int width, int height, int attenuate) {
- int y;
- void (*I422AlphaToARGBRow)(const uint8* y_buf,
- const uint8* u_buf,
- const uint8* v_buf,
- const uint8* a_buf,
- uint8* dst_argb,
- const struct YuvConstants* yuvconstants,
- int width) = I422AlphaToARGBRow_C;
- void (*ARGBAttenuateRow)(const uint8* src_argb, uint8* dst_argb,
- int width) = ARGBAttenuateRow_C;
- if (!src_y || !src_u || !src_v || !dst_argb ||
- width <= 0 || height == 0) {
- return -1;
- }
- // Negative height means invert the image.
- if (height < 0) {
- height = -height;
- dst_argb = dst_argb + (height - 1) * dst_stride_argb;
- dst_stride_argb = -dst_stride_argb;
- }
-#if defined(HAS_I422ALPHATOARGBROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3)) {
- I422AlphaToARGBRow = I422AlphaToARGBRow_Any_SSSE3;
- if (IS_ALIGNED(width, 8)) {
- I422AlphaToARGBRow = I422AlphaToARGBRow_SSSE3;
- }
- }
-#endif
-#if defined(HAS_I422ALPHATOARGBROW_AVX2)
- if (TestCpuFlag(kCpuHasAVX2)) {
- I422AlphaToARGBRow = I422AlphaToARGBRow_Any_AVX2;
- if (IS_ALIGNED(width, 16)) {
- I422AlphaToARGBRow = I422AlphaToARGBRow_AVX2;
- }
- }
-#endif
-#if defined(HAS_I422ALPHATOARGBROW_NEON)
- if (TestCpuFlag(kCpuHasNEON)) {
- I422AlphaToARGBRow = I422AlphaToARGBRow_Any_NEON;
- if (IS_ALIGNED(width, 8)) {
- I422AlphaToARGBRow = I422AlphaToARGBRow_NEON;
- }
- }
-#endif
-#if defined(HAS_I422ALPHATOARGBROW_DSPR2)
- if (TestCpuFlag(kCpuHasDSPR2) && IS_ALIGNED(width, 4) &&
- IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) &&
- IS_ALIGNED(src_u, 2) && IS_ALIGNED(src_stride_u, 2) &&
- IS_ALIGNED(src_v, 2) && IS_ALIGNED(src_stride_v, 2) &&
- IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride_argb, 4)) {
- I422AlphaToARGBRow = I422AlphaToARGBRow_DSPR2;
- }
-#endif
-#if defined(HAS_ARGBATTENUATEROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3)) {
- ARGBAttenuateRow = ARGBAttenuateRow_Any_SSSE3;
- if (IS_ALIGNED(width, 4)) {
- ARGBAttenuateRow = ARGBAttenuateRow_SSSE3;
- }
- }
-#endif
-#if defined(HAS_ARGBATTENUATEROW_AVX2)
- if (TestCpuFlag(kCpuHasAVX2)) {
- ARGBAttenuateRow = ARGBAttenuateRow_Any_AVX2;
- if (IS_ALIGNED(width, 8)) {
- ARGBAttenuateRow = ARGBAttenuateRow_AVX2;
- }
- }
-#endif
-#if defined(HAS_ARGBATTENUATEROW_NEON)
- if (TestCpuFlag(kCpuHasNEON)) {
- ARGBAttenuateRow = ARGBAttenuateRow_Any_NEON;
- if (IS_ALIGNED(width, 8)) {
- ARGBAttenuateRow = ARGBAttenuateRow_NEON;
- }
- }
-#endif
-
- for (y = 0; y < height; ++y) {
- I422AlphaToARGBRow(src_y, src_u, src_v, src_a, dst_argb, yuvconstants,
- width);
- if (attenuate) {
- ARGBAttenuateRow(dst_argb, dst_argb, width);
- }
- dst_argb += dst_stride_argb;
- src_a += src_stride_a;
- src_y += src_stride_y;
- if (y & 1) {
- src_u += src_stride_u;
- src_v += src_stride_v;
- }
- }
- return 0;
-}
-
-// Convert I420 with Alpha to ARGB.
-LIBYUV_API
-int I420AlphaToARGB(const uint8* src_y, int src_stride_y,
- const uint8* src_u, int src_stride_u,
- const uint8* src_v, int src_stride_v,
- const uint8* src_a, int src_stride_a,
- uint8* dst_argb, int dst_stride_argb,
- int width, int height, int attenuate) {
- return I420AlphaToARGBMatrix(src_y, src_stride_y,
- src_u, src_stride_u,
- src_v, src_stride_v,
- src_a, src_stride_a,
- dst_argb, dst_stride_argb,
- &kYuvI601Constants,
- width, height, attenuate);
-}
-
-// Convert I420 with Alpha to ABGR.
-LIBYUV_API
-int I420AlphaToABGR(const uint8* src_y, int src_stride_y,
- const uint8* src_u, int src_stride_u,
- const uint8* src_v, int src_stride_v,
- const uint8* src_a, int src_stride_a,
- uint8* dst_abgr, int dst_stride_abgr,
- int width, int height, int attenuate) {
- return I420AlphaToARGBMatrix(src_y, src_stride_y,
- src_v, src_stride_v, // Swap U and V
- src_u, src_stride_u,
- src_a, src_stride_a,
- dst_abgr, dst_stride_abgr,
- &kYvuI601Constants, // Use Yvu matrix
- width, height, attenuate);
-}
-
// Convert I400 to ARGB.
LIBYUV_API
int I400ToARGB(const uint8* src_y, int src_stride_y,
@@ -758,7 +322,7 @@
uint8* dst_argb, int dst_stride_argb,
int width, int height) {
int y;
- void (*J400ToARGBRow)(const uint8* src_y, uint8* dst_argb, int width) =
+ void (*J400ToARGBRow)(const uint8* src_y, uint8* dst_argb, int pix) =
J400ToARGBRow_C;
if (!src_y || !dst_argb ||
width <= 0 || height == 0) {
@@ -885,7 +449,7 @@
uint8* dst_argb, int dst_stride_argb,
int width, int height) {
int y;
- void (*RGB24ToARGBRow)(const uint8* src_rgb, uint8* dst_argb, int width) =
+ void (*RGB24ToARGBRow)(const uint8* src_rgb, uint8* dst_argb, int pix) =
RGB24ToARGBRow_C;
if (!src_rgb24 || !dst_argb ||
width <= 0 || height == 0) {
@@ -935,7 +499,7 @@
uint8* dst_argb, int dst_stride_argb,
int width, int height) {
int y;
- void (*RAWToARGBRow)(const uint8* src_rgb, uint8* dst_argb, int width) =
+ void (*RAWToARGBRow)(const uint8* src_rgb, uint8* dst_argb, int pix) =
RAWToARGBRow_C;
if (!src_raw || !dst_argb ||
width <= 0 || height == 0) {
@@ -985,7 +549,7 @@
uint8* dst_argb, int dst_stride_argb,
int width, int height) {
int y;
- void (*RGB565ToARGBRow)(const uint8* src_rgb565, uint8* dst_argb, int width) =
+ void (*RGB565ToARGBRow)(const uint8* src_rgb565, uint8* dst_argb, int pix) =
RGB565ToARGBRow_C;
if (!src_rgb565 || !dst_argb ||
width <= 0 || height == 0) {
@@ -1044,7 +608,7 @@
int width, int height) {
int y;
void (*ARGB1555ToARGBRow)(const uint8* src_argb1555, uint8* dst_argb,
- int width) = ARGB1555ToARGBRow_C;
+ int pix) = ARGB1555ToARGBRow_C;
if (!src_argb1555 || !dst_argb ||
width <= 0 || height == 0) {
return -1;
@@ -1102,7 +666,7 @@
int width, int height) {
int y;
void (*ARGB4444ToARGBRow)(const uint8* src_argb4444, uint8* dst_argb,
- int width) = ARGB4444ToARGBRow_C;
+ int pix) = ARGB4444ToARGBRow_C;
if (!src_argb4444 || !dst_argb ||
width <= 0 || height == 0) {
return -1;
@@ -1163,7 +727,6 @@
void (*NV12ToARGBRow)(const uint8* y_buf,
const uint8* uv_buf,
uint8* rgb_buf,
- const struct YuvConstants* yuvconstants,
int width) = NV12ToARGBRow_C;
if (!src_y || !src_uv || !dst_argb ||
width <= 0 || height == 0) {
@@ -1201,7 +764,7 @@
#endif
for (y = 0; y < height; ++y) {
- NV12ToARGBRow(src_y, src_uv, dst_argb, &kYuvI601Constants, width);
+ NV12ToARGBRow(src_y, src_uv, dst_argb, width);
dst_argb += dst_stride_argb;
src_y += src_stride_y;
if (y & 1) {
@@ -1221,7 +784,6 @@
void (*NV21ToARGBRow)(const uint8* y_buf,
const uint8* uv_buf,
uint8* rgb_buf,
- const struct YuvConstants* yuvconstants,
int width) = NV21ToARGBRow_C;
if (!src_y || !src_uv || !dst_argb ||
width <= 0 || height == 0) {
@@ -1259,7 +821,7 @@
#endif
for (y = 0; y < height; ++y) {
- NV21ToARGBRow(src_y, src_uv, dst_argb, &kYuvI601Constants, width);
+ NV21ToARGBRow(src_y, src_uv, dst_argb, width);
dst_argb += dst_stride_argb;
src_y += src_stride_y;
if (y & 1) {
@@ -1278,7 +840,6 @@
void (*NV12ToARGBRow)(const uint8* y_buf,
const uint8* uv_buf,
uint8* rgb_buf,
- const struct YuvConstants* yuvconstants,
int width) = NV12ToARGBRow_C;
if (!src_m420 || !dst_argb ||
width <= 0 || height == 0) {
@@ -1316,16 +877,14 @@
#endif
for (y = 0; y < height - 1; y += 2) {
- NV12ToARGBRow(src_m420, src_m420 + src_stride_m420 * 2, dst_argb,
- &kYuvI601Constants, width);
+ NV12ToARGBRow(src_m420, src_m420 + src_stride_m420 * 2, dst_argb, width);
NV12ToARGBRow(src_m420 + src_stride_m420, src_m420 + src_stride_m420 * 2,
- dst_argb + dst_stride_argb, &kYuvI601Constants, width);
+ dst_argb + dst_stride_argb, width);
dst_argb += dst_stride_argb * 2;
src_m420 += src_stride_m420 * 3;
}
if (height & 1) {
- NV12ToARGBRow(src_m420, src_m420 + src_stride_m420 * 2, dst_argb,
- &kYuvI601Constants, width);
+ NV12ToARGBRow(src_m420, src_m420 + src_stride_m420 * 2, dst_argb, width);
}
return 0;
}
@@ -1336,10 +895,7 @@
uint8* dst_argb, int dst_stride_argb,
int width, int height) {
int y;
- void (*YUY2ToARGBRow)(const uint8* src_yuy2,
- uint8* dst_argb,
- const struct YuvConstants* yuvconstants,
- int width) =
+ void (*YUY2ToARGBRow)(const uint8* src_yuy2, uint8* dst_argb, int pix) =
YUY2ToARGBRow_C;
if (!src_yuy2 || !dst_argb ||
width <= 0 || height == 0) {
@@ -1383,7 +939,7 @@
}
#endif
for (y = 0; y < height; ++y) {
- YUY2ToARGBRow(src_yuy2, dst_argb, &kYuvI601Constants, width);
+ YUY2ToARGBRow(src_yuy2, dst_argb, width);
src_yuy2 += src_stride_yuy2;
dst_argb += dst_stride_argb;
}
@@ -1396,10 +952,7 @@
uint8* dst_argb, int dst_stride_argb,
int width, int height) {
int y;
- void (*UYVYToARGBRow)(const uint8* src_uyvy,
- uint8* dst_argb,
- const struct YuvConstants* yuvconstants,
- int width) =
+ void (*UYVYToARGBRow)(const uint8* src_uyvy, uint8* dst_argb, int pix) =
UYVYToARGBRow_C;
if (!src_uyvy || !dst_argb ||
width <= 0 || height == 0) {
@@ -1443,9 +996,155 @@
}
#endif
for (y = 0; y < height; ++y) {
- UYVYToARGBRow(src_uyvy, dst_argb, &kYuvI601Constants, width);
+ UYVYToARGBRow(src_uyvy, dst_argb, width);
src_uyvy += src_stride_uyvy;
dst_argb += dst_stride_argb;
+ }
+ return 0;
+}
+
+// Convert J420 to ARGB.
+LIBYUV_API
+int J420ToARGB(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height) {
+ int y;
+ void (*J422ToARGBRow)(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* rgb_buf,
+ int width) = J422ToARGBRow_C;
+ if (!src_y || !src_u || !src_v || !dst_argb ||
+ width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_argb = dst_argb + (height - 1) * dst_stride_argb;
+ dst_stride_argb = -dst_stride_argb;
+ }
+#if defined(HAS_J422TOARGBROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ J422ToARGBRow = J422ToARGBRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 8)) {
+ J422ToARGBRow = J422ToARGBRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_J422TOARGBROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ J422ToARGBRow = J422ToARGBRow_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ J422ToARGBRow = J422ToARGBRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_J422TOARGBROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ J422ToARGBRow = J422ToARGBRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ J422ToARGBRow = J422ToARGBRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_J422TOARGBROW_MIPS_DSPR2)
+ if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(width, 4) &&
+ IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) &&
+ IS_ALIGNED(src_u, 2) && IS_ALIGNED(src_stride_u, 2) &&
+ IS_ALIGNED(src_v, 2) && IS_ALIGNED(src_stride_v, 2) &&
+ IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride_argb, 4)) {
+ J422ToARGBRow = J422ToARGBRow_MIPS_DSPR2;
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ J422ToARGBRow(src_y, src_u, src_v, dst_argb, width);
+ dst_argb += dst_stride_argb;
+ src_y += src_stride_y;
+ if (y & 1) {
+ src_u += src_stride_u;
+ src_v += src_stride_v;
+ }
+ }
+ return 0;
+}
+
+// Convert J422 to ARGB.
+LIBYUV_API
+int J422ToARGB(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height) {
+ int y;
+ void (*J422ToARGBRow)(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* rgb_buf,
+ int width) = J422ToARGBRow_C;
+ if (!src_y || !src_u || !src_v ||
+ !dst_argb ||
+ width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_argb = dst_argb + (height - 1) * dst_stride_argb;
+ dst_stride_argb = -dst_stride_argb;
+ }
+ // Coalesce rows.
+ if (src_stride_y == width &&
+ src_stride_u * 2 == width &&
+ src_stride_v * 2 == width &&
+ dst_stride_argb == width * 4) {
+ width *= height;
+ height = 1;
+ src_stride_y = src_stride_u = src_stride_v = dst_stride_argb = 0;
+ }
+#if defined(HAS_J422TOARGBROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ J422ToARGBRow = J422ToARGBRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 8)) {
+ J422ToARGBRow = J422ToARGBRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_J422TOARGBROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ J422ToARGBRow = J422ToARGBRow_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ J422ToARGBRow = J422ToARGBRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_J422TOARGBROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ J422ToARGBRow = J422ToARGBRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ J422ToARGBRow = J422ToARGBRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_J422TOARGBROW_MIPS_DSPR2)
+ if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(width, 4) &&
+ IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) &&
+ IS_ALIGNED(src_u, 2) && IS_ALIGNED(src_stride_u, 2) &&
+ IS_ALIGNED(src_v, 2) && IS_ALIGNED(src_stride_v, 2) &&
+ IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride_argb, 4)) {
+ J422ToARGBRow = J422ToARGBRow_MIPS_DSPR2;
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ J422ToARGBRow(src_y, src_u, src_v, dst_argb, width);
+ dst_argb += dst_stride_argb;
+ src_y += src_stride_y;
+ src_u += src_stride_u;
+ src_v += src_stride_v;
}
return 0;
}
--- a/third_party/libyuv/source/convert_from.cc
+++ b/third_party/libyuv/source/convert_from.cc
@@ -445,26 +445,25 @@
return I420ToNV12(src_y, src_stride_y,
src_v, src_stride_v,
src_u, src_stride_u,
- dst_y, dst_stride_y,
+ dst_y, src_stride_y,
dst_vu, dst_stride_vu,
width, height);
}
-// Convert I422 to RGBA with matrix
-static int I420ToRGBAMatrix(const uint8* src_y, int src_stride_y,
- const uint8* src_u, int src_stride_u,
- const uint8* src_v, int src_stride_v,
- uint8* dst_rgba, int dst_stride_rgba,
- const struct YuvConstants* yuvconstants,
- int width, int height) {
+// Convert I420 to ARGB.
+LIBYUV_API
+int I420ToARGB(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height) {
int y;
- void (*I422ToRGBARow)(const uint8* y_buf,
+ void (*I422ToARGBRow)(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
- const struct YuvConstants* yuvconstants,
- int width) = I422ToRGBARow_C;
- if (!src_y || !src_u || !src_v || !dst_rgba ||
+ int width) = I422ToARGBRow_C;
+ if (!src_y || !src_u || !src_v || !dst_argb ||
width <= 0 || height == 0) {
return -1;
}
@@ -471,46 +470,46 @@
// Negative height means invert the image.
if (height < 0) {
height = -height;
- dst_rgba = dst_rgba + (height - 1) * dst_stride_rgba;
- dst_stride_rgba = -dst_stride_rgba;
+ dst_argb = dst_argb + (height - 1) * dst_stride_argb;
+ dst_stride_argb = -dst_stride_argb;
}
-#if defined(HAS_I422TORGBAROW_SSSE3)
+#if defined(HAS_I422TOARGBROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
- I422ToRGBARow = I422ToRGBARow_Any_SSSE3;
+ I422ToARGBRow = I422ToARGBRow_Any_SSSE3;
if (IS_ALIGNED(width, 8)) {
- I422ToRGBARow = I422ToRGBARow_SSSE3;
+ I422ToARGBRow = I422ToARGBRow_SSSE3;
}
}
#endif
-#if defined(HAS_I422TORGBAROW_AVX2)
+#if defined(HAS_I422TOARGBROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
- I422ToRGBARow = I422ToRGBARow_Any_AVX2;
+ I422ToARGBRow = I422ToARGBRow_Any_AVX2;
if (IS_ALIGNED(width, 16)) {
- I422ToRGBARow = I422ToRGBARow_AVX2;
+ I422ToARGBRow = I422ToARGBRow_AVX2;
}
}
#endif
-#if defined(HAS_I422TORGBAROW_NEON)
+#if defined(HAS_I422TOARGBROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
- I422ToRGBARow = I422ToRGBARow_Any_NEON;
+ I422ToARGBRow = I422ToARGBRow_Any_NEON;
if (IS_ALIGNED(width, 8)) {
- I422ToRGBARow = I422ToRGBARow_NEON;
+ I422ToARGBRow = I422ToARGBRow_NEON;
}
}
#endif
-#if defined(HAS_I422TORGBAROW_DSPR2)
- if (TestCpuFlag(kCpuHasDSPR2) && IS_ALIGNED(width, 4) &&
+#if defined(HAS_I422TOARGBROW_MIPS_DSPR2)
+ if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(width, 4) &&
IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) &&
IS_ALIGNED(src_u, 2) && IS_ALIGNED(src_stride_u, 2) &&
IS_ALIGNED(src_v, 2) && IS_ALIGNED(src_stride_v, 2) &&
- IS_ALIGNED(dst_rgba, 4) && IS_ALIGNED(dst_stride_rgba, 4)) {
- I422ToRGBARow = I422ToRGBARow_DSPR2;
+ IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride_argb, 4)) {
+ I422ToARGBRow = I422ToARGBRow_MIPS_DSPR2;
}
#endif
for (y = 0; y < height; ++y) {
- I422ToRGBARow(src_y, src_u, src_v, dst_rgba, yuvconstants, width);
- dst_rgba += dst_stride_rgba;
+ I422ToARGBRow(src_y, src_u, src_v, dst_argb, width);
+ dst_argb += dst_stride_argb;
src_y += src_stride_y;
if (y & 1) {
src_u += src_stride_u;
@@ -520,49 +519,207 @@
return 0;
}
-// Convert I420 to RGBA.
+// Convert I420 to BGRA.
LIBYUV_API
-int I420ToRGBA(const uint8* src_y, int src_stride_y,
+int I420ToBGRA(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u,
const uint8* src_v, int src_stride_v,
- uint8* dst_rgba, int dst_stride_rgba,
+ uint8* dst_bgra, int dst_stride_bgra,
int width, int height) {
- return I420ToRGBAMatrix(src_y, src_stride_y,
- src_u, src_stride_u,
- src_v, src_stride_v,
- dst_rgba, dst_stride_rgba,
- &kYuvI601Constants,
- width, height);
+ int y;
+ void (*I422ToBGRARow)(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* rgb_buf,
+ int width) = I422ToBGRARow_C;
+ if (!src_y || !src_u || !src_v || !dst_bgra ||
+ width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_bgra = dst_bgra + (height - 1) * dst_stride_bgra;
+ dst_stride_bgra = -dst_stride_bgra;
+ }
+#if defined(HAS_I422TOBGRAROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ I422ToBGRARow = I422ToBGRARow_Any_SSSE3;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToBGRARow = I422ToBGRARow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_I422TOBGRAROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ I422ToBGRARow = I422ToBGRARow_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ I422ToBGRARow = I422ToBGRARow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_I422TOBGRAROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ I422ToBGRARow = I422ToBGRARow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToBGRARow = I422ToBGRARow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_I422TOBGRAROW_MIPS_DSPR2)
+ if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(width, 4) &&
+ IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) &&
+ IS_ALIGNED(src_u, 2) && IS_ALIGNED(src_stride_u, 2) &&
+ IS_ALIGNED(src_v, 2) && IS_ALIGNED(src_stride_v, 2) &&
+ IS_ALIGNED(dst_bgra, 4) && IS_ALIGNED(dst_stride_bgra, 4)) {
+ I422ToBGRARow = I422ToBGRARow_MIPS_DSPR2;
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ I422ToBGRARow(src_y, src_u, src_v, dst_bgra, width);
+ dst_bgra += dst_stride_bgra;
+ src_y += src_stride_y;
+ if (y & 1) {
+ src_u += src_stride_u;
+ src_v += src_stride_v;
+ }
+ }
+ return 0;
}
-// Convert I420 to BGRA.
+// Convert I420 to ABGR.
LIBYUV_API
-int I420ToBGRA(const uint8* src_y, int src_stride_y,
+int I420ToABGR(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u,
const uint8* src_v, int src_stride_v,
- uint8* dst_bgra, int dst_stride_bgra,
+ uint8* dst_abgr, int dst_stride_abgr,
int width, int height) {
- return I420ToRGBAMatrix(src_y, src_stride_y,
- src_v, src_stride_v, // Swap U and V
- src_u, src_stride_u,
- dst_bgra, dst_stride_bgra,
- &kYvuI601Constants, // Use Yvu matrix
- width, height);
+ int y;
+ void (*I422ToABGRRow)(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* rgb_buf,
+ int width) = I422ToABGRRow_C;
+ if (!src_y || !src_u || !src_v || !dst_abgr ||
+ width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_abgr = dst_abgr + (height - 1) * dst_stride_abgr;
+ dst_stride_abgr = -dst_stride_abgr;
+ }
+#if defined(HAS_I422TOABGRROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ I422ToABGRRow = I422ToABGRRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToABGRRow = I422ToABGRRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_I422TOABGRROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ I422ToABGRRow = I422ToABGRRow_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ I422ToABGRRow = I422ToABGRRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_I422TOABGRROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ I422ToABGRRow = I422ToABGRRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToABGRRow = I422ToABGRRow_NEON;
+ }
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ I422ToABGRRow(src_y, src_u, src_v, dst_abgr, width);
+ dst_abgr += dst_stride_abgr;
+ src_y += src_stride_y;
+ if (y & 1) {
+ src_u += src_stride_u;
+ src_v += src_stride_v;
+ }
+ }
+ return 0;
}
-// Convert I420 to RGB24 with matrix
-static int I420ToRGB24Matrix(const uint8* src_y, int src_stride_y,
- const uint8* src_u, int src_stride_u,
- const uint8* src_v, int src_stride_v,
- uint8* dst_rgb24, int dst_stride_rgb24,
- const struct YuvConstants* yuvconstants,
- int width, int height) {
+// Convert I420 to RGBA.
+LIBYUV_API
+int I420ToRGBA(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_rgba, int dst_stride_rgba,
+ int width, int height) {
int y;
+ void (*I422ToRGBARow)(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* rgb_buf,
+ int width) = I422ToRGBARow_C;
+ if (!src_y || !src_u || !src_v || !dst_rgba ||
+ width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_rgba = dst_rgba + (height - 1) * dst_stride_rgba;
+ dst_stride_rgba = -dst_stride_rgba;
+ }
+#if defined(HAS_I422TORGBAROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ I422ToRGBARow = I422ToRGBARow_Any_SSSE3;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToRGBARow = I422ToRGBARow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_I422TORGBAROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ I422ToRGBARow = I422ToRGBARow_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ I422ToRGBARow = I422ToRGBARow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_I422TORGBAROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ I422ToRGBARow = I422ToRGBARow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToRGBARow = I422ToRGBARow_NEON;
+ }
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ I422ToRGBARow(src_y, src_u, src_v, dst_rgba, width);
+ dst_rgba += dst_stride_rgba;
+ src_y += src_stride_y;
+ if (y & 1) {
+ src_u += src_stride_u;
+ src_v += src_stride_v;
+ }
+ }
+ return 0;
+}
+
+// Convert I420 to RGB24.
+LIBYUV_API
+int I420ToRGB24(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_rgb24, int dst_stride_rgb24,
+ int width, int height) {
+ int y;
void (*I422ToRGB24Row)(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
- const struct YuvConstants* yuvconstants,
int width) = I422ToRGB24Row_C;
if (!src_y || !src_u || !src_v || !dst_rgb24 ||
width <= 0 || height == 0) {
@@ -600,7 +757,7 @@
#endif
for (y = 0; y < height; ++y) {
- I422ToRGB24Row(src_y, src_u, src_v, dst_rgb24, yuvconstants, width);
+ I422ToRGB24Row(src_y, src_u, src_v, dst_rgb24, width);
dst_rgb24 += dst_stride_rgb24;
src_y += src_stride_y;
if (y & 1) {
@@ -611,34 +768,64 @@
return 0;
}
-// Convert I420 to RGB24.
+// Convert I420 to RAW.
LIBYUV_API
-int I420ToRGB24(const uint8* src_y, int src_stride_y,
+int I420ToRAW(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u,
const uint8* src_v, int src_stride_v,
- uint8* dst_rgb24, int dst_stride_rgb24,
+ uint8* dst_raw, int dst_stride_raw,
int width, int height) {
- return I420ToRGB24Matrix(src_y, src_stride_y,
- src_u, src_stride_u,
- src_v, src_stride_v,
- dst_rgb24, dst_stride_rgb24,
- &kYuvI601Constants,
- width, height);
-}
+ int y;
+ void (*I422ToRAWRow)(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* rgb_buf,
+ int width) = I422ToRAWRow_C;
+ if (!src_y || !src_u || !src_v || !dst_raw ||
+ width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_raw = dst_raw + (height - 1) * dst_stride_raw;
+ dst_stride_raw = -dst_stride_raw;
+ }
+#if defined(HAS_I422TORAWROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ I422ToRAWRow = I422ToRAWRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToRAWRow = I422ToRAWRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_I422TORAWROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ I422ToRAWRow = I422ToRAWRow_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ I422ToRAWRow = I422ToRAWRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_I422TORAWROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ I422ToRAWRow = I422ToRAWRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToRAWRow = I422ToRAWRow_NEON;
+ }
+ }
+#endif
-// Convert I420 to RAW.
-LIBYUV_API
-int I420ToRAW(const uint8* src_y, int src_stride_y,
- const uint8* src_u, int src_stride_u,
- const uint8* src_v, int src_stride_v,
- uint8* dst_raw, int dst_stride_raw,
- int width, int height) {
- return I420ToRGB24Matrix(src_y, src_stride_y,
- src_v, src_stride_v, // Swap U and V
- src_u, src_stride_u,
- dst_raw, dst_stride_raw,
- &kYvuI601Constants, // Use Yvu matrix
- width, height);
+ for (y = 0; y < height; ++y) {
+ I422ToRAWRow(src_y, src_u, src_v, dst_raw, width);
+ dst_raw += dst_stride_raw;
+ src_y += src_stride_y;
+ if (y & 1) {
+ src_u += src_stride_u;
+ src_v += src_stride_v;
+ }
+ }
+ return 0;
}
// Convert I420 to ARGB1555.
@@ -653,7 +840,6 @@
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
- const struct YuvConstants* yuvconstants,
int width) = I422ToARGB1555Row_C;
if (!src_y || !src_u || !src_v || !dst_argb1555 ||
width <= 0 || height == 0) {
@@ -691,8 +877,7 @@
#endif
for (y = 0; y < height; ++y) {
- I422ToARGB1555Row(src_y, src_u, src_v, dst_argb1555, &kYuvI601Constants,
- width);
+ I422ToARGB1555Row(src_y, src_u, src_v, dst_argb1555, width);
dst_argb1555 += dst_stride_argb1555;
src_y += src_stride_y;
if (y & 1) {
@@ -716,7 +901,6 @@
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
- const struct YuvConstants* yuvconstants,
int width) = I422ToARGB4444Row_C;
if (!src_y || !src_u || !src_v || !dst_argb4444 ||
width <= 0 || height == 0) {
@@ -754,8 +938,7 @@
#endif
for (y = 0; y < height; ++y) {
- I422ToARGB4444Row(src_y, src_u, src_v, dst_argb4444, &kYuvI601Constants,
- width);
+ I422ToARGB4444Row(src_y, src_u, src_v, dst_argb4444, width);
dst_argb4444 += dst_stride_argb4444;
src_y += src_stride_y;
if (y & 1) {
@@ -778,7 +961,6 @@
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
- const struct YuvConstants* yuvconstants,
int width) = I422ToRGB565Row_C;
if (!src_y || !src_u || !src_v || !dst_rgb565 ||
width <= 0 || height == 0) {
@@ -816,7 +998,7 @@
#endif
for (y = 0; y < height; ++y) {
- I422ToRGB565Row(src_y, src_u, src_v, dst_rgb565, &kYuvI601Constants, width);
+ I422ToRGB565Row(src_y, src_u, src_v, dst_rgb565, width);
dst_rgb565 += dst_stride_rgb565;
src_y += src_stride_y;
if (y & 1) {
@@ -847,10 +1029,9 @@
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
- const struct YuvConstants* yuvconstants,
int width) = I422ToARGBRow_C;
void (*ARGBToRGB565DitherRow)(const uint8* src_argb, uint8* dst_rgb,
- const uint32 dither4, int width) = ARGBToRGB565DitherRow_C;
+ const uint32 dither4, int pix) = ARGBToRGB565DitherRow_C;
if (!src_y || !src_u || !src_v || !dst_rgb565 ||
width <= 0 || height == 0) {
return -1;
@@ -888,12 +1069,12 @@
}
}
#endif
-#if defined(HAS_I422TOARGBROW_DSPR2)
- if (TestCpuFlag(kCpuHasDSPR2) && IS_ALIGNED(width, 4) &&
+#if defined(HAS_I422TOARGBROW_MIPS_DSPR2)
+ if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(width, 4) &&
IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) &&
IS_ALIGNED(src_u, 2) && IS_ALIGNED(src_stride_u, 2) &&
IS_ALIGNED(src_v, 2) && IS_ALIGNED(src_stride_v, 2)) {
- I422ToARGBRow = I422ToARGBRow_DSPR2;
+ I422ToARGBRow = I422ToARGBRow_MIPS_DSPR2;
}
#endif
#if defined(HAS_ARGBTORGB565DITHERROW_SSE2)
@@ -924,7 +1105,7 @@
// Allocate a row of argb.
align_buffer_64(row_argb, width * 4);
for (y = 0; y < height; ++y) {
- I422ToARGBRow(src_y, src_u, src_v, row_argb, &kYuvI601Constants, width);
+ I422ToARGBRow(src_y, src_u, src_v, row_argb, width);
ARGBToRGB565DitherRow(row_argb, dst_rgb565,
*(uint32*)(dither4x4 + ((y & 3) << 2)), width);
dst_rgb565 += dst_stride_rgb565;
@@ -1077,6 +1258,7 @@
// Triplanar formats
// TODO(fbarchard): halfstride instead of halfwidth
case FOURCC_I420:
+ case FOURCC_YU12:
case FOURCC_YV12: {
int halfwidth = (width + 1) / 2;
int halfheight = (height + 1) / 2;
--- a/third_party/libyuv/source/convert_from_argb.cc
+++ b/third_party/libyuv/source/convert_from_argb.cc
@@ -28,10 +28,10 @@
uint8* dst_v, int dst_stride_v,
int width, int height) {
int y;
- void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int width) =
+ void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) =
ARGBToYRow_C;
void (*ARGBToUV444Row)(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
- int width) = ARGBToUV444Row_C;
+ int pix) = ARGBToUV444Row_C;
if (!src_argb || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) {
return -1;
}
@@ -109,16 +109,13 @@
uint8* dst_v, int dst_stride_v,
int width, int height) {
int y;
- void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb,
- uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C;
- void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int width) =
+ void (*ARGBToUV422Row)(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
+ int pix) = ARGBToUV422Row_C;
+ void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) =
ARGBToYRow_C;
- if (!src_argb ||
- !dst_y || !dst_u || !dst_v ||
- width <= 0 || height == 0) {
+ if (!src_argb || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) {
return -1;
}
- // Negative height means invert the image.
if (height < 0) {
height = -height;
src_argb = src_argb + (height - 1) * src_stride_argb;
@@ -133,22 +130,34 @@
height = 1;
src_stride_argb = dst_stride_y = dst_stride_u = dst_stride_v = 0;
}
-#if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3)
+#if defined(HAS_ARGBTOUV422ROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
- ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
+ ARGBToUV422Row = ARGBToUV422Row_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToUV422Row = ARGBToUV422Row_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOUV422ROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ARGBToUV422Row = ARGBToUV422Row_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToUV422Row = ARGBToUV422Row_NEON;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOYROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
ARGBToYRow = ARGBToYRow_Any_SSSE3;
if (IS_ALIGNED(width, 16)) {
- ARGBToUVRow = ARGBToUVRow_SSSE3;
ARGBToYRow = ARGBToYRow_SSSE3;
}
}
#endif
-#if defined(HAS_ARGBTOYROW_AVX2) && defined(HAS_ARGBTOUVROW_AVX2)
+#if defined(HAS_ARGBTOYROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
- ARGBToUVRow = ARGBToUVRow_Any_AVX2;
ARGBToYRow = ARGBToYRow_Any_AVX2;
if (IS_ALIGNED(width, 32)) {
- ARGBToUVRow = ARGBToUVRow_AVX2;
ARGBToYRow = ARGBToYRow_AVX2;
}
}
@@ -161,17 +170,9 @@
}
}
#endif
-#if defined(HAS_ARGBTOUVROW_NEON)
- if (TestCpuFlag(kCpuHasNEON)) {
- ARGBToUVRow = ARGBToUVRow_Any_NEON;
- if (IS_ALIGNED(width, 16)) {
- ARGBToUVRow = ARGBToUVRow_NEON;
- }
- }
-#endif
for (y = 0; y < height; ++y) {
- ARGBToUVRow(src_argb, 0, dst_u, dst_v, width);
+ ARGBToUV422Row(src_argb, dst_u, dst_v, width);
ARGBToYRow(src_argb, dst_y, width);
src_argb += src_stride_argb;
dst_y += dst_stride_y;
@@ -190,8 +191,8 @@
int width, int height) {
int y;
void (*ARGBToUV411Row)(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
- int width) = ARGBToUV411Row_C;
- void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int width) =
+ int pix) = ARGBToUV411Row_C;
+ void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) =
ARGBToYRow_C;
if (!src_argb || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) {
return -1;
@@ -263,7 +264,7 @@
int halfwidth = (width + 1) >> 1;
void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb,
uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C;
- void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int width) =
+ void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) =
ARGBToYRow_C;
void (*MergeUVRow_)(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
int width) = MergeUVRow_C;
@@ -372,7 +373,7 @@
int halfwidth = (width + 1) >> 1;
void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb,
uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C;
- void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int width) =
+ void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) =
ARGBToYRow_C;
void (*MergeUVRow_)(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
int width) = MergeUVRow_C;
@@ -477,9 +478,9 @@
uint8* dst_yuy2, int dst_stride_yuy2,
int width, int height) {
int y;
- void (*ARGBToUVRow)(const uint8* src_argb, int src_stride_argb,
- uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C;
- void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int width) =
+ void (*ARGBToUV422Row)(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
+ int pix) = ARGBToUV422Row_C;
+ void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) =
ARGBToYRow_C;
void (*I422ToYUY2Row)(const uint8* src_y, const uint8* src_u,
const uint8* src_v, uint8* dst_yuy2, int width) = I422ToYUY2Row_C;
@@ -501,22 +502,34 @@
height = 1;
src_stride_argb = dst_stride_yuy2 = 0;
}
-#if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3)
+#if defined(HAS_ARGBTOUV422ROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
- ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
+ ARGBToUV422Row = ARGBToUV422Row_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToUV422Row = ARGBToUV422Row_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOUV422ROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ARGBToUV422Row = ARGBToUV422Row_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToUV422Row = ARGBToUV422Row_NEON;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOYROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
ARGBToYRow = ARGBToYRow_Any_SSSE3;
if (IS_ALIGNED(width, 16)) {
- ARGBToUVRow = ARGBToUVRow_SSSE3;
ARGBToYRow = ARGBToYRow_SSSE3;
}
}
#endif
-#if defined(HAS_ARGBTOYROW_AVX2) && defined(HAS_ARGBTOUVROW_AVX2)
+#if defined(HAS_ARGBTOYROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
- ARGBToUVRow = ARGBToUVRow_Any_AVX2;
ARGBToYRow = ARGBToYRow_Any_AVX2;
if (IS_ALIGNED(width, 32)) {
- ARGBToUVRow = ARGBToUVRow_AVX2;
ARGBToYRow = ARGBToYRow_AVX2;
}
}
@@ -529,14 +542,7 @@
}
}
#endif
-#if defined(HAS_ARGBTOUVROW_NEON)
- if (TestCpuFlag(kCpuHasNEON)) {
- ARGBToUVRow = ARGBToUVRow_Any_NEON;
- if (IS_ALIGNED(width, 16)) {
- ARGBToUVRow = ARGBToUVRow_NEON;
- }
- }
-#endif
+
#if defined(HAS_I422TOYUY2ROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2)) {
I422ToYUY2Row = I422ToYUY2Row_Any_SSE2;
@@ -561,7 +567,7 @@
uint8* row_v = row_u + ((width + 63) & ~63) / 2;
for (y = 0; y < height; ++y) {
- ARGBToUVRow(src_argb, 0, row_u, row_v, width);
+ ARGBToUV422Row(src_argb, row_u, row_v, width);
ARGBToYRow(src_argb, row_y, width);
I422ToYUY2Row(row_y, row_u, row_v, dst_yuy2, width);
src_argb += src_stride_argb;
@@ -579,9 +585,9 @@
uint8* dst_uyvy, int dst_stride_uyvy,
int width, int height) {
int y;
- void (*ARGBToUVRow)(const uint8* src_argb, int src_stride_argb,
- uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C;
- void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int width) =
+ void (*ARGBToUV422Row)(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
+ int pix) = ARGBToUV422Row_C;
+ void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) =
ARGBToYRow_C;
void (*I422ToUYVYRow)(const uint8* src_y, const uint8* src_u,
const uint8* src_v, uint8* dst_uyvy, int width) = I422ToUYVYRow_C;
@@ -603,22 +609,34 @@
height = 1;
src_stride_argb = dst_stride_uyvy = 0;
}
-#if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3)
+#if defined(HAS_ARGBTOUV422ROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
- ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
+ ARGBToUV422Row = ARGBToUV422Row_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToUV422Row = ARGBToUV422Row_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOUV422ROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ARGBToUV422Row = ARGBToUV422Row_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToUV422Row = ARGBToUV422Row_NEON;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOYROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
ARGBToYRow = ARGBToYRow_Any_SSSE3;
if (IS_ALIGNED(width, 16)) {
- ARGBToUVRow = ARGBToUVRow_SSSE3;
ARGBToYRow = ARGBToYRow_SSSE3;
}
}
#endif
-#if defined(HAS_ARGBTOYROW_AVX2) && defined(HAS_ARGBTOUVROW_AVX2)
+#if defined(HAS_ARGBTOYROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
- ARGBToUVRow = ARGBToUVRow_Any_AVX2;
ARGBToYRow = ARGBToYRow_Any_AVX2;
if (IS_ALIGNED(width, 32)) {
- ARGBToUVRow = ARGBToUVRow_AVX2;
ARGBToYRow = ARGBToYRow_AVX2;
}
}
@@ -631,14 +649,7 @@
}
}
#endif
-#if defined(HAS_ARGBTOUVROW_NEON)
- if (TestCpuFlag(kCpuHasNEON)) {
- ARGBToUVRow = ARGBToUVRow_Any_NEON;
- if (IS_ALIGNED(width, 16)) {
- ARGBToUVRow = ARGBToUVRow_NEON;
- }
- }
-#endif
+
#if defined(HAS_I422TOUYVYROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2)) {
I422ToUYVYRow = I422ToUYVYRow_Any_SSE2;
@@ -663,7 +674,7 @@
uint8* row_v = row_u + ((width + 63) & ~63) / 2;
for (y = 0; y < height; ++y) {
- ARGBToUVRow(src_argb, 0, row_u, row_v, width);
+ ARGBToUV422Row(src_argb, row_u, row_v, width);
ARGBToYRow(src_argb, row_y, width);
I422ToUYVYRow(row_y, row_u, row_v, dst_uyvy, width);
src_argb += src_stride_argb;
@@ -681,7 +692,7 @@
uint8* dst_y, int dst_stride_y,
int width, int height) {
int y;
- void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int width) =
+ void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) =
ARGBToYRow_C;
if (!src_argb || !dst_y || width <= 0 || height == 0) {
return -1;
@@ -753,7 +764,7 @@
uint8* dst_rgb24, int dst_stride_rgb24,
int width, int height) {
int y;
- void (*ARGBToRGB24Row)(const uint8* src_argb, uint8* dst_rgb, int width) =
+ void (*ARGBToRGB24Row)(const uint8* src_argb, uint8* dst_rgb, int pix) =
ARGBToRGB24Row_C;
if (!src_argb || !dst_rgb24 || width <= 0 || height == 0) {
return -1;
@@ -801,7 +812,7 @@
uint8* dst_raw, int dst_stride_raw,
int width, int height) {
int y;
- void (*ARGBToRAWRow)(const uint8* src_argb, uint8* dst_rgb, int width) =
+ void (*ARGBToRAWRow)(const uint8* src_argb, uint8* dst_rgb, int pix) =
ARGBToRAWRow_C;
if (!src_argb || !dst_raw || width <= 0 || height == 0) {
return -1;
@@ -858,7 +869,7 @@
const uint8* dither4x4, int width, int height) {
int y;
void (*ARGBToRGB565DitherRow)(const uint8* src_argb, uint8* dst_rgb,
- const uint32 dither4, int width) = ARGBToRGB565DitherRow_C;
+ const uint32 dither4, int pix) = ARGBToRGB565DitherRow_C;
if (!src_argb || !dst_rgb565 || width <= 0 || height == 0) {
return -1;
}
@@ -910,7 +921,7 @@
uint8* dst_rgb565, int dst_stride_rgb565,
int width, int height) {
int y;
- void (*ARGBToRGB565Row)(const uint8* src_argb, uint8* dst_rgb, int width) =
+ void (*ARGBToRGB565Row)(const uint8* src_argb, uint8* dst_rgb, int pix) =
ARGBToRGB565Row_C;
if (!src_argb || !dst_rgb565 || width <= 0 || height == 0) {
return -1;
@@ -966,7 +977,7 @@
uint8* dst_argb1555, int dst_stride_argb1555,
int width, int height) {
int y;
- void (*ARGBToARGB1555Row)(const uint8* src_argb, uint8* dst_rgb, int width) =
+ void (*ARGBToARGB1555Row)(const uint8* src_argb, uint8* dst_rgb, int pix) =
ARGBToARGB1555Row_C;
if (!src_argb || !dst_argb1555 || width <= 0 || height == 0) {
return -1;
@@ -1022,7 +1033,7 @@
uint8* dst_argb4444, int dst_stride_argb4444,
int width, int height) {
int y;
- void (*ARGBToARGB4444Row)(const uint8* src_argb, uint8* dst_rgb, int width) =
+ void (*ARGBToARGB4444Row)(const uint8* src_argb, uint8* dst_rgb, int pix) =
ARGBToARGB4444Row_C;
if (!src_argb || !dst_argb4444 || width <= 0 || height == 0) {
return -1;
@@ -1082,7 +1093,7 @@
int y;
void (*ARGBToUVJRow)(const uint8* src_argb0, int src_stride_argb,
uint8* dst_u, uint8* dst_v, int width) = ARGBToUVJRow_C;
- void (*ARGBToYJRow)(const uint8* src_argb, uint8* dst_yj, int width) =
+ void (*ARGBToYJRow)(const uint8* src_argb, uint8* dst_yj, int pix) =
ARGBToYJRow_C;
if (!src_argb ||
!dst_yj || !dst_u || !dst_v ||
@@ -1146,24 +1157,21 @@
return 0;
}
-// Convert ARGB to J422. (JPeg full range I422).
+// ARGB little endian (bgra in memory) to J422
LIBYUV_API
int ARGBToJ422(const uint8* src_argb, int src_stride_argb,
- uint8* dst_yj, int dst_stride_yj,
+ uint8* dst_y, int dst_stride_y,
uint8* dst_u, int dst_stride_u,
uint8* dst_v, int dst_stride_v,
int width, int height) {
int y;
- void (*ARGBToUVJRow)(const uint8* src_argb0, int src_stride_argb,
- uint8* dst_u, uint8* dst_v, int width) = ARGBToUVJRow_C;
- void (*ARGBToYJRow)(const uint8* src_argb, uint8* dst_yj, int width) =
+ void (*ARGBToUVJ422Row)(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
+ int pix) = ARGBToUVJ422Row_C;
+ void (*ARGBToYJRow)(const uint8* src_argb, uint8* dst_y, int pix) =
ARGBToYJRow_C;
- if (!src_argb ||
- !dst_yj || !dst_u || !dst_v ||
- width <= 0 || height == 0) {
+ if (!src_argb || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) {
return -1;
}
- // Negative height means invert the image.
if (height < 0) {
height = -height;
src_argb = src_argb + (height - 1) * src_stride_argb;
@@ -1171,19 +1179,34 @@
}
// Coalesce rows.
if (src_stride_argb == width * 4 &&
- dst_stride_yj == width &&
+ dst_stride_y == width &&
dst_stride_u * 2 == width &&
dst_stride_v * 2 == width) {
width *= height;
height = 1;
- src_stride_argb = dst_stride_yj = dst_stride_u = dst_stride_v = 0;
+ src_stride_argb = dst_stride_y = dst_stride_u = dst_stride_v = 0;
}
-#if defined(HAS_ARGBTOYJROW_SSSE3) && defined(HAS_ARGBTOUVJROW_SSSE3)
+#if defined(HAS_ARGBTOUVJ422ROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
- ARGBToUVJRow = ARGBToUVJRow_Any_SSSE3;
+ ARGBToUVJ422Row = ARGBToUVJ422Row_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToUVJ422Row = ARGBToUVJ422Row_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOUVJ422ROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ARGBToUVJ422Row = ARGBToUVJ422Row_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToUVJ422Row = ARGBToUVJ422Row_NEON;
+ }
+ }
+#endif
+
+#if defined(HAS_ARGBTOYJROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
ARGBToYJRow = ARGBToYJRow_Any_SSSE3;
if (IS_ALIGNED(width, 16)) {
- ARGBToUVJRow = ARGBToUVJRow_SSSE3;
ARGBToYJRow = ARGBToYJRow_SSSE3;
}
}
@@ -1204,20 +1227,12 @@
}
}
#endif
-#if defined(HAS_ARGBTOUVJROW_NEON)
- if (TestCpuFlag(kCpuHasNEON)) {
- ARGBToUVJRow = ARGBToUVJRow_Any_NEON;
- if (IS_ALIGNED(width, 16)) {
- ARGBToUVJRow = ARGBToUVJRow_NEON;
- }
- }
-#endif
for (y = 0; y < height; ++y) {
- ARGBToUVJRow(src_argb, 0, dst_u, dst_v, width);
- ARGBToYJRow(src_argb, dst_yj, width);
+ ARGBToUVJ422Row(src_argb, dst_u, dst_v, width);
+ ARGBToYJRow(src_argb, dst_y, width);
src_argb += src_stride_argb;
- dst_yj += dst_stride_yj;
+ dst_y += dst_stride_y;
dst_u += dst_stride_u;
dst_v += dst_stride_v;
}
@@ -1230,7 +1245,7 @@
uint8* dst_yj, int dst_stride_yj,
int width, int height) {
int y;
- void (*ARGBToYJRow)(const uint8* src_argb, uint8* dst_yj, int width) =
+ void (*ARGBToYJRow)(const uint8* src_argb, uint8* dst_yj, int pix) =
ARGBToYJRow_C;
if (!src_argb || !dst_yj || width <= 0 || height == 0) {
return -1;
--- a/third_party/libyuv/source/convert_jpeg.cc
+++ b/third_party/libyuv/source/convert_jpeg.cc
@@ -9,7 +9,6 @@
*/
#include "libyuv/convert.h"
-#include "libyuv/convert_argb.h"
#ifdef HAVE_JPEG
#include "libyuv/mjpeg_decoder.h"
--- a/third_party/libyuv/source/convert_to_argb.cc
+++ b/third_party/libyuv/source/convert_to_argb.cc
@@ -23,7 +23,7 @@
extern "C" {
#endif
-// Convert camera sample to ARGB with cropping, rotation and vertical flip.
+// Convert camera sample to I420 with cropping, rotation and vertical flip.
// src_width is used for source stride computation
// src_height is used to compute location of planes, and indicate inversion
// sample_size is measured in bytes and is the size of the frame.
@@ -51,8 +51,8 @@
// also enable temporary buffer.
LIBYUV_BOOL need_buf = (rotation && format != FOURCC_ARGB) ||
crop_argb == sample;
- uint8* dest_argb = crop_argb;
- int dest_argb_stride = argb_stride;
+ uint8* tmp_argb = crop_argb;
+ int tmp_argb_stride = argb_stride;
uint8* rotate_buffer = NULL;
int abs_crop_height = (crop_height < 0) ? -crop_height : crop_height;
@@ -66,13 +66,13 @@
}
if (need_buf) {
- int argb_size = crop_width * 4 * abs_crop_height;
+ int argb_size = crop_width * abs_crop_height * 4;
rotate_buffer = (uint8*)malloc(argb_size);
if (!rotate_buffer) {
return 1; // Out of memory runtime error.
}
crop_argb = rotate_buffer;
- argb_stride = crop_width * 4;
+ argb_stride = crop_width;
}
switch (format) {
@@ -176,6 +176,7 @@
break;
// Triplanar formats
case FOURCC_I420:
+ case FOURCC_YU12:
case FOURCC_YV12: {
const uint8* src_y = sample + (src_width * crop_y + crop_x);
const uint8* src_u;
@@ -290,7 +291,7 @@
if (need_buf) {
if (!r) {
r = ARGBRotate(crop_argb, argb_stride,
- dest_argb, dest_argb_stride,
+ tmp_argb, tmp_argb_stride,
crop_width, abs_crop_height, rotation);
}
free(rotate_buffer);
--- a/third_party/libyuv/source/convert_to_i420.cc
+++ b/third_party/libyuv/source/convert_to_i420.cc
@@ -39,13 +39,12 @@
int aligned_src_width = (src_width + 1) & ~1;
const uint8* src;
const uint8* src_uv;
- const int abs_src_height = (src_height < 0) ? -src_height : src_height;
- // TODO(nisse): Why allow crop_height < 0?
- const int abs_crop_height = (crop_height < 0) ? -crop_height : crop_height;
+ int abs_src_height = (src_height < 0) ? -src_height : src_height;
+ int inv_crop_height = (crop_height < 0) ? -crop_height : crop_height;
int r = 0;
LIBYUV_BOOL need_buf = (rotation && format != FOURCC_I420 &&
format != FOURCC_NV12 && format != FOURCC_NV21 &&
- format != FOURCC_YV12) || y == sample;
+ format != FOURCC_YU12 && format != FOURCC_YV12) || y == sample;
uint8* tmp_y = y;
uint8* tmp_u = u;
uint8* tmp_v = v;
@@ -53,8 +52,7 @@
int tmp_u_stride = u_stride;
int tmp_v_stride = v_stride;
uint8* rotate_buffer = NULL;
- const int inv_crop_height =
- (src_height < 0) ? -abs_crop_height : abs_crop_height;
+ int abs_crop_height = (crop_height < 0) ? -crop_height : crop_height;
if (!y || !u || !v || !sample ||
src_width <= 0 || crop_width <= 0 ||
@@ -61,6 +59,9 @@
src_height == 0 || crop_height == 0) {
return -1;
}
+ if (src_height < 0) {
+ inv_crop_height = -inv_crop_height;
+ }
// One pass rotation is available for some formats. For the rest, convert
// to I420 (with optional vertical flipping) into a temporary I420 buffer,
@@ -213,6 +214,7 @@
break;
// Triplanar formats
case FOURCC_I420:
+ case FOURCC_YU12:
case FOURCC_YV12: {
const uint8* src_y = sample + (src_width * crop_y + crop_x);
const uint8* src_u;
--- a/third_party/libyuv/source/cpu_id.cc
+++ b/third_party/libyuv/source/cpu_id.cc
@@ -10,12 +10,12 @@
#include "libyuv/cpu_id.h"
-#if defined(_MSC_VER)
+#if (defined(_MSC_VER) && !defined(__clang__)) && !defined(__clang__)
#include <intrin.h> // For __cpuidex()
#endif
#if !defined(__pnacl__) && !defined(__CLR_VER) && \
!defined(__native_client__) && (defined(_M_IX86) || defined(_M_X64)) && \
- defined(_MSC_FULL_VER) && (_MSC_FULL_VER >= 160040219)
+ defined(_MSC_VER) && !defined(__clang__) && (_MSC_FULL_VER >= 160040219)
#include <immintrin.h> // For _xgetbv()
#endif
@@ -36,8 +36,7 @@
// For functions that use the stack and have runtime checks for overflow,
// use SAFEBUFFERS to avoid additional check.
-#if defined(_MSC_FULL_VER) && (_MSC_FULL_VER >= 160040219) && \
- !defined(__clang__)
+#if (defined(_MSC_VER) && !defined(__clang__)) && (_MSC_FULL_VER >= 160040219)
#define SAFEBUFFERS __declspec(safebuffers)
#else
#define SAFEBUFFERS
@@ -49,9 +48,9 @@
!defined(__pnacl__) && !defined(__CLR_VER)
LIBYUV_API
void CpuId(uint32 info_eax, uint32 info_ecx, uint32* cpu_info) {
-#if defined(_MSC_VER)
+#if (defined(_MSC_VER) && !defined(__clang__)) && !defined(__clang__)
// Visual C version uses intrinsic or inline x86 assembly.
-#if defined(_MSC_FULL_VER) && (_MSC_FULL_VER >= 160040219)
+#if (_MSC_FULL_VER >= 160040219)
__cpuidex((int*)(cpu_info), info_eax, info_ecx);
#elif defined(_M_IX86)
__asm {
@@ -64,7 +63,7 @@
mov [edi + 8], ecx
mov [edi + 12], edx
}
-#else // Visual C but not x86
+#else
if (info_ecx == 0) {
__cpuid((int*)(cpu_info), info_eax);
} else {
@@ -72,9 +71,9 @@
}
#endif
// GCC version uses inline x86 assembly.
-#else // defined(_MSC_VER)
+#else // (defined(_MSC_VER) && !defined(__clang__)) && !defined(__clang__)
uint32 info_ebx, info_edx;
- asm volatile (
+ asm volatile ( // NOLINT
#if defined( __i386__) && defined(__PIC__)
// Preserve ebx for fpic 32 bit.
"mov %%ebx, %%edi \n"
@@ -90,7 +89,7 @@
cpu_info[1] = info_ebx;
cpu_info[2] = info_ecx;
cpu_info[3] = info_edx;
-#endif // defined(_MSC_VER)
+#endif // (defined(_MSC_VER) && !defined(__clang__)) && !defined(__clang__)
}
#else // (defined(_M_IX86) || defined(_M_X64) ...
LIBYUV_API
@@ -99,37 +98,28 @@
}
#endif
-// For VS2010 and earlier emit can be used:
-// _asm _emit 0x0f _asm _emit 0x01 _asm _emit 0xd0 // For VS2010 and earlier.
-// __asm {
-// xor ecx, ecx // xcr 0
-// xgetbv
-// mov xcr0, eax
-// }
-// For VS2013 and earlier 32 bit, the _xgetbv(0) optimizer produces bad code.
-// https://code.google.com/p/libyuv/issues/detail?id=529
-#if defined(_M_IX86) && (_MSC_VER < 1900)
-#pragma optimize("g", off)
-#endif
+// TODO(fbarchard): Enable xgetbv when validator supports it.
#if (defined(_M_IX86) || defined(_M_X64) || \
defined(__i386__) || defined(__x86_64__)) && \
!defined(__pnacl__) && !defined(__CLR_VER) && !defined(__native_client__)
#define HAS_XGETBV
// X86 CPUs have xgetbv to detect OS saves high parts of ymm registers.
-int GetXCR0() {
+int TestOsSaveYmm() {
uint32 xcr0 = 0u;
-#if defined(_MSC_FULL_VER) && (_MSC_FULL_VER >= 160040219)
+#if (defined(_MSC_VER) && !defined(__clang__)) && (_MSC_FULL_VER >= 160040219)
xcr0 = (uint32)(_xgetbv(0)); // VS2010 SP1 required.
+#elif defined(_M_IX86) && defined(_MSC_VER) && !defined(__clang__)
+ __asm {
+ xor ecx, ecx // xcr 0
+ _asm _emit 0x0f _asm _emit 0x01 _asm _emit 0xd0 // For VS2010 and earlier.
+ mov xcr0, eax
+ }
#elif defined(__i386__) || defined(__x86_64__)
asm(".byte 0x0f, 0x01, 0xd0" : "=a" (xcr0) : "c" (0) : "%edx");
#endif // defined(__i386__) || defined(__x86_64__)
- return xcr0;
+ return((xcr0 & 6) == 6); // Is ymm saved?
}
#endif // defined(_M_IX86) || defined(_M_X64) ..
-// Return optimization to previous setting.
-#if defined(_M_IX86) && (_MSC_VER < 1900)
-#pragma optimize("g", on)
-#endif
// based on libvpx arm_cpudetect.c
// For Arm, but public to allow testing on any CPU
@@ -161,9 +151,30 @@
return 0;
}
+#if defined(__mips__) && defined(__linux__)
+static int MipsCpuCaps(const char* search_string) {
+ char cpuinfo_line[512];
+ const char* file_name = "/proc/cpuinfo";
+ FILE* f = fopen(file_name, "r");
+ if (!f) {
+ // Assume DSP if /proc/cpuinfo is unavailable.
+ // This will occur for Chrome sandbox for Pepper or Render process.
+ return kCpuHasMIPS_DSP;
+ }
+ while (fgets(cpuinfo_line, sizeof(cpuinfo_line) - 1, f) != NULL) {
+ if (strstr(cpuinfo_line, search_string) != NULL) {
+ fclose(f);
+ return kCpuHasMIPS_DSP;
+ }
+ }
+ fclose(f);
+ return 0;
+}
+#endif
+
// CPU detect function for SIMD instruction sets.
LIBYUV_API
-int cpu_info_ = 0; // cpu_info is not initialized yet.
+int cpu_info_ = kCpuInit; // cpu_info is not initialized yet.
// Test environment variable for disabling CPU features. Any non-zero value
// to disable. Zero ignored to make it easy to set the variable on/off.
@@ -186,9 +197,8 @@
LIBYUV_API SAFEBUFFERS
int InitCpuFlags(void) {
- // TODO(fbarchard): swap kCpuInit logic so 0 means uninitialized.
- int cpu_info = 0;
#if !defined(__pnacl__) && !defined(__CLR_VER) && defined(CPU_X86)
+
uint32 cpu_info0[4] = { 0, 0, 0, 0 };
uint32 cpu_info1[4] = { 0, 0, 0, 0 };
uint32 cpu_info7[4] = { 0, 0, 0, 0 };
@@ -197,67 +207,67 @@
if (cpu_info0[0] >= 7) {
CpuId(7, 0, cpu_info7);
}
- cpu_info = ((cpu_info1[3] & 0x04000000) ? kCpuHasSSE2 : 0) |
- ((cpu_info1[2] & 0x00000200) ? kCpuHasSSSE3 : 0) |
- ((cpu_info1[2] & 0x00080000) ? kCpuHasSSE41 : 0) |
- ((cpu_info1[2] & 0x00100000) ? kCpuHasSSE42 : 0) |
- ((cpu_info7[1] & 0x00000200) ? kCpuHasERMS : 0) |
- ((cpu_info1[2] & 0x00001000) ? kCpuHasFMA3 : 0) |
- kCpuHasX86;
+ cpu_info_ = ((cpu_info1[3] & 0x04000000) ? kCpuHasSSE2 : 0) |
+ ((cpu_info1[2] & 0x00000200) ? kCpuHasSSSE3 : 0) |
+ ((cpu_info1[2] & 0x00080000) ? kCpuHasSSE41 : 0) |
+ ((cpu_info1[2] & 0x00100000) ? kCpuHasSSE42 : 0) |
+ ((cpu_info7[1] & 0x00000200) ? kCpuHasERMS : 0) |
+ ((cpu_info1[2] & 0x00001000) ? kCpuHasFMA3 : 0) |
+ kCpuHasX86;
#ifdef HAS_XGETBV
- // AVX requires CPU has AVX, XSAVE and OSXSave for xgetbv
- if (((cpu_info1[2] & 0x1c000000) == 0x1c000000) && // AVX and OSXSave
- ((GetXCR0() & 6) == 6)) { // Test OS saves YMM registers
- cpu_info |= ((cpu_info7[1] & 0x00000020) ? kCpuHasAVX2 : 0) | kCpuHasAVX;
-
- // Detect AVX512bw
- if ((GetXCR0() & 0xe0) == 0xe0) {
- cpu_info |= (cpu_info7[1] & 0x40000000) ? kCpuHasAVX3 : 0;
- }
+ if ((cpu_info1[2] & 0x18000000) == 0x18000000 && // AVX and OSSave
+ TestOsSaveYmm()) { // Saves YMM.
+ cpu_info_ |= ((cpu_info7[1] & 0x00000020) ? kCpuHasAVX2 : 0) |
+ kCpuHasAVX;
}
#endif
-
// Environment variable overrides for testing.
if (TestEnv("LIBYUV_DISABLE_X86")) {
- cpu_info &= ~kCpuHasX86;
+ cpu_info_ &= ~kCpuHasX86;
}
if (TestEnv("LIBYUV_DISABLE_SSE2")) {
- cpu_info &= ~kCpuHasSSE2;
+ cpu_info_ &= ~kCpuHasSSE2;
}
if (TestEnv("LIBYUV_DISABLE_SSSE3")) {
- cpu_info &= ~kCpuHasSSSE3;
+ cpu_info_ &= ~kCpuHasSSSE3;
}
if (TestEnv("LIBYUV_DISABLE_SSE41")) {
- cpu_info &= ~kCpuHasSSE41;
+ cpu_info_ &= ~kCpuHasSSE41;
}
if (TestEnv("LIBYUV_DISABLE_SSE42")) {
- cpu_info &= ~kCpuHasSSE42;
+ cpu_info_ &= ~kCpuHasSSE42;
}
if (TestEnv("LIBYUV_DISABLE_AVX")) {
- cpu_info &= ~kCpuHasAVX;
+ cpu_info_ &= ~kCpuHasAVX;
}
if (TestEnv("LIBYUV_DISABLE_AVX2")) {
- cpu_info &= ~kCpuHasAVX2;
+ cpu_info_ &= ~kCpuHasAVX2;
}
if (TestEnv("LIBYUV_DISABLE_ERMS")) {
- cpu_info &= ~kCpuHasERMS;
+ cpu_info_ &= ~kCpuHasERMS;
}
if (TestEnv("LIBYUV_DISABLE_FMA3")) {
- cpu_info &= ~kCpuHasFMA3;
+ cpu_info_ &= ~kCpuHasFMA3;
}
- if (TestEnv("LIBYUV_DISABLE_AVX3")) {
- cpu_info &= ~kCpuHasAVX3;
- }
#endif
#if defined(__mips__) && defined(__linux__)
+ // Linux mips parse text file for dsp detect.
+ cpu_info_ = MipsCpuCaps("dsp"); // set kCpuHasMIPS_DSP.
#if defined(__mips_dspr2)
- cpu_info |= kCpuHasDSPR2;
+ cpu_info_ |= kCpuHasMIPS_DSPR2;
#endif
- cpu_info |= kCpuHasMIPS;
- if (getenv("LIBYUV_DISABLE_DSPR2")) {
- cpu_info &= ~kCpuHasDSPR2;
+ cpu_info_ |= kCpuHasMIPS;
+
+ if (getenv("LIBYUV_DISABLE_MIPS")) {
+ cpu_info_ &= ~kCpuHasMIPS;
}
+ if (getenv("LIBYUV_DISABLE_MIPS_DSP")) {
+ cpu_info_ &= ~kCpuHasMIPS_DSP;
+ }
+ if (getenv("LIBYUV_DISABLE_MIPS_DSPR2")) {
+ cpu_info_ &= ~kCpuHasMIPS_DSPR2;
+ }
#endif
#if defined(__arm__) || defined(__aarch64__)
// gcc -mfpu=neon defines __ARM_NEON__
@@ -264,31 +274,28 @@
// __ARM_NEON__ generates code that requires Neon. NaCL also requires Neon.
// For Linux, /proc/cpuinfo can be tested but without that assume Neon.
#if defined(__ARM_NEON__) || defined(__native_client__) || !defined(__linux__)
- cpu_info = kCpuHasNEON;
+ cpu_info_ = kCpuHasNEON;
// For aarch64(arm64), /proc/cpuinfo's feature is not complete, e.g. no neon
// flag in it.
// So for aarch64, neon enabling is hard coded here.
#endif
#if defined(__aarch64__)
- cpu_info = kCpuHasNEON;
+ cpu_info_ = kCpuHasNEON;
#else
// Linux arm parse text file for neon detect.
- cpu_info = ArmCpuCaps("/proc/cpuinfo");
+ cpu_info_ = ArmCpuCaps("/proc/cpuinfo");
#endif
- cpu_info |= kCpuHasARM;
+ cpu_info_ |= kCpuHasARM;
if (TestEnv("LIBYUV_DISABLE_NEON")) {
- cpu_info &= ~kCpuHasNEON;
+ cpu_info_ &= ~kCpuHasNEON;
}
#endif // __arm__
if (TestEnv("LIBYUV_DISABLE_ASM")) {
- cpu_info = 0;
+ cpu_info_ = 0;
}
- cpu_info |= kCpuInitialized;
- cpu_info_ = cpu_info;
- return cpu_info;
+ return cpu_info_;
}
-// Note that use of this function is not thread safe.
LIBYUV_API
void MaskCpuFlags(int enable_flags) {
cpu_info_ = InitCpuFlags() & enable_flags;
--- a/third_party/libyuv/source/mjpeg_decoder.cc
+++ b/third_party/libyuv/source/mjpeg_decoder.cc
@@ -59,7 +59,8 @@
// Methods that are passed to jpeglib.
boolean fill_input_buffer(jpeg_decompress_struct* cinfo);
void init_source(jpeg_decompress_struct* cinfo);
-void skip_input_data(jpeg_decompress_struct* cinfo, long num_bytes); // NOLINT
+void skip_input_data(jpeg_decompress_struct* cinfo,
+ long num_bytes); // NOLINT
void term_source(jpeg_decompress_struct* cinfo);
void ErrorHandler(jpeg_common_struct* cinfo);
@@ -428,7 +429,8 @@
return TRUE;
}
-void skip_input_data(j_decompress_ptr cinfo, long num_bytes) { // NOLINT
+void skip_input_data(j_decompress_ptr cinfo,
+ long num_bytes) { // NOLINT
cinfo->src->next_input_byte += num_bytes;
}
--- a/third_party/libyuv/source/mjpeg_validate.cc
+++ b/third_party/libyuv/source/mjpeg_validate.cc
@@ -17,22 +17,51 @@
extern "C" {
#endif
-// Helper function to scan for EOI marker (0xff 0xd9).
+// Enable this to try scasb implementation.
+// #define ENABLE_SCASB 1
+
+#ifdef ENABLE_SCASB
+
+// Multiple of 1.
+__declspec(naked)
+const uint8* ScanRow_ERMS(const uint8* src, uint32 val, int count) {
+ __asm {
+ mov edx, edi
+ mov edi, [esp + 4] // src
+ mov eax, [esp + 8] // val
+ mov ecx, [esp + 12] // count
+ repne scasb
+ jne sr99
+ mov eax, edi
+ sub eax, 1
+ mov edi, edx
+ ret
+
+ sr99:
+ mov eax, 0
+ mov edi, edx
+ ret
+ }
+}
+#endif
+
+// Helper function to scan for EOI marker.
static LIBYUV_BOOL ScanEOI(const uint8* sample, size_t sample_size) {
- if (sample_size >= 2) {
- const uint8* end = sample + sample_size - 1;
- const uint8* it = sample;
- while (it < end) {
- // TODO(fbarchard): scan for 0xd9 instead.
- it = static_cast<const uint8 *>(memchr(it, 0xff, end - it));
- if (it == NULL) {
- break;
- }
- if (it[1] == 0xd9) {
- return LIBYUV_TRUE; // Success: Valid jpeg.
- }
- ++it; // Skip over current 0xff.
+ const uint8* end = sample + sample_size - 1;
+ const uint8* it = sample;
+ for (;;) {
+#ifdef ENABLE_SCASB
+ it = ScanRow_ERMS(it, 0xff, end - it);
+#else
+ it = static_cast<const uint8*>(memchr(it, 0xff, end - it));
+#endif
+ if (it == NULL) {
+ break;
}
+ if (it[1] == 0xd9) {
+ return LIBYUV_TRUE; // Success: Valid jpeg.
+ }
+ ++it; // Skip over current 0xff.
}
// ERROR: Invalid jpeg end code not found. Size sample_size
return LIBYUV_FALSE;
@@ -40,19 +69,20 @@
// Helper function to validate the jpeg appears intact.
LIBYUV_BOOL ValidateJpeg(const uint8* sample, size_t sample_size) {
- // Maximum size that ValidateJpeg will consider valid.
- const size_t kMaxJpegSize = 0x7fffffffull;
const size_t kBackSearchSize = 1024;
- if (sample_size < 64 || sample_size > kMaxJpegSize || !sample) {
+ if (sample_size < 64) {
// ERROR: Invalid jpeg size: sample_size
return LIBYUV_FALSE;
}
- if (sample[0] != 0xff || sample[1] != 0xd8) { // SOI marker
+ if (sample[0] != 0xff || sample[1] != 0xd8) { // Start Of Image
// ERROR: Invalid jpeg initial start code
return LIBYUV_FALSE;
}
+ // Step over SOI marker.
+ sample += 2;
+ sample_size -= 2;
- // Look for the End Of Image (EOI) marker near the end of the buffer.
+ // Look for the End Of Image (EOI) marker in the end kilobyte of the buffer.
if (sample_size > kBackSearchSize) {
if (ScanEOI(sample + sample_size - kBackSearchSize, kBackSearchSize)) {
return LIBYUV_TRUE; // Success: Valid jpeg.
@@ -60,8 +90,8 @@
// Reduce search size for forward search.
sample_size = sample_size - kBackSearchSize + 1;
}
- // Step over SOI marker and scan for EOI.
- return ScanEOI(sample + 2, sample_size - 2);
+ return ScanEOI(sample, sample_size);
+
}
#ifdef __cplusplus
--- a/third_party/libyuv/source/planar_functions.cc
+++ b/third_party/libyuv/source/planar_functions.cc
@@ -17,7 +17,6 @@
#include "libyuv/mjpeg_decoder.h"
#endif
#include "libyuv/row.h"
-#include "libyuv/scale_row.h" // for ScaleRowDown2
#ifdef __cplusplus
namespace libyuv {
@@ -238,6 +237,14 @@
}
}
#endif
+#if defined(HAS_MIRRORROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ MirrorRow = MirrorRow_Any_SSE2;
+ if (IS_ALIGNED(width, 16)) {
+ MirrorRow = MirrorRow_SSE2;
+ }
+ }
+#endif
#if defined(HAS_MIRRORROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
MirrorRow = MirrorRow_Any_SSSE3;
@@ -255,11 +262,11 @@
}
#endif
// TODO(fbarchard): Mirror on mips handle unaligned memory.
-#if defined(HAS_MIRRORROW_DSPR2)
- if (TestCpuFlag(kCpuHasDSPR2) &&
+#if defined(HAS_MIRRORROW_MIPS_DSPR2)
+ if (TestCpuFlag(kCpuHasMIPS_DSPR2) &&
IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) &&
IS_ALIGNED(dst_y, 4) && IS_ALIGNED(dst_stride_y, 4)) {
- MirrorRow = MirrorRow_DSPR2;
+ MirrorRow = MirrorRow_MIPS_DSPR2;
}
#endif
@@ -280,9 +287,9 @@
int width, int height) {
int y;
void (*YUY2ToUV422Row)(const uint8* src_yuy2,
- uint8* dst_u, uint8* dst_v, int width) =
+ uint8* dst_u, uint8* dst_v, int pix) =
YUY2ToUV422Row_C;
- void (*YUY2ToYRow)(const uint8* src_yuy2, uint8* dst_y, int width) =
+ void (*YUY2ToYRow)(const uint8* src_yuy2, uint8* dst_y, int pix) =
YUY2ToYRow_C;
// Negative height means invert the image.
if (height < 0) {
@@ -352,10 +359,10 @@
int width, int height) {
int y;
void (*UYVYToUV422Row)(const uint8* src_uyvy,
- uint8* dst_u, uint8* dst_v, int width) =
+ uint8* dst_u, uint8* dst_v, int pix) =
UYVYToUV422Row_C;
void (*UYVYToYRow)(const uint8* src_uyvy,
- uint8* dst_y, int width) = UYVYToYRow_C;
+ uint8* dst_y, int pix) = UYVYToYRow_C;
// Negative height means invert the image.
if (height < 0) {
height = -height;
@@ -534,6 +541,11 @@
return ARGBBlendRow;
}
#endif
+#if defined(HAS_ARGBBLENDROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ ARGBBlendRow = ARGBBlendRow_SSE2;
+ }
+#endif
#if defined(HAS_ARGBBLENDROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
ARGBBlendRow = ARGBBlendRow_NEON;
@@ -578,179 +590,6 @@
return 0;
}
-// Alpha Blend plane and store to destination.
-LIBYUV_API
-int BlendPlane(const uint8* src_y0, int src_stride_y0,
- const uint8* src_y1, int src_stride_y1,
- const uint8* alpha, int alpha_stride,
- uint8* dst_y, int dst_stride_y,
- int width, int height) {
- int y;
- void (*BlendPlaneRow)(const uint8* src0, const uint8* src1,
- const uint8* alpha, uint8* dst, int width) = BlendPlaneRow_C;
- if (!src_y0 || !src_y1 || !alpha || !dst_y || width <= 0 || height == 0) {
- return -1;
- }
- // Negative height means invert the image.
- if (height < 0) {
- height = -height;
- dst_y = dst_y + (height - 1) * dst_stride_y;
- dst_stride_y = -dst_stride_y;
- }
-
- // Coalesce rows for Y plane.
- if (src_stride_y0 == width &&
- src_stride_y1 == width &&
- alpha_stride == width &&
- dst_stride_y == width) {
- width *= height;
- height = 1;
- src_stride_y0 = src_stride_y1 = alpha_stride = dst_stride_y = 0;
- }
-
-#if defined(HAS_BLENDPLANEROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3)) {
- BlendPlaneRow = BlendPlaneRow_Any_SSSE3;
- if (IS_ALIGNED(width, 8)) {
- BlendPlaneRow = BlendPlaneRow_SSSE3;
- }
- }
-#endif
-#if defined(HAS_BLENDPLANEROW_AVX2)
- if (TestCpuFlag(kCpuHasAVX2)) {
- BlendPlaneRow = BlendPlaneRow_Any_AVX2;
- if (IS_ALIGNED(width, 32)) {
- BlendPlaneRow = BlendPlaneRow_AVX2;
- }
- }
-#endif
-
- for (y = 0; y < height; ++y) {
- BlendPlaneRow(src_y0, src_y1, alpha, dst_y, width);
- src_y0 += src_stride_y0;
- src_y1 += src_stride_y1;
- alpha += alpha_stride;
- dst_y += dst_stride_y;
- }
- return 0;
-}
-
-#define MAXTWIDTH 2048
-// Alpha Blend YUV images and store to destination.
-LIBYUV_API
-int I420Blend(const uint8* src_y0, int src_stride_y0,
- const uint8* src_u0, int src_stride_u0,
- const uint8* src_v0, int src_stride_v0,
- const uint8* src_y1, int src_stride_y1,
- const uint8* src_u1, int src_stride_u1,
- const uint8* src_v1, int src_stride_v1,
- const uint8* alpha, int alpha_stride,
- uint8* dst_y, int dst_stride_y,
- uint8* dst_u, int dst_stride_u,
- uint8* dst_v, int dst_stride_v,
- int width, int height) {
- int y;
- // Half width/height for UV.
- int halfwidth = (width + 1) >> 1;
- void (*BlendPlaneRow)(const uint8* src0, const uint8* src1,
- const uint8* alpha, uint8* dst, int width) = BlendPlaneRow_C;
- void (*ScaleRowDown2)(const uint8* src_ptr, ptrdiff_t src_stride,
- uint8* dst_ptr, int dst_width) = ScaleRowDown2Box_C;
- if (!src_y0 || !src_u0 || !src_v0 || !src_y1 || !src_u1 || !src_v1 ||
- !alpha || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) {
- return -1;
- }
-
- // Negative height means invert the image.
- if (height < 0) {
- height = -height;
- dst_y = dst_y + (height - 1) * dst_stride_y;
- dst_stride_y = -dst_stride_y;
- }
-
- // Blend Y plane.
- BlendPlane(src_y0, src_stride_y0,
- src_y1, src_stride_y1,
- alpha, alpha_stride,
- dst_y, dst_stride_y,
- width, height);
-
-#if defined(HAS_BLENDPLANEROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3)) {
- BlendPlaneRow = BlendPlaneRow_Any_SSSE3;
- if (IS_ALIGNED(halfwidth, 8)) {
- BlendPlaneRow = BlendPlaneRow_SSSE3;
- }
- }
-#endif
-#if defined(HAS_BLENDPLANEROW_AVX2)
- if (TestCpuFlag(kCpuHasAVX2)) {
- BlendPlaneRow = BlendPlaneRow_Any_AVX2;
- if (IS_ALIGNED(halfwidth, 32)) {
- BlendPlaneRow = BlendPlaneRow_AVX2;
- }
- }
-#endif
- if (!IS_ALIGNED(width, 2)) {
- ScaleRowDown2 = ScaleRowDown2Box_Odd_C;
- }
-#if defined(HAS_SCALEROWDOWN2_NEON)
- if (TestCpuFlag(kCpuHasNEON)) {
- ScaleRowDown2 = ScaleRowDown2Box_Odd_NEON;
- if (IS_ALIGNED(width, 2)) {
- ScaleRowDown2 = ScaleRowDown2Box_Any_NEON;
- if (IS_ALIGNED(halfwidth, 16)) {
- ScaleRowDown2 = ScaleRowDown2Box_NEON;
- }
- }
- }
-#endif
-#if defined(HAS_SCALEROWDOWN2_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3)) {
- ScaleRowDown2 = ScaleRowDown2Box_Odd_SSSE3;
- if (IS_ALIGNED(width, 2)) {
- ScaleRowDown2 = ScaleRowDown2Box_Any_SSSE3;
- if (IS_ALIGNED(halfwidth, 16)) {
- ScaleRowDown2 = ScaleRowDown2Box_SSSE3;
- }
- }
- }
-#endif
-#if defined(HAS_SCALEROWDOWN2_AVX2)
- if (TestCpuFlag(kCpuHasAVX2)) {
- ScaleRowDown2 = ScaleRowDown2Box_Odd_AVX2;
- if (IS_ALIGNED(width, 2)) {
- ScaleRowDown2 = ScaleRowDown2Box_Any_AVX2;
- if (IS_ALIGNED(halfwidth, 32)) {
- ScaleRowDown2 = ScaleRowDown2Box_AVX2;
- }
- }
- }
-#endif
-
- // Row buffer for intermediate alpha pixels.
- align_buffer_64(halfalpha, halfwidth);
- for (y = 0; y < height; y += 2) {
- // last row of odd height image use 1 row of alpha instead of 2.
- if (y == (height - 1)) {
- alpha_stride = 0;
- }
- // Subsample 2 rows of UV to half width and half height.
- ScaleRowDown2(alpha, alpha_stride, halfalpha, halfwidth);
- alpha += alpha_stride * 2;
- BlendPlaneRow(src_u0, src_u1, halfalpha, dst_u, halfwidth);
- BlendPlaneRow(src_v0, src_v1, halfalpha, dst_v, halfwidth);
- src_u0 += src_stride_u0;
- src_u1 += src_stride_u1;
- dst_u += dst_stride_u;
- src_v0 += src_stride_v0;
- src_v1 += src_stride_v1;
- dst_v += dst_stride_v;
- }
- free_aligned_buffer_64(halfalpha);
- return 0;
-}
-
// Multiply 2 ARGB images and store to destination.
LIBYUV_API
int ARGBMultiply(const uint8* src_argb0, int src_stride_argb0,
@@ -938,21 +777,22 @@
}
return 0;
}
-// Convert I422 to RGBA with matrix
-static int I422ToRGBAMatrix(const uint8* src_y, int src_stride_y,
- const uint8* src_u, int src_stride_u,
- const uint8* src_v, int src_stride_v,
- uint8* dst_rgba, int dst_stride_rgba,
- const struct YuvConstants* yuvconstants,
- int width, int height) {
+
+// Convert I422 to BGRA.
+LIBYUV_API
+int I422ToBGRA(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_bgra, int dst_stride_bgra,
+ int width, int height) {
int y;
- void (*I422ToRGBARow)(const uint8* y_buf,
+ void (*I422ToBGRARow)(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
- const struct YuvConstants* yuvconstants,
- int width) = I422ToRGBARow_C;
- if (!src_y || !src_u || !src_v || !dst_rgba ||
+ int width) = I422ToBGRARow_C;
+ if (!src_y || !src_u || !src_v ||
+ !dst_bgra ||
width <= 0 || height == 0) {
return -1;
}
@@ -959,46 +799,55 @@
// Negative height means invert the image.
if (height < 0) {
height = -height;
- dst_rgba = dst_rgba + (height - 1) * dst_stride_rgba;
- dst_stride_rgba = -dst_stride_rgba;
+ dst_bgra = dst_bgra + (height - 1) * dst_stride_bgra;
+ dst_stride_bgra = -dst_stride_bgra;
}
-#if defined(HAS_I422TORGBAROW_SSSE3)
+ // Coalesce rows.
+ if (src_stride_y == width &&
+ src_stride_u * 2 == width &&
+ src_stride_v * 2 == width &&
+ dst_stride_bgra == width * 4) {
+ width *= height;
+ height = 1;
+ src_stride_y = src_stride_u = src_stride_v = dst_stride_bgra = 0;
+ }
+#if defined(HAS_I422TOBGRAROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
- I422ToRGBARow = I422ToRGBARow_Any_SSSE3;
+ I422ToBGRARow = I422ToBGRARow_Any_SSSE3;
if (IS_ALIGNED(width, 8)) {
- I422ToRGBARow = I422ToRGBARow_SSSE3;
+ I422ToBGRARow = I422ToBGRARow_SSSE3;
}
}
#endif
-#if defined(HAS_I422TORGBAROW_AVX2)
+#if defined(HAS_I422TOBGRAROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
- I422ToRGBARow = I422ToRGBARow_Any_AVX2;
+ I422ToBGRARow = I422ToBGRARow_Any_AVX2;
if (IS_ALIGNED(width, 16)) {
- I422ToRGBARow = I422ToRGBARow_AVX2;
+ I422ToBGRARow = I422ToBGRARow_AVX2;
}
}
#endif
-#if defined(HAS_I422TORGBAROW_NEON)
+#if defined(HAS_I422TOBGRAROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
- I422ToRGBARow = I422ToRGBARow_Any_NEON;
+ I422ToBGRARow = I422ToBGRARow_Any_NEON;
if (IS_ALIGNED(width, 8)) {
- I422ToRGBARow = I422ToRGBARow_NEON;
+ I422ToBGRARow = I422ToBGRARow_NEON;
}
}
#endif
-#if defined(HAS_I422TORGBAROW_DSPR2)
- if (TestCpuFlag(kCpuHasDSPR2) && IS_ALIGNED(width, 4) &&
+#if defined(HAS_I422TOBGRAROW_MIPS_DSPR2)
+ if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(width, 4) &&
IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) &&
IS_ALIGNED(src_u, 2) && IS_ALIGNED(src_stride_u, 2) &&
IS_ALIGNED(src_v, 2) && IS_ALIGNED(src_stride_v, 2) &&
- IS_ALIGNED(dst_rgba, 4) && IS_ALIGNED(dst_stride_rgba, 4)) {
- I422ToRGBARow = I422ToRGBARow_DSPR2;
+ IS_ALIGNED(dst_bgra, 4) && IS_ALIGNED(dst_stride_bgra, 4)) {
+ I422ToBGRARow = I422ToBGRARow_MIPS_DSPR2;
}
#endif
for (y = 0; y < height; ++y) {
- I422ToRGBARow(src_y, src_u, src_v, dst_rgba, yuvconstants, width);
- dst_rgba += dst_stride_rgba;
+ I422ToBGRARow(src_y, src_u, src_v, dst_bgra, width);
+ dst_bgra += dst_stride_bgra;
src_y += src_stride_y;
src_u += src_stride_u;
src_v += src_stride_v;
@@ -1006,34 +855,140 @@
return 0;
}
-// Convert I422 to RGBA.
+// Convert I422 to ABGR.
LIBYUV_API
-int I422ToRGBA(const uint8* src_y, int src_stride_y,
+int I422ToABGR(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u,
const uint8* src_v, int src_stride_v,
- uint8* dst_rgba, int dst_stride_rgba,
+ uint8* dst_abgr, int dst_stride_abgr,
int width, int height) {
- return I422ToRGBAMatrix(src_y, src_stride_y,
- src_u, src_stride_u,
- src_v, src_stride_v,
- dst_rgba, dst_stride_rgba,
- &kYuvI601Constants,
- width, height);
+ int y;
+ void (*I422ToABGRRow)(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* rgb_buf,
+ int width) = I422ToABGRRow_C;
+ if (!src_y || !src_u || !src_v ||
+ !dst_abgr ||
+ width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_abgr = dst_abgr + (height - 1) * dst_stride_abgr;
+ dst_stride_abgr = -dst_stride_abgr;
+ }
+ // Coalesce rows.
+ if (src_stride_y == width &&
+ src_stride_u * 2 == width &&
+ src_stride_v * 2 == width &&
+ dst_stride_abgr == width * 4) {
+ width *= height;
+ height = 1;
+ src_stride_y = src_stride_u = src_stride_v = dst_stride_abgr = 0;
+ }
+#if defined(HAS_I422TOABGRROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+ I422ToABGRRow = I422ToABGRRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToABGRRow = I422ToABGRRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_I422TOABGRROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ I422ToABGRRow = I422ToABGRRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToABGRRow = I422ToABGRRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_I422TOABGRROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ I422ToABGRRow = I422ToABGRRow_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ I422ToABGRRow = I422ToABGRRow_AVX2;
+ }
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ I422ToABGRRow(src_y, src_u, src_v, dst_abgr, width);
+ dst_abgr += dst_stride_abgr;
+ src_y += src_stride_y;
+ src_u += src_stride_u;
+ src_v += src_stride_v;
+ }
+ return 0;
}
-// Convert I422 to BGRA.
+// Convert I422 to RGBA.
LIBYUV_API
-int I422ToBGRA(const uint8* src_y, int src_stride_y,
+int I422ToRGBA(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u,
const uint8* src_v, int src_stride_v,
- uint8* dst_bgra, int dst_stride_bgra,
+ uint8* dst_rgba, int dst_stride_rgba,
int width, int height) {
- return I422ToRGBAMatrix(src_y, src_stride_y,
- src_v, src_stride_v, // Swap U and V
- src_u, src_stride_u,
- dst_bgra, dst_stride_bgra,
- &kYvuI601Constants, // Use Yvu matrix
- width, height);
+ int y;
+ void (*I422ToRGBARow)(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* rgb_buf,
+ int width) = I422ToRGBARow_C;
+ if (!src_y || !src_u || !src_v ||
+ !dst_rgba ||
+ width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_rgba = dst_rgba + (height - 1) * dst_stride_rgba;
+ dst_stride_rgba = -dst_stride_rgba;
+ }
+ // Coalesce rows.
+ if (src_stride_y == width &&
+ src_stride_u * 2 == width &&
+ src_stride_v * 2 == width &&
+ dst_stride_rgba == width * 4) {
+ width *= height;
+ height = 1;
+ src_stride_y = src_stride_u = src_stride_v = dst_stride_rgba = 0;
+ }
+#if defined(HAS_I422TORGBAROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+ I422ToRGBARow = I422ToRGBARow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToRGBARow = I422ToRGBARow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_I422TORGBAROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ I422ToRGBARow = I422ToRGBARow_Any_SSSE3;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToRGBARow = I422ToRGBARow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_I422TORGBAROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ I422ToRGBARow = I422ToRGBARow_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ I422ToRGBARow = I422ToRGBARow_AVX2;
+ }
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ I422ToRGBARow(src_y, src_u, src_v, dst_rgba, width);
+ dst_rgba += dst_stride_rgba;
+ src_y += src_stride_y;
+ src_u += src_stride_u;
+ src_v += src_stride_v;
+ }
+ return 0;
}
// Convert NV12 to RGB565.
@@ -1046,7 +1001,6 @@
void (*NV12ToRGB565Row)(const uint8* y_buf,
const uint8* uv_buf,
uint8* rgb_buf,
- const struct YuvConstants* yuvconstants,
int width) = NV12ToRGB565Row_C;
if (!src_y || !src_uv || !dst_rgb565 ||
width <= 0 || height == 0) {
@@ -1084,7 +1038,7 @@
#endif
for (y = 0; y < height; ++y) {
- NV12ToRGB565Row(src_y, src_uv, dst_rgb565, &kYuvI601Constants, width);
+ NV12ToRGB565Row(src_y, src_uv, dst_rgb565, width);
dst_rgb565 += dst_stride_rgb565;
src_y += src_stride_y;
if (y & 1) {
@@ -1094,15 +1048,18 @@
return 0;
}
-// Convert RAW to RGB24.
+// Convert NV21 to RGB565.
LIBYUV_API
-int RAWToRGB24(const uint8* src_raw, int src_stride_raw,
- uint8* dst_rgb24, int dst_stride_rgb24,
- int width, int height) {
+int NV21ToRGB565(const uint8* src_y, int src_stride_y,
+ const uint8* src_vu, int src_stride_vu,
+ uint8* dst_rgb565, int dst_stride_rgb565,
+ int width, int height) {
int y;
- void (*RAWToRGB24Row)(const uint8* src_rgb, uint8* dst_rgb24, int width) =
- RAWToRGB24Row_C;
- if (!src_raw || !dst_rgb24 ||
+ void (*NV21ToRGB565Row)(const uint8* y_buf,
+ const uint8* src_vu,
+ uint8* rgb_buf,
+ int width) = NV21ToRGB565Row_C;
+ if (!src_y || !src_vu || !dst_rgb565 ||
width <= 0 || height == 0) {
return -1;
}
@@ -1109,37 +1066,41 @@
// Negative height means invert the image.
if (height < 0) {
height = -height;
- src_raw = src_raw + (height - 1) * src_stride_raw;
- src_stride_raw = -src_stride_raw;
+ dst_rgb565 = dst_rgb565 + (height - 1) * dst_stride_rgb565;
+ dst_stride_rgb565 = -dst_stride_rgb565;
}
- // Coalesce rows.
- if (src_stride_raw == width * 3 &&
- dst_stride_rgb24 == width * 3) {
- width *= height;
- height = 1;
- src_stride_raw = dst_stride_rgb24 = 0;
- }
-#if defined(HAS_RAWTORGB24ROW_SSSE3)
+#if defined(HAS_NV21TORGB565ROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
- RAWToRGB24Row = RAWToRGB24Row_Any_SSSE3;
+ NV21ToRGB565Row = NV21ToRGB565Row_Any_SSSE3;
if (IS_ALIGNED(width, 8)) {
- RAWToRGB24Row = RAWToRGB24Row_SSSE3;
+ NV21ToRGB565Row = NV21ToRGB565Row_SSSE3;
}
}
#endif
-#if defined(HAS_RAWTORGB24ROW_NEON)
+#if defined(HAS_NV21TORGB565ROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ NV21ToRGB565Row = NV21ToRGB565Row_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ NV21ToRGB565Row = NV21ToRGB565Row_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_NV21TORGB565ROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
- RAWToRGB24Row = RAWToRGB24Row_Any_NEON;
+ NV21ToRGB565Row = NV21ToRGB565Row_Any_NEON;
if (IS_ALIGNED(width, 8)) {
- RAWToRGB24Row = RAWToRGB24Row_NEON;
+ NV21ToRGB565Row = NV21ToRGB565Row_NEON;
}
}
#endif
for (y = 0; y < height; ++y) {
- RAWToRGB24Row(src_raw, dst_rgb24, width);
- src_raw += src_stride_raw;
- dst_rgb24 += dst_stride_rgb24;
+ NV21ToRGB565Row(src_y, src_vu, dst_rgb565, width);
+ dst_rgb565 += dst_stride_rgb565;
+ src_y += src_stride_y;
+ if (y & 1) {
+ src_vu += src_stride_vu;
+ }
}
return 0;
}
@@ -1149,7 +1110,7 @@
int width, int height,
uint32 value) {
int y;
- void (*SetRow)(uint8* dst, uint8 value, int width) = SetRow_C;
+ void (*SetRow)(uint8* dst, uint8 value, int pix) = SetRow_C;
if (height < 0) {
height = -height;
dst_y = dst_y + (height - 1) * dst_stride_y;
@@ -1225,7 +1186,7 @@
int width, int height,
uint32 value) {
int y;
- void (*ARGBSetRow)(uint8* dst_argb, uint32 value, int width) = ARGBSetRow_C;
+ void (*ARGBSetRow)(uint8* dst_argb, uint32 value, int pix) = ARGBSetRow_C;
if (!dst_argb ||
width <= 0 || height == 0 ||
dst_x < 0 || dst_y < 0) {
@@ -1301,6 +1262,14 @@
height = 1;
src_stride_argb = dst_stride_argb = 0;
}
+#if defined(HAS_ARGBATTENUATEROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_Any_SSE2;
+ if (IS_ALIGNED(width, 4)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_SSE2;
+ }
+ }
+#endif
#if defined(HAS_ARGBATTENUATEROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
ARGBAttenuateRow = ARGBAttenuateRow_Any_SSSE3;
@@ -1855,37 +1824,45 @@
return 0;
}
-// Interpolate 2 planes by specified amount (0 to 255).
+// Interpolate 2 ARGB images by specified amount (0 to 255).
LIBYUV_API
-int InterpolatePlane(const uint8* src0, int src_stride0,
- const uint8* src1, int src_stride1,
- uint8* dst, int dst_stride,
- int width, int height, int interpolation) {
+int ARGBInterpolate(const uint8* src_argb0, int src_stride_argb0,
+ const uint8* src_argb1, int src_stride_argb1,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height, int interpolation) {
int y;
void (*InterpolateRow)(uint8* dst_ptr, const uint8* src_ptr,
ptrdiff_t src_stride, int dst_width,
int source_y_fraction) = InterpolateRow_C;
- if (!src0 || !src1 || !dst || width <= 0 || height == 0) {
+ if (!src_argb0 || !src_argb1 || !dst_argb || width <= 0 || height == 0) {
return -1;
}
// Negative height means invert the image.
if (height < 0) {
height = -height;
- dst = dst + (height - 1) * dst_stride;
- dst_stride = -dst_stride;
+ dst_argb = dst_argb + (height - 1) * dst_stride_argb;
+ dst_stride_argb = -dst_stride_argb;
}
// Coalesce rows.
- if (src_stride0 == width &&
- src_stride1 == width &&
- dst_stride == width) {
+ if (src_stride_argb0 == width * 4 &&
+ src_stride_argb1 == width * 4 &&
+ dst_stride_argb == width * 4) {
width *= height;
height = 1;
- src_stride0 = src_stride1 = dst_stride = 0;
+ src_stride_argb0 = src_stride_argb1 = dst_stride_argb = 0;
}
+#if defined(HAS_INTERPOLATEROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ InterpolateRow = InterpolateRow_Any_SSE2;
+ if (IS_ALIGNED(width, 4)) {
+ InterpolateRow = InterpolateRow_SSE2;
+ }
+ }
+#endif
#if defined(HAS_INTERPOLATEROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
InterpolateRow = InterpolateRow_Any_SSSE3;
- if (IS_ALIGNED(width, 16)) {
+ if (IS_ALIGNED(width, 4)) {
InterpolateRow = InterpolateRow_SSSE3;
}
}
@@ -1893,7 +1870,7 @@
#if defined(HAS_INTERPOLATEROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
InterpolateRow = InterpolateRow_Any_AVX2;
- if (IS_ALIGNED(width, 32)) {
+ if (IS_ALIGNED(width, 8)) {
InterpolateRow = InterpolateRow_AVX2;
}
}
@@ -1901,77 +1878,30 @@
#if defined(HAS_INTERPOLATEROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
InterpolateRow = InterpolateRow_Any_NEON;
- if (IS_ALIGNED(width, 16)) {
+ if (IS_ALIGNED(width, 4)) {
InterpolateRow = InterpolateRow_NEON;
}
}
#endif
-#if defined(HAS_INTERPOLATEROW_DSPR2)
- if (TestCpuFlag(kCpuHasDSPR2) &&
- IS_ALIGNED(src0, 4) && IS_ALIGNED(src_stride0, 4) &&
- IS_ALIGNED(src1, 4) && IS_ALIGNED(src_stride1, 4) &&
- IS_ALIGNED(dst, 4) && IS_ALIGNED(dst_stride, 4) &&
- IS_ALIGNED(width, 4)) {
- InterpolateRow = InterpolateRow_DSPR2;
+#if defined(HAS_INTERPOLATEROW_MIPS_DSPR2)
+ if (TestCpuFlag(kCpuHasMIPS_DSPR2) &&
+ IS_ALIGNED(src_argb0, 4) && IS_ALIGNED(src_stride_argb0, 4) &&
+ IS_ALIGNED(src_argb1, 4) && IS_ALIGNED(src_stride_argb1, 4) &&
+ IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride_argb, 4)) {
+ InterpolateRow = InterpolateRow_MIPS_DSPR2;
}
#endif
for (y = 0; y < height; ++y) {
- InterpolateRow(dst, src0, src1 - src0, width, interpolation);
- src0 += src_stride0;
- src1 += src_stride1;
- dst += dst_stride;
+ InterpolateRow(dst_argb, src_argb0, src_argb1 - src_argb0,
+ width * 4, interpolation);
+ src_argb0 += src_stride_argb0;
+ src_argb1 += src_stride_argb1;
+ dst_argb += dst_stride_argb;
}
return 0;
}
-// Interpolate 2 ARGB images by specified amount (0 to 255).
-LIBYUV_API
-int ARGBInterpolate(const uint8* src_argb0, int src_stride_argb0,
- const uint8* src_argb1, int src_stride_argb1,
- uint8* dst_argb, int dst_stride_argb,
- int width, int height, int interpolation) {
- return InterpolatePlane(src_argb0, src_stride_argb0,
- src_argb1, src_stride_argb1,
- dst_argb, dst_stride_argb,
- width * 4, height, interpolation);
-}
-
-// Interpolate 2 YUV images by specified amount (0 to 255).
-LIBYUV_API
-int I420Interpolate(const uint8* src0_y, int src0_stride_y,
- const uint8* src0_u, int src0_stride_u,
- const uint8* src0_v, int src0_stride_v,
- const uint8* src1_y, int src1_stride_y,
- const uint8* src1_u, int src1_stride_u,
- const uint8* src1_v, int src1_stride_v,
- uint8* dst_y, int dst_stride_y,
- uint8* dst_u, int dst_stride_u,
- uint8* dst_v, int dst_stride_v,
- int width, int height, int interpolation) {
- int halfwidth = (width + 1) >> 1;
- int halfheight = (height + 1) >> 1;
- if (!src0_y || !src0_u || !src0_v ||
- !src1_y || !src1_u || !src1_v ||
- !dst_y || !dst_u || !dst_v ||
- width <= 0 || height == 0) {
- return -1;
- }
- InterpolatePlane(src0_y, src0_stride_y,
- src1_y, src1_stride_y,
- dst_y, dst_stride_y,
- width, height, interpolation);
- InterpolatePlane(src0_u, src0_stride_u,
- src1_u, src1_stride_u,
- dst_u, dst_stride_u,
- halfwidth, halfheight, interpolation);
- InterpolatePlane(src0_v, src0_stride_v,
- src1_v, src1_stride_v,
- dst_v, dst_stride_v,
- halfwidth, halfheight, interpolation);
- return 0;
-}
-
// Shuffle ARGB channel order. e.g. BGRA to ARGB.
LIBYUV_API
int ARGBShuffle(const uint8* src_bgra, int src_stride_bgra,
@@ -1979,7 +1909,7 @@
const uint8* shuffler, int width, int height) {
int y;
void (*ARGBShuffleRow)(const uint8* src_bgra, uint8* dst_argb,
- const uint8* shuffler, int width) = ARGBShuffleRow_C;
+ const uint8* shuffler, int pix) = ARGBShuffleRow_C;
if (!src_bgra || !dst_argb ||
width <= 0 || height == 0) {
return -1;
@@ -2046,7 +1976,7 @@
const uint8* src_sobely,
uint8* dst, int width)) {
int y;
- void (*ARGBToYJRow)(const uint8* src_argb, uint8* dst_g, int width) =
+ void (*ARGBToYJRow)(const uint8* src_argb, uint8* dst_g, int pix) =
ARGBToYJRow_C;
void (*SobelYRow)(const uint8* src_y0, const uint8* src_y1,
uint8* dst_sobely, int width) = SobelYRow_C;
@@ -2350,19 +2280,13 @@
src_stride_argb = dst_stride_argb = 0;
}
#if defined(HAS_ARGBCOPYALPHAROW_SSE2)
- if (TestCpuFlag(kCpuHasSSE2)) {
- ARGBCopyAlphaRow = ARGBCopyAlphaRow_Any_SSE2;
- if (IS_ALIGNED(width, 8)) {
- ARGBCopyAlphaRow = ARGBCopyAlphaRow_SSE2;
- }
+ if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 8)) {
+ ARGBCopyAlphaRow = ARGBCopyAlphaRow_SSE2;
}
#endif
#if defined(HAS_ARGBCOPYALPHAROW_AVX2)
- if (TestCpuFlag(kCpuHasAVX2)) {
- ARGBCopyAlphaRow = ARGBCopyAlphaRow_Any_AVX2;
- if (IS_ALIGNED(width, 16)) {
- ARGBCopyAlphaRow = ARGBCopyAlphaRow_AVX2;
- }
+ if (TestCpuFlag(kCpuHasAVX2) && IS_ALIGNED(width, 16)) {
+ ARGBCopyAlphaRow = ARGBCopyAlphaRow_AVX2;
}
#endif
@@ -2374,49 +2298,6 @@
return 0;
}
-// Extract just the alpha channel from ARGB.
-LIBYUV_API
-int ARGBExtractAlpha(const uint8* src_argb, int src_stride,
- uint8* dst_a, int dst_stride,
- int width, int height) {
- if (!src_argb || !dst_a || width <= 0 || height == 0) {
- return -1;
- }
- // Negative height means invert the image.
- if (height < 0) {
- height = -height;
- src_argb += (height - 1) * src_stride;
- src_stride = -src_stride;
- }
- // Coalesce rows.
- if (src_stride == width * 4 && dst_stride == width) {
- width *= height;
- height = 1;
- src_stride = dst_stride = 0;
- }
- void (*ARGBExtractAlphaRow)(const uint8 *src_argb, uint8 *dst_a, int width) =
- ARGBExtractAlphaRow_C;
-#if defined(HAS_ARGBEXTRACTALPHAROW_SSE2)
- if (TestCpuFlag(kCpuHasSSE2)) {
- ARGBExtractAlphaRow = IS_ALIGNED(width, 8) ? ARGBExtractAlphaRow_SSE2
- : ARGBExtractAlphaRow_Any_SSE2;
- }
-#endif
-#if defined(HAS_ARGBEXTRACTALPHAROW_NEON)
- if (TestCpuFlag(kCpuHasNEON)) {
- ARGBExtractAlphaRow = IS_ALIGNED(width, 16) ? ARGBExtractAlphaRow_NEON
- : ARGBExtractAlphaRow_Any_NEON;
- }
-#endif
-
- for (int y = 0; y < height; ++y) {
- ARGBExtractAlphaRow(src_argb, dst_a, width);
- src_argb += src_stride;
- dst_a += dst_stride;
- }
- return 0;
-}
-
// Copy a planar Y channel to the alpha channel of a destination ARGB image.
LIBYUV_API
int ARGBCopyYToAlpha(const uint8* src_y, int src_stride_y,
@@ -2442,19 +2323,13 @@
src_stride_y = dst_stride_argb = 0;
}
#if defined(HAS_ARGBCOPYYTOALPHAROW_SSE2)
- if (TestCpuFlag(kCpuHasSSE2)) {
- ARGBCopyYToAlphaRow = ARGBCopyYToAlphaRow_Any_SSE2;
- if (IS_ALIGNED(width, 8)) {
- ARGBCopyYToAlphaRow = ARGBCopyYToAlphaRow_SSE2;
- }
+ if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 8)) {
+ ARGBCopyYToAlphaRow = ARGBCopyYToAlphaRow_SSE2;
}
#endif
#if defined(HAS_ARGBCOPYYTOALPHAROW_AVX2)
- if (TestCpuFlag(kCpuHasAVX2)) {
- ARGBCopyYToAlphaRow = ARGBCopyYToAlphaRow_Any_AVX2;
- if (IS_ALIGNED(width, 16)) {
- ARGBCopyYToAlphaRow = ARGBCopyYToAlphaRow_AVX2;
- }
+ if (TestCpuFlag(kCpuHasAVX2) && IS_ALIGNED(width, 16)) {
+ ARGBCopyYToAlphaRow = ARGBCopyYToAlphaRow_AVX2;
}
#endif
@@ -2466,9 +2341,6 @@
return 0;
}
-// TODO(fbarchard): Consider if width is even Y channel can be split
-// directly. A SplitUVRow_Odd function could copy the remaining chroma.
-
LIBYUV_API
int YUY2ToNV12(const uint8* src_yuy2, int src_stride_yuy2,
uint8* dst_y, int dst_stride_y,
@@ -2476,8 +2348,8 @@
int width, int height) {
int y;
int halfwidth = (width + 1) >> 1;
- void (*SplitUVRow)(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
- int width) = SplitUVRow_C;
+ void (*SplitUVRow)(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) =
+ SplitUVRow_C;
void (*InterpolateRow)(uint8* dst_ptr, const uint8* src_ptr,
ptrdiff_t src_stride, int dst_width,
int source_y_fraction) = InterpolateRow_C;
@@ -2516,6 +2388,14 @@
}
}
#endif
+#if defined(HAS_INTERPOLATEROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ InterpolateRow = InterpolateRow_Any_SSE2;
+ if (IS_ALIGNED(width, 16)) {
+ InterpolateRow = InterpolateRow_SSE2;
+ }
+ }
+#endif
#if defined(HAS_INTERPOLATEROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
InterpolateRow = InterpolateRow_Any_SSSE3;
@@ -2543,16 +2423,15 @@
{
int awidth = halfwidth * 2;
- // row of y and 2 rows of uv
- align_buffer_64(rows, awidth * 3);
+ // 2 rows of uv
+ align_buffer_64(rows, awidth * 2);
for (y = 0; y < height - 1; y += 2) {
// Split Y from UV.
- SplitUVRow(src_yuy2, rows, rows + awidth, awidth);
- memcpy(dst_y, rows, width);
- SplitUVRow(src_yuy2 + src_stride_yuy2, rows, rows + awidth * 2, awidth);
- memcpy(dst_y + dst_stride_y, rows, width);
- InterpolateRow(dst_uv, rows + awidth, awidth, awidth, 128);
+ SplitUVRow(src_yuy2, dst_y, rows, awidth);
+ SplitUVRow(src_yuy2 + src_stride_yuy2, dst_y + dst_stride_y,
+ rows + awidth, awidth);
+ InterpolateRow(dst_uv, rows, awidth, awidth, 128);
src_yuy2 += src_stride_yuy2 * 2;
dst_y += dst_stride_y * 2;
dst_uv += dst_stride_uv;
@@ -2559,8 +2438,7 @@
}
if (height & 1) {
// Split Y from UV.
- SplitUVRow(src_yuy2, rows, dst_uv, awidth);
- memcpy(dst_y, rows, width);
+ SplitUVRow(src_yuy2, dst_y, dst_uv, width);
}
free_aligned_buffer_64(rows);
}
@@ -2574,8 +2452,8 @@
int width, int height) {
int y;
int halfwidth = (width + 1) >> 1;
- void (*SplitUVRow)(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
- int width) = SplitUVRow_C;
+ void (*SplitUVRow)(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) =
+ SplitUVRow_C;
void (*InterpolateRow)(uint8* dst_ptr, const uint8* src_ptr,
ptrdiff_t src_stride, int dst_width,
int source_y_fraction) = InterpolateRow_C;
@@ -2614,6 +2492,14 @@
}
}
#endif
+#if defined(HAS_INTERPOLATEROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ InterpolateRow = InterpolateRow_Any_SSE2;
+ if (IS_ALIGNED(width, 16)) {
+ InterpolateRow = InterpolateRow_SSE2;
+ }
+ }
+#endif
#if defined(HAS_INTERPOLATEROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
InterpolateRow = InterpolateRow_Any_SSSE3;
@@ -2641,16 +2527,15 @@
{
int awidth = halfwidth * 2;
- // row of y and 2 rows of uv
- align_buffer_64(rows, awidth * 3);
+ // 2 rows of uv
+ align_buffer_64(rows, awidth * 2);
for (y = 0; y < height - 1; y += 2) {
// Split Y from UV.
- SplitUVRow(src_uyvy, rows + awidth, rows, awidth);
- memcpy(dst_y, rows, width);
- SplitUVRow(src_uyvy + src_stride_uyvy, rows + awidth * 2, rows, awidth);
- memcpy(dst_y + dst_stride_y, rows, width);
- InterpolateRow(dst_uv, rows + awidth, awidth, awidth, 128);
+ SplitUVRow(src_uyvy, rows, dst_y, awidth);
+ SplitUVRow(src_uyvy + src_stride_uyvy, rows + awidth,
+ dst_y + dst_stride_y, awidth);
+ InterpolateRow(dst_uv, rows, awidth, awidth, 128);
src_uyvy += src_stride_uyvy * 2;
dst_y += dst_stride_y * 2;
dst_uv += dst_stride_uv;
@@ -2657,8 +2542,7 @@
}
if (height & 1) {
// Split Y from UV.
- SplitUVRow(src_uyvy, dst_uv, rows, awidth);
- memcpy(dst_y, rows, width);
+ SplitUVRow(src_uyvy, dst_y, dst_uv, width);
}
free_aligned_buffer_64(rows);
}
--- a/third_party/libyuv/source/rotate.cc
+++ b/third_party/libyuv/source/rotate.cc
@@ -49,13 +49,13 @@
}
}
#endif
-#if defined(HAS_TRANSPOSEWX8_DSPR2)
- if (TestCpuFlag(kCpuHasDSPR2)) {
+#if defined(HAS_TRANSPOSEWX8_MIPS_DSPR2)
+ if (TestCpuFlag(kCpuHasMIPS_DSPR2)) {
if (IS_ALIGNED(width, 4) &&
IS_ALIGNED(src, 4) && IS_ALIGNED(src_stride, 4)) {
- TransposeWx8 = TransposeWx8_Fast_DSPR2;
+ TransposeWx8 = TransposeWx8_Fast_MIPS_DSPR2;
} else {
- TransposeWx8 = TransposeWx8_DSPR2;
+ TransposeWx8 = TransposeWx8_MIPS_DSPR2;
}
}
#endif
@@ -117,6 +117,14 @@
}
}
#endif
+#if defined(HAS_MIRRORROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ MirrorRow = MirrorRow_Any_SSE2;
+ if (IS_ALIGNED(width, 16)) {
+ MirrorRow = MirrorRow_SSE2;
+ }
+ }
+#endif
#if defined(HAS_MIRRORROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
MirrorRow = MirrorRow_Any_SSSE3;
@@ -134,11 +142,11 @@
}
#endif
// TODO(fbarchard): Mirror on mips handle unaligned memory.
-#if defined(HAS_MIRRORROW_DSPR2)
- if (TestCpuFlag(kCpuHasDSPR2) &&
+#if defined(HAS_MIRRORROW_MIPS_DSPR2)
+ if (TestCpuFlag(kCpuHasMIPS_DSPR2) &&
IS_ALIGNED(src, 4) && IS_ALIGNED(src_stride, 4) &&
IS_ALIGNED(dst, 4) && IS_ALIGNED(dst_stride, 4)) {
- MirrorRow = MirrorRow_DSPR2;
+ MirrorRow = MirrorRow_MIPS_DSPR2;
}
#endif
#if defined(HAS_COPYROW_SSE2)
@@ -196,17 +204,14 @@
}
#endif
#if defined(HAS_TRANSPOSEUVWX8_SSE2)
- if (TestCpuFlag(kCpuHasSSE2)) {
- TransposeUVWx8 = TransposeUVWx8_Any_SSE2;
- if (IS_ALIGNED(width, 8)) {
- TransposeUVWx8 = TransposeUVWx8_SSE2;
- }
+ if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 8)) {
+ TransposeUVWx8 = TransposeUVWx8_SSE2;
}
#endif
-#if defined(HAS_TRANSPOSEUVWX8_DSPR2)
- if (TestCpuFlag(kCpuHasDSPR2) && IS_ALIGNED(width, 2) &&
+#if defined(HAS_TRANSPOSEUVWx8_MIPS_DSPR2)
+ if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(width, 2) &&
IS_ALIGNED(src, 4) && IS_ALIGNED(src_stride, 4)) {
- TransposeUVWx8 = TransposeUVWx8_DSPR2;
+ TransposeUVWx8 = TransposeUVWx8_MIPS_DSPR2;
}
#endif
@@ -267,22 +272,22 @@
uint8* dst_b, int dst_stride_b,
int width, int height) {
int i;
- void (*MirrorUVRow)(const uint8* src, uint8* dst_u, uint8* dst_v, int width) =
+ void (*MirrorRowUV)(const uint8* src, uint8* dst_u, uint8* dst_v, int width) =
MirrorUVRow_C;
#if defined(HAS_MIRRORUVROW_NEON)
if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) {
- MirrorUVRow = MirrorUVRow_NEON;
+ MirrorRowUV = MirrorUVRow_NEON;
}
#endif
-#if defined(HAS_MIRRORUVROW_SSSE3)
+#if defined(HAS_MIRRORROW_UV_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 16)) {
- MirrorUVRow = MirrorUVRow_SSSE3;
+ MirrorRowUV = MirrorUVRow_SSSE3;
}
#endif
-#if defined(HAS_MIRRORUVROW_DSPR2)
- if (TestCpuFlag(kCpuHasDSPR2) &&
+#if defined(HAS_MIRRORUVROW_MIPS_DSPR2)
+ if (TestCpuFlag(kCpuHasMIPS_DSPR2) &&
IS_ALIGNED(src, 4) && IS_ALIGNED(src_stride, 4)) {
- MirrorUVRow = MirrorUVRow_DSPR2;
+ MirrorRowUV = MirrorUVRow_MIPS_DSPR2;
}
#endif
@@ -290,7 +295,7 @@
dst_b += dst_stride_b * (height - 1);
for (i = 0; i < height; ++i) {
- MirrorUVRow(src, dst_a, dst_b, width);
+ MirrorRowUV(src, dst_a, dst_b, width);
src += src_stride;
dst_a -= dst_stride_a;
dst_b -= dst_stride_b;
--- a/third_party/libyuv/source/rotate_any.cc
+++ b/third_party/libyuv/source/rotate_any.cc
@@ -18,7 +18,7 @@
extern "C" {
#endif
-#define TANY(NAMEANY, TPOS_SIMD, MASK) \
+#define TANY(NAMEANY, TPOS_SIMD, TPOS_C, MASK) \
void NAMEANY(const uint8* src, int src_stride, \
uint8* dst, int dst_stride, int width) { \
int r = width & MASK; \
@@ -26,48 +26,23 @@
if (n > 0) { \
TPOS_SIMD(src, src_stride, dst, dst_stride, n); \
} \
- TransposeWx8_C(src + n, src_stride, dst + n * dst_stride, dst_stride, r);\
+ TPOS_C(src + n, src_stride, dst + n * dst_stride, dst_stride, r); \
}
#ifdef HAS_TRANSPOSEWX8_NEON
-TANY(TransposeWx8_Any_NEON, TransposeWx8_NEON, 7)
+TANY(TransposeWx8_Any_NEON, TransposeWx8_NEON, TransposeWx8_C, 7)
#endif
#ifdef HAS_TRANSPOSEWX8_SSSE3
-TANY(TransposeWx8_Any_SSSE3, TransposeWx8_SSSE3, 7)
+TANY(TransposeWx8_Any_SSSE3, TransposeWx8_SSSE3, TransposeWx8_C, 7)
#endif
#ifdef HAS_TRANSPOSEWX8_FAST_SSSE3
-TANY(TransposeWx8_Fast_Any_SSSE3, TransposeWx8_Fast_SSSE3, 15)
+TANY(TransposeWx8_Fast_Any_SSSE3, TransposeWx8_Fast_SSSE3, TransposeWx8_C, 15)
#endif
-#ifdef HAS_TRANSPOSEWX8_DSPR2
-TANY(TransposeWx8_Any_DSPR2, TransposeWx8_DSPR2, 7)
+#ifdef HAS_TRANSPOSEWX8_MIPS_DSPR2
+TANY(TransposeWx8_Any_MIPS_DSPR2, TransposeWx8_MIPS_DSPR2, TransposeWx8_C, 7)
#endif
-#undef TANY
-#define TUVANY(NAMEANY, TPOS_SIMD, MASK) \
- void NAMEANY(const uint8* src, int src_stride, \
- uint8* dst_a, int dst_stride_a, \
- uint8* dst_b, int dst_stride_b, int width) { \
- int r = width & MASK; \
- int n = width - r; \
- if (n > 0) { \
- TPOS_SIMD(src, src_stride, dst_a, dst_stride_a, dst_b, dst_stride_b, \
- n); \
- } \
- TransposeUVWx8_C(src + n * 2, src_stride, \
- dst_a + n * dst_stride_a, dst_stride_a, \
- dst_b + n * dst_stride_b, dst_stride_b, r); \
- }
-
-#ifdef HAS_TRANSPOSEUVWX8_NEON
-TUVANY(TransposeUVWx8_Any_NEON, TransposeUVWx8_NEON, 7)
-#endif
-#ifdef HAS_TRANSPOSEUVWX8_SSE2
-TUVANY(TransposeUVWx8_Any_SSE2, TransposeUVWx8_SSE2, 7)
-#endif
-#ifdef HAS_TRANSPOSEUVWX8_DSPR2
-TUVANY(TransposeUVWx8_Any_DSPR2, TransposeUVWx8_DSPR2, 7)
-#endif
-#undef TUVANY
+#undef TANY
#ifdef __cplusplus
} // extern "C"
--- a/third_party/libyuv/source/rotate_gcc.cc
+++ b/third_party/libyuv/source/rotate_gcc.cc
@@ -17,17 +17,16 @@
#endif
// This module is for GCC x86 and x64.
-#if !defined(LIBYUV_DISABLE_X86) && \
- (defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER)))
+#if !defined(LIBYUV_DISABLE_X86) && (defined(__x86_64__) || defined(__i386__))
-// Transpose 8x8. 32 or 64 bit, but not NaCL for 64 bit.
-#if defined(HAS_TRANSPOSEWX8_SSSE3)
+#if !defined(LIBYUV_DISABLE_X86) && \
+ (defined(__i386__) || (defined(__x86_64__) && !defined(__native_client__)))
void TransposeWx8_SSSE3(const uint8* src, int src_stride,
uint8* dst, int dst_stride, int width) {
asm volatile (
// Read in the data from the source pointer.
// First round of bit swap.
- LABELALIGN
+ ".p2align 2 \n"
"1: \n"
"movq (%0),%%xmm0 \n"
"movq (%0,%3),%%xmm1 \n"
@@ -106,260 +105,386 @@
"xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
);
}
-#endif // defined(HAS_TRANSPOSEWX8_SSSE3)
-// Transpose 16x8. 64 bit
-#if defined(HAS_TRANSPOSEWX8_FAST_SSSE3)
+#if !defined(LIBYUV_DISABLE_X86) && defined(__i386__) && !defined(__clang__)
+void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
+ uint8* dst_a, int dst_stride_a,
+ uint8* dst_b, int dst_stride_b, int width);
+ asm (
+ DECLARE_FUNCTION(TransposeUVWx8_SSE2)
+ "push %ebx \n"
+ "push %esi \n"
+ "push %edi \n"
+ "push %ebp \n"
+ "mov 0x14(%esp),%eax \n"
+ "mov 0x18(%esp),%edi \n"
+ "mov 0x1c(%esp),%edx \n"
+ "mov 0x20(%esp),%esi \n"
+ "mov 0x24(%esp),%ebx \n"
+ "mov 0x28(%esp),%ebp \n"
+ "mov %esp,%ecx \n"
+ "sub $0x14,%esp \n"
+ "and $0xfffffff0,%esp \n"
+ "mov %ecx,0x10(%esp) \n"
+ "mov 0x2c(%ecx),%ecx \n"
+
+"1: \n"
+ "movdqu (%eax),%xmm0 \n"
+ "movdqu (%eax,%edi,1),%xmm1 \n"
+ "lea (%eax,%edi,2),%eax \n"
+ "movdqa %xmm0,%xmm7 \n"
+ "punpcklbw %xmm1,%xmm0 \n"
+ "punpckhbw %xmm1,%xmm7 \n"
+ "movdqa %xmm7,%xmm1 \n"
+ "movdqu (%eax),%xmm2 \n"
+ "movdqu (%eax,%edi,1),%xmm3 \n"
+ "lea (%eax,%edi,2),%eax \n"
+ "movdqa %xmm2,%xmm7 \n"
+ "punpcklbw %xmm3,%xmm2 \n"
+ "punpckhbw %xmm3,%xmm7 \n"
+ "movdqa %xmm7,%xmm3 \n"
+ "movdqu (%eax),%xmm4 \n"
+ "movdqu (%eax,%edi,1),%xmm5 \n"
+ "lea (%eax,%edi,2),%eax \n"
+ "movdqa %xmm4,%xmm7 \n"
+ "punpcklbw %xmm5,%xmm4 \n"
+ "punpckhbw %xmm5,%xmm7 \n"
+ "movdqa %xmm7,%xmm5 \n"
+ "movdqu (%eax),%xmm6 \n"
+ "movdqu (%eax,%edi,1),%xmm7 \n"
+ "lea (%eax,%edi,2),%eax \n"
+ "movdqu %xmm5,(%esp) \n"
+ "neg %edi \n"
+ "movdqa %xmm6,%xmm5 \n"
+ "punpcklbw %xmm7,%xmm6 \n"
+ "punpckhbw %xmm7,%xmm5 \n"
+ "movdqa %xmm5,%xmm7 \n"
+ "lea 0x10(%eax,%edi,8),%eax \n"
+ "neg %edi \n"
+ "movdqa %xmm0,%xmm5 \n"
+ "punpcklwd %xmm2,%xmm0 \n"
+ "punpckhwd %xmm2,%xmm5 \n"
+ "movdqa %xmm5,%xmm2 \n"
+ "movdqa %xmm1,%xmm5 \n"
+ "punpcklwd %xmm3,%xmm1 \n"
+ "punpckhwd %xmm3,%xmm5 \n"
+ "movdqa %xmm5,%xmm3 \n"
+ "movdqa %xmm4,%xmm5 \n"
+ "punpcklwd %xmm6,%xmm4 \n"
+ "punpckhwd %xmm6,%xmm5 \n"
+ "movdqa %xmm5,%xmm6 \n"
+ "movdqu (%esp),%xmm5 \n"
+ "movdqu %xmm6,(%esp) \n"
+ "movdqa %xmm5,%xmm6 \n"
+ "punpcklwd %xmm7,%xmm5 \n"
+ "punpckhwd %xmm7,%xmm6 \n"
+ "movdqa %xmm6,%xmm7 \n"
+ "movdqa %xmm0,%xmm6 \n"
+ "punpckldq %xmm4,%xmm0 \n"
+ "punpckhdq %xmm4,%xmm6 \n"
+ "movdqa %xmm6,%xmm4 \n"
+ "movdqu (%esp),%xmm6 \n"
+ "movlpd %xmm0,(%edx) \n"
+ "movhpd %xmm0,(%ebx) \n"
+ "movlpd %xmm4,(%edx,%esi,1) \n"
+ "lea (%edx,%esi,2),%edx \n"
+ "movhpd %xmm4,(%ebx,%ebp,1) \n"
+ "lea (%ebx,%ebp,2),%ebx \n"
+ "movdqa %xmm2,%xmm0 \n"
+ "punpckldq %xmm6,%xmm2 \n"
+ "movlpd %xmm2,(%edx) \n"
+ "movhpd %xmm2,(%ebx) \n"
+ "punpckhdq %xmm6,%xmm0 \n"
+ "movlpd %xmm0,(%edx,%esi,1) \n"
+ "lea (%edx,%esi,2),%edx \n"
+ "movhpd %xmm0,(%ebx,%ebp,1) \n"
+ "lea (%ebx,%ebp,2),%ebx \n"
+ "movdqa %xmm1,%xmm0 \n"
+ "punpckldq %xmm5,%xmm1 \n"
+ "movlpd %xmm1,(%edx) \n"
+ "movhpd %xmm1,(%ebx) \n"
+ "punpckhdq %xmm5,%xmm0 \n"
+ "movlpd %xmm0,(%edx,%esi,1) \n"
+ "lea (%edx,%esi,2),%edx \n"
+ "movhpd %xmm0,(%ebx,%ebp,1) \n"
+ "lea (%ebx,%ebp,2),%ebx \n"
+ "movdqa %xmm3,%xmm0 \n"
+ "punpckldq %xmm7,%xmm3 \n"
+ "movlpd %xmm3,(%edx) \n"
+ "movhpd %xmm3,(%ebx) \n"
+ "punpckhdq %xmm7,%xmm0 \n"
+ "sub $0x8,%ecx \n"
+ "movlpd %xmm0,(%edx,%esi,1) \n"
+ "lea (%edx,%esi,2),%edx \n"
+ "movhpd %xmm0,(%ebx,%ebp,1) \n"
+ "lea (%ebx,%ebp,2),%ebx \n"
+ "jg 1b \n"
+ "mov 0x10(%esp),%esp \n"
+ "pop %ebp \n"
+ "pop %edi \n"
+ "pop %esi \n"
+ "pop %ebx \n"
+#if defined(__native_client__)
+ "pop %ecx \n"
+ "and $0xffffffe0,%ecx \n"
+ "jmp *%ecx \n"
+#else
+ "ret \n"
+#endif
+);
+#endif
+#if !defined(LIBYUV_DISABLE_X86) && !defined(__native_client__) && \
+ defined(__x86_64__)
+// 64 bit version has enough registers to do 16x8 to 8x16 at a time.
void TransposeWx8_Fast_SSSE3(const uint8* src, int src_stride,
uint8* dst, int dst_stride, int width) {
asm volatile (
- // Read in the data from the source pointer.
- // First round of bit swap.
- LABELALIGN
- "1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu (%0,%3),%%xmm1 \n"
- "lea (%0,%3,2),%0 \n"
- "movdqa %%xmm0,%%xmm8 \n"
- "punpcklbw %%xmm1,%%xmm0 \n"
- "punpckhbw %%xmm1,%%xmm8 \n"
- "movdqu (%0),%%xmm2 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "movdqa %%xmm8,%%xmm9 \n"
- "palignr $0x8,%%xmm1,%%xmm1 \n"
- "palignr $0x8,%%xmm9,%%xmm9 \n"
- "movdqu (%0,%3),%%xmm3 \n"
- "lea (%0,%3,2),%0 \n"
- "movdqa %%xmm2,%%xmm10 \n"
- "punpcklbw %%xmm3,%%xmm2 \n"
- "punpckhbw %%xmm3,%%xmm10 \n"
- "movdqa %%xmm2,%%xmm3 \n"
- "movdqa %%xmm10,%%xmm11 \n"
- "movdqu (%0),%%xmm4 \n"
- "palignr $0x8,%%xmm3,%%xmm3 \n"
- "palignr $0x8,%%xmm11,%%xmm11 \n"
- "movdqu (%0,%3),%%xmm5 \n"
- "lea (%0,%3,2),%0 \n"
- "movdqa %%xmm4,%%xmm12 \n"
- "punpcklbw %%xmm5,%%xmm4 \n"
- "punpckhbw %%xmm5,%%xmm12 \n"
- "movdqa %%xmm4,%%xmm5 \n"
- "movdqa %%xmm12,%%xmm13 \n"
- "movdqu (%0),%%xmm6 \n"
- "palignr $0x8,%%xmm5,%%xmm5 \n"
- "palignr $0x8,%%xmm13,%%xmm13 \n"
- "movdqu (%0,%3),%%xmm7 \n"
- "lea (%0,%3,2),%0 \n"
- "movdqa %%xmm6,%%xmm14 \n"
- "punpcklbw %%xmm7,%%xmm6 \n"
- "punpckhbw %%xmm7,%%xmm14 \n"
- "neg %3 \n"
- "movdqa %%xmm6,%%xmm7 \n"
- "movdqa %%xmm14,%%xmm15 \n"
- "lea 0x10(%0,%3,8),%0 \n"
- "palignr $0x8,%%xmm7,%%xmm7 \n"
- "palignr $0x8,%%xmm15,%%xmm15 \n"
- "neg %3 \n"
- // Second round of bit swap.
- "punpcklwd %%xmm2,%%xmm0 \n"
- "punpcklwd %%xmm3,%%xmm1 \n"
- "movdqa %%xmm0,%%xmm2 \n"
- "movdqa %%xmm1,%%xmm3 \n"
- "palignr $0x8,%%xmm2,%%xmm2 \n"
- "palignr $0x8,%%xmm3,%%xmm3 \n"
- "punpcklwd %%xmm6,%%xmm4 \n"
- "punpcklwd %%xmm7,%%xmm5 \n"
- "movdqa %%xmm4,%%xmm6 \n"
- "movdqa %%xmm5,%%xmm7 \n"
- "palignr $0x8,%%xmm6,%%xmm6 \n"
- "palignr $0x8,%%xmm7,%%xmm7 \n"
- "punpcklwd %%xmm10,%%xmm8 \n"
- "punpcklwd %%xmm11,%%xmm9 \n"
- "movdqa %%xmm8,%%xmm10 \n"
- "movdqa %%xmm9,%%xmm11 \n"
- "palignr $0x8,%%xmm10,%%xmm10 \n"
- "palignr $0x8,%%xmm11,%%xmm11 \n"
- "punpcklwd %%xmm14,%%xmm12 \n"
- "punpcklwd %%xmm15,%%xmm13 \n"
- "movdqa %%xmm12,%%xmm14 \n"
- "movdqa %%xmm13,%%xmm15 \n"
- "palignr $0x8,%%xmm14,%%xmm14 \n"
- "palignr $0x8,%%xmm15,%%xmm15 \n"
- // Third round of bit swap.
- // Write to the destination pointer.
- "punpckldq %%xmm4,%%xmm0 \n"
- "movq %%xmm0,(%1) \n"
- "movdqa %%xmm0,%%xmm4 \n"
- "palignr $0x8,%%xmm4,%%xmm4 \n"
- "movq %%xmm4,(%1,%4) \n"
- "lea (%1,%4,2),%1 \n"
- "punpckldq %%xmm6,%%xmm2 \n"
- "movdqa %%xmm2,%%xmm6 \n"
- "movq %%xmm2,(%1) \n"
- "palignr $0x8,%%xmm6,%%xmm6 \n"
- "punpckldq %%xmm5,%%xmm1 \n"
- "movq %%xmm6,(%1,%4) \n"
- "lea (%1,%4,2),%1 \n"
- "movdqa %%xmm1,%%xmm5 \n"
- "movq %%xmm1,(%1) \n"
- "palignr $0x8,%%xmm5,%%xmm5 \n"
- "movq %%xmm5,(%1,%4) \n"
- "lea (%1,%4,2),%1 \n"
- "punpckldq %%xmm7,%%xmm3 \n"
- "movq %%xmm3,(%1) \n"
- "movdqa %%xmm3,%%xmm7 \n"
- "palignr $0x8,%%xmm7,%%xmm7 \n"
- "movq %%xmm7,(%1,%4) \n"
- "lea (%1,%4,2),%1 \n"
- "punpckldq %%xmm12,%%xmm8 \n"
- "movq %%xmm8,(%1) \n"
- "movdqa %%xmm8,%%xmm12 \n"
- "palignr $0x8,%%xmm12,%%xmm12 \n"
- "movq %%xmm12,(%1,%4) \n"
- "lea (%1,%4,2),%1 \n"
- "punpckldq %%xmm14,%%xmm10 \n"
- "movdqa %%xmm10,%%xmm14 \n"
- "movq %%xmm10,(%1) \n"
- "palignr $0x8,%%xmm14,%%xmm14 \n"
- "punpckldq %%xmm13,%%xmm9 \n"
- "movq %%xmm14,(%1,%4) \n"
- "lea (%1,%4,2),%1 \n"
- "movdqa %%xmm9,%%xmm13 \n"
- "movq %%xmm9,(%1) \n"
- "palignr $0x8,%%xmm13,%%xmm13 \n"
- "movq %%xmm13,(%1,%4) \n"
- "lea (%1,%4,2),%1 \n"
- "punpckldq %%xmm15,%%xmm11 \n"
- "movq %%xmm11,(%1) \n"
- "movdqa %%xmm11,%%xmm15 \n"
- "palignr $0x8,%%xmm15,%%xmm15 \n"
- "sub $0x10,%2 \n"
- "movq %%xmm15,(%1,%4) \n"
- "lea (%1,%4,2),%1 \n"
- "jg 1b \n"
- : "+r"(src), // %0
- "+r"(dst), // %1
- "+r"(width) // %2
- : "r"((intptr_t)(src_stride)), // %3
- "r"((intptr_t)(dst_stride)) // %4
- : "memory", "cc",
- "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7",
- "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15"
- );
+ // Read in the data from the source pointer.
+ // First round of bit swap.
+ ".p2align 2 \n"
+"1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu (%0,%3),%%xmm1 \n"
+ "lea (%0,%3,2),%0 \n"
+ "movdqa %%xmm0,%%xmm8 \n"
+ "punpcklbw %%xmm1,%%xmm0 \n"
+ "punpckhbw %%xmm1,%%xmm8 \n"
+ "movdqu (%0),%%xmm2 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "movdqa %%xmm8,%%xmm9 \n"
+ "palignr $0x8,%%xmm1,%%xmm1 \n"
+ "palignr $0x8,%%xmm9,%%xmm9 \n"
+ "movdqu (%0,%3),%%xmm3 \n"
+ "lea (%0,%3,2),%0 \n"
+ "movdqa %%xmm2,%%xmm10 \n"
+ "punpcklbw %%xmm3,%%xmm2 \n"
+ "punpckhbw %%xmm3,%%xmm10 \n"
+ "movdqa %%xmm2,%%xmm3 \n"
+ "movdqa %%xmm10,%%xmm11 \n"
+ "movdqu (%0),%%xmm4 \n"
+ "palignr $0x8,%%xmm3,%%xmm3 \n"
+ "palignr $0x8,%%xmm11,%%xmm11 \n"
+ "movdqu (%0,%3),%%xmm5 \n"
+ "lea (%0,%3,2),%0 \n"
+ "movdqa %%xmm4,%%xmm12 \n"
+ "punpcklbw %%xmm5,%%xmm4 \n"
+ "punpckhbw %%xmm5,%%xmm12 \n"
+ "movdqa %%xmm4,%%xmm5 \n"
+ "movdqa %%xmm12,%%xmm13 \n"
+ "movdqu (%0),%%xmm6 \n"
+ "palignr $0x8,%%xmm5,%%xmm5 \n"
+ "palignr $0x8,%%xmm13,%%xmm13 \n"
+ "movdqu (%0,%3),%%xmm7 \n"
+ "lea (%0,%3,2),%0 \n"
+ "movdqa %%xmm6,%%xmm14 \n"
+ "punpcklbw %%xmm7,%%xmm6 \n"
+ "punpckhbw %%xmm7,%%xmm14 \n"
+ "neg %3 \n"
+ "movdqa %%xmm6,%%xmm7 \n"
+ "movdqa %%xmm14,%%xmm15 \n"
+ "lea 0x10(%0,%3,8),%0 \n"
+ "palignr $0x8,%%xmm7,%%xmm7 \n"
+ "palignr $0x8,%%xmm15,%%xmm15 \n"
+ "neg %3 \n"
+ // Second round of bit swap.
+ "punpcklwd %%xmm2,%%xmm0 \n"
+ "punpcklwd %%xmm3,%%xmm1 \n"
+ "movdqa %%xmm0,%%xmm2 \n"
+ "movdqa %%xmm1,%%xmm3 \n"
+ "palignr $0x8,%%xmm2,%%xmm2 \n"
+ "palignr $0x8,%%xmm3,%%xmm3 \n"
+ "punpcklwd %%xmm6,%%xmm4 \n"
+ "punpcklwd %%xmm7,%%xmm5 \n"
+ "movdqa %%xmm4,%%xmm6 \n"
+ "movdqa %%xmm5,%%xmm7 \n"
+ "palignr $0x8,%%xmm6,%%xmm6 \n"
+ "palignr $0x8,%%xmm7,%%xmm7 \n"
+ "punpcklwd %%xmm10,%%xmm8 \n"
+ "punpcklwd %%xmm11,%%xmm9 \n"
+ "movdqa %%xmm8,%%xmm10 \n"
+ "movdqa %%xmm9,%%xmm11 \n"
+ "palignr $0x8,%%xmm10,%%xmm10 \n"
+ "palignr $0x8,%%xmm11,%%xmm11 \n"
+ "punpcklwd %%xmm14,%%xmm12 \n"
+ "punpcklwd %%xmm15,%%xmm13 \n"
+ "movdqa %%xmm12,%%xmm14 \n"
+ "movdqa %%xmm13,%%xmm15 \n"
+ "palignr $0x8,%%xmm14,%%xmm14 \n"
+ "palignr $0x8,%%xmm15,%%xmm15 \n"
+ // Third round of bit swap.
+ // Write to the destination pointer.
+ "punpckldq %%xmm4,%%xmm0 \n"
+ "movq %%xmm0,(%1) \n"
+ "movdqa %%xmm0,%%xmm4 \n"
+ "palignr $0x8,%%xmm4,%%xmm4 \n"
+ "movq %%xmm4,(%1,%4) \n"
+ "lea (%1,%4,2),%1 \n"
+ "punpckldq %%xmm6,%%xmm2 \n"
+ "movdqa %%xmm2,%%xmm6 \n"
+ "movq %%xmm2,(%1) \n"
+ "palignr $0x8,%%xmm6,%%xmm6 \n"
+ "punpckldq %%xmm5,%%xmm1 \n"
+ "movq %%xmm6,(%1,%4) \n"
+ "lea (%1,%4,2),%1 \n"
+ "movdqa %%xmm1,%%xmm5 \n"
+ "movq %%xmm1,(%1) \n"
+ "palignr $0x8,%%xmm5,%%xmm5 \n"
+ "movq %%xmm5,(%1,%4) \n"
+ "lea (%1,%4,2),%1 \n"
+ "punpckldq %%xmm7,%%xmm3 \n"
+ "movq %%xmm3,(%1) \n"
+ "movdqa %%xmm3,%%xmm7 \n"
+ "palignr $0x8,%%xmm7,%%xmm7 \n"
+ "movq %%xmm7,(%1,%4) \n"
+ "lea (%1,%4,2),%1 \n"
+ "punpckldq %%xmm12,%%xmm8 \n"
+ "movq %%xmm8,(%1) \n"
+ "movdqa %%xmm8,%%xmm12 \n"
+ "palignr $0x8,%%xmm12,%%xmm12 \n"
+ "movq %%xmm12,(%1,%4) \n"
+ "lea (%1,%4,2),%1 \n"
+ "punpckldq %%xmm14,%%xmm10 \n"
+ "movdqa %%xmm10,%%xmm14 \n"
+ "movq %%xmm10,(%1) \n"
+ "palignr $0x8,%%xmm14,%%xmm14 \n"
+ "punpckldq %%xmm13,%%xmm9 \n"
+ "movq %%xmm14,(%1,%4) \n"
+ "lea (%1,%4,2),%1 \n"
+ "movdqa %%xmm9,%%xmm13 \n"
+ "movq %%xmm9,(%1) \n"
+ "palignr $0x8,%%xmm13,%%xmm13 \n"
+ "movq %%xmm13,(%1,%4) \n"
+ "lea (%1,%4,2),%1 \n"
+ "punpckldq %%xmm15,%%xmm11 \n"
+ "movq %%xmm11,(%1) \n"
+ "movdqa %%xmm11,%%xmm15 \n"
+ "palignr $0x8,%%xmm15,%%xmm15 \n"
+ "sub $0x10,%2 \n"
+ "movq %%xmm15,(%1,%4) \n"
+ "lea (%1,%4,2),%1 \n"
+ "jg 1b \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(width) // %2
+ : "r"((intptr_t)(src_stride)), // %3
+ "r"((intptr_t)(dst_stride)) // %4
+ : "memory", "cc",
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7",
+ "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15"
+);
}
-#endif // defined(HAS_TRANSPOSEWX8_FAST_SSSE3)
-// Transpose UV 8x8. 64 bit.
-#if defined(HAS_TRANSPOSEUVWX8_SSE2)
void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
uint8* dst_a, int dst_stride_a,
uint8* dst_b, int dst_stride_b, int width) {
asm volatile (
- // Read in the data from the source pointer.
- // First round of bit swap.
- LABELALIGN
- "1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu (%0,%4),%%xmm1 \n"
- "lea (%0,%4,2),%0 \n"
- "movdqa %%xmm0,%%xmm8 \n"
- "punpcklbw %%xmm1,%%xmm0 \n"
- "punpckhbw %%xmm1,%%xmm8 \n"
- "movdqa %%xmm8,%%xmm1 \n"
- "movdqu (%0),%%xmm2 \n"
- "movdqu (%0,%4),%%xmm3 \n"
- "lea (%0,%4,2),%0 \n"
- "movdqa %%xmm2,%%xmm8 \n"
- "punpcklbw %%xmm3,%%xmm2 \n"
- "punpckhbw %%xmm3,%%xmm8 \n"
- "movdqa %%xmm8,%%xmm3 \n"
- "movdqu (%0),%%xmm4 \n"
- "movdqu (%0,%4),%%xmm5 \n"
- "lea (%0,%4,2),%0 \n"
- "movdqa %%xmm4,%%xmm8 \n"
- "punpcklbw %%xmm5,%%xmm4 \n"
- "punpckhbw %%xmm5,%%xmm8 \n"
- "movdqa %%xmm8,%%xmm5 \n"
- "movdqu (%0),%%xmm6 \n"
- "movdqu (%0,%4),%%xmm7 \n"
- "lea (%0,%4,2),%0 \n"
- "movdqa %%xmm6,%%xmm8 \n"
- "punpcklbw %%xmm7,%%xmm6 \n"
- "neg %4 \n"
- "lea 0x10(%0,%4,8),%0 \n"
- "punpckhbw %%xmm7,%%xmm8 \n"
- "movdqa %%xmm8,%%xmm7 \n"
- "neg %4 \n"
- // Second round of bit swap.
- "movdqa %%xmm0,%%xmm8 \n"
- "movdqa %%xmm1,%%xmm9 \n"
- "punpckhwd %%xmm2,%%xmm8 \n"
- "punpckhwd %%xmm3,%%xmm9 \n"
- "punpcklwd %%xmm2,%%xmm0 \n"
- "punpcklwd %%xmm3,%%xmm1 \n"
- "movdqa %%xmm8,%%xmm2 \n"
- "movdqa %%xmm9,%%xmm3 \n"
- "movdqa %%xmm4,%%xmm8 \n"
- "movdqa %%xmm5,%%xmm9 \n"
- "punpckhwd %%xmm6,%%xmm8 \n"
- "punpckhwd %%xmm7,%%xmm9 \n"
- "punpcklwd %%xmm6,%%xmm4 \n"
- "punpcklwd %%xmm7,%%xmm5 \n"
- "movdqa %%xmm8,%%xmm6 \n"
- "movdqa %%xmm9,%%xmm7 \n"
- // Third round of bit swap.
- // Write to the destination pointer.
- "movdqa %%xmm0,%%xmm8 \n"
- "punpckldq %%xmm4,%%xmm0 \n"
- "movlpd %%xmm0,(%1) \n" // Write back U channel
- "movhpd %%xmm0,(%2) \n" // Write back V channel
- "punpckhdq %%xmm4,%%xmm8 \n"
- "movlpd %%xmm8,(%1,%5) \n"
- "lea (%1,%5,2),%1 \n"
- "movhpd %%xmm8,(%2,%6) \n"
- "lea (%2,%6,2),%2 \n"
- "movdqa %%xmm2,%%xmm8 \n"
- "punpckldq %%xmm6,%%xmm2 \n"
- "movlpd %%xmm2,(%1) \n"
- "movhpd %%xmm2,(%2) \n"
- "punpckhdq %%xmm6,%%xmm8 \n"
- "movlpd %%xmm8,(%1,%5) \n"
- "lea (%1,%5,2),%1 \n"
- "movhpd %%xmm8,(%2,%6) \n"
- "lea (%2,%6,2),%2 \n"
- "movdqa %%xmm1,%%xmm8 \n"
- "punpckldq %%xmm5,%%xmm1 \n"
- "movlpd %%xmm1,(%1) \n"
- "movhpd %%xmm1,(%2) \n"
- "punpckhdq %%xmm5,%%xmm8 \n"
- "movlpd %%xmm8,(%1,%5) \n"
- "lea (%1,%5,2),%1 \n"
- "movhpd %%xmm8,(%2,%6) \n"
- "lea (%2,%6,2),%2 \n"
- "movdqa %%xmm3,%%xmm8 \n"
- "punpckldq %%xmm7,%%xmm3 \n"
- "movlpd %%xmm3,(%1) \n"
- "movhpd %%xmm3,(%2) \n"
- "punpckhdq %%xmm7,%%xmm8 \n"
- "sub $0x8,%3 \n"
- "movlpd %%xmm8,(%1,%5) \n"
- "lea (%1,%5,2),%1 \n"
- "movhpd %%xmm8,(%2,%6) \n"
- "lea (%2,%6,2),%2 \n"
- "jg 1b \n"
- : "+r"(src), // %0
- "+r"(dst_a), // %1
- "+r"(dst_b), // %2
- "+r"(width) // %3
- : "r"((intptr_t)(src_stride)), // %4
- "r"((intptr_t)(dst_stride_a)), // %5
- "r"((intptr_t)(dst_stride_b)) // %6
- : "memory", "cc",
- "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7",
- "xmm8", "xmm9"
- );
+ // Read in the data from the source pointer.
+ // First round of bit swap.
+ ".p2align 2 \n"
+"1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu (%0,%4),%%xmm1 \n"
+ "lea (%0,%4,2),%0 \n"
+ "movdqa %%xmm0,%%xmm8 \n"
+ "punpcklbw %%xmm1,%%xmm0 \n"
+ "punpckhbw %%xmm1,%%xmm8 \n"
+ "movdqa %%xmm8,%%xmm1 \n"
+ "movdqu (%0),%%xmm2 \n"
+ "movdqu (%0,%4),%%xmm3 \n"
+ "lea (%0,%4,2),%0 \n"
+ "movdqa %%xmm2,%%xmm8 \n"
+ "punpcklbw %%xmm3,%%xmm2 \n"
+ "punpckhbw %%xmm3,%%xmm8 \n"
+ "movdqa %%xmm8,%%xmm3 \n"
+ "movdqu (%0),%%xmm4 \n"
+ "movdqu (%0,%4),%%xmm5 \n"
+ "lea (%0,%4,2),%0 \n"
+ "movdqa %%xmm4,%%xmm8 \n"
+ "punpcklbw %%xmm5,%%xmm4 \n"
+ "punpckhbw %%xmm5,%%xmm8 \n"
+ "movdqa %%xmm8,%%xmm5 \n"
+ "movdqu (%0),%%xmm6 \n"
+ "movdqu (%0,%4),%%xmm7 \n"
+ "lea (%0,%4,2),%0 \n"
+ "movdqa %%xmm6,%%xmm8 \n"
+ "punpcklbw %%xmm7,%%xmm6 \n"
+ "neg %4 \n"
+ "lea 0x10(%0,%4,8),%0 \n"
+ "punpckhbw %%xmm7,%%xmm8 \n"
+ "movdqa %%xmm8,%%xmm7 \n"
+ "neg %4 \n"
+ // Second round of bit swap.
+ "movdqa %%xmm0,%%xmm8 \n"
+ "movdqa %%xmm1,%%xmm9 \n"
+ "punpckhwd %%xmm2,%%xmm8 \n"
+ "punpckhwd %%xmm3,%%xmm9 \n"
+ "punpcklwd %%xmm2,%%xmm0 \n"
+ "punpcklwd %%xmm3,%%xmm1 \n"
+ "movdqa %%xmm8,%%xmm2 \n"
+ "movdqa %%xmm9,%%xmm3 \n"
+ "movdqa %%xmm4,%%xmm8 \n"
+ "movdqa %%xmm5,%%xmm9 \n"
+ "punpckhwd %%xmm6,%%xmm8 \n"
+ "punpckhwd %%xmm7,%%xmm9 \n"
+ "punpcklwd %%xmm6,%%xmm4 \n"
+ "punpcklwd %%xmm7,%%xmm5 \n"
+ "movdqa %%xmm8,%%xmm6 \n"
+ "movdqa %%xmm9,%%xmm7 \n"
+ // Third round of bit swap.
+ // Write to the destination pointer.
+ "movdqa %%xmm0,%%xmm8 \n"
+ "punpckldq %%xmm4,%%xmm0 \n"
+ "movlpd %%xmm0,(%1) \n" // Write back U channel
+ "movhpd %%xmm0,(%2) \n" // Write back V channel
+ "punpckhdq %%xmm4,%%xmm8 \n"
+ "movlpd %%xmm8,(%1,%5) \n"
+ "lea (%1,%5,2),%1 \n"
+ "movhpd %%xmm8,(%2,%6) \n"
+ "lea (%2,%6,2),%2 \n"
+ "movdqa %%xmm2,%%xmm8 \n"
+ "punpckldq %%xmm6,%%xmm2 \n"
+ "movlpd %%xmm2,(%1) \n"
+ "movhpd %%xmm2,(%2) \n"
+ "punpckhdq %%xmm6,%%xmm8 \n"
+ "movlpd %%xmm8,(%1,%5) \n"
+ "lea (%1,%5,2),%1 \n"
+ "movhpd %%xmm8,(%2,%6) \n"
+ "lea (%2,%6,2),%2 \n"
+ "movdqa %%xmm1,%%xmm8 \n"
+ "punpckldq %%xmm5,%%xmm1 \n"
+ "movlpd %%xmm1,(%1) \n"
+ "movhpd %%xmm1,(%2) \n"
+ "punpckhdq %%xmm5,%%xmm8 \n"
+ "movlpd %%xmm8,(%1,%5) \n"
+ "lea (%1,%5,2),%1 \n"
+ "movhpd %%xmm8,(%2,%6) \n"
+ "lea (%2,%6,2),%2 \n"
+ "movdqa %%xmm3,%%xmm8 \n"
+ "punpckldq %%xmm7,%%xmm3 \n"
+ "movlpd %%xmm3,(%1) \n"
+ "movhpd %%xmm3,(%2) \n"
+ "punpckhdq %%xmm7,%%xmm8 \n"
+ "sub $0x8,%3 \n"
+ "movlpd %%xmm8,(%1,%5) \n"
+ "lea (%1,%5,2),%1 \n"
+ "movhpd %%xmm8,(%2,%6) \n"
+ "lea (%2,%6,2),%2 \n"
+ "jg 1b \n"
+ : "+r"(src), // %0
+ "+r"(dst_a), // %1
+ "+r"(dst_b), // %2
+ "+r"(width) // %3
+ : "r"((intptr_t)(src_stride)), // %4
+ "r"((intptr_t)(dst_stride_a)), // %5
+ "r"((intptr_t)(dst_stride_b)) // %6
+ : "memory", "cc",
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7",
+ "xmm8", "xmm9"
+);
}
-#endif // defined(HAS_TRANSPOSEUVWX8_SSE2)
+#endif
+#endif
+
#endif // defined(__x86_64__) || defined(__i386__)
#ifdef __cplusplus
--- a/third_party/libyuv/source/rotate_mips.cc
+++ b/third_party/libyuv/source/rotate_mips.cc
@@ -22,8 +22,8 @@
defined(__mips_dsp) && (__mips_dsp_rev >= 2) && \
(_MIPS_SIM == _MIPS_SIM_ABI32)
-void TransposeWx8_DSPR2(const uint8* src, int src_stride,
- uint8* dst, int dst_stride, int width) {
+void TransposeWx8_MIPS_DSPR2(const uint8* src, int src_stride,
+ uint8* dst, int dst_stride, int width) {
__asm__ __volatile__ (
".set push \n"
".set noreorder \n"
@@ -106,8 +106,8 @@
);
}
-void TransposeWx8_Fast_DSPR2(const uint8* src, int src_stride,
- uint8* dst, int dst_stride, int width) {
+void TransposeWx8_Fast_MIPS_DSPR2(const uint8* src, int src_stride,
+ uint8* dst, int dst_stride, int width) {
__asm__ __volatile__ (
".set noat \n"
".set push \n"
@@ -308,10 +308,10 @@
);
}
-void TransposeUVWx8_DSPR2(const uint8* src, int src_stride,
- uint8* dst_a, int dst_stride_a,
- uint8* dst_b, int dst_stride_b,
- int width) {
+void TransposeUVWx8_MIPS_DSPR2(const uint8* src, int src_stride,
+ uint8* dst_a, int dst_stride_a,
+ uint8* dst_b, int dst_stride_b,
+ int width) {
__asm__ __volatile__ (
".set push \n"
".set noreorder \n"
--- a/third_party/libyuv/source/rotate_neon.cc
+++ b/third_party/libyuv/source/rotate_neon.cc
@@ -27,7 +27,7 @@
void TransposeWx8_NEON(const uint8* src, int src_stride,
uint8* dst, int dst_stride,
int width) {
- const uint8* src_temp;
+ const uint8* src_temp = NULL;
asm volatile (
// loops are on blocks of 8. loop will stop when
// counter gets to or below 0. starting the counter
@@ -35,6 +35,7 @@
"sub %5, #8 \n"
// handle 8x8 blocks. this should be the majority of the plane
+ ".p2align 2 \n"
"1: \n"
"mov %0, %1 \n"
@@ -229,7 +230,7 @@
"4: \n"
- : "=&r"(src_temp), // %0
+ : "+r"(src_temp), // %0
"+r"(src), // %1
"+r"(src_stride), // %2
"+r"(dst), // %3
@@ -247,7 +248,7 @@
uint8* dst_a, int dst_stride_a,
uint8* dst_b, int dst_stride_b,
int width) {
- const uint8* src_temp;
+ const uint8* src_temp = NULL;
asm volatile (
// loops are on blocks of 8. loop will stop when
// counter gets to or below 0. starting the counter
@@ -255,6 +256,7 @@
"sub %7, #8 \n"
// handle 8x8 blocks. this should be the majority of the plane
+ ".p2align 2 \n"
"1: \n"
"mov %0, %1 \n"
@@ -512,7 +514,7 @@
"4: \n"
- : "=&r"(src_temp), // %0
+ : "+r"(src_temp), // %0
"+r"(src), // %1
"+r"(src_stride), // %2
"+r"(dst_a), // %3
--- a/third_party/libyuv/source/rotate_neon64.cc
+++ b/third_party/libyuv/source/rotate_neon64.cc
@@ -26,7 +26,7 @@
void TransposeWx8_NEON(const uint8* src, int src_stride,
uint8* dst, int dst_stride, int width) {
- const uint8* src_temp;
+ const uint8* src_temp = NULL;
int64 width64 = (int64) width; // Work around clang 3.4 warning.
asm volatile (
// loops are on blocks of 8. loop will stop when
@@ -235,7 +235,7 @@
"4: \n"
- : "=&r"(src_temp), // %0
+ : "+r"(src_temp), // %0
"+r"(src), // %1
"+r"(dst), // %2
"+r"(width64) // %3
@@ -255,7 +255,7 @@
uint8* dst_a, int dst_stride_a,
uint8* dst_b, int dst_stride_b,
int width) {
- const uint8* src_temp;
+ const uint8* src_temp = NULL;
int64 width64 = (int64) width; // Work around clang 3.4 warning.
asm volatile (
// loops are on blocks of 8. loop will stop when
@@ -520,7 +520,7 @@
"4: \n"
- : "=&r"(src_temp), // %0
+ : "+r"(src_temp), // %0
"+r"(src), // %1
"+r"(dst_a), // %2
"+r"(dst_b), // %3
--- a/third_party/libyuv/source/rotate_win.cc
+++ b/third_party/libyuv/source/rotate_win.cc
@@ -16,8 +16,9 @@
extern "C" {
#endif
-// This module is for 32 bit Visual C x86 and clangcl
-#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86)
+// This module is for Visual C x86.
+#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && \
+ defined(_MSC_VER) && !defined(__clang__)
__declspec(naked)
void TransposeWx8_SSSE3(const uint8* src, int src_stride,
--- a/third_party/libyuv/source/row_any.cc
+++ b/third_party/libyuv/source/row_any.cc
@@ -22,39 +22,6 @@
// Subsampled source needs to be increase by 1 of not even.
#define SS(width, shift) (((width) + (1 << (shift)) - 1) >> (shift))
-// Any 4 planes to 1 with yuvconstants
-#define ANY41C(NAMEANY, ANY_SIMD, UVSHIFT, DUVSHIFT, BPP, MASK) \
- void NAMEANY(const uint8* y_buf, const uint8* u_buf, const uint8* v_buf, \
- const uint8* a_buf, uint8* dst_ptr, \
- const struct YuvConstants* yuvconstants, int width) { \
- SIMD_ALIGNED(uint8 temp[64 * 5]); \
- memset(temp, 0, 64 * 4); /* for msan */ \
- int r = width & MASK; \
- int n = width & ~MASK; \
- if (n > 0) { \
- ANY_SIMD(y_buf, u_buf, v_buf, a_buf, dst_ptr, yuvconstants, n); \
- } \
- memcpy(temp, y_buf + n, r); \
- memcpy(temp + 64, u_buf + (n >> UVSHIFT), SS(r, UVSHIFT)); \
- memcpy(temp + 128, v_buf + (n >> UVSHIFT), SS(r, UVSHIFT)); \
- memcpy(temp + 192, a_buf + n, r); \
- ANY_SIMD(temp, temp + 64, temp + 128, temp + 192, temp + 256, \
- yuvconstants, MASK + 1); \
- memcpy(dst_ptr + (n >> DUVSHIFT) * BPP, temp + 256, \
- SS(r, DUVSHIFT) * BPP); \
- }
-
-#ifdef HAS_I422ALPHATOARGBROW_SSSE3
-ANY41C(I422AlphaToARGBRow_Any_SSSE3, I422AlphaToARGBRow_SSSE3, 1, 0, 4, 7)
-#endif
-#ifdef HAS_I422ALPHATOARGBROW_AVX2
-ANY41C(I422AlphaToARGBRow_Any_AVX2, I422AlphaToARGBRow_AVX2, 1, 0, 4, 15)
-#endif
-#ifdef HAS_I422ALPHATOARGBROW_NEON
-ANY41C(I422AlphaToARGBRow_Any_NEON, I422AlphaToARGBRow_NEON, 1, 0, 4, 7)
-#endif
-#undef ANY41C
-
// Any 3 planes to 1.
#define ANY31(NAMEANY, ANY_SIMD, UVSHIFT, DUVSHIFT, BPP, MASK) \
void NAMEANY(const uint8* y_buf, const uint8* u_buf, const uint8* v_buf, \
@@ -73,100 +40,83 @@
memcpy(dst_ptr + (n >> DUVSHIFT) * BPP, temp + 192, \
SS(r, DUVSHIFT) * BPP); \
}
-#ifdef HAS_I422TOYUY2ROW_SSE2
-ANY31(I422ToYUY2Row_Any_SSE2, I422ToYUY2Row_SSE2, 1, 1, 4, 15)
-ANY31(I422ToUYVYRow_Any_SSE2, I422ToUYVYRow_SSE2, 1, 1, 4, 15)
-#endif
-#ifdef HAS_I422TOYUY2ROW_NEON
-ANY31(I422ToYUY2Row_Any_NEON, I422ToYUY2Row_NEON, 1, 1, 4, 15)
-#endif
-#ifdef HAS_I422TOUYVYROW_NEON
-ANY31(I422ToUYVYRow_Any_NEON, I422ToUYVYRow_NEON, 1, 1, 4, 15)
-#endif
-#ifdef HAS_BLENDPLANEROW_AVX2
-ANY31(BlendPlaneRow_Any_AVX2, BlendPlaneRow_AVX2, 0, 0, 1, 31)
-#endif
-#ifdef HAS_BLENDPLANEROW_SSSE3
-ANY31(BlendPlaneRow_Any_SSSE3, BlendPlaneRow_SSSE3, 0, 0, 1, 7)
-#endif
-#undef ANY31
-// Note that odd width replication includes 444 due to implementation
-// on arm that subsamples 444 to 422 internally.
-// Any 3 planes to 1 with yuvconstants
-#define ANY31C(NAMEANY, ANY_SIMD, UVSHIFT, DUVSHIFT, BPP, MASK) \
- void NAMEANY(const uint8* y_buf, const uint8* u_buf, const uint8* v_buf, \
- uint8* dst_ptr, const struct YuvConstants* yuvconstants, \
- int width) { \
- SIMD_ALIGNED(uint8 temp[64 * 4]); \
- memset(temp, 0, 64 * 3); /* for YUY2 and msan */ \
- int r = width & MASK; \
- int n = width & ~MASK; \
- if (n > 0) { \
- ANY_SIMD(y_buf, u_buf, v_buf, dst_ptr, yuvconstants, n); \
- } \
- memcpy(temp, y_buf + n, r); \
- memcpy(temp + 64, u_buf + (n >> UVSHIFT), SS(r, UVSHIFT)); \
- memcpy(temp + 128, v_buf + (n >> UVSHIFT), SS(r, UVSHIFT)); \
- if (width & 1) { \
- temp[64 + SS(r, UVSHIFT)] = temp[64 + SS(r, UVSHIFT) - 1]; \
- temp[128 + SS(r, UVSHIFT)] = temp[128 + SS(r, UVSHIFT) - 1]; \
- } \
- ANY_SIMD(temp, temp + 64, temp + 128, temp + 192, \
- yuvconstants, MASK + 1); \
- memcpy(dst_ptr + (n >> DUVSHIFT) * BPP, temp + 192, \
- SS(r, DUVSHIFT) * BPP); \
- }
-
#ifdef HAS_I422TOARGBROW_SSSE3
-ANY31C(I422ToARGBRow_Any_SSSE3, I422ToARGBRow_SSSE3, 1, 0, 4, 7)
+ANY31(I422ToARGBRow_Any_SSSE3, I422ToARGBRow_SSSE3, 1, 0, 4, 7)
#endif
-#ifdef HAS_I411TOARGBROW_SSSE3
-ANY31C(I411ToARGBRow_Any_SSSE3, I411ToARGBRow_SSSE3, 2, 0, 4, 7)
-#endif
#ifdef HAS_I444TOARGBROW_SSSE3
-ANY31C(I444ToARGBRow_Any_SSSE3, I444ToARGBRow_SSSE3, 0, 0, 4, 7)
-ANY31C(I422ToRGBARow_Any_SSSE3, I422ToRGBARow_SSSE3, 1, 0, 4, 7)
-ANY31C(I422ToARGB4444Row_Any_SSSE3, I422ToARGB4444Row_SSSE3, 1, 0, 2, 7)
-ANY31C(I422ToARGB1555Row_Any_SSSE3, I422ToARGB1555Row_SSSE3, 1, 0, 2, 7)
-ANY31C(I422ToRGB565Row_Any_SSSE3, I422ToRGB565Row_SSSE3, 1, 0, 2, 7)
-ANY31C(I422ToRGB24Row_Any_SSSE3, I422ToRGB24Row_SSSE3, 1, 0, 3, 7)
+ANY31(I444ToARGBRow_Any_SSSE3, I444ToARGBRow_SSSE3, 0, 0, 4, 7)
+ANY31(I411ToARGBRow_Any_SSSE3, I411ToARGBRow_SSSE3, 2, 0, 4, 7)
+ANY31(I422ToBGRARow_Any_SSSE3, I422ToBGRARow_SSSE3, 1, 0, 4, 7)
+ANY31(I422ToABGRRow_Any_SSSE3, I422ToABGRRow_SSSE3, 1, 0, 4, 7)
+ANY31(I422ToRGBARow_Any_SSSE3, I422ToRGBARow_SSSE3, 1, 0, 4, 7)
+ANY31(I422ToARGB4444Row_Any_SSSE3, I422ToARGB4444Row_SSSE3, 1, 0, 2, 7)
+ANY31(I422ToARGB1555Row_Any_SSSE3, I422ToARGB1555Row_SSSE3, 1, 0, 2, 7)
+ANY31(I422ToRGB565Row_Any_SSSE3, I422ToRGB565Row_SSSE3, 1, 0, 2, 7)
+ANY31(I422ToRGB24Row_Any_SSSE3, I422ToRGB24Row_SSSE3, 1, 0, 3, 7)
+ANY31(I422ToRAWRow_Any_SSSE3, I422ToRAWRow_SSSE3, 1, 0, 3, 7)
+ANY31(I422ToYUY2Row_Any_SSE2, I422ToYUY2Row_SSE2, 1, 1, 4, 15)
+ANY31(I422ToUYVYRow_Any_SSE2, I422ToUYVYRow_SSE2, 1, 1, 4, 15)
#endif // HAS_I444TOARGBROW_SSSE3
#ifdef HAS_I422TORGB24ROW_AVX2
-ANY31C(I422ToRGB24Row_Any_AVX2, I422ToRGB24Row_AVX2, 1, 0, 3, 15)
+ANY31(I422ToRGB24Row_Any_AVX2, I422ToRGB24Row_AVX2, 1, 0, 3, 15)
#endif
+#ifdef HAS_I422TORAWROW_AVX2
+ANY31(I422ToRAWRow_Any_AVX2, I422ToRAWRow_AVX2, 1, 0, 3, 15)
+#endif
+#ifdef HAS_J422TOARGBROW_SSSE3
+ANY31(J422ToARGBRow_Any_SSSE3, J422ToARGBRow_SSSE3, 1, 0, 4, 7)
+#endif
+#ifdef HAS_J422TOARGBROW_AVX2
+ANY31(J422ToARGBRow_Any_AVX2, J422ToARGBRow_AVX2, 1, 0, 4, 15)
+#endif
#ifdef HAS_I422TOARGBROW_AVX2
-ANY31C(I422ToARGBRow_Any_AVX2, I422ToARGBRow_AVX2, 1, 0, 4, 15)
+ANY31(I422ToARGBRow_Any_AVX2, I422ToARGBRow_AVX2, 1, 0, 4, 15)
#endif
+#ifdef HAS_I422TOBGRAROW_AVX2
+ANY31(I422ToBGRARow_Any_AVX2, I422ToBGRARow_AVX2, 1, 0, 4, 15)
+#endif
#ifdef HAS_I422TORGBAROW_AVX2
-ANY31C(I422ToRGBARow_Any_AVX2, I422ToRGBARow_AVX2, 1, 0, 4, 15)
+ANY31(I422ToRGBARow_Any_AVX2, I422ToRGBARow_AVX2, 1, 0, 4, 15)
#endif
+#ifdef HAS_I422TOABGRROW_AVX2
+ANY31(I422ToABGRRow_Any_AVX2, I422ToABGRRow_AVX2, 1, 0, 4, 15)
+#endif
#ifdef HAS_I444TOARGBROW_AVX2
-ANY31C(I444ToARGBRow_Any_AVX2, I444ToARGBRow_AVX2, 0, 0, 4, 15)
+ANY31(I444ToARGBRow_Any_AVX2, I444ToARGBRow_AVX2, 0, 0, 4, 15)
#endif
#ifdef HAS_I411TOARGBROW_AVX2
-ANY31C(I411ToARGBRow_Any_AVX2, I411ToARGBRow_AVX2, 2, 0, 4, 15)
+ANY31(I411ToARGBRow_Any_AVX2, I411ToARGBRow_AVX2, 2, 0, 4, 15)
#endif
#ifdef HAS_I422TOARGB4444ROW_AVX2
-ANY31C(I422ToARGB4444Row_Any_AVX2, I422ToARGB4444Row_AVX2, 1, 0, 2, 7)
+ANY31(I422ToARGB4444Row_Any_AVX2, I422ToARGB4444Row_AVX2, 1, 0, 2, 7)
#endif
#ifdef HAS_I422TOARGB1555ROW_AVX2
-ANY31C(I422ToARGB1555Row_Any_AVX2, I422ToARGB1555Row_AVX2, 1, 0, 2, 7)
+ANY31(I422ToARGB1555Row_Any_AVX2, I422ToARGB1555Row_AVX2, 1, 0, 2, 7)
#endif
#ifdef HAS_I422TORGB565ROW_AVX2
-ANY31C(I422ToRGB565Row_Any_AVX2, I422ToRGB565Row_AVX2, 1, 0, 2, 7)
+ANY31(I422ToRGB565Row_Any_AVX2, I422ToRGB565Row_AVX2, 1, 0, 2, 7)
#endif
#ifdef HAS_I422TOARGBROW_NEON
-ANY31C(I444ToARGBRow_Any_NEON, I444ToARGBRow_NEON, 0, 0, 4, 7)
-ANY31C(I422ToARGBRow_Any_NEON, I422ToARGBRow_NEON, 1, 0, 4, 7)
-ANY31C(I411ToARGBRow_Any_NEON, I411ToARGBRow_NEON, 2, 0, 4, 7)
-ANY31C(I422ToRGBARow_Any_NEON, I422ToRGBARow_NEON, 1, 0, 4, 7)
-ANY31C(I422ToRGB24Row_Any_NEON, I422ToRGB24Row_NEON, 1, 0, 3, 7)
-ANY31C(I422ToARGB4444Row_Any_NEON, I422ToARGB4444Row_NEON, 1, 0, 2, 7)
-ANY31C(I422ToARGB1555Row_Any_NEON, I422ToARGB1555Row_NEON, 1, 0, 2, 7)
-ANY31C(I422ToRGB565Row_Any_NEON, I422ToRGB565Row_NEON, 1, 0, 2, 7)
+ANY31(I444ToARGBRow_Any_NEON, I444ToARGBRow_NEON, 0, 0, 4, 7)
+ANY31(I422ToARGBRow_Any_NEON, I422ToARGBRow_NEON, 1, 0, 4, 7)
+ANY31(I411ToARGBRow_Any_NEON, I411ToARGBRow_NEON, 2, 0, 4, 7)
+ANY31(I422ToBGRARow_Any_NEON, I422ToBGRARow_NEON, 1, 0, 4, 7)
+ANY31(I422ToABGRRow_Any_NEON, I422ToABGRRow_NEON, 1, 0, 4, 7)
+ANY31(I422ToRGBARow_Any_NEON, I422ToRGBARow_NEON, 1, 0, 4, 7)
+ANY31(I422ToRGB24Row_Any_NEON, I422ToRGB24Row_NEON, 1, 0, 3, 7)
+ANY31(I422ToRAWRow_Any_NEON, I422ToRAWRow_NEON, 1, 0, 3, 7)
+ANY31(I422ToARGB4444Row_Any_NEON, I422ToARGB4444Row_NEON, 1, 0, 2, 7)
+ANY31(I422ToARGB1555Row_Any_NEON, I422ToARGB1555Row_NEON, 1, 0, 2, 7)
+ANY31(I422ToRGB565Row_Any_NEON, I422ToRGB565Row_NEON, 1, 0, 2, 7)
#endif
-#undef ANY31C
+#ifdef HAS_I422TOYUY2ROW_NEON
+ANY31(I422ToYUY2Row_Any_NEON, I422ToYUY2Row_NEON, 1, 1, 4, 15)
+#endif
+#ifdef HAS_I422TOUYVYROW_NEON
+ANY31(I422ToUYVYRow_Any_NEON, I422ToUYVYRow_NEON, 1, 1, 4, 15)
+#endif
+#undef ANY31
// Any 2 planes to 1.
#define ANY21(NAMEANY, ANY_SIMD, UVSHIFT, SBPP, SBPP2, BPP, MASK) \
@@ -186,6 +136,32 @@
memcpy(dst_ptr + n * BPP, temp + 128, r * BPP); \
}
+// Biplanar to RGB.
+#ifdef HAS_NV12TOARGBROW_SSSE3
+ANY21(NV12ToARGBRow_Any_SSSE3, NV12ToARGBRow_SSSE3, 1, 1, 2, 4, 7)
+ANY21(NV21ToARGBRow_Any_SSSE3, NV21ToARGBRow_SSSE3, 1, 1, 2, 4, 7)
+#endif
+#ifdef HAS_NV12TOARGBROW_AVX2
+ANY21(NV12ToARGBRow_Any_AVX2, NV12ToARGBRow_AVX2, 1, 1, 2, 4, 15)
+ANY21(NV21ToARGBRow_Any_AVX2, NV21ToARGBRow_AVX2, 1, 1, 2, 4, 15)
+#endif
+#ifdef HAS_NV12TOARGBROW_NEON
+ANY21(NV12ToARGBRow_Any_NEON, NV12ToARGBRow_NEON, 1, 1, 2, 4, 7)
+ANY21(NV21ToARGBRow_Any_NEON, NV21ToARGBRow_NEON, 1, 1, 2, 4, 7)
+#endif
+#ifdef HAS_NV12TORGB565ROW_SSSE3
+ANY21(NV12ToRGB565Row_Any_SSSE3, NV12ToRGB565Row_SSSE3, 1, 1, 2, 2, 7)
+ANY21(NV21ToRGB565Row_Any_SSSE3, NV21ToRGB565Row_SSSE3, 1, 1, 2, 2, 7)
+#endif
+#ifdef HAS_NV12TORGB565ROW_AVX2
+ANY21(NV12ToRGB565Row_Any_AVX2, NV12ToRGB565Row_AVX2, 1, 1, 2, 2, 15)
+ANY21(NV21ToRGB565Row_Any_AVX2, NV21ToRGB565Row_AVX2, 1, 1, 2, 2, 15)
+#endif
+#ifdef HAS_NV12TORGB565ROW_NEON
+ANY21(NV12ToRGB565Row_Any_NEON, NV12ToRGB565Row_NEON, 1, 1, 2, 2, 7)
+ANY21(NV21ToRGB565Row_Any_NEON, NV21ToRGB565Row_NEON, 1, 1, 2, 2, 7)
+#endif
+
// Merge functions.
#ifdef HAS_MERGEUVROW_SSE2
ANY21(MergeUVRow_Any_SSE2, MergeUVRow_SSE2, 0, 1, 1, 2, 15)
@@ -245,55 +221,6 @@
#endif
#undef ANY21
-// Any 2 planes to 1 with yuvconstants
-#define ANY21C(NAMEANY, ANY_SIMD, UVSHIFT, SBPP, SBPP2, BPP, MASK) \
- void NAMEANY(const uint8* y_buf, const uint8* uv_buf, \
- uint8* dst_ptr, const struct YuvConstants* yuvconstants, \
- int width) { \
- SIMD_ALIGNED(uint8 temp[64 * 3]); \
- memset(temp, 0, 64 * 2); /* for msan */ \
- int r = width & MASK; \
- int n = width & ~MASK; \
- if (n > 0) { \
- ANY_SIMD(y_buf, uv_buf, dst_ptr, yuvconstants, n); \
- } \
- memcpy(temp, y_buf + n * SBPP, r * SBPP); \
- memcpy(temp + 64, uv_buf + (n >> UVSHIFT) * SBPP2, \
- SS(r, UVSHIFT) * SBPP2); \
- ANY_SIMD(temp, temp + 64, temp + 128, yuvconstants, MASK + 1); \
- memcpy(dst_ptr + n * BPP, temp + 128, r * BPP); \
- }
-
-// Biplanar to RGB.
-#ifdef HAS_NV12TOARGBROW_SSSE3
-ANY21C(NV12ToARGBRow_Any_SSSE3, NV12ToARGBRow_SSSE3, 1, 1, 2, 4, 7)
-#endif
-#ifdef HAS_NV12TOARGBROW_AVX2
-ANY21C(NV12ToARGBRow_Any_AVX2, NV12ToARGBRow_AVX2, 1, 1, 2, 4, 15)
-#endif
-#ifdef HAS_NV12TOARGBROW_NEON
-ANY21C(NV12ToARGBRow_Any_NEON, NV12ToARGBRow_NEON, 1, 1, 2, 4, 7)
-#endif
-#ifdef HAS_NV21TOARGBROW_SSSE3
-ANY21C(NV21ToARGBRow_Any_SSSE3, NV21ToARGBRow_SSSE3, 1, 1, 2, 4, 7)
-#endif
-#ifdef HAS_NV21TOARGBROW_AVX2
-ANY21C(NV21ToARGBRow_Any_AVX2, NV21ToARGBRow_AVX2, 1, 1, 2, 4, 15)
-#endif
-#ifdef HAS_NV21TOARGBROW_NEON
-ANY21C(NV21ToARGBRow_Any_NEON, NV21ToARGBRow_NEON, 1, 1, 2, 4, 7)
-#endif
-#ifdef HAS_NV12TORGB565ROW_SSSE3
-ANY21C(NV12ToRGB565Row_Any_SSSE3, NV12ToRGB565Row_SSSE3, 1, 1, 2, 2, 7)
-#endif
-#ifdef HAS_NV12TORGB565ROW_AVX2
-ANY21C(NV12ToRGB565Row_Any_AVX2, NV12ToRGB565Row_AVX2, 1, 1, 2, 2, 15)
-#endif
-#ifdef HAS_NV12TORGB565ROW_NEON
-ANY21C(NV12ToRGB565Row_Any_NEON, NV12ToRGB565Row_NEON, 1, 1, 2, 2, 7)
-#endif
-#undef ANY21C
-
// Any 1 to 1.
#define ANY11(NAMEANY, ANY_SIMD, UVSHIFT, SBPP, BPP, MASK) \
void NAMEANY(const uint8* src_ptr, uint8* dst_ptr, int width) { \
@@ -325,10 +252,8 @@
ANY11(ARGBToARGB1555Row_Any_SSE2, ARGBToARGB1555Row_SSE2, 0, 4, 2, 3)
ANY11(ARGBToARGB4444Row_Any_SSE2, ARGBToARGB4444Row_SSE2, 0, 4, 2, 3)
#endif
-#if defined(HAS_ARGBTORGB565ROW_AVX2)
-ANY11(ARGBToRGB565Row_Any_AVX2, ARGBToRGB565Row_AVX2, 0, 4, 2, 7)
-#endif
#if defined(HAS_ARGBTOARGB4444ROW_AVX2)
+ANY11(ARGBToRGB565Row_Any_AVX2, ARGBToRGB565Row_AVX2, 0, 4, 2, 7)
ANY11(ARGBToARGB1555Row_Any_AVX2, ARGBToARGB1555Row_AVX2, 0, 4, 2, 7)
ANY11(ARGBToARGB4444Row_Any_AVX2, ARGBToARGB4444Row_AVX2, 0, 4, 2, 7)
#endif
@@ -344,7 +269,9 @@
#if defined(HAS_I400TOARGBROW_AVX2)
ANY11(I400ToARGBRow_Any_AVX2, I400ToARGBRow_AVX2, 0, 1, 4, 15)
#endif
-#if defined(HAS_RGB24TOARGBROW_SSSE3)
+#if defined(HAS_YUY2TOARGBROW_SSSE3)
+ANY11(YUY2ToARGBRow_Any_SSSE3, YUY2ToARGBRow_SSSE3, 1, 4, 4, 15)
+ANY11(UYVYToARGBRow_Any_SSSE3, UYVYToARGBRow_SSSE3, 1, 4, 4, 15)
ANY11(RGB24ToARGBRow_Any_SSSE3, RGB24ToARGBRow_SSSE3, 0, 3, 4, 15)
ANY11(RAWToARGBRow_Any_SSSE3, RAWToARGBRow_SSSE3, 0, 3, 4, 15)
ANY11(RGB565ToARGBRow_Any_SSE2, RGB565ToARGBRow_SSE2, 0, 2, 4, 7)
@@ -351,9 +278,6 @@
ANY11(ARGB1555ToARGBRow_Any_SSE2, ARGB1555ToARGBRow_SSE2, 0, 2, 4, 7)
ANY11(ARGB4444ToARGBRow_Any_SSE2, ARGB4444ToARGBRow_SSE2, 0, 2, 4, 7)
#endif
-#if defined(HAS_RAWTORGB24ROW_SSSE3)
-ANY11(RAWToRGB24Row_Any_SSSE3, RAWToRGB24Row_SSSE3, 0, 3, 3, 7)
-#endif
#if defined(HAS_RGB565TOARGBROW_AVX2)
ANY11(RGB565ToARGBRow_Any_AVX2, RGB565ToARGBRow_AVX2, 0, 2, 4, 15)
#endif
@@ -363,6 +287,10 @@
#if defined(HAS_ARGB4444TOARGBROW_AVX2)
ANY11(ARGB4444ToARGBRow_Any_AVX2, ARGB4444ToARGBRow_AVX2, 0, 2, 4, 15)
#endif
+#if defined(HAS_YUY2TOARGBROW_AVX2)
+ANY11(YUY2ToARGBRow_Any_AVX2, YUY2ToARGBRow_AVX2, 1, 4, 4, 31)
+ANY11(UYVYToARGBRow_Any_AVX2, UYVYToARGBRow_AVX2, 1, 4, 4, 31)
+#endif
#if defined(HAS_ARGBTORGB24ROW_NEON)
ANY11(ARGBToRGB24Row_Any_NEON, ARGBToRGB24Row_NEON, 0, 4, 3, 7)
ANY11(ARGBToRAWRow_Any_NEON, ARGBToRAWRow_NEON, 0, 4, 3, 7)
@@ -371,10 +299,9 @@
ANY11(ARGBToARGB4444Row_Any_NEON, ARGBToARGB4444Row_NEON, 0, 4, 2, 7)
ANY11(J400ToARGBRow_Any_NEON, J400ToARGBRow_NEON, 0, 1, 4, 7)
ANY11(I400ToARGBRow_Any_NEON, I400ToARGBRow_NEON, 0, 1, 4, 7)
+ANY11(YUY2ToARGBRow_Any_NEON, YUY2ToARGBRow_NEON, 1, 4, 4, 7)
+ANY11(UYVYToARGBRow_Any_NEON, UYVYToARGBRow_NEON, 1, 4, 4, 7)
#endif
-#if defined(HAS_RAWTORGB24ROW_NEON)
-ANY11(RAWToRGB24Row_Any_NEON, RAWToRGB24Row_NEON, 0, 3, 3, 7)
-#endif
#ifdef HAS_ARGBTOYROW_AVX2
ANY11(ARGBToYRow_Any_AVX2, ARGBToYRow_AVX2, 0, 4, 1, 31)
#endif
@@ -454,6 +381,9 @@
#ifdef HAS_ARGBATTENUATEROW_SSSE3
ANY11(ARGBAttenuateRow_Any_SSSE3, ARGBAttenuateRow_SSSE3, 0, 4, 4, 3)
#endif
+#ifdef HAS_ARGBATTENUATEROW_SSE2
+ANY11(ARGBAttenuateRow_Any_SSE2, ARGBAttenuateRow_SSE2, 0, 4, 4, 3)
+#endif
#ifdef HAS_ARGBUNATTENUATEROW_SSE2
ANY11(ARGBUnattenuateRow_Any_SSE2, ARGBUnattenuateRow_SSE2, 0, 4, 4, 3)
#endif
@@ -466,44 +396,8 @@
#ifdef HAS_ARGBATTENUATEROW_NEON
ANY11(ARGBAttenuateRow_Any_NEON, ARGBAttenuateRow_NEON, 0, 4, 4, 7)
#endif
-#ifdef HAS_ARGBEXTRACTALPHAROW_SSE2
-ANY11(ARGBExtractAlphaRow_Any_SSE2, ARGBExtractAlphaRow_SSE2, 0, 4, 1, 7)
-#endif
-#ifdef HAS_ARGBEXTRACTALPHAROW_NEON
-ANY11(ARGBExtractAlphaRow_Any_NEON, ARGBExtractAlphaRow_NEON, 0, 4, 1, 15)
-#endif
#undef ANY11
-// Any 1 to 1 blended. Destination is read, modify, write.
-#define ANY11B(NAMEANY, ANY_SIMD, UVSHIFT, SBPP, BPP, MASK) \
- void NAMEANY(const uint8* src_ptr, uint8* dst_ptr, int width) { \
- SIMD_ALIGNED(uint8 temp[128 * 2]); \
- memset(temp, 0, 128 * 2); /* for YUY2 and msan */ \
- int r = width & MASK; \
- int n = width & ~MASK; \
- if (n > 0) { \
- ANY_SIMD(src_ptr, dst_ptr, n); \
- } \
- memcpy(temp, src_ptr + (n >> UVSHIFT) * SBPP, SS(r, UVSHIFT) * SBPP); \
- memcpy(temp + 128, dst_ptr + n * BPP, r * BPP); \
- ANY_SIMD(temp, temp + 128, MASK + 1); \
- memcpy(dst_ptr + n * BPP, temp + 128, r * BPP); \
- }
-
-#ifdef HAS_ARGBCOPYALPHAROW_AVX2
-ANY11B(ARGBCopyAlphaRow_Any_AVX2, ARGBCopyAlphaRow_AVX2, 0, 4, 4, 15)
-#endif
-#ifdef HAS_ARGBCOPYALPHAROW_SSE2
-ANY11B(ARGBCopyAlphaRow_Any_SSE2, ARGBCopyAlphaRow_SSE2, 0, 4, 4, 7)
-#endif
-#ifdef HAS_ARGBCOPYYTOALPHAROW_AVX2
-ANY11B(ARGBCopyYToAlphaRow_Any_AVX2, ARGBCopyYToAlphaRow_AVX2, 0, 1, 4, 15)
-#endif
-#ifdef HAS_ARGBCOPYYTOALPHAROW_SSE2
-ANY11B(ARGBCopyYToAlphaRow_Any_SSE2, ARGBCopyYToAlphaRow_SSE2, 0, 1, 4, 7)
-#endif
-#undef ANY11B
-
// Any 1 to 1 with parameter.
#define ANY11P(NAMEANY, ANY_SIMD, T, SBPP, BPP, MASK) \
void NAMEANY(const uint8* src_ptr, uint8* dst_ptr, \
@@ -546,35 +440,6 @@
#endif
#undef ANY11P
-// Any 1 to 1 with yuvconstants
-#define ANY11C(NAMEANY, ANY_SIMD, UVSHIFT, SBPP, BPP, MASK) \
- void NAMEANY(const uint8* src_ptr, uint8* dst_ptr, \
- const struct YuvConstants* yuvconstants, int width) { \
- SIMD_ALIGNED(uint8 temp[128 * 2]); \
- memset(temp, 0, 128); /* for YUY2 and msan */ \
- int r = width & MASK; \
- int n = width & ~MASK; \
- if (n > 0) { \
- ANY_SIMD(src_ptr, dst_ptr, yuvconstants, n); \
- } \
- memcpy(temp, src_ptr + (n >> UVSHIFT) * SBPP, SS(r, UVSHIFT) * SBPP); \
- ANY_SIMD(temp, temp + 128, yuvconstants, MASK + 1); \
- memcpy(dst_ptr + n * BPP, temp + 128, r * BPP); \
- }
-#if defined(HAS_YUY2TOARGBROW_SSSE3)
-ANY11C(YUY2ToARGBRow_Any_SSSE3, YUY2ToARGBRow_SSSE3, 1, 4, 4, 15)
-ANY11C(UYVYToARGBRow_Any_SSSE3, UYVYToARGBRow_SSSE3, 1, 4, 4, 15)
-#endif
-#if defined(HAS_YUY2TOARGBROW_AVX2)
-ANY11C(YUY2ToARGBRow_Any_AVX2, YUY2ToARGBRow_AVX2, 1, 4, 4, 31)
-ANY11C(UYVYToARGBRow_Any_AVX2, UYVYToARGBRow_AVX2, 1, 4, 4, 31)
-#endif
-#if defined(HAS_YUY2TOARGBROW_NEON)
-ANY11C(YUY2ToARGBRow_Any_NEON, YUY2ToARGBRow_NEON, 1, 4, 4, 7)
-ANY11C(UYVYToARGBRow_Any_NEON, UYVYToARGBRow_NEON, 1, 4, 4, 7)
-#endif
-#undef ANY11C
-
// Any 1 to 1 interpolate. Takes 2 rows of source via stride.
#define ANY11T(NAMEANY, ANY_SIMD, SBPP, BPP, MASK) \
void NAMEANY(uint8* dst_ptr, const uint8* src_ptr, \
@@ -599,11 +464,14 @@
#ifdef HAS_INTERPOLATEROW_SSSE3
ANY11T(InterpolateRow_Any_SSSE3, InterpolateRow_SSSE3, 1, 1, 15)
#endif
+#ifdef HAS_INTERPOLATEROW_SSE2
+ANY11T(InterpolateRow_Any_SSE2, InterpolateRow_SSE2, 1, 1, 15)
+#endif
#ifdef HAS_INTERPOLATEROW_NEON
ANY11T(InterpolateRow_Any_NEON, InterpolateRow_NEON, 1, 1, 15)
#endif
-#ifdef HAS_INTERPOLATEROW_DSPR2
-ANY11T(InterpolateRow_Any_DSPR2, InterpolateRow_DSPR2, 1, 1, 3)
+#ifdef HAS_INTERPOLATEROW_MIPS_DSPR2
+ANY11T(InterpolateRow_Any_MIPS_DSPR2, InterpolateRow_MIPS_DSPR2, 1, 1, 3)
#endif
#undef ANY11T
@@ -628,6 +496,9 @@
#ifdef HAS_MIRRORROW_SSSE3
ANY11M(MirrorRow_Any_SSSE3, MirrorRow_SSSE3, 1, 15)
#endif
+#ifdef HAS_MIRRORROW_SSE2
+ANY11M(MirrorRow_Any_SSE2, MirrorRow_SSE2, 1, 15)
+#endif
#ifdef HAS_MIRRORROW_NEON
ANY11M(MirrorRow_Any_NEON, MirrorRow_NEON, 1, 15)
#endif
@@ -677,26 +548,10 @@
ANY_SIMD(src_ptr, dst_u, dst_v, n); \
} \
memcpy(temp, src_ptr + (n >> UVSHIFT) * BPP, SS(r, UVSHIFT) * BPP); \
- /* repeat last 4 bytes for 422 subsampler */ \
- if ((width & 1) && BPP == 4 && DUVSHIFT == 1) { \
+ if ((width & 1) && BPP == 4) { /* repeat last 4 bytes for subsampler */ \
memcpy(temp + SS(r, UVSHIFT) * BPP, \
- temp + SS(r, UVSHIFT) * BPP - BPP, BPP); \
+ temp + SS(r, UVSHIFT) * BPP - BPP, 4); \
} \
- /* repeat last 4 - 12 bytes for 411 subsampler */ \
- if (((width & 3) == 1) && BPP == 4 && DUVSHIFT == 2) { \
- memcpy(temp + SS(r, UVSHIFT) * BPP, \
- temp + SS(r, UVSHIFT) * BPP - BPP, BPP); \
- memcpy(temp + SS(r, UVSHIFT) * BPP + BPP, \
- temp + SS(r, UVSHIFT) * BPP - BPP, BPP * 2); \
- } \
- if (((width & 3) == 2) && BPP == 4 && DUVSHIFT == 2) { \
- memcpy(temp + SS(r, UVSHIFT) * BPP, \
- temp + SS(r, UVSHIFT) * BPP - BPP * 2, BPP * 2); \
- } \
- if (((width & 3) == 3) && BPP == 4 && DUVSHIFT == 2) { \
- memcpy(temp + SS(r, UVSHIFT) * BPP, \
- temp + SS(r, UVSHIFT) * BPP - BPP, BPP); \
- } \
ANY_SIMD(temp, temp + 128, temp + 256, MASK + 1); \
memcpy(dst_u + (n >> DUVSHIFT), temp + 128, SS(r, DUVSHIFT)); \
memcpy(dst_v + (n >> DUVSHIFT), temp + 256, SS(r, DUVSHIFT)); \
@@ -711,8 +566,8 @@
#ifdef HAS_SPLITUVROW_NEON
ANY12(SplitUVRow_Any_NEON, SplitUVRow_NEON, 0, 2, 0, 15)
#endif
-#ifdef HAS_SPLITUVROW_DSPR2
-ANY12(SplitUVRow_Any_DSPR2, SplitUVRow_DSPR2, 0, 2, 0, 15)
+#ifdef HAS_SPLITUVROW_MIPS_DSPR2
+ANY12(SplitUVRow_Any_MIPS_DSPR2, SplitUVRow_MIPS_DSPR2, 0, 2, 0, 15)
#endif
#ifdef HAS_ARGBTOUV444ROW_SSSE3
ANY12(ARGBToUV444Row_Any_SSSE3, ARGBToUV444Row_SSSE3, 0, 4, 0, 15)
@@ -721,6 +576,9 @@
ANY12(YUY2ToUV422Row_Any_AVX2, YUY2ToUV422Row_AVX2, 1, 4, 1, 31)
ANY12(UYVYToUV422Row_Any_AVX2, UYVYToUV422Row_AVX2, 1, 4, 1, 31)
#endif
+#ifdef HAS_ARGBTOUV422ROW_SSSE3
+ANY12(ARGBToUV422Row_Any_SSSE3, ARGBToUV422Row_SSSE3, 0, 4, 1, 15)
+#endif
#ifdef HAS_YUY2TOUV422ROW_SSE2
ANY12(YUY2ToUV422Row_Any_SSE2, YUY2ToUV422Row_SSE2, 1, 4, 1, 15)
ANY12(UYVYToUV422Row_Any_SSE2, UYVYToUV422Row_SSE2, 1, 4, 1, 15)
@@ -727,6 +585,7 @@
#endif
#ifdef HAS_YUY2TOUV422ROW_NEON
ANY12(ARGBToUV444Row_Any_NEON, ARGBToUV444Row_NEON, 0, 4, 0, 7)
+ANY12(ARGBToUV422Row_Any_NEON, ARGBToUV422Row_NEON, 0, 4, 1, 15)
ANY12(ARGBToUV411Row_Any_NEON, ARGBToUV411Row_NEON, 0, 4, 2, 31)
ANY12(YUY2ToUV422Row_Any_NEON, YUY2ToUV422Row_NEON, 1, 4, 1, 15)
ANY12(UYVYToUV422Row_Any_NEON, UYVYToUV422Row_NEON, 1, 4, 1, 15)
@@ -748,11 +607,11 @@
memcpy(temp, src_ptr + (n >> UVSHIFT) * BPP, SS(r, UVSHIFT) * BPP); \
memcpy(temp + 128, src_ptr + src_stride_ptr + (n >> UVSHIFT) * BPP, \
SS(r, UVSHIFT) * BPP); \
- if ((width & 1) && UVSHIFT == 0) { /* repeat last pixel for subsample */\
+ if ((width & 1) && BPP == 4) { /* repeat last 4 bytes for subsampler */ \
memcpy(temp + SS(r, UVSHIFT) * BPP, \
- temp + SS(r, UVSHIFT) * BPP - BPP, BPP); \
+ temp + SS(r, UVSHIFT) * BPP - BPP, 4); \
memcpy(temp + 128 + SS(r, UVSHIFT) * BPP, \
- temp + 128 + SS(r, UVSHIFT) * BPP - BPP, BPP); \
+ temp + 128 + SS(r, UVSHIFT) * BPP - BPP, 4); \
} \
ANY_SIMD(temp, 128, temp + 256, temp + 384, MASK + 1); \
memcpy(dst_u + (n >> 1), temp + 256, SS(r, 1)); \
@@ -761,9 +620,6 @@
#ifdef HAS_ARGBTOUVROW_AVX2
ANY12S(ARGBToUVRow_Any_AVX2, ARGBToUVRow_AVX2, 0, 4, 31)
-#endif
-#ifdef HAS_ARGBTOUVJROW_AVX2
-ANY12S(ARGBToUVJRow_Any_AVX2, ARGBToUVJRow_AVX2, 0, 4, 31)
#endif
#ifdef HAS_ARGBTOUVROW_SSSE3
ANY12S(ARGBToUVRow_Any_SSSE3, ARGBToUVRow_SSSE3, 0, 4, 15)
--- a/third_party/libyuv/source/row_common.cc
+++ b/third_party/libyuv/source/row_common.cc
@@ -100,20 +100,6 @@
}
}
-void RAWToRGB24Row_C(const uint8* src_raw, uint8* dst_rgb24, int width) {
- int x;
- for (x = 0; x < width; ++x) {
- uint8 r = src_raw[0];
- uint8 g = src_raw[1];
- uint8 b = src_raw[2];
- dst_rgb24[0] = b;
- dst_rgb24[1] = g;
- dst_rgb24[2] = r;
- dst_rgb24 += 3;
- src_raw += 3;
- }
-}
-
void RGB565ToARGBRow_C(const uint8* src_rgb565, uint8* dst_argb, int width) {
int x;
for (x = 0; x < width; ++x) {
@@ -433,6 +419,28 @@
MAKEROWYJ(ARGB, 2, 1, 0, 4)
#undef MAKEROWYJ
+void ARGBToUVJ422Row_C(const uint8* src_argb,
+ uint8* dst_u, uint8* dst_v, int width) {
+ int x;
+ for (x = 0; x < width - 1; x += 2) {
+ uint8 ab = (src_argb[0] + src_argb[4]) >> 1;
+ uint8 ag = (src_argb[1] + src_argb[5]) >> 1;
+ uint8 ar = (src_argb[2] + src_argb[6]) >> 1;
+ dst_u[0] = RGBToUJ(ar, ag, ab);
+ dst_v[0] = RGBToVJ(ar, ag, ab);
+ src_argb += 8;
+ dst_u += 1;
+ dst_v += 1;
+ }
+ if (width & 1) {
+ uint8 ab = src_argb[0];
+ uint8 ag = src_argb[1];
+ uint8 ar = src_argb[2];
+ dst_u[0] = RGBToUJ(ar, ag, ab);
+ dst_v[0] = RGBToVJ(ar, ag, ab);
+ }
+}
+
void RGB565ToYRow_C(const uint8* src_rgb565, uint8* dst_y, int width) {
int x;
for (x = 0; x < width; ++x) {
@@ -636,6 +644,28 @@
}
}
+void ARGBToUV422Row_C(const uint8* src_argb,
+ uint8* dst_u, uint8* dst_v, int width) {
+ int x;
+ for (x = 0; x < width - 1; x += 2) {
+ uint8 ab = (src_argb[0] + src_argb[4]) >> 1;
+ uint8 ag = (src_argb[1] + src_argb[5]) >> 1;
+ uint8 ar = (src_argb[2] + src_argb[6]) >> 1;
+ dst_u[0] = RGBToU(ar, ag, ab);
+ dst_v[0] = RGBToV(ar, ag, ab);
+ src_argb += 8;
+ dst_u += 1;
+ dst_v += 1;
+ }
+ if (width & 1) {
+ uint8 ab = src_argb[0];
+ uint8 ag = src_argb[1];
+ uint8 ar = src_argb[2];
+ dst_u[0] = RGBToU(ar, ag, ab);
+ dst_v[0] = RGBToV(ar, ag, ab);
+ }
+}
+
void ARGBToUV411Row_C(const uint8* src_argb,
uint8* dst_u, uint8* dst_v, int width) {
int x;
@@ -649,11 +679,10 @@
dst_u += 1;
dst_v += 1;
}
- // Odd width handling mimics 'any' function which replicates last pixel.
if ((width & 3) == 3) {
- uint8 ab = (src_argb[0] + src_argb[4] + src_argb[8] + src_argb[8]) >> 2;
- uint8 ag = (src_argb[1] + src_argb[5] + src_argb[9] + src_argb[9]) >> 2;
- uint8 ar = (src_argb[2] + src_argb[6] + src_argb[10] + src_argb[10]) >> 2;
+ uint8 ab = (src_argb[0] + src_argb[4] + src_argb[8]) / 3;
+ uint8 ag = (src_argb[1] + src_argb[5] + src_argb[9]) / 3;
+ uint8 ar = (src_argb[2] + src_argb[6] + src_argb[10]) / 3;
dst_u[0] = RGBToU(ar, ag, ab);
dst_v[0] = RGBToV(ar, ag, ab);
} else if ((width & 3) == 2) {
@@ -965,9 +994,6 @@
}
}
-// TODO(fbarchard): Unify these structures to be platform independent.
-// TODO(fbarchard): Generate SIMD structures from float matrix.
-
// BT.601 YUV to RGB reference
// R = (Y - 16) * 1.164 - V * -1.596
// G = (Y - 16) * 1.164 - U * 0.391 - V * 0.813
@@ -974,6 +1000,7 @@
// B = (Y - 16) * 1.164 - U * -2.018
// Y contribution to R,G,B. Scale and bias.
+// TODO(fbarchard): Consider moving constants into a common header.
#define YG 18997 /* round(1.164 * 64 * 256 * 256 / 257) */
#define YGB -1160 /* 1.164 * 64 * -16 + 64 / 2 */
@@ -984,76 +1011,36 @@
#define VR -102 /* round(-1.596 * 64) */
// Bias values to subtract 16 from Y and 128 from U and V.
-#define BB (UB * 128 + YGB)
+#define BB (UB * 128 + YGB)
#define BG (UG * 128 + VG * 128 + YGB)
-#define BR (VR * 128 + YGB)
+#define BR (VR * 128 + YGB)
-#if defined(__aarch64__)
-const YuvConstants SIMD_ALIGNED(kYuvI601Constants) = {
- { -UB, -VR, -UB, -VR, -UB, -VR, -UB, -VR },
- { -UB, -VR, -UB, -VR, -UB, -VR, -UB, -VR },
- { UG, VG, UG, VG, UG, VG, UG, VG },
- { UG, VG, UG, VG, UG, VG, UG, VG },
- { BB, BG, BR, 0, 0, 0, 0, 0 },
- { 0x0101 * YG, 0, 0, 0 }
-};
-const YuvConstants SIMD_ALIGNED(kYvuI601Constants) = {
- { -VR, -UB, -VR, -UB, -VR, -UB, -VR, -UB },
- { -VR, -UB, -VR, -UB, -VR, -UB, -VR, -UB },
- { VG, UG, VG, UG, VG, UG, VG, UG },
- { VG, UG, VG, UG, VG, UG, VG, UG },
- { BR, BG, BB, 0, 0, 0, 0, 0 },
- { 0x0101 * YG, 0, 0, 0 }
-};
-#elif defined(__arm__)
-const YuvConstants SIMD_ALIGNED(kYuvI601Constants) = {
- { -UB, -UB, -UB, -UB, -VR, -VR, -VR, -VR, 0, 0, 0, 0, 0, 0, 0, 0 },
- { UG, UG, UG, UG, VG, VG, VG, VG, 0, 0, 0, 0, 0, 0, 0, 0 },
- { BB, BG, BR, 0, 0, 0, 0, 0 },
- { 0x0101 * YG, 0, 0, 0 }
-};
-const YuvConstants SIMD_ALIGNED(kYvuI601Constants) = {
- { -VR, -VR, -VR, -VR, -UB, -UB, -UB, -UB, 0, 0, 0, 0, 0, 0, 0, 0 },
- { VG, VG, VG, VG, UG, UG, UG, UG, 0, 0, 0, 0, 0, 0, 0, 0 },
- { BR, BG, BB, 0, 0, 0, 0, 0 },
- { 0x0101 * YG, 0, 0, 0 }
-};
-#else
-const YuvConstants SIMD_ALIGNED(kYuvI601Constants) = {
- { UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0,
- UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0 },
- { UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG,
- UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG },
- { 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR,
- 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR },
- { BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB },
- { BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG },
- { BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR },
- { YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG }
-};
-const YuvConstants SIMD_ALIGNED(kYvuI601Constants) = {
- { VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0,
- VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0 },
- { VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG,
- VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG },
- { 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB,
- 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB },
- { BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR },
- { BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG },
- { BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB },
- { YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG }
-};
-#endif
+// C reference code that mimics the YUV assembly.
+static __inline void YuvPixel(uint8 y, uint8 u, uint8 v,
+ uint8* b, uint8* g, uint8* r) {
+ uint32 y1 = (uint32)(y * 0x0101 * YG) >> 16;
+ *b = Clamp((int32)(-(u * UB) + y1 + BB) >> 6);
+ *g = Clamp((int32)(-(v * VG + u * UG) + y1 + BG) >> 6);
+ *r = Clamp((int32)(-(v * VR)+ y1 + BR) >> 6);
+}
-#undef BB
-#undef BG
-#undef BR
+// C reference code that mimics the YUV assembly.
+static __inline void YPixel(uint8 y, uint8* b, uint8* g, uint8* r) {
+ uint32 y1 = (uint32)(y * 0x0101 * YG) >> 16;
+ *b = Clamp((int32)(y1 + YGB) >> 6);
+ *g = Clamp((int32)(y1 + YGB) >> 6);
+ *r = Clamp((int32)(y1 + YGB) >> 6);
+}
+
+#undef YG
#undef YGB
#undef UB
#undef UG
#undef VG
#undef VR
-#undef YG
+#undef BB
+#undef BG
+#undef BR
// JPEG YUV to RGB reference
// * R = Y - V * -1.40200
@@ -1061,229 +1048,40 @@
// * B = Y - U * -1.77200
// Y contribution to R,G,B. Scale and bias.
-#define YG 16320 /* round(1.000 * 64 * 256 * 256 / 257) */
-#define YGB 32 /* 64 / 2 */
+// TODO(fbarchard): Consider moving constants into a common header.
+#define YGJ 16320 /* round(1.000 * 64 * 256 * 256 / 257) */
+#define YGBJ 32 /* 64 / 2 */
// U and V contributions to R,G,B.
-#define UB -113 /* round(-1.77200 * 64) */
-#define UG 22 /* round(0.34414 * 64) */
-#define VG 46 /* round(0.71414 * 64) */
-#define VR -90 /* round(-1.40200 * 64) */
+#define UBJ -113 /* round(-1.77200 * 64) */
+#define UGJ 22 /* round(0.34414 * 64) */
+#define VGJ 46 /* round(0.71414 * 64) */
+#define VRJ -90 /* round(-1.40200 * 64) */
-// Bias values to round, and subtract 128 from U and V.
-#define BB (UB * 128 + YGB)
-#define BG (UG * 128 + VG * 128 + YGB)
-#define BR (VR * 128 + YGB)
+// Bias values to subtract 16 from Y and 128 from U and V.
+#define BBJ (UBJ * 128 + YGBJ)
+#define BGJ (UGJ * 128 + VGJ * 128 + YGBJ)
+#define BRJ (VRJ * 128 + YGBJ)
-#if defined(__aarch64__)
-const YuvConstants SIMD_ALIGNED(kYuvJPEGConstants) = {
- { -UB, -VR, -UB, -VR, -UB, -VR, -UB, -VR },
- { -UB, -VR, -UB, -VR, -UB, -VR, -UB, -VR },
- { UG, VG, UG, VG, UG, VG, UG, VG },
- { UG, VG, UG, VG, UG, VG, UG, VG },
- { BB, BG, BR, 0, 0, 0, 0, 0 },
- { 0x0101 * YG, 0, 0, 0 }
-};
-const YuvConstants SIMD_ALIGNED(kYvuJPEGConstants) = {
- { -VR, -UB, -VR, -UB, -VR, -UB, -VR, -UB },
- { -VR, -UB, -VR, -UB, -VR, -UB, -VR, -UB },
- { VG, UG, VG, UG, VG, UG, VG, UG },
- { VG, UG, VG, UG, VG, UG, VG, UG },
- { BR, BG, BB, 0, 0, 0, 0, 0 },
- { 0x0101 * YG, 0, 0, 0 }
-};
-#elif defined(__arm__)
-const YuvConstants SIMD_ALIGNED(kYuvJPEGConstants) = {
- { -UB, -UB, -UB, -UB, -VR, -VR, -VR, -VR, 0, 0, 0, 0, 0, 0, 0, 0 },
- { UG, UG, UG, UG, VG, VG, VG, VG, 0, 0, 0, 0, 0, 0, 0, 0 },
- { BB, BG, BR, 0, 0, 0, 0, 0 },
- { 0x0101 * YG, 0, 0, 0 }
-};
-const YuvConstants SIMD_ALIGNED(kYvuJPEGConstants) = {
- { -VR, -VR, -VR, -VR, -UB, -UB, -UB, -UB, 0, 0, 0, 0, 0, 0, 0, 0 },
- { VG, VG, VG, VG, UG, UG, UG, UG, 0, 0, 0, 0, 0, 0, 0, 0 },
- { BR, BG, BB, 0, 0, 0, 0, 0 },
- { 0x0101 * YG, 0, 0, 0 }
-};
-#else
-const YuvConstants SIMD_ALIGNED(kYuvJPEGConstants) = {
- { UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0,
- UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0 },
- { UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG,
- UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG },
- { 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR,
- 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR },
- { BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB },
- { BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG },
- { BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR },
- { YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG }
-};
-const YuvConstants SIMD_ALIGNED(kYvuJPEGConstants) = {
- { VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0,
- VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0 },
- { VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG,
- VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG },
- { 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB,
- 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB },
- { BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR },
- { BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG },
- { BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB },
- { YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG }
-};
-#endif
-
-#undef BB
-#undef BG
-#undef BR
-#undef YGB
-#undef UB
-#undef UG
-#undef VG
-#undef VR
-#undef YG
-
-// BT.709 YUV to RGB reference
-// * R = Y - V * -1.28033
-// * G = Y - U * 0.21482 - V * 0.38059
-// * B = Y - U * -2.12798
-
-// Y contribution to R,G,B. Scale and bias.
-#define YG 16320 /* round(1.000 * 64 * 256 * 256 / 257) */
-#define YGB 32 /* 64 / 2 */
-
-// TODO(fbarchard): Find way to express 2.12 instead of 2.0.
-// U and V contributions to R,G,B.
-#define UB -128 /* max(-128, round(-2.12798 * 64)) */
-#define UG 14 /* round(0.21482 * 64) */
-#define VG 24 /* round(0.38059 * 64) */
-#define VR -82 /* round(-1.28033 * 64) */
-
-// Bias values to round, and subtract 128 from U and V.
-#define BB (UB * 128 + YGB)
-#define BG (UG * 128 + VG * 128 + YGB)
-#define BR (VR * 128 + YGB)
-
-#if defined(__aarch64__)
-const YuvConstants SIMD_ALIGNED(kYuvH709Constants) = {
- { -UB, -VR, -UB, -VR, -UB, -VR, -UB, -VR },
- { -UB, -VR, -UB, -VR, -UB, -VR, -UB, -VR },
- { UG, VG, UG, VG, UG, VG, UG, VG },
- { UG, VG, UG, VG, UG, VG, UG, VG },
- { BB, BG, BR, 0, 0, 0, 0, 0 },
- { 0x0101 * YG, 0, 0, 0 }
-};
-const YuvConstants SIMD_ALIGNED(kYvuH709Constants) = {
- { -VR, -UB, -VR, -UB, -VR, -UB, -VR, -UB },
- { -VR, -UB, -VR, -UB, -VR, -UB, -VR, -UB },
- { VG, UG, VG, UG, VG, UG, VG, UG },
- { VG, UG, VG, UG, VG, UG, VG, UG },
- { BR, BG, BB, 0, 0, 0, 0, 0 },
- { 0x0101 * YG, 0, 0, 0 }
-};
-#elif defined(__arm__)
-const YuvConstants SIMD_ALIGNED(kYuvH709Constants) = {
- { -UB, -UB, -UB, -UB, -VR, -VR, -VR, -VR, 0, 0, 0, 0, 0, 0, 0, 0 },
- { UG, UG, UG, UG, VG, VG, VG, VG, 0, 0, 0, 0, 0, 0, 0, 0 },
- { BB, BG, BR, 0, 0, 0, 0, 0 },
- { 0x0101 * YG, 0, 0, 0 }
-};
-const YuvConstants SIMD_ALIGNED(kYvuH709Constants) = {
- { -VR, -VR, -VR, -VR, -UB, -UB, -UB, -UB, 0, 0, 0, 0, 0, 0, 0, 0 },
- { VG, VG, VG, VG, UG, UG, UG, UG, 0, 0, 0, 0, 0, 0, 0, 0 },
- { BR, BG, BB, 0, 0, 0, 0, 0 },
- { 0x0101 * YG, 0, 0, 0 }
-};
-#else
-const YuvConstants SIMD_ALIGNED(kYuvH709Constants) = {
- { UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0,
- UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0 },
- { UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG,
- UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG },
- { 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR,
- 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR },
- { BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB },
- { BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG },
- { BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR },
- { YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG }
-};
-const YuvConstants SIMD_ALIGNED(kYvuH709Constants) = {
- { VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0,
- VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0 },
- { VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG,
- VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG },
- { 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB,
- 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB },
- { BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR },
- { BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG },
- { BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB },
- { YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG }
-};
-#endif
-
-#undef BB
-#undef BG
-#undef BR
-#undef YGB
-#undef UB
-#undef UG
-#undef VG
-#undef VR
-#undef YG
-
// C reference code that mimics the YUV assembly.
-static __inline void YuvPixel(uint8 y, uint8 u, uint8 v,
- uint8* b, uint8* g, uint8* r,
- const struct YuvConstants* yuvconstants) {
-#if defined(__aarch64__)
- int ub = -yuvconstants->kUVToRB[0];
- int ug = yuvconstants->kUVToG[0];
- int vg = yuvconstants->kUVToG[1];
- int vr = -yuvconstants->kUVToRB[1];
- int bb = yuvconstants->kUVBiasBGR[0];
- int bg = yuvconstants->kUVBiasBGR[1];
- int br = yuvconstants->kUVBiasBGR[2];
- int yg = yuvconstants->kYToRgb[0] / 0x0101;
-#elif defined(__arm__)
- int ub = -yuvconstants->kUVToRB[0];
- int ug = yuvconstants->kUVToG[0];
- int vg = yuvconstants->kUVToG[4];
- int vr = -yuvconstants->kUVToRB[4];
- int bb = yuvconstants->kUVBiasBGR[0];
- int bg = yuvconstants->kUVBiasBGR[1];
- int br = yuvconstants->kUVBiasBGR[2];
- int yg = yuvconstants->kYToRgb[0] / 0x0101;
-#else
- int ub = yuvconstants->kUVToB[0];
- int ug = yuvconstants->kUVToG[0];
- int vg = yuvconstants->kUVToG[1];
- int vr = yuvconstants->kUVToR[1];
- int bb = yuvconstants->kUVBiasB[0];
- int bg = yuvconstants->kUVBiasG[0];
- int br = yuvconstants->kUVBiasR[0];
- int yg = yuvconstants->kYToRgb[0];
-#endif
-
- uint32 y1 = (uint32)(y * 0x0101 * yg) >> 16;
- *b = Clamp((int32)(-(u * ub ) + y1 + bb) >> 6);
- *g = Clamp((int32)(-(u * ug + v * vg) + y1 + bg) >> 6);
- *r = Clamp((int32)(-( v * vr) + y1 + br) >> 6);
+static __inline void YuvJPixel(uint8 y, uint8 u, uint8 v,
+ uint8* b, uint8* g, uint8* r) {
+ uint32 y1 = (uint32)(y * 0x0101 * YGJ) >> 16;
+ *b = Clamp((int32)(-(u * UBJ) + y1 + BBJ) >> 6);
+ *g = Clamp((int32)(-(v * VGJ + u * UGJ) + y1 + BGJ) >> 6);
+ *r = Clamp((int32)(-(v * VRJ) + y1 + BRJ) >> 6);
}
-// Y contribution to R,G,B. Scale and bias.
-#define YG 18997 /* round(1.164 * 64 * 256 * 256 / 257) */
-#define YGB -1160 /* 1.164 * 64 * -16 + 64 / 2 */
+#undef YGJ
+#undef YGBJ
+#undef UBJ
+#undef UGJ
+#undef VGJ
+#undef VRJ
+#undef BBJ
+#undef BGJ
+#undef BRJ
-// C reference code that mimics the YUV assembly.
-static __inline void YPixel(uint8 y, uint8* b, uint8* g, uint8* r) {
- uint32 y1 = (uint32)(y * 0x0101 * YG) >> 16;
- *b = Clamp((int32)(y1 + YGB) >> 6);
- *g = Clamp((int32)(y1 + YGB) >> 6);
- *r = Clamp((int32)(y1 + YGB) >> 6);
-}
-
-#undef YG
-#undef YGB
-
#if !defined(LIBYUV_DISABLE_NEON) && \
(defined(__ARM_NEON__) || defined(__aarch64__) || defined(LIBYUV_NEON))
// C mimic assembly.
@@ -1292,17 +1090,14 @@
const uint8* src_u,
const uint8* src_v,
uint8* rgb_buf,
- const struct YuvConstants* yuvconstants,
int width) {
int x;
for (x = 0; x < width - 1; x += 2) {
uint8 u = (src_u[0] + src_u[1] + 1) >> 1;
uint8 v = (src_v[0] + src_v[1] + 1) >> 1;
- YuvPixel(src_y[0], u, v, rgb_buf + 0, rgb_buf + 1, rgb_buf + 2,
- yuvconstants);
+ YuvPixel(src_y[0], u, v, rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
rgb_buf[3] = 255;
- YuvPixel(src_y[1], u, v, rgb_buf + 4, rgb_buf + 5, rgb_buf + 6,
- yuvconstants);
+ YuvPixel(src_y[1], u, v, rgb_buf + 4, rgb_buf + 5, rgb_buf + 6);
rgb_buf[7] = 255;
src_y += 2;
src_u += 2;
@@ -1311,8 +1106,7 @@
}
if (width & 1) {
YuvPixel(src_y[0], src_u[0], src_v[0],
- rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants);
- rgb_buf[3] = 255;
+ rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
}
}
#else
@@ -1320,12 +1114,11 @@
const uint8* src_u,
const uint8* src_v,
uint8* rgb_buf,
- const struct YuvConstants* yuvconstants,
int width) {
int x;
for (x = 0; x < width; ++x) {
YuvPixel(src_y[0], src_u[0], src_v[0],
- rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants);
+ rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
rgb_buf[3] = 255;
src_y += 1;
src_u += 1;
@@ -1340,15 +1133,14 @@
const uint8* src_u,
const uint8* src_v,
uint8* rgb_buf,
- const struct YuvConstants* yuvconstants,
int width) {
int x;
for (x = 0; x < width - 1; x += 2) {
YuvPixel(src_y[0], src_u[0], src_v[0],
- rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants);
+ rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
rgb_buf[3] = 255;
YuvPixel(src_y[1], src_u[0], src_v[0],
- rgb_buf + 4, rgb_buf + 5, rgb_buf + 6, yuvconstants);
+ rgb_buf + 4, rgb_buf + 5, rgb_buf + 6);
rgb_buf[7] = 255;
src_y += 2;
src_u += 1;
@@ -1357,36 +1149,33 @@
}
if (width & 1) {
YuvPixel(src_y[0], src_u[0], src_v[0],
- rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants);
+ rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
rgb_buf[3] = 255;
}
}
-void I422AlphaToARGBRow_C(const uint8* src_y,
- const uint8* src_u,
- const uint8* src_v,
- const uint8* src_a,
- uint8* rgb_buf,
- const struct YuvConstants* yuvconstants,
- int width) {
+void J422ToARGBRow_C(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* rgb_buf,
+ int width) {
int x;
for (x = 0; x < width - 1; x += 2) {
- YuvPixel(src_y[0], src_u[0], src_v[0],
- rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants);
- rgb_buf[3] = src_a[0];
- YuvPixel(src_y[1], src_u[0], src_v[0],
- rgb_buf + 4, rgb_buf + 5, rgb_buf + 6, yuvconstants);
- rgb_buf[7] = src_a[1];
+ YuvJPixel(src_y[0], src_u[0], src_v[0],
+ rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
+ rgb_buf[3] = 255;
+ YuvJPixel(src_y[1], src_u[0], src_v[0],
+ rgb_buf + 4, rgb_buf + 5, rgb_buf + 6);
+ rgb_buf[7] = 255;
src_y += 2;
src_u += 1;
src_v += 1;
- src_a += 2;
rgb_buf += 8; // Advance 2 pixels.
}
if (width & 1) {
- YuvPixel(src_y[0], src_u[0], src_v[0],
- rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants);
- rgb_buf[3] = src_a[0];
+ YuvJPixel(src_y[0], src_u[0], src_v[0],
+ rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
+ rgb_buf[3] = 255;
}
}
@@ -1394,14 +1183,13 @@
const uint8* src_u,
const uint8* src_v,
uint8* rgb_buf,
- const struct YuvConstants* yuvconstants,
int width) {
int x;
for (x = 0; x < width - 1; x += 2) {
YuvPixel(src_y[0], src_u[0], src_v[0],
- rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants);
+ rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
YuvPixel(src_y[1], src_u[0], src_v[0],
- rgb_buf + 3, rgb_buf + 4, rgb_buf + 5, yuvconstants);
+ rgb_buf + 3, rgb_buf + 4, rgb_buf + 5);
src_y += 2;
src_u += 1;
src_v += 1;
@@ -1409,15 +1197,36 @@
}
if (width & 1) {
YuvPixel(src_y[0], src_u[0], src_v[0],
- rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants);
+ rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
}
}
+void I422ToRAWRow_C(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* rgb_buf,
+ int width) {
+ int x;
+ for (x = 0; x < width - 1; x += 2) {
+ YuvPixel(src_y[0], src_u[0], src_v[0],
+ rgb_buf + 2, rgb_buf + 1, rgb_buf + 0);
+ YuvPixel(src_y[1], src_u[0], src_v[0],
+ rgb_buf + 5, rgb_buf + 4, rgb_buf + 3);
+ src_y += 2;
+ src_u += 1;
+ src_v += 1;
+ rgb_buf += 6; // Advance 2 pixels.
+ }
+ if (width & 1) {
+ YuvPixel(src_y[0], src_u[0], src_v[0],
+ rgb_buf + 2, rgb_buf + 1, rgb_buf + 0);
+ }
+}
+
void I422ToARGB4444Row_C(const uint8* src_y,
const uint8* src_u,
const uint8* src_v,
uint8* dst_argb4444,
- const struct YuvConstants* yuvconstants,
int width) {
uint8 b0;
uint8 g0;
@@ -1427,8 +1236,8 @@
uint8 r1;
int x;
for (x = 0; x < width - 1; x += 2) {
- YuvPixel(src_y[0], src_u[0], src_v[0], &b0, &g0, &r0, yuvconstants);
- YuvPixel(src_y[1], src_u[0], src_v[0], &b1, &g1, &r1, yuvconstants);
+ YuvPixel(src_y[0], src_u[0], src_v[0], &b0, &g0, &r0);
+ YuvPixel(src_y[1], src_u[0], src_v[0], &b1, &g1, &r1);
b0 = b0 >> 4;
g0 = g0 >> 4;
r0 = r0 >> 4;
@@ -1443,7 +1252,7 @@
dst_argb4444 += 4; // Advance 2 pixels.
}
if (width & 1) {
- YuvPixel(src_y[0], src_u[0], src_v[0], &b0, &g0, &r0, yuvconstants);
+ YuvPixel(src_y[0], src_u[0], src_v[0], &b0, &g0, &r0);
b0 = b0 >> 4;
g0 = g0 >> 4;
r0 = r0 >> 4;
@@ -1456,7 +1265,6 @@
const uint8* src_u,
const uint8* src_v,
uint8* dst_argb1555,
- const struct YuvConstants* yuvconstants,
int width) {
uint8 b0;
uint8 g0;
@@ -1466,8 +1274,8 @@
uint8 r1;
int x;
for (x = 0; x < width - 1; x += 2) {
- YuvPixel(src_y[0], src_u[0], src_v[0], &b0, &g0, &r0, yuvconstants);
- YuvPixel(src_y[1], src_u[0], src_v[0], &b1, &g1, &r1, yuvconstants);
+ YuvPixel(src_y[0], src_u[0], src_v[0], &b0, &g0, &r0);
+ YuvPixel(src_y[1], src_u[0], src_v[0], &b1, &g1, &r1);
b0 = b0 >> 3;
g0 = g0 >> 3;
r0 = r0 >> 3;
@@ -1482,7 +1290,7 @@
dst_argb1555 += 4; // Advance 2 pixels.
}
if (width & 1) {
- YuvPixel(src_y[0], src_u[0], src_v[0], &b0, &g0, &r0, yuvconstants);
+ YuvPixel(src_y[0], src_u[0], src_v[0], &b0, &g0, &r0);
b0 = b0 >> 3;
g0 = g0 >> 3;
r0 = r0 >> 3;
@@ -1495,7 +1303,6 @@
const uint8* src_u,
const uint8* src_v,
uint8* dst_rgb565,
- const struct YuvConstants* yuvconstants,
int width) {
uint8 b0;
uint8 g0;
@@ -1505,8 +1312,8 @@
uint8 r1;
int x;
for (x = 0; x < width - 1; x += 2) {
- YuvPixel(src_y[0], src_u[0], src_v[0], &b0, &g0, &r0, yuvconstants);
- YuvPixel(src_y[1], src_u[0], src_v[0], &b1, &g1, &r1, yuvconstants);
+ YuvPixel(src_y[0], src_u[0], src_v[0], &b0, &g0, &r0);
+ YuvPixel(src_y[1], src_u[0], src_v[0], &b1, &g1, &r1);
b0 = b0 >> 3;
g0 = g0 >> 2;
r0 = r0 >> 3;
@@ -1521,7 +1328,7 @@
dst_rgb565 += 4; // Advance 2 pixels.
}
if (width & 1) {
- YuvPixel(src_y[0], src_u[0], src_v[0], &b0, &g0, &r0, yuvconstants);
+ YuvPixel(src_y[0], src_u[0], src_v[0], &b0, &g0, &r0);
b0 = b0 >> 3;
g0 = g0 >> 2;
r0 = r0 >> 3;
@@ -1533,21 +1340,20 @@
const uint8* src_u,
const uint8* src_v,
uint8* rgb_buf,
- const struct YuvConstants* yuvconstants,
int width) {
int x;
for (x = 0; x < width - 3; x += 4) {
YuvPixel(src_y[0], src_u[0], src_v[0],
- rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants);
+ rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
rgb_buf[3] = 255;
YuvPixel(src_y[1], src_u[0], src_v[0],
- rgb_buf + 4, rgb_buf + 5, rgb_buf + 6, yuvconstants);
+ rgb_buf + 4, rgb_buf + 5, rgb_buf + 6);
rgb_buf[7] = 255;
YuvPixel(src_y[2], src_u[0], src_v[0],
- rgb_buf + 8, rgb_buf + 9, rgb_buf + 10, yuvconstants);
+ rgb_buf + 8, rgb_buf + 9, rgb_buf + 10);
rgb_buf[11] = 255;
YuvPixel(src_y[3], src_u[0], src_v[0],
- rgb_buf + 12, rgb_buf + 13, rgb_buf + 14, yuvconstants);
+ rgb_buf + 12, rgb_buf + 13, rgb_buf + 14);
rgb_buf[15] = 255;
src_y += 4;
src_u += 1;
@@ -1556,10 +1362,10 @@
}
if (width & 2) {
YuvPixel(src_y[0], src_u[0], src_v[0],
- rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants);
+ rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
rgb_buf[3] = 255;
YuvPixel(src_y[1], src_u[0], src_v[0],
- rgb_buf + 4, rgb_buf + 5, rgb_buf + 6, yuvconstants);
+ rgb_buf + 4, rgb_buf + 5, rgb_buf + 6);
rgb_buf[7] = 255;
src_y += 2;
rgb_buf += 8; // Advance 2 pixels.
@@ -1566,7 +1372,7 @@
}
if (width & 1) {
YuvPixel(src_y[0], src_u[0], src_v[0],
- rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants);
+ rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
rgb_buf[3] = 255;
}
}
@@ -1574,15 +1380,14 @@
void NV12ToARGBRow_C(const uint8* src_y,
const uint8* src_uv,
uint8* rgb_buf,
- const struct YuvConstants* yuvconstants,
int width) {
int x;
for (x = 0; x < width - 1; x += 2) {
YuvPixel(src_y[0], src_uv[0], src_uv[1],
- rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants);
+ rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
rgb_buf[3] = 255;
YuvPixel(src_y[1], src_uv[0], src_uv[1],
- rgb_buf + 4, rgb_buf + 5, rgb_buf + 6, yuvconstants);
+ rgb_buf + 4, rgb_buf + 5, rgb_buf + 6);
rgb_buf[7] = 255;
src_y += 2;
src_uv += 2;
@@ -1590,7 +1395,7 @@
}
if (width & 1) {
YuvPixel(src_y[0], src_uv[0], src_uv[1],
- rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants);
+ rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
rgb_buf[3] = 255;
}
}
@@ -1598,16 +1403,17 @@
void NV21ToARGBRow_C(const uint8* src_y,
const uint8* src_vu,
uint8* rgb_buf,
- const struct YuvConstants* yuvconstants,
int width) {
int x;
for (x = 0; x < width - 1; x += 2) {
YuvPixel(src_y[0], src_vu[1], src_vu[0],
- rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants);
+ rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
rgb_buf[3] = 255;
+
YuvPixel(src_y[1], src_vu[1], src_vu[0],
- rgb_buf + 4, rgb_buf + 5, rgb_buf + 6, yuvconstants);
+ rgb_buf + 4, rgb_buf + 5, rgb_buf + 6);
rgb_buf[7] = 255;
+
src_y += 2;
src_vu += 2;
rgb_buf += 8; // Advance 2 pixels.
@@ -1614,7 +1420,7 @@
}
if (width & 1) {
YuvPixel(src_y[0], src_vu[1], src_vu[0],
- rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants);
+ rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
rgb_buf[3] = 255;
}
}
@@ -1622,7 +1428,6 @@
void NV12ToRGB565Row_C(const uint8* src_y,
const uint8* src_uv,
uint8* dst_rgb565,
- const struct YuvConstants* yuvconstants,
int width) {
uint8 b0;
uint8 g0;
@@ -1632,8 +1437,8 @@
uint8 r1;
int x;
for (x = 0; x < width - 1; x += 2) {
- YuvPixel(src_y[0], src_uv[0], src_uv[1], &b0, &g0, &r0, yuvconstants);
- YuvPixel(src_y[1], src_uv[0], src_uv[1], &b1, &g1, &r1, yuvconstants);
+ YuvPixel(src_y[0], src_uv[0], src_uv[1], &b0, &g0, &r0);
+ YuvPixel(src_y[1], src_uv[0], src_uv[1], &b1, &g1, &r1);
b0 = b0 >> 3;
g0 = g0 >> 2;
r0 = r0 >> 3;
@@ -1647,7 +1452,7 @@
dst_rgb565 += 4; // Advance 2 pixels.
}
if (width & 1) {
- YuvPixel(src_y[0], src_uv[0], src_uv[1], &b0, &g0, &r0, yuvconstants);
+ YuvPixel(src_y[0], src_uv[0], src_uv[1], &b0, &g0, &r0);
b0 = b0 >> 3;
g0 = g0 >> 2;
r0 = r0 >> 3;
@@ -1655,17 +1460,51 @@
}
}
+void NV21ToRGB565Row_C(const uint8* src_y,
+ const uint8* vsrc_u,
+ uint8* dst_rgb565,
+ int width) {
+ uint8 b0;
+ uint8 g0;
+ uint8 r0;
+ uint8 b1;
+ uint8 g1;
+ uint8 r1;
+ int x;
+ for (x = 0; x < width - 1; x += 2) {
+ YuvPixel(src_y[0], vsrc_u[1], vsrc_u[0], &b0, &g0, &r0);
+ YuvPixel(src_y[1], vsrc_u[1], vsrc_u[0], &b1, &g1, &r1);
+ b0 = b0 >> 3;
+ g0 = g0 >> 2;
+ r0 = r0 >> 3;
+ b1 = b1 >> 3;
+ g1 = g1 >> 2;
+ r1 = r1 >> 3;
+ *(uint32*)(dst_rgb565) = b0 | (g0 << 5) | (r0 << 11) |
+ (b1 << 16) | (g1 << 21) | (r1 << 27);
+ src_y += 2;
+ vsrc_u += 2;
+ dst_rgb565 += 4; // Advance 2 pixels.
+ }
+ if (width & 1) {
+ YuvPixel(src_y[0], vsrc_u[1], vsrc_u[0], &b0, &g0, &r0);
+ b0 = b0 >> 3;
+ g0 = g0 >> 2;
+ r0 = r0 >> 3;
+ *(uint16*)(dst_rgb565) = b0 | (g0 << 5) | (r0 << 11);
+ }
+}
+
void YUY2ToARGBRow_C(const uint8* src_yuy2,
uint8* rgb_buf,
- const struct YuvConstants* yuvconstants,
int width) {
int x;
for (x = 0; x < width - 1; x += 2) {
YuvPixel(src_yuy2[0], src_yuy2[1], src_yuy2[3],
- rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants);
+ rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
rgb_buf[3] = 255;
YuvPixel(src_yuy2[2], src_yuy2[1], src_yuy2[3],
- rgb_buf + 4, rgb_buf + 5, rgb_buf + 6, yuvconstants);
+ rgb_buf + 4, rgb_buf + 5, rgb_buf + 6);
rgb_buf[7] = 255;
src_yuy2 += 4;
rgb_buf += 8; // Advance 2 pixels.
@@ -1672,7 +1511,7 @@
}
if (width & 1) {
YuvPixel(src_yuy2[0], src_yuy2[1], src_yuy2[3],
- rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants);
+ rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
rgb_buf[3] = 255;
}
}
@@ -1679,15 +1518,14 @@
void UYVYToARGBRow_C(const uint8* src_uyvy,
uint8* rgb_buf,
- const struct YuvConstants* yuvconstants,
int width) {
int x;
for (x = 0; x < width - 1; x += 2) {
YuvPixel(src_uyvy[1], src_uyvy[0], src_uyvy[2],
- rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants);
+ rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
rgb_buf[3] = 255;
YuvPixel(src_uyvy[3], src_uyvy[0], src_uyvy[2],
- rgb_buf + 4, rgb_buf + 5, rgb_buf + 6, yuvconstants);
+ rgb_buf + 4, rgb_buf + 5, rgb_buf + 6);
rgb_buf[7] = 255;
src_uyvy += 4;
rgb_buf += 8; // Advance 2 pixels.
@@ -1694,24 +1532,73 @@
}
if (width & 1) {
YuvPixel(src_uyvy[1], src_uyvy[0], src_uyvy[2],
- rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants);
+ rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
rgb_buf[3] = 255;
}
}
+void I422ToBGRARow_C(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* rgb_buf,
+ int width) {
+ int x;
+ for (x = 0; x < width - 1; x += 2) {
+ YuvPixel(src_y[0], src_u[0], src_v[0],
+ rgb_buf + 3, rgb_buf + 2, rgb_buf + 1);
+ rgb_buf[0] = 255;
+ YuvPixel(src_y[1], src_u[0], src_v[0],
+ rgb_buf + 7, rgb_buf + 6, rgb_buf + 5);
+ rgb_buf[4] = 255;
+ src_y += 2;
+ src_u += 1;
+ src_v += 1;
+ rgb_buf += 8; // Advance 2 pixels.
+ }
+ if (width & 1) {
+ YuvPixel(src_y[0], src_u[0], src_v[0],
+ rgb_buf + 3, rgb_buf + 2, rgb_buf + 1);
+ rgb_buf[0] = 255;
+ }
+}
+
+void I422ToABGRRow_C(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* rgb_buf,
+ int width) {
+ int x;
+ for (x = 0; x < width - 1; x += 2) {
+ YuvPixel(src_y[0], src_u[0], src_v[0],
+ rgb_buf + 2, rgb_buf + 1, rgb_buf + 0);
+ rgb_buf[3] = 255;
+ YuvPixel(src_y[1], src_u[0], src_v[0],
+ rgb_buf + 6, rgb_buf + 5, rgb_buf + 4);
+ rgb_buf[7] = 255;
+ src_y += 2;
+ src_u += 1;
+ src_v += 1;
+ rgb_buf += 8; // Advance 2 pixels.
+ }
+ if (width & 1) {
+ YuvPixel(src_y[0], src_u[0], src_v[0],
+ rgb_buf + 2, rgb_buf + 1, rgb_buf + 0);
+ rgb_buf[3] = 255;
+ }
+}
+
void I422ToRGBARow_C(const uint8* src_y,
const uint8* src_u,
const uint8* src_v,
uint8* rgb_buf,
- const struct YuvConstants* yuvconstants,
int width) {
int x;
for (x = 0; x < width - 1; x += 2) {
YuvPixel(src_y[0], src_u[0], src_v[0],
- rgb_buf + 1, rgb_buf + 2, rgb_buf + 3, yuvconstants);
+ rgb_buf + 1, rgb_buf + 2, rgb_buf + 3);
rgb_buf[0] = 255;
YuvPixel(src_y[1], src_u[0], src_v[0],
- rgb_buf + 5, rgb_buf + 6, rgb_buf + 7, yuvconstants);
+ rgb_buf + 5, rgb_buf + 6, rgb_buf + 7);
rgb_buf[4] = 255;
src_y += 2;
src_u += 1;
@@ -1720,7 +1607,7 @@
}
if (width & 1) {
YuvPixel(src_y[0], src_u[0], src_v[0],
- rgb_buf + 1, rgb_buf + 2, rgb_buf + 3, yuvconstants);
+ rgb_buf + 1, rgb_buf + 2, rgb_buf + 3);
rgb_buf[0] = 255;
}
}
@@ -1972,25 +1859,6 @@
}
}
#undef BLEND
-
-#define UBLEND(f, b, a) (((a) * f) + ((255 - a) * b) + 255) >> 8
-void BlendPlaneRow_C(const uint8* src0, const uint8* src1,
- const uint8* alpha, uint8* dst, int width) {
- int x;
- for (x = 0; x < width - 1; x += 2) {
- dst[0] = UBLEND(src0[0], src1[0], alpha[0]);
- dst[1] = UBLEND(src0[1], src1[1], alpha[1]);
- src0 += 2;
- src1 += 2;
- alpha += 2;
- dst += 2;
- }
- if (width & 1) {
- dst[0] = UBLEND(src0[0], src1[0], alpha[0]);
- }
-}
-#undef UBLEND
-
#define ATTENUATE(f, a) (a | (a << 8)) * (f | (f << 8)) >> 24
// Multiply source RGB by alpha and store to destination.
@@ -2147,18 +2015,18 @@
}
// Blend 2 rows into 1.
-static void HalfRow_C(const uint8* src_uv, ptrdiff_t src_uv_stride,
- uint8* dst_uv, int width) {
+static void HalfRow_C(const uint8* src_uv, int src_uv_stride,
+ uint8* dst_uv, int pix) {
int x;
- for (x = 0; x < width; ++x) {
+ for (x = 0; x < pix; ++x) {
dst_uv[x] = (src_uv[x] + src_uv[src_uv_stride + x] + 1) >> 1;
}
}
-static void HalfRow_16_C(const uint16* src_uv, ptrdiff_t src_uv_stride,
- uint16* dst_uv, int width) {
+static void HalfRow_16_C(const uint16* src_uv, int src_uv_stride,
+ uint16* dst_uv, int pix) {
int x;
- for (x = 0; x < width; ++x) {
+ for (x = 0; x < pix; ++x) {
dst_uv[x] = (src_uv[x] + src_uv[src_uv_stride + x] + 1) >> 1;
}
}
@@ -2167,30 +2035,27 @@
void InterpolateRow_C(uint8* dst_ptr, const uint8* src_ptr,
ptrdiff_t src_stride,
int width, int source_y_fraction) {
- int y1_fraction = source_y_fraction ;
+ int y1_fraction = source_y_fraction;
int y0_fraction = 256 - y1_fraction;
const uint8* src_ptr1 = src_ptr + src_stride;
int x;
- if (y1_fraction == 0) {
+ if (source_y_fraction == 0) {
memcpy(dst_ptr, src_ptr, width);
return;
}
- if (y1_fraction == 128) {
- HalfRow_C(src_ptr, src_stride, dst_ptr, width);
+ if (source_y_fraction == 128) {
+ HalfRow_C(src_ptr, (int)(src_stride), dst_ptr, width);
return;
}
for (x = 0; x < width - 1; x += 2) {
- dst_ptr[0] =
- (src_ptr[0] * y0_fraction + src_ptr1[0] * y1_fraction + 128) >> 8;
- dst_ptr[1] =
- (src_ptr[1] * y0_fraction + src_ptr1[1] * y1_fraction + 128) >> 8;
+ dst_ptr[0] = (src_ptr[0] * y0_fraction + src_ptr1[0] * y1_fraction) >> 8;
+ dst_ptr[1] = (src_ptr[1] * y0_fraction + src_ptr1[1] * y1_fraction) >> 8;
src_ptr += 2;
src_ptr1 += 2;
dst_ptr += 2;
}
if (width & 1) {
- dst_ptr[0] =
- (src_ptr[0] * y0_fraction + src_ptr1[0] * y1_fraction + 128) >> 8;
+ dst_ptr[0] = (src_ptr[0] * y0_fraction + src_ptr1[0] * y1_fraction) >> 8;
}
}
@@ -2206,7 +2071,7 @@
return;
}
if (source_y_fraction == 128) {
- HalfRow_16_C(src_ptr, src_stride, dst_ptr, width);
+ HalfRow_16_C(src_ptr, (int)(src_stride), dst_ptr, width);
return;
}
for (x = 0; x < width - 1; x += 2) {
@@ -2223,7 +2088,7 @@
// Use first 4 shuffler values to reorder ARGB channels.
void ARGBShuffleRow_C(const uint8* src_argb, uint8* dst_argb,
- const uint8* shuffler, int width) {
+ const uint8* shuffler, int pix) {
int index0 = shuffler[0];
int index1 = shuffler[1];
int index2 = shuffler[2];
@@ -2230,7 +2095,7 @@
int index3 = shuffler[3];
// Shuffle a row of ARGB.
int x;
- for (x = 0; x < width; ++x) {
+ for (x = 0; x < pix; ++x) {
// To support in-place conversion.
uint8 b = src_argb[index0];
uint8 g = src_argb[index1];
@@ -2291,126 +2156,10 @@
}
}
-
-void ARGBPolynomialRow_C(const uint8* src_argb,
- uint8* dst_argb,
- const float* poly,
- int width) {
- int i;
- for (i = 0; i < width; ++i) {
- float b = (float)(src_argb[0]);
- float g = (float)(src_argb[1]);
- float r = (float)(src_argb[2]);
- float a = (float)(src_argb[3]);
- float b2 = b * b;
- float g2 = g * g;
- float r2 = r * r;
- float a2 = a * a;
- float db = poly[0] + poly[4] * b;
- float dg = poly[1] + poly[5] * g;
- float dr = poly[2] + poly[6] * r;
- float da = poly[3] + poly[7] * a;
- float b3 = b2 * b;
- float g3 = g2 * g;
- float r3 = r2 * r;
- float a3 = a2 * a;
- db += poly[8] * b2;
- dg += poly[9] * g2;
- dr += poly[10] * r2;
- da += poly[11] * a2;
- db += poly[12] * b3;
- dg += poly[13] * g3;
- dr += poly[14] * r3;
- da += poly[15] * a3;
-
- dst_argb[0] = Clamp((int32)(db));
- dst_argb[1] = Clamp((int32)(dg));
- dst_argb[2] = Clamp((int32)(dr));
- dst_argb[3] = Clamp((int32)(da));
- src_argb += 4;
- dst_argb += 4;
- }
-}
-
-void ARGBLumaColorTableRow_C(const uint8* src_argb, uint8* dst_argb, int width,
- const uint8* luma, uint32 lumacoeff) {
- uint32 bc = lumacoeff & 0xff;
- uint32 gc = (lumacoeff >> 8) & 0xff;
- uint32 rc = (lumacoeff >> 16) & 0xff;
-
- int i;
- for (i = 0; i < width - 1; i += 2) {
- // Luminance in rows, color values in columns.
- const uint8* luma0 = ((src_argb[0] * bc + src_argb[1] * gc +
- src_argb[2] * rc) & 0x7F00u) + luma;
- const uint8* luma1;
- dst_argb[0] = luma0[src_argb[0]];
- dst_argb[1] = luma0[src_argb[1]];
- dst_argb[2] = luma0[src_argb[2]];
- dst_argb[3] = src_argb[3];
- luma1 = ((src_argb[4] * bc + src_argb[5] * gc +
- src_argb[6] * rc) & 0x7F00u) + luma;
- dst_argb[4] = luma1[src_argb[4]];
- dst_argb[5] = luma1[src_argb[5]];
- dst_argb[6] = luma1[src_argb[6]];
- dst_argb[7] = src_argb[7];
- src_argb += 8;
- dst_argb += 8;
- }
- if (width & 1) {
- // Luminance in rows, color values in columns.
- const uint8* luma0 = ((src_argb[0] * bc + src_argb[1] * gc +
- src_argb[2] * rc) & 0x7F00u) + luma;
- dst_argb[0] = luma0[src_argb[0]];
- dst_argb[1] = luma0[src_argb[1]];
- dst_argb[2] = luma0[src_argb[2]];
- dst_argb[3] = src_argb[3];
- }
-}
-
-void ARGBCopyAlphaRow_C(const uint8* src, uint8* dst, int width) {
- int i;
- for (i = 0; i < width - 1; i += 2) {
- dst[3] = src[3];
- dst[7] = src[7];
- dst += 8;
- src += 8;
- }
- if (width & 1) {
- dst[3] = src[3];
- }
-}
-
-void ARGBExtractAlphaRow_C(const uint8* src_argb, uint8* dst_a, int width) {
- int i;
- for (i = 0; i < width - 1; i += 2) {
- dst_a[0] = src_argb[3];
- dst_a[1] = src_argb[7];
- dst_a += 2;
- src_argb += 8;
- }
- if (width & 1) {
- dst_a[0] = src_argb[3];
- }
-}
-
-void ARGBCopyYToAlphaRow_C(const uint8* src, uint8* dst, int width) {
- int i;
- for (i = 0; i < width - 1; i += 2) {
- dst[3] = src[0];
- dst[7] = src[1];
- dst += 8;
- src += 2;
- }
- if (width & 1) {
- dst[3] = src[0];
- }
-}
-
// Maximum temporary width for wrappers to process at a time, in pixels.
#define MAXTWIDTH 2048
-#if !(defined(_MSC_VER) && defined(_M_IX86)) && \
+#if !(defined(_MSC_VER) && !defined(__clang__)) && \
defined(HAS_I422TORGB565ROW_SSSE3)
// row_win.cc has asm version, but GCC uses 2 step wrapper.
void I422ToRGB565Row_SSSE3(const uint8* src_y,
@@ -2417,12 +2166,11 @@
const uint8* src_u,
const uint8* src_v,
uint8* dst_rgb565,
- const struct YuvConstants* yuvconstants,
int width) {
SIMD_ALIGNED(uint8 row[MAXTWIDTH * 4]);
while (width > 0) {
int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
- I422ToARGBRow_SSSE3(src_y, src_u, src_v, row, yuvconstants, twidth);
+ I422ToARGBRow_SSSE3(src_y, src_u, src_v, row, twidth);
ARGBToRGB565Row_SSE2(row, dst_rgb565, twidth);
src_y += twidth;
src_u += twidth / 2;
@@ -2438,13 +2186,12 @@
const uint8* src_u,
const uint8* src_v,
uint8* dst_argb1555,
- const struct YuvConstants* yuvconstants,
int width) {
// Row buffer for intermediate ARGB pixels.
SIMD_ALIGNED(uint8 row[MAXTWIDTH * 4]);
while (width > 0) {
int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
- I422ToARGBRow_SSSE3(src_y, src_u, src_v, row, yuvconstants, twidth);
+ I422ToARGBRow_SSSE3(src_y, src_u, src_v, row, twidth);
ARGBToARGB1555Row_SSE2(row, dst_argb1555, twidth);
src_y += twidth;
src_u += twidth / 2;
@@ -2460,13 +2207,12 @@
const uint8* src_u,
const uint8* src_v,
uint8* dst_argb4444,
- const struct YuvConstants* yuvconstants,
int width) {
// Row buffer for intermediate ARGB pixels.
SIMD_ALIGNED(uint8 row[MAXTWIDTH * 4]);
while (width > 0) {
int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
- I422ToARGBRow_SSSE3(src_y, src_u, src_v, row, yuvconstants, twidth);
+ I422ToARGBRow_SSSE3(src_y, src_u, src_v, row, twidth);
ARGBToARGB4444Row_SSE2(row, dst_argb4444, twidth);
src_y += twidth;
src_u += twidth / 2;
@@ -2478,16 +2224,13 @@
#endif
#if defined(HAS_NV12TORGB565ROW_SSSE3)
-void NV12ToRGB565Row_SSSE3(const uint8* src_y,
- const uint8* src_uv,
- uint8* dst_rgb565,
- const struct YuvConstants* yuvconstants,
- int width) {
+void NV12ToRGB565Row_SSSE3(const uint8* src_y, const uint8* src_uv,
+ uint8* dst_rgb565, int width) {
// Row buffer for intermediate ARGB pixels.
SIMD_ALIGNED(uint8 row[MAXTWIDTH * 4]);
while (width > 0) {
int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
- NV12ToARGBRow_SSSE3(src_y, src_uv, row, yuvconstants, twidth);
+ NV12ToARGBRow_SSSE3(src_y, src_uv, row, twidth);
ARGBToRGB565Row_SSE2(row, dst_rgb565, twidth);
src_y += twidth;
src_uv += twidth;
@@ -2497,22 +2240,70 @@
}
#endif
+#if defined(HAS_NV21TORGB565ROW_SSSE3)
+void NV21ToRGB565Row_SSSE3(const uint8* src_y, const uint8* src_vu,
+ uint8* dst_rgb565, int width) {
+ // Row buffer for intermediate ARGB pixels.
+ SIMD_ALIGNED(uint8 row[MAXTWIDTH * 4]);
+ while (width > 0) {
+ int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
+ NV21ToARGBRow_SSSE3(src_y, src_vu, row, twidth);
+ ARGBToRGB565Row_SSE2(row, dst_rgb565, twidth);
+ src_y += twidth;
+ src_vu += twidth;
+ dst_rgb565 += twidth * 2;
+ width -= twidth;
+ }
+}
+#endif
+
+#if defined(HAS_YUY2TOARGBROW_SSSE3)
+void YUY2ToARGBRow_SSSE3(const uint8* src_yuy2, uint8* dst_argb, int width) {
+ // Row buffers for intermediate YUV pixels.
+ SIMD_ALIGNED(uint8 row_y[MAXTWIDTH]);
+ SIMD_ALIGNED(uint8 row_u[MAXTWIDTH / 2]);
+ SIMD_ALIGNED(uint8 row_v[MAXTWIDTH / 2]);
+ while (width > 0) {
+ int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
+ YUY2ToUV422Row_SSE2(src_yuy2, row_u, row_v, twidth);
+ YUY2ToYRow_SSE2(src_yuy2, row_y, twidth);
+ I422ToARGBRow_SSSE3(row_y, row_u, row_v, dst_argb, twidth);
+ src_yuy2 += twidth * 2;
+ dst_argb += twidth * 4;
+ width -= twidth;
+ }
+}
+#endif
+
+#if defined(HAS_UYVYTOARGBROW_SSSE3)
+void UYVYToARGBRow_SSSE3(const uint8* src_uyvy, uint8* dst_argb, int width) {
+ // Row buffers for intermediate YUV pixels.
+ SIMD_ALIGNED(uint8 row_y[MAXTWIDTH]);
+ SIMD_ALIGNED(uint8 row_u[MAXTWIDTH / 2]);
+ SIMD_ALIGNED(uint8 row_v[MAXTWIDTH / 2]);
+ while (width > 0) {
+ int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
+ UYVYToUV422Row_SSE2(src_uyvy, row_u, row_v, twidth);
+ UYVYToYRow_SSE2(src_uyvy, row_y, twidth);
+ I422ToARGBRow_SSSE3(row_y, row_u, row_v, dst_argb, twidth);
+ src_uyvy += twidth * 2;
+ dst_argb += twidth * 4;
+ width -= twidth;
+ }
+}
+#endif // !defined(LIBYUV_DISABLE_X86)
+
#if defined(HAS_I422TORGB565ROW_AVX2)
void I422ToRGB565Row_AVX2(const uint8* src_y,
const uint8* src_u,
const uint8* src_v,
uint8* dst_rgb565,
- const struct YuvConstants* yuvconstants,
int width) {
SIMD_ALIGNED32(uint8 row[MAXTWIDTH * 4]);
while (width > 0) {
int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
- I422ToARGBRow_AVX2(src_y, src_u, src_v, row, yuvconstants, twidth);
-#if defined(HAS_ARGBTORGB565ROW_AVX2)
+ I422ToARGBRow_AVX2(src_y, src_u, src_v, row, twidth);
ARGBToRGB565Row_AVX2(row, dst_rgb565, twidth);
-#else
- ARGBToRGB565Row_SSE2(row, dst_rgb565, twidth);
-#endif
src_y += twidth;
src_u += twidth / 2;
src_v += twidth / 2;
@@ -2527,18 +2318,13 @@
const uint8* src_u,
const uint8* src_v,
uint8* dst_argb1555,
- const struct YuvConstants* yuvconstants,
int width) {
// Row buffer for intermediate ARGB pixels.
SIMD_ALIGNED32(uint8 row[MAXTWIDTH * 4]);
while (width > 0) {
int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
- I422ToARGBRow_AVX2(src_y, src_u, src_v, row, yuvconstants, twidth);
-#if defined(HAS_ARGBTOARGB1555ROW_AVX2)
+ I422ToARGBRow_AVX2(src_y, src_u, src_v, row, twidth);
ARGBToARGB1555Row_AVX2(row, dst_argb1555, twidth);
-#else
- ARGBToARGB1555Row_SSE2(row, dst_argb1555, twidth);
-#endif
src_y += twidth;
src_u += twidth / 2;
src_v += twidth / 2;
@@ -2553,18 +2339,13 @@
const uint8* src_u,
const uint8* src_v,
uint8* dst_argb4444,
- const struct YuvConstants* yuvconstants,
int width) {
// Row buffer for intermediate ARGB pixels.
SIMD_ALIGNED32(uint8 row[MAXTWIDTH * 4]);
while (width > 0) {
int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
- I422ToARGBRow_AVX2(src_y, src_u, src_v, row, yuvconstants, twidth);
-#if defined(HAS_ARGBTOARGB4444ROW_AVX2)
+ I422ToARGBRow_AVX2(src_y, src_u, src_v, row, twidth);
ARGBToARGB4444Row_AVX2(row, dst_argb4444, twidth);
-#else
- ARGBToARGB4444Row_SSE2(row, dst_argb4444, twidth);
-#endif
src_y += twidth;
src_u += twidth / 2;
src_v += twidth / 2;
@@ -2579,13 +2360,12 @@
const uint8* src_u,
const uint8* src_v,
uint8* dst_rgb24,
- const struct YuvConstants* yuvconstants,
int width) {
// Row buffer for intermediate ARGB pixels.
SIMD_ALIGNED32(uint8 row[MAXTWIDTH * 4]);
while (width > 0) {
int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
- I422ToARGBRow_AVX2(src_y, src_u, src_v, row, yuvconstants, twidth);
+ I422ToARGBRow_AVX2(src_y, src_u, src_v, row, twidth);
// TODO(fbarchard): ARGBToRGB24Row_AVX2
ARGBToRGB24Row_SSSE3(row, dst_rgb24, twidth);
src_y += twidth;
@@ -2597,22 +2377,37 @@
}
#endif
+#if defined(HAS_I422TORAWROW_AVX2)
+void I422ToRAWRow_AVX2(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_raw,
+ int width) {
+ // Row buffer for intermediate ARGB pixels.
+ SIMD_ALIGNED32(uint8 row[MAXTWIDTH * 4]);
+ while (width > 0) {
+ int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
+ I422ToARGBRow_AVX2(src_y, src_u, src_v, row, twidth);
+ // TODO(fbarchard): ARGBToRAWRow_AVX2
+ ARGBToRAWRow_SSSE3(row, dst_raw, twidth);
+ src_y += twidth;
+ src_u += twidth / 2;
+ src_v += twidth / 2;
+ dst_raw += twidth * 3;
+ width -= twidth;
+ }
+}
+#endif
+
#if defined(HAS_NV12TORGB565ROW_AVX2)
-void NV12ToRGB565Row_AVX2(const uint8* src_y,
- const uint8* src_uv,
- uint8* dst_rgb565,
- const struct YuvConstants* yuvconstants,
- int width) {
+void NV12ToRGB565Row_AVX2(const uint8* src_y, const uint8* src_uv,
+ uint8* dst_rgb565, int width) {
// Row buffer for intermediate ARGB pixels.
SIMD_ALIGNED32(uint8 row[MAXTWIDTH * 4]);
while (width > 0) {
int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
- NV12ToARGBRow_AVX2(src_y, src_uv, row, yuvconstants, twidth);
-#if defined(HAS_ARGBTORGB565ROW_AVX2)
+ NV12ToARGBRow_AVX2(src_y, src_uv, row, twidth);
ARGBToRGB565Row_AVX2(row, dst_rgb565, twidth);
-#else
- ARGBToRGB565Row_SSE2(row, dst_rgb565, twidth);
-#endif
src_y += twidth;
src_uv += twidth;
dst_rgb565 += twidth * 2;
@@ -2620,6 +2415,160 @@
}
}
#endif
+
+#if defined(HAS_NV21TORGB565ROW_AVX2)
+void NV21ToRGB565Row_AVX2(const uint8* src_y, const uint8* src_vu,
+ uint8* dst_rgb565, int width) {
+ // Row buffer for intermediate ARGB pixels.
+ SIMD_ALIGNED32(uint8 row[MAXTWIDTH * 4]);
+ while (width > 0) {
+ int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
+ NV21ToARGBRow_AVX2(src_y, src_vu, row, twidth);
+ ARGBToRGB565Row_AVX2(row, dst_rgb565, twidth);
+ src_y += twidth;
+ src_vu += twidth;
+ dst_rgb565 += twidth * 2;
+ width -= twidth;
+ }
+}
+#endif
+
+#if defined(HAS_YUY2TOARGBROW_AVX2)
+void YUY2ToARGBRow_AVX2(const uint8* src_yuy2, uint8* dst_argb, int width) {
+ // Row buffers for intermediate YUV pixels.
+ SIMD_ALIGNED32(uint8 row_y[MAXTWIDTH]);
+ SIMD_ALIGNED32(uint8 row_u[MAXTWIDTH / 2]);
+ SIMD_ALIGNED32(uint8 row_v[MAXTWIDTH / 2]);
+ while (width > 0) {
+ int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
+ YUY2ToUV422Row_AVX2(src_yuy2, row_u, row_v, twidth);
+ YUY2ToYRow_AVX2(src_yuy2, row_y, twidth);
+ I422ToARGBRow_AVX2(row_y, row_u, row_v, dst_argb, twidth);
+ src_yuy2 += twidth * 2;
+ dst_argb += twidth * 4;
+ width -= twidth;
+ }
+}
+#endif
+
+#if defined(HAS_UYVYTOARGBROW_AVX2)
+void UYVYToARGBRow_AVX2(const uint8* src_uyvy, uint8* dst_argb, int width) {
+ // Row buffers for intermediate YUV pixels.
+ SIMD_ALIGNED32(uint8 row_y[MAXTWIDTH]);
+ SIMD_ALIGNED32(uint8 row_u[MAXTWIDTH / 2]);
+ SIMD_ALIGNED32(uint8 row_v[MAXTWIDTH / 2]);
+ while (width > 0) {
+ int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
+ UYVYToUV422Row_AVX2(src_uyvy, row_u, row_v, twidth);
+ UYVYToYRow_AVX2(src_uyvy, row_y, twidth);
+ I422ToARGBRow_AVX2(row_y, row_u, row_v, dst_argb, twidth);
+ src_uyvy += twidth * 2;
+ dst_argb += twidth * 4;
+ width -= twidth;
+ }
+}
+#endif // !defined(LIBYUV_DISABLE_X86)
+
+void ARGBPolynomialRow_C(const uint8* src_argb,
+ uint8* dst_argb, const float* poly,
+ int width) {
+ int i;
+ for (i = 0; i < width; ++i) {
+ float b = (float)(src_argb[0]);
+ float g = (float)(src_argb[1]);
+ float r = (float)(src_argb[2]);
+ float a = (float)(src_argb[3]);
+ float b2 = b * b;
+ float g2 = g * g;
+ float r2 = r * r;
+ float a2 = a * a;
+ float db = poly[0] + poly[4] * b;
+ float dg = poly[1] + poly[5] * g;
+ float dr = poly[2] + poly[6] * r;
+ float da = poly[3] + poly[7] * a;
+ float b3 = b2 * b;
+ float g3 = g2 * g;
+ float r3 = r2 * r;
+ float a3 = a2 * a;
+ db += poly[8] * b2;
+ dg += poly[9] * g2;
+ dr += poly[10] * r2;
+ da += poly[11] * a2;
+ db += poly[12] * b3;
+ dg += poly[13] * g3;
+ dr += poly[14] * r3;
+ da += poly[15] * a3;
+
+ dst_argb[0] = Clamp((int32)(db));
+ dst_argb[1] = Clamp((int32)(dg));
+ dst_argb[2] = Clamp((int32)(dr));
+ dst_argb[3] = Clamp((int32)(da));
+ src_argb += 4;
+ dst_argb += 4;
+ }
+}
+
+void ARGBLumaColorTableRow_C(const uint8* src_argb, uint8* dst_argb, int width,
+ const uint8* luma, uint32 lumacoeff) {
+ uint32 bc = lumacoeff & 0xff;
+ uint32 gc = (lumacoeff >> 8) & 0xff;
+ uint32 rc = (lumacoeff >> 16) & 0xff;
+
+ int i;
+ for (i = 0; i < width - 1; i += 2) {
+ // Luminance in rows, color values in columns.
+ const uint8* luma0 = ((src_argb[0] * bc + src_argb[1] * gc +
+ src_argb[2] * rc) & 0x7F00u) + luma;
+ const uint8* luma1;
+ dst_argb[0] = luma0[src_argb[0]];
+ dst_argb[1] = luma0[src_argb[1]];
+ dst_argb[2] = luma0[src_argb[2]];
+ dst_argb[3] = src_argb[3];
+ luma1 = ((src_argb[4] * bc + src_argb[5] * gc +
+ src_argb[6] * rc) & 0x7F00u) + luma;
+ dst_argb[4] = luma1[src_argb[4]];
+ dst_argb[5] = luma1[src_argb[5]];
+ dst_argb[6] = luma1[src_argb[6]];
+ dst_argb[7] = src_argb[7];
+ src_argb += 8;
+ dst_argb += 8;
+ }
+ if (width & 1) {
+ // Luminance in rows, color values in columns.
+ const uint8* luma0 = ((src_argb[0] * bc + src_argb[1] * gc +
+ src_argb[2] * rc) & 0x7F00u) + luma;
+ dst_argb[0] = luma0[src_argb[0]];
+ dst_argb[1] = luma0[src_argb[1]];
+ dst_argb[2] = luma0[src_argb[2]];
+ dst_argb[3] = src_argb[3];
+ }
+}
+
+void ARGBCopyAlphaRow_C(const uint8* src, uint8* dst, int width) {
+ int i;
+ for (i = 0; i < width - 1; i += 2) {
+ dst[3] = src[3];
+ dst[7] = src[7];
+ dst += 8;
+ src += 8;
+ }
+ if (width & 1) {
+ dst[3] = src[3];
+ }
+}
+
+void ARGBCopyYToAlphaRow_C(const uint8* src, uint8* dst, int width) {
+ int i;
+ for (i = 0; i < width - 1; i += 2) {
+ dst[3] = src[0];
+ dst[7] = src[1];
+ dst += 8;
+ src += 2;
+ }
+ if (width & 1) {
+ dst[3] = src[0];
+ }
+}
#ifdef __cplusplus
} // extern "C"
--- a/third_party/libyuv/source/row_gcc.cc
+++ b/third_party/libyuv/source/row_gcc.cc
@@ -17,8 +17,7 @@
#endif
// This module is for GCC x86 and x64.
-#if !defined(LIBYUV_DISABLE_X86) && \
- (defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER)))
+#if !defined(LIBYUV_DISABLE_X86) && (defined(__x86_64__) || defined(__i386__))
#if defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_ARGBGRAYROW_SSSE3)
@@ -121,24 +120,6 @@
2u, 1u, 0u, 12u, 5u, 4u, 3u, 13u, 8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u
};
-// Shuffle table for converting RAW to RGB24. First 8.
-static const uvec8 kShuffleMaskRAWToRGB24_0 = {
- 2u, 1u, 0u, 5u, 4u, 3u, 8u, 7u,
- 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u
-};
-
-// Shuffle table for converting RAW to RGB24. Middle 8.
-static const uvec8 kShuffleMaskRAWToRGB24_1 = {
- 2u, 7u, 6u, 5u, 10u, 9u, 8u, 13u,
- 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u
-};
-
-// Shuffle table for converting RAW to RGB24. Last 8.
-static const uvec8 kShuffleMaskRAWToRGB24_2 = {
- 8u, 7u, 12u, 11u, 10u, 15u, 14u, 13u,
- 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u
-};
-
// Shuffle table for converting ARGB to RGB24.
static uvec8 kShuffleMaskARGBToRGB24 = {
0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 10u, 12u, 13u, 14u, 128u, 128u, 128u, 128u
@@ -154,39 +135,109 @@
0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 128u, 128u, 128u, 128u, 10u, 12u, 13u, 14u
};
-// YUY2 shuf 16 Y to 32 Y.
-static const lvec8 kShuffleYUY2Y = {
- 0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14,
- 0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14
+// Shuffle table for converting ARGB to RAW.
+static uvec8 kShuffleMaskARGBToRAW_0 = {
+ 2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 128u, 128u, 128u, 128u, 8u, 14u, 13u, 12u
};
+#endif // HAS_RGB24TOARGBROW_SSSE3
-// YUY2 shuf 8 UV to 16 UV.
-static const lvec8 kShuffleYUY2UV = {
- 1, 3, 1, 3, 5, 7, 5, 7, 9, 11, 9, 11, 13, 15, 13, 15,
- 1, 3, 1, 3, 5, 7, 5, 7, 9, 11, 9, 11, 13, 15, 13, 15
-};
+#if defined(TESTING) && defined(__x86_64__)
+void TestRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) {
+ asm volatile (
+ ".p2align 5 \n"
+ "mov %%eax,%%eax \n"
+ "mov %%ebx,%%ebx \n"
+ "mov %%ecx,%%ecx \n"
+ "mov %%edx,%%edx \n"
+ "mov %%esi,%%esi \n"
+ "mov %%edi,%%edi \n"
+ "mov %%ebp,%%ebp \n"
+ "mov %%esp,%%esp \n"
+ ".p2align 5 \n"
+ "mov %%r8d,%%r8d \n"
+ "mov %%r9d,%%r9d \n"
+ "mov %%r10d,%%r10d \n"
+ "mov %%r11d,%%r11d \n"
+ "mov %%r12d,%%r12d \n"
+ "mov %%r13d,%%r13d \n"
+ "mov %%r14d,%%r14d \n"
+ "mov %%r15d,%%r15d \n"
+ ".p2align 5 \n"
+ "lea (%%rax),%%eax \n"
+ "lea (%%rbx),%%ebx \n"
+ "lea (%%rcx),%%ecx \n"
+ "lea (%%rdx),%%edx \n"
+ "lea (%%rsi),%%esi \n"
+ "lea (%%rdi),%%edi \n"
+ "lea (%%rbp),%%ebp \n"
+ "lea (%%rsp),%%esp \n"
+ ".p2align 5 \n"
+ "lea (%%r8),%%r8d \n"
+ "lea (%%r9),%%r9d \n"
+ "lea (%%r10),%%r10d \n"
+ "lea (%%r11),%%r11d \n"
+ "lea (%%r12),%%r12d \n"
+ "lea (%%r13),%%r13d \n"
+ "lea (%%r14),%%r14d \n"
+ "lea (%%r15),%%r15d \n"
-// UYVY shuf 16 Y to 32 Y.
-static const lvec8 kShuffleUYVYY = {
- 1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15,
- 1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15
-};
+ ".p2align 5 \n"
+ "lea 0x10(%%rax),%%eax \n"
+ "lea 0x10(%%rbx),%%ebx \n"
+ "lea 0x10(%%rcx),%%ecx \n"
+ "lea 0x10(%%rdx),%%edx \n"
+ "lea 0x10(%%rsi),%%esi \n"
+ "lea 0x10(%%rdi),%%edi \n"
+ "lea 0x10(%%rbp),%%ebp \n"
+ "lea 0x10(%%rsp),%%esp \n"
+ ".p2align 5 \n"
+ "lea 0x10(%%r8),%%r8d \n"
+ "lea 0x10(%%r9),%%r9d \n"
+ "lea 0x10(%%r10),%%r10d \n"
+ "lea 0x10(%%r11),%%r11d \n"
+ "lea 0x10(%%r12),%%r12d \n"
+ "lea 0x10(%%r13),%%r13d \n"
+ "lea 0x10(%%r14),%%r14d \n"
+ "lea 0x10(%%r15),%%r15d \n"
-// UYVY shuf 8 UV to 16 UV.
-static const lvec8 kShuffleUYVYUV = {
- 0, 2, 0, 2, 4, 6, 4, 6, 8, 10, 8, 10, 12, 14, 12, 14,
- 0, 2, 0, 2, 4, 6, 4, 6, 8, 10, 8, 10, 12, 14, 12, 14
-};
+ ".p2align 5 \n"
+ "add 0x10,%%eax \n"
+ "add 0x10,%%ebx \n"
+ "add 0x10,%%ecx \n"
+ "add 0x10,%%edx \n"
+ "add 0x10,%%esi \n"
+ "add 0x10,%%edi \n"
+ "add 0x10,%%ebp \n"
+ "add 0x10,%%esp \n"
+ ".p2align 5 \n"
+ "add 0x10,%%r8d \n"
+ "add 0x10,%%r9d \n"
+ "add 0x10,%%r10d \n"
+ "add 0x10,%%r11d \n"
+ "add 0x10,%%r12d \n"
+ "add 0x10,%%r13d \n"
+ "add 0x10,%%r14d \n"
+ "add 0x10,%%r15d \n"
-// NV21 shuf 8 VU to 16 UV.
-static const lvec8 kShuffleNV21 = {
- 1, 0, 1, 0, 3, 2, 3, 2, 5, 4, 5, 4, 7, 6, 7, 6,
- 1, 0, 1, 0, 3, 2, 3, 2, 5, 4, 5, 4, 7, 6, 7, 6,
-};
-#endif // HAS_RGB24TOARGBROW_SSSE3
+ ".p2align 2 \n"
+ "1: \n"
+ "movq " MEMACCESS(0) ",%%xmm0 \n"
+ "lea " MEMLEA(0x8,0) ",%0 \n"
+ "movdqu %%xmm0," MEMACCESS(1) " \n"
+ "lea " MEMLEA(0x20,1) ",%1 \n"
+ "sub $0x8,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_y), // %0
+ "+r"(dst_argb), // %1
+ "+r"(pix) // %2
+ :
+ : "memory", "cc", "xmm0", "xmm1", "xmm5"
+ );
+}
+#endif // TESTING
#ifdef HAS_J400TOARGBROW_SSE2
-void J400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int width) {
+void J400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) {
asm volatile (
"pcmpeqb %%xmm5,%%xmm5 \n"
"pslld $0x18,%%xmm5 \n"
@@ -207,7 +258,7 @@
"jg 1b \n"
: "+r"(src_y), // %0
"+r"(dst_argb), // %1
- "+r"(width) // %2
+ "+r"(pix) // %2
:: "memory", "cc", "xmm0", "xmm1", "xmm5"
);
}
@@ -214,7 +265,7 @@
#endif // HAS_J400TOARGBROW_SSE2
#ifdef HAS_RGB24TOARGBROW_SSSE3
-void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int width) {
+void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int pix) {
asm volatile (
"pcmpeqb %%xmm5,%%xmm5 \n" // generate mask 0xff000000
"pslld $0x18,%%xmm5 \n"
@@ -246,13 +297,13 @@
"jg 1b \n"
: "+r"(src_rgb24), // %0
"+r"(dst_argb), // %1
- "+r"(width) // %2
+ "+r"(pix) // %2
: "m"(kShuffleMaskRGB24ToARGB) // %3
: "memory", "cc" , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
);
}
-void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb, int width) {
+void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb, int pix) {
asm volatile (
"pcmpeqb %%xmm5,%%xmm5 \n" // generate mask 0xff000000
"pslld $0x18,%%xmm5 \n"
@@ -284,44 +335,14 @@
"jg 1b \n"
: "+r"(src_raw), // %0
"+r"(dst_argb), // %1
- "+r"(width) // %2
+ "+r"(pix) // %2
: "m"(kShuffleMaskRAWToARGB) // %3
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
);
}
-void RAWToRGB24Row_SSSE3(const uint8* src_raw, uint8* dst_rgb24, int width) {
+void RGB565ToARGBRow_SSE2(const uint8* src, uint8* dst, int pix) {
asm volatile (
- "movdqa %3,%%xmm3 \n"
- "movdqa %4,%%xmm4 \n"
- "movdqa %5,%%xmm5 \n"
- LABELALIGN
- "1: \n"
- "movdqu " MEMACCESS(0) ",%%xmm0 \n"
- "movdqu " MEMACCESS2(0x4,0) ",%%xmm1 \n"
- "movdqu " MEMACCESS2(0x8,0) ",%%xmm2 \n"
- "lea " MEMLEA(0x18,0) ",%0 \n"
- "pshufb %%xmm3,%%xmm0 \n"
- "pshufb %%xmm4,%%xmm1 \n"
- "pshufb %%xmm5,%%xmm2 \n"
- "movq %%xmm0," MEMACCESS(1) " \n"
- "movq %%xmm1," MEMACCESS2(0x8,1) " \n"
- "movq %%xmm2," MEMACCESS2(0x10,1) " \n"
- "lea " MEMLEA(0x18,1) ",%1 \n"
- "sub $0x8,%2 \n"
- "jg 1b \n"
- : "+r"(src_raw), // %0
- "+r"(dst_rgb24), // %1
- "+r"(width) // %2
- : "m"(kShuffleMaskRAWToRGB24_0), // %3
- "m"(kShuffleMaskRAWToRGB24_1), // %4
- "m"(kShuffleMaskRAWToRGB24_2) // %5
- : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
- );
-}
-
-void RGB565ToARGBRow_SSE2(const uint8* src, uint8* dst, int width) {
- asm volatile (
"mov $0x1080108,%%eax \n"
"movd %%eax,%%xmm5 \n"
"pshufd $0x0,%%xmm5,%%xmm5 \n"
@@ -361,7 +382,7 @@
"jg 1b \n"
: "+r"(src), // %0
"+r"(dst), // %1
- "+r"(width) // %2
+ "+r"(pix) // %2
:
: "memory", "cc", "eax", NACL_R14
"xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
@@ -368,7 +389,7 @@
);
}
-void ARGB1555ToARGBRow_SSE2(const uint8* src, uint8* dst, int width) {
+void ARGB1555ToARGBRow_SSE2(const uint8* src, uint8* dst, int pix) {
asm volatile (
"mov $0x1080108,%%eax \n"
"movd %%eax,%%xmm5 \n"
@@ -412,7 +433,7 @@
"jg 1b \n"
: "+r"(src), // %0
"+r"(dst), // %1
- "+r"(width) // %2
+ "+r"(pix) // %2
:
: "memory", "cc", "eax", NACL_R14
"xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
@@ -419,7 +440,7 @@
);
}
-void ARGB4444ToARGBRow_SSE2(const uint8* src, uint8* dst, int width) {
+void ARGB4444ToARGBRow_SSE2(const uint8* src, uint8* dst, int pix) {
asm volatile (
"mov $0xf0f0f0f,%%eax \n"
"movd %%eax,%%xmm4 \n"
@@ -450,7 +471,7 @@
"jg 1b \n"
: "+r"(src), // %0
"+r"(dst), // %1
- "+r"(width) // %2
+ "+r"(pix) // %2
:
: "memory", "cc", "eax", NACL_R14
"xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
@@ -457,7 +478,7 @@
);
}
-void ARGBToRGB24Row_SSSE3(const uint8* src, uint8* dst, int width) {
+void ARGBToRGB24Row_SSSE3(const uint8* src, uint8* dst, int pix) {
asm volatile (
"movdqa %3,%%xmm6 \n"
LABELALIGN
@@ -489,13 +510,13 @@
"jg 1b \n"
: "+r"(src), // %0
"+r"(dst), // %1
- "+r"(width) // %2
+ "+r"(pix) // %2
: "m"(kShuffleMaskARGBToRGB24) // %3
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
);
}
-void ARGBToRAWRow_SSSE3(const uint8* src, uint8* dst, int width) {
+void ARGBToRAWRow_SSSE3(const uint8* src, uint8* dst, int pix) {
asm volatile (
"movdqa %3,%%xmm6 \n"
LABELALIGN
@@ -527,13 +548,13 @@
"jg 1b \n"
: "+r"(src), // %0
"+r"(dst), // %1
- "+r"(width) // %2
+ "+r"(pix) // %2
: "m"(kShuffleMaskARGBToRAW) // %3
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
);
}
-void ARGBToRGB565Row_SSE2(const uint8* src, uint8* dst, int width) {
+void ARGBToRGB565Row_SSE2(const uint8* src, uint8* dst, int pix) {
asm volatile (
"pcmpeqb %%xmm3,%%xmm3 \n"
"psrld $0x1b,%%xmm3 \n"
@@ -564,105 +585,13 @@
"jg 1b \n"
: "+r"(src), // %0
"+r"(dst), // %1
- "+r"(width) // %2
+ "+r"(pix) // %2
:: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
);
}
-void ARGBToRGB565DitherRow_SSE2(const uint8* src, uint8* dst,
- const uint32 dither4, int width) {
+void ARGBToARGB1555Row_SSE2(const uint8* src, uint8* dst, int pix) {
asm volatile (
- "movd %3,%%xmm6 \n"
- "punpcklbw %%xmm6,%%xmm6 \n"
- "movdqa %%xmm6,%%xmm7 \n"
- "punpcklwd %%xmm6,%%xmm6 \n"
- "punpckhwd %%xmm7,%%xmm7 \n"
- "pcmpeqb %%xmm3,%%xmm3 \n"
- "psrld $0x1b,%%xmm3 \n"
- "pcmpeqb %%xmm4,%%xmm4 \n"
- "psrld $0x1a,%%xmm4 \n"
- "pslld $0x5,%%xmm4 \n"
- "pcmpeqb %%xmm5,%%xmm5 \n"
- "pslld $0xb,%%xmm5 \n"
-
- LABELALIGN
- "1: \n"
- "movdqu (%0),%%xmm0 \n"
- "paddusb %%xmm6,%%xmm0 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "movdqa %%xmm0,%%xmm2 \n"
- "pslld $0x8,%%xmm0 \n"
- "psrld $0x3,%%xmm1 \n"
- "psrld $0x5,%%xmm2 \n"
- "psrad $0x10,%%xmm0 \n"
- "pand %%xmm3,%%xmm1 \n"
- "pand %%xmm4,%%xmm2 \n"
- "pand %%xmm5,%%xmm0 \n"
- "por %%xmm2,%%xmm1 \n"
- "por %%xmm1,%%xmm0 \n"
- "packssdw %%xmm0,%%xmm0 \n"
- "lea 0x10(%0),%0 \n"
- "movq %%xmm0,(%1) \n"
- "lea 0x8(%1),%1 \n"
- "sub $0x4,%2 \n"
- "jg 1b \n"
- : "+r"(src), // %0
- "+r"(dst), // %1
- "+r"(width) // %2
- : "m"(dither4) // %3
- : "memory", "cc",
- "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
- );
-}
-
-#ifdef HAS_ARGBTORGB565DITHERROW_AVX2
-void ARGBToRGB565DitherRow_AVX2(const uint8* src, uint8* dst,
- const uint32 dither4, int width) {
- asm volatile (
- "vbroadcastss %3,%%xmm6 \n"
- "vpunpcklbw %%xmm6,%%xmm6,%%xmm6 \n"
- "vpermq $0xd8,%%ymm6,%%ymm6 \n"
- "vpunpcklwd %%ymm6,%%ymm6,%%ymm6 \n"
- "vpcmpeqb %%ymm3,%%ymm3,%%ymm3 \n"
- "vpsrld $0x1b,%%ymm3,%%ymm3 \n"
- "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n"
- "vpsrld $0x1a,%%ymm4,%%ymm4 \n"
- "vpslld $0x5,%%ymm4,%%ymm4 \n"
- "vpslld $0xb,%%ymm3,%%ymm5 \n"
-
- LABELALIGN
- "1: \n"
- "vmovdqu (%0),%%ymm0 \n"
- "vpaddusb %%ymm6,%%ymm0,%%ymm0 \n"
- "vpsrld $0x5,%%ymm0,%%ymm2 \n"
- "vpsrld $0x3,%%ymm0,%%ymm1 \n"
- "vpsrld $0x8,%%ymm0,%%ymm0 \n"
- "vpand %%ymm4,%%ymm2,%%ymm2 \n"
- "vpand %%ymm3,%%ymm1,%%ymm1 \n"
- "vpand %%ymm5,%%ymm0,%%ymm0 \n"
- "vpor %%ymm2,%%ymm1,%%ymm1 \n"
- "vpor %%ymm1,%%ymm0,%%ymm0 \n"
- "vpackusdw %%ymm0,%%ymm0,%%ymm0 \n"
- "vpermq $0xd8,%%ymm0,%%ymm0 \n"
- "lea 0x20(%0),%0 \n"
- "vmovdqu %%xmm0,(%1) \n"
- "lea 0x10(%1),%1 \n"
- "sub $0x8,%2 \n"
- "jg 1b \n"
- "vzeroupper \n"
- : "+r"(src), // %0
- "+r"(dst), // %1
- "+r"(width) // %2
- : "m"(dither4) // %3
- : "memory", "cc",
- "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
- );
-}
-#endif // HAS_ARGBTORGB565DITHERROW_AVX2
-
-
-void ARGBToARGB1555Row_SSE2(const uint8* src, uint8* dst, int width) {
- asm volatile (
"pcmpeqb %%xmm4,%%xmm4 \n"
"psrld $0x1b,%%xmm4 \n"
"movdqa %%xmm4,%%xmm5 \n"
@@ -696,13 +625,13 @@
"jg 1b \n"
: "+r"(src), // %0
"+r"(dst), // %1
- "+r"(width) // %2
+ "+r"(pix) // %2
:: "memory", "cc",
"xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
);
}
-void ARGBToARGB4444Row_SSE2(const uint8* src, uint8* dst, int width) {
+void ARGBToARGB4444Row_SSE2(const uint8* src, uint8* dst, int pix) {
asm volatile (
"pcmpeqb %%xmm4,%%xmm4 \n"
"psllw $0xc,%%xmm4 \n"
@@ -725,7 +654,7 @@
"jg 1b \n"
: "+r"(src), // %0
"+r"(dst), // %1
- "+r"(width) // %2
+ "+r"(pix) // %2
:: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
);
}
@@ -733,7 +662,7 @@
#ifdef HAS_ARGBTOYROW_SSSE3
// Convert 16 ARGB pixels (64 bytes) to 16 Y values.
-void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int width) {
+void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
asm volatile (
"movdqa %3,%%xmm4 \n"
"movdqa %4,%%xmm5 \n"
@@ -760,7 +689,7 @@
"jg 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_y), // %1
- "+r"(width) // %2
+ "+r"(pix) // %2
: "m"(kARGBToY), // %3
"m"(kAddY16) // %4
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
@@ -771,7 +700,7 @@
#ifdef HAS_ARGBTOYJROW_SSSE3
// Convert 16 ARGB pixels (64 bytes) to 16 YJ values.
// Same as ARGBToYRow but different coefficients, no add 16, but do rounding.
-void ARGBToYJRow_SSSE3(const uint8* src_argb, uint8* dst_y, int width) {
+void ARGBToYJRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
asm volatile (
"movdqa %3,%%xmm4 \n"
"movdqa %4,%%xmm5 \n"
@@ -799,7 +728,7 @@
"jg 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_y), // %1
- "+r"(width) // %2
+ "+r"(pix) // %2
: "m"(kARGBToYJ), // %3
"m"(kAddYJ64) // %4
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
@@ -814,7 +743,7 @@
};
// Convert 32 ARGB pixels (128 bytes) to 32 Y values.
-void ARGBToYRow_AVX2(const uint8* src_argb, uint8* dst_y, int width) {
+void ARGBToYRow_AVX2(const uint8* src_argb, uint8* dst_y, int pix) {
asm volatile (
"vbroadcastf128 %3,%%ymm4 \n"
"vbroadcastf128 %4,%%ymm5 \n"
@@ -844,7 +773,7 @@
"vzeroupper \n"
: "+r"(src_argb), // %0
"+r"(dst_y), // %1
- "+r"(width) // %2
+ "+r"(pix) // %2
: "m"(kARGBToY), // %3
"m"(kAddY16), // %4
"m"(kPermdARGBToY_AVX) // %5
@@ -855,7 +784,7 @@
#ifdef HAS_ARGBTOYJROW_AVX2
// Convert 32 ARGB pixels (128 bytes) to 32 Y values.
-void ARGBToYJRow_AVX2(const uint8* src_argb, uint8* dst_y, int width) {
+void ARGBToYJRow_AVX2(const uint8* src_argb, uint8* dst_y, int pix) {
asm volatile (
"vbroadcastf128 %3,%%ymm4 \n"
"vbroadcastf128 %4,%%ymm5 \n"
@@ -886,7 +815,7 @@
"vzeroupper \n"
: "+r"(src_argb), // %0
"+r"(dst_y), // %1
- "+r"(width) // %2
+ "+r"(pix) // %2
: "m"(kARGBToYJ), // %3
"m"(kAddYJ64), // %4
"m"(kPermdARGBToY_AVX) // %5
@@ -1023,67 +952,6 @@
}
#endif // HAS_ARGBTOUVROW_AVX2
-#ifdef HAS_ARGBTOUVJROW_AVX2
-void ARGBToUVJRow_AVX2(const uint8* src_argb0, int src_stride_argb,
- uint8* dst_u, uint8* dst_v, int width) {
- asm volatile (
- "vbroadcastf128 %5,%%ymm5 \n"
- "vbroadcastf128 %6,%%ymm6 \n"
- "vbroadcastf128 %7,%%ymm7 \n"
- "sub %1,%2 \n"
- LABELALIGN
- "1: \n"
- "vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
- "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n"
- "vmovdqu " MEMACCESS2(0x40,0) ",%%ymm2 \n"
- "vmovdqu " MEMACCESS2(0x60,0) ",%%ymm3 \n"
- VMEMOPREG(vpavgb,0x00,0,4,1,ymm0,ymm0) // vpavgb (%0,%4,1),%%ymm0,%%ymm0
- VMEMOPREG(vpavgb,0x20,0,4,1,ymm1,ymm1)
- VMEMOPREG(vpavgb,0x40,0,4,1,ymm2,ymm2)
- VMEMOPREG(vpavgb,0x60,0,4,1,ymm3,ymm3)
- "lea " MEMLEA(0x80,0) ",%0 \n"
- "vshufps $0x88,%%ymm1,%%ymm0,%%ymm4 \n"
- "vshufps $0xdd,%%ymm1,%%ymm0,%%ymm0 \n"
- "vpavgb %%ymm4,%%ymm0,%%ymm0 \n"
- "vshufps $0x88,%%ymm3,%%ymm2,%%ymm4 \n"
- "vshufps $0xdd,%%ymm3,%%ymm2,%%ymm2 \n"
- "vpavgb %%ymm4,%%ymm2,%%ymm2 \n"
-
- "vpmaddubsw %%ymm7,%%ymm0,%%ymm1 \n"
- "vpmaddubsw %%ymm7,%%ymm2,%%ymm3 \n"
- "vpmaddubsw %%ymm6,%%ymm0,%%ymm0 \n"
- "vpmaddubsw %%ymm6,%%ymm2,%%ymm2 \n"
- "vphaddw %%ymm3,%%ymm1,%%ymm1 \n"
- "vphaddw %%ymm2,%%ymm0,%%ymm0 \n"
- "vpaddw %%ymm5,%%ymm0,%%ymm0 \n"
- "vpaddw %%ymm5,%%ymm1,%%ymm1 \n"
- "vpsraw $0x8,%%ymm1,%%ymm1 \n"
- "vpsraw $0x8,%%ymm0,%%ymm0 \n"
- "vpacksswb %%ymm0,%%ymm1,%%ymm0 \n"
- "vpermq $0xd8,%%ymm0,%%ymm0 \n"
- "vpshufb %8,%%ymm0,%%ymm0 \n"
-
- "vextractf128 $0x0,%%ymm0," MEMACCESS(1) " \n"
- VEXTOPMEM(vextractf128,1,ymm0,0x0,1,2,1) // vextractf128 $1,%%ymm0,(%1,%2,1)
- "lea " MEMLEA(0x10,1) ",%1 \n"
- "sub $0x20,%3 \n"
- "jg 1b \n"
- "vzeroupper \n"
- : "+r"(src_argb0), // %0
- "+r"(dst_u), // %1
- "+r"(dst_v), // %2
- "+rm"(width) // %3
- : "r"((intptr_t)(src_stride_argb)), // %4
- "m"(kAddUVJ128), // %5
- "m"(kARGBToVJ), // %6
- "m"(kARGBToUJ), // %7
- "m"(kShufARGBToUV_AVX) // %8
- : "memory", "cc", NACL_R14
- "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
- );
-}
-#endif // HAS_ARGBTOUVJROW_AVX2
-
#ifdef HAS_ARGBTOUVJROW_SSSE3
void ARGBToUVJRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
uint8* dst_u, uint8* dst_v, int width) {
@@ -1205,8 +1073,61 @@
}
#endif // HAS_ARGBTOUV444ROW_SSSE3
-void BGRAToYRow_SSSE3(const uint8* src_bgra, uint8* dst_y, int width) {
+#ifdef HAS_ARGBTOUV422ROW_SSSE3
+void ARGBToUV422Row_SSSE3(const uint8* src_argb0,
+ uint8* dst_u, uint8* dst_v, int width) {
asm volatile (
+ "movdqa %4,%%xmm3 \n"
+ "movdqa %5,%%xmm4 \n"
+ "movdqa %6,%%xmm5 \n"
+ "sub %1,%2 \n"
+ LABELALIGN
+ "1: \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
+ "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n"
+ "lea " MEMLEA(0x40,0) ",%0 \n"
+ "movdqa %%xmm0,%%xmm7 \n"
+ "shufps $0x88,%%xmm1,%%xmm0 \n"
+ "shufps $0xdd,%%xmm1,%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm0 \n"
+ "movdqa %%xmm2,%%xmm7 \n"
+ "shufps $0x88,%%xmm6,%%xmm2 \n"
+ "shufps $0xdd,%%xmm6,%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm2 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "movdqa %%xmm2,%%xmm6 \n"
+ "pmaddubsw %%xmm4,%%xmm0 \n"
+ "pmaddubsw %%xmm4,%%xmm2 \n"
+ "pmaddubsw %%xmm3,%%xmm1 \n"
+ "pmaddubsw %%xmm3,%%xmm6 \n"
+ "phaddw %%xmm2,%%xmm0 \n"
+ "phaddw %%xmm6,%%xmm1 \n"
+ "psraw $0x8,%%xmm0 \n"
+ "psraw $0x8,%%xmm1 \n"
+ "packsswb %%xmm1,%%xmm0 \n"
+ "paddb %%xmm5,%%xmm0 \n"
+ "movlps %%xmm0," MEMACCESS(1) " \n"
+ MEMOPMEM(movhps,xmm0,0x00,1,2,1) // movhps %%xmm0,(%1,%2,1)
+ "lea " MEMLEA(0x8,1) ",%1 \n"
+ "sub $0x10,%3 \n"
+ "jg 1b \n"
+ : "+r"(src_argb0), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+rm"(width) // %3
+ : "m"(kARGBToV), // %4
+ "m"(kARGBToU), // %5
+ "m"(kAddUV128) // %6
+ : "memory", "cc", NACL_R14
+ "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
+ );
+}
+#endif // HAS_ARGBTOUV422ROW_SSSE3
+
+void BGRAToYRow_SSSE3(const uint8* src_bgra, uint8* dst_y, int pix) {
+ asm volatile (
"movdqa %4,%%xmm5 \n"
"movdqa %3,%%xmm4 \n"
LABELALIGN
@@ -1232,7 +1153,7 @@
"jg 1b \n"
: "+r"(src_bgra), // %0
"+r"(dst_y), // %1
- "+r"(width) // %2
+ "+r"(pix) // %2
: "m"(kBGRAToY), // %3
"m"(kAddY16) // %4
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
@@ -1300,7 +1221,7 @@
);
}
-void ABGRToYRow_SSSE3(const uint8* src_abgr, uint8* dst_y, int width) {
+void ABGRToYRow_SSSE3(const uint8* src_abgr, uint8* dst_y, int pix) {
asm volatile (
"movdqa %4,%%xmm5 \n"
"movdqa %3,%%xmm4 \n"
@@ -1327,7 +1248,7 @@
"jg 1b \n"
: "+r"(src_abgr), // %0
"+r"(dst_y), // %1
- "+r"(width) // %2
+ "+r"(pix) // %2
: "m"(kABGRToY), // %3
"m"(kAddY16) // %4
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
@@ -1334,7 +1255,7 @@
);
}
-void RGBAToYRow_SSSE3(const uint8* src_rgba, uint8* dst_y, int width) {
+void RGBAToYRow_SSSE3(const uint8* src_rgba, uint8* dst_y, int pix) {
asm volatile (
"movdqa %4,%%xmm5 \n"
"movdqa %3,%%xmm4 \n"
@@ -1361,7 +1282,7 @@
"jg 1b \n"
: "+r"(src_rgba), // %0
"+r"(dst_y), // %1
- "+r"(width) // %2
+ "+r"(pix) // %2
: "m"(kRGBAToY), // %3
"m"(kAddY16) // %4
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
@@ -1492,15 +1413,132 @@
#if defined(HAS_I422TOARGBROW_SSSE3) || defined(HAS_I422TOARGBROW_AVX2)
-// Read 8 UV from 444
+struct YuvConstants {
+ lvec8 kUVToB; // 0
+ lvec8 kUVToG; // 32
+ lvec8 kUVToR; // 64
+ lvec16 kUVBiasB; // 96
+ lvec16 kUVBiasG; // 128
+ lvec16 kUVBiasR; // 160
+ lvec16 kYToRgb; // 192
+};
+
+// BT.601 YUV to RGB reference
+// R = (Y - 16) * 1.164 - V * -1.596
+// G = (Y - 16) * 1.164 - U * 0.391 - V * 0.813
+// B = (Y - 16) * 1.164 - U * -2.018
+
+// Y contribution to R,G,B. Scale and bias.
+// TODO(fbarchard): Consider moving constants into a common header.
+#define YG 18997 /* round(1.164 * 64 * 256 * 256 / 257) */
+#define YGB -1160 /* 1.164 * 64 * -16 + 64 / 2 */
+
+// U and V contributions to R,G,B.
+#define UB -128 /* max(-128, round(-2.018 * 64)) */
+#define UG 25 /* round(0.391 * 64) */
+#define VG 52 /* round(0.813 * 64) */
+#define VR -102 /* round(-1.596 * 64) */
+
+// Bias values to subtract 16 from Y and 128 from U and V.
+#define BB (UB * 128 + YGB)
+#define BG (UG * 128 + VG * 128 + YGB)
+#define BR (VR * 128 + YGB)
+
+// BT601 constants for YUV to RGB.
+static YuvConstants SIMD_ALIGNED(kYuvConstants) = {
+ { UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0,
+ UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0 },
+ { UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG,
+ UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG },
+ { 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR,
+ 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR },
+ { BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB },
+ { BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG },
+ { BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR },
+ { YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG }
+};
+
+// BT601 constants for NV21 where chroma plane is VU instead of UV.
+static YuvConstants SIMD_ALIGNED(kYvuConstants) = {
+ { 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB,
+ 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB },
+ { VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG,
+ VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG },
+ { VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0,
+ VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0 },
+ { BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB },
+ { BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG },
+ { BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR },
+ { YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG }
+};
+
+#undef YG
+#undef YGB
+#undef UB
+#undef UG
+#undef VG
+#undef VR
+#undef BB
+#undef BG
+#undef BR
+
+// JPEG YUV to RGB reference
+// * R = Y - V * -1.40200
+// * G = Y - U * 0.34414 - V * 0.71414
+// * B = Y - U * -1.77200
+
+// Y contribution to R,G,B. Scale and bias.
+// TODO(fbarchard): Consider moving constants into a common header.
+#define YGJ 16320 /* round(1.000 * 64 * 256 * 256 / 257) */
+#define YGBJ 32 /* 64 / 2 */
+
+// U and V contributions to R,G,B.
+#define UBJ -113 /* round(-1.77200 * 64) */
+#define UGJ 22 /* round(0.34414 * 64) */
+#define VGJ 46 /* round(0.71414 * 64) */
+#define VRJ -90 /* round(-1.40200 * 64) */
+
+// Bias values to subtract 16 from Y and 128 from U and V.
+#define BBJ (UBJ * 128 + YGBJ)
+#define BGJ (UGJ * 128 + VGJ * 128 + YGBJ)
+#define BRJ (VRJ * 128 + YGBJ)
+
+// JPEG constants for YUV to RGB.
+YuvConstants SIMD_ALIGNED(kYuvJConstants) = {
+ { UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0,
+ UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0 },
+ { UGJ, VGJ, UGJ, VGJ, UGJ, VGJ, UGJ, VGJ,
+ UGJ, VGJ, UGJ, VGJ, UGJ, VGJ, UGJ, VGJ,
+ UGJ, VGJ, UGJ, VGJ, UGJ, VGJ, UGJ, VGJ,
+ UGJ, VGJ, UGJ, VGJ, UGJ, VGJ, UGJ, VGJ },
+ { 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ,
+ 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ },
+ { BBJ, BBJ, BBJ, BBJ, BBJ, BBJ, BBJ, BBJ,
+ BBJ, BBJ, BBJ, BBJ, BBJ, BBJ, BBJ, BBJ },
+ { BGJ, BGJ, BGJ, BGJ, BGJ, BGJ, BGJ, BGJ,
+ BGJ, BGJ, BGJ, BGJ, BGJ, BGJ, BGJ, BGJ },
+ { BRJ, BRJ, BRJ, BRJ, BRJ, BRJ, BRJ, BRJ,
+ BRJ, BRJ, BRJ, BRJ, BRJ, BRJ, BRJ, BRJ },
+ { YGJ, YGJ, YGJ, YGJ, YGJ, YGJ, YGJ, YGJ,
+ YGJ, YGJ, YGJ, YGJ, YGJ, YGJ, YGJ, YGJ }
+};
+
+#undef YGJ
+#undef YGBJ
+#undef UBJ
+#undef UGJ
+#undef VGJ
+#undef VRJ
+#undef BBJ
+#undef BGJ
+#undef BRJ
+
+// Read 8 UV from 411
#define READYUV444 \
"movq " MEMACCESS([u_buf]) ",%%xmm0 \n" \
MEMOPREG(movq, 0x00, [u_buf], [v_buf], 1, xmm1) \
"lea " MEMLEA(0x8, [u_buf]) ",%[u_buf] \n" \
- "punpcklbw %%xmm1,%%xmm0 \n" \
- "movq " MEMACCESS([y_buf]) ",%%xmm4 \n" \
- "punpcklbw %%xmm4,%%xmm4 \n" \
- "lea " MEMLEA(0x8, [y_buf]) ",%[y_buf] \n"
+ "punpcklbw %%xmm1,%%xmm0 \n"
// Read 4 UV from 422, upsample to 8 UV
#define READYUV422 \
@@ -1508,105 +1546,44 @@
MEMOPREG(movd, 0x00, [u_buf], [v_buf], 1, xmm1) \
"lea " MEMLEA(0x4, [u_buf]) ",%[u_buf] \n" \
"punpcklbw %%xmm1,%%xmm0 \n" \
- "punpcklwd %%xmm0,%%xmm0 \n" \
- "movq " MEMACCESS([y_buf]) ",%%xmm4 \n" \
- "punpcklbw %%xmm4,%%xmm4 \n" \
- "lea " MEMLEA(0x8, [y_buf]) ",%[y_buf] \n"
+ "punpcklwd %%xmm0,%%xmm0 \n"
-// Read 4 UV from 422, upsample to 8 UV. With 8 Alpha.
-#define READYUVA422 \
+// Read 2 UV from 411, upsample to 8 UV
+#define READYUV411 \
"movd " MEMACCESS([u_buf]) ",%%xmm0 \n" \
MEMOPREG(movd, 0x00, [u_buf], [v_buf], 1, xmm1) \
- "lea " MEMLEA(0x4, [u_buf]) ",%[u_buf] \n" \
- "punpcklbw %%xmm1,%%xmm0 \n" \
- "punpcklwd %%xmm0,%%xmm0 \n" \
- "movq " MEMACCESS([y_buf]) ",%%xmm4 \n" \
- "punpcklbw %%xmm4,%%xmm4 \n" \
- "lea " MEMLEA(0x8, [y_buf]) ",%[y_buf] \n" \
- "movq " MEMACCESS([a_buf]) ",%%xmm5 \n" \
- "lea " MEMLEA(0x8, [a_buf]) ",%[a_buf] \n"
-
-// Read 2 UV from 411, upsample to 8 UV.
-// reading 4 bytes is an msan violation.
-// "movd " MEMACCESS([u_buf]) ",%%xmm0 \n"
-// MEMOPREG(movd, 0x00, [u_buf], [v_buf], 1, xmm1)
-// pinsrw fails with drmemory
-// __asm pinsrw xmm0, [esi], 0 /* U */
-// __asm pinsrw xmm1, [esi + edi], 0 /* V */
-#define READYUV411_TEMP \
- "movzwl " MEMACCESS([u_buf]) ",%[temp] \n" \
- "movd %[temp],%%xmm0 \n" \
- MEMOPARG(movzwl, 0x00, [u_buf], [v_buf], 1, [temp]) " \n" \
- "movd %[temp],%%xmm1 \n" \
"lea " MEMLEA(0x2, [u_buf]) ",%[u_buf] \n" \
"punpcklbw %%xmm1,%%xmm0 \n" \
"punpcklwd %%xmm0,%%xmm0 \n" \
- "punpckldq %%xmm0,%%xmm0 \n" \
- "movq " MEMACCESS([y_buf]) ",%%xmm4 \n" \
- "punpcklbw %%xmm4,%%xmm4 \n" \
- "lea " MEMLEA(0x8, [y_buf]) ",%[y_buf] \n"
+ "punpckldq %%xmm0,%%xmm0 \n"
// Read 4 UV from NV12, upsample to 8 UV
#define READNV12 \
"movq " MEMACCESS([uv_buf]) ",%%xmm0 \n" \
"lea " MEMLEA(0x8, [uv_buf]) ",%[uv_buf] \n" \
- "punpcklwd %%xmm0,%%xmm0 \n" \
- "movq " MEMACCESS([y_buf]) ",%%xmm4 \n" \
- "punpcklbw %%xmm4,%%xmm4 \n" \
- "lea " MEMLEA(0x8, [y_buf]) ",%[y_buf] \n"
+ "punpcklwd %%xmm0,%%xmm0 \n"
-// Read 4 VU from NV21, upsample to 8 UV
-#define READNV21 \
- "movq " MEMACCESS([vu_buf]) ",%%xmm0 \n" \
- "lea " MEMLEA(0x8, [vu_buf]) ",%[vu_buf] \n" \
- "pshufb %[kShuffleNV21], %%xmm0 \n" \
- "movq " MEMACCESS([y_buf]) ",%%xmm4 \n" \
- "punpcklbw %%xmm4,%%xmm4 \n" \
- "lea " MEMLEA(0x8, [y_buf]) ",%[y_buf] \n"
-
-// Read 4 YUY2 with 8 Y and update 4 UV to 8 UV.
-#define READYUY2 \
- "movdqu " MEMACCESS([yuy2_buf]) ",%%xmm4 \n" \
- "pshufb %[kShuffleYUY2Y], %%xmm4 \n" \
- "movdqu " MEMACCESS([yuy2_buf]) ",%%xmm0 \n" \
- "pshufb %[kShuffleYUY2UV], %%xmm0 \n" \
- "lea " MEMLEA(0x10, [yuy2_buf]) ",%[yuy2_buf] \n"
-
-// Read 4 UYVY with 8 Y and update 4 UV to 8 UV.
-#define READUYVY \
- "movdqu " MEMACCESS([uyvy_buf]) ",%%xmm4 \n" \
- "pshufb %[kShuffleUYVYY], %%xmm4 \n" \
- "movdqu " MEMACCESS([uyvy_buf]) ",%%xmm0 \n" \
- "pshufb %[kShuffleUYVYUV], %%xmm0 \n" \
- "lea " MEMLEA(0x10, [uyvy_buf]) ",%[uyvy_buf] \n"
-
-#if defined(__x86_64__)
-#define YUVTORGB_SETUP(yuvconstants) \
- "movdqa " MEMACCESS([yuvconstants]) ",%%xmm8 \n" \
- "movdqa " MEMACCESS2(32, [yuvconstants]) ",%%xmm9 \n" \
- "movdqa " MEMACCESS2(64, [yuvconstants]) ",%%xmm10 \n" \
- "movdqa " MEMACCESS2(96, [yuvconstants]) ",%%xmm11 \n" \
- "movdqa " MEMACCESS2(128, [yuvconstants]) ",%%xmm12 \n" \
- "movdqa " MEMACCESS2(160, [yuvconstants]) ",%%xmm13 \n" \
- "movdqa " MEMACCESS2(192, [yuvconstants]) ",%%xmm14 \n"
// Convert 8 pixels: 8 UV and 8 Y
-#define YUVTORGB(yuvconstants) \
+#define YUVTORGB(YuvConstants) \
"movdqa %%xmm0,%%xmm1 \n" \
"movdqa %%xmm0,%%xmm2 \n" \
"movdqa %%xmm0,%%xmm3 \n" \
- "movdqa %%xmm11,%%xmm0 \n" \
- "pmaddubsw %%xmm8,%%xmm1 \n" \
+ "movdqa " MEMACCESS2(96, [YuvConstants]) ",%%xmm0 \n" \
+ "pmaddubsw " MEMACCESS([YuvConstants]) ",%%xmm1 \n" \
"psubw %%xmm1,%%xmm0 \n" \
- "movdqa %%xmm12,%%xmm1 \n" \
- "pmaddubsw %%xmm9,%%xmm2 \n" \
+ "movdqa " MEMACCESS2(128, [YuvConstants]) ",%%xmm1 \n" \
+ "pmaddubsw " MEMACCESS2(32, [YuvConstants]) ",%%xmm2 \n" \
"psubw %%xmm2,%%xmm1 \n" \
- "movdqa %%xmm13,%%xmm2 \n" \
- "pmaddubsw %%xmm10,%%xmm3 \n" \
+ "movdqa " MEMACCESS2(160, [YuvConstants]) ",%%xmm2 \n" \
+ "pmaddubsw " MEMACCESS2(64, [YuvConstants]) ",%%xmm3 \n" \
"psubw %%xmm3,%%xmm2 \n" \
- "pmulhuw %%xmm14,%%xmm4 \n" \
- "paddsw %%xmm4,%%xmm0 \n" \
- "paddsw %%xmm4,%%xmm1 \n" \
- "paddsw %%xmm4,%%xmm2 \n" \
+ "movq " MEMACCESS([y_buf]) ",%%xmm3 \n" \
+ "lea " MEMLEA(0x8, [y_buf]) ",%[y_buf] \n" \
+ "punpcklbw %%xmm3,%%xmm3 \n" \
+ "pmulhuw " MEMACCESS2(192, [YuvConstants]) ",%%xmm3 \n" \
+ "paddsw %%xmm3,%%xmm0 \n" \
+ "paddsw %%xmm3,%%xmm1 \n" \
+ "paddsw %%xmm3,%%xmm2 \n" \
"psraw $0x6,%%xmm0 \n" \
"psraw $0x6,%%xmm1 \n" \
"psraw $0x6,%%xmm2 \n" \
@@ -1613,39 +1590,8 @@
"packuswb %%xmm0,%%xmm0 \n" \
"packuswb %%xmm1,%%xmm1 \n" \
"packuswb %%xmm2,%%xmm2 \n"
-#define YUVTORGB_REGS \
- "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14",
-#else
-#define YUVTORGB_SETUP(yuvconstants)
-// Convert 8 pixels: 8 UV and 8 Y
-#define YUVTORGB(yuvconstants) \
- "movdqa %%xmm0,%%xmm1 \n" \
- "movdqa %%xmm0,%%xmm2 \n" \
- "movdqa %%xmm0,%%xmm3 \n" \
- "movdqa " MEMACCESS2(96, [yuvconstants]) ",%%xmm0 \n" \
- "pmaddubsw " MEMACCESS([yuvconstants]) ",%%xmm1 \n" \
- "psubw %%xmm1,%%xmm0 \n" \
- "movdqa " MEMACCESS2(128, [yuvconstants]) ",%%xmm1 \n" \
- "pmaddubsw " MEMACCESS2(32, [yuvconstants]) ",%%xmm2 \n" \
- "psubw %%xmm2,%%xmm1 \n" \
- "movdqa " MEMACCESS2(160, [yuvconstants]) ",%%xmm2 \n" \
- "pmaddubsw " MEMACCESS2(64, [yuvconstants]) ",%%xmm3 \n" \
- "psubw %%xmm3,%%xmm2 \n" \
- "pmulhuw " MEMACCESS2(192, [yuvconstants]) ",%%xmm4 \n" \
- "paddsw %%xmm4,%%xmm0 \n" \
- "paddsw %%xmm4,%%xmm1 \n" \
- "paddsw %%xmm4,%%xmm2 \n" \
- "psraw $0x6,%%xmm0 \n" \
- "psraw $0x6,%%xmm1 \n" \
- "psraw $0x6,%%xmm2 \n" \
- "packuswb %%xmm0,%%xmm0 \n" \
- "packuswb %%xmm1,%%xmm1 \n" \
- "packuswb %%xmm2,%%xmm2 \n"
-#define YUVTORGB_REGS
-#endif
-
-// Store 8 ARGB values.
+// Store 8 ARGB values. Assumes XMM5 is zero.
#define STOREARGB \
"punpcklbw %%xmm1,%%xmm0 \n" \
"punpcklbw %%xmm5,%%xmm2 \n" \
@@ -1656,7 +1602,30 @@
"movdqu %%xmm1," MEMACCESS2(0x10, [dst_argb]) " \n" \
"lea " MEMLEA(0x20, [dst_argb]) ", %[dst_argb] \n"
-// Store 8 RGBA values.
+// Store 8 BGRA values. Assumes XMM5 is zero.
+#define STOREBGRA \
+ "pcmpeqb %%xmm5,%%xmm5 \n" \
+ "punpcklbw %%xmm0,%%xmm1 \n" \
+ "punpcklbw %%xmm2,%%xmm5 \n" \
+ "movdqa %%xmm5,%%xmm0 \n" \
+ "punpcklwd %%xmm1,%%xmm5 \n" \
+ "punpckhwd %%xmm1,%%xmm0 \n" \
+ "movdqu %%xmm5," MEMACCESS([dst_bgra]) " \n" \
+ "movdqu %%xmm0," MEMACCESS2(0x10, [dst_bgra]) " \n" \
+ "lea " MEMLEA(0x20, [dst_bgra]) ", %[dst_bgra] \n"
+
+// Store 8 ABGR values. Assumes XMM5 is zero.
+#define STOREABGR \
+ "punpcklbw %%xmm1,%%xmm2 \n" \
+ "punpcklbw %%xmm5,%%xmm0 \n" \
+ "movdqa %%xmm2,%%xmm1 \n" \
+ "punpcklwd %%xmm0,%%xmm2 \n" \
+ "punpckhwd %%xmm0,%%xmm1 \n" \
+ "movdqu %%xmm2," MEMACCESS([dst_abgr]) " \n" \
+ "movdqu %%xmm1," MEMACCESS2(0x10, [dst_abgr]) " \n" \
+ "lea " MEMLEA(0x20, [dst_abgr]) ", %[dst_abgr] \n"
+
+// Store 8 RGBA values. Assumes XMM5 is zero.
#define STORERGBA \
"pcmpeqb %%xmm5,%%xmm5 \n" \
"punpcklbw %%xmm2,%%xmm1 \n" \
@@ -1672,16 +1641,14 @@
const uint8* u_buf,
const uint8* v_buf,
uint8* dst_argb,
- const struct YuvConstants* yuvconstants,
int width) {
asm volatile (
- YUVTORGB_SETUP(yuvconstants)
"sub %[u_buf],%[v_buf] \n"
"pcmpeqb %%xmm5,%%xmm5 \n"
LABELALIGN
"1: \n"
READYUV444
- YUVTORGB(yuvconstants)
+ YUVTORGB(kYuvConstants)
STOREARGB
"sub $0x8,%[width] \n"
"jg 1b \n"
@@ -1690,20 +1657,19 @@
[v_buf]"+r"(v_buf), // %[v_buf]
[dst_argb]"+r"(dst_argb), // %[dst_argb]
[width]"+rm"(width) // %[width]
- : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
- : "memory", "cc", NACL_R14 YUVTORGB_REGS
- "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+ : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
+ : "memory", "cc", NACL_R14
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
);
}
+// TODO(fbarchard): Consider putting masks into constants.
void OMITFP I422ToRGB24Row_SSSE3(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* dst_rgb24,
- const struct YuvConstants* yuvconstants,
int width) {
asm volatile (
- YUVTORGB_SETUP(yuvconstants)
"movdqa %[kShuffleMaskARGBToRGB24_0],%%xmm5 \n"
"movdqa %[kShuffleMaskARGBToRGB24],%%xmm6 \n"
"sub %[u_buf],%[v_buf] \n"
@@ -1710,7 +1676,7 @@
LABELALIGN
"1: \n"
READYUV422
- YUVTORGB(yuvconstants)
+ YUVTORGB(kYuvConstants)
"punpcklbw %%xmm1,%%xmm0 \n"
"punpcklbw %%xmm2,%%xmm2 \n"
"movdqa %%xmm0,%%xmm1 \n"
@@ -1728,33 +1694,76 @@
[u_buf]"+r"(u_buf), // %[u_buf]
[v_buf]"+r"(v_buf), // %[v_buf]
[dst_rgb24]"+r"(dst_rgb24), // %[dst_rgb24]
+// TODO(fbarchard): Make width a register for 32 bit.
#if defined(__i386__) && defined(__pic__)
[width]"+m"(width) // %[width]
#else
[width]"+rm"(width) // %[width]
#endif
- : [yuvconstants]"r"(yuvconstants), // %[yuvconstants]
+ : [kYuvConstants]"r"(&kYuvConstants.kUVToB),
[kShuffleMaskARGBToRGB24_0]"m"(kShuffleMaskARGBToRGB24_0),
[kShuffleMaskARGBToRGB24]"m"(kShuffleMaskARGBToRGB24)
- : "memory", "cc", NACL_R14 YUVTORGB_REGS
- "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
+ : "memory", "cc", NACL_R14
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm5", "xmm6"
);
}
+void OMITFP I422ToRAWRow_SSSE3(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* dst_raw,
+ int width) {
+ asm volatile (
+ "movdqa %[kShuffleMaskARGBToRAW_0],%%xmm5 \n"
+ "movdqa %[kShuffleMaskARGBToRAW],%%xmm6 \n"
+ "sub %[u_buf],%[v_buf] \n"
+ LABELALIGN
+ "1: \n"
+ READYUV422
+ YUVTORGB(kYuvConstants)
+ "punpcklbw %%xmm1,%%xmm0 \n"
+ "punpcklbw %%xmm2,%%xmm2 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "punpcklwd %%xmm2,%%xmm0 \n"
+ "punpckhwd %%xmm2,%%xmm1 \n"
+ "pshufb %%xmm5,%%xmm0 \n"
+ "pshufb %%xmm6,%%xmm1 \n"
+ "palignr $0xc,%%xmm0,%%xmm1 \n"
+ "movq %%xmm0," MEMACCESS([dst_raw]) " \n"
+ "movdqu %%xmm1," MEMACCESS2(0x8,[dst_raw]) "\n"
+ "lea " MEMLEA(0x18,[dst_raw]) ",%[dst_raw] \n"
+ "subl $0x8,%[width] \n"
+ "jg 1b \n"
+ : [y_buf]"+r"(y_buf), // %[y_buf]
+ [u_buf]"+r"(u_buf), // %[u_buf]
+ [v_buf]"+r"(v_buf), // %[v_buf]
+ [dst_raw]"+r"(dst_raw), // %[dst_raw]
+// TODO(fbarchard): Make width a register for 32 bit.
+#if defined(__i386__) && defined(__pic__)
+ [width]"+m"(width) // %[width]
+#else
+ [width]"+rm"(width) // %[width]
+#endif
+ : [kYuvConstants]"r"(&kYuvConstants.kUVToB),
+ [kShuffleMaskARGBToRAW_0]"m"(kShuffleMaskARGBToRAW_0),
+ [kShuffleMaskARGBToRAW]"m"(kShuffleMaskARGBToRAW)
+ : "memory", "cc", NACL_R14
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm5", "xmm6"
+ );
+}
+
void OMITFP I422ToARGBRow_SSSE3(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* dst_argb,
- const struct YuvConstants* yuvconstants,
int width) {
asm volatile (
- YUVTORGB_SETUP(yuvconstants)
"sub %[u_buf],%[v_buf] \n"
"pcmpeqb %%xmm5,%%xmm5 \n"
LABELALIGN
"1: \n"
READYUV422
- YUVTORGB(yuvconstants)
+ YUVTORGB(kYuvConstants)
STOREARGB
"sub $0x8,%[width] \n"
"jg 1b \n"
@@ -1763,95 +1772,74 @@
[v_buf]"+r"(v_buf), // %[v_buf]
[dst_argb]"+r"(dst_argb), // %[dst_argb]
[width]"+rm"(width) // %[width]
- : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
- : "memory", "cc", NACL_R14 YUVTORGB_REGS
- "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+ : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
+ : "memory", "cc", NACL_R14
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
);
}
-#ifdef HAS_I422ALPHATOARGBROW_SSSE3
-void OMITFP I422AlphaToARGBRow_SSSE3(const uint8* y_buf,
- const uint8* u_buf,
- const uint8* v_buf,
- const uint8* a_buf,
- uint8* dst_argb,
- const struct YuvConstants* yuvconstants,
- int width) {
+void OMITFP J422ToARGBRow_SSSE3(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* dst_argb,
+ int width) {
asm volatile (
- YUVTORGB_SETUP(yuvconstants)
"sub %[u_buf],%[v_buf] \n"
+ "pcmpeqb %%xmm5,%%xmm5 \n"
LABELALIGN
"1: \n"
- READYUVA422
- YUVTORGB(yuvconstants)
+ READYUV422
+ YUVTORGB(kYuvConstants)
STOREARGB
- "subl $0x8,%[width] \n"
+ "sub $0x8,%[width] \n"
"jg 1b \n"
: [y_buf]"+r"(y_buf), // %[y_buf]
[u_buf]"+r"(u_buf), // %[u_buf]
[v_buf]"+r"(v_buf), // %[v_buf]
- [a_buf]"+r"(a_buf), // %[a_buf]
[dst_argb]"+r"(dst_argb), // %[dst_argb]
-#if defined(__i386__) && defined(__pic__)
- [width]"+m"(width) // %[width]
-#else
[width]"+rm"(width) // %[width]
-#endif
- : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
- : "memory", "cc", NACL_R14 YUVTORGB_REGS
- "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+ : [kYuvConstants]"r"(&kYuvJConstants.kUVToB) // %[kYuvConstants]
+ : "memory", "cc", NACL_R14
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
);
}
-#endif // HAS_I422ALPHATOARGBROW_SSSE3
-#ifdef HAS_I411TOARGBROW_SSSE3
void OMITFP I411ToARGBRow_SSSE3(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* dst_argb,
- const struct YuvConstants* yuvconstants,
int width) {
- int temp;
asm volatile (
- YUVTORGB_SETUP(yuvconstants)
"sub %[u_buf],%[v_buf] \n"
"pcmpeqb %%xmm5,%%xmm5 \n"
LABELALIGN
"1: \n"
- READYUV411_TEMP
- YUVTORGB(yuvconstants)
+ READYUV411
+ YUVTORGB(kYuvConstants)
STOREARGB
- "subl $0x8,%[width] \n"
+ "sub $0x8,%[width] \n"
"jg 1b \n"
- : [y_buf]"+r"(y_buf), // %[y_buf]
- [u_buf]"+r"(u_buf), // %[u_buf]
- [v_buf]"+r"(v_buf), // %[v_buf]
+ : [y_buf]"+r"(y_buf), // %[y_buf]
+ [u_buf]"+r"(u_buf), // %[u_buf]
+ [v_buf]"+r"(v_buf), // %[v_buf]
[dst_argb]"+r"(dst_argb), // %[dst_argb]
- [temp]"=&r"(temp), // %[temp]
-#if defined(__i386__) && defined(__pic__)
- [width]"+m"(width) // %[width]
-#else
- [width]"+rm"(width) // %[width]
-#endif
- : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
- : "memory", "cc", NACL_R14 YUVTORGB_REGS
- "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+ [width]"+rm"(width) // %[width]
+ : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
+ : "memory", "cc", NACL_R14
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
);
}
-#endif
void OMITFP NV12ToARGBRow_SSSE3(const uint8* y_buf,
const uint8* uv_buf,
uint8* dst_argb,
- const struct YuvConstants* yuvconstants,
int width) {
asm volatile (
- YUVTORGB_SETUP(yuvconstants)
"pcmpeqb %%xmm5,%%xmm5 \n"
LABELALIGN
"1: \n"
READNV12
- YUVTORGB(yuvconstants)
+ YUVTORGB(kYuvConstants)
STOREARGB
"sub $0x8,%[width] \n"
"jg 1b \n"
@@ -1859,85 +1847,84 @@
[uv_buf]"+r"(uv_buf), // %[uv_buf]
[dst_argb]"+r"(dst_argb), // %[dst_argb]
[width]"+rm"(width) // %[width]
- : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
- : "memory", "cc", YUVTORGB_REGS // Does not use r14.
- "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+ : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
+ // Does not use r14.
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
);
}
void OMITFP NV21ToARGBRow_SSSE3(const uint8* y_buf,
- const uint8* vu_buf,
+ const uint8* uv_buf,
uint8* dst_argb,
- const struct YuvConstants* yuvconstants,
int width) {
asm volatile (
- YUVTORGB_SETUP(yuvconstants)
"pcmpeqb %%xmm5,%%xmm5 \n"
LABELALIGN
"1: \n"
- READNV21
- YUVTORGB(yuvconstants)
+ READNV12
+ YUVTORGB(kYuvConstants)
STOREARGB
"sub $0x8,%[width] \n"
"jg 1b \n"
: [y_buf]"+r"(y_buf), // %[y_buf]
- [vu_buf]"+r"(vu_buf), // %[vu_buf]
+ [uv_buf]"+r"(uv_buf), // %[uv_buf]
[dst_argb]"+r"(dst_argb), // %[dst_argb]
[width]"+rm"(width) // %[width]
- : [yuvconstants]"r"(yuvconstants), // %[yuvconstants]
- [kShuffleNV21]"m"(kShuffleNV21)
- : "memory", "cc", YUVTORGB_REGS // Does not use r14.
- "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+ : [kYuvConstants]"r"(&kYvuConstants.kUVToB) // %[kYuvConstants]
+ // Does not use r14.
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
);
}
-void OMITFP YUY2ToARGBRow_SSSE3(const uint8* yuy2_buf,
- uint8* dst_argb,
- const struct YuvConstants* yuvconstants,
+void OMITFP I422ToBGRARow_SSSE3(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* dst_bgra,
int width) {
asm volatile (
- YUVTORGB_SETUP(yuvconstants)
+ "sub %[u_buf],%[v_buf] \n"
"pcmpeqb %%xmm5,%%xmm5 \n"
LABELALIGN
"1: \n"
- READYUY2
- YUVTORGB(yuvconstants)
- STOREARGB
+ READYUV422
+ YUVTORGB(kYuvConstants)
+ STOREBGRA
"sub $0x8,%[width] \n"
"jg 1b \n"
- : [yuy2_buf]"+r"(yuy2_buf), // %[yuy2_buf]
- [dst_argb]"+r"(dst_argb), // %[dst_argb]
+ : [y_buf]"+r"(y_buf), // %[y_buf]
+ [u_buf]"+r"(u_buf), // %[u_buf]
+ [v_buf]"+r"(v_buf), // %[v_buf]
+ [dst_bgra]"+r"(dst_bgra), // %[dst_bgra]
[width]"+rm"(width) // %[width]
- : [yuvconstants]"r"(yuvconstants), // %[yuvconstants]
- [kShuffleYUY2Y]"m"(kShuffleYUY2Y),
- [kShuffleYUY2UV]"m"(kShuffleYUY2UV)
- : "memory", "cc", YUVTORGB_REGS // Does not use r14.
- "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+ : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
+ : "memory", "cc", NACL_R14
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
);
}
-void OMITFP UYVYToARGBRow_SSSE3(const uint8* uyvy_buf,
- uint8* dst_argb,
- const struct YuvConstants* yuvconstants,
+void OMITFP I422ToABGRRow_SSSE3(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* dst_abgr,
int width) {
asm volatile (
- YUVTORGB_SETUP(yuvconstants)
+ "sub %[u_buf],%[v_buf] \n"
"pcmpeqb %%xmm5,%%xmm5 \n"
LABELALIGN
"1: \n"
- READUYVY
- YUVTORGB(yuvconstants)
- STOREARGB
+ READYUV422
+ YUVTORGB(kYuvConstants)
+ STOREABGR
"sub $0x8,%[width] \n"
"jg 1b \n"
- : [uyvy_buf]"+r"(uyvy_buf), // %[uyvy_buf]
- [dst_argb]"+r"(dst_argb), // %[dst_argb]
+ : [y_buf]"+r"(y_buf), // %[y_buf]
+ [u_buf]"+r"(u_buf), // %[u_buf]
+ [v_buf]"+r"(v_buf), // %[v_buf]
+ [dst_abgr]"+r"(dst_abgr), // %[dst_abgr]
[width]"+rm"(width) // %[width]
- : [yuvconstants]"r"(yuvconstants), // %[yuvconstants]
- [kShuffleUYVYY]"m"(kShuffleUYVYY),
- [kShuffleUYVYUV]"m"(kShuffleUYVYUV)
- : "memory", "cc", YUVTORGB_REGS // Does not use r14.
- "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+ : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
+ : "memory", "cc", NACL_R14
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
);
}
@@ -1945,16 +1932,14 @@
const uint8* u_buf,
const uint8* v_buf,
uint8* dst_rgba,
- const struct YuvConstants* yuvconstants,
int width) {
asm volatile (
- YUVTORGB_SETUP(yuvconstants)
"sub %[u_buf],%[v_buf] \n"
"pcmpeqb %%xmm5,%%xmm5 \n"
LABELALIGN
"1: \n"
READYUV422
- YUVTORGB(yuvconstants)
+ YUVTORGB(kYuvConstants)
STORERGBA
"sub $0x8,%[width] \n"
"jg 1b \n"
@@ -1963,191 +1948,76 @@
[v_buf]"+r"(v_buf), // %[v_buf]
[dst_rgba]"+r"(dst_rgba), // %[dst_rgba]
[width]"+rm"(width) // %[width]
- : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
- : "memory", "cc", NACL_R14 YUVTORGB_REGS
- "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+ : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
+ : "memory", "cc", NACL_R14
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
);
}
#endif // HAS_I422TOARGBROW_SSSE3
-// Read 16 UV from 444
-#define READYUV444_AVX2 \
- "vmovdqu " MEMACCESS([u_buf]) ",%%xmm0 \n" \
- MEMOPREG(vmovdqu, 0x00, [u_buf], [v_buf], 1, xmm1) \
- "lea " MEMLEA(0x10, [u_buf]) ",%[u_buf] \n" \
- "vpermq $0xd8,%%ymm0,%%ymm0 \n" \
- "vpermq $0xd8,%%ymm1,%%ymm1 \n" \
- "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" \
- "vmovdqu " MEMACCESS([y_buf]) ",%%xmm4 \n" \
- "vpermq $0xd8,%%ymm4,%%ymm4 \n" \
- "vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \
- "lea " MEMLEA(0x10, [y_buf]) ",%[y_buf] \n"
-
// Read 8 UV from 422, upsample to 16 UV.
#define READYUV422_AVX2 \
- "vmovq " MEMACCESS([u_buf]) ",%%xmm0 \n" \
+ "vmovq " MEMACCESS([u_buf]) ",%%xmm0 \n" \
MEMOPREG(vmovq, 0x00, [u_buf], [v_buf], 1, xmm1) \
"lea " MEMLEA(0x8, [u_buf]) ",%[u_buf] \n" \
"vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" \
"vpermq $0xd8,%%ymm0,%%ymm0 \n" \
- "vpunpcklwd %%ymm0,%%ymm0,%%ymm0 \n" \
- "vmovdqu " MEMACCESS([y_buf]) ",%%xmm4 \n" \
- "vpermq $0xd8,%%ymm4,%%ymm4 \n" \
- "vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \
- "lea " MEMLEA(0x10, [y_buf]) ",%[y_buf] \n"
+ "vpunpcklwd %%ymm0,%%ymm0,%%ymm0 \n"
-// Read 8 UV from 422, upsample to 16 UV. With 16 Alpha.
-#define READYUVA422_AVX2 \
- "vmovq " MEMACCESS([u_buf]) ",%%xmm0 \n" \
- MEMOPREG(vmovq, 0x00, [u_buf], [v_buf], 1, xmm1) \
- "lea " MEMLEA(0x8, [u_buf]) ",%[u_buf] \n" \
- "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" \
- "vpermq $0xd8,%%ymm0,%%ymm0 \n" \
- "vpunpcklwd %%ymm0,%%ymm0,%%ymm0 \n" \
- "vmovdqu " MEMACCESS([y_buf]) ",%%xmm4 \n" \
- "vpermq $0xd8,%%ymm4,%%ymm4 \n" \
- "vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \
- "lea " MEMLEA(0x10, [y_buf]) ",%[y_buf] \n" \
- "vmovdqu " MEMACCESS([a_buf]) ",%%xmm5 \n" \
- "vpermq $0xd8,%%ymm5,%%ymm5 \n" \
- "lea " MEMLEA(0x10, [a_buf]) ",%[a_buf] \n"
-
-// Read 4 UV from 411, upsample to 16 UV.
-#define READYUV411_AVX2 \
- "vmovd " MEMACCESS([u_buf]) ",%%xmm0 \n" \
- MEMOPREG(vmovd, 0x00, [u_buf], [v_buf], 1, xmm1) \
- "lea " MEMLEA(0x4, [u_buf]) ",%[u_buf] \n" \
- "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" \
- "vpunpcklwd %%ymm0,%%ymm0,%%ymm0 \n" \
- "vpermq $0xd8,%%ymm0,%%ymm0 \n" \
- "vpunpckldq %%ymm0,%%ymm0,%%ymm0 \n" \
- "vmovdqu " MEMACCESS([y_buf]) ",%%xmm4 \n" \
- "vpermq $0xd8,%%ymm4,%%ymm4 \n" \
- "vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \
- "lea " MEMLEA(0x10, [y_buf]) ",%[y_buf] \n"
-
-// Read 8 UV from NV12, upsample to 16 UV.
-#define READNV12_AVX2 \
- "vmovdqu " MEMACCESS([uv_buf]) ",%%xmm0 \n" \
- "lea " MEMLEA(0x10, [uv_buf]) ",%[uv_buf] \n" \
- "vpermq $0xd8,%%ymm0,%%ymm0 \n" \
- "vpunpcklwd %%ymm0,%%ymm0,%%ymm0 \n" \
- "vmovdqu " MEMACCESS([y_buf]) ",%%xmm4 \n" \
- "vpermq $0xd8,%%ymm4,%%ymm4 \n" \
- "vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \
- "lea " MEMLEA(0x10, [y_buf]) ",%[y_buf] \n"
-
-// Read 8 VU from NV21, upsample to 16 UV.
-#define READNV21_AVX2 \
- "vmovdqu " MEMACCESS([vu_buf]) ",%%xmm0 \n" \
- "lea " MEMLEA(0x10, [vu_buf]) ",%[vu_buf] \n" \
- "vpermq $0xd8,%%ymm0,%%ymm0 \n" \
- "vpshufb %[kShuffleNV21], %%ymm0, %%ymm0 \n" \
- "vmovdqu " MEMACCESS([y_buf]) ",%%xmm4 \n" \
- "vpermq $0xd8,%%ymm4,%%ymm4 \n" \
- "vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \
- "lea " MEMLEA(0x10, [y_buf]) ",%[y_buf] \n"
-
-// Read 8 YUY2 with 16 Y and upsample 8 UV to 16 UV.
-#define READYUY2_AVX2 \
- "vmovdqu " MEMACCESS([yuy2_buf]) ",%%ymm4 \n" \
- "vpshufb %[kShuffleYUY2Y], %%ymm4, %%ymm4 \n" \
- "vmovdqu " MEMACCESS([yuy2_buf]) ",%%ymm0 \n" \
- "vpshufb %[kShuffleYUY2UV], %%ymm0, %%ymm0 \n" \
- "lea " MEMLEA(0x20, [yuy2_buf]) ",%[yuy2_buf] \n"
-
-// Read 8 UYVY with 16 Y and upsample 8 UV to 16 UV.
-#define READUYVY_AVX2 \
- "vmovdqu " MEMACCESS([uyvy_buf]) ",%%ymm4 \n" \
- "vpshufb %[kShuffleUYVYY], %%ymm4, %%ymm4 \n" \
- "vmovdqu " MEMACCESS([uyvy_buf]) ",%%ymm0 \n" \
- "vpshufb %[kShuffleUYVYUV], %%ymm0, %%ymm0 \n" \
- "lea " MEMLEA(0x20, [uyvy_buf]) ",%[uyvy_buf] \n"
-
-#if defined(__x86_64__)
-#define YUVTORGB_SETUP_AVX2(yuvconstants) \
- "vmovdqa " MEMACCESS([yuvconstants]) ",%%ymm8 \n" \
- "vmovdqa " MEMACCESS2(32, [yuvconstants]) ",%%ymm9 \n" \
- "vmovdqa " MEMACCESS2(64, [yuvconstants]) ",%%ymm10 \n" \
- "vmovdqa " MEMACCESS2(96, [yuvconstants]) ",%%ymm11 \n" \
- "vmovdqa " MEMACCESS2(128, [yuvconstants]) ",%%ymm12 \n" \
- "vmovdqa " MEMACCESS2(160, [yuvconstants]) ",%%ymm13 \n" \
- "vmovdqa " MEMACCESS2(192, [yuvconstants]) ",%%ymm14 \n"
-#define YUVTORGB_AVX2(yuvconstants) \
- "vpmaddubsw %%ymm10,%%ymm0,%%ymm2 \n" \
- "vpmaddubsw %%ymm9,%%ymm0,%%ymm1 \n" \
- "vpmaddubsw %%ymm8,%%ymm0,%%ymm0 \n" \
- "vpsubw %%ymm2,%%ymm13,%%ymm2 \n" \
- "vpsubw %%ymm1,%%ymm12,%%ymm1 \n" \
- "vpsubw %%ymm0,%%ymm11,%%ymm0 \n" \
- "vpmulhuw %%ymm14,%%ymm4,%%ymm4 \n" \
- "vpaddsw %%ymm4,%%ymm0,%%ymm0 \n" \
- "vpaddsw %%ymm4,%%ymm1,%%ymm1 \n" \
- "vpaddsw %%ymm4,%%ymm2,%%ymm2 \n" \
- "vpsraw $0x6,%%ymm0,%%ymm0 \n" \
- "vpsraw $0x6,%%ymm1,%%ymm1 \n" \
- "vpsraw $0x6,%%ymm2,%%ymm2 \n" \
- "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" \
- "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n" \
- "vpackuswb %%ymm2,%%ymm2,%%ymm2 \n"
-#define YUVTORGB_REGS_AVX2 \
- "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14",
-#else // Convert 16 pixels: 16 UV and 16 Y.
-#define YUVTORGB_SETUP_AVX2(yuvconstants)
-#define YUVTORGB_AVX2(yuvconstants) \
- "vpmaddubsw " MEMACCESS2(64, [yuvconstants]) ",%%ymm0,%%ymm2 \n" \
- "vpmaddubsw " MEMACCESS2(32, [yuvconstants]) ",%%ymm0,%%ymm1 \n" \
- "vpmaddubsw " MEMACCESS([yuvconstants]) ",%%ymm0,%%ymm0 \n" \
- "vmovdqu " MEMACCESS2(160, [yuvconstants]) ",%%ymm3 \n" \
+// Convert 16 pixels: 16 UV and 16 Y.
+#define YUVTORGB_AVX2(YuvConstants) \
+ "vpmaddubsw " MEMACCESS2(64, [YuvConstants]) ",%%ymm0,%%ymm2 \n" \
+ "vpmaddubsw " MEMACCESS2(32, [YuvConstants]) ",%%ymm0,%%ymm1 \n" \
+ "vpmaddubsw " MEMACCESS([YuvConstants]) ",%%ymm0,%%ymm0 \n" \
+ "vmovdqu " MEMACCESS2(160, [YuvConstants]) ",%%ymm3 \n" \
"vpsubw %%ymm2,%%ymm3,%%ymm2 \n" \
- "vmovdqu " MEMACCESS2(128, [yuvconstants]) ",%%ymm3 \n" \
+ "vmovdqu " MEMACCESS2(128, [YuvConstants]) ",%%ymm3 \n" \
"vpsubw %%ymm1,%%ymm3,%%ymm1 \n" \
- "vmovdqu " MEMACCESS2(96, [yuvconstants]) ",%%ymm3 \n" \
+ "vmovdqu " MEMACCESS2(96, [YuvConstants]) ",%%ymm3 \n" \
"vpsubw %%ymm0,%%ymm3,%%ymm0 \n" \
- "vpmulhuw " MEMACCESS2(192, [yuvconstants]) ",%%ymm4,%%ymm4 \n" \
- "vpaddsw %%ymm4,%%ymm0,%%ymm0 \n" \
- "vpaddsw %%ymm4,%%ymm1,%%ymm1 \n" \
- "vpaddsw %%ymm4,%%ymm2,%%ymm2 \n" \
- "vpsraw $0x6,%%ymm0,%%ymm0 \n" \
- "vpsraw $0x6,%%ymm1,%%ymm1 \n" \
- "vpsraw $0x6,%%ymm2,%%ymm2 \n" \
- "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" \
- "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n" \
- "vpackuswb %%ymm2,%%ymm2,%%ymm2 \n"
-#define YUVTORGB_REGS_AVX2
-#endif
+ "vmovdqu " MEMACCESS([y_buf]) ",%%xmm3 \n" \
+ "lea " MEMLEA(0x10, [y_buf]) ",%[y_buf] \n" \
+ "vpermq $0xd8,%%ymm3,%%ymm3 \n" \
+ "vpunpcklbw %%ymm3,%%ymm3,%%ymm3 \n" \
+ "vpmulhuw " MEMACCESS2(192, [YuvConstants]) ",%%ymm3,%%ymm3 \n" \
+ "vpaddsw %%ymm3,%%ymm0,%%ymm0 \n" \
+ "vpaddsw %%ymm3,%%ymm1,%%ymm1 \n" \
+ "vpaddsw %%ymm3,%%ymm2,%%ymm2 \n" \
+ "vpsraw $0x6,%%ymm0,%%ymm0 \n" \
+ "vpsraw $0x6,%%ymm1,%%ymm1 \n" \
+ "vpsraw $0x6,%%ymm2,%%ymm2 \n" \
+ "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" \
+ "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n" \
+ "vpackuswb %%ymm2,%%ymm2,%%ymm2 \n"
-// Store 16 ARGB values.
-#define STOREARGB_AVX2 \
- "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" \
- "vpermq $0xd8,%%ymm0,%%ymm0 \n" \
- "vpunpcklbw %%ymm5,%%ymm2,%%ymm2 \n" \
- "vpermq $0xd8,%%ymm2,%%ymm2 \n" \
- "vpunpcklwd %%ymm2,%%ymm0,%%ymm1 \n" \
- "vpunpckhwd %%ymm2,%%ymm0,%%ymm0 \n" \
- "vmovdqu %%ymm1," MEMACCESS([dst_argb]) " \n" \
- "vmovdqu %%ymm0," MEMACCESS2(0x20, [dst_argb]) " \n" \
- "lea " MEMLEA(0x40, [dst_argb]) ", %[dst_argb] \n"
-
-#ifdef HAS_I444TOARGBROW_AVX2
+#if defined(HAS_I422TOBGRAROW_AVX2)
// 16 pixels
-// 16 UV values with 16 Y producing 16 ARGB (64 bytes).
-void OMITFP I444ToARGBRow_AVX2(const uint8* y_buf,
+// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 BGRA (64 bytes).
+void OMITFP I422ToBGRARow_AVX2(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
- uint8* dst_argb,
- const struct YuvConstants* yuvconstants,
+ uint8* dst_bgra,
int width) {
asm volatile (
- YUVTORGB_SETUP_AVX2(yuvconstants)
"sub %[u_buf],%[v_buf] \n"
- "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
+ "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
LABELALIGN
"1: \n"
- READYUV444_AVX2
- YUVTORGB_AVX2(yuvconstants)
- STOREARGB_AVX2
+ READYUV422_AVX2
+ YUVTORGB_AVX2(kYuvConstants)
+
+ // Step 3: Weave into BGRA
+ "vpunpcklbw %%ymm0,%%ymm1,%%ymm1 \n" // GB
+ "vpermq $0xd8,%%ymm1,%%ymm1 \n"
+ "vpunpcklbw %%ymm2,%%ymm5,%%ymm2 \n" // AR
+ "vpermq $0xd8,%%ymm2,%%ymm2 \n"
+ "vpunpcklwd %%ymm1,%%ymm2,%%ymm0 \n" // ARGB first 8 pixels
+ "vpunpckhwd %%ymm1,%%ymm2,%%ymm2 \n" // ARGB next 8 pixels
+
+ "vmovdqu %%ymm0," MEMACCESS([dst_bgra]) "\n"
+ "vmovdqu %%ymm2," MEMACCESS2(0x20,[dst_bgra]) "\n"
+ "lea " MEMLEA(0x40,[dst_bgra]) ",%[dst_bgra] \n"
"sub $0x10,%[width] \n"
"jg 1b \n"
"vzeroupper \n"
@@ -2154,33 +2024,42 @@
: [y_buf]"+r"(y_buf), // %[y_buf]
[u_buf]"+r"(u_buf), // %[u_buf]
[v_buf]"+r"(v_buf), // %[v_buf]
- [dst_argb]"+r"(dst_argb), // %[dst_argb]
+ [dst_bgra]"+r"(dst_bgra), // %[dst_bgra]
[width]"+rm"(width) // %[width]
- : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
- : "memory", "cc", NACL_R14 YUVTORGB_REGS_AVX2
- "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+ : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
+ : "memory", "cc", NACL_R14
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
);
}
-#endif // HAS_I444TOARGBROW_AVX2
+#endif // HAS_I422TOBGRAROW_AVX2
-#ifdef HAS_I411TOARGBROW_AVX2
+#if defined(HAS_I422TOARGBROW_AVX2)
// 16 pixels
-// 4 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
-void OMITFP I411ToARGBRow_AVX2(const uint8* y_buf,
+// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
+void OMITFP I422ToARGBRow_AVX2(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* dst_argb,
- const struct YuvConstants* yuvconstants,
int width) {
asm volatile (
- YUVTORGB_SETUP_AVX2(yuvconstants)
"sub %[u_buf],%[v_buf] \n"
- "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
+ "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
LABELALIGN
"1: \n"
- READYUV411_AVX2
- YUVTORGB_AVX2(yuvconstants)
- STOREARGB_AVX2
+ READYUV422_AVX2
+ YUVTORGB_AVX2(kYuvConstants)
+
+ // Step 3: Weave into ARGB
+ "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" // BG
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ "vpunpcklbw %%ymm5,%%ymm2,%%ymm2 \n" // RA
+ "vpermq $0xd8,%%ymm2,%%ymm2 \n"
+ "vpunpcklwd %%ymm2,%%ymm0,%%ymm1 \n" // BGRA first 8 pixels
+ "vpunpckhwd %%ymm2,%%ymm0,%%ymm0 \n" // BGRA next 8 pixels
+
+ "vmovdqu %%ymm1," MEMACCESS([dst_argb]) "\n"
+ "vmovdqu %%ymm0," MEMACCESS2(0x20,[dst_argb]) "\n"
+ "lea " MEMLEA(0x40,[dst_argb]) ",%[dst_argb] \n"
"sub $0x10,%[width] \n"
"jg 1b \n"
"vzeroupper \n"
@@ -2189,31 +2068,40 @@
[v_buf]"+r"(v_buf), // %[v_buf]
[dst_argb]"+r"(dst_argb), // %[dst_argb]
[width]"+rm"(width) // %[width]
- : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
- : "memory", "cc", NACL_R14 YUVTORGB_REGS_AVX2
- "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+ : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
+ : "memory", "cc", NACL_R14
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
);
}
-#endif // HAS_I411TOARGBROW_AVX2
+#endif // HAS_I422TOARGBROW_AVX2
-#if defined(HAS_I422TOARGBROW_AVX2)
+#if defined(HAS_J422TOARGBROW_AVX2)
// 16 pixels
// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
-void OMITFP I422ToARGBRow_AVX2(const uint8* y_buf,
+void OMITFP J422ToARGBRow_AVX2(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* dst_argb,
- const struct YuvConstants* yuvconstants,
int width) {
asm volatile (
- YUVTORGB_SETUP_AVX2(yuvconstants)
"sub %[u_buf],%[v_buf] \n"
- "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
+ "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
LABELALIGN
"1: \n"
READYUV422_AVX2
- YUVTORGB_AVX2(yuvconstants)
- STOREARGB_AVX2
+ YUVTORGB_AVX2(kYuvConstants)
+
+ // Step 3: Weave into ARGB
+ "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" // BG
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ "vpunpcklbw %%ymm5,%%ymm2,%%ymm2 \n" // RA
+ "vpermq $0xd8,%%ymm2,%%ymm2 \n"
+ "vpunpcklwd %%ymm2,%%ymm0,%%ymm1 \n" // BGRA first 8 pixels
+ "vpunpckhwd %%ymm2,%%ymm0,%%ymm0 \n" // BGRA next 8 pixels
+
+ "vmovdqu %%ymm1," MEMACCESS([dst_argb]) "\n"
+ "vmovdqu %%ymm0," MEMACCESS2(0x20,[dst_argb]) "\n"
+ "lea " MEMLEA(0x40,[dst_argb]) ",%[dst_argb] \n"
"sub $0x10,%[width] \n"
"jg 1b \n"
"vzeroupper \n"
@@ -2222,50 +2110,53 @@
[v_buf]"+r"(v_buf), // %[v_buf]
[dst_argb]"+r"(dst_argb), // %[dst_argb]
[width]"+rm"(width) // %[width]
- : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
- : "memory", "cc", NACL_R14 YUVTORGB_REGS_AVX2
- "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+ : [kYuvConstants]"r"(&kYuvJConstants.kUVToB) // %[kYuvConstants]
+ : "memory", "cc", NACL_R14
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
);
}
-#endif // HAS_I422TOARGBROW_AVX2
+#endif // HAS_J422TOARGBROW_AVX2
-#if defined(HAS_I422ALPHATOARGBROW_AVX2)
+#if defined(HAS_I422TOABGRROW_AVX2)
// 16 pixels
-// 8 UV values upsampled to 16 UV, mixed with 16 Y and 16 A producing 16 ARGB.
-void OMITFP I422AlphaToARGBRow_AVX2(const uint8* y_buf,
+// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ABGR (64 bytes).
+void OMITFP I422ToABGRRow_AVX2(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
- const uint8* a_buf,
uint8* dst_argb,
- const struct YuvConstants* yuvconstants,
int width) {
asm volatile (
- YUVTORGB_SETUP_AVX2(yuvconstants)
"sub %[u_buf],%[v_buf] \n"
+ "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
LABELALIGN
"1: \n"
- READYUVA422_AVX2
- YUVTORGB_AVX2(yuvconstants)
- STOREARGB_AVX2
- "subl $0x10,%[width] \n"
+ READYUV422_AVX2
+ YUVTORGB_AVX2(kYuvConstants)
+
+ // Step 3: Weave into ABGR
+ "vpunpcklbw %%ymm1,%%ymm2,%%ymm1 \n" // RG
+ "vpermq $0xd8,%%ymm1,%%ymm1 \n"
+ "vpunpcklbw %%ymm5,%%ymm0,%%ymm2 \n" // BA
+ "vpermq $0xd8,%%ymm2,%%ymm2 \n"
+ "vpunpcklwd %%ymm2,%%ymm1,%%ymm0 \n" // RGBA first 8 pixels
+ "vpunpckhwd %%ymm2,%%ymm1,%%ymm1 \n" // RGBA next 8 pixels
+ "vmovdqu %%ymm0," MEMACCESS([dst_argb]) "\n"
+ "vmovdqu %%ymm1," MEMACCESS2(0x20,[dst_argb]) "\n"
+ "lea " MEMLEA(0x40,[dst_argb]) ",%[dst_argb] \n"
+ "sub $0x10,%[width] \n"
"jg 1b \n"
"vzeroupper \n"
: [y_buf]"+r"(y_buf), // %[y_buf]
[u_buf]"+r"(u_buf), // %[u_buf]
[v_buf]"+r"(v_buf), // %[v_buf]
- [a_buf]"+r"(a_buf), // %[a_buf]
[dst_argb]"+r"(dst_argb), // %[dst_argb]
-#if defined(__i386__) && defined(__pic__)
- [width]"+m"(width) // %[width]
-#else
[width]"+rm"(width) // %[width]
-#endif
- : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
- : "memory", "cc", NACL_R14 YUVTORGB_REGS_AVX2
- "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+ : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
+ : "memory", "cc", NACL_R14
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
);
}
-#endif // HAS_I422ALPHATOARGBROW_AVX2
+#endif // HAS_I422TOABGRROW_AVX2
#if defined(HAS_I422TORGBAROW_AVX2)
// 16 pixels
@@ -2274,16 +2165,14 @@
const uint8* u_buf,
const uint8* v_buf,
uint8* dst_argb,
- const struct YuvConstants* yuvconstants,
int width) {
asm volatile (
- YUVTORGB_SETUP_AVX2(yuvconstants)
"sub %[u_buf],%[v_buf] \n"
"vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
LABELALIGN
"1: \n"
READYUV422_AVX2
- YUVTORGB_AVX2(yuvconstants)
+ YUVTORGB_AVX2(kYuvConstants)
// Step 3: Weave into RGBA
"vpunpcklbw %%ymm2,%%ymm1,%%ymm1 \n"
@@ -2303,134 +2192,13 @@
[v_buf]"+r"(v_buf), // %[v_buf]
[dst_argb]"+r"(dst_argb), // %[dst_argb]
[width]"+rm"(width) // %[width]
- : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
- : "memory", "cc", NACL_R14 YUVTORGB_REGS_AVX2
- "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+ : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
+ : "memory", "cc", NACL_R14
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
);
}
#endif // HAS_I422TORGBAROW_AVX2
-#if defined(HAS_NV12TOARGBROW_AVX2)
-// 16 pixels.
-// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
-void OMITFP NV12ToARGBRow_AVX2(const uint8* y_buf,
- const uint8* uv_buf,
- uint8* dst_argb,
- const struct YuvConstants* yuvconstants,
- int width) {
- asm volatile (
- YUVTORGB_SETUP_AVX2(yuvconstants)
- "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
- LABELALIGN
- "1: \n"
- READNV12_AVX2
- YUVTORGB_AVX2(yuvconstants)
- STOREARGB_AVX2
- "sub $0x10,%[width] \n"
- "jg 1b \n"
- "vzeroupper \n"
- : [y_buf]"+r"(y_buf), // %[y_buf]
- [uv_buf]"+r"(uv_buf), // %[uv_buf]
- [dst_argb]"+r"(dst_argb), // %[dst_argb]
- [width]"+rm"(width) // %[width]
- : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
- : "memory", "cc", YUVTORGB_REGS_AVX2 // Does not use r14.
- "xmm0", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
- );
-}
-#endif // HAS_NV12TOARGBROW_AVX2
-
-#if defined(HAS_NV21TOARGBROW_AVX2)
-// 16 pixels.
-// 8 VU values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
-void OMITFP NV21ToARGBRow_AVX2(const uint8* y_buf,
- const uint8* vu_buf,
- uint8* dst_argb,
- const struct YuvConstants* yuvconstants,
- int width) {
- asm volatile (
- YUVTORGB_SETUP_AVX2(yuvconstants)
- "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
- LABELALIGN
- "1: \n"
- READNV21_AVX2
- YUVTORGB_AVX2(yuvconstants)
- STOREARGB_AVX2
- "sub $0x10,%[width] \n"
- "jg 1b \n"
- "vzeroupper \n"
- : [y_buf]"+r"(y_buf), // %[y_buf]
- [vu_buf]"+r"(vu_buf), // %[vu_buf]
- [dst_argb]"+r"(dst_argb), // %[dst_argb]
- [width]"+rm"(width) // %[width]
- : [yuvconstants]"r"(yuvconstants), // %[yuvconstants]
- [kShuffleNV21]"m"(kShuffleNV21)
- : "memory", "cc", YUVTORGB_REGS_AVX2 // Does not use r14.
- "xmm0", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
- );
-}
-#endif // HAS_NV21TOARGBROW_AVX2
-
-#if defined(HAS_YUY2TOARGBROW_AVX2)
-// 16 pixels.
-// 8 YUY2 values with 16 Y and 8 UV producing 16 ARGB (64 bytes).
-void OMITFP YUY2ToARGBRow_AVX2(const uint8* yuy2_buf,
- uint8* dst_argb,
- const struct YuvConstants* yuvconstants,
- int width) {
- asm volatile (
- YUVTORGB_SETUP_AVX2(yuvconstants)
- "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
- LABELALIGN
- "1: \n"
- READYUY2_AVX2
- YUVTORGB_AVX2(yuvconstants)
- STOREARGB_AVX2
- "sub $0x10,%[width] \n"
- "jg 1b \n"
- "vzeroupper \n"
- : [yuy2_buf]"+r"(yuy2_buf), // %[yuy2_buf]
- [dst_argb]"+r"(dst_argb), // %[dst_argb]
- [width]"+rm"(width) // %[width]
- : [yuvconstants]"r"(yuvconstants), // %[yuvconstants]
- [kShuffleYUY2Y]"m"(kShuffleYUY2Y),
- [kShuffleYUY2UV]"m"(kShuffleYUY2UV)
- : "memory", "cc", YUVTORGB_REGS_AVX2 // Does not use r14.
- "xmm0", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
- );
-}
-#endif // HAS_YUY2TOARGBROW_AVX2
-
-#if defined(HAS_UYVYTOARGBROW_AVX2)
-// 16 pixels.
-// 8 UYVY values with 16 Y and 8 UV producing 16 ARGB (64 bytes).
-void OMITFP UYVYToARGBRow_AVX2(const uint8* uyvy_buf,
- uint8* dst_argb,
- const struct YuvConstants* yuvconstants,
- int width) {
- asm volatile (
- YUVTORGB_SETUP_AVX2(yuvconstants)
- "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
- LABELALIGN
- "1: \n"
- READUYVY_AVX2
- YUVTORGB_AVX2(yuvconstants)
- STOREARGB_AVX2
- "sub $0x10,%[width] \n"
- "jg 1b \n"
- "vzeroupper \n"
- : [uyvy_buf]"+r"(uyvy_buf), // %[uyvy_buf]
- [dst_argb]"+r"(dst_argb), // %[dst_argb]
- [width]"+rm"(width) // %[width]
- : [yuvconstants]"r"(yuvconstants), // %[yuvconstants]
- [kShuffleUYVYY]"m"(kShuffleUYVYY),
- [kShuffleUYVYUV]"m"(kShuffleUYVYUV)
- : "memory", "cc", YUVTORGB_REGS_AVX2 // Does not use r14.
- "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
- );
-}
-#endif // HAS_UYVYTOARGBROW_AVX2
-
#ifdef HAS_I400TOARGBROW_SSE2
void I400ToARGBRow_SSE2(const uint8* y_buf, uint8* dst_argb, int width) {
asm volatile (
@@ -2576,7 +2344,35 @@
}
#endif // HAS_MIRRORROW_AVX2
-#ifdef HAS_MIRRORUVROW_SSSE3
+#ifdef HAS_MIRRORROW_SSE2
+void MirrorRow_SSE2(const uint8* src, uint8* dst, int width) {
+ intptr_t temp_width = (intptr_t)(width);
+ asm volatile (
+ LABELALIGN
+ "1: \n"
+ MEMOPREG(movdqu,-0x10,0,2,1,xmm0) // movdqu -0x10(%0,%2),%%xmm0
+ "movdqa %%xmm0,%%xmm1 \n"
+ "psllw $0x8,%%xmm0 \n"
+ "psrlw $0x8,%%xmm1 \n"
+ "por %%xmm1,%%xmm0 \n"
+ "pshuflw $0x1b,%%xmm0,%%xmm0 \n"
+ "pshufhw $0x1b,%%xmm0,%%xmm0 \n"
+ "pshufd $0x4e,%%xmm0,%%xmm0 \n"
+ "movdqu %%xmm0," MEMACCESS(1) " \n"
+ "lea " MEMLEA(0x10,1)",%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(temp_width) // %2
+ :
+ : "memory", "cc", NACL_R14
+ "xmm0", "xmm1"
+ );
+}
+#endif // HAS_MIRRORROW_SSE2
+
+#ifdef HAS_MIRRORROW_UV_SSSE3
// Shuffle table for reversing the bytes of UV channels.
static uvec8 kShuffleMirrorUV = {
14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u, 15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u
@@ -2607,7 +2403,7 @@
"xmm0", "xmm1"
);
}
-#endif // HAS_MIRRORUVROW_SSSE3
+#endif // HAS_MIRRORROW_UV_SSSE3
#ifdef HAS_ARGBMIRRORROW_SSE2
@@ -2662,8 +2458,7 @@
#endif // HAS_ARGBMIRRORROW_AVX2
#ifdef HAS_SPLITUVROW_AVX2
-void SplitUVRow_AVX2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
- int width) {
+void SplitUVRow_AVX2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) {
asm volatile (
"vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
"vpsrlw $0x8,%%ymm5,%%ymm5 \n"
@@ -2690,7 +2485,7 @@
: "+r"(src_uv), // %0
"+r"(dst_u), // %1
"+r"(dst_v), // %2
- "+r"(width) // %3
+ "+r"(pix) // %3
:
: "memory", "cc", NACL_R14
"xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
@@ -2699,8 +2494,7 @@
#endif // HAS_SPLITUVROW_AVX2
#ifdef HAS_SPLITUVROW_SSE2
-void SplitUVRow_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
- int width) {
+void SplitUVRow_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) {
asm volatile (
"pcmpeqb %%xmm5,%%xmm5 \n"
"psrlw $0x8,%%xmm5 \n"
@@ -2726,7 +2520,7 @@
: "+r"(src_uv), // %0
"+r"(dst_u), // %1
"+r"(dst_v), // %2
- "+r"(width) // %3
+ "+r"(pix) // %3
:
: "memory", "cc", NACL_R14
"xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
@@ -2797,23 +2591,8 @@
#ifdef HAS_COPYROW_SSE2
void CopyRow_SSE2(const uint8* src, uint8* dst, int count) {
asm volatile (
- "test $0xf,%0 \n"
- "jne 2f \n"
- "test $0xf,%1 \n"
- "jne 2f \n"
LABELALIGN
"1: \n"
- "movdqa " MEMACCESS(0) ",%%xmm0 \n"
- "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
- "lea " MEMLEA(0x20,0) ",%0 \n"
- "movdqa %%xmm0," MEMACCESS(1) " \n"
- "movdqa %%xmm1," MEMACCESS2(0x10,1) " \n"
- "lea " MEMLEA(0x20,1) ",%1 \n"
- "sub $0x20,%2 \n"
- "jg 1b \n"
- "jmp 9f \n"
- LABELALIGN
- "2: \n"
"movdqu " MEMACCESS(0) ",%%xmm0 \n"
"movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
"lea " MEMLEA(0x20,0) ",%0 \n"
@@ -2821,8 +2600,7 @@
"movdqu %%xmm1," MEMACCESS2(0x10,1) " \n"
"lea " MEMLEA(0x20,1) ",%1 \n"
"sub $0x20,%2 \n"
- "jg 2b \n"
- "9: \n"
+ "jg 1b \n"
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(count) // %2
@@ -2936,33 +2714,6 @@
}
#endif // HAS_ARGBCOPYALPHAROW_AVX2
-#ifdef HAS_ARGBEXTRACTALPHAROW_SSE2
-// width in pixels
-void ARGBExtractAlphaRow_SSE2(const uint8* src_argb, uint8* dst_a, int width) {
- asm volatile (
- LABELALIGN
- "1: \n"
- "movdqu " MEMACCESS(0) ", %%xmm0 \n"
- "movdqu " MEMACCESS2(0x10, 0) ", %%xmm1 \n"
- "lea " MEMLEA(0x20, 0) ", %0 \n"
- "psrld $0x18, %%xmm0 \n"
- "psrld $0x18, %%xmm1 \n"
- "packssdw %%xmm1, %%xmm0 \n"
- "packuswb %%xmm0, %%xmm0 \n"
- "movq %%xmm0," MEMACCESS(1) " \n"
- "lea " MEMLEA(0x8, 1) ", %1 \n"
- "sub $0x8, %2 \n"
- "jg 1b \n"
- : "+r"(src_argb), // %0
- "+r"(dst_a), // %1
- "+rm"(width) // %2
- :
- : "memory", "cc"
- , "xmm0", "xmm1"
- );
-}
-#endif // HAS_ARGBEXTRACTALPHAROW_SSE2
-
#ifdef HAS_ARGBCOPYYTOALPHAROW_SSE2
// width in pixels
void ARGBCopyYToAlphaRow_SSE2(const uint8* src, uint8* dst, int width) {
@@ -3035,7 +2786,7 @@
#ifdef HAS_SETROW_X86
void SetRow_X86(uint8* dst, uint8 v8, int width) {
size_t width_tmp = (size_t)(width >> 2);
- const uint32 v32 = v8 * 0x01010101u; // Duplicate byte to all bytes.
+ const uint32 v32 = v8 * 0x01010101; // Duplicate byte to all bytes.
asm volatile (
"rep stosl " MEMSTORESTRING(eax,0) " \n"
: "+D"(dst), // %0
@@ -3066,7 +2817,7 @@
#endif // HAS_SETROW_X86
#ifdef HAS_YUY2TOYROW_SSE2
-void YUY2ToYRow_SSE2(const uint8* src_yuy2, uint8* dst_y, int width) {
+void YUY2ToYRow_SSE2(const uint8* src_yuy2, uint8* dst_y, int pix) {
asm volatile (
"pcmpeqb %%xmm5,%%xmm5 \n"
"psrlw $0x8,%%xmm5 \n"
@@ -3084,7 +2835,7 @@
"jg 1b \n"
: "+r"(src_yuy2), // %0
"+r"(dst_y), // %1
- "+r"(width) // %2
+ "+r"(pix) // %2
:
: "memory", "cc"
, "xmm0", "xmm1", "xmm5"
@@ -3092,7 +2843,7 @@
}
void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2,
- uint8* dst_u, uint8* dst_v, int width) {
+ uint8* dst_u, uint8* dst_v, int pix) {
asm volatile (
"pcmpeqb %%xmm5,%%xmm5 \n"
"psrlw $0x8,%%xmm5 \n"
@@ -3122,7 +2873,7 @@
: "+r"(src_yuy2), // %0
"+r"(dst_u), // %1
"+r"(dst_v), // %2
- "+r"(width) // %3
+ "+r"(pix) // %3
: "r"((intptr_t)(stride_yuy2)) // %4
: "memory", "cc", NACL_R14
"xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
@@ -3130,7 +2881,7 @@
}
void YUY2ToUV422Row_SSE2(const uint8* src_yuy2,
- uint8* dst_u, uint8* dst_v, int width) {
+ uint8* dst_u, uint8* dst_v, int pix) {
asm volatile (
"pcmpeqb %%xmm5,%%xmm5 \n"
"psrlw $0x8,%%xmm5 \n"
@@ -3156,7 +2907,7 @@
: "+r"(src_yuy2), // %0
"+r"(dst_u), // %1
"+r"(dst_v), // %2
- "+r"(width) // %3
+ "+r"(pix) // %3
:
: "memory", "cc", NACL_R14
"xmm0", "xmm1", "xmm5"
@@ -3163,7 +2914,7 @@
);
}
-void UYVYToYRow_SSE2(const uint8* src_uyvy, uint8* dst_y, int width) {
+void UYVYToYRow_SSE2(const uint8* src_uyvy, uint8* dst_y, int pix) {
asm volatile (
LABELALIGN
"1: \n"
@@ -3179,7 +2930,7 @@
"jg 1b \n"
: "+r"(src_uyvy), // %0
"+r"(dst_y), // %1
- "+r"(width) // %2
+ "+r"(pix) // %2
:
: "memory", "cc"
, "xmm0", "xmm1"
@@ -3187,7 +2938,7 @@
}
void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy,
- uint8* dst_u, uint8* dst_v, int width) {
+ uint8* dst_u, uint8* dst_v, int pix) {
asm volatile (
"pcmpeqb %%xmm5,%%xmm5 \n"
"psrlw $0x8,%%xmm5 \n"
@@ -3217,7 +2968,7 @@
: "+r"(src_uyvy), // %0
"+r"(dst_u), // %1
"+r"(dst_v), // %2
- "+r"(width) // %3
+ "+r"(pix) // %3
: "r"((intptr_t)(stride_uyvy)) // %4
: "memory", "cc", NACL_R14
"xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
@@ -3225,7 +2976,7 @@
}
void UYVYToUV422Row_SSE2(const uint8* src_uyvy,
- uint8* dst_u, uint8* dst_v, int width) {
+ uint8* dst_u, uint8* dst_v, int pix) {
asm volatile (
"pcmpeqb %%xmm5,%%xmm5 \n"
"psrlw $0x8,%%xmm5 \n"
@@ -3251,7 +3002,7 @@
: "+r"(src_uyvy), // %0
"+r"(dst_u), // %1
"+r"(dst_v), // %2
- "+r"(width) // %3
+ "+r"(pix) // %3
:
: "memory", "cc", NACL_R14
"xmm0", "xmm1", "xmm5"
@@ -3260,7 +3011,7 @@
#endif // HAS_YUY2TOYROW_SSE2
#ifdef HAS_YUY2TOYROW_AVX2
-void YUY2ToYRow_AVX2(const uint8* src_yuy2, uint8* dst_y, int width) {
+void YUY2ToYRow_AVX2(const uint8* src_yuy2, uint8* dst_y, int pix) {
asm volatile (
"vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
"vpsrlw $0x8,%%ymm5,%%ymm5 \n"
@@ -3280,7 +3031,7 @@
"vzeroupper \n"
: "+r"(src_yuy2), // %0
"+r"(dst_y), // %1
- "+r"(width) // %2
+ "+r"(pix) // %2
:
: "memory", "cc"
, "xmm0", "xmm1", "xmm5"
@@ -3288,7 +3039,7 @@
}
void YUY2ToUVRow_AVX2(const uint8* src_yuy2, int stride_yuy2,
- uint8* dst_u, uint8* dst_v, int width) {
+ uint8* dst_u, uint8* dst_v, int pix) {
asm volatile (
"vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
"vpsrlw $0x8,%%ymm5,%%ymm5 \n"
@@ -3319,7 +3070,7 @@
: "+r"(src_yuy2), // %0
"+r"(dst_u), // %1
"+r"(dst_v), // %2
- "+r"(width) // %3
+ "+r"(pix) // %3
: "r"((intptr_t)(stride_yuy2)) // %4
: "memory", "cc", NACL_R14
"xmm0", "xmm1", "xmm5"
@@ -3327,7 +3078,7 @@
}
void YUY2ToUV422Row_AVX2(const uint8* src_yuy2,
- uint8* dst_u, uint8* dst_v, int width) {
+ uint8* dst_u, uint8* dst_v, int pix) {
asm volatile (
"vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
"vpsrlw $0x8,%%ymm5,%%ymm5 \n"
@@ -3356,7 +3107,7 @@
: "+r"(src_yuy2), // %0
"+r"(dst_u), // %1
"+r"(dst_v), // %2
- "+r"(width) // %3
+ "+r"(pix) // %3
:
: "memory", "cc", NACL_R14
"xmm0", "xmm1", "xmm5"
@@ -3363,7 +3114,7 @@
);
}
-void UYVYToYRow_AVX2(const uint8* src_uyvy, uint8* dst_y, int width) {
+void UYVYToYRow_AVX2(const uint8* src_uyvy, uint8* dst_y, int pix) {
asm volatile (
LABELALIGN
"1: \n"
@@ -3381,7 +3132,7 @@
"vzeroupper \n"
: "+r"(src_uyvy), // %0
"+r"(dst_y), // %1
- "+r"(width) // %2
+ "+r"(pix) // %2
:
: "memory", "cc"
, "xmm0", "xmm1", "xmm5"
@@ -3388,7 +3139,7 @@
);
}
void UYVYToUVRow_AVX2(const uint8* src_uyvy, int stride_uyvy,
- uint8* dst_u, uint8* dst_v, int width) {
+ uint8* dst_u, uint8* dst_v, int pix) {
asm volatile (
"vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
"vpsrlw $0x8,%%ymm5,%%ymm5 \n"
@@ -3420,7 +3171,7 @@
: "+r"(src_uyvy), // %0
"+r"(dst_u), // %1
"+r"(dst_v), // %2
- "+r"(width) // %3
+ "+r"(pix) // %3
: "r"((intptr_t)(stride_uyvy)) // %4
: "memory", "cc", NACL_R14
"xmm0", "xmm1", "xmm5"
@@ -3428,7 +3179,7 @@
}
void UYVYToUV422Row_AVX2(const uint8* src_uyvy,
- uint8* dst_u, uint8* dst_v, int width) {
+ uint8* dst_u, uint8* dst_v, int pix) {
asm volatile (
"vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
"vpsrlw $0x8,%%ymm5,%%ymm5 \n"
@@ -3457,7 +3208,7 @@
: "+r"(src_uyvy), // %0
"+r"(dst_u), // %1
"+r"(dst_v), // %2
- "+r"(width) // %3
+ "+r"(pix) // %3
:
: "memory", "cc", NACL_R14
"xmm0", "xmm1", "xmm5"
@@ -3465,6 +3216,92 @@
}
#endif // HAS_YUY2TOYROW_AVX2
+#ifdef HAS_ARGBBLENDROW_SSE2
+// Blend 8 pixels at a time.
+void ARGBBlendRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
+ uint8* dst_argb, int width) {
+ asm volatile (
+ "pcmpeqb %%xmm7,%%xmm7 \n"
+ "psrlw $0xf,%%xmm7 \n"
+ "pcmpeqb %%xmm6,%%xmm6 \n"
+ "psrlw $0x8,%%xmm6 \n"
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "psllw $0x8,%%xmm5 \n"
+ "pcmpeqb %%xmm4,%%xmm4 \n"
+ "pslld $0x18,%%xmm4 \n"
+ "sub $0x4,%3 \n"
+ "jl 49f \n"
+
+ // 4 pixel loop.
+ LABELALIGN
+ "41: \n"
+ "movdqu " MEMACCESS(0) ",%%xmm3 \n"
+ "lea " MEMLEA(0x10,0) ",%0 \n"
+ "movdqa %%xmm3,%%xmm0 \n"
+ "pxor %%xmm4,%%xmm3 \n"
+ "movdqu " MEMACCESS(1) ",%%xmm2 \n"
+ "psrlw $0x8,%%xmm3 \n"
+ "pshufhw $0xf5,%%xmm3,%%xmm3 \n"
+ "pshuflw $0xf5,%%xmm3,%%xmm3 \n"
+ "pand %%xmm6,%%xmm2 \n"
+ "paddw %%xmm7,%%xmm3 \n"
+ "pmullw %%xmm3,%%xmm2 \n"
+ "movdqu " MEMACCESS(1) ",%%xmm1 \n"
+ "lea " MEMLEA(0x10,1) ",%1 \n"
+ "psrlw $0x8,%%xmm1 \n"
+ "por %%xmm4,%%xmm0 \n"
+ "pmullw %%xmm3,%%xmm1 \n"
+ "psrlw $0x8,%%xmm2 \n"
+ "paddusb %%xmm2,%%xmm0 \n"
+ "pand %%xmm5,%%xmm1 \n"
+ "paddusb %%xmm1,%%xmm0 \n"
+ "movdqu %%xmm0," MEMACCESS(2) " \n"
+ "lea " MEMLEA(0x10,2) ",%2 \n"
+ "sub $0x4,%3 \n"
+ "jge 41b \n"
+
+ "49: \n"
+ "add $0x3,%3 \n"
+ "jl 99f \n"
+
+ // 1 pixel loop.
+ "91: \n"
+ "movd " MEMACCESS(0) ",%%xmm3 \n"
+ "lea " MEMLEA(0x4,0) ",%0 \n"
+ "movdqa %%xmm3,%%xmm0 \n"
+ "pxor %%xmm4,%%xmm3 \n"
+ "movd " MEMACCESS(1) ",%%xmm2 \n"
+ "psrlw $0x8,%%xmm3 \n"
+ "pshufhw $0xf5,%%xmm3,%%xmm3 \n"
+ "pshuflw $0xf5,%%xmm3,%%xmm3 \n"
+ "pand %%xmm6,%%xmm2 \n"
+ "paddw %%xmm7,%%xmm3 \n"
+ "pmullw %%xmm3,%%xmm2 \n"
+ "movd " MEMACCESS(1) ",%%xmm1 \n"
+ "lea " MEMLEA(0x4,1) ",%1 \n"
+ "psrlw $0x8,%%xmm1 \n"
+ "por %%xmm4,%%xmm0 \n"
+ "pmullw %%xmm3,%%xmm1 \n"
+ "psrlw $0x8,%%xmm2 \n"
+ "paddusb %%xmm2,%%xmm0 \n"
+ "pand %%xmm5,%%xmm1 \n"
+ "paddusb %%xmm1,%%xmm0 \n"
+ "movd %%xmm0," MEMACCESS(2) " \n"
+ "lea " MEMLEA(0x4,2) ",%2 \n"
+ "sub $0x1,%3 \n"
+ "jge 91b \n"
+ "99: \n"
+ : "+r"(src_argb0), // %0
+ "+r"(src_argb1), // %1
+ "+r"(dst_argb), // %2
+ "+r"(width) // %3
+ :
+ : "memory", "cc"
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
+ );
+}
+#endif // HAS_ARGBBLENDROW_SSE2
+
#ifdef HAS_ARGBBLENDROW_SSSE3
// Shuffle table for isolating alpha.
static uvec8 kShuffleAlpha = {
@@ -3473,6 +3310,15 @@
};
// Blend 8 pixels at a time
+// Shuffle table for reversing the bytes.
+
+// Same as SSE2, but replaces
+// psrlw xmm3, 8 // alpha
+// pshufhw xmm3, xmm3,0F5h // 8 alpha words
+// pshuflw xmm3, xmm3,0F5h
+// with..
+// pshufb xmm3, kShuffleAlpha // alpha
+
void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
uint8* dst_argb, int width) {
asm volatile (
@@ -3553,113 +3399,50 @@
}
#endif // HAS_ARGBBLENDROW_SSSE3
-#ifdef HAS_BLENDPLANEROW_SSSE3
-// Blend 8 pixels at a time.
-// unsigned version of math
-// =((A2*C2)+(B2*(255-C2))+255)/256
-// signed version of math
-// =(((A2-128)*C2)+((B2-128)*(255-C2))+32768+127)/256
-void BlendPlaneRow_SSSE3(const uint8* src0, const uint8* src1,
- const uint8* alpha, uint8* dst, int width) {
+#ifdef HAS_ARGBATTENUATEROW_SSE2
+// Attenuate 4 pixels at a time.
+void ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) {
asm volatile (
- "pcmpeqb %%xmm5,%%xmm5 \n"
- "psllw $0x8,%%xmm5 \n"
- "mov $0x80808080,%%eax \n"
- "movd %%eax,%%xmm6 \n"
- "pshufd $0x0,%%xmm6,%%xmm6 \n"
- "mov $0x807f807f,%%eax \n"
- "movd %%eax,%%xmm7 \n"
- "pshufd $0x0,%%xmm7,%%xmm7 \n"
- "sub %2,%0 \n"
- "sub %2,%1 \n"
- "sub %2,%3 \n"
+ "pcmpeqb %%xmm4,%%xmm4 \n"
+ "pslld $0x18,%%xmm4 \n"
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "psrld $0x8,%%xmm5 \n"
- // 8 pixel loop.
+ // 4 pixel loop.
LABELALIGN
"1: \n"
- "movq (%2),%%xmm0 \n"
- "punpcklbw %%xmm0,%%xmm0 \n"
- "pxor %%xmm5,%%xmm0 \n"
- "movq (%0,%2,1),%%xmm1 \n"
- "movq (%1,%2,1),%%xmm2 \n"
- "punpcklbw %%xmm2,%%xmm1 \n"
- "psubb %%xmm6,%%xmm1 \n"
- "pmaddubsw %%xmm1,%%xmm0 \n"
- "paddw %%xmm7,%%xmm0 \n"
- "psrlw $0x8,%%xmm0 \n"
- "packuswb %%xmm0,%%xmm0 \n"
- "movq %%xmm0,(%3,%2,1) \n"
- "lea 0x8(%2),%2 \n"
- "sub $0x8,%4 \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ "punpcklbw %%xmm0,%%xmm0 \n"
+ "pshufhw $0xff,%%xmm0,%%xmm2 \n"
+ "pshuflw $0xff,%%xmm2,%%xmm2 \n"
+ "pmulhuw %%xmm2,%%xmm0 \n"
+ "movdqu " MEMACCESS(0) ",%%xmm1 \n"
+ "punpckhbw %%xmm1,%%xmm1 \n"
+ "pshufhw $0xff,%%xmm1,%%xmm2 \n"
+ "pshuflw $0xff,%%xmm2,%%xmm2 \n"
+ "pmulhuw %%xmm2,%%xmm1 \n"
+ "movdqu " MEMACCESS(0) ",%%xmm2 \n"
+ "lea " MEMLEA(0x10,0) ",%0 \n"
+ "psrlw $0x8,%%xmm0 \n"
+ "pand %%xmm4,%%xmm2 \n"
+ "psrlw $0x8,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "pand %%xmm5,%%xmm0 \n"
+ "por %%xmm2,%%xmm0 \n"
+ "movdqu %%xmm0," MEMACCESS(1) " \n"
+ "lea " MEMLEA(0x10,1) ",%1 \n"
+ "sub $0x4,%2 \n"
"jg 1b \n"
- : "+r"(src0), // %0
- "+r"(src1), // %1
- "+r"(alpha), // %2
- "+r"(dst), // %3
- "+rm"(width) // %4
- :: "memory", "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm5", "xmm6", "xmm7"
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ :
+ : "memory", "cc"
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
);
}
-#endif // HAS_BLENDPLANEROW_SSSE3
+#endif // HAS_ARGBATTENUATEROW_SSE2
-#ifdef HAS_BLENDPLANEROW_AVX2
-// Blend 32 pixels at a time.
-// unsigned version of math
-// =((A2*C2)+(B2*(255-C2))+255)/256
-// signed version of math
-// =(((A2-128)*C2)+((B2-128)*(255-C2))+32768+127)/256
-void BlendPlaneRow_AVX2(const uint8* src0, const uint8* src1,
- const uint8* alpha, uint8* dst, int width) {
- asm volatile (
- "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
- "vpsllw $0x8,%%ymm5,%%ymm5 \n"
- "mov $0x80808080,%%eax \n"
- "vmovd %%eax,%%xmm6 \n"
- "vbroadcastss %%xmm6,%%ymm6 \n"
- "mov $0x807f807f,%%eax \n"
- "vmovd %%eax,%%xmm7 \n"
- "vbroadcastss %%xmm7,%%ymm7 \n"
- "sub %2,%0 \n"
- "sub %2,%1 \n"
- "sub %2,%3 \n"
-
- // 32 pixel loop.
- LABELALIGN
- "1: \n"
- "vmovdqu (%2),%%ymm0 \n"
- "vpunpckhbw %%ymm0,%%ymm0,%%ymm3 \n"
- "vpunpcklbw %%ymm0,%%ymm0,%%ymm0 \n"
- "vpxor %%ymm5,%%ymm3,%%ymm3 \n"
- "vpxor %%ymm5,%%ymm0,%%ymm0 \n"
- "vmovdqu (%0,%2,1),%%ymm1 \n"
- "vmovdqu (%1,%2,1),%%ymm2 \n"
- "vpunpckhbw %%ymm2,%%ymm1,%%ymm4 \n"
- "vpunpcklbw %%ymm2,%%ymm1,%%ymm1 \n"
- "vpsubb %%ymm6,%%ymm4,%%ymm4 \n"
- "vpsubb %%ymm6,%%ymm1,%%ymm1 \n"
- "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n"
- "vpmaddubsw %%ymm1,%%ymm0,%%ymm0 \n"
- "vpaddw %%ymm7,%%ymm3,%%ymm3 \n"
- "vpaddw %%ymm7,%%ymm0,%%ymm0 \n"
- "vpsrlw $0x8,%%ymm3,%%ymm3 \n"
- "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
- "vpackuswb %%ymm3,%%ymm0,%%ymm0 \n"
- "vmovdqu %%ymm0,(%3,%2,1) \n"
- "lea 0x20(%2),%2 \n"
- "sub $0x20,%4 \n"
- "jg 1b \n"
- "vzeroupper \n"
- : "+r"(src0), // %0
- "+r"(src1), // %1
- "+r"(alpha), // %2
- "+r"(dst), // %3
- "+rm"(width) // %4
- :: "memory", "cc", "eax",
- "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
- );
-}
-#endif // HAS_BLENDPLANEROW_AVX2
-
#ifdef HAS_ARGBATTENUATEROW_SSSE3
// Shuffle table duplicating alpha
static uvec8 kShuffleAlpha0 = {
@@ -3759,7 +3542,7 @@
// Unattenuate 4 pixels at a time.
void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb,
int width) {
- uintptr_t alpha;
+ uintptr_t alpha = 0;
asm volatile (
// 4 pixel loop.
LABELALIGN
@@ -3790,10 +3573,10 @@
"lea " MEMLEA(0x10,1) ",%1 \n"
"sub $0x4,%2 \n"
"jg 1b \n"
- : "+r"(src_argb), // %0
- "+r"(dst_argb), // %1
- "+r"(width), // %2
- "=&r"(alpha) // %3
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width), // %2
+ "+r"(alpha) // %3
: "r"(fixed_invtbl8) // %4
: "memory", "cc", NACL_R14
"xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
@@ -3809,7 +3592,7 @@
// Unattenuate 8 pixels at a time.
void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb,
int width) {
- uintptr_t alpha;
+ uintptr_t alpha = 0;
asm volatile (
"sub %0,%1 \n"
"vbroadcastf128 %5,%%ymm5 \n"
@@ -3858,10 +3641,10 @@
"sub $0x8,%2 \n"
"jg 1b \n"
"vzeroupper \n"
- : "+r"(src_argb), // %0
- "+r"(dst_argb), // %1
- "+r"(width), // %2
- "=&r"(alpha) // %3
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width), // %2
+ "+r"(alpha) // %3
: "r"(fixed_invtbl8), // %4
"m"(kUnattenShuffleAlpha_AVX2) // %5
: "memory", "cc", NACL_R14
@@ -4786,7 +4569,7 @@
void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride,
uint8* dst_argb, const float* src_dudv, int width) {
intptr_t src_argb_stride_temp = src_argb_stride;
- intptr_t temp;
+ intptr_t temp = 0;
asm volatile (
"movq " MEMACCESS(3) ",%%xmm2 \n"
"movq " MEMACCESS2(0x08,3) ",%%xmm7 \n"
@@ -4858,7 +4641,7 @@
"+r"(dst_argb), // %2
"+r"(src_dudv), // %3
"+rm"(width), // %4
- "=&r"(temp) // %5
+ "+r"(temp) // %5
:
: "memory", "cc", NACL_R14
"xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
@@ -4873,21 +4656,23 @@
int source_y_fraction) {
asm volatile (
"sub %1,%0 \n"
+ "shr %3 \n"
"cmp $0x0,%3 \n"
"je 100f \n"
- "cmp $0x80,%3 \n"
+ "cmp $0x20,%3 \n"
+ "je 75f \n"
+ "cmp $0x40,%3 \n"
"je 50f \n"
+ "cmp $0x60,%3 \n"
+ "je 25f \n"
"movd %3,%%xmm0 \n"
"neg %3 \n"
- "add $0x100,%3 \n"
+ "add $0x80,%3 \n"
"movd %3,%%xmm5 \n"
"punpcklbw %%xmm0,%%xmm5 \n"
"punpcklwd %%xmm5,%%xmm5 \n"
"pshufd $0x0,%%xmm5,%%xmm5 \n"
- "mov $0x80808080,%%eax \n"
- "movd %%eax,%%xmm4 \n"
- "pshufd $0x0,%%xmm4,%%xmm4 \n"
// General purpose row blend.
LABELALIGN
@@ -4894,26 +4679,33 @@
"1: \n"
"movdqu " MEMACCESS(1) ",%%xmm0 \n"
MEMOPREG(movdqu,0x00,1,4,1,xmm2)
- "movdqa %%xmm0,%%xmm1 \n"
- "punpcklbw %%xmm2,%%xmm0 \n"
- "punpckhbw %%xmm2,%%xmm1 \n"
- "psubb %%xmm4,%%xmm0 \n"
- "psubb %%xmm4,%%xmm1 \n"
- "movdqa %%xmm5,%%xmm2 \n"
- "movdqa %%xmm5,%%xmm3 \n"
- "pmaddubsw %%xmm0,%%xmm2 \n"
- "pmaddubsw %%xmm1,%%xmm3 \n"
- "paddw %%xmm4,%%xmm2 \n"
- "paddw %%xmm4,%%xmm3 \n"
- "psrlw $0x8,%%xmm2 \n"
- "psrlw $0x8,%%xmm3 \n"
- "packuswb %%xmm3,%%xmm2 \n"
- MEMOPMEM(movdqu,xmm2,0x00,1,0,1)
+ "movdqa %%xmm0,%%xmm1 \n"
+ "punpcklbw %%xmm2,%%xmm0 \n"
+ "punpckhbw %%xmm2,%%xmm1 \n"
+ "pmaddubsw %%xmm5,%%xmm0 \n"
+ "pmaddubsw %%xmm5,%%xmm1 \n"
+ "psrlw $0x7,%%xmm0 \n"
+ "psrlw $0x7,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ MEMOPMEM(movdqu,xmm0,0x00,1,0,1)
"lea " MEMLEA(0x10,1) ",%1 \n"
"sub $0x10,%2 \n"
"jg 1b \n"
"jmp 99f \n"
+ // Blend 25 / 75.
+ LABELALIGN
+ "25: \n"
+ "movdqu " MEMACCESS(1) ",%%xmm0 \n"
+ MEMOPREG(movdqu,0x00,1,4,1,xmm1)
+ "pavgb %%xmm1,%%xmm0 \n"
+ "pavgb %%xmm1,%%xmm0 \n"
+ MEMOPMEM(movdqu,xmm0,0x00,1,0,1)
+ "lea " MEMLEA(0x10,1) ",%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 25b \n"
+ "jmp 99f \n"
+
// Blend 50 / 50.
LABELALIGN
"50: \n"
@@ -4926,6 +4718,19 @@
"jg 50b \n"
"jmp 99f \n"
+ // Blend 75 / 25.
+ LABELALIGN
+ "75: \n"
+ "movdqu " MEMACCESS(1) ",%%xmm1 \n"
+ MEMOPREG(movdqu,0x00,1,4,1,xmm0)
+ "pavgb %%xmm1,%%xmm0 \n"
+ "pavgb %%xmm1,%%xmm0 \n"
+ MEMOPMEM(movdqu,xmm0,0x00,1,0,1)
+ "lea " MEMLEA(0x10,1) ",%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 75b \n"
+ "jmp 99f \n"
+
// Blend 100 / 0 - Copy row unchanged.
LABELALIGN
"100: \n"
@@ -4936,13 +4741,13 @@
"jg 100b \n"
"99: \n"
- : "+r"(dst_ptr), // %0
- "+r"(src_ptr), // %1
- "+rm"(dst_width), // %2
+ : "+r"(dst_ptr), // %0
+ "+r"(src_ptr), // %1
+ "+r"(dst_width), // %2
"+r"(source_y_fraction) // %3
: "r"((intptr_t)(src_stride)) // %4
- : "memory", "cc", "eax", NACL_R14
- "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+ : "memory", "cc", NACL_R14
+ "xmm0", "xmm1", "xmm2", "xmm5"
);
}
#endif // HAS_INTERPOLATEROW_SSSE3
@@ -4953,22 +4758,25 @@
ptrdiff_t src_stride, int dst_width,
int source_y_fraction) {
asm volatile (
+ "shr %3 \n"
"cmp $0x0,%3 \n"
"je 100f \n"
"sub %1,%0 \n"
- "cmp $0x80,%3 \n"
+ "cmp $0x20,%3 \n"
+ "je 75f \n"
+ "cmp $0x40,%3 \n"
"je 50f \n"
+ "cmp $0x60,%3 \n"
+ "je 25f \n"
"vmovd %3,%%xmm0 \n"
"neg %3 \n"
- "add $0x100,%3 \n"
+ "add $0x80,%3 \n"
"vmovd %3,%%xmm5 \n"
"vpunpcklbw %%xmm0,%%xmm5,%%xmm5 \n"
"vpunpcklwd %%xmm5,%%xmm5,%%xmm5 \n"
- "vbroadcastss %%xmm5,%%ymm5 \n"
- "mov $0x80808080,%%eax \n"
- "vmovd %%eax,%%xmm4 \n"
- "vbroadcastss %%xmm4,%%ymm4 \n"
+ "vpxor %%ymm0,%%ymm0,%%ymm0 \n"
+ "vpermd %%ymm5,%%ymm0,%%ymm5 \n"
// General purpose row blend.
LABELALIGN
@@ -4977,14 +4785,10 @@
MEMOPREG(vmovdqu,0x00,1,4,1,ymm2)
"vpunpckhbw %%ymm2,%%ymm0,%%ymm1 \n"
"vpunpcklbw %%ymm2,%%ymm0,%%ymm0 \n"
- "vpsubb %%ymm4,%%ymm1,%%ymm1 \n"
- "vpsubb %%ymm4,%%ymm0,%%ymm0 \n"
- "vpmaddubsw %%ymm1,%%ymm5,%%ymm1 \n"
- "vpmaddubsw %%ymm0,%%ymm5,%%ymm0 \n"
- "vpaddw %%ymm4,%%ymm1,%%ymm1 \n"
- "vpaddw %%ymm4,%%ymm0,%%ymm0 \n"
- "vpsrlw $0x8,%%ymm1,%%ymm1 \n"
- "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
+ "vpmaddubsw %%ymm5,%%ymm0,%%ymm0 \n"
+ "vpmaddubsw %%ymm5,%%ymm1,%%ymm1 \n"
+ "vpsrlw $0x7,%%ymm0,%%ymm0 \n"
+ "vpsrlw $0x7,%%ymm1,%%ymm1 \n"
"vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
MEMOPMEM(vmovdqu,ymm0,0x00,1,0,1)
"lea " MEMLEA(0x20,1) ",%1 \n"
@@ -4992,6 +4796,19 @@
"jg 1b \n"
"jmp 99f \n"
+ // Blend 25 / 75.
+ LABELALIGN
+ "25: \n"
+ "vmovdqu " MEMACCESS(1) ",%%ymm0 \n"
+ MEMOPREG(vmovdqu,0x00,1,4,1,ymm1)
+ "vpavgb %%ymm1,%%ymm0,%%ymm0 \n"
+ "vpavgb %%ymm1,%%ymm0,%%ymm0 \n"
+ MEMOPMEM(vmovdqu,ymm0,0x00,1,0,1)
+ "lea " MEMLEA(0x20,1) ",%1 \n"
+ "sub $0x20,%2 \n"
+ "jg 25b \n"
+ "jmp 99f \n"
+
// Blend 50 / 50.
LABELALIGN
"50: \n"
@@ -5003,6 +4820,19 @@
"jg 50b \n"
"jmp 99f \n"
+ // Blend 75 / 25.
+ LABELALIGN
+ "75: \n"
+ "vmovdqu " MEMACCESS(1) ",%%ymm1 \n"
+ MEMOPREG(vmovdqu,0x00,1,4,1,ymm0)
+ "vpavgb %%ymm1,%%ymm0,%%ymm0 \n"
+ "vpavgb %%ymm1,%%ymm0,%%ymm0 \n"
+ MEMOPMEM(vmovdqu,ymm0,0x00,1,0,1)
+ "lea " MEMLEA(0x20,1) ",%1 \n"
+ "sub $0x20,%2 \n"
+ "jg 75b \n"
+ "jmp 99f \n"
+
// Blend 100 / 0 - Copy row unchanged.
LABELALIGN
"100: \n"
@@ -5014,19 +4844,130 @@
"999: \n"
: "+D"(dst_ptr), // %0
"+S"(src_ptr), // %1
- "+cm"(dst_width), // %2
+ "+c"(dst_width), // %2
"+r"(source_y_fraction) // %3
: "r"((intptr_t)(src_stride)) // %4
- : "memory", "cc", "eax", NACL_R14
- "xmm0", "xmm1", "xmm2", "xmm4", "xmm5"
+ : "memory", "cc", NACL_R14
+ "xmm0", "xmm1", "xmm2", "xmm5"
);
}
#endif // HAS_INTERPOLATEROW_AVX2
+#ifdef HAS_INTERPOLATEROW_SSE2
+// Bilinear filter 16x2 -> 16x1
+void InterpolateRow_SSE2(uint8* dst_ptr, const uint8* src_ptr,
+ ptrdiff_t src_stride, int dst_width,
+ int source_y_fraction) {
+ asm volatile (
+ "sub %1,%0 \n"
+ "shr %3 \n"
+ "cmp $0x0,%3 \n"
+ "je 100f \n"
+ "cmp $0x20,%3 \n"
+ "je 75f \n"
+ "cmp $0x40,%3 \n"
+ "je 50f \n"
+ "cmp $0x60,%3 \n"
+ "je 25f \n"
+
+ "movd %3,%%xmm0 \n"
+ "neg %3 \n"
+ "add $0x80,%3 \n"
+ "movd %3,%%xmm5 \n"
+ "punpcklbw %%xmm0,%%xmm5 \n"
+ "punpcklwd %%xmm5,%%xmm5 \n"
+ "pshufd $0x0,%%xmm5,%%xmm5 \n"
+ "pxor %%xmm4,%%xmm4 \n"
+
+ // General purpose row blend.
+ LABELALIGN
+ "1: \n"
+ "movdqu " MEMACCESS(1) ",%%xmm0 \n"
+ MEMOPREG(movdqu,0x00,1,4,1,xmm2) // movdqu (%1,%4,1),%%xmm2
+ "movdqa %%xmm0,%%xmm1 \n"
+ "movdqa %%xmm2,%%xmm3 \n"
+ "punpcklbw %%xmm4,%%xmm2 \n"
+ "punpckhbw %%xmm4,%%xmm3 \n"
+ "punpcklbw %%xmm4,%%xmm0 \n"
+ "punpckhbw %%xmm4,%%xmm1 \n"
+ "psubw %%xmm0,%%xmm2 \n"
+ "psubw %%xmm1,%%xmm3 \n"
+ "paddw %%xmm2,%%xmm2 \n"
+ "paddw %%xmm3,%%xmm3 \n"
+ "pmulhw %%xmm5,%%xmm2 \n"
+ "pmulhw %%xmm5,%%xmm3 \n"
+ "paddw %%xmm2,%%xmm0 \n"
+ "paddw %%xmm3,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ MEMOPMEM(movdqu,xmm0,0x00,1,0,1) // movdqu %%xmm0,(%1,%0,1)
+ "lea " MEMLEA(0x10,1) ",%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
+ "jmp 99f \n"
+
+ // Blend 25 / 75.
+ LABELALIGN
+ "25: \n"
+ "movdqu " MEMACCESS(1) ",%%xmm0 \n"
+ MEMOPREG(movdqu,0x00,1,4,1,xmm1) // movdqu (%1,%4,1),%%xmm1
+ "pavgb %%xmm1,%%xmm0 \n"
+ "pavgb %%xmm1,%%xmm0 \n"
+ MEMOPMEM(movdqu,xmm0,0x00,1,0,1) // movdqu %%xmm0,(%1,%0,1)
+ "lea " MEMLEA(0x10,1) ",%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 25b \n"
+ "jmp 99f \n"
+
+ // Blend 50 / 50.
+ LABELALIGN
+ "50: \n"
+ "movdqu " MEMACCESS(1) ",%%xmm0 \n"
+ MEMOPREG(movdqu,0x00,1,4,1,xmm1) // movdqu (%1,%4,1),%%xmm1
+ "pavgb %%xmm1,%%xmm0 \n"
+ MEMOPMEM(movdqu,xmm0,0x00,1,0,1) // movdqu %%xmm0,(%1,%0,1)
+ "lea " MEMLEA(0x10,1) ",%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 50b \n"
+ "jmp 99f \n"
+
+ // Blend 75 / 25.
+ LABELALIGN
+ "75: \n"
+ "movdqu " MEMACCESS(1) ",%%xmm1 \n"
+ MEMOPREG(movdqu,0x00,1,4,1,xmm0) // movdqu (%1,%4,1),%%xmm0
+ "pavgb %%xmm1,%%xmm0 \n"
+ "pavgb %%xmm1,%%xmm0 \n"
+ MEMOPMEM(movdqu,xmm0,0x00,1,0,1) // movdqu %%xmm0,(%1,%0,1)
+ "lea " MEMLEA(0x10,1) ",%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 75b \n"
+ "jmp 99f \n"
+
+ // Blend 100 / 0 - Copy row unchanged.
+ LABELALIGN
+ "100: \n"
+ "movdqu " MEMACCESS(1) ",%%xmm0 \n"
+ MEMOPMEM(movdqu,xmm0,0x00,1,0,1) // movdqu %%xmm0,(%1,%0,1)
+ "lea " MEMLEA(0x10,1) ",%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 100b \n"
+
+ "99: \n"
+ : "+r"(dst_ptr), // %0
+ "+r"(src_ptr), // %1
+ "+r"(dst_width), // %2
+ "+r"(source_y_fraction) // %3
+ : "r"((intptr_t)(src_stride)) // %4
+ : "memory", "cc", NACL_R14
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+ );
+}
+#endif // HAS_INTERPOLATEROW_SSE2
+
#ifdef HAS_ARGBSHUFFLEROW_SSSE3
// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
void ARGBShuffleRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
- const uint8* shuffler, int width) {
+ const uint8* shuffler, int pix) {
asm volatile (
"movdqu " MEMACCESS(3) ",%%xmm5 \n"
LABELALIGN
@@ -5043,7 +4984,7 @@
"jg 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_argb), // %1
- "+r"(width) // %2
+ "+r"(pix) // %2
: "r"(shuffler) // %3
: "memory", "cc"
, "xmm0", "xmm1", "xmm5"
@@ -5054,7 +4995,7 @@
#ifdef HAS_ARGBSHUFFLEROW_AVX2
// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
void ARGBShuffleRow_AVX2(const uint8* src_argb, uint8* dst_argb,
- const uint8* shuffler, int width) {
+ const uint8* shuffler, int pix) {
asm volatile (
"vbroadcastf128 " MEMACCESS(3) ",%%ymm5 \n"
LABELALIGN
@@ -5072,7 +5013,7 @@
"vzeroupper \n"
: "+r"(src_argb), // %0
"+r"(dst_argb), // %1
- "+r"(width) // %2
+ "+r"(pix) // %2
: "r"(shuffler) // %3
: "memory", "cc"
, "xmm0", "xmm1", "xmm5"
@@ -5083,8 +5024,8 @@
#ifdef HAS_ARGBSHUFFLEROW_SSE2
// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
void ARGBShuffleRow_SSE2(const uint8* src_argb, uint8* dst_argb,
- const uint8* shuffler, int width) {
- uintptr_t pixel_temp;
+ const uint8* shuffler, int pix) {
+ uintptr_t pixel_temp = 0u;
asm volatile (
"pxor %%xmm5,%%xmm5 \n"
"mov " MEMACCESS(4) ",%k2 \n"
@@ -5189,11 +5130,11 @@
"jg 3012b \n"
"99: \n"
- : "+r"(src_argb), // %0
- "+r"(dst_argb), // %1
- "=&d"(pixel_temp), // %2
- "+r"(width) // %3
- : "r"(shuffler) // %4
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb), // %1
+ "+d"(pixel_temp), // %2
+ "+r"(pix) // %3
+ : "r"(shuffler) // %4
: "memory", "cc", NACL_R14
"xmm0", "xmm1", "xmm5"
);
@@ -5370,7 +5311,7 @@
// Tranform ARGB pixels with color table.
void ARGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb,
int width) {
- uintptr_t pixel_temp;
+ uintptr_t pixel_temp = 0u;
asm volatile (
// 1 pixel loop.
LABELALIGN
@@ -5390,10 +5331,10 @@
"mov %b1," MEMACCESS2(-0x1,0) " \n"
"dec %2 \n"
"jg 1b \n"
- : "+r"(dst_argb), // %0
- "=&d"(pixel_temp), // %1
- "+r"(width) // %2
- : "r"(table_argb) // %3
+ : "+r"(dst_argb), // %0
+ "+d"(pixel_temp), // %1
+ "+r"(width) // %2
+ : "r"(table_argb) // %3
: "memory", "cc");
}
#endif // HAS_ARGBCOLORTABLEROW_X86
@@ -5401,7 +5342,7 @@
#ifdef HAS_RGBCOLORTABLEROW_X86
// Tranform RGB pixels with color table.
void RGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb, int width) {
- uintptr_t pixel_temp;
+ uintptr_t pixel_temp = 0u;
asm volatile (
// 1 pixel loop.
LABELALIGN
@@ -5418,10 +5359,10 @@
"mov %b1," MEMACCESS2(-0x2,0) " \n"
"dec %2 \n"
"jg 1b \n"
- : "+r"(dst_argb), // %0
- "=&d"(pixel_temp), // %1
- "+r"(width) // %2
- : "r"(table_argb) // %3
+ : "+r"(dst_argb), // %0
+ "+d"(pixel_temp), // %1
+ "+r"(width) // %2
+ : "r"(table_argb) // %3
: "memory", "cc");
}
#endif // HAS_RGBCOLORTABLEROW_X86
@@ -5431,8 +5372,8 @@
void ARGBLumaColorTableRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
int width,
const uint8* luma, uint32 lumacoeff) {
- uintptr_t pixel_temp;
- uintptr_t table_temp;
+ uintptr_t pixel_temp = 0u;
+ uintptr_t table_temp = 0u;
asm volatile (
"movd %6,%%xmm3 \n"
"pshufd $0x0,%%xmm3,%%xmm3 \n"
@@ -5514,13 +5455,13 @@
"lea " MEMLEA(0x10,3) ",%3 \n"
"sub $0x4,%4 \n"
"jg 1b \n"
- : "=&d"(pixel_temp), // %0
- "=&a"(table_temp), // %1
- "+r"(src_argb), // %2
- "+r"(dst_argb), // %3
- "+rm"(width) // %4
- : "r"(luma), // %5
- "rm"(lumacoeff) // %6
+ : "+d"(pixel_temp), // %0
+ "+a"(table_temp), // %1
+ "+r"(src_argb), // %2
+ "+r"(dst_argb), // %3
+ "+rm"(width) // %4
+ : "r"(luma), // %5
+ "rm"(lumacoeff) // %6
: "memory", "cc", "xmm0", "xmm3", "xmm4", "xmm5"
);
}
--- a/third_party/libyuv/source/row_mips.cc
+++ b/third_party/libyuv/source/row_mips.cc
@@ -375,13 +375,13 @@
}
#endif // HAS_COPYROW_MIPS
-// DSPR2 functions
+// MIPS DSPR2 functions
#if !defined(LIBYUV_DISABLE_MIPS) && defined(__mips_dsp) && \
(__mips_dsp_rev >= 2) && \
(_MIPS_SIM == _MIPS_SIM_ABI32) && (__mips_isa_rev < 6)
-void SplitUVRow_DSPR2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
- int width) {
+void SplitUVRow_MIPS_DSPR2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
+ int width) {
__asm__ __volatile__ (
".set push \n"
".set noreorder \n"
@@ -389,6 +389,7 @@
"blez $t4, 2f \n"
" andi %[width], %[width], 0xf \n" // residual
+ ".p2align 2 \n"
"1: \n"
"addiu $t4, $t4, -1 \n"
"lw $t0, 0(%[src_uv]) \n" // V1 | U1 | V0 | U0
@@ -446,7 +447,7 @@
);
}
-void MirrorRow_DSPR2(const uint8* src, uint8* dst, int width) {
+void MirrorRow_MIPS_DSPR2(const uint8* src, uint8* dst, int width) {
__asm__ __volatile__ (
".set push \n"
".set noreorder \n"
@@ -456,6 +457,7 @@
"blez $t4, 2f \n"
" addu %[src], %[src], %[width] \n" // src += width
+ ".p2align 2 \n"
"1: \n"
"lw $t0, -16(%[src]) \n" // |3|2|1|0|
"lw $t1, -12(%[src]) \n" // |7|6|5|4|
@@ -496,10 +498,10 @@
);
}
-void MirrorUVRow_DSPR2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
- int width) {
- int x;
- int y;
+void MirrorUVRow_MIPS_DSPR2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
+ int width) {
+ int x = 0;
+ int y = 0;
__asm__ __volatile__ (
".set push \n"
".set noreorder \n"
@@ -510,6 +512,7 @@
"blez %[x], 2f \n"
" addu %[src_uv], %[src_uv], $t4 \n"
+ ".p2align 2 \n"
"1: \n"
"lw $t0, -32(%[src_uv]) \n" // |3|2|1|0|
"lw $t1, -28(%[src_uv]) \n" // |7|6|5|4|
@@ -579,7 +582,7 @@
[dst_u] "+r" (dst_u),
[dst_v] "+r" (dst_v),
[x] "=&r" (x),
- [y] "=&r" (y)
+ [y] "+r" (y)
: [width] "r" (width)
: "t0", "t1", "t2", "t3", "t4",
"t5", "t7", "t8", "t9"
@@ -593,7 +596,7 @@
// t8 = | 0 | G1 | 0 | g1 |
// t2 = | 0 | R0 | 0 | r0 |
// t1 = | 0 | R1 | 0 | r1 |
-#define YUVTORGB \
+#define I422ToTransientMipsRGB \
"lw $t0, 0(%[y_buf]) \n" \
"lhu $t1, 0(%[u_buf]) \n" \
"lhu $t2, 0(%[v_buf]) \n" \
@@ -652,13 +655,11 @@
"addu.ph $t2, $t2, $s5 \n" \
"addu.ph $t1, $t1, $s5 \n"
-// TODO(fbarchard): accept yuv conversion constants.
-void I422ToARGBRow_DSPR2(const uint8* y_buf,
- const uint8* u_buf,
- const uint8* v_buf,
- uint8* rgb_buf,
- const struct YuvConstants* yuvconstants,
- int width) {
+void I422ToARGBRow_MIPS_DSPR2(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* rgb_buf,
+ int width) {
__asm__ __volatile__ (
".set push \n"
".set noreorder \n"
@@ -672,8 +673,9 @@
"lui $s6, 0xff00 \n"
"ori $s6, 0xff00 \n" // |ff|00|ff|00|ff|
+ ".p2align 2 \n"
"1: \n"
- YUVTORGB
+ I422ToTransientMipsRGB
// Arranging into argb format
"precr.qb.ph $t4, $t8, $t4 \n" // |G1|g1|B1|b1|
"precr.qb.ph $t5, $t9, $t5 \n" // |G0|g0|B0|b0|
@@ -715,10 +717,136 @@
);
}
+void I422ToABGRRow_MIPS_DSPR2(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* rgb_buf,
+ int width) {
+ __asm__ __volatile__ (
+ ".set push \n"
+ ".set noreorder \n"
+ "beqz %[width], 2f \n"
+ " repl.ph $s0, 74 \n" // |YG|YG| = |74|74|
+ "repl.ph $s1, -25 \n" // |UG|UG| = |-25|-25|
+ "repl.ph $s2, -52 \n" // |VG|VG| = |-52|-52|
+ "repl.ph $s3, 102 \n" // |VR|VR| = |102|102|
+ "repl.ph $s4, 16 \n" // |0|16|0|16|
+ "repl.ph $s5, 128 \n" // |128|128|
+ "lui $s6, 0xff00 \n"
+ "ori $s6, 0xff00 \n" // |ff|00|ff|00|
+
+ ".p2align 2 \n"
+ "1: \n"
+ I422ToTransientMipsRGB
+// Arranging into abgr format
+ "precr.qb.ph $t0, $t8, $t1 \n" // |G1|g1|R1|r1|
+ "precr.qb.ph $t3, $t9, $t2 \n" // |G0|g0|R0|r0|
+ "precrq.qb.ph $t8, $t0, $t3 \n" // |G1|R1|G0|R0|
+ "precr.qb.ph $t9, $t0, $t3 \n" // |g1|r1|g0|r0|
+
+ "precr.qb.ph $t2, $t4, $t5 \n" // |B1|b1|B0|b0|
+ "addiu %[width], -4 \n"
+ "addiu %[y_buf], 4 \n"
+ "preceu.ph.qbla $t1, $t2 \n" // |0 |B1|0 |B0|
+ "preceu.ph.qbra $t2, $t2 \n" // |0 |b1|0 |b0|
+ "or $t1, $t1, $s6 \n" // |ff|B1|ff|B0|
+ "or $t2, $t2, $s6 \n" // |ff|b1|ff|b0|
+ "precrq.ph.w $t0, $t2, $t9 \n" // |ff|b1|g1|r1|
+ "precrq.ph.w $t3, $t1, $t8 \n" // |ff|B1|G1|R1|
+ "sll $t9, $t9, 16 \n"
+ "sll $t8, $t8, 16 \n"
+ "packrl.ph $t2, $t2, $t9 \n" // |ff|b0|g0|r0|
+ "packrl.ph $t1, $t1, $t8 \n" // |ff|B0|G0|R0|
+// Store results.
+ "sw $t2, 0(%[rgb_buf]) \n"
+ "sw $t0, 4(%[rgb_buf]) \n"
+ "sw $t1, 8(%[rgb_buf]) \n"
+ "sw $t3, 12(%[rgb_buf]) \n"
+ "bnez %[width], 1b \n"
+ " addiu %[rgb_buf], 16 \n"
+ "2: \n"
+ ".set pop \n"
+ :[y_buf] "+r" (y_buf),
+ [u_buf] "+r" (u_buf),
+ [v_buf] "+r" (v_buf),
+ [width] "+r" (width),
+ [rgb_buf] "+r" (rgb_buf)
+ :
+ : "t0", "t1", "t2", "t3", "t4", "t5",
+ "t6", "t7", "t8", "t9",
+ "s0", "s1", "s2", "s3",
+ "s4", "s5", "s6"
+ );
+}
+
+void I422ToBGRARow_MIPS_DSPR2(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* rgb_buf,
+ int width) {
+ __asm__ __volatile__ (
+ ".set push \n"
+ ".set noreorder \n"
+ "beqz %[width], 2f \n"
+ " repl.ph $s0, 74 \n" // |YG|YG| = |74 |74 |
+ "repl.ph $s1, -25 \n" // |UG|UG| = |-25|-25|
+ "repl.ph $s2, -52 \n" // |VG|VG| = |-52|-52|
+ "repl.ph $s3, 102 \n" // |VR|VR| = |102|102|
+ "repl.ph $s4, 16 \n" // |0|16|0|16|
+ "repl.ph $s5, 128 \n" // |128|128|
+ "lui $s6, 0xff \n"
+ "ori $s6, 0xff \n" // |00|ff|00|ff|
+
+ ".p2align 2 \n"
+ "1: \n"
+ I422ToTransientMipsRGB
+ // Arranging into bgra format
+ "precr.qb.ph $t4, $t4, $t8 \n" // |B1|b1|G1|g1|
+ "precr.qb.ph $t5, $t5, $t9 \n" // |B0|b0|G0|g0|
+ "precrq.qb.ph $t8, $t4, $t5 \n" // |B1|G1|B0|G0|
+ "precr.qb.ph $t9, $t4, $t5 \n" // |b1|g1|b0|g0|
+
+ "precr.qb.ph $t2, $t1, $t2 \n" // |R1|r1|R0|r0|
+ "addiu %[width], -4 \n"
+ "addiu %[y_buf], 4 \n"
+ "preceu.ph.qbla $t1, $t2 \n" // |0 |R1|0 |R0|
+ "preceu.ph.qbra $t2, $t2 \n" // |0 |r1|0 |r0|
+ "sll $t1, $t1, 8 \n" // |R1|0 |R0|0 |
+ "sll $t2, $t2, 8 \n" // |r1|0 |r0|0 |
+ "or $t1, $t1, $s6 \n" // |R1|ff|R0|ff|
+ "or $t2, $t2, $s6 \n" // |r1|ff|r0|ff|
+ "precrq.ph.w $t0, $t9, $t2 \n" // |b1|g1|r1|ff|
+ "precrq.ph.w $t3, $t8, $t1 \n" // |B1|G1|R1|ff|
+ "sll $t1, $t1, 16 \n"
+ "sll $t2, $t2, 16 \n"
+ "packrl.ph $t2, $t9, $t2 \n" // |b0|g0|r0|ff|
+ "packrl.ph $t1, $t8, $t1 \n" // |B0|G0|R0|ff|
+// Store results.
+ "sw $t2, 0(%[rgb_buf]) \n"
+ "sw $t0, 4(%[rgb_buf]) \n"
+ "sw $t1, 8(%[rgb_buf]) \n"
+ "sw $t3, 12(%[rgb_buf]) \n"
+ "bnez %[width], 1b \n"
+ " addiu %[rgb_buf], 16 \n"
+ "2: \n"
+ ".set pop \n"
+ :[y_buf] "+r" (y_buf),
+ [u_buf] "+r" (u_buf),
+ [v_buf] "+r" (v_buf),
+ [width] "+r" (width),
+ [rgb_buf] "+r" (rgb_buf)
+ :
+ : "t0", "t1", "t2", "t3", "t4", "t5",
+ "t6", "t7", "t8", "t9",
+ "s0", "s1", "s2", "s3",
+ "s4", "s5", "s6"
+ );
+}
+
// Bilinear filter 8x2 -> 8x1
-void InterpolateRow_DSPR2(uint8* dst_ptr, const uint8* src_ptr,
- ptrdiff_t src_stride, int dst_width,
- int source_y_fraction) {
+void InterpolateRow_MIPS_DSPR2(uint8* dst_ptr, const uint8* src_ptr,
+ ptrdiff_t src_stride, int dst_width,
+ int source_y_fraction) {
int y0_fraction = 256 - source_y_fraction;
const uint8* src_ptr1 = src_ptr + src_stride;
@@ -729,6 +857,7 @@
"replv.ph $t0, %[y0_fraction] \n"
"replv.ph $t1, %[source_y_fraction] \n"
+ ".p2align 2 \n"
"1: \n"
"lw $t2, 0(%[src_ptr]) \n"
"lw $t3, 0(%[src_ptr1]) \n"
--- a/third_party/libyuv/source/row_neon.cc
+++ b/third_party/libyuv/source/row_neon.cc
@@ -93,7 +93,7 @@
"vuzp.u8 d2, d3 \n" \
"vtrn.u32 d2, d3 \n"
-#define YUVTORGB_SETUP \
+#define YUV422TORGB_SETUP_REG \
MEMACCESS([kUVToRB]) \
"vld1.8 {d24}, [%[kUVToRB]] \n" \
MEMACCESS([kUVToG]) \
@@ -107,7 +107,7 @@
MEMACCESS([kYToRgb]) \
"vld1.32 {d30[], d31[]}, [%[kYToRgb]] \n"
-#define YUVTORGB \
+#define YUV422TORGB \
"vmull.u8 q8, d2, d24 \n" /* u/v B/R component */\
"vmull.u8 q9, d2, d25 \n" /* u/v G component */\
"vmovl.u8 q0, d0 \n" /* Y */\
@@ -134,19 +134,52 @@
"vqshrun.s16 d22, q9, #6 \n" /* R */ \
"vqshrun.s16 d21, q0, #6 \n" /* G */
+// YUV to RGB conversion constants.
+// Y contribution to R,G,B. Scale and bias.
+#define YG 18997 /* round(1.164 * 64 * 256 * 256 / 257) */
+#define YGB 1160 /* 1.164 * 64 * 16 - adjusted for even error distribution */
+
+// U and V contributions to R,G,B.
+#define UB -128 /* -min(128, round(2.018 * 64)) */
+#define UG 25 /* -round(-0.391 * 64) */
+#define VG 52 /* -round(-0.813 * 64) */
+#define VR -102 /* -round(1.596 * 64) */
+
+// Bias values to subtract 16 from Y and 128 from U and V.
+#define BB (UB * 128 - YGB)
+#define BG (UG * 128 + VG * 128 - YGB)
+#define BR (VR * 128 - YGB)
+
+static uvec8 kUVToRB = { 128, 128, 128, 128, 102, 102, 102, 102,
+ 0, 0, 0, 0, 0, 0, 0, 0 };
+static uvec8 kUVToG = { 25, 25, 25, 25, 52, 52, 52, 52,
+ 0, 0, 0, 0, 0, 0, 0, 0 };
+static vec16 kUVBiasBGR = { BB, BG, BR, 0, 0, 0, 0, 0 };
+static vec32 kYToRgb = { 0x0101 * YG, 0, 0, 0 };
+
+#undef YG
+#undef YGB
+#undef UB
+#undef UG
+#undef VG
+#undef VR
+#undef BB
+#undef BG
+#undef BR
+
void I444ToARGBRow_NEON(const uint8* src_y,
const uint8* src_u,
const uint8* src_v,
uint8* dst_argb,
- const struct YuvConstants* yuvconstants,
int width) {
asm volatile (
- YUVTORGB_SETUP
- "vmov.u8 d23, #255 \n"
+ YUV422TORGB_SETUP_REG
+ ".p2align 2 \n"
"1: \n"
READYUV444
- YUVTORGB
+ YUV422TORGB
"subs %4, %4, #8 \n"
+ "vmov.u8 d23, #255 \n"
MEMACCESS(3)
"vst4.8 {d20, d21, d22, d23}, [%3]! \n"
"bgt 1b \n"
@@ -155,10 +188,10 @@
"+r"(src_v), // %2
"+r"(dst_argb), // %3
"+r"(width) // %4
- : [kUVToRB]"r"(&yuvconstants->kUVToRB),
- [kUVToG]"r"(&yuvconstants->kUVToG),
- [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
- [kYToRgb]"r"(&yuvconstants->kYToRgb)
+ : [kUVToRB]"r"(&kUVToRB), // %5
+ [kUVToG]"r"(&kUVToG), // %6
+ [kUVBiasBGR]"r"(&kUVBiasBGR),
+ [kYToRgb]"r"(&kYToRgb)
: "cc", "memory", "q0", "q1", "q2", "q3", "q4",
"q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
);
@@ -168,15 +201,15 @@
const uint8* src_u,
const uint8* src_v,
uint8* dst_argb,
- const struct YuvConstants* yuvconstants,
int width) {
asm volatile (
- YUVTORGB_SETUP
- "vmov.u8 d23, #255 \n"
+ YUV422TORGB_SETUP_REG
+ ".p2align 2 \n"
"1: \n"
READYUV422
- YUVTORGB
+ YUV422TORGB
"subs %4, %4, #8 \n"
+ "vmov.u8 d23, #255 \n"
MEMACCESS(3)
"vst4.8 {d20, d21, d22, d23}, [%3]! \n"
"bgt 1b \n"
@@ -185,61 +218,90 @@
"+r"(src_v), // %2
"+r"(dst_argb), // %3
"+r"(width) // %4
- : [kUVToRB]"r"(&yuvconstants->kUVToRB),
- [kUVToG]"r"(&yuvconstants->kUVToG),
- [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
- [kYToRgb]"r"(&yuvconstants->kYToRgb)
+ : [kUVToRB]"r"(&kUVToRB), // %5
+ [kUVToG]"r"(&kUVToG), // %6
+ [kUVBiasBGR]"r"(&kUVBiasBGR),
+ [kYToRgb]"r"(&kYToRgb)
: "cc", "memory", "q0", "q1", "q2", "q3", "q4",
"q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
);
}
-void I422AlphaToARGBRow_NEON(const uint8* src_y,
- const uint8* src_u,
- const uint8* src_v,
- const uint8* src_a,
- uint8* dst_argb,
- const struct YuvConstants* yuvconstants,
- int width) {
+void I411ToARGBRow_NEON(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb,
+ int width) {
asm volatile (
- YUVTORGB_SETUP
+ YUV422TORGB_SETUP_REG
+ ".p2align 2 \n"
"1: \n"
+ READYUV411
+ YUV422TORGB
+ "subs %4, %4, #8 \n"
+ "vmov.u8 d23, #255 \n"
+ MEMACCESS(3)
+ "vst4.8 {d20, d21, d22, d23}, [%3]! \n"
+ "bgt 1b \n"
+ : "+r"(src_y), // %0
+ "+r"(src_u), // %1
+ "+r"(src_v), // %2
+ "+r"(dst_argb), // %3
+ "+r"(width) // %4
+ : [kUVToRB]"r"(&kUVToRB), // %5
+ [kUVToG]"r"(&kUVToG), // %6
+ [kUVBiasBGR]"r"(&kUVBiasBGR),
+ [kYToRgb]"r"(&kYToRgb)
+ : "cc", "memory", "q0", "q1", "q2", "q3", "q4",
+ "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+ );
+}
+
+void I422ToBGRARow_NEON(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_bgra,
+ int width) {
+ asm volatile (
+ YUV422TORGB_SETUP_REG
+ ".p2align 2 \n"
+ "1: \n"
READYUV422
- YUVTORGB
- "subs %5, %5, #8 \n"
+ YUV422TORGB
+ "subs %4, %4, #8 \n"
+ "vswp.u8 d20, d22 \n"
+ "vmov.u8 d19, #255 \n"
MEMACCESS(3)
- "vld1.8 {d23}, [%3]! \n"
- MEMACCESS(4)
- "vst4.8 {d20, d21, d22, d23}, [%4]! \n"
+ "vst4.8 {d19, d20, d21, d22}, [%3]! \n"
"bgt 1b \n"
: "+r"(src_y), // %0
"+r"(src_u), // %1
"+r"(src_v), // %2
- "+r"(src_a), // %3
- "+r"(dst_argb), // %4
- "+r"(width) // %5
- : [kUVToRB]"r"(&yuvconstants->kUVToRB),
- [kUVToG]"r"(&yuvconstants->kUVToG),
- [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
- [kYToRgb]"r"(&yuvconstants->kYToRgb)
+ "+r"(dst_bgra), // %3
+ "+r"(width) // %4
+ : [kUVToRB]"r"(&kUVToRB), // %5
+ [kUVToG]"r"(&kUVToG), // %6
+ [kUVBiasBGR]"r"(&kUVBiasBGR),
+ [kYToRgb]"r"(&kYToRgb)
: "cc", "memory", "q0", "q1", "q2", "q3", "q4",
"q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
);
}
-void I411ToARGBRow_NEON(const uint8* src_y,
+void I422ToABGRRow_NEON(const uint8* src_y,
const uint8* src_u,
const uint8* src_v,
- uint8* dst_argb,
- const struct YuvConstants* yuvconstants,
+ uint8* dst_abgr,
int width) {
asm volatile (
- YUVTORGB_SETUP
- "vmov.u8 d23, #255 \n"
+ YUV422TORGB_SETUP_REG
+ ".p2align 2 \n"
"1: \n"
- READYUV411
- YUVTORGB
+ READYUV422
+ YUV422TORGB
"subs %4, %4, #8 \n"
+ "vswp.u8 d20, d22 \n"
+ "vmov.u8 d23, #255 \n"
MEMACCESS(3)
"vst4.8 {d20, d21, d22, d23}, [%3]! \n"
"bgt 1b \n"
@@ -246,12 +308,12 @@
: "+r"(src_y), // %0
"+r"(src_u), // %1
"+r"(src_v), // %2
- "+r"(dst_argb), // %3
+ "+r"(dst_abgr), // %3
"+r"(width) // %4
- : [kUVToRB]"r"(&yuvconstants->kUVToRB),
- [kUVToG]"r"(&yuvconstants->kUVToG),
- [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
- [kYToRgb]"r"(&yuvconstants->kYToRgb)
+ : [kUVToRB]"r"(&kUVToRB), // %5
+ [kUVToG]"r"(&kUVToG), // %6
+ [kUVBiasBGR]"r"(&kUVBiasBGR),
+ [kYToRgb]"r"(&kYToRgb)
: "cc", "memory", "q0", "q1", "q2", "q3", "q4",
"q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
);
@@ -261,15 +323,15 @@
const uint8* src_u,
const uint8* src_v,
uint8* dst_rgba,
- const struct YuvConstants* yuvconstants,
int width) {
asm volatile (
- YUVTORGB_SETUP
+ YUV422TORGB_SETUP_REG
+ ".p2align 2 \n"
"1: \n"
READYUV422
- YUVTORGB
+ YUV422TORGB
"subs %4, %4, #8 \n"
- "vmov.u8 d19, #255 \n" // d19 modified by YUVTORGB
+ "vmov.u8 d19, #255 \n"
MEMACCESS(3)
"vst4.8 {d19, d20, d21, d22}, [%3]! \n"
"bgt 1b \n"
@@ -278,10 +340,10 @@
"+r"(src_v), // %2
"+r"(dst_rgba), // %3
"+r"(width) // %4
- : [kUVToRB]"r"(&yuvconstants->kUVToRB),
- [kUVToG]"r"(&yuvconstants->kUVToG),
- [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
- [kYToRgb]"r"(&yuvconstants->kYToRgb)
+ : [kUVToRB]"r"(&kUVToRB), // %5
+ [kUVToG]"r"(&kUVToG), // %6
+ [kUVBiasBGR]"r"(&kUVBiasBGR),
+ [kYToRgb]"r"(&kYToRgb)
: "cc", "memory", "q0", "q1", "q2", "q3", "q4",
"q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
);
@@ -291,13 +353,13 @@
const uint8* src_u,
const uint8* src_v,
uint8* dst_rgb24,
- const struct YuvConstants* yuvconstants,
int width) {
asm volatile (
- YUVTORGB_SETUP
+ YUV422TORGB_SETUP_REG
+ ".p2align 2 \n"
"1: \n"
READYUV422
- YUVTORGB
+ YUV422TORGB
"subs %4, %4, #8 \n"
MEMACCESS(3)
"vst3.8 {d20, d21, d22}, [%3]! \n"
@@ -307,33 +369,68 @@
"+r"(src_v), // %2
"+r"(dst_rgb24), // %3
"+r"(width) // %4
- : [kUVToRB]"r"(&yuvconstants->kUVToRB),
- [kUVToG]"r"(&yuvconstants->kUVToG),
- [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
- [kYToRgb]"r"(&yuvconstants->kYToRgb)
+ : [kUVToRB]"r"(&kUVToRB), // %5
+ [kUVToG]"r"(&kUVToG), // %6
+ [kUVBiasBGR]"r"(&kUVBiasBGR),
+ [kYToRgb]"r"(&kYToRgb)
: "cc", "memory", "q0", "q1", "q2", "q3", "q4",
"q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
);
}
+void I422ToRAWRow_NEON(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_raw,
+ int width) {
+ asm volatile (
+ YUV422TORGB_SETUP_REG
+ ".p2align 2 \n"
+ "1: \n"
+ READYUV422
+ YUV422TORGB
+ "subs %4, %4, #8 \n"
+ "vswp.u8 d20, d22 \n"
+ MEMACCESS(3)
+ "vst3.8 {d20, d21, d22}, [%3]! \n"
+ "bgt 1b \n"
+ : "+r"(src_y), // %0
+ "+r"(src_u), // %1
+ "+r"(src_v), // %2
+ "+r"(dst_raw), // %3
+ "+r"(width) // %4
+ : [kUVToRB]"r"(&kUVToRB), // %5
+ [kUVToG]"r"(&kUVToG), // %6
+ [kUVBiasBGR]"r"(&kUVBiasBGR),
+ [kYToRgb]"r"(&kYToRgb)
+ : "cc", "memory", "q0", "q1", "q2", "q3", "q4",
+ "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+ );
+}
+
#define ARGBTORGB565 \
- "vshll.u8 q0, d22, #8 \n" /* R */ \
- "vshll.u8 q8, d21, #8 \n" /* G */ \
- "vshll.u8 q9, d20, #8 \n" /* B */ \
- "vsri.16 q0, q8, #5 \n" /* RG */ \
- "vsri.16 q0, q9, #11 \n" /* RGB */
+ "vshr.u8 d20, d20, #3 \n" /* B */ \
+ "vshr.u8 d21, d21, #2 \n" /* G */ \
+ "vshr.u8 d22, d22, #3 \n" /* R */ \
+ "vmovl.u8 q8, d20 \n" /* B */ \
+ "vmovl.u8 q9, d21 \n" /* G */ \
+ "vmovl.u8 q10, d22 \n" /* R */ \
+ "vshl.u16 q9, q9, #5 \n" /* G */ \
+ "vshl.u16 q10, q10, #11 \n" /* R */ \
+ "vorr q0, q8, q9 \n" /* BG */ \
+ "vorr q0, q0, q10 \n" /* BGR */
void I422ToRGB565Row_NEON(const uint8* src_y,
const uint8* src_u,
const uint8* src_v,
uint8* dst_rgb565,
- const struct YuvConstants* yuvconstants,
int width) {
asm volatile (
- YUVTORGB_SETUP
+ YUV422TORGB_SETUP_REG
+ ".p2align 2 \n"
"1: \n"
READYUV422
- YUVTORGB
+ YUV422TORGB
"subs %4, %4, #8 \n"
ARGBTORGB565
MEMACCESS(3)
@@ -344,10 +441,10 @@
"+r"(src_v), // %2
"+r"(dst_rgb565), // %3
"+r"(width) // %4
- : [kUVToRB]"r"(&yuvconstants->kUVToRB),
- [kUVToG]"r"(&yuvconstants->kUVToG),
- [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
- [kYToRgb]"r"(&yuvconstants->kYToRgb)
+ : [kUVToRB]"r"(&kUVToRB), // %5
+ [kUVToG]"r"(&kUVToG), // %6
+ [kUVBiasBGR]"r"(&kUVBiasBGR),
+ [kYToRgb]"r"(&kYToRgb)
: "cc", "memory", "q0", "q1", "q2", "q3", "q4",
"q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
);
@@ -354,25 +451,31 @@
}
#define ARGBTOARGB1555 \
- "vshll.u8 q0, d23, #8 \n" /* A */ \
- "vshll.u8 q8, d22, #8 \n" /* R */ \
- "vshll.u8 q9, d21, #8 \n" /* G */ \
- "vshll.u8 q10, d20, #8 \n" /* B */ \
- "vsri.16 q0, q8, #1 \n" /* AR */ \
- "vsri.16 q0, q9, #6 \n" /* ARG */ \
- "vsri.16 q0, q10, #11 \n" /* ARGB */
+ "vshr.u8 q10, q10, #3 \n" /* B */ \
+ "vshr.u8 d22, d22, #3 \n" /* R */ \
+ "vshr.u8 d23, d23, #7 \n" /* A */ \
+ "vmovl.u8 q8, d20 \n" /* B */ \
+ "vmovl.u8 q9, d21 \n" /* G */ \
+ "vmovl.u8 q10, d22 \n" /* R */ \
+ "vmovl.u8 q11, d23 \n" /* A */ \
+ "vshl.u16 q9, q9, #5 \n" /* G */ \
+ "vshl.u16 q10, q10, #10 \n" /* R */ \
+ "vshl.u16 q11, q11, #15 \n" /* A */ \
+ "vorr q0, q8, q9 \n" /* BG */ \
+ "vorr q1, q10, q11 \n" /* RA */ \
+ "vorr q0, q0, q1 \n" /* BGRA */
void I422ToARGB1555Row_NEON(const uint8* src_y,
const uint8* src_u,
const uint8* src_v,
uint8* dst_argb1555,
- const struct YuvConstants* yuvconstants,
int width) {
asm volatile (
- YUVTORGB_SETUP
+ YUV422TORGB_SETUP_REG
+ ".p2align 2 \n"
"1: \n"
READYUV422
- YUVTORGB
+ YUV422TORGB
"subs %4, %4, #8 \n"
"vmov.u8 d23, #255 \n"
ARGBTOARGB1555
@@ -384,10 +487,10 @@
"+r"(src_v), // %2
"+r"(dst_argb1555), // %3
"+r"(width) // %4
- : [kUVToRB]"r"(&yuvconstants->kUVToRB),
- [kUVToG]"r"(&yuvconstants->kUVToG),
- [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
- [kYToRgb]"r"(&yuvconstants->kYToRgb)
+ : [kUVToRB]"r"(&kUVToRB), // %5
+ [kUVToG]"r"(&kUVToG), // %6
+ [kUVBiasBGR]"r"(&kUVBiasBGR),
+ [kYToRgb]"r"(&kYToRgb)
: "cc", "memory", "q0", "q1", "q2", "q3", "q4",
"q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
);
@@ -406,14 +509,14 @@
const uint8* src_u,
const uint8* src_v,
uint8* dst_argb4444,
- const struct YuvConstants* yuvconstants,
int width) {
asm volatile (
- YUVTORGB_SETUP
+ YUV422TORGB_SETUP_REG
"vmov.u8 d4, #0x0f \n" // bits to clear with vbic.
+ ".p2align 2 \n"
"1: \n"
READYUV422
- YUVTORGB
+ YUV422TORGB
"subs %4, %4, #8 \n"
"vmov.u8 d23, #255 \n"
ARGBTOARGB4444
@@ -425,10 +528,10 @@
"+r"(src_v), // %2
"+r"(dst_argb4444), // %3
"+r"(width) // %4
- : [kUVToRB]"r"(&yuvconstants->kUVToRB),
- [kUVToG]"r"(&yuvconstants->kUVToG),
- [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
- [kYToRgb]"r"(&yuvconstants->kYToRgb)
+ : [kUVToRB]"r"(&kUVToRB), // %5
+ [kUVToG]"r"(&kUVToG), // %6
+ [kUVBiasBGR]"r"(&kUVBiasBGR),
+ [kYToRgb]"r"(&kYToRgb)
: "cc", "memory", "q0", "q1", "q2", "q3", "q4",
"q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
);
@@ -438,12 +541,13 @@
uint8* dst_argb,
int width) {
asm volatile (
- YUVTORGB_SETUP
- "vmov.u8 d23, #255 \n"
+ YUV422TORGB_SETUP_REG
+ ".p2align 2 \n"
"1: \n"
READYUV400
- YUVTORGB
+ YUV422TORGB
"subs %2, %2, #8 \n"
+ "vmov.u8 d23, #255 \n"
MEMACCESS(1)
"vst4.8 {d20, d21, d22, d23}, [%1]! \n"
"bgt 1b \n"
@@ -450,10 +554,10 @@
: "+r"(src_y), // %0
"+r"(dst_argb), // %1
"+r"(width) // %2
- : [kUVToRB]"r"(&kYuvI601Constants.kUVToRB),
- [kUVToG]"r"(&kYuvI601Constants.kUVToG),
- [kUVBiasBGR]"r"(&kYuvI601Constants.kUVBiasBGR),
- [kYToRgb]"r"(&kYuvI601Constants.kYToRgb)
+ : [kUVToRB]"r"(&kUVToRB), // %3
+ [kUVToG]"r"(&kUVToG), // %4
+ [kUVBiasBGR]"r"(&kUVBiasBGR),
+ [kYToRgb]"r"(&kYToRgb)
: "cc", "memory", "q0", "q1", "q2", "q3", "q4",
"q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
);
@@ -464,6 +568,7 @@
int width) {
asm volatile (
"vmov.u8 d23, #255 \n"
+ ".p2align 2 \n"
"1: \n"
MEMACCESS(0)
"vld1.8 {d20}, [%0]! \n"
@@ -484,15 +589,15 @@
void NV12ToARGBRow_NEON(const uint8* src_y,
const uint8* src_uv,
uint8* dst_argb,
- const struct YuvConstants* yuvconstants,
int width) {
asm volatile (
- YUVTORGB_SETUP
- "vmov.u8 d23, #255 \n"
+ YUV422TORGB_SETUP_REG
+ ".p2align 2 \n"
"1: \n"
READNV12
- YUVTORGB
+ YUV422TORGB
"subs %3, %3, #8 \n"
+ "vmov.u8 d23, #255 \n"
MEMACCESS(2)
"vst4.8 {d20, d21, d22, d23}, [%2]! \n"
"bgt 1b \n"
@@ -500,10 +605,10 @@
"+r"(src_uv), // %1
"+r"(dst_argb), // %2
"+r"(width) // %3
- : [kUVToRB]"r"(&yuvconstants->kUVToRB),
- [kUVToG]"r"(&yuvconstants->kUVToG),
- [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
- [kYToRgb]"r"(&yuvconstants->kYToRgb)
+ : [kUVToRB]"r"(&kUVToRB), // %4
+ [kUVToG]"r"(&kUVToG), // %5
+ [kUVBiasBGR]"r"(&kUVBiasBGR),
+ [kYToRgb]"r"(&kYToRgb)
: "cc", "memory", "q0", "q1", "q2", "q3", "q4",
"q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
);
@@ -510,28 +615,28 @@
}
void NV21ToARGBRow_NEON(const uint8* src_y,
- const uint8* src_vu,
+ const uint8* src_uv,
uint8* dst_argb,
- const struct YuvConstants* yuvconstants,
int width) {
asm volatile (
- YUVTORGB_SETUP
- "vmov.u8 d23, #255 \n"
+ YUV422TORGB_SETUP_REG
+ ".p2align 2 \n"
"1: \n"
READNV21
- YUVTORGB
+ YUV422TORGB
"subs %3, %3, #8 \n"
+ "vmov.u8 d23, #255 \n"
MEMACCESS(2)
"vst4.8 {d20, d21, d22, d23}, [%2]! \n"
"bgt 1b \n"
: "+r"(src_y), // %0
- "+r"(src_vu), // %1
+ "+r"(src_uv), // %1
"+r"(dst_argb), // %2
"+r"(width) // %3
- : [kUVToRB]"r"(&yuvconstants->kUVToRB),
- [kUVToG]"r"(&yuvconstants->kUVToG),
- [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
- [kYToRgb]"r"(&yuvconstants->kYToRgb)
+ : [kUVToRB]"r"(&kUVToRB), // %4
+ [kUVToG]"r"(&kUVToG), // %5
+ [kUVBiasBGR]"r"(&kUVBiasBGR),
+ [kYToRgb]"r"(&kYToRgb)
: "cc", "memory", "q0", "q1", "q2", "q3", "q4",
"q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
);
@@ -540,13 +645,13 @@
void NV12ToRGB565Row_NEON(const uint8* src_y,
const uint8* src_uv,
uint8* dst_rgb565,
- const struct YuvConstants* yuvconstants,
int width) {
asm volatile (
- YUVTORGB_SETUP
+ YUV422TORGB_SETUP_REG
+ ".p2align 2 \n"
"1: \n"
READNV12
- YUVTORGB
+ YUV422TORGB
"subs %3, %3, #8 \n"
ARGBTORGB565
MEMACCESS(2)
@@ -556,26 +661,54 @@
"+r"(src_uv), // %1
"+r"(dst_rgb565), // %2
"+r"(width) // %3
- : [kUVToRB]"r"(&yuvconstants->kUVToRB),
- [kUVToG]"r"(&yuvconstants->kUVToG),
- [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
- [kYToRgb]"r"(&yuvconstants->kYToRgb)
+ : [kUVToRB]"r"(&kUVToRB), // %4
+ [kUVToG]"r"(&kUVToG), // %5
+ [kUVBiasBGR]"r"(&kUVBiasBGR),
+ [kYToRgb]"r"(&kYToRgb)
: "cc", "memory", "q0", "q1", "q2", "q3", "q4",
"q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
);
}
+void NV21ToRGB565Row_NEON(const uint8* src_y,
+ const uint8* src_uv,
+ uint8* dst_rgb565,
+ int width) {
+ asm volatile (
+ YUV422TORGB_SETUP_REG
+ ".p2align 2 \n"
+ "1: \n"
+ READNV21
+ YUV422TORGB
+ "subs %3, %3, #8 \n"
+ ARGBTORGB565
+ MEMACCESS(2)
+ "vst1.8 {q0}, [%2]! \n" // store 8 pixels RGB565.
+ "bgt 1b \n"
+ : "+r"(src_y), // %0
+ "+r"(src_uv), // %1
+ "+r"(dst_rgb565), // %2
+ "+r"(width) // %3
+ : [kUVToRB]"r"(&kUVToRB), // %4
+ [kUVToG]"r"(&kUVToG), // %5
+ [kUVBiasBGR]"r"(&kUVBiasBGR),
+ [kYToRgb]"r"(&kYToRgb)
+ : "cc", "memory", "q0", "q1", "q2", "q3", "q4",
+ "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+ );
+}
+
void YUY2ToARGBRow_NEON(const uint8* src_yuy2,
uint8* dst_argb,
- const struct YuvConstants* yuvconstants,
int width) {
asm volatile (
- YUVTORGB_SETUP
- "vmov.u8 d23, #255 \n"
+ YUV422TORGB_SETUP_REG
+ ".p2align 2 \n"
"1: \n"
READYUY2
- YUVTORGB
+ YUV422TORGB
"subs %2, %2, #8 \n"
+ "vmov.u8 d23, #255 \n"
MEMACCESS(1)
"vst4.8 {d20, d21, d22, d23}, [%1]! \n"
"bgt 1b \n"
@@ -582,10 +715,10 @@
: "+r"(src_yuy2), // %0
"+r"(dst_argb), // %1
"+r"(width) // %2
- : [kUVToRB]"r"(&yuvconstants->kUVToRB),
- [kUVToG]"r"(&yuvconstants->kUVToG),
- [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
- [kYToRgb]"r"(&yuvconstants->kYToRgb)
+ : [kUVToRB]"r"(&kUVToRB), // %3
+ [kUVToG]"r"(&kUVToG), // %4
+ [kUVBiasBGR]"r"(&kUVBiasBGR),
+ [kYToRgb]"r"(&kYToRgb)
: "cc", "memory", "q0", "q1", "q2", "q3", "q4",
"q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
);
@@ -593,15 +726,15 @@
void UYVYToARGBRow_NEON(const uint8* src_uyvy,
uint8* dst_argb,
- const struct YuvConstants* yuvconstants,
int width) {
asm volatile (
- YUVTORGB_SETUP
- "vmov.u8 d23, #255 \n"
+ YUV422TORGB_SETUP_REG
+ ".p2align 2 \n"
"1: \n"
READUYVY
- YUVTORGB
+ YUV422TORGB
"subs %2, %2, #8 \n"
+ "vmov.u8 d23, #255 \n"
MEMACCESS(1)
"vst4.8 {d20, d21, d22, d23}, [%1]! \n"
"bgt 1b \n"
@@ -608,10 +741,10 @@
: "+r"(src_uyvy), // %0
"+r"(dst_argb), // %1
"+r"(width) // %2
- : [kUVToRB]"r"(&yuvconstants->kUVToRB),
- [kUVToG]"r"(&yuvconstants->kUVToG),
- [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
- [kYToRgb]"r"(&yuvconstants->kYToRgb)
+ : [kUVToRB]"r"(&kUVToRB), // %3
+ [kUVToG]"r"(&kUVToG), // %4
+ [kUVBiasBGR]"r"(&kUVBiasBGR),
+ [kYToRgb]"r"(&kYToRgb)
: "cc", "memory", "q0", "q1", "q2", "q3", "q4",
"q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
);
@@ -621,6 +754,7 @@
void SplitUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
int width) {
asm volatile (
+ ".p2align 2 \n"
"1: \n"
MEMACCESS(0)
"vld2.8 {q0, q1}, [%0]! \n" // load 16 pairs of UV
@@ -643,6 +777,7 @@
void MergeUVRow_NEON(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
int width) {
asm volatile (
+ ".p2align 2 \n"
"1: \n"
MEMACCESS(0)
"vld1.8 {q0}, [%0]! \n" // load U
@@ -665,6 +800,7 @@
// Copy multiple of 32. vld4.8 allow unaligned and is fastest on a15.
void CopyRow_NEON(const uint8* src, uint8* dst, int count) {
asm volatile (
+ ".p2align 2 \n"
"1: \n"
MEMACCESS(0)
"vld1.8 {d0, d1, d2, d3}, [%0]! \n" // load 32
@@ -719,6 +855,7 @@
"add %0, %0, %2 \n"
"sub %0, #16 \n"
+ ".p2align 2 \n"
"1: \n"
MEMACCESS(0)
"vld1.8 {q0}, [%0], r3 \n" // src -= 16
@@ -745,6 +882,7 @@
"add %0, %0, %3, lsl #1 \n"
"sub %0, #16 \n"
+ ".p2align 2 \n"
"1: \n"
MEMACCESS(0)
"vld2.8 {d0, d1}, [%0], r12 \n" // src -= 16
@@ -771,6 +909,7 @@
"add %0, %0, %2, lsl #2 \n"
"sub %0, #16 \n"
+ ".p2align 2 \n"
"1: \n"
MEMACCESS(0)
"vld1.8 {q0}, [%0], r3 \n" // src -= 16
@@ -789,9 +928,10 @@
);
}
-void RGB24ToARGBRow_NEON(const uint8* src_rgb24, uint8* dst_argb, int width) {
+void RGB24ToARGBRow_NEON(const uint8* src_rgb24, uint8* dst_argb, int pix) {
asm volatile (
"vmov.u8 d4, #255 \n" // Alpha
+ ".p2align 2 \n"
"1: \n"
MEMACCESS(0)
"vld3.8 {d1, d2, d3}, [%0]! \n" // load 8 pixels of RGB24.
@@ -801,15 +941,16 @@
"bgt 1b \n"
: "+r"(src_rgb24), // %0
"+r"(dst_argb), // %1
- "+r"(width) // %2
+ "+r"(pix) // %2
:
: "cc", "memory", "d1", "d2", "d3", "d4" // Clobber List
);
}
-void RAWToARGBRow_NEON(const uint8* src_raw, uint8* dst_argb, int width) {
+void RAWToARGBRow_NEON(const uint8* src_raw, uint8* dst_argb, int pix) {
asm volatile (
"vmov.u8 d4, #255 \n" // Alpha
+ ".p2align 2 \n"
"1: \n"
MEMACCESS(0)
"vld3.8 {d1, d2, d3}, [%0]! \n" // load 8 pixels of RAW.
@@ -820,30 +961,12 @@
"bgt 1b \n"
: "+r"(src_raw), // %0
"+r"(dst_argb), // %1
- "+r"(width) // %2
+ "+r"(pix) // %2
:
: "cc", "memory", "d1", "d2", "d3", "d4" // Clobber List
);
}
-void RAWToRGB24Row_NEON(const uint8* src_raw, uint8* dst_rgb24, int width) {
- asm volatile (
- "1: \n"
- MEMACCESS(0)
- "vld3.8 {d1, d2, d3}, [%0]! \n" // load 8 pixels of RAW.
- "subs %2, %2, #8 \n" // 8 processed per loop.
- "vswp.u8 d1, d3 \n" // swap R, B
- MEMACCESS(1)
- "vst3.8 {d1, d2, d3}, [%1]! \n" // store 8 pixels of RGB24.
- "bgt 1b \n"
- : "+r"(src_raw), // %0
- "+r"(dst_rgb24), // %1
- "+r"(width) // %2
- :
- : "cc", "memory", "d1", "d2", "d3" // Clobber List
- );
-}
-
#define RGB565TOARGB \
"vshrn.u16 d6, q0, #5 \n" /* G xxGGGGGG */ \
"vuzp.u8 d0, d1 \n" /* d0 xxxBBBBB RRRRRxxx */ \
@@ -856,9 +979,10 @@
"vorr.u8 d2, d1, d5 \n" /* R */ \
"vorr.u8 d1, d4, d6 \n" /* G */
-void RGB565ToARGBRow_NEON(const uint8* src_rgb565, uint8* dst_argb, int width) {
+void RGB565ToARGBRow_NEON(const uint8* src_rgb565, uint8* dst_argb, int pix) {
asm volatile (
"vmov.u8 d3, #255 \n" // Alpha
+ ".p2align 2 \n"
"1: \n"
MEMACCESS(0)
"vld1.8 {q0}, [%0]! \n" // load 8 RGB565 pixels.
@@ -869,7 +993,7 @@
"bgt 1b \n"
: "+r"(src_rgb565), // %0
"+r"(dst_argb), // %1
- "+r"(width) // %2
+ "+r"(pix) // %2
:
: "cc", "memory", "q0", "q1", "q2", "q3" // Clobber List
);
@@ -903,9 +1027,10 @@
"vorr.u8 d1, d4, d6 \n" /* G */
void ARGB1555ToARGBRow_NEON(const uint8* src_argb1555, uint8* dst_argb,
- int width) {
+ int pix) {
asm volatile (
"vmov.u8 d3, #255 \n" // Alpha
+ ".p2align 2 \n"
"1: \n"
MEMACCESS(0)
"vld1.8 {q0}, [%0]! \n" // load 8 ARGB1555 pixels.
@@ -916,7 +1041,7 @@
"bgt 1b \n"
: "+r"(src_argb1555), // %0
"+r"(dst_argb), // %1
- "+r"(width) // %2
+ "+r"(pix) // %2
:
: "cc", "memory", "q0", "q1", "q2", "q3" // Clobber List
);
@@ -933,9 +1058,10 @@
"vswp.u8 d1, d2 \n" /* B,R,G,A -> B,G,R,A */
void ARGB4444ToARGBRow_NEON(const uint8* src_argb4444, uint8* dst_argb,
- int width) {
+ int pix) {
asm volatile (
"vmov.u8 d3, #255 \n" // Alpha
+ ".p2align 2 \n"
"1: \n"
MEMACCESS(0)
"vld1.8 {q0}, [%0]! \n" // load 8 ARGB4444 pixels.
@@ -946,14 +1072,15 @@
"bgt 1b \n"
: "+r"(src_argb4444), // %0
"+r"(dst_argb), // %1
- "+r"(width) // %2
+ "+r"(pix) // %2
:
: "cc", "memory", "q0", "q1", "q2" // Clobber List
);
}
-void ARGBToRGB24Row_NEON(const uint8* src_argb, uint8* dst_rgb24, int width) {
+void ARGBToRGB24Row_NEON(const uint8* src_argb, uint8* dst_rgb24, int pix) {
asm volatile (
+ ".p2align 2 \n"
"1: \n"
MEMACCESS(0)
"vld4.8 {d1, d2, d3, d4}, [%0]! \n" // load 8 pixels of ARGB.
@@ -963,14 +1090,15 @@
"bgt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_rgb24), // %1
- "+r"(width) // %2
+ "+r"(pix) // %2
:
: "cc", "memory", "d1", "d2", "d3", "d4" // Clobber List
);
}
-void ARGBToRAWRow_NEON(const uint8* src_argb, uint8* dst_raw, int width) {
+void ARGBToRAWRow_NEON(const uint8* src_argb, uint8* dst_raw, int pix) {
asm volatile (
+ ".p2align 2 \n"
"1: \n"
MEMACCESS(0)
"vld4.8 {d1, d2, d3, d4}, [%0]! \n" // load 8 pixels of ARGB.
@@ -981,14 +1109,15 @@
"bgt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_raw), // %1
- "+r"(width) // %2
+ "+r"(pix) // %2
:
: "cc", "memory", "d1", "d2", "d3", "d4" // Clobber List
);
}
-void YUY2ToYRow_NEON(const uint8* src_yuy2, uint8* dst_y, int width) {
+void YUY2ToYRow_NEON(const uint8* src_yuy2, uint8* dst_y, int pix) {
asm volatile (
+ ".p2align 2 \n"
"1: \n"
MEMACCESS(0)
"vld2.8 {q0, q1}, [%0]! \n" // load 16 pixels of YUY2.
@@ -998,14 +1127,15 @@
"bgt 1b \n"
: "+r"(src_yuy2), // %0
"+r"(dst_y), // %1
- "+r"(width) // %2
+ "+r"(pix) // %2
:
: "cc", "memory", "q0", "q1" // Clobber List
);
}
-void UYVYToYRow_NEON(const uint8* src_uyvy, uint8* dst_y, int width) {
+void UYVYToYRow_NEON(const uint8* src_uyvy, uint8* dst_y, int pix) {
asm volatile (
+ ".p2align 2 \n"
"1: \n"
MEMACCESS(0)
"vld2.8 {q0, q1}, [%0]! \n" // load 16 pixels of UYVY.
@@ -1015,7 +1145,7 @@
"bgt 1b \n"
: "+r"(src_uyvy), // %0
"+r"(dst_y), // %1
- "+r"(width) // %2
+ "+r"(pix) // %2
:
: "cc", "memory", "q0", "q1" // Clobber List
);
@@ -1022,8 +1152,9 @@
}
void YUY2ToUV422Row_NEON(const uint8* src_yuy2, uint8* dst_u, uint8* dst_v,
- int width) {
+ int pix) {
asm volatile (
+ ".p2align 2 \n"
"1: \n"
MEMACCESS(0)
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of YUY2.
@@ -1036,7 +1167,7 @@
: "+r"(src_yuy2), // %0
"+r"(dst_u), // %1
"+r"(dst_v), // %2
- "+r"(width) // %3
+ "+r"(pix) // %3
:
: "cc", "memory", "d0", "d1", "d2", "d3" // Clobber List
);
@@ -1043,8 +1174,9 @@
}
void UYVYToUV422Row_NEON(const uint8* src_uyvy, uint8* dst_u, uint8* dst_v,
- int width) {
+ int pix) {
asm volatile (
+ ".p2align 2 \n"
"1: \n"
MEMACCESS(0)
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of UYVY.
@@ -1057,7 +1189,7 @@
: "+r"(src_uyvy), // %0
"+r"(dst_u), // %1
"+r"(dst_v), // %2
- "+r"(width) // %3
+ "+r"(pix) // %3
:
: "cc", "memory", "d0", "d1", "d2", "d3" // Clobber List
);
@@ -1064,9 +1196,10 @@
}
void YUY2ToUVRow_NEON(const uint8* src_yuy2, int stride_yuy2,
- uint8* dst_u, uint8* dst_v, int width) {
+ uint8* dst_u, uint8* dst_v, int pix) {
asm volatile (
"add %1, %0, %1 \n" // stride + src_yuy2
+ ".p2align 2 \n"
"1: \n"
MEMACCESS(0)
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of YUY2.
@@ -1084,7 +1217,7 @@
"+r"(stride_yuy2), // %1
"+r"(dst_u), // %2
"+r"(dst_v), // %3
- "+r"(width) // %4
+ "+r"(pix) // %4
:
: "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7" // Clobber List
);
@@ -1091,9 +1224,10 @@
}
void UYVYToUVRow_NEON(const uint8* src_uyvy, int stride_uyvy,
- uint8* dst_u, uint8* dst_v, int width) {
+ uint8* dst_u, uint8* dst_v, int pix) {
asm volatile (
"add %1, %0, %1 \n" // stride + src_uyvy
+ ".p2align 2 \n"
"1: \n"
MEMACCESS(0)
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of UYVY.
@@ -1111,7 +1245,7 @@
"+r"(stride_uyvy), // %1
"+r"(dst_u), // %2
"+r"(dst_v), // %3
- "+r"(width) // %4
+ "+r"(pix) // %4
:
: "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7" // Clobber List
);
@@ -1119,7 +1253,7 @@
// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
void ARGBShuffleRow_NEON(const uint8* src_argb, uint8* dst_argb,
- const uint8* shuffler, int width) {
+ const uint8* shuffler, int pix) {
asm volatile (
MEMACCESS(3)
"vld1.8 {q2}, [%3] \n" // shuffler
@@ -1134,7 +1268,7 @@
"bgt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_argb), // %1
- "+r"(width) // %2
+ "+r"(pix) // %2
: "r"(shuffler) // %3
: "cc", "memory", "q0", "q1", "q2" // Clobber List
);
@@ -1145,6 +1279,7 @@
const uint8* src_v,
uint8* dst_yuy2, int width) {
asm volatile (
+ ".p2align 2 \n"
"1: \n"
MEMACCESS(0)
"vld2.8 {d0, d2}, [%0]! \n" // load 16 Ys
@@ -1171,6 +1306,7 @@
const uint8* src_v,
uint8* dst_uyvy, int width) {
asm volatile (
+ ".p2align 2 \n"
"1: \n"
MEMACCESS(0)
"vld2.8 {d1, d3}, [%0]! \n" // load 16 Ys
@@ -1192,8 +1328,9 @@
);
}
-void ARGBToRGB565Row_NEON(const uint8* src_argb, uint8* dst_rgb565, int width) {
+void ARGBToRGB565Row_NEON(const uint8* src_argb, uint8* dst_rgb565, int pix) {
asm volatile (
+ ".p2align 2 \n"
"1: \n"
MEMACCESS(0)
"vld4.8 {d20, d21, d22, d23}, [%0]! \n" // load 8 pixels of ARGB.
@@ -1204,7 +1341,7 @@
"bgt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_rgb565), // %1
- "+r"(width) // %2
+ "+r"(pix) // %2
:
: "cc", "memory", "q0", "q8", "q9", "q10", "q11"
);
@@ -1213,6 +1350,7 @@
void ARGBToRGB565DitherRow_NEON(const uint8* src_argb, uint8* dst_rgb,
const uint32 dither4, int width) {
asm volatile (
+ ".p2align 2 \n"
"vdup.32 d2, %2 \n" // dither4
"1: \n"
MEMACCESS(1)
@@ -1234,8 +1372,9 @@
}
void ARGBToARGB1555Row_NEON(const uint8* src_argb, uint8* dst_argb1555,
- int width) {
+ int pix) {
asm volatile (
+ ".p2align 2 \n"
"1: \n"
MEMACCESS(0)
"vld4.8 {d20, d21, d22, d23}, [%0]! \n" // load 8 pixels of ARGB.
@@ -1246,7 +1385,7 @@
"bgt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_argb1555), // %1
- "+r"(width) // %2
+ "+r"(pix) // %2
:
: "cc", "memory", "q0", "q8", "q9", "q10", "q11"
);
@@ -1253,9 +1392,10 @@
}
void ARGBToARGB4444Row_NEON(const uint8* src_argb, uint8* dst_argb4444,
- int width) {
+ int pix) {
asm volatile (
"vmov.u8 d4, #0x0f \n" // bits to clear with vbic.
+ ".p2align 2 \n"
"1: \n"
MEMACCESS(0)
"vld4.8 {d20, d21, d22, d23}, [%0]! \n" // load 8 pixels of ARGB.
@@ -1266,18 +1406,19 @@
"bgt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_argb4444), // %1
- "+r"(width) // %2
+ "+r"(pix) // %2
:
: "cc", "memory", "q0", "q8", "q9", "q10", "q11"
);
}
-void ARGBToYRow_NEON(const uint8* src_argb, uint8* dst_y, int width) {
+void ARGBToYRow_NEON(const uint8* src_argb, uint8* dst_y, int pix) {
asm volatile (
"vmov.u8 d24, #13 \n" // B * 0.1016 coefficient
"vmov.u8 d25, #65 \n" // G * 0.5078 coefficient
"vmov.u8 d26, #33 \n" // R * 0.2578 coefficient
"vmov.u8 d27, #16 \n" // Add 16 constant
+ ".p2align 2 \n"
"1: \n"
MEMACCESS(0)
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels.
@@ -1292,35 +1433,18 @@
"bgt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_y), // %1
- "+r"(width) // %2
+ "+r"(pix) // %2
:
: "cc", "memory", "q0", "q1", "q2", "q12", "q13"
);
}
-void ARGBExtractAlphaRow_NEON(const uint8* src_argb, uint8* dst_a, int width) {
+void ARGBToYJRow_NEON(const uint8* src_argb, uint8* dst_y, int pix) {
asm volatile (
- "1: \n"
- MEMACCESS(0)
- "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels
- "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels
- "subs %2, %2, #16 \n" // 16 processed per loop
- MEMACCESS(1)
- "vst1.8 {q3}, [%1]! \n" // store 16 A's.
- "bgt 1b \n"
- : "+r"(src_argb), // %0
- "+r"(dst_a), // %1
- "+r"(width) // %2
- :
- : "cc", "memory", "q0", "q1", "q2", "q3" // Clobber List
- );
-}
-
-void ARGBToYJRow_NEON(const uint8* src_argb, uint8* dst_y, int width) {
- asm volatile (
"vmov.u8 d24, #15 \n" // B * 0.11400 coefficient
"vmov.u8 d25, #75 \n" // G * 0.58700 coefficient
"vmov.u8 d26, #38 \n" // R * 0.29900 coefficient
+ ".p2align 2 \n"
"1: \n"
MEMACCESS(0)
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels.
@@ -1334,7 +1458,7 @@
"bgt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_y), // %1
- "+r"(width) // %2
+ "+r"(pix) // %2
:
: "cc", "memory", "q0", "q1", "q2", "q12", "q13"
);
@@ -1342,7 +1466,7 @@
// 8x1 pixels.
void ARGBToUV444Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
- int width) {
+ int pix) {
asm volatile (
"vmov.u8 d24, #112 \n" // UB / VR 0.875 coefficient
"vmov.u8 d25, #74 \n" // UG -0.5781 coefficient
@@ -1350,6 +1474,7 @@
"vmov.u8 d27, #18 \n" // VB -0.1406 coefficient
"vmov.u8 d28, #94 \n" // VG -0.7344 coefficient
"vmov.u16 q15, #0x8080 \n" // 128.5
+ ".p2align 2 \n"
"1: \n"
MEMACCESS(0)
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels.
@@ -1375,15 +1500,65 @@
: "+r"(src_argb), // %0
"+r"(dst_u), // %1
"+r"(dst_v), // %2
- "+r"(width) // %3
+ "+r"(pix) // %3
:
: "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q12", "q13", "q14", "q15"
);
}
-// 32x1 pixels -> 8x1. width is number of argb pixels. e.g. 32.
+// 16x1 pixels -> 8x1. pix is number of argb pixels. e.g. 16.
+void ARGBToUV422Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
+ int pix) {
+ asm volatile (
+ "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient
+ "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient
+ "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient
+ "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
+ "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
+ "vmov.u16 q15, #0x8080 \n" // 128.5
+ ".p2align 2 \n"
+ "1: \n"
+ MEMACCESS(0)
+ "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels.
+ MEMACCESS(0)
+ "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels.
+
+ "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts.
+ "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts.
+ "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts.
+
+ "subs %3, %3, #16 \n" // 16 processed per loop.
+ "vmul.s16 q8, q0, q10 \n" // B
+ "vmls.s16 q8, q1, q11 \n" // G
+ "vmls.s16 q8, q2, q12 \n" // R
+ "vadd.u16 q8, q8, q15 \n" // +128 -> unsigned
+
+ "vmul.s16 q9, q2, q10 \n" // R
+ "vmls.s16 q9, q1, q14 \n" // G
+ "vmls.s16 q9, q0, q13 \n" // B
+ "vadd.u16 q9, q9, q15 \n" // +128 -> unsigned
+
+ "vqshrn.u16 d0, q8, #8 \n" // 16 bit to 8 bit U
+ "vqshrn.u16 d1, q9, #8 \n" // 16 bit to 8 bit V
+
+ MEMACCESS(1)
+ "vst1.8 {d0}, [%1]! \n" // store 8 pixels U.
+ MEMACCESS(2)
+ "vst1.8 {d1}, [%2]! \n" // store 8 pixels V.
+ "bgt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+r"(pix) // %3
+ :
+ : "cc", "memory", "q0", "q1", "q2", "q3",
+ "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+ );
+}
+
+// 32x1 pixels -> 8x1. pix is number of argb pixels. e.g. 32.
void ARGBToUV411Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
- int width) {
+ int pix) {
asm volatile (
"vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient
"vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient
@@ -1391,6 +1566,7 @@
"vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
"vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
"vmov.u16 q15, #0x8080 \n" // 128.5
+ ".p2align 2 \n"
"1: \n"
MEMACCESS(0)
"vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels.
@@ -1437,7 +1613,7 @@
: "+r"(src_argb), // %0
"+r"(dst_u), // %1
"+r"(dst_v), // %2
- "+r"(width) // %3
+ "+r"(pix) // %3
:
: "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
"q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
@@ -1444,7 +1620,7 @@
);
}
-// 16x2 pixels -> 8x1. width is number of argb pixels. e.g. 16.
+// 16x2 pixels -> 8x1. pix is number of argb pixels. e.g. 16.
#define RGBTOUV(QB, QG, QR) \
"vmul.s16 q8, " #QB ", q10 \n" /* B */ \
"vmls.s16 q8, " #QG ", q11 \n" /* G */ \
@@ -1459,7 +1635,7 @@
// TODO(fbarchard): Consider vhadd vertical, then vpaddl horizontal, avoid shr.
void ARGBToUVRow_NEON(const uint8* src_argb, int src_stride_argb,
- uint8* dst_u, uint8* dst_v, int width) {
+ uint8* dst_u, uint8* dst_v, int pix) {
asm volatile (
"add %1, %0, %1 \n" // src_stride + src_argb
"vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient
@@ -1468,6 +1644,7 @@
"vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
"vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
"vmov.u16 q15, #0x8080 \n" // 128.5
+ ".p2align 2 \n"
"1: \n"
MEMACCESS(0)
"vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels.
@@ -1499,7 +1676,7 @@
"+r"(src_stride_argb), // %1
"+r"(dst_u), // %2
"+r"(dst_v), // %3
- "+r"(width) // %4
+ "+r"(pix) // %4
:
: "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
"q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
@@ -1508,7 +1685,7 @@
// TODO(fbarchard): Subsample match C code.
void ARGBToUVJRow_NEON(const uint8* src_argb, int src_stride_argb,
- uint8* dst_u, uint8* dst_v, int width) {
+ uint8* dst_u, uint8* dst_v, int pix) {
asm volatile (
"add %1, %0, %1 \n" // src_stride + src_argb
"vmov.s16 q10, #127 / 2 \n" // UB / VR 0.500 coefficient
@@ -1517,6 +1694,7 @@
"vmov.s16 q13, #20 / 2 \n" // VB -0.08131 coefficient
"vmov.s16 q14, #107 / 2 \n" // VG -0.41869 coefficient
"vmov.u16 q15, #0x8080 \n" // 128.5
+ ".p2align 2 \n"
"1: \n"
MEMACCESS(0)
"vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels.
@@ -1548,7 +1726,7 @@
"+r"(src_stride_argb), // %1
"+r"(dst_u), // %2
"+r"(dst_v), // %3
- "+r"(width) // %4
+ "+r"(pix) // %4
:
: "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
"q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
@@ -1556,7 +1734,7 @@
}
void BGRAToUVRow_NEON(const uint8* src_bgra, int src_stride_bgra,
- uint8* dst_u, uint8* dst_v, int width) {
+ uint8* dst_u, uint8* dst_v, int pix) {
asm volatile (
"add %1, %0, %1 \n" // src_stride + src_bgra
"vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient
@@ -1565,6 +1743,7 @@
"vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
"vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
"vmov.u16 q15, #0x8080 \n" // 128.5
+ ".p2align 2 \n"
"1: \n"
MEMACCESS(0)
"vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 BGRA pixels.
@@ -1596,7 +1775,7 @@
"+r"(src_stride_bgra), // %1
"+r"(dst_u), // %2
"+r"(dst_v), // %3
- "+r"(width) // %4
+ "+r"(pix) // %4
:
: "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
"q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
@@ -1604,7 +1783,7 @@
}
void ABGRToUVRow_NEON(const uint8* src_abgr, int src_stride_abgr,
- uint8* dst_u, uint8* dst_v, int width) {
+ uint8* dst_u, uint8* dst_v, int pix) {
asm volatile (
"add %1, %0, %1 \n" // src_stride + src_abgr
"vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient
@@ -1613,6 +1792,7 @@
"vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
"vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
"vmov.u16 q15, #0x8080 \n" // 128.5
+ ".p2align 2 \n"
"1: \n"
MEMACCESS(0)
"vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ABGR pixels.
@@ -1644,7 +1824,7 @@
"+r"(src_stride_abgr), // %1
"+r"(dst_u), // %2
"+r"(dst_v), // %3
- "+r"(width) // %4
+ "+r"(pix) // %4
:
: "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
"q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
@@ -1652,7 +1832,7 @@
}
void RGBAToUVRow_NEON(const uint8* src_rgba, int src_stride_rgba,
- uint8* dst_u, uint8* dst_v, int width) {
+ uint8* dst_u, uint8* dst_v, int pix) {
asm volatile (
"add %1, %0, %1 \n" // src_stride + src_rgba
"vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient
@@ -1661,6 +1841,7 @@
"vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
"vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
"vmov.u16 q15, #0x8080 \n" // 128.5
+ ".p2align 2 \n"
"1: \n"
MEMACCESS(0)
"vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 RGBA pixels.
@@ -1692,7 +1873,7 @@
"+r"(src_stride_rgba), // %1
"+r"(dst_u), // %2
"+r"(dst_v), // %3
- "+r"(width) // %4
+ "+r"(pix) // %4
:
: "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
"q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
@@ -1700,7 +1881,7 @@
}
void RGB24ToUVRow_NEON(const uint8* src_rgb24, int src_stride_rgb24,
- uint8* dst_u, uint8* dst_v, int width) {
+ uint8* dst_u, uint8* dst_v, int pix) {
asm volatile (
"add %1, %0, %1 \n" // src_stride + src_rgb24
"vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient
@@ -1709,6 +1890,7 @@
"vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
"vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
"vmov.u16 q15, #0x8080 \n" // 128.5
+ ".p2align 2 \n"
"1: \n"
MEMACCESS(0)
"vld3.8 {d0, d2, d4}, [%0]! \n" // load 8 RGB24 pixels.
@@ -1740,7 +1922,7 @@
"+r"(src_stride_rgb24), // %1
"+r"(dst_u), // %2
"+r"(dst_v), // %3
- "+r"(width) // %4
+ "+r"(pix) // %4
:
: "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
"q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
@@ -1748,7 +1930,7 @@
}
void RAWToUVRow_NEON(const uint8* src_raw, int src_stride_raw,
- uint8* dst_u, uint8* dst_v, int width) {
+ uint8* dst_u, uint8* dst_v, int pix) {
asm volatile (
"add %1, %0, %1 \n" // src_stride + src_raw
"vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient
@@ -1757,6 +1939,7 @@
"vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
"vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
"vmov.u16 q15, #0x8080 \n" // 128.5
+ ".p2align 2 \n"
"1: \n"
MEMACCESS(0)
"vld3.8 {d0, d2, d4}, [%0]! \n" // load 8 RAW pixels.
@@ -1788,7 +1971,7 @@
"+r"(src_stride_raw), // %1
"+r"(dst_u), // %2
"+r"(dst_v), // %3
- "+r"(width) // %4
+ "+r"(pix) // %4
:
: "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
"q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
@@ -1795,9 +1978,9 @@
);
}
-// 16x2 pixels -> 8x1. width is number of argb pixels. e.g. 16.
+// 16x2 pixels -> 8x1. pix is number of argb pixels. e.g. 16.
void RGB565ToUVRow_NEON(const uint8* src_rgb565, int src_stride_rgb565,
- uint8* dst_u, uint8* dst_v, int width) {
+ uint8* dst_u, uint8* dst_v, int pix) {
asm volatile (
"add %1, %0, %1 \n" // src_stride + src_argb
"vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient
@@ -1806,6 +1989,7 @@
"vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
"vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
"vmov.u16 q15, #0x8080 \n" // 128.5
+ ".p2align 2 \n"
"1: \n"
MEMACCESS(0)
"vld1.8 {q0}, [%0]! \n" // load 8 RGB565 pixels.
@@ -1857,7 +2041,7 @@
"+r"(src_stride_rgb565), // %1
"+r"(dst_u), // %2
"+r"(dst_v), // %3
- "+r"(width) // %4
+ "+r"(pix) // %4
:
: "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
"q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
@@ -1864,9 +2048,9 @@
);
}
-// 16x2 pixels -> 8x1. width is number of argb pixels. e.g. 16.
+// 16x2 pixels -> 8x1. pix is number of argb pixels. e.g. 16.
void ARGB1555ToUVRow_NEON(const uint8* src_argb1555, int src_stride_argb1555,
- uint8* dst_u, uint8* dst_v, int width) {
+ uint8* dst_u, uint8* dst_v, int pix) {
asm volatile (
"add %1, %0, %1 \n" // src_stride + src_argb
"vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient
@@ -1875,6 +2059,7 @@
"vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
"vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
"vmov.u16 q15, #0x8080 \n" // 128.5
+ ".p2align 2 \n"
"1: \n"
MEMACCESS(0)
"vld1.8 {q0}, [%0]! \n" // load 8 ARGB1555 pixels.
@@ -1926,7 +2111,7 @@
"+r"(src_stride_argb1555), // %1
"+r"(dst_u), // %2
"+r"(dst_v), // %3
- "+r"(width) // %4
+ "+r"(pix) // %4
:
: "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
"q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
@@ -1933,9 +2118,9 @@
);
}
-// 16x2 pixels -> 8x1. width is number of argb pixels. e.g. 16.
+// 16x2 pixels -> 8x1. pix is number of argb pixels. e.g. 16.
void ARGB4444ToUVRow_NEON(const uint8* src_argb4444, int src_stride_argb4444,
- uint8* dst_u, uint8* dst_v, int width) {
+ uint8* dst_u, uint8* dst_v, int pix) {
asm volatile (
"add %1, %0, %1 \n" // src_stride + src_argb
"vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient
@@ -1944,6 +2129,7 @@
"vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
"vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
"vmov.u16 q15, #0x8080 \n" // 128.5
+ ".p2align 2 \n"
"1: \n"
MEMACCESS(0)
"vld1.8 {q0}, [%0]! \n" // load 8 ARGB4444 pixels.
@@ -1995,7 +2181,7 @@
"+r"(src_stride_argb4444), // %1
"+r"(dst_u), // %2
"+r"(dst_v), // %3
- "+r"(width) // %4
+ "+r"(pix) // %4
:
: "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
"q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
@@ -2002,12 +2188,13 @@
);
}
-void RGB565ToYRow_NEON(const uint8* src_rgb565, uint8* dst_y, int width) {
+void RGB565ToYRow_NEON(const uint8* src_rgb565, uint8* dst_y, int pix) {
asm volatile (
"vmov.u8 d24, #13 \n" // B * 0.1016 coefficient
"vmov.u8 d25, #65 \n" // G * 0.5078 coefficient
"vmov.u8 d26, #33 \n" // R * 0.2578 coefficient
"vmov.u8 d27, #16 \n" // Add 16 constant
+ ".p2align 2 \n"
"1: \n"
MEMACCESS(0)
"vld1.8 {q0}, [%0]! \n" // load 8 RGB565 pixels.
@@ -2023,18 +2210,19 @@
"bgt 1b \n"
: "+r"(src_rgb565), // %0
"+r"(dst_y), // %1
- "+r"(width) // %2
+ "+r"(pix) // %2
:
: "cc", "memory", "q0", "q1", "q2", "q3", "q12", "q13"
);
}
-void ARGB1555ToYRow_NEON(const uint8* src_argb1555, uint8* dst_y, int width) {
+void ARGB1555ToYRow_NEON(const uint8* src_argb1555, uint8* dst_y, int pix) {
asm volatile (
"vmov.u8 d24, #13 \n" // B * 0.1016 coefficient
"vmov.u8 d25, #65 \n" // G * 0.5078 coefficient
"vmov.u8 d26, #33 \n" // R * 0.2578 coefficient
"vmov.u8 d27, #16 \n" // Add 16 constant
+ ".p2align 2 \n"
"1: \n"
MEMACCESS(0)
"vld1.8 {q0}, [%0]! \n" // load 8 ARGB1555 pixels.
@@ -2050,18 +2238,19 @@
"bgt 1b \n"
: "+r"(src_argb1555), // %0
"+r"(dst_y), // %1
- "+r"(width) // %2
+ "+r"(pix) // %2
:
: "cc", "memory", "q0", "q1", "q2", "q3", "q12", "q13"
);
}
-void ARGB4444ToYRow_NEON(const uint8* src_argb4444, uint8* dst_y, int width) {
+void ARGB4444ToYRow_NEON(const uint8* src_argb4444, uint8* dst_y, int pix) {
asm volatile (
"vmov.u8 d24, #13 \n" // B * 0.1016 coefficient
"vmov.u8 d25, #65 \n" // G * 0.5078 coefficient
"vmov.u8 d26, #33 \n" // R * 0.2578 coefficient
"vmov.u8 d27, #16 \n" // Add 16 constant
+ ".p2align 2 \n"
"1: \n"
MEMACCESS(0)
"vld1.8 {q0}, [%0]! \n" // load 8 ARGB4444 pixels.
@@ -2077,18 +2266,19 @@
"bgt 1b \n"
: "+r"(src_argb4444), // %0
"+r"(dst_y), // %1
- "+r"(width) // %2
+ "+r"(pix) // %2
:
: "cc", "memory", "q0", "q1", "q2", "q3", "q12", "q13"
);
}
-void BGRAToYRow_NEON(const uint8* src_bgra, uint8* dst_y, int width) {
+void BGRAToYRow_NEON(const uint8* src_bgra, uint8* dst_y, int pix) {
asm volatile (
"vmov.u8 d4, #33 \n" // R * 0.2578 coefficient
"vmov.u8 d5, #65 \n" // G * 0.5078 coefficient
"vmov.u8 d6, #13 \n" // B * 0.1016 coefficient
"vmov.u8 d7, #16 \n" // Add 16 constant
+ ".p2align 2 \n"
"1: \n"
MEMACCESS(0)
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of BGRA.
@@ -2103,18 +2293,19 @@
"bgt 1b \n"
: "+r"(src_bgra), // %0
"+r"(dst_y), // %1
- "+r"(width) // %2
+ "+r"(pix) // %2
:
: "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8"
);
}
-void ABGRToYRow_NEON(const uint8* src_abgr, uint8* dst_y, int width) {
+void ABGRToYRow_NEON(const uint8* src_abgr, uint8* dst_y, int pix) {
asm volatile (
"vmov.u8 d4, #33 \n" // R * 0.2578 coefficient
"vmov.u8 d5, #65 \n" // G * 0.5078 coefficient
"vmov.u8 d6, #13 \n" // B * 0.1016 coefficient
"vmov.u8 d7, #16 \n" // Add 16 constant
+ ".p2align 2 \n"
"1: \n"
MEMACCESS(0)
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of ABGR.
@@ -2129,18 +2320,19 @@
"bgt 1b \n"
: "+r"(src_abgr), // %0
"+r"(dst_y), // %1
- "+r"(width) // %2
+ "+r"(pix) // %2
:
: "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8"
);
}
-void RGBAToYRow_NEON(const uint8* src_rgba, uint8* dst_y, int width) {
+void RGBAToYRow_NEON(const uint8* src_rgba, uint8* dst_y, int pix) {
asm volatile (
"vmov.u8 d4, #13 \n" // B * 0.1016 coefficient
"vmov.u8 d5, #65 \n" // G * 0.5078 coefficient
"vmov.u8 d6, #33 \n" // R * 0.2578 coefficient
"vmov.u8 d7, #16 \n" // Add 16 constant
+ ".p2align 2 \n"
"1: \n"
MEMACCESS(0)
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of RGBA.
@@ -2155,18 +2347,19 @@
"bgt 1b \n"
: "+r"(src_rgba), // %0
"+r"(dst_y), // %1
- "+r"(width) // %2
+ "+r"(pix) // %2
:
: "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8"
);
}
-void RGB24ToYRow_NEON(const uint8* src_rgb24, uint8* dst_y, int width) {
+void RGB24ToYRow_NEON(const uint8* src_rgb24, uint8* dst_y, int pix) {
asm volatile (
"vmov.u8 d4, #13 \n" // B * 0.1016 coefficient
"vmov.u8 d5, #65 \n" // G * 0.5078 coefficient
"vmov.u8 d6, #33 \n" // R * 0.2578 coefficient
"vmov.u8 d7, #16 \n" // Add 16 constant
+ ".p2align 2 \n"
"1: \n"
MEMACCESS(0)
"vld3.8 {d0, d1, d2}, [%0]! \n" // load 8 pixels of RGB24.
@@ -2181,18 +2374,19 @@
"bgt 1b \n"
: "+r"(src_rgb24), // %0
"+r"(dst_y), // %1
- "+r"(width) // %2
+ "+r"(pix) // %2
:
: "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8"
);
}
-void RAWToYRow_NEON(const uint8* src_raw, uint8* dst_y, int width) {
+void RAWToYRow_NEON(const uint8* src_raw, uint8* dst_y, int pix) {
asm volatile (
"vmov.u8 d4, #33 \n" // R * 0.2578 coefficient
"vmov.u8 d5, #65 \n" // G * 0.5078 coefficient
"vmov.u8 d6, #13 \n" // B * 0.1016 coefficient
"vmov.u8 d7, #16 \n" // Add 16 constant
+ ".p2align 2 \n"
"1: \n"
MEMACCESS(0)
"vld3.8 {d0, d1, d2}, [%0]! \n" // load 8 pixels of RAW.
@@ -2207,7 +2401,7 @@
"bgt 1b \n"
: "+r"(src_raw), // %0
"+r"(dst_y), // %1
- "+r"(width) // %2
+ "+r"(pix) // %2
:
: "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8"
);
@@ -2217,13 +2411,16 @@
void InterpolateRow_NEON(uint8* dst_ptr,
const uint8* src_ptr, ptrdiff_t src_stride,
int dst_width, int source_y_fraction) {
- int y1_fraction = source_y_fraction;
asm volatile (
"cmp %4, #0 \n"
"beq 100f \n"
"add %2, %1 \n"
+ "cmp %4, #64 \n"
+ "beq 75f \n"
"cmp %4, #128 \n"
"beq 50f \n"
+ "cmp %4, #192 \n"
+ "beq 25f \n"
"vdup.8 d5, %4 \n"
"rsb %4, #256 \n"
@@ -2246,6 +2443,20 @@
"bgt 1b \n"
"b 99f \n"
+ // Blend 25 / 75.
+ "25: \n"
+ MEMACCESS(1)
+ "vld1.8 {q0}, [%1]! \n"
+ MEMACCESS(2)
+ "vld1.8 {q1}, [%2]! \n"
+ "subs %3, %3, #16 \n"
+ "vrhadd.u8 q0, q1 \n"
+ "vrhadd.u8 q0, q1 \n"
+ MEMACCESS(0)
+ "vst1.8 {q0}, [%0]! \n"
+ "bgt 25b \n"
+ "b 99f \n"
+
// Blend 50 / 50.
"50: \n"
MEMACCESS(1)
@@ -2259,6 +2470,20 @@
"bgt 50b \n"
"b 99f \n"
+ // Blend 75 / 25.
+ "75: \n"
+ MEMACCESS(1)
+ "vld1.8 {q1}, [%1]! \n"
+ MEMACCESS(2)
+ "vld1.8 {q0}, [%2]! \n"
+ "subs %3, %3, #16 \n"
+ "vrhadd.u8 q0, q1 \n"
+ "vrhadd.u8 q0, q1 \n"
+ MEMACCESS(0)
+ "vst1.8 {q0}, [%0]! \n"
+ "bgt 75b \n"
+ "b 99f \n"
+
// Blend 100 / 0 - Copy row unchanged.
"100: \n"
MEMACCESS(1)
@@ -2273,7 +2498,7 @@
"+r"(src_ptr), // %1
"+r"(src_stride), // %2
"+r"(dst_width), // %3
- "+r"(y1_fraction) // %4
+ "+r"(source_y_fraction) // %4
:
: "cc", "memory", "q0", "q1", "d4", "d5", "q13", "q14"
);
@@ -2380,6 +2605,7 @@
"vdup.u16 q10, %4 \n" // interval add
// 8 pixel loop.
+ ".p2align 2 \n"
"1: \n"
MEMACCESS(0)
"vld4.8 {d0, d2, d4, d6}, [%0] \n" // load 8 pixels of ARGB.
@@ -2422,6 +2648,7 @@
"vshr.u16 q0, q0, #1 \n" // scale / 2.
// 8 pixel loop.
+ ".p2align 2 \n"
"1: \n"
MEMACCESS(0)
"vld4.8 {d20, d22, d24, d26}, [%0]! \n" // load 8 pixels of ARGB.
@@ -2457,6 +2684,7 @@
"vmov.u8 d24, #15 \n" // B * 0.11400 coefficient
"vmov.u8 d25, #75 \n" // G * 0.58700 coefficient
"vmov.u8 d26, #38 \n" // R * 0.29900 coefficient
+ ".p2align 2 \n"
"1: \n"
MEMACCESS(0)
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels.
@@ -2493,6 +2721,7 @@
"vmov.u8 d28, #24 \n" // BB coefficient
"vmov.u8 d29, #98 \n" // BG coefficient
"vmov.u8 d30, #50 \n" // BR coefficient
+ ".p2align 2 \n"
"1: \n"
MEMACCESS(0)
"vld4.8 {d0, d1, d2, d3}, [%0] \n" // load 8 ARGB pixels.
@@ -2531,6 +2760,7 @@
"vmovl.s8 q0, d4 \n" // B,G coefficients s16.
"vmovl.s8 q1, d5 \n" // R,A coefficients s16.
+ ".p2align 2 \n"
"1: \n"
MEMACCESS(0)
"vld4.8 {d16, d18, d20, d22}, [%0]! \n" // load 8 ARGB pixels.
@@ -2583,11 +2813,14 @@
);
}
+// TODO(fbarchard): fix vqshrun in ARGBMultiplyRow_NEON and reenable.
+#ifdef HAS_ARGBMULTIPLYROW_NEON
// Multiply 2 rows of ARGB pixels together, 8 pixels at a time.
void ARGBMultiplyRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
uint8* dst_argb, int width) {
asm volatile (
// 8 pixel loop.
+ ".p2align 2 \n"
"1: \n"
MEMACCESS(0)
"vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels.
@@ -2614,6 +2847,7 @@
: "cc", "memory", "q0", "q1", "q2", "q3"
);
}
+#endif // HAS_ARGBMULTIPLYROW_NEON
// Add 2 rows of ARGB pixels together, 8 pixels at a time.
void ARGBAddRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
@@ -2620,6 +2854,7 @@
uint8* dst_argb, int width) {
asm volatile (
// 8 pixel loop.
+ ".p2align 2 \n"
"1: \n"
MEMACCESS(0)
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels.
@@ -2646,6 +2881,7 @@
uint8* dst_argb, int width) {
asm volatile (
// 8 pixel loop.
+ ".p2align 2 \n"
"1: \n"
MEMACCESS(0)
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels.
@@ -2677,6 +2913,7 @@
asm volatile (
"vmov.u8 d3, #255 \n" // alpha
// 8 pixel loop.
+ ".p2align 2 \n"
"1: \n"
MEMACCESS(0)
"vld1.8 {d0}, [%0]! \n" // load 8 sobelx.
@@ -2703,6 +2940,7 @@
uint8* dst_y, int width) {
asm volatile (
// 16 pixel loop.
+ ".p2align 2 \n"
"1: \n"
MEMACCESS(0)
"vld1.8 {q0}, [%0]! \n" // load 16 sobelx.
@@ -2732,6 +2970,7 @@
asm volatile (
"vmov.u8 d3, #255 \n" // alpha
// 8 pixel loop.
+ ".p2align 2 \n"
"1: \n"
MEMACCESS(0)
"vld1.8 {d2}, [%0]! \n" // load 8 sobelx.
@@ -2758,6 +2997,7 @@
void SobelXRow_NEON(const uint8* src_y0, const uint8* src_y1,
const uint8* src_y2, uint8* dst_sobelx, int width) {
asm volatile (
+ ".p2align 2 \n"
"1: \n"
MEMACCESS(0)
"vld1.8 {d0}, [%0],%5 \n" // top
@@ -2801,6 +3041,7 @@
void SobelYRow_NEON(const uint8* src_y0, const uint8* src_y1,
uint8* dst_sobely, int width) {
asm volatile (
+ ".p2align 2 \n"
"1: \n"
MEMACCESS(0)
"vld1.8 {d0}, [%0],%4 \n" // left
--- a/third_party/libyuv/source/row_neon64.cc
+++ b/third_party/libyuv/source/row_neon64.cc
@@ -91,15 +91,17 @@
"uzp2 v3.8b, v2.8b, v2.8b \n" \
"ins v1.s[1], v3.s[0] \n"
-#define YUVTORGB_SETUP \
+#define YUV422TORGB_SETUP_REG \
"ld1r {v24.8h}, [%[kUVBiasBGR]], #2 \n" \
"ld1r {v25.8h}, [%[kUVBiasBGR]], #2 \n" \
"ld1r {v26.8h}, [%[kUVBiasBGR]] \n" \
"ld1r {v31.4s}, [%[kYToRgb]] \n" \
- "ld2 {v27.8h, v28.8h}, [%[kUVToRB]] \n" \
- "ld2 {v29.8h, v30.8h}, [%[kUVToG]] \n"
+ "movi v27.8h, #128 \n" \
+ "movi v28.8h, #102 \n" \
+ "movi v29.8h, #25 \n" \
+ "movi v30.8h, #52 \n"
-#define YUVTORGB(vR, vG, vB) \
+#define YUV422TORGB(vR, vG, vB) \
"uxtl v0.8h, v0.8b \n" /* Extract Y */ \
"shll v2.8h, v1.8b, #8 \n" /* Replicate UV */ \
"ushll2 v3.4s, v0.8h, #0 \n" /* Y */ \
@@ -127,19 +129,57 @@
"sqshrun " #vG ".8b, " #vG ".8h, #6 \n" /* G */ \
"sqshrun " #vR ".8b, " #vR ".8h, #6 \n" /* R */ \
+// YUV to RGB conversion constants.
+// Y contribution to R,G,B. Scale and bias.
+#define YG 18997 /* round(1.164 * 64 * 256 * 256 / 257) */
+#define YGB 1160 /* 1.164 * 64 * 16 - adjusted for even error distribution */
+
+// U and V contributions to R,G,B.
+#define UB -128 /* -min(128, round(2.018 * 64)) */
+#define UG 25 /* -round(-0.391 * 64) */
+#define VG 52 /* -round(-0.813 * 64) */
+#define VR -102 /* -round(1.596 * 64) */
+
+// Bias values to subtract 16 from Y and 128 from U and V.
+#define BB (UB * 128 - YGB)
+#define BG (UG * 128 + VG * 128 - YGB)
+#define BR (VR * 128 - YGB)
+
+static vec16 kUVBiasBGR = { BB, BG, BR, 0, 0, 0, 0, 0 };
+static vec32 kYToRgb = { 0x0101 * YG, 0, 0, 0 };
+
+#undef YG
+#undef YGB
+#undef UB
+#undef UG
+#undef VG
+#undef VR
+#undef BB
+#undef BG
+#undef BR
+
+#define RGBTOUV_SETUP_REG \
+ "movi v20.8h, #56, lsl #0 \n" /* UB/VR coefficient (0.875) / 2 */ \
+ "movi v21.8h, #37, lsl #0 \n" /* UG coefficient (-0.5781) / 2 */ \
+ "movi v22.8h, #19, lsl #0 \n" /* UR coefficient (-0.2969) / 2 */ \
+ "movi v23.8h, #9, lsl #0 \n" /* VB coefficient (-0.1406) / 2 */ \
+ "movi v24.8h, #47, lsl #0 \n" /* VG coefficient (-0.7344) / 2 */ \
+ "movi v25.16b, #0x80 \n" /* 128.5 (0x8080 in 16-bit) */
+
+
+#ifdef HAS_I444TOARGBROW_NEON
void I444ToARGBRow_NEON(const uint8* src_y,
const uint8* src_u,
const uint8* src_v,
uint8* dst_argb,
- const struct YuvConstants* yuvconstants,
int width) {
asm volatile (
- YUVTORGB_SETUP
- "movi v23.8b, #255 \n" /* A */
+ YUV422TORGB_SETUP_REG
"1: \n"
READYUV444
- YUVTORGB(v22, v21, v20)
- "subs %w4, %w4, #8 \n"
+ YUV422TORGB(v22, v21, v20)
+ "subs %w4, %w4, #8 \n"
+ "movi v23.8b, #255 \n" /* A */
MEMACCESS(3)
"st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n"
"b.gt 1b \n"
@@ -148,28 +188,27 @@
"+r"(src_v), // %2
"+r"(dst_argb), // %3
"+r"(width) // %4
- : [kUVToRB]"r"(&yuvconstants->kUVToRB),
- [kUVToG]"r"(&yuvconstants->kUVToG),
- [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
- [kYToRgb]"r"(&yuvconstants->kYToRgb)
+ : [kUVBiasBGR]"r"(&kUVBiasBGR),
+ [kYToRgb]"r"(&kYToRgb)
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
"v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
);
}
+#endif // HAS_I444TOARGBROW_NEON
+#ifdef HAS_I422TOARGBROW_NEON
void I422ToARGBRow_NEON(const uint8* src_y,
const uint8* src_u,
const uint8* src_v,
uint8* dst_argb,
- const struct YuvConstants* yuvconstants,
int width) {
asm volatile (
- YUVTORGB_SETUP
- "movi v23.8b, #255 \n" /* A */
+ YUV422TORGB_SETUP_REG
"1: \n"
READYUV422
- YUVTORGB(v22, v21, v20)
+ YUV422TORGB(v22, v21, v20)
"subs %w4, %w4, #8 \n"
+ "movi v23.8b, #255 \n" /* A */
MEMACCESS(3)
"st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n"
"b.gt 1b \n"
@@ -178,61 +217,85 @@
"+r"(src_v), // %2
"+r"(dst_argb), // %3
"+r"(width) // %4
- : [kUVToRB]"r"(&yuvconstants->kUVToRB),
- [kUVToG]"r"(&yuvconstants->kUVToG),
- [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
- [kYToRgb]"r"(&yuvconstants->kYToRgb)
+ : [kUVBiasBGR]"r"(&kUVBiasBGR),
+ [kYToRgb]"r"(&kYToRgb)
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
"v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
);
}
+#endif // HAS_I422TOARGBROW_NEON
-void I422AlphaToARGBRow_NEON(const uint8* src_y,
- const uint8* src_u,
- const uint8* src_v,
- const uint8* src_a,
- uint8* dst_argb,
- const struct YuvConstants* yuvconstants,
- int width) {
+#ifdef HAS_I411TOARGBROW_NEON
+void I411ToARGBRow_NEON(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb,
+ int width) {
asm volatile (
- YUVTORGB_SETUP
+ YUV422TORGB_SETUP_REG
"1: \n"
+ READYUV411
+ YUV422TORGB(v22, v21, v20)
+ "subs %w4, %w4, #8 \n"
+ "movi v23.8b, #255 \n" /* A */
+ MEMACCESS(3)
+ "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n"
+ "b.gt 1b \n"
+ : "+r"(src_y), // %0
+ "+r"(src_u), // %1
+ "+r"(src_v), // %2
+ "+r"(dst_argb), // %3
+ "+r"(width) // %4
+ : [kUVBiasBGR]"r"(&kUVBiasBGR),
+ [kYToRgb]"r"(&kYToRgb)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
+ "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
+ );
+}
+#endif // HAS_I411TOARGBROW_NEON
+
+#ifdef HAS_I422TOBGRAROW_NEON
+void I422ToBGRARow_NEON(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_bgra,
+ int width) {
+ asm volatile (
+ YUV422TORGB_SETUP_REG
+ "1: \n"
READYUV422
- YUVTORGB(v22, v21, v20)
+ YUV422TORGB(v21, v22, v23)
+ "subs %w4, %w4, #8 \n"
+ "movi v20.8b, #255 \n" /* A */
MEMACCESS(3)
- "ld1 {v23.8b}, [%3], #8 \n"
- "subs %w5, %w5, #8 \n"
- MEMACCESS(4)
- "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%4], #32 \n"
+ "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n"
"b.gt 1b \n"
: "+r"(src_y), // %0
"+r"(src_u), // %1
"+r"(src_v), // %2
- "+r"(src_a), // %3
- "+r"(dst_argb), // %4
- "+r"(width) // %5
- : [kUVToRB]"r"(&yuvconstants->kUVToRB),
- [kUVToG]"r"(&yuvconstants->kUVToG),
- [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
- [kYToRgb]"r"(&yuvconstants->kYToRgb)
+ "+r"(dst_bgra), // %3
+ "+r"(width) // %4
+ : [kUVBiasBGR]"r"(&kUVBiasBGR),
+ [kYToRgb]"r"(&kYToRgb)
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
"v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
);
}
+#endif // HAS_I422TOBGRAROW_NEON
-void I411ToARGBRow_NEON(const uint8* src_y,
+#ifdef HAS_I422TOABGRROW_NEON
+void I422ToABGRRow_NEON(const uint8* src_y,
const uint8* src_u,
const uint8* src_v,
- uint8* dst_argb,
- const struct YuvConstants* yuvconstants,
+ uint8* dst_abgr,
int width) {
asm volatile (
- YUVTORGB_SETUP
- "movi v23.8b, #255 \n" /* A */
+ YUV422TORGB_SETUP_REG
"1: \n"
- READYUV411
- YUVTORGB(v22, v21, v20)
+ READYUV422
+ YUV422TORGB(v20, v21, v22)
"subs %w4, %w4, #8 \n"
+ "movi v23.8b, #255 \n" /* A */
MEMACCESS(3)
"st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n"
"b.gt 1b \n"
@@ -239,30 +302,29 @@
: "+r"(src_y), // %0
"+r"(src_u), // %1
"+r"(src_v), // %2
- "+r"(dst_argb), // %3
+ "+r"(dst_abgr), // %3
"+r"(width) // %4
- : [kUVToRB]"r"(&yuvconstants->kUVToRB),
- [kUVToG]"r"(&yuvconstants->kUVToG),
- [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
- [kYToRgb]"r"(&yuvconstants->kYToRgb)
+ : [kUVBiasBGR]"r"(&kUVBiasBGR),
+ [kYToRgb]"r"(&kYToRgb)
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
"v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
);
}
+#endif // HAS_I422TOABGRROW_NEON
+#ifdef HAS_I422TORGBAROW_NEON
void I422ToRGBARow_NEON(const uint8* src_y,
const uint8* src_u,
const uint8* src_v,
uint8* dst_rgba,
- const struct YuvConstants* yuvconstants,
int width) {
asm volatile (
- YUVTORGB_SETUP
- "movi v20.8b, #255 \n" /* A */
+ YUV422TORGB_SETUP_REG
"1: \n"
READYUV422
- YUVTORGB(v23, v22, v21)
+ YUV422TORGB(v23, v22, v21)
"subs %w4, %w4, #8 \n"
+ "movi v20.8b, #255 \n" /* A */
MEMACCESS(3)
"st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n"
"b.gt 1b \n"
@@ -271,26 +333,25 @@
"+r"(src_v), // %2
"+r"(dst_rgba), // %3
"+r"(width) // %4
- : [kUVToRB]"r"(&yuvconstants->kUVToRB),
- [kUVToG]"r"(&yuvconstants->kUVToG),
- [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
- [kYToRgb]"r"(&yuvconstants->kYToRgb)
+ : [kUVBiasBGR]"r"(&kUVBiasBGR),
+ [kYToRgb]"r"(&kYToRgb)
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
"v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
);
}
+#endif // HAS_I422TORGBAROW_NEON
+#ifdef HAS_I422TORGB24ROW_NEON
void I422ToRGB24Row_NEON(const uint8* src_y,
const uint8* src_u,
const uint8* src_v,
uint8* dst_rgb24,
- const struct YuvConstants* yuvconstants,
int width) {
asm volatile (
- YUVTORGB_SETUP
+ YUV422TORGB_SETUP_REG
"1: \n"
READYUV422
- YUVTORGB(v22, v21, v20)
+ YUV422TORGB(v22, v21, v20)
"subs %w4, %w4, #8 \n"
MEMACCESS(3)
"st3 {v20.8b,v21.8b,v22.8b}, [%3], #24 \n"
@@ -300,33 +361,60 @@
"+r"(src_v), // %2
"+r"(dst_rgb24), // %3
"+r"(width) // %4
- : [kUVToRB]"r"(&yuvconstants->kUVToRB),
- [kUVToG]"r"(&yuvconstants->kUVToG),
- [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
- [kYToRgb]"r"(&yuvconstants->kYToRgb)
+ : [kUVBiasBGR]"r"(&kUVBiasBGR),
+ [kYToRgb]"r"(&kYToRgb)
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
"v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
);
}
+#endif // HAS_I422TORGB24ROW_NEON
+#ifdef HAS_I422TORAWROW_NEON
+void I422ToRAWRow_NEON(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_raw,
+ int width) {
+ asm volatile (
+ YUV422TORGB_SETUP_REG
+ "1: \n"
+ READYUV422
+ YUV422TORGB(v20, v21, v22)
+ "subs %w4, %w4, #8 \n"
+ MEMACCESS(3)
+ "st3 {v20.8b,v21.8b,v22.8b}, [%3], #24 \n"
+ "b.gt 1b \n"
+ : "+r"(src_y), // %0
+ "+r"(src_u), // %1
+ "+r"(src_v), // %2
+ "+r"(dst_raw), // %3
+ "+r"(width) // %4
+ : [kUVBiasBGR]"r"(&kUVBiasBGR),
+ [kYToRgb]"r"(&kYToRgb)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
+ "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
+ );
+}
+#endif // HAS_I422TORAWROW_NEON
+
#define ARGBTORGB565 \
"shll v0.8h, v22.8b, #8 \n" /* R */ \
- "shll v21.8h, v21.8b, #8 \n" /* G */ \
"shll v20.8h, v20.8b, #8 \n" /* B */ \
+ "shll v21.8h, v21.8b, #8 \n" /* G */ \
"sri v0.8h, v21.8h, #5 \n" /* RG */ \
"sri v0.8h, v20.8h, #11 \n" /* RGB */
+#ifdef HAS_I422TORGB565ROW_NEON
void I422ToRGB565Row_NEON(const uint8* src_y,
const uint8* src_u,
const uint8* src_v,
uint8* dst_rgb565,
- const struct YuvConstants* yuvconstants,
int width) {
asm volatile (
- YUVTORGB_SETUP
+ YUV422TORGB_SETUP_REG
"1: \n"
READYUV422
- YUVTORGB(v22, v21, v20)
+ YUV422TORGB(v22, v21, v20)
"subs %w4, %w4, #8 \n"
ARGBTORGB565
MEMACCESS(3)
@@ -337,37 +425,36 @@
"+r"(src_v), // %2
"+r"(dst_rgb565), // %3
"+r"(width) // %4
- : [kUVToRB]"r"(&yuvconstants->kUVToRB),
- [kUVToG]"r"(&yuvconstants->kUVToG),
- [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
- [kYToRgb]"r"(&yuvconstants->kYToRgb)
+ : [kUVBiasBGR]"r"(&kUVBiasBGR),
+ [kYToRgb]"r"(&kYToRgb)
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
"v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
);
}
+#endif // HAS_I422TORGB565ROW_NEON
#define ARGBTOARGB1555 \
"shll v0.8h, v23.8b, #8 \n" /* A */ \
"shll v22.8h, v22.8b, #8 \n" /* R */ \
- "shll v21.8h, v21.8b, #8 \n" /* G */ \
"shll v20.8h, v20.8b, #8 \n" /* B */ \
+ "shll v21.8h, v21.8b, #8 \n" /* G */ \
"sri v0.8h, v22.8h, #1 \n" /* AR */ \
"sri v0.8h, v21.8h, #6 \n" /* ARG */ \
"sri v0.8h, v20.8h, #11 \n" /* ARGB */
+#ifdef HAS_I422TOARGB1555ROW_NEON
void I422ToARGB1555Row_NEON(const uint8* src_y,
const uint8* src_u,
const uint8* src_v,
uint8* dst_argb1555,
- const struct YuvConstants* yuvconstants,
int width) {
asm volatile (
- YUVTORGB_SETUP
- "movi v23.8b, #255 \n"
+ YUV422TORGB_SETUP_REG
"1: \n"
READYUV422
- YUVTORGB(v22, v21, v20)
+ YUV422TORGB(v22, v21, v20)
"subs %w4, %w4, #8 \n"
+ "movi v23.8b, #255 \n"
ARGBTOARGB1555
MEMACCESS(3)
"st1 {v0.8h}, [%3], #16 \n" // store 8 pixels RGB565.
@@ -377,14 +464,13 @@
"+r"(src_v), // %2
"+r"(dst_argb1555), // %3
"+r"(width) // %4
- : [kUVToRB]"r"(&yuvconstants->kUVToRB),
- [kUVToG]"r"(&yuvconstants->kUVToG),
- [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
- [kYToRgb]"r"(&yuvconstants->kYToRgb)
+ : [kUVBiasBGR]"r"(&kUVBiasBGR),
+ [kYToRgb]"r"(&kYToRgb)
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
"v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
);
}
+#endif // HAS_I422TOARGB1555ROW_NEON
#define ARGBTOARGB4444 \
/* Input v20.8b<=B, v21.8b<=G, v22.8b<=R, v23.8b<=A, v4.8b<=0x0f */ \
@@ -396,18 +482,18 @@
"orr v1.8b, v22.8b, v23.8b \n" /* RA */ \
"zip1 v0.16b, v0.16b, v1.16b \n" /* BGRA */
+#ifdef HAS_I422TOARGB4444ROW_NEON
void I422ToARGB4444Row_NEON(const uint8* src_y,
const uint8* src_u,
const uint8* src_v,
uint8* dst_argb4444,
- const struct YuvConstants* yuvconstants,
int width) {
asm volatile (
- YUVTORGB_SETUP
+ YUV422TORGB_SETUP_REG
"movi v4.16b, #0x0f \n" // bits to clear with vbic.
"1: \n"
READYUV422
- YUVTORGB(v22, v21, v20)
+ YUV422TORGB(v22, v21, v20)
"subs %w4, %w4, #8 \n"
"movi v23.8b, #255 \n"
ARGBTOARGB4444
@@ -419,40 +505,41 @@
"+r"(src_v), // %2
"+r"(dst_argb4444), // %3
"+r"(width) // %4
- : [kUVToRB]"r"(&yuvconstants->kUVToRB),
- [kUVToG]"r"(&yuvconstants->kUVToG),
- [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
- [kYToRgb]"r"(&yuvconstants->kYToRgb)
+ : [kUVBiasBGR]"r"(&kUVBiasBGR),
+ [kYToRgb]"r"(&kYToRgb)
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
"v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
);
}
+#endif // HAS_I422TOARGB4444ROW_NEON
+#ifdef HAS_I400TOARGBROW_NEON
void I400ToARGBRow_NEON(const uint8* src_y,
uint8* dst_argb,
int width) {
+ int64 width64 = (int64)(width);
asm volatile (
- YUVTORGB_SETUP
- "movi v23.8b, #255 \n"
+ YUV422TORGB_SETUP_REG
"1: \n"
READYUV400
- YUVTORGB(v22, v21, v20)
+ YUV422TORGB(v22, v21, v20)
"subs %w2, %w2, #8 \n"
+ "movi v23.8b, #255 \n"
MEMACCESS(1)
"st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n"
"b.gt 1b \n"
: "+r"(src_y), // %0
"+r"(dst_argb), // %1
- "+r"(width) // %2
- : [kUVToRB]"r"(&kYuvI601Constants.kUVToRB),
- [kUVToG]"r"(&kYuvI601Constants.kUVToG),
- [kUVBiasBGR]"r"(&kYuvI601Constants.kUVBiasBGR),
- [kYToRgb]"r"(&kYuvI601Constants.kYToRgb)
+ "+r"(width64) // %2
+ : [kUVBiasBGR]"r"(&kUVBiasBGR),
+ [kYToRgb]"r"(&kYToRgb)
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
"v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
);
}
+#endif // HAS_I400TOARGBROW_NEON
+#ifdef HAS_J400TOARGBROW_NEON
void J400ToARGBRow_NEON(const uint8* src_y,
uint8* dst_argb,
int width) {
@@ -474,19 +561,20 @@
: "cc", "memory", "v20", "v21", "v22", "v23"
);
}
+#endif // HAS_J400TOARGBROW_NEON
+#ifdef HAS_NV12TOARGBROW_NEON
void NV12ToARGBRow_NEON(const uint8* src_y,
const uint8* src_uv,
uint8* dst_argb,
- const struct YuvConstants* yuvconstants,
int width) {
asm volatile (
- YUVTORGB_SETUP
- "movi v23.8b, #255 \n"
+ YUV422TORGB_SETUP_REG
"1: \n"
READNV12
- YUVTORGB(v22, v21, v20)
+ YUV422TORGB(v22, v21, v20)
"subs %w3, %w3, #8 \n"
+ "movi v23.8b, #255 \n"
MEMACCESS(2)
"st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%2], #32 \n"
"b.gt 1b \n"
@@ -494,53 +582,51 @@
"+r"(src_uv), // %1
"+r"(dst_argb), // %2
"+r"(width) // %3
- : [kUVToRB]"r"(&yuvconstants->kUVToRB),
- [kUVToG]"r"(&yuvconstants->kUVToG),
- [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
- [kYToRgb]"r"(&yuvconstants->kYToRgb)
+ : [kUVBiasBGR]"r"(&kUVBiasBGR),
+ [kYToRgb]"r"(&kYToRgb)
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
"v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
);
}
+#endif // HAS_NV12TOARGBROW_NEON
+#ifdef HAS_NV21TOARGBROW_NEON
void NV21ToARGBRow_NEON(const uint8* src_y,
- const uint8* src_vu,
+ const uint8* src_uv,
uint8* dst_argb,
- const struct YuvConstants* yuvconstants,
int width) {
asm volatile (
- YUVTORGB_SETUP
- "movi v23.8b, #255 \n"
+ YUV422TORGB_SETUP_REG
"1: \n"
READNV21
- YUVTORGB(v22, v21, v20)
+ YUV422TORGB(v22, v21, v20)
"subs %w3, %w3, #8 \n"
+ "movi v23.8b, #255 \n"
MEMACCESS(2)
"st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%2], #32 \n"
"b.gt 1b \n"
: "+r"(src_y), // %0
- "+r"(src_vu), // %1
+ "+r"(src_uv), // %1
"+r"(dst_argb), // %2
"+r"(width) // %3
- : [kUVToRB]"r"(&yuvconstants->kUVToRB),
- [kUVToG]"r"(&yuvconstants->kUVToG),
- [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
- [kYToRgb]"r"(&yuvconstants->kYToRgb)
+ : [kUVBiasBGR]"r"(&kUVBiasBGR),
+ [kYToRgb]"r"(&kYToRgb)
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
"v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
);
}
+#endif // HAS_NV21TOARGBROW_NEON
+#ifdef HAS_NV12TORGB565ROW_NEON
void NV12ToRGB565Row_NEON(const uint8* src_y,
const uint8* src_uv,
uint8* dst_rgb565,
- const struct YuvConstants* yuvconstants,
int width) {
asm volatile (
- YUVTORGB_SETUP
+ YUV422TORGB_SETUP_REG
"1: \n"
READNV12
- YUVTORGB(v22, v21, v20)
+ YUV422TORGB(v22, v21, v20)
"subs %w3, %w3, #8 \n"
ARGBTORGB565
MEMACCESS(2)
@@ -550,68 +636,95 @@
"+r"(src_uv), // %1
"+r"(dst_rgb565), // %2
"+r"(width) // %3
- : [kUVToRB]"r"(&yuvconstants->kUVToRB),
- [kUVToG]"r"(&yuvconstants->kUVToG),
- [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
- [kYToRgb]"r"(&yuvconstants->kYToRgb)
+ : [kUVBiasBGR]"r"(&kUVBiasBGR),
+ [kYToRgb]"r"(&kYToRgb)
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
"v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
);
}
+#endif // HAS_NV12TORGB565ROW_NEON
+#ifdef HAS_NV21TORGB565ROW_NEON
+void NV21ToRGB565Row_NEON(const uint8* src_y,
+ const uint8* src_uv,
+ uint8* dst_rgb565,
+ int width) {
+ asm volatile (
+ YUV422TORGB_SETUP_REG
+ "1: \n"
+ READNV21
+ YUV422TORGB(v22, v21, v20)
+ "subs %w3, %w3, #8 \n"
+ ARGBTORGB565
+ MEMACCESS(2)
+ "st1 {v0.8h}, [%2], 16 \n" // store 8 pixels RGB565.
+ "b.gt 1b \n"
+ : "+r"(src_y), // %0
+ "+r"(src_uv), // %1
+ "+r"(dst_rgb565), // %2
+ "+r"(width) // %3
+ : [kUVBiasBGR]"r"(&kUVBiasBGR),
+ [kYToRgb]"r"(&kYToRgb)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
+ "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
+ );
+}
+#endif // HAS_NV21TORGB565ROW_NEON
+
+#ifdef HAS_YUY2TOARGBROW_NEON
void YUY2ToARGBRow_NEON(const uint8* src_yuy2,
uint8* dst_argb,
- const struct YuvConstants* yuvconstants,
int width) {
+ int64 width64 = (int64)(width);
asm volatile (
- YUVTORGB_SETUP
- "movi v23.8b, #255 \n"
+ YUV422TORGB_SETUP_REG
"1: \n"
READYUY2
- YUVTORGB(v22, v21, v20)
+ YUV422TORGB(v22, v21, v20)
"subs %w2, %w2, #8 \n"
+ "movi v23.8b, #255 \n"
MEMACCESS(1)
"st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n"
"b.gt 1b \n"
: "+r"(src_yuy2), // %0
"+r"(dst_argb), // %1
- "+r"(width) // %2
- : [kUVToRB]"r"(&yuvconstants->kUVToRB),
- [kUVToG]"r"(&yuvconstants->kUVToG),
- [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
- [kYToRgb]"r"(&yuvconstants->kYToRgb)
+ "+r"(width64) // %2
+ : [kUVBiasBGR]"r"(&kUVBiasBGR),
+ [kYToRgb]"r"(&kYToRgb)
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
"v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
);
}
+#endif // HAS_YUY2TOARGBROW_NEON
+#ifdef HAS_UYVYTOARGBROW_NEON
void UYVYToARGBRow_NEON(const uint8* src_uyvy,
uint8* dst_argb,
- const struct YuvConstants* yuvconstants,
int width) {
+ int64 width64 = (int64)(width);
asm volatile (
- YUVTORGB_SETUP
- "movi v23.8b, #255 \n"
+ YUV422TORGB_SETUP_REG
"1: \n"
READUYVY
- YUVTORGB(v22, v21, v20)
+ YUV422TORGB(v22, v21, v20)
"subs %w2, %w2, #8 \n"
+ "movi v23.8b, #255 \n"
MEMACCESS(1)
"st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], 32 \n"
"b.gt 1b \n"
: "+r"(src_uyvy), // %0
"+r"(dst_argb), // %1
- "+r"(width) // %2
- : [kUVToRB]"r"(&yuvconstants->kUVToRB),
- [kUVToG]"r"(&yuvconstants->kUVToG),
- [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
- [kYToRgb]"r"(&yuvconstants->kYToRgb)
+ "+r"(width64) // %2
+ : [kUVBiasBGR]"r"(&kUVBiasBGR),
+ [kYToRgb]"r"(&kYToRgb)
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
"v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
);
}
+#endif // HAS_UYVYTOARGBROW_NEON
// Reads 16 pairs of UV and write even values to dst_u and odd to dst_v.
+#ifdef HAS_SPLITUVROW_NEON
void SplitUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
int width) {
asm volatile (
@@ -632,8 +745,10 @@
: "cc", "memory", "v0", "v1" // Clobber List
);
}
+#endif // HAS_SPLITUVROW_NEON
// Reads 16 U's and V's and writes out 16 pairs of UV.
+#ifdef HAS_MERGEUVROW_NEON
void MergeUVRow_NEON(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
int width) {
asm volatile (
@@ -655,8 +770,10 @@
: "cc", "memory", "v0", "v1" // Clobber List
);
}
+#endif // HAS_MERGEUVROW_NEON
// Copy multiple of 32. vld4.8 allow unaligned and is fastest on a15.
+#ifdef HAS_COPYROW_NEON
void CopyRow_NEON(const uint8* src, uint8* dst, int count) {
asm volatile (
"1: \n"
@@ -673,6 +790,7 @@
: "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List
);
}
+#endif // HAS_COPYROW_NEON
// SetRow writes 'count' bytes using an 8 bit value repeated.
void SetRow_NEON(uint8* dst, uint8 v8, int count) {
@@ -679,10 +797,10 @@
asm volatile (
"dup v0.16b, %w2 \n" // duplicate 16 bytes
"1: \n"
- "subs %w1, %w1, #16 \n" // 16 bytes per loop
+ "subs %w1, %w1, #16 \n" // 16 bytes per loop
MEMACCESS(0)
"st1 {v0.16b}, [%0], #16 \n" // store
- "b.gt 1b \n"
+ "b.gt 1b \n"
: "+r"(dst), // %0
"+r"(count) // %1
: "r"(v8) // %2
@@ -694,10 +812,10 @@
asm volatile (
"dup v0.4s, %w2 \n" // duplicate 4 ints
"1: \n"
- "subs %w1, %w1, #4 \n" // 4 ints per loop
+ "subs %w1, %w1, #4 \n" // 4 ints per loop
MEMACCESS(0)
"st1 {v0.16b}, [%0], #16 \n" // store
- "b.gt 1b \n"
+ "b.gt 1b \n"
: "+r"(dst), // %0
"+r"(count) // %1
: "r"(v32) // %2
@@ -705,15 +823,18 @@
);
}
+#ifdef HAS_MIRRORROW_NEON
void MirrorRow_NEON(const uint8* src, uint8* dst, int width) {
+ int64 width64 = (int64) width;
asm volatile (
// Start at end of source row.
- "add %0, %0, %w2, sxtw \n"
+ "add %0, %0, %2 \n"
"sub %0, %0, #16 \n"
+
"1: \n"
MEMACCESS(0)
"ld1 {v0.16b}, [%0], %3 \n" // src -= 16
- "subs %w2, %w2, #16 \n" // 16 pixels per loop.
+ "subs %2, %2, #16 \n" // 16 pixels per loop.
"rev64 v0.16b, v0.16b \n"
MEMACCESS(1)
"st1 {v0.D}[1], [%1], #8 \n" // dst += 16
@@ -722,22 +843,26 @@
"b.gt 1b \n"
: "+r"(src), // %0
"+r"(dst), // %1
- "+r"(width) // %2
+ "+r"(width64) // %2
: "r"((ptrdiff_t)-16) // %3
: "cc", "memory", "v0"
);
}
+#endif // HAS_MIRRORROW_NEON
+#ifdef HAS_MIRRORUVROW_NEON
void MirrorUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
int width) {
+ int64 width64 = (int64) width;
asm volatile (
// Start at end of source row.
- "add %0, %0, %w3, sxtw #1 \n"
+ "add %0, %0, %3, lsl #1 \n"
"sub %0, %0, #16 \n"
+
"1: \n"
MEMACCESS(0)
"ld2 {v0.8b, v1.8b}, [%0], %4 \n" // src -= 16
- "subs %w3, %w3, #8 \n" // 8 pixels per loop.
+ "subs %3, %3, #8 \n" // 8 pixels per loop.
"rev64 v0.8b, v0.8b \n"
"rev64 v1.8b, v1.8b \n"
MEMACCESS(1)
@@ -748,21 +873,25 @@
: "+r"(src_uv), // %0
"+r"(dst_u), // %1
"+r"(dst_v), // %2
- "+r"(width) // %3
+ "+r"(width64) // %3
: "r"((ptrdiff_t)-16) // %4
: "cc", "memory", "v0", "v1"
);
}
+#endif // HAS_MIRRORUVROW_NEON
+#ifdef HAS_ARGBMIRRORROW_NEON
void ARGBMirrorRow_NEON(const uint8* src, uint8* dst, int width) {
+ int64 width64 = (int64) width;
asm volatile (
- // Start at end of source row.
- "add %0, %0, %w2, sxtw #2 \n"
+ // Start at end of source row.
+ "add %0, %0, %2, lsl #2 \n"
"sub %0, %0, #16 \n"
+
"1: \n"
MEMACCESS(0)
"ld1 {v0.16b}, [%0], %3 \n" // src -= 16
- "subs %w2, %w2, #4 \n" // 4 pixels per loop.
+ "subs %2, %2, #4 \n" // 4 pixels per loop.
"rev64 v0.4s, v0.4s \n"
MEMACCESS(1)
"st1 {v0.D}[1], [%1], #8 \n" // dst += 16
@@ -771,13 +900,15 @@
"b.gt 1b \n"
: "+r"(src), // %0
"+r"(dst), // %1
- "+r"(width) // %2
+ "+r"(width64) // %2
: "r"((ptrdiff_t)-16) // %3
: "cc", "memory", "v0"
);
}
+#endif // HAS_ARGBMIRRORROW_NEON
-void RGB24ToARGBRow_NEON(const uint8* src_rgb24, uint8* dst_argb, int width) {
+#ifdef HAS_RGB24TOARGBROW_NEON
+void RGB24ToARGBRow_NEON(const uint8* src_rgb24, uint8* dst_argb, int pix) {
asm volatile (
"movi v4.8b, #255 \n" // Alpha
"1: \n"
@@ -789,13 +920,15 @@
"b.gt 1b \n"
: "+r"(src_rgb24), // %0
"+r"(dst_argb), // %1
- "+r"(width) // %2
+ "+r"(pix) // %2
:
: "cc", "memory", "v1", "v2", "v3", "v4" // Clobber List
);
}
+#endif // HAS_RGB24TOARGBROW_NEON
-void RAWToARGBRow_NEON(const uint8* src_raw, uint8* dst_argb, int width) {
+#ifdef HAS_RAWTOARGBROW_NEON
+void RAWToARGBRow_NEON(const uint8* src_raw, uint8* dst_argb, int pix) {
asm volatile (
"movi v5.8b, #255 \n" // Alpha
"1: \n"
@@ -809,31 +942,13 @@
"b.gt 1b \n"
: "+r"(src_raw), // %0
"+r"(dst_argb), // %1
- "+r"(width) // %2
+ "+r"(pix) // %2
:
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5" // Clobber List
);
}
+#endif // HAS_RAWTOARGBROW_NEON
-void RAWToRGB24Row_NEON(const uint8* src_raw, uint8* dst_rgb24, int width) {
- asm volatile (
- "1: \n"
- MEMACCESS(0)
- "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // read r g b
- "subs %w2, %w2, #8 \n" // 8 processed per loop.
- "orr v3.8b, v1.8b, v1.8b \n" // move g
- "orr v4.8b, v0.8b, v0.8b \n" // move r
- MEMACCESS(1)
- "st3 {v2.8b,v3.8b,v4.8b}, [%1], #24 \n" // store b g r
- "b.gt 1b \n"
- : "+r"(src_raw), // %0
- "+r"(dst_rgb24), // %1
- "+r"(width) // %2
- :
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4" // Clobber List
- );
-}
-
#define RGB565TOARGB \
"shrn v6.8b, v0.8h, #5 \n" /* G xxGGGGGG */ \
"shl v6.8b, v6.8b, #2 \n" /* G GGGGGG00 upper 6 */ \
@@ -847,7 +962,8 @@
"orr v0.16b, v0.16b, v2.16b \n" /* R,B */ \
"dup v2.2D, v0.D[1] \n" /* R */
-void RGB565ToARGBRow_NEON(const uint8* src_rgb565, uint8* dst_argb, int width) {
+#ifdef HAS_RGB565TOARGBROW_NEON
+void RGB565ToARGBRow_NEON(const uint8* src_rgb565, uint8* dst_argb, int pix) {
asm volatile (
"movi v3.8b, #255 \n" // Alpha
"1: \n"
@@ -860,11 +976,12 @@
"b.gt 1b \n"
: "+r"(src_rgb565), // %0
"+r"(dst_argb), // %1
- "+r"(width) // %2
+ "+r"(pix) // %2
:
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v6" // Clobber List
);
}
+#endif // HAS_RGB565TOARGBROW_NEON
#define ARGB1555TOARGB \
"ushr v2.8h, v0.8h, #10 \n" /* R xxxRRRRR */ \
@@ -903,8 +1020,9 @@
"orr v2.16b, v1.16b, v3.16b \n" /* R */ \
"dup v1.2D, v0.D[1] \n" /* G */ \
+#ifdef HAS_ARGB1555TOARGBROW_NEON
void ARGB1555ToARGBRow_NEON(const uint8* src_argb1555, uint8* dst_argb,
- int width) {
+ int pix) {
asm volatile (
"movi v3.8b, #255 \n" // Alpha
"1: \n"
@@ -917,11 +1035,12 @@
"b.gt 1b \n"
: "+r"(src_argb1555), // %0
"+r"(dst_argb), // %1
- "+r"(width) // %2
+ "+r"(pix) // %2
:
: "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List
);
}
+#endif // HAS_ARGB1555TOARGBROW_NEON
#define ARGB4444TOARGB \
"shrn v1.8b, v0.8h, #8 \n" /* v1(l) AR */ \
@@ -935,8 +1054,9 @@
"dup v0.2D, v2.D[1] \n" \
"dup v1.2D, v3.D[1] \n"
+#ifdef HAS_ARGB4444TOARGBROW_NEON
void ARGB4444ToARGBRow_NEON(const uint8* src_argb4444, uint8* dst_argb,
- int width) {
+ int pix) {
asm volatile (
"1: \n"
MEMACCESS(0)
@@ -948,13 +1068,15 @@
"b.gt 1b \n"
: "+r"(src_argb4444), // %0
"+r"(dst_argb), // %1
- "+r"(width) // %2
+ "+r"(pix) // %2
:
: "cc", "memory", "v0", "v1", "v2", "v3", "v4" // Clobber List
);
}
+#endif // HAS_ARGB4444TOARGBROW_NEON
-void ARGBToRGB24Row_NEON(const uint8* src_argb, uint8* dst_rgb24, int width) {
+#ifdef HAS_ARGBTORGB24ROW_NEON
+void ARGBToRGB24Row_NEON(const uint8* src_argb, uint8* dst_rgb24, int pix) {
asm volatile (
"1: \n"
MEMACCESS(0)
@@ -965,13 +1087,15 @@
"b.gt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_rgb24), // %1
- "+r"(width) // %2
+ "+r"(pix) // %2
:
: "cc", "memory", "v1", "v2", "v3", "v4" // Clobber List
);
}
+#endif // HAS_ARGBTORGB24ROW_NEON
-void ARGBToRAWRow_NEON(const uint8* src_argb, uint8* dst_raw, int width) {
+#ifdef HAS_ARGBTORAWROW_NEON
+void ARGBToRAWRow_NEON(const uint8* src_argb, uint8* dst_raw, int pix) {
asm volatile (
"1: \n"
MEMACCESS(0)
@@ -984,13 +1108,15 @@
"b.gt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_raw), // %1
- "+r"(width) // %2
+ "+r"(pix) // %2
:
: "cc", "memory", "v1", "v2", "v3", "v4", "v5" // Clobber List
);
}
+#endif // HAS_ARGBTORAWROW_NEON
-void YUY2ToYRow_NEON(const uint8* src_yuy2, uint8* dst_y, int width) {
+#ifdef HAS_YUY2TOYROW_NEON
+void YUY2ToYRow_NEON(const uint8* src_yuy2, uint8* dst_y, int pix) {
asm volatile (
"1: \n"
MEMACCESS(0)
@@ -1001,13 +1127,15 @@
"b.gt 1b \n"
: "+r"(src_yuy2), // %0
"+r"(dst_y), // %1
- "+r"(width) // %2
+ "+r"(pix) // %2
:
: "cc", "memory", "v0", "v1" // Clobber List
);
}
+#endif // HAS_YUY2TOYROW_NEON
-void UYVYToYRow_NEON(const uint8* src_uyvy, uint8* dst_y, int width) {
+#ifdef HAS_UYVYTOYROW_NEON
+void UYVYToYRow_NEON(const uint8* src_uyvy, uint8* dst_y, int pix) {
asm volatile (
"1: \n"
MEMACCESS(0)
@@ -1018,14 +1146,16 @@
"b.gt 1b \n"
: "+r"(src_uyvy), // %0
"+r"(dst_y), // %1
- "+r"(width) // %2
+ "+r"(pix) // %2
:
: "cc", "memory", "v0", "v1" // Clobber List
);
}
+#endif // HAS_UYVYTOYROW_NEON
+#ifdef HAS_YUY2TOUV422ROW_NEON
void YUY2ToUV422Row_NEON(const uint8* src_yuy2, uint8* dst_u, uint8* dst_v,
- int width) {
+ int pix) {
asm volatile (
"1: \n"
MEMACCESS(0)
@@ -1039,14 +1169,16 @@
: "+r"(src_yuy2), // %0
"+r"(dst_u), // %1
"+r"(dst_v), // %2
- "+r"(width) // %3
+ "+r"(pix) // %3
:
: "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List
);
}
+#endif // HAS_YUY2TOUV422ROW_NEON
+#ifdef HAS_UYVYTOUV422ROW_NEON
void UYVYToUV422Row_NEON(const uint8* src_uyvy, uint8* dst_u, uint8* dst_v,
- int width) {
+ int pix) {
asm volatile (
"1: \n"
MEMACCESS(0)
@@ -1060,14 +1192,16 @@
: "+r"(src_uyvy), // %0
"+r"(dst_u), // %1
"+r"(dst_v), // %2
- "+r"(width) // %3
+ "+r"(pix) // %3
:
: "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List
);
}
+#endif // HAS_UYVYTOUV422ROW_NEON
+#ifdef HAS_YUY2TOUVROW_NEON
void YUY2ToUVRow_NEON(const uint8* src_yuy2, int stride_yuy2,
- uint8* dst_u, uint8* dst_v, int width) {
+ uint8* dst_u, uint8* dst_v, int pix) {
const uint8* src_yuy2b = src_yuy2 + stride_yuy2;
asm volatile (
"1: \n"
@@ -1087,15 +1221,17 @@
"+r"(src_yuy2b), // %1
"+r"(dst_u), // %2
"+r"(dst_v), // %3
- "+r"(width) // %4
+ "+r"(pix) // %4
:
: "cc", "memory", "v0", "v1", "v2", "v3", "v4",
"v5", "v6", "v7" // Clobber List
);
}
+#endif // HAS_YUY2TOUVROW_NEON
+#ifdef HAS_UYVYTOUVROW_NEON
void UYVYToUVRow_NEON(const uint8* src_uyvy, int stride_uyvy,
- uint8* dst_u, uint8* dst_v, int width) {
+ uint8* dst_u, uint8* dst_v, int pix) {
const uint8* src_uyvyb = src_uyvy + stride_uyvy;
asm volatile (
"1: \n"
@@ -1115,16 +1251,18 @@
"+r"(src_uyvyb), // %1
"+r"(dst_u), // %2
"+r"(dst_v), // %3
- "+r"(width) // %4
+ "+r"(pix) // %4
:
: "cc", "memory", "v0", "v1", "v2", "v3", "v4",
"v5", "v6", "v7" // Clobber List
);
}
+#endif // HAS_UYVYTOUVROW_NEON
// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
+#ifdef HAS_ARGBSHUFFLEROW_NEON
void ARGBShuffleRow_NEON(const uint8* src_argb, uint8* dst_argb,
- const uint8* shuffler, int width) {
+ const uint8* shuffler, int pix) {
asm volatile (
MEMACCESS(3)
"ld1 {v2.16b}, [%3] \n" // shuffler
@@ -1138,12 +1276,14 @@
"b.gt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_argb), // %1
- "+r"(width) // %2
+ "+r"(pix) // %2
: "r"(shuffler) // %3
: "cc", "memory", "v0", "v1", "v2" // Clobber List
);
}
+#endif // HAS_ARGBSHUFFLEROW_NEON
+#ifdef HAS_I422TOYUY2ROW_NEON
void I422ToYUY2Row_NEON(const uint8* src_y,
const uint8* src_u,
const uint8* src_v,
@@ -1170,7 +1310,9 @@
: "cc", "memory", "v0", "v1", "v2", "v3"
);
}
+#endif // HAS_I422TOYUY2ROW_NEON
+#ifdef HAS_I422TOUYVYROW_NEON
void I422ToUYVYRow_NEON(const uint8* src_y,
const uint8* src_u,
const uint8* src_v,
@@ -1197,8 +1339,10 @@
: "cc", "memory", "v0", "v1", "v2", "v3"
);
}
+#endif // HAS_I422TOUYVYROW_NEON
-void ARGBToRGB565Row_NEON(const uint8* src_argb, uint8* dst_rgb565, int width) {
+#ifdef HAS_ARGBTORGB565ROW_NEON
+void ARGBToRGB565Row_NEON(const uint8* src_argb, uint8* dst_rgb565, int pix) {
asm volatile (
"1: \n"
MEMACCESS(0)
@@ -1210,12 +1354,14 @@
"b.gt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_rgb565), // %1
- "+r"(width) // %2
+ "+r"(pix) // %2
:
: "cc", "memory", "v0", "v20", "v21", "v22", "v23"
);
}
+#endif // HAS_ARGBTORGB565ROW_NEON
+#ifdef HAS_ARGBTORGB565DITHERROW_NEON
void ARGBToRGB565DitherRow_NEON(const uint8* src_argb, uint8* dst_rgb,
const uint32 dither4, int width) {
asm volatile (
@@ -1238,9 +1384,11 @@
: "cc", "memory", "v0", "v1", "v20", "v21", "v22", "v23"
);
}
+#endif // HAS_ARGBTORGB565ROW_NEON
+#ifdef HAS_ARGBTOARGB1555ROW_NEON
void ARGBToARGB1555Row_NEON(const uint8* src_argb, uint8* dst_argb1555,
- int width) {
+ int pix) {
asm volatile (
"1: \n"
MEMACCESS(0)
@@ -1252,14 +1400,16 @@
"b.gt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_argb1555), // %1
- "+r"(width) // %2
+ "+r"(pix) // %2
:
: "cc", "memory", "v0", "v20", "v21", "v22", "v23"
);
}
+#endif // HAS_ARGBTOARGB1555ROW_NEON
+#ifdef HAS_ARGBTOARGB4444ROW_NEON
void ARGBToARGB4444Row_NEON(const uint8* src_argb, uint8* dst_argb4444,
- int width) {
+ int pix) {
asm volatile (
"movi v4.16b, #0x0f \n" // bits to clear with vbic.
"1: \n"
@@ -1272,13 +1422,15 @@
"b.gt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_argb4444), // %1
- "+r"(width) // %2
+ "+r"(pix) // %2
:
: "cc", "memory", "v0", "v1", "v4", "v20", "v21", "v22", "v23"
);
}
+#endif // HAS_ARGBTOARGB4444ROW_NEON
-void ARGBToYRow_NEON(const uint8* src_argb, uint8* dst_y, int width) {
+#ifdef HAS_ARGBTOYROW_NEON
+void ARGBToYRow_NEON(const uint8* src_argb, uint8* dst_y, int pix) {
asm volatile (
"movi v4.8b, #13 \n" // B * 0.1016 coefficient
"movi v5.8b, #65 \n" // G * 0.5078 coefficient
@@ -1298,31 +1450,16 @@
"b.gt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_y), // %1
- "+r"(width) // %2
+ "+r"(pix) // %2
:
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"
);
}
+#endif // HAS_ARGBTOYROW_NEON
-void ARGBExtractAlphaRow_NEON(const uint8* src_argb, uint8* dst_a, int width) {
+#ifdef HAS_ARGBTOYJROW_NEON
+void ARGBToYJRow_NEON(const uint8* src_argb, uint8* dst_y, int pix) {
asm volatile (
- "1: \n"
- MEMACCESS(0)
- "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load row 16 pixels
- "subs %w2, %w2, #16 \n" // 16 processed per loop
- MEMACCESS(1)
- "st1 {v3.16b}, [%1], #16 \n" // store 16 A's.
- "b.gt 1b \n"
- : "+r"(src_argb), // %0
- "+r"(dst_a), // %1
- "+r"(width) // %2
- :
- : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List
- );
-}
-
-void ARGBToYJRow_NEON(const uint8* src_argb, uint8* dst_y, int width) {
- asm volatile (
"movi v4.8b, #15 \n" // B * 0.11400 coefficient
"movi v5.8b, #75 \n" // G * 0.58700 coefficient
"movi v6.8b, #38 \n" // R * 0.29900 coefficient
@@ -1339,15 +1476,17 @@
"b.gt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_y), // %1
- "+r"(width) // %2
+ "+r"(pix) // %2
:
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6"
);
}
+#endif // HAS_ARGBTOYJROW_NEON
// 8x1 pixels.
+#ifdef HAS_ARGBTOUV444ROW_NEON
void ARGBToUV444Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
- int width) {
+ int pix) {
asm volatile (
"movi v24.8b, #112 \n" // UB / VR 0.875 coefficient
"movi v25.8b, #74 \n" // UG -0.5781 coefficient
@@ -1380,24 +1519,62 @@
: "+r"(src_argb), // %0
"+r"(dst_u), // %1
"+r"(dst_v), // %2
- "+r"(width) // %3
+ "+r"(pix) // %3
:
: "cc", "memory", "v0", "v1", "v2", "v3", "v4",
"v24", "v25", "v26", "v27", "v28", "v29"
);
}
+#endif // HAS_ARGBTOUV444ROW_NEON
-#define RGBTOUV_SETUP_REG \
- "movi v20.8h, #56, lsl #0 \n" /* UB/VR coefficient (0.875) / 2 */ \
- "movi v21.8h, #37, lsl #0 \n" /* UG coefficient (-0.5781) / 2 */ \
- "movi v22.8h, #19, lsl #0 \n" /* UR coefficient (-0.2969) / 2 */ \
- "movi v23.8h, #9, lsl #0 \n" /* VB coefficient (-0.1406) / 2 */ \
- "movi v24.8h, #47, lsl #0 \n" /* VG coefficient (-0.7344) / 2 */ \
- "movi v25.16b, #0x80 \n" /* 128.5 (0x8080 in 16-bit) */
+// 16x1 pixels -> 8x1. pix is number of argb pixels. e.g. 16.
+#ifdef HAS_ARGBTOUV422ROW_NEON
+void ARGBToUV422Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
+ int pix) {
+ asm volatile (
+ RGBTOUV_SETUP_REG
+ "1: \n"
+ MEMACCESS(0)
+ "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels.
-// 32x1 pixels -> 8x1. width is number of argb pixels. e.g. 32.
+ "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts.
+ "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts.
+ "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts.
+
+ "subs %w3, %w3, #16 \n" // 16 processed per loop.
+ "mul v3.8h, v0.8h, v20.8h \n" // B
+ "mls v3.8h, v1.8h, v21.8h \n" // G
+ "mls v3.8h, v2.8h, v22.8h \n" // R
+ "add v3.8h, v3.8h, v25.8h \n" // +128 -> unsigned
+
+ "mul v4.8h, v2.8h, v20.8h \n" // R
+ "mls v4.8h, v1.8h, v24.8h \n" // G
+ "mls v4.8h, v0.8h, v23.8h \n" // B
+ "add v4.8h, v4.8h, v25.8h \n" // +128 -> unsigned
+
+ "uqshrn v0.8b, v3.8h, #8 \n" // 16 bit to 8 bit U
+ "uqshrn v1.8b, v4.8h, #8 \n" // 16 bit to 8 bit V
+
+ MEMACCESS(1)
+ "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels U.
+ MEMACCESS(2)
+ "st1 {v1.8b}, [%2], #8 \n" // store 8 pixels V.
+ "b.gt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+r"(pix) // %3
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
+ "v20", "v21", "v22", "v23", "v24", "v25"
+ );
+}
+#endif // HAS_ARGBTOUV422ROW_NEON
+
+// 32x1 pixels -> 8x1. pix is number of argb pixels. e.g. 32.
+#ifdef HAS_ARGBTOUV411ROW_NEON
void ARGBToUV411Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
- int width) {
+ int pix) {
asm volatile (
RGBTOUV_SETUP_REG
"1: \n"
@@ -1439,14 +1616,15 @@
: "+r"(src_argb), // %0
"+r"(dst_u), // %1
"+r"(dst_v), // %2
- "+r"(width) // %3
+ "+r"(pix) // %3
:
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
"v20", "v21", "v22", "v23", "v24", "v25"
);
}
+#endif // HAS_ARGBTOUV411ROW_NEON
-// 16x2 pixels -> 8x1. width is number of argb pixels. e.g. 16.
+// 16x2 pixels -> 8x1. pix is number of argb pixels. e.g. 16.
#define RGBTOUV(QB, QG, QR) \
"mul v3.8h, " #QB ",v20.8h \n" /* B */ \
"mul v4.8h, " #QR ",v20.8h \n" /* R */ \
@@ -1462,8 +1640,9 @@
// TODO(fbarchard): Consider vhadd vertical, then vpaddl horizontal, avoid shr.
// TODO(fbarchard): consider ptrdiff_t for all strides.
+#ifdef HAS_ARGBTOUVROW_NEON
void ARGBToUVRow_NEON(const uint8* src_argb, int src_stride_argb,
- uint8* dst_u, uint8* dst_v, int width) {
+ uint8* dst_u, uint8* dst_v, int pix) {
const uint8* src_argb_1 = src_argb + src_stride_argb;
asm volatile (
RGBTOUV_SETUP_REG
@@ -1495,16 +1674,18 @@
"+r"(src_argb_1), // %1
"+r"(dst_u), // %2
"+r"(dst_v), // %3
- "+r"(width) // %4
+ "+r"(pix) // %4
:
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
"v20", "v21", "v22", "v23", "v24", "v25"
);
}
+#endif // HAS_ARGBTOUVROW_NEON
// TODO(fbarchard): Subsample match C code.
+#ifdef HAS_ARGBTOUVJROW_NEON
void ARGBToUVJRow_NEON(const uint8* src_argb, int src_stride_argb,
- uint8* dst_u, uint8* dst_v, int width) {
+ uint8* dst_u, uint8* dst_v, int pix) {
const uint8* src_argb_1 = src_argb + src_stride_argb;
asm volatile (
"movi v20.8h, #63, lsl #0 \n" // UB/VR coeff (0.500) / 2
@@ -1540,15 +1721,17 @@
"+r"(src_argb_1), // %1
"+r"(dst_u), // %2
"+r"(dst_v), // %3
- "+r"(width) // %4
+ "+r"(pix) // %4
:
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
"v20", "v21", "v22", "v23", "v24", "v25"
);
}
+#endif // HAS_ARGBTOUVJROW_NEON
+#ifdef HAS_BGRATOUVROW_NEON
void BGRAToUVRow_NEON(const uint8* src_bgra, int src_stride_bgra,
- uint8* dst_u, uint8* dst_v, int width) {
+ uint8* dst_u, uint8* dst_v, int pix) {
const uint8* src_bgra_1 = src_bgra + src_stride_bgra;
asm volatile (
RGBTOUV_SETUP_REG
@@ -1579,15 +1762,17 @@
"+r"(src_bgra_1), // %1
"+r"(dst_u), // %2
"+r"(dst_v), // %3
- "+r"(width) // %4
+ "+r"(pix) // %4
:
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
"v20", "v21", "v22", "v23", "v24", "v25"
);
}
+#endif // HAS_BGRATOUVROW_NEON
+#ifdef HAS_ABGRTOUVROW_NEON
void ABGRToUVRow_NEON(const uint8* src_abgr, int src_stride_abgr,
- uint8* dst_u, uint8* dst_v, int width) {
+ uint8* dst_u, uint8* dst_v, int pix) {
const uint8* src_abgr_1 = src_abgr + src_stride_abgr;
asm volatile (
RGBTOUV_SETUP_REG
@@ -1618,15 +1803,17 @@
"+r"(src_abgr_1), // %1
"+r"(dst_u), // %2
"+r"(dst_v), // %3
- "+r"(width) // %4
+ "+r"(pix) // %4
:
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
"v20", "v21", "v22", "v23", "v24", "v25"
);
}
+#endif // HAS_ABGRTOUVROW_NEON
+#ifdef HAS_RGBATOUVROW_NEON
void RGBAToUVRow_NEON(const uint8* src_rgba, int src_stride_rgba,
- uint8* dst_u, uint8* dst_v, int width) {
+ uint8* dst_u, uint8* dst_v, int pix) {
const uint8* src_rgba_1 = src_rgba + src_stride_rgba;
asm volatile (
RGBTOUV_SETUP_REG
@@ -1657,15 +1844,17 @@
"+r"(src_rgba_1), // %1
"+r"(dst_u), // %2
"+r"(dst_v), // %3
- "+r"(width) // %4
+ "+r"(pix) // %4
:
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
"v20", "v21", "v22", "v23", "v24", "v25"
);
}
+#endif // HAS_RGBATOUVROW_NEON
+#ifdef HAS_RGB24TOUVROW_NEON
void RGB24ToUVRow_NEON(const uint8* src_rgb24, int src_stride_rgb24,
- uint8* dst_u, uint8* dst_v, int width) {
+ uint8* dst_u, uint8* dst_v, int pix) {
const uint8* src_rgb24_1 = src_rgb24 + src_stride_rgb24;
asm volatile (
RGBTOUV_SETUP_REG
@@ -1696,15 +1885,17 @@
"+r"(src_rgb24_1), // %1
"+r"(dst_u), // %2
"+r"(dst_v), // %3
- "+r"(width) // %4
+ "+r"(pix) // %4
:
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
"v20", "v21", "v22", "v23", "v24", "v25"
);
}
+#endif // HAS_RGB24TOUVROW_NEON
+#ifdef HAS_RAWTOUVROW_NEON
void RAWToUVRow_NEON(const uint8* src_raw, int src_stride_raw,
- uint8* dst_u, uint8* dst_v, int width) {
+ uint8* dst_u, uint8* dst_v, int pix) {
const uint8* src_raw_1 = src_raw + src_stride_raw;
asm volatile (
RGBTOUV_SETUP_REG
@@ -1735,16 +1926,18 @@
"+r"(src_raw_1), // %1
"+r"(dst_u), // %2
"+r"(dst_v), // %3
- "+r"(width) // %4
+ "+r"(pix) // %4
:
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
"v20", "v21", "v22", "v23", "v24", "v25"
);
}
+#endif // HAS_RAWTOUVROW_NEON
-// 16x2 pixels -> 8x1. width is number of argb pixels. e.g. 16.
+// 16x2 pixels -> 8x1. pix is number of argb pixels. e.g. 16.
+#ifdef HAS_RGB565TOUVROW_NEON
void RGB565ToUVRow_NEON(const uint8* src_rgb565, int src_stride_rgb565,
- uint8* dst_u, uint8* dst_v, int width) {
+ uint8* dst_u, uint8* dst_v, int pix) {
const uint8* src_rgb565_1 = src_rgb565 + src_stride_rgb565;
asm volatile (
"movi v22.8h, #56, lsl #0 \n" // UB / VR coeff (0.875) / 2
@@ -1808,7 +2001,7 @@
"+r"(src_rgb565_1), // %1
"+r"(dst_u), // %2
"+r"(dst_v), // %3
- "+r"(width) // %4
+ "+r"(pix) // %4
:
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
"v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24",
@@ -1815,10 +2008,12 @@
"v25", "v26", "v27"
);
}
+#endif // HAS_RGB565TOUVROW_NEON
-// 16x2 pixels -> 8x1. width is number of argb pixels. e.g. 16.
+// 16x2 pixels -> 8x1. pix is number of argb pixels. e.g. 16.
+#ifdef HAS_ARGB1555TOUVROW_NEON
void ARGB1555ToUVRow_NEON(const uint8* src_argb1555, int src_stride_argb1555,
- uint8* dst_u, uint8* dst_v, int width) {
+ uint8* dst_u, uint8* dst_v, int pix) {
const uint8* src_argb1555_1 = src_argb1555 + src_stride_argb1555;
asm volatile (
RGBTOUV_SETUP_REG
@@ -1877,7 +2072,7 @@
"+r"(src_argb1555_1), // %1
"+r"(dst_u), // %2
"+r"(dst_v), // %3
- "+r"(width) // %4
+ "+r"(pix) // %4
:
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6",
"v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25",
@@ -1884,10 +2079,12 @@
"v26", "v27", "v28"
);
}
+#endif // HAS_ARGB1555TOUVROW_NEON
-// 16x2 pixels -> 8x1. width is number of argb pixels. e.g. 16.
+// 16x2 pixels -> 8x1. pix is number of argb pixels. e.g. 16.
+#ifdef HAS_ARGB4444TOUVROW_NEON
void ARGB4444ToUVRow_NEON(const uint8* src_argb4444, int src_stride_argb4444,
- uint8* dst_u, uint8* dst_v, int width) {
+ uint8* dst_u, uint8* dst_v, int pix) {
const uint8* src_argb4444_1 = src_argb4444 + src_stride_argb4444;
asm volatile (
RGBTOUV_SETUP_REG
@@ -1946,7 +2143,7 @@
"+r"(src_argb4444_1), // %1
"+r"(dst_u), // %2
"+r"(dst_v), // %3
- "+r"(width) // %4
+ "+r"(pix) // %4
:
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6",
"v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25",
@@ -1954,8 +2151,10 @@
);
}
+#endif // HAS_ARGB4444TOUVROW_NEON
-void RGB565ToYRow_NEON(const uint8* src_rgb565, uint8* dst_y, int width) {
+#ifdef HAS_RGB565TOYROW_NEON
+void RGB565ToYRow_NEON(const uint8* src_rgb565, uint8* dst_y, int pix) {
asm volatile (
"movi v24.8b, #13 \n" // B * 0.1016 coefficient
"movi v25.8b, #65 \n" // G * 0.5078 coefficient
@@ -1976,14 +2175,16 @@
"b.gt 1b \n"
: "+r"(src_rgb565), // %0
"+r"(dst_y), // %1
- "+r"(width) // %2
+ "+r"(pix) // %2
:
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v6",
"v24", "v25", "v26", "v27"
);
}
+#endif // HAS_RGB565TOYROW_NEON
-void ARGB1555ToYRow_NEON(const uint8* src_argb1555, uint8* dst_y, int width) {
+#ifdef HAS_ARGB1555TOYROW_NEON
+void ARGB1555ToYRow_NEON(const uint8* src_argb1555, uint8* dst_y, int pix) {
asm volatile (
"movi v4.8b, #13 \n" // B * 0.1016 coefficient
"movi v5.8b, #65 \n" // G * 0.5078 coefficient
@@ -2004,13 +2205,15 @@
"b.gt 1b \n"
: "+r"(src_argb1555), // %0
"+r"(dst_y), // %1
- "+r"(width) // %2
+ "+r"(pix) // %2
:
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"
);
}
+#endif // HAS_ARGB1555TOYROW_NEON
-void ARGB4444ToYRow_NEON(const uint8* src_argb4444, uint8* dst_y, int width) {
+#ifdef HAS_ARGB4444TOYROW_NEON
+void ARGB4444ToYRow_NEON(const uint8* src_argb4444, uint8* dst_y, int pix) {
asm volatile (
"movi v24.8b, #13 \n" // B * 0.1016 coefficient
"movi v25.8b, #65 \n" // G * 0.5078 coefficient
@@ -2031,13 +2234,15 @@
"b.gt 1b \n"
: "+r"(src_argb4444), // %0
"+r"(dst_y), // %1
- "+r"(width) // %2
+ "+r"(pix) // %2
:
: "cc", "memory", "v0", "v1", "v2", "v3", "v24", "v25", "v26", "v27"
);
}
+#endif // HAS_ARGB4444TOYROW_NEON
-void BGRAToYRow_NEON(const uint8* src_bgra, uint8* dst_y, int width) {
+#ifdef HAS_BGRATOYROW_NEON
+void BGRAToYRow_NEON(const uint8* src_bgra, uint8* dst_y, int pix) {
asm volatile (
"movi v4.8b, #33 \n" // R * 0.2578 coefficient
"movi v5.8b, #65 \n" // G * 0.5078 coefficient
@@ -2057,13 +2262,15 @@
"b.gt 1b \n"
: "+r"(src_bgra), // %0
"+r"(dst_y), // %1
- "+r"(width) // %2
+ "+r"(pix) // %2
:
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16"
);
}
+#endif // HAS_BGRATOYROW_NEON
-void ABGRToYRow_NEON(const uint8* src_abgr, uint8* dst_y, int width) {
+#ifdef HAS_ABGRTOYROW_NEON
+void ABGRToYRow_NEON(const uint8* src_abgr, uint8* dst_y, int pix) {
asm volatile (
"movi v4.8b, #33 \n" // R * 0.2578 coefficient
"movi v5.8b, #65 \n" // G * 0.5078 coefficient
@@ -2083,13 +2290,15 @@
"b.gt 1b \n"
: "+r"(src_abgr), // %0
"+r"(dst_y), // %1
- "+r"(width) // %2
+ "+r"(pix) // %2
:
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16"
);
}
+#endif // HAS_ABGRTOYROW_NEON
-void RGBAToYRow_NEON(const uint8* src_rgba, uint8* dst_y, int width) {
+#ifdef HAS_RGBATOYROW_NEON
+void RGBAToYRow_NEON(const uint8* src_rgba, uint8* dst_y, int pix) {
asm volatile (
"movi v4.8b, #13 \n" // B * 0.1016 coefficient
"movi v5.8b, #65 \n" // G * 0.5078 coefficient
@@ -2109,13 +2318,15 @@
"b.gt 1b \n"
: "+r"(src_rgba), // %0
"+r"(dst_y), // %1
- "+r"(width) // %2
+ "+r"(pix) // %2
:
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16"
);
}
+#endif // HAS_RGBATOYROW_NEON
-void RGB24ToYRow_NEON(const uint8* src_rgb24, uint8* dst_y, int width) {
+#ifdef HAS_RGB24TOYROW_NEON
+void RGB24ToYRow_NEON(const uint8* src_rgb24, uint8* dst_y, int pix) {
asm volatile (
"movi v4.8b, #13 \n" // B * 0.1016 coefficient
"movi v5.8b, #65 \n" // G * 0.5078 coefficient
@@ -2135,13 +2346,15 @@
"b.gt 1b \n"
: "+r"(src_rgb24), // %0
"+r"(dst_y), // %1
- "+r"(width) // %2
+ "+r"(pix) // %2
:
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16"
);
}
+#endif // HAS_RGB24TOYROW_NEON
-void RAWToYRow_NEON(const uint8* src_raw, uint8* dst_y, int width) {
+#ifdef HAS_RAWTOYROW_NEON
+void RAWToYRow_NEON(const uint8* src_raw, uint8* dst_y, int pix) {
asm volatile (
"movi v4.8b, #33 \n" // R * 0.2578 coefficient
"movi v5.8b, #65 \n" // G * 0.5078 coefficient
@@ -2161,13 +2374,15 @@
"b.gt 1b \n"
: "+r"(src_raw), // %0
"+r"(dst_y), // %1
- "+r"(width) // %2
+ "+r"(pix) // %2
:
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16"
);
}
+#endif // HAS_RAWTOYROW_NEON
// Bilinear filter 16x2 -> 16x1
+#ifdef HAS_INTERPOLATEROW_NEON
void InterpolateRow_NEON(uint8* dst_ptr,
const uint8* src_ptr, ptrdiff_t src_stride,
int dst_width, int source_y_fraction) {
@@ -2177,8 +2392,12 @@
asm volatile (
"cmp %w4, #0 \n"
"b.eq 100f \n"
+ "cmp %w4, #64 \n"
+ "b.eq 75f \n"
"cmp %w4, #128 \n"
"b.eq 50f \n"
+ "cmp %w4, #192 \n"
+ "b.eq 25f \n"
"dup v5.16b, %w4 \n"
"dup v4.16b, %w5 \n"
@@ -2200,6 +2419,20 @@
"b.gt 1b \n"
"b 99f \n"
+ // Blend 25 / 75.
+ "25: \n"
+ MEMACCESS(1)
+ "ld1 {v0.16b}, [%1], #16 \n"
+ MEMACCESS(2)
+ "ld1 {v1.16b}, [%2], #16 \n"
+ "subs %w3, %w3, #16 \n"
+ "urhadd v0.16b, v0.16b, v1.16b \n"
+ "urhadd v0.16b, v0.16b, v1.16b \n"
+ MEMACCESS(0)
+ "st1 {v0.16b}, [%0], #16 \n"
+ "b.gt 25b \n"
+ "b 99f \n"
+
// Blend 50 / 50.
"50: \n"
MEMACCESS(1)
@@ -2213,6 +2446,20 @@
"b.gt 50b \n"
"b 99f \n"
+ // Blend 75 / 25.
+ "75: \n"
+ MEMACCESS(1)
+ "ld1 {v1.16b}, [%1], #16 \n"
+ MEMACCESS(2)
+ "ld1 {v0.16b}, [%2], #16 \n"
+ "subs %w3, %w3, #16 \n"
+ "urhadd v0.16b, v0.16b, v1.16b \n"
+ "urhadd v0.16b, v0.16b, v1.16b \n"
+ MEMACCESS(0)
+ "st1 {v0.16b}, [%0], #16 \n"
+ "b.gt 75b \n"
+ "b 99f \n"
+
// Blend 100 / 0 - Copy row unchanged.
"100: \n"
MEMACCESS(1)
@@ -2233,8 +2480,10 @@
: "cc", "memory", "v0", "v1", "v3", "v4", "v5"
);
}
+#endif // HAS_INTERPOLATEROW_NEON
// dr * (256 - sa) / 256 + sr = dr - dr * sa / 256 + sr
+#ifdef HAS_ARGBBLENDROW_NEON
void ARGBBlendRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
uint8* dst_argb, int width) {
asm volatile (
@@ -2303,8 +2552,10 @@
"v16", "v17", "v18"
);
}
+#endif // HAS_ARGBBLENDROW_NEON
// Attenuate 8 pixels at a time.
+#ifdef HAS_ARGBATTENUATEROW_NEON
void ARGBAttenuateRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) {
asm volatile (
// Attenuate 8 pixels.
@@ -2328,9 +2579,11 @@
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6"
);
}
+#endif // HAS_ARGBATTENUATEROW_NEON
// Quantize 8 ARGB pixels (32 bytes).
// dst = (dst * scale >> 16) * interval_size + interval_offset;
+#ifdef HAS_ARGBQUANTIZEROW_NEON
void ARGBQuantizeRow_NEON(uint8* dst_argb, int scale, int interval_size,
int interval_offset, int width) {
asm volatile (
@@ -2370,10 +2623,12 @@
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6"
);
}
+#endif // HAS_ARGBQUANTIZEROW_NEON
// Shade 8 pixels at a time by specified value.
// NOTE vqrdmulh.s16 q10, q10, d0[0] must use a scaler register from 0 to 8.
// Rounding in vqrdmulh does +1 to high if high bit of low s16 is set.
+#ifdef HAS_ARGBSHADEROW_NEON
void ARGBShadeRow_NEON(const uint8* src_argb, uint8* dst_argb, int width,
uint32 value) {
asm volatile (
@@ -2408,10 +2663,12 @@
: "cc", "memory", "v0", "v4", "v5", "v6", "v7"
);
}
+#endif // HAS_ARGBSHADEROW_NEON
// Convert 8 ARGB pixels (64 bytes) to 8 Gray ARGB pixels
// Similar to ARGBToYJ but stores ARGB.
// C code is (15 * b + 75 * g + 38 * r + 64) >> 7;
+#ifdef HAS_ARGBGRAYROW_NEON
void ARGBGrayRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) {
asm volatile (
"movi v24.8b, #15 \n" // B * 0.11400 coefficient
@@ -2437,6 +2694,7 @@
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v24", "v25", "v26"
);
}
+#endif // HAS_ARGBGRAYROW_NEON
// Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels.
// b = (r * 35 + g * 68 + b * 17) >> 7
@@ -2443,6 +2701,7 @@
// g = (r * 45 + g * 88 + b * 22) >> 7
// r = (r * 50 + g * 98 + b * 24) >> 7
+#ifdef HAS_ARGBSEPIAROW_NEON
void ARGBSepiaRow_NEON(uint8* dst_argb, int width) {
asm volatile (
"movi v20.8b, #17 \n" // BB coefficient
@@ -2480,10 +2739,12 @@
"v20", "v21", "v22", "v24", "v25", "v26", "v28", "v29", "v30"
);
}
+#endif // HAS_ARGBSEPIAROW_NEON
// Tranform 8 ARGB pixels (32 bytes) with color matrix.
// TODO(fbarchard): Was same as Sepia except matrix is provided. This function
// needs to saturate. Consider doing a non-saturating version.
+#ifdef HAS_ARGBCOLORMATRIXROW_NEON
void ARGBColorMatrixRow_NEON(const uint8* src_argb, uint8* dst_argb,
const int8* matrix_argb, int width) {
asm volatile (
@@ -2543,9 +2804,11 @@
"v18", "v19", "v22", "v23", "v24", "v25"
);
}
+#endif // HAS_ARGBCOLORMATRIXROW_NEON
// TODO(fbarchard): fix vqshrun in ARGBMultiplyRow_NEON and reenable.
// Multiply 2 rows of ARGB pixels together, 8 pixels at a time.
+#ifdef HAS_ARGBMULTIPLYROW_NEON
void ARGBMultiplyRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
uint8* dst_argb, int width) {
asm volatile (
@@ -2576,8 +2839,10 @@
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"
);
}
+#endif // HAS_ARGBMULTIPLYROW_NEON
// Add 2 rows of ARGB pixels together, 8 pixels at a time.
+#ifdef HAS_ARGBADDROW_NEON
void ARGBAddRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
uint8* dst_argb, int width) {
asm volatile (
@@ -2604,8 +2869,10 @@
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"
);
}
+#endif // HAS_ARGBADDROW_NEON
// Subtract 2 rows of ARGB pixels, 8 pixels at a time.
+#ifdef HAS_ARGBSUBTRACTROW_NEON
void ARGBSubtractRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
uint8* dst_argb, int width) {
asm volatile (
@@ -2632,6 +2899,7 @@
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"
);
}
+#endif // HAS_ARGBSUBTRACTROW_NEON
// Adds Sobel X and Sobel Y and stores Sobel into ARGB.
// A = 255
@@ -2638,6 +2906,7 @@
// R = Sobel
// G = Sobel
// B = Sobel
+#ifdef HAS_SOBELROW_NEON
void SobelRow_NEON(const uint8* src_sobelx, const uint8* src_sobely,
uint8* dst_argb, int width) {
asm volatile (
@@ -2663,8 +2932,10 @@
: "cc", "memory", "v0", "v1", "v2", "v3"
);
}
+#endif // HAS_SOBELROW_NEON
// Adds Sobel X and Sobel Y and stores Sobel into plane.
+#ifdef HAS_SOBELTOPLANEROW_NEON
void SobelToPlaneRow_NEON(const uint8* src_sobelx, const uint8* src_sobely,
uint8* dst_y, int width) {
asm volatile (
@@ -2687,6 +2958,7 @@
: "cc", "memory", "v0", "v1"
);
}
+#endif // HAS_SOBELTOPLANEROW_NEON
// Mixes Sobel X, Sobel Y and Sobel into ARGB.
// A = 255
@@ -2693,6 +2965,7 @@
// R = Sobel X
// G = Sobel
// B = Sobel Y
+#ifdef HAS_SOBELXYROW_NEON
void SobelXYRow_NEON(const uint8* src_sobelx, const uint8* src_sobely,
uint8* dst_argb, int width) {
asm volatile (
@@ -2716,11 +2989,13 @@
: "cc", "memory", "v0", "v1", "v2", "v3"
);
}
+#endif // HAS_SOBELXYROW_NEON
// SobelX as a matrix is
// -1 0 1
// -2 0 2
// -1 0 1
+#ifdef HAS_SOBELXROW_NEON
void SobelXRow_NEON(const uint8* src_y0, const uint8* src_y1,
const uint8* src_y2, uint8* dst_sobelx, int width) {
asm volatile (
@@ -2759,11 +3034,13 @@
: "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List
);
}
+#endif // HAS_SOBELXROW_NEON
// SobelY as a matrix is
// -1 -2 -1
// 0 0 0
// 1 2 1
+#ifdef HAS_SOBELYROW_NEON
void SobelYRow_NEON(const uint8* src_y0, const uint8* src_y1,
uint8* dst_sobely, int width) {
asm volatile (
@@ -2801,6 +3078,7 @@
: "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List
);
}
+#endif // HAS_SOBELYROW_NEON
#endif // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
#ifdef __cplusplus
--- a/third_party/libyuv/source/row_win.cc
+++ b/third_party/libyuv/source/row_win.cc
@@ -21,108 +21,183 @@
extern "C" {
#endif
-// This module is for Visual C 32/64 bit and clangcl 32 bit
-#if !defined(LIBYUV_DISABLE_X86) && \
- (defined(_M_IX86) || (defined(_M_X64) && !defined(__clang__)))
+// This module is for Visual C.
+#if !defined(LIBYUV_DISABLE_X86) && (defined(_M_IX86) || defined(_M_X64)) && \
+ defined(_MSC_VER) && !defined(__clang__)
-// 64 bit
-#if defined(_M_X64)
+struct YuvConstants {
+ lvec8 kUVToB; // 0
+ lvec8 kUVToG; // 32
+ lvec8 kUVToR; // 64
+ lvec16 kUVBiasB; // 96
+ lvec16 kUVBiasG; // 128
+ lvec16 kUVBiasR; // 160
+ lvec16 kYToRgb; // 192
+};
-// Read 4 UV from 422, upsample to 8 UV.
-#define READYUV422 \
- xmm0 = _mm_cvtsi32_si128(*(uint32*)u_buf); \
- xmm1 = _mm_cvtsi32_si128(*(uint32*)(u_buf + offset)); \
- xmm0 = _mm_unpacklo_epi8(xmm0, xmm1); \
- xmm0 = _mm_unpacklo_epi16(xmm0, xmm0); \
- u_buf += 4; \
- xmm4 = _mm_loadl_epi64((__m128i*)y_buf); \
- xmm4 = _mm_unpacklo_epi8(xmm4, xmm4); \
- y_buf += 8;
+// BT.601 YUV to RGB reference
+// R = (Y - 16) * 1.164 - V * -1.596
+// G = (Y - 16) * 1.164 - U * 0.391 - V * 0.813
+// B = (Y - 16) * 1.164 - U * -2.018
-// Read 4 UV from 422, upsample to 8 UV. With 8 Alpha.
-#define READYUVA422 \
- xmm0 = _mm_cvtsi32_si128(*(uint32*)u_buf); \
- xmm1 = _mm_cvtsi32_si128(*(uint32*)(u_buf + offset)); \
- xmm0 = _mm_unpacklo_epi8(xmm0, xmm1); \
- xmm0 = _mm_unpacklo_epi16(xmm0, xmm0); \
- u_buf += 4; \
- xmm4 = _mm_loadl_epi64((__m128i*)y_buf); \
- xmm4 = _mm_unpacklo_epi8(xmm4, xmm4); \
- y_buf += 8; \
- xmm5 = _mm_loadl_epi64((__m128i*)a_buf); \
- a_buf += 8;
+// Y contribution to R,G,B. Scale and bias.
+// TODO(fbarchard): Consider moving constants into a common header.
+#define YG 18997 /* round(1.164 * 64 * 256 * 256 / 257) */
+#define YGB -1160 /* 1.164 * 64 * -16 + 64 / 2 */
-// Convert 8 pixels: 8 UV and 8 Y.
-#define YUVTORGB(yuvconstants) \
- xmm1 = _mm_loadu_si128(&xmm0); \
- xmm2 = _mm_loadu_si128(&xmm0); \
- xmm0 = _mm_maddubs_epi16(xmm0, *(__m128i*)yuvconstants->kUVToB); \
- xmm1 = _mm_maddubs_epi16(xmm1, *(__m128i*)yuvconstants->kUVToG); \
- xmm2 = _mm_maddubs_epi16(xmm2, *(__m128i*)yuvconstants->kUVToR); \
- xmm0 = _mm_sub_epi16(*(__m128i*)yuvconstants->kUVBiasB, xmm0); \
- xmm1 = _mm_sub_epi16(*(__m128i*)yuvconstants->kUVBiasG, xmm1); \
- xmm2 = _mm_sub_epi16(*(__m128i*)yuvconstants->kUVBiasR, xmm2); \
- xmm4 = _mm_mulhi_epu16(xmm4, *(__m128i*)yuvconstants->kYToRgb); \
- xmm0 = _mm_adds_epi16(xmm0, xmm4); \
- xmm1 = _mm_adds_epi16(xmm1, xmm4); \
- xmm2 = _mm_adds_epi16(xmm2, xmm4); \
- xmm0 = _mm_srai_epi16(xmm0, 6); \
- xmm1 = _mm_srai_epi16(xmm1, 6); \
- xmm2 = _mm_srai_epi16(xmm2, 6); \
- xmm0 = _mm_packus_epi16(xmm0, xmm0); \
- xmm1 = _mm_packus_epi16(xmm1, xmm1); \
- xmm2 = _mm_packus_epi16(xmm2, xmm2);
+// U and V contributions to R,G,B.
+#define UB -128 /* max(-128, round(-2.018 * 64)) */
+#define UG 25 /* round(0.391 * 64) */
+#define VG 52 /* round(0.813 * 64) */
+#define VR -102 /* round(-1.596 * 64) */
-// Store 8 ARGB values.
-#define STOREARGB \
- xmm0 = _mm_unpacklo_epi8(xmm0, xmm1); \
- xmm2 = _mm_unpacklo_epi8(xmm2, xmm5); \
- xmm1 = _mm_loadu_si128(&xmm0); \
- xmm0 = _mm_unpacklo_epi16(xmm0, xmm2); \
- xmm1 = _mm_unpackhi_epi16(xmm1, xmm2); \
- _mm_storeu_si128((__m128i *)dst_argb, xmm0); \
- _mm_storeu_si128((__m128i *)(dst_argb + 16), xmm1); \
- dst_argb += 32;
+// Bias values to subtract 16 from Y and 128 from U and V.
+#define BB (UB * 128 + YGB)
+#define BG (UG * 128 + VG * 128 + YGB)
+#define BR (VR * 128 + YGB)
+// BT601 constants for YUV to RGB.
+static YuvConstants SIMD_ALIGNED(kYuvConstants) = {
+ { UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0,
+ UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0 },
+ { UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG,
+ UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG },
+ { 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR,
+ 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR },
+ { BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB },
+ { BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG },
+ { BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR },
+ { YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG }
+};
+// BT601 constants for NV21 where chroma plane is VU instead of UV.
+static YuvConstants SIMD_ALIGNED(kYvuConstants) = {
+ { 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB,
+ 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB },
+ { VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG,
+ VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG },
+ { VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0,
+ VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0 },
+ { BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB },
+ { BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG },
+ { BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR },
+ { YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG }
+};
+
+#undef YG
+#undef YGB
+#undef UB
+#undef UG
+#undef VG
+#undef VR
+#undef BB
+#undef BG
+#undef BR
+
+// JPEG YUV to RGB reference
+// * R = Y - V * -1.40200
+// * G = Y - U * 0.34414 - V * 0.71414
+// * B = Y - U * -1.77200
+
+// Y contribution to R,G,B. Scale and bias.
+// TODO(fbarchard): Consider moving constants into a common header.
+#define YGJ 16320 /* round(1.000 * 64 * 256 * 256 / 257) */
+#define YGBJ 32 /* 64 / 2 */
+
+// U and V contributions to R,G,B.
+#define UBJ -113 /* round(-1.77200 * 64) */
+#define UGJ 22 /* round(0.34414 * 64) */
+#define VGJ 46 /* round(0.71414 * 64) */
+#define VRJ -90 /* round(-1.40200 * 64) */
+
+// Bias values to subtract 16 from Y and 128 from U and V.
+#define BBJ (UBJ * 128 + YGBJ)
+#define BGJ (UGJ * 128 + VGJ * 128 + YGBJ)
+#define BRJ (VRJ * 128 + YGBJ)
+
+// JPEG constants for YUV to RGB.
+static YuvConstants SIMD_ALIGNED(kYuvJConstants) = {
+ { UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0,
+ UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0 },
+ { UGJ, VGJ, UGJ, VGJ, UGJ, VGJ, UGJ, VGJ,
+ UGJ, VGJ, UGJ, VGJ, UGJ, VGJ, UGJ, VGJ,
+ UGJ, VGJ, UGJ, VGJ, UGJ, VGJ, UGJ, VGJ,
+ UGJ, VGJ, UGJ, VGJ, UGJ, VGJ, UGJ, VGJ },
+ { 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ,
+ 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ },
+ { BBJ, BBJ, BBJ, BBJ, BBJ, BBJ, BBJ, BBJ,
+ BBJ, BBJ, BBJ, BBJ, BBJ, BBJ, BBJ, BBJ },
+ { BGJ, BGJ, BGJ, BGJ, BGJ, BGJ, BGJ, BGJ,
+ BGJ, BGJ, BGJ, BGJ, BGJ, BGJ, BGJ, BGJ },
+ { BRJ, BRJ, BRJ, BRJ, BRJ, BRJ, BRJ, BRJ,
+ BRJ, BRJ, BRJ, BRJ, BRJ, BRJ, BRJ, BRJ },
+ { YGJ, YGJ, YGJ, YGJ, YGJ, YGJ, YGJ, YGJ,
+ YGJ, YGJ, YGJ, YGJ, YGJ, YGJ, YGJ, YGJ }
+};
+
+#undef YGJ
+#undef YGBJ
+#undef UBJ
+#undef UGJ
+#undef VGJ
+#undef VRJ
+#undef BBJ
+#undef BGJ
+#undef BRJ
+
+// 64 bit
+#if defined(_M_X64)
#if defined(HAS_I422TOARGBROW_SSSE3)
void I422ToARGBRow_SSSE3(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* dst_argb,
- const struct YuvConstants* yuvconstants,
int width) {
- __m128i xmm0, xmm1, xmm2, xmm4;
+ __m128i xmm0, xmm1, xmm2, xmm3;
const __m128i xmm5 = _mm_set1_epi8(-1);
const ptrdiff_t offset = (uint8*)v_buf - (uint8*)u_buf;
- while (width > 0) {
- READYUV422
- YUVTORGB(yuvconstants)
- STOREARGB
- width -= 8;
- }
-}
-#endif
-#if defined(HAS_I422ALPHATOARGBROW_SSSE3)
-void I422AlphaToARGBRow_SSSE3(const uint8* y_buf,
- const uint8* u_buf,
- const uint8* v_buf,
- const uint8* a_buf,
- uint8* dst_argb,
- const struct YuvConstants* yuvconstants,
- int width) {
- __m128i xmm0, xmm1, xmm2, xmm4, xmm5;
- const ptrdiff_t offset = (uint8*)v_buf - (uint8*)u_buf;
while (width > 0) {
- READYUVA422
- YUVTORGB(yuvconstants)
- STOREARGB
+ xmm0 = _mm_cvtsi32_si128(*(uint32*)u_buf);
+ xmm1 = _mm_cvtsi32_si128(*(uint32*)(u_buf + offset));
+ xmm0 = _mm_unpacklo_epi8(xmm0, xmm1);
+ xmm0 = _mm_unpacklo_epi16(xmm0, xmm0);
+ xmm1 = _mm_loadu_si128(&xmm0);
+ xmm2 = _mm_loadu_si128(&xmm0);
+ xmm0 = _mm_maddubs_epi16(xmm0, *(__m128i*)kYuvConstants.kUVToB);
+ xmm1 = _mm_maddubs_epi16(xmm1, *(__m128i*)kYuvConstants.kUVToG);
+ xmm2 = _mm_maddubs_epi16(xmm2, *(__m128i*)kYuvConstants.kUVToR);
+ xmm0 = _mm_sub_epi16(*(__m128i*)kYuvConstants.kUVBiasB, xmm0);
+ xmm1 = _mm_sub_epi16(*(__m128i*)kYuvConstants.kUVBiasG, xmm1);
+ xmm2 = _mm_sub_epi16(*(__m128i*)kYuvConstants.kUVBiasR, xmm2);
+ xmm3 = _mm_loadl_epi64((__m128i*)y_buf);
+ xmm3 = _mm_unpacklo_epi8(xmm3, xmm3);
+ xmm3 = _mm_mulhi_epu16(xmm3, *(__m128i*)kYuvConstants.kYToRgb);
+ xmm0 = _mm_adds_epi16(xmm0, xmm3);
+ xmm1 = _mm_adds_epi16(xmm1, xmm3);
+ xmm2 = _mm_adds_epi16(xmm2, xmm3);
+ xmm0 = _mm_srai_epi16(xmm0, 6);
+ xmm1 = _mm_srai_epi16(xmm1, 6);
+ xmm2 = _mm_srai_epi16(xmm2, 6);
+ xmm0 = _mm_packus_epi16(xmm0, xmm0);
+ xmm1 = _mm_packus_epi16(xmm1, xmm1);
+ xmm2 = _mm_packus_epi16(xmm2, xmm2);
+ xmm0 = _mm_unpacklo_epi8(xmm0, xmm1);
+ xmm2 = _mm_unpacklo_epi8(xmm2, xmm5);
+ xmm1 = _mm_loadu_si128(&xmm0);
+ xmm0 = _mm_unpacklo_epi16(xmm0, xmm2);
+ xmm1 = _mm_unpackhi_epi16(xmm1, xmm2);
+
+ _mm_storeu_si128((__m128i *)dst_argb, xmm0);
+ _mm_storeu_si128((__m128i *)(dst_argb + 16), xmm1);
+
+ y_buf += 8;
+ u_buf += 4;
+ dst_argb += 32;
width -= 8;
}
}
#endif
-
// 32 bit
#else // defined(_M_X64)
#ifdef HAS_ARGBTOYROW_SSSE3
@@ -226,24 +301,6 @@
2u, 1u, 0u, 12u, 5u, 4u, 3u, 13u, 8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u
};
-// Shuffle table for converting RAW to RGB24. First 8.
-static const uvec8 kShuffleMaskRAWToRGB24_0 = {
- 2u, 1u, 0u, 5u, 4u, 3u, 8u, 7u,
- 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u
-};
-
-// Shuffle table for converting RAW to RGB24. Middle 8.
-static const uvec8 kShuffleMaskRAWToRGB24_1 = {
- 2u, 7u, 6u, 5u, 10u, 9u, 8u, 13u,
- 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u
-};
-
-// Shuffle table for converting RAW to RGB24. Last 8.
-static const uvec8 kShuffleMaskRAWToRGB24_2 = {
- 8u, 7u, 12u, 11u, 10u, 15u, 14u, 13u,
- 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u
-};
-
// Shuffle table for converting ARGB to RGB24.
static const uvec8 kShuffleMaskARGBToRGB24 = {
0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 10u, 12u, 13u, 14u, 128u, 128u, 128u, 128u
@@ -259,43 +316,18 @@
0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 128u, 128u, 128u, 128u, 10u, 12u, 13u, 14u
};
-// YUY2 shuf 16 Y to 32 Y.
-static const lvec8 kShuffleYUY2Y = {
- 0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14,
- 0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14
+// Shuffle table for converting ARGB to RAW.
+static const uvec8 kShuffleMaskARGBToRAW_0 = {
+ 2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 128u, 128u, 128u, 128u, 8u, 14u, 13u, 12u
};
-// YUY2 shuf 8 UV to 16 UV.
-static const lvec8 kShuffleYUY2UV = {
- 1, 3, 1, 3, 5, 7, 5, 7, 9, 11, 9, 11, 13, 15, 13, 15,
- 1, 3, 1, 3, 5, 7, 5, 7, 9, 11, 9, 11, 13, 15, 13, 15
-};
-
-// UYVY shuf 16 Y to 32 Y.
-static const lvec8 kShuffleUYVYY = {
- 1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15,
- 1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15
-};
-
-// UYVY shuf 8 UV to 16 UV.
-static const lvec8 kShuffleUYVYUV = {
- 0, 2, 0, 2, 4, 6, 4, 6, 8, 10, 8, 10, 12, 14, 12, 14,
- 0, 2, 0, 2, 4, 6, 4, 6, 8, 10, 8, 10, 12, 14, 12, 14
-};
-
-// NV21 shuf 8 VU to 16 UV.
-static const lvec8 kShuffleNV21 = {
- 1, 0, 1, 0, 3, 2, 3, 2, 5, 4, 5, 4, 7, 6, 7, 6,
- 1, 0, 1, 0, 3, 2, 3, 2, 5, 4, 5, 4, 7, 6, 7, 6,
-};
-
// Duplicates gray value 3 times and fills in alpha opaque.
__declspec(naked)
-void J400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int width) {
+void J400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) {
__asm {
mov eax, [esp + 4] // src_y
mov edx, [esp + 8] // dst_argb
- mov ecx, [esp + 12] // width
+ mov ecx, [esp + 12] // pix
pcmpeqb xmm5, xmm5 // generate mask 0xff000000
pslld xmm5, 24
@@ -320,11 +352,11 @@
#ifdef HAS_J400TOARGBROW_AVX2
// Duplicates gray value 3 times and fills in alpha opaque.
__declspec(naked)
-void J400ToARGBRow_AVX2(const uint8* src_y, uint8* dst_argb, int width) {
+void J400ToARGBRow_AVX2(const uint8* src_y, uint8* dst_argb, int pix) {
__asm {
mov eax, [esp + 4] // src_y
mov edx, [esp + 8] // dst_argb
- mov ecx, [esp + 12] // width
+ mov ecx, [esp + 12] // pix
vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0xff000000
vpslld ymm5, ymm5, 24
@@ -350,14 +382,14 @@
#endif // HAS_J400TOARGBROW_AVX2
__declspec(naked)
-void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int width) {
+void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int pix) {
__asm {
mov eax, [esp + 4] // src_rgb24
mov edx, [esp + 8] // dst_argb
- mov ecx, [esp + 12] // width
+ mov ecx, [esp + 12] // pix
pcmpeqb xmm5, xmm5 // generate mask 0xff000000
pslld xmm5, 24
- movdqa xmm4, xmmword ptr kShuffleMaskRGB24ToARGB
+ movdqa xmm4, kShuffleMaskRGB24ToARGB
convertloop:
movdqu xmm0, [eax]
@@ -389,14 +421,14 @@
__declspec(naked)
void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb,
- int width) {
+ int pix) {
__asm {
mov eax, [esp + 4] // src_raw
mov edx, [esp + 8] // dst_argb
- mov ecx, [esp + 12] // width
+ mov ecx, [esp + 12] // pix
pcmpeqb xmm5, xmm5 // generate mask 0xff000000
pslld xmm5, 24
- movdqa xmm4, xmmword ptr kShuffleMaskRAWToARGB
+ movdqa xmm4, kShuffleMaskRAWToARGB
convertloop:
movdqu xmm0, [eax]
@@ -426,34 +458,6 @@
}
}
-__declspec(naked)
-void RAWToRGB24Row_SSSE3(const uint8* src_raw, uint8* dst_rgb24, int width) {
- __asm {
- mov eax, [esp + 4] // src_raw
- mov edx, [esp + 8] // dst_rgb24
- mov ecx, [esp + 12] // width
- movdqa xmm3, xmmword ptr kShuffleMaskRAWToRGB24_0
- movdqa xmm4, xmmword ptr kShuffleMaskRAWToRGB24_1
- movdqa xmm5, xmmword ptr kShuffleMaskRAWToRGB24_2
-
- convertloop:
- movdqu xmm0, [eax]
- movdqu xmm1, [eax + 4]
- movdqu xmm2, [eax + 8]
- lea eax, [eax + 24]
- pshufb xmm0, xmm3
- pshufb xmm1, xmm4
- pshufb xmm2, xmm5
- movq qword ptr [edx], xmm0
- movq qword ptr [edx + 8], xmm1
- movq qword ptr [edx + 16], xmm2
- lea edx, [edx + 24]
- sub ecx, 8
- jg convertloop
- ret
- }
-}
-
// pmul method to replicate bits.
// Math to replicate bits:
// (v << 8) | (v << 3)
@@ -463,7 +467,7 @@
// 20 instructions.
__declspec(naked)
void RGB565ToARGBRow_SSE2(const uint8* src_rgb565, uint8* dst_argb,
- int width) {
+ int pix) {
__asm {
mov eax, 0x01080108 // generate multiplier to repeat 5 bits
movd xmm5, eax
@@ -481,7 +485,7 @@
mov eax, [esp + 4] // src_rgb565
mov edx, [esp + 8] // dst_argb
- mov ecx, [esp + 12] // width
+ mov ecx, [esp + 12] // pix
sub edx, eax
sub edx, eax
@@ -519,13 +523,13 @@
// G shift of 5 is incorporated, so shift is 5 + 8 and 5 + 3
__declspec(naked)
void RGB565ToARGBRow_AVX2(const uint8* src_rgb565, uint8* dst_argb,
- int width) {
+ int pix) {
__asm {
mov eax, 0x01080108 // generate multiplier to repeat 5 bits
vmovd xmm5, eax
vbroadcastss ymm5, xmm5
mov eax, 0x20802080 // multiplier shift by 5 and then repeat 6 bits
- vmovd xmm6, eax
+ movd xmm6, eax
vbroadcastss ymm6, xmm6
vpcmpeqb ymm3, ymm3, ymm3 // generate mask 0xf800f800 for Red
vpsllw ymm3, ymm3, 11
@@ -537,7 +541,7 @@
mov eax, [esp + 4] // src_rgb565
mov edx, [esp + 8] // dst_argb
- mov ecx, [esp + 12] // width
+ mov ecx, [esp + 12] // pix
sub edx, eax
sub edx, eax
@@ -570,13 +574,13 @@
#ifdef HAS_ARGB1555TOARGBROW_AVX2
__declspec(naked)
void ARGB1555ToARGBRow_AVX2(const uint8* src_argb1555, uint8* dst_argb,
- int width) {
+ int pix) {
__asm {
mov eax, 0x01080108 // generate multiplier to repeat 5 bits
vmovd xmm5, eax
vbroadcastss ymm5, xmm5
mov eax, 0x42004200 // multiplier shift by 6 and then repeat 5 bits
- vmovd xmm6, eax
+ movd xmm6, eax
vbroadcastss ymm6, xmm6
vpcmpeqb ymm3, ymm3, ymm3 // generate mask 0xf800f800 for Red
vpsllw ymm3, ymm3, 11
@@ -586,7 +590,7 @@
mov eax, [esp + 4] // src_argb1555
mov edx, [esp + 8] // dst_argb
- mov ecx, [esp + 12] // width
+ mov ecx, [esp + 12] // pix
sub edx, eax
sub edx, eax
@@ -622,7 +626,7 @@
#ifdef HAS_ARGB4444TOARGBROW_AVX2
__declspec(naked)
void ARGB4444ToARGBRow_AVX2(const uint8* src_argb4444, uint8* dst_argb,
- int width) {
+ int pix) {
__asm {
mov eax, 0x0f0f0f0f // generate mask 0x0f0f0f0f
vmovd xmm4, eax
@@ -630,7 +634,7 @@
vpslld ymm5, ymm4, 4 // 0xf0f0f0f0 for high nibbles
mov eax, [esp + 4] // src_argb4444
mov edx, [esp + 8] // dst_argb
- mov ecx, [esp + 12] // width
+ mov ecx, [esp + 12] // pix
sub edx, eax
sub edx, eax
@@ -660,7 +664,7 @@
// 24 instructions
__declspec(naked)
void ARGB1555ToARGBRow_SSE2(const uint8* src_argb1555, uint8* dst_argb,
- int width) {
+ int pix) {
__asm {
mov eax, 0x01080108 // generate multiplier to repeat 5 bits
movd xmm5, eax
@@ -677,7 +681,7 @@
mov eax, [esp + 4] // src_argb1555
mov edx, [esp + 8] // dst_argb
- mov ecx, [esp + 12] // width
+ mov ecx, [esp + 12] // pix
sub edx, eax
sub edx, eax
@@ -713,7 +717,7 @@
// 18 instructions.
__declspec(naked)
void ARGB4444ToARGBRow_SSE2(const uint8* src_argb4444, uint8* dst_argb,
- int width) {
+ int pix) {
__asm {
mov eax, 0x0f0f0f0f // generate mask 0x0f0f0f0f
movd xmm4, eax
@@ -722,7 +726,7 @@
pslld xmm5, 4
mov eax, [esp + 4] // src_argb4444
mov edx, [esp + 8] // dst_argb
- mov ecx, [esp + 12] // width
+ mov ecx, [esp + 12] // pix
sub edx, eax
sub edx, eax
@@ -750,12 +754,12 @@
}
__declspec(naked)
-void ARGBToRGB24Row_SSSE3(const uint8* src_argb, uint8* dst_rgb, int width) {
+void ARGBToRGB24Row_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix) {
__asm {
mov eax, [esp + 4] // src_argb
mov edx, [esp + 8] // dst_rgb
- mov ecx, [esp + 12] // width
- movdqa xmm6, xmmword ptr kShuffleMaskARGBToRGB24
+ mov ecx, [esp + 12] // pix
+ movdqa xmm6, kShuffleMaskARGBToRGB24
convertloop:
movdqu xmm0, [eax] // fetch 16 pixels of argb
@@ -788,12 +792,12 @@
}
__declspec(naked)
-void ARGBToRAWRow_SSSE3(const uint8* src_argb, uint8* dst_rgb, int width) {
+void ARGBToRAWRow_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix) {
__asm {
mov eax, [esp + 4] // src_argb
mov edx, [esp + 8] // dst_rgb
- mov ecx, [esp + 12] // width
- movdqa xmm6, xmmword ptr kShuffleMaskARGBToRAW
+ mov ecx, [esp + 12] // pix
+ movdqa xmm6, kShuffleMaskARGBToRAW
convertloop:
movdqu xmm0, [eax] // fetch 16 pixels of argb
@@ -825,12 +829,13 @@
}
}
+// 4 pixels
__declspec(naked)
-void ARGBToRGB565Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int width) {
+void ARGBToRGB565Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) {
__asm {
mov eax, [esp + 4] // src_argb
mov edx, [esp + 8] // dst_rgb
- mov ecx, [esp + 12] // width
+ mov ecx, [esp + 12] // pix
pcmpeqb xmm3, xmm3 // generate mask 0x0000001f
psrld xmm3, 27
pcmpeqb xmm4, xmm4 // generate mask 0x000007e0
@@ -862,15 +867,16 @@
}
}
+// 8 pixels
__declspec(naked)
void ARGBToRGB565DitherRow_SSE2(const uint8* src_argb, uint8* dst_rgb,
- const uint32 dither4, int width) {
+ const uint32 dither4, int pix) {
__asm {
mov eax, [esp + 4] // src_argb
mov edx, [esp + 8] // dst_rgb
movd xmm6, [esp + 12] // dither4
- mov ecx, [esp + 16] // width
+ mov ecx, [esp + 16] // pix
punpcklbw xmm6, xmm6 // make dither 16 bytes
movdqa xmm7, xmm6
punpcklwd xmm6, xmm6
@@ -910,12 +916,12 @@
#ifdef HAS_ARGBTORGB565DITHERROW_AVX2
__declspec(naked)
void ARGBToRGB565DitherRow_AVX2(const uint8* src_argb, uint8* dst_rgb,
- const uint32 dither4, int width) {
+ const uint32 dither4, int pix) {
__asm {
mov eax, [esp + 4] // src_argb
mov edx, [esp + 8] // dst_rgb
vbroadcastss xmm6, [esp + 12] // dither4
- mov ecx, [esp + 16] // width
+ mov ecx, [esp + 16] // pix
vpunpcklbw xmm6, xmm6, xmm6 // make dither 32 bytes
vpermq ymm6, ymm6, 0xd8
vpunpcklwd ymm6, ymm6, ymm6
@@ -952,11 +958,11 @@
// TODO(fbarchard): Improve sign extension/packing.
__declspec(naked)
-void ARGBToARGB1555Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int width) {
+void ARGBToARGB1555Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) {
__asm {
mov eax, [esp + 4] // src_argb
mov edx, [esp + 8] // dst_rgb
- mov ecx, [esp + 12] // width
+ mov ecx, [esp + 12] // pix
pcmpeqb xmm4, xmm4 // generate mask 0x0000001f
psrld xmm4, 27
movdqa xmm5, xmm4 // generate mask 0x000003e0
@@ -993,11 +999,11 @@
}
__declspec(naked)
-void ARGBToARGB4444Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int width) {
+void ARGBToARGB4444Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) {
__asm {
mov eax, [esp + 4] // src_argb
mov edx, [esp + 8] // dst_rgb
- mov ecx, [esp + 12] // width
+ mov ecx, [esp + 12] // pix
pcmpeqb xmm4, xmm4 // generate mask 0xf000f000
psllw xmm4, 12
movdqa xmm3, xmm4 // generate mask 0x00f000f0
@@ -1023,11 +1029,11 @@
#ifdef HAS_ARGBTORGB565ROW_AVX2
__declspec(naked)
-void ARGBToRGB565Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int width) {
+void ARGBToRGB565Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int pix) {
__asm {
mov eax, [esp + 4] // src_argb
mov edx, [esp + 8] // dst_rgb
- mov ecx, [esp + 12] // width
+ mov ecx, [esp + 12] // pix
vpcmpeqb ymm3, ymm3, ymm3 // generate mask 0x0000001f
vpsrld ymm3, ymm3, 27
vpcmpeqb ymm4, ymm4, ymm4 // generate mask 0x000007e0
@@ -1060,11 +1066,11 @@
#ifdef HAS_ARGBTOARGB1555ROW_AVX2
__declspec(naked)
-void ARGBToARGB1555Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int width) {
+void ARGBToARGB1555Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int pix) {
__asm {
mov eax, [esp + 4] // src_argb
mov edx, [esp + 8] // dst_rgb
- mov ecx, [esp + 12] // width
+ mov ecx, [esp + 12] // pix
vpcmpeqb ymm4, ymm4, ymm4
vpsrld ymm4, ymm4, 27 // generate mask 0x0000001f
vpslld ymm5, ymm4, 5 // generate mask 0x000003e0
@@ -1100,11 +1106,11 @@
#ifdef HAS_ARGBTOARGB4444ROW_AVX2
__declspec(naked)
-void ARGBToARGB4444Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int width) {
+void ARGBToARGB4444Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int pix) {
__asm {
mov eax, [esp + 4] // src_argb
mov edx, [esp + 8] // dst_rgb
- mov ecx, [esp + 12] // width
+ mov ecx, [esp + 12] // pix
vpcmpeqb ymm4, ymm4, ymm4 // generate mask 0xf000f000
vpsllw ymm4, ymm4, 12
vpsrlw ymm3, ymm4, 8 // generate mask 0x00f000f0
@@ -1131,13 +1137,13 @@
// Convert 16 ARGB pixels (64 bytes) to 16 Y values.
__declspec(naked)
-void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int width) {
+void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
__asm {
mov eax, [esp + 4] /* src_argb */
mov edx, [esp + 8] /* dst_y */
- mov ecx, [esp + 12] /* width */
- movdqa xmm4, xmmword ptr kARGBToY
- movdqa xmm5, xmmword ptr kAddY16
+ mov ecx, [esp + 12] /* pix */
+ movdqa xmm4, kARGBToY
+ movdqa xmm5, kAddY16
convertloop:
movdqu xmm0, [eax]
@@ -1166,13 +1172,13 @@
// Convert 16 ARGB pixels (64 bytes) to 16 YJ values.
// Same as ARGBToYRow but different coefficients, no add 16, but do rounding.
__declspec(naked)
-void ARGBToYJRow_SSSE3(const uint8* src_argb, uint8* dst_y, int width) {
+void ARGBToYJRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
__asm {
mov eax, [esp + 4] /* src_argb */
mov edx, [esp + 8] /* dst_y */
- mov ecx, [esp + 12] /* width */
- movdqa xmm4, xmmword ptr kARGBToYJ
- movdqa xmm5, xmmword ptr kAddYJ64
+ mov ecx, [esp + 12] /* pix */
+ movdqa xmm4, kARGBToYJ
+ movdqa xmm5, kAddYJ64
convertloop:
movdqu xmm0, [eax]
@@ -1207,14 +1213,14 @@
// Convert 32 ARGB pixels (128 bytes) to 32 Y values.
__declspec(naked)
-void ARGBToYRow_AVX2(const uint8* src_argb, uint8* dst_y, int width) {
+void ARGBToYRow_AVX2(const uint8* src_argb, uint8* dst_y, int pix) {
__asm {
mov eax, [esp + 4] /* src_argb */
mov edx, [esp + 8] /* dst_y */
- mov ecx, [esp + 12] /* width */
- vbroadcastf128 ymm4, xmmword ptr kARGBToY
- vbroadcastf128 ymm5, xmmword ptr kAddY16
- vmovdqu ymm6, ymmword ptr kPermdARGBToY_AVX
+ mov ecx, [esp + 12] /* pix */
+ vbroadcastf128 ymm4, kARGBToY
+ vbroadcastf128 ymm5, kAddY16
+ vmovdqu ymm6, kPermdARGBToY_AVX
convertloop:
vmovdqu ymm0, [eax]
@@ -1246,14 +1252,14 @@
#ifdef HAS_ARGBTOYJROW_AVX2
// Convert 32 ARGB pixels (128 bytes) to 32 Y values.
__declspec(naked)
-void ARGBToYJRow_AVX2(const uint8* src_argb, uint8* dst_y, int width) {
+void ARGBToYJRow_AVX2(const uint8* src_argb, uint8* dst_y, int pix) {
__asm {
mov eax, [esp + 4] /* src_argb */
mov edx, [esp + 8] /* dst_y */
- mov ecx, [esp + 12] /* width */
- vbroadcastf128 ymm4, xmmword ptr kARGBToYJ
- vbroadcastf128 ymm5, xmmword ptr kAddYJ64
- vmovdqu ymm6, ymmword ptr kPermdARGBToY_AVX
+ mov ecx, [esp + 12] /* pix */
+ vbroadcastf128 ymm4, kARGBToYJ
+ vbroadcastf128 ymm5, kAddYJ64
+ vmovdqu ymm6, kPermdARGBToY_AVX
convertloop:
vmovdqu ymm0, [eax]
@@ -1285,13 +1291,13 @@
#endif // HAS_ARGBTOYJROW_AVX2
__declspec(naked)
-void BGRAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int width) {
+void BGRAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
__asm {
mov eax, [esp + 4] /* src_argb */
mov edx, [esp + 8] /* dst_y */
- mov ecx, [esp + 12] /* width */
- movdqa xmm4, xmmword ptr kBGRAToY
- movdqa xmm5, xmmword ptr kAddY16
+ mov ecx, [esp + 12] /* pix */
+ movdqa xmm4, kBGRAToY
+ movdqa xmm5, kAddY16
convertloop:
movdqu xmm0, [eax]
@@ -1318,13 +1324,13 @@
}
__declspec(naked)
-void ABGRToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int width) {
+void ABGRToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
__asm {
mov eax, [esp + 4] /* src_argb */
mov edx, [esp + 8] /* dst_y */
- mov ecx, [esp + 12] /* width */
- movdqa xmm4, xmmword ptr kABGRToY
- movdqa xmm5, xmmword ptr kAddY16
+ mov ecx, [esp + 12] /* pix */
+ movdqa xmm4, kABGRToY
+ movdqa xmm5, kAddY16
convertloop:
movdqu xmm0, [eax]
@@ -1351,13 +1357,13 @@
}
__declspec(naked)
-void RGBAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int width) {
+void RGBAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
__asm {
mov eax, [esp + 4] /* src_argb */
mov edx, [esp + 8] /* dst_y */
- mov ecx, [esp + 12] /* width */
- movdqa xmm4, xmmword ptr kRGBAToY
- movdqa xmm5, xmmword ptr kAddY16
+ mov ecx, [esp + 12] /* pix */
+ movdqa xmm4, kRGBAToY
+ movdqa xmm5, kAddY16
convertloop:
movdqu xmm0, [eax]
@@ -1393,10 +1399,10 @@
mov esi, [esp + 8 + 8] // src_stride_argb
mov edx, [esp + 8 + 12] // dst_u
mov edi, [esp + 8 + 16] // dst_v
- mov ecx, [esp + 8 + 20] // width
- movdqa xmm5, xmmword ptr kAddUV128
- movdqa xmm6, xmmword ptr kARGBToV
- movdqa xmm7, xmmword ptr kARGBToU
+ mov ecx, [esp + 8 + 20] // pix
+ movdqa xmm5, kAddUV128
+ movdqa xmm6, kARGBToV
+ movdqa xmm7, kARGBToU
sub edi, edx // stride from u to v
convertloop:
@@ -1463,10 +1469,10 @@
mov esi, [esp + 8 + 8] // src_stride_argb
mov edx, [esp + 8 + 12] // dst_u
mov edi, [esp + 8 + 16] // dst_v
- mov ecx, [esp + 8 + 20] // width
- movdqa xmm5, xmmword ptr kAddUVJ128
- movdqa xmm6, xmmword ptr kARGBToVJ
- movdqa xmm7, xmmword ptr kARGBToUJ
+ mov ecx, [esp + 8 + 20] // pix
+ movdqa xmm5, kAddUVJ128
+ movdqa xmm6, kARGBToVJ
+ movdqa xmm7, kARGBToUJ
sub edi, edx // stride from u to v
convertloop:
@@ -1505,7 +1511,7 @@
pmaddubsw xmm3, xmm6
phaddw xmm0, xmm2
phaddw xmm1, xmm3
- paddw xmm0, xmm5 // +.5 rounding -> unsigned
+ paddw xmm0, xmm5 // +.5 rounding -> unsigned
paddw xmm1, xmm5
psraw xmm0, 8
psraw xmm1, 8
@@ -1535,10 +1541,10 @@
mov esi, [esp + 8 + 8] // src_stride_argb
mov edx, [esp + 8 + 12] // dst_u
mov edi, [esp + 8 + 16] // dst_v
- mov ecx, [esp + 8 + 20] // width
- vbroadcastf128 ymm5, xmmword ptr kAddUV128
- vbroadcastf128 ymm6, xmmword ptr kARGBToV
- vbroadcastf128 ymm7, xmmword ptr kARGBToU
+ mov ecx, [esp + 8 + 20] // pix
+ vbroadcastf128 ymm5, kAddUV128
+ vbroadcastf128 ymm6, kARGBToV
+ vbroadcastf128 ymm7, kARGBToU
sub edi, edx // stride from u to v
convertloop:
@@ -1572,7 +1578,7 @@
vpsraw ymm0, ymm0, 8
vpacksswb ymm0, ymm1, ymm0 // mutates
vpermq ymm0, ymm0, 0xd8 // For vpacksswb
- vpshufb ymm0, ymm0, ymmword ptr kShufARGBToUV_AVX // for vshufps/vphaddw
+ vpshufb ymm0, ymm0, kShufARGBToUV_AVX // For vshufps + vphaddw
vpaddb ymm0, ymm0, ymm5 // -> unsigned
// step 3 - store 16 U and 16 V values
@@ -1590,74 +1596,7 @@
}
#endif // HAS_ARGBTOUVROW_AVX2
-#ifdef HAS_ARGBTOUVJROW_AVX2
__declspec(naked)
-void ARGBToUVJRow_AVX2(const uint8* src_argb0, int src_stride_argb,
- uint8* dst_u, uint8* dst_v, int width) {
- __asm {
- push esi
- push edi
- mov eax, [esp + 8 + 4] // src_argb
- mov esi, [esp + 8 + 8] // src_stride_argb
- mov edx, [esp + 8 + 12] // dst_u
- mov edi, [esp + 8 + 16] // dst_v
- mov ecx, [esp + 8 + 20] // width
- vbroadcastf128 ymm5, xmmword ptr kAddUV128
- vbroadcastf128 ymm6, xmmword ptr kARGBToV
- vbroadcastf128 ymm7, xmmword ptr kARGBToU
- sub edi, edx // stride from u to v
-
- convertloop:
- /* step 1 - subsample 32x2 argb pixels to 16x1 */
- vmovdqu ymm0, [eax]
- vmovdqu ymm1, [eax + 32]
- vmovdqu ymm2, [eax + 64]
- vmovdqu ymm3, [eax + 96]
- vpavgb ymm0, ymm0, [eax + esi]
- vpavgb ymm1, ymm1, [eax + esi + 32]
- vpavgb ymm2, ymm2, [eax + esi + 64]
- vpavgb ymm3, ymm3, [eax + esi + 96]
- lea eax, [eax + 128]
- vshufps ymm4, ymm0, ymm1, 0x88
- vshufps ymm0, ymm0, ymm1, 0xdd
- vpavgb ymm0, ymm0, ymm4 // mutated by vshufps
- vshufps ymm4, ymm2, ymm3, 0x88
- vshufps ymm2, ymm2, ymm3, 0xdd
- vpavgb ymm2, ymm2, ymm4 // mutated by vshufps
-
- // step 2 - convert to U and V
- // from here down is very similar to Y code except
- // instead of 32 different pixels, its 16 pixels of U and 16 of V
- vpmaddubsw ymm1, ymm0, ymm7 // U
- vpmaddubsw ymm3, ymm2, ymm7
- vpmaddubsw ymm0, ymm0, ymm6 // V
- vpmaddubsw ymm2, ymm2, ymm6
- vphaddw ymm1, ymm1, ymm3 // mutates
- vphaddw ymm0, ymm0, ymm2
- vpaddw ymm1, ymm1, ymm5 // +.5 rounding -> unsigned
- vpaddw ymm0, ymm0, ymm5
- vpsraw ymm1, ymm1, 8
- vpsraw ymm0, ymm0, 8
- vpacksswb ymm0, ymm1, ymm0 // mutates
- vpermq ymm0, ymm0, 0xd8 // For vpacksswb
- vpshufb ymm0, ymm0, ymmword ptr kShufARGBToUV_AVX // for vshufps/vphaddw
-
- // step 3 - store 16 U and 16 V values
- vextractf128 [edx], ymm0, 0 // U
- vextractf128 [edx + edi], ymm0, 1 // V
- lea edx, [edx + 16]
- sub ecx, 32
- jg convertloop
-
- pop edi
- pop esi
- vzeroupper
- ret
- }
-}
-#endif // HAS_ARGBTOUVJROW_AVX2
-
-__declspec(naked)
void ARGBToUV444Row_SSSE3(const uint8* src_argb0,
uint8* dst_u, uint8* dst_v, int width) {
__asm {
@@ -1665,10 +1604,10 @@
mov eax, [esp + 4 + 4] // src_argb
mov edx, [esp + 4 + 8] // dst_u
mov edi, [esp + 4 + 12] // dst_v
- mov ecx, [esp + 4 + 16] // width
- movdqa xmm5, xmmword ptr kAddUV128
- movdqa xmm6, xmmword ptr kARGBToV
- movdqa xmm7, xmmword ptr kARGBToU
+ mov ecx, [esp + 4 + 16] // pix
+ movdqa xmm5, kAddUV128
+ movdqa xmm6, kARGBToV
+ movdqa xmm7, kARGBToU
sub edi, edx // stride from u to v
convertloop:
@@ -1715,6 +1654,64 @@
}
__declspec(naked)
+void ARGBToUV422Row_SSSE3(const uint8* src_argb0,
+ uint8* dst_u, uint8* dst_v, int width) {
+ __asm {
+ push edi
+ mov eax, [esp + 4 + 4] // src_argb
+ mov edx, [esp + 4 + 8] // dst_u
+ mov edi, [esp + 4 + 12] // dst_v
+ mov ecx, [esp + 4 + 16] // pix
+ movdqa xmm5, kAddUV128
+ movdqa xmm6, kARGBToV
+ movdqa xmm7, kARGBToU
+ sub edi, edx // stride from u to v
+
+ convertloop:
+ /* step 1 - subsample 16x2 argb pixels to 8x1 */
+ movdqu xmm0, [eax]
+ movdqu xmm1, [eax + 16]
+ movdqu xmm2, [eax + 32]
+ movdqu xmm3, [eax + 48]
+ lea eax, [eax + 64]
+ movdqa xmm4, xmm0
+ shufps xmm0, xmm1, 0x88
+ shufps xmm4, xmm1, 0xdd
+ pavgb xmm0, xmm4
+ movdqa xmm4, xmm2
+ shufps xmm2, xmm3, 0x88
+ shufps xmm4, xmm3, 0xdd
+ pavgb xmm2, xmm4
+
+ // step 2 - convert to U and V
+ // from here down is very similar to Y code except
+ // instead of 16 different pixels, its 8 pixels of U and 8 of V
+ movdqa xmm1, xmm0
+ movdqa xmm3, xmm2
+ pmaddubsw xmm0, xmm7 // U
+ pmaddubsw xmm2, xmm7
+ pmaddubsw xmm1, xmm6 // V
+ pmaddubsw xmm3, xmm6
+ phaddw xmm0, xmm2
+ phaddw xmm1, xmm3
+ psraw xmm0, 8
+ psraw xmm1, 8
+ packsswb xmm0, xmm1
+ paddb xmm0, xmm5 // -> unsigned
+
+ // step 3 - store 8 U and 8 V values
+ movlps qword ptr [edx], xmm0 // U
+ movhps qword ptr [edx + edi], xmm0 // V
+ lea edx, [edx + 8]
+ sub ecx, 16
+ jg convertloop
+
+ pop edi
+ ret
+ }
+}
+
+__declspec(naked)
void BGRAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
uint8* dst_u, uint8* dst_v, int width) {
__asm {
@@ -1724,10 +1721,10 @@
mov esi, [esp + 8 + 8] // src_stride_argb
mov edx, [esp + 8 + 12] // dst_u
mov edi, [esp + 8 + 16] // dst_v
- mov ecx, [esp + 8 + 20] // width
- movdqa xmm5, xmmword ptr kAddUV128
- movdqa xmm6, xmmword ptr kBGRAToV
- movdqa xmm7, xmmword ptr kBGRAToU
+ mov ecx, [esp + 8 + 20] // pix
+ movdqa xmm5, kAddUV128
+ movdqa xmm6, kBGRAToV
+ movdqa xmm7, kBGRAToU
sub edi, edx // stride from u to v
convertloop:
@@ -1794,10 +1791,10 @@
mov esi, [esp + 8 + 8] // src_stride_argb
mov edx, [esp + 8 + 12] // dst_u
mov edi, [esp + 8 + 16] // dst_v
- mov ecx, [esp + 8 + 20] // width
- movdqa xmm5, xmmword ptr kAddUV128
- movdqa xmm6, xmmword ptr kABGRToV
- movdqa xmm7, xmmword ptr kABGRToU
+ mov ecx, [esp + 8 + 20] // pix
+ movdqa xmm5, kAddUV128
+ movdqa xmm6, kABGRToV
+ movdqa xmm7, kABGRToU
sub edi, edx // stride from u to v
convertloop:
@@ -1864,10 +1861,10 @@
mov esi, [esp + 8 + 8] // src_stride_argb
mov edx, [esp + 8 + 12] // dst_u
mov edi, [esp + 8 + 16] // dst_v
- mov ecx, [esp + 8 + 20] // width
- movdqa xmm5, xmmword ptr kAddUV128
- movdqa xmm6, xmmword ptr kRGBAToV
- movdqa xmm7, xmmword ptr kRGBAToU
+ mov ecx, [esp + 8 + 20] // pix
+ movdqa xmm5, kAddUV128
+ movdqa xmm6, kRGBAToV
+ movdqa xmm7, kRGBAToU
sub edi, edx // stride from u to v
convertloop:
@@ -1927,62 +1924,33 @@
// Read 16 UV from 444
#define READYUV444_AVX2 __asm { \
- __asm vmovdqu xmm0, [esi] /* U */ \
- __asm vmovdqu xmm1, [esi + edi] /* V */ \
+ __asm vmovdqu xmm0, [esi] /* U */ /* NOLINT */ \
+ __asm vmovdqu xmm1, [esi + edi] /* V */ /* NOLINT */ \
__asm lea esi, [esi + 16] \
__asm vpermq ymm0, ymm0, 0xd8 \
__asm vpermq ymm1, ymm1, 0xd8 \
__asm vpunpcklbw ymm0, ymm0, ymm1 /* UV */ \
- __asm vmovdqu xmm4, [eax] /* Y */ \
- __asm vpermq ymm4, ymm4, 0xd8 \
- __asm vpunpcklbw ymm4, ymm4, ymm4 \
- __asm lea eax, [eax + 16] \
}
// Read 8 UV from 422, upsample to 16 UV.
#define READYUV422_AVX2 __asm { \
- __asm vmovq xmm0, qword ptr [esi] /* U */ \
- __asm vmovq xmm1, qword ptr [esi + edi] /* V */ \
+ __asm vmovq xmm0, qword ptr [esi] /* U */ /* NOLINT */ \
+ __asm vmovq xmm1, qword ptr [esi + edi] /* V */ /* NOLINT */ \
__asm lea esi, [esi + 8] \
__asm vpunpcklbw ymm0, ymm0, ymm1 /* UV */ \
__asm vpermq ymm0, ymm0, 0xd8 \
__asm vpunpcklwd ymm0, ymm0, ymm0 /* UVUV (upsample) */ \
- __asm vmovdqu xmm4, [eax] /* Y */ \
- __asm vpermq ymm4, ymm4, 0xd8 \
- __asm vpunpcklbw ymm4, ymm4, ymm4 \
- __asm lea eax, [eax + 16] \
}
-// Read 8 UV from 422, upsample to 16 UV. With 16 Alpha.
-#define READYUVA422_AVX2 __asm { \
- __asm vmovq xmm0, qword ptr [esi] /* U */ \
- __asm vmovq xmm1, qword ptr [esi + edi] /* V */ \
- __asm lea esi, [esi + 8] \
- __asm vpunpcklbw ymm0, ymm0, ymm1 /* UV */ \
- __asm vpermq ymm0, ymm0, 0xd8 \
- __asm vpunpcklwd ymm0, ymm0, ymm0 /* UVUV (upsample) */ \
- __asm vmovdqu xmm4, [eax] /* Y */ \
- __asm vpermq ymm4, ymm4, 0xd8 \
- __asm vpunpcklbw ymm4, ymm4, ymm4 \
- __asm lea eax, [eax + 16] \
- __asm vmovdqu xmm5, [ebp] /* A */ \
- __asm vpermq ymm5, ymm5, 0xd8 \
- __asm lea ebp, [ebp + 16] \
- }
-
// Read 4 UV from 411, upsample to 16 UV.
#define READYUV411_AVX2 __asm { \
- __asm vmovd xmm0, dword ptr [esi] /* U */ \
- __asm vmovd xmm1, dword ptr [esi + edi] /* V */ \
+ __asm vmovd xmm0, dword ptr [esi] /* U */ /* NOLINT */ \
+ __asm vmovd xmm1, dword ptr [esi + edi] /* V */ /* NOLINT */ \
__asm lea esi, [esi + 4] \
__asm vpunpcklbw ymm0, ymm0, ymm1 /* UV */ \
__asm vpunpcklwd ymm0, ymm0, ymm0 /* UVUV (upsample) */ \
__asm vpermq ymm0, ymm0, 0xd8 \
__asm vpunpckldq ymm0, ymm0, ymm0 /* UVUVUVUV (upsample) */ \
- __asm vmovdqu xmm4, [eax] /* Y */ \
- __asm vpermq ymm4, ymm4, 0xd8 \
- __asm vpunpcklbw ymm4, ymm4, ymm4 \
- __asm lea eax, [eax + 16] \
}
// Read 8 UV from NV12, upsample to 16 UV.
@@ -1991,58 +1959,29 @@
__asm lea esi, [esi + 16] \
__asm vpermq ymm0, ymm0, 0xd8 \
__asm vpunpcklwd ymm0, ymm0, ymm0 /* UVUV (upsample) */ \
- __asm vmovdqu xmm4, [eax] /* Y */ \
- __asm vpermq ymm4, ymm4, 0xd8 \
- __asm vpunpcklbw ymm4, ymm4, ymm4 \
- __asm lea eax, [eax + 16] \
}
-// Read 8 UV from NV21, upsample to 16 UV.
-#define READNV21_AVX2 __asm { \
- __asm vmovdqu xmm0, [esi] /* UV */ \
- __asm lea esi, [esi + 16] \
- __asm vpermq ymm0, ymm0, 0xd8 \
- __asm vpshufb ymm0, ymm0, ymmword ptr kShuffleNV21 \
- __asm vmovdqu xmm4, [eax] /* Y */ \
- __asm vpermq ymm4, ymm4, 0xd8 \
- __asm vpunpcklbw ymm4, ymm4, ymm4 \
- __asm lea eax, [eax + 16] \
- }
-
-// Read 8 YUY2 with 16 Y and upsample 8 UV to 16 UV.
-#define READYUY2_AVX2 __asm { \
- __asm vmovdqu ymm4, [eax] /* YUY2 */ \
- __asm vpshufb ymm4, ymm4, ymmword ptr kShuffleYUY2Y \
- __asm vmovdqu ymm0, [eax] /* UV */ \
- __asm vpshufb ymm0, ymm0, ymmword ptr kShuffleYUY2UV \
- __asm lea eax, [eax + 32] \
- }
-
-// Read 8 UYVY with 16 Y and upsample 8 UV to 16 UV.
-#define READUYVY_AVX2 __asm { \
- __asm vmovdqu ymm4, [eax] /* UYVY */ \
- __asm vpshufb ymm4, ymm4, ymmword ptr kShuffleUYVYY \
- __asm vmovdqu ymm0, [eax] /* UV */ \
- __asm vpshufb ymm0, ymm0, ymmword ptr kShuffleUYVYUV \
- __asm lea eax, [eax + 32] \
- }
-
// Convert 16 pixels: 16 UV and 16 Y.
#define YUVTORGB_AVX2(YuvConstants) __asm { \
- __asm vpmaddubsw ymm2, ymm0, ymmword ptr [YuvConstants + KUVTOR] /* R UV */\
- __asm vpmaddubsw ymm1, ymm0, ymmword ptr [YuvConstants + KUVTOG] /* G UV */\
- __asm vpmaddubsw ymm0, ymm0, ymmword ptr [YuvConstants + KUVTOB] /* B UV */\
- __asm vmovdqu ymm3, ymmword ptr [YuvConstants + KUVBIASR] \
+ /* Step 1: Find 8 UV contributions to 16 R,G,B values */ \
+ __asm vpmaddubsw ymm2, ymm0, YuvConstants.kUVToR /* scale R UV */ \
+ __asm vpmaddubsw ymm1, ymm0, YuvConstants.kUVToG /* scale G UV */ \
+ __asm vpmaddubsw ymm0, ymm0, YuvConstants.kUVToB /* scale B UV */ \
+ __asm vmovdqu ymm3, YuvConstants.kUVBiasR \
__asm vpsubw ymm2, ymm3, ymm2 \
- __asm vmovdqu ymm3, ymmword ptr [YuvConstants + KUVBIASG] \
+ __asm vmovdqu ymm3, YuvConstants.kUVBiasG \
__asm vpsubw ymm1, ymm3, ymm1 \
- __asm vmovdqu ymm3, ymmword ptr [YuvConstants + KUVBIASB] \
+ __asm vmovdqu ymm3, YuvConstants.kUVBiasB \
__asm vpsubw ymm0, ymm3, ymm0 \
/* Step 2: Find Y contribution to 16 R,G,B values */ \
- __asm vpmulhuw ymm4, ymm4, ymmword ptr [YuvConstants + KYTORGB] \
- __asm vpaddsw ymm0, ymm0, ymm4 /* B += Y */ \
- __asm vpaddsw ymm1, ymm1, ymm4 /* G += Y */ \
- __asm vpaddsw ymm2, ymm2, ymm4 /* R += Y */ \
+ __asm vmovdqu xmm3, [eax] /* NOLINT */ \
+ __asm lea eax, [eax + 16] \
+ __asm vpermq ymm3, ymm3, 0xd8 \
+ __asm vpunpcklbw ymm3, ymm3, ymm3 \
+ __asm vpmulhuw ymm3, ymm3, YuvConstants.kYToRgb \
+ __asm vpaddsw ymm0, ymm0, ymm3 /* B += Y */ \
+ __asm vpaddsw ymm1, ymm1, ymm3 /* G += Y */ \
+ __asm vpaddsw ymm2, ymm2, ymm3 /* R += Y */ \
__asm vpsraw ymm0, ymm0, 6 \
__asm vpsraw ymm1, ymm1, 6 \
__asm vpsraw ymm2, ymm2, 6 \
@@ -2053,6 +1992,7 @@
// Store 16 ARGB values.
#define STOREARGB_AVX2 __asm { \
+ /* Step 3: Weave into ARGB */ \
__asm vpunpcklbw ymm0, ymm0, ymm1 /* BG */ \
__asm vpermq ymm0, ymm0, 0xd8 \
__asm vpunpcklbw ymm2, ymm2, ymm5 /* RA */ \
@@ -2064,19 +2004,6 @@
__asm lea edx, [edx + 64] \
}
-// Store 16 RGBA values.
-#define STORERGBA_AVX2 __asm { \
- __asm vpunpcklbw ymm1, ymm1, ymm2 /* GR */ \
- __asm vpermq ymm1, ymm1, 0xd8 \
- __asm vpunpcklbw ymm2, ymm5, ymm0 /* AB */ \
- __asm vpermq ymm2, ymm2, 0xd8 \
- __asm vpunpcklwd ymm0, ymm2, ymm1 /* ABGR first 8 pixels */ \
- __asm vpunpckhwd ymm1, ymm2, ymm1 /* ABGR next 8 pixels */ \
- __asm vmovdqu [edx], ymm0 \
- __asm vmovdqu [edx + 32], ymm1 \
- __asm lea edx, [edx + 64] \
- }
-
#ifdef HAS_I422TOARGBROW_AVX2
// 16 pixels
// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
@@ -2085,30 +2012,26 @@
const uint8* u_buf,
const uint8* v_buf,
uint8* dst_argb,
- const struct YuvConstants* yuvconstants,
int width) {
__asm {
push esi
push edi
- push ebx
- mov eax, [esp + 12 + 4] // Y
- mov esi, [esp + 12 + 8] // U
- mov edi, [esp + 12 + 12] // V
- mov edx, [esp + 12 + 16] // argb
- mov ebx, [esp + 12 + 20] // yuvconstants
- mov ecx, [esp + 12 + 24] // width
+ mov eax, [esp + 8 + 4] // Y
+ mov esi, [esp + 8 + 8] // U
+ mov edi, [esp + 8 + 12] // V
+ mov edx, [esp + 8 + 16] // argb
+ mov ecx, [esp + 8 + 20] // width
sub edi, esi
vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha
convertloop:
READYUV422_AVX2
- YUVTORGB_AVX2(ebx)
+ YUVTORGB_AVX2(kYuvConstants)
STOREARGB_AVX2
sub ecx, 16
jg convertloop
- pop ebx
pop edi
pop esi
vzeroupper
@@ -2117,41 +2040,34 @@
}
#endif // HAS_I422TOARGBROW_AVX2
-#ifdef HAS_I422ALPHATOARGBROW_AVX2
+#ifdef HAS_J422TOARGBROW_AVX2
// 16 pixels
-// 8 UV values upsampled to 16 UV, mixed with 16 Y and 16 A producing 16 ARGB.
+// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
__declspec(naked)
-void I422AlphaToARGBRow_AVX2(const uint8* y_buf,
- const uint8* u_buf,
- const uint8* v_buf,
- const uint8* a_buf,
- uint8* dst_argb,
- const struct YuvConstants* yuvconstants,
- int width) {
+void J422ToARGBRow_AVX2(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* dst_argb,
+ int width) {
__asm {
push esi
push edi
- push ebx
- push ebp
- mov eax, [esp + 16 + 4] // Y
- mov esi, [esp + 16 + 8] // U
- mov edi, [esp + 16 + 12] // V
- mov ebp, [esp + 16 + 16] // A
- mov edx, [esp + 16 + 20] // argb
- mov ebx, [esp + 16 + 24] // yuvconstants
- mov ecx, [esp + 16 + 28] // width
+ mov eax, [esp + 8 + 4] // Y
+ mov esi, [esp + 8 + 8] // U
+ mov edi, [esp + 8 + 12] // V
+ mov edx, [esp + 8 + 16] // argb
+ mov ecx, [esp + 8 + 20] // width
sub edi, esi
+ vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha
convertloop:
- READYUVA422_AVX2
- YUVTORGB_AVX2(ebx)
+ READYUV422_AVX2
+ YUVTORGB_AVX2(kYuvJConstants)
STOREARGB_AVX2
sub ecx, 16
jg convertloop
- pop ebp
- pop ebx
pop edi
pop esi
vzeroupper
@@ -2158,7 +2074,7 @@
ret
}
}
-#endif // HAS_I422ALPHATOARGBROW_AVX2
+#endif // HAS_J422TOARGBROW_AVX2
#ifdef HAS_I444TOARGBROW_AVX2
// 16 pixels
@@ -2168,29 +2084,26 @@
const uint8* u_buf,
const uint8* v_buf,
uint8* dst_argb,
- const struct YuvConstants* yuvconstants,
int width) {
__asm {
push esi
push edi
- push ebx
- mov eax, [esp + 12 + 4] // Y
- mov esi, [esp + 12 + 8] // U
- mov edi, [esp + 12 + 12] // V
- mov edx, [esp + 12 + 16] // argb
- mov ebx, [esp + 12 + 20] // yuvconstants
- mov ecx, [esp + 12 + 24] // width
+ mov eax, [esp + 8 + 4] // Y
+ mov esi, [esp + 8 + 8] // U
+ mov edi, [esp + 8 + 12] // V
+ mov edx, [esp + 8 + 16] // argb
+ mov ecx, [esp + 8 + 20] // width
sub edi, esi
vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha
+
convertloop:
READYUV444_AVX2
- YUVTORGB_AVX2(ebx)
+ YUVTORGB_AVX2(kYuvConstants)
STOREARGB_AVX2
sub ecx, 16
jg convertloop
- pop ebx
pop edi
pop esi
vzeroupper
@@ -2207,30 +2120,26 @@
const uint8* u_buf,
const uint8* v_buf,
uint8* dst_argb,
- const struct YuvConstants* yuvconstants,
int width) {
__asm {
push esi
push edi
- push ebx
- mov eax, [esp + 12 + 4] // Y
- mov esi, [esp + 12 + 8] // U
- mov edi, [esp + 12 + 12] // V
- mov edx, [esp + 12 + 16] // abgr
- mov ebx, [esp + 12 + 20] // yuvconstants
- mov ecx, [esp + 12 + 24] // width
+ mov eax, [esp + 8 + 4] // Y
+ mov esi, [esp + 8 + 8] // U
+ mov edi, [esp + 8 + 12] // V
+ mov edx, [esp + 8 + 16] // argb
+ mov ecx, [esp + 8 + 20] // width
sub edi, esi
vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha
convertloop:
READYUV411_AVX2
- YUVTORGB_AVX2(ebx)
+ YUVTORGB_AVX2(kYuvConstants)
STOREARGB_AVX2
sub ecx, 16
jg convertloop
- pop ebx
pop edi
pop esi
vzeroupper
@@ -2246,27 +2155,23 @@
void NV12ToARGBRow_AVX2(const uint8* y_buf,
const uint8* uv_buf,
uint8* dst_argb,
- const struct YuvConstants* yuvconstants,
int width) {
__asm {
push esi
- push ebx
- mov eax, [esp + 8 + 4] // Y
- mov esi, [esp + 8 + 8] // UV
- mov edx, [esp + 8 + 12] // argb
- mov ebx, [esp + 8 + 16] // yuvconstants
- mov ecx, [esp + 8 + 20] // width
+ mov eax, [esp + 4 + 4] // Y
+ mov esi, [esp + 4 + 8] // UV
+ mov edx, [esp + 4 + 12] // argb
+ mov ecx, [esp + 4 + 16] // width
vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha
convertloop:
READNV12_AVX2
- YUVTORGB_AVX2(ebx)
+ YUVTORGB_AVX2(kYuvConstants)
STOREARGB_AVX2
sub ecx, 16
jg convertloop
- pop ebx
pop esi
vzeroupper
ret
@@ -2276,32 +2181,28 @@
#ifdef HAS_NV21TOARGBROW_AVX2
// 16 pixels.
-// 8 VU values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
+// 8 VU values upsampled to 16 VU, mixed with 16 Y producing 16 ARGB (64 bytes).
__declspec(naked)
void NV21ToARGBRow_AVX2(const uint8* y_buf,
- const uint8* vu_buf,
+ const uint8* uv_buf,
uint8* dst_argb,
- const struct YuvConstants* yuvconstants,
int width) {
__asm {
push esi
- push ebx
- mov eax, [esp + 8 + 4] // Y
- mov esi, [esp + 8 + 8] // VU
- mov edx, [esp + 8 + 12] // argb
- mov ebx, [esp + 8 + 16] // yuvconstants
- mov ecx, [esp + 8 + 20] // width
+ mov eax, [esp + 4 + 4] // Y
+ mov esi, [esp + 4 + 8] // UV
+ mov edx, [esp + 4 + 12] // argb
+ mov ecx, [esp + 4 + 16] // width
vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha
convertloop:
- READNV21_AVX2
- YUVTORGB_AVX2(ebx)
+ READNV12_AVX2
+ YUVTORGB_AVX2(kYvuConstants)
STOREARGB_AVX2
sub ecx, 16
jg convertloop
- pop ebx
pop esi
vzeroupper
ret
@@ -2309,100 +2210,136 @@
}
#endif // HAS_NV21TOARGBROW_AVX2
-#ifdef HAS_YUY2TOARGBROW_AVX2
-// 16 pixels.
-// 8 YUY2 values with 16 Y and 8 UV producing 16 ARGB (64 bytes).
+#ifdef HAS_I422TOBGRAROW_AVX2
+// 16 pixels
+// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 BGRA (64 bytes).
+// TODO(fbarchard): Use macros to reduce duplicate code. See SSSE3.
__declspec(naked)
-void YUY2ToARGBRow_AVX2(const uint8* src_yuy2,
+void I422ToBGRARow_AVX2(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
uint8* dst_argb,
- const struct YuvConstants* yuvconstants,
int width) {
__asm {
- push ebx
- mov eax, [esp + 4 + 4] // yuy2
- mov edx, [esp + 4 + 8] // argb
- mov ebx, [esp + 4 + 12] // yuvconstants
- mov ecx, [esp + 4 + 16] // width
+ push esi
+ push edi
+ mov eax, [esp + 8 + 4] // Y
+ mov esi, [esp + 8 + 8] // U
+ mov edi, [esp + 8 + 12] // V
+ mov edx, [esp + 8 + 16] // argb
+ mov ecx, [esp + 8 + 20] // width
+ sub edi, esi
vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha
convertloop:
- READYUY2_AVX2
- YUVTORGB_AVX2(ebx)
- STOREARGB_AVX2
+ READYUV422_AVX2
+ YUVTORGB_AVX2(kYuvConstants)
+ // Step 3: Weave into BGRA
+ vpunpcklbw ymm1, ymm1, ymm0 // GB
+ vpermq ymm1, ymm1, 0xd8
+ vpunpcklbw ymm2, ymm5, ymm2 // AR
+ vpermq ymm2, ymm2, 0xd8
+ vpunpcklwd ymm0, ymm2, ymm1 // ARGB first 8 pixels
+ vpunpckhwd ymm2, ymm2, ymm1 // ARGB next 8 pixels
+ vmovdqu [edx], ymm0
+ vmovdqu [edx + 32], ymm2
+ lea edx, [edx + 64]
sub ecx, 16
jg convertloop
- pop ebx
+ pop edi
+ pop esi
vzeroupper
ret
}
}
-#endif // HAS_YUY2TOARGBROW_AVX2
+#endif // HAS_I422TOBGRAROW_AVX2
-#ifdef HAS_UYVYTOARGBROW_AVX2
-// 16 pixels.
-// 8 UYVY values with 16 Y and 8 UV producing 16 ARGB (64 bytes).
+#ifdef HAS_I422TORGBAROW_AVX2
+// 16 pixels
+// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 RGBA (64 bytes).
+// TODO(fbarchard): Use macros to reduce duplicate code. See SSSE3.
__declspec(naked)
-void UYVYToARGBRow_AVX2(const uint8* src_uyvy,
+void I422ToRGBARow_AVX2(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
uint8* dst_argb,
- const struct YuvConstants* yuvconstants,
int width) {
__asm {
- push ebx
- mov eax, [esp + 4 + 4] // uyvy
- mov edx, [esp + 4 + 8] // argb
- mov ebx, [esp + 4 + 12] // yuvconstants
- mov ecx, [esp + 4 + 16] // width
+ push esi
+ push edi
+ mov eax, [esp + 8 + 4] // Y
+ mov esi, [esp + 8 + 8] // U
+ mov edi, [esp + 8 + 12] // V
+ mov edx, [esp + 8 + 16] // argb
+ mov ecx, [esp + 8 + 20] // width
+ sub edi, esi
vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha
convertloop:
- READUYVY_AVX2
- YUVTORGB_AVX2(ebx)
- STOREARGB_AVX2
+ READYUV422_AVX2
+ YUVTORGB_AVX2(kYuvConstants)
+ // Step 3: Weave into RGBA
+ vpunpcklbw ymm1, ymm1, ymm2 // GR
+ vpermq ymm1, ymm1, 0xd8
+ vpunpcklbw ymm2, ymm5, ymm0 // AB
+ vpermq ymm2, ymm2, 0xd8
+ vpunpcklwd ymm0, ymm2, ymm1 // ABGR first 8 pixels
+ vpunpckhwd ymm1, ymm2, ymm1 // ABGR next 8 pixels
+ vmovdqu [edx], ymm0
+ vmovdqu [edx + 32], ymm1
+ lea edx, [edx + 64]
sub ecx, 16
jg convertloop
- pop ebx
+ pop edi
+ pop esi
vzeroupper
ret
}
}
-#endif // HAS_UYVYTOARGBROW_AVX2
+#endif // HAS_I422TORGBAROW_AVX2
-#ifdef HAS_I422TORGBAROW_AVX2
+#ifdef HAS_I422TOABGRROW_AVX2
// 16 pixels
-// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 RGBA (64 bytes).
+// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ABGR (64 bytes).
+// TODO(fbarchard): Use macros to reduce duplicate code. See SSSE3.
__declspec(naked)
-void I422ToRGBARow_AVX2(const uint8* y_buf,
+void I422ToABGRRow_AVX2(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* dst_argb,
- const struct YuvConstants* yuvconstants,
int width) {
__asm {
push esi
push edi
- push ebx
- mov eax, [esp + 12 + 4] // Y
- mov esi, [esp + 12 + 8] // U
- mov edi, [esp + 12 + 12] // V
- mov edx, [esp + 12 + 16] // abgr
- mov ebx, [esp + 12 + 20] // yuvconstants
- mov ecx, [esp + 12 + 24] // width
+ mov eax, [esp + 8 + 4] // Y
+ mov esi, [esp + 8 + 8] // U
+ mov edi, [esp + 8 + 12] // V
+ mov edx, [esp + 8 + 16] // argb
+ mov ecx, [esp + 8 + 20] // width
sub edi, esi
vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha
convertloop:
READYUV422_AVX2
- YUVTORGB_AVX2(ebx)
- STORERGBA_AVX2
+ YUVTORGB_AVX2(kYuvConstants)
+ // Step 3: Weave into ABGR
+ vpunpcklbw ymm1, ymm2, ymm1 // RG
+ vpermq ymm1, ymm1, 0xd8
+ vpunpcklbw ymm2, ymm0, ymm5 // BA
+ vpermq ymm2, ymm2, 0xd8
+ vpunpcklwd ymm0, ymm1, ymm2 // RGBA first 8 pixels
+ vpunpckhwd ymm1, ymm1, ymm2 // RGBA next 8 pixels
+ vmovdqu [edx], ymm0
+ vmovdqu [edx + 32], ymm1
+ lea edx, [edx + 64]
sub ecx, 16
jg convertloop
- pop ebx
pop edi
pop esi
vzeroupper
@@ -2409,21 +2346,17 @@
ret
}
}
-#endif // HAS_I422TORGBAROW_AVX2
+#endif // HAS_I422TOABGRROW_AVX2
#if defined(HAS_I422TOARGBROW_SSSE3)
// TODO(fbarchard): Read that does half size on Y and treats 420 as 444.
-// Allows a conversion with half size scaling.
// Read 8 UV from 444.
#define READYUV444 __asm { \
- __asm movq xmm0, qword ptr [esi] /* U */ \
- __asm movq xmm1, qword ptr [esi + edi] /* V */ \
+ __asm movq xmm0, qword ptr [esi] /* U */ /* NOLINT */ \
+ __asm movq xmm1, qword ptr [esi + edi] /* V */ /* NOLINT */ \
__asm lea esi, [esi + 8] \
__asm punpcklbw xmm0, xmm1 /* UV */ \
- __asm movq xmm4, qword ptr [eax] \
- __asm punpcklbw xmm4, xmm4 \
- __asm lea eax, [eax + 8] \
}
// Read 4 UV from 422, upsample to 8 UV.
@@ -2433,99 +2366,50 @@
__asm lea esi, [esi + 4] \
__asm punpcklbw xmm0, xmm1 /* UV */ \
__asm punpcklwd xmm0, xmm0 /* UVUV (upsample) */ \
- __asm movq xmm4, qword ptr [eax] \
- __asm punpcklbw xmm4, xmm4 \
- __asm lea eax, [eax + 8] \
}
-// Read 4 UV from 422, upsample to 8 UV. With 8 Alpha.
-#define READYUVA422 __asm { \
- __asm movd xmm0, [esi] /* U */ \
- __asm movd xmm1, [esi + edi] /* V */ \
- __asm lea esi, [esi + 4] \
- __asm punpcklbw xmm0, xmm1 /* UV */ \
- __asm punpcklwd xmm0, xmm0 /* UVUV (upsample) */ \
- __asm movq xmm4, qword ptr [eax] /* Y */ \
- __asm punpcklbw xmm4, xmm4 \
- __asm lea eax, [eax + 8] \
- __asm movq xmm5, qword ptr [ebp] /* A */ \
- __asm lea ebp, [ebp + 8] \
- }
-
// Read 2 UV from 411, upsample to 8 UV.
-// drmemory fails with memory fault if pinsrw used. libyuv bug: 525
-// __asm pinsrw xmm0, [esi], 0 /* U */
-// __asm pinsrw xmm1, [esi + edi], 0 /* V */
-#define READYUV411_EBX __asm { \
- __asm movzx ebx, word ptr [esi] /* U */ \
+#define READYUV411 __asm { \
+ __asm movzx ebx, word ptr [esi] /* U */ /* NOLINT */ \
__asm movd xmm0, ebx \
- __asm movzx ebx, word ptr [esi + edi] /* V */ \
+ __asm movzx ebx, word ptr [esi + edi] /* V */ /* NOLINT */ \
__asm movd xmm1, ebx \
__asm lea esi, [esi + 2] \
- __asm punpcklbw xmm0, xmm1 /* UV */ \
- __asm punpcklwd xmm0, xmm0 /* UVUV (upsample) */ \
- __asm punpckldq xmm0, xmm0 /* UVUVUVUV (upsample) */ \
- __asm movq xmm4, qword ptr [eax] \
- __asm punpcklbw xmm4, xmm4 \
- __asm lea eax, [eax + 8] \
+ __asm punpcklbw xmm0, xmm1 /* UV */ \
+ __asm punpcklwd xmm0, xmm0 /* UVUV (upsample) */ \
+ __asm punpckldq xmm0, xmm0 /* UVUVUVUV (upsample) */ \
}
// Read 4 UV from NV12, upsample to 8 UV.
#define READNV12 __asm { \
- __asm movq xmm0, qword ptr [esi] /* UV */ \
+ __asm movq xmm0, qword ptr [esi] /* UV */ /* NOLINT */ \
__asm lea esi, [esi + 8] \
__asm punpcklwd xmm0, xmm0 /* UVUV (upsample) */ \
- __asm movq xmm4, qword ptr [eax] \
- __asm punpcklbw xmm4, xmm4 \
- __asm lea eax, [eax + 8] \
}
-// Read 4 VU from NV21, upsample to 8 UV.
-#define READNV21 __asm { \
- __asm movq xmm0, qword ptr [esi] /* UV */ \
- __asm lea esi, [esi + 8] \
- __asm pshufb xmm0, xmmword ptr kShuffleNV21 \
- __asm movq xmm4, qword ptr [eax] \
- __asm punpcklbw xmm4, xmm4 \
- __asm lea eax, [eax + 8] \
- }
-
-// Read 4 YUY2 with 8 Y and upsample 4 UV to 8 UV.
-#define READYUY2 __asm { \
- __asm movdqu xmm4, [eax] /* YUY2 */ \
- __asm pshufb xmm4, xmmword ptr kShuffleYUY2Y \
- __asm movdqu xmm0, [eax] /* UV */ \
- __asm pshufb xmm0, xmmword ptr kShuffleYUY2UV \
- __asm lea eax, [eax + 16] \
- }
-
-// Read 4 UYVY with 8 Y and upsample 4 UV to 8 UV.
-#define READUYVY __asm { \
- __asm movdqu xmm4, [eax] /* UYVY */ \
- __asm pshufb xmm4, xmmword ptr kShuffleUYVYY \
- __asm movdqu xmm0, [eax] /* UV */ \
- __asm pshufb xmm0, xmmword ptr kShuffleUYVYUV \
- __asm lea eax, [eax + 16] \
- }
-
// Convert 8 pixels: 8 UV and 8 Y.
#define YUVTORGB(YuvConstants) __asm { \
+ /* Step 1: Find 4 UV contributions to 8 R,G,B values */ \
__asm movdqa xmm1, xmm0 \
__asm movdqa xmm2, xmm0 \
__asm movdqa xmm3, xmm0 \
- __asm movdqa xmm0, xmmword ptr [YuvConstants + KUVBIASB] \
- __asm pmaddubsw xmm1, xmmword ptr [YuvConstants + KUVTOB] \
+ __asm movdqa xmm0, YuvConstants.kUVBiasB /* unbias back to signed */ \
+ __asm pmaddubsw xmm1, YuvConstants.kUVToB /* scale B UV */ \
__asm psubw xmm0, xmm1 \
- __asm movdqa xmm1, xmmword ptr [YuvConstants + KUVBIASG] \
- __asm pmaddubsw xmm2, xmmword ptr [YuvConstants + KUVTOG] \
+ __asm movdqa xmm1, YuvConstants.kUVBiasG \
+ __asm pmaddubsw xmm2, YuvConstants.kUVToG /* scale G UV */ \
__asm psubw xmm1, xmm2 \
- __asm movdqa xmm2, xmmword ptr [YuvConstants + KUVBIASR] \
- __asm pmaddubsw xmm3, xmmword ptr [YuvConstants + KUVTOR] \
+ __asm movdqa xmm2, YuvConstants.kUVBiasR \
+ __asm pmaddubsw xmm3, YuvConstants.kUVToR /* scale R UV */ \
__asm psubw xmm2, xmm3 \
- __asm pmulhuw xmm4, xmmword ptr [YuvConstants + KYTORGB] \
- __asm paddsw xmm0, xmm4 /* B += Y */ \
- __asm paddsw xmm1, xmm4 /* G += Y */ \
- __asm paddsw xmm2, xmm4 /* R += Y */ \
+ /* Step 2: Find Y contribution to 8 R,G,B values */ \
+ __asm movq xmm3, qword ptr [eax] /* NOLINT */ \
+ __asm lea eax, [eax + 8] \
+ __asm punpcklbw xmm3, xmm3 \
+ __asm pmulhuw xmm3, YuvConstants.kYToRgb \
+ __asm paddsw xmm0, xmm3 /* B += Y */ \
+ __asm paddsw xmm1, xmm3 /* G += Y */ \
+ __asm paddsw xmm2, xmm3 /* R += Y */ \
__asm psraw xmm0, 6 \
__asm psraw xmm1, 6 \
__asm psraw xmm2, 6 \
@@ -2536,6 +2420,7 @@
// Store 8 ARGB values.
#define STOREARGB __asm { \
+ /* Step 3: Weave into ARGB */ \
__asm punpcklbw xmm0, xmm1 /* BG */ \
__asm punpcklbw xmm2, xmm5 /* RA */ \
__asm movdqa xmm1, xmm0 \
@@ -2548,6 +2433,7 @@
// Store 8 BGRA values.
#define STOREBGRA __asm { \
+ /* Step 3: Weave into BGRA */ \
__asm pcmpeqb xmm5, xmm5 /* generate 0xffffffff for alpha */ \
__asm punpcklbw xmm1, xmm0 /* GB */ \
__asm punpcklbw xmm5, xmm2 /* AR */ \
@@ -2559,8 +2445,22 @@
__asm lea edx, [edx + 32] \
}
+// Store 8 ABGR values.
+#define STOREABGR __asm { \
+ /* Step 3: Weave into ABGR */ \
+ __asm punpcklbw xmm2, xmm1 /* RG */ \
+ __asm punpcklbw xmm0, xmm5 /* BA */ \
+ __asm movdqa xmm1, xmm2 \
+ __asm punpcklwd xmm2, xmm0 /* RGBA first 4 pixels */ \
+ __asm punpckhwd xmm1, xmm0 /* RGBA next 4 pixels */ \
+ __asm movdqu 0[edx], xmm2 \
+ __asm movdqu 16[edx], xmm1 \
+ __asm lea edx, [edx + 32] \
+ }
+
// Store 8 RGBA values.
#define STORERGBA __asm { \
+ /* Step 3: Weave into RGBA */ \
__asm pcmpeqb xmm5, xmm5 /* generate 0xffffffff for alpha */ \
__asm punpcklbw xmm1, xmm2 /* GR */ \
__asm punpcklbw xmm5, xmm0 /* AB */ \
@@ -2574,13 +2474,13 @@
// Store 8 RGB24 values.
#define STORERGB24 __asm { \
- /* Weave into RRGB */ \
+ /* Step 3: Weave into RRGB */ \
__asm punpcklbw xmm0, xmm1 /* BG */ \
__asm punpcklbw xmm2, xmm2 /* RR */ \
__asm movdqa xmm1, xmm0 \
__asm punpcklwd xmm0, xmm2 /* BGRR first 4 pixels */ \
__asm punpckhwd xmm1, xmm2 /* BGRR next 4 pixels */ \
- /* RRGB -> RGB24 */ \
+ /* Step 4: RRGB -> RGB24 */ \
__asm pshufb xmm0, xmm5 /* Pack first 8 and last 4 bytes. */ \
__asm pshufb xmm1, xmm6 /* Pack first 12 bytes. */ \
__asm palignr xmm1, xmm0, 12 /* last 4 bytes of xmm0 + 12 xmm1 */ \
@@ -2589,15 +2489,32 @@
__asm lea edx, [edx + 24] \
}
+// Store 8 RAW values.
+#define STORERAW __asm { \
+ /* Step 3: Weave into RRGB */ \
+ __asm punpcklbw xmm0, xmm1 /* BG */ \
+ __asm punpcklbw xmm2, xmm2 /* RR */ \
+ __asm movdqa xmm1, xmm0 \
+ __asm punpcklwd xmm0, xmm2 /* BGRR first 4 pixels */ \
+ __asm punpckhwd xmm1, xmm2 /* BGRR next 4 pixels */ \
+ /* Step 4: RRGB -> RAW */ \
+ __asm pshufb xmm0, xmm5 /* Pack first 8 and last 4 bytes. */ \
+ __asm pshufb xmm1, xmm6 /* Pack first 12 bytes. */ \
+ __asm palignr xmm1, xmm0, 12 /* last 4 bytes of xmm0 + 12 xmm1 */ \
+ __asm movq qword ptr 0[edx], xmm0 /* First 8 bytes */ \
+ __asm movdqu 8[edx], xmm1 /* Last 16 bytes */ \
+ __asm lea edx, [edx + 24] \
+ }
+
// Store 8 RGB565 values.
#define STORERGB565 __asm { \
- /* Weave into RRGB */ \
+ /* Step 3: Weave into RRGB */ \
__asm punpcklbw xmm0, xmm1 /* BG */ \
__asm punpcklbw xmm2, xmm2 /* RR */ \
__asm movdqa xmm1, xmm0 \
__asm punpcklwd xmm0, xmm2 /* BGRR first 4 pixels */ \
__asm punpckhwd xmm1, xmm2 /* BGRR next 4 pixels */ \
- /* RRGB -> RGB565 */ \
+ /* Step 4: RRGB -> RGB565 */ \
__asm movdqa xmm3, xmm0 /* B first 4 pixels of argb */ \
__asm movdqa xmm2, xmm0 /* G */ \
__asm pslld xmm0, 8 /* R */ \
@@ -2632,30 +2549,26 @@
const uint8* u_buf,
const uint8* v_buf,
uint8* dst_argb,
- const struct YuvConstants* yuvconstants,
int width) {
__asm {
push esi
push edi
- push ebx
- mov eax, [esp + 12 + 4] // Y
- mov esi, [esp + 12 + 8] // U
- mov edi, [esp + 12 + 12] // V
- mov edx, [esp + 12 + 16] // argb
- mov ebx, [esp + 12 + 20] // yuvconstants
- mov ecx, [esp + 12 + 24] // width
+ mov eax, [esp + 8 + 4] // Y
+ mov esi, [esp + 8 + 8] // U
+ mov edi, [esp + 8 + 12] // V
+ mov edx, [esp + 8 + 16] // argb
+ mov ecx, [esp + 8 + 20] // width
sub edi, esi
- pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
+ pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
convertloop:
READYUV444
- YUVTORGB(ebx)
+ YUVTORGB(kYuvConstants)
STOREARGB
sub ecx, 8
jg convertloop
- pop ebx
pop edi
pop esi
ret
@@ -2669,31 +2582,27 @@
const uint8* u_buf,
const uint8* v_buf,
uint8* dst_rgb24,
- const struct YuvConstants* yuvconstants,
int width) {
__asm {
push esi
push edi
- push ebx
- mov eax, [esp + 12 + 4] // Y
- mov esi, [esp + 12 + 8] // U
- mov edi, [esp + 12 + 12] // V
- mov edx, [esp + 12 + 16] // argb
- mov ebx, [esp + 12 + 20] // yuvconstants
- mov ecx, [esp + 12 + 24] // width
+ mov eax, [esp + 8 + 4] // Y
+ mov esi, [esp + 8 + 8] // U
+ mov edi, [esp + 8 + 12] // V
+ mov edx, [esp + 8 + 16] // rgb24
+ mov ecx, [esp + 8 + 20] // width
sub edi, esi
- movdqa xmm5, xmmword ptr kShuffleMaskARGBToRGB24_0
- movdqa xmm6, xmmword ptr kShuffleMaskARGBToRGB24
+ movdqa xmm5, kShuffleMaskARGBToRGB24_0
+ movdqa xmm6, kShuffleMaskARGBToRGB24
convertloop:
READYUV422
- YUVTORGB(ebx)
+ YUVTORGB(kYuvConstants)
STORERGB24
sub ecx, 8
jg convertloop
- pop ebx
pop edi
pop esi
ret
@@ -2700,6 +2609,40 @@
}
}
+// 8 pixels.
+// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 RAW (24 bytes).
+__declspec(naked)
+void I422ToRAWRow_SSSE3(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* dst_raw,
+ int width) {
+ __asm {
+ push esi
+ push edi
+ mov eax, [esp + 8 + 4] // Y
+ mov esi, [esp + 8 + 8] // U
+ mov edi, [esp + 8 + 12] // V
+ mov edx, [esp + 8 + 16] // raw
+ mov ecx, [esp + 8 + 20] // width
+ sub edi, esi
+ movdqa xmm5, kShuffleMaskARGBToRAW_0
+ movdqa xmm6, kShuffleMaskARGBToRAW
+
+ convertloop:
+ READYUV422
+ YUVTORGB(kYuvConstants)
+ STORERAW
+
+ sub ecx, 8
+ jg convertloop
+
+ pop edi
+ pop esi
+ ret
+ }
+}
+
// 8 pixels
// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 RGB565 (16 bytes).
__declspec(naked)
@@ -2707,18 +2650,15 @@
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb565_buf,
- const struct YuvConstants* yuvconstants,
int width) {
__asm {
push esi
push edi
- push ebx
- mov eax, [esp + 12 + 4] // Y
- mov esi, [esp + 12 + 8] // U
- mov edi, [esp + 12 + 12] // V
- mov edx, [esp + 12 + 16] // argb
- mov ebx, [esp + 12 + 20] // yuvconstants
- mov ecx, [esp + 12 + 24] // width
+ mov eax, [esp + 8 + 4] // Y
+ mov esi, [esp + 8 + 8] // U
+ mov edi, [esp + 8 + 12] // V
+ mov edx, [esp + 8 + 16] // rgb565
+ mov ecx, [esp + 8 + 20] // width
sub edi, esi
pcmpeqb xmm5, xmm5 // generate mask 0x0000001f
psrld xmm5, 27
@@ -2730,13 +2670,12 @@
convertloop:
READYUV422
- YUVTORGB(ebx)
+ YUVTORGB(kYuvConstants)
STORERGB565
sub ecx, 8
jg convertloop
- pop ebx
pop edi
pop esi
ret
@@ -2750,30 +2689,26 @@
const uint8* u_buf,
const uint8* v_buf,
uint8* dst_argb,
- const struct YuvConstants* yuvconstants,
int width) {
__asm {
push esi
push edi
- push ebx
- mov eax, [esp + 12 + 4] // Y
- mov esi, [esp + 12 + 8] // U
- mov edi, [esp + 12 + 12] // V
- mov edx, [esp + 12 + 16] // argb
- mov ebx, [esp + 12 + 20] // yuvconstants
- mov ecx, [esp + 12 + 24] // width
+ mov eax, [esp + 8 + 4] // Y
+ mov esi, [esp + 8 + 8] // U
+ mov edi, [esp + 8 + 12] // V
+ mov edx, [esp + 8 + 16] // argb
+ mov ecx, [esp + 8 + 20] // width
sub edi, esi
pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
convertloop:
READYUV422
- YUVTORGB(ebx)
+ YUVTORGB(kYuvConstants)
STOREARGB
sub ecx, 8
jg convertloop
- pop ebx
pop edi
pop esi
ret
@@ -2781,39 +2716,33 @@
}
// 8 pixels.
-// 4 UV values upsampled to 8 UV, mixed with 8 Y and 8 A producing 8 ARGB.
+// JPeg color space version of I422ToARGB
+// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
__declspec(naked)
-void I422AlphaToARGBRow_SSSE3(const uint8* y_buf,
- const uint8* u_buf,
- const uint8* v_buf,
- const uint8* a_buf,
- uint8* dst_argb,
- const struct YuvConstants* yuvconstants,
- int width) {
+void J422ToARGBRow_SSSE3(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* dst_argb,
+ int width) {
__asm {
push esi
push edi
- push ebx
- push ebp
- mov eax, [esp + 16 + 4] // Y
- mov esi, [esp + 16 + 8] // U
- mov edi, [esp + 16 + 12] // V
- mov ebp, [esp + 16 + 16] // A
- mov edx, [esp + 16 + 20] // argb
- mov ebx, [esp + 16 + 24] // yuvconstants
- mov ecx, [esp + 16 + 28] // width
+ mov eax, [esp + 8 + 4] // Y
+ mov esi, [esp + 8 + 8] // U
+ mov edi, [esp + 8 + 12] // V
+ mov edx, [esp + 8 + 16] // argb
+ mov ecx, [esp + 8 + 20] // width
sub edi, esi
+ pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
convertloop:
- READYUVA422
- YUVTORGB(ebx)
+ READYUV422
+ YUVTORGB(kYuvJConstants)
STOREARGB
sub ecx, 8
jg convertloop
- pop ebp
- pop ebx
pop edi
pop esi
ret
@@ -2828,34 +2757,30 @@
const uint8* u_buf,
const uint8* v_buf,
uint8* dst_argb,
- const struct YuvConstants* yuvconstants,
int width) {
__asm {
+ push ebx
push esi
push edi
- push ebx
- push ebp
- mov eax, [esp + 16 + 4] // Y
- mov esi, [esp + 16 + 8] // U
- mov edi, [esp + 16 + 12] // V
- mov edx, [esp + 16 + 16] // abgr
- mov ebp, [esp + 16 + 20] // yuvconstants
- mov ecx, [esp + 16 + 24] // width
+ mov eax, [esp + 12 + 4] // Y
+ mov esi, [esp + 12 + 8] // U
+ mov edi, [esp + 12 + 12] // V
+ mov edx, [esp + 12 + 16] // argb
+ mov ecx, [esp + 12 + 20] // width
sub edi, esi
pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
convertloop:
- READYUV411_EBX
- YUVTORGB(ebp)
+ READYUV411 // modifies EBX
+ YUVTORGB(kYuvConstants)
STOREARGB
sub ecx, 8
jg convertloop
- pop ebp
- pop ebx
pop edi
pop esi
+ pop ebx
ret
}
}
@@ -2866,27 +2791,23 @@
void NV12ToARGBRow_SSSE3(const uint8* y_buf,
const uint8* uv_buf,
uint8* dst_argb,
- const struct YuvConstants* yuvconstants,
int width) {
__asm {
push esi
- push ebx
- mov eax, [esp + 8 + 4] // Y
- mov esi, [esp + 8 + 8] // UV
- mov edx, [esp + 8 + 12] // argb
- mov ebx, [esp + 8 + 16] // yuvconstants
- mov ecx, [esp + 8 + 20] // width
+ mov eax, [esp + 4 + 4] // Y
+ mov esi, [esp + 4 + 8] // UV
+ mov edx, [esp + 4 + 12] // argb
+ mov ecx, [esp + 4 + 16] // width
pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
convertloop:
READNV12
- YUVTORGB(ebx)
+ YUVTORGB(kYuvConstants)
STOREARGB
sub ecx, 8
jg convertloop
- pop ebx
pop esi
ret
}
@@ -2893,89 +2814,90 @@
}
// 8 pixels.
-// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
+// 4 VU values upsampled to 8 VU, mixed with 8 Y producing 8 ARGB (32 bytes).
__declspec(naked)
void NV21ToARGBRow_SSSE3(const uint8* y_buf,
- const uint8* vu_buf,
+ const uint8* uv_buf,
uint8* dst_argb,
- const struct YuvConstants* yuvconstants,
int width) {
__asm {
push esi
- push ebx
- mov eax, [esp + 8 + 4] // Y
- mov esi, [esp + 8 + 8] // VU
- mov edx, [esp + 8 + 12] // argb
- mov ebx, [esp + 8 + 16] // yuvconstants
- mov ecx, [esp + 8 + 20] // width
+ mov eax, [esp + 4 + 4] // Y
+ mov esi, [esp + 4 + 8] // UV
+ mov edx, [esp + 4 + 12] // argb
+ mov ecx, [esp + 4 + 16] // width
pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
convertloop:
- READNV21
- YUVTORGB(ebx)
+ READNV12
+ YUVTORGB(kYvuConstants)
STOREARGB
sub ecx, 8
jg convertloop
- pop ebx
pop esi
ret
}
}
-// 8 pixels.
-// 4 YUY2 values with 8 Y and 4 UV producing 8 ARGB (32 bytes).
__declspec(naked)
-void YUY2ToARGBRow_SSSE3(const uint8* src_yuy2,
- uint8* dst_argb,
- const struct YuvConstants* yuvconstants,
+void I422ToBGRARow_SSSE3(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* dst_bgra,
int width) {
__asm {
- push ebx
- mov eax, [esp + 4 + 4] // yuy2
- mov edx, [esp + 4 + 8] // argb
- mov ebx, [esp + 4 + 12] // yuvconstants
- mov ecx, [esp + 4 + 16] // width
- pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
+ push esi
+ push edi
+ mov eax, [esp + 8 + 4] // Y
+ mov esi, [esp + 8 + 8] // U
+ mov edi, [esp + 8 + 12] // V
+ mov edx, [esp + 8 + 16] // bgra
+ mov ecx, [esp + 8 + 20] // width
+ sub edi, esi
convertloop:
- READYUY2
- YUVTORGB(ebx)
- STOREARGB
+ READYUV422
+ YUVTORGB(kYuvConstants)
+ STOREBGRA
sub ecx, 8
jg convertloop
- pop ebx
+ pop edi
+ pop esi
ret
}
}
-// 8 pixels.
-// 4 UYVY values with 8 Y and 4 UV producing 8 ARGB (32 bytes).
__declspec(naked)
-void UYVYToARGBRow_SSSE3(const uint8* src_uyvy,
- uint8* dst_argb,
- const struct YuvConstants* yuvconstants,
+void I422ToABGRRow_SSSE3(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* dst_abgr,
int width) {
__asm {
- push ebx
- mov eax, [esp + 4 + 4] // uyvy
- mov edx, [esp + 4 + 8] // argb
- mov ebx, [esp + 4 + 12] // yuvconstants
- mov ecx, [esp + 4 + 16] // width
+ push esi
+ push edi
+ mov eax, [esp + 8 + 4] // Y
+ mov esi, [esp + 8 + 8] // U
+ mov edi, [esp + 8 + 12] // V
+ mov edx, [esp + 8 + 16] // abgr
+ mov ecx, [esp + 8 + 20] // width
+ sub edi, esi
pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
convertloop:
- READUYVY
- YUVTORGB(ebx)
- STOREARGB
+ READYUV422
+ YUVTORGB(kYuvConstants)
+ STOREABGR
sub ecx, 8
jg convertloop
- pop ebx
+ pop edi
+ pop esi
ret
}
}
@@ -2985,34 +2907,31 @@
const uint8* u_buf,
const uint8* v_buf,
uint8* dst_rgba,
- const struct YuvConstants* yuvconstants,
int width) {
__asm {
push esi
push edi
- push ebx
- mov eax, [esp + 12 + 4] // Y
- mov esi, [esp + 12 + 8] // U
- mov edi, [esp + 12 + 12] // V
- mov edx, [esp + 12 + 16] // argb
- mov ebx, [esp + 12 + 20] // yuvconstants
- mov ecx, [esp + 12 + 24] // width
+ mov eax, [esp + 8 + 4] // Y
+ mov esi, [esp + 8 + 8] // U
+ mov edi, [esp + 8 + 12] // V
+ mov edx, [esp + 8 + 16] // rgba
+ mov ecx, [esp + 8 + 20] // width
sub edi, esi
convertloop:
READYUV422
- YUVTORGB(ebx)
+ YUVTORGB(kYuvConstants)
STORERGBA
sub ecx, 8
jg convertloop
- pop ebx
pop edi
pop esi
ret
}
}
+
#endif // HAS_I422TOARGBROW_SSSE3
#ifdef HAS_I400TOARGBROW_SSE2
@@ -3126,7 +3045,7 @@
mov eax, [esp + 4] // src
mov edx, [esp + 8] // dst
mov ecx, [esp + 12] // width
- movdqa xmm5, xmmword ptr kShuffleMirror
+ movdqa xmm5, kShuffleMirror
convertloop:
movdqu xmm0, [eax - 16 + ecx]
@@ -3147,7 +3066,7 @@
mov eax, [esp + 4] // src
mov edx, [esp + 8] // dst
mov ecx, [esp + 12] // width
- vbroadcastf128 ymm5, xmmword ptr kShuffleMirror
+ vbroadcastf128 ymm5, kShuffleMirror
convertloop:
vmovdqu ymm0, [eax - 32 + ecx]
@@ -3163,7 +3082,33 @@
}
#endif // HAS_MIRRORROW_AVX2
-#ifdef HAS_MIRRORUVROW_SSSE3
+#ifdef HAS_MIRRORROW_SSE2
+__declspec(naked)
+void MirrorRow_SSE2(const uint8* src, uint8* dst, int width) {
+ __asm {
+ mov eax, [esp + 4] // src
+ mov edx, [esp + 8] // dst
+ mov ecx, [esp + 12] // width
+
+ convertloop:
+ movdqu xmm0, [eax - 16 + ecx]
+ movdqa xmm1, xmm0 // swap bytes
+ psllw xmm0, 8
+ psrlw xmm1, 8
+ por xmm0, xmm1
+ pshuflw xmm0, xmm0, 0x1b // swap words
+ pshufhw xmm0, xmm0, 0x1b
+ pshufd xmm0, xmm0, 0x4e // swap qwords
+ movdqu [edx], xmm0
+ lea edx, [edx + 16]
+ sub ecx, 16
+ jg convertloop
+ ret
+ }
+}
+#endif // HAS_MIRRORROW_SSE2
+
+#ifdef HAS_MIRRORROW_UV_SSSE3
// Shuffle table for reversing the bytes of UV channels.
static const uvec8 kShuffleMirrorUV = {
14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u, 15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u
@@ -3178,7 +3123,7 @@
mov edx, [esp + 4 + 8] // dst_u
mov edi, [esp + 4 + 12] // dst_v
mov ecx, [esp + 4 + 16] // width
- movdqa xmm1, xmmword ptr kShuffleMirrorUV
+ movdqa xmm1, kShuffleMirrorUV
lea eax, [eax + ecx * 2 - 16]
sub edi, edx
@@ -3196,7 +3141,7 @@
ret
}
}
-#endif // HAS_MIRRORUVROW_SSSE3
+#endif // HAS_MIRRORROW_UV_SSSE3
#ifdef HAS_ARGBMIRRORROW_SSE2
__declspec(naked)
@@ -3232,7 +3177,7 @@
mov eax, [esp + 4] // src
mov edx, [esp + 8] // dst
mov ecx, [esp + 12] // width
- vmovdqu ymm5, ymmword ptr kARGBShuffleMirror_AVX2
+ vmovdqu ymm5, kARGBShuffleMirror_AVX2
convertloop:
vpermd ymm0, ymm5, [eax - 32 + ecx * 4] // permute dword order
@@ -3248,14 +3193,13 @@
#ifdef HAS_SPLITUVROW_SSE2
__declspec(naked)
-void SplitUVRow_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
- int width) {
+void SplitUVRow_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) {
__asm {
push edi
mov eax, [esp + 4 + 4] // src_uv
mov edx, [esp + 4 + 8] // dst_u
mov edi, [esp + 4 + 12] // dst_v
- mov ecx, [esp + 4 + 16] // width
+ mov ecx, [esp + 4 + 16] // pix
pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
psrlw xmm5, 8
sub edi, edx
@@ -3287,14 +3231,13 @@
#ifdef HAS_SPLITUVROW_AVX2
__declspec(naked)
-void SplitUVRow_AVX2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
- int width) {
+void SplitUVRow_AVX2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) {
__asm {
push edi
mov eax, [esp + 4 + 4] // src_uv
mov edx, [esp + 4 + 8] // dst_u
mov edi, [esp + 4 + 12] // dst_v
- mov ecx, [esp + 4 + 16] // width
+ mov ecx, [esp + 4 + 16] // pix
vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff
vpsrlw ymm5, ymm5, 8
sub edi, edx
@@ -3396,23 +3339,8 @@
mov eax, [esp + 4] // src
mov edx, [esp + 8] // dst
mov ecx, [esp + 12] // count
- test eax, 15
- jne convertloopu
- test edx, 15
- jne convertloopu
- convertloopa:
- movdqa xmm0, [eax]
- movdqa xmm1, [eax + 16]
- lea eax, [eax + 32]
- movdqa [edx], xmm0
- movdqa [edx + 16], xmm1
- lea edx, [edx + 32]
- sub ecx, 32
- jg convertloopa
- ret
-
- convertloopu:
+ convertloop:
movdqu xmm0, [eax]
movdqu xmm1, [eax + 16]
lea eax, [eax + 32]
@@ -3420,7 +3348,7 @@
movdqu [edx + 16], xmm1
lea edx, [edx + 32]
sub ecx, 32
- jg convertloopu
+ jg convertloop
ret
}
}
@@ -3532,33 +3460,6 @@
}
#endif // HAS_ARGBCOPYALPHAROW_AVX2
-#ifdef HAS_ARGBEXTRACTALPHAROW_SSE2
-// width in pixels
-__declspec(naked)
-void ARGBExtractAlphaRow_SSE2(const uint8* src_argb, uint8* dst_a, int width) {
- __asm {
- mov eax, [esp + 4] // src_argb
- mov edx, [esp + 8] // dst_a
- mov ecx, [esp + 12] // width
-
- extractloop:
- movdqu xmm0, [eax]
- movdqu xmm1, [eax + 16]
- lea eax, [eax + 32]
- psrld xmm0, 24
- psrld xmm1, 24
- packssdw xmm0, xmm1
- packuswb xmm0, xmm0
- movq qword ptr [edx], xmm0
- lea edx, [edx + 8]
- sub ecx, 8
- jg extractloop
-
- ret
- }
-}
-#endif // HAS_ARGBEXTRACTALPHAROW_SSE2
-
#ifdef HAS_ARGBCOPYYTOALPHAROW_SSE2
// width in pixels
__declspec(naked)
@@ -3678,11 +3579,12 @@
#ifdef HAS_YUY2TOYROW_AVX2
__declspec(naked)
-void YUY2ToYRow_AVX2(const uint8* src_yuy2, uint8* dst_y, int width) {
+void YUY2ToYRow_AVX2(const uint8* src_yuy2,
+ uint8* dst_y, int pix) {
__asm {
mov eax, [esp + 4] // src_yuy2
mov edx, [esp + 8] // dst_y
- mov ecx, [esp + 12] // width
+ mov ecx, [esp + 12] // pix
vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff
vpsrlw ymm5, ymm5, 8
@@ -3705,7 +3607,7 @@
__declspec(naked)
void YUY2ToUVRow_AVX2(const uint8* src_yuy2, int stride_yuy2,
- uint8* dst_u, uint8* dst_v, int width) {
+ uint8* dst_u, uint8* dst_v, int pix) {
__asm {
push esi
push edi
@@ -3713,7 +3615,7 @@
mov esi, [esp + 8 + 8] // stride_yuy2
mov edx, [esp + 8 + 12] // dst_u
mov edi, [esp + 8 + 16] // dst_v
- mov ecx, [esp + 8 + 20] // width
+ mov ecx, [esp + 8 + 20] // pix
vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff
vpsrlw ymm5, ymm5, 8
sub edi, edx
@@ -3749,13 +3651,13 @@
__declspec(naked)
void YUY2ToUV422Row_AVX2(const uint8* src_yuy2,
- uint8* dst_u, uint8* dst_v, int width) {
+ uint8* dst_u, uint8* dst_v, int pix) {
__asm {
push edi
mov eax, [esp + 4 + 4] // src_yuy2
mov edx, [esp + 4 + 8] // dst_u
mov edi, [esp + 4 + 12] // dst_v
- mov ecx, [esp + 4 + 16] // width
+ mov ecx, [esp + 4 + 16] // pix
vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff
vpsrlw ymm5, ymm5, 8
sub edi, edx
@@ -3788,11 +3690,11 @@
__declspec(naked)
void UYVYToYRow_AVX2(const uint8* src_uyvy,
- uint8* dst_y, int width) {
+ uint8* dst_y, int pix) {
__asm {
mov eax, [esp + 4] // src_uyvy
mov edx, [esp + 8] // dst_y
- mov ecx, [esp + 12] // width
+ mov ecx, [esp + 12] // pix
convertloop:
vmovdqu ymm0, [eax]
@@ -3813,7 +3715,7 @@
__declspec(naked)
void UYVYToUVRow_AVX2(const uint8* src_uyvy, int stride_uyvy,
- uint8* dst_u, uint8* dst_v, int width) {
+ uint8* dst_u, uint8* dst_v, int pix) {
__asm {
push esi
push edi
@@ -3821,7 +3723,7 @@
mov esi, [esp + 8 + 8] // stride_yuy2
mov edx, [esp + 8 + 12] // dst_u
mov edi, [esp + 8 + 16] // dst_v
- mov ecx, [esp + 8 + 20] // width
+ mov ecx, [esp + 8 + 20] // pix
vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff
vpsrlw ymm5, ymm5, 8
sub edi, edx
@@ -3857,13 +3759,13 @@
__declspec(naked)
void UYVYToUV422Row_AVX2(const uint8* src_uyvy,
- uint8* dst_u, uint8* dst_v, int width) {
+ uint8* dst_u, uint8* dst_v, int pix) {
__asm {
push edi
mov eax, [esp + 4 + 4] // src_yuy2
mov edx, [esp + 4 + 8] // dst_u
mov edi, [esp + 4 + 12] // dst_v
- mov ecx, [esp + 4 + 16] // width
+ mov ecx, [esp + 4 + 16] // pix
vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff
vpsrlw ymm5, ymm5, 8
sub edi, edx
@@ -3898,11 +3800,11 @@
#ifdef HAS_YUY2TOYROW_SSE2
__declspec(naked)
void YUY2ToYRow_SSE2(const uint8* src_yuy2,
- uint8* dst_y, int width) {
+ uint8* dst_y, int pix) {
__asm {
mov eax, [esp + 4] // src_yuy2
mov edx, [esp + 8] // dst_y
- mov ecx, [esp + 12] // width
+ mov ecx, [esp + 12] // pix
pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
psrlw xmm5, 8
@@ -3923,7 +3825,7 @@
__declspec(naked)
void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2,
- uint8* dst_u, uint8* dst_v, int width) {
+ uint8* dst_u, uint8* dst_v, int pix) {
__asm {
push esi
push edi
@@ -3931,7 +3833,7 @@
mov esi, [esp + 8 + 8] // stride_yuy2
mov edx, [esp + 8 + 12] // dst_u
mov edi, [esp + 8 + 16] // dst_v
- mov ecx, [esp + 8 + 20] // width
+ mov ecx, [esp + 8 + 20] // pix
pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
psrlw xmm5, 8
sub edi, edx
@@ -3966,13 +3868,13 @@
__declspec(naked)
void YUY2ToUV422Row_SSE2(const uint8* src_yuy2,
- uint8* dst_u, uint8* dst_v, int width) {
+ uint8* dst_u, uint8* dst_v, int pix) {
__asm {
push edi
mov eax, [esp + 4 + 4] // src_yuy2
mov edx, [esp + 4 + 8] // dst_u
mov edi, [esp + 4 + 12] // dst_v
- mov ecx, [esp + 4 + 16] // width
+ mov ecx, [esp + 4 + 16] // pix
pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
psrlw xmm5, 8
sub edi, edx
@@ -4002,11 +3904,11 @@
__declspec(naked)
void UYVYToYRow_SSE2(const uint8* src_uyvy,
- uint8* dst_y, int width) {
+ uint8* dst_y, int pix) {
__asm {
mov eax, [esp + 4] // src_uyvy
mov edx, [esp + 8] // dst_y
- mov ecx, [esp + 12] // width
+ mov ecx, [esp + 12] // pix
convertloop:
movdqu xmm0, [eax]
@@ -4025,7 +3927,7 @@
__declspec(naked)
void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy,
- uint8* dst_u, uint8* dst_v, int width) {
+ uint8* dst_u, uint8* dst_v, int pix) {
__asm {
push esi
push edi
@@ -4033,7 +3935,7 @@
mov esi, [esp + 8 + 8] // stride_yuy2
mov edx, [esp + 8 + 12] // dst_u
mov edi, [esp + 8 + 16] // dst_v
- mov ecx, [esp + 8 + 20] // width
+ mov ecx, [esp + 8 + 20] // pix
pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
psrlw xmm5, 8
sub edi, edx
@@ -4068,13 +3970,13 @@
__declspec(naked)
void UYVYToUV422Row_SSE2(const uint8* src_uyvy,
- uint8* dst_u, uint8* dst_v, int width) {
+ uint8* dst_u, uint8* dst_v, int pix) {
__asm {
push edi
mov eax, [esp + 4 + 4] // src_yuy2
mov edx, [esp + 4 + 8] // dst_u
mov edi, [esp + 4 + 12] // dst_v
- mov ecx, [esp + 4 + 16] // width
+ mov ecx, [esp + 4 + 16] // pix
pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
psrlw xmm5, 8
sub edi, edx
@@ -4103,123 +4005,93 @@
}
#endif // HAS_YUY2TOYROW_SSE2
-#ifdef HAS_BLENDPLANEROW_SSSE3
+#ifdef HAS_ARGBBLENDROW_SSE2
// Blend 8 pixels at a time.
-// unsigned version of math
-// =((A2*C2)+(B2*(255-C2))+255)/256
-// signed version of math
-// =(((A2-128)*C2)+((B2-128)*(255-C2))+32768+127)/256
__declspec(naked)
-void BlendPlaneRow_SSSE3(const uint8* src0, const uint8* src1,
- const uint8* alpha, uint8* dst, int width) {
+void ARGBBlendRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
+ uint8* dst_argb, int width) {
__asm {
push esi
- push edi
+ mov eax, [esp + 4 + 4] // src_argb0
+ mov esi, [esp + 4 + 8] // src_argb1
+ mov edx, [esp + 4 + 12] // dst_argb
+ mov ecx, [esp + 4 + 16] // width
+ pcmpeqb xmm7, xmm7 // generate constant 1
+ psrlw xmm7, 15
+ pcmpeqb xmm6, xmm6 // generate mask 0x00ff00ff
+ psrlw xmm6, 8
pcmpeqb xmm5, xmm5 // generate mask 0xff00ff00
psllw xmm5, 8
- mov eax, 0x80808080 // 128 for biasing image to signed.
- movd xmm6, eax
- pshufd xmm6, xmm6, 0x00
+ pcmpeqb xmm4, xmm4 // generate mask 0xff000000
+ pslld xmm4, 24
+ sub ecx, 4
+ jl convertloop4b // less than 4 pixels?
- mov eax, 0x807f807f // 32768 + 127 for unbias and round.
- movd xmm7, eax
- pshufd xmm7, xmm7, 0x00
- mov eax, [esp + 8 + 4] // src0
- mov edx, [esp + 8 + 8] // src1
- mov esi, [esp + 8 + 12] // alpha
- mov edi, [esp + 8 + 16] // dst
- mov ecx, [esp + 8 + 20] // width
- sub eax, esi
- sub edx, esi
- sub edi, esi
+ // 4 pixel loop.
+ convertloop4:
+ movdqu xmm3, [eax] // src argb
+ lea eax, [eax + 16]
+ movdqa xmm0, xmm3 // src argb
+ pxor xmm3, xmm4 // ~alpha
+ movdqu xmm2, [esi] // _r_b
+ psrlw xmm3, 8 // alpha
+ pshufhw xmm3, xmm3, 0F5h // 8 alpha words
+ pshuflw xmm3, xmm3, 0F5h
+ pand xmm2, xmm6 // _r_b
+ paddw xmm3, xmm7 // 256 - alpha
+ pmullw xmm2, xmm3 // _r_b * alpha
+ movdqu xmm1, [esi] // _a_g
+ lea esi, [esi + 16]
+ psrlw xmm1, 8 // _a_g
+ por xmm0, xmm4 // set alpha to 255
+ pmullw xmm1, xmm3 // _a_g * alpha
+ psrlw xmm2, 8 // _r_b convert to 8 bits again
+ paddusb xmm0, xmm2 // + src argb
+ pand xmm1, xmm5 // a_g_ convert to 8 bits again
+ paddusb xmm0, xmm1 // + src argb
+ movdqu [edx], xmm0
+ lea edx, [edx + 16]
+ sub ecx, 4
+ jge convertloop4
- // 8 pixel loop.
- convertloop8:
- movq xmm0, qword ptr [esi] // alpha
- punpcklbw xmm0, xmm0
- pxor xmm0, xmm5 // a, 255-a
- movq xmm1, qword ptr [eax + esi] // src0
- movq xmm2, qword ptr [edx + esi] // src1
- punpcklbw xmm1, xmm2
- psubb xmm1, xmm6 // bias src0/1 - 128
- pmaddubsw xmm0, xmm1
- paddw xmm0, xmm7 // unbias result - 32768 and round.
- psrlw xmm0, 8
- packuswb xmm0, xmm0
- movq qword ptr [edi + esi], xmm0
- lea esi, [esi + 8]
- sub ecx, 8
- jg convertloop8
+ convertloop4b:
+ add ecx, 4 - 1
+ jl convertloop1b
- pop edi
+ // 1 pixel loop.
+ convertloop1:
+ movd xmm3, [eax] // src argb
+ lea eax, [eax + 4]
+ movdqa xmm0, xmm3 // src argb
+ pxor xmm3, xmm4 // ~alpha
+ movd xmm2, [esi] // _r_b
+ psrlw xmm3, 8 // alpha
+ pshufhw xmm3, xmm3, 0F5h // 8 alpha words
+ pshuflw xmm3, xmm3, 0F5h
+ pand xmm2, xmm6 // _r_b
+ paddw xmm3, xmm7 // 256 - alpha
+ pmullw xmm2, xmm3 // _r_b * alpha
+ movd xmm1, [esi] // _a_g
+ lea esi, [esi + 4]
+ psrlw xmm1, 8 // _a_g
+ por xmm0, xmm4 // set alpha to 255
+ pmullw xmm1, xmm3 // _a_g * alpha
+ psrlw xmm2, 8 // _r_b convert to 8 bits again
+ paddusb xmm0, xmm2 // + src argb
+ pand xmm1, xmm5 // a_g_ convert to 8 bits again
+ paddusb xmm0, xmm1 // + src argb
+ movd [edx], xmm0
+ lea edx, [edx + 4]
+ sub ecx, 1
+ jge convertloop1
+
+ convertloop1b:
pop esi
ret
}
}
-#endif // HAS_BLENDPLANEROW_SSSE3
+#endif // HAS_ARGBBLENDROW_SSE2
-#ifdef HAS_BLENDPLANEROW_AVX2
-// Blend 32 pixels at a time.
-// unsigned version of math
-// =((A2*C2)+(B2*(255-C2))+255)/256
-// signed version of math
-// =(((A2-128)*C2)+((B2-128)*(255-C2))+32768+127)/256
-__declspec(naked)
-void BlendPlaneRow_AVX2(const uint8* src0, const uint8* src1,
- const uint8* alpha, uint8* dst, int width) {
- __asm {
- push esi
- push edi
- vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0xff00ff00
- vpsllw ymm5, ymm5, 8
- mov eax, 0x80808080 // 128 for biasing image to signed.
- vmovd xmm6, eax
- vbroadcastss ymm6, xmm6
- mov eax, 0x807f807f // 32768 + 127 for unbias and round.
- vmovd xmm7, eax
- vbroadcastss ymm7, xmm7
- mov eax, [esp + 8 + 4] // src0
- mov edx, [esp + 8 + 8] // src1
- mov esi, [esp + 8 + 12] // alpha
- mov edi, [esp + 8 + 16] // dst
- mov ecx, [esp + 8 + 20] // width
- sub eax, esi
- sub edx, esi
- sub edi, esi
-
- // 32 pixel loop.
- convertloop32:
- vmovdqu ymm0, [esi] // alpha
- vpunpckhbw ymm3, ymm0, ymm0 // 8..15, 24..31
- vpunpcklbw ymm0, ymm0, ymm0 // 0..7, 16..23
- vpxor ymm3, ymm3, ymm5 // a, 255-a
- vpxor ymm0, ymm0, ymm5 // a, 255-a
- vmovdqu ymm1, [eax + esi] // src0
- vmovdqu ymm2, [edx + esi] // src1
- vpunpckhbw ymm4, ymm1, ymm2
- vpunpcklbw ymm1, ymm1, ymm2
- vpsubb ymm4, ymm4, ymm6 // bias src0/1 - 128
- vpsubb ymm1, ymm1, ymm6 // bias src0/1 - 128
- vpmaddubsw ymm3, ymm3, ymm4
- vpmaddubsw ymm0, ymm0, ymm1
- vpaddw ymm3, ymm3, ymm7 // unbias result - 32768 and round.
- vpaddw ymm0, ymm0, ymm7 // unbias result - 32768 and round.
- vpsrlw ymm3, ymm3, 8
- vpsrlw ymm0, ymm0, 8
- vpackuswb ymm0, ymm0, ymm3
- vmovdqu [edi + esi], ymm0
- lea esi, [esi + 32]
- sub ecx, 32
- jg convertloop32
-
- pop edi
- pop esi
- vzeroupper
- ret
- }
-}
-#endif // HAS_BLENDPLANEROW_AVX2
-
#ifdef HAS_ARGBBLENDROW_SSSE3
// Shuffle table for isolating alpha.
static const uvec8 kShuffleAlpha = {
@@ -4226,8 +4098,14 @@
3u, 0x80, 3u, 0x80, 7u, 0x80, 7u, 0x80,
11u, 0x80, 11u, 0x80, 15u, 0x80, 15u, 0x80
};
-
+// Same as SSE2, but replaces:
+// psrlw xmm3, 8 // alpha
+// pshufhw xmm3, xmm3, 0F5h // 8 alpha words
+// pshuflw xmm3, xmm3, 0F5h
+// with..
+// pshufb xmm3, kShuffleAlpha // alpha
// Blend 8 pixels at a time.
+
__declspec(naked)
void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
uint8* dst_argb, int width) {
@@ -4255,7 +4133,7 @@
movdqa xmm0, xmm3 // src argb
pxor xmm3, xmm4 // ~alpha
movdqu xmm2, [esi] // _r_b
- pshufb xmm3, xmmword ptr kShuffleAlpha // alpha
+ pshufb xmm3, kShuffleAlpha // alpha
pand xmm2, xmm6 // _r_b
paddw xmm3, xmm7 // 256 - alpha
pmullw xmm2, xmm3 // _r_b * alpha
@@ -4284,7 +4162,7 @@
movdqa xmm0, xmm3 // src argb
pxor xmm3, xmm4 // ~alpha
movd xmm2, [esi] // _r_b
- pshufb xmm3, xmmword ptr kShuffleAlpha // alpha
+ pshufb xmm3, kShuffleAlpha // alpha
pand xmm2, xmm6 // _r_b
paddw xmm3, xmm7 // 256 - alpha
pmullw xmm2, xmm3 // _r_b * alpha
@@ -4309,6 +4187,48 @@
}
#endif // HAS_ARGBBLENDROW_SSSE3
+#ifdef HAS_ARGBATTENUATEROW_SSE2
+// Attenuate 4 pixels at a time.
+__declspec(naked)
+void ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) {
+ __asm {
+ mov eax, [esp + 4] // src_argb0
+ mov edx, [esp + 8] // dst_argb
+ mov ecx, [esp + 12] // width
+ pcmpeqb xmm4, xmm4 // generate mask 0xff000000
+ pslld xmm4, 24
+ pcmpeqb xmm5, xmm5 // generate mask 0x00ffffff
+ psrld xmm5, 8
+
+ convertloop:
+ movdqu xmm0, [eax] // read 4 pixels
+ punpcklbw xmm0, xmm0 // first 2
+ pshufhw xmm2, xmm0, 0FFh // 8 alpha words
+ pshuflw xmm2, xmm2, 0FFh
+ pmulhuw xmm0, xmm2 // rgb * a
+ movdqu xmm1, [eax] // read 4 pixels
+ punpckhbw xmm1, xmm1 // next 2 pixels
+ pshufhw xmm2, xmm1, 0FFh // 8 alpha words
+ pshuflw xmm2, xmm2, 0FFh
+ pmulhuw xmm1, xmm2 // rgb * a
+ movdqu xmm2, [eax] // alphas
+ lea eax, [eax + 16]
+ psrlw xmm0, 8
+ pand xmm2, xmm4
+ psrlw xmm1, 8
+ packuswb xmm0, xmm1
+ pand xmm0, xmm5 // keep original alphas
+ por xmm0, xmm2
+ movdqu [edx], xmm0
+ lea edx, [edx + 16]
+ sub ecx, 4
+ jg convertloop
+
+ ret
+ }
+}
+#endif // HAS_ARGBATTENUATEROW_SSE2
+
#ifdef HAS_ARGBATTENUATEROW_SSSE3
// Shuffle table duplicating alpha.
static const uvec8 kShuffleAlpha0 = {
@@ -4326,8 +4246,8 @@
mov ecx, [esp + 12] // width
pcmpeqb xmm3, xmm3 // generate mask 0xff000000
pslld xmm3, 24
- movdqa xmm4, xmmword ptr kShuffleAlpha0
- movdqa xmm5, xmmword ptr kShuffleAlpha1
+ movdqa xmm4, kShuffleAlpha0
+ movdqa xmm5, kShuffleAlpha1
convertloop:
movdqu xmm0, [eax] // read 4 pixels
@@ -4369,7 +4289,7 @@
mov edx, [esp + 8] // dst_argb
mov ecx, [esp + 12] // width
sub edx, eax
- vbroadcastf128 ymm4, xmmword ptr kShuffleAlpha_AVX2
+ vbroadcastf128 ymm4,kShuffleAlpha_AVX2
vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0xff000000
vpslld ymm5, ymm5, 24
@@ -4403,13 +4323,11 @@
void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb,
int width) {
__asm {
- push ebx
push esi
push edi
- mov eax, [esp + 12 + 4] // src_argb
- mov edx, [esp + 12 + 8] // dst_argb
- mov ecx, [esp + 12 + 12] // width
- lea ebx, fixed_invtbl8
+ mov eax, [esp + 8 + 4] // src_argb0
+ mov edx, [esp + 8 + 8] // dst_argb
+ mov ecx, [esp + 8 + 12] // width
convertloop:
movdqu xmm0, [eax] // read 4 pixels
@@ -4416,8 +4334,8 @@
movzx esi, byte ptr [eax + 3] // first alpha
movzx edi, byte ptr [eax + 7] // second alpha
punpcklbw xmm0, xmm0 // first 2
- movd xmm2, dword ptr [ebx + esi * 4]
- movd xmm3, dword ptr [ebx + edi * 4]
+ movd xmm2, dword ptr fixed_invtbl8[esi * 4]
+ movd xmm3, dword ptr fixed_invtbl8[edi * 4]
pshuflw xmm2, xmm2, 040h // first 4 inv_alpha words. 1, a, a, a
pshuflw xmm3, xmm3, 040h // next 4 inv_alpha words
movlhps xmm2, xmm3
@@ -4427,22 +4345,21 @@
movzx esi, byte ptr [eax + 11] // third alpha
movzx edi, byte ptr [eax + 15] // forth alpha
punpckhbw xmm1, xmm1 // next 2
- movd xmm2, dword ptr [ebx + esi * 4]
- movd xmm3, dword ptr [ebx + edi * 4]
+ movd xmm2, dword ptr fixed_invtbl8[esi * 4]
+ movd xmm3, dword ptr fixed_invtbl8[edi * 4]
pshuflw xmm2, xmm2, 040h // first 4 inv_alpha words
pshuflw xmm3, xmm3, 040h // next 4 inv_alpha words
movlhps xmm2, xmm3
pmulhuw xmm1, xmm2 // rgb * a
lea eax, [eax + 16]
+
packuswb xmm0, xmm1
movdqu [edx], xmm0
lea edx, [edx + 16]
sub ecx, 4
jg convertloop
-
pop edi
pop esi
- pop ebx
ret
}
}
@@ -4464,7 +4381,7 @@
mov edx, [esp + 8] // dst_argb
mov ecx, [esp + 12] // width
sub edx, eax
- vbroadcastf128 ymm4, xmmword ptr kUnattenShuffleAlpha_AVX2
+ vbroadcastf128 ymm4, kUnattenShuffleAlpha_AVX2
convertloop:
vmovdqu ymm6, [eax] // read 8 pixels.
@@ -4495,37 +4412,36 @@
int width) {
__asm {
- push ebx
+ mov eax, [esp + 4] // src_argb0
+ mov edx, [esp + 8] // dst_argb
+ mov ecx, [esp + 12] // width
+ sub edx, eax
+ vbroadcastf128 ymm5, kUnattenShuffleAlpha_AVX2
+
push esi
push edi
- mov eax, [esp + 12 + 4] // src_argb
- mov edx, [esp + 12 + 8] // dst_argb
- mov ecx, [esp + 12 + 12] // width
- sub edx, eax
- lea ebx, fixed_invtbl8
- vbroadcastf128 ymm5, xmmword ptr kUnattenShuffleAlpha_AVX2
convertloop:
// replace VPGATHER
movzx esi, byte ptr [eax + 3] // alpha0
movzx edi, byte ptr [eax + 7] // alpha1
- vmovd xmm0, dword ptr [ebx + esi * 4] // [1,a0]
- vmovd xmm1, dword ptr [ebx + edi * 4] // [1,a1]
+ vmovd xmm0, dword ptr fixed_invtbl8[esi * 4] // [1,a0]
+ vmovd xmm1, dword ptr fixed_invtbl8[edi * 4] // [1,a1]
movzx esi, byte ptr [eax + 11] // alpha2
movzx edi, byte ptr [eax + 15] // alpha3
vpunpckldq xmm6, xmm0, xmm1 // [1,a1,1,a0]
- vmovd xmm2, dword ptr [ebx + esi * 4] // [1,a2]
- vmovd xmm3, dword ptr [ebx + edi * 4] // [1,a3]
- movzx esi, byte ptr [eax + 19] // alpha4
+ vmovd xmm2, dword ptr fixed_invtbl8[esi * 4] // [1,a2]
+ vmovd xmm3, dword ptr fixed_invtbl8[edi * 4] // [1,a3]
+ movzx esi, byte ptr [eax + 19] // alpha4
movzx edi, byte ptr [eax + 23] // alpha5
vpunpckldq xmm7, xmm2, xmm3 // [1,a3,1,a2]
- vmovd xmm0, dword ptr [ebx + esi * 4] // [1,a4]
- vmovd xmm1, dword ptr [ebx + edi * 4] // [1,a5]
+ vmovd xmm0, dword ptr fixed_invtbl8[esi * 4] // [1,a4]
+ vmovd xmm1, dword ptr fixed_invtbl8[edi * 4] // [1,a5]
movzx esi, byte ptr [eax + 27] // alpha6
movzx edi, byte ptr [eax + 31] // alpha7
vpunpckldq xmm0, xmm0, xmm1 // [1,a5,1,a4]
- vmovd xmm2, dword ptr [ebx + esi * 4] // [1,a6]
- vmovd xmm3, dword ptr [ebx + edi * 4] // [1,a7]
+ vmovd xmm2, dword ptr fixed_invtbl8[esi * 4] // [1,a6]
+ vmovd xmm3, dword ptr fixed_invtbl8[edi * 4] // [1,a7]
vpunpckldq xmm2, xmm2, xmm3 // [1,a7,1,a6]
vpunpcklqdq xmm3, xmm6, xmm7 // [1,a3,1,a2,1,a1,1,a0]
vpunpcklqdq xmm0, xmm0, xmm2 // [1,a7,1,a6,1,a5,1,a4]
@@ -4549,7 +4465,6 @@
pop edi
pop esi
- pop ebx
vzeroupper
ret
}
@@ -4565,8 +4480,8 @@
mov eax, [esp + 4] /* src_argb */
mov edx, [esp + 8] /* dst_argb */
mov ecx, [esp + 12] /* width */
- movdqa xmm4, xmmword ptr kARGBToYJ
- movdqa xmm5, xmmword ptr kAddYJ64
+ movdqa xmm4, kARGBToYJ
+ movdqa xmm5, kAddYJ64
convertloop:
movdqu xmm0, [eax] // G
@@ -4623,9 +4538,9 @@
__asm {
mov eax, [esp + 4] /* dst_argb */
mov ecx, [esp + 8] /* width */
- movdqa xmm2, xmmword ptr kARGBToSepiaB
- movdqa xmm3, xmmword ptr kARGBToSepiaG
- movdqa xmm4, xmmword ptr kARGBToSepiaR
+ movdqa xmm2, kARGBToSepiaB
+ movdqa xmm3, kARGBToSepiaG
+ movdqa xmm4, kARGBToSepiaR
convertloop:
movdqu xmm0, [eax] // B
@@ -5275,7 +5190,6 @@
// dst points to pixel to store result to.
// count is number of averaged pixels to produce.
// Does 4 pixels at a time.
-// This function requires alignment on accumulation buffer pointers.
void CumulativeSumToAverageRow_SSE2(const int32* topleft, const int32* botleft,
int width, int area, uint8* dst,
int count) {
@@ -5603,38 +5517,36 @@
mov edx, [esp + 8 + 12] // src_stride
mov ecx, [esp + 8 + 16] // dst_width
mov eax, [esp + 8 + 20] // source_y_fraction (0..255)
+ shr eax, 1
// Dispatch to specialized filters if applicable.
cmp eax, 0
- je xloop100 // 0 / 256. Blend 100 / 0.
+ je xloop100 // 0 / 128. Blend 100 / 0.
sub edi, esi
- cmp eax, 128
- je xloop50 // 128 /256 is 0.50. Blend 50 / 50.
+ cmp eax, 32
+ je xloop75 // 32 / 128 is 0.25. Blend 75 / 25.
+ cmp eax, 64
+ je xloop50 // 64 / 128 is 0.50. Blend 50 / 50.
+ cmp eax, 96
+ je xloop25 // 96 / 128 is 0.75. Blend 25 / 75.
- vmovd xmm0, eax // high fraction 0..255
+ vmovd xmm0, eax // high fraction 0..127
neg eax
- add eax, 256
- vmovd xmm5, eax // low fraction 256..1
+ add eax, 128
+ vmovd xmm5, eax // low fraction 128..1
vpunpcklbw xmm5, xmm5, xmm0
vpunpcklwd xmm5, xmm5, xmm5
- vbroadcastss ymm5, xmm5
+ vpxor ymm0, ymm0, ymm0
+ vpermd ymm5, ymm0, ymm5
- mov eax, 0x80808080 // 128b for bias and rounding.
- vmovd xmm4, eax
- vbroadcastss ymm4, xmm4
-
xloop:
vmovdqu ymm0, [esi]
vmovdqu ymm2, [esi + edx]
vpunpckhbw ymm1, ymm0, ymm2 // mutates
- vpunpcklbw ymm0, ymm0, ymm2
- vpsubb ymm1, ymm1, ymm4 // bias to signed image
- vpsubb ymm0, ymm0, ymm4
- vpmaddubsw ymm1, ymm5, ymm1
- vpmaddubsw ymm0, ymm5, ymm0
- vpaddw ymm1, ymm1, ymm4 // unbias and round
- vpaddw ymm0, ymm0, ymm4
- vpsrlw ymm1, ymm1, 8
- vpsrlw ymm0, ymm0, 8
+ vpunpcklbw ymm0, ymm0, ymm2 // mutates
+ vpmaddubsw ymm0, ymm0, ymm5
+ vpmaddubsw ymm1, ymm1, ymm5
+ vpsrlw ymm0, ymm0, 7
+ vpsrlw ymm1, ymm1, 7
vpackuswb ymm0, ymm0, ymm1 // unmutates
vmovdqu [esi + edi], ymm0
lea esi, [esi + 32]
@@ -5642,6 +5554,18 @@
jg xloop
jmp xloop99
+ // Blend 25 / 75.
+ xloop25:
+ vmovdqu ymm0, [esi]
+ vmovdqu ymm1, [esi + edx]
+ vpavgb ymm0, ymm0, ymm1
+ vpavgb ymm0, ymm0, ymm1
+ vmovdqu [esi + edi], ymm0
+ lea esi, [esi + 32]
+ sub ecx, 32
+ jg xloop25
+ jmp xloop99
+
// Blend 50 / 50.
xloop50:
vmovdqu ymm0, [esi]
@@ -5652,6 +5576,18 @@
jg xloop50
jmp xloop99
+ // Blend 75 / 25.
+ xloop75:
+ vmovdqu ymm1, [esi]
+ vmovdqu ymm0, [esi + edx]
+ vpavgb ymm0, ymm0, ymm1
+ vpavgb ymm0, ymm0, ymm1
+ vmovdqu [esi + edi], ymm0
+ lea esi, [esi + 32]
+ sub ecx, 32
+ jg xloop75
+ jmp xloop99
+
// Blend 100 / 0 - Copy row unchanged.
xloop100:
rep movsb
@@ -5666,7 +5602,6 @@
#endif // HAS_INTERPOLATEROW_AVX2
// Bilinear filter 16x2 -> 16x1
-// TODO(fbarchard): Consider allowing 256 using memcpy.
__declspec(naked)
void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
ptrdiff_t src_stride, int dst_width,
@@ -5674,7 +5609,6 @@
__asm {
push esi
push edi
-
mov edi, [esp + 8 + 4] // dst_ptr
mov esi, [esp + 8 + 8] // src_ptr
mov edx, [esp + 8 + 12] // src_stride
@@ -5681,22 +5615,24 @@
mov ecx, [esp + 8 + 16] // dst_width
mov eax, [esp + 8 + 20] // source_y_fraction (0..255)
sub edi, esi
+ shr eax, 1
// Dispatch to specialized filters if applicable.
cmp eax, 0
- je xloop100 // 0 /256. Blend 100 / 0.
- cmp eax, 128
- je xloop50 // 128 / 256 is 0.50. Blend 50 / 50.
+ je xloop100 // 0 / 128. Blend 100 / 0.
+ cmp eax, 32
+ je xloop75 // 32 / 128 is 0.25. Blend 75 / 25.
+ cmp eax, 64
+ je xloop50 // 64 / 128 is 0.50. Blend 50 / 50.
+ cmp eax, 96
+ je xloop25 // 96 / 128 is 0.75. Blend 25 / 75.
- movd xmm0, eax // high fraction 0..255
+ movd xmm0, eax // high fraction 0..127
neg eax
- add eax, 256
- movd xmm5, eax // low fraction 255..1
+ add eax, 128
+ movd xmm5, eax // low fraction 128..1
punpcklbw xmm5, xmm0
punpcklwd xmm5, xmm5
pshufd xmm5, xmm5, 0
- mov eax, 0x80808080 // 128 for biasing image to signed.
- movd xmm4, eax
- pshufd xmm4, xmm4, 0x00
xloop:
movdqu xmm0, [esi]
@@ -5704,23 +5640,29 @@
movdqu xmm1, xmm0
punpcklbw xmm0, xmm2
punpckhbw xmm1, xmm2
- psubb xmm0, xmm4 // bias image by -128
- psubb xmm1, xmm4
- movdqa xmm2, xmm5
- movdqa xmm3, xmm5
- pmaddubsw xmm2, xmm0
- pmaddubsw xmm3, xmm1
- paddw xmm2, xmm4
- paddw xmm3, xmm4
- psrlw xmm2, 8
- psrlw xmm3, 8
- packuswb xmm2, xmm3
- movdqu [esi + edi], xmm2
+ pmaddubsw xmm0, xmm5
+ pmaddubsw xmm1, xmm5
+ psrlw xmm0, 7
+ psrlw xmm1, 7
+ packuswb xmm0, xmm1
+ movdqu [esi + edi], xmm0
lea esi, [esi + 16]
sub ecx, 16
jg xloop
jmp xloop99
+ // Blend 25 / 75.
+ xloop25:
+ movdqu xmm0, [esi]
+ movdqu xmm1, [esi + edx]
+ pavgb xmm0, xmm1
+ pavgb xmm0, xmm1
+ movdqu [esi + edi], xmm0
+ lea esi, [esi + 16]
+ sub ecx, 16
+ jg xloop25
+ jmp xloop99
+
// Blend 50 / 50.
xloop50:
movdqu xmm0, [esi]
@@ -5732,6 +5674,18 @@
jg xloop50
jmp xloop99
+ // Blend 75 / 25.
+ xloop75:
+ movdqu xmm1, [esi]
+ movdqu xmm0, [esi + edx]
+ pavgb xmm0, xmm1
+ pavgb xmm0, xmm1
+ movdqu [esi + edi], xmm0
+ lea esi, [esi + 16]
+ sub ecx, 16
+ jg xloop75
+ jmp xloop99
+
// Blend 100 / 0 - Copy row unchanged.
xloop100:
movdqu xmm0, [esi]
@@ -5747,16 +5701,124 @@
}
}
+#ifdef HAS_INTERPOLATEROW_SSE2
+// Bilinear filter 16x2 -> 16x1
+__declspec(naked)
+void InterpolateRow_SSE2(uint8* dst_ptr, const uint8* src_ptr,
+ ptrdiff_t src_stride, int dst_width,
+ int source_y_fraction) {
+ __asm {
+ push esi
+ push edi
+ mov edi, [esp + 8 + 4] // dst_ptr
+ mov esi, [esp + 8 + 8] // src_ptr
+ mov edx, [esp + 8 + 12] // src_stride
+ mov ecx, [esp + 8 + 16] // dst_width
+ mov eax, [esp + 8 + 20] // source_y_fraction (0..255)
+ sub edi, esi
+ // Dispatch to specialized filters if applicable.
+ cmp eax, 0
+ je xloop100 // 0 / 256. Blend 100 / 0.
+ cmp eax, 64
+ je xloop75 // 64 / 256 is 0.25. Blend 75 / 25.
+ cmp eax, 128
+ je xloop50 // 128 / 256 is 0.50. Blend 50 / 50.
+ cmp eax, 192
+ je xloop25 // 192 / 256 is 0.75. Blend 25 / 75.
+
+ movd xmm5, eax // xmm5 = y fraction
+ punpcklbw xmm5, xmm5
+ psrlw xmm5, 1
+ punpcklwd xmm5, xmm5
+ punpckldq xmm5, xmm5
+ punpcklqdq xmm5, xmm5
+ pxor xmm4, xmm4
+
+ xloop:
+ movdqu xmm0, [esi] // row0
+ movdqu xmm2, [esi + edx] // row1
+ movdqu xmm1, xmm0
+ movdqu xmm3, xmm2
+ punpcklbw xmm2, xmm4
+ punpckhbw xmm3, xmm4
+ punpcklbw xmm0, xmm4
+ punpckhbw xmm1, xmm4
+ psubw xmm2, xmm0 // row1 - row0
+ psubw xmm3, xmm1
+ paddw xmm2, xmm2 // 9 bits * 15 bits = 8.16
+ paddw xmm3, xmm3
+ pmulhw xmm2, xmm5 // scale diff
+ pmulhw xmm3, xmm5
+ paddw xmm0, xmm2 // sum rows
+ paddw xmm1, xmm3
+ packuswb xmm0, xmm1
+ movdqu [esi + edi], xmm0
+ lea esi, [esi + 16]
+ sub ecx, 16
+ jg xloop
+ jmp xloop99
+
+ // Blend 25 / 75.
+ xloop25:
+ movdqu xmm0, [esi]
+ movdqu xmm1, [esi + edx]
+ pavgb xmm0, xmm1
+ pavgb xmm0, xmm1
+ movdqu [esi + edi], xmm0
+ lea esi, [esi + 16]
+ sub ecx, 16
+ jg xloop25
+ jmp xloop99
+
+ // Blend 50 / 50.
+ xloop50:
+ movdqu xmm0, [esi]
+ movdqu xmm1, [esi + edx]
+ pavgb xmm0, xmm1
+ movdqu [esi + edi], xmm0
+ lea esi, [esi + 16]
+ sub ecx, 16
+ jg xloop50
+ jmp xloop99
+
+ // Blend 75 / 25.
+ xloop75:
+ movdqu xmm1, [esi]
+ movdqu xmm0, [esi + edx]
+ pavgb xmm0, xmm1
+ pavgb xmm0, xmm1
+ movdqu [esi + edi], xmm0
+ lea esi, [esi + 16]
+ sub ecx, 16
+ jg xloop75
+ jmp xloop99
+
+ // Blend 100 / 0 - Copy row unchanged.
+ xloop100:
+ movdqu xmm0, [esi]
+ movdqu [esi + edi], xmm0
+ lea esi, [esi + 16]
+ sub ecx, 16
+ jg xloop100
+
+ xloop99:
+ pop edi
+ pop esi
+ ret
+ }
+}
+#endif // HAS_INTERPOLATEROW_SSE2
+
// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
__declspec(naked)
void ARGBShuffleRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
- const uint8* shuffler, int width) {
+ const uint8* shuffler, int pix) {
__asm {
mov eax, [esp + 4] // src_argb
mov edx, [esp + 8] // dst_argb
mov ecx, [esp + 12] // shuffler
movdqu xmm5, [ecx]
- mov ecx, [esp + 16] // width
+ mov ecx, [esp + 16] // pix
wloop:
movdqu xmm0, [eax]
@@ -5776,13 +5838,13 @@
#ifdef HAS_ARGBSHUFFLEROW_AVX2
__declspec(naked)
void ARGBShuffleRow_AVX2(const uint8* src_argb, uint8* dst_argb,
- const uint8* shuffler, int width) {
+ const uint8* shuffler, int pix) {
__asm {
mov eax, [esp + 4] // src_argb
mov edx, [esp + 8] // dst_argb
mov ecx, [esp + 12] // shuffler
vbroadcastf128 ymm5, [ecx] // same shuffle in high as low.
- mov ecx, [esp + 16] // width
+ mov ecx, [esp + 16] // pix
wloop:
vmovdqu ymm0, [eax]
@@ -5804,7 +5866,7 @@
__declspec(naked)
void ARGBShuffleRow_SSE2(const uint8* src_argb, uint8* dst_argb,
- const uint8* shuffler, int width) {
+ const uint8* shuffler, int pix) {
__asm {
push ebx
push esi
@@ -5811,7 +5873,7 @@
mov eax, [esp + 8 + 4] // src_argb
mov edx, [esp + 8 + 8] // dst_argb
mov esi, [esp + 8 + 12] // shuffler
- mov ecx, [esp + 8 + 16] // width
+ mov ecx, [esp + 8 + 16] // pix
pxor xmm5, xmm5
mov ebx, [esi] // shuffler
@@ -6183,7 +6245,7 @@
// 4 pixel loop.
convertloop:
- movdqu xmm0, xmmword ptr [eax] // generate luma ptr
+ movdqu xmm0, qword ptr [eax] // generate luma ptr
pmaddubsw xmm0, xmm3
phaddw xmm0, xmm0
pand xmm0, xmm4 // mask out low bits
--- /dev/null
+++ b/third_party/libyuv/source/row_x86.asm
@@ -1,0 +1,146 @@
+;
+; Copyright 2012 The LibYuv Project Authors. All rights reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+%ifdef __YASM_VERSION_ID__
+%if __YASM_VERSION_ID__ < 01020000h
+%error AVX2 is supported only by yasm 1.2.0 or later.
+%endif
+%endif
+%include "x86inc.asm"
+
+SECTION .text
+
+; cglobal numeric constants are parameters, gpr regs, mm regs
+
+; void YUY2ToYRow_SSE2(const uint8* src_yuy2, uint8* dst_y, int pix)
+
+%macro YUY2TOYROW 2-3
+cglobal %1ToYRow%3, 3, 3, 3, src_yuy2, dst_y, pix
+%ifidn %1,YUY2
+ pcmpeqb m2, m2, m2 ; generate mask 0x00ff00ff
+ psrlw m2, m2, 8
+%endif
+
+ ALIGN 4
+.convertloop:
+ mov%2 m0, [src_yuy2q]
+ mov%2 m1, [src_yuy2q + mmsize]
+ lea src_yuy2q, [src_yuy2q + mmsize * 2]
+%ifidn %1,YUY2
+ pand m0, m0, m2 ; YUY2 even bytes are Y
+ pand m1, m1, m2
+%else
+ psrlw m0, m0, 8 ; UYVY odd bytes are Y
+ psrlw m1, m1, 8
+%endif
+ packuswb m0, m0, m1
+%if cpuflag(AVX2)
+ vpermq m0, m0, 0xd8
+%endif
+ sub pixd, mmsize
+ mov%2 [dst_yq], m0
+ lea dst_yq, [dst_yq + mmsize]
+ jg .convertloop
+ REP_RET
+%endmacro
+
+; TODO(fbarchard): Remove MMX. Add SSSE3 pshufb version.
+INIT_MMX MMX
+YUY2TOYROW YUY2,a,
+YUY2TOYROW YUY2,u,_Unaligned
+YUY2TOYROW UYVY,a,
+YUY2TOYROW UYVY,u,_Unaligned
+INIT_XMM SSE2
+YUY2TOYROW YUY2,a,
+YUY2TOYROW YUY2,u,_Unaligned
+YUY2TOYROW UYVY,a,
+YUY2TOYROW UYVY,u,_Unaligned
+INIT_YMM AVX2
+YUY2TOYROW YUY2,a,
+YUY2TOYROW UYVY,a,
+
+; void SplitUVRow_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix)
+
+%macro SplitUVRow 1-2
+cglobal SplitUVRow%2, 4, 4, 5, src_uv, dst_u, dst_v, pix
+ pcmpeqb m4, m4, m4 ; generate mask 0x00ff00ff
+ psrlw m4, m4, 8
+ sub dst_vq, dst_uq
+
+ ALIGN 4
+.convertloop:
+ mov%1 m0, [src_uvq]
+ mov%1 m1, [src_uvq + mmsize]
+ lea src_uvq, [src_uvq + mmsize * 2]
+ psrlw m2, m0, 8 ; odd bytes
+ psrlw m3, m1, 8
+ pand m0, m0, m4 ; even bytes
+ pand m1, m1, m4
+ packuswb m0, m0, m1
+ packuswb m2, m2, m3
+%if cpuflag(AVX2)
+ vpermq m0, m0, 0xd8
+ vpermq m2, m2, 0xd8
+%endif
+ mov%1 [dst_uq], m0
+ mov%1 [dst_uq + dst_vq], m2
+ lea dst_uq, [dst_uq + mmsize]
+ sub pixd, mmsize
+ jg .convertloop
+ REP_RET
+%endmacro
+
+INIT_MMX MMX
+SplitUVRow a,
+SplitUVRow u,_Unaligned
+INIT_XMM SSE2
+SplitUVRow a,
+SplitUVRow u,_Unaligned
+INIT_YMM AVX2
+SplitUVRow a,
+
+; void MergeUVRow_SSE2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
+; int width);
+
+%macro MergeUVRow_ 1-2
+cglobal MergeUVRow_%2, 4, 4, 3, src_u, src_v, dst_uv, pix
+ sub src_vq, src_uq
+
+ ALIGN 4
+.convertloop:
+ mov%1 m0, [src_uq]
+ mov%1 m1, [src_vq]
+ lea src_uq, [src_uq + mmsize]
+ punpcklbw m2, m0, m1 // first 8 UV pairs
+ punpckhbw m0, m0, m1 // next 8 UV pairs
+%if cpuflag(AVX2)
+ vperm2i128 m1, m2, m0, 0x20 // low 128 of ymm2 and low 128 of ymm0
+ vperm2i128 m2, m2, m0, 0x31 // high 128 of ymm2 and high 128 of ymm0
+ mov%1 [dst_uvq], m1
+ mov%1 [dst_uvq + mmsize], m2
+%else
+ mov%1 [dst_uvq], m2
+ mov%1 [dst_uvq + mmsize], m0
+%endif
+ lea dst_uvq, [dst_uvq + mmsize * 2]
+ sub pixd, mmsize
+ jg .convertloop
+ REP_RET
+%endmacro
+
+INIT_MMX MMX
+MergeUVRow_ a,
+MergeUVRow_ u,_Unaligned
+INIT_XMM SSE2
+MergeUVRow_ a,
+MergeUVRow_ u,_Unaligned
+INIT_YMM AVX2
+MergeUVRow_ a,
+
--- a/third_party/libyuv/source/scale.cc
+++ b/third_party/libyuv/source/scale.cc
@@ -61,15 +61,15 @@
}
}
#endif
-#if defined(HAS_SCALEROWDOWN2_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3)) {
- ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_Any_SSSE3 :
- (filtering == kFilterLinear ? ScaleRowDown2Linear_Any_SSSE3 :
- ScaleRowDown2Box_Any_SSSE3);
+#if defined(HAS_SCALEROWDOWN2_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_Any_SSE2 :
+ (filtering == kFilterLinear ? ScaleRowDown2Linear_Any_SSE2 :
+ ScaleRowDown2Box_Any_SSE2);
if (IS_ALIGNED(dst_width, 16)) {
- ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_SSSE3 :
- (filtering == kFilterLinear ? ScaleRowDown2Linear_SSSE3 :
- ScaleRowDown2Box_SSSE3);
+ ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_SSE2 :
+ (filtering == kFilterLinear ? ScaleRowDown2Linear_SSE2 :
+ ScaleRowDown2Box_SSE2);
}
}
#endif
@@ -85,12 +85,12 @@
}
}
#endif
-#if defined(HAS_SCALEROWDOWN2_DSPR2)
- if (TestCpuFlag(kCpuHasDSPR2) && IS_ALIGNED(src_ptr, 4) &&
+#if defined(HAS_SCALEROWDOWN2_MIPS_DSPR2)
+ if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(src_ptr, 4) &&
IS_ALIGNED(src_stride, 4) && IS_ALIGNED(row_stride, 4) &&
IS_ALIGNED(dst_ptr, 4) && IS_ALIGNED(dst_stride, 4)) {
ScaleRowDown2 = filtering ?
- ScaleRowDown2Box_DSPR2 : ScaleRowDown2_DSPR2;
+ ScaleRowDown2Box_MIPS_DSPR2 : ScaleRowDown2_MIPS_DSPR2;
}
#endif
@@ -135,12 +135,12 @@
ScaleRowDown2Box_16_SSE2);
}
#endif
-#if defined(HAS_SCALEROWDOWN2_16_DSPR2)
- if (TestCpuFlag(kCpuHasDSPR2) && IS_ALIGNED(src_ptr, 4) &&
+#if defined(HAS_SCALEROWDOWN2_16_MIPS_DSPR2)
+ if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(src_ptr, 4) &&
IS_ALIGNED(src_stride, 4) && IS_ALIGNED(row_stride, 4) &&
IS_ALIGNED(dst_ptr, 4) && IS_ALIGNED(dst_stride, 4)) {
ScaleRowDown2 = filtering ?
- ScaleRowDown2Box_16_DSPR2 : ScaleRowDown2_16_DSPR2;
+ ScaleRowDown2Box_16_MIPS_DSPR2 : ScaleRowDown2_16_MIPS_DSPR2;
}
#endif
@@ -182,12 +182,12 @@
}
}
#endif
-#if defined(HAS_SCALEROWDOWN4_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3)) {
+#if defined(HAS_SCALEROWDOWN4_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
ScaleRowDown4 = filtering ?
- ScaleRowDown4Box_Any_SSSE3 : ScaleRowDown4_Any_SSSE3;
+ ScaleRowDown4Box_Any_SSE2 : ScaleRowDown4_Any_SSE2;
if (IS_ALIGNED(dst_width, 8)) {
- ScaleRowDown4 = filtering ? ScaleRowDown4Box_SSSE3 : ScaleRowDown4_SSSE3;
+ ScaleRowDown4 = filtering ? ScaleRowDown4Box_SSE2 : ScaleRowDown4_SSE2;
}
}
#endif
@@ -200,12 +200,12 @@
}
}
#endif
-#if defined(HAS_SCALEROWDOWN4_DSPR2)
- if (TestCpuFlag(kCpuHasDSPR2) && IS_ALIGNED(row_stride, 4) &&
+#if defined(HAS_SCALEROWDOWN4_MIPS_DSPR2)
+ if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(row_stride, 4) &&
IS_ALIGNED(src_ptr, 4) && IS_ALIGNED(src_stride, 4) &&
IS_ALIGNED(dst_ptr, 4) && IS_ALIGNED(dst_stride, 4)) {
ScaleRowDown4 = filtering ?
- ScaleRowDown4Box_DSPR2 : ScaleRowDown4_DSPR2;
+ ScaleRowDown4Box_MIPS_DSPR2 : ScaleRowDown4_MIPS_DSPR2;
}
#endif
@@ -245,12 +245,12 @@
ScaleRowDown4_16_SSE2;
}
#endif
-#if defined(HAS_SCALEROWDOWN4_16_DSPR2)
- if (TestCpuFlag(kCpuHasDSPR2) && IS_ALIGNED(row_stride, 4) &&
+#if defined(HAS_SCALEROWDOWN4_16_MIPS_DSPR2)
+ if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(row_stride, 4) &&
IS_ALIGNED(src_ptr, 4) && IS_ALIGNED(src_stride, 4) &&
IS_ALIGNED(dst_ptr, 4) && IS_ALIGNED(dst_stride, 4)) {
ScaleRowDown4 = filtering ?
- ScaleRowDown4Box_16_DSPR2 : ScaleRowDown4_16_DSPR2;
+ ScaleRowDown4Box_16_MIPS_DSPR2 : ScaleRowDown4_16_MIPS_DSPR2;
}
#endif
@@ -325,16 +325,16 @@
}
}
#endif
-#if defined(HAS_SCALEROWDOWN34_DSPR2)
- if (TestCpuFlag(kCpuHasDSPR2) && (dst_width % 24 == 0) &&
+#if defined(HAS_SCALEROWDOWN34_MIPS_DSPR2)
+ if (TestCpuFlag(kCpuHasMIPS_DSPR2) && (dst_width % 24 == 0) &&
IS_ALIGNED(src_ptr, 4) && IS_ALIGNED(src_stride, 4) &&
IS_ALIGNED(dst_ptr, 4) && IS_ALIGNED(dst_stride, 4)) {
if (!filtering) {
- ScaleRowDown34_0 = ScaleRowDown34_DSPR2;
- ScaleRowDown34_1 = ScaleRowDown34_DSPR2;
+ ScaleRowDown34_0 = ScaleRowDown34_MIPS_DSPR2;
+ ScaleRowDown34_1 = ScaleRowDown34_MIPS_DSPR2;
} else {
- ScaleRowDown34_0 = ScaleRowDown34_0_Box_DSPR2;
- ScaleRowDown34_1 = ScaleRowDown34_1_Box_DSPR2;
+ ScaleRowDown34_0 = ScaleRowDown34_0_Box_MIPS_DSPR2;
+ ScaleRowDown34_1 = ScaleRowDown34_1_Box_MIPS_DSPR2;
}
}
#endif
@@ -404,16 +404,16 @@
}
}
#endif
-#if defined(HAS_SCALEROWDOWN34_16_DSPR2)
- if (TestCpuFlag(kCpuHasDSPR2) && (dst_width % 24 == 0) &&
+#if defined(HAS_SCALEROWDOWN34_16_MIPS_DSPR2)
+ if (TestCpuFlag(kCpuHasMIPS_DSPR2) && (dst_width % 24 == 0) &&
IS_ALIGNED(src_ptr, 4) && IS_ALIGNED(src_stride, 4) &&
IS_ALIGNED(dst_ptr, 4) && IS_ALIGNED(dst_stride, 4)) {
if (!filtering) {
- ScaleRowDown34_0 = ScaleRowDown34_16_DSPR2;
- ScaleRowDown34_1 = ScaleRowDown34_16_DSPR2;
+ ScaleRowDown34_0 = ScaleRowDown34_16_MIPS_DSPR2;
+ ScaleRowDown34_1 = ScaleRowDown34_16_MIPS_DSPR2;
} else {
- ScaleRowDown34_0 = ScaleRowDown34_0_Box_16_DSPR2;
- ScaleRowDown34_1 = ScaleRowDown34_1_Box_16_DSPR2;
+ ScaleRowDown34_0 = ScaleRowDown34_0_Box_16_MIPS_DSPR2;
+ ScaleRowDown34_1 = ScaleRowDown34_1_Box_16_MIPS_DSPR2;
}
}
#endif
@@ -517,16 +517,16 @@
}
}
#endif
-#if defined(HAS_SCALEROWDOWN38_DSPR2)
- if (TestCpuFlag(kCpuHasDSPR2) && (dst_width % 12 == 0) &&
+#if defined(HAS_SCALEROWDOWN38_MIPS_DSPR2)
+ if (TestCpuFlag(kCpuHasMIPS_DSPR2) && (dst_width % 12 == 0) &&
IS_ALIGNED(src_ptr, 4) && IS_ALIGNED(src_stride, 4) &&
IS_ALIGNED(dst_ptr, 4) && IS_ALIGNED(dst_stride, 4)) {
if (!filtering) {
- ScaleRowDown38_3 = ScaleRowDown38_DSPR2;
- ScaleRowDown38_2 = ScaleRowDown38_DSPR2;
+ ScaleRowDown38_3 = ScaleRowDown38_MIPS_DSPR2;
+ ScaleRowDown38_2 = ScaleRowDown38_MIPS_DSPR2;
} else {
- ScaleRowDown38_3 = ScaleRowDown38_3_Box_DSPR2;
- ScaleRowDown38_2 = ScaleRowDown38_2_Box_DSPR2;
+ ScaleRowDown38_3 = ScaleRowDown38_3_Box_MIPS_DSPR2;
+ ScaleRowDown38_2 = ScaleRowDown38_2_Box_MIPS_DSPR2;
}
}
#endif
@@ -595,16 +595,16 @@
}
}
#endif
-#if defined(HAS_SCALEROWDOWN38_16_DSPR2)
- if (TestCpuFlag(kCpuHasDSPR2) && (dst_width % 12 == 0) &&
+#if defined(HAS_SCALEROWDOWN38_16_MIPS_DSPR2)
+ if (TestCpuFlag(kCpuHasMIPS_DSPR2) && (dst_width % 12 == 0) &&
IS_ALIGNED(src_ptr, 4) && IS_ALIGNED(src_stride, 4) &&
IS_ALIGNED(dst_ptr, 4) && IS_ALIGNED(dst_stride, 4)) {
if (!filtering) {
- ScaleRowDown38_3 = ScaleRowDown38_16_DSPR2;
- ScaleRowDown38_2 = ScaleRowDown38_16_DSPR2;
+ ScaleRowDown38_3 = ScaleRowDown38_16_MIPS_DSPR2;
+ ScaleRowDown38_2 = ScaleRowDown38_16_MIPS_DSPR2;
} else {
- ScaleRowDown38_3 = ScaleRowDown38_3_Box_16_DSPR2;
- ScaleRowDown38_2 = ScaleRowDown38_2_Box_16_DSPR2;
+ ScaleRowDown38_3 = ScaleRowDown38_3_Box_16_MIPS_DSPR2;
+ ScaleRowDown38_2 = ScaleRowDown38_2_Box_16_MIPS_DSPR2;
}
}
#endif
@@ -659,6 +659,7 @@
int i;
int scaletbl[2];
int minboxwidth = dx >> 16;
+ int* scaleptr = scaletbl - minboxwidth;
int boxwidth;
scaletbl[0] = 65536 / (MIN1(minboxwidth) * boxheight);
scaletbl[1] = 65536 / (MIN1(minboxwidth + 1) * boxheight);
@@ -666,8 +667,7 @@
int ix = x >> 16;
x += dx;
boxwidth = MIN1((x >> 16) - ix);
- *dst_ptr++ = SumPixels(boxwidth, src_ptr + ix) *
- scaletbl[boxwidth - minboxwidth] >> 16;
+ *dst_ptr++ = SumPixels(boxwidth, src_ptr + ix) * scaleptr[boxwidth] >> 16;
}
}
@@ -676,6 +676,7 @@
int i;
int scaletbl[2];
int minboxwidth = dx >> 16;
+ int* scaleptr = scaletbl - minboxwidth;
int boxwidth;
scaletbl[0] = 65536 / (MIN1(minboxwidth) * boxheight);
scaletbl[1] = 65536 / (MIN1(minboxwidth + 1) * boxheight);
@@ -683,8 +684,8 @@
int ix = x >> 16;
x += dx;
boxwidth = MIN1((x >> 16) - ix);
- *dst_ptr++ = SumPixels_16(boxwidth, src_ptr + ix) *
- scaletbl[boxwidth - minboxwidth] >> 16;
+ *dst_ptr++ =
+ SumPixels_16(boxwidth, src_ptr + ix) * scaleptr[boxwidth] >> 16;
}
}
@@ -874,6 +875,14 @@
&x, &y, &dx, &dy);
src_width = Abs(src_width);
+#if defined(HAS_INTERPOLATEROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ InterpolateRow = InterpolateRow_Any_SSE2;
+ if (IS_ALIGNED(src_width, 16)) {
+ InterpolateRow = InterpolateRow_SSE2;
+ }
+ }
+#endif
#if defined(HAS_INTERPOLATEROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
InterpolateRow = InterpolateRow_Any_SSSE3;
@@ -898,11 +907,11 @@
}
}
#endif
-#if defined(HAS_INTERPOLATEROW_DSPR2)
- if (TestCpuFlag(kCpuHasDSPR2)) {
- InterpolateRow = InterpolateRow_Any_DSPR2;
+#if defined(HAS_INTERPOLATEROW_MIPS_DSPR2)
+ if (TestCpuFlag(kCpuHasMIPS_DSPR2)) {
+ InterpolateRow = InterpolateRow_Any_MIPS_DSPR2;
if (IS_ALIGNED(src_width, 4)) {
- InterpolateRow = InterpolateRow_DSPR2;
+ InterpolateRow = InterpolateRow_MIPS_DSPR2;
}
}
#endif
@@ -1002,11 +1011,11 @@
}
}
#endif
-#if defined(HAS_INTERPOLATEROW_16_DSPR2)
- if (TestCpuFlag(kCpuHasDSPR2)) {
- InterpolateRow = InterpolateRow_Any_16_DSPR2;
+#if defined(HAS_INTERPOLATEROW_16_MIPS_DSPR2)
+ if (TestCpuFlag(kCpuHasMIPS_DSPR2)) {
+ InterpolateRow = InterpolateRow_Any_16_MIPS_DSPR2;
if (IS_ALIGNED(src_width, 4)) {
- InterpolateRow = InterpolateRow_16_DSPR2;
+ InterpolateRow = InterpolateRow_16_MIPS_DSPR2;
}
}
#endif
@@ -1063,6 +1072,14 @@
&x, &y, &dx, &dy);
src_width = Abs(src_width);
+#if defined(HAS_INTERPOLATEROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ InterpolateRow = InterpolateRow_Any_SSE2;
+ if (IS_ALIGNED(dst_width, 16)) {
+ InterpolateRow = InterpolateRow_SSE2;
+ }
+ }
+#endif
#if defined(HAS_INTERPOLATEROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
InterpolateRow = InterpolateRow_Any_SSSE3;
@@ -1087,11 +1104,11 @@
}
}
#endif
-#if defined(HAS_INTERPOLATEROW_DSPR2)
- if (TestCpuFlag(kCpuHasDSPR2)) {
- InterpolateRow = InterpolateRow_Any_DSPR2;
+#if defined(HAS_INTERPOLATEROW_MIPS_DSPR2)
+ if (TestCpuFlag(kCpuHasMIPS_DSPR2)) {
+ InterpolateRow = InterpolateRow_Any_MIPS_DSPR2;
if (IS_ALIGNED(dst_width, 4)) {
- InterpolateRow = InterpolateRow_DSPR2;
+ InterpolateRow = InterpolateRow_MIPS_DSPR2;
}
}
#endif
@@ -1226,11 +1243,11 @@
}
}
#endif
-#if defined(HAS_INTERPOLATEROW_16_DSPR2)
- if (TestCpuFlag(kCpuHasDSPR2)) {
- InterpolateRow = InterpolateRow_Any_16_DSPR2;
+#if defined(HAS_INTERPOLATEROW_16_MIPS_DSPR2)
+ if (TestCpuFlag(kCpuHasMIPS_DSPR2)) {
+ InterpolateRow = InterpolateRow_Any_16_MIPS_DSPR2;
if (IS_ALIGNED(dst_width, 4)) {
- InterpolateRow = InterpolateRow_16_DSPR2;
+ InterpolateRow = InterpolateRow_16_MIPS_DSPR2;
}
}
#endif
--- a/third_party/libyuv/source/scale_any.cc
+++ b/third_party/libyuv/source/scale_any.cc
@@ -55,29 +55,12 @@
dst_ptr + n * BPP, r); \
}
-// Fixed scale down for odd source width. Used by I420Blend subsampling.
-// Since dst_width is (width + 1) / 2, this function scales one less pixel
-// and copies the last pixel.
-#define SDODD(NAMEANY, SCALEROWDOWN_SIMD, SCALEROWDOWN_C, FACTOR, BPP, MASK) \
- void NAMEANY(const uint8* src_ptr, ptrdiff_t src_stride, \
- uint8* dst_ptr, int dst_width) { \
- int r = (int)((unsigned int)(dst_width - 1) % (MASK + 1)); \
- int n = dst_width - r; \
- if (n > 0) { \
- SCALEROWDOWN_SIMD(src_ptr, src_stride, dst_ptr, n); \
- } \
- SCALEROWDOWN_C(src_ptr + (n * FACTOR) * BPP, src_stride, \
- dst_ptr + n * BPP, r); \
- }
-
-#ifdef HAS_SCALEROWDOWN2_SSSE3
-SDANY(ScaleRowDown2_Any_SSSE3, ScaleRowDown2_SSSE3, ScaleRowDown2_C, 2, 1, 15)
-SDANY(ScaleRowDown2Linear_Any_SSSE3, ScaleRowDown2Linear_SSSE3,
+#ifdef HAS_SCALEROWDOWN2_SSE2
+SDANY(ScaleRowDown2_Any_SSE2, ScaleRowDown2_SSE2, ScaleRowDown2_C, 2, 1, 15)
+SDANY(ScaleRowDown2Linear_Any_SSE2, ScaleRowDown2Linear_SSE2,
ScaleRowDown2Linear_C, 2, 1, 15)
-SDANY(ScaleRowDown2Box_Any_SSSE3, ScaleRowDown2Box_SSSE3, ScaleRowDown2Box_C,
+SDANY(ScaleRowDown2Box_Any_SSE2, ScaleRowDown2Box_SSE2, ScaleRowDown2Box_C,
2, 1, 15)
-SDODD(ScaleRowDown2Box_Odd_SSSE3, ScaleRowDown2Box_SSSE3,
- ScaleRowDown2Box_Odd_C, 2, 1, 15)
#endif
#ifdef HAS_SCALEROWDOWN2_AVX2
SDANY(ScaleRowDown2_Any_AVX2, ScaleRowDown2_AVX2, ScaleRowDown2_C, 2, 1, 31)
@@ -85,8 +68,6 @@
ScaleRowDown2Linear_C, 2, 1, 31)
SDANY(ScaleRowDown2Box_Any_AVX2, ScaleRowDown2Box_AVX2, ScaleRowDown2Box_C,
2, 1, 31)
-SDODD(ScaleRowDown2Box_Odd_AVX2, ScaleRowDown2Box_AVX2, ScaleRowDown2Box_Odd_C,
- 2, 1, 31)
#endif
#ifdef HAS_SCALEROWDOWN2_NEON
SDANY(ScaleRowDown2_Any_NEON, ScaleRowDown2_NEON, ScaleRowDown2_C, 2, 1, 15)
@@ -94,12 +75,10 @@
ScaleRowDown2Linear_C, 2, 1, 15)
SDANY(ScaleRowDown2Box_Any_NEON, ScaleRowDown2Box_NEON,
ScaleRowDown2Box_C, 2, 1, 15)
-SDODD(ScaleRowDown2Box_Odd_NEON, ScaleRowDown2Box_NEON,
- ScaleRowDown2Box_Odd_C, 2, 1, 15)
#endif
-#ifdef HAS_SCALEROWDOWN4_SSSE3
-SDANY(ScaleRowDown4_Any_SSSE3, ScaleRowDown4_SSSE3, ScaleRowDown4_C, 4, 1, 7)
-SDANY(ScaleRowDown4Box_Any_SSSE3, ScaleRowDown4Box_SSSE3, ScaleRowDown4Box_C,
+#ifdef HAS_SCALEROWDOWN4_SSE2
+SDANY(ScaleRowDown4_Any_SSE2, ScaleRowDown4_SSE2, ScaleRowDown4_C, 4, 1, 7)
+SDANY(ScaleRowDown4Box_Any_SSE2, ScaleRowDown4Box_SSE2, ScaleRowDown4Box_C,
4, 1, 7)
#endif
#ifdef HAS_SCALEROWDOWN4_AVX2
--- a/third_party/libyuv/source/scale_argb.cc
+++ b/third_party/libyuv/source/scale_argb.cc
@@ -210,6 +210,14 @@
clip_src_width = (int)(xr - xl) * 4; // Width aligned to 4.
src_argb += xl * 4;
x -= (int)(xl << 16);
+#if defined(HAS_INTERPOLATEROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ InterpolateRow = InterpolateRow_Any_SSE2;
+ if (IS_ALIGNED(clip_src_width, 16)) {
+ InterpolateRow = InterpolateRow_SSE2;
+ }
+ }
+#endif
#if defined(HAS_INTERPOLATEROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
InterpolateRow = InterpolateRow_Any_SSSE3;
@@ -234,12 +242,12 @@
}
}
#endif
-#if defined(HAS_INTERPOLATEROW_DSPR2)
- if (TestCpuFlag(kCpuHasDSPR2) &&
+#if defined(HAS_INTERPOLATEROW_MIPS_DSPR2)
+ if (TestCpuFlag(kCpuHasMIPS_DSPR2) &&
IS_ALIGNED(src_argb, 4) && IS_ALIGNED(src_stride, 4)) {
- InterpolateRow = InterpolateRow_Any_DSPR2;
+ InterpolateRow = InterpolateRow_Any_MIPS_DSPR2;
if (IS_ALIGNED(clip_src_width, 4)) {
- InterpolateRow = InterpolateRow_DSPR2;
+ InterpolateRow = InterpolateRow_MIPS_DSPR2;
}
}
#endif
@@ -300,6 +308,14 @@
int dst_width, int x, int dx) =
filtering ? ScaleARGBFilterCols_C : ScaleARGBCols_C;
const int max_y = (src_height - 1) << 16;
+#if defined(HAS_INTERPOLATEROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ InterpolateRow = InterpolateRow_Any_SSE2;
+ if (IS_ALIGNED(dst_width, 4)) {
+ InterpolateRow = InterpolateRow_SSE2;
+ }
+ }
+#endif
#if defined(HAS_INTERPOLATEROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
InterpolateRow = InterpolateRow_Any_SSSE3;
@@ -324,10 +340,10 @@
}
}
#endif
-#if defined(HAS_INTERPOLATEROW_DSPR2)
- if (TestCpuFlag(kCpuHasDSPR2) &&
+#if defined(HAS_INTERPOLATEROW_MIPS_DSPR2)
+ if (TestCpuFlag(kCpuHasMIPS_DSPR2) &&
IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride, 4)) {
- InterpolateRow = InterpolateRow_DSPR2;
+ InterpolateRow = InterpolateRow_MIPS_DSPR2;
}
#endif
if (src_width >= 32768) {
@@ -465,13 +481,13 @@
}
}
#endif
-#if defined(HAS_I422TOARGBROW_DSPR2)
- if (TestCpuFlag(kCpuHasDSPR2) && IS_ALIGNED(src_width, 4) &&
+#if defined(HAS_I422TOARGBROW_MIPS_DSPR2)
+ if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(src_width, 4) &&
IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) &&
IS_ALIGNED(src_u, 2) && IS_ALIGNED(src_stride_u, 2) &&
IS_ALIGNED(src_v, 2) && IS_ALIGNED(src_stride_v, 2) &&
IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride_argb, 4)) {
- I422ToARGBRow = I422ToARGBRow_DSPR2;
+ I422ToARGBRow = I422ToARGBRow_MIPS_DSPR2;
}
#endif
@@ -478,6 +494,14 @@
void (*InterpolateRow)(uint8* dst_argb, const uint8* src_argb,
ptrdiff_t src_stride, int dst_width, int source_y_fraction) =
InterpolateRow_C;
+#if defined(HAS_INTERPOLATEROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ InterpolateRow = InterpolateRow_Any_SSE2;
+ if (IS_ALIGNED(dst_width, 4)) {
+ InterpolateRow = InterpolateRow_SSE2;
+ }
+ }
+#endif
#if defined(HAS_INTERPOLATEROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
InterpolateRow = InterpolateRow_Any_SSSE3;
@@ -502,10 +526,10 @@
}
}
#endif
-#if defined(HAS_INTERPOLATEROW_DSPR2)
- if (TestCpuFlag(kCpuHasDSPR2) &&
+#if defined(HAS_INTERPOLATEROW_MIPS_DSPR2)
+ if (TestCpuFlag(kCpuHasMIPS_DSPR2) &&
IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride_argb, 4)) {
- InterpolateRow = InterpolateRow_DSPR2;
+ InterpolateRow = InterpolateRow_MIPS_DSPR2;
}
#endif
@@ -821,36 +845,6 @@
dst_argb, dst_stride_argb, dst_width, dst_height,
0, 0, dst_width, dst_height, filtering);
return 0;
-}
-
-// Scale with YUV conversion to ARGB and clipping.
-LIBYUV_API
-int YUVToARGBScaleClip(const uint8* src_y, int src_stride_y,
- const uint8* src_u, int src_stride_u,
- const uint8* src_v, int src_stride_v,
- uint32 src_fourcc,
- int src_width, int src_height,
- uint8* dst_argb, int dst_stride_argb,
- uint32 dst_fourcc,
- int dst_width, int dst_height,
- int clip_x, int clip_y, int clip_width, int clip_height,
- enum FilterMode filtering) {
- uint8* argb_buffer = (uint8*)malloc(src_width * src_height * 4);
- int r;
- I420ToARGB(src_y, src_stride_y,
- src_u, src_stride_u,
- src_v, src_stride_v,
- argb_buffer, src_width * 4,
- src_width, src_height);
-
- r = ARGBScaleClip(argb_buffer, src_width * 4,
- src_width, src_height,
- dst_argb, dst_stride_argb,
- dst_width, dst_height,
- clip_x, clip_y, clip_width, clip_height,
- filtering);
- free(argb_buffer);
- return r;
}
#ifdef __cplusplus
--- a/third_party/libyuv/source/scale_common.cc
+++ b/third_party/libyuv/source/scale_common.cc
@@ -103,28 +103,6 @@
}
}
-void ScaleRowDown2Box_Odd_C(const uint8* src_ptr, ptrdiff_t src_stride,
- uint8* dst, int dst_width) {
- const uint8* s = src_ptr;
- const uint8* t = src_ptr + src_stride;
- int x;
- dst_width -= 1;
- for (x = 0; x < dst_width - 1; x += 2) {
- dst[0] = (s[0] + s[1] + t[0] + t[1] + 2) >> 2;
- dst[1] = (s[2] + s[3] + t[2] + t[3] + 2) >> 2;
- dst += 2;
- s += 4;
- t += 4;
- }
- if (dst_width & 1) {
- dst[0] = (s[0] + s[1] + t[0] + t[1] + 2) >> 2;
- dst += 1;
- s += 2;
- t += 2;
- }
- dst[0] = (s[0] + t[0] + 1) >> 1;
-}
-
void ScaleRowDown2Box_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
uint16* dst, int dst_width) {
const uint16* s = src_ptr;
@@ -417,14 +395,8 @@
}
// (1-f)a + fb can be replaced with a + f(b-a)
-#if defined(__arm__) || defined(__aarch64__)
#define BLENDER(a, b, f) (uint8)((int)(a) + \
- ((((int)((f)) * ((int)(b) - (int)(a))) + 0x8000) >> 16))
-#else
-// inteluses 7 bit math with rounding.
-#define BLENDER(a, b, f) (uint8)((int)(a) + \
- (((int)((f) >> 9) * ((int)(b) - (int)(a)) + 0x40) >> 7))
-#endif
+ ((int)(f) * ((int)(b) - (int)(a)) >> 16))
void ScaleFilterCols_C(uint8* dst_ptr, const uint8* src_ptr,
int dst_width, int x, int dx) {
@@ -476,9 +448,8 @@
}
#undef BLENDER
-// Same as 8 bit arm blender but return is cast to uint16
#define BLENDER(a, b, f) (uint16)((int)(a) + \
- ((((int)((f)) * ((int)(b) - (int)(a))) + 0x8000) >> 16))
+ ((int)(f) * ((int)(b) - (int)(a)) >> 16))
void ScaleFilterCols_16_C(uint16* dst_ptr, const uint16* src_ptr,
int dst_width, int x, int dx) {
@@ -816,7 +787,6 @@
}
}
-// TODO(fbarchard): Replace 0x7f ^ f with 128-f. bug=607.
// Mimics SSSE3 blender
#define BLENDER1(a, b, f) ((a) * (0x7f ^ f) + (b) * f) >> 7
#define BLENDERC(a, b, f, s) (uint32)( \
@@ -906,6 +876,14 @@
assert(dst_width > 0);
assert(dst_height > 0);
src_argb += (x >> 16) * bpp;
+#if defined(HAS_INTERPOLATEROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ InterpolateRow = InterpolateRow_Any_SSE2;
+ if (IS_ALIGNED(dst_width_bytes, 16)) {
+ InterpolateRow = InterpolateRow_SSE2;
+ }
+ }
+#endif
#if defined(HAS_INTERPOLATEROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
InterpolateRow = InterpolateRow_Any_SSSE3;
@@ -930,13 +908,13 @@
}
}
#endif
-#if defined(HAS_INTERPOLATEROW_DSPR2)
- if (TestCpuFlag(kCpuHasDSPR2) &&
+#if defined(HAS_INTERPOLATEROW_MIPS_DSPR2)
+ if (TestCpuFlag(kCpuHasMIPS_DSPR2) &&
IS_ALIGNED(src_argb, 4) && IS_ALIGNED(src_stride, 4) &&
IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride, 4)) {
- InterpolateRow = InterpolateRow_Any_DSPR2;
+ InterpolateRow = InterpolateRow_Any_MIPS_DSPR2;
if (IS_ALIGNED(dst_width_bytes, 4)) {
- InterpolateRow = InterpolateRow_DSPR2;
+ InterpolateRow = InterpolateRow_MIPS_DSPR2;
}
}
#endif
@@ -1004,13 +982,13 @@
}
}
#endif
-#if defined(HAS_INTERPOLATEROW_16_DSPR2)
- if (TestCpuFlag(kCpuHasDSPR2) &&
+#if defined(HAS_INTERPOLATEROW_16_MIPS_DSPR2)
+ if (TestCpuFlag(kCpuHasMIPS_DSPR2) &&
IS_ALIGNED(src_argb, 4) && IS_ALIGNED(src_stride, 4) &&
IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride, 4)) {
- InterpolateRow = InterpolateRow_Any_16_DSPR2;
+ InterpolateRow = InterpolateRow_Any_16_MIPS_DSPR2;
if (IS_ALIGNED(dst_width_bytes, 4)) {
- InterpolateRow = InterpolateRow_16_DSPR2;
+ InterpolateRow = InterpolateRow_16_MIPS_DSPR2;
}
}
#endif
--- a/third_party/libyuv/source/scale_gcc.cc
+++ b/third_party/libyuv/source/scale_gcc.cc
@@ -9,7 +9,6 @@
*/
#include "libyuv/row.h"
-#include "libyuv/scale_row.h"
#ifdef __cplusplus
namespace libyuv {
@@ -17,8 +16,7 @@
#endif
// This module is for GCC x86 and x64.
-#if !defined(LIBYUV_DISABLE_X86) && \
- (defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER)))
+#if !defined(LIBYUV_DISABLE_X86) && (defined(__x86_64__) || defined(__i386__))
// Offsets for source bytes 0 to 9
static uvec8 kShuf0 =
@@ -98,8 +96,8 @@
// Generated using gcc disassembly on Visual C object file:
// objdump -D yuvscaler.obj >yuvscaler.txt
-void ScaleRowDown2_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
- uint8* dst_ptr, int dst_width) {
+void ScaleRowDown2_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width) {
asm volatile (
LABELALIGN
"1: \n"
@@ -120,13 +118,11 @@
);
}
-void ScaleRowDown2Linear_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
- uint8* dst_ptr, int dst_width) {
+void ScaleRowDown2Linear_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width) {
asm volatile (
- "pcmpeqb %%xmm4,%%xmm4 \n"
- "psrlw $0xf,%%xmm4 \n"
- "packuswb %%xmm4,%%xmm4 \n"
- "pxor %%xmm5,%%xmm5 \n"
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "psrlw $0x8,%%xmm5 \n"
LABELALIGN
"1: \n"
@@ -133,11 +129,15 @@
"movdqu " MEMACCESS(0) ",%%xmm0 \n"
"movdqu " MEMACCESS2(0x10, 0) ",%%xmm1 \n"
"lea " MEMLEA(0x20,0) ",%0 \n"
- "pmaddubsw %%xmm4,%%xmm0 \n"
- "pmaddubsw %%xmm4,%%xmm1 \n"
- "pavgw %%xmm5,%%xmm0 \n"
- "pavgw %%xmm5,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm0 \n"
+ "movdqa %%xmm0,%%xmm2 \n"
+ "psrlw $0x8,%%xmm0 \n"
+ "movdqa %%xmm1,%%xmm3 \n"
+ "psrlw $0x8,%%xmm1 \n"
+ "pand %%xmm5,%%xmm2 \n"
+ "pand %%xmm5,%%xmm3 \n"
+ "pavgw %%xmm2,%%xmm0 \n"
+ "pavgw %%xmm3,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
"movdqu %%xmm0," MEMACCESS(1) " \n"
"lea " MEMLEA(0x10,1) ",%1 \n"
"sub $0x10,%2 \n"
@@ -145,17 +145,15 @@
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
- :: "memory", "cc", "xmm0", "xmm1", "xmm4", "xmm5"
+ :: "memory", "cc", "xmm0", "xmm1", "xmm5"
);
}
-void ScaleRowDown2Box_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
- uint8* dst_ptr, int dst_width) {
+void ScaleRowDown2Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width) {
asm volatile (
- "pcmpeqb %%xmm4,%%xmm4 \n"
- "psrlw $0xf,%%xmm4 \n"
- "packuswb %%xmm4,%%xmm4 \n"
- "pxor %%xmm5,%%xmm5 \n"
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "psrlw $0x8,%%xmm5 \n"
LABELALIGN
"1: \n"
@@ -164,17 +162,17 @@
MEMOPREG(movdqu,0x00,0,3,1,xmm2) // movdqu (%0,%3,1),%%xmm2
MEMOPREG(movdqu,0x10,0,3,1,xmm3) // movdqu 0x10(%0,%3,1),%%xmm3
"lea " MEMLEA(0x20,0) ",%0 \n"
- "pmaddubsw %%xmm4,%%xmm0 \n"
- "pmaddubsw %%xmm4,%%xmm1 \n"
- "pmaddubsw %%xmm4,%%xmm2 \n"
- "pmaddubsw %%xmm4,%%xmm3 \n"
- "paddw %%xmm2,%%xmm0 \n"
- "paddw %%xmm3,%%xmm1 \n"
- "psrlw $0x1,%%xmm0 \n"
- "psrlw $0x1,%%xmm1 \n"
- "pavgw %%xmm5,%%xmm0 \n"
- "pavgw %%xmm5,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm0 \n"
+ "pavgb %%xmm2,%%xmm0 \n"
+ "pavgb %%xmm3,%%xmm1 \n"
+ "movdqa %%xmm0,%%xmm2 \n"
+ "psrlw $0x8,%%xmm0 \n"
+ "movdqa %%xmm1,%%xmm3 \n"
+ "psrlw $0x8,%%xmm1 \n"
+ "pand %%xmm5,%%xmm2 \n"
+ "pand %%xmm5,%%xmm3 \n"
+ "pavgw %%xmm2,%%xmm0 \n"
+ "pavgw %%xmm3,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
"movdqu %%xmm0," MEMACCESS(1) " \n"
"lea " MEMLEA(0x10,1) ",%1 \n"
"sub $0x10,%2 \n"
@@ -188,107 +186,9 @@
);
}
-#ifdef HAS_SCALEROWDOWN2_AVX2
-void ScaleRowDown2_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
+void ScaleRowDown4_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
asm volatile (
- LABELALIGN
- "1: \n"
- "vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
- "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n"
- "lea " MEMLEA(0x40,0) ",%0 \n"
- "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
- "vpsrlw $0x8,%%ymm1,%%ymm1 \n"
- "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
- "vpermq $0xd8,%%ymm0,%%ymm0 \n"
- "vmovdqu %%ymm0," MEMACCESS(1) " \n"
- "lea " MEMLEA(0x20,1) ",%1 \n"
- "sub $0x20,%2 \n"
- "jg 1b \n"
- "vzeroupper \n"
- : "+r"(src_ptr), // %0
- "+r"(dst_ptr), // %1
- "+r"(dst_width) // %2
- :: "memory", "cc", "xmm0", "xmm1"
- );
-}
-
-void ScaleRowDown2Linear_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
- uint8* dst_ptr, int dst_width) {
- asm volatile (
- "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n"
- "vpsrlw $0xf,%%ymm4,%%ymm4 \n"
- "vpackuswb %%ymm4,%%ymm4,%%ymm4 \n"
- "vpxor %%ymm5,%%ymm5,%%ymm5 \n"
-
- LABELALIGN
- "1: \n"
- "vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
- "vmovdqu " MEMACCESS2(0x20, 0) ",%%ymm1 \n"
- "lea " MEMLEA(0x40,0) ",%0 \n"
- "vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n"
- "vpmaddubsw %%ymm4,%%ymm1,%%ymm1 \n"
- "vpavgw %%ymm5,%%ymm0,%%ymm0 \n"
- "vpavgw %%ymm5,%%ymm1,%%ymm1 \n"
- "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
- "vpermq $0xd8,%%ymm0,%%ymm0 \n"
- "vmovdqu %%ymm0," MEMACCESS(1) " \n"
- "lea " MEMLEA(0x20,1) ",%1 \n"
- "sub $0x20,%2 \n"
- "jg 1b \n"
- "vzeroupper \n"
- : "+r"(src_ptr), // %0
- "+r"(dst_ptr), // %1
- "+r"(dst_width) // %2
- :: "memory", "cc", "xmm0", "xmm1", "xmm4", "xmm5"
- );
-}
-
-void ScaleRowDown2Box_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
- uint8* dst_ptr, int dst_width) {
- asm volatile (
- "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n"
- "vpsrlw $0xf,%%ymm4,%%ymm4 \n"
- "vpackuswb %%ymm4,%%ymm4,%%ymm4 \n"
- "vpxor %%ymm5,%%ymm5,%%ymm5 \n"
-
- LABELALIGN
- "1: \n"
- "vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
- "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n"
- MEMOPREG(vmovdqu,0x00,0,3,1,ymm2) // vmovdqu (%0,%3,1),%%ymm2
- MEMOPREG(vmovdqu,0x20,0,3,1,ymm3) // vmovdqu 0x20(%0,%3,1),%%ymm3
- "lea " MEMLEA(0x40,0) ",%0 \n"
- "vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n"
- "vpmaddubsw %%ymm4,%%ymm1,%%ymm1 \n"
- "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n"
- "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n"
- "vpaddw %%ymm2,%%ymm0,%%ymm0 \n"
- "vpaddw %%ymm3,%%ymm1,%%ymm1 \n"
- "vpsrlw $0x1,%%ymm0,%%ymm0 \n"
- "vpsrlw $0x1,%%ymm1,%%ymm1 \n"
- "vpavgw %%ymm5,%%ymm0,%%ymm0 \n"
- "vpavgw %%ymm5,%%ymm1,%%ymm1 \n"
- "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
- "vpermq $0xd8,%%ymm0,%%ymm0 \n"
- "vmovdqu %%ymm0," MEMACCESS(1) " \n"
- "lea " MEMLEA(0x20,1) ",%1 \n"
- "sub $0x20,%2 \n"
- "jg 1b \n"
- "vzeroupper \n"
- : "+r"(src_ptr), // %0
- "+r"(dst_ptr), // %1
- "+r"(dst_width) // %2
- : "r"((intptr_t)(src_stride)) // %3
- : "memory", "cc", NACL_R14
- "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
- );
-}
-#endif // HAS_SCALEROWDOWN2_AVX2
-
-void ScaleRowDown4_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
- uint8* dst_ptr, int dst_width) {
- asm volatile (
"pcmpeqb %%xmm5,%%xmm5 \n"
"psrld $0x18,%%xmm5 \n"
"pslld $0x10,%%xmm5 \n"
@@ -314,15 +214,12 @@
);
}
-void ScaleRowDown4Box_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
+void ScaleRowDown4Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
- intptr_t stridex3;
+ intptr_t stridex3 = 0;
asm volatile (
- "pcmpeqb %%xmm4,%%xmm4 \n"
- "psrlw $0xf,%%xmm4 \n"
- "movdqa %%xmm4,%%xmm5 \n"
- "packuswb %%xmm4,%%xmm4 \n"
- "psllw $0x3,%%xmm5 \n"
+ "pcmpeqb %%xmm7,%%xmm7 \n"
+ "psrlw $0x8,%%xmm7 \n"
"lea " MEMLEA4(0x00,4,4,2) ",%3 \n"
LABELALIGN
@@ -331,29 +228,31 @@
"movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
MEMOPREG(movdqu,0x00,0,4,1,xmm2) // movdqu (%0,%4,1),%%xmm2
MEMOPREG(movdqu,0x10,0,4,1,xmm3) // movdqu 0x10(%0,%4,1),%%xmm3
- "pmaddubsw %%xmm4,%%xmm0 \n"
- "pmaddubsw %%xmm4,%%xmm1 \n"
- "pmaddubsw %%xmm4,%%xmm2 \n"
- "pmaddubsw %%xmm4,%%xmm3 \n"
- "paddw %%xmm2,%%xmm0 \n"
- "paddw %%xmm3,%%xmm1 \n"
+ "pavgb %%xmm2,%%xmm0 \n"
+ "pavgb %%xmm3,%%xmm1 \n"
MEMOPREG(movdqu,0x00,0,4,2,xmm2) // movdqu (%0,%4,2),%%xmm2
MEMOPREG(movdqu,0x10,0,4,2,xmm3) // movdqu 0x10(%0,%4,2),%%xmm3
- "pmaddubsw %%xmm4,%%xmm2 \n"
- "pmaddubsw %%xmm4,%%xmm3 \n"
- "paddw %%xmm2,%%xmm0 \n"
- "paddw %%xmm3,%%xmm1 \n"
- MEMOPREG(movdqu,0x00,0,3,1,xmm2) // movdqu (%0,%3,1),%%xmm2
- MEMOPREG(movdqu,0x10,0,3,1,xmm3) // movdqu 0x10(%0,%3,1),%%xmm3
+ MEMOPREG(movdqu,0x00,0,3,1,xmm4) // movdqu (%0,%3,1),%%xmm4
+ MEMOPREG(movdqu,0x10,0,3,1,xmm5) // movdqu 0x10(%0,%3,1),%%xmm5
"lea " MEMLEA(0x20,0) ",%0 \n"
- "pmaddubsw %%xmm4,%%xmm2 \n"
- "pmaddubsw %%xmm4,%%xmm3 \n"
- "paddw %%xmm2,%%xmm0 \n"
- "paddw %%xmm3,%%xmm1 \n"
- "phaddw %%xmm1,%%xmm0 \n"
- "paddw %%xmm5,%%xmm0 \n"
- "psrlw $0x4,%%xmm0 \n"
- "packuswb %%xmm0,%%xmm0 \n"
+ "pavgb %%xmm4,%%xmm2 \n"
+ "pavgb %%xmm2,%%xmm0 \n"
+ "pavgb %%xmm5,%%xmm3 \n"
+ "pavgb %%xmm3,%%xmm1 \n"
+ "movdqa %%xmm0,%%xmm2 \n"
+ "psrlw $0x8,%%xmm0 \n"
+ "movdqa %%xmm1,%%xmm3 \n"
+ "psrlw $0x8,%%xmm1 \n"
+ "pand %%xmm7,%%xmm2 \n"
+ "pand %%xmm7,%%xmm3 \n"
+ "pavgw %%xmm2,%%xmm0 \n"
+ "pavgw %%xmm3,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "movdqa %%xmm0,%%xmm2 \n"
+ "psrlw $0x8,%%xmm0 \n"
+ "pand %%xmm7,%%xmm2 \n"
+ "pavgw %%xmm2,%%xmm0 \n"
+ "packuswb %%xmm0,%%xmm0 \n"
"movq %%xmm0," MEMACCESS(1) " \n"
"lea " MEMLEA(0x8,1) ",%1 \n"
"sub $0x8,%2 \n"
@@ -361,100 +260,13 @@
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width), // %2
- "=&r"(stridex3) // %3
+ "+r"(stridex3) // %3
: "r"((intptr_t)(src_stride)) // %4
: "memory", "cc", NACL_R14
- "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm7"
);
}
-
-#ifdef HAS_SCALEROWDOWN4_AVX2
-void ScaleRowDown4_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
- uint8* dst_ptr, int dst_width) {
- asm volatile (
- "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
- "vpsrld $0x18,%%ymm5,%%ymm5 \n"
- "vpslld $0x10,%%ymm5,%%ymm5 \n"
- LABELALIGN
- "1: \n"
- "vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
- "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n"
- "lea " MEMLEA(0x40,0) ",%0 \n"
- "vpand %%ymm5,%%ymm0,%%ymm0 \n"
- "vpand %%ymm5,%%ymm1,%%ymm1 \n"
- "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
- "vpermq $0xd8,%%ymm0,%%ymm0 \n"
- "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
- "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n"
- "vpermq $0xd8,%%ymm0,%%ymm0 \n"
- "vmovdqu %%xmm0," MEMACCESS(1) " \n"
- "lea " MEMLEA(0x10,1) ",%1 \n"
- "sub $0x10,%2 \n"
- "jg 1b \n"
- "vzeroupper \n"
- : "+r"(src_ptr), // %0
- "+r"(dst_ptr), // %1
- "+r"(dst_width) // %2
- :: "memory", "cc", "xmm0", "xmm1", "xmm5"
- );
-}
-
-void ScaleRowDown4Box_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
- uint8* dst_ptr, int dst_width) {
- asm volatile (
- "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n"
- "vpsrlw $0xf,%%ymm4,%%ymm4 \n"
- "vpsllw $0x3,%%ymm4,%%ymm5 \n"
- "vpackuswb %%ymm4,%%ymm4,%%ymm4 \n"
-
- LABELALIGN
- "1: \n"
- "vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
- "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n"
- MEMOPREG(vmovdqu,0x00,0,3,1,ymm2) // vmovdqu (%0,%3,1),%%ymm2
- MEMOPREG(vmovdqu,0x20,0,3,1,ymm3) // vmovdqu 0x20(%0,%3,1),%%ymm3
- "vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n"
- "vpmaddubsw %%ymm4,%%ymm1,%%ymm1 \n"
- "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n"
- "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n"
- "vpaddw %%ymm2,%%ymm0,%%ymm0 \n"
- "vpaddw %%ymm3,%%ymm1,%%ymm1 \n"
- MEMOPREG(vmovdqu,0x00,0,3,2,ymm2) // vmovdqu (%0,%3,2),%%ymm2
- MEMOPREG(vmovdqu,0x20,0,3,2,ymm3) // vmovdqu 0x20(%0,%3,2),%%ymm3
- "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n"
- "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n"
- "vpaddw %%ymm2,%%ymm0,%%ymm0 \n"
- "vpaddw %%ymm3,%%ymm1,%%ymm1 \n"
- MEMOPREG(vmovdqu,0x00,0,4,1,ymm2) // vmovdqu (%0,%4,1),%%ymm2
- MEMOPREG(vmovdqu,0x20,0,4,1,ymm3) // vmovdqu 0x20(%0,%4,1),%%ymm3
- "lea " MEMLEA(0x40,0) ",%0 \n"
- "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n"
- "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n"
- "vpaddw %%ymm2,%%ymm0,%%ymm0 \n"
- "vpaddw %%ymm3,%%ymm1,%%ymm1 \n"
- "vphaddw %%ymm1,%%ymm0,%%ymm0 \n"
- "vpermq $0xd8,%%ymm0,%%ymm0 \n"
- "vpaddw %%ymm5,%%ymm0,%%ymm0 \n"
- "vpsrlw $0x4,%%ymm0,%%ymm0 \n"
- "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n"
- "vpermq $0xd8,%%ymm0,%%ymm0 \n"
- "vmovdqu %%xmm0," MEMACCESS(1) " \n"
- "lea " MEMLEA(0x10,1) ",%1 \n"
- "sub $0x10,%2 \n"
- "jg 1b \n"
- "vzeroupper \n"
- : "+r"(src_ptr), // %0
- "+r"(dst_ptr), // %1
- "+r"(dst_width) // %2
- : "r"((intptr_t)(src_stride)), // %3
- "r"((intptr_t)(src_stride * 3)) // %4
- : "memory", "cc", NACL_R14
- "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
- );
-}
-#endif // HAS_SCALEROWDOWN4_AVX2
-
void ScaleRowDown34_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
asm volatile (
@@ -762,79 +574,54 @@
}
// Reads 16xN bytes and produces 16 shorts at a time.
-void ScaleAddRow_SSE2(const uint8* src_ptr, uint16* dst_ptr, int src_width) {
+void ScaleAddRows_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint16* dst_ptr, int src_width, int src_height) {
+ int tmp_height = 0;
+ intptr_t tmp_src = 0;
asm volatile (
- "pxor %%xmm5,%%xmm5 \n"
+ "mov %0,%3 \n" // row pointer
+ "mov %5,%2 \n" // height
+ "pxor %%xmm0,%%xmm0 \n" // clear accumulators
+ "pxor %%xmm1,%%xmm1 \n"
+ "pxor %%xmm4,%%xmm4 \n"
LABELALIGN
"1: \n"
- "movdqu " MEMACCESS(0) ",%%xmm3 \n"
- "lea " MEMLEA(0x10,0) ",%0 \n" // src_ptr += 16
- "movdqu " MEMACCESS(1) ",%%xmm0 \n"
- "movdqu " MEMACCESS2(0x10,1) ",%%xmm1 \n"
- "movdqa %%xmm3,%%xmm2 \n"
- "punpcklbw %%xmm5,%%xmm2 \n"
- "punpckhbw %%xmm5,%%xmm3 \n"
+ "movdqu " MEMACCESS(3) ",%%xmm2 \n"
+ "add %6,%3 \n"
+ "movdqa %%xmm2,%%xmm3 \n"
+ "punpcklbw %%xmm4,%%xmm2 \n"
+ "punpckhbw %%xmm4,%%xmm3 \n"
"paddusw %%xmm2,%%xmm0 \n"
"paddusw %%xmm3,%%xmm1 \n"
+ "sub $0x1,%2 \n"
+ "jg 1b \n"
+
"movdqu %%xmm0," MEMACCESS(1) " \n"
"movdqu %%xmm1," MEMACCESS2(0x10,1) " \n"
"lea " MEMLEA(0x20,1) ",%1 \n"
- "sub $0x10,%2 \n"
+ "lea " MEMLEA(0x10,0) ",%0 \n" // src_ptr += 16
+ "mov %0,%3 \n" // row pointer
+ "mov %5,%2 \n" // height
+ "pxor %%xmm0,%%xmm0 \n" // clear accumulators
+ "pxor %%xmm1,%%xmm1 \n"
+ "sub $0x10,%4 \n"
"jg 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
- "+r"(src_width) // %2
- :
- : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
+ "+r"(tmp_height), // %2
+ "+r"(tmp_src), // %3
+ "+r"(src_width), // %4
+ "+rm"(src_height) // %5
+ : "rm"((intptr_t)(src_stride)) // %6
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
);
}
-
-#ifdef HAS_SCALEADDROW_AVX2
-// Reads 32 bytes and accumulates to 32 shorts at a time.
-void ScaleAddRow_AVX2(const uint8* src_ptr, uint16* dst_ptr, int src_width) {
- asm volatile (
- "vpxor %%ymm5,%%ymm5,%%ymm5 \n"
-
- LABELALIGN
- "1: \n"
- "vmovdqu " MEMACCESS(0) ",%%ymm3 \n"
- "lea " MEMLEA(0x20,0) ",%0 \n" // src_ptr += 32
- "vpermq $0xd8,%%ymm3,%%ymm3 \n"
- "vpunpcklbw %%ymm5,%%ymm3,%%ymm2 \n"
- "vpunpckhbw %%ymm5,%%ymm3,%%ymm3 \n"
- "vpaddusw " MEMACCESS(1) ",%%ymm2,%%ymm0 \n"
- "vpaddusw " MEMACCESS2(0x20,1) ",%%ymm3,%%ymm1 \n"
- "vmovdqu %%ymm0," MEMACCESS(1) " \n"
- "vmovdqu %%ymm1," MEMACCESS2(0x20,1) " \n"
- "lea " MEMLEA(0x40,1) ",%1 \n"
- "sub $0x20,%2 \n"
- "jg 1b \n"
- "vzeroupper \n"
- : "+r"(src_ptr), // %0
- "+r"(dst_ptr), // %1
- "+r"(src_width) // %2
- :
- : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
- );
-}
-#endif // HAS_SCALEADDROW_AVX2
-
-// Constant for making pixels signed to avoid pmaddubsw
-// saturation.
-static uvec8 kFsub80 =
- { 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
- 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 };
-
-// Constant for making pixels unsigned and adding .5 for rounding.
-static uvec16 kFadd40 =
- { 0x4040, 0x4040, 0x4040, 0x4040, 0x4040, 0x4040, 0x4040, 0x4040 };
-
// Bilinear column filtering. SSSE3 version.
void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
int dst_width, int x, int dx) {
- intptr_t x0, x1, temp_pixel;
+ intptr_t x0 = 0, x1 = 0, temp_pixel = 0;
asm volatile (
"movd %6,%%xmm2 \n"
"movd %7,%%xmm3 \n"
@@ -841,10 +628,7 @@
"movl $0x04040000,%k2 \n"
"movd %k2,%%xmm5 \n"
"pcmpeqb %%xmm6,%%xmm6 \n"
- "psrlw $0x9,%%xmm6 \n" // 0x007f007f
- "pcmpeqb %%xmm7,%%xmm7 \n"
- "psrlw $15,%%xmm7 \n" // 0x00010001
-
+ "psrlw $0x9,%%xmm6 \n"
"pextrw $0x1,%%xmm2,%k3 \n"
"subl $0x2,%5 \n"
"jl 29f \n"
@@ -866,19 +650,16 @@
"movd %k2,%%xmm4 \n"
"pshufb %%xmm5,%%xmm1 \n"
"punpcklwd %%xmm4,%%xmm0 \n"
- "psubb %8,%%xmm0 \n" // make pixels signed.
- "pxor %%xmm6,%%xmm1 \n" // 128 -f = (f ^ 127 ) + 1
- "paddusb %%xmm7,%%xmm1 \n"
- "pmaddubsw %%xmm0,%%xmm1 \n"
+ "pxor %%xmm6,%%xmm1 \n"
+ "pmaddubsw %%xmm1,%%xmm0 \n"
"pextrw $0x1,%%xmm2,%k3 \n"
"pextrw $0x3,%%xmm2,%k4 \n"
- "paddw %9,%%xmm1 \n" // make pixels unsigned.
- "psrlw $0x7,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm1 \n"
- "movd %%xmm1,%k2 \n"
+ "psrlw $0x7,%%xmm0 \n"
+ "packuswb %%xmm0,%%xmm0 \n"
+ "movd %%xmm0,%k2 \n"
"mov %w2," MEMACCESS(0) " \n"
"lea " MEMLEA(0x2,0) ",%0 \n"
- "subl $0x2,%5 \n"
+ "sub $0x2,%5 \n"
"jge 2b \n"
LABELALIGN
@@ -889,37 +670,23 @@
"movd %k2,%%xmm0 \n"
"psrlw $0x9,%%xmm2 \n"
"pshufb %%xmm5,%%xmm2 \n"
- "psubb %8,%%xmm0 \n" // make pixels signed.
"pxor %%xmm6,%%xmm2 \n"
- "paddusb %%xmm7,%%xmm2 \n"
- "pmaddubsw %%xmm0,%%xmm2 \n"
- "paddw %9,%%xmm2 \n" // make pixels unsigned.
- "psrlw $0x7,%%xmm2 \n"
- "packuswb %%xmm2,%%xmm2 \n"
- "movd %%xmm2,%k2 \n"
+ "pmaddubsw %%xmm2,%%xmm0 \n"
+ "psrlw $0x7,%%xmm0 \n"
+ "packuswb %%xmm0,%%xmm0 \n"
+ "movd %%xmm0,%k2 \n"
"mov %b2," MEMACCESS(0) " \n"
"99: \n"
- : "+r"(dst_ptr), // %0
- "+r"(src_ptr), // %1
- "=&a"(temp_pixel), // %2
- "=&r"(x0), // %3
- "=&r"(x1), // %4
-#if defined(__x86_64__)
- "+rm"(dst_width) // %5
-#else
- "+m"(dst_width) // %5
-#endif
- : "rm"(x), // %6
- "rm"(dx), // %7
-#if defined(__x86_64__)
- "x"(kFsub80), // %8
- "x"(kFadd40) // %9
-#else
- "m"(kFsub80), // %8
- "m"(kFadd40) // %9
-#endif
+ : "+r"(dst_ptr), // %0
+ "+r"(src_ptr), // %1
+ "+a"(temp_pixel), // %2
+ "+r"(x0), // %3
+ "+r"(x1), // %4
+ "+rm"(dst_width) // %5
+ : "rm"(x), // %6
+ "rm"(dx) // %7
: "memory", "cc", NACL_R14
- "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
);
}
@@ -1028,7 +795,7 @@
void ScaleARGBRowDownEven_SSE2(const uint8* src_argb, ptrdiff_t src_stride,
int src_stepx, uint8* dst_argb, int dst_width) {
intptr_t src_stepx_x4 = (intptr_t)(src_stepx);
- intptr_t src_stepx_x12;
+ intptr_t src_stepx_x12 = 0;
asm volatile (
"lea " MEMLEA3(0x00,1,4) ",%1 \n"
"lea " MEMLEA4(0x00,1,1,2) ",%4 \n"
@@ -1046,11 +813,11 @@
"lea " MEMLEA(0x10,2) ",%2 \n"
"sub $0x4,%3 \n"
"jg 1b \n"
- : "+r"(src_argb), // %0
- "+r"(src_stepx_x4), // %1
- "+r"(dst_argb), // %2
- "+r"(dst_width), // %3
- "=&r"(src_stepx_x12) // %4
+ : "+r"(src_argb), // %0
+ "+r"(src_stepx_x4), // %1
+ "+r"(dst_argb), // %2
+ "+r"(dst_width), // %3
+ "+r"(src_stepx_x12) // %4
:: "memory", "cc", NACL_R14
"xmm0", "xmm1", "xmm2", "xmm3"
);
@@ -1062,7 +829,7 @@
ptrdiff_t src_stride, int src_stepx,
uint8* dst_argb, int dst_width) {
intptr_t src_stepx_x4 = (intptr_t)(src_stepx);
- intptr_t src_stepx_x12;
+ intptr_t src_stepx_x12 = 0;
intptr_t row1 = (intptr_t)(src_stride);
asm volatile (
"lea " MEMLEA3(0x00,1,4) ",%1 \n"
@@ -1091,12 +858,12 @@
"lea " MEMLEA(0x10,2) ",%2 \n"
"sub $0x4,%3 \n"
"jg 1b \n"
- : "+r"(src_argb), // %0
- "+r"(src_stepx_x4), // %1
- "+r"(dst_argb), // %2
- "+rm"(dst_width), // %3
- "=&r"(src_stepx_x12), // %4
- "+r"(row1) // %5
+ : "+r"(src_argb), // %0
+ "+r"(src_stepx_x4), // %1
+ "+r"(dst_argb), // %2
+ "+rm"(dst_width), // %3
+ "+r"(src_stepx_x12), // %4
+ "+r"(row1) // %5
:: "memory", "cc", NACL_R14
"xmm0", "xmm1", "xmm2", "xmm3"
);
@@ -1104,7 +871,7 @@
void ScaleARGBCols_SSE2(uint8* dst_argb, const uint8* src_argb,
int dst_width, int x, int dx) {
- intptr_t x0, x1;
+ intptr_t x0 = 0, x1 = 0;
asm volatile (
"movd %5,%%xmm2 \n"
"movd %6,%%xmm3 \n"
@@ -1157,8 +924,8 @@
MEMOPREG(movd,0x00,3,0,4,xmm0) // movd (%3,%0,4),%%xmm0
"movd %%xmm0," MEMACCESS(2) " \n"
"99: \n"
- : "=&a"(x0), // %0
- "=&d"(x1), // %1
+ : "+a"(x0), // %0
+ "+d"(x1), // %1
"+r"(dst_argb), // %2
"+r"(src_argb), // %3
"+r"(dst_width) // %4
@@ -1209,7 +976,7 @@
// Bilinear row filtering combines 4x2 -> 4x1. SSSE3 version
void ScaleARGBFilterCols_SSSE3(uint8* dst_argb, const uint8* src_argb,
int dst_width, int x, int dx) {
- intptr_t x0, x1;
+ intptr_t x0 = 0, x1 = 0;
asm volatile (
"movdqa %0,%%xmm4 \n"
"movdqa %1,%%xmm5 \n"
@@ -1272,8 +1039,8 @@
: "+r"(dst_argb), // %0
"+r"(src_argb), // %1
"+rm"(dst_width), // %2
- "=&r"(x0), // %3
- "=&r"(x1) // %4
+ "+r"(x0), // %3
+ "+r"(x1) // %4
: "rm"(x), // %5
"rm"(dx) // %6
: "memory", "cc", NACL_R14
--- a/third_party/libyuv/source/scale_mips.cc
+++ b/third_party/libyuv/source/scale_mips.cc
@@ -21,8 +21,8 @@
defined(__mips_dsp) && (__mips_dsp_rev >= 2) && \
(_MIPS_SIM == _MIPS_SIM_ABI32)
-void ScaleRowDown2_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
- uint8* dst, int dst_width) {
+void ScaleRowDown2_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst, int dst_width) {
__asm__ __volatile__(
".set push \n"
".set noreorder \n"
@@ -31,6 +31,7 @@
"beqz $t9, 2f \n"
" nop \n"
+ ".p2align 2 \n"
"1: \n"
"lw $t0, 0(%[src_ptr]) \n" // |3|2|1|0|
"lw $t1, 4(%[src_ptr]) \n" // |7|6|5|4|
@@ -77,8 +78,8 @@
);
}
-void ScaleRowDown2Box_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
- uint8* dst, int dst_width) {
+void ScaleRowDown2Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst, int dst_width) {
const uint8* t = src_ptr + src_stride;
__asm__ __volatile__ (
@@ -89,6 +90,7 @@
"bltz $t9, 2f \n"
" nop \n"
+ ".p2align 2 \n"
"1: \n"
"lw $t0, 0(%[src_ptr]) \n" // |3|2|1|0|
"lw $t1, 4(%[src_ptr]) \n" // |7|6|5|4|
@@ -176,8 +178,8 @@
);
}
-void ScaleRowDown4_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
- uint8* dst, int dst_width) {
+void ScaleRowDown4_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst, int dst_width) {
__asm__ __volatile__ (
".set push \n"
".set noreorder \n"
@@ -186,6 +188,7 @@
"beqz $t9, 2f \n"
" nop \n"
+ ".p2align 2 \n"
"1: \n"
"lw $t1, 0(%[src_ptr]) \n" // |3|2|1|0|
"lw $t2, 4(%[src_ptr]) \n" // |7|6|5|4|
@@ -231,8 +234,8 @@
);
}
-void ScaleRowDown4Box_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
- uint8* dst, int dst_width) {
+void ScaleRowDown4Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst, int dst_width) {
intptr_t stride = src_stride;
const uint8* s1 = src_ptr + stride;
const uint8* s2 = s1 + stride;
@@ -245,6 +248,7 @@
"srl $t9, %[dst_width], 1 \n"
"andi $t8, %[dst_width], 1 \n"
+ ".p2align 2 \n"
"1: \n"
"lw $t0, 0(%[src_ptr]) \n" // |3|2|1|0|
"lw $t1, 0(%[s1]) \n" // |7|6|5|4|
@@ -310,11 +314,12 @@
);
}
-void ScaleRowDown34_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
- uint8* dst, int dst_width) {
+void ScaleRowDown34_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst, int dst_width) {
__asm__ __volatile__ (
".set push \n"
".set noreorder \n"
+ ".p2align 2 \n"
"1: \n"
"lw $t1, 0(%[src_ptr]) \n" // |3|2|1|0|
"lw $t2, 4(%[src_ptr]) \n" // |7|6|5|4|
@@ -356,13 +361,14 @@
);
}
-void ScaleRowDown34_0_Box_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
- uint8* d, int dst_width) {
+void ScaleRowDown34_0_Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* d, int dst_width) {
__asm__ __volatile__ (
".set push \n"
".set noreorder \n"
"repl.ph $t3, 3 \n" // 0x00030003
+ ".p2align 2 \n"
"1: \n"
"lw $t0, 0(%[src_ptr]) \n" // |S3|S2|S1|S0|
"lwx $t1, %[src_stride](%[src_ptr]) \n" // |T3|T2|T1|T0|
@@ -412,13 +418,14 @@
);
}
-void ScaleRowDown34_1_Box_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
- uint8* d, int dst_width) {
+void ScaleRowDown34_1_Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* d, int dst_width) {
__asm__ __volatile__ (
".set push \n"
".set noreorder \n"
"repl.ph $t2, 3 \n" // 0x00030003
+ ".p2align 2 \n"
"1: \n"
"lw $t0, 0(%[src_ptr]) \n" // |S3|S2|S1|S0|
"lwx $t1, %[src_stride](%[src_ptr]) \n" // |T3|T2|T1|T0|
@@ -464,12 +471,13 @@
);
}
-void ScaleRowDown38_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
- uint8* dst, int dst_width) {
+void ScaleRowDown38_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst, int dst_width) {
__asm__ __volatile__ (
".set push \n"
".set noreorder \n"
+ ".p2align 2 \n"
"1: \n"
"lw $t0, 0(%[src_ptr]) \n" // |3|2|1|0|
"lw $t1, 4(%[src_ptr]) \n" // |7|6|5|4|
@@ -510,8 +518,8 @@
);
}
-void ScaleRowDown38_2_Box_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
- uint8* dst_ptr, int dst_width) {
+void ScaleRowDown38_2_Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width) {
intptr_t stride = src_stride;
const uint8* t = src_ptr + stride;
const int c = 0x2AAA;
@@ -520,6 +528,7 @@
".set push \n"
".set noreorder \n"
+ ".p2align 2 \n"
"1: \n"
"lw $t0, 0(%[src_ptr]) \n" // |S3|S2|S1|S0|
"lw $t1, 4(%[src_ptr]) \n" // |S7|S6|S5|S4|
@@ -563,9 +572,9 @@
);
}
-void ScaleRowDown38_3_Box_DSPR2(const uint8* src_ptr,
- ptrdiff_t src_stride,
- uint8* dst_ptr, int dst_width) {
+void ScaleRowDown38_3_Box_MIPS_DSPR2(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width) {
intptr_t stride = src_stride;
const uint8* s1 = src_ptr + stride;
stride += stride;
@@ -577,6 +586,7 @@
".set push \n"
".set noreorder \n"
+ ".p2align 2 \n"
"1: \n"
"lw $t0, 0(%[src_ptr]) \n" // |S3|S2|S1|S0|
"lw $t1, 4(%[src_ptr]) \n" // |S7|S6|S5|S4|
--- a/third_party/libyuv/source/scale_neon.cc
+++ b/third_party/libyuv/source/scale_neon.cc
@@ -26,6 +26,7 @@
void ScaleRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst, int dst_width) {
asm volatile (
+ ".p2align 2 \n"
"1: \n"
// load even pixels into q0, odd into q1
MEMACCESS(0)
@@ -46,6 +47,7 @@
void ScaleRowDown2Linear_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst, int dst_width) {
asm volatile (
+ ".p2align 2 \n"
"1: \n"
MEMACCESS(0)
"vld1.8 {q0, q1}, [%0]! \n" // load pixels and post inc
@@ -71,6 +73,7 @@
asm volatile (
// change the stride to row 2 pointer
"add %1, %0 \n"
+ ".p2align 2 \n"
"1: \n"
MEMACCESS(0)
"vld1.8 {q0, q1}, [%0]! \n" // load row 1 and post inc
@@ -98,6 +101,7 @@
void ScaleRowDown4_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
asm volatile (
+ ".p2align 2 \n"
"1: \n"
MEMACCESS(0)
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0
@@ -119,6 +123,7 @@
const uint8* src_ptr2 = src_ptr + src_stride * 2;
const uint8* src_ptr3 = src_ptr + src_stride * 3;
asm volatile (
+ ".p2align 2 \n"
"1: \n"
MEMACCESS(0)
"vld1.8 {q0}, [%0]! \n" // load up 16x4
@@ -157,6 +162,7 @@
ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
asm volatile (
+ ".p2align 2 \n"
"1: \n"
MEMACCESS(0)
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0
@@ -179,6 +185,7 @@
asm volatile (
"vmov.u8 d24, #3 \n"
"add %3, %0 \n"
+ ".p2align 2 \n"
"1: \n"
MEMACCESS(0)
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0
@@ -238,6 +245,7 @@
asm volatile (
"vmov.u8 d24, #3 \n"
"add %3, %0 \n"
+ ".p2align 2 \n"
"1: \n"
MEMACCESS(0)
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0
@@ -292,6 +300,7 @@
asm volatile (
MEMACCESS(3)
"vld1.8 {q3}, [%3] \n"
+ ".p2align 2 \n"
"1: \n"
MEMACCESS(0)
"vld1.8 {d0, d1, d2, d3}, [%0]! \n"
@@ -325,6 +334,7 @@
MEMACCESS(7)
"vld1.8 {q15}, [%7] \n"
"add %3, %0 \n"
+ ".p2align 2 \n"
"1: \n"
// d0 = 00 40 01 41 02 42 03 43
@@ -440,6 +450,7 @@
MEMACCESS(5)
"vld1.8 {q14}, [%5] \n"
"add %3, %0 \n"
+ ".p2align 2 \n"
"1: \n"
// d0 = 00 40 01 41 02 42 03 43
@@ -532,8 +543,9 @@
void ScaleAddRows_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
uint16* dst_ptr, int src_width, int src_height) {
- const uint8* src_tmp;
+ const uint8* src_tmp = NULL;
asm volatile (
+ ".p2align 2 \n"
"1: \n"
"mov %0, %1 \n"
"mov r12, %5 \n"
@@ -552,12 +564,12 @@
"add %1, %1, #16 \n"
"subs %4, %4, #16 \n" // 16 processed per loop
"bgt 1b \n"
- : "=&r"(src_tmp), // %0
- "+r"(src_ptr), // %1
- "+r"(dst_ptr), // %2
- "+r"(src_stride), // %3
- "+r"(src_width), // %4
- "+r"(src_height) // %5
+ : "+r"(src_tmp), // %0
+ "+r"(src_ptr), // %1
+ "+r"(dst_ptr), // %2
+ "+r"(src_stride), // %3
+ "+r"(src_width), // %4
+ "+r"(src_height) // %5
:
: "memory", "cc", "r12", "q0", "q1", "q2", "q3" // Clobber List
);
@@ -572,10 +584,6 @@
MEMACCESS(6) \
"vld2.8 {d6["#n"], d7["#n"]}, [%6] \n"
-// The NEON version mimics this formula:
-// #define BLENDER(a, b, f) (uint8)((int)(a) +
-// ((int)(f) * ((int)(b) - (int)(a)) >> 16))
-
void ScaleFilterCols_NEON(uint8* dst_ptr, const uint8* src_ptr,
int dst_width, int x, int dx) {
int dx_offset[4] = {0, 1, 2, 3};
@@ -582,6 +590,7 @@
int* tmp = dx_offset;
const uint8* src_tmp = src_ptr;
asm volatile (
+ ".p2align 2 \n"
"vdup.32 q0, %3 \n" // x
"vdup.32 q1, %4 \n" // dx
"vld1.32 {q2}, [%5] \n" // 0 1 2 3
@@ -612,8 +621,8 @@
"vmovl.u16 q10, d21 \n"
"vmul.s32 q11, q11, q13 \n"
"vmul.s32 q12, q12, q10 \n"
- "vrshrn.s32 d18, q11, #16 \n"
- "vrshrn.s32 d19, q12, #16 \n"
+ "vshrn.s32 d18, q11, #16 \n"
+ "vshrn.s32 d19, q12, #16 \n"
"vadd.s16 q8, q8, q9 \n"
"vmovn.s16 d6, q8 \n"
@@ -740,6 +749,7 @@
void ScaleARGBRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst, int dst_width) {
asm volatile (
+ ".p2align 2 \n"
"1: \n"
// load even pixels into q0, odd into q1
MEMACCESS(0)
@@ -763,6 +773,7 @@
void ScaleARGBRowDown2Linear_NEON(const uint8* src_argb, ptrdiff_t src_stride,
uint8* dst_argb, int dst_width) {
asm volatile (
+ ".p2align 2 \n"
"1: \n"
MEMACCESS(0)
"vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels.
@@ -793,6 +804,7 @@
asm volatile (
// change the stride to row 2 pointer
"add %1, %1, %0 \n"
+ ".p2align 2 \n"
"1: \n"
MEMACCESS(0)
"vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels.
@@ -833,6 +845,7 @@
int src_stepx, uint8* dst_argb, int dst_width) {
asm volatile (
"mov r12, %3, lsl #2 \n"
+ ".p2align 2 \n"
"1: \n"
MEMACCESS(0)
"vld1.32 {d0[0]}, [%0], r12 \n"
@@ -862,6 +875,7 @@
asm volatile (
"mov r12, %4, lsl #2 \n"
"add %1, %1, %0 \n"
+ ".p2align 2 \n"
"1: \n"
MEMACCESS(0)
"vld1.8 {d0}, [%0], r12 \n" // Read 4 2x2 blocks -> 2x1
@@ -913,9 +927,10 @@
void ScaleARGBCols_NEON(uint8* dst_argb, const uint8* src_argb,
int dst_width, int x, int dx) {
- int tmp;
+ int tmp = 0;
const uint8* src_tmp = src_argb;
asm volatile (
+ ".p2align 2 \n"
"1: \n"
LOAD1_DATA32_LANE(d0, 0)
LOAD1_DATA32_LANE(d0, 1)
@@ -930,13 +945,13 @@
"vst1.32 {q0, q1}, [%0]! \n" // store pixels
"subs %2, %2, #8 \n" // 8 processed per loop
"bgt 1b \n"
- : "+r"(dst_argb), // %0
- "+r"(src_argb), // %1
- "+r"(dst_width), // %2
- "+r"(x), // %3
- "+r"(dx), // %4
- "=&r"(tmp), // %5
- "+r"(src_tmp) // %6
+ : "+r"(dst_argb), // %0
+ "+r"(src_argb), // %1
+ "+r"(dst_width), // %2
+ "+r"(x), // %3
+ "+r"(dx), // %4
+ "+r"(tmp), // %5
+ "+r"(src_tmp) // %6
:
: "memory", "cc", "q0", "q1"
);
@@ -959,6 +974,7 @@
int* tmp = dx_offset;
const uint8* src_tmp = src_argb;
asm volatile (
+ ".p2align 2 \n"
"vdup.32 q0, %3 \n" // x
"vdup.32 q1, %4 \n" // dx
"vld1.32 {q2}, [%5] \n" // 0 1 2 3
--- a/third_party/libyuv/source/scale_neon64.cc
+++ b/third_party/libyuv/source/scale_neon64.cc
@@ -547,7 +547,7 @@
void ScaleAddRows_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
uint16* dst_ptr, int src_width, int src_height) {
- const uint8* src_tmp;
+ const uint8* src_tmp = NULL;
asm volatile (
"1: \n"
"mov %0, %1 \n"
@@ -567,12 +567,12 @@
"add %1, %1, #16 \n"
"subs %w4, %w4, #16 \n" // 16 processed per loop
"b.gt 1b \n"
- : "=&r"(src_tmp), // %0
- "+r"(src_ptr), // %1
- "+r"(dst_ptr), // %2
- "+r"(src_stride), // %3
- "+r"(src_width), // %4
- "+r"(src_height) // %5
+ : "+r"(src_tmp), // %0
+ "+r"(src_ptr), // %1
+ "+r"(dst_ptr), // %2
+ "+r"(src_stride), // %3
+ "+r"(src_width), // %4
+ "+r"(src_height) // %5
:
: "memory", "cc", "w12", "v0", "v1", "v2", "v3" // Clobber List
);
@@ -626,8 +626,8 @@
"ushll2 v6.4s, v6.8h, #0 \n"
"mul v16.4s, v16.4s, v7.4s \n"
"mul v17.4s, v17.4s, v6.4s \n"
- "rshrn v6.4h, v16.4s, #16 \n"
- "rshrn2 v6.8h, v17.4s, #16 \n"
+ "shrn v6.4h, v16.4s, #16 \n"
+ "shrn2 v6.8h, v17.4s, #16 \n"
"add v4.8h, v4.8h, v6.8h \n"
"xtn v4.8b, v4.8h \n"
@@ -931,7 +931,7 @@
int64 dst_width64 = (int64) dst_width; // Work around ios 64 bit warning.
int64 x64 = (int64) x;
int64 dx64 = (int64) dx;
- int64 tmp64;
+ int64 tmp64 = 0;
asm volatile (
"1: \n"
LOAD1_DATA32_LANE(v0, 0)
@@ -947,13 +947,13 @@
"st1 {v0.4s, v1.4s}, [%0], #32 \n" // store pixels
"subs %w2, %w2, #8 \n" // 8 processed per loop
"b.gt 1b \n"
- : "+r"(dst_argb), // %0
- "+r"(src_argb), // %1
- "+r"(dst_width64), // %2
- "+r"(x64), // %3
- "+r"(dx64), // %4
- "=&r"(tmp64), // %5
- "+r"(src_tmp) // %6
+ : "+r"(dst_argb), // %0
+ "+r"(src_argb), // %1
+ "+r"(dst_width64), // %2
+ "+r"(x64), // %3
+ "+r"(dx64), // %4
+ "+r"(tmp64), // %5
+ "+r"(src_tmp) // %6
:
: "memory", "cc", "v0", "v1"
);
--- a/third_party/libyuv/source/scale_win.cc
+++ b/third_party/libyuv/source/scale_win.cc
@@ -16,8 +16,9 @@
extern "C" {
#endif
-// This module is for 32 bit Visual C x86 and clangcl
-#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86)
+// This module is for Visual C x86.
+#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && \
+ defined(_MSC_VER) && !defined(__clang__)
// Offsets for source bytes 0 to 9
static uvec8 kShuf0 =
@@ -95,8 +96,8 @@
// Reads 32 pixels, throws half away and writes 16 pixels.
__declspec(naked)
-void ScaleRowDown2_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
- uint8* dst_ptr, int dst_width) {
+void ScaleRowDown2_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width) {
__asm {
mov eax, [esp + 4] // src_ptr
// src_stride ignored
@@ -121,28 +122,31 @@
// Blends 32x1 rectangle to 16x1.
__declspec(naked)
-void ScaleRowDown2Linear_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
- uint8* dst_ptr, int dst_width) {
+void ScaleRowDown2Linear_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width) {
__asm {
mov eax, [esp + 4] // src_ptr
// src_stride
mov edx, [esp + 12] // dst_ptr
mov ecx, [esp + 16] // dst_width
+ pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
+ psrlw xmm5, 8
- pcmpeqb xmm4, xmm4 // constant 0x0101
- psrlw xmm4, 15
- packuswb xmm4, xmm4
- pxor xmm5, xmm5 // constant 0
-
wloop:
movdqu xmm0, [eax]
movdqu xmm1, [eax + 16]
lea eax, [eax + 32]
- pmaddubsw xmm0, xmm4 // horizontal add
- pmaddubsw xmm1, xmm4
- pavgw xmm0, xmm5 // (x + 1) / 2
- pavgw xmm1, xmm5
+
+ movdqa xmm2, xmm0 // average columns (32 to 16 pixels)
+ psrlw xmm0, 8
+ movdqa xmm3, xmm1
+ psrlw xmm1, 8
+ pand xmm2, xmm5
+ pand xmm3, xmm5
+ pavgw xmm0, xmm2
+ pavgw xmm1, xmm3
packuswb xmm0, xmm1
+
movdqu [edx], xmm0
lea edx, [edx + 16]
sub ecx, 16
@@ -154,8 +158,8 @@
// Blends 32x2 rectangle to 16x1.
__declspec(naked)
-void ScaleRowDown2Box_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
- uint8* dst_ptr, int dst_width) {
+void ScaleRowDown2Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width) {
__asm {
push esi
mov eax, [esp + 4 + 4] // src_ptr
@@ -162,12 +166,9 @@
mov esi, [esp + 4 + 8] // src_stride
mov edx, [esp + 4 + 12] // dst_ptr
mov ecx, [esp + 4 + 16] // dst_width
+ pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
+ psrlw xmm5, 8
- pcmpeqb xmm4, xmm4 // constant 0x0101
- psrlw xmm4, 15
- packuswb xmm4, xmm4
- pxor xmm5, xmm5 // constant 0
-
wloop:
movdqu xmm0, [eax]
movdqu xmm1, [eax + 16]
@@ -174,17 +175,19 @@
movdqu xmm2, [eax + esi]
movdqu xmm3, [eax + esi + 16]
lea eax, [eax + 32]
- pmaddubsw xmm0, xmm4 // horizontal add
- pmaddubsw xmm1, xmm4
- pmaddubsw xmm2, xmm4
- pmaddubsw xmm3, xmm4
- paddw xmm0, xmm2 // vertical add
- paddw xmm1, xmm3
- psrlw xmm0, 1
- psrlw xmm1, 1
- pavgw xmm0, xmm5 // (x + 1) / 2
- pavgw xmm1, xmm5
+ pavgb xmm0, xmm2 // average rows
+ pavgb xmm1, xmm3
+
+ movdqa xmm2, xmm0 // average columns (32 to 16 pixels)
+ psrlw xmm0, 8
+ movdqa xmm3, xmm1
+ psrlw xmm1, 8
+ pand xmm2, xmm5
+ pand xmm3, xmm5
+ pavgw xmm0, xmm2
+ pavgw xmm1, xmm3
packuswb xmm0, xmm1
+
movdqu [edx], xmm0
lea edx, [edx + 16]
sub ecx, 16
@@ -243,12 +246,14 @@
vmovdqu ymm0, [eax]
vmovdqu ymm1, [eax + 32]
lea eax, [eax + 64]
- vpmaddubsw ymm0, ymm0, ymm4 // horizontal add
+
+ vpmaddubsw ymm0, ymm0, ymm4 // average horizontally
vpmaddubsw ymm1, ymm1, ymm4
vpavgw ymm0, ymm0, ymm5 // (x + 1) / 2
vpavgw ymm1, ymm1, ymm5
vpackuswb ymm0, ymm0, ymm1
vpermq ymm0, ymm0, 0xd8 // unmutate vpackuswb
+
vmovdqu [edx], ymm0
lea edx, [edx + 32]
sub ecx, 32
@@ -259,8 +264,6 @@
}
}
-// For rounding, average = (sum + 2) / 4
-// becomes average((sum >> 1), 0)
// Blends 64x2 rectangle to 32x1.
__declspec(naked)
void ScaleRowDown2Box_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
@@ -278,23 +281,19 @@
vpxor ymm5, ymm5, ymm5 // constant 0
wloop:
- vmovdqu ymm0, [eax]
+ vmovdqu ymm0, [eax] // average rows
vmovdqu ymm1, [eax + 32]
- vmovdqu ymm2, [eax + esi]
- vmovdqu ymm3, [eax + esi + 32]
+ vpavgb ymm0, ymm0, [eax + esi]
+ vpavgb ymm1, ymm1, [eax + esi + 32]
lea eax, [eax + 64]
- vpmaddubsw ymm0, ymm0, ymm4 // horizontal add
+
+ vpmaddubsw ymm0, ymm0, ymm4 // average horizontally
vpmaddubsw ymm1, ymm1, ymm4
- vpmaddubsw ymm2, ymm2, ymm4
- vpmaddubsw ymm3, ymm3, ymm4
- vpaddw ymm0, ymm0, ymm2 // vertical add
- vpaddw ymm1, ymm1, ymm3
- vpsrlw ymm0, ymm0, 1 // (x + 2) / 4 = (x / 2 + 1) / 2
- vpsrlw ymm1, ymm1, 1
vpavgw ymm0, ymm0, ymm5 // (x + 1) / 2
vpavgw ymm1, ymm1, ymm5
vpackuswb ymm0, ymm0, ymm1
vpermq ymm0, ymm0, 0xd8 // unmutate vpackuswb
+
vmovdqu [edx], ymm0
lea edx, [edx + 32]
sub ecx, 32
@@ -309,7 +308,7 @@
// Point samples 32 pixels to 8 pixels.
__declspec(naked)
-void ScaleRowDown4_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
+void ScaleRowDown4_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
__asm {
mov eax, [esp + 4] // src_ptr
@@ -340,7 +339,7 @@
// Blends 32x4 rectangle to 8x1.
__declspec(naked)
-void ScaleRowDown4Box_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
+void ScaleRowDown4Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
__asm {
push esi
@@ -350,11 +349,8 @@
mov edx, [esp + 8 + 12] // dst_ptr
mov ecx, [esp + 8 + 16] // dst_width
lea edi, [esi + esi * 2] // src_stride * 3
- pcmpeqb xmm4, xmm4 // constant 0x0101
- psrlw xmm4, 15
- movdqa xmm5, xmm4
- packuswb xmm4, xmm4
- psllw xmm5, 3 // constant 0x0008
+ pcmpeqb xmm7, xmm7 // generate mask 0x00ff00ff
+ psrlw xmm7, 8
wloop:
movdqu xmm0, [eax] // average rows
@@ -361,29 +357,34 @@
movdqu xmm1, [eax + 16]
movdqu xmm2, [eax + esi]
movdqu xmm3, [eax + esi + 16]
- pmaddubsw xmm0, xmm4 // horizontal add
- pmaddubsw xmm1, xmm4
- pmaddubsw xmm2, xmm4
- pmaddubsw xmm3, xmm4
- paddw xmm0, xmm2 // vertical add rows 0, 1
- paddw xmm1, xmm3
+ pavgb xmm0, xmm2
+ pavgb xmm1, xmm3
movdqu xmm2, [eax + esi * 2]
movdqu xmm3, [eax + esi * 2 + 16]
- pmaddubsw xmm2, xmm4
- pmaddubsw xmm3, xmm4
- paddw xmm0, xmm2 // add row 2
- paddw xmm1, xmm3
- movdqu xmm2, [eax + edi]
- movdqu xmm3, [eax + edi + 16]
+ movdqu xmm4, [eax + edi]
+ movdqu xmm5, [eax + edi + 16]
lea eax, [eax + 32]
- pmaddubsw xmm2, xmm4
- pmaddubsw xmm3, xmm4
- paddw xmm0, xmm2 // add row 3
- paddw xmm1, xmm3
- phaddw xmm0, xmm1
- paddw xmm0, xmm5 // + 8 for round
- psrlw xmm0, 4 // /16 for average of 4 * 4
+ pavgb xmm2, xmm4
+ pavgb xmm3, xmm5
+ pavgb xmm0, xmm2
+ pavgb xmm1, xmm3
+
+ movdqa xmm2, xmm0 // average columns (32 to 16 pixels)
+ psrlw xmm0, 8
+ movdqa xmm3, xmm1
+ psrlw xmm1, 8
+ pand xmm2, xmm7
+ pand xmm3, xmm7
+ pavgw xmm0, xmm2
+ pavgw xmm1, xmm3
+ packuswb xmm0, xmm1
+
+ movdqa xmm2, xmm0 // average columns (16 to 8 pixels)
+ psrlw xmm0, 8
+ pand xmm2, xmm7
+ pavgw xmm0, xmm2
packuswb xmm0, xmm0
+
movq qword ptr [edx], xmm0
lea edx, [edx + 8]
sub ecx, 8
@@ -442,41 +443,37 @@
mov edx, [esp + 8 + 12] // dst_ptr
mov ecx, [esp + 8 + 16] // dst_width
lea edi, [esi + esi * 2] // src_stride * 3
- vpcmpeqb ymm4, ymm4, ymm4 // constant 0x0101
- vpsrlw ymm4, ymm4, 15
- vpsllw ymm5, ymm4, 3 // constant 0x0008
- vpackuswb ymm4, ymm4, ymm4
+ vpcmpeqb ymm7, ymm7, ymm7 // generate mask 0x00ff00ff
+ vpsrlw ymm7, ymm7, 8
wloop:
vmovdqu ymm0, [eax] // average rows
vmovdqu ymm1, [eax + 32]
- vmovdqu ymm2, [eax + esi]
- vmovdqu ymm3, [eax + esi + 32]
- vpmaddubsw ymm0, ymm0, ymm4 // horizontal add
- vpmaddubsw ymm1, ymm1, ymm4
- vpmaddubsw ymm2, ymm2, ymm4
- vpmaddubsw ymm3, ymm3, ymm4
- vpaddw ymm0, ymm0, ymm2 // vertical add rows 0, 1
- vpaddw ymm1, ymm1, ymm3
+ vpavgb ymm0, ymm0, [eax + esi]
+ vpavgb ymm1, ymm1, [eax + esi + 32]
vmovdqu ymm2, [eax + esi * 2]
vmovdqu ymm3, [eax + esi * 2 + 32]
- vpmaddubsw ymm2, ymm2, ymm4
- vpmaddubsw ymm3, ymm3, ymm4
- vpaddw ymm0, ymm0, ymm2 // add row 2
- vpaddw ymm1, ymm1, ymm3
- vmovdqu ymm2, [eax + edi]
- vmovdqu ymm3, [eax + edi + 32]
- lea eax, [eax + 64]
- vpmaddubsw ymm2, ymm2, ymm4
- vpmaddubsw ymm3, ymm3, ymm4
- vpaddw ymm0, ymm0, ymm2 // add row 3
- vpaddw ymm1, ymm1, ymm3
- vphaddw ymm0, ymm0, ymm1 // mutates
- vpermq ymm0, ymm0, 0xd8 // unmutate vphaddw
- vpaddw ymm0, ymm0, ymm5 // + 8 for round
- vpsrlw ymm0, ymm0, 4 // /32 for average of 4 * 4
+ vpavgb ymm2, ymm2, [eax + edi]
+ vpavgb ymm3, ymm3, [eax + edi + 32]
+ lea eax, [eax + 64]
+ vpavgb ymm0, ymm0, ymm2
+ vpavgb ymm1, ymm1, ymm3
+
+ vpand ymm2, ymm0, ymm7 // average columns (64 to 32 pixels)
+ vpand ymm3, ymm1, ymm7
+ vpsrlw ymm0, ymm0, 8
+ vpsrlw ymm1, ymm1, 8
+ vpavgw ymm0, ymm0, ymm2
+ vpavgw ymm1, ymm1, ymm3
+ vpackuswb ymm0, ymm0, ymm1
+ vpermq ymm0, ymm0, 0xd8 // unmutate vpackuswb
+
+ vpand ymm2, ymm0, ymm7 // average columns (32 to 16 pixels)
+ vpsrlw ymm0, ymm0, 8
+ vpavgw ymm0, ymm0, ymm2
vpackuswb ymm0, ymm0, ymm0
vpermq ymm0, ymm0, 0xd8 // unmutate vpackuswb
+
vmovdqu [edx], xmm0
lea edx, [edx + 16]
sub ecx, 16
@@ -502,9 +499,9 @@
// src_stride ignored
mov edx, [esp + 12] // dst_ptr
mov ecx, [esp + 16] // dst_width
- movdqa xmm3, xmmword ptr kShuf0
- movdqa xmm4, xmmword ptr kShuf1
- movdqa xmm5, xmmword ptr kShuf2
+ movdqa xmm3, kShuf0
+ movdqa xmm4, kShuf1
+ movdqa xmm5, kShuf2
wloop:
movdqu xmm0, [eax]
@@ -551,12 +548,12 @@
mov esi, [esp + 4 + 8] // src_stride
mov edx, [esp + 4 + 12] // dst_ptr
mov ecx, [esp + 4 + 16] // dst_width
- movdqa xmm2, xmmword ptr kShuf01
- movdqa xmm3, xmmword ptr kShuf11
- movdqa xmm4, xmmword ptr kShuf21
- movdqa xmm5, xmmword ptr kMadd01
- movdqa xmm6, xmmword ptr kMadd11
- movdqa xmm7, xmmword ptr kRound34
+ movdqa xmm2, kShuf01
+ movdqa xmm3, kShuf11
+ movdqa xmm4, kShuf21
+ movdqa xmm5, kMadd01
+ movdqa xmm6, kMadd11
+ movdqa xmm7, kRound34
wloop:
movdqu xmm0, [eax] // pixels 0..7
@@ -582,7 +579,7 @@
lea eax, [eax + 32]
pavgb xmm0, xmm1
pshufb xmm0, xmm4
- movdqa xmm1, xmmword ptr kMadd21
+ movdqa xmm1, kMadd21
pmaddubsw xmm0, xmm1
paddsw xmm0, xmm7
psrlw xmm0, 2
@@ -608,12 +605,12 @@
mov esi, [esp + 4 + 8] // src_stride
mov edx, [esp + 4 + 12] // dst_ptr
mov ecx, [esp + 4 + 16] // dst_width
- movdqa xmm2, xmmword ptr kShuf01
- movdqa xmm3, xmmword ptr kShuf11
- movdqa xmm4, xmmword ptr kShuf21
- movdqa xmm5, xmmword ptr kMadd01
- movdqa xmm6, xmmword ptr kMadd11
- movdqa xmm7, xmmword ptr kRound34
+ movdqa xmm2, kShuf01
+ movdqa xmm3, kShuf11
+ movdqa xmm4, kShuf21
+ movdqa xmm5, kMadd01
+ movdqa xmm6, kMadd11
+ movdqa xmm7, kRound34
wloop:
movdqu xmm0, [eax] // pixels 0..7
@@ -642,7 +639,7 @@
pavgb xmm1, xmm0
pavgb xmm0, xmm1
pshufb xmm0, xmm4
- movdqa xmm1, xmmword ptr kMadd21
+ movdqa xmm1, kMadd21
pmaddubsw xmm0, xmm1
paddsw xmm0, xmm7
psrlw xmm0, 2
@@ -668,8 +665,8 @@
// src_stride ignored
mov edx, [esp + 12] // dst_ptr
mov ecx, [esp + 16] // dst_width
- movdqa xmm4, xmmword ptr kShuf38a
- movdqa xmm5, xmmword ptr kShuf38b
+ movdqa xmm4, kShuf38a
+ movdqa xmm5, kShuf38b
xloop:
movdqu xmm0, [eax] // 16 pixels -> 0,1,2,3,4,5
@@ -701,9 +698,9 @@
mov esi, [esp + 4 + 8] // src_stride
mov edx, [esp + 4 + 12] // dst_ptr
mov ecx, [esp + 4 + 16] // dst_width
- movdqa xmm2, xmmword ptr kShufAc
- movdqa xmm3, xmmword ptr kShufAc3
- movdqa xmm4, xmmword ptr kScaleAc33
+ movdqa xmm2, kShufAc
+ movdqa xmm3, kShufAc3
+ movdqa xmm4, kScaleAc33
pxor xmm5, xmm5
xloop:
@@ -766,10 +763,10 @@
mov esi, [esp + 4 + 8] // src_stride
mov edx, [esp + 4 + 12] // dst_ptr
mov ecx, [esp + 4 + 16] // dst_width
- movdqa xmm2, xmmword ptr kShufAb0
- movdqa xmm3, xmmword ptr kShufAb1
- movdqa xmm4, xmmword ptr kShufAb2
- movdqa xmm5, xmmword ptr kScaleAb2
+ movdqa xmm2, kShufAb0
+ movdqa xmm3, kShufAb1
+ movdqa xmm4, kShufAb2
+ movdqa xmm5, kScaleAb2
xloop:
movdqu xmm0, [eax] // average 2 rows into xmm0
@@ -860,16 +857,6 @@
}
#endif // HAS_SCALEADDROW_AVX2
-// Constant for making pixels signed to avoid pmaddubsw
-// saturation.
-static uvec8 kFsub80 =
- { 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
- 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 };
-
-// Constant for making pixels unsigned and adding .5 for rounding.
-static uvec16 kFadd40 =
- { 0x4040, 0x4040, 0x4040, 0x4040, 0x4040, 0x4040, 0x4040, 0x4040 };
-
// Bilinear column filtering. SSSE3 version.
__declspec(naked)
void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
@@ -887,8 +874,6 @@
movd xmm5, eax
pcmpeqb xmm6, xmm6 // generate 0x007f for inverting fraction.
psrlw xmm6, 9
- pcmpeqb xmm7, xmm7 // generate 0x0001
- psrlw xmm7, 15
pextrw eax, xmm2, 1 // get x0 integer. preroll
sub ecx, 2
jl xloop29
@@ -911,16 +896,13 @@
movd xmm4, ebx
pshufb xmm1, xmm5 // 0011
punpcklwd xmm0, xmm4
- psubb xmm0, xmmword ptr kFsub80 // make pixels signed.
pxor xmm1, xmm6 // 0..7f and 7f..0
- paddusb xmm1, xmm7 // +1 so 0..7f and 80..1
- pmaddubsw xmm1, xmm0 // 16 bit, 2 pixels.
+ pmaddubsw xmm0, xmm1 // 16 bit, 2 pixels.
pextrw eax, xmm2, 1 // get x0 integer. next iteration.
pextrw edx, xmm2, 3 // get x1 integer. next iteration.
- paddw xmm1, xmmword ptr kFadd40 // make pixels unsigned and round.
- psrlw xmm1, 7 // 8.7 fixed point to low 8 bits.
- packuswb xmm1, xmm1 // 8 bits, 2 pixels.
- movd ebx, xmm1
+ psrlw xmm0, 7 // 8.7 fixed point to low 8 bits.
+ packuswb xmm0, xmm0 // 8 bits, 2 pixels.
+ movd ebx, xmm0
mov [edi], bx
lea edi, [edi + 2]
sub ecx, 2 // 2 pixels
@@ -927,6 +909,7 @@
jge xloop2
xloop29:
+
add ecx, 2 - 1
jl xloop99
@@ -935,14 +918,11 @@
movd xmm0, ebx
psrlw xmm2, 9 // 7 bit fractions.
pshufb xmm2, xmm5 // 0011
- psubb xmm0, xmmword ptr kFsub80 // make pixels signed.
pxor xmm2, xmm6 // 0..7f and 7f..0
- paddusb xmm2, xmm7 // +1 so 0..7f and 80..1
- pmaddubsw xmm2, xmm0 // 16 bit
- paddw xmm2, xmmword ptr kFadd40 // make pixels unsigned and round.
- psrlw xmm2, 7 // 8.7 fixed point to low 8 bits.
- packuswb xmm2, xmm2 // 8 bits
- movd ebx, xmm2
+ pmaddubsw xmm0, xmm2 // 16 bit
+ psrlw xmm0, 7 // 8.7 fixed point to low 8 bits.
+ packuswb xmm0, xmm0 // 8 bits
+ movd ebx, xmm0
mov [edi], bl
xloop99:
@@ -1253,8 +1233,8 @@
mov ecx, [esp + 8 + 12] // dst_width
movd xmm2, [esp + 8 + 16] // x
movd xmm3, [esp + 8 + 20] // dx
- movdqa xmm4, xmmword ptr kShuffleColARGB
- movdqa xmm5, xmmword ptr kShuffleFractions
+ movdqa xmm4, kShuffleColARGB
+ movdqa xmm5, kShuffleFractions
pcmpeqb xmm6, xmm6 // generate 0x007f for inverting fraction.
psrlw xmm6, 9
pextrw eax, xmm2, 1 // get x0 integer. preroll
--- a/third_party/libyuv/source/video_common.cc
+++ b/third_party/libyuv/source/video_common.cc
@@ -25,7 +25,6 @@
static const struct FourCCAliasEntry kFourCCAliases[] = {
{FOURCC_IYUV, FOURCC_I420},
- {FOURCC_YU12, FOURCC_I420},
{FOURCC_YU16, FOURCC_I422},
{FOURCC_YU24, FOURCC_I444},
{FOURCC_YUYV, FOURCC_YUY2},
--- /dev/null
+++ b/third_party/libyuv/source/x86inc.asm
@@ -1,0 +1,1136 @@
+;*****************************************************************************
+;* x86inc.asm: x264asm abstraction layer
+;*****************************************************************************
+;* Copyright (C) 2005-2012 x264 project
+;*
+;* Authors: Loren Merritt <lorenm@u.washington.edu>
+;* Anton Mitrofanov <BugMaster@narod.ru>
+;* Jason Garrett-Glaser <darkshikari@gmail.com>
+;* Henrik Gramner <hengar-6@student.ltu.se>
+;*
+;* Permission to use, copy, modify, and/or distribute this software for any
+;* purpose with or without fee is hereby granted, provided that the above
+;* copyright notice and this permission notice appear in all copies.
+;*
+;* THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+;* WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+;* MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+;* ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+;* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+;* ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+;* OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+;*****************************************************************************
+
+; This is a header file for the x264ASM assembly language, which uses
+; NASM/YASM syntax combined with a large number of macros to provide easy
+; abstraction between different calling conventions (x86_32, win64, linux64).
+; It also has various other useful features to simplify writing the kind of
+; DSP functions that are most often used in x264.
+
+; Unlike the rest of x264, this file is available under an ISC license, as it
+; has significant usefulness outside of x264 and we want it to be available
+; to the largest audience possible. Of course, if you modify it for your own
+; purposes to add a new feature, we strongly encourage contributing a patch
+; as this feature might be useful for others as well. Send patches or ideas
+; to x264-devel@videolan.org .
+
+; Local changes for libyuv:
+; remove %define program_name and references in labels
+; rename cpus to uppercase
+
+%define WIN64 0
+%define UNIX64 0
+%if ARCH_X86_64
+ %ifidn __OUTPUT_FORMAT__,win32
+ %define WIN64 1
+ %elifidn __OUTPUT_FORMAT__,win64
+ %define WIN64 1
+ %else
+ %define UNIX64 1
+ %endif
+%endif
+
+%ifdef PREFIX
+ %define mangle(x) _ %+ x
+%else
+ %define mangle(x) x
+%endif
+
+; Name of the .rodata section.
+; Kludge: Something on OS X fails to align .rodata even given an align attribute,
+; so use a different read-only section.
+%macro SECTION_RODATA 0-1 16
+ %ifidn __OUTPUT_FORMAT__,macho64
+ SECTION .text align=%1
+ %elifidn __OUTPUT_FORMAT__,macho
+ SECTION .text align=%1
+ fakegot:
+ %elifidn __OUTPUT_FORMAT__,aout
+ section .text
+ %else
+ SECTION .rodata align=%1
+ %endif
+%endmacro
+
+; aout does not support align=
+%macro SECTION_TEXT 0-1 16
+ %ifidn __OUTPUT_FORMAT__,aout
+ SECTION .text
+ %else
+ SECTION .text align=%1
+ %endif
+%endmacro
+
+%if WIN64
+ %define PIC
+%elif ARCH_X86_64 == 0
+; x86_32 doesn't require PIC.
+; Some distros prefer shared objects to be PIC, but nothing breaks if
+; the code contains a few textrels, so we'll skip that complexity.
+ %undef PIC
+%endif
+%ifdef PIC
+ default rel
+%endif
+
+; Always use long nops (reduces 0x90 spam in disassembly on x86_32)
+CPU amdnop
+
+; Macros to eliminate most code duplication between x86_32 and x86_64:
+; Currently this works only for leaf functions which load all their arguments
+; into registers at the start, and make no other use of the stack. Luckily that
+; covers most of x264's asm.
+
+; PROLOGUE:
+; %1 = number of arguments. loads them from stack if needed.
+; %2 = number of registers used. pushes callee-saved regs if needed.
+; %3 = number of xmm registers used. pushes callee-saved xmm regs if needed.
+; %4 = list of names to define to registers
+; PROLOGUE can also be invoked by adding the same options to cglobal
+
+; e.g.
+; cglobal foo, 2,3,0, dst, src, tmp
+; declares a function (foo), taking two args (dst and src) and one local variable (tmp)
+
+; TODO Some functions can use some args directly from the stack. If they're the
+; last args then you can just not declare them, but if they're in the middle
+; we need more flexible macro.
+
+; RET:
+; Pops anything that was pushed by PROLOGUE, and returns.
+
+; REP_RET:
+; Same, but if it doesn't pop anything it becomes a 2-byte ret, for athlons
+; which are slow when a normal ret follows a branch.
+
+; registers:
+; rN and rNq are the native-size register holding function argument N
+; rNd, rNw, rNb are dword, word, and byte size
+; rNh is the high 8 bits of the word size
+; rNm is the original location of arg N (a register or on the stack), dword
+; rNmp is native size
+
+%macro DECLARE_REG 2-3
+ %define r%1q %2
+ %define r%1d %2d
+ %define r%1w %2w
+ %define r%1b %2b
+ %define r%1h %2h
+ %if %0 == 2
+ %define r%1m %2d
+ %define r%1mp %2
+ %elif ARCH_X86_64 ; memory
+ %define r%1m [rsp + stack_offset + %3]
+ %define r%1mp qword r %+ %1m
+ %else
+ %define r%1m [esp + stack_offset + %3]
+ %define r%1mp dword r %+ %1m
+ %endif
+ %define r%1 %2
+%endmacro
+
+%macro DECLARE_REG_SIZE 3
+ %define r%1q r%1
+ %define e%1q r%1
+ %define r%1d e%1
+ %define e%1d e%1
+ %define r%1w %1
+ %define e%1w %1
+ %define r%1h %3
+ %define e%1h %3
+ %define r%1b %2
+ %define e%1b %2
+%if ARCH_X86_64 == 0
+ %define r%1 e%1
+%endif
+%endmacro
+
+DECLARE_REG_SIZE ax, al, ah
+DECLARE_REG_SIZE bx, bl, bh
+DECLARE_REG_SIZE cx, cl, ch
+DECLARE_REG_SIZE dx, dl, dh
+DECLARE_REG_SIZE si, sil, null
+DECLARE_REG_SIZE di, dil, null
+DECLARE_REG_SIZE bp, bpl, null
+
+; t# defines for when per-arch register allocation is more complex than just function arguments
+
+%macro DECLARE_REG_TMP 1-*
+ %assign %%i 0
+ %rep %0
+ CAT_XDEFINE t, %%i, r%1
+ %assign %%i %%i+1
+ %rotate 1
+ %endrep
+%endmacro
+
+%macro DECLARE_REG_TMP_SIZE 0-*
+ %rep %0
+ %define t%1q t%1 %+ q
+ %define t%1d t%1 %+ d
+ %define t%1w t%1 %+ w
+ %define t%1h t%1 %+ h
+ %define t%1b t%1 %+ b
+ %rotate 1
+ %endrep
+%endmacro
+
+DECLARE_REG_TMP_SIZE 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
+
+%if ARCH_X86_64
+ %define gprsize 8
+%else
+ %define gprsize 4
+%endif
+
+%macro PUSH 1
+ push %1
+ %assign stack_offset stack_offset+gprsize
+%endmacro
+
+%macro POP 1
+ pop %1
+ %assign stack_offset stack_offset-gprsize
+%endmacro
+
+%macro PUSH_IF_USED 1-*
+ %rep %0
+ %if %1 < regs_used
+ PUSH r%1
+ %endif
+ %rotate 1
+ %endrep
+%endmacro
+
+%macro POP_IF_USED 1-*
+ %rep %0
+ %if %1 < regs_used
+ pop r%1
+ %endif
+ %rotate 1
+ %endrep
+%endmacro
+
+%macro LOAD_IF_USED 1-*
+ %rep %0
+ %if %1 < num_args
+ mov r%1, r %+ %1 %+ mp
+ %endif
+ %rotate 1
+ %endrep
+%endmacro
+
+%macro SUB 2
+ sub %1, %2
+ %ifidn %1, rsp
+ %assign stack_offset stack_offset+(%2)
+ %endif
+%endmacro
+
+%macro ADD 2
+ add %1, %2
+ %ifidn %1, rsp
+ %assign stack_offset stack_offset-(%2)
+ %endif
+%endmacro
+
+%macro movifnidn 2
+ %ifnidn %1, %2
+ mov %1, %2
+ %endif
+%endmacro
+
+%macro movsxdifnidn 2
+ %ifnidn %1, %2
+ movsxd %1, %2
+ %endif
+%endmacro
+
+%macro ASSERT 1
+ %if (%1) == 0
+ %error assert failed
+ %endif
+%endmacro
+
+%macro DEFINE_ARGS 0-*
+ %ifdef n_arg_names
+ %assign %%i 0
+ %rep n_arg_names
+ CAT_UNDEF arg_name %+ %%i, q
+ CAT_UNDEF arg_name %+ %%i, d
+ CAT_UNDEF arg_name %+ %%i, w
+ CAT_UNDEF arg_name %+ %%i, h
+ CAT_UNDEF arg_name %+ %%i, b
+ CAT_UNDEF arg_name %+ %%i, m
+ CAT_UNDEF arg_name %+ %%i, mp
+ CAT_UNDEF arg_name, %%i
+ %assign %%i %%i+1
+ %endrep
+ %endif
+
+ %xdefine %%stack_offset stack_offset
+ %undef stack_offset ; so that the current value of stack_offset doesn't get baked in by xdefine
+ %assign %%i 0
+ %rep %0
+ %xdefine %1q r %+ %%i %+ q
+ %xdefine %1d r %+ %%i %+ d
+ %xdefine %1w r %+ %%i %+ w
+ %xdefine %1h r %+ %%i %+ h
+ %xdefine %1b r %+ %%i %+ b
+ %xdefine %1m r %+ %%i %+ m
+ %xdefine %1mp r %+ %%i %+ mp
+ CAT_XDEFINE arg_name, %%i, %1
+ %assign %%i %%i+1
+ %rotate 1
+ %endrep
+ %xdefine stack_offset %%stack_offset
+ %assign n_arg_names %0
+%endmacro
+
+%if WIN64 ; Windows x64 ;=================================================
+
+DECLARE_REG 0, rcx
+DECLARE_REG 1, rdx
+DECLARE_REG 2, R8
+DECLARE_REG 3, R9
+DECLARE_REG 4, R10, 40
+DECLARE_REG 5, R11, 48
+DECLARE_REG 6, rax, 56
+DECLARE_REG 7, rdi, 64
+DECLARE_REG 8, rsi, 72
+DECLARE_REG 9, rbx, 80
+DECLARE_REG 10, rbp, 88
+DECLARE_REG 11, R12, 96
+DECLARE_REG 12, R13, 104
+DECLARE_REG 13, R14, 112
+DECLARE_REG 14, R15, 120
+
+%macro PROLOGUE 2-4+ 0 ; #args, #regs, #xmm_regs, arg_names...
+ %assign num_args %1
+ %assign regs_used %2
+ ASSERT regs_used >= num_args
+ ASSERT regs_used <= 15
+ PUSH_IF_USED 7, 8, 9, 10, 11, 12, 13, 14
+ %if mmsize == 8
+ %assign xmm_regs_used 0
+ %else
+ WIN64_SPILL_XMM %3
+ %endif
+ LOAD_IF_USED 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14
+ DEFINE_ARGS %4
+%endmacro
+
+%macro WIN64_SPILL_XMM 1
+ %assign xmm_regs_used %1
+ ASSERT xmm_regs_used <= 16
+ %if xmm_regs_used > 6
+ SUB rsp, (xmm_regs_used-6)*16+16
+ %assign %%i xmm_regs_used
+ %rep (xmm_regs_used-6)
+ %assign %%i %%i-1
+ movdqa [rsp + (%%i-6)*16+(~stack_offset&8)], xmm %+ %%i
+ %endrep
+ %endif
+%endmacro
+
+%macro WIN64_RESTORE_XMM_INTERNAL 1
+ %if xmm_regs_used > 6
+ %assign %%i xmm_regs_used
+ %rep (xmm_regs_used-6)
+ %assign %%i %%i-1
+ movdqa xmm %+ %%i, [%1 + (%%i-6)*16+(~stack_offset&8)]
+ %endrep
+ add %1, (xmm_regs_used-6)*16+16
+ %endif
+%endmacro
+
+%macro WIN64_RESTORE_XMM 1
+ WIN64_RESTORE_XMM_INTERNAL %1
+ %assign stack_offset stack_offset-(xmm_regs_used-6)*16+16
+ %assign xmm_regs_used 0
+%endmacro
+
+%define has_epilogue regs_used > 7 || xmm_regs_used > 6 || mmsize == 32
+
+%macro RET 0
+ WIN64_RESTORE_XMM_INTERNAL rsp
+ POP_IF_USED 14, 13, 12, 11, 10, 9, 8, 7
+%if mmsize == 32
+ vzeroupper
+%endif
+ ret
+%endmacro
+
+%elif ARCH_X86_64 ; *nix x64 ;=============================================
+
+DECLARE_REG 0, rdi
+DECLARE_REG 1, rsi
+DECLARE_REG 2, rdx
+DECLARE_REG 3, rcx
+DECLARE_REG 4, R8
+DECLARE_REG 5, R9
+DECLARE_REG 6, rax, 8
+DECLARE_REG 7, R10, 16
+DECLARE_REG 8, R11, 24
+DECLARE_REG 9, rbx, 32
+DECLARE_REG 10, rbp, 40
+DECLARE_REG 11, R12, 48
+DECLARE_REG 12, R13, 56
+DECLARE_REG 13, R14, 64
+DECLARE_REG 14, R15, 72
+
+%macro PROLOGUE 2-4+ ; #args, #regs, #xmm_regs, arg_names...
+ %assign num_args %1
+ %assign regs_used %2
+ ASSERT regs_used >= num_args
+ ASSERT regs_used <= 15
+ PUSH_IF_USED 9, 10, 11, 12, 13, 14
+ LOAD_IF_USED 6, 7, 8, 9, 10, 11, 12, 13, 14
+ DEFINE_ARGS %4
+%endmacro
+
+%define has_epilogue regs_used > 9 || mmsize == 32
+
+%macro RET 0
+ POP_IF_USED 14, 13, 12, 11, 10, 9
+%if mmsize == 32
+ vzeroupper
+%endif
+ ret
+%endmacro
+
+%else ; X86_32 ;==============================================================
+
+DECLARE_REG 0, eax, 4
+DECLARE_REG 1, ecx, 8
+DECLARE_REG 2, edx, 12
+DECLARE_REG 3, ebx, 16
+DECLARE_REG 4, esi, 20
+DECLARE_REG 5, edi, 24
+DECLARE_REG 6, ebp, 28
+%define rsp esp
+
+%macro DECLARE_ARG 1-*
+ %rep %0
+ %define r%1m [esp + stack_offset + 4*%1 + 4]
+ %define r%1mp dword r%1m
+ %rotate 1
+ %endrep
+%endmacro
+
+DECLARE_ARG 7, 8, 9, 10, 11, 12, 13, 14
+
+%macro PROLOGUE 2-4+ ; #args, #regs, #xmm_regs, arg_names...
+ %assign num_args %1
+ %assign regs_used %2
+ %if regs_used > 7
+ %assign regs_used 7
+ %endif
+ ASSERT regs_used >= num_args
+ PUSH_IF_USED 3, 4, 5, 6
+ LOAD_IF_USED 0, 1, 2, 3, 4, 5, 6
+ DEFINE_ARGS %4
+%endmacro
+
+%define has_epilogue regs_used > 3 || mmsize == 32
+
+%macro RET 0
+ POP_IF_USED 6, 5, 4, 3
+%if mmsize == 32
+ vzeroupper
+%endif
+ ret
+%endmacro
+
+%endif ;======================================================================
+
+%if WIN64 == 0
+%macro WIN64_SPILL_XMM 1
+%endmacro
+%macro WIN64_RESTORE_XMM 1
+%endmacro
+%endif
+
+%macro REP_RET 0
+ %if has_epilogue
+ RET
+ %else
+ rep ret
+ %endif
+%endmacro
+
+%macro TAIL_CALL 2 ; callee, is_nonadjacent
+ %if has_epilogue
+ call %1
+ RET
+ %elif %2
+ jmp %1
+ %endif
+%endmacro
+
+;=============================================================================
+; arch-independent part
+;=============================================================================
+
+%assign function_align 16
+
+; Begin a function.
+; Applies any symbol mangling needed for C linkage, and sets up a define such that
+; subsequent uses of the function name automatically refer to the mangled version.
+; Appends cpuflags to the function name if cpuflags has been specified.
+%macro cglobal 1-2+ ; name, [PROLOGUE args]
+%if %0 == 1
+ cglobal_internal %1 %+ SUFFIX
+%else
+ cglobal_internal %1 %+ SUFFIX, %2
+%endif
+%endmacro
+%macro cglobal_internal 1-2+
+ %ifndef cglobaled_%1
+ %xdefine %1 mangle(%1)
+ %xdefine %1.skip_prologue %1 %+ .skip_prologue
+ CAT_XDEFINE cglobaled_, %1, 1
+ %endif
+ %xdefine current_function %1
+ %ifidn __OUTPUT_FORMAT__,elf
+ global %1:function hidden
+ %else
+ global %1
+ %endif
+ align function_align
+ %1:
+ RESET_MM_PERMUTATION ; not really needed, but makes disassembly somewhat nicer
+ %assign stack_offset 0
+ %if %0 > 1
+ PROLOGUE %2
+ %endif
+%endmacro
+
+%macro cextern 1
+ %xdefine %1 mangle(%1)
+ CAT_XDEFINE cglobaled_, %1, 1
+ extern %1
+%endmacro
+
+; like cextern, but without the prefix
+%macro cextern_naked 1
+ %xdefine %1 mangle(%1)
+ CAT_XDEFINE cglobaled_, %1, 1
+ extern %1
+%endmacro
+
+%macro const 2+
+ %xdefine %1 mangle(%1)
+ global %1
+ %1: %2
+%endmacro
+
+; This is needed for ELF, otherwise the GNU linker assumes the stack is
+; executable by default.
+%ifidn __OUTPUT_FORMAT__,elf
+SECTION .note.GNU-stack noalloc noexec nowrite progbits
+%endif
+%ifidn __OUTPUT_FORMAT__,elf32
+section .note.GNU-stack noalloc noexec nowrite progbits
+%endif
+%ifidn __OUTPUT_FORMAT__,elf64
+section .note.GNU-stack noalloc noexec nowrite progbits
+%endif
+
+; cpuflags
+
+%assign cpuflags_MMX (1<<0)
+%assign cpuflags_MMX2 (1<<1) | cpuflags_MMX
+%assign cpuflags_3dnow (1<<2) | cpuflags_MMX
+%assign cpuflags_3dnow2 (1<<3) | cpuflags_3dnow
+%assign cpuflags_SSE (1<<4) | cpuflags_MMX2
+%assign cpuflags_SSE2 (1<<5) | cpuflags_SSE
+%assign cpuflags_SSE2slow (1<<6) | cpuflags_SSE2
+%assign cpuflags_SSE3 (1<<7) | cpuflags_SSE2
+%assign cpuflags_SSSE3 (1<<8) | cpuflags_SSE3
+%assign cpuflags_SSE4 (1<<9) | cpuflags_SSSE3
+%assign cpuflags_SSE42 (1<<10)| cpuflags_SSE4
+%assign cpuflags_AVX (1<<11)| cpuflags_SSE42
+%assign cpuflags_xop (1<<12)| cpuflags_AVX
+%assign cpuflags_fma4 (1<<13)| cpuflags_AVX
+%assign cpuflags_AVX2 (1<<14)| cpuflags_AVX
+%assign cpuflags_fma3 (1<<15)| cpuflags_AVX
+
+%assign cpuflags_cache32 (1<<16)
+%assign cpuflags_cache64 (1<<17)
+%assign cpuflags_slowctz (1<<18)
+%assign cpuflags_lzcnt (1<<19)
+%assign cpuflags_misalign (1<<20)
+%assign cpuflags_aligned (1<<21) ; not a cpu feature, but a function variant
+%assign cpuflags_atom (1<<22)
+%assign cpuflags_bmi1 (1<<23)
+%assign cpuflags_bmi2 (1<<24)|cpuflags_bmi1
+%assign cpuflags_tbm (1<<25)|cpuflags_bmi1
+
+%define cpuflag(x) ((cpuflags & (cpuflags_ %+ x)) == (cpuflags_ %+ x))
+%define notcpuflag(x) ((cpuflags & (cpuflags_ %+ x)) != (cpuflags_ %+ x))
+
+; Takes up to 2 cpuflags from the above list.
+; All subsequent functions (up to the next INIT_CPUFLAGS) is built for the specified cpu.
+; You shouldn't need to invoke this macro directly, it's a subroutine for INIT_MMX &co.
+%macro INIT_CPUFLAGS 0-2
+ %if %0 >= 1
+ %xdefine cpuname %1
+ %assign cpuflags cpuflags_%1
+ %if %0 >= 2
+ %xdefine cpuname %1_%2
+ %assign cpuflags cpuflags | cpuflags_%2
+ %endif
+ %xdefine SUFFIX _ %+ cpuname
+ %if cpuflag(AVX)
+ %assign AVX_enabled 1
+ %endif
+ %if mmsize == 16 && notcpuflag(SSE2)
+ %define mova movaps
+ %define movu movups
+ %define movnta movntps
+ %endif
+ %if cpuflag(aligned)
+ %define movu mova
+ %elifidn %1, SSE3
+ %define movu lddqu
+ %endif
+ %else
+ %xdefine SUFFIX
+ %undef cpuname
+ %undef cpuflags
+ %endif
+%endmacro
+
+; merge MMX and SSE*
+
+%macro CAT_XDEFINE 3
+ %xdefine %1%2 %3
+%endmacro
+
+%macro CAT_UNDEF 2
+ %undef %1%2
+%endmacro
+
+%macro INIT_MMX 0-1+
+ %assign AVX_enabled 0
+ %define RESET_MM_PERMUTATION INIT_MMX %1
+ %define mmsize 8
+ %define num_mmregs 8
+ %define mova movq
+ %define movu movq
+ %define movh movd
+ %define movnta movntq
+ %assign %%i 0
+ %rep 8
+ CAT_XDEFINE m, %%i, mm %+ %%i
+ CAT_XDEFINE nmm, %%i, %%i
+ %assign %%i %%i+1
+ %endrep
+ %rep 8
+ CAT_UNDEF m, %%i
+ CAT_UNDEF nmm, %%i
+ %assign %%i %%i+1
+ %endrep
+ INIT_CPUFLAGS %1
+%endmacro
+
+%macro INIT_XMM 0-1+
+ %assign AVX_enabled 0
+ %define RESET_MM_PERMUTATION INIT_XMM %1
+ %define mmsize 16
+ %define num_mmregs 8
+ %if ARCH_X86_64
+ %define num_mmregs 16
+ %endif
+ %define mova movdqa
+ %define movu movdqu
+ %define movh movq
+ %define movnta movntdq
+ %assign %%i 0
+ %rep num_mmregs
+ CAT_XDEFINE m, %%i, xmm %+ %%i
+ CAT_XDEFINE nxmm, %%i, %%i
+ %assign %%i %%i+1
+ %endrep
+ INIT_CPUFLAGS %1
+%endmacro
+
+%macro INIT_YMM 0-1+
+ %assign AVX_enabled 1
+ %define RESET_MM_PERMUTATION INIT_YMM %1
+ %define mmsize 32
+ %define num_mmregs 8
+ %if ARCH_X86_64
+ %define num_mmregs 16
+ %endif
+ %define mova vmovaps
+ %define movu vmovups
+ %undef movh
+ %define movnta vmovntps
+ %assign %%i 0
+ %rep num_mmregs
+ CAT_XDEFINE m, %%i, ymm %+ %%i
+ CAT_XDEFINE nymm, %%i, %%i
+ %assign %%i %%i+1
+ %endrep
+ INIT_CPUFLAGS %1
+%endmacro
+
+INIT_XMM
+
+; I often want to use macros that permute their arguments. e.g. there's no
+; efficient way to implement butterfly or transpose or dct without swapping some
+; arguments.
+;
+; I would like to not have to manually keep track of the permutations:
+; If I insert a permutation in the middle of a function, it should automatically
+; change everything that follows. For more complex macros I may also have multiple
+; implementations, e.g. the SSE2 and SSSE3 versions may have different permutations.
+;
+; Hence these macros. Insert a PERMUTE or some SWAPs at the end of a macro that
+; permutes its arguments. It's equivalent to exchanging the contents of the
+; registers, except that this way you exchange the register names instead, so it
+; doesn't cost any cycles.
+
+%macro PERMUTE 2-* ; takes a list of pairs to swap
+%rep %0/2
+ %xdefine tmp%2 m%2
+ %xdefine ntmp%2 nm%2
+ %rotate 2
+%endrep
+%rep %0/2
+ %xdefine m%1 tmp%2
+ %xdefine nm%1 ntmp%2
+ %undef tmp%2
+ %undef ntmp%2
+ %rotate 2
+%endrep
+%endmacro
+
+%macro SWAP 2-* ; swaps a single chain (sometimes more concise than pairs)
+%rep %0-1
+%ifdef m%1
+ %xdefine tmp m%1
+ %xdefine m%1 m%2
+ %xdefine m%2 tmp
+ CAT_XDEFINE n, m%1, %1
+ CAT_XDEFINE n, m%2, %2
+%else
+ ; If we were called as "SWAP m0,m1" rather than "SWAP 0,1" infer the original numbers here.
+ ; Be careful using this mode in nested macros though, as in some cases there may be
+ ; other copies of m# that have already been dereferenced and don't get updated correctly.
+ %xdefine %%n1 n %+ %1
+ %xdefine %%n2 n %+ %2
+ %xdefine tmp m %+ %%n1
+ CAT_XDEFINE m, %%n1, m %+ %%n2
+ CAT_XDEFINE m, %%n2, tmp
+ CAT_XDEFINE n, m %+ %%n1, %%n1
+ CAT_XDEFINE n, m %+ %%n2, %%n2
+%endif
+ %undef tmp
+ %rotate 1
+%endrep
+%endmacro
+
+; If SAVE_MM_PERMUTATION is placed at the end of a function, then any later
+; calls to that function will automatically load the permutation, so values can
+; be returned in mmregs.
+%macro SAVE_MM_PERMUTATION 0-1
+ %if %0
+ %xdefine %%f %1_m
+ %else
+ %xdefine %%f current_function %+ _m
+ %endif
+ %assign %%i 0
+ %rep num_mmregs
+ CAT_XDEFINE %%f, %%i, m %+ %%i
+ %assign %%i %%i+1
+ %endrep
+%endmacro
+
+%macro LOAD_MM_PERMUTATION 1 ; name to load from
+ %ifdef %1_m0
+ %assign %%i 0
+ %rep num_mmregs
+ CAT_XDEFINE m, %%i, %1_m %+ %%i
+ CAT_XDEFINE n, m %+ %%i, %%i
+ %assign %%i %%i+1
+ %endrep
+ %endif
+%endmacro
+
+; Append cpuflags to the callee's name iff the appended name is known and the plain name isn't
+%macro call 1
+ call_internal %1, %1 %+ SUFFIX
+%endmacro
+%macro call_internal 2
+ %xdefine %%i %1
+ %ifndef cglobaled_%1
+ %ifdef cglobaled_%2
+ %xdefine %%i %2
+ %endif
+ %endif
+ call %%i
+ LOAD_MM_PERMUTATION %%i
+%endmacro
+
+; Substitutions that reduce instruction size but are functionally equivalent
+%macro add 2
+ %ifnum %2
+ %if %2==128
+ sub %1, -128
+ %else
+ add %1, %2
+ %endif
+ %else
+ add %1, %2
+ %endif
+%endmacro
+
+%macro sub 2
+ %ifnum %2
+ %if %2==128
+ add %1, -128
+ %else
+ sub %1, %2
+ %endif
+ %else
+ sub %1, %2
+ %endif
+%endmacro
+
+;=============================================================================
+; AVX abstraction layer
+;=============================================================================
+
+%assign i 0
+%rep 16
+ %if i < 8
+ CAT_XDEFINE sizeofmm, i, 8
+ %endif
+ CAT_XDEFINE sizeofxmm, i, 16
+ CAT_XDEFINE sizeofymm, i, 32
+%assign i i+1
+%endrep
+%undef i
+
+%macro CHECK_AVX_INSTR_EMU 3-*
+ %xdefine %%opcode %1
+ %xdefine %%dst %2
+ %rep %0-2
+ %ifidn %%dst, %3
+ %error non-AVX emulation of ``%%opcode'' is not supported
+ %endif
+ %rotate 1
+ %endrep
+%endmacro
+
+;%1 == instruction
+;%2 == 1 if float, 0 if int
+;%3 == 1 if 4-operand (xmm, xmm, xmm, imm), 0 if 2- or 3-operand (xmm, xmm, xmm)
+;%4 == number of operands given
+;%5+: operands
+%macro RUN_AVX_INSTR 6-7+
+ %ifid %6
+ %define %%sizeofreg sizeof%6
+ %elifid %5
+ %define %%sizeofreg sizeof%5
+ %else
+ %define %%sizeofreg mmsize
+ %endif
+ %if %%sizeofreg==32
+ %if %4>=3
+ v%1 %5, %6, %7
+ %else
+ v%1 %5, %6
+ %endif
+ %else
+ %if %%sizeofreg==8
+ %define %%regmov movq
+ %elif %2
+ %define %%regmov movaps
+ %else
+ %define %%regmov movdqa
+ %endif
+
+ %if %4>=3+%3
+ %ifnidn %5, %6
+ %if AVX_enabled && %%sizeofreg==16
+ v%1 %5, %6, %7
+ %else
+ CHECK_AVX_INSTR_EMU {%1 %5, %6, %7}, %5, %7
+ %%regmov %5, %6
+ %1 %5, %7
+ %endif
+ %else
+ %1 %5, %7
+ %endif
+ %elif %4>=3
+ %1 %5, %6, %7
+ %else
+ %1 %5, %6
+ %endif
+ %endif
+%endmacro
+
+; 3arg AVX ops with a memory arg can only have it in src2,
+; whereas SSE emulation of 3arg prefers to have it in src1 (i.e. the mov).
+; So, if the op is symmetric and the wrong one is memory, swap them.
+%macro RUN_AVX_INSTR1 8
+ %assign %%swap 0
+ %if AVX_enabled
+ %ifnid %6
+ %assign %%swap 1
+ %endif
+ %elifnidn %5, %6
+ %ifnid %7
+ %assign %%swap 1
+ %endif
+ %endif
+ %if %%swap && %3 == 0 && %8 == 1
+ RUN_AVX_INSTR %1, %2, %3, %4, %5, %7, %6
+ %else
+ RUN_AVX_INSTR %1, %2, %3, %4, %5, %6, %7
+ %endif
+%endmacro
+
+;%1 == instruction
+;%2 == 1 if float, 0 if int
+;%3 == 1 if 4-operand (xmm, xmm, xmm, imm), 0 if 2- or 3-operand (xmm, xmm, xmm)
+;%4 == 1 if symmetric (i.e. doesn't matter which src arg is which), 0 if not
+%macro AVX_INSTR 4
+ %macro %1 2-9 fnord, fnord, fnord, %1, %2, %3, %4
+ %ifidn %3, fnord
+ RUN_AVX_INSTR %6, %7, %8, 2, %1, %2
+ %elifidn %4, fnord
+ RUN_AVX_INSTR1 %6, %7, %8, 3, %1, %2, %3, %9
+ %elifidn %5, fnord
+ RUN_AVX_INSTR %6, %7, %8, 4, %1, %2, %3, %4
+ %else
+ RUN_AVX_INSTR %6, %7, %8, 5, %1, %2, %3, %4, %5
+ %endif
+ %endmacro
+%endmacro
+
+AVX_INSTR addpd, 1, 0, 1
+AVX_INSTR addps, 1, 0, 1
+AVX_INSTR addsd, 1, 0, 1
+AVX_INSTR addss, 1, 0, 1
+AVX_INSTR addsubpd, 1, 0, 0
+AVX_INSTR addsubps, 1, 0, 0
+AVX_INSTR andpd, 1, 0, 1
+AVX_INSTR andps, 1, 0, 1
+AVX_INSTR andnpd, 1, 0, 0
+AVX_INSTR andnps, 1, 0, 0
+AVX_INSTR blendpd, 1, 0, 0
+AVX_INSTR blendps, 1, 0, 0
+AVX_INSTR blendvpd, 1, 0, 0
+AVX_INSTR blendvps, 1, 0, 0
+AVX_INSTR cmppd, 1, 0, 0
+AVX_INSTR cmpps, 1, 0, 0
+AVX_INSTR cmpsd, 1, 0, 0
+AVX_INSTR cmpss, 1, 0, 0
+AVX_INSTR cvtdq2ps, 1, 0, 0
+AVX_INSTR cvtps2dq, 1, 0, 0
+AVX_INSTR divpd, 1, 0, 0
+AVX_INSTR divps, 1, 0, 0
+AVX_INSTR divsd, 1, 0, 0
+AVX_INSTR divss, 1, 0, 0
+AVX_INSTR dppd, 1, 1, 0
+AVX_INSTR dpps, 1, 1, 0
+AVX_INSTR haddpd, 1, 0, 0
+AVX_INSTR haddps, 1, 0, 0
+AVX_INSTR hsubpd, 1, 0, 0
+AVX_INSTR hsubps, 1, 0, 0
+AVX_INSTR maxpd, 1, 0, 1
+AVX_INSTR maxps, 1, 0, 1
+AVX_INSTR maxsd, 1, 0, 1
+AVX_INSTR maxss, 1, 0, 1
+AVX_INSTR minpd, 1, 0, 1
+AVX_INSTR minps, 1, 0, 1
+AVX_INSTR minsd, 1, 0, 1
+AVX_INSTR minss, 1, 0, 1
+AVX_INSTR movhlps, 1, 0, 0
+AVX_INSTR movlhps, 1, 0, 0
+AVX_INSTR movsd, 1, 0, 0
+AVX_INSTR movss, 1, 0, 0
+AVX_INSTR mpsadbw, 0, 1, 0
+AVX_INSTR mulpd, 1, 0, 1
+AVX_INSTR mulps, 1, 0, 1
+AVX_INSTR mulsd, 1, 0, 1
+AVX_INSTR mulss, 1, 0, 1
+AVX_INSTR orpd, 1, 0, 1
+AVX_INSTR orps, 1, 0, 1
+AVX_INSTR pabsb, 0, 0, 0
+AVX_INSTR pabsw, 0, 0, 0
+AVX_INSTR pabsd, 0, 0, 0
+AVX_INSTR packsswb, 0, 0, 0
+AVX_INSTR packssdw, 0, 0, 0
+AVX_INSTR packuswb, 0, 0, 0
+AVX_INSTR packusdw, 0, 0, 0
+AVX_INSTR paddb, 0, 0, 1
+AVX_INSTR paddw, 0, 0, 1
+AVX_INSTR paddd, 0, 0, 1
+AVX_INSTR paddq, 0, 0, 1
+AVX_INSTR paddsb, 0, 0, 1
+AVX_INSTR paddsw, 0, 0, 1
+AVX_INSTR paddusb, 0, 0, 1
+AVX_INSTR paddusw, 0, 0, 1
+AVX_INSTR palignr, 0, 1, 0
+AVX_INSTR pand, 0, 0, 1
+AVX_INSTR pandn, 0, 0, 0
+AVX_INSTR pavgb, 0, 0, 1
+AVX_INSTR pavgw, 0, 0, 1
+AVX_INSTR pblendvb, 0, 0, 0
+AVX_INSTR pblendw, 0, 1, 0
+AVX_INSTR pcmpestri, 0, 0, 0
+AVX_INSTR pcmpestrm, 0, 0, 0
+AVX_INSTR pcmpistri, 0, 0, 0
+AVX_INSTR pcmpistrm, 0, 0, 0
+AVX_INSTR pcmpeqb, 0, 0, 1
+AVX_INSTR pcmpeqw, 0, 0, 1
+AVX_INSTR pcmpeqd, 0, 0, 1
+AVX_INSTR pcmpeqq, 0, 0, 1
+AVX_INSTR pcmpgtb, 0, 0, 0
+AVX_INSTR pcmpgtw, 0, 0, 0
+AVX_INSTR pcmpgtd, 0, 0, 0
+AVX_INSTR pcmpgtq, 0, 0, 0
+AVX_INSTR phaddw, 0, 0, 0
+AVX_INSTR phaddd, 0, 0, 0
+AVX_INSTR phaddsw, 0, 0, 0
+AVX_INSTR phsubw, 0, 0, 0
+AVX_INSTR phsubd, 0, 0, 0
+AVX_INSTR phsubsw, 0, 0, 0
+AVX_INSTR pmaddwd, 0, 0, 1
+AVX_INSTR pmaddubsw, 0, 0, 0
+AVX_INSTR pmaxsb, 0, 0, 1
+AVX_INSTR pmaxsw, 0, 0, 1
+AVX_INSTR pmaxsd, 0, 0, 1
+AVX_INSTR pmaxub, 0, 0, 1
+AVX_INSTR pmaxuw, 0, 0, 1
+AVX_INSTR pmaxud, 0, 0, 1
+AVX_INSTR pminsb, 0, 0, 1
+AVX_INSTR pminsw, 0, 0, 1
+AVX_INSTR pminsd, 0, 0, 1
+AVX_INSTR pminub, 0, 0, 1
+AVX_INSTR pminuw, 0, 0, 1
+AVX_INSTR pminud, 0, 0, 1
+AVX_INSTR pmovmskb, 0, 0, 0
+AVX_INSTR pmulhuw, 0, 0, 1
+AVX_INSTR pmulhrsw, 0, 0, 1
+AVX_INSTR pmulhw, 0, 0, 1
+AVX_INSTR pmullw, 0, 0, 1
+AVX_INSTR pmulld, 0, 0, 1
+AVX_INSTR pmuludq, 0, 0, 1
+AVX_INSTR pmuldq, 0, 0, 1
+AVX_INSTR por, 0, 0, 1
+AVX_INSTR psadbw, 0, 0, 1
+AVX_INSTR pshufb, 0, 0, 0
+AVX_INSTR pshufd, 0, 1, 0
+AVX_INSTR pshufhw, 0, 1, 0
+AVX_INSTR pshuflw, 0, 1, 0
+AVX_INSTR psignb, 0, 0, 0
+AVX_INSTR psignw, 0, 0, 0
+AVX_INSTR psignd, 0, 0, 0
+AVX_INSTR psllw, 0, 0, 0
+AVX_INSTR pslld, 0, 0, 0
+AVX_INSTR psllq, 0, 0, 0
+AVX_INSTR pslldq, 0, 0, 0
+AVX_INSTR psraw, 0, 0, 0
+AVX_INSTR psrad, 0, 0, 0
+AVX_INSTR psrlw, 0, 0, 0
+AVX_INSTR psrld, 0, 0, 0
+AVX_INSTR psrlq, 0, 0, 0
+AVX_INSTR psrldq, 0, 0, 0
+AVX_INSTR psubb, 0, 0, 0
+AVX_INSTR psubw, 0, 0, 0
+AVX_INSTR psubd, 0, 0, 0
+AVX_INSTR psubq, 0, 0, 0
+AVX_INSTR psubsb, 0, 0, 0
+AVX_INSTR psubsw, 0, 0, 0
+AVX_INSTR psubusb, 0, 0, 0
+AVX_INSTR psubusw, 0, 0, 0
+AVX_INSTR ptest, 0, 0, 0
+AVX_INSTR punpckhbw, 0, 0, 0
+AVX_INSTR punpckhwd, 0, 0, 0
+AVX_INSTR punpckhdq, 0, 0, 0
+AVX_INSTR punpckhqdq, 0, 0, 0
+AVX_INSTR punpcklbw, 0, 0, 0
+AVX_INSTR punpcklwd, 0, 0, 0
+AVX_INSTR punpckldq, 0, 0, 0
+AVX_INSTR punpcklqdq, 0, 0, 0
+AVX_INSTR pxor, 0, 0, 1
+AVX_INSTR shufps, 1, 1, 0
+AVX_INSTR subpd, 1, 0, 0
+AVX_INSTR subps, 1, 0, 0
+AVX_INSTR subsd, 1, 0, 0
+AVX_INSTR subss, 1, 0, 0
+AVX_INSTR unpckhpd, 1, 0, 0
+AVX_INSTR unpckhps, 1, 0, 0
+AVX_INSTR unpcklpd, 1, 0, 0
+AVX_INSTR unpcklps, 1, 0, 0
+AVX_INSTR xorpd, 1, 0, 1
+AVX_INSTR xorps, 1, 0, 1
+
+; 3DNow instructions, for sharing code between AVX, SSE and 3DN
+AVX_INSTR pfadd, 1, 0, 1
+AVX_INSTR pfsub, 1, 0, 0
+AVX_INSTR pfmul, 1, 0, 1
+
+; base-4 constants for shuffles
+%assign i 0
+%rep 256
+ %assign j ((i>>6)&3)*1000 + ((i>>4)&3)*100 + ((i>>2)&3)*10 + (i&3)
+ %if j < 10
+ CAT_XDEFINE q000, j, i
+ %elif j < 100
+ CAT_XDEFINE q00, j, i
+ %elif j < 1000
+ CAT_XDEFINE q0, j, i
+ %else
+ CAT_XDEFINE q, j, i
+ %endif
+%assign i i+1
+%endrep
+%undef i
+%undef j
+
+%macro FMA_INSTR 3
+ %macro %1 4-7 %1, %2, %3
+ %if cpuflag(xop)
+ v%5 %1, %2, %3, %4
+ %else
+ %6 %1, %2, %3
+ %7 %1, %4
+ %endif
+ %endmacro
+%endmacro
+
+FMA_INSTR pmacsdd, pmulld, paddd
+FMA_INSTR pmacsww, pmullw, paddw
+FMA_INSTR pmadcswd, pmaddwd, paddd
+
+; tzcnt is equivalent to "rep bsf" and is backwards-compatible with bsf.
+; This lets us use tzcnt without bumping the yasm version requirement yet.
+%define tzcnt rep bsf